* [igt-dev] [PATCH i-g-t v4] i915/i915_pm_rps: Check impact of fence ordering on waitboosting
@ 2022-07-13 10:53 Karolina Drobnik
2022-07-15 17:30 ` Kamil Konieczny
` (2 more replies)
0 siblings, 3 replies; 10+ messages in thread
From: Karolina Drobnik @ 2022-07-13 10:53 UTC (permalink / raw)
To: igt-dev; +Cc: Tvrtko Ursulin, Chris Wilson
From: Chris Wilson <chris@chris-wilson.co.uk>
Currently the wait boost heuristic is evaluated at the start of each
fence wait for a series within dma-resv. There is no strict ordering of
fences defined by dma-resv, and so it turns out that the same operation
under different circumstances can result in different heuristics being
applied, and dramatic performance variations in user applications.
Link: https://gitlab.freedesktop.org/drm/intel/-/issues/6284
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Karolina Drobnik <karolina.drobnik@intel.com>
---
v4:
- Added test descriptions with igt_describe()
v3:
- Removed unnecessary calls to gem_write
v2:
- Introduced batch_create() with arbitration points to prevent hangs
- Added engine-order subtest
- Added reporting of GPU frequency
- Changed runtimes and switched prints from us to ms
tests/i915/i915_pm_rps.c | 261 ++++++++++++++++++++++++++++++++++++++-
1 file changed, 255 insertions(+), 6 deletions(-)
diff --git a/tests/i915/i915_pm_rps.c b/tests/i915/i915_pm_rps.c
index b37f15c2..6125ea60 100644
--- a/tests/i915/i915_pm_rps.c
+++ b/tests/i915/i915_pm_rps.c
@@ -37,8 +37,10 @@
#include <sys/wait.h>
#include "i915/gem.h"
+#include "i915/gem_create.h"
#include "igt.h"
#include "igt_dummyload.h"
+#include "igt_perf.h"
#include "igt_sysfs.h"
IGT_TEST_DESCRIPTION("Render P-States tests - verify GPU frequency changes");
@@ -614,14 +616,253 @@ static void waitboost(int fd, bool reset)
igt_assert_lt(post_freqs[CUR], post_freqs[MAX]);
}
+static uint32_t batch_create(int i915, uint64_t sz)
+{
+ const uint32_t bbe = MI_BATCH_BUFFER_END;
+ const uint32_t chk = 0x5 << 23;
+ uint32_t handle = gem_create(i915, sz);
+
+ for (uint64_t pg = 4096; pg + 4096 < sz; pg += 4096)
+ gem_write(i915, handle, pg, &chk, sizeof(chk));
+
+ gem_write(i915, handle, sz - sizeof(bbe), &bbe, sizeof(bbe));
+
+ return handle;
+}
+
+static uint64_t __fence_order(int i915,
+ struct drm_i915_gem_exec_object2 *obj,
+ struct drm_i915_gem_execbuffer2 *eb,
+ uint64_t flags0, uint64_t flags1,
+ double *outf)
+{
+ uint64_t before[2], after[2];
+ struct timespec tv;
+ int fd;
+
+ gem_quiescent_gpu(i915);
+ fd = perf_i915_open(i915, I915_PMU_ACTUAL_FREQUENCY);
+
+ igt_gettime(&tv);
+
+ obj->flags = flags0;
+ gem_execbuf(i915, eb);
+
+ obj->flags = flags1;
+ gem_execbuf(i915, eb);
+
+ read(fd, before, sizeof(before));
+ gem_sync(i915, obj->handle);
+ read(fd, after, sizeof(after));
+ close(fd);
+
+ after[0] -= before[0];
+ after[1] -= before[1];
+
+ *outf = 1e9 * after[0] / after[1];
+ return igt_nsec_elapsed(&tv);
+}
+
+static void fence_order(int i915)
+{
+ const uint64_t sz = 512ull << 20;
+ struct drm_i915_gem_exec_object2 obj[2] = {
+ { .handle = gem_create(i915, 4096) },
+ { .handle = batch_create(i915, sz) },
+ };
+ struct drm_i915_gem_execbuffer2 execbuf = {
+ .buffers_ptr = to_user_pointer(obj),
+ .buffer_count = ARRAY_SIZE(obj),
+ };
+ uint64_t wr, rw;
+ int min, max;
+ double freq;
+ int sysfs;
+
+ /*
+ * Check the order of fences found during GEM_WAIT does not affect
+ * waitboosting.
+ *
+ * Internally, implicit fences are tracked within a dma-resv which
+ * imposes no order on the individually fences tracked within. Since
+ * there is no defined order, the sequence of waits (and the associated
+ * waitboosts) is also undefined, undermining the consistency of the
+ * waitboost heuristic.
+ *
+ * In particular, we can influence the sequence of fence storage
+ * within dma-resv by mixing read/write semantics for implicit fences.
+ * We can exploit this property of dma-resv to exercise that no matter
+ * the stored order, the heuristic is applied consistently for the
+ * user's GEM_WAIT ioctl.
+ */
+
+ sysfs = igt_sysfs_open(i915);
+ min = igt_sysfs_get_u32(sysfs, "gt_RPn_freq_mhz");
+ max = igt_sysfs_get_u32(sysfs, "gt_RP0_freq_mhz");
+ igt_require(max > min);
+
+ /* Only allow ourselves to upclock via waitboosting */
+ igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
+ igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", min);
+ igt_sysfs_printf(sysfs, "gt_boost_freq_mhz", "%d", max);
+
+ /* Warm up to bind the vma */
+ __fence_order(i915, &obj[0], &execbuf, 0, 0, &freq);
+
+ wr = __fence_order(i915, &obj[0], &execbuf, EXEC_OBJECT_WRITE, 0, &freq);
+ igt_info("Write-then-read: %.2fms @ %.3fMHz\n", wr * 1e-6, freq);
+
+ rw = __fence_order(i915, &obj[0], &execbuf, 0, EXEC_OBJECT_WRITE, &freq);
+ igt_info("Read-then-write: %.2fms @ %.3fMHz\n", rw * 1e-6, freq);
+
+ gem_close(i915, obj[0].handle);
+ gem_close(i915, obj[1].handle);
+
+ igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
+ igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", max);
+
+ igt_assert(4 * rw > 3 * wr && 4 * wr > 3 * rw);
+}
+
+static uint64_t __engine_order(int i915,
+ struct drm_i915_gem_exec_object2 *obj,
+ struct drm_i915_gem_execbuffer2 *eb,
+ unsigned int *engines0,
+ unsigned int *engines1,
+ unsigned int num_engines,
+ double *outf)
+{
+ uint64_t before[2], after[2];
+ struct timespec tv;
+ int fd;
+
+ gem_quiescent_gpu(i915);
+ fd = perf_i915_open(i915, I915_PMU_ACTUAL_FREQUENCY);
+
+ igt_gettime(&tv);
+
+ obj->flags = EXEC_OBJECT_WRITE;
+ for (unsigned int n = 0; n < num_engines; n++) {
+ eb->flags &= ~63ull;
+ eb->flags |= engines0[n];
+ gem_execbuf_wr(i915, eb);
+ }
+
+ obj->flags = 0;
+ for (unsigned int n = 0; n < num_engines; n++) {
+ eb->flags &= ~63ull;
+ eb->flags |= engines1[n];
+ gem_execbuf(i915, eb);
+ }
+
+ read(fd, before, sizeof(before));
+ gem_sync(i915, obj->handle);
+ read(fd, after, sizeof(after));
+ close(fd);
+
+ after[0] -= before[0];
+ after[1] -= before[1];
+
+ *outf = 1e9 * after[0] / after[1];
+ return igt_nsec_elapsed(&tv);
+}
+
+static void engine_order(int i915)
+{
+ const uint64_t sz = 512ull << 20;
+ struct drm_i915_gem_exec_object2 obj[2] = {
+ { .handle = gem_create(i915, 4096) },
+ { .handle = batch_create(i915, sz) },
+ };
+ struct drm_i915_gem_execbuffer2 execbuf = {
+ .buffers_ptr = to_user_pointer(obj),
+ .buffer_count = ARRAY_SIZE(obj),
+ };
+ const struct intel_execution_engine2 *e;
+ unsigned int engines[2], reverse[2];
+ uint64_t forward, backward, both;
+ unsigned int num_engines;
+ const intel_ctx_t *ctx;
+ int min, max;
+ double freq;
+ int sysfs;
+
+ /*
+ * Check the order of fences found during GEM_WAIT does not affect
+ * waitboosting. (See fence_order())
+ *
+ * Another way we can manipulate the order of fences within the
+ * dma-resv is through repeated use of the same contexts.
+ */
+
+ num_engines = 0;
+ ctx = intel_ctx_create_all_physical(i915);
+ for_each_ctx_engine(i915, ctx, e) {
+ /*
+ * Avoid using the cmdparser as it will try to allocate
+ * a new shadow batch for each submission -> oom
+ */
+ if (!gem_engine_has_mutable_submission(i915, e->class))
+ continue;
+
+ engines[num_engines++] = e->flags;
+ if (num_engines == ARRAY_SIZE(engines))
+ break;
+ }
+ igt_require(num_engines > 1);
+ for (unsigned int n = 0; n < num_engines; n++)
+ reverse[n] = engines[num_engines - n - 1];
+ execbuf.rsvd1 = ctx->id;
+
+ sysfs = igt_sysfs_open(i915);
+ min = igt_sysfs_get_u32(sysfs, "gt_RPn_freq_mhz");
+ max = igt_sysfs_get_u32(sysfs, "gt_RP0_freq_mhz");
+ igt_require(max > min);
+
+ /* Only allow ourselves to upclock via waitboosting */
+ igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
+ igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", min);
+ igt_sysfs_printf(sysfs, "gt_boost_freq_mhz", "%d", max);
+
+ /* Warm up to bind the vma */
+ gem_execbuf(i915, &execbuf);
+
+ forward = __engine_order(i915, &obj[0], &execbuf,
+ engines, engines, num_engines,
+ &freq);
+ igt_info("Forwards: %.2fms @ %.3fMhz\n", forward * 1e-6, freq);
+
+ backward = __engine_order(i915, &obj[0], &execbuf,
+ reverse, reverse, num_engines,
+ &freq);
+ igt_info("Backwards: %.2fms @ %.3fMhz\n", backward * 1e-6, freq);
+
+ both = __engine_order(i915, &obj[0], &execbuf,
+ engines, reverse, num_engines,
+ &freq);
+ igt_info("Bidirectional: %.2fms @ %.3fMhz\n", both * 1e-6, freq);
+
+ gem_close(i915, obj[0].handle);
+ gem_close(i915, obj[1].handle);
+ intel_ctx_destroy(i915, ctx);
+
+ igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
+ igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", max);
+
+ igt_assert(4 * forward > 3 * backward && 4 * backward > 3 * forward);
+ igt_assert(4 * forward > 3 * both && 4 * both > 3 * forward);
+}
+
static void pm_rps_exit_handler(int sig)
{
- if (origfreqs[MIN] > readval(sysfs_files[MAX].filp)) {
- writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
- writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
- } else {
- writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
- writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
+ if (sysfs_files[MAX].filp) {
+ if (origfreqs[MIN] > readval(sysfs_files[MAX].filp)) {
+ writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
+ writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
+ } else {
+ writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
+ writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
+ }
}
if (lh.igt_proc.running)
@@ -683,6 +924,14 @@ igt_main
igt_subtest("waitboost")
waitboost(drm_fd, false);
+ igt_describe("Check if the order of fences does not affect waitboosting");
+ igt_subtest("fence-order")
+ fence_order(drm_fd);
+
+ igt_describe("Check if context reuse does not affect waitboosting");
+ igt_subtest("engine-order")
+ engine_order(drm_fd);
+
/* Test boost frequency after GPU reset */
igt_subtest("reset") {
igt_hang_t hang = igt_allow_hang(drm_fd, 0, 0);
--
2.25.1
^ permalink raw reply related [flat|nested] 10+ messages in thread
* Re: [igt-dev] [PATCH i-g-t v4] i915/i915_pm_rps: Check impact of fence ordering on waitboosting
2022-07-13 10:53 [igt-dev] [PATCH i-g-t v4] i915/i915_pm_rps: Check impact of fence ordering on waitboosting Karolina Drobnik
@ 2022-07-15 17:30 ` Kamil Konieczny
2022-07-18 9:46 ` Karolina Drobnik
2022-07-26 10:49 ` Kamil Konieczny
2022-07-26 14:18 ` Kamil Konieczny
2 siblings, 1 reply; 10+ messages in thread
From: Kamil Konieczny @ 2022-07-15 17:30 UTC (permalink / raw)
To: igt-dev; +Cc: Chris Wilson
Hi,
On 2022-07-13 at 12:53:42 +0200, Karolina Drobnik wrote:
> From: Chris Wilson <chris@chris-wilson.co.uk>
>
> Currently the wait boost heuristic is evaluated at the start of each
> fence wait for a series within dma-resv. There is no strict ordering of
> fences defined by dma-resv, and so it turns out that the same operation
> under different circumstances can result in different heuristics being
> applied, and dramatic performance variations in user applications.
>
> Link: https://gitlab.freedesktop.org/drm/intel/-/issues/6284
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Signed-off-by: Karolina Drobnik <karolina.drobnik@intel.com>
> ---
> v4:
> - Added test descriptions with igt_describe()
> v3:
> - Removed unnecessary calls to gem_write
> v2:
> - Introduced batch_create() with arbitration points to prevent hangs
> - Added engine-order subtest
> - Added reporting of GPU frequency
> - Changed runtimes and switched prints from us to ms
>
> tests/i915/i915_pm_rps.c | 261 ++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 255 insertions(+), 6 deletions(-)
>
> diff --git a/tests/i915/i915_pm_rps.c b/tests/i915/i915_pm_rps.c
> index b37f15c2..6125ea60 100644
> --- a/tests/i915/i915_pm_rps.c
> +++ b/tests/i915/i915_pm_rps.c
> @@ -37,8 +37,10 @@
> #include <sys/wait.h>
>
> #include "i915/gem.h"
> +#include "i915/gem_create.h"
> #include "igt.h"
> #include "igt_dummyload.h"
> +#include "igt_perf.h"
> #include "igt_sysfs.h"
>
> IGT_TEST_DESCRIPTION("Render P-States tests - verify GPU frequency changes");
> @@ -614,14 +616,253 @@ static void waitboost(int fd, bool reset)
> igt_assert_lt(post_freqs[CUR], post_freqs[MAX]);
> }
>
> +static uint32_t batch_create(int i915, uint64_t sz)
> +{
> + const uint32_t bbe = MI_BATCH_BUFFER_END;
> + const uint32_t chk = 0x5 << 23;
> + uint32_t handle = gem_create(i915, sz);
> +
> + for (uint64_t pg = 4096; pg + 4096 < sz; pg += 4096)
> + gem_write(i915, handle, pg, &chk, sizeof(chk));
> +
> + gem_write(i915, handle, sz - sizeof(bbe), &bbe, sizeof(bbe));
> +
> + return handle;
> +}
> +
> +static uint64_t __fence_order(int i915,
> + struct drm_i915_gem_exec_object2 *obj,
> + struct drm_i915_gem_execbuffer2 *eb,
> + uint64_t flags0, uint64_t flags1,
> + double *outf)
> +{
> + uint64_t before[2], after[2];
> + struct timespec tv;
> + int fd;
> +
> + gem_quiescent_gpu(i915);
> + fd = perf_i915_open(i915, I915_PMU_ACTUAL_FREQUENCY);
> +
> + igt_gettime(&tv);
> +
> + obj->flags = flags0;
> + gem_execbuf(i915, eb);
> +
> + obj->flags = flags1;
> + gem_execbuf(i915, eb);
> +
> + read(fd, before, sizeof(before));
> + gem_sync(i915, obj->handle);
> + read(fd, after, sizeof(after));
imho here is better place to read elapsed time.
> + close(fd);
> +
> + after[0] -= before[0];
> + after[1] -= before[1];
> +
> + *outf = 1e9 * after[0] / after[1];
> + return igt_nsec_elapsed(&tv);
See comment above.
> +}
> +
> +static void fence_order(int i915)
> +{
> + const uint64_t sz = 512ull << 20;
> + struct drm_i915_gem_exec_object2 obj[2] = {
> + { .handle = gem_create(i915, 4096) },
> + { .handle = batch_create(i915, sz) },
> + };
> + struct drm_i915_gem_execbuffer2 execbuf = {
> + .buffers_ptr = to_user_pointer(obj),
> + .buffer_count = ARRAY_SIZE(obj),
> + };
> + uint64_t wr, rw;
> + int min, max;
> + double freq;
> + int sysfs;
> +
> + /*
> + * Check the order of fences found during GEM_WAIT does not affect
> + * waitboosting.
> + *
> + * Internally, implicit fences are tracked within a dma-resv which
> + * imposes no order on the individually fences tracked within. Since
> + * there is no defined order, the sequence of waits (and the associated
> + * waitboosts) is also undefined, undermining the consistency of the
> + * waitboost heuristic.
> + *
> + * In particular, we can influence the sequence of fence storage
> + * within dma-resv by mixing read/write semantics for implicit fences.
> + * We can exploit this property of dma-resv to exercise that no matter
> + * the stored order, the heuristic is applied consistently for the
> + * user's GEM_WAIT ioctl.
> + */
> +
> + sysfs = igt_sysfs_open(i915);
> + min = igt_sysfs_get_u32(sysfs, "gt_RPn_freq_mhz");
> + max = igt_sysfs_get_u32(sysfs, "gt_RP0_freq_mhz");
> + igt_require(max > min);
> +
> + /* Only allow ourselves to upclock via waitboosting */
> + igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
> + igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", min);
> + igt_sysfs_printf(sysfs, "gt_boost_freq_mhz", "%d", max);
> +
> + /* Warm up to bind the vma */
> + __fence_order(i915, &obj[0], &execbuf, 0, 0, &freq);
> +
> + wr = __fence_order(i915, &obj[0], &execbuf, EXEC_OBJECT_WRITE, 0, &freq);
> + igt_info("Write-then-read: %.2fms @ %.3fMHz\n", wr * 1e-6, freq);
imho this should be igt_debug (here and below)
s/igt_print/igt_debug/
Regards,
Kamil
> +
> + rw = __fence_order(i915, &obj[0], &execbuf, 0, EXEC_OBJECT_WRITE, &freq);
> + igt_info("Read-then-write: %.2fms @ %.3fMHz\n", rw * 1e-6, freq);
> +
> + gem_close(i915, obj[0].handle);
> + gem_close(i915, obj[1].handle);
> +
> + igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
> + igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", max);
> +
> + igt_assert(4 * rw > 3 * wr && 4 * wr > 3 * rw);
> +}
> +
> +static uint64_t __engine_order(int i915,
> + struct drm_i915_gem_exec_object2 *obj,
> + struct drm_i915_gem_execbuffer2 *eb,
> + unsigned int *engines0,
> + unsigned int *engines1,
> + unsigned int num_engines,
> + double *outf)
> +{
> + uint64_t before[2], after[2];
> + struct timespec tv;
> + int fd;
> +
> + gem_quiescent_gpu(i915);
> + fd = perf_i915_open(i915, I915_PMU_ACTUAL_FREQUENCY);
> +
> + igt_gettime(&tv);
> +
> + obj->flags = EXEC_OBJECT_WRITE;
> + for (unsigned int n = 0; n < num_engines; n++) {
> + eb->flags &= ~63ull;
> + eb->flags |= engines0[n];
> + gem_execbuf_wr(i915, eb);
> + }
> +
> + obj->flags = 0;
> + for (unsigned int n = 0; n < num_engines; n++) {
> + eb->flags &= ~63ull;
> + eb->flags |= engines1[n];
> + gem_execbuf(i915, eb);
> + }
> +
> + read(fd, before, sizeof(before));
> + gem_sync(i915, obj->handle);
> + read(fd, after, sizeof(after));
> + close(fd);
> +
> + after[0] -= before[0];
> + after[1] -= before[1];
> +
> + *outf = 1e9 * after[0] / after[1];
> + return igt_nsec_elapsed(&tv);
> +}
> +
> +static void engine_order(int i915)
> +{
> + const uint64_t sz = 512ull << 20;
> + struct drm_i915_gem_exec_object2 obj[2] = {
> + { .handle = gem_create(i915, 4096) },
> + { .handle = batch_create(i915, sz) },
> + };
> + struct drm_i915_gem_execbuffer2 execbuf = {
> + .buffers_ptr = to_user_pointer(obj),
> + .buffer_count = ARRAY_SIZE(obj),
> + };
> + const struct intel_execution_engine2 *e;
> + unsigned int engines[2], reverse[2];
> + uint64_t forward, backward, both;
> + unsigned int num_engines;
> + const intel_ctx_t *ctx;
> + int min, max;
> + double freq;
> + int sysfs;
> +
> + /*
> + * Check the order of fences found during GEM_WAIT does not affect
> + * waitboosting. (See fence_order())
> + *
> + * Another way we can manipulate the order of fences within the
> + * dma-resv is through repeated use of the same contexts.
> + */
> +
> + num_engines = 0;
> + ctx = intel_ctx_create_all_physical(i915);
> + for_each_ctx_engine(i915, ctx, e) {
> + /*
> + * Avoid using the cmdparser as it will try to allocate
> + * a new shadow batch for each submission -> oom
> + */
> + if (!gem_engine_has_mutable_submission(i915, e->class))
> + continue;
> +
> + engines[num_engines++] = e->flags;
> + if (num_engines == ARRAY_SIZE(engines))
> + break;
> + }
> + igt_require(num_engines > 1);
> + for (unsigned int n = 0; n < num_engines; n++)
> + reverse[n] = engines[num_engines - n - 1];
> + execbuf.rsvd1 = ctx->id;
> +
> + sysfs = igt_sysfs_open(i915);
> + min = igt_sysfs_get_u32(sysfs, "gt_RPn_freq_mhz");
> + max = igt_sysfs_get_u32(sysfs, "gt_RP0_freq_mhz");
> + igt_require(max > min);
> +
> + /* Only allow ourselves to upclock via waitboosting */
> + igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
> + igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", min);
> + igt_sysfs_printf(sysfs, "gt_boost_freq_mhz", "%d", max);
> +
> + /* Warm up to bind the vma */
> + gem_execbuf(i915, &execbuf);
> +
> + forward = __engine_order(i915, &obj[0], &execbuf,
> + engines, engines, num_engines,
> + &freq);
> + igt_info("Forwards: %.2fms @ %.3fMhz\n", forward * 1e-6, freq);
> +
> + backward = __engine_order(i915, &obj[0], &execbuf,
> + reverse, reverse, num_engines,
> + &freq);
> + igt_info("Backwards: %.2fms @ %.3fMhz\n", backward * 1e-6, freq);
> +
> + both = __engine_order(i915, &obj[0], &execbuf,
> + engines, reverse, num_engines,
> + &freq);
> + igt_info("Bidirectional: %.2fms @ %.3fMhz\n", both * 1e-6, freq);
> +
> + gem_close(i915, obj[0].handle);
> + gem_close(i915, obj[1].handle);
> + intel_ctx_destroy(i915, ctx);
> +
> + igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
> + igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", max);
> +
> + igt_assert(4 * forward > 3 * backward && 4 * backward > 3 * forward);
> + igt_assert(4 * forward > 3 * both && 4 * both > 3 * forward);
> +}
> +
> static void pm_rps_exit_handler(int sig)
> {
> - if (origfreqs[MIN] > readval(sysfs_files[MAX].filp)) {
> - writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
> - writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
> - } else {
> - writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
> - writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
> + if (sysfs_files[MAX].filp) {
> + if (origfreqs[MIN] > readval(sysfs_files[MAX].filp)) {
> + writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
> + writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
> + } else {
> + writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
> + writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
> + }
> }
>
> if (lh.igt_proc.running)
> @@ -683,6 +924,14 @@ igt_main
> igt_subtest("waitboost")
> waitboost(drm_fd, false);
>
> + igt_describe("Check if the order of fences does not affect waitboosting");
> + igt_subtest("fence-order")
> + fence_order(drm_fd);
> +
> + igt_describe("Check if context reuse does not affect waitboosting");
> + igt_subtest("engine-order")
> + engine_order(drm_fd);
> +
> /* Test boost frequency after GPU reset */
> igt_subtest("reset") {
> igt_hang_t hang = igt_allow_hang(drm_fd, 0, 0);
> --
> 2.25.1
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [igt-dev] [PATCH i-g-t v4] i915/i915_pm_rps: Check impact of fence ordering on waitboosting
2022-07-15 17:30 ` Kamil Konieczny
@ 2022-07-18 9:46 ` Karolina Drobnik
2022-07-25 7:41 ` Karolina Drobnik
2022-07-25 17:06 ` Kamil Konieczny
0 siblings, 2 replies; 10+ messages in thread
From: Karolina Drobnik @ 2022-07-18 9:46 UTC (permalink / raw)
To: Kamil Konieczny; +Cc: igt-dev, Chris Wilson
Hi,
On 15.07.2022 19:30, Kamil Konieczny wrote:
> Hi,
>
> On 2022-07-13 at 12:53:42 +0200, Karolina Drobnik wrote:
>> From: Chris Wilson <chris@chris-wilson.co.uk>
>>
>> Currently the wait boost heuristic is evaluated at the start of each
>> fence wait for a series within dma-resv. There is no strict ordering of
>> fences defined by dma-resv, and so it turns out that the same operation
>> under different circumstances can result in different heuristics being
>> applied, and dramatic performance variations in user applications.
>>
>> Link: https://gitlab.freedesktop.org/drm/intel/-/issues/6284
>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>> Signed-off-by: Karolina Drobnik <karolina.drobnik@intel.com>
>> ---
>> v4:
>> - Added test descriptions with igt_describe()
>> v3:
>> - Removed unnecessary calls to gem_write
>> v2:
>> - Introduced batch_create() with arbitration points to prevent hangs
>> - Added engine-order subtest
>> - Added reporting of GPU frequency
>> - Changed runtimes and switched prints from us to ms
>>
>> tests/i915/i915_pm_rps.c | 261 ++++++++++++++++++++++++++++++++++++++-
>> 1 file changed, 255 insertions(+), 6 deletions(-)
>>
>> diff --git a/tests/i915/i915_pm_rps.c b/tests/i915/i915_pm_rps.c
>> index b37f15c2..6125ea60 100644
>> --- a/tests/i915/i915_pm_rps.c
>> +++ b/tests/i915/i915_pm_rps.c
>> @@ -37,8 +37,10 @@
>> #include <sys/wait.h>
>>
>> #include "i915/gem.h"
>> +#include "i915/gem_create.h"
>> #include "igt.h"
>> #include "igt_dummyload.h"
>> +#include "igt_perf.h"
>> #include "igt_sysfs.h"
>>
>> IGT_TEST_DESCRIPTION("Render P-States tests - verify GPU frequency changes");
>> @@ -614,14 +616,253 @@ static void waitboost(int fd, bool reset)
>> igt_assert_lt(post_freqs[CUR], post_freqs[MAX]);
>> }
>>
>> +static uint32_t batch_create(int i915, uint64_t sz)
>> +{
>> + const uint32_t bbe = MI_BATCH_BUFFER_END;
>> + const uint32_t chk = 0x5 << 23;
>> + uint32_t handle = gem_create(i915, sz);
>> +
>> + for (uint64_t pg = 4096; pg + 4096 < sz; pg += 4096)
>> + gem_write(i915, handle, pg, &chk, sizeof(chk));
>> +
>> + gem_write(i915, handle, sz - sizeof(bbe), &bbe, sizeof(bbe));
>> +
>> + return handle;
>> +}
>> +
>> +static uint64_t __fence_order(int i915,
>> + struct drm_i915_gem_exec_object2 *obj,
>> + struct drm_i915_gem_execbuffer2 *eb,
>> + uint64_t flags0, uint64_t flags1,
>> + double *outf)
>> +{
>> + uint64_t before[2], after[2];
>> + struct timespec tv;
>> + int fd;
>> +
>> + gem_quiescent_gpu(i915);
>> + fd = perf_i915_open(i915, I915_PMU_ACTUAL_FREQUENCY);
>> +
>> + igt_gettime(&tv);
>> +
>> + obj->flags = flags0;
>> + gem_execbuf(i915, eb);
>> +
>> + obj->flags = flags1;
>> + gem_execbuf(i915, eb);
>> +
>> + read(fd, before, sizeof(before));
>> + gem_sync(i915, obj->handle);
>> + read(fd, after, sizeof(after));
>
> imho here is better place to read elapsed time.
We would like to measure time of the whole operation, so this is correct
as it is now
>> + close(fd);
>> +
>> + after[0] -= before[0];
>> + after[1] -= before[1];
>> +
>> + *outf = 1e9 * after[0] / after[1];
>> + return igt_nsec_elapsed(&tv);
>
> See comment above.
>
>> +}
>> +
>> +static void fence_order(int i915)
>> +{
>> + const uint64_t sz = 512ull << 20;
>> + struct drm_i915_gem_exec_object2 obj[2] = {
>> + { .handle = gem_create(i915, 4096) },
>> + { .handle = batch_create(i915, sz) },
>> + };
>> + struct drm_i915_gem_execbuffer2 execbuf = {
>> + .buffers_ptr = to_user_pointer(obj),
>> + .buffer_count = ARRAY_SIZE(obj),
>> + };
>> + uint64_t wr, rw;
>> + int min, max;
>> + double freq;
>> + int sysfs;
>> +
>> + /*
>> + * Check the order of fences found during GEM_WAIT does not affect
>> + * waitboosting.
>> + *
>> + * Internally, implicit fences are tracked within a dma-resv which
>> + * imposes no order on the individually fences tracked within. Since
>> + * there is no defined order, the sequence of waits (and the associated
>> + * waitboosts) is also undefined, undermining the consistency of the
>> + * waitboost heuristic.
>> + *
>> + * In particular, we can influence the sequence of fence storage
>> + * within dma-resv by mixing read/write semantics for implicit fences.
>> + * We can exploit this property of dma-resv to exercise that no matter
>> + * the stored order, the heuristic is applied consistently for the
>> + * user's GEM_WAIT ioctl.
>> + */
>> +
>> + sysfs = igt_sysfs_open(i915);
>> + min = igt_sysfs_get_u32(sysfs, "gt_RPn_freq_mhz");
>> + max = igt_sysfs_get_u32(sysfs, "gt_RP0_freq_mhz");
>> + igt_require(max > min);
>> +
>> + /* Only allow ourselves to upclock via waitboosting */
>> + igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
>> + igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", min);
>> + igt_sysfs_printf(sysfs, "gt_boost_freq_mhz", "%d", max);
>> +
>> + /* Warm up to bind the vma */
>> + __fence_order(i915, &obj[0], &execbuf, 0, 0, &freq);
>> +
>> + wr = __fence_order(i915, &obj[0], &execbuf, EXEC_OBJECT_WRITE, 0, &freq);
>> + igt_info("Write-then-read: %.2fms @ %.3fMHz\n", wr * 1e-6, freq);
>
> imho this should be igt_debug (here and below)
> s/igt_print/igt_debug/
This information is essential in verifying the test results, so igt_info
is required
All the best,
Karolina
> Regards,
> Kamil
>
>> +
>> + rw = __fence_order(i915, &obj[0], &execbuf, 0, EXEC_OBJECT_WRITE, &freq);
>> + igt_info("Read-then-write: %.2fms @ %.3fMHz\n", rw * 1e-6, freq);
>> +
>> + gem_close(i915, obj[0].handle);
>> + gem_close(i915, obj[1].handle);
>> +
>> + igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
>> + igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", max);
>> +
>> + igt_assert(4 * rw > 3 * wr && 4 * wr > 3 * rw);
>> +}
>> +
>> +static uint64_t __engine_order(int i915,
>> + struct drm_i915_gem_exec_object2 *obj,
>> + struct drm_i915_gem_execbuffer2 *eb,
>> + unsigned int *engines0,
>> + unsigned int *engines1,
>> + unsigned int num_engines,
>> + double *outf)
>> +{
>> + uint64_t before[2], after[2];
>> + struct timespec tv;
>> + int fd;
>> +
>> + gem_quiescent_gpu(i915);
>> + fd = perf_i915_open(i915, I915_PMU_ACTUAL_FREQUENCY);
>> +
>> + igt_gettime(&tv);
>> +
>> + obj->flags = EXEC_OBJECT_WRITE;
>> + for (unsigned int n = 0; n < num_engines; n++) {
>> + eb->flags &= ~63ull;
>> + eb->flags |= engines0[n];
>> + gem_execbuf_wr(i915, eb);
>> + }
>> +
>> + obj->flags = 0;
>> + for (unsigned int n = 0; n < num_engines; n++) {
>> + eb->flags &= ~63ull;
>> + eb->flags |= engines1[n];
>> + gem_execbuf(i915, eb);
>> + }
>> +
>> + read(fd, before, sizeof(before));
>> + gem_sync(i915, obj->handle);
>> + read(fd, after, sizeof(after));
>> + close(fd);
>> +
>> + after[0] -= before[0];
>> + after[1] -= before[1];
>> +
>> + *outf = 1e9 * after[0] / after[1];
>> + return igt_nsec_elapsed(&tv);
>> +}
>> +
>> +static void engine_order(int i915)
>> +{
>> + const uint64_t sz = 512ull << 20;
>> + struct drm_i915_gem_exec_object2 obj[2] = {
>> + { .handle = gem_create(i915, 4096) },
>> + { .handle = batch_create(i915, sz) },
>> + };
>> + struct drm_i915_gem_execbuffer2 execbuf = {
>> + .buffers_ptr = to_user_pointer(obj),
>> + .buffer_count = ARRAY_SIZE(obj),
>> + };
>> + const struct intel_execution_engine2 *e;
>> + unsigned int engines[2], reverse[2];
>> + uint64_t forward, backward, both;
>> + unsigned int num_engines;
>> + const intel_ctx_t *ctx;
>> + int min, max;
>> + double freq;
>> + int sysfs;
>> +
>> + /*
>> + * Check the order of fences found during GEM_WAIT does not affect
>> + * waitboosting. (See fence_order())
>> + *
>> + * Another way we can manipulate the order of fences within the
>> + * dma-resv is through repeated use of the same contexts.
>> + */
>> +
>> + num_engines = 0;
>> + ctx = intel_ctx_create_all_physical(i915);
>> + for_each_ctx_engine(i915, ctx, e) {
>> + /*
>> + * Avoid using the cmdparser as it will try to allocate
>> + * a new shadow batch for each submission -> oom
>> + */
>> + if (!gem_engine_has_mutable_submission(i915, e->class))
>> + continue;
>> +
>> + engines[num_engines++] = e->flags;
>> + if (num_engines == ARRAY_SIZE(engines))
>> + break;
>> + }
>> + igt_require(num_engines > 1);
>> + for (unsigned int n = 0; n < num_engines; n++)
>> + reverse[n] = engines[num_engines - n - 1];
>> + execbuf.rsvd1 = ctx->id;
>> +
>> + sysfs = igt_sysfs_open(i915);
>> + min = igt_sysfs_get_u32(sysfs, "gt_RPn_freq_mhz");
>> + max = igt_sysfs_get_u32(sysfs, "gt_RP0_freq_mhz");
>> + igt_require(max > min);
>> +
>> + /* Only allow ourselves to upclock via waitboosting */
>> + igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
>> + igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", min);
>> + igt_sysfs_printf(sysfs, "gt_boost_freq_mhz", "%d", max);
>> +
>> + /* Warm up to bind the vma */
>> + gem_execbuf(i915, &execbuf);
>> +
>> + forward = __engine_order(i915, &obj[0], &execbuf,
>> + engines, engines, num_engines,
>> + &freq);
>> + igt_info("Forwards: %.2fms @ %.3fMhz\n", forward * 1e-6, freq);
>> +
>> + backward = __engine_order(i915, &obj[0], &execbuf,
>> + reverse, reverse, num_engines,
>> + &freq);
>> + igt_info("Backwards: %.2fms @ %.3fMhz\n", backward * 1e-6, freq);
>> +
>> + both = __engine_order(i915, &obj[0], &execbuf,
>> + engines, reverse, num_engines,
>> + &freq);
>> + igt_info("Bidirectional: %.2fms @ %.3fMhz\n", both * 1e-6, freq);
>> +
>> + gem_close(i915, obj[0].handle);
>> + gem_close(i915, obj[1].handle);
>> + intel_ctx_destroy(i915, ctx);
>> +
>> + igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
>> + igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", max);
>> +
>> + igt_assert(4 * forward > 3 * backward && 4 * backward > 3 * forward);
>> + igt_assert(4 * forward > 3 * both && 4 * both > 3 * forward);
>> +}
>> +
>> static void pm_rps_exit_handler(int sig)
>> {
>> - if (origfreqs[MIN] > readval(sysfs_files[MAX].filp)) {
>> - writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
>> - writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
>> - } else {
>> - writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
>> - writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
>> + if (sysfs_files[MAX].filp) {
>> + if (origfreqs[MIN] > readval(sysfs_files[MAX].filp)) {
>> + writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
>> + writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
>> + } else {
>> + writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
>> + writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
>> + }
>> }
>>
>> if (lh.igt_proc.running)
>> @@ -683,6 +924,14 @@ igt_main
>> igt_subtest("waitboost")
>> waitboost(drm_fd, false);
>>
>> + igt_describe("Check if the order of fences does not affect waitboosting");
>> + igt_subtest("fence-order")
>> + fence_order(drm_fd);
>> +
>> + igt_describe("Check if context reuse does not affect waitboosting");
>> + igt_subtest("engine-order")
>> + engine_order(drm_fd);
>> +
>> /* Test boost frequency after GPU reset */
>> igt_subtest("reset") {
>> igt_hang_t hang = igt_allow_hang(drm_fd, 0, 0);
>> --
>> 2.25.1
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [igt-dev] [PATCH i-g-t v4] i915/i915_pm_rps: Check impact of fence ordering on waitboosting
2022-07-18 9:46 ` Karolina Drobnik
@ 2022-07-25 7:41 ` Karolina Drobnik
2022-07-25 17:06 ` Kamil Konieczny
1 sibling, 0 replies; 10+ messages in thread
From: Karolina Drobnik @ 2022-07-25 7:41 UTC (permalink / raw)
To: Kamil Konieczny, Zbigniew Kempczyński; +Cc: igt-dev, Chris Wilson
Hi Kamil and Zbigniew,
Do you have any comments to the patch or my replies? If not, could any
of you r-b this patch?
Many thanks,
Karolina
On 18.07.2022 11:46, Karolina Drobnik wrote:
> Hi,
>
> On 15.07.2022 19:30, Kamil Konieczny wrote:
>> Hi,
>>
>> On 2022-07-13 at 12:53:42 +0200, Karolina Drobnik wrote:
>>> From: Chris Wilson <chris@chris-wilson.co.uk>
>>>
>>> Currently the wait boost heuristic is evaluated at the start of each
>>> fence wait for a series within dma-resv. There is no strict ordering of
>>> fences defined by dma-resv, and so it turns out that the same operation
>>> under different circumstances can result in different heuristics being
>>> applied, and dramatic performance variations in user applications.
>>>
>>> Link: https://gitlab.freedesktop.org/drm/intel/-/issues/6284
>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>> Signed-off-by: Karolina Drobnik <karolina.drobnik@intel.com>
>>> ---
>>> v4:
>>> - Added test descriptions with igt_describe()
>>> v3:
>>> - Removed unnecessary calls to gem_write
>>> v2:
>>> - Introduced batch_create() with arbitration points to prevent hangs
>>> - Added engine-order subtest
>>> - Added reporting of GPU frequency
>>> - Changed runtimes and switched prints from us to ms
>>>
>>> tests/i915/i915_pm_rps.c | 261 ++++++++++++++++++++++++++++++++++++++-
>>> 1 file changed, 255 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/tests/i915/i915_pm_rps.c b/tests/i915/i915_pm_rps.c
>>> index b37f15c2..6125ea60 100644
>>> --- a/tests/i915/i915_pm_rps.c
>>> +++ b/tests/i915/i915_pm_rps.c
>>> @@ -37,8 +37,10 @@
>>> #include <sys/wait.h>
>>>
>>> #include "i915/gem.h"
>>> +#include "i915/gem_create.h"
>>> #include "igt.h"
>>> #include "igt_dummyload.h"
>>> +#include "igt_perf.h"
>>> #include "igt_sysfs.h"
>>>
>>> IGT_TEST_DESCRIPTION("Render P-States tests - verify GPU frequency
>>> changes");
>>> @@ -614,14 +616,253 @@ static void waitboost(int fd, bool reset)
>>> igt_assert_lt(post_freqs[CUR], post_freqs[MAX]);
>>> }
>>>
>>> +static uint32_t batch_create(int i915, uint64_t sz)
>>> +{
>>> + const uint32_t bbe = MI_BATCH_BUFFER_END;
>>> + const uint32_t chk = 0x5 << 23;
>>> + uint32_t handle = gem_create(i915, sz);
>>> +
>>> + for (uint64_t pg = 4096; pg + 4096 < sz; pg += 4096)
>>> + gem_write(i915, handle, pg, &chk, sizeof(chk));
>>> +
>>> + gem_write(i915, handle, sz - sizeof(bbe), &bbe, sizeof(bbe));
>>> +
>>> + return handle;
>>> +}
>>> +
>>> +static uint64_t __fence_order(int i915,
>>> + struct drm_i915_gem_exec_object2 *obj,
>>> + struct drm_i915_gem_execbuffer2 *eb,
>>> + uint64_t flags0, uint64_t flags1,
>>> + double *outf)
>>> +{
>>> + uint64_t before[2], after[2];
>>> + struct timespec tv;
>>> + int fd;
>>> +
>>> + gem_quiescent_gpu(i915);
>>> + fd = perf_i915_open(i915, I915_PMU_ACTUAL_FREQUENCY);
>>> +
>>> + igt_gettime(&tv);
>>> +
>>> + obj->flags = flags0;
>>> + gem_execbuf(i915, eb);
>>> +
>>> + obj->flags = flags1;
>>> + gem_execbuf(i915, eb);
>>> +
>>> + read(fd, before, sizeof(before));
>>> + gem_sync(i915, obj->handle);
>>> + read(fd, after, sizeof(after));
>>
>> imho here is better place to read elapsed time.
>
> We would like to measure time of the whole operation, so this is correct
> as it is now
>
>>> + close(fd);
>>> +
>>> + after[0] -= before[0];
>>> + after[1] -= before[1];
>>> +
>>> + *outf = 1e9 * after[0] / after[1];
>>> + return igt_nsec_elapsed(&tv);
>>
>> See comment above.
>>
>>> +}
>>> +
>>> +static void fence_order(int i915)
>>> +{
>>> + const uint64_t sz = 512ull << 20;
>>> + struct drm_i915_gem_exec_object2 obj[2] = {
>>> + { .handle = gem_create(i915, 4096) },
>>> + { .handle = batch_create(i915, sz) },
>>> + };
>>> + struct drm_i915_gem_execbuffer2 execbuf = {
>>> + .buffers_ptr = to_user_pointer(obj),
>>> + .buffer_count = ARRAY_SIZE(obj),
>>> + };
>>> + uint64_t wr, rw;
>>> + int min, max;
>>> + double freq;
>>> + int sysfs;
>>> +
>>> + /*
>>> + * Check the order of fences found during GEM_WAIT does not affect
>>> + * waitboosting.
>>> + *
>>> + * Internally, implicit fences are tracked within a dma-resv which
>>> + * imposes no order on the individually fences tracked within.
>>> Since
>>> + * there is no defined order, the sequence of waits (and the
>>> associated
>>> + * waitboosts) is also undefined, undermining the consistency of
>>> the
>>> + * waitboost heuristic.
>>> + *
>>> + * In particular, we can influence the sequence of fence storage
>>> + * within dma-resv by mixing read/write semantics for implicit
>>> fences.
>>> + * We can exploit this property of dma-resv to exercise that no
>>> matter
>>> + * the stored order, the heuristic is applied consistently for the
>>> + * user's GEM_WAIT ioctl.
>>> + */
>>> +
>>> + sysfs = igt_sysfs_open(i915);
>>> + min = igt_sysfs_get_u32(sysfs, "gt_RPn_freq_mhz");
>>> + max = igt_sysfs_get_u32(sysfs, "gt_RP0_freq_mhz");
>>> + igt_require(max > min);
>>> +
>>> + /* Only allow ourselves to upclock via waitboosting */
>>> + igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
>>> + igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", min);
>>> + igt_sysfs_printf(sysfs, "gt_boost_freq_mhz", "%d", max);
>>> +
>>> + /* Warm up to bind the vma */
>>> + __fence_order(i915, &obj[0], &execbuf, 0, 0, &freq);
>>> +
>>> + wr = __fence_order(i915, &obj[0], &execbuf, EXEC_OBJECT_WRITE,
>>> 0, &freq);
>>> + igt_info("Write-then-read: %.2fms @ %.3fMHz\n", wr * 1e-6, freq);
>>
>> imho this should be igt_debug (here and below)
>> s/igt_print/igt_debug/
>
> This information is essential in verifying the test results, so igt_info
> is required
>
> All the best,
> Karolina
>
>> Regards,
>> Kamil
>>
>>> +
>>> + rw = __fence_order(i915, &obj[0], &execbuf, 0,
>>> EXEC_OBJECT_WRITE, &freq);
>>> + igt_info("Read-then-write: %.2fms @ %.3fMHz\n", rw * 1e-6, freq);
>>> +
>>> + gem_close(i915, obj[0].handle);
>>> + gem_close(i915, obj[1].handle);
>>> +
>>> + igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
>>> + igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", max);
>>> +
>>> + igt_assert(4 * rw > 3 * wr && 4 * wr > 3 * rw);
>>> +}
>>> +
>>> +static uint64_t __engine_order(int i915,
>>> + struct drm_i915_gem_exec_object2 *obj,
>>> + struct drm_i915_gem_execbuffer2 *eb,
>>> + unsigned int *engines0,
>>> + unsigned int *engines1,
>>> + unsigned int num_engines,
>>> + double *outf)
>>> +{
>>> + uint64_t before[2], after[2];
>>> + struct timespec tv;
>>> + int fd;
>>> +
>>> + gem_quiescent_gpu(i915);
>>> + fd = perf_i915_open(i915, I915_PMU_ACTUAL_FREQUENCY);
>>> +
>>> + igt_gettime(&tv);
>>> +
>>> + obj->flags = EXEC_OBJECT_WRITE;
>>> + for (unsigned int n = 0; n < num_engines; n++) {
>>> + eb->flags &= ~63ull;
>>> + eb->flags |= engines0[n];
>>> + gem_execbuf_wr(i915, eb);
>>> + }
>>> +
>>> + obj->flags = 0;
>>> + for (unsigned int n = 0; n < num_engines; n++) {
>>> + eb->flags &= ~63ull;
>>> + eb->flags |= engines1[n];
>>> + gem_execbuf(i915, eb);
>>> + }
>>> +
>>> + read(fd, before, sizeof(before));
>>> + gem_sync(i915, obj->handle);
>>> + read(fd, after, sizeof(after));
>>> + close(fd);
>>> +
>>> + after[0] -= before[0];
>>> + after[1] -= before[1];
>>> +
>>> + *outf = 1e9 * after[0] / after[1];
>>> + return igt_nsec_elapsed(&tv);
>>> +}
>>> +
>>> +static void engine_order(int i915)
>>> +{
>>> + const uint64_t sz = 512ull << 20;
>>> + struct drm_i915_gem_exec_object2 obj[2] = {
>>> + { .handle = gem_create(i915, 4096) },
>>> + { .handle = batch_create(i915, sz) },
>>> + };
>>> + struct drm_i915_gem_execbuffer2 execbuf = {
>>> + .buffers_ptr = to_user_pointer(obj),
>>> + .buffer_count = ARRAY_SIZE(obj),
>>> + };
>>> + const struct intel_execution_engine2 *e;
>>> + unsigned int engines[2], reverse[2];
>>> + uint64_t forward, backward, both;
>>> + unsigned int num_engines;
>>> + const intel_ctx_t *ctx;
>>> + int min, max;
>>> + double freq;
>>> + int sysfs;
>>> +
>>> + /*
>>> + * Check the order of fences found during GEM_WAIT does not affect
>>> + * waitboosting. (See fence_order())
>>> + *
>>> + * Another way we can manipulate the order of fences within the
>>> + * dma-resv is through repeated use of the same contexts.
>>> + */
>>> +
>>> + num_engines = 0;
>>> + ctx = intel_ctx_create_all_physical(i915);
>>> + for_each_ctx_engine(i915, ctx, e) {
>>> + /*
>>> + * Avoid using the cmdparser as it will try to allocate
>>> + * a new shadow batch for each submission -> oom
>>> + */
>>> + if (!gem_engine_has_mutable_submission(i915, e->class))
>>> + continue;
>>> +
>>> + engines[num_engines++] = e->flags;
>>> + if (num_engines == ARRAY_SIZE(engines))
>>> + break;
>>> + }
>>> + igt_require(num_engines > 1);
>>> + for (unsigned int n = 0; n < num_engines; n++)
>>> + reverse[n] = engines[num_engines - n - 1];
>>> + execbuf.rsvd1 = ctx->id;
>>> +
>>> + sysfs = igt_sysfs_open(i915);
>>> + min = igt_sysfs_get_u32(sysfs, "gt_RPn_freq_mhz");
>>> + max = igt_sysfs_get_u32(sysfs, "gt_RP0_freq_mhz");
>>> + igt_require(max > min);
>>> +
>>> + /* Only allow ourselves to upclock via waitboosting */
>>> + igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
>>> + igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", min);
>>> + igt_sysfs_printf(sysfs, "gt_boost_freq_mhz", "%d", max);
>>> +
>>> + /* Warm up to bind the vma */
>>> + gem_execbuf(i915, &execbuf);
>>> +
>>> + forward = __engine_order(i915, &obj[0], &execbuf,
>>> + engines, engines, num_engines,
>>> + &freq);
>>> + igt_info("Forwards: %.2fms @ %.3fMhz\n", forward * 1e-6, freq);
>>> +
>>> + backward = __engine_order(i915, &obj[0], &execbuf,
>>> + reverse, reverse, num_engines,
>>> + &freq);
>>> + igt_info("Backwards: %.2fms @ %.3fMhz\n", backward * 1e-6, freq);
>>> +
>>> + both = __engine_order(i915, &obj[0], &execbuf,
>>> + engines, reverse, num_engines,
>>> + &freq);
>>> + igt_info("Bidirectional: %.2fms @ %.3fMhz\n", both * 1e-6, freq);
>>> +
>>> + gem_close(i915, obj[0].handle);
>>> + gem_close(i915, obj[1].handle);
>>> + intel_ctx_destroy(i915, ctx);
>>> +
>>> + igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
>>> + igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", max);
>>> +
>>> + igt_assert(4 * forward > 3 * backward && 4 * backward > 3 *
>>> forward);
>>> + igt_assert(4 * forward > 3 * both && 4 * both > 3 * forward);
>>> +}
>>> +
>>> static void pm_rps_exit_handler(int sig)
>>> {
>>> - if (origfreqs[MIN] > readval(sysfs_files[MAX].filp)) {
>>> - writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
>>> - writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
>>> - } else {
>>> - writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
>>> - writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
>>> + if (sysfs_files[MAX].filp) {
>>> + if (origfreqs[MIN] > readval(sysfs_files[MAX].filp)) {
>>> + writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
>>> + writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
>>> + } else {
>>> + writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
>>> + writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
>>> + }
>>> }
>>>
>>> if (lh.igt_proc.running)
>>> @@ -683,6 +924,14 @@ igt_main
>>> igt_subtest("waitboost")
>>> waitboost(drm_fd, false);
>>>
>>> + igt_describe("Check if the order of fences does not affect
>>> waitboosting");
>>> + igt_subtest("fence-order")
>>> + fence_order(drm_fd);
>>> +
>>> + igt_describe("Check if context reuse does not affect
>>> waitboosting");
>>> + igt_subtest("engine-order")
>>> + engine_order(drm_fd);
>>> +
>>> /* Test boost frequency after GPU reset */
>>> igt_subtest("reset") {
>>> igt_hang_t hang = igt_allow_hang(drm_fd, 0, 0);
>>> --
>>> 2.25.1
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [igt-dev] [PATCH i-g-t v4] i915/i915_pm_rps: Check impact of fence ordering on waitboosting
2022-07-18 9:46 ` Karolina Drobnik
2022-07-25 7:41 ` Karolina Drobnik
@ 2022-07-25 17:06 ` Kamil Konieczny
1 sibling, 0 replies; 10+ messages in thread
From: Kamil Konieczny @ 2022-07-25 17:06 UTC (permalink / raw)
To: igt-dev; +Cc: Chris Wilson
Hi,
On 2022-07-18 at 11:46:04 +0200, Karolina Drobnik wrote:
> Hi,
>
> On 15.07.2022 19:30, Kamil Konieczny wrote:
> > Hi,
> >
> > On 2022-07-13 at 12:53:42 +0200, Karolina Drobnik wrote:
> > > From: Chris Wilson <chris@chris-wilson.co.uk>
> > >
> > > Currently the wait boost heuristic is evaluated at the start of each
> > > fence wait for a series within dma-resv. There is no strict ordering of
> > > fences defined by dma-resv, and so it turns out that the same operation
> > > under different circumstances can result in different heuristics being
> > > applied, and dramatic performance variations in user applications.
> > >
> > > Link: https://gitlab.freedesktop.org/drm/intel/-/issues/6284
> > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > > Signed-off-by: Karolina Drobnik <karolina.drobnik@intel.com>
> > > ---
> > > v4:
> > > - Added test descriptions with igt_describe()
> > > v3:
> > > - Removed unnecessary calls to gem_write
> > > v2:
> > > - Introduced batch_create() with arbitration points to prevent hangs
> > > - Added engine-order subtest
> > > - Added reporting of GPU frequency
> > > - Changed runtimes and switched prints from us to ms
> > >
> > > tests/i915/i915_pm_rps.c | 261 ++++++++++++++++++++++++++++++++++++++-
> > > 1 file changed, 255 insertions(+), 6 deletions(-)
> > >
> > > diff --git a/tests/i915/i915_pm_rps.c b/tests/i915/i915_pm_rps.c
> > > index b37f15c2..6125ea60 100644
> > > --- a/tests/i915/i915_pm_rps.c
> > > +++ b/tests/i915/i915_pm_rps.c
> > > @@ -37,8 +37,10 @@
> > > #include <sys/wait.h>
> > >
> > > #include "i915/gem.h"
> > > +#include "i915/gem_create.h"
> > > #include "igt.h"
> > > #include "igt_dummyload.h"
> > > +#include "igt_perf.h"
> > > #include "igt_sysfs.h"
> > >
> > > IGT_TEST_DESCRIPTION("Render P-States tests - verify GPU frequency changes");
> > > @@ -614,14 +616,253 @@ static void waitboost(int fd, bool reset)
> > > igt_assert_lt(post_freqs[CUR], post_freqs[MAX]);
> > > }
> > >
> > > +static uint32_t batch_create(int i915, uint64_t sz)
> > > +{
> > > + const uint32_t bbe = MI_BATCH_BUFFER_END;
> > > + const uint32_t chk = 0x5 << 23;
> > > + uint32_t handle = gem_create(i915, sz);
> > > +
> > > + for (uint64_t pg = 4096; pg + 4096 < sz; pg += 4096)
> > > + gem_write(i915, handle, pg, &chk, sizeof(chk));
> > > +
> > > + gem_write(i915, handle, sz - sizeof(bbe), &bbe, sizeof(bbe));
> > > +
> > > + return handle;
> > > +}
> > > +
> > > +static uint64_t __fence_order(int i915,
> > > + struct drm_i915_gem_exec_object2 *obj,
> > > + struct drm_i915_gem_execbuffer2 *eb,
> > > + uint64_t flags0, uint64_t flags1,
> > > + double *outf)
> > > +{
> > > + uint64_t before[2], after[2];
> > > + struct timespec tv;
> > > + int fd;
> > > +
> > > + gem_quiescent_gpu(i915);
> > > + fd = perf_i915_open(i915, I915_PMU_ACTUAL_FREQUENCY);
> > > +
> > > + igt_gettime(&tv);
> > > +
> > > + obj->flags = flags0;
> > > + gem_execbuf(i915, eb);
> > > +
> > > + obj->flags = flags1;
> > > + gem_execbuf(i915, eb);
> > > +
> > > + read(fd, before, sizeof(before));
> > > + gem_sync(i915, obj->handle);
> > > + read(fd, after, sizeof(after));
> >
> > imho here is better place to read elapsed time.
>
> We would like to measure time of the whole operation, so this is correct as
> it is now
ok.
> > > + close(fd);
> > > +
> > > + after[0] -= before[0];
> > > + after[1] -= before[1];
> > > +
> > > + *outf = 1e9 * after[0] / after[1];
> > > + return igt_nsec_elapsed(&tv);
> >
> > See comment above.
> >
> > > +}
> > > +
> > > +static void fence_order(int i915)
> > > +{
> > > + const uint64_t sz = 512ull << 20;
> > > + struct drm_i915_gem_exec_object2 obj[2] = {
> > > + { .handle = gem_create(i915, 4096) },
> > > + { .handle = batch_create(i915, sz) },
> > > + };
> > > + struct drm_i915_gem_execbuffer2 execbuf = {
> > > + .buffers_ptr = to_user_pointer(obj),
> > > + .buffer_count = ARRAY_SIZE(obj),
> > > + };
> > > + uint64_t wr, rw;
> > > + int min, max;
> > > + double freq;
> > > + int sysfs;
> > > +
> > > + /*
> > > + * Check the order of fences found during GEM_WAIT does not affect
> > > + * waitboosting.
> > > + *
> > > + * Internally, implicit fences are tracked within a dma-resv which
> > > + * imposes no order on the individually fences tracked within. Since
> > > + * there is no defined order, the sequence of waits (and the associated
> > > + * waitboosts) is also undefined, undermining the consistency of the
> > > + * waitboost heuristic.
> > > + *
> > > + * In particular, we can influence the sequence of fence storage
> > > + * within dma-resv by mixing read/write semantics for implicit fences.
> > > + * We can exploit this property of dma-resv to exercise that no matter
> > > + * the stored order, the heuristic is applied consistently for the
> > > + * user's GEM_WAIT ioctl.
> > > + */
> > > +
> > > + sysfs = igt_sysfs_open(i915);
> > > + min = igt_sysfs_get_u32(sysfs, "gt_RPn_freq_mhz");
> > > + max = igt_sysfs_get_u32(sysfs, "gt_RP0_freq_mhz");
> > > + igt_require(max > min);
> > > +
> > > + /* Only allow ourselves to upclock via waitboosting */
> > > + igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
> > > + igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", min);
> > > + igt_sysfs_printf(sysfs, "gt_boost_freq_mhz", "%d", max);
> > > +
> > > + /* Warm up to bind the vma */
> > > + __fence_order(i915, &obj[0], &execbuf, 0, 0, &freq);
> > > +
> > > + wr = __fence_order(i915, &obj[0], &execbuf, EXEC_OBJECT_WRITE, 0, &freq);
> > > + igt_info("Write-then-read: %.2fms @ %.3fMHz\n", wr * 1e-6, freq);
> >
> > imho this should be igt_debug (here and below)
> > s/igt_print/igt_debug/
>
> This information is essential in verifying the test results, so igt_info is
> required
>
> All the best,
> Karolina
Well, verifing is done by igt_asserts, do we intend to look into
logs for verifing ? If asserts fails then previous debugs will
be printed. While at this, maybe it is also worth to print out
min and max freqs.
Regards,
Kamil
> > > +
> > > + rw = __fence_order(i915, &obj[0], &execbuf, 0, EXEC_OBJECT_WRITE, &freq);
> > > + igt_info("Read-then-write: %.2fms @ %.3fMHz\n", rw * 1e-6, freq);
> > > +
> > > + gem_close(i915, obj[0].handle);
> > > + gem_close(i915, obj[1].handle);
> > > +
> > > + igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
> > > + igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", max);
> > > +
> > > + igt_assert(4 * rw > 3 * wr && 4 * wr > 3 * rw);
> > > +}
> > > +
> > > +static uint64_t __engine_order(int i915,
> > > + struct drm_i915_gem_exec_object2 *obj,
> > > + struct drm_i915_gem_execbuffer2 *eb,
> > > + unsigned int *engines0,
> > > + unsigned int *engines1,
> > > + unsigned int num_engines,
> > > + double *outf)
> > > +{
> > > + uint64_t before[2], after[2];
> > > + struct timespec tv;
> > > + int fd;
> > > +
> > > + gem_quiescent_gpu(i915);
> > > + fd = perf_i915_open(i915, I915_PMU_ACTUAL_FREQUENCY);
> > > +
> > > + igt_gettime(&tv);
> > > +
> > > + obj->flags = EXEC_OBJECT_WRITE;
> > > + for (unsigned int n = 0; n < num_engines; n++) {
> > > + eb->flags &= ~63ull;
> > > + eb->flags |= engines0[n];
> > > + gem_execbuf_wr(i915, eb);
> > > + }
> > > +
> > > + obj->flags = 0;
> > > + for (unsigned int n = 0; n < num_engines; n++) {
> > > + eb->flags &= ~63ull;
> > > + eb->flags |= engines1[n];
> > > + gem_execbuf(i915, eb);
> > > + }
> > > +
> > > + read(fd, before, sizeof(before));
> > > + gem_sync(i915, obj->handle);
> > > + read(fd, after, sizeof(after));
> > > + close(fd);
> > > +
> > > + after[0] -= before[0];
> > > + after[1] -= before[1];
> > > +
> > > + *outf = 1e9 * after[0] / after[1];
> > > + return igt_nsec_elapsed(&tv);
> > > +}
> > > +
> > > +static void engine_order(int i915)
> > > +{
> > > + const uint64_t sz = 512ull << 20;
> > > + struct drm_i915_gem_exec_object2 obj[2] = {
> > > + { .handle = gem_create(i915, 4096) },
> > > + { .handle = batch_create(i915, sz) },
> > > + };
> > > + struct drm_i915_gem_execbuffer2 execbuf = {
> > > + .buffers_ptr = to_user_pointer(obj),
> > > + .buffer_count = ARRAY_SIZE(obj),
> > > + };
> > > + const struct intel_execution_engine2 *e;
> > > + unsigned int engines[2], reverse[2];
> > > + uint64_t forward, backward, both;
> > > + unsigned int num_engines;
> > > + const intel_ctx_t *ctx;
> > > + int min, max;
> > > + double freq;
> > > + int sysfs;
> > > +
> > > + /*
> > > + * Check the order of fences found during GEM_WAIT does not affect
> > > + * waitboosting. (See fence_order())
> > > + *
> > > + * Another way we can manipulate the order of fences within the
> > > + * dma-resv is through repeated use of the same contexts.
> > > + */
> > > +
> > > + num_engines = 0;
> > > + ctx = intel_ctx_create_all_physical(i915);
> > > + for_each_ctx_engine(i915, ctx, e) {
> > > + /*
> > > + * Avoid using the cmdparser as it will try to allocate
> > > + * a new shadow batch for each submission -> oom
> > > + */
> > > + if (!gem_engine_has_mutable_submission(i915, e->class))
> > > + continue;
> > > +
> > > + engines[num_engines++] = e->flags;
> > > + if (num_engines == ARRAY_SIZE(engines))
> > > + break;
> > > + }
> > > + igt_require(num_engines > 1);
> > > + for (unsigned int n = 0; n < num_engines; n++)
> > > + reverse[n] = engines[num_engines - n - 1];
> > > + execbuf.rsvd1 = ctx->id;
> > > +
> > > + sysfs = igt_sysfs_open(i915);
> > > + min = igt_sysfs_get_u32(sysfs, "gt_RPn_freq_mhz");
> > > + max = igt_sysfs_get_u32(sysfs, "gt_RP0_freq_mhz");
> > > + igt_require(max > min);
> > > +
> > > + /* Only allow ourselves to upclock via waitboosting */
> > > + igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
> > > + igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", min);
> > > + igt_sysfs_printf(sysfs, "gt_boost_freq_mhz", "%d", max);
> > > +
> > > + /* Warm up to bind the vma */
> > > + gem_execbuf(i915, &execbuf);
> > > +
> > > + forward = __engine_order(i915, &obj[0], &execbuf,
> > > + engines, engines, num_engines,
> > > + &freq);
> > > + igt_info("Forwards: %.2fms @ %.3fMhz\n", forward * 1e-6, freq);
> > > +
> > > + backward = __engine_order(i915, &obj[0], &execbuf,
> > > + reverse, reverse, num_engines,
> > > + &freq);
> > > + igt_info("Backwards: %.2fms @ %.3fMhz\n", backward * 1e-6, freq);
> > > +
> > > + both = __engine_order(i915, &obj[0], &execbuf,
> > > + engines, reverse, num_engines,
> > > + &freq);
> > > + igt_info("Bidirectional: %.2fms @ %.3fMhz\n", both * 1e-6, freq);
> > > +
> > > + gem_close(i915, obj[0].handle);
> > > + gem_close(i915, obj[1].handle);
> > > + intel_ctx_destroy(i915, ctx);
> > > +
> > > + igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
> > > + igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", max);
> > > +
> > > + igt_assert(4 * forward > 3 * backward && 4 * backward > 3 * forward);
> > > + igt_assert(4 * forward > 3 * both && 4 * both > 3 * forward);
> > > +}
> > > +
> > > static void pm_rps_exit_handler(int sig)
> > > {
> > > - if (origfreqs[MIN] > readval(sysfs_files[MAX].filp)) {
> > > - writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
> > > - writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
> > > - } else {
> > > - writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
> > > - writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
> > > + if (sysfs_files[MAX].filp) {
> > > + if (origfreqs[MIN] > readval(sysfs_files[MAX].filp)) {
> > > + writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
> > > + writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
> > > + } else {
> > > + writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
> > > + writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
> > > + }
> > > }
> > >
> > > if (lh.igt_proc.running)
> > > @@ -683,6 +924,14 @@ igt_main
> > > igt_subtest("waitboost")
> > > waitboost(drm_fd, false);
> > >
> > > + igt_describe("Check if the order of fences does not affect waitboosting");
> > > + igt_subtest("fence-order")
> > > + fence_order(drm_fd);
> > > +
> > > + igt_describe("Check if context reuse does not affect waitboosting");
> > > + igt_subtest("engine-order")
> > > + engine_order(drm_fd);
> > > +
> > > /* Test boost frequency after GPU reset */
> > > igt_subtest("reset") {
> > > igt_hang_t hang = igt_allow_hang(drm_fd, 0, 0);
> > > --
> > > 2.25.1
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [igt-dev] [PATCH i-g-t v4] i915/i915_pm_rps: Check impact of fence ordering on waitboosting
2022-07-13 10:53 [igt-dev] [PATCH i-g-t v4] i915/i915_pm_rps: Check impact of fence ordering on waitboosting Karolina Drobnik
2022-07-15 17:30 ` Kamil Konieczny
@ 2022-07-26 10:49 ` Kamil Konieczny
2022-07-27 12:27 ` Karolina Drobnik
2022-07-26 14:18 ` Kamil Konieczny
2 siblings, 1 reply; 10+ messages in thread
From: Kamil Konieczny @ 2022-07-26 10:49 UTC (permalink / raw)
To: igt-dev; +Cc: Chris Wilson
Hi Karolina,
few more nits, see below.
On 2022-07-13 at 12:53:42 +0200, Karolina Drobnik wrote:
> From: Chris Wilson <chris@chris-wilson.co.uk>
>
> Currently the wait boost heuristic is evaluated at the start of each
> fence wait for a series within dma-resv. There is no strict ordering of
> fences defined by dma-resv, and so it turns out that the same operation
> under different circumstances can result in different heuristics being
> applied, and dramatic performance variations in user applications.
>
> Link: https://gitlab.freedesktop.org/drm/intel/-/issues/6284
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Signed-off-by: Karolina Drobnik <karolina.drobnik@intel.com>
> ---
> v4:
> - Added test descriptions with igt_describe()
> v3:
> - Removed unnecessary calls to gem_write
> v2:
> - Introduced batch_create() with arbitration points to prevent hangs
> - Added engine-order subtest
> - Added reporting of GPU frequency
> - Changed runtimes and switched prints from us to ms
>
> tests/i915/i915_pm_rps.c | 261 ++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 255 insertions(+), 6 deletions(-)
>
> diff --git a/tests/i915/i915_pm_rps.c b/tests/i915/i915_pm_rps.c
> index b37f15c2..6125ea60 100644
> --- a/tests/i915/i915_pm_rps.c
> +++ b/tests/i915/i915_pm_rps.c
> @@ -37,8 +37,10 @@
> #include <sys/wait.h>
>
> #include "i915/gem.h"
> +#include "i915/gem_create.h"
> #include "igt.h"
> #include "igt_dummyload.h"
> +#include "igt_perf.h"
> #include "igt_sysfs.h"
>
> IGT_TEST_DESCRIPTION("Render P-States tests - verify GPU frequency changes");
> @@ -614,14 +616,253 @@ static void waitboost(int fd, bool reset)
> igt_assert_lt(post_freqs[CUR], post_freqs[MAX]);
> }
>
> +static uint32_t batch_create(int i915, uint64_t sz)
> +{
> + const uint32_t bbe = MI_BATCH_BUFFER_END;
> + const uint32_t chk = 0x5 << 23;
> + uint32_t handle = gem_create(i915, sz);
> +
> + for (uint64_t pg = 4096; pg + 4096 < sz; pg += 4096)
> + gem_write(i915, handle, pg, &chk, sizeof(chk));
> +
> + gem_write(i915, handle, sz - sizeof(bbe), &bbe, sizeof(bbe));
> +
> + return handle;
> +}
> +
> +static uint64_t __fence_order(int i915,
> + struct drm_i915_gem_exec_object2 *obj,
> + struct drm_i915_gem_execbuffer2 *eb,
> + uint64_t flags0, uint64_t flags1,
> + double *outf)
> +{
> + uint64_t before[2], after[2];
> + struct timespec tv;
> + int fd;
> +
> + gem_quiescent_gpu(i915);
> + fd = perf_i915_open(i915, I915_PMU_ACTUAL_FREQUENCY);
> +
> + igt_gettime(&tv);
> +
> + obj->flags = flags0;
> + gem_execbuf(i915, eb);
> +
> + obj->flags = flags1;
> + gem_execbuf(i915, eb);
> +
> + read(fd, before, sizeof(before));
> + gem_sync(i915, obj->handle);
> + read(fd, after, sizeof(after));
> + close(fd);
> +
> + after[0] -= before[0];
> + after[1] -= before[1];
Put igt_assert here (check for zero before divide).
> +
> + *outf = 1e9 * after[0] / after[1];
> + return igt_nsec_elapsed(&tv);
> +}
> +
> +static void fence_order(int i915)
> +{
> + const uint64_t sz = 512ull << 20;
> + struct drm_i915_gem_exec_object2 obj[2] = {
> + { .handle = gem_create(i915, 4096) },
> + { .handle = batch_create(i915, sz) },
> + };
> + struct drm_i915_gem_execbuffer2 execbuf = {
> + .buffers_ptr = to_user_pointer(obj),
> + .buffer_count = ARRAY_SIZE(obj),
> + };
> + uint64_t wr, rw;
> + int min, max;
> + double freq;
> + int sysfs;
> +
> + /*
> + * Check the order of fences found during GEM_WAIT does not affect
> + * waitboosting.
> + *
> + * Internally, implicit fences are tracked within a dma-resv which
> + * imposes no order on the individually fences tracked within. Since
> + * there is no defined order, the sequence of waits (and the associated
> + * waitboosts) is also undefined, undermining the consistency of the
> + * waitboost heuristic.
> + *
> + * In particular, we can influence the sequence of fence storage
> + * within dma-resv by mixing read/write semantics for implicit fences.
> + * We can exploit this property of dma-resv to exercise that no matter
> + * the stored order, the heuristic is applied consistently for the
> + * user's GEM_WAIT ioctl.
> + */
> +
> + sysfs = igt_sysfs_open(i915);
> + min = igt_sysfs_get_u32(sysfs, "gt_RPn_freq_mhz");
> + max = igt_sysfs_get_u32(sysfs, "gt_RP0_freq_mhz");
> + igt_require(max > min);
> +
> + /* Only allow ourselves to upclock via waitboosting */
> + igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
> + igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", min);
> + igt_sysfs_printf(sysfs, "gt_boost_freq_mhz", "%d", max);
> +
> + /* Warm up to bind the vma */
> + __fence_order(i915, &obj[0], &execbuf, 0, 0, &freq);
> +
> + wr = __fence_order(i915, &obj[0], &execbuf, EXEC_OBJECT_WRITE, 0, &freq);
> + igt_info("Write-then-read: %.2fms @ %.3fMHz\n", wr * 1e-6, freq);
> +
> + rw = __fence_order(i915, &obj[0], &execbuf, 0, EXEC_OBJECT_WRITE, &freq);
> + igt_info("Read-then-write: %.2fms @ %.3fMHz\n", rw * 1e-6, freq);
> +
> + gem_close(i915, obj[0].handle);
> + gem_close(i915, obj[1].handle);
Close sysfs here.
> +
> + igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
> + igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", max);
Here we should put previous values, not those readed from RPx,
but if they are always the same it is ok.
> +
> + igt_assert(4 * rw > 3 * wr && 4 * wr > 3 * rw);
> +}
> +
> +static uint64_t __engine_order(int i915,
> + struct drm_i915_gem_exec_object2 *obj,
> + struct drm_i915_gem_execbuffer2 *eb,
> + unsigned int *engines0,
> + unsigned int *engines1,
> + unsigned int num_engines,
> + double *outf)
> +{
> + uint64_t before[2], after[2];
> + struct timespec tv;
> + int fd;
> +
> + gem_quiescent_gpu(i915);
> + fd = perf_i915_open(i915, I915_PMU_ACTUAL_FREQUENCY);
> +
> + igt_gettime(&tv);
> +
> + obj->flags = EXEC_OBJECT_WRITE;
> + for (unsigned int n = 0; n < num_engines; n++) {
> + eb->flags &= ~63ull;
> + eb->flags |= engines0[n];
> + gem_execbuf_wr(i915, eb);
> + }
> +
> + obj->flags = 0;
> + for (unsigned int n = 0; n < num_engines; n++) {
> + eb->flags &= ~63ull;
> + eb->flags |= engines1[n];
> + gem_execbuf(i915, eb);
> + }
> +
> + read(fd, before, sizeof(before));
> + gem_sync(i915, obj->handle);
> + read(fd, after, sizeof(after));
> + close(fd);
> +
> + after[0] -= before[0];
> + after[1] -= before[1];
Put igt_assert here (check for zero before divide).
> +
> + *outf = 1e9 * after[0] / after[1];
> + return igt_nsec_elapsed(&tv);
> +}
> +
> +static void engine_order(int i915)
> +{
> + const uint64_t sz = 512ull << 20;
> + struct drm_i915_gem_exec_object2 obj[2] = {
> + { .handle = gem_create(i915, 4096) },
> + { .handle = batch_create(i915, sz) },
> + };
> + struct drm_i915_gem_execbuffer2 execbuf = {
> + .buffers_ptr = to_user_pointer(obj),
> + .buffer_count = ARRAY_SIZE(obj),
> + };
> + const struct intel_execution_engine2 *e;
> + unsigned int engines[2], reverse[2];
> + uint64_t forward, backward, both;
> + unsigned int num_engines;
> + const intel_ctx_t *ctx;
> + int min, max;
> + double freq;
> + int sysfs;
> +
> + /*
> + * Check the order of fences found during GEM_WAIT does not affect
> + * waitboosting. (See fence_order())
> + *
> + * Another way we can manipulate the order of fences within the
> + * dma-resv is through repeated use of the same contexts.
> + */
> +
> + num_engines = 0;
> + ctx = intel_ctx_create_all_physical(i915);
> + for_each_ctx_engine(i915, ctx, e) {
> + /*
> + * Avoid using the cmdparser as it will try to allocate
> + * a new shadow batch for each submission -> oom
> + */
> + if (!gem_engine_has_mutable_submission(i915, e->class))
> + continue;
> +
> + engines[num_engines++] = e->flags;
> + if (num_engines == ARRAY_SIZE(engines))
> + break;
> + }
> + igt_require(num_engines > 1);
> + for (unsigned int n = 0; n < num_engines; n++)
> + reverse[n] = engines[num_engines - n - 1];
> + execbuf.rsvd1 = ctx->id;
> +
> + sysfs = igt_sysfs_open(i915);
> + min = igt_sysfs_get_u32(sysfs, "gt_RPn_freq_mhz");
> + max = igt_sysfs_get_u32(sysfs, "gt_RP0_freq_mhz");
> + igt_require(max > min);
> +
> + /* Only allow ourselves to upclock via waitboosting */
> + igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
> + igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", min);
> + igt_sysfs_printf(sysfs, "gt_boost_freq_mhz", "%d", max);
> +
> + /* Warm up to bind the vma */
> + gem_execbuf(i915, &execbuf);
> +
> + forward = __engine_order(i915, &obj[0], &execbuf,
> + engines, engines, num_engines,
> + &freq);
> + igt_info("Forwards: %.2fms @ %.3fMhz\n", forward * 1e-6, freq);
> +
> + backward = __engine_order(i915, &obj[0], &execbuf,
> + reverse, reverse, num_engines,
> + &freq);
> + igt_info("Backwards: %.2fms @ %.3fMhz\n", backward * 1e-6, freq);
> +
> + both = __engine_order(i915, &obj[0], &execbuf,
> + engines, reverse, num_engines,
> + &freq);
> + igt_info("Bidirectional: %.2fms @ %.3fMhz\n", both * 1e-6, freq);
> +
> + gem_close(i915, obj[0].handle);
> + gem_close(i915, obj[1].handle);
Close sysfs here.
> + intel_ctx_destroy(i915, ctx);
> +
> + igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
> + igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", max);
> +
> + igt_assert(4 * forward > 3 * backward && 4 * backward > 3 * forward);
> + igt_assert(4 * forward > 3 * both && 4 * both > 3 * forward);
> +}
> +
> static void pm_rps_exit_handler(int sig)
> {
> - if (origfreqs[MIN] > readval(sysfs_files[MAX].filp)) {
> - writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
> - writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
> - } else {
> - writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
> - writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
> + if (sysfs_files[MAX].filp) {
> + if (origfreqs[MIN] > readval(sysfs_files[MAX].filp)) {
> + writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
> + writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
> + } else {
> + writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
> + writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
> + }
This change looks unrelated. Please split this as separate patch
and post as a fix.
Regards,
Kamil
> }
>
> if (lh.igt_proc.running)
> @@ -683,6 +924,14 @@ igt_main
> igt_subtest("waitboost")
> waitboost(drm_fd, false);
>
> + igt_describe("Check if the order of fences does not affect waitboosting");
> + igt_subtest("fence-order")
> + fence_order(drm_fd);
> +
> + igt_describe("Check if context reuse does not affect waitboosting");
> + igt_subtest("engine-order")
> + engine_order(drm_fd);
> +
> /* Test boost frequency after GPU reset */
> igt_subtest("reset") {
> igt_hang_t hang = igt_allow_hang(drm_fd, 0, 0);
> --
> 2.25.1
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [igt-dev] [PATCH i-g-t v4] i915/i915_pm_rps: Check impact of fence ordering on waitboosting
2022-07-13 10:53 [igt-dev] [PATCH i-g-t v4] i915/i915_pm_rps: Check impact of fence ordering on waitboosting Karolina Drobnik
2022-07-15 17:30 ` Kamil Konieczny
2022-07-26 10:49 ` Kamil Konieczny
@ 2022-07-26 14:18 ` Kamil Konieczny
2 siblings, 0 replies; 10+ messages in thread
From: Kamil Konieczny @ 2022-07-26 14:18 UTC (permalink / raw)
To: igt-dev; +Cc: Chris Wilson
Hi Karolina,
On 2022-07-13 at 12:53:42 +0200, Karolina Drobnik wrote:
> From: Chris Wilson <chris@chris-wilson.co.uk>
>
> Currently the wait boost heuristic is evaluated at the start of each
> fence wait for a series within dma-resv. There is no strict ordering of
> fences defined by dma-resv, and so it turns out that the same operation
> under different circumstances can result in different heuristics being
> applied, and dramatic performance variations in user applications.
>
> Link: https://gitlab.freedesktop.org/drm/intel/-/issues/6284
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Signed-off-by: Karolina Drobnik <karolina.drobnik@intel.com>
> ---
> v4:
> - Added test descriptions with igt_describe()
> v3:
> - Removed unnecessary calls to gem_write
> v2:
> - Introduced batch_create() with arbitration points to prevent hangs
> - Added engine-order subtest
> - Added reporting of GPU frequency
> - Changed runtimes and switched prints from us to ms
>
> tests/i915/i915_pm_rps.c | 261 ++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 255 insertions(+), 6 deletions(-)
>
> diff --git a/tests/i915/i915_pm_rps.c b/tests/i915/i915_pm_rps.c
> index b37f15c2..6125ea60 100644
> --- a/tests/i915/i915_pm_rps.c
> +++ b/tests/i915/i915_pm_rps.c
> @@ -37,8 +37,10 @@
> #include <sys/wait.h>
>
> #include "i915/gem.h"
> +#include "i915/gem_create.h"
> #include "igt.h"
> #include "igt_dummyload.h"
> +#include "igt_perf.h"
> #include "igt_sysfs.h"
>
> IGT_TEST_DESCRIPTION("Render P-States tests - verify GPU frequency changes");
> @@ -614,14 +616,253 @@ static void waitboost(int fd, bool reset)
> igt_assert_lt(post_freqs[CUR], post_freqs[MAX]);
> }
>
> +static uint32_t batch_create(int i915, uint64_t sz)
> +{
> + const uint32_t bbe = MI_BATCH_BUFFER_END;
> + const uint32_t chk = 0x5 << 23;
> + uint32_t handle = gem_create(i915, sz);
> +
> + for (uint64_t pg = 4096; pg + 4096 < sz; pg += 4096)
> + gem_write(i915, handle, pg, &chk, sizeof(chk));
> +
> + gem_write(i915, handle, sz - sizeof(bbe), &bbe, sizeof(bbe));
> +
> + return handle;
> +}
> +
> +static uint64_t __fence_order(int i915,
> + struct drm_i915_gem_exec_object2 *obj,
> + struct drm_i915_gem_execbuffer2 *eb,
> + uint64_t flags0, uint64_t flags1,
> + double *outf)
> +{
> + uint64_t before[2], after[2];
> + struct timespec tv;
> + int fd;
> +
> + gem_quiescent_gpu(i915);
> + fd = perf_i915_open(i915, I915_PMU_ACTUAL_FREQUENCY);
> +
> + igt_gettime(&tv);
> +
> + obj->flags = flags0;
> + gem_execbuf(i915, eb);
> +
> + obj->flags = flags1;
> + gem_execbuf(i915, eb);
> +
> + read(fd, before, sizeof(before));
> + gem_sync(i915, obj->handle);
> + read(fd, after, sizeof(after));
> + close(fd);
> +
> + after[0] -= before[0];
> + after[1] -= before[1];
> +
> + *outf = 1e9 * after[0] / after[1];
> + return igt_nsec_elapsed(&tv);
> +}
> +
> +static void fence_order(int i915)
> +{
> + const uint64_t sz = 512ull << 20;
I tested this on one machine with drm-tip and test lasted for
around 300 sec, can we lower this to 128 MB ? It will run
around 15s, about 20 times faster.
Regards,
Kamil
> + struct drm_i915_gem_exec_object2 obj[2] = {
> + { .handle = gem_create(i915, 4096) },
> + { .handle = batch_create(i915, sz) },
> + };
> + struct drm_i915_gem_execbuffer2 execbuf = {
> + .buffers_ptr = to_user_pointer(obj),
> + .buffer_count = ARRAY_SIZE(obj),
> + };
> + uint64_t wr, rw;
> + int min, max;
> + double freq;
> + int sysfs;
> +
> + /*
> + * Check the order of fences found during GEM_WAIT does not affect
> + * waitboosting.
> + *
> + * Internally, implicit fences are tracked within a dma-resv which
> + * imposes no order on the individually fences tracked within. Since
> + * there is no defined order, the sequence of waits (and the associated
> + * waitboosts) is also undefined, undermining the consistency of the
> + * waitboost heuristic.
> + *
> + * In particular, we can influence the sequence of fence storage
> + * within dma-resv by mixing read/write semantics for implicit fences.
> + * We can exploit this property of dma-resv to exercise that no matter
> + * the stored order, the heuristic is applied consistently for the
> + * user's GEM_WAIT ioctl.
> + */
> +
> + sysfs = igt_sysfs_open(i915);
> + min = igt_sysfs_get_u32(sysfs, "gt_RPn_freq_mhz");
> + max = igt_sysfs_get_u32(sysfs, "gt_RP0_freq_mhz");
> + igt_require(max > min);
> +
> + /* Only allow ourselves to upclock via waitboosting */
> + igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
> + igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", min);
> + igt_sysfs_printf(sysfs, "gt_boost_freq_mhz", "%d", max);
> +
> + /* Warm up to bind the vma */
> + __fence_order(i915, &obj[0], &execbuf, 0, 0, &freq);
> +
> + wr = __fence_order(i915, &obj[0], &execbuf, EXEC_OBJECT_WRITE, 0, &freq);
> + igt_info("Write-then-read: %.2fms @ %.3fMHz\n", wr * 1e-6, freq);
> +
> + rw = __fence_order(i915, &obj[0], &execbuf, 0, EXEC_OBJECT_WRITE, &freq);
> + igt_info("Read-then-write: %.2fms @ %.3fMHz\n", rw * 1e-6, freq);
> +
> + gem_close(i915, obj[0].handle);
> + gem_close(i915, obj[1].handle);
> +
> + igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
> + igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", max);
> +
> + igt_assert(4 * rw > 3 * wr && 4 * wr > 3 * rw);
> +}
> +
> +static uint64_t __engine_order(int i915,
> + struct drm_i915_gem_exec_object2 *obj,
> + struct drm_i915_gem_execbuffer2 *eb,
> + unsigned int *engines0,
> + unsigned int *engines1,
> + unsigned int num_engines,
> + double *outf)
> +{
> + uint64_t before[2], after[2];
> + struct timespec tv;
> + int fd;
> +
> + gem_quiescent_gpu(i915);
> + fd = perf_i915_open(i915, I915_PMU_ACTUAL_FREQUENCY);
> +
> + igt_gettime(&tv);
> +
> + obj->flags = EXEC_OBJECT_WRITE;
> + for (unsigned int n = 0; n < num_engines; n++) {
> + eb->flags &= ~63ull;
> + eb->flags |= engines0[n];
> + gem_execbuf_wr(i915, eb);
> + }
> +
> + obj->flags = 0;
> + for (unsigned int n = 0; n < num_engines; n++) {
> + eb->flags &= ~63ull;
> + eb->flags |= engines1[n];
> + gem_execbuf(i915, eb);
> + }
> +
> + read(fd, before, sizeof(before));
> + gem_sync(i915, obj->handle);
> + read(fd, after, sizeof(after));
> + close(fd);
> +
> + after[0] -= before[0];
> + after[1] -= before[1];
> +
> + *outf = 1e9 * after[0] / after[1];
> + return igt_nsec_elapsed(&tv);
> +}
> +
> +static void engine_order(int i915)
> +{
> + const uint64_t sz = 512ull << 20;
> + struct drm_i915_gem_exec_object2 obj[2] = {
> + { .handle = gem_create(i915, 4096) },
> + { .handle = batch_create(i915, sz) },
> + };
> + struct drm_i915_gem_execbuffer2 execbuf = {
> + .buffers_ptr = to_user_pointer(obj),
> + .buffer_count = ARRAY_SIZE(obj),
> + };
> + const struct intel_execution_engine2 *e;
> + unsigned int engines[2], reverse[2];
> + uint64_t forward, backward, both;
> + unsigned int num_engines;
> + const intel_ctx_t *ctx;
> + int min, max;
> + double freq;
> + int sysfs;
> +
> + /*
> + * Check the order of fences found during GEM_WAIT does not affect
> + * waitboosting. (See fence_order())
> + *
> + * Another way we can manipulate the order of fences within the
> + * dma-resv is through repeated use of the same contexts.
> + */
> +
> + num_engines = 0;
> + ctx = intel_ctx_create_all_physical(i915);
> + for_each_ctx_engine(i915, ctx, e) {
> + /*
> + * Avoid using the cmdparser as it will try to allocate
> + * a new shadow batch for each submission -> oom
> + */
> + if (!gem_engine_has_mutable_submission(i915, e->class))
> + continue;
> +
> + engines[num_engines++] = e->flags;
> + if (num_engines == ARRAY_SIZE(engines))
> + break;
> + }
> + igt_require(num_engines > 1);
> + for (unsigned int n = 0; n < num_engines; n++)
> + reverse[n] = engines[num_engines - n - 1];
> + execbuf.rsvd1 = ctx->id;
> +
> + sysfs = igt_sysfs_open(i915);
> + min = igt_sysfs_get_u32(sysfs, "gt_RPn_freq_mhz");
> + max = igt_sysfs_get_u32(sysfs, "gt_RP0_freq_mhz");
> + igt_require(max > min);
> +
> + /* Only allow ourselves to upclock via waitboosting */
> + igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
> + igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", min);
> + igt_sysfs_printf(sysfs, "gt_boost_freq_mhz", "%d", max);
> +
> + /* Warm up to bind the vma */
> + gem_execbuf(i915, &execbuf);
> +
> + forward = __engine_order(i915, &obj[0], &execbuf,
> + engines, engines, num_engines,
> + &freq);
> + igt_info("Forwards: %.2fms @ %.3fMhz\n", forward * 1e-6, freq);
> +
> + backward = __engine_order(i915, &obj[0], &execbuf,
> + reverse, reverse, num_engines,
> + &freq);
> + igt_info("Backwards: %.2fms @ %.3fMhz\n", backward * 1e-6, freq);
> +
> + both = __engine_order(i915, &obj[0], &execbuf,
> + engines, reverse, num_engines,
> + &freq);
> + igt_info("Bidirectional: %.2fms @ %.3fMhz\n", both * 1e-6, freq);
> +
> + gem_close(i915, obj[0].handle);
> + gem_close(i915, obj[1].handle);
> + intel_ctx_destroy(i915, ctx);
> +
> + igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
> + igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", max);
> +
> + igt_assert(4 * forward > 3 * backward && 4 * backward > 3 * forward);
> + igt_assert(4 * forward > 3 * both && 4 * both > 3 * forward);
> +}
> +
> static void pm_rps_exit_handler(int sig)
> {
> - if (origfreqs[MIN] > readval(sysfs_files[MAX].filp)) {
> - writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
> - writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
> - } else {
> - writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
> - writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
> + if (sysfs_files[MAX].filp) {
> + if (origfreqs[MIN] > readval(sysfs_files[MAX].filp)) {
> + writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
> + writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
> + } else {
> + writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
> + writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
> + }
> }
>
> if (lh.igt_proc.running)
> @@ -683,6 +924,14 @@ igt_main
> igt_subtest("waitboost")
> waitboost(drm_fd, false);
>
> + igt_describe("Check if the order of fences does not affect waitboosting");
> + igt_subtest("fence-order")
> + fence_order(drm_fd);
> +
> + igt_describe("Check if context reuse does not affect waitboosting");
> + igt_subtest("engine-order")
> + engine_order(drm_fd);
> +
> /* Test boost frequency after GPU reset */
> igt_subtest("reset") {
> igt_hang_t hang = igt_allow_hang(drm_fd, 0, 0);
> --
> 2.25.1
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [igt-dev] [PATCH i-g-t v4] i915/i915_pm_rps: Check impact of fence ordering on waitboosting
2022-07-26 10:49 ` Kamil Konieczny
@ 2022-07-27 12:27 ` Karolina Drobnik
[not found] ` <YuIeu2vSAFE8YnB3@kamilkon-desk1>
0 siblings, 1 reply; 10+ messages in thread
From: Karolina Drobnik @ 2022-07-27 12:27 UTC (permalink / raw)
To: Kamil Konieczny; +Cc: igt-dev, Chris Wilson
Hi,
On 26.07.2022 12:49, Kamil Konieczny wrote:
> Hi Karolina,
>
> few more nits, see below.
>
> On 2022-07-13 at 12:53:42 +0200, Karolina Drobnik wrote:
>> From: Chris Wilson <chris@chris-wilson.co.uk>
>>
>> Currently the wait boost heuristic is evaluated at the start of each
>> fence wait for a series within dma-resv. There is no strict ordering of
>> fences defined by dma-resv, and so it turns out that the same operation
>> under different circumstances can result in different heuristics being
>> applied, and dramatic performance variations in user applications.
>>
>> Link: https://gitlab.freedesktop.org/drm/intel/-/issues/6284
>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>> Signed-off-by: Karolina Drobnik <karolina.drobnik@intel.com>
>> ---
>> v4:
>> - Added test descriptions with igt_describe()
>> v3:
>> - Removed unnecessary calls to gem_write
>> v2:
>> - Introduced batch_create() with arbitration points to prevent hangs
>> - Added engine-order subtest
>> - Added reporting of GPU frequency
>> - Changed runtimes and switched prints from us to ms
>>
>> tests/i915/i915_pm_rps.c | 261 ++++++++++++++++++++++++++++++++++++++-
>> 1 file changed, 255 insertions(+), 6 deletions(-)
>>
>> diff --git a/tests/i915/i915_pm_rps.c b/tests/i915/i915_pm_rps.c
>> index b37f15c2..6125ea60 100644
>> --- a/tests/i915/i915_pm_rps.c
>> +++ b/tests/i915/i915_pm_rps.c
>> @@ -37,8 +37,10 @@
>> #include <sys/wait.h>
>>
>> #include "i915/gem.h"
>> +#include "i915/gem_create.h"
>> #include "igt.h"
>> #include "igt_dummyload.h"
>> +#include "igt_perf.h"
>> #include "igt_sysfs.h"
>>
>> IGT_TEST_DESCRIPTION("Render P-States tests - verify GPU frequency changes");
>> @@ -614,14 +616,253 @@ static void waitboost(int fd, bool reset)
>> igt_assert_lt(post_freqs[CUR], post_freqs[MAX]);
>> }
>>
>> +static uint32_t batch_create(int i915, uint64_t sz)
>> +{
>> + const uint32_t bbe = MI_BATCH_BUFFER_END;
>> + const uint32_t chk = 0x5 << 23;
>> + uint32_t handle = gem_create(i915, sz);
>> +
>> + for (uint64_t pg = 4096; pg + 4096 < sz; pg += 4096)
>> + gem_write(i915, handle, pg, &chk, sizeof(chk));
>> +
>> + gem_write(i915, handle, sz - sizeof(bbe), &bbe, sizeof(bbe));
>> +
>> + return handle;
>> +}
>> +
>> +static uint64_t __fence_order(int i915,
>> + struct drm_i915_gem_exec_object2 *obj,
>> + struct drm_i915_gem_execbuffer2 *eb,
>> + uint64_t flags0, uint64_t flags1,
>> + double *outf)
>> +{
>> + uint64_t before[2], after[2];
>> + struct timespec tv;
>> + int fd;
>> +
>> + gem_quiescent_gpu(i915);
>> + fd = perf_i915_open(i915, I915_PMU_ACTUAL_FREQUENCY);
>> +
>> + igt_gettime(&tv);
>> +
>> + obj->flags = flags0;
>> + gem_execbuf(i915, eb);
>> +
>> + obj->flags = flags1;
>> + gem_execbuf(i915, eb);
>> +
>> + read(fd, before, sizeof(before));
>> + gem_sync(i915, obj->handle);
>> + read(fd, after, sizeof(after));
>> + close(fd);
>> +
>> + after[0] -= before[0];
>> + after[1] -= before[1];
>
> Put igt_assert here (check for zero before divide).
Hmm, isn't that more of a belts and suspenders approach? It's unlikely
we'd get zero here. Also, don't we put asserts towards the end of the
test whenever possible?
>> +
>> + *outf = 1e9 * after[0] / after[1];
>> + return igt_nsec_elapsed(&tv);
>> +}
>> +
>> +static void fence_order(int i915)
>> +{
>> + const uint64_t sz = 512ull << 20;
>> + struct drm_i915_gem_exec_object2 obj[2] = {
>> + { .handle = gem_create(i915, 4096) },
>> + { .handle = batch_create(i915, sz) },
>> + };
>> + struct drm_i915_gem_execbuffer2 execbuf = {
>> + .buffers_ptr = to_user_pointer(obj),
>> + .buffer_count = ARRAY_SIZE(obj),
>> + };
>> + uint64_t wr, rw;
>> + int min, max;
>> + double freq;
>> + int sysfs;
>> +
>> + /*
>> + * Check the order of fences found during GEM_WAIT does not affect
>> + * waitboosting.
>> + *
>> + * Internally, implicit fences are tracked within a dma-resv which
>> + * imposes no order on the individually fences tracked within. Since
>> + * there is no defined order, the sequence of waits (and the associated
>> + * waitboosts) is also undefined, undermining the consistency of the
>> + * waitboost heuristic.
>> + *
>> + * In particular, we can influence the sequence of fence storage
>> + * within dma-resv by mixing read/write semantics for implicit fences.
>> + * We can exploit this property of dma-resv to exercise that no matter
>> + * the stored order, the heuristic is applied consistently for the
>> + * user's GEM_WAIT ioctl.
>> + */
>> +
>> + sysfs = igt_sysfs_open(i915);
>> + min = igt_sysfs_get_u32(sysfs, "gt_RPn_freq_mhz");
>> + max = igt_sysfs_get_u32(sysfs, "gt_RP0_freq_mhz");
>> + igt_require(max > min);
>> +
>> + /* Only allow ourselves to upclock via waitboosting */
>> + igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
>> + igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", min);
>> + igt_sysfs_printf(sysfs, "gt_boost_freq_mhz", "%d", max);
>> +
>> + /* Warm up to bind the vma */
>> + __fence_order(i915, &obj[0], &execbuf, 0, 0, &freq);
>> +
>> + wr = __fence_order(i915, &obj[0], &execbuf, EXEC_OBJECT_WRITE, 0, &freq);
>> + igt_info("Write-then-read: %.2fms @ %.3fMHz\n", wr * 1e-6, freq);
>> +
>> + rw = __fence_order(i915, &obj[0], &execbuf, 0, EXEC_OBJECT_WRITE, &freq);
>> + igt_info("Read-then-write: %.2fms @ %.3fMHz\n", rw * 1e-6, freq);
>> +
>> + gem_close(i915, obj[0].handle);
>> + gem_close(i915, obj[1].handle);
>
> Close sysfs here.
Good catch. I'll close it before asserts (as agreed).
>> +
>> + igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
>> + igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", max);
>
> Here we should put previous values, not those readed from RPx,
> but if they are always the same it is ok.
We restore the values we read from sysfs
>> +
>> + igt_assert(4 * rw > 3 * wr && 4 * wr > 3 * rw);
>> +}
>> +
>> +static uint64_t __engine_order(int i915,
>> + struct drm_i915_gem_exec_object2 *obj,
>> + struct drm_i915_gem_execbuffer2 *eb,
>> + unsigned int *engines0,
>> + unsigned int *engines1,
>> + unsigned int num_engines,
>> + double *outf)
>> +{
>> + uint64_t before[2], after[2];
>> + struct timespec tv;
>> + int fd;
>> +
>> + gem_quiescent_gpu(i915);
>> + fd = perf_i915_open(i915, I915_PMU_ACTUAL_FREQUENCY);
>> +
>> + igt_gettime(&tv);
>> +
>> + obj->flags = EXEC_OBJECT_WRITE;
>> + for (unsigned int n = 0; n < num_engines; n++) {
>> + eb->flags &= ~63ull;
>> + eb->flags |= engines0[n];
>> + gem_execbuf_wr(i915, eb);
>> + }
>> +
>> + obj->flags = 0;
>> + for (unsigned int n = 0; n < num_engines; n++) {
>> + eb->flags &= ~63ull;
>> + eb->flags |= engines1[n];
>> + gem_execbuf(i915, eb);
>> + }
>> +
>> + read(fd, before, sizeof(before));
>> + gem_sync(i915, obj->handle);
>> + read(fd, after, sizeof(after));
>> + close(fd);
>> +
>> + after[0] -= before[0];
>> + after[1] -= before[1];
>
> Put igt_assert here (check for zero before divide).
>
>> +
>> + *outf = 1e9 * after[0] / after[1];
>> + return igt_nsec_elapsed(&tv);
>> +}
>> +
>> +static void engine_order(int i915)
>> +{
>> + const uint64_t sz = 512ull << 20;
>> + struct drm_i915_gem_exec_object2 obj[2] = {
>> + { .handle = gem_create(i915, 4096) },
>> + { .handle = batch_create(i915, sz) },
>> + };
>> + struct drm_i915_gem_execbuffer2 execbuf = {
>> + .buffers_ptr = to_user_pointer(obj),
>> + .buffer_count = ARRAY_SIZE(obj),
>> + };
>> + const struct intel_execution_engine2 *e;
>> + unsigned int engines[2], reverse[2];
>> + uint64_t forward, backward, both;
>> + unsigned int num_engines;
>> + const intel_ctx_t *ctx;
>> + int min, max;
>> + double freq;
>> + int sysfs;
>> +
>> + /*
>> + * Check the order of fences found during GEM_WAIT does not affect
>> + * waitboosting. (See fence_order())
>> + *
>> + * Another way we can manipulate the order of fences within the
>> + * dma-resv is through repeated use of the same contexts.
>> + */
>> +
>> + num_engines = 0;
>> + ctx = intel_ctx_create_all_physical(i915);
>> + for_each_ctx_engine(i915, ctx, e) {
>> + /*
>> + * Avoid using the cmdparser as it will try to allocate
>> + * a new shadow batch for each submission -> oom
>> + */
>> + if (!gem_engine_has_mutable_submission(i915, e->class))
>> + continue;
>> +
>> + engines[num_engines++] = e->flags;
>> + if (num_engines == ARRAY_SIZE(engines))
>> + break;
>> + }
>> + igt_require(num_engines > 1);
>> + for (unsigned int n = 0; n < num_engines; n++)
>> + reverse[n] = engines[num_engines - n - 1];
>> + execbuf.rsvd1 = ctx->id;
>> +
>> + sysfs = igt_sysfs_open(i915);
>> + min = igt_sysfs_get_u32(sysfs, "gt_RPn_freq_mhz");
>> + max = igt_sysfs_get_u32(sysfs, "gt_RP0_freq_mhz");
>> + igt_require(max > min);
>> +
>> + /* Only allow ourselves to upclock via waitboosting */
>> + igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
>> + igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", min);
>> + igt_sysfs_printf(sysfs, "gt_boost_freq_mhz", "%d", max);
>> +
>> + /* Warm up to bind the vma */
>> + gem_execbuf(i915, &execbuf);
>> +
>> + forward = __engine_order(i915, &obj[0], &execbuf,
>> + engines, engines, num_engines,
>> + &freq);
>> + igt_info("Forwards: %.2fms @ %.3fMhz\n", forward * 1e-6, freq);
>> +
>> + backward = __engine_order(i915, &obj[0], &execbuf,
>> + reverse, reverse, num_engines,
>> + &freq);
>> + igt_info("Backwards: %.2fms @ %.3fMhz\n", backward * 1e-6, freq);
>> +
>> + both = __engine_order(i915, &obj[0], &execbuf,
>> + engines, reverse, num_engines,
>> + &freq);
>> + igt_info("Bidirectional: %.2fms @ %.3fMhz\n", both * 1e-6, freq);
>> +
>> + gem_close(i915, obj[0].handle);
>> + gem_close(i915, obj[1].handle);
>
> Close sysfs here.
Ack
>
>> + intel_ctx_destroy(i915, ctx);
>> +
>> + igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
>> + igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", max);
>> +
>> + igt_assert(4 * forward > 3 * backward && 4 * backward > 3 * forward);
>> + igt_assert(4 * forward > 3 * both && 4 * both > 3 * forward);
>> +}
>> +
>> static void pm_rps_exit_handler(int sig)
>> {
>> - if (origfreqs[MIN] > readval(sysfs_files[MAX].filp)) {
>> - writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
>> - writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
>> - } else {
>> - writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
>> - writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
>> + if (sysfs_files[MAX].filp) {
>> + if (origfreqs[MIN] > readval(sysfs_files[MAX].filp)) {
>> + writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
>> + writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
>> + } else {
>> + writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
>> + writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
>> + }
>
> This change looks unrelated. Please split this as separate patch
> and post as a fix.
Will do so
Many thanks,
Karolina
> Regards,
> Kamil
>
>> }
>>
>> if (lh.igt_proc.running)
>> @@ -683,6 +924,14 @@ igt_main
>> igt_subtest("waitboost")
>> waitboost(drm_fd, false);
>>
>> + igt_describe("Check if the order of fences does not affect waitboosting");
>> + igt_subtest("fence-order")
>> + fence_order(drm_fd);
>> +
>> + igt_describe("Check if context reuse does not affect waitboosting");
>> + igt_subtest("engine-order")
>> + engine_order(drm_fd);
>> +
>> /* Test boost frequency after GPU reset */
>> igt_subtest("reset") {
>> igt_hang_t hang = igt_allow_hang(drm_fd, 0, 0);
>> --
>> 2.25.1
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [igt-dev] [PATCH i-g-t v4] i915/i915_pm_rps: Check impact of fence ordering on waitboosting
[not found] ` <YuIeu2vSAFE8YnB3@kamilkon-desk1>
@ 2022-07-28 14:04 ` Karolina Drobnik
2022-07-28 14:12 ` Chris Wilson
1 sibling, 0 replies; 10+ messages in thread
From: Karolina Drobnik @ 2022-07-28 14:04 UTC (permalink / raw)
To: Kamil Konieczny; +Cc: igt-dev, Chris Wilson
Hi,
On 28.07.2022 07:29, Kamil Konieczny wrote:
> Hi,
>
> On 2022-07-27 at 14:27:02 +0200, Karolina Drobnik wrote:
>> Hi,
>>
>> On 26.07.2022 12:49, Kamil Konieczny wrote:
>>> Hi Karolina,
>>>
>>> few more nits, see below.
>>>
>>> On 2022-07-13 at 12:53:42 +0200, Karolina Drobnik wrote:
>>>> From: Chris Wilson <chris@chris-wilson.co.uk>
>>>>
>>>> Currently the wait boost heuristic is evaluated at the start of each
>>>> fence wait for a series within dma-resv. There is no strict ordering of
>>>> fences defined by dma-resv, and so it turns out that the same operation
>>>> under different circumstances can result in different heuristics being
>>>> applied, and dramatic performance variations in user applications.
>>>>
>>>> Link: https://gitlab.freedesktop.org/drm/intel/-/issues/6284
>>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>>> Signed-off-by: Karolina Drobnik <karolina.drobnik@intel.com>
>>>> ---
>>>> v4:
>>>> - Added test descriptions with igt_describe()
>>>> v3:
>>>> - Removed unnecessary calls to gem_write
>>>> v2:
>>>> - Introduced batch_create() with arbitration points to prevent hangs
>>>> - Added engine-order subtest
>>>> - Added reporting of GPU frequency
>>>> - Changed runtimes and switched prints from us to ms
>>>>
>>>> tests/i915/i915_pm_rps.c | 261 ++++++++++++++++++++++++++++++++++++++-
>>>> 1 file changed, 255 insertions(+), 6 deletions(-)
>>>>
>>>> diff --git a/tests/i915/i915_pm_rps.c b/tests/i915/i915_pm_rps.c
>>>> index b37f15c2..6125ea60 100644
>>>> --- a/tests/i915/i915_pm_rps.c
>>>> +++ b/tests/i915/i915_pm_rps.c
>>>> @@ -37,8 +37,10 @@
>>>> #include <sys/wait.h>
>>>>
>>>> #include "i915/gem.h"
>>>> +#include "i915/gem_create.h"
>>>> #include "igt.h"
>>>> #include "igt_dummyload.h"
>>>> +#include "igt_perf.h"
>>>> #include "igt_sysfs.h"
>>>>
>>>> IGT_TEST_DESCRIPTION("Render P-States tests - verify GPU frequency changes");
>>>> @@ -614,14 +616,253 @@ static void waitboost(int fd, bool reset)
>>>> igt_assert_lt(post_freqs[CUR], post_freqs[MAX]);
>>>> }
>>>>
>>>> +static uint32_t batch_create(int i915, uint64_t sz)
>>>> +{
>>>> + const uint32_t bbe = MI_BATCH_BUFFER_END;
>>>> + const uint32_t chk = 0x5 << 23;
>>>> + uint32_t handle = gem_create(i915, sz);
>>>> +
>>>> + for (uint64_t pg = 4096; pg + 4096 < sz; pg += 4096)
>>>> + gem_write(i915, handle, pg, &chk, sizeof(chk));
>>>> +
>>>> + gem_write(i915, handle, sz - sizeof(bbe), &bbe, sizeof(bbe));
>>>> +
>>>> + return handle;
>>>> +}
>>>> +
>>>> +static uint64_t __fence_order(int i915,
>>>> + struct drm_i915_gem_exec_object2 *obj,
>>>> + struct drm_i915_gem_execbuffer2 *eb,
>>>> + uint64_t flags0, uint64_t flags1,
>>>> + double *outf)
>>>> +{
>>>> + uint64_t before[2], after[2];
>>>> + struct timespec tv;
>>>> + int fd;
>>>> +
>>>> + gem_quiescent_gpu(i915);
>>>> + fd = perf_i915_open(i915, I915_PMU_ACTUAL_FREQUENCY);
>>>> +
>>>> + igt_gettime(&tv);
>>>> +
>>>> + obj->flags = flags0;
>>>> + gem_execbuf(i915, eb);
>>>> +
>>>> + obj->flags = flags1;
>>>> + gem_execbuf(i915, eb);
>>>> +
>>>> + read(fd, before, sizeof(before));
>>>> + gem_sync(i915, obj->handle);
>>>> + read(fd, after, sizeof(after));
>>>> + close(fd);
>>>> +
>>>> + after[0] -= before[0];
>>>> + after[1] -= before[1];
>>>
>>> Put igt_assert here (check for zero before divide).
>>
>> Hmm, isn't that more of a belts and suspenders approach? It's unlikely we'd
>> get zero here. Also, don't we put asserts towards the end of the test
>> whenever possible?
>
> It is userspace so we can afford some extra checks, so maybe
> if (after[1] == 0)
> return 0;
I agree that it won't cost much, but I just wonder if we're going to end
up in such situation.
>>>> +
>>>> + *outf = 1e9 * after[0] / after[1];
>>>> + return igt_nsec_elapsed(&tv);
>>>> +}
>>>> +
>>>> +static void fence_order(int i915)
>>>> +{
>>>> + const uint64_t sz = 512ull << 20;
>>>> + struct drm_i915_gem_exec_object2 obj[2] = {
>>>> + { .handle = gem_create(i915, 4096) },
>>>> + { .handle = batch_create(i915, sz) },
>>>> + };
>>>> + struct drm_i915_gem_execbuffer2 execbuf = {
>>>> + .buffers_ptr = to_user_pointer(obj),
>>>> + .buffer_count = ARRAY_SIZE(obj),
>>>> + };
>>>> + uint64_t wr, rw;
>>>> + int min, max;
>>>> + double freq;
>>>> + int sysfs;
>>>> +
>>>> + /*
>>>> + * Check the order of fences found during GEM_WAIT does not affect
>>>> + * waitboosting.
>>>> + *
>>>> + * Internally, implicit fences are tracked within a dma-resv which
>>>> + * imposes no order on the individually fences tracked within. Since
>>>> + * there is no defined order, the sequence of waits (and the associated
>>>> + * waitboosts) is also undefined, undermining the consistency of the
>>>> + * waitboost heuristic.
>>>> + *
>>>> + * In particular, we can influence the sequence of fence storage
>>>> + * within dma-resv by mixing read/write semantics for implicit fences.
>>>> + * We can exploit this property of dma-resv to exercise that no matter
>>>> + * the stored order, the heuristic is applied consistently for the
>>>> + * user's GEM_WAIT ioctl.
>>>> + */
>>>> +
>>>> + sysfs = igt_sysfs_open(i915);
>>>> + min = igt_sysfs_get_u32(sysfs, "gt_RPn_freq_mhz");
>>>> + max = igt_sysfs_get_u32(sysfs, "gt_RP0_freq_mhz");
>>>> + igt_require(max > min);
>>>> +
>>>> + /* Only allow ourselves to upclock via waitboosting */
>>>> + igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
>>>> + igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", min);
>>>> + igt_sysfs_printf(sysfs, "gt_boost_freq_mhz", "%d", max);
>>>> +
>>>> + /* Warm up to bind the vma */
>>>> + __fence_order(i915, &obj[0], &execbuf, 0, 0, &freq);
>>>> +
>>>> + wr = __fence_order(i915, &obj[0], &execbuf, EXEC_OBJECT_WRITE, 0, &freq);
>>>> + igt_info("Write-then-read: %.2fms @ %.3fMHz\n", wr * 1e-6, freq);
>>>> +
>>>> + rw = __fence_order(i915, &obj[0], &execbuf, 0, EXEC_OBJECT_WRITE, &freq);
>>>> + igt_info("Read-then-write: %.2fms @ %.3fMHz\n", rw * 1e-6, freq);
>>>> +
>>>> + gem_close(i915, obj[0].handle);
>>>> + gem_close(i915, obj[1].handle);
>>>
>>> Close sysfs here.
>>
>> Good catch. I'll close it before asserts (as agreed).
>>
>
> My bad, this close should be after resetting max freq, two lines
> below. Btw why not reuse code from other tests, reading old freqs
> at beginnig of test or reusing freqs readed at fixture in
> igt_main ?
No worries. Hm, I'd say it's because of how the fixture is likely to
change in the future.
>>>> +
>>>> + igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
>>>> + igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", max);
>>>
>>> Here we should put previous values, not those readed from RPx,
>>> but if they are always the same it is ok.
>>
>> We restore the values we read from sysfs
>>
>
> But we readed RPx values, not gt_min|gt_max freq ones, so here
> you can use origfreqs[MIN] and origfreqs[MAX] or declare in test
>
> int old_freqs[NUMFREQ];
>
> and read them with read_freqs(old_freqs) and then use for restore.
I'd keep it as it is
Many thanks,
Karolina
> Regards,
> Kamil
>
>>>> +
>>>> + igt_assert(4 * rw > 3 * wr && 4 * wr > 3 * rw);
>>>> +}
>>>> +
>>>> +static uint64_t __engine_order(int i915,
>>>> + struct drm_i915_gem_exec_object2 *obj,
>>>> + struct drm_i915_gem_execbuffer2 *eb,
>>>> + unsigned int *engines0,
>>>> + unsigned int *engines1,
>>>> + unsigned int num_engines,
>>>> + double *outf)
>>>> +{
>>>> + uint64_t before[2], after[2];
>>>> + struct timespec tv;
>>>> + int fd;
>>>> +
>>>> + gem_quiescent_gpu(i915);
>>>> + fd = perf_i915_open(i915, I915_PMU_ACTUAL_FREQUENCY);
>>>> +
>>>> + igt_gettime(&tv);
>>>> +
>>>> + obj->flags = EXEC_OBJECT_WRITE;
>>>> + for (unsigned int n = 0; n < num_engines; n++) {
>>>> + eb->flags &= ~63ull;
>>>> + eb->flags |= engines0[n];
>>>> + gem_execbuf_wr(i915, eb);
>>>> + }
>>>> +
>>>> + obj->flags = 0;
>>>> + for (unsigned int n = 0; n < num_engines; n++) {
>>>> + eb->flags &= ~63ull;
>>>> + eb->flags |= engines1[n];
>>>> + gem_execbuf(i915, eb);
>>>> + }
>>>> +
>>>> + read(fd, before, sizeof(before));
>>>> + gem_sync(i915, obj->handle);
>>>> + read(fd, after, sizeof(after));
>>>> + close(fd);
>>>> +
>>>> + after[0] -= before[0];
>>>> + after[1] -= before[1];
>>>
>>> Put igt_assert here (check for zero before divide).
>>>
>>>> +
>>>> + *outf = 1e9 * after[0] / after[1];
>>>> + return igt_nsec_elapsed(&tv);
>>>> +}
>>>> +
>>>> +static void engine_order(int i915)
>>>> +{
>>>> + const uint64_t sz = 512ull << 20;
>>>> + struct drm_i915_gem_exec_object2 obj[2] = {
>>>> + { .handle = gem_create(i915, 4096) },
>>>> + { .handle = batch_create(i915, sz) },
>>>> + };
>>>> + struct drm_i915_gem_execbuffer2 execbuf = {
>>>> + .buffers_ptr = to_user_pointer(obj),
>>>> + .buffer_count = ARRAY_SIZE(obj),
>>>> + };
>>>> + const struct intel_execution_engine2 *e;
>>>> + unsigned int engines[2], reverse[2];
>>>> + uint64_t forward, backward, both;
>>>> + unsigned int num_engines;
>>>> + const intel_ctx_t *ctx;
>>>> + int min, max;
>>>> + double freq;
>>>> + int sysfs;
>>>> +
>>>> + /*
>>>> + * Check the order of fences found during GEM_WAIT does not affect
>>>> + * waitboosting. (See fence_order())
>>>> + *
>>>> + * Another way we can manipulate the order of fences within the
>>>> + * dma-resv is through repeated use of the same contexts.
>>>> + */
>>>> +
>>>> + num_engines = 0;
>>>> + ctx = intel_ctx_create_all_physical(i915);
>>>> + for_each_ctx_engine(i915, ctx, e) {
>>>> + /*
>>>> + * Avoid using the cmdparser as it will try to allocate
>>>> + * a new shadow batch for each submission -> oom
>>>> + */
>>>> + if (!gem_engine_has_mutable_submission(i915, e->class))
>>>> + continue;
>>>> +
>>>> + engines[num_engines++] = e->flags;
>>>> + if (num_engines == ARRAY_SIZE(engines))
>>>> + break;
>>>> + }
>>>> + igt_require(num_engines > 1);
>>>> + for (unsigned int n = 0; n < num_engines; n++)
>>>> + reverse[n] = engines[num_engines - n - 1];
>>>> + execbuf.rsvd1 = ctx->id;
>>>> +
>>>> + sysfs = igt_sysfs_open(i915);
>>>> + min = igt_sysfs_get_u32(sysfs, "gt_RPn_freq_mhz");
>>>> + max = igt_sysfs_get_u32(sysfs, "gt_RP0_freq_mhz");
>>>> + igt_require(max > min);
>>>> +
>>>> + /* Only allow ourselves to upclock via waitboosting */
>>>> + igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
>>>> + igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", min);
>>>> + igt_sysfs_printf(sysfs, "gt_boost_freq_mhz", "%d", max);
>>>> +
>>>> + /* Warm up to bind the vma */
>>>> + gem_execbuf(i915, &execbuf);
>>>> +
>>>> + forward = __engine_order(i915, &obj[0], &execbuf,
>>>> + engines, engines, num_engines,
>>>> + &freq);
>>>> + igt_info("Forwards: %.2fms @ %.3fMhz\n", forward * 1e-6, freq);
>>>> +
>>>> + backward = __engine_order(i915, &obj[0], &execbuf,
>>>> + reverse, reverse, num_engines,
>>>> + &freq);
>>>> + igt_info("Backwards: %.2fms @ %.3fMhz\n", backward * 1e-6, freq);
>>>> +
>>>> + both = __engine_order(i915, &obj[0], &execbuf,
>>>> + engines, reverse, num_engines,
>>>> + &freq);
>>>> + igt_info("Bidirectional: %.2fms @ %.3fMhz\n", both * 1e-6, freq);
>>>> +
>>>> + gem_close(i915, obj[0].handle);
>>>> + gem_close(i915, obj[1].handle);
>>>
>>> Close sysfs here.
>>
>> Ack
>>
>>>
>>>> + intel_ctx_destroy(i915, ctx);
>>>> +
>>>> + igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
>>>> + igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", max);
>>>> +
>>>> + igt_assert(4 * forward > 3 * backward && 4 * backward > 3 * forward);
>>>> + igt_assert(4 * forward > 3 * both && 4 * both > 3 * forward);
>>>> +}
>>>> +
>>>> static void pm_rps_exit_handler(int sig)
>>>> {
>>>> - if (origfreqs[MIN] > readval(sysfs_files[MAX].filp)) {
>>>> - writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
>>>> - writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
>>>> - } else {
>>>> - writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
>>>> - writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
>>>> + if (sysfs_files[MAX].filp) {
>>>> + if (origfreqs[MIN] > readval(sysfs_files[MAX].filp)) {
>>>> + writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
>>>> + writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
>>>> + } else {
>>>> + writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
>>>> + writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
>>>> + }
>>>
>>> This change looks unrelated. Please split this as separate patch
>>> and post as a fix.
>>
>> Will do so
>>
>> Many thanks,
>> Karolina
>>
>>> Regards,
>>> Kamil
>>>
>>>> }
>>>>
>>>> if (lh.igt_proc.running)
>>>> @@ -683,6 +924,14 @@ igt_main
>>>> igt_subtest("waitboost")
>>>> waitboost(drm_fd, false);
>>>>
>>>> + igt_describe("Check if the order of fences does not affect waitboosting");
>>>> + igt_subtest("fence-order")
>>>> + fence_order(drm_fd);
>>>> +
>>>> + igt_describe("Check if context reuse does not affect waitboosting");
>>>> + igt_subtest("engine-order")
>>>> + engine_order(drm_fd);
>>>> +
>>>> /* Test boost frequency after GPU reset */
>>>> igt_subtest("reset") {
>>>> igt_hang_t hang = igt_allow_hang(drm_fd, 0, 0);
>>>> --
>>>> 2.25.1
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [igt-dev] [PATCH i-g-t v4] i915/i915_pm_rps: Check impact of fence ordering on waitboosting
[not found] ` <YuIeu2vSAFE8YnB3@kamilkon-desk1>
2022-07-28 14:04 ` Karolina Drobnik
@ 2022-07-28 14:12 ` Chris Wilson
1 sibling, 0 replies; 10+ messages in thread
From: Chris Wilson @ 2022-07-28 14:12 UTC (permalink / raw)
To: Kamil Konieczny, igt-dev
Quoting Kamil Konieczny (2022-07-28 06:29:31)
> Hi,
>
> On 2022-07-27 at 14:27:02 +0200, Karolina Drobnik wrote:
> > Hi,
> >
> > On 26.07.2022 12:49, Kamil Konieczny wrote:
> > > Hi Karolina,
> > >
> > > few more nits, see below.
> > >
> > > On 2022-07-13 at 12:53:42 +0200, Karolina Drobnik wrote:
> > > > From: Chris Wilson <chris@chris-wilson.co.uk>
> > > >
> > > > Currently the wait boost heuristic is evaluated at the start of each
> > > > fence wait for a series within dma-resv. There is no strict ordering of
> > > > fences defined by dma-resv, and so it turns out that the same operation
> > > > under different circumstances can result in different heuristics being
> > > > applied, and dramatic performance variations in user applications.
> > > >
> > > > Link: https://gitlab.freedesktop.org/drm/intel/-/issues/6284
> > > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > > > Signed-off-by: Karolina Drobnik <karolina.drobnik@intel.com>
> > > > ---
> > > > v4:
> > > > - Added test descriptions with igt_describe()
> > > > v3:
> > > > - Removed unnecessary calls to gem_write
> > > > v2:
> > > > - Introduced batch_create() with arbitration points to prevent hangs
> > > > - Added engine-order subtest
> > > > - Added reporting of GPU frequency
> > > > - Changed runtimes and switched prints from us to ms
> > > >
> > > > tests/i915/i915_pm_rps.c | 261 ++++++++++++++++++++++++++++++++++++++-
> > > > 1 file changed, 255 insertions(+), 6 deletions(-)
> > > >
> > > > diff --git a/tests/i915/i915_pm_rps.c b/tests/i915/i915_pm_rps.c
> > > > index b37f15c2..6125ea60 100644
> > > > --- a/tests/i915/i915_pm_rps.c
> > > > +++ b/tests/i915/i915_pm_rps.c
> > > > @@ -37,8 +37,10 @@
> > > > #include <sys/wait.h>
> > > >
> > > > #include "i915/gem.h"
> > > > +#include "i915/gem_create.h"
> > > > #include "igt.h"
> > > > #include "igt_dummyload.h"
> > > > +#include "igt_perf.h"
> > > > #include "igt_sysfs.h"
> > > >
> > > > IGT_TEST_DESCRIPTION("Render P-States tests - verify GPU frequency changes");
> > > > @@ -614,14 +616,253 @@ static void waitboost(int fd, bool reset)
> > > > igt_assert_lt(post_freqs[CUR], post_freqs[MAX]);
> > > > }
> > > >
> > > > +static uint32_t batch_create(int i915, uint64_t sz)
> > > > +{
> > > > + const uint32_t bbe = MI_BATCH_BUFFER_END;
> > > > + const uint32_t chk = 0x5 << 23;
> > > > + uint32_t handle = gem_create(i915, sz);
> > > > +
> > > > + for (uint64_t pg = 4096; pg + 4096 < sz; pg += 4096)
> > > > + gem_write(i915, handle, pg, &chk, sizeof(chk));
> > > > +
> > > > + gem_write(i915, handle, sz - sizeof(bbe), &bbe, sizeof(bbe));
> > > > +
> > > > + return handle;
> > > > +}
> > > > +
> > > > +static uint64_t __fence_order(int i915,
> > > > + struct drm_i915_gem_exec_object2 *obj,
> > > > + struct drm_i915_gem_execbuffer2 *eb,
> > > > + uint64_t flags0, uint64_t flags1,
> > > > + double *outf)
> > > > +{
> > > > + uint64_t before[2], after[2];
> > > > + struct timespec tv;
> > > > + int fd;
> > > > +
> > > > + gem_quiescent_gpu(i915);
> > > > + fd = perf_i915_open(i915, I915_PMU_ACTUAL_FREQUENCY);
> > > > +
> > > > + igt_gettime(&tv);
> > > > +
> > > > + obj->flags = flags0;
> > > > + gem_execbuf(i915, eb);
> > > > +
> > > > + obj->flags = flags1;
> > > > + gem_execbuf(i915, eb);
> > > > +
> > > > + read(fd, before, sizeof(before));
> > > > + gem_sync(i915, obj->handle);
> > > > + read(fd, after, sizeof(after));
> > > > + close(fd);
> > > > +
> > > > + after[0] -= before[0];
> > > > + after[1] -= before[1];
> > >
> > > Put igt_assert here (check for zero before divide).
> >
> > Hmm, isn't that more of a belts and suspenders approach? It's unlikely we'd
> > get zero here. Also, don't we put asserts towards the end of the test
> > whenever possible?
>
> It is userspace so we can afford some extra checks, so maybe
> if (after[1] == 0)
> return 0;
> >
> > > > +
> > > > + *outf = 1e9 * after[0] / after[1];
> > > > + return igt_nsec_elapsed(&tv);
> > > > +}
> > > > +
> > > > +static void fence_order(int i915)
> > > > +{
> > > > + const uint64_t sz = 512ull << 20;
> > > > + struct drm_i915_gem_exec_object2 obj[2] = {
> > > > + { .handle = gem_create(i915, 4096) },
> > > > + { .handle = batch_create(i915, sz) },
> > > > + };
> > > > + struct drm_i915_gem_execbuffer2 execbuf = {
> > > > + .buffers_ptr = to_user_pointer(obj),
> > > > + .buffer_count = ARRAY_SIZE(obj),
> > > > + };
> > > > + uint64_t wr, rw;
> > > > + int min, max;
> > > > + double freq;
> > > > + int sysfs;
> > > > +
> > > > + /*
> > > > + * Check the order of fences found during GEM_WAIT does not affect
> > > > + * waitboosting.
> > > > + *
> > > > + * Internally, implicit fences are tracked within a dma-resv which
> > > > + * imposes no order on the individually fences tracked within. Since
> > > > + * there is no defined order, the sequence of waits (and the associated
> > > > + * waitboosts) is also undefined, undermining the consistency of the
> > > > + * waitboost heuristic.
> > > > + *
> > > > + * In particular, we can influence the sequence of fence storage
> > > > + * within dma-resv by mixing read/write semantics for implicit fences.
> > > > + * We can exploit this property of dma-resv to exercise that no matter
> > > > + * the stored order, the heuristic is applied consistently for the
> > > > + * user's GEM_WAIT ioctl.
> > > > + */
> > > > +
> > > > + sysfs = igt_sysfs_open(i915);
> > > > + min = igt_sysfs_get_u32(sysfs, "gt_RPn_freq_mhz");
> > > > + max = igt_sysfs_get_u32(sysfs, "gt_RP0_freq_mhz");
> > > > + igt_require(max > min);
> > > > +
> > > > + /* Only allow ourselves to upclock via waitboosting */
> > > > + igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
> > > > + igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", min);
> > > > + igt_sysfs_printf(sysfs, "gt_boost_freq_mhz", "%d", max);
> > > > +
> > > > + /* Warm up to bind the vma */
> > > > + __fence_order(i915, &obj[0], &execbuf, 0, 0, &freq);
> > > > +
> > > > + wr = __fence_order(i915, &obj[0], &execbuf, EXEC_OBJECT_WRITE, 0, &freq);
> > > > + igt_info("Write-then-read: %.2fms @ %.3fMHz\n", wr * 1e-6, freq);
> > > > +
> > > > + rw = __fence_order(i915, &obj[0], &execbuf, 0, EXEC_OBJECT_WRITE, &freq);
> > > > + igt_info("Read-then-write: %.2fms @ %.3fMHz\n", rw * 1e-6, freq);
> > > > +
> > > > + gem_close(i915, obj[0].handle);
> > > > + gem_close(i915, obj[1].handle);
> > >
> > > Close sysfs here.
> >
> > Good catch. I'll close it before asserts (as agreed).
> >
>
> My bad, this close should be after resetting max freq, two lines
> below. Btw why not reuse code from other tests, reading old freqs
> at beginnig of test or reusing freqs readed at fixture in
> igt_main ?
>
> > > > +
> > > > + igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
> > > > + igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", max);
> > >
> > > Here we should put previous values, not those readed from RPx,
> > > but if they are always the same it is ok.
> >
> > We restore the values we read from sysfs
> >
>
> But we readed RPx values, not gt_min|gt_max freq ones, so here
> you can use origfreqs[MIN] and origfreqs[MAX] or declare in test
Not required. igt owns the device. We completely take over the device,
restoring default properties, all previous state will be lost. Anything
that pretends otherwise is misleading.
-Chris
^ permalink raw reply [flat|nested] 10+ messages in thread
end of thread, other threads:[~2022-07-28 14:28 UTC | newest]
Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-07-13 10:53 [igt-dev] [PATCH i-g-t v4] i915/i915_pm_rps: Check impact of fence ordering on waitboosting Karolina Drobnik
2022-07-15 17:30 ` Kamil Konieczny
2022-07-18 9:46 ` Karolina Drobnik
2022-07-25 7:41 ` Karolina Drobnik
2022-07-25 17:06 ` Kamil Konieczny
2022-07-26 10:49 ` Kamil Konieczny
2022-07-27 12:27 ` Karolina Drobnik
[not found] ` <YuIeu2vSAFE8YnB3@kamilkon-desk1>
2022-07-28 14:04 ` Karolina Drobnik
2022-07-28 14:12 ` Chris Wilson
2022-07-26 14:18 ` Kamil Konieczny
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.