All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/3] drm/i915/perf: Add helper macros for comparing with whitelisted registers
@ 2019-10-14 18:55 Umesh Nerlige Ramappa
  2019-10-14 18:55 ` [PATCH 2/3] drm/i915/tgl: Add perf support on TGL Umesh Nerlige Ramappa
                   ` (2 more replies)
  0 siblings, 3 replies; 6+ messages in thread
From: Umesh Nerlige Ramappa @ 2019-10-14 18:55 UTC (permalink / raw)
  To: intel-gfx; +Cc: Lucas De Marchi, Chris Wilson

Add helper macros for range and equality comparisons and use them to
check with whitelisted registers in oa configurations.

Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
---
 drivers/gpu/drm/i915/i915_perf.c | 54 +++++++++++++++++---------------
 1 file changed, 28 insertions(+), 26 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 366580701ba2..d54d3a41ceca 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -3455,56 +3455,58 @@ static bool gen8_is_valid_flex_addr(struct i915_perf *perf, u32 addr)
 	return false;
 }
 
+#define ADDR_IN_RANGE(addr, start, end) \
+	((addr) >= (start) && \
+	 (addr) <= (end))
+
+#define REG_IN_RANGE(addr, start, end) \
+	((addr) >= i915_mmio_reg_offset(start) && \
+	 (addr) <= i915_mmio_reg_offset(end))
+
+#define REG_EQUAL(addr, mmio) \
+	((addr) == i915_mmio_reg_offset(mmio))
+
 static bool gen7_is_valid_b_counter_addr(struct i915_perf *perf, u32 addr)
 {
-	return (addr >= i915_mmio_reg_offset(OASTARTTRIG1) &&
-		addr <= i915_mmio_reg_offset(OASTARTTRIG8)) ||
-		(addr >= i915_mmio_reg_offset(OAREPORTTRIG1) &&
-		 addr <= i915_mmio_reg_offset(OAREPORTTRIG8)) ||
-		(addr >= i915_mmio_reg_offset(OACEC0_0) &&
-		 addr <= i915_mmio_reg_offset(OACEC7_1));
+	return REG_IN_RANGE(addr, OASTARTTRIG1, OASTARTTRIG8) ||
+	       REG_IN_RANGE(addr, OAREPORTTRIG1, OAREPORTTRIG8) ||
+	       REG_IN_RANGE(addr, OACEC0_0, OACEC7_1);
 }
 
 static bool gen7_is_valid_mux_addr(struct i915_perf *perf, u32 addr)
 {
-	return addr == i915_mmio_reg_offset(HALF_SLICE_CHICKEN2) ||
-		(addr >= i915_mmio_reg_offset(MICRO_BP0_0) &&
-		 addr <= i915_mmio_reg_offset(NOA_WRITE)) ||
-		(addr >= i915_mmio_reg_offset(OA_PERFCNT1_LO) &&
-		 addr <= i915_mmio_reg_offset(OA_PERFCNT2_HI)) ||
-		(addr >= i915_mmio_reg_offset(OA_PERFMATRIX_LO) &&
-		 addr <= i915_mmio_reg_offset(OA_PERFMATRIX_HI));
+	return REG_EQUAL(addr, HALF_SLICE_CHICKEN2) ||
+	       REG_IN_RANGE(addr, MICRO_BP0_0, NOA_WRITE) ||
+	       REG_IN_RANGE(addr, OA_PERFCNT1_LO, OA_PERFCNT2_HI) ||
+	       REG_IN_RANGE(addr, OA_PERFMATRIX_LO, OA_PERFMATRIX_HI);
 }
 
 static bool gen8_is_valid_mux_addr(struct i915_perf *perf, u32 addr)
 {
 	return gen7_is_valid_mux_addr(perf, addr) ||
-		addr == i915_mmio_reg_offset(WAIT_FOR_RC6_EXIT) ||
-		(addr >= i915_mmio_reg_offset(RPM_CONFIG0) &&
-		 addr <= i915_mmio_reg_offset(NOA_CONFIG(8)));
+	       REG_EQUAL(addr, WAIT_FOR_RC6_EXIT) ||
+	       REG_IN_RANGE(addr, RPM_CONFIG0, NOA_CONFIG(8));
 }
 
 static bool gen10_is_valid_mux_addr(struct i915_perf *perf, u32 addr)
 {
 	return gen8_is_valid_mux_addr(perf, addr) ||
-		addr == i915_mmio_reg_offset(GEN10_NOA_WRITE_HIGH) ||
-		(addr >= i915_mmio_reg_offset(OA_PERFCNT3_LO) &&
-		 addr <= i915_mmio_reg_offset(OA_PERFCNT4_HI));
+	       REG_EQUAL(addr, GEN10_NOA_WRITE_HIGH) ||
+	       REG_IN_RANGE(addr, OA_PERFCNT3_LO, OA_PERFCNT4_HI);
 }
 
 static bool hsw_is_valid_mux_addr(struct i915_perf *perf, u32 addr)
 {
 	return gen7_is_valid_mux_addr(perf, addr) ||
-		(addr >= 0x25100 && addr <= 0x2FF90) ||
-		(addr >= i915_mmio_reg_offset(HSW_MBVID2_NOA0) &&
-		 addr <= i915_mmio_reg_offset(HSW_MBVID2_NOA9)) ||
-		addr == i915_mmio_reg_offset(HSW_MBVID2_MISR0);
+	       ADDR_IN_RANGE(addr, 0x25100, 0x2FF90) ||
+	       REG_IN_RANGE(addr, HSW_MBVID2_NOA0, HSW_MBVID2_NOA9) ||
+	       REG_EQUAL(addr, HSW_MBVID2_MISR0);
 }
 
 static bool chv_is_valid_mux_addr(struct i915_perf *perf, u32 addr)
 {
 	return gen7_is_valid_mux_addr(perf, addr) ||
-		(addr >= 0x182300 && addr <= 0x1823A4);
+	       ADDR_IN_RANGE(addr, 0x182300, 0x1823A4);
 }
 
 static u32 mask_reg_value(u32 reg, u32 val)
@@ -3513,14 +3515,14 @@ static u32 mask_reg_value(u32 reg, u32 val)
 	 * WaDisableSTUnitPowerOptimization workaround. Make sure the value
 	 * programmed by userspace doesn't change this.
 	 */
-	if (i915_mmio_reg_offset(HALF_SLICE_CHICKEN2) == reg)
+	if (REG_EQUAL(reg, HALF_SLICE_CHICKEN2))
 		val = val & ~_MASKED_BIT_ENABLE(GEN8_ST_PO_DISABLE);
 
 	/* WAIT_FOR_RC6_EXIT has only one bit fullfilling the function
 	 * indicated by its name and a bunch of selection fields used by OA
 	 * configs.
 	 */
-	if (i915_mmio_reg_offset(WAIT_FOR_RC6_EXIT) == reg)
+	if (REG_EQUAL(reg, WAIT_FOR_RC6_EXIT))
 		val = val & ~_MASKED_BIT_ENABLE(HSW_WAIT_FOR_RC6_EXIT_ENABLE);
 
 	return val;
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 2/3] drm/i915/tgl: Add perf support on TGL
  2019-10-14 18:55 [PATCH 1/3] drm/i915/perf: Add helper macros for comparing with whitelisted registers Umesh Nerlige Ramappa
@ 2019-10-14 18:55 ` Umesh Nerlige Ramappa
  2019-10-14 18:55 ` [PATCH 3/3] drm/i915/perf: enable OAR context save/restore of performance counters Umesh Nerlige Ramappa
  2019-10-14 22:33 ` ✗ Fi.CI.BUILD: failure for series starting with [1/3] drm/i915/perf: Add helper macros for comparing with whitelisted registers Patchwork
  2 siblings, 0 replies; 6+ messages in thread
From: Umesh Nerlige Ramappa @ 2019-10-14 18:55 UTC (permalink / raw)
  To: intel-gfx; +Cc: Lucas De Marchi, Chris Wilson

From: Lionel Landwerlin <lionel.g.landwerlin@intel.com>

The design of the OA unit has been split into several units. We now
have a global unit (OAG) and a render specific unit (OAR). This leads
to some changes on how we program things. Some details :

OAR:
  - has its own set of counter registers, they are per-context
    saved/restored
  - counters are not written to the circular OA buffer
  - a snapshot of the counters can be acquired with
    MI_RECORD_PERF_COUNT, or a single counter can be read with
    MI_STORE_REGISTER_MEM.

OAG:
  - has global counters that increment across context switches
  - counters are written into the circular OA buffer (if requested)

v2: Fix checkpatch warnings on code style (Lucas)
v3: (Umesh)
  - Update register from which tail, status and head are read
  - Update logic to sample context reports
  - Update whitelist mux and b counter regs

BSpec: 28727, 30021

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
---
 drivers/gpu/drm/i915/Makefile         |   3 +-
 drivers/gpu/drm/i915/i915_perf.c      | 280 +++++++++++++++++++++++---
 drivers/gpu/drm/i915/i915_reg.h       | 103 ++++++++++
 drivers/gpu/drm/i915/oa/i915_oa_tgl.c | 121 +++++++++++
 drivers/gpu/drm/i915/oa/i915_oa_tgl.h |  16 ++
 5 files changed, 492 insertions(+), 31 deletions(-)
 create mode 100644 drivers/gpu/drm/i915/oa/i915_oa_tgl.c
 create mode 100644 drivers/gpu/drm/i915/oa/i915_oa_tgl.h

diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index e791d9323b51..0ec9fee58baa 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -242,7 +242,8 @@ i915-y += \
 	oa/i915_oa_cflgt2.o \
 	oa/i915_oa_cflgt3.o \
 	oa/i915_oa_cnl.o \
-	oa/i915_oa_icl.o
+	oa/i915_oa_icl.o \
+	oa/i915_oa_tgl.o
 i915-y += i915_perf.o
 
 # Post-mortem debug and GPU hang state capture
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index d54d3a41ceca..df3b6976ba5b 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -217,6 +217,7 @@
 #include "oa/i915_oa_cflgt3.h"
 #include "oa/i915_oa_cnl.h"
 #include "oa/i915_oa_icl.h"
+#include "oa/i915_oa_tgl.h"
 
 /* HW requires this to be a power of two, between 128k and 16M, though driver
  * is currently generally designed assuming the largest 16M size is used such
@@ -292,7 +293,8 @@ static u32 i915_perf_stream_paranoid = true;
 #define INVALID_CTX_ID 0xffffffff
 
 /* On Gen8+ automatically triggered OA reports include a 'reason' field... */
-#define OAREPORT_REASON_MASK           0x3f
+#define OAREPORT_REASON_MASK           (IS_GEN(stream->perf->i915, 12) ? \
+					0x7f : 0x3f)
 #define OAREPORT_REASON_SHIFT          19
 #define OAREPORT_REASON_TIMER          (1<<0)
 #define OAREPORT_REASON_CTX_SWITCH     (1<<3)
@@ -338,6 +340,10 @@ static const struct i915_oa_format gen8_plus_oa_formats[I915_OA_FORMAT_MAX] = {
 	[I915_OA_FORMAT_C4_B8]		    = { 7, 64 },
 };
 
+static const struct i915_oa_format gen12_oa_formats[I915_OA_FORMAT_MAX] = {
+	[I915_OA_FORMAT_A32u40_A4u32_B8_C8] = { 5, 256 },
+};
+
 #define SAMPLE_OA_REPORT      (1<<0)
 
 /**
@@ -415,6 +421,14 @@ static void free_oa_config_bo(struct i915_oa_config_bo *oa_bo)
 	kfree(oa_bo);
 }
 
+static u32 gen12_oa_hw_tail_read(struct i915_perf_stream *stream)
+{
+	struct intel_uncore *uncore = stream->uncore;
+
+	return intel_uncore_read(uncore, GEN12_OAG_OATAILPTR) &
+	       GEN12_OAG_OATAILPTR_MASK;
+}
+
 static u32 gen8_oa_hw_tail_read(struct i915_perf_stream *stream)
 {
 	struct intel_uncore *uncore = stream->uncore;
@@ -535,7 +549,7 @@ static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream)
 				aging_tail = hw_tail;
 			stream->oa_buffer.aging_timestamp = now;
 		} else {
-			DRM_ERROR("Ignoring spurious out of range OA buffer tail pointer = %u\n",
+			DRM_ERROR("Ignoring spurious out of range OA buffer tail pointer = %x\n",
 				  hw_tail);
 		}
 	}
@@ -754,7 +768,8 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 		 * Note: that we don't clear the valid_ctx_bit so userspace can
 		 * understand that the ID has been squashed by the kernel.
 		 */
-		if (!(report32[0] & stream->perf->gen8_valid_ctx_bit))
+		if (!(report32[0] & stream->perf->gen8_valid_ctx_bit) &&
+		    INTEL_GEN(stream->perf->i915) <= 11)
 			ctx_id = report32[2] = INVALID_CTX_ID;
 
 		/*
@@ -821,6 +836,11 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 	}
 
 	if (start_offset != *offset) {
+		i915_reg_t oaheadptr;
+
+		oaheadptr = IS_GEN(stream->perf->i915, 12) ?
+			    GEN12_OAG_OAHEADPTR : GEN8_OAHEADPTR;
+
 		spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);
 
 		/*
@@ -828,9 +848,8 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 		 * relative to oa_buf_base so put back here...
 		 */
 		head += gtt_offset;
-
-		intel_uncore_write(uncore, GEN8_OAHEADPTR,
-				   head & GEN8_OAHEADPTR_MASK);
+		intel_uncore_write(uncore, oaheadptr,
+				   head & GEN12_OAG_OAHEADPTR_MASK);
 		stream->oa_buffer.head = head;
 
 		spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags);
@@ -866,12 +885,16 @@ static int gen8_oa_read(struct i915_perf_stream *stream,
 {
 	struct intel_uncore *uncore = stream->uncore;
 	u32 oastatus;
+	i915_reg_t oastatus_reg;
 	int ret;
 
 	if (WARN_ON(!stream->oa_buffer.vaddr))
 		return -EIO;
 
-	oastatus = intel_uncore_read(uncore, GEN8_OASTATUS);
+	oastatus_reg = IS_GEN(stream->perf->i915, 12) ?
+		       GEN12_OAG_OASTATUS : GEN8_OASTATUS;
+
+	oastatus = intel_uncore_read(uncore, oastatus_reg);
 
 	/*
 	 * We treat OABUFFER_OVERFLOW as a significant error:
@@ -903,7 +926,7 @@ static int gen8_oa_read(struct i915_perf_stream *stream,
 		 * Note: .oa_enable() is expected to re-init the oabuffer and
 		 * reset GEN8_OASTATUS for us
 		 */
-		oastatus = intel_uncore_read(uncore, GEN8_OASTATUS);
+		oastatus = intel_uncore_read(uncore, oastatus_reg);
 	}
 
 	if (oastatus & GEN8_OASTATUS_REPORT_LOST) {
@@ -911,7 +934,7 @@ static int gen8_oa_read(struct i915_perf_stream *stream,
 				       DRM_I915_PERF_RECORD_OA_REPORT_LOST);
 		if (ret)
 			return ret;
-		intel_uncore_write(uncore, GEN8_OASTATUS,
+		intel_uncore_write(uncore, oastatus_reg,
 				   oastatus & ~GEN8_OASTATUS_REPORT_LOST);
 	}
 
@@ -1485,6 +1508,67 @@ static void gen8_init_oa_buffer(struct i915_perf_stream *stream)
 	stream->pollin = false;
 }
 
+static void gen12_init_oa_buffer(struct i915_perf_stream *stream)
+{
+	struct intel_uncore *uncore = stream->uncore;
+	u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma);
+	unsigned long flags;
+
+	spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags);
+
+	intel_uncore_write(uncore, GEN12_OAG_OASTATUS, 0);
+	intel_uncore_write(uncore, GEN12_OAG_OAHEADPTR,
+			   gtt_offset & GEN12_OAG_OAHEADPTR_MASK);
+	stream->oa_buffer.head = gtt_offset;
+
+	/*
+	 * PRM says:
+	 *
+	 *  "This MMIO must be set before the OATAILPTR
+	 *  register and after the OAHEADPTR register. This is
+	 *  to enable proper functionality of the overflow
+	 *  bit."
+	 */
+	intel_uncore_write(uncore, GEN12_OAG_OABUFFER, gtt_offset |
+			   OABUFFER_SIZE_16M | GEN8_OABUFFER_MEM_SELECT_GGTT);
+	intel_uncore_write(uncore, GEN12_OAG_OATAILPTR,
+			   gtt_offset & GEN12_OAG_OATAILPTR_MASK);
+
+	/* Mark that we need updated tail pointers to read from... */
+	stream->oa_buffer.tails[0].offset = INVALID_TAIL_PTR;
+	stream->oa_buffer.tails[1].offset = INVALID_TAIL_PTR;
+
+	/*
+	 * Reset state used to recognise context switches, affecting which
+	 * reports we will forward to userspace while filtering for a single
+	 * context.
+	 */
+	stream->oa_buffer.last_ctx_id = INVALID_CTX_ID;
+
+	spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags);
+
+	/*
+	 * NB: although the OA buffer will initially be allocated
+	 * zeroed via shmfs (and so this memset is redundant when
+	 * first allocating), we may re-init the OA buffer, either
+	 * when re-enabling a stream or in error/reset paths.
+	 *
+	 * The reason we clear the buffer for each re-init is for the
+	 * sanity check in gen8_append_oa_reports() that looks at the
+	 * reason field to make sure it's non-zero which relies on
+	 * the assumption that new reports are being written to zeroed
+	 * memory...
+	 */
+	memset(stream->oa_buffer.vaddr, 0,
+	       stream->oa_buffer.vma->size);
+
+	/*
+	 * Maybe make ->pollin per-stream state if we support multiple
+	 * concurrent streams in the future.
+	 */
+	stream->pollin = false;
+}
+
 static int alloc_oa_buffer(struct i915_perf_stream *stream)
 {
 	struct drm_i915_gem_object *bo;
@@ -1991,7 +2075,7 @@ gen8_update_reg_state_unlocked(const struct intel_context *ce,
 		(stream->periodic ? GEN8_OA_TIMER_ENABLE : 0) |
 		GEN8_OA_COUNTER_RESUME;
 
-	for (i = 0; i < ARRAY_SIZE(flex_regs); i++)
+	for (i = 0; !!ctx_flexeu0 && i < ARRAY_SIZE(flex_regs); i++)
 		reg_state[ctx_flexeu0 + i * 2 + 1] =
 			oa_config_flex_reg(stream->oa_config, flex_regs[i]);
 
@@ -2148,8 +2232,8 @@ static int gen8_configure_context(struct i915_gem_context *ctx,
  *
  * Note: it's only the RCS/Render context that has any OA state.
  */
-static int gen8_configure_all_contexts(struct i915_perf_stream *stream,
-				       const struct i915_oa_config *oa_config)
+static int lrc_configure_all_contexts(struct i915_perf_stream *stream,
+				      const struct i915_oa_config *oa_config)
 {
 	struct drm_i915_private *i915 = stream->perf->i915;
 	/* The MMIO offsets for Flex EU registers aren't contiguous */
@@ -2161,11 +2245,9 @@ static int gen8_configure_all_contexts(struct i915_perf_stream *stream,
 			CTX_R_PWR_CLK_STATE,
 		},
 		{
-			GEN8_OACTXCONTROL,
+			IS_GEN(i915, 12) ?
+			GEN12_OAR_OACONTROL: GEN8_OACTXCONTROL,
 			stream->perf->ctx_oactxctrl_offset + 1,
-			((stream->period_exponent << GEN8_OA_TIMER_PERIOD_SHIFT) |
-			 (stream->periodic ? GEN8_OA_TIMER_ENABLE : 0) |
-			 GEN8_OA_COUNTER_RESUME)
 		},
 		{ EU_PERF_CNTL0, ctx_flexeuN(0) },
 		{ EU_PERF_CNTL1, ctx_flexeuN(1) },
@@ -2178,9 +2260,23 @@ static int gen8_configure_all_contexts(struct i915_perf_stream *stream,
 #undef ctx_flexeuN
 	struct intel_engine_cs *engine;
 	struct i915_gem_context *ctx, *cn;
+	size_t array_size = IS_GEN(i915, 12) ? 2 : ARRAY_SIZE(regs);
 	int i, err;
 
-	for (i = 2; i < ARRAY_SIZE(regs); i++)
+	if (IS_GEN(i915, 12)) {
+		u32 format = stream->oa_buffer.format;
+
+		regs[1].value =
+			(format << GEN12_OAR_OACONTROL_COUNTER_FORMAT_SHIFT) |
+			(oa_config ? GEN12_OAR_OACONTROL_COUNTER_ENABLE : 0);
+	} else {
+		regs[1].value =
+			(stream->period_exponent << GEN8_OA_TIMER_PERIOD_SHIFT) |
+			(stream->periodic ? GEN8_OA_TIMER_ENABLE : 0) |
+			GEN8_OA_COUNTER_RESUME;
+	}
+
+	for (i = 2; !!ctx_flexeu0 && i < array_size; i++)
 		regs[i].value = oa_config_flex_reg(oa_config, regs[i].reg);
 
 	lockdep_assert_held(&stream->perf->lock);
@@ -2211,7 +2307,7 @@ static int gen8_configure_all_contexts(struct i915_perf_stream *stream,
 
 		spin_unlock(&i915->gem.contexts.lock);
 
-		err = gen8_configure_context(ctx, regs, ARRAY_SIZE(regs));
+		err = gen8_configure_context(ctx, regs, array_size);
 		if (err) {
 			i915_gem_context_put(ctx);
 			return err;
@@ -2236,7 +2332,7 @@ static int gen8_configure_all_contexts(struct i915_perf_stream *stream,
 
 		regs[0].value = intel_sseu_make_rpcs(i915, &ce->sseu);
 
-		err = gen8_modify_self(ce, regs, ARRAY_SIZE(regs));
+		err = gen8_modify_self(ce, regs, array_size);
 		if (err)
 			return err;
 	}
@@ -2284,7 +2380,45 @@ static int gen8_enable_metric_set(struct i915_perf_stream *stream)
 	 * to make sure all slices/subslices are ON before writing to NOA
 	 * registers.
 	 */
-	ret = gen8_configure_all_contexts(stream, oa_config);
+	ret = lrc_configure_all_contexts(stream, oa_config);
+	if (ret)
+		return ret;
+
+	return emit_oa_config(stream, oa_context(stream));
+}
+
+static int gen12_enable_metric_set(struct i915_perf_stream *stream)
+{
+	struct intel_uncore *uncore = stream->uncore;
+	struct i915_oa_config *oa_config = stream->oa_config;
+	bool periodic = stream->periodic;
+	u32 period_exponent = stream->period_exponent;
+	int ret;
+
+	intel_uncore_write(uncore, GEN12_OAG_OA_DEBUG,
+			   /* Disable clk ratio reports, like previous Gens. */
+			   _MASKED_BIT_ENABLE(GEN12_OAG_OA_DEBUG_DISABLE_CLK_RATIO_REPORTS |
+					      GEN12_OAG_OA_DEBUG_INCLUDE_CLK_RATIO) |
+			   /*
+			    * If the user didn't require OA reports, instruct the
+			    * hardware not to emit ctx switch reports.
+			    */
+			   !(stream->sample_flags & SAMPLE_OA_REPORT) ?
+			   _MASKED_BIT_ENABLE(GEN12_OAG_OA_DEBUG_DISABLE_CTX_SWITCH_REPORTS) :
+			   _MASKED_BIT_DISABLE(GEN12_OAG_OA_DEBUG_DISABLE_CTX_SWITCH_REPORTS));
+
+	intel_uncore_write(uncore, GEN12_OAG_OAGLBCTXCTRL, periodic ?
+			   (GEN12_OAG_OAGLBCTXCTRL_COUNTER_RESUME |
+			    GEN12_OAG_OAGLBCTXCTRL_TIMER_ENABLE |
+			    (period_exponent << GEN12_OAG_OAGLBCTXCTRL_TIMER_PERIOD_SHIFT))
+			    : 0);
+
+	/*
+	 * Update all contexts prior writing the mux configurations as we need
+	 * to make sure all slices/subslices are ON before writing to NOA
+	 * registers.
+	 */
+	ret = lrc_configure_all_contexts(stream, oa_config);
 	if (ret)
 		return ret;
 
@@ -2296,7 +2430,7 @@ static void gen8_disable_metric_set(struct i915_perf_stream *stream)
 	struct intel_uncore *uncore = stream->uncore;
 
 	/* Reset all contexts' slices/subslices configurations. */
-	gen8_configure_all_contexts(stream, NULL);
+	lrc_configure_all_contexts(stream, NULL);
 
 	intel_uncore_rmw(uncore, GDT_CHICKEN_BITS, GT_NOA_ENABLE, 0);
 }
@@ -2306,7 +2440,7 @@ static void gen10_disable_metric_set(struct i915_perf_stream *stream)
 	struct intel_uncore *uncore = stream->uncore;
 
 	/* Reset all contexts' slices/subslices configurations. */
-	gen8_configure_all_contexts(stream, NULL);
+	lrc_configure_all_contexts(stream, NULL);
 
 	/* Make sure we disable noa to save power. */
 	intel_uncore_rmw(uncore, RPM_CONFIG1, GEN10_GT_NOA_ENABLE, 0);
@@ -2368,6 +2502,25 @@ static void gen8_oa_enable(struct i915_perf_stream *stream)
 			   GEN8_OA_COUNTER_ENABLE);
 }
 
+static void gen12_oa_enable(struct i915_perf_stream *stream)
+{
+	struct intel_uncore *uncore = stream->uncore;
+	u32 report_format = stream->oa_buffer.format;
+
+	/*
+	 * If we don't want OA reports from the OA buffer, then we don't even
+	 * need to program the OAG unit.
+	 */
+	if (!(stream->sample_flags & SAMPLE_OA_REPORT))
+		return;
+
+	gen12_init_oa_buffer(stream);
+
+	intel_uncore_write(uncore, GEN12_OAG_OACONTROL,
+			   (report_format << GEN12_OAG_OACONTROL_OA_COUNTER_FORMAT_SHIFT) |
+			   GEN12_OAG_OACONTROL_OA_COUNTER_ENABLE);
+}
+
 /**
  * i915_oa_stream_enable - handle `I915_PERF_IOCTL_ENABLE` for OA stream
  * @stream: An i915 perf stream opened for OA metrics
@@ -2409,6 +2562,18 @@ static void gen8_oa_disable(struct i915_perf_stream *stream)
 		DRM_ERROR("wait for OA to be disabled timed out\n");
 }
 
+static void gen12_oa_disable(struct i915_perf_stream *stream)
+{
+	struct intel_uncore *uncore = stream->uncore;
+
+	intel_uncore_write(uncore, GEN12_OAG_OACONTROL, 0);
+	if (intel_wait_for_register(uncore,
+				    GEN12_OAG_OACONTROL,
+				    GEN12_OAG_OACONTROL_OA_COUNTER_ENABLE, 0,
+				    50))
+		DRM_ERROR("wait for OA to be disabled timed out\n");
+}
+
 /**
  * i915_oa_stream_disable - handle `I915_PERF_IOCTL_DISABLE` for OA stream
  * @stream: An i915 perf stream opened for OA metrics
@@ -2608,7 +2773,7 @@ void i915_oa_init_reg_state(const struct intel_context *ce,
 {
 	struct i915_perf_stream *stream;
 
-	/* perf.exclusive_stream serialised by gen8_configure_all_contexts() */
+	/* perf.exclusive_stream serialised by lrc_configure_all_contexts() */
 	lockdep_assert_held(&ce->pin_mutex);
 
 	if (engine->class != RENDER_CLASS)
@@ -3037,16 +3202,24 @@ i915_perf_open_ioctl_locked(struct i915_perf *perf,
 	 * rest of the system, which we consider acceptable for a
 	 * non-privileged client.
 	 *
-	 * For Gen8+ the OA unit no longer supports clock gating off for a
+	 * For Gen8->11 the OA unit no longer supports clock gating off for a
 	 * specific context and the kernel can't securely stop the counters
 	 * from updating as system-wide / global values. Even though we can
 	 * filter reports based on the included context ID we can't block
 	 * clients from seeing the raw / global counter values via
 	 * MI_REPORT_PERF_COUNT commands and so consider it a privileged op to
 	 * enable the OA unit by default.
+	 *
+	 * For Gen12+ we gain a new OAR unit that only monitors the RCS on a
+	 * per context basis. So we can relax requirements there if the user
+	 * doesn't request global stream access (i.e. query based sampling
+	 * using MI_RECORD_PERF_COUNT.
 	 */
 	if (IS_HASWELL(perf->i915) && specific_ctx)
 		privileged_op = false;
+	else if (IS_GEN(perf->i915, 12) && specific_ctx &&
+		 (props->sample_flags & SAMPLE_OA_REPORT) == 0)
+		privileged_op = false;
 
 	/* Similar to perf's kernel.perf_paranoid_cpu sysctl option
 	 * we check a dev.i915.perf_stream_paranoid sysctl option
@@ -3358,7 +3531,9 @@ void i915_perf_register(struct drm_i915_private *i915)
 
 	sysfs_attr_init(&perf->test_config.sysfs_metric_id.attr);
 
-	if (INTEL_GEN(i915) >= 11) {
+	if (IS_TIGERLAKE(i915)) {
+		i915_perf_load_test_config_tgl(i915);
+	} else if (INTEL_GEN(i915) >= 11) {
 		i915_perf_load_test_config_icl(i915);
 	} else if (IS_CANNONLAKE(i915)) {
 		i915_perf_load_test_config_cnl(i915);
@@ -3509,6 +3684,28 @@ static bool chv_is_valid_mux_addr(struct i915_perf *perf, u32 addr)
 	       ADDR_IN_RANGE(addr, 0x182300, 0x1823A4);
 }
 
+static bool gen12_is_valid_b_counter_addr(struct i915_perf *perf, u32 addr)
+{
+	return REG_IN_RANGE(addr, GEN12_OAG_OASTARTTRIG1, GEN12_OAG_OASTARTTRIG8) ||
+	       REG_IN_RANGE(addr, GEN12_OAG_OAREPORTTRIG1, GEN12_OAG_OAREPORTTRIG8) ||
+	       REG_IN_RANGE(addr, GEN12_OAG_CEC0_0, GEN12_OAG_CEC7_1) ||
+	       REG_IN_RANGE(addr, GEN12_OAG_SCEC0_0, GEN12_OAG_SCEC7_1) ||
+	       REG_EQUAL(addr, GEN12_OAA_DBG_REG) ||
+	       REG_EQUAL(addr, GEN12_OAG_OA_PESS) ||
+	       REG_EQUAL(addr, GEN12_OAG_SPCTR_CNF);
+}
+
+static bool gen12_is_valid_mux_addr(struct i915_perf *perf, u32 addr)
+{
+	return REG_EQUAL(addr, NOA_WRITE) ||
+	       REG_EQUAL(addr, GEN10_NOA_WRITE_HIGH) ||
+	       REG_EQUAL(addr, GDT_CHICKEN_BITS) ||
+	       REG_EQUAL(addr, WAIT_FOR_RC6_EXIT) ||
+	       REG_EQUAL(addr, RPM_CONFIG0) ||
+	       REG_EQUAL(addr, RPM_CONFIG1) ||
+	       REG_IN_RANGE(addr, NOA_CONFIG(0), NOA_CONFIG(8));
+}
+
 static u32 mask_reg_value(u32 reg, u32 val)
 {
 	/* HALF_SLICE_CHICKEN2 is programmed with a the
@@ -3902,14 +4099,11 @@ void i915_perf_init(struct drm_i915_private *i915)
 		 * worth the complexity to maintain now that BDW+ enable
 		 * execlist mode by default.
 		 */
-		perf->oa_formats = gen8_plus_oa_formats;
-
-		perf->ops.oa_enable = gen8_oa_enable;
-		perf->ops.oa_disable = gen8_oa_disable;
 		perf->ops.read = gen8_oa_read;
-		perf->ops.oa_hw_tail_read = gen8_oa_hw_tail_read;
 
 		if (IS_GEN_RANGE(i915, 8, 9)) {
+			perf->oa_formats = gen8_plus_oa_formats;
+
 			perf->ops.is_valid_b_counter_reg =
 				gen7_is_valid_b_counter_addr;
 			perf->ops.is_valid_mux_reg =
@@ -3922,8 +4116,11 @@ void i915_perf_init(struct drm_i915_private *i915)
 					chv_is_valid_mux_addr;
 			}
 
+			perf->ops.oa_enable = gen8_oa_enable;
+			perf->ops.oa_disable = gen8_oa_disable;
 			perf->ops.enable_metric_set = gen8_enable_metric_set;
 			perf->ops.disable_metric_set = gen8_disable_metric_set;
+			perf->ops.oa_hw_tail_read = gen8_oa_hw_tail_read;
 
 			if (IS_GEN(i915, 8)) {
 				perf->ctx_oactxctrl_offset = 0x120;
@@ -3937,6 +4134,8 @@ void i915_perf_init(struct drm_i915_private *i915)
 				perf->gen8_valid_ctx_bit = BIT(16);
 			}
 		} else if (IS_GEN_RANGE(i915, 10, 11)) {
+			perf->oa_formats = gen8_plus_oa_formats;
+
 			perf->ops.is_valid_b_counter_reg =
 				gen7_is_valid_b_counter_addr;
 			perf->ops.is_valid_mux_reg =
@@ -3944,8 +4143,11 @@ void i915_perf_init(struct drm_i915_private *i915)
 			perf->ops.is_valid_flex_reg =
 				gen8_is_valid_flex_addr;
 
+			perf->ops.oa_enable = gen8_oa_enable;
+			perf->ops.oa_disable = gen8_oa_disable;
 			perf->ops.enable_metric_set = gen8_enable_metric_set;
 			perf->ops.disable_metric_set = gen10_disable_metric_set;
+			perf->ops.oa_hw_tail_read = gen8_oa_hw_tail_read;
 
 			if (IS_GEN(i915, 10)) {
 				perf->ctx_oactxctrl_offset = 0x128;
@@ -3955,6 +4157,24 @@ void i915_perf_init(struct drm_i915_private *i915)
 				perf->ctx_flexeu0_offset = 0x78e;
 			}
 			perf->gen8_valid_ctx_bit = BIT(16);
+		} else if (IS_GEN(i915, 12)) {
+			perf->oa_formats = gen12_oa_formats;
+
+			perf->ops.is_valid_b_counter_reg =
+				gen12_is_valid_b_counter_addr;
+			perf->ops.is_valid_mux_reg =
+				gen12_is_valid_mux_addr;
+			perf->ops.is_valid_flex_reg =
+				gen8_is_valid_flex_addr;
+
+			perf->ops.oa_enable = gen12_oa_enable;
+			perf->ops.oa_disable = gen12_oa_disable;
+			perf->ops.enable_metric_set = gen12_enable_metric_set;
+			perf->ops.disable_metric_set = gen10_disable_metric_set;
+			perf->ops.oa_hw_tail_read = gen12_oa_hw_tail_read;
+
+			perf->ctx_flexeu0_offset = 0;
+			perf->ctx_oactxctrl_offset = 0x144;
 		}
 	}
 
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index e24991e54897..2e11963f3eb5 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -684,6 +684,45 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define OABUFFER_SIZE_8M    (6 << 3)
 #define OABUFFER_SIZE_16M   (7 << 3)
 
+/* Gen12 OAR unit */
+#define GEN12_OAR_OACONTROL _MMIO(0x2960)
+#define  GEN12_OAR_OACONTROL_COUNTER_FORMAT_SHIFT 1
+#define  GEN12_OAR_OACONTROL_COUNTER_ENABLE       (1 << 0)
+
+#define GEN12_OACTXCONTROL _MMIO(0x2360)
+#define GEN12_OAR_OASTATUS _MMIO(0x2968)
+
+/* Gen12 OAG unit */
+#define GEN12_OAG_OAHEADPTR _MMIO(0xdb00)
+#define  GEN12_OAG_OAHEADPTR_MASK 0xffffffc0
+#define GEN12_OAG_OATAILPTR _MMIO(0xdb04)
+#define  GEN12_OAG_OATAILPTR_MASK 0xffffffc0
+
+#define GEN12_OAG_OABUFFER  _MMIO(0xdb08)
+#define  GEN12_OAG_OABUFFER_BUFFER_SIZE_MASK  (0x7)
+#define  GEN12_OAG_OABUFFER_BUFFER_SIZE_SHIFT (3)
+#define  GEN12_OAG_OABUFFER_MEMORY_SELECT     (1 << 0) /* 0: PPGTT, 1: GGTT */
+
+#define GEN12_OAG_OAGLBCTXCTRL _MMIO(0x2b28)
+#define  GEN12_OAG_OAGLBCTXCTRL_TIMER_PERIOD_SHIFT 2
+#define  GEN12_OAG_OAGLBCTXCTRL_TIMER_ENABLE       (1 << 1)
+#define  GEN12_OAG_OAGLBCTXCTRL_COUNTER_RESUME     (1 << 0)
+
+#define GEN12_OAG_OACONTROL _MMIO(0xdaf4)
+#define  GEN12_OAG_OACONTROL_OA_COUNTER_FORMAT_SHIFT 2
+#define  GEN12_OAG_OACONTROL_OA_COUNTER_ENABLE       (1 << 0)
+
+#define GEN12_OAG_OA_DEBUG _MMIO(0xdaf8)
+#define  GEN12_OAG_OA_DEBUG_INCLUDE_CLK_RATIO          (1 << 6)
+#define  GEN12_OAG_OA_DEBUG_DISABLE_CLK_RATIO_REPORTS  (1 << 5)
+#define  GEN12_OAG_OA_DEBUG_DISABLE_GO_1_0_REPORTS     (1 << 2)
+#define  GEN12_OAG_OA_DEBUG_DISABLE_CTX_SWITCH_REPORTS (1 << 1)
+
+#define GEN12_OAG_OASTATUS _MMIO(0xdafc)
+#define  GEN12_OAG_OASTATUS_COUNTER_OVERFLOW (1 << 2)
+#define  GEN12_OAG_OASTATUS_BUFFER_OVERFLOW  (1 << 1)
+#define  GEN12_OAG_OASTATUS_REPORT_LOST      (1 << 0)
+
 /*
  * Flexible, Aggregate EU Counter Registers.
  * Note: these aren't contiguous
@@ -920,6 +959,26 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define OAREPORTTRIG8_NOA_SELECT_6_SHIFT    24
 #define OAREPORTTRIG8_NOA_SELECT_7_SHIFT    28
 
+/* Same layout as OASTARTTRIGX */
+#define GEN12_OAG_OASTARTTRIG1 _MMIO(0xd900)
+#define GEN12_OAG_OASTARTTRIG2 _MMIO(0xd904)
+#define GEN12_OAG_OASTARTTRIG3 _MMIO(0xd908)
+#define GEN12_OAG_OASTARTTRIG4 _MMIO(0xd90c)
+#define GEN12_OAG_OASTARTTRIG5 _MMIO(0xd910)
+#define GEN12_OAG_OASTARTTRIG6 _MMIO(0xd914)
+#define GEN12_OAG_OASTARTTRIG7 _MMIO(0xd918)
+#define GEN12_OAG_OASTARTTRIG8 _MMIO(0xd91c)
+
+/* Same layout as OAREPORTTRIGX */
+#define GEN12_OAG_OAREPORTTRIG1 _MMIO(0xd920)
+#define GEN12_OAG_OAREPORTTRIG2 _MMIO(0xd924)
+#define GEN12_OAG_OAREPORTTRIG3 _MMIO(0xd928)
+#define GEN12_OAG_OAREPORTTRIG4 _MMIO(0xd92c)
+#define GEN12_OAG_OAREPORTTRIG5 _MMIO(0xd930)
+#define GEN12_OAG_OAREPORTTRIG6 _MMIO(0xd934)
+#define GEN12_OAG_OAREPORTTRIG7 _MMIO(0xd938)
+#define GEN12_OAG_OAREPORTTRIG8 _MMIO(0xd93c)
+
 /* CECX_0 */
 #define OACEC_COMPARE_LESS_OR_EQUAL	6
 #define OACEC_COMPARE_NOT_EQUAL		5
@@ -936,6 +995,10 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define OACEC_SELECT_PREV	(1 << 19)
 #define OACEC_SELECT_BOOLEAN	(2 << 19)
 
+/* 11-bit array 0: pass-through, 1: negated */
+#define GEN12_OASCEC_NEGATE_MASK  0x7ff
+#define GEN12_OASCEC_NEGATE_SHIFT 21
+
 /* CECX_1 */
 #define OACEC_MASK_MASK		    0xffff
 #define OACEC_CONSIDERATIONS_MASK   0xffff
@@ -958,6 +1021,42 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define OACEC7_0 _MMIO(0x27a8)
 #define OACEC7_1 _MMIO(0x27ac)
 
+/* Same layout as CECX_Y */
+#define GEN12_OAG_CEC0_0 _MMIO(0xd940)
+#define GEN12_OAG_CEC0_1 _MMIO(0xd944)
+#define GEN12_OAG_CEC1_0 _MMIO(0xd948)
+#define GEN12_OAG_CEC1_1 _MMIO(0xd94c)
+#define GEN12_OAG_CEC2_0 _MMIO(0xd950)
+#define GEN12_OAG_CEC2_1 _MMIO(0xd954)
+#define GEN12_OAG_CEC3_0 _MMIO(0xd958)
+#define GEN12_OAG_CEC3_1 _MMIO(0xd95c)
+#define GEN12_OAG_CEC4_0 _MMIO(0xd960)
+#define GEN12_OAG_CEC4_1 _MMIO(0xd964)
+#define GEN12_OAG_CEC5_0 _MMIO(0xd968)
+#define GEN12_OAG_CEC5_1 _MMIO(0xd96c)
+#define GEN12_OAG_CEC6_0 _MMIO(0xd970)
+#define GEN12_OAG_CEC6_1 _MMIO(0xd974)
+#define GEN12_OAG_CEC7_0 _MMIO(0xd978)
+#define GEN12_OAG_CEC7_1 _MMIO(0xd97c)
+
+/* Same layout as CECX_Y + negate 11-bit array */
+#define GEN12_OAG_SCEC0_0 _MMIO(0xdc00)
+#define GEN12_OAG_SCEC0_1 _MMIO(0xdc04)
+#define GEN12_OAG_SCEC1_0 _MMIO(0xdc08)
+#define GEN12_OAG_SCEC1_1 _MMIO(0xdc0c)
+#define GEN12_OAG_SCEC2_0 _MMIO(0xdc10)
+#define GEN12_OAG_SCEC2_1 _MMIO(0xdc14)
+#define GEN12_OAG_SCEC3_0 _MMIO(0xdc18)
+#define GEN12_OAG_SCEC3_1 _MMIO(0xdc1c)
+#define GEN12_OAG_SCEC4_0 _MMIO(0xdc20)
+#define GEN12_OAG_SCEC4_1 _MMIO(0xdc24)
+#define GEN12_OAG_SCEC5_0 _MMIO(0xdc28)
+#define GEN12_OAG_SCEC5_1 _MMIO(0xdc2c)
+#define GEN12_OAG_SCEC6_0 _MMIO(0xdc30)
+#define GEN12_OAG_SCEC6_1 _MMIO(0xdc34)
+#define GEN12_OAG_SCEC7_0 _MMIO(0xdc38)
+#define GEN12_OAG_SCEC7_1 _MMIO(0xdc3c)
+
 /* OA perf counters */
 #define OA_PERFCNT1_LO      _MMIO(0x91B8)
 #define OA_PERFCNT1_HI      _MMIO(0x91BC)
@@ -1038,6 +1137,10 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define MICRO_BP3_COUNT_STATUS23	_MMIO(0x9838)
 #define MICRO_BP_FIRED_ARMED		_MMIO(0x983C)
 
+#define GEN12_OAA_DBG_REG _MMIO(0xdc44)
+#define GEN12_OAG_OA_PESS _MMIO(0x2b2c)
+#define GEN12_OAG_SPCTR_CNF _MMIO(0xdc40)
+
 #define GDT_CHICKEN_BITS    _MMIO(0x9840)
 #define   GT_NOA_ENABLE	    0x00000080
 
diff --git a/drivers/gpu/drm/i915/oa/i915_oa_tgl.c b/drivers/gpu/drm/i915/oa/i915_oa_tgl.c
new file mode 100644
index 000000000000..a29d93707345
--- /dev/null
+++ b/drivers/gpu/drm/i915/oa/i915_oa_tgl.c
@@ -0,0 +1,121 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Autogenerated file by GPU Top : https://github.com/rib/gputop
+ * DO NOT EDIT manually!
+ */
+
+#include <linux/sysfs.h>
+
+#include "i915_drv.h"
+#include "i915_oa_tgl.h"
+
+static const struct i915_oa_reg b_counter_config_test_oa[] = {
+	{ _MMIO(0xD920), 0x00000000 },
+	{ _MMIO(0xD900), 0x00000000 },
+	{ _MMIO(0xD904), 0xF0800000 },
+	{ _MMIO(0xD910), 0x00000000 },
+	{ _MMIO(0xD914), 0xF0800000 },
+	{ _MMIO(0xDC40), 0x00FF0000 },
+	{ _MMIO(0xD940), 0x00000004 },
+	{ _MMIO(0xD944), 0x0000FFFF },
+	{ _MMIO(0xDC00), 0x00000004 },
+	{ _MMIO(0xDC04), 0x0000FFFF },
+	{ _MMIO(0xD948), 0x00000003 },
+	{ _MMIO(0xD94C), 0x0000FFFF },
+	{ _MMIO(0xDC08), 0x00000003 },
+	{ _MMIO(0xDC0C), 0x0000FFFF },
+	{ _MMIO(0xD950), 0x00000007 },
+	{ _MMIO(0xD954), 0x0000FFFF },
+	{ _MMIO(0xDC10), 0x00000007 },
+	{ _MMIO(0xDC14), 0x0000FFFF },
+	{ _MMIO(0xD958), 0x00100002 },
+	{ _MMIO(0xD95C), 0x0000FFF7 },
+	{ _MMIO(0xDC18), 0x00100002 },
+	{ _MMIO(0xDC1C), 0x0000FFF7 },
+	{ _MMIO(0xD960), 0x00100002 },
+	{ _MMIO(0xD964), 0x0000FFCF },
+	{ _MMIO(0xDC20), 0x00100002 },
+	{ _MMIO(0xDC24), 0x0000FFCF },
+	{ _MMIO(0xD968), 0x00100082 },
+	{ _MMIO(0xD96C), 0x0000FFEF },
+	{ _MMIO(0xDC28), 0x00100082 },
+	{ _MMIO(0xDC2C), 0x0000FFEF },
+	{ _MMIO(0xD970), 0x001000C2 },
+	{ _MMIO(0xD974), 0x0000FFE7 },
+	{ _MMIO(0xDC30), 0x001000C2 },
+	{ _MMIO(0xDC34), 0x0000FFE7 },
+	{ _MMIO(0xD978), 0x00100001 },
+	{ _MMIO(0xD97C), 0x0000FFE7 },
+	{ _MMIO(0xDC38), 0x00100001 },
+	{ _MMIO(0xDC3C), 0x0000FFE7 },
+};
+
+static const struct i915_oa_reg flex_eu_config_test_oa[] = {
+};
+
+static const struct i915_oa_reg mux_config_test_oa[] = {
+	{ _MMIO(0x0D04), 0x00000200 },
+	{ _MMIO(0x9840), 0x00000000 },
+	{ _MMIO(0x9884), 0x00000000 },
+	{ _MMIO(0x9888), 0x280E0000 },
+	{ _MMIO(0x9888), 0x1E0E0147 },
+	{ _MMIO(0x9888), 0x180E0000 },
+	{ _MMIO(0x9888), 0x160E0000 },
+	{ _MMIO(0x9888), 0x1E0F1000 },
+	{ _MMIO(0x9888), 0x1E104000 },
+	{ _MMIO(0x9888), 0x2E020100 },
+	{ _MMIO(0x9888), 0x2C030004 },
+	{ _MMIO(0x9888), 0x38003000 },
+	{ _MMIO(0x9888), 0x1E0A8000 },
+	{ _MMIO(0x9884), 0x00000003 },
+	{ _MMIO(0x9888), 0x49110000 },
+	{ _MMIO(0x9888), 0x5D101400 },
+	{ _MMIO(0x9888), 0x1D140020 },
+	{ _MMIO(0x9888), 0x1D1103A3 },
+	{ _MMIO(0x9888), 0x01110000 },
+	{ _MMIO(0x9888), 0x61111000 },
+	{ _MMIO(0x9888), 0x1F128000 },
+	{ _MMIO(0x9888), 0x17100000 },
+	{ _MMIO(0x9888), 0x55100630 },
+	{ _MMIO(0x9888), 0x57100000 },
+	{ _MMIO(0x9888), 0x31100000 },
+	{ _MMIO(0x9884), 0x00000003 },
+	{ _MMIO(0x9888), 0x65100002 },
+	{ _MMIO(0x9884), 0x00000000 },
+	{ _MMIO(0x9888), 0x42000001 },
+};
+
+static ssize_t
+show_test_oa_id(struct device *kdev, struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "1\n");
+}
+
+void
+i915_perf_load_test_config_tgl(struct drm_i915_private *dev_priv)
+{
+	strlcpy(dev_priv->perf.test_config.uuid,
+		"80a833f0-2504-4321-8894-e9277844ce7b",
+		sizeof(dev_priv->perf.test_config.uuid));
+	dev_priv->perf.test_config.id = 1;
+
+	dev_priv->perf.test_config.mux_regs = mux_config_test_oa;
+	dev_priv->perf.test_config.mux_regs_len = ARRAY_SIZE(mux_config_test_oa);
+
+	dev_priv->perf.test_config.b_counter_regs = b_counter_config_test_oa;
+	dev_priv->perf.test_config.b_counter_regs_len = ARRAY_SIZE(b_counter_config_test_oa);
+
+	dev_priv->perf.test_config.flex_regs = flex_eu_config_test_oa;
+	dev_priv->perf.test_config.flex_regs_len = ARRAY_SIZE(flex_eu_config_test_oa);
+
+	dev_priv->perf.test_config.sysfs_metric.name = "80a833f0-2504-4321-8894-e9277844ce7b";
+	dev_priv->perf.test_config.sysfs_metric.attrs = dev_priv->perf.test_config.attrs;
+
+	dev_priv->perf.test_config.attrs[0] = &dev_priv->perf.test_config.sysfs_metric_id.attr;
+
+	dev_priv->perf.test_config.sysfs_metric_id.attr.name = "id";
+	dev_priv->perf.test_config.sysfs_metric_id.attr.mode = 0444;
+	dev_priv->perf.test_config.sysfs_metric_id.show = show_test_oa_id;
+}
diff --git a/drivers/gpu/drm/i915/oa/i915_oa_tgl.h b/drivers/gpu/drm/i915/oa/i915_oa_tgl.h
new file mode 100644
index 000000000000..4c25f0be825c
--- /dev/null
+++ b/drivers/gpu/drm/i915/oa/i915_oa_tgl.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Autogenerated file by GPU Top : https://github.com/rib/gputop
+ * DO NOT EDIT manually!
+ */
+
+#ifndef __I915_OA_TGL_H__
+#define __I915_OA_TGL_H__
+
+struct drm_i915_private;
+
+void i915_perf_load_test_config_tgl(struct drm_i915_private *dev_priv);
+
+#endif
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 3/3] drm/i915/perf: enable OAR context save/restore of performance counters
  2019-10-14 18:55 [PATCH 1/3] drm/i915/perf: Add helper macros for comparing with whitelisted registers Umesh Nerlige Ramappa
  2019-10-14 18:55 ` [PATCH 2/3] drm/i915/tgl: Add perf support on TGL Umesh Nerlige Ramappa
@ 2019-10-14 18:55 ` Umesh Nerlige Ramappa
  2019-10-14 19:58   ` Chris Wilson
  2019-10-14 22:33 ` ✗ Fi.CI.BUILD: failure for series starting with [1/3] drm/i915/perf: Add helper macros for comparing with whitelisted registers Patchwork
  2 siblings, 1 reply; 6+ messages in thread
From: Umesh Nerlige Ramappa @ 2019-10-14 18:55 UTC (permalink / raw)
  To: intel-gfx; +Cc: Lucas De Marchi, Chris Wilson

From: Lionel Landwerlin <lionel.g.landwerlin@intel.com>

We want this so we can preempt performance queries and keep the system
responsive even when long running queries are ongoing. We avoid doing
it for all contexts.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_lrc.c | 23 +++++---
 drivers/gpu/drm/i915/gt/intel_lrc.h |  3 +
 drivers/gpu/drm/i915/i915_perf.c    | 89 +++++++++++++++++++++++++++++
 3 files changed, 107 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index 9d8eaf8edaab..1bab671dc7a0 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -1218,6 +1218,19 @@ static bool can_merge_rq(const struct i915_request *prev,
 	return true;
 }
 
+u32 intel_lrc_make_ctx_control(const struct intel_engine_cs *engine)
+{
+	u32 value =
+		_MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT) |
+		_MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
+
+	if (INTEL_GEN(engine->i915) < 11)
+		value |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
+					    CTX_CTRL_RS_CTX_ENABLE);
+
+	return value;
+}
+
 static void virtual_update_register_offsets(u32 *regs,
 					    struct intel_engine_cs *engine)
 {
@@ -2124,6 +2137,7 @@ __execlists_update_reg_state(const struct intel_context *ce,
 
 		i915_oa_init_reg_state(ce, engine);
 	}
+	regs[CTX_CONTEXT_CONTROL] |= intel_lrc_make_ctx_control(engine);
 }
 
 static int
@@ -3628,14 +3642,7 @@ static void init_common_reg_state(u32 * const regs,
 				  const struct intel_engine_cs *engine,
 				  const struct intel_ring *ring)
 {
-	regs[CTX_CONTEXT_CONTROL] =
-		_MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT) |
-		_MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
-	if (INTEL_GEN(engine->i915) < 11)
-		regs[CTX_CONTEXT_CONTROL] |=
-			_MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
-					    CTX_CTRL_RS_CTX_ENABLE);
-
+	regs[CTX_CONTEXT_CONTROL] = intel_lrc_make_ctx_control(engine);
 	regs[CTX_RING_BUFFER_CONTROL] = RING_CTL_SIZE(ring->size) | RING_VALID;
 	regs[CTX_BB_STATE] = RING_BB_PPGTT;
 }
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.h b/drivers/gpu/drm/i915/gt/intel_lrc.h
index 99dc576a4e25..6b2b196f09e7 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.h
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.h
@@ -43,6 +43,7 @@ struct intel_engine_cs;
 #define	  CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT	(1 << 0)
 #define   CTX_CTRL_RS_CTX_ENABLE		(1 << 1)
 #define	  CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT	(1 << 2)
+#define	  GEN12_CTX_CTRL_OAR_CONTEXT_ENABLE	(1 << 8)
 #define RING_CONTEXT_STATUS_PTR(base)		_MMIO((base) + 0x3a0)
 #define RING_EXECLIST_SQ_CONTENTS(base)		_MMIO((base) + 0x510)
 #define RING_EXECLIST_CONTROL(base)		_MMIO((base) + 0x550)
@@ -145,4 +146,6 @@ struct intel_engine_cs *
 intel_virtual_engine_get_sibling(struct intel_engine_cs *engine,
 				 unsigned int sibling);
 
+u32 intel_lrc_make_ctx_control(const struct intel_engine_cs *engine);
+
 #endif /* _INTEL_LRC_H_ */
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index df3b6976ba5b..d0ddd37dc078 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -2208,6 +2208,70 @@ static int gen8_configure_context(struct i915_gem_context *ctx,
 	return err;
 }
 
+static int gen12_emit_oar_config(struct intel_context *ce, bool enable)
+{
+	struct i915_request *rq;
+	u32 *cs, offset;
+	int err = 0;
+
+	rq = i915_request_create(ce);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	cs = intel_ring_begin(rq, 4);
+	if (IS_ERR(cs)) {
+		err = PTR_ERR(cs);
+		goto out;
+	}
+
+	offset = i915_ggtt_offset(ce->state) + LRC_STATE_PN * PAGE_SIZE;
+
+	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+	*cs++ = offset + CTX_CONTEXT_CONTROL * sizeof(u32);
+	*cs++ = 0;
+	*cs++ = intel_lrc_make_ctx_control(ce->engine) |
+		(enable ?
+		 _MASKED_BIT_ENABLE(GEN12_CTX_CTRL_OAR_CONTEXT_ENABLE) :
+		 _MASKED_BIT_DISABLE(GEN12_CTX_CTRL_OAR_CONTEXT_ENABLE));
+
+	intel_ring_advance(rq, cs);
+
+out:
+	i915_request_add(rq);
+
+	return err;
+}
+
+static int gen12_configure_context_oar(struct i915_gem_context *ctx,
+				       bool enable)
+{
+	struct i915_gem_engines_iter it;
+	struct intel_context *ce;
+	int err = 0;
+
+	for_each_gem_engine(ce, i915_gem_context_lock_engines(ctx), it) {
+		GEM_BUG_ON(ce == ce->engine->kernel_context);
+
+		if (ce->engine->class != RENDER_CLASS)
+			continue;
+
+		err = intel_context_lock_pinned(ce);
+		if (err)
+			break;
+
+		/* Otherwise OA settings will be set upon first use */
+		if (intel_context_is_pinned(ce))
+			err = gen12_emit_oar_config(ce, enable);
+
+		intel_context_unlock_pinned(ce);
+		if (err)
+			break;
+	}
+	i915_gem_context_unlock_engines(ctx);
+
+	return err;
+}
+
 /*
  * Manages updating the per-context aspects of the OA stream
  * configuration across all contexts.
@@ -2313,6 +2377,17 @@ static int lrc_configure_all_contexts(struct i915_perf_stream *stream,
 			return err;
 		}
 
+		/*
+		 * For Gen12, performance counters are context
+		 * saved/restored. Only enable it for the context that
+		 * requested this.
+		 */
+		if (ctx == stream->ctx && IS_GEN(i915, 12)) {
+			err = gen12_configure_context_oar(ctx, oa_config != NULL);
+			if (err)
+				return err;
+		}
+
 		spin_lock(&i915->gem.contexts.lock);
 		list_safe_reset_next(ctx, cn, link);
 		i915_gem_context_put(ctx);
@@ -2782,6 +2857,20 @@ void i915_oa_init_reg_state(const struct intel_context *ce,
 	stream = engine->i915->perf.exclusive_stream;
 	if (stream)
 		gen8_update_reg_state_unlocked(ce, stream);
+
+	/*
+	 * Enable context save & restore of performance counters for
+	 * the OAR unit only on the context selected for performance
+	 * queries.
+	 */
+	if (IS_GEN(engine->i915, 12)) {
+		u32 *regs = ce->lrc_reg_state;
+
+		regs[CTX_CONTEXT_CONTROL] |=
+			(stream && ce->gem_context == stream->ctx)  ?
+			_MASKED_BIT_ENABLE(GEN12_CTX_CTRL_OAR_CONTEXT_ENABLE) :
+			_MASKED_BIT_DISABLE(GEN12_CTX_CTRL_OAR_CONTEXT_ENABLE);
+	}
 }
 
 /**
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH 3/3] drm/i915/perf: enable OAR context save/restore of performance counters
  2019-10-14 18:55 ` [PATCH 3/3] drm/i915/perf: enable OAR context save/restore of performance counters Umesh Nerlige Ramappa
@ 2019-10-14 19:58   ` Chris Wilson
  2019-10-15  7:44     ` Lionel Landwerlin
  0 siblings, 1 reply; 6+ messages in thread
From: Chris Wilson @ 2019-10-14 19:58 UTC (permalink / raw)
  To: Umesh Nerlige Ramappa, intel-gfx; +Cc: Lucas De Marchi

Quoting Umesh Nerlige Ramappa (2019-10-14 19:55:31)
> @@ -2313,6 +2377,17 @@ static int lrc_configure_all_contexts(struct i915_perf_stream *stream,
>                         return err;
>                 }
>  
> +               /*
> +                * For Gen12, performance counters are context
> +                * saved/restored. Only enable it for the context that
> +                * requested this.
> +                */
> +               if (ctx == stream->ctx && IS_GEN(i915, 12)) {
> +                       err = gen12_configure_context_oar(ctx, oa_config != NULL);


regs[CTX_CONTEXT_CONTROL] |= intel_lrc_make_ctx_control(engine);

can be modified with a plain LRI (at least I have successfully modified
INHIBIT_SYN_CTX before). Have you tried just emitting an LRI in the pinned
context?
-Chris
---------------------------------------------------------------------
Intel Corporation (UK) Limited
Registered No. 1134945 (England)
Registered Office: Pipers Way, Swindon SN3 1RJ
VAT No: 860 2173 47

This e-mail and any attachments may contain confidential material for
the sole use of the intended recipient(s). Any review or distribution
by others is strictly prohibited. If you are not the intended
recipient, please contact the sender and delete all copies.
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 6+ messages in thread

* ✗ Fi.CI.BUILD: failure for series starting with [1/3] drm/i915/perf: Add helper macros for comparing with whitelisted registers
  2019-10-14 18:55 [PATCH 1/3] drm/i915/perf: Add helper macros for comparing with whitelisted registers Umesh Nerlige Ramappa
  2019-10-14 18:55 ` [PATCH 2/3] drm/i915/tgl: Add perf support on TGL Umesh Nerlige Ramappa
  2019-10-14 18:55 ` [PATCH 3/3] drm/i915/perf: enable OAR context save/restore of performance counters Umesh Nerlige Ramappa
@ 2019-10-14 22:33 ` Patchwork
  2 siblings, 0 replies; 6+ messages in thread
From: Patchwork @ 2019-10-14 22:33 UTC (permalink / raw)
  To: Umesh Nerlige Ramappa; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/3] drm/i915/perf: Add helper macros for comparing with whitelisted registers
URL   : https://patchwork.freedesktop.org/series/67987/
State : failure

== Summary ==

Applying: drm/i915/perf: Add helper macros for comparing with whitelisted registers
Applying: drm/i915/tgl: Add perf support on TGL
error: sha1 information is lacking or useless (drivers/gpu/drm/i915/i915_perf.c).
error: could not build fake ancestor
hint: Use 'git am --show-current-patch' to see the failed patch
Patch failed at 0002 drm/i915/tgl: Add perf support on TGL
When you have resolved this problem, run "git am --continue".
If you prefer to skip this patch, run "git am --skip" instead.
To restore the original branch and stop patching, run "git am --abort".

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 3/3] drm/i915/perf: enable OAR context save/restore of performance counters
  2019-10-14 19:58   ` Chris Wilson
@ 2019-10-15  7:44     ` Lionel Landwerlin
  0 siblings, 0 replies; 6+ messages in thread
From: Lionel Landwerlin @ 2019-10-15  7:44 UTC (permalink / raw)
  To: Chris Wilson, Umesh Nerlige Ramappa, intel-gfx; +Cc: Lucas De Marchi

On 14/10/2019 22:58, Chris Wilson wrote:
> Quoting Umesh Nerlige Ramappa (2019-10-14 19:55:31)
>> @@ -2313,6 +2377,17 @@ static int lrc_configure_all_contexts(struct i915_perf_stream *stream,
>>                          return err;
>>                  }
>>   
>> +               /*
>> +                * For Gen12, performance counters are context
>> +                * saved/restored. Only enable it for the context that
>> +                * requested this.
>> +                */
>> +               if (ctx == stream->ctx && IS_GEN(i915, 12)) {
>> +                       err = gen12_configure_context_oar(ctx, oa_config != NULL);
>
> regs[CTX_CONTEXT_CONTROL] |= intel_lrc_make_ctx_control(engine);
>
> can be modified with a plain LRI (at least I have successfully modified
> INHIBIT_SYN_CTX before). Have you tried just emitting an LRI in the pinned
> context?
> -Chris
>
Not yet, good to know :)

Sounds a lot simpler.


-Lionel

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2019-10-15  7:44 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-10-14 18:55 [PATCH 1/3] drm/i915/perf: Add helper macros for comparing with whitelisted registers Umesh Nerlige Ramappa
2019-10-14 18:55 ` [PATCH 2/3] drm/i915/tgl: Add perf support on TGL Umesh Nerlige Ramappa
2019-10-14 18:55 ` [PATCH 3/3] drm/i915/perf: enable OAR context save/restore of performance counters Umesh Nerlige Ramappa
2019-10-14 19:58   ` Chris Wilson
2019-10-15  7:44     ` Lionel Landwerlin
2019-10-14 22:33 ` ✗ Fi.CI.BUILD: failure for series starting with [1/3] drm/i915/perf: Add helper macros for comparing with whitelisted registers Patchwork

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.