All of lore.kernel.org
 help / color / mirror / Atom feed
* [Intel-gfx] [PATCH 0/1] Allow privileged user to map the OA buffer
@ 2020-07-14  7:22 Umesh Nerlige Ramappa
  2020-07-14  7:22 ` [Intel-gfx] [PATCH 1/1] drm/i915/perf: Map OA buffer to user space Umesh Nerlige Ramappa
                   ` (2 more replies)
  0 siblings, 3 replies; 13+ messages in thread
From: Umesh Nerlige Ramappa @ 2020-07-14  7:22 UTC (permalink / raw)
  To: Lionel G Landwerlin, intel-gfx

This cover letter is included to trigger "Test-with" an IGT patch.

Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
Test-with: 20200714071334.69883-1-umesh.nerlige.ramappa@intel.com

Piotr Maciejewski (1):
  drm/i915/perf: Map OA buffer to user space

 drivers/gpu/drm/i915/gt/intel_workarounds.c |  54 ++++++++
 drivers/gpu/drm/i915/i915_perf.c            | 130 +++++++++++++++++++-
 drivers/gpu/drm/i915/i915_perf_types.h      |  13 ++
 drivers/gpu/drm/i915/i915_reg.h             |  14 +++
 include/uapi/drm/i915_drm.h                 |  19 +++
 5 files changed, 227 insertions(+), 3 deletions(-)

-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [Intel-gfx] [PATCH 1/1] drm/i915/perf: Map OA buffer to user space
  2020-07-14  7:22 [Intel-gfx] [PATCH 0/1] Allow privileged user to map the OA buffer Umesh Nerlige Ramappa
@ 2020-07-14  7:22 ` Umesh Nerlige Ramappa
  2020-07-14  7:44   ` Umesh Nerlige Ramappa
                     ` (2 more replies)
  2020-07-14  8:17 ` [Intel-gfx] ✗ Fi.CI.CHECKPATCH: warning for Allow privileged user to map the OA buffer Patchwork
  2020-07-14  8:44 ` [Intel-gfx] ✗ Fi.CI.BAT: failure " Patchwork
  2 siblings, 3 replies; 13+ messages in thread
From: Umesh Nerlige Ramappa @ 2020-07-14  7:22 UTC (permalink / raw)
  To: Lionel G Landwerlin, intel-gfx

From: Piotr Maciejewski <piotr.maciejewski@intel.com>

i915 used to support time based sampling mode which is good for overall
system monitoring, but is not enough for query mode used to measure a
single draw call or dispatch. Gen9-Gen11 are using current i915 perf
implementation for query, but Gen12+ requires a new approach based on
triggered reports within oa buffer. In order to enable above feature
two changes are required:

1. Whitelist update:
- enable triggered reports within oa buffer
- reading oa buffer head/tail/status information
- reading gpu ticks counter.

2. Map oa buffer at umd driver level to solve below constraints related
   to time based sampling interface:
- longer time to access reports collected by oa buffer
- slow oa reports browsing since oa buffer size is large
- missing oa report index, so query cannot browse report directly
- with direct access to oa buffer, query can extract other useful
  reports like context switch information needed to calculate correct
  performance counters values.

Signed-off-by: Piotr Maciejewski <piotr.maciejewski@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_workarounds.c |  54 ++++++++
 drivers/gpu/drm/i915/i915_perf.c            | 130 +++++++++++++++++++-
 drivers/gpu/drm/i915/i915_perf_types.h      |  13 ++
 drivers/gpu/drm/i915/i915_reg.h             |  14 +++
 include/uapi/drm/i915_drm.h                 |  19 +++
 5 files changed, 227 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds.c b/drivers/gpu/drm/i915/gt/intel_workarounds.c
index 5726cd0a37e0..cf89928fc3a5 100644
--- a/drivers/gpu/drm/i915/gt/intel_workarounds.c
+++ b/drivers/gpu/drm/i915/gt/intel_workarounds.c
@@ -1365,6 +1365,48 @@ whitelist_reg(struct i915_wa_list *wal, i915_reg_t reg)
 	whitelist_reg_ext(wal, reg, RING_FORCE_TO_NONPRIV_ACCESS_RW);
 }
 
+static void gen9_whitelist_build_performance_counters(struct i915_wa_list *w)
+{
+	/* OA buffer trigger report 2/6 used by performance query */
+	whitelist_reg(w, OAREPORTTRIG2);
+	whitelist_reg(w, OAREPORTTRIG6);
+
+	/* Performance counters A18-20 used by tbs marker query */
+	whitelist_reg_ext(w, OA_PERF_COUNTER_A18,
+			  RING_FORCE_TO_NONPRIV_ACCESS_RW |
+			  RING_FORCE_TO_NONPRIV_RANGE_16);
+
+	/* Read access to gpu ticks */
+	whitelist_reg_ext(w, GEN8_GPU_TICKS,
+			  RING_FORCE_TO_NONPRIV_ACCESS_RD);
+
+	/* Read access to: oa status, head, tail, buffer settings */
+	whitelist_reg_ext(w, GEN8_OASTATUS,
+			  RING_FORCE_TO_NONPRIV_ACCESS_RD |
+			  RING_FORCE_TO_NONPRIV_RANGE_4);
+}
+
+static void gen12_whitelist_build_performance_counters(struct i915_wa_list *w)
+{
+	/* OA buffer trigger report 2/6 used by performance query */
+	whitelist_reg(w, GEN12_OAG_OAREPORTTRIG2);
+	whitelist_reg(w, GEN12_OAG_OAREPORTTRIG6);
+
+	/* Performance counters A18-20 used by tbs marker query */
+	whitelist_reg_ext(w, GEN12_OAG_PERF_COUNTER_A18,
+			  RING_FORCE_TO_NONPRIV_ACCESS_RW |
+			  RING_FORCE_TO_NONPRIV_RANGE_16);
+
+	/* Read access to gpu ticks */
+	whitelist_reg_ext(w, GEN12_OAG_GPU_TICKS,
+			  RING_FORCE_TO_NONPRIV_ACCESS_RD);
+
+	/* Read access to: oa status, head, tail, buffer settings */
+	whitelist_reg_ext(w, GEN12_OAG_OASTATUS,
+			  RING_FORCE_TO_NONPRIV_ACCESS_RD |
+			  RING_FORCE_TO_NONPRIV_RANGE_4);
+}
+
 static void gen9_whitelist_build(struct i915_wa_list *w)
 {
 	/* WaVFEStateAfterPipeControlwithMediaStateClear:skl,bxt,glk,cfl */
@@ -1378,6 +1420,9 @@ static void gen9_whitelist_build(struct i915_wa_list *w)
 
 	/* WaSendPushConstantsFromMMIO:skl,bxt */
 	whitelist_reg(w, COMMON_SLICE_CHICKEN2);
+
+	/* Performance counters support */
+	gen9_whitelist_build_performance_counters(w);
 }
 
 static void skl_whitelist_build(struct intel_engine_cs *engine)
@@ -1471,6 +1516,9 @@ static void cnl_whitelist_build(struct intel_engine_cs *engine)
 
 	/* WaEnablePreemptionGranularityControlByUMD:cnl */
 	whitelist_reg(w, GEN8_CS_CHICKEN1);
+
+	/* Performance counters support */
+	gen9_whitelist_build_performance_counters(w);
 }
 
 static void icl_whitelist_build(struct intel_engine_cs *engine)
@@ -1500,6 +1548,9 @@ static void icl_whitelist_build(struct intel_engine_cs *engine)
 		whitelist_reg_ext(w, PS_INVOCATION_COUNT,
 				  RING_FORCE_TO_NONPRIV_ACCESS_RD |
 				  RING_FORCE_TO_NONPRIV_RANGE_4);
+
+		/* Performance counters support */
+		gen9_whitelist_build_performance_counters(w);
 		break;
 
 	case VIDEO_DECODE_CLASS:
@@ -1550,6 +1601,9 @@ static void tgl_whitelist_build(struct intel_engine_cs *engine)
 
 		/* Wa_1806527549:tgl */
 		whitelist_reg(w, HIZ_CHICKEN);
+
+		/* Performance counters support */
+		gen12_whitelist_build_performance_counters(w);
 		break;
 	default:
 		whitelist_reg_ext(w,
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index c6f6370283cf..06a3fff52dfa 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -192,6 +192,7 @@
  */
 
 #include <linux/anon_inodes.h>
+#include <linux/mman.h>
 #include <linux/sizes.h>
 #include <linux/uuid.h>
 
@@ -434,6 +435,30 @@ static u32 gen7_oa_hw_tail_read(struct i915_perf_stream *stream)
 	return oastatus1 & GEN7_OASTATUS1_TAIL_MASK;
 }
 
+static u32 gen12_oa_hw_head_read(struct i915_perf_stream *stream)
+{
+	struct intel_uncore *uncore = stream->uncore;
+
+	return intel_uncore_read(uncore, GEN12_OAG_OAHEADPTR) &
+	       GEN12_OAG_OAHEADPTR_MASK;
+}
+
+static u32 gen8_oa_hw_head_read(struct i915_perf_stream *stream)
+{
+	struct intel_uncore *uncore = stream->uncore;
+
+	return intel_uncore_read(uncore, GEN8_OAHEADPTR) &
+	       GEN8_OAHEADPTR_MASK;
+}
+
+static u32 gen7_oa_hw_head_read(struct i915_perf_stream *stream)
+{
+	struct intel_uncore *uncore = stream->uncore;
+	u32 oastatus2 = intel_uncore_read(uncore, GEN7_OASTATUS2);
+
+	return oastatus2 & GEN7_OASTATUS2_HEAD_MASK;
+}
+
 /**
  * oa_buffer_check_unlocked - check for data and update tail ptr state
  * @stream: i915 stream instance
@@ -1328,6 +1353,7 @@ free_oa_buffer(struct i915_perf_stream *stream)
 	i915_vma_unpin_and_release(&stream->oa_buffer.vma,
 				   I915_VMA_RELEASE_MAP);
 
+	stream->oa_buffer.cpu_address = 0;
 	stream->oa_buffer.vaddr = NULL;
 }
 
@@ -1448,7 +1474,8 @@ static void gen8_init_oa_buffer(struct i915_perf_stream *stream)
 	 *  bit."
 	 */
 	intel_uncore_write(uncore, GEN8_OABUFFER, gtt_offset |
-		   OABUFFER_SIZE_16M | GEN8_OABUFFER_MEM_SELECT_GGTT);
+			   OABUFFER_SIZE_16M | GEN8_OABUFFER_MEM_SELECT_GGTT |
+			   GEN7_OABUFFER_EDGE_TRIGGER);
 	intel_uncore_write(uncore, GEN8_OATAILPTR, gtt_offset & GEN8_OATAILPTR_MASK);
 
 	/* Mark that we need updated tail pointers to read from... */
@@ -1501,7 +1528,8 @@ static void gen12_init_oa_buffer(struct i915_perf_stream *stream)
 	 *  bit."
 	 */
 	intel_uncore_write(uncore, GEN12_OAG_OABUFFER, gtt_offset |
-			   OABUFFER_SIZE_16M | GEN8_OABUFFER_MEM_SELECT_GGTT);
+			   OABUFFER_SIZE_16M | GEN8_OABUFFER_MEM_SELECT_GGTT |
+			   GEN7_OABUFFER_EDGE_TRIGGER);
 	intel_uncore_write(uncore, GEN12_OAG_OATAILPTR,
 			   gtt_offset & GEN12_OAG_OATAILPTR_MASK);
 
@@ -1562,6 +1590,7 @@ static int alloc_oa_buffer(struct i915_perf_stream *stream)
 		goto err_unref;
 	}
 	stream->oa_buffer.vma = vma;
+	stream->oa_buffer.cpu_address = 0;
 
 	stream->oa_buffer.vaddr =
 		i915_gem_object_pin_map(bo, I915_MAP_WB);
@@ -1584,6 +1613,52 @@ static int alloc_oa_buffer(struct i915_perf_stream *stream)
 	return ret;
 }
 
+static int map_oa_buffer(struct i915_perf_stream *stream)
+{
+	unsigned long address = 0;
+	const u64 size = OA_BUFFER_SIZE;
+	struct i915_vma *oabuffer_vma = stream->oa_buffer.vma;
+	struct drm_i915_gem_object *oabuffer_obj = oabuffer_vma->obj;
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma = NULL;
+
+	if(stream->oa_buffer.cpu_address != 0)
+		return 0;
+
+	if (!boot_cpu_has(X86_FEATURE_PAT))
+		return -ENODEV;
+
+	if (!oabuffer_obj || !oabuffer_vma)
+		return -ENOENT;
+
+	if (!oabuffer_obj->base.filp)
+		return -ENXIO;
+
+	if (range_overflows_t(u64, 0, size, oabuffer_obj->base.size))
+		return -EINVAL;
+
+	address = vm_mmap(oabuffer_obj->base.filp, 0, size,
+			  PROT_READ, MAP_SHARED, 0);
+
+	if (IS_ERR_VALUE(address))
+		return address;
+
+	if (mmap_write_lock_killable(mm))
+		return -EINTR;
+
+	vma = find_vma(mm, address);
+	if (vma) {
+		vma->vm_page_prot =
+			pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
+
+		stream->oa_buffer.cpu_address = address;
+	}
+
+	mmap_write_unlock(mm);
+
+	return vma ? 0 : -ENOMEM;
+}
+
 static u32 *save_restore_register(struct i915_perf_stream *stream, u32 *cs,
 				  bool save, i915_reg_t reg, u32 offset,
 				  u32 dword_count)
@@ -2493,6 +2568,13 @@ gen12_enable_metric_set(struct i915_perf_stream *stream,
 			    (period_exponent << GEN12_OAG_OAGLBCTXCTRL_TIMER_PERIOD_SHIFT))
 			    : 0);
 
+	/*
+	 * Initialize Super Queue Internal Cnt Register
+	 * BIT(30) - PMON Enable - set in order to collect valid metrics.
+	 */
+	intel_uncore_write(uncore, GEN12_SQCNT1,
+			   intel_uncore_read(uncore, GEN12_SQCNT1) | BIT(30));
+
 	/*
 	 * Update all contexts prior writing the mux configurations as we need
 	 * to make sure all slices/subslices are ON before writing to NOA
@@ -3199,6 +3281,39 @@ static long i915_perf_config_locked(struct i915_perf_stream *stream,
 	return ret;
 }
 
+/**
+ * i915_perf_get_oa_buffer_info_locked - Properties of the i915-perf OA buffer
+ * @arg: pointer to oa buffer info populated by this function.
+ */
+static int i915_perf_get_oa_buffer_info_locked(struct i915_perf_stream *stream,
+					       unsigned long arg)
+{
+	struct drm_i915_perf_oa_buffer_info info;
+	void __user *output = (void __user *) arg;
+	int ret;
+
+	if (!output)
+		return -EINVAL;
+
+	memset(&info, 0, sizeof(info));
+
+	info.size = stream->oa_buffer.vma->size;
+	info.head = stream->perf->ops.oa_hw_head_read(stream);
+	info.tail = stream->perf->ops.oa_hw_tail_read(stream);
+	info.gpu_address = i915_ggtt_offset(stream->oa_buffer.vma);
+
+	ret = map_oa_buffer(stream);
+	if (ret)
+		return ret;
+
+	info.cpu_address = stream->oa_buffer.cpu_address;
+
+	if (copy_to_user(output, &info, sizeof(info)))
+		return -EFAULT;
+
+	return 0;
+}
+
 /**
  * i915_perf_ioctl - support ioctl() usage with i915 perf stream FDs
  * @stream: An i915 perf stream
@@ -3224,6 +3339,8 @@ static long i915_perf_ioctl_locked(struct i915_perf_stream *stream,
 		return 0;
 	case I915_PERF_IOCTL_CONFIG:
 		return i915_perf_config_locked(stream, arg);
+	case I915_PERF_IOCTL_GET_OA_BUFFER_INFO:
+		return i915_perf_get_oa_buffer_info_locked(stream, arg);
 	}
 
 	return -EINVAL;
@@ -4245,6 +4362,7 @@ void i915_perf_init(struct drm_i915_private *i915)
 		perf->ops.oa_disable = gen7_oa_disable;
 		perf->ops.read = gen7_oa_read;
 		perf->ops.oa_hw_tail_read = gen7_oa_hw_tail_read;
+		perf->ops.oa_hw_head_read = gen7_oa_hw_head_read;
 
 		perf->oa_formats = hsw_oa_formats;
 	} else if (HAS_LOGICAL_RING_CONTEXTS(i915)) {
@@ -4276,6 +4394,7 @@ void i915_perf_init(struct drm_i915_private *i915)
 			perf->ops.enable_metric_set = gen8_enable_metric_set;
 			perf->ops.disable_metric_set = gen8_disable_metric_set;
 			perf->ops.oa_hw_tail_read = gen8_oa_hw_tail_read;
+			perf->ops.oa_hw_head_read = gen8_oa_hw_head_read;
 
 			if (IS_GEN(i915, 8)) {
 				perf->ctx_oactxctrl_offset = 0x120;
@@ -4303,6 +4422,7 @@ void i915_perf_init(struct drm_i915_private *i915)
 			perf->ops.enable_metric_set = gen8_enable_metric_set;
 			perf->ops.disable_metric_set = gen10_disable_metric_set;
 			perf->ops.oa_hw_tail_read = gen8_oa_hw_tail_read;
+			perf->ops.oa_hw_head_read = gen8_oa_hw_head_read;
 
 			if (IS_GEN(i915, 10)) {
 				perf->ctx_oactxctrl_offset = 0x128;
@@ -4327,6 +4447,7 @@ void i915_perf_init(struct drm_i915_private *i915)
 			perf->ops.enable_metric_set = gen12_enable_metric_set;
 			perf->ops.disable_metric_set = gen12_disable_metric_set;
 			perf->ops.oa_hw_tail_read = gen12_oa_hw_tail_read;
+			perf->ops.oa_hw_head_read = gen12_oa_hw_head_read;
 
 			perf->ctx_flexeu0_offset = 0;
 			perf->ctx_oactxctrl_offset = 0x144;
@@ -4432,8 +4553,11 @@ int i915_perf_ioctl_version(void)
 	 *
 	 * 5: Add DRM_I915_PERF_PROP_POLL_OA_PERIOD parameter that controls the
 	 *    interval for the hrtimer used to check for OA data.
+	 *
+	 * 6: Added an option to map oa buffer at umd driver level and trigger
+	 *    oa reports within oa buffer from command buffer.
 	 */
-	return 5;
+	return 6;
 }
 
 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
diff --git a/drivers/gpu/drm/i915/i915_perf_types.h b/drivers/gpu/drm/i915/i915_perf_types.h
index a36a455ae336..5b40e20c2aa9 100644
--- a/drivers/gpu/drm/i915/i915_perf_types.h
+++ b/drivers/gpu/drm/i915/i915_perf_types.h
@@ -251,6 +251,14 @@ struct i915_perf_stream {
 		int format_size;
 		int size_exponent;
 
+		/**
+		 * @cpu_address: OA buffer cpu address.
+		 *
+		 * Needed to map OA buffer at umd driver level
+		 * to obtain cpu pointer and browse reports.
+		 */
+		u64 cpu_address;
+
 		/**
 		 * @ptr_lock: Locks reads and writes to all head/tail state
 		 *
@@ -377,6 +385,11 @@ struct i915_oa_ops {
 	 * generations.
 	 */
 	u32 (*oa_hw_tail_read)(struct i915_perf_stream *stream);
+
+	/**
+	 * @oa_hw_head_read: read the OA head pointer register
+	 */
+	u32 (*oa_hw_head_read)(struct i915_perf_stream *stream);
 };
 
 struct i915_perf {
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 86a23ced051b..2e3d264339e0 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -675,6 +675,7 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define  GEN7_OASTATUS2_HEAD_MASK           0xffffffc0
 #define  GEN7_OASTATUS2_MEM_SELECT_GGTT     (1 << 0) /* 0: PPGTT, 1: GGTT */
 
+#define GEN8_GPU_TICKS _MMIO(0x2910)
 #define GEN8_OASTATUS _MMIO(0x2b08)
 #define  GEN8_OASTATUS_OVERRUN_STATUS	    (1 << 3)
 #define  GEN8_OASTATUS_COUNTER_OVERFLOW     (1 << 2)
@@ -696,6 +697,7 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define OABUFFER_SIZE_16M   (7 << 3)
 
 #define GEN12_OA_TLB_INV_CR _MMIO(0xceec)
+#define GEN12_SQCNT1 _MMIO(0x8718)
 
 /* Gen12 OAR unit */
 #define GEN12_OAR_OACONTROL _MMIO(0x2960)
@@ -731,6 +733,7 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define  GEN12_OAG_OA_DEBUG_DISABLE_GO_1_0_REPORTS     (1 << 2)
 #define  GEN12_OAG_OA_DEBUG_DISABLE_CTX_SWITCH_REPORTS (1 << 1)
 
+#define GEN12_OAG_GPU_TICKS _MMIO(0xda90)
 #define GEN12_OAG_OASTATUS _MMIO(0xdafc)
 #define  GEN12_OAG_OASTATUS_COUNTER_OVERFLOW (1 << 2)
 #define  GEN12_OAG_OASTATUS_BUFFER_OVERFLOW  (1 << 1)
@@ -972,6 +975,17 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define OAREPORTTRIG8_NOA_SELECT_6_SHIFT    24
 #define OAREPORTTRIG8_NOA_SELECT_7_SHIFT    28
 
+/* Performance counters registers */
+#define OA_PERF_COUNTER_A18   _MMIO(0x2890)
+#define OA_PERF_COUNTER_A19   _MMIO(0x2898)
+#define OA_PERF_COUNTER_A20   _MMIO(0x28A0)
+
+/* Gen12 Performance counters registers */
+#define GEN12_OAG_PERF_COUNTER_A16   _MMIO(0xDA00)
+#define GEN12_OAG_PERF_COUNTER_A18   _MMIO(0xDA10)
+#define GEN12_OAG_PERF_COUNTER_A19   _MMIO(0xDA18)
+#define GEN12_OAG_PERF_COUNTER_A20   _MMIO(0xDA20)
+
 /* Same layout as OASTARTTRIGX */
 #define GEN12_OAG_OASTARTTRIG1 _MMIO(0xd900)
 #define GEN12_OAG_OASTARTTRIG2 _MMIO(0xd904)
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 14b67cd6b54b..62b88c0123c8 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -2048,6 +2048,25 @@ struct drm_i915_perf_open_param {
  */
 #define I915_PERF_IOCTL_CONFIG	_IO('i', 0x2)
 
+/**
+ * Returns OA buffer properties.
+ *
+ * This ioctl is available in perf revision 6.
+ */
+#define I915_PERF_IOCTL_GET_OA_BUFFER_INFO _IO('i', 0x3)
+
+/**
+ * OA buffer information structure.
+ */
+struct drm_i915_perf_oa_buffer_info {
+	__u32 size;
+	__u32 head;
+	__u32 tail;
+	__u32 gpu_address;
+	__u64 cpu_address;
+	__u64 reserved[4];
+};
+
 /**
  * Common to all i915 perf records
  */
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [Intel-gfx] [PATCH 1/1] drm/i915/perf: Map OA buffer to user space
  2020-07-14  7:22 ` [Intel-gfx] [PATCH 1/1] drm/i915/perf: Map OA buffer to user space Umesh Nerlige Ramappa
@ 2020-07-14  7:44   ` Umesh Nerlige Ramappa
  2020-07-14 11:28   ` Chris Wilson
  2020-07-16 15:32   ` Lionel Landwerlin
  2 siblings, 0 replies; 13+ messages in thread
From: Umesh Nerlige Ramappa @ 2020-07-14  7:44 UTC (permalink / raw)
  To: Lionel G Landwerlin, intel-gfx

On Tue, Jul 14, 2020 at 12:22:39AM -0700, Umesh Nerlige Ramappa wrote:
>From: Piotr Maciejewski <piotr.maciejewski@intel.com>
>
>i915 used to support time based sampling mode which is good for overall
>system monitoring, but is not enough for query mode used to measure a
>single draw call or dispatch. Gen9-Gen11 are using current i915 perf
>implementation for query, but Gen12+ requires a new approach based on
>triggered reports within oa buffer. In order to enable above feature
>two changes are required:
>
>1. Whitelist update:
>- enable triggered reports within oa buffer
>- reading oa buffer head/tail/status information
>- reading gpu ticks counter.
>
>2. Map oa buffer at umd driver level to solve below constraints related
>   to time based sampling interface:
>- longer time to access reports collected by oa buffer
>- slow oa reports browsing since oa buffer size is large
>- missing oa report index, so query cannot browse report directly
>- with direct access to oa buffer, query can extract other useful
>  reports like context switch information needed to calculate correct
>  performance counters values.
>
>Signed-off-by: Piotr Maciejewski <piotr.maciejewski@intel.com>
>---
> drivers/gpu/drm/i915/gt/intel_workarounds.c |  54 ++++++++
> drivers/gpu/drm/i915/i915_perf.c            | 130 +++++++++++++++++++-
> drivers/gpu/drm/i915/i915_perf_types.h      |  13 ++
> drivers/gpu/drm/i915/i915_reg.h             |  14 +++
> include/uapi/drm/i915_drm.h                 |  19 +++
> 5 files changed, 227 insertions(+), 3 deletions(-)
>
>diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds.c b/drivers/gpu/drm/i915/gt/intel_workarounds.c
>index 5726cd0a37e0..cf89928fc3a5 100644
>--- a/drivers/gpu/drm/i915/gt/intel_workarounds.c
>+++ b/drivers/gpu/drm/i915/gt/intel_workarounds.c
>@@ -1365,6 +1365,48 @@ whitelist_reg(struct i915_wa_list *wal, i915_reg_t reg)
> 	whitelist_reg_ext(wal, reg, RING_FORCE_TO_NONPRIV_ACCESS_RW);
> }
>
>+static void gen9_whitelist_build_performance_counters(struct i915_wa_list *w)
>+{
>+	/* OA buffer trigger report 2/6 used by performance query */
>+	whitelist_reg(w, OAREPORTTRIG2);
>+	whitelist_reg(w, OAREPORTTRIG6);
>+
>+	/* Performance counters A18-20 used by tbs marker query */
>+	whitelist_reg_ext(w, OA_PERF_COUNTER_A18,
>+			  RING_FORCE_TO_NONPRIV_ACCESS_RW |
>+			  RING_FORCE_TO_NONPRIV_RANGE_16);

the above whitelist should be broken into

	whitelist_reg_ext(w, OA_PERF_COUNTER_A18,
			  RING_FORCE_TO_NONPRIV_ACCESS_RW |
			  RING_FORCE_TO_NONPRIV_RANGE_4);
	whitelist_reg(w, OA_PERF_COUNTER_A20);
	whitelist_reg(w, OA_PERF_COUNTER_A20_UPPER);

>+
>+	/* Read access to gpu ticks */
>+	whitelist_reg_ext(w, GEN8_GPU_TICKS,
>+			  RING_FORCE_TO_NONPRIV_ACCESS_RD);
>+
>+	/* Read access to: oa status, head, tail, buffer settings */
>+	whitelist_reg_ext(w, GEN8_OASTATUS,
>+			  RING_FORCE_TO_NONPRIV_ACCESS_RD |
>+			  RING_FORCE_TO_NONPRIV_RANGE_4);
>+}
>+
>+static void gen12_whitelist_build_performance_counters(struct i915_wa_list *w)
>+{
>+	/* OA buffer trigger report 2/6 used by performance query */
>+	whitelist_reg(w, GEN12_OAG_OAREPORTTRIG2);
>+	whitelist_reg(w, GEN12_OAG_OAREPORTTRIG6);
>+
>+	/* Performance counters A18-20 used by tbs marker query */
>+	whitelist_reg_ext(w, GEN12_OAG_PERF_COUNTER_A18,
>+			  RING_FORCE_TO_NONPRIV_ACCESS_RW |
>+			  RING_FORCE_TO_NONPRIV_RANGE_16);

same as the above comment

>+
>+	/* Read access to gpu ticks */
>+	whitelist_reg_ext(w, GEN12_OAG_GPU_TICKS,
>+			  RING_FORCE_TO_NONPRIV_ACCESS_RD);
>+
>+	/* Read access to: oa status, head, tail, buffer settings */
>+	whitelist_reg_ext(w, GEN12_OAG_OASTATUS,
>+			  RING_FORCE_TO_NONPRIV_ACCESS_RD |
>+			  RING_FORCE_TO_NONPRIV_RANGE_4);
>+}
>+
> static void gen9_whitelist_build(struct i915_wa_list *w)
> {
> 	/* WaVFEStateAfterPipeControlwithMediaStateClear:skl,bxt,glk,cfl */
>@@ -1378,6 +1420,9 @@ static void gen9_whitelist_build(struct i915_wa_list *w)
>
> 	/* WaSendPushConstantsFromMMIO:skl,bxt */
> 	whitelist_reg(w, COMMON_SLICE_CHICKEN2);
>+
>+	/* Performance counters support */
>+	gen9_whitelist_build_performance_counters(w);
> }
>
> static void skl_whitelist_build(struct intel_engine_cs *engine)
>@@ -1471,6 +1516,9 @@ static void cnl_whitelist_build(struct intel_engine_cs *engine)
>
> 	/* WaEnablePreemptionGranularityControlByUMD:cnl */
> 	whitelist_reg(w, GEN8_CS_CHICKEN1);
>+
>+	/* Performance counters support */
>+	gen9_whitelist_build_performance_counters(w);
> }
>
> static void icl_whitelist_build(struct intel_engine_cs *engine)
>@@ -1500,6 +1548,9 @@ static void icl_whitelist_build(struct intel_engine_cs *engine)
> 		whitelist_reg_ext(w, PS_INVOCATION_COUNT,
> 				  RING_FORCE_TO_NONPRIV_ACCESS_RD |
> 				  RING_FORCE_TO_NONPRIV_RANGE_4);
>+
>+		/* Performance counters support */
>+		gen9_whitelist_build_performance_counters(w);
> 		break;
>
> 	case VIDEO_DECODE_CLASS:
>@@ -1550,6 +1601,9 @@ static void tgl_whitelist_build(struct intel_engine_cs *engine)
>
> 		/* Wa_1806527549:tgl */
> 		whitelist_reg(w, HIZ_CHICKEN);
>+
>+		/* Performance counters support */
>+		gen12_whitelist_build_performance_counters(w);
> 		break;
> 	default:
> 		whitelist_reg_ext(w,
>diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
>index c6f6370283cf..06a3fff52dfa 100644
>--- a/drivers/gpu/drm/i915/i915_perf.c
>+++ b/drivers/gpu/drm/i915/i915_perf.c
>@@ -192,6 +192,7 @@
>  */
>
> #include <linux/anon_inodes.h>
>+#include <linux/mman.h>
> #include <linux/sizes.h>
> #include <linux/uuid.h>
>
>@@ -434,6 +435,30 @@ static u32 gen7_oa_hw_tail_read(struct i915_perf_stream *stream)
> 	return oastatus1 & GEN7_OASTATUS1_TAIL_MASK;
> }
>
>+static u32 gen12_oa_hw_head_read(struct i915_perf_stream *stream)
>+{
>+	struct intel_uncore *uncore = stream->uncore;
>+
>+	return intel_uncore_read(uncore, GEN12_OAG_OAHEADPTR) &
>+	       GEN12_OAG_OAHEADPTR_MASK;
>+}
>+
>+static u32 gen8_oa_hw_head_read(struct i915_perf_stream *stream)
>+{
>+	struct intel_uncore *uncore = stream->uncore;
>+
>+	return intel_uncore_read(uncore, GEN8_OAHEADPTR) &
>+	       GEN8_OAHEADPTR_MASK;
>+}
>+
>+static u32 gen7_oa_hw_head_read(struct i915_perf_stream *stream)
>+{
>+	struct intel_uncore *uncore = stream->uncore;
>+	u32 oastatus2 = intel_uncore_read(uncore, GEN7_OASTATUS2);
>+
>+	return oastatus2 & GEN7_OASTATUS2_HEAD_MASK;
>+}
>+
> /**
>  * oa_buffer_check_unlocked - check for data and update tail ptr state
>  * @stream: i915 stream instance
>@@ -1328,6 +1353,7 @@ free_oa_buffer(struct i915_perf_stream *stream)
> 	i915_vma_unpin_and_release(&stream->oa_buffer.vma,
> 				   I915_VMA_RELEASE_MAP);
>
>+	stream->oa_buffer.cpu_address = 0;
> 	stream->oa_buffer.vaddr = NULL;
> }
>
>@@ -1448,7 +1474,8 @@ static void gen8_init_oa_buffer(struct i915_perf_stream *stream)
> 	 *  bit."
> 	 */
> 	intel_uncore_write(uncore, GEN8_OABUFFER, gtt_offset |
>-		   OABUFFER_SIZE_16M | GEN8_OABUFFER_MEM_SELECT_GGTT);
>+			   OABUFFER_SIZE_16M | GEN8_OABUFFER_MEM_SELECT_GGTT |
>+			   GEN7_OABUFFER_EDGE_TRIGGER);
> 	intel_uncore_write(uncore, GEN8_OATAILPTR, gtt_offset & GEN8_OATAILPTR_MASK);
>
> 	/* Mark that we need updated tail pointers to read from... */
>@@ -1501,7 +1528,8 @@ static void gen12_init_oa_buffer(struct i915_perf_stream *stream)
> 	 *  bit."
> 	 */
> 	intel_uncore_write(uncore, GEN12_OAG_OABUFFER, gtt_offset |
>-			   OABUFFER_SIZE_16M | GEN8_OABUFFER_MEM_SELECT_GGTT);
>+			   OABUFFER_SIZE_16M | GEN8_OABUFFER_MEM_SELECT_GGTT |
>+			   GEN7_OABUFFER_EDGE_TRIGGER);
> 	intel_uncore_write(uncore, GEN12_OAG_OATAILPTR,
> 			   gtt_offset & GEN12_OAG_OATAILPTR_MASK);
>
>@@ -1562,6 +1590,7 @@ static int alloc_oa_buffer(struct i915_perf_stream *stream)
> 		goto err_unref;
> 	}
> 	stream->oa_buffer.vma = vma;
>+	stream->oa_buffer.cpu_address = 0;
>
> 	stream->oa_buffer.vaddr =
> 		i915_gem_object_pin_map(bo, I915_MAP_WB);
>@@ -1584,6 +1613,52 @@ static int alloc_oa_buffer(struct i915_perf_stream *stream)
> 	return ret;
> }
>
>+static int map_oa_buffer(struct i915_perf_stream *stream)
>+{
>+	unsigned long address = 0;
>+	const u64 size = OA_BUFFER_SIZE;
>+	struct i915_vma *oabuffer_vma = stream->oa_buffer.vma;
>+	struct drm_i915_gem_object *oabuffer_obj = oabuffer_vma->obj;
>+	struct mm_struct *mm = current->mm;
>+	struct vm_area_struct *vma = NULL;
>+
>+	if(stream->oa_buffer.cpu_address != 0)
>+		return 0;
>+
>+	if (!boot_cpu_has(X86_FEATURE_PAT))
>+		return -ENODEV;
>+
>+	if (!oabuffer_obj || !oabuffer_vma)
>+		return -ENOENT;
>+
>+	if (!oabuffer_obj->base.filp)
>+		return -ENXIO;
>+
>+	if (range_overflows_t(u64, 0, size, oabuffer_obj->base.size))
>+		return -EINVAL;
>+
>+	address = vm_mmap(oabuffer_obj->base.filp, 0, size,
>+			  PROT_READ, MAP_SHARED, 0);
>+
>+	if (IS_ERR_VALUE(address))
>+		return address;
>+
>+	if (mmap_write_lock_killable(mm))
>+		return -EINTR;
>+
>+	vma = find_vma(mm, address);
>+	if (vma) {
>+		vma->vm_page_prot =
>+			pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
>+
>+		stream->oa_buffer.cpu_address = address;
>+	}
>+
>+	mmap_write_unlock(mm);
>+
>+	return vma ? 0 : -ENOMEM;
>+}
>+
> static u32 *save_restore_register(struct i915_perf_stream *stream, u32 *cs,
> 				  bool save, i915_reg_t reg, u32 offset,
> 				  u32 dword_count)
>@@ -2493,6 +2568,13 @@ gen12_enable_metric_set(struct i915_perf_stream *stream,
> 			    (period_exponent << GEN12_OAG_OAGLBCTXCTRL_TIMER_PERIOD_SHIFT))
> 			    : 0);
>
>+	/*
>+	 * Initialize Super Queue Internal Cnt Register
>+	 * BIT(30) - PMON Enable - set in order to collect valid metrics.
>+	 */
>+	intel_uncore_write(uncore, GEN12_SQCNT1,
>+			   intel_uncore_read(uncore, GEN12_SQCNT1) | BIT(30));
>+
> 	/*
> 	 * Update all contexts prior writing the mux configurations as we need
> 	 * to make sure all slices/subslices are ON before writing to NOA
>@@ -3199,6 +3281,39 @@ static long i915_perf_config_locked(struct i915_perf_stream *stream,
> 	return ret;
> }
>
>+/**
>+ * i915_perf_get_oa_buffer_info_locked - Properties of the i915-perf OA buffer
>+ * @arg: pointer to oa buffer info populated by this function.
>+ */
>+static int i915_perf_get_oa_buffer_info_locked(struct i915_perf_stream *stream,
>+					       unsigned long arg)
>+{
>+	struct drm_i915_perf_oa_buffer_info info;
>+	void __user *output = (void __user *) arg;
>+	int ret;
>+
>+	if (!output)
>+		return -EINVAL;
>+
>+	memset(&info, 0, sizeof(info));
>+
>+	info.size = stream->oa_buffer.vma->size;
>+	info.head = stream->perf->ops.oa_hw_head_read(stream);
>+	info.tail = stream->perf->ops.oa_hw_tail_read(stream);
>+	info.gpu_address = i915_ggtt_offset(stream->oa_buffer.vma);
>+
>+	ret = map_oa_buffer(stream);
>+	if (ret)
>+		return ret;
>+
>+	info.cpu_address = stream->oa_buffer.cpu_address;
>+
>+	if (copy_to_user(output, &info, sizeof(info)))
>+		return -EFAULT;
>+
>+	return 0;
>+}
>+
> /**
>  * i915_perf_ioctl - support ioctl() usage with i915 perf stream FDs
>  * @stream: An i915 perf stream
>@@ -3224,6 +3339,8 @@ static long i915_perf_ioctl_locked(struct i915_perf_stream *stream,
> 		return 0;
> 	case I915_PERF_IOCTL_CONFIG:
> 		return i915_perf_config_locked(stream, arg);
>+	case I915_PERF_IOCTL_GET_OA_BUFFER_INFO:
>+		return i915_perf_get_oa_buffer_info_locked(stream, arg);
> 	}
>
> 	return -EINVAL;
>@@ -4245,6 +4362,7 @@ void i915_perf_init(struct drm_i915_private *i915)
> 		perf->ops.oa_disable = gen7_oa_disable;
> 		perf->ops.read = gen7_oa_read;
> 		perf->ops.oa_hw_tail_read = gen7_oa_hw_tail_read;
>+		perf->ops.oa_hw_head_read = gen7_oa_hw_head_read;
>
> 		perf->oa_formats = hsw_oa_formats;
> 	} else if (HAS_LOGICAL_RING_CONTEXTS(i915)) {
>@@ -4276,6 +4394,7 @@ void i915_perf_init(struct drm_i915_private *i915)
> 			perf->ops.enable_metric_set = gen8_enable_metric_set;
> 			perf->ops.disable_metric_set = gen8_disable_metric_set;
> 			perf->ops.oa_hw_tail_read = gen8_oa_hw_tail_read;
>+			perf->ops.oa_hw_head_read = gen8_oa_hw_head_read;
>
> 			if (IS_GEN(i915, 8)) {
> 				perf->ctx_oactxctrl_offset = 0x120;
>@@ -4303,6 +4422,7 @@ void i915_perf_init(struct drm_i915_private *i915)
> 			perf->ops.enable_metric_set = gen8_enable_metric_set;
> 			perf->ops.disable_metric_set = gen10_disable_metric_set;
> 			perf->ops.oa_hw_tail_read = gen8_oa_hw_tail_read;
>+			perf->ops.oa_hw_head_read = gen8_oa_hw_head_read;
>
> 			if (IS_GEN(i915, 10)) {
> 				perf->ctx_oactxctrl_offset = 0x128;
>@@ -4327,6 +4447,7 @@ void i915_perf_init(struct drm_i915_private *i915)
> 			perf->ops.enable_metric_set = gen12_enable_metric_set;
> 			perf->ops.disable_metric_set = gen12_disable_metric_set;
> 			perf->ops.oa_hw_tail_read = gen12_oa_hw_tail_read;
>+			perf->ops.oa_hw_head_read = gen12_oa_hw_head_read;
>
> 			perf->ctx_flexeu0_offset = 0;
> 			perf->ctx_oactxctrl_offset = 0x144;
>@@ -4432,8 +4553,11 @@ int i915_perf_ioctl_version(void)
> 	 *
> 	 * 5: Add DRM_I915_PERF_PROP_POLL_OA_PERIOD parameter that controls the
> 	 *    interval for the hrtimer used to check for OA data.
>+	 *
>+	 * 6: Added an option to map oa buffer at umd driver level and trigger
>+	 *    oa reports within oa buffer from command buffer.
> 	 */
>-	return 5;
>+	return 6;
> }
>
> #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
>diff --git a/drivers/gpu/drm/i915/i915_perf_types.h b/drivers/gpu/drm/i915/i915_perf_types.h
>index a36a455ae336..5b40e20c2aa9 100644
>--- a/drivers/gpu/drm/i915/i915_perf_types.h
>+++ b/drivers/gpu/drm/i915/i915_perf_types.h
>@@ -251,6 +251,14 @@ struct i915_perf_stream {
> 		int format_size;
> 		int size_exponent;
>
>+		/**
>+		 * @cpu_address: OA buffer cpu address.
>+		 *
>+		 * Needed to map OA buffer at umd driver level
>+		 * to obtain cpu pointer and browse reports.
>+		 */
>+		u64 cpu_address;
>+
> 		/**
> 		 * @ptr_lock: Locks reads and writes to all head/tail state
> 		 *
>@@ -377,6 +385,11 @@ struct i915_oa_ops {
> 	 * generations.
> 	 */
> 	u32 (*oa_hw_tail_read)(struct i915_perf_stream *stream);
>+
>+	/**
>+	 * @oa_hw_head_read: read the OA head pointer register
>+	 */
>+	u32 (*oa_hw_head_read)(struct i915_perf_stream *stream);
> };
>
> struct i915_perf {
>diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
>index 86a23ced051b..2e3d264339e0 100644
>--- a/drivers/gpu/drm/i915/i915_reg.h
>+++ b/drivers/gpu/drm/i915/i915_reg.h
>@@ -675,6 +675,7 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
> #define  GEN7_OASTATUS2_HEAD_MASK           0xffffffc0
> #define  GEN7_OASTATUS2_MEM_SELECT_GGTT     (1 << 0) /* 0: PPGTT, 1: GGTT */
>
>+#define GEN8_GPU_TICKS _MMIO(0x2910)
> #define GEN8_OASTATUS _MMIO(0x2b08)
> #define  GEN8_OASTATUS_OVERRUN_STATUS	    (1 << 3)
> #define  GEN8_OASTATUS_COUNTER_OVERFLOW     (1 << 2)
>@@ -696,6 +697,7 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
> #define OABUFFER_SIZE_16M   (7 << 3)
>
> #define GEN12_OA_TLB_INV_CR _MMIO(0xceec)
>+#define GEN12_SQCNT1 _MMIO(0x8718)
>
> /* Gen12 OAR unit */
> #define GEN12_OAR_OACONTROL _MMIO(0x2960)
>@@ -731,6 +733,7 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
> #define  GEN12_OAG_OA_DEBUG_DISABLE_GO_1_0_REPORTS     (1 << 2)
> #define  GEN12_OAG_OA_DEBUG_DISABLE_CTX_SWITCH_REPORTS (1 << 1)
>
>+#define GEN12_OAG_GPU_TICKS _MMIO(0xda90)
> #define GEN12_OAG_OASTATUS _MMIO(0xdafc)
> #define  GEN12_OAG_OASTATUS_COUNTER_OVERFLOW (1 << 2)
> #define  GEN12_OAG_OASTATUS_BUFFER_OVERFLOW  (1 << 1)
>@@ -972,6 +975,17 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
> #define OAREPORTTRIG8_NOA_SELECT_6_SHIFT    24
> #define OAREPORTTRIG8_NOA_SELECT_7_SHIFT    28
>
>+/* Performance counters registers */
>+#define OA_PERF_COUNTER_A18   _MMIO(0x2890)
>+#define OA_PERF_COUNTER_A19   _MMIO(0x2898)
>+#define OA_PERF_COUNTER_A20   _MMIO(0x28A0)
>+
>+/* Gen12 Performance counters registers */
>+#define GEN12_OAG_PERF_COUNTER_A16   _MMIO(0xDA00)

unused. remove ^

>+#define GEN12_OAG_PERF_COUNTER_A18   _MMIO(0xDA10)
>+#define GEN12_OAG_PERF_COUNTER_A19   _MMIO(0xDA18)
>+#define GEN12_OAG_PERF_COUNTER_A20   _MMIO(0xDA20)
>+
> /* Same layout as OASTARTTRIGX */
> #define GEN12_OAG_OASTARTTRIG1 _MMIO(0xd900)
> #define GEN12_OAG_OASTARTTRIG2 _MMIO(0xd904)
>diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
>index 14b67cd6b54b..62b88c0123c8 100644
>--- a/include/uapi/drm/i915_drm.h
>+++ b/include/uapi/drm/i915_drm.h
>@@ -2048,6 +2048,25 @@ struct drm_i915_perf_open_param {
>  */
> #define I915_PERF_IOCTL_CONFIG	_IO('i', 0x2)
>
>+/**
>+ * Returns OA buffer properties.
>+ *
>+ * This ioctl is available in perf revision 6.
>+ */
>+#define I915_PERF_IOCTL_GET_OA_BUFFER_INFO _IO('i', 0x3)
>+
>+/**
>+ * OA buffer information structure.
>+ */
>+struct drm_i915_perf_oa_buffer_info {
>+	__u32 size;
>+	__u32 head;
>+	__u32 tail;
>+	__u32 gpu_address;
>+	__u64 cpu_address;
>+	__u64 reserved[4];
>+};
>+
> /**
>  * Common to all i915 perf records
>  */
>-- 
>2.20.1
>
>_______________________________________________
>Intel-gfx mailing list
>Intel-gfx@lists.freedesktop.org
>https://lists.freedesktop.org/mailman/listinfo/intel-gfx
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [Intel-gfx] ✗ Fi.CI.CHECKPATCH: warning for Allow privileged user to map the OA buffer
  2020-07-14  7:22 [Intel-gfx] [PATCH 0/1] Allow privileged user to map the OA buffer Umesh Nerlige Ramappa
  2020-07-14  7:22 ` [Intel-gfx] [PATCH 1/1] drm/i915/perf: Map OA buffer to user space Umesh Nerlige Ramappa
@ 2020-07-14  8:17 ` Patchwork
  2020-07-14  8:44 ` [Intel-gfx] ✗ Fi.CI.BAT: failure " Patchwork
  2 siblings, 0 replies; 13+ messages in thread
From: Patchwork @ 2020-07-14  8:17 UTC (permalink / raw)
  To: Umesh Nerlige Ramappa; +Cc: intel-gfx

== Series Details ==

Series: Allow privileged user to map the OA buffer
URL   : https://patchwork.freedesktop.org/series/79460/
State : warning

== Summary ==

$ dim checkpatch origin/drm-tip
69208bf02940 drm/i915/perf: Map OA buffer to user space
-:214: ERROR:SPACING: space required before the open parenthesis '('
#214: FILE: drivers/gpu/drm/i915/i915_perf.c:1625:
+	if(stream->oa_buffer.cpu_address != 0)

-:280: CHECK:SPACING: No space is necessary after a cast
#280: FILE: drivers/gpu/drm/i915/i915_perf.c:3292:
+	void __user *output = (void __user *) arg;

total: 1 errors, 0 warnings, 1 checks, 393 lines checked

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [Intel-gfx] ✗ Fi.CI.BAT: failure for Allow privileged user to map the OA buffer
  2020-07-14  7:22 [Intel-gfx] [PATCH 0/1] Allow privileged user to map the OA buffer Umesh Nerlige Ramappa
  2020-07-14  7:22 ` [Intel-gfx] [PATCH 1/1] drm/i915/perf: Map OA buffer to user space Umesh Nerlige Ramappa
  2020-07-14  8:17 ` [Intel-gfx] ✗ Fi.CI.CHECKPATCH: warning for Allow privileged user to map the OA buffer Patchwork
@ 2020-07-14  8:44 ` Patchwork
  2 siblings, 0 replies; 13+ messages in thread
From: Patchwork @ 2020-07-14  8:44 UTC (permalink / raw)
  To: Umesh Nerlige Ramappa; +Cc: intel-gfx

== Series Details ==

Series: Allow privileged user to map the OA buffer
URL   : https://patchwork.freedesktop.org/series/79460/
State : failure

== Summary ==

CI Bug Log - changes from CI_DRM_8741 -> Patchwork_18154
====================================================

Summary
-------

  **FAILURE**

  Serious unknown changes coming with Patchwork_18154 absolutely need to be
  verified manually.
  
  If you think the reported changes have nothing to do with the changes
  introduced in Patchwork_18154, please notify your bug team to allow them
  to document this new failure mode, which will reduce false positives in CI.

  External URL: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_18154/index.html

Possible new issues
-------------------

  Here are the unknown changes that may have been introduced in Patchwork_18154:

### IGT changes ###

#### Possible regressions ####

  * igt@i915_selftest@live@workarounds:
    - fi-tgl-u2:          [PASS][1] -> [DMESG-FAIL][2]
   [1]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8741/fi-tgl-u2/igt@i915_selftest@live@workarounds.html
   [2]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_18154/fi-tgl-u2/igt@i915_selftest@live@workarounds.html
    - fi-skl-6700k2:      [PASS][3] -> [DMESG-FAIL][4]
   [3]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8741/fi-skl-6700k2/igt@i915_selftest@live@workarounds.html
   [4]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_18154/fi-skl-6700k2/igt@i915_selftest@live@workarounds.html
    - fi-cml-s:           [PASS][5] -> [DMESG-FAIL][6]
   [5]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8741/fi-cml-s/igt@i915_selftest@live@workarounds.html
   [6]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_18154/fi-cml-s/igt@i915_selftest@live@workarounds.html
    - fi-icl-y:           [PASS][7] -> [DMESG-FAIL][8]
   [7]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8741/fi-icl-y/igt@i915_selftest@live@workarounds.html
   [8]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_18154/fi-icl-y/igt@i915_selftest@live@workarounds.html
    - fi-kbl-x1275:       [PASS][9] -> [DMESG-FAIL][10]
   [9]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8741/fi-kbl-x1275/igt@i915_selftest@live@workarounds.html
   [10]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_18154/fi-kbl-x1275/igt@i915_selftest@live@workarounds.html
    - fi-cfl-guc:         [PASS][11] -> [DMESG-FAIL][12]
   [11]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8741/fi-cfl-guc/igt@i915_selftest@live@workarounds.html
   [12]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_18154/fi-cfl-guc/igt@i915_selftest@live@workarounds.html
    - fi-tgl-y:           [PASS][13] -> [DMESG-FAIL][14]
   [13]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8741/fi-tgl-y/igt@i915_selftest@live@workarounds.html
   [14]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_18154/fi-tgl-y/igt@i915_selftest@live@workarounds.html
    - fi-skl-guc:         [PASS][15] -> [DMESG-FAIL][16]
   [15]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8741/fi-skl-guc/igt@i915_selftest@live@workarounds.html
   [16]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_18154/fi-skl-guc/igt@i915_selftest@live@workarounds.html
    - fi-kbl-8809g:       [PASS][17] -> [DMESG-FAIL][18]
   [17]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8741/fi-kbl-8809g/igt@i915_selftest@live@workarounds.html
   [18]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_18154/fi-kbl-8809g/igt@i915_selftest@live@workarounds.html
    - fi-cfl-8700k:       [PASS][19] -> [DMESG-FAIL][20]
   [19]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8741/fi-cfl-8700k/igt@i915_selftest@live@workarounds.html
   [20]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_18154/fi-cfl-8700k/igt@i915_selftest@live@workarounds.html
    - fi-kbl-r:           [PASS][21] -> [DMESG-FAIL][22]
   [21]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8741/fi-kbl-r/igt@i915_selftest@live@workarounds.html
   [22]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_18154/fi-kbl-r/igt@i915_selftest@live@workarounds.html
    - fi-icl-u2:          [PASS][23] -> [DMESG-FAIL][24]
   [23]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8741/fi-icl-u2/igt@i915_selftest@live@workarounds.html
   [24]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_18154/fi-icl-u2/igt@i915_selftest@live@workarounds.html
    - fi-cfl-8109u:       [PASS][25] -> [DMESG-FAIL][26]
   [25]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8741/fi-cfl-8109u/igt@i915_selftest@live@workarounds.html
   [26]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_18154/fi-cfl-8109u/igt@i915_selftest@live@workarounds.html
    - fi-skl-lmem:        [PASS][27] -> [DMESG-FAIL][28]
   [27]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8741/fi-skl-lmem/igt@i915_selftest@live@workarounds.html
   [28]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_18154/fi-skl-lmem/igt@i915_selftest@live@workarounds.html
    - fi-kbl-7500u:       [PASS][29] -> [DMESG-FAIL][30]
   [29]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8741/fi-kbl-7500u/igt@i915_selftest@live@workarounds.html
   [30]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_18154/fi-kbl-7500u/igt@i915_selftest@live@workarounds.html
    - fi-kbl-guc:         [PASS][31] -> [DMESG-FAIL][32]
   [31]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8741/fi-kbl-guc/igt@i915_selftest@live@workarounds.html
   [32]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_18154/fi-kbl-guc/igt@i915_selftest@live@workarounds.html
    - fi-kbl-soraka:      [PASS][33] -> [DMESG-FAIL][34]
   [33]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8741/fi-kbl-soraka/igt@i915_selftest@live@workarounds.html
   [34]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_18154/fi-kbl-soraka/igt@i915_selftest@live@workarounds.html
    - fi-whl-u:           [PASS][35] -> [DMESG-FAIL][36]
   [35]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8741/fi-whl-u/igt@i915_selftest@live@workarounds.html
   [36]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_18154/fi-whl-u/igt@i915_selftest@live@workarounds.html
    - fi-cml-u2:          [PASS][37] -> [DMESG-FAIL][38]
   [37]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8741/fi-cml-u2/igt@i915_selftest@live@workarounds.html
   [38]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_18154/fi-cml-u2/igt@i915_selftest@live@workarounds.html

  
#### Suppressed ####

  The following results come from untrusted machines, tests, or statuses.
  They do not affect the overall result.

  * igt@i915_selftest@live@gem:
    - {fi-tgl-dsi}:       [PASS][39] -> [INCOMPLETE][40]
   [39]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8741/fi-tgl-dsi/igt@i915_selftest@live@gem.html
   [40]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_18154/fi-tgl-dsi/igt@i915_selftest@live@gem.html

  * igt@i915_selftest@live@workarounds:
    - {fi-tgl-dsi}:       [PASS][41] -> [DMESG-FAIL][42]
   [41]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8741/fi-tgl-dsi/igt@i915_selftest@live@workarounds.html
   [42]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_18154/fi-tgl-dsi/igt@i915_selftest@live@workarounds.html
    - {fi-ehl-1}:         [PASS][43] -> [DMESG-FAIL][44]
   [43]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8741/fi-ehl-1/igt@i915_selftest@live@workarounds.html
   [44]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_18154/fi-ehl-1/igt@i915_selftest@live@workarounds.html

  
Known issues
------------

  Here are the changes found in Patchwork_18154 that come from known issues:

### IGT changes ###

#### Issues hit ####

  * igt@i915_module_load@reload:
    - fi-apl-guc:         [PASS][45] -> [DMESG-WARN][46] ([i915#1635] / [i915#1982])
   [45]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8741/fi-apl-guc/igt@i915_module_load@reload.html
   [46]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_18154/fi-apl-guc/igt@i915_module_load@reload.html

  * igt@i915_pm_rpm@module-reload:
    - fi-tgl-y:           [PASS][47] -> [DMESG-WARN][48] ([i915#1982]) +1 similar issue
   [47]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8741/fi-tgl-y/igt@i915_pm_rpm@module-reload.html
   [48]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_18154/fi-tgl-y/igt@i915_pm_rpm@module-reload.html

  * igt@i915_selftest@live@active:
    - fi-skl-lmem:        [PASS][49] -> [DMESG-FAIL][50] ([i915#666])
   [49]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8741/fi-skl-lmem/igt@i915_selftest@live@active.html
   [50]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_18154/fi-skl-lmem/igt@i915_selftest@live@active.html

  * igt@i915_selftest@live@workarounds:
    - fi-apl-guc:         [PASS][51] -> [DMESG-FAIL][52] ([i915#1635])
   [51]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8741/fi-apl-guc/igt@i915_selftest@live@workarounds.html
   [52]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_18154/fi-apl-guc/igt@i915_selftest@live@workarounds.html
    - fi-bxt-dsi:         [PASS][53] -> [DMESG-FAIL][54] ([i915#1635])
   [53]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8741/fi-bxt-dsi/igt@i915_selftest@live@workarounds.html
   [54]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_18154/fi-bxt-dsi/igt@i915_selftest@live@workarounds.html

  * igt@kms_busy@basic@flip:
    - fi-kbl-x1275:       [PASS][55] -> [DMESG-WARN][56] ([i915#62] / [i915#92] / [i915#95])
   [55]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8741/fi-kbl-x1275/igt@kms_busy@basic@flip.html
   [56]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_18154/fi-kbl-x1275/igt@kms_busy@basic@flip.html

  * igt@kms_cursor_legacy@basic-busy-flip-before-cursor-atomic:
    - fi-kbl-r:           [PASS][57] -> [DMESG-WARN][58] ([i915#1982])
   [57]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8741/fi-kbl-r/igt@kms_cursor_legacy@basic-busy-flip-before-cursor-atomic.html
   [58]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_18154/fi-kbl-r/igt@kms_cursor_legacy@basic-busy-flip-before-cursor-atomic.html

  * igt@prime_self_import@basic-with_two_bos:
    - fi-tgl-y:           [PASS][59] -> [DMESG-WARN][60] ([i915#402]) +1 similar issue
   [59]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8741/fi-tgl-y/igt@prime_self_import@basic-with_two_bos.html
   [60]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_18154/fi-tgl-y/igt@prime_self_import@basic-with_two_bos.html

  
#### Possible fixes ####

  * igt@i915_pm_rpm@basic-pci-d3-state:
    - fi-tgl-y:           [DMESG-WARN][61] ([i915#1982]) -> [PASS][62]
   [61]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8741/fi-tgl-y/igt@i915_pm_rpm@basic-pci-d3-state.html
   [62]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_18154/fi-tgl-y/igt@i915_pm_rpm@basic-pci-d3-state.html

  * igt@i915_selftest@live@execlists:
    - fi-icl-y:           [INCOMPLETE][63] ([i915#1684]) -> [PASS][64]
   [63]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8741/fi-icl-y/igt@i915_selftest@live@execlists.html
   [64]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_18154/fi-icl-y/igt@i915_selftest@live@execlists.html

  * igt@vgem_basic@setversion:
    - fi-tgl-y:           [DMESG-WARN][65] ([i915#402]) -> [PASS][66] +1 similar issue
   [65]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8741/fi-tgl-y/igt@vgem_basic@setversion.html
   [66]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_18154/fi-tgl-y/igt@vgem_basic@setversion.html

  
#### Warnings ####

  * igt@debugfs_test@read_all_entries:
    - fi-kbl-x1275:       [DMESG-WARN][67] ([i915#62] / [i915#92]) -> [DMESG-WARN][68] ([i915#62] / [i915#92] / [i915#95]) +3 similar issues
   [67]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8741/fi-kbl-x1275/igt@debugfs_test@read_all_entries.html
   [68]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_18154/fi-kbl-x1275/igt@debugfs_test@read_all_entries.html

  * igt@kms_flip@basic-plain-flip@a-dp1:
    - fi-kbl-x1275:       [DMESG-WARN][69] ([i915#62] / [i915#92] / [i915#95]) -> [DMESG-WARN][70] ([i915#62] / [i915#92])
   [69]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8741/fi-kbl-x1275/igt@kms_flip@basic-plain-flip@a-dp1.html
   [70]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_18154/fi-kbl-x1275/igt@kms_flip@basic-plain-flip@a-dp1.html

  
  {name}: This element is suppressed. This means it is ignored when computing
          the status of the difference (SUCCESS, WARNING, or FAILURE).

  [i915#1635]: https://gitlab.freedesktop.org/drm/intel/issues/1635
  [i915#1684]: https://gitlab.freedesktop.org/drm/intel/issues/1684
  [i915#1982]: https://gitlab.freedesktop.org/drm/intel/issues/1982
  [i915#402]: https://gitlab.freedesktop.org/drm/intel/issues/402
  [i915#62]: https://gitlab.freedesktop.org/drm/intel/issues/62
  [i915#666]: https://gitlab.freedesktop.org/drm/intel/issues/666
  [i915#92]: https://gitlab.freedesktop.org/drm/intel/issues/92
  [i915#95]: https://gitlab.freedesktop.org/drm/intel/issues/95


Participating hosts (46 -> 39)
------------------------------

  Missing    (7): fi-ilk-m540 fi-hsw-4200u fi-byt-squawks fi-bsw-cyan fi-ctg-p8600 fi-byt-clapper fi-bdw-samus 


Build changes
-------------

  * IGT: IGT_5735 -> IGTPW_4761
  * Linux: CI_DRM_8741 -> Patchwork_18154

  CI-20190529: 20190529
  CI_DRM_8741: 91eb8c9eebcf41f7368745d05c7f178903752ddd @ git://anongit.freedesktop.org/gfx-ci/linux
  IGTPW_4761: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_4761/index.html
  IGT_5735: 21f8204e54c122e4a0f8ca4b59e4b2db8d1ba687 @ git://anongit.freedesktop.org/xorg/app/intel-gpu-tools
  Patchwork_18154: 69208bf0294076de2cf8412dd081a40d1891f781 @ git://anongit.freedesktop.org/gfx-ci/linux


== Linux commits ==

69208bf02940 drm/i915/perf: Map OA buffer to user space

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_18154/index.html
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [Intel-gfx] [PATCH 1/1] drm/i915/perf: Map OA buffer to user space
  2020-07-14  7:22 ` [Intel-gfx] [PATCH 1/1] drm/i915/perf: Map OA buffer to user space Umesh Nerlige Ramappa
  2020-07-14  7:44   ` Umesh Nerlige Ramappa
@ 2020-07-14 11:28   ` Chris Wilson
  2020-07-14 20:10     ` Umesh Nerlige Ramappa
  2020-07-16 15:32   ` Lionel Landwerlin
  2 siblings, 1 reply; 13+ messages in thread
From: Chris Wilson @ 2020-07-14 11:28 UTC (permalink / raw)
  To: Lionel G Landwerlin, Umesh Nerlige Ramappa, intel-gfx

Quoting Umesh Nerlige Ramappa (2020-07-14 08:22:39)
> From: Piotr Maciejewski <piotr.maciejewski@intel.com>
> 
> i915 used to support time based sampling mode which is good for overall
> system monitoring, but is not enough for query mode used to measure a
> single draw call or dispatch. Gen9-Gen11 are using current i915 perf
> implementation for query, but Gen12+ requires a new approach based on
> triggered reports within oa buffer. In order to enable above feature
> two changes are required:
> 
> 1. Whitelist update:
> - enable triggered reports within oa buffer
> - reading oa buffer head/tail/status information
> - reading gpu ticks counter.
> 
> 2. Map oa buffer at umd driver level to solve below constraints related
>    to time based sampling interface:
> - longer time to access reports collected by oa buffer

If you aren't talking about a few 10us, then something else is wrong.

> - slow oa reports browsing since oa buffer size is large

Nothing changes on the surface. That does not sound like inherent
inefficiencies. Since the same number of events will be generated and
need to be processed. You may argue that they are easier to process in
situ, and that the number of events dwarf L1 cache. An mmap interface
could eliminate one copy (and certainly a copy-to-user).

> - missing oa report index, so query cannot browse report directly

There's more to it than that otherwise you would have proposed an
extension to the event format.

> - with direct access to oa buffer, query can extract other useful
>   reports like context switch information needed to calculate correct
>   performance counters values.

Why would you not start with an unprivileged mediated mmapped buffer?
If the goal is to reduce sample latency by replacing read ioctls with a
mmap, that would seem to be an orthogonal step to exposing the raw OA
buffer. The inference would be that you do want to extract extra details
from the OA that are not being catered for. That's perfectly fine, our
goal is to _safely_ expose HW and not get in the way of userspace. But
if that was the intent, it should not appear to be an afterthought.
[i.e. that mmap should be inherently faster for accessing a large ring
of data is much less important than discussing the safety concerns of
letting userspace have direct control/access of OA.]

> Signed-off-by: Piotr Maciejewski <piotr.maciejewski@intel.com>
> ---
>  drivers/gpu/drm/i915/gt/intel_workarounds.c |  54 ++++++++
>  drivers/gpu/drm/i915/i915_perf.c            | 130 +++++++++++++++++++-
>  drivers/gpu/drm/i915/i915_perf_types.h      |  13 ++
>  drivers/gpu/drm/i915/i915_reg.h             |  14 +++
>  include/uapi/drm/i915_drm.h                 |  19 +++
>  5 files changed, 227 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds.c b/drivers/gpu/drm/i915/gt/intel_workarounds.c
> index 5726cd0a37e0..cf89928fc3a5 100644
> --- a/drivers/gpu/drm/i915/gt/intel_workarounds.c
> +++ b/drivers/gpu/drm/i915/gt/intel_workarounds.c
> @@ -1365,6 +1365,48 @@ whitelist_reg(struct i915_wa_list *wal, i915_reg_t reg)
>         whitelist_reg_ext(wal, reg, RING_FORCE_TO_NONPRIV_ACCESS_RW);
>  }
>  
> +static void gen9_whitelist_build_performance_counters(struct i915_wa_list *w)
> +{
> +       /* OA buffer trigger report 2/6 used by performance query */
> +       whitelist_reg(w, OAREPORTTRIG2);
> +       whitelist_reg(w, OAREPORTTRIG6);
> +
> +       /* Performance counters A18-20 used by tbs marker query */
> +       whitelist_reg_ext(w, OA_PERF_COUNTER_A18,
> +                         RING_FORCE_TO_NONPRIV_ACCESS_RW |
> +                         RING_FORCE_TO_NONPRIV_RANGE_16);
> +
> +       /* Read access to gpu ticks */
> +       whitelist_reg_ext(w, GEN8_GPU_TICKS,
> +                         RING_FORCE_TO_NONPRIV_ACCESS_RD);
> +
> +       /* Read access to: oa status, head, tail, buffer settings */
> +       whitelist_reg_ext(w, GEN8_OASTATUS,
> +                         RING_FORCE_TO_NONPRIV_ACCESS_RD |
> +                         RING_FORCE_TO_NONPRIV_RANGE_4);
> +}
> +
> +static void gen12_whitelist_build_performance_counters(struct i915_wa_list *w)
> +{
> +       /* OA buffer trigger report 2/6 used by performance query */
> +       whitelist_reg(w, GEN12_OAG_OAREPORTTRIG2);
> +       whitelist_reg(w, GEN12_OAG_OAREPORTTRIG6);
> +
> +       /* Performance counters A18-20 used by tbs marker query */
> +       whitelist_reg_ext(w, GEN12_OAG_PERF_COUNTER_A18,
> +                         RING_FORCE_TO_NONPRIV_ACCESS_RW |
> +                         RING_FORCE_TO_NONPRIV_RANGE_16);
> +
> +       /* Read access to gpu ticks */
> +       whitelist_reg_ext(w, GEN12_OAG_GPU_TICKS,
> +                         RING_FORCE_TO_NONPRIV_ACCESS_RD);
> +
> +       /* Read access to: oa status, head, tail, buffer settings */
> +       whitelist_reg_ext(w, GEN12_OAG_OASTATUS,
> +                         RING_FORCE_TO_NONPRIV_ACCESS_RD |
> +                         RING_FORCE_TO_NONPRIV_RANGE_4);
> +}
> +
>  static void gen9_whitelist_build(struct i915_wa_list *w)
>  {
>         /* WaVFEStateAfterPipeControlwithMediaStateClear:skl,bxt,glk,cfl */
> @@ -1378,6 +1420,9 @@ static void gen9_whitelist_build(struct i915_wa_list *w)
>  
>         /* WaSendPushConstantsFromMMIO:skl,bxt */
>         whitelist_reg(w, COMMON_SLICE_CHICKEN2);
> +
> +       /* Performance counters support */
> +       gen9_whitelist_build_performance_counters(w);
>  }
>  
>  static void skl_whitelist_build(struct intel_engine_cs *engine)
> @@ -1471,6 +1516,9 @@ static void cnl_whitelist_build(struct intel_engine_cs *engine)
>  
>         /* WaEnablePreemptionGranularityControlByUMD:cnl */
>         whitelist_reg(w, GEN8_CS_CHICKEN1);
> +
> +       /* Performance counters support */
> +       gen9_whitelist_build_performance_counters(w);
>  }
>  
>  static void icl_whitelist_build(struct intel_engine_cs *engine)
> @@ -1500,6 +1548,9 @@ static void icl_whitelist_build(struct intel_engine_cs *engine)
>                 whitelist_reg_ext(w, PS_INVOCATION_COUNT,
>                                   RING_FORCE_TO_NONPRIV_ACCESS_RD |
>                                   RING_FORCE_TO_NONPRIV_RANGE_4);
> +
> +               /* Performance counters support */
> +               gen9_whitelist_build_performance_counters(w);
>                 break;
>  
>         case VIDEO_DECODE_CLASS:
> @@ -1550,6 +1601,9 @@ static void tgl_whitelist_build(struct intel_engine_cs *engine)
>  
>                 /* Wa_1806527549:tgl */
>                 whitelist_reg(w, HIZ_CHICKEN);
> +
> +               /* Performance counters support */
> +               gen12_whitelist_build_performance_counters(w);
>                 break;
>         default:
>                 whitelist_reg_ext(w,
> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
> index c6f6370283cf..06a3fff52dfa 100644
> --- a/drivers/gpu/drm/i915/i915_perf.c
> +++ b/drivers/gpu/drm/i915/i915_perf.c
> @@ -192,6 +192,7 @@
>   */
>  
>  #include <linux/anon_inodes.h>
> +#include <linux/mman.h>
>  #include <linux/sizes.h>
>  #include <linux/uuid.h>
>  
> @@ -434,6 +435,30 @@ static u32 gen7_oa_hw_tail_read(struct i915_perf_stream *stream)
>         return oastatus1 & GEN7_OASTATUS1_TAIL_MASK;
>  }
>  
> +static u32 gen12_oa_hw_head_read(struct i915_perf_stream *stream)
> +{
> +       struct intel_uncore *uncore = stream->uncore;
> +
> +       return intel_uncore_read(uncore, GEN12_OAG_OAHEADPTR) &
> +              GEN12_OAG_OAHEADPTR_MASK;
> +}
> +
> +static u32 gen8_oa_hw_head_read(struct i915_perf_stream *stream)
> +{
> +       struct intel_uncore *uncore = stream->uncore;
> +
> +       return intel_uncore_read(uncore, GEN8_OAHEADPTR) &
> +              GEN8_OAHEADPTR_MASK;
> +}
> +
> +static u32 gen7_oa_hw_head_read(struct i915_perf_stream *stream)
> +{
> +       struct intel_uncore *uncore = stream->uncore;
> +       u32 oastatus2 = intel_uncore_read(uncore, GEN7_OASTATUS2);
> +
> +       return oastatus2 & GEN7_OASTATUS2_HEAD_MASK;
> +}
> +
>  /**
>   * oa_buffer_check_unlocked - check for data and update tail ptr state
>   * @stream: i915 stream instance
> @@ -1328,6 +1353,7 @@ free_oa_buffer(struct i915_perf_stream *stream)
>         i915_vma_unpin_and_release(&stream->oa_buffer.vma,
>                                    I915_VMA_RELEASE_MAP);
>  
> +       stream->oa_buffer.cpu_address = 0;
>         stream->oa_buffer.vaddr = NULL;
>  }
>  
> @@ -1448,7 +1474,8 @@ static void gen8_init_oa_buffer(struct i915_perf_stream *stream)
>          *  bit."
>          */
>         intel_uncore_write(uncore, GEN8_OABUFFER, gtt_offset |
> -                  OABUFFER_SIZE_16M | GEN8_OABUFFER_MEM_SELECT_GGTT);
> +                          OABUFFER_SIZE_16M | GEN8_OABUFFER_MEM_SELECT_GGTT |
> +                          GEN7_OABUFFER_EDGE_TRIGGER);
>         intel_uncore_write(uncore, GEN8_OATAILPTR, gtt_offset & GEN8_OATAILPTR_MASK);
>  
>         /* Mark that we need updated tail pointers to read from... */
> @@ -1501,7 +1528,8 @@ static void gen12_init_oa_buffer(struct i915_perf_stream *stream)
>          *  bit."
>          */
>         intel_uncore_write(uncore, GEN12_OAG_OABUFFER, gtt_offset |
> -                          OABUFFER_SIZE_16M | GEN8_OABUFFER_MEM_SELECT_GGTT);
> +                          OABUFFER_SIZE_16M | GEN8_OABUFFER_MEM_SELECT_GGTT |
> +                          GEN7_OABUFFER_EDGE_TRIGGER);
>         intel_uncore_write(uncore, GEN12_OAG_OATAILPTR,
>                            gtt_offset & GEN12_OAG_OATAILPTR_MASK);
>  
> @@ -1562,6 +1590,7 @@ static int alloc_oa_buffer(struct i915_perf_stream *stream)
>                 goto err_unref;
>         }
>         stream->oa_buffer.vma = vma;
> +       stream->oa_buffer.cpu_address = 0;
>  
>         stream->oa_buffer.vaddr =
>                 i915_gem_object_pin_map(bo, I915_MAP_WB);
> @@ -1584,6 +1613,52 @@ static int alloc_oa_buffer(struct i915_perf_stream *stream)
>         return ret;
>  }
>  
> +static int map_oa_buffer(struct i915_perf_stream *stream)

You have a per-client perf fd. mmap that, with a known offset for the OA
buffer. That is make userspace call something along the lines of

ctl = mmap(0, 4096, PROT_WRITE, MAP_PRIVATE, perf_fd, PERF_MMAP_CTL);

oa = mmap(0, ctl->oa_size, PROT_READ, MAP_PRIVATE, perf_fd, ctl->oa_offset);

[ctl can be an ioctl to retrieve the size/offset]

At every available opportunity, it should be reiterated that the mmap is
unfiltered information leaks, continual reminders.

> +{
> +       unsigned long address = 0;
> +       const u64 size = OA_BUFFER_SIZE;
> +       struct i915_vma *oabuffer_vma = stream->oa_buffer.vma;
> +       struct drm_i915_gem_object *oabuffer_obj = oabuffer_vma->obj;
> +       struct mm_struct *mm = current->mm;
> +       struct vm_area_struct *vma = NULL;
> +
> +       if(stream->oa_buffer.cpu_address != 0)
> +               return 0;
> +
> +       if (!boot_cpu_has(X86_FEATURE_PAT))
> +               return -ENODEV;
> +
> +       if (!oabuffer_obj || !oabuffer_vma)
> +               return -ENOENT;
> +
> +       if (!oabuffer_obj->base.filp)
> +               return -ENXIO;
> +
> +       if (range_overflows_t(u64, 0, size, oabuffer_obj->base.size))
> +               return -EINVAL;
> +
> +       address = vm_mmap(oabuffer_obj->base.filp, 0, size,
> +                         PROT_READ, MAP_SHARED, 0);
> +
> +       if (IS_ERR_VALUE(address))
> +               return address;
> +
> +       if (mmap_write_lock_killable(mm))
> +               return -EINTR;
> +
> +       vma = find_vma(mm, address);
> +       if (vma) {
> +               vma->vm_page_prot =
> +                       pgprot_writecombine(vm_get_page_prot(vma->vm_flags));

This is dangerous code to copy! Let's avoid repeating mistakes of the
past.

> +
> +               stream->oa_buffer.cpu_address = address;
> +       }
> +
> +       mmap_write_unlock(mm);
> +
> +       return vma ? 0 : -ENOMEM;
> +}

> +/**
> + * i915_perf_get_oa_buffer_info_locked - Properties of the i915-perf OA buffer
> + * @arg: pointer to oa buffer info populated by this function.
> + */
> +static int i915_perf_get_oa_buffer_info_locked(struct i915_perf_stream *stream,
> +                                              unsigned long arg)
> +{
> +       struct drm_i915_perf_oa_buffer_info info;
> +       void __user *output = (void __user *) arg;
> +       int ret;
> +
> +       if (!output)
> +               return -EINVAL;
> +
> +       memset(&info, 0, sizeof(info));
> +
> +       info.size = stream->oa_buffer.vma->size;

> +       info.head = stream->perf->ops.oa_hw_head_read(stream);
> +       info.tail = stream->perf->ops.oa_hw_tail_read(stream);

So this is a snapshot of the HW state, you expect this ioctl to be
called frequently?

> +       info.gpu_address = i915_ggtt_offset(stream->oa_buffer.vma);

I'm not happy leaking this. I presume you have a very good reason. Add a
comment to justify it, or better don't include it.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [Intel-gfx] [PATCH 1/1] drm/i915/perf: Map OA buffer to user space
  2020-07-14 11:28   ` Chris Wilson
@ 2020-07-14 20:10     ` Umesh Nerlige Ramappa
  2020-07-14 20:24       ` Chris Wilson
  0 siblings, 1 reply; 13+ messages in thread
From: Umesh Nerlige Ramappa @ 2020-07-14 20:10 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

On Tue, Jul 14, 2020 at 12:28:15PM +0100, Chris Wilson wrote:
>Quoting Umesh Nerlige Ramappa (2020-07-14 08:22:39)
>> From: Piotr Maciejewski <piotr.maciejewski@intel.com>
>>
>> i915 used to support time based sampling mode which is good for overall
>> system monitoring, but is not enough for query mode used to measure a
>> single draw call or dispatch. Gen9-Gen11 are using current i915 perf
>> implementation for query, but Gen12+ requires a new approach based on
>> triggered reports within oa buffer. In order to enable above feature
>> two changes are required:
>>
>> 1. Whitelist update:
>> - enable triggered reports within oa buffer
>> - reading oa buffer head/tail/status information
>> - reading gpu ticks counter.
>>
>> 2. Map oa buffer at umd driver level to solve below constraints related
>>    to time based sampling interface:
>> - longer time to access reports collected by oa buffer
>
>If you aren't talking about a few 10us, then something else is wrong.
>
>> - slow oa reports browsing since oa buffer size is large
>
>Nothing changes on the surface. That does not sound like inherent
>inefficiencies. Since the same number of events will be generated and
>need to be processed. You may argue that they are easier to process in
>situ, and that the number of events dwarf L1 cache. An mmap interface
>could eliminate one copy (and certainly a copy-to-user).
>
>> - missing oa report index, so query cannot browse report directly
>
>There's more to it than that otherwise you would have proposed an
>extension to the event format.
>
>> - with direct access to oa buffer, query can extract other useful
>>   reports like context switch information needed to calculate correct
>>   performance counters values.
>
>Why would you not start with an unprivileged mediated mmapped buffer?
>If the goal is to reduce sample latency by replacing read ioctls with a
>mmap, that would seem to be an orthogonal step to exposing the raw OA
>buffer. The inference would be that you do want to extract extra details
>from the OA that are not being catered for. That's perfectly fine, our
>goal is to _safely_ expose HW and not get in the way of userspace. But
>if that was the intent, it should not appear to be an afterthought.
>[i.e. that mmap should be inherently faster for accessing a large ring
>of data is much less important than discussing the safety concerns of
>letting userspace have direct control/access of OA.]

fwiu

The goal is mainly being able to quickly view reports of interest within 
a window in the OA buffer. The start and end of the window are defined 
by triggered OA reports using the OAREPORTTRIG registers along with some 
counters that act as markers (A18 - A20).

In initial implementation, the user was trying to maintain a copy of the 
oa buffer that was being built by reading OA reports from the perf_fd 
read interface. Using a new ioctl interface that returned HW TAIL, the 
user would try to index into the window in this buffer. However, the 
mapping was not 1:1 because perf_fd read would drop reports that are 
zeroed out or invalid. Added to that, the user buffer was limited by 
size. This made mapping things to an intermediate user buffer more messy 
as it would require more information from the driver to do so.

Hence, the decision was to map the OA buffer to umd provided (1) that it 
is accessed from a privileged user and (2) OA report filtering is not 
used. These 2 conditions would satisfy the safety criteria that the 
perf_fd interface addresses.

>
>> Signed-off-by: Piotr Maciejewski <piotr.maciejewski@intel.com>
>> ---
>>  drivers/gpu/drm/i915/gt/intel_workarounds.c |  54 ++++++++
>>  drivers/gpu/drm/i915/i915_perf.c            | 130 +++++++++++++++++++-
>>  drivers/gpu/drm/i915/i915_perf_types.h      |  13 ++
>>  drivers/gpu/drm/i915/i915_reg.h             |  14 +++
>>  include/uapi/drm/i915_drm.h                 |  19 +++
>>  5 files changed, 227 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds.c b/drivers/gpu/drm/i915/gt/intel_workarounds.c
>> index 5726cd0a37e0..cf89928fc3a5 100644
>> --- a/drivers/gpu/drm/i915/gt/intel_workarounds.c
>> +++ b/drivers/gpu/drm/i915/gt/intel_workarounds.c
>> @@ -1365,6 +1365,48 @@ whitelist_reg(struct i915_wa_list *wal, i915_reg_t reg)
>>         whitelist_reg_ext(wal, reg, RING_FORCE_TO_NONPRIV_ACCESS_RW);
>>  }
>>
>> +static void gen9_whitelist_build_performance_counters(struct i915_wa_list *w)
>> +{
>> +       /* OA buffer trigger report 2/6 used by performance query */
>> +       whitelist_reg(w, OAREPORTTRIG2);
>> +       whitelist_reg(w, OAREPORTTRIG6);
>> +
>> +       /* Performance counters A18-20 used by tbs marker query */
>> +       whitelist_reg_ext(w, OA_PERF_COUNTER_A18,
>> +                         RING_FORCE_TO_NONPRIV_ACCESS_RW |
>> +                         RING_FORCE_TO_NONPRIV_RANGE_16);
>> +
>> +       /* Read access to gpu ticks */
>> +       whitelist_reg_ext(w, GEN8_GPU_TICKS,
>> +                         RING_FORCE_TO_NONPRIV_ACCESS_RD);
>> +
>> +       /* Read access to: oa status, head, tail, buffer settings */
>> +       whitelist_reg_ext(w, GEN8_OASTATUS,
>> +                         RING_FORCE_TO_NONPRIV_ACCESS_RD |
>> +                         RING_FORCE_TO_NONPRIV_RANGE_4);
>> +}
>> +
>> +static void gen12_whitelist_build_performance_counters(struct i915_wa_list *w)
>> +{
>> +       /* OA buffer trigger report 2/6 used by performance query */
>> +       whitelist_reg(w, GEN12_OAG_OAREPORTTRIG2);
>> +       whitelist_reg(w, GEN12_OAG_OAREPORTTRIG6);
>> +
>> +       /* Performance counters A18-20 used by tbs marker query */
>> +       whitelist_reg_ext(w, GEN12_OAG_PERF_COUNTER_A18,
>> +                         RING_FORCE_TO_NONPRIV_ACCESS_RW |
>> +                         RING_FORCE_TO_NONPRIV_RANGE_16);
>> +
>> +       /* Read access to gpu ticks */
>> +       whitelist_reg_ext(w, GEN12_OAG_GPU_TICKS,
>> +                         RING_FORCE_TO_NONPRIV_ACCESS_RD);
>> +
>> +       /* Read access to: oa status, head, tail, buffer settings */
>> +       whitelist_reg_ext(w, GEN12_OAG_OASTATUS,
>> +                         RING_FORCE_TO_NONPRIV_ACCESS_RD |
>> +                         RING_FORCE_TO_NONPRIV_RANGE_4);
>> +}
>> +
>>  static void gen9_whitelist_build(struct i915_wa_list *w)
>>  {
>>         /* WaVFEStateAfterPipeControlwithMediaStateClear:skl,bxt,glk,cfl */
>> @@ -1378,6 +1420,9 @@ static void gen9_whitelist_build(struct i915_wa_list *w)
>>
>>         /* WaSendPushConstantsFromMMIO:skl,bxt */
>>         whitelist_reg(w, COMMON_SLICE_CHICKEN2);
>> +
>> +       /* Performance counters support */
>> +       gen9_whitelist_build_performance_counters(w);
>>  }
>>
>>  static void skl_whitelist_build(struct intel_engine_cs *engine)
>> @@ -1471,6 +1516,9 @@ static void cnl_whitelist_build(struct intel_engine_cs *engine)
>>
>>         /* WaEnablePreemptionGranularityControlByUMD:cnl */
>>         whitelist_reg(w, GEN8_CS_CHICKEN1);
>> +
>> +       /* Performance counters support */
>> +       gen9_whitelist_build_performance_counters(w);
>>  }
>>
>>  static void icl_whitelist_build(struct intel_engine_cs *engine)
>> @@ -1500,6 +1548,9 @@ static void icl_whitelist_build(struct intel_engine_cs *engine)
>>                 whitelist_reg_ext(w, PS_INVOCATION_COUNT,
>>                                   RING_FORCE_TO_NONPRIV_ACCESS_RD |
>>                                   RING_FORCE_TO_NONPRIV_RANGE_4);
>> +
>> +               /* Performance counters support */
>> +               gen9_whitelist_build_performance_counters(w);
>>                 break;
>>
>>         case VIDEO_DECODE_CLASS:
>> @@ -1550,6 +1601,9 @@ static void tgl_whitelist_build(struct intel_engine_cs *engine)
>>
>>                 /* Wa_1806527549:tgl */
>>                 whitelist_reg(w, HIZ_CHICKEN);
>> +
>> +               /* Performance counters support */
>> +               gen12_whitelist_build_performance_counters(w);
>>                 break;
>>         default:
>>                 whitelist_reg_ext(w,
>> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
>> index c6f6370283cf..06a3fff52dfa 100644
>> --- a/drivers/gpu/drm/i915/i915_perf.c
>> +++ b/drivers/gpu/drm/i915/i915_perf.c
>> @@ -192,6 +192,7 @@
>>   */
>>
>>  #include <linux/anon_inodes.h>
>> +#include <linux/mman.h>
>>  #include <linux/sizes.h>
>>  #include <linux/uuid.h>
>>
>> @@ -434,6 +435,30 @@ static u32 gen7_oa_hw_tail_read(struct i915_perf_stream *stream)
>>         return oastatus1 & GEN7_OASTATUS1_TAIL_MASK;
>>  }
>>
>> +static u32 gen12_oa_hw_head_read(struct i915_perf_stream *stream)
>> +{
>> +       struct intel_uncore *uncore = stream->uncore;
>> +
>> +       return intel_uncore_read(uncore, GEN12_OAG_OAHEADPTR) &
>> +              GEN12_OAG_OAHEADPTR_MASK;
>> +}
>> +
>> +static u32 gen8_oa_hw_head_read(struct i915_perf_stream *stream)
>> +{
>> +       struct intel_uncore *uncore = stream->uncore;
>> +
>> +       return intel_uncore_read(uncore, GEN8_OAHEADPTR) &
>> +              GEN8_OAHEADPTR_MASK;
>> +}
>> +
>> +static u32 gen7_oa_hw_head_read(struct i915_perf_stream *stream)
>> +{
>> +       struct intel_uncore *uncore = stream->uncore;
>> +       u32 oastatus2 = intel_uncore_read(uncore, GEN7_OASTATUS2);
>> +
>> +       return oastatus2 & GEN7_OASTATUS2_HEAD_MASK;
>> +}
>> +
>>  /**
>>   * oa_buffer_check_unlocked - check for data and update tail ptr state
>>   * @stream: i915 stream instance
>> @@ -1328,6 +1353,7 @@ free_oa_buffer(struct i915_perf_stream *stream)
>>         i915_vma_unpin_and_release(&stream->oa_buffer.vma,
>>                                    I915_VMA_RELEASE_MAP);
>>
>> +       stream->oa_buffer.cpu_address = 0;
>>         stream->oa_buffer.vaddr = NULL;
>>  }
>>
>> @@ -1448,7 +1474,8 @@ static void gen8_init_oa_buffer(struct i915_perf_stream *stream)
>>          *  bit."
>>          */
>>         intel_uncore_write(uncore, GEN8_OABUFFER, gtt_offset |
>> -                  OABUFFER_SIZE_16M | GEN8_OABUFFER_MEM_SELECT_GGTT);
>> +                          OABUFFER_SIZE_16M | GEN8_OABUFFER_MEM_SELECT_GGTT |
>> +                          GEN7_OABUFFER_EDGE_TRIGGER);
>>         intel_uncore_write(uncore, GEN8_OATAILPTR, gtt_offset & GEN8_OATAILPTR_MASK);
>>
>>         /* Mark that we need updated tail pointers to read from... */
>> @@ -1501,7 +1528,8 @@ static void gen12_init_oa_buffer(struct i915_perf_stream *stream)
>>          *  bit."
>>          */
>>         intel_uncore_write(uncore, GEN12_OAG_OABUFFER, gtt_offset |
>> -                          OABUFFER_SIZE_16M | GEN8_OABUFFER_MEM_SELECT_GGTT);
>> +                          OABUFFER_SIZE_16M | GEN8_OABUFFER_MEM_SELECT_GGTT |
>> +                          GEN7_OABUFFER_EDGE_TRIGGER);
>>         intel_uncore_write(uncore, GEN12_OAG_OATAILPTR,
>>                            gtt_offset & GEN12_OAG_OATAILPTR_MASK);
>>
>> @@ -1562,6 +1590,7 @@ static int alloc_oa_buffer(struct i915_perf_stream *stream)
>>                 goto err_unref;
>>         }
>>         stream->oa_buffer.vma = vma;
>> +       stream->oa_buffer.cpu_address = 0;
>>
>>         stream->oa_buffer.vaddr =
>>                 i915_gem_object_pin_map(bo, I915_MAP_WB);
>> @@ -1584,6 +1613,52 @@ static int alloc_oa_buffer(struct i915_perf_stream *stream)
>>         return ret;
>>  }
>>
>> +static int map_oa_buffer(struct i915_perf_stream *stream)
>
>You have a per-client perf fd. mmap that, with a known offset for the OA
>buffer. That is make userspace call something along the lines of
>
>ctl = mmap(0, 4096, PROT_WRITE, MAP_PRIVATE, perf_fd, PERF_MMAP_CTL);
>
>oa = mmap(0, ctl->oa_size, PROT_READ, MAP_PRIVATE, perf_fd, ctl->oa_offset);
>
>[ctl can be an ioctl to retrieve the size/offset]
>
>At every available opportunity, it should be reiterated that the mmap is
>unfiltered information leaks, continual reminders.
>
>> +{
>> +       unsigned long address = 0;
>> +       const u64 size = OA_BUFFER_SIZE;
>> +       struct i915_vma *oabuffer_vma = stream->oa_buffer.vma;
>> +       struct drm_i915_gem_object *oabuffer_obj = oabuffer_vma->obj;
>> +       struct mm_struct *mm = current->mm;
>> +       struct vm_area_struct *vma = NULL;
>> +
>> +       if(stream->oa_buffer.cpu_address != 0)
>> +               return 0;
>> +
>> +       if (!boot_cpu_has(X86_FEATURE_PAT))
>> +               return -ENODEV;
>> +
>> +       if (!oabuffer_obj || !oabuffer_vma)
>> +               return -ENOENT;
>> +
>> +       if (!oabuffer_obj->base.filp)
>> +               return -ENXIO;
>> +
>> +       if (range_overflows_t(u64, 0, size, oabuffer_obj->base.size))
>> +               return -EINVAL;
>> +
>> +       address = vm_mmap(oabuffer_obj->base.filp, 0, size,
>> +                         PROT_READ, MAP_SHARED, 0);
>> +
>> +       if (IS_ERR_VALUE(address))
>> +               return address;
>> +
>> +       if (mmap_write_lock_killable(mm))
>> +               return -EINTR;
>> +
>> +       vma = find_vma(mm, address);
>> +       if (vma) {
>> +               vma->vm_page_prot =
>> +                       pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
>
>This is dangerous code to copy! Let's avoid repeating mistakes of the
>past.
>
>> +
>> +               stream->oa_buffer.cpu_address = address;
>> +       }
>> +
>> +       mmap_write_unlock(mm);
>> +
>> +       return vma ? 0 : -ENOMEM;
>> +}
>
>> +/**
>> + * i915_perf_get_oa_buffer_info_locked - Properties of the i915-perf OA buffer
>> + * @arg: pointer to oa buffer info populated by this function.
>> + */
>> +static int i915_perf_get_oa_buffer_info_locked(struct i915_perf_stream *stream,
>> +                                              unsigned long arg)
>> +{
>> +       struct drm_i915_perf_oa_buffer_info info;
>> +       void __user *output = (void __user *) arg;
>> +       int ret;
>> +
>> +       if (!output)
>> +               return -EINVAL;
>> +
>> +       memset(&info, 0, sizeof(info));
>> +
>> +       info.size = stream->oa_buffer.vma->size;
>
>> +       info.head = stream->perf->ops.oa_hw_head_read(stream);
>> +       info.tail = stream->perf->ops.oa_hw_tail_read(stream);
>
>So this is a snapshot of the HW state, you expect this ioctl to be
>called frequently?

yes. this information is queried when the user wants to capture a window 
of events.

>
>> +       info.gpu_address = i915_ggtt_offset(stream->oa_buffer.vma);
>
>I'm not happy leaking this. I presume you have a very good reason. Add a
>comment to justify it, or better don't include it.

I don't see a use other than to calculate offset for the head/tail from 
cpu_address. Will remove and return offset directly in head/tail above.

>-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [Intel-gfx] [PATCH 1/1] drm/i915/perf: Map OA buffer to user space
  2020-07-14 20:10     ` Umesh Nerlige Ramappa
@ 2020-07-14 20:24       ` Chris Wilson
  0 siblings, 0 replies; 13+ messages in thread
From: Chris Wilson @ 2020-07-14 20:24 UTC (permalink / raw)
  To: Umesh Nerlige Ramappa; +Cc: intel-gfx

Quoting Umesh Nerlige Ramappa (2020-07-14 21:10:02)
> On Tue, Jul 14, 2020 at 12:28:15PM +0100, Chris Wilson wrote:
> >Quoting Umesh Nerlige Ramappa (2020-07-14 08:22:39)
> >> From: Piotr Maciejewski <piotr.maciejewski@intel.com>
> >>
> >> i915 used to support time based sampling mode which is good for overall
> >> system monitoring, but is not enough for query mode used to measure a
> >> single draw call or dispatch. Gen9-Gen11 are using current i915 perf
> >> implementation for query, but Gen12+ requires a new approach based on
> >> triggered reports within oa buffer. In order to enable above feature
> >> two changes are required:
> >>
> >> 1. Whitelist update:
> >> - enable triggered reports within oa buffer
> >> - reading oa buffer head/tail/status information
> >> - reading gpu ticks counter.
> >>
> >> 2. Map oa buffer at umd driver level to solve below constraints related
> >>    to time based sampling interface:
> >> - longer time to access reports collected by oa buffer
> >
> >If you aren't talking about a few 10us, then something else is wrong.
> >
> >> - slow oa reports browsing since oa buffer size is large
> >
> >Nothing changes on the surface. That does not sound like inherent
> >inefficiencies. Since the same number of events will be generated and
> >need to be processed. You may argue that they are easier to process in
> >situ, and that the number of events dwarf L1 cache. An mmap interface
> >could eliminate one copy (and certainly a copy-to-user).
> >
> >> - missing oa report index, so query cannot browse report directly
> >
> >There's more to it than that otherwise you would have proposed an
> >extension to the event format.
> >
> >> - with direct access to oa buffer, query can extract other useful
> >>   reports like context switch information needed to calculate correct
> >>   performance counters values.
> >
> >Why would you not start with an unprivileged mediated mmapped buffer?
> >If the goal is to reduce sample latency by replacing read ioctls with a
> >mmap, that would seem to be an orthogonal step to exposing the raw OA
> >buffer. The inference would be that you do want to extract extra details
> >from the OA that are not being catered for. That's perfectly fine, our
> >goal is to _safely_ expose HW and not get in the way of userspace. But
> >if that was the intent, it should not appear to be an afterthought.
> >[i.e. that mmap should be inherently faster for accessing a large ring
> >of data is much less important than discussing the safety concerns of
> >letting userspace have direct control/access of OA.]
> 
> fwiu
> 
> The goal is mainly being able to quickly view reports of interest within 
> a window in the OA buffer. The start and end of the window are defined 
> by triggered OA reports using the OAREPORTTRIG registers along with some 
> counters that act as markers (A18 - A20).

Ok, that was not apparent from reading. Could you put that usecase to
the front; aiui userspace can read/define a region of interest within
the mmap that correspond to the oareporttrig. In particular describe how
userspace either triggers or responds to the OA and finds the roi.
[Psuedocode for the win.]

That's useful to know why we need this, and some details into the how to
flesh out the interface.

> In initial implementation, the user was trying to maintain a copy of the 
> oa buffer that was being built by reading OA reports from the perf_fd 
> read interface. Using a new ioctl interface that returned HW TAIL, the 
> user would try to index into the window in this buffer. However, the 
> mapping was not 1:1 because perf_fd read would drop reports that are 
> zeroed out or invalid. Added to that, the user buffer was limited by 
> size. This made mapping things to an intermediate user buffer more messy 
> as it would require more information from the driver to do so.
> 
> Hence, the decision was to map the OA buffer to umd provided (1) that it 
> is accessed from a privileged user and (2) OA report filtering is not 
> used. These 2 conditions would satisfy the safety criteria that the 
> perf_fd interface addresses.

It is refreshing to keep the paranoid perf discussions to the front. We
still need to think about whether any new method exposes more
information that was previously available, and if that is safe even for
a privileged user. It's just a matter of writing down the details and
making sure we are happy we've hooked up the protections. And in the
code, we want comments as to why.

It's just a matter of recording the details for due diligence. And you
never know, we might just uncover something horrid.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [Intel-gfx] [PATCH 1/1] drm/i915/perf: Map OA buffer to user space
  2020-07-14  7:22 ` [Intel-gfx] [PATCH 1/1] drm/i915/perf: Map OA buffer to user space Umesh Nerlige Ramappa
  2020-07-14  7:44   ` Umesh Nerlige Ramappa
  2020-07-14 11:28   ` Chris Wilson
@ 2020-07-16 15:32   ` Lionel Landwerlin
  2020-07-16 18:06     ` Umesh Nerlige Ramappa
  2 siblings, 1 reply; 13+ messages in thread
From: Lionel Landwerlin @ 2020-07-16 15:32 UTC (permalink / raw)
  To: Umesh Nerlige Ramappa, intel-gfx

On 14/07/2020 10:22, Umesh Nerlige Ramappa wrote:
> From: Piotr Maciejewski <piotr.maciejewski@intel.com>
>
> i915 used to support time based sampling mode which is good for overall
> system monitoring, but is not enough for query mode used to measure a
> single draw call or dispatch. Gen9-Gen11 are using current i915 perf
> implementation for query, but Gen12+ requires a new approach based on
> triggered reports within oa buffer. In order to enable above feature
> two changes are required:
>
> 1. Whitelist update:
> - enable triggered reports within oa buffer
> - reading oa buffer head/tail/status information
> - reading gpu ticks counter.

I would break the patch into feature sets :

     - 1 patch to whitelist the trigger registers

     - 1 patch to whitelist OA counters & tail/head/... registers

     - 1 patch for mmap feature


Here are some IGT tests for the trigger feature : 
https://patchwork.freedesktop.org/series/75311/


We should verify that when not i915-perf is not active the whitelisted 
OA counters are not incrementing.

Otherwise we're starting to leak information that was not available before.


>
> 2. Map oa buffer at umd driver level to solve below constraints related
>     to time based sampling interface:
> - longer time to access reports collected by oa buffer
> - slow oa reports browsing since oa buffer size is large
> - missing oa report index, so query cannot browse report directly
> - with direct access to oa buffer, query can extract other useful
>    reports like context switch information needed to calculate correct
>    performance counters values.
>
> Signed-off-by: Piotr Maciejewski <piotr.maciejewski@intel.com>
> ---
>   drivers/gpu/drm/i915/gt/intel_workarounds.c |  54 ++++++++
>   drivers/gpu/drm/i915/i915_perf.c            | 130 +++++++++++++++++++-
>   drivers/gpu/drm/i915/i915_perf_types.h      |  13 ++
>   drivers/gpu/drm/i915/i915_reg.h             |  14 +++
>   include/uapi/drm/i915_drm.h                 |  19 +++
>   5 files changed, 227 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds.c b/drivers/gpu/drm/i915/gt/intel_workarounds.c
> index 5726cd0a37e0..cf89928fc3a5 100644
> --- a/drivers/gpu/drm/i915/gt/intel_workarounds.c
> +++ b/drivers/gpu/drm/i915/gt/intel_workarounds.c
> @@ -1365,6 +1365,48 @@ whitelist_reg(struct i915_wa_list *wal, i915_reg_t reg)
>   	whitelist_reg_ext(wal, reg, RING_FORCE_TO_NONPRIV_ACCESS_RW);
>   }
>   
> +static void gen9_whitelist_build_performance_counters(struct i915_wa_list *w)
> +{
> +	/* OA buffer trigger report 2/6 used by performance query */
> +	whitelist_reg(w, OAREPORTTRIG2);
> +	whitelist_reg(w, OAREPORTTRIG6);
> +
> +	/* Performance counters A18-20 used by tbs marker query */
> +	whitelist_reg_ext(w, OA_PERF_COUNTER_A18,
> +			  RING_FORCE_TO_NONPRIV_ACCESS_RW |
> +			  RING_FORCE_TO_NONPRIV_RANGE_16);

I would actually whitelist the entire set of OA_PERF_COUNTER (0-36).

I would like to make use of them in Mesa for the Vulkan driver.

> +
> +	/* Read access to gpu ticks */
> +	whitelist_reg_ext(w, GEN8_GPU_TICKS,
> +			  RING_FORCE_TO_NONPRIV_ACCESS_RD);
> +
> +	/* Read access to: oa status, head, tail, buffer settings */
> +	whitelist_reg_ext(w, GEN8_OASTATUS,
> +			  RING_FORCE_TO_NONPRIV_ACCESS_RD |
> +			  RING_FORCE_TO_NONPRIV_RANGE_4);
> +}
> +
> +static void gen12_whitelist_build_performance_counters(struct i915_wa_list *w)
> +{
> +	/* OA buffer trigger report 2/6 used by performance query */
> +	whitelist_reg(w, GEN12_OAG_OAREPORTTRIG2);
> +	whitelist_reg(w, GEN12_OAG_OAREPORTTRIG6);
> +
> +	/* Performance counters A18-20 used by tbs marker query */
> +	whitelist_reg_ext(w, GEN12_OAG_PERF_COUNTER_A18,
> +			  RING_FORCE_TO_NONPRIV_ACCESS_RW |
> +			  RING_FORCE_TO_NONPRIV_RANGE_16);
> +
> +	/* Read access to gpu ticks */
> +	whitelist_reg_ext(w, GEN12_OAG_GPU_TICKS,
> +			  RING_FORCE_TO_NONPRIV_ACCESS_RD);
> +
> +	/* Read access to: oa status, head, tail, buffer settings */
> +	whitelist_reg_ext(w, GEN12_OAG_OASTATUS,
> +			  RING_FORCE_TO_NONPRIV_ACCESS_RD |
> +			  RING_FORCE_TO_NONPRIV_RANGE_4);
> +}
> +

> ...

>   
>   struct i915_perf {
> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
> index 86a23ced051b..2e3d264339e0 100644
> --- a/drivers/gpu/drm/i915/i915_reg.h
> +++ b/drivers/gpu/drm/i915/i915_reg.h
> @@ -675,6 +675,7 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
>   #define  GEN7_OASTATUS2_HEAD_MASK           0xffffffc0
>   #define  GEN7_OASTATUS2_MEM_SELECT_GGTT     (1 << 0) /* 0: PPGTT, 1: GGTT */
>   
> +#define GEN8_GPU_TICKS _MMIO(0x2910)
>   #define GEN8_OASTATUS _MMIO(0x2b08)
>   #define  GEN8_OASTATUS_OVERRUN_STATUS	    (1 << 3)
>   #define  GEN8_OASTATUS_COUNTER_OVERFLOW     (1 << 2)
> @@ -696,6 +697,7 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
>   #define OABUFFER_SIZE_16M   (7 << 3)
>   
>   #define GEN12_OA_TLB_INV_CR _MMIO(0xceec)
> +#define GEN12_SQCNT1 _MMIO(0x8718)
>   
>   /* Gen12 OAR unit */
>   #define GEN12_OAR_OACONTROL _MMIO(0x2960)
> @@ -731,6 +733,7 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
>   #define  GEN12_OAG_OA_DEBUG_DISABLE_GO_1_0_REPORTS     (1 << 2)
>   #define  GEN12_OAG_OA_DEBUG_DISABLE_CTX_SWITCH_REPORTS (1 << 1)
>   
> +#define GEN12_OAG_GPU_TICKS _MMIO(0xda90)
>   #define GEN12_OAG_OASTATUS _MMIO(0xdafc)
>   #define  GEN12_OAG_OASTATUS_COUNTER_OVERFLOW (1 << 2)
>   #define  GEN12_OAG_OASTATUS_BUFFER_OVERFLOW  (1 << 1)
> @@ -972,6 +975,17 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
>   #define OAREPORTTRIG8_NOA_SELECT_6_SHIFT    24
>   #define OAREPORTTRIG8_NOA_SELECT_7_SHIFT    28
>   
> +/* Performance counters registers */
> +#define OA_PERF_COUNTER_A18   _MMIO(0x2890)
> +#define OA_PERF_COUNTER_A19   _MMIO(0x2898)
> +#define OA_PERF_COUNTER_A20   _MMIO(0x28A0)
Maybe turn this into OA_PERF_COUNTER(idx) _MMIO(0x2800 + idx * 8)
> +
> +/* Gen12 Performance counters registers */
> +#define GEN12_OAG_PERF_COUNTER_A16   _MMIO(0xDA00)
> +#define GEN12_OAG_PERF_COUNTER_A18   _MMIO(0xDA10)
> +#define GEN12_OAG_PERF_COUNTER_A19   _MMIO(0xDA18)
> +#define GEN12_OAG_PERF_COUNTER_A20   _MMIO(0xDA20)
Same here
> +
>   /* Same layout as OASTARTTRIGX */
>   #define GEN12_OAG_OASTARTTRIG1 _MMIO(0xd900)
>   #define GEN12_OAG_OASTARTTRIG2 _MMIO(0xd904)
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index 14b67cd6b54b..62b88c0123c8 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -2048,6 +2048,25 @@ struct drm_i915_perf_open_param {z
>    */
>   #define I915_PERF_IOCTL_CONFIG	_IO('i', 0x2)
>   
> +/**
> + * Returns OA buffer properties.
> + *
> + * This ioctl is available in perf revision 6.
> + */
> +#define I915_PERF_IOCTL_GET_OA_BUFFER_INFO _IO('i', 0x3)
> +
> +/**
> + * OA buffer information structure.
> + */
> +struct drm_i915_perf_oa_buffer_info {
> +	__u32 size;
> +	__u32 head;
> +	__u32 tail;
> +	__u32 gpu_address;
> +	__u64 cpu_address;
> +	__u64 reserved[4];
> +};
> +
>   /**
>    * Common to all i915 perf records
>    */

Thanks a lot,


-Lionel

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [Intel-gfx] [PATCH 1/1] drm/i915/perf: Map OA buffer to user space
  2020-07-16 15:32   ` Lionel Landwerlin
@ 2020-07-16 18:06     ` Umesh Nerlige Ramappa
  2020-07-16 18:44       ` Lionel Landwerlin
  0 siblings, 1 reply; 13+ messages in thread
From: Umesh Nerlige Ramappa @ 2020-07-16 18:06 UTC (permalink / raw)
  To: Lionel Landwerlin; +Cc: intel-gfx

On Thu, Jul 16, 2020 at 06:32:10PM +0300, Lionel Landwerlin wrote:
>On 14/07/2020 10:22, Umesh Nerlige Ramappa wrote:
>>From: Piotr Maciejewski <piotr.maciejewski@intel.com>
>>
>>i915 used to support time based sampling mode which is good for overall
>>system monitoring, but is not enough for query mode used to measure a
>>single draw call or dispatch. Gen9-Gen11 are using current i915 perf
>>implementation for query, but Gen12+ requires a new approach based on
>>triggered reports within oa buffer. In order to enable above feature
>>two changes are required:
>>
>>1. Whitelist update:
>>- enable triggered reports within oa buffer
>>- reading oa buffer head/tail/status information
>>- reading gpu ticks counter.
>
>I would break the patch into feature sets :
>
>    - 1 patch to whitelist the trigger registers
>
>    - 1 patch to whitelist OA counters & tail/head/... registers
>
>    - 1 patch for mmap feature
>
>
>Here are some IGT tests for the trigger feature : 
>https://patchwork.freedesktop.org/series/75311/
>
>
>We should verify that when not i915-perf is not active the whitelisted 
>OA counters are not incrementing.
>
>Otherwise we're starting to leak information that was not available before.
>
>
>>
>>2. Map oa buffer at umd driver level to solve below constraints related
>>    to time based sampling interface:
>>- longer time to access reports collected by oa buffer
>>- slow oa reports browsing since oa buffer size is large
>>- missing oa report index, so query cannot browse report directly
>>- with direct access to oa buffer, query can extract other useful
>>   reports like context switch information needed to calculate correct
>>   performance counters values.
>>
>>Signed-off-by: Piotr Maciejewski <piotr.maciejewski@intel.com>
>>---
>>  drivers/gpu/drm/i915/gt/intel_workarounds.c |  54 ++++++++
>>  drivers/gpu/drm/i915/i915_perf.c            | 130 +++++++++++++++++++-
>>  drivers/gpu/drm/i915/i915_perf_types.h      |  13 ++
>>  drivers/gpu/drm/i915/i915_reg.h             |  14 +++
>>  include/uapi/drm/i915_drm.h                 |  19 +++
>>  5 files changed, 227 insertions(+), 3 deletions(-)
>>
>>diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds.c b/drivers/gpu/drm/i915/gt/intel_workarounds.c
>>index 5726cd0a37e0..cf89928fc3a5 100644
>>--- a/drivers/gpu/drm/i915/gt/intel_workarounds.c
>>+++ b/drivers/gpu/drm/i915/gt/intel_workarounds.c
>>@@ -1365,6 +1365,48 @@ whitelist_reg(struct i915_wa_list *wal, i915_reg_t reg)
>>  	whitelist_reg_ext(wal, reg, RING_FORCE_TO_NONPRIV_ACCESS_RW);
>>  }
>>+static void gen9_whitelist_build_performance_counters(struct i915_wa_list *w)
>>+{
>>+	/* OA buffer trigger report 2/6 used by performance query */
>>+	whitelist_reg(w, OAREPORTTRIG2);
>>+	whitelist_reg(w, OAREPORTTRIG6);
>>+
>>+	/* Performance counters A18-20 used by tbs marker query */
>>+	whitelist_reg_ext(w, OA_PERF_COUNTER_A18,
>>+			  RING_FORCE_TO_NONPRIV_ACCESS_RW |
>>+			  RING_FORCE_TO_NONPRIV_RANGE_16);
>
>I would actually whitelist the entire set of OA_PERF_COUNTER (0-36).

Not sure about that. I thought the only reason we chose to whitelist 
A18-A20 was because these counters did not count anything. Whitelisting 
all A counters would just mean that an unprivileged user can keep 
sampling them all the time from a command buffer. Right?

Thanks,
Umesh

>
>I would like to make use of them in Mesa for the Vulkan driver.
>
>>+
>>+	/* Read access to gpu ticks */
>>+	whitelist_reg_ext(w, GEN8_GPU_TICKS,
>>+			  RING_FORCE_TO_NONPRIV_ACCESS_RD);
>>+
>>+	/* Read access to: oa status, head, tail, buffer settings */
>>+	whitelist_reg_ext(w, GEN8_OASTATUS,
>>+			  RING_FORCE_TO_NONPRIV_ACCESS_RD |
>>+			  RING_FORCE_TO_NONPRIV_RANGE_4);
>>+}
>>+
>>+static void gen12_whitelist_build_performance_counters(struct i915_wa_list *w)
>>+{
>>+	/* OA buffer trigger report 2/6 used by performance query */
>>+	whitelist_reg(w, GEN12_OAG_OAREPORTTRIG2);
>>+	whitelist_reg(w, GEN12_OAG_OAREPORTTRIG6);
>>+
>>+	/* Performance counters A18-20 used by tbs marker query */
>>+	whitelist_reg_ext(w, GEN12_OAG_PERF_COUNTER_A18,
>>+			  RING_FORCE_TO_NONPRIV_ACCESS_RW |
>>+			  RING_FORCE_TO_NONPRIV_RANGE_16);
>>+
>>+	/* Read access to gpu ticks */
>>+	whitelist_reg_ext(w, GEN12_OAG_GPU_TICKS,
>>+			  RING_FORCE_TO_NONPRIV_ACCESS_RD);
>>+
>>+	/* Read access to: oa status, head, tail, buffer settings */
>>+	whitelist_reg_ext(w, GEN12_OAG_OASTATUS,
>>+			  RING_FORCE_TO_NONPRIV_ACCESS_RD |
>>+			  RING_FORCE_TO_NONPRIV_RANGE_4);
>>+}
>>+
>
>>...
>
>>  struct i915_perf {
>>diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
>>index 86a23ced051b..2e3d264339e0 100644
>>--- a/drivers/gpu/drm/i915/i915_reg.h
>>+++ b/drivers/gpu/drm/i915/i915_reg.h
>>@@ -675,6 +675,7 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
>>  #define  GEN7_OASTATUS2_HEAD_MASK           0xffffffc0
>>  #define  GEN7_OASTATUS2_MEM_SELECT_GGTT     (1 << 0) /* 0: PPGTT, 1: GGTT */
>>+#define GEN8_GPU_TICKS _MMIO(0x2910)
>>  #define GEN8_OASTATUS _MMIO(0x2b08)
>>  #define  GEN8_OASTATUS_OVERRUN_STATUS	    (1 << 3)
>>  #define  GEN8_OASTATUS_COUNTER_OVERFLOW     (1 << 2)
>>@@ -696,6 +697,7 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
>>  #define OABUFFER_SIZE_16M   (7 << 3)
>>  #define GEN12_OA_TLB_INV_CR _MMIO(0xceec)
>>+#define GEN12_SQCNT1 _MMIO(0x8718)
>>  /* Gen12 OAR unit */
>>  #define GEN12_OAR_OACONTROL _MMIO(0x2960)
>>@@ -731,6 +733,7 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
>>  #define  GEN12_OAG_OA_DEBUG_DISABLE_GO_1_0_REPORTS     (1 << 2)
>>  #define  GEN12_OAG_OA_DEBUG_DISABLE_CTX_SWITCH_REPORTS (1 << 1)
>>+#define GEN12_OAG_GPU_TICKS _MMIO(0xda90)
>>  #define GEN12_OAG_OASTATUS _MMIO(0xdafc)
>>  #define  GEN12_OAG_OASTATUS_COUNTER_OVERFLOW (1 << 2)
>>  #define  GEN12_OAG_OASTATUS_BUFFER_OVERFLOW  (1 << 1)
>>@@ -972,6 +975,17 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
>>  #define OAREPORTTRIG8_NOA_SELECT_6_SHIFT    24
>>  #define OAREPORTTRIG8_NOA_SELECT_7_SHIFT    28
>>+/* Performance counters registers */
>>+#define OA_PERF_COUNTER_A18   _MMIO(0x2890)
>>+#define OA_PERF_COUNTER_A19   _MMIO(0x2898)
>>+#define OA_PERF_COUNTER_A20   _MMIO(0x28A0)
>Maybe turn this into OA_PERF_COUNTER(idx) _MMIO(0x2800 + idx * 8)
>>+
>>+/* Gen12 Performance counters registers */
>>+#define GEN12_OAG_PERF_COUNTER_A16   _MMIO(0xDA00)
>>+#define GEN12_OAG_PERF_COUNTER_A18   _MMIO(0xDA10)
>>+#define GEN12_OAG_PERF_COUNTER_A19   _MMIO(0xDA18)
>>+#define GEN12_OAG_PERF_COUNTER_A20   _MMIO(0xDA20)
>Same here
>>+
>>  /* Same layout as OASTARTTRIGX */
>>  #define GEN12_OAG_OASTARTTRIG1 _MMIO(0xd900)
>>  #define GEN12_OAG_OASTARTTRIG2 _MMIO(0xd904)
>>diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
>>index 14b67cd6b54b..62b88c0123c8 100644
>>--- a/include/uapi/drm/i915_drm.h
>>+++ b/include/uapi/drm/i915_drm.h
>>@@ -2048,6 +2048,25 @@ struct drm_i915_perf_open_param {z
>>   */
>>  #define I915_PERF_IOCTL_CONFIG	_IO('i', 0x2)
>>+/**
>>+ * Returns OA buffer properties.
>>+ *
>>+ * This ioctl is available in perf revision 6.
>>+ */
>>+#define I915_PERF_IOCTL_GET_OA_BUFFER_INFO _IO('i', 0x3)
>>+
>>+/**
>>+ * OA buffer information structure.
>>+ */
>>+struct drm_i915_perf_oa_buffer_info {
>>+	__u32 size;
>>+	__u32 head;
>>+	__u32 tail;
>>+	__u32 gpu_address;
>>+	__u64 cpu_address;
>>+	__u64 reserved[4];
>>+};
>>+
>>  /**
>>   * Common to all i915 perf records
>>   */
>
>Thanks a lot,
>
>
>-Lionel
>
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [Intel-gfx] [PATCH 1/1] drm/i915/perf: Map OA buffer to user space
  2020-07-16 18:06     ` Umesh Nerlige Ramappa
@ 2020-07-16 18:44       ` Lionel Landwerlin
  2020-07-16 19:28         ` Umesh Nerlige Ramappa
  0 siblings, 1 reply; 13+ messages in thread
From: Lionel Landwerlin @ 2020-07-16 18:44 UTC (permalink / raw)
  To: Umesh Nerlige Ramappa; +Cc: intel-gfx

On 16/07/2020 21:06, Umesh Nerlige Ramappa wrote:
> On Thu, Jul 16, 2020 at 06:32:10PM +0300, Lionel Landwerlin wrote:
>> On 14/07/2020 10:22, Umesh Nerlige Ramappa wrote:
>>> From: Piotr Maciejewski <piotr.maciejewski@intel.com>
>>>
>>> i915 used to support time based sampling mode which is good for overall
>>> system monitoring, but is not enough for query mode used to measure a
>>> single draw call or dispatch. Gen9-Gen11 are using current i915 perf
>>> implementation for query, but Gen12+ requires a new approach based on
>>> triggered reports within oa buffer. In order to enable above feature
>>> two changes are required:
>>>
>>> 1. Whitelist update:
>>> - enable triggered reports within oa buffer
>>> - reading oa buffer head/tail/status information
>>> - reading gpu ticks counter.
>>
>> I would break the patch into feature sets :
>>
>>     - 1 patch to whitelist the trigger registers
>>
>>     - 1 patch to whitelist OA counters & tail/head/... registers
>>
>>     - 1 patch for mmap feature
>>
>>
>> Here are some IGT tests for the trigger feature : 
>> https://patchwork.freedesktop.org/series/75311/
>>
>>
>> We should verify that when not i915-perf is not active the 
>> whitelisted OA counters are not incrementing.
>>
>> Otherwise we're starting to leak information that was not available 
>> before.
>>
>>
>>>
>>> 2. Map oa buffer at umd driver level to solve below constraints related
>>>    to time based sampling interface:
>>> - longer time to access reports collected by oa buffer
>>> - slow oa reports browsing since oa buffer size is large
>>> - missing oa report index, so query cannot browse report directly
>>> - with direct access to oa buffer, query can extract other useful
>>>   reports like context switch information needed to calculate correct
>>>   performance counters values.
>>>
>>> Signed-off-by: Piotr Maciejewski <piotr.maciejewski@intel.com>
>>> ---
>>>  drivers/gpu/drm/i915/gt/intel_workarounds.c |  54 ++++++++
>>>  drivers/gpu/drm/i915/i915_perf.c            | 130 +++++++++++++++++++-
>>>  drivers/gpu/drm/i915/i915_perf_types.h      |  13 ++
>>>  drivers/gpu/drm/i915/i915_reg.h             |  14 +++
>>>  include/uapi/drm/i915_drm.h                 |  19 +++
>>>  5 files changed, 227 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds.c 
>>> b/drivers/gpu/drm/i915/gt/intel_workarounds.c
>>> index 5726cd0a37e0..cf89928fc3a5 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_workarounds.c
>>> +++ b/drivers/gpu/drm/i915/gt/intel_workarounds.c
>>> @@ -1365,6 +1365,48 @@ whitelist_reg(struct i915_wa_list *wal, 
>>> i915_reg_t reg)
>>>      whitelist_reg_ext(wal, reg, RING_FORCE_TO_NONPRIV_ACCESS_RW);
>>>  }
>>> +static void gen9_whitelist_build_performance_counters(struct 
>>> i915_wa_list *w)
>>> +{
>>> +    /* OA buffer trigger report 2/6 used by performance query */
>>> +    whitelist_reg(w, OAREPORTTRIG2);
>>> +    whitelist_reg(w, OAREPORTTRIG6);
>>> +
>>> +    /* Performance counters A18-20 used by tbs marker query */
>>> +    whitelist_reg_ext(w, OA_PERF_COUNTER_A18,
>>> +              RING_FORCE_TO_NONPRIV_ACCESS_RW |
>>> +              RING_FORCE_TO_NONPRIV_RANGE_16);
>>
>> I would actually whitelist the entire set of OA_PERF_COUNTER (0-36).
>
> Not sure about that. I thought the only reason we chose to whitelist 
> A18-A20 was because these counters did not count anything. 
> Whitelisting all A counters would just mean that an unprivileged user 
> can keep sampling them all the time from a command buffer. Right?


A7-20 are flex EU counters, they can be programmed to count particular 
events.

We just happen to not program them all.


Sure an unprivileged user could sample them, but it can already sample 
them through MI_RPC.


My reason for whitelisting them is that I can sample them without having 
to look at the OA buffer since on Gen12+ MI_RPC sources values from OAR 
which doesn't have the same values as OAG for the counters.


-Lionel


>
> Thanks,
> Umesh
>
>>
>> I would like to make use of them in Mesa for the Vulkan driver.
>>
>>> +
>>> +    /* Read access to gpu ticks */
>>> +    whitelist_reg_ext(w, GEN8_GPU_TICKS,
>>> +              RING_FORCE_TO_NONPRIV_ACCESS_RD);
>>> +
>>> +    /* Read access to: oa status, head, tail, buffer settings */
>>> +    whitelist_reg_ext(w, GEN8_OASTATUS,
>>> +              RING_FORCE_TO_NONPRIV_ACCESS_RD |
>>> +              RING_FORCE_TO_NONPRIV_RANGE_4);
>>> +}
>>> +
>>> +static void gen12_whitelist_build_performance_counters(struct 
>>> i915_wa_list *w)
>>> +{
>>> +    /* OA buffer trigger report 2/6 used by performance query */
>>> +    whitelist_reg(w, GEN12_OAG_OAREPORTTRIG2);
>>> +    whitelist_reg(w, GEN12_OAG_OAREPORTTRIG6);
>>> +
>>> +    /* Performance counters A18-20 used by tbs marker query */
>>> +    whitelist_reg_ext(w, GEN12_OAG_PERF_COUNTER_A18,
>>> +              RING_FORCE_TO_NONPRIV_ACCESS_RW |
>>> +              RING_FORCE_TO_NONPRIV_RANGE_16);
>>> +
>>> +    /* Read access to gpu ticks */
>>> +    whitelist_reg_ext(w, GEN12_OAG_GPU_TICKS,
>>> +              RING_FORCE_TO_NONPRIV_ACCESS_RD);
>>> +
>>> +    /* Read access to: oa status, head, tail, buffer settings */
>>> +    whitelist_reg_ext(w, GEN12_OAG_OASTATUS,
>>> +              RING_FORCE_TO_NONPRIV_ACCESS_RD |
>>> +              RING_FORCE_TO_NONPRIV_RANGE_4);
>>> +}
>>> +
>>
>>> ...
>>
>>>  struct i915_perf {
>>> diff --git a/drivers/gpu/drm/i915/i915_reg.h 
>>> b/drivers/gpu/drm/i915/i915_reg.h
>>> index 86a23ced051b..2e3d264339e0 100644
>>> --- a/drivers/gpu/drm/i915/i915_reg.h
>>> +++ b/drivers/gpu/drm/i915/i915_reg.h
>>> @@ -675,6 +675,7 @@ static inline bool 
>>> i915_mmio_reg_valid(i915_reg_t reg)
>>>  #define  GEN7_OASTATUS2_HEAD_MASK           0xffffffc0
>>>  #define  GEN7_OASTATUS2_MEM_SELECT_GGTT     (1 << 0) /* 0: PPGTT, 
>>> 1: GGTT */
>>> +#define GEN8_GPU_TICKS _MMIO(0x2910)
>>>  #define GEN8_OASTATUS _MMIO(0x2b08)
>>>  #define  GEN8_OASTATUS_OVERRUN_STATUS        (1 << 3)
>>>  #define  GEN8_OASTATUS_COUNTER_OVERFLOW     (1 << 2)
>>> @@ -696,6 +697,7 @@ static inline bool 
>>> i915_mmio_reg_valid(i915_reg_t reg)
>>>  #define OABUFFER_SIZE_16M   (7 << 3)
>>>  #define GEN12_OA_TLB_INV_CR _MMIO(0xceec)
>>> +#define GEN12_SQCNT1 _MMIO(0x8718)
>>>  /* Gen12 OAR unit */
>>>  #define GEN12_OAR_OACONTROL _MMIO(0x2960)
>>> @@ -731,6 +733,7 @@ static inline bool 
>>> i915_mmio_reg_valid(i915_reg_t reg)
>>>  #define  GEN12_OAG_OA_DEBUG_DISABLE_GO_1_0_REPORTS     (1 << 2)
>>>  #define  GEN12_OAG_OA_DEBUG_DISABLE_CTX_SWITCH_REPORTS (1 << 1)
>>> +#define GEN12_OAG_GPU_TICKS _MMIO(0xda90)
>>>  #define GEN12_OAG_OASTATUS _MMIO(0xdafc)
>>>  #define  GEN12_OAG_OASTATUS_COUNTER_OVERFLOW (1 << 2)
>>>  #define  GEN12_OAG_OASTATUS_BUFFER_OVERFLOW  (1 << 1)
>>> @@ -972,6 +975,17 @@ static inline bool 
>>> i915_mmio_reg_valid(i915_reg_t reg)
>>>  #define OAREPORTTRIG8_NOA_SELECT_6_SHIFT    24
>>>  #define OAREPORTTRIG8_NOA_SELECT_7_SHIFT    28
>>> +/* Performance counters registers */
>>> +#define OA_PERF_COUNTER_A18   _MMIO(0x2890)
>>> +#define OA_PERF_COUNTER_A19   _MMIO(0x2898)
>>> +#define OA_PERF_COUNTER_A20   _MMIO(0x28A0)
>> Maybe turn this into OA_PERF_COUNTER(idx) _MMIO(0x2800 + idx * 8)
>>> +
>>> +/* Gen12 Performance counters registers */
>>> +#define GEN12_OAG_PERF_COUNTER_A16   _MMIO(0xDA00)
>>> +#define GEN12_OAG_PERF_COUNTER_A18   _MMIO(0xDA10)
>>> +#define GEN12_OAG_PERF_COUNTER_A19   _MMIO(0xDA18)
>>> +#define GEN12_OAG_PERF_COUNTER_A20   _MMIO(0xDA20)
>> Same here
>>> +
>>>  /* Same layout as OASTARTTRIGX */
>>>  #define GEN12_OAG_OASTARTTRIG1 _MMIO(0xd900)
>>>  #define GEN12_OAG_OASTARTTRIG2 _MMIO(0xd904)
>>> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
>>> index 14b67cd6b54b..62b88c0123c8 100644
>>> --- a/include/uapi/drm/i915_drm.h
>>> +++ b/include/uapi/drm/i915_drm.h
>>> @@ -2048,6 +2048,25 @@ struct drm_i915_perf_open_param {z
>>>   */
>>>  #define I915_PERF_IOCTL_CONFIG    _IO('i', 0x2)
>>> +/**
>>> + * Returns OA buffer properties.
>>> + *
>>> + * This ioctl is available in perf revision 6.
>>> + */
>>> +#define I915_PERF_IOCTL_GET_OA_BUFFER_INFO _IO('i', 0x3)
>>> +
>>> +/**
>>> + * OA buffer information structure.
>>> + */
>>> +struct drm_i915_perf_oa_buffer_info {
>>> +    __u32 size;
>>> +    __u32 head;
>>> +    __u32 tail;
>>> +    __u32 gpu_address;
>>> +    __u64 cpu_address;
>>> +    __u64 reserved[4];
>>> +};
>>> +
>>>  /**
>>>   * Common to all i915 perf records
>>>   */
>>
>> Thanks a lot,
>>
>>
>> -Lionel
>>

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [Intel-gfx] [PATCH 1/1] drm/i915/perf: Map OA buffer to user space
  2020-07-16 18:44       ` Lionel Landwerlin
@ 2020-07-16 19:28         ` Umesh Nerlige Ramappa
  2020-07-17  0:28           ` Umesh Nerlige Ramappa
  0 siblings, 1 reply; 13+ messages in thread
From: Umesh Nerlige Ramappa @ 2020-07-16 19:28 UTC (permalink / raw)
  To: Lionel Landwerlin; +Cc: intel-gfx

On Thu, Jul 16, 2020 at 09:44:46PM +0300, Lionel Landwerlin wrote:
>On 16/07/2020 21:06, Umesh Nerlige Ramappa wrote:
>>On Thu, Jul 16, 2020 at 06:32:10PM +0300, Lionel Landwerlin wrote:
>>>On 14/07/2020 10:22, Umesh Nerlige Ramappa wrote:
>>>>From: Piotr Maciejewski <piotr.maciejewski@intel.com>
>>>>
>>>>i915 used to support time based sampling mode which is good for overall
>>>>system monitoring, but is not enough for query mode used to measure a
>>>>single draw call or dispatch. Gen9-Gen11 are using current i915 perf
>>>>implementation for query, but Gen12+ requires a new approach based on
>>>>triggered reports within oa buffer. In order to enable above feature
>>>>two changes are required:
>>>>
>>>>1. Whitelist update:
>>>>- enable triggered reports within oa buffer
>>>>- reading oa buffer head/tail/status information
>>>>- reading gpu ticks counter.
>>>
>>>I would break the patch into feature sets :
>>>
>>>    - 1 patch to whitelist the trigger registers
>>>
>>>    - 1 patch to whitelist OA counters & tail/head/... registers
>>>
>>>    - 1 patch for mmap feature
>>>
>>>
>>>Here are some IGT tests for the trigger feature : 
>>>https://patchwork.freedesktop.org/series/75311/
>>>
>>>
>>>We should verify that when not i915-perf is not active the 
>>>whitelisted OA counters are not incrementing.
>>>
>>>Otherwise we're starting to leak information that was not 
>>>available before.
>>>
>>>
>>>>
>>>>2. Map oa buffer at umd driver level to solve below constraints related
>>>>   to time based sampling interface:
>>>>- longer time to access reports collected by oa buffer
>>>>- slow oa reports browsing since oa buffer size is large
>>>>- missing oa report index, so query cannot browse report directly
>>>>- with direct access to oa buffer, query can extract other useful
>>>>  reports like context switch information needed to calculate correct
>>>>  performance counters values.
>>>>
>>>>Signed-off-by: Piotr Maciejewski <piotr.maciejewski@intel.com>
>>>>---
>>>> drivers/gpu/drm/i915/gt/intel_workarounds.c |  54 ++++++++
>>>> drivers/gpu/drm/i915/i915_perf.c            | 130 +++++++++++++++++++-
>>>> drivers/gpu/drm/i915/i915_perf_types.h      |  13 ++
>>>> drivers/gpu/drm/i915/i915_reg.h             |  14 +++
>>>> include/uapi/drm/i915_drm.h                 |  19 +++
>>>> 5 files changed, 227 insertions(+), 3 deletions(-)
>>>>
>>>>diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds.c 
>>>>b/drivers/gpu/drm/i915/gt/intel_workarounds.c
>>>>index 5726cd0a37e0..cf89928fc3a5 100644
>>>>--- a/drivers/gpu/drm/i915/gt/intel_workarounds.c
>>>>+++ b/drivers/gpu/drm/i915/gt/intel_workarounds.c
>>>>@@ -1365,6 +1365,48 @@ whitelist_reg(struct i915_wa_list *wal, 
>>>>i915_reg_t reg)
>>>>     whitelist_reg_ext(wal, reg, RING_FORCE_TO_NONPRIV_ACCESS_RW);
>>>> }
>>>>+static void gen9_whitelist_build_performance_counters(struct 
>>>>i915_wa_list *w)
>>>>+{
>>>>+    /* OA buffer trigger report 2/6 used by performance query */
>>>>+    whitelist_reg(w, OAREPORTTRIG2);
>>>>+    whitelist_reg(w, OAREPORTTRIG6);
>>>>+
>>>>+    /* Performance counters A18-20 used by tbs marker query */
>>>>+    whitelist_reg_ext(w, OA_PERF_COUNTER_A18,
>>>>+              RING_FORCE_TO_NONPRIV_ACCESS_RW |
>>>>+              RING_FORCE_TO_NONPRIV_RANGE_16);
>>>
>>>I would actually whitelist the entire set of OA_PERF_COUNTER (0-36).
>>
>>Not sure about that. I thought the only reason we chose to whitelist 
>>A18-A20 was because these counters did not count anything. 
>>Whitelisting all A counters would just mean that an unprivileged 
>>user can keep sampling them all the time from a command buffer. 
>>Right?
>
>
>A7-20 are flex EU counters, they can be programmed to count particular 
>events.
>
>We just happen to not program them all.
>
>
>Sure an unprivileged user could sample them, but it can already sample 
>them through MI_RPC.
>
>
>My reason for whitelisting them is that I can sample them without 
>having to look at the OA buffer since on Gen12+ MI_RPC sources values 
>from OAR which doesn't have the same values as OAG for the counters.
>

I see. gen12 broke an earlier use case due to OAR/OAG split. I will 
follow up on the range of registers you requested for Vulkan.

Thanks,
Umesh

>
>-Lionel
>
>
>>
>>Thanks,
>>Umesh
>>
>>>
>>>I would like to make use of them in Mesa for the Vulkan driver.
>>>
>>>>+
>>>>+    /* Read access to gpu ticks */
>>>>+    whitelist_reg_ext(w, GEN8_GPU_TICKS,
>>>>+              RING_FORCE_TO_NONPRIV_ACCESS_RD);
>>>>+
>>>>+    /* Read access to: oa status, head, tail, buffer settings */
>>>>+    whitelist_reg_ext(w, GEN8_OASTATUS,
>>>>+              RING_FORCE_TO_NONPRIV_ACCESS_RD |
>>>>+              RING_FORCE_TO_NONPRIV_RANGE_4);
>>>>+}
>>>>+
>>>>+static void gen12_whitelist_build_performance_counters(struct 
>>>>i915_wa_list *w)
>>>>+{
>>>>+    /* OA buffer trigger report 2/6 used by performance query */
>>>>+    whitelist_reg(w, GEN12_OAG_OAREPORTTRIG2);
>>>>+    whitelist_reg(w, GEN12_OAG_OAREPORTTRIG6);
>>>>+
>>>>+    /* Performance counters A18-20 used by tbs marker query */
>>>>+    whitelist_reg_ext(w, GEN12_OAG_PERF_COUNTER_A18,
>>>>+              RING_FORCE_TO_NONPRIV_ACCESS_RW |
>>>>+              RING_FORCE_TO_NONPRIV_RANGE_16);
>>>>+
>>>>+    /* Read access to gpu ticks */
>>>>+    whitelist_reg_ext(w, GEN12_OAG_GPU_TICKS,
>>>>+              RING_FORCE_TO_NONPRIV_ACCESS_RD);
>>>>+
>>>>+    /* Read access to: oa status, head, tail, buffer settings */
>>>>+    whitelist_reg_ext(w, GEN12_OAG_OASTATUS,
>>>>+              RING_FORCE_TO_NONPRIV_ACCESS_RD |
>>>>+              RING_FORCE_TO_NONPRIV_RANGE_4);
>>>>+}
>>>>+
>>>
>>>>...
>>>
>>>> struct i915_perf {
>>>>diff --git a/drivers/gpu/drm/i915/i915_reg.h 
>>>>b/drivers/gpu/drm/i915/i915_reg.h
>>>>index 86a23ced051b..2e3d264339e0 100644
>>>>--- a/drivers/gpu/drm/i915/i915_reg.h
>>>>+++ b/drivers/gpu/drm/i915/i915_reg.h
>>>>@@ -675,6 +675,7 @@ static inline bool 
>>>>i915_mmio_reg_valid(i915_reg_t reg)
>>>> #define  GEN7_OASTATUS2_HEAD_MASK           0xffffffc0
>>>> #define  GEN7_OASTATUS2_MEM_SELECT_GGTT     (1 << 0) /* 0: 
>>>>PPGTT, 1: GGTT */
>>>>+#define GEN8_GPU_TICKS _MMIO(0x2910)
>>>> #define GEN8_OASTATUS _MMIO(0x2b08)
>>>> #define  GEN8_OASTATUS_OVERRUN_STATUS        (1 << 3)
>>>> #define  GEN8_OASTATUS_COUNTER_OVERFLOW     (1 << 2)
>>>>@@ -696,6 +697,7 @@ static inline bool 
>>>>i915_mmio_reg_valid(i915_reg_t reg)
>>>> #define OABUFFER_SIZE_16M   (7 << 3)
>>>> #define GEN12_OA_TLB_INV_CR _MMIO(0xceec)
>>>>+#define GEN12_SQCNT1 _MMIO(0x8718)
>>>> /* Gen12 OAR unit */
>>>> #define GEN12_OAR_OACONTROL _MMIO(0x2960)
>>>>@@ -731,6 +733,7 @@ static inline bool 
>>>>i915_mmio_reg_valid(i915_reg_t reg)
>>>> #define  GEN12_OAG_OA_DEBUG_DISABLE_GO_1_0_REPORTS     (1 << 2)
>>>> #define  GEN12_OAG_OA_DEBUG_DISABLE_CTX_SWITCH_REPORTS (1 << 1)
>>>>+#define GEN12_OAG_GPU_TICKS _MMIO(0xda90)
>>>> #define GEN12_OAG_OASTATUS _MMIO(0xdafc)
>>>> #define  GEN12_OAG_OASTATUS_COUNTER_OVERFLOW (1 << 2)
>>>> #define  GEN12_OAG_OASTATUS_BUFFER_OVERFLOW  (1 << 1)
>>>>@@ -972,6 +975,17 @@ static inline bool 
>>>>i915_mmio_reg_valid(i915_reg_t reg)
>>>> #define OAREPORTTRIG8_NOA_SELECT_6_SHIFT    24
>>>> #define OAREPORTTRIG8_NOA_SELECT_7_SHIFT    28
>>>>+/* Performance counters registers */
>>>>+#define OA_PERF_COUNTER_A18   _MMIO(0x2890)
>>>>+#define OA_PERF_COUNTER_A19   _MMIO(0x2898)
>>>>+#define OA_PERF_COUNTER_A20   _MMIO(0x28A0)
>>>Maybe turn this into OA_PERF_COUNTER(idx) _MMIO(0x2800 + idx * 8)
>>>>+
>>>>+/* Gen12 Performance counters registers */
>>>>+#define GEN12_OAG_PERF_COUNTER_A16   _MMIO(0xDA00)
>>>>+#define GEN12_OAG_PERF_COUNTER_A18   _MMIO(0xDA10)
>>>>+#define GEN12_OAG_PERF_COUNTER_A19   _MMIO(0xDA18)
>>>>+#define GEN12_OAG_PERF_COUNTER_A20   _MMIO(0xDA20)
>>>Same here
>>>>+
>>>> /* Same layout as OASTARTTRIGX */
>>>> #define GEN12_OAG_OASTARTTRIG1 _MMIO(0xd900)
>>>> #define GEN12_OAG_OASTARTTRIG2 _MMIO(0xd904)
>>>>diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
>>>>index 14b67cd6b54b..62b88c0123c8 100644
>>>>--- a/include/uapi/drm/i915_drm.h
>>>>+++ b/include/uapi/drm/i915_drm.h
>>>>@@ -2048,6 +2048,25 @@ struct drm_i915_perf_open_param {z
>>>>  */
>>>> #define I915_PERF_IOCTL_CONFIG    _IO('i', 0x2)
>>>>+/**
>>>>+ * Returns OA buffer properties.
>>>>+ *
>>>>+ * This ioctl is available in perf revision 6.
>>>>+ */
>>>>+#define I915_PERF_IOCTL_GET_OA_BUFFER_INFO _IO('i', 0x3)
>>>>+
>>>>+/**
>>>>+ * OA buffer information structure.
>>>>+ */
>>>>+struct drm_i915_perf_oa_buffer_info {
>>>>+    __u32 size;
>>>>+    __u32 head;
>>>>+    __u32 tail;
>>>>+    __u32 gpu_address;
>>>>+    __u64 cpu_address;
>>>>+    __u64 reserved[4];
>>>>+};
>>>>+
>>>> /**
>>>>  * Common to all i915 perf records
>>>>  */
>>>
>>>Thanks a lot,
>>>
>>>
>>>-Lionel
>>>
>
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [Intel-gfx] [PATCH 1/1] drm/i915/perf: Map OA buffer to user space
  2020-07-16 19:28         ` Umesh Nerlige Ramappa
@ 2020-07-17  0:28           ` Umesh Nerlige Ramappa
  0 siblings, 0 replies; 13+ messages in thread
From: Umesh Nerlige Ramappa @ 2020-07-17  0:28 UTC (permalink / raw)
  To: Lionel Landwerlin; +Cc: intel-gfx

On Thu, Jul 16, 2020 at 12:28:47PM -0700, Umesh Nerlige Ramappa wrote:
>On Thu, Jul 16, 2020 at 09:44:46PM +0300, Lionel Landwerlin wrote:
>>On 16/07/2020 21:06, Umesh Nerlige Ramappa wrote:
>>>On Thu, Jul 16, 2020 at 06:32:10PM +0300, Lionel Landwerlin wrote:
>>>>On 14/07/2020 10:22, Umesh Nerlige Ramappa wrote:
>>>>>From: Piotr Maciejewski <piotr.maciejewski@intel.com>
>>>>>
>>>>>i915 used to support time based sampling mode which is good for overall
>>>>>system monitoring, but is not enough for query mode used to measure a
>>>>>single draw call or dispatch. Gen9-Gen11 are using current i915 perf
>>>>>implementation for query, but Gen12+ requires a new approach based on
>>>>>triggered reports within oa buffer. In order to enable above feature
>>>>>two changes are required:
>>>>>
>>>>>1. Whitelist update:
>>>>>- enable triggered reports within oa buffer
>>>>>- reading oa buffer head/tail/status information
>>>>>- reading gpu ticks counter.
>>>>
>>>>I would break the patch into feature sets :
>>>>
>>>>    - 1 patch to whitelist the trigger registers
>>>>
>>>>    - 1 patch to whitelist OA counters & tail/head/... registers
>>>>
>>>>    - 1 patch for mmap feature
>>>>
>>>>
>>>>Here are some IGT tests for the trigger feature : 
>>>>https://patchwork.freedesktop.org/series/75311/
>>>>
>>>>
>>>>We should verify that when not i915-perf is not active the 
>>>>whitelisted OA counters are not incrementing.
>>>>
>>>>Otherwise we're starting to leak information that was not 
>>>>available before.
>>>>
>>>>
>>>>>
>>>>>2. Map oa buffer at umd driver level to solve below constraints related
>>>>>   to time based sampling interface:
>>>>>- longer time to access reports collected by oa buffer
>>>>>- slow oa reports browsing since oa buffer size is large
>>>>>- missing oa report index, so query cannot browse report directly
>>>>>- with direct access to oa buffer, query can extract other useful
>>>>>  reports like context switch information needed to calculate correct
>>>>>  performance counters values.
>>>>>
>>>>>Signed-off-by: Piotr Maciejewski <piotr.maciejewski@intel.com>
>>>>>---
>>>>> drivers/gpu/drm/i915/gt/intel_workarounds.c |  54 ++++++++
>>>>> drivers/gpu/drm/i915/i915_perf.c            | 130 +++++++++++++++++++-
>>>>> drivers/gpu/drm/i915/i915_perf_types.h      |  13 ++
>>>>> drivers/gpu/drm/i915/i915_reg.h             |  14 +++
>>>>> include/uapi/drm/i915_drm.h                 |  19 +++
>>>>> 5 files changed, 227 insertions(+), 3 deletions(-)
>>>>>
>>>>>diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds.c 
>>>>>b/drivers/gpu/drm/i915/gt/intel_workarounds.c
>>>>>index 5726cd0a37e0..cf89928fc3a5 100644
>>>>>--- a/drivers/gpu/drm/i915/gt/intel_workarounds.c
>>>>>+++ b/drivers/gpu/drm/i915/gt/intel_workarounds.c
>>>>>@@ -1365,6 +1365,48 @@ whitelist_reg(struct i915_wa_list *wal, 
>>>>>i915_reg_t reg)
>>>>>     whitelist_reg_ext(wal, reg, RING_FORCE_TO_NONPRIV_ACCESS_RW);
>>>>> }
>>>>>+static void gen9_whitelist_build_performance_counters(struct 
>>>>>i915_wa_list *w)
>>>>>+{
>>>>>+    /* OA buffer trigger report 2/6 used by performance query */
>>>>>+    whitelist_reg(w, OAREPORTTRIG2);
>>>>>+    whitelist_reg(w, OAREPORTTRIG6);
>>>>>+
>>>>>+    /* Performance counters A18-20 used by tbs marker query */
>>>>>+    whitelist_reg_ext(w, OA_PERF_COUNTER_A18,
>>>>>+              RING_FORCE_TO_NONPRIV_ACCESS_RW |
>>>>>+              RING_FORCE_TO_NONPRIV_RANGE_16);
>>>>
>>>>I would actually whitelist the entire set of OA_PERF_COUNTER (0-36).
>>>
>>>Not sure about that. I thought the only reason we chose to 
>>>whitelist A18-A20 was because these counters did not count 
>>>anything. Whitelisting all A counters would just mean that an 
>>>unprivileged user can keep sampling them all the time from a 
>>>command buffer. Right?
>>
>>
>>A7-20 are flex EU counters, they can be programmed to count 
>>particular events.
>>
>>We just happen to not program them all.
>>
>>
>>Sure an unprivileged user could sample them, but it can already 
>>sample them through MI_RPC.
>>
>>
>>My reason for whitelisting them is that I can sample them without 
>>having to look at the OA buffer since on Gen12+ MI_RPC sources 
>>values from OAR which doesn't have the same values as OAG for the 
>>counters.
>>
>
>I see. gen12 broke an earlier use case due to OAR/OAG split. I will 
>follow up on the range of registers you requested for Vulkan.
>
>Thanks,
>Umesh
>
>>
>>-Lionel
>>
>>
>>>
>>>Thanks,
>>>Umesh
>>>
>>>>
>>>>I would like to make use of them in Mesa for the Vulkan driver.
>>>>
>>>>>+
>>>>>+    /* Read access to gpu ticks */
>>>>>+    whitelist_reg_ext(w, GEN8_GPU_TICKS,
>>>>>+              RING_FORCE_TO_NONPRIV_ACCESS_RD);
>>>>>+
>>>>>+    /* Read access to: oa status, head, tail, buffer settings */
>>>>>+    whitelist_reg_ext(w, GEN8_OASTATUS,
>>>>>+              RING_FORCE_TO_NONPRIV_ACCESS_RD |
>>>>>+              RING_FORCE_TO_NONPRIV_RANGE_4);
>>>>>+}
>>>>>+
>>>>>+static void gen12_whitelist_build_performance_counters(struct 
>>>>>i915_wa_list *w)
>>>>>+{
>>>>>+    /* OA buffer trigger report 2/6 used by performance query */
>>>>>+    whitelist_reg(w, GEN12_OAG_OAREPORTTRIG2);
>>>>>+    whitelist_reg(w, GEN12_OAG_OAREPORTTRIG6);
>>>>>+
>>>>>+    /* Performance counters A18-20 used by tbs marker query */
>>>>>+    whitelist_reg_ext(w, GEN12_OAG_PERF_COUNTER_A18,
>>>>>+              RING_FORCE_TO_NONPRIV_ACCESS_RW |
>>>>>+              RING_FORCE_TO_NONPRIV_RANGE_16);
>>>>>+
>>>>>+    /* Read access to gpu ticks */
>>>>>+    whitelist_reg_ext(w, GEN12_OAG_GPU_TICKS,
>>>>>+              RING_FORCE_TO_NONPRIV_ACCESS_RD);
>>>>>+
>>>>>+    /* Read access to: oa status, head, tail, buffer settings */
>>>>>+    whitelist_reg_ext(w, GEN12_OAG_OASTATUS,
>>>>>+              RING_FORCE_TO_NONPRIV_ACCESS_RD |
>>>>>+              RING_FORCE_TO_NONPRIV_RANGE_4);
>>>>>+}
>>>>>+
>>>>
>>>>>...
>>>>
>>>>> struct i915_perf {
>>>>>diff --git a/drivers/gpu/drm/i915/i915_reg.h 
>>>>>b/drivers/gpu/drm/i915/i915_reg.h
>>>>>index 86a23ced051b..2e3d264339e0 100644
>>>>>--- a/drivers/gpu/drm/i915/i915_reg.h
>>>>>+++ b/drivers/gpu/drm/i915/i915_reg.h
>>>>>@@ -675,6 +675,7 @@ static inline bool 
>>>>>i915_mmio_reg_valid(i915_reg_t reg)
>>>>> #define  GEN7_OASTATUS2_HEAD_MASK           0xffffffc0
>>>>> #define  GEN7_OASTATUS2_MEM_SELECT_GGTT     (1 << 0) /* 0: 
>>>>>PPGTT, 1: GGTT */
>>>>>+#define GEN8_GPU_TICKS _MMIO(0x2910)
>>>>> #define GEN8_OASTATUS _MMIO(0x2b08)
>>>>> #define  GEN8_OASTATUS_OVERRUN_STATUS        (1 << 3)
>>>>> #define  GEN8_OASTATUS_COUNTER_OVERFLOW     (1 << 2)
>>>>>@@ -696,6 +697,7 @@ static inline bool 
>>>>>i915_mmio_reg_valid(i915_reg_t reg)
>>>>> #define OABUFFER_SIZE_16M   (7 << 3)
>>>>> #define GEN12_OA_TLB_INV_CR _MMIO(0xceec)
>>>>>+#define GEN12_SQCNT1 _MMIO(0x8718)
>>>>> /* Gen12 OAR unit */
>>>>> #define GEN12_OAR_OACONTROL _MMIO(0x2960)
>>>>>@@ -731,6 +733,7 @@ static inline bool 
>>>>>i915_mmio_reg_valid(i915_reg_t reg)
>>>>> #define  GEN12_OAG_OA_DEBUG_DISABLE_GO_1_0_REPORTS     (1 << 2)
>>>>> #define  GEN12_OAG_OA_DEBUG_DISABLE_CTX_SWITCH_REPORTS (1 << 1)
>>>>>+#define GEN12_OAG_GPU_TICKS _MMIO(0xda90)
>>>>> #define GEN12_OAG_OASTATUS _MMIO(0xdafc)
>>>>> #define  GEN12_OAG_OASTATUS_COUNTER_OVERFLOW (1 << 2)
>>>>> #define  GEN12_OAG_OASTATUS_BUFFER_OVERFLOW  (1 << 1)
>>>>>@@ -972,6 +975,17 @@ static inline bool 
>>>>>i915_mmio_reg_valid(i915_reg_t reg)
>>>>> #define OAREPORTTRIG8_NOA_SELECT_6_SHIFT    24
>>>>> #define OAREPORTTRIG8_NOA_SELECT_7_SHIFT    28
>>>>>+/* Performance counters registers */
>>>>>+#define OA_PERF_COUNTER_A18   _MMIO(0x2890)
>>>>>+#define OA_PERF_COUNTER_A19   _MMIO(0x2898)
>>>>>+#define OA_PERF_COUNTER_A20   _MMIO(0x28A0)
>>>>Maybe turn this into OA_PERF_COUNTER(idx) _MMIO(0x2800 + idx * 8)

That's a good idea, although not all _UPPER counters exist. Some 
counters are just 32 bits A33 etc. I would rather leave it like
this and index into the counters if we end up adding more here.

Thanks,
Umesh

>>>>>+
>>>>>+/* Gen12 Performance counters registers */
>>>>>+#define GEN12_OAG_PERF_COUNTER_A16   _MMIO(0xDA00)
>>>>>+#define GEN12_OAG_PERF_COUNTER_A18   _MMIO(0xDA10)
>>>>>+#define GEN12_OAG_PERF_COUNTER_A19   _MMIO(0xDA18)
>>>>>+#define GEN12_OAG_PERF_COUNTER_A20   _MMIO(0xDA20)
>>>>Same here
>>>>>+
>>>>> /* Same layout as OASTARTTRIGX */
>>>>> #define GEN12_OAG_OASTARTTRIG1 _MMIO(0xd900)
>>>>> #define GEN12_OAG_OASTARTTRIG2 _MMIO(0xd904)
>>>>>diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
>>>>>index 14b67cd6b54b..62b88c0123c8 100644
>>>>>--- a/include/uapi/drm/i915_drm.h
>>>>>+++ b/include/uapi/drm/i915_drm.h
>>>>>@@ -2048,6 +2048,25 @@ struct drm_i915_perf_open_param {z
>>>>>  */
>>>>> #define I915_PERF_IOCTL_CONFIG    _IO('i', 0x2)
>>>>>+/**
>>>>>+ * Returns OA buffer properties.
>>>>>+ *
>>>>>+ * This ioctl is available in perf revision 6.
>>>>>+ */
>>>>>+#define I915_PERF_IOCTL_GET_OA_BUFFER_INFO _IO('i', 0x3)
>>>>>+
>>>>>+/**
>>>>>+ * OA buffer information structure.
>>>>>+ */
>>>>>+struct drm_i915_perf_oa_buffer_info {
>>>>>+    __u32 size;
>>>>>+    __u32 head;
>>>>>+    __u32 tail;
>>>>>+    __u32 gpu_address;
>>>>>+    __u64 cpu_address;
>>>>>+    __u64 reserved[4];
>>>>>+};
>>>>>+
>>>>> /**
>>>>>  * Common to all i915 perf records
>>>>>  */
>>>>
>>>>Thanks a lot,
>>>>
>>>>
>>>>-Lionel
>>>>
>>
>_______________________________________________
>Intel-gfx mailing list
>Intel-gfx@lists.freedesktop.org
>https://lists.freedesktop.org/mailman/listinfo/intel-gfx
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2020-07-17  0:28 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-07-14  7:22 [Intel-gfx] [PATCH 0/1] Allow privileged user to map the OA buffer Umesh Nerlige Ramappa
2020-07-14  7:22 ` [Intel-gfx] [PATCH 1/1] drm/i915/perf: Map OA buffer to user space Umesh Nerlige Ramappa
2020-07-14  7:44   ` Umesh Nerlige Ramappa
2020-07-14 11:28   ` Chris Wilson
2020-07-14 20:10     ` Umesh Nerlige Ramappa
2020-07-14 20:24       ` Chris Wilson
2020-07-16 15:32   ` Lionel Landwerlin
2020-07-16 18:06     ` Umesh Nerlige Ramappa
2020-07-16 18:44       ` Lionel Landwerlin
2020-07-16 19:28         ` Umesh Nerlige Ramappa
2020-07-17  0:28           ` Umesh Nerlige Ramappa
2020-07-14  8:17 ` [Intel-gfx] ✗ Fi.CI.CHECKPATCH: warning for Allow privileged user to map the OA buffer Patchwork
2020-07-14  8:44 ` [Intel-gfx] ✗ Fi.CI.BAT: failure " Patchwork

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.