All of lore.kernel.org
 help / color / mirror / Atom feed
* [Intel-gfx] [PATCH] i915/query: Correlate engine and cpu timestamps with better accuracy
@ 2021-03-02 18:29 Umesh Nerlige Ramappa
  2021-03-02 20:35 ` Lionel Landwerlin
                   ` (2 more replies)
  0 siblings, 3 replies; 19+ messages in thread
From: Umesh Nerlige Ramappa @ 2021-03-02 18:29 UTC (permalink / raw)
  To: intel-gfx, Chris Wilson, Tvrtko Ursulin, Lionel G Landwerlin

Perf measurements rely on CPU and engine timestamps to correlate
events of interest across these time domains. Current mechanisms get
these timestamps separately and the calculated delta between these
timestamps lack enough accuracy.

To improve the accuracy of these time measurements to within a few us,
add a query that returns the engine and cpu timestamps captured as
close to each other as possible.

v2: (Tvrtko)
- document clock reference used
- return cpu timestamp always
- capture cpu time just before lower dword of cs timestamp

v3: (Chris)
- use uncore-rpm
- use __query_cs_timestamp helper

v4: (Lionel)
- Kernel perf subsytem allows users to specify the clock id to be used
  in perf_event_open. This clock id is used by the perf subsystem to
  return the appropriate cpu timestamp in perf events. Similarly, let
  the user pass the clockid to this query so that cpu timestamp
  corresponds to the clock id requested.

v5: (Tvrtko)
- Use normal ktime accessors instead of fast versions
- Add more uApi documentation

v6: (Lionel)
- Move switch out of spinlock

v7: (Chris)
- cs_timestamp is a misnomer, use cs_cycles instead
- return the cs cycle frequency as well in the query

v8:
- Add platform and engine specific checks

Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
---
 drivers/gpu/drm/i915/i915_query.c | 140 ++++++++++++++++++++++++++++++
 include/uapi/drm/i915_drm.h       |  43 +++++++++
 2 files changed, 183 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_query.c b/drivers/gpu/drm/i915/i915_query.c
index fed337ad7b68..763f0f918065 100644
--- a/drivers/gpu/drm/i915/i915_query.c
+++ b/drivers/gpu/drm/i915/i915_query.c
@@ -6,6 +6,8 @@
 
 #include <linux/nospec.h>
 
+#include "gt/intel_engine_pm.h"
+#include "gt/intel_engine_user.h"
 #include "i915_drv.h"
 #include "i915_perf.h"
 #include "i915_query.h"
@@ -90,6 +92,143 @@ static int query_topology_info(struct drm_i915_private *dev_priv,
 	return total_length;
 }
 
+typedef u64 (*__ktime_func_t)(void);
+static __ktime_func_t __clock_id_to_func(clockid_t clk_id)
+{
+	/*
+	 * Use logic same as the perf subsystem to allow user to select the
+	 * reference clock id to be used for timestamps.
+	 */
+	switch (clk_id) {
+	case CLOCK_MONOTONIC:
+		return &ktime_get_ns;
+	case CLOCK_MONOTONIC_RAW:
+		return &ktime_get_raw_ns;
+	case CLOCK_REALTIME:
+		return &ktime_get_real_ns;
+	case CLOCK_BOOTTIME:
+		return &ktime_get_boottime_ns;
+	case CLOCK_TAI:
+		return &ktime_get_clocktai_ns;
+	default:
+		return NULL;
+	}
+}
+
+static inline int
+__read_timestamps(struct intel_uncore *uncore,
+		  i915_reg_t lower_reg,
+		  i915_reg_t upper_reg,
+		  u64 *cs_ts,
+		  u64 *cpu_ts,
+		  __ktime_func_t cpu_clock)
+{
+	u32 upper, lower, old_upper, loop = 0;
+
+	upper = intel_uncore_read_fw(uncore, upper_reg);
+	do {
+		*cpu_ts = cpu_clock();
+		lower = intel_uncore_read_fw(uncore, lower_reg);
+		old_upper = upper;
+		upper = intel_uncore_read_fw(uncore, upper_reg);
+	} while (upper != old_upper && loop++ < 2);
+
+	*cs_ts = (u64)upper << 32 | lower;
+
+	return 0;
+}
+
+static int
+__query_cs_cycles(struct intel_engine_cs *engine,
+		  u64 *cs_ts, u64 *cpu_ts,
+		  __ktime_func_t cpu_clock)
+{
+	struct intel_uncore *uncore = engine->uncore;
+	enum forcewake_domains fw_domains;
+	u32 base = engine->mmio_base;
+	intel_wakeref_t wakeref;
+	int ret;
+
+	fw_domains = intel_uncore_forcewake_for_reg(uncore,
+						    RING_TIMESTAMP(base),
+						    FW_REG_READ);
+
+	with_intel_runtime_pm(uncore->rpm, wakeref) {
+		spin_lock_irq(&uncore->lock);
+		intel_uncore_forcewake_get__locked(uncore, fw_domains);
+
+		ret = __read_timestamps(uncore,
+					RING_TIMESTAMP(base),
+					RING_TIMESTAMP_UDW(base),
+					cs_ts,
+					cpu_ts,
+					cpu_clock);
+
+		intel_uncore_forcewake_put__locked(uncore, fw_domains);
+		spin_unlock_irq(&uncore->lock);
+	}
+
+	return ret;
+}
+
+static int
+query_cs_cycles(struct drm_i915_private *i915,
+		struct drm_i915_query_item *query_item)
+{
+	struct drm_i915_query_cs_cycles __user *query_ptr;
+	struct drm_i915_query_cs_cycles query;
+	struct intel_engine_cs *engine;
+	__ktime_func_t cpu_clock;
+	int ret;
+
+	if (INTEL_GEN(i915) < 6)
+		return -ENODEV;
+
+	query_ptr = u64_to_user_ptr(query_item->data_ptr);
+	ret = copy_query_item(&query, sizeof(query), sizeof(query), query_item);
+	if (ret != 0)
+		return ret;
+
+	if (query.flags)
+		return -EINVAL;
+
+	if (query.rsvd)
+		return -EINVAL;
+
+	cpu_clock = __clock_id_to_func(query.clockid);
+	if (!cpu_clock)
+		return -EINVAL;
+
+	engine = intel_engine_lookup_user(i915,
+					  query.engine.engine_class,
+					  query.engine.engine_instance);
+	if (!engine)
+		return -EINVAL;
+
+	if (IS_GEN(i915, 6) &&
+	    query.engine.engine_class != I915_ENGINE_CLASS_RENDER)
+		return -ENODEV;
+
+	query.cs_frequency = engine->gt->clock_frequency;
+	ret = __query_cs_cycles(engine,
+				&query.cs_cycles,
+				&query.cpu_timestamp,
+				cpu_clock);
+	if (ret)
+		return ret;
+
+	if (put_user(query.cs_frequency, &query_ptr->cs_frequency))
+		return -EFAULT;
+
+	if (put_user(query.cpu_timestamp, &query_ptr->cpu_timestamp))
+		return -EFAULT;
+
+	if (put_user(query.cs_cycles, &query_ptr->cs_cycles))
+		return -EFAULT;
+
+	return sizeof(query);
+}
+
 static int
 query_engine_info(struct drm_i915_private *i915,
 		  struct drm_i915_query_item *query_item)
@@ -424,6 +563,7 @@ static int (* const i915_query_funcs[])(struct drm_i915_private *dev_priv,
 	query_topology_info,
 	query_engine_info,
 	query_perf_config,
+	query_cs_cycles,
 };
 
 int i915_query_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 1987e2ea79a3..379ae6e7aeb0 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -2176,6 +2176,10 @@ struct drm_i915_query_item {
 #define DRM_I915_QUERY_TOPOLOGY_INFO    1
 #define DRM_I915_QUERY_ENGINE_INFO	2
 #define DRM_I915_QUERY_PERF_CONFIG      3
+	/**
+	 * Query Command Streamer timestamp register.
+	 */
+#define DRM_I915_QUERY_CS_CYCLES	4
 /* Must be kept compact -- no holes and well documented */
 
 	/*
@@ -2309,6 +2313,45 @@ struct drm_i915_engine_info {
 	__u64 rsvd1[4];
 };
 
+/**
+ * struct drm_i915_query_cs_cycles
+ *
+ * The query returns the command streamer cycles and the frequency that can be
+ * used to calculate the command streamer timestamp. In addition the query
+ * returns the cpu timestamp that indicates when the command streamer cycle
+ * count was captured.
+ */
+struct drm_i915_query_cs_cycles {
+	/** Engine for which command streamer cycles is queried. */
+	struct i915_engine_class_instance engine;
+
+	/** Must be zero. */
+	__u32 flags;
+
+	/**
+	 * Command streamer cycles as read from the command streamer
+	 * register at 0x358 offset.
+	 */
+	__u64 cs_cycles;
+
+	/** Frequency of the cs cycles in Hz. */
+	__u64 cs_frequency;
+
+	/** CPU timestamp in nanoseconds. */
+	__u64 cpu_timestamp;
+
+	/**
+	 * Reference clock id for CPU timestamp. For definition, see
+	 * clock_gettime(2) and perf_event_open(2). Supported clock ids are
+	 * CLOCK_MONOTONIC, CLOCK_MONOTONIC_RAW, CLOCK_REALTIME, CLOCK_BOOTTIME,
+	 * CLOCK_TAI.
+	 */
+	__s32 clockid;
+
+	/** Must be zero. */
+	__u32 rsvd;
+};
+
 /**
  * struct drm_i915_query_engine_info
  *
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [Intel-gfx] [PATCH] i915/query: Correlate engine and cpu timestamps with better accuracy
  2021-03-02 18:29 [Intel-gfx] [PATCH] i915/query: Correlate engine and cpu timestamps with better accuracy Umesh Nerlige Ramappa
@ 2021-03-02 20:35 ` Lionel Landwerlin
  2021-03-03  0:12   ` Umesh Nerlige Ramappa
  2021-03-03 17:02 ` [Intel-gfx] ✓ Fi.CI.BAT: success for " Patchwork
  2021-03-03 20:23 ` [Intel-gfx] ✓ Fi.CI.IGT: " Patchwork
  2 siblings, 1 reply; 19+ messages in thread
From: Lionel Landwerlin @ 2021-03-02 20:35 UTC (permalink / raw)
  To: Umesh Nerlige Ramappa, intel-gfx, Chris Wilson, Tvrtko Ursulin

Thanks a bunch for sharing this!

On 02/03/2021 20:29, Umesh Nerlige Ramappa wrote:
> Perf measurements rely on CPU and engine timestamps to correlate
> events of interest across these time domains. Current mechanisms get
> these timestamps separately and the calculated delta between these
> timestamps lack enough accuracy.
>
> To improve the accuracy of these time measurements to within a few us,
> add a query that returns the engine and cpu timestamps captured as
> close to each other as possible.
>
> v2: (Tvrtko)
> - document clock reference used
> - return cpu timestamp always
> - capture cpu time just before lower dword of cs timestamp
>
> v3: (Chris)
> - use uncore-rpm
> - use __query_cs_timestamp helper
>
> v4: (Lionel)
> - Kernel perf subsytem allows users to specify the clock id to be used
>    in perf_event_open. This clock id is used by the perf subsystem to
>    return the appropriate cpu timestamp in perf events. Similarly, let
>    the user pass the clockid to this query so that cpu timestamp
>    corresponds to the clock id requested.
>
> v5: (Tvrtko)
> - Use normal ktime accessors instead of fast versions
> - Add more uApi documentation
>
> v6: (Lionel)
> - Move switch out of spinlock
>
> v7: (Chris)
> - cs_timestamp is a misnomer, use cs_cycles instead
> - return the cs cycle frequency as well in the query
>
> v8:
> - Add platform and engine specific checks
>
> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
> ---
>   drivers/gpu/drm/i915/i915_query.c | 140 ++++++++++++++++++++++++++++++
>   include/uapi/drm/i915_drm.h       |  43 +++++++++
>   2 files changed, 183 insertions(+)
>
> diff --git a/drivers/gpu/drm/i915/i915_query.c b/drivers/gpu/drm/i915/i915_query.c
> index fed337ad7b68..763f0f918065 100644
> --- a/drivers/gpu/drm/i915/i915_query.c
> +++ b/drivers/gpu/drm/i915/i915_query.c
> @@ -6,6 +6,8 @@
>   
>   #include <linux/nospec.h>
>   
> +#include "gt/intel_engine_pm.h"
> +#include "gt/intel_engine_user.h"
>   #include "i915_drv.h"
>   #include "i915_perf.h"
>   #include "i915_query.h"
> @@ -90,6 +92,143 @@ static int query_topology_info(struct drm_i915_private *dev_priv,
>   	return total_length;
>   }
>   
> +typedef u64 (*__ktime_func_t)(void);
> +static __ktime_func_t __clock_id_to_func(clockid_t clk_id)
> +{
> +	/*
> +	 * Use logic same as the perf subsystem to allow user to select the
> +	 * reference clock id to be used for timestamps.
> +	 */
> +	switch (clk_id) {
> +	case CLOCK_MONOTONIC:
> +		return &ktime_get_ns;
> +	case CLOCK_MONOTONIC_RAW:
> +		return &ktime_get_raw_ns;
> +	case CLOCK_REALTIME:
> +		return &ktime_get_real_ns;
> +	case CLOCK_BOOTTIME:
> +		return &ktime_get_boottime_ns;
> +	case CLOCK_TAI:
> +		return &ktime_get_clocktai_ns;
> +	default:
> +		return NULL;
> +	}
> +}
> +
> +static inline int
> +__read_timestamps(struct intel_uncore *uncore,
> +		  i915_reg_t lower_reg,
> +		  i915_reg_t upper_reg,
> +		  u64 *cs_ts,
> +		  u64 *cpu_ts,
> +		  __ktime_func_t cpu_clock)
> +{
> +	u32 upper, lower, old_upper, loop = 0;
> +
> +	upper = intel_uncore_read_fw(uncore, upper_reg);
> +	do {
> +		*cpu_ts = cpu_clock();
> +		lower = intel_uncore_read_fw(uncore, lower_reg);
> +		old_upper = upper;
> +		upper = intel_uncore_read_fw(uncore, upper_reg);
> +	} while (upper != old_upper && loop++ < 2);


With the 2 cpu timestamps things I mentioned below, this would be


do {

     *cpu_ts0 = cpu_clock();

     lower = intel_uncore_read_fw(uncore, lower_reg);

     *cpu_ts1 = cpu_clock();

     upper = intel_uncore_read_fw(uncore, upper_reg);

} while (upper != old_upper && loop++ < 2);


> +
> +	*cs_ts = (u64)upper << 32 | lower;
> +
> +	return 0;
> +}
> +
> +static int
> +__query_cs_cycles(struct intel_engine_cs *engine,
> +		  u64 *cs_ts, u64 *cpu_ts,
> +		  __ktime_func_t cpu_clock)
> +{
> +	struct intel_uncore *uncore = engine->uncore;
> +	enum forcewake_domains fw_domains;
> +	u32 base = engine->mmio_base;
> +	intel_wakeref_t wakeref;
> +	int ret;
> +
> +	fw_domains = intel_uncore_forcewake_for_reg(uncore,
> +						    RING_TIMESTAMP(base),
> +						    FW_REG_READ);
> +
> +	with_intel_runtime_pm(uncore->rpm, wakeref) {
> +		spin_lock_irq(&uncore->lock);
> +		intel_uncore_forcewake_get__locked(uncore, fw_domains);
> +
> +		ret = __read_timestamps(uncore,
> +					RING_TIMESTAMP(base),
> +					RING_TIMESTAMP_UDW(base),
> +					cs_ts,
> +					cpu_ts,
> +					cpu_clock);
> +
> +		intel_uncore_forcewake_put__locked(uncore, fw_domains);
> +		spin_unlock_irq(&uncore->lock);
> +	}
> +
> +	return ret;
> +}
> +
> +static int
> +query_cs_cycles(struct drm_i915_private *i915,
> +		struct drm_i915_query_item *query_item)
> +{
> +	struct drm_i915_query_cs_cycles __user *query_ptr;
> +	struct drm_i915_query_cs_cycles query;
> +	struct intel_engine_cs *engine;
> +	__ktime_func_t cpu_clock;
> +	int ret;
> +
> +	if (INTEL_GEN(i915) < 6)
> +		return -ENODEV;
> +
> +	query_ptr = u64_to_user_ptr(query_item->data_ptr);
> +	ret = copy_query_item(&query, sizeof(query), sizeof(query), query_item);
> +	if (ret != 0)
> +		return ret;
> +
> +	if (query.flags)
> +		return -EINVAL;
> +
> +	if (query.rsvd)
> +		return -EINVAL;
> +
> +	cpu_clock = __clock_id_to_func(query.clockid);
> +	if (!cpu_clock)
> +		return -EINVAL;
> +
> +	engine = intel_engine_lookup_user(i915,
> +					  query.engine.engine_class,
> +					  query.engine.engine_instance);
> +	if (!engine)
> +		return -EINVAL;
> +
> +	if (IS_GEN(i915, 6) &&
> +	    query.engine.engine_class != I915_ENGINE_CLASS_RENDER)
> +		return -ENODEV;
> +
> +	query.cs_frequency = engine->gt->clock_frequency;
> +	ret = __query_cs_cycles(engine,
> +				&query.cs_cycles,
> +				&query.cpu_timestamp,
> +				cpu_clock);
> +	if (ret)
> +		return ret;
> +
> +	if (put_user(query.cs_frequency, &query_ptr->cs_frequency))
> +		return -EFAULT;
> +
> +	if (put_user(query.cpu_timestamp, &query_ptr->cpu_timestamp))
> +		return -EFAULT;
> +
> +	if (put_user(query.cs_cycles, &query_ptr->cs_cycles))
> +		return -EFAULT;
> +
> +	return sizeof(query);
> +}
> +
>   static int
>   query_engine_info(struct drm_i915_private *i915,
>   		  struct drm_i915_query_item *query_item)
> @@ -424,6 +563,7 @@ static int (* const i915_query_funcs[])(struct drm_i915_private *dev_priv,
>   	query_topology_info,
>   	query_engine_info,
>   	query_perf_config,
> +	query_cs_cycles,
>   };
>   
>   int i915_query_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index 1987e2ea79a3..379ae6e7aeb0 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -2176,6 +2176,10 @@ struct drm_i915_query_item {
>   #define DRM_I915_QUERY_TOPOLOGY_INFO    1
>   #define DRM_I915_QUERY_ENGINE_INFO	2
>   #define DRM_I915_QUERY_PERF_CONFIG      3
> +	/**
> +	 * Query Command Streamer timestamp register.
> +	 */
> +#define DRM_I915_QUERY_CS_CYCLES	4
>   /* Must be kept compact -- no holes and well documented */
>   
>   	/*
> @@ -2309,6 +2313,45 @@ struct drm_i915_engine_info {
>   	__u64 rsvd1[4];
>   };
>   
> +/**
> + * struct drm_i915_query_cs_cycles
> + *
> + * The query returns the command streamer cycles and the frequency that can be
> + * used to calculate the command streamer timestamp. In addition the query
> + * returns the cpu timestamp that indicates when the command streamer cycle
> + * count was captured.
> + */
> +struct drm_i915_query_cs_cycles {
> +	/** Engine for which command streamer cycles is queried. */
> +	struct i915_engine_class_instance engine;
> +
> +	/** Must be zero. */
> +	__u32 flags;
> +
> +	/**
> +	 * Command streamer cycles as read from the command streamer
> +	 * register at 0x358 offset.
> +	 */
> +	__u64 cs_cycles;
> +
> +	/** Frequency of the cs cycles in Hz. */
> +	__u64 cs_frequency;
> +
> +	/** CPU timestamp in nanoseconds. */
> +	__u64 cpu_timestamp;


Would it be possible to have : u64 cpu_timestamps[2];

with cpu_timestamps[0] taken before & cpu_timestamps[1] taken after the 
cs_cycles, so we can have an idea of how long the read takes.


> +
> +	/**
> +	 * Reference clock id for CPU timestamp. For definition, see
> +	 * clock_gettime(2) and perf_event_open(2). Supported clock ids are
> +	 * CLOCK_MONOTONIC, CLOCK_MONOTONIC_RAW, CLOCK_REALTIME, CLOCK_BOOTTIME,
> +	 * CLOCK_TAI.
> +	 */
> +	__s32 clockid;
> +
> +	/** Must be zero. */
> +	__u32 rsvd;
> +};
> +
>   /**
>    * struct drm_i915_query_engine_info
>    *


_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Intel-gfx] [PATCH] i915/query: Correlate engine and cpu timestamps with better accuracy
  2021-03-02 20:35 ` Lionel Landwerlin
@ 2021-03-03  0:12   ` Umesh Nerlige Ramappa
  2021-03-03  9:21     ` Lionel Landwerlin
  0 siblings, 1 reply; 19+ messages in thread
From: Umesh Nerlige Ramappa @ 2021-03-03  0:12 UTC (permalink / raw)
  To: Lionel Landwerlin; +Cc: intel-gfx, Chris Wilson

On Tue, Mar 02, 2021 at 10:35:19PM +0200, Lionel Landwerlin wrote:
>Thanks a bunch for sharing this!
>
>On 02/03/2021 20:29, Umesh Nerlige Ramappa wrote:
>>Perf measurements rely on CPU and engine timestamps to correlate
>>events of interest across these time domains. Current mechanisms get
>>these timestamps separately and the calculated delta between these
>>timestamps lack enough accuracy.
>>
>>To improve the accuracy of these time measurements to within a few us,
>>add a query that returns the engine and cpu timestamps captured as
>>close to each other as possible.
>>
>>v2: (Tvrtko)
>>- document clock reference used
>>- return cpu timestamp always
>>- capture cpu time just before lower dword of cs timestamp
>>
>>v3: (Chris)
>>- use uncore-rpm
>>- use __query_cs_timestamp helper
>>
>>v4: (Lionel)
>>- Kernel perf subsytem allows users to specify the clock id to be used
>>   in perf_event_open. This clock id is used by the perf subsystem to
>>   return the appropriate cpu timestamp in perf events. Similarly, let
>>   the user pass the clockid to this query so that cpu timestamp
>>   corresponds to the clock id requested.
>>
>>v5: (Tvrtko)
>>- Use normal ktime accessors instead of fast versions
>>- Add more uApi documentation
>>
>>v6: (Lionel)
>>- Move switch out of spinlock
>>
>>v7: (Chris)
>>- cs_timestamp is a misnomer, use cs_cycles instead
>>- return the cs cycle frequency as well in the query
>>
>>v8:
>>- Add platform and engine specific checks
>>
>>Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
>>---
>>  drivers/gpu/drm/i915/i915_query.c | 140 ++++++++++++++++++++++++++++++
>>  include/uapi/drm/i915_drm.h       |  43 +++++++++
>>  2 files changed, 183 insertions(+)
>>
>>diff --git a/drivers/gpu/drm/i915/i915_query.c b/drivers/gpu/drm/i915/i915_query.c
>>index fed337ad7b68..763f0f918065 100644
>>--- a/drivers/gpu/drm/i915/i915_query.c
>>+++ b/drivers/gpu/drm/i915/i915_query.c
>>@@ -6,6 +6,8 @@
>>  #include <linux/nospec.h>
>>+#include "gt/intel_engine_pm.h"
>>+#include "gt/intel_engine_user.h"
>>  #include "i915_drv.h"
>>  #include "i915_perf.h"
>>  #include "i915_query.h"
>>@@ -90,6 +92,143 @@ static int query_topology_info(struct drm_i915_private *dev_priv,
>>  	return total_length;
>>  }
>>+typedef u64 (*__ktime_func_t)(void);
>>+static __ktime_func_t __clock_id_to_func(clockid_t clk_id)
>>+{
>>+	/*
>>+	 * Use logic same as the perf subsystem to allow user to select the
>>+	 * reference clock id to be used for timestamps.
>>+	 */
>>+	switch (clk_id) {
>>+	case CLOCK_MONOTONIC:
>>+		return &ktime_get_ns;
>>+	case CLOCK_MONOTONIC_RAW:
>>+		return &ktime_get_raw_ns;
>>+	case CLOCK_REALTIME:
>>+		return &ktime_get_real_ns;
>>+	case CLOCK_BOOTTIME:
>>+		return &ktime_get_boottime_ns;
>>+	case CLOCK_TAI:
>>+		return &ktime_get_clocktai_ns;
>>+	default:
>>+		return NULL;
>>+	}
>>+}
>>+
>>+static inline int
>>+__read_timestamps(struct intel_uncore *uncore,
>>+		  i915_reg_t lower_reg,
>>+		  i915_reg_t upper_reg,
>>+		  u64 *cs_ts,
>>+		  u64 *cpu_ts,
>>+		  __ktime_func_t cpu_clock)
>>+{
>>+	u32 upper, lower, old_upper, loop = 0;
>>+
>>+	upper = intel_uncore_read_fw(uncore, upper_reg);
>>+	do {
>>+		*cpu_ts = cpu_clock();
>>+		lower = intel_uncore_read_fw(uncore, lower_reg);
>>+		old_upper = upper;
>>+		upper = intel_uncore_read_fw(uncore, upper_reg);
>>+	} while (upper != old_upper && loop++ < 2);
>
>
>With the 2 cpu timestamps things I mentioned below, this would be
>
>
>do {
>
>    *cpu_ts0 = cpu_clock();
>
>    lower = intel_uncore_read_fw(uncore, lower_reg);
>
>    *cpu_ts1 = cpu_clock();
>
>    upper = intel_uncore_read_fw(uncore, upper_reg);
>
>} while (upper != old_upper && loop++ < 2);
>
>
>>+
>>+	*cs_ts = (u64)upper << 32 | lower;
>>+
>>+	return 0;
>>+}
>>+
>>+static int
>>+__query_cs_cycles(struct intel_engine_cs *engine,
>>+		  u64 *cs_ts, u64 *cpu_ts,
>>+		  __ktime_func_t cpu_clock)
>>+{
>>+	struct intel_uncore *uncore = engine->uncore;
>>+	enum forcewake_domains fw_domains;
>>+	u32 base = engine->mmio_base;
>>+	intel_wakeref_t wakeref;
>>+	int ret;
>>+
>>+	fw_domains = intel_uncore_forcewake_for_reg(uncore,
>>+						    RING_TIMESTAMP(base),
>>+						    FW_REG_READ);
>>+
>>+	with_intel_runtime_pm(uncore->rpm, wakeref) {
>>+		spin_lock_irq(&uncore->lock);
>>+		intel_uncore_forcewake_get__locked(uncore, fw_domains);
>>+
>>+		ret = __read_timestamps(uncore,
>>+					RING_TIMESTAMP(base),
>>+					RING_TIMESTAMP_UDW(base),
>>+					cs_ts,
>>+					cpu_ts,
>>+					cpu_clock);
>>+
>>+		intel_uncore_forcewake_put__locked(uncore, fw_domains);
>>+		spin_unlock_irq(&uncore->lock);
>>+	}
>>+
>>+	return ret;
>>+}
>>+
>>+static int
>>+query_cs_cycles(struct drm_i915_private *i915,
>>+		struct drm_i915_query_item *query_item)
>>+{
>>+	struct drm_i915_query_cs_cycles __user *query_ptr;
>>+	struct drm_i915_query_cs_cycles query;
>>+	struct intel_engine_cs *engine;
>>+	__ktime_func_t cpu_clock;
>>+	int ret;
>>+
>>+	if (INTEL_GEN(i915) < 6)
>>+		return -ENODEV;
>>+
>>+	query_ptr = u64_to_user_ptr(query_item->data_ptr);
>>+	ret = copy_query_item(&query, sizeof(query), sizeof(query), query_item);
>>+	if (ret != 0)
>>+		return ret;
>>+
>>+	if (query.flags)
>>+		return -EINVAL;
>>+
>>+	if (query.rsvd)
>>+		return -EINVAL;
>>+
>>+	cpu_clock = __clock_id_to_func(query.clockid);
>>+	if (!cpu_clock)
>>+		return -EINVAL;
>>+
>>+	engine = intel_engine_lookup_user(i915,
>>+					  query.engine.engine_class,
>>+					  query.engine.engine_instance);
>>+	if (!engine)
>>+		return -EINVAL;
>>+
>>+	if (IS_GEN(i915, 6) &&
>>+	    query.engine.engine_class != I915_ENGINE_CLASS_RENDER)
>>+		return -ENODEV;
>>+
>>+	query.cs_frequency = engine->gt->clock_frequency;
>>+	ret = __query_cs_cycles(engine,
>>+				&query.cs_cycles,
>>+				&query.cpu_timestamp,
>>+				cpu_clock);
>>+	if (ret)
>>+		return ret;
>>+
>>+	if (put_user(query.cs_frequency, &query_ptr->cs_frequency))
>>+		return -EFAULT;
>>+
>>+	if (put_user(query.cpu_timestamp, &query_ptr->cpu_timestamp))
>>+		return -EFAULT;
>>+
>>+	if (put_user(query.cs_cycles, &query_ptr->cs_cycles))
>>+		return -EFAULT;
>>+
>>+	return sizeof(query);
>>+}
>>+
>>  static int
>>  query_engine_info(struct drm_i915_private *i915,
>>  		  struct drm_i915_query_item *query_item)
>>@@ -424,6 +563,7 @@ static int (* const i915_query_funcs[])(struct drm_i915_private *dev_priv,
>>  	query_topology_info,
>>  	query_engine_info,
>>  	query_perf_config,
>>+	query_cs_cycles,
>>  };
>>  int i915_query_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>>diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
>>index 1987e2ea79a3..379ae6e7aeb0 100644
>>--- a/include/uapi/drm/i915_drm.h
>>+++ b/include/uapi/drm/i915_drm.h
>>@@ -2176,6 +2176,10 @@ struct drm_i915_query_item {
>>  #define DRM_I915_QUERY_TOPOLOGY_INFO    1
>>  #define DRM_I915_QUERY_ENGINE_INFO	2
>>  #define DRM_I915_QUERY_PERF_CONFIG      3
>>+	/**
>>+	 * Query Command Streamer timestamp register.
>>+	 */
>>+#define DRM_I915_QUERY_CS_CYCLES	4
>>  /* Must be kept compact -- no holes and well documented */
>>  	/*
>>@@ -2309,6 +2313,45 @@ struct drm_i915_engine_info {
>>  	__u64 rsvd1[4];
>>  };
>>+/**
>>+ * struct drm_i915_query_cs_cycles
>>+ *
>>+ * The query returns the command streamer cycles and the frequency that can be
>>+ * used to calculate the command streamer timestamp. In addition the query
>>+ * returns the cpu timestamp that indicates when the command streamer cycle
>>+ * count was captured.
>>+ */
>>+struct drm_i915_query_cs_cycles {
>>+	/** Engine for which command streamer cycles is queried. */
>>+	struct i915_engine_class_instance engine;
>>+
>>+	/** Must be zero. */
>>+	__u32 flags;
>>+
>>+	/**
>>+	 * Command streamer cycles as read from the command streamer
>>+	 * register at 0x358 offset.
>>+	 */
>>+	__u64 cs_cycles;
>>+
>>+	/** Frequency of the cs cycles in Hz. */
>>+	__u64 cs_frequency;
>>+
>>+	/** CPU timestamp in nanoseconds. */
>>+	__u64 cpu_timestamp;
>
>
>Would it be possible to have : u64 cpu_timestamps[2];
>
>with cpu_timestamps[0] taken before & cpu_timestamps[1] taken after 
>the cs_cycles, so we can have an idea of how long the read takes.

Possible, but I thought multiple queries would indirectly provide such 
information. If query1 returns cpu1 and cs1 time and query2 returns cpu2 
and cs2 times. Assuming neither overflowed,

|((cpu2 - cpu1) - (cs1 - cs2))|

should be the worst case time taken to read the register (essentially 
delta_delta in the IGT test). Thoughts?

Thanks,
Umesh

>
>
>>+
>>+	/**
>>+	 * Reference clock id for CPU timestamp. For definition, see
>>+	 * clock_gettime(2) and perf_event_open(2). Supported clock ids are
>>+	 * CLOCK_MONOTONIC, CLOCK_MONOTONIC_RAW, CLOCK_REALTIME, CLOCK_BOOTTIME,
>>+	 * CLOCK_TAI.
>>+	 */
>>+	__s32 clockid;
>>+
>>+	/** Must be zero. */
>>+	__u32 rsvd;
>>+};
>>+
>>  /**
>>   * struct drm_i915_query_engine_info
>>   *
>
>
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Intel-gfx] [PATCH] i915/query: Correlate engine and cpu timestamps with better accuracy
  2021-03-03  0:12   ` Umesh Nerlige Ramappa
@ 2021-03-03  9:21     ` Lionel Landwerlin
  2021-03-03 16:27       ` Umesh Nerlige Ramappa
  0 siblings, 1 reply; 19+ messages in thread
From: Lionel Landwerlin @ 2021-03-03  9:21 UTC (permalink / raw)
  To: Umesh Nerlige Ramappa; +Cc: intel-gfx, Chris Wilson

On 03/03/2021 02:12, Umesh Nerlige Ramappa wrote:
> On Tue, Mar 02, 2021 at 10:35:19PM +0200, Lionel Landwerlin wrote:
>> Thanks a bunch for sharing this!
>>
>> On 02/03/2021 20:29, Umesh Nerlige Ramappa wrote:
>>> Perf measurements rely on CPU and engine timestamps to correlate
>>> events of interest across these time domains. Current mechanisms get
>>> these timestamps separately and the calculated delta between these
>>> timestamps lack enough accuracy.
>>>
>>> To improve the accuracy of these time measurements to within a few us,
>>> add a query that returns the engine and cpu timestamps captured as
>>> close to each other as possible.
>>>
>>> v2: (Tvrtko)
>>> - document clock reference used
>>> - return cpu timestamp always
>>> - capture cpu time just before lower dword of cs timestamp
>>>
>>> v3: (Chris)
>>> - use uncore-rpm
>>> - use __query_cs_timestamp helper
>>>
>>> v4: (Lionel)
>>> - Kernel perf subsytem allows users to specify the clock id to be used
>>>   in perf_event_open. This clock id is used by the perf subsystem to
>>>   return the appropriate cpu timestamp in perf events. Similarly, let
>>>   the user pass the clockid to this query so that cpu timestamp
>>>   corresponds to the clock id requested.
>>>
>>> v5: (Tvrtko)
>>> - Use normal ktime accessors instead of fast versions
>>> - Add more uApi documentation
>>>
>>> v6: (Lionel)
>>> - Move switch out of spinlock
>>>
>>> v7: (Chris)
>>> - cs_timestamp is a misnomer, use cs_cycles instead
>>> - return the cs cycle frequency as well in the query
>>>
>>> v8:
>>> - Add platform and engine specific checks
>>>
>>> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
>>> ---
>>>  drivers/gpu/drm/i915/i915_query.c | 140 ++++++++++++++++++++++++++++++
>>>  include/uapi/drm/i915_drm.h       |  43 +++++++++
>>>  2 files changed, 183 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/i915/i915_query.c 
>>> b/drivers/gpu/drm/i915/i915_query.c
>>> index fed337ad7b68..763f0f918065 100644
>>> --- a/drivers/gpu/drm/i915/i915_query.c
>>> +++ b/drivers/gpu/drm/i915/i915_query.c
>>> @@ -6,6 +6,8 @@
>>>  #include <linux/nospec.h>
>>> +#include "gt/intel_engine_pm.h"
>>> +#include "gt/intel_engine_user.h"
>>>  #include "i915_drv.h"
>>>  #include "i915_perf.h"
>>>  #include "i915_query.h"
>>> @@ -90,6 +92,143 @@ static int query_topology_info(struct 
>>> drm_i915_private *dev_priv,
>>>      return total_length;
>>>  }
>>> +typedef u64 (*__ktime_func_t)(void);
>>> +static __ktime_func_t __clock_id_to_func(clockid_t clk_id)
>>> +{
>>> +    /*
>>> +     * Use logic same as the perf subsystem to allow user to select 
>>> the
>>> +     * reference clock id to be used for timestamps.
>>> +     */
>>> +    switch (clk_id) {
>>> +    case CLOCK_MONOTONIC:
>>> +        return &ktime_get_ns;
>>> +    case CLOCK_MONOTONIC_RAW:
>>> +        return &ktime_get_raw_ns;
>>> +    case CLOCK_REALTIME:
>>> +        return &ktime_get_real_ns;
>>> +    case CLOCK_BOOTTIME:
>>> +        return &ktime_get_boottime_ns;
>>> +    case CLOCK_TAI:
>>> +        return &ktime_get_clocktai_ns;
>>> +    default:
>>> +        return NULL;
>>> +    }
>>> +}
>>> +
>>> +static inline int
>>> +__read_timestamps(struct intel_uncore *uncore,
>>> +          i915_reg_t lower_reg,
>>> +          i915_reg_t upper_reg,
>>> +          u64 *cs_ts,
>>> +          u64 *cpu_ts,
>>> +          __ktime_func_t cpu_clock)
>>> +{
>>> +    u32 upper, lower, old_upper, loop = 0;
>>> +
>>> +    upper = intel_uncore_read_fw(uncore, upper_reg);
>>> +    do {
>>> +        *cpu_ts = cpu_clock();
>>> +        lower = intel_uncore_read_fw(uncore, lower_reg);
>>> +        old_upper = upper;
>>> +        upper = intel_uncore_read_fw(uncore, upper_reg);
>>> +    } while (upper != old_upper && loop++ < 2);
>>
>>
>> With the 2 cpu timestamps things I mentioned below, this would be
>>
>>
>> do {
>>
>>     *cpu_ts0 = cpu_clock();
>>
>>     lower = intel_uncore_read_fw(uncore, lower_reg);
>>
>>     *cpu_ts1 = cpu_clock();
>>
>>     upper = intel_uncore_read_fw(uncore, upper_reg);
>>
>> } while (upper != old_upper && loop++ < 2);
>>
>>
>>> +
>>> +    *cs_ts = (u64)upper << 32 | lower;
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +static int
>>> +__query_cs_cycles(struct intel_engine_cs *engine,
>>> +          u64 *cs_ts, u64 *cpu_ts,
>>> +          __ktime_func_t cpu_clock)
>>> +{
>>> +    struct intel_uncore *uncore = engine->uncore;
>>> +    enum forcewake_domains fw_domains;
>>> +    u32 base = engine->mmio_base;
>>> +    intel_wakeref_t wakeref;
>>> +    int ret;
>>> +
>>> +    fw_domains = intel_uncore_forcewake_for_reg(uncore,
>>> +                            RING_TIMESTAMP(base),
>>> +                            FW_REG_READ);
>>> +
>>> +    with_intel_runtime_pm(uncore->rpm, wakeref) {
>>> +        spin_lock_irq(&uncore->lock);
>>> +        intel_uncore_forcewake_get__locked(uncore, fw_domains);
>>> +
>>> +        ret = __read_timestamps(uncore,
>>> +                    RING_TIMESTAMP(base),
>>> +                    RING_TIMESTAMP_UDW(base),
>>> +                    cs_ts,
>>> +                    cpu_ts,
>>> +                    cpu_clock);
>>> +
>>> +        intel_uncore_forcewake_put__locked(uncore, fw_domains);
>>> +        spin_unlock_irq(&uncore->lock);
>>> +    }
>>> +
>>> +    return ret;
>>> +}
>>> +
>>> +static int
>>> +query_cs_cycles(struct drm_i915_private *i915,
>>> +        struct drm_i915_query_item *query_item)
>>> +{
>>> +    struct drm_i915_query_cs_cycles __user *query_ptr;
>>> +    struct drm_i915_query_cs_cycles query;
>>> +    struct intel_engine_cs *engine;
>>> +    __ktime_func_t cpu_clock;
>>> +    int ret;
>>> +
>>> +    if (INTEL_GEN(i915) < 6)
>>> +        return -ENODEV;
>>> +
>>> +    query_ptr = u64_to_user_ptr(query_item->data_ptr);
>>> +    ret = copy_query_item(&query, sizeof(query), sizeof(query), 
>>> query_item);
>>> +    if (ret != 0)
>>> +        return ret;
>>> +
>>> +    if (query.flags)
>>> +        return -EINVAL;
>>> +
>>> +    if (query.rsvd)
>>> +        return -EINVAL;
>>> +
>>> +    cpu_clock = __clock_id_to_func(query.clockid);
>>> +    if (!cpu_clock)
>>> +        return -EINVAL;
>>> +
>>> +    engine = intel_engine_lookup_user(i915,
>>> +                      query.engine.engine_class,
>>> +                      query.engine.engine_instance);
>>> +    if (!engine)
>>> +        return -EINVAL;
>>> +
>>> +    if (IS_GEN(i915, 6) &&
>>> +        query.engine.engine_class != I915_ENGINE_CLASS_RENDER)
>>> +        return -ENODEV;
>>> +
>>> +    query.cs_frequency = engine->gt->clock_frequency;
>>> +    ret = __query_cs_cycles(engine,
>>> +                &query.cs_cycles,
>>> +                &query.cpu_timestamp,
>>> +                cpu_clock);
>>> +    if (ret)
>>> +        return ret;
>>> +
>>> +    if (put_user(query.cs_frequency, &query_ptr->cs_frequency))
>>> +        return -EFAULT;
>>> +
>>> +    if (put_user(query.cpu_timestamp, &query_ptr->cpu_timestamp))
>>> +        return -EFAULT;
>>> +
>>> +    if (put_user(query.cs_cycles, &query_ptr->cs_cycles))
>>> +        return -EFAULT;
>>> +
>>> +    return sizeof(query);
>>> +}
>>> +
>>>  static int
>>>  query_engine_info(struct drm_i915_private *i915,
>>>            struct drm_i915_query_item *query_item)
>>> @@ -424,6 +563,7 @@ static int (* const i915_query_funcs[])(struct 
>>> drm_i915_private *dev_priv,
>>>      query_topology_info,
>>>      query_engine_info,
>>>      query_perf_config,
>>> +    query_cs_cycles,
>>>  };
>>>  int i915_query_ioctl(struct drm_device *dev, void *data, struct 
>>> drm_file *file)
>>> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
>>> index 1987e2ea79a3..379ae6e7aeb0 100644
>>> --- a/include/uapi/drm/i915_drm.h
>>> +++ b/include/uapi/drm/i915_drm.h
>>> @@ -2176,6 +2176,10 @@ struct drm_i915_query_item {
>>>  #define DRM_I915_QUERY_TOPOLOGY_INFO    1
>>>  #define DRM_I915_QUERY_ENGINE_INFO    2
>>>  #define DRM_I915_QUERY_PERF_CONFIG      3
>>> +    /**
>>> +     * Query Command Streamer timestamp register.
>>> +     */
>>> +#define DRM_I915_QUERY_CS_CYCLES    4
>>>  /* Must be kept compact -- no holes and well documented */
>>>      /*
>>> @@ -2309,6 +2313,45 @@ struct drm_i915_engine_info {
>>>      __u64 rsvd1[4];
>>>  };
>>> +/**
>>> + * struct drm_i915_query_cs_cycles
>>> + *
>>> + * The query returns the command streamer cycles and the frequency 
>>> that can be
>>> + * used to calculate the command streamer timestamp. In addition 
>>> the query
>>> + * returns the cpu timestamp that indicates when the command 
>>> streamer cycle
>>> + * count was captured.
>>> + */
>>> +struct drm_i915_query_cs_cycles {
>>> +    /** Engine for which command streamer cycles is queried. */
>>> +    struct i915_engine_class_instance engine;
>>> +
>>> +    /** Must be zero. */
>>> +    __u32 flags;
>>> +
>>> +    /**
>>> +     * Command streamer cycles as read from the command streamer
>>> +     * register at 0x358 offset.
>>> +     */
>>> +    __u64 cs_cycles;
>>> +
>>> +    /** Frequency of the cs cycles in Hz. */
>>> +    __u64 cs_frequency;
>>> +
>>> +    /** CPU timestamp in nanoseconds. */
>>> +    __u64 cpu_timestamp;
>>
>>
>> Would it be possible to have : u64 cpu_timestamps[2];
>>
>> with cpu_timestamps[0] taken before & cpu_timestamps[1] taken after 
>> the cs_cycles, so we can have an idea of how long the read takes.
>
> Possible, but I thought multiple queries would indirectly provide such 
> information. If query1 returns cpu1 and cs1 time and query2 returns 
> cpu2 and cs2 times. Assuming neither overflowed,
>
> |((cpu2 - cpu1) - (cs1 - cs2))|
>
> should be the worst case time taken to read the register (essentially 
> delta_delta in the IGT test). Thoughts?


Going through 2 syscalls introduces a delay.

I did some measurements and it appears to be in the orders of 20~30us.


While doing the 2 cpu timestamp capture with a single mmio read in 
between should be below 2us.

We're hoping to go as precise as possible with this :)


-Lionel


>
> Thanks,
> Umesh
>
>>
>>
>>> +
>>> +    /**
>>> +     * Reference clock id for CPU timestamp. For definition, see
>>> +     * clock_gettime(2) and perf_event_open(2). Supported clock ids 
>>> are
>>> +     * CLOCK_MONOTONIC, CLOCK_MONOTONIC_RAW, CLOCK_REALTIME, 
>>> CLOCK_BOOTTIME,
>>> +     * CLOCK_TAI.
>>> +     */
>>> +    __s32 clockid;
>>> +
>>> +    /** Must be zero. */
>>> +    __u32 rsvd;
>>> +};
>>> +
>>>  /**
>>>   * struct drm_i915_query_engine_info
>>>   *
>>
>>

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Intel-gfx] [PATCH] i915/query: Correlate engine and cpu timestamps with better accuracy
  2021-03-03  9:21     ` Lionel Landwerlin
@ 2021-03-03 16:27       ` Umesh Nerlige Ramappa
  2021-03-03 16:28         ` Lionel Landwerlin
  0 siblings, 1 reply; 19+ messages in thread
From: Umesh Nerlige Ramappa @ 2021-03-03 16:27 UTC (permalink / raw)
  To: Lionel Landwerlin; +Cc: intel-gfx, Chris Wilson

On Wed, Mar 03, 2021 at 11:21:39AM +0200, Lionel Landwerlin wrote:
>On 03/03/2021 02:12, Umesh Nerlige Ramappa wrote:
>>On Tue, Mar 02, 2021 at 10:35:19PM +0200, Lionel Landwerlin wrote:
>>>Thanks a bunch for sharing this!
>>>
>>>On 02/03/2021 20:29, Umesh Nerlige Ramappa wrote:
>>>>Perf measurements rely on CPU and engine timestamps to correlate
>>>>events of interest across these time domains. Current mechanisms get
>>>>these timestamps separately and the calculated delta between these
>>>>timestamps lack enough accuracy.
>>>>
>>>>To improve the accuracy of these time measurements to within a few us,
>>>>add a query that returns the engine and cpu timestamps captured as
>>>>close to each other as possible.
>>>>
>>>>v2: (Tvrtko)
>>>>- document clock reference used
>>>>- return cpu timestamp always
>>>>- capture cpu time just before lower dword of cs timestamp
>>>>
>>>>v3: (Chris)
>>>>- use uncore-rpm
>>>>- use __query_cs_timestamp helper
>>>>
>>>>v4: (Lionel)
>>>>- Kernel perf subsytem allows users to specify the clock id to be used
>>>>  in perf_event_open. This clock id is used by the perf subsystem to
>>>>  return the appropriate cpu timestamp in perf events. Similarly, let
>>>>  the user pass the clockid to this query so that cpu timestamp
>>>>  corresponds to the clock id requested.
>>>>
>>>>v5: (Tvrtko)
>>>>- Use normal ktime accessors instead of fast versions
>>>>- Add more uApi documentation
>>>>
>>>>v6: (Lionel)
>>>>- Move switch out of spinlock
>>>>
>>>>v7: (Chris)
>>>>- cs_timestamp is a misnomer, use cs_cycles instead
>>>>- return the cs cycle frequency as well in the query
>>>>
>>>>v8:
>>>>- Add platform and engine specific checks
>>>>
>>>>Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
>>>>---
>>>> drivers/gpu/drm/i915/i915_query.c | 140 ++++++++++++++++++++++++++++++
>>>> include/uapi/drm/i915_drm.h       |  43 +++++++++
>>>> 2 files changed, 183 insertions(+)
>>>>
>>>>diff --git a/drivers/gpu/drm/i915/i915_query.c 
>>>>b/drivers/gpu/drm/i915/i915_query.c
>>>>index fed337ad7b68..763f0f918065 100644
>>>>--- a/drivers/gpu/drm/i915/i915_query.c
>>>>+++ b/drivers/gpu/drm/i915/i915_query.c
>>>>@@ -6,6 +6,8 @@
>>>> #include <linux/nospec.h>
>>>>+#include "gt/intel_engine_pm.h"
>>>>+#include "gt/intel_engine_user.h"
>>>> #include "i915_drv.h"
>>>> #include "i915_perf.h"
>>>> #include "i915_query.h"
>>>>@@ -90,6 +92,143 @@ static int query_topology_info(struct 
>>>>drm_i915_private *dev_priv,
>>>>     return total_length;
>>>> }
>>>>+typedef u64 (*__ktime_func_t)(void);
>>>>+static __ktime_func_t __clock_id_to_func(clockid_t clk_id)
>>>>+{
>>>>+    /*
>>>>+     * Use logic same as the perf subsystem to allow user to 
>>>>select the
>>>>+     * reference clock id to be used for timestamps.
>>>>+     */
>>>>+    switch (clk_id) {
>>>>+    case CLOCK_MONOTONIC:
>>>>+        return &ktime_get_ns;
>>>>+    case CLOCK_MONOTONIC_RAW:
>>>>+        return &ktime_get_raw_ns;
>>>>+    case CLOCK_REALTIME:
>>>>+        return &ktime_get_real_ns;
>>>>+    case CLOCK_BOOTTIME:
>>>>+        return &ktime_get_boottime_ns;
>>>>+    case CLOCK_TAI:
>>>>+        return &ktime_get_clocktai_ns;
>>>>+    default:
>>>>+        return NULL;
>>>>+    }
>>>>+}
>>>>+
>>>>+static inline int
>>>>+__read_timestamps(struct intel_uncore *uncore,
>>>>+          i915_reg_t lower_reg,
>>>>+          i915_reg_t upper_reg,
>>>>+          u64 *cs_ts,
>>>>+          u64 *cpu_ts,
>>>>+          __ktime_func_t cpu_clock)
>>>>+{
>>>>+    u32 upper, lower, old_upper, loop = 0;
>>>>+
>>>>+    upper = intel_uncore_read_fw(uncore, upper_reg);
>>>>+    do {
>>>>+        *cpu_ts = cpu_clock();
>>>>+        lower = intel_uncore_read_fw(uncore, lower_reg);
>>>>+        old_upper = upper;
>>>>+        upper = intel_uncore_read_fw(uncore, upper_reg);
>>>>+    } while (upper != old_upper && loop++ < 2);
>>>
>>>
>>>With the 2 cpu timestamps things I mentioned below, this would be
>>>
>>>
>>>do {
>>>
>>>    *cpu_ts0 = cpu_clock();
>>>
>>>    lower = intel_uncore_read_fw(uncore, lower_reg);
>>>
>>>    *cpu_ts1 = cpu_clock();
>>>
>>>    upper = intel_uncore_read_fw(uncore, upper_reg);
>>>
>>>} while (upper != old_upper && loop++ < 2);
>>>
>>>
>>>>+
>>>>+    *cs_ts = (u64)upper << 32 | lower;
>>>>+
>>>>+    return 0;
>>>>+}
>>>>+
>>>>+static int
>>>>+__query_cs_cycles(struct intel_engine_cs *engine,
>>>>+          u64 *cs_ts, u64 *cpu_ts,
>>>>+          __ktime_func_t cpu_clock)
>>>>+{
>>>>+    struct intel_uncore *uncore = engine->uncore;
>>>>+    enum forcewake_domains fw_domains;
>>>>+    u32 base = engine->mmio_base;
>>>>+    intel_wakeref_t wakeref;
>>>>+    int ret;
>>>>+
>>>>+    fw_domains = intel_uncore_forcewake_for_reg(uncore,
>>>>+                            RING_TIMESTAMP(base),
>>>>+                            FW_REG_READ);
>>>>+
>>>>+    with_intel_runtime_pm(uncore->rpm, wakeref) {
>>>>+        spin_lock_irq(&uncore->lock);
>>>>+        intel_uncore_forcewake_get__locked(uncore, fw_domains);
>>>>+
>>>>+        ret = __read_timestamps(uncore,
>>>>+                    RING_TIMESTAMP(base),
>>>>+                    RING_TIMESTAMP_UDW(base),
>>>>+                    cs_ts,
>>>>+                    cpu_ts,
>>>>+                    cpu_clock);
>>>>+
>>>>+        intel_uncore_forcewake_put__locked(uncore, fw_domains);
>>>>+        spin_unlock_irq(&uncore->lock);
>>>>+    }
>>>>+
>>>>+    return ret;
>>>>+}
>>>>+
>>>>+static int
>>>>+query_cs_cycles(struct drm_i915_private *i915,
>>>>+        struct drm_i915_query_item *query_item)
>>>>+{
>>>>+    struct drm_i915_query_cs_cycles __user *query_ptr;
>>>>+    struct drm_i915_query_cs_cycles query;
>>>>+    struct intel_engine_cs *engine;
>>>>+    __ktime_func_t cpu_clock;
>>>>+    int ret;
>>>>+
>>>>+    if (INTEL_GEN(i915) < 6)
>>>>+        return -ENODEV;
>>>>+
>>>>+    query_ptr = u64_to_user_ptr(query_item->data_ptr);
>>>>+    ret = copy_query_item(&query, sizeof(query), sizeof(query), 
>>>>query_item);
>>>>+    if (ret != 0)
>>>>+        return ret;
>>>>+
>>>>+    if (query.flags)
>>>>+        return -EINVAL;
>>>>+
>>>>+    if (query.rsvd)
>>>>+        return -EINVAL;
>>>>+
>>>>+    cpu_clock = __clock_id_to_func(query.clockid);
>>>>+    if (!cpu_clock)
>>>>+        return -EINVAL;
>>>>+
>>>>+    engine = intel_engine_lookup_user(i915,
>>>>+                      query.engine.engine_class,
>>>>+                      query.engine.engine_instance);
>>>>+    if (!engine)
>>>>+        return -EINVAL;
>>>>+
>>>>+    if (IS_GEN(i915, 6) &&
>>>>+        query.engine.engine_class != I915_ENGINE_CLASS_RENDER)
>>>>+        return -ENODEV;
>>>>+
>>>>+    query.cs_frequency = engine->gt->clock_frequency;
>>>>+    ret = __query_cs_cycles(engine,
>>>>+                &query.cs_cycles,
>>>>+                &query.cpu_timestamp,
>>>>+                cpu_clock);
>>>>+    if (ret)
>>>>+        return ret;
>>>>+
>>>>+    if (put_user(query.cs_frequency, &query_ptr->cs_frequency))
>>>>+        return -EFAULT;
>>>>+
>>>>+    if (put_user(query.cpu_timestamp, &query_ptr->cpu_timestamp))
>>>>+        return -EFAULT;
>>>>+
>>>>+    if (put_user(query.cs_cycles, &query_ptr->cs_cycles))
>>>>+        return -EFAULT;
>>>>+
>>>>+    return sizeof(query);
>>>>+}
>>>>+
>>>> static int
>>>> query_engine_info(struct drm_i915_private *i915,
>>>>           struct drm_i915_query_item *query_item)
>>>>@@ -424,6 +563,7 @@ static int (* const 
>>>>i915_query_funcs[])(struct drm_i915_private *dev_priv,
>>>>     query_topology_info,
>>>>     query_engine_info,
>>>>     query_perf_config,
>>>>+    query_cs_cycles,
>>>> };
>>>> int i915_query_ioctl(struct drm_device *dev, void *data, struct 
>>>>drm_file *file)
>>>>diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
>>>>index 1987e2ea79a3..379ae6e7aeb0 100644
>>>>--- a/include/uapi/drm/i915_drm.h
>>>>+++ b/include/uapi/drm/i915_drm.h
>>>>@@ -2176,6 +2176,10 @@ struct drm_i915_query_item {
>>>> #define DRM_I915_QUERY_TOPOLOGY_INFO    1
>>>> #define DRM_I915_QUERY_ENGINE_INFO    2
>>>> #define DRM_I915_QUERY_PERF_CONFIG      3
>>>>+    /**
>>>>+     * Query Command Streamer timestamp register.
>>>>+     */
>>>>+#define DRM_I915_QUERY_CS_CYCLES    4
>>>> /* Must be kept compact -- no holes and well documented */
>>>>     /*
>>>>@@ -2309,6 +2313,45 @@ struct drm_i915_engine_info {
>>>>     __u64 rsvd1[4];
>>>> };
>>>>+/**
>>>>+ * struct drm_i915_query_cs_cycles
>>>>+ *
>>>>+ * The query returns the command streamer cycles and the 
>>>>frequency that can be
>>>>+ * used to calculate the command streamer timestamp. In 
>>>>addition the query
>>>>+ * returns the cpu timestamp that indicates when the command 
>>>>streamer cycle
>>>>+ * count was captured.
>>>>+ */
>>>>+struct drm_i915_query_cs_cycles {
>>>>+    /** Engine for which command streamer cycles is queried. */
>>>>+    struct i915_engine_class_instance engine;
>>>>+
>>>>+    /** Must be zero. */
>>>>+    __u32 flags;
>>>>+
>>>>+    /**
>>>>+     * Command streamer cycles as read from the command streamer
>>>>+     * register at 0x358 offset.
>>>>+     */
>>>>+    __u64 cs_cycles;
>>>>+
>>>>+    /** Frequency of the cs cycles in Hz. */
>>>>+    __u64 cs_frequency;
>>>>+
>>>>+    /** CPU timestamp in nanoseconds. */
>>>>+    __u64 cpu_timestamp;
>>>
>>>
>>>Would it be possible to have : u64 cpu_timestamps[2];
>>>
>>>with cpu_timestamps[0] taken before & cpu_timestamps[1] taken 
>>>after the cs_cycles, so we can have an idea of how long the read 
>>>takes.
>>
>>Possible, but I thought multiple queries would indirectly provide 
>>such information. If query1 returns cpu1 and cs1 time and query2 
>>returns cpu2 and cs2 times. Assuming neither overflowed,
>>
>>|((cpu2 - cpu1) - (cs1 - cs2))|
>>
>>should be the worst case time taken to read the register 
>>(essentially delta_delta in the IGT test). Thoughts?
>
>
>Going through 2 syscalls introduces a delay.
>
>I did some measurements and it appears to be in the orders of 20~30us.
>

Have you tried multiple query items in the same call to the query ioctl?  
Does that make any difference?

>
>While doing the 2 cpu timestamp capture with a single mmio read in 
>between should be below 2us.
>
>We're hoping to go as precise as possible with this :)

I see. I will post an update.

Can you also share how you intend to use the query result with 2 cpu 
timestamps? I want to add that to the IGT.

Thanks,
Umesh

>
>
>-Lionel
>
>
>>
>>Thanks,
>>Umesh
>>
>>>
>>>
>>>>+
>>>>+    /**
>>>>+     * Reference clock id for CPU timestamp. For definition, see
>>>>+     * clock_gettime(2) and perf_event_open(2). Supported clock 
>>>>ids are
>>>>+     * CLOCK_MONOTONIC, CLOCK_MONOTONIC_RAW, CLOCK_REALTIME, 
>>>>CLOCK_BOOTTIME,
>>>>+     * CLOCK_TAI.
>>>>+     */
>>>>+    __s32 clockid;
>>>>+
>>>>+    /** Must be zero. */
>>>>+    __u32 rsvd;
>>>>+};
>>>>+
>>>> /**
>>>>  * struct drm_i915_query_engine_info
>>>>  *
>>>
>>>
>
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Intel-gfx] [PATCH] i915/query: Correlate engine and cpu timestamps with better accuracy
  2021-03-03 16:27       ` Umesh Nerlige Ramappa
@ 2021-03-03 16:28         ` Lionel Landwerlin
  0 siblings, 0 replies; 19+ messages in thread
From: Lionel Landwerlin @ 2021-03-03 16:28 UTC (permalink / raw)
  To: Umesh Nerlige Ramappa; +Cc: intel-gfx, Chris Wilson

On 03/03/2021 18:27, Umesh Nerlige Ramappa wrote:
> On Wed, Mar 03, 2021 at 11:21:39AM +0200, Lionel Landwerlin wrote:
>> On 03/03/2021 02:12, Umesh Nerlige Ramappa wrote:
>>> On Tue, Mar 02, 2021 at 10:35:19PM +0200, Lionel Landwerlin wrote:
>>>> Thanks a bunch for sharing this!
>>>>
>>>> On 02/03/2021 20:29, Umesh Nerlige Ramappa wrote:
>>>>> Perf measurements rely on CPU and engine timestamps to correlate
>>>>> events of interest across these time domains. Current mechanisms get
>>>>> these timestamps separately and the calculated delta between these
>>>>> timestamps lack enough accuracy.
>>>>>
>>>>> To improve the accuracy of these time measurements to within a few 
>>>>> us,
>>>>> add a query that returns the engine and cpu timestamps captured as
>>>>> close to each other as possible.
>>>>>
>>>>> v2: (Tvrtko)
>>>>> - document clock reference used
>>>>> - return cpu timestamp always
>>>>> - capture cpu time just before lower dword of cs timestamp
>>>>>
>>>>> v3: (Chris)
>>>>> - use uncore-rpm
>>>>> - use __query_cs_timestamp helper
>>>>>
>>>>> v4: (Lionel)
>>>>> - Kernel perf subsytem allows users to specify the clock id to be 
>>>>> used
>>>>>   in perf_event_open. This clock id is used by the perf subsystem to
>>>>>   return the appropriate cpu timestamp in perf events. Similarly, let
>>>>>   the user pass the clockid to this query so that cpu timestamp
>>>>>   corresponds to the clock id requested.
>>>>>
>>>>> v5: (Tvrtko)
>>>>> - Use normal ktime accessors instead of fast versions
>>>>> - Add more uApi documentation
>>>>>
>>>>> v6: (Lionel)
>>>>> - Move switch out of spinlock
>>>>>
>>>>> v7: (Chris)
>>>>> - cs_timestamp is a misnomer, use cs_cycles instead
>>>>> - return the cs cycle frequency as well in the query
>>>>>
>>>>> v8:
>>>>> - Add platform and engine specific checks
>>>>>
>>>>> Signed-off-by: Umesh Nerlige Ramappa 
>>>>> <umesh.nerlige.ramappa@intel.com>
>>>>> ---
>>>>>  drivers/gpu/drm/i915/i915_query.c | 140 
>>>>> ++++++++++++++++++++++++++++++
>>>>>  include/uapi/drm/i915_drm.h       |  43 +++++++++
>>>>>  2 files changed, 183 insertions(+)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/i915/i915_query.c 
>>>>> b/drivers/gpu/drm/i915/i915_query.c
>>>>> index fed337ad7b68..763f0f918065 100644
>>>>> --- a/drivers/gpu/drm/i915/i915_query.c
>>>>> +++ b/drivers/gpu/drm/i915/i915_query.c
>>>>> @@ -6,6 +6,8 @@
>>>>>  #include <linux/nospec.h>
>>>>> +#include "gt/intel_engine_pm.h"
>>>>> +#include "gt/intel_engine_user.h"
>>>>>  #include "i915_drv.h"
>>>>>  #include "i915_perf.h"
>>>>>  #include "i915_query.h"
>>>>> @@ -90,6 +92,143 @@ static int query_topology_info(struct 
>>>>> drm_i915_private *dev_priv,
>>>>>      return total_length;
>>>>>  }
>>>>> +typedef u64 (*__ktime_func_t)(void);
>>>>> +static __ktime_func_t __clock_id_to_func(clockid_t clk_id)
>>>>> +{
>>>>> +    /*
>>>>> +     * Use logic same as the perf subsystem to allow user to 
>>>>> select the
>>>>> +     * reference clock id to be used for timestamps.
>>>>> +     */
>>>>> +    switch (clk_id) {
>>>>> +    case CLOCK_MONOTONIC:
>>>>> +        return &ktime_get_ns;
>>>>> +    case CLOCK_MONOTONIC_RAW:
>>>>> +        return &ktime_get_raw_ns;
>>>>> +    case CLOCK_REALTIME:
>>>>> +        return &ktime_get_real_ns;
>>>>> +    case CLOCK_BOOTTIME:
>>>>> +        return &ktime_get_boottime_ns;
>>>>> +    case CLOCK_TAI:
>>>>> +        return &ktime_get_clocktai_ns;
>>>>> +    default:
>>>>> +        return NULL;
>>>>> +    }
>>>>> +}
>>>>> +
>>>>> +static inline int
>>>>> +__read_timestamps(struct intel_uncore *uncore,
>>>>> +          i915_reg_t lower_reg,
>>>>> +          i915_reg_t upper_reg,
>>>>> +          u64 *cs_ts,
>>>>> +          u64 *cpu_ts,
>>>>> +          __ktime_func_t cpu_clock)
>>>>> +{
>>>>> +    u32 upper, lower, old_upper, loop = 0;
>>>>> +
>>>>> +    upper = intel_uncore_read_fw(uncore, upper_reg);
>>>>> +    do {
>>>>> +        *cpu_ts = cpu_clock();
>>>>> +        lower = intel_uncore_read_fw(uncore, lower_reg);
>>>>> +        old_upper = upper;
>>>>> +        upper = intel_uncore_read_fw(uncore, upper_reg);
>>>>> +    } while (upper != old_upper && loop++ < 2);
>>>>
>>>>
>>>> With the 2 cpu timestamps things I mentioned below, this would be
>>>>
>>>>
>>>> do {
>>>>
>>>>     *cpu_ts0 = cpu_clock();
>>>>
>>>>     lower = intel_uncore_read_fw(uncore, lower_reg);
>>>>
>>>>     *cpu_ts1 = cpu_clock();
>>>>
>>>>     upper = intel_uncore_read_fw(uncore, upper_reg);
>>>>
>>>> } while (upper != old_upper && loop++ < 2);
>>>>
>>>>
>>>>> +
>>>>> +    *cs_ts = (u64)upper << 32 | lower;
>>>>> +
>>>>> +    return 0;
>>>>> +}
>>>>> +
>>>>> +static int
>>>>> +__query_cs_cycles(struct intel_engine_cs *engine,
>>>>> +          u64 *cs_ts, u64 *cpu_ts,
>>>>> +          __ktime_func_t cpu_clock)
>>>>> +{
>>>>> +    struct intel_uncore *uncore = engine->uncore;
>>>>> +    enum forcewake_domains fw_domains;
>>>>> +    u32 base = engine->mmio_base;
>>>>> +    intel_wakeref_t wakeref;
>>>>> +    int ret;
>>>>> +
>>>>> +    fw_domains = intel_uncore_forcewake_for_reg(uncore,
>>>>> +                            RING_TIMESTAMP(base),
>>>>> +                            FW_REG_READ);
>>>>> +
>>>>> +    with_intel_runtime_pm(uncore->rpm, wakeref) {
>>>>> +        spin_lock_irq(&uncore->lock);
>>>>> +        intel_uncore_forcewake_get__locked(uncore, fw_domains);
>>>>> +
>>>>> +        ret = __read_timestamps(uncore,
>>>>> +                    RING_TIMESTAMP(base),
>>>>> +                    RING_TIMESTAMP_UDW(base),
>>>>> +                    cs_ts,
>>>>> +                    cpu_ts,
>>>>> +                    cpu_clock);
>>>>> +
>>>>> +        intel_uncore_forcewake_put__locked(uncore, fw_domains);
>>>>> +        spin_unlock_irq(&uncore->lock);
>>>>> +    }
>>>>> +
>>>>> +    return ret;
>>>>> +}
>>>>> +
>>>>> +static int
>>>>> +query_cs_cycles(struct drm_i915_private *i915,
>>>>> +        struct drm_i915_query_item *query_item)
>>>>> +{
>>>>> +    struct drm_i915_query_cs_cycles __user *query_ptr;
>>>>> +    struct drm_i915_query_cs_cycles query;
>>>>> +    struct intel_engine_cs *engine;
>>>>> +    __ktime_func_t cpu_clock;
>>>>> +    int ret;
>>>>> +
>>>>> +    if (INTEL_GEN(i915) < 6)
>>>>> +        return -ENODEV;
>>>>> +
>>>>> +    query_ptr = u64_to_user_ptr(query_item->data_ptr);
>>>>> +    ret = copy_query_item(&query, sizeof(query), sizeof(query), 
>>>>> query_item);
>>>>> +    if (ret != 0)
>>>>> +        return ret;
>>>>> +
>>>>> +    if (query.flags)
>>>>> +        return -EINVAL;
>>>>> +
>>>>> +    if (query.rsvd)
>>>>> +        return -EINVAL;
>>>>> +
>>>>> +    cpu_clock = __clock_id_to_func(query.clockid);
>>>>> +    if (!cpu_clock)
>>>>> +        return -EINVAL;
>>>>> +
>>>>> +    engine = intel_engine_lookup_user(i915,
>>>>> +                      query.engine.engine_class,
>>>>> +                      query.engine.engine_instance);
>>>>> +    if (!engine)
>>>>> +        return -EINVAL;
>>>>> +
>>>>> +    if (IS_GEN(i915, 6) &&
>>>>> +        query.engine.engine_class != I915_ENGINE_CLASS_RENDER)
>>>>> +        return -ENODEV;
>>>>> +
>>>>> +    query.cs_frequency = engine->gt->clock_frequency;
>>>>> +    ret = __query_cs_cycles(engine,
>>>>> +                &query.cs_cycles,
>>>>> +                &query.cpu_timestamp,
>>>>> +                cpu_clock);
>>>>> +    if (ret)
>>>>> +        return ret;
>>>>> +
>>>>> +    if (put_user(query.cs_frequency, &query_ptr->cs_frequency))
>>>>> +        return -EFAULT;
>>>>> +
>>>>> +    if (put_user(query.cpu_timestamp, &query_ptr->cpu_timestamp))
>>>>> +        return -EFAULT;
>>>>> +
>>>>> +    if (put_user(query.cs_cycles, &query_ptr->cs_cycles))
>>>>> +        return -EFAULT;
>>>>> +
>>>>> +    return sizeof(query);
>>>>> +}
>>>>> +
>>>>>  static int
>>>>>  query_engine_info(struct drm_i915_private *i915,
>>>>>            struct drm_i915_query_item *query_item)
>>>>> @@ -424,6 +563,7 @@ static int (* const i915_query_funcs[])(struct 
>>>>> drm_i915_private *dev_priv,
>>>>>      query_topology_info,
>>>>>      query_engine_info,
>>>>>      query_perf_config,
>>>>> +    query_cs_cycles,
>>>>>  };
>>>>>  int i915_query_ioctl(struct drm_device *dev, void *data, struct 
>>>>> drm_file *file)
>>>>> diff --git a/include/uapi/drm/i915_drm.h 
>>>>> b/include/uapi/drm/i915_drm.h
>>>>> index 1987e2ea79a3..379ae6e7aeb0 100644
>>>>> --- a/include/uapi/drm/i915_drm.h
>>>>> +++ b/include/uapi/drm/i915_drm.h
>>>>> @@ -2176,6 +2176,10 @@ struct drm_i915_query_item {
>>>>>  #define DRM_I915_QUERY_TOPOLOGY_INFO    1
>>>>>  #define DRM_I915_QUERY_ENGINE_INFO    2
>>>>>  #define DRM_I915_QUERY_PERF_CONFIG      3
>>>>> +    /**
>>>>> +     * Query Command Streamer timestamp register.
>>>>> +     */
>>>>> +#define DRM_I915_QUERY_CS_CYCLES    4
>>>>>  /* Must be kept compact -- no holes and well documented */
>>>>>      /*
>>>>> @@ -2309,6 +2313,45 @@ struct drm_i915_engine_info {
>>>>>      __u64 rsvd1[4];
>>>>>  };
>>>>> +/**
>>>>> + * struct drm_i915_query_cs_cycles
>>>>> + *
>>>>> + * The query returns the command streamer cycles and the 
>>>>> frequency that can be
>>>>> + * used to calculate the command streamer timestamp. In addition 
>>>>> the query
>>>>> + * returns the cpu timestamp that indicates when the command 
>>>>> streamer cycle
>>>>> + * count was captured.
>>>>> + */
>>>>> +struct drm_i915_query_cs_cycles {
>>>>> +    /** Engine for which command streamer cycles is queried. */
>>>>> +    struct i915_engine_class_instance engine;
>>>>> +
>>>>> +    /** Must be zero. */
>>>>> +    __u32 flags;
>>>>> +
>>>>> +    /**
>>>>> +     * Command streamer cycles as read from the command streamer
>>>>> +     * register at 0x358 offset.
>>>>> +     */
>>>>> +    __u64 cs_cycles;
>>>>> +
>>>>> +    /** Frequency of the cs cycles in Hz. */
>>>>> +    __u64 cs_frequency;
>>>>> +
>>>>> +    /** CPU timestamp in nanoseconds. */
>>>>> +    __u64 cpu_timestamp;
>>>>
>>>>
>>>> Would it be possible to have : u64 cpu_timestamps[2];
>>>>
>>>> with cpu_timestamps[0] taken before & cpu_timestamps[1] taken after 
>>>> the cs_cycles, so we can have an idea of how long the read takes.
>>>
>>> Possible, but I thought multiple queries would indirectly provide 
>>> such information. If query1 returns cpu1 and cs1 time and query2 
>>> returns cpu2 and cs2 times. Assuming neither overflowed,
>>>
>>> |((cpu2 - cpu1) - (cs1 - cs2))|
>>>
>>> should be the worst case time taken to read the register 
>>> (essentially delta_delta in the IGT test). Thoughts?
>>
>>
>> Going through 2 syscalls introduces a delay.
>>
>> I did some measurements and it appears to be in the orders of 20~30us.
>>
>
> Have you tried multiple query items in the same call to the query 
> ioctl?  Does that make any difference?


Yeah, it was the average :/


-Lionel


>
>>
>> While doing the 2 cpu timestamp capture with a single mmio read in 
>> between should be below 2us.
>>
>> We're hoping to go as precise as possible with this :)
>
> I see. I will post an update.
>
> Can you also share how you intend to use the query result with 2 cpu 
> timestamps? I want to add that to the IGT.
>
> Thanks,
> Umesh
>
>>
>>
>> -Lionel
>>
>>
>>>
>>> Thanks,
>>> Umesh
>>>
>>>>
>>>>
>>>>> +
>>>>> +    /**
>>>>> +     * Reference clock id for CPU timestamp. For definition, see
>>>>> +     * clock_gettime(2) and perf_event_open(2). Supported clock 
>>>>> ids are
>>>>> +     * CLOCK_MONOTONIC, CLOCK_MONOTONIC_RAW, CLOCK_REALTIME, 
>>>>> CLOCK_BOOTTIME,
>>>>> +     * CLOCK_TAI.
>>>>> +     */
>>>>> +    __s32 clockid;
>>>>> +
>>>>> +    /** Must be zero. */
>>>>> +    __u32 rsvd;
>>>>> +};
>>>>> +
>>>>>  /**
>>>>>   * struct drm_i915_query_engine_info
>>>>>   *
>>>>
>>>>
>>

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [Intel-gfx] ✓ Fi.CI.BAT: success for i915/query: Correlate engine and cpu timestamps with better accuracy
  2021-03-02 18:29 [Intel-gfx] [PATCH] i915/query: Correlate engine and cpu timestamps with better accuracy Umesh Nerlige Ramappa
  2021-03-02 20:35 ` Lionel Landwerlin
@ 2021-03-03 17:02 ` Patchwork
  2021-03-03 20:23 ` [Intel-gfx] ✓ Fi.CI.IGT: " Patchwork
  2 siblings, 0 replies; 19+ messages in thread
From: Patchwork @ 2021-03-03 17:02 UTC (permalink / raw)
  To: Umesh Nerlige Ramappa; +Cc: intel-gfx


[-- Attachment #1.1: Type: text/plain, Size: 7545 bytes --]

== Series Details ==

Series: i915/query: Correlate engine and cpu timestamps with better accuracy
URL   : https://patchwork.freedesktop.org/series/87552/
State : success

== Summary ==

CI Bug Log - changes from CI_DRM_9823 -> Patchwork_19746
====================================================

Summary
-------

  **WARNING**

  Minor unknown changes coming with Patchwork_19746 need to be verified
  manually.
  
  If you think the reported changes have nothing to do with the changes
  introduced in Patchwork_19746, please notify your bug team to allow them
  to document this new failure mode, which will reduce false positives in CI.

  External URL: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/index.html

Possible new issues
-------------------

  Here are the unknown changes that may have been introduced in Patchwork_19746:

### IGT changes ###

#### Warnings ####

  * igt@i915_pm_rpm@module-reload:
    - fi-glk-dsi:         [DMESG-WARN][1] ([i915#1982]) -> [DMESG-WARN][2]
   [1]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/fi-glk-dsi/igt@i915_pm_rpm@module-reload.html
   [2]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/fi-glk-dsi/igt@i915_pm_rpm@module-reload.html

  
Known issues
------------

  Here are the changes found in Patchwork_19746 that come from known issues:

### IGT changes ###

#### Issues hit ####

  * igt@amdgpu/amd_basic@cs-compute:
    - fi-elk-e7500:       NOTRUN -> [SKIP][3] ([fdo#109271]) +46 similar issues
   [3]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/fi-elk-e7500/igt@amdgpu/amd_basic@cs-compute.html

  * igt@amdgpu/amd_basic@cs-sdma:
    - fi-kbl-guc:         NOTRUN -> [SKIP][4] ([fdo#109271]) +58 similar issues
   [4]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/fi-kbl-guc/igt@amdgpu/amd_basic@cs-sdma.html

  * igt@amdgpu/amd_cs_nop@nop-compute0:
    - fi-cml-s:           NOTRUN -> [SKIP][5] ([fdo#109315]) +17 similar issues
   [5]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/fi-cml-s/igt@amdgpu/amd_cs_nop@nop-compute0.html

  * igt@gem_exec_fence@basic-wait@bcs0:
    - fi-cml-s:           NOTRUN -> [SKIP][6] ([i915#1208]) +1 similar issue
   [6]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/fi-cml-s/igt@gem_exec_fence@basic-wait@bcs0.html

  * igt@gem_huc_copy@huc-copy:
    - fi-cml-s:           NOTRUN -> [SKIP][7] ([i915#2190])
   [7]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/fi-cml-s/igt@gem_huc_copy@huc-copy.html
    - fi-kbl-guc:         NOTRUN -> [SKIP][8] ([fdo#109271] / [i915#2190])
   [8]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/fi-kbl-guc/igt@gem_huc_copy@huc-copy.html

  * igt@gem_tiled_fence_blits@basic:
    - fi-kbl-8809g:       [PASS][9] -> [TIMEOUT][10] ([i915#3145])
   [9]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/fi-kbl-8809g/igt@gem_tiled_fence_blits@basic.html
   [10]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/fi-kbl-8809g/igt@gem_tiled_fence_blits@basic.html

  * igt@kms_chamelium@dp-crc-fast:
    - fi-kbl-7500u:       [PASS][11] -> [FAIL][12] ([i915#1372])
   [11]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/fi-kbl-7500u/igt@kms_chamelium@dp-crc-fast.html
   [12]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/fi-kbl-7500u/igt@kms_chamelium@dp-crc-fast.html

  * igt@kms_chamelium@dp-hpd-fast:
    - fi-cml-s:           NOTRUN -> [SKIP][13] ([fdo#109284] / [fdo#111827]) +8 similar issues
   [13]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/fi-cml-s/igt@kms_chamelium@dp-hpd-fast.html

  * igt@kms_chamelium@vga-hpd-fast:
    - fi-kbl-guc:         NOTRUN -> [SKIP][14] ([fdo#109271] / [fdo#111827]) +8 similar issues
   [14]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/fi-kbl-guc/igt@kms_chamelium@vga-hpd-fast.html

  * igt@kms_force_connector_basic@force-load-detect:
    - fi-cml-s:           NOTRUN -> [SKIP][15] ([fdo#109285])
   [15]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/fi-cml-s/igt@kms_force_connector_basic@force-load-detect.html

  * igt@kms_pipe_crc_basic@compare-crc-sanitycheck-pipe-d:
    - fi-kbl-guc:         NOTRUN -> [SKIP][16] ([fdo#109271] / [i915#533])
   [16]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/fi-kbl-guc/igt@kms_pipe_crc_basic@compare-crc-sanitycheck-pipe-d.html
    - fi-cml-s:           NOTRUN -> [SKIP][17] ([fdo#109278] / [i915#533])
   [17]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/fi-cml-s/igt@kms_pipe_crc_basic@compare-crc-sanitycheck-pipe-d.html

  
#### Possible fixes ####

  * igt@gem_exec_gttfill@basic:
    - fi-kbl-8809g:       [TIMEOUT][18] ([i915#3145]) -> [PASS][19]
   [18]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/fi-kbl-8809g/igt@gem_exec_gttfill@basic.html
   [19]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/fi-kbl-8809g/igt@gem_exec_gttfill@basic.html

  * igt@gem_linear_blits@basic:
    - fi-kbl-8809g:       [TIMEOUT][20] ([i915#2502]) -> [PASS][21]
   [20]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/fi-kbl-8809g/igt@gem_linear_blits@basic.html
   [21]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/fi-kbl-8809g/igt@gem_linear_blits@basic.html

  * igt@i915_selftest@live@client:
    - fi-glk-dsi:         [DMESG-FAIL][22] ([i915#3047]) -> [PASS][23]
   [22]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/fi-glk-dsi/igt@i915_selftest@live@client.html
   [23]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/fi-glk-dsi/igt@i915_selftest@live@client.html

  
  {name}: This element is suppressed. This means it is ignored when computing
          the status of the difference (SUCCESS, WARNING, or FAILURE).

  [fdo#109271]: https://bugs.freedesktop.org/show_bug.cgi?id=109271
  [fdo#109278]: https://bugs.freedesktop.org/show_bug.cgi?id=109278
  [fdo#109284]: https://bugs.freedesktop.org/show_bug.cgi?id=109284
  [fdo#109285]: https://bugs.freedesktop.org/show_bug.cgi?id=109285
  [fdo#109315]: https://bugs.freedesktop.org/show_bug.cgi?id=109315
  [fdo#111827]: https://bugs.freedesktop.org/show_bug.cgi?id=111827
  [i915#1208]: https://gitlab.freedesktop.org/drm/intel/issues/1208
  [i915#1222]: https://gitlab.freedesktop.org/drm/intel/issues/1222
  [i915#1372]: https://gitlab.freedesktop.org/drm/intel/issues/1372
  [i915#1982]: https://gitlab.freedesktop.org/drm/intel/issues/1982
  [i915#2190]: https://gitlab.freedesktop.org/drm/intel/issues/2190
  [i915#2502]: https://gitlab.freedesktop.org/drm/intel/issues/2502
  [i915#3047]: https://gitlab.freedesktop.org/drm/intel/issues/3047
  [i915#3145]: https://gitlab.freedesktop.org/drm/intel/issues/3145
  [i915#533]: https://gitlab.freedesktop.org/drm/intel/issues/533


Participating hosts (38 -> 35)
------------------------------

  Additional (3): fi-cml-s fi-kbl-guc fi-elk-e7500 
  Missing    (6): fi-ilk-m540 fi-hsw-4200u fi-bsw-cyan fi-ctg-p8600 fi-cml-drallion fi-bdw-samus 


Build changes
-------------

  * Linux: CI_DRM_9823 -> Patchwork_19746

  CI-20190529: 20190529
  CI_DRM_9823: d559dcad6da00e498bb2c41967a9e5487545b8ba @ git://anongit.freedesktop.org/gfx-ci/linux
  IGT_6020: 8382d9e87bba39ecc6fa879b2491e28c7fb2e3c7 @ git://anongit.freedesktop.org/xorg/app/intel-gpu-tools
  Patchwork_19746: ec60068b1dccef4673089e53c75f7f1c1ca82df7 @ git://anongit.freedesktop.org/gfx-ci/linux


== Linux commits ==

ec60068b1dcc i915/query: Correlate engine and cpu timestamps with better accuracy

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/index.html

[-- Attachment #1.2: Type: text/html, Size: 9040 bytes --]

[-- Attachment #2: Type: text/plain, Size: 160 bytes --]

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [Intel-gfx] ✓ Fi.CI.IGT: success for i915/query: Correlate engine and cpu timestamps with better accuracy
  2021-03-02 18:29 [Intel-gfx] [PATCH] i915/query: Correlate engine and cpu timestamps with better accuracy Umesh Nerlige Ramappa
  2021-03-02 20:35 ` Lionel Landwerlin
  2021-03-03 17:02 ` [Intel-gfx] ✓ Fi.CI.BAT: success for " Patchwork
@ 2021-03-03 20:23 ` Patchwork
  2 siblings, 0 replies; 19+ messages in thread
From: Patchwork @ 2021-03-03 20:23 UTC (permalink / raw)
  To: Umesh Nerlige Ramappa; +Cc: intel-gfx


[-- Attachment #1.1: Type: text/plain, Size: 30291 bytes --]

== Series Details ==

Series: i915/query: Correlate engine and cpu timestamps with better accuracy
URL   : https://patchwork.freedesktop.org/series/87552/
State : success

== Summary ==

CI Bug Log - changes from CI_DRM_9823_full -> Patchwork_19746_full
====================================================

Summary
-------

  **WARNING**

  Minor unknown changes coming with Patchwork_19746_full need to be verified
  manually.
  
  If you think the reported changes have nothing to do with the changes
  introduced in Patchwork_19746_full, please notify your bug team to allow them
  to document this new failure mode, which will reduce false positives in CI.

  

Possible new issues
-------------------

  Here are the unknown changes that may have been introduced in Patchwork_19746_full:

### IGT changes ###

#### Warnings ####

  * igt@kms_content_protection@dp-mst-lic-type-1:
    - shard-iclb:         [SKIP][1] ([i915#3116]) -> [FAIL][2] +1 similar issue
   [1]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-iclb8/igt@kms_content_protection@dp-mst-lic-type-1.html
   [2]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-iclb1/igt@kms_content_protection@dp-mst-lic-type-1.html

  * igt@runner@aborted:
    - shard-skl:          ([FAIL][3], [FAIL][4], [FAIL][5], [FAIL][6], [FAIL][7], [FAIL][8], [FAIL][9], [FAIL][10], [FAIL][11]) ([i915#1436] / [i915#1814] / [i915#2029] / [i915#2426] / [i915#2724] / [i915#3002]) -> ([FAIL][12], [FAIL][13], [FAIL][14], [FAIL][15], [FAIL][16], [FAIL][17]) ([i915#1814] / [i915#2029] / [i915#2724] / [i915#3002])
   [3]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-skl7/igt@runner@aborted.html
   [4]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-skl2/igt@runner@aborted.html
   [5]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-skl5/igt@runner@aborted.html
   [6]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-skl3/igt@runner@aborted.html
   [7]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-skl9/igt@runner@aborted.html
   [8]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-skl4/igt@runner@aborted.html
   [9]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-skl6/igt@runner@aborted.html
   [10]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-skl3/igt@runner@aborted.html
   [11]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-skl3/igt@runner@aborted.html
   [12]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-skl5/igt@runner@aborted.html
   [13]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-skl6/igt@runner@aborted.html
   [14]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-skl3/igt@runner@aborted.html
   [15]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-skl3/igt@runner@aborted.html
   [16]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-skl1/igt@runner@aborted.html
   [17]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-skl3/igt@runner@aborted.html

  
Known issues
------------

  Here are the changes found in Patchwork_19746_full that come from known issues:

### IGT changes ###

#### Issues hit ####

  * igt@gem_ctx_persistence@legacy-engines-queued:
    - shard-snb:          NOTRUN -> [SKIP][18] ([fdo#109271] / [i915#1099]) +5 similar issues
   [18]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-snb6/igt@gem_ctx_persistence@legacy-engines-queued.html

  * igt@gem_ctx_persistence@replace@rcs0:
    - shard-skl:          [PASS][19] -> [FAIL][20] ([i915#2410])
   [19]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-skl1/igt@gem_ctx_persistence@replace@rcs0.html
   [20]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-skl10/igt@gem_ctx_persistence@replace@rcs0.html

  * igt@gem_exec_fair@basic-none-vip@rcs0:
    - shard-kbl:          [PASS][21] -> [FAIL][22] ([i915#2842]) +3 similar issues
   [21]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-kbl1/igt@gem_exec_fair@basic-none-vip@rcs0.html
   [22]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-kbl3/igt@gem_exec_fair@basic-none-vip@rcs0.html

  * igt@gem_exec_reloc@basic-wide-active@rcs0:
    - shard-snb:          NOTRUN -> [FAIL][23] ([i915#2389]) +2 similar issues
   [23]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-snb6/igt@gem_exec_reloc@basic-wide-active@rcs0.html

  * igt@gem_exec_reloc@basic-wide-active@vcs1:
    - shard-iclb:         NOTRUN -> [FAIL][24] ([i915#2389]) +4 similar issues
   [24]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-iclb2/igt@gem_exec_reloc@basic-wide-active@vcs1.html

  * igt@gem_exec_schedule@u-fairslice@rcs0:
    - shard-apl:          NOTRUN -> [DMESG-WARN][25] ([i915#1610])
   [25]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-apl7/igt@gem_exec_schedule@u-fairslice@rcs0.html

  * igt@gem_pwrite@basic-exhaustion:
    - shard-snb:          NOTRUN -> [WARN][26] ([i915#2658]) +1 similar issue
   [26]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-snb2/igt@gem_pwrite@basic-exhaustion.html

  * igt@gem_render_copy@x-tiled-to-vebox-yf-tiled:
    - shard-iclb:         NOTRUN -> [SKIP][27] ([i915#768])
   [27]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-iclb8/igt@gem_render_copy@x-tiled-to-vebox-yf-tiled.html

  * igt@gem_softpin@evict-snoop-interruptible:
    - shard-iclb:         NOTRUN -> [SKIP][28] ([fdo#109312])
   [28]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-iclb8/igt@gem_softpin@evict-snoop-interruptible.html

  * igt@gem_userptr_blits@input-checking:
    - shard-snb:          NOTRUN -> [DMESG-WARN][29] ([i915#3002])
   [29]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-snb2/igt@gem_userptr_blits@input-checking.html

  * igt@gem_userptr_blits@mmap-offset-invalidate-active@wb:
    - shard-kbl:          NOTRUN -> [SKIP][30] ([fdo#109271]) +70 similar issues
   [30]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-kbl1/igt@gem_userptr_blits@mmap-offset-invalidate-active@wb.html

  * igt@gen9_exec_parse@batch-without-end:
    - shard-iclb:         NOTRUN -> [SKIP][31] ([fdo#112306])
   [31]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-iclb8/igt@gen9_exec_parse@batch-without-end.html

  * igt@i915_selftest@live@client:
    - shard-glk:          [PASS][32] -> [DMESG-FAIL][33] ([i915#3047])
   [32]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-glk9/igt@i915_selftest@live@client.html
   [33]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-glk6/igt@i915_selftest@live@client.html

  * igt@i915_selftest@live@hangcheck:
    - shard-snb:          NOTRUN -> [INCOMPLETE][34] ([i915#2782])
   [34]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-snb6/igt@i915_selftest@live@hangcheck.html

  * igt@kms_async_flips@alternate-sync-async-flip:
    - shard-skl:          [PASS][35] -> [FAIL][36] ([i915#2521])
   [35]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-skl8/igt@kms_async_flips@alternate-sync-async-flip.html
   [36]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-skl1/igt@kms_async_flips@alternate-sync-async-flip.html

  * igt@kms_atomic_transition@modeset-transition-nonblocking@2x-outputs:
    - shard-glk:          [PASS][37] -> [DMESG-WARN][38] ([i915#118] / [i915#95])
   [37]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-glk9/igt@kms_atomic_transition@modeset-transition-nonblocking@2x-outputs.html
   [38]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-glk2/igt@kms_atomic_transition@modeset-transition-nonblocking@2x-outputs.html

  * igt@kms_big_fb@linear-8bpp-rotate-90:
    - shard-iclb:         NOTRUN -> [SKIP][39] ([fdo#110725] / [fdo#111614])
   [39]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-iclb8/igt@kms_big_fb@linear-8bpp-rotate-90.html

  * igt@kms_big_joiner@basic:
    - shard-iclb:         NOTRUN -> [SKIP][40] ([i915#2705])
   [40]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-iclb8/igt@kms_big_joiner@basic.html
    - shard-apl:          NOTRUN -> [SKIP][41] ([fdo#109271] / [i915#2705])
   [41]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-apl7/igt@kms_big_joiner@basic.html

  * igt@kms_ccs@pipe-c-crc-primary-rotation-180:
    - shard-skl:          NOTRUN -> [SKIP][42] ([fdo#109271] / [fdo#111304])
   [42]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-skl10/igt@kms_ccs@pipe-c-crc-primary-rotation-180.html

  * igt@kms_chamelium@hdmi-hpd-storm:
    - shard-kbl:          NOTRUN -> [SKIP][43] ([fdo#109271] / [fdo#111827]) +10 similar issues
   [43]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-kbl2/igt@kms_chamelium@hdmi-hpd-storm.html

  * igt@kms_color@pipe-d-ctm-0-5:
    - shard-iclb:         NOTRUN -> [SKIP][44] ([fdo#109278] / [i915#1149])
   [44]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-iclb8/igt@kms_color@pipe-d-ctm-0-5.html

  * igt@kms_color_chamelium@pipe-a-ctm-limited-range:
    - shard-apl:          NOTRUN -> [SKIP][45] ([fdo#109271] / [fdo#111827]) +16 similar issues
   [45]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-apl8/igt@kms_color_chamelium@pipe-a-ctm-limited-range.html

  * igt@kms_color_chamelium@pipe-a-ctm-max:
    - shard-iclb:         NOTRUN -> [SKIP][46] ([fdo#109284] / [fdo#111827]) +2 similar issues
   [46]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-iclb8/igt@kms_color_chamelium@pipe-a-ctm-max.html

  * igt@kms_color_chamelium@pipe-b-ctm-max:
    - shard-skl:          NOTRUN -> [SKIP][47] ([fdo#109271] / [fdo#111827]) +15 similar issues
   [47]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-skl8/igt@kms_color_chamelium@pipe-b-ctm-max.html

  * igt@kms_color_chamelium@pipe-invalid-ctm-matrix-sizes:
    - shard-snb:          NOTRUN -> [SKIP][48] ([fdo#109271] / [fdo#111827]) +14 similar issues
   [48]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-snb2/igt@kms_color_chamelium@pipe-invalid-ctm-matrix-sizes.html

  * igt@kms_content_protection@legacy:
    - shard-iclb:         NOTRUN -> [SKIP][49] ([fdo#109300] / [fdo#111066])
   [49]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-iclb8/igt@kms_content_protection@legacy.html

  * igt@kms_content_protection@srm:
    - shard-apl:          NOTRUN -> [TIMEOUT][50] ([i915#1319])
   [50]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-apl1/igt@kms_content_protection@srm.html

  * igt@kms_cursor_crc@pipe-a-cursor-512x170-sliding:
    - shard-iclb:         NOTRUN -> [SKIP][51] ([fdo#109278] / [fdo#109279])
   [51]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-iclb8/igt@kms_cursor_crc@pipe-a-cursor-512x170-sliding.html

  * igt@kms_cursor_crc@pipe-c-cursor-64x64-offscreen:
    - shard-skl:          [PASS][52] -> [FAIL][53] ([i915#54])
   [52]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-skl5/igt@kms_cursor_crc@pipe-c-cursor-64x64-offscreen.html
   [53]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-skl7/igt@kms_cursor_crc@pipe-c-cursor-64x64-offscreen.html

  * igt@kms_cursor_crc@pipe-c-cursor-suspend:
    - shard-kbl:          [PASS][54] -> [DMESG-WARN][55] ([i915#180]) +1 similar issue
   [54]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-kbl2/igt@kms_cursor_crc@pipe-c-cursor-suspend.html
   [55]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-kbl6/igt@kms_cursor_crc@pipe-c-cursor-suspend.html

  * igt@kms_cursor_legacy@flip-vs-cursor-atomic:
    - shard-tglb:         [PASS][56] -> [FAIL][57] ([i915#2346])
   [56]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-tglb8/igt@kms_cursor_legacy@flip-vs-cursor-atomic.html
   [57]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-tglb2/igt@kms_cursor_legacy@flip-vs-cursor-atomic.html

  * igt@kms_cursor_legacy@flip-vs-cursor-legacy:
    - shard-skl:          [PASS][58] -> [FAIL][59] ([i915#2346])
   [58]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-skl8/igt@kms_cursor_legacy@flip-vs-cursor-legacy.html
   [59]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-skl4/igt@kms_cursor_legacy@flip-vs-cursor-legacy.html

  * igt@kms_cursor_legacy@pipe-d-single-move:
    - shard-iclb:         NOTRUN -> [SKIP][60] ([fdo#109278]) +2 similar issues
   [60]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-iclb8/igt@kms_cursor_legacy@pipe-d-single-move.html

  * igt@kms_flip@2x-nonexisting-fb:
    - shard-iclb:         NOTRUN -> [SKIP][61] ([fdo#109274]) +2 similar issues
   [61]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-iclb8/igt@kms_flip@2x-nonexisting-fb.html

  * igt@kms_flip@flip-vs-expired-vblank@a-edp1:
    - shard-tglb:         [PASS][62] -> [FAIL][63] ([i915#2598])
   [62]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-tglb2/igt@kms_flip@flip-vs-expired-vblank@a-edp1.html
   [63]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-tglb7/igt@kms_flip@flip-vs-expired-vblank@a-edp1.html

  * igt@kms_flip@flip-vs-expired-vblank@a-hdmi-a1:
    - shard-glk:          [PASS][64] -> [FAIL][65] ([i915#79])
   [64]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-glk5/igt@kms_flip@flip-vs-expired-vblank@a-hdmi-a1.html
   [65]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-glk3/igt@kms_flip@flip-vs-expired-vblank@a-hdmi-a1.html

  * igt@kms_flip_scaled_crc@flip-32bpp-ytileccs-to-64bpp-ytile:
    - shard-kbl:          NOTRUN -> [SKIP][66] ([fdo#109271] / [i915#2642])
   [66]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-kbl1/igt@kms_flip_scaled_crc@flip-32bpp-ytileccs-to-64bpp-ytile.html

  * igt@kms_frontbuffer_tracking@fbc-1p-primscrn-cur-indfb-draw-blt:
    - shard-apl:          NOTRUN -> [FAIL][67] ([i915#49])
   [67]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-apl1/igt@kms_frontbuffer_tracking@fbc-1p-primscrn-cur-indfb-draw-blt.html

  * igt@kms_frontbuffer_tracking@fbc-2p-scndscrn-pri-indfb-draw-blt:
    - shard-skl:          NOTRUN -> [SKIP][68] ([fdo#109271]) +106 similar issues
   [68]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-skl4/igt@kms_frontbuffer_tracking@fbc-2p-scndscrn-pri-indfb-draw-blt.html

  * igt@kms_frontbuffer_tracking@psr-2p-primscrn-pri-indfb-draw-render:
    - shard-iclb:         NOTRUN -> [SKIP][69] ([fdo#109280]) +7 similar issues
   [69]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-iclb8/igt@kms_frontbuffer_tracking@psr-2p-primscrn-pri-indfb-draw-render.html

  * igt@kms_frontbuffer_tracking@psr-2p-scndscrn-spr-indfb-draw-mmap-cpu:
    - shard-snb:          NOTRUN -> [SKIP][70] ([fdo#109271]) +253 similar issues
   [70]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-snb2/igt@kms_frontbuffer_tracking@psr-2p-scndscrn-spr-indfb-draw-mmap-cpu.html

  * igt@kms_hdr@bpc-switch-suspend:
    - shard-apl:          NOTRUN -> [DMESG-WARN][71] ([i915#180]) +1 similar issue
   [71]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-apl7/igt@kms_hdr@bpc-switch-suspend.html
    - shard-skl:          NOTRUN -> [FAIL][72] ([i915#1188])
   [72]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-skl10/igt@kms_hdr@bpc-switch-suspend.html

  * igt@kms_pipe_crc_basic@hang-read-crc-pipe-d:
    - shard-kbl:          NOTRUN -> [SKIP][73] ([fdo#109271] / [i915#533])
   [73]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-kbl1/igt@kms_pipe_crc_basic@hang-read-crc-pipe-d.html

  * igt@kms_plane_alpha_blend@pipe-a-alpha-7efc:
    - shard-apl:          NOTRUN -> [FAIL][74] ([fdo#108145] / [i915#265]) +2 similar issues
   [74]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-apl1/igt@kms_plane_alpha_blend@pipe-a-alpha-7efc.html

  * igt@kms_plane_alpha_blend@pipe-a-alpha-basic:
    - shard-skl:          NOTRUN -> [FAIL][75] ([fdo#108145] / [i915#265])
   [75]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-skl4/igt@kms_plane_alpha_blend@pipe-a-alpha-basic.html

  * igt@kms_plane_alpha_blend@pipe-c-alpha-transparent-fb:
    - shard-apl:          NOTRUN -> [FAIL][76] ([i915#265])
   [76]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-apl7/igt@kms_plane_alpha_blend@pipe-c-alpha-transparent-fb.html

  * igt@kms_psr2_sf@overlay-plane-update-sf-dmg-area-5:
    - shard-apl:          NOTRUN -> [SKIP][77] ([fdo#109271] / [i915#658]) +4 similar issues
   [77]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-apl1/igt@kms_psr2_sf@overlay-plane-update-sf-dmg-area-5.html

  * igt@kms_psr2_sf@overlay-primary-update-sf-dmg-area-3:
    - shard-skl:          NOTRUN -> [SKIP][78] ([fdo#109271] / [i915#658])
   [78]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-skl10/igt@kms_psr2_sf@overlay-primary-update-sf-dmg-area-3.html

  * igt@kms_psr@psr2_no_drrs:
    - shard-iclb:         [PASS][79] -> [SKIP][80] ([fdo#109441]) +1 similar issue
   [79]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-iclb2/igt@kms_psr@psr2_no_drrs.html
   [80]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-iclb1/igt@kms_psr@psr2_no_drrs.html

  * igt@kms_vblank@pipe-d-ts-continuation-idle:
    - shard-apl:          NOTRUN -> [SKIP][81] ([fdo#109271]) +165 similar issues
   [81]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-apl1/igt@kms_vblank@pipe-d-ts-continuation-idle.html

  * igt@kms_writeback@writeback-check-output:
    - shard-iclb:         NOTRUN -> [SKIP][82] ([i915#2437])
   [82]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-iclb8/igt@kms_writeback@writeback-check-output.html

  * igt@kms_writeback@writeback-pixel-formats:
    - shard-skl:          NOTRUN -> [SKIP][83] ([fdo#109271] / [i915#2437]) +1 similar issue
   [83]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-skl8/igt@kms_writeback@writeback-pixel-formats.html
    - shard-apl:          NOTRUN -> [SKIP][84] ([fdo#109271] / [i915#2437])
   [84]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-apl1/igt@kms_writeback@writeback-pixel-formats.html

  * igt@perf@gen12-oa-tlb-invalidate:
    - shard-iclb:         NOTRUN -> [SKIP][85] ([fdo#109289])
   [85]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-iclb8/igt@perf@gen12-oa-tlb-invalidate.html

  * igt@perf@polling-parameterized:
    - shard-tglb:         [PASS][86] -> [FAIL][87] ([i915#1542])
   [86]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-tglb3/igt@perf@polling-parameterized.html
   [87]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-tglb3/igt@perf@polling-parameterized.html

  * igt@prime_nv_api@nv_i915_import_twice_check_flink_name:
    - shard-iclb:         NOTRUN -> [SKIP][88] ([fdo#109291])
   [88]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-iclb8/igt@prime_nv_api@nv_i915_import_twice_check_flink_name.html

  * igt@sysfs_clients@recycle:
    - shard-kbl:          [PASS][89] -> [FAIL][90] ([i915#3028])
   [89]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-kbl2/igt@sysfs_clients@recycle.html
   [90]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-kbl4/igt@sysfs_clients@recycle.html

  * igt@sysfs_clients@recycle-many:
    - shard-iclb:         [PASS][91] -> [FAIL][92] ([i915#3028])
   [91]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-iclb3/igt@sysfs_clients@recycle-many.html
   [92]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-iclb1/igt@sysfs_clients@recycle-many.html
    - shard-glk:          [PASS][93] -> [FAIL][94] ([i915#3028])
   [93]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-glk6/igt@sysfs_clients@recycle-many.html
   [94]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-glk6/igt@sysfs_clients@recycle-many.html
    - shard-tglb:         [PASS][95] -> [FAIL][96] ([i915#3028])
   [95]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-tglb2/igt@sysfs_clients@recycle-many.html
   [96]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-tglb7/igt@sysfs_clients@recycle-many.html

  
#### Possible fixes ####

  * igt@gem_ctx_persistence@many-contexts:
    - shard-iclb:         [INCOMPLETE][97] ([i915#3057]) -> [PASS][98]
   [97]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-iclb4/igt@gem_ctx_persistence@many-contexts.html
   [98]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-iclb8/igt@gem_ctx_persistence@many-contexts.html

  * igt@gem_exec_fair@basic-flow@rcs0:
    - shard-tglb:         [FAIL][99] ([i915#2842]) -> [PASS][100] +2 similar issues
   [99]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-tglb7/igt@gem_exec_fair@basic-flow@rcs0.html
   [100]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-tglb8/igt@gem_exec_fair@basic-flow@rcs0.html

  * igt@gem_exec_fair@basic-pace@rcs0:
    - shard-glk:          [FAIL][101] ([i915#2842]) -> [PASS][102]
   [101]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-glk8/igt@gem_exec_fair@basic-pace@rcs0.html
   [102]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-glk4/igt@gem_exec_fair@basic-pace@rcs0.html

  * igt@gem_exec_fair@basic-throttle@rcs0:
    - shard-iclb:         [FAIL][103] ([i915#2842]) -> [PASS][104]
   [103]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-iclb4/igt@gem_exec_fair@basic-throttle@rcs0.html
   [104]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-iclb8/igt@gem_exec_fair@basic-throttle@rcs0.html

  * igt@gem_exec_schedule@u-fairslice@rcs0:
    - shard-skl:          [DMESG-WARN][105] ([i915#2803]) -> [PASS][106]
   [105]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-skl7/igt@gem_exec_schedule@u-fairslice@rcs0.html
   [106]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-skl8/igt@gem_exec_schedule@u-fairslice@rcs0.html

  * igt@gem_exec_whisper@basic-queues-forked-all:
    - shard-glk:          [DMESG-WARN][107] ([i915#118] / [i915#95]) -> [PASS][108]
   [107]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-glk8/igt@gem_exec_whisper@basic-queues-forked-all.html
   [108]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-glk8/igt@gem_exec_whisper@basic-queues-forked-all.html

  * igt@gem_huc_copy@huc-copy:
    - shard-tglb:         [SKIP][109] ([i915#2190]) -> [PASS][110]
   [109]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-tglb6/igt@gem_huc_copy@huc-copy.html
   [110]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-tglb1/igt@gem_huc_copy@huc-copy.html

  * igt@gem_mmap_gtt@cpuset-big-copy-odd:
    - shard-iclb:         [FAIL][111] -> [PASS][112] +1 similar issue
   [111]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-iclb6/igt@gem_mmap_gtt@cpuset-big-copy-odd.html
   [112]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-iclb5/igt@gem_mmap_gtt@cpuset-big-copy-odd.html

  * igt@gen9_exec_parse@allowed-all:
    - shard-skl:          [DMESG-WARN][113] ([i915#1436] / [i915#716]) -> [PASS][114]
   [113]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-skl2/igt@gen9_exec_parse@allowed-all.html
   [114]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-skl8/igt@gen9_exec_parse@allowed-all.html

  * igt@kms_async_flips@test-time-stamp:
    - shard-tglb:         [FAIL][115] ([i915#2574]) -> [PASS][116]
   [115]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-tglb2/igt@kms_async_flips@test-time-stamp.html
   [116]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-tglb7/igt@kms_async_flips@test-time-stamp.html

  * igt@kms_color@pipe-b-ctm-0-25:
    - shard-skl:          [DMESG-WARN][117] ([i915#1982]) -> [PASS][118]
   [117]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-skl8/igt@kms_color@pipe-b-ctm-0-25.html
   [118]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-skl1/igt@kms_color@pipe-b-ctm-0-25.html

  * igt@kms_cursor_crc@pipe-a-cursor-suspend:
    - shard-kbl:          [INCOMPLETE][119] ([i915#155]) -> [PASS][120]
   [119]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-kbl3/igt@kms_cursor_crc@pipe-a-cursor-suspend.html
   [120]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-kbl2/igt@kms_cursor_crc@pipe-a-cursor-suspend.html

  * igt@kms_cursor_crc@pipe-c-cursor-128x42-offscreen:
    - shard-skl:          [FAIL][121] ([i915#54]) -> [PASS][122] +1 similar issue
   [121]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-skl4/igt@kms_cursor_crc@pipe-c-cursor-128x42-offscreen.html
   [122]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-skl9/igt@kms_cursor_crc@pipe-c-cursor-128x42-offscreen.html

  * igt@kms_cursor_legacy@2x-long-flip-vs-cursor-legacy:
    - shard-glk:          [FAIL][123] ([i915#72]) -> [PASS][124]
   [123]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-glk4/igt@kms_cursor_legacy@2x-long-flip-vs-cursor-legacy.html
   [124]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-glk8/igt@kms_cursor_legacy@2x-long-flip-vs-cursor-legacy.html

  * igt@kms_flip@flip-vs-expired-vblank@b-hdmi-a2:
    - shard-glk:          [FAIL][125] ([i915#2122]) -> [PASS][126]
   [125]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-glk5/igt@kms_flip@flip-vs-expired-vblank@b-hdmi-a2.html
   [126]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-glk3/igt@kms_flip@flip-vs-expired-vblank@b-hdmi-a2.html

  * igt@kms_flip@flip-vs-suspend-interruptible@a-dp1:
    - shard-kbl:          [DMESG-WARN][127] ([i915#180]) -> [PASS][128] +3 similar issues
   [127]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-kbl7/igt@kms_flip@flip-vs-suspend-interruptible@a-dp1.html
   [128]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-kbl7/igt@kms_flip@flip-vs-suspend-interruptible@a-dp1.html
    - shard-apl:          [DMESG-WARN][129] ([i915#180]) -> [PASS][130]
   [129]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-apl1/igt@kms_flip@flip-vs-suspend-interruptible@a-dp1.html
   [130]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-apl2/igt@kms_flip@flip-vs-suspend-interruptible@a-dp1.html

  * igt@kms_plane_alpha_blend@pipe-c-coverage-7efc:
    - shard-skl:          [FAIL][131] ([fdo#108145] / [i915#265]) -> [PASS][132] +1 similar issue
   [131]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-skl2/igt@kms_plane_alpha_blend@pipe-c-coverage-7efc.html
   [132]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-skl8/igt@kms_plane_alpha_blend@pipe-c-coverage-7efc.html

  * igt@kms_psr@psr2_sprite_plane_move:
    - shard-iclb:         [SKIP][133] ([fdo#109441]) -> [PASS][134] +3 similar issues
   [133]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-iclb8/igt@kms_psr@psr2_sprite_plane_move.html
   [134]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-iclb2/igt@kms_psr@psr2_sprite_plane_move.html

  * igt@kms_vblank@pipe-a-ts-continuation-suspend:
    - shard-kbl:          [DMESG-WARN][135] ([i915#180] / [i915#295]) -> [PASS][136]
   [135]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-kbl1/igt@kms_vblank@pipe-a-ts-continuation-suspend.html
   [136]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-kbl3/igt@kms_vblank@pipe-a-ts-continuation-suspend.html

  * igt@perf@polling-small-buf:
    - shard-skl:          [FAIL][137] ([i915#1722]) -> [PASS][138]
   [137]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-skl6/igt@perf@polling-small-buf.html
   [138]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-skl9/igt@perf@polling-small-buf.html

  * igt@sysfs_clients@busy@bcs0:
    - shard-kbl:          [FAIL][139] ([i915#3009]) -> [PASS][140]
   [139]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-kbl3/igt@sysfs_clients@busy@bcs0.html
   [140]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-kbl2/igt@sysfs_clients@busy@bcs0.html

  
#### Warnings ####

  * igt@gem_exec_balancer@hang:
    - shard-iclb:         [INCOMPLETE][141] ([i915#1895] / [i915#3031]) -> [INCOMPLETE][142] ([i915#1895])
   [141]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-iclb4/igt@gem_exec_balancer@hang.html
   [142]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-iclb2/igt@gem_exec_balancer@hang.html

  * igt@i915_pm_rc6_residency@rc6-idle:
    - shard-iclb:         [WARN][143] ([i915#1804] / [i915#2684]) -> [WARN][144] ([i915#2684])
   [143]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-iclb4/igt@i915_pm_rc6_residency@rc6-idle.html
   [144]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-iclb2/igt@i915_pm_rc6_residency@rc6-idle.html

  * igt@kms_atomic_transition@modeset-transition-nonblocking@1x-outputs:
    - shard-glk:          [DMESG-FAIL][145] ([i915#118] / [i915#95]) -> [DMESG-WARN][146] ([i915#118] / [i915#95])
   [145]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-glk9/igt@kms_atomic_transition@modeset-transition-nonblocking@1x-outputs.html
   [146]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-glk2/igt@kms_atomic_transition@modeset-transition-nonblocking@1x-outputs.html

  * igt@kms_content_protection@dp-mst-lic-type-0:
    - shard-iclb:         [FAIL][147] -> [SKIP][148] ([i915#3116])
   [147]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-iclb1/igt@kms_content_protection@dp-mst-lic-type-0.html
   [148]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/shard-iclb6/igt@kms_content_protection@dp-mst-lic-type-0.html

  * igt@kms_cursor_legacy@flip-vs-cursor-busy-crc-atomic:
    - shard-kbl:          [DMESG-FAIL][149] ([IGT#6] / [i915#1982]) -> [DMESG-FAIL][150] ([IGT#6])
   [149]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9823/shard-kbl7/igt@kms_cursor_legacy@flip-vs-cursor-busy-crc-atomic.html
   [150]: https://intel-gfx-c

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19746/index.html

[-- Attachment #1.2: Type: text/html, Size: 33904 bytes --]

[-- Attachment #2: Type: text/plain, Size: 160 bytes --]

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [Intel-gfx] [PATCH] i915/query: Correlate engine and cpu timestamps with better accuracy
@ 2021-03-05 19:37 Umesh Nerlige Ramappa
  0 siblings, 0 replies; 19+ messages in thread
From: Umesh Nerlige Ramappa @ 2021-03-05 19:37 UTC (permalink / raw)
  To: intel-gfx, Chris Wilson, Tvrtko Ursulin, Lionel G Landwerlin

Perf measurements rely on CPU and engine timestamps to correlate
events of interest across these time domains. Current mechanisms get
these timestamps separately and the calculated delta between these
timestamps lack enough accuracy.

To improve the accuracy of these time measurements to within a few us,
add a query that returns the engine and cpu timestamps captured as
close to each other as possible.

v2: (Tvrtko)
- document clock reference used
- return cpu timestamp always
- capture cpu time just before lower dword of cs timestamp

v3: (Chris)
- use uncore-rpm
- use __query_cs_timestamp helper

v4: (Lionel)
- Kernel perf subsytem allows users to specify the clock id to be used
  in perf_event_open. This clock id is used by the perf subsystem to
  return the appropriate cpu timestamp in perf events. Similarly, let
  the user pass the clockid to this query so that cpu timestamp
  corresponds to the clock id requested.

v5: (Tvrtko)
- Use normal ktime accessors instead of fast versions
- Add more uApi documentation

v6: (Lionel)
- Move switch out of spinlock

v7: (Chris)
- cs_timestamp is a misnomer, use cs_cycles instead
- return the cs cycle frequency as well in the query

v8:
- Add platform and engine specific checks

v9: (Lionel)
- Return 2 cpu timestamps in the query - captured before and after the
  register read

v10: (Chris)
- Use local_clock() to measure time taken to read lower dword of
  register and return it to user.

Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
---
 drivers/gpu/drm/i915/i915_query.c | 145 ++++++++++++++++++++++++++++++
 include/uapi/drm/i915_drm.h       |  48 ++++++++++
 2 files changed, 193 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_query.c b/drivers/gpu/drm/i915/i915_query.c
index fed337ad7b68..25b96927ab92 100644
--- a/drivers/gpu/drm/i915/i915_query.c
+++ b/drivers/gpu/drm/i915/i915_query.c
@@ -6,6 +6,8 @@
 
 #include <linux/nospec.h>
 
+#include "gt/intel_engine_pm.h"
+#include "gt/intel_engine_user.h"
 #include "i915_drv.h"
 #include "i915_perf.h"
 #include "i915_query.h"
@@ -90,6 +92,148 @@ static int query_topology_info(struct drm_i915_private *dev_priv,
 	return total_length;
 }
 
+typedef u64 (*__ktime_func_t)(void);
+static __ktime_func_t __clock_id_to_func(clockid_t clk_id)
+{
+	/*
+	 * Use logic same as the perf subsystem to allow user to select the
+	 * reference clock id to be used for timestamps.
+	 */
+	switch (clk_id) {
+	case CLOCK_MONOTONIC:
+		return &ktime_get_ns;
+	case CLOCK_MONOTONIC_RAW:
+		return &ktime_get_raw_ns;
+	case CLOCK_REALTIME:
+		return &ktime_get_real_ns;
+	case CLOCK_BOOTTIME:
+		return &ktime_get_boottime_ns;
+	case CLOCK_TAI:
+		return &ktime_get_clocktai_ns;
+	default:
+		return NULL;
+	}
+}
+
+static inline int
+__read_timestamps(struct intel_uncore *uncore,
+		  i915_reg_t lower_reg,
+		  i915_reg_t upper_reg,
+		  u64 *cs_ts,
+		  u64 *cpu_ts,
+		  __ktime_func_t cpu_clock)
+{
+	u32 upper, lower, old_upper, loop = 0;
+
+	upper = intel_uncore_read_fw(uncore, upper_reg);
+	do {
+		cpu_ts[1] = local_clock();
+		cpu_ts[0] = cpu_clock();
+		lower = intel_uncore_read_fw(uncore, lower_reg);
+		cpu_ts[1] = local_clock() - cpu_ts[1];
+		old_upper = upper;
+		upper = intel_uncore_read_fw(uncore, upper_reg);
+	} while (upper != old_upper && loop++ < 2);
+
+	*cs_ts = (u64)upper << 32 | lower;
+
+	return 0;
+}
+
+static int
+__query_cs_cycles(struct intel_engine_cs *engine,
+		  u64 *cs_ts, u64 *cpu_ts,
+		  __ktime_func_t cpu_clock)
+{
+	struct intel_uncore *uncore = engine->uncore;
+	enum forcewake_domains fw_domains;
+	u32 base = engine->mmio_base;
+	intel_wakeref_t wakeref;
+	int ret;
+
+	fw_domains = intel_uncore_forcewake_for_reg(uncore,
+						    RING_TIMESTAMP(base),
+						    FW_REG_READ);
+
+	with_intel_runtime_pm(uncore->rpm, wakeref) {
+		spin_lock_irq(&uncore->lock);
+		intel_uncore_forcewake_get__locked(uncore, fw_domains);
+
+		ret = __read_timestamps(uncore,
+					RING_TIMESTAMP(base),
+					RING_TIMESTAMP_UDW(base),
+					cs_ts,
+					cpu_ts,
+					cpu_clock);
+
+		intel_uncore_forcewake_put__locked(uncore, fw_domains);
+		spin_unlock_irq(&uncore->lock);
+	}
+
+	return ret;
+}
+
+static int
+query_cs_cycles(struct drm_i915_private *i915,
+		struct drm_i915_query_item *query_item)
+{
+	struct drm_i915_query_cs_cycles __user *query_ptr;
+	struct drm_i915_query_cs_cycles query;
+	struct intel_engine_cs *engine;
+	__ktime_func_t cpu_clock;
+	int ret;
+
+	if (INTEL_GEN(i915) < 6)
+		return -ENODEV;
+
+	query_ptr = u64_to_user_ptr(query_item->data_ptr);
+	ret = copy_query_item(&query, sizeof(query), sizeof(query), query_item);
+	if (ret != 0)
+		return ret;
+
+	if (query.flags)
+		return -EINVAL;
+
+	if (query.rsvd)
+		return -EINVAL;
+
+	cpu_clock = __clock_id_to_func(query.clockid);
+	if (!cpu_clock)
+		return -EINVAL;
+
+	engine = intel_engine_lookup_user(i915,
+					  query.engine.engine_class,
+					  query.engine.engine_instance);
+	if (!engine)
+		return -EINVAL;
+
+	if (IS_GEN(i915, 6) &&
+	    query.engine.engine_class != I915_ENGINE_CLASS_RENDER)
+		return -ENODEV;
+
+	query.cs_frequency = engine->gt->clock_frequency;
+	ret = __query_cs_cycles(engine,
+				&query.cs_cycles,
+				query.cpu_timestamp,
+				cpu_clock);
+	if (ret)
+		return ret;
+
+	if (put_user(query.cs_frequency, &query_ptr->cs_frequency))
+		return -EFAULT;
+
+	if (put_user(query.cpu_timestamp[0], &query_ptr->cpu_timestamp[0]))
+		return -EFAULT;
+
+	if (put_user(query.cpu_timestamp[1], &query_ptr->cpu_timestamp[1]))
+		return -EFAULT;
+
+	if (put_user(query.cs_cycles, &query_ptr->cs_cycles))
+		return -EFAULT;
+
+	return sizeof(query);
+}
+
 static int
 query_engine_info(struct drm_i915_private *i915,
 		  struct drm_i915_query_item *query_item)
@@ -424,6 +568,7 @@ static int (* const i915_query_funcs[])(struct drm_i915_private *dev_priv,
 	query_topology_info,
 	query_engine_info,
 	query_perf_config,
+	query_cs_cycles,
 };
 
 int i915_query_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 1987e2ea79a3..001e4d0bd0a2 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -2176,6 +2176,10 @@ struct drm_i915_query_item {
 #define DRM_I915_QUERY_TOPOLOGY_INFO    1
 #define DRM_I915_QUERY_ENGINE_INFO	2
 #define DRM_I915_QUERY_PERF_CONFIG      3
+	/**
+	 * Query Command Streamer timestamp register.
+	 */
+#define DRM_I915_QUERY_CS_CYCLES	4
 /* Must be kept compact -- no holes and well documented */
 
 	/*
@@ -2309,6 +2313,50 @@ struct drm_i915_engine_info {
 	__u64 rsvd1[4];
 };
 
+/**
+ * struct drm_i915_query_cs_cycles
+ *
+ * The query returns the command streamer cycles and the frequency that can be
+ * used to calculate the command streamer timestamp. In addition the query
+ * returns a set of cpu timestamps that indicate when the command streamer cycle
+ * count was captured.
+ */
+struct drm_i915_query_cs_cycles {
+	/** Engine for which command streamer cycles is queried. */
+	struct i915_engine_class_instance engine;
+
+	/** Must be zero. */
+	__u32 flags;
+
+	/**
+	 * Command streamer cycles as read from the command streamer
+	 * register at 0x358 offset.
+	 */
+	__u64 cs_cycles;
+
+	/** Frequency of the cs cycles in Hz. */
+	__u64 cs_frequency;
+
+	/**
+	 * CPU timestamps in ns. cpu_timestamp[0] is captured before reading the
+	 * cs_cycles register using the reference clockid set by the user.
+	 * cpu_timestamp[1] is the time taken in ns to read the lower dword of
+	 * the cs_cycles register.
+	 */
+	__u64 cpu_timestamp[2];
+
+	/**
+	 * Reference clock id for CPU timestamp. For definition, see
+	 * clock_gettime(2) and perf_event_open(2). Supported clock ids are
+	 * CLOCK_MONOTONIC, CLOCK_MONOTONIC_RAW, CLOCK_REALTIME, CLOCK_BOOTTIME,
+	 * CLOCK_TAI.
+	 */
+	__s32 clockid;
+
+	/** Must be zero. */
+	__u32 rsvd;
+};
+
 /**
  * struct drm_i915_query_engine_info
  *
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [Intel-gfx] [PATCH] i915/query: Correlate engine and cpu timestamps with better accuracy
  2021-03-03 21:28 [Intel-gfx] [PATCH] " Umesh Nerlige Ramappa
  2021-03-04  0:09 ` Chris Wilson
  2021-03-04  8:32 ` Lionel Landwerlin
@ 2021-03-04 12:23 ` Lionel Landwerlin
  2 siblings, 0 replies; 19+ messages in thread
From: Lionel Landwerlin @ 2021-03-04 12:23 UTC (permalink / raw)
  To: Umesh Nerlige Ramappa, intel-gfx, Chris Wilson, Tvrtko Ursulin

On 03/03/2021 23:28, Umesh Nerlige Ramappa wrote:
> Perf measurements rely on CPU and engine timestamps to correlate
> events of interest across these time domains. Current mechanisms get
> these timestamps separately and the calculated delta between these
> timestamps lack enough accuracy.
>
> To improve the accuracy of these time measurements to within a few us,
> add a query that returns the engine and cpu timestamps captured as
> close to each other as possible.
>
> v2: (Tvrtko)
> - document clock reference used
> - return cpu timestamp always
> - capture cpu time just before lower dword of cs timestamp
>
> v3: (Chris)
> - use uncore-rpm
> - use __query_cs_timestamp helper
>
> v4: (Lionel)
> - Kernel perf subsytem allows users to specify the clock id to be used
>    in perf_event_open. This clock id is used by the perf subsystem to
>    return the appropriate cpu timestamp in perf events. Similarly, let
>    the user pass the clockid to this query so that cpu timestamp
>    corresponds to the clock id requested.
>
> v5: (Tvrtko)
> - Use normal ktime accessors instead of fast versions
> - Add more uApi documentation
>
> v6: (Lionel)
> - Move switch out of spinlock
>
> v7: (Chris)
> - cs_timestamp is a misnomer, use cs_cycles instead
> - return the cs cycle frequency as well in the query
>
> v8:
> - Add platform and engine specific checks
>
> v9: (Lionel)
> - Return 2 cpu timestamps in the query - captured before and after the
>    register read
>
> Signed-off-by: Umesh Nerlige Ramappa<umesh.nerlige.ramappa@intel.com>
> ---
>   drivers/gpu/drm/i915/i915_query.c | 144 ++++++++++++++++++++++++++++++

FYI, the MR for Mesa : 
https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9407


-Lionel

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Intel-gfx] [PATCH] i915/query: Correlate engine and cpu timestamps with better accuracy
  2021-03-04  9:54         ` Chris Wilson
@ 2021-03-04 10:27           ` Lionel Landwerlin
  0 siblings, 0 replies; 19+ messages in thread
From: Lionel Landwerlin @ 2021-03-04 10:27 UTC (permalink / raw)
  To: Chris Wilson, Tvrtko Ursulin, Umesh Nerlige Ramappa, intel-gfx


[-- Attachment #1.1: Type: text/plain, Size: 2002 bytes --]

On 04/03/2021 11:54, Chris Wilson wrote:
>>>> Actually if we want the best accuracy we can just deal with the lower dword.
>>> Accuracy of what? The lower dword read perhaps, or the accuracy of the
>>> sample point for the combined reads for the timestamp, which is closer
>>> to an external observer (cpu_clock() implies reference to an external
>>> observer).
>>>
>>> The two clock samples are not even necessarily closely related due to the
>>> nmi adjustments. If you wanted an unadjusted elapsed time for the read
>>> you can use local_clock() then return the chosen cpu_clock() before plus
>>> the elapsed delta from around the read as the estimated error.
>>>
>>> cpu_ts[1] = local_clock();
>>> cpu_ts[0] = cpu_clock();
>>> lower = intel_uncore_read_fw(uncore, lower_reg);
>>> cpu_ts[1] = local_clock() - cpu_ts[1];
>>> -Chris
>> Thanks,
>>
>>
>> I meant the accuracy of having 2 samples GPU/CPU as close as possible.
>>
>> Avoiding to account another register read in there is nice.
>>
>>
>> My testing was also mostly done with CLOCK_MONOTONIC_RAW which doesn't
>> seem to be adjusted like CLOCK_MONOTONIC so maybe that why I didn't see
>> the issue.
> _RAW is still adjusted for skews, just not coupled into the ntp feedback.
> That is less obvious than the other clocks, and why it's preferred for
> comparing against other HW sources. But two reads of _RAW are only
> monotonic, not necessarily on the same time base. local_clock() is
> tsc/arat, so counting the CPU cycles between the two reads with the
> frequency (at least on x86) held constant (and arat should be frequency
> invariant).
>
> If we want much better accuracy, we are supposed to use cyclecounter_t
> and the system_device_crosststamp.
> -Chris

Thanks for the pointers.

I think people are mostly trying to map what's coming out of OA or 
queries from the various command streamers back to perf/ftrace.

As far I know perf will only let you select a clockid.


So maybe cyclecounter_t is not that useful atm.


-Lionel


[-- Attachment #1.2: Type: text/html, Size: 2740 bytes --]

[-- Attachment #2: Type: text/plain, Size: 160 bytes --]

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Intel-gfx] [PATCH] i915/query: Correlate engine and cpu timestamps with better accuracy
  2021-03-04  9:45       ` Lionel Landwerlin
@ 2021-03-04  9:54         ` Chris Wilson
  2021-03-04 10:27           ` Lionel Landwerlin
  0 siblings, 1 reply; 19+ messages in thread
From: Chris Wilson @ 2021-03-04  9:54 UTC (permalink / raw)
  To: Lionel Landwerlin, Tvrtko Ursulin, Umesh Nerlige Ramappa, intel-gfx

Quoting Lionel Landwerlin (2021-03-04 09:45:47)
> On 04/03/2021 10:58, Chris Wilson wrote:
> > Quoting Lionel Landwerlin (2021-03-04 08:28:59)
> >> On 04/03/2021 02:09, Chris Wilson wrote:
> >>> Quoting Umesh Nerlige Ramappa (2021-03-03 21:28:00)
> >>>> Perf measurements rely on CPU and engine timestamps to correlate
> >>>> events of interest across these time domains. Current mechanisms get
> >>>> these timestamps separately and the calculated delta between these
> >>>> timestamps lack enough accuracy.
> >>>>
> >>>> To improve the accuracy of these time measurements to within a few us,
> >>>> add a query that returns the engine and cpu timestamps captured as
> >>>> close to each other as possible.
> >>>>
> >>>> v2: (Tvrtko)
> >>>> - document clock reference used
> >>>> - return cpu timestamp always
> >>>> - capture cpu time just before lower dword of cs timestamp
> >>>>
> >>>> v3: (Chris)
> >>>> - use uncore-rpm
> >>>> - use __query_cs_timestamp helper
> >>>>
> >>>> v4: (Lionel)
> >>>> - Kernel perf subsytem allows users to specify the clock id to be used
> >>>>     in perf_event_open. This clock id is used by the perf subsystem to
> >>>>     return the appropriate cpu timestamp in perf events. Similarly, let
> >>>>     the user pass the clockid to this query so that cpu timestamp
> >>>>     corresponds to the clock id requested.
> >>>>
> >>>> v5: (Tvrtko)
> >>>> - Use normal ktime accessors instead of fast versions
> >>>> - Add more uApi documentation
> >>>>
> >>>> v6: (Lionel)
> >>>> - Move switch out of spinlock
> >>>>
> >>>> v7: (Chris)
> >>>> - cs_timestamp is a misnomer, use cs_cycles instead
> >>>> - return the cs cycle frequency as well in the query
> >>>>
> >>>> v8:
> >>>> - Add platform and engine specific checks
> >>>>
> >>>> v9: (Lionel)
> >>>> - Return 2 cpu timestamps in the query - captured before and after the
> >>>>     register read
> >>>>
> >>>> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
> >>>> ---
> >>>>    drivers/gpu/drm/i915/i915_query.c | 144 ++++++++++++++++++++++++++++++
> >>>>    include/uapi/drm/i915_drm.h       |  47 ++++++++++
> >>>>    2 files changed, 191 insertions(+)
> >>>>
> >>>> diff --git a/drivers/gpu/drm/i915/i915_query.c b/drivers/gpu/drm/i915/i915_query.c
> >>>> index fed337ad7b68..acca22ee6014 100644
> >>>> --- a/drivers/gpu/drm/i915/i915_query.c
> >>>> +++ b/drivers/gpu/drm/i915/i915_query.c
> >>>> @@ -6,6 +6,8 @@
> >>>>    
> >>>>    #include <linux/nospec.h>
> >>>>    
> >>>> +#include "gt/intel_engine_pm.h"
> >>>> +#include "gt/intel_engine_user.h"
> >>>>    #include "i915_drv.h"
> >>>>    #include "i915_perf.h"
> >>>>    #include "i915_query.h"
> >>>> @@ -90,6 +92,147 @@ static int query_topology_info(struct drm_i915_private *dev_priv,
> >>>>           return total_length;
> >>>>    }
> >>>>    
> >>>> +typedef u64 (*__ktime_func_t)(void);
> >>>> +static __ktime_func_t __clock_id_to_func(clockid_t clk_id)
> >>>> +{
> >>>> +       /*
> >>>> +        * Use logic same as the perf subsystem to allow user to select the
> >>>> +        * reference clock id to be used for timestamps.
> >>>> +        */
> >>>> +       switch (clk_id) {
> >>>> +       case CLOCK_MONOTONIC:
> >>>> +               return &ktime_get_ns;
> >>>> +       case CLOCK_MONOTONIC_RAW:
> >>>> +               return &ktime_get_raw_ns;
> >>>> +       case CLOCK_REALTIME:
> >>>> +               return &ktime_get_real_ns;
> >>>> +       case CLOCK_BOOTTIME:
> >>>> +               return &ktime_get_boottime_ns;
> >>>> +       case CLOCK_TAI:
> >>>> +               return &ktime_get_clocktai_ns;
> >>>> +       default:
> >>>> +               return NULL;
> >>>> +       }
> >>>> +}
> >>>> +
> >>>> +static inline int
> >>>> +__read_timestamps(struct intel_uncore *uncore,
> >>>> +                 i915_reg_t lower_reg,
> >>>> +                 i915_reg_t upper_reg,
> >>>> +                 u64 *cs_ts,
> >>>> +                 u64 *cpu_ts,
> >>>> +                 __ktime_func_t cpu_clock)
> >>>> +{
> >>>> +       u32 upper, lower, old_upper, loop = 0;
> >>>> +
> >>>> +       upper = intel_uncore_read_fw(uncore, upper_reg);
> >>>> +       do {
> >>>> +               cpu_ts[0] = cpu_clock();
> >>>> +               lower = intel_uncore_read_fw(uncore, lower_reg);
> >>>> +               cpu_ts[1] = cpu_clock();
> >>>> +               old_upper = upper;
> >>>> +               upper = intel_uncore_read_fw(uncore, upper_reg);
> >>> Both register reads comprise the timestamp returned to userspace, so
> >>> presumably you want cpu_ts[] to wrap both.
> >>>
> >>>          do {
> >>>                  old_upper = upper;
> >>>
> >>>                  cpu_ts[0] = cpu_clock();
> >>>                  lower = intel_uncore_read_fw(uncore, lower_reg);
> >>>                  upper = intel_uncore_read_fw(uncore, upper_reg);
> >>>                  cpu_ts[1] = cpu_clock();
> >>>          } while (upper != old_upper && loop++ < 2);
> >> Actually if we want the best accuracy we can just deal with the lower dword.
> > Accuracy of what? The lower dword read perhaps, or the accuracy of the
> > sample point for the combined reads for the timestamp, which is closer
> > to an external observer (cpu_clock() implies reference to an external
> > observer).
> >
> > The two clock samples are not even necessarily closely related due to the
> > nmi adjustments. If you wanted an unadjusted elapsed time for the read
> > you can use local_clock() then return the chosen cpu_clock() before plus
> > the elapsed delta from around the read as the estimated error.
> >
> > cpu_ts[1] = local_clock();
> > cpu_ts[0] = cpu_clock();
> > lower = intel_uncore_read_fw(uncore, lower_reg);
> > cpu_ts[1] = local_clock() - cpu_ts[1];
> > -Chris
> 
> Thanks,
> 
> 
> I meant the accuracy of having 2 samples GPU/CPU as close as possible.
> 
> Avoiding to account another register read in there is nice.
> 
> 
> My testing was also mostly done with CLOCK_MONOTONIC_RAW which doesn't 
> seem to be adjusted like CLOCK_MONOTONIC so maybe that why I didn't see 
> the issue.

_RAW is still adjusted for skews, just not coupled into the ntp feedback.
That is less obvious than the other clocks, and why it's preferred for
comparing against other HW sources. But two reads of _RAW are only
monotonic, not necessarily on the same time base. local_clock() is
tsc/arat, so counting the CPU cycles between the two reads with the
frequency (at least on x86) held constant (and arat should be frequency
invariant).

If we want much better accuracy, we are supposed to use cyclecounter_t
and the system_device_crosststamp.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Intel-gfx] [PATCH] i915/query: Correlate engine and cpu timestamps with better accuracy
  2021-03-04  8:58     ` Chris Wilson
@ 2021-03-04  9:45       ` Lionel Landwerlin
  2021-03-04  9:54         ` Chris Wilson
  0 siblings, 1 reply; 19+ messages in thread
From: Lionel Landwerlin @ 2021-03-04  9:45 UTC (permalink / raw)
  To: Chris Wilson, Tvrtko Ursulin, Umesh Nerlige Ramappa, intel-gfx

On 04/03/2021 10:58, Chris Wilson wrote:
> Quoting Lionel Landwerlin (2021-03-04 08:28:59)
>> On 04/03/2021 02:09, Chris Wilson wrote:
>>> Quoting Umesh Nerlige Ramappa (2021-03-03 21:28:00)
>>>> Perf measurements rely on CPU and engine timestamps to correlate
>>>> events of interest across these time domains. Current mechanisms get
>>>> these timestamps separately and the calculated delta between these
>>>> timestamps lack enough accuracy.
>>>>
>>>> To improve the accuracy of these time measurements to within a few us,
>>>> add a query that returns the engine and cpu timestamps captured as
>>>> close to each other as possible.
>>>>
>>>> v2: (Tvrtko)
>>>> - document clock reference used
>>>> - return cpu timestamp always
>>>> - capture cpu time just before lower dword of cs timestamp
>>>>
>>>> v3: (Chris)
>>>> - use uncore-rpm
>>>> - use __query_cs_timestamp helper
>>>>
>>>> v4: (Lionel)
>>>> - Kernel perf subsytem allows users to specify the clock id to be used
>>>>     in perf_event_open. This clock id is used by the perf subsystem to
>>>>     return the appropriate cpu timestamp in perf events. Similarly, let
>>>>     the user pass the clockid to this query so that cpu timestamp
>>>>     corresponds to the clock id requested.
>>>>
>>>> v5: (Tvrtko)
>>>> - Use normal ktime accessors instead of fast versions
>>>> - Add more uApi documentation
>>>>
>>>> v6: (Lionel)
>>>> - Move switch out of spinlock
>>>>
>>>> v7: (Chris)
>>>> - cs_timestamp is a misnomer, use cs_cycles instead
>>>> - return the cs cycle frequency as well in the query
>>>>
>>>> v8:
>>>> - Add platform and engine specific checks
>>>>
>>>> v9: (Lionel)
>>>> - Return 2 cpu timestamps in the query - captured before and after the
>>>>     register read
>>>>
>>>> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
>>>> ---
>>>>    drivers/gpu/drm/i915/i915_query.c | 144 ++++++++++++++++++++++++++++++
>>>>    include/uapi/drm/i915_drm.h       |  47 ++++++++++
>>>>    2 files changed, 191 insertions(+)
>>>>
>>>> diff --git a/drivers/gpu/drm/i915/i915_query.c b/drivers/gpu/drm/i915/i915_query.c
>>>> index fed337ad7b68..acca22ee6014 100644
>>>> --- a/drivers/gpu/drm/i915/i915_query.c
>>>> +++ b/drivers/gpu/drm/i915/i915_query.c
>>>> @@ -6,6 +6,8 @@
>>>>    
>>>>    #include <linux/nospec.h>
>>>>    
>>>> +#include "gt/intel_engine_pm.h"
>>>> +#include "gt/intel_engine_user.h"
>>>>    #include "i915_drv.h"
>>>>    #include "i915_perf.h"
>>>>    #include "i915_query.h"
>>>> @@ -90,6 +92,147 @@ static int query_topology_info(struct drm_i915_private *dev_priv,
>>>>           return total_length;
>>>>    }
>>>>    
>>>> +typedef u64 (*__ktime_func_t)(void);
>>>> +static __ktime_func_t __clock_id_to_func(clockid_t clk_id)
>>>> +{
>>>> +       /*
>>>> +        * Use logic same as the perf subsystem to allow user to select the
>>>> +        * reference clock id to be used for timestamps.
>>>> +        */
>>>> +       switch (clk_id) {
>>>> +       case CLOCK_MONOTONIC:
>>>> +               return &ktime_get_ns;
>>>> +       case CLOCK_MONOTONIC_RAW:
>>>> +               return &ktime_get_raw_ns;
>>>> +       case CLOCK_REALTIME:
>>>> +               return &ktime_get_real_ns;
>>>> +       case CLOCK_BOOTTIME:
>>>> +               return &ktime_get_boottime_ns;
>>>> +       case CLOCK_TAI:
>>>> +               return &ktime_get_clocktai_ns;
>>>> +       default:
>>>> +               return NULL;
>>>> +       }
>>>> +}
>>>> +
>>>> +static inline int
>>>> +__read_timestamps(struct intel_uncore *uncore,
>>>> +                 i915_reg_t lower_reg,
>>>> +                 i915_reg_t upper_reg,
>>>> +                 u64 *cs_ts,
>>>> +                 u64 *cpu_ts,
>>>> +                 __ktime_func_t cpu_clock)
>>>> +{
>>>> +       u32 upper, lower, old_upper, loop = 0;
>>>> +
>>>> +       upper = intel_uncore_read_fw(uncore, upper_reg);
>>>> +       do {
>>>> +               cpu_ts[0] = cpu_clock();
>>>> +               lower = intel_uncore_read_fw(uncore, lower_reg);
>>>> +               cpu_ts[1] = cpu_clock();
>>>> +               old_upper = upper;
>>>> +               upper = intel_uncore_read_fw(uncore, upper_reg);
>>> Both register reads comprise the timestamp returned to userspace, so
>>> presumably you want cpu_ts[] to wrap both.
>>>
>>>          do {
>>>                  old_upper = upper;
>>>
>>>                  cpu_ts[0] = cpu_clock();
>>>                  lower = intel_uncore_read_fw(uncore, lower_reg);
>>>                  upper = intel_uncore_read_fw(uncore, upper_reg);
>>>                  cpu_ts[1] = cpu_clock();
>>>          } while (upper != old_upper && loop++ < 2);
>> Actually if we want the best accuracy we can just deal with the lower dword.
> Accuracy of what? The lower dword read perhaps, or the accuracy of the
> sample point for the combined reads for the timestamp, which is closer
> to an external observer (cpu_clock() implies reference to an external
> observer).
>
> The two clock samples are not even necessarily closely related due to the
> nmi adjustments. If you wanted an unadjusted elapsed time for the read
> you can use local_clock() then return the chosen cpu_clock() before plus
> the elapsed delta from around the read as the estimated error.
>
> cpu_ts[1] = local_clock();
> cpu_ts[0] = cpu_clock();
> lower = intel_uncore_read_fw(uncore, lower_reg);
> cpu_ts[1] = local_clock() - cpu_ts[1];
> -Chris

Thanks,


I meant the accuracy of having 2 samples GPU/CPU as close as possible.

Avoiding to account another register read in there is nice.


My testing was also mostly done with CLOCK_MONOTONIC_RAW which doesn't 
seem to be adjusted like CLOCK_MONOTONIC so maybe that why I didn't see 
the issue.


-Lionel

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Intel-gfx] [PATCH] i915/query: Correlate engine and cpu timestamps with better accuracy
  2021-03-04  8:28   ` Lionel Landwerlin
@ 2021-03-04  8:58     ` Chris Wilson
  2021-03-04  9:45       ` Lionel Landwerlin
  0 siblings, 1 reply; 19+ messages in thread
From: Chris Wilson @ 2021-03-04  8:58 UTC (permalink / raw)
  To: Lionel Landwerlin, Tvrtko Ursulin, Umesh Nerlige Ramappa, intel-gfx

Quoting Lionel Landwerlin (2021-03-04 08:28:59)
> On 04/03/2021 02:09, Chris Wilson wrote:
> > Quoting Umesh Nerlige Ramappa (2021-03-03 21:28:00)
> >> Perf measurements rely on CPU and engine timestamps to correlate
> >> events of interest across these time domains. Current mechanisms get
> >> these timestamps separately and the calculated delta between these
> >> timestamps lack enough accuracy.
> >>
> >> To improve the accuracy of these time measurements to within a few us,
> >> add a query that returns the engine and cpu timestamps captured as
> >> close to each other as possible.
> >>
> >> v2: (Tvrtko)
> >> - document clock reference used
> >> - return cpu timestamp always
> >> - capture cpu time just before lower dword of cs timestamp
> >>
> >> v3: (Chris)
> >> - use uncore-rpm
> >> - use __query_cs_timestamp helper
> >>
> >> v4: (Lionel)
> >> - Kernel perf subsytem allows users to specify the clock id to be used
> >>    in perf_event_open. This clock id is used by the perf subsystem to
> >>    return the appropriate cpu timestamp in perf events. Similarly, let
> >>    the user pass the clockid to this query so that cpu timestamp
> >>    corresponds to the clock id requested.
> >>
> >> v5: (Tvrtko)
> >> - Use normal ktime accessors instead of fast versions
> >> - Add more uApi documentation
> >>
> >> v6: (Lionel)
> >> - Move switch out of spinlock
> >>
> >> v7: (Chris)
> >> - cs_timestamp is a misnomer, use cs_cycles instead
> >> - return the cs cycle frequency as well in the query
> >>
> >> v8:
> >> - Add platform and engine specific checks
> >>
> >> v9: (Lionel)
> >> - Return 2 cpu timestamps in the query - captured before and after the
> >>    register read
> >>
> >> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
> >> ---
> >>   drivers/gpu/drm/i915/i915_query.c | 144 ++++++++++++++++++++++++++++++
> >>   include/uapi/drm/i915_drm.h       |  47 ++++++++++
> >>   2 files changed, 191 insertions(+)
> >>
> >> diff --git a/drivers/gpu/drm/i915/i915_query.c b/drivers/gpu/drm/i915/i915_query.c
> >> index fed337ad7b68..acca22ee6014 100644
> >> --- a/drivers/gpu/drm/i915/i915_query.c
> >> +++ b/drivers/gpu/drm/i915/i915_query.c
> >> @@ -6,6 +6,8 @@
> >>   
> >>   #include <linux/nospec.h>
> >>   
> >> +#include "gt/intel_engine_pm.h"
> >> +#include "gt/intel_engine_user.h"
> >>   #include "i915_drv.h"
> >>   #include "i915_perf.h"
> >>   #include "i915_query.h"
> >> @@ -90,6 +92,147 @@ static int query_topology_info(struct drm_i915_private *dev_priv,
> >>          return total_length;
> >>   }
> >>   
> >> +typedef u64 (*__ktime_func_t)(void);
> >> +static __ktime_func_t __clock_id_to_func(clockid_t clk_id)
> >> +{
> >> +       /*
> >> +        * Use logic same as the perf subsystem to allow user to select the
> >> +        * reference clock id to be used for timestamps.
> >> +        */
> >> +       switch (clk_id) {
> >> +       case CLOCK_MONOTONIC:
> >> +               return &ktime_get_ns;
> >> +       case CLOCK_MONOTONIC_RAW:
> >> +               return &ktime_get_raw_ns;
> >> +       case CLOCK_REALTIME:
> >> +               return &ktime_get_real_ns;
> >> +       case CLOCK_BOOTTIME:
> >> +               return &ktime_get_boottime_ns;
> >> +       case CLOCK_TAI:
> >> +               return &ktime_get_clocktai_ns;
> >> +       default:
> >> +               return NULL;
> >> +       }
> >> +}
> >> +
> >> +static inline int
> >> +__read_timestamps(struct intel_uncore *uncore,
> >> +                 i915_reg_t lower_reg,
> >> +                 i915_reg_t upper_reg,
> >> +                 u64 *cs_ts,
> >> +                 u64 *cpu_ts,
> >> +                 __ktime_func_t cpu_clock)
> >> +{
> >> +       u32 upper, lower, old_upper, loop = 0;
> >> +
> >> +       upper = intel_uncore_read_fw(uncore, upper_reg);
> >> +       do {
> >> +               cpu_ts[0] = cpu_clock();
> >> +               lower = intel_uncore_read_fw(uncore, lower_reg);
> >> +               cpu_ts[1] = cpu_clock();
> >> +               old_upper = upper;
> >> +               upper = intel_uncore_read_fw(uncore, upper_reg);
> > Both register reads comprise the timestamp returned to userspace, so
> > presumably you want cpu_ts[] to wrap both.
> >
> >         do {
> >                 old_upper = upper;
> >
> >                 cpu_ts[0] = cpu_clock();
> >                 lower = intel_uncore_read_fw(uncore, lower_reg);
> >                 upper = intel_uncore_read_fw(uncore, upper_reg);
> >                 cpu_ts[1] = cpu_clock();
> >         } while (upper != old_upper && loop++ < 2);
> 
> Actually if we want the best accuracy we can just deal with the lower dword.

Accuracy of what? The lower dword read perhaps, or the accuracy of the
sample point for the combined reads for the timestamp, which is closer
to an external observer (cpu_clock() implies reference to an external
observer).

The two clock samples are not even necessarily closely related due to the
nmi adjustments. If you wanted an unadjusted elapsed time for the read
you can use local_clock() then return the chosen cpu_clock() before plus
the elapsed delta from around the read as the estimated error.

cpu_ts[1] = local_clock();
cpu_ts[0] = cpu_clock();
lower = intel_uncore_read_fw(uncore, lower_reg);
cpu_ts[1] = local_clock() - cpu_ts[1];
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Intel-gfx] [PATCH] i915/query: Correlate engine and cpu timestamps with better accuracy
  2021-03-03 21:28 [Intel-gfx] [PATCH] " Umesh Nerlige Ramappa
  2021-03-04  0:09 ` Chris Wilson
@ 2021-03-04  8:32 ` Lionel Landwerlin
  2021-03-04 12:23 ` Lionel Landwerlin
  2 siblings, 0 replies; 19+ messages in thread
From: Lionel Landwerlin @ 2021-03-04  8:32 UTC (permalink / raw)
  To: Umesh Nerlige Ramappa, intel-gfx, Chris Wilson, Tvrtko Ursulin

On 03/03/2021 23:28, Umesh Nerlige Ramappa wrote:
> Perf measurements rely on CPU and engine timestamps to correlate
> events of interest across these time domains. Current mechanisms get
> these timestamps separately and the calculated delta between these
> timestamps lack enough accuracy.
>
> To improve the accuracy of these time measurements to within a few us,
> add a query that returns the engine and cpu timestamps captured as
> close to each other as possible.
>
> v2: (Tvrtko)
> - document clock reference used
> - return cpu timestamp always
> - capture cpu time just before lower dword of cs timestamp
>
> v3: (Chris)
> - use uncore-rpm
> - use __query_cs_timestamp helper
>
> v4: (Lionel)
> - Kernel perf subsytem allows users to specify the clock id to be used
>    in perf_event_open. This clock id is used by the perf subsystem to
>    return the appropriate cpu timestamp in perf events. Similarly, let
>    the user pass the clockid to this query so that cpu timestamp
>    corresponds to the clock id requested.
>
> v5: (Tvrtko)
> - Use normal ktime accessors instead of fast versions
> - Add more uApi documentation
>
> v6: (Lionel)
> - Move switch out of spinlock
>
> v7: (Chris)
> - cs_timestamp is a misnomer, use cs_cycles instead
> - return the cs cycle frequency as well in the query
>
> v8:
> - Add platform and engine specific checks
>
> v9: (Lionel)
> - Return 2 cpu timestamps in the query - captured before and after the
>    register read
>
> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>


Looks good to me, thanks!


Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>


Let me prepare a Mesa MR to use this with VK_EXT_calibrated_timestamps.


-Lionel


> ---
>   drivers/gpu/drm/i915/i915_query.c | 144 ++++++++++++++++++++++++++++++
>   include/uapi/drm/i915_drm.h       |  47 ++++++++++
>   2 files changed, 191 insertions(+)
>
> diff --git a/drivers/gpu/drm/i915/i915_query.c b/drivers/gpu/drm/i915/i915_query.c
> index fed337ad7b68..acca22ee6014 100644
> --- a/drivers/gpu/drm/i915/i915_query.c
> +++ b/drivers/gpu/drm/i915/i915_query.c
> @@ -6,6 +6,8 @@
>   
>   #include <linux/nospec.h>
>   
> +#include "gt/intel_engine_pm.h"
> +#include "gt/intel_engine_user.h"
>   #include "i915_drv.h"
>   #include "i915_perf.h"
>   #include "i915_query.h"
> @@ -90,6 +92,147 @@ static int query_topology_info(struct drm_i915_private *dev_priv,
>   	return total_length;
>   }
>   
> +typedef u64 (*__ktime_func_t)(void);
> +static __ktime_func_t __clock_id_to_func(clockid_t clk_id)
> +{
> +	/*
> +	 * Use logic same as the perf subsystem to allow user to select the
> +	 * reference clock id to be used for timestamps.
> +	 */
> +	switch (clk_id) {
> +	case CLOCK_MONOTONIC:
> +		return &ktime_get_ns;
> +	case CLOCK_MONOTONIC_RAW:
> +		return &ktime_get_raw_ns;
> +	case CLOCK_REALTIME:
> +		return &ktime_get_real_ns;
> +	case CLOCK_BOOTTIME:
> +		return &ktime_get_boottime_ns;
> +	case CLOCK_TAI:
> +		return &ktime_get_clocktai_ns;
> +	default:
> +		return NULL;
> +	}
> +}
> +
> +static inline int
> +__read_timestamps(struct intel_uncore *uncore,
> +		  i915_reg_t lower_reg,
> +		  i915_reg_t upper_reg,
> +		  u64 *cs_ts,
> +		  u64 *cpu_ts,
> +		  __ktime_func_t cpu_clock)
> +{
> +	u32 upper, lower, old_upper, loop = 0;
> +
> +	upper = intel_uncore_read_fw(uncore, upper_reg);
> +	do {
> +		cpu_ts[0] = cpu_clock();
> +		lower = intel_uncore_read_fw(uncore, lower_reg);
> +		cpu_ts[1] = cpu_clock();
> +		old_upper = upper;
> +		upper = intel_uncore_read_fw(uncore, upper_reg);
> +	} while (upper != old_upper && loop++ < 2);
> +
> +	*cs_ts = (u64)upper << 32 | lower;
> +
> +	return 0;
> +}
> +
> +static int
> +__query_cs_cycles(struct intel_engine_cs *engine,
> +		  u64 *cs_ts, u64 *cpu_ts,
> +		  __ktime_func_t cpu_clock)
> +{
> +	struct intel_uncore *uncore = engine->uncore;
> +	enum forcewake_domains fw_domains;
> +	u32 base = engine->mmio_base;
> +	intel_wakeref_t wakeref;
> +	int ret;
> +
> +	fw_domains = intel_uncore_forcewake_for_reg(uncore,
> +						    RING_TIMESTAMP(base),
> +						    FW_REG_READ);
> +
> +	with_intel_runtime_pm(uncore->rpm, wakeref) {
> +		spin_lock_irq(&uncore->lock);
> +		intel_uncore_forcewake_get__locked(uncore, fw_domains);
> +
> +		ret = __read_timestamps(uncore,
> +					RING_TIMESTAMP(base),
> +					RING_TIMESTAMP_UDW(base),
> +					cs_ts,
> +					cpu_ts,
> +					cpu_clock);
> +
> +		intel_uncore_forcewake_put__locked(uncore, fw_domains);
> +		spin_unlock_irq(&uncore->lock);
> +	}
> +
> +	return ret;
> +}
> +
> +static int
> +query_cs_cycles(struct drm_i915_private *i915,
> +		struct drm_i915_query_item *query_item)
> +{
> +	struct drm_i915_query_cs_cycles __user *query_ptr;
> +	struct drm_i915_query_cs_cycles query;
> +	struct intel_engine_cs *engine;
> +	__ktime_func_t cpu_clock;
> +	int ret;
> +
> +	if (INTEL_GEN(i915) < 6)
> +		return -ENODEV;
> +
> +	query_ptr = u64_to_user_ptr(query_item->data_ptr);
> +	ret = copy_query_item(&query, sizeof(query), sizeof(query), query_item);
> +	if (ret != 0)
> +		return ret;
> +
> +	if (query.flags)
> +		return -EINVAL;
> +
> +	if (query.rsvd)
> +		return -EINVAL;
> +
> +	cpu_clock = __clock_id_to_func(query.clockid);
> +	if (!cpu_clock)
> +		return -EINVAL;
> +
> +	engine = intel_engine_lookup_user(i915,
> +					  query.engine.engine_class,
> +					  query.engine.engine_instance);
> +	if (!engine)
> +		return -EINVAL;
> +
> +	if (IS_GEN(i915, 6) &&
> +	    query.engine.engine_class != I915_ENGINE_CLASS_RENDER)
> +		return -ENODEV;
> +
> +	query.cs_frequency = engine->gt->clock_frequency;
> +	ret = __query_cs_cycles(engine,
> +				&query.cs_cycles,
> +				query.cpu_timestamp,
> +				cpu_clock);
> +	if (ret)
> +		return ret;
> +
> +	if (put_user(query.cs_frequency, &query_ptr->cs_frequency))
> +		return -EFAULT;
> +
> +	if (put_user(query.cpu_timestamp[0], &query_ptr->cpu_timestamp[0]))
> +		return -EFAULT;
> +
> +	if (put_user(query.cpu_timestamp[1], &query_ptr->cpu_timestamp[1]))
> +		return -EFAULT;
> +
> +	if (put_user(query.cs_cycles, &query_ptr->cs_cycles))
> +		return -EFAULT;
> +
> +	return sizeof(query);
> +}
> +
>   static int
>   query_engine_info(struct drm_i915_private *i915,
>   		  struct drm_i915_query_item *query_item)
> @@ -424,6 +567,7 @@ static int (* const i915_query_funcs[])(struct drm_i915_private *dev_priv,
>   	query_topology_info,
>   	query_engine_info,
>   	query_perf_config,
> +	query_cs_cycles,
>   };
>   
>   int i915_query_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index 1987e2ea79a3..abcc479e2be1 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -2176,6 +2176,10 @@ struct drm_i915_query_item {
>   #define DRM_I915_QUERY_TOPOLOGY_INFO    1
>   #define DRM_I915_QUERY_ENGINE_INFO	2
>   #define DRM_I915_QUERY_PERF_CONFIG      3
> +	/**
> +	 * Query Command Streamer timestamp register.
> +	 */
> +#define DRM_I915_QUERY_CS_CYCLES	4
>   /* Must be kept compact -- no holes and well documented */
>   
>   	/*
> @@ -2309,6 +2313,49 @@ struct drm_i915_engine_info {
>   	__u64 rsvd1[4];
>   };
>   
> +/**
> + * struct drm_i915_query_cs_cycles
> + *
> + * The query returns the command streamer cycles and the frequency that can be
> + * used to calculate the command streamer timestamp. In addition the query
> + * returns the cpu timestamp that indicates when the command streamer cycle
> + * count was captured.
> + */
> +struct drm_i915_query_cs_cycles {
> +	/** Engine for which command streamer cycles is queried. */
> +	struct i915_engine_class_instance engine;
> +
> +	/** Must be zero. */
> +	__u32 flags;
> +
> +	/**
> +	 * Command streamer cycles as read from the command streamer
> +	 * register at 0x358 offset.
> +	 */
> +	__u64 cs_cycles;
> +
> +	/** Frequency of the cs cycles in Hz. */
> +	__u64 cs_frequency;
> +
> +	/**
> +	 * CPU timestamp in nanoseconds. cpu_timestamp[0] is captured before
> +	 * reading the cs_cycles register and cpu_timestamp[1] is captured after
> +	 * reading the register.
> +	 **/
> +	__u64 cpu_timestamp[2];
> +
> +	/**
> +	 * Reference clock id for CPU timestamp. For definition, see
> +	 * clock_gettime(2) and perf_event_open(2). Supported clock ids are
> +	 * CLOCK_MONOTONIC, CLOCK_MONOTONIC_RAW, CLOCK_REALTIME, CLOCK_BOOTTIME,
> +	 * CLOCK_TAI.
> +	 */
> +	__s32 clockid;
> +
> +	/** Must be zero. */
> +	__u32 rsvd;
> +};
> +
>   /**
>    * struct drm_i915_query_engine_info
>    *


_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Intel-gfx] [PATCH] i915/query: Correlate engine and cpu timestamps with better accuracy
  2021-03-04  0:09 ` Chris Wilson
@ 2021-03-04  8:28   ` Lionel Landwerlin
  2021-03-04  8:58     ` Chris Wilson
  0 siblings, 1 reply; 19+ messages in thread
From: Lionel Landwerlin @ 2021-03-04  8:28 UTC (permalink / raw)
  To: Chris Wilson, Tvrtko Ursulin, Umesh Nerlige Ramappa, intel-gfx

On 04/03/2021 02:09, Chris Wilson wrote:
> Quoting Umesh Nerlige Ramappa (2021-03-03 21:28:00)
>> Perf measurements rely on CPU and engine timestamps to correlate
>> events of interest across these time domains. Current mechanisms get
>> these timestamps separately and the calculated delta between these
>> timestamps lack enough accuracy.
>>
>> To improve the accuracy of these time measurements to within a few us,
>> add a query that returns the engine and cpu timestamps captured as
>> close to each other as possible.
>>
>> v2: (Tvrtko)
>> - document clock reference used
>> - return cpu timestamp always
>> - capture cpu time just before lower dword of cs timestamp
>>
>> v3: (Chris)
>> - use uncore-rpm
>> - use __query_cs_timestamp helper
>>
>> v4: (Lionel)
>> - Kernel perf subsytem allows users to specify the clock id to be used
>>    in perf_event_open. This clock id is used by the perf subsystem to
>>    return the appropriate cpu timestamp in perf events. Similarly, let
>>    the user pass the clockid to this query so that cpu timestamp
>>    corresponds to the clock id requested.
>>
>> v5: (Tvrtko)
>> - Use normal ktime accessors instead of fast versions
>> - Add more uApi documentation
>>
>> v6: (Lionel)
>> - Move switch out of spinlock
>>
>> v7: (Chris)
>> - cs_timestamp is a misnomer, use cs_cycles instead
>> - return the cs cycle frequency as well in the query
>>
>> v8:
>> - Add platform and engine specific checks
>>
>> v9: (Lionel)
>> - Return 2 cpu timestamps in the query - captured before and after the
>>    register read
>>
>> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
>> ---
>>   drivers/gpu/drm/i915/i915_query.c | 144 ++++++++++++++++++++++++++++++
>>   include/uapi/drm/i915_drm.h       |  47 ++++++++++
>>   2 files changed, 191 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_query.c b/drivers/gpu/drm/i915/i915_query.c
>> index fed337ad7b68..acca22ee6014 100644
>> --- a/drivers/gpu/drm/i915/i915_query.c
>> +++ b/drivers/gpu/drm/i915/i915_query.c
>> @@ -6,6 +6,8 @@
>>   
>>   #include <linux/nospec.h>
>>   
>> +#include "gt/intel_engine_pm.h"
>> +#include "gt/intel_engine_user.h"
>>   #include "i915_drv.h"
>>   #include "i915_perf.h"
>>   #include "i915_query.h"
>> @@ -90,6 +92,147 @@ static int query_topology_info(struct drm_i915_private *dev_priv,
>>          return total_length;
>>   }
>>   
>> +typedef u64 (*__ktime_func_t)(void);
>> +static __ktime_func_t __clock_id_to_func(clockid_t clk_id)
>> +{
>> +       /*
>> +        * Use logic same as the perf subsystem to allow user to select the
>> +        * reference clock id to be used for timestamps.
>> +        */
>> +       switch (clk_id) {
>> +       case CLOCK_MONOTONIC:
>> +               return &ktime_get_ns;
>> +       case CLOCK_MONOTONIC_RAW:
>> +               return &ktime_get_raw_ns;
>> +       case CLOCK_REALTIME:
>> +               return &ktime_get_real_ns;
>> +       case CLOCK_BOOTTIME:
>> +               return &ktime_get_boottime_ns;
>> +       case CLOCK_TAI:
>> +               return &ktime_get_clocktai_ns;
>> +       default:
>> +               return NULL;
>> +       }
>> +}
>> +
>> +static inline int
>> +__read_timestamps(struct intel_uncore *uncore,
>> +                 i915_reg_t lower_reg,
>> +                 i915_reg_t upper_reg,
>> +                 u64 *cs_ts,
>> +                 u64 *cpu_ts,
>> +                 __ktime_func_t cpu_clock)
>> +{
>> +       u32 upper, lower, old_upper, loop = 0;
>> +
>> +       upper = intel_uncore_read_fw(uncore, upper_reg);
>> +       do {
>> +               cpu_ts[0] = cpu_clock();
>> +               lower = intel_uncore_read_fw(uncore, lower_reg);
>> +               cpu_ts[1] = cpu_clock();
>> +               old_upper = upper;
>> +               upper = intel_uncore_read_fw(uncore, upper_reg);
> Both register reads comprise the timestamp returned to userspace, so
> presumably you want cpu_ts[] to wrap both.
>
>         do {
>                 old_upper = upper;
>
>                 cpu_ts[0] = cpu_clock();
>                 lower = intel_uncore_read_fw(uncore, lower_reg);
>                 upper = intel_uncore_read_fw(uncore, upper_reg);
>                 cpu_ts[1] = cpu_clock();
>         } while (upper != old_upper && loop++ < 2);

Actually if we want the best accuracy we can just deal with the lower dword.

We can check the upper one hasn't changed outside of the 2 cpu_clock() 
calls.


-Lionel


_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Intel-gfx] [PATCH] i915/query: Correlate engine and cpu timestamps with better accuracy
@ 2021-03-04  0:11 kernel test robot
  0 siblings, 0 replies; 19+ messages in thread
From: kernel test robot @ 2021-03-04  0:11 UTC (permalink / raw)
  To: kbuild

[-- Attachment #1: Type: text/plain, Size: 4294 bytes --]

CC: kbuild-all(a)lists.01.org
In-Reply-To: <20210303212800.43787-1-umesh.nerlige.ramappa@intel.com>
References: <20210303212800.43787-1-umesh.nerlige.ramappa@intel.com>
TO: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
TO: intel-gfx(a)lists.freedesktop.org
TO: Chris Wilson <chris.p.wilson@intel.com>
TO: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
TO: Lionel G Landwerlin <lionel.g.landwerlin@intel.com>

Hi Umesh,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on drm-tip/drm-tip]
[also build test WARNING on drm-intel/for-linux-next v5.12-rc1 next-20210303]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Umesh-Nerlige-Ramappa/i915-query-Correlate-engine-and-cpu-timestamps-with-better-accuracy/20210304-053002
base:   git://anongit.freedesktop.org/drm/drm-tip drm-tip
:::::: branch date: 3 hours ago
:::::: commit date: 3 hours ago
config: i386-randconfig-m021-20210304 (attached as .config)
compiler: gcc-9 (Debian 9.3.0-22) 9.3.0

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>

smatch warnings:
drivers/gpu/drm/i915/i915_query.c:172 __query_cs_cycles() error: uninitialized symbol 'ret'.

vim +/ret +172 drivers/gpu/drm/i915/i915_query.c

ef81d31bf15aff Umesh Nerlige Ramappa 2021-03-03  141  
ef81d31bf15aff Umesh Nerlige Ramappa 2021-03-03  142  static int
ef81d31bf15aff Umesh Nerlige Ramappa 2021-03-03  143  __query_cs_cycles(struct intel_engine_cs *engine,
ef81d31bf15aff Umesh Nerlige Ramappa 2021-03-03  144  		  u64 *cs_ts, u64 *cpu_ts,
ef81d31bf15aff Umesh Nerlige Ramappa 2021-03-03  145  		  __ktime_func_t cpu_clock)
ef81d31bf15aff Umesh Nerlige Ramappa 2021-03-03  146  {
ef81d31bf15aff Umesh Nerlige Ramappa 2021-03-03  147  	struct intel_uncore *uncore = engine->uncore;
ef81d31bf15aff Umesh Nerlige Ramappa 2021-03-03  148  	enum forcewake_domains fw_domains;
ef81d31bf15aff Umesh Nerlige Ramappa 2021-03-03  149  	u32 base = engine->mmio_base;
ef81d31bf15aff Umesh Nerlige Ramappa 2021-03-03  150  	intel_wakeref_t wakeref;
ef81d31bf15aff Umesh Nerlige Ramappa 2021-03-03  151  	int ret;
ef81d31bf15aff Umesh Nerlige Ramappa 2021-03-03  152  
ef81d31bf15aff Umesh Nerlige Ramappa 2021-03-03  153  	fw_domains = intel_uncore_forcewake_for_reg(uncore,
ef81d31bf15aff Umesh Nerlige Ramappa 2021-03-03  154  						    RING_TIMESTAMP(base),
ef81d31bf15aff Umesh Nerlige Ramappa 2021-03-03  155  						    FW_REG_READ);
ef81d31bf15aff Umesh Nerlige Ramappa 2021-03-03  156  
ef81d31bf15aff Umesh Nerlige Ramappa 2021-03-03  157  	with_intel_runtime_pm(uncore->rpm, wakeref) {
ef81d31bf15aff Umesh Nerlige Ramappa 2021-03-03  158  		spin_lock_irq(&uncore->lock);
ef81d31bf15aff Umesh Nerlige Ramappa 2021-03-03  159  		intel_uncore_forcewake_get__locked(uncore, fw_domains);
ef81d31bf15aff Umesh Nerlige Ramappa 2021-03-03  160  
ef81d31bf15aff Umesh Nerlige Ramappa 2021-03-03  161  		ret = __read_timestamps(uncore,
ef81d31bf15aff Umesh Nerlige Ramappa 2021-03-03  162  					RING_TIMESTAMP(base),
ef81d31bf15aff Umesh Nerlige Ramappa 2021-03-03  163  					RING_TIMESTAMP_UDW(base),
ef81d31bf15aff Umesh Nerlige Ramappa 2021-03-03  164  					cs_ts,
ef81d31bf15aff Umesh Nerlige Ramappa 2021-03-03  165  					cpu_ts,
ef81d31bf15aff Umesh Nerlige Ramappa 2021-03-03  166  					cpu_clock);
ef81d31bf15aff Umesh Nerlige Ramappa 2021-03-03  167  
ef81d31bf15aff Umesh Nerlige Ramappa 2021-03-03  168  		intel_uncore_forcewake_put__locked(uncore, fw_domains);
ef81d31bf15aff Umesh Nerlige Ramappa 2021-03-03  169  		spin_unlock_irq(&uncore->lock);
ef81d31bf15aff Umesh Nerlige Ramappa 2021-03-03  170  	}
ef81d31bf15aff Umesh Nerlige Ramappa 2021-03-03  171  
ef81d31bf15aff Umesh Nerlige Ramappa 2021-03-03 @172  	return ret;
ef81d31bf15aff Umesh Nerlige Ramappa 2021-03-03  173  }
ef81d31bf15aff Umesh Nerlige Ramappa 2021-03-03  174  

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all(a)lists.01.org

[-- Attachment #2: config.gz --]
[-- Type: application/gzip, Size: 32979 bytes --]

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [Intel-gfx] [PATCH] i915/query: Correlate engine and cpu timestamps with better accuracy
  2021-03-03 21:28 [Intel-gfx] [PATCH] " Umesh Nerlige Ramappa
@ 2021-03-04  0:09 ` Chris Wilson
  2021-03-04  8:28   ` Lionel Landwerlin
  2021-03-04  8:32 ` Lionel Landwerlin
  2021-03-04 12:23 ` Lionel Landwerlin
  2 siblings, 1 reply; 19+ messages in thread
From: Chris Wilson @ 2021-03-04  0:09 UTC (permalink / raw)
  To: Lionel G Landwerlin, Tvrtko Ursulin, Umesh Nerlige Ramappa, intel-gfx

Quoting Umesh Nerlige Ramappa (2021-03-03 21:28:00)
> Perf measurements rely on CPU and engine timestamps to correlate
> events of interest across these time domains. Current mechanisms get
> these timestamps separately and the calculated delta between these
> timestamps lack enough accuracy.
> 
> To improve the accuracy of these time measurements to within a few us,
> add a query that returns the engine and cpu timestamps captured as
> close to each other as possible.
> 
> v2: (Tvrtko)
> - document clock reference used
> - return cpu timestamp always
> - capture cpu time just before lower dword of cs timestamp
> 
> v3: (Chris)
> - use uncore-rpm
> - use __query_cs_timestamp helper
> 
> v4: (Lionel)
> - Kernel perf subsytem allows users to specify the clock id to be used
>   in perf_event_open. This clock id is used by the perf subsystem to
>   return the appropriate cpu timestamp in perf events. Similarly, let
>   the user pass the clockid to this query so that cpu timestamp
>   corresponds to the clock id requested.
> 
> v5: (Tvrtko)
> - Use normal ktime accessors instead of fast versions
> - Add more uApi documentation
> 
> v6: (Lionel)
> - Move switch out of spinlock
> 
> v7: (Chris)
> - cs_timestamp is a misnomer, use cs_cycles instead
> - return the cs cycle frequency as well in the query
> 
> v8:
> - Add platform and engine specific checks
> 
> v9: (Lionel)
> - Return 2 cpu timestamps in the query - captured before and after the
>   register read
> 
> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_query.c | 144 ++++++++++++++++++++++++++++++
>  include/uapi/drm/i915_drm.h       |  47 ++++++++++
>  2 files changed, 191 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/i915_query.c b/drivers/gpu/drm/i915/i915_query.c
> index fed337ad7b68..acca22ee6014 100644
> --- a/drivers/gpu/drm/i915/i915_query.c
> +++ b/drivers/gpu/drm/i915/i915_query.c
> @@ -6,6 +6,8 @@
>  
>  #include <linux/nospec.h>
>  
> +#include "gt/intel_engine_pm.h"
> +#include "gt/intel_engine_user.h"
>  #include "i915_drv.h"
>  #include "i915_perf.h"
>  #include "i915_query.h"
> @@ -90,6 +92,147 @@ static int query_topology_info(struct drm_i915_private *dev_priv,
>         return total_length;
>  }
>  
> +typedef u64 (*__ktime_func_t)(void);
> +static __ktime_func_t __clock_id_to_func(clockid_t clk_id)
> +{
> +       /*
> +        * Use logic same as the perf subsystem to allow user to select the
> +        * reference clock id to be used for timestamps.
> +        */
> +       switch (clk_id) {
> +       case CLOCK_MONOTONIC:
> +               return &ktime_get_ns;
> +       case CLOCK_MONOTONIC_RAW:
> +               return &ktime_get_raw_ns;
> +       case CLOCK_REALTIME:
> +               return &ktime_get_real_ns;
> +       case CLOCK_BOOTTIME:
> +               return &ktime_get_boottime_ns;
> +       case CLOCK_TAI:
> +               return &ktime_get_clocktai_ns;
> +       default:
> +               return NULL;
> +       }
> +}
> +
> +static inline int
> +__read_timestamps(struct intel_uncore *uncore,
> +                 i915_reg_t lower_reg,
> +                 i915_reg_t upper_reg,
> +                 u64 *cs_ts,
> +                 u64 *cpu_ts,
> +                 __ktime_func_t cpu_clock)
> +{
> +       u32 upper, lower, old_upper, loop = 0;
> +
> +       upper = intel_uncore_read_fw(uncore, upper_reg);
> +       do {
> +               cpu_ts[0] = cpu_clock();
> +               lower = intel_uncore_read_fw(uncore, lower_reg);
> +               cpu_ts[1] = cpu_clock();
> +               old_upper = upper;
> +               upper = intel_uncore_read_fw(uncore, upper_reg);

Both register reads comprise the timestamp returned to userspace, so
presumably you want cpu_ts[] to wrap both.

       do {
               old_upper = upper;

               cpu_ts[0] = cpu_clock();
               lower = intel_uncore_read_fw(uncore, lower_reg);
               upper = intel_uncore_read_fw(uncore, upper_reg);
               cpu_ts[1] = cpu_clock();
       } while (upper != old_upper && loop++ < 2);
---------------------------------------------------------------------
Intel Corporation (UK) Limited
Registered No. 1134945 (England)
Registered Office: Pipers Way, Swindon SN3 1RJ
VAT No: 860 2173 47

This e-mail and any attachments may contain confidential material for
the sole use of the intended recipient(s). Any review or distribution
by others is strictly prohibited. If you are not the intended
recipient, please contact the sender and delete all copies.
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [Intel-gfx] [PATCH] i915/query: Correlate engine and cpu timestamps with better accuracy
@ 2021-03-03 21:28 Umesh Nerlige Ramappa
  2021-03-04  0:09 ` Chris Wilson
                   ` (2 more replies)
  0 siblings, 3 replies; 19+ messages in thread
From: Umesh Nerlige Ramappa @ 2021-03-03 21:28 UTC (permalink / raw)
  To: intel-gfx, Chris Wilson, Tvrtko Ursulin, Lionel G Landwerlin

Perf measurements rely on CPU and engine timestamps to correlate
events of interest across these time domains. Current mechanisms get
these timestamps separately and the calculated delta between these
timestamps lack enough accuracy.

To improve the accuracy of these time measurements to within a few us,
add a query that returns the engine and cpu timestamps captured as
close to each other as possible.

v2: (Tvrtko)
- document clock reference used
- return cpu timestamp always
- capture cpu time just before lower dword of cs timestamp

v3: (Chris)
- use uncore-rpm
- use __query_cs_timestamp helper

v4: (Lionel)
- Kernel perf subsytem allows users to specify the clock id to be used
  in perf_event_open. This clock id is used by the perf subsystem to
  return the appropriate cpu timestamp in perf events. Similarly, let
  the user pass the clockid to this query so that cpu timestamp
  corresponds to the clock id requested.

v5: (Tvrtko)
- Use normal ktime accessors instead of fast versions
- Add more uApi documentation

v6: (Lionel)
- Move switch out of spinlock

v7: (Chris)
- cs_timestamp is a misnomer, use cs_cycles instead
- return the cs cycle frequency as well in the query

v8:
- Add platform and engine specific checks

v9: (Lionel)
- Return 2 cpu timestamps in the query - captured before and after the
  register read

Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
---
 drivers/gpu/drm/i915/i915_query.c | 144 ++++++++++++++++++++++++++++++
 include/uapi/drm/i915_drm.h       |  47 ++++++++++
 2 files changed, 191 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_query.c b/drivers/gpu/drm/i915/i915_query.c
index fed337ad7b68..acca22ee6014 100644
--- a/drivers/gpu/drm/i915/i915_query.c
+++ b/drivers/gpu/drm/i915/i915_query.c
@@ -6,6 +6,8 @@
 
 #include <linux/nospec.h>
 
+#include "gt/intel_engine_pm.h"
+#include "gt/intel_engine_user.h"
 #include "i915_drv.h"
 #include "i915_perf.h"
 #include "i915_query.h"
@@ -90,6 +92,147 @@ static int query_topology_info(struct drm_i915_private *dev_priv,
 	return total_length;
 }
 
+typedef u64 (*__ktime_func_t)(void);
+static __ktime_func_t __clock_id_to_func(clockid_t clk_id)
+{
+	/*
+	 * Use logic same as the perf subsystem to allow user to select the
+	 * reference clock id to be used for timestamps.
+	 */
+	switch (clk_id) {
+	case CLOCK_MONOTONIC:
+		return &ktime_get_ns;
+	case CLOCK_MONOTONIC_RAW:
+		return &ktime_get_raw_ns;
+	case CLOCK_REALTIME:
+		return &ktime_get_real_ns;
+	case CLOCK_BOOTTIME:
+		return &ktime_get_boottime_ns;
+	case CLOCK_TAI:
+		return &ktime_get_clocktai_ns;
+	default:
+		return NULL;
+	}
+}
+
+static inline int
+__read_timestamps(struct intel_uncore *uncore,
+		  i915_reg_t lower_reg,
+		  i915_reg_t upper_reg,
+		  u64 *cs_ts,
+		  u64 *cpu_ts,
+		  __ktime_func_t cpu_clock)
+{
+	u32 upper, lower, old_upper, loop = 0;
+
+	upper = intel_uncore_read_fw(uncore, upper_reg);
+	do {
+		cpu_ts[0] = cpu_clock();
+		lower = intel_uncore_read_fw(uncore, lower_reg);
+		cpu_ts[1] = cpu_clock();
+		old_upper = upper;
+		upper = intel_uncore_read_fw(uncore, upper_reg);
+	} while (upper != old_upper && loop++ < 2);
+
+	*cs_ts = (u64)upper << 32 | lower;
+
+	return 0;
+}
+
+static int
+__query_cs_cycles(struct intel_engine_cs *engine,
+		  u64 *cs_ts, u64 *cpu_ts,
+		  __ktime_func_t cpu_clock)
+{
+	struct intel_uncore *uncore = engine->uncore;
+	enum forcewake_domains fw_domains;
+	u32 base = engine->mmio_base;
+	intel_wakeref_t wakeref;
+	int ret;
+
+	fw_domains = intel_uncore_forcewake_for_reg(uncore,
+						    RING_TIMESTAMP(base),
+						    FW_REG_READ);
+
+	with_intel_runtime_pm(uncore->rpm, wakeref) {
+		spin_lock_irq(&uncore->lock);
+		intel_uncore_forcewake_get__locked(uncore, fw_domains);
+
+		ret = __read_timestamps(uncore,
+					RING_TIMESTAMP(base),
+					RING_TIMESTAMP_UDW(base),
+					cs_ts,
+					cpu_ts,
+					cpu_clock);
+
+		intel_uncore_forcewake_put__locked(uncore, fw_domains);
+		spin_unlock_irq(&uncore->lock);
+	}
+
+	return ret;
+}
+
+static int
+query_cs_cycles(struct drm_i915_private *i915,
+		struct drm_i915_query_item *query_item)
+{
+	struct drm_i915_query_cs_cycles __user *query_ptr;
+	struct drm_i915_query_cs_cycles query;
+	struct intel_engine_cs *engine;
+	__ktime_func_t cpu_clock;
+	int ret;
+
+	if (INTEL_GEN(i915) < 6)
+		return -ENODEV;
+
+	query_ptr = u64_to_user_ptr(query_item->data_ptr);
+	ret = copy_query_item(&query, sizeof(query), sizeof(query), query_item);
+	if (ret != 0)
+		return ret;
+
+	if (query.flags)
+		return -EINVAL;
+
+	if (query.rsvd)
+		return -EINVAL;
+
+	cpu_clock = __clock_id_to_func(query.clockid);
+	if (!cpu_clock)
+		return -EINVAL;
+
+	engine = intel_engine_lookup_user(i915,
+					  query.engine.engine_class,
+					  query.engine.engine_instance);
+	if (!engine)
+		return -EINVAL;
+
+	if (IS_GEN(i915, 6) &&
+	    query.engine.engine_class != I915_ENGINE_CLASS_RENDER)
+		return -ENODEV;
+
+	query.cs_frequency = engine->gt->clock_frequency;
+	ret = __query_cs_cycles(engine,
+				&query.cs_cycles,
+				query.cpu_timestamp,
+				cpu_clock);
+	if (ret)
+		return ret;
+
+	if (put_user(query.cs_frequency, &query_ptr->cs_frequency))
+		return -EFAULT;
+
+	if (put_user(query.cpu_timestamp[0], &query_ptr->cpu_timestamp[0]))
+		return -EFAULT;
+
+	if (put_user(query.cpu_timestamp[1], &query_ptr->cpu_timestamp[1]))
+		return -EFAULT;
+
+	if (put_user(query.cs_cycles, &query_ptr->cs_cycles))
+		return -EFAULT;
+
+	return sizeof(query);
+}
+
 static int
 query_engine_info(struct drm_i915_private *i915,
 		  struct drm_i915_query_item *query_item)
@@ -424,6 +567,7 @@ static int (* const i915_query_funcs[])(struct drm_i915_private *dev_priv,
 	query_topology_info,
 	query_engine_info,
 	query_perf_config,
+	query_cs_cycles,
 };
 
 int i915_query_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 1987e2ea79a3..abcc479e2be1 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -2176,6 +2176,10 @@ struct drm_i915_query_item {
 #define DRM_I915_QUERY_TOPOLOGY_INFO    1
 #define DRM_I915_QUERY_ENGINE_INFO	2
 #define DRM_I915_QUERY_PERF_CONFIG      3
+	/**
+	 * Query Command Streamer timestamp register.
+	 */
+#define DRM_I915_QUERY_CS_CYCLES	4
 /* Must be kept compact -- no holes and well documented */
 
 	/*
@@ -2309,6 +2313,49 @@ struct drm_i915_engine_info {
 	__u64 rsvd1[4];
 };
 
+/**
+ * struct drm_i915_query_cs_cycles
+ *
+ * The query returns the command streamer cycles and the frequency that can be
+ * used to calculate the command streamer timestamp. In addition the query
+ * returns the cpu timestamp that indicates when the command streamer cycle
+ * count was captured.
+ */
+struct drm_i915_query_cs_cycles {
+	/** Engine for which command streamer cycles is queried. */
+	struct i915_engine_class_instance engine;
+
+	/** Must be zero. */
+	__u32 flags;
+
+	/**
+	 * Command streamer cycles as read from the command streamer
+	 * register at 0x358 offset.
+	 */
+	__u64 cs_cycles;
+
+	/** Frequency of the cs cycles in Hz. */
+	__u64 cs_frequency;
+
+	/**
+	 * CPU timestamp in nanoseconds. cpu_timestamp[0] is captured before
+	 * reading the cs_cycles register and cpu_timestamp[1] is captured after
+	 * reading the register.
+	 **/
+	__u64 cpu_timestamp[2];
+
+	/**
+	 * Reference clock id for CPU timestamp. For definition, see
+	 * clock_gettime(2) and perf_event_open(2). Supported clock ids are
+	 * CLOCK_MONOTONIC, CLOCK_MONOTONIC_RAW, CLOCK_REALTIME, CLOCK_BOOTTIME,
+	 * CLOCK_TAI.
+	 */
+	__s32 clockid;
+
+	/** Must be zero. */
+	__u32 rsvd;
+};
+
 /**
  * struct drm_i915_query_engine_info
  *
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2021-03-05 19:38 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-03-02 18:29 [Intel-gfx] [PATCH] i915/query: Correlate engine and cpu timestamps with better accuracy Umesh Nerlige Ramappa
2021-03-02 20:35 ` Lionel Landwerlin
2021-03-03  0:12   ` Umesh Nerlige Ramappa
2021-03-03  9:21     ` Lionel Landwerlin
2021-03-03 16:27       ` Umesh Nerlige Ramappa
2021-03-03 16:28         ` Lionel Landwerlin
2021-03-03 17:02 ` [Intel-gfx] ✓ Fi.CI.BAT: success for " Patchwork
2021-03-03 20:23 ` [Intel-gfx] ✓ Fi.CI.IGT: " Patchwork
2021-03-03 21:28 [Intel-gfx] [PATCH] " Umesh Nerlige Ramappa
2021-03-04  0:09 ` Chris Wilson
2021-03-04  8:28   ` Lionel Landwerlin
2021-03-04  8:58     ` Chris Wilson
2021-03-04  9:45       ` Lionel Landwerlin
2021-03-04  9:54         ` Chris Wilson
2021-03-04 10:27           ` Lionel Landwerlin
2021-03-04  8:32 ` Lionel Landwerlin
2021-03-04 12:23 ` Lionel Landwerlin
2021-03-04  0:11 kernel test robot
2021-03-05 19:37 Umesh Nerlige Ramappa

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.