All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/3] drm/msm: Revive GPU workload profiling
@ 2018-10-17 13:03 Sharat Masetty
       [not found] ` <1539781441-13076-1-git-send-email-smasetty-sgV2jX0FEOL9JmXXK+q4OQ@public.gmane.org>
  0 siblings, 1 reply; 9+ messages in thread
From: Sharat Masetty @ 2018-10-17 13:03 UTC (permalink / raw)
  To: freedreno; +Cc: linux-arm-msm, Sharat Masetty, dri-devel

I was looking for a simple mechanism to measure GPU workloads, it turns out
that the support is already present in the msm driver. In this series I try to
make use of the hardware counters where possible for a more accurate workload
estimation. Please review and share your thoughts on this.

I could have piggybacked on the devfreq samples and accumulated the busytimes,
but I chose not to, since it puts a dependency on the gpu devfreq which in
itself is an optional feature. There is one pending issue with GPU recovery +
sampling use case, which I plan to address soon.

Sharat Masetty (3):
  drm/msm: Change gpu_busy() function
  drm/msm/a6xx: Move power counter selectable to resume()
  drm/msm: Use Hardware counters for perf profiling

 drivers/gpu/drm/msm/adreno/a5xx_gpu.c |  6 +++---
 drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 22 +++++++++++-----------
 drivers/gpu/drm/msm/msm_gpu.c         | 33 ++++++++++++++++++++++++++++-----
 drivers/gpu/drm/msm/msm_gpu.h         |  7 ++++---
 drivers/gpu/drm/msm/msm_perf.c        | 10 +++++-----
 5 files changed, 51 insertions(+), 27 deletions(-)

--
1.9.1

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH 1/3] drm/msm: Change gpu_busy() function
       [not found] ` <1539781441-13076-1-git-send-email-smasetty-sgV2jX0FEOL9JmXXK+q4OQ@public.gmane.org>
@ 2018-10-17 13:03   ` Sharat Masetty
       [not found]     ` <1539781441-13076-2-git-send-email-smasetty-sgV2jX0FEOL9JmXXK+q4OQ@public.gmane.org>
  2018-10-17 13:04   ` [PATCH 2/3] drm/msm/a6xx: Move power counter selectable to resume() Sharat Masetty
  2018-10-17 13:04   ` [PATCH 3/3] drm/msm: Use Hardware counters for perf profiling Sharat Masetty
  2 siblings, 1 reply; 9+ messages in thread
From: Sharat Masetty @ 2018-10-17 13:03 UTC (permalink / raw)
  To: freedreno-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: linux-arm-msm-u79uwXL29TY76Z2rM5mHXA,
	jcrouse-sgV2jX0FEOL9JmXXK+q4OQ, Sharat Masetty,
	dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

The gpu_busy() function turns out is useful for performance profiling as
well. So we change the input params to pass in the pointer to the previous
busy cycles, this makes the function more generic and removes the dependency
on the gpu devfreq.

Signed-off-by: Sharat Masetty <smasetty@codeaurora.org>
---
 drivers/gpu/drm/msm/adreno/a5xx_gpu.c | 6 +++---
 drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 6 +++---
 drivers/gpu/drm/msm/msm_gpu.c         | 3 ++-
 drivers/gpu/drm/msm/msm_gpu.h         | 2 +-
 4 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
index 236a6ed..b5df80c 100644
--- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
@@ -1428,7 +1428,7 @@ static struct msm_ringbuffer *a5xx_active_ring(struct msm_gpu *gpu)
 	return a5xx_gpu->cur_ring;
 }
 
-static unsigned long a5xx_gpu_busy(struct msm_gpu *gpu)
+static unsigned long a5xx_gpu_busy(struct msm_gpu *gpu, u64 *prev_cycles)
 {
 	u64 busy_cycles;
 	unsigned long busy_time;
@@ -1436,10 +1436,10 @@ static unsigned long a5xx_gpu_busy(struct msm_gpu *gpu)
 	busy_cycles = gpu_read64(gpu, REG_A5XX_RBBM_PERFCTR_RBBM_0_LO,
 			REG_A5XX_RBBM_PERFCTR_RBBM_0_HI);
 
-	busy_time = (busy_cycles - gpu->devfreq.busy_cycles) /
+	busy_time = (busy_cycles - *prev_cycles) /
 		(clk_get_rate(gpu->core_clk) / 1000000);
 
-	gpu->devfreq.busy_cycles = busy_cycles;
+	*prev_cycles = busy_cycles;
 
 	return busy_time;
 }
diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
index 07c3c3c..c0cd3ac 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
@@ -722,7 +722,7 @@ static void a6xx_destroy(struct msm_gpu *gpu)
 	kfree(a6xx_gpu);
 }
 
-static unsigned long a6xx_gpu_busy(struct msm_gpu *gpu)
+static unsigned long a6xx_gpu_busy(struct msm_gpu *gpu, u64 *prev_cycles)
 {
 	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
 	struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
@@ -733,9 +733,9 @@ static unsigned long a6xx_gpu_busy(struct msm_gpu *gpu)
 			REG_A6XX_GMU_CX_GMU_POWER_COUNTER_XOCLK_0_L,
 			REG_A6XX_GMU_CX_GMU_POWER_COUNTER_XOCLK_0_H);
 
-	busy_time = ((busy_cycles - gpu->devfreq.busy_cycles) * 10) / 192;
+	busy_time = ((busy_cycles - *prev_cycles) * 10) / 192;
 
-	gpu->devfreq.busy_cycles = busy_cycles;
+	*prev_cycles = busy_cycles;
 
 	return busy_time;
 }
diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
index ca573f6..e9b5426 100644
--- a/drivers/gpu/drm/msm/msm_gpu.c
+++ b/drivers/gpu/drm/msm/msm_gpu.c
@@ -62,7 +62,8 @@ static int msm_devfreq_get_dev_status(struct device *dev,
 	else
 		status->current_frequency = clk_get_rate(gpu->core_clk);
 
-	status->busy_time = gpu->funcs->gpu_busy(gpu);
+	status->busy_time = gpu->funcs->gpu_busy(gpu,
+			&gpu->devfreq.busy_cycles);
 
 	time = ktime_get();
 	status->total_time = ktime_us_delta(time, gpu->devfreq.time);
diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
index 9df48e3..0ff23ca 100644
--- a/drivers/gpu/drm/msm/msm_gpu.h
+++ b/drivers/gpu/drm/msm/msm_gpu.h
@@ -71,7 +71,7 @@ struct msm_gpu_funcs {
 	/* for generation specific debugfs: */
 	int (*debugfs_init)(struct msm_gpu *gpu, struct drm_minor *minor);
 #endif
-	unsigned long (*gpu_busy)(struct msm_gpu *gpu);
+	unsigned long (*gpu_busy)(struct msm_gpu *gpu, u64 *busy_cycles);
 	struct msm_gpu_state *(*gpu_state_get)(struct msm_gpu *gpu);
 	int (*gpu_state_put)(struct msm_gpu_state *state);
 	unsigned long (*gpu_get_freq)(struct msm_gpu *gpu);
-- 
1.9.1

_______________________________________________
Freedreno mailing list
Freedreno@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/freedreno

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH 2/3] drm/msm/a6xx: Move power counter selectable to resume()
       [not found] ` <1539781441-13076-1-git-send-email-smasetty-sgV2jX0FEOL9JmXXK+q4OQ@public.gmane.org>
  2018-10-17 13:03   ` [PATCH 1/3] drm/msm: Change gpu_busy() function Sharat Masetty
@ 2018-10-17 13:04   ` Sharat Masetty
       [not found]     ` <1539781441-13076-3-git-send-email-smasetty-sgV2jX0FEOL9JmXXK+q4OQ@public.gmane.org>
  2018-10-17 13:04   ` [PATCH 3/3] drm/msm: Use Hardware counters for perf profiling Sharat Masetty
  2 siblings, 1 reply; 9+ messages in thread
From: Sharat Masetty @ 2018-10-17 13:04 UTC (permalink / raw)
  To: freedreno-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: linux-arm-msm-u79uwXL29TY76Z2rM5mHXA,
	jcrouse-sgV2jX0FEOL9JmXXK+q4OQ, Sharat Masetty,
	dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Moving this to resume helps with both GPU DCVS and the performance
sampling. For GPU DCVS this makes sure that the frequency does not scale
when there are no GPU submissions. In the case of performance profiling,
the GPU is UP, but its possible that the hw_init() was not called yet,
so we need this to get sane perf values in all cases.

Signed-off-by: Sharat Masetty <smasetty@codeaurora.org>
---
 drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
index c0cd3ac..2c52b7c 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
@@ -387,14 +387,6 @@ static int a6xx_hw_init(struct msm_gpu *gpu)
 	/* Select CP0 to always count cycles */
 	gpu_write(gpu, REG_A6XX_CP_PERFCTR_CP_SEL_0, PERF_CP_ALWAYS_COUNT);
 
-	/* FIXME: not sure if this should live here or in a6xx_gmu.c */
-	gmu_write(&a6xx_gpu->gmu,  REG_A6XX_GPU_GMU_AO_GPU_CX_BUSY_MASK,
-		0xff000000);
-	gmu_rmw(&a6xx_gpu->gmu, REG_A6XX_GMU_CX_GMU_POWER_COUNTER_SELECT_0,
-		0xff, 0x20);
-	gmu_write(&a6xx_gpu->gmu, REG_A6XX_GMU_CX_GMU_POWER_COUNTER_ENABLE,
-		0x01);
-
 	gpu_write(gpu, REG_A6XX_RB_NC_MODE_CNTL, 2 << 1);
 	gpu_write(gpu, REG_A6XX_TPL1_NC_MODE_CNTL, 2 << 1);
 	gpu_write(gpu, REG_A6XX_SP_NC_MODE_CNTL, 2 << 1);
@@ -655,6 +647,14 @@ static int a6xx_pm_resume(struct msm_gpu *gpu)
 
 	gpu->needs_hw_init = true;
 
+	/* FIXME: not sure if this should live here or in a6xx_gmu.c */
+	gmu_write(&a6xx_gpu->gmu,  REG_A6XX_GPU_GMU_AO_GPU_CX_BUSY_MASK,
+		0xff000000);
+	gmu_rmw(&a6xx_gpu->gmu, REG_A6XX_GMU_CX_GMU_POWER_COUNTER_SELECT_0,
+		0xff, 0x20);
+	gmu_write(&a6xx_gpu->gmu, REG_A6XX_GMU_CX_GMU_POWER_COUNTER_ENABLE,
+		0x01);
+
 	msm_gpu_resume_devfreq(gpu);
 
 	return ret;
-- 
1.9.1

_______________________________________________
Freedreno mailing list
Freedreno@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/freedreno

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH 3/3] drm/msm: Use Hardware counters for perf profiling
       [not found] ` <1539781441-13076-1-git-send-email-smasetty-sgV2jX0FEOL9JmXXK+q4OQ@public.gmane.org>
  2018-10-17 13:03   ` [PATCH 1/3] drm/msm: Change gpu_busy() function Sharat Masetty
  2018-10-17 13:04   ` [PATCH 2/3] drm/msm/a6xx: Move power counter selectable to resume() Sharat Masetty
@ 2018-10-17 13:04   ` Sharat Masetty
       [not found]     ` <1539781441-13076-4-git-send-email-smasetty-sgV2jX0FEOL9JmXXK+q4OQ@public.gmane.org>
  2 siblings, 1 reply; 9+ messages in thread
From: Sharat Masetty @ 2018-10-17 13:04 UTC (permalink / raw)
  To: freedreno-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: linux-arm-msm-u79uwXL29TY76Z2rM5mHXA,
	jcrouse-sgV2jX0FEOL9JmXXK+q4OQ, Sharat Masetty,
	dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

This patch attempts to make use of the hardware counters for GPU busy %
estimation when possible and skip using the software counters as it also
accounts for software side delays. This should help give more accurate
representation of the GPU workload.

Signed-off-by: Sharat Masetty <smasetty@codeaurora.org>
---
 drivers/gpu/drm/msm/msm_gpu.c  | 30 ++++++++++++++++++++++++++----
 drivers/gpu/drm/msm/msm_gpu.h  |  5 +++--
 drivers/gpu/drm/msm/msm_perf.c | 10 +++++-----
 3 files changed, 34 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
index e9b5426..a896541 100644
--- a/drivers/gpu/drm/msm/msm_gpu.c
+++ b/drivers/gpu/drm/msm/msm_gpu.c
@@ -592,6 +592,9 @@ static void update_sw_cntrs(struct msm_gpu *gpu)
 	uint32_t elapsed;
 	unsigned long flags;
 
+	if (gpu->funcs->gpu_busy)
+		return;
+
 	spin_lock_irqsave(&gpu->perf_lock, flags);
 	if (!gpu->perfcntr_active)
 		goto out;
@@ -620,6 +623,7 @@ void msm_gpu_perfcntr_start(struct msm_gpu *gpu)
 	/* we could dynamically enable/disable perfcntr registers too.. */
 	gpu->last_sample.active = msm_gpu_active(gpu);
 	gpu->last_sample.time = ktime_get();
+	gpu->last_sample.busy_cycles = 0;
 	gpu->activetime = gpu->totaltime = 0;
 	gpu->perfcntr_active = true;
 	update_hw_cntrs(gpu, 0, NULL);
@@ -632,9 +636,22 @@ void msm_gpu_perfcntr_stop(struct msm_gpu *gpu)
 	pm_runtime_put_sync(&gpu->pdev->dev);
 }
 
+static void msm_gpu_hw_sample(struct msm_gpu *gpu, uint64_t *activetime,
+		uint64_t *totaltime)
+{
+	ktime_t time;
+
+	*activetime = gpu->funcs->gpu_busy(gpu,
+			&gpu->last_sample.busy_cycles);
+
+	time = ktime_get();
+	*totaltime = ktime_us_delta(time, gpu->last_sample.time);
+	gpu->last_sample.time = time;
+}
+
 /* returns -errno or # of cntrs sampled */
-int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime,
-		uint32_t *totaltime, uint32_t ncntrs, uint32_t *cntrs)
+int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint64_t *activetime,
+		uint64_t *totaltime, uint32_t ncntrs, uint32_t *cntrs)
 {
 	unsigned long flags;
 	int ret;
@@ -646,13 +663,18 @@ int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime,
 		goto out;
 	}
 
+	ret = update_hw_cntrs(gpu, ncntrs, cntrs);
+
+	if (gpu->funcs->gpu_busy) {
+		msm_gpu_hw_sample(gpu, activetime, totaltime);
+		goto out;
+	}
+
 	*activetime = gpu->activetime;
 	*totaltime = gpu->totaltime;
 
 	gpu->activetime = gpu->totaltime = 0;
 
-	ret = update_hw_cntrs(gpu, ncntrs, cntrs);
-
 out:
 	spin_unlock_irqrestore(&gpu->perf_lock, flags);
 
diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
index 0ff23ca..7dc775f 100644
--- a/drivers/gpu/drm/msm/msm_gpu.h
+++ b/drivers/gpu/drm/msm/msm_gpu.h
@@ -90,6 +90,7 @@ struct msm_gpu {
 	struct {
 		bool active;
 		ktime_t time;
+		u64 busy_cycles;
 	} last_sample;
 	uint32_t totaltime, activetime;    /* sw counters */
 	uint32_t last_cntrs[5];            /* hw counters */
@@ -275,8 +276,8 @@ static inline void gpu_write64(struct msm_gpu *gpu, u32 lo, u32 hi, u64 val)
 
 void msm_gpu_perfcntr_start(struct msm_gpu *gpu);
 void msm_gpu_perfcntr_stop(struct msm_gpu *gpu);
-int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime,
-		uint32_t *totaltime, uint32_t ncntrs, uint32_t *cntrs);
+int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint64_t *activetime,
+		uint64_t *totaltime, uint32_t ncntrs, uint32_t *cntrs);
 
 void msm_gpu_retire(struct msm_gpu *gpu);
 void msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit,
diff --git a/drivers/gpu/drm/msm/msm_perf.c b/drivers/gpu/drm/msm/msm_perf.c
index 5ab21bd..318f7dd 100644
--- a/drivers/gpu/drm/msm/msm_perf.c
+++ b/drivers/gpu/drm/msm/msm_perf.c
@@ -17,7 +17,7 @@
 
 /* For profiling, userspace can:
  *
- *   tail -f /sys/kernel/debug/dri/<minor>/gpu
+ *   tail -f /sys/kernel/debug/dri/<minor>/perf
  *
  * This will enable performance counters/profiling to track the busy time
  * and any gpu specific performance counters that are supported.
@@ -85,9 +85,9 @@ static int refill_buf(struct msm_perf_state *perf)
 		}
 	} else {
 		/* Sample line: */
-		uint32_t activetime = 0, totaltime = 0;
+		uint64_t activetime = 0, totaltime = 0;
 		uint32_t cntrs[5];
-		uint32_t val;
+		uint64_t val;
 		int ret;
 
 		/* sleep until next sample time: */
@@ -101,14 +101,14 @@ static int refill_buf(struct msm_perf_state *perf)
 			return ret;
 
 		val = totaltime ? 1000 * activetime / totaltime : 0;
-		n = snprintf(ptr, rem, "%3d.%d%%", val / 10, val % 10);
+		n = snprintf(ptr, rem, "%3llu.%llu%%", val / 10, val % 10);
 		ptr += n;
 		rem -= n;
 
 		for (i = 0; i < ret; i++) {
 			/* cycle counters (I think).. convert to MHz.. */
 			val = cntrs[i] / 10000;
-			n = snprintf(ptr, rem, "\t%5d.%02d",
+			n = snprintf(ptr, rem, "\t%5llu.%02llu",
 					val / 100, val % 100);
 			ptr += n;
 			rem -= n;
-- 
1.9.1

_______________________________________________
Freedreno mailing list
Freedreno@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/freedreno

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH 2/3] drm/msm/a6xx: Move power counter selectable to resume()
       [not found]     ` <1539781441-13076-3-git-send-email-smasetty-sgV2jX0FEOL9JmXXK+q4OQ@public.gmane.org>
@ 2018-10-17 14:09       ` Jordan Crouse
  0 siblings, 0 replies; 9+ messages in thread
From: Jordan Crouse @ 2018-10-17 14:09 UTC (permalink / raw)
  To: Sharat Masetty
  Cc: linux-arm-msm-u79uwXL29TY76Z2rM5mHXA,
	freedreno-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

On Wed, Oct 17, 2018 at 06:34:00PM +0530, Sharat Masetty wrote:
> Moving this to resume helps with both GPU DCVS and the performance
> sampling. For GPU DCVS this makes sure that the frequency does not scale
> when there are no GPU submissions. In the case of performance profiling,
> the GPU is UP, but its possible that the hw_init() was not called yet,
> so we need this to get sane perf values in all cases.

This description is worded oddly to me but I think the gist is that you
want the counters running before you start sampling. I agree which is why I
think that the /* FIXME: */ is right and this should go into in the GMU code
(a6xx_rpmh_start to be exact). The timing should still work out and the GMU code
can stay with the GMU code which is always a bonus.

> Signed-off-by: Sharat Masetty <smasetty@codeaurora.org>
> ---
>  drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 16 ++++++++--------
>  1 file changed, 8 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> index c0cd3ac..2c52b7c 100644
> --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> @@ -387,14 +387,6 @@ static int a6xx_hw_init(struct msm_gpu *gpu)
>  	/* Select CP0 to always count cycles */
>  	gpu_write(gpu, REG_A6XX_CP_PERFCTR_CP_SEL_0, PERF_CP_ALWAYS_COUNT);
>  
> -	/* FIXME: not sure if this should live here or in a6xx_gmu.c */
> -	gmu_write(&a6xx_gpu->gmu,  REG_A6XX_GPU_GMU_AO_GPU_CX_BUSY_MASK,
> -		0xff000000);
> -	gmu_rmw(&a6xx_gpu->gmu, REG_A6XX_GMU_CX_GMU_POWER_COUNTER_SELECT_0,
> -		0xff, 0x20);
> -	gmu_write(&a6xx_gpu->gmu, REG_A6XX_GMU_CX_GMU_POWER_COUNTER_ENABLE,
> -		0x01);
> -
>  	gpu_write(gpu, REG_A6XX_RB_NC_MODE_CNTL, 2 << 1);
>  	gpu_write(gpu, REG_A6XX_TPL1_NC_MODE_CNTL, 2 << 1);
>  	gpu_write(gpu, REG_A6XX_SP_NC_MODE_CNTL, 2 << 1);
> @@ -655,6 +647,14 @@ static int a6xx_pm_resume(struct msm_gpu *gpu)
>  
>  	gpu->needs_hw_init = true;
>  
> +	/* FIXME: not sure if this should live here or in a6xx_gmu.c */

If in case you don't move this,  this line needs to go away.

> +	gmu_write(&a6xx_gpu->gmu,  REG_A6XX_GPU_GMU_AO_GPU_CX_BUSY_MASK,
> +		0xff000000);
> +	gmu_rmw(&a6xx_gpu->gmu, REG_A6XX_GMU_CX_GMU_POWER_COUNTER_SELECT_0,
> +		0xff, 0x20);
> +	gmu_write(&a6xx_gpu->gmu, REG_A6XX_GMU_CX_GMU_POWER_COUNTER_ENABLE,
> +		0x01);
> +
>  	msm_gpu_resume_devfreq(gpu);
>  
>  	return ret;
> -- 
> 1.9.1
> 

-- 
The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
a Linux Foundation Collaborative Project
_______________________________________________
Freedreno mailing list
Freedreno@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/freedreno

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 3/3] drm/msm: Use Hardware counters for perf profiling
       [not found]     ` <1539781441-13076-4-git-send-email-smasetty-sgV2jX0FEOL9JmXXK+q4OQ@public.gmane.org>
@ 2018-10-17 14:35       ` Jordan Crouse
       [not found]         ` <20181017143548.GD4751-9PYrDHPZ2Orvke4nUoYGnHL1okKdlPRT@public.gmane.org>
  0 siblings, 1 reply; 9+ messages in thread
From: Jordan Crouse @ 2018-10-17 14:35 UTC (permalink / raw)
  To: Sharat Masetty
  Cc: linux-arm-msm-u79uwXL29TY76Z2rM5mHXA,
	freedreno-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

On Wed, Oct 17, 2018 at 06:34:01PM +0530, Sharat Masetty wrote:
> This patch attempts to make use of the hardware counters for GPU busy %
> estimation when possible and skip using the software counters as it also
> accounts for software side delays. This should help give more accurate
> representation of the GPU workload.

I would leave this more to Rob as this particular file makes more sense for
freedreno than it does for us but I think in general if you want to do this
then we should do use the hardware for all targets and get rid of the
mix of the old and the new.

> Signed-off-by: Sharat Masetty <smasetty@codeaurora.org>
> ---
>  drivers/gpu/drm/msm/msm_gpu.c  | 30 ++++++++++++++++++++++++++----
>  drivers/gpu/drm/msm/msm_gpu.h  |  5 +++--
>  drivers/gpu/drm/msm/msm_perf.c | 10 +++++-----
>  3 files changed, 34 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
> index e9b5426..a896541 100644
> --- a/drivers/gpu/drm/msm/msm_gpu.c
> +++ b/drivers/gpu/drm/msm/msm_gpu.c
> @@ -592,6 +592,9 @@ static void update_sw_cntrs(struct msm_gpu *gpu)
>  	uint32_t elapsed;
>  	unsigned long flags;
>  
> +	if (gpu->funcs->gpu_busy)
> +		return;

Like here - there isn't any reason to not have a gpu_busy for each target
so then this code could mostly go away.

>  	spin_lock_irqsave(&gpu->perf_lock, flags);
>  	if (!gpu->perfcntr_active)
>  		goto out;
> @@ -620,6 +623,7 @@ void msm_gpu_perfcntr_start(struct msm_gpu *gpu)
>  	/* we could dynamically enable/disable perfcntr registers too.. */
>  	gpu->last_sample.active = msm_gpu_active(gpu);
>  	gpu->last_sample.time = ktime_get();
> +	gpu->last_sample.busy_cycles = 0;
>  	gpu->activetime = gpu->totaltime = 0;
>  	gpu->perfcntr_active = true;
>  	update_hw_cntrs(gpu, 0, NULL);
> @@ -632,9 +636,22 @@ void msm_gpu_perfcntr_stop(struct msm_gpu *gpu)
>  	pm_runtime_put_sync(&gpu->pdev->dev);
>  }
>  
> +static void msm_gpu_hw_sample(struct msm_gpu *gpu, uint64_t *activetime,
> +		uint64_t *totaltime)
> +{
> +	ktime_t time;
> +
> +	*activetime = gpu->funcs->gpu_busy(gpu,
> +			&gpu->last_sample.busy_cycles);
> +
> +	time = ktime_get();
> +	*totaltime = ktime_us_delta(time, gpu->last_sample.time);
> +	gpu->last_sample.time = time;
> +}

This can all be done inline in the sample function below.

>  /* returns -errno or # of cntrs sampled */
> -int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime,
> -		uint32_t *totaltime, uint32_t ncntrs, uint32_t *cntrs)
> +int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint64_t *activetime,
> +		uint64_t *totaltime, uint32_t ncntrs, uint32_t *cntrs)

This might be an good change (though if our activetime or totaltime ever
goes over 32 bits we've got ourselves a problem) - but it should be in a
separate patch.

>  {
>  	unsigned long flags;
>  	int ret;
> @@ -646,13 +663,18 @@ int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime,
>  		goto out;
>  	}
>  
> +	ret = update_hw_cntrs(gpu, ncntrs, cntrs);
> +
> +	if (gpu->funcs->gpu_busy) {
> +		msm_gpu_hw_sample(gpu, activetime, totaltime);
> +		goto out;
> +	}
> +
>  	*activetime = gpu->activetime;
>  	*totaltime = gpu->totaltime;
>  
>  	gpu->activetime = gpu->totaltime = 0;
>  
> -	ret = update_hw_cntrs(gpu, ncntrs, cntrs);
> -
>  out:
>  	spin_unlock_irqrestore(&gpu->perf_lock, flags);
>  
> diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
> index 0ff23ca..7dc775f 100644
> --- a/drivers/gpu/drm/msm/msm_gpu.h
> +++ b/drivers/gpu/drm/msm/msm_gpu.h
> @@ -90,6 +90,7 @@ struct msm_gpu {
>  	struct {
>  		bool active;
>  		ktime_t time;
> +		u64 busy_cycles;
>  	} last_sample;
>  	uint32_t totaltime, activetime;    /* sw counters */
>  	uint32_t last_cntrs[5];            /* hw counters */
> @@ -275,8 +276,8 @@ static inline void gpu_write64(struct msm_gpu *gpu, u32 lo, u32 hi, u64 val)
>  
>  void msm_gpu_perfcntr_start(struct msm_gpu *gpu);
>  void msm_gpu_perfcntr_stop(struct msm_gpu *gpu);
> -int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime,
> -		uint32_t *totaltime, uint32_t ncntrs, uint32_t *cntrs);
> +int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint64_t *activetime,
> +		uint64_t *totaltime, uint32_t ncntrs, uint32_t *cntrs);
>  
>  void msm_gpu_retire(struct msm_gpu *gpu);
>  void msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit,
> diff --git a/drivers/gpu/drm/msm/msm_perf.c b/drivers/gpu/drm/msm/msm_perf.c
> index 5ab21bd..318f7dd 100644
> --- a/drivers/gpu/drm/msm/msm_perf.c
> +++ b/drivers/gpu/drm/msm/msm_perf.c
> @@ -17,7 +17,7 @@
>  
>  /* For profiling, userspace can:
>   *
> - *   tail -f /sys/kernel/debug/dri/<minor>/gpu
> + *   tail -f /sys/kernel/debug/dri/<minor>/perf
>   *
>   * This will enable performance counters/profiling to track the busy time
>   * and any gpu specific performance counters that are supported.
> @@ -85,9 +85,9 @@ static int refill_buf(struct msm_perf_state *perf)
>  		}
>  	} else {
>  		/* Sample line: */
> -		uint32_t activetime = 0, totaltime = 0;
> +		uint64_t activetime = 0, totaltime = 0;

All the changes to msm_perf.c shuld also be in a separate patch that moves
the sample size to u64.

>  		uint32_t cntrs[5];
> -		uint32_t val;
> +		uint64_t val;
>  		int ret;
>  
>  		/* sleep until next sample time: */
> @@ -101,14 +101,14 @@ static int refill_buf(struct msm_perf_state *perf)
>  			return ret;
>  
>  		val = totaltime ? 1000 * activetime / totaltime : 0;
> -		n = snprintf(ptr, rem, "%3d.%d%%", val / 10, val % 10);
> +		n = snprintf(ptr, rem, "%3llu.%llu%%", val / 10, val % 10);
>  		ptr += n;
>  		rem -= n;
>  
>  		for (i = 0; i < ret; i++) {
>  			/* cycle counters (I think).. convert to MHz.. */
>  			val = cntrs[i] / 10000;
> -			n = snprintf(ptr, rem, "\t%5d.%02d",
> +			n = snprintf(ptr, rem, "\t%5llu.%02llu",
>  					val / 100, val % 100);
>  			ptr += n;
>  			rem -= n;
> -- 
> 1.9.1
> 

-- 
The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
a Linux Foundation Collaborative Project
_______________________________________________
Freedreno mailing list
Freedreno@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/freedreno

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 1/3] drm/msm: Change gpu_busy() function
       [not found]     ` <1539781441-13076-2-git-send-email-smasetty-sgV2jX0FEOL9JmXXK+q4OQ@public.gmane.org>
@ 2018-10-17 14:39       ` Jordan Crouse
  0 siblings, 0 replies; 9+ messages in thread
From: Jordan Crouse @ 2018-10-17 14:39 UTC (permalink / raw)
  To: Sharat Masetty
  Cc: linux-arm-msm-u79uwXL29TY76Z2rM5mHXA,
	freedreno-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

On Wed, Oct 17, 2018 at 06:33:59PM +0530, Sharat Masetty wrote:
> The gpu_busy() function turns out is useful for performance profiling as
> well. So we change the input params to pass in the pointer to the previous
> busy cycles, this makes the function more generic and removes the dependency
> on the gpu devfreq.

This patch is good in theory.

> Signed-off-by: Sharat Masetty <smasetty@codeaurora.org>
> ---
>  drivers/gpu/drm/msm/adreno/a5xx_gpu.c | 6 +++---
>  drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 6 +++---
>  drivers/gpu/drm/msm/msm_gpu.c         | 3 ++-
>  drivers/gpu/drm/msm/msm_gpu.h         | 2 +-
>  4 files changed, 9 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
> index 236a6ed..b5df80c 100644
> --- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
> +++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
> @@ -1428,7 +1428,7 @@ static struct msm_ringbuffer *a5xx_active_ring(struct msm_gpu *gpu)
>  	return a5xx_gpu->cur_ring;
>  }
>  
> -static unsigned long a5xx_gpu_busy(struct msm_gpu *gpu)
> +static unsigned long a5xx_gpu_busy(struct msm_gpu *gpu, u64 *prev_cycles)
>  {
>  	u64 busy_cycles;
>  	unsigned long busy_time;
> @@ -1436,10 +1436,10 @@ static unsigned long a5xx_gpu_busy(struct msm_gpu *gpu)
>  	busy_cycles = gpu_read64(gpu, REG_A5XX_RBBM_PERFCTR_RBBM_0_LO,
>  			REG_A5XX_RBBM_PERFCTR_RBBM_0_HI);
>  
> -	busy_time = (busy_cycles - gpu->devfreq.busy_cycles) /
> +	busy_time = (busy_cycles - *prev_cycles) /
>  		(clk_get_rate(gpu->core_clk) / 1000000);
>  
> -	gpu->devfreq.busy_cycles = busy_cycles;
> +	*prev_cycles = busy_cycles;
>  
>  	return busy_time;
>  }
> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> index 07c3c3c..c0cd3ac 100644
> --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> @@ -722,7 +722,7 @@ static void a6xx_destroy(struct msm_gpu *gpu)
>  	kfree(a6xx_gpu);
>  }
>  
> -static unsigned long a6xx_gpu_busy(struct msm_gpu *gpu)
> +static unsigned long a6xx_gpu_busy(struct msm_gpu *gpu, u64 *prev_cycles)
>  {
>  	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
>  	struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
> @@ -733,9 +733,9 @@ static unsigned long a6xx_gpu_busy(struct msm_gpu *gpu)
>  			REG_A6XX_GMU_CX_GMU_POWER_COUNTER_XOCLK_0_L,
>  			REG_A6XX_GMU_CX_GMU_POWER_COUNTER_XOCLK_0_H);
>  
> -	busy_time = ((busy_cycles - gpu->devfreq.busy_cycles) * 10) / 192;
> +	busy_time = ((busy_cycles - *prev_cycles) * 10) / 192;

You must be using older code  - this is missing the fixes from Sean Paul.

> -	gpu->devfreq.busy_cycles = busy_cycles;
> +	*prev_cycles = busy_cycles;
>  
>  	return busy_time;
>  }
> diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
> index ca573f6..e9b5426 100644
> --- a/drivers/gpu/drm/msm/msm_gpu.c
> +++ b/drivers/gpu/drm/msm/msm_gpu.c
> @@ -62,7 +62,8 @@ static int msm_devfreq_get_dev_status(struct device *dev,
>  	else
>  		status->current_frequency = clk_get_rate(gpu->core_clk);
>  
> -	status->busy_time = gpu->funcs->gpu_busy(gpu);
> +	status->busy_time = gpu->funcs->gpu_busy(gpu,
> +			&gpu->devfreq.busy_cycles);
>  
>  	time = ktime_get();
>  	status->total_time = ktime_us_delta(time, gpu->devfreq.time);
> diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
> index 9df48e3..0ff23ca 100644
> --- a/drivers/gpu/drm/msm/msm_gpu.h
> +++ b/drivers/gpu/drm/msm/msm_gpu.h
> @@ -71,7 +71,7 @@ struct msm_gpu_funcs {
>  	/* for generation specific debugfs: */
>  	int (*debugfs_init)(struct msm_gpu *gpu, struct drm_minor *minor);
>  #endif
> -	unsigned long (*gpu_busy)(struct msm_gpu *gpu);
> +	unsigned long (*gpu_busy)(struct msm_gpu *gpu, u64 *busy_cycles);
>  	struct msm_gpu_state *(*gpu_state_get)(struct msm_gpu *gpu);
>  	int (*gpu_state_put)(struct msm_gpu_state *state);
>  	unsigned long (*gpu_get_freq)(struct msm_gpu *gpu);
> -- 
> 1.9.1
> 

-- 
The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
a Linux Foundation Collaborative Project
_______________________________________________
Freedreno mailing list
Freedreno@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/freedreno

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 3/3] drm/msm: Use Hardware counters for perf profiling
       [not found]         ` <20181017143548.GD4751-9PYrDHPZ2Orvke4nUoYGnHL1okKdlPRT@public.gmane.org>
@ 2018-10-26 13:46           ` Sharat Masetty
       [not found]             ` <f57bf8b2-c44a-68f5-a051-2a17c1ac5704-sgV2jX0FEOL9JmXXK+q4OQ@public.gmane.org>
  0 siblings, 1 reply; 9+ messages in thread
From: Sharat Masetty @ 2018-10-26 13:46 UTC (permalink / raw)
  To: freedreno-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	linux-arm-msm-u79uwXL29TY76Z2rM5mHXA
  Cc: Rob Clark

Added Rob to this thread.

On 10/17/2018 8:05 PM, Jordan Crouse wrote:
> On Wed, Oct 17, 2018 at 06:34:01PM +0530, Sharat Masetty wrote:
>> This patch attempts to make use of the hardware counters for GPU busy %
>> estimation when possible and skip using the software counters as it also
>> accounts for software side delays. This should help give more accurate
>> representation of the GPU workload.
> 
> I would leave this more to Rob as this particular file makes more sense for
> freedreno than it does for us but I think in general if you want to do this
> then we should do use the hardware for all targets and get rid of the
> mix of the old and the new.
Thanks for the comments Jordan. Yes, I prefer the same too, but the only 
limiting factor for me is that I don't have a way to test changes for 
targets such as a3xx and a4xx, and I also do not know if there is anyone 
actually using this currently.

Hi Rob,
Can you please share your comments on this? Is it okay to remove 
software counters functionality?

Sharat
> 
>> Signed-off-by: Sharat Masetty <smasetty@codeaurora.org>
>> ---
>>   drivers/gpu/drm/msm/msm_gpu.c  | 30 ++++++++++++++++++++++++++----
>>   drivers/gpu/drm/msm/msm_gpu.h  |  5 +++--
>>   drivers/gpu/drm/msm/msm_perf.c | 10 +++++-----
>>   3 files changed, 34 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
>> index e9b5426..a896541 100644
>> --- a/drivers/gpu/drm/msm/msm_gpu.c
>> +++ b/drivers/gpu/drm/msm/msm_gpu.c
>> @@ -592,6 +592,9 @@ static void update_sw_cntrs(struct msm_gpu *gpu)
>>   	uint32_t elapsed;
>>   	unsigned long flags;
>>   
>> +	if (gpu->funcs->gpu_busy)
>> +		return;
> 
> Like here - there isn't any reason to not have a gpu_busy for each target
> so then this code could mostly go away.
> 
>>   	spin_lock_irqsave(&gpu->perf_lock, flags);
>>   	if (!gpu->perfcntr_active)
>>   		goto out;
>> @@ -620,6 +623,7 @@ void msm_gpu_perfcntr_start(struct msm_gpu *gpu)
>>   	/* we could dynamically enable/disable perfcntr registers too.. */
>>   	gpu->last_sample.active = msm_gpu_active(gpu);
>>   	gpu->last_sample.time = ktime_get();
>> +	gpu->last_sample.busy_cycles = 0;
>>   	gpu->activetime = gpu->totaltime = 0;
>>   	gpu->perfcntr_active = true;
>>   	update_hw_cntrs(gpu, 0, NULL);
>> @@ -632,9 +636,22 @@ void msm_gpu_perfcntr_stop(struct msm_gpu *gpu)
>>   	pm_runtime_put_sync(&gpu->pdev->dev);
>>   }
>>   
>> +static void msm_gpu_hw_sample(struct msm_gpu *gpu, uint64_t *activetime,
>> +		uint64_t *totaltime)
>> +{
>> +	ktime_t time;
>> +
>> +	*activetime = gpu->funcs->gpu_busy(gpu,
>> +			&gpu->last_sample.busy_cycles);
>> +
>> +	time = ktime_get();
>> +	*totaltime = ktime_us_delta(time, gpu->last_sample.time);
>> +	gpu->last_sample.time = time;
>> +}
> 
> This can all be done inline in the sample function below.
> 
>>   /* returns -errno or # of cntrs sampled */
>> -int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime,
>> -		uint32_t *totaltime, uint32_t ncntrs, uint32_t *cntrs)
>> +int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint64_t *activetime,
>> +		uint64_t *totaltime, uint32_t ncntrs, uint32_t *cntrs)
> 
> This might be an good change (though if our activetime or totaltime ever
> goes over 32 bits we've got ourselves a problem) - but it should be in a
> separate patch.
> 
>>   {
>>   	unsigned long flags;
>>   	int ret;
>> @@ -646,13 +663,18 @@ int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime,
>>   		goto out;
>>   	}
>>   
>> +	ret = update_hw_cntrs(gpu, ncntrs, cntrs);
>> +
>> +	if (gpu->funcs->gpu_busy) {
>> +		msm_gpu_hw_sample(gpu, activetime, totaltime);
>> +		goto out;
>> +	}
>> +
>>   	*activetime = gpu->activetime;
>>   	*totaltime = gpu->totaltime;
>>   
>>   	gpu->activetime = gpu->totaltime = 0;
>>   
>> -	ret = update_hw_cntrs(gpu, ncntrs, cntrs);
>> -
>>   out:
>>   	spin_unlock_irqrestore(&gpu->perf_lock, flags);
>>   
>> diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
>> index 0ff23ca..7dc775f 100644
>> --- a/drivers/gpu/drm/msm/msm_gpu.h
>> +++ b/drivers/gpu/drm/msm/msm_gpu.h
>> @@ -90,6 +90,7 @@ struct msm_gpu {
>>   	struct {
>>   		bool active;
>>   		ktime_t time;
>> +		u64 busy_cycles;
>>   	} last_sample;
>>   	uint32_t totaltime, activetime;    /* sw counters */
>>   	uint32_t last_cntrs[5];            /* hw counters */
>> @@ -275,8 +276,8 @@ static inline void gpu_write64(struct msm_gpu *gpu, u32 lo, u32 hi, u64 val)
>>   
>>   void msm_gpu_perfcntr_start(struct msm_gpu *gpu);
>>   void msm_gpu_perfcntr_stop(struct msm_gpu *gpu);
>> -int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime,
>> -		uint32_t *totaltime, uint32_t ncntrs, uint32_t *cntrs);
>> +int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint64_t *activetime,
>> +		uint64_t *totaltime, uint32_t ncntrs, uint32_t *cntrs);
>>   
>>   void msm_gpu_retire(struct msm_gpu *gpu);
>>   void msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit,
>> diff --git a/drivers/gpu/drm/msm/msm_perf.c b/drivers/gpu/drm/msm/msm_perf.c
>> index 5ab21bd..318f7dd 100644
>> --- a/drivers/gpu/drm/msm/msm_perf.c
>> +++ b/drivers/gpu/drm/msm/msm_perf.c
>> @@ -17,7 +17,7 @@
>>   
>>   /* For profiling, userspace can:
>>    *
>> - *   tail -f /sys/kernel/debug/dri/<minor>/gpu
>> + *   tail -f /sys/kernel/debug/dri/<minor>/perf
>>    *
>>    * This will enable performance counters/profiling to track the busy time
>>    * and any gpu specific performance counters that are supported.
>> @@ -85,9 +85,9 @@ static int refill_buf(struct msm_perf_state *perf)
>>   		}
>>   	} else {
>>   		/* Sample line: */
>> -		uint32_t activetime = 0, totaltime = 0;
>> +		uint64_t activetime = 0, totaltime = 0;
> 
> All the changes to msm_perf.c shuld also be in a separate patch that moves
> the sample size to u64.
> 
>>   		uint32_t cntrs[5];
>> -		uint32_t val;
>> +		uint64_t val;
>>   		int ret;
>>   
>>   		/* sleep until next sample time: */
>> @@ -101,14 +101,14 @@ static int refill_buf(struct msm_perf_state *perf)
>>   			return ret;
>>   
>>   		val = totaltime ? 1000 * activetime / totaltime : 0;
>> -		n = snprintf(ptr, rem, "%3d.%d%%", val / 10, val % 10);
>> +		n = snprintf(ptr, rem, "%3llu.%llu%%", val / 10, val % 10);
>>   		ptr += n;
>>   		rem -= n;
>>   
>>   		for (i = 0; i < ret; i++) {
>>   			/* cycle counters (I think).. convert to MHz.. */
>>   			val = cntrs[i] / 10000;
>> -			n = snprintf(ptr, rem, "\t%5d.%02d",
>> +			n = snprintf(ptr, rem, "\t%5llu.%02llu",
>>   					val / 100, val % 100);
>>   			ptr += n;
>>   			rem -= n;
>> -- 
>> 1.9.1
>>
> 

-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
Linux Foundation Collaborative Project
_______________________________________________
Freedreno mailing list
Freedreno@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/freedreno

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 3/3] drm/msm: Use Hardware counters for perf profiling
       [not found]             ` <f57bf8b2-c44a-68f5-a051-2a17c1ac5704-sgV2jX0FEOL9JmXXK+q4OQ@public.gmane.org>
@ 2018-11-28 20:09               ` Rob Clark
  0 siblings, 0 replies; 9+ messages in thread
From: Rob Clark @ 2018-11-28 20:09 UTC (permalink / raw)
  To: Sharat Masetty; +Cc: linux-arm-msm, freedreno, dri-devel

On Fri, Oct 26, 2018 at 9:46 AM Sharat Masetty <smasetty@codeaurora.org> wrote:
>
> Added Rob to this thread.
>
> On 10/17/2018 8:05 PM, Jordan Crouse wrote:
> > On Wed, Oct 17, 2018 at 06:34:01PM +0530, Sharat Masetty wrote:
> >> This patch attempts to make use of the hardware counters for GPU busy %
> >> estimation when possible and skip using the software counters as it also
> >> accounts for software side delays. This should help give more accurate
> >> representation of the GPU workload.
> >
> > I would leave this more to Rob as this particular file makes more sense for
> > freedreno than it does for us but I think in general if you want to do this
> > then we should do use the hardware for all targets and get rid of the
> > mix of the old and the new.
> Thanks for the comments Jordan. Yes, I prefer the same too, but the only
> limiting factor for me is that I don't have a way to test changes for
> targets such as a3xx and a4xx, and I also do not know if there is anyone
> actually using this currently.
>
> Hi Rob,
> Can you please share your comments on this? Is it okay to remove
> software counters functionality?

In principle yes..  although one side-comment is that there are
patches floating around for a2xx, which is from long enough ago that I
don't remember what the perf-ctr situation is there.

I guess if we can be reasonably confident to implement it w/ hw
counters for all the generations, then I don't see a big need to keep
the sw counter functionality.

I should, in theory, be able to test a3xx/a4xx/a5xx.. a4xx might be a
bit harder to get a recent upstream kernel running on

BR,
-R

>
> Sharat
> >
> >> Signed-off-by: Sharat Masetty <smasetty@codeaurora.org>
> >> ---
> >>   drivers/gpu/drm/msm/msm_gpu.c  | 30 ++++++++++++++++++++++++++----
> >>   drivers/gpu/drm/msm/msm_gpu.h  |  5 +++--
> >>   drivers/gpu/drm/msm/msm_perf.c | 10 +++++-----
> >>   3 files changed, 34 insertions(+), 11 deletions(-)
> >>
> >> diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
> >> index e9b5426..a896541 100644
> >> --- a/drivers/gpu/drm/msm/msm_gpu.c
> >> +++ b/drivers/gpu/drm/msm/msm_gpu.c
> >> @@ -592,6 +592,9 @@ static void update_sw_cntrs(struct msm_gpu *gpu)
> >>      uint32_t elapsed;
> >>      unsigned long flags;
> >>
> >> +    if (gpu->funcs->gpu_busy)
> >> +            return;
> >
> > Like here - there isn't any reason to not have a gpu_busy for each target
> > so then this code could mostly go away.
> >
> >>      spin_lock_irqsave(&gpu->perf_lock, flags);
> >>      if (!gpu->perfcntr_active)
> >>              goto out;
> >> @@ -620,6 +623,7 @@ void msm_gpu_perfcntr_start(struct msm_gpu *gpu)
> >>      /* we could dynamically enable/disable perfcntr registers too.. */
> >>      gpu->last_sample.active = msm_gpu_active(gpu);
> >>      gpu->last_sample.time = ktime_get();
> >> +    gpu->last_sample.busy_cycles = 0;
> >>      gpu->activetime = gpu->totaltime = 0;
> >>      gpu->perfcntr_active = true;
> >>      update_hw_cntrs(gpu, 0, NULL);
> >> @@ -632,9 +636,22 @@ void msm_gpu_perfcntr_stop(struct msm_gpu *gpu)
> >>      pm_runtime_put_sync(&gpu->pdev->dev);
> >>   }
> >>
> >> +static void msm_gpu_hw_sample(struct msm_gpu *gpu, uint64_t *activetime,
> >> +            uint64_t *totaltime)
> >> +{
> >> +    ktime_t time;
> >> +
> >> +    *activetime = gpu->funcs->gpu_busy(gpu,
> >> +                    &gpu->last_sample.busy_cycles);
> >> +
> >> +    time = ktime_get();
> >> +    *totaltime = ktime_us_delta(time, gpu->last_sample.time);
> >> +    gpu->last_sample.time = time;
> >> +}
> >
> > This can all be done inline in the sample function below.
> >
> >>   /* returns -errno or # of cntrs sampled */
> >> -int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime,
> >> -            uint32_t *totaltime, uint32_t ncntrs, uint32_t *cntrs)
> >> +int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint64_t *activetime,
> >> +            uint64_t *totaltime, uint32_t ncntrs, uint32_t *cntrs)
> >
> > This might be an good change (though if our activetime or totaltime ever
> > goes over 32 bits we've got ourselves a problem) - but it should be in a
> > separate patch.
> >
> >>   {
> >>      unsigned long flags;
> >>      int ret;
> >> @@ -646,13 +663,18 @@ int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime,
> >>              goto out;
> >>      }
> >>
> >> +    ret = update_hw_cntrs(gpu, ncntrs, cntrs);
> >> +
> >> +    if (gpu->funcs->gpu_busy) {
> >> +            msm_gpu_hw_sample(gpu, activetime, totaltime);
> >> +            goto out;
> >> +    }
> >> +
> >>      *activetime = gpu->activetime;
> >>      *totaltime = gpu->totaltime;
> >>
> >>      gpu->activetime = gpu->totaltime = 0;
> >>
> >> -    ret = update_hw_cntrs(gpu, ncntrs, cntrs);
> >> -
> >>   out:
> >>      spin_unlock_irqrestore(&gpu->perf_lock, flags);
> >>
> >> diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
> >> index 0ff23ca..7dc775f 100644
> >> --- a/drivers/gpu/drm/msm/msm_gpu.h
> >> +++ b/drivers/gpu/drm/msm/msm_gpu.h
> >> @@ -90,6 +90,7 @@ struct msm_gpu {
> >>      struct {
> >>              bool active;
> >>              ktime_t time;
> >> +            u64 busy_cycles;
> >>      } last_sample;
> >>      uint32_t totaltime, activetime;    /* sw counters */
> >>      uint32_t last_cntrs[5];            /* hw counters */
> >> @@ -275,8 +276,8 @@ static inline void gpu_write64(struct msm_gpu *gpu, u32 lo, u32 hi, u64 val)
> >>
> >>   void msm_gpu_perfcntr_start(struct msm_gpu *gpu);
> >>   void msm_gpu_perfcntr_stop(struct msm_gpu *gpu);
> >> -int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime,
> >> -            uint32_t *totaltime, uint32_t ncntrs, uint32_t *cntrs);
> >> +int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint64_t *activetime,
> >> +            uint64_t *totaltime, uint32_t ncntrs, uint32_t *cntrs);
> >>
> >>   void msm_gpu_retire(struct msm_gpu *gpu);
> >>   void msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit,
> >> diff --git a/drivers/gpu/drm/msm/msm_perf.c b/drivers/gpu/drm/msm/msm_perf.c
> >> index 5ab21bd..318f7dd 100644
> >> --- a/drivers/gpu/drm/msm/msm_perf.c
> >> +++ b/drivers/gpu/drm/msm/msm_perf.c
> >> @@ -17,7 +17,7 @@
> >>
> >>   /* For profiling, userspace can:
> >>    *
> >> - *   tail -f /sys/kernel/debug/dri/<minor>/gpu
> >> + *   tail -f /sys/kernel/debug/dri/<minor>/perf
> >>    *
> >>    * This will enable performance counters/profiling to track the busy time
> >>    * and any gpu specific performance counters that are supported.
> >> @@ -85,9 +85,9 @@ static int refill_buf(struct msm_perf_state *perf)
> >>              }
> >>      } else {
> >>              /* Sample line: */
> >> -            uint32_t activetime = 0, totaltime = 0;
> >> +            uint64_t activetime = 0, totaltime = 0;
> >
> > All the changes to msm_perf.c shuld also be in a separate patch that moves
> > the sample size to u64.
> >
> >>              uint32_t cntrs[5];
> >> -            uint32_t val;
> >> +            uint64_t val;
> >>              int ret;
> >>
> >>              /* sleep until next sample time: */
> >> @@ -101,14 +101,14 @@ static int refill_buf(struct msm_perf_state *perf)
> >>                      return ret;
> >>
> >>              val = totaltime ? 1000 * activetime / totaltime : 0;
> >> -            n = snprintf(ptr, rem, "%3d.%d%%", val / 10, val % 10);
> >> +            n = snprintf(ptr, rem, "%3llu.%llu%%", val / 10, val % 10);
> >>              ptr += n;
> >>              rem -= n;
> >>
> >>              for (i = 0; i < ret; i++) {
> >>                      /* cycle counters (I think).. convert to MHz.. */
> >>                      val = cntrs[i] / 10000;
> >> -                    n = snprintf(ptr, rem, "\t%5d.%02d",
> >> +                    n = snprintf(ptr, rem, "\t%5llu.%02llu",
> >>                                      val / 100, val % 100);
> >>                      ptr += n;
> >>                      rem -= n;
> >> --
> >> 1.9.1
> >>
> >
>
> --
> The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
> Linux Foundation Collaborative Project
_______________________________________________
Freedreno mailing list
Freedreno@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/freedreno

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2018-11-28 20:09 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-10-17 13:03 [PATCH 0/3] drm/msm: Revive GPU workload profiling Sharat Masetty
     [not found] ` <1539781441-13076-1-git-send-email-smasetty-sgV2jX0FEOL9JmXXK+q4OQ@public.gmane.org>
2018-10-17 13:03   ` [PATCH 1/3] drm/msm: Change gpu_busy() function Sharat Masetty
     [not found]     ` <1539781441-13076-2-git-send-email-smasetty-sgV2jX0FEOL9JmXXK+q4OQ@public.gmane.org>
2018-10-17 14:39       ` Jordan Crouse
2018-10-17 13:04   ` [PATCH 2/3] drm/msm/a6xx: Move power counter selectable to resume() Sharat Masetty
     [not found]     ` <1539781441-13076-3-git-send-email-smasetty-sgV2jX0FEOL9JmXXK+q4OQ@public.gmane.org>
2018-10-17 14:09       ` Jordan Crouse
2018-10-17 13:04   ` [PATCH 3/3] drm/msm: Use Hardware counters for perf profiling Sharat Masetty
     [not found]     ` <1539781441-13076-4-git-send-email-smasetty-sgV2jX0FEOL9JmXXK+q4OQ@public.gmane.org>
2018-10-17 14:35       ` Jordan Crouse
     [not found]         ` <20181017143548.GD4751-9PYrDHPZ2Orvke4nUoYGnHL1okKdlPRT@public.gmane.org>
2018-10-26 13:46           ` Sharat Masetty
     [not found]             ` <f57bf8b2-c44a-68f5-a051-2a17c1ac5704-sgV2jX0FEOL9JmXXK+q4OQ@public.gmane.org>
2018-11-28 20:09               ` Rob Clark

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.