From: Boris Brezillon <boris.brezillon@collabora.com>
To: Rob Herring <robh+dt@kernel.org>,
	Tomeu Vizoso <tomeu@tomeuvizoso.net>,
	dri-devel@lists.freedesktop.org
Cc: kernel@collabora.com,
	Boris Brezillon <boris.brezillon@collabora.com>,
	Alyssa Rosenzweig <alyssa@rosenzweig.io>,
	Neil Armstrong <narmstrong@baylibre.com>
Subject: [PATCH 2/3] drm/panfrost: Expose HW counters to userspace
Date: Thu,  4 Apr 2019 17:20:50 +0200
Message-ID: <20190404152051.17996-3-boris.brezillon@collabora.com>
In-Reply-To: <20190404152051.17996-1-boris.brezillon@collabora.com>

Add the necessary infrastructure to expose GPU counters to userspace.
This takes the form of four new ioctls to:

- query the available counters
- create a performance monitor (perfmon)
- destroy a perfmon
- retrieve the values collected by a perfmon
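
To make the flow concrete, here is a minimal userspace sketch of the
perfmon lifecycle. The render node path is an assumption, plain ioctl()
is used instead of libdrm wrappers, and error handling is omitted:

  #include <fcntl.h>
  #include <stdint.h>
  #include <string.h>
  #include <sys/ioctl.h>
  #include <drm/panfrost_drm.h>

  int main(void)
  {
      int fd = open("/dev/dri/renderD128", O_RDWR);

      /* Query which blocks/instances/counters this GPU exposes. */
      struct drm_panfrost_get_perfcnt_layout layout;
      memset(&layout, 0, sizeof(layout));
      ioctl(fd, DRM_IOCTL_PANFROST_GET_PERFCNT_LAYOUT, &layout);

      /* Monitor counter 0 of Job Manager instance 0. */
      struct drm_panfrost_create_perfmon create;
      memset(&create, 0, sizeof(create));
      create.counters[PANFROST_JM_BLOCK].instances = 1ULL << 0;
      create.counters[PANFROST_JM_BLOCK].counters = 1ULL << 0;
      ioctl(fd, DRM_IOCTL_PANFROST_CREATE_PERFMON, &create);

      /* ... submit jobs with the perfmon attached (see below) ... */

      /* Read the values back; this waits for attached jobs unless
       * DRM_PANFROST_GET_PERFMON_VALS_DONT_WAIT is passed. */
      uint32_t vals[1];
      struct drm_panfrost_get_perfmon_values get;
      memset(&get, 0, sizeof(get));
      get.id = create.id;
      get.values_ptrs[PANFROST_JM_BLOCK] = (uintptr_t)vals;
      ioctl(fd, DRM_IOCTL_PANFROST_GET_PERFMON_VALUES, &get);

      struct drm_panfrost_destroy_perfmon destroy = { .id = create.id };
      ioctl(fd, DRM_IOCTL_PANFROST_DESTROY_PERFMON, &destroy);
      return 0;
  }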

The drm_panfrost_submit struct is extended to pass a list of perfmons
to attach to a job, so that a perfmon only tracks the activity caused
by the jobs it is attached to.
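
Attaching the perfmon created in the sketch above at submit time could
look like this (the usual job fields - job chain address, BOs, syncs -
are elided):

      /* Attach the perfmon so it only counts this job's work. */
      uint32_t handles[] = { create.id };
      struct drm_panfrost_submit submit;
      memset(&submit, 0, sizeof(submit));
      /* ... fill the regular job fields here ... */
      submit.perfmon_handles = (uintptr_t)handles;
      submit.perfmon_handle_count = 1;
      ioctl(fd, DRM_IOCTL_PANFROST_SUBMIT, &submit);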

Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
---
 drivers/gpu/drm/panfrost/Makefile           |   3 +-
 drivers/gpu/drm/panfrost/panfrost_device.c  |   8 +
 drivers/gpu/drm/panfrost/panfrost_device.h  |  11 +
 drivers/gpu/drm/panfrost/panfrost_drv.c     |  22 +-
 drivers/gpu/drm/panfrost/panfrost_gpu.c     |  43 +-
 drivers/gpu/drm/panfrost/panfrost_job.c     |  24 +
 drivers/gpu/drm/panfrost/panfrost_job.h     |   4 +
 drivers/gpu/drm/panfrost/panfrost_perfcnt.c | 954 ++++++++++++++++++++
 drivers/gpu/drm/panfrost/panfrost_perfcnt.h |  54 ++
 drivers/gpu/drm/panfrost/panfrost_regs.h    |  19 +
 include/uapi/drm/panfrost_drm.h             | 122 +++
 11 files changed, 1260 insertions(+), 4 deletions(-)
 create mode 100644 drivers/gpu/drm/panfrost/panfrost_perfcnt.c
 create mode 100644 drivers/gpu/drm/panfrost/panfrost_perfcnt.h

diff --git a/drivers/gpu/drm/panfrost/Makefile b/drivers/gpu/drm/panfrost/Makefile
index d07e0971b687..31cfb9d25682 100644
--- a/drivers/gpu/drm/panfrost/Makefile
+++ b/drivers/gpu/drm/panfrost/Makefile
@@ -6,6 +6,7 @@ panfrost-y := \
 	panfrost_gem.o \
 	panfrost_gpu.o \
 	panfrost_job.o \
-	panfrost_mmu.o
+	panfrost_mmu.o \
+	panfrost_perfcnt.o
 
 obj-$(CONFIG_DRM_PANFROST) += panfrost.o
diff --git a/drivers/gpu/drm/panfrost/panfrost_device.c b/drivers/gpu/drm/panfrost/panfrost_device.c
index 148b5caa2322..f6a87bfa486b 100644
--- a/drivers/gpu/drm/panfrost/panfrost_device.c
+++ b/drivers/gpu/drm/panfrost/panfrost_device.c
@@ -13,6 +13,7 @@
 #include "panfrost_gpu.h"
 #include "panfrost_job.h"
 #include "panfrost_mmu.h"
+#include "panfrost_perfcnt.h"
 
 static int panfrost_reset_init(struct panfrost_device *pfdev)
 {
@@ -147,7 +148,13 @@ int panfrost_device_init(struct panfrost_device *pfdev)
 	pm_runtime_mark_last_busy(pfdev->dev);
 	pm_runtime_put_autosuspend(pfdev->dev);
 
+	err = panfrost_perfcnt_init(pfdev);
+	if (err)
+		goto err_out5;
+
 	return 0;
+err_out5:
+	panfrost_job_fini(pfdev);
 err_out4:
 	panfrost_mmu_fini(pfdev);
 err_out3:
@@ -163,6 +170,7 @@ int panfrost_device_init(struct panfrost_device *pfdev)
 
 void panfrost_device_fini(struct panfrost_device *pfdev)
 {
+	panfrost_perfcnt_fini(pfdev);
 	panfrost_job_fini(pfdev);
 	panfrost_mmu_fini(pfdev);
 	panfrost_gpu_fini(pfdev);
diff --git a/drivers/gpu/drm/panfrost/panfrost_device.h b/drivers/gpu/drm/panfrost/panfrost_device.h
index a821b50a14c3..f7c4e9e55f1b 100644
--- a/drivers/gpu/drm/panfrost/panfrost_device.h
+++ b/drivers/gpu/drm/panfrost/panfrost_device.h
@@ -9,11 +9,13 @@
 #include <drm/drm_device.h>
 #include <drm/drm_mm.h>
 #include <drm/gpu_scheduler.h>
+#include <drm/panfrost_drm.h>
 
 struct panfrost_device;
 struct panfrost_mmu;
 struct panfrost_job_slot;
 struct panfrost_job;
+struct panfrost_perfcnt;
 
 #define NUM_JOB_SLOTS 3
 
@@ -45,6 +47,8 @@ struct panfrost_features {
 
 	unsigned long hw_features[64 / BITS_PER_LONG];
 	unsigned long hw_issues[64 / BITS_PER_LONG];
+
+	struct drm_panfrost_block_perfcounters perfcnt_layout[PANFROST_NUM_BLOCKS];
 };
 
 struct panfrost_device {
@@ -70,6 +74,8 @@ struct panfrost_device {
 	struct panfrost_job *jobs[NUM_JOB_SLOTS];
 	struct list_head scheduled_jobs;
 
+	struct panfrost_perfcnt *perfcnt;
+
 	struct mutex sched_lock;
 };
 
@@ -77,6 +83,11 @@ struct panfrost_file_priv {
 	struct panfrost_device *pfdev;
 
 	struct drm_sched_entity sched_entity[NUM_JOB_SLOTS];
+
+	struct {
+		struct idr idr;
+		struct mutex lock;
+	} perfmon;
 };
 
 static inline struct panfrost_device *to_panfrost_device(struct drm_device *ddev)
diff --git a/drivers/gpu/drm/panfrost/panfrost_drv.c b/drivers/gpu/drm/panfrost/panfrost_drv.c
index 8cffb70a3548..e5375b31627f 100644
--- a/drivers/gpu/drm/panfrost/panfrost_drv.c
+++ b/drivers/gpu/drm/panfrost/panfrost_drv.c
@@ -19,6 +19,7 @@
 #include "panfrost_mmu.h"
 #include "panfrost_job.h"
 #include "panfrost_gpu.h"
+#include "panfrost_perfcnt.h"
 
 static int panfrost_ioctl_get_param(struct drm_device *ddev, void *data, struct drm_file *file)
 {
@@ -219,6 +220,10 @@ static int panfrost_ioctl_submit(struct drm_device *dev, void *data,
 	if (ret)
 		goto fail;
 
+	ret = panfrost_perfcnt_create_job_ctx(job, file, args);
+	if (ret)
+		goto fail;
+
 	ret = panfrost_job_push(job);
 	if (ret)
 		goto fail;
@@ -313,6 +318,7 @@ panfrost_open(struct drm_device *dev, struct drm_file *file)
 {
 	struct panfrost_device *pfdev = dev->dev_private;
 	struct panfrost_file_priv *panfrost_priv;
+	int ret;
 
 	panfrost_priv = kzalloc(sizeof(*panfrost_priv), GFP_KERNEL);
 	if (!panfrost_priv)
@@ -321,7 +327,16 @@ panfrost_open(struct drm_device *dev, struct drm_file *file)
 	panfrost_priv->pfdev = pfdev;
 	file->driver_priv = panfrost_priv;
 
-	return panfrost_job_open(panfrost_priv);
+	ret = panfrost_job_open(panfrost_priv);
+	if (ret)
+		goto err_free_priv;
+
+	panfrost_perfcnt_open(panfrost_priv);
+	return 0;
+
+err_free_priv:
+	kfree(panfrost_priv);
+	return ret;
 }
 
 static void
@@ -329,6 +344,7 @@ panfrost_postclose(struct drm_device *dev, struct drm_file *file)
 {
 	struct panfrost_file_priv *panfrost_priv = file->driver_priv;
 
+	panfrost_perfcnt_close(panfrost_priv);
 	panfrost_job_close(panfrost_priv);
 
 	kfree(panfrost_priv);
@@ -348,6 +364,10 @@ static const struct drm_ioctl_desc panfrost_drm_driver_ioctls[] = {
 	PANFROST_IOCTL(MMAP_BO,		mmap_bo,	DRM_RENDER_ALLOW),
 	PANFROST_IOCTL(GET_PARAM,	get_param,	DRM_RENDER_ALLOW),
 	PANFROST_IOCTL(GET_BO_OFFSET,	get_bo_offset,	DRM_RENDER_ALLOW),
+	PANFROST_IOCTL(GET_PERFCNT_LAYOUT, get_perfcnt_layout, DRM_RENDER_ALLOW),
+	PANFROST_IOCTL(CREATE_PERFMON,	create_perfmon,	DRM_RENDER_ALLOW),
+	PANFROST_IOCTL(DESTROY_PERFMON,	destroy_perfmon, DRM_RENDER_ALLOW),
+	PANFROST_IOCTL(GET_PERFMON_VALUES, get_perfmon_values, DRM_RENDER_ALLOW),
 };
 
 DEFINE_DRM_GEM_SHMEM_FOPS(panfrost_drm_driver_fops);
diff --git a/drivers/gpu/drm/panfrost/panfrost_gpu.c b/drivers/gpu/drm/panfrost/panfrost_gpu.c
index d46d36170e18..c28a31c547cc 100644
--- a/drivers/gpu/drm/panfrost/panfrost_gpu.c
+++ b/drivers/gpu/drm/panfrost/panfrost_gpu.c
@@ -13,6 +13,7 @@
 #include "panfrost_features.h"
 #include "panfrost_issues.h"
 #include "panfrost_gpu.h"
+#include "panfrost_perfcnt.h"
 #include "panfrost_regs.h"
 
 static irqreturn_t panfrost_gpu_irq_handler(int irq, void *data)
@@ -42,6 +43,12 @@ static irqreturn_t panfrost_gpu_irq_handler(int irq, void *data)
 		done = true;
 	}
 
+	if (state & GPU_IRQ_PERFCNT_SAMPLE_COMPLETED)
+		panfrost_perfcnt_sample_done(pfdev);
+
+	if (state & GPU_IRQ_CLEAN_CACHES_COMPLETED)
+		panfrost_perfcnt_clean_cache_done(pfdev);
+
 	gpu_write(pfdev, GPU_INT_CLEAR, state);
 
 	return IRQ_HANDLED;
@@ -152,14 +159,16 @@ struct panfrost_model {
 		u32 revision;
 		u64 issues;
 	} revs[MAX_HW_REVS];
+	u64 perfcnt[PANFROST_NUM_BLOCKS];
 };
 
 #define GPU_MODEL(_name, _id, ...) \
-{\
+{								\
 	.name = __stringify(_name),				\
 	.id = _id,						\
 	.features = hw_features_##_name,			\
 	.issues = hw_issues_##_name,				\
+	.perfcnt = hw_perfcnt_##_name,				\
 	.revs = { __VA_ARGS__ },				\
 }
 
@@ -198,13 +207,17 @@ static const struct panfrost_model gpu_models[] = {
 
 static void panfrost_gpu_init_features(struct panfrost_device *pfdev)
 {
+	struct drm_panfrost_block_perfcounters *perfcnt_layout;
 	u32 gpu_id, num_js, major, minor, status, rev;
 	const char *name = "unknown";
 	u64 hw_feat = 0;
-	u64 hw_issues = hw_issues_all;
+	u64 hw_issues = hw_issues_all, mask;
 	const struct panfrost_model *model;
+	unsigned int num;
 	int i;
 
+	perfcnt_layout = pfdev->features.perfcnt_layout;
+
 	pfdev->features.l2_features = gpu_read(pfdev, GPU_L2_FEATURES);
 	pfdev->features.core_features = gpu_read(pfdev, GPU_CORE_FEATURES);
 	pfdev->features.tiler_features = gpu_read(pfdev, GPU_TILER_FEATURES);
@@ -272,9 +285,35 @@ static void panfrost_gpu_init_features(struct panfrost_device *pfdev)
 		if (best >= 0)
 			hw_issues |= model->revs[best].issues;
 
+		for (i = 0; i < PANFROST_NUM_BLOCKS; i++)
+			perfcnt_layout[i].counters = model->perfcnt[i];
+
 		break;
 	}
 
+	/* Only one Job Manager. */
+	perfcnt_layout[PANFROST_JM_BLOCK].instances = BIT(0);
+	perfcnt_layout[PANFROST_SHADER_BLOCK].instances =
+						pfdev->features.shader_present;
+
+	/*
+	 * In v4 HW we have one tiler per core group, with the number
+	 * of core groups being equal to the number of L2 caches. Other
+	 * HW versions just have one tiler and the number of L2 caches
+	 * can be extracted from the mem_features field.
+	 */
+	if (hw_feat & BIT_ULL(HW_FEATURE_V4)) {
+		num = hweight64(pfdev->features.l2_present);
+		mask = GENMASK(num - 1, 0);
+		perfcnt_layout[PANFROST_MMU_L2_BLOCK].instances = mask;
+		perfcnt_layout[PANFROST_TILER_BLOCK].instances = mask;
+	} else {
+		perfcnt_layout[PANFROST_TILER_BLOCK].instances = BIT(0);
+		num = ((pfdev->features.mem_features >> 8) & GENMASK(3, 0)) + 1;
+		mask = GENMASK(num - 1, 0);
+		perfcnt_layout[PANFROST_MMU_L2_BLOCK].instances = mask;
+	}
+
 	bitmap_from_u64(pfdev->features.hw_features, hw_feat);
 	bitmap_from_u64(pfdev->features.hw_issues, hw_issues);
 
diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c
index 8d570c3f15d0..c2be61a9ebff 100644
--- a/drivers/gpu/drm/panfrost/panfrost_job.c
+++ b/drivers/gpu/drm/panfrost/panfrost_job.c
@@ -15,6 +15,7 @@
 #include "panfrost_features.h"
 #include "panfrost_issues.h"
 #include "panfrost_gem.h"
+#include "panfrost_perfcnt.h"
 #include "panfrost_regs.h"
 #include "panfrost_gpu.h"
 #include "panfrost_mmu.h"
@@ -153,6 +154,7 @@ static void panfrost_job_hw_submit(struct panfrost_job *job, int js)
 		goto end;
 
 	spin_lock_irqsave(&pfdev->hwaccess_lock, flags);
+	panfrost_perfcnt_run_job(job);
 
 	job_write(pfdev, JS_HEAD_NEXT_LO(js), jc_head & 0xFFFFFFFF);
 	job_write(pfdev, JS_HEAD_NEXT_HI(js), jc_head >> 32);
@@ -233,6 +235,12 @@ int panfrost_job_push(struct panfrost_job *job)
 		goto unlock;
 	}
 
+	ret = panfrost_perfcnt_push_job(job);
+	if (ret) {
+		mutex_unlock(&pfdev->sched_lock);
+		goto unlock;
+	}
+
 	job->render_done_fence = dma_fence_get(&job->base.s_fence->finished);
 
 	kref_get(&job->refcount); /* put by scheduler job completion */
@@ -272,6 +280,9 @@ static void panfrost_job_cleanup(struct kref *ref)
 
 	for (i = 0; i < job->bo_count; i++)
 		drm_gem_object_put_unlocked(job->bos[i]);
+
+	panfrost_perfcnt_clean_job_ctx(job);
+
 	kvfree(job->bos);
 
 	kfree(job);
@@ -316,6 +327,13 @@ static struct dma_fence *panfrost_job_dependency(struct drm_sched_job *sched_job
 		}
 	}
 
+	/* Return the perfmon wait fence if any. */
+	if (job->perfcnt_fence) {
+		fence = job->perfcnt_fence;
+		job->perfcnt_fence = NULL;
+		return fence;
+	}
+
 	return NULL;
 }
 
@@ -399,6 +417,11 @@ static void panfrost_job_timedout(struct drm_sched_job *sched_job)
 	/* restart scheduler after GPU is usable again */
 	for (i = 0; i < NUM_JOB_SLOTS; i++)
 		drm_sched_start(&pfdev->js->queue[i].sched, true);
+
+	/* For now, just say we're done. No reset and retry. */
+	/* job_write(pfdev, JS_COMMAND(js), JS_COMMAND_HARD_STOP); */
+	dma_fence_signal(job->done_fence);
+	panfrost_perfcnt_finish_job(job, true);
 }
 
 static const struct drm_sched_backend_ops panfrost_sched_ops = {
@@ -442,6 +465,7 @@ static irqreturn_t panfrost_job_irq_handler(int irq, void *data)
 
 		if (status & JOB_INT_MASK_DONE(j)) {
 			dma_fence_signal(pfdev->jobs[j]->done_fence);
+			panfrost_perfcnt_finish_job(pfdev->jobs[j], false);
 		}
 
 		status &= ~mask;
diff --git a/drivers/gpu/drm/panfrost/panfrost_job.h b/drivers/gpu/drm/panfrost/panfrost_job.h
index 62454128a792..18646cc5eebb 100644
--- a/drivers/gpu/drm/panfrost/panfrost_job.h
+++ b/drivers/gpu/drm/panfrost/panfrost_job.h
@@ -37,6 +37,10 @@ struct panfrost_job {
 
 	/* Fence to be signaled by drm-sched once its done with the job */
 	struct dma_fence *render_done_fence;
+
+	/* Perfcnt context */
+	struct panfrost_perfcnt_job_ctx *perfcnt_ctx;
+	struct dma_fence *perfcnt_fence;
 };
 
 int panfrost_job_init(struct panfrost_device *pfdev);
diff --git a/drivers/gpu/drm/panfrost/panfrost_perfcnt.c b/drivers/gpu/drm/panfrost/panfrost_perfcnt.c
new file mode 100644
index 000000000000..4491f153ad48
--- /dev/null
+++ b/drivers/gpu/drm/panfrost/panfrost_perfcnt.c
@@ -0,0 +1,954 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright 2019 Collabora Ltd */
+
+#include <drm/drm_file.h>
+#include <drm/drm_gem_shmem_helper.h>
+#include <drm/panfrost_drm.h>
+#include <linux/iopoll.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include "panfrost_device.h"
+#include "panfrost_features.h"
+#include "panfrost_gem.h"
+#include "panfrost_issues.h"
+#include "panfrost_job.h"
+#include "panfrost_mmu.h"
+#include "panfrost_regs.h"
+
+#define COUNTERS_PER_BLOCK		64
+#define BYTES_PER_COUNTER		4
+#define BLOCKS_PER_COREGROUP		8
+#define V4_SHADERS_PER_COREGROUP	4
+
+struct panfrost_perfcnt_job_ctx {
+	refcount_t refcount;
+	struct panfrost_device *pfdev;
+	struct dma_fence *wait_fence;
+	struct dma_fence *done_fence;
+	struct panfrost_perfmon **perfmons;
+	u32 perfmon_count;
+};
+
+struct panfrost_perfcnt {
+	struct work_struct dumpwork;
+	u64 fence_context;
+	u64 emit_seqno;
+	spinlock_t fence_lock;
+	struct mutex cfg_lock;
+	u32 cur_cfg[PANFROST_NUM_BLOCKS];
+	struct panfrost_gem_object *bo;
+	void *buf;
+	spinlock_t ctx_lock;
+	struct panfrost_perfcnt_job_ctx *last_ctx;
+	struct panfrost_perfcnt_job_ctx *dump_ctx;
+};
+
+struct panfrost_perfcnt_fence {
+	struct dma_fence base;
+	struct drm_device *dev;
+	u64 seqno;
+};
+
+struct panfrost_perfmon {
+	refcount_t refcnt;
+	atomic_t busycnt;
+	struct wait_queue_head wq;
+	struct drm_panfrost_block_perfcounters counters[PANFROST_NUM_BLOCKS];
+	u32 *values[PANFROST_NUM_BLOCKS];
+};
+
+static inline struct panfrost_perfcnt_fence *
+to_panfrost_perfcnt_fence(struct dma_fence *fence)
+{
+	return container_of(fence, struct panfrost_perfcnt_fence, base);
+}
+
+static const char *
+panfrost_perfcnt_fence_get_driver_name(struct dma_fence *fence)
+{
+	return "panfrost";
+}
+
+static const char *
+panfrost_perfcnt_fence_get_timeline_name(struct dma_fence *fence)
+{
+	return "panfrost-perfcnt";
+}
+
+static const struct dma_fence_ops panfrost_perfcnt_fence_ops = {
+	.get_driver_name = panfrost_perfcnt_fence_get_driver_name,
+	.get_timeline_name = panfrost_perfcnt_fence_get_timeline_name,
+};
+
+static struct dma_fence *
+panfrost_perfcnt_fence_create(struct panfrost_device *pfdev)
+{
+	struct panfrost_perfcnt_fence *fence;
+
+	fence = kzalloc(sizeof(*fence), GFP_ATOMIC); /* called under ctx_lock */
+	if (!fence)
+		return ERR_PTR(-ENOMEM);
+
+	fence->dev = pfdev->ddev;
+	fence->seqno = ++pfdev->perfcnt->emit_seqno;
+	dma_fence_init(&fence->base, &panfrost_perfcnt_fence_ops,
+		       &pfdev->perfcnt->fence_lock,
+		       pfdev->perfcnt->fence_context, fence->seqno);
+
+	return &fence->base;
+}
+
+static void panfrost_perfmon_get(struct panfrost_perfmon *perfmon)
+{
+	if (perfmon)
+		refcount_inc(&perfmon->refcnt);
+}
+
+static void panfrost_perfmon_put(struct panfrost_perfmon *perfmon)
+{
+	if (perfmon && refcount_dec_and_test(&perfmon->refcnt)) {
+		unsigned int i;
+
+		for (i = 0; i < PANFROST_NUM_BLOCKS; i++)
+			kfree(perfmon->values[i]);
+
+		kfree(perfmon);
+	}
+}
+
+static struct panfrost_perfmon *
+panfrost_perfcnt_find_perfmon(struct panfrost_file_priv *pfile, int id)
+{
+	struct panfrost_perfmon *perfmon;
+
+	mutex_lock(&pfile->perfmon.lock);
+	perfmon = idr_find(&pfile->perfmon.idr, id);
+	panfrost_perfmon_get(perfmon);
+	mutex_unlock(&pfile->perfmon.lock);
+
+	return perfmon;
+}
+
+void panfrost_perfcnt_open(struct panfrost_file_priv *pfile)
+{
+	mutex_init(&pfile->perfmon.lock);
+	idr_init(&pfile->perfmon.idr);
+}
+
+static int panfrost_perfcnt_idr_del(int id, void *elem, void *data)
+{
+	struct panfrost_perfmon *perfmon = elem;
+
+	panfrost_perfmon_put(perfmon);
+
+	return 0;
+}
+
+void panfrost_perfcnt_close(struct panfrost_file_priv *pfile)
+{
+	mutex_lock(&pfile->perfmon.lock);
+	idr_for_each(&pfile->perfmon.idr, panfrost_perfcnt_idr_del, NULL);
+	idr_destroy(&pfile->perfmon.idr);
+	mutex_unlock(&pfile->perfmon.lock);
+}
+
+int panfrost_ioctl_get_perfcnt_layout(struct drm_device *dev, void *data,
+				      struct drm_file *file_priv)
+{
+	struct panfrost_file_priv *pfile = file_priv->driver_priv;
+	struct panfrost_device *pfdev = pfile->pfdev;
+	struct drm_panfrost_get_perfcnt_layout *layout = data;
+
+	memcpy(layout->counters, pfdev->features.perfcnt_layout,
+	       sizeof(layout->counters));
+
+	return 0;
+}
+
+int panfrost_ioctl_create_perfmon(struct drm_device *dev, void *data,
+				  struct drm_file *file_priv)
+{
+	struct panfrost_file_priv *pfile = file_priv->driver_priv;
+	struct panfrost_device *pfdev = pfile->pfdev;
+	struct drm_panfrost_create_perfmon *req = data;
+	struct drm_panfrost_block_perfcounters *layout;
+	struct panfrost_perfmon *perfmon;
+	unsigned int i;
+	int ret;
+
+	if (req->padding)
+		return -EINVAL;
+
+	perfmon = kzalloc(sizeof(*perfmon), GFP_KERNEL);
+	if (!perfmon)
+		return -ENOMEM;
+
+	ret = -ENOMEM;
+	layout = pfdev->features.perfcnt_layout;
+	for (i = 0; i < PANFROST_NUM_BLOCKS; i++) {
+		unsigned int ncounters;
+
+		/* Make sure the request matches the available counters. */
+		if (~layout[i].instances & req->counters[i].instances ||
+		    ~layout[i].counters & req->counters[i].counters)
+			goto err_free_perfmon;
+
+		ncounters = hweight64(req->counters[i].instances) *
+			    hweight64(req->counters[i].counters);
+		if (!ncounters)
+			continue;
+
+		perfmon->counters[i] = req->counters[i];
+		perfmon->values[i] = kcalloc(ncounters, sizeof(u32), GFP_KERNEL);
+		if (!perfmon->values[i])
+			goto err_free_perfmon;
+	}
+
+	refcount_set(&perfmon->refcnt, 1);
+	init_waitqueue_head(&perfmon->wq);
+
+	mutex_lock(&pfile->perfmon.lock);
+	ret = idr_alloc(&pfile->perfmon.idr, perfmon, 1, U32_MAX, GFP_KERNEL);
+	mutex_unlock(&pfile->perfmon.lock);
+
+	if (ret < 0)
+		goto err_free_perfmon;
+
+	req->id = ret;
+	return 0;
+
+err_free_perfmon:
+	for (i = 0; i < PANFROST_NUM_BLOCKS; i++)
+		kfree(perfmon->values[i]);
+
+	kfree(perfmon);
+	return ret;
+}
+
+int panfrost_ioctl_destroy_perfmon(struct drm_device *dev, void *data,
+				   struct drm_file *file_priv)
+{
+	struct panfrost_file_priv *pfile = file_priv->driver_priv;
+	struct drm_panfrost_destroy_perfmon *req = data;
+	struct panfrost_perfmon *perfmon;
+
+	mutex_lock(&pfile->perfmon.lock);
+	perfmon = idr_remove(&pfile->perfmon.idr, req->id);
+	mutex_unlock(&pfile->perfmon.lock);
+
+	if (!perfmon)
+		return -EINVAL;
+
+	panfrost_perfmon_put(perfmon);
+	return 0;
+}
+
+int panfrost_ioctl_get_perfmon_values(struct drm_device *dev, void *data,
+				      struct drm_file *file_priv)
+{
+	struct panfrost_file_priv *pfile = file_priv->driver_priv;
+	struct drm_panfrost_get_perfmon_values *req = data;
+	struct panfrost_perfmon *perfmon;
+	unsigned int i;
+	int ret = 0;
+
+	mutex_lock(&pfile->perfmon.lock);
+	perfmon = idr_find(&pfile->perfmon.idr, req->id);
+	panfrost_perfmon_get(perfmon);
+	mutex_unlock(&pfile->perfmon.lock);
+
+	if (!perfmon)
+		return -EINVAL;
+
+	if (!(req->flags & DRM_PANFROST_GET_PERFMON_VALS_DONT_WAIT))
+		ret = wait_event_interruptible(perfmon->wq,
+					       !atomic_read(&perfmon->busycnt));
+	else if (atomic_read(&perfmon->busycnt))
+		ret = -EBUSY;
+
+	if (ret)
+		goto out;
+
+	for (i = 0; i < PANFROST_NUM_BLOCKS; i++) {
+		unsigned int ncounters;
+
+		ncounters = hweight64(perfmon->counters[i].instances) *
+			    hweight64(perfmon->counters[i].counters);
+		if (!ncounters)
+			continue;
+
+		if (copy_to_user(u64_to_user_ptr(req->values_ptrs[i]),
+				 perfmon->values[i],
+				 ncounters * sizeof(u32))) {
+			ret = -EFAULT;
+			break;
+		}
+
+		if (req->flags & DRM_PANFROST_GET_PERFMON_VALS_RESET)
+			memset(perfmon->values[i], 0, ncounters * sizeof(u32));
+	}
+
+out:
+	panfrost_perfmon_put(perfmon);
+	return ret;
+}
+
+/*
+ * Returns true if the two job contexts are attached to exactly the same set
+ * of perfmons, false otherwise.
+ */
+static bool panfrost_perfcnt_job_ctx_cmp(struct panfrost_perfcnt_job_ctx *a,
+					 struct panfrost_perfcnt_job_ctx *b)
+{
+	unsigned int i, j;
+
+	if (a->perfmon_count != b->perfmon_count)
+		return false;
+
+	for (i = 0; i < a->perfmon_count; i++) {
+		for (j = 0; j < b->perfmon_count; j++) {
+			if (a->perfmons[i] == b->perfmons[j])
+				break;
+		}
+
+		if (j == b->perfmon_count)
+			return false;
+	}
+
+	return true;
+}
+
+static u32 counters_u64_to_u32(u64 in)
+{
+	unsigned int i;
+	u32 out = 0;
+
+	for (i = 0; i < 64; i += 4) {
+		if (GENMASK_ULL(i + 3, i) & in)
+			out |= BIT(i / 4);
+	}
+
+	return out;
+}
+
+void panfrost_perfcnt_run_job(struct panfrost_job *job)
+{
+	struct panfrost_perfcnt_job_ctx *ctx = job->perfcnt_ctx;
+	struct panfrost_device *pfdev = job->pfdev;
+	u32 perfcnt_en[PANFROST_NUM_BLOCKS] = { };
+	bool disable_perfcnt = true, config_changed = false;
+	unsigned int i, j;
+	u64 gpuva;
+	u32 cfg;
+
+	mutex_lock(&pfdev->perfcnt->cfg_lock);
+	for (i = 0; i < PANFROST_NUM_BLOCKS; i++) {
+		for (j = 0; j < ctx->perfmon_count; j++) {
+			u64 counters = ctx->perfmons[j]->counters[i].counters;
+
+			perfcnt_en[i] |= counters_u64_to_u32(counters);
+		}
+
+		if (perfcnt_en[i])
+			disable_perfcnt = false;
+
+		if (perfcnt_en[i] != pfdev->perfcnt->cur_cfg[i]) {
+			pfdev->perfcnt->cur_cfg[i] = perfcnt_en[i];
+			config_changed = true;
+		}
+	}
+	mutex_unlock(&pfdev->perfcnt->cfg_lock);
+
+	if (!config_changed)
+		return;
+
+	/*
+	 * Always use address space 0 for now.
+	 * FIXME: this needs to be updated when we start using different
+	 * address space.
+	 */
+	cfg = GPU_PERFCNT_CFG_AS(0);
+	if (panfrost_model_cmp(pfdev, 0x1000) >= 0)
+		cfg |= GPU_PERFCNT_CFG_SETSEL(1);
+
+	gpu_write(pfdev, GPU_PERFCNT_CFG,
+		  cfg | GPU_PERFCNT_CFG_MODE(GPU_PERFCNT_CFG_MODE_OFF));
+
+	if (disable_perfcnt)
+		return;
+
+	gpu_write(pfdev, GPU_PRFCNT_JM_EN, perfcnt_en[PANFROST_JM_BLOCK]);
+	gpu_write(pfdev, GPU_PRFCNT_SHADER_EN,
+		  perfcnt_en[PANFROST_SHADER_BLOCK]);
+	gpu_write(pfdev, GPU_PRFCNT_MMU_L2_EN,
+		  perfcnt_en[PANFROST_MMU_L2_BLOCK]);
+	gpuva = pfdev->perfcnt->bo->node.start << PAGE_SHIFT;
+	gpu_write(pfdev, GPU_PERFCNT_BASE_LO, gpuva);
+	gpu_write(pfdev, GPU_PERFCNT_BASE_HI, gpuva >> 32);
+
+	/*
+	 * Due to PRLAM-8186 we need to disable the Tiler before we enable HW
+	 * counters.
+	 */
+	if (panfrost_has_hw_issue(pfdev, HW_ISSUE_8186))
+		gpu_write(pfdev, GPU_PRFCNT_TILER_EN, 0);
+	else
+		gpu_write(pfdev, GPU_PRFCNT_TILER_EN,
+			  perfcnt_en[PANFROST_TILER_BLOCK]);
+
+	gpu_write(pfdev, GPU_PERFCNT_CFG,
+		  cfg | GPU_PERFCNT_CFG_MODE(GPU_PERFCNT_CFG_MODE_MANUAL));
+
+	if (panfrost_has_hw_issue(pfdev, HW_ISSUE_8186))
+		gpu_write(pfdev, GPU_PRFCNT_TILER_EN,
+			  perfcnt_en[PANFROST_TILER_BLOCK]);
+}
+
+static void
+panfrost_perfcnt_release_job_ctx(struct panfrost_perfcnt_job_ctx *ctx)
+{
+	unsigned int i;
+
+	WARN_ON(refcount_read(&ctx->refcount));
+	for (i = 0; ctx->perfmons && i < ctx->perfmon_count; i++) {
+		if (ctx->perfmons[i] && atomic_dec_and_test(&ctx->perfmons[i]->busycnt))
+			wake_up(&ctx->perfmons[i]->wq);
+		panfrost_perfmon_put(ctx->perfmons[i]);
+	}
+
+	dma_fence_put(ctx->wait_fence);
+	dma_fence_put(ctx->done_fence);
+	kfree(ctx->perfmons);
+	kfree(ctx);
+}
+
+static void panfrost_perfcnt_put_job_ctx(struct panfrost_perfcnt_job_ctx *ctx)
+{
+	if (!IS_ERR_OR_NULL(ctx) && refcount_dec_and_test(&ctx->refcount))
+		panfrost_perfcnt_release_job_ctx(ctx);
+}
+
+struct panfrost_perfcnt_job_ctx *
+panfrost_perfcnt_get_job_ctx(struct panfrost_perfcnt_job_ctx *ctx)
+{
+	if (ctx)
+		refcount_inc(&ctx->refcount);
+
+	return ctx;
+}
+
+static void panfrost_perfcnt_dump_done(struct panfrost_perfcnt_job_ctx *ctx)
+{
+	struct panfrost_device *pfdev;
+	unsigned long flags;
+
+	pfdev = ctx->pfdev;
+	spin_lock_irqsave(&pfdev->perfcnt->ctx_lock, flags);
+	pfdev->perfcnt->dump_ctx = NULL;
+	if (pfdev->perfcnt->last_ctx == ctx)
+		pfdev->perfcnt->last_ctx = NULL;
+	spin_unlock_irqrestore(&pfdev->perfcnt->ctx_lock, flags);
+
+	dma_fence_signal(ctx->done_fence);
+	panfrost_perfcnt_release_job_ctx(ctx);
+}
+
+static void
+panfrost_perfcnt_get_counter_vals(struct panfrost_device *pfdev,
+				  enum drm_panfrost_block_id block,
+				  unsigned int instance, u32 *vals)
+{
+	u64 shader_present = pfdev->features.shader_present;
+	unsigned int bufoffs, shaderid, shadernum;
+
+	if (panfrost_has_hw_feature(pfdev, HW_FEATURE_V4)) {
+		unsigned int ncoregroups;
+
+		ncoregroups = hweight64(pfdev->features.l2_present);
+
+		switch (block) {
+		case PANFROST_SHADER_BLOCK:
+			for (shaderid = 0, shadernum = 0; shaderid < 64;
+			     shaderid++) {
+				if (!(BIT_ULL(shaderid) & shader_present))
+					continue;
+
+				if (shadernum == instance)
+					break;
+
+				shadernum++;
+			}
+
+			if (WARN_ON(shaderid == 64))
+				return;
+
+			/* 4 shaders per core group. */
+			bufoffs = ((shaderid / V4_SHADERS_PER_COREGROUP) *
+				   2048) +
+				  ((shaderid % V4_SHADERS_PER_COREGROUP) *
+				   256);
+			break;
+
+		case PANFROST_TILER_BLOCK:
+			if (WARN_ON(instance >= ncoregroups))
+				return;
+
+			bufoffs = (instance * 2048) + 1024;
+			break;
+		case PANFROST_MMU_L2_BLOCK:
+			if (WARN_ON(instance >= ncoregroups))
+				return;
+
+			bufoffs = (instance * 2048) + 1280;
+			break;
+		case PANFROST_JM_BLOCK:
+			if (WARN_ON(instance))
+				return;
+			bufoffs = 1792;
+			break;
+		default:
+			WARN_ON(1);
+			return;
+		}
+	} else {
+		unsigned int nl2c, ncores;
+
+		/*
+		 * TODO: define a macro to extract the number of l2 caches from
+		 * mem_features.
+		 */
+		nl2c = ((pfdev->features.mem_features >> 8) & GENMASK(3, 0)) + 1;
+
+		/*
+		 * The ARM driver is grouping cores per core group and then
+		 * only using the number of cores in group 0 to calculate the
+		 * size. Not sure why this is done like that, but I guess
+		 * shader_present will only show cores in the first group
+		 * anyway.
+		 */
+		ncores = hweight64(pfdev->features.shader_present);
+
+		switch (block) {
+		case PANFROST_SHADER_BLOCK:
+			for (shaderid = 0, shadernum = 0; shaderid < 64;
+			     shaderid++) {
+				if (!(BIT_ULL(shaderid) & shader_present))
+					continue;
+
+				if (shadernum == instance)
+					break;
+
+				shadernum++;
+			}
+
+			if (WARN_ON(shaderid == 64))
+				return;
+
+			/* Shaders come after the JM, tiler and L2 blocks. */
+			bufoffs = 512 + ((nl2c + shaderid) * 256);
+			break;
+
+		case PANFROST_TILER_BLOCK:
+			if (WARN_ON(instance))
+				return;
+
+			bufoffs = 256;
+			break;
+		case PANFROST_MMU_L2_BLOCK:
+			if (WARN_ON(instance >= nl2c))
+				return;
+
+			bufoffs = 512 + (instance * 256);
+			break;
+		case PANFROST_JM_BLOCK:
+			if (WARN_ON(instance))
+				return;
+			bufoffs = 0;
+			break;
+		default:
+			WARN_ON(1);
+			return;
+		}
+	}
+
+	memcpy(vals, pfdev->perfcnt->buf + bufoffs, 256);
+}
+
+static void
+panfrost_perfmon_upd_counter_vals(struct panfrost_perfmon *perfmon,
+				  enum drm_panfrost_block_id block,
+				  unsigned int instance, u32 *invals)
+{
+	u32 *outvals = perfmon->values[block];
+	unsigned int inidx, outidx;
+
+	if (WARN_ON(instance >= hweight64(perfmon->counters[block].instances)))
+		return;
+
+	if (!(perfmon->counters[block].instances & BIT_ULL(instance)))
+		return;
+
+	outvals += instance * hweight64(perfmon->counters[block].counters);
+	for (inidx = 0, outidx = 0; inidx < 64; inidx++) {
+		if (!(perfmon->counters[block].counters & BIT_ULL(inidx)))
+			continue;
+
+		if (U32_MAX - outvals[outidx] < invals[inidx])
+			outvals[outidx] = U32_MAX;
+		else
+			outvals[outidx] += invals[inidx];
+		outidx++;
+	}
+}
+
+static void panfrost_perfcnt_dump_work(struct work_struct *w)
+{
+	struct panfrost_perfcnt *perfcnt = container_of(w,
+						struct panfrost_perfcnt,
+						dumpwork);
+	struct panfrost_perfcnt_job_ctx *ctx = perfcnt->dump_ctx;
+	unsigned int block, instance, pmonidx, num;
+
+	if (!ctx)
+		return;
+
+	for (block = 0; block < PANFROST_NUM_BLOCKS; block++) {
+		struct panfrost_perfmon *perfmon;
+		u32 vals[COUNTERS_PER_BLOCK];
+		u64 instances = 0;
+
+		for (pmonidx = 0; pmonidx < ctx->perfmon_count; pmonidx++) {
+			perfmon = ctx->perfmons[pmonidx];
+			instances |= perfmon->counters[block].instances;
+		}
+
+		for (instance = 0, num = 0; instance < 64; instance++) {
+			if (!(instances & BIT_ULL(instance)))
+				continue;
+
+			panfrost_perfcnt_get_counter_vals(ctx->pfdev, block,
+							  instance, vals);
+
+			for (pmonidx = 0; pmonidx < ctx->perfmon_count;
+			     pmonidx++) {
+				perfmon = ctx->perfmons[pmonidx];
+				panfrost_perfmon_upd_counter_vals(perfmon,
+								  block,
+								  num,
+								  vals);
+			}
+			num++;
+		}
+	}
+
+	panfrost_perfcnt_dump_done(ctx);
+}
+
+void panfrost_perfcnt_clean_cache_done(struct panfrost_device *pfdev)
+{
+	schedule_work(&pfdev->perfcnt->dumpwork);
+}
+
+void panfrost_perfcnt_sample_done(struct panfrost_device *pfdev)
+{
+	gpu_write(pfdev, GPU_CMD, GPU_CMD_CLEAN_CACHES);
+}
+
+void panfrost_perfcnt_clean_job_ctx(struct panfrost_job *job)
+{
+	panfrost_perfcnt_put_job_ctx(job->perfcnt_ctx);
+}
+
+int panfrost_perfcnt_create_job_ctx(struct panfrost_job *job,
+				    struct drm_file *file_priv,
+				    struct drm_panfrost_submit *args)
+{
+	struct panfrost_device *pfdev = job->pfdev;
+	struct panfrost_file_priv *pfile = file_priv->driver_priv;
+	struct panfrost_perfcnt_job_ctx *ctx;
+	unsigned int i, j;
+	u32 *handles;
+	int ret;
+
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	ctx->pfdev = pfdev;
+	refcount_set(&ctx->refcount, 1);
+
+	ctx->perfmon_count = args->perfmon_handle_count;
+	if (!ctx->perfmon_count) {
+		job->perfcnt_ctx = ctx;
+		return 0;
+	}
+
+	handles = kcalloc(ctx->perfmon_count, sizeof(u32), GFP_KERNEL);
+	if (!handles) {
+		ret = -ENOMEM;
+		goto err_put_ctx;
+	}
+
+	if (copy_from_user(handles,
+			   u64_to_user_ptr(args->perfmon_handles),
+			   ctx->perfmon_count * sizeof(u32))) {
+		ret = -EFAULT;
+		DRM_DEBUG("Failed to copy in perfmon handles\n");
+		goto err_free_handles;
+	}
+
+	/* Make sure each perfmon only appears once. */
+	for (i = 0; i < ctx->perfmon_count - 1; i++) {
+		for (j = i + 1; j < ctx->perfmon_count; j++) {
+			if (handles[i] == handles[j]) {
+				ret = -EINVAL;
+				goto err_free_handles;
+			}
+		}
+	}
+
+	ctx->perfmons = kcalloc(ctx->perfmon_count, sizeof(*ctx->perfmons),
+				GFP_KERNEL);
+	if (!ctx->perfmons) {
+		ret = -ENOMEM;
+		goto err_free_handles;
+	}
+
+	for (i = 0; i < ctx->perfmon_count; i++) {
+		ctx->perfmons[i] = panfrost_perfcnt_find_perfmon(pfile,
+								 handles[i]);
+		if (!ctx->perfmons[i]) {
+			ret = -EINVAL;
+			goto err_free_handles;
+		}
+		atomic_inc(&ctx->perfmons[i]->busycnt);
+	}
+
+	job->perfcnt_ctx = ctx;
+	kfree(handles);
+	return 0;
+
+err_free_handles:
+	kfree(handles);
+
+err_put_ctx:
+	panfrost_perfcnt_put_job_ctx(ctx);
+	return ret;
+}
+
+void panfrost_perfcnt_finish_job(struct panfrost_job *job, bool skip_dump)
+{
+	struct panfrost_perfcnt_job_ctx *ctx = job->perfcnt_ctx;
+
+	if (WARN_ON(!ctx))
+		return;
+
+	job->perfcnt_ctx = NULL;
+	if (!refcount_dec_and_test(&ctx->refcount))
+		return;
+
+	if (!ctx->perfmon_count || skip_dump) {
+		panfrost_perfcnt_dump_done(ctx);
+		return;
+	}
+
+	ctx->pfdev->perfcnt->dump_ctx = ctx;
+	gpu_write(ctx->pfdev, GPU_CMD, GPU_CMD_PERFCNT_SAMPLE);
+}
+
+static bool panfrost_perfcnt_try_reuse_last_job_ctx(struct panfrost_job *job)
+{
+	struct panfrost_perfcnt_job_ctx *prev_ctx, *new_ctx;
+	struct panfrost_device *pfdev = job->pfdev;
+	unsigned int i;
+
+	new_ctx = job->perfcnt_ctx;
+	prev_ctx = pfdev->perfcnt->last_ctx;
+	if (!prev_ctx)
+		return false;
+
+	if (!refcount_inc_not_zero(&prev_ctx->refcount))
+		return false;
+
+	if (!panfrost_perfcnt_job_ctx_cmp(prev_ctx, new_ctx)) {
+		refcount_dec(&prev_ctx->refcount);
+		return false;
+	}
+
+	/*
+	 * Make sure we increment busycnt, as panfrost_perfcnt_put_job_ctx()
+	 * will decrement it.
+	 */
+	for (i = 0; i < prev_ctx->perfmon_count; i++)
+		atomic_inc(&prev_ctx->perfmons[i]->busycnt);
+
+	panfrost_perfcnt_put_job_ctx(new_ctx);
+	job->perfcnt_ctx = prev_ctx;
+	job->perfcnt_fence = dma_fence_get(prev_ctx->wait_fence);
+	return true;
+}
+
+int panfrost_perfcnt_push_job(struct panfrost_job *job)
+{
+	struct panfrost_perfcnt_job_ctx *prev_ctx, *new_ctx;
+	struct panfrost_device *pfdev = job->pfdev;
+	unsigned long flags;
+	int ret = 0;
+
+	spin_lock_irqsave(&pfdev->perfcnt->ctx_lock, flags);
+	new_ctx = job->perfcnt_ctx;
+	prev_ctx = pfdev->perfcnt->last_ctx;
+	/*
+	 * In order to keep things relatively fast even when HW counters are
+	 * enabled we try to avoid having to dump perfcounters at the end of
+	 * each job (which implies making other jobs wait for this dump to
+	 * finish) when that's possible.
+	 * This is only acceptable if all queued jobs share the same perfctx,
+	 * that is, they have the same list of perfmons attached to them. In this
+	 * condition we are guaranteed that nothing will increment the counters
+	 * behind our back.
+	 */
+	if (panfrost_perfcnt_try_reuse_last_job_ctx(job))
+		goto out;
+
+	new_ctx->done_fence = panfrost_perfcnt_fence_create(pfdev);
+	if (IS_ERR(new_ctx->done_fence)) {
+		ret = PTR_ERR(new_ctx->done_fence);
+		goto out;
+	}
+
+	/*
+	 * The previous job has a different perfmon ctx, so we must wait for it
+	 * to be done dumping the counters before we can schedule this new job,
+	 * otherwise we might corrupt the counter values.
+	 */
+	if (prev_ctx)
+		new_ctx->wait_fence = dma_fence_get(prev_ctx->done_fence);
+
+	job->perfcnt_fence = dma_fence_get(new_ctx->wait_fence);
+	pfdev->perfcnt->last_ctx = new_ctx;
+
+out:
+	spin_unlock_irqrestore(&pfdev->perfcnt->ctx_lock, flags);
+	return ret;
+}
+
+int panfrost_perfcnt_init(struct panfrost_device *pfdev)
+{
+	struct panfrost_perfcnt *perfcnt;
+	struct drm_gem_shmem_object *bo;
+	size_t size;
+	u32 status;
+	int ret;
+
+	if (panfrost_has_hw_feature(pfdev, HW_FEATURE_V4)) {
+		unsigned int ncoregroups;
+
+		ncoregroups = hweight64(pfdev->features.l2_present);
+		size = ncoregroups * BLOCKS_PER_COREGROUP *
+		       COUNTERS_PER_BLOCK * BYTES_PER_COUNTER;
+	} else {
+		unsigned int nl2c, ncores;
+
+		/*
+		 * TODO: define a macro to extract the number of l2 caches from
+		 * mem_features.
+		 */
+		nl2c = ((pfdev->features.mem_features >> 8) & GENMASK(3, 0)) + 1;
+
+		/*
+		 * The ARM driver is grouping cores per core group and then
+		 * only using the number of cores in group 0 to calculate the
+		 * size. Not sure why this is done like that, but I guess
+		 * shader_present will only show cores in the first group
+		 * anyway.
+		 */
+		ncores = hweight64(pfdev->features.shader_present);
+
+		/*
+		 * There's always one JM and one Tiler block, hence the '+ 2'
+		 * here.
+		 */
+		size = (nl2c + ncores + 2) *
+		       COUNTERS_PER_BLOCK * BYTES_PER_COUNTER;
+	}
+
+	perfcnt = devm_kzalloc(pfdev->dev, sizeof(*perfcnt), GFP_KERNEL);
+	if (!perfcnt)
+		return -ENOMEM;
+
+	bo = drm_gem_shmem_create(pfdev->ddev, size);
+	if (IS_ERR(bo))
+		return PTR_ERR(bo);
+
+	perfcnt->bo = to_panfrost_bo(&bo->base);
+
+	/*
+	 * We always use the same buffer, so let's map it once and keep it
+	 * mapped until the driver is unloaded. This might be a problem if
+	 * we start using different AS and the perfcnt BO is not mapped at
+	 * the same GPU virtual address.
+	 */
+	ret = panfrost_mmu_map(perfcnt->bo);
+	if (ret)
+		goto err_put_bo;
+
+	/* Disable everything. */
+	gpu_write(pfdev, GPU_PERFCNT_CFG,
+		  GPU_PERFCNT_CFG_AS(0) |
+		  GPU_PERFCNT_CFG_MODE(GPU_PERFCNT_CFG_MODE_OFF) |
+		  (panfrost_model_cmp(pfdev, 0x1000) >= 0 ?
+		   GPU_PERFCNT_CFG_SETSEL(1) : 0));
+	gpu_write(pfdev, GPU_PRFCNT_JM_EN, 0);
+	gpu_write(pfdev, GPU_PRFCNT_SHADER_EN, 0);
+	gpu_write(pfdev, GPU_PRFCNT_MMU_L2_EN, 0);
+	gpu_write(pfdev, GPU_PRFCNT_TILER_EN, 0);
+
+	perfcnt->buf = drm_gem_vmap(&bo->base);
+	if (IS_ERR(perfcnt->buf)) {
+		ret = PTR_ERR(perfcnt->buf);
+		goto err_put_bo;
+	}
+
+	INIT_WORK(&perfcnt->dumpwork, panfrost_perfcnt_dump_work);
+	mutex_init(&perfcnt->cfg_lock);
+	spin_lock_init(&perfcnt->fence_lock);
+	spin_lock_init(&perfcnt->ctx_lock);
+	perfcnt->fence_context = dma_fence_context_alloc(1);
+	pfdev->perfcnt = perfcnt;
+
+	/*
+	 * Invalidate the cache and clear the counters to start from a fresh
+	 * state.
+	 */
+	gpu_write(pfdev, GPU_INT_MASK, 0);
+	gpu_write(pfdev, GPU_INT_CLEAR, GPU_IRQ_CLEAN_CACHES_COMPLETED);
+
+	gpu_write(pfdev, GPU_CMD, GPU_CMD_PERFCNT_CLEAR);
+	gpu_write(pfdev, GPU_CMD, GPU_CMD_CLEAN_INV_CACHES);
+	ret = readl_relaxed_poll_timeout(pfdev->iomem + GPU_INT_RAWSTAT,
+					 status,
+					 status &
+					 GPU_IRQ_CLEAN_CACHES_COMPLETED,
+					 100, 10000);
+	if (ret)
+		goto err_gem_vunmap;
+
+	gpu_write(pfdev, GPU_INT_MASK, GPU_IRQ_MASK_ALL);
+
+	return 0;
+
+err_gem_vunmap:
+	drm_gem_vunmap(&pfdev->perfcnt->bo->base.base, pfdev->perfcnt->buf);
+
+err_put_bo:
+	drm_gem_object_put_unlocked(&bo->base);
+	return ret;
+}
+
+void panfrost_perfcnt_fini(struct panfrost_device *pfdev)
+{
+	drm_gem_vunmap(&pfdev->perfcnt->bo->base.base, pfdev->perfcnt->buf);
+	drm_gem_object_put_unlocked(&pfdev->perfcnt->bo->base.base);
+}
diff --git a/drivers/gpu/drm/panfrost/panfrost_perfcnt.h b/drivers/gpu/drm/panfrost/panfrost_perfcnt.h
new file mode 100644
index 000000000000..7cbfeb072aa1
--- /dev/null
+++ b/drivers/gpu/drm/panfrost/panfrost_perfcnt.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright 2019 Collabora Ltd */
+#ifndef __PANFROST_PERFCNT_H__
+#define __PANFROST_PERFCNT_H__
+
+#include <linux/bitops.h>
+
+struct panfrost_perfcnt_job_ctx;
+
+#define PERFCNT(_shader, _tiler, _mmu_l2, _jm)		\
+	{ _shader, _tiler, _mmu_l2, _jm }
+#define NO_PERFCNT      PERFCNT(0, 0, 0, 0)
+
+/* FIXME: Declare counters for all models */
+#define hw_perfcnt_t600	NO_PERFCNT
+#define hw_perfcnt_t620	NO_PERFCNT
+#define hw_perfcnt_t720	NO_PERFCNT
+#define hw_perfcnt_t760	NO_PERFCNT
+#define hw_perfcnt_t820	NO_PERFCNT
+#define hw_perfcnt_t830	NO_PERFCNT
+#define hw_perfcnt_t860	NO_PERFCNT
+#define hw_perfcnt_t880	NO_PERFCNT
+#define hw_perfcnt_g76	NO_PERFCNT
+#define hw_perfcnt_g71	NO_PERFCNT
+#define hw_perfcnt_g72	NO_PERFCNT
+#define hw_perfcnt_g51	NO_PERFCNT
+#define hw_perfcnt_g52	NO_PERFCNT
+#define hw_perfcnt_g31	NO_PERFCNT
+
+void panfrost_perfcnt_sample_done(struct panfrost_device *pfdev);
+void panfrost_perfcnt_clean_cache_done(struct panfrost_device *pfdev);
+int panfrost_perfcnt_push_job(struct panfrost_job *job);
+void panfrost_perfcnt_run_job(struct panfrost_job *job);
+void panfrost_perfcnt_finish_job(struct panfrost_job *job,
+				 bool skip_dump);
+void panfrost_perfcnt_clean_job_ctx(struct panfrost_job *job);
+int panfrost_perfcnt_create_job_ctx(struct panfrost_job *job,
+				    struct drm_file *file_priv,
+				    struct drm_panfrost_submit *args);
+void panfrost_perfcnt_open(struct panfrost_file_priv *pfile);
+void panfrost_perfcnt_close(struct panfrost_file_priv *pfile);
+int panfrost_perfcnt_init(struct panfrost_device *pfdev);
+void panfrost_perfcnt_fini(struct panfrost_device *pfdev);
+
+int panfrost_ioctl_get_perfcnt_layout(struct drm_device *dev, void *data,
+				      struct drm_file *file_priv);
+int panfrost_ioctl_create_perfmon(struct drm_device *dev, void *data,
+				  struct drm_file *file_priv);
+int panfrost_ioctl_destroy_perfmon(struct drm_device *dev, void *data,
+				   struct drm_file *file_priv);
+int panfrost_ioctl_get_perfmon_values(struct drm_device *dev, void *data,
+				      struct drm_file *file_priv);
+
+#endif
diff --git a/drivers/gpu/drm/panfrost/panfrost_regs.h b/drivers/gpu/drm/panfrost/panfrost_regs.h
index 42d08860fd76..ea38ac60581c 100644
--- a/drivers/gpu/drm/panfrost/panfrost_regs.h
+++ b/drivers/gpu/drm/panfrost/panfrost_regs.h
@@ -44,12 +44,31 @@
 	 GPU_IRQ_MULTIPLE_FAULT)
 #define GPU_CMD				0x30
 #define   GPU_CMD_SOFT_RESET		0x01
+#define   GPU_CMD_PERFCNT_CLEAR		0x03
+#define   GPU_CMD_PERFCNT_SAMPLE	0x04
+#define   GPU_CMD_CLEAN_CACHES		0x07
+#define   GPU_CMD_CLEAN_INV_CACHES	0x08
 #define GPU_STATUS			0x34
+#define   GPU_STATUS_PRFCNT_ACTIVE	BIT(2)
 #define GPU_LATEST_FLUSH_ID		0x38
 #define GPU_FAULT_STATUS		0x3C
 #define GPU_FAULT_ADDRESS_LO		0x40
 #define GPU_FAULT_ADDRESS_HI		0x44
 
+#define GPU_PERFCNT_BASE_LO		0x60
+#define GPU_PERFCNT_BASE_HI		0x64
+#define GPU_PERFCNT_CFG			0x68
+#define   GPU_PERFCNT_CFG_MODE(x)	(x)
+#define   GPU_PERFCNT_CFG_MODE_OFF	0
+#define   GPU_PERFCNT_CFG_MODE_MANUAL	1
+#define   GPU_PERFCNT_CFG_MODE_TILE	2
+#define   GPU_PERFCNT_CFG_AS(x)		((x) << 4)
+#define   GPU_PERFCNT_CFG_SETSEL(x)	((x) << 8)
+#define GPU_PRFCNT_JM_EN		0x6c
+#define GPU_PRFCNT_SHADER_EN		0x70
+#define GPU_PRFCNT_TILER_EN		0x74
+#define GPU_PRFCNT_MMU_L2_EN		0x7c
+
 #define GPU_THREAD_MAX_THREADS		0x0A0	/* (RO) Maximum number of threads per core */
 #define GPU_THREAD_MAX_WORKGROUP_SIZE	0x0A4	/* (RO) Maximum workgroup size */
 #define GPU_THREAD_MAX_BARRIER_SIZE	0x0A8	/* (RO) Maximum threads waiting at a barrier */
diff --git a/include/uapi/drm/panfrost_drm.h b/include/uapi/drm/panfrost_drm.h
index 508b9621d9db..e09b35bf6035 100644
--- a/include/uapi/drm/panfrost_drm.h
+++ b/include/uapi/drm/panfrost_drm.h
@@ -18,6 +18,10 @@ extern "C" {
 #define DRM_PANFROST_MMAP_BO			0x03
 #define DRM_PANFROST_GET_PARAM			0x04
 #define DRM_PANFROST_GET_BO_OFFSET		0x05
+#define DRM_PANFROST_GET_PERFCNT_LAYOUT		0x06
+#define DRM_PANFROST_CREATE_PERFMON		0x07
+#define DRM_PANFROST_DESTROY_PERFMON		0x08
+#define DRM_PANFROST_GET_PERFMON_VALUES		0x09
 
 #define DRM_IOCTL_PANFROST_SUBMIT		DRM_IOW(DRM_COMMAND_BASE + DRM_PANFROST_SUBMIT, struct drm_panfrost_submit)
 #define DRM_IOCTL_PANFROST_WAIT_BO		DRM_IOW(DRM_COMMAND_BASE + DRM_PANFROST_WAIT_BO, struct drm_panfrost_wait_bo)
@@ -25,6 +29,10 @@ extern "C" {
 #define DRM_IOCTL_PANFROST_MMAP_BO		DRM_IOWR(DRM_COMMAND_BASE + DRM_PANFROST_MMAP_BO, struct drm_panfrost_mmap_bo)
 #define DRM_IOCTL_PANFROST_GET_PARAM		DRM_IOWR(DRM_COMMAND_BASE + DRM_PANFROST_GET_PARAM, struct drm_panfrost_get_param)
 #define DRM_IOCTL_PANFROST_GET_BO_OFFSET	DRM_IOWR(DRM_COMMAND_BASE + DRM_PANFROST_GET_BO_OFFSET, struct drm_panfrost_get_bo_offset)
+#define DRM_IOCTL_PANFROST_GET_PERFCNT_LAYOUT	DRM_IOWR(DRM_COMMAND_BASE + DRM_PANFROST_GET_PERFCNT_LAYOUT, struct drm_panfrost_get_perfcnt_layout)
+#define DRM_IOCTL_PANFROST_CREATE_PERFMON	DRM_IOWR(DRM_COMMAND_BASE + DRM_PANFROST_CREATE_PERFMON, struct drm_panfrost_create_perfmon)
+#define DRM_IOCTL_PANFROST_DESTROY_PERFMON	DRM_IOWR(DRM_COMMAND_BASE + DRM_PANFROST_DESTROY_PERFMON, struct drm_panfrost_destroy_perfmon)
+#define DRM_IOCTL_PANFROST_GET_PERFMON_VALUES	DRM_IOWR(DRM_COMMAND_BASE + DRM_PANFROST_GET_PERFMON_VALUES, struct drm_panfrost_get_perfmon_values)
 
 #define PANFROST_JD_REQ_FS (1 << 0)
 /**
@@ -55,6 +63,15 @@ struct drm_panfrost_submit {
 
 	/** A combination of PANFROST_JD_REQ_* */
 	__u32 requirements;
+
+	/** Pointer to a u32 array of perfmon handles to attach to the job. */
+	__u64 perfmon_handles;
+
+	/** Number of perfmon handles passed in (each handle is a u32). */
+	__u32 perfmon_handle_count;
+
+	/** Unused field, should be set to 0. */
+	__u32 padding;
 };
 
 /**
@@ -133,6 +150,111 @@ struct drm_panfrost_get_bo_offset {
 	__u64 offset;
 };
 
+/**
+ * Panfrost HW block ids used to group HW counters. There might be several
+ * shader, tiler and MMU/L2 blocks in a given GPU. How many of them are
+ * available is exposed through the instances field of
+ * drm_panfrost_block_perfcounters.
+ */
+enum drm_panfrost_block_id {
+	PANFROST_SHADER_BLOCK,
+	PANFROST_TILER_BLOCK,
+	PANFROST_MMU_L2_BLOCK,
+	PANFROST_JM_BLOCK,
+	PANFROST_NUM_BLOCKS,
+};
+
+struct drm_panfrost_block_perfcounters {
+	/*
+	 * For DRM_IOCTL_PANFROST_GET_PERFCNT_LAYOUT, encodes the available
+	 * instances for a specific given block type.
+	 * For DRM_IOCTL_PANFROST_CREATE_PERFMON, encodes the instances the
+	 * user wants to monitor.
+	 * Note: the bitmap might be sparse.
+	 */
+	__u64 instances;
+
+	/*
+	 * For DRM_IOCTL_PANFROST_GET_PERFCNT_LAYOUT, encodes the available
+	 * counters attached to a specific block type.
+	 * For DRM_IOCTL_PANFROST_CREATE_PERFMON, encodes the counters the user
+	 * wants to monitor.
+	 * Note: the bitmap might be sparse.
+	 */
+	__u64 counters;
+};
+
+/**
+ * Used to retrieve available HW counters.
+ */
+struct drm_panfrost_get_perfcnt_layout {
+	struct drm_panfrost_block_perfcounters counters[PANFROST_NUM_BLOCKS];
+};
+
+/**
+ * Used to create a performance monitor. Each performance monitor is assigned
+ * an ID that can later be passed when submitting a job to capture hardware
+ * counter values (and thus count things related to this specific job).
+ * Performance monitors are attached to the GPU file descriptor and IDs are
+ * unique within this context, not across all GPU users.
+ * This implies that:
+ * - perfmons are automatically released when the FD is closed
+ * - perfmons can't be shared across GPU contexts
+ */
+struct drm_panfrost_create_perfmon {
+	/* Input Fields. */
+	/* List all HW counters this performance monitor should track. */
+	struct drm_panfrost_block_perfcounters counters[PANFROST_NUM_BLOCKS];
+
+	/* Output fields. */
+	/* ID of the newly created perfmon. */
+	__u32 id;
+
+	/* Padding: must be set to 0. */
+	__u32 padding;
+};
+
+/**
+ * Destroy an existing performance monitor.
+ */
+struct drm_panfrost_destroy_perfmon {
+	/*
+	 * ID of the perfmon to destroy (the one returned by
+	 * DRM_IOCTL_PANFROST_CREATE_PERFMON)
+	 */
+	__u32 id;
+};
+
+/*
+ * Don't wait when trying to get perfmon values. If the perfmon is still active
+ * (still attached to a queued or running job), EBUSY is returned.
+ */
+#define DRM_PANFROST_GET_PERFMON_VALS_DONT_WAIT		0x1
+
+/* Reset all perfmon values to zero after reading them. */
+#define DRM_PANFROST_GET_PERFMON_VALS_RESET		0x2
+
+/**
+ * Used to query values collected by a performance monitor.
+ */
+struct drm_panfrost_get_perfmon_values {
+	/* ID of the perfmon to query values on. */
+	__u32 id;
+
+	/* See DRM_PANFROST_GET_PERFMON_VALS_XXX flags */
+	__u32 flags;
+
+	/*
+	 * An array of userspace pointers to which the u32 counter values
+	 * will be copied.
+	 * The array sizes depend on the counters/instances activated at
+	 * perfmon creation time: hweight64(instances) * hweight64(counters).
+	 * Note that some entries in values_ptrs[] might be NULL if no counters
+	 * on a specific block were activated.
+	 */
+	__u64 values_ptrs[PANFROST_NUM_BLOCKS];
+};
+
 #if defined(__cplusplus)
 }
 #endif
-- 
2.20.1

