* [PATCH 1/9] drm: Create a driver hook for allocating GEM object structs.
@ 2015-12-01 20:35 ` Eric Anholt
  0 siblings, 0 replies; 53+ messages in thread
From: Eric Anholt @ 2015-12-01 20:35 UTC (permalink / raw)
  To: dri-devel; +Cc: linux-kernel, David Airlie, Eric Anholt

The CMA helpers had no way for a driver to extend the struct with its
own fields.  Since the CMA helpers are mostly "Allocate a
drm_gem_cma_object, then fill in a few fields", it's hard to write them
as pure helpers without passing in a driver callback for the allocation
step.
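
For illustration, a driver that embeds struct drm_gem_cma_object in its
own BO struct would use the new hook roughly like this (a minimal
sketch against this series' API; the "foo" driver and its fields are
hypothetical):

	struct foo_bo {
		struct drm_gem_cma_object base;
		/* Driver-private fields extend the CMA object here. */
		struct list_head free_list;
	};

	static struct drm_gem_object *
	foo_gem_create_object(struct drm_device *dev, size_t size)
	{
		struct foo_bo *bo = kzalloc(sizeof(*bo), GFP_KERNEL);

		/* The CMA helper treats a NULL return as -ENOMEM. */
		if (!bo)
			return NULL;

		return &bo->base.base;
	}

	static struct drm_driver foo_drm_driver = {
		/* ... */
		.gem_create_object = foo_gem_create_object,
		.gem_free_object = drm_gem_cma_free_object,
		/* ... */
	};

The helper then calls drm_gem_object_init() on the returned object and
fills in the CMA fields, so the driver only overrides the allocation.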

Signed-off-by: Eric Anholt <eric@anholt.net>
---
 drivers/gpu/drm/drm_gem_cma_helper.c | 10 ++++++----
 include/drm/drmP.h                   |  7 +++++++
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/drm_gem_cma_helper.c b/drivers/gpu/drm/drm_gem_cma_helper.c
index e109b49..0f7b00ba 100644
--- a/drivers/gpu/drm/drm_gem_cma_helper.c
+++ b/drivers/gpu/drm/drm_gem_cma_helper.c
@@ -59,11 +59,13 @@ __drm_gem_cma_create(struct drm_device *drm, size_t size)
 	struct drm_gem_object *gem_obj;
 	int ret;
 
-	cma_obj = kzalloc(sizeof(*cma_obj), GFP_KERNEL);
-	if (!cma_obj)
+	if (drm->driver->gem_create_object)
+		gem_obj = drm->driver->gem_create_object(drm, size);
+	else
+		gem_obj = kzalloc(sizeof(*cma_obj), GFP_KERNEL);
+	if (!gem_obj)
 		return ERR_PTR(-ENOMEM);
-
-	gem_obj = &cma_obj->base;
+	cma_obj = container_of(gem_obj, struct drm_gem_cma_object, base);
 
 	ret = drm_gem_object_init(drm, gem_obj, size);
 	if (ret)
diff --git a/include/drm/drmP.h b/include/drm/drmP.h
index 0b921ae..22ff162 100644
--- a/include/drm/drmP.h
+++ b/include/drm/drmP.h
@@ -580,6 +580,13 @@ struct drm_driver {
 	int (*gem_open_object) (struct drm_gem_object *, struct drm_file *);
 	void (*gem_close_object) (struct drm_gem_object *, struct drm_file *);
 
+	/**
+	 * Hook for allocating the GEM object struct, for use by core
+	 * helpers.
+	 */
+	struct drm_gem_object *(*gem_create_object)(struct drm_device *dev,
+						    size_t size);
+
 	/* prime: */
 	/* export handle -> fd (see drm_gem_prime_handle_to_fd() helper) */
 	int (*prime_handle_to_fd)(struct drm_device *dev, struct drm_file *file_priv,
-- 
2.6.2


* [PATCH 2/9] drm/vc4: Add a BO cache.
  2015-12-01 20:35 ` Eric Anholt
@ 2015-12-01 20:35   ` Eric Anholt
  0 siblings, 0 replies; 53+ messages in thread
From: Eric Anholt @ 2015-12-01 20:35 UTC (permalink / raw)
  To: dri-devel; +Cc: linux-kernel, David Airlie, Eric Anholt

We need to allocate new BOs in the kernel as part of each frame, but
the CMA allocator is way too slow for that.  As an optimization, keep
track of recently-freed BOs and reuse them, with a 1 second timeout to
fully free them back to the system.

This improves 3D performance by about 15%.
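
For reference, a kernel-internal caller allocating per-frame buffers
would hit the cache roughly like this (a hypothetical call site, not
part of this patch; cached BOs keep their old contents, so this is only
safe for buffers whose stale data never leaks to userspace):

	static int frame_alloc_example(struct drm_device *dev, size_t size)
	{
		/* Try a recently freed BO of the same page count first;
		 * vc4_bo_create() falls back to a fresh CMA allocation
		 * (purging the cache if CMA is exhausted) on a miss.
		 */
		struct vc4_bo *bo = vc4_bo_create(dev, size, true);

		if (!bo)
			return -ENOMEM;

		/* ... use the BO for this frame's rendering ... */

		/* On the last unreference, vc4_free_object() parks the
		 * BO in the cache; the one-second timer frees it back
		 * to CMA if it isn't reused.
		 */
		drm_gem_object_unreference_unlocked(&bo->base.base);
		return 0;
	}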

Signed-off-by: Eric Anholt <eric@anholt.net>
---
 drivers/gpu/drm/vc4/vc4_bo.c      | 336 +++++++++++++++++++++++++++++++++++++-
 drivers/gpu/drm/vc4/vc4_debugfs.c |   1 +
 drivers/gpu/drm/vc4/vc4_drv.c     |   6 +-
 drivers/gpu/drm/vc4/vc4_drv.h     |  49 +++++-
 4 files changed, 384 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/vc4/vc4_bo.c b/drivers/gpu/drm/vc4/vc4_bo.c
index ab9f510..18faa5b 100644
--- a/drivers/gpu/drm/vc4/vc4_bo.c
+++ b/drivers/gpu/drm/vc4/vc4_bo.c
@@ -12,19 +12,229 @@
  * access to system memory with no MMU in between.  To support it, we
  * use the GEM CMA helper functions to allocate contiguous ranges of
  * physical memory for our BOs.
+ *
+ * Since the CMA allocator is very slow, we keep a cache of recently
+ * freed BOs around so that the kernel's allocation of objects for 3D
+ * rendering can return quickly.
  */
 
 #include "vc4_drv.h"
 
-struct vc4_bo *vc4_bo_create(struct drm_device *dev, size_t size)
+static void vc4_bo_stats_dump(struct vc4_dev *vc4)
+{
+	DRM_INFO("num bos allocated: %d\n",
+		 vc4->bo_stats.num_allocated);
+	DRM_INFO("size bos allocated: %dkb\n",
+		 vc4->bo_stats.size_allocated / 1024);
+	DRM_INFO("num bos used: %d\n",
+		 vc4->bo_stats.num_allocated - vc4->bo_stats.num_cached);
+	DRM_INFO("size bos used: %dkb\n",
+		 (vc4->bo_stats.size_allocated -
+		  vc4->bo_stats.size_cached) / 1024);
+	DRM_INFO("num bos cached: %d\n",
+		 vc4->bo_stats.num_cached);
+	DRM_INFO("size bos cached: %dkb\n",
+		 vc4->bo_stats.size_cached / 1024);
+}
+
+#ifdef CONFIG_DEBUG_FS
+int vc4_bo_stats_debugfs(struct seq_file *m, void *unused)
+{
+	struct drm_info_node *node = (struct drm_info_node *)m->private;
+	struct drm_device *dev = node->minor->dev;
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+	struct vc4_bo_stats stats;
+
+	/* Take a snapshot of the current stats with the lock held. */
+	mutex_lock(&vc4->bo_lock);
+	stats = vc4->bo_stats;
+	mutex_unlock(&vc4->bo_lock);
+
+	seq_printf(m, "num bos allocated: %d\n",
+		   stats.num_allocated);
+	seq_printf(m, "size bos allocated: %dkb\n",
+		   stats.size_allocated / 1024);
+	seq_printf(m, "num bos used: %d\n",
+		   stats.num_allocated - stats.num_cached);
+	seq_printf(m, "size bos used: %dkb\n",
+		   (stats.size_allocated - stats.size_cached) / 1024);
+	seq_printf(m, "num bos cached: %d\n",
+		   stats.num_cached);
+	seq_printf(m, "size bos cached: %dkb\n",
+		   stats.size_cached / 1024);
+
+	return 0;
+}
+#endif
+
+static uint32_t bo_page_index(size_t size)
+{
+	return (size / PAGE_SIZE) - 1;
+}
+
+/* Must be called with bo_lock held. */
+static void vc4_bo_destroy(struct vc4_bo *bo)
 {
+	struct drm_gem_object *obj = &bo->base.base;
+	struct vc4_dev *vc4 = to_vc4_dev(obj->dev);
+
+	vc4->bo_stats.num_allocated--;
+	vc4->bo_stats.size_allocated -= obj->size;
+	drm_gem_cma_free_object(obj);
+}
+
+/* Must be called with bo_lock held. */
+static void vc4_bo_remove_from_cache(struct vc4_bo *bo)
+{
+	struct drm_gem_object *obj = &bo->base.base;
+	struct vc4_dev *vc4 = to_vc4_dev(obj->dev);
+
+	vc4->bo_stats.num_cached--;
+	vc4->bo_stats.size_cached -= obj->size;
+
+	list_del(&bo->unref_head);
+	list_del(&bo->size_head);
+}
+
+static struct list_head *vc4_get_cache_list_for_size(struct drm_device *dev,
+						     size_t size)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+	uint32_t page_index = bo_page_index(size);
+
+	if (vc4->bo_cache.size_list_size <= page_index) {
+		uint32_t new_size = max(vc4->bo_cache.size_list_size * 2,
+					page_index + 1);
+		struct list_head *new_list;
+		uint32_t i;
+
+		new_list = kmalloc_array(new_size, sizeof(struct list_head),
+					 GFP_KERNEL);
+		if (!new_list)
+			return NULL;
+
+		/* Rebase the old cached BO lists to their new list
+		 * head locations.
+		 */
+		for (i = 0; i < vc4->bo_cache.size_list_size; i++) {
+			struct list_head *old_list =
+				&vc4->bo_cache.size_list[i];
+
+			if (list_empty(old_list))
+				INIT_LIST_HEAD(&new_list[i]);
+			else
+				list_replace(old_list, &new_list[i]);
+		}
+		/* And initialize the brand new BO list heads. */
+		for (i = vc4->bo_cache.size_list_size; i < new_size; i++)
+			INIT_LIST_HEAD(&new_list[i]);
+
+		kfree(vc4->bo_cache.size_list);
+		vc4->bo_cache.size_list = new_list;
+		vc4->bo_cache.size_list_size = new_size;
+	}
+
+	return &vc4->bo_cache.size_list[page_index];
+}
+
+void vc4_bo_cache_purge(struct drm_device *dev)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+
+	mutex_lock(&vc4->bo_lock);
+	while (!list_empty(&vc4->bo_cache.time_list)) {
+		struct vc4_bo *bo = list_last_entry(&vc4->bo_cache.time_list,
+						    struct vc4_bo, unref_head);
+		vc4_bo_remove_from_cache(bo);
+		vc4_bo_destroy(bo);
+	}
+	mutex_unlock(&vc4->bo_lock);
+}
+
+static struct vc4_bo *vc4_bo_get_from_cache(struct drm_device *dev,
+					    uint32_t size)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+	uint32_t page_index = bo_page_index(size);
+	struct vc4_bo *bo = NULL;
+
+	size = roundup(size, PAGE_SIZE);
+
+	mutex_lock(&vc4->bo_lock);
+	if (page_index >= vc4->bo_cache.size_list_size)
+		goto out;
+
+	if (list_empty(&vc4->bo_cache.size_list[page_index]))
+		goto out;
+
+	bo = list_first_entry(&vc4->bo_cache.size_list[page_index],
+			      struct vc4_bo, size_head);
+	vc4_bo_remove_from_cache(bo);
+	kref_init(&bo->base.base.refcount);
+
+out:
+	mutex_unlock(&vc4->bo_lock);
+	return bo;
+}
+
+/**
+ * vc4_create_object - Implementation of driver->gem_create_object.
+ *
+ * This lets the CMA helpers allocate object structs for us, and keep
+ * our BO stats correct.
+ */
+struct drm_gem_object *vc4_create_object(struct drm_device *dev, size_t size)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+	struct vc4_bo *bo;
+
+	bo = kzalloc(sizeof(*bo), GFP_KERNEL);
+	if (!bo)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_lock(&vc4->bo_lock);
+	vc4->bo_stats.num_allocated++;
+	vc4->bo_stats.size_allocated += size;
+	mutex_unlock(&vc4->bo_lock);
+
+	return &bo->base.base;
+}
+
+struct vc4_bo *vc4_bo_create(struct drm_device *dev, size_t unaligned_size,
+			     bool from_cache)
+{
+	size_t size = roundup(unaligned_size, PAGE_SIZE);
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
 	struct drm_gem_cma_object *cma_obj;
 
-	cma_obj = drm_gem_cma_create(dev, size);
-	if (IS_ERR(cma_obj))
+	if (size == 0)
 		return NULL;
-	else
-		return to_vc4_bo(&cma_obj->base);
+
+	/* First, try to get a vc4_bo from the kernel BO cache. */
+	if (from_cache) {
+		struct vc4_bo *bo = vc4_bo_get_from_cache(dev, size);
+
+		if (bo)
+			return bo;
+	}
+
+	cma_obj = drm_gem_cma_create(dev, size);
+	if (IS_ERR(cma_obj)) {
+		/*
+		 * If we've run out of CMA memory, kill the cache of
+		 * CMA allocations we've got laying around and try again.
+		 */
+		vc4_bo_cache_purge(dev);
+
+		cma_obj = drm_gem_cma_create(dev, size);
+		if (IS_ERR(cma_obj)) {
+			DRM_ERROR("Failed to allocate from CMA:\n");
+			vc4_bo_stats_dump(vc4);
+			return NULL;
+		}
+	}
+
+	return to_vc4_bo(&cma_obj->base);
 }
 
 int vc4_dumb_create(struct drm_file *file_priv,
@@ -41,7 +251,7 @@ int vc4_dumb_create(struct drm_file *file_priv,
 	if (args->size < args->pitch * args->height)
 		args->size = args->pitch * args->height;
 
-	bo = vc4_bo_create(dev, roundup(args->size, PAGE_SIZE));
+	bo = vc4_bo_create(dev, args->size, false);
 	if (!bo)
 		return -ENOMEM;
 
@@ -50,3 +260,117 @@ int vc4_dumb_create(struct drm_file *file_priv,
 
 	return ret;
 }
+
+/* Must be called with bo_lock held. */
+static void vc4_bo_cache_free_old(struct drm_device *dev)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+	unsigned long expire_time = jiffies - msecs_to_jiffies(1000);
+
+	while (!list_empty(&vc4->bo_cache.time_list)) {
+		struct vc4_bo *bo = list_last_entry(&vc4->bo_cache.time_list,
+						    struct vc4_bo, unref_head);
+		if (time_before(expire_time, bo->free_time)) {
+			mod_timer(&vc4->bo_cache.time_timer,
+				  round_jiffies_up(jiffies +
+						   msecs_to_jiffies(1000)));
+			return;
+		}
+
+		vc4_bo_remove_from_cache(bo);
+		vc4_bo_destroy(bo);
+	}
+}
+
+/* Called on the last userspace/kernel unreference of the BO.  Returns
+ * it to the BO cache if possible, otherwise frees it.
+ *
+ * Note that this is called with the struct_mutex held.
+ */
+void vc4_free_object(struct drm_gem_object *gem_bo)
+{
+	struct drm_device *dev = gem_bo->dev;
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+	struct vc4_bo *bo = to_vc4_bo(gem_bo);
+	struct list_head *cache_list;
+
+	mutex_lock(&vc4->bo_lock);
+	/* If the object references someone else's memory, we can't cache it.
+	 */
+	if (gem_bo->import_attach) {
+		vc4_bo_destroy(bo);
+		goto out;
+	}
+
+	/* Don't cache if it was publicly named. */
+	if (gem_bo->name) {
+		vc4_bo_destroy(bo);
+		goto out;
+	}
+
+	cache_list = vc4_get_cache_list_for_size(dev, gem_bo->size);
+	if (!cache_list) {
+		vc4_bo_destroy(bo);
+		goto out;
+	}
+
+	bo->free_time = jiffies;
+	list_add(&bo->size_head, cache_list);
+	list_add(&bo->unref_head, &vc4->bo_cache.time_list);
+
+	vc4->bo_stats.num_cached++;
+	vc4->bo_stats.size_cached += gem_bo->size;
+
+	vc4_bo_cache_free_old(dev);
+
+out:
+	mutex_unlock(&vc4->bo_lock);
+}
+
+static void vc4_bo_cache_time_work(struct work_struct *work)
+{
+	struct vc4_dev *vc4 =
+		container_of(work, struct vc4_dev, bo_cache.time_work);
+	struct drm_device *dev = vc4->dev;
+
+	mutex_lock(&vc4->bo_lock);
+	vc4_bo_cache_free_old(dev);
+	mutex_unlock(&vc4->bo_lock);
+}
+
+static void vc4_bo_cache_time_timer(unsigned long data)
+{
+	struct drm_device *dev = (struct drm_device *)data;
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+
+	schedule_work(&vc4->bo_cache.time_work);
+}
+
+void vc4_bo_cache_init(struct drm_device *dev)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+
+	mutex_init(&vc4->bo_lock);
+
+	INIT_LIST_HEAD(&vc4->bo_cache.time_list);
+
+	INIT_WORK(&vc4->bo_cache.time_work, vc4_bo_cache_time_work);
+	setup_timer(&vc4->bo_cache.time_timer,
+		    vc4_bo_cache_time_timer,
+		    (unsigned long)dev);
+}
+
+void vc4_bo_cache_destroy(struct drm_device *dev)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+
+	del_timer(&vc4->bo_cache.time_timer);
+	cancel_work_sync(&vc4->bo_cache.time_work);
+
+	vc4_bo_cache_purge(dev);
+
+	if (vc4->bo_stats.num_allocated) {
+		DRM_ERROR("Destroying BO cache while BOs still allocated:\n");
+		vc4_bo_stats_dump(vc4);
+	}
+}
diff --git a/drivers/gpu/drm/vc4/vc4_debugfs.c b/drivers/gpu/drm/vc4/vc4_debugfs.c
index 4297b0a5..6bcf96e 100644
--- a/drivers/gpu/drm/vc4/vc4_debugfs.c
+++ b/drivers/gpu/drm/vc4/vc4_debugfs.c
@@ -16,6 +16,7 @@
 #include "vc4_regs.h"
 
 static const struct drm_info_list vc4_debugfs_list[] = {
+	{"bo_stats", vc4_bo_stats_debugfs, 0},
 	{"hdmi_regs", vc4_hdmi_debugfs_regs, 0},
 	{"hvs_regs", vc4_hvs_debugfs_regs, 0},
 	{"crtc0_regs", vc4_crtc_debugfs_regs, 0, (void *)(uintptr_t)0},
diff --git a/drivers/gpu/drm/vc4/vc4_drv.c b/drivers/gpu/drm/vc4/vc4_drv.c
index 6e73060..da041fa 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.c
+++ b/drivers/gpu/drm/vc4/vc4_drv.c
@@ -92,7 +92,8 @@ static struct drm_driver vc4_drm_driver = {
 	.debugfs_cleanup = vc4_debugfs_cleanup,
 #endif
 
-	.gem_free_object = drm_gem_cma_free_object,
+	.gem_create_object = vc4_create_object,
+	.gem_free_object = vc4_free_object,
 	.gem_vm_ops = &drm_gem_cma_vm_ops,
 
 	.prime_handle_to_fd = drm_gem_prime_handle_to_fd,
@@ -170,6 +171,8 @@ static int vc4_drm_bind(struct device *dev)
 
 	drm_dev_set_unique(drm, dev_name(dev));
 
+	vc4_bo_cache_init(drm);
+
 	drm_mode_config_init(drm);
 	if (ret)
 		goto unref;
@@ -202,6 +205,7 @@ unbind_all:
 	component_unbind_all(dev, drm);
 unref:
 	drm_dev_unref(drm);
+	vc4_bo_cache_destroy(drm);
 	return ret;
 }
 
diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h
index fd8319f..39a1ff5 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.h
+++ b/drivers/gpu/drm/vc4/vc4_drv.h
@@ -17,6 +17,37 @@ struct vc4_dev {
 	struct vc4_crtc *crtc[3];
 
 	struct drm_fbdev_cma *fbdev;
+
+	/* The kernel-space BO cache.  Tracks buffers that have been
+	 * unreferenced by all other users (refcounts of 0!) but not
+	 * yet freed, so we can do cheap allocations.
+	 */
+	struct vc4_bo_cache {
+		/* Array of list heads for entries in the BO cache,
+		 * based on number of pages, so we can do O(1) lookups
+		 * in the cache when allocating.
+		 */
+		struct list_head *size_list;
+		uint32_t size_list_size;
+
+		/* List of all BOs in the cache, ordered by age, so we
+		 * can do O(1) lookups when trying to free old
+		 * buffers.
+		 */
+		struct list_head time_list;
+		struct work_struct time_work;
+		struct timer_list time_timer;
+	} bo_cache;
+
+	struct vc4_bo_stats {
+		u32 num_allocated;
+		u32 size_allocated;
+		u32 num_cached;
+		u32 size_cached;
+	} bo_stats;
+
+	/* Protects bo_cache and the BO stats. */
+	struct mutex bo_lock;
 };
 
 static inline struct vc4_dev *
@@ -27,6 +58,17 @@ to_vc4_dev(struct drm_device *dev)
 
 struct vc4_bo {
 	struct drm_gem_cma_object base;
+
+	/* List entry for the BO's position in either
+	 * vc4_exec_info->unref_list or vc4_dev->bo_cache.time_list
+	 */
+	struct list_head unref_head;
+
+	/* Time in jiffies when the BO was put in vc4->bo_cache. */
+	unsigned long free_time;
+
+	/* List entry for the BO's position in vc4_dev->bo_cache.size_list */
+	struct list_head size_head;
 };
 
 static inline struct vc4_bo *
@@ -104,13 +146,18 @@ to_vc4_encoder(struct drm_encoder *encoder)
 #define wait_for(COND, MS) _wait_for(COND, MS, 1)
 
 /* vc4_bo.c */
+struct drm_gem_object *vc4_create_object(struct drm_device *dev, size_t size);
 void vc4_free_object(struct drm_gem_object *gem_obj);
-struct vc4_bo *vc4_bo_create(struct drm_device *dev, size_t size);
+struct vc4_bo *vc4_bo_create(struct drm_device *dev, size_t size,
+			     bool from_cache);
 int vc4_dumb_create(struct drm_file *file_priv,
 		    struct drm_device *dev,
 		    struct drm_mode_create_dumb *args);
 struct dma_buf *vc4_prime_export(struct drm_device *dev,
 				 struct drm_gem_object *obj, int flags);
+void vc4_bo_cache_init(struct drm_device *dev);
+void vc4_bo_cache_destroy(struct drm_device *dev);
+int vc4_bo_stats_debugfs(struct seq_file *m, void *arg);
 
 /* vc4_crtc.c */
 extern struct platform_driver vc4_crtc_driver;
-- 
2.6.2


* [PATCH 3/9] drm/vc4: Add create and map BO ioctls.
  2015-12-01 20:35 ` Eric Anholt
@ 2015-12-01 20:35   ` Eric Anholt
  0 siblings, 0 replies; 53+ messages in thread
From: Eric Anholt @ 2015-12-01 20:35 UTC (permalink / raw)
  To: dri-devel; +Cc: linux-kernel, David Airlie, Eric Anholt

While there exist dumb APIs for creating and mapping BOs, one of the
rules is that drivers doing 3D acceleration have to provide their own
APIs for buffer allocation (besides, the pitch/height parameters of
the dumb alloc don't really make sense for a lot of 3D allocations).
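
For context, userspace pairs the two ioctls roughly as follows (an
illustrative snippet assuming libdrm's drmIoctl() and an open vc4 DRM
fd; error handling omitted):

	struct drm_vc4_create_bo create = {
		.size = 65536,
		.flags = 0,
	};
	struct drm_vc4_mmap_bo map = { 0 };
	void *ptr;

	drmIoctl(fd, DRM_IOCTL_VC4_CREATE_BO, &create);

	/* Returns the fake offset for this handle; the mapping itself
	 * goes through the DRM node, so tools like valgrind see it.
	 */
	map.handle = create.handle;
	drmIoctl(fd, DRM_IOCTL_VC4_MMAP_BO, &map);

	ptr = mmap(NULL, create.size, PROT_READ | PROT_WRITE,
		   MAP_SHARED, fd, map.offset);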

Signed-off-by: Eric Anholt <eric@anholt.net>
---
 drivers/gpu/drm/vc4/vc4_bo.c  | 41 ++++++++++++++++++++++++++
 drivers/gpu/drm/vc4/vc4_drv.c |  3 ++
 drivers/gpu/drm/vc4/vc4_drv.h |  4 +++
 include/uapi/drm/Kbuild       |  1 +
 include/uapi/drm/vc4_drm.h    | 68 +++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 117 insertions(+)
 create mode 100644 include/uapi/drm/vc4_drm.h

diff --git a/drivers/gpu/drm/vc4/vc4_bo.c b/drivers/gpu/drm/vc4/vc4_bo.c
index 18faa5b..06cba26 100644
--- a/drivers/gpu/drm/vc4/vc4_bo.c
+++ b/drivers/gpu/drm/vc4/vc4_bo.c
@@ -19,6 +19,7 @@
  */
 
 #include "vc4_drv.h"
+#include "uapi/drm/vc4_drm.h"
 
 static void vc4_bo_stats_dump(struct vc4_dev *vc4)
 {
@@ -346,6 +347,46 @@ static void vc4_bo_cache_time_timer(unsigned long data)
 	schedule_work(&vc4->bo_cache.time_work);
 }
 
+int vc4_create_bo_ioctl(struct drm_device *dev, void *data,
+			struct drm_file *file_priv)
+{
+	struct drm_vc4_create_bo *args = data;
+	struct vc4_bo *bo = NULL;
+	int ret;
+
+	/*
+	 * We can't allocate from the BO cache, because the BOs don't
+	 * get zeroed, and that might leak data between users.
+	 */
+	bo = vc4_bo_create(dev, args->size, false);
+	if (!bo)
+		return -ENOMEM;
+
+	ret = drm_gem_handle_create(file_priv, &bo->base.base, &args->handle);
+	drm_gem_object_unreference_unlocked(&bo->base.base);
+
+	return ret;
+}
+
+int vc4_mmap_bo_ioctl(struct drm_device *dev, void *data,
+		      struct drm_file *file_priv)
+{
+	struct drm_vc4_mmap_bo *args = data;
+	struct drm_gem_object *gem_obj;
+
+	gem_obj = drm_gem_object_lookup(dev, file_priv, args->handle);
+	if (!gem_obj) {
+		DRM_ERROR("Failed to look up GEM BO %d\n", args->handle);
+		return -EINVAL;
+	}
+
+	/* The mmap offset was set up at BO allocation time. */
+	args->offset = drm_vma_node_offset_addr(&gem_obj->vma_node);
+
+	drm_gem_object_unreference_unlocked(gem_obj);
+	return 0;
+}
+
 void vc4_bo_cache_init(struct drm_device *dev)
 {
 	struct vc4_dev *vc4 = to_vc4_dev(dev);
diff --git a/drivers/gpu/drm/vc4/vc4_drv.c b/drivers/gpu/drm/vc4/vc4_drv.c
index da041fa..5fa4688 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.c
+++ b/drivers/gpu/drm/vc4/vc4_drv.c
@@ -16,6 +16,7 @@
 #include <linux/platform_device.h>
 #include "drm_fb_cma_helper.h"
 
+#include "uapi/drm/vc4_drm.h"
 #include "vc4_drv.h"
 #include "vc4_regs.h"
 
@@ -73,6 +74,8 @@ static const struct file_operations vc4_drm_fops = {
 };
 
 static const struct drm_ioctl_desc vc4_drm_ioctls[] = {
+	DRM_IOCTL_DEF_DRV(VC4_CREATE_BO, vc4_create_bo_ioctl, 0),
+	DRM_IOCTL_DEF_DRV(VC4_MMAP_BO, vc4_mmap_bo_ioctl, 0),
 };
 
 static struct drm_driver vc4_drm_driver = {
diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h
index 39a1ff5..fddb0a0 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.h
+++ b/drivers/gpu/drm/vc4/vc4_drv.h
@@ -155,6 +155,10 @@ int vc4_dumb_create(struct drm_file *file_priv,
 		    struct drm_mode_create_dumb *args);
 struct dma_buf *vc4_prime_export(struct drm_device *dev,
 				 struct drm_gem_object *obj, int flags);
+int vc4_create_bo_ioctl(struct drm_device *dev, void *data,
+			struct drm_file *file_priv);
+int vc4_mmap_bo_ioctl(struct drm_device *dev, void *data,
+		      struct drm_file *file_priv);
 void vc4_bo_cache_init(struct drm_device *dev);
 void vc4_bo_cache_destroy(struct drm_device *dev);
 int vc4_bo_stats_debugfs(struct seq_file *m, void *arg);
diff --git a/include/uapi/drm/Kbuild b/include/uapi/drm/Kbuild
index 38d4370..974fcd5 100644
--- a/include/uapi/drm/Kbuild
+++ b/include/uapi/drm/Kbuild
@@ -17,4 +17,5 @@ header-y += tegra_drm.h
 header-y += via_drm.h
 header-y += vmwgfx_drm.h
 header-y += msm_drm.h
+header-y += vc4_drm.h
 header-y += virtgpu_drm.h
diff --git a/include/uapi/drm/vc4_drm.h b/include/uapi/drm/vc4_drm.h
new file mode 100644
index 0000000..068aab9
--- /dev/null
+++ b/include/uapi/drm/vc4_drm.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright © 2014-2015 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef _UAPI_VC4_DRM_H_
+#define _UAPI_VC4_DRM_H_
+
+#include <drm/drm.h>
+
+#define DRM_VC4_CREATE_BO                         0x03
+#define DRM_VC4_MMAP_BO                           0x04
+
+#define DRM_IOCTL_VC4_CREATE_BO           DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_CREATE_BO, struct drm_vc4_create_bo)
+#define DRM_IOCTL_VC4_MMAP_BO             DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_MMAP_BO, struct drm_vc4_mmap_bo)
+
+/**
+ * struct drm_vc4_create_bo - ioctl argument for creating VC4 BOs.
+ *
+ * There are currently no values for the flags argument, but it may be
+ * used in a future extension.
+ */
+struct drm_vc4_create_bo {
+	uint32_t size;
+	uint32_t flags;
+	/** Returned GEM handle for the BO. */
+	uint32_t handle;
+	uint32_t pad;
+};
+
+/**
+ * struct drm_vc4_mmap_bo - ioctl argument for mapping VC4 BOs.
+ *
+ * This doesn't actually perform an mmap.  Instead, it returns the
+ * offset you need to use in an mmap on the DRM device node.  This
+ * means that tools like valgrind end up knowing about the mapped
+ * memory.
+ *
+ * There are currently no values for the flags argument, but it may be
+ * used in a future extension.
+ */
+struct drm_vc4_mmap_bo {
+	/** Handle for the object being mapped. */
+	uint32_t handle;
+	uint32_t flags;
+	/** offset into the drm node to use for subsequent mmap call. */
+	uint64_t offset;
+};
+
+#endif /* _UAPI_VC4_DRM_H_ */
-- 
2.6.2


* [PATCH 4/9] drm/vc4: Add an API for creating GPU shaders in GEM BOs.
  2015-12-01 20:35 ` Eric Anholt
@ 2015-12-01 20:35   ` Eric Anholt
  0 siblings, 0 replies; 53+ messages in thread
From: Eric Anholt @ 2015-12-01 20:35 UTC (permalink / raw)
  To: dri-devel; +Cc: linux-kernel, David Airlie, Eric Anholt

Since we have no MMU, the kernel needs to validate that the submitted
shader code won't make any accesses to memory that the user doesn't
control, which involves banning some operations (general purpose DMA
writes), and tracking where we need to write out pointers for other
operations (texture sampling).  Once it's validated, we return a GEM
BO containing the shader, which doesn't allow mapping for write or
exporting to other subsystems.
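
For context, userspace submits shader code through the new ioctl
roughly as follows (an illustrative snippet assuming libdrm's
drmIoctl() and a qpu_insns[] array of 64-bit QPU instructions; the
field names match what the ioctl handler in this patch reads):

	struct drm_vc4_create_shader_bo create = {
		.size = sizeof(qpu_insns),	/* must be a multiple of 8 */
		.flags = 0,
		.data = (uintptr_t)qpu_insns,
		.pad = 0,
	};

	/* The kernel copies the code, validates it, and on success
	 * returns a handle to a BO that can't be mapped writable or
	 * exported through PRIME.
	 */
	if (drmIoctl(fd, DRM_IOCTL_VC4_CREATE_SHADER_BO, &create) != 0)
		return -1;
	shader_handle = create.handle;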

Signed-off-by: Eric Anholt <eric@anholt.net>
---
 drivers/gpu/drm/vc4/Makefile               |   3 +-
 drivers/gpu/drm/vc4/vc4_bo.c               | 140 ++++++++
 drivers/gpu/drm/vc4/vc4_drv.c              |   9 +-
 drivers/gpu/drm/vc4/vc4_drv.h              |  50 +++
 drivers/gpu/drm/vc4/vc4_qpu_defines.h      | 264 +++++++++++++++
 drivers/gpu/drm/vc4/vc4_validate_shaders.c | 513 +++++++++++++++++++++++++++++
 include/uapi/drm/vc4_drm.h                 |  25 ++
 7 files changed, 999 insertions(+), 5 deletions(-)
 create mode 100644 drivers/gpu/drm/vc4/vc4_qpu_defines.h
 create mode 100644 drivers/gpu/drm/vc4/vc4_validate_shaders.c

diff --git a/drivers/gpu/drm/vc4/Makefile b/drivers/gpu/drm/vc4/Makefile
index 32b4f9c..eb776a6 100644
--- a/drivers/gpu/drm/vc4/Makefile
+++ b/drivers/gpu/drm/vc4/Makefile
@@ -10,7 +10,8 @@ vc4-y := \
 	vc4_kms.o \
 	vc4_hdmi.o \
 	vc4_hvs.o \
-	vc4_plane.o
+	vc4_plane.o \
+	vc4_validate_shaders.o
 
 vc4-$(CONFIG_DEBUG_FS) += vc4_debugfs.o
 
diff --git a/drivers/gpu/drm/vc4/vc4_bo.c b/drivers/gpu/drm/vc4/vc4_bo.c
index 06cba26..18dfe3e 100644
--- a/drivers/gpu/drm/vc4/vc4_bo.c
+++ b/drivers/gpu/drm/vc4/vc4_bo.c
@@ -79,6 +79,12 @@ static void vc4_bo_destroy(struct vc4_bo *bo)
 	struct drm_gem_object *obj = &bo->base.base;
 	struct vc4_dev *vc4 = to_vc4_dev(obj->dev);
 
+	if (bo->validated_shader) {
+		kfree(bo->validated_shader->texture_samples);
+		kfree(bo->validated_shader);
+		bo->validated_shader = NULL;
+	}
+
 	vc4->bo_stats.num_allocated--;
 	vc4->bo_stats.size_allocated -= obj->size;
 	drm_gem_cma_free_object(obj);
@@ -315,6 +321,12 @@ void vc4_free_object(struct drm_gem_object *gem_bo)
 		goto out;
 	}
 
+	if (bo->validated_shader) {
+		kfree(bo->validated_shader->texture_samples);
+		kfree(bo->validated_shader);
+		bo->validated_shader = NULL;
+	}
+
 	bo->free_time = jiffies;
 	list_add(&bo->size_head, cache_list);
 	list_add(&bo->unref_head, &vc4->bo_cache.time_list);
@@ -347,6 +359,78 @@ static void vc4_bo_cache_time_timer(unsigned long data)
 	schedule_work(&vc4->bo_cache.time_work);
 }
 
+struct dma_buf *
+vc4_prime_export(struct drm_device *dev, struct drm_gem_object *obj, int flags)
+{
+	struct vc4_bo *bo = to_vc4_bo(obj);
+
+	if (bo->validated_shader) {
+		DRM_ERROR("Attempting to export shader BO\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	return drm_gem_prime_export(dev, obj, flags);
+}
+
+int vc4_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	struct drm_gem_object *gem_obj;
+	struct vc4_bo *bo;
+	int ret;
+
+	ret = drm_gem_mmap(filp, vma);
+	if (ret)
+		return ret;
+
+	gem_obj = vma->vm_private_data;
+	bo = to_vc4_bo(gem_obj);
+
+	if (bo->validated_shader && (vma->vm_flags & VM_WRITE)) {
+		DRM_ERROR("mmapping of shader BOs for writing not allowed.\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * Clear the VM_PFNMAP flag that was set by drm_gem_mmap(), and set the
+	 * vm_pgoff (used as a fake buffer offset by DRM) to 0 as we want to map
+	 * the whole buffer.
+	 */
+	vma->vm_flags &= ~VM_PFNMAP;
+	vma->vm_pgoff = 0;
+
+	ret = dma_mmap_writecombine(bo->base.base.dev->dev, vma,
+				    bo->base.vaddr, bo->base.paddr,
+				    vma->vm_end - vma->vm_start);
+	if (ret)
+		drm_gem_vm_close(vma);
+
+	return ret;
+}
+
+int vc4_prime_mmap(struct drm_gem_object *obj, struct vm_area_struct *vma)
+{
+	struct vc4_bo *bo = to_vc4_bo(obj);
+
+	if (bo->validated_shader && (vma->vm_flags & VM_WRITE)) {
+		DRM_ERROR("mmapping of shader BOs for writing not allowed.\n");
+		return -EINVAL;
+	}
+
+	return drm_gem_cma_prime_mmap(obj, vma);
+}
+
+void *vc4_prime_vmap(struct drm_gem_object *obj)
+{
+	struct vc4_bo *bo = to_vc4_bo(obj);
+
+	if (bo->validated_shader) {
+		DRM_ERROR("mmapping of shader BOs not allowed.\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	return drm_gem_cma_prime_vmap(obj);
+}
+
 int vc4_create_bo_ioctl(struct drm_device *dev, void *data,
 			struct drm_file *file_priv)
 {
@@ -387,6 +471,62 @@ int vc4_mmap_bo_ioctl(struct drm_device *dev, void *data,
 	return 0;
 }
 
+int
+vc4_create_shader_bo_ioctl(struct drm_device *dev, void *data,
+			   struct drm_file *file_priv)
+{
+	struct drm_vc4_create_shader_bo *args = data;
+	struct vc4_bo *bo = NULL;
+	int ret;
+
+	if (args->size == 0)
+		return -EINVAL;
+
+	if (args->size % sizeof(u64) != 0)
+		return -EINVAL;
+
+	if (args->flags != 0) {
+		DRM_INFO("Unknown flags set: 0x%08x\n", args->flags);
+		return -EINVAL;
+	}
+
+	if (args->pad != 0) {
+		DRM_INFO("Pad set: 0x%08x\n", args->pad);
+		return -EINVAL;
+	}
+
+	bo = vc4_bo_create(dev, args->size, true);
+	if (!bo)
+		return -ENOMEM;
+
+	ret = copy_from_user(bo->base.vaddr,
+			     (void __user *)(uintptr_t)args->data,
+			     args->size);
+	if (ret != 0)
+		goto fail;
+	/* Clear the rest of the memory from allocating from the BO
+	 * cache.
+	 */
+	memset(bo->base.vaddr + args->size, 0,
+	       bo->base.base.size - args->size);
+
+	bo->validated_shader = vc4_validate_shader(&bo->base);
+	if (!bo->validated_shader) {
+		ret = -EINVAL;
+		goto fail;
+	}
+
+	/* We have to create the handle after validation, to avoid
+	 * races with users doing things like mmapping the shader BO.
+	 */
+	ret = drm_gem_handle_create(file_priv, &bo->base.base, &args->handle);
+
+ fail:
+	drm_gem_object_unreference_unlocked(&bo->base.base);
+
+	return ret;
+}
+
 void vc4_bo_cache_init(struct drm_device *dev)
 {
 	struct vc4_dev *vc4 = to_vc4_dev(dev);
diff --git a/drivers/gpu/drm/vc4/vc4_drv.c b/drivers/gpu/drm/vc4/vc4_drv.c
index 5fa4688..da4be9c 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.c
+++ b/drivers/gpu/drm/vc4/vc4_drv.c
@@ -64,7 +64,7 @@ static const struct file_operations vc4_drm_fops = {
 	.open = drm_open,
 	.release = drm_release,
 	.unlocked_ioctl = drm_ioctl,
-	.mmap = drm_gem_cma_mmap,
+	.mmap = vc4_mmap,
 	.poll = drm_poll,
 	.read = drm_read,
 #ifdef CONFIG_COMPAT
@@ -76,6 +76,7 @@ static const struct file_operations vc4_drm_fops = {
 static const struct drm_ioctl_desc vc4_drm_ioctls[] = {
 	DRM_IOCTL_DEF_DRV(VC4_CREATE_BO, vc4_create_bo_ioctl, 0),
 	DRM_IOCTL_DEF_DRV(VC4_MMAP_BO, vc4_mmap_bo_ioctl, 0),
+	DRM_IOCTL_DEF_DRV(VC4_CREATE_SHADER_BO, vc4_create_shader_bo_ioctl, 0),
 };
 
 static struct drm_driver vc4_drm_driver = {
@@ -102,12 +103,12 @@ static struct drm_driver vc4_drm_driver = {
 	.prime_handle_to_fd = drm_gem_prime_handle_to_fd,
 	.prime_fd_to_handle = drm_gem_prime_fd_to_handle,
 	.gem_prime_import = drm_gem_prime_import,
-	.gem_prime_export = drm_gem_prime_export,
+	.gem_prime_export = vc4_prime_export,
 	.gem_prime_get_sg_table	= drm_gem_cma_prime_get_sg_table,
 	.gem_prime_import_sg_table = drm_gem_cma_prime_import_sg_table,
-	.gem_prime_vmap = drm_gem_cma_prime_vmap,
+	.gem_prime_vmap = vc4_prime_vmap,
 	.gem_prime_vunmap = drm_gem_cma_prime_vunmap,
-	.gem_prime_mmap = drm_gem_cma_prime_mmap,
+	.gem_prime_mmap = vc4_prime_mmap,
 
 	.dumb_create = vc4_dumb_create,
 	.dumb_map_offset = drm_gem_cma_dumb_map_offset,
diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h
index fddb0a0..bd77d55 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.h
+++ b/drivers/gpu/drm/vc4/vc4_drv.h
@@ -69,6 +69,11 @@ struct vc4_bo {
 
 	/* List entry for the BO's position in vc4_dev->bo_cache.size_list */
 	struct list_head size_head;
+
+	/* Struct for shader validation state, if created by
+	 * DRM_IOCTL_VC4_CREATE_SHADER_BO.
+	 */
+	struct vc4_validated_shader_info *validated_shader;
 };
 
 static inline struct vc4_bo *
@@ -118,6 +123,42 @@ to_vc4_encoder(struct drm_encoder *encoder)
 #define HVS_WRITE(offset, val) writel(val, vc4->hvs->regs + offset)
 
 /**
+ * struct vc4_texture_sample_info - saves the offsets into the UBO for texture
+ * setup parameters.
+ *
+ * This will be used at draw time to relocate the reference to the texture
+ * contents in p0, and validate that the offset combined with
+ * width/height/stride/etc. from p1 and p2/p3 doesn't sample outside the BO.
+ * Note that the hardware treats unprovided config parameters as 0, so not all
+ * of them need to be set up for every texture sample, and we'll store ~0 as
+ * the offset to mark the unused ones.
+ *
+ * See the VC4 3D architecture guide page 41 ("Texture and Memory Lookup Unit
+ * Setup") for definitions of the texture parameters.
+ */
+struct vc4_texture_sample_info {
+	bool is_direct;
+	uint32_t p_offset[4];
+};
+
+/**
+ * struct vc4_validated_shader_info - information about validated shaders that
+ * needs to be used from command list validation.
+ *
+ * For a given shader, each time a shader state record references it, we need
+ * to verify that the shader doesn't read more uniforms than the shader state
+ * record's uniform BO pointer can provide, and we need to apply relocations
+ * and validate the shader state record's uniforms that define the texture
+ * samples.
+ */
+struct vc4_validated_shader_info {
+	uint32_t uniforms_size;
+	uint32_t uniforms_src_size;
+	uint32_t num_texture_samples;
+	struct vc4_texture_sample_info *texture_samples;
+};
+
+/**
  * _wait_for - magic (register) wait macro
  *
  * Does the right thing for modeset paths when run under kgdb or similar atomic
@@ -157,8 +198,13 @@ struct dma_buf *vc4_prime_export(struct drm_device *dev,
 				 struct drm_gem_object *obj, int flags);
 int vc4_create_bo_ioctl(struct drm_device *dev, void *data,
 			struct drm_file *file_priv);
+int vc4_create_shader_bo_ioctl(struct drm_device *dev, void *data,
+			       struct drm_file *file_priv);
 int vc4_mmap_bo_ioctl(struct drm_device *dev, void *data,
 		      struct drm_file *file_priv);
+int vc4_mmap(struct file *filp, struct vm_area_struct *vma);
+int vc4_prime_mmap(struct drm_gem_object *obj, struct vm_area_struct *vma);
+void *vc4_prime_vmap(struct drm_gem_object *obj);
 void vc4_bo_cache_init(struct drm_device *dev);
 void vc4_bo_cache_destroy(struct drm_device *dev);
 int vc4_bo_stats_debugfs(struct seq_file *m, void *arg);
@@ -194,3 +240,7 @@ struct drm_plane *vc4_plane_init(struct drm_device *dev,
 				 enum drm_plane_type type);
 u32 vc4_plane_write_dlist(struct drm_plane *plane, u32 __iomem *dlist);
 u32 vc4_plane_dlist_size(struct drm_plane_state *state);
+
+/* vc4_validate_shader.c */
+struct vc4_validated_shader_info *
+vc4_validate_shader(struct drm_gem_cma_object *shader_obj);
diff --git a/drivers/gpu/drm/vc4/vc4_qpu_defines.h b/drivers/gpu/drm/vc4/vc4_qpu_defines.h
new file mode 100644
index 0000000..d5c2f3c
--- /dev/null
+++ b/drivers/gpu/drm/vc4/vc4_qpu_defines.h
@@ -0,0 +1,264 @@
+/*
+ * Copyright © 2014 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef VC4_QPU_DEFINES_H
+#define VC4_QPU_DEFINES_H
+
+enum qpu_op_add {
+	QPU_A_NOP,
+	QPU_A_FADD,
+	QPU_A_FSUB,
+	QPU_A_FMIN,
+	QPU_A_FMAX,
+	QPU_A_FMINABS,
+	QPU_A_FMAXABS,
+	QPU_A_FTOI,
+	QPU_A_ITOF,
+	QPU_A_ADD = 12,
+	QPU_A_SUB,
+	QPU_A_SHR,
+	QPU_A_ASR,
+	QPU_A_ROR,
+	QPU_A_SHL,
+	QPU_A_MIN,
+	QPU_A_MAX,
+	QPU_A_AND,
+	QPU_A_OR,
+	QPU_A_XOR,
+	QPU_A_NOT,
+	QPU_A_CLZ,
+	QPU_A_V8ADDS = 30,
+	QPU_A_V8SUBS = 31,
+};
+
+enum qpu_op_mul {
+	QPU_M_NOP,
+	QPU_M_FMUL,
+	QPU_M_MUL24,
+	QPU_M_V8MULD,
+	QPU_M_V8MIN,
+	QPU_M_V8MAX,
+	QPU_M_V8ADDS,
+	QPU_M_V8SUBS,
+};
+
+enum qpu_raddr {
+	QPU_R_FRAG_PAYLOAD_ZW = 15, /* W for A file, Z for B file */
+	/* 0-31 are the plain regfile a or b fields */
+	QPU_R_UNIF = 32,
+	QPU_R_VARY = 35,
+	QPU_R_ELEM_QPU = 38,
+	QPU_R_NOP,
+	QPU_R_XY_PIXEL_COORD = 41,
+	QPU_R_MS_REV_FLAGS = 41,
+	QPU_R_VPM = 48,
+	QPU_R_VPM_LD_BUSY,
+	QPU_R_VPM_LD_WAIT,
+	QPU_R_MUTEX_ACQUIRE,
+};
+
+enum qpu_waddr {
+	/* 0-31 are the plain regfile a or b fields */
+	QPU_W_ACC0 = 32, /* aka r0 */
+	QPU_W_ACC1,
+	QPU_W_ACC2,
+	QPU_W_ACC3,
+	QPU_W_TMU_NOSWAP,
+	QPU_W_ACC5,
+	QPU_W_HOST_INT,
+	QPU_W_NOP,
+	QPU_W_UNIFORMS_ADDRESS,
+	QPU_W_QUAD_XY, /* X for regfile a, Y for regfile b */
+	QPU_W_MS_FLAGS = 42,
+	QPU_W_REV_FLAG = 42,
+	QPU_W_TLB_STENCIL_SETUP = 43,
+	QPU_W_TLB_Z,
+	QPU_W_TLB_COLOR_MS,
+	QPU_W_TLB_COLOR_ALL,
+	QPU_W_TLB_ALPHA_MASK,
+	QPU_W_VPM,
+	QPU_W_VPMVCD_SETUP, /* LD for regfile a, ST for regfile b */
+	QPU_W_VPM_ADDR, /* LD for regfile a, ST for regfile b */
+	QPU_W_MUTEX_RELEASE,
+	QPU_W_SFU_RECIP,
+	QPU_W_SFU_RECIPSQRT,
+	QPU_W_SFU_EXP,
+	QPU_W_SFU_LOG,
+	QPU_W_TMU0_S,
+	QPU_W_TMU0_T,
+	QPU_W_TMU0_R,
+	QPU_W_TMU0_B,
+	QPU_W_TMU1_S,
+	QPU_W_TMU1_T,
+	QPU_W_TMU1_R,
+	QPU_W_TMU1_B,
+};
+
+enum qpu_sig_bits {
+	QPU_SIG_SW_BREAKPOINT,
+	QPU_SIG_NONE,
+	QPU_SIG_THREAD_SWITCH,
+	QPU_SIG_PROG_END,
+	QPU_SIG_WAIT_FOR_SCOREBOARD,
+	QPU_SIG_SCOREBOARD_UNLOCK,
+	QPU_SIG_LAST_THREAD_SWITCH,
+	QPU_SIG_COVERAGE_LOAD,
+	QPU_SIG_COLOR_LOAD,
+	QPU_SIG_COLOR_LOAD_END,
+	QPU_SIG_LOAD_TMU0,
+	QPU_SIG_LOAD_TMU1,
+	QPU_SIG_ALPHA_MASK_LOAD,
+	QPU_SIG_SMALL_IMM,
+	QPU_SIG_LOAD_IMM,
+	QPU_SIG_BRANCH
+};
+
+enum qpu_mux {
+	/* hardware mux values */
+	QPU_MUX_R0,
+	QPU_MUX_R1,
+	QPU_MUX_R2,
+	QPU_MUX_R3,
+	QPU_MUX_R4,
+	QPU_MUX_R5,
+	QPU_MUX_A,
+	QPU_MUX_B,
+
+	/* non-hardware mux values */
+	QPU_MUX_IMM,
+};
+
+enum qpu_cond {
+	QPU_COND_NEVER,
+	QPU_COND_ALWAYS,
+	QPU_COND_ZS,
+	QPU_COND_ZC,
+	QPU_COND_NS,
+	QPU_COND_NC,
+	QPU_COND_CS,
+	QPU_COND_CC,
+};
+
+enum qpu_pack_mul {
+	QPU_PACK_MUL_NOP,
+	/* replicated to each 8 bits of the 32-bit dst. */
+	QPU_PACK_MUL_8888 = 3,
+	QPU_PACK_MUL_8A,
+	QPU_PACK_MUL_8B,
+	QPU_PACK_MUL_8C,
+	QPU_PACK_MUL_8D,
+};
+
+enum qpu_pack_a {
+	QPU_PACK_A_NOP,
+	/* convert to 16 bit float if float input, or to int16. */
+	QPU_PACK_A_16A,
+	QPU_PACK_A_16B,
+	/* replicated to each 8 bits of the 32-bit dst. */
+	QPU_PACK_A_8888,
+	/* Convert to 8-bit unsigned int. */
+	QPU_PACK_A_8A,
+	QPU_PACK_A_8B,
+	QPU_PACK_A_8C,
+	QPU_PACK_A_8D,
+
+	/* Saturating variants of the previous instructions. */
+	QPU_PACK_A_32_SAT, /* int-only */
+	QPU_PACK_A_16A_SAT, /* int or float */
+	QPU_PACK_A_16B_SAT,
+	QPU_PACK_A_8888_SAT,
+	QPU_PACK_A_8A_SAT,
+	QPU_PACK_A_8B_SAT,
+	QPU_PACK_A_8C_SAT,
+	QPU_PACK_A_8D_SAT,
+};
+
+enum qpu_unpack_r4 {
+	QPU_UNPACK_R4_NOP,
+	QPU_UNPACK_R4_F16A_TO_F32,
+	QPU_UNPACK_R4_F16B_TO_F32,
+	QPU_UNPACK_R4_8D_REP,
+	QPU_UNPACK_R4_8A,
+	QPU_UNPACK_R4_8B,
+	QPU_UNPACK_R4_8C,
+	QPU_UNPACK_R4_8D,
+};
+
+#define QPU_MASK(high, low) \
+	((((uint64_t)1 << ((high) - (low) + 1)) - 1) << (low))
+
+#define QPU_GET_FIELD(word, field) \
+	((uint32_t)(((word)  & field ## _MASK) >> field ## _SHIFT))
+
+#define QPU_SIG_SHIFT                   60
+#define QPU_SIG_MASK                    QPU_MASK(63, 60)
+
+#define QPU_UNPACK_SHIFT                57
+#define QPU_UNPACK_MASK                 QPU_MASK(59, 57)
+
+/**
+ * If set, the pack field means PACK_MUL or R4 packing, instead of normal
+ * regfile a packing.
+ */
+#define QPU_PM                          ((uint64_t)1 << 56)
+
+#define QPU_PACK_SHIFT                  52
+#define QPU_PACK_MASK                   QPU_MASK(55, 52)
+
+#define QPU_COND_ADD_SHIFT              49
+#define QPU_COND_ADD_MASK               QPU_MASK(51, 49)
+#define QPU_COND_MUL_SHIFT              46
+#define QPU_COND_MUL_MASK               QPU_MASK(48, 46)
+
+#define QPU_SF                          ((uint64_t)1 << 45)
+
+#define QPU_WADDR_ADD_SHIFT             38
+#define QPU_WADDR_ADD_MASK              QPU_MASK(43, 38)
+#define QPU_WADDR_MUL_SHIFT             32
+#define QPU_WADDR_MUL_MASK              QPU_MASK(37, 32)
+
+#define QPU_OP_MUL_SHIFT                29
+#define QPU_OP_MUL_MASK                 QPU_MASK(31, 29)
+
+#define QPU_RADDR_A_SHIFT               18
+#define QPU_RADDR_A_MASK                QPU_MASK(23, 18)
+#define QPU_RADDR_B_SHIFT               12
+#define QPU_RADDR_B_MASK                QPU_MASK(17, 12)
+#define QPU_SMALL_IMM_SHIFT             12
+#define QPU_SMALL_IMM_MASK              QPU_MASK(17, 12)
+
+#define QPU_ADD_A_SHIFT                 9
+#define QPU_ADD_A_MASK                  QPU_MASK(11, 9)
+#define QPU_ADD_B_SHIFT                 6
+#define QPU_ADD_B_MASK                  QPU_MASK(8, 6)
+#define QPU_MUL_A_SHIFT                 3
+#define QPU_MUL_A_MASK                  QPU_MASK(5, 3)
+#define QPU_MUL_B_SHIFT                 0
+#define QPU_MUL_B_MASK                  QPU_MASK(2, 0)
+
+#define QPU_WS                          ((uint64_t)1 << 44)
+
+#define QPU_OP_ADD_SHIFT                24
+#define QPU_OP_ADD_MASK                 QPU_MASK(28, 24)
+
+#endif /* VC4_QPU_DEFINES_H */
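
As a quick orientation on the accessors above: QPU_GET_FIELD() masks and
shifts a named field out of the 64-bit instruction word. A minimal sketch of
helpers built on it (the helper names are invented for illustration and are
not part of the patch):

	static inline bool qpu_inst_is_prog_end(uint64_t inst)
	{
		/* The 4-bit signaling field lives in bits 63:60. */
		return QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_PROG_END;
	}

	static inline uint32_t qpu_inst_waddr(uint64_t inst, bool is_mul)
	{
		/* The add and mul pipelines have separate write-address
		 * fields (bits 43:38 and 37:32, respectively).
		 */
		return is_mul ? QPU_GET_FIELD(inst, QPU_WADDR_MUL) :
				QPU_GET_FIELD(inst, QPU_WADDR_ADD);
	}
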
diff --git a/drivers/gpu/drm/vc4/vc4_validate_shaders.c b/drivers/gpu/drm/vc4/vc4_validate_shaders.c
new file mode 100644
index 0000000..f67124b
--- /dev/null
+++ b/drivers/gpu/drm/vc4/vc4_validate_shaders.c
@@ -0,0 +1,513 @@
+/*
+ * Copyright © 2014 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * DOC: Shader validator for VC4.
+ *
+ * The VC4 has no IOMMU between it and system memory, so a user with
+ * access to execute shaders could escalate privilege by overwriting
+ * system memory (using the VPM write address register in the
+ * general-purpose DMA mode) or reading system memory it shouldn't
+ * (reading it as a texture, or uniform data, or vertex data).
+ *
+ * This walks over a shader BO, ensuring that its accesses are
+ * appropriately bounded, and recording how many texture accesses are
+ * made and where so that we can do relocations for them in the
+ * uniform stream.
+ */
+
+#include "vc4_drv.h"
+#include "vc4_qpu_defines.h"
+
+struct vc4_shader_validation_state {
+	struct vc4_texture_sample_info tmu_setup[2];
+	int tmu_write_count[2];
+
+	/* For registers that were last written to by a MIN instruction with
+	 * one argument being a uniform, the address of the uniform.
+	 * Otherwise, ~0.
+	 *
+	 * This is used for the validation of direct address memory reads.
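+	 *
+	 * Indices 0-31 track regfile A, 32-63 track regfile B, and 64-67
+	 * track the accumulators r0-r3, matching waddr_to_live_reg_index().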
+	 */
+	uint32_t live_min_clamp_offsets[32 + 32 + 4];
+	bool live_max_clamp_regs[32 + 32 + 4];
+};
+
+static uint32_t
+waddr_to_live_reg_index(uint32_t waddr, bool is_b)
+{
+	if (waddr < 32) {
+		if (is_b)
+			return 32 + waddr;
+		else
+			return waddr;
+	} else if (waddr <= QPU_W_ACC3) {
+		return 64 + waddr - QPU_W_ACC0;
+	} else {
+		return ~0;
+	}
+}
+
+static uint32_t
+raddr_add_a_to_live_reg_index(uint64_t inst)
+{
+	uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
+	uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A);
+	uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
+	uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
+
+	if (add_a == QPU_MUX_A)
+		return raddr_a;
+	else if (add_a == QPU_MUX_B && sig != QPU_SIG_SMALL_IMM)
+		return 32 + raddr_b;
+	else if (add_a <= QPU_MUX_R3)
+		return 64 + add_a;
+	else
+		return ~0;
+}
+
+static bool
+is_tmu_submit(uint32_t waddr)
+{
+	return (waddr == QPU_W_TMU0_S ||
+		waddr == QPU_W_TMU1_S);
+}
+
+static bool
+is_tmu_write(uint32_t waddr)
+{
+	return (waddr >= QPU_W_TMU0_S &&
+		waddr <= QPU_W_TMU1_B);
+}
+
+static bool
+record_texture_sample(struct vc4_validated_shader_info *validated_shader,
+		      struct vc4_shader_validation_state *validation_state,
+		      int tmu)
+{
+	uint32_t s = validated_shader->num_texture_samples;
+	int i;
+	struct vc4_texture_sample_info *temp_samples;
+
+	temp_samples = krealloc(validated_shader->texture_samples,
+				(s + 1) * sizeof(*temp_samples),
+				GFP_KERNEL);
+	if (!temp_samples)
+		return false;
+
+	memcpy(&temp_samples[s],
+	       &validation_state->tmu_setup[tmu],
+	       sizeof(*temp_samples));
+
+	validated_shader->num_texture_samples = s + 1;
+	validated_shader->texture_samples = temp_samples;
+
+	for (i = 0; i < 4; i++)
+		validation_state->tmu_setup[tmu].p_offset[i] = ~0;
+
+	return true;
+}
+
+static bool
+check_tmu_write(uint64_t inst,
+		struct vc4_validated_shader_info *validated_shader,
+		struct vc4_shader_validation_state *validation_state,
+		bool is_mul)
+{
+	uint32_t waddr = (is_mul ?
+			  QPU_GET_FIELD(inst, QPU_WADDR_MUL) :
+			  QPU_GET_FIELD(inst, QPU_WADDR_ADD));
+	uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
+	uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
+	int tmu = waddr > QPU_W_TMU0_B;
+	bool submit = is_tmu_submit(waddr);
+	bool is_direct = submit && validation_state->tmu_write_count[tmu] == 0;
+	uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
+
+	if (is_direct) {
+		uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
+		uint32_t clamp_reg, clamp_offset;
+
+		if (sig == QPU_SIG_SMALL_IMM) {
+			DRM_ERROR("direct TMU read used small immediate\n");
+			return false;
+		}
+
+		/* Make sure that this texture load is an add of the base
+		 * address of the UBO to a clamped offset within the UBO.
+		 */
+		if (is_mul ||
+		    QPU_GET_FIELD(inst, QPU_OP_ADD) != QPU_A_ADD) {
+			DRM_ERROR("direct TMU load wasn't an add\n");
+			return false;
+		}
+
+		/* We assert that the clamped address is the first
+		 * argument, and the UBO base address is the second argument.
+		 * This is arbitrary, but simpler than supporting flipping the
+		 * two either way.
+		 */
+		clamp_reg = raddr_add_a_to_live_reg_index(inst);
+		if (clamp_reg == ~0) {
+			DRM_ERROR("direct TMU load wasn't clamped\n");
+			return false;
+		}
+
+		clamp_offset = validation_state->live_min_clamp_offsets[clamp_reg];
+		if (clamp_offset == ~0) {
+			DRM_ERROR("direct TMU load wasn't clamped\n");
+			return false;
+		}
+
+		/* Store the clamp value's offset in p1 (see reloc_tex() in
+		 * vc4_validate.c).
+		 */
+		validation_state->tmu_setup[tmu].p_offset[1] =
+			clamp_offset;
+
+		if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) &&
+		    !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF)) {
+			DRM_ERROR("direct TMU load didn't add to a uniform\n");
+			return false;
+		}
+
+		validation_state->tmu_setup[tmu].is_direct = true;
+	} else {
+		if (raddr_a == QPU_R_UNIF || (sig != QPU_SIG_SMALL_IMM &&
+					      raddr_b == QPU_R_UNIF)) {
+			DRM_ERROR("uniform read in the same instruction as "
+				  "texture setup.\n");
+			return false;
+		}
+	}
+
+	if (validation_state->tmu_write_count[tmu] >= 4) {
+		DRM_ERROR("TMU%d got too many parameters before dispatch\n",
+			  tmu);
+		return false;
+	}
+	validation_state->tmu_setup[tmu].p_offset[validation_state->tmu_write_count[tmu]] =
+		validated_shader->uniforms_size;
+	validation_state->tmu_write_count[tmu]++;
+	/* Since direct uses a RADDR uniform reference, it will get counted in
+	 * check_instruction_reads()
+	 */
+	if (!is_direct)
+		validated_shader->uniforms_size += 4;
+
+	if (submit) {
+		if (!record_texture_sample(validated_shader,
+					   validation_state, tmu)) {
+			return false;
+		}
+
+		validation_state->tmu_write_count[tmu] = 0;
+	}
+
+	return true;
+}
+
+static bool
+check_reg_write(uint64_t inst,
+		struct vc4_validated_shader_info *validated_shader,
+		struct vc4_shader_validation_state *validation_state,
+		bool is_mul)
+{
+	uint32_t waddr = (is_mul ?
+			  QPU_GET_FIELD(inst, QPU_WADDR_MUL) :
+			  QPU_GET_FIELD(inst, QPU_WADDR_ADD));
+
+	switch (waddr) {
+	case QPU_W_UNIFORMS_ADDRESS:
+		/* XXX: We'll probably need to support this for reladdr, but
+		 * it's definitely a security-related one.
+		 */
+		DRM_ERROR("uniforms address load unsupported\n");
+		return false;
+
+	case QPU_W_TLB_COLOR_MS:
+	case QPU_W_TLB_COLOR_ALL:
+	case QPU_W_TLB_Z:
+		/* These only interact with the tile buffer, not main memory,
+		 * so they're safe.
+		 */
+		return true;
+
+	case QPU_W_TMU0_S:
+	case QPU_W_TMU0_T:
+	case QPU_W_TMU0_R:
+	case QPU_W_TMU0_B:
+	case QPU_W_TMU1_S:
+	case QPU_W_TMU1_T:
+	case QPU_W_TMU1_R:
+	case QPU_W_TMU1_B:
+		return check_tmu_write(inst, validated_shader, validation_state,
+				       is_mul);
+
+	case QPU_W_HOST_INT:
+	case QPU_W_TMU_NOSWAP:
+	case QPU_W_TLB_ALPHA_MASK:
+	case QPU_W_MUTEX_RELEASE:
+		/* XXX: I haven't thought about these, so don't support them
+		 * for now.
+		 */
+		DRM_ERROR("Unsupported waddr %d\n", waddr);
+		return false;
+
+	case QPU_W_VPM_ADDR:
+		DRM_ERROR("General VPM DMA unsupported\n");
+		return false;
+
+	case QPU_W_VPM:
+	case QPU_W_VPMVCD_SETUP:
+		/* We allow VPM setup in general, even including VPM DMA
+		 * configuration setup, because the (unsafe) DMA can only be
+		 * triggered by QPU_W_VPM_ADDR writes.
+		 */
+		return true;
+
+	case QPU_W_TLB_STENCIL_SETUP:
+		return true;
+	}
+
+	return true;
+}
+
+static void
+track_live_clamps(uint64_t inst,
+		  struct vc4_validated_shader_info *validated_shader,
+		  struct vc4_shader_validation_state *validation_state)
+{
+	uint32_t op_add = QPU_GET_FIELD(inst, QPU_OP_ADD);
+	uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
+	uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
+	uint32_t cond_add = QPU_GET_FIELD(inst, QPU_COND_ADD);
+	uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A);
+	uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
+	uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
+	uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
+	uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
+	bool ws = inst & QPU_WS;
+	uint32_t lri_add_a, lri_add, lri_mul;
+	bool add_a_is_min_0;
+
+	/* Check whether OP_ADD's A argument comes from a live MAX(x, 0),
+	 * before we clear previous live state.
+	 */
+	lri_add_a = raddr_add_a_to_live_reg_index(inst);
+	add_a_is_min_0 = (lri_add_a != ~0 &&
+			  validation_state->live_max_clamp_regs[lri_add_a]);
+
+	/* Clear live state for registers written by our instruction. */
+	lri_add = waddr_to_live_reg_index(waddr_add, ws);
+	lri_mul = waddr_to_live_reg_index(waddr_mul, !ws);
+	if (lri_mul != ~0) {
+		validation_state->live_max_clamp_regs[lri_mul] = false;
+		validation_state->live_min_clamp_offsets[lri_mul] = ~0;
+	}
+	if (lri_add != ~0) {
+		validation_state->live_max_clamp_regs[lri_add] = false;
+		validation_state->live_min_clamp_offsets[lri_add] = ~0;
+	} else {
+		/* Nothing further to do for live tracking, since only ADDs
+		 * generate new live clamp registers.
+		 */
+		return;
+	}
+
+	/* Now, handle remaining live clamp tracking for the ADD operation. */
+
+	if (cond_add != QPU_COND_ALWAYS)
+		return;
+
+	if (op_add == QPU_A_MAX) {
+		/* Track live clamps of a value to a minimum of 0 (in either
+		 * arg).
+		 */
+		if (sig != QPU_SIG_SMALL_IMM || raddr_b != 0 ||
+		    (add_a != QPU_MUX_B && add_b != QPU_MUX_B)) {
+			return;
+		}
+
+		validation_state->live_max_clamp_regs[lri_add] = true;
+	} else if (op_add == QPU_A_MIN) {
+		/* Track live clamps of a value clamped to a minimum of 0 and
+		 * a maximum of some uniform's offset.
+		 */
+		if (!add_a_is_min_0)
+			return;
+
+		if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) &&
+		    !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF &&
+		      sig != QPU_SIG_SMALL_IMM)) {
+			return;
+		}
+
+		validation_state->live_min_clamp_offsets[lri_add] =
+			validated_shader->uniforms_size;
+	}
+}
+
+static bool
+check_instruction_writes(uint64_t inst,
+			 struct vc4_validated_shader_info *validated_shader,
+			 struct vc4_shader_validation_state *validation_state)
+{
+	uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
+	uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
+	bool ok;
+
+	if (is_tmu_write(waddr_add) && is_tmu_write(waddr_mul)) {
+		DRM_ERROR("ADD and MUL both set up textures\n");
+		return false;
+	}
+
+	ok = (check_reg_write(inst, validated_shader, validation_state,
+			      false) &&
+	      check_reg_write(inst, validated_shader, validation_state,
+			      true));
+
+	track_live_clamps(inst, validated_shader, validation_state);
+
+	return ok;
+}
+
+static bool
+check_instruction_reads(uint64_t inst,
+			struct vc4_validated_shader_info *validated_shader)
+{
+	uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
+	uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
+	uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
+
+	if (raddr_a == QPU_R_UNIF ||
+	    (raddr_b == QPU_R_UNIF && sig != QPU_SIG_SMALL_IMM)) {
+		/* This can't overflow the uint32_t, because every 4 bytes
+		 * we add here corresponds to 8 bytes of instruction read,
+		 * so we'd run out of shader BO space first.
+		 */
+		validated_shader->uniforms_size += 4;
+	}
+
+	return true;
+}
+
+struct vc4_validated_shader_info *
+vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
+{
+	bool found_shader_end = false;
+	int shader_end_ip = 0;
+	uint32_t ip, max_ip;
+	uint64_t *shader;
+	struct vc4_validated_shader_info *validated_shader;
+	struct vc4_shader_validation_state validation_state;
+	int i;
+
+	memset(&validation_state, 0, sizeof(validation_state));
+
+	for (i = 0; i < 8; i++)
+		validation_state.tmu_setup[i / 4].p_offset[i % 4] = ~0;
+	for (i = 0; i < ARRAY_SIZE(validation_state.live_min_clamp_offsets); i++)
+		validation_state.live_min_clamp_offsets[i] = ~0;
+
+	shader = shader_obj->vaddr;
+	max_ip = shader_obj->base.size / sizeof(uint64_t);
+
+	validated_shader = kcalloc(1, sizeof(*validated_shader), GFP_KERNEL);
+	if (!validated_shader)
+		return NULL;
+
+	for (ip = 0; ip < max_ip; ip++) {
+		uint64_t inst = shader[ip];
+		uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
+
+		switch (sig) {
+		case QPU_SIG_NONE:
+		case QPU_SIG_WAIT_FOR_SCOREBOARD:
+		case QPU_SIG_SCOREBOARD_UNLOCK:
+		case QPU_SIG_COLOR_LOAD:
+		case QPU_SIG_LOAD_TMU0:
+		case QPU_SIG_LOAD_TMU1:
+		case QPU_SIG_PROG_END:
+		case QPU_SIG_SMALL_IMM:
+			if (!check_instruction_writes(inst, validated_shader,
+						      &validation_state)) {
+				DRM_ERROR("Bad write at ip %d\n", ip);
+				goto fail;
+			}
+
+			if (!check_instruction_reads(inst, validated_shader))
+				goto fail;
+
+			if (sig == QPU_SIG_PROG_END) {
+				found_shader_end = true;
+				shader_end_ip = ip;
+			}
+
+			break;
+
+		case QPU_SIG_LOAD_IMM:
+			if (!check_instruction_writes(inst, validated_shader,
+						      &validation_state)) {
+				DRM_ERROR("Bad LOAD_IMM write at ip %d\n", ip);
+				goto fail;
+			}
+			break;
+
+		default:
+			DRM_ERROR("Unsupported QPU signal %d at "
+				  "instruction %d\n", sig, ip);
+			goto fail;
+		}
+
+		/* There are two delay slots after program end is signaled
+		 * that are still executed, then we're finished.
+		 */
+		if (found_shader_end && ip == shader_end_ip + 2)
+			break;
+	}
+
+	if (ip == max_ip) {
+		DRM_ERROR("shader failed to terminate before "
+			  "shader BO end at %zd\n",
+			  shader_obj->base.size);
+		goto fail;
+	}
+
+	/* Again, no chance of integer overflow here because the worst case
+	 * scenario is 8 bytes of uniforms plus handles per 8-byte
+	 * instruction.
+	 */
+	validated_shader->uniforms_src_size =
+		(validated_shader->uniforms_size +
+		 4 * validated_shader->num_texture_samples);
+
+	return validated_shader;
+
+fail:
+	if (validated_shader) {
+		kfree(validated_shader->texture_samples);
+		kfree(validated_shader);
+	}
+	return NULL;
+}
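
To make the validator's output concrete, here is a rough sketch of how a
consumer (the command list validation that vc4_drv.h refers to) might walk
the returned metadata; the walker below is illustrative only, not part of
the patch:

	static void walk_texture_samples(const struct vc4_validated_shader_info *v)
	{
		uint32_t i, j;

		for (i = 0; i < v->num_texture_samples; i++) {
			const struct vc4_texture_sample_info *s =
				&v->texture_samples[i];

			for (j = 0; j < 4; j++) {
				/* ~0 marks a TMU parameter that was never
				 * written and so needs no relocation.
				 */
				if (s->p_offset[j] == ~0)
					continue;

				/* s->p_offset[j] is the offset into the
				 * uniform stream where parameter j was
				 * sourced; a real consumer would relocate
				 * or bounds-check it here.
				 */
			}
		}
	}
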
diff --git a/include/uapi/drm/vc4_drm.h b/include/uapi/drm/vc4_drm.h
index 068aab9..0716b50 100644
--- a/include/uapi/drm/vc4_drm.h
+++ b/include/uapi/drm/vc4_drm.h
@@ -28,9 +28,11 @@
 
 #define DRM_VC4_CREATE_BO                         0x03
 #define DRM_VC4_MMAP_BO                           0x04
+#define DRM_VC4_CREATE_SHADER_BO                  0x05
 
 #define DRM_IOCTL_VC4_CREATE_BO           DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_CREATE_BO, struct drm_vc4_create_bo)
 #define DRM_IOCTL_VC4_MMAP_BO             DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_MMAP_BO, struct drm_vc4_mmap_bo)
+#define DRM_IOCTL_VC4_CREATE_SHADER_BO    DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_CREATE_SHADER_BO, struct drm_vc4_create_shader_bo)
 
 /**
  * struct drm_vc4_create_bo - ioctl argument for creating VC4 BOs.
@@ -65,4 +67,27 @@ struct drm_vc4_mmap_bo {
 	uint64_t offset;
 };
 
+/**
+ * struct drm_vc4_create_shader_bo - ioctl argument for creating VC4
+ * shader BOs.
+ *
+ * Since allowing a shader to be overwritten while it's also being
+ * executed from would allow privilege escalation, shaders must be
+ * created using this ioctl, and they can't be mmapped later.
+ */
+struct drm_vc4_create_shader_bo {
+	/* Size of the data argument. */
+	uint32_t size;
+	/* Flags, currently must be 0. */
+	uint32_t flags;
+
+	/* Pointer to the data. */
+	uint64_t data;
+
+	/* Returned GEM handle for the BO. */
+	uint32_t handle;
+	/* Pad, must be 0. */
+	uint32_t pad;
+};
+
 #endif /* _UAPI_VC4_DRM_H_ */
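
For completeness, a minimal userspace sketch of driving the new ioctl; the
helper name is made up, error handling is elided, and an already-open DRM
fd is assumed:

	#include <stdint.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include "vc4_drm.h"

	static int vc4_create_shader(int fd, const uint64_t *code,
				     uint32_t size, uint32_t *handle)
	{
		struct drm_vc4_create_shader_bo args;

		memset(&args, 0, sizeof(args));
		args.size = size;            /* must be a multiple of 8 */
		args.data = (uintptr_t)code; /* QPU instructions to copy in */

		if (ioctl(fd, DRM_IOCTL_VC4_CREATE_SHADER_BO, &args) != 0)
			return -1;

		*handle = args.handle;       /* write-mmap and export are refused */
		return 0;
	}
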
-- 
2.6.2


^ permalink raw reply related	[flat|nested] 53+ messages in thread

* [PATCH 4/9] drm/vc4: Add an API for creating GPU shaders in GEM BOs.
@ 2015-12-01 20:35   ` Eric Anholt
  0 siblings, 0 replies; 53+ messages in thread
From: Eric Anholt @ 2015-12-01 20:35 UTC (permalink / raw)
  To: dri-devel; +Cc: linux-kernel

Since we have no MMU, the kernel needs to validate that the submitted
shader code won't make any accesses to memory that the user doesn't
control, which involves banning some operations (general purpose DMA
writes), and tracking where we need to write out pointers for other
operations (texture sampling).  Once it's validated, we return a GEM
BO containing the shader, which doesn't allow mapping for write or
exporting to other subsystems.

Signed-off-by: Eric Anholt <eric@anholt.net>
---
 drivers/gpu/drm/vc4/Makefile               |   3 +-
 drivers/gpu/drm/vc4/vc4_bo.c               | 140 ++++++++
 drivers/gpu/drm/vc4/vc4_drv.c              |   9 +-
 drivers/gpu/drm/vc4/vc4_drv.h              |  50 +++
 drivers/gpu/drm/vc4/vc4_qpu_defines.h      | 264 +++++++++++++++
 drivers/gpu/drm/vc4/vc4_validate_shaders.c | 513 +++++++++++++++++++++++++++++
 include/uapi/drm/vc4_drm.h                 |  25 ++
 7 files changed, 999 insertions(+), 5 deletions(-)
 create mode 100644 drivers/gpu/drm/vc4/vc4_qpu_defines.h
 create mode 100644 drivers/gpu/drm/vc4/vc4_validate_shaders.c

diff --git a/drivers/gpu/drm/vc4/Makefile b/drivers/gpu/drm/vc4/Makefile
index 32b4f9c..eb776a6 100644
--- a/drivers/gpu/drm/vc4/Makefile
+++ b/drivers/gpu/drm/vc4/Makefile
@@ -10,7 +10,8 @@ vc4-y := \
 	vc4_kms.o \
 	vc4_hdmi.o \
 	vc4_hvs.o \
-	vc4_plane.o
+	vc4_plane.o \
+	vc4_validate_shaders.o
 
 vc4-$(CONFIG_DEBUG_FS) += vc4_debugfs.o
 
diff --git a/drivers/gpu/drm/vc4/vc4_bo.c b/drivers/gpu/drm/vc4/vc4_bo.c
index 06cba26..18dfe3e 100644
--- a/drivers/gpu/drm/vc4/vc4_bo.c
+++ b/drivers/gpu/drm/vc4/vc4_bo.c
@@ -79,6 +79,12 @@ static void vc4_bo_destroy(struct vc4_bo *bo)
 	struct drm_gem_object *obj = &bo->base.base;
 	struct vc4_dev *vc4 = to_vc4_dev(obj->dev);
 
+	if (bo->validated_shader) {
+		kfree(bo->validated_shader->texture_samples);
+		kfree(bo->validated_shader);
+		bo->validated_shader = NULL;
+	}
+
 	vc4->bo_stats.num_allocated--;
 	vc4->bo_stats.size_allocated -= obj->size;
 	drm_gem_cma_free_object(obj);
@@ -315,6 +321,12 @@ void vc4_free_object(struct drm_gem_object *gem_bo)
 		goto out;
 	}
 
+	if (bo->validated_shader) {
+		kfree(bo->validated_shader->texture_samples);
+		kfree(bo->validated_shader);
+		bo->validated_shader = NULL;
+	}
+
 	bo->free_time = jiffies;
 	list_add(&bo->size_head, cache_list);
 	list_add(&bo->unref_head, &vc4->bo_cache.time_list);
@@ -347,6 +359,78 @@ static void vc4_bo_cache_time_timer(unsigned long data)
 	schedule_work(&vc4->bo_cache.time_work);
 }
 
+struct dma_buf *
+vc4_prime_export(struct drm_device *dev, struct drm_gem_object *obj, int flags)
+{
+	struct vc4_bo *bo = to_vc4_bo(obj);
+
+	if (bo->validated_shader) {
+		DRM_ERROR("Attempting to export shader BO\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	return drm_gem_prime_export(dev, obj, flags);
+}
+
+int vc4_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	struct drm_gem_object *gem_obj;
+	struct vc4_bo *bo;
+	int ret;
+
+	ret = drm_gem_mmap(filp, vma);
+	if (ret)
+		return ret;
+
+	gem_obj = vma->vm_private_data;
+	bo = to_vc4_bo(gem_obj);
+
+	if (bo->validated_shader && (vma->vm_flags & VM_WRITE)) {
+		DRM_ERROR("mmaping of shader BOs for writing not allowed.\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * Clear the VM_PFNMAP flag that was set by drm_gem_mmap(), and set the
+	 * vm_pgoff (used as a fake buffer offset by DRM) to 0 as we want to map
+	 * the whole buffer.
+	 */
+	vma->vm_flags &= ~VM_PFNMAP;
+	vma->vm_pgoff = 0;
+
+	ret = dma_mmap_writecombine(bo->base.base.dev->dev, vma,
+				    bo->base.vaddr, bo->base.paddr,
+				    vma->vm_end - vma->vm_start);
+	if (ret)
+		drm_gem_vm_close(vma);
+
+	return ret;
+}
+
+int vc4_prime_mmap(struct drm_gem_object *obj, struct vm_area_struct *vma)
+{
+	struct vc4_bo *bo = to_vc4_bo(obj);
+
+	if (bo->validated_shader && (vma->vm_flags & VM_WRITE)) {
+		DRM_ERROR("mmaping of shader BOs for writing not allowed.\n");
+		return -EINVAL;
+	}
+
+	return drm_gem_cma_prime_mmap(obj, vma);
+}
+
+void *vc4_prime_vmap(struct drm_gem_object *obj)
+{
+	struct vc4_bo *bo = to_vc4_bo(obj);
+
+	if (bo->validated_shader) {
+		DRM_ERROR("mmaping of shader BOs not allowed.\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	return drm_gem_cma_prime_vmap(obj);
+}
+
 int vc4_create_bo_ioctl(struct drm_device *dev, void *data,
 			struct drm_file *file_priv)
 {
@@ -387,6 +471,62 @@ int vc4_mmap_bo_ioctl(struct drm_device *dev, void *data,
 	return 0;
 }
 
+int
+vc4_create_shader_bo_ioctl(struct drm_device *dev, void *data,
+			   struct drm_file *file_priv)
+{
+	struct drm_vc4_create_shader_bo *args = data;
+	struct vc4_bo *bo = NULL;
+	int ret;
+
+	if (args->size == 0)
+		return -EINVAL;
+
+	if (args->size % sizeof(u64) != 0)
+		return -EINVAL;
+
+	if (args->flags != 0) {
+		DRM_INFO("Unknown flags set: 0x%08x\n", args->flags);
+		return -EINVAL;
+	}
+
+	if (args->pad != 0) {
+		DRM_INFO("Pad set: 0x%08x\n", args->pad);
+		return -EINVAL;
+	}
+
+	bo = vc4_bo_create(dev, args->size, true);
+	if (!bo)
+		return -ENOMEM;
+
+	if (copy_from_user(bo->base.vaddr,
+			   (void __user *)(uintptr_t)args->data,
+			   args->size)) {
+		ret = -EFAULT;
+		goto fail;
+	}
+	/* Clear the rest of the memory, since the BO may have been
+	 * allocated out of the BO cache and could contain stale data.
+	 */
+	memset(bo->base.vaddr + args->size, 0,
+	       bo->base.base.size - args->size);
+
+	bo->validated_shader = vc4_validate_shader(&bo->base);
+	if (!bo->validated_shader) {
+		ret = -EINVAL;
+		goto fail;
+	}
+
+	/* We have to create the handle after validation, to avoid
+	 * racing with users doing things like mmapping the shader BO.
+	 */
+	ret = drm_gem_handle_create(file_priv, &bo->base.base, &args->handle);
+
+ fail:
+	drm_gem_object_unreference_unlocked(&bo->base.base);
+
+	return ret;
+}
+
 void vc4_bo_cache_init(struct drm_device *dev)
 {
 	struct vc4_dev *vc4 = to_vc4_dev(dev);
diff --git a/drivers/gpu/drm/vc4/vc4_drv.c b/drivers/gpu/drm/vc4/vc4_drv.c
index 5fa4688..da4be9c 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.c
+++ b/drivers/gpu/drm/vc4/vc4_drv.c
@@ -64,7 +64,7 @@ static const struct file_operations vc4_drm_fops = {
 	.open = drm_open,
 	.release = drm_release,
 	.unlocked_ioctl = drm_ioctl,
-	.mmap = drm_gem_cma_mmap,
+	.mmap = vc4_mmap,
 	.poll = drm_poll,
 	.read = drm_read,
 #ifdef CONFIG_COMPAT
@@ -76,6 +76,7 @@ static const struct file_operations vc4_drm_fops = {
 static const struct drm_ioctl_desc vc4_drm_ioctls[] = {
 	DRM_IOCTL_DEF_DRV(VC4_CREATE_BO, vc4_create_bo_ioctl, 0),
 	DRM_IOCTL_DEF_DRV(VC4_MMAP_BO, vc4_mmap_bo_ioctl, 0),
+	DRM_IOCTL_DEF_DRV(VC4_CREATE_SHADER_BO, vc4_create_shader_bo_ioctl, 0),
 };
 
 static struct drm_driver vc4_drm_driver = {
@@ -102,12 +103,12 @@ static struct drm_driver vc4_drm_driver = {
 	.prime_handle_to_fd = drm_gem_prime_handle_to_fd,
 	.prime_fd_to_handle = drm_gem_prime_fd_to_handle,
 	.gem_prime_import = drm_gem_prime_import,
-	.gem_prime_export = drm_gem_prime_export,
+	.gem_prime_export = vc4_prime_export,
 	.gem_prime_get_sg_table	= drm_gem_cma_prime_get_sg_table,
 	.gem_prime_import_sg_table = drm_gem_cma_prime_import_sg_table,
-	.gem_prime_vmap = drm_gem_cma_prime_vmap,
+	.gem_prime_vmap = vc4_prime_vmap,
 	.gem_prime_vunmap = drm_gem_cma_prime_vunmap,
-	.gem_prime_mmap = drm_gem_cma_prime_mmap,
+	.gem_prime_mmap = vc4_prime_mmap,
 
 	.dumb_create = vc4_dumb_create,
 	.dumb_map_offset = drm_gem_cma_dumb_map_offset,
diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h
index fddb0a0..bd77d55 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.h
+++ b/drivers/gpu/drm/vc4/vc4_drv.h
@@ -69,6 +69,11 @@ struct vc4_bo {
 
 	/* List entry for the BO's position in vc4_dev->bo_cache.size_list */
 	struct list_head size_head;
+
+	/* Struct for shader validation state, if created by
+	 * DRM_IOCTL_VC4_CREATE_SHADER_BO.
+	 */
+	struct vc4_validated_shader_info *validated_shader;
 };
 
 static inline struct vc4_bo *
@@ -118,6 +123,42 @@ to_vc4_encoder(struct drm_encoder *encoder)
 #define HVS_WRITE(offset, val) writel(val, vc4->hvs->regs + offset)
 
 /**
+ * struct vc4_texture_sample_info - saves the offsets into the UBO for texture
+ * setup parameters.
+ *
+ * This will be used at draw time to relocate the reference to the texture
+ * contents in p0, and validate that the offset combined with
+ * width/height/stride/etc. from p1 and p2/p3 doesn't sample outside the BO.
+ * Note that the hardware treats unprovided config parameters as 0, so not all
+ * of them need to be set up for every texture sample, and we'll store ~0 as
+ * the offset to mark the unused ones.
+ *
+ * See the VC4 3D architecture guide page 41 ("Texture and Memory Lookup Unit
+ * Setup") for definitions of the texture parameters.
+ */
+struct vc4_texture_sample_info {
+	bool is_direct;
+	uint32_t p_offset[4];
+};
+
+/**
+ * struct vc4_validated_shader_info - information about validated shaders that
+ * needs to be used from command list validation.
+ *
+ * For a given shader, each time a shader state record references it, we need
+ * to verify that the shader doesn't read more uniforms than the shader state
+ * record's uniform BO pointer can provide, and we need to apply relocations
+ * and validate the shader state record's uniforms that define the texture
+ * samples.
+ */
+struct vc4_validated_shader_info {
+	uint32_t uniforms_size;
+	uint32_t uniforms_src_size;
+	uint32_t num_texture_samples;
+	struct vc4_texture_sample_info *texture_samples;
+};
+
+/**
  * _wait_for - magic (register) wait macro
  *
  * Does the right thing for modeset paths when run under kgdb or similar atomic
@@ -157,8 +198,13 @@ struct dma_buf *vc4_prime_export(struct drm_device *dev,
 				 struct drm_gem_object *obj, int flags);
 int vc4_create_bo_ioctl(struct drm_device *dev, void *data,
 			struct drm_file *file_priv);
+int vc4_create_shader_bo_ioctl(struct drm_device *dev, void *data,
+			       struct drm_file *file_priv);
 int vc4_mmap_bo_ioctl(struct drm_device *dev, void *data,
 		      struct drm_file *file_priv);
+int vc4_mmap(struct file *filp, struct vm_area_struct *vma);
+int vc4_prime_mmap(struct drm_gem_object *obj, struct vm_area_struct *vma);
+void *vc4_prime_vmap(struct drm_gem_object *obj);
 void vc4_bo_cache_init(struct drm_device *dev);
 void vc4_bo_cache_destroy(struct drm_device *dev);
 int vc4_bo_stats_debugfs(struct seq_file *m, void *arg);
@@ -194,3 +240,7 @@ struct drm_plane *vc4_plane_init(struct drm_device *dev,
 				 enum drm_plane_type type);
 u32 vc4_plane_write_dlist(struct drm_plane *plane, u32 __iomem *dlist);
 u32 vc4_plane_dlist_size(struct drm_plane_state *state);
+
+/* vc4_validate_shader.c */
+struct vc4_validated_shader_info *
+vc4_validate_shader(struct drm_gem_cma_object *shader_obj);
diff --git a/drivers/gpu/drm/vc4/vc4_qpu_defines.h b/drivers/gpu/drm/vc4/vc4_qpu_defines.h
new file mode 100644
index 0000000..d5c2f3c
--- /dev/null
+++ b/drivers/gpu/drm/vc4/vc4_qpu_defines.h
@@ -0,0 +1,264 @@
+/*
+ * Copyright © 2014 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef VC4_QPU_DEFINES_H
+#define VC4_QPU_DEFINES_H
+
+enum qpu_op_add {
+	QPU_A_NOP,
+	QPU_A_FADD,
+	QPU_A_FSUB,
+	QPU_A_FMIN,
+	QPU_A_FMAX,
+	QPU_A_FMINABS,
+	QPU_A_FMAXABS,
+	QPU_A_FTOI,
+	QPU_A_ITOF,
+	QPU_A_ADD = 12,
+	QPU_A_SUB,
+	QPU_A_SHR,
+	QPU_A_ASR,
+	QPU_A_ROR,
+	QPU_A_SHL,
+	QPU_A_MIN,
+	QPU_A_MAX,
+	QPU_A_AND,
+	QPU_A_OR,
+	QPU_A_XOR,
+	QPU_A_NOT,
+	QPU_A_CLZ,
+	QPU_A_V8ADDS = 30,
+	QPU_A_V8SUBS = 31,
+};
+
+enum qpu_op_mul {
+	QPU_M_NOP,
+	QPU_M_FMUL,
+	QPU_M_MUL24,
+	QPU_M_V8MULD,
+	QPU_M_V8MIN,
+	QPU_M_V8MAX,
+	QPU_M_V8ADDS,
+	QPU_M_V8SUBS,
+};
+
+enum qpu_raddr {
+	QPU_R_FRAG_PAYLOAD_ZW = 15, /* W for A file, Z for B file */
+	/* 0-31 are the plain regfile a or b fields */
+	QPU_R_UNIF = 32,
+	QPU_R_VARY = 35,
+	QPU_R_ELEM_QPU = 38,
+	QPU_R_NOP,
+	QPU_R_XY_PIXEL_COORD = 41,
+	QPU_R_MS_REV_FLAGS = 41,
+	QPU_R_VPM = 48,
+	QPU_R_VPM_LD_BUSY,
+	QPU_R_VPM_LD_WAIT,
+	QPU_R_MUTEX_ACQUIRE,
+};
+
+enum qpu_waddr {
+	/* 0-31 are the plain regfile a or b fields */
+	QPU_W_ACC0 = 32, /* aka r0 */
+	QPU_W_ACC1,
+	QPU_W_ACC2,
+	QPU_W_ACC3,
+	QPU_W_TMU_NOSWAP,
+	QPU_W_ACC5,
+	QPU_W_HOST_INT,
+	QPU_W_NOP,
+	QPU_W_UNIFORMS_ADDRESS,
+	QPU_W_QUAD_XY, /* X for regfile a, Y for regfile b */
+	QPU_W_MS_FLAGS = 42,
+	QPU_W_REV_FLAG = 42,
+	QPU_W_TLB_STENCIL_SETUP = 43,
+	QPU_W_TLB_Z,
+	QPU_W_TLB_COLOR_MS,
+	QPU_W_TLB_COLOR_ALL,
+	QPU_W_TLB_ALPHA_MASK,
+	QPU_W_VPM,
+	QPU_W_VPMVCD_SETUP, /* LD for regfile a, ST for regfile b */
+	QPU_W_VPM_ADDR, /* LD for regfile a, ST for regfile b */
+	QPU_W_MUTEX_RELEASE,
+	QPU_W_SFU_RECIP,
+	QPU_W_SFU_RECIPSQRT,
+	QPU_W_SFU_EXP,
+	QPU_W_SFU_LOG,
+	QPU_W_TMU0_S,
+	QPU_W_TMU0_T,
+	QPU_W_TMU0_R,
+	QPU_W_TMU0_B,
+	QPU_W_TMU1_S,
+	QPU_W_TMU1_T,
+	QPU_W_TMU1_R,
+	QPU_W_TMU1_B,
+};
+
+enum qpu_sig_bits {
+	QPU_SIG_SW_BREAKPOINT,
+	QPU_SIG_NONE,
+	QPU_SIG_THREAD_SWITCH,
+	QPU_SIG_PROG_END,
+	QPU_SIG_WAIT_FOR_SCOREBOARD,
+	QPU_SIG_SCOREBOARD_UNLOCK,
+	QPU_SIG_LAST_THREAD_SWITCH,
+	QPU_SIG_COVERAGE_LOAD,
+	QPU_SIG_COLOR_LOAD,
+	QPU_SIG_COLOR_LOAD_END,
+	QPU_SIG_LOAD_TMU0,
+	QPU_SIG_LOAD_TMU1,
+	QPU_SIG_ALPHA_MASK_LOAD,
+	QPU_SIG_SMALL_IMM,
+	QPU_SIG_LOAD_IMM,
+	QPU_SIG_BRANCH
+};
+
+enum qpu_mux {
+	/* hardware mux values */
+	QPU_MUX_R0,
+	QPU_MUX_R1,
+	QPU_MUX_R2,
+	QPU_MUX_R3,
+	QPU_MUX_R4,
+	QPU_MUX_R5,
+	QPU_MUX_A,
+	QPU_MUX_B,
+
+	/* non-hardware mux values */
+	QPU_MUX_IMM,
+};
+
+enum qpu_cond {
+	QPU_COND_NEVER,
+	QPU_COND_ALWAYS,
+	QPU_COND_ZS,
+	QPU_COND_ZC,
+	QPU_COND_NS,
+	QPU_COND_NC,
+	QPU_COND_CS,
+	QPU_COND_CC,
+};
+
+enum qpu_pack_mul {
+	QPU_PACK_MUL_NOP,
+	/* replicated to each 8 bits of the 32-bit dst. */
+	QPU_PACK_MUL_8888 = 3,
+	QPU_PACK_MUL_8A,
+	QPU_PACK_MUL_8B,
+	QPU_PACK_MUL_8C,
+	QPU_PACK_MUL_8D,
+};
+
+enum qpu_pack_a {
+	QPU_PACK_A_NOP,
+	/* convert to 16 bit float if float input, or to int16. */
+	QPU_PACK_A_16A,
+	QPU_PACK_A_16B,
+	/* replicated to each 8 bits of the 32-bit dst. */
+	QPU_PACK_A_8888,
+	/* Convert to 8-bit unsigned int. */
+	QPU_PACK_A_8A,
+	QPU_PACK_A_8B,
+	QPU_PACK_A_8C,
+	QPU_PACK_A_8D,
+
+	/* Saturating variants of the previous instructions. */
+	QPU_PACK_A_32_SAT, /* int-only */
+	QPU_PACK_A_16A_SAT, /* int or float */
+	QPU_PACK_A_16B_SAT,
+	QPU_PACK_A_8888_SAT,
+	QPU_PACK_A_8A_SAT,
+	QPU_PACK_A_8B_SAT,
+	QPU_PACK_A_8C_SAT,
+	QPU_PACK_A_8D_SAT,
+};
+
+enum qpu_unpack_r4 {
+	QPU_UNPACK_R4_NOP,
+	QPU_UNPACK_R4_F16A_TO_F32,
+	QPU_UNPACK_R4_F16B_TO_F32,
+	QPU_UNPACK_R4_8D_REP,
+	QPU_UNPACK_R4_8A,
+	QPU_UNPACK_R4_8B,
+	QPU_UNPACK_R4_8C,
+	QPU_UNPACK_R4_8D,
+};
+
+#define QPU_MASK(high, low) \
+	((((uint64_t)1 << ((high) - (low) + 1)) - 1) << (low))
+
+#define QPU_GET_FIELD(word, field) \
+	((uint32_t)(((word)  & field ## _MASK) >> field ## _SHIFT))
+
+#define QPU_SIG_SHIFT                   60
+#define QPU_SIG_MASK                    QPU_MASK(63, 60)
+
+#define QPU_UNPACK_SHIFT                57
+#define QPU_UNPACK_MASK                 QPU_MASK(59, 57)
+
+/**
+ * If set, the pack field means PACK_MUL or R4 packing, instead of normal
+ * regfile a packing.
+ */
+#define QPU_PM                          ((uint64_t)1 << 56)
+
+#define QPU_PACK_SHIFT                  52
+#define QPU_PACK_MASK                   QPU_MASK(55, 52)
+
+#define QPU_COND_ADD_SHIFT              49
+#define QPU_COND_ADD_MASK               QPU_MASK(51, 49)
+#define QPU_COND_MUL_SHIFT              46
+#define QPU_COND_MUL_MASK               QPU_MASK(48, 46)
+
+#define QPU_SF                          ((uint64_t)1 << 45)
+
+#define QPU_WADDR_ADD_SHIFT             38
+#define QPU_WADDR_ADD_MASK              QPU_MASK(43, 38)
+#define QPU_WADDR_MUL_SHIFT             32
+#define QPU_WADDR_MUL_MASK              QPU_MASK(37, 32)
+
+#define QPU_OP_MUL_SHIFT                29
+#define QPU_OP_MUL_MASK                 QPU_MASK(31, 29)
+
+#define QPU_RADDR_A_SHIFT               18
+#define QPU_RADDR_A_MASK                QPU_MASK(23, 18)
+#define QPU_RADDR_B_SHIFT               12
+#define QPU_RADDR_B_MASK                QPU_MASK(17, 12)
+#define QPU_SMALL_IMM_SHIFT             12
+#define QPU_SMALL_IMM_MASK              QPU_MASK(17, 12)
+
+#define QPU_ADD_A_SHIFT                 9
+#define QPU_ADD_A_MASK                  QPU_MASK(11, 9)
+#define QPU_ADD_B_SHIFT                 6
+#define QPU_ADD_B_MASK                  QPU_MASK(8, 6)
+#define QPU_MUL_A_SHIFT                 3
+#define QPU_MUL_A_MASK                  QPU_MASK(5, 3)
+#define QPU_MUL_B_SHIFT                 0
+#define QPU_MUL_B_MASK                  QPU_MASK(2, 0)
+
+#define QPU_WS                          ((uint64_t)1 << 44)
+
+#define QPU_OP_ADD_SHIFT                24
+#define QPU_OP_ADD_MASK                 QPU_MASK(28, 24)
+
+#endif /* VC4_QPU_DEFINES_H */
diff --git a/drivers/gpu/drm/vc4/vc4_validate_shaders.c b/drivers/gpu/drm/vc4/vc4_validate_shaders.c
new file mode 100644
index 0000000..f67124b
--- /dev/null
+++ b/drivers/gpu/drm/vc4/vc4_validate_shaders.c
@@ -0,0 +1,513 @@
+/*
+ * Copyright © 2014 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * DOC: Shader validator for VC4.
+ *
+ * The VC4 has no IOMMU between it and system memory, so a user with
+ * access to execute shaders could escalate privilege by overwriting
+ * system memory (using the VPM write address register in the
+ * general-purpose DMA mode) or reading system memory it shouldn't
+ * (reading it as a texture, or uniform data, or vertex data).
+ *
+ * This walks over a shader BO, ensuring that its accesses are
+ * appropriately bounded, and recording how many texture accesses are
+ * made and where so that we can do relocations for them in the
+ * uniform stream.
+ */
+
+#include "vc4_drv.h"
+#include "vc4_qpu_defines.h"
+
+struct vc4_shader_validation_state {
+	struct vc4_texture_sample_info tmu_setup[2];
+	int tmu_write_count[2];
+
+	/* For registers that were last written to by a MIN instruction with
+	 * one argument being a uniform, the address of the uniform.
+	 * Otherwise, ~0.
+	 *
+	 * This is used for the validation of direct address memory reads.
+	 */
+	uint32_t live_min_clamp_offsets[32 + 32 + 4];
+	bool live_max_clamp_regs[32 + 32 + 4];
+};
+
+static uint32_t
+waddr_to_live_reg_index(uint32_t waddr, bool is_b)
+{
+	if (waddr < 32) {
+		if (is_b)
+			return 32 + waddr;
+		else
+			return waddr;
+	} else if (waddr <= QPU_W_ACC3) {
+		return 64 + waddr - QPU_W_ACC0;
+	} else {
+		return ~0;
+	}
+}
+
+static uint32_t
+raddr_add_a_to_live_reg_index(uint64_t inst)
+{
+	uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
+	uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A);
+	uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
+	uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
+
+	if (add_a == QPU_MUX_A)
+		return raddr_a;
+	else if (add_a == QPU_MUX_B && sig != QPU_SIG_SMALL_IMM)
+		return 32 + raddr_b;
+	else if (add_a <= QPU_MUX_R3)
+		return 64 + add_a;
+	else
+		return ~0;
+}
+
+static bool
+is_tmu_submit(uint32_t waddr)
+{
+	return (waddr == QPU_W_TMU0_S ||
+		waddr == QPU_W_TMU1_S);
+}
+
+static bool
+is_tmu_write(uint32_t waddr)
+{
+	return (waddr >= QPU_W_TMU0_S &&
+		waddr <= QPU_W_TMU1_B);
+}
+
+static bool
+record_texture_sample(struct vc4_validated_shader_info *validated_shader,
+		      struct vc4_shader_validation_state *validation_state,
+		      int tmu)
+{
+	uint32_t s = validated_shader->num_texture_samples;
+	int i;
+	struct vc4_texture_sample_info *temp_samples;
+
+	temp_samples = krealloc(validated_shader->texture_samples,
+				(s + 1) * sizeof(*temp_samples),
+				GFP_KERNEL);
+	if (!temp_samples)
+		return false;
+
+	memcpy(&temp_samples[s],
+	       &validation_state->tmu_setup[tmu],
+	       sizeof(*temp_samples));
+
+	validated_shader->num_texture_samples = s + 1;
+	validated_shader->texture_samples = temp_samples;
+
+	for (i = 0; i < 4; i++)
+		validation_state->tmu_setup[tmu].p_offset[i] = ~0;
+
+	return true;
+}
+
+static bool
+check_tmu_write(uint64_t inst,
+		struct vc4_validated_shader_info *validated_shader,
+		struct vc4_shader_validation_state *validation_state,
+		bool is_mul)
+{
+	uint32_t waddr = (is_mul ?
+			  QPU_GET_FIELD(inst, QPU_WADDR_MUL) :
+			  QPU_GET_FIELD(inst, QPU_WADDR_ADD));
+	uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
+	uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
+	int tmu = waddr > QPU_W_TMU0_B;
+	bool submit = is_tmu_submit(waddr);
+	bool is_direct = submit && validation_state->tmu_write_count[tmu] == 0;
+	uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
+
+	if (is_direct) {
+		uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
+		uint32_t clamp_reg, clamp_offset;
+
+		if (sig == QPU_SIG_SMALL_IMM) {
+			DRM_ERROR("direct TMU read used small immediate\n");
+			return false;
+		}
+
+		/* Make sure that this texture load is an add of the base
+		 * address of the UBO to a clamped offset within the UBO.
+		 */
+		if (is_mul ||
+		    QPU_GET_FIELD(inst, QPU_OP_ADD) != QPU_A_ADD) {
+			DRM_ERROR("direct TMU load wasn't an add\n");
+			return false;
+		}
+
+		/* We assert that the clamped address is the first
+		 * argument, and the UBO base address is the second argument.
+		 * This is arbitrary, but simpler than supporting flipping the
+		 * two either way.
+		 */
+		clamp_reg = raddr_add_a_to_live_reg_index(inst);
+		if (clamp_reg == ~0) {
+			DRM_ERROR("direct TMU load wasn't clamped\n");
+			return false;
+		}
+
+		clamp_offset = validation_state->live_min_clamp_offsets[clamp_reg];
+		if (clamp_offset == ~0) {
+			DRM_ERROR("direct TMU load wasn't clamped\n");
+			return false;
+		}
+
+		/* Store the clamp value's offset in p1 (see reloc_tex() in
+		 * vc4_validate.c).
+		 */
+		validation_state->tmu_setup[tmu].p_offset[1] =
+			clamp_offset;
+
+		if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) &&
+		    !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF)) {
+			DRM_ERROR("direct TMU load didn't add to a uniform\n");
+			return false;
+		}
+
+		validation_state->tmu_setup[tmu].is_direct = true;
+	} else {
+		if (raddr_a == QPU_R_UNIF || (sig != QPU_SIG_SMALL_IMM &&
+					      raddr_b == QPU_R_UNIF)) {
+			DRM_ERROR("uniform read in the same instruction as "
+				  "texture setup.\n");
+			return false;
+		}
+	}
+
+	if (validation_state->tmu_write_count[tmu] >= 4) {
+		DRM_ERROR("TMU%d got too many parameters before dispatch\n",
+			  tmu);
+		return false;
+	}
+	validation_state->tmu_setup[tmu].p_offset[validation_state->tmu_write_count[tmu]] =
+		validated_shader->uniforms_size;
+	validation_state->tmu_write_count[tmu]++;
+	/* Since direct uses a RADDR uniform reference, it will get counted in
+	 * check_instruction_reads()
+	 */
+	if (!is_direct)
+		validated_shader->uniforms_size += 4;
+
+	if (submit) {
+		if (!record_texture_sample(validated_shader,
+					   validation_state, tmu)) {
+			return false;
+		}
+
+		validation_state->tmu_write_count[tmu] = 0;
+	}
+
+	return true;
+}
+
+static bool
+check_reg_write(uint64_t inst,
+		struct vc4_validated_shader_info *validated_shader,
+		struct vc4_shader_validation_state *validation_state,
+		bool is_mul)
+{
+	uint32_t waddr = (is_mul ?
+			  QPU_GET_FIELD(inst, QPU_WADDR_MUL) :
+			  QPU_GET_FIELD(inst, QPU_WADDR_ADD));
+
+	switch (waddr) {
+	case QPU_W_UNIFORMS_ADDRESS:
+		/* XXX: We'll probably need to support this for reladdr, but
+		 * it's definitely a security-related one.
+		 */
+		DRM_ERROR("uniforms address load unsupported\n");
+		return false;
+
+	case QPU_W_TLB_COLOR_MS:
+	case QPU_W_TLB_COLOR_ALL:
+	case QPU_W_TLB_Z:
+		/* These only interact with the tile buffer, not main memory,
+		 * so they're safe.
+		 */
+		return true;
+
+	case QPU_W_TMU0_S:
+	case QPU_W_TMU0_T:
+	case QPU_W_TMU0_R:
+	case QPU_W_TMU0_B:
+	case QPU_W_TMU1_S:
+	case QPU_W_TMU1_T:
+	case QPU_W_TMU1_R:
+	case QPU_W_TMU1_B:
+		return check_tmu_write(inst, validated_shader, validation_state,
+				       is_mul);
+
+	case QPU_W_HOST_INT:
+	case QPU_W_TMU_NOSWAP:
+	case QPU_W_TLB_ALPHA_MASK:
+	case QPU_W_MUTEX_RELEASE:
+		/* XXX: I haven't thought about these, so don't support them
+		 * for now.
+		 */
+		DRM_ERROR("Unsupported waddr %d\n", waddr);
+		return false;
+
+	case QPU_W_VPM_ADDR:
+		DRM_ERROR("General VPM DMA unsupported\n");
+		return false;
+
+	case QPU_W_VPM:
+	case QPU_W_VPMVCD_SETUP:
+		/* We allow VPM setup in general, even including VPM DMA
+		 * configuration setup, because the (unsafe) DMA can only be
+		 * triggered by QPU_W_VPM_ADDR writes.
+		 */
+		return true;
+
+	case QPU_W_TLB_STENCIL_SETUP:
+		return true;
+	}
+
+	return true;
+}
+
+static void
+track_live_clamps(uint64_t inst,
+		  struct vc4_validated_shader_info *validated_shader,
+		  struct vc4_shader_validation_state *validation_state)
+{
+	uint32_t op_add = QPU_GET_FIELD(inst, QPU_OP_ADD);
+	uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
+	uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
+	uint32_t cond_add = QPU_GET_FIELD(inst, QPU_COND_ADD);
+	uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A);
+	uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
+	uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
+	uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
+	uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
+	bool ws = inst & QPU_WS;
+	uint32_t lri_add_a, lri_add, lri_mul;
+	bool add_a_is_min_0;
+
+	/* Check whether OP_ADD's A argument comes from a live MAX(x, 0),
+	 * before we clear previous live state.
+	 */
+	lri_add_a = raddr_add_a_to_live_reg_index(inst);
+	add_a_is_min_0 = (lri_add_a != ~0 &&
+			  validation_state->live_max_clamp_regs[lri_add_a]);
+
+	/* Clear live state for registers written by our instruction. */
+	lri_add = waddr_to_live_reg_index(waddr_add, ws);
+	lri_mul = waddr_to_live_reg_index(waddr_mul, !ws);
+	if (lri_mul != ~0) {
+		validation_state->live_max_clamp_regs[lri_mul] = false;
+		validation_state->live_min_clamp_offsets[lri_mul] = ~0;
+	}
+	if (lri_add != ~0) {
+		validation_state->live_max_clamp_regs[lri_add] = false;
+		validation_state->live_min_clamp_offsets[lri_add] = ~0;
+	} else {
+		/* Nothing further to do for live tracking, since only ADDs
+		 * generate new live clamp registers.
+		 */
+		return;
+	}
+
+	/* Now, handle remaining live clamp tracking for the ADD operation. */
+
+	if (cond_add != QPU_COND_ALWAYS)
+		return;
+
+	if (op_add == QPU_A_MAX) {
+		/* Track live clamps of a value to a minimum of 0 (in either
+		 * arg).
+		 */
+		if (sig != QPU_SIG_SMALL_IMM || raddr_b != 0 ||
+		    (add_a != QPU_MUX_B && add_b != QPU_MUX_B)) {
+			return;
+		}
+
+		validation_state->live_max_clamp_regs[lri_add] = true;
+	} else if (op_add == QPU_A_MIN) {
+		/* Track live clamps of a value clamped to a minimum of 0 and
+		 * a maximum of some uniform's offset.
+		 */
+		if (!add_a_is_min_0)
+			return;
+
+		if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) &&
+		    !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF &&
+		      sig != QPU_SIG_SMALL_IMM)) {
+			return;
+		}
+
+		validation_state->live_min_clamp_offsets[lri_add] =
+			validated_shader->uniforms_size;
+	}
+}
+
+static bool
+check_instruction_writes(uint64_t inst,
+			 struct vc4_validated_shader_info *validated_shader,
+			 struct vc4_shader_validation_state *validation_state)
+{
+	uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
+	uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
+	bool ok;
+
+	if (is_tmu_write(waddr_add) && is_tmu_write(waddr_mul)) {
+		DRM_ERROR("ADD and MUL both set up textures\n");
+		return false;
+	}
+
+	ok = (check_reg_write(inst, validated_shader, validation_state,
+			      false) &&
+	      check_reg_write(inst, validated_shader, validation_state,
+			      true));
+
+	track_live_clamps(inst, validated_shader, validation_state);
+
+	return ok;
+}
+
+static bool
+check_instruction_reads(uint64_t inst,
+			struct vc4_validated_shader_info *validated_shader)
+{
+	uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
+	uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
+	uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
+
+	if (raddr_a == QPU_R_UNIF ||
+	    (raddr_b == QPU_R_UNIF && sig != QPU_SIG_SMALL_IMM)) {
+		/* This can't overflow the uint32_t, because every 4 bytes
+		 * we add here corresponds to 8 bytes of instruction read,
+		 * so we'd run out of shader BO space first.
+		 */
+		validated_shader->uniforms_size += 4;
+	}
+
+	return true;
+}
+
+struct vc4_validated_shader_info *
+vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
+{
+	bool found_shader_end = false;
+	int shader_end_ip = 0;
+	uint32_t ip, max_ip;
+	uint64_t *shader;
+	struct vc4_validated_shader_info *validated_shader;
+	struct vc4_shader_validation_state validation_state;
+	int i;
+
+	memset(&validation_state, 0, sizeof(validation_state));
+
+	for (i = 0; i < 8; i++)
+		validation_state.tmu_setup[i / 4].p_offset[i % 4] = ~0;
+	for (i = 0; i < ARRAY_SIZE(validation_state.live_min_clamp_offsets); i++)
+		validation_state.live_min_clamp_offsets[i] = ~0;
+
+	shader = shader_obj->vaddr;
+	max_ip = shader_obj->base.size / sizeof(uint64_t);
+
+	validated_shader = kzalloc(sizeof(*validated_shader), GFP_KERNEL);
+	if (!validated_shader)
+		return NULL;
+
+	for (ip = 0; ip < max_ip; ip++) {
+		uint64_t inst = shader[ip];
+		uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
+
+		switch (sig) {
+		case QPU_SIG_NONE:
+		case QPU_SIG_WAIT_FOR_SCOREBOARD:
+		case QPU_SIG_SCOREBOARD_UNLOCK:
+		case QPU_SIG_COLOR_LOAD:
+		case QPU_SIG_LOAD_TMU0:
+		case QPU_SIG_LOAD_TMU1:
+		case QPU_SIG_PROG_END:
+		case QPU_SIG_SMALL_IMM:
+			if (!check_instruction_writes(inst, validated_shader,
+						      &validation_state)) {
+				DRM_ERROR("Bad write at ip %d\n", ip);
+				goto fail;
+			}
+
+			if (!check_instruction_reads(inst, validated_shader))
+				goto fail;
+
+			if (sig == QPU_SIG_PROG_END) {
+				found_shader_end = true;
+				shader_end_ip = ip;
+			}
+
+			break;
+
+		case QPU_SIG_LOAD_IMM:
+			if (!check_instruction_writes(inst, validated_shader,
+						      &validation_state)) {
+				DRM_ERROR("Bad LOAD_IMM write at ip %d\n", ip);
+				goto fail;
+			}
+			break;
+
+		default:
+			DRM_ERROR("Unsupported QPU signal %d at "
+				  "instruction %d\n", sig, ip);
+			goto fail;
+		}
+
+		/* There are two delay slots after program end is signaled
+		 * that are still executed, then we're finished.
+		 */
+		if (found_shader_end && ip == shader_end_ip + 2)
+			break;
+	}
+
+	if (ip == max_ip) {
+		DRM_ERROR("shader failed to terminate before "
+			  "shader BO end at %zd\n",
+			  shader_obj->base.size);
+		goto fail;
+	}
+
+	/* Again, no chance of integer overflow here because the worst case
+	 * scenario is 8 bytes of uniforms plus handles per 8-byte
+	 * instruction.
+	 */
+	validated_shader->uniforms_src_size =
+		(validated_shader->uniforms_size +
+		 4 * validated_shader->num_texture_samples);
+
+	return validated_shader;
+
+fail:
+	if (validated_shader) {
+		kfree(validated_shader->texture_samples);
+		kfree(validated_shader);
+	}
+	return NULL;
+}
diff --git a/include/uapi/drm/vc4_drm.h b/include/uapi/drm/vc4_drm.h
index 068aab9..0716b50 100644
--- a/include/uapi/drm/vc4_drm.h
+++ b/include/uapi/drm/vc4_drm.h
@@ -28,9 +28,11 @@
 
 #define DRM_VC4_CREATE_BO                         0x03
 #define DRM_VC4_MMAP_BO                           0x04
+#define DRM_VC4_CREATE_SHADER_BO                  0x05
 
 #define DRM_IOCTL_VC4_CREATE_BO           DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_CREATE_BO, struct drm_vc4_create_bo)
 #define DRM_IOCTL_VC4_MMAP_BO             DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_MMAP_BO, struct drm_vc4_mmap_bo)
+#define DRM_IOCTL_VC4_CREATE_SHADER_BO    DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_CREATE_SHADER_BO, struct drm_vc4_create_shader_bo)
 
 /**
  * struct drm_vc4_create_bo - ioctl argument for creating VC4 BOs.
@@ -65,4 +67,27 @@ struct drm_vc4_mmap_bo {
 	uint64_t offset;
 };
 
+/**
+ * struct drm_vc4_create_shader_bo - ioctl argument for creating VC4
+ * shader BOs.
+ *
+ * Since allowing a shader to be overwritten while it's also being
+ * executed from would allow privilege escalation, shaders must be
+ * created using this ioctl, and they can't be mmapped later.
+ */
+struct drm_vc4_create_shader_bo {
+	/* Size of the data argument. */
+	uint32_t size;
+	/* Flags, currently must be 0. */
+	uint32_t flags;
+
+	/* Pointer to the data. */
+	uint64_t data;
+
+	/* Returned GEM handle for the BO. */
+	uint32_t handle;
+	/* Pad, must be 0. */
+	uint32_t pad;
+};
+
 #endif /* _UAPI_VC4_DRM_H_ */
-- 
2.6.2

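As a rough usage sketch (not part of this series -- the helper name is
hypothetical, and `fd' is assumed to be an open DRM node with the uapi
header above installed), userspace would drive the new ioctl along
these lines:

	#include <stdint.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include "vc4_drm.h"

	/* Hypothetical helper: wraps some QPU code in a shader BO and
	 * returns 0 with the GEM handle in *handle.  Per the struct
	 * comment above, the resulting BO can't be mmapped later.
	 */
	static int vc4_create_shader(int fd, const uint64_t *insts,
				     uint32_t size, uint32_t *handle)
	{
		struct drm_vc4_create_shader_bo args;

		memset(&args, 0, sizeof(args));
		args.size = size;		/* bytes of QPU code */
		args.data = (uintptr_t)insts;	/* user pointer as u64 */

		if (ioctl(fd, DRM_IOCTL_VC4_CREATE_SHADER_BO, &args) != 0)
			return -1;

		*handle = args.handle;
		return 0;
	}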
^ permalink raw reply related	[flat|nested] 53+ messages in thread

* [PATCH 5/9] drm/vc4: Fix a typo in a V3D debug register.
  2015-12-01 20:35 ` Eric Anholt
@ 2015-12-01 20:35   ` Eric Anholt
  -1 siblings, 0 replies; 53+ messages in thread
From: Eric Anholt @ 2015-12-01 20:35 UTC (permalink / raw)
  To: dri-devel; +Cc: linux-kernel, David Airlie, Eric Anholt

Signed-off-by: Eric Anholt <eric@anholt.net>
---
 drivers/gpu/drm/vc4/vc4_regs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/vc4/vc4_regs.h b/drivers/gpu/drm/vc4/vc4_regs.h
index 9e4e904..4e52a0a 100644
--- a/drivers/gpu/drm/vc4/vc4_regs.h
+++ b/drivers/gpu/drm/vc4/vc4_regs.h
@@ -154,7 +154,7 @@
 #define V3D_PCTRS14  0x006f4
 #define V3D_PCTR15   0x006f8
 #define V3D_PCTRS15  0x006fc
-#define V3D_BGE      0x00f00
+#define V3D_DBGE     0x00f00
 #define V3D_FDBGO    0x00f04
 #define V3D_FDBGB    0x00f08
 #define V3D_FDBGR    0x00f0c
-- 
2.6.2


^ permalink raw reply related	[flat|nested] 53+ messages in thread

* [PATCH 6/9] drm/vc4: Bind and initialize the V3D engine.
  2015-12-01 20:35 ` Eric Anholt
@ 2015-12-01 20:35   ` Eric Anholt
  -1 siblings, 0 replies; 53+ messages in thread
From: Eric Anholt @ 2015-12-01 20:35 UTC (permalink / raw)
  To: dri-devel; +Cc: linux-kernel, David Airlie, Eric Anholt

This is the component of the GPU that does 3D rendering.

Signed-off-by: Eric Anholt <eric@anholt.net>
---
 drivers/gpu/drm/vc4/Makefile      |   1 +
 drivers/gpu/drm/vc4/vc4_debugfs.c |   2 +
 drivers/gpu/drm/vc4/vc4_drv.c     |   1 +
 drivers/gpu/drm/vc4/vc4_drv.h     |  13 +++
 drivers/gpu/drm/vc4/vc4_v3d.c     | 225 ++++++++++++++++++++++++++++++++++++++
 5 files changed, 242 insertions(+)
 create mode 100644 drivers/gpu/drm/vc4/vc4_v3d.c

diff --git a/drivers/gpu/drm/vc4/Makefile b/drivers/gpu/drm/vc4/Makefile
index eb776a6..e87a6f2 100644
--- a/drivers/gpu/drm/vc4/Makefile
+++ b/drivers/gpu/drm/vc4/Makefile
@@ -11,6 +11,7 @@ vc4-y := \
 	vc4_hdmi.o \
 	vc4_hvs.o \
 	vc4_plane.o \
+	vc4_v3d.o \
 	vc4_validate_shaders.o
 
 vc4-$(CONFIG_DEBUG_FS) += vc4_debugfs.o
diff --git a/drivers/gpu/drm/vc4/vc4_debugfs.c b/drivers/gpu/drm/vc4/vc4_debugfs.c
index 6bcf96e..d76ad10 100644
--- a/drivers/gpu/drm/vc4/vc4_debugfs.c
+++ b/drivers/gpu/drm/vc4/vc4_debugfs.c
@@ -22,6 +22,8 @@ static const struct drm_info_list vc4_debugfs_list[] = {
 	{"crtc0_regs", vc4_crtc_debugfs_regs, 0, (void *)(uintptr_t)0},
 	{"crtc1_regs", vc4_crtc_debugfs_regs, 0, (void *)(uintptr_t)1},
 	{"crtc2_regs", vc4_crtc_debugfs_regs, 0, (void *)(uintptr_t)2},
+	{"v3d_ident", vc4_v3d_debugfs_ident, 0},
+	{"v3d_regs", vc4_v3d_debugfs_regs, 0},
 };
 
 #define VC4_DEBUGFS_ENTRIES ARRAY_SIZE(vc4_debugfs_list)
diff --git a/drivers/gpu/drm/vc4/vc4_drv.c b/drivers/gpu/drm/vc4/vc4_drv.c
index da4be9c..db58d74 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.c
+++ b/drivers/gpu/drm/vc4/vc4_drv.c
@@ -236,6 +236,7 @@ static struct platform_driver *const component_drivers[] = {
 	&vc4_hdmi_driver,
 	&vc4_crtc_driver,
 	&vc4_hvs_driver,
+	&vc4_v3d_driver,
 };
 
 static int vc4_platform_drm_probe(struct platform_device *pdev)
diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h
index bd77d55..8945463 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.h
+++ b/drivers/gpu/drm/vc4/vc4_drv.h
@@ -15,6 +15,7 @@ struct vc4_dev {
 	struct vc4_hdmi *hdmi;
 	struct vc4_hvs *hvs;
 	struct vc4_crtc *crtc[3];
+	struct vc4_v3d *v3d;
 
 	struct drm_fbdev_cma *fbdev;
 
@@ -82,6 +83,11 @@ to_vc4_bo(struct drm_gem_object *bo)
 	return (struct vc4_bo *)bo;
 }
 
+struct vc4_v3d {
+	struct platform_device *pdev;
+	void __iomem *regs;
+};
+
 struct vc4_hvs {
 	struct platform_device *pdev;
 	void __iomem *regs;
@@ -119,6 +125,8 @@ to_vc4_encoder(struct drm_encoder *encoder)
 	return container_of(encoder, struct vc4_encoder, base);
 }
 
+#define V3D_READ(offset) readl(vc4->v3d->regs + offset)
+#define V3D_WRITE(offset, val) writel(val, vc4->v3d->regs + offset)
 #define HVS_READ(offset) readl(vc4->hvs->regs + offset)
 #define HVS_WRITE(offset, val) writel(val, vc4->hvs->regs + offset)
 
@@ -241,6 +249,11 @@ struct drm_plane *vc4_plane_init(struct drm_device *dev,
 u32 vc4_plane_write_dlist(struct drm_plane *plane, u32 __iomem *dlist);
 u32 vc4_plane_dlist_size(struct drm_plane_state *state);
 
+/* vc4_v3d.c */
+extern struct platform_driver vc4_v3d_driver;
+int vc4_v3d_debugfs_ident(struct seq_file *m, void *unused);
+int vc4_v3d_debugfs_regs(struct seq_file *m, void *unused);
+
 /* vc4_validate_shader.c */
 struct vc4_validated_shader_info *
 vc4_validate_shader(struct drm_gem_cma_object *shader_obj);
diff --git a/drivers/gpu/drm/vc4/vc4_v3d.c b/drivers/gpu/drm/vc4/vc4_v3d.c
new file mode 100644
index 0000000..040ad0d
--- /dev/null
+++ b/drivers/gpu/drm/vc4/vc4_v3d.c
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2014 The Linux Foundation. All rights reserved.
+ * Copyright (C) 2013 Red Hat
+ * Author: Rob Clark <robdclark@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "linux/component.h"
+#include "vc4_drv.h"
+#include "vc4_regs.h"
+
+#ifdef CONFIG_DEBUG_FS
+#define REGDEF(reg) { reg, #reg }
+static const struct {
+	uint32_t reg;
+	const char *name;
+} vc4_reg_defs[] = {
+	REGDEF(V3D_IDENT0),
+	REGDEF(V3D_IDENT1),
+	REGDEF(V3D_IDENT2),
+	REGDEF(V3D_SCRATCH),
+	REGDEF(V3D_L2CACTL),
+	REGDEF(V3D_SLCACTL),
+	REGDEF(V3D_INTCTL),
+	REGDEF(V3D_INTENA),
+	REGDEF(V3D_INTDIS),
+	REGDEF(V3D_CT0CS),
+	REGDEF(V3D_CT1CS),
+	REGDEF(V3D_CT0EA),
+	REGDEF(V3D_CT1EA),
+	REGDEF(V3D_CT0CA),
+	REGDEF(V3D_CT1CA),
+	REGDEF(V3D_CT00RA0),
+	REGDEF(V3D_CT01RA0),
+	REGDEF(V3D_CT0LC),
+	REGDEF(V3D_CT1LC),
+	REGDEF(V3D_CT0PC),
+	REGDEF(V3D_CT1PC),
+	REGDEF(V3D_PCS),
+	REGDEF(V3D_BFC),
+	REGDEF(V3D_RFC),
+	REGDEF(V3D_BPCA),
+	REGDEF(V3D_BPCS),
+	REGDEF(V3D_BPOA),
+	REGDEF(V3D_BPOS),
+	REGDEF(V3D_BXCF),
+	REGDEF(V3D_SQRSV0),
+	REGDEF(V3D_SQRSV1),
+	REGDEF(V3D_SQCNTL),
+	REGDEF(V3D_SRQPC),
+	REGDEF(V3D_SRQUA),
+	REGDEF(V3D_SRQUL),
+	REGDEF(V3D_SRQCS),
+	REGDEF(V3D_VPACNTL),
+	REGDEF(V3D_VPMBASE),
+	REGDEF(V3D_PCTRC),
+	REGDEF(V3D_PCTRE),
+	REGDEF(V3D_PCTR0),
+	REGDEF(V3D_PCTRS0),
+	REGDEF(V3D_PCTR1),
+	REGDEF(V3D_PCTRS1),
+	REGDEF(V3D_PCTR2),
+	REGDEF(V3D_PCTRS2),
+	REGDEF(V3D_PCTR3),
+	REGDEF(V3D_PCTRS3),
+	REGDEF(V3D_PCTR4),
+	REGDEF(V3D_PCTRS4),
+	REGDEF(V3D_PCTR5),
+	REGDEF(V3D_PCTRS5),
+	REGDEF(V3D_PCTR6),
+	REGDEF(V3D_PCTRS6),
+	REGDEF(V3D_PCTR7),
+	REGDEF(V3D_PCTRS7),
+	REGDEF(V3D_PCTR8),
+	REGDEF(V3D_PCTRS8),
+	REGDEF(V3D_PCTR9),
+	REGDEF(V3D_PCTRS9),
+	REGDEF(V3D_PCTR10),
+	REGDEF(V3D_PCTRS10),
+	REGDEF(V3D_PCTR11),
+	REGDEF(V3D_PCTRS11),
+	REGDEF(V3D_PCTR12),
+	REGDEF(V3D_PCTRS12),
+	REGDEF(V3D_PCTR13),
+	REGDEF(V3D_PCTRS13),
+	REGDEF(V3D_PCTR14),
+	REGDEF(V3D_PCTRS14),
+	REGDEF(V3D_PCTR15),
+	REGDEF(V3D_PCTRS15),
+	REGDEF(V3D_DBGE),
+	REGDEF(V3D_FDBGO),
+	REGDEF(V3D_FDBGB),
+	REGDEF(V3D_FDBGR),
+	REGDEF(V3D_FDBGS),
+	REGDEF(V3D_ERRSTAT),
+};
+
+int vc4_v3d_debugfs_regs(struct seq_file *m, void *unused)
+{
+	struct drm_info_node *node = (struct drm_info_node *)m->private;
+	struct drm_device *dev = node->minor->dev;
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(vc4_reg_defs); i++) {
+		seq_printf(m, "%s (0x%04x): 0x%08x\n",
+			   vc4_reg_defs[i].name, vc4_reg_defs[i].reg,
+			   V3D_READ(vc4_reg_defs[i].reg));
+	}
+
+	return 0;
+}
+
+int vc4_v3d_debugfs_ident(struct seq_file *m, void *unused)
+{
+	struct drm_info_node *node = (struct drm_info_node *)m->private;
+	struct drm_device *dev = node->minor->dev;
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+	uint32_t ident1 = V3D_READ(V3D_IDENT1);
+	uint32_t nslc = VC4_GET_FIELD(ident1, V3D_IDENT1_NSLC);
+	uint32_t tups = VC4_GET_FIELD(ident1, V3D_IDENT1_TUPS);
+	uint32_t qups = VC4_GET_FIELD(ident1, V3D_IDENT1_QUPS);
+
+	seq_printf(m, "Revision:   %d\n",
+		   VC4_GET_FIELD(ident1, V3D_IDENT1_REV));
+	seq_printf(m, "Slices:     %d\n", nslc);
+	seq_printf(m, "TMUs:       %d\n", nslc * tups);
+	seq_printf(m, "QPUs:       %d\n", nslc * qups);
+	seq_printf(m, "Semaphores: %d\n",
+		   VC4_GET_FIELD(ident1, V3D_IDENT1_NSEM));
+
+	return 0;
+}
+#endif /* CONFIG_DEBUG_FS */
+
+static void vc4_v3d_init_hw(struct drm_device *dev)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+
+	/* Take all the memory that would have been reserved for user
+	 * QPU programs, since we don't have an interface for running
+	 * them anyway.
+	 */
+	V3D_WRITE(V3D_VPMBASE, 0);
+}
+
+static int vc4_v3d_bind(struct device *dev, struct device *master, void *data)
+{
+	struct platform_device *pdev = to_platform_device(dev);
+	struct drm_device *drm = dev_get_drvdata(master);
+	struct vc4_dev *vc4 = to_vc4_dev(drm);
+	struct vc4_v3d *v3d = NULL;
+
+	v3d = devm_kzalloc(&pdev->dev, sizeof(*v3d), GFP_KERNEL);
+	if (!v3d)
+		return -ENOMEM;
+
+	v3d->pdev = pdev;
+
+	v3d->regs = vc4_ioremap_regs(pdev, 0);
+	if (IS_ERR(v3d->regs))
+		return PTR_ERR(v3d->regs);
+
+	vc4->v3d = v3d;
+
+	if (V3D_READ(V3D_IDENT0) != V3D_EXPECTED_IDENT0) {
+		DRM_ERROR("V3D_IDENT0 read 0x%08x instead of 0x%08x\n",
+			  V3D_READ(V3D_IDENT0), V3D_EXPECTED_IDENT0);
+		return -EINVAL;
+	}
+
+	vc4_v3d_init_hw(drm);
+
+	return 0;
+}
+
+static void vc4_v3d_unbind(struct device *dev, struct device *master,
+			   void *data)
+{
+	struct drm_device *drm = dev_get_drvdata(master);
+	struct vc4_dev *vc4 = to_vc4_dev(drm);
+
+	vc4->v3d = NULL;
+}
+
+static const struct component_ops vc4_v3d_ops = {
+	.bind   = vc4_v3d_bind,
+	.unbind = vc4_v3d_unbind,
+};
+
+static int vc4_v3d_dev_probe(struct platform_device *pdev)
+{
+	return component_add(&pdev->dev, &vc4_v3d_ops);
+}
+
+static int vc4_v3d_dev_remove(struct platform_device *pdev)
+{
+	component_del(&pdev->dev, &vc4_v3d_ops);
+	return 0;
+}
+
+static const struct of_device_id vc4_v3d_dt_match[] = {
+	{ .compatible = "brcm,vc4-v3d" },
+	{}
+};
+
+struct platform_driver vc4_v3d_driver = {
+	.probe = vc4_v3d_dev_probe,
+	.remove = vc4_v3d_dev_remove,
+	.driver = {
+		.name = "vc4_v3d",
+		.of_match_table = vc4_v3d_dt_match,
+	},
+};
-- 
2.6.2


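VC4_GET_FIELD and the V3D_IDENT1_* fields used in
vc4_v3d_debugfs_ident() are defined elsewhere in the series
(vc4_regs.h/vc4_drv.h).  As a sketch of the mask/shift convention they
follow -- the NSLC bit positions below are an assumption for
illustration, not the documented register layout:

	#include <stdint.h>

	/* Sketch of the helpers the ident parsing assumes: each FIELD
	 * name has FIELD_MASK and FIELD_SHIFT companions.
	 */
	#define VC4_MASK(high, low) \
		((uint32_t)(((1ull << ((high) - (low) + 1)) - 1) << (low)))
	#define VC4_GET_FIELD(word, field) \
		(((word) & field##_MASK) >> field##_SHIFT)

	/* Assumed example: slice count packed in IDENT1 bits 7:4. */
	#define V3D_IDENT1_NSLC_MASK	VC4_MASK(7, 4)
	#define V3D_IDENT1_NSLC_SHIFT	4

	static inline uint32_t v3d_ident1_nslc(uint32_t ident1)
	{
		/* e.g. ident1 == 0x00000031 yields 3 slices */
		return VC4_GET_FIELD(ident1, V3D_IDENT1_NSLC);
	}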
^ permalink raw reply related	[flat|nested] 53+ messages in thread

* [PATCH 7/9] drm/vc4: Add support for drawing 3D frames.
  2015-12-01 20:35 ` Eric Anholt
@ 2015-12-01 20:35   ` Eric Anholt
  -1 siblings, 0 replies; 53+ messages in thread
From: Eric Anholt @ 2015-12-01 20:35 UTC (permalink / raw)
  To: dri-devel; +Cc: linux-kernel, David Airlie, Eric Anholt

The user submission is basically a pointer to a command list and a
pointer to uniforms.  We copy those in to the kernel, validate and
relocate them, and store the result in a GPU BO which we queue for
execution.

Signed-off-by: Eric Anholt <eric@anholt.net>
---
 drivers/gpu/drm/vc4/Makefile           |   7 +
 drivers/gpu/drm/vc4/vc4_drv.c          |  15 +-
 drivers/gpu/drm/vc4/vc4_drv.h          | 192 +++++++
 drivers/gpu/drm/vc4/vc4_gem.c          | 640 ++++++++++++++++++++++
 drivers/gpu/drm/vc4/vc4_irq.c          | 210 ++++++++
 drivers/gpu/drm/vc4/vc4_packet.h       | 399 ++++++++++++++
 drivers/gpu/drm/vc4/vc4_render_cl.c    | 631 ++++++++++++++++++++++
 drivers/gpu/drm/vc4/vc4_trace.h        |  63 +++
 drivers/gpu/drm/vc4/vc4_trace_points.c |  14 +
 drivers/gpu/drm/vc4/vc4_v3d.c          |  37 ++
 drivers/gpu/drm/vc4/vc4_validate.c     | 955 +++++++++++++++++++++++++++++++++
 include/uapi/drm/vc4_drm.h             | 141 +++++
 12 files changed, 3303 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/vc4/vc4_gem.c
 create mode 100644 drivers/gpu/drm/vc4/vc4_irq.c
 create mode 100644 drivers/gpu/drm/vc4/vc4_packet.h
 create mode 100644 drivers/gpu/drm/vc4/vc4_render_cl.c
 create mode 100644 drivers/gpu/drm/vc4/vc4_trace.h
 create mode 100644 drivers/gpu/drm/vc4/vc4_trace_points.c
 create mode 100644 drivers/gpu/drm/vc4/vc4_validate.c

diff --git a/drivers/gpu/drm/vc4/Makefile b/drivers/gpu/drm/vc4/Makefile
index e87a6f2..4c6a99f 100644
--- a/drivers/gpu/drm/vc4/Makefile
+++ b/drivers/gpu/drm/vc4/Makefile
@@ -8,12 +8,19 @@ vc4-y := \
 	vc4_crtc.o \
 	vc4_drv.o \
 	vc4_kms.o \
+	vc4_gem.o \
 	vc4_hdmi.o \
 	vc4_hvs.o \
+	vc4_irq.o \
 	vc4_plane.o \
+	vc4_render_cl.o \
+	vc4_trace_points.o \
 	vc4_v3d.o \
+	vc4_validate.o \
 	vc4_validate_shaders.o
 
 vc4-$(CONFIG_DEBUG_FS) += vc4_debugfs.o
 
 obj-$(CONFIG_DRM_VC4)  += vc4.o
+
+CFLAGS_vc4_trace_points.o := -I$(src)
diff --git a/drivers/gpu/drm/vc4/vc4_drv.c b/drivers/gpu/drm/vc4/vc4_drv.c
index db58d74..2cfee59 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.c
+++ b/drivers/gpu/drm/vc4/vc4_drv.c
@@ -74,6 +74,9 @@ static const struct file_operations vc4_drm_fops = {
 };
 
 static const struct drm_ioctl_desc vc4_drm_ioctls[] = {
+	DRM_IOCTL_DEF_DRV(VC4_SUBMIT_CL, vc4_submit_cl_ioctl, 0),
+	DRM_IOCTL_DEF_DRV(VC4_WAIT_SEQNO, vc4_wait_seqno_ioctl, 0),
+	DRM_IOCTL_DEF_DRV(VC4_WAIT_BO, vc4_wait_bo_ioctl, 0),
 	DRM_IOCTL_DEF_DRV(VC4_CREATE_BO, vc4_create_bo_ioctl, 0),
 	DRM_IOCTL_DEF_DRV(VC4_MMAP_BO, vc4_mmap_bo_ioctl, 0),
 	DRM_IOCTL_DEF_DRV(VC4_CREATE_SHADER_BO, vc4_create_shader_bo_ioctl, 0),
@@ -83,10 +86,16 @@ static struct drm_driver vc4_drm_driver = {
 	.driver_features = (DRIVER_MODESET |
 			    DRIVER_ATOMIC |
 			    DRIVER_GEM |
+			    DRIVER_HAVE_IRQ |
 			    DRIVER_PRIME),
 	.lastclose = vc4_lastclose,
 	.preclose = vc4_drm_preclose,
 
+	.irq_handler = vc4_irq,
+	.irq_preinstall = vc4_irq_preinstall,
+	.irq_postinstall = vc4_irq_postinstall,
+	.irq_uninstall = vc4_irq_uninstall,
+
 	.enable_vblank = vc4_enable_vblank,
 	.disable_vblank = vc4_disable_vblank,
 	.get_vblank_counter = drm_vblank_count,
@@ -181,9 +190,11 @@ static int vc4_drm_bind(struct device *dev)
 	if (ret)
 		goto unref;
 
+	vc4_gem_init(drm);
+
 	ret = component_bind_all(dev, drm);
 	if (ret)
-		goto unref;
+		goto gem_destroy;
 
 	ret = drm_dev_register(drm, 0);
 	if (ret < 0)
@@ -207,6 +218,8 @@ unregister:
 	drm_dev_unregister(drm);
 unbind_all:
 	component_unbind_all(dev, drm);
+gem_destroy:
+	vc4_gem_destroy(drm);
 unref:
 	drm_dev_unref(drm);
 	vc4_bo_cache_destroy(drm);
diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h
index 8945463..a3dbfee 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.h
+++ b/drivers/gpu/drm/vc4/vc4_drv.h
@@ -49,6 +49,48 @@ struct vc4_dev {
 
 	/* Protects bo_cache and the BO stats. */
 	struct mutex bo_lock;
+
+	/* Sequence number for the last job queued in job_list.
+	 * Starts at 0 (no jobs emitted).
+	 */
+	uint64_t emit_seqno;
+
+	/* Sequence number for the last completed job on the GPU.
+	 * Starts at 0 (no jobs completed).
+	 */
+	uint64_t finished_seqno;
+
+	/* List of all struct vc4_exec_info for jobs to be executed.
+	 * The first job in the list is the one currently programmed
+	 * into ct0ca/ct1ca for execution.
+	 */
+	struct list_head job_list;
+	/* List of the finished vc4_exec_infos waiting to be freed by
+	 * job_done_work.
+	 */
+	struct list_head job_done_list;
+	/* Spinlock used to synchronize the job_list and seqno
+	 * accesses between the IRQ handler and GEM ioctls.
+	 */
+	spinlock_t job_lock;
+	wait_queue_head_t job_wait_queue;
+	struct work_struct job_done_work;
+
+	/* The binner overflow memory that's currently set up in
+	 * BPOA/BPOS registers.  When overflow occurs and a new one is
+	 * allocated, the previous one will be moved to
+	 * vc4->current_exec's free list.
+	 */
+	struct vc4_bo *overflow_mem;
+	struct work_struct overflow_mem_work;
+
+	struct {
+		uint32_t last_ct0ca, last_ct1ca;
+		struct timer_list timer;
+		struct work_struct reset_work;
+	} hangcheck;
+
+	struct semaphore async_modeset;
 };
 
 static inline struct vc4_dev *
@@ -60,6 +102,9 @@ to_vc4_dev(struct drm_device *dev)
 struct vc4_bo {
 	struct drm_gem_cma_object base;
 
+	/* seqno of the last job to render to this BO. */
+	uint64_t seqno;
+
 	/* List entry for the BO's position in either
 	 * vc4_exec_info->unref_list or vc4_dev->bo_cache.time_list
 	 */
@@ -130,6 +175,109 @@ to_vc4_encoder(struct drm_encoder *encoder)
 #define HVS_READ(offset) readl(vc4->hvs->regs + offset)
 #define HVS_WRITE(offset, val) writel(val, vc4->hvs->regs + offset)
 
+enum vc4_bo_mode {
+	VC4_MODE_UNDECIDED,
+	VC4_MODE_RENDER,
+	VC4_MODE_SHADER,
+};
+
+struct vc4_bo_exec_state {
+	struct drm_gem_cma_object *bo;
+	enum vc4_bo_mode mode;
+};
+
+struct vc4_exec_info {
+	/* Sequence number for this bin/render job. */
+	uint64_t seqno;
+
+	/* Kernel-space copy of the ioctl arguments */
+	struct drm_vc4_submit_cl *args;
+
+	/* This is the array of BOs that were looked up at the start of exec.
+	 * Command validation will use indices into this array.
+	 */
+	struct vc4_bo_exec_state *bo;
+	uint32_t bo_count;
+
+	/* Pointers for our position in vc4->job_list */
+	struct list_head head;
+
+	/* List of other BOs used in the job that need to be released
+	 * once the job is complete.
+	 */
+	struct list_head unref_list;
+
+	/* Current unvalidated indices into @bo loaded by the non-hardware
+	 * VC4_PACKET_GEM_HANDLES.
+	 */
+	uint32_t bo_index[2];
+
+	/* This is the BO where we store the validated command lists, shader
+	 * records, and uniforms.
+	 */
+	struct drm_gem_cma_object *exec_bo;
+
+	/**
+	 * This tracks the per-shader-record state (packet 64) that
+	 * determines the length of the shader record and the offset
+	 * it's expected to be found at.  It gets read in from the
+	 * command lists.
+	 */
+	struct vc4_shader_state {
+		uint8_t packet;
+		uint32_t addr;
+		/* Maximum vertex index referenced by any primitive using this
+		 * shader state.
+		 */
+		uint32_t max_index;
+	} *shader_state;
+
+	/** How many shader states the user declared they were using. */
+	uint32_t shader_state_size;
+	/** How many shader state records the validator has seen. */
+	uint32_t shader_state_count;
+
+	bool found_tile_binning_mode_config_packet;
+	bool found_start_tile_binning_packet;
+	bool found_increment_semaphore_packet;
+	uint8_t bin_tiles_x, bin_tiles_y;
+	struct drm_gem_cma_object *tile_bo;
+	uint32_t tile_alloc_offset;
+
+	/**
+	 * Computed addresses pointing into exec_bo where we start the
+	 * bin thread (ct0) and render thread (ct1).
+	 */
+	uint32_t ct0ca, ct0ea;
+	uint32_t ct1ca, ct1ea;
+
+	/* Pointers to the shader recs.  The paddr gets incremented as CL
+	 * packets are relocated in validate_gl_shader_state, and the vaddrs
+	 * (u and v) get incremented and size decremented as the shader recs
+	 * themselves are validated.
+	 */
+	void *shader_rec_u;
+	void *shader_rec_v;
+	uint32_t shader_rec_p;
+	uint32_t shader_rec_size;
+
+	/* Pointers to the uniform data.  These pointers are incremented, and
+	 * size decremented, as each batch of uniforms is uploaded.
+	 */
+	void *uniforms_u;
+	void *uniforms_v;
+	uint32_t uniforms_p;
+	uint32_t uniforms_size;
+};
+
+static inline struct vc4_exec_info *
+vc4_first_job(struct vc4_dev *vc4)
+{
+	if (list_empty(&vc4->job_list))
+		return NULL;
+	return list_first_entry(&vc4->job_list, struct vc4_exec_info, head);
+}
+
 /**
  * struct vc4_texture_sample_info - saves the offsets into the UBO for texture
  * setup parameters.
@@ -231,10 +379,31 @@ void vc4_debugfs_cleanup(struct drm_minor *minor);
 /* vc4_drv.c */
 void __iomem *vc4_ioremap_regs(struct platform_device *dev, int index);
 
+/* vc4_gem.c */
+void vc4_gem_init(struct drm_device *dev);
+void vc4_gem_destroy(struct drm_device *dev);
+int vc4_submit_cl_ioctl(struct drm_device *dev, void *data,
+			struct drm_file *file_priv);
+int vc4_wait_seqno_ioctl(struct drm_device *dev, void *data,
+			 struct drm_file *file_priv);
+int vc4_wait_bo_ioctl(struct drm_device *dev, void *data,
+		      struct drm_file *file_priv);
+void vc4_submit_next_job(struct drm_device *dev);
+int vc4_wait_for_seqno(struct drm_device *dev, uint64_t seqno,
+		       uint64_t timeout_ns, bool interruptible);
+void vc4_job_handle_completed(struct vc4_dev *vc4);
+
 /* vc4_hdmi.c */
 extern struct platform_driver vc4_hdmi_driver;
 int vc4_hdmi_debugfs_regs(struct seq_file *m, void *unused);
 
+/* vc4_irq.c */
+irqreturn_t vc4_irq(int irq, void *arg);
+void vc4_irq_preinstall(struct drm_device *dev);
+int vc4_irq_postinstall(struct drm_device *dev);
+void vc4_irq_uninstall(struct drm_device *dev);
+void vc4_irq_reset(struct drm_device *dev);
+
 /* vc4_hvs.c */
 extern struct platform_driver vc4_hvs_driver;
 void vc4_hvs_dump_state(struct drm_device *dev);
@@ -253,6 +422,29 @@ u32 vc4_plane_dlist_size(struct drm_plane_state *state);
 extern struct platform_driver vc4_v3d_driver;
 int vc4_v3d_debugfs_ident(struct seq_file *m, void *unused);
 int vc4_v3d_debugfs_regs(struct seq_file *m, void *unused);
+int vc4_v3d_set_power(struct vc4_dev *vc4, bool on);
+
+/* vc4_validate.c */
+int
+vc4_validate_bin_cl(struct drm_device *dev,
+		    void *validated,
+		    void *unvalidated,
+		    struct vc4_exec_info *exec);
+
+int
+vc4_validate_shader_recs(struct drm_device *dev, struct vc4_exec_info *exec);
+
+bool vc4_use_bo(struct vc4_exec_info *exec,
+		uint32_t hindex,
+		enum vc4_bo_mode mode,
+		struct drm_gem_cma_object **obj);
+
+int vc4_get_rcl(struct drm_device *dev, struct vc4_exec_info *exec);
+
+bool vc4_check_tex_size(struct vc4_exec_info *exec,
+			struct drm_gem_cma_object *fbo,
+			uint32_t offset, uint8_t tiling_format,
+			uint32_t width, uint32_t height, uint8_t cpp);
 
 /* vc4_validate_shader.c */
 struct vc4_validated_shader_info *
diff --git a/drivers/gpu/drm/vc4/vc4_gem.c b/drivers/gpu/drm/vc4/vc4_gem.c
new file mode 100644
index 0000000..ddcb018
--- /dev/null
+++ b/drivers/gpu/drm/vc4/vc4_gem.c
@@ -0,0 +1,640 @@
+/*
+ * Copyright © 2014 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/device.h>
+#include <linux/io.h>
+
+#include "uapi/drm/vc4_drm.h"
+#include "vc4_drv.h"
+#include "vc4_regs.h"
+#include "vc4_trace.h"
+
+static void
+vc4_queue_hangcheck(struct drm_device *dev)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+
+	mod_timer(&vc4->hangcheck.timer,
+		  round_jiffies_up(jiffies + msecs_to_jiffies(100)));
+}
+
+static void
+vc4_reset(struct drm_device *dev)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+
+	DRM_INFO("Resetting GPU.\n");
+	vc4_v3d_set_power(vc4, false);
+	vc4_v3d_set_power(vc4, true);
+
+	vc4_irq_reset(dev);
+
+	/* Rearm the hangcheck -- another job might have been waiting
+	 * for our hung one to get kicked off, and vc4_irq_reset()
+	 * would have started it.
+	 */
+	vc4_queue_hangcheck(dev);
+}
+
+static void
+vc4_reset_work(struct work_struct *work)
+{
+	struct vc4_dev *vc4 =
+		container_of(work, struct vc4_dev, hangcheck.reset_work);
+
+	vc4_reset(vc4->dev);
+}
+
+static void
+vc4_hangcheck_elapsed(unsigned long data)
+{
+	struct drm_device *dev = (struct drm_device *)data;
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+	uint32_t ct0ca, ct1ca;
+
+	/* If idle, we can stop watching for hangs. */
+	if (list_empty(&vc4->job_list))
+		return;
+
+	ct0ca = V3D_READ(V3D_CTNCA(0));
+	ct1ca = V3D_READ(V3D_CTNCA(1));
+
+	/* If we've made any progress in execution, rearm the timer
+	 * and wait.
+	 */
+	if (ct0ca != vc4->hangcheck.last_ct0ca ||
+	    ct1ca != vc4->hangcheck.last_ct1ca) {
+		vc4->hangcheck.last_ct0ca = ct0ca;
+		vc4->hangcheck.last_ct1ca = ct1ca;
+		vc4_queue_hangcheck(dev);
+		return;
+	}
+
+	/* We've gone too long with no progress, reset.  This has to
+	 * be done from a work struct, since resetting can sleep and
+	 * this timer hook isn't allowed to.
+	 */
+	schedule_work(&vc4->hangcheck.reset_work);
+}
+
+static void
+submit_cl(struct drm_device *dev, uint32_t thread, uint32_t start, uint32_t end)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+
+	/* Set the current and end address of the control list.
+	 * Writing the end register is what starts the job.
+	 */
+	V3D_WRITE(V3D_CTNCA(thread), start);
+	V3D_WRITE(V3D_CTNEA(thread), end);
+}
+
+int
+vc4_wait_for_seqno(struct drm_device *dev, uint64_t seqno, uint64_t timeout_ns,
+		   bool interruptible)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+	int ret = 0;
+	unsigned long timeout_expire;
+	DEFINE_WAIT(wait);
+
+	if (vc4->finished_seqno >= seqno)
+		return 0;
+
+	if (timeout_ns == 0)
+		return -ETIME;
+
+	timeout_expire = jiffies + nsecs_to_jiffies(timeout_ns);
+
+	trace_vc4_wait_for_seqno_begin(dev, seqno, timeout_ns);
+	for (;;) {
+		prepare_to_wait(&vc4->job_wait_queue, &wait,
+				interruptible ? TASK_INTERRUPTIBLE :
+				TASK_UNINTERRUPTIBLE);
+
+		if (interruptible && signal_pending(current)) {
+			ret = -ERESTARTSYS;
+			break;
+		}
+
+		if (vc4->finished_seqno >= seqno)
+			break;
+
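+		/* A timeout_ns of ~0ull is the "wait forever" sentinel. */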
+		if (timeout_ns != ~0ull) {
+			if (time_after_eq(jiffies, timeout_expire)) {
+				ret = -ETIME;
+				break;
+			}
+			schedule_timeout(timeout_expire - jiffies);
+		} else {
+			schedule();
+		}
+	}
+
+	finish_wait(&vc4->job_wait_queue, &wait);
+	trace_vc4_wait_for_seqno_end(dev, seqno);
+
+	if (ret && ret != -ERESTARTSYS) {
+		DRM_ERROR("timeout waiting for render thread idle\n");
+		return ret;
+	}
+
+	return 0;
+}
+
+static void
+vc4_flush_caches(struct drm_device *dev)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+
+	/* Flush the GPU L2 caches.  These caches sit on top of system
+	 * L3 (the 128kb or so shared with the CPU), and are
+	 * non-allocating in the L3.
+	 */
+	V3D_WRITE(V3D_L2CACTL,
+		  V3D_L2CACTL_L2CCLR);
+
+	V3D_WRITE(V3D_SLCACTL,
+		  VC4_SET_FIELD(0xf, V3D_SLCACTL_T1CC) |
+		  VC4_SET_FIELD(0xf, V3D_SLCACTL_T0CC) |
+		  VC4_SET_FIELD(0xf, V3D_SLCACTL_UCC) |
+		  VC4_SET_FIELD(0xf, V3D_SLCACTL_ICC));
+}
+
+/* Sets the registers for the next job to actually be executed in
+ * the hardware.
+ *
+ * The job_lock should be held during this.
+ */
+void
+vc4_submit_next_job(struct drm_device *dev)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+	struct vc4_exec_info *exec = vc4_first_job(vc4);
+
+	if (!exec)
+		return;
+
+	vc4_flush_caches(dev);
+
+	/* Disable the binner's pre-loaded overflow memory address */
+	V3D_WRITE(V3D_BPOA, 0);
+	V3D_WRITE(V3D_BPOS, 0);
+
+	if (exec->ct0ca != exec->ct0ea)
+		submit_cl(dev, 0, exec->ct0ca, exec->ct0ea);
+	submit_cl(dev, 1, exec->ct1ca, exec->ct1ea);
+}
+
+static void
+vc4_update_bo_seqnos(struct vc4_exec_info *exec, uint64_t seqno)
+{
+	struct vc4_bo *bo;
+	unsigned i;
+
+	for (i = 0; i < exec->bo_count; i++) {
+		bo = to_vc4_bo(&exec->bo[i].bo->base);
+		bo->seqno = seqno;
+	}
+
+	list_for_each_entry(bo, &exec->unref_list, unref_head) {
+		bo->seqno = seqno;
+	}
+}
+
+/* Queues a struct vc4_exec_info for execution.  If no job is
+ * currently executing, then submits it.
+ *
+ * Unlike most GPUs, our hardware only handles one command list at a
+ * time.  To queue multiple jobs at once, we'd need to edit the
+ * previous command list to have a jump to the new one at the end, and
+ * then bump the end address.  That's a change for a later date,
+ * though.
+ */
+static void
+vc4_queue_submit(struct drm_device *dev, struct vc4_exec_info *exec)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+	uint64_t seqno;
+	unsigned long irqflags;
+
+	spin_lock_irqsave(&vc4->job_lock, irqflags);
+
+	seqno = ++vc4->emit_seqno;
+	exec->seqno = seqno;
+	vc4_update_bo_seqnos(exec, seqno);
+
+	list_add_tail(&exec->head, &vc4->job_list);
+
+	/* If no job was executing, kick ours off.  Otherwise, it'll
+	 * get started when the previous job's frame done interrupt
+	 * occurs.
+	 */
+	if (vc4_first_job(vc4) == exec) {
+		vc4_submit_next_job(dev);
+		vc4_queue_hangcheck(dev);
+	}
+
+	spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+}
+
+/**
+ * Looks up a bunch of GEM handles for BOs and stores the array for
+ * use in the command validator that actually writes relocated
+ * addresses pointing to them.
+ */
+static int
+vc4_cl_lookup_bos(struct drm_device *dev,
+		  struct drm_file *file_priv,
+		  struct vc4_exec_info *exec)
+{
+	struct drm_vc4_submit_cl *args = exec->args;
+	uint32_t *handles;
+	int ret = 0;
+	int i;
+
+	exec->bo_count = args->bo_handle_count;
+
+	if (!exec->bo_count) {
+		/* See comment on bo_index for why we have to check
+		 * this.
+		 */
+		DRM_ERROR("Rendering requires BOs to validate\n");
+		return -EINVAL;
+	}
+
+	exec->bo = kcalloc(exec->bo_count, sizeof(struct vc4_bo_exec_state),
+			   GFP_KERNEL);
+	if (!exec->bo) {
+		DRM_ERROR("Failed to allocate validated BO pointers\n");
+		return -ENOMEM;
+	}
+
+	handles = drm_malloc_ab(exec->bo_count, sizeof(uint32_t));
+	if (!handles) {
+		ret = -ENOMEM;
+		DRM_ERROR("Failed to allocate incoming GEM handles\n");
+		goto fail;
+	}
+
+	if (copy_from_user(handles,
+			   (void __user *)(uintptr_t)args->bo_handles,
+			   exec->bo_count * sizeof(uint32_t))) {
+		ret = -EFAULT;
+		DRM_ERROR("Failed to copy in GEM handles\n");
+		goto fail;
+	}
+
+	spin_lock(&file_priv->table_lock);
+	for (i = 0; i < exec->bo_count; i++) {
+		struct drm_gem_object *bo = idr_find(&file_priv->object_idr,
+						     handles[i]);
+		if (!bo) {
+			DRM_ERROR("Failed to look up GEM BO %d: %d\n",
+				  i, handles[i]);
+			ret = -EINVAL;
+			spin_unlock(&file_priv->table_lock);
+			goto fail;
+		}
+		drm_gem_object_reference(bo);
+		exec->bo[i].bo = (struct drm_gem_cma_object *)bo;
+	}
+	spin_unlock(&file_priv->table_lock);
+
+fail:
+	drm_free_large(handles);
+	return ret;
+}
+
+static int
+vc4_get_bcl(struct drm_device *dev, struct vc4_exec_info *exec)
+{
+	struct drm_vc4_submit_cl *args = exec->args;
+	void *temp = NULL;
+	void *bin;
+	int ret = 0;
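+	/* Layout of the single exec BO: the bin CL at offset 0, shader
+	 * records 16-byte aligned after it, then uniforms; the temp
+	 * kmalloc copy additionally stages the shader_state array past
+	 * the end of that.
+	 */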
+	uint32_t bin_offset = 0;
+	uint32_t shader_rec_offset = roundup(bin_offset + args->bin_cl_size,
+					     16);
+	uint32_t uniforms_offset = shader_rec_offset + args->shader_rec_size;
+	uint32_t exec_size = uniforms_offset + args->uniforms_size;
+	uint32_t temp_size = exec_size + (sizeof(struct vc4_shader_state) *
+					  args->shader_rec_count);
+	struct vc4_bo *bo;
+
+	if (uniforms_offset < shader_rec_offset ||
+	    exec_size < uniforms_offset ||
+	    args->shader_rec_count >= (UINT_MAX /
+					  sizeof(struct vc4_shader_state)) ||
+	    temp_size < exec_size) {
+		DRM_ERROR("overflow in exec arguments\n");
+		ret = -EINVAL;
+		goto fail;
+	}
+
+	/* Allocate space where we'll store the copied in user command lists
+	 * and shader records.
+	 *
+	 * We don't just copy directly into the BOs because we need to
+	 * read the contents back for validation, and I think the
+	 * bo->vaddr is uncached access.
+	 */
+	temp = kmalloc(temp_size, GFP_KERNEL);
+	if (!temp) {
+		DRM_ERROR("Failed to allocate storage for copying "
+			  "in bin/render CLs.\n");
+		ret = -ENOMEM;
+		goto fail;
+	}
+	bin = temp + bin_offset;
+	exec->shader_rec_u = temp + shader_rec_offset;
+	exec->uniforms_u = temp + uniforms_offset;
+	exec->shader_state = temp + exec_size;
+	exec->shader_state_size = args->shader_rec_count;
+
+	if (copy_from_user(bin,
+			   (void __user *)(uintptr_t)args->bin_cl,
+			   args->bin_cl_size)) {
+		ret = -EFAULT;
+		DRM_ERROR("Failed to copy in bin cl\n");
+		goto fail;
+	}
+
+	if (copy_from_user(exec->shader_rec_u,
+			   (void __user *)(uintptr_t)args->shader_rec,
+			   args->shader_rec_size)) {
+		ret = -EFAULT;
+		DRM_ERROR("Failed to copy in shader recs\n");
+		goto fail;
+	}
+
+	if (copy_from_user(exec->uniforms_u,
+			   (void __user *)(uintptr_t)args->uniforms,
+			   args->uniforms_size)) {
+		ret = -EFAULT;
+		DRM_ERROR("Failed to copy in uniforms cl\n");
+		goto fail;
+	}
+
+	bo = vc4_bo_create(dev, exec_size, true);
+	if (!bo) {
+		DRM_ERROR("Couldn't allocate BO for binning\n");
+		ret = -ENOMEM;
+		goto fail;
+	}
+	exec->exec_bo = &bo->base;
+
+	list_add_tail(&to_vc4_bo(&exec->exec_bo->base)->unref_head,
+		      &exec->unref_list);
+
+	exec->ct0ca = exec->exec_bo->paddr + bin_offset;
+
+	exec->shader_rec_v = exec->exec_bo->vaddr + shader_rec_offset;
+	exec->shader_rec_p = exec->exec_bo->paddr + shader_rec_offset;
+	exec->shader_rec_size = args->shader_rec_size;
+
+	exec->uniforms_v = exec->exec_bo->vaddr + uniforms_offset;
+	exec->uniforms_p = exec->exec_bo->paddr + uniforms_offset;
+	exec->uniforms_size = args->uniforms_size;
+
+	ret = vc4_validate_bin_cl(dev,
+				  exec->exec_bo->vaddr + bin_offset,
+				  bin,
+				  exec);
+	if (ret)
+		goto fail;
+
+	ret = vc4_validate_shader_recs(dev, exec);
+
+fail:
+	kfree(temp);
+	return ret;
+}
+
+static void
+vc4_complete_exec(struct drm_device *dev, struct vc4_exec_info *exec)
+{
+	unsigned i;
+
+	/* Need the struct lock for drm_gem_object_unreference(). */
+	mutex_lock(&dev->struct_mutex);
+	if (exec->bo) {
+		for (i = 0; i < exec->bo_count; i++)
+			drm_gem_object_unreference(&exec->bo[i].bo->base);
+		kfree(exec->bo);
+	}
+
+	while (!list_empty(&exec->unref_list)) {
+		struct vc4_bo *bo = list_first_entry(&exec->unref_list,
+						     struct vc4_bo, unref_head);
+		list_del(&bo->unref_head);
+		drm_gem_object_unreference(&bo->base.base);
+	}
+	mutex_unlock(&dev->struct_mutex);
+
+	kfree(exec);
+}
+
+void
+vc4_job_handle_completed(struct vc4_dev *vc4)
+{
+	unsigned long irqflags;
+
+	spin_lock_irqsave(&vc4->job_lock, irqflags);
+	while (!list_empty(&vc4->job_done_list)) {
+		struct vc4_exec_info *exec =
+			list_first_entry(&vc4->job_done_list,
+					 struct vc4_exec_info, head);
+		list_del(&exec->head);
+
+		spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+		vc4_complete_exec(vc4->dev, exec);
+		spin_lock_irqsave(&vc4->job_lock, irqflags);
+	}
+	spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+}
+
+/* Scheduled when any job has been completed, this walks the list of
+ * jobs that had completed and unrefs their BOs and frees their exec
+ * structs.
+ */
+static void
+vc4_job_done_work(struct work_struct *work)
+{
+	struct vc4_dev *vc4 =
+		container_of(work, struct vc4_dev, job_done_work);
+
+	vc4_job_handle_completed(vc4);
+}
+
+static int
+vc4_wait_for_seqno_ioctl_helper(struct drm_device *dev,
+				uint64_t seqno,
+				uint64_t *timeout_ns)
+{
+	unsigned long start = jiffies;
+	int ret = vc4_wait_for_seqno(dev, seqno, *timeout_ns, true);
+
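+	/* If the wait was interrupted by a signal, report back how much
+	 * of the timeout remains so the restarted ioctl doesn't wait
+	 * longer than originally requested.
+	 */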
+	if ((ret == -EINTR || ret == -ERESTARTSYS) && *timeout_ns != ~0ull) {
+		uint64_t delta = jiffies_to_nsecs(jiffies - start);
+
+		if (*timeout_ns >= delta)
+			*timeout_ns -= delta;
+	}
+
+	return ret;
+}
+
+int
+vc4_wait_seqno_ioctl(struct drm_device *dev, void *data,
+		     struct drm_file *file_priv)
+{
+	struct drm_vc4_wait_seqno *args = data;
+
+	return vc4_wait_for_seqno_ioctl_helper(dev, args->seqno,
+					       &args->timeout_ns);
+}
+
+int
+vc4_wait_bo_ioctl(struct drm_device *dev, void *data,
+		  struct drm_file *file_priv)
+{
+	int ret;
+	struct drm_vc4_wait_bo *args = data;
+	struct drm_gem_object *gem_obj;
+	struct vc4_bo *bo;
+
+	gem_obj = drm_gem_object_lookup(dev, file_priv, args->handle);
+	if (!gem_obj) {
+		DRM_ERROR("Failed to look up GEM BO %d\n", args->handle);
+		return -EINVAL;
+	}
+	bo = to_vc4_bo(gem_obj);
+
+	ret = vc4_wait_for_seqno_ioctl_helper(dev, bo->seqno,
+					      &args->timeout_ns);
+
+	drm_gem_object_unreference_unlocked(gem_obj);
+	return ret;
+}
+
+/**
+ * Submits a command list to the VC4.
+ *
+ * This is what is called batchbuffer emitting on other hardware.
+ */
+int
+vc4_submit_cl_ioctl(struct drm_device *dev, void *data,
+		    struct drm_file *file_priv)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+	struct drm_vc4_submit_cl *args = data;
+	struct vc4_exec_info *exec;
+	int ret;
+
+	if ((args->flags & ~VC4_SUBMIT_CL_USE_CLEAR_COLOR) != 0) {
+		DRM_ERROR("Unknown flags: 0x%02x\n", args->flags);
+		return -EINVAL;
+	}
+
+	exec = kzalloc(sizeof(*exec), GFP_KERNEL);
+	if (!exec) {
+		DRM_ERROR("malloc failure on exec struct\n");
+		return -ENOMEM;
+	}
+
+	exec->args = args;
+	INIT_LIST_HEAD(&exec->unref_list);
+
+	ret = vc4_cl_lookup_bos(dev, file_priv, exec);
+	if (ret)
+		goto fail;
+
+	if (exec->args->bin_cl_size != 0) {
+		ret = vc4_get_bcl(dev, exec);
+		if (ret)
+			goto fail;
+	} else {
+		exec->ct0ca = 0;
+		exec->ct0ea = 0;
+	}
+
+	ret = vc4_get_rcl(dev, exec);
+	if (ret)
+		goto fail;
+
+	/* Clear this out of the struct we'll be putting in the queue,
+	 * since it's part of our stack.
+	 */
+	exec->args = NULL;
+
+	vc4_queue_submit(dev, exec);
+
+	/* Return the seqno for our job. */
+	args->seqno = vc4->emit_seqno;
+
+	return 0;
+
+fail:
+	vc4_complete_exec(vc4->dev, exec);
+
+	return ret;
+}
+
+void
+vc4_gem_init(struct drm_device *dev)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+
+	INIT_LIST_HEAD(&vc4->job_list);
+	INIT_LIST_HEAD(&vc4->job_done_list);
+	spin_lock_init(&vc4->job_lock);
+
+	INIT_WORK(&vc4->hangcheck.reset_work, vc4_reset_work);
+	setup_timer(&vc4->hangcheck.timer,
+		    vc4_hangcheck_elapsed,
+		    (unsigned long)dev);
+
+	INIT_WORK(&vc4->job_done_work, vc4_job_done_work);
+}
+
+void
+vc4_gem_destroy(struct drm_device *dev)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+
+	/* Waiting for exec to finish would need to be done before
+	 * unregistering V3D.
+	 */
+	WARN_ON(vc4->emit_seqno != vc4->finished_seqno);
+
+	/* V3D should already have disabled its interrupt and cleared
+	 * the overflow allocation registers.  Now free the object.
+	 */
+	if (vc4->overflow_mem) {
+		drm_gem_object_unreference_unlocked(&vc4->overflow_mem->base.base);
+		vc4->overflow_mem = NULL;
+	}
+
+	vc4_bo_cache_destroy(dev);
+}
diff --git a/drivers/gpu/drm/vc4/vc4_irq.c b/drivers/gpu/drm/vc4/vc4_irq.c
new file mode 100644
index 0000000..b68060e
--- /dev/null
+++ b/drivers/gpu/drm/vc4/vc4_irq.c
@@ -0,0 +1,210 @@
+/*
+ * Copyright © 2014 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/** DOC: Interrupt management for the V3D engine.
+ *
+ * We have an interrupt status register (V3D_INTCTL) which reports
+ * interrupts, and where writing 1 bits clears those interrupts.
+ * There are also a pair of interrupt registers
+ * (V3D_INTENA/V3D_INTDIS) where writing a 1 to their bits enables or
+ * disables that specific interrupt, and 0s written are ignored
+ * (reading either one returns the set of enabled interrupts).
+ *
+ * When we take a render frame interrupt, we need to wake the
+ * processes waiting for some frame to be done, and get the next frame
+ * submitted ASAP (so the hardware doesn't sit idle when there's work
+ * to do).
+ *
+ * When we take the binner out of memory interrupt, we need to
+ * allocate some new memory and pass it to the binner so that the
+ * current job can make progress.
+ */
+
+#include "vc4_drv.h"
+#include "vc4_regs.h"
+
+#define V3D_DRIVER_IRQS (V3D_INT_OUTOMEM | \
+			 V3D_INT_FRDONE)
+
+DECLARE_WAIT_QUEUE_HEAD(render_wait);
+
+static void
+vc4_overflow_mem_work(struct work_struct *work)
+{
+	struct vc4_dev *vc4 =
+		container_of(work, struct vc4_dev, overflow_mem_work);
+	struct drm_device *dev = vc4->dev;
+	struct vc4_bo *bo;
+
+	bo = vc4_bo_create(dev, 256 * 1024, true);
+	if (!bo) {
+		DRM_ERROR("Couldn't allocate binner overflow mem\n");
+		return;
+	}
+
+	/* If there's a job executing currently, then our previous
+	 * overflow allocation is getting used in that job and we need
+	 * to queue it to be released when the job is done.  But if no
+	 * job is executing at all, then we can free the old overflow
+	 * object directly.
+	 *
+	 * No lock necessary for this pointer since we're the only
+	 * ones that update the pointer, and our workqueue won't
+	 * reenter.
+	 */
+	if (vc4->overflow_mem) {
+		struct vc4_exec_info *current_exec;
+		unsigned long irqflags;
+
+		spin_lock_irqsave(&vc4->job_lock, irqflags);
+		current_exec = vc4_first_job(vc4);
+		if (current_exec) {
+			vc4->overflow_mem->seqno = vc4->finished_seqno + 1;
+			list_add_tail(&vc4->overflow_mem->unref_head,
+				      &current_exec->unref_list);
+			vc4->overflow_mem = NULL;
+		}
+		spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+	}
+
+	if (vc4->overflow_mem)
+		drm_gem_object_unreference_unlocked(&vc4->overflow_mem->base.base);
+	vc4->overflow_mem = bo;
+
+	V3D_WRITE(V3D_BPOA, bo->base.paddr);
+	V3D_WRITE(V3D_BPOS, bo->base.base.size);
+	V3D_WRITE(V3D_INTCTL, V3D_INT_OUTOMEM);
+	V3D_WRITE(V3D_INTENA, V3D_INT_OUTOMEM);
+}
+
+static void
+vc4_irq_finish_job(struct drm_device *dev)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+	struct vc4_exec_info *exec = vc4_first_job(vc4);
+
+	if (!exec)
+		return;
+
+	vc4->finished_seqno++;
+	list_move_tail(&exec->head, &vc4->job_done_list);
+	vc4_submit_next_job(dev);
+
+	wake_up_all(&vc4->job_wait_queue);
+	schedule_work(&vc4->job_done_work);
+}
+
+irqreturn_t
+vc4_irq(int irq, void *arg)
+{
+	struct drm_device *dev = arg;
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+	uint32_t intctl;
+	irqreturn_t status = IRQ_NONE;
+
+	barrier();
+	intctl = V3D_READ(V3D_INTCTL);
+
+	/* Acknowledge the interrupts we're handling here. The render
+	 * frame done interrupt will be cleared, while OUTOMEM will
+	 * stay high until the underlying cause is cleared.
+	 */
+	V3D_WRITE(V3D_INTCTL, intctl);
+
+	if (intctl & V3D_INT_OUTOMEM) {
+		/* Disable OUTOMEM until the work is done. */
+		V3D_WRITE(V3D_INTDIS, V3D_INT_OUTOMEM);
+		schedule_work(&vc4->overflow_mem_work);
+		status = IRQ_HANDLED;
+	}
+
+	if (intctl & V3D_INT_FRDONE) {
+		spin_lock(&vc4->job_lock);
+		vc4_irq_finish_job(dev);
+		spin_unlock(&vc4->job_lock);
+		status = IRQ_HANDLED;
+	}
+
+	return status;
+}
+
+void
+vc4_irq_preinstall(struct drm_device *dev)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+
+	init_waitqueue_head(&vc4->job_wait_queue);
+	INIT_WORK(&vc4->overflow_mem_work, vc4_overflow_mem_work);
+
+	/* Clear any pending interrupts someone might have left around
+	 * for us.
+	 */
+	V3D_WRITE(V3D_INTCTL, V3D_DRIVER_IRQS);
+}
+
+int
+vc4_irq_postinstall(struct drm_device *dev)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+
+	/* Enable both the render done and out of memory interrupts. */
+	V3D_WRITE(V3D_INTENA, V3D_DRIVER_IRQS);
+
+	return 0;
+}
+
+void
+vc4_irq_uninstall(struct drm_device *dev)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+
+	/* Disable sending interrupts for our driver's IRQs. */
+	V3D_WRITE(V3D_INTDIS, V3D_DRIVER_IRQS);
+
+	/* Clear any pending interrupts we might have left. */
+	V3D_WRITE(V3D_INTCTL, V3D_DRIVER_IRQS);
+
+	cancel_work_sync(&vc4->overflow_mem_work);
+}
+
+/** Reinitializes interrupt registers when a GPU reset is performed. */
+void vc4_irq_reset(struct drm_device *dev)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+	unsigned long irqflags;
+
+	/* Acknowledge any stale IRQs. */
+	V3D_WRITE(V3D_INTCTL, V3D_DRIVER_IRQS);
+
+	/*
+	 * Turn all our interrupts on.  Binner out of memory is the
+	 * only one we expect to trigger at this point, since we've
+	 * just come from poweron and haven't supplied any overflow
+	 * memory yet.
+	 */
+	V3D_WRITE(V3D_INTENA, V3D_DRIVER_IRQS);
+
+	spin_lock_irqsave(&vc4->job_lock, irqflags);
+	vc4_irq_finish_job(dev);
+	spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+}
diff --git a/drivers/gpu/drm/vc4/vc4_packet.h b/drivers/gpu/drm/vc4/vc4_packet.h
new file mode 100644
index 0000000..0f31cc0
--- /dev/null
+++ b/drivers/gpu/drm/vc4/vc4_packet.h
@@ -0,0 +1,399 @@
+/*
+ * Copyright © 2014 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef VC4_PACKET_H
+#define VC4_PACKET_H
+
+#include "vc4_regs.h" /* for VC4_MASK, VC4_GET_FIELD, VC4_SET_FIELD */
+
+enum vc4_packet {
+	VC4_PACKET_HALT = 0,
+	VC4_PACKET_NOP = 1,
+
+	VC4_PACKET_FLUSH = 4,
+	VC4_PACKET_FLUSH_ALL = 5,
+	VC4_PACKET_START_TILE_BINNING = 6,
+	VC4_PACKET_INCREMENT_SEMAPHORE = 7,
+	VC4_PACKET_WAIT_ON_SEMAPHORE = 8,
+
+	VC4_PACKET_BRANCH = 16,
+	VC4_PACKET_BRANCH_TO_SUB_LIST = 17,
+
+	VC4_PACKET_STORE_MS_TILE_BUFFER = 24,
+	VC4_PACKET_STORE_MS_TILE_BUFFER_AND_EOF = 25,
+	VC4_PACKET_STORE_FULL_RES_TILE_BUFFER = 26,
+	VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER = 27,
+	VC4_PACKET_STORE_TILE_BUFFER_GENERAL = 28,
+	VC4_PACKET_LOAD_TILE_BUFFER_GENERAL = 29,
+
+	VC4_PACKET_GL_INDEXED_PRIMITIVE = 32,
+	VC4_PACKET_GL_ARRAY_PRIMITIVE = 33,
+
+	VC4_PACKET_COMPRESSED_PRIMITIVE = 48,
+	VC4_PACKET_CLIPPED_COMPRESSED_PRIMITIVE = 49,
+
+	VC4_PACKET_PRIMITIVE_LIST_FORMAT = 56,
+
+	VC4_PACKET_GL_SHADER_STATE = 64,
+	VC4_PACKET_NV_SHADER_STATE = 65,
+	VC4_PACKET_VG_SHADER_STATE = 66,
+
+	VC4_PACKET_CONFIGURATION_BITS = 96,
+	VC4_PACKET_FLAT_SHADE_FLAGS = 97,
+	VC4_PACKET_POINT_SIZE = 98,
+	VC4_PACKET_LINE_WIDTH = 99,
+	VC4_PACKET_RHT_X_BOUNDARY = 100,
+	VC4_PACKET_DEPTH_OFFSET = 101,
+	VC4_PACKET_CLIP_WINDOW = 102,
+	VC4_PACKET_VIEWPORT_OFFSET = 103,
+	VC4_PACKET_Z_CLIPPING = 104,
+	VC4_PACKET_CLIPPER_XY_SCALING = 105,
+	VC4_PACKET_CLIPPER_Z_SCALING = 106,
+
+	VC4_PACKET_TILE_BINNING_MODE_CONFIG = 112,
+	VC4_PACKET_TILE_RENDERING_MODE_CONFIG = 113,
+	VC4_PACKET_CLEAR_COLORS = 114,
+	VC4_PACKET_TILE_COORDINATES = 115,
+
+	/* Not an actual hardware packet -- this is what we use to put
+	 * references to GEM bos in the command stream, since we need the u32
+ * in the actual address packet in order to store the offset from the
+	 * start of the BO.
+	 */
+	VC4_PACKET_GEM_HANDLES = 254,
+} __attribute__ ((__packed__));
+
+#define VC4_PACKET_HALT_SIZE						1
+#define VC4_PACKET_NOP_SIZE						1
+#define VC4_PACKET_FLUSH_SIZE						1
+#define VC4_PACKET_FLUSH_ALL_SIZE					1
+#define VC4_PACKET_START_TILE_BINNING_SIZE				1
+#define VC4_PACKET_INCREMENT_SEMAPHORE_SIZE				1
+#define VC4_PACKET_WAIT_ON_SEMAPHORE_SIZE				1
+#define VC4_PACKET_BRANCH_SIZE						5
+#define VC4_PACKET_BRANCH_TO_SUB_LIST_SIZE				5
+#define VC4_PACKET_STORE_MS_TILE_BUFFER_SIZE				1
+#define VC4_PACKET_STORE_MS_TILE_BUFFER_AND_EOF_SIZE			1
+#define VC4_PACKET_STORE_FULL_RES_TILE_BUFFER_SIZE			5
+#define VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER_SIZE			5
+#define VC4_PACKET_STORE_TILE_BUFFER_GENERAL_SIZE			7
+#define VC4_PACKET_LOAD_TILE_BUFFER_GENERAL_SIZE			7
+#define VC4_PACKET_GL_INDEXED_PRIMITIVE_SIZE				14
+#define VC4_PACKET_GL_ARRAY_PRIMITIVE_SIZE				10
+#define VC4_PACKET_COMPRESSED_PRIMITIVE_SIZE				1
+#define VC4_PACKET_CLIPPED_COMPRESSED_PRIMITIVE_SIZE			1
+#define VC4_PACKET_PRIMITIVE_LIST_FORMAT_SIZE				2
+#define VC4_PACKET_GL_SHADER_STATE_SIZE					5
+#define VC4_PACKET_NV_SHADER_STATE_SIZE					5
+#define VC4_PACKET_VG_SHADER_STATE_SIZE					5
+#define VC4_PACKET_CONFIGURATION_BITS_SIZE				4
+#define VC4_PACKET_FLAT_SHADE_FLAGS_SIZE				5
+#define VC4_PACKET_POINT_SIZE_SIZE					5
+#define VC4_PACKET_LINE_WIDTH_SIZE					5
+#define VC4_PACKET_RHT_X_BOUNDARY_SIZE					3
+#define VC4_PACKET_DEPTH_OFFSET_SIZE					5
+#define VC4_PACKET_CLIP_WINDOW_SIZE					9
+#define VC4_PACKET_VIEWPORT_OFFSET_SIZE					5
+#define VC4_PACKET_Z_CLIPPING_SIZE					9
+#define VC4_PACKET_CLIPPER_XY_SCALING_SIZE				9
+#define VC4_PACKET_CLIPPER_Z_SCALING_SIZE				9
+#define VC4_PACKET_TILE_BINNING_MODE_CONFIG_SIZE			16
+#define VC4_PACKET_TILE_RENDERING_MODE_CONFIG_SIZE			11
+#define VC4_PACKET_CLEAR_COLORS_SIZE					14
+#define VC4_PACKET_TILE_COORDINATES_SIZE				3
+#define VC4_PACKET_GEM_HANDLES_SIZE					9
+
+/* Number of multisamples supported. */
+#define VC4_MAX_SAMPLES							4
+/* Size of a full resolution color or Z tile buffer load/store. */
+#define VC4_TILE_BUFFER_SIZE			(64 * 64 * 4)
+
+/** @{
+ * Bits used by packets like VC4_PACKET_STORE_TILE_BUFFER_GENERAL and
+ * VC4_PACKET_TILE_RENDERING_MODE_CONFIG.
+ */
+#define VC4_TILING_FORMAT_LINEAR    0
+#define VC4_TILING_FORMAT_T         1
+#define VC4_TILING_FORMAT_LT        2
+/** @} */
+
+/** @{
+ *
+ * low bits of VC4_PACKET_STORE_FULL_RES_TILE_BUFFER and
+ * VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER.
+ */
+#define VC4_LOADSTORE_FULL_RES_EOF                     BIT(3)
+#define VC4_LOADSTORE_FULL_RES_DISABLE_CLEAR_ALL       BIT(2)
+#define VC4_LOADSTORE_FULL_RES_DISABLE_ZS              BIT(1)
+#define VC4_LOADSTORE_FULL_RES_DISABLE_COLOR           BIT(0)
+
+/** @{
+ *
+ * byte 2 of VC4_PACKET_STORE_TILE_BUFFER_GENERAL and
+ * VC4_PACKET_LOAD_TILE_BUFFER_GENERAL (low bits of the address)
+ */
+
+#define VC4_LOADSTORE_TILE_BUFFER_EOF                  BIT(3)
+#define VC4_LOADSTORE_TILE_BUFFER_DISABLE_FULL_VG_MASK BIT(2)
+#define VC4_LOADSTORE_TILE_BUFFER_DISABLE_FULL_ZS      BIT(1)
+#define VC4_LOADSTORE_TILE_BUFFER_DISABLE_FULL_COLOR   BIT(0)
+
+/** @} */
+
+/** @{
+ *
+ * byte 0-1 of VC4_PACKET_STORE_TILE_BUFFER_GENERAL and
+ * VC4_PACKET_LOAD_TILE_BUFFER_GENERAL
+ */
+#define VC4_STORE_TILE_BUFFER_DISABLE_VG_MASK_CLEAR BIT(15)
+#define VC4_STORE_TILE_BUFFER_DISABLE_ZS_CLEAR     BIT(14)
+#define VC4_STORE_TILE_BUFFER_DISABLE_COLOR_CLEAR  BIT(13)
+#define VC4_STORE_TILE_BUFFER_DISABLE_SWAP         BIT(12)
+
+#define VC4_LOADSTORE_TILE_BUFFER_FORMAT_MASK      VC4_MASK(9, 8)
+#define VC4_LOADSTORE_TILE_BUFFER_FORMAT_SHIFT     8
+#define VC4_LOADSTORE_TILE_BUFFER_RGBA8888         0
+#define VC4_LOADSTORE_TILE_BUFFER_BGR565_DITHER    1
+#define VC4_LOADSTORE_TILE_BUFFER_BGR565           2
+/** @} */
+
+/** @{
+ *
+ * byte 0 of VC4_PACKET_STORE_TILE_BUFFER_GENERAL and
+ * VC4_PACKET_LOAD_TILE_BUFFER_GENERAL
+ */
+#define VC4_STORE_TILE_BUFFER_MODE_MASK            VC4_MASK(7, 6)
+#define VC4_STORE_TILE_BUFFER_MODE_SHIFT           6
+#define VC4_STORE_TILE_BUFFER_MODE_SAMPLE0         (0 << 6)
+#define VC4_STORE_TILE_BUFFER_MODE_DECIMATE_X4     (1 << 6)
+#define VC4_STORE_TILE_BUFFER_MODE_DECIMATE_X16    (2 << 6)
+
+/** The values of the field are VC4_TILING_FORMAT_* */
+#define VC4_LOADSTORE_TILE_BUFFER_TILING_MASK      VC4_MASK(5, 4)
+#define VC4_LOADSTORE_TILE_BUFFER_TILING_SHIFT     4
+
+#define VC4_LOADSTORE_TILE_BUFFER_BUFFER_MASK      VC4_MASK(2, 0)
+#define VC4_LOADSTORE_TILE_BUFFER_BUFFER_SHIFT     0
+#define VC4_LOADSTORE_TILE_BUFFER_NONE             0
+#define VC4_LOADSTORE_TILE_BUFFER_COLOR            1
+#define VC4_LOADSTORE_TILE_BUFFER_ZS               2
+#define VC4_LOADSTORE_TILE_BUFFER_Z                3
+#define VC4_LOADSTORE_TILE_BUFFER_VG_MASK          4
+#define VC4_LOADSTORE_TILE_BUFFER_FULL             5
+/** @} */
+
+#define VC4_INDEX_BUFFER_U8                        (0 << 4)
+#define VC4_INDEX_BUFFER_U16                       (1 << 4)
+
+/* This flag is only present in NV shader state. */
+#define VC4_SHADER_FLAG_SHADED_CLIP_COORDS         BIT(3)
+#define VC4_SHADER_FLAG_ENABLE_CLIPPING            BIT(2)
+#define VC4_SHADER_FLAG_VS_POINT_SIZE              BIT(1)
+#define VC4_SHADER_FLAG_FS_SINGLE_THREAD           BIT(0)
+
+/** @{ byte 2 of config bits. */
+#define VC4_CONFIG_BITS_EARLY_Z_UPDATE             BIT(1)
+#define VC4_CONFIG_BITS_EARLY_Z                    BIT(0)
+/** @} */
+
+/** @{ byte 1 of config bits. */
+#define VC4_CONFIG_BITS_Z_UPDATE                   BIT(7)
+/** same values in this 3-bit field as PIPE_FUNC_* */
+#define VC4_CONFIG_BITS_DEPTH_FUNC_SHIFT           4
+#define VC4_CONFIG_BITS_COVERAGE_READ_LEAVE        BIT(3)
+
+#define VC4_CONFIG_BITS_COVERAGE_UPDATE_NONZERO    (0 << 1)
+#define VC4_CONFIG_BITS_COVERAGE_UPDATE_ODD        (1 << 1)
+#define VC4_CONFIG_BITS_COVERAGE_UPDATE_OR         (2 << 1)
+#define VC4_CONFIG_BITS_COVERAGE_UPDATE_ZERO       (3 << 1)
+
+#define VC4_CONFIG_BITS_COVERAGE_PIPE_SELECT       BIT(0)
+/** @} */
+
+/** @{ byte 0 of config bits. */
+#define VC4_CONFIG_BITS_RASTERIZER_OVERSAMPLE_NONE (0 << 6)
+#define VC4_CONFIG_BITS_RASTERIZER_OVERSAMPLE_4X   (1 << 6)
+#define VC4_CONFIG_BITS_RASTERIZER_OVERSAMPLE_16X  (2 << 6)
+
+#define VC4_CONFIG_BITS_AA_POINTS_AND_LINES        BIT(4)
+#define VC4_CONFIG_BITS_ENABLE_DEPTH_OFFSET        BIT(3)
+#define VC4_CONFIG_BITS_CW_PRIMITIVES              BIT(2)
+#define VC4_CONFIG_BITS_ENABLE_PRIM_BACK           BIT(1)
+#define VC4_CONFIG_BITS_ENABLE_PRIM_FRONT          BIT(0)
+/** @} */
+
+/** @{ bits in the last u8 of VC4_PACKET_TILE_BINNING_MODE_CONFIG */
+#define VC4_BIN_CONFIG_DB_NON_MS                   BIT(7)
+
+#define VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_MASK       VC4_MASK(6, 5)
+#define VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_SHIFT      5
+#define VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_32         0
+#define VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_64         1
+#define VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_128        2
+#define VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_256        3
+
+#define VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_MASK  VC4_MASK(4, 3)
+#define VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_SHIFT 3
+#define VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_32    0
+#define VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_64    1
+#define VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_128   2
+#define VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_256   3
+
+#define VC4_BIN_CONFIG_AUTO_INIT_TSDA              BIT(2)
+#define VC4_BIN_CONFIG_TILE_BUFFER_64BIT           BIT(1)
+#define VC4_BIN_CONFIG_MS_MODE_4X                  BIT(0)
+/** @} */
+
+/** @{ bits in the last u16 of VC4_PACKET_TILE_RENDERING_MODE_CONFIG */
+#define VC4_RENDER_CONFIG_DB_NON_MS                BIT(12)
+#define VC4_RENDER_CONFIG_EARLY_Z_COVERAGE_DISABLE BIT(11)
+#define VC4_RENDER_CONFIG_EARLY_Z_DIRECTION_G      BIT(10)
+#define VC4_RENDER_CONFIG_COVERAGE_MODE            BIT(9)
+#define VC4_RENDER_CONFIG_ENABLE_VG_MASK           BIT(8)
+
+/** The values of the field are VC4_TILING_FORMAT_* */
+#define VC4_RENDER_CONFIG_MEMORY_FORMAT_MASK       VC4_MASK(7, 6)
+#define VC4_RENDER_CONFIG_MEMORY_FORMAT_SHIFT      6
+
+#define VC4_RENDER_CONFIG_DECIMATE_MODE_1X         (0 << 4)
+#define VC4_RENDER_CONFIG_DECIMATE_MODE_4X         (1 << 4)
+#define VC4_RENDER_CONFIG_DECIMATE_MODE_16X        (2 << 4)
+
+#define VC4_RENDER_CONFIG_FORMAT_MASK              VC4_MASK(3, 2)
+#define VC4_RENDER_CONFIG_FORMAT_SHIFT             2
+#define VC4_RENDER_CONFIG_FORMAT_BGR565_DITHERED   0
+#define VC4_RENDER_CONFIG_FORMAT_RGBA8888          1
+#define VC4_RENDER_CONFIG_FORMAT_BGR565            2
+
+#define VC4_RENDER_CONFIG_TILE_BUFFER_64BIT        BIT(1)
+#define VC4_RENDER_CONFIG_MS_MODE_4X               BIT(0)
+
+#define VC4_PRIMITIVE_LIST_FORMAT_16_INDEX         (1 << 4)
+#define VC4_PRIMITIVE_LIST_FORMAT_32_XY            (3 << 4)
+#define VC4_PRIMITIVE_LIST_FORMAT_TYPE_POINTS      (0 << 0)
+#define VC4_PRIMITIVE_LIST_FORMAT_TYPE_LINES       (1 << 0)
+#define VC4_PRIMITIVE_LIST_FORMAT_TYPE_TRIANGLES   (2 << 0)
+#define VC4_PRIMITIVE_LIST_FORMAT_TYPE_RHT         (3 << 0)
+
+enum vc4_texture_data_type {
+	VC4_TEXTURE_TYPE_RGBA8888 = 0,
+	VC4_TEXTURE_TYPE_RGBX8888 = 1,
+	VC4_TEXTURE_TYPE_RGBA4444 = 2,
+	VC4_TEXTURE_TYPE_RGBA5551 = 3,
+	VC4_TEXTURE_TYPE_RGB565 = 4,
+	VC4_TEXTURE_TYPE_LUMINANCE = 5,
+	VC4_TEXTURE_TYPE_ALPHA = 6,
+	VC4_TEXTURE_TYPE_LUMALPHA = 7,
+	VC4_TEXTURE_TYPE_ETC1 = 8,
+	VC4_TEXTURE_TYPE_S16F = 9,
+	VC4_TEXTURE_TYPE_S8 = 10,
+	VC4_TEXTURE_TYPE_S16 = 11,
+	VC4_TEXTURE_TYPE_BW1 = 12,
+	VC4_TEXTURE_TYPE_A4 = 13,
+	VC4_TEXTURE_TYPE_A1 = 14,
+	VC4_TEXTURE_TYPE_RGBA64 = 15,
+	VC4_TEXTURE_TYPE_RGBA32R = 16,
+	VC4_TEXTURE_TYPE_YUV422R = 17,
+};
+
+#define VC4_TEX_P0_OFFSET_MASK                     VC4_MASK(31, 12)
+#define VC4_TEX_P0_OFFSET_SHIFT                    12
+#define VC4_TEX_P0_CSWIZ_MASK                      VC4_MASK(11, 10)
+#define VC4_TEX_P0_CSWIZ_SHIFT                     10
+#define VC4_TEX_P0_CMMODE_MASK                     VC4_MASK(9, 9)
+#define VC4_TEX_P0_CMMODE_SHIFT                    9
+#define VC4_TEX_P0_FLIPY_MASK                      VC4_MASK(8, 8)
+#define VC4_TEX_P0_FLIPY_SHIFT                     8
+#define VC4_TEX_P0_TYPE_MASK                       VC4_MASK(7, 4)
+#define VC4_TEX_P0_TYPE_SHIFT                      4
+#define VC4_TEX_P0_MIPLVLS_MASK                    VC4_MASK(3, 0)
+#define VC4_TEX_P0_MIPLVLS_SHIFT                   0
+
+#define VC4_TEX_P1_TYPE4_MASK                      VC4_MASK(31, 31)
+#define VC4_TEX_P1_TYPE4_SHIFT                     31
+#define VC4_TEX_P1_HEIGHT_MASK                     VC4_MASK(30, 20)
+#define VC4_TEX_P1_HEIGHT_SHIFT                    20
+#define VC4_TEX_P1_ETCFLIP_MASK                    VC4_MASK(19, 19)
+#define VC4_TEX_P1_ETCFLIP_SHIFT                   19
+#define VC4_TEX_P1_WIDTH_MASK                      VC4_MASK(18, 8)
+#define VC4_TEX_P1_WIDTH_SHIFT                     8
+
+#define VC4_TEX_P1_MAGFILT_MASK                    VC4_MASK(7, 7)
+#define VC4_TEX_P1_MAGFILT_SHIFT                   7
+# define VC4_TEX_P1_MAGFILT_LINEAR                 0
+# define VC4_TEX_P1_MAGFILT_NEAREST                1
+
+#define VC4_TEX_P1_MINFILT_MASK                    VC4_MASK(6, 4)
+#define VC4_TEX_P1_MINFILT_SHIFT                   4
+# define VC4_TEX_P1_MINFILT_LINEAR                 0
+# define VC4_TEX_P1_MINFILT_NEAREST                1
+# define VC4_TEX_P1_MINFILT_NEAR_MIP_NEAR          2
+# define VC4_TEX_P1_MINFILT_NEAR_MIP_LIN           3
+# define VC4_TEX_P1_MINFILT_LIN_MIP_NEAR           4
+# define VC4_TEX_P1_MINFILT_LIN_MIP_LIN            5
+
+#define VC4_TEX_P1_WRAP_T_MASK                     VC4_MASK(3, 2)
+#define VC4_TEX_P1_WRAP_T_SHIFT                    2
+#define VC4_TEX_P1_WRAP_S_MASK                     VC4_MASK(1, 0)
+#define VC4_TEX_P1_WRAP_S_SHIFT                    0
+# define VC4_TEX_P1_WRAP_REPEAT                    0
+# define VC4_TEX_P1_WRAP_CLAMP                     1
+# define VC4_TEX_P1_WRAP_MIRROR                    2
+# define VC4_TEX_P1_WRAP_BORDER                    3
+
+#define VC4_TEX_P2_PTYPE_MASK                      VC4_MASK(31, 30)
+#define VC4_TEX_P2_PTYPE_SHIFT                     30
+# define VC4_TEX_P2_PTYPE_IGNORED                  0
+# define VC4_TEX_P2_PTYPE_CUBE_MAP_STRIDE          1
+# define VC4_TEX_P2_PTYPE_CHILD_IMAGE_DIMENSIONS   2
+# define VC4_TEX_P2_PTYPE_CHILD_IMAGE_OFFSETS      3
+
+/* VC4_TEX_P2_PTYPE_CUBE_MAP_STRIDE bits */
+#define VC4_TEX_P2_CMST_MASK                       VC4_MASK(29, 12)
+#define VC4_TEX_P2_CMST_SHIFT                      12
+#define VC4_TEX_P2_BSLOD_MASK                      VC4_MASK(0, 0)
+#define VC4_TEX_P2_BSLOD_SHIFT                     0
+
+/* VC4_TEX_P2_PTYPE_CHILD_IMAGE_DIMENSIONS */
+#define VC4_TEX_P2_CHEIGHT_MASK                    VC4_MASK(22, 12)
+#define VC4_TEX_P2_CHEIGHT_SHIFT                   12
+#define VC4_TEX_P2_CWIDTH_MASK                     VC4_MASK(10, 0)
+#define VC4_TEX_P2_CWIDTH_SHIFT                    0
+
+/* VC4_TEX_P2_PTYPE_CHILD_IMAGE_OFFSETS */
+#define VC4_TEX_P2_CYOFF_MASK                      VC4_MASK(22, 12)
+#define VC4_TEX_P2_CYOFF_SHIFT                     12
+#define VC4_TEX_P2_CXOFF_MASK                      VC4_MASK(10, 0)
+#define VC4_TEX_P2_CXOFF_SHIFT                     0
+
+#endif /* VC4_PACKET_H */
diff --git a/drivers/gpu/drm/vc4/vc4_render_cl.c b/drivers/gpu/drm/vc4/vc4_render_cl.c
new file mode 100644
index 0000000..1dc3996
--- /dev/null
+++ b/drivers/gpu/drm/vc4/vc4_render_cl.c
@@ -0,0 +1,631 @@
+/*
+ * Copyright © 2014-2015 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * DOC: Render command list generation
+ *
+ * In the VC4 driver, render command list generation is performed by the
+ * kernel instead of userspace.  We do this because validating a
+ * user-submitted command list is hard to get right and has high CPU overhead,
+ * while the number of valid configurations for render command lists is
+ * actually fairly low.
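+ *
+ * As a rough sketch, the per-tile sequence vc4_create_rcl_bo() emits is:
+ * any requested loads, VC4_PACKET_TILE_COORDINATES, a
+ * VC4_PACKET_BRANCH_TO_SUB_LIST into that tile's binned list, then the
+ * requested stores, with the last store of the last tile using an EOF
+ * variant to end the frame.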
+ */
+
+#include "uapi/drm/vc4_drm.h"
+#include "vc4_drv.h"
+#include "vc4_packet.h"
+
+struct vc4_rcl_setup {
+	struct drm_gem_cma_object *color_read;
+	struct drm_gem_cma_object *color_write;
+	struct drm_gem_cma_object *zs_read;
+	struct drm_gem_cma_object *zs_write;
+	struct drm_gem_cma_object *msaa_color_write;
+	struct drm_gem_cma_object *msaa_zs_write;
+
+	struct drm_gem_cma_object *rcl;
+	u32 next_offset;
+};
+
+static inline void rcl_u8(struct vc4_rcl_setup *setup, u8 val)
+{
+	*(u8 *)(setup->rcl->vaddr + setup->next_offset) = val;
+	setup->next_offset += 1;
+}
+
+static inline void rcl_u16(struct vc4_rcl_setup *setup, u16 val)
+{
+	*(u16 *)(setup->rcl->vaddr + setup->next_offset) = val;
+	setup->next_offset += 2;
+}
+
+static inline void rcl_u32(struct vc4_rcl_setup *setup, u32 val)
+{
+	*(u32 *)(setup->rcl->vaddr + setup->next_offset) = val;
+	setup->next_offset += 4;
+}
+
+/*
+ * Emits a no-op STORE_TILE_BUFFER_GENERAL.
+ *
+ * If we emit a PACKET_TILE_COORDINATES, it must be followed by a store of
+ * some sort before another load is triggered.
+ */
+static void vc4_store_before_load(struct vc4_rcl_setup *setup)
+{
+	rcl_u8(setup, VC4_PACKET_STORE_TILE_BUFFER_GENERAL);
+	rcl_u16(setup,
+		VC4_SET_FIELD(VC4_LOADSTORE_TILE_BUFFER_NONE,
+			      VC4_LOADSTORE_TILE_BUFFER_BUFFER) |
+		VC4_STORE_TILE_BUFFER_DISABLE_COLOR_CLEAR |
+		VC4_STORE_TILE_BUFFER_DISABLE_ZS_CLEAR |
+		VC4_STORE_TILE_BUFFER_DISABLE_VG_MASK_CLEAR);
+	rcl_u32(setup, 0); /* no address, since we're in None mode */
+}
+
+/*
+ * Calculates the physical address of the start of a tile in a RCL surface.
+ *
+ * Unlike the other load/store packets,
+ * VC4_PACKET_LOAD/STORE_FULL_RES_TILE_BUFFER don't look at the tile
+ * coordinates packet, and instead just store to the address given.
+ */
+static uint32_t vc4_full_res_offset(struct vc4_exec_info *exec,
+				    struct drm_gem_cma_object *bo,
+				    struct drm_vc4_submit_rcl_surface *surf,
+				    uint8_t x, uint8_t y)
+{
+	return bo->paddr + surf->offset + VC4_TILE_BUFFER_SIZE *
+		(DIV_ROUND_UP(exec->args->width, 32) * y + x);
+}
+
+/*
+ * Emits a PACKET_TILE_COORDINATES if one isn't already pending.
+ *
+ * The tile coordinates packet triggers a pending load if there is one, is
+ * used for clipping during rendering, and determines where loads/stores
+ * happen relative to their base address.
+ */
+static void vc4_tile_coordinates(struct vc4_rcl_setup *setup,
+				 uint32_t x, uint32_t y)
+{
+	rcl_u8(setup, VC4_PACKET_TILE_COORDINATES);
+	rcl_u8(setup, x);
+	rcl_u8(setup, y);
+}
+
+static void emit_tile(struct vc4_exec_info *exec,
+		      struct vc4_rcl_setup *setup,
+		      uint8_t x, uint8_t y, bool first, bool last)
+{
+	struct drm_vc4_submit_cl *args = exec->args;
+	bool has_bin = args->bin_cl_size != 0;
+
+	/* Note that the load doesn't actually occur until the
+	 * tile coords packet is processed, and only one load
+	 * may be outstanding at a time.
+	 */
+	if (setup->color_read) {
+		if (args->color_read.flags &
+		    VC4_SUBMIT_RCL_SURFACE_READ_IS_FULL_RES) {
+			rcl_u8(setup, VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER);
+			rcl_u32(setup,
+				vc4_full_res_offset(exec, setup->color_read,
+						    &args->color_read, x, y) |
+				VC4_LOADSTORE_FULL_RES_DISABLE_ZS);
+		} else {
+			rcl_u8(setup, VC4_PACKET_LOAD_TILE_BUFFER_GENERAL);
+			rcl_u16(setup, args->color_read.bits);
+			rcl_u32(setup, setup->color_read->paddr +
+				args->color_read.offset);
+		}
+	}
+
+	if (setup->zs_read) {
+		if (args->zs_read.flags &
+		    VC4_SUBMIT_RCL_SURFACE_READ_IS_FULL_RES) {
+			rcl_u8(setup, VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER);
+			rcl_u32(setup,
+				vc4_full_res_offset(exec, setup->zs_read,
+						    &args->zs_read, x, y) |
+				VC4_LOADSTORE_FULL_RES_DISABLE_COLOR);
+		} else {
+			if (setup->color_read) {
+				/* Exec previous load. */
+				vc4_tile_coordinates(setup, x, y);
+				vc4_store_before_load(setup);
+			}
+
+			rcl_u8(setup, VC4_PACKET_LOAD_TILE_BUFFER_GENERAL);
+			rcl_u16(setup, args->zs_read.bits);
+			rcl_u32(setup, setup->zs_read->paddr +
+				args->zs_read.offset);
+		}
+	}
+
+	/* Clipping depends on tile coordinates having been
+	 * emitted, so we always need one here.
+	 */
+	vc4_tile_coordinates(setup, x, y);
+
+	/* Wait for the binner before jumping to the first
+	 * tile's lists.
+	 */
+	if (first && has_bin)
+		rcl_u8(setup, VC4_PACKET_WAIT_ON_SEMAPHORE);
+
+	if (has_bin) {
+		rcl_u8(setup, VC4_PACKET_BRANCH_TO_SUB_LIST);
+		rcl_u32(setup, (exec->tile_bo->paddr +
+				exec->tile_alloc_offset +
+				(y * exec->bin_tiles_x + x) * 32));
+	}
+
+	if (setup->msaa_color_write) {
+		bool last_tile_write = (!setup->msaa_zs_write &&
+					!setup->zs_write &&
+					!setup->color_write);
+		uint32_t bits = VC4_LOADSTORE_FULL_RES_DISABLE_ZS;
+
+		if (!last_tile_write)
+			bits |= VC4_LOADSTORE_FULL_RES_DISABLE_CLEAR_ALL;
+		else if (last)
+			bits |= VC4_LOADSTORE_FULL_RES_EOF;
+		rcl_u8(setup, VC4_PACKET_STORE_FULL_RES_TILE_BUFFER);
+		rcl_u32(setup,
+			vc4_full_res_offset(exec, setup->msaa_color_write,
+					    &args->msaa_color_write, x, y) |
+			bits);
+	}
+
+	if (setup->msaa_zs_write) {
+		bool last_tile_write = (!setup->zs_write &&
+					!setup->color_write);
+		uint32_t bits = VC4_LOADSTORE_FULL_RES_DISABLE_COLOR;
+
+		if (setup->msaa_color_write)
+			vc4_tile_coordinates(setup, x, y);
+		if (!last_tile_write)
+			bits |= VC4_LOADSTORE_FULL_RES_DISABLE_CLEAR_ALL;
+		else if (last)
+			bits |= VC4_LOADSTORE_FULL_RES_EOF;
+		rcl_u8(setup, VC4_PACKET_STORE_FULL_RES_TILE_BUFFER);
+		rcl_u32(setup,
+			vc4_full_res_offset(exec, setup->msaa_zs_write,
+					    &args->msaa_zs_write, x, y) |
+			bits);
+	}
+
+	if (setup->zs_write) {
+		bool last_tile_write = !setup->color_write;
+
+		if (setup->msaa_color_write || setup->msaa_zs_write)
+			vc4_tile_coordinates(setup, x, y);
+
+		rcl_u8(setup, VC4_PACKET_STORE_TILE_BUFFER_GENERAL);
+		rcl_u16(setup, args->zs_write.bits |
+			(last_tile_write ?
+			 0 : VC4_STORE_TILE_BUFFER_DISABLE_COLOR_CLEAR));
+		rcl_u32(setup,
+			(setup->zs_write->paddr + args->zs_write.offset) |
+			((last && last_tile_write) ?
+			 VC4_LOADSTORE_TILE_BUFFER_EOF : 0));
+	}
+
+	if (setup->color_write) {
+		if (setup->msaa_color_write || setup->msaa_zs_write ||
+		    setup->zs_write) {
+			vc4_tile_coordinates(setup, x, y);
+		}
+
+		if (last)
+			rcl_u8(setup, VC4_PACKET_STORE_MS_TILE_BUFFER_AND_EOF);
+		else
+			rcl_u8(setup, VC4_PACKET_STORE_MS_TILE_BUFFER);
+	}
+}
+
+static int vc4_create_rcl_bo(struct drm_device *dev, struct vc4_exec_info *exec,
+			     struct vc4_rcl_setup *setup)
+{
+	struct drm_vc4_submit_cl *args = exec->args;
+	bool has_bin = args->bin_cl_size != 0;
+	uint8_t min_x_tile = args->min_x_tile;
+	uint8_t min_y_tile = args->min_y_tile;
+	uint8_t max_x_tile = args->max_x_tile;
+	uint8_t max_y_tile = args->max_y_tile;
+	uint8_t xtiles = max_x_tile - min_x_tile + 1;
+	uint8_t ytiles = max_y_tile - min_y_tile + 1;
+	uint8_t x, y;
+	uint32_t size, loop_body_size;
+	struct vc4_bo *rcl_bo;
+
+	size = VC4_PACKET_TILE_RENDERING_MODE_CONFIG_SIZE;
+	loop_body_size = VC4_PACKET_TILE_COORDINATES_SIZE;
+
+	if (args->flags & VC4_SUBMIT_CL_USE_CLEAR_COLOR) {
+		size += VC4_PACKET_CLEAR_COLORS_SIZE +
+			VC4_PACKET_TILE_COORDINATES_SIZE +
+			VC4_PACKET_STORE_TILE_BUFFER_GENERAL_SIZE;
+	}
+
+	if (setup->color_read) {
+		if (args->color_read.flags &
+		    VC4_SUBMIT_RCL_SURFACE_READ_IS_FULL_RES) {
+			loop_body_size += VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER_SIZE;
+		} else {
+			loop_body_size += VC4_PACKET_LOAD_TILE_BUFFER_GENERAL_SIZE;
+		}
+	}
+	if (setup->zs_read) {
+		if (args->zs_read.flags &
+		    VC4_SUBMIT_RCL_SURFACE_READ_IS_FULL_RES) {
+			loop_body_size += VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER_SIZE;
+		} else {
+			if (setup->color_read &&
+			    !(args->color_read.flags &
+			      VC4_SUBMIT_RCL_SURFACE_READ_IS_FULL_RES)) {
+				loop_body_size += VC4_PACKET_TILE_COORDINATES_SIZE;
+				loop_body_size += VC4_PACKET_STORE_TILE_BUFFER_GENERAL_SIZE;
+			}
+			loop_body_size += VC4_PACKET_LOAD_TILE_BUFFER_GENERAL_SIZE;
+		}
+	}
+
+	if (has_bin) {
+		size += VC4_PACKET_WAIT_ON_SEMAPHORE_SIZE;
+		loop_body_size += VC4_PACKET_BRANCH_TO_SUB_LIST_SIZE;
+	}
+
+	if (setup->msaa_color_write)
+		loop_body_size += VC4_PACKET_STORE_FULL_RES_TILE_BUFFER_SIZE;
+	if (setup->msaa_zs_write)
+		loop_body_size += VC4_PACKET_STORE_FULL_RES_TILE_BUFFER_SIZE;
+
+	if (setup->zs_write)
+		loop_body_size += VC4_PACKET_STORE_TILE_BUFFER_GENERAL_SIZE;
+	if (setup->color_write)
+		loop_body_size += VC4_PACKET_STORE_MS_TILE_BUFFER_SIZE;
+
+	/* We need a VC4_PACKET_TILE_COORDINATES in between each store. */
+	loop_body_size += VC4_PACKET_TILE_COORDINATES_SIZE *
+		((setup->msaa_color_write != NULL) +
+		 (setup->msaa_zs_write != NULL) +
+		 (setup->color_write != NULL) +
+		 (setup->zs_write != NULL) - 1);
+
+	size += xtiles * ytiles * loop_body_size;
+
+	rcl_bo = vc4_bo_create(dev, size, true);
+	if (!rcl_bo)
+		return -ENOMEM;
+	setup->rcl = &rcl_bo->base;
+	list_add_tail(&rcl_bo->unref_head, &exec->unref_list);
+
+	rcl_u8(setup, VC4_PACKET_TILE_RENDERING_MODE_CONFIG);
+	rcl_u32(setup,
+		(setup->color_write ? (setup->color_write->paddr +
+				       args->color_write.offset) :
+		 0));
+	rcl_u16(setup, args->width);
+	rcl_u16(setup, args->height);
+	rcl_u16(setup, args->color_write.bits);
+
+	/* The tile buffer gets cleared when the previous tile is stored.  If
+	 * the clear values changed between frames, then the tile buffer has
+	 * stale clear values in it, so we have to do a store in None mode (no
+	 * writes) so that we trigger the tile buffer clear.
+	 */
+	if (args->flags & VC4_SUBMIT_CL_USE_CLEAR_COLOR) {
+		rcl_u8(setup, VC4_PACKET_CLEAR_COLORS);
+		rcl_u32(setup, args->clear_color[0]);
+		rcl_u32(setup, args->clear_color[1]);
+		rcl_u32(setup, args->clear_z);
+		rcl_u8(setup, args->clear_s);
+
+		vc4_tile_coordinates(setup, 0, 0);
+
+		rcl_u8(setup, VC4_PACKET_STORE_TILE_BUFFER_GENERAL);
+		rcl_u16(setup, VC4_LOADSTORE_TILE_BUFFER_NONE);
+		rcl_u32(setup, 0); /* no address, since we're in None mode */
+	}
+
+	for (y = min_y_tile; y <= max_y_tile; y++) {
+		for (x = min_x_tile; x <= max_x_tile; x++) {
+			bool first = (x == min_x_tile && y == min_y_tile);
+			bool last = (x == max_x_tile && y == max_y_tile);
+
+			emit_tile(exec, setup, x, y, first, last);
+		}
+	}
+
+	BUG_ON(setup->next_offset != size);
+	exec->ct1ca = setup->rcl->paddr;
+	exec->ct1ea = setup->rcl->paddr + setup->next_offset;
+
+	return 0;
+}
+
+static int vc4_full_res_bounds_check(struct vc4_exec_info *exec,
+				     struct drm_gem_cma_object *obj,
+				     struct drm_vc4_submit_rcl_surface *surf)
+{
+	struct drm_vc4_submit_cl *args = exec->args;
+	u32 render_tiles_stride = DIV_ROUND_UP(exec->args->width, 32);
+
+	if (surf->offset > obj->base.size) {
+		DRM_ERROR("surface offset %d > BO size %zd\n",
+			  surf->offset, obj->base.size);
+		return -EINVAL;
+	}
+
+	if ((obj->base.size - surf->offset) / VC4_TILE_BUFFER_SIZE <
+	    render_tiles_stride * args->max_y_tile + args->max_x_tile) {
+		DRM_ERROR("MSAA tile %d, %d out of bounds "
+			  "(bo size %zd, offset %d).\n",
+			  args->max_x_tile, args->max_y_tile,
+			  obj->base.size,
+			  surf->offset);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int vc4_rcl_msaa_surface_setup(struct vc4_exec_info *exec,
+				      struct drm_gem_cma_object **obj,
+				      struct drm_vc4_submit_rcl_surface *surf)
+{
+	if (surf->flags != 0 || surf->bits != 0) {
+		DRM_ERROR("MSAA surface had nonzero flags/bits\n");
+		return -EINVAL;
+	}
+
+	if (surf->hindex == ~0)
+		return 0;
+
+	if (!vc4_use_bo(exec, surf->hindex, VC4_MODE_RENDER, obj))
+		return -EINVAL;
+
+	if (surf->offset & 0xf) {
+		DRM_ERROR("MSAA write must be 16b aligned.\n");
+		return -EINVAL;
+	}
+
+	return vc4_full_res_bounds_check(exec, *obj, surf);
+}
+
+static int vc4_rcl_surface_setup(struct vc4_exec_info *exec,
+				 struct drm_gem_cma_object **obj,
+				 struct drm_vc4_submit_rcl_surface *surf)
+{
+	uint8_t tiling = VC4_GET_FIELD(surf->bits,
+				       VC4_LOADSTORE_TILE_BUFFER_TILING);
+	uint8_t buffer = VC4_GET_FIELD(surf->bits,
+				       VC4_LOADSTORE_TILE_BUFFER_BUFFER);
+	uint8_t format = VC4_GET_FIELD(surf->bits,
+				       VC4_LOADSTORE_TILE_BUFFER_FORMAT);
+	int cpp;
+	int ret;
+
+	if (surf->flags & ~VC4_SUBMIT_RCL_SURFACE_READ_IS_FULL_RES) {
+		DRM_ERROR("Extra flags set\n");
+		return -EINVAL;
+	}
+
+	if (surf->hindex == ~0)
+		return 0;
+
+	if (!vc4_use_bo(exec, surf->hindex, VC4_MODE_RENDER, obj))
+		return -EINVAL;
+
+	if (surf->flags & VC4_SUBMIT_RCL_SURFACE_READ_IS_FULL_RES) {
+		if (surf == &exec->args->zs_write) {
+			DRM_ERROR("general zs write may not be a full-res.\n");
+			return -EINVAL;
+		}
+
+		if (surf->bits != 0) {
+			DRM_ERROR("load/store general bits set with "
+				  "full res load/store.\n");
+			return -EINVAL;
+		}
+
+		ret = vc4_full_res_bounds_check(exec, *obj, surf);
+		if (ret)
+			return ret;
+
+		return 0;
+	}
+
+	if (surf->bits & ~(VC4_LOADSTORE_TILE_BUFFER_TILING_MASK |
+			   VC4_LOADSTORE_TILE_BUFFER_BUFFER_MASK |
+			   VC4_LOADSTORE_TILE_BUFFER_FORMAT_MASK)) {
+		DRM_ERROR("Unknown bits in load/store: 0x%04x\n",
+			  surf->bits);
+		return -EINVAL;
+	}
+
+	if (tiling > VC4_TILING_FORMAT_LT) {
+		DRM_ERROR("Bad tiling format\n");
+		return -EINVAL;
+	}
+
+	if (buffer == VC4_LOADSTORE_TILE_BUFFER_ZS) {
+		if (format != 0) {
+			DRM_ERROR("No color format should be set for ZS\n");
+			return -EINVAL;
+		}
+		cpp = 4;
+	} else if (buffer == VC4_LOADSTORE_TILE_BUFFER_COLOR) {
+		switch (format) {
+		case VC4_LOADSTORE_TILE_BUFFER_BGR565:
+		case VC4_LOADSTORE_TILE_BUFFER_BGR565_DITHER:
+			cpp = 2;
+			break;
+		case VC4_LOADSTORE_TILE_BUFFER_RGBA8888:
+			cpp = 4;
+			break;
+		default:
+			DRM_ERROR("Bad tile buffer format\n");
+			return -EINVAL;
+		}
+	} else {
+		DRM_ERROR("Bad load/store buffer %d.\n", buffer);
+		return -EINVAL;
+	}
+
+	if (surf->offset & 0xf) {
+		DRM_ERROR("load/store buffer must be 16b aligned.\n");
+		return -EINVAL;
+	}
+
+	if (!vc4_check_tex_size(exec, *obj, surf->offset, tiling,
+				exec->args->width, exec->args->height, cpp)) {
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int
+vc4_rcl_render_config_surface_setup(struct vc4_exec_info *exec,
+				    struct vc4_rcl_setup *setup,
+				    struct drm_gem_cma_object **obj,
+				    struct drm_vc4_submit_rcl_surface *surf)
+{
+	uint8_t tiling = VC4_GET_FIELD(surf->bits,
+				       VC4_RENDER_CONFIG_MEMORY_FORMAT);
+	uint8_t format = VC4_GET_FIELD(surf->bits,
+				       VC4_RENDER_CONFIG_FORMAT);
+	int cpp;
+
+	if (surf->flags != 0) {
+		DRM_ERROR("No flags supported on render config.\n");
+		return -EINVAL;
+	}
+
+	if (surf->bits & ~(VC4_RENDER_CONFIG_MEMORY_FORMAT_MASK |
+			   VC4_RENDER_CONFIG_FORMAT_MASK |
+			   VC4_RENDER_CONFIG_MS_MODE_4X |
+			   VC4_RENDER_CONFIG_DECIMATE_MODE_4X)) {
+		DRM_ERROR("Unknown bits in render config: 0x%04x\n",
+			  surf->bits);
+		return -EINVAL;
+	}
+
+	if (surf->hindex == ~0)
+		return 0;
+
+	if (!vc4_use_bo(exec, surf->hindex, VC4_MODE_RENDER, obj))
+		return -EINVAL;
+
+	if (tiling > VC4_TILING_FORMAT_LT) {
+		DRM_ERROR("Bad tiling format\n");
+		return -EINVAL;
+	}
+
+	switch (format) {
+	case VC4_RENDER_CONFIG_FORMAT_BGR565_DITHERED:
+	case VC4_RENDER_CONFIG_FORMAT_BGR565:
+		cpp = 2;
+		break;
+	case VC4_RENDER_CONFIG_FORMAT_RGBA8888:
+		cpp = 4;
+		break;
+	default:
+		DRM_ERROR("Bad tile buffer format\n");
+		return -EINVAL;
+	}
+
+	if (!vc4_check_tex_size(exec, *obj, surf->offset, tiling,
+				exec->args->width, exec->args->height, cpp)) {
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+int vc4_get_rcl(struct drm_device *dev, struct vc4_exec_info *exec)
+{
+	struct vc4_rcl_setup setup = {0};
+	struct drm_vc4_submit_cl *args = exec->args;
+	bool has_bin = args->bin_cl_size != 0;
+	int ret;
+
+	if (args->min_x_tile > args->max_x_tile ||
+	    args->min_y_tile > args->max_y_tile) {
+		DRM_ERROR("Bad render tile set (%d,%d)-(%d,%d)\n",
+			  args->min_x_tile, args->min_y_tile,
+			  args->max_x_tile, args->max_y_tile);
+		return -EINVAL;
+	}
+
+	if (has_bin &&
+	    (args->max_x_tile > exec->bin_tiles_x ||
+	     args->max_y_tile > exec->bin_tiles_y)) {
+		DRM_ERROR("Render tiles (%d,%d) outside of bin config "
+			  "(%d,%d)\n",
+			  args->max_x_tile, args->max_y_tile,
+			  exec->bin_tiles_x, exec->bin_tiles_y);
+		return -EINVAL;
+	}
+
+	ret = vc4_rcl_render_config_surface_setup(exec, &setup,
+						  &setup.color_write,
+						  &args->color_write);
+	if (ret)
+		return ret;
+
+	ret = vc4_rcl_surface_setup(exec, &setup.color_read, &args->color_read);
+	if (ret)
+		return ret;
+
+	ret = vc4_rcl_surface_setup(exec, &setup.zs_read, &args->zs_read);
+	if (ret)
+		return ret;
+
+	ret = vc4_rcl_surface_setup(exec, &setup.zs_write, &args->zs_write);
+	if (ret)
+		return ret;
+
+	ret = vc4_rcl_msaa_surface_setup(exec, &setup.msaa_color_write,
+					 &args->msaa_color_write);
+	if (ret)
+		return ret;
+
+	ret = vc4_rcl_msaa_surface_setup(exec, &setup.msaa_zs_write,
+					 &args->msaa_zs_write);
+	if (ret)
+		return ret;
+
+	/* We shouldn't even have the job submitted to us if there's no
+	 * surface to write out.
+	 */
+	if (!setup.color_write && !setup.zs_write &&
+	    !setup.msaa_color_write && !setup.msaa_zs_write) {
+		DRM_ERROR("RCL requires color or Z/S write\n");
+		return -EINVAL;
+	}
+
+	return vc4_create_rcl_bo(dev, exec, &setup);
+}
diff --git a/drivers/gpu/drm/vc4/vc4_trace.h b/drivers/gpu/drm/vc4/vc4_trace.h
new file mode 100644
index 0000000..ad7b1ea
--- /dev/null
+++ b/drivers/gpu/drm/vc4/vc4_trace.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (C) 2015 Broadcom
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#if !defined(_VC4_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ)
+#define _VC4_TRACE_H_
+
+#include <linux/stringify.h>
+#include <linux/types.h>
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM vc4
+#define TRACE_INCLUDE_FILE vc4_trace
+
+TRACE_EVENT(vc4_wait_for_seqno_begin,
+	    TP_PROTO(struct drm_device *dev, uint64_t seqno, uint64_t timeout),
+	    TP_ARGS(dev, seqno, timeout),
+
+	    TP_STRUCT__entry(
+			     __field(u32, dev)
+			     __field(u64, seqno)
+			     __field(u64, timeout)
+			     ),
+
+	    TP_fast_assign(
+			   __entry->dev = dev->primary->index;
+			   __entry->seqno = seqno;
+			   __entry->timeout = timeout;
+			   ),
+
+	    TP_printk("dev=%u, seqno=%llu, timeout=%llu",
+		      __entry->dev, __entry->seqno, __entry->timeout)
+);
+
+TRACE_EVENT(vc4_wait_for_seqno_end,
+	    TP_PROTO(struct drm_device *dev, uint64_t seqno),
+	    TP_ARGS(dev, seqno),
+
+	    TP_STRUCT__entry(
+			     __field(u32, dev)
+			     __field(u64, seqno)
+			     ),
+
+	    TP_fast_assign(
+			   __entry->dev = dev->primary->index;
+			   __entry->seqno = seqno;
+			   ),
+
+	    TP_printk("dev=%u, seqno=%llu",
+		      __entry->dev, __entry->seqno)
+);
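+
+/* TRACE_EVENT() expands these into trace_vc4_wait_for_seqno_begin()/_end()
+ * calls for the driver to wrap around its seqno waits, roughly:
+ *
+ *	trace_vc4_wait_for_seqno_begin(dev, seqno, timeout);
+ *	(block until the seqno passes or the timeout expires)
+ *	trace_vc4_wait_for_seqno_end(dev, seqno);
+ */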
+
+#endif /* _VC4_TRACE_H_ */
+
+/* This part must be outside protection */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#include <trace/define_trace.h>
diff --git a/drivers/gpu/drm/vc4/vc4_trace_points.c b/drivers/gpu/drm/vc4/vc4_trace_points.c
new file mode 100644
index 0000000..e6278f2
--- /dev/null
+++ b/drivers/gpu/drm/vc4/vc4_trace_points.c
@@ -0,0 +1,14 @@
+/*
+ * Copyright (C) 2015 Broadcom
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include "vc4_drv.h"
+
+#ifndef __CHECKER__
+#define CREATE_TRACE_POINTS
+#include "vc4_trace.h"
+#endif
diff --git a/drivers/gpu/drm/vc4/vc4_v3d.c b/drivers/gpu/drm/vc4/vc4_v3d.c
index 040ad0d..424d515 100644
--- a/drivers/gpu/drm/vc4/vc4_v3d.c
+++ b/drivers/gpu/drm/vc4/vc4_v3d.c
@@ -144,6 +144,21 @@ int vc4_v3d_debugfs_ident(struct seq_file *m, void *unused)
 }
 #endif /* CONFIG_DEBUG_FS */
 
+/*
+ * Asks the firmware to turn on power to the V3D engine.
+ *
+ * This may be doable with just the clocks interface, though this
+ * packet does some other register setup from the firmware, too.
+ */
+int
+vc4_v3d_set_power(struct vc4_dev *vc4, bool on)
+{
+	if (on)
+		return pm_generic_resume(&vc4->v3d->pdev->dev);
+	else
+		return pm_generic_poweroff(&vc4->v3d->pdev->dev);
+}
+
 static void vc4_v3d_init_hw(struct drm_device *dev)
 {
 	struct vc4_dev *vc4 = to_vc4_dev(dev);
@@ -161,6 +176,7 @@ static int vc4_v3d_bind(struct device *dev, struct device *master, void *data)
 	struct drm_device *drm = dev_get_drvdata(master);
 	struct vc4_dev *vc4 = to_vc4_dev(drm);
 	struct vc4_v3d *v3d = NULL;
+	int ret;
 
 	v3d = devm_kzalloc(&pdev->dev, sizeof(*v3d), GFP_KERNEL);
 	if (!v3d)
@@ -180,8 +196,20 @@ static int vc4_v3d_bind(struct device *dev, struct device *master, void *data)
 		return -EINVAL;
 	}
 
+	/* Reset the binner overflow address/size at setup, to be sure
+	 * we don't reuse an old one.
+	 */
+	V3D_WRITE(V3D_BPOA, 0);
+	V3D_WRITE(V3D_BPOS, 0);
+
 	vc4_v3d_init_hw(drm);
 
+	ret = drm_irq_install(drm, platform_get_irq(pdev, 0));
+	if (ret) {
+		DRM_ERROR("Failed to install IRQ handler\n");
+		return ret;
+	}
+
 	return 0;
 }
 
@@ -191,6 +219,15 @@ static void vc4_v3d_unbind(struct device *dev, struct device *master,
 	struct drm_device *drm = dev_get_drvdata(master);
 	struct vc4_dev *vc4 = to_vc4_dev(drm);
 
+	drm_irq_uninstall(drm);
+
+	/* Disable the binner's overflow memory address, so the next
+	 * driver probe (if any) doesn't try to reuse our old
+	 * allocation.
+	 */
+	V3D_WRITE(V3D_BPOA, 0);
+	V3D_WRITE(V3D_BPOS, 0);
+
 	vc4->v3d = NULL;
 }
 
diff --git a/drivers/gpu/drm/vc4/vc4_validate.c b/drivers/gpu/drm/vc4/vc4_validate.c
new file mode 100644
index 0000000..2f22f19
--- /dev/null
+++ b/drivers/gpu/drm/vc4/vc4_validate.c
@@ -0,0 +1,955 @@
+/*
+ * Copyright © 2014 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * DOC: Command list validator for VC4.
+ *
+ * The VC4 has no IOMMU between it and system memory.  So, a user with
+ * access to execute command lists could escalate privilege by
+ * overwriting system memory (drawing to it as a framebuffer) or
+ * reading system memory it shouldn't (reading it as a texture, or
+ * uniform data, or vertex data).
+ *
+ * This validates command lists to ensure that all accesses are within
+ * the bounds of the GEM objects referenced.  It explicitly whitelists
+ * packets, and looks at the offsets in any address fields to make
+ * sure they're constrained within the BOs they reference.
+ *
+ * Note that because of the validation that's happening anyway, this
+ * is where GEM relocation processing happens.
+ */
+
+#include "uapi/drm/vc4_drm.h"
+#include "vc4_drv.h"
+#include "vc4_packet.h"
+
+#define VALIDATE_ARGS \
+	struct vc4_exec_info *exec,			\
+	void *validated,				\
+	void *untrusted
+
+/** Return the width in pixels of a 64-byte microtile. */
+static uint32_t
+utile_width(int cpp)
+{
+	switch (cpp) {
+	case 1:
+	case 2:
+		return 8;
+	case 4:
+		return 4;
+	case 8:
+		return 2;
+	default:
+		DRM_ERROR("unknown cpp: %d\n", cpp);
+		return 1;
+	}
+}
+
+/** Return the height in pixels of a 64-byte microtile. */
+static uint32_t
+utile_height(int cpp)
+{
+	switch (cpp) {
+	case 1:
+		return 8;
+	case 2:
+	case 4:
+	case 8:
+		return 4;
+	default:
+		DRM_ERROR("unknown cpp: %d\n", cpp);
+		return 1;
+	}
+}
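+
+/* For example, at cpp == 2 a utile is 8x4 pixels (8 * 4 * 2 = 64 bytes);
+ * at cpp == 4 it is 4x4, and at cpp == 8 it is 2x4.
+ */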
+
+/**
+ * The texture unit decides what tiling format a particular miplevel is using
+ * based on this function, so we lay out our miptrees accordingly.
+ */
+static bool
+size_is_lt(uint32_t width, uint32_t height, int cpp)
+{
+	return (width <= 4 * utile_width(cpp) ||
+		height <= 4 * utile_height(cpp));
+}
+
+bool
+vc4_use_bo(struct vc4_exec_info *exec,
+	   uint32_t hindex,
+	   enum vc4_bo_mode mode,
+	   struct drm_gem_cma_object **obj)
+{
+	*obj = NULL;
+
+	if (hindex >= exec->bo_count) {
+		DRM_ERROR("BO index %d greater than BO count %d\n",
+			  hindex, exec->bo_count);
+		return false;
+	}
+
+	if (exec->bo[hindex].mode != mode) {
+		if (exec->bo[hindex].mode == VC4_MODE_UNDECIDED) {
+			exec->bo[hindex].mode = mode;
+		} else {
+			DRM_ERROR("BO index %d reused with mode %d vs %d\n",
+				  hindex, exec->bo[hindex].mode, mode);
+			return false;
+		}
+	}
+
+	*obj = exec->bo[hindex].bo;
+	return true;
+}
+
+static bool
+vc4_use_handle(struct vc4_exec_info *exec,
+	       uint32_t gem_handles_packet_index,
+	       enum vc4_bo_mode mode,
+	       struct drm_gem_cma_object **obj)
+{
+	return vc4_use_bo(exec, exec->bo_index[gem_handles_packet_index],
+			  mode, obj);
+}
+
+static uint32_t
+gl_shader_rec_size(uint32_t pointer_bits)
+{
+	uint32_t attribute_count = pointer_bits & 7;
+	bool extended = pointer_bits & 8;
+
+	if (attribute_count == 0)
+		attribute_count = 8;
+
+	if (extended)
+		return 100 + attribute_count * 4;
+	else
+		return 36 + attribute_count * 8;
+}
+
+bool
+vc4_check_tex_size(struct vc4_exec_info *exec, struct drm_gem_cma_object *fbo,
+		   uint32_t offset, uint8_t tiling_format,
+		   uint32_t width, uint32_t height, uint8_t cpp)
+{
+	uint32_t aligned_width, aligned_height, stride, size;
+	uint32_t utile_w = utile_width(cpp);
+	uint32_t utile_h = utile_height(cpp);
+
+	/* The shaded vertex format stores signed 12.4 fixed point
+	 * (-2048,2047) offsets from the viewport center, so we should
+	 * never have a render target larger than 4096.  The texture
+	 * unit can only sample from 2048x2048, so it's even more
+	 * restricted.  This lets us avoid worrying about overflow in
+	 * our math.
+	 */
+	if (width > 4096 || height > 4096) {
+		DRM_ERROR("Surface dimesions (%d,%d) too large", width, height);
+		return false;
+	}
+
+	switch (tiling_format) {
+	case VC4_TILING_FORMAT_LINEAR:
+		aligned_width = round_up(width, utile_w);
+		aligned_height = height;
+		break;
+	case VC4_TILING_FORMAT_T:
+		aligned_width = round_up(width, utile_w * 8);
+		aligned_height = round_up(height, utile_h * 8);
+		break;
+	case VC4_TILING_FORMAT_LT:
+		aligned_width = round_up(width, utile_w);
+		aligned_height = round_up(height, utile_h);
+		break;
+	default:
+		DRM_ERROR("buffer tiling %d unsupported\n", tiling_format);
+		return false;
+	}
+
+	stride = aligned_width * cpp;
+	size = stride * aligned_height;
+
+	if (size + offset < size ||
+	    size + offset > fbo->base.size) {
+		DRM_ERROR("Overflow in %dx%d (%dx%d) fbo size (%d + %d > %zd)\n",
+			  width, height,
+			  aligned_width, aligned_height,
+			  size, offset, fbo->base.size);
+		return false;
+	}
+
+	return true;
+}
+
+static int
+validate_flush_all(VALIDATE_ARGS)
+{
+	if (exec->found_increment_semaphore_packet) {
+		DRM_ERROR("VC4_PACKET_FLUSH_ALL after "
+			  "VC4_PACKET_INCREMENT_SEMAPHORE\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int
+validate_start_tile_binning(VALIDATE_ARGS)
+{
+	if (exec->found_start_tile_binning_packet) {
+		DRM_ERROR("Duplicate VC4_PACKET_START_TILE_BINNING\n");
+		return -EINVAL;
+	}
+	exec->found_start_tile_binning_packet = true;
+
+	if (!exec->found_tile_binning_mode_config_packet) {
+		DRM_ERROR("missing VC4_PACKET_TILE_BINNING_MODE_CONFIG\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int
+validate_increment_semaphore(VALIDATE_ARGS)
+{
+	if (exec->found_increment_semaphore_packet) {
+		DRM_ERROR("Duplicate VC4_PACKET_INCREMENT_SEMAPHORE\n");
+		return -EINVAL;
+	}
+	exec->found_increment_semaphore_packet = true;
+
+	/* Once we've found the semaphore increment, there should be one FLUSH
+	 * then the end of the command list.  The FLUSH actually triggers the
+	 * increment, so we only need to make sure there's nothing else
+	 * in the command list after it.
+	 */
+
+	return 0;
+}
+
+static int
+validate_indexed_prim_list(VALIDATE_ARGS)
+{
+	struct drm_gem_cma_object *ib;
+	uint32_t length = *(uint32_t *)(untrusted + 1);
+	uint32_t offset = *(uint32_t *)(untrusted + 5);
+	uint32_t max_index = *(uint32_t *)(untrusted + 9);
+	uint32_t index_size = (*(uint8_t *)(untrusted + 0) >> 4) ? 2 : 1;
+	struct vc4_shader_state *shader_state;
+
+	if (exec->found_increment_semaphore_packet) {
+		DRM_ERROR("Drawing after VC4_PACKET_INCREMENT_SEMAPHORE\n");
+		return -EINVAL;
+	}
+
+	/* Check overflow condition */
+	if (exec->shader_state_count == 0) {
+		DRM_ERROR("shader state must precede primitives\n");
+		return -EINVAL;
+	}
+	shader_state = &exec->shader_state[exec->shader_state_count - 1];
+
+	if (max_index > shader_state->max_index)
+		shader_state->max_index = max_index;
+
+	if (!vc4_use_handle(exec, 0, VC4_MODE_RENDER, &ib))
+		return -EINVAL;
+
+	if (offset > ib->base.size ||
+	    (ib->base.size - offset) / index_size < length) {
+		DRM_ERROR("IB access overflow (%d + %d*%d > %zd)\n",
+			  offset, length, index_size, ib->base.size);
+		return -EINVAL;
+	}
+
+	*(uint32_t *)(validated + 5) = ib->paddr + offset;
+
+	return 0;
+}
+
+static int
+validate_gl_array_primitive(VALIDATE_ARGS)
+{
+	uint32_t length = *(uint32_t *)(untrusted + 1);
+	uint32_t base_index = *(uint32_t *)(untrusted + 5);
+	uint32_t max_index;
+	struct vc4_shader_state *shader_state;
+
+	if (exec->found_increment_semaphore_packet) {
+		DRM_ERROR("Drawing after VC4_PACKET_INCREMENT_SEMAPHORE\n");
+		return -EINVAL;
+	}
+
+	/* Check overflow condition */
+	if (exec->shader_state_count == 0) {
+		DRM_ERROR("shader state must precede primitives\n");
+		return -EINVAL;
+	}
+	shader_state = &exec->shader_state[exec->shader_state_count - 1];
+
+	if (length + base_index < length) {
+		DRM_ERROR("primitive vertex count overflow\n");
+		return -EINVAL;
+	}
+	max_index = length + base_index - 1;
+
+	if (max_index > shader_state->max_index)
+		shader_state->max_index = max_index;
+
+	return 0;
+}
+
+static int
+validate_gl_shader_state(VALIDATE_ARGS)
+{
+	uint32_t i = exec->shader_state_count++;
+
+	if (i >= exec->shader_state_size) {
+		DRM_ERROR("More requests for shader states than declared\n");
+		return -EINVAL;
+	}
+
+	exec->shader_state[i].packet = VC4_PACKET_GL_SHADER_STATE;
+	exec->shader_state[i].addr = *(uint32_t *)untrusted;
+	exec->shader_state[i].max_index = 0;
+
+	if (exec->shader_state[i].addr & ~0xf) {
+		DRM_ERROR("high bits set in GL shader rec reference\n");
+		return -EINVAL;
+	}
+
+	*(uint32_t *)validated = (exec->shader_rec_p +
+				  exec->shader_state[i].addr);
+
+	exec->shader_rec_p +=
+		roundup(gl_shader_rec_size(exec->shader_state[i].addr), 16);
+
+	return 0;
+}
+
+static int
+validate_nv_shader_state(VALIDATE_ARGS)
+{
+	uint32_t i = exec->shader_state_count++;
+
+	if (i >= exec->shader_state_size) {
+		DRM_ERROR("More requests for shader states than declared\n");
+		return -EINVAL;
+	}
+
+	exec->shader_state[i].packet = VC4_PACKET_NV_SHADER_STATE;
+	exec->shader_state[i].addr = *(uint32_t *)untrusted;
+
+	if (exec->shader_state[i].addr & 15) {
+		DRM_ERROR("NV shader state address 0x%08x misaligned\n",
+			  exec->shader_state[i].addr);
+		return -EINVAL;
+	}
+
+	*(uint32_t *)validated = (exec->shader_state[i].addr +
+				  exec->shader_rec_p);
+
+	return 0;
+}
+
+static int
+validate_tile_binning_config(VALIDATE_ARGS)
+{
+	struct drm_device *dev = exec->exec_bo->base.dev;
+	struct vc4_bo *tile_bo;
+	uint8_t flags;
+	uint32_t tile_state_size, tile_alloc_size;
+	uint32_t tile_count;
+
+	if (exec->found_tile_binning_mode_config_packet) {
+		DRM_ERROR("Duplicate VC4_PACKET_TILE_BINNING_MODE_CONFIG\n");
+		return -EINVAL;
+	}
+	exec->found_tile_binning_mode_config_packet = true;
+
+	exec->bin_tiles_x = *(uint8_t *)(untrusted + 12);
+	exec->bin_tiles_y = *(uint8_t *)(untrusted + 13);
+	tile_count = exec->bin_tiles_x * exec->bin_tiles_y;
+	flags = *(uint8_t *)(untrusted + 14);
+
+	if (exec->bin_tiles_x == 0 ||
+	    exec->bin_tiles_y == 0) {
+		DRM_ERROR("Tile binning config of %dx%d too small\n",
+			  exec->bin_tiles_x, exec->bin_tiles_y);
+		return -EINVAL;
+	}
+
+	if (flags & (VC4_BIN_CONFIG_DB_NON_MS |
+		     VC4_BIN_CONFIG_TILE_BUFFER_64BIT)) {
+		DRM_ERROR("unsupported binning config flags 0x%02x\n", flags);
+		return -EINVAL;
+	}
+
+	/* The tile state data array is 48 bytes per tile, and we put it at
+	 * the start of a BO containing both it and the tile alloc.
+	 */
+	tile_state_size = 48 * tile_count;
+
+	/* Since the tile alloc array will follow us, align. */
+	exec->tile_alloc_offset = roundup(tile_state_size, 4096);
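+
+	/* Example (sketch): a 30x17 tile config (e.g. 1920x1080 with
+	 * 64x64 tiles) gives tile_state_size = 48 * 510 = 24480, so
+	 * the tile alloc starts at offset 24576.
+	 */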
+
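+	/* Rewrite the userspace-supplied flags byte: force TSDA
+	 * auto-init and our chosen allocation block sizes, which the
+	 * sizing math below assumes.
+	 */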
+	*(uint8_t *)(validated + 14) =
+		((flags & ~(VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_MASK |
+			    VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_MASK)) |
+		 VC4_BIN_CONFIG_AUTO_INIT_TSDA |
+		 VC4_SET_FIELD(VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_32,
+			       VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE) |
+		 VC4_SET_FIELD(VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_128,
+			       VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE));
+
+	/* Initial block size. */
+	tile_alloc_size = 32 * tile_count;
+
+	/*
+	 * The initial allocation gets rounded to the next 256 bytes before
+	 * the hardware starts fulfilling further allocations.
+	 */
+	tile_alloc_size = roundup(tile_alloc_size, 256);
+
+	/* Add space for the extra allocations.  This is what gets used first,
+	 * before overflow memory.  It must have at least 4096 bytes, but we
+	 * want to avoid overflow memory usage if possible.
+	 */
+	tile_alloc_size += 1024 * 1024;
+
+	tile_bo = vc4_bo_create(dev, exec->tile_alloc_offset + tile_alloc_size,
+				true);
+	if (!tile_bo)
+		return -ENOMEM;
+	exec->tile_bo = &tile_bo->base;
+	list_add_tail(&tile_bo->unref_head, &exec->unref_list);
+
+	/* tile alloc address. */
+	*(uint32_t *)(validated + 0) = (exec->tile_bo->paddr +
+					exec->tile_alloc_offset);
+	/* tile alloc size. */
+	*(uint32_t *)(validated + 4) = tile_alloc_size;
+	/* tile state address. */
+	*(uint32_t *)(validated + 8) = exec->tile_bo->paddr;
+
+	return 0;
+}
+
+static int
+validate_gem_handles(VALIDATE_ARGS)
+{
+	memcpy(exec->bo_index, untrusted, sizeof(exec->bo_index));
+	return 0;
+}
+
+#define VC4_DEFINE_PACKET(packet, func) \
+	[packet] = { packet ## _SIZE, #packet, func }
+
+static const struct cmd_info {
+	uint16_t len;
+	const char *name;
+	int (*func)(struct vc4_exec_info *exec, void *validated,
+		    void *untrusted);
+} cmd_info[] = {
+	VC4_DEFINE_PACKET(VC4_PACKET_HALT, NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_NOP, NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_FLUSH, NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_FLUSH_ALL, validate_flush_all),
+	VC4_DEFINE_PACKET(VC4_PACKET_START_TILE_BINNING,
+			  validate_start_tile_binning),
+	VC4_DEFINE_PACKET(VC4_PACKET_INCREMENT_SEMAPHORE,
+			  validate_increment_semaphore),
+
+	VC4_DEFINE_PACKET(VC4_PACKET_GL_INDEXED_PRIMITIVE,
+			  validate_indexed_prim_list),
+	VC4_DEFINE_PACKET(VC4_PACKET_GL_ARRAY_PRIMITIVE,
+			  validate_gl_array_primitive),
+
+	VC4_DEFINE_PACKET(VC4_PACKET_PRIMITIVE_LIST_FORMAT, NULL),
+
+	VC4_DEFINE_PACKET(VC4_PACKET_GL_SHADER_STATE, validate_gl_shader_state),
+	VC4_DEFINE_PACKET(VC4_PACKET_NV_SHADER_STATE, validate_nv_shader_state),
+
+	VC4_DEFINE_PACKET(VC4_PACKET_CONFIGURATION_BITS, NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_FLAT_SHADE_FLAGS, NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_POINT_SIZE, NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_LINE_WIDTH, NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_RHT_X_BOUNDARY, NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_DEPTH_OFFSET, NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_CLIP_WINDOW, NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_VIEWPORT_OFFSET, NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_CLIPPER_XY_SCALING, NULL),
+	/* Note: The docs say this was also 105, but it was 106 in the
+	 * initial userland code drop.
+	 */
+	VC4_DEFINE_PACKET(VC4_PACKET_CLIPPER_Z_SCALING, NULL),
+
+	VC4_DEFINE_PACKET(VC4_PACKET_TILE_BINNING_MODE_CONFIG,
+			  validate_tile_binning_config),
+
+	VC4_DEFINE_PACKET(VC4_PACKET_GEM_HANDLES, validate_gem_handles),
+};
+
+int
+vc4_validate_bin_cl(struct drm_device *dev,
+		    void *validated,
+		    void *unvalidated,
+		    struct vc4_exec_info *exec)
+{
+	uint32_t len = exec->args->bin_cl_size;
+	uint32_t dst_offset = 0;
+	uint32_t src_offset = 0;
+
+	while (src_offset < len) {
+		void *dst_pkt = validated + dst_offset;
+		void *src_pkt = unvalidated + src_offset;
+		u8 cmd = *(uint8_t *)src_pkt;
+		const struct cmd_info *info;
+
+		if (cmd >= ARRAY_SIZE(cmd_info)) {
+			DRM_ERROR("0x%08x: packet %d out of bounds\n",
+				  src_offset, cmd);
+			return -EINVAL;
+		}
+
+		info = &cmd_info[cmd];
+		if (!info->name) {
+			DRM_ERROR("0x%08x: packet %d invalid\n",
+				  src_offset, cmd);
+			return -EINVAL;
+		}
+
+		if (src_offset + info->len > len) {
+			DRM_ERROR("0x%08x: packet %d (%s) length 0x%08x "
+				  "exceeds bounds (0x%08x)\n",
+				  src_offset, cmd, info->name, info->len,
+				  len);
+			return -EINVAL;
+		}
+
+		if (cmd != VC4_PACKET_GEM_HANDLES)
+			memcpy(dst_pkt, src_pkt, info->len);
+
+		if (info->func && info->func(exec,
+					     dst_pkt + 1,
+					     src_pkt + 1)) {
+			DRM_ERROR("0x%08x: packet %d (%s) failed to validate\n",
+				  src_offset, cmd, info->name);
+			return -EINVAL;
+		}
+
+		src_offset += info->len;
+		/* GEM handle loading doesn't produce HW packets. */
+		if (cmd != VC4_PACKET_GEM_HANDLES)
+			dst_offset += info->len;
+
+		/* When the CL hits halt, it'll stop reading anything else. */
+		if (cmd == VC4_PACKET_HALT)
+			break;
+	}
+
+	exec->ct0ea = exec->ct0ca + dst_offset;
+
+	if (!exec->found_start_tile_binning_packet) {
+		DRM_ERROR("Bin CL missing VC4_PACKET_START_TILE_BINNING\n");
+		return -EINVAL;
+	}
+
+	if (!exec->found_increment_semaphore_packet) {
+		DRM_ERROR("Bin CL missing VC4_PACKET_INCREMENT_SEMAPHORE\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static bool
+reloc_tex(struct vc4_exec_info *exec,
+	  void *uniform_data_u,
+	  struct vc4_texture_sample_info *sample,
+	  uint32_t texture_handle_index)
+
+{
+	struct drm_gem_cma_object *tex;
+	uint32_t p0 = *(uint32_t *)(uniform_data_u + sample->p_offset[0]);
+	uint32_t p1 = *(uint32_t *)(uniform_data_u + sample->p_offset[1]);
+	uint32_t p2 = (sample->p_offset[2] != ~0 ?
+		       *(uint32_t *)(uniform_data_u + sample->p_offset[2]) : 0);
+	uint32_t p3 = (sample->p_offset[3] != ~0 ?
+		       *(uint32_t *)(uniform_data_u + sample->p_offset[3]) : 0);
+	uint32_t *validated_p0 = exec->uniforms_v + sample->p_offset[0];
+	uint32_t offset = p0 & VC4_TEX_P0_OFFSET_MASK;
+	uint32_t miplevels = VC4_GET_FIELD(p0, VC4_TEX_P0_MIPLVLS);
+	uint32_t width = VC4_GET_FIELD(p1, VC4_TEX_P1_WIDTH);
+	uint32_t height = VC4_GET_FIELD(p1, VC4_TEX_P1_HEIGHT);
+	uint32_t cpp, tiling_format, utile_w, utile_h;
+	uint32_t i;
+	uint32_t cube_map_stride = 0;
+	enum vc4_texture_data_type type;
+
+	if (!vc4_use_bo(exec, texture_handle_index, VC4_MODE_RENDER, &tex))
+		return false;
+
+	if (sample->is_direct) {
+		uint32_t remaining_size = tex->base.size - p0;
+
+		if (p0 > tex->base.size - 4) {
+			DRM_ERROR("UBO offset greater than UBO size\n");
+			goto fail;
+		}
+		if (p1 > remaining_size - 4) {
+			DRM_ERROR("UBO clamp would allow reads "
+				  "outside of UBO\n");
+			goto fail;
+		}
+		*validated_p0 = tex->paddr + p0;
+		return true;
+	}
+
+	if (width == 0)
+		width = 2048;
+	if (height == 0)
+		height = 2048;
+
+	if (p0 & VC4_TEX_P0_CMMODE_MASK) {
+		if (VC4_GET_FIELD(p2, VC4_TEX_P2_PTYPE) ==
+		    VC4_TEX_P2_PTYPE_CUBE_MAP_STRIDE)
+			cube_map_stride = p2 & VC4_TEX_P2_CMST_MASK;
+		if (VC4_GET_FIELD(p3, VC4_TEX_P2_PTYPE) ==
+		    VC4_TEX_P2_PTYPE_CUBE_MAP_STRIDE) {
+			if (cube_map_stride) {
+				DRM_ERROR("Cube map stride set twice\n");
+				goto fail;
+			}
+
+			cube_map_stride = p3 & VC4_TEX_P2_CMST_MASK;
+		}
+		if (!cube_map_stride) {
+			DRM_ERROR("Cube map stride not set\n");
+			goto fail;
+		}
+	}
+
+	type = (VC4_GET_FIELD(p0, VC4_TEX_P0_TYPE) |
+		(VC4_GET_FIELD(p1, VC4_TEX_P1_TYPE4) << 4));
+
+	switch (type) {
+	case VC4_TEXTURE_TYPE_RGBA8888:
+	case VC4_TEXTURE_TYPE_RGBX8888:
+	case VC4_TEXTURE_TYPE_RGBA32R:
+		cpp = 4;
+		break;
+	case VC4_TEXTURE_TYPE_RGBA4444:
+	case VC4_TEXTURE_TYPE_RGBA5551:
+	case VC4_TEXTURE_TYPE_RGB565:
+	case VC4_TEXTURE_TYPE_LUMALPHA:
+	case VC4_TEXTURE_TYPE_S16F:
+	case VC4_TEXTURE_TYPE_S16:
+		cpp = 2;
+		break;
+	case VC4_TEXTURE_TYPE_LUMINANCE:
+	case VC4_TEXTURE_TYPE_ALPHA:
+	case VC4_TEXTURE_TYPE_S8:
+		cpp = 1;
+		break;
+	case VC4_TEXTURE_TYPE_ETC1:
+	case VC4_TEXTURE_TYPE_BW1:
+	case VC4_TEXTURE_TYPE_A4:
+	case VC4_TEXTURE_TYPE_A1:
+	case VC4_TEXTURE_TYPE_RGBA64:
+	case VC4_TEXTURE_TYPE_YUV422R:
+	default:
+		DRM_ERROR("Texture format %d unsupported\n", type);
+		goto fail;
+	}
+	utile_w = utile_width(cpp);
+	utile_h = utile_height(cpp);
+
+	if (type == VC4_TEXTURE_TYPE_RGBA32R) {
+		tiling_format = VC4_TILING_FORMAT_LINEAR;
+	} else {
+		if (size_is_lt(width, height, cpp))
+			tiling_format = VC4_TILING_FORMAT_LT;
+		else
+			tiling_format = VC4_TILING_FORMAT_T;
+	}
+
+	if (!vc4_check_tex_size(exec, tex, offset + cube_map_stride * 5,
+				tiling_format, width, height, cpp)) {
+		goto fail;
+	}
+
+	/* The mipmap levels are stored before the base of the texture.  Make
+	 * sure there is actually space in the BO.
+	 */
+	for (i = 1; i <= miplevels; i++) {
+		uint32_t level_width = max(width >> i, 1u);
+		uint32_t level_height = max(height >> i, 1u);
+		uint32_t aligned_width, aligned_height;
+		uint32_t level_size;
+
+		/* Once the levels get small enough, they drop from T to LT. */
+		if (tiling_format == VC4_TILING_FORMAT_T &&
+		    size_is_lt(level_width, level_height, cpp)) {
+			tiling_format = VC4_TILING_FORMAT_LT;
+		}
+
+		switch (tiling_format) {
+		case VC4_TILING_FORMAT_T:
+			aligned_width = round_up(level_width, utile_w * 8);
+			aligned_height = round_up(level_height, utile_h * 8);
+			break;
+		case VC4_TILING_FORMAT_LT:
+			aligned_width = round_up(level_width, utile_w);
+			aligned_height = round_up(level_height, utile_h);
+			break;
+		default:
+			aligned_width = round_up(level_width, utile_w);
+			aligned_height = level_height;
+			break;
+		}
+
+		level_size = aligned_width * cpp * aligned_height;
+
+		if (offset < level_size) {
+			DRM_ERROR("Level %d (%dx%d -> %dx%d) size %db "
+				  "overflowed buffer bounds (offset %d)\n",
+				  i, level_width, level_height,
+				  aligned_width, aligned_height,
+				  level_size, offset);
+			goto fail;
+		}
+
+		offset -= level_size;
+	}
+
+	*validated_p0 = tex->paddr + p0;
+
+	return true;
+ fail:
+	DRM_INFO("Texture p0 at %d: 0x%08x\n", sample->p_offset[0], p0);
+	DRM_INFO("Texture p1 at %d: 0x%08x\n", sample->p_offset[1], p1);
+	DRM_INFO("Texture p2 at %d: 0x%08x\n", sample->p_offset[2], p2);
+	DRM_INFO("Texture p3 at %d: 0x%08x\n", sample->p_offset[3], p3);
+	return false;
+}
+
+static int
+validate_shader_rec(struct drm_device *dev,
+		    struct vc4_exec_info *exec,
+		    struct vc4_shader_state *state)
+{
+	uint32_t *src_handles;
+	void *pkt_u, *pkt_v;
+	enum shader_rec_reloc_type {
+		RELOC_CODE,
+		RELOC_VBO,
+	};
+	struct shader_rec_reloc {
+		enum shader_rec_reloc_type type;
+		uint32_t offset;
+	};
+	static const struct shader_rec_reloc gl_relocs[] = {
+		{ RELOC_CODE, 4 },  /* fs */
+		{ RELOC_CODE, 16 }, /* vs */
+		{ RELOC_CODE, 28 }, /* cs */
+	};
+	static const struct shader_rec_reloc nv_relocs[] = {
+		{ RELOC_CODE, 4 }, /* fs */
+		{ RELOC_VBO, 12 }
+	};
+	const struct shader_rec_reloc *relocs;
+	struct drm_gem_cma_object *bo[ARRAY_SIZE(gl_relocs) + 8];
+	uint32_t nr_attributes = 0, nr_fixed_relocs, nr_relocs, packet_size;
+	int i;
+	struct vc4_validated_shader_info *shader;
+
+	if (state->packet == VC4_PACKET_NV_SHADER_STATE) {
+		relocs = nv_relocs;
+		nr_fixed_relocs = ARRAY_SIZE(nv_relocs);
+
+		packet_size = 16;
+	} else {
+		relocs = gl_relocs;
+		nr_fixed_relocs = ARRAY_SIZE(gl_relocs);
+
+		nr_attributes = state->addr & 0x7;
+		if (nr_attributes == 0)
+			nr_attributes = 8;
+		packet_size = gl_shader_rec_size(state->addr);
+	}
+	nr_relocs = nr_fixed_relocs + nr_attributes;
+
+	if (nr_relocs * 4 > exec->shader_rec_size) {
+		DRM_ERROR("overflowed shader recs reading %d handles "
+			  "from %d bytes left\n",
+			  nr_relocs, exec->shader_rec_size);
+		return -EINVAL;
+	}
+	src_handles = exec->shader_rec_u;
+	exec->shader_rec_u += nr_relocs * 4;
+	exec->shader_rec_size -= nr_relocs * 4;
+
+	if (packet_size > exec->shader_rec_size) {
+		DRM_ERROR("overflowed shader recs copying %db packet "
+			  "from %d bytes left\n",
+			  packet_size, exec->shader_rec_size);
+		return -EINVAL;
+	}
+	pkt_u = exec->shader_rec_u;
+	pkt_v = exec->shader_rec_v;
+	memcpy(pkt_v, pkt_u, packet_size);
+	exec->shader_rec_u += packet_size;
+	/* Shader recs have to be aligned to 16 bytes (due to the attribute
+	 * flags being in the low bytes), so round the next validated shader
+	 * rec address up.  This should be safe, since we've got so many
+	 * relocations in a shader rec packet.
+	 */
+	BUG_ON(roundup(packet_size, 16) - packet_size > nr_relocs * 4);
+	exec->shader_rec_v += roundup(packet_size, 16);
+	exec->shader_rec_size -= packet_size;
+
+	for (i = 0; i < nr_relocs; i++) {
+		enum vc4_bo_mode mode;
+
+		if (i < nr_fixed_relocs && relocs[i].type == RELOC_CODE)
+			mode = VC4_MODE_SHADER;
+		else
+			mode = VC4_MODE_RENDER;
+
+		if (!vc4_use_bo(exec, src_handles[i], mode, &bo[i]))
+			return -EINVAL;
+	}
+
+	for (i = 0; i < nr_fixed_relocs; i++) {
+		struct vc4_bo *vc4_bo;
+		uint32_t o = relocs[i].offset;
+		uint32_t src_offset = *(uint32_t *)(pkt_u + o);
+		uint32_t *texture_handles_u;
+		void *uniform_data_u;
+		uint32_t tex;
+
+		*(uint32_t *)(pkt_v + o) = bo[i]->paddr + src_offset;
+
+		switch (relocs[i].type) {
+		case RELOC_CODE:
+			if (src_offset != 0) {
+				DRM_ERROR("Shaders must be at offset 0 "
+					  "of the BO.\n");
+				goto fail;
+			}
+
+			vc4_bo = to_vc4_bo(&bo[i]->base);
+			shader = vc4_bo->validated_shader;
+			if (!shader)
+				goto fail;
+
+			if (shader->uniforms_src_size > exec->uniforms_size) {
+				DRM_ERROR("Uniforms src buffer overflow\n");
+				goto fail;
+			}
+
+			texture_handles_u = exec->uniforms_u;
+			uniform_data_u = (texture_handles_u +
+					  shader->num_texture_samples);
+
+			memcpy(exec->uniforms_v, uniform_data_u,
+			       shader->uniforms_size);
+
+			for (tex = 0;
+			     tex < shader->num_texture_samples;
+			     tex++) {
+				if (!reloc_tex(exec,
+					       uniform_data_u,
+					       &shader->texture_samples[tex],
+					       texture_handles_u[tex])) {
+					goto fail;
+				}
+			}
+
+			*(uint32_t *)(pkt_v + o + 4) = exec->uniforms_p;
+
+			exec->uniforms_u += shader->uniforms_src_size;
+			exec->uniforms_v += shader->uniforms_size;
+			exec->uniforms_p += shader->uniforms_size;
+
+			break;
+
+		case RELOC_VBO:
+			break;
+		}
+	}
+
+	for (i = 0; i < nr_attributes; i++) {
+		struct drm_gem_cma_object *vbo = bo[nr_fixed_relocs + i];
+		uint32_t o = 36 + i * 8;
+		uint32_t offset = *(uint32_t *)(pkt_u + o + 0);
+		uint32_t attr_size = *(uint8_t *)(pkt_u + o + 4) + 1;
+		uint32_t stride = *(uint8_t *)(pkt_u + o + 5);
+		uint32_t max_index;
+
+		if (state->addr & 0x8)
+			stride |= (*(uint32_t *)(pkt_u + 100 + i * 4)) & ~0xff;
+
+		if (vbo->base.size < offset ||
+		    vbo->base.size - offset < attr_size) {
+			DRM_ERROR("BO offset overflow (%d + %d > %zd)\n",
+				  offset, attr_size, vbo->base.size);
+			return -EINVAL;
+		}
+
+		if (stride != 0) {
+			max_index = ((vbo->base.size - offset - attr_size) /
+				     stride);
+			if (state->max_index > max_index) {
+				DRM_ERROR("primitives use index %d out of "
+					  "supplied %d\n",
+					  state->max_index, max_index);
+				return -EINVAL;
+			}
+		}
+
+		*(uint32_t *)(pkt_v + o) = vbo->paddr + offset;
+	}
+
+	return 0;
+
+fail:
+	return -EINVAL;
+}
+
+int
+vc4_validate_shader_recs(struct drm_device *dev,
+			 struct vc4_exec_info *exec)
+{
+	uint32_t i;
+	int ret = 0;
+
+	for (i = 0; i < exec->shader_state_count; i++) {
+		ret = validate_shader_rec(dev, exec, &exec->shader_state[i]);
+		if (ret)
+			return ret;
+	}
+
+	return ret;
+}
diff --git a/include/uapi/drm/vc4_drm.h b/include/uapi/drm/vc4_drm.h
index 0716b50..e711ec0 100644
--- a/include/uapi/drm/vc4_drm.h
+++ b/include/uapi/drm/vc4_drm.h
@@ -26,14 +26,155 @@
 
 #include <drm/drm.h>
 
+#define DRM_VC4_SUBMIT_CL                         0x00
+#define DRM_VC4_WAIT_SEQNO                        0x01
+#define DRM_VC4_WAIT_BO                           0x02
 #define DRM_VC4_CREATE_BO                         0x03
 #define DRM_VC4_MMAP_BO                           0x04
 #define DRM_VC4_CREATE_SHADER_BO                  0x05
 
+#define DRM_IOCTL_VC4_SUBMIT_CL           DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_SUBMIT_CL, struct drm_vc4_submit_cl)
+#define DRM_IOCTL_VC4_WAIT_SEQNO          DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_WAIT_SEQNO, struct drm_vc4_wait_seqno)
+#define DRM_IOCTL_VC4_WAIT_BO             DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_WAIT_BO, struct drm_vc4_wait_bo)
 #define DRM_IOCTL_VC4_CREATE_BO           DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_CREATE_BO, struct drm_vc4_create_bo)
 #define DRM_IOCTL_VC4_MMAP_BO             DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_MMAP_BO, struct drm_vc4_mmap_bo)
 #define DRM_IOCTL_VC4_CREATE_SHADER_BO    DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_CREATE_SHADER_BO, struct drm_vc4_create_shader_bo)
 
+struct drm_vc4_submit_rcl_surface {
+	uint32_t hindex; /* Handle index, or ~0 if not present. */
+	uint32_t offset; /* Offset to start of buffer. */
+	/*
+	 * Bits for either render config (color_write) or load/store packet.
+	 * Bits should all be 0 for MSAA load/stores.
+	 */
+	uint16_t bits;
+
+#define VC4_SUBMIT_RCL_SURFACE_READ_IS_FULL_RES		(1 << 0)
+	uint16_t flags;
+};
+
+/**
+ * struct drm_vc4_submit_cl - ioctl argument for submitting commands to the 3D
+ * engine.
+ *
+ * Drivers typically use GPU BOs to store batchbuffers / command lists and
+ * their associated state.  However, because the VC4 lacks an MMU, we have to
+ * do validation of memory accesses by the GPU commands.  If we were to store
+ * our commands in BOs, we'd need to do uncached readback from them to do the
+ * validation process, which is too expensive.  Instead, userspace accumulates
+ * commands and associated state in plain memory, then the kernel copies the
+ * data to its own address space, and then validates and stores it in a GPU
+ * BO.
+ */
+struct drm_vc4_submit_cl {
+	/* Pointer to the binner command list.
+	 *
+	 * This is the first set of commands executed, which runs the
+	 * coordinate shader to determine where primitives land on the screen,
+	 * then writes out the state updates and draw calls necessary per tile
+	 * to the tile allocation BO.
+	 */
+	uint64_t bin_cl;
+
+	/* Pointer to the shader records.
+	 *
+	 * Shader records are the structures read by the hardware that contain
+	 * pointers to uniforms, shaders, and vertex attributes.  The
+	 * reference to the shader record has enough information to determine
+	 * how many pointers are necessary (fixed number for shaders/uniforms,
+	 * and an attribute count), so those BO indices into bo_handles are
+	 * just stored as uint32_ts before each shader record passed in.
+	 */
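+	/* (Hypothetical example: a GL shader record using 3 vertex
+	 * attributes is preceded by 3 fixed handle indices (fs/vs/cs code
+	 * BOs) plus 3 attribute BO indices, i.e. 6 uint32_ts in this
+	 * stream.)
+	 */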
+	uint64_t shader_rec;
+
+	/* Pointer to uniform data and texture handles for the textures
+	 * referenced by the shader.
+	 *
+	 * For each shader state record, there is a set of uniform data in the
+	 * order referenced by the record (FS, VS, then CS).  Each set of
+	 * uniform data has a uint32_t index into bo_handles per texture
+	 * sample operation, in the order the QPU_W_TMUn_S writes appear in
+	 * the program.  Following the texture BO handle indices is the actual
+	 * uniform data.
+	 *
+	 * The individual uniform state blocks don't have sizes passed in,
+	 * because the kernel has to determine the sizes anyway during shader
+	 * code validation.
+	 */
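+	/* (Hypothetical example: a fragment shader with two texture
+	 * samples contributes two uint32_t bo_handles indices, followed
+	 * immediately by that shader's uniform words.)
+	 */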
+	uint64_t uniforms;
+	uint64_t bo_handles;
+
+	/* Size in bytes of the binner command list. */
+	uint32_t bin_cl_size;
+	/* Size in bytes of the set of shader records. */
+	uint32_t shader_rec_size;
+	/* Number of shader records.
+	 *
+	 * This could just be computed from the contents of shader_records and
+	 * the address bits of references to them from the bin CL, but it
+	 * keeps the kernel from having to resize some allocations it makes.
+	 */
+	uint32_t shader_rec_count;
+	/* Size in bytes of the uniform state. */
+	uint32_t uniforms_size;
+
+	/* Number of BO handles passed in (size is that times 4). */
+	uint32_t bo_handle_count;
+
+	/* RCL setup: */
+	uint16_t width;
+	uint16_t height;
+	uint8_t min_x_tile;
+	uint8_t min_y_tile;
+	uint8_t max_x_tile;
+	uint8_t max_y_tile;
+	struct drm_vc4_submit_rcl_surface color_read;
+	struct drm_vc4_submit_rcl_surface color_write;
+	struct drm_vc4_submit_rcl_surface zs_read;
+	struct drm_vc4_submit_rcl_surface zs_write;
+	struct drm_vc4_submit_rcl_surface msaa_color_write;
+	struct drm_vc4_submit_rcl_surface msaa_zs_write;
+	uint32_t clear_color[2];
+	uint32_t clear_z;
+	uint8_t clear_s;
+
+	uint32_t pad:24;
+
+#define VC4_SUBMIT_CL_USE_CLEAR_COLOR			(1 << 0)
+	uint32_t flags;
+
+	/* Returned value of the seqno of this render job (for the
+	 * wait ioctl).
+	 */
+	uint64_t seqno;
+};
+
+/**
+ * struct drm_vc4_wait_seqno - ioctl argument for waiting for
+ * DRM_VC4_SUBMIT_CL completion using its returned seqno.
+ *
+ * timeout_ns is the timeout in nanoseconds, where "0" means "don't
+ * block, just return the status."
+ */
+struct drm_vc4_wait_seqno {
+	uint64_t seqno;
+	uint64_t timeout_ns;
+};
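+
+/* A minimal usage sketch (assuming a DRM fd and a seqno returned from
+ * DRM_IOCTL_VC4_SUBMIT_CL); a timeout_ns of 0 just polls:
+ *
+ *	struct drm_vc4_wait_seqno wait = { .seqno = seqno, .timeout_ns = 0 };
+ *	int ret = drmIoctl(fd, DRM_IOCTL_VC4_WAIT_SEQNO, &wait);
+ *
+ * ret is -1 with errno set to ETIME while the job is still running.
+ */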
+
+/**
+ * struct drm_vc4_wait_bo - ioctl argument for waiting for
+ * completion of the last DRM_VC4_SUBMIT_CL on a BO.
+ *
+ * This is useful for cases where multiple processes might be
+ * rendering to a BO and you want to wait for all rendering to be
+ * completed.
+ */
+struct drm_vc4_wait_bo {
+	uint32_t handle;
+	uint32_t pad;
+	uint64_t timeout_ns;
+};
+
 /**
  * struct drm_vc4_create_bo - ioctl argument for creating VC4 BOs.
  *
-- 
2.6.2


^ permalink raw reply related	[flat|nested] 53+ messages in thread

* [PATCH 7/9] drm/vc4: Add support for drawing 3D frames.
@ 2015-12-01 20:35   ` Eric Anholt
  0 siblings, 0 replies; 53+ messages in thread
From: Eric Anholt @ 2015-12-01 20:35 UTC (permalink / raw)
  To: dri-devel; +Cc: linux-kernel

A user submission is basically a pointer to a command list and a
pointer to uniforms.  We copy those into the kernel, validate and
relocate them, and store the result in a GPU BO, which we queue for
execution.
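
As a rough userspace sketch (hypothetical variable names; the actual
interface is struct drm_vc4_submit_cl in vc4_drm.h), a submission looks
like:

	struct drm_vc4_submit_cl submit = {
		.bin_cl = (uintptr_t)bin_cl,
		.bin_cl_size = bin_cl_size,
		.shader_rec = (uintptr_t)shader_recs,
		.shader_rec_size = shader_rec_size,
		.shader_rec_count = shader_rec_count,
		.uniforms = (uintptr_t)uniform_data,
		.uniforms_size = uniforms_size,
		.bo_handles = (uintptr_t)bo_handles,
		.bo_handle_count = bo_handle_count,
		/* plus the RCL setup fields (width, height, surfaces) */
	};

	drmIoctl(fd, DRM_IOCTL_VC4_SUBMIT_CL, &submit);

On success, submit.seqno can be passed to the wait ioctls added below.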

Signed-off-by: Eric Anholt <eric@anholt.net>
---
 drivers/gpu/drm/vc4/Makefile           |   7 +
 drivers/gpu/drm/vc4/vc4_drv.c          |  15 +-
 drivers/gpu/drm/vc4/vc4_drv.h          | 192 +++++++
 drivers/gpu/drm/vc4/vc4_gem.c          | 640 ++++++++++++++++++++++
 drivers/gpu/drm/vc4/vc4_irq.c          | 210 ++++++++
 drivers/gpu/drm/vc4/vc4_packet.h       | 399 ++++++++++++++
 drivers/gpu/drm/vc4/vc4_render_cl.c    | 631 ++++++++++++++++++++++
 drivers/gpu/drm/vc4/vc4_trace.h        |  63 +++
 drivers/gpu/drm/vc4/vc4_trace_points.c |  14 +
 drivers/gpu/drm/vc4/vc4_v3d.c          |  37 ++
 drivers/gpu/drm/vc4/vc4_validate.c     | 955 +++++++++++++++++++++++++++++++++
 include/uapi/drm/vc4_drm.h             | 141 +++++
 12 files changed, 3303 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/vc4/vc4_gem.c
 create mode 100644 drivers/gpu/drm/vc4/vc4_irq.c
 create mode 100644 drivers/gpu/drm/vc4/vc4_packet.h
 create mode 100644 drivers/gpu/drm/vc4/vc4_render_cl.c
 create mode 100644 drivers/gpu/drm/vc4/vc4_trace.h
 create mode 100644 drivers/gpu/drm/vc4/vc4_trace_points.c
 create mode 100644 drivers/gpu/drm/vc4/vc4_validate.c

diff --git a/drivers/gpu/drm/vc4/Makefile b/drivers/gpu/drm/vc4/Makefile
index e87a6f2..4c6a99f 100644
--- a/drivers/gpu/drm/vc4/Makefile
+++ b/drivers/gpu/drm/vc4/Makefile
@@ -8,12 +8,19 @@ vc4-y := \
 	vc4_crtc.o \
 	vc4_drv.o \
 	vc4_kms.o \
+	vc4_gem.o \
 	vc4_hdmi.o \
 	vc4_hvs.o \
+	vc4_irq.o \
 	vc4_plane.o \
+	vc4_render_cl.o \
+	vc4_trace_points.o \
 	vc4_v3d.o \
+	vc4_validate.o \
 	vc4_validate_shaders.o
 
 vc4-$(CONFIG_DEBUG_FS) += vc4_debugfs.o
 
 obj-$(CONFIG_DRM_VC4)  += vc4.o
+
+CFLAGS_vc4_trace_points.o := -I$(src)
diff --git a/drivers/gpu/drm/vc4/vc4_drv.c b/drivers/gpu/drm/vc4/vc4_drv.c
index db58d74..2cfee59 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.c
+++ b/drivers/gpu/drm/vc4/vc4_drv.c
@@ -74,6 +74,9 @@ static const struct file_operations vc4_drm_fops = {
 };
 
 static const struct drm_ioctl_desc vc4_drm_ioctls[] = {
+	DRM_IOCTL_DEF_DRV(VC4_SUBMIT_CL, vc4_submit_cl_ioctl, 0),
+	DRM_IOCTL_DEF_DRV(VC4_WAIT_SEQNO, vc4_wait_seqno_ioctl, 0),
+	DRM_IOCTL_DEF_DRV(VC4_WAIT_BO, vc4_wait_bo_ioctl, 0),
 	DRM_IOCTL_DEF_DRV(VC4_CREATE_BO, vc4_create_bo_ioctl, 0),
 	DRM_IOCTL_DEF_DRV(VC4_MMAP_BO, vc4_mmap_bo_ioctl, 0),
 	DRM_IOCTL_DEF_DRV(VC4_CREATE_SHADER_BO, vc4_create_shader_bo_ioctl, 0),
@@ -83,10 +86,16 @@ static struct drm_driver vc4_drm_driver = {
 	.driver_features = (DRIVER_MODESET |
 			    DRIVER_ATOMIC |
 			    DRIVER_GEM |
+			    DRIVER_HAVE_IRQ |
 			    DRIVER_PRIME),
 	.lastclose = vc4_lastclose,
 	.preclose = vc4_drm_preclose,
 
+	.irq_handler = vc4_irq,
+	.irq_preinstall = vc4_irq_preinstall,
+	.irq_postinstall = vc4_irq_postinstall,
+	.irq_uninstall = vc4_irq_uninstall,
+
 	.enable_vblank = vc4_enable_vblank,
 	.disable_vblank = vc4_disable_vblank,
 	.get_vblank_counter = drm_vblank_count,
@@ -181,9 +190,11 @@ static int vc4_drm_bind(struct device *dev)
 	if (ret)
 		goto unref;
 
+	vc4_gem_init(drm);
+
 	ret = component_bind_all(dev, drm);
 	if (ret)
-		goto unref;
+		goto gem_destroy;
 
 	ret = drm_dev_register(drm, 0);
 	if (ret < 0)
@@ -207,6 +218,8 @@ unregister:
 	drm_dev_unregister(drm);
 unbind_all:
 	component_unbind_all(dev, drm);
+gem_destroy:
+	vc4_gem_destroy(drm);
 unref:
 	drm_dev_unref(drm);
 	vc4_bo_cache_destroy(drm);
diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h
index 8945463..a3dbfee 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.h
+++ b/drivers/gpu/drm/vc4/vc4_drv.h
@@ -49,6 +49,48 @@ struct vc4_dev {
 
 	/* Protects bo_cache and the BO stats. */
 	struct mutex bo_lock;
+
+	/* Sequence number for the last job queued in job_list.
+	 * Starts at 0 (no jobs emitted).
+	 */
+	uint64_t emit_seqno;
+
+	/* Sequence number for the last completed job on the GPU.
+	 * Starts at 0 (no jobs completed).
+	 */
+	uint64_t finished_seqno;
+
+	/* List of all struct vc4_exec_info for jobs to be executed.
+	 * The first job in the list is the one currently programmed
+	 * into ct0ca/ct1ca for execution.
+	 */
+	struct list_head job_list;
+	/* List of the finished vc4_exec_infos waiting to be freed by
+	 * job_done_work.
+	 */
+	struct list_head job_done_list;
+	/* Spinlock used to synchronize the job_list and seqno
+	 * accesses between the IRQ handler and GEM ioctls.
+	 */
+	spinlock_t job_lock;
+	wait_queue_head_t job_wait_queue;
+	struct work_struct job_done_work;
+
+	/* The binner overflow memory that's currently set up in
+	 * BPOA/BPOS registers.  When overflow occurs and a new one is
+	 * allocated, the previous one will be moved to
+	 * vc4->current_exec's free list.
+	 */
+	struct vc4_bo *overflow_mem;
+	struct work_struct overflow_mem_work;
+
+	struct {
+		uint32_t last_ct0ca, last_ct1ca;
+		struct timer_list timer;
+		struct work_struct reset_work;
+	} hangcheck;
+
+	struct semaphore async_modeset;
 };
 
 static inline struct vc4_dev *
@@ -60,6 +102,9 @@ to_vc4_dev(struct drm_device *dev)
 struct vc4_bo {
 	struct drm_gem_cma_object base;
 
+	/* seqno of the last job to render to this BO. */
+	uint64_t seqno;
+
 	/* List entry for the BO's position in either
 	 * vc4_exec_info->unref_list or vc4_dev->bo_cache.time_list
 	 */
@@ -130,6 +175,109 @@ to_vc4_encoder(struct drm_encoder *encoder)
 #define HVS_READ(offset) readl(vc4->hvs->regs + offset)
 #define HVS_WRITE(offset, val) writel(val, vc4->hvs->regs + offset)
 
+enum vc4_bo_mode {
+	VC4_MODE_UNDECIDED,
+	VC4_MODE_RENDER,
+	VC4_MODE_SHADER,
+};
+
+struct vc4_bo_exec_state {
+	struct drm_gem_cma_object *bo;
+	enum vc4_bo_mode mode;
+};
+
+struct vc4_exec_info {
+	/* Sequence number for this bin/render job. */
+	uint64_t seqno;
+
+	/* Kernel-space copy of the ioctl arguments */
+	struct drm_vc4_submit_cl *args;
+
+	/* This is the array of BOs that were looked up at the start of exec.
+	 * Command validation will use indices into this array.
+	 */
+	struct vc4_bo_exec_state *bo;
+	uint32_t bo_count;
+
+	/* Pointers for our position in vc4->job_list */
+	struct list_head head;
+
+	/* List of other BOs used in the job that need to be released
+	 * once the job is complete.
+	 */
+	struct list_head unref_list;
+
+	/* Current unvalidated indices into @bo loaded by the non-hardware
+	 * VC4_PACKET_GEM_HANDLES.
+	 */
+	uint32_t bo_index[2];
+
+	/* This is the BO where we store the validated command lists, shader
+	 * records, and uniforms.
+	 */
+	struct drm_gem_cma_object *exec_bo;
+
+	/**
+	 * This tracks the per-shader-record state (packet 64) that
+	 * determines the length of the shader record and the offset
+	 * it's expected to be found at.  It gets read in from the
+	 * command lists.
+	 */
+	struct vc4_shader_state {
+		uint8_t packet;
+		uint32_t addr;
+		/* Maximum vertex index referenced by any primitive using this
+		 * shader state.
+		 */
+		uint32_t max_index;
+	} *shader_state;
+
+	/** How many shader states the user declared they were using. */
+	uint32_t shader_state_size;
+	/** How many shader state records the validator has seen. */
+	uint32_t shader_state_count;
+
+	bool found_tile_binning_mode_config_packet;
+	bool found_start_tile_binning_packet;
+	bool found_increment_semaphore_packet;
+	uint8_t bin_tiles_x, bin_tiles_y;
+	struct drm_gem_cma_object *tile_bo;
+	uint32_t tile_alloc_offset;
+
+	/**
+	 * Computed addresses pointing into exec_bo where we start the
+	 * bin thread (ct0) and render thread (ct1).
+	 */
+	uint32_t ct0ca, ct0ea;
+	uint32_t ct1ca, ct1ea;
+
+	/* Pointers to the shader recs.  The paddr gets incremented as CL
+	 * packets are relocated in validate_gl_shader_state, and the vaddrs
+	 * (u and v) get incremented and size decremented as the shader recs
+	 * themselves are validated.
+	 */
+	void *shader_rec_u;
+	void *shader_rec_v;
+	uint32_t shader_rec_p;
+	uint32_t shader_rec_size;
+
+	/* Pointers to the uniform data.  These pointers are incremented, and
+	 * size decremented, as each batch of uniforms is uploaded.
+	 */
+	void *uniforms_u;
+	void *uniforms_v;
+	uint32_t uniforms_p;
+	uint32_t uniforms_size;
+};
+
+static inline struct vc4_exec_info *
+vc4_first_job(struct vc4_dev *vc4)
+{
+	if (list_empty(&vc4->job_list))
+		return NULL;
+	return list_first_entry(&vc4->job_list, struct vc4_exec_info, head);
+}
+
 /**
  * struct vc4_texture_sample_info - saves the offsets into the UBO for texture
  * setup parameters.
@@ -231,10 +379,31 @@ void vc4_debugfs_cleanup(struct drm_minor *minor);
 /* vc4_drv.c */
 void __iomem *vc4_ioremap_regs(struct platform_device *dev, int index);
 
+/* vc4_gem.c */
+void vc4_gem_init(struct drm_device *dev);
+void vc4_gem_destroy(struct drm_device *dev);
+int vc4_submit_cl_ioctl(struct drm_device *dev, void *data,
+			struct drm_file *file_priv);
+int vc4_wait_seqno_ioctl(struct drm_device *dev, void *data,
+			 struct drm_file *file_priv);
+int vc4_wait_bo_ioctl(struct drm_device *dev, void *data,
+		      struct drm_file *file_priv);
+void vc4_submit_next_job(struct drm_device *dev);
+int vc4_wait_for_seqno(struct drm_device *dev, uint64_t seqno,
+		       uint64_t timeout_ns, bool interruptible);
+void vc4_job_handle_completed(struct vc4_dev *vc4);
+
 /* vc4_hdmi.c */
 extern struct platform_driver vc4_hdmi_driver;
 int vc4_hdmi_debugfs_regs(struct seq_file *m, void *unused);
 
+/* vc4_irq.c */
+irqreturn_t vc4_irq(int irq, void *arg);
+void vc4_irq_preinstall(struct drm_device *dev);
+int vc4_irq_postinstall(struct drm_device *dev);
+void vc4_irq_uninstall(struct drm_device *dev);
+void vc4_irq_reset(struct drm_device *dev);
+
 /* vc4_hvs.c */
 extern struct platform_driver vc4_hvs_driver;
 void vc4_hvs_dump_state(struct drm_device *dev);
@@ -253,6 +422,29 @@ u32 vc4_plane_dlist_size(struct drm_plane_state *state);
 extern struct platform_driver vc4_v3d_driver;
 int vc4_v3d_debugfs_ident(struct seq_file *m, void *unused);
 int vc4_v3d_debugfs_regs(struct seq_file *m, void *unused);
+int vc4_v3d_set_power(struct vc4_dev *vc4, bool on);
+
+/* vc4_validate.c */
+int
+vc4_validate_bin_cl(struct drm_device *dev,
+		    void *validated,
+		    void *unvalidated,
+		    struct vc4_exec_info *exec);
+
+int
+vc4_validate_shader_recs(struct drm_device *dev, struct vc4_exec_info *exec);
+
+bool vc4_use_bo(struct vc4_exec_info *exec,
+		uint32_t hindex,
+		enum vc4_bo_mode mode,
+		struct drm_gem_cma_object **obj);
+
+int vc4_get_rcl(struct drm_device *dev, struct vc4_exec_info *exec);
+
+bool vc4_check_tex_size(struct vc4_exec_info *exec,
+			struct drm_gem_cma_object *fbo,
+			uint32_t offset, uint8_t tiling_format,
+			uint32_t width, uint32_t height, uint8_t cpp);
 
 /* vc4_validate_shader.c */
 struct vc4_validated_shader_info *
diff --git a/drivers/gpu/drm/vc4/vc4_gem.c b/drivers/gpu/drm/vc4/vc4_gem.c
new file mode 100644
index 0000000..ddcb018
--- /dev/null
+++ b/drivers/gpu/drm/vc4/vc4_gem.c
@@ -0,0 +1,640 @@
+/*
+ * Copyright © 2014 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/device.h>
+#include <linux/io.h>
+
+#include "uapi/drm/vc4_drm.h"
+#include "vc4_drv.h"
+#include "vc4_regs.h"
+#include "vc4_trace.h"
+
+static void
+vc4_queue_hangcheck(struct drm_device *dev)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+
+	mod_timer(&vc4->hangcheck.timer,
+		  round_jiffies_up(jiffies + msecs_to_jiffies(100)));
+}
+
+static void
+vc4_reset(struct drm_device *dev)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+
+	DRM_INFO("Resetting GPU.\n");
+	vc4_v3d_set_power(vc4, false);
+	vc4_v3d_set_power(vc4, true);
+
+	vc4_irq_reset(dev);
+
+	/* Rearm the hangcheck -- another job might have been waiting
+	 * for our hung one to get kicked off, and vc4_irq_reset()
+	 * would have started it.
+	 */
+	vc4_queue_hangcheck(dev);
+}
+
+static void
+vc4_reset_work(struct work_struct *work)
+{
+	struct vc4_dev *vc4 =
+		container_of(work, struct vc4_dev, hangcheck.reset_work);
+
+	vc4_reset(vc4->dev);
+}
+
+static void
+vc4_hangcheck_elapsed(unsigned long data)
+{
+	struct drm_device *dev = (struct drm_device *)data;
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+	uint32_t ct0ca, ct1ca;
+
+	/* If idle, we can stop watching for hangs. */
+	if (list_empty(&vc4->job_list))
+		return;
+
+	ct0ca = V3D_READ(V3D_CTNCA(0));
+	ct1ca = V3D_READ(V3D_CTNCA(1));
+
+	/* If we've made any progress in execution, rearm the timer
+	 * and wait.
+	 */
+	if (ct0ca != vc4->hangcheck.last_ct0ca ||
+	    ct1ca != vc4->hangcheck.last_ct1ca) {
+		vc4->hangcheck.last_ct0ca = ct0ca;
+		vc4->hangcheck.last_ct1ca = ct1ca;
+		vc4_queue_hangcheck(dev);
+		return;
+	}
+
+	/* We've gone too long with no progress, reset.  This has to
+	 * be done from a work struct, since resetting can sleep and
+	 * this timer hook isn't allowed to.
+	 */
+	schedule_work(&vc4->hangcheck.reset_work);
+}
+
+static void
+submit_cl(struct drm_device *dev, uint32_t thread, uint32_t start, uint32_t end)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+
+	/* Set the current and end address of the control list.
+	 * Writing the end register is what starts the job.
+	 */
+	V3D_WRITE(V3D_CTNCA(thread), start);
+	V3D_WRITE(V3D_CTNEA(thread), end);
+}
+
+int
+vc4_wait_for_seqno(struct drm_device *dev, uint64_t seqno, uint64_t timeout_ns,
+		   bool interruptible)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+	int ret = 0;
+	unsigned long timeout_expire;
+	DEFINE_WAIT(wait);
+
+	if (vc4->finished_seqno >= seqno)
+		return 0;
+
+	if (timeout_ns == 0)
+		return -ETIME;
+
+	timeout_expire = jiffies + nsecs_to_jiffies(timeout_ns);
+
+	trace_vc4_wait_for_seqno_begin(dev, seqno, timeout_ns);
+	for (;;) {
+		prepare_to_wait(&vc4->job_wait_queue, &wait,
+				interruptible ? TASK_INTERRUPTIBLE :
+				TASK_UNINTERRUPTIBLE);
+
+		if (interruptible && signal_pending(current)) {
+			ret = -ERESTARTSYS;
+			break;
+		}
+
+		if (vc4->finished_seqno >= seqno)
+			break;
+
+		if (timeout_ns != ~0ull) {
+			if (time_after_eq(jiffies, timeout_expire)) {
+				ret = -ETIME;
+				break;
+			}
+			schedule_timeout(timeout_expire - jiffies);
+		} else {
+			schedule();
+		}
+	}
+
+	finish_wait(&vc4->job_wait_queue, &wait);
+	trace_vc4_wait_for_seqno_end(dev, seqno);
+
+	if (ret && ret != -ERESTARTSYS)
+		DRM_ERROR("timeout waiting for render thread idle\n");
+
+	return ret;
+}
+
+static void
+vc4_flush_caches(struct drm_device *dev)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+
+	/* Flush the GPU L2 caches.  These caches sit on top of system
+	 * L3 (the 128kb or so shared with the CPU), and are
+	 * non-allocating in the L3.
+	 */
+	V3D_WRITE(V3D_L2CACTL,
+		  V3D_L2CACTL_L2CCLR);
+
+	V3D_WRITE(V3D_SLCACTL,
+		  VC4_SET_FIELD(0xf, V3D_SLCACTL_T1CC) |
+		  VC4_SET_FIELD(0xf, V3D_SLCACTL_T0CC) |
+		  VC4_SET_FIELD(0xf, V3D_SLCACTL_UCC) |
+		  VC4_SET_FIELD(0xf, V3D_SLCACTL_ICC));
+}
+
+/* Sets the registers for the next job to actually be executed in
+ * the hardware.
+ *
+ * The job_lock should be held during this.
+ */
+void
+vc4_submit_next_job(struct drm_device *dev)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+	struct vc4_exec_info *exec = vc4_first_job(vc4);
+
+	if (!exec)
+		return;
+
+	vc4_flush_caches(dev);
+
+	/* Disable the binner's pre-loaded overflow memory address */
+	V3D_WRITE(V3D_BPOA, 0);
+	V3D_WRITE(V3D_BPOS, 0);
+
+	if (exec->ct0ca != exec->ct0ea)
+		submit_cl(dev, 0, exec->ct0ca, exec->ct0ea);
+	submit_cl(dev, 1, exec->ct1ca, exec->ct1ea);
+}
+
+static void
+vc4_update_bo_seqnos(struct vc4_exec_info *exec, uint64_t seqno)
+{
+	struct vc4_bo *bo;
+	unsigned i;
+
+	for (i = 0; i < exec->bo_count; i++) {
+		bo = to_vc4_bo(&exec->bo[i].bo->base);
+		bo->seqno = seqno;
+	}
+
+	list_for_each_entry(bo, &exec->unref_list, unref_head) {
+		bo->seqno = seqno;
+	}
+}
+
+/* Queues a struct vc4_exec_info for execution.  If no job is
+ * currently executing, then submits it.
+ *
+ * Unlike most GPUs, our hardware only handles one command list at a
+ * time.  To queue multiple jobs at once, we'd need to edit the
+ * previous command list to have a jump to the new one at the end, and
+ * then bump the end address.  That's a change for a later date,
+ * though.
+ */
+static void
+vc4_queue_submit(struct drm_device *dev, struct vc4_exec_info *exec)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+	uint64_t seqno;
+	unsigned long irqflags;
+
+	spin_lock_irqsave(&vc4->job_lock, irqflags);
+
+	seqno = ++vc4->emit_seqno;
+	exec->seqno = seqno;
+	vc4_update_bo_seqnos(exec, seqno);
+
+	list_add_tail(&exec->head, &vc4->job_list);
+
+	/* If no job was executing, kick ours off.  Otherwise, it'll
+	 * get started when the previous job's frame done interrupt
+	 * occurs.
+	 */
+	if (vc4_first_job(vc4) == exec) {
+		vc4_submit_next_job(dev);
+		vc4_queue_hangcheck(dev);
+	}
+
+	spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+}
+
+/**
+ * Looks up a bunch of GEM handles for BOs and stores the array for
+ * use in the command validator that actually writes relocated
+ * addresses pointing to them.
+ */
+static int
+vc4_cl_lookup_bos(struct drm_device *dev,
+		  struct drm_file *file_priv,
+		  struct vc4_exec_info *exec)
+{
+	struct drm_vc4_submit_cl *args = exec->args;
+	uint32_t *handles;
+	int ret = 0;
+	int i;
+
+	exec->bo_count = args->bo_handle_count;
+
+	if (!exec->bo_count) {
+		/* See comment on bo_index for why we have to check
+		 * this.
+		 */
+		DRM_ERROR("Rendering requires BOs to validate\n");
+		return -EINVAL;
+	}
+
+	exec->bo = kcalloc(exec->bo_count, sizeof(struct vc4_bo_exec_state),
+			   GFP_KERNEL);
+	if (!exec->bo) {
+		DRM_ERROR("Failed to allocate validated BO pointers\n");
+		return -ENOMEM;
+	}
+
+	handles = drm_malloc_ab(exec->bo_count, sizeof(uint32_t));
+	if (!handles) {
+		ret = -ENOMEM;
+		DRM_ERROR("Failed to allocate incoming GEM handles\n");
+		goto fail;
+	}
+
+	if (copy_from_user(handles,
+			   (void __user *)(uintptr_t)args->bo_handles,
+			   exec->bo_count * sizeof(uint32_t))) {
+		ret = -EFAULT;
+		DRM_ERROR("Failed to copy in GEM handles\n");
+		goto fail;
+	}
+
+	spin_lock(&file_priv->table_lock);
+	for (i = 0; i < exec->bo_count; i++) {
+		struct drm_gem_object *bo = idr_find(&file_priv->object_idr,
+						     handles[i]);
+		if (!bo) {
+			DRM_ERROR("Failed to look up GEM BO %d: %d\n",
+				  i, handles[i]);
+			ret = -EINVAL;
+			spin_unlock(&file_priv->table_lock);
+			goto fail;
+		}
+		drm_gem_object_reference(bo);
+		exec->bo[i].bo = (struct drm_gem_cma_object *)bo;
+	}
+	spin_unlock(&file_priv->table_lock);
+
+fail:
+	drm_free_large(handles);
+	return ret;
+}
+
+static int
+vc4_get_bcl(struct drm_device *dev, struct vc4_exec_info *exec)
+{
+	struct drm_vc4_submit_cl *args = exec->args;
+	void *temp = NULL;
+	void *bin;
+	int ret = 0;
+	uint32_t bin_offset = 0;
+	uint32_t shader_rec_offset = roundup(bin_offset + args->bin_cl_size,
+					     16);
+	uint32_t uniforms_offset = shader_rec_offset + args->shader_rec_size;
+	uint32_t exec_size = uniforms_offset + args->uniforms_size;
+	uint32_t temp_size = exec_size + (sizeof(struct vc4_shader_state) *
+					  args->shader_rec_count);
+	struct vc4_bo *bo;
+
+	if (uniforms_offset < shader_rec_offset ||
+	    exec_size < uniforms_offset ||
+	    args->shader_rec_count >= (UINT_MAX /
+					  sizeof(struct vc4_shader_state)) ||
+	    temp_size < exec_size) {
+		DRM_ERROR("overflow in exec arguments\n");
+		ret = -EINVAL;
+		goto fail;
+	}
+
+	/* Allocate space where we'll store the copied in user command lists
+	 * and shader records.
+	 *
+	 * We don't just copy directly into the BOs because we need to
+	 * read the contents back for validation, and I think the
+	 * bo->vaddr is uncached access.
+	 */
+	temp = kmalloc(temp_size, GFP_KERNEL);
+	if (!temp) {
+		DRM_ERROR("Failed to allocate storage for copying "
+			  "in bin/render CLs.\n");
+		ret = -ENOMEM;
+		goto fail;
+	}
+	bin = temp + bin_offset;
+	exec->shader_rec_u = temp + shader_rec_offset;
+	exec->uniforms_u = temp + uniforms_offset;
+	exec->shader_state = temp + exec_size;
+	exec->shader_state_size = args->shader_rec_count;
+
+	if (copy_from_user(bin,
+			   (void __user *)(uintptr_t)args->bin_cl,
+			   args->bin_cl_size)) {
+		DRM_ERROR("Failed to copy in bin cl\n");
+		ret = -EFAULT;
+		goto fail;
+	}
+
+	if (copy_from_user(exec->shader_rec_u,
+			   (void __user *)(uintptr_t)args->shader_rec,
+			   args->shader_rec_size)) {
+		DRM_ERROR("Failed to copy in shader recs\n");
+		ret = -EFAULT;
+		goto fail;
+	}
+
+	if (copy_from_user(exec->uniforms_u,
+			   (void __user *)(uintptr_t)args->uniforms,
+			   args->uniforms_size)) {
+		DRM_ERROR("Failed to copy in uniforms cl\n");
+		ret = -EFAULT;
+		goto fail;
+	}
+
+	bo = vc4_bo_create(dev, exec_size, true);
+	if (!bo) {
+		DRM_ERROR("Couldn't allocate BO for binning\n");
+		ret = -ENOMEM;
+		goto fail;
+	}
+	exec->exec_bo = &bo->base;
+
+	list_add_tail(&to_vc4_bo(&exec->exec_bo->base)->unref_head,
+		      &exec->unref_list);
+
+	exec->ct0ca = exec->exec_bo->paddr + bin_offset;
+
+	exec->shader_rec_v = exec->exec_bo->vaddr + shader_rec_offset;
+	exec->shader_rec_p = exec->exec_bo->paddr + shader_rec_offset;
+	exec->shader_rec_size = args->shader_rec_size;
+
+	exec->uniforms_v = exec->exec_bo->vaddr + uniforms_offset;
+	exec->uniforms_p = exec->exec_bo->paddr + uniforms_offset;
+	exec->uniforms_size = args->uniforms_size;
+
+	ret = vc4_validate_bin_cl(dev,
+				  exec->exec_bo->vaddr + bin_offset,
+				  bin,
+				  exec);
+	if (ret)
+		goto fail;
+
+	ret = vc4_validate_shader_recs(dev, exec);
+
+fail:
+	kfree(temp);
+	return ret;
+}
+
+static void
+vc4_complete_exec(struct drm_device *dev, struct vc4_exec_info *exec)
+{
+	unsigned i;
+
+	/* Need the struct lock for drm_gem_object_unreference(). */
+	mutex_lock(&dev->struct_mutex);
+	if (exec->bo) {
+		for (i = 0; i < exec->bo_count; i++)
+			drm_gem_object_unreference(&exec->bo[i].bo->base);
+		kfree(exec->bo);
+	}
+
+	while (!list_empty(&exec->unref_list)) {
+		struct vc4_bo *bo = list_first_entry(&exec->unref_list,
+						     struct vc4_bo, unref_head);
+		list_del(&bo->unref_head);
+		drm_gem_object_unreference(&bo->base.base);
+	}
+	mutex_unlock(&dev->struct_mutex);
+
+	kfree(exec);
+}
+
+void
+vc4_job_handle_completed(struct vc4_dev *vc4)
+{
+	unsigned long irqflags;
+
+	spin_lock_irqsave(&vc4->job_lock, irqflags);
+	while (!list_empty(&vc4->job_done_list)) {
+		struct vc4_exec_info *exec =
+			list_first_entry(&vc4->job_done_list,
+					 struct vc4_exec_info, head);
+		list_del(&exec->head);
+
+		spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+		vc4_complete_exec(vc4->dev, exec);
+		spin_lock_irqsave(&vc4->job_lock, irqflags);
+	}
+	spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+}
+
+/* Scheduled when any job has been completed, this walks the list of
+ * jobs that had completed and unrefs their BOs and frees their exec
+ * structs.
+ */
+static void
+vc4_job_done_work(struct work_struct *work)
+{
+	struct vc4_dev *vc4 =
+		container_of(work, struct vc4_dev, job_done_work);
+
+	vc4_job_handle_completed(vc4);
+}
+
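+/* Helper that also trims the remaining timeout when the wait is
+ * interrupted by a signal, so a restarted ioctl (the updated
+ * timeout_ns gets copied back to userspace) doesn't wait the full
+ * period over again.
+ */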
+static int
+vc4_wait_for_seqno_ioctl_helper(struct drm_device *dev,
+				uint64_t seqno,
+				uint64_t *timeout_ns)
+{
+	unsigned long start = jiffies;
+	int ret = vc4_wait_for_seqno(dev, seqno, *timeout_ns, true);
+
+	if ((ret == -EINTR || ret == -ERESTARTSYS) && *timeout_ns != ~0ull) {
+		uint64_t delta = jiffies_to_nsecs(jiffies - start);
+
+		if (*timeout_ns >= delta)
+			*timeout_ns -= delta;
+	}
+
+	return ret;
+}
+
+int
+vc4_wait_seqno_ioctl(struct drm_device *dev, void *data,
+		     struct drm_file *file_priv)
+{
+	struct drm_vc4_wait_seqno *args = data;
+
+	return vc4_wait_for_seqno_ioctl_helper(dev, args->seqno,
+					       &args->timeout_ns);
+}
+
+int
+vc4_wait_bo_ioctl(struct drm_device *dev, void *data,
+		  struct drm_file *file_priv)
+{
+	int ret;
+	struct drm_vc4_wait_bo *args = data;
+	struct drm_gem_object *gem_obj;
+	struct vc4_bo *bo;
+
+	gem_obj = drm_gem_object_lookup(dev, file_priv, args->handle);
+	if (!gem_obj) {
+		DRM_ERROR("Failed to look up GEM BO %d\n", args->handle);
+		return -EINVAL;
+	}
+	bo = to_vc4_bo(gem_obj);
+
+	ret = vc4_wait_for_seqno_ioctl_helper(dev, bo->seqno,
+					      &args->timeout_ns);
+
+	drm_gem_object_unreference_unlocked(gem_obj);
+	return ret;
+}
+
+/**
+ * Submits a command list to the VC4.
+ *
+ * This is what is called batchbuffer emitting on other hardware.
+ */
+int
+vc4_submit_cl_ioctl(struct drm_device *dev, void *data,
+		    struct drm_file *file_priv)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+	struct drm_vc4_submit_cl *args = data;
+	struct vc4_exec_info *exec;
+	int ret;
+
+	if ((args->flags & ~VC4_SUBMIT_CL_USE_CLEAR_COLOR) != 0) {
+		DRM_ERROR("Unknown flags: 0x%02x\n", args->flags);
+		return -EINVAL;
+	}
+
+	exec = kcalloc(1, sizeof(*exec), GFP_KERNEL);
+	if (!exec) {
+		DRM_ERROR("malloc failure on exec struct\n");
+		return -ENOMEM;
+	}
+
+	exec->args = args;
+	INIT_LIST_HEAD(&exec->unref_list);
+
+	ret = vc4_cl_lookup_bos(dev, file_priv, exec);
+	if (ret)
+		goto fail;
+
+	if (exec->args->bin_cl_size != 0) {
+		ret = vc4_get_bcl(dev, exec);
+		if (ret)
+			goto fail;
+	} else {
+		exec->ct0ca = 0;
+		exec->ct0ea = 0;
+	}
+
+	ret = vc4_get_rcl(dev, exec);
+	if (ret)
+		goto fail;
+
+	/* Clear this out of the struct we'll be putting in the queue,
+	 * since it's part of our stack.
+	 */
+	exec->args = NULL;
+
+	vc4_queue_submit(dev, exec);
+
+	/* Return the seqno for our job. */
+	args->seqno = vc4->emit_seqno;
+
+	return 0;
+
+fail:
+	vc4_complete_exec(vc4->dev, exec);
+
+	return ret;
+}
+
+void
+vc4_gem_init(struct drm_device *dev)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+
+	INIT_LIST_HEAD(&vc4->job_list);
+	INIT_LIST_HEAD(&vc4->job_done_list);
+	spin_lock_init(&vc4->job_lock);
+
+	INIT_WORK(&vc4->hangcheck.reset_work, vc4_reset_work);
+	setup_timer(&vc4->hangcheck.timer,
+		    vc4_hangcheck_elapsed,
+		    (unsigned long)dev);
+
+	INIT_WORK(&vc4->job_done_work, vc4_job_done_work);
+}
+
+void
+vc4_gem_destroy(struct drm_device *dev)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+
+	/* Waiting for exec to finish would need to be done before
+	 * unregistering V3D.
+	 */
+	WARN_ON(vc4->emit_seqno != vc4->finished_seqno);
+
+	/* V3D should already have disabled its interrupt and cleared
+	 * the overflow allocation registers.  Now free the object.
+	 */
+	if (vc4->overflow_mem) {
+		drm_gem_object_unreference_unlocked(&vc4->overflow_mem->base.base);
+		vc4->overflow_mem = NULL;
+	}
+
+	vc4_bo_cache_destroy(dev);
+}
diff --git a/drivers/gpu/drm/vc4/vc4_irq.c b/drivers/gpu/drm/vc4/vc4_irq.c
new file mode 100644
index 0000000..b68060e
--- /dev/null
+++ b/drivers/gpu/drm/vc4/vc4_irq.c
@@ -0,0 +1,210 @@
+/*
+ * Copyright © 2014 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/** DOC: Interrupt management for the V3D engine.
+ *
+ * We have an interrupt status register (V3D_INTCTL) which reports
+ * interrupts, and where writing 1 bits clears those interrupts.
+ * There are also a pair of interrupt registers
+ * (V3D_INTENA/V3D_INTDIS) where writing a 1 to their bits enables or
+ * disables that specific interrupt, and 0s written are ignored
+ * (reading either one returns the set of enabled interrupts).
+ *
+ * When we take a render frame interrupt, we need to wake the
+ * processes waiting for some frame to be done, and get the next frame
+ * submitted ASAP (so the hardware doesn't sit idle when there's work
+ * to do).
+ *
+ * When we take the binner out of memory interrupt, we need to
+ * allocate some new memory and pass it to the binner so that the
+ * current job can make progress.
+ */
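+
+/* For example, since V3D_INTCTL is write-1-to-clear, acknowledging only
+ * the frame-done interrupt without touching the others is a single
+ * register write:
+ *
+ *	V3D_WRITE(V3D_INTCTL, V3D_INT_FRDONE);
+ */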
+
+#include "vc4_drv.h"
+#include "vc4_regs.h"
+
+#define V3D_DRIVER_IRQS (V3D_INT_OUTOMEM | \
+			 V3D_INT_FRDONE)
+
+DECLARE_WAIT_QUEUE_HEAD(render_wait);
+
+static void
+vc4_overflow_mem_work(struct work_struct *work)
+{
+	struct vc4_dev *vc4 =
+		container_of(work, struct vc4_dev, overflow_mem_work);
+	struct drm_device *dev = vc4->dev;
+	struct vc4_bo *bo;
+
+	bo = vc4_bo_create(dev, 256 * 1024, true);
+	if (!bo) {
+		DRM_ERROR("Couldn't allocate binner overflow mem\n");
+		return;
+	}
+
+	/* If there's a job executing currently, then our previous
+	 * overflow allocation is getting used in that job and we need
+	 * to queue it to be released when the job is done.  But if no
+	 * job is executing at all, then we can free the old overflow
+	 * object directly.
+	 *
+	 * No lock necessary for this pointer since we're the only
+	 * ones that update the pointer, and our workqueue won't
+	 * reenter.
+	 */
+	if (vc4->overflow_mem) {
+		struct vc4_exec_info *current_exec;
+		unsigned long irqflags;
+
+		spin_lock_irqsave(&vc4->job_lock, irqflags);
+		current_exec = vc4_first_job(vc4);
+		if (current_exec) {
+			vc4->overflow_mem->seqno = vc4->finished_seqno + 1;
+			list_add_tail(&vc4->overflow_mem->unref_head,
+				      &current_exec->unref_list);
+			vc4->overflow_mem = NULL;
+		}
+		spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+	}
+
+	if (vc4->overflow_mem)
+		drm_gem_object_unreference_unlocked(&vc4->overflow_mem->base.base);
+	vc4->overflow_mem = bo;
+
+	V3D_WRITE(V3D_BPOA, bo->base.paddr);
+	V3D_WRITE(V3D_BPOS, bo->base.base.size);
+	V3D_WRITE(V3D_INTCTL, V3D_INT_OUTOMEM);
+	V3D_WRITE(V3D_INTENA, V3D_INT_OUTOMEM);
+}
+
+static void
+vc4_irq_finish_job(struct drm_device *dev)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+	struct vc4_exec_info *exec = vc4_first_job(vc4);
+
+	if (!exec)
+		return;
+
+	vc4->finished_seqno++;
+	list_move_tail(&exec->head, &vc4->job_done_list);
+	vc4_submit_next_job(dev);
+
+	wake_up_all(&vc4->job_wait_queue);
+	schedule_work(&vc4->job_done_work);
+}
+
+irqreturn_t
+vc4_irq(int irq, void *arg)
+{
+	struct drm_device *dev = arg;
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+	uint32_t intctl;
+	irqreturn_t status = IRQ_NONE;
+
+	barrier();
+	intctl = V3D_READ(V3D_INTCTL);
+
+	/* Acknowledge the interrupts we're handling here. The render
+	 * frame done interrupt will be cleared, while OUTOMEM will
+	 * stay high until the underlying cause is cleared.
+	 */
+	V3D_WRITE(V3D_INTCTL, intctl);
+
+	if (intctl & V3D_INT_OUTOMEM) {
+		/* Disable OUTOMEM until the work is done. */
+		V3D_WRITE(V3D_INTDIS, V3D_INT_OUTOMEM);
+		schedule_work(&vc4->overflow_mem_work);
+		status = IRQ_HANDLED;
+	}
+
+	if (intctl & V3D_INT_FRDONE) {
+		spin_lock(&vc4->job_lock);
+		vc4_irq_finish_job(dev);
+		spin_unlock(&vc4->job_lock);
+		status = IRQ_HANDLED;
+	}
+
+	return status;
+}
+
+void
+vc4_irq_preinstall(struct drm_device *dev)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+
+	init_waitqueue_head(&vc4->job_wait_queue);
+	INIT_WORK(&vc4->overflow_mem_work, vc4_overflow_mem_work);
+
+	/* Clear any pending interrupts someone might have left around
+	 * for us.
+	 */
+	V3D_WRITE(V3D_INTCTL, V3D_DRIVER_IRQS);
+}
+
+int
+vc4_irq_postinstall(struct drm_device *dev)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+
+	/* Enable both the render done and out of memory interrupts. */
+	V3D_WRITE(V3D_INTENA, V3D_DRIVER_IRQS);
+
+	return 0;
+}
+
+void
+vc4_irq_uninstall(struct drm_device *dev)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+
+	/* Disable sending interrupts for our driver's IRQs. */
+	V3D_WRITE(V3D_INTDIS, V3D_DRIVER_IRQS);
+
+	/* Clear any pending interrupts we might have left. */
+	V3D_WRITE(V3D_INTCTL, V3D_DRIVER_IRQS);
+
+	cancel_work_sync(&vc4->overflow_mem_work);
+}
+
+/** Reinitializes interrupt registers when a GPU reset is performed. */
+void vc4_irq_reset(struct drm_device *dev)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+	unsigned long irqflags;
+
+	/* Acknowledge any stale IRQs. */
+	V3D_WRITE(V3D_INTCTL, V3D_DRIVER_IRQS);
+
+	/*
+	 * Turn all our interrupts on.  Binner out of memory is the
+	 * only one we expect to trigger at this point, since we've
+	 * just come from poweron and haven't supplied any overflow
+	 * memory yet.
+	 */
+	V3D_WRITE(V3D_INTENA, V3D_DRIVER_IRQS);
+
+	spin_lock_irqsave(&vc4->job_lock, irqflags);
+	vc4_irq_finish_job(dev);
+	spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+}
diff --git a/drivers/gpu/drm/vc4/vc4_packet.h b/drivers/gpu/drm/vc4/vc4_packet.h
new file mode 100644
index 0000000..0f31cc0
--- /dev/null
+++ b/drivers/gpu/drm/vc4/vc4_packet.h
@@ -0,0 +1,399 @@
+/*
+ * Copyright © 2014 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef VC4_PACKET_H
+#define VC4_PACKET_H
+
+#include "vc4_regs.h" /* for VC4_MASK, VC4_GET_FIELD, VC4_SET_FIELD */
+
+enum vc4_packet {
+	VC4_PACKET_HALT = 0,
+	VC4_PACKET_NOP = 1,
+
+	VC4_PACKET_FLUSH = 4,
+	VC4_PACKET_FLUSH_ALL = 5,
+	VC4_PACKET_START_TILE_BINNING = 6,
+	VC4_PACKET_INCREMENT_SEMAPHORE = 7,
+	VC4_PACKET_WAIT_ON_SEMAPHORE = 8,
+
+	VC4_PACKET_BRANCH = 16,
+	VC4_PACKET_BRANCH_TO_SUB_LIST = 17,
+
+	VC4_PACKET_STORE_MS_TILE_BUFFER = 24,
+	VC4_PACKET_STORE_MS_TILE_BUFFER_AND_EOF = 25,
+	VC4_PACKET_STORE_FULL_RES_TILE_BUFFER = 26,
+	VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER = 27,
+	VC4_PACKET_STORE_TILE_BUFFER_GENERAL = 28,
+	VC4_PACKET_LOAD_TILE_BUFFER_GENERAL = 29,
+
+	VC4_PACKET_GL_INDEXED_PRIMITIVE = 32,
+	VC4_PACKET_GL_ARRAY_PRIMITIVE = 33,
+
+	VC4_PACKET_COMPRESSED_PRIMITIVE = 48,
+	VC4_PACKET_CLIPPED_COMPRESSED_PRIMITIVE = 49,
+
+	VC4_PACKET_PRIMITIVE_LIST_FORMAT = 56,
+
+	VC4_PACKET_GL_SHADER_STATE = 64,
+	VC4_PACKET_NV_SHADER_STATE = 65,
+	VC4_PACKET_VG_SHADER_STATE = 66,
+
+	VC4_PACKET_CONFIGURATION_BITS = 96,
+	VC4_PACKET_FLAT_SHADE_FLAGS = 97,
+	VC4_PACKET_POINT_SIZE = 98,
+	VC4_PACKET_LINE_WIDTH = 99,
+	VC4_PACKET_RHT_X_BOUNDARY = 100,
+	VC4_PACKET_DEPTH_OFFSET = 101,
+	VC4_PACKET_CLIP_WINDOW = 102,
+	VC4_PACKET_VIEWPORT_OFFSET = 103,
+	VC4_PACKET_Z_CLIPPING = 104,
+	VC4_PACKET_CLIPPER_XY_SCALING = 105,
+	VC4_PACKET_CLIPPER_Z_SCALING = 106,
+
+	VC4_PACKET_TILE_BINNING_MODE_CONFIG = 112,
+	VC4_PACKET_TILE_RENDERING_MODE_CONFIG = 113,
+	VC4_PACKET_CLEAR_COLORS = 114,
+	VC4_PACKET_TILE_COORDINATES = 115,
+
+	/* Not an actual hardware packet -- this is what we use to put
+	 * references to GEM bos in the command stream, since we need the u32
+	 * in the actual address packet in order to store the offset from the
+	 * start of the BO.
+	 */
+	VC4_PACKET_GEM_HANDLES = 254,
+} __attribute__ ((__packed__));
+
+#define VC4_PACKET_HALT_SIZE						1
+#define VC4_PACKET_NOP_SIZE						1
+#define VC4_PACKET_FLUSH_SIZE						1
+#define VC4_PACKET_FLUSH_ALL_SIZE					1
+#define VC4_PACKET_START_TILE_BINNING_SIZE				1
+#define VC4_PACKET_INCREMENT_SEMAPHORE_SIZE				1
+#define VC4_PACKET_WAIT_ON_SEMAPHORE_SIZE				1
+#define VC4_PACKET_BRANCH_SIZE						5
+#define VC4_PACKET_BRANCH_TO_SUB_LIST_SIZE				5
+#define VC4_PACKET_STORE_MS_TILE_BUFFER_SIZE				1
+#define VC4_PACKET_STORE_MS_TILE_BUFFER_AND_EOF_SIZE			1
+#define VC4_PACKET_STORE_FULL_RES_TILE_BUFFER_SIZE			5
+#define VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER_SIZE			5
+#define VC4_PACKET_STORE_TILE_BUFFER_GENERAL_SIZE			7
+#define VC4_PACKET_LOAD_TILE_BUFFER_GENERAL_SIZE			7
+#define VC4_PACKET_GL_INDEXED_PRIMITIVE_SIZE				14
+#define VC4_PACKET_GL_ARRAY_PRIMITIVE_SIZE				10
+#define VC4_PACKET_COMPRESSED_PRIMITIVE_SIZE				1
+#define VC4_PACKET_CLIPPED_COMPRESSED_PRIMITIVE_SIZE			1
+#define VC4_PACKET_PRIMITIVE_LIST_FORMAT_SIZE				2
+#define VC4_PACKET_GL_SHADER_STATE_SIZE					5
+#define VC4_PACKET_NV_SHADER_STATE_SIZE					5
+#define VC4_PACKET_VG_SHADER_STATE_SIZE					5
+#define VC4_PACKET_CONFIGURATION_BITS_SIZE				4
+#define VC4_PACKET_FLAT_SHADE_FLAGS_SIZE				5
+#define VC4_PACKET_POINT_SIZE_SIZE					5
+#define VC4_PACKET_LINE_WIDTH_SIZE					5
+#define VC4_PACKET_RHT_X_BOUNDARY_SIZE					3
+#define VC4_PACKET_DEPTH_OFFSET_SIZE					5
+#define VC4_PACKET_CLIP_WINDOW_SIZE					9
+#define VC4_PACKET_VIEWPORT_OFFSET_SIZE					5
+#define VC4_PACKET_Z_CLIPPING_SIZE					9
+#define VC4_PACKET_CLIPPER_XY_SCALING_SIZE				9
+#define VC4_PACKET_CLIPPER_Z_SCALING_SIZE				9
+#define VC4_PACKET_TILE_BINNING_MODE_CONFIG_SIZE			16
+#define VC4_PACKET_TILE_RENDERING_MODE_CONFIG_SIZE			11
+#define VC4_PACKET_CLEAR_COLORS_SIZE					14
+#define VC4_PACKET_TILE_COORDINATES_SIZE				3
+#define VC4_PACKET_GEM_HANDLES_SIZE					9
+
+/* Number of multisamples supported. */
+#define VC4_MAX_SAMPLES							4
+/* Size of a full resolution color or Z tile buffer load/store. */
+#define VC4_TILE_BUFFER_SIZE			(64 * 64 * 4)
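+/* i.e. one 64x64 pixel tile at 4 bytes per pixel, 16384 bytes. */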
+
+/** @{
+ * Bits used by packets like VC4_PACKET_STORE_TILE_BUFFER_GENERAL and
+ * VC4_PACKET_TILE_RENDERING_MODE_CONFIG.
+ */
+#define VC4_TILING_FORMAT_LINEAR    0
+#define VC4_TILING_FORMAT_T         1
+#define VC4_TILING_FORMAT_LT        2
+/** @} */
+
+/** @{
+ *
+ * low bits of VC4_PACKET_STORE_FULL_RES_TILE_BUFFER and
+ * VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER.
+ */
+#define VC4_LOADSTORE_FULL_RES_EOF                     BIT(3)
+#define VC4_LOADSTORE_FULL_RES_DISABLE_CLEAR_ALL       BIT(2)
+#define VC4_LOADSTORE_FULL_RES_DISABLE_ZS              BIT(1)
+#define VC4_LOADSTORE_FULL_RES_DISABLE_COLOR           BIT(0)
+
+/** @} */
+
+/** @{
+ *
+ * byte 2 of VC4_PACKET_STORE_TILE_BUFFER_GENERAL and
+ * VC4_PACKET_LOAD_TILE_BUFFER_GENERAL (low bits of the address)
+ */
+
+#define VC4_LOADSTORE_TILE_BUFFER_EOF                  BIT(3)
+#define VC4_LOADSTORE_TILE_BUFFER_DISABLE_FULL_VG_MASK BIT(2)
+#define VC4_LOADSTORE_TILE_BUFFER_DISABLE_FULL_ZS      BIT(1)
+#define VC4_LOADSTORE_TILE_BUFFER_DISABLE_FULL_COLOR   BIT(0)
+
+/** @} */
+
+/** @{
+ *
+ * byte 0-1 of VC4_PACKET_STORE_TILE_BUFFER_GENERAL and
+ * VC4_PACKET_LOAD_TILE_BUFFER_GENERAL
+ */
+#define VC4_STORE_TILE_BUFFER_DISABLE_VG_MASK_CLEAR BIT(15)
+#define VC4_STORE_TILE_BUFFER_DISABLE_ZS_CLEAR     BIT(14)
+#define VC4_STORE_TILE_BUFFER_DISABLE_COLOR_CLEAR  BIT(13)
+#define VC4_STORE_TILE_BUFFER_DISABLE_SWAP         BIT(12)
+
+#define VC4_LOADSTORE_TILE_BUFFER_FORMAT_MASK      VC4_MASK(9, 8)
+#define VC4_LOADSTORE_TILE_BUFFER_FORMAT_SHIFT     8
+#define VC4_LOADSTORE_TILE_BUFFER_RGBA8888         0
+#define VC4_LOADSTORE_TILE_BUFFER_BGR565_DITHER    1
+#define VC4_LOADSTORE_TILE_BUFFER_BGR565           2
+/** @} */
+
+/** @{
+ *
+ * byte 0 of VC4_PACKET_STORE_TILE_BUFFER_GENERAL and
+ * VC4_PACKET_LOAD_TILE_BUFFER_GENERAL
+ */
+#define VC4_STORE_TILE_BUFFER_MODE_MASK            VC4_MASK(7, 6)
+#define VC4_STORE_TILE_BUFFER_MODE_SHIFT           6
+#define VC4_STORE_TILE_BUFFER_MODE_SAMPLE0         (0 << 6)
+#define VC4_STORE_TILE_BUFFER_MODE_DECIMATE_X4     (1 << 6)
+#define VC4_STORE_TILE_BUFFER_MODE_DECIMATE_X16    (2 << 6)
+
+/** The values of the field are VC4_TILING_FORMAT_* */
+#define VC4_LOADSTORE_TILE_BUFFER_TILING_MASK      VC4_MASK(5, 4)
+#define VC4_LOADSTORE_TILE_BUFFER_TILING_SHIFT     4
+
+#define VC4_LOADSTORE_TILE_BUFFER_BUFFER_MASK      VC4_MASK(2, 0)
+#define VC4_LOADSTORE_TILE_BUFFER_BUFFER_SHIFT     0
+#define VC4_LOADSTORE_TILE_BUFFER_NONE             0
+#define VC4_LOADSTORE_TILE_BUFFER_COLOR            1
+#define VC4_LOADSTORE_TILE_BUFFER_ZS               2
+#define VC4_LOADSTORE_TILE_BUFFER_Z                3
+#define VC4_LOADSTORE_TILE_BUFFER_VG_MASK          4
+#define VC4_LOADSTORE_TILE_BUFFER_FULL             5
+/** @} */
+
+#define VC4_INDEX_BUFFER_U8                        (0 << 4)
+#define VC4_INDEX_BUFFER_U16                       (1 << 4)
+
+/* This flag is only present in NV shader state. */
+#define VC4_SHADER_FLAG_SHADED_CLIP_COORDS         BIT(3)
+#define VC4_SHADER_FLAG_ENABLE_CLIPPING            BIT(2)
+#define VC4_SHADER_FLAG_VS_POINT_SIZE              BIT(1)
+#define VC4_SHADER_FLAG_FS_SINGLE_THREAD           BIT(0)
+
+/** @{ byte 2 of config bits. */
+#define VC4_CONFIG_BITS_EARLY_Z_UPDATE             BIT(1)
+#define VC4_CONFIG_BITS_EARLY_Z                    BIT(0)
+/** @} */
+
+/** @{ byte 1 of config bits. */
+#define VC4_CONFIG_BITS_Z_UPDATE                   BIT(7)
+/** same values in this 3-bit field as PIPE_FUNC_* */
+#define VC4_CONFIG_BITS_DEPTH_FUNC_SHIFT           4
+#define VC4_CONFIG_BITS_COVERAGE_READ_LEAVE        BIT(3)
+
+#define VC4_CONFIG_BITS_COVERAGE_UPDATE_NONZERO    (0 << 1)
+#define VC4_CONFIG_BITS_COVERAGE_UPDATE_ODD        (1 << 1)
+#define VC4_CONFIG_BITS_COVERAGE_UPDATE_OR         (2 << 1)
+#define VC4_CONFIG_BITS_COVERAGE_UPDATE_ZERO       (3 << 1)
+
+#define VC4_CONFIG_BITS_COVERAGE_PIPE_SELECT       BIT(0)
+/** @} */
+
+/** @{ byte 0 of config bits. */
+#define VC4_CONFIG_BITS_RASTERIZER_OVERSAMPLE_NONE (0 << 6)
+#define VC4_CONFIG_BITS_RASTERIZER_OVERSAMPLE_4X   (1 << 6)
+#define VC4_CONFIG_BITS_RASTERIZER_OVERSAMPLE_16X  (2 << 6)
+
+#define VC4_CONFIG_BITS_AA_POINTS_AND_LINES        BIT(4)
+#define VC4_CONFIG_BITS_ENABLE_DEPTH_OFFSET        BIT(3)
+#define VC4_CONFIG_BITS_CW_PRIMITIVES              BIT(2)
+#define VC4_CONFIG_BITS_ENABLE_PRIM_BACK           BIT(1)
+#define VC4_CONFIG_BITS_ENABLE_PRIM_FRONT          BIT(0)
+/** @} */
+
+/** @{ bits in the last u8 of VC4_PACKET_TILE_BINNING_MODE_CONFIG */
+#define VC4_BIN_CONFIG_DB_NON_MS                   BIT(7)
+
+#define VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_MASK       VC4_MASK(6, 5)
+#define VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_SHIFT      5
+#define VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_32         0
+#define VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_64         1
+#define VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_128        2
+#define VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_256        3
+
+#define VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_MASK  VC4_MASK(4, 3)
+#define VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_SHIFT 3
+#define VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_32    0
+#define VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_64    1
+#define VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_128   2
+#define VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_256   3
+
+#define VC4_BIN_CONFIG_AUTO_INIT_TSDA              BIT(2)
+#define VC4_BIN_CONFIG_TILE_BUFFER_64BIT           BIT(1)
+#define VC4_BIN_CONFIG_MS_MODE_4X                  BIT(0)
+/** @} */
+
+/** @{ bits in the last u16 of VC4_PACKET_TILE_RENDERING_MODE_CONFIG */
+#define VC4_RENDER_CONFIG_DB_NON_MS                BIT(12)
+#define VC4_RENDER_CONFIG_EARLY_Z_COVERAGE_DISABLE BIT(11)
+#define VC4_RENDER_CONFIG_EARLY_Z_DIRECTION_G      BIT(10)
+#define VC4_RENDER_CONFIG_COVERAGE_MODE            BIT(9)
+#define VC4_RENDER_CONFIG_ENABLE_VG_MASK           BIT(8)
+
+/** The values of the field are VC4_TILING_FORMAT_* */
+#define VC4_RENDER_CONFIG_MEMORY_FORMAT_MASK       VC4_MASK(7, 6)
+#define VC4_RENDER_CONFIG_MEMORY_FORMAT_SHIFT      6
+
+#define VC4_RENDER_CONFIG_DECIMATE_MODE_1X         (0 << 4)
+#define VC4_RENDER_CONFIG_DECIMATE_MODE_4X         (1 << 4)
+#define VC4_RENDER_CONFIG_DECIMATE_MODE_16X        (2 << 4)
+
+#define VC4_RENDER_CONFIG_FORMAT_MASK              VC4_MASK(3, 2)
+#define VC4_RENDER_CONFIG_FORMAT_SHIFT             2
+#define VC4_RENDER_CONFIG_FORMAT_BGR565_DITHERED   0
+#define VC4_RENDER_CONFIG_FORMAT_RGBA8888          1
+#define VC4_RENDER_CONFIG_FORMAT_BGR565            2
+
+#define VC4_RENDER_CONFIG_TILE_BUFFER_64BIT        BIT(1)
+#define VC4_RENDER_CONFIG_MS_MODE_4X               BIT(0)
+
+/** @} */
+
+#define VC4_PRIMITIVE_LIST_FORMAT_16_INDEX         (1 << 4)
+#define VC4_PRIMITIVE_LIST_FORMAT_32_XY            (3 << 4)
+#define VC4_PRIMITIVE_LIST_FORMAT_TYPE_POINTS      (0 << 0)
+#define VC4_PRIMITIVE_LIST_FORMAT_TYPE_LINES       (1 << 0)
+#define VC4_PRIMITIVE_LIST_FORMAT_TYPE_TRIANGLES   (2 << 0)
+#define VC4_PRIMITIVE_LIST_FORMAT_TYPE_RHT         (3 << 0)
+
+enum vc4_texture_data_type {
+	VC4_TEXTURE_TYPE_RGBA8888 = 0,
+	VC4_TEXTURE_TYPE_RGBX8888 = 1,
+	VC4_TEXTURE_TYPE_RGBA4444 = 2,
+	VC4_TEXTURE_TYPE_RGBA5551 = 3,
+	VC4_TEXTURE_TYPE_RGB565 = 4,
+	VC4_TEXTURE_TYPE_LUMINANCE = 5,
+	VC4_TEXTURE_TYPE_ALPHA = 6,
+	VC4_TEXTURE_TYPE_LUMALPHA = 7,
+	VC4_TEXTURE_TYPE_ETC1 = 8,
+	VC4_TEXTURE_TYPE_S16F = 9,
+	VC4_TEXTURE_TYPE_S8 = 10,
+	VC4_TEXTURE_TYPE_S16 = 11,
+	VC4_TEXTURE_TYPE_BW1 = 12,
+	VC4_TEXTURE_TYPE_A4 = 13,
+	VC4_TEXTURE_TYPE_A1 = 14,
+	VC4_TEXTURE_TYPE_RGBA64 = 15,
+	VC4_TEXTURE_TYPE_RGBA32R = 16,
+	VC4_TEXTURE_TYPE_YUV422R = 17,
+};
+
+#define VC4_TEX_P0_OFFSET_MASK                     VC4_MASK(31, 12)
+#define VC4_TEX_P0_OFFSET_SHIFT                    12
+#define VC4_TEX_P0_CSWIZ_MASK                      VC4_MASK(11, 10)
+#define VC4_TEX_P0_CSWIZ_SHIFT                     10
+#define VC4_TEX_P0_CMMODE_MASK                     VC4_MASK(9, 9)
+#define VC4_TEX_P0_CMMODE_SHIFT                    9
+#define VC4_TEX_P0_FLIPY_MASK                      VC4_MASK(8, 8)
+#define VC4_TEX_P0_FLIPY_SHIFT                     8
+#define VC4_TEX_P0_TYPE_MASK                       VC4_MASK(7, 4)
+#define VC4_TEX_P0_TYPE_SHIFT                      4
+#define VC4_TEX_P0_MIPLVLS_MASK                    VC4_MASK(3, 0)
+#define VC4_TEX_P0_MIPLVLS_SHIFT                   0
+
+#define VC4_TEX_P1_TYPE4_MASK                      VC4_MASK(31, 31)
+#define VC4_TEX_P1_TYPE4_SHIFT                     31
+#define VC4_TEX_P1_HEIGHT_MASK                     VC4_MASK(30, 20)
+#define VC4_TEX_P1_HEIGHT_SHIFT                    20
+#define VC4_TEX_P1_ETCFLIP_MASK                    VC4_MASK(19, 19)
+#define VC4_TEX_P1_ETCFLIP_SHIFT                   19
+#define VC4_TEX_P1_WIDTH_MASK                      VC4_MASK(18, 8)
+#define VC4_TEX_P1_WIDTH_SHIFT                     8
+
+#define VC4_TEX_P1_MAGFILT_MASK                    VC4_MASK(7, 7)
+#define VC4_TEX_P1_MAGFILT_SHIFT                   7
+# define VC4_TEX_P1_MAGFILT_LINEAR                 0
+# define VC4_TEX_P1_MAGFILT_NEAREST                1
+
+#define VC4_TEX_P1_MINFILT_MASK                    VC4_MASK(6, 4)
+#define VC4_TEX_P1_MINFILT_SHIFT                   4
+# define VC4_TEX_P1_MINFILT_LINEAR                 0
+# define VC4_TEX_P1_MINFILT_NEAREST                1
+# define VC4_TEX_P1_MINFILT_NEAR_MIP_NEAR          2
+# define VC4_TEX_P1_MINFILT_NEAR_MIP_LIN           3
+# define VC4_TEX_P1_MINFILT_LIN_MIP_NEAR           4
+# define VC4_TEX_P1_MINFILT_LIN_MIP_LIN            5
+
+#define VC4_TEX_P1_WRAP_T_MASK                     VC4_MASK(3, 2)
+#define VC4_TEX_P1_WRAP_T_SHIFT                    2
+#define VC4_TEX_P1_WRAP_S_MASK                     VC4_MASK(1, 0)
+#define VC4_TEX_P1_WRAP_S_SHIFT                    0
+# define VC4_TEX_P1_WRAP_REPEAT                    0
+# define VC4_TEX_P1_WRAP_CLAMP                     1
+# define VC4_TEX_P1_WRAP_MIRROR                    2
+# define VC4_TEX_P1_WRAP_BORDER                    3
+
+#define VC4_TEX_P2_PTYPE_MASK                      VC4_MASK(31, 30)
+#define VC4_TEX_P2_PTYPE_SHIFT                     30
+# define VC4_TEX_P2_PTYPE_IGNORED                  0
+# define VC4_TEX_P2_PTYPE_CUBE_MAP_STRIDE          1
+# define VC4_TEX_P2_PTYPE_CHILD_IMAGE_DIMENSIONS   2
+# define VC4_TEX_P2_PTYPE_CHILD_IMAGE_OFFSETS      3
+
+/* VC4_TEX_P2_PTYPE_CUBE_MAP_STRIDE bits */
+#define VC4_TEX_P2_CMST_MASK                       VC4_MASK(29, 12)
+#define VC4_TEX_P2_CMST_SHIFT                      12
+#define VC4_TEX_P2_BSLOD_MASK                      VC4_MASK(0, 0)
+#define VC4_TEX_P2_BSLOD_SHIFT                     0
+
+/* VC4_TEX_P2_PTYPE_CHILD_IMAGE_DIMENSIONS */
+#define VC4_TEX_P2_CHEIGHT_MASK                    VC4_MASK(22, 12)
+#define VC4_TEX_P2_CHEIGHT_SHIFT                   12
+#define VC4_TEX_P2_CWIDTH_MASK                     VC4_MASK(10, 0)
+#define VC4_TEX_P2_CWIDTH_SHIFT                    0
+
+/* VC4_TEX_P2_PTYPE_CHILD_IMAGE_OFFSETS */
+#define VC4_TEX_P2_CYOFF_MASK                      VC4_MASK(22, 12)
+#define VC4_TEX_P2_CYOFF_SHIFT                     12
+#define VC4_TEX_P2_CXOFF_MASK                      VC4_MASK(10, 0)
+#define VC4_TEX_P2_CXOFF_SHIFT                     0
+
+#endif /* VC4_PACKET_H */
diff --git a/drivers/gpu/drm/vc4/vc4_render_cl.c b/drivers/gpu/drm/vc4/vc4_render_cl.c
new file mode 100644
index 0000000..1dc3996
--- /dev/null
+++ b/drivers/gpu/drm/vc4/vc4_render_cl.c
@@ -0,0 +1,631 @@
+/*
+ * Copyright © 2014-2015 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * DOC: Render command list generation
+ *
+ * In the VC4 driver, render command list generation is performed by the
+ * kernel instead of userspace.  We do this because validating a
+ * user-submitted command list is hard to get right and has high CPU overhead,
+ * while the number of valid configurations for render command lists is
+ * actually fairly low.
+ */
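+/*
+ * As a rough sketch (the exact packets depend on which surfaces are
+ * bound), the list built below looks like:
+ *
+ *   VC4_PACKET_TILE_RENDERING_MODE_CONFIG
+ *   [VC4_PACKET_CLEAR_COLORS plus a no-op store, if clearing]
+ *   for each tile:
+ *     [loads of color/ZS, with a store/tile-coords pair between them]
+ *     VC4_PACKET_TILE_COORDINATES
+ *     [VC4_PACKET_WAIT_ON_SEMAPHORE on the first tile, if binning]
+ *     [VC4_PACKET_BRANCH_TO_SUB_LIST into the binner's per-tile list]
+ *     stores of color/ZS (VC4_PACKET_STORE_MS_TILE_BUFFER_AND_EOF last)
+ */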
+
+#include "uapi/drm/vc4_drm.h"
+#include "vc4_drv.h"
+#include "vc4_packet.h"
+
+struct vc4_rcl_setup {
+	struct drm_gem_cma_object *color_read;
+	struct drm_gem_cma_object *color_write;
+	struct drm_gem_cma_object *zs_read;
+	struct drm_gem_cma_object *zs_write;
+	struct drm_gem_cma_object *msaa_color_write;
+	struct drm_gem_cma_object *msaa_zs_write;
+
+	struct drm_gem_cma_object *rcl;
+	u32 next_offset;
+};
+
+static inline void rcl_u8(struct vc4_rcl_setup *setup, u8 val)
+{
+	*(u8 *)(setup->rcl->vaddr + setup->next_offset) = val;
+	setup->next_offset += 1;
+}
+
+static inline void rcl_u16(struct vc4_rcl_setup *setup, u16 val)
+{
+	*(u16 *)(setup->rcl->vaddr + setup->next_offset) = val;
+	setup->next_offset += 2;
+}
+
+static inline void rcl_u32(struct vc4_rcl_setup *setup, u32 val)
+{
+	*(u32 *)(setup->rcl->vaddr + setup->next_offset) = val;
+	setup->next_offset += 4;
+}
+
+/*
+ * Emits a no-op STORE_TILE_BUFFER_GENERAL.
+ *
+ * If we emit a PACKET_TILE_COORDINATES, it must be followed by a store of
+ * some sort before another load is triggered.
+ */
+static void vc4_store_before_load(struct vc4_rcl_setup *setup)
+{
+	rcl_u8(setup, VC4_PACKET_STORE_TILE_BUFFER_GENERAL);
+	rcl_u16(setup,
+		VC4_SET_FIELD(VC4_LOADSTORE_TILE_BUFFER_NONE,
+			      VC4_LOADSTORE_TILE_BUFFER_BUFFER) |
+		VC4_STORE_TILE_BUFFER_DISABLE_COLOR_CLEAR |
+		VC4_STORE_TILE_BUFFER_DISABLE_ZS_CLEAR |
+		VC4_STORE_TILE_BUFFER_DISABLE_VG_MASK_CLEAR);
+	rcl_u32(setup, 0); /* no address, since we're in None mode */
+}
+
+/*
+ * Calculates the physical address of the start of a tile in a RCL surface.
+ *
+ * Unlike the other load/store packets,
+ * VC4_PACKET_LOAD/STORE_FULL_RES_TILE_BUFFER don't look at the tile
+ * coordinates packet, and instead just store to the address given.
+ */
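+/*
+ * For example, a 96-pixel-wide frame has DIV_ROUND_UP(96, 32) == 3
+ * full-res tiles per row, so tile (x, y) == (1, 2) lands at
+ * surf->offset + 7 * VC4_TILE_BUFFER_SIZE into the BO.
+ */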
+static uint32_t vc4_full_res_offset(struct vc4_exec_info *exec,
+				    struct drm_gem_cma_object *bo,
+				    struct drm_vc4_submit_rcl_surface *surf,
+				    uint8_t x, uint8_t y)
+{
+	return bo->paddr + surf->offset + VC4_TILE_BUFFER_SIZE *
+		(DIV_ROUND_UP(exec->args->width, 32) * y + x);
+}
+
+/*
+ * Emits a PACKET_TILE_COORDINATES if one isn't already pending.
+ *
+ * The tile coordinates packet triggers a pending load if there is one, is
+ * used for clipping during rendering, and determines where loads/stores
+ * happen relative to their base address.
+ */
+static void vc4_tile_coordinates(struct vc4_rcl_setup *setup,
+				 uint32_t x, uint32_t y)
+{
+	rcl_u8(setup, VC4_PACKET_TILE_COORDINATES);
+	rcl_u8(setup, x);
+	rcl_u8(setup, y);
+}
+
+static void emit_tile(struct vc4_exec_info *exec,
+		      struct vc4_rcl_setup *setup,
+		      uint8_t x, uint8_t y, bool first, bool last)
+{
+	struct drm_vc4_submit_cl *args = exec->args;
+	bool has_bin = args->bin_cl_size != 0;
+
+	/* Note that the load doesn't actually occur until the
+	 * tile coords packet is processed, and only one load
+	 * may be outstanding at a time.
+	 */
+	if (setup->color_read) {
+		if (args->color_read.flags &
+		    VC4_SUBMIT_RCL_SURFACE_READ_IS_FULL_RES) {
+			rcl_u8(setup, VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER);
+			rcl_u32(setup,
+				vc4_full_res_offset(exec, setup->color_read,
+						    &args->color_read, x, y) |
+				VC4_LOADSTORE_FULL_RES_DISABLE_ZS);
+		} else {
+			rcl_u8(setup, VC4_PACKET_LOAD_TILE_BUFFER_GENERAL);
+			rcl_u16(setup, args->color_read.bits);
+			rcl_u32(setup, setup->color_read->paddr +
+				args->color_read.offset);
+		}
+	}
+
+	if (setup->zs_read) {
+		if (args->zs_read.flags &
+		    VC4_SUBMIT_RCL_SURFACE_READ_IS_FULL_RES) {
+			rcl_u8(setup, VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER);
+			rcl_u32(setup,
+				vc4_full_res_offset(exec, setup->zs_read,
+						    &args->zs_read, x, y) |
+				VC4_LOADSTORE_FULL_RES_DISABLE_COLOR);
+		} else {
+			if (setup->color_read) {
+				/* Exec previous load. */
+				vc4_tile_coordinates(setup, x, y);
+				vc4_store_before_load(setup);
+			}
+
+			rcl_u8(setup, VC4_PACKET_LOAD_TILE_BUFFER_GENERAL);
+			rcl_u16(setup, args->zs_read.bits);
+			rcl_u32(setup, setup->zs_read->paddr +
+				args->zs_read.offset);
+		}
+	}
+
+	/* Clipping depends on tile coordinates having been
+	 * emitted, so we always need one here.
+	 */
+	vc4_tile_coordinates(setup, x, y);
+
+	/* Wait for the binner before jumping to the first
+	 * tile's lists.
+	 */
+	if (first && has_bin)
+		rcl_u8(setup, VC4_PACKET_WAIT_ON_SEMAPHORE);
+
+	if (has_bin) {
+		rcl_u8(setup, VC4_PACKET_BRANCH_TO_SUB_LIST);
+		rcl_u32(setup, (exec->tile_bo->paddr +
+				exec->tile_alloc_offset +
+				(y * exec->bin_tiles_x + x) * 32));
+	}
+
+	if (setup->msaa_color_write) {
+		bool last_tile_write = (!setup->msaa_zs_write &&
+					!setup->zs_write &&
+					!setup->color_write);
+		uint32_t bits = VC4_LOADSTORE_FULL_RES_DISABLE_ZS;
+
+		if (!last_tile_write)
+			bits |= VC4_LOADSTORE_FULL_RES_DISABLE_CLEAR_ALL;
+		else if (last)
+			bits |= VC4_LOADSTORE_FULL_RES_EOF;
+		rcl_u8(setup, VC4_PACKET_STORE_FULL_RES_TILE_BUFFER);
+		rcl_u32(setup,
+			vc4_full_res_offset(exec, setup->msaa_color_write,
+					    &args->msaa_color_write, x, y) |
+			bits);
+	}
+
+	if (setup->msaa_zs_write) {
+		bool last_tile_write = (!setup->zs_write &&
+					!setup->color_write);
+		uint32_t bits = VC4_LOADSTORE_FULL_RES_DISABLE_COLOR;
+
+		if (setup->msaa_color_write)
+			vc4_tile_coordinates(setup, x, y);
+		if (!last_tile_write)
+			bits |= VC4_LOADSTORE_FULL_RES_DISABLE_CLEAR_ALL;
+		else if (last)
+			bits |= VC4_LOADSTORE_FULL_RES_EOF;
+		rcl_u8(setup, VC4_PACKET_STORE_FULL_RES_TILE_BUFFER);
+		rcl_u32(setup,
+			vc4_full_res_offset(exec, setup->msaa_zs_write,
+					    &args->msaa_zs_write, x, y) |
+			bits);
+	}
+
+	if (setup->zs_write) {
+		bool last_tile_write = !setup->color_write;
+
+		if (setup->msaa_color_write || setup->msaa_zs_write)
+			vc4_tile_coordinates(setup, x, y);
+
+		rcl_u8(setup, VC4_PACKET_STORE_TILE_BUFFER_GENERAL);
+		rcl_u16(setup, args->zs_write.bits |
+			(last_tile_write ?
+			 0 : VC4_STORE_TILE_BUFFER_DISABLE_COLOR_CLEAR));
+		rcl_u32(setup,
+			(setup->zs_write->paddr + args->zs_write.offset) |
+			((last && last_tile_write) ?
+			 VC4_LOADSTORE_TILE_BUFFER_EOF : 0));
+	}
+
+	if (setup->color_write) {
+		if (setup->msaa_color_write || setup->msaa_zs_write ||
+		    setup->zs_write) {
+			vc4_tile_coordinates(setup, x, y);
+		}
+
+		if (last)
+			rcl_u8(setup, VC4_PACKET_STORE_MS_TILE_BUFFER_AND_EOF);
+		else
+			rcl_u8(setup, VC4_PACKET_STORE_MS_TILE_BUFFER);
+	}
+}
+
+static int vc4_create_rcl_bo(struct drm_device *dev, struct vc4_exec_info *exec,
+			     struct vc4_rcl_setup *setup)
+{
+	struct drm_vc4_submit_cl *args = exec->args;
+	bool has_bin = args->bin_cl_size != 0;
+	uint8_t min_x_tile = args->min_x_tile;
+	uint8_t min_y_tile = args->min_y_tile;
+	uint8_t max_x_tile = args->max_x_tile;
+	uint8_t max_y_tile = args->max_y_tile;
+	uint8_t xtiles = max_x_tile - min_x_tile + 1;
+	uint8_t ytiles = max_y_tile - min_y_tile + 1;
+	uint8_t x, y;
+	uint32_t size, loop_body_size;
+	struct vc4_bo *rcl_bo;
+
+	size = VC4_PACKET_TILE_RENDERING_MODE_CONFIG_SIZE;
+	loop_body_size = VC4_PACKET_TILE_COORDINATES_SIZE;
+
+	if (args->flags & VC4_SUBMIT_CL_USE_CLEAR_COLOR) {
+		size += VC4_PACKET_CLEAR_COLORS_SIZE +
+			VC4_PACKET_TILE_COORDINATES_SIZE +
+			VC4_PACKET_STORE_TILE_BUFFER_GENERAL_SIZE;
+	}
+
+	if (setup->color_read) {
+		if (args->color_read.flags &
+		    VC4_SUBMIT_RCL_SURFACE_READ_IS_FULL_RES) {
+			loop_body_size += VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER_SIZE;
+		} else {
+			loop_body_size += VC4_PACKET_LOAD_TILE_BUFFER_GENERAL_SIZE;
+		}
+	}
+	if (setup->zs_read) {
+		if (args->zs_read.flags &
+		    VC4_SUBMIT_RCL_SURFACE_READ_IS_FULL_RES) {
+			loop_body_size += VC4_PACKET_LOAD_FULL_RES_TILE_BUFFER_SIZE;
+		} else {
+			if (setup->color_read &&
+			    !(args->color_read.flags &
+			      VC4_SUBMIT_RCL_SURFACE_READ_IS_FULL_RES)) {
+				loop_body_size += VC4_PACKET_TILE_COORDINATES_SIZE;
+				loop_body_size += VC4_PACKET_STORE_TILE_BUFFER_GENERAL_SIZE;
+			}
+			loop_body_size += VC4_PACKET_LOAD_TILE_BUFFER_GENERAL_SIZE;
+		}
+	}
+
+	if (has_bin) {
+		size += VC4_PACKET_WAIT_ON_SEMAPHORE_SIZE;
+		loop_body_size += VC4_PACKET_BRANCH_TO_SUB_LIST_SIZE;
+	}
+
+	if (setup->msaa_color_write)
+		loop_body_size += VC4_PACKET_STORE_FULL_RES_TILE_BUFFER_SIZE;
+	if (setup->msaa_zs_write)
+		loop_body_size += VC4_PACKET_STORE_FULL_RES_TILE_BUFFER_SIZE;
+
+	if (setup->zs_write)
+		loop_body_size += VC4_PACKET_STORE_TILE_BUFFER_GENERAL_SIZE;
+	if (setup->color_write)
+		loop_body_size += VC4_PACKET_STORE_MS_TILE_BUFFER_SIZE;
+
+	/* We need a VC4_PACKET_TILE_COORDINATES in between each store. */
+	loop_body_size += VC4_PACKET_TILE_COORDINATES_SIZE *
+		((setup->msaa_color_write != NULL) +
+		 (setup->msaa_zs_write != NULL) +
+		 (setup->color_write != NULL) +
+		 (setup->zs_write != NULL) - 1);
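+	/* (E.g. with only color and Z/S stores bound, that's one extra
+	 * tile coordinates packet per tile.)
+	 */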
+
+	size += xtiles * ytiles * loop_body_size;
+
+	rcl_bo = vc4_bo_create(dev, size, true);
+	if (!rcl_bo)
+		return -ENOMEM;
+	setup->rcl = &rcl_bo->base;
+	list_add_tail(&rcl_bo->unref_head, &exec->unref_list);
+
+	rcl_u8(setup, VC4_PACKET_TILE_RENDERING_MODE_CONFIG);
+	rcl_u32(setup,
+		(setup->color_write ? (setup->color_write->paddr +
+				       args->color_write.offset) :
+		 0));
+	rcl_u16(setup, args->width);
+	rcl_u16(setup, args->height);
+	rcl_u16(setup, args->color_write.bits);
+
+	/* The tile buffer gets cleared when the previous tile is stored.  If
+	 * the clear values changed between frames, then the tile buffer has
+	 * stale clear values in it, so we have to do a store in None mode (no
+	 * writes) so that we trigger the tile buffer clear.
+	 */
+	if (args->flags & VC4_SUBMIT_CL_USE_CLEAR_COLOR) {
+		rcl_u8(setup, VC4_PACKET_CLEAR_COLORS);
+		rcl_u32(setup, args->clear_color[0]);
+		rcl_u32(setup, args->clear_color[1]);
+		rcl_u32(setup, args->clear_z);
+		rcl_u8(setup, args->clear_s);
+
+		vc4_tile_coordinates(setup, 0, 0);
+
+		rcl_u8(setup, VC4_PACKET_STORE_TILE_BUFFER_GENERAL);
+		rcl_u16(setup, VC4_LOADSTORE_TILE_BUFFER_NONE);
+		rcl_u32(setup, 0); /* no address, since we're in None mode */
+	}
+
+	for (y = min_y_tile; y <= max_y_tile; y++) {
+		for (x = min_x_tile; x <= max_x_tile; x++) {
+			bool first = (x == min_x_tile && y == min_y_tile);
+			bool last = (x == max_x_tile && y == max_y_tile);
+
+			emit_tile(exec, setup, x, y, first, last);
+		}
+	}
+
+	BUG_ON(setup->next_offset != size);
+	exec->ct1ca = setup->rcl->paddr;
+	exec->ct1ea = setup->rcl->paddr + setup->next_offset;
+
+	return 0;
+}
+
+static int vc4_full_res_bounds_check(struct vc4_exec_info *exec,
+				     struct drm_gem_cma_object *obj,
+				     struct drm_vc4_submit_rcl_surface *surf)
+{
+	struct drm_vc4_submit_cl *args = exec->args;
+	u32 render_tiles_stride = DIV_ROUND_UP(exec->args->width, 32);
+
+	if (surf->offset > obj->base.size) {
+		DRM_ERROR("surface offset %d > BO size %zd\n",
+			  surf->offset, obj->base.size);
+		return -EINVAL;
+	}
+
+	if ((obj->base.size - surf->offset) / VC4_TILE_BUFFER_SIZE <
+	    render_tiles_stride * args->max_y_tile + args->max_x_tile) {
+		DRM_ERROR("MSAA tile %d, %d out of bounds "
+			  "(bo size %zd, offset %d).\n",
+			  args->max_x_tile, args->max_y_tile,
+			  obj->base.size,
+			  surf->offset);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int vc4_rcl_msaa_surface_setup(struct vc4_exec_info *exec,
+				      struct drm_gem_cma_object **obj,
+				      struct drm_vc4_submit_rcl_surface *surf)
+{
+	if (surf->flags != 0 || surf->bits != 0) {
+		DRM_ERROR("MSAA surface had nonzero flags/bits\n");
+		return -EINVAL;
+	}
+
+	if (surf->hindex == ~0)
+		return 0;
+
+	if (!vc4_use_bo(exec, surf->hindex, VC4_MODE_RENDER, obj))
+		return -EINVAL;
+
+	if (surf->offset & 0xf) {
+		DRM_ERROR("MSAA write must be 16b aligned.\n");
+		return -EINVAL;
+	}
+
+	return vc4_full_res_bounds_check(exec, *obj, surf);
+}
+
+static int vc4_rcl_surface_setup(struct vc4_exec_info *exec,
+				 struct drm_gem_cma_object **obj,
+				 struct drm_vc4_submit_rcl_surface *surf)
+{
+	uint8_t tiling = VC4_GET_FIELD(surf->bits,
+				       VC4_LOADSTORE_TILE_BUFFER_TILING);
+	uint8_t buffer = VC4_GET_FIELD(surf->bits,
+				       VC4_LOADSTORE_TILE_BUFFER_BUFFER);
+	uint8_t format = VC4_GET_FIELD(surf->bits,
+				       VC4_LOADSTORE_TILE_BUFFER_FORMAT);
+	int cpp;
+	int ret;
+
+	if (surf->flags & ~VC4_SUBMIT_RCL_SURFACE_READ_IS_FULL_RES) {
+		DRM_ERROR("Extra flags set\n");
+		return -EINVAL;
+	}
+
+	if (surf->hindex == ~0)
+		return 0;
+
+	if (!vc4_use_bo(exec, surf->hindex, VC4_MODE_RENDER, obj))
+		return -EINVAL;
+
+	if (surf->flags & VC4_SUBMIT_RCL_SURFACE_READ_IS_FULL_RES) {
+		if (surf == &exec->args->zs_write) {
+			DRM_ERROR("general zs write may not be full-res.\n");
+			return -EINVAL;
+		}
+
+		if (surf->bits != 0) {
+			DRM_ERROR("load/store general bits set with "
+				  "full res load/store.\n");
+			return -EINVAL;
+		}
+
+		ret = vc4_full_res_bounds_check(exec, *obj, surf);
+		if (ret)
+			return ret;
+
+		return 0;
+	}
+
+	if (surf->bits & ~(VC4_LOADSTORE_TILE_BUFFER_TILING_MASK |
+			   VC4_LOADSTORE_TILE_BUFFER_BUFFER_MASK |
+			   VC4_LOADSTORE_TILE_BUFFER_FORMAT_MASK)) {
+		DRM_ERROR("Unknown bits in load/store: 0x%04x\n",
+			  surf->bits);
+		return -EINVAL;
+	}
+
+	if (tiling > VC4_TILING_FORMAT_LT) {
+		DRM_ERROR("Bad tiling format\n");
+		return -EINVAL;
+	}
+
+	if (buffer == VC4_LOADSTORE_TILE_BUFFER_ZS) {
+		if (format != 0) {
+			DRM_ERROR("No color format should be set for ZS\n");
+			return -EINVAL;
+		}
+		cpp = 4;
+	} else if (buffer == VC4_LOADSTORE_TILE_BUFFER_COLOR) {
+		switch (format) {
+		case VC4_LOADSTORE_TILE_BUFFER_BGR565:
+		case VC4_LOADSTORE_TILE_BUFFER_BGR565_DITHER:
+			cpp = 2;
+			break;
+		case VC4_LOADSTORE_TILE_BUFFER_RGBA8888:
+			cpp = 4;
+			break;
+		default:
+			DRM_ERROR("Bad tile buffer format\n");
+			return -EINVAL;
+		}
+	} else {
+		DRM_ERROR("Bad load/store buffer %d.\n", buffer);
+		return -EINVAL;
+	}
+
+	if (surf->offset & 0xf) {
+		DRM_ERROR("load/store buffer must be 16b aligned.\n");
+		return -EINVAL;
+	}
+
+	if (!vc4_check_tex_size(exec, *obj, surf->offset, tiling,
+				exec->args->width, exec->args->height, cpp)) {
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int
+vc4_rcl_render_config_surface_setup(struct vc4_exec_info *exec,
+				    struct vc4_rcl_setup *setup,
+				    struct drm_gem_cma_object **obj,
+				    struct drm_vc4_submit_rcl_surface *surf)
+{
+	uint8_t tiling = VC4_GET_FIELD(surf->bits,
+				       VC4_RENDER_CONFIG_MEMORY_FORMAT);
+	uint8_t format = VC4_GET_FIELD(surf->bits,
+				       VC4_RENDER_CONFIG_FORMAT);
+	int cpp;
+
+	if (surf->flags != 0) {
+		DRM_ERROR("No flags supported on render config.\n");
+		return -EINVAL;
+	}
+
+	if (surf->bits & ~(VC4_RENDER_CONFIG_MEMORY_FORMAT_MASK |
+			   VC4_RENDER_CONFIG_FORMAT_MASK |
+			   VC4_RENDER_CONFIG_MS_MODE_4X |
+			   VC4_RENDER_CONFIG_DECIMATE_MODE_4X)) {
+		DRM_ERROR("Unknown bits in render config: 0x%04x\n",
+			  surf->bits);
+		return -EINVAL;
+	}
+
+	if (surf->hindex == ~0)
+		return 0;
+
+	if (!vc4_use_bo(exec, surf->hindex, VC4_MODE_RENDER, obj))
+		return -EINVAL;
+
+	if (tiling > VC4_TILING_FORMAT_LT) {
+		DRM_ERROR("Bad tiling format\n");
+		return -EINVAL;
+	}
+
+	switch (format) {
+	case VC4_RENDER_CONFIG_FORMAT_BGR565_DITHERED:
+	case VC4_RENDER_CONFIG_FORMAT_BGR565:
+		cpp = 2;
+		break;
+	case VC4_RENDER_CONFIG_FORMAT_RGBA8888:
+		cpp = 4;
+		break;
+	default:
+		DRM_ERROR("Bad tile buffer format\n");
+		return -EINVAL;
+	}
+
+	if (!vc4_check_tex_size(exec, *obj, surf->offset, tiling,
+				exec->args->width, exec->args->height, cpp)) {
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+int vc4_get_rcl(struct drm_device *dev, struct vc4_exec_info *exec)
+{
+	struct vc4_rcl_setup setup = {0};
+	struct drm_vc4_submit_cl *args = exec->args;
+	bool has_bin = args->bin_cl_size != 0;
+	int ret;
+
+	if (args->min_x_tile > args->max_x_tile ||
+	    args->min_y_tile > args->max_y_tile) {
+		DRM_ERROR("Bad render tile set (%d,%d)-(%d,%d)\n",
+			  args->min_x_tile, args->min_y_tile,
+			  args->max_x_tile, args->max_y_tile);
+		return -EINVAL;
+	}
+
+	if (has_bin &&
+	    (args->max_x_tile > exec->bin_tiles_x ||
+	     args->max_y_tile > exec->bin_tiles_y)) {
+		DRM_ERROR("Render tiles (%d,%d) outside of bin config "
+			  "(%d,%d)\n",
+			  args->max_x_tile, args->max_y_tile,
+			  exec->bin_tiles_x, exec->bin_tiles_y);
+		return -EINVAL;
+	}
+
+	ret = vc4_rcl_render_config_surface_setup(exec, &setup,
+						  &setup.color_write,
+						  &args->color_write);
+	if (ret)
+		return ret;
+
+	ret = vc4_rcl_surface_setup(exec, &setup.color_read, &args->color_read);
+	if (ret)
+		return ret;
+
+	ret = vc4_rcl_surface_setup(exec, &setup.zs_read, &args->zs_read);
+	if (ret)
+		return ret;
+
+	ret = vc4_rcl_surface_setup(exec, &setup.zs_write, &args->zs_write);
+	if (ret)
+		return ret;
+
+	ret = vc4_rcl_msaa_surface_setup(exec, &setup.msaa_color_write,
+					 &args->msaa_color_write);
+	if (ret)
+		return ret;
+
+	ret = vc4_rcl_msaa_surface_setup(exec, &setup.msaa_zs_write,
+					 &args->msaa_zs_write);
+	if (ret)
+		return ret;
+
+	/* We shouldn't even have the job submitted to us if there's no
+	 * surface to write out.
+	 */
+	if (!setup.color_write && !setup.zs_write &&
+	    !setup.msaa_color_write && !setup.msaa_zs_write) {
+		DRM_ERROR("RCL requires color or Z/S write\n");
+		return -EINVAL;
+	}
+
+	return vc4_create_rcl_bo(dev, exec, &setup);
+}
diff --git a/drivers/gpu/drm/vc4/vc4_trace.h b/drivers/gpu/drm/vc4/vc4_trace.h
new file mode 100644
index 0000000..ad7b1ea
--- /dev/null
+++ b/drivers/gpu/drm/vc4/vc4_trace.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (C) 2015 Broadcom
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#if !defined(_VC4_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ)
+#define _VC4_TRACE_H_
+
+#include <linux/stringify.h>
+#include <linux/types.h>
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM vc4
+#define TRACE_INCLUDE_FILE vc4_trace
+
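+/*
+ * These expand to trace_vc4_wait_for_seqno_begin()/_end() calls around the
+ * seqno wait; when enabled, they show up under the "vc4" event system in
+ * tracefs.
+ */
+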
+TRACE_EVENT(vc4_wait_for_seqno_begin,
+	    TP_PROTO(struct drm_device *dev, uint64_t seqno, uint64_t timeout),
+	    TP_ARGS(dev, seqno, timeout),
+
+	    TP_STRUCT__entry(
+			     __field(u32, dev)
+			     __field(u64, seqno)
+			     __field(u64, timeout)
+			     ),
+
+	    TP_fast_assign(
+			   __entry->dev = dev->primary->index;
+			   __entry->seqno = seqno;
+			   __entry->timeout = timeout;
+			   ),
+
+	    TP_printk("dev=%u, seqno=%llu, timeout=%llu",
+		      __entry->dev, __entry->seqno, __entry->timeout)
+);
+
+TRACE_EVENT(vc4_wait_for_seqno_end,
+	    TP_PROTO(struct drm_device *dev, uint64_t seqno),
+	    TP_ARGS(dev, seqno),
+
+	    TP_STRUCT__entry(
+			     __field(u32, dev)
+			     __field(u64, seqno)
+			     ),
+
+	    TP_fast_assign(
+			   __entry->dev = dev->primary->index;
+			   __entry->seqno = seqno;
+			   ),
+
+	    TP_printk("dev=%u, seqno=%llu",
+		      __entry->dev, __entry->seqno)
+);
+
+#endif /* _VC4_TRACE_H_ */
+
+/* This part must be outside protection */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#include <trace/define_trace.h>
diff --git a/drivers/gpu/drm/vc4/vc4_trace_points.c b/drivers/gpu/drm/vc4/vc4_trace_points.c
new file mode 100644
index 0000000..e6278f2
--- /dev/null
+++ b/drivers/gpu/drm/vc4/vc4_trace_points.c
@@ -0,0 +1,14 @@
+/*
+ * Copyright (C) 2015 Broadcom
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include "vc4_drv.h"
+
+#ifndef __CHECKER__
+#define CREATE_TRACE_POINTS
+#include "vc4_trace.h"
+#endif
diff --git a/drivers/gpu/drm/vc4/vc4_v3d.c b/drivers/gpu/drm/vc4/vc4_v3d.c
index 040ad0d..424d515 100644
--- a/drivers/gpu/drm/vc4/vc4_v3d.c
+++ b/drivers/gpu/drm/vc4/vc4_v3d.c
@@ -144,6 +144,21 @@ int vc4_v3d_debugfs_ident(struct seq_file *m, void *unused)
 }
 #endif /* CONFIG_DEBUG_FS */
 
+/*
+ * Asks the firmware to turn on power to the V3D engine.
+ *
+ * This may be doable with just the clocks interface, though this
+ * packet does some other register setup from the firmware, too.
+ */
+int
+vc4_v3d_set_power(struct vc4_dev *vc4, bool on)
+{
+	if (on)
+		return pm_generic_resume(&vc4->v3d->pdev->dev);
+	else
+		return pm_generic_poweroff(&vc4->v3d->pdev->dev);
+}
+
 static void vc4_v3d_init_hw(struct drm_device *dev)
 {
 	struct vc4_dev *vc4 = to_vc4_dev(dev);
@@ -161,6 +176,7 @@ static int vc4_v3d_bind(struct device *dev, struct device *master, void *data)
 	struct drm_device *drm = dev_get_drvdata(master);
 	struct vc4_dev *vc4 = to_vc4_dev(drm);
 	struct vc4_v3d *v3d = NULL;
+	int ret;
 
 	v3d = devm_kzalloc(&pdev->dev, sizeof(*v3d), GFP_KERNEL);
 	if (!v3d)
@@ -180,8 +196,20 @@ static int vc4_v3d_bind(struct device *dev, struct device *master, void *data)
 		return -EINVAL;
 	}
 
+	/* Reset the binner overflow address/size at setup, to be sure
+	 * we don't reuse an old one.
+	 */
+	V3D_WRITE(V3D_BPOA, 0);
+	V3D_WRITE(V3D_BPOS, 0);
+
 	vc4_v3d_init_hw(drm);
 
+	ret = drm_irq_install(drm, platform_get_irq(pdev, 0));
+	if (ret) {
+		DRM_ERROR("Failed to install IRQ handler\n");
+		return ret;
+	}
+
 	return 0;
 }
 
@@ -191,6 +219,15 @@ static void vc4_v3d_unbind(struct device *dev, struct device *master,
 	struct drm_device *drm = dev_get_drvdata(master);
 	struct vc4_dev *vc4 = to_vc4_dev(drm);
 
+	drm_irq_uninstall(drm);
+
+	/* Disable the binner's overflow memory address, so the next
+	 * driver probe (if any) doesn't try to reuse our old
+	 * allocation.
+	 */
+	V3D_WRITE(V3D_BPOA, 0);
+	V3D_WRITE(V3D_BPOS, 0);
+
 	vc4->v3d = NULL;
 }
 
diff --git a/drivers/gpu/drm/vc4/vc4_validate.c b/drivers/gpu/drm/vc4/vc4_validate.c
new file mode 100644
index 0000000..2f22f19
--- /dev/null
+++ b/drivers/gpu/drm/vc4/vc4_validate.c
@@ -0,0 +1,955 @@
+/*
+ * Copyright © 2014 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * Command list validator for VC4.
+ *
+ * The VC4 has no IOMMU between it and system memory.  So, a user with
+ * access to execute command lists could escalate privilege by
+ * overwriting system memory (drawing to it as a framebuffer) or
+ * reading system memory it shouldn't (reading it as a texture, or
+ * uniform data, or vertex data).
+ *
+ * This validates command lists to ensure that all accesses are within
+ * the bounds of the GEM objects referenced.  It explicitly whitelists
+ * packets, and looks at the offsets in any address fields to make
+ * sure they're constrained within the BOs they reference.
+ *
+ * Note that because of the validation that's happening anyway, this
+ * is where GEM relocation processing happens.
+ */
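+/*
+ * Sketch of the flow implemented below: for each packet in the unvalidated
+ * (user) copy of the bin CL, look its opcode up in cmd_info[], reject
+ * anything without an entry, copy it into the validated (GPU-visible) copy,
+ * then run the packet's validate hook to check and rewrite its contents:
+ *
+ *	info = &cmd_info[cmd];
+ *	memcpy(dst_pkt, src_pkt, info->len);
+ *	if (info->func)
+ *		ret = info->func(exec, dst_pkt + 1, src_pkt + 1);
+ */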
+
+#include "uapi/drm/vc4_drm.h"
+#include "vc4_drv.h"
+#include "vc4_packet.h"
+
+#define VALIDATE_ARGS \
+	struct vc4_exec_info *exec,			\
+	void *validated,				\
+	void *untrusted
+
+/** Return the width in pixels of a 64-byte microtile. */
+static uint32_t
+utile_width(int cpp)
+{
+	switch (cpp) {
+	case 1:
+	case 2:
+		return 8;
+	case 4:
+		return 4;
+	case 8:
+		return 2;
+	default:
+		DRM_ERROR("unknown cpp: %d\n", cpp);
+		return 1;
+	}
+}
+
+/** Return the height in pixels of a 64-byte microtile. */
+static uint32_t
+utile_height(int cpp)
+{
+	switch (cpp) {
+	case 1:
+		return 8;
+	case 2:
+	case 4:
+	case 8:
+		return 4;
+	default:
+		DRM_ERROR("unknown cpp: %d\n", cpp);
+		return 1;
+	}
+}
+
+/**
+ * The texture unit decides which tiling format a particular miplevel uses
+ * based on its dimensions, using the same check as this function, so we
+ * lay out our miptrees accordingly.
+ */
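+/*
+ * For instance, at cpp == 4 a utile is 4x4 pixels, so any miplevel with
+ * width or height <= 16 is laid out LT rather than T.
+ */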
+static bool
+size_is_lt(uint32_t width, uint32_t height, int cpp)
+{
+	return (width <= 4 * utile_width(cpp) ||
+		height <= 4 * utile_height(cpp));
+}
+
+bool
+vc4_use_bo(struct vc4_exec_info *exec,
+	   uint32_t hindex,
+	   enum vc4_bo_mode mode,
+	   struct drm_gem_cma_object **obj)
+{
+	*obj = NULL;
+
+	if (hindex >= exec->bo_count) {
+		DRM_ERROR("BO index %d greater than BO count %d\n",
+			  hindex, exec->bo_count);
+		return false;
+	}
+
+	if (exec->bo[hindex].mode != mode) {
+		if (exec->bo[hindex].mode == VC4_MODE_UNDECIDED) {
+			exec->bo[hindex].mode = mode;
+		} else {
+			DRM_ERROR("BO index %d reused with mode %d vs %d\n",
+				  hindex, exec->bo[hindex].mode, mode);
+			return false;
+		}
+	}
+
+	*obj = exec->bo[hindex].bo;
+	return true;
+}
+
+static bool
+vc4_use_handle(struct vc4_exec_info *exec,
+	       uint32_t gem_handles_packet_index,
+	       enum vc4_bo_mode mode,
+	       struct drm_gem_cma_object **obj)
+{
+	return vc4_use_bo(exec, exec->bo_index[gem_handles_packet_index],
+			  mode, obj);
+}
+
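+/*
+ * Size in bytes of a GL shader record, computed from the low bits of its
+ * CL reference: bits 0-2 are the attribute count (0 meaning 8) and bit 3
+ * selects the extended layout.  E.g. pointer_bits == 0x3 gives
+ * 36 + 3 * 8 == 60 bytes.
+ */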
+static uint32_t
+gl_shader_rec_size(uint32_t pointer_bits)
+{
+	uint32_t attribute_count = pointer_bits & 7;
+	bool extended = pointer_bits & 8;
+
+	if (attribute_count == 0)
+		attribute_count = 8;
+
+	if (extended)
+		return 100 + attribute_count * 4;
+	else
+		return 36 + attribute_count * 8;
+}
+
+bool
+vc4_check_tex_size(struct vc4_exec_info *exec, struct drm_gem_cma_object *fbo,
+		   uint32_t offset, uint8_t tiling_format,
+		   uint32_t width, uint32_t height, uint8_t cpp)
+{
+	uint32_t aligned_width, aligned_height, stride, size;
+	uint32_t utile_w = utile_width(cpp);
+	uint32_t utile_h = utile_height(cpp);
+
+	/* The shaded vertex format stores signed 12.4 fixed point
+	 * (-2048,2047) offsets from the viewport center, so we should
+	 * never have a render target larger than 4096.  The texture
+	 * unit can only sample from 2048x2048, so it's even more
+	 * restricted.  This lets us avoid worrying about overflow in
+	 * our math.
+	 */
+	if (width > 4096 || height > 4096) {
+		DRM_ERROR("Surface dimensions (%d,%d) too large\n", width, height);
+		return false;
+	}
+
+	switch (tiling_format) {
+	case VC4_TILING_FORMAT_LINEAR:
+		aligned_width = round_up(width, utile_w);
+		aligned_height = height;
+		break;
+	case VC4_TILING_FORMAT_T:
+		aligned_width = round_up(width, utile_w * 8);
+		aligned_height = round_up(height, utile_h * 8);
+		break;
+	case VC4_TILING_FORMAT_LT:
+		aligned_width = round_up(width, utile_w);
+		aligned_height = round_up(height, utile_h);
+		break;
+	default:
+		DRM_ERROR("buffer tiling %d unsupported\n", tiling_format);
+		return false;
+	}
+
+	stride = aligned_width * cpp;
+	size = stride * aligned_height;
+
+	if (size + offset < size ||
+	    size + offset > fbo->base.size) {
+		DRM_ERROR("Overflow in %dx%d (%dx%d) fbo size (%d + %d > %zd)\n",
+			  width, height,
+			  aligned_width, aligned_height,
+			  size, offset, fbo->base.size);
+		return false;
+	}
+
+	return true;
+}
+
+static int
+validate_flush_all(VALIDATE_ARGS)
+{
+	if (exec->found_increment_semaphore_packet) {
+		DRM_ERROR("VC4_PACKET_FLUSH_ALL after "
+			  "VC4_PACKET_INCREMENT_SEMAPHORE\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int
+validate_start_tile_binning(VALIDATE_ARGS)
+{
+	if (exec->found_start_tile_binning_packet) {
+		DRM_ERROR("Duplicate VC4_PACKET_START_TILE_BINNING\n");
+		return -EINVAL;
+	}
+	exec->found_start_tile_binning_packet = true;
+
+	if (!exec->found_tile_binning_mode_config_packet) {
+		DRM_ERROR("missing VC4_PACKET_TILE_BINNING_MODE_CONFIG\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int
+validate_increment_semaphore(VALIDATE_ARGS)
+{
+	if (exec->found_increment_semaphore_packet) {
+		DRM_ERROR("Duplicate VC4_PACKET_INCREMENT_SEMAPHORE\n");
+		return -EINVAL;
+	}
+	exec->found_increment_semaphore_packet = true;
+
+	/* Once we've found the semaphore increment, there should be one FLUSH
+	 * then the end of the command list.  The FLUSH actually triggers the
+	 * increment, so we only need to make sure there's a FLUSH after this
+	 * point.
+	 */
+
+	return 0;
+}
+
+static int
+validate_indexed_prim_list(VALIDATE_ARGS)
+{
+	struct drm_gem_cma_object *ib;
+	uint32_t length = *(uint32_t *)(untrusted + 1);
+	uint32_t offset = *(uint32_t *)(untrusted + 5);
+	uint32_t max_index = *(uint32_t *)(untrusted + 9);
+	uint32_t index_size = (*(uint8_t *)(untrusted + 0) >> 4) ? 2 : 1;
+	struct vc4_shader_state *shader_state;
+
+	if (exec->found_increment_semaphore_packet) {
+		DRM_ERROR("Drawing after VC4_PACKET_INCREMENT_SEMAPHORE\n");
+		return -EINVAL;
+	}
+
+	/* Check overflow condition */
+	if (exec->shader_state_count == 0) {
+		DRM_ERROR("shader state must precede primitives\n");
+		return -EINVAL;
+	}
+	shader_state = &exec->shader_state[exec->shader_state_count - 1];
+
+	if (max_index > shader_state->max_index)
+		shader_state->max_index = max_index;
+
+	if (!vc4_use_handle(exec, 0, VC4_MODE_RENDER, &ib))
+		return -EINVAL;
+
+	if (offset > ib->base.size ||
+	    (ib->base.size - offset) / index_size < length) {
+		DRM_ERROR("IB access overflow (%d + %d*%d > %zd)\n",
+			  offset, length, index_size, ib->base.size);
+		return -EINVAL;
+	}
+
+	*(uint32_t *)(validated + 5) = ib->paddr + offset;
+
+	return 0;
+}
+
+static int
+validate_gl_array_primitive(VALIDATE_ARGS)
+{
+	uint32_t length = *(uint32_t *)(untrusted + 1);
+	uint32_t base_index = *(uint32_t *)(untrusted + 5);
+	uint32_t max_index;
+	struct vc4_shader_state *shader_state;
+
+	if (exec->found_increment_semaphore_packet) {
+		DRM_ERROR("Drawing after VC4_PACKET_INCREMENT_SEMAPHORE\n");
+		return -EINVAL;
+	}
+
+	/* Check overflow condition */
+	if (exec->shader_state_count == 0) {
+		DRM_ERROR("shader state must precede primitives\n");
+		return -EINVAL;
+	}
+	shader_state = &exec->shader_state[exec->shader_state_count - 1];
+
+	if (length + base_index < length) {
+		DRM_ERROR("primitive vertex count overflow\n");
+		return -EINVAL;
+	}
+	max_index = length + base_index - 1;
+
+	if (max_index > shader_state->max_index)
+		shader_state->max_index = max_index;
+
+	return 0;
+}
+
+static int
+validate_gl_shader_state(VALIDATE_ARGS)
+{
+	uint32_t i = exec->shader_state_count++;
+
+	if (i >= exec->shader_state_size) {
+		DRM_ERROR("More requests for shader states than declared\n");
+		return -EINVAL;
+	}
+
+	exec->shader_state[i].packet = VC4_PACKET_GL_SHADER_STATE;
+	exec->shader_state[i].addr = *(uint32_t *)untrusted;
+	exec->shader_state[i].max_index = 0;
+
+	if (exec->shader_state[i].addr & ~0xf) {
+		DRM_ERROR("high bits set in GL shader rec reference\n");
+		return -EINVAL;
+	}
+
+	*(uint32_t *)validated = (exec->shader_rec_p +
+				  exec->shader_state[i].addr);
+
+	exec->shader_rec_p +=
+		roundup(gl_shader_rec_size(exec->shader_state[i].addr), 16);
+
+	return 0;
+}
+
+static int
+validate_nv_shader_state(VALIDATE_ARGS)
+{
+	uint32_t i = exec->shader_state_count++;
+
+	if (i >= exec->shader_state_size) {
+		DRM_ERROR("More requests for shader states than declared\n");
+		return -EINVAL;
+	}
+
+	exec->shader_state[i].packet = VC4_PACKET_NV_SHADER_STATE;
+	exec->shader_state[i].addr = *(uint32_t *)untrusted;
+
+	if (exec->shader_state[i].addr & 15) {
+		DRM_ERROR("NV shader state address 0x%08x misaligned\n",
+			  exec->shader_state[i].addr);
+		return -EINVAL;
+	}
+
+	*(uint32_t *)validated = (exec->shader_state[i].addr +
+				  exec->shader_rec_p);
+
+	return 0;
+}
+
+static int
+validate_tile_binning_config(VALIDATE_ARGS)
+{
+	struct drm_device *dev = exec->exec_bo->base.dev;
+	struct vc4_bo *tile_bo;
+	uint8_t flags;
+	uint32_t tile_state_size, tile_alloc_size;
+	uint32_t tile_count;
+
+	if (exec->found_tile_binning_mode_config_packet) {
+		DRM_ERROR("Duplicate VC4_PACKET_TILE_BINNING_MODE_CONFIG\n");
+		return -EINVAL;
+	}
+	exec->found_tile_binning_mode_config_packet = true;
+
+	exec->bin_tiles_x = *(uint8_t *)(untrusted + 12);
+	exec->bin_tiles_y = *(uint8_t *)(untrusted + 13);
+	tile_count = exec->bin_tiles_x * exec->bin_tiles_y;
+	flags = *(uint8_t *)(untrusted + 14);
+
+	if (exec->bin_tiles_x == 0 ||
+	    exec->bin_tiles_y == 0) {
+		DRM_ERROR("Tile binning config of %dx%d too small\n",
+			  exec->bin_tiles_x, exec->bin_tiles_y);
+		return -EINVAL;
+	}
+
+	if (flags & (VC4_BIN_CONFIG_DB_NON_MS |
+		     VC4_BIN_CONFIG_TILE_BUFFER_64BIT)) {
+		DRM_ERROR("unsupported binning config flags 0x%02x\n", flags);
+		return -EINVAL;
+	}
+
+	/* The tile state data array is 48 bytes per tile, and we put it at
+	 * the start of a BO containing both it and the tile alloc.
+	 */
+	tile_state_size = 48 * tile_count;
+
+	/* Since the tile alloc array will follow us, align. */
+	exec->tile_alloc_offset = roundup(tile_state_size, 4096);
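+	/* (E.g. a 30x17 tile setup has 510 tiles and 24480 bytes of tile
+	 * state, so the alloc array starts 24576 bytes in.)
+	 */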
+
+	*(uint8_t *)(validated + 14) =
+		((flags & ~(VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_MASK |
+			    VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_MASK)) |
+		 VC4_BIN_CONFIG_AUTO_INIT_TSDA |
+		 VC4_SET_FIELD(VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_32,
+			       VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE) |
+		 VC4_SET_FIELD(VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_128,
+			       VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE));
+
+	/* Initial block size. */
+	tile_alloc_size = 32 * tile_count;
+
+	/*
+	 * The initial allocation gets rounded to the next 256 bytes before
+	 * the hardware starts fulfilling further allocations.
+	 */
+	tile_alloc_size = roundup(tile_alloc_size, 256);
+
+	/* Add space for the extra allocations.  This is what gets used first,
+	 * before overflow memory.  It must have at least 4096 bytes, but we
+	 * want to avoid overflow memory usage if possible.
+	 */
+	tile_alloc_size += 1024 * 1024;
+
+	tile_bo = vc4_bo_create(dev, exec->tile_alloc_offset + tile_alloc_size,
+				true);
+	if (!tile_bo)
+		return -ENOMEM;
+	exec->tile_bo = &tile_bo->base;
+	list_add_tail(&tile_bo->unref_head, &exec->unref_list);
+
+	/* tile alloc address. */
+	*(uint32_t *)(validated + 0) = (exec->tile_bo->paddr +
+					exec->tile_alloc_offset);
+	/* tile alloc size. */
+	*(uint32_t *)(validated + 4) = tile_alloc_size;
+	/* tile state address. */
+	*(uint32_t *)(validated + 8) = exec->tile_bo->paddr;
+
+	return 0;
+}
+
+static int
+validate_gem_handles(VALIDATE_ARGS)
+{
+	memcpy(exec->bo_index, untrusted, sizeof(exec->bo_index));
+	return 0;
+}
+
+#define VC4_DEFINE_PACKET(packet, func) \
+	[packet] = { packet ## _SIZE, #packet, func }
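+
+/*
+ * For example, VC4_DEFINE_PACKET(VC4_PACKET_FLUSH, NULL) expands to the
+ * designated initializer
+ *
+ *	[VC4_PACKET_FLUSH] = { VC4_PACKET_FLUSH_SIZE, "VC4_PACKET_FLUSH", NULL },
+ *
+ * so cmd_info[] below can be indexed directly by opcode, with NULL .name
+ * entries marking opcodes we refuse to pass through to the hardware.
+ */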
+
+static const struct cmd_info {
+	uint16_t len;
+	const char *name;
+	int (*func)(struct vc4_exec_info *exec, void *validated,
+		    void *untrusted);
+} cmd_info[] = {
+	VC4_DEFINE_PACKET(VC4_PACKET_HALT, NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_NOP, NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_FLUSH, NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_FLUSH_ALL, validate_flush_all),
+	VC4_DEFINE_PACKET(VC4_PACKET_START_TILE_BINNING,
+			  validate_start_tile_binning),
+	VC4_DEFINE_PACKET(VC4_PACKET_INCREMENT_SEMAPHORE,
+			  validate_increment_semaphore),
+
+	VC4_DEFINE_PACKET(VC4_PACKET_GL_INDEXED_PRIMITIVE,
+			  validate_indexed_prim_list),
+	VC4_DEFINE_PACKET(VC4_PACKET_GL_ARRAY_PRIMITIVE,
+			  validate_gl_array_primitive),
+
+	VC4_DEFINE_PACKET(VC4_PACKET_PRIMITIVE_LIST_FORMAT, NULL),
+
+	VC4_DEFINE_PACKET(VC4_PACKET_GL_SHADER_STATE, validate_gl_shader_state),
+	VC4_DEFINE_PACKET(VC4_PACKET_NV_SHADER_STATE, validate_nv_shader_state),
+
+	VC4_DEFINE_PACKET(VC4_PACKET_CONFIGURATION_BITS, NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_FLAT_SHADE_FLAGS, NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_POINT_SIZE, NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_LINE_WIDTH, NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_RHT_X_BOUNDARY, NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_DEPTH_OFFSET, NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_CLIP_WINDOW, NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_VIEWPORT_OFFSET, NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_CLIPPER_XY_SCALING, NULL),
+	/* Note: The docs say this was also 105, but it was 106 in the
+	 * initial userland code drop.
+	 */
+	VC4_DEFINE_PACKET(VC4_PACKET_CLIPPER_Z_SCALING, NULL),
+
+	VC4_DEFINE_PACKET(VC4_PACKET_TILE_BINNING_MODE_CONFIG,
+			  validate_tile_binning_config),
+
+	VC4_DEFINE_PACKET(VC4_PACKET_GEM_HANDLES, validate_gem_handles),
+};
+
+int
+vc4_validate_bin_cl(struct drm_device *dev,
+		    void *validated,
+		    void *unvalidated,
+		    struct vc4_exec_info *exec)
+{
+	uint32_t len = exec->args->bin_cl_size;
+	uint32_t dst_offset = 0;
+	uint32_t src_offset = 0;
+
+	while (src_offset < len) {
+		void *dst_pkt = validated + dst_offset;
+		void *src_pkt = unvalidated + src_offset;
+		u8 cmd = *(uint8_t *)src_pkt;
+		const struct cmd_info *info;
+
+		if (cmd >= ARRAY_SIZE(cmd_info)) {
+			DRM_ERROR("0x%08x: packet %d out of bounds\n",
+				  src_offset, cmd);
+			return -EINVAL;
+		}
+
+		info = &cmd_info[cmd];
+		if (!info->name) {
+			DRM_ERROR("0x%08x: packet %d invalid\n",
+				  src_offset, cmd);
+			return -EINVAL;
+		}
+
+		if (src_offset + info->len > len) {
+			DRM_ERROR("0x%08x: packet %d (%s) length 0x%08x "
+				  "exceeds bounds (0x%08x)\n",
+				  src_offset, cmd, info->name, info->len,
+				  len);
+			return -EINVAL;
+		}
+
+		if (cmd != VC4_PACKET_GEM_HANDLES)
+			memcpy(dst_pkt, src_pkt, info->len);
+
+		if (info->func && info->func(exec,
+					     dst_pkt + 1,
+					     src_pkt + 1)) {
+			DRM_ERROR("0x%08x: packet %d (%s) failed to validate\n",
+				  src_offset, cmd, info->name);
+			return -EINVAL;
+		}
+
+		src_offset += info->len;
+		/* GEM handle loading doesn't produce HW packets. */
+		if (cmd != VC4_PACKET_GEM_HANDLES)
+			dst_offset += info->len;
+
+		/* When the CL hits halt, it'll stop reading anything else. */
+		if (cmd == VC4_PACKET_HALT)
+			break;
+	}
+
+	exec->ct0ea = exec->ct0ca + dst_offset;
+
+	if (!exec->found_start_tile_binning_packet) {
+		DRM_ERROR("Bin CL missing VC4_PACKET_START_TILE_BINNING\n");
+		return -EINVAL;
+	}
+
+	if (!exec->found_increment_semaphore_packet) {
+		DRM_ERROR("Bin CL missing VC4_PACKET_INCREMENT_SEMAPHORE\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static bool
+reloc_tex(struct vc4_exec_info *exec,
+	  void *uniform_data_u,
+	  struct vc4_texture_sample_info *sample,
+	  uint32_t texture_handle_index)
+
+{
+	struct drm_gem_cma_object *tex;
+	uint32_t p0 = *(uint32_t *)(uniform_data_u + sample->p_offset[0]);
+	uint32_t p1 = *(uint32_t *)(uniform_data_u + sample->p_offset[1]);
+	uint32_t p2 = (sample->p_offset[2] != ~0 ?
+		       *(uint32_t *)(uniform_data_u + sample->p_offset[2]) : 0);
+	uint32_t p3 = (sample->p_offset[3] != ~0 ?
+		       *(uint32_t *)(uniform_data_u + sample->p_offset[3]) : 0);
+	uint32_t *validated_p0 = exec->uniforms_v + sample->p_offset[0];
+	uint32_t offset = p0 & VC4_TEX_P0_OFFSET_MASK;
+	uint32_t miplevels = VC4_GET_FIELD(p0, VC4_TEX_P0_MIPLVLS);
+	uint32_t width = VC4_GET_FIELD(p1, VC4_TEX_P1_WIDTH);
+	uint32_t height = VC4_GET_FIELD(p1, VC4_TEX_P1_HEIGHT);
+	uint32_t cpp, tiling_format, utile_w, utile_h;
+	uint32_t i;
+	uint32_t cube_map_stride = 0;
+	enum vc4_texture_data_type type;
+
+	if (!vc4_use_bo(exec, texture_handle_index, VC4_MODE_RENDER, &tex))
+		return false;
+
+	if (sample->is_direct) {
+		uint32_t remaining_size = tex->base.size - p0;
+
+		if (p0 > tex->base.size - 4) {
+			DRM_ERROR("UBO offset greater than UBO size\n");
+			goto fail;
+		}
+		if (p1 > remaining_size - 4) {
+			DRM_ERROR("UBO clamp would allow reads "
+				  "outside of UBO\n");
+			goto fail;
+		}
+		*validated_p0 = tex->paddr + p0;
+		return true;
+	}
+
+	if (width == 0)
+		width = 2048;
+	if (height == 0)
+		height = 2048;
+
+	if (p0 & VC4_TEX_P0_CMMODE_MASK) {
+		if (VC4_GET_FIELD(p2, VC4_TEX_P2_PTYPE) ==
+		    VC4_TEX_P2_PTYPE_CUBE_MAP_STRIDE)
+			cube_map_stride = p2 & VC4_TEX_P2_CMST_MASK;
+		if (VC4_GET_FIELD(p3, VC4_TEX_P2_PTYPE) ==
+		    VC4_TEX_P2_PTYPE_CUBE_MAP_STRIDE) {
+			if (cube_map_stride) {
+				DRM_ERROR("Cube map stride set twice\n");
+				goto fail;
+			}
+
+			cube_map_stride = p3 & VC4_TEX_P2_CMST_MASK;
+		}
+		if (!cube_map_stride) {
+			DRM_ERROR("Cube map stride not set\n");
+			goto fail;
+		}
+	}
+
+	type = (VC4_GET_FIELD(p0, VC4_TEX_P0_TYPE) |
+		(VC4_GET_FIELD(p1, VC4_TEX_P1_TYPE4) << 4));
+
+	switch (type) {
+	case VC4_TEXTURE_TYPE_RGBA8888:
+	case VC4_TEXTURE_TYPE_RGBX8888:
+	case VC4_TEXTURE_TYPE_RGBA32R:
+		cpp = 4;
+		break;
+	case VC4_TEXTURE_TYPE_RGBA4444:
+	case VC4_TEXTURE_TYPE_RGBA5551:
+	case VC4_TEXTURE_TYPE_RGB565:
+	case VC4_TEXTURE_TYPE_LUMALPHA:
+	case VC4_TEXTURE_TYPE_S16F:
+	case VC4_TEXTURE_TYPE_S16:
+		cpp = 2;
+		break;
+	case VC4_TEXTURE_TYPE_LUMINANCE:
+	case VC4_TEXTURE_TYPE_ALPHA:
+	case VC4_TEXTURE_TYPE_S8:
+		cpp = 1;
+		break;
+	case VC4_TEXTURE_TYPE_ETC1:
+	case VC4_TEXTURE_TYPE_BW1:
+	case VC4_TEXTURE_TYPE_A4:
+	case VC4_TEXTURE_TYPE_A1:
+	case VC4_TEXTURE_TYPE_RGBA64:
+	case VC4_TEXTURE_TYPE_YUV422R:
+	default:
+		DRM_ERROR("Texture format %d unsupported\n", type);
+		goto fail;
+	}
+	utile_w = utile_width(cpp);
+	utile_h = utile_height(cpp);
+
+	if (type == VC4_TEXTURE_TYPE_RGBA32R) {
+		tiling_format = VC4_TILING_FORMAT_LINEAR;
+	} else {
+		if (size_is_lt(width, height, cpp))
+			tiling_format = VC4_TILING_FORMAT_LT;
+		else
+			tiling_format = VC4_TILING_FORMAT_T;
+	}
+
+	if (!vc4_check_tex_size(exec, tex, offset + cube_map_stride * 5,
+				tiling_format, width, height, cpp)) {
+		goto fail;
+	}
+
+	/* The mipmap levels are stored before the base of the texture.  Make
+	 * sure there is actually space in the BO.
+	 */
+	for (i = 1; i <= miplevels; i++) {
+		uint32_t level_width = max(width >> i, 1u);
+		uint32_t level_height = max(height >> i, 1u);
+		uint32_t aligned_width, aligned_height;
+		uint32_t level_size;
+
+		/* Once the levels get small enough, they drop from T to LT. */
+		if (tiling_format == VC4_TILING_FORMAT_T &&
+		    size_is_lt(level_width, level_height, cpp)) {
+			tiling_format = VC4_TILING_FORMAT_LT;
+		}
+
+		switch (tiling_format) {
+		case VC4_TILING_FORMAT_T:
+			aligned_width = round_up(level_width, utile_w * 8);
+			aligned_height = round_up(level_height, utile_h * 8);
+			break;
+		case VC4_TILING_FORMAT_LT:
+			aligned_width = round_up(level_width, utile_w);
+			aligned_height = round_up(level_height, utile_h);
+			break;
+		default:
+			aligned_width = round_up(level_width, utile_w);
+			aligned_height = level_height;
+			break;
+		}
+
+		level_size = aligned_width * cpp * aligned_height;
+
+		if (offset < level_size) {
+			DRM_ERROR("Level %d (%dx%d -> %dx%d) size %db "
+				  "overflowed buffer bounds (offset %d)\n",
+				  i, level_width, level_height,
+				  aligned_width, aligned_height,
+				  level_size, offset);
+			goto fail;
+		}
+
+		offset -= level_size;
+	}
+
+	*validated_p0 = tex->paddr + p0;
+
+	return true;
+ fail:
+	DRM_INFO("Texture p0 at %d: 0x%08x\n", sample->p_offset[0], p0);
+	DRM_INFO("Texture p1 at %d: 0x%08x\n", sample->p_offset[1], p1);
+	DRM_INFO("Texture p2 at %d: 0x%08x\n", sample->p_offset[2], p2);
+	DRM_INFO("Texture p3 at %d: 0x%08x\n", sample->p_offset[3], p3);
+	return false;
+}
+
+static int
+validate_shader_rec(struct drm_device *dev,
+		    struct vc4_exec_info *exec,
+		    struct vc4_shader_state *state)
+{
+	uint32_t *src_handles;
+	void *pkt_u, *pkt_v;
+	enum shader_rec_reloc_type {
+		RELOC_CODE,
+		RELOC_VBO,
+	};
+	struct shader_rec_reloc {
+		enum shader_rec_reloc_type type;
+		uint32_t offset;
+	};
+	static const struct shader_rec_reloc gl_relocs[] = {
+		{ RELOC_CODE, 4 },  /* fs */
+		{ RELOC_CODE, 16 }, /* vs */
+		{ RELOC_CODE, 28 }, /* cs */
+	};
+	static const struct shader_rec_reloc nv_relocs[] = {
+		{ RELOC_CODE, 4 }, /* fs */
+		{ RELOC_VBO, 12 }
+	};
+	const struct shader_rec_reloc *relocs;
+	struct drm_gem_cma_object *bo[ARRAY_SIZE(gl_relocs) + 8];
+	uint32_t nr_attributes = 0, nr_fixed_relocs, nr_relocs, packet_size;
+	int i;
+	struct vc4_validated_shader_info *shader;
+
+	if (state->packet == VC4_PACKET_NV_SHADER_STATE) {
+		relocs = nv_relocs;
+		nr_fixed_relocs = ARRAY_SIZE(nv_relocs);
+
+		packet_size = 16;
+	} else {
+		relocs = gl_relocs;
+		nr_fixed_relocs = ARRAY_SIZE(gl_relocs);
+
+		nr_attributes = state->addr & 0x7;
+		if (nr_attributes == 0)
+			nr_attributes = 8;
+		packet_size = gl_shader_rec_size(state->addr);
+	}
+	nr_relocs = nr_fixed_relocs + nr_attributes;
+
+	if (nr_relocs * 4 > exec->shader_rec_size) {
+		DRM_ERROR("overflowed shader recs reading %d handles "
+			  "from %d bytes left\n",
+			  nr_relocs, exec->shader_rec_size);
+		return -EINVAL;
+	}
+	src_handles = exec->shader_rec_u;
+	exec->shader_rec_u += nr_relocs * 4;
+	exec->shader_rec_size -= nr_relocs * 4;
+
+	if (packet_size > exec->shader_rec_size) {
+		DRM_ERROR("overflowed shader recs copying %db packet "
+			  "from %d bytes left\n",
+			  packet_size, exec->shader_rec_size);
+		return -EINVAL;
+	}
+	pkt_u = exec->shader_rec_u;
+	pkt_v = exec->shader_rec_v;
+	memcpy(pkt_v, pkt_u, packet_size);
+	exec->shader_rec_u += packet_size;
+	/* Shader recs have to be aligned to 16 bytes (due to the attribute
+	 * flags being in the low bytes), so round the next validated shader
+	 * rec address up.  This should be safe, since we've got so many
+	 * relocations in a shader rec packet.
+	 */
+	BUG_ON(roundup(packet_size, 16) - packet_size > nr_relocs * 4);
+	exec->shader_rec_v += roundup(packet_size, 16);
+	exec->shader_rec_size -= packet_size;
+
+	for (i = 0; i < nr_relocs; i++) {
+		enum vc4_bo_mode mode;
+
+		if (i < nr_fixed_relocs && relocs[i].type == RELOC_CODE)
+			mode = VC4_MODE_SHADER;
+		else
+			mode = VC4_MODE_RENDER;
+
+		if (!vc4_use_bo(exec, src_handles[i], mode, &bo[i]))
+			return -EINVAL;
+	}
+
+	for (i = 0; i < nr_fixed_relocs; i++) {
+		struct vc4_bo *vc4_bo;
+		uint32_t o = relocs[i].offset;
+		uint32_t src_offset = *(uint32_t *)(pkt_u + o);
+		uint32_t *texture_handles_u;
+		void *uniform_data_u;
+		uint32_t tex;
+
+		*(uint32_t *)(pkt_v + o) = bo[i]->paddr + src_offset;
+
+		switch (relocs[i].type) {
+		case RELOC_CODE:
+			if (src_offset != 0) {
+				DRM_ERROR("Shaders must be at offset 0 "
+					  "of the BO.\n");
+				goto fail;
+			}
+
+			vc4_bo = to_vc4_bo(&bo[i]->base);
+			shader = vc4_bo->validated_shader;
+			if (!shader)
+				goto fail;
+
+			if (shader->uniforms_src_size > exec->uniforms_size) {
+				DRM_ERROR("Uniforms src buffer overflow\n");
+				goto fail;
+			}
+
+			texture_handles_u = exec->uniforms_u;
+			uniform_data_u = (texture_handles_u +
+					  shader->num_texture_samples);
+
+			memcpy(exec->uniforms_v, uniform_data_u,
+			       shader->uniforms_size);
+
+			for (tex = 0;
+			     tex < shader->num_texture_samples;
+			     tex++) {
+				if (!reloc_tex(exec,
+					       uniform_data_u,
+					       &shader->texture_samples[tex],
+					       texture_handles_u[tex])) {
+					goto fail;
+				}
+			}
+
+			*(uint32_t *)(pkt_v + o + 4) = exec->uniforms_p;
+
+			exec->uniforms_u += shader->uniforms_src_size;
+			exec->uniforms_v += shader->uniforms_size;
+			exec->uniforms_p += shader->uniforms_size;
+
+			break;
+
+		case RELOC_VBO:
+			break;
+		}
+	}
+
+	for (i = 0; i < nr_attributes; i++) {
+		struct drm_gem_cma_object *vbo = bo[nr_fixed_relocs + i];
+		uint32_t o = 36 + i * 8;
+		uint32_t offset = *(uint32_t *)(pkt_u + o + 0);
+		uint32_t attr_size = *(uint8_t *)(pkt_u + o + 4) + 1;
+		uint32_t stride = *(uint8_t *)(pkt_u + o + 5);
+		uint32_t max_index;
+
+		if (state->addr & 0x8)
+			stride |= (*(uint32_t *)(pkt_u + 100 + i * 4)) & ~0xff;
+
+		if (vbo->base.size < offset ||
+		    vbo->base.size - offset < attr_size) {
+			DRM_ERROR("BO offset overflow (%d + %d > %d)\n",
+				  offset, attr_size, vbo->base.size);
+			return -EINVAL;
+		}
+
+		if (stride != 0) {
+			max_index = ((vbo->base.size - offset - attr_size) /
+				     stride);
+			if (state->max_index > max_index) {
+				DRM_ERROR("primitives use index %d out of "
+					  "supplied %d\n",
+					  state->max_index, max_index);
+				return -EINVAL;
+			}
+		}
+
+		*(uint32_t *)(pkt_v + o) = vbo->paddr + offset;
+	}
+
+	return 0;
+
+fail:
+	return -EINVAL;
+}
+
+int
+vc4_validate_shader_recs(struct drm_device *dev,
+			 struct vc4_exec_info *exec)
+{
+	uint32_t i;
+	int ret = 0;
+
+	for (i = 0; i < exec->shader_state_count; i++) {
+		ret = validate_shader_rec(dev, exec, &exec->shader_state[i]);
+		if (ret)
+			return ret;
+	}
+
+	return ret;
+}
diff --git a/include/uapi/drm/vc4_drm.h b/include/uapi/drm/vc4_drm.h
index 0716b50..e711ec0 100644
--- a/include/uapi/drm/vc4_drm.h
+++ b/include/uapi/drm/vc4_drm.h
@@ -26,14 +26,155 @@
 
 #include <drm/drm.h>
 
+#define DRM_VC4_SUBMIT_CL                         0x00
+#define DRM_VC4_WAIT_SEQNO                        0x01
+#define DRM_VC4_WAIT_BO                           0x02
 #define DRM_VC4_CREATE_BO                         0x03
 #define DRM_VC4_MMAP_BO                           0x04
 #define DRM_VC4_CREATE_SHADER_BO                  0x05
 
+#define DRM_IOCTL_VC4_SUBMIT_CL           DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_SUBMIT_CL, struct drm_vc4_submit_cl)
+#define DRM_IOCTL_VC4_WAIT_SEQNO          DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_WAIT_SEQNO, struct drm_vc4_wait_seqno)
+#define DRM_IOCTL_VC4_WAIT_BO             DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_WAIT_BO, struct drm_vc4_wait_bo)
 #define DRM_IOCTL_VC4_CREATE_BO           DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_CREATE_BO, struct drm_vc4_create_bo)
 #define DRM_IOCTL_VC4_MMAP_BO             DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_MMAP_BO, struct drm_vc4_mmap_bo)
 #define DRM_IOCTL_VC4_CREATE_SHADER_BO    DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_CREATE_SHADER_BO, struct drm_vc4_create_shader_bo)
 
+struct drm_vc4_submit_rcl_surface {
+	uint32_t hindex; /* Handle index, or ~0 if not present. */
+	uint32_t offset; /* Offset to start of buffer. */
+	/*
+	 * Bits for either render config (color_write) or load/store packet.
+	 * Bits should all be 0 for MSAA load/stores.
+	 */
+	uint16_t bits;
+
+#define VC4_SUBMIT_RCL_SURFACE_READ_IS_FULL_RES		(1 << 0)
+	uint16_t flags;
+};
+
+/**
+ * struct drm_vc4_submit_cl - ioctl argument for submitting commands to the 3D
+ * engine.
+ *
+ * Drivers typically use GPU BOs to store batchbuffers / command lists and
+ * their associated state.  However, because the VC4 lacks an MMU, we have to
+ * do validation of memory accesses by the GPU commands.  If we were to store
+ * our commands in BOs, we'd need to do uncached readback from them to do the
+ * validation process, which is too expensive.  Instead, userspace accumulates
+ * commands and associated state in plain memory, then the kernel copies the
+ * data to its own address space, and then validates and stores it in a GPU
+ * BO.
+ */
+struct drm_vc4_submit_cl {
+	/* Pointer to the binner command list.
+	 *
+	 * This is the first set of commands executed, which runs the
+	 * coordinate shader to determine where primitives land on the screen,
+	 * then writes out the state updates and draw calls necessary per tile
+	 * to the tile allocation BO.
+	 */
+	uint64_t bin_cl;
+
+	/* Pointer to the shader records.
+	 *
+	 * Shader records are the structures read by the hardware that contain
+	 * pointers to uniforms, shaders, and vertex attributes.  The
+	 * reference to the shader record has enough information to determine
+	 * how many pointers are necessary (fixed number for shaders/uniforms,
+	 * and an attribute count), so those BO indices into bo_handles are
+	 * just stored as uint32_ts before each shader record passed in.
+	 */
+	uint64_t shader_rec;
+
+	/* Pointer to uniform data and texture handles for the textures
+	 * referenced by the shader.
+	 *
+	 * For each shader state record, there is a set of uniform data in the
+	 * order referenced by the record (FS, VS, then CS).  Each set of
+	 * uniform data has a uint32_t index into bo_handles per texture
+	 * sample operation, in the order the QPU_W_TMUn_S writes appear in
+	 * the program.  Following the texture BO handle indices is the actual
+	 * uniform data.
+	 *
+	 * The individual uniform state blocks don't have sizes passed in,
+	 * because the kernel has to determine the sizes anyway during shader
+	 * code validation.
+	 */
+	uint64_t uniforms;
+	uint64_t bo_handles;
+
+	/* Size in bytes of the binner command list. */
+	uint32_t bin_cl_size;
+	/* Size in bytes of the set of shader records. */
+	uint32_t shader_rec_size;
+	/* Number of shader records.
+	 *
+	 * This could just be computed from the contents of shader_records and
+	 * the address bits of references to them from the bin CL, but it
+	 * keeps the kernel from having to resize some allocations it makes.
+	 */
+	uint32_t shader_rec_count;
+	/* Size in bytes of the uniform state. */
+	uint32_t uniforms_size;
+
+	/* Number of BO handles passed in (size is that times 4). */
+	uint32_t bo_handle_count;
+
+	/* RCL setup: */
+	uint16_t width;
+	uint16_t height;
+	uint8_t min_x_tile;
+	uint8_t min_y_tile;
+	uint8_t max_x_tile;
+	uint8_t max_y_tile;
+	struct drm_vc4_submit_rcl_surface color_read;
+	struct drm_vc4_submit_rcl_surface color_write;
+	struct drm_vc4_submit_rcl_surface zs_read;
+	struct drm_vc4_submit_rcl_surface zs_write;
+	struct drm_vc4_submit_rcl_surface msaa_color_write;
+	struct drm_vc4_submit_rcl_surface msaa_zs_write;
+	uint32_t clear_color[2];
+	uint32_t clear_z;
+	uint8_t clear_s;
+
+	uint32_t pad:24;
+
+#define VC4_SUBMIT_CL_USE_CLEAR_COLOR			(1 << 0)
+	uint32_t flags;
+
+	/* Returned value of the seqno of this render job (for the
+	 * wait ioctl).
+	 */
+	uint64_t seqno;
+};
+
+/**
+ * struct drm_vc4_wait_seqno - ioctl argument for waiting for
+ * DRM_VC4_SUBMIT_CL completion using its returned seqno.
+ *
+ * timeout_ns is the timeout in nanoseconds, where "0" means "don't
+ * block, just return the status."
+ */
+struct drm_vc4_wait_seqno {
+	uint64_t seqno;
+	uint64_t timeout_ns;
+};
+
+/**
+ * struct drm_vc4_wait_bo - ioctl argument for waiting for
+ * completion of the last DRM_VC4_SUBMIT_CL on a BO.
+ *
+ * This is useful for cases where multiple processes might be
+ * rendering to a BO and you want to wait for all rendering to be
+ * completed.
+ */
+struct drm_vc4_wait_bo {
+	uint32_t handle;
+	uint32_t pad;
+	uint64_t timeout_ns;
+};
+
 /**
  * struct drm_vc4_create_bo - ioctl argument for creating VC4 BOs.
  *
-- 
2.6.2


^ permalink raw reply related	[flat|nested] 53+ messages in thread
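
As a rough illustration of the submit/wait interface above: a minimal
userspace sketch, not part of the series, assuming libdrm's drmIoctl()
and buffers already packed the way the drm_vc4_submit_cl comment
describes (the RCL surface setup is elided).

#include <stdint.h>
#include <string.h>
#include <xf86drm.h>
#include "vc4_drm.h"	/* the uapi header added by this patch */

static int submit_and_wait(int fd,
			   void *bin_cl, uint32_t bin_cl_size,
			   void *shader_rec, uint32_t shader_rec_size,
			   uint32_t shader_rec_count,
			   void *uniforms, uint32_t uniforms_size,
			   uint32_t *bo_handles, uint32_t bo_handle_count)
{
	struct drm_vc4_submit_cl submit;
	struct drm_vc4_wait_seqno wait;
	int ret;

	memset(&submit, 0, sizeof(submit));
	submit.bin_cl = (uintptr_t)bin_cl;
	submit.bin_cl_size = bin_cl_size;
	submit.shader_rec = (uintptr_t)shader_rec;
	submit.shader_rec_size = shader_rec_size;
	submit.shader_rec_count = shader_rec_count;
	submit.uniforms = (uintptr_t)uniforms;
	submit.uniforms_size = uniforms_size;
	submit.bo_handles = (uintptr_t)bo_handles;
	submit.bo_handle_count = bo_handle_count;
	/* width/height, the tile bounds, and the RCL surfaces would be
	 * filled in here for a real submission.
	 */

	ret = drmIoctl(fd, DRM_IOCTL_VC4_SUBMIT_CL, &submit);
	if (ret)
		return ret;

	/* The kernel wrote the job's seqno back; block until it retires. */
	memset(&wait, 0, sizeof(wait));
	wait.seqno = submit.seqno;
	wait.timeout_ns = ~0ull;
	return drmIoctl(fd, DRM_IOCTL_VC4_WAIT_SEQNO, &wait);
}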

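The mipmap bounds walk in reloc_tex() above subtracts each level's
padded size from the base-level offset and fails if that would
underflow the BO. A simplified standalone sketch, under the assumption
of a fixed per-level alignment (the real code switches from T to LT
tiling as the levels shrink):

#include <stdbool.h>
#include <stdint.h>

#define ROUND_UP(x, a) ((((x) + (a) - 1) / (a)) * (a))

static bool check_miplevels(uint32_t base_offset,
			    uint32_t width, uint32_t height, uint32_t cpp,
			    uint32_t miplevels,
			    uint32_t align_w, uint32_t align_h)
{
	uint32_t offset = base_offset;
	uint32_t i;

	for (i = 1; i <= miplevels; i++) {
		uint32_t w = (width >> i) ? (width >> i) : 1;
		uint32_t h = (height >> i) ? (height >> i) : 1;
		uint32_t size = ROUND_UP(w, align_w) * cpp *
				ROUND_UP(h, align_h);

		if (offset < size)
			return false;	/* level would precede the BO start */
		offset -= size;		/* the next level sits below this one */
	}
	return true;
}
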
* [PATCH 8/9] drm/vc4: Add support for async pageflips.
  2015-12-01 20:35 ` Eric Anholt
@ 2015-12-01 20:35   ` Eric Anholt
  -1 siblings, 0 replies; 53+ messages in thread
From: Eric Anholt @ 2015-12-01 20:35 UTC (permalink / raw)
  To: dri-devel; +Cc: linux-kernel, David Airlie, Eric Anholt

An async pageflip stores the modeset to be done and executes it once
the BOs are ready to be displayed.  This gets us about a 3x
performance improvement in full-screen rendering with pageflipping.

Signed-off-by: Eric Anholt <eric@anholt.net>
---
 drivers/gpu/drm/vc4/vc4_crtc.c  |  99 +++++++++++++++++++++++++-
 drivers/gpu/drm/vc4/vc4_drv.h   |  16 +++++
 drivers/gpu/drm/vc4/vc4_gem.c   |  40 +++++++++++
 drivers/gpu/drm/vc4/vc4_kms.c   | 149 +++++++++++++++++++++++++++++++++++++++-
 drivers/gpu/drm/vc4/vc4_plane.c |  40 +++++++++++
 5 files changed, 342 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/vc4/vc4_crtc.c b/drivers/gpu/drm/vc4/vc4_crtc.c
index 7a9f476..a319332 100644
--- a/drivers/gpu/drm/vc4/vc4_crtc.c
+++ b/drivers/gpu/drm/vc4/vc4_crtc.c
@@ -35,6 +35,7 @@
 #include "drm_atomic_helper.h"
 #include "drm_crtc_helper.h"
 #include "linux/clk.h"
+#include "drm_fb_cma_helper.h"
 #include "linux/component.h"
 #include "linux/of_device.h"
 #include "vc4_drv.h"
@@ -475,10 +476,106 @@ static irqreturn_t vc4_crtc_irq_handler(int irq, void *data)
 	return ret;
 }
 
+struct vc4_async_flip_state {
+	struct drm_crtc *crtc;
+	struct drm_framebuffer *fb;
+	struct drm_pending_vblank_event *event;
+
+	struct vc4_seqno_cb cb;
+};
+
+/* Called when the V3D execution for the BO being flipped to is done, so that
+ * we can actually update the plane's address to point to it.
+ */
+static void
+vc4_async_page_flip_complete(struct vc4_seqno_cb *cb)
+{
+	struct vc4_async_flip_state *flip_state =
+		container_of(cb, struct vc4_async_flip_state, cb);
+	struct drm_crtc *crtc = flip_state->crtc;
+	struct drm_device *dev = crtc->dev;
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+	struct drm_plane *plane = crtc->primary;
+
+	vc4_plane_async_set_fb(plane, flip_state->fb);
+	if (flip_state->event) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&dev->event_lock, flags);
+		drm_crtc_send_vblank_event(crtc, flip_state->event);
+		spin_unlock_irqrestore(&dev->event_lock, flags);
+	}
+
+	drm_framebuffer_unreference(flip_state->fb);
+	kfree(flip_state);
+
+	up(&vc4->async_modeset);
+}
+
+/* Implements async (non-vblank-synced) page flips.
+ *
+ * The page flip ioctl needs to return immediately, so we grab the
+ * modeset semaphore on the pipe, and queue the address update for
+ * when V3D is done with the BO being flipped to.
+ */
+static int vc4_async_page_flip(struct drm_crtc *crtc,
+			       struct drm_framebuffer *fb,
+			       struct drm_pending_vblank_event *event,
+			       uint32_t flags)
+{
+	struct drm_device *dev = crtc->dev;
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+	struct drm_plane *plane = crtc->primary;
+	int ret = 0;
+	struct vc4_async_flip_state *flip_state;
+	struct drm_gem_cma_object *cma_bo = drm_fb_cma_get_gem_obj(fb, 0);
+	struct vc4_bo *bo = to_vc4_bo(&cma_bo->base);
+
+	flip_state = kzalloc(sizeof(*flip_state), GFP_KERNEL);
+	if (!flip_state)
+		return -ENOMEM;
+
+	drm_framebuffer_reference(fb);
+	flip_state->fb = fb;
+	flip_state->crtc = crtc;
+	flip_state->event = event;
+
+	/* Make sure all other async modesets have landed. */
+	ret = down_interruptible(&vc4->async_modeset);
+	if (ret) {
+		kfree(flip_state);
+		return ret;
+	}
+
+	/* Immediately update the plane's legacy fb pointer, so that later
+	 * modeset prep sees the state that will be present when the semaphore
+	 * is released.
+	 */
+	drm_atomic_set_fb_for_plane(plane->state, fb);
+	plane->fb = fb;
+
+	vc4_queue_seqno_cb(dev, &flip_state->cb, bo->seqno,
+			   vc4_async_page_flip_complete);
+
+	/* Driver takes ownership of state on successful async commit. */
+	return 0;
+}
+
+static int vc4_page_flip(struct drm_crtc *crtc,
+			 struct drm_framebuffer *fb,
+			 struct drm_pending_vblank_event *event,
+			 uint32_t flags)
+{
+	if (flags & DRM_MODE_PAGE_FLIP_ASYNC)
+		return vc4_async_page_flip(crtc, fb, event, flags);
+	else
+		return drm_atomic_helper_page_flip(crtc, fb, event, flags);
+}
+
 static const struct drm_crtc_funcs vc4_crtc_funcs = {
 	.set_config = drm_atomic_helper_set_config,
 	.destroy = vc4_crtc_destroy,
-	.page_flip = drm_atomic_helper_page_flip,
+	.page_flip = vc4_page_flip,
 	.set_property = NULL,
 	.cursor_set = NULL, /* handled by drm_mode_cursor_universal */
 	.cursor_move = NULL, /* handled by drm_mode_cursor_universal */
diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h
index a3dbfee..9a8ee23 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.h
+++ b/drivers/gpu/drm/vc4/vc4_drv.h
@@ -76,6 +76,11 @@ struct vc4_dev {
 	wait_queue_head_t job_wait_queue;
 	struct work_struct job_done_work;
 
+	/* List of struct vc4_seqno_cb for callbacks to be made from a
+	 * workqueue when the given seqno is passed.
+	 */
+	struct list_head seqno_cb_list;
+
 	/* The binner overflow memory that's currently set up in
 	 * BPOA/BPOS registers.  When overflow occurs and a new one is
 	 * allocated, the previous one will be moved to
@@ -128,6 +133,12 @@ to_vc4_bo(struct drm_gem_object *bo)
 	return (struct vc4_bo *)bo;
 }
 
+struct vc4_seqno_cb {
+	struct work_struct work;
+	uint64_t seqno;
+	void (*func)(struct vc4_seqno_cb *cb);
+};
+
 struct vc4_v3d {
 	struct platform_device *pdev;
 	void __iomem *regs;
@@ -392,6 +403,9 @@ void vc4_submit_next_job(struct drm_device *dev);
 int vc4_wait_for_seqno(struct drm_device *dev, uint64_t seqno,
 		       uint64_t timeout_ns, bool interruptible);
 void vc4_job_handle_completed(struct vc4_dev *vc4);
+int vc4_queue_seqno_cb(struct drm_device *dev,
+		       struct vc4_seqno_cb *cb, uint64_t seqno,
+		       void (*func)(struct vc4_seqno_cb *cb));
 
 /* vc4_hdmi.c */
 extern struct platform_driver vc4_hdmi_driver;
@@ -417,6 +431,8 @@ struct drm_plane *vc4_plane_init(struct drm_device *dev,
 				 enum drm_plane_type type);
 u32 vc4_plane_write_dlist(struct drm_plane *plane, u32 __iomem *dlist);
 u32 vc4_plane_dlist_size(struct drm_plane_state *state);
+void vc4_plane_async_set_fb(struct drm_plane *plane,
+			    struct drm_framebuffer *fb);
 
 /* vc4_v3d.c */
 extern struct platform_driver vc4_v3d_driver;
diff --git a/drivers/gpu/drm/vc4/vc4_gem.c b/drivers/gpu/drm/vc4/vc4_gem.c
index ddcb018..a62ca68 100644
--- a/drivers/gpu/drm/vc4/vc4_gem.c
+++ b/drivers/gpu/drm/vc4/vc4_gem.c
@@ -459,6 +459,7 @@ void
 vc4_job_handle_completed(struct vc4_dev *vc4)
 {
 	unsigned long irqflags;
+	struct vc4_seqno_cb *cb, *cb_temp;
 
 	spin_lock_irqsave(&vc4->job_lock, irqflags);
 	while (!list_empty(&vc4->job_done_list)) {
@@ -471,7 +472,45 @@ vc4_job_handle_completed(struct vc4_dev *vc4)
 		vc4_complete_exec(vc4->dev, exec);
 		spin_lock_irqsave(&vc4->job_lock, irqflags);
 	}
+
+	list_for_each_entry_safe(cb, cb_temp, &vc4->seqno_cb_list, work.entry) {
+		if (cb->seqno <= vc4->finished_seqno) {
+			list_del_init(&cb->work.entry);
+			schedule_work(&cb->work);
+		}
+	}
+
+	spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+}
+
+static void vc4_seqno_cb_work(struct work_struct *work)
+{
+	struct vc4_seqno_cb *cb = container_of(work, struct vc4_seqno_cb, work);
+
+	cb->func(cb);
+}
+
+int vc4_queue_seqno_cb(struct drm_device *dev,
+		       struct vc4_seqno_cb *cb, uint64_t seqno,
+		       void (*func)(struct vc4_seqno_cb *cb))
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+	int ret = 0;
+	unsigned long irqflags;
+
+	cb->func = func;
+	INIT_WORK(&cb->work, vc4_seqno_cb_work);
+
+	spin_lock_irqsave(&vc4->job_lock, irqflags);
+	if (seqno > vc4->finished_seqno) {
+		cb->seqno = seqno;
+		list_add_tail(&cb->work.entry, &vc4->seqno_cb_list);
+	} else {
+		schedule_work(&cb->work);
+	}
 	spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+
+	return ret;
 }
 
 /* Scheduled when any job has been completed, this walks the list of
@@ -608,6 +647,7 @@ vc4_gem_init(struct drm_device *dev)
 
 	INIT_LIST_HEAD(&vc4->job_list);
 	INIT_LIST_HEAD(&vc4->job_done_list);
+	INIT_LIST_HEAD(&vc4->seqno_cb_list);
 	spin_lock_init(&vc4->job_lock);
 
 	INIT_WORK(&vc4->hangcheck.reset_work, vc4_reset_work);
diff --git a/drivers/gpu/drm/vc4/vc4_kms.c b/drivers/gpu/drm/vc4/vc4_kms.c
index 2e5597d..f95f2df 100644
--- a/drivers/gpu/drm/vc4/vc4_kms.c
+++ b/drivers/gpu/drm/vc4/vc4_kms.c
@@ -15,6 +15,7 @@
  */
 
 #include "drm_crtc.h"
+#include "drm_atomic.h"
 #include "drm_atomic_helper.h"
 #include "drm_crtc_helper.h"
 #include "drm_plane_helper.h"
@@ -29,10 +30,152 @@ static void vc4_output_poll_changed(struct drm_device *dev)
 		drm_fbdev_cma_hotplug_event(vc4->fbdev);
 }
 
+struct vc4_commit {
+	struct drm_device *dev;
+	struct drm_atomic_state *state;
+	struct vc4_seqno_cb cb;
+};
+
+static void
+vc4_atomic_complete_commit(struct vc4_commit *c)
+{
+	struct drm_atomic_state *state = c->state;
+	struct drm_device *dev = state->dev;
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+
+	drm_atomic_helper_commit_modeset_disables(dev, state);
+
+	drm_atomic_helper_commit_planes(dev, state, false);
+
+	drm_atomic_helper_commit_modeset_enables(dev, state);
+
+	drm_atomic_helper_wait_for_vblanks(dev, state);
+
+	drm_atomic_helper_cleanup_planes(dev, state);
+
+	drm_atomic_state_free(state);
+
+	up(&vc4->async_modeset);
+
+	kfree(c);
+}
+
+static void
+vc4_atomic_complete_commit_seqno_cb(struct vc4_seqno_cb *cb)
+{
+	struct vc4_commit *c = container_of(cb, struct vc4_commit, cb);
+
+	vc4_atomic_complete_commit(c);
+}
+
+static struct vc4_commit *commit_init(struct drm_atomic_state *state)
+{
+	struct vc4_commit *c = kzalloc(sizeof(*c), GFP_KERNEL);
+
+	if (!c)
+		return NULL;
+	c->dev = state->dev;
+	c->state = state;
+
+	return c;
+}
+
+/**
+ * vc4_atomic_commit - commit validated state object
+ * @dev: DRM device
+ * @state: the driver state object
+ * @async: asynchronous commit
+ *
+ * This function commits a state object that has been pre-validated with
+ * drm_atomic_helper_check().  This can still fail when, e.g., the framebuffer
+ * reservation fails.  Asynchronous commits are completed from a seqno
+ * callback once the GPU has finished rendering to the new framebuffers.
+ *
+ * RETURNS
+ * Zero for success or -errno.
+ */
+static int vc4_atomic_commit(struct drm_device *dev,
+			     struct drm_atomic_state *state,
+			     bool async)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+	int ret;
+	int i;
+	uint64_t wait_seqno = 0;
+	struct vc4_commit *c;
+
+	c = commit_init(state);
+	if (!c)
+		return -ENOMEM;
+
+	/* Make sure that any outstanding modesets have finished. */
+	ret = down_interruptible(&vc4->async_modeset);
+	if (ret) {
+		kfree(c);
+		return ret;
+	}
+
+	ret = drm_atomic_helper_prepare_planes(dev, state);
+	if (ret) {
+		kfree(c);
+		up(&vc4->async_modeset);
+		return ret;
+	}
+
+	for (i = 0; i < dev->mode_config.num_total_plane; i++) {
+		struct drm_plane *plane = state->planes[i];
+		struct drm_plane_state *new_state = state->plane_states[i];
+
+		if (!plane)
+			continue;
+
+		if ((plane->state->fb != new_state->fb) && new_state->fb) {
+			struct drm_gem_cma_object *cma_bo =
+				drm_fb_cma_get_gem_obj(new_state->fb, 0);
+			struct vc4_bo *bo = to_vc4_bo(&cma_bo->base);
+
+			wait_seqno = max(bo->seqno, wait_seqno);
+		}
+	}
+
+	/*
+	 * This is the point of no return - everything below never fails except
+	 * when the hw goes bonghits. Which means we can commit the new state on
+	 * the software side now.
+	 */
+
+	drm_atomic_helper_swap_state(dev, state);
+
+	/*
+	 * Everything below can be run asynchronously without the need to grab
+	 * any modeset locks at all under one condition: It must be guaranteed
+	 * that the asynchronous work has either been cancelled (if the driver
+	 * supports it, which at least requires that the framebuffers get
+	 * cleaned up with drm_atomic_helper_cleanup_planes()) or completed
+	 * before the new state gets committed on the software side with
+	 * drm_atomic_helper_swap_state().
+	 *
+	 * This scheme allows new atomic state updates to be prepared and
+	 * checked in parallel to the asynchronous completion of the previous
+	 * update. Which is important since compositors need to figure out the
+	 * composition of the next frame right after having submitted the
+	 * current layout.
+	 */
+
+	if (async) {
+		vc4_queue_seqno_cb(dev, &c->cb, wait_seqno,
+				   vc4_atomic_complete_commit_seqno_cb);
+	} else {
+		vc4_wait_for_seqno(dev, wait_seqno, ~0ull, false);
+		vc4_atomic_complete_commit(c);
+	}
+
+	return 0;
+}
+
 static const struct drm_mode_config_funcs vc4_mode_funcs = {
 	.output_poll_changed = vc4_output_poll_changed,
 	.atomic_check = drm_atomic_helper_check,
-	.atomic_commit = drm_atomic_helper_commit,
+	.atomic_commit = vc4_atomic_commit,
 	.fb_create = drm_fb_cma_create,
 };
 
@@ -41,6 +184,8 @@ int vc4_kms_load(struct drm_device *dev)
 	struct vc4_dev *vc4 = to_vc4_dev(dev);
 	int ret;
 
+	sema_init(&vc4->async_modeset, 1);
+
 	ret = drm_vblank_init(dev, dev->mode_config.num_crtc);
 	if (ret < 0) {
 		dev_err(dev->dev, "failed to initialize vblank\n");
@@ -51,6 +196,8 @@ int vc4_kms_load(struct drm_device *dev)
 	dev->mode_config.max_height = 2048;
 	dev->mode_config.funcs = &vc4_mode_funcs;
 	dev->mode_config.preferred_depth = 24;
+	dev->mode_config.async_page_flip = true;
+
 	dev->vblank_disable_allowed = true;
 
 	drm_mode_config_reset(dev);
diff --git a/drivers/gpu/drm/vc4/vc4_plane.c b/drivers/gpu/drm/vc4/vc4_plane.c
index cdd8b10..db32c373 100644
--- a/drivers/gpu/drm/vc4/vc4_plane.c
+++ b/drivers/gpu/drm/vc4/vc4_plane.c
@@ -29,6 +29,14 @@ struct vc4_plane_state {
 	u32 *dlist;
 	u32 dlist_size; /* Number of dwords allocated for the display list. */
 	u32 dlist_count; /* Number of used dwords in the display list. */
+
+	/* Offset in the dlist to pointer word 0. */
+	u32 pw0_offset;
+
+	/* Pointer into the hardware dlist where the plane's dlist was
+	 * last written, at vc4_crtc_atomic_flush() time.
+	 */
+	u32 *hw_dlist;
 };
 
 static inline struct vc4_plane_state *
@@ -197,6 +205,8 @@ static int vc4_plane_mode_set(struct drm_plane *plane,
 	/* Position Word 3: Context.  Written by the HVS. */
 	vc4_dlist_write(vc4_state, 0xc0c0c0c0);
 
+	vc4_state->pw0_offset = vc4_state->dlist_count;
+
 	/* Pointer Word 0: RGB / Y Pointer */
 	vc4_dlist_write(vc4_state, bo->paddr + offset);
 
@@ -248,6 +258,8 @@ u32 vc4_plane_write_dlist(struct drm_plane *plane, u32 __iomem *dlist)
 	struct vc4_plane_state *vc4_state = to_vc4_plane_state(plane->state);
 	int i;
 
+	vc4_state->hw_dlist = dlist;
+
 	/* Can't memcpy_toio() because it needs to be 32-bit writes. */
 	for (i = 0; i < vc4_state->dlist_count; i++)
 		writel(vc4_state->dlist[i], &dlist[i]);
@@ -262,6 +274,34 @@ u32 vc4_plane_dlist_size(struct drm_plane_state *state)
 	return vc4_state->dlist_count;
 }
 
+/* Updates the plane to immediately (well, once the FIFO needs
+ * refilling) scan out from a new framebuffer.
+ */
+void vc4_plane_async_set_fb(struct drm_plane *plane, struct drm_framebuffer *fb)
+{
+	struct vc4_plane_state *vc4_state = to_vc4_plane_state(plane->state);
+	struct drm_gem_cma_object *bo = drm_fb_cma_get_gem_obj(fb, 0);
+	uint32_t addr;
+
+	/* We're skipping the address adjustment for negative origin,
+	 * because this is only called on the primary plane.
+	 */
+	WARN_ON_ONCE(plane->state->crtc_x < 0 || plane->state->crtc_y < 0);
+	addr = bo->paddr + fb->offsets[0];
+
+	/* Write the new address into the hardware immediately.  The
+	 * scanout will start from this address as soon as the FIFO
+	 * needs to refill with pixels.
+	 */
+	writel(addr, &vc4_state->hw_dlist[vc4_state->pw0_offset]);
+
+	/* Also update the CPU-side dlist copy, so that any later
+	 * atomic updates that don't do a new modeset on our plane
+	 * also use our updated address.
+	 */
+	vc4_state->dlist[vc4_state->pw0_offset] = addr;
+}
+
 static const struct drm_plane_helper_funcs vc4_plane_helper_funcs = {
 	.prepare_fb = NULL,
 	.cleanup_fb = NULL,
-- 
2.6.2


^ permalink raw reply related	[flat|nested] 53+ messages in thread
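
The vc4_queue_seqno_cb() interface above either queues the callback on
seqno_cb_list or schedules it immediately when the seqno has already
retired, all under job_lock. A minimal sketch of a hypothetical
in-driver user; my_flip_ctx, my_flip_done, and finish_flip are
illustrative names, not part of the patch:

struct my_flip_ctx {
	struct vc4_seqno_cb cb;
	struct drm_framebuffer *fb;
};

/* Runs from a workqueue once vc4->finished_seqno has passed cb->seqno. */
static void my_flip_done(struct vc4_seqno_cb *cb)
{
	struct my_flip_ctx *ctx = container_of(cb, struct my_flip_ctx, cb);

	finish_flip(ctx->fb);	/* stand-in for the real completion work */
	kfree(ctx);
}

static int my_queue_flip(struct drm_device *dev, struct my_flip_ctx *ctx,
			 uint64_t seqno)
{
	/* Fires immediately (still via the workqueue) if the seqno has
	 * already been reached.
	 */
	return vc4_queue_seqno_cb(dev, &ctx->cb, seqno, my_flip_done);
}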

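From userspace, the new code path is reached through the ordinary KMS
page flip request with the async flag set; a sketch assuming libdrm's
drmModePageFlip():

#include <stdint.h>
#include <xf86drm.h>
#include <xf86drmMode.h>

/* Request a non-vblank-synced flip; vc4_async_page_flip() above defers
 * the scanout address update until V3D finishes rendering to fb_id's
 * BO.  The completion event is read back later via drmHandleEvent().
 */
static int flip_async(int fd, uint32_t crtc_id, uint32_t fb_id,
		      void *user_data)
{
	return drmModePageFlip(fd, crtc_id, fb_id,
			       DRM_MODE_PAGE_FLIP_ASYNC |
			       DRM_MODE_PAGE_FLIP_EVENT,
			       user_data);
}
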

* [PATCH 9/9] drm/vc4: Add an interface for capturing the GPU state after a hang.
  2015-12-01 20:35 ` Eric Anholt
@ 2015-12-01 20:35   ` Eric Anholt
  -1 siblings, 0 replies; 53+ messages in thread
From: Eric Anholt @ 2015-12-01 20:35 UTC (permalink / raw)
  To: dri-devel; +Cc: linux-kernel, David Airlie, Eric Anholt

This can be parsed with the vc4-gpu-tools utilities when trying to
figure out what was going on.

Signed-off-by: Eric Anholt <eric@anholt.net>
---
 drivers/gpu/drm/vc4/vc4_drv.c |   2 +
 drivers/gpu/drm/vc4/vc4_drv.h |   4 +
 drivers/gpu/drm/vc4/vc4_gem.c | 185 ++++++++++++++++++++++++++++++++++++++++++
 include/uapi/drm/vc4_drm.h    |  45 ++++++++++
 4 files changed, 236 insertions(+)

diff --git a/drivers/gpu/drm/vc4/vc4_drv.c b/drivers/gpu/drm/vc4/vc4_drv.c
index 2cfee59..97226b6 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.c
+++ b/drivers/gpu/drm/vc4/vc4_drv.c
@@ -80,6 +80,8 @@ static const struct drm_ioctl_desc vc4_drm_ioctls[] = {
 	DRM_IOCTL_DEF_DRV(VC4_CREATE_BO, vc4_create_bo_ioctl, 0),
 	DRM_IOCTL_DEF_DRV(VC4_MMAP_BO, vc4_mmap_bo_ioctl, 0),
 	DRM_IOCTL_DEF_DRV(VC4_CREATE_SHADER_BO, vc4_create_shader_bo_ioctl, 0),
+	DRM_IOCTL_DEF_DRV(VC4_GET_HANG_STATE, vc4_get_hang_state_ioctl,
+			  DRM_ROOT_ONLY),
 };
 
 static struct drm_driver vc4_drm_driver = {
diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h
index 9a8ee23..cd3e243 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.h
+++ b/drivers/gpu/drm/vc4/vc4_drv.h
@@ -19,6 +19,8 @@ struct vc4_dev {
 
 	struct drm_fbdev_cma *fbdev;
 
+	struct vc4_hang_state *hang_state;
+
 	/* The kernel-space BO cache.  Tracks buffers that have been
 	 * unreferenced by all other users (refcounts of 0!) but not
 	 * yet freed, so we can do cheap allocations.
@@ -369,6 +371,8 @@ int vc4_create_shader_bo_ioctl(struct drm_device *dev, void *data,
 			       struct drm_file *file_priv);
 int vc4_mmap_bo_ioctl(struct drm_device *dev, void *data,
 		      struct drm_file *file_priv);
+int vc4_get_hang_state_ioctl(struct drm_device *dev, void *data,
+			     struct drm_file *file_priv);
 int vc4_mmap(struct file *filp, struct vm_area_struct *vma);
 int vc4_prime_mmap(struct drm_gem_object *obj, struct vm_area_struct *vma);
 void *vc4_prime_vmap(struct drm_gem_object *obj);
diff --git a/drivers/gpu/drm/vc4/vc4_gem.c b/drivers/gpu/drm/vc4/vc4_gem.c
index a62ca68..fb0b92d 100644
--- a/drivers/gpu/drm/vc4/vc4_gem.c
+++ b/drivers/gpu/drm/vc4/vc4_gem.c
@@ -40,6 +40,186 @@ vc4_queue_hangcheck(struct drm_device *dev)
 		  round_jiffies_up(jiffies + msecs_to_jiffies(100)));
 }
 
+struct vc4_hang_state {
+	struct drm_vc4_get_hang_state user_state;
+
+	u32 bo_count;
+	struct drm_gem_object **bo;
+};
+
+static void
+vc4_free_hang_state(struct drm_device *dev, struct vc4_hang_state *state)
+{
+	unsigned int i;
+
+	mutex_lock(&dev->struct_mutex);
+	for (i = 0; i < state->user_state.bo_count; i++)
+		drm_gem_object_unreference(state->bo[i]);
+	mutex_unlock(&dev->struct_mutex);
+
+	kfree(state);
+}
+
+int
+vc4_get_hang_state_ioctl(struct drm_device *dev, void *data,
+			 struct drm_file *file_priv)
+{
+	struct drm_vc4_get_hang_state *get_state = data;
+	struct drm_vc4_get_hang_state_bo *bo_state;
+	struct vc4_hang_state *kernel_state;
+	struct drm_vc4_get_hang_state *state;
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+	unsigned long irqflags;
+	u32 i;
+	int ret;
+
+	spin_lock_irqsave(&vc4->job_lock, irqflags);
+	kernel_state = vc4->hang_state;
+	if (!kernel_state) {
+		spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+		return -ENOENT;
+	}
+	state = &kernel_state->user_state;
+
+	/* If the user's array isn't big enough, just return the
+	 * required array size.
+	 */
+	if (get_state->bo_count < state->bo_count) {
+		get_state->bo_count = state->bo_count;
+		spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+		return 0;
+	}
+
+	vc4->hang_state = NULL;
+	spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+
+	/* Save the user's BO pointer, so we don't stomp it with the memcpy. */
+	state->bo = get_state->bo;
+	memcpy(get_state, state, sizeof(*state));
+
+	bo_state = kcalloc(state->bo_count, sizeof(*bo_state), GFP_KERNEL);
+	if (!bo_state) {
+		ret = -ENOMEM;
+		goto err_free;
+	}
+
+	for (i = 0; i < state->bo_count; i++) {
+		struct vc4_bo *vc4_bo = to_vc4_bo(kernel_state->bo[i]);
+		u32 handle;
+
+		ret = drm_gem_handle_create(file_priv, kernel_state->bo[i],
+					    &handle);
+
+		if (ret) {
+			state->bo_count = i - 1;
+			goto err;
+		}
+		bo_state[i].handle = handle;
+		bo_state[i].paddr = vc4_bo->base.paddr;
+		bo_state[i].size = vc4_bo->base.base.size;
+	}
+
+	/* copy_to_user() returns the number of uncopied bytes, not an
+	 * errno, so don't hand its return value back to userspace.
+	 */
+	if (copy_to_user((void __user *)(uintptr_t)get_state->bo,
+			 bo_state,
+			 state->bo_count * sizeof(*bo_state)))
+		ret = -EFAULT;
+	else
+		ret = 0;
+	kfree(bo_state);
+
+err_free:
+
+	vc4_free_hang_state(dev, kernel_state);
+
+err:
+	return ret;
+}
+
+static void
+vc4_save_hang_state(struct drm_device *dev)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+	struct drm_vc4_get_hang_state *state;
+	struct vc4_hang_state *kernel_state;
+	struct vc4_exec_info *exec;
+	struct vc4_bo *bo;
+	unsigned long irqflags;
+	unsigned int i, unref_list_count;
+
+	kernel_state = kcalloc(1, sizeof(*kernel_state), GFP_KERNEL);
+	if (!kernel_state)
+		return;
+
+	state = &kernel_state->user_state;
+
+	spin_lock_irqsave(&vc4->job_lock, irqflags);
+	exec = vc4_first_job(vc4);
+	if (!exec) {
+		spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+		return;
+	}
+
+	unref_list_count = 0;
+	list_for_each_entry(bo, &exec->unref_list, unref_head)
+		unref_list_count++;
+
+	state->bo_count = exec->bo_count + unref_list_count;
+	kernel_state->bo = kcalloc(state->bo_count, sizeof(*kernel_state->bo),
+				   GFP_ATOMIC);
+	if (!kernel_state->bo) {
+		spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+		return;
+	}
+
+	for (i = 0; i < exec->bo_count; i++) {
+		drm_gem_object_reference(&exec->bo[i].bo->base);
+		kernel_state->bo[i] = &exec->bo[i].bo->base;
+	}
+
+	list_for_each_entry(bo, &exec->unref_list, unref_head) {
+		drm_gem_object_reference(&bo->base.base);
+		kernel_state->bo[i] = &bo->base.base;
+		i++;
+	}
+
+	state->start_bin = exec->ct0ca;
+	state->start_render = exec->ct1ca;
+
+	spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+
+	state->ct0ca = V3D_READ(V3D_CTNCA(0));
+	state->ct0ea = V3D_READ(V3D_CTNEA(0));
+
+	state->ct1ca = V3D_READ(V3D_CTNCA(1));
+	state->ct1ea = V3D_READ(V3D_CTNEA(1));
+
+	state->ct0cs = V3D_READ(V3D_CTNCS(0));
+	state->ct1cs = V3D_READ(V3D_CTNCS(1));
+
+	state->ct0ra0 = V3D_READ(V3D_CT00RA0);
+	state->ct1ra0 = V3D_READ(V3D_CT01RA0);
+
+	state->bpca = V3D_READ(V3D_BPCA);
+	state->bpcs = V3D_READ(V3D_BPCS);
+	state->bpoa = V3D_READ(V3D_BPOA);
+	state->bpos = V3D_READ(V3D_BPOS);
+
+	state->vpmbase = V3D_READ(V3D_VPMBASE);
+
+	state->dbge = V3D_READ(V3D_DBGE);
+	state->fdbgo = V3D_READ(V3D_FDBGO);
+	state->fdbgb = V3D_READ(V3D_FDBGB);
+	state->fdbgr = V3D_READ(V3D_FDBGR);
+	state->fdbgs = V3D_READ(V3D_FDBGS);
+	state->errstat = V3D_READ(V3D_ERRSTAT);
+
+	spin_lock_irqsave(&vc4->job_lock, irqflags);
+	if (vc4->hang_state) {
+		spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+		vc4_free_hang_state(dev, kernel_state);
+	} else {
+		vc4->hang_state = kernel_state;
+		spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+	}
+}
+
 static void
 vc4_reset(struct drm_device *dev)
 {
@@ -64,6 +244,8 @@ vc4_reset_work(struct work_struct *work)
 	struct vc4_dev *vc4 =
 		container_of(work, struct vc4_dev, hangcheck.reset_work);
 
+	vc4_save_hang_state(vc4->dev);
+
 	vc4_reset(vc4->dev);
 }
 
@@ -677,4 +859,7 @@ vc4_gem_destroy(struct drm_device *dev)
 	}
 
 	vc4_bo_cache_destroy(dev);
+
+	if (vc4->hang_state)
+		vc4_free_hang_state(dev, vc4->hang_state);
 }
diff --git a/include/uapi/drm/vc4_drm.h b/include/uapi/drm/vc4_drm.h
index e711ec0..c548f43 100644
--- a/include/uapi/drm/vc4_drm.h
+++ b/include/uapi/drm/vc4_drm.h
@@ -32,6 +32,7 @@
 #define DRM_VC4_CREATE_BO                         0x03
 #define DRM_VC4_MMAP_BO                           0x04
 #define DRM_VC4_CREATE_SHADER_BO                  0x05
+#define DRM_VC4_GET_HANG_STATE                    0x06
 
 #define DRM_IOCTL_VC4_SUBMIT_CL           DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_SUBMIT_CL, struct drm_vc4_submit_cl)
 #define DRM_IOCTL_VC4_WAIT_SEQNO          DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_WAIT_SEQNO, struct drm_vc4_wait_seqno)
@@ -39,6 +40,7 @@
 #define DRM_IOCTL_VC4_CREATE_BO           DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_CREATE_BO, struct drm_vc4_create_bo)
 #define DRM_IOCTL_VC4_MMAP_BO             DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_MMAP_BO, struct drm_vc4_mmap_bo)
 #define DRM_IOCTL_VC4_CREATE_SHADER_BO    DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_CREATE_SHADER_BO, struct drm_vc4_create_shader_bo)
+#define DRM_IOCTL_VC4_GET_HANG_STATE      DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_GET_HANG_STATE, struct drm_vc4_get_hang_state)
 
 struct drm_vc4_submit_rcl_surface {
 	uint32_t hindex; /* Handle index, or ~0 if not present. */
@@ -231,4 +233,47 @@ struct drm_vc4_create_shader_bo {
 	uint32_t pad;
 };
 
+struct drm_vc4_get_hang_state_bo {
+	uint32_t handle;
+	uint32_t paddr;
+	uint32_t size;
+	uint32_t pad;
+};
+
+/**
+ * struct drm_vc4_get_hang_state - ioctl argument for collecting state
+ * from a GPU hang for analysis.
+ */
+struct drm_vc4_get_hang_state {
+	/** Pointer to array of struct drm_vc4_get_hang_state_bo. */
+	uint64_t bo;
+	/**
+	 * On input, the size of the bo array.  Output is the number
+	 * of bos to be returned.
+	 */
+	uint32_t bo_count;
+
+	uint32_t start_bin, start_render;
+
+	uint32_t ct0ca, ct0ea;
+	uint32_t ct1ca, ct1ea;
+	uint32_t ct0cs, ct1cs;
+	uint32_t ct0ra0, ct1ra0;
+
+	uint32_t bpca, bpcs;
+	uint32_t bpoa, bpos;
+
+	uint32_t vpmbase;
+
+	uint32_t dbge;
+	uint32_t fdbgo;
+	uint32_t fdbgb;
+	uint32_t fdbgr;
+	uint32_t fdbgs;
+	uint32_t errstat;
+
+	/* Padding that more registers may be saved into in the future. */
+	uint32_t pad[16];
+};
+
 #endif /* _UAPI_VC4_DRM_H_ */
-- 
2.6.2


^ permalink raw reply related	[flat|nested] 53+ messages in thread
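
The new ioctl is designed to be called twice: once to learn the
required BO count, then again with a large enough array, at which point
the kernel hands over the saved registers and a handle/paddr/size
record per BO. A rough userspace sketch, assuming libdrm's drmIoctl()
and the vc4_drm.h from this series:

#include <errno.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <xf86drm.h>
#include "vc4_drm.h"

static int dump_hang_state(int fd)
{
	struct drm_vc4_get_hang_state get;
	struct drm_vc4_get_hang_state_bo *bos;
	int ret;

	memset(&get, 0, sizeof(get));
	ret = drmIoctl(fd, DRM_IOCTL_VC4_GET_HANG_STATE, &get);
	if (ret)
		return ret;	/* errno is ENOENT if no hang was recorded */

	/* The first call reported the required array size in bo_count. */
	bos = calloc(get.bo_count, sizeof(*bos));
	if (!bos)
		return -1;
	get.bo = (uintptr_t)bos;

	ret = drmIoctl(fd, DRM_IOCTL_VC4_GET_HANG_STATE, &get);
	if (ret == 0) {
		/* get.start_bin, get.ct0ca/ct0ea, get.errstat, etc. now
		 * hold the saved registers, and bos[] has one
		 * handle/paddr/size entry per BO involved in the hang.
		 */
	}
	free(bos);
	return ret;
}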

* [PATCH 9/9] drm/vc4: Add an interface for capturing the GPU state after a hang.
@ 2015-12-01 20:35   ` Eric Anholt
  0 siblings, 0 replies; 53+ messages in thread
From: Eric Anholt @ 2015-12-01 20:35 UTC (permalink / raw)
  To: dri-devel; +Cc: linux-kernel

This can be parsed with the vc4-gpu-tools utilities when trying to
figure out what was going on.

Signed-off-by: Eric Anholt <eric@anholt.net>
---
 drivers/gpu/drm/vc4/vc4_drv.c |   2 +
 drivers/gpu/drm/vc4/vc4_drv.h |   4 +
 drivers/gpu/drm/vc4/vc4_gem.c | 185 ++++++++++++++++++++++++++++++++++++++++++
 include/uapi/drm/vc4_drm.h    |  45 ++++++++++
 4 files changed, 236 insertions(+)

diff --git a/drivers/gpu/drm/vc4/vc4_drv.c b/drivers/gpu/drm/vc4/vc4_drv.c
index 2cfee59..97226b6 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.c
+++ b/drivers/gpu/drm/vc4/vc4_drv.c
@@ -80,6 +80,8 @@ static const struct drm_ioctl_desc vc4_drm_ioctls[] = {
 	DRM_IOCTL_DEF_DRV(VC4_CREATE_BO, vc4_create_bo_ioctl, 0),
 	DRM_IOCTL_DEF_DRV(VC4_MMAP_BO, vc4_mmap_bo_ioctl, 0),
 	DRM_IOCTL_DEF_DRV(VC4_CREATE_SHADER_BO, vc4_create_shader_bo_ioctl, 0),
+	DRM_IOCTL_DEF_DRV(VC4_GET_HANG_STATE, vc4_get_hang_state_ioctl,
+			  DRM_ROOT_ONLY),
 };
 
 static struct drm_driver vc4_drm_driver = {
diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h
index 9a8ee23..cd3e243 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.h
+++ b/drivers/gpu/drm/vc4/vc4_drv.h
@@ -19,6 +19,8 @@ struct vc4_dev {
 
 	struct drm_fbdev_cma *fbdev;
 
+	struct vc4_hang_state *hang_state;
+
 	/* The kernel-space BO cache.  Tracks buffers that have been
 	 * unreferenced by all other users (refcounts of 0!) but not
 	 * yet freed, so we can do cheap allocations.
@@ -369,6 +371,8 @@ int vc4_create_shader_bo_ioctl(struct drm_device *dev, void *data,
 			       struct drm_file *file_priv);
 int vc4_mmap_bo_ioctl(struct drm_device *dev, void *data,
 		      struct drm_file *file_priv);
+int vc4_get_hang_state_ioctl(struct drm_device *dev, void *data,
+			     struct drm_file *file_priv);
 int vc4_mmap(struct file *filp, struct vm_area_struct *vma);
 int vc4_prime_mmap(struct drm_gem_object *obj, struct vm_area_struct *vma);
 void *vc4_prime_vmap(struct drm_gem_object *obj);
diff --git a/drivers/gpu/drm/vc4/vc4_gem.c b/drivers/gpu/drm/vc4/vc4_gem.c
index a62ca68..fb0b92d 100644
--- a/drivers/gpu/drm/vc4/vc4_gem.c
+++ b/drivers/gpu/drm/vc4/vc4_gem.c
@@ -40,6 +40,186 @@ vc4_queue_hangcheck(struct drm_device *dev)
 		  round_jiffies_up(jiffies + msecs_to_jiffies(100)));
 }
 
+struct vc4_hang_state {
+	struct drm_vc4_get_hang_state user_state;
+
+	u32 bo_count;
+	struct drm_gem_object **bo;
+};
+
+static void
+vc4_free_hang_state(struct drm_device *dev, struct vc4_hang_state *state)
+{
+	unsigned int i;
+
+	mutex_lock(&dev->struct_mutex);
+	for (i = 0; i < state->user_state.bo_count; i++)
+		drm_gem_object_unreference(state->bo[i]);
+	mutex_unlock(&dev->struct_mutex);
+
+	kfree(state);
+}
+
+int
+vc4_get_hang_state_ioctl(struct drm_device *dev, void *data,
+			 struct drm_file *file_priv)
+{
+	struct drm_vc4_get_hang_state *get_state = data;
+	struct drm_vc4_get_hang_state_bo *bo_state;
+	struct vc4_hang_state *kernel_state;
+	struct drm_vc4_get_hang_state *state;
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+	unsigned long irqflags;
+	u32 i;
+	int ret = 0;
+
+	spin_lock_irqsave(&vc4->job_lock, irqflags);
+	kernel_state = vc4->hang_state;
+	if (!kernel_state) {
+		spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+		return -ENOENT;
+	}
+	state = &kernel_state->user_state;
+
+	/* If the user's array isn't big enough, just return the
+	 * required array size.
+	 */
+	if (get_state->bo_count < state->bo_count) {
+		get_state->bo_count = state->bo_count;
+		spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+		return 0;
+	}
+
+	vc4->hang_state = NULL;
+	spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+
+	/* Save the user's BO pointer, so we don't stomp it with the memcpy. */
+	state->bo = get_state->bo;
+	memcpy(get_state, state, sizeof(*state));
+
+	bo_state = kcalloc(state->bo_count, sizeof(*bo_state), GFP_KERNEL);
+	if (!bo_state) {
+		ret = -ENOMEM;
+		goto err_free;
+	}
+
+	for (i = 0; i < state->bo_count; i++) {
+		struct vc4_bo *vc4_bo = to_vc4_bo(kernel_state->bo[i]);
+		u32 handle;
+
+		ret = drm_gem_handle_create(file_priv, kernel_state->bo[i],
+					    &handle);
+
+		if (ret) {
+			state->bo_count = i;
+			goto err;
+		}
+		bo_state[i].handle = handle;
+		bo_state[i].paddr = vc4_bo->base.paddr;
+		bo_state[i].size = vc4_bo->base.base.size;
+	}
+
+	if (copy_to_user((void __user *)(uintptr_t)get_state->bo, bo_state,
+			 state->bo_count * sizeof(*bo_state)))
+		ret = -EFAULT;
+	kfree(bo_state);
+
+err_free:
+
+	vc4_free_hang_state(dev, kernel_state);
+
+err:
+	return ret;
+}
+
+static void
+vc4_save_hang_state(struct drm_device *dev)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+	struct drm_vc4_get_hang_state *state;
+	struct vc4_hang_state *kernel_state;
+	struct vc4_exec_info *exec;
+	struct vc4_bo *bo;
+	unsigned long irqflags;
+	unsigned int i, unref_list_count;
+
+	kernel_state = kcalloc(1, sizeof(*kernel_state), GFP_KERNEL);
+	if (!kernel_state)
+		return;
+
+	state = &kernel_state->user_state;
+
+	spin_lock_irqsave(&vc4->job_lock, irqflags);
+	exec = vc4_first_job(vc4);
+	if (!exec) {
+		spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+		return;
+	}
+
+	unref_list_count = 0;
+	list_for_each_entry(bo, &exec->unref_list, unref_head)
+		unref_list_count++;
+
+	state->bo_count = exec->bo_count + unref_list_count;
+	kernel_state->bo = kcalloc(state->bo_count, sizeof(*kernel_state->bo),
+				   GFP_ATOMIC);
+	if (!kernel_state->bo) {
+		spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+		return;
+	}
+
+	for (i = 0; i < exec->bo_count; i++) {
+		drm_gem_object_reference(&exec->bo[i].bo->base);
+		kernel_state->bo[i] = &exec->bo[i].bo->base;
+	}
+
+	list_for_each_entry(bo, &exec->unref_list, unref_head) {
+		drm_gem_object_reference(&bo->base.base);
+		kernel_state->bo[i] = &bo->base.base;
+		i++;
+	}
+
+	state->start_bin = exec->ct0ca;
+	state->start_render = exec->ct1ca;
+
+	spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+
+	state->ct0ca = V3D_READ(V3D_CTNCA(0));
+	state->ct0ea = V3D_READ(V3D_CTNEA(0));
+
+	state->ct1ca = V3D_READ(V3D_CTNCA(1));
+	state->ct1ea = V3D_READ(V3D_CTNEA(1));
+
+	state->ct0cs = V3D_READ(V3D_CTNCS(0));
+	state->ct1cs = V3D_READ(V3D_CTNCS(1));
+
+	state->ct0ra0 = V3D_READ(V3D_CT00RA0);
+	state->ct1ra0 = V3D_READ(V3D_CT01RA0);
+
+	state->bpca = V3D_READ(V3D_BPCA);
+	state->bpcs = V3D_READ(V3D_BPCS);
+	state->bpoa = V3D_READ(V3D_BPOA);
+	state->bpos = V3D_READ(V3D_BPOS);
+
+	state->vpmbase = V3D_READ(V3D_VPMBASE);
+
+	state->dbge = V3D_READ(V3D_DBGE);
+	state->fdbgo = V3D_READ(V3D_FDBGO);
+	state->fdbgb = V3D_READ(V3D_FDBGB);
+	state->fdbgr = V3D_READ(V3D_FDBGR);
+	state->fdbgs = V3D_READ(V3D_FDBGS);
+	state->errstat = V3D_READ(V3D_ERRSTAT);
+
+	spin_lock_irqsave(&vc4->job_lock, irqflags);
+	if (vc4->hang_state) {
+		spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+		vc4_free_hang_state(dev, kernel_state);
+	} else {
+		vc4->hang_state = kernel_state;
+		spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+	}
+}
+
 static void
 vc4_reset(struct drm_device *dev)
 {
@@ -64,6 +244,8 @@ vc4_reset_work(struct work_struct *work)
 	struct vc4_dev *vc4 =
 		container_of(work, struct vc4_dev, hangcheck.reset_work);
 
+	vc4_save_hang_state(vc4->dev);
+
 	vc4_reset(vc4->dev);
 }
 
@@ -677,4 +859,7 @@ vc4_gem_destroy(struct drm_device *dev)
 	}
 
 	vc4_bo_cache_destroy(dev);
+
+	if (vc4->hang_state)
+		vc4_free_hang_state(dev, vc4->hang_state);
 }
diff --git a/include/uapi/drm/vc4_drm.h b/include/uapi/drm/vc4_drm.h
index e711ec0..c548f43 100644
--- a/include/uapi/drm/vc4_drm.h
+++ b/include/uapi/drm/vc4_drm.h
@@ -32,6 +32,7 @@
 #define DRM_VC4_CREATE_BO                         0x03
 #define DRM_VC4_MMAP_BO                           0x04
 #define DRM_VC4_CREATE_SHADER_BO                  0x05
+#define DRM_VC4_GET_HANG_STATE                    0x06
 
 #define DRM_IOCTL_VC4_SUBMIT_CL           DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_SUBMIT_CL, struct drm_vc4_submit_cl)
 #define DRM_IOCTL_VC4_WAIT_SEQNO          DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_WAIT_SEQNO, struct drm_vc4_wait_seqno)
@@ -39,6 +40,7 @@
 #define DRM_IOCTL_VC4_CREATE_BO           DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_CREATE_BO, struct drm_vc4_create_bo)
 #define DRM_IOCTL_VC4_MMAP_BO             DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_MMAP_BO, struct drm_vc4_mmap_bo)
 #define DRM_IOCTL_VC4_CREATE_SHADER_BO    DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_CREATE_SHADER_BO, struct drm_vc4_create_shader_bo)
+#define DRM_IOCTL_VC4_GET_HANG_STATE      DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_GET_HANG_STATE, struct drm_vc4_get_hang_state)
 
 struct drm_vc4_submit_rcl_surface {
 	uint32_t hindex; /* Handle index, or ~0 if not present. */
@@ -231,4 +233,47 @@ struct drm_vc4_create_shader_bo {
 	uint32_t pad;
 };
 
+struct drm_vc4_get_hang_state_bo {
+	uint32_t handle;
+	uint32_t paddr;
+	uint32_t size;
+	uint32_t pad;
+};
+
+/**
+ * struct drm_vc4_get_hang_state - ioctl argument for collecting state
+ * from a GPU hang for analysis.
+ */
+struct drm_vc4_get_hang_state {
+	/** Pointer to array of struct drm_vc4_get_hang_state_bo. */
+	uint64_t bo;
+	/**
+	 * On input, the size of the bo array.  Output is the number
+	 * of bos to be returned.
+	 */
+	uint32_t bo_count;
+
+	uint32_t start_bin, start_render;
+
+	uint32_t ct0ca, ct0ea;
+	uint32_t ct1ca, ct1ea;
+	uint32_t ct0cs, ct1cs;
+	uint32_t ct0ra0, ct1ra0;
+
+	uint32_t bpca, bpcs;
+	uint32_t bpoa, bpos;
+
+	uint32_t vpmbase;
+
+	uint32_t dbge;
+	uint32_t fdbgo;
+	uint32_t fdbgb;
+	uint32_t fdbgr;
+	uint32_t fdbgs;
+	uint32_t errstat;
+
+	/* Pad that we may save more registers into in the future. */
+	uint32_t pad[16];
+};
+
 #endif /* _UAPI_VC4_DRM_H_ */
-- 
2.6.2

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply related	[flat|nested] 53+ messages in thread
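
The GET_HANG_STATE ioctl above is driven from userspace with a
two-call pattern: a first call whose bo_count is too small (e.g. zero)
only reports the required array size, and a second call with a
big-enough array consumes the saved state. A minimal sketch, assuming
libdrm's drmIoctl(), the uapi header from this patch, and an fd opened
as root (the ioctl is DRM_ROOT_ONLY); dump_hang_state() and its error
handling are illustrative only:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <xf86drm.h>
#include "vc4_drm.h"

static int dump_hang_state(int fd)
{
	struct drm_vc4_get_hang_state get;
	struct drm_vc4_get_hang_state_bo *bos;

	/* First call: bo_count == 0, so the kernel only fills in the
	 * required array size (errno == ENOENT means no hang has been
	 * recorded).
	 */
	memset(&get, 0, sizeof(get));
	if (drmIoctl(fd, DRM_IOCTL_VC4_GET_HANG_STATE, &get))
		return -1;

	bos = calloc(get.bo_count, sizeof(*bos));
	if (!bos)
		return -1;

	/* Second call: the saved state is consumed, and one GEM handle
	 * per BO involved in the hung job is returned in the array.
	 */
	get.bo = (uintptr_t)bos;
	if (drmIoctl(fd, DRM_IOCTL_VC4_GET_HANG_STATE, &get)) {
		free(bos);
		return -1;
	}

	printf("bin CT 0x%08x..0x%08x, render CT 0x%08x..0x%08x, %u BOs\n",
	       get.ct0ca, get.ct0ea, get.ct1ca, get.ct1ea, get.bo_count);
	free(bos);
	return 0;
}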

* Re: [PATCH 1/9] drm: Create a driver hook for allocating GEM object structs.
  2015-12-01 20:35 ` Eric Anholt
@ 2015-12-02  8:58   ` Daniel Vetter
  -1 siblings, 0 replies; 53+ messages in thread
From: Daniel Vetter @ 2015-12-02  8:58 UTC (permalink / raw)
  To: Eric Anholt; +Cc: dri-devel, linux-kernel

On Tue, Dec 01, 2015 at 12:35:50PM -0800, Eric Anholt wrote:
> The CMA helpers had no way for a driver to extend the struct with its
> own fields.  Since the CMA helpers are mostly "Allocate a
> drm_gem_cma_object, then fill in a few fields", it's hard to write as
> pure helpers without passing in a driver callback for the allocate
> step.
> 
> Signed-off-by: Eric Anholt <eric@anholt.net>

Yeah there's no other way to make this happen without ridiculous amounts
of boilerplate.

Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>

> ---
>  drivers/gpu/drm/drm_gem_cma_helper.c | 10 ++++++----
>  include/drm/drmP.h                   |  7 +++++++
>  2 files changed, 13 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/drm_gem_cma_helper.c b/drivers/gpu/drm/drm_gem_cma_helper.c
> index e109b49..0f7b00ba 100644
> --- a/drivers/gpu/drm/drm_gem_cma_helper.c
> +++ b/drivers/gpu/drm/drm_gem_cma_helper.c
> @@ -59,11 +59,13 @@ __drm_gem_cma_create(struct drm_device *drm, size_t size)
>  	struct drm_gem_object *gem_obj;
>  	int ret;
>  
> -	cma_obj = kzalloc(sizeof(*cma_obj), GFP_KERNEL);
> -	if (!cma_obj)
> +	if (drm->driver->gem_create_object)
> +		gem_obj = drm->driver->gem_create_object(drm, size);
> +	else
> +		gem_obj = kzalloc(sizeof(*cma_obj), GFP_KERNEL);
> +	if (!gem_obj)
>  		return ERR_PTR(-ENOMEM);
> -
> -	gem_obj = &cma_obj->base;
> +	cma_obj = container_of(gem_obj, struct drm_gem_cma_object, base);
>  
>  	ret = drm_gem_object_init(drm, gem_obj, size);
>  	if (ret)
> diff --git a/include/drm/drmP.h b/include/drm/drmP.h
> index 0b921ae..22ff162 100644
> --- a/include/drm/drmP.h
> +++ b/include/drm/drmP.h
> @@ -580,6 +580,13 @@ struct drm_driver {
>  	int (*gem_open_object) (struct drm_gem_object *, struct drm_file *);
>  	void (*gem_close_object) (struct drm_gem_object *, struct drm_file *);
>  
> +	/**
> +	 * Hook for allocating the GEM object struct, for use by core
> +	 * helpers.
> +	 */
> +	struct drm_gem_object *(*gem_create_object)(struct drm_device *dev,
> +						    size_t size);
> +
>  	/* prime: */
>  	/* export handle -> fd (see drm_gem_prime_handle_to_fd() helper) */
>  	int (*prime_handle_to_fd)(struct drm_device *dev, struct drm_file *file_priv,
> -- 
> 2.6.2
> 
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel

-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 53+ messages in thread
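
To make the shape of the new hook concrete, a driver that wants
private per-BO state would embed struct drm_gem_cma_object and hand
back its base GEM object. A minimal sketch for a hypothetical "foo"
driver; the only requirements visible in the patch are that the
allocation is zeroed (the helper previously used kzalloc) and that
container_of() back to the CMA object works:

struct foo_bo {
	struct drm_gem_cma_object base;

	/* Driver-private fields extend the CMA object here. */
	struct list_head unref_head;
};

static struct drm_gem_object *foo_gem_create_object(struct drm_device *dev,
						    size_t size)
{
	/* The helper turns a NULL return into ERR_PTR(-ENOMEM). */
	struct foo_bo *bo = kzalloc(sizeof(*bo), GFP_KERNEL);

	if (!bo)
		return NULL;

	return &bo->base.base;
}

/* Wired up in the driver's struct drm_driver:
 *	.gem_create_object = foo_gem_create_object,
 */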

* Re: [PATCH 1/9] drm: Create a driver hook for allocating GEM object structs.
@ 2015-12-02  8:58   ` Daniel Vetter
  0 siblings, 0 replies; 53+ messages in thread
From: Daniel Vetter @ 2015-12-02  8:58 UTC (permalink / raw)
  To: Eric Anholt; +Cc: linux-kernel, dri-devel

On Tue, Dec 01, 2015 at 12:35:50PM -0800, Eric Anholt wrote:
> The CMA helpers had no way for a driver to extend the struct with its
> own fields.  Since the CMA helpers are mostly "Allocate a
> drm_gem_cma_object, then fill in a few fields", it's hard to write as
> pure helpers without passing in a driver callback for the allocate
> step.
> 
> Signed-off-by: Eric Anholt <eric@anholt.net>

Yeah there's no other way to make this happen without ridiculous amounts
of boilerplate.

Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>

> ---
>  drivers/gpu/drm/drm_gem_cma_helper.c | 10 ++++++----
>  include/drm/drmP.h                   |  7 +++++++
>  2 files changed, 13 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/drm_gem_cma_helper.c b/drivers/gpu/drm/drm_gem_cma_helper.c
> index e109b49..0f7b00ba 100644
> --- a/drivers/gpu/drm/drm_gem_cma_helper.c
> +++ b/drivers/gpu/drm/drm_gem_cma_helper.c
> @@ -59,11 +59,13 @@ __drm_gem_cma_create(struct drm_device *drm, size_t size)
>  	struct drm_gem_object *gem_obj;
>  	int ret;
>  
> -	cma_obj = kzalloc(sizeof(*cma_obj), GFP_KERNEL);
> -	if (!cma_obj)
> +	if (drm->driver->gem_create_object)
> +		gem_obj = drm->driver->gem_create_object(drm, size);
> +	else
> +		gem_obj = kzalloc(sizeof(*cma_obj), GFP_KERNEL);
> +	if (!gem_obj)
>  		return ERR_PTR(-ENOMEM);
> -
> -	gem_obj = &cma_obj->base;
> +	cma_obj = container_of(gem_obj, struct drm_gem_cma_object, base);
>  
>  	ret = drm_gem_object_init(drm, gem_obj, size);
>  	if (ret)
> diff --git a/include/drm/drmP.h b/include/drm/drmP.h
> index 0b921ae..22ff162 100644
> --- a/include/drm/drmP.h
> +++ b/include/drm/drmP.h
> @@ -580,6 +580,13 @@ struct drm_driver {
>  	int (*gem_open_object) (struct drm_gem_object *, struct drm_file *);
>  	void (*gem_close_object) (struct drm_gem_object *, struct drm_file *);
>  
> +	/**
> +	 * Hook for allocating the GEM object struct, for use by core
> +	 * helpers.
> +	 */
> +	struct drm_gem_object *(*gem_create_object)(struct drm_device *dev,
> +						    size_t size);
> +
>  	/* prime: */
>  	/* export handle -> fd (see drm_gem_prime_handle_to_fd() helper) */
>  	int (*prime_handle_to_fd)(struct drm_device *dev, struct drm_file *file_priv,
> -- 
> 2.6.2
> 
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel

-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 7/9] drm/vc4: Add support for drawing 3D frames.
  2015-12-01 20:35   ` Eric Anholt
@ 2015-12-02 11:29     ` Emil Velikov
  -1 siblings, 0 replies; 53+ messages in thread
From: Emil Velikov @ 2015-12-02 11:29 UTC (permalink / raw)
  To: Eric Anholt; +Cc: ML dri-devel, Linux-Kernel@Vger. Kernel. Org

Hi Eric,

On 1 December 2015 at 20:35, Eric Anholt <eric@anholt.net> wrote:
> The user submission is basically a pointer to a command list and a
> pointer to uniforms.  We copy those in to the kernel, validate and
> relocate them, and store the result in a GPU BO which we queue for
> execution.
>
> Signed-off-by: Eric Anholt <eric@anholt.net>
> ---
>  drivers/gpu/drm/vc4/Makefile           |   7 +
>  drivers/gpu/drm/vc4/vc4_drv.c          |  15 +-
>  drivers/gpu/drm/vc4/vc4_drv.h          | 192 +++++++
>  drivers/gpu/drm/vc4/vc4_gem.c          | 640 ++++++++++++++++++++++
>  drivers/gpu/drm/vc4/vc4_irq.c          | 210 ++++++++
>  drivers/gpu/drm/vc4/vc4_packet.h       | 399 ++++++++++++++
>  drivers/gpu/drm/vc4/vc4_render_cl.c    | 631 ++++++++++++++++++++++
>  drivers/gpu/drm/vc4/vc4_trace.h        |  63 +++
>  drivers/gpu/drm/vc4/vc4_trace_points.c |  14 +
>  drivers/gpu/drm/vc4/vc4_v3d.c          |  37 ++
>  drivers/gpu/drm/vc4/vc4_validate.c     | 955 +++++++++++++++++++++++++++++++++
>  include/uapi/drm/vc4_drm.h             | 141 +++++
>  12 files changed, 3303 insertions(+), 1 deletion(-)

Am I assuming correctly that the above are the exact same ones copied
into mesa (and used only when the simulator is used)? In the short term
keeping the two in sync would be fine, although I'm wondering if you
have plans to reshuffle things in the long term?

Thanks
Emil

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 7/9] drm/vc4: Add support for drawing 3D frames.
@ 2015-12-02 11:29     ` Emil Velikov
  0 siblings, 0 replies; 53+ messages in thread
From: Emil Velikov @ 2015-12-02 11:29 UTC (permalink / raw)
  To: Eric Anholt; +Cc: Linux-Kernel@Vger. Kernel. Org, ML dri-devel

Hi Eric,

On 1 December 2015 at 20:35, Eric Anholt <eric@anholt.net> wrote:
> The user submission is basically a pointer to a command list and a
> pointer to uniforms.  We copy those in to the kernel, validate and
> relocate them, and store the result in a GPU BO which we queue for
> execution.
>
> Signed-off-by: Eric Anholt <eric@anholt.net>
> ---
>  drivers/gpu/drm/vc4/Makefile           |   7 +
>  drivers/gpu/drm/vc4/vc4_drv.c          |  15 +-
>  drivers/gpu/drm/vc4/vc4_drv.h          | 192 +++++++
>  drivers/gpu/drm/vc4/vc4_gem.c          | 640 ++++++++++++++++++++++
>  drivers/gpu/drm/vc4/vc4_irq.c          | 210 ++++++++
>  drivers/gpu/drm/vc4/vc4_packet.h       | 399 ++++++++++++++
>  drivers/gpu/drm/vc4/vc4_render_cl.c    | 631 ++++++++++++++++++++++
>  drivers/gpu/drm/vc4/vc4_trace.h        |  63 +++
>  drivers/gpu/drm/vc4/vc4_trace_points.c |  14 +
>  drivers/gpu/drm/vc4/vc4_v3d.c          |  37 ++
>  drivers/gpu/drm/vc4/vc4_validate.c     | 955 +++++++++++++++++++++++++++++++++
>  include/uapi/drm/vc4_drm.h             | 141 +++++
>  12 files changed, 3303 insertions(+), 1 deletion(-)

Am I assuming correctly that the above are the exact same ones copied
into mesa (and used only when the simulator is used)? In the short term
keeping the two in sync would be fine, although I'm wondering if you
have plans to reshuffle things in the long term?

Thanks
Emil
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 53+ messages in thread
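
The copy-in step described in the commit message quoted above follows
the usual pattern of snapshotting user memory before validating it, so
the validator never chases pointers that userspace can still modify. A
minimal sketch, assuming a hypothetical submit struct with a u64 user
pointer and a size (the real drm_vc4_submit_cl layout may differ):

struct foo_submit {
	u64 bin_cl;		/* user pointer to the binner command list */
	u32 bin_cl_size;	/* size of that command list, in bytes */
};

static int foo_copy_in_bin_cl(struct foo_submit *args, void *kernel_cl)
{
	if (copy_from_user(kernel_cl,
			   (void __user *)(uintptr_t)args->bin_cl,
			   args->bin_cl_size))
		return -EFAULT;

	/* Validation and relocation then run on the kernel copy only. */
	return 0;
}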

* Re: [PATCH 4/9] drm/vc4: Add an API for creating GPU shaders in GEM BOs.
  2015-12-01 20:35   ` Eric Anholt
@ 2015-12-02 11:35     ` Emil Velikov
  -1 siblings, 0 replies; 53+ messages in thread
From: Emil Velikov @ 2015-12-02 11:35 UTC (permalink / raw)
  To: Eric Anholt; +Cc: ML dri-devel, Linux-Kernel@Vger. Kernel. Org

Hi Eric,

On 1 December 2015 at 20:35, Eric Anholt <eric@anholt.net> wrote:
> Since we have no MMU, the kernel needs to validate that the submitted
> shader code won't make any accesses to memory that the user doesn't
> control, which involves banning some operations (general purpose DMA
> writes), and tracking where we need to write out pointers for other
> operations (texture sampling).  Once it's validated, we return a GEM
> BO containing the shader, which doesn't allow mapping for write or
> exporting to other subsystems.
>
> Signed-off-by: Eric Anholt <eric@anholt.net>
> ---
>  drivers/gpu/drm/vc4/Makefile               |   3 +-
>  drivers/gpu/drm/vc4/vc4_bo.c               | 140 ++++++++
>  drivers/gpu/drm/vc4/vc4_drv.c              |   9 +-
>  drivers/gpu/drm/vc4/vc4_drv.h              |  50 +++
>  drivers/gpu/drm/vc4/vc4_qpu_defines.h      | 264 +++++++++++++++

May I suggest that one 'exports' the header to something like
libdrm_vc4? There are patches in flight for nouveau which, in a similar
fashion, allow userspace to reuse the existing api/abi.

-Emil

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 4/9] drm/vc4: Add an API for creating GPU shaders in GEM BOs.
@ 2015-12-02 11:35     ` Emil Velikov
  0 siblings, 0 replies; 53+ messages in thread
From: Emil Velikov @ 2015-12-02 11:35 UTC (permalink / raw)
  To: Eric Anholt; +Cc: Linux-Kernel@Vger. Kernel. Org, ML dri-devel

Hi Eric,

On 1 December 2015 at 20:35, Eric Anholt <eric@anholt.net> wrote:
> Since we have no MMU, the kernel needs to validate that the submitted
> shader code won't make any accesses to memory that the user doesn't
> control, which involves banning some operations (general purpose DMA
> writes), and tracking where we need to write out pointers for other
> operations (texture sampling).  Once it's validated, we return a GEM
> BO containing the shader, which doesn't allow mapping for write or
> exporting to other subsystems.
>
> Signed-off-by: Eric Anholt <eric@anholt.net>
> ---
>  drivers/gpu/drm/vc4/Makefile               |   3 +-
>  drivers/gpu/drm/vc4/vc4_bo.c               | 140 ++++++++
>  drivers/gpu/drm/vc4/vc4_drv.c              |   9 +-
>  drivers/gpu/drm/vc4/vc4_drv.h              |  50 +++
>  drivers/gpu/drm/vc4/vc4_qpu_defines.h      | 264 +++++++++++++++

May I suggest that one 'exports' the header to something like
libdrm_vc4? There are patches in flight for nouveau which, in a similar
fashion, allow userspace to reuse the existing api/abi.

-Emil
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 9/9] drm/vc4: Add an interface for capturing the GPU state after a hang.
  2015-12-01 20:35   ` Eric Anholt
@ 2015-12-02 11:45     ` Emil Velikov
  -1 siblings, 0 replies; 53+ messages in thread
From: Emil Velikov @ 2015-12-02 11:45 UTC (permalink / raw)
  To: Eric Anholt; +Cc: ML dri-devel, Linux-Kernel@Vger. Kernel. Org

On 1 December 2015 at 20:35, Eric Anholt <eric@anholt.net> wrote:
> This can be parsed with vc4-gpu-tools tools for trying to figure out
> what was going on.
>
I might be pushing my luck here ... have you thought about basing
(forking) vc4-gpu-tools off intel-gpu-tools? I'd imagine that the
macros and helpers will come in handy, even though some are quite
intel specific.

On a related note - with the above project in place I'd imagine you
have (re)considered about having libdrm-vc4 ? Copying hunks around
might lead to interesting moments (as you have already noticed :-P)

On that note I'll stop now with beating the libdrm drum :-)

Thanks
Emil

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 9/9] drm/vc4: Add an interface for capturing the GPU state after a hang.
@ 2015-12-02 11:45     ` Emil Velikov
  0 siblings, 0 replies; 53+ messages in thread
From: Emil Velikov @ 2015-12-02 11:45 UTC (permalink / raw)
  To: Eric Anholt; +Cc: Linux-Kernel@Vger. Kernel. Org, ML dri-devel

On 1 December 2015 at 20:35, Eric Anholt <eric@anholt.net> wrote:
> This can be parsed with vc4-gpu-tools tools for trying to figure out
> what was going on.
>
I might be pushing my luck here ... have you thought about basing
(forking) vc4-gpu-tools off intel-gpu-tools? I'd imagine that the
macros and helpers will come in handy, even though some are quite
intel specific.

On a related note - with the above project in place I'd imagine you
have (re)considered about having libdrm-vc4 ? Copying hunks around
might lead to interesting moments (as you have already noticed :-P)

On that note I'll stop now with beating the libdrm drum :-)

Thanks
Emil
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 8/9] drm/vc4: Add support for async pageflips.
  2015-12-01 20:35   ` Eric Anholt
@ 2015-12-02 14:05     ` Daniel Stone
  -1 siblings, 0 replies; 53+ messages in thread
From: Daniel Stone @ 2015-12-02 14:05 UTC (permalink / raw)
  To: Eric Anholt; +Cc: dri-devel, Linux Kernel Mailing List

Hi,

On 1 December 2015 at 20:35, Eric Anholt <eric@anholt.net> wrote:
> An async pageflip stores the modeset to be done and executes it once
> the BOs are ready to be displayed.  This gets us about 3x performance
> in full screen rendering with pageflipping.

Looks good, but you're missing a preclose callback to reap dead events, a la:
https://git.collabora.com/cgit/user/daniels/linux.git/commit/?h=wip/4.4.x/rockchip-drm-fixes&id=d14f21bcd7e7a1b9ca129c411a9da9c911037965

(and parent commit to wire that through to core DRM preclose)

Cheers,
Daniel

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 8/9] drm/vc4: Add support for async pageflips.
@ 2015-12-02 14:05     ` Daniel Stone
  0 siblings, 0 replies; 53+ messages in thread
From: Daniel Stone @ 2015-12-02 14:05 UTC (permalink / raw)
  To: Eric Anholt; +Cc: Linux Kernel Mailing List, dri-devel

Hi,

On 1 December 2015 at 20:35, Eric Anholt <eric@anholt.net> wrote:
> An async pageflip stores the modeset to be done and executes it once
> the BOs are ready to be displayed.  This gets us about 3x performance
> in full screen rendering with pageflipping.

Looks good, but you're missing a preclose callback to reap dead events, a la:
https://git.collabora.com/cgit/user/daniels/linux.git/commit/?h=wip/4.4.x/rockchip-drm-fixes&id=d14f21bcd7e7a1b9ca129c411a9da9c911037965

(and parent commit to wire that through to core DRM preclose)

Cheers,
Daniel
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 4/9] drm/vc4: Add an API for creating GPU shaders in GEM BOs.
  2015-12-02 11:35     ` Emil Velikov
@ 2015-12-02 19:24       ` Eric Anholt
  -1 siblings, 0 replies; 53+ messages in thread
From: Eric Anholt @ 2015-12-02 19:24 UTC (permalink / raw)
  To: Emil Velikov; +Cc: ML dri-devel, Linux-Kernel@Vger. Kernel. Org

[-- Attachment #1: Type: text/plain, Size: 1275 bytes --]

Emil Velikov <emil.l.velikov@gmail.com> writes:

> Hi Eric,
>
> On 1 December 2015 at 20:35, Eric Anholt <eric@anholt.net> wrote:
>> Since we have no MMU, the kernel needs to validate that the submitted
>> shader code won't make any accesses to memory that the user doesn't
>> control, which involves banning some operations (general purpose DMA
>> writes), and tracking where we need to write out pointers for other
>> operations (texture sampling).  Once it's validated, we return a GEM
>> BO containing the shader, which doesn't allow mapping for write or
>> exporting to other subsystems.
>>
>> Signed-off-by: Eric Anholt <eric@anholt.net>
>> ---
>>  drivers/gpu/drm/vc4/Makefile               |   3 +-
>>  drivers/gpu/drm/vc4/vc4_bo.c               | 140 ++++++++
>>  drivers/gpu/drm/vc4/vc4_drv.c              |   9 +-
>>  drivers/gpu/drm/vc4/vc4_drv.h              |  50 +++
>>  drivers/gpu/drm/vc4/vc4_qpu_defines.h      | 264 +++++++++++++++
>
> May I suggest that one 'exports' the header to something like
> libdrm_vc4 ? There are patches in flight for nouveau which in similar
> fashion, allow userspare to reuse existing api/abi.

I'll be putting this header in libdrm for use by Mesa and vc4-gpu-tools.
It will still have to live in the kernel as well, though.

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 818 bytes --]

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 4/9] drm/vc4: Add an API for creating GPU shaders in GEM BOs.
@ 2015-12-02 19:24       ` Eric Anholt
  0 siblings, 0 replies; 53+ messages in thread
From: Eric Anholt @ 2015-12-02 19:24 UTC (permalink / raw)
  To: Emil Velikov; +Cc: Linux-Kernel@Vger. Kernel. Org, ML dri-devel


[-- Attachment #1.1: Type: text/plain, Size: 1275 bytes --]

Emil Velikov <emil.l.velikov@gmail.com> writes:

> Hi Eric,
>
> On 1 December 2015 at 20:35, Eric Anholt <eric@anholt.net> wrote:
>> Since we have no MMU, the kernel needs to validate that the submitted
>> shader code won't make any accesses to memory that the user doesn't
>> control, which involves banning some operations (general purpose DMA
>> writes), and tracking where we need to write out pointers for other
>> operations (texture sampling).  Once it's validated, we return a GEM
>> BO containing the shader, which doesn't allow mapping for write or
>> exporting to other subsystems.
>>
>> Signed-off-by: Eric Anholt <eric@anholt.net>
>> ---
>>  drivers/gpu/drm/vc4/Makefile               |   3 +-
>>  drivers/gpu/drm/vc4/vc4_bo.c               | 140 ++++++++
>>  drivers/gpu/drm/vc4/vc4_drv.c              |   9 +-
>>  drivers/gpu/drm/vc4/vc4_drv.h              |  50 +++
>>  drivers/gpu/drm/vc4/vc4_qpu_defines.h      | 264 +++++++++++++++
>
> May I suggest that one 'exports' the header to something like
> libdrm_vc4 ? There are patches in flight for nouveau which in similar
> fashion, allow userspare to reuse existing api/abi.

I'll be putting this header in libdrm for use by Mesa and vc4-gpu-tools.
It will still have to live in the kernel as well, though.

[-- Attachment #1.2: signature.asc --]
[-- Type: application/pgp-signature, Size: 818 bytes --]

[-- Attachment #2: Type: text/plain, Size: 159 bytes --]

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 7/9] drm/vc4: Add support for drawing 3D frames.
  2015-12-02 11:29     ` Emil Velikov
@ 2015-12-02 19:27       ` Eric Anholt
  -1 siblings, 0 replies; 53+ messages in thread
From: Eric Anholt @ 2015-12-02 19:27 UTC (permalink / raw)
  To: Emil Velikov; +Cc: ML dri-devel, Linux-Kernel@Vger. Kernel. Org

[-- Attachment #1: Type: text/plain, Size: 1700 bytes --]

Emil Velikov <emil.l.velikov@gmail.com> writes:

> Hi Eric,
>
> On 1 December 2015 at 20:35, Eric Anholt <eric@anholt.net> wrote:
>> The user submission is basically a pointer to a command list and a
>> pointer to uniforms.  We copy those in to the kernel, validate and
>> relocate them, and store the result in a GPU BO which we queue for
>> execution.
>>
>> Signed-off-by: Eric Anholt <eric@anholt.net>
>> ---
>>  drivers/gpu/drm/vc4/Makefile           |   7 +
>>  drivers/gpu/drm/vc4/vc4_drv.c          |  15 +-
>>  drivers/gpu/drm/vc4/vc4_drv.h          | 192 +++++++
>>  drivers/gpu/drm/vc4/vc4_gem.c          | 640 ++++++++++++++++++++++
>>  drivers/gpu/drm/vc4/vc4_irq.c          | 210 ++++++++
>>  drivers/gpu/drm/vc4/vc4_packet.h       | 399 ++++++++++++++
>>  drivers/gpu/drm/vc4/vc4_render_cl.c    | 631 ++++++++++++++++++++++
>>  drivers/gpu/drm/vc4/vc4_trace.h        |  63 +++
>>  drivers/gpu/drm/vc4/vc4_trace_points.c |  14 +
>>  drivers/gpu/drm/vc4/vc4_v3d.c          |  37 ++
>>  drivers/gpu/drm/vc4/vc4_validate.c     | 955 +++++++++++++++++++++++++++++++++
>>  include/uapi/drm/vc4_drm.h             | 141 +++++
>>  12 files changed, 3303 insertions(+), 1 deletion(-)
>
> Am I assuming correct that the above are the exact same ones copied in
> mesa (and used only then simulator is used) ? In the short term
> keeping the two in sync would be fine, although wondering if you have
> plans to reshuffle things in the long term ?

This code is not exactly synced with Mesa, but I do sync between them on
a regular basis.  The in-Mesa copy is secondary, and just a way for me
to get testing done faster (in the simulation environment with gdb) than
I can in the kernel with printfs.

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 818 bytes --]

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 7/9] drm/vc4: Add support for drawing 3D frames.
@ 2015-12-02 19:27       ` Eric Anholt
  0 siblings, 0 replies; 53+ messages in thread
From: Eric Anholt @ 2015-12-02 19:27 UTC (permalink / raw)
  To: Emil Velikov; +Cc: Linux-Kernel@Vger. Kernel. Org, ML dri-devel


[-- Attachment #1.1: Type: text/plain, Size: 1700 bytes --]

Emil Velikov <emil.l.velikov@gmail.com> writes:

> Hi Eric,
>
> On 1 December 2015 at 20:35, Eric Anholt <eric@anholt.net> wrote:
>> The user submission is basically a pointer to a command list and a
>> pointer to uniforms.  We copy those in to the kernel, validate and
>> relocate them, and store the result in a GPU BO which we queue for
>> execution.
>>
>> Signed-off-by: Eric Anholt <eric@anholt.net>
>> ---
>>  drivers/gpu/drm/vc4/Makefile           |   7 +
>>  drivers/gpu/drm/vc4/vc4_drv.c          |  15 +-
>>  drivers/gpu/drm/vc4/vc4_drv.h          | 192 +++++++
>>  drivers/gpu/drm/vc4/vc4_gem.c          | 640 ++++++++++++++++++++++
>>  drivers/gpu/drm/vc4/vc4_irq.c          | 210 ++++++++
>>  drivers/gpu/drm/vc4/vc4_packet.h       | 399 ++++++++++++++
>>  drivers/gpu/drm/vc4/vc4_render_cl.c    | 631 ++++++++++++++++++++++
>>  drivers/gpu/drm/vc4/vc4_trace.h        |  63 +++
>>  drivers/gpu/drm/vc4/vc4_trace_points.c |  14 +
>>  drivers/gpu/drm/vc4/vc4_v3d.c          |  37 ++
>>  drivers/gpu/drm/vc4/vc4_validate.c     | 955 +++++++++++++++++++++++++++++++++
>>  include/uapi/drm/vc4_drm.h             | 141 +++++
>>  12 files changed, 3303 insertions(+), 1 deletion(-)
>
> Am I assuming correct that the above are the exact same ones copied in
> mesa (and used only then simulator is used) ? In the short term
> keeping the two in sync would be fine, although wondering if you have
> plans to reshuffle things in the long term ?

This code is not exactly synced with Mesa, but I do sync between them on
a regular basis.  The in-Mesa copy is secondary, and just a way for me
to get testing done faster (in the simulation environment with gdb) than
I can in the kernel with printfs.

[-- Attachment #1.2: signature.asc --]
[-- Type: application/pgp-signature, Size: 818 bytes --]

[-- Attachment #2: Type: text/plain, Size: 159 bytes --]

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 9/9] drm/vc4: Add an interface for capturing the GPU state after a hang.
  2015-12-02 11:45     ` Emil Velikov
@ 2015-12-02 19:35       ` Eric Anholt
  -1 siblings, 0 replies; 53+ messages in thread
From: Eric Anholt @ 2015-12-02 19:35 UTC (permalink / raw)
  To: Emil Velikov; +Cc: ML dri-devel, Linux-Kernel@Vger. Kernel. Org

[-- Attachment #1: Type: text/plain, Size: 1529 bytes --]

Emil Velikov <emil.l.velikov@gmail.com> writes:

> On 1 December 2015 at 20:35, Eric Anholt <eric@anholt.net> wrote:
>> This can be parsed with vc4-gpu-tools tools for trying to figure out
>> what was going on.
>>
> I might be pushing my luck here ... have you thought about basing
> (forking) vc4-gpu-tools of intel-gpu-tools ? I'd imagine that the
> macros and helpers will come in handy, despite that some are quite
> intel specific.
>
> On a related note - with the above project in place I'd imagine you
> have (re)considered about having libdrm-vc4 ? Copying hunks around
> might lead to interesting moments (as you have already noticed :-P)
>
> On that note I'll stop now with beating the libdrm drum :-)

The headers and code that I copy between the two userspace locations
will go in libdrm when I have a kernel ABI, but vc4_drm.h can't go in
until merging to the kernel, and there's not a whole lot of point
without that.

Yes, I have thought about basing vc4-gpu-tools off of intel-gpu-tools.
I've actually tried to build and use the kms testing stuff on vc4, and
it was a total bust.  Someone needs to do a lot of work to make igt
useful for non-intel.  If you'd like me to build my vc4 testing inside
of igt, I'd need someone to demo one of my tests building inside of igt, with
the test runner working and none of the intel-specific tests reporting
failure, and get me permission to just push code to that repository
(it's hard enough getting piglit tests reviewed; vc4-specific tests and
tools would never get review).

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 818 bytes --]

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 9/9] drm/vc4: Add an interface for capturing the GPU state after a hang.
@ 2015-12-02 19:35       ` Eric Anholt
  0 siblings, 0 replies; 53+ messages in thread
From: Eric Anholt @ 2015-12-02 19:35 UTC (permalink / raw)
  To: Emil Velikov; +Cc: Linux-Kernel@Vger. Kernel. Org, ML dri-devel


[-- Attachment #1.1: Type: text/plain, Size: 1529 bytes --]

Emil Velikov <emil.l.velikov@gmail.com> writes:

> On 1 December 2015 at 20:35, Eric Anholt <eric@anholt.net> wrote:
>> This can be parsed with vc4-gpu-tools tools for trying to figure out
>> what was going on.
>>
> I might be pushing my luck here ... have you thought about basing
> (forking) vc4-gpu-tools of intel-gpu-tools ? I'd imagine that the
> macros and helpers will come in handy, despite that some are quite
> intel specific.
>
> On a related note - with the above project in place I'd imagine you
> have (re)considered about having libdrm-vc4 ? Copying hunks around
> might lead to interesting moments (as you have already noticed :-P)
>
> On that note I'll stop now with beating the libdrm drum :-)

The headers and code that I copy between the two userspace locations
will go in libdrm when I have a kernel ABI, but vc4_drm.h can't go in
until merging to the kernel, and there's not a whole lot of point
without that.

Yes, I have thought about basing vc4-gpu-tools off of intel-gpu-tools.
I've actually tried to build and use the kms testing stuff on vc4, and
it was a total bust.  Someone needs to do a lot of work to make igt
useful for non-intel.  If you'd like me to build my vc4 testing inside
of igt, I'd need someone to demo one of my tests building inside of igt, with
the test runner working and none of the intel-specific tests reporting
failure, and get me permission to just push code to that repository
(it's hard enough getting piglit tests reviewed; vc4-specific tests and
tools would never get review).

[-- Attachment #1.2: signature.asc --]
[-- Type: application/pgp-signature, Size: 818 bytes --]

[-- Attachment #2: Type: text/plain, Size: 159 bytes --]

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 9/9] drm/vc4: Add an interface for capturing the GPU state after a hang.
  2015-12-02 19:35       ` Eric Anholt
@ 2015-12-02 22:26         ` Daniel Vetter
  -1 siblings, 0 replies; 53+ messages in thread
From: Daniel Vetter @ 2015-12-02 22:26 UTC (permalink / raw)
  To: Eric Anholt; +Cc: Emil Velikov, Linux-Kernel@Vger. Kernel. Org, ML dri-devel

On Wed, Dec 02, 2015 at 11:35:16AM -0800, Eric Anholt wrote:
> Emil Velikov <emil.l.velikov@gmail.com> writes:
> 
> > On 1 December 2015 at 20:35, Eric Anholt <eric@anholt.net> wrote:
> >> This can be parsed with vc4-gpu-tools tools for trying to figure out
> >> what was going on.
> >>
> > I might be pushing my luck here ... have you thought about basing
> > (forking) vc4-gpu-tools of intel-gpu-tools ? I'd imagine that the
> > macros and helpers will come in handy, despite that some are quite
> > intel specific.
> >
> > On a related note - with the above project in place I'd imagine you
> > have (re)considered about having libdrm-vc4 ? Copying hunks around
> > might lead to interesting moments (as you have already noticed :-P)
> >
> > On that note I'll stop now with beating the libdrm drum :-)
> 
> The headers and code that I copy between the two userspace locations
> will go in libdrm when I have a kernel ABI, but vc4_drm.h can't go in
> until merging to the kernel, and there's not a whole lot of point
> without that.
> 
> Yes, I have thought about basing vc4-gpu-tools off of intel-gpu-tools.
> I've actually tried to build and use the kms testing stuff on vc4, and
> it was a total bust.  Someone needs to do a lot of work to make igt
> useful for non-intel.  If you'd like me to build my vc4 testing inside
> of igt, I'd someone to demo one of my tests building inside of igt, with
> the test runner working and none of the intel-specific tests reporting
> failure, and get me permission to just push code to that repository
> (It's hard enough getting piglit tests reviewed, vc4-specific tests and
> tools would never get review).

Daniel Stone claimed that this Just Works but evidently it doesn't.
There's some autoconfig fail where igt wants too much intel crap that just
doesn't build on arm. Iirc Daniel had some patches floating around for
that.

Wrt building tests itself I think we should require libdrm_$vendor
everywhere, since without that you can't embed a vendor-specific testcase
into a shared one (e.g. to test tiled pageflipping). And if that's not
possible then either we'll end up in #ifdef hell or code sharing is really
hard, rendering a shared testsuite pretty useless.

Wrt commit access I'm ok with that; atm the rule for igt is to send to
intel-gfx and push after 24h of no complaints. For purely vc4-specific
stuff we should probably create a vc4 subdir.

Wrt tests failing: I sometimes run the entire suite here on a radeon, and
at least in the past that resulted in a perfect 100% skip score. So should
work, too.

Cheers, Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 9/9] drm/vc4: Add an interface for capturing the GPU state after a hang.
@ 2015-12-02 22:26         ` Daniel Vetter
  0 siblings, 0 replies; 53+ messages in thread
From: Daniel Vetter @ 2015-12-02 22:26 UTC (permalink / raw)
  To: Eric Anholt; +Cc: Emil Velikov, Linux-Kernel@Vger. Kernel. Org, ML dri-devel

On Wed, Dec 02, 2015 at 11:35:16AM -0800, Eric Anholt wrote:
> Emil Velikov <emil.l.velikov@gmail.com> writes:
> 
> > On 1 December 2015 at 20:35, Eric Anholt <eric@anholt.net> wrote:
> >> This can be parsed with vc4-gpu-tools tools for trying to figure out
> >> what was going on.
> >>
> > I might be pushing my luck here ... have you thought about basing
> > (forking) vc4-gpu-tools of intel-gpu-tools ? I'd imagine that the
> > macros and helpers will come in handy, despite that some are quite
> > intel specific.
> >
> > On a related note - with the above project in place I'd imagine you
> > have (re)considered about having libdrm-vc4 ? Copying hunks around
> > might lead to interesting moments (as you have already noticed :-P)
> >
> > On that note I'll stop now with beating the libdrm drum :-)
> 
> The headers and code that I copy between the two userspace locations
> will go in libdrm when I have a kernel ABI, but vc4_drm.h can't go in
> until merging to the kernel, and there's not a whole lot of point
> without that.
> 
> Yes, I have thought about basing vc4-gpu-tools off of intel-gpu-tools.
> I've actually tried to build and use the kms testing stuff on vc4, and
> it was a total bust.  Someone needs to do a lot of work to make igt
> useful for non-intel.  If you'd like me to build my vc4 testing inside
> of igt, I'd someone to demo one of my tests building inside of igt, with
> the test runner working and none of the intel-specific tests reporting
> failure, and get me permission to just push code to that repository
> (It's hard enough getting piglit tests reviewed, vc4-specific tests and
> tools would never get review).

Daniel Stone claimed that this Just Works but evidently it doesn't.
There's some autoconfig fail where igt wants too much intel crap that just
doesn't build on arm. Iirc Daniel had some patches floating around for
that.

Wrt building tests itself I think we should require libdrm_$vendor
everywhere, since without that you can't embed a vendor-specific testcase
into a shared one (e.g. to test tiled pageflipping). And if that's not
possible then either we'll end up in #ifdef hell or code sharing is really
hard, rendering a shared testsuite pretty useless.

Wrt commit access I'm ok with that; atm the rule for igt is to send to
intel-gfx and push after 24h of no complaints. For purely vc4-specific
stuff we should probably create a vc4 subdir.

Wrt tests failing: I sometimes run the entire suite here on a radeon, and
at least in the past that resulted in a perfect 100% skip score. So should
work, too.

Cheers, Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 9/9] drm/vc4: Add an interface for capturing the GPU state after a hang.
  2015-12-02 22:26         ` Daniel Vetter
  (?)
@ 2015-12-02 22:58         ` Daniel Stone
  2015-12-03 11:55             ` Emil Velikov
  -1 siblings, 1 reply; 53+ messages in thread
From: Daniel Stone @ 2015-12-02 22:58 UTC (permalink / raw)
  To: Eric Anholt, Emil Velikov, Linux-Kernel@Vger. Kernel. Org, ML dri-devel

Hey,

On 2 December 2015 at 22:26, Daniel Vetter <daniel@ffwll.ch> wrote:
> On Wed, Dec 02, 2015 at 11:35:16AM -0800, Eric Anholt wrote:
>> Yes, I have thought about basing vc4-gpu-tools off of intel-gpu-tools.
>> I've actually tried to build and use the kms testing stuff on vc4, and
>> it was a total bust.  Someone needs to do a lot of work to make igt
>> useful for non-intel.  If you'd like me to build my vc4 testing inside
>> of igt, I'd someone to demo one of my tests building inside of igt, with
>> the test runner working and none of the intel-specific tests reporting
>> failure, and get me permission to just push code to that repository
>> (It's hard enough getting piglit tests reviewed, vc4-specific tests and
>> tools would never get review).
>
> Daniel Stone claimed that this Just Works but evidently it doesn't.
> There's some autoconfig fail where igt wants too much intel crap that just
> doesn't build on arm. Iirc Daniel had some patches floating around for
> that.

Yeah, it was working, though with my ARM farm still being in pieces, I
haven't been able to keep on top of it lately. Apparently the patch to
disable the ancillary tools fixes the build, so I'll get that pushed
when I can actually test it, or in the meantime:
http://paste.fedoraproject.org/296836/09692714

This does still require libpciaccess and libdrm-intel to be built, but
they _are_ totally possible to build on ARM, without any stupid hacks.
My first cut at getting igt running on ARM (before Micah took over)
actually started by eviscerating those as well, but it ended up being
too much of a yak-shave.

If you can get past those, you get a skip for all but the core tests.
For the rest, they either need porting to just use dumb-buffers, or
split such that the bits exercising Intel-specific tiling modes can be
run separately, or left as Intel only.

Cheers,
Daniel

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 9/9] drm/vc4: Add an interface for capturing the GPU state after a hang.
  2015-12-02 19:35       ` Eric Anholt
@ 2015-12-03 11:46         ` Emil Velikov
  -1 siblings, 0 replies; 53+ messages in thread
From: Emil Velikov @ 2015-12-03 11:46 UTC (permalink / raw)
  To: Eric Anholt; +Cc: ML dri-devel, Linux-Kernel@Vger. Kernel. Org

On 2 December 2015 at 19:35, Eric Anholt <eric@anholt.net> wrote:
> Emil Velikov <emil.l.velikov@gmail.com> writes:
>
>> On 1 December 2015 at 20:35, Eric Anholt <eric@anholt.net> wrote:
>>> This can be parsed with vc4-gpu-tools tools for trying to figure out
>>> what was going on.
>>>
>> I might be pushing my luck here ... have you thought about basing
>> (forking) vc4-gpu-tools of intel-gpu-tools ? I'd imagine that the
>> macros and helpers will come in handy, despite that some are quite
>> intel specific.
>>
>> On a related note - with the above project in place I'd imagine you
>> have (re)considered about having libdrm-vc4 ? Copying hunks around
>> might lead to interesting moments (as you have already noticed :-P)
>>
>> On that note I'll stop now with beating the libdrm drum :-)
>
> The headers and code that I copy between the two userspace locations
> will go in libdrm when I have a kernel ABI, but vc4_drm.h can't go in
> until merging to the kernel, and there's not a whole lot of point
> without that.
>
Definitely - I wasn't suggesting getting things into libdrm before
the kernel. On the ABI front you might want to follow the
nouveau/freedreno/omap approach by using a (disabled by default)
experimental-vc4 switch and keeping both vc4_drm.h and vc4_qpu_defines.h
(and others?) in a separate libdrm-vc4 package. As things stabilise,
vc4_drm.h can be moved to the core libdrm package.

> Yes, I have thought about basing vc4-gpu-tools off of intel-gpu-tools.
> I've actually tried to build and use the kms testing stuff on vc4, and
> it was a total bust.  Someone needs to do a lot of work to make igt
> useful for non-intel.  If you'd like me to build my vc4 testing inside
> of igt, I'd someone to demo one of my tests building inside of igt, with
> the test runner working and none of the intel-specific tests reporting
> failure, and get me permission to just push code to that repository
> (It's hard enough getting piglit tests reviewed, vc4-specific tests and
> tools would never get review).

As Dan and Dan covered most of the concerns, can you elaborate which
of the following (and others?) are show stoppers:

 - libpciaccess requirement
 - libdrm-intel requirement
 - other misc requirements (xv x11 xext dri2proto cairo-xlib)
 - no clean "pass" when executed on non intel hardware

Thanks
Emil

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 9/9] drm/vc4: Add an interface for capturing the GPU state after a hang.
@ 2015-12-03 11:46         ` Emil Velikov
  0 siblings, 0 replies; 53+ messages in thread
From: Emil Velikov @ 2015-12-03 11:46 UTC (permalink / raw)
  To: Eric Anholt; +Cc: Linux-Kernel@Vger. Kernel. Org, ML dri-devel

On 2 December 2015 at 19:35, Eric Anholt <eric@anholt.net> wrote:
> Emil Velikov <emil.l.velikov@gmail.com> writes:
>
>> On 1 December 2015 at 20:35, Eric Anholt <eric@anholt.net> wrote:
>>> This can be parsed with vc4-gpu-tools tools for trying to figure out
>>> what was going on.
>>>
>> I might be pushing my luck here ... have you thought about basing
>> (forking) vc4-gpu-tools of intel-gpu-tools ? I'd imagine that the
>> macros and helpers will come in handy, despite that some are quite
>> intel specific.
>>
>> On a related note - with the above project in place I'd imagine you
>> have (re)considered about having libdrm-vc4 ? Copying hunks around
>> might lead to interesting moments (as you have already noticed :-P)
>>
>> On that note I'll stop now with beating the libdrm drum :-)
>
> The headers and code that I copy between the two userspace locations
> will go in libdrm when I have a kernel ABI, but vc4_drm.h can't go in
> until merging to the kernel, and there's not a whole lot of point
> without that.
>
Definitely - I wasn't suggesting getting things into libdrm before
the kernel. On the ABI front you might want to follow the
nouveau/freedreno/omap approach by using a (disabled by default)
experimental-vc4 switch and keeping both vc4_drm.h and vc4_qpu_defines.h
(and others?) in a separate libdrm-vc4 package. As things stabilise,
vc4_drm.h can be moved to the core libdrm package.

> Yes, I have thought about basing vc4-gpu-tools off of intel-gpu-tools.
> I've actually tried to build and use the kms testing stuff on vc4, and
> it was a total bust.  Someone needs to do a lot of work to make igt
> useful for non-intel.  If you'd like me to build my vc4 testing inside
> of igt, I'd someone to demo one of my tests building inside of igt, with
> the test runner working and none of the intel-specific tests reporting
> failure, and get me permission to just push code to that repository
> (It's hard enough getting piglit tests reviewed, vc4-specific tests and
> tools would never get review).

As Dan and Dan covered most of the concerns, can you elaborate which
of the following (and others?) are show stoppers:

 - libpciaccess requirement
 - libdrm-intel requirement
 - other misc requirements (xv x11 xext dri2proto cairo-xlib)
 - no clean "pass" when executed on non intel hardware

Thanks
Emil
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 9/9] drm/vc4: Add an interface for capturing the GPU state after a hang.
  2015-12-02 22:58         ` Daniel Stone
@ 2015-12-03 11:55             ` Emil Velikov
  0 siblings, 0 replies; 53+ messages in thread
From: Emil Velikov @ 2015-12-03 11:55 UTC (permalink / raw)
  To: Daniel Stone; +Cc: Eric Anholt, Linux-Kernel@Vger. Kernel. Org, ML dri-devel

Hi Dan,

On 2 December 2015 at 22:58, Daniel Stone <daniel@fooishbar.org> wrote:
> Hey,
>
> On 2 December 2015 at 22:26, Daniel Vetter <daniel@ffwll.ch> wrote:
>> On Wed, Dec 02, 2015 at 11:35:16AM -0800, Eric Anholt wrote:
>>> Yes, I have thought about basing vc4-gpu-tools off of intel-gpu-tools.
>>> I've actually tried to build and use the kms testing stuff on vc4, and
>>> it was a total bust.  Someone needs to do a lot of work to make igt
>>> useful for non-intel.  If you'd like me to build my vc4 testing inside
>>> of igt, I'd someone to demo one of my tests building inside of igt, with
>>> the test runner working and none of the intel-specific tests reporting
>>> failure, and get me permission to just push code to that repository
>>> (It's hard enough getting piglit tests reviewed, vc4-specific tests and
>>> tools would never get review).
>>
>> Daniel Stone claimed that this Just Works but evidently it doesn't.
>> There's some autoconfig fail where igt wants too much intel crap that just
>> doesn't build on arm. Iirc Daniel had some patches floating around for
>> that.
>
> Yeah, it was working, though with my ARM farm still being in pieces, I
> haven't been able to keep on top of it lately. Apparently the patch to
> disable the ancilliary tools fixes the build, so I'll get that pushed
> when I can actually test it, or for the meantime:
> http://paste.fedoraproject.org/296836/09692714
>
Afaict pretty much all of the binaries "disabled" with this patch are
intel specific. As such, please add a configure switch (enabled by
default) rather than the current build_x86 heuristic, otherwise things
are bound to get very ugly pretty quickly.

> This does still require libpciaccess and libdrm-intel to be built, but
> they _are_ totally possible to build on ARM, without any stupid hacks.
> My first cut at getting igt running on ARM (before Micah took over)
> actually started by eviscerating those as well, but it ended up being
> too much of a yak-shave.
>
The libpciaccess requirement will be a pain, although libdrm-intel
should be relatively easy. I doubt people will have major issues if the former
requirement stays forever.

Thanks
Emil

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 8/9] drm/vc4: Add support for async pageflips.
  2015-12-02 14:05     ` Daniel Stone
@ 2015-12-04  1:42       ` Eric Anholt
  -1 siblings, 0 replies; 53+ messages in thread
From: Eric Anholt @ 2015-12-04  1:42 UTC (permalink / raw)
  To: Daniel Stone; +Cc: dri-devel, Linux Kernel Mailing List

[-- Attachment #1: Type: text/plain, Size: 693 bytes --]

Daniel Stone <daniel@fooishbar.org> writes:

> Hi,
>
> On 1 December 2015 at 20:35, Eric Anholt <eric@anholt.net> wrote:
>> An async pageflip stores the modeset to be done and executes it once
>> the BOs are ready to be displayed.  This gets us about 3x performance
>> in full screen rendering with pageflipping.
>
> Looks good, but you're missing a preclose callback to reap dead events, a la:
> https://git.collabora.com/cgit/user/daniels/linux.git/commit/?h=wip/4.4.x/rockchip-drm-fixes&id=d14f21bcd7e7a1b9ca129c411a9da9c911037965
>
> (and parent commit to wire that through to core DRM preclose)

We've already got a preclose callback that reaps crtc->event -- see
vc4_cancel_page_flip().
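
For readers skimming the thread, the reaping pattern being referred to
is roughly the sketch below: on preclose, a flip event still owned by
the closing file is destroyed under event_lock so the core never tries
to deliver it to a dead file.  This is a reconstruction from the
discussion against 4.4-era DRM internals, not a quote of the actual
driver; the struct layout and helper names are assumptions.

struct vc4_crtc {
	struct drm_crtc base;
	/* Event to deliver once the pending flip completes. */
	struct drm_pending_vblank_event *event;
};

#define to_vc4_crtc(c) container_of(c, struct vc4_crtc, base)

static void vc4_cancel_page_flip(struct drm_crtc *crtc, struct drm_file *file)
{
	struct vc4_crtc *vc4_crtc = to_vc4_crtc(crtc);
	struct drm_device *dev = crtc->dev;
	unsigned long flags;

	spin_lock_irqsave(&dev->event_lock, flags);
	/* Only reap the event if it belongs to the file being closed. */
	if (vc4_crtc->event && vc4_crtc->event->base.file_priv == file) {
		vc4_crtc->event->base.destroy(&vc4_crtc->event->base);
		vc4_crtc->event = NULL;
	}
	spin_unlock_irqrestore(&dev->event_lock, flags);
}

/* Wired up as the drm_driver preclose hook: cancel on every CRTC. */
static void vc4_drm_preclose(struct drm_device *dev, struct drm_file *file)
{
	struct drm_crtc *crtc;

	list_for_each_entry(crtc, &dev->mode_config.crtc_list, head)
		vc4_cancel_page_flip(crtc, file);
}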

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 818 bytes --]

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 3/9] drm/vc4: Add create and map BO ioctls.
  2015-12-01 20:35   ` Eric Anholt
@ 2015-12-04 14:29     ` Emil Velikov
  -1 siblings, 0 replies; 53+ messages in thread
From: Emil Velikov @ 2015-12-04 14:29 UTC (permalink / raw)
  To: Eric Anholt; +Cc: ML dri-devel, Linux-Kernel@Vger. Kernel. Org

Hi Eric,

A couple of suggestions/requests on the UAPI header side:

On 1 December 2015 at 20:35, Eric Anholt <eric@anholt.net> wrote:

> --- /dev/null
> +++ b/include/uapi/drm/vc4_drm.h

> +#include <drm/drm.h>
> +
Can we make this a "drm.h"?


> +struct drm_vc4_create_bo {
> +       uint32_t size;
> +       uint32_t flags;
> +       /** Returned GEM handle for the BO. */
> +       uint32_t handle;
> +       uint32_t pad;
... and use the __{s,u}* types rather than the stdint.h ones
throughout the file.

I'd rather not nag about why we want/need those, but if you prefer I
can repeat myself/others.

Thanks
Emil

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 8/9] drm/vc4: Add support for async pageflips.
  2015-12-04  1:42       ` Eric Anholt
@ 2015-12-07 10:13         ` Daniel Stone
  -1 siblings, 0 replies; 53+ messages in thread
From: Daniel Stone @ 2015-12-07 10:13 UTC (permalink / raw)
  To: Eric Anholt; +Cc: dri-devel, Linux Kernel Mailing List

Hi,

On 4 December 2015 at 01:42, Eric Anholt <eric@anholt.net> wrote:
> Daniel Stone <daniel@fooishbar.org> writes:
>> On 1 December 2015 at 20:35, Eric Anholt <eric@anholt.net> wrote:
>>> An async pageflip stores the modeset to be done and executes it once
>>> the BOs are ready to be displayed.  This gets us about 3x performance
>>> in full screen rendering with pageflipping.
>>
>> Looks good, but you're missing a preclose callback to reap dead events, a la:
>> https://git.collabora.com/cgit/user/daniels/linux.git/commit/?h=wip/4.4.x/rockchip-drm-fixes&id=d14f21bcd7e7a1b9ca129c411a9da9c911037965
>>
>> (and parent commit to wire that through to core DRM preclose)
>
> We've already got a preclose callback that reaps crtc->event -- see
> vc4_cancel_page_flip().

Right you are. Carry on then. :)

Cheers,
Daniel

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 3/9] drm/vc4: Add create and map BO ioctls.
  2015-12-04 14:29     ` Emil Velikov
@ 2015-12-08  1:16       ` Eric Anholt
  -1 siblings, 0 replies; 53+ messages in thread
From: Eric Anholt @ 2015-12-08  1:16 UTC (permalink / raw)
  To: Emil Velikov; +Cc: ML dri-devel, Linux-Kernel@Vger. Kernel. Org

[-- Attachment #1: Type: text/plain, Size: 833 bytes --]

Emil Velikov <emil.l.velikov@gmail.com> writes:

> Hi Eric,
>
> A couple of suggestions/requests on the UAPI header side
>
> On 1 December 2015 at 20:35, Eric Anholt <eric@anholt.net> wrote:
>
>> --- /dev/null
>> +++ b/include/uapi/drm/vc4_drm.h
>
>> +#include <drm/drm.h>
>> +
> Can we make this a "drm.h" ?

Nope.

include/uapi/drm/vc4_drm.h:27:17: fatal error: drm.h: No such file or directory

and none of the drivers do, either.

>> +struct drm_vc4_create_bo {
>> +       uint32_t size;
>> +       uint32_t flags;
>> +       /** Returned GEM handle for the BO. */
>> +       uint32_t handle;
>> +       uint32_t pad;
> ... and not use the stdint.h types but __{s,u}* through the file.
>
> I'd rather than not nag why we want/need those, but if you prefer I
> can repeat myself/others.

I've switched to using the special types.
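
As a concrete illustration of the requested change, the struct would
end up looking something like this sketch (the __-prefixed fixed-width
types come from <linux/types.h> and are safe to expose to userspace,
unlike the stdint.h names, which a UAPI header cannot assume):

#include <linux/types.h>

struct drm_vc4_create_bo {
	__u32 size;
	__u32 flags;
	/** Returned GEM handle for the BO. */
	__u32 handle;
	__u32 pad;
};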

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 818 bytes --]

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 3/9] drm/vc4: Add create and map BO ioctls.
  2015-12-08  1:16       ` Eric Anholt
@ 2015-12-08  2:56         ` Michel Dänzer
  -1 siblings, 0 replies; 53+ messages in thread
From: Michel Dänzer @ 2015-12-08  2:56 UTC (permalink / raw)
  To: Eric Anholt, Emil Velikov; +Cc: Linux-Kernel@Vger. Kernel. Org, ML dri-devel

On 08.12.2015 10:16, Eric Anholt wrote:
> Emil Velikov <emil.l.velikov@gmail.com> writes:
> 
>> Hi Eric,
>>
>> A couple of suggestions/requests on the UAPI header side
>>
>> On 1 December 2015 at 20:35, Eric Anholt <eric@anholt.net> wrote:
>>
>>> --- /dev/null
>>> +++ b/include/uapi/drm/vc4_drm.h
>>
>>> +#include <drm/drm.h>
>>> +
>> Can we make this a "drm.h" ?
> 
> Nope.
> 
> include/uapi/drm/vc4_drm.h:27:17: fatal error: drm.h: No such file or directory

What happened to include/uapi/drm/drm.h in that tree?


> and none of the drivers do, either.

daenzer@kaveri|11:55:31> grep '#include "drm.h"' include/uapi/drm/*
include/uapi/drm/amdgpu_drm.h:#include "drm.h"
include/uapi/drm/radeon_drm.h:#include "drm.h"

Patches to convert the rest are pending.


-- 
Earthling Michel Dänzer               |               http://www.amd.com
Libre software enthusiast             |             Mesa and X developer

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 3/9] drm/vc4: Add create and map BO ioctls.
  2015-12-08  2:56         ` Michel Dänzer
@ 2015-12-08  4:05           ` Eric Anholt
  -1 siblings, 0 replies; 53+ messages in thread
From: Eric Anholt @ 2015-12-08  4:05 UTC (permalink / raw)
  To: Michel Dänzer, Emil Velikov
  Cc: Linux-Kernel@Vger. Kernel. Org, ML dri-devel

[-- Attachment #1: Type: text/plain, Size: 696 bytes --]

Michel Dänzer <michel@daenzer.net> writes:

> On 08.12.2015 10:16, Eric Anholt wrote:
>> Emil Velikov <emil.l.velikov@gmail.com> writes:
>> 
>>> Hi Eric,
>>>
>>> A couple of suggestions/requests on the UAPI header side
>>>
>>> On 1 December 2015 at 20:35, Eric Anholt <eric@anholt.net> wrote:
>>>
>>>> --- /dev/null
>>>> +++ b/include/uapi/drm/vc4_drm.h
>>>
>>>> +#include <drm/drm.h>
>>>> +
>>> Can we make this a "drm.h" ?
>> 
>> Nope.
>> 
>> include/uapi/drm/vc4_drm.h:27:17: fatal error: drm.h: No such file or directory
>
> What happened to include/uapi/drm/drm.h in that tree?

Looks like it was just a <drm.h> versus "drm.h" failure.  I've changed
over to "drm.h".


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 818 bytes --]

^ permalink raw reply	[flat|nested] 53+ messages in thread

end of thread

Thread overview: 53+ messages
2015-12-01 20:35 [PATCH 1/9] drm: Create a driver hook for allocating GEM object structs Eric Anholt
2015-12-01 20:35 ` [PATCH 2/9] drm/vc4: Add a BO cache Eric Anholt
2015-12-01 20:35 ` [PATCH 3/9] drm/vc4: Add create and map BO ioctls Eric Anholt
2015-12-04 14:29   ` Emil Velikov
2015-12-08  1:16     ` Eric Anholt
2015-12-08  2:56       ` Michel Dänzer
2015-12-08  4:05         ` Eric Anholt
2015-12-01 20:35 ` [PATCH 4/9] drm/vc4: Add an API for creating GPU shaders in GEM BOs Eric Anholt
2015-12-02 11:35   ` Emil Velikov
2015-12-02 19:24     ` Eric Anholt
2015-12-01 20:35 ` [PATCH 5/9] drm/vc4: Fix a typo in a V3D debug register Eric Anholt
2015-12-01 20:35 ` [PATCH 6/9] drm/vc4: Bind and initialize the V3D engine Eric Anholt
2015-12-01 20:35 ` [PATCH 7/9] drm/vc4: Add support for drawing 3D frames Eric Anholt
2015-12-02 11:29   ` Emil Velikov
2015-12-02 19:27     ` Eric Anholt
2015-12-01 20:35 ` [PATCH 8/9] drm/vc4: Add support for async pageflips Eric Anholt
2015-12-02 14:05   ` Daniel Stone
2015-12-04  1:42     ` Eric Anholt
2015-12-07 10:13       ` Daniel Stone
2015-12-01 20:35 ` [PATCH 9/9] drm/vc4: Add an interface for capturing the GPU state after a hang Eric Anholt
2015-12-02 11:45   ` Emil Velikov
2015-12-02 19:35     ` Eric Anholt
2015-12-02 22:26       ` Daniel Vetter
2015-12-02 22:58         ` Daniel Stone
2015-12-03 11:55           ` Emil Velikov
2015-12-03 11:46       ` Emil Velikov
2015-12-02  8:58 ` [PATCH 1/9] drm: Create a driver hook for allocating GEM object structs Daniel Vetter
