* [PATCH 1/4] drm: rename {drm_clflush_sg, drm_clflush_pages}
@ 2018-01-17  0:35 Gurchetan Singh
  2018-01-17  0:35 ` [PATCH 2/4] drm: add additional parameter in drm_flush_pages() and drm_flush_sg() Gurchetan Singh
       [not found] ` <20180117003559.67837-1-gurchetansingh-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org>
  0 siblings, 2 replies; 19+ messages in thread
From: Gurchetan Singh @ 2018-01-17  0:35 UTC (permalink / raw)
  To: dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: thierry.reding-Re5JQEeQqe8AvxtiuMwx3w,
	heiko-4mtYJXux2i+zQB+pC5nmwQ,
	daniel.vetter-ral2JQCrhuEAvxtiuMwx3w,
	chris-Y6uKTt2uX1cEflXRtASbqLVCufUGDwFn,
	linux-tegra-u79uwXL29TY76Z2rM5mHXA, Gurchetan Singh

Since clflush is an x86-only instruction, these function names won't
make much sense if we start adding cases for other architectures.

Signed-off-by: Gurchetan Singh <gurchetansingh-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org>
---
 drivers/gpu/drm/drm_cache.c                 | 12 ++++++------
 drivers/gpu/drm/i915/i915_gem.c             |  2 +-
 drivers/gpu/drm/i915/i915_gem_clflush.c     |  2 +-
 drivers/gpu/drm/rockchip/rockchip_drm_gem.c |  2 +-
 drivers/gpu/drm/ttm/ttm_tt.c                |  2 +-
 drivers/gpu/drm/vgem/vgem_drv.c             |  2 +-
 include/drm/drm_cache.h                     |  4 ++--
 7 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c
index 3bd76e918b5d..89cdd32fe1f3 100644
--- a/drivers/gpu/drm/drm_cache.c
+++ b/drivers/gpu/drm/drm_cache.c
@@ -70,7 +70,7 @@ static void drm_cache_flush_clflush(struct page *pages[],
 #endif
 
 /**
- * drm_clflush_pages - Flush dcache lines of a set of pages.
+ * drm_flush_pages - Flush dcache lines of a set of pages.
  * @pages: List of pages to be flushed.
  * @num_pages: Number of pages in the array.
  *
@@ -78,7 +78,7 @@ static void drm_cache_flush_clflush(struct page *pages[],
  * to a page in the array.
  */
 void
-drm_clflush_pages(struct page *pages[], unsigned long num_pages)
+drm_flush_pages(struct page *pages[], unsigned long num_pages)
 {
 
 #if defined(CONFIG_X86)
@@ -109,17 +109,17 @@ drm_clflush_pages(struct page *pages[], unsigned long num_pages)
 	WARN_ON_ONCE(1);
 #endif
 }
-EXPORT_SYMBOL(drm_clflush_pages);
+EXPORT_SYMBOL(drm_flush_pages);
 
 /**
- * drm_clflush_sg - Flush dcache lines pointing to a scather-gather.
+ * drm_flush_sg - Flush dcache lines pointing to a scather-gather.
  * @st: struct sg_table.
  *
  * Flush every data cache line entry that points to an address in the
  * sg.
  */
 void
-drm_clflush_sg(struct sg_table *st)
+drm_flush_sg(struct sg_table *st)
 {
 #if defined(CONFIG_X86)
 	if (static_cpu_has(X86_FEATURE_CLFLUSH)) {
@@ -140,7 +140,7 @@ drm_clflush_sg(struct sg_table *st)
 	WARN_ON_ONCE(1);
 #endif
 }
-EXPORT_SYMBOL(drm_clflush_sg);
+EXPORT_SYMBOL(drm_flush_sg);
 
 /**
  * drm_clflush_virt_range - Flush dcache lines of a region
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 61ba321e9970..fe191d0e84e1 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -259,7 +259,7 @@ __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
 	if (needs_clflush &&
 	    (obj->base.read_domains & I915_GEM_DOMAIN_CPU) == 0 &&
 	    !(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ))
-		drm_clflush_sg(pages);
+		drm_flush_sg(pages);
 
 	__start_cpu_write(obj);
 }
diff --git a/drivers/gpu/drm/i915/i915_gem_clflush.c b/drivers/gpu/drm/i915/i915_gem_clflush.c
index f663cd919795..f413c5e5735d 100644
--- a/drivers/gpu/drm/i915/i915_gem_clflush.c
+++ b/drivers/gpu/drm/i915/i915_gem_clflush.c
@@ -71,7 +71,7 @@ static const struct dma_fence_ops i915_clflush_ops = {
 static void __i915_do_clflush(struct drm_i915_gem_object *obj)
 {
 	GEM_BUG_ON(!i915_gem_object_has_pages(obj));
-	drm_clflush_sg(obj->mm.pages);
+	drm_flush_sg(obj->mm.pages);
 	intel_fb_obj_flush(obj, ORIGIN_CPU);
 }
 
diff --git a/drivers/gpu/drm/rockchip/rockchip_drm_gem.c b/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
index 1d9655576b6e..8ac7eb25e46d 100644
--- a/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
+++ b/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
@@ -100,7 +100,7 @@ static int rockchip_gem_get_pages(struct rockchip_gem_object *rk_obj)
 	 * Fake up the SG table so that dma_sync_sg_for_device() can be used
 	 * to flush the pages associated with it.
 	 *
-	 * TODO: Replace this by drm_clflush_sg() once it can be implemented
+	 * TODO: Replace this by drm_flush_sg() once it can be implemented
 	 * without relying on symbols that are not exported.
 	 */
 	for_each_sg(rk_obj->sgt->sgl, s, rk_obj->sgt->nents, i)
diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c
index 8ebc8d3560c3..59e272a58752 100644
--- a/drivers/gpu/drm/ttm/ttm_tt.c
+++ b/drivers/gpu/drm/ttm/ttm_tt.c
@@ -122,7 +122,7 @@ static int ttm_tt_set_caching(struct ttm_tt *ttm,
 	}
 
 	if (ttm->caching_state == tt_cached)
-		drm_clflush_pages(ttm->pages, ttm->num_pages);
+		drm_flush_pages(ttm->pages, ttm->num_pages);
 
 	for (i = 0; i < ttm->num_pages; ++i) {
 		cur_page = ttm->pages[i];
diff --git a/drivers/gpu/drm/vgem/vgem_drv.c b/drivers/gpu/drm/vgem/vgem_drv.c
index 2524ff116f00..802a97e1a4bf 100644
--- a/drivers/gpu/drm/vgem/vgem_drv.c
+++ b/drivers/gpu/drm/vgem/vgem_drv.c
@@ -325,7 +325,7 @@ static int vgem_prime_pin(struct drm_gem_object *obj)
 	/* Flush the object from the CPU cache so that importers can rely
 	 * on coherent indirect access via the exported dma-address.
 	 */
-	drm_clflush_pages(pages, n_pages);
+	drm_flush_pages(pages, n_pages);
 
 	return 0;
 }
diff --git a/include/drm/drm_cache.h b/include/drm/drm_cache.h
index beab0f0d0cfb..25c029470315 100644
--- a/include/drm/drm_cache.h
+++ b/include/drm/drm_cache.h
@@ -35,8 +35,8 @@
 
 #include <linux/scatterlist.h>
 
-void drm_clflush_pages(struct page *pages[], unsigned long num_pages);
-void drm_clflush_sg(struct sg_table *st);
+void drm_flush_pages(struct page *pages[], unsigned long num_pages);
+void drm_flush_sg(struct sg_table *st);
 void drm_clflush_virt_range(void *addr, unsigned long length);
 
 static inline bool drm_arch_can_wc_memory(void)
-- 
2.13.5


* [PATCH 2/4] drm: add additional parameter in drm_flush_pages() and drm_flush_sg()
  2018-01-17  0:35 [PATCH 1/4] drm: rename {drm_clflush_sg, drm_clflush_pages} Gurchetan Singh
@ 2018-01-17  0:35 ` Gurchetan Singh
       [not found] ` <20180117003559.67837-1-gurchetansingh-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org>
  1 sibling, 0 replies; 19+ messages in thread
From: Gurchetan Singh @ 2018-01-17  0:35 UTC (permalink / raw)
  To: dri-devel; +Cc: Gurchetan Singh, thierry.reding, linux-tegra, daniel.vetter

We've found the DMA API is effective for flushing the cache on ARM
devices, and it requires a struct device *.

Signed-off-by: Gurchetan Singh <gurchetansingh@chromium.org>
---
 drivers/gpu/drm/drm_cache.c             | 5 +++--
 drivers/gpu/drm/i915/i915_gem.c         | 2 +-
 drivers/gpu/drm/i915/i915_gem_clflush.c | 2 +-
 drivers/gpu/drm/ttm/ttm_tt.c            | 2 +-
 drivers/gpu/drm/vgem/vgem_drv.c         | 2 +-
 include/drm/drm_cache.h                 | 5 +++--
 6 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c
index 89cdd32fe1f3..3d2bb9d71a60 100644
--- a/drivers/gpu/drm/drm_cache.c
+++ b/drivers/gpu/drm/drm_cache.c
@@ -78,7 +78,8 @@ static void drm_cache_flush_clflush(struct page *pages[],
  * to a page in the array.
  */
 void
-drm_flush_pages(struct page *pages[], unsigned long num_pages)
+drm_flush_pages(struct device *dev, struct page *pages[],
+		unsigned long num_pages)
 {
 
 #if defined(CONFIG_X86)
@@ -119,7 +120,7 @@ EXPORT_SYMBOL(drm_flush_pages);
  * sg.
  */
 void
-drm_flush_sg(struct sg_table *st)
+drm_flush_sg(struct device *dev, struct sg_table *st)
 {
 #if defined(CONFIG_X86)
 	if (static_cpu_has(X86_FEATURE_CLFLUSH)) {
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index fe191d0e84e1..045866f2b5dd 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -259,7 +259,7 @@ __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
 	if (needs_clflush &&
 	    (obj->base.read_domains & I915_GEM_DOMAIN_CPU) == 0 &&
 	    !(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ))
-		drm_flush_sg(pages);
+		drm_flush_sg(obj->base.dev->dev, pages);
 
 	__start_cpu_write(obj);
 }
diff --git a/drivers/gpu/drm/i915/i915_gem_clflush.c b/drivers/gpu/drm/i915/i915_gem_clflush.c
index f413c5e5735d..b5938f14141f 100644
--- a/drivers/gpu/drm/i915/i915_gem_clflush.c
+++ b/drivers/gpu/drm/i915/i915_gem_clflush.c
@@ -71,7 +71,7 @@ static const struct dma_fence_ops i915_clflush_ops = {
 static void __i915_do_clflush(struct drm_i915_gem_object *obj)
 {
 	GEM_BUG_ON(!i915_gem_object_has_pages(obj));
-	drm_flush_sg(obj->mm.pages);
+	drm_flush_sg(obj->base.dev->dev, obj->mm.pages);
 	intel_fb_obj_flush(obj, ORIGIN_CPU);
 }
 
diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c
index 59e272a58752..fb2382d01bba 100644
--- a/drivers/gpu/drm/ttm/ttm_tt.c
+++ b/drivers/gpu/drm/ttm/ttm_tt.c
@@ -122,7 +122,7 @@ static int ttm_tt_set_caching(struct ttm_tt *ttm,
 	}
 
 	if (ttm->caching_state == tt_cached)
-		drm_flush_pages(ttm->pages, ttm->num_pages);
+		drm_flush_pages(NULL, ttm->pages, ttm->num_pages);
 
 	for (i = 0; i < ttm->num_pages; ++i) {
 		cur_page = ttm->pages[i];
diff --git a/drivers/gpu/drm/vgem/vgem_drv.c b/drivers/gpu/drm/vgem/vgem_drv.c
index 802a97e1a4bf..35bfdfb746a7 100644
--- a/drivers/gpu/drm/vgem/vgem_drv.c
+++ b/drivers/gpu/drm/vgem/vgem_drv.c
@@ -325,7 +325,7 @@ static int vgem_prime_pin(struct drm_gem_object *obj)
 	/* Flush the object from the CPU cache so that importers can rely
 	 * on coherent indirect access via the exported dma-address.
 	 */
-	drm_flush_pages(pages, n_pages);
+	drm_flush_pages(obj->dev->dev, pages, n_pages);
 
 	return 0;
 }
diff --git a/include/drm/drm_cache.h b/include/drm/drm_cache.h
index 25c029470315..cb77315dd8dd 100644
--- a/include/drm/drm_cache.h
+++ b/include/drm/drm_cache.h
@@ -35,8 +35,9 @@
 
 #include <linux/scatterlist.h>
 
-void drm_flush_pages(struct page *pages[], unsigned long num_pages);
-void drm_flush_sg(struct sg_table *st);
+void drm_flush_pages(struct device *dev, struct page *pages[],
+		     unsigned long num_pages);
+void drm_flush_sg(struct device *dev, struct sg_table *st);
 void drm_clflush_virt_range(void *addr, unsigned long length);
 
 static inline bool drm_arch_can_wc_memory(void)
-- 
2.13.5


* [PATCH 3/4] drm: add ARM flush implementations
       [not found] ` <20180117003559.67837-1-gurchetansingh-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org>
@ 2018-01-17  0:35   ` Gurchetan Singh
       [not found]     ` <20180117003559.67837-3-gurchetansingh-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org>
  2018-01-17  0:35   ` [PATCH 4/4] drm/vgem: flush page during page fault Gurchetan Singh
  1 sibling, 1 reply; 19+ messages in thread
From: Gurchetan Singh @ 2018-01-17  0:35 UTC (permalink / raw)
  To: dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: thierry.reding-Re5JQEeQqe8AvxtiuMwx3w,
	heiko-4mtYJXux2i+zQB+pC5nmwQ,
	daniel.vetter-ral2JQCrhuEAvxtiuMwx3w,
	chris-Y6uKTt2uX1cEflXRtASbqLVCufUGDwFn,
	linux-tegra-u79uwXL29TY76Z2rM5mHXA, Gurchetan Singh

The DMA API can be used to flush scatter gather tables and physical
pages on ARM devices.

Signed-off-by: Gurchetan Singh <gurchetansingh-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org>
---
 drivers/gpu/drm/drm_cache.c                 | 17 +++++++++++++++++
 drivers/gpu/drm/rockchip/rockchip_drm_gem.c |  7 ++-----
 drivers/gpu/drm/tegra/gem.c                 |  6 +-----
 3 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c
index 3d2bb9d71a60..98d6ebb40e96 100644
--- a/drivers/gpu/drm/drm_cache.c
+++ b/drivers/gpu/drm/drm_cache.c
@@ -105,6 +105,18 @@ drm_flush_pages(struct device *dev, struct page *pages[],
 				   (unsigned long)page_virtual + PAGE_SIZE);
 		kunmap_atomic(page_virtual);
 	}
+#elif defined(CONFIG_ARM) || defined(CONFIG_ARM64)
+	unsigned long i;
+	dma_addr_t dma_handle;
+
+	if (!dev)
+		return;
+
+	for (i = 0; i < num_pages; i++) {
+		dma_handle = phys_to_dma(drm->dev, page_to_phys(pages[i]));
+		dma_sync_single_for_device(dev, dma_handle, PAGE_SIZE,
+					   DMA_TO_DEVICE);
+	}
 #else
 	pr_err("Architecture has no drm_cache.c support\n");
 	WARN_ON_ONCE(1);
@@ -136,6 +148,11 @@ drm_flush_sg(struct device *dev, struct sg_table *st)
 
 	if (wbinvd_on_all_cpus())
 		pr_err("Timed out waiting for cache flush\n");
+#elif defined(CONFIG_ARM) || defined(CONFIG_ARM64)
+	if (!dev)
+		return;
+
+	dma_sync_sg_for_device(dev, st->sgl, st->nents, DMA_TO_DEVICE);
 #else
 	pr_err("Architecture has no drm_cache.c support\n");
 	WARN_ON_ONCE(1);
diff --git a/drivers/gpu/drm/rockchip/rockchip_drm_gem.c b/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
index 8ac7eb25e46d..0157f90b5d10 100644
--- a/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
+++ b/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
@@ -14,6 +14,7 @@
 
 #include <drm/drm.h>
 #include <drm/drmP.h>
+#include <drm/drm_cache.h>
 #include <drm/drm_gem.h>
 #include <drm/drm_vma_manager.h>
 #include <linux/iommu.h>
@@ -99,15 +100,11 @@ static int rockchip_gem_get_pages(struct rockchip_gem_object *rk_obj)
 	/*
 	 * Fake up the SG table so that dma_sync_sg_for_device() can be used
 	 * to flush the pages associated with it.
-	 *
-	 * TODO: Replace this by drm_flush_sg() once it can be implemented
-	 * without relying on symbols that are not exported.
 	 */
 	for_each_sg(rk_obj->sgt->sgl, s, rk_obj->sgt->nents, i)
 		sg_dma_address(s) = sg_phys(s);
 
-	dma_sync_sg_for_device(drm->dev, rk_obj->sgt->sgl, rk_obj->sgt->nents,
-			       DMA_TO_DEVICE);
+	drm_flush_sg(drm->dev, rk_obj->sgt);
 
 	return 0;
 
diff --git a/drivers/gpu/drm/tegra/gem.c b/drivers/gpu/drm/tegra/gem.c
index ab1e53d434e8..9945fd2f6bd6 100644
--- a/drivers/gpu/drm/tegra/gem.c
+++ b/drivers/gpu/drm/tegra/gem.c
@@ -230,15 +230,11 @@ static int tegra_bo_get_pages(struct drm_device *drm, struct tegra_bo *bo)
 	/*
 	 * Fake up the SG table so that dma_sync_sg_for_device() can be used
 	 * to flush the pages associated with it.
-	 *
-	 * TODO: Replace this by drm_clflash_sg() once it can be implemented
-	 * without relying on symbols that are not exported.
 	 */
 	for_each_sg(bo->sgt->sgl, s, bo->sgt->nents, i)
 		sg_dma_address(s) = sg_phys(s);
 
-	dma_sync_sg_for_device(drm->dev, bo->sgt->sgl, bo->sgt->nents,
-			       DMA_TO_DEVICE);
+	drm_flush_sg(drm->dev, bo->sgt);
 
 	return 0;
 
-- 
2.13.5


* [PATCH 4/4] drm/vgem: flush page during page fault
       [not found] ` <20180117003559.67837-1-gurchetansingh-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org>
  2018-01-17  0:35   ` [PATCH 3/4] drm: add ARM flush implementations Gurchetan Singh
@ 2018-01-17  0:35   ` Gurchetan Singh
       [not found]     ` <20180117003559.67837-4-gurchetansingh-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org>
  1 sibling, 1 reply; 19+ messages in thread
From: Gurchetan Singh @ 2018-01-17  0:35 UTC (permalink / raw)
  To: dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: thierry.reding-Re5JQEeQqe8AvxtiuMwx3w,
	heiko-4mtYJXux2i+zQB+pC5nmwQ,
	daniel.vetter-ral2JQCrhuEAvxtiuMwx3w,
	chris-Y6uKTt2uX1cEflXRtASbqLVCufUGDwFn,
	linux-tegra-u79uwXL29TY76Z2rM5mHXA, Gurchetan Singh

This is required to use buffers allocated by vgem on AMD and ARM devices.
We're experiencing a case where eviction of the cache races with userspace
writes. To fix this, flush the cache after retrieving a page.

Signed-off-by: Gurchetan Singh <gurchetansingh-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org>
---
 drivers/gpu/drm/vgem/vgem_drv.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/vgem/vgem_drv.c b/drivers/gpu/drm/vgem/vgem_drv.c
index 35bfdfb746a7..fb263969f02d 100644
--- a/drivers/gpu/drm/vgem/vgem_drv.c
+++ b/drivers/gpu/drm/vgem/vgem_drv.c
@@ -112,6 +112,7 @@ static int vgem_gem_fault(struct vm_fault *vmf)
 				break;
 		}
 
+		drm_flush_pages(obj->base.dev->dev, &page, 1);
 	}
 	return ret;
 }
-- 
2.13.5


* Re: [PATCH 3/4] drm: add ARM flush implementations
       [not found]     ` <20180117003559.67837-3-gurchetansingh-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org>
@ 2018-01-17  8:31       ` Daniel Vetter
       [not found]         ` <20180117083105.GG2759-dv86pmgwkMBes7Z6vYuT8azUEOm+Xw19@public.gmane.org>
  2018-01-17 22:46         ` Gurchetan Singh
  0 siblings, 2 replies; 19+ messages in thread
From: Daniel Vetter @ 2018-01-17  8:31 UTC (permalink / raw)
  To: Gurchetan Singh
  Cc: dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	thierry.reding-Re5JQEeQqe8AvxtiuMwx3w,
	linux-tegra-u79uwXL29TY76Z2rM5mHXA,
	daniel.vetter-ral2JQCrhuEAvxtiuMwx3w

On Tue, Jan 16, 2018 at 04:35:58PM -0800, Gurchetan Singh wrote:
> The DMA API can be used to flush scatter gather tables and physical
> pages on ARM devices.
> 
> Signed-off-by: Gurchetan Singh <gurchetansingh-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org>
> ---
>  drivers/gpu/drm/drm_cache.c                 | 17 +++++++++++++++++
>  drivers/gpu/drm/rockchip/rockchip_drm_gem.c |  7 ++-----
>  drivers/gpu/drm/tegra/gem.c                 |  6 +-----
>  3 files changed, 20 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c
> index 3d2bb9d71a60..98d6ebb40e96 100644
> --- a/drivers/gpu/drm/drm_cache.c
> +++ b/drivers/gpu/drm/drm_cache.c
> @@ -105,6 +105,18 @@ drm_flush_pages(struct device *dev, struct page *pages[],
>  				   (unsigned long)page_virtual + PAGE_SIZE);
>  		kunmap_atomic(page_virtual);
>  	}
> +#elif defined(CONFIG_ARM) || defined(CONFIG_ARM64)
> +	unsigned long i;
> +	dma_addr_t dma_handle;
> +
> +	if (!dev)
> +		return;
> +
> +	for (i = 0; i < num_pages; i++) {
> +		dma_handle = phys_to_dma(drm->dev, page_to_phys(pages[i]));
> +		dma_sync_single_for_device(dev, dma_handle, PAGE_SIZE,
> +					   DMA_TO_DEVICE);

Erm no. These functions here are super-low-level functions used by drivers
which know exactly what they're doing. Which is reimplementing half of the
dma api behind the dma api's back because the dma api just isn't quite
sufficient for implementing fast gpu drivers.

If all you end up doing is calling the dma api again, then pls just call
it directly.

And just to make it clear: I'd be perfectly fine with adding arm support
here, but then it'd need to directly flush cpu caches and bypass the dma
api. Otherwise this is pointless.
-Daniel

> +	}
>  #else
>  	pr_err("Architecture has no drm_cache.c support\n");
>  	WARN_ON_ONCE(1);
> @@ -136,6 +148,11 @@ drm_flush_sg(struct device *dev, struct sg_table *st)
>  
>  	if (wbinvd_on_all_cpus())
>  		pr_err("Timed out waiting for cache flush\n");
> +#elif defined(CONFIG_ARM) || defined(CONFIG_ARM64)
> +	if (!dev)
> +		return;
> +
> +	dma_sync_sg_for_device(dev, st->sgl, st->nents, DMA_TO_DEVICE);
>  #else
>  	pr_err("Architecture has no drm_cache.c support\n");
>  	WARN_ON_ONCE(1);
> diff --git a/drivers/gpu/drm/rockchip/rockchip_drm_gem.c b/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
> index 8ac7eb25e46d..0157f90b5d10 100644
> --- a/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
> +++ b/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
> @@ -14,6 +14,7 @@
>  
>  #include <drm/drm.h>
>  #include <drm/drmP.h>
> +#include <drm/drm_cache.h>
>  #include <drm/drm_gem.h>
>  #include <drm/drm_vma_manager.h>
>  #include <linux/iommu.h>
> @@ -99,15 +100,11 @@ static int rockchip_gem_get_pages(struct rockchip_gem_object *rk_obj)
>  	/*
>  	 * Fake up the SG table so that dma_sync_sg_for_device() can be used
>  	 * to flush the pages associated with it.
> -	 *
> -	 * TODO: Replace this by drm_flush_sg() once it can be implemented
> -	 * without relying on symbols that are not exported.
>  	 */
>  	for_each_sg(rk_obj->sgt->sgl, s, rk_obj->sgt->nents, i)
>  		sg_dma_address(s) = sg_phys(s);
>  
> -	dma_sync_sg_for_device(drm->dev, rk_obj->sgt->sgl, rk_obj->sgt->nents,
> -			       DMA_TO_DEVICE);
> +	drm_flush_sg(drm->dev, rk_obj->sgt);
>  
>  	return 0;
>  
> diff --git a/drivers/gpu/drm/tegra/gem.c b/drivers/gpu/drm/tegra/gem.c
> index ab1e53d434e8..9945fd2f6bd6 100644
> --- a/drivers/gpu/drm/tegra/gem.c
> +++ b/drivers/gpu/drm/tegra/gem.c
> @@ -230,15 +230,11 @@ static int tegra_bo_get_pages(struct drm_device *drm, struct tegra_bo *bo)
>  	/*
>  	 * Fake up the SG table so that dma_sync_sg_for_device() can be used
>  	 * to flush the pages associated with it.
> -	 *
> -	 * TODO: Replace this by drm_clflash_sg() once it can be implemented
> -	 * without relying on symbols that are not exported.
>  	 */
>  	for_each_sg(bo->sgt->sgl, s, bo->sgt->nents, i)
>  		sg_dma_address(s) = sg_phys(s);
>  
> -	dma_sync_sg_for_device(drm->dev, bo->sgt->sgl, bo->sgt->nents,
> -			       DMA_TO_DEVICE);
> +	drm_flush_sg(drm->dev, bo->sgt);
>  
>  	return 0;
>  
> -- 
> 2.13.5
> 
> _______________________________________________
> dri-devel mailing list
> dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
> https://lists.freedesktop.org/mailman/listinfo/dri-devel

-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch


* Re: [PATCH 4/4] drm/vgem: flush page during page fault
       [not found]     ` <20180117003559.67837-4-gurchetansingh-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org>
@ 2018-01-17  8:39       ` Daniel Vetter
  2018-01-17 22:49         ` Gurchetan Singh
  0 siblings, 1 reply; 19+ messages in thread
From: Daniel Vetter @ 2018-01-17  8:39 UTC (permalink / raw)
  To: Gurchetan Singh
  Cc: dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	thierry.reding-Re5JQEeQqe8AvxtiuMwx3w,
	linux-tegra-u79uwXL29TY76Z2rM5mHXA,
	daniel.vetter-ral2JQCrhuEAvxtiuMwx3w

On Tue, Jan 16, 2018 at 04:35:59PM -0800, Gurchetan Singh wrote:
> This is required to use buffers allocated by vgem on AMD and ARM devices.
> We're experiencing a case where eviction of the cache races with userspace
> writes. To fix this, flush the cache after retrieving a page.
> 
> Signed-off-by: Gurchetan Singh <gurchetansingh-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org>
> ---
>  drivers/gpu/drm/vgem/vgem_drv.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/drivers/gpu/drm/vgem/vgem_drv.c b/drivers/gpu/drm/vgem/vgem_drv.c
> index 35bfdfb746a7..fb263969f02d 100644
> --- a/drivers/gpu/drm/vgem/vgem_drv.c
> +++ b/drivers/gpu/drm/vgem/vgem_drv.c
> @@ -112,6 +112,7 @@ static int vgem_gem_fault(struct vm_fault *vmf)
>  				break;
>  		}
>  
> +		drm_flush_pages(obj->base.dev->dev, &page, 1);

Uh ... what exactly are you doing?

Asking because the entire "who's responsible for coherency" story is
entirely undefined still when doing buffer sharing :-/ What is clear is
that currently vgem entirely ignores this (there's not
begin/end_cpu_access callback), mostly because the shared dma-buf support
in drm_prime.c also entirely ignores this. And doing a one-time only
flushing in your fault handler is definitely not going to fix this (at
least not if you do anything else than one-shot uploads).
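
For reference, those hooks look roughly like this on the exporter side
(a sketch with a hypothetical example_bo object, not vgem code):

#include <linux/dma-buf.h>
#include <linux/dma-mapping.h>

/*
 * Sketch: exporter-side begin/end_cpu_access implementations that do the
 * cache maintenance via the DMA API. These would be plugged into the
 * exporter's struct dma_buf_ops; vgem currently leaves them unset.
 */
static int example_begin_cpu_access(struct dma_buf *dmabuf,
				    enum dma_data_direction dir)
{
	struct example_bo *bo = dmabuf->priv;	/* hypothetical object */

	dma_sync_sg_for_cpu(bo->dev, bo->sgt->sgl, bo->sgt->nents, dir);
	return 0;
}

static int example_end_cpu_access(struct dma_buf *dmabuf,
				  enum dma_data_direction dir)
{
	struct example_bo *bo = dmabuf->priv;

	dma_sync_sg_for_device(bo->dev, bo->sgt->sgl, bo->sgt->nents, dir);
	return 0;
}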
-Daniel

>  	}
>  	return ret;
>  }
> -- 
> 2.13.5
> 
> _______________________________________________
> dri-devel mailing list
> dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
> https://lists.freedesktop.org/mailman/listinfo/dri-devel

-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch


* Re: [PATCH 3/4] drm: add ARM flush implementations
       [not found]         ` <20180117083105.GG2759-dv86pmgwkMBes7Z6vYuT8azUEOm+Xw19@public.gmane.org>
@ 2018-01-17 18:53           ` Sean Paul
  2018-01-17 21:22             ` Daniel Vetter
  0 siblings, 1 reply; 19+ messages in thread
From: Sean Paul @ 2018-01-17 18:53 UTC (permalink / raw)
  To: Daniel Vetter
  Cc: Gurchetan Singh, linux-tegra-u79uwXL29TY76Z2rM5mHXA,
	daniel.vetter-ral2JQCrhuEAvxtiuMwx3w,
	thierry.reding-Re5JQEeQqe8AvxtiuMwx3w,
	dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

On Wed, Jan 17, 2018 at 09:31:05AM +0100, Daniel Vetter wrote:
> On Tue, Jan 16, 2018 at 04:35:58PM -0800, Gurchetan Singh wrote:
> > The DMA API can be used to flush scatter gather tables and physical
> > pages on ARM devices.
> > 
> > Signed-off-by: Gurchetan Singh <gurchetansingh-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org>
> > ---
> >  drivers/gpu/drm/drm_cache.c                 | 17 +++++++++++++++++
> >  drivers/gpu/drm/rockchip/rockchip_drm_gem.c |  7 ++-----
> >  drivers/gpu/drm/tegra/gem.c                 |  6 +-----
> >  3 files changed, 20 insertions(+), 10 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c
> > index 3d2bb9d71a60..98d6ebb40e96 100644
> > --- a/drivers/gpu/drm/drm_cache.c
> > +++ b/drivers/gpu/drm/drm_cache.c
> > @@ -105,6 +105,18 @@ drm_flush_pages(struct device *dev, struct page *pages[],
> >  				   (unsigned long)page_virtual + PAGE_SIZE);
> >  		kunmap_atomic(page_virtual);
> >  	}
> > +#elif defined(CONFIG_ARM) || defined(CONFIG_ARM64)
> > +	unsigned long i;
> > +	dma_addr_t dma_handle;
> > +
> > +	if (!dev)
> > +		return;
> > +
> > +	for (i = 0; i < num_pages; i++) {
> > +		dma_handle = phys_to_dma(drm->dev, page_to_phys(pages[i]));
> > +		dma_sync_single_for_device(dev, dma_handle, PAGE_SIZE,
> > +					   DMA_TO_DEVICE);
> 
> Erm no. These functions here are super-low-level functions used by drivers
> which know exactly what they're doing. Which is reimplementing half of the
> dma api behind the dma api's back because the dma api just isn't quite
> sufficient for implementing fast gpu drivers.

This isn't obvious from reading the docs, and doubly confusing when reading the
TODOs in the individual drivers. I don't think it's reasonable to assume
everyone (especially first time contributors!) will know this.

> 
> If all you end up doing is calling the dma api again, then pls just call
> it directly.

Given that some drivers already midlayer the flush, I don't think it's wrong to
provide "dumb" default behavior so drivers don't need to worry about knowing
exactly what they're doing.

> 
> And just to make it clear: I'd be perfectly fine with adding arm support
> here, but then it'd need to directly flush cpu caches and bypass the dma
> api. Otherwise this is pointless.

I don't really have skin in this game, but I think providing one call that works
across all platforms is pretty nice.

Sean


> -Daniel
> 
> > +	}
> >  #else
> >  	pr_err("Architecture has no drm_cache.c support\n");
> >  	WARN_ON_ONCE(1);
> > @@ -136,6 +148,11 @@ drm_flush_sg(struct device *dev, struct sg_table *st)
> >  
> >  	if (wbinvd_on_all_cpus())
> >  		pr_err("Timed out waiting for cache flush\n");
> > +#elif defined(CONFIG_ARM) || defined(CONFIG_ARM64)
> > +	if (!dev)
> > +		return;
> > +
> > +	dma_sync_sg_for_device(dev, st->sgl, st->nents, DMA_TO_DEVICE);
> >  #else
> >  	pr_err("Architecture has no drm_cache.c support\n");
> >  	WARN_ON_ONCE(1);
> > diff --git a/drivers/gpu/drm/rockchip/rockchip_drm_gem.c b/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
> > index 8ac7eb25e46d..0157f90b5d10 100644
> > --- a/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
> > +++ b/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
> > @@ -14,6 +14,7 @@
> >  
> >  #include <drm/drm.h>
> >  #include <drm/drmP.h>
> > +#include <drm/drm_cache.h>
> >  #include <drm/drm_gem.h>
> >  #include <drm/drm_vma_manager.h>
> >  #include <linux/iommu.h>
> > @@ -99,15 +100,11 @@ static int rockchip_gem_get_pages(struct rockchip_gem_object *rk_obj)
> >  	/*
> >  	 * Fake up the SG table so that dma_sync_sg_for_device() can be used
> >  	 * to flush the pages associated with it.
> > -	 *
> > -	 * TODO: Replace this by drm_flush_sg() once it can be implemented
> > -	 * without relying on symbols that are not exported.
> >  	 */
> >  	for_each_sg(rk_obj->sgt->sgl, s, rk_obj->sgt->nents, i)
> >  		sg_dma_address(s) = sg_phys(s);
> >  
> > -	dma_sync_sg_for_device(drm->dev, rk_obj->sgt->sgl, rk_obj->sgt->nents,
> > -			       DMA_TO_DEVICE);
> > +	drm_flush_sg(drm->dev, rk_obj->sgt);
> >  
> >  	return 0;
> >  
> > diff --git a/drivers/gpu/drm/tegra/gem.c b/drivers/gpu/drm/tegra/gem.c
> > index ab1e53d434e8..9945fd2f6bd6 100644
> > --- a/drivers/gpu/drm/tegra/gem.c
> > +++ b/drivers/gpu/drm/tegra/gem.c
> > @@ -230,15 +230,11 @@ static int tegra_bo_get_pages(struct drm_device *drm, struct tegra_bo *bo)
> >  	/*
> >  	 * Fake up the SG table so that dma_sync_sg_for_device() can be used
> >  	 * to flush the pages associated with it.
> > -	 *
> > -	 * TODO: Replace this by drm_clflash_sg() once it can be implemented
> > -	 * without relying on symbols that are not exported.
> >  	 */
> >  	for_each_sg(bo->sgt->sgl, s, bo->sgt->nents, i)
> >  		sg_dma_address(s) = sg_phys(s);
> >  
> > -	dma_sync_sg_for_device(drm->dev, bo->sgt->sgl, bo->sgt->nents,
> > -			       DMA_TO_DEVICE);
> > +	drm_flush_sg(drm->dev, bo->sgt);
> >  
> >  	return 0;
> >  
> > -- 
> > 2.13.5
> > 
> > _______________________________________________
> > dri-devel mailing list
> > dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
> > https://lists.freedesktop.org/mailman/listinfo/dri-devel
> 
> -- 
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch
> _______________________________________________
> dri-devel mailing list
> dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
> https://lists.freedesktop.org/mailman/listinfo/dri-devel

-- 
Sean Paul, Software Engineer, Google / Chromium OS


* Re: [PATCH 3/4] drm: add ARM flush implementations
  2018-01-17 18:53           ` Sean Paul
@ 2018-01-17 21:22             ` Daniel Vetter
  0 siblings, 0 replies; 19+ messages in thread
From: Daniel Vetter @ 2018-01-17 21:22 UTC (permalink / raw)
  To: Sean Paul
  Cc: Gurchetan Singh, linux-tegra-u79uwXL29TY76Z2rM5mHXA,
	Daniel Vetter, Thierry Reding, dri-devel

On Wed, Jan 17, 2018 at 7:53 PM, Sean Paul <seanpaul-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org> wrote:
> On Wed, Jan 17, 2018 at 09:31:05AM +0100, Daniel Vetter wrote:
>> On Tue, Jan 16, 2018 at 04:35:58PM -0800, Gurchetan Singh wrote:
>> > The DMA API can be used to flush scatter gather tables and physical
>> > pages on ARM devices.
>> >
>> > Signed-off-by: Gurchetan Singh <gurchetansingh-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org>
>> > ---
>> >  drivers/gpu/drm/drm_cache.c                 | 17 +++++++++++++++++
>> >  drivers/gpu/drm/rockchip/rockchip_drm_gem.c |  7 ++-----
>> >  drivers/gpu/drm/tegra/gem.c                 |  6 +-----
>> >  3 files changed, 20 insertions(+), 10 deletions(-)
>> >
>> > diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c
>> > index 3d2bb9d71a60..98d6ebb40e96 100644
>> > --- a/drivers/gpu/drm/drm_cache.c
>> > +++ b/drivers/gpu/drm/drm_cache.c
>> > @@ -105,6 +105,18 @@ drm_flush_pages(struct device *dev, struct page *pages[],
>> >                                (unsigned long)page_virtual + PAGE_SIZE);
>> >             kunmap_atomic(page_virtual);
>> >     }
>> > +#elif defined(CONFIG_ARM) || defined(CONFIG_ARM64)
>> > +   unsigned long i;
>> > +   dma_addr_t dma_handle;
>> > +
>> > +   if (!dev)
>> > +           return;
>> > +
>> > +   for (i = 0; i < num_pages; i++) {
>> > +           dma_handle = phys_to_dma(drm->dev, page_to_phys(pages[i]));
>> > +           dma_sync_single_for_device(dev, dma_handle, PAGE_SIZE,
>> > +                                      DMA_TO_DEVICE);
>>
>> Erm no. These functions here are super-low-level functions used by drivers
>> which know exactly what they're doing. Which is reimplementing half of the
>> dma api behind the dma api's back because the dma api just isn't quite
>> sufficient for implementing fast gpu drivers.
>
> This isn't obvious from reading the docs, and doubly confusing when reading the
> TODOs in the individual drivers. I don't think it's reasonable to assume
> everyone (especially first time contributors!) will know this.

There aren't any docs, because all this stuff is really badly littered
with ill-defined behaviour (managed dma-buf and dma api coherency for
gpu drivers). I've discussed it with lots of people, but thus far
there's no insight into how we should do this.

>> If all you end up doing is calling the dma api again, then pls just call
>> it directly.
>
> Given that some drivers already midlayer the flush, I don't think it's wrong to
> provide "dumb" default behavior so drivers don't need to worry about knowing
> exactly what they're doing.
>
>>
>> And just to make it clear: I'd be perfectly fine with adding arm support
>> here, but then it'd need to directly flush cpu caches and bypass the dma
>> api. Otherwise this is pointless.
>
> I don't really have skin in this game, but I think providing one call that works
> across all platforms is pretty nice.

The problem I have is not with calling back down into the dma api.
It's with adding a device parameter that isn't needed and is only there
because we call down into the wrong layer (i.e. the previous patch).
If you can make this happen without the dummy device then I'd be happy
to ack the patches. But I thought ARM has a cacheline flush
instruction too?
-Daniel

>
> Sean
>
>
>> -Daniel
>>
>> > +   }
>> >  #else
>> >     pr_err("Architecture has no drm_cache.c support\n");
>> >     WARN_ON_ONCE(1);
>> > @@ -136,6 +148,11 @@ drm_flush_sg(struct device *dev, struct sg_table *st)
>> >
>> >     if (wbinvd_on_all_cpus())
>> >             pr_err("Timed out waiting for cache flush\n");
>> > +#elif defined(CONFIG_ARM) || defined(CONFIG_ARM64)
>> > +   if (!dev)
>> > +           return;
>> > +
>> > +   dma_sync_sg_for_device(dev, st->sgl, st->nents, DMA_TO_DEVICE);
>> >  #else
>> >     pr_err("Architecture has no drm_cache.c support\n");
>> >     WARN_ON_ONCE(1);
>> > diff --git a/drivers/gpu/drm/rockchip/rockchip_drm_gem.c b/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
>> > index 8ac7eb25e46d..0157f90b5d10 100644
>> > --- a/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
>> > +++ b/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
>> > @@ -14,6 +14,7 @@
>> >
>> >  #include <drm/drm.h>
>> >  #include <drm/drmP.h>
>> > +#include <drm/drm_cache.h>
>> >  #include <drm/drm_gem.h>
>> >  #include <drm/drm_vma_manager.h>
>> >  #include <linux/iommu.h>
>> > @@ -99,15 +100,11 @@ static int rockchip_gem_get_pages(struct rockchip_gem_object *rk_obj)
>> >     /*
>> >      * Fake up the SG table so that dma_sync_sg_for_device() can be used
>> >      * to flush the pages associated with it.
>> > -    *
>> > -    * TODO: Replace this by drm_flush_sg() once it can be implemented
>> > -    * without relying on symbols that are not exported.
>> >      */
>> >     for_each_sg(rk_obj->sgt->sgl, s, rk_obj->sgt->nents, i)
>> >             sg_dma_address(s) = sg_phys(s);
>> >
>> > -   dma_sync_sg_for_device(drm->dev, rk_obj->sgt->sgl, rk_obj->sgt->nents,
>> > -                          DMA_TO_DEVICE);
>> > +   drm_flush_sg(drm->dev, rk_obj->sgt);
>> >
>> >     return 0;
>> >
>> > diff --git a/drivers/gpu/drm/tegra/gem.c b/drivers/gpu/drm/tegra/gem.c
>> > index ab1e53d434e8..9945fd2f6bd6 100644
>> > --- a/drivers/gpu/drm/tegra/gem.c
>> > +++ b/drivers/gpu/drm/tegra/gem.c
>> > @@ -230,15 +230,11 @@ static int tegra_bo_get_pages(struct drm_device *drm, struct tegra_bo *bo)
>> >     /*
>> >      * Fake up the SG table so that dma_sync_sg_for_device() can be used
>> >      * to flush the pages associated with it.
>> > -    *
>> > -    * TODO: Replace this by drm_clflash_sg() once it can be implemented
>> > -    * without relying on symbols that are not exported.
>> >      */
>> >     for_each_sg(bo->sgt->sgl, s, bo->sgt->nents, i)
>> >             sg_dma_address(s) = sg_phys(s);
>> >
>> > -   dma_sync_sg_for_device(drm->dev, bo->sgt->sgl, bo->sgt->nents,
>> > -                          DMA_TO_DEVICE);
>> > +   drm_flush_sg(drm->dev, bo->sgt);
>> >
>> >     return 0;
>> >
>> > --
>> > 2.13.5
>> >
>> > _______________________________________________
>> > dri-devel mailing list
>> > dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>> > https://lists.freedesktop.org/mailman/listinfo/dri-devel
>>
>> --
>> Daniel Vetter
>> Software Engineer, Intel Corporation
>> http://blog.ffwll.ch
>> _______________________________________________
>> dri-devel mailing list
>> dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>> https://lists.freedesktop.org/mailman/listinfo/dri-devel
>
> --
> Sean Paul, Software Engineer, Google / Chromium OS



-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch


* Re: [PATCH 3/4] drm: add ARM flush implementations
  2018-01-17  8:31       ` Daniel Vetter
       [not found]         ` <20180117083105.GG2759-dv86pmgwkMBes7Z6vYuT8azUEOm+Xw19@public.gmane.org>
@ 2018-01-17 22:46         ` Gurchetan Singh
       [not found]           ` <CAAfnVBm4Mp8vC4aBmrP2rJeRSBN_AFN5gaZfijg9BCJEBupDzw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  1 sibling, 1 reply; 19+ messages in thread
From: Gurchetan Singh @ 2018-01-17 22:46 UTC (permalink / raw)
  To: Daniel Vetter
  Cc: linux-tegra, daniel.vetter, thierry.reding, laurent.pinchart,
	ML dri-devel



>
> dma api just isn't quite sufficient for implementing fast gpu drivers.


Can you elaborate?  IIRC the DMA API has strong synchronization guarantees
and that can be problematic for GPU drivers.  However, using the DMA API
for flushing doesn't necessarily mean the driver has to use the rest of the
DMA API.

> but then it'd need to directly flush cpu caches and bypass the dma api.


On ARM, cache flushing seems to vary from chipset to chipset.  For example,
on ARM32 a typical call-stack for dma_sync_single_for_device looks like:

arm_dma_sync_single_for_device
__dma_page_cpu_to_dev
outer_clean_range
outer_cache.clean_range

There are multiple clean_range implementations out there
(e.g. aurora_clean_range, l2c210_clean_range, feroceon_l2_clean_range), so
that's why the DMA API was used in this case.  On ARM64, things are a
little simpler, but the DMA API seems to go directly to assembly
(__dma_map_area) after a little indirection.  Why do you think that's
inefficient?
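
For reference, a full CPU-to-device flush of a single page through the
streaming DMA API looks roughly like this (a sketch with a hypothetical
helper name, not code from this series):

#include <linux/dma-mapping.h>
#include <linux/mm.h>

/*
 * Sketch: dma_map_page() with DMA_TO_DEVICE performs the cache clean
 * described above (__dma_page_cpu_to_dev -> outer_clean_range on ARM32);
 * dma_sync_single_for_device() repeats it for an already-mapped page
 * after further CPU writes.
 */
static void example_flush_page_to_device(struct device *dev, struct page *page)
{
	dma_addr_t handle;

	handle = dma_map_page(dev, page, 0, PAGE_SIZE, DMA_TO_DEVICE);
	if (dma_mapping_error(dev, handle))
		return;

	/* ... CPU writes here would need another sync before device use ... */
	dma_sync_single_for_device(dev, handle, PAGE_SIZE, DMA_TO_DEVICE);

	dma_unmap_page(dev, handle, PAGE_SIZE, DMA_TO_DEVICE);
}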

On Wed, Jan 17, 2018 at 12:31 AM, Daniel Vetter <daniel@ffwll.ch> wrote:

> On Tue, Jan 16, 2018 at 04:35:58PM -0800, Gurchetan Singh wrote:
> > The DMA API can be used to flush scatter gather tables and physical
> > pages on ARM devices.
> >
> > Signed-off-by: Gurchetan Singh <gurchetansingh@chromium.org>
> > ---
> >  drivers/gpu/drm/drm_cache.c                 | 17 +++++++++++++++++
> >  drivers/gpu/drm/rockchip/rockchip_drm_gem.c |  7 ++-----
> >  drivers/gpu/drm/tegra/gem.c                 |  6 +-----
> >  3 files changed, 20 insertions(+), 10 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c
> > index 3d2bb9d71a60..98d6ebb40e96 100644
> > --- a/drivers/gpu/drm/drm_cache.c
> > +++ b/drivers/gpu/drm/drm_cache.c
> > @@ -105,6 +105,18 @@ drm_flush_pages(struct device *dev, struct page
> *pages[],
> >                                  (unsigned long)page_virtual +
> PAGE_SIZE);
> >               kunmap_atomic(page_virtual);
> >       }
> > +#elif defined(CONFIG_ARM) || defined(CONFIG_ARM64)
> > +     unsigned long i;
> > +     dma_addr_t dma_handle;
> > +
> > +     if (!dev)
> > +             return;
> > +
> > +     for (i = 0; i < num_pages; i++) {
> > +             dma_handle = phys_to_dma(drm->dev, page_to_phys(pages[i]));
> > +             dma_sync_single_for_device(dev, dma_handle, PAGE_SIZE,
> > +                                        DMA_TO_DEVICE);
>
> Erm no. These functions here are super-low-level functions used by drivers
> which know exactly what they're doing. Which is reimplementing half of the
> dma api behind the dma api's back because the dma api just isn't quite
> sufficient for implementing fast gpu drivers.
>
> If all you end up doing is calling the dma api again, then pls just call
> it directly.
>
> And just to make it clear: I'd be perfectly fine with adding arm support
> here, but then it'd need to directly flush cpu caches and bypass the dma
> api. Otherwise this is pointless.
> -Daniel
>
> > +     }
> >  #else
> >       pr_err("Architecture has no drm_cache.c support\n");
> >       WARN_ON_ONCE(1);
> > @@ -136,6 +148,11 @@ drm_flush_sg(struct device *dev, struct sg_table
> *st)
> >
> >       if (wbinvd_on_all_cpus())
> >               pr_err("Timed out waiting for cache flush\n");
> > +#elif defined(CONFIG_ARM) || defined(CONFIG_ARM64)
> > +     if (!dev)
> > +             return;
> > +
> > +     dma_sync_sg_for_device(dev, st->sgl, st->nents, DMA_TO_DEVICE);
> >  #else
> >       pr_err("Architecture has no drm_cache.c support\n");
> >       WARN_ON_ONCE(1);
> > diff --git a/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
> b/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
> > index 8ac7eb25e46d..0157f90b5d10 100644
> > --- a/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
> > +++ b/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
> > @@ -14,6 +14,7 @@
> >
> >  #include <drm/drm.h>
> >  #include <drm/drmP.h>
> > +#include <drm/drm_cache.h>
> >  #include <drm/drm_gem.h>
> >  #include <drm/drm_vma_manager.h>
> >  #include <linux/iommu.h>
> > @@ -99,15 +100,11 @@ static int rockchip_gem_get_pages(struct
> rockchip_gem_object *rk_obj)
> >       /*
> >        * Fake up the SG table so that dma_sync_sg_for_device() can be
> used
> >        * to flush the pages associated with it.
> > -      *
> > -      * TODO: Replace this by drm_flush_sg() once it can be implemented
> > -      * without relying on symbols that are not exported.
> >        */
> >       for_each_sg(rk_obj->sgt->sgl, s, rk_obj->sgt->nents, i)
> >               sg_dma_address(s) = sg_phys(s);
> >
> > -     dma_sync_sg_for_device(drm->dev, rk_obj->sgt->sgl,
> rk_obj->sgt->nents,
> > -                            DMA_TO_DEVICE);
> > +     drm_flush_sg(drm->dev, rk_obj->sgt);
> >
> >       return 0;
> >
> > diff --git a/drivers/gpu/drm/tegra/gem.c b/drivers/gpu/drm/tegra/gem.c
> > index ab1e53d434e8..9945fd2f6bd6 100644
> > --- a/drivers/gpu/drm/tegra/gem.c
> > +++ b/drivers/gpu/drm/tegra/gem.c
> > @@ -230,15 +230,11 @@ static int tegra_bo_get_pages(struct drm_device
> *drm, struct tegra_bo *bo)
> >       /*
> >        * Fake up the SG table so that dma_sync_sg_for_device() can be
> used
> >        * to flush the pages associated with it.
> > -      *
> > -      * TODO: Replace this by drm_clflash_sg() once it can be
> implemented
> > -      * without relying on symbols that are not exported.
> >        */
> >       for_each_sg(bo->sgt->sgl, s, bo->sgt->nents, i)
> >               sg_dma_address(s) = sg_phys(s);
> >
> > -     dma_sync_sg_for_device(drm->dev, bo->sgt->sgl, bo->sgt->nents,
> > -                            DMA_TO_DEVICE);
> > +     drm_flush_sg(drm->dev, bo->sgt);
> >
> >       return 0;
> >
> > --
> > 2.13.5
> >
> > _______________________________________________
> > dri-devel mailing list
> > dri-devel@lists.freedesktop.org
> > https://lists.freedesktop.org/mailman/listinfo/dri-devel
>
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch
>


* Re: [PATCH 4/4] drm/vgem: flush page during page fault
  2018-01-17  8:39       ` Daniel Vetter
@ 2018-01-17 22:49         ` Gurchetan Singh
       [not found]           ` <CAAfnVBkxhEec1U8Ck4UyMXjwKvSFJs-bpip5K-7sbB51TNK0bA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 19+ messages in thread
From: Gurchetan Singh @ 2018-01-17 22:49 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: linux-tegra, daniel.vetter, thierry.reding, ML dri-devel



On Wed, Jan 17, 2018 at 12:39 AM, Daniel Vetter <daniel@ffwll.ch> wrote:

> On Tue, Jan 16, 2018 at 04:35:59PM -0800, Gurchetan Singh wrote:
> > This is required to use buffers allocated by vgem on AMD and ARM devices.
> > We're experiencing a case where eviction of the cache races with
> userspace
> > writes. To fix this, flush the cache after retrieving a page.
> >
> > Signed-off-by: Gurchetan Singh <gurchetansingh@chromium.org>
> > ---
> >  drivers/gpu/drm/vgem/vgem_drv.c | 1 +
> >  1 file changed, 1 insertion(+)
> >
> > diff --git a/drivers/gpu/drm/vgem/vgem_drv.c
> b/drivers/gpu/drm/vgem/vgem_drv.c
> > index 35bfdfb746a7..fb263969f02d 100644
> > --- a/drivers/gpu/drm/vgem/vgem_drv.c
> > +++ b/drivers/gpu/drm/vgem/vgem_drv.c
> > @@ -112,6 +112,7 @@ static int vgem_gem_fault(struct vm_fault *vmf)
> >                               break;
> >               }
> >
> > +             drm_flush_pages(obj->base.dev->dev, &page, 1);
>
> Uh ... what exactly are you doing?
>
> Asking because the entire "who's responsible for coherency" story is
> entirely undefined still when doing buffer sharing :-/ What is clear is
> that currently vgem entirely ignores this (there's not
> begin/end_cpu_access callback), mostly because the shared dma-buf support
> in drm_prime.c also entirely ignores this.



This patch isn't trying to address the case of a dma-buf imported into
vgem.  It's trying to address the case when a buffer is created by
vgem_gem_dumb_create, mapped by vgem_gem_dumb_map and then accessed by user
space.  Since the page retrieved by shmem_read_mapping_page during the page
fault may still be in the cache, we're experiencing incorrect data in
buffer.  Here's the test case we're running:

https://chromium.googlesource.com/chromiumos/platform/drm-tests/+/master/vgem_test.c

It fails on line 210 on AMD and ARM devices (not Intel though).
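
For illustration, the failing sequence is roughly the following (a
reconstruction, not the actual vgem_test.c):

#include <stdint.h>
#include <string.h>
#include <sys/mman.h>
#include <xf86drm.h>

/*
 * Rough reconstruction of the scenario: allocate a dumb buffer from vgem,
 * fault the pages in through the dumb mmap path and write from the CPU.
 * When the buffer is later handed to another device, that device can read
 * stale cache lines on non-coherent systems unless something flushes.
 */
static void *map_and_fill_vgem_bo(int fd, uint32_t width, uint32_t height)
{
	struct drm_mode_create_dumb create = {
		.width = width, .height = height, .bpp = 32,
	};
	struct drm_mode_map_dumb map = { 0 };
	void *ptr;

	if (drmIoctl(fd, DRM_IOCTL_MODE_CREATE_DUMB, &create))
		return NULL;

	map.handle = create.handle;
	if (drmIoctl(fd, DRM_IOCTL_MODE_MAP_DUMB, &map))
		return NULL;

	ptr = mmap(NULL, create.size, PROT_READ | PROT_WRITE, MAP_SHARED,
		   fd, map.offset);
	if (ptr == MAP_FAILED)
		return NULL;

	memset(ptr, 0xaa, create.size);	/* CPU write; may linger in cache */
	return ptr;
}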

> And doing a one-time only
> flushing in your fault handler is definitely not going to fix this (at
> least not if you do anything else than one-shot uploads).
>

There used to be a vgem_gem_get_pages function, but that's been
removed.  I don't know where else to flush in this situation.


> -Daniel
>
> >       }
> >       return ret;
> >  }
> > --
> > 2.13.5
> >
> > _______________________________________________
> > dri-devel mailing list
> > dri-devel@lists.freedesktop.org
> > https://lists.freedesktop.org/mailman/listinfo/dri-devel
>
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch
>


* Re: [PATCH 4/4] drm/vgem: flush page during page fault
       [not found]           ` <CAAfnVBkxhEec1U8Ck4UyMXjwKvSFJs-bpip5K-7sbB51TNK0bA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2018-01-18  7:38             ` Daniel Vetter
  2018-01-18 17:23               ` Gurchetan Singh
  0 siblings, 1 reply; 19+ messages in thread
From: Daniel Vetter @ 2018-01-18  7:38 UTC (permalink / raw)
  To: Gurchetan Singh
  Cc: ML dri-devel, Thierry Reding, linux-tegra-u79uwXL29TY76Z2rM5mHXA,
	Daniel Vetter

On Wed, Jan 17, 2018 at 11:49 PM, Gurchetan Singh
<gurchetansingh-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org> wrote:
>
> On Wed, Jan 17, 2018 at 12:39 AM, Daniel Vetter <daniel-/w4YWyX8dFk@public.gmane.org> wrote:
>>
>> On Tue, Jan 16, 2018 at 04:35:59PM -0800, Gurchetan Singh wrote:
>> > This is required to use buffers allocated by vgem on AMD and ARM
>> > devices.
>> > We're experiencing a case where eviction of the cache races with
>> > userspace
>> > writes. To fix this, flush the cache after retrieving a page.
>> >
>> > Signed-off-by: Gurchetan Singh <gurchetansingh-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org>
>> > ---
>> >  drivers/gpu/drm/vgem/vgem_drv.c | 1 +
>> >  1 file changed, 1 insertion(+)
>> >
>> > diff --git a/drivers/gpu/drm/vgem/vgem_drv.c
>> > b/drivers/gpu/drm/vgem/vgem_drv.c
>> > index 35bfdfb746a7..fb263969f02d 100644
>> > --- a/drivers/gpu/drm/vgem/vgem_drv.c
>> > +++ b/drivers/gpu/drm/vgem/vgem_drv.c
>> > @@ -112,6 +112,7 @@ static int vgem_gem_fault(struct vm_fault *vmf)
>> >                               break;
>> >               }
>> >
>> > +             drm_flush_pages(obj->base.dev->dev, &page, 1);
>>
>> Uh ... what exactly are you doing?
>>
>> Asking because the entire "who's responsible for coherency" story is
>> entirely undefined still when doing buffer sharing :-/ What is clear is
>> that currently vgem entirely ignores this (there's not
>> begin/end_cpu_access callback), mostly because the shared dma-buf support
>> in drm_prime.c also entirely ignores this.
>
>
>
> This patch isn't trying to address the case of a dma-buf imported into vgem.
> It's trying to address the case when a buffer is created by
> vgem_gem_dumb_create, mapped by vgem_gem_dumb_map and then accessed by user
> space.  Since the page retrieved by shmem_read_mapping_page during the page
> fault may still be in the cache, we're experiencing incorrect data in
> buffer.  Here's the test case we're running:
>
> https://chromium.googlesource.com/chromiumos/platform/drm-tests/+/master/vgem_test.c

404s over here (Internal url?).

> It fails on line 210 on AMD and ARM devices (not Intel though).

So you _do_ import it on the other device driver as a dma-buf (and
export it from vgem)? Because coherency isn't well-defined for dma-buf
no matter who the exporter/importer is.

>> And doing a one-time only
>> flushing in your fault handler is definitely not going to fix this (at
>> least not if you do anything else than one-shot uploads).
>
>
> There used to be a vgem_gem_get_pages function, but that's been removed.
> I don't know where else to flush in this situation.

dma_buf begin/end cpu access. Even exposed as an ioctl when you're
using the dma-buf mmap stuff. Vgem doesn't have that, which means
as-is the dumb mmap support of vgem can't really support this if you
want to do explicit flushing.
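
In userspace, that explicit-flush model looks roughly like the sketch
below when an exporter does implement begin/end_cpu_access (illustration
only; as noted, vgem has no such hooks today):

#include <string.h>
#include <sys/ioctl.h>
#include <linux/dma-buf.h>

/*
 * Sketch: bracket CPU writes to an mmap'ed dma-buf with the sync ioctl so
 * the exporter's begin/end_cpu_access hooks can do cache maintenance.
 */
static int cpu_write_dmabuf(int dmabuf_fd, void *map, size_t len)
{
	struct dma_buf_sync sync = {
		.flags = DMA_BUF_SYNC_START | DMA_BUF_SYNC_WRITE,
	};

	if (ioctl(dmabuf_fd, DMA_BUF_IOCTL_SYNC, &sync))
		return -1;

	memset(map, 0, len);	/* CPU access happens between the two ioctls */

	sync.flags = DMA_BUF_SYNC_END | DMA_BUF_SYNC_WRITE;
	return ioctl(dmabuf_fd, DMA_BUF_IOCTL_SYNC, &sync);
}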

What would work is uncached/writecombine/coherent dma memory. But then
we're back in the middle of the entire "who's responsible for coherency"
question.
-Daniel

>
>>
>> -Daniel
>>
>> >       }
>> >       return ret;
>> >  }
>> > --
>> > 2.13.5
>> >
>> > _______________________________________________
>> > dri-devel mailing list
>> > dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>> > https://lists.freedesktop.org/mailman/listinfo/dri-devel
>>
>> --
>> Daniel Vetter
>> Software Engineer, Intel Corporation
>> http://blog.ffwll.ch
>
>



-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch


* Re: [PATCH 3/4] drm: add ARM flush implementations
       [not found]           ` <CAAfnVBm4Mp8vC4aBmrP2rJeRSBN_AFN5gaZfijg9BCJEBupDzw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2018-01-18  7:42             ` Daniel Vetter
  2018-01-18 17:20               ` Gurchetan Singh
       [not found]               ` <CAKMK7uHoAqYsgqOTT4O=NQFvj51M2tKp+c2Sm8t4t01QN9Jkfg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 2 replies; 19+ messages in thread
From: Daniel Vetter @ 2018-01-18  7:42 UTC (permalink / raw)
  To: Gurchetan Singh
  Cc: ML dri-devel, Thierry Reding, linux-tegra-u79uwXL29TY76Z2rM5mHXA,
	Daniel Vetter, Laurent Pinchart

On Wed, Jan 17, 2018 at 11:46 PM, Gurchetan Singh
<gurchetansingh-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org> wrote:
>> dma api just isn't quite sufficient for implementing fast gpu drivers.
>
>
> Can you elaborate?  IIRC the DMA API has strong synchronization guarantees
> and that can be problematic for GPU drivers.  However, using the DMA API for
> flushing doesn't necessarily mean the driver has to use the rest of the DMA
> API.
>
>> but then it'd need to directly flush cpu caches and bypass the dma api.
>
>
> On ARM, cache flushing seems to vary from chipset to chipset.  For example,
> on ARM32 a typical call-stack for dma_sync_single_for_device looks like:
>
> arm_dma_sync_single_for_device
> __dma_page_cpu_to_dev
> outer_clean_range
> outer_cache.clean_range
>
> There are multiple clean_range implementations out there (e.g.
> aurora_clean_range, l2c210_clean_range, feroceon_l2_clean_range), so that's
> why the DMA API was used in this case.  On ARM64, things are a little
> simpler, but the DMA API seems to go directly to assembly (__dma_map_area)
> after a little indirection.  Why do you think that's inefficient?

I never said it's inefficient. My only gripe is with adding the
pointless struct device * argument to flushing functions which really
don't care (nor should care) about the device. Because what we really
want to call is outer_clean_range here, and that doesn't need a struct
device *. Imo that function (or something similar) needs to be
exported and then used by drm_flush_* functions.
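
On arm32 that would look something like the sketch below, assuming the
existing arch helpers (dmac_flush_range()/outer_flush_range()) were
exported for this use, which today they are not -- presumably the
"symbols that are not exported" problem the removed TODOs refer to:

#include <linux/mm.h>
#include <asm/cacheflush.h>
#include <asm/outercache.h>

/*
 * Sketch of a struct-device-free flush for lowmem pages (so that
 * page_address() is valid); highmem pages would need kmap handling.
 */
static void example_arm_flush_pages(struct page *pages[],
				    unsigned long num_pages)
{
	unsigned long i;

	for (i = 0; i < num_pages; i++) {
		void *virt = page_address(pages[i]);
		phys_addr_t phys = page_to_phys(pages[i]);

		/* Clean+invalidate the inner (CPU) cache by virtual address... */
		dmac_flush_range(virt, virt + PAGE_SIZE);
		/* ...and the outer cache (e.g. a PL310), by physical address. */
		outer_flush_range(phys, phys + PAGE_SIZE);
	}
}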

Also note that arm has both flush and invalidate functions, but on x86
those are the same implementation (and we don't have a separate
drm_invalidate_* set of functions). That doesn't look like too good an
idea.

Of course that doesn't solve the problem of who's supposed to call it
and when in the dma-buf sharing situation.
-Daniel

> On Wed, Jan 17, 2018 at 12:31 AM, Daniel Vetter <daniel-/w4YWyX8dFk@public.gmane.org> wrote:
>>
>> On Tue, Jan 16, 2018 at 04:35:58PM -0800, Gurchetan Singh wrote:
>> > The DMA API can be used to flush scatter gather tables and physical
>> > pages on ARM devices.
>> >
>> > Signed-off-by: Gurchetan Singh <gurchetansingh-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org>
>> > ---
>> >  drivers/gpu/drm/drm_cache.c                 | 17 +++++++++++++++++
>> >  drivers/gpu/drm/rockchip/rockchip_drm_gem.c |  7 ++-----
>> >  drivers/gpu/drm/tegra/gem.c                 |  6 +-----
>> >  3 files changed, 20 insertions(+), 10 deletions(-)
>> >
>> > diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c
>> > index 3d2bb9d71a60..98d6ebb40e96 100644
>> > --- a/drivers/gpu/drm/drm_cache.c
>> > +++ b/drivers/gpu/drm/drm_cache.c
>> > @@ -105,6 +105,18 @@ drm_flush_pages(struct device *dev, struct page
>> > *pages[],
>> >                                  (unsigned long)page_virtual +
>> > PAGE_SIZE);
>> >               kunmap_atomic(page_virtual);
>> >       }
>> > +#elif defined(CONFIG_ARM) || defined(CONFIG_ARM64)
>> > +     unsigned long i;
>> > +     dma_addr_t dma_handle;
>> > +
>> > +     if (!dev)
>> > +             return;
>> > +
>> > +     for (i = 0; i < num_pages; i++) {
>> > +             dma_handle = phys_to_dma(drm->dev,
>> > page_to_phys(pages[i]));
>> > +             dma_sync_single_for_device(dev, dma_handle, PAGE_SIZE,
>> > +                                        DMA_TO_DEVICE);
>>
>> Erm no. These functions here are super-low-level functions used by drivers
>> which know exactly what they're doing. Which is reimplementing half of the
>> dma api behind the dma api's back because the dma api just isn't quite
>> sufficient for implementing fast gpu drivers.
>>
>> If all you end up doing is calling the dma api again, then pls just call
>> it directly.
>>
>> And just to make it clear: I'd be perfectly fine with adding arm support
>> here, but then it'd need to directly flush cpu caches and bypass the dma
>> api. Otherwise this is pointless.
>> -Daniel
>>
>> > +     }
>> >  #else
>> >       pr_err("Architecture has no drm_cache.c support\n");
>> >       WARN_ON_ONCE(1);
>> > @@ -136,6 +148,11 @@ drm_flush_sg(struct device *dev, struct sg_table
>> > *st)
>> >
>> >       if (wbinvd_on_all_cpus())
>> >               pr_err("Timed out waiting for cache flush\n");
>> > +#elif defined(CONFIG_ARM) || defined(CONFIG_ARM64)
>> > +     if (!dev)
>> > +             return;
>> > +
>> > +     dma_sync_sg_for_device(dev, st->sgl, st->nents, DMA_TO_DEVICE);
>> >  #else
>> >       pr_err("Architecture has no drm_cache.c support\n");
>> >       WARN_ON_ONCE(1);
>> > diff --git a/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
>> > b/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
>> > index 8ac7eb25e46d..0157f90b5d10 100644
>> > --- a/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
>> > +++ b/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
>> > @@ -14,6 +14,7 @@
>> >
>> >  #include <drm/drm.h>
>> >  #include <drm/drmP.h>
>> > +#include <drm/drm_cache.h>
>> >  #include <drm/drm_gem.h>
>> >  #include <drm/drm_vma_manager.h>
>> >  #include <linux/iommu.h>
>> > @@ -99,15 +100,11 @@ static int rockchip_gem_get_pages(struct
>> > rockchip_gem_object *rk_obj)
>> >       /*
>> >        * Fake up the SG table so that dma_sync_sg_for_device() can be
>> > used
>> >        * to flush the pages associated with it.
>> > -      *
>> > -      * TODO: Replace this by drm_flush_sg() once it can be implemented
>> > -      * without relying on symbols that are not exported.
>> >        */
>> >       for_each_sg(rk_obj->sgt->sgl, s, rk_obj->sgt->nents, i)
>> >               sg_dma_address(s) = sg_phys(s);
>> >
>> > -     dma_sync_sg_for_device(drm->dev, rk_obj->sgt->sgl,
>> > rk_obj->sgt->nents,
>> > -                            DMA_TO_DEVICE);
>> > +     drm_flush_sg(drm->dev, rk_obj->sgt);
>> >
>> >       return 0;
>> >
>> > diff --git a/drivers/gpu/drm/tegra/gem.c b/drivers/gpu/drm/tegra/gem.c
>> > index ab1e53d434e8..9945fd2f6bd6 100644
>> > --- a/drivers/gpu/drm/tegra/gem.c
>> > +++ b/drivers/gpu/drm/tegra/gem.c
>> > @@ -230,15 +230,11 @@ static int tegra_bo_get_pages(struct drm_device
>> > *drm, struct tegra_bo *bo)
>> >       /*
>> >        * Fake up the SG table so that dma_sync_sg_for_device() can be
>> > used
>> >        * to flush the pages associated with it.
>> > -      *
>> > -      * TODO: Replace this by drm_clflash_sg() once it can be
>> > implemented
>> > -      * without relying on symbols that are not exported.
>> >        */
>> >       for_each_sg(bo->sgt->sgl, s, bo->sgt->nents, i)
>> >               sg_dma_address(s) = sg_phys(s);
>> >
>> > -     dma_sync_sg_for_device(drm->dev, bo->sgt->sgl, bo->sgt->nents,
>> > -                            DMA_TO_DEVICE);
>> > +     drm_flush_sg(drm->dev, bo->sgt);
>> >
>> >       return 0;
>> >
>> > --
>> > 2.13.5
>> >
>> > _______________________________________________
>> > dri-devel mailing list
>> > dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>> > https://lists.freedesktop.org/mailman/listinfo/dri-devel
>>
>> --
>> Daniel Vetter
>> Software Engineer, Intel Corporation
>> http://blog.ffwll.ch
>
>



-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch


* Re: [PATCH 3/4] drm: add ARM flush implementations
  2018-01-18  7:42             ` Daniel Vetter
@ 2018-01-18 17:20               ` Gurchetan Singh
  2018-01-30  9:09                 ` Daniel Vetter
       [not found]               ` <CAKMK7uHoAqYsgqOTT4O=NQFvj51M2tKp+c2Sm8t4t01QN9Jkfg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  1 sibling, 1 reply; 19+ messages in thread
From: Gurchetan Singh @ 2018-01-18 17:20 UTC (permalink / raw)
  To: Daniel Vetter
  Cc: linux-tegra, Daniel Vetter, Thierry Reding, Laurent Pinchart,
	ML dri-devel


On Wed, Jan 17, 2018 at 11:42 PM, Daniel Vetter <daniel@ffwll.ch> wrote:

> On Wed, Jan 17, 2018 at 11:46 PM, Gurchetan Singh
> <gurchetansingh@chromium.org> wrote:
> >> dma api just isn't quite sufficient for implementing fast gpu drivers.
> >
> >
> > Can you elaborate?  IIRC the DMA API has strong synchronization
> guarantees
> > and that can be problematic for GPU drivers.  However, using the DMA API
> for
> > flushing doesn't necessarily mean the driver has to use the rest of the
> DMA
> > API.
> >
> >> but then it'd need to directly flush cpu caches and bypass the dma api.
> >
> >
> > On ARM, cache flushing seems to vary from chipset to chipset.  For
> example,
> > on ARM32 a typical call-stack for dma_sync_single_for_device looks like:
> >
> > arm_dma_sync_single_for_device
> > __dma_page_cpu_to_dev
> > outer_clean_range
> > outer_cache.clean_range
> >
> > There are multiple clean_range implementations out there (i.e,
> > aurora_clean_range, l2c210_clean_range, feroceon_l2_clean_range), so
> that's
> > why the DMA API was used in this case.  On ARM64, things are a little
> > simpler, but the DMA API seems to go directly to assembly
> (__dma_map_area)
> > after a little indirection.  Why do you think that's inefficient?
>
> I never said it's inefficient. My only gripe is with adding the
> pointless struct device * argument to flushing functions which really
> don't care (nor should care) about the device. Because what we really
> want to call is outer_clean_range here, and that doesn't need a struct
> device *. Imo that function (or something similar) needs to be
> exported and then used by drm_flush_* functions.
>

Okay, we can go with outer_clean_range on ARM and
__dma_map_area/__dma_flush_area on ARM64.  One concern: on ARM64 at
least, the header (arm64/include/asm/cacheflush.h) indicates we
shouldn't call these functions directly, for whatever reason:

"Cache maintenance functions used by the DMA API. No to be used directly."

However, if no one objects, that's the route we'll go.
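
For the ARM64 side, the kind of helper under discussion might look
roughly like this (illustration only, assuming __dma_flush_area() --
normally reserved for the DMA API -- were exported with its current
two-argument signature; the function name is made up):

#include <linux/scatterlist.h>
#include <asm/cacheflush.h>

/*
 * Hypothetical ARM64 flush for an sg_table, assuming __dma_flush_area()
 * were exported: clean+invalidate each contiguous chunk by virtual
 * address.
 */
static void drm_arm64_flush_sg(struct sg_table *st)
{
	struct scatterlist *s;
	int i;

	for_each_sg(st->sgl, s, st->nents, i)
		__dma_flush_area(sg_virt(s), s->length);
}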

>
> Also note that arm has both flush and invalidate functions, but on x86
> those are the same implementation (and we don't have a separate
> drm_invalidate_* set of functions). That doesn't look like a too good
> idea.
>
> Of course that doesn't solve the problem of who's supposed to call it
> and when in the dma-buf sharing situation.
> -Daniel
>
> > On Wed, Jan 17, 2018 at 12:31 AM, Daniel Vetter <daniel@ffwll.ch> wrote:
> >>
> >> On Tue, Jan 16, 2018 at 04:35:58PM -0800, Gurchetan Singh wrote:
> >> > The DMA API can be used to flush scatter gather tables and physical
> >> > pages on ARM devices.
> >> >
> >> > Signed-off-by: Gurchetan Singh <gurchetansingh@chromium.org>
> >> > ---
> >> >  drivers/gpu/drm/drm_cache.c                 | 17 +++++++++++++++++
> >> >  drivers/gpu/drm/rockchip/rockchip_drm_gem.c |  7 ++-----
> >> >  drivers/gpu/drm/tegra/gem.c                 |  6 +-----
> >> >  3 files changed, 20 insertions(+), 10 deletions(-)
> >> >
> >> > diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c
> >> > index 3d2bb9d71a60..98d6ebb40e96 100644
> >> > --- a/drivers/gpu/drm/drm_cache.c
> >> > +++ b/drivers/gpu/drm/drm_cache.c
> >> > @@ -105,6 +105,18 @@ drm_flush_pages(struct device *dev, struct page
> >> > *pages[],
> >> >                                  (unsigned long)page_virtual +
> >> > PAGE_SIZE);
> >> >               kunmap_atomic(page_virtual);
> >> >       }
> >> > +#elif defined(CONFIG_ARM) || defined(CONFIG_ARM64)
> >> > +     unsigned long i;
> >> > +     dma_addr_t dma_handle;
> >> > +
> >> > +     if (!dev)
> >> > +             return;
> >> > +
> >> > +     for (i = 0; i < num_pages; i++) {
> >> > +             dma_handle = phys_to_dma(drm->dev,
> >> > page_to_phys(pages[i]));
> >> > +             dma_sync_single_for_device(dev, dma_handle, PAGE_SIZE,
> >> > +                                        DMA_TO_DEVICE);
> >>
> >> Erm no. These functions here are super-low-level functions used by
> drivers
> >> which know exactly what they're doing. Which is reimplementing half of
> the
> >> dma api behind the dma api's back because the dma api just isn't quite
> >> sufficient for implementing fast gpu drivers.
> >>
> >> If all you end up doing is calling the dma api again, then pls just call
> >> it directly.
> >>
> >> And just to make it clear: I'd be perfectly fine with adding arm support
> >> here, but then it'd need to directly flush cpu caches and bypass the dma
> >> api. Otherwise this is pointless.
> >> -Daniel
> >>
> >> > +     }
> >> >  #else
> >> >       pr_err("Architecture has no drm_cache.c support\n");
> >> >       WARN_ON_ONCE(1);
> >> > @@ -136,6 +148,11 @@ drm_flush_sg(struct device *dev, struct sg_table
> >> > *st)
> >> >
> >> >       if (wbinvd_on_all_cpus())
> >> >               pr_err("Timed out waiting for cache flush\n");
> >> > +#elif defined(CONFIG_ARM) || defined(CONFIG_ARM64)
> >> > +     if (!dev)
> >> > +             return;
> >> > +
> >> > +     dma_sync_sg_for_device(dev, st->sgl, st->nents, DMA_TO_DEVICE);
> >> >  #else
> >> >       pr_err("Architecture has no drm_cache.c support\n");
> >> >       WARN_ON_ONCE(1);
> >> > diff --git a/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
> >> > b/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
> >> > index 8ac7eb25e46d..0157f90b5d10 100644
> >> > --- a/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
> >> > +++ b/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
> >> > @@ -14,6 +14,7 @@
> >> >
> >> >  #include <drm/drm.h>
> >> >  #include <drm/drmP.h>
> >> > +#include <drm/drm_cache.h>
> >> >  #include <drm/drm_gem.h>
> >> >  #include <drm/drm_vma_manager.h>
> >> >  #include <linux/iommu.h>
> >> > @@ -99,15 +100,11 @@ static int rockchip_gem_get_pages(struct
> >> > rockchip_gem_object *rk_obj)
> >> >       /*
> >> >        * Fake up the SG table so that dma_sync_sg_for_device() can be
> >> > used
> >> >        * to flush the pages associated with it.
> >> > -      *
> >> > -      * TODO: Replace this by drm_flush_sg() once it can be
> implemented
> >> > -      * without relying on symbols that are not exported.
> >> >        */
> >> >       for_each_sg(rk_obj->sgt->sgl, s, rk_obj->sgt->nents, i)
> >> >               sg_dma_address(s) = sg_phys(s);
> >> >
> >> > -     dma_sync_sg_for_device(drm->dev, rk_obj->sgt->sgl,
> >> > rk_obj->sgt->nents,
> >> > -                            DMA_TO_DEVICE);
> >> > +     drm_flush_sg(drm->dev, rk_obj->sgt);
> >> >
> >> >       return 0;
> >> >
> >> > diff --git a/drivers/gpu/drm/tegra/gem.c b/drivers/gpu/drm/tegra/gem.c
> >> > index ab1e53d434e8..9945fd2f6bd6 100644
> >> > --- a/drivers/gpu/drm/tegra/gem.c
> >> > +++ b/drivers/gpu/drm/tegra/gem.c
> >> > @@ -230,15 +230,11 @@ static int tegra_bo_get_pages(struct drm_device
> >> > *drm, struct tegra_bo *bo)
> >> >       /*
> >> >        * Fake up the SG table so that dma_sync_sg_for_device() can be
> >> > used
> >> >        * to flush the pages associated with it.
> >> > -      *
> >> > -      * TODO: Replace this by drm_clflash_sg() once it can be
> >> > implemented
> >> > -      * without relying on symbols that are not exported.
> >> >        */
> >> >       for_each_sg(bo->sgt->sgl, s, bo->sgt->nents, i)
> >> >               sg_dma_address(s) = sg_phys(s);
> >> >
> >> > -     dma_sync_sg_for_device(drm->dev, bo->sgt->sgl, bo->sgt->nents,
> >> > -                            DMA_TO_DEVICE);
> >> > +     drm_flush_sg(drm->dev, bo->sgt);
> >> >
> >> >       return 0;
> >> >
> >> > --
> >> > 2.13.5
> >> >
> >> > _______________________________________________
> >> > dri-devel mailing list
> >> > dri-devel@lists.freedesktop.org
> >> > https://lists.freedesktop.org/mailman/listinfo/dri-devel
> >>
> >> --
> >> Daniel Vetter
> >> Software Engineer, Intel Corporation
> >> http://blog.ffwll.ch
> >
> >
>
>
>
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> +41 (0) 79 365 57 48 - http://blog.ffwll.ch
>

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


* Re: [PATCH 4/4] drm/vgem: flush page during page fault
  2018-01-18  7:38             ` Daniel Vetter
@ 2018-01-18 17:23               ` Gurchetan Singh
  2018-01-30  9:14                 ` Daniel Vetter
  0 siblings, 1 reply; 19+ messages in thread
From: Gurchetan Singh @ 2018-01-18 17:23 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: linux-tegra, Daniel Vetter, Thierry Reding, ML dri-devel


On Wed, Jan 17, 2018 at 11:38 PM, Daniel Vetter <daniel@ffwll.ch> wrote:

> On Wed, Jan 17, 2018 at 11:49 PM, Gurchetan Singh
> <gurchetansingh@chromium.org> wrote:
> >
> > On Wed, Jan 17, 2018 at 12:39 AM, Daniel Vetter <daniel@ffwll.ch> wrote:
> >>
> >> On Tue, Jan 16, 2018 at 04:35:59PM -0800, Gurchetan Singh wrote:
> >> > This is required to use buffers allocated by vgem on AMD and ARM
> >> > devices.
> >> > We're experiencing a case where eviction of the cache races with
> >> > userspace
> >> > writes. To fix this, flush the cache after retrieving a page.
> >> >
> >> > Signed-off-by: Gurchetan Singh <gurchetansingh@chromium.org>
> >> > ---
> >> >  drivers/gpu/drm/vgem/vgem_drv.c | 1 +
> >> >  1 file changed, 1 insertion(+)
> >> >
> >> > diff --git a/drivers/gpu/drm/vgem/vgem_drv.c
> >> > b/drivers/gpu/drm/vgem/vgem_drv.c
> >> > index 35bfdfb746a7..fb263969f02d 100644
> >> > --- a/drivers/gpu/drm/vgem/vgem_drv.c
> >> > +++ b/drivers/gpu/drm/vgem/vgem_drv.c
> >> > @@ -112,6 +112,7 @@ static int vgem_gem_fault(struct vm_fault *vmf)
> >> >                               break;
> >> >               }
> >> >
> >> > +             drm_flush_pages(obj->base.dev->dev, &page, 1);
> >>
> >> Uh ... what exactly are you doing?
> >>
> >> Asking because the entire "who's responsible for coherency" story is
> >> entirely undefined still when doing buffer sharing :-/ What is clear is
> >> that currently vgem entirely ignores this (there's not
> >> begin/end_cpu_access callback), mostly because the shared dma-buf
> support
> >> in drm_prime.c also entirely ignores this.
> >
> >
> >
> > This patch isn't trying to address the case of a dma-buf imported into
> vgem.
> > It's trying to address the case when a buffer is created by
> > vgem_gem_dumb_create, mapped by vgem_gem_dumb_map and then accessed by
> user
> > space.  Since the page retrieved by shmem_read_mapping_page during the
> page
> > fault may still be in the cache, we're experiencing incorrect data in
> > buffer.  Here's the test case we're running:
> >
> > https://chromium.googlesource.com/chromiumos/platform/drm-te
> sts/+/master/vgem_test.c
>
> 404s over here (Internal url?).


Hmm ... I was able to access that link without being logged in to any
accounts.

> It fails on line 210 on AMD and ARM devices (not Intel though).
>
> So you _do_ import it on the other device driver as a dma-buf (and
> export it from vgem)?


Those portions of the test work fine (if the platform has a drm_clflush
implementation).  vgem_prime_pin calls drm_clflush_pages for the exporting
case.  Typically, ARM drivers flush the cache after drm_gem_get_pages() and
only do WC mappings, so import works.  For Intel, there is some
hardware-level coherency involved.  The problem is vgem doesn't flush the
cache on ARM/AMD when getting pages for the non-export/non-import case
(when faulting after a vgem_gem_dumb_map, not during dma-buf mmap) --
i.e., during regular use of the buffer.

>
> >> And doing a one-time only
> >> flushing in your fault handler is definitely not going to fix this (at
> >> least not if you do anything else than one-shot uploads).
> >
> >
> > There used to be a be vgem_gem_get_pages function, but that's been
> removed.
> > I don't know where else to flush in this situation.
>
> dma_buf begin/end cpu access. Even exposed as an ioctl when you're
> using the dma-buf mmap stuff. Vgem doesn't have that, which means
> as-is the dumb mmap support of vgem can't really support this if you
> want to do explicit flushing.


> What would work is uncached/writecombine/coherent dma memory. But then
> we're in the middle of the entire
> "who's responsible.
> -Daniel
>
> >
> >>
> >> -Daniel
> >>
> >> >       }
> >> >       return ret;
> >> >  }
> >> > --
> >> > 2.13.5
> >> >
> >> > _______________________________________________
> >> > dri-devel mailing list
> >> > dri-devel@lists.freedesktop.org
> >> > https://lists.freedesktop.org/mailman/listinfo/dri-devel
> >>
> >> --
> >> Daniel Vetter
> >> Software Engineer, Intel Corporation
> >> http://blog.ffwll.ch
> >
> >
>
>
>
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> +41 (0) 79 365 57 48 - http://blog.ffwll.ch
>

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


* Re: [PATCH 3/4] drm: add ARM flush implementations
  2018-01-18 17:20               ` Gurchetan Singh
@ 2018-01-30  9:09                 ` Daniel Vetter
  0 siblings, 0 replies; 19+ messages in thread
From: Daniel Vetter @ 2018-01-30  9:09 UTC (permalink / raw)
  To: Gurchetan Singh
  Cc: ML dri-devel, Thierry Reding, Laurent Pinchart, linux-tegra,
	Daniel Vetter

On Thu, Jan 18, 2018 at 09:20:11AM -0800, Gurchetan Singh wrote:
> On Wed, Jan 17, 2018 at 11:42 PM, Daniel Vetter <daniel@ffwll.ch> wrote:
> 
> > On Wed, Jan 17, 2018 at 11:46 PM, Gurchetan Singh
> > <gurchetansingh@chromium.org> wrote:
> > >> dma api just isn't quite sufficient for implementing fast gpu drivers.
> > >
> > >
> > > Can you elaborate?  IIRC the DMA API has strong synchronization
> > guarantees
> > > and that can be problematic for GPU drivers.  However, using the DMA API
> > for
> > > flushing doesn't necessarily mean the driver has to use the rest of the
> > DMA
> > > API.
> > >
> > >> but then it'd need to directly flush cpu caches and bypass the dma api.
> > >
> > >
> > > On ARM, cache flushing seems to vary from chipset to chipset.  For
> > example,
> > > on ARM32 a typical call-stack for dma_sync_single_for_device looks like:
> > >
> > > arm_dma_sync_single_for_device
> > > __dma_page_cpu_to_dev
> > > outer_clean_range
> > > outer_cache.clean_range
> > >
> > > There are multiple clean_range implementations out there (i.e,
> > > aurora_clean_range, l2c210_clean_range, feroceon_l2_clean_range), so
> > that's
> > > why the DMA API was used in this case.  On ARM64, things are a little
> > > simpler, but the DMA API seems to go directly to assembly
> > (__dma_map_area)
> > > after a little indirection.  Why do you think that's inefficient?
> >
> > I never said it's inefficient. My only gripe is with adding the
> > pointless struct device * argument to flushing functions which really
> > don't care (nor should care) about the device. Because what we really
> > want to call is outer_clean_range here, and that doesn't need a struct
> > device *. Imo that function (or something similar) needs to be
> > exported and then used by drm_flush_* functions.
> >
> 
> Okay, we can go with outer_clean_range on ARM and
> __dma_map_area/__dma_flush_area
> on ARM64.  One concern, on ARM64 at-least, the header
> (arm64/include/asm/cacheflush.h) indicates we shouldn't call these
> functions directly for whatever reason:
> 
> "Cache maintenance functions used by the DMA API. No to be used directly."
> 
> However, if no one objects, that's the route we'll go.

Yeah, drm_flush.c is all about not following the dma api and digging a
direct path to the underlying concepts. So totally fine.

Maybe we should improve the kerneldoc a bit as part of your patch series,
to explain this better?
-Daniel

> 
> >
> > Also note that arm has both flush and invalidate functions, but on x86
> > those are the same implementation (and we don't have a separate
> > drm_invalidate_* set of functions). That doesn't look like a too good
> > idea.
> >
> > Of course that doesn't solve the problem of who's supposed to call it
> > and when in the dma-buf sharing situation.
> > -Daniel
> >
> > > On Wed, Jan 17, 2018 at 12:31 AM, Daniel Vetter <daniel@ffwll.ch> wrote:
> > >>
> > >> On Tue, Jan 16, 2018 at 04:35:58PM -0800, Gurchetan Singh wrote:
> > >> > The DMA API can be used to flush scatter gather tables and physical
> > >> > pages on ARM devices.
> > >> >
> > >> > Signed-off-by: Gurchetan Singh <gurchetansingh@chromium.org>
> > >> > ---
> > >> >  drivers/gpu/drm/drm_cache.c                 | 17 +++++++++++++++++
> > >> >  drivers/gpu/drm/rockchip/rockchip_drm_gem.c |  7 ++-----
> > >> >  drivers/gpu/drm/tegra/gem.c                 |  6 +-----
> > >> >  3 files changed, 20 insertions(+), 10 deletions(-)
> > >> >
> > >> > diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c
> > >> > index 3d2bb9d71a60..98d6ebb40e96 100644
> > >> > --- a/drivers/gpu/drm/drm_cache.c
> > >> > +++ b/drivers/gpu/drm/drm_cache.c
> > >> > @@ -105,6 +105,18 @@ drm_flush_pages(struct device *dev, struct page
> > >> > *pages[],
> > >> >                                  (unsigned long)page_virtual +
> > >> > PAGE_SIZE);
> > >> >               kunmap_atomic(page_virtual);
> > >> >       }
> > >> > +#elif defined(CONFIG_ARM) || defined(CONFIG_ARM64)
> > >> > +     unsigned long i;
> > >> > +     dma_addr_t dma_handle;
> > >> > +
> > >> > +     if (!dev)
> > >> > +             return;
> > >> > +
> > >> > +     for (i = 0; i < num_pages; i++) {
> > >> > +             dma_handle = phys_to_dma(drm->dev,
> > >> > page_to_phys(pages[i]));
> > >> > +             dma_sync_single_for_device(dev, dma_handle, PAGE_SIZE,
> > >> > +                                        DMA_TO_DEVICE);
> > >>
> > >> Erm no. These functions here are super-low-level functions used by
> > drivers
> > >> which know exactly what they're doing. Which is reimplementing half of
> > the
> > >> dma api behind the dma api's back because the dma api just isn't quite
> > >> sufficient for implementing fast gpu drivers.
> > >>
> > >> If all you end up doing is calling the dma api again, then pls just call
> > >> it directly.
> > >>
> > >> And just to make it clear: I'd be perfectly fine with adding arm support
> > >> here, but then it'd need to directly flush cpu caches and bypass the dma
> > >> api. Otherwise this is pointless.
> > >> -Daniel
> > >>
> > >> > +     }
> > >> >  #else
> > >> >       pr_err("Architecture has no drm_cache.c support\n");
> > >> >       WARN_ON_ONCE(1);
> > >> > @@ -136,6 +148,11 @@ drm_flush_sg(struct device *dev, struct sg_table
> > >> > *st)
> > >> >
> > >> >       if (wbinvd_on_all_cpus())
> > >> >               pr_err("Timed out waiting for cache flush\n");
> > >> > +#elif defined(CONFIG_ARM) || defined(CONFIG_ARM64)
> > >> > +     if (!dev)
> > >> > +             return;
> > >> > +
> > >> > +     dma_sync_sg_for_device(dev, st->sgl, st->nents, DMA_TO_DEVICE);
> > >> >  #else
> > >> >       pr_err("Architecture has no drm_cache.c support\n");
> > >> >       WARN_ON_ONCE(1);
> > >> > diff --git a/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
> > >> > b/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
> > >> > index 8ac7eb25e46d..0157f90b5d10 100644
> > >> > --- a/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
> > >> > +++ b/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
> > >> > @@ -14,6 +14,7 @@
> > >> >
> > >> >  #include <drm/drm.h>
> > >> >  #include <drm/drmP.h>
> > >> > +#include <drm/drm_cache.h>
> > >> >  #include <drm/drm_gem.h>
> > >> >  #include <drm/drm_vma_manager.h>
> > >> >  #include <linux/iommu.h>
> > >> > @@ -99,15 +100,11 @@ static int rockchip_gem_get_pages(struct
> > >> > rockchip_gem_object *rk_obj)
> > >> >       /*
> > >> >        * Fake up the SG table so that dma_sync_sg_for_device() can be
> > >> > used
> > >> >        * to flush the pages associated with it.
> > >> > -      *
> > >> > -      * TODO: Replace this by drm_flush_sg() once it can be
> > implemented
> > >> > -      * without relying on symbols that are not exported.
> > >> >        */
> > >> >       for_each_sg(rk_obj->sgt->sgl, s, rk_obj->sgt->nents, i)
> > >> >               sg_dma_address(s) = sg_phys(s);
> > >> >
> > >> > -     dma_sync_sg_for_device(drm->dev, rk_obj->sgt->sgl,
> > >> > rk_obj->sgt->nents,
> > >> > -                            DMA_TO_DEVICE);
> > >> > +     drm_flush_sg(drm->dev, rk_obj->sgt);
> > >> >
> > >> >       return 0;
> > >> >
> > >> > diff --git a/drivers/gpu/drm/tegra/gem.c b/drivers/gpu/drm/tegra/gem.c
> > >> > index ab1e53d434e8..9945fd2f6bd6 100644
> > >> > --- a/drivers/gpu/drm/tegra/gem.c
> > >> > +++ b/drivers/gpu/drm/tegra/gem.c
> > >> > @@ -230,15 +230,11 @@ static int tegra_bo_get_pages(struct drm_device
> > >> > *drm, struct tegra_bo *bo)
> > >> >       /*
> > >> >        * Fake up the SG table so that dma_sync_sg_for_device() can be
> > >> > used
> > >> >        * to flush the pages associated with it.
> > >> > -      *
> > >> > -      * TODO: Replace this by drm_clflash_sg() once it can be
> > >> > implemented
> > >> > -      * without relying on symbols that are not exported.
> > >> >        */
> > >> >       for_each_sg(bo->sgt->sgl, s, bo->sgt->nents, i)
> > >> >               sg_dma_address(s) = sg_phys(s);
> > >> >
> > >> > -     dma_sync_sg_for_device(drm->dev, bo->sgt->sgl, bo->sgt->nents,
> > >> > -                            DMA_TO_DEVICE);
> > >> > +     drm_flush_sg(drm->dev, bo->sgt);
> > >> >
> > >> >       return 0;
> > >> >
> > >> > --
> > >> > 2.13.5
> > >> >
> > >> > _______________________________________________
> > >> > dri-devel mailing list
> > >> > dri-devel@lists.freedesktop.org
> > >> > https://lists.freedesktop.org/mailman/listinfo/dri-devel
> > >>
> > >> --
> > >> Daniel Vetter
> > >> Software Engineer, Intel Corporation
> > >> http://blog.ffwll.ch
> > >
> > >
> >
> >
> >
> > --
> > Daniel Vetter
> > Software Engineer, Intel Corporation
> > +41 (0) 79 365 57 48 - http://blog.ffwll.ch
> >

-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


* Re: [PATCH 4/4] drm/vgem: flush page during page fault
  2018-01-18 17:23               ` Gurchetan Singh
@ 2018-01-30  9:14                 ` Daniel Vetter
  2018-01-30 20:01                   ` Gurchetan Singh
  0 siblings, 1 reply; 19+ messages in thread
From: Daniel Vetter @ 2018-01-30  9:14 UTC (permalink / raw)
  To: Gurchetan Singh; +Cc: linux-tegra, Daniel Vetter, Thierry Reding, ML dri-devel

On Thu, Jan 18, 2018 at 09:23:31AM -0800, Gurchetan Singh wrote:
> On Wed, Jan 17, 2018 at 11:38 PM, Daniel Vetter <daniel@ffwll.ch> wrote:
> 
> > On Wed, Jan 17, 2018 at 11:49 PM, Gurchetan Singh
> > <gurchetansingh@chromium.org> wrote:
> > >
> > > On Wed, Jan 17, 2018 at 12:39 AM, Daniel Vetter <daniel@ffwll.ch> wrote:
> > >>
> > >> On Tue, Jan 16, 2018 at 04:35:59PM -0800, Gurchetan Singh wrote:
> > >> > This is required to use buffers allocated by vgem on AMD and ARM
> > >> > devices.
> > >> > We're experiencing a case where eviction of the cache races with
> > >> > userspace
> > >> > writes. To fix this, flush the cache after retrieving a page.
> > >> >
> > >> > Signed-off-by: Gurchetan Singh <gurchetansingh@chromium.org>
> > >> > ---
> > >> >  drivers/gpu/drm/vgem/vgem_drv.c | 1 +
> > >> >  1 file changed, 1 insertion(+)
> > >> >
> > >> > diff --git a/drivers/gpu/drm/vgem/vgem_drv.c
> > >> > b/drivers/gpu/drm/vgem/vgem_drv.c
> > >> > index 35bfdfb746a7..fb263969f02d 100644
> > >> > --- a/drivers/gpu/drm/vgem/vgem_drv.c
> > >> > +++ b/drivers/gpu/drm/vgem/vgem_drv.c
> > >> > @@ -112,6 +112,7 @@ static int vgem_gem_fault(struct vm_fault *vmf)
> > >> >                               break;
> > >> >               }
> > >> >
> > >> > +             drm_flush_pages(obj->base.dev->dev, &page, 1);
> > >>
> > >> Uh ... what exactly are you doing?
> > >>
> > >> Asking because the entire "who's responsible for coherency" story is
> > >> entirely undefined still when doing buffer sharing :-/ What is clear is
> > >> that currently vgem entirely ignores this (there's not
> > >> begin/end_cpu_access callback), mostly because the shared dma-buf
> > support
> > >> in drm_prime.c also entirely ignores this.
> > >
> > >
> > >
> > > This patch isn't trying to address the case of a dma-buf imported into
> > vgem.
> > > It's trying to address the case when a buffer is created by
> > > vgem_gem_dumb_create, mapped by vgem_gem_dumb_map and then accessed by
> > user
> > > space.  Since the page retrieved by shmem_read_mapping_page during the
> > page
> > > fault may still be in the cache, we're experiencing incorrect data in
> > > buffer.  Here's the test case we're running:
> > >
> > > https://chromium.googlesource.com/chromiumos/platform/drm-te
> > sts/+/master/vgem_test.c
> >
> > 404s over here (Internal url?).
> 
> 
> Hmm ... I was able to access that link without being logged in to any
> accounts.
> 
> > It fails on line 210 on AMD and ARM devices (not Intel though).
> >
> > So you _do_ import it on the other device driver as a dma-buf (and
> > export it from vgem)?
> 
> 
> Those portions of the test work fine (if the platform has a drm_clflush
> implementation).  vgem_prime_pin calls drm_clflush_pages for the exporting
> case.  Typically, ARM drivers flush the cache after drm_gem_get_pages() and
> only do WC mappings, so import works.  For Intel, there is some hardware
> level coherency involved.  The problem is vgem doesn't flush the cache on
> ARM/AMD when getting pages for the non-export/non-import case (when
> faulting after a vgem_gem_dumb_map, not during dma-buf mmap) -- i.e, during
> regular use of the buffer.

So if the idea is that the vgem buffers should be accessed using WC, then
we need to switch the dumb_map stuff to do wc. Flushing once is not going
to fix things if you write again (afaik CrOS only does write-once and then
throws buffers away again, but not sure).

The other option is to throw dumb_map into the wind and only support
dma-buf mmaping, which has a special ioctl for range flushing (so that we
could flush before/after each access as needed). A gross hack would be to
keep using dumb_map but abuse the dma-buf flushing ioctl for the flushing.

The problem ofc is that there's no agreement between importers/exporters
on who should flush when and where. Fixing that is way too much work, so
personally I think the simplest clean fix is something along the lines of
using the dma-buf flush ioctl (DMA_BUF_IOCTL_SYNC).
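
For reference, bracketing the CPU access from userspace with that ioctl
looks roughly like this (a sketch against the linux/dma-buf.h uapi; the
fd is assumed to be a dma-buf exported from vgem and mmap()ed):

#include <stdint.h>
#include <stddef.h>
#include <sys/ioctl.h>
#include <linux/dma-buf.h>

/*
 * Wrap a CPU write to a mmap()ed dma-buf in DMA_BUF_IOCTL_SYNC so the
 * kernel can do whatever cache maintenance the platform needs.
 */
static int write_word(int dmabuf_fd, uint32_t *map, size_t idx, uint32_t val)
{
	struct dma_buf_sync sync = {
		.flags = DMA_BUF_SYNC_START | DMA_BUF_SYNC_WRITE,
	};

	if (ioctl(dmabuf_fd, DMA_BUF_IOCTL_SYNC, &sync))
		return -1;

	map[idx] = val;

	sync.flags = DMA_BUF_SYNC_END | DMA_BUF_SYNC_WRITE;
	return ioctl(dmabuf_fd, DMA_BUF_IOCTL_SYNC, &sync);
}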

Cheers, Daniel

> 
> >
> > >> And doing a one-time only
> > >> flushing in your fault handler is definitely not going to fix this (at
> > >> least not if you do anything else than one-shot uploads).
> > >
> > >
> > > There used to be a be vgem_gem_get_pages function, but that's been
> > removed.
> > > I don't know where else to flush in this situation.
> >
> > dma_buf begin/end cpu access. Even exposed as an ioctl when you're
> > using the dma-buf mmap stuff. Vgem doesn't have that, which means
> > as-is the dumb mmap support of vgem can't really support this if you
> > want to do explicit flushing.
> 
> 
> > What would work is uncached/writecombine/coherent dma memory. But then
> > we're in the middle of the entire
> > "who's responsible.
> > -Daniel
> >
> > >
> > >>
> > >> -Daniel
> > >>
> > >> >       }
> > >> >       return ret;
> > >> >  }
> > >> > --
> > >> > 2.13.5
> > >> >
> > >> > _______________________________________________
> > >> > dri-devel mailing list
> > >> > dri-devel@lists.freedesktop.org
> > >> > https://lists.freedesktop.org/mailman/listinfo/dri-devel
> > >>
> > >> --
> > >> Daniel Vetter
> > >> Software Engineer, Intel Corporation
> > >> http://blog.ffwll.ch
> > >
> > >
> >
> >
> >
> > --
> > Daniel Vetter
> > Software Engineer, Intel Corporation
> > +41 (0) 79 365 57 48 - http://blog.ffwll.ch
> >

-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


* Re: [PATCH 3/4] drm: add ARM flush implementations
       [not found]               ` <CAKMK7uHoAqYsgqOTT4O=NQFvj51M2tKp+c2Sm8t4t01QN9Jkfg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2018-01-30 13:34                 ` Ville Syrjälä
  0 siblings, 0 replies; 19+ messages in thread
From: Ville Syrjälä @ 2018-01-30 13:34 UTC (permalink / raw)
  To: Daniel Vetter
  Cc: Gurchetan Singh, linux-tegra-u79uwXL29TY76Z2rM5mHXA,
	Daniel Vetter, Thierry Reding, Laurent Pinchart, ML dri-devel

On Thu, Jan 18, 2018 at 08:42:55AM +0100, Daniel Vetter wrote:
> On Wed, Jan 17, 2018 at 11:46 PM, Gurchetan Singh
> <gurchetansingh-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org> wrote:
> >> dma api just isn't quite sufficient for implementing fast gpu drivers.
> >
> >
> > Can you elaborate?  IIRC the DMA API has strong synchronization guarantees
> > and that can be problematic for GPU drivers.  However, using the DMA API for
> > flushing doesn't necessarily mean the driver has to use the rest of the DMA
> > API.
> >
> >> but then it'd need to directly flush cpu caches and bypass the dma api.
> >
> >
> > On ARM, cache flushing seems to vary from chipset to chipset.  For example,
> > on ARM32 a typical call-stack for dma_sync_single_for_device looks like:
> >
> > arm_dma_sync_single_for_device
> > __dma_page_cpu_to_dev
> > outer_clean_range
> > outer_cache.clean_range
> >
> > There are multiple clean_range implementations out there (i.e,
> > aurora_clean_range, l2c210_clean_range, feroceon_l2_clean_range), so that's
> > why the DMA API was used in this case.  On ARM64, things are a little
> > simpler, but the DMA API seems to go directly to assembly (__dma_map_area)
> > after a little indirection.  Why do you think that's inefficient?
> 
> I never said it's inefficient. My only gripe is with adding the
> pointless struct device * argument to flushing functions which really
> don't care (nor should care) about the device. Because what we really
> want to call is outer_clean_range here, and that doesn't need a struct
> device *. Imo that function (or something similar) needs to be
> exported and then used by drm_flush_* functions.
> 
> Also note that arm has both flush and invalidate functions, but on x86
> those are the same implementation (and we don't have a separate
> drm_invalidate_* set of functions). That doesn't look like a too good
> idea.

IMO if someone adds some new functions they should talk about
"writeback" and "invalidate". I think "flush" is a very vague
term that could mean different things to different people.

x86 has clflush which is writeback+invalidate, and more recently
x86 gained the clwb instruction for doing just writeback without
the invalidate. On ARM IIRC you can choose whether to do
writeback or invalidate or both.

> 
> Of course that doesn't solve the problem of who's supposed to call it
> and when in the dma-buf sharing situation.
> -Daniel
> 
> > On Wed, Jan 17, 2018 at 12:31 AM, Daniel Vetter <daniel-/w4YWyX8dFk@public.gmane.org> wrote:
> >>
> >> On Tue, Jan 16, 2018 at 04:35:58PM -0800, Gurchetan Singh wrote:
> >> > The DMA API can be used to flush scatter gather tables and physical
> >> > pages on ARM devices.
> >> >
> >> > Signed-off-by: Gurchetan Singh <gurchetansingh-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org>
> >> > ---
> >> >  drivers/gpu/drm/drm_cache.c                 | 17 +++++++++++++++++
> >> >  drivers/gpu/drm/rockchip/rockchip_drm_gem.c |  7 ++-----
> >> >  drivers/gpu/drm/tegra/gem.c                 |  6 +-----
> >> >  3 files changed, 20 insertions(+), 10 deletions(-)
> >> >
> >> > diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c
> >> > index 3d2bb9d71a60..98d6ebb40e96 100644
> >> > --- a/drivers/gpu/drm/drm_cache.c
> >> > +++ b/drivers/gpu/drm/drm_cache.c
> >> > @@ -105,6 +105,18 @@ drm_flush_pages(struct device *dev, struct page
> >> > *pages[],
> >> >                                  (unsigned long)page_virtual +
> >> > PAGE_SIZE);
> >> >               kunmap_atomic(page_virtual);
> >> >       }
> >> > +#elif defined(CONFIG_ARM) || defined(CONFIG_ARM64)
> >> > +     unsigned long i;
> >> > +     dma_addr_t dma_handle;
> >> > +
> >> > +     if (!dev)
> >> > +             return;
> >> > +
> >> > +     for (i = 0; i < num_pages; i++) {
> >> > +             dma_handle = phys_to_dma(drm->dev,
> >> > page_to_phys(pages[i]));
> >> > +             dma_sync_single_for_device(dev, dma_handle, PAGE_SIZE,
> >> > +                                        DMA_TO_DEVICE);
> >>
> >> Erm no. These functions here are super-low-level functions used by drivers
> >> which know exactly what they're doing. Which is reimplementing half of the
> >> dma api behind the dma api's back because the dma api just isn't quite
> >> sufficient for implementing fast gpu drivers.
> >>
> >> If all you end up doing is calling the dma api again, then pls just call
> >> it directly.
> >>
> >> And just to make it clear: I'd be perfectly fine with adding arm support
> >> here, but then it'd need to directly flush cpu caches and bypass the dma
> >> api. Otherwise this is pointless.
> >> -Daniel
> >>
> >> > +     }
> >> >  #else
> >> >       pr_err("Architecture has no drm_cache.c support\n");
> >> >       WARN_ON_ONCE(1);
> >> > @@ -136,6 +148,11 @@ drm_flush_sg(struct device *dev, struct sg_table
> >> > *st)
> >> >
> >> >       if (wbinvd_on_all_cpus())
> >> >               pr_err("Timed out waiting for cache flush\n");
> >> > +#elif defined(CONFIG_ARM) || defined(CONFIG_ARM64)
> >> > +     if (!dev)
> >> > +             return;
> >> > +
> >> > +     dma_sync_sg_for_device(dev, st->sgl, st->nents, DMA_TO_DEVICE);
> >> >  #else
> >> >       pr_err("Architecture has no drm_cache.c support\n");
> >> >       WARN_ON_ONCE(1);
> >> > diff --git a/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
> >> > b/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
> >> > index 8ac7eb25e46d..0157f90b5d10 100644
> >> > --- a/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
> >> > +++ b/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
> >> > @@ -14,6 +14,7 @@
> >> >
> >> >  #include <drm/drm.h>
> >> >  #include <drm/drmP.h>
> >> > +#include <drm/drm_cache.h>
> >> >  #include <drm/drm_gem.h>
> >> >  #include <drm/drm_vma_manager.h>
> >> >  #include <linux/iommu.h>
> >> > @@ -99,15 +100,11 @@ static int rockchip_gem_get_pages(struct
> >> > rockchip_gem_object *rk_obj)
> >> >       /*
> >> >        * Fake up the SG table so that dma_sync_sg_for_device() can be
> >> > used
> >> >        * to flush the pages associated with it.
> >> > -      *
> >> > -      * TODO: Replace this by drm_flush_sg() once it can be implemented
> >> > -      * without relying on symbols that are not exported.
> >> >        */
> >> >       for_each_sg(rk_obj->sgt->sgl, s, rk_obj->sgt->nents, i)
> >> >               sg_dma_address(s) = sg_phys(s);
> >> >
> >> > -     dma_sync_sg_for_device(drm->dev, rk_obj->sgt->sgl,
> >> > rk_obj->sgt->nents,
> >> > -                            DMA_TO_DEVICE);
> >> > +     drm_flush_sg(drm->dev, rk_obj->sgt);
> >> >
> >> >       return 0;
> >> >
> >> > diff --git a/drivers/gpu/drm/tegra/gem.c b/drivers/gpu/drm/tegra/gem.c
> >> > index ab1e53d434e8..9945fd2f6bd6 100644
> >> > --- a/drivers/gpu/drm/tegra/gem.c
> >> > +++ b/drivers/gpu/drm/tegra/gem.c
> >> > @@ -230,15 +230,11 @@ static int tegra_bo_get_pages(struct drm_device
> >> > *drm, struct tegra_bo *bo)
> >> >       /*
> >> >        * Fake up the SG table so that dma_sync_sg_for_device() can be
> >> > used
> >> >        * to flush the pages associated with it.
> >> > -      *
> >> > -      * TODO: Replace this by drm_clflash_sg() once it can be
> >> > implemented
> >> > -      * without relying on symbols that are not exported.
> >> >        */
> >> >       for_each_sg(bo->sgt->sgl, s, bo->sgt->nents, i)
> >> >               sg_dma_address(s) = sg_phys(s);
> >> >
> >> > -     dma_sync_sg_for_device(drm->dev, bo->sgt->sgl, bo->sgt->nents,
> >> > -                            DMA_TO_DEVICE);
> >> > +     drm_flush_sg(drm->dev, bo->sgt);
> >> >
> >> >       return 0;
> >> >
> >> > --
> >> > 2.13.5
> >> >
> >> > _______________________________________________
> >> > dri-devel mailing list
> >> > dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
> >> > https://lists.freedesktop.org/mailman/listinfo/dri-devel
> >>
> >> --
> >> Daniel Vetter
> >> Software Engineer, Intel Corporation
> >> http://blog.ffwll.ch
> >
> >
> 
> 
> 
> -- 
> Daniel Vetter
> Software Engineer, Intel Corporation
> +41 (0) 79 365 57 48 - http://blog.ffwll.ch
> _______________________________________________
> dri-devel mailing list
> dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
> https://lists.freedesktop.org/mailman/listinfo/dri-devel

-- 
Ville Syrjälä
Intel OTC


* Re: [PATCH 4/4] drm/vgem: flush page during page fault
  2018-01-30  9:14                 ` Daniel Vetter
@ 2018-01-30 20:01                   ` Gurchetan Singh
  2018-01-30 22:08                     ` Daniel Vetter
  0 siblings, 1 reply; 19+ messages in thread
From: Gurchetan Singh @ 2018-01-30 20:01 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: Daniel Vetter, Thierry Reding, ML dri-devel

On Tue, Jan 30, 2018 at 1:14 AM, Daniel Vetter <daniel@ffwll.ch> wrote:
>
> On Thu, Jan 18, 2018 at 09:23:31AM -0800, Gurchetan Singh wrote:
> > On Wed, Jan 17, 2018 at 11:38 PM, Daniel Vetter <daniel@ffwll.ch> wrote:
> >
> > > On Wed, Jan 17, 2018 at 11:49 PM, Gurchetan Singh
> > > <gurchetansingh@chromium.org> wrote:
> > > >
> > > > On Wed, Jan 17, 2018 at 12:39 AM, Daniel Vetter <daniel@ffwll.ch> wrote:
> > > >>
> > > >> On Tue, Jan 16, 2018 at 04:35:59PM -0800, Gurchetan Singh wrote:
> > > >> > This is required to use buffers allocated by vgem on AMD and ARM
> > > >> > devices.
> > > >> > We're experiencing a case where eviction of the cache races with
> > > >> > userspace
> > > >> > writes. To fix this, flush the cache after retrieving a page.
> > > >> >
> > > >> > Signed-off-by: Gurchetan Singh <gurchetansingh@chromium.org>
> > > >> > ---
> > > >> >  drivers/gpu/drm/vgem/vgem_drv.c | 1 +
> > > >> >  1 file changed, 1 insertion(+)
> > > >> >
> > > >> > diff --git a/drivers/gpu/drm/vgem/vgem_drv.c
> > > >> > b/drivers/gpu/drm/vgem/vgem_drv.c
> > > >> > index 35bfdfb746a7..fb263969f02d 100644
> > > >> > --- a/drivers/gpu/drm/vgem/vgem_drv.c
> > > >> > +++ b/drivers/gpu/drm/vgem/vgem_drv.c
> > > >> > @@ -112,6 +112,7 @@ static int vgem_gem_fault(struct vm_fault *vmf)
> > > >> >                               break;
> > > >> >               }
> > > >> >
> > > >> > +             drm_flush_pages(obj->base.dev->dev, &page, 1);
> > > >>
> > > >> Uh ... what exactly are you doing?
> > > >>
> > > >> Asking because the entire "who's responsible for coherency" story is
> > > >> entirely undefined still when doing buffer sharing :-/ What is clear is
> > > >> that currently vgem entirely ignores this (there's not
> > > >> begin/end_cpu_access callback), mostly because the shared dma-buf
> > > support
> > > >> in drm_prime.c also entirely ignores this.
> > > >
> > > >
> > > >
> > > > This patch isn't trying to address the case of a dma-buf imported into
> > > vgem.
> > > > It's trying to address the case when a buffer is created by
> > > > vgem_gem_dumb_create, mapped by vgem_gem_dumb_map and then accessed by
> > > user
> > > > space.  Since the page retrieved by shmem_read_mapping_page during the
> > > page
> > > > fault may still be in the cache, we're experiencing incorrect data in
> > > > buffer.  Here's the test case we're running:
> > > >
> > > > https://chromium.googlesource.com/chromiumos/platform/drm-te
> > > sts/+/master/vgem_test.c
> > >
> > > 404s over here (Internal url?).
> >
> >
> > Hmm ... I was able to access that link without being logged in to any
> > accounts.
> >
> > > It fails on line 210 on AMD and ARM devices (not Intel though).
> > >
> > > So you _do_ import it on the other device driver as a dma-buf (and
> > > export it from vgem)?
> >
> >
> > Those portions of the test work fine (if the platform has a drm_clflush
> > implementation).  vgem_prime_pin calls drm_clflush_pages for the exporting
> > case.  Typically, ARM drivers flush the cache after drm_gem_get_pages() and
> > only do WC mappings, so import works.  For Intel, there is some hardware
> > level coherency involved.  The problem is vgem doesn't flush the cache on
> > ARM/AMD when getting pages for the non-export/non-import case (when
> > faulting after a vgem_gem_dumb_map, not during dma-buf mmap) -- i.e, during
> > regular use of the buffer.
>
> So if the idea is that the vgem buffers should be accessed using WC, then
> we need to switch the dump_map stuff to do wc. Flushing once is not going
> to fix things if you write again (afaik CrOS only does write-once and then
> throws buffers away again, but not sure).

vgem dumb mmap is already write-combined, no?  As I understand it,
after calling the dumb map ioctl (to create the fake offset) and then
calling mmap(), the stack looks something like this:

drm_gem_mmap(..)
vgem_mmap(..)
call_mmap(..)
mmap_region(..)
mmap_pgoff(..)
SyS_mmap_pgoff(..)

drm_gem_mmap() always maps WC.

> The other option is to throw dumb_map into the wind and only support
> dma-buf mmaping, which has a special ioctl for range flushing (so that we
> could flush before/after each access as needed). A gross hack would be to
> keep using dumb_map but abuse the dma-buf flushing ioctl for the flushing.

Some SW drivers (kms_swrast in Mesa) rely on dumb mapping, though in
theory they can be modified to use dma-buf mmap.

> The problem ofc is that there's no agreement between importers/exporters
> on who should flush when and where. Fixing that is way too much work, so
> personally I think the simplest clean fix is something along the lines of
> using the dma-buf flush ioctl (DMA_BUF_IOCTL_SYNC).

The failure case I'm experiencing is not related to
importing/exporting.  The issue is that pages retrieved on the ARM
architecture usually have to be flushed before they can be used (see
rockchip_gem_get_pages / tegra_bo_get_pages /
etnaviv_gem_scatter_map()), even if they are mapped WC later.  Since
we couldn't get any agreement on implementing ARM support in
drm_cache.c, the recommended approach for getting vgem working on ARM
seems to be:

1) Create vgem_get_pages / vgem_put_pages functions.  vgem_get_pages
will be called from vgem_gem_fault (since currently vgem does delayed
allocation) and vgem_pin_pages.
2) Call dma_map_sg() / dma_unmap_sg() in vgem_get_pages / vgem_put_pages
if the architecture is ARM (we'll have some #ifdef-ery); a rough sketch
follows below.

Does that work for you?
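
Roughly the shape of 1) and 2), using the drm_gem_get_pages() /
drm_prime_pages_to_sg() / dma_map_sg() helpers available today (function
and structure names are hypothetical, the usual vgem_drv.c includes are
assumed, and error handling is trimmed):

/*
 * Hypothetical vgem_get_pages(): pin the shmem pages and, on ARM, run
 * them through dma_map_sg()/dma_unmap_sg() purely for the cache
 * maintenance side effect before anyone touches them.
 */
static struct page **vgem_get_pages(struct drm_vgem_gem_object *bo)
{
	struct drm_gem_object *obj = &bo->base;
	struct page **pages;

	pages = drm_gem_get_pages(obj);
	if (IS_ERR(pages))
		return pages;

#if defined(CONFIG_ARM) || defined(CONFIG_ARM64)
	{
		struct sg_table *sgt;

		sgt = drm_prime_pages_to_sg(pages, obj->size >> PAGE_SHIFT);
		if (!IS_ERR(sgt)) {
			int nents = dma_map_sg(obj->dev->dev, sgt->sgl,
					       sgt->nents, DMA_TO_DEVICE);

			if (nents)
				dma_unmap_sg(obj->dev->dev, sgt->sgl,
					     sgt->nents, DMA_TO_DEVICE);
			sg_free_table(sgt);
			kfree(sgt);
		}
	}
#endif

	return pages;
}

Whether the mapping should instead be kept around (and reused for
dma-buf export) rather than dropped immediately is an open design
question.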

>
> Cheers, Daniel
>
> >
> > >
> > > >> And doing a one-time only
> > > >> flushing in your fault handler is definitely not going to fix this (at
> > > >> least not if you do anything else than one-shot uploads).
> > > >
> > > >
> > > > There used to be a be vgem_gem_get_pages function, but that's been
> > > removed.
> > > > I don't know where else to flush in this situation.
> > >
> > > dma_buf begin/end cpu access. Even exposed as an ioctl when you're
> > > using the dma-buf mmap stuff. Vgem doesn't have that, which means
> > > as-is the dumb mmap support of vgem can't really support this if you
> > > want to do explicit flushing.
> >
> >
> > > What would work is uncached/writecombine/coherent dma memory. But then
> > > we're in the middle of the entire
> > > "who's responsible.
> > > -Daniel
> > >
> > > >
> > > >>
> > > >> -Daniel
> > > >>
> > > >> >       }
> > > >> >       return ret;
> > > >> >  }
> > > >> > --
> > > >> > 2.13.5
> > > >> >
> > > >> > _______________________________________________
> > > >> > dri-devel mailing list
> > > >> > dri-devel@lists.freedesktop.org
> > > >> > https://lists.freedesktop.org/mailman/listinfo/dri-devel
> > > >>
> > > >> --
> > > >> Daniel Vetter
> > > >> Software Engineer, Intel Corporation
> > > >> http://blog.ffwll.ch
> > > >
> > > >
> > >
> > >
> > >
> > > --
> > > Daniel Vetter
> > > Software Engineer, Intel Corporation
> > > +41 (0) 79 365 57 48 - http://blog.ffwll.ch
> > >
>
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


* Re: [PATCH 4/4] drm/vgem: flush page during page fault
  2018-01-30 20:01                   ` Gurchetan Singh
@ 2018-01-30 22:08                     ` Daniel Vetter
  0 siblings, 0 replies; 19+ messages in thread
From: Daniel Vetter @ 2018-01-30 22:08 UTC (permalink / raw)
  To: Gurchetan Singh; +Cc: Daniel Vetter, Thierry Reding, ML dri-devel

On Tue, Jan 30, 2018 at 9:01 PM, Gurchetan Singh
<gurchetansingh@chromium.org> wrote:
> On Tue, Jan 30, 2018 at 1:14 AM, Daniel Vetter <daniel@ffwll.ch> wrote:
>>
>> On Thu, Jan 18, 2018 at 09:23:31AM -0800, Gurchetan Singh wrote:
>> > On Wed, Jan 17, 2018 at 11:38 PM, Daniel Vetter <daniel@ffwll.ch> wrote:
>> >
>> > > On Wed, Jan 17, 2018 at 11:49 PM, Gurchetan Singh
>> > > <gurchetansingh@chromium.org> wrote:
>> > > >
>> > > > On Wed, Jan 17, 2018 at 12:39 AM, Daniel Vetter <daniel@ffwll.ch> wrote:
>> > > >>
>> > > >> On Tue, Jan 16, 2018 at 04:35:59PM -0800, Gurchetan Singh wrote:
>> > > >> > This is required to use buffers allocated by vgem on AMD and ARM
>> > > >> > devices.
>> > > >> > We're experiencing a case where eviction of the cache races with
>> > > >> > userspace
>> > > >> > writes. To fix this, flush the cache after retrieving a page.
>> > > >> >
>> > > >> > Signed-off-by: Gurchetan Singh <gurchetansingh@chromium.org>
>> > > >> > ---
>> > > >> >  drivers/gpu/drm/vgem/vgem_drv.c | 1 +
>> > > >> >  1 file changed, 1 insertion(+)
>> > > >> >
>> > > >> > diff --git a/drivers/gpu/drm/vgem/vgem_drv.c
>> > > >> > b/drivers/gpu/drm/vgem/vgem_drv.c
>> > > >> > index 35bfdfb746a7..fb263969f02d 100644
>> > > >> > --- a/drivers/gpu/drm/vgem/vgem_drv.c
>> > > >> > +++ b/drivers/gpu/drm/vgem/vgem_drv.c
>> > > >> > @@ -112,6 +112,7 @@ static int vgem_gem_fault(struct vm_fault *vmf)
>> > > >> >                               break;
>> > > >> >               }
>> > > >> >
>> > > >> > +             drm_flush_pages(obj->base.dev->dev, &page, 1);
>> > > >>
>> > > >> Uh ... what exactly are you doing?
>> > > >>
>> > > >> Asking because the entire "who's responsible for coherency" story is
>> > > >> entirely undefined still when doing buffer sharing :-/ What is clear is
>> > > >> that currently vgem entirely ignores this (there's not
>> > > >> begin/end_cpu_access callback), mostly because the shared dma-buf
>> > > support
>> > > >> in drm_prime.c also entirely ignores this.
>> > > >
>> > > >
>> > > >
>> > > > This patch isn't trying to address the case of a dma-buf imported into
>> > > vgem.
>> > > > It's trying to address the case when a buffer is created by
>> > > > vgem_gem_dumb_create, mapped by vgem_gem_dumb_map and then accessed by
>> > > user
>> > > > space.  Since the page retrieved by shmem_read_mapping_page during the
>> > > page
>> > > > fault may still be in the cache, we're experiencing incorrect data in
>> > > > buffer.  Here's the test case we're running:
>> > > >
>> > > > https://chromium.googlesource.com/chromiumos/platform/drm-te
>> > > sts/+/master/vgem_test.c
>> > >
>> > > 404s over here (Internal url?).
>> >
>> >
>> > Hmm ... I was able to access that link without being logged in to any
>> > accounts.
>> >
>> > > It fails on line 210 on AMD and ARM devices (not Intel though).
>> > >
>> > > So you _do_ import it on the other device driver as a dma-buf (and
>> > > export it from vgem)?
>> >
>> >
>> > Those portions of the test work fine (if the platform has a drm_clflush
>> > implementation).  vgem_prime_pin calls drm_clflush_pages for the exporting
>> > case.  Typically, ARM drivers flush the cache after drm_gem_get_pages() and
>> > only do WC mappings, so import works.  For Intel, there is some hardware
>> > level coherency involved.  The problem is vgem doesn't flush the cache on
>> > ARM/AMD when getting pages for the non-export/non-import case (when
>> > faulting after a vgem_gem_dumb_map, not during dma-buf mmap) -- i.e, during
>> > regular use of the buffer.
>>
>> So if the idea is that the vgem buffers should be accessed using WC, then
>> we need to switch the dump_map stuff to do wc. Flushing once is not going
>> to fix things if you write again (afaik CrOS only does write-once and then
>> throws buffers away again, but not sure).
>
> vgem dumb mmap is already write-combined, no?  As I understand it,
> after calling the dumb map ioctl (to create the fake offset) and then
> calling mmap(), the stack looks something like this:
>
> drm_gem_mmap(..)
> vgem_mmap(..)
> call_mmap(..)
> mmap_region(..)
> mmap_pgoff(..)
> SyS_mmap_pgoff(..)
>
> drm_gem_mmap() always maps WC.

Oh, interesting. Never realized that, but yeah, if you just return a
page (and don't call one of the vm_insert_* functions from your fault
handler directly), then the default pgprot settings of drm_gem_mmap
take over, which is wc.

Now I wonder how this ever worked (or works) on i915 ... The other issue
is that if you get a lowmem page on arm32 you have caching attribute
aliasing issues, as Russell points out. We'd not just need to make sure
the pages are flushed, but also allocated as something we can use for wc.

>> The other option is to throw dumb_map into the wind and only support
>> dma-buf mmaping, which has a special ioctl for range flushing (so that we
>> could flush before/after each access as needed). A gross hack would be to
>> keep using dumb_map but abuse the dma-buf flushing ioctl for the flushing.
>
> Some SW drivers (kms_swrast in Mesa) rely on dumb mapping, though in
> theory they can be modified to use dma-buf mmap.

As long as you don't share, dumb is good enough.

>> The problem ofc is that there's no agreement between importers/exporters
>> on who should flush when and where. Fixing that is way too much work, so
>> personally I think the simplest clean fix is something along the lines of
>> using the dma-buf flush ioctl (DMA_BUF_IOCTL_SYNC).
>
> The failure case I'm experiencing is not related to
> importing/exporting.  The issue is the pages retrieved on ARM
> architecture usually have to be flushed before they can be used (see
> rockchip_gem_get_pages / tegra_bo_get_pages /
> etnaviv_gem_scatter_map()), even if they are mapped WC later.  Since
> we couldn't get any agreement on implementing ARM support in
> drm_cache.c, the recommended approach for getting vgem working on ARM
> seems to be:
>
> 1) Create vgem_get_pages / vgem_put_pages functions.  vgem_get_pages
> will be called from vgem_gem_fault (since currently vgem does delayed
> allocation) and vgem_pin_pages.
> 2) Call dma_map_sg() / dma_unmap_sg in vgem_get_pages / vgem_put_pages
> if the architecture is ARM (we'll have some #ifdef-ery).
>
> Does that work for you?

Tbh I have no idea how you get at wc-capable pages in a
platform-agnostic way. Figuring that out will probably solve the
"must flush them at allocation time" issue too.

But yeah, from your description a vgem_get_pages function that makes
sure the pages are correctly flushed before anyone starts to use them
sounds like the right thing to do.
-Daniel

>
>>
>> Cheers, Daniel
>>
>> >
>> > >
>> > > >> And doing a one-time only
>> > > >> flushing in your fault handler is definitely not going to fix this (at
>> > > >> least not if you do anything else than one-shot uploads).
>> > > >
>> > > >
>> > > > There used to be a be vgem_gem_get_pages function, but that's been
>> > > removed.
>> > > > I don't know where else to flush in this situation.
>> > >
>> > > dma_buf begin/end cpu access. Even exposed as an ioctl when you're
>> > > using the dma-buf mmap stuff. Vgem doesn't have that, which means
>> > > as-is the dumb mmap support of vgem can't really support this if you
>> > > want to do explicit flushing.
>> >
>> >
>> > > What would work is uncached/writecombine/coherent dma memory. But then
>> > > we're in the middle of the entire
>> > > "who's responsible.
>> > > -Daniel
>> > >
>> > > >
>> > > >>
>> > > >> -Daniel
>> > > >>
>> > > >> >       }
>> > > >> >       return ret;
>> > > >> >  }
>> > > >> > --
>> > > >> > 2.13.5
>> > > >> >
>> > > >> > _______________________________________________
>> > > >> > dri-devel mailing list
>> > > >> > dri-devel@lists.freedesktop.org
>> > > >> > https://lists.freedesktop.org/mailman/listinfo/dri-devel
>> > > >>
>> > > >> --
>> > > >> Daniel Vetter
>> > > >> Software Engineer, Intel Corporation
>> > > >> http://blog.ffwll.ch
>> > > >
>> > > >
>> > >
>> > >
>> > >
>> > > --
>> > > Daniel Vetter
>> > > Software Engineer, Intel Corporation
>> > > +41 (0) 79 365 57 48 - http://blog.ffwll.ch
>> > >
>>
>> --
>> Daniel Vetter
>> Software Engineer, Intel Corporation
>> http://blog.ffwll.ch



-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


end of thread, other threads:[~2018-01-30 22:08 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-01-17  0:35 [PATCH 1/4] drm: rename {drm_clflush_sg, drm_clflush_pages} Gurchetan Singh
2018-01-17  0:35 ` [PATCH 2/4] drm: add additional parameter in drm_flush_pages() and drm_flush_sg() Gurchetan Singh
     [not found] ` <20180117003559.67837-1-gurchetansingh-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org>
2018-01-17  0:35   ` [PATCH 3/4] drm: add ARM flush implementations Gurchetan Singh
     [not found]     ` <20180117003559.67837-3-gurchetansingh-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org>
2018-01-17  8:31       ` Daniel Vetter
     [not found]         ` <20180117083105.GG2759-dv86pmgwkMBes7Z6vYuT8azUEOm+Xw19@public.gmane.org>
2018-01-17 18:53           ` Sean Paul
2018-01-17 21:22             ` Daniel Vetter
2018-01-17 22:46         ` Gurchetan Singh
     [not found]           ` <CAAfnVBm4Mp8vC4aBmrP2rJeRSBN_AFN5gaZfijg9BCJEBupDzw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2018-01-18  7:42             ` Daniel Vetter
2018-01-18 17:20               ` Gurchetan Singh
2018-01-30  9:09                 ` Daniel Vetter
     [not found]               ` <CAKMK7uHoAqYsgqOTT4O=NQFvj51M2tKp+c2Sm8t4t01QN9Jkfg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2018-01-30 13:34                 ` Ville Syrjälä
2018-01-17  0:35   ` [PATCH 4/4] drm/vgem: flush page during page fault Gurchetan Singh
     [not found]     ` <20180117003559.67837-4-gurchetansingh-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org>
2018-01-17  8:39       ` Daniel Vetter
2018-01-17 22:49         ` Gurchetan Singh
     [not found]           ` <CAAfnVBkxhEec1U8Ck4UyMXjwKvSFJs-bpip5K-7sbB51TNK0bA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2018-01-18  7:38             ` Daniel Vetter
2018-01-18 17:23               ` Gurchetan Singh
2018-01-30  9:14                 ` Daniel Vetter
2018-01-30 20:01                   ` Gurchetan Singh
2018-01-30 22:08                     ` Daniel Vetter
