* [PATCH 2/5] drm: add ARM flush implementation
@ 2018-01-24  2:56 ` Gurchetan Singh
  0 siblings, 0 replies; 33+ messages in thread
From: Gurchetan Singh @ 2018-01-24  2:56 UTC (permalink / raw)
  To: dri-devel
  Cc: Gurchetan Singh, thierry.reding, laurent.pinchart, daniel.vetter,
	linux-arm-kernel

The dma_cache_maint_page function is important for cache maintenance on
ARM32 (this was determined via testing).

Since we desire direct control of the caches in drm_cache.c, let's make
a copy of the function, rename it and use it.

v2: Don't use DMA API, call functions directly (Daniel)

Signed-off-by: Gurchetan Singh <gurchetansingh@chromium.org>
---
 drivers/gpu/drm/drm_cache.c | 61 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)

diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c
index 89cdd32fe1f3..5124582451c6 100644
--- a/drivers/gpu/drm/drm_cache.c
+++ b/drivers/gpu/drm/drm_cache.c
@@ -69,6 +69,55 @@ static void drm_cache_flush_clflush(struct page *pages[],
 }
 #endif
 
+#if defined(CONFIG_ARM)
+static void drm_cache_maint_page(struct page *page, unsigned long offset,
+				 size_t size, enum dma_data_direction dir,
+				 void (*op)(const void *, size_t, int))
+{
+	unsigned long pfn;
+	size_t left = size;
+
+	pfn = page_to_pfn(page) + offset / PAGE_SIZE;
+	offset %= PAGE_SIZE;
+
+	/*
+	 * A single sg entry may refer to multiple physically contiguous
+	 * pages.  But we still need to process highmem pages individually.
+	 * If highmem is not configured then the bulk of this loop gets
+	 * optimized out.
+	 */
+	do {
+		size_t len = left;
+		void *vaddr;
+
+		page = pfn_to_page(pfn);
+
+		if (PageHighMem(page)) {
+			if (len + offset > PAGE_SIZE)
+				len = PAGE_SIZE - offset;
+
+			if (cache_is_vipt_nonaliasing()) {
+				vaddr = kmap_atomic(page);
+				op(vaddr + offset, len, dir);
+				kunmap_atomic(vaddr);
+			} else {
+				vaddr = kmap_high_get(page);
+				if (vaddr) {
+					op(vaddr + offset, len, dir);
+					kunmap_high(page);
+				}
+			}
+		} else {
+			vaddr = page_address(page) + offset;
+			op(vaddr, len, dir);
+		}
+		offset = 0;
+		pfn++;
+		left -= len;
+	} while (left);
+}
+#endif
+
 /**
  * drm_flush_pages - Flush dcache lines of a set of pages.
  * @pages: List of pages to be flushed.
@@ -104,6 +153,12 @@ drm_flush_pages(struct page *pages[], unsigned long num_pages)
 				   (unsigned long)page_virtual + PAGE_SIZE);
 		kunmap_atomic(page_virtual);
 	}
+#elif defined(CONFIG_ARM)
+	unsigned long i;
+
+	for (i = 0; i < num_pages; i++)
+		drm_cache_maint_page(pages[i], 0, PAGE_SIZE, DMA_TO_DEVICE,
+				     dmac_map_area);
 #else
 	pr_err("Architecture has no drm_cache.c support\n");
 	WARN_ON_ONCE(1);
@@ -135,6 +190,12 @@ drm_flush_sg(struct sg_table *st)
 
 	if (wbinvd_on_all_cpus())
 		pr_err("Timed out waiting for cache flush\n");
+#elif defined(CONFIG_ARM)
+	struct sg_page_iter sg_iter;
+
+	for_each_sg_page(st->sgl, &sg_iter, st->nents, 0)
+		drm_cache_maint_page(sg_page_iter_page(&sg_iter), 0, PAGE_SIZE,
+				     DMA_TO_DEVICE, dmac_map_area);
 #else
 	pr_err("Architecture has no drm_cache.c support\n");
 	WARN_ON_ONCE(1);
-- 
2.13.5

* [PATCH 3/5] drm: add ARM64 flush implementations
@ 2018-01-24  2:56   ` Gurchetan Singh
  0 siblings, 0 replies; 33+ messages in thread
From: Gurchetan Singh @ 2018-01-24  2:56 UTC (permalink / raw)
  To: dri-devel
  Cc: Gurchetan Singh, thierry.reding, laurent.pinchart, daniel.vetter,
	linux-arm-kernel

This patch uses the __dma_map_area function to flush the cache
on ARM64.

v2: Don't use DMA API, call functions directly (Daniel)

Signed-off-by: Gurchetan Singh <gurchetansingh@chromium.org>
---
 drivers/gpu/drm/drm_cache.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c
index 5124582451c6..250cdfbb711f 100644
--- a/drivers/gpu/drm/drm_cache.c
+++ b/drivers/gpu/drm/drm_cache.c
@@ -159,6 +159,12 @@ drm_flush_pages(struct page *pages[], unsigned long num_pages)
 	for (i = 0; i < num_pages; i++)
 		drm_cache_maint_page(pages[i], 0, PAGE_SIZE, DMA_TO_DEVICE,
 				     dmac_map_area);
+#elif defined(CONFIG_ARM64)
+	unsigned long i;
+
+	for (i = 0; i < num_pages; i++)
+		__dma_map_area(phys_to_virt(page_to_phys(pages[i])), PAGE_SIZE,
+			       DMA_TO_DEVICE);
 #else
 	pr_err("Architecture has no drm_cache.c support\n");
 	WARN_ON_ONCE(1);
@@ -196,6 +202,13 @@ drm_flush_sg(struct sg_table *st)
 	for_each_sg_page(st->sgl, &sg_iter, st->nents, 0)
 		drm_cache_maint_page(sg_page_iter_page(&sg_iter), 0, PAGE_SIZE,
 				     DMA_TO_DEVICE, dmac_map_area);
+#elif defined(CONFIG_ARM64)
+	int i;
+	struct scatterlist *sg;
+
+	for_each_sg(st->sgl, sg, st->nents, i)
+		__dma_map_area(phys_to_virt(sg_phys(sg)), sg->length,
+			       DMA_TO_DEVICE);
 #else
 	pr_err("Architecture has no drm_cache.c support\n");
 	WARN_ON_ONCE(1);
-- 
2.13.5

* [PATCH 4/5] drm/vgem: flush page during page fault
@ 2018-01-24  2:56   ` Gurchetan Singh
  0 siblings, 0 replies; 33+ messages in thread
From: Gurchetan Singh @ 2018-01-24  2:56 UTC (permalink / raw)
  To: dri-devel
  Cc: Gurchetan Singh, thierry.reding, laurent.pinchart, daniel.vetter,
	linux-arm-kernel

This is required to use buffers allocated by vgem on AMD and ARM devices.
We're experiencing a case where cache eviction races with userspace
writes. To fix this, flush the cache after retrieving a page.

Signed-off-by: Gurchetan Singh <gurchetansingh@chromium.org>
---
 drivers/gpu/drm/vgem/vgem_drv.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/vgem/vgem_drv.c b/drivers/gpu/drm/vgem/vgem_drv.c
index 802a97e1a4bf..ed6db7218f04 100644
--- a/drivers/gpu/drm/vgem/vgem_drv.c
+++ b/drivers/gpu/drm/vgem/vgem_drv.c
@@ -112,6 +112,7 @@ static int vgem_gem_fault(struct vm_fault *vmf)
 				break;
 		}
 
+		drm_flush_pages(&page, 1);
 	}
 	return ret;
 }
-- 
2.13.5

* [PATCH 5/5] drm: use drm_flush_sg where possible
@ 2018-01-24  2:56   ` Gurchetan Singh
  0 siblings, 0 replies; 33+ messages in thread
From: Gurchetan Singh @ 2018-01-24  2:56 UTC (permalink / raw)
  To: dri-devel
  Cc: Gurchetan Singh, thierry.reding, laurent.pinchart, daniel.vetter,
	linux-arm-kernel

We should use our common cache maintenance functions when possible.

Signed-off-by: Gurchetan Singh <gurchetansingh@chromium.org>
---
 drivers/gpu/drm/rockchip/rockchip_drm_gem.c | 7 ++-----
 drivers/gpu/drm/tegra/gem.c                 | 7 ++-----
 2 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/rockchip/rockchip_drm_gem.c b/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
index 8ac7eb25e46d..0652c2f79719 100644
--- a/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
+++ b/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
@@ -14,6 +14,7 @@
 
 #include <drm/drm.h>
 #include <drm/drmP.h>
+#include <drm/drm_cache.h>
 #include <drm/drm_gem.h>
 #include <drm/drm_vma_manager.h>
 #include <linux/iommu.h>
@@ -99,15 +100,11 @@ static int rockchip_gem_get_pages(struct rockchip_gem_object *rk_obj)
 	/*
 	 * Fake up the SG table so that dma_sync_sg_for_device() can be used
 	 * to flush the pages associated with it.
-	 *
-	 * TODO: Replace this by drm_flush_sg() once it can be implemented
-	 * without relying on symbols that are not exported.
 	 */
 	for_each_sg(rk_obj->sgt->sgl, s, rk_obj->sgt->nents, i)
 		sg_dma_address(s) = sg_phys(s);
 
-	dma_sync_sg_for_device(drm->dev, rk_obj->sgt->sgl, rk_obj->sgt->nents,
-			       DMA_TO_DEVICE);
+	drm_flush_sg(rk_obj->sgt);
 
 	return 0;
 
diff --git a/drivers/gpu/drm/tegra/gem.c b/drivers/gpu/drm/tegra/gem.c
index 49b9bf28f872..0db403653adc 100644
--- a/drivers/gpu/drm/tegra/gem.c
+++ b/drivers/gpu/drm/tegra/gem.c
@@ -15,6 +15,7 @@
 
 #include <linux/dma-buf.h>
 #include <linux/iommu.h>
+#include <drm/drm_cache.h>
 #include <drm/tegra_drm.h>
 
 #include "drm.h"
@@ -229,15 +230,11 @@ static int tegra_bo_get_pages(struct drm_device *drm, struct tegra_bo *bo)
 	/*
 	 * Fake up the SG table so that dma_sync_sg_for_device() can be used
 	 * to flush the pages associated with it.
-	 *
-	 * TODO: Replace this by drm_clflash_sg() once it can be implemented
-	 * without relying on symbols that are not exported.
 	 */
 	for_each_sg(bo->sgt->sgl, s, bo->sgt->nents, i)
 		sg_dma_address(s) = sg_phys(s);
 
-	dma_sync_sg_for_device(drm->dev, bo->sgt->sgl, bo->sgt->nents,
-			       DMA_TO_DEVICE);
+	drm_flush_sg(bo->sgt);
 
 	return 0;
 
-- 
2.13.5

* Re: [PATCH 2/5] drm: add ARM flush implementation
@ 2018-01-24 11:50   ` Lucas Stach
  0 siblings, 0 replies; 33+ messages in thread
From: Lucas Stach @ 2018-01-24 11:50 UTC (permalink / raw)
  To: Gurchetan Singh, dri-devel
  Cc: daniel.vetter, thierry.reding, heiko, linux-arm-kernel, laurent.pinchart

Am Dienstag, den 23.01.2018, 18:56 -0800 schrieb Gurchetan Singh:
> The dma_cache_maint_page function is important for cache maintenance on
> ARM32 (this was determined via testing).
> 
> Since we desire direct control of the caches in drm_cache.c, let's make
> a copy of the function, rename it and use it.

I don't see why having a (potentially incoherent) copy of
dma_cache_maint_page in DRM does any good. Why can't we just export the
 original function to make it usable within DRM?
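
For reference, a minimal sketch of that alternative, assuming the
function were made non-static in arch/arm/mm/dma-mapping.c and given a
declaration in a shared header (hypothetical; no such export exists
today):

	/* arch/arm/mm/dma-mapping.c: hypothetical export for DRM's use */
	void dma_cache_maint_page(struct page *page, unsigned long offset,
				  size_t size, enum dma_data_direction dir,
				  void (*op)(const void *, size_t, int))
	{
		/* ... existing body unchanged ... */
	}
	EXPORT_SYMBOL_GPL(dma_cache_maint_page);

The op callbacks (dmac_map_area and friends) would need the same
treatment before a modular DRM could link against this.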

Regards,
Lucas

> 
> v2: Don't use DMA API, call functions directly (Daniel)
> 
> > Signed-off-by: Gurchetan Singh <gurchetansingh@chromium.org>
> ---
>  drivers/gpu/drm/drm_cache.c | 61 +++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 61 insertions(+)
> 
> diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c
> index 89cdd32fe1f3..5124582451c6 100644
> --- a/drivers/gpu/drm/drm_cache.c
> +++ b/drivers/gpu/drm/drm_cache.c
> @@ -69,6 +69,55 @@ static void drm_cache_flush_clflush(struct page *pages[],
>  }
>  #endif
>  
> +#if defined(CONFIG_ARM)
> +static void drm_cache_maint_page(struct page *page, unsigned long offset,
> > +				 size_t size, enum dma_data_direction dir,
> > +				 void (*op)(const void *, size_t, int))
> +{
> > +	unsigned long pfn;
> > +	size_t left = size;
> +
> > +	pfn = page_to_pfn(page) + offset / PAGE_SIZE;
> > +	offset %= PAGE_SIZE;
> +
> > +	/*
> > +	 * A single sg entry may refer to multiple physically contiguous
> > +	 * pages.  But we still need to process highmem pages individually.
> > +	 * If highmem is not configured then the bulk of this loop gets
> > +	 * optimized out.
> > +	 */
> > +	do {
> > +		size_t len = left;
> > +		void *vaddr;
> +
> > +		page = pfn_to_page(pfn);
> +
> > +		if (PageHighMem(page)) {
> > +			if (len + offset > PAGE_SIZE)
> > +				len = PAGE_SIZE - offset;
> +
> > +			if (cache_is_vipt_nonaliasing()) {
> > +				vaddr = kmap_atomic(page);
> > +				op(vaddr + offset, len, dir);
> > +				kunmap_atomic(vaddr);
> > +			} else {
> > +				vaddr = kmap_high_get(page);
> > +				if (vaddr) {
> > +					op(vaddr + offset, len, dir);
> > +					kunmap_high(page);
> > +				}
> > +			}
> > +		} else {
> > +			vaddr = page_address(page) + offset;
> > +			op(vaddr, len, dir);
> > +		}
> > +		offset = 0;
> > +		pfn++;
> > +		left -= len;
> > +	} while (left);
> +}
> +#endif
> +
>  /**
>   * drm_flush_pages - Flush dcache lines of a set of pages.
>   * @pages: List of pages to be flushed.
> @@ -104,6 +153,12 @@ drm_flush_pages(struct page *pages[], unsigned long num_pages)
> >  				   (unsigned long)page_virtual + PAGE_SIZE);
> >  		kunmap_atomic(page_virtual);
> >  	}
> +#elif defined(CONFIG_ARM)
> > +	unsigned long i;
> +
> > +	for (i = 0; i < num_pages; i++)
> > +		drm_cache_maint_page(pages[i], 0, PAGE_SIZE, DMA_TO_DEVICE,
> > +				     dmac_map_area);
>  #else
> >  	pr_err("Architecture has no drm_cache.c support\n");
> >  	WARN_ON_ONCE(1);
> @@ -135,6 +190,12 @@ drm_flush_sg(struct sg_table *st)
>  
> >  	if (wbinvd_on_all_cpus())
> >  		pr_err("Timed out waiting for cache flush\n");
> +#elif defined(CONFIG_ARM)
> > +	struct sg_page_iter sg_iter;
> +
> > +	for_each_sg_page(st->sgl, &sg_iter, st->nents, 0)
> > +		drm_cache_maint_page(sg_page_iter_page(&sg_iter), 0, PAGE_SIZE,
> > +				     DMA_TO_DEVICE, dmac_map_area);
>  #else
> >  	pr_err("Architecture has no drm_cache.c support\n");
> >  	WARN_ON_ONCE(1);

* Re: [PATCH 3/5] drm: add ARM64 flush implementations
@ 2018-01-24 12:00     ` Robin Murphy
  0 siblings, 0 replies; 33+ messages in thread
From: Robin Murphy @ 2018-01-24 12:00 UTC (permalink / raw)
  To: Gurchetan Singh, dri-devel
  Cc: daniel.vetter, thierry.reding, linux-arm-kernel, laurent.pinchart

On 24/01/18 02:56, Gurchetan Singh wrote:
> This patch uses the __dma_map_area function to flush the cache
> on ARM64.
> 
> v2: Don't use DMA API, call functions directly (Daniel)
> 
> Signed-off-by: Gurchetan Singh <gurchetansingh@chromium.org>
> ---
>   drivers/gpu/drm/drm_cache.c | 13 +++++++++++++
>   1 file changed, 13 insertions(+)
> 
> diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c
> index 5124582451c6..250cdfbb711f 100644
> --- a/drivers/gpu/drm/drm_cache.c
> +++ b/drivers/gpu/drm/drm_cache.c
> @@ -159,6 +159,12 @@ drm_flush_pages(struct page *pages[], unsigned long num_pages)
>   	for (i = 0; i < num_pages; i++)
>   		drm_cache_maint_page(pages[i], 0, PAGE_SIZE, DMA_TO_DEVICE,
>   				     dmac_map_area);
> +#elif defined(CONFIG_ARM64)
> +	unsigned long i;
> +
> +	for (i = 0; i < num_pages; i++)
> +		__dma_map_area(phys_to_virt(page_to_phys(pages[i])), PAGE_SIZE,
> +			       DMA_TO_DEVICE);

Note that this is not exactly equivalent to clflush - if it's at all 
possible for a non-coherent GPU to write back to these pages and someone 
somewhere expects to be able to read the updated data from a CPU, that's 
  going to be subtly broken.
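
To illustrate (a sketch only; __dma_unmap_area is just as private to
the arch code as __dma_map_area): a CPU-read path would want an
invalidate, roughly

	for (i = 0; i < num_pages; i++)
		__dma_unmap_area(phys_to_virt(page_to_phys(pages[i])),
				 PAGE_SIZE, DMA_FROM_DEVICE);

whereas a DMA_TO_DEVICE map only cleans the lines to the point of
coherency.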

This also breaks building DRM as a module. And I doubt that either of 
the two easy solutions to that are going to fly with the respective 
maintainers...

Given all the bodging around which seems to happen in DRM/ION/etc., it 
would be really nice to pin down what exactly the shortcomings of the 
DMA API are for these use-cases, and extend it to address them properly.

Robin.

>   #else
>   	pr_err("Architecture has no drm_cache.c support\n");
>   	WARN_ON_ONCE(1);
> @@ -196,6 +202,13 @@ drm_flush_sg(struct sg_table *st)
>   	for_each_sg_page(st->sgl, &sg_iter, st->nents, 0)
>   		drm_cache_maint_page(sg_page_iter_page(&sg_iter), 0, PAGE_SIZE,
>   				     DMA_TO_DEVICE, dmac_map_area);
> +#elif defined(CONFIG_ARM64)
> +	int i;
> +	struct scatterlist *sg;
> +
> +	for_each_sg(st->sgl, sg, st->nents, i)
> +		__dma_map_area(phys_to_virt(sg_phys(sg)), sg->length,
> +			       DMA_TO_DEVICE);
>   #else
>   	pr_err("Architecture has no drm_cache.c support\n");
>   	WARN_ON_ONCE(1);
> 

* Re: [PATCH 3/5] drm: add ARM64 flush implementations
@ 2018-01-24 12:36       ` Russell King - ARM Linux
  0 siblings, 0 replies; 33+ messages in thread
From: Russell King - ARM Linux @ 2018-01-24 12:36 UTC (permalink / raw)
  To: Robin Murphy
  Cc: heiko, dri-devel, Gurchetan Singh, thierry.reding,
	laurent.pinchart, daniel.vetter, linux-arm-kernel

On Wed, Jan 24, 2018 at 12:00:59PM +0000, Robin Murphy wrote:
> On 24/01/18 02:56, Gurchetan Singh wrote:
> >This patch uses the __dma_map_area function to flush the cache
> >on ARM64.
> >
> >v2: Don't use DMA API, call functions directly (Daniel)
> >
> >Signed-off-by: Gurchetan Singh <gurchetansingh@chromium.org>
> >---
> >  drivers/gpu/drm/drm_cache.c | 13 +++++++++++++
> >  1 file changed, 13 insertions(+)
> >
> >diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c
> >index 5124582451c6..250cdfbb711f 100644
> >--- a/drivers/gpu/drm/drm_cache.c
> >+++ b/drivers/gpu/drm/drm_cache.c
> >@@ -159,6 +159,12 @@ drm_flush_pages(struct page *pages[], unsigned long num_pages)
> >  	for (i = 0; i < num_pages; i++)
> >  		drm_cache_maint_page(pages[i], 0, PAGE_SIZE, DMA_TO_DEVICE,
> >  				     dmac_map_area);
> >+#elif defined(CONFIG_ARM64)
> >+	unsigned long i;
> >+
> >+	for (i = 0; i < num_pages; i++)
> >+		__dma_map_area(phys_to_virt(page_to_phys(pages[i])), PAGE_SIZE,
> >+			       DMA_TO_DEVICE);
> 
> Note that this is not exactly equivalent to clflush - if it's at all
> possible for a non-coherent GPU to write back to these pages and someone
> somewhere expects to be able to read the updated data from a CPU, that's
> going to be subtly broken.
> 
> This also breaks building DRM as a module. And I doubt that either of the
> two easy solutions to that are going to fly with the respective
> maintainers...
> 
> Given all the bodging around which seems to happen in DRM/ION/etc., it would
> be really nice to pin down what exactly the shortcomings of the DMA API are
> for these use-cases, and extend it to address them properly.

Aren't the cache flushing instructions available from userspace on
ARM64?  (I've certainly used them in my spectre/meltdown PoC).  If
userspace wants to flush before reading a GPU buffer that has been
written to, why can't it use the already available cache maintenance
instructions?  It's likely to be quicker, and it can target just the
area of the buffer it wants to read.
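
Something along these lines, for example (an untested sketch; the
helper name is made up, and it relies on the EL0 cache maintenance
access that the arm64 kernel already enables):

	/* clean+invalidate [buf, buf+len) to PoC from userspace */
	static void user_flush_range(const void *buf, size_t len)
	{
		unsigned long ctr, stride, p;

		asm volatile("mrs %0, ctr_el0" : "=r" (ctr));
		stride = 4UL << ((ctr >> 16) & 0xf);	/* DminLine, bytes */

		for (p = (unsigned long)buf & ~(stride - 1);
		     p < (unsigned long)buf + len; p += stride)
			asm volatile("dc civac, %0" : : "r" (p) : "memory");
		asm volatile("dsb ish" : : : "memory");
	}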

32-bit ARM is a different matter, and I'll reply separately.

-- 
RMK's Patch system: http://www.armlinux.org.uk/developer/patches/
FTTC broadband for 0.8mile line in suburbia: sync at 8.8Mbps down 630kbps up
According to speedtest.net: 8.21Mbps down 510kbps up

* Re: [PATCH 2/5] drm: add ARM flush implementation
@ 2018-01-24 12:45   ` Russell King - ARM Linux
  0 siblings, 0 replies; 33+ messages in thread
From: Russell King - ARM Linux @ 2018-01-24 12:45 UTC (permalink / raw)
  To: Gurchetan Singh
  Cc: heiko, dri-devel, thierry.reding, laurent.pinchart,
	daniel.vetter, linux-arm-kernel

On Tue, Jan 23, 2018 at 06:56:03PM -0800, Gurchetan Singh wrote:
> The dma_cache_maint_page function is important for cache maintenance on
> ARM32 (this was determined via testing).
> 
> Since we desire direct control of the caches in drm_cache.c, let's make
> a copy of the function, rename it and use it.
> 
> v2: Don't use DMA API, call functions directly (Daniel)
> 
> Signed-off-by: Gurchetan Singh <gurchetansingh@chromium.org>
> ---
>  drivers/gpu/drm/drm_cache.c | 61 +++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 61 insertions(+)
> 
> diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c
> index 89cdd32fe1f3..5124582451c6 100644
> --- a/drivers/gpu/drm/drm_cache.c
> +++ b/drivers/gpu/drm/drm_cache.c
> @@ -69,6 +69,55 @@ static void drm_cache_flush_clflush(struct page *pages[],
>  }
>  #endif
>  
> +#if defined(CONFIG_ARM)
> +static void drm_cache_maint_page(struct page *page, unsigned long offset,
> +				 size_t size, enum dma_data_direction dir,
> +				 void (*op)(const void *, size_t, int))
> +{
> +	unsigned long pfn;
> +	size_t left = size;
> +
> +	pfn = page_to_pfn(page) + offset / PAGE_SIZE;
> +	offset %= PAGE_SIZE;
> +
> +	/*
> +	 * A single sg entry may refer to multiple physically contiguous
> +	 * pages.  But we still need to process highmem pages individually.
> +	 * If highmem is not configured then the bulk of this loop gets
> +	 * optimized out.
> +	 */
> +	do {
> +		size_t len = left;
> +		void *vaddr;
> +
> +		page = pfn_to_page(pfn);
> +
> +		if (PageHighMem(page)) {
> +			if (len + offset > PAGE_SIZE)
> +				len = PAGE_SIZE - offset;
> +
> +			if (cache_is_vipt_nonaliasing()) {
> +				vaddr = kmap_atomic(page);
> +				op(vaddr + offset, len, dir);
> +				kunmap_atomic(vaddr);
> +			} else {
> +				vaddr = kmap_high_get(page);
> +				if (vaddr) {
> +					op(vaddr + offset, len, dir);
> +					kunmap_high(page);
> +				}
> +			}
> +		} else {
> +			vaddr = page_address(page) + offset;
> +			op(vaddr, len, dir);
> +		}
> +		offset = 0;
> +		pfn++;
> +		left -= len;
> +	} while (left);
> +}
> +#endif
> +
>  /**
>   * drm_flush_pages - Flush dcache lines of a set of pages.
>   * @pages: List of pages to be flushed.
> @@ -104,6 +153,12 @@ drm_flush_pages(struct page *pages[], unsigned long num_pages)
>  				   (unsigned long)page_virtual + PAGE_SIZE);
>  		kunmap_atomic(page_virtual);
>  	}
> +#elif defined(CONFIG_ARM)
> +	unsigned long i;
> +
> +	for (i = 0; i < num_pages; i++)
> +		drm_cache_maint_page(pages[i], 0, PAGE_SIZE, DMA_TO_DEVICE,
> +				     dmac_map_area);
>  #else
>  	pr_err("Architecture has no drm_cache.c support\n");
>  	WARN_ON_ONCE(1);
> @@ -135,6 +190,12 @@ drm_flush_sg(struct sg_table *st)
>  
>  	if (wbinvd_on_all_cpus())
>  		pr_err("Timed out waiting for cache flush\n");
> +#elif defined(CONFIG_ARM)
> +	struct sg_page_iter sg_iter;
> +
> +	for_each_sg_page(st->sgl, &sg_iter, st->nents, 0)
> +		drm_cache_maint_page(sg_page_iter_page(&sg_iter), 0, PAGE_SIZE,
> +				     DMA_TO_DEVICE, dmac_map_area);
>  #else
>  	pr_err("Architecture has no drm_cache.c support\n");
>  	WARN_ON_ONCE(1);

NAK.  With this, you're "going under the covers" of the architecture and
using architecture private functions (dmac_map_area/dmac_unmap_area) in
a driver.

The hint is that dmac_map_area() and dmac_unmap_area() are not declared
in any asm/*.h header file, but in arch/arm/mm/dma.h, and they carry this
warning above their definition:

/*
 * These are private to the dma-mapping API.  Do not use directly.
 * Their sole purpose is to ensure that data held in the cache
 * is visible to DMA, or data written by DMA to system memory is
 * visible to the CPU.
 */

So no, this is not an acceptable approach.

Secondly, in light of spectre and meltdown, do we _really_ want to
export cache flushing to userspace in any case - these attacks rely
on being able to flush specific cache lines from the caches in order
to do the timing attacks (while leaving others in place.)

Currently, 32-bit ARM does not export such flushing capabilities to
userspace, which makes it very difficult (I'm not going to say
impossible) to get any working proof-of-concept program that even
illustrates the timing attack.  Exposing this functionality changes
that game, and means that we're much more open to these exploits.
(Some may say that you can flush cache lines by reading a large
enough buffer - I'm aware, I've tried that, the results are too
unreliable even for a simple attempt which doesn't involve crossing
privilege boundaries.)

Do you really need cacheable GPU buffers, or will write combining
buffers (as we use elsewhere such as etnaviv) suffice?  Please provide
some _real_ _world_ performance measurements that demonstrate that
there is a real need for this functionality.

-- 
RMK's Patch system: http://www.armlinux.org.uk/developer/patches/
FTTC broadband for 0.8mile line in suburbia: sync at 8.8Mbps down 630kbps up
According to speedtest.net: 8.21Mbps down 510kbps up

* Re: [PATCH 3/5] drm: add ARM64 flush implementations
@ 2018-01-24 13:14         ` Robin Murphy
  0 siblings, 0 replies; 33+ messages in thread
From: Robin Murphy @ 2018-01-24 13:14 UTC (permalink / raw)
  To: Russell King - ARM Linux
  Cc: dri-devel, Gurchetan Singh, thierry.reding, laurent.pinchart,
	daniel.vetter, linux-arm-kernel

On 24/01/18 12:36, Russell King - ARM Linux wrote:
> On Wed, Jan 24, 2018 at 12:00:59PM +0000, Robin Murphy wrote:
>> On 24/01/18 02:56, Gurchetan Singh wrote:
>>> This patch uses the __dma_map_area function to flush the cache
>>> on ARM64.
>>>
>>> v2: Don't use DMA API, call functions directly (Daniel)
>>>
>>> Signed-off-by: Gurchetan Singh <gurchetansingh@chromium.org>
>>> ---
>>>   drivers/gpu/drm/drm_cache.c | 13 +++++++++++++
>>>   1 file changed, 13 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c
>>> index 5124582451c6..250cdfbb711f 100644
>>> --- a/drivers/gpu/drm/drm_cache.c
>>> +++ b/drivers/gpu/drm/drm_cache.c
>>> @@ -159,6 +159,12 @@ drm_flush_pages(struct page *pages[], unsigned long num_pages)
>>>   	for (i = 0; i < num_pages; i++)
>>>   		drm_cache_maint_page(pages[i], 0, PAGE_SIZE, DMA_TO_DEVICE,
>>>   				     dmac_map_area);
>>> +#elif defined(CONFIG_ARM64)
>>> +	unsigned long i;
>>> +
>>> +	for (i = 0; i < num_pages; i++)
>>> +		__dma_map_area(phys_to_virt(page_to_phys(pages[i])), PAGE_SIZE,
>>> +			       DMA_TO_DEVICE);
>>
>> Note that this is not exactly equivalent to clflush - if it's at all
>> possible for a non-coherent GPU to write back to these pages and someone
>> somewhere expects to be able to read the updated data from a CPU, that's
>> going to be subtly broken.
>>
>> This also breaks building DRM as a module. And I doubt that either of the
>> two easy solutions to that are going to fly with the respective
>> maintainers...
>>
>> Given all the bodging around which seems to happen in DRM/ION/etc., it would
>> be really nice to pin down what exactly the shortcomings of the DMA API are
>> for these use-cases, and extend it to address them properly.
> 
> Aren't the cache flushing instructions available from userspace on
> ARM64?  (I've certainly used them in my spectre/meltdown PoC).  If
> userspace wants to flush before reading a GPU buffer that has been
> written to, why can't it use the already available cache maintenance
> instructions?  It's likely to be quicker, and it can target just the
> area of the buffer it wants to read.

Indeed, whilst the main reason we enable EL0 cache maintenance access is 
for JITs cleaning to PoU, it inherently makes PoC operations available 
as well. The semantics of drm_clflush_*() weren't immediately obvious, 
but if it is purely a sync point around userspace reading/writing GPU 
buffers directly, then what you suggest does sound ideal (and 
__dma_map_area is definitely wrong for the read case as above).

> 
> 32-bit ARM is a different matter, and I'll reply separately.
> 

Thanks,
Robin.

* Re: [PATCH 2/5] drm: add ARM flush implementation
  2018-01-24 12:45   ` Russell King - ARM Linux
  (?)
@ 2018-01-24 18:45   ` Gurchetan Singh
  2018-01-24 19:26       ` Russell King - ARM Linux
  -1 siblings, 1 reply; 33+ messages in thread
From: Gurchetan Singh @ 2018-01-24 18:45 UTC (permalink / raw)
  To: Russell King - ARM Linux
  Cc: jeffy.chen, ML dri-devel, Thierry Reding, Laurent Pinchart,
	Daniel Vetter, linux-arm-kernel


On Wed, Jan 24, 2018 at 4:45 AM, Russell King - ARM Linux <linux@armlinux.org.uk> wrote:

> On Tue, Jan 23, 2018 at 06:56:03PM -0800, Gurchetan Singh wrote:
> > The dma_cache_maint_page function is important for cache maintenance on
> > ARM32 (this was determined via testing).
> >
> > Since we desire direct control of the caches in drm_cache.c, let's make
> > a copy of the function, rename it and use it.
> >
> > v2: Don't use DMA API, call functions directly (Daniel)
> >
> > Signed-off-by: Gurchetan Singh <gurchetansingh@chromium.org>
> > ---
> >  drivers/gpu/drm/drm_cache.c | 61 +++++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 61 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c
> > index 89cdd32fe1f3..5124582451c6 100644
> > --- a/drivers/gpu/drm/drm_cache.c
> > +++ b/drivers/gpu/drm/drm_cache.c
> > @@ -69,6 +69,55 @@ static void drm_cache_flush_clflush(struct page *pages[],
> >  }
> >  #endif
> >
> > +#if defined(CONFIG_ARM)
> > +static void drm_cache_maint_page(struct page *page, unsigned long offset,
> > +                              size_t size, enum dma_data_direction dir,
> > +                              void (*op)(const void *, size_t, int))
> > +{
> > +     unsigned long pfn;
> > +     size_t left = size;
> > +
> > +     pfn = page_to_pfn(page) + offset / PAGE_SIZE;
> > +     offset %= PAGE_SIZE;
> > +
> > +     /*
> > +      * A single sg entry may refer to multiple physically contiguous
> > +      * pages.  But we still need to process highmem pages individually.
> > +      * If highmem is not configured then the bulk of this loop gets
> > +      * optimized out.
> > +      */
> > +     do {
> > +             size_t len = left;
> > +             void *vaddr;
> > +
> > +             page = pfn_to_page(pfn);
> > +
> > +             if (PageHighMem(page)) {
> > +                     if (len + offset > PAGE_SIZE)
> > +                             len = PAGE_SIZE - offset;
> > +
> > +                     if (cache_is_vipt_nonaliasing()) {
> > +                             vaddr = kmap_atomic(page);
> > +                             op(vaddr + offset, len, dir);
> > +                             kunmap_atomic(vaddr);
> > +                     } else {
> > +                             vaddr = kmap_high_get(page);
> > +                             if (vaddr) {
> > +                                     op(vaddr + offset, len, dir);
> > +                                     kunmap_high(page);
> > +                             }
> > +                     }
> > +             } else {
> > +                     vaddr = page_address(page) + offset;
> > +                     op(vaddr, len, dir);
> > +             }
> > +             offset = 0;
> > +             pfn++;
> > +             left -= len;
> > +     } while (left);
> > +}
> > +#endif
> > +
> >  /**
> >   * drm_flush_pages - Flush dcache lines of a set of pages.
> >   * @pages: List of pages to be flushed.
> > @@ -104,6 +153,12 @@ drm_flush_pages(struct page *pages[], unsigned long num_pages)
> >                                  (unsigned long)page_virtual + PAGE_SIZE);
> >               kunmap_atomic(page_virtual);
> >       }
> > +#elif defined(CONFIG_ARM)
> > +     unsigned long i;
> > +
> > +     for (i = 0; i < num_pages; i++)
> > +             drm_cache_maint_page(pages[i], 0, PAGE_SIZE, DMA_TO_DEVICE,
> > +                                  dmac_map_area);
> >  #else
> >       pr_err("Architecture has no drm_cache.c support\n");
> >       WARN_ON_ONCE(1);
> > @@ -135,6 +190,12 @@ drm_flush_sg(struct sg_table *st)
> >
> >       if (wbinvd_on_all_cpus())
> >               pr_err("Timed out waiting for cache flush\n");
> > +#elif defined(CONFIG_ARM)
> > +     struct sg_page_iter sg_iter;
> > +
> > +     for_each_sg_page(st->sgl, &sg_iter, st->nents, 0)
> > +             drm_cache_maint_page(sg_page_iter_page(&sg_iter), 0, PAGE_SIZE,
> > +                                  DMA_TO_DEVICE, dmac_map_area);
> >  #else
> >       pr_err("Architecture has no drm_cache.c support\n");
> >       WARN_ON_ONCE(1);
>
> NAK.  With this, you're "going under the covers" of the architecture and
> using architecture private functions (dmac_map_area/dmac_unmap_area) in
> a driver.
>
> The hint is that dmac_map_area() and dmac_unmap_area() are not declared
> in any asm/*.h header file, but in arch/arm/mm/dma.h, and they carry this
> warning above their definition:
>
> /*
>  * These are private to the dma-mapping API.  Do not use directly.
>  * Their sole purpose is to ensure that data held in the cache
>  * is visible to DMA, or data written by DMA to system memory is
>  * visible to the CPU.
>  */
>

My preference is also to use the dma-mapping API (version 1 of this
patchset), because:

1) the warnings in these headers
2) this approach entails code duplication

However, I've received feedback that the dma-mapping API is not desirable
for DRM.  Given your disapproval of the direct-call approach as well, I
think the dma-mapping API is the way to go.


> So no, this is not an acceptable approach.
>
> Secondly, in light of spectre and meltdown, do we _really_ want to
> export cache flushing to userspace in any case - these attacks rely
> on being able to flush specific cache lines from the caches in order
> to do the timing attacks (while leaving others in place.)

> Currently, 32-bit ARM does not export such flushing capabilities to
> userspace, which makes it very difficult (I'm not going to say
> impossible) to get any working proof-of-concept program that even
> illustrates the timing attack.  Exposing this functionality changes
> that game, and means that we're much more open to these exploits.
> (Some may say that you can flush cache lines by reading a large
> enough buffer - I'm aware, I've tried that, the results are too
> unreliable even for a simple attempt which doesn't involve crossing
> privilege boundaries.)
>

Will using the DMA API (dma_sync_single_for_device /
dma_sync_sg_for_device) mitigate your Meltdown / Spectre concerns in any
way?


> Do you really need cacheable GPU buffers, or will write combining
> buffers (as we use elsewhere such as etnaviv) suffice?  Please provide
> some _real_ _world_ performance measurements that demonstrate that
> there is a real need for this functionality.


My desire is for the vgem driver to work correctly on ARM, which requires
cache flushing.  The mappings vgem itself creates are write combine.  The
issue is that pages retrieved on the ARM architecture usually have to be
flushed before they can be used (see rockchip_gem_get_pages /
tegra_bo_get_pages).  This patch set attempts to do the flushing in an
architecture-independent manner (since vgem is intended to work on ARM / x86).

There is some interest in cache-able DRM buffers (though, again, this
patchset is not about that).  Renderscript accesses are very slow on ARM
and we keep shadow buffers to improve performance (see crrev.com/602736).
Jeffy has done some tests with memcpys in our camera stack that show
improvements (with caching --> 4 to 7 ms, without caching --> 20 to 90 ms).
However, I do agree Spectre complicates things.


> --
> RMK's Patch system: http://www.armlinux.org.uk/developer/patches/
> FTTC broadband for 0.8mile line in suburbia: sync at 8.8Mbps down 630kbps
> up
> According to speedtest.net: 8.21Mbps down 510kbps up
>


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/5] drm: add ARM flush implementation
@ 2018-01-24 19:26       ` Russell King - ARM Linux
  0 siblings, 0 replies; 33+ messages in thread
From: Russell King - ARM Linux @ 2018-01-24 19:26 UTC (permalink / raw)
  To: Gurchetan Singh
  Cc: heiko, jeffy.chen, ML dri-devel, Thierry Reding,
	Laurent Pinchart, Daniel Vetter, linux-arm-kernel

On Wed, Jan 24, 2018 at 10:45:28AM -0800, Gurchetan Singh wrote:
> On Wed, Jan 24, 2018 at 4:45 AM, Russell King - ARM Linux <
> linux@armlinux.org.uk> wrote:
> > So no, this is not an acceptable approach.
> >
> > Secondly, in light of spectre and meltdown, do we _really_ want to
> > export cache flushing to userspace in any case - these attacks rely
> > on being able to flush specific cache lines from the caches in order
> > to do the timing attacks (while leaving others in place.)
> 
> > Currently, 32-bit ARM does not export such flushing capabilities to
> > userspace, which makes it very difficult (I'm not going to say
> > impossible) to get any working proof-of-code program that even
> > illustrates the timing attack.  Exposing this functionality changes
> > that game, and means that we're much more open to these exploits.
> > (Some may say that you can flush cache lines by reading a large
> > enough buffer - I'm aware, I've tried that, the results are too
> > unreliable even for a simple attempt which doesn't involve crossing
> > privilege boundaries.)
> >
> 
> Will using the DMA API (dma_sync_single_for_device /
> dma_sync_sg_for_device) mitigate your Meltdown / Spectre concerns in any
> way?

I see no point in answering that question based on what you've written
below (see below for why).

> > Do you really need cacheable GPU buffers, or will write combining
> > buffers (as we use elsewhere such as etnaviv) suffice?  Please provide
> > some _real_ _world_ performance measurements that demonstrate that
> > there is a real need for this functionality.
> 
> 
> My desire is for the vgem driver to work correctly on ARM, which requires
> cache flushing.  The mappings vgem itself creates are write combine.

If the pages are mapped write-combine, they are by definition *not*
cacheable, so there should be no cache flushing required.

> The
> issue is the pages retrieved on ARM architecture usually have to be flushed
> before they can be used (see rockchip_gem_get_pages / tegra_bo_get_pages).
> This patch set attempts to do the flushing in an architecture independent
> manner (since vgem is intended to work on ARM / x86).

I see rockchip_gem_get_pages() using shmem_read_mapping_page() to get
the pages.  That's more or less fine, we do that on Etnaviv too.

(Side note: provided the pages are not coming from lowmem, as mapping
lowmem pages are mapped cacheable, and if you also map them elsewhere
as write-combine, you're stepping into some potential cache attribute
issues.)

How we deal with this in Etnaviv is to use dma_map_sg() after we get
the pages - see

  etnaviv_gem_get_pages(), which calls the memory specific .get_pages
    method, and goes on to call etnaviv_gem_scatter_map().

There's no need for the faking up of a SG table that way, just let
dma_map_sg() do whatever it needs to do.  This means you're not abusing
the DMA API, and if you have a system IOMMU in the way, as a bonus that
gets setup for you.
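
For illustration only, the pattern above boils down to something like
the following (a minimal sketch, assuming a driver that already has a
struct device *dev and an array of shmem-backed pages; names are
placeholders, not actual Etnaviv code):

  #include <linux/dma-mapping.h>
  #include <linux/scatterlist.h>

  static int map_pages_for_device(struct device *dev, struct page **pages,
                                  unsigned int n_pages, struct sg_table *sgt)
  {
          int ret;

          ret = sg_alloc_table_from_pages(sgt, pages, n_pages, 0,
                                          (unsigned long)n_pages << PAGE_SHIFT,
                                          GFP_KERNEL);
          if (ret)
                  return ret;

          /* dma_map_sg() does the arch-appropriate cache maintenance. */
          if (!dma_map_sg(dev, sgt->sgl, sgt->nents, DMA_BIDIRECTIONAL)) {
                  sg_free_table(sgt);
                  return -ENOMEM;
          }

          return 0;
  }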

> There is some interest in cache-able DRM buffers (though, again, this
> patchset is not about that).  Renderscript accesses are very slow on ARM
> and we keep shadow buffers to improve performance (see
> crrev.com/602736).

404.

> Jeffy
> has done some tests with memcpys in our camera stack that show
> improvements (with caching --> 4 to 7 ms, without caching --> 20 to 90ms).
> However, I do agree Spectre complicates things.

At the moment, on 32-bit ARM, we have very little mitigation work for
Spectre beyond the generic kernel work that is going on at the moment
for all architectures, so I really do not want to introduce anything
that makes 32-bit ARM more easily vulnerable to attack.  That may
change in the future (sorry, I can't say when), but right now I don't
think we have much of an option.

-- 
RMK's Patch system: http://www.armlinux.org.uk/developer/patches/
FTTC broadband for 0.8mile line in suburbia: sync at 8.8Mbps down 630kbps up
According to speedtest.net: 8.21Mbps down 510kbps up

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/5] drm: add ARM flush implementation
@ 2018-01-24 23:43         ` Gurchetan Singh
  0 siblings, 0 replies; 33+ messages in thread
From: Gurchetan Singh @ 2018-01-24 23:43 UTC (permalink / raw)
  To: Russell King - ARM Linux
  Cc: Jeffy Chen, ML dri-devel, Thierry Reding, Laurent Pinchart,
	Daniel Vetter, linux-arm-kernel

On Wed, Jan 24, 2018 at 11:26 AM, Russell King - ARM Linux
<linux@armlinux.org.uk> wrote:
> On Wed, Jan 24, 2018 at 10:45:28AM -0800, Gurchetan Singh wrote:
>> On Wed, Jan 24, 2018 at 4:45 AM, Russell King - ARM Linux <
>> linux@armlinux.org.uk> wrote:
>> > So no, this is not an acceptable approach.
>> >
>> > Secondly, in light of spectre and meltdown, do we _really_ want to
>> > export cache flushing to userspace in any case - these attacks rely
>> > on being able to flush specific cache lines from the caches in order
>> > to do the timing attacks (while leaving others in place.)
>>
>> > Currently, 32-bit ARM does not export such flushing capabilities to
>> > userspace, which makes it very difficult (I'm not going to say
>> > impossible) to get any working proof-of-concept program that even
>> > illustrates the timing attack.  Exposing this functionality changes
>> > that game, and means that we're much more open to these exploits.
>> > (Some may say that you can flush cache lines by reading a large
>> > enough buffer - I'm aware, I've tried that, the results are too
>> > unreliable even for a simple attempt which doesn't involve crossing
>> > privilege boundaries.)
>> >
>>
>> Will using the DMA API (dma_sync_single_for_device /
>> dma_sync_sg_for_device) mitigate your Meltdown / Spectre concerns in any
>> way?
>
> I see no point in answering that question based on what you've written
> below (see below for why).
>
>> > Do you really need cacheable GPU buffers, or will write combining
>> > buffers (as we use elsewhere such as etnaviv) suffice?  Please provide
>> > some _real_ _world_ performance measurements that demonstrate that
>> > there is a real need for this functionality.
>>
>>
>> My desire is for the vgem driver to work correctly on ARM, which requires
>> cache flushing.  The mappings vgem itself creates are write combine.
>
> If the pages are mapped write-combine, they are by definition *not*
> cacheable, so there should be no cache flushing required.
>
>> The
>> issue is the pages retrieved on ARM architecture usually have to be flushed
>> before they can be used (see rockchip_gem_get_pages / tegra_bo_get_pages).
>> This patch set attempts to do the flushing in an architecture independent
>> manner (since vgem is intended to work on ARM / x86).
>
> I see rockchip_gem_get_pages() using shmem_read_mapping_page() to get
> the pages.  That's more or less fine, we do that on Etnaviv too.
>
> (Side note: provided the pages are not coming from lowmem, as mapping
> lowmem pages are mapped cacheable, and if you also map them elsewhere
> as write-combine, you're stepping into some potential cache attribute
> issues.)
>
> How we deal with this in Etnaviv is to use dma_map_sg() after we get
> the pages - see
>
>   etnaviv_gem_get_pages(), which calls the memory specific .get_pages
>     method, and goes on to call etnaviv_gem_scatter_map().
>
> There's no need for the faking up of a SG table that way, just let
> dma_map_sg() do whatever it needs to do.  This means you're not abusing
> the DMA API, and if you have a system IOMMU in the way, as a bonus that
> gets setup for you.

Okay, so the recommended solution to my problem (vgem doesn't work on
ARM) is to:

1) Create vgem_get_pages / vgem_put_pages functions.  vgem_get_pages
will be called from vgem_gem_fault (since currently vgem does delayed
allocation) and vgem_pin_pages.
2) Call dma_map_sg() / dma_unmap_sg in vgem_get_pages / vgem_put_pages
if the architecture is ARM.

That works for me, if that works for everyone else.
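
For concreteness, a hypothetical sketch of step 2 (the helper name and
the bo layout are illustrative, not existing vgem code; it assumes a bo
that embeds a struct drm_gem_object "base" and keeps a struct sg_table
pointer called "table"):

  static int vgem_map_pages(struct drm_vgem_gem_object *bo,
                            struct page **pages)
  {
          struct sg_table *sgt;

          sgt = drm_prime_pages_to_sg(pages, bo->base.size >> PAGE_SHIFT);
          if (IS_ERR(sgt))
                  return PTR_ERR(sgt);

          /*
           * dma_map_sg() performs whatever cache maintenance the
           * architecture requires; no open-coded flushing needed.
           */
          if (!dma_map_sg(bo->base.dev->dev, sgt->sgl, sgt->nents,
                          DMA_BIDIRECTIONAL)) {
                  sg_free_table(sgt);
                  kfree(sgt);
                  return -ENOMEM;
          }

          bo->table = sgt;
          return 0;
  }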

>> There is some interest in cache-able DRM buffers (though, again, this
>> patchset is not about that).  Renderscript accesses are very slow on ARM
>> and we keep shadow buffers to improve performance (see
>> crrev.com/602736).
>
> 404.

Oops, the correct URL is crrev.com/c/602736.

>> Jeffy
>> has done some tests with memcpys in our camera stack that show
>> improvements (with caching --> 4 to 7 ms, without caching --> 20 to 90ms).
>> However, I do agree Spectre complicates things.
>
> At the moment, on 32-bit ARM, we have very little mitigation work for
> Spectre beyond the generic kernel work that is going on at the moment
> for all architectures, so I really do not want to introduce anything
> that makes 32-bit ARM more easily vulnerable to attack.

Fair enough.

> That may change in the future (sorry, I can't say when), but right now I don't
> think we have much of an option.
>
> --
> RMK's Patch system: http://www.armlinux.org.uk/developer/patches/
> FTTC broadband for 0.8mile line in suburbia: sync at 8.8Mbps down 630kbps up
> According to speedtest.net: 8.21Mbps down 510kbps up

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/5] drm: add ARM flush implementation
@ 2018-01-25 14:06         ` Thierry Reding
  0 siblings, 0 replies; 33+ messages in thread
From: Thierry Reding @ 2018-01-25 14:06 UTC (permalink / raw)
  To: Russell King - ARM Linux
  Cc: jeffy.chen, ML dri-devel, Gurchetan Singh, Laurent Pinchart,
	Daniel Vetter, linux-arm-kernel


On Wed, Jan 24, 2018 at 07:26:11PM +0000, Russell King - ARM Linux wrote:
> On Wed, Jan 24, 2018 at 10:45:28AM -0800, Gurchetan Singh wrote:
> > On Wed, Jan 24, 2018 at 4:45 AM, Russell King - ARM Linux <
> > linux@armlinux.org.uk> wrote:
> > > So no, this is not an acceptable approach.
> > >
> > > Secondly, in light of spectre and meltdown, do we _really_ want to
> > > export cache flushing to userspace in any case - these attacks rely
> > > on being able to flush specific cache lines from the caches in order
> > > to do the timing attacks (while leaving others in place.)
> > 
> > > Currently, 32-bit ARM does not export such flushing capabilities to
> > > userspace, which makes it very difficult (I'm not going to say
> > > impossible) to get any working proof-of-concept program that even
> > > illustrates the timing attack.  Exposing this functionality changes
> > > that game, and means that we're much more open to these exploits.
> > > (Some may say that you can flush cache lines by reading a large
> > > enough buffer - I'm aware, I've tried that, the results are too
> > > unreliable even for a simple attempt which doesn't involve crossing
> > > privilege boundaries.)
> > >
> > 
> > Will using the DMA API (dma_sync_single_for_device /
> > dma_sync_sg_for_device) mitigate your Meltdown / Spectre concerns in any
> > way?
> 
> I see no point in answering that question based on what you've written
> below (see below for why).
> 
> > > Do you really need cacheable GPU buffers, or will write combining
> > > buffers (as we use elsewhere such as etnaviv) suffice?  Please provide
> > > some _real_ _world_ performance measurements that demonstrate that
> > > there is a real need for this functionality.
> > 
> > 
> > My desire is for the vgem driver to work correctly on ARM, which requires
> > cache flushing.  The mappings vgem itself creates are write combine.
> 
> If the pages are mapped write-combine, they are by definition *not*
> cacheable, so there should be no cache flushing required.
> 
> > The
> > issue is the pages retrieved on ARM architecture usually have to be flushed
> > before they can be used (see rockchip_gem_get_pages / tegra_bo_get_pages).
> > This patch set attempts to do the flushing in an architecture independent
> > manner (since vgem is intended to work on ARM / x86).
> 
> I see rockchip_gem_get_pages() using shmem_read_mapping_page() to get
> the pages.  That's more or less fine, we do that on Etnaviv too.
> 
> (Side note: provided the pages are not coming from lowmem, as mapping
> lowmem pages are mapped cacheable, and if you also map them elsewhere
> as write-combine, you're stepping into some potential cache attribute
> issues.)
> 
> How we deal with this in Etnaviv is to use dma_map_sg() after we get
> the pages - see
> 
>   etnaviv_gem_get_pages(), which calls the memory specific .get_pages
>     method, and goes on to call etnaviv_gem_scatter_map().

I think I'm to blame for this. Back at the time the patch was based on
my incomplete understanding of the DMA API. It's also possible that it
wasn't working at the time because the DMA/IOMMU glue wasn't quite the
same as it is today. I have a vague recollection that I tried using
dma_map_sg() and it was creating a second IOVA mapping (in addition to
the one we explicitly create in the driver with the IOMMU API).

However, that's no longer happening today, so I ended up doing something
very similar to etnaviv. I've got the below patch queued for v4.17 and I
think the same should work for both Rockchip and VGEM.

Thierry

--- >8 ---
From 0f83d1aefcd0ca49c88d483a4161e3a02b5d1f32 Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Wed, 13 Dec 2017 12:22:48 +0100
Subject: [PATCH] drm/tegra: gem: Map pages via the DMA API

When allocating pages, map them with the DMA API in order to invalidate
caches. This is the correct usage of the API and works just as well as
faking up the SG table and using the dma_sync_sg_for_device() function.

Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/drm/tegra/gem.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/tegra/gem.c b/drivers/gpu/drm/tegra/gem.c
index 1bc5c6d1e5b5..8ab6057808e6 100644
--- a/drivers/gpu/drm/tegra/gem.c
+++ b/drivers/gpu/drm/tegra/gem.c
@@ -203,6 +203,8 @@ static struct tegra_bo *tegra_bo_alloc_object(struct drm_device *drm,
 static void tegra_bo_free(struct drm_device *drm, struct tegra_bo *bo)
 {
 	if (bo->pages) {
+		dma_unmap_sg(drm->dev, bo->sgt->sgl, bo->sgt->nents,
+			     DMA_BIDIRECTIONAL);
 		drm_gem_put_pages(&bo->gem, bo->pages, true, true);
 		sg_free_table(bo->sgt);
 		kfree(bo->sgt);
@@ -213,8 +215,7 @@ static void tegra_bo_free(struct drm_device *drm, struct tegra_bo *bo)
 
 static int tegra_bo_get_pages(struct drm_device *drm, struct tegra_bo *bo)
 {
-	struct scatterlist *s;
-	unsigned int i;
+	int err;
 
 	bo->pages = drm_gem_get_pages(&bo->gem);
 	if (IS_ERR(bo->pages))
@@ -223,27 +224,26 @@ static int tegra_bo_get_pages(struct drm_device *drm, struct tegra_bo *bo)
 	bo->num_pages = bo->gem.size >> PAGE_SHIFT;
 
 	bo->sgt = drm_prime_pages_to_sg(bo->pages, bo->num_pages);
-	if (IS_ERR(bo->sgt))
+	if (IS_ERR(bo->sgt)) {
+		err = PTR_ERR(bo->sgt);
 		goto put_pages;
+	}
 
-	/*
-	 * Fake up the SG table so that dma_sync_sg_for_device() can be used
-	 * to flush the pages associated with it.
-	 *
-	 * TODO: Replace this by drm_clflash_sg() once it can be implemented
-	 * without relying on symbols that are not exported.
-	 */
-	for_each_sg(bo->sgt->sgl, s, bo->sgt->nents, i)
-		sg_dma_address(s) = sg_phys(s);
-
-	dma_sync_sg_for_device(drm->dev, bo->sgt->sgl, bo->sgt->nents,
-			       DMA_TO_DEVICE);
+	err = dma_map_sg(drm->dev, bo->sgt->sgl, bo->sgt->nents,
+			 DMA_BIDIRECTIONAL);
+	if (err == 0) {
+		err = -EFAULT;
+		goto free_sgt;
+	}
 
 	return 0;
 
+free_sgt:
+	sg_free_table(bo->sgt);
+	kfree(bo->sgt);
 put_pages:
 	drm_gem_put_pages(&bo->gem, bo->pages, false, false);
-	return PTR_ERR(bo->sgt);
+	return err;
 }
 
 static int tegra_bo_alloc(struct drm_device *drm, struct tegra_bo *bo)
-- 
2.15.1


^ permalink raw reply related	[flat|nested] 33+ messages in thread

* Re: [PATCH 3/5] drm: add ARM64 flush implementations
@ 2018-01-30  9:53       ` Daniel Vetter
  0 siblings, 0 replies; 33+ messages in thread
From: Daniel Vetter @ 2018-01-30  9:53 UTC (permalink / raw)
  To: Robin Murphy
  Cc: dri-devel, Gurchetan Singh, thierry.reding, laurent.pinchart,
	daniel.vetter, linux-arm-kernel

On Wed, Jan 24, 2018 at 12:00:59PM +0000, Robin Murphy wrote:
> On 24/01/18 02:56, Gurchetan Singh wrote:
> > This patch uses the __dma_map_area function to flush the cache
> > on ARM64.
> > 
> > v2: Don't use DMA API, call functions directly (Daniel)
> > 
> > Signed-off-by: Gurchetan Singh <gurchetansingh@chromium.org>
> > ---
> >   drivers/gpu/drm/drm_cache.c | 13 +++++++++++++
> >   1 file changed, 13 insertions(+)
> > 
> > diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c
> > index 5124582451c6..250cdfbb711f 100644
> > --- a/drivers/gpu/drm/drm_cache.c
> > +++ b/drivers/gpu/drm/drm_cache.c
> > @@ -159,6 +159,12 @@ drm_flush_pages(struct page *pages[], unsigned long num_pages)
> >   	for (i = 0; i < num_pages; i++)
> >   		drm_cache_maint_page(pages[i], 0, PAGE_SIZE, DMA_TO_DEVICE,
> >   				     dmac_map_area);
> > +#elif defined(CONFIG_ARM64)
> > +	unsigned long i;
> > +
> > +	for (i = 0; i < num_pages; i++)
> > +		__dma_map_area(phys_to_virt(page_to_phys(pages[i])), PAGE_SIZE,
> > +			       DMA_TO_DEVICE);
> 
> Note that this is not exactly equivalent to clflush - if it's at all
> possible for a non-coherent GPU to write back to these pages and someone
> somewhere expects to be able to read the updated data from a CPU, that's
> going to be subtly broken.
> 
> This also breaks building DRM as a module. And I doubt that either of the
> two easy solutions to that are going to fly with the respective
> maintainers...
> 
> Given all the bodging around which seems to happen in DRM/ION/etc., it would
> be really nice to pin down what exactly the shortcomings of the DMA API are
> for these use-cases, and extend it to address them properly.

tldr; a dma-buf exporter more-or-less has to act like a dma-api
architecture implementation. Which means flushing stuff at the right time.
Going through the dma-api means we need to pass around a struct device *
where none is needed, which seems silly. The original patches did add that
dummy struct device *, but after digging around in the actual
implementations we've noticed that there's no need for them.
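
For reference, the dma-api route looks roughly like the sketch below (a
hypothetical exporter; "struct exporter_buf" and its fields are made up
for illustration), and it's exactly the dma_sync_sg_* pair that drags
the struct device * into the picture:

  static int exporter_begin_cpu_access(struct dma_buf *dmabuf,
                                       enum dma_data_direction dir)
  {
          struct exporter_buf *buf = dmabuf->priv;

          /* Make device writes visible before the CPU reads the pages. */
          dma_sync_sg_for_cpu(buf->dev, buf->sgt->sgl, buf->sgt->nents, dir);
          return 0;
  }

  static int exporter_end_cpu_access(struct dma_buf *dmabuf,
                                     enum dma_data_direction dir)
  {
          struct exporter_buf *buf = dmabuf->priv;

          /* Write back CPU writes before the device touches the pages. */
          dma_sync_sg_for_device(buf->dev, buf->sgt->sgl, buf->sgt->nents, dir);
          return 0;
  }

These would be wired into struct dma_buf_ops as .begin_cpu_access and
.end_cpu_access.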

Ofc you can question whether gpu drivers really need to noodle around in
such platform details, but given that all of them do that (all = those
that implement rendering, not just display) I'm just accepting that as a
fact of life. It's definitely unrealistic to demand those all get fixed,
even if the end result would be more maintainable.

We can ofc postpone this entire discussion by mandating that all shared
gpu buffers on ARM32 must be wc mapped by everyone. But at least on some
x86 machines (it's a function of how big your caches are and where your
gpu sits) cached access is actually faster for upload/download buffers.
Ofc since the dma-api tries to hide all this the wc vs. cached assumptions
are all implicit in dma-buf, which makes this all lots of fun.
-Daniel

> 
> Robin.
> 
> >   #else
> >   	pr_err("Architecture has no drm_cache.c support\n");
> >   	WARN_ON_ONCE(1);
> > @@ -196,6 +202,13 @@ drm_flush_sg(struct sg_table *st)
> >   	for_each_sg_page(st->sgl, &sg_iter, st->nents, 0)
> >   		drm_cache_maint_page(sg_page_iter_page(&sg_iter), 0, PAGE_SIZE,
> >   				     DMA_TO_DEVICE, dmac_map_area);
> > +#elif defined(CONFIG_ARM64)
> > +	int i;
> > +	struct scatterlist *sg;
> > +
> > +	for_each_sg(st->sgl, sg, st->nents, i)
> > +		__dma_map_area(phys_to_virt(sg_phys(sg)), sg->length,
> > +			       DMA_TO_DEVICE);
> >   #else
> >   	pr_err("Architecture has no drm_cache.c support\n");
> >   	WARN_ON_ONCE(1);
> > 

-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/5] drm: add ARM flush implementation
@ 2018-01-30 10:14   ` Daniel Vetter
  0 siblings, 0 replies; 33+ messages in thread
From: Daniel Vetter @ 2018-01-30 10:14 UTC (permalink / raw)
  To: Gurchetan Singh
  Cc: linux-arm-kernel, daniel.vetter, thierry.reding,
	laurent.pinchart, dri-devel

On Tue, Jan 23, 2018 at 06:56:03PM -0800, Gurchetan Singh wrote:
> The dma_cache_maint_page function is important for cache maintenance on
> ARM32 (this was determined via testing).
> 
> Since we desire direct control of the caches in drm_cache.c, let's make
> a copy of the function, rename it and use it.
> 
> v2: Don't use DMA API, call functions directly (Daniel)
> 
> Signed-off-by: Gurchetan Singh <gurchetansingh@chromium.org>

fwiw, in principle, this approach has my Ack from the drm side.

But if we can't get any agreement from the arch side then I guess we'll
just have to suck it up and mandate that any dma-buf on ARM32 must be wc
mapped, always. Not sure that's a good idea either, but should at least
get things moving.
-Daniel

> ---
>  drivers/gpu/drm/drm_cache.c | 61 +++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 61 insertions(+)
> 
> diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c
> index 89cdd32fe1f3..5124582451c6 100644
> --- a/drivers/gpu/drm/drm_cache.c
> +++ b/drivers/gpu/drm/drm_cache.c
> @@ -69,6 +69,55 @@ static void drm_cache_flush_clflush(struct page *pages[],
>  }
>  #endif
>  
> +#if defined(CONFIG_ARM)
> +static void drm_cache_maint_page(struct page *page, unsigned long offset,
> +				 size_t size, enum dma_data_direction dir,
> +				 void (*op)(const void *, size_t, int))
> +{
> +	unsigned long pfn;
> +	size_t left = size;
> +
> +	pfn = page_to_pfn(page) + offset / PAGE_SIZE;
> +	offset %= PAGE_SIZE;
> +
> +	/*
> +	 * A single sg entry may refer to multiple physically contiguous
> +	 * pages.  But we still need to process highmem pages individually.
> +	 * If highmem is not configured then the bulk of this loop gets
> +	 * optimized out.
> +	 */
> +	do {
> +		size_t len = left;
> +		void *vaddr;
> +
> +		page = pfn_to_page(pfn);
> +
> +		if (PageHighMem(page)) {
> +			if (len + offset > PAGE_SIZE)
> +				len = PAGE_SIZE - offset;
> +
> +			if (cache_is_vipt_nonaliasing()) {
> +				vaddr = kmap_atomic(page);
> +				op(vaddr + offset, len, dir);
> +				kunmap_atomic(vaddr);
> +			} else {
> +				vaddr = kmap_high_get(page);
> +				if (vaddr) {
> +					op(vaddr + offset, len, dir);
> +					kunmap_high(page);
> +				}
> +			}
> +		} else {
> +			vaddr = page_address(page) + offset;
> +			op(vaddr, len, dir);
> +		}
> +		offset = 0;
> +		pfn++;
> +		left -= len;
> +	} while (left);
> +}
> +#endif
> +
>  /**
>   * drm_flush_pages - Flush dcache lines of a set of pages.
>   * @pages: List of pages to be flushed.
> @@ -104,6 +153,12 @@ drm_flush_pages(struct page *pages[], unsigned long num_pages)
>  				   (unsigned long)page_virtual + PAGE_SIZE);
>  		kunmap_atomic(page_virtual);
>  	}
> +#elif defined(CONFIG_ARM)
> +	unsigned long i;
> +
> +	for (i = 0; i < num_pages; i++)
> +		drm_cache_maint_page(pages[i], 0, PAGE_SIZE, DMA_TO_DEVICE,
> +				     dmac_map_area);
>  #else
>  	pr_err("Architecture has no drm_cache.c support\n");
>  	WARN_ON_ONCE(1);
> @@ -135,6 +190,12 @@ drm_flush_sg(struct sg_table *st)
>  
>  	if (wbinvd_on_all_cpus())
>  		pr_err("Timed out waiting for cache flush\n");
> +#elif defined(CONFIG_ARM)
> +	struct sg_page_iter sg_iter;
> +
> +	for_each_sg_page(st->sgl, &sg_iter, st->nents, 0)
> +		drm_cache_maint_page(sg_page_iter_page(&sg_iter), 0, PAGE_SIZE,
> +				     DMA_TO_DEVICE, dmac_map_area);
>  #else
>  	pr_err("Architecture has no drm_cache.c support\n");
>  	WARN_ON_ONCE(1);
> -- 
> 2.13.5
> 

-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 2/5] drm: add ARM flush implementation
  2018-01-30 10:14   ` Daniel Vetter
@ 2018-01-30 11:31     ` Russell King - ARM Linux
  -1 siblings, 0 replies; 33+ messages in thread
From: Russell King - ARM Linux @ 2018-01-30 11:31 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, Jan 30, 2018 at 11:14:36AM +0100, Daniel Vetter wrote:
> On Tue, Jan 23, 2018 at 06:56:03PM -0800, Gurchetan Singh wrote:
> > The dma_cache_maint_page function is important for cache maintenance on
> > ARM32 (this was determined via testing).
> > 
> > Since we desire direct control of the caches in drm_cache.c, let's make
> > a copy of the function, rename it and use it.
> > 
> > v2: Don't use DMA API, call functions directly (Daniel)
> > 
> > Signed-off-by: Gurchetan Singh <gurchetansingh@chromium.org>
> 
> fwiw, in principle, this approach has my Ack from the drm side.
> 
> But if we can't get any agreement from the arch side then I guess we'll
> just have to suck it up and mandate that any dma-buf on ARM32 must be wc
> mapped, always. Not sure that's a good idea either, but should at least
> get things moving.

Let me expand on my objection, as I find your tone to be problematical
here.

Patch 2 (which is the earliest patch in this series) makes use of
facilities such as dmac_map_area(), and those are defined as macros in
arch/arm/mm/mm.h.  I see no way that drm_cache.c can pick up on that
_unless_ there's another patch (maybe that's patch 1) which moves the
definition.

dmac_map_area() is non-trivial to export (it's not a function, it's a
macro which either points to a function or a function pointer structure
member), so it's likely that this patch also breaks building DRM as a
module.

We've been here before with drivers abusing the architecture private APIs,
which is _exactly_ why dmac_map_area() is defined in arch/arm/mm/mm.h and not
in some kernel-wide asm header file - it's an implementation detail of the
architectures DMA API that drivers have no business mucking about with.

I've always said if the interfaces don't do what you want, talk to
architecture people, don't go poking about in architecture private parts
of the kernel and start abusing stuff.  I say this because years ago, we
had people doing _exactly_ that for the older virtually cached ARMs.  Then
ARMv6 came along, which needed an entire revamp of the architecture cache
interfaces, and lo and behold, drivers got broken because of this kind of
abuse.  IOW, abusing interfaces makes kernel maintenance harder.

For example, interfaces designed for flushing the cache when page tables
get torn down were abused in drivers to flush data for DMA or coherency
purposes, which meant that on ARMv6, where we no longer needed to flush
for page table maintenance, suddenly the interfaces that drivers were
using became no-ops.

In this case, dmac_map_area() exists to perform any cache maintenance
for the kernel view of that memory required for a non-coherent DMA
mapping.  What it does depends on the processor and the requested
DMA_xxx type - it _may_ invalidate (discard) or clean (writeback but
leave in the cache) cache lines, or do nothing.

dmac_unmap_area() has the same issues - what it does depends on what
operation is being requested and what the processor requires to
achieve coherency.

The two functions are designed to work _together_, dmac_map_area()
before the DMA operation and dmac_unmap_area() after the DMA operation.
Only when they are both used together do you get the correct behaviour.

These functions are only guaranteed to operate on the kernel mapping
passed in as virtual addresses to the dmac_* functions.  They make no
guarantees about other mappings of the same memory elsewhere in the
system, which, depending on the architecture of the caches, may also
contain dirty cache lines (the same comment applies to the DMA API too.)
On certain cache architectures (VIPT) where colouring effects apply,
flushing the kernel mapping may not even be appropriate if the desired
effect is to flush data from a user mapping.

This is exactly why abusing APIs (like what is done in this patch) is
completely unacceptable from the architecture point of view - while
it may _appear_ to work, it may only work for one class of CPU or one
implementation.

Hence why the dmac_{un,}map_area() interfaces are not exported to
drivers.  You can't just abuse one of them.  They are a pair that
must be used together, and the DMA API knows that, and the DMA API
requirements ensure that happens.  It's not really surprising, these
functions were written to support the DMA API, and the DMA API is
the kernel-wide interface to these functions.

-- 
RMK's Patch system: http://www.armlinux.org.uk/developer/patches/
FTTC broadband for 0.8mile line in suburbia: sync at 8.8Mbps down 630kbps up
According to speedtest.net: 8.21Mbps down 510kbps up

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/5] drm: add ARM flush implementation
@ 2018-01-30 11:31     ` Russell King - ARM Linux
  0 siblings, 0 replies; 33+ messages in thread
From: Russell King - ARM Linux @ 2018-01-30 11:31 UTC (permalink / raw)
  To: Daniel Vetter
  Cc: dri-devel, Gurchetan Singh, thierry.reding, laurent.pinchart,
	daniel.vetter, linux-arm-kernel

On Tue, Jan 30, 2018 at 11:14:36AM +0100, Daniel Vetter wrote:
> On Tue, Jan 23, 2018 at 06:56:03PM -0800, Gurchetan Singh wrote:
> > The dma_cache_maint_page function is important for cache maintenance on
> > ARM32 (this was determined via testing).
> > 
> > Since we desire direct control of the caches in drm_cache.c, let's make
> > a copy of the function, rename it and use it.
> > 
> > v2: Don't use DMA API, call functions directly (Daniel)
> > 
> > Signed-off-by: Gurchetan Singh <gurchetansingh@chromium.org>
> 
> fwiw, in principle, this approach has my Ack from the drm side.
> 
> But if we can't get any agreement from the arch side then I guess we'll
> just have to suck it up and mandate that any dma-buf on ARM32 must be wc
> mapped, always. Not sure that's a good idea either, but should at least
> get things moving.

Let me expand on my objection, as I find your tone to be problematic
here.

Patch 2 (the earliest patch in this series) makes use of
facilities such as dmac_map_area(), and those are defined as macros in
arch/arm/mm/mm.h.  I see no way that drm_cache.c can pick up on that
_unless_ there's another patch (maybe that's patch 1) which moves the
definition.

dmac_map_area() is non-trivial to export (it's not a function, it's a
macro which either points to a function or a function pointer structure
member) so it's likely that this patch also breaks building DRM as a
module.
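
For reference, the definitions in arch/arm/mm/mm.h are roughly of this
shape (a from-memory sketch; the exact spelling in the tree may
differ):

        #ifdef MULTI_CACHE
        #define dmac_map_area   cpu_cache.dma_map_area
        #define dmac_unmap_area cpu_cache.dma_unmap_area
        #else
        #define dmac_map_area   __glue(_CACHE, _dma_map_area)
        #define dmac_unmap_area __glue(_CACHE, _dma_unmap_area)
        #endif

Neither form expands to a plain function symbol that could simply be
EXPORT_SYMBOL'd, which is why a modular drm_cache.c can't just link
against it.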

We've been here before with drivers abusing the architecture private APIs,
which is _exactly_ why dmac_map_area() is defined in arch/arm/mm/mm.h
and not in some kernel-wide asm header file - it's an implementation
detail of the architecture's DMA API that drivers have no business
mucking about with.

I've always said if the interfaces don't do what you want, talk to
architecture people, don't go poking about in architecture private parts
of the kernel and start abusing stuff.  I say this because years ago, we
had people doing _exactly_ that for the older virtually cached ARMs.  Then
ARMv6 came along, which needed an entire revamp of the architecture cache
interfaces, and lo and behold, drivers got broken because of this kind of
abuse.  IOW, abusing interfaces makes kernel maintenance harder.

For example, interfaces designed for flushing the cache when page tables
get torn down were abused in drivers to flush data for DMA or coherency
purposes, which meant that on ARMv6, where we no longer needed to flush
for page table maintenance, suddenly the interfaces that drivers were
using became no-ops.

In this case, dmac_map_area() exists to perform any cache maintenance
for the kernel view of that memory required for a non-coherent DMA
mapping.  What it does depends on the processor and the requested
DMA_xxx type - it _may_ invalidate (discard) or clean (writeback but
leave in the cache) cache lines, or do nothing.

dmac_unmap_area() has the same issues - what it does depends on what
operation is being requested and what the processor requires to
achieve coherency.

The two functions are designed to work _together_, dmac_map_area()
before the DMA operation and dmac_unmap_area() after the DMA operation.
Only when they are both used together do you get the correct behaviour.
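
To illustrate the pairing (a minimal sketch only - the buffer, size
and direction are made up for the example; this is not the exact
kernel code):

        void *vaddr = page_address(page);       /* kernel mapping */

        /* before the device touches the memory */
        dmac_map_area(vaddr, size, DMA_FROM_DEVICE);

        /* ... the DMA transfer runs here ... */

        /* after the device is done, before the CPU reads the data */
        dmac_unmap_area(vaddr, size, DMA_FROM_DEVICE);

Calling only the first half, as this patch does, leaves the second
half of the maintenance missing on CPUs that require it.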

These functions are only guaranteed to operate on the kernel mapping
passed in as virtual addresses to the dmac_* functions.  They make no
guarantees about other mappings of the same memory elsewhere in the
system, which, depending on the architecture of the caches, may also
contain dirty cache lines (the same comment applies to the DMA API too.)
On certain cache architectures (VIPT) where colouring effects apply,
flushing the kernel mapping may not even be appropriate if the desired
effect is to flush data from a user mapping.

This is exactly why abusing APIs (like what is done in this patch) is
completely unacceptable from the architecture point of view - while
it may _appear_ to work, it may only work for one class of CPU or one
implementation.

Hence the dmac_{un,}map_area() interfaces are not exported to
drivers.  You can't just abuse one of them.  They are a pair that
must be used together, and the DMA API knows that, and the DMA API
requirements ensure that happens.  It's not really surprising, these
functions were written to support the DMA API, and the DMA API is
the kernel-wide interface to these functions.
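
In other words, drivers are expected to go through the documented
interface, which performs both halves at the correct times.  A minimal
sketch of that usage:

        dma_addr_t dma;

        dma = dma_map_single(dev, buf, size, DMA_TO_DEVICE);
        if (dma_mapping_error(dev, dma))
                return -ENOMEM;

        /* ... run the DMA transfer ... */

        dma_unmap_single(dev, dma, size, DMA_TO_DEVICE);

On ARM the map/unmap calls end up in dmac_map_area()/dmac_unmap_area()
(plus any outer cache maintenance) as appropriate for the CPU.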

-- 
RMK's Patch system: http://www.armlinux.org.uk/developer/patches/
FTTC broadband for 0.8mile line in suburbia: sync at 8.8Mbps down 630kbps up
According to speedtest.net: 8.21Mbps down 510kbps up

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/5] drm: add ARM flush implementation
@ 2018-01-30 12:27       ` Daniel Vetter
  0 siblings, 0 replies; 33+ messages in thread
From: Daniel Vetter @ 2018-01-30 12:27 UTC (permalink / raw)
  To: Russell King - ARM Linux
  Cc: dri-devel, Gurchetan Singh, Thierry Reding, Laurent Pinchart,
	Daniel Vetter, Linux ARM

On Tue, Jan 30, 2018 at 12:31 PM, Russell King - ARM Linux
<linux@armlinux.org.uk> wrote:
> On Tue, Jan 30, 2018 at 11:14:36AM +0100, Daniel Vetter wrote:
>> On Tue, Jan 23, 2018 at 06:56:03PM -0800, Gurchetan Singh wrote:
>> > The dma_cache_maint_page function is important for cache maintenance on
>> > ARM32 (this was determined via testing).
>> >
>> > Since we desire direct control of the caches in drm_cache.c, let's make
>> > a copy of the function, rename it and use it.
>> >
>> > v2: Don't use DMA API, call functions directly (Daniel)
>> >
>> > Signed-off-by: Gurchetan Singh <gurchetansingh@chromium.org>
>>
>> fwiw, in principle, this approach has my Ack from the drm side.
>>
>> But if we can't get any agreement from the arch side then I guess we'll
>> just have to suck it up and mandate that any dma-buf on ARM32 must be wc
>> mapped, always. Not sure that's a good idea either, but should at least
>> get things moving.
>
> Let me expand on my objection, as I find your tone to be problematic
> here.
>
> Patch 2 (the earliest patch in this series) makes use of
> facilities such as dmac_map_area(), and those are defined as macros in
> arch/arm/mm/mm.h.  I see no way that drm_cache.c can pick up on that
> _unless_ there's another patch (maybe that's patch 1) which moves the
> definition.
>
> dmac_map_area() is non-trivial to export (it's not a function, it's a
> macro which either points to a function or a function pointer structure
> member) so it's likely that this patch also breaks building DRM as a
> module.
>
> We've been here before with drivers abusing the architecture private APIs,
> which is _exactly_ why dmac_map_area() is defined in arch/arm/mm/mm.h
> and not in some kernel-wide asm header file - it's an implementation
> detail of the architecture's DMA API that drivers have no business
> mucking about with.
>
> I've always said if the interfaces don't do what you want, talk to
> architecture people, don't go poking about in architecture private parts
> of the kernel and start abusing stuff.  I say this because years ago, we
> had people doing _exactly_ that for the older virtually cached ARMs.  Then
> ARMv6 came along, which needed an entire revamp of the architecture cache
> interfaces, and lo and behold, drivers got broken because of this kind of
> abuse.  IOW, abusing interfaces makes kernel maintenance harder.
>
> For example, interfaces designed for flushing the cache when page tables
> get torn down were abused in drivers to flush data for DMA or coherency
> purposes, which meant that on ARMv6, where we no longer needed to flush
> for page table maintenance, suddenly the interfaces that drivers were
> using became no-ops.
>
> In this case, dmac_map_area() exists to perform any cache maintenance
> for the kernel view of that memory required for a non-coherent DMA
> mapping.  What it does depends on the processor and the requested
> DMA_xxx type - it _may_ invalidate (discard) or clean (writeback but
> leave in the cache) cache lines, or do nothing.
>
> dmac_unmap_area() has the same issues - what it does depends on what
> operation is being requested and what the processor requires to
> achieve coherency.
>
> The two functions are designed to work _together_, dmac_map_area()
> before the DMA operation and dmac_unmap_area() after the DMA operation.
> Only when they are both used together do you get the correct behaviour.
>
> These functions are only guaranteed to operate on the kernel mapping
> passed in as virtual addresses to the dmac_* functions.  They make no
> guarantees about other mappings of the same memory elsewhere in the
> system, which, depending on the architecture of the caches, may also
> contain dirty cache lines (the same comment applies to the DMA API too.)
> On certain cache architectures (VIPT) where colouring effects apply,
> flushing the kernel mapping may not even be appropriate if the desired
> effect is to flush data from a user mapping.
>
> This is exactly why abusing APIs (like what is done in this patch) is
> completely unacceptable from the architecture point of view - while
> it may _appear_ to work, it may only work for one class of CPU or one
> implementation.
>
> Hence the dmac_{un,}map_area() interfaces are not exported to
> drivers.  You can't just abuse one of them.  They are a pair that
> must be used together, and the DMA API knows that, and the DMA API
> requirements ensure that happens.  It's not really surprising, these
> functions were written to support the DMA API, and the DMA API is
> the kernel-wide interface to these functions.

With "in principle" I meant that from a design pov I think it's
totally fine if drm drivers do implement their own cache management.
The implementation isn't fine, since it misses the invalidate/flush
pair (which happens to be the same on x86), largely also because the
CrOS use-case is very limited. I commented on that in the previous
discussion, the current proposed changes.
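
By the invalidate/flush pair I mean something like the following
hand-waving sketch - note that drm_invalidate_pages() is a
hypothetical name here, it doesn't exist today:

        /* CPU wrote, device is about to read: clean the cache */
        drm_flush_pages(pages, num_pages);

        /* ... device accesses the memory ... */

        /* device wrote, CPU is about to read: discard stale lines */
        drm_invalidate_pages(pages, num_pages); /* hypothetical */

On x86 a single clflush does both the writeback and the invalidate,
which is why the missing half doesn't hurt there.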

It's also clear that any such usage essentially ties the driver very
closely to the platform it's running on. I think that's also fine.
Like I said, drivers/gpu is already full of such hacks. I also don't
care what we end up calling these (there is a patch 1; the threading
is somehow broken, so it didn't show up as part of this series).

I think in an ideal world we'd split the dma_sync* machinery into
struct-device-specific and CPU-specific parts, plus figure out clear
semantics for dma-buf around who must flush when, and how exactly
snooped vs. non-snooped bus transactions are agreed on. And also fix
up all the existing drivers, of course. But no one is even close to
willing to do all that work, so realistically muddling on is the one
option we do have.
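
Today the dma_sync* entry points fuse the two halves, e.g. (sketch):

        /* device half: make CPU writes visible to the device */
        dma_sync_single_for_device(dev, dma, size, DMA_BIDIRECTIONAL);

        /* ... DMA ... */

        /* cpu half: make device writes visible to the CPU */
        dma_sync_single_for_cpu(dev, dma, size, DMA_BIDIRECTIONAL);

and both require a struct device even when all you want is the CPU
side of the cache maintenance, which is the awkward part for dma-buf.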
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 33+ messages in thread

end of thread, other threads:[~2018-01-30 12:27 UTC | newest]

Thread overview: 33+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-01-24  2:56 [PATCH 2/5] drm: add ARM flush implementation Gurchetan Singh
2018-01-24  2:56 ` [PATCH 3/5] drm: add ARM64 flush implementations Gurchetan Singh
2018-01-24 12:00   ` Robin Murphy
2018-01-24 12:36     ` Russell King - ARM Linux
2018-01-24 13:14       ` Robin Murphy
2018-01-30  9:53     ` Daniel Vetter
2018-01-24  2:56 ` [PATCH 4/5] drm/vgem: flush page during page fault Gurchetan Singh
2018-01-24  2:56 ` [PATCH 5/5] drm: use drm_flush_sg where possible Gurchetan Singh
2018-01-24 11:50 ` [PATCH 2/5] drm: add ARM flush implementation Lucas Stach
2018-01-24 12:45 ` Russell King - ARM Linux
2018-01-24 18:45   ` Gurchetan Singh
2018-01-24 19:26     ` Russell King - ARM Linux
2018-01-24 23:43       ` Gurchetan Singh
2018-01-25 14:06       ` Thierry Reding
2018-01-30 10:14 ` Daniel Vetter
2018-01-30 11:31   ` Russell King - ARM Linux
2018-01-30 12:27     ` Daniel Vetter
