linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* dma_alloc_pages / dma_alloc_noncoherent fixups
@ 2020-09-30 16:09 Christoph Hellwig
  2020-09-30 16:09 ` [PATCH 1/8] dma-mapping: remove the {alloc,free}_noncoherent methods Christoph Hellwig
                   ` (7 more replies)
  0 siblings, 8 replies; 28+ messages in thread
From: Christoph Hellwig @ 2020-09-30 16:09 UTC (permalink / raw)
  To: Mauro Carvalho Chehab, Marek Szyprowski, Tomasz Figa, iommu
  Cc: Robin Murphy, linux-doc, linux-kernel, linux-media

Hi all,

this series has a bunch of fixups for the noncoherent DMA allocator
rework that recently landed in linux-next.

I think the most important part is that the idea of vmap()ing
non-contiguous allocations in dma_alloc_noncoherent doesn't work very
well after all.  It means we can't just rely on virt_to_page to get
the page and just use remap_pfn_range or stuff it into other APIs,
but on the other hand it also isn't really generic enough for what
the media APIs seem to want.

So the first patch reverts that change, and the last patch suggests
a different lower level API which should allow the media code to do
all it wants.

I'd suggest all but the last patch for the current merge window, and
we should have a discussion on how well the last one suits the media
subsystem, and probably merge it together with any media changes to
use the required API.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 1/8] dma-mapping: remove the {alloc,free}_noncoherent methods
  2020-09-30 16:09 dma_alloc_pages / dma_alloc_noncoherent fixups Christoph Hellwig
@ 2020-09-30 16:09 ` Christoph Hellwig
  2020-09-30 16:09 ` [PATCH 2/8] dma-mapping: document dma_{alloc,free}_pages Christoph Hellwig
                   ` (6 subsequent siblings)
  7 siblings, 0 replies; 28+ messages in thread
From: Christoph Hellwig @ 2020-09-30 16:09 UTC (permalink / raw)
  To: Mauro Carvalho Chehab, Marek Szyprowski, Tomasz Figa, iommu
  Cc: Robin Murphy, linux-doc, linux-kernel, linux-media

It turns out allowing non-contiguous allocations here was a rather bad
idea, as we'll now need to define ways to get the pages for mmapping
or dma_buf sharing.  Revert this change and stick to the original
concept.  A different API for the use case of non-contiguous allocations
will be added back later.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/iommu/dma-iommu.c   | 30 ------------------------------
 include/linux/dma-mapping.h |  5 -----
 kernel/dma/mapping.c        | 33 ++++++---------------------------
 3 files changed, 6 insertions(+), 62 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index c12c1dc43d312e..b363b20a9f41ce 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -1055,34 +1055,6 @@ static void *iommu_dma_alloc(struct device *dev, size_t size,
 	return cpu_addr;
 }
 
-#ifdef CONFIG_DMA_REMAP
-static void *iommu_dma_alloc_noncoherent(struct device *dev, size_t size,
-		dma_addr_t *handle, enum dma_data_direction dir, gfp_t gfp)
-{
-	if (!gfpflags_allow_blocking(gfp)) {
-		struct page *page;
-
-		page = dma_common_alloc_pages(dev, size, handle, dir, gfp);
-		if (!page)
-			return NULL;
-		return page_address(page);
-	}
-
-	return iommu_dma_alloc_remap(dev, size, handle, gfp | __GFP_ZERO,
-				     PAGE_KERNEL, 0);
-}
-
-static void iommu_dma_free_noncoherent(struct device *dev, size_t size,
-		void *cpu_addr, dma_addr_t handle, enum dma_data_direction dir)
-{
-	__iommu_dma_unmap(dev, handle, size);
-	__iommu_dma_free(dev, size, cpu_addr);
-}
-#else
-#define iommu_dma_alloc_noncoherent		NULL
-#define iommu_dma_free_noncoherent		NULL
-#endif /* CONFIG_DMA_REMAP */
-
 static int iommu_dma_mmap(struct device *dev, struct vm_area_struct *vma,
 		void *cpu_addr, dma_addr_t dma_addr, size_t size,
 		unsigned long attrs)
@@ -1153,8 +1125,6 @@ static const struct dma_map_ops iommu_dma_ops = {
 	.free			= iommu_dma_free,
 	.alloc_pages		= dma_common_alloc_pages,
 	.free_pages		= dma_common_free_pages,
-	.alloc_noncoherent	= iommu_dma_alloc_noncoherent,
-	.free_noncoherent	= iommu_dma_free_noncoherent,
 	.mmap			= iommu_dma_mmap,
 	.get_sgtable		= iommu_dma_get_sgtable,
 	.map_page		= iommu_dma_map_page,
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 7c77cd6f3604a7..4b9b1d64f5ec9e 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -74,11 +74,6 @@ struct dma_map_ops {
 			gfp_t gfp);
 	void (*free_pages)(struct device *dev, size_t size, struct page *vaddr,
 			dma_addr_t dma_handle, enum dma_data_direction dir);
-	void* (*alloc_noncoherent)(struct device *dev, size_t size,
-			dma_addr_t *dma_handle, enum dma_data_direction dir,
-			gfp_t gfp);
-	void (*free_noncoherent)(struct device *dev, size_t size, void *vaddr,
-			dma_addr_t dma_handle, enum dma_data_direction dir);
 	int (*mmap)(struct device *, struct vm_area_struct *,
 			  void *, dma_addr_t, size_t,
 			  unsigned long attrs);
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 9669550656a0b4..06115f59f4ffbf 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -513,40 +513,19 @@ EXPORT_SYMBOL_GPL(dma_free_pages);
 void *dma_alloc_noncoherent(struct device *dev, size_t size,
 		dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp)
 {
-	const struct dma_map_ops *ops = get_dma_ops(dev);
-	void *vaddr;
-
-	if (!ops || !ops->alloc_noncoherent) {
-		struct page *page;
-
-		page = dma_alloc_pages(dev, size, dma_handle, dir, gfp);
-		if (!page)
-			return NULL;
-		return page_address(page);
-	}
+	struct page *page;
 
-	size = PAGE_ALIGN(size);
-	vaddr = ops->alloc_noncoherent(dev, size, dma_handle, dir, gfp);
-	if (vaddr)
-		debug_dma_map_page(dev, virt_to_page(vaddr), 0, size, dir,
-				   *dma_handle);
-	return vaddr;
+	page = dma_alloc_pages(dev, size, dma_handle, dir, gfp);
+	if (!page)
+		return NULL;
+	return page_address(page);
 }
 EXPORT_SYMBOL_GPL(dma_alloc_noncoherent);
 
 void dma_free_noncoherent(struct device *dev, size_t size, void *vaddr,
 		dma_addr_t dma_handle, enum dma_data_direction dir)
 {
-	const struct dma_map_ops *ops = get_dma_ops(dev);
-
-	if (!ops || !ops->free_noncoherent) {
-		dma_free_pages(dev, size, virt_to_page(vaddr), dma_handle, dir);
-		return;
-	}
-
-	size = PAGE_ALIGN(size);
-	debug_dma_unmap_page(dev, dma_handle, size, dir);
-	ops->free_noncoherent(dev, size, vaddr, dma_handle, dir);
+	dma_free_pages(dev, size, virt_to_page(vaddr), dma_handle, dir);
 }
 EXPORT_SYMBOL_GPL(dma_free_noncoherent);
 
-- 
2.28.0


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 2/8] dma-mapping: document dma_{alloc,free}_pages
  2020-09-30 16:09 dma_alloc_pages / dma_alloc_noncoherent fixups Christoph Hellwig
  2020-09-30 16:09 ` [PATCH 1/8] dma-mapping: remove the {alloc,free}_noncoherent methods Christoph Hellwig
@ 2020-09-30 16:09 ` Christoph Hellwig
  2020-09-30 16:09 ` [PATCH 3/8] dma-direct check for highmem pages in dma_direct_alloc_pages Christoph Hellwig
                   ` (5 subsequent siblings)
  7 siblings, 0 replies; 28+ messages in thread
From: Christoph Hellwig @ 2020-09-30 16:09 UTC (permalink / raw)
  To: Mauro Carvalho Chehab, Marek Szyprowski, Tomasz Figa, iommu
  Cc: Robin Murphy, linux-doc, linux-kernel, linux-media

Document the new dma_alloc_pages and dma_free_pages APIs, and fix
up the documentation for dma_alloc_noncoherent and dma_free_noncoherent.

Reported-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 Documentation/core-api/dma-api.rst | 45 ++++++++++++++++++++++++++----
 1 file changed, 40 insertions(+), 5 deletions(-)

diff --git a/Documentation/core-api/dma-api.rst b/Documentation/core-api/dma-api.rst
index ea0413276ddb70..a75c469dbcaa7c 100644
--- a/Documentation/core-api/dma-api.rst
+++ b/Documentation/core-api/dma-api.rst
@@ -534,11 +534,9 @@ an I/O device, you should not be using this part of the API.
 			dma_addr_t *dma_handle, enum dma_data_direction dir,
 			gfp_t gfp)
 
-This routine allocates a region of <size> bytes of consistent memory.  It
+This routine allocates a region of <size> bytes of non-coherent memory.  It
 returns a pointer to the allocated region (in the processor's virtual address
-space) or NULL if the allocation failed.  The returned memory may or may not
-be in the kernels direct mapping.  Drivers must not call virt_to_page on
-the returned memory region.
+space) or NULL if the allocation failed.
 
 It also returns a <dma_handle> which may be cast to an unsigned integer the
 same width as the bus and given to the device as the DMA address base of
@@ -565,7 +563,44 @@ reused.
 Free a region of memory previously allocated using dma_alloc_noncoherent().
 dev, size and dma_handle and dir must all be the same as those passed into
 dma_alloc_noncoherent().  cpu_addr must be the virtual address returned by
-the dma_alloc_noncoherent().
+dma_alloc_noncoherent().
+
+::
+
+	struct page *
+	dma_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle,
+			enum dma_data_direction dir, gfp_t gfp)
+
+This routine allocates a region of <size> bytes of non-coherent memory.  It
+returns a pointer to the first struct page for the region, or NULL if the
+allocation failed.
+
+It also returns a <dma_handle> which may be cast to an unsigned integer the
+same width as the bus and given to the device as the DMA address base of
+the region.
+
+The dir parameter specifies if data is read and/or written by the device,
+see dma_map_single() for details.
+
+The gfp parameter allows the caller to specify the ``GFP_`` flags (see
+kmalloc()) for the allocation, but rejects flags used to specify a memory
+zone such as GFP_DMA or GFP_HIGHMEM.
+
+Before giving the memory to the device, dma_sync_single_for_device() needs
+to be called, and before reading memory written by the device,
+dma_sync_single_for_cpu(), just like for streaming DMA mappings that are
+reused.
+
+::
+
+	void
+	dma_free_pages(struct device *dev, size_t size, struct page *page,
+			dma_addr_t dma_handle, enum dma_data_direction dir)
+
+Free a region of memory previously allocated using dma_alloc_pages().
+dev, size, dma_handle and dir must all be the same as those passed into
+dma_alloc_pages().  page must be the pointer returned by
+dma_alloc_pages().
 
 ::
 
-- 
2.28.0


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 3/8] dma-direct check for highmem pages in dma_direct_alloc_pages
  2020-09-30 16:09 dma_alloc_pages / dma_alloc_noncoherent fixups Christoph Hellwig
  2020-09-30 16:09 ` [PATCH 1/8] dma-mapping: remove the {alloc,free}_noncoherent methods Christoph Hellwig
  2020-09-30 16:09 ` [PATCH 2/8] dma-mapping: document dma_{alloc,free}_pages Christoph Hellwig
@ 2020-09-30 16:09 ` Christoph Hellwig
  2020-09-30 16:09 ` [PATCH 4/8] dma-direct: use __GFP_ZERO " Christoph Hellwig
                   ` (4 subsequent siblings)
  7 siblings, 0 replies; 28+ messages in thread
From: Christoph Hellwig @ 2020-09-30 16:09 UTC (permalink / raw)
  To: Mauro Carvalho Chehab, Marek Szyprowski, Tomasz Figa, iommu
  Cc: Robin Murphy, linux-doc, linux-kernel, linux-media

Check for highmem pages from CMA, just like in the dma_direct_alloc path.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 kernel/dma/direct.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 121a9c1969dd3a..b5f20781d3a96f 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -309,6 +309,17 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
 	page = __dma_direct_alloc_pages(dev, size, gfp);
 	if (!page)
 		return NULL;
+	if (PageHighMem(page)) {
+		/*
+		 * Depending on the cma= arguments and per-arch setup
+		 * dma_alloc_contiguous could return highmem pages.
+		 * Without remapping there is no way to return them here,
+		 * so log an error and fail.
+		 */
+		dev_info(dev, "Rejecting highmem page from CMA.\n");
+		goto out_free_pages;
+	}
+
 	ret = page_address(page);
 	if (force_dma_unencrypted(dev)) {
 		if (set_memory_decrypted((unsigned long)ret,
-- 
2.28.0


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 4/8] dma-direct: use __GFP_ZERO in dma_direct_alloc_pages
  2020-09-30 16:09 dma_alloc_pages / dma_alloc_noncoherent fixups Christoph Hellwig
                   ` (2 preceding siblings ...)
  2020-09-30 16:09 ` [PATCH 3/8] dma-direct check for highmem pages in dma_direct_alloc_pages Christoph Hellwig
@ 2020-09-30 16:09 ` Christoph Hellwig
  2020-09-30 16:09 ` [PATCH 5/8] dma-direct: factor out a dma_direct_alloc_from_pool helper Christoph Hellwig
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 28+ messages in thread
From: Christoph Hellwig @ 2020-09-30 16:09 UTC (permalink / raw)
  To: Mauro Carvalho Chehab, Marek Szyprowski, Tomasz Figa, iommu
  Cc: Robin Murphy, linux-doc, linux-kernel, linux-media

Prepare for supporting the DMA_ATTR_NO_KERNEL_MAPPING flag in
dma_alloc_pages.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 kernel/dma/direct.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index b5f20781d3a96f..b5d56810130b22 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -296,9 +296,10 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
 		dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp)
 {
 	struct page *page;
-	void *ret;
 
 	if (dma_should_alloc_from_pool(dev, gfp, 0)) {
+		void *ret;
+
 		page = dma_alloc_from_pool(dev, size, &ret, gfp,
 				dma_coherent_ok);
 		if (!page)
@@ -306,7 +307,7 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
 		goto done;
 	}
 
-	page = __dma_direct_alloc_pages(dev, size, gfp);
+	page = __dma_direct_alloc_pages(dev, size, gfp | __GFP_ZERO);
 	if (!page)
 		return NULL;
 	if (PageHighMem(page)) {
@@ -320,13 +321,11 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
 		goto out_free_pages;
 	}
 
-	ret = page_address(page);
 	if (force_dma_unencrypted(dev)) {
-		if (set_memory_decrypted((unsigned long)ret,
+		if (set_memory_decrypted((unsigned long)page_address(page),
 				1 << get_order(size)))
 			goto out_free_pages;
 	}
-	memset(ret, 0, size);
 done:
 	*dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
 	return page;
-- 
2.28.0


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 5/8] dma-direct: factor out a dma_direct_alloc_from_pool helper
  2020-09-30 16:09 dma_alloc_pages / dma_alloc_noncoherent fixups Christoph Hellwig
                   ` (3 preceding siblings ...)
  2020-09-30 16:09 ` [PATCH 4/8] dma-direct: use __GFP_ZERO " Christoph Hellwig
@ 2020-09-30 16:09 ` Christoph Hellwig
  2020-09-30 16:09 ` [PATCH 6/8] dma-direct: simplify the DMA_ATTR_NO_KERNEL_MAPPING handling Christoph Hellwig
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 28+ messages in thread
From: Christoph Hellwig @ 2020-09-30 16:09 UTC (permalink / raw)
  To: Mauro Carvalho Chehab, Marek Szyprowski, Tomasz Figa, iommu
  Cc: Robin Murphy, linux-doc, linux-kernel, linux-media

This ensures dma_direct_alloc_pages will use the right gfp mask, as
well as keeping the code for that common.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 kernel/dma/direct.c | 41 ++++++++++++++++++++---------------------
 1 file changed, 20 insertions(+), 21 deletions(-)

diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index b5d56810130b22..ace9159c992f65 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -147,6 +147,22 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
 	return page;
 }
 
+static void *dma_direct_alloc_from_pool(struct device *dev, size_t size,
+		dma_addr_t *dma_handle, gfp_t gfp)
+{
+	struct page *page;
+	u64 phys_mask;
+	void *ret;
+
+	gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
+					   &phys_mask);
+	page = dma_alloc_from_pool(dev, size, &ret, gfp, dma_coherent_ok);
+	if (!page)
+		return NULL;
+	*dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
+	return ret;
+}
+
 void *dma_direct_alloc(struct device *dev, size_t size,
 		dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
 {
@@ -163,17 +179,8 @@ void *dma_direct_alloc(struct device *dev, size_t size,
 	if (attrs & DMA_ATTR_NO_WARN)
 		gfp |= __GFP_NOWARN;
 
-	if (dma_should_alloc_from_pool(dev, gfp, attrs)) {
-		u64 phys_mask;
-
-		gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
-				&phys_mask);
-		page = dma_alloc_from_pool(dev, size, &ret, gfp,
-				dma_coherent_ok);
-		if (!page)
-			return NULL;
-		goto done;
-	}
+	if (dma_should_alloc_from_pool(dev, gfp, attrs))
+		return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
 
 	/* we always manually zero the memory once we are done */
 	page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO);
@@ -297,15 +304,8 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
 {
 	struct page *page;
 
-	if (dma_should_alloc_from_pool(dev, gfp, 0)) {
-		void *ret;
-
-		page = dma_alloc_from_pool(dev, size, &ret, gfp,
-				dma_coherent_ok);
-		if (!page)
-			return NULL;
-		goto done;
-	}
+	if (dma_should_alloc_from_pool(dev, gfp, 0))
+		return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
 
 	page = __dma_direct_alloc_pages(dev, size, gfp | __GFP_ZERO);
 	if (!page)
@@ -326,7 +326,6 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
 				1 << get_order(size)))
 			goto out_free_pages;
 	}
-done:
 	*dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
 	return page;
 out_free_pages:
-- 
2.28.0


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 6/8] dma-direct: simplify the DMA_ATTR_NO_KERNEL_MAPPING handling
  2020-09-30 16:09 dma_alloc_pages / dma_alloc_noncoherent fixups Christoph Hellwig
                   ` (4 preceding siblings ...)
  2020-09-30 16:09 ` [PATCH 5/8] dma-direct: factor out a dma_direct_alloc_from_pool helper Christoph Hellwig
@ 2020-09-30 16:09 ` Christoph Hellwig
  2020-09-30 16:09 ` [PATCH 7/8] dma-iommu: remove __iommu_dma_mmap Christoph Hellwig
  2020-09-30 16:09 ` [PATCH 8/8] WIP: add a dma_alloc_contiguous API Christoph Hellwig
  7 siblings, 0 replies; 28+ messages in thread
From: Christoph Hellwig @ 2020-09-30 16:09 UTC (permalink / raw)
  To: Mauro Carvalho Chehab, Marek Szyprowski, Tomasz Figa, iommu
  Cc: Robin Murphy, linux-doc, linux-kernel, linux-media

Use an entirely separate code path for the DMA_ATTR_NO_KERNEL_MAPPING
path.  This avoids any confusion about the ret type, and avoids lots of
attr checks and helpers that can be significantly simplified now.

It also ensures that common handling is applied to architectures still
using the arch alloc/free hooks.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/dma-noncoherent.h |  13 -----
 kernel/dma/direct.c             | 100 +++++++++++++-------------------
 2 files changed, 39 insertions(+), 74 deletions(-)

diff --git a/include/linux/dma-noncoherent.h b/include/linux/dma-noncoherent.h
index e61283e06576a8..73ac149fa181b4 100644
--- a/include/linux/dma-noncoherent.h
+++ b/include/linux/dma-noncoherent.h
@@ -21,19 +21,6 @@ static inline bool dev_is_dma_coherent(struct device *dev)
 }
 #endif /* CONFIG_ARCH_HAS_DMA_COHERENCE_H */
 
-/*
- * Check if an allocation needs to be marked uncached to be coherent.
- */
-static __always_inline bool dma_alloc_need_uncached(struct device *dev,
-		unsigned long attrs)
-{
-	if (dev_is_dma_coherent(dev))
-		return false;
-	if (attrs & DMA_ATTR_NO_KERNEL_MAPPING)
-		return false;
-	return true;
-}
-
 void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
 		gfp_t gfp, unsigned long attrs);
 void arch_dma_free(struct device *dev, size_t size, void *cpu_addr,
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index ace9159c992f65..a3c619b424edf0 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -75,39 +75,6 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
 		min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit);
 }
 
-/*
- * Decrypting memory is allowed to block, so if this device requires
- * unencrypted memory it must come from atomic pools.
- */
-static inline bool dma_should_alloc_from_pool(struct device *dev, gfp_t gfp,
-					      unsigned long attrs)
-{
-	if (!IS_ENABLED(CONFIG_DMA_COHERENT_POOL))
-		return false;
-	if (gfpflags_allow_blocking(gfp))
-		return false;
-	if (force_dma_unencrypted(dev))
-		return true;
-	if (!IS_ENABLED(CONFIG_DMA_DIRECT_REMAP))
-		return false;
-	if (dma_alloc_need_uncached(dev, attrs))
-		return true;
-	return false;
-}
-
-static inline bool dma_should_free_from_pool(struct device *dev,
-					     unsigned long attrs)
-{
-	if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL))
-		return true;
-	if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
-	    !force_dma_unencrypted(dev))
-		return false;
-	if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP))
-		return true;
-	return false;
-}
-
 static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
 		gfp_t gfp)
 {
@@ -170,35 +137,45 @@ void *dma_direct_alloc(struct device *dev, size_t size,
 	void *ret;
 	int err;
 
-	if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
-	    !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
-	    dma_alloc_need_uncached(dev, attrs))
-		return arch_dma_alloc(dev, size, dma_handle, gfp, attrs);
-
 	size = PAGE_ALIGN(size);
 	if (attrs & DMA_ATTR_NO_WARN)
 		gfp |= __GFP_NOWARN;
 
-	if (dma_should_alloc_from_pool(dev, gfp, attrs))
-		return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
-
-	/* we always manually zero the memory once we are done */
-	page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO);
-	if (!page)
-		return NULL;
-
 	if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
 	    !force_dma_unencrypted(dev)) {
+		page = __dma_direct_alloc_pages(dev, size, gfp);
+		if (!page)
+			return NULL;
 		/* remove any dirty cache lines on the kernel alias */
 		if (!PageHighMem(page))
 			arch_dma_prep_coherent(page, size);
+		*dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
 		/* return the page pointer as the opaque cookie */
-		ret = page;
-		goto done;
+		return page;
 	}
 
+	if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
+	    !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
+	    !dev_is_dma_coherent(dev))
+		return arch_dma_alloc(dev, size, dma_handle, gfp, attrs);
+
+	/*
+	 * Remapping or decrypting memory may block. If either is required and
+	 * we can't block, allocate the memory from the atomic pools.
+	 */
+	if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
+	    !gfpflags_allow_blocking(gfp) &&
+	    (force_dma_unencrypted(dev) ||
+	     (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && !dev_is_dma_coherent(dev))))
+		return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
+
+	/* we always manually zero the memory once we are done */
+	page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO);
+	if (!page)
+		return NULL;
+
 	if ((IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
-	     dma_alloc_need_uncached(dev, attrs)) ||
+	     !dev_is_dma_coherent(dev)) ||
 	    (IS_ENABLED(CONFIG_DMA_REMAP) && PageHighMem(page))) {
 		/* remove any dirty cache lines on the kernel alias */
 		arch_dma_prep_coherent(page, size);
@@ -241,7 +218,7 @@ void *dma_direct_alloc(struct device *dev, size_t size,
 	memset(ret, 0, size);
 
 	if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
-	    dma_alloc_need_uncached(dev, attrs)) {
+	    !dev_is_dma_coherent(dev)) {
 		arch_dma_prep_coherent(page, size);
 		ret = arch_dma_set_uncached(ret, size);
 		if (IS_ERR(ret))
@@ -269,25 +246,25 @@ void dma_direct_free(struct device *dev, size_t size,
 {
 	unsigned int page_order = get_order(size);
 
+	if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
+	    !force_dma_unencrypted(dev)) {
+		/* cpu_addr is a struct page cookie, not a kernel address */
+		dma_free_contiguous(dev, cpu_addr, size);
+		return;
+	}
+
 	if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
 	    !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
-	    dma_alloc_need_uncached(dev, attrs)) {
+	    !dev_is_dma_coherent(dev)) {
 		arch_dma_free(dev, size, cpu_addr, dma_addr, attrs);
 		return;
 	}
 
 	/* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */
-	if (dma_should_free_from_pool(dev, attrs) &&
+	if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
 	    dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size)))
 		return;
 
-	if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
-	    !force_dma_unencrypted(dev)) {
-		/* cpu_addr is a struct page cookie, not a kernel address */
-		dma_free_contiguous(dev, cpu_addr, size);
-		return;
-	}
-
 	if (force_dma_unencrypted(dev))
 		set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order);
 
@@ -304,7 +281,8 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
 {
 	struct page *page;
 
-	if (dma_should_alloc_from_pool(dev, gfp, 0))
+	if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
+	    force_dma_unencrypted(dev) && !gfpflags_allow_blocking(gfp))
 		return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
 
 	page = __dma_direct_alloc_pages(dev, size, gfp | __GFP_ZERO);
@@ -341,7 +319,7 @@ void dma_direct_free_pages(struct device *dev, size_t size,
 	void *vaddr = page_address(page);
 
 	/* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */
-	if (dma_should_free_from_pool(dev, 0) &&
+	if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
 	    dma_free_from_pool(dev, vaddr, size))
 		return;
 
-- 
2.28.0


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 7/8] dma-iommu: remove __iommu_dma_mmap
  2020-09-30 16:09 dma_alloc_pages / dma_alloc_noncoherent fixups Christoph Hellwig
                   ` (5 preceding siblings ...)
  2020-09-30 16:09 ` [PATCH 6/8] dma-direct: simplify the DMA_ATTR_NO_KERNEL_MAPPING handling Christoph Hellwig
@ 2020-09-30 16:09 ` Christoph Hellwig
  2020-09-30 16:09 ` [PATCH 8/8] WIP: add a dma_alloc_contiguous API Christoph Hellwig
  7 siblings, 0 replies; 28+ messages in thread
From: Christoph Hellwig @ 2020-09-30 16:09 UTC (permalink / raw)
  To: Mauro Carvalho Chehab, Marek Szyprowski, Tomasz Figa, iommu
  Cc: Robin Murphy, linux-doc, linux-kernel, linux-media

The function has a single caller, so open code it there and take
advantage of the precalculated page count variable.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/iommu/dma-iommu.c | 17 +----------------
 1 file changed, 1 insertion(+), 16 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index b363b20a9f41ce..7922f545cd5eef 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -656,21 +656,6 @@ static void *iommu_dma_alloc_remap(struct device *dev, size_t size,
 	return NULL;
 }
 
-/**
- * __iommu_dma_mmap - Map a buffer into provided user VMA
- * @pages: Array representing buffer from __iommu_dma_alloc()
- * @size: Size of buffer in bytes
- * @vma: VMA describing requested userspace mapping
- *
- * Maps the pages of the buffer in @pages into @vma. The caller is responsible
- * for verifying the correct size and protection of @vma beforehand.
- */
-static int __iommu_dma_mmap(struct page **pages, size_t size,
-		struct vm_area_struct *vma)
-{
-	return vm_map_pages(vma, pages, PAGE_ALIGN(size) >> PAGE_SHIFT);
-}
-
 static void iommu_dma_sync_single_for_cpu(struct device *dev,
 		dma_addr_t dma_handle, size_t size, enum dma_data_direction dir)
 {
@@ -1075,7 +1060,7 @@ static int iommu_dma_mmap(struct device *dev, struct vm_area_struct *vma,
 		struct page **pages = dma_common_find_pages(cpu_addr);
 
 		if (pages)
-			return __iommu_dma_mmap(pages, size, vma);
+			return vm_map_pages(vma, pages, nr_pages);
 		pfn = vmalloc_to_pfn(cpu_addr);
 	} else {
 		pfn = page_to_pfn(virt_to_page(cpu_addr));
-- 
2.28.0


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 8/8] WIP: add a dma_alloc_contiguous API
  2020-09-30 16:09 dma_alloc_pages / dma_alloc_noncoherent fixups Christoph Hellwig
                   ` (6 preceding siblings ...)
  2020-09-30 16:09 ` [PATCH 7/8] dma-iommu: remove __iommu_dma_mmap Christoph Hellwig
@ 2020-09-30 16:09 ` Christoph Hellwig
  2020-10-02 17:50   ` Tomasz Figa
                     ` (2 more replies)
  7 siblings, 3 replies; 28+ messages in thread
From: Christoph Hellwig @ 2020-09-30 16:09 UTC (permalink / raw)
  To: Mauro Carvalho Chehab, Marek Szyprowski, Tomasz Figa, iommu
  Cc: Robin Murphy, linux-doc, linux-kernel, linux-media

Add a new API that returns a virtually non-contiguous array of pages
and dma address.  This API is only implemented for dma-iommu and will
not be implemented for non-iommu DMA API instances that have to allocate
contiguous memory.  It is up to the caller to check if the API is
available.

The intent is that media drivers can use this API if either:

 - no kernel mapping or only temporary kernel mappings are required.
   That is as a better replacement for DMA_ATTR_NO_KERNEL_MAPPING
 - a kernel mapping is required for cached and DMA mapped pages, but
   the driver also needs the pages to e.g. map them to userspace.
   In that sense it is a replacement for some aspects of the recently
   removed and never fully implemented DMA_ATTR_NON_CONSISTENT

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/iommu/dma-iommu.c   | 73 +++++++++++++++++++++++++------------
 include/linux/dma-mapping.h |  9 +++++
 kernel/dma/mapping.c        | 35 ++++++++++++++++++
 3 files changed, 93 insertions(+), 24 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 7922f545cd5eef..158026a856622c 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -565,23 +565,12 @@ static struct page **__iommu_dma_alloc_pages(struct device *dev,
 	return pages;
 }
 
-/**
- * iommu_dma_alloc_remap - Allocate and map a buffer contiguous in IOVA space
- * @dev: Device to allocate memory for. Must be a real device
- *	 attached to an iommu_dma_domain
- * @size: Size of buffer in bytes
- * @dma_handle: Out argument for allocated DMA handle
- * @gfp: Allocation flags
- * @prot: pgprot_t to use for the remapped mapping
- * @attrs: DMA attributes for this allocation
- *
- * If @size is less than PAGE_SIZE, then a full CPU page will be allocated,
+/*
+ * If size is less than PAGE_SIZE, then a full CPU page will be allocated,
  * but an IOMMU which supports smaller pages might not map the whole thing.
- *
- * Return: Mapped virtual address, or NULL on failure.
  */
-static void *iommu_dma_alloc_remap(struct device *dev, size_t size,
-		dma_addr_t *dma_handle, gfp_t gfp, pgprot_t prot,
+static struct page **__iommu_dma_alloc_noncontiguous(struct device *dev,
+		size_t size, dma_addr_t *dma_handle, gfp_t gfp, pgprot_t prot,
 		unsigned long attrs)
 {
 	struct iommu_domain *domain = iommu_get_dma_domain(dev);
@@ -593,7 +582,6 @@ static void *iommu_dma_alloc_remap(struct device *dev, size_t size,
 	struct page **pages;
 	struct sg_table sgt;
 	dma_addr_t iova;
-	void *vaddr;
 
 	*dma_handle = DMA_MAPPING_ERROR;
 
@@ -636,17 +624,10 @@ static void *iommu_dma_alloc_remap(struct device *dev, size_t size,
 			< size)
 		goto out_free_sg;
 
-	vaddr = dma_common_pages_remap(pages, size, prot,
-			__builtin_return_address(0));
-	if (!vaddr)
-		goto out_unmap;
-
 	*dma_handle = iova;
 	sg_free_table(&sgt);
-	return vaddr;
+	return pages;
 
-out_unmap:
-	__iommu_dma_unmap(dev, iova, size);
 out_free_sg:
 	sg_free_table(&sgt);
 out_free_iova:
@@ -656,6 +637,46 @@ static void *iommu_dma_alloc_remap(struct device *dev, size_t size,
 	return NULL;
 }
 
+static void *iommu_dma_alloc_remap(struct device *dev, size_t size,
+		dma_addr_t *dma_handle, gfp_t gfp, pgprot_t prot,
+		unsigned long attrs)
+{
+	struct page **pages;
+	void *vaddr;
+
+	pages = __iommu_dma_alloc_noncontiguous(dev, size, dma_handle, gfp,
+						prot, attrs);
+	if (!pages)
+		return NULL;
+	vaddr = dma_common_pages_remap(pages, size, prot,
+			__builtin_return_address(0));
+	if (!vaddr)
+		goto out_unmap;
+	return vaddr;
+
+out_unmap:
+	__iommu_dma_unmap(dev, *dma_handle, size);
+	__iommu_dma_free_pages(pages, PAGE_ALIGN(size) >> PAGE_SHIFT);
+	return NULL;
+}
+
+#ifdef CONFIG_DMA_REMAP
+static struct page **iommu_dma_alloc_noncontiguous(struct device *dev,
+		size_t size, dma_addr_t *dma_handle, gfp_t gfp,
+		unsigned long attrs)
+{
+	return __iommu_dma_alloc_noncontiguous(dev, size, dma_handle, gfp,
+					       PAGE_KERNEL, attrs);
+}
+
+static void iommu_dma_free_noncontiguous(struct device *dev, size_t size,
+		struct page **pages, dma_addr_t dma_handle)
+{
+	__iommu_dma_unmap(dev, dma_handle, size);
+	__iommu_dma_free_pages(pages, PAGE_ALIGN(size) >> PAGE_SHIFT);
+}
+#endif
+
 static void iommu_dma_sync_single_for_cpu(struct device *dev,
 		dma_addr_t dma_handle, size_t size, enum dma_data_direction dir)
 {
@@ -1110,6 +1131,10 @@ static const struct dma_map_ops iommu_dma_ops = {
 	.free			= iommu_dma_free,
 	.alloc_pages		= dma_common_alloc_pages,
 	.free_pages		= dma_common_free_pages,
+#ifdef CONFIG_DMA_REMAP
+	.alloc_noncontiguous	= iommu_dma_alloc_noncontiguous,
+	.free_noncontiguous	= iommu_dma_free_noncontiguous,
+#endif
 	.mmap			= iommu_dma_mmap,
 	.get_sgtable		= iommu_dma_get_sgtable,
 	.map_page		= iommu_dma_map_page,
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 4b9b1d64f5ec9e..51bbc32365bb8d 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -74,6 +74,10 @@ struct dma_map_ops {
 			gfp_t gfp);
 	void (*free_pages)(struct device *dev, size_t size, struct page *vaddr,
 			dma_addr_t dma_handle, enum dma_data_direction dir);
+	struct page **(*alloc_noncontiguous)(struct device *dev, size_t size,
+			dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs);
+	void (*free_noncontiguous)(struct device *dev, size_t size,
+			struct page **pages, dma_addr_t dma_handle);
 	int (*mmap)(struct device *, struct vm_area_struct *,
 			  void *, dma_addr_t, size_t,
 			  unsigned long attrs);
@@ -384,6 +388,11 @@ void *dma_alloc_noncoherent(struct device *dev, size_t size,
 		dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp);
 void dma_free_noncoherent(struct device *dev, size_t size, void *vaddr,
 		dma_addr_t dma_handle, enum dma_data_direction dir);
+bool dma_can_alloc_noncontiguous(struct device *dev);
+struct page **dma_alloc_noncontiguous(struct device *dev, size_t size,
+		dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs);
+void dma_free_noncontiguous(struct device *dev, size_t size,
+		struct page **pages, dma_addr_t dma_handle);
 
 static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr,
 		size_t size, enum dma_data_direction dir, unsigned long attrs)
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 06115f59f4ffbf..6d975d1a20dd72 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -529,6 +529,41 @@ void dma_free_noncoherent(struct device *dev, size_t size, void *vaddr,
 }
 EXPORT_SYMBOL_GPL(dma_free_noncoherent);
 
+bool dma_can_alloc_noncontiguous(struct device *dev)
+{
+	const struct dma_map_ops *ops = get_dma_ops(dev);
+
+	return ops && ops->free_noncontiguous;
+}
+EXPORT_SYMBOL_GPL(dma_can_alloc_noncontiguous);
+
+struct page **dma_alloc_noncontiguous(struct device *dev, size_t size,
+		dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
+{
+	const struct dma_map_ops *ops = get_dma_ops(dev);
+
+	if (WARN_ON_ONCE(!dma_can_alloc_noncontiguous(dev)))
+		return NULL;
+	if (attrs & ~DMA_ATTR_ALLOC_SINGLE_PAGES) {
+		dev_warn(dev, "invalid flags (0x%lx) for %s\n",
+			 attrs, __func__);
+		return NULL;
+	}
+	return ops->alloc_noncontiguous(dev, size, dma_handle, gfp, attrs);
+}
+EXPORT_SYMBOL_GPL(dma_alloc_noncontiguous);
+
+void dma_free_noncontiguous(struct device *dev, size_t size,
+		struct page **pages, dma_addr_t dma_handle)
+{
+	const struct dma_map_ops *ops = get_dma_ops(dev);
+
+	if (WARN_ON_ONCE(!dma_can_alloc_noncontiguous(dev)))
+		return;
+	ops->free_noncontiguous(dev, size, pages, dma_handle);
+}
+EXPORT_SYMBOL_GPL(dma_free_noncontiguous);
+
 int dma_supported(struct device *dev, u64 mask)
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
-- 
2.28.0


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH 8/8] WIP: add a dma_alloc_contiguous API
  2020-09-30 16:09 ` [PATCH 8/8] WIP: add a dma_alloc_contiguous API Christoph Hellwig
@ 2020-10-02 17:50   ` Tomasz Figa
  2020-10-05  8:26     ` Christoph Hellwig
  2020-10-14 13:20   ` Tomasz Figa
  2020-11-18 14:25   ` [PATCH] WIP! media: uvcvideo: Use dma_alloc_noncontiguos API Ricardo Ribalda
  2 siblings, 1 reply; 28+ messages in thread
From: Tomasz Figa @ 2020-10-02 17:50 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Mauro Carvalho Chehab, Marek Szyprowski, iommu, Robin Murphy,
	linux-doc, linux-kernel, linux-media

Hi Christoph,

On Wed, Sep 30, 2020 at 06:09:17PM +0200, Christoph Hellwig wrote:
> Add a new API that returns a virtually non-contiguous array of pages
> and dma address.  This API is only implemented for dma-iommu and will
> not be implemented for non-iommu DMA API instances that have to allocate
> contiguous memory.  It is up to the caller to check if the API is
> available.

Would you mind shedding some more light on what made the previous attempt
not work well? I liked the previous API because it was more consistent with
the regular dma_alloc_coherent().

> 
> The intent is that media drivers can use this API if either:

FWIW, the USB subsystem also has similar needs, and so do some DRM drivers
using DMA API rather than IOMMU API directly. Basically I believe that all
the users removed in your previous series relied on custom downstream
patches to make DMA_ATTR_NON_CONSISTENT work and could be finally made work
in upstream using this API.

> 
>  - no kernel mapping or only temporary kernel mappings are required.
>    That is as a better replacement for DMA_ATTR_NO_KERNEL_MAPPING
>  - a kernel mapping is required for cached and DMA mapped pages, but
>    the driver also needs the pages to e.g. map them to userspace.
>    In that sense it is a replacement for some aspects of the recently
>    removed and never fully implemented DMA_ATTR_NON_CONSISTENT

What's the expected allocation and mapping flow with the latter? Would that be

pages = dma_alloc_noncoherent(...)
vaddr = vmap(pages, ...);

?

Would one just use the usual dma_sync_for_{cpu,device}() for cache
invalidate/clean, while keeping the mapping in place?

Best regards,
Tomasz

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 8/8] WIP: add a dma_alloc_contiguous API
  2020-10-02 17:50   ` Tomasz Figa
@ 2020-10-05  8:26     ` Christoph Hellwig
  2020-10-06 20:56       ` Tomasz Figa
  0 siblings, 1 reply; 28+ messages in thread
From: Christoph Hellwig @ 2020-10-05  8:26 UTC (permalink / raw)
  To: Tomasz Figa
  Cc: Christoph Hellwig, Mauro Carvalho Chehab, Marek Szyprowski,
	iommu, Robin Murphy, linux-doc, linux-kernel, linux-media

On Fri, Oct 02, 2020 at 05:50:40PM +0000, Tomasz Figa wrote:
> Hi Christoph,
> 
> On Wed, Sep 30, 2020 at 06:09:17PM +0200, Christoph Hellwig wrote:
> > Add a new API that returns a virtually non-contiguous array of pages
> > and dma address.  This API is only implemented for dma-iommu and will
> > not be implemented for non-iommu DMA API instances that have to allocate
> > contiguous memory.  It is up to the caller to check if the API is
> > available.
> 
> Would you mind shedding some more light on what made the previous attempt
> not work well? I liked the previous API because it was more consistent with
> the regular dma_alloc_coherent().

The problem is that with a dma_alloc_noncoherent that can return pages
not in the kernel mapping we can't just use virt_to_page to fill in
scatterlists or mmap the buffer to userspace, but would need new helpers
and another two methods.

> >  - no kernel mapping or only temporary kernel mappings are required.
> >    That is as a better replacement for DMA_ATTR_NO_KERNEL_MAPPING
> >  - a kernel mapping is required for cached and DMA mapped pages, but
> >    the driver also needs the pages to e.g. map them to userspace.
> >    In that sense it is a replacement for some aspects of the recently
> >    removed and never fully implemented DMA_ATTR_NON_CONSISTENT
> 
> What's the expected allocation and mapping flow with the latter? Would that be
> 
> pages = dma_alloc_noncoherent(...)
> vaddr = vmap(pages, ...);
> 
> ?

Yes.  With the vmap step optional for replacements of
DMA_ATTR_NO_KERNEL_MAPPING, which is another nightmare to deal with.

> Would one just use the usual dma_sync_for_{cpu,device}() for cache
> invalidate/clean, while keeping the mapping in place?

Yes.  And make sure the API isn't implemented when VIVT caches are
used, but that isn't really different from the current interface.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 8/8] WIP: add a dma_alloc_contiguous API
  2020-10-05  8:26     ` Christoph Hellwig
@ 2020-10-06 20:56       ` Tomasz Figa
  2020-10-07  6:21         ` Christoph Hellwig
  0 siblings, 1 reply; 28+ messages in thread
From: Tomasz Figa @ 2020-10-06 20:56 UTC (permalink / raw)
  To: Christoph Hellwig, Sergey Senozhatsky
  Cc: Mauro Carvalho Chehab, Marek Szyprowski, open list:IOMMU DRIVERS,
	Robin Murphy, Linux Doc Mailing List, Linux Kernel Mailing List,
	Linux Media Mailing List

On Mon, Oct 5, 2020 at 10:26 AM Christoph Hellwig <hch@lst.de> wrote:
>
> On Fri, Oct 02, 2020 at 05:50:40PM +0000, Tomasz Figa wrote:
> > Hi Christoph,
> >
> > On Wed, Sep 30, 2020 at 06:09:17PM +0200, Christoph Hellwig wrote:
> > > Add a new API that returns a virtually non-contiguous array of pages
> > > and dma address.  This API is only implemented for dma-iommu and will
> > > not be implemented for non-iommu DMA API instances that have to allocate
> > > contiguous memory.  It is up to the caller to check if the API is
> > > available.
> >
> > Would you mind shedding some more light on what made the previous attempt
> > not work well? I liked the previous API because it was more consistent with
> > the regular dma_alloc_coherent().
>
> The problem is that with a dma_alloc_noncoherent that can return pages
> not in the kernel mapping we can't just use virt_to_page to fill in
> scatterlists or mmap the buffer to userspace, but would need new helpers
> and another two methods.
>
> > >  - no kernel mapping or only temporary kernel mappings are required.
> > >    That is as a better replacement for DMA_ATTR_NO_KERNEL_MAPPING
> > >  - a kernel mapping is required for cached and DMA mapped pages, but
> > >    the driver also needs the pages to e.g. map them to userspace.
> > >    In that sense it is a replacement for some aspects of the recently
> > >    removed and never fully implemented DMA_ATTR_NON_CONSISTENT
> >
> > What's the expected allocation and mapping flow with the latter? Would that be
> >
> > pages = dma_alloc_noncoherent(...)
> > vaddr = vmap(pages, ...);
> >
> > ?
>
> Yes.  With the vmap step optional for replacements of
> DMA_ATTR_NO_KERNEL_MAPPING, which is another nightmare to deal with.
>
> > Would one just use the usual dma_sync_for_{cpu,device}() for cache
> > invalidate/clean, while keeping the mapping in place?
>
> Yes.  And make sure the API isn't implemented when VIVT caches are
> used, but that isn't really different from the current interface.

Okay, thanks. Let's see if we can make necessary changes to the videobuf2.

+Sergey Senozhatsky for awareness too.

Best regards,
Tomasz

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 8/8] WIP: add a dma_alloc_contiguous API
  2020-10-06 20:56       ` Tomasz Figa
@ 2020-10-07  6:21         ` Christoph Hellwig
  2020-10-07 12:21           ` Tomasz Figa
  0 siblings, 1 reply; 28+ messages in thread
From: Christoph Hellwig @ 2020-10-07  6:21 UTC (permalink / raw)
  To: Tomasz Figa
  Cc: Christoph Hellwig, Sergey Senozhatsky, Mauro Carvalho Chehab,
	Marek Szyprowski, open list:IOMMU DRIVERS, Robin Murphy,
	Linux Doc Mailing List, Linux Kernel Mailing List,
	Linux Media Mailing List

On Tue, Oct 06, 2020 at 10:56:04PM +0200, Tomasz Figa wrote:
> > Yes.  And make sure the API isn't implemented when VIVT caches are
> > used, but that isn't really different from the current interface.
> 
> Okay, thanks. Let's see if we can make necessary changes to the videobuf2.
> 
> +Sergey Senozhatsky for awareness too.

I can defer the changes a bit to see if you'd really much prefer
the former interface.  I think for now the most important thing is
that it works properly for the potential users, and the prime one is
videobuf2 for now.  drm also seems like a big potential user, but I
had a really hard time getting the developers to engage in API
development.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 8/8] WIP: add a dma_alloc_contiguous API
  2020-10-07  6:21         ` Christoph Hellwig
@ 2020-10-07 12:21           ` Tomasz Figa
  2020-10-07 12:24             ` Christoph Hellwig
  0 siblings, 1 reply; 28+ messages in thread
From: Tomasz Figa @ 2020-10-07 12:21 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Sergey Senozhatsky, Mauro Carvalho Chehab, Marek Szyprowski,
	open list:IOMMU DRIVERS, Robin Murphy, Linux Doc Mailing List,
	Linux Kernel Mailing List, Linux Media Mailing List

On Wed, Oct 7, 2020 at 8:21 AM Christoph Hellwig <hch@lst.de> wrote:
>
> On Tue, Oct 06, 2020 at 10:56:04PM +0200, Tomasz Figa wrote:
> > > Yes.  And make sure the API isn't implemented when VIVT caches are
> > > used, but that isn't really different from the current interface.
> >
> > Okay, thanks. Let's see if we can make necessary changes to the videobuf2.
> >
> > +Sergey Senozhatsky for awareness too.
>
> I can defer the changes a bit to see if you'd really much prefer
> the former interface.  I think for now the most important thing is
> that it works properly for the potential users, and the prime one is
> videobuf2 for now.  drm also seems like a big potential user, but I
> had a really hard time getting the developers to engage in API
> development.

My initial feeling is that it should work, but we'll give you a
definitive answer once we prototype it. :)

We might actually give it a try in the USB HCD subsystem as well, to
implement usb_alloc_noncoherent(), as an optimization for drivers
which have to perform multiple random accesses to the URB buffers. I
think you might recall discussing this by the way of the pwc and
uvcvideo camera drivers.

Best regards,
Tomasz

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 8/8] WIP: add a dma_alloc_contiguous API
  2020-10-07 12:21           ` Tomasz Figa
@ 2020-10-07 12:24             ` Christoph Hellwig
  0 siblings, 0 replies; 28+ messages in thread
From: Christoph Hellwig @ 2020-10-07 12:24 UTC (permalink / raw)
  To: Tomasz Figa
  Cc: Christoph Hellwig, Sergey Senozhatsky, Mauro Carvalho Chehab,
	Marek Szyprowski, open list:IOMMU DRIVERS, Robin Murphy,
	Linux Doc Mailing List, Linux Kernel Mailing List,
	Linux Media Mailing List

On Wed, Oct 07, 2020 at 02:21:43PM +0200, Tomasz Figa wrote:
> My initial feeling is that it should work, but we'll give you a
> definitive answer once we prototype it. :)
> 
> We might actually give it a try in the USB HCD subsystem as well, to
> implement usb_alloc_noncoherent(), as an optimization for drivers
> which have to perform multiple random accesses to the URB buffers. I
> think you might recall discussing this by the way of the pwc and
> uvcvideo camera drivers.

Yes.  I guess for usb the dma_alloc_noncoherent as-is in linux-next
might be better.   So if you have the cycles please prototype it
either way, although for that we'd also need at least a
mmap_noncoherent method, and probably a get_sgtable_noncoherent one.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 8/8] WIP: add a dma_alloc_contiguous API
  2020-09-30 16:09 ` [PATCH 8/8] WIP: add a dma_alloc_contiguous API Christoph Hellwig
  2020-10-02 17:50   ` Tomasz Figa
@ 2020-10-14 13:20   ` Tomasz Figa
  2020-10-14 15:03     ` David Laight
  2020-11-09 14:53     ` Ricardo Ribalda
  2020-11-18 14:25   ` [PATCH] WIP! media: uvcvideo: Use dma_alloc_noncontiguos API Ricardo Ribalda
  2 siblings, 2 replies; 28+ messages in thread
From: Tomasz Figa @ 2020-10-14 13:20 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Mauro Carvalho Chehab, Marek Szyprowski,
	list@263.net:IOMMU DRIVERS
	<iommu@lists.linux-foundation.org>,
	Joerg Roedel <joro@8bytes.org>,,
	Robin Murphy, Linux Doc Mailing List, Linux Kernel Mailing List,
	Linux Media Mailing List, ribalda

+CC Ricardo who will be looking into using this in the USB stack (UVC
camera driver).

On Wed, Sep 30, 2020 at 6:09 PM Christoph Hellwig <hch@lst.de> wrote:
>
> Add a new API that returns a virtually non-contiguous array of pages
> and dma address.  This API is only implemented for dma-iommu and will
> not be implemented for non-iommu DMA API instances that have to allocate
> contiguous memory.  It is up to the caller to check if the API is
> available.
>
> The intent is that media drivers can use this API if either:
>
>  - no kernel mapping or only temporary kernel mappings are required.
>    That is as a better replacement for DMA_ATTR_NO_KERNEL_MAPPING
>  - a kernel mapping is required for cached and DMA mapped pages, but
>    the driver also needs the pages to e.g. map them to userspace.
>    In that sense it is a replacement for some aspects of the recently
>    removed and never fully implemented DMA_ATTR_NON_CONSISTENT
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  drivers/iommu/dma-iommu.c   | 73 +++++++++++++++++++++++++------------
>  include/linux/dma-mapping.h |  9 +++++
>  kernel/dma/mapping.c        | 35 ++++++++++++++++++
>  3 files changed, 93 insertions(+), 24 deletions(-)
>
> diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
> index 7922f545cd5eef..158026a856622c 100644
> --- a/drivers/iommu/dma-iommu.c
> +++ b/drivers/iommu/dma-iommu.c
> @@ -565,23 +565,12 @@ static struct page **__iommu_dma_alloc_pages(struct device *dev,
>         return pages;
>  }
>
> -/**
> - * iommu_dma_alloc_remap - Allocate and map a buffer contiguous in IOVA space
> - * @dev: Device to allocate memory for. Must be a real device
> - *      attached to an iommu_dma_domain
> - * @size: Size of buffer in bytes
> - * @dma_handle: Out argument for allocated DMA handle
> - * @gfp: Allocation flags
> - * @prot: pgprot_t to use for the remapped mapping
> - * @attrs: DMA attributes for this allocation
> - *
> - * If @size is less than PAGE_SIZE, then a full CPU page will be allocated,
> +/*
> + * If size is less than PAGE_SIZE, then a full CPU page will be allocated,
>   * but an IOMMU which supports smaller pages might not map the whole thing.
> - *
> - * Return: Mapped virtual address, or NULL on failure.
>   */
> -static void *iommu_dma_alloc_remap(struct device *dev, size_t size,
> -               dma_addr_t *dma_handle, gfp_t gfp, pgprot_t prot,
> +static struct page **__iommu_dma_alloc_noncontiguous(struct device *dev,
> +               size_t size, dma_addr_t *dma_handle, gfp_t gfp, pgprot_t prot,
>                 unsigned long attrs)
>  {
>         struct iommu_domain *domain = iommu_get_dma_domain(dev);
> @@ -593,7 +582,6 @@ static void *iommu_dma_alloc_remap(struct device *dev, size_t size,
>         struct page **pages;
>         struct sg_table sgt;
>         dma_addr_t iova;
> -       void *vaddr;
>
>         *dma_handle = DMA_MAPPING_ERROR;
>
> @@ -636,17 +624,10 @@ static void *iommu_dma_alloc_remap(struct device *dev, size_t size,
>                         < size)
>                 goto out_free_sg;
>
> -       vaddr = dma_common_pages_remap(pages, size, prot,
> -                       __builtin_return_address(0));
> -       if (!vaddr)
> -               goto out_unmap;
> -
>         *dma_handle = iova;
>         sg_free_table(&sgt);
> -       return vaddr;
> +       return pages;
>
> -out_unmap:
> -       __iommu_dma_unmap(dev, iova, size);
>  out_free_sg:
>         sg_free_table(&sgt);
>  out_free_iova:
> @@ -656,6 +637,46 @@ static void *iommu_dma_alloc_remap(struct device *dev, size_t size,
>         return NULL;
>  }
>
> +static void *iommu_dma_alloc_remap(struct device *dev, size_t size,
> +               dma_addr_t *dma_handle, gfp_t gfp, pgprot_t prot,
> +               unsigned long attrs)
> +{
> +       struct page **pages;
> +       void *vaddr;
> +
> +       pages = __iommu_dma_alloc_noncontiguous(dev, size, dma_handle, gfp,
> +                                               prot, attrs);
> +       if (!pages)
> +               return NULL;
> +       vaddr = dma_common_pages_remap(pages, size, prot,
> +                       __builtin_return_address(0));
> +       if (!vaddr)
> +               goto out_unmap;
> +       return vaddr;
> +
> +out_unmap:
> +       __iommu_dma_unmap(dev, *dma_handle, size);
> +       __iommu_dma_free_pages(pages, PAGE_ALIGN(size) >> PAGE_SHIFT);
> +       return NULL;
> +}
> +
> +#ifdef CONFIG_DMA_REMAP
> +static struct page **iommu_dma_alloc_noncontiguous(struct device *dev,
> +               size_t size, dma_addr_t *dma_handle, gfp_t gfp,
> +               unsigned long attrs)
> +{
> +       return __iommu_dma_alloc_noncontiguous(dev, size, dma_handle, gfp,
> +                                              PAGE_KERNEL, attrs);
> +}
> +
> +static void iommu_dma_free_noncontiguous(struct device *dev, size_t size,
> +               struct page **pages, dma_addr_t dma_handle)
> +{
> +       __iommu_dma_unmap(dev, dma_handle, size);
> +       __iommu_dma_free_pages(pages, PAGE_ALIGN(size) >> PAGE_SHIFT);
> +}
> +#endif
> +
>  static void iommu_dma_sync_single_for_cpu(struct device *dev,
>                 dma_addr_t dma_handle, size_t size, enum dma_data_direction dir)
>  {
> @@ -1110,6 +1131,10 @@ static const struct dma_map_ops iommu_dma_ops = {
>         .free                   = iommu_dma_free,
>         .alloc_pages            = dma_common_alloc_pages,
>         .free_pages             = dma_common_free_pages,
> +#ifdef CONFIG_DMA_REMAP
> +       .alloc_noncontiguous    = iommu_dma_alloc_noncontiguous,
> +       .free_noncontiguous     = iommu_dma_free_noncontiguous,
> +#endif
>         .mmap                   = iommu_dma_mmap,
>         .get_sgtable            = iommu_dma_get_sgtable,
>         .map_page               = iommu_dma_map_page,
> diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
> index 4b9b1d64f5ec9e..51bbc32365bb8d 100644
> --- a/include/linux/dma-mapping.h
> +++ b/include/linux/dma-mapping.h
> @@ -74,6 +74,10 @@ struct dma_map_ops {
>                         gfp_t gfp);
>         void (*free_pages)(struct device *dev, size_t size, struct page *vaddr,
>                         dma_addr_t dma_handle, enum dma_data_direction dir);
> +       struct page **(*alloc_noncontiguous)(struct device *dev, size_t size,
> +                       dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs);
> +       void (*free_noncontiguous)(struct device *dev, size_t size,
> +                       struct page **pages, dma_addr_t dma_handle);
>         int (*mmap)(struct device *, struct vm_area_struct *,
>                           void *, dma_addr_t, size_t,
>                           unsigned long attrs);
> @@ -384,6 +388,11 @@ void *dma_alloc_noncoherent(struct device *dev, size_t size,
>                 dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp);
>  void dma_free_noncoherent(struct device *dev, size_t size, void *vaddr,
>                 dma_addr_t dma_handle, enum dma_data_direction dir);
> +bool dma_can_alloc_noncontiguous(struct device *dev);
> +struct page **dma_alloc_noncontiguous(struct device *dev, size_t size,
> +               dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs);
> +void dma_free_noncontiguous(struct device *dev, size_t size,
> +               struct page **pages, dma_addr_t dma_handle);
>
>  static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr,
>                 size_t size, enum dma_data_direction dir, unsigned long attrs)
> diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
> index 06115f59f4ffbf..6d975d1a20dd72 100644
> --- a/kernel/dma/mapping.c
> +++ b/kernel/dma/mapping.c
> @@ -529,6 +529,41 @@ void dma_free_noncoherent(struct device *dev, size_t size, void *vaddr,
>  }
>  EXPORT_SYMBOL_GPL(dma_free_noncoherent);
>
> +bool dma_can_alloc_noncontiguous(struct device *dev)
> +{
> +       const struct dma_map_ops *ops = get_dma_ops(dev);
> +
> +       return ops && ops->free_noncontiguous;
> +}
> +EXPORT_SYMBOL_GPL(dma_can_alloc_noncontiguous);
> +
> +struct page **dma_alloc_noncontiguous(struct device *dev, size_t size,
> +               dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
> +{
> +       const struct dma_map_ops *ops = get_dma_ops(dev);
> +
> +       if (WARN_ON_ONCE(!dma_can_alloc_noncontiguous(dev)))
> +               return NULL;
> +       if (attrs & ~DMA_ATTR_ALLOC_SINGLE_PAGES) {
> +               dev_warn(dev, "invalid flags (0x%lx) for %s\n",
> +                        attrs, __func__);
> +               return NULL;
> +       }
> +       return ops->alloc_noncontiguous(dev, size, dma_handle, gfp, attrs);
> +}
> +EXPORT_SYMBOL_GPL(dma_alloc_noncontiguous);
> +
> +void dma_free_noncontiguous(struct device *dev, size_t size,
> +               struct page **pages, dma_addr_t dma_handle)
> +{
> +       const struct dma_map_ops *ops = get_dma_ops(dev);
> +
> +       if (WARN_ON_ONCE(!dma_can_alloc_noncontiguous(dev)))
> +               return;
> +       ops->free_noncontiguous(dev, size, pages, dma_handle);
> +}
> +EXPORT_SYMBOL_GPL(dma_free_noncontiguous);
> +
>  int dma_supported(struct device *dev, u64 mask)
>  {
>         const struct dma_map_ops *ops = get_dma_ops(dev);
> --
> 2.28.0
>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* RE: [PATCH 8/8] WIP: add a dma_alloc_contiguous API
  2020-10-14 13:20   ` Tomasz Figa
@ 2020-10-14 15:03     ` David Laight
  2020-11-09 14:53     ` Ricardo Ribalda
  1 sibling, 0 replies; 28+ messages in thread
From: David Laight @ 2020-10-14 15:03 UTC (permalink / raw)
  To: 'Tomasz Figa', Christoph Hellwig
  Cc: Mauro Carvalho Chehab, Marek Szyprowski,
	list@263.net:IOMMU DRIVERS
	<iommu@lists.linux-foundation.org>,
	Joerg  Roedel <joro@8bytes.org>,,
	Robin Murphy, Linux Doc Mailing List, Linux Kernel Mailing List,
	Linux Media Mailing List, ribalda

> On Wed, Sep 30, 2020 at 6:09 PM Christoph Hellwig <hch@lst.de> wrote:
> >
> > Add a new API that returns a virtually non-contiguous array of pages
> > and dma address.  This API is only implemented for dma-iommu and will
> > not be implemented for non-iommu DMA API instances that have to allocate
> > contiguous memory.  It is up to the caller to check if the API is
> > available.

Isn't there already a flag that is only implemented for ARM
systems with iommu that forces pages to actually be physically
contiguous?

So isn't this some kind of strange opposite?

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 8/8] WIP: add a dma_alloc_contiguous API
  2020-10-14 13:20   ` Tomasz Figa
  2020-10-14 15:03     ` David Laight
@ 2020-11-09 14:53     ` Ricardo Ribalda
  2020-11-10  9:25       ` Christoph Hellwig
  1 sibling, 1 reply; 28+ messages in thread
From: Ricardo Ribalda @ 2020-11-09 14:53 UTC (permalink / raw)
  To: Tomasz Figa
  Cc: Christoph Hellwig, Mauro Carvalho Chehab, Marek Szyprowski,
	list@263.net:IOMMU DRIVERS
	<iommu@lists.linux-foundation.org>,
	Joerg Roedel <joro@8bytes.org>,,
	Robin Murphy, Linux Doc Mailing List, Linux Kernel Mailing List,
	Linux Media Mailing List

Hi Christoph

I have started now to give a try to your patchset. Sorry for the delay.

For uvc I have prepared this patch:
https://github.com/ribalda/linux/commit/9094fe223fe38f8c8ff21366d893b43cbbdf0113

I have tested it successfully on an x86_64 notebook... yes, I know there
is no change for that platform :).
I am trying to get hold of an arm device that can run the latest
kernel from upstream.

In the meantime, if you could take a look at the patch to verify that
this is the way that you expect the drivers to use your API, I would
appreciate it.

Thanks



On Wed, Oct 14, 2020 at 3:20 PM Tomasz Figa <tfiga@chromium.org> wrote:
>
> +CC Ricardo who will be looking into using this in the USB stack (UVC
> camera driver).
>
> On Wed, Sep 30, 2020 at 6:09 PM Christoph Hellwig <hch@lst.de> wrote:
> >
> > Add a new API that returns a virtually non-contigous array of pages
> > and dma address.  This API is only implemented for dma-iommu and will
> > not be implemented for non-iommu DMA API instances that have to allocate
> > contiguous memory.  It is up to the caller to check if the API is
> > available.
> >
> > The intent is that media drivers can use this API if either:
> >
> >  - no kernel mapping or only temporary kernel mappings are required.
> >    That is as a better replacement for DMA_ATTR_NO_KERNEL_MAPPING
> >  - a kernel mapping is required for cached and DMA mapped pages, but
> >    the driver also needs the pages to e.g. map them to userspace.
> >    In that sense it is a replacement for some aspects of the recently
> >    removed and never fully implemented DMA_ATTR_NON_CONSISTENT
> >
> > Signed-off-by: Christoph Hellwig <hch@lst.de>
> > ---
> >  drivers/iommu/dma-iommu.c   | 73 +++++++++++++++++++++++++------------
> >  include/linux/dma-mapping.h |  9 +++++
> >  kernel/dma/mapping.c        | 35 ++++++++++++++++++
> >  3 files changed, 93 insertions(+), 24 deletions(-)
> >
> > diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
> > index 7922f545cd5eef..158026a856622c 100644
> > --- a/drivers/iommu/dma-iommu.c
> > +++ b/drivers/iommu/dma-iommu.c
> > @@ -565,23 +565,12 @@ static struct page **__iommu_dma_alloc_pages(struct device *dev,
> >         return pages;
> >  }
> >
> > -/**
> > - * iommu_dma_alloc_remap - Allocate and map a buffer contiguous in IOVA space
> > - * @dev: Device to allocate memory for. Must be a real device
> > - *      attached to an iommu_dma_domain
> > - * @size: Size of buffer in bytes
> > - * @dma_handle: Out argument for allocated DMA handle
> > - * @gfp: Allocation flags
> > - * @prot: pgprot_t to use for the remapped mapping
> > - * @attrs: DMA attributes for this allocation
> > - *
> > - * If @size is less than PAGE_SIZE, then a full CPU page will be allocated,
> > +/*
> > + * If size is less than PAGE_SIZE, then a full CPU page will be allocated,
> >   * but an IOMMU which supports smaller pages might not map the whole thing.
> > - *
> > - * Return: Mapped virtual address, or NULL on failure.
> >   */
> > -static void *iommu_dma_alloc_remap(struct device *dev, size_t size,
> > -               dma_addr_t *dma_handle, gfp_t gfp, pgprot_t prot,
> > +static struct page **__iommu_dma_alloc_noncontiguous(struct device *dev,
> > +               size_t size, dma_addr_t *dma_handle, gfp_t gfp, pgprot_t prot,
> >                 unsigned long attrs)
> >  {
> >         struct iommu_domain *domain = iommu_get_dma_domain(dev);
> > @@ -593,7 +582,6 @@ static void *iommu_dma_alloc_remap(struct device *dev, size_t size,
> >         struct page **pages;
> >         struct sg_table sgt;
> >         dma_addr_t iova;
> > -       void *vaddr;
> >
> >         *dma_handle = DMA_MAPPING_ERROR;
> >
> > @@ -636,17 +624,10 @@ static void *iommu_dma_alloc_remap(struct device *dev, size_t size,
> >                         < size)
> >                 goto out_free_sg;
> >
> > -       vaddr = dma_common_pages_remap(pages, size, prot,
> > -                       __builtin_return_address(0));
> > -       if (!vaddr)
> > -               goto out_unmap;
> > -
> >         *dma_handle = iova;
> >         sg_free_table(&sgt);
> > -       return vaddr;
> > +       return pages;
> >
> > -out_unmap:
> > -       __iommu_dma_unmap(dev, iova, size);
> >  out_free_sg:
> >         sg_free_table(&sgt);
> >  out_free_iova:
> > @@ -656,6 +637,46 @@ static void *iommu_dma_alloc_remap(struct device *dev, size_t size,
> >         return NULL;
> >  }
> >
> > +static void *iommu_dma_alloc_remap(struct device *dev, size_t size,
> > +               dma_addr_t *dma_handle, gfp_t gfp, pgprot_t prot,
> > +               unsigned long attrs)
> > +{
> > +       struct page **pages;
> > +       void *vaddr;
> > +
> > +       pages = __iommu_dma_alloc_noncontiguous(dev, size, dma_handle, gfp,
> > +                                               prot, attrs);
> > +       if (!pages)
> > +               return NULL;
> > +       vaddr = dma_common_pages_remap(pages, size, prot,
> > +                       __builtin_return_address(0));
> > +       if (!vaddr)
> > +               goto out_unmap;
> > +       return vaddr;
> > +
> > +out_unmap:
> > +       __iommu_dma_unmap(dev, *dma_handle, size);
> > +       __iommu_dma_free_pages(pages, PAGE_ALIGN(size) >> PAGE_SHIFT);
> > +       return NULL;
> > +}
> > +
> > +#ifdef CONFIG_DMA_REMAP
> > +static struct page **iommu_dma_alloc_noncontiguous(struct device *dev,
> > +               size_t size, dma_addr_t *dma_handle, gfp_t gfp,
> > +               unsigned long attrs)
> > +{
> > +       return __iommu_dma_alloc_noncontiguous(dev, size, dma_handle, gfp,
> > +                                              PAGE_KERNEL, attrs);
> > +}
> > +
> > +static void iommu_dma_free_noncontiguous(struct device *dev, size_t size,
> > +               struct page **pages, dma_addr_t dma_handle)
> > +{
> > +       __iommu_dma_unmap(dev, dma_handle, size);
> > +       __iommu_dma_free_pages(pages, PAGE_ALIGN(size) >> PAGE_SHIFT);
> > +}
> > +#endif
> > +
> >  static void iommu_dma_sync_single_for_cpu(struct device *dev,
> >                 dma_addr_t dma_handle, size_t size, enum dma_data_direction dir)
> >  {
> > @@ -1110,6 +1131,10 @@ static const struct dma_map_ops iommu_dma_ops = {
> >         .free                   = iommu_dma_free,
> >         .alloc_pages            = dma_common_alloc_pages,
> >         .free_pages             = dma_common_free_pages,
> > +#ifdef CONFIG_DMA_REMAP
> > +       .alloc_noncontiguous    = iommu_dma_alloc_noncontiguous,
> > +       .free_noncontiguous     = iommu_dma_free_noncontiguous,
> > +#endif
> >         .mmap                   = iommu_dma_mmap,
> >         .get_sgtable            = iommu_dma_get_sgtable,
> >         .map_page               = iommu_dma_map_page,
> > diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
> > index 4b9b1d64f5ec9e..51bbc32365bb8d 100644
> > --- a/include/linux/dma-mapping.h
> > +++ b/include/linux/dma-mapping.h
> > @@ -74,6 +74,10 @@ struct dma_map_ops {
> >                         gfp_t gfp);
> >         void (*free_pages)(struct device *dev, size_t size, struct page *vaddr,
> >                         dma_addr_t dma_handle, enum dma_data_direction dir);
> > +       struct page **(*alloc_noncontiguous)(struct device *dev, size_t size,
> > +                       dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs);
> > +       void (*free_noncontiguous)(struct device *dev, size_t size,
> > +                       struct page **pages, dma_addr_t dma_handle);
> >         int (*mmap)(struct device *, struct vm_area_struct *,
> >                           void *, dma_addr_t, size_t,
> >                           unsigned long attrs);
> > @@ -384,6 +388,11 @@ void *dma_alloc_noncoherent(struct device *dev, size_t size,
> >                 dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp);
> >  void dma_free_noncoherent(struct device *dev, size_t size, void *vaddr,
> >                 dma_addr_t dma_handle, enum dma_data_direction dir);
> > +bool dma_can_alloc_noncontiguous(struct device *dev);
> > +struct page **dma_alloc_noncontiguous(struct device *dev, size_t size,
> > +               dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs);
> > +void dma_free_noncontiguous(struct device *dev, size_t size,
> > +               struct page **pages, dma_addr_t dma_handle);
> >
> >  static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr,
> >                 size_t size, enum dma_data_direction dir, unsigned long attrs)
> > diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
> > index 06115f59f4ffbf..6d975d1a20dd72 100644
> > --- a/kernel/dma/mapping.c
> > +++ b/kernel/dma/mapping.c
> > @@ -529,6 +529,41 @@ void dma_free_noncoherent(struct device *dev, size_t size, void *vaddr,
> >  }
> >  EXPORT_SYMBOL_GPL(dma_free_noncoherent);
> >
> > +bool dma_can_alloc_noncontiguous(struct device *dev)
> > +{
> > +       const struct dma_map_ops *ops = get_dma_ops(dev);
> > +
> > +       return ops && ops->free_noncontiguous;
> > +}
> > +EXPORT_SYMBOL_GPL(dma_can_alloc_noncontiguous);
> > +
> > +struct page **dma_alloc_noncontiguous(struct device *dev, size_t size,
> > +               dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
> > +{
> > +       const struct dma_map_ops *ops = get_dma_ops(dev);
> > +
> > +       if (WARN_ON_ONCE(!dma_can_alloc_noncontiguous(dev)))
> > +               return NULL;
> > +       if (attrs & ~DMA_ATTR_ALLOC_SINGLE_PAGES) {
> > +               dev_warn(dev, "invalid flags (0x%lx) for %s\n",
> > +                        attrs, __func__);
> > +               return NULL;
> > +       }
> > +       return ops->alloc_noncontiguous(dev, size, dma_handle, gfp, attrs);
> > +}
> > +EXPORT_SYMBOL_GPL(dma_alloc_noncontiguous);
> > +
> > +void dma_free_noncontiguous(struct device *dev, size_t size,
> > +               struct page **pages, dma_addr_t dma_handle)
> > +{
> > +       const struct dma_map_ops *ops = get_dma_ops(dev);
> > +
> > +       if (WARN_ON_ONCE(!dma_can_alloc_noncontiguous(dev)))
> > +               return;
> > +       ops->free_noncontiguous(dev, size, pages, dma_handle);
> > +}
> > +EXPORT_SYMBOL_GPL(dma_free_noncontiguous);
> > +
> >  int dma_supported(struct device *dev, u64 mask)
> >  {
> >         const struct dma_map_ops *ops = get_dma_ops(dev);
> > --
> > 2.28.0
> >



-- 
Ricardo Ribalda

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 8/8] WIP: add a dma_alloc_contiguous API
  2020-11-09 14:53     ` Ricardo Ribalda
@ 2020-11-10  9:25       ` Christoph Hellwig
  2020-11-10  9:33         ` Ricardo Ribalda
  0 siblings, 1 reply; 28+ messages in thread
From: Christoph Hellwig @ 2020-11-10  9:25 UTC (permalink / raw)
  To: Ricardo Ribalda
  Cc: Tomasz Figa, Christoph Hellwig, Mauro Carvalho Chehab,
	Marek Szyprowski,
	list@263.net:IOMMU DRIVERS
	<iommu@lists.linux-foundation.org>,
	Joerg Roedel <joro@8bytes.org>,,
	Robin Murphy, Linux Doc Mailing List, Linux Kernel Mailing List,
	Linux Media Mailing List

On Mon, Nov 09, 2020 at 03:53:55PM +0100, Ricardo Ribalda wrote:
> Hi Christoph
> 
> I have started now to give a try to your patchset. Sorry for the delay.
> 
> For uvc I have prepared this patch:
> https://github.com/ribalda/linux/commit/9094fe223fe38f8c8ff21366d893b43cbbdf0113
> 
> I have tested successfully in a x86_64 noteboot..., yes I know there
> is no change for that platform :).
> I am trying to get hold of an arm device that can run the latest
> kernel from upstream.
> 
> On the meanwhile if you could take a look to the patch to verify that
> this the way that you expect the drivers to use your api I would
> appreciate it

This looks pretty reasonable.

Note that the #ifdef CONFIG_DMA_NONCOHERENT in the old code doesn't actually
work, as that option is an internal thing just for mips and sh.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 8/8] WIP: add a dma_alloc_contiguous API
  2020-11-10  9:25       ` Christoph Hellwig
@ 2020-11-10  9:33         ` Ricardo Ribalda
  2020-11-10  9:41           ` Christoph Hellwig
  2020-11-10  9:50           ` Tomasz Figa
  0 siblings, 2 replies; 28+ messages in thread
From: Ricardo Ribalda @ 2020-11-10  9:33 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Tomasz Figa, Mauro Carvalho Chehab, Marek Szyprowski,
	list@263.net:IOMMU DRIVERS
	<iommu@lists.linux-foundation.org>,
	Joerg Roedel <joro@8bytes.org>,,
	Robin Murphy, Linux Doc Mailing List, Linux Kernel Mailing List,
	Linux Media Mailing List

Hi Christoph

On Tue, Nov 10, 2020 at 10:25 AM Christoph Hellwig <hch@lst.de> wrote:
>
> On Mon, Nov 09, 2020 at 03:53:55PM +0100, Ricardo Ribalda wrote:
> > Hi Christoph
> >
> > I have started now to give a try to your patchset. Sorry for the delay.
> >
> > For uvc I have prepared this patch:
> > https://github.com/ribalda/linux/commit/9094fe223fe38f8c8ff21366d893b43cbbdf0113
> >
> > I have tested successfully in a x86_64 noteboot..., yes I know there
> > is no change for that platform :).
> > I am trying to get hold of an arm device that can run the latest
> > kernel from upstream.
> >
> > On the meanwhile if you could take a look to the patch to verify that
> > this the way that you expect the drivers to use your api I would
> > appreciate it
>
> This looks pretty reaosnable.
>

Great

Also FYI, I managed to boot an ARM device with that tree. But I could
not test the uvc driver (it was a remote device with no usb device
attached)

Hopefully I will be able to test it for real this week.

Any suggestions for how to measure performance difference?

Thanks!

> Note that ifdef  CONFIG_DMA_NONCOHERENT in the old code doesn't actually
> work, as that option is an internal thing just for mips and sh..



-- 
Ricardo Ribalda

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 8/8] WIP: add a dma_alloc_contiguous API
  2020-11-10  9:33         ` Ricardo Ribalda
@ 2020-11-10  9:41           ` Christoph Hellwig
  2020-11-10  9:50           ` Tomasz Figa
  1 sibling, 0 replies; 28+ messages in thread
From: Christoph Hellwig @ 2020-11-10  9:41 UTC (permalink / raw)
  To: Ricardo Ribalda
  Cc: Christoph Hellwig, Tomasz Figa, Mauro Carvalho Chehab,
	Marek Szyprowski,
	list@263.net:IOMMU DRIVERS
	<iommu@lists.linux-foundation.org>,
	Joerg Roedel <joro@8bytes.org>,,
	Robin Murphy, Linux Doc Mailing List, Linux Kernel Mailing List,
	Linux Media Mailing List

On Tue, Nov 10, 2020 at 10:33:05AM +0100, Ricardo Ribalda wrote:
> Also FYI, I managed to boot an ARM device with that tree. But I could
> not test the uvc driver (it was a remote device with no usb device
> attached)
> 
> Hopefully I will be able to test it for real this week.
> 
> Any suggestions for how to measure performance difference?

I have to admit I don't know at all how uvc works.  But the main
problem with dma_alloc_coherent is that all access is uncached.  So
anything that does larger and/or many data transfers to and from it
will be glacially slow.  With the dma streaming API we still have to
pay for cache flushes, but only before and after the transfers, and
in many cases in a somewhat optimized fashion.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 8/8] WIP: add a dma_alloc_contiguous API
  2020-11-10  9:33         ` Ricardo Ribalda
  2020-11-10  9:41           ` Christoph Hellwig
@ 2020-11-10  9:50           ` Tomasz Figa
  2020-11-10  9:57             ` Christoph Hellwig
  1 sibling, 1 reply; 28+ messages in thread
From: Tomasz Figa @ 2020-11-10  9:50 UTC (permalink / raw)
  To: Ricardo Ribalda, Christoph Hellwig, Kieran Bingham, Laurent Pinchart
  Cc: Mauro Carvalho Chehab, Marek Szyprowski,
	list@263.net:IOMMU DRIVERS
	<iommu@lists.linux-foundation.org>,
	Joerg Roedel <joro@8bytes.org>,,
	Robin Murphy, Linux Doc Mailing List, Linux Kernel Mailing List,
	Linux Media Mailing List

On Tue, Nov 10, 2020 at 6:33 PM Ricardo Ribalda <ribalda@chromium.org> wrote:
>
> Hi Christoph
>
> On Tue, Nov 10, 2020 at 10:25 AM Christoph Hellwig <hch@lst.de> wrote:
> >
> > On Mon, Nov 09, 2020 at 03:53:55PM +0100, Ricardo Ribalda wrote:
> > > Hi Christoph
> > >
> > > I have started now to give a try to your patchset. Sorry for the delay.
> > >
> > > For uvc I have prepared this patch:
> > > https://github.com/ribalda/linux/commit/9094fe223fe38f8c8ff21366d893b43cbbdf0113
> > >
> > > I have tested successfully in a x86_64 noteboot..., yes I know there
> > > is no change for that platform :).
> > > I am trying to get hold of an arm device that can run the latest
> > > kernel from upstream.
> > >
> > > On the meanwhile if you could take a look to the patch to verify that
> > > this the way that you expect the drivers to use your api I would
> > > appreciate it
> >
> > This looks pretty reaosnable.
> >
>
> Great
>

Thanks Christoph for taking a look quickly.

> Also FYI, I managed to boot an ARM device with that tree. But I could
> not test the uvc driver (it was a remote device with no usb device
> attached)
>
> Hopefully I will be able to test it for real this week.
>
> Any suggestions for how to measure performance difference?

A while back, Kieran (+CC) shared a patch to add extra statistics for
packet processing and payload assembly, with the results of various
approaches summarized in a spreadsheet:
https://docs.google.com/spreadsheets/d/1uPdbdVcebO9OQ0LQ8hR2LGIEySWgSnGwwhzv7LPXAlU/edit#gid=0

That and just simple CPU usage comparison would be enough.

>
> Thanks!
>
> > Note that ifdef  CONFIG_DMA_NONCOHERENT in the old code doesn't actually
> > work, as that option is an internal thing just for mips and sh..

In what sense doesn't it actually work? Last time I checked, some
platforms actually defined CONFIG_DMA_NONCOHERENT, so those would
instead use the kmalloc() + dma_map() path. I don't have any
background on why that was added and whether it needs to be preserved,
though. Kieran, Laurent, do you have any insight?

Best regards,
Tomasz

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 8/8] WIP: add a dma_alloc_contiguous API
  2020-11-10  9:50           ` Tomasz Figa
@ 2020-11-10  9:57             ` Christoph Hellwig
  2020-11-17 21:21               ` Ricardo Ribalda
  0 siblings, 1 reply; 28+ messages in thread
From: Christoph Hellwig @ 2020-11-10  9:57 UTC (permalink / raw)
  To: Tomasz Figa
  Cc: Ricardo Ribalda, Christoph Hellwig, Kieran Bingham,
	Laurent Pinchart, Mauro Carvalho Chehab, Marek Szyprowski,
	list@263.net:IOMMU DRIVERS
	<iommu@lists.linux-foundation.org>,
	Joerg Roedel <joro@8bytes.org>,,
	Robin Murphy, Linux Doc Mailing List, Linux Kernel Mailing List,
	Linux Media Mailing List

On Tue, Nov 10, 2020 at 06:50:32PM +0900, Tomasz Figa wrote:
> In what terms it doesn't actually work? Last time I checked some
> platforms actually defined CONFIG_DMA_NONCOHERENT, so those would
> instead use the kmalloc() + dma_map() path. I don't have any
> background on why that was added and whether it needs to be preserved,
> though. Kieran, Laurent, do you have any insight?

CONFIG_DMA_NONCOHERENT is set on sh and mips for platforms that may
support non-coherent DMA at compile time (but at least for mips that
doesn't actually mean this gets used).  Using that ifdef to decide
on using usb_alloc_coherent vs letting the usb layer map the data
seems odd at best and, if we are unlucky, is papering over a bug somewhere.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 8/8] WIP: add a dma_alloc_contiguous API
  2020-11-10  9:57             ` Christoph Hellwig
@ 2020-11-17 21:21               ` Ricardo Ribalda
  0 siblings, 0 replies; 28+ messages in thread
From: Ricardo Ribalda @ 2020-11-17 21:21 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Tomasz Figa, Kieran Bingham, Laurent Pinchart,
	Mauro Carvalho Chehab, Marek Szyprowski,
	list@263.net:IOMMU DRIVERS
	<iommu@lists.linux-foundation.org>,
	Joerg Roedel <joro@8bytes.org>,,
	Robin Murphy, Linux Doc Mailing List, Linux Kernel Mailing List,
	Linux Media Mailing List

Hi Christoph

I have been testing your patchset on real arm64 hardware, and uvc
performs 20 times better using Kieran's test:

https://github.com/ribalda/linux/tree/uvc-noncontiguous

These are the results of running yavta --capture=1000:


dma_alloc_noncontiguous

frames:  999
packets: 999
empty:   0 (0 %)
errors:  0
invalid: 0
pts: 0 early, 0 initial, 999 ok
scr: 0 count ok, 0 diff ok
sof: 2048 <= sof <= 0, freq 0.000 kHz
bytes 78466000 : duration 33303
FPS: 29.99
URB: 418105/5000 uS/qty: 83.621 avg 98.783 std 17.396 min 1264.688 max (uS)
header: 100040/5000 uS/qty: 20.008 avg 19.458 std 2.969 min 454.167 max (uS)
latency: 347653/5000 uS/qty: 69.530 avg 98.937 std 9.114 min 1256.875 max (uS)
decode: 70452/5000 uS/qty: 14.090 avg 11.547 std 6.146 min 271.510 max (uS)
raw decode speed: 8.967 Gbits/s
raw URB handling speed: 1.501 Gbits/s
throughput: 18.848 Mbits/s
URB decode CPU usage 0.211500 %


usb_alloc_coherent

frames:  999
packets: 999
empty:   0 (0 %)
errors:  0
invalid: 0
pts: 0 early, 0 initial, 999 ok
scr: 0 count ok, 0 diff ok
sof: 2048 <= sof <= 0, freq 0.000 kHz
bytes 70501712 : duration 33319
FPS: 29.98
URB: 1854128/5000 uS/qty: 370.825 avg 417.133 std 14.539 min 2875.760 max (uS)
header: 98765/5000 uS/qty: 19.753 avg 30.714 std 1.042 min 573.463 max (uS)
latency: 453316/5000 uS/qty: 90.663 avg 114.987 std 4.065 min 860.795 max (uS)
decode: 1400811/5000 uS/qty: 280.162 avg 330.786 std 6.305 min 2758.202 max (uS)
raw decode speed: 402.866 Mbits/s
raw URB handling speed: 304.214 Mbits/s
throughput: 16.927 Mbits/s
URB decode CPU usage 4.204200 %


Best regards

On Tue, Nov 10, 2020 at 10:57 AM Christoph Hellwig <hch@lst.de> wrote:
>
> On Tue, Nov 10, 2020 at 06:50:32PM +0900, Tomasz Figa wrote:
> > In what terms it doesn't actually work? Last time I checked some
> > platforms actually defined CONFIG_DMA_NONCOHERENT, so those would
> > instead use the kmalloc() + dma_map() path. I don't have any
> > background on why that was added and whether it needs to be preserved,
> > though. Kieran, Laurent, do you have any insight?
>
> CONFIG_DMA_NONCOHERENT is set on sh and mips for platforms that may
> support non-coherent DMA at compile time (but at least for mips that
> doesn't actually means this gets used).  Using that ifdef to decide
> on using usb_alloc_coherent vs letting the usb layer map the data
> seems at best odd, and if we are unlucky papering over a bug somewhere.



-- 
Ricardo Ribalda

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH] WIP! media: uvcvideo: Use dma_alloc_noncontiguos API
  2020-09-30 16:09 ` [PATCH 8/8] WIP: add a dma_alloc_contiguous API Christoph Hellwig
  2020-10-02 17:50   ` Tomasz Figa
  2020-10-14 13:20   ` Tomasz Figa
@ 2020-11-18 14:25   ` Ricardo Ribalda
  2020-11-24 11:35     ` Christoph Hellwig
  2 siblings, 1 reply; 28+ messages in thread
From: Ricardo Ribalda @ 2020-11-18 14:25 UTC (permalink / raw)
  To: Christoph Hellwig, Mauro Carvalho Chehab, Marek Szyprowski,
	IOMMU DRIVERS, Joerg Roedel, Robin Murphy,
	Linux Doc Mailing List, Linux Kernel Mailing List,
	Linux Media Mailing List, Tomasz Figa
  Cc: Ricardo Ribalda

On architectures where there is no coherent caching, such as ARM, use the
dma_alloc_noncontiguous API and manually handle the cache flushing using
dma_sync_single().

With this patch on the affected architectures we can measure up to 20x
performance improvement in uvc_video_copy_data_work().

Signed-off-by: Ricardo Ribalda <ribalda@chromium.org>
---

This patch depends on the "dma_alloc_contiguous API" patch:

https://lore.kernel.org/patchwork/patch/1315351/#1535182

 drivers/media/usb/uvc/uvc_video.c | 69 +++++++++++++++++++++++++------
 drivers/media/usb/uvc/uvcvideo.h  |  1 +
 2 files changed, 58 insertions(+), 12 deletions(-)

diff --git a/drivers/media/usb/uvc/uvc_video.c b/drivers/media/usb/uvc/uvc_video.c
index ff624bb857d3..ef1b029b8576 100644
--- a/drivers/media/usb/uvc/uvc_video.c
+++ b/drivers/media/usb/uvc/uvc_video.c
@@ -1641,6 +1641,11 @@ static void uvc_video_encode_bulk(struct uvc_urb *uvc_urb,
 	urb->transfer_buffer_length = stream->urb_size - len;
 }
 
+static inline struct device *stream_to_dmadev(struct uvc_streaming *stream)
+{
+	return stream->dev->udev->bus->controller->parent;
+}
+
 static void uvc_video_complete(struct urb *urb)
 {
 	struct uvc_urb *uvc_urb = urb->context;
@@ -1693,6 +1698,11 @@ static void uvc_video_complete(struct urb *urb)
 	 * Process the URB headers, and optionally queue expensive memcpy tasks
 	 * to be deferred to a work queue.
 	 */
+	if (uvc_urb->pages)
+		dma_sync_single_for_cpu(stream_to_dmadev(stream),
+					urb->transfer_dma,
+					urb->transfer_buffer_length,
+					DMA_FROM_DEVICE);
 	stream->decode(uvc_urb, buf, buf_meta);
 
 	/* If no async work is needed, resubmit the URB immediately. */
@@ -1723,8 +1733,15 @@ static void uvc_free_urb_buffers(struct uvc_streaming *stream)
 			continue;
 
 #ifndef CONFIG_DMA_NONCOHERENT
-		usb_free_coherent(stream->dev->udev, stream->urb_size,
-				  uvc_urb->buffer, uvc_urb->dma);
+		if (uvc_urb->pages) {
+			vunmap(uvc_urb->buffer);
+			dma_free_noncontiguous(stream_to_dmadev(stream),
+					       stream->urb_size,
+					       uvc_urb->pages, uvc_urb->dma);
+		} else {
+			usb_free_coherent(stream->dev->udev, stream->urb_size,
+					  uvc_urb->buffer, uvc_urb->dma);
+		}
 #else
 		kfree(uvc_urb->buffer);
 #endif
@@ -1734,6 +1751,42 @@ static void uvc_free_urb_buffers(struct uvc_streaming *stream)
 	stream->urb_size = 0;
 }
 
+#ifndef CONFIG_DMA_NONCOHERENT
+static bool uvc_alloc_urb_buffer(struct uvc_streaming *stream, struct uvc_urb *uvc_urb,
+				 gfp_t gfp_flags)
+{
+	struct device *dma_dev = dma_dev = stream_to_dmadev(stream);
+
+	if (!dma_can_alloc_noncontiguous(dma_dev)) {
+		uvc_urb->buffer = usb_alloc_coherent(stream->dev->udev, stream->urb_size,
+						     gfp_flags | __GFP_NOWARN, &uvc_urb->dma);
+		return uvc_urb->buffer != NULL;
+	}
+
+	uvc_urb->pages = dma_alloc_noncontiguous(dma_dev, stream->urb_size,
+						 &uvc_urb->dma, gfp_flags | __GFP_NOWARN, 0);
+	if (!uvc_urb->pages)
+		return false;
+
+	uvc_urb->buffer = vmap(uvc_urb->pages, PAGE_ALIGN(stream->urb_size) >> PAGE_SHIFT,
+			       VM_DMA_COHERENT, PAGE_KERNEL);
+	if (!uvc_urb->buffer) {
+		dma_free_noncontiguous(dma_dev, stream->urb_size, uvc_urb->pages, uvc_urb->dma);
+		return false;
+	}
+
+	return true;
+}
+#else
+static bool uvc_alloc_urb_buffer(struct uvc_streaming *stream, struct uvc_urb *uvc_urb,
+				 gfp_t gfp_flags)
+{
+	uvc_urb->buffer = kmalloc(stream->urb_size, gfp_flags | __GFP_NOWARN);
+
+	return uvc_urb->buffer != NULL;
+}
+#endif
+
 /*
  * Allocate transfer buffers. This function can be called with buffers
  * already allocated when resuming from suspend, in which case it will
@@ -1764,19 +1817,11 @@ static int uvc_alloc_urb_buffers(struct uvc_streaming *stream,
 
 	/* Retry allocations until one succeed. */
 	for (; npackets > 1; npackets /= 2) {
+		stream->urb_size = psize * npackets;
 		for (i = 0; i < UVC_URBS; ++i) {
 			struct uvc_urb *uvc_urb = &stream->uvc_urb[i];
 
-			stream->urb_size = psize * npackets;
-#ifndef CONFIG_DMA_NONCOHERENT
-			uvc_urb->buffer = usb_alloc_coherent(
-				stream->dev->udev, stream->urb_size,
-				gfp_flags | __GFP_NOWARN, &uvc_urb->dma);
-#else
-			uvc_urb->buffer =
-			    kmalloc(stream->urb_size, gfp_flags | __GFP_NOWARN);
-#endif
-			if (!uvc_urb->buffer) {
+			if (!uvc_alloc_urb_buffer(stream, uvc_urb, gfp_flags)) {
 				uvc_free_urb_buffers(stream);
 				break;
 			}
diff --git a/drivers/media/usb/uvc/uvcvideo.h b/drivers/media/usb/uvc/uvcvideo.h
index 60d830d74ac1..80eeeaf3cd06 100644
--- a/drivers/media/usb/uvc/uvcvideo.h
+++ b/drivers/media/usb/uvc/uvcvideo.h
@@ -544,6 +544,7 @@ struct uvc_urb {
 
 	char *buffer;
 	dma_addr_t dma;
+	struct page **pages;
 
 	unsigned int async_operations;
 	struct uvc_copy_op copy_operations[UVC_MAX_PACKETS];
-- 
2.29.2.299.gdc1121823c-goog


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH] WIP! media: uvcvideo: Use dma_alloc_noncontiguos API
  2020-11-18 14:25   ` [PATCH] WIP! media: uvcvideo: Use dma_alloc_noncontiguos API Ricardo Ribalda
@ 2020-11-24 11:35     ` Christoph Hellwig
  2020-11-24 12:01       ` Ricardo Ribalda
  0 siblings, 1 reply; 28+ messages in thread
From: Christoph Hellwig @ 2020-11-24 11:35 UTC (permalink / raw)
  To: Ricardo Ribalda
  Cc: Christoph Hellwig, Mauro Carvalho Chehab, Marek Szyprowski,
	IOMMU DRIVERS, Joerg Roedel, Robin Murphy,
	Linux Doc Mailing List, Linux Kernel Mailing List,
	Linux Media Mailing List, Tomasz Figa

On Wed, Nov 18, 2020 at 03:25:46PM +0100, Ricardo Ribalda wrote:
> On architectures where the is no coherent caching such as ARM use the
> dma_alloc_noncontiguos API and handle manually the cache flushing using
> dma_sync_single().
> 
> With this patch on the affected architectures we can measure up to 20x
> performance improvement in uvc_video_copy_data_work().

This has a bunch of crazy long lines, but otherwise looks fine to me.

> 
> Signed-off-by: Ricardo Ribalda <ribalda@chromium.org>
> ---
> 
> This patch depends on dma_alloc_contiguous API1315351diffmboxseries

How do we want to proceed?  Do the media maintainers want to pick up
that patch?  Should I pick up the media patch in the dma-mapping tree?

Can you repost a combined series to get started?

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] WIP! media: uvcvideo: Use dma_alloc_noncontiguos API
  2020-11-24 11:35     ` Christoph Hellwig
@ 2020-11-24 12:01       ` Ricardo Ribalda
  2020-11-24 13:33         ` Christoph Hellwig
  0 siblings, 1 reply; 28+ messages in thread
From: Ricardo Ribalda @ 2020-11-24 12:01 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Mauro Carvalho Chehab, Marek Szyprowski, IOMMU DRIVERS,
	Joerg Roedel, Robin Murphy, Linux Doc Mailing List,
	Linux Kernel Mailing List, Linux Media Mailing List, Tomasz Figa,
	Sergey Senozhatsky

Hi Christoph

On Tue, Nov 24, 2020 at 12:35 PM Christoph Hellwig <hch@lst.de> wrote:
>
> On Wed, Nov 18, 2020 at 03:25:46PM +0100, Ricardo Ribalda wrote:
> > On architectures where the is no coherent caching such as ARM use the
> > dma_alloc_noncontiguos API and handle manually the cache flushing using
> > dma_sync_single().
> >
> > With this patch on the affected architectures we can measure up to 20x
> > performance improvement in uvc_video_copy_data_work().
>
> This has a bunch of crazy long lines, but otherwise looks fine to me.

That is easy to solve :)

https://github.com/ribalda/linux/commit/17ab65a08302e845ad7ae7775ce54b387a58a887

>
> >
> > Signed-off-by: Ricardo Ribalda <ribalda@chromium.org>
> > ---
> >
> > This patch depends on dma_alloc_contiguous API1315351diffmboxseries
>
> How do we want to proceed?  Do the media maintainers want to pick up
> that patch?  Should I pick up the media patch in the dma-mapping tree?

I was hoping that you could answer that question :).

Do you have other use-cases than linux-media in mind?

I think Sergey also wants to experiment with vb2, to figure out how
much it affects it.
His change will be much more complicated than mine though; there are
more corner cases there.

>
> Can you respost a combined series to get started?

Sure. Shall I also include the profiling patch?


Best regards
-- 
Ricardo Ribalda

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] WIP! media: uvcvideo: Use dma_alloc_noncontiguos API
  2020-11-24 12:01       ` Ricardo Ribalda
@ 2020-11-24 13:33         ` Christoph Hellwig
  0 siblings, 0 replies; 28+ messages in thread
From: Christoph Hellwig @ 2020-11-24 13:33 UTC (permalink / raw)
  To: Ricardo Ribalda
  Cc: Christoph Hellwig, Mauro Carvalho Chehab, Marek Szyprowski,
	IOMMU DRIVERS, Joerg Roedel, Robin Murphy,
	Linux Doc Mailing List, Linux Kernel Mailing List,
	Linux Media Mailing List, Tomasz Figa, Sergey Senozhatsky

On Tue, Nov 24, 2020 at 01:01:33PM +0100, Ricardo Ribalda wrote:
> I was hoping that you could answer that question :).
> 
> Do you have other use-cases than linux-media in mind?
> 
> I think Sergey wants to experiment also with vb2, to figure out how
> much it affects it.
> His change will be much more complicated than mine, though; there are
> more corner cases there.

I don't have anything urgent lined up, although I think there are plenty
of other potential use cases.

> > Can you repost a combined series to get started?
> 
> Sure. Shall I also include the profiling patch?

That is in the media code, right?  I don't really care too much.

^ permalink raw reply	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2020-11-24 13:33 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-09-30 16:09 dma_alloc_pages / dma_alloc_noncoherent fixups Christoph Hellwig
2020-09-30 16:09 ` [PATCH 1/8] dma-mapping: remove the {alloc,free}_noncoherent methods Christoph Hellwig
2020-09-30 16:09 ` [PATCH 2/8] dma-mapping: document dma_{alloc,free}_pages Christoph Hellwig
2020-09-30 16:09 ` [PATCH 3/8] dma-direct check for highmem pages in dma_direct_alloc_pages Christoph Hellwig
2020-09-30 16:09 ` [PATCH 4/8] dma-direct: use __GFP_ZERO " Christoph Hellwig
2020-09-30 16:09 ` [PATCH 5/8] dma-direct: factor out a dma_direct_alloc_from_pool helper Christoph Hellwig
2020-09-30 16:09 ` [PATCH 6/8] dma-direct: simplify the DMA_ATTR_NO_KERNEL_MAPPING handling Christoph Hellwig
2020-09-30 16:09 ` [PATCH 7/8] dma-iommu: remove __iommu_dma_mmap Christoph Hellwig
2020-09-30 16:09 ` [PATCH 8/8] WIP: add a dma_alloc_contiguous API Christoph Hellwig
2020-10-02 17:50   ` Tomasz Figa
2020-10-05  8:26     ` Christoph Hellwig
2020-10-06 20:56       ` Tomasz Figa
2020-10-07  6:21         ` Christoph Hellwig
2020-10-07 12:21           ` Tomasz Figa
2020-10-07 12:24             ` Christoph Hellwig
2020-10-14 13:20   ` Tomasz Figa
2020-10-14 15:03     ` David Laight
2020-11-09 14:53     ` Ricardo Ribalda
2020-11-10  9:25       ` Christoph Hellwig
2020-11-10  9:33         ` Ricardo Ribalda
2020-11-10  9:41           ` Christoph Hellwig
2020-11-10  9:50           ` Tomasz Figa
2020-11-10  9:57             ` Christoph Hellwig
2020-11-17 21:21               ` Ricardo Ribalda
2020-11-18 14:25   ` [PATCH] WIP! media: uvcvideo: Use dma_alloc_noncontiguos API Ricardo Ribalda
2020-11-24 11:35     ` Christoph Hellwig
2020-11-24 12:01       ` Ricardo Ribalda
2020-11-24 13:33         ` Christoph Hellwig

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).