* Re: XDP performance regression due to CONFIG_RETPOLINE Spectre V2
@ 2018-04-16 12:27 ` Christoph Hellwig
0 siblings, 0 replies; 20+ messages in thread
From: Christoph Hellwig @ 2018-04-16 12:27 UTC (permalink / raw)
To: Jesper Dangaard Brouer
Cc: xdp-newbies, netdev, Christoph Hellwig, David Woodhouse,
William Tu, Björn Töpel, Karlsson, Magnus,
Alexander Duyck, Arnaldo Carvalho de Melo
Can you try the following hack which avoids indirect calls entirely
for the fast path direct mapping case?
---
From b256a008c1b305e6a1c2afe7c004c54ad2e96d4b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 16 Apr 2018 14:18:14 +0200
Subject: dma-mapping: bypass dma_ops for direct mappings
Reportedly the retpoline mitigation for spectre causes huge penalties
for indirect function calls. This hack bypasses the dma_ops mechanism
for simple direct mappings.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
include/linux/device.h | 1 +
include/linux/dma-mapping.h | 53 +++++++++++++++++++++++++++----------
lib/dma-direct.c | 4 +--
3 files changed, 42 insertions(+), 16 deletions(-)
diff --git a/include/linux/device.h b/include/linux/device.h
index 0059b99e1f25..725eec4c6653 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -990,6 +990,7 @@ struct device {
bool offline_disabled:1;
bool offline:1;
bool of_node_reused:1;
+ bool is_dma_direct:1;
};
static inline struct device *kobj_to_dev(struct kobject *kobj)
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index f8ab1c0f589e..c5d384ae25d6 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -223,6 +223,13 @@ static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
}
#endif
+/* do not use directly! */
+dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
+ unsigned long offset, size_t size, enum dma_data_direction dir,
+ unsigned long attrs);
+int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl,
+ int nents, enum dma_data_direction dir, unsigned long attrs);
+
static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr,
size_t size,
enum dma_data_direction dir,
@@ -232,9 +239,13 @@ static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr,
dma_addr_t addr;
BUG_ON(!valid_dma_direction(dir));
- addr = ops->map_page(dev, virt_to_page(ptr),
- offset_in_page(ptr), size,
- dir, attrs);
+ if (dev->is_dma_direct) {
+ addr = dma_direct_map_page(dev, virt_to_page(ptr),
+ offset_in_page(ptr), size, dir, attrs);
+ } else {
+ addr = ops->map_page(dev, virt_to_page(ptr),
+ offset_in_page(ptr), size, dir, attrs);
+ }
debug_dma_map_page(dev, virt_to_page(ptr),
offset_in_page(ptr), size,
dir, addr, true);
@@ -249,7 +260,7 @@ static inline void dma_unmap_single_attrs(struct device *dev, dma_addr_t addr,
const struct dma_map_ops *ops = get_dma_ops(dev);
BUG_ON(!valid_dma_direction(dir));
- if (ops->unmap_page)
+ if (!dev->is_dma_direct && ops->unmap_page)
ops->unmap_page(dev, addr, size, dir, attrs);
debug_dma_unmap_page(dev, addr, size, dir, true);
}
@@ -266,7 +277,10 @@ static inline int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
int ents;
BUG_ON(!valid_dma_direction(dir));
- ents = ops->map_sg(dev, sg, nents, dir, attrs);
+ if (dev->is_dma_direct)
+ ents = dma_direct_map_sg(dev, sg, nents, dir, attrs);
+ else
+ ents = ops->map_sg(dev, sg, nents, dir, attrs);
BUG_ON(ents < 0);
debug_dma_map_sg(dev, sg, nents, ents, dir);
@@ -281,7 +295,7 @@ static inline void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg
BUG_ON(!valid_dma_direction(dir));
debug_dma_unmap_sg(dev, sg, nents, dir);
- if (ops->unmap_sg)
+ if (!dev->is_dma_direct && ops->unmap_sg)
ops->unmap_sg(dev, sg, nents, dir, attrs);
}
@@ -295,7 +309,10 @@ static inline dma_addr_t dma_map_page_attrs(struct device *dev,
dma_addr_t addr;
BUG_ON(!valid_dma_direction(dir));
- addr = ops->map_page(dev, page, offset, size, dir, attrs);
+ if (dev->is_dma_direct)
+ addr = dma_direct_map_page(dev, page, offset, size, dir, attrs);
+ else
+ addr = ops->map_page(dev, page, offset, size, dir, attrs);
debug_dma_map_page(dev, page, offset, size, dir, addr, false);
return addr;
@@ -309,7 +326,7 @@ static inline void dma_unmap_page_attrs(struct device *dev,
const struct dma_map_ops *ops = get_dma_ops(dev);
BUG_ON(!valid_dma_direction(dir));
- if (ops->unmap_page)
+ if (!dev->is_dma_direct && ops->unmap_page)
ops->unmap_page(dev, addr, size, dir, attrs);
debug_dma_unmap_page(dev, addr, size, dir, false);
}
@@ -356,7 +373,7 @@ static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
const struct dma_map_ops *ops = get_dma_ops(dev);
BUG_ON(!valid_dma_direction(dir));
- if (ops->sync_single_for_cpu)
+ if (!dev->is_dma_direct && ops->sync_single_for_cpu)
ops->sync_single_for_cpu(dev, addr, size, dir);
debug_dma_sync_single_for_cpu(dev, addr, size, dir);
}
@@ -368,7 +385,7 @@ static inline void dma_sync_single_for_device(struct device *dev,
const struct dma_map_ops *ops = get_dma_ops(dev);
BUG_ON(!valid_dma_direction(dir));
- if (ops->sync_single_for_device)
+ if (!dev->is_dma_direct && ops->sync_single_for_device)
ops->sync_single_for_device(dev, addr, size, dir);
debug_dma_sync_single_for_device(dev, addr, size, dir);
}
@@ -382,7 +399,7 @@ static inline void dma_sync_single_range_for_cpu(struct device *dev,
const struct dma_map_ops *ops = get_dma_ops(dev);
BUG_ON(!valid_dma_direction(dir));
- if (ops->sync_single_for_cpu)
+ if (!dev->is_dma_direct && ops->sync_single_for_cpu)
ops->sync_single_for_cpu(dev, addr + offset, size, dir);
debug_dma_sync_single_range_for_cpu(dev, addr, offset, size, dir);
}
@@ -396,7 +413,7 @@ static inline void dma_sync_single_range_for_device(struct device *dev,
const struct dma_map_ops *ops = get_dma_ops(dev);
BUG_ON(!valid_dma_direction(dir));
- if (ops->sync_single_for_device)
+ if (!dev->is_dma_direct && ops->sync_single_for_device)
ops->sync_single_for_device(dev, addr + offset, size, dir);
debug_dma_sync_single_range_for_device(dev, addr, offset, size, dir);
}
@@ -408,7 +425,7 @@ dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
const struct dma_map_ops *ops = get_dma_ops(dev);
BUG_ON(!valid_dma_direction(dir));
- if (ops->sync_sg_for_cpu)
+ if (!dev->is_dma_direct && ops->sync_sg_for_cpu)
ops->sync_sg_for_cpu(dev, sg, nelems, dir);
debug_dma_sync_sg_for_cpu(dev, sg, nelems, dir);
}
@@ -420,7 +437,7 @@ dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
const struct dma_map_ops *ops = get_dma_ops(dev);
BUG_ON(!valid_dma_direction(dir));
- if (ops->sync_sg_for_device)
+ if (!dev->is_dma_direct && ops->sync_sg_for_device)
ops->sync_sg_for_device(dev, sg, nelems, dir);
debug_dma_sync_sg_for_device(dev, sg, nelems, dir);
@@ -600,6 +617,8 @@ static inline int dma_supported(struct device *dev, u64 mask)
return ops->dma_supported(dev, mask);
}
+extern const struct dma_map_ops swiotlb_dma_ops;
+
#ifndef HAVE_ARCH_DMA_SET_MASK
static inline int dma_set_mask(struct device *dev, u64 mask)
{
@@ -609,6 +628,12 @@ static inline int dma_set_mask(struct device *dev, u64 mask)
dma_check_mask(dev, mask);
*dev->dma_mask = mask;
+ if (dev->dma_ops == &dma_direct_ops ||
+ (dev->dma_ops == &swiotlb_dma_ops &&
+ mask == DMA_BIT_MASK(64)))
+ dev->is_dma_direct = true;
+ else
+ dev->is_dma_direct = false;
return 0;
}
#endif
diff --git a/lib/dma-direct.c b/lib/dma-direct.c
index c0bba30fef0a..3deb8666974b 100644
--- a/lib/dma-direct.c
+++ b/lib/dma-direct.c
@@ -120,7 +120,7 @@ void dma_direct_free(struct device *dev, size_t size, void *cpu_addr,
free_pages((unsigned long)cpu_addr, page_order);
}
-static dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
+dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
unsigned long offset, size_t size, enum dma_data_direction dir,
unsigned long attrs)
{
@@ -131,7 +131,7 @@ static dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
return dma_addr;
}
-static int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl,
+int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl,
int nents, enum dma_data_direction dir, unsigned long attrs)
{
int i;
--
2.17.0
^ permalink raw reply related [flat|nested] 20+ messages in thread
* Re: XDP performance regression due to CONFIG_RETPOLINE Spectre V2
2018-04-16 12:27 ` Christoph Hellwig
(?)
@ 2018-04-16 16:04 ` Alexander Duyck
2018-04-17 6:19 ` Christoph Hellwig
-1 siblings, 1 reply; 20+ messages in thread
From: Alexander Duyck @ 2018-04-16 16:04 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Jesper Dangaard Brouer, xdp-newbies, netdev, Christoph Hellwig,
David Woodhouse, William Tu, Björn Töpel, Karlsson,
Magnus, Arnaldo Carvalho de Melo
On Mon, Apr 16, 2018 at 5:27 AM, Christoph Hellwig <hch@infradead.org> wrote:
> Can you try the following hack which avoids indirect calls entirely
> for the fast path direct mapping case?
>
> ---
> From b256a008c1b305e6a1c2afe7c004c54ad2e96d4b Mon Sep 17 00:00:00 2001
> From: Christoph Hellwig <hch@lst.de>
> Date: Mon, 16 Apr 2018 14:18:14 +0200
> Subject: dma-mapping: bypass dma_ops for direct mappings
>
> Reportedly the retpoline mitigation for spectre causes huge penalties
> for indirect function calls. This hack bypasses the dma_ops mechanism
> for simple direct mappings.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> include/linux/device.h | 1 +
> include/linux/dma-mapping.h | 53 +++++++++++++++++++++++++++----------
> lib/dma-direct.c | 4 +--
> 3 files changed, 42 insertions(+), 16 deletions(-)
>
> diff --git a/include/linux/device.h b/include/linux/device.h
> index 0059b99e1f25..725eec4c6653 100644
> --- a/include/linux/device.h
> +++ b/include/linux/device.h
> @@ -990,6 +990,7 @@ struct device {
> bool offline_disabled:1;
> bool offline:1;
> bool of_node_reused:1;
> + bool is_dma_direct:1;
> };
>
> static inline struct device *kobj_to_dev(struct kobject *kobj)
> diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
> index f8ab1c0f589e..c5d384ae25d6 100644
> --- a/include/linux/dma-mapping.h
> +++ b/include/linux/dma-mapping.h
> @@ -223,6 +223,13 @@ static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
> }
> #endif
>
> +/* do not use directly! */
> +dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
> + unsigned long offset, size_t size, enum dma_data_direction dir,
> + unsigned long attrs);
> +int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl,
> + int nents, enum dma_data_direction dir, unsigned long attrs);
> +
> static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr,
> size_t size,
> enum dma_data_direction dir,
> @@ -232,9 +239,13 @@ static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr,
> dma_addr_t addr;
>
> BUG_ON(!valid_dma_direction(dir));
> - addr = ops->map_page(dev, virt_to_page(ptr),
> - offset_in_page(ptr), size,
> - dir, attrs);
> + if (dev->is_dma_direct) {
> + addr = dma_direct_map_page(dev, virt_to_page(ptr),
> + offset_in_page(ptr), size, dir, attrs);
> + } else {
> + addr = ops->map_page(dev, virt_to_page(ptr),
> + offset_in_page(ptr), size, dir, attrs);
> + }
> debug_dma_map_page(dev, virt_to_page(ptr),
> offset_in_page(ptr), size,
> dir, addr, true);
I'm not sure if I am really a fan of trying to solve this in this way.
It seems like this is going to be optimizing the paths for one case at
the detriment of others. Historically mapping and unmapping has always
been expensive, especially in the case of IOMMU enabled environments.
I would much rather see us focus on having swiotlb_dma_ops replaced
with dma_direct_ops in the cases where the device can access all of
physical memory.
> @@ -249,7 +260,7 @@ static inline void dma_unmap_single_attrs(struct device *dev, dma_addr_t addr,
> const struct dma_map_ops *ops = get_dma_ops(dev);
>
> BUG_ON(!valid_dma_direction(dir));
> - if (ops->unmap_page)
> + if (!dev->is_dma_direct && ops->unmap_page)
If I understand correctly this is only needed for the swiotlb case and
not the dma_direct case. It would make much more sense to just
overwrite the dev->dma_ops pointer with dma_direct_ops to address all
of the sync and unmap cases.
> ops->unmap_page(dev, addr, size, dir, attrs);
> debug_dma_unmap_page(dev, addr, size, dir, true);
> }
> @@ -266,7 +277,10 @@ static inline int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
> int ents;
>
> BUG_ON(!valid_dma_direction(dir));
> - ents = ops->map_sg(dev, sg, nents, dir, attrs);
> + if (dev->is_dma_direct)
> + ents = dma_direct_map_sg(dev, sg, nents, dir, attrs);
> + else
> + ents = ops->map_sg(dev, sg, nents, dir, attrs);
> BUG_ON(ents < 0);
> debug_dma_map_sg(dev, sg, nents, ents, dir);
>
> @@ -281,7 +295,7 @@ static inline void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg
>
> BUG_ON(!valid_dma_direction(dir));
> debug_dma_unmap_sg(dev, sg, nents, dir);
> - if (ops->unmap_sg)
> + if (!dev->is_dma_direct && ops->unmap_sg)
> ops->unmap_sg(dev, sg, nents, dir, attrs);
> }
>
> @@ -295,7 +309,10 @@ static inline dma_addr_t dma_map_page_attrs(struct device *dev,
> dma_addr_t addr;
>
> BUG_ON(!valid_dma_direction(dir));
> - addr = ops->map_page(dev, page, offset, size, dir, attrs);
> + if (dev->is_dma_direct)
> + addr = dma_direct_map_page(dev, page, offset, size, dir, attrs);
> + else
> + addr = ops->map_page(dev, page, offset, size, dir, attrs);
> debug_dma_map_page(dev, page, offset, size, dir, addr, false);
>
> return addr;
> @@ -309,7 +326,7 @@ static inline void dma_unmap_page_attrs(struct device *dev,
> const struct dma_map_ops *ops = get_dma_ops(dev);
>
> BUG_ON(!valid_dma_direction(dir));
> - if (ops->unmap_page)
> + if (!dev->is_dma_direct && ops->unmap_page)
> ops->unmap_page(dev, addr, size, dir, attrs);
> debug_dma_unmap_page(dev, addr, size, dir, false);
> }
> @@ -356,7 +373,7 @@ static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
> const struct dma_map_ops *ops = get_dma_ops(dev);
>
> BUG_ON(!valid_dma_direction(dir));
> - if (ops->sync_single_for_cpu)
> + if (!dev->is_dma_direct && ops->sync_single_for_cpu)
> ops->sync_single_for_cpu(dev, addr, size, dir);
> debug_dma_sync_single_for_cpu(dev, addr, size, dir);
> }
> @@ -368,7 +385,7 @@ static inline void dma_sync_single_for_device(struct device *dev,
> const struct dma_map_ops *ops = get_dma_ops(dev);
>
> BUG_ON(!valid_dma_direction(dir));
> - if (ops->sync_single_for_device)
> + if (!dev->is_dma_direct && ops->sync_single_for_device)
> ops->sync_single_for_device(dev, addr, size, dir);
> debug_dma_sync_single_for_device(dev, addr, size, dir);
> }
> @@ -382,7 +399,7 @@ static inline void dma_sync_single_range_for_cpu(struct device *dev,
> const struct dma_map_ops *ops = get_dma_ops(dev);
>
> BUG_ON(!valid_dma_direction(dir));
> - if (ops->sync_single_for_cpu)
> + if (!dev->is_dma_direct && ops->sync_single_for_cpu)
> ops->sync_single_for_cpu(dev, addr + offset, size, dir);
> debug_dma_sync_single_range_for_cpu(dev, addr, offset, size, dir);
> }
> @@ -396,7 +413,7 @@ static inline void dma_sync_single_range_for_device(struct device *dev,
> const struct dma_map_ops *ops = get_dma_ops(dev);
>
> BUG_ON(!valid_dma_direction(dir));
> - if (ops->sync_single_for_device)
> + if (!dev->is_dma_direct && ops->sync_single_for_device)
> ops->sync_single_for_device(dev, addr + offset, size, dir);
> debug_dma_sync_single_range_for_device(dev, addr, offset, size, dir);
> }
> @@ -408,7 +425,7 @@ dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
> const struct dma_map_ops *ops = get_dma_ops(dev);
>
> BUG_ON(!valid_dma_direction(dir));
> - if (ops->sync_sg_for_cpu)
> + if (!dev->is_dma_direct && ops->sync_sg_for_cpu)
> ops->sync_sg_for_cpu(dev, sg, nelems, dir);
> debug_dma_sync_sg_for_cpu(dev, sg, nelems, dir);
> }
> @@ -420,7 +437,7 @@ dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
> const struct dma_map_ops *ops = get_dma_ops(dev);
>
> BUG_ON(!valid_dma_direction(dir));
> - if (ops->sync_sg_for_device)
> + if (!dev->is_dma_direct && ops->sync_sg_for_device)
> ops->sync_sg_for_device(dev, sg, nelems, dir);
> debug_dma_sync_sg_for_device(dev, sg, nelems, dir);
>
> @@ -600,6 +617,8 @@ static inline int dma_supported(struct device *dev, u64 mask)
> return ops->dma_supported(dev, mask);
> }
>
> +extern const struct dma_map_ops swiotlb_dma_ops;
> +
> #ifndef HAVE_ARCH_DMA_SET_MASK
> static inline int dma_set_mask(struct device *dev, u64 mask)
> {
> @@ -609,6 +628,12 @@ static inline int dma_set_mask(struct device *dev, u64 mask)
> dma_check_mask(dev, mask);
>
> *dev->dma_mask = mask;
> + if (dev->dma_ops == &dma_direct_ops ||
> + (dev->dma_ops == &swiotlb_dma_ops &&
> + mask == DMA_BIT_MASK(64)))
> + dev->is_dma_direct = true;
> + else
> + dev->is_dma_direct = false;
So I am not sure this will work on x86. If I am not mistaken I believe
dev->dma_ops is normally not set and instead the default dma
operations are pulled via get_arch_dma_ops which returns the global
dma_ops pointer.
What you may want to consider as an alternative would be to look at
modifying drivers that are using the swiotlb so that you could just
overwrite the dev->dma_ops with the dma_direct_ops in the cases where
the hardware can support accessing all of physical hardware and where
we aren't forcing the use of the bounce buffers in the swiotlb.
Then for the above code you only have to worry about the map calls,
and you could just do a check against the dma_direct_ops pointer
instead of having to add a new flag.
> return 0;
> }
> #endif
> diff --git a/lib/dma-direct.c b/lib/dma-direct.c
> index c0bba30fef0a..3deb8666974b 100644
> --- a/lib/dma-direct.c
> +++ b/lib/dma-direct.c
> @@ -120,7 +120,7 @@ void dma_direct_free(struct device *dev, size_t size, void *cpu_addr,
> free_pages((unsigned long)cpu_addr, page_order);
> }
>
> -static dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
> +dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
> unsigned long offset, size_t size, enum dma_data_direction dir,
> unsigned long attrs)
> {
> @@ -131,7 +131,7 @@ static dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
> return dma_addr;
> }
>
> -static int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl,
> +int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl,
> int nents, enum dma_data_direction dir, unsigned long attrs)
> {
> int i;
> --
> 2.17.0
>
>
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: XDP performance regression due to CONFIG_RETPOLINE Spectre V2
2018-04-16 16:04 ` Alexander Duyck
@ 2018-04-17 6:19 ` Christoph Hellwig
0 siblings, 0 replies; 20+ messages in thread
From: Christoph Hellwig @ 2018-04-17 6:19 UTC (permalink / raw)
To: Alexander Duyck
Cc: Christoph Hellwig, Jesper Dangaard Brouer, xdp-newbies, netdev,
Christoph Hellwig, David Woodhouse, William Tu,
Björn Töpel, Karlsson, Magnus,
Arnaldo Carvalho de Melo
> I'm not sure if I am really a fan of trying to solve this in this way.
> It seems like this is going to be optimizing the paths for one case at
> the detriment of others. Historically mapping and unmapping has always
> been expensive, especially in the case of IOMMU enabled environments.
> I would much rather see us focus on having swiotlb_dma_ops replaced
> with dma_direct_ops in the cases where the device can access all of
> physical memory.
I am definitively not a fan, but IFF indirect calls are such an overhead
it makes sense to avoid it for the common and simple case. And the
direct mapping is a common case present on just about every
architecture, and it is a very simple fast path that just adds an offset
to the physical address. So if we want to speed something up, this is
it.
> > - if (ops->unmap_page)
> > + if (!dev->is_dma_direct && ops->unmap_page)
>
> If I understand correctly this is only needed for the swiotlb case and
> not the dma_direct case. It would make much more sense to just
> overwrite the dev->dma_ops pointer with dma_direct_ops to address all
> of the sync and unmap cases.
Yes.
> > + if (dev->dma_ops == &dma_direct_ops ||
> > + (dev->dma_ops == &swiotlb_dma_ops &&
> > + mask == DMA_BIT_MASK(64)))
> > + dev->is_dma_direct = true;
> > + else
> > + dev->is_dma_direct = false;
>
> So I am not sure this will work on x86. If I am not mistaken I believe
> dev->dma_ops is normally not set and instead the default dma
> operations are pulled via get_arch_dma_ops which returns the global
> dma_ops pointer.
True, for x86 we'd need to check get_arch_dma_ops().
> What you may want to consider as an alternative would be to look at
> modifying drivers that are using the swiotlb so that you could just
> overwrite the dev->dma_ops with the dma_direct_ops in the cases where
> the hardware can support accessing all of physical hardware and where
> we aren't forcing the use of the bounce buffers in the swiotlb.
>
> Then for the above code you only have to worry about the map calls,
> and you could just do a check against the dma_direct_ops pointer
> instead of having to add a new flag.
That would be the long term plan IFF we go down this route. For now I
just wanted a quick hack for performance testing.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: dma-mapping: bypass dma_ops for direct mappings
2018-04-16 12:27 ` Christoph Hellwig
(?)
(?)
@ 2018-04-16 18:05 ` kbuild test robot
2018-04-16 18:26 ` Jesper Dangaard Brouer
-1 siblings, 1 reply; 20+ messages in thread
From: kbuild test robot @ 2018-04-16 18:05 UTC (permalink / raw)
To: Christoph Hellwig
Cc: kbuild-all, Jesper Dangaard Brouer, xdp-newbies, netdev,
Christoph Hellwig, David Woodhouse, William Tu,
Björn Töpel, Karlsson, Magnus, Alexander Duyck,
Arnaldo Carvalho de Melo
[-- Attachment #1: Type: text/plain, Size: 1561 bytes --]
Hi Christoph,
I love your patch! Yet something to improve:
[auto build test ERROR on linus/master]
[also build test ERROR on v4.17-rc1 next-20180416]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
url: https://github.com/0day-ci/linux/commits/Christoph-Hellwig/dma-mapping-bypass-dma_ops-for-direct-mappings/20180416-230032
config: i386-defconfig (attached as .config)
compiler: gcc-7 (Debian 7.3.0-1) 7.3.0
reproduce:
# save the attached .config to linux build tree
make ARCH=i386
All errors (new ones prefixed by >>):
drivers/char/agp/intel-gtt.o: In function `intel_gmch_probe':
>> intel-gtt.c:(.text+0x11e4): undefined reference to `swiotlb_dma_ops'
drivers/ata/ahci.o: In function `ahci_init_one':
>> ahci.c:(.text+0x108d): undefined reference to `swiotlb_dma_ops'
drivers/net/ethernet/broadcom/bnx2.o: In function `bnx2_init_one':
>> bnx2.c:(.text+0x7fe7): undefined reference to `swiotlb_dma_ops'
drivers/net/ethernet/broadcom/tg3.o: In function `tg3_init_one':
>> tg3.c:(.text+0x13549): undefined reference to `swiotlb_dma_ops'
drivers/net/ethernet/intel/e1000/e1000_main.o: In function `e1000_probe':
>> e1000_main.c:(.text+0x49b3): undefined reference to `swiotlb_dma_ops'
drivers/net/ethernet/intel/e1000e/netdev.o:netdev.c:(.text+0xa65e): more undefined references to `swiotlb_dma_ops' follow
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 26269 bytes --]
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: dma-mapping: bypass dma_ops for direct mappings
2018-04-16 18:05 ` dma-mapping: bypass dma_ops for direct mappings kbuild test robot
@ 2018-04-16 18:26 ` Jesper Dangaard Brouer
0 siblings, 0 replies; 20+ messages in thread
From: Jesper Dangaard Brouer @ 2018-04-16 18:26 UTC (permalink / raw)
To: kbuild test robot
Cc: Christoph Hellwig, kbuild-all, netdev, Christoph Hellwig,
David Woodhouse, William Tu, Björn Töpel, Karlsson,
Magnus, Alexander Duyck, Arnaldo Carvalho de Melo, brouer
On Tue, 17 Apr 2018 02:05:12 +0800
kbuild test robot <lkp@intel.com> wrote:
> Hi Christoph,
>
> I love your patch! Yet something to improve:
I was just about to complain about the same compile error ;-)
> [auto build test ERROR on linus/master]
> [also build test ERROR on v4.17-rc1 next-20180416]
> [if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
>
> url: https://github.com/0day-ci/linux/commits/Christoph-Hellwig/dma-mapping-bypass-dma_ops-for-direct-mappings/20180416-230032
> config: i386-defconfig (attached as .config)
> compiler: gcc-7 (Debian 7.3.0-1) 7.3.0
> reproduce:
> # save the attached .config to linux build tree
> make ARCH=i386
>
> All errors (new ones prefixed by >>):
>
> drivers/char/agp/intel-gtt.o: In function `intel_gmch_probe':
> >> intel-gtt.c:(.text+0x11e4): undefined reference to `swiotlb_dma_ops'
> drivers/ata/ahci.o: In function `ahci_init_one':
> >> ahci.c:(.text+0x108d): undefined reference to `swiotlb_dma_ops'
> drivers/net/ethernet/broadcom/bnx2.o: In function `bnx2_init_one':
> >> bnx2.c:(.text+0x7fe7): undefined reference to `swiotlb_dma_ops'
> drivers/net/ethernet/broadcom/tg3.o: In function `tg3_init_one':
> >> tg3.c:(.text+0x13549): undefined reference to `swiotlb_dma_ops'
> drivers/net/ethernet/intel/e1000/e1000_main.o: In function `e1000_probe':
> >> e1000_main.c:(.text+0x49b3): undefined reference to `swiotlb_dma_ops'
> drivers/net/ethernet/intel/e1000e/netdev.o:netdev.c:(.text+0xa65e): more undefined references to `swiotlb_dma_ops' follow
>
> ---
> 0-DAY kernel test infrastructure Open Source Technology Center
> https://lists.01.org/pipermail/kbuild-all Intel Corporation
--
Best regards,
Jesper Dangaard Brouer
MSc.CS, Principal Kernel Engineer at Red Hat
LinkedIn: http://www.linkedin.com/in/brouer
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: dma-mapping: bypass dma_ops for direct mappings
2018-04-16 12:27 ` Christoph Hellwig
` (2 preceding siblings ...)
(?)
@ 2018-04-16 18:31 ` kbuild test robot
-1 siblings, 0 replies; 20+ messages in thread
From: kbuild test robot @ 2018-04-16 18:31 UTC (permalink / raw)
To: Christoph Hellwig
Cc: kbuild-all, Jesper Dangaard Brouer, xdp-newbies, netdev,
Christoph Hellwig, David Woodhouse, William Tu,
Björn Töpel, Karlsson, Magnus, Alexander Duyck,
Arnaldo Carvalho de Melo
[-- Attachment #1: Type: text/plain, Size: 12130 bytes --]
Hi Christoph,
I love your patch! Perhaps something to improve:
[auto build test WARNING on linus/master]
[also build test WARNING on v4.17-rc1 next-20180416]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
url: https://github.com/0day-ci/linux/commits/Christoph-Hellwig/dma-mapping-bypass-dma_ops-for-direct-mappings/20180416-230032
reproduce: make htmldocs
All warnings (new ones prefixed by >>):
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
include/net/mac80211.h:2083: warning: bad line: >
net/mac80211/sta_info.h:586: warning: Function parameter or member 'rx_stats_avg' not described in 'sta_info'
net/mac80211/sta_info.h:586: warning: Function parameter or member 'rx_stats_avg.signal' not described in 'sta_info'
net/mac80211/sta_info.h:586: warning: Function parameter or member 'rx_stats_avg.chain_signal' not described in 'sta_info'
net/mac80211/sta_info.h:586: warning: Function parameter or member 'status_stats.filtered' not described in 'sta_info'
net/mac80211/sta_info.h:586: warning: Function parameter or member 'status_stats.retry_failed' not described in 'sta_info'
net/mac80211/sta_info.h:586: warning: Function parameter or member 'status_stats.retry_count' not described in 'sta_info'
net/mac80211/sta_info.h:586: warning: Function parameter or member 'status_stats.lost_packets' not described in 'sta_info'
net/mac80211/sta_info.h:586: warning: Function parameter or member 'status_stats.last_tdls_pkt_time' not described in 'sta_info'
net/mac80211/sta_info.h:586: warning: Function parameter or member 'status_stats.msdu_retries' not described in 'sta_info'
net/mac80211/sta_info.h:586: warning: Function parameter or member 'status_stats.msdu_failed' not described in 'sta_info'
net/mac80211/sta_info.h:586: warning: Function parameter or member 'status_stats.last_ack' not described in 'sta_info'
net/mac80211/sta_info.h:586: warning: Function parameter or member 'status_stats.last_ack_signal' not described in 'sta_info'
net/mac80211/sta_info.h:586: warning: Function parameter or member 'status_stats.ack_signal_filled' not described in 'sta_info'
net/mac80211/sta_info.h:586: warning: Function parameter or member 'tx_stats.packets' not described in 'sta_info'
net/mac80211/sta_info.h:586: warning: Function parameter or member 'tx_stats.bytes' not described in 'sta_info'
net/mac80211/sta_info.h:586: warning: Function parameter or member 'tx_stats.last_rate' not described in 'sta_info'
net/mac80211/sta_info.h:586: warning: Function parameter or member 'tx_stats.msdu' not described in 'sta_info'
kernel/sched/fair.c:3731: warning: Function parameter or member 'flags' not described in 'attach_entity_load_avg'
include/linux/dma-buf.h:307: warning: Function parameter or member 'cb_excl.cb' not described in 'dma_buf'
include/linux/dma-buf.h:307: warning: Function parameter or member 'cb_excl.poll' not described in 'dma_buf'
include/linux/dma-buf.h:307: warning: Function parameter or member 'cb_excl.active' not described in 'dma_buf'
include/linux/dma-buf.h:307: warning: Function parameter or member 'cb_shared.cb' not described in 'dma_buf'
include/linux/dma-buf.h:307: warning: Function parameter or member 'cb_shared.poll' not described in 'dma_buf'
include/linux/dma-buf.h:307: warning: Function parameter or member 'cb_shared.active' not described in 'dma_buf'
include/linux/dma-fence-array.h:54: warning: Function parameter or member 'work' not described in 'dma_fence_array'
Error: Cannot open file drivers/base/firmware_class.c
WARNING: kernel-doc 'scripts/kernel-doc -rst -enable-lineno -function request_firmware drivers/base/firmware_class.c' failed with return code 1
Error: Cannot open file drivers/base/firmware_class.c
WARNING: kernel-doc 'scripts/kernel-doc -rst -enable-lineno -function request_firmware_direct drivers/base/firmware_class.c' failed with return code 1
Error: Cannot open file drivers/base/firmware_class.c
WARNING: kernel-doc 'scripts/kernel-doc -rst -enable-lineno -function request_firmware_into_buf drivers/base/firmware_class.c' failed with return code 1
Error: Cannot open file drivers/base/firmware_class.c
WARNING: kernel-doc 'scripts/kernel-doc -rst -enable-lineno -function request_firmware_nowait drivers/base/firmware_class.c' failed with return code 1
Error: Cannot open file drivers/base/firmware_class.c
WARNING: kernel-doc 'scripts/kernel-doc -rst -enable-lineno -function firmware_request_cache drivers/base/firmware_class.c' failed with return code 1
include/linux/gpio/driver.h:142: warning: Function parameter or member 'request_key' not described in 'gpio_irq_chip'
include/linux/iio/iio.h:270: warning: Function parameter or member 'scan_type.sign' not described in 'iio_chan_spec'
include/linux/iio/iio.h:270: warning: Function parameter or member 'scan_type.realbits' not described in 'iio_chan_spec'
include/linux/iio/iio.h:270: warning: Function parameter or member 'scan_type.storagebits' not described in 'iio_chan_spec'
include/linux/iio/iio.h:270: warning: Function parameter or member 'scan_type.shift' not described in 'iio_chan_spec'
include/linux/iio/iio.h:270: warning: Function parameter or member 'scan_type.repeat' not described in 'iio_chan_spec'
include/linux/iio/iio.h:270: warning: Function parameter or member 'scan_type.endianness' not described in 'iio_chan_spec'
include/linux/iio/hw-consumer.h:1: warning: no structured comments found
>> include/linux/device.h:995: warning: Function parameter or member 'is_dma_direct' not described in 'device'
Error: Cannot open file drivers/base/firmware_class.c
Error: Cannot open file drivers/base/firmware_class.c
WARNING: kernel-doc 'scripts/kernel-doc -rst -enable-lineno -export drivers/base/firmware_class.c' failed with return code 2
include/linux/input/sparse-keymap.h:46: warning: Function parameter or member 'sw' not described in 'key_entry'
include/linux/mtd/rawnand.h:752: warning: Function parameter or member 'timings.sdr' not described in 'nand_data_interface'
include/linux/mtd/rawnand.h:817: warning: Function parameter or member 'buf' not described in 'nand_op_data_instr'
include/linux/mtd/rawnand.h:817: warning: Function parameter or member 'buf.in' not described in 'nand_op_data_instr'
include/linux/mtd/rawnand.h:817: warning: Function parameter or member 'buf.out' not described in 'nand_op_data_instr'
include/linux/mtd/rawnand.h:863: warning: Function parameter or member 'ctx' not described in 'nand_op_instr'
include/linux/mtd/rawnand.h:863: warning: Function parameter or member 'ctx.cmd' not described in 'nand_op_instr'
include/linux/mtd/rawnand.h:863: warning: Function parameter or member 'ctx.addr' not described in 'nand_op_instr'
include/linux/mtd/rawnand.h:863: warning: Function parameter or member 'ctx.data' not described in 'nand_op_instr'
include/linux/mtd/rawnand.h:863: warning: Function parameter or member 'ctx.waitrdy' not described in 'nand_op_instr'
include/linux/mtd/rawnand.h:1010: warning: Function parameter or member 'ctx' not described in 'nand_op_parser_pattern_elem'
include/linux/mtd/rawnand.h:1010: warning: Function parameter or member 'ctx.addr' not described in 'nand_op_parser_pattern_elem'
include/linux/mtd/rawnand.h:1010: warning: Function parameter or member 'ctx.data' not described in 'nand_op_parser_pattern_elem'
include/linux/mtd/rawnand.h:1313: warning: Function parameter or member 'manufacturer.desc' not described in 'nand_chip'
include/linux/mtd/rawnand.h:1313: warning: Function parameter or member 'manufacturer.priv' not described in 'nand_chip'
include/linux/regulator/driver.h:222: warning: Function parameter or member 'resume_early' not described in 'regulator_ops'
drivers/regulator/core.c:4306: warning: Excess function parameter 'state' description in 'regulator_suspend_late'
arch/s390/include/asm/cio.h:245: warning: Function parameter or member 'esw.esw0' not described in 'irb'
arch/s390/include/asm/cio.h:245: warning: Function parameter or member 'esw.esw1' not described in 'irb'
arch/s390/include/asm/cio.h:245: warning: Function parameter or member 'esw.esw2' not described in 'irb'
arch/s390/include/asm/cio.h:245: warning: Function parameter or member 'esw.esw3' not described in 'irb'
arch/s390/include/asm/cio.h:245: warning: Function parameter or member 'esw.eadm' not described in 'irb'
drivers/usb/typec/mux.c:186: warning: Function parameter or member 'mux' not described in 'typec_mux_unregister'
drivers/usb/typec/mux.c:186: warning: Excess function parameter 'sw' description in 'typec_mux_unregister'
vim +995 include/linux/device.h
^1da177e Linus Torvalds 2005-04-16 @995
:::::: The code at line 995 was first introduced by commit
:::::: 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 Linux-2.6.12-rc2
:::::: TO: Linus Torvalds <torvalds@ppc970.osdl.org>
:::::: CC: Linus Torvalds <torvalds@ppc970.osdl.org>
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 6353 bytes --]
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: XDP performance regression due to CONFIG_RETPOLINE Spectre V2
2018-04-16 12:27 ` Christoph Hellwig
` (3 preceding siblings ...)
(?)
@ 2018-04-16 21:07 ` Jesper Dangaard Brouer
2018-04-17 6:15 ` Christoph Hellwig
-1 siblings, 1 reply; 20+ messages in thread
From: Jesper Dangaard Brouer @ 2018-04-16 21:07 UTC (permalink / raw)
To: Christoph Hellwig
Cc: xdp-newbies, netdev, Christoph Hellwig, David Woodhouse,
William Tu, Björn Töpel, Karlsson, Magnus,
Alexander Duyck, Arnaldo Carvalho de Melo, brouer
On Mon, 16 Apr 2018 05:27:06 -0700
Christoph Hellwig <hch@infradead.org> wrote:
> Can you try the following hack which avoids indirect calls entirely
> for the fast path direct mapping case?
>
> ---
> From b256a008c1b305e6a1c2afe7c004c54ad2e96d4b Mon Sep 17 00:00:00 2001
> From: Christoph Hellwig <hch@lst.de>
> Date: Mon, 16 Apr 2018 14:18:14 +0200
> Subject: dma-mapping: bypass dma_ops for direct mappings
>
> Reportedly the retpoline mitigation for spectre causes huge penalties
> for indirect function calls. This hack bypasses the dma_ops mechanism
> for simple direct mappings.
I did below to get it compiling, and working...
On X86 swiotlb fallback (via get_dma_ops -> get_arch_dma_ops) to use
x86_swiotlb_dma_ops, instead of swiotlb_dma_ops. I also included that
in below fix patch.
Performance improved to 8.9 Mpps from approx 6.5Mpps.
(This was without my bulking for net_device->ndo_xdp_xmit, so that
number should improve more).
---
[PATCH RFC] fixups for Hellwig's DMA avoid retpoline overhead patch
From: Jesper Dangaard Brouer <brouer@redhat.com>
Performance improved to 8.9 Mpps
8917613 pkt/s
it was around 6.5 Mpps before.
---
arch/x86/kernel/pci-swiotlb.c | 3 ++-
drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 1 +
include/linux/dma-mapping.h | 14 +++++++++++++-
lib/Kconfig | 2 +-
lib/Makefile | 1 +
lib/dma-direct.c | 2 ++
lib/swiotlb.c | 1 +
7 files changed, 21 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 0ee0f8f34251..46207e288587 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -48,7 +48,7 @@ void x86_swiotlb_free_coherent(struct device *dev, size_t size,
dma_generic_free_coherent(dev, size, vaddr, dma_addr, attrs);
}
-static const struct dma_map_ops x86_swiotlb_dma_ops = {
+const struct dma_map_ops x86_swiotlb_dma_ops = {
.mapping_error = swiotlb_dma_mapping_error,
.alloc = x86_swiotlb_alloc_coherent,
.free = x86_swiotlb_free_coherent,
@@ -62,6 +62,7 @@ static const struct dma_map_ops x86_swiotlb_dma_ops = {
.unmap_page = swiotlb_unmap_page,
.dma_supported = NULL,
};
+EXPORT_SYMBOL(x86_swiotlb_dma_ops);
/*
* pci_swiotlb_detect_override - set swiotlb to 1 if necessary
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 0daccaf72a30..6d2e3f75febc 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -10297,6 +10297,7 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
return err;
if (!dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64))) {
+ pr_info("XXX %s() dma_set_mask_and_coherent\n", __func__);
pci_using_dac = 1;
} else {
err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index f2fb5aec7626..7fa92664ebfd 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -622,6 +622,7 @@ static inline int dma_supported(struct device *dev, u64 mask)
}
extern const struct dma_map_ops swiotlb_dma_ops;
+extern const struct dma_map_ops x86_swiotlb_dma_ops;
#ifndef HAVE_ARCH_DMA_SET_MASK
static inline int dma_set_mask(struct device *dev, u64 mask)
@@ -632,12 +633,23 @@ static inline int dma_set_mask(struct device *dev, u64 mask)
dma_check_mask(dev, mask);
*dev->dma_mask = mask;
+#ifdef CONFIG_DMA_DIRECT_OPS
if (dev->dma_ops == &dma_direct_ops ||
+# ifdef CONFIG_SWIOTLB
(dev->dma_ops == &swiotlb_dma_ops &&
- mask == DMA_BIT_MASK(64)))
+ mask == DMA_BIT_MASK(64)) ||
+# ifdef CONFIG_X86
+ (get_dma_ops(dev) == &x86_swiotlb_dma_ops &&
+ mask == DMA_BIT_MASK(64))
+# endif /* CONFIG_X86 */
+# endif /* CONFIG_SWIOTLB */
+ )
dev->is_dma_direct = true;
else
+#endif /* CONFIG_DMA_DIRECT_OPS */
dev->is_dma_direct = false;
+
+ pr_info("XXX: %s() DMA is direct: %d\n", __func__, dev->is_dma_direct);
return 0;
}
#endif
diff --git a/lib/Kconfig b/lib/Kconfig
index e96089499371..6eba2bcf468a 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -416,7 +416,7 @@ config SGL_ALLOC
config DMA_DIRECT_OPS
bool
depends on HAS_DMA && (!64BIT || ARCH_DMA_ADDR_T_64BIT)
- default n
+ default y
config DMA_VIRT_OPS
bool
diff --git a/lib/Makefile b/lib/Makefile
index a90d4fcd748f..df4885eabf9c 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -29,6 +29,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
lib-$(CONFIG_MMU) += ioremap.o
lib-$(CONFIG_SMP) += cpumask.o
lib-$(CONFIG_DMA_DIRECT_OPS) += dma-direct.o
+#lib-y += dma-direct.o
lib-$(CONFIG_DMA_VIRT_OPS) += dma-virt.o
lib-y += kobject.o klist.o
diff --git a/lib/dma-direct.c b/lib/dma-direct.c
index ea69f8777e7f..d945efea3dae 100644
--- a/lib/dma-direct.c
+++ b/lib/dma-direct.c
@@ -107,6 +107,7 @@ dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
return DIRECT_MAPPING_ERROR;
return dma_addr;
}
+EXPORT_SYMBOL(dma_direct_map_page);
int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl,
int nents, enum dma_data_direction dir, unsigned long attrs)
@@ -125,6 +126,7 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl,
return nents;
}
+EXPORT_SYMBOL(dma_direct_map_sg);
int dma_direct_supported(struct device *dev, u64 mask)
{
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index c43ec2271469..ecb70f5e95ba 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -1132,4 +1132,5 @@ const struct dma_map_ops swiotlb_dma_ops = {
.unmap_page = swiotlb_unmap_page,
.dma_supported = swiotlb_dma_supported,
};
+EXPORT_SYMBOL(swiotlb_dma_ops);
#endif /* CONFIG_DMA_DIRECT_OPS */
^ permalink raw reply related [flat|nested] 20+ messages in thread
* Re: XDP performance regression due to CONFIG_RETPOLINE Spectre V2
2018-04-16 21:07 ` XDP performance regression due to CONFIG_RETPOLINE Spectre V2 Jesper Dangaard Brouer
@ 2018-04-17 6:15 ` Christoph Hellwig
2018-04-17 7:07 ` Jesper Dangaard Brouer
0 siblings, 1 reply; 20+ messages in thread
From: Christoph Hellwig @ 2018-04-17 6:15 UTC (permalink / raw)
To: Jesper Dangaard Brouer
Cc: Christoph Hellwig, xdp-newbies, netdev, Christoph Hellwig,
David Woodhouse, William Tu, Björn Töpel, Karlsson,
Magnus, Alexander Duyck, Arnaldo Carvalho de Melo
On Mon, Apr 16, 2018 at 11:07:04PM +0200, Jesper Dangaard Brouer wrote:
> On X86 swiotlb fallback (via get_dma_ops -> get_arch_dma_ops) to use
> x86_swiotlb_dma_ops, instead of swiotlb_dma_ops. I also included that
> in below fix patch.
x86_swiotlb_dma_ops should not exist any mor, and x86 now uses
dma_direct_ops. Looks like you are applying it to an old kernel :)
> Performance improved to 8.9 Mpps from approx 6.5Mpps.
>
> (This was without my bulking for net_device->ndo_xdp_xmit, so that
> number should improve more).
What is the number for the otherwise comparable setup without repolines?
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: XDP performance regression due to CONFIG_RETPOLINE Spectre V2
2018-04-17 6:15 ` Christoph Hellwig
@ 2018-04-17 7:07 ` Jesper Dangaard Brouer
2018-04-17 7:13 ` Christoph Hellwig
0 siblings, 1 reply; 20+ messages in thread
From: Jesper Dangaard Brouer @ 2018-04-17 7:07 UTC (permalink / raw)
To: Christoph Hellwig
Cc: xdp-newbies, netdev, Christoph Hellwig, David Woodhouse,
William Tu, Björn Töpel, Karlsson, Magnus,
Alexander Duyck, Arnaldo Carvalho de Melo, brouer
On Mon, 16 Apr 2018 23:15:50 -0700
Christoph Hellwig <hch@infradead.org> wrote:
> On Mon, Apr 16, 2018 at 11:07:04PM +0200, Jesper Dangaard Brouer wrote:
> > On X86 swiotlb fallback (via get_dma_ops -> get_arch_dma_ops) to use
> > x86_swiotlb_dma_ops, instead of swiotlb_dma_ops. I also included that
> > in below fix patch.
>
> x86_swiotlb_dma_ops should not exist any mor, and x86 now uses
> dma_direct_ops. Looks like you are applying it to an old kernel :)
>
> > Performance improved to 8.9 Mpps from approx 6.5Mpps.
> >
> > (This was without my bulking for net_device->ndo_xdp_xmit, so that
> > number should improve more).
>
> What is the number for the otherwise comparable setup without repolines?
Approx 12 Mpps.
You forgot to handle the dma_direct_mapping_error() case, which still
used the retpoline in the above 8.9 Mpps measurement, I fixed it up and
performance increased to 9.6 Mpps.
Notice, in this test there are still two retpoline/indirect-calls
left. The net_device->ndo_xdp_xmit and the invocation of the XDP BPF
prog.
--
Best regards,
Jesper Dangaard Brouer
MSc.CS, Principal Kernel Engineer at Red Hat
LinkedIn: http://www.linkedin.com/in/brouer
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: XDP performance regression due to CONFIG_RETPOLINE Spectre V2
2018-04-17 7:07 ` Jesper Dangaard Brouer
@ 2018-04-17 7:13 ` Christoph Hellwig
0 siblings, 0 replies; 20+ messages in thread
From: Christoph Hellwig @ 2018-04-17 7:13 UTC (permalink / raw)
To: Jesper Dangaard Brouer
Cc: Christoph Hellwig, xdp-newbies, netdev, Christoph Hellwig,
David Woodhouse, William Tu, Björn Töpel, Karlsson,
Magnus, Alexander Duyck, Arnaldo Carvalho de Melo
On Tue, Apr 17, 2018 at 09:07:01AM +0200, Jesper Dangaard Brouer wrote:
> > > number should improve more).
> >
> > What is the number for the otherwise comparable setup without repolines?
>
> Approx 12 Mpps.
>
> You forgot to handle the dma_direct_mapping_error() case, which still
> used the retpoline in the above 8.9 Mpps measurement, I fixed it up and
> performance increased to 9.6 Mpps.
>
> Notice, in this test there are still two retpoline/indirect-calls
> left. The net_device->ndo_xdp_xmit and the invocation of the XDP BPF
> prog.
But that seems like a pretty clear indicator that we want the fast path
direct mapping. I'll try to find some time over the next weeks to
do a cleaner version of it.
^ permalink raw reply [flat|nested] 20+ messages in thread