* [PATCH] arm64: optimize memcpy_{from,to}io() and memset_io()
From: Joonwoo Park @ 2014-07-30 6:28 UTC (permalink / raw)
To: linux-arm-kernel
Optimize memcpy_{from,to}io() and memset_io() by transferring in 64-bit
chunks as much as possible, with minimal barrier usage. This simple
optimization brings faster throughput compared to the current byte-by-byte
read and write with a barrier in the loop. The code's skeleton is taken
from the powerpc implementation.
Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
Acked-by: Trilok Soni <tsoni@codeaurora.org>
---
arch/arm64/kernel/io.c | 72 +++++++++++++++++++++++++++++++++++++++++++-------
1 file changed, 62 insertions(+), 10 deletions(-)
diff --git a/arch/arm64/kernel/io.c b/arch/arm64/kernel/io.c
index 7d37ead..c0e3ab1 100644
--- a/arch/arm64/kernel/io.c
+++ b/arch/arm64/kernel/io.c
@@ -20,18 +20,34 @@
#include <linux/types.h>
#include <linux/io.h>
+#define IO_CHECK_ALIGN(v, a) ((((unsigned long)(v)) & ((a) - 1)) == 0)
+
/*
* Copy data from IO memory space to "real" memory space.
*/
void __memcpy_fromio(void *to, const volatile void __iomem *from, size_t count)
{
- unsigned char *t = to;
- while (count) {
+ while (count && (!IO_CHECK_ALIGN(from, 8) || !IO_CHECK_ALIGN(to, 8))) {
+ *(u8 *)to = readb_relaxed(from);
+ from++;
+ to++;
count--;
- *t = readb(from);
- t++;
+ }
+
+ while (count >= 8) {
+ *(u64 *)to = readq_relaxed(from);
+ from += 8;
+ to += 8;
+ count -= 8;
+ }
+
+ while (count) {
+ *(u8 *)to = readb_relaxed(from);
from++;
+ to++;
+ count--;
}
+ __iormb();
}
EXPORT_SYMBOL(__memcpy_fromio);
@@ -40,12 +56,28 @@ EXPORT_SYMBOL(__memcpy_fromio);
*/
void __memcpy_toio(volatile void __iomem *to, const void *from, size_t count)
{
- const unsigned char *f = from;
+ void *p = (void __force *)from;
+
+ __iowmb();
+ while (count && (!IO_CHECK_ALIGN(p, 8) || !IO_CHECK_ALIGN(from, 8))) {
+ writeb_relaxed(*(volatile u8 *)from, p);
+ from++;
+ p++;
+ count--;
+ }
+
+ while (count >= 8) {
+ writeq_relaxed(*(volatile u64 *)from, p);
+ from += 8;
+ p += 8;
+ count -= 8;
+ }
+
while (count) {
+ writeb_relaxed(*(volatile u8 *)from, p);
+ from++;
+ p++;
count--;
- writeb(*f, to);
- f++;
- to++;
}
}
EXPORT_SYMBOL(__memcpy_toio);
@@ -55,10 +87,30 @@ EXPORT_SYMBOL(__memcpy_toio);
*/
void __memset_io(volatile void __iomem *dst, int c, size_t count)
{
+ void *p = (void __force *)dst;
+ u64 qc = c;
+
+ qc |= qc << 8;
+ qc |= qc << 16;
+ qc |= qc << 32;
+
+ __iowmb();
+ while (count && !IO_CHECK_ALIGN(p, 8)) {
+ writeb_relaxed(c, p);
+ p++;
+ count--;
+ }
+
+ while (count >= 8) {
+ writeq_relaxed(c, p);
+ p += 8;
+ count -= 8;
+ }
+
while (count) {
+ writeb_relaxed(c, p);
+ p++;
count--;
- writeb(c, dst);
- dst++;
}
}
EXPORT_SYMBOL(__memset_io);
--
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
hosted by The Linux Foundation
* [PATCH] arm64: optimize memcpy_{from,to}io() and memset_io()
From: Joonwoo Park @ 2014-08-01 6:30 UTC (permalink / raw)
To: linux-arm-kernel
+ Catalin, Will
Thanks,
Joonwoo
On Tue, Jul 29, 2014 at 11:28:26PM -0700, Joonwoo Park wrote:
> Optimize memcpy_{from,to}io() and memset_io() by transferring in 64-bit
> chunks as much as possible, with minimal barrier usage. This simple
> optimization brings faster throughput compared to the current byte-by-byte
> read and write with a barrier in the loop. The code's skeleton is taken
> from the powerpc implementation.
>
> Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
> Acked-by: Trilok Soni <tsoni@codeaurora.org>
> ---
> arch/arm64/kernel/io.c | 72 +++++++++++++++++++++++++++++++++++++++++++-------
> 1 file changed, 62 insertions(+), 10 deletions(-)
>
> diff --git a/arch/arm64/kernel/io.c b/arch/arm64/kernel/io.c
> index 7d37ead..c0e3ab1 100644
> --- a/arch/arm64/kernel/io.c
> +++ b/arch/arm64/kernel/io.c
> @@ -20,18 +20,34 @@
> #include <linux/types.h>
> #include <linux/io.h>
>
> +#define IO_CHECK_ALIGN(v, a) ((((unsigned long)(v)) & ((a) - 1)) == 0)
> +
> /*
> * Copy data from IO memory space to "real" memory space.
> */
> void __memcpy_fromio(void *to, const volatile void __iomem *from, size_t count)
> {
> - unsigned char *t = to;
> - while (count) {
> + while (count && (!IO_CHECK_ALIGN(from, 8) || !IO_CHECK_ALIGN(to, 8))) {
> + *(u8 *)to = readb_relaxed(from);
> + from++;
> + to++;
> count--;
> - *t = readb(from);
> - t++;
> + }
> +
> + while (count >= 8) {
> + *(u64 *)to = readq_relaxed(from);
> + from += 8;
> + to += 8;
> + count -= 8;
> + }
> +
> + while (count) {
> + *(u8 *)to = readb_relaxed(from);
> from++;
> + to++;
> + count--;
> }
> + __iormb();
> }
> EXPORT_SYMBOL(__memcpy_fromio);
>
> @@ -40,12 +56,28 @@ EXPORT_SYMBOL(__memcpy_fromio);
> */
> void __memcpy_toio(volatile void __iomem *to, const void *from, size_t count)
> {
> - const unsigned char *f = from;
> + void *p = (void __force *)from;
> +
> + __iowmb();
> + while (count && (!IO_CHECK_ALIGN(p, 8) || !IO_CHECK_ALIGN(from, 8))) {
> + writeb_relaxed(*(volatile u8 *)from, p);
> + from++;
> + p++;
> + count--;
> + }
> +
> + while (count >= 8) {
> + writeq_relaxed(*(volatile u64 *)from, p);
> + from += 8;
> + p += 8;
> + count -= 8;
> + }
> +
> while (count) {
> + writeb_relaxed(*(volatile u8 *)from, p);
> + from++;
> + p++;
> count--;
> - writeb(*f, to);
> - f++;
> - to++;
> }
> }
> EXPORT_SYMBOL(__memcpy_toio);
> @@ -55,10 +87,30 @@ EXPORT_SYMBOL(__memcpy_toio);
> */
> void __memset_io(volatile void __iomem *dst, int c, size_t count)
> {
> + void *p = (void __force *)dst;
> + u64 qc = c;
> +
> + qc |= qc << 8;
> + qc |= qc << 16;
> + qc |= qc << 32;
> +
> + __iowmb();
> + while (count && !IO_CHECK_ALIGN(p, 8)) {
> + writeb_relaxed(c, p);
> + p++;
> + count--;
> + }
> +
> + while (count >= 8) {
> + writeq_relaxed(c, p);
> + p += 8;
> + count -= 8;
> + }
> +
> while (count) {
> + writeb_relaxed(c, p);
> + p++;
> count--;
> - writeb(c, dst);
> - dst++;
> }
> }
> EXPORT_SYMBOL(__memset_io);
> --
> The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
> hosted by The Linux Foundation
>
--
Sent by an employee of the Qualcomm Innovation Center, Inc.
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum.
* [PATCH] arm64: optimize memcpy_{from,to}io() and memset_io()
From: Will Deacon @ 2014-08-01 8:32 UTC (permalink / raw)
To: linux-arm-kernel
On Fri, Aug 01, 2014 at 07:30:09AM +0100, Joonwoo Park wrote:
> + Catalin, Will
>
> Thanks,
> Joonwoo
> On Tue, Jul 29, 2014 at 11:28:26PM -0700, Joonwoo Park wrote:
> > Optimize memcpy_{from,to}io() and memset_io() by transferring in 64-bit
> > chunks as much as possible, with minimal barrier usage. This simple
> > optimization brings faster throughput compared to the current byte-by-byte
> > read and write with a barrier in the loop. The code's skeleton is taken
> > from the powerpc implementation.
Hmm, I've never really understood the use-case for memcpy_{to,from}io on
ARM, so getting to the bottom of that would help in reviewing this patch.
Can you point me at the drivers which are using this for ARM please? Doing a
blind byte-by-byte copy could easily cause problems with some peripherals,
so there must be an underlying assumption somewhere about how this is
supposed to work.
Will
* [PATCH] arm64: optimize memcpy_{from,to}io() and memset_io()
From: Joonwoo Park @ 2014-08-02 1:38 UTC (permalink / raw)
To: linux-arm-kernel
On Fri, Aug 01, 2014 at 09:32:46AM +0100, Will Deacon wrote:
> On Fri, Aug 01, 2014 at 07:30:09AM +0100, Joonwoo Park wrote:
> > + Catalin, Will
> >
> > Thanks,
> > Joonwoo
> > On Tue, Jul 29, 2014 at 11:28:26PM -0700, Joonwoo Park wrote:
> > > Optimize memcpy_{from,to}io() and memset_io() by transferring in 64-bit
> > > chunks as much as possible, with minimal barrier usage. This simple
> > > optimization brings faster throughput compared to the current byte-by-byte
> > > read and write with a barrier in the loop. The code's skeleton is taken
> > > from the powerpc implementation.
>
> Hmm, I've never really understood the use-case for memcpy_{to,from}io on
> ARM, so getting to the bottom of that would help in reviewing this patch.
>
> Can you point me at the drivers which are using this for ARM please? Doing a
Sure. This peripheral-loader.c driver has now moved under drivers/soc/ so it can be used for both ARM and ARM64.
https://android.googlesource.com/kernel/msm.git/+/db34f44bcba24345d26b8a4b8137cf94d70afa73/arch/arm/mach-msm/peripheral-loader.c
static int load_segment(const struct elf32_phdr *phdr, unsigned num, struct pil_device *pil)
{
<snip>
while (count > 0) {
int size;
u8 __iomem *buf;
size = min_t(size_t, IOMAP_SIZE, count);
buf = ioremap(paddr, size);
}
<snip>
memset(buf, 0, size);
<snip>
}
As you can see, the function load_segment() does ioremap() followed by memset() and memcpy(), which can cause unaligned multi-byte writes to device memory (maybe ARM64 traps 8-byte unaligned accesses?).
Because of this I was fixing the driver to use memset_io() and memcpy_toio(), but the existing implementations were too slow compared to the one I'm proposing.
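Roughly, the driver-side change I have in mind looks like the sketch below. It is based only on the snippet above; the real load_segment() also copies the segment data and does error handling, so treat the details as illustrative:

	while (count > 0) {
		size_t size = min_t(size_t, IOMAP_SIZE, count);
		u8 __iomem *buf = ioremap(paddr, size);

		if (!buf)
			return -ENOMEM;

		/*
		 * was memset(buf, 0, size): plain memset() on an
		 * ioremap()ed mapping may issue multi-byte, possibly
		 * unaligned stores to device memory
		 */
		memset_io(buf, 0, size);

		iounmap(buf);
		paddr += size;
		count -= size;
	}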
> blind byte-by-byte copy could easily cause problems with some peripherals,
> so there must be an underlying assumption somewhere about how this is
> supposed to work.
Would you mind explaining more about the problem with byte-by-byte copying that you're worried about?
I thought a byte-by-byte copy was always safe with regard to alignment, and that this is why the existing implementation copies byte by byte.
I can imagine there are some peripherals that don't allow per-byte access. But if that is the case, shouldn't they simply not be using memset_io() and memcpy_{from,to}io() in the first place?
Also I found this. http://lists.infradead.org/pipermail/linux-arm-kernel/2014-February/234729.html
Looks like Catalin also had a similar idea to mine.
Thanks,
Joonwoo
>
> Will
--
The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
hosted by The Linux Foundation
* [PATCH] arm64: optimize memcpy_{from,to}io() and memset_io()
From: Will Deacon @ 2014-08-04 9:57 UTC (permalink / raw)
To: linux-arm-kernel
On Sat, Aug 02, 2014 at 02:38:34AM +0100, Joonwoo Park wrote:
> On Fri, Aug 01, 2014 at 09:32:46AM +0100, Will Deacon wrote:
> > On Fri, Aug 01, 2014 at 07:30:09AM +0100, Joonwoo Park wrote:
> > > On Tue, Jul 29, 2014 at 11:28:26PM -0700, Joonwoo Park wrote:
> > > > Optimize memcpy_{from,to}io() and memset_io() by transferring in 64-bit
> > > > chunks as much as possible, with minimal barrier usage. This simple
> > > > optimization brings faster throughput compared to the current byte-by-byte
> > > > read and write with a barrier in the loop. The code's skeleton is taken
> > > > from the powerpc implementation.
> >
> > Hmm, I've never really understood the use-case for memcpy_{to,from}io on
> > ARM, so getting to the bottom of that would help in reviewing this patch.
> >
> > Can you point me at the drivers which are using this for ARM please? Doing a
> Sure. This peripheral-loader.c driver has now moved under drivers/soc/ so it
> can be used for both ARM and ARM64.
> https://android.googlesource.com/kernel/msm.git/+/db34f44bcba24345d26b8a4b8137cf94d70afa73/arch/arm/mach-msm/peripheral-loader.c
> static int load_segment(const struct elf32_phdr *phdr, unsigned num, struct pil_device *pil)
> {
> <snip>
> while (count > 0) {
> int size;
> u8 __iomem *buf;
> size = min_t(size_t, IOMAP_SIZE, count);
> buf = ioremap(paddr, size);
> }
> <snip>
> memset(buf, 0, size);
> <snip>
Right, but that code doesn't exist in mainline afaict.
> As you can see, the function load_segment() does ioremap() followed by
> memset() and memcpy(), which can cause unaligned multi-byte writes to
> device memory (maybe ARM64 traps 8-byte unaligned accesses?).
> Because of this I was fixing the driver to use memset_io() and memcpy_toio(),
> but the existing implementations were too slow compared to the one I'm
> proposing.
>
> > blind byte-by-byte copy could easily cause problems with some peripherals,
> > so there must be an underlying assumption somewhere about how this is
> > supposed to work.
> Would you mind explaining more about the problem with byte-by-byte copying
> that you're worried about?
> I thought a byte-by-byte copy was always safe with regard to alignment, and
> that this is why the existing implementation copies byte by byte.
> I can imagine there are some peripherals that don't allow per-byte access.
> But if that is the case, shouldn't they simply not be using memset_io() and
> memcpy_{from,to}io() in the first place?
Yes, if somebody tried to use memset_io to zero a bunch of control
registers, for example, you'd likely get a bunch of aborts because the
endpoint would give you a SLVERR for a byte-access to a word register.
It just seems like the expected usage of this function should be documented
somewhere to avoid it becoming highly dependent on the architecture.
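For instance, something along these lines (hypothetical wording, not something the tree says today):

	/*
	 * memcpy_fromio(), memcpy_toio(), memset_io():
	 *
	 * No particular access size is guaranteed; an implementation may
	 * mix byte and wider accesses depending on the alignment and
	 * length of the buffer.  Do not use these on registers that
	 * require fixed-width accesses; use readl()/writel() and friends
	 * for those.
	 */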
Will
* [PATCH] arm64: optimize memcpy_{from,to}io() and memset_io()
From: Catalin Marinas @ 2014-10-03 16:31 UTC (permalink / raw)
To: linux-arm-kernel
On Tue, Jul 29, 2014 at 11:28:26PM -0700, Joonwoo Park wrote:
> Optimize memcpy_{from,to}io() and memset_io() by transferring in 64-bit
> chunks as much as possible, with minimal barrier usage. This simple
> optimization brings faster throughput compared to the current byte-by-byte
> read and write with a barrier in the loop. The code's skeleton is taken
> from the powerpc implementation.
>
> Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
> Acked-by: Trilok Soni <tsoni@codeaurora.org>
I thought about merging this but there are some issues. Comments below.
> diff --git a/arch/arm64/kernel/io.c b/arch/arm64/kernel/io.c
> index 7d37ead..c0e3ab1 100644
> --- a/arch/arm64/kernel/io.c
> +++ b/arch/arm64/kernel/io.c
> @@ -20,18 +20,34 @@
> #include <linux/types.h>
> #include <linux/io.h>
>
> +#define IO_CHECK_ALIGN(v, a) ((((unsigned long)(v)) & ((a) - 1)) == 0)
Can you not use just IS_ALIGNED?
> /*
> * Copy data from IO memory space to "real" memory space.
> */
> void __memcpy_fromio(void *to, const volatile void __iomem *from, size_t count)
> {
> - unsigned char *t = to;
> - while (count) {
> + while (count && (!IO_CHECK_ALIGN(from, 8) || !IO_CHECK_ALIGN(to, 8))) {
> + *(u8 *)to = readb_relaxed(from);
We should not use the relaxed accessors here as we don't really care
about endianness conversion. We just copy data from one place to another
without interpreting it, so __raw_read*() is sufficient.
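For reference, the relaxed accessor is roughly (not quoting the exact arm64 definition):

	#define readq_relaxed(c)	le64_to_cpu((__force __le64)__raw_readq(c))

so on a big-endian kernel it would byte-swap each 64-bit chunk being copied, whereas __raw_readq() followed by a plain store preserves the byte image.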
> + from++;
> + to++;
> count--;
> - *t = readb(from);
> - t++;
> + }
> +
> + while (count >= 8) {
> + *(u64 *)to = readq_relaxed(from);
> + from += 8;
> + to += 8;
> + count -= 8;
> + }
> +
> + while (count) {
> + *(u8 *)to = readb_relaxed(from);
> from++;
> + to++;
> + count--;
> }
> + __iormb();
We don't need this barrier here. In the readl() implementation, its use
is for ordering between I/O polling and DMA buffer access.
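For example, the pattern that barrier exists for looks roughly like this (hypothetical driver names):

	/* poll until the device says the DMA into normal memory is done */
	while (!(readl(dev->regs + DMA_STATUS) & DMA_DONE))
		cpu_relax();

	/* readl()'s trailing read barrier keeps these loads from being
	 * speculated ahead of the status check */
	memcpy(host_buf, dev->dma_buf, len);

A plain memcpy_fromio() has no such relationship between an I/O poll and a separate data buffer, so it does not need the barrier.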
> }
> EXPORT_SYMBOL(__memcpy_fromio);
>
> @@ -40,12 +56,28 @@ EXPORT_SYMBOL(__memcpy_fromio);
> */
> void __memcpy_toio(volatile void __iomem *to, const void *from, size_t count)
> {
> - const unsigned char *f = from;
> + void *p = (void __force *)from;
Why do you need this?
> +
> + __iowmb();
Not needed here either.
> + while (count && (!IO_CHECK_ALIGN(p, 8) || !IO_CHECK_ALIGN(from, 8))) {
> + writeb_relaxed(*(volatile u8 *)from, p);
Oh, so you copy from "from" to "from". Have you really tested this?
> @@ -55,10 +87,30 @@ EXPORT_SYMBOL(__memcpy_toio);
> */
> void __memset_io(volatile void __iomem *dst, int c, size_t count)
> {
> + void *p = (void __force *)dst;
Do you need this?
> + u64 qc = c;
> +
> + qc |= qc << 8;
> + qc |= qc << 16;
> + qc |= qc << 32;
> +
> + __iowmb();
What's this for?
> + while (count && !IO_CHECK_ALIGN(p, 8)) {
> + writeb_relaxed(c, p);
Using dst directly here should work (__raw_writeb takes the same
type as the second argument).
--
Catalin
* [PATCH] arm64: optimize memcpy_{from,to}io() and memset_io()
From: Joonwoo Park @ 2014-10-09 2:39 UTC (permalink / raw)
To: linux-arm-kernel
On Fri, Oct 03, 2014 at 05:31:42PM +0100, Catalin Marinas wrote:
> On Tue, Jul 29, 2014 at 11:28:26PM -0700, Joonwoo Park wrote:
> > Optimize memcpy_{from,to}io() and memset_io() by transferring in 64-bit
> > chunks as much as possible, with minimal barrier usage. This simple
> > optimization brings faster throughput compared to the current byte-by-byte
> > read and write with a barrier in the loop. The code's skeleton is taken
> > from the powerpc implementation.
> >
> > Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
> > Acked-by: Trilok Soni <tsoni@codeaurora.org>
>
> I thought about merging this but there are some issues. Comments below.
Thanks for reviewing.
>
> > diff --git a/arch/arm64/kernel/io.c b/arch/arm64/kernel/io.c
> > index 7d37ead..c0e3ab1 100644
> > --- a/arch/arm64/kernel/io.c
> > +++ b/arch/arm64/kernel/io.c
> > @@ -20,18 +20,34 @@
> > #include <linux/types.h>
> > #include <linux/io.h>
> >
> > +#define IO_CHECK_ALIGN(v, a) ((((unsigned long)(v)) & ((a) - 1)) == 0)
>
> Can you not use just IS_ALIGNED?
>
Will do. I would need to cast from/to to unsigned long.
> > /*
> > * Copy data from IO memory space to "real" memory space.
> > */
> > void __memcpy_fromio(void *to, const volatile void __iomem *from, size_t count)
> > {
> > - unsigned char *t = to;
> > - while (count) {
> > + while (count && (!IO_CHECK_ALIGN(from, 8) || !IO_CHECK_ALIGN(to, 8))) {
> > + *(u8 *)to = readb_relaxed(from);
>
> We should not use the relaxed accessors here as we don't really care
> about endianness conversion. We just copy data from one place to another
> without interpreting it, so __raw_read*() is sufficient.
>
Will do.
> > + from++;
> > + to++;
> > count--;
> > - *t = readb(from);
> > - t++;
> > + }
> > +
> > + while (count >= 8) {
> > + *(u64 *)to = readq_relaxed(from);
> > + from += 8;
> > + to += 8;
> > + count -= 8;
> > + }
> > +
> > + while (count) {
> > + *(u8 *)to = readb_relaxed(from);
> > from++;
> > + to++;
> > + count--;
> > }
> > + __iormb();
>
> We don't need this barrier here. In the readl() implementation, its use
> is for ordering between I/O polling and DMA buffer access.
>
The barriers here and below are for accessing different devices in a row.
I thought that was your suggestion too:
http://lists.infradead.org/pipermail/linux-arm-kernel/2012-September/123178.html
> > }
> > EXPORT_SYMBOL(__memcpy_fromio);
> >
> > @@ -40,12 +56,28 @@ EXPORT_SYMBOL(__memcpy_fromio);
> > */
> > void __memcpy_toio(volatile void __iomem *to, const void *from, size_t count)
> > {
> > - const unsigned char *f = from;
> > + void *p = (void __force *)from;
>
> Why do you need this?
>
Will drop this.
> > +
> > + __iowmb();
>
> Not needed here either.
>
> > + while (count && (!IO_CHECK_ALIGN(p, 8) || !IO_CHECK_ALIGN(from, 8))) {
> > + writeb_relaxed(*(volatile u8 *)from, p);
>
> Oh, so you copy from "from" to "from". Have you really tested this?
>
I also found this issue with more testing after sending this patch. Will fix.
> > @@ -55,10 +87,30 @@ EXPORT_SYMBOL(__memcpy_toio);
> > */
> > void __memset_io(volatile void __iomem *dst, int c, size_t count)
> > {
> > + void *p = (void __force *)dst;
>
> Do you need this?
>
Will drop this.
> > + u64 qc = c;
> > +
> > + qc |= qc << 8;
> > + qc |= qc << 16;
> > + qc |= qc << 32;
> > +
> > + __iowmb();
>
> What's this for?
>
This barrier was there for the same reason as the one above: without it, writes to different devices are not guaranteed to be ordered.
> > + while (count && !IO_CHECK_ALIGN(p, 8)) {
> > + writeb_relaxed(c, p);
>
> > Using dst directly here should work (__raw_writeb takes the same
> type as the second argument).
>
Will update.
I'm still not quite sure whether the barriers are needed or not.
The situation I'm worried about is something like 'memcpy_io(device A); memcpy_io(device B);', where I think memcpy_io() needs to guarantee the ordering.
Please let me know and I'll submit a new version.
Thanks,
Joonwoo
>
> --
> Catalin
--
The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
hosted by The Linux Foundation
* [PATCH] arm64: optimize memcpy_{from,to}io() and memset_io()
From: Catalin Marinas @ 2014-10-09 10:16 UTC (permalink / raw)
To: linux-arm-kernel
On Thu, Oct 09, 2014 at 03:39:33AM +0100, Joonwoo Park wrote:
> On Fri, Oct 03, 2014 at 05:31:42PM +0100, Catalin Marinas wrote:
> > On Tue, Jul 29, 2014 at 11:28:26PM -0700, Joonwoo Park wrote:
> > > diff --git a/arch/arm64/kernel/io.c b/arch/arm64/kernel/io.c
> > > index 7d37ead..c0e3ab1 100644
> > > --- a/arch/arm64/kernel/io.c
> > > +++ b/arch/arm64/kernel/io.c
> > > @@ -20,18 +20,34 @@
> > > #include <linux/types.h>
> > > #include <linux/io.h>
> > >
> > > +#define IO_CHECK_ALIGN(v, a) ((((unsigned long)(v)) & ((a) - 1)) == 0)
> >
> > Can you not use just IS_ALIGNED?
>
> Will do. I would need to cast from/to to unsigned long.
Or define IO_CHECK_ALIGN as IS_ALIGNED((unsigned long)v, a)
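i.e. something like:

	#define IO_CHECK_ALIGN(v, a)	IS_ALIGNED((unsigned long)(v), (a))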
> > > + from++;
> > > + to++;
> > > count--;
> > > - *t = readb(from);
> > > - t++;
> > > + }
> > > +
> > > + while (count >= 8) {
> > > + *(u64 *)to = readq_relaxed(from);
> > > + from += 8;
> > > + to += 8;
> > > + count -= 8;
> > > + }
> > > +
> > > + while (count) {
> > > + *(u8 *)to = readb_relaxed(from);
> > > from++;
> > > + to++;
> > > + count--;
> > > }
> > > + __iormb();
> >
> > We don't need this barrier here. In the readl() implementation, its use
> > is for ordering between I/O polling and DMA buffer access.
>
> The barriers here and below are for accessing different devices in a row.
> I thought that was your suggestion too:
> http://lists.infradead.org/pipermail/linux-arm-kernel/2012-September/123178.html
I think we should leave them out until we find a use case. I currently
don't see any (writel/readl etc. still have the barriers).
> > > + while (count && !IO_CHECK_ALIGN(p, 8)) {
> > > + writeb_relaxed(c, p);
> >
> > > Using dst directly here should work (__raw_writeb takes the same
> > type as the second argument).
>
> Will update.
>
> I'm still not quite sure whether the barriers are needed or not.
> The situation I'm worried about is something like 'memcpy_io(device A);
> memcpy_io(device B);', where I think memcpy_io() needs to guarantee the
> ordering.
Without barriers, ordering between device A and B would not be
guaranteed. But do you have a scenario where this ordering actually
matters? In most cases we have something like:
memcpy_io(device A); // mapped as Device or Normal NonCacheable
writel(device B); // or an I/O register of device A
Here writel() has the correct barrier.
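As a sketch (hypothetical names), the usual shape is:

	/* relaxed copy of e.g. firmware into the device's memory window */
	memcpy_toio(dev->fw_window, fw->data, fw->size);
	/* writel()'s barrier orders the copied data before the 'go' write */
	writel(FW_BOOT, dev->regs + CTRL);

and if two back-to-back copies to different devices ever did need to be ordered against each other, the caller could add the barrier itself:

	memcpy_toio(dev_a->mem, buf_a, len_a);
	wmb();		/* order device A's stores before device B's */
	memcpy_toio(dev_b->mem, buf_b, len_b);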
--
Catalin
* [PATCH] arm64: optimize memcpy_{from,to}io() and memset_io()
From: Joonwoo Park @ 2014-10-14 4:04 UTC (permalink / raw)
To: linux-arm-kernel
On Thu, Oct 09, 2014 at 11:16:14AM +0100, Catalin Marinas wrote:
> On Thu, Oct 09, 2014 at 03:39:33AM +0100, Joonwoo Park wrote:
> > On Fri, Oct 03, 2014 at 05:31:42PM +0100, Catalin Marinas wrote:
> > > On Tue, Jul 29, 2014 at 11:28:26PM -0700, Joonwoo Park wrote:
> > > > diff --git a/arch/arm64/kernel/io.c b/arch/arm64/kernel/io.c
> > > > index 7d37ead..c0e3ab1 100644
> > > > --- a/arch/arm64/kernel/io.c
> > > > +++ b/arch/arm64/kernel/io.c
> > > > @@ -20,18 +20,34 @@
> > > > #include <linux/types.h>
> > > > #include <linux/io.h>
> > > >
> > > > +#define IO_CHECK_ALIGN(v, a) ((((unsigned long)(v)) & ((a) - 1)) == 0)
> > >
> > > Can you not use just IS_ALIGNED?
> >
> > Will do. I would need to cast from/to to unsigned long.
>
> Or define IO_CHECK_ALIGN as IS_ALIGNED((unsigned long)v, a)
>
> > > > + from++;
> > > > + to++;
> > > > count--;
> > > > - *t = readb(from);
> > > > - t++;
> > > > + }
> > > > +
> > > > + while (count >= 8) {
> > > > + *(u64 *)to = readq_relaxed(from);
> > > > + from += 8;
> > > > + to += 8;
> > > > + count -= 8;
> > > > + }
> > > > +
> > > > + while (count) {
> > > > + *(u8 *)to = readb_relaxed(from);
> > > > from++;
> > > > + to++;
> > > > + count--;
> > > > }
> > > > + __iormb();
> > >
> > > We don't need this barrier here. In the readl() implementation, its use
> > > is for ordering between I/O polling and DMA buffer access.
> >
> > The barriers here and below are for accessing different devices in a row.
> > I thought that was your suggestion too:
> > http://lists.infradead.org/pipermail/linux-arm-kernel/2012-September/123178.html
>
> I think we should leave them out until we find a use case. I currently
> don't see any (writel/readl etc. still have the barriers).
>
> > > > + while (count && !IO_CHECK_ALIGN(p, 8)) {
> > > > + writeb_relaxed(c, p);
> > >
> > > > Using dst directly here should work (__raw_writeb takes the same
> > > type as the second argument).
> >
> > Will update.
> >
> > I'm still not quite sure whether the barriers are needed or not.
> > The situation I'm worried about is something like 'memcpy_io(device A);
> > memcpy_io(device B);', where I think memcpy_io() needs to guarantee the
> > ordering.
>
> Without barriers, ordering between device A and B would not be
> guaranteed. But do you have a scenario where this ordering actually
> matters? In most cases we have something like:
>
> memcpy_io(device A); // mapped as Device or Normal NonCacheable
> writel(device B); // or an I/O register of device A
>
> Here writel() has the correct barrier.
I don't have a use case where the ordering matters either. I agree that we should leave the barriers out until they're needed.
Thanks,
Joonwoo
>
> --
> Catalin
--
The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
hosted by The Linux Foundation
* [PATCH] arm64: optimize memcpy_{from,to}io() and memset_io()
From: Joonwoo Park @ 2014-10-14 4:12 UTC (permalink / raw)
To: linux-arm-kernel
From 1cb0bb81e0d399255e679337929a8438946a4b9a Mon Sep 17 00:00:00 2001
From: Joonwoo Park <joonwoop@codeaurora.org>
Date: Mon, 28 Jul 2014 17:32:06 -0700
Subject: [PATCH v2] arm64: optimize memcpy_{from,to}io() and memset_io()
Optimize memcpy_{from,to}io() and memset_io() by transferring in 64-bit
chunks as much as possible, with minimal barrier usage. This simple
optimization brings faster throughput compared to the current byte-by-byte
read and write with a barrier in the loop. The code's skeleton is taken
from the powerpc implementation.
Reviewed-by: Trilok Soni <tsoni@codeaurora.org>
Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
---
Changes in v2:
* Dropped the barriers.
* Used the IS_ALIGNED() macro.
* Used __raw_read{b,q} and __raw_write{b,q}.
* Fixed the source/destination mix-up in __memcpy_toio().
arch/arm64/kernel/io.c | 66 ++++++++++++++++++++++++++++++++++++++++++--------
1 file changed, 56 insertions(+), 10 deletions(-)
diff --git a/arch/arm64/kernel/io.c b/arch/arm64/kernel/io.c
index 7d37ead..a47cd65 100644
--- a/arch/arm64/kernel/io.c
+++ b/arch/arm64/kernel/io.c
@@ -25,12 +25,26 @@
*/
void __memcpy_fromio(void *to, const volatile void __iomem *from, size_t count)
{
- unsigned char *t = to;
- while (count) {
+ while (count && (!IS_ALIGNED((unsigned long)from, 8) ||
+ !IS_ALIGNED((unsigned long)to, 8))) {
+ *(u8 *)to = __raw_readb(from);
+ from++;
+ to++;
count--;
- *t = readb(from);
- t++;
+ }
+
+ while (count >= 8) {
+ *(u64 *)to = __raw_readq(from);
+ from += 8;
+ to += 8;
+ count -= 8;
+ }
+
+ while (count) {
+ *(u8 *)to = __raw_readb(from);
from++;
+ to++;
+ count--;
}
}
EXPORT_SYMBOL(__memcpy_fromio);
@@ -40,12 +54,26 @@ EXPORT_SYMBOL(__memcpy_fromio);
*/
void __memcpy_toio(volatile void __iomem *to, const void *from, size_t count)
{
- const unsigned char *f = from;
- while (count) {
+ while (count && (!IS_ALIGNED((unsigned long)to, 8) ||
+ !IS_ALIGNED((unsigned long)from, 8))) {
+ __raw_writeb(*(volatile u8 *)from, to);
+ from++;
+ to++;
count--;
- writeb(*f, to);
- f++;
+ }
+
+ while (count >= 8) {
+ __raw_writeq(*(volatile u64 *)from, to);
+ from += 8;
+ to += 8;
+ count -= 8;
+ }
+
+ while (count) {
+ __raw_writeb(*(volatile u8 *)from, to);
+ from++;
to++;
+ count--;
}
}
EXPORT_SYMBOL(__memcpy_toio);
@@ -55,10 +83,28 @@ EXPORT_SYMBOL(__memcpy_toio);
*/
void __memset_io(volatile void __iomem *dst, int c, size_t count)
{
- while (count) {
+ u64 qc = c;
+
+ qc |= qc << 8;
+ qc |= qc << 16;
+ qc |= qc << 32;
+
+ while (count && !IS_ALIGNED((unsigned long)dst, 8)) {
+ __raw_writeb(c, dst);
+ dst++;
count--;
- writeb(c, dst);
+ }
+
+ while (count >= 8) {
+ __raw_writeq(qc, dst);
+ dst += 8;
+ count -= 8;
+ }
+
+ while (count) {
+ __raw_writeb(c, dst);
dst++;
+ count--;
}
}
EXPORT_SYMBOL(__memset_io);
--
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
hosted by The Linux Foundation
* [PATCH] arm64: optimize memcpy_{from,to}io() and memset_io()
From: Catalin Marinas @ 2014-10-20 13:33 UTC (permalink / raw)
To: linux-arm-kernel
On Tue, Oct 14, 2014 at 05:12:41AM +0100, Joonwoo Park wrote:
> From 1cb0bb81e0d399255e679337929a8438946a4b9a Mon Sep 17 00:00:00 2001
> From: Joonwoo Park <joonwoop@codeaurora.org>
> Date: Mon, 28 Jul 2014 17:32:06 -0700
> Subject: [PATCH v2] arm64: optimize memcpy_{from,to}io() and memset_io()
>
> Optimize memcpy_{from,to}io() and memset_io() by transferring in 64-bit
> chunks as much as possible, with minimal barrier usage. This simple
> optimization brings faster throughput compared to the current byte-by-byte
> read and write with a barrier in the loop. The code's skeleton is taken
> from the powerpc implementation.
>
> Reviewed-by: Trilok Soni <tsoni@codeaurora.org>
> Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
[...]
> @@ -55,10 +83,28 @@ EXPORT_SYMBOL(__memcpy_toio);
> */
> void __memset_io(volatile void __iomem *dst, int c, size_t count)
> {
> - while (count) {
> + u64 qc = c;
I think you should use a (u8) cast here (I'm not sure there is a
requirement for memset to only pass u8 values).
u64 qc = (u8)c;
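To spell out what the cast buys (just an illustration, not part of the patch):

	int c = 0xab;
	u64 qc = (u8)c;		/* 0x00000000000000ab */

	qc |= qc << 8;		/* 0x000000000000abab */
	qc |= qc << 16;		/* 0x00000000abababab */
	qc |= qc << 32;		/* 0xabababababababab: one writeq() now
				 * stores the same pattern as eight
				 * writeb(c) calls */

Without the cast, an int with bits set above the low byte leaks into qc. For example c = -2 (0xfffffffe) sign-extends, so qc ends up as 0xfffffffffffffffe and the 64-bit stores would write 0xff to seven out of every eight bytes while the head/tail writeb(c) stores write 0xfe.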
Otherwise the patch looks fine. Since Will is handling the next merging
window, you can add:
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
* [PATCH] arm64: optimize memcpy_{from,to}io() and memset_io()
From: Joonwoo Park @ 2014-10-21 0:59 UTC (permalink / raw)
To: linux-arm-kernel
Optimize memcpy_{from,to}io() and memset_io() by transferring in 64-bit
chunks as much as possible, with minimal barrier usage. This simple
optimization brings faster throughput compared to the current byte-by-byte
read and write with a barrier in the loop. The code's skeleton is taken
from the powerpc implementation.
Link: http://lkml.kernel.org/p/20141020133304.GH23751@e104818-lin.cambridge.arm.com
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Reviewed-by: Trilok Soni <tsoni@codeaurora.org>
Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
---
arch/arm64/kernel/io.c | 66 ++++++++++++++++++++++++++++++++++++++++++--------
1 file changed, 56 insertions(+), 10 deletions(-)
diff --git a/arch/arm64/kernel/io.c b/arch/arm64/kernel/io.c
index 7d37ead..354be2a 100644
--- a/arch/arm64/kernel/io.c
+++ b/arch/arm64/kernel/io.c
@@ -25,12 +25,26 @@
*/
void __memcpy_fromio(void *to, const volatile void __iomem *from, size_t count)
{
- unsigned char *t = to;
- while (count) {
+ while (count && (!IS_ALIGNED((unsigned long)from, 8) ||
+ !IS_ALIGNED((unsigned long)to, 8))) {
+ *(u8 *)to = __raw_readb(from);
+ from++;
+ to++;
count--;
- *t = readb(from);
- t++;
+ }
+
+ while (count >= 8) {
+ *(u64 *)to = __raw_readq(from);
+ from += 8;
+ to += 8;
+ count -= 8;
+ }
+
+ while (count) {
+ *(u8 *)to = __raw_readb(from);
from++;
+ to++;
+ count--;
}
}
EXPORT_SYMBOL(__memcpy_fromio);
@@ -40,12 +54,26 @@ EXPORT_SYMBOL(__memcpy_fromio);
*/
void __memcpy_toio(volatile void __iomem *to, const void *from, size_t count)
{
- const unsigned char *f = from;
- while (count) {
+ while (count && (!IS_ALIGNED((unsigned long)to, 8) ||
+ !IS_ALIGNED((unsigned long)from, 8))) {
+ __raw_writeb(*(volatile u8 *)from, to);
+ from++;
+ to++;
count--;
- writeb(*f, to);
- f++;
+ }
+
+ while (count >= 8) {
+ __raw_writeq(*(volatile u64 *)from, to);
+ from += 8;
+ to += 8;
+ count -= 8;
+ }
+
+ while (count) {
+ __raw_writeb(*(volatile u8 *)from, to);
+ from++;
to++;
+ count--;
}
}
EXPORT_SYMBOL(__memcpy_toio);
@@ -55,10 +83,28 @@ EXPORT_SYMBOL(__memcpy_toio);
*/
void __memset_io(volatile void __iomem *dst, int c, size_t count)
{
- while (count) {
+ u64 qc = (u8)c;
+
+ qc |= qc << 8;
+ qc |= qc << 16;
+ qc |= qc << 32;
+
+ while (count && !IS_ALIGNED((unsigned long)dst, 8)) {
+ __raw_writeb(c, dst);
+ dst++;
count--;
- writeb(c, dst);
+ }
+
+ while (count >= 8) {
+ __raw_writeq(qc, dst);
+ dst += 8;
+ count -= 8;
+ }
+
+ while (count) {
+ __raw_writeb(c, dst);
dst++;
+ count--;
}
}
EXPORT_SYMBOL(__memset_io);
--
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
hosted by The Linux Foundation