linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [patch 2.6.17-rc5 1/2] i386 memcpy: use as few moves as possible for I/O
@ 2006-05-31  0:59 Chuck Ebbert
  2006-05-31  1:36 ` H. Peter Anvin
                   ` (2 more replies)
  0 siblings, 3 replies; 5+ messages in thread
From: Chuck Ebbert @ 2006-05-31  0:59 UTC (permalink / raw)
  To: linux-kernel; +Cc: Chris Lesiak, H. Peter Anvin, Andrew Morton, Linus Torvalds

Chris Lesiak reported that changes to i386's __memcpy() broke his device
because it can't handle byte moves and the new code uses them for
all trailing bytes when the length is not divisible by four.  The old
code tried to use a 16-bit move and/or a byte move as needed.

H. Peter Anvin:
"There are only a few semantics that make sense: fixed 8, 16, 32, or 64
bits, plus "optimal"; the latter to be used for anything that doesn't
require a specific transfer size.  Logically, an unqualified
"memcpy_to/fromio" should be the optimal size (as few transfers as
possible)"

So add back the old code as __minimal_memcpy and have IO transfers
use that.

Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com>

---

 include/asm-i386/io.h     |    4 ++--
 include/asm-i386/string.h |   21 +++++++++++++++++++++
 2 files changed, 23 insertions(+), 2 deletions(-)

--- 2.6.17-rc5-32.orig/include/asm-i386/io.h
+++ 2.6.17-rc5-32/include/asm-i386/io.h
@@ -202,11 +202,11 @@ static inline void memset_io(volatile vo
 }
 static inline void memcpy_fromio(void *dst, const volatile void __iomem *src, int count)
 {
-	__memcpy(dst, (void __force *) src, count);
+	__minimal_memcpy(dst, (void __force *) src, count);
 }
 static inline void memcpy_toio(volatile void __iomem *dst, const void *src, int count)
 {
-	__memcpy((void __force *) dst, src, count);
+	__minimal_memcpy((void __force *) dst, src, count);
 }
 
 /*
--- 2.6.17-rc5-32.orig/include/asm-i386/string.h
+++ 2.6.17-rc5-32/include/asm-i386/string.h
@@ -220,6 +220,28 @@ return (to);
 }
 
 /*
+ * Do memcpy with as few moves as possible (for transfers to/from IO space.)
+ */
+static inline void * __minimal_memcpy(void * to, const void * from, size_t n)
+{
+int d0, d1, d2;
+__asm__ __volatile__(
+	"rep ; movsl\n\t"
+	"testb $2,%b4\n\t"
+	"jz 1f\n\t"
+	"movsw\n"
+	"1:\n\t"
+	"testb $1,%b4\n\t"
+	"jz 2f\n\t"
+	"movsb\n"
+	"2:"
+	: "=&c" (d0), "=&D" (d1), "=&S" (d2)
+	:"0" (n/4), "q" (n), "1" ((long) to), "2" ((long) from)
+	: "memory");
+return to;
+}
+
+/*
  * This looks ugly, but the compiler can optimize it totally,
  * as the count is constant.
  */
-- 
Chuck

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [patch 2.6.17-rc5 1/2] i386 memcpy: use as few moves as  possible for I/O
  2006-05-31  0:59 [patch 2.6.17-rc5 1/2] i386 memcpy: use as few moves as possible for I/O Chuck Ebbert
@ 2006-05-31  1:36 ` H. Peter Anvin
  2006-05-31  3:42 ` H. Peter Anvin
  2006-05-31 11:39 ` linux-os (Dick Johnson)
  2 siblings, 0 replies; 5+ messages in thread
From: H. Peter Anvin @ 2006-05-31  1:36 UTC (permalink / raw)
  To: Chuck Ebbert; +Cc: linux-kernel, Chris Lesiak, Andrew Morton, Linus Torvalds

One concern: this is not the standard return value for memcpy().  It either needs a 
comment to that effect (stating it returns a pointer to the end of the area), or just make 
it return void.

Also, the formatting looks nonstandard.

>  
>  /*
> + * Do memcpy with as few moves as possible (for transfers to/from IO space.)
> + */
> +static inline void * __minimal_memcpy(void * to, const void * from, size_t n)
> +{
> +int d0, d1, d2;
> +__asm__ __volatile__(
> +	"rep ; movsl\n\t"
> +	"testb $2,%b4\n\t"
> +	"jz 1f\n\t"
> +	"movsw\n"
> +	"1:\n\t"
> +	"testb $1,%b4\n\t"
> +	"jz 2f\n\t"
> +	"movsb\n"
> +	"2:"
> +	: "=&c" (d0), "=&D" (d1), "=&S" (d2)
> +	:"0" (n/4), "q" (n), "1" ((long) to), "2" ((long) from)
> +	: "memory");
> +return to;
> +}


	-hpa

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [patch 2.6.17-rc5 1/2] i386 memcpy: use as few moves as  possible for I/O
  2006-05-31  0:59 [patch 2.6.17-rc5 1/2] i386 memcpy: use as few moves as possible for I/O Chuck Ebbert
  2006-05-31  1:36 ` H. Peter Anvin
@ 2006-05-31  3:42 ` H. Peter Anvin
  2006-05-31 11:39 ` linux-os (Dick Johnson)
  2 siblings, 0 replies; 5+ messages in thread
From: H. Peter Anvin @ 2006-05-31  3:42 UTC (permalink / raw)
  To: Chuck Ebbert; +Cc: linux-kernel, Chris Lesiak, Andrew Morton, Linus Torvalds

[-- Attachment #1: Type: text/plain, Size: 978 bytes --]

Chuck Ebbert wrote:
> Chris Lesiak reported that changes to i386's __memcpy() broke his device
> because it can't handle byte moves and the new code uses them for
> all trailing bytes when the length is not divisible by four.  The old
> code tried to use a 16-bit move and/or a byte move as needed.
> 
> H. Peter Anvin:
> "There are only a few semantics that make sense: fixed 8, 16, 32, or 64
> bits, plus "optimal"; the latter to be used for anything that doesn't
> require a specific transfer size.  Logically, an unqualified
> "memcpy_to/fromio" should be the optimal size (as few transfers as
> possible)"
> 
> So add back the old code as __minimal_memcpy and have IO transfers
> use that.
> 

I was thinking some more about that, and I suspect the "right" way to do 
this looks something like the attached code.  Note that it assymetric, 
and that it's probably too long to inline.

I haven't tested this yet, and I probably won't have time to do so this 
evening.

	-hpa

[-- Attachment #2: memcpy_io.S --]
[-- Type: text/plain, Size: 1443 bytes --]

/*
 * arch/i386/lib/memcpy_io.S
 *
 * The most general form of memory copy to/from I/O space, used for
 * devices which can handle arbitrary transactions with appropriate
 * handling of byte enables.  The goal is to produce the minimum
 * number of naturally aligned transactions on the bus.
 */
	
#include <linux/config.h>
	
	.globl	memcpy_toio
	.type	memcpy_toio, @function
	
memcpy_toio:
	pushl	%edi
	pushl	%esi
	
#ifdef CONFIG_REGPARM
	movl	%eax, %edi
	movl	%edx, %esi
#else	
	movl	12(%esp), %eax
	movl	16(%esp), %edx
	movl	20(%esp), %ecx
#endif

	jecxz	1f
	
	testl	$1, %edi
	jz	2f
	movsb
	decl	%ecx
2:
	cmpl	$2, %ecx
	jb	3f
	testl	$2, %edi
	jz	4f
	movsw
	decl	%ecx
	decl	%ecx
4:
	movl	%ecx, %edx
	shrl	$2, %ecx
	jz	5f
	rep ; movsl
5:
	movl	%edx, %ecx
	testb	$2, %cl
	jz	3f
	movsw
3:
	testb	$1, %cl
	jz	1f
	movsb
1:
	pop	%esi
	pop	%edi
	ret

	.size	memcpy_toio, .-memcpy_toio

	.globl	memcpy_toio
	.type	memcpy_fromio, @function
	
memcpy_fromio:
	pushl	%edi
	pushl	%esi
	
#ifdef CONFIG_REGPARM
	movl	%eax, %edi
	movl	%edx, %esi
#else	
	movl	12(%esp), %eax
	movl	16(%esp), %edx
	movl	20(%esp), %ecx
#endif

	jecxz	1f
	
	testl	$1, %esi
	jz	2f
	movsb
	decl	%ecx
2:
	cmpl	$2, %ecx
	jb	3f
	testl	$2, %esi
	jz	4f
	movsw
	decl	%ecx
	decl	%ecx
4:
	movl	%ecx, %edx
	shrl	$2, %ecx
	jz	5f
	rep ; movsl
5:
	movl	%edx, %ecx
	testb	$2, %cl
	jz	3f
	movsw
3:
	testb	$1, %cl
	jz	1f
	movsb
1:
	pop	%esi
	pop	%edi
	ret

	.size	memcpy_fromio, .-memcpy_fromio

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [patch 2.6.17-rc5 1/2] i386 memcpy: use as few moves as  possible for I/O
  2006-05-31  0:59 [patch 2.6.17-rc5 1/2] i386 memcpy: use as few moves as possible for I/O Chuck Ebbert
  2006-05-31  1:36 ` H. Peter Anvin
  2006-05-31  3:42 ` H. Peter Anvin
@ 2006-05-31 11:39 ` linux-os (Dick Johnson)
  2 siblings, 0 replies; 5+ messages in thread
From: linux-os (Dick Johnson) @ 2006-05-31 11:39 UTC (permalink / raw)
  To: Chuck Ebbert
  Cc: linux-kernel, Chris Lesiak, H. Peter Anvin, Andrew Morton,
	Linus Torvalds


On Tue, 30 May 2006, Chuck Ebbert wrote:

> Chris Lesiak reported that changes to i386's __memcpy() broke his device
> because it can't handle byte moves and the new code uses them for
> all trailing bytes when the length is not divisible by four.  The old
> code tried to use a 16-bit move and/or a byte move as needed.
>
> H. Peter Anvin:
> "There are only a few semantics that make sense: fixed 8, 16, 32, or 64
> bits, plus "optimal"; the latter to be used for anything that doesn't
> require a specific transfer size.  Logically, an unqualified
> "memcpy_to/fromio" should be the optimal size (as few transfers as
> possible)"
>
> So add back the old code as __minimal_memcpy and have IO transfers
> use that.
>
> Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com>
>
> ---
>
> include/asm-i386/io.h     |    4 ++--
> include/asm-i386/string.h |   21 +++++++++++++++++++++
> 2 files changed, 23 insertions(+), 2 deletions(-)
>
> --- 2.6.17-rc5-32.orig/include/asm-i386/io.h
> +++ 2.6.17-rc5-32/include/asm-i386/io.h
> @@ -202,11 +202,11 @@ static inline void memset_io(volatile vo
> }
> static inline void memcpy_fromio(void *dst, const volatile void __iomem *src, int count)
> {
> -	__memcpy(dst, (void __force *) src, count);
> +	__minimal_memcpy(dst, (void __force *) src, count);
> }
> static inline void memcpy_toio(volatile void __iomem *dst, const void *src, int count)
> {
> -	__memcpy((void __force *) dst, src, count);
> +	__minimal_memcpy((void __force *) dst, src, count);
> }
>
> /*
> --- 2.6.17-rc5-32.orig/include/asm-i386/string.h
> +++ 2.6.17-rc5-32/include/asm-i386/string.h
> @@ -220,6 +220,28 @@ return (to);
> }
>
> /*
> + * Do memcpy with as few moves as possible (for transfers to/from IO space.)
> + */
> +static inline void * __minimal_memcpy(void * to, const void * from, size_t n)
> +{
> +int d0, d1, d2;
> +__asm__ __volatile__(
> +	"rep ; movsl\n\t"
> +	"testb $2,%b4\n\t"
> +	"jz 1f\n\t"
> +	"movsw\n"
> +	"1:\n\t"
> +	"testb $1,%b4\n\t"
> +	"jz 2f\n\t"
> +	"movsb\n"
> +	"2:"
> +	: "=&c" (d0), "=&D" (d1), "=&S" (d2)
> +	:"0" (n/4), "q" (n), "1" ((long) to), "2" ((long) from)
> +	: "memory");
> +return to;
> +}
> +
> +/*
>  * This looks ugly, but the compiler can optimize it totally,
>  * as the count is constant.
>  */

> +	"rep ; movsl\n\t"
^^^^^^^^^^^^^^^^^^^^^^^^
Huh, you copyied stuff into three variables, then you needed to make
memory accesses to those variables, and you claim "the compiler can
optimize it totally". Sure. I got a bridge I'd like to sell you.

Please, instead of guessing, why don't measure the exact number of
CPU cycles with rdtsc?

> --
> Chuck
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
>

Cheers,
Dick Johnson
Penguin : Linux version 2.6.16.4 on an i686 machine (5592.73 BogoMips).
New book: http://www.AbominableFirebug.com/
_
\x1a\x04

****************************************************************
The information transmitted in this message is confidential and may be privileged.  Any review, retransmission, dissemination, or other use of this information by persons or entities other than the intended recipient is prohibited.  If you are not the intended recipient, please notify Analogic Corporation immediately - by replying to this message or by sending an email to DeliveryErrors@analogic.com - and destroy all copies of this information, including any attachments, without reading or disclosing them.

Thank you.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [patch 2.6.17-rc5 1/2] i386 memcpy: use as few moves as  possible for I/O
@ 2006-05-31 23:01 Chuck Ebbert
  0 siblings, 0 replies; 5+ messages in thread
From: Chuck Ebbert @ 2006-05-31 23:01 UTC (permalink / raw)
  To: H. Peter Anvin; +Cc: Linus Torvalds, Andrew Morton, Chris Lesiak, linux-kernel

In-Reply-To: <447D1094.20409@zytor.com>

On Tue, 30 May 2006 20:42:12 -0700, H. Peter Anvin wrote:

> I was thinking some more about that, and I suspect the "right" way to do 
> this looks something like the attached code.  Note that it assymetric, 
> and that it's probably too long to inline.
> 
> I haven't tested this yet, and I probably won't have time to do so this 
> evening.

There were some small problems, but I think I fixed them:


/*
 * arch/i386/lib/memcpy_io.S
 *
 * The most general form of memory copy to/from I/O space, used for
 * devices which can handle arbitrary transactions with appropriate
 * handling of byte enables.  The goal is to produce the minimum
 * number of naturally aligned transactions on the bus.
 */
	
#include <linux/config.h>
#include <linux/linkage.h>
	
.macro	build_memcpy_io_fn fn_name,align_reg

	.globl	\fn_name
	.type	\fn_name, @function
	
	ALIGN
\fn_name:

	ebp_space=0
#ifdef CONFIG_FRAME_POINTER
	pushl	%ebp
	movl	%esp,%ebp
	ebp_space=4
#endif

	pushl	%edi
	pushl	%esi
	
#ifdef CONFIG_REGPARM
	movl	%eax, %edi
	movl	%edx, %esi
#else
	movl	12+ebp_space(%esp), %edi
	movl	20+ebp_space(%esp), %ecx
	movl	16+ebp_space(%esp), %esi
#endif

	jecxz	1f
	
	testl	$1, \align_reg
	jz	2f
	movsb
	decl	%ecx
2:
	cmpl	$2, %ecx
	jb	3f
	testl	$2, \align_reg
	jz	4f
	movsw
	decl	%ecx
	decl	%ecx
4:
	movl	%ecx, %edx
	shrl	$2, %ecx
	jz	5f
	rep ; movsl
5:
	movl	%edx, %ecx
	testb	$2, %cl
	jz	3f
	movsw
3:
	testb	$1, %cl
	jz	1f
	movsb
1:
	pop	%esi
	pop	%edi

#ifdef CONFIG_FRAME_POINTER
	leave
#endif

	ret

	.size	\fn_name, .-\fn_name

.endm

	build_memcpy_io_fn fn_name=memcpy_fromio,align_reg=%esi
	build_memcpy_io_fn fn_name=memcpy_toio,align_reg=%edi
-- 
Chuck

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2006-05-31 23:07 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2006-05-31  0:59 [patch 2.6.17-rc5 1/2] i386 memcpy: use as few moves as possible for I/O Chuck Ebbert
2006-05-31  1:36 ` H. Peter Anvin
2006-05-31  3:42 ` H. Peter Anvin
2006-05-31 11:39 ` linux-os (Dick Johnson)
2006-05-31 23:01 Chuck Ebbert

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).