From: "Guan Xuetao" <guanxuetao@mprc.pku.edu.cn>
To: <linux-arch@vger.kernel.org>, <linux-kernel@vger.kernel.org>
Subject: [PATCHv1 7/8] unicore32 additional architecture files: low-level lib: checksum
Date: Mon, 3 Jan 2011 19:54:03 +0800	[thread overview]
Message-ID: <013101cbab3c$eb0a7670$c11f6350$@mprc.pku.edu.cn> (raw)

From: Guan Xuetao <guanxuetao@mprc.pku.edu.cn>

Patch 7 implements the low-level checksum library routines.
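
A quick reviewer's note: csum_fold in checksum.h performs the usual
16-bit ones'-complement fold via a rotate-and-add; a portable C sketch
of the same operation (illustrative only, not part of the patch, and
the helper name is made up) is:

	static inline unsigned short fold32_to_16(unsigned int sum)
	{
		sum = (sum & 0xffff) + (sum >> 16);	/* add high and low halves */
		sum += sum >> 16;			/* fold the carry back in */
		return (unsigned short)~sum;		/* ones' complement result */
	}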

Signed-off-by: Guan Xuetao <guanxuetao@mprc.pku.edu.cn>
---
 arch/unicore32/include/asm/checksum.h       |  142 +++++++++++
 arch/unicore32/lib/csumipv6.S               |   36 +++
 arch/unicore32/lib/csumpartial.S            |  126 ++++++++++
 arch/unicore32/lib/csumpartialcopy.S        |   61 +++++
 arch/unicore32/lib/csumpartialcopygeneric.S |  335 +++++++++++++++++++++++++++
 arch/unicore32/lib/csumpartialcopyuser.S    |   92 ++++++++
 6 files changed, 792 insertions(+), 0 deletions(-)
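
As a usage sketch (illustrative only; 'udph' and 'ulen' are hypothetical
names for the UDP header pointer and UDP length), the helpers declared in
checksum.h are meant to be combined roughly as follows:

	__wsum sum = csum_partial(udph, ulen, 0);	/* sum UDP header + payload */
	__sum16 check = csum_tcpudp_magic(saddr, daddr, ulen,
					  IPPROTO_UDP, sum);	/* add pseudo-header, fold */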

diff --git a/arch/unicore32/include/asm/checksum.h b/arch/unicore32/include/asm/checksum.h
new file mode 100644
index 0000000..59a97d8
--- /dev/null
+++ b/arch/unicore32/include/asm/checksum.h
@@ -0,0 +1,142 @@
+/*
+ * linux/arch/unicore32/include/asm/checksum.h
+ *
+ * Code specific to PKUnity SoC and UniCore ISA
+ *
+ * Copyright (C) 2001-2010 GUAN Xue-tao
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * IP checksum routines
+ */
+#ifndef __UNICORE_CHECKSUM_H__
+#define __UNICORE_CHECKSUM_H__
+
+#include <linux/in6.h>
+
+/*
+ * computes the checksum of a memory block at buff, length len,
+ * and adds in "sum" (32-bit)
+ *
+ * returns a 32-bit number suitable for feeding into itself
+ * or csum_tcpudp_magic
+ *
+ * this function must be called with even lengths, except
+ * for the last fragment, which may be odd
+ *
+ * it's best to have buff aligned on a 32-bit boundary
+ */
+__wsum csum_partial(const void *buff, int len, __wsum sum);
+
+/*
+ * the same as csum_partial, but copies from src while it
+ * checksums, and handles user-space pointer exceptions correctly, when needed.
+ *
+ * here even more important to align src and dst on a 32-bit (or even
+ * better 64-bit) boundary
+ */
+
+__wsum
+csum_partial_copy_nocheck(const void *src, void *dst, int len, __wsum sum);
+
+__wsum
+csum_partial_copy_from_user(const void __user *src, void *dst,
+		int len, __wsum sum, int *err_ptr);
+
+/*
+ *	Fold a partial checksum without adding pseudo headers
+ */
+static inline __sum16 csum_fold(__wsum sum)
+{
+	__asm__(
+	"add	%0, %1, %1 <> #16	@ csum_fold"
+	: "=r" (sum)
+	: "r" (sum)
+	: "cc");
+	return (__force __sum16)(~(__force u32)sum >> 16);
+}
+
+/*
+ *	This is a version of ip_compute_csum() optimized for IP headers,
+ *	which always checksum on 4 octet boundaries.
+ */
+static inline __sum16
+ip_fast_csum(const void *iph, unsigned int ihl)
+{
+	unsigned int tmp1;
+	__wsum sum;
+
+	__asm__ __volatile__(
+	"ldw.w	%0, [%1]+, #4		@ ip_fast_csum"
+	"ldw.w	%3, [%1]+, #4"
+	"sub	%2, %2, #5"
+	"add.a	%0, %0, %3"
+	"ldw.w	%3, [%1]+, #4"
+	"addc.a	%0, %0, %3"
+	"ldw.w	%3, [%1]+, #4"
+"1:	addc.a	%0, %0, %3"
+	"ldw.w	%3, [%1]+, #4"
+	"cmpand.a	%2, #15		@ do this carefully"
+	"beq	2f"
+	"sub	%2, %2, #1		@ without destroying"
+	"bne	1b			@ the carry flag"
+"2:	addc.a	%0, %0, %3"
+	"addc	%0, %0, #0"
+	: "=r" (sum), "=r" (iph), "=r" (ihl), "=r" (tmp1)
+	: "1" (iph), "2" (ihl)
+	: "cc", "memory");
+	return csum_fold(sum);
+}
+
+static inline __wsum
+csum_tcpudp_nofold(__be32 saddr, __be32 daddr, unsigned short len,
+		   unsigned short proto, __wsum sum)
+{
+	__asm__(
+	"add.a	%0, %1, %2		@ csum_tcpudp_nofold"
+	"addc.a	%0, %0, %3"
+	"addc.a	%0, %0, %4 << #8"
+	"addc.a	%0, %0, %5"
+	"addc	%0, %0, #0"
+	: "=&r"(sum)
+	: "r" (sum), "r" (daddr), "r" (saddr), "r" (len), "Ir" (htons(proto))
+	: "cc");
+	return sum;
+}
+/*
+ * computes the checksum of the TCP/UDP pseudo-header
+ * returns a 16-bit checksum, already complemented
+ */
+static inline __sum16
+csum_tcpudp_magic(__be32 saddr, __be32 daddr, unsigned short len,
+		  unsigned short proto, __wsum sum)
+{
+	return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
+}
+
+
+/*
+ * this routine is used for miscellaneous IP-like checksums, mainly
+ * in icmp.c
+ */
+static inline __sum16
+ip_compute_csum(const void *buff, int len)
+{
+	return csum_fold(csum_partial(buff, len, 0));
+}
+
+#define _HAVE_ARCH_IPV6_CSUM
+extern __wsum
+__csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr,
+		__be32 len, __be32 proto, __wsum sum);
+
+static inline __sum16
+csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr,
+		__u32 len, unsigned short proto, __wsum sum)
+{
+	return csum_fold(__csum_ipv6_magic(saddr, daddr, htonl(len),
+					   htonl(proto), sum));
+}
+#endif
diff --git a/arch/unicore32/lib/csumipv6.S b/arch/unicore32/lib/csumipv6.S
new file mode 100644
index 0000000..47fad61
--- /dev/null
+++ b/arch/unicore32/lib/csumipv6.S
@@ -0,0 +1,36 @@
+/*
+ * linux/arch/unicore32/lib/csumipv6.S
+ *
+ * Code specific to PKUnity SoC and UniCore ISA
+ *
+ * Copyright (C) 2001-2010 GUAN Xue-tao
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+		.text
+
+ENTRY(__csum_ipv6_magic)
+		stw.w	lr, [sp+], #-4
+		add.a	ip, r2, r3
+
+		ldm   (r1 - r4), [r1]+
+		addc.a	ip, ip, r1
+		addc.a	ip, ip, r2
+		addc.a	ip, ip, r3
+		addc.a	ip, ip, r4
+		ldm	(r0 - r3), [r0]+
+		addc.a	r0, ip, r0
+		addc.a	r0, r0, r1
+		addc.a	r0, r0, r2
+		ldw	r2, [sp+], #4
+		addc.a	r0, r0, r3
+		addc.a	r0, r0, r2
+		addc.a	r0, r0, #0
+		ldm.w	(pc), [sp]+
+ENDPROC(__csum_ipv6_magic)
+
diff --git a/arch/unicore32/lib/csumpartial.S b/arch/unicore32/lib/csumpartial.S
new file mode 100644
index 0000000..23e36c5
--- /dev/null
+++ b/arch/unicore32/lib/csumpartial.S
@@ -0,0 +1,126 @@
+/*
+ * linux/arch/unicore32/lib/csumpartial.S
+ *
+ * Code specific to PKUnity SoC and UniCore ISA
+ *
+ * Copyright (C) 2001-2010 GUAN Xue-tao
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+		.text
+
+/*
+ * Function: __u32 csum_partial(const char *src, int len, __u32 sum)
+ * Params  : r0 = buffer, r1 = len, r2 = checksum
+ * Returns : r0 = new checksum
+ */
+
+buf	.req	r0
+len	.req	r1
+sum	.req	r2
+td0	.req	r3
+td1	.req	r4
+td2	.req	r5
+td3	.req	r6
+
+.Lzero:		mov	r0, sum
+		add	sp, sp, #4
+		ldw.w	pc, [sp]+, #4
+
+		/*
+		 * Handle 0 to 7 bytes, with any alignment of source and
+		 * destination pointers.  Note that when we get here, C = 0
+		 */
+.Lless8:		cxor.a	len, #0			@ check for zero count
+		beq	.Lzero
+
+		/* we must have at least one byte. */
+		cand.a	buf, #1				@ odd address?
+		beq	.Lless4
+		mov	sum, sum <> #8
+		ldb.w	td0, [buf]+, #1
+		sub	len, len, #1
+		addc.a	sum, sum, td0 put_byte_1
+
+.Lless4:	cand.a	len, #6
+		beq	.Lless8_byte
+
+		/* we are now half-word aligned */
+
+.Lless8_wordlp:
+		ldh.w	td0, [buf]+, #2
+		sub	len, len, #2
+		addc.a	sum, sum, td0
+		cand.a	len, #6
+		bne	.Lless8_wordlp
+
+.Lless8_byte:	cand.a	len, #1			@ odd number of bytes
+		beq	.Ldone
+		ldb.w	td0, [buf]+, #1		@ include last byte
+		addc.a	sum, sum, td0 put_byte_0	@ update checksum
+
+.Ldone:		addc	r0, sum, #0		@ collect up the last carry
+		ldw.w	td0, [sp]+, #4
+		cand.a	td0, #1			@ check buffer alignment
+		cmovne	r0, r0 <> #8		@ rotate checksum by 8 bits
+		ldw.w	pc, [sp]+, #4		@ return
+
+.Lnot_aligned:	cand.a	buf, #1			@ odd address
+		beq	201f
+		ldb.w	td0, [buf]+, #1		@ make even
+		sub	len, len, #1
+		addc.a	sum, sum, td0 put_byte_1	@ update checksum
+	201:
+		cand.a	buf, #2			@ 32-bit aligned?
+		beq	201f
+		ldh.w	td0, [buf]+, #2		@ make 32-bit aligned
+		sub	len, len, #2
+		addc.a	sum, sum, td0		@ update checksum
+	201:
+		mov	pc, lr
+
+ENTRY(csum_partial)
+		stm.w	(lr), [sp-]
+		stm.w	(buf), [sp-]
+		csub.a	len, #8			@ Ensure that we have at least
+		bub	.Lless8			@ 8 bytes to copy.
+
+		cand.a	buf, #1
+		cmovne	sum, sum <> #8
+
+		add.a	sum, sum, #0		@ C = 0
+		cand.a	buf, #3			@ Test destination alignment
+		bne.l	.Lnot_aligned		@ align destination, return here
+
+1:		andn.a	ip, len, #31
+		beq	3f
+
+2:		ldm.w	(td0, td1, td2, td3), [buf]+
+		addc.a	sum, sum, td0
+		addc.a	sum, sum, td1
+		addc.a	sum, sum, td2
+		addc.a	sum, sum, td3
+		ldm.w	(td0, td1, td2, td3), [buf]+
+		addc.a	sum, sum, td0
+		addc.a	sum, sum, td1
+		addc.a	sum, sum, td2
+		addc.a	sum, sum, td3
+		sub	ip, ip, #32
+		cxor.a	ip, #0
+		bne	2b
+
+3:		cand.a	len, #0x1c		@ should not change C
+		beq	.Lless4
+
+4:		ldw.w	td0, [buf]+, #4
+		sub	len, len, #4
+		addc.a	sum, sum, td0
+		cand.a	len, #0x1c
+		bne	4b
+		b	.Lless4
+ENDPROC(csum_partial)
diff --git a/arch/unicore32/lib/csumpartialcopy.S b/arch/unicore32/lib/csumpartialcopy.S
new file mode 100644
index 0000000..e4fa5c2
--- /dev/null
+++ b/arch/unicore32/lib/csumpartialcopy.S
@@ -0,0 +1,61 @@
+/*
+ * linux/arch/unicore32/lib/csumpartialcopy.S
+ *
+ * Code specific to PKUnity SoC and UniCore ISA
+ *
+ * Copyright (C) 2001-2010 GUAN Xue-tao
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+		.text
+
+/*
+ * Function: __u32 csum_partial_copy_nocheck
+ * 			(const char *src, char *dst, int len, __u32 sum)
+ * Params  : r0 = src, r1 = dst, r2 = len, r3 = checksum
+ * Returns : r0 = new checksum
+ */
+
+		.macro	save_regs
+		mov	ip, sp
+		stm.w	(fp, ip, lr, pc), [sp-]
+		stm.w	(r1), [sp-]
+		sub	fp, ip, #4
+		.endm
+
+		.macro	load_regs
+		ldm.w	(r1), [sp]+
+		ldm	(fp, sp, pc), [sp]+
+		.endm
+
+		.macro	load1b, reg1
+		ldb.w	\reg1, [r0]+, #1
+		.endm
+
+		.macro	load2b, reg1, reg2
+		ldb.w	\reg1, [r0]+, #1
+		ldb.w	\reg2, [r0]+, #1
+		.endm
+
+		.macro	load1l, reg1
+		ldw.w	\reg1, [r0]+, #4
+		.endm
+
+		.macro	load2l, reg1, reg2
+		ldw.w	\reg1, [r0]+, #4
+		ldw.w	\reg2, [r0]+, #4
+		.endm
+
+		.macro	load4l, reg1, reg2, reg3, reg4
+		ldm.w	(\reg1, \reg2, \reg3, \reg4), [r0]+
+		.endm
+
+#define FN_ENTRY	ENTRY(csum_partial_copy_nocheck)
+#define FN_EXIT		ENDPROC(csum_partial_copy_nocheck)
+
+#include "csumpartialcopygeneric.S"
diff --git a/arch/unicore32/lib/csumpartialcopygeneric.S b/arch/unicore32/lib/csumpartialcopygeneric.S
new file mode 100644
index 0000000..d5a4a3d
--- /dev/null
+++ b/arch/unicore32/lib/csumpartialcopygeneric.S
@@ -0,0 +1,335 @@
+/*
+ * linux/arch/unicore32/lib/csumpartialcopygeneric.S
+ *
+ * Code specific to PKUnity SoC and UniCore ISA
+ *
+ * Copyright (C) 2001-2010 GUAN Xue-tao
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/*
+ * unsigned int
+ * csum_partial_copy_xxx(const char *src, char *dst, int len, int sum)
+ *  r0 = src, r1 = dst, r2 = len, r3 = sum
+ *  Returns : r0 = checksum
+ *
+ * Note that 'cand.a' and 'cxor.a' preserve the carry flag.
+ */
+
+src	.req	r0
+dst	.req	r1
+len	.req	r2
+sum	.req	r3
+
+.Lzero:		mov	r0, sum
+		load_regs
+
+		/*
+		 * Align an unaligned destination pointer.  We know that
+		 * we have >= 8 bytes here, so we don't need to check
+		 * the length.  Note that the source pointer hasn't been
+		 * aligned yet.
+		 */
+.Ldst_unaligned:
+		cand.a	dst, #1
+		beq	.Ldst_16bit
+
+		load1b	ip
+		sub	len, len, #1
+		addc.a	sum, sum, ip put_byte_1		@ update checksum
+		stb.w	ip, [dst]+, #1
+		cand.a	dst, #2
+		cmoveq	pc, lr			@ dst is now 32bit aligned
+
+.Ldst_16bit:	load2b	r8, ip
+		sub	len, len, #2
+		addc.a	sum, sum, r8 put_byte_0
+		stb.w	r8, [dst]+, #1
+		addc.a	sum, sum, ip put_byte_1
+		stb.w	ip, [dst]+, #1
+		mov	pc, lr			@ dst is now 32bit aligned
+
+		/*
+		 * Handle 0 to 7 bytes, with any alignment of source and
+		 * destination pointers.  Note that when we get here, C = 0
+		 */
+.Lless8:	cxor.a	len, #0			@ check for zero count
+		beq	.Lzero
+
+		/* we must have at least one byte. */
+		cand.a	dst, #1				@ dst 16-bit aligned
+		beq	.Lless8_aligned
+
+		/* Align dst */
+		load1b	ip
+		sub	len, len, #1
+		addc.a	sum, sum, ip put_byte_1		@ update checksum
+		stb.w	ip, [dst]+, #1
+		cand.a	len, #6
+		beq	.Lless8_byteonly
+
+1:		load2b	r8, ip
+		sub	len, len, #2
+		addc.a	sum, sum, r8 put_byte_0
+		stb.w	r8, [dst]+, #1
+		addc.a	sum, sum, ip put_byte_1
+		stb.w	ip, [dst]+, #1
+.Lless8_aligned:
+		cand.a	len, #6
+		bne	1b
+.Lless8_byteonly:
+		cand.a	len, #1
+		beq	.Ldone
+		load1b	r8
+		addc.a	sum, sum, r8 put_byte_0		@ update checksum
+		stb.w	r8, [dst]+, #1
+		b	.Ldone
+
+FN_ENTRY
+		save_regs
+
+		csub.a	len, #8			@ Ensure that we have at least
+		bub	.Lless8			@ 8 bytes to copy.
+
+		add.a	sum, sum, #0		@ C = 0
+		cand.a	dst, #3			@ Test destination alignment
+		bne.l	.Ldst_unaligned		@ align destination, return here
+
+		/*
+		 * Ok, the dst pointer is now 32bit aligned, and we know
+		 * that we must have more than 4 bytes to copy.  Note
+		 * that C contains the carry from the dst alignment above.
+		 */
+
+		cand.a	src, #3			@ Test source alignment
+		bne	.Lsrc_not_aligned
+
+		/* Routine for src & dst aligned */
+
+		andn.a	ip, len, #15
+		beq	2f
+
+1:		load4l	r4, r5, r6, r7
+		stm.w	(r4, r5, r6, r7), [dst]+
+		addc.a	sum, sum, r4
+		addc.a	sum, sum, r5
+		addc.a	sum, sum, r6
+		addc.a	sum, sum, r7
+		sub	ip, ip, #16
+		cxor.a	ip, #0
+		bne	1b
+
+2:		and.a	ip, len, #12
+		beq	4f
+		cand.a	ip, #8
+		beq	3f
+		load2l	r4, r5
+		stm.w	(r4, r5), [dst]+
+		addc.a	sum, sum, r4
+		addc.a	sum, sum, r5
+		cand.a	ip, #4
+		beq	4f
+
+3:		load1l	r4
+		stw.w	r4, [dst]+, #4
+		addc.a	sum, sum, r4
+
+4:		and.a	len, len, #3
+		beq	.Ldone
+		load1l	r4
+		cand.a	len, #2
+		mov	r5, r4 get_byte_0
+		beq	.Lexit
+		addc.a	sum, sum, r4 push #16
+		stb.w	r5, [dst]+, #1
+		mov	r5, r4 get_byte_1
+		stb.w	r5, [dst]+, #1
+		mov	r5, r4 get_byte_2
+.Lexit:		cand.a	len, #1
+		beq	.Ldone
+		stb.w	r5, [dst]+, #1
+		and	r5, r5, #255
+		addc.a	sum, sum, r5 put_byte_0
+
+		/*
+		 * If the dst pointer was not 16-bit aligned, we
+		 * need to rotate the checksum here to get around
+		 * the inefficient byte manipulations in the
+		 * architecture independent code.
+		 */
+.Ldone:		addc	r0, sum, #0
+		ldw	sum, [sp+], #0		@ dst
+		cand.a	sum, #1
+		cmovne	r0, r0 <> #8
+		load_regs
+
+.Lsrc_not_aligned:
+		addc	sum, sum, #0		@ include C from dst alignment
+		and	ip, src, #3
+		andn	src, src, #3
+		load1l	r5
+		csub.a	ip, #2
+		beq	.Lsrc2_aligned
+		bua	.Lsrc3_aligned
+		mov	r4, r5 pull #8		@ C = 0
+		andn.a	ip, len, #15
+		beq	2f
+1:		load4l	r5, r6, r7, r8
+		or	r4, r4, r5 push #24
+		mov	r5, r5 pull #8
+		or	r5, r5, r6 push #24
+		mov	r6, r6 pull #8
+		or	r6, r6, r7 push #24
+		mov	r7, r7 pull #8
+		or	r7, r7, r8 push #24
+		stm.w	(r4, r5, r6, r7), [dst]+
+		addc.a	sum, sum, r4
+		addc.a	sum, sum, r5
+		addc.a	sum, sum, r6
+		addc.a	sum, sum, r7
+		mov	r4, r8 pull #8
+		sub	ip, ip, #16
+		cxor.a	ip, #0
+		bne	1b
+2:		and.a	ip, len, #12
+		beq	4f
+		cand.a	ip, #8
+		beq	3f
+		load2l	r5, r6
+		or	r4, r4, r5 push #24
+		mov	r5, r5 pull #8
+		or	r5, r5, r6 push #24
+		stm.w	(r4, r5), [dst]+
+		addc.a	sum, sum, r4
+		addc.a	sum, sum, r5
+		mov	r4, r6 pull #8
+		cand.a	ip, #4
+		beq	4f
+3:		load1l	r5
+		or	r4, r4, r5 push #24
+		stw.w	r4, [dst]+, #4
+		addc.a	sum, sum, r4
+		mov	r4, r5 pull #8
+4:		and.a	len, len, #3
+		beq	.Ldone
+		mov	r5, r4 get_byte_0
+		cand.a	len, #2
+		beq	.Lexit
+		addc.a	sum, sum, r4 push #16
+		stb.w	r5, [dst]+, #1
+		mov	r5, r4 get_byte_1
+		stb.w	r5, [dst]+, #1
+		mov	r5, r4 get_byte_2
+		b	.Lexit
+
+.Lsrc2_aligned:	mov	r4, r5 pull #16
+		add.a	sum, sum, #0
+		andn.a	ip, len, #15
+		beq	2f
+1:		load4l	r5, r6, r7, r8
+		or	r4, r4, r5 push #16
+		mov	r5, r5 pull #16
+		or	r5, r5, r6 push #16
+		mov	r6, r6 pull #16
+		or	r6, r6, r7 push #16
+		mov	r7, r7 pull #16
+		or	r7, r7, r8 push #16
+		stm.w	(r4, r5, r6, r7), [dst]+
+		addc.a	sum, sum, r4
+		addc.a	sum, sum, r5
+		addc.a	sum, sum, r6
+		addc.a	sum, sum, r7
+		mov	r4, r8 pull #16
+		sub	ip, ip, #16
+		cxor.a	ip, #0
+		bne	1b
+2:		and.a	ip, len, #12
+		beq	4f
+		cand.a	ip, #8
+		beq	3f
+		load2l	r5, r6
+		or	r4, r4, r5 push #16
+		mov	r5, r5 pull #16
+		or	r5, r5, r6 push #16
+		stm.w	(r4, r5), [dst]+
+		addc.a	sum, sum, r4
+		addc.a	sum, sum, r5
+		mov	r4, r6 pull #16
+		cand.a	ip, #4
+		beq	4f
+3:		load1l	r5
+		or	r4, r4, r5 push #16
+		stw.w	r4, [dst]+, #4
+		addc.a	sum, sum, r4
+		mov	r4, r5 pull #16
+4:		and.a	len, len, #3
+		beq	.Ldone
+		mov	r5, r4 get_byte_0
+		cand.a	len, #2
+		beq	.Lexit
+		addc.a	sum, sum, r4
+		stb.w	r5, [dst]+, #1
+		mov	r5, r4 get_byte_1
+		stb.w	r5, [dst]+, #1
+		cand.a	len, #1
+		beq	.Ldone
+		load1b	r5
+		b	.Lexit
+
+.Lsrc3_aligned:	mov	r4, r5 pull #24
+		add.a	sum, sum, #0
+		andn.a	ip, len, #15
+		beq	2f
+1:		load4l	r5, r6, r7, r8
+		or	r4, r4, r5 push #8
+		mov	r5, r5 pull #24
+		or	r5, r5, r6 push #8
+		mov	r6, r6 pull #24
+		or	r6, r6, r7 push #8
+		mov	r7, r7 pull #24
+		or	r7, r7, r8 push #8
+		stm.w	(r4, r5, r6, r7), [dst]+
+		addc.a	sum, sum, r4
+		addc.a	sum, sum, r5
+		addc.a	sum, sum, r6
+		addc.a	sum, sum, r7
+		mov	r4, r8 pull #24
+		sub	ip, ip, #16
+		cxor.a	ip, #0
+		bne	1b
+2:		and.a	ip, len, #12
+		beq	4f
+		cand.a	ip, #8
+		beq	3f
+		load2l	r5, r6
+		or	r4, r4, r5 push #8
+		mov	r5, r5 pull #24
+		or	r5, r5, r6 push #8
+		stm.w	(r4, r5), [dst]+
+		addc.a	sum, sum, r4
+		addc.a	sum, sum, r5
+		mov	r4, r6 pull #24
+		cand.a	ip, #4
+		beq	4f
+3:		load1l	r5
+		or	r4, r4, r5 push #8
+		stw.w	r4, [dst]+, #4
+		addc.a	sum, sum, r4
+		mov	r4, r5 pull #24
+4:		and.a	len, len, #3
+		beq	.Ldone
+		mov	r5, r4 get_byte_0
+		cand.a	len, #2
+		beq	.Lexit
+		stb.w	r5, [dst]+, #1
+		addc.a	sum, sum, r4
+		load1l	r4
+		mov	r5, r4 get_byte_0
+		stb.w	r5, [dst]+, #1
+		addc.a	sum, sum, r4 push #24
+		mov	r5, r4 get_byte_1
+		b	.Lexit
+FN_EXIT
diff --git a/arch/unicore32/lib/csumpartialcopyuser.S b/arch/unicore32/lib/csumpartialcopyuser.S
new file mode 100644
index 0000000..23a292f
--- /dev/null
+++ b/arch/unicore32/lib/csumpartialcopyuser.S
@@ -0,0 +1,92 @@
+/*
+ * linux/arch/unicore32/lib/csumpartialcopyuser.S
+ *
+ * Code specific to PKUnity SoC and UniCore ISA
+ *
+ * Copyright (C) 2001-2010 GUAN Xue-tao
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * 27/03/03 Ian Molton Clean up CONFIG_CPU
+ *
+ */
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+#include <asm/errno.h>
+#include <generated/asm-offsets.h>
+
+		.text
+
+		.macro	save_regs
+		mov	ip, sp
+		stm.w	(fp, ip, lr, pc), [sp-]
+		stm.w	(r1 - r2), [sp-]
+		sub	fp, ip, #4
+		.endm
+
+		.macro	load_regs
+		ldm.w	(r1 - r2), [sp]+
+		ldm	(fp, sp, pc), [sp]+
+		.endm
+
+		.macro	load1b,	reg1
+		ldrusr	\reg1, r0, 1
+		.endm
+
+		.macro	load2b, reg1, reg2
+		ldrusr	\reg1, r0, 1
+		ldrusr	\reg2, r0, 1
+		.endm
+
+		.macro	load1l, reg1
+		ldrusr	\reg1, r0, 4
+		.endm
+
+		.macro	load2l, reg1, reg2
+		ldrusr	\reg1, r0, 4
+		ldrusr	\reg2, r0, 4
+		.endm
+
+		.macro	load4l, reg1, reg2, reg3, reg4
+		ldrusr	\reg1, r0, 4
+		ldrusr	\reg2, r0, 4
+		ldrusr	\reg3, r0, 4
+		ldrusr	\reg4, r0, 4
+		.endm
+
+/*
+ * unsigned int
+ * csum_partial_copy_from_user
+ *		(const char *src, char *dst, int len, int sum, int *err_ptr)
+ *  r0 = src, r1 = dst, r2 = len, r3 = sum, [sp] = *err_ptr
+ *  Returns : r0 = checksum, [[sp, #0], #0] = 0 or -EFAULT
+ */
+
+#define FN_ENTRY	ENTRY(csum_partial_copy_from_user)
+#define FN_EXIT		ENDPROC(csum_partial_copy_from_user)
+
+#include "csumpartialcopygeneric.S"
+
+/*
+ * FIXME: minor buglet here
+ * We don't return the checksum for the data present in the buffer.  To do
+ * so properly, we would have to add in whatever registers were loaded before
+ * the fault, which, with the current asm above is not predictable.
+ */
+		.pushsection .fixup,"ax"
+		.align	4
+9001:		mov	r4, #-EFAULT
+		ldw	r5, [sp+], #8*4		@ *err_ptr
+		stw	r4, [r5]
+		ldm	(r1, r2), [sp]+		@ retrieve dst, len
+		add	r2, r2, r1
+		mov	r0, #0			@ zero the buffer
+9002:		cxor.a	r2, r1
+		beq	201f
+		stb.w	r0, [r1]+, #1
+		b	9002b
+201:
+		load_regs
+		.popsection


Thread overview: 3+ messages
2011-01-03 11:54 Guan Xuetao [this message]
2011-01-03 11:54 ` [PATCHv1 7/8] unicore32 additional architecture files: low-level lib: checksum Guan Xuetao
2011-01-25 16:22 ` Arnd Bergmann
