From mboxrd@z Thu Jan 1 00:00:00 1970 From: James Hogan Subject: [PATCH v2 32/44] metag: Optimised library functions Date: Wed, 5 Dec 2012 16:08:50 +0000 Message-ID: <1354723742-6195-33-git-send-email-james.hogan@imgtec.com> References: <1354723742-6195-1-git-send-email-james.hogan@imgtec.com> Mime-Version: 1.0 Content-Type: text/plain Return-path: In-Reply-To: <1354723742-6195-1-git-send-email-james.hogan@imgtec.com> Sender: linux-kernel-owner@vger.kernel.org To: linux-arch@vger.kernel.org, linux-kernel@vger.kernel.org Cc: Arnd Bergmann , James Hogan List-Id: linux-arch.vger.kernel.org Add optimised library functions for metag. Signed-off-by: James Hogan --- arch/metag/include/asm/checksum.h | 92 +++ arch/metag/include/asm/div64.h | 12 + arch/metag/include/asm/string.h | 13 + arch/metag/lib/ashldi3.S | 33 + arch/metag/lib/ashrdi3.S | 33 + arch/metag/lib/checksum.c | 169 +++++ arch/metag/lib/clear_page.S | 17 + arch/metag/lib/cmpdi2.S | 32 + arch/metag/lib/copy_page.S | 20 + arch/metag/lib/delay.c | 55 ++ arch/metag/lib/div64.S | 108 +++ arch/metag/lib/divsi3.S | 100 +++ arch/metag/lib/ip_fast_csum.S | 32 + arch/metag/lib/lshrdi3.S | 33 + arch/metag/lib/memcpy.S | 185 +++++ arch/metag/lib/memmove.S | 345 ++++++++++ arch/metag/lib/memset.S | 86 +++ arch/metag/lib/modsi3.S | 38 + arch/metag/lib/muldi3.S | 44 ++ arch/metag/lib/ucmpdi2.S | 27 + arch/metag/lib/usercopy.c | 1341 +++++++++++++++++++++++++++++++++++++ 21 files changed, 2815 insertions(+), 0 deletions(-) create mode 100644 arch/metag/include/asm/checksum.h create mode 100644 arch/metag/include/asm/div64.h create mode 100644 arch/metag/include/asm/string.h create mode 100644 arch/metag/lib/ashldi3.S create mode 100644 arch/metag/lib/ashrdi3.S create mode 100644 arch/metag/lib/checksum.c create mode 100644 arch/metag/lib/clear_page.S create mode 100644 arch/metag/lib/cmpdi2.S create mode 100644 arch/metag/lib/copy_page.S create mode 100644 arch/metag/lib/delay.c create mode 100644 arch/metag/lib/div64.S create mode 100644 arch/metag/lib/divsi3.S create mode 100644 arch/metag/lib/ip_fast_csum.S create mode 100644 arch/metag/lib/lshrdi3.S create mode 100644 arch/metag/lib/memcpy.S create mode 100644 arch/metag/lib/memmove.S create mode 100644 arch/metag/lib/memset.S create mode 100644 arch/metag/lib/modsi3.S create mode 100644 arch/metag/lib/muldi3.S create mode 100644 arch/metag/lib/ucmpdi2.S create mode 100644 arch/metag/lib/usercopy.c diff --git a/arch/metag/include/asm/checksum.h b/arch/metag/include/asm/checksum.h new file mode 100644 index 0000000..1113f4b --- /dev/null +++ b/arch/metag/include/asm/checksum.h @@ -0,0 +1,92 @@ +#ifndef _METAG_CHECKSUM_H +#define _METAG_CHECKSUM_H + +/* + * computes the checksum of a memory block at buff, length len, + * and adds in "sum" (32-bit) + * + * returns a 32-bit number suitable for feeding into itself + * or csum_tcpudp_magic + * + * this function must be called with even lengths, except + * for the last fragment, which may be odd + * + * it's best to have buff aligned on a 32-bit boundary + */ +extern __wsum csum_partial(const void *buff, int len, __wsum sum); + +/* + * the same as csum_partial, but copies from src while it + * checksums + * + * here even more important to align src and dst on a 32-bit (or even + * better 64-bit) boundary + */ +extern __wsum csum_partial_copy(const void *src, void *dst, int len, + __wsum sum); + +/* + * the same as csum_partial_copy, but copies from user space. 
+ * + * here even more important to align src and dst on a 32-bit (or even + * better 64-bit) boundary + */ +extern __wsum csum_partial_copy_from_user(const void __user *src, void *dst, + int len, __wsum sum, int *csum_err); + +#define csum_partial_copy_nocheck(src, dst, len, sum) \ + csum_partial_copy((src), (dst), (len), (sum)) + +/* + * Fold a partial checksum + */ +static inline __sum16 csum_fold(__wsum csum) +{ + u32 sum = (__force u32)csum; + sum = (sum & 0xffff) + (sum >> 16); + sum = (sum & 0xffff) + (sum >> 16); + return (__force __sum16)~sum; +} + +/* + * This is a version of ip_compute_csum() optimized for IP headers, + * which always checksum on 4 octet boundaries. + */ +extern __sum16 ip_fast_csum(const void *iph, unsigned int ihl); + +/* + * computes the checksum of the TCP/UDP pseudo-header + * returns a 16-bit checksum, already complemented + */ +static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, + unsigned short len, + unsigned short proto, + __wsum sum) +{ + unsigned long len_proto = (proto + len) << 8; + __asm__("ADD %0, %0, %1\n" + "ADDS %0, %0, %2\n" + "ADDCS %0, %0, #1\n" + "ADDS %0, %0, %3\n" + "ADDCS %0, %0, #1\n" + : "=d"(sum) + : "d"(daddr), "d"(saddr), "d"(len_proto), + "0"(sum) + : "cc"); + return sum; +} + +static inline __sum16 +csum_tcpudp_magic(__be32 saddr, __be32 daddr, unsigned short len, + unsigned short proto, __wsum sum) +{ + return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum)); +} + +/* + * this routine is used for miscellaneous IP-like checksums, mainly + * in icmp.c + */ +extern __sum16 ip_compute_csum(const void *buff, int len); + +#endif /* _METAG_CHECKSUM_H */ diff --git a/arch/metag/include/asm/div64.h b/arch/metag/include/asm/div64.h new file mode 100644 index 0000000..0fdd116 --- /dev/null +++ b/arch/metag/include/asm/div64.h @@ -0,0 +1,12 @@ +#ifndef __ASM_DIV64_H__ +#define __ASM_DIV64_H__ + +#include + +extern u64 div_u64(u64 dividend, u64 divisor); +extern s64 div_s64(s64 dividend, s64 divisor); + +#define div_u64 div_u64 +#define div_s64 div_s64 + +#endif diff --git a/arch/metag/include/asm/string.h b/arch/metag/include/asm/string.h new file mode 100644 index 0000000..53e3806 --- /dev/null +++ b/arch/metag/include/asm/string.h @@ -0,0 +1,13 @@ +#ifndef _METAG_STRING_H_ +#define _METAG_STRING_H_ + +#define __HAVE_ARCH_MEMSET +extern void *memset(void *__s, int __c, size_t __count); + +#define __HAVE_ARCH_MEMCPY +void *memcpy(void *__to, __const__ void *__from, size_t __n); + +#define __HAVE_ARCH_MEMMOVE +extern void *memmove(void *__dest, __const__ void *__src, size_t __n); + +#endif /* _METAG_STRING_H_ */ diff --git a/arch/metag/lib/ashldi3.S b/arch/metag/lib/ashldi3.S new file mode 100644 index 0000000..78d6974 --- /dev/null +++ b/arch/metag/lib/ashldi3.S @@ -0,0 +1,33 @@ +! Copyright (C) 2012 by Imagination Technologies Ltd. +! +! 64-bit arithmetic shift left routine. +! + + .text + .global ___ashldi3 + .type ___ashldi3,function + +___ashldi3: + MOV D0Re0,D0Ar2 + MOV D1Re0,D1Ar1 + CMP D1Ar3,#0 ! COUNT == 0 + MOVEQ PC,D1RtP ! Yes, return + + SUBS D0Ar4,D1Ar3,#32 ! N = COUNT - 32 + BGE $L10 + +!! Shift < 32 + NEG D0Ar4,D0Ar4 ! N = - N + LSL D1Re0,D1Re0,D1Ar3 ! HI = HI << COUNT + LSR D0Ar6,D0Re0,D0Ar4 ! TMP= LO >> -(COUNT - 32) + OR D1Re0,D1Re0,D0Ar6 ! HI = HI | TMP + SWAP D0Ar4,D1Ar3 + LSL D0Re0,D0Re0,D0Ar4 ! LO = LO << COUNT + MOV PC,D1RtP + +$L10: +!! Shift >= 32 + LSL D1Re0,D0Re0,D0Ar4 ! HI = LO << N + MOV D0Re0,#0 ! 
LO = 0 + MOV PC,D1RtP + .size ___ashldi3,.-___ashldi3 diff --git a/arch/metag/lib/ashrdi3.S b/arch/metag/lib/ashrdi3.S new file mode 100644 index 0000000..7cb7ed3 --- /dev/null +++ b/arch/metag/lib/ashrdi3.S @@ -0,0 +1,33 @@ +! Copyright (C) 2012 by Imagination Technologies Ltd. +! +! 64-bit arithmetic shift right routine. +! + + .text + .global ___ashrdi3 + .type ___ashrdi3,function + +___ashrdi3: + MOV D0Re0,D0Ar2 + MOV D1Re0,D1Ar1 + CMP D1Ar3,#0 ! COUNT == 0 + MOVEQ PC,D1RtP ! Yes, return + + MOV D0Ar4,D1Ar3 + SUBS D1Ar3,D1Ar3,#32 ! N = COUNT - 32 + BGE $L20 + +!! Shift < 32 + NEG D1Ar3,D1Ar3 ! N = - N + LSR D0Re0,D0Re0,D0Ar4 ! LO = LO >> COUNT + LSL D0Ar6,D1Re0,D1Ar3 ! TMP= HI << -(COUNT - 32) + OR D0Re0,D0Re0,D0Ar6 ! LO = LO | TMP + SWAP D1Ar3,D0Ar4 + ASR D1Re0,D1Re0,D1Ar3 ! HI = HI >> COUNT + MOV PC,D1RtP +$L20: +!! Shift >= 32 + ASR D0Re0,D1Re0,D1Ar3 ! LO = HI >> N + ASR D1Re0,D1Re0,#31 ! HI = HI >> 31 + MOV PC,D1RtP + .size ___ashrdi3,.-___ashrdi3 diff --git a/arch/metag/lib/checksum.c b/arch/metag/lib/checksum.c new file mode 100644 index 0000000..ac2daa2 --- /dev/null +++ b/arch/metag/lib/checksum.c @@ -0,0 +1,169 @@ +/* + * + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * IP/TCP/UDP checksumming routines + * + * Authors: Jorge Cwik, + * Arnt Gulbrandsen, + * Tom May, + * Andreas Schwab, + * Lots of code moved from tcp.c and ip.c; see those files + * for more names. + * + * 03/02/96 Jes Sorensen, Andreas Schwab, Roman Hodek: + * Fixed some nasty bugs, causing some horrible crashes. + * A: At some points, the sum (%0) was used as + * length-counter instead of the length counter + * (%1). Thanks to Roman Hodek for pointing this out. + * B: GCC seems to mess up if one uses too many + * data-registers to hold input values and one tries to + * specify d0 and d1 as scratch registers. Letting gcc + * choose these registers itself solves the problem. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* Revised by Kenneth Albanowski for m68knommu. Basic problem: unaligned access + kills, so most of the assembly has to go. */ + +#include +#include + +#include + +static inline unsigned short from32to16(unsigned int x) +{ + /* add up 16-bit and 16-bit for 16+c bit */ + x = (x & 0xffff) + (x >> 16); + /* add up carry.. 
*/ + x = (x & 0xffff) + (x >> 16); + return x; +} + +static unsigned int do_csum(const unsigned char *buff, int len) +{ + int odd; + unsigned int result = 0; + + if (len <= 0) + goto out; + odd = 1 & (unsigned long) buff; + if (odd) { +#ifdef __LITTLE_ENDIAN + result += (*buff << 8); +#else + result = *buff; +#endif + len--; + buff++; + } + if (len >= 2) { + if (2 & (unsigned long) buff) { + result += *(unsigned short *) buff; + len -= 2; + buff += 2; + } + if (len >= 4) { + const unsigned char *end = buff + ((unsigned)len & ~3); + unsigned int carry = 0; + do { + unsigned int w = *(unsigned int *) buff; + buff += 4; + result += carry; + result += w; + carry = (w > result); + } while (buff < end); + result += carry; + result = (result & 0xffff) + (result >> 16); + } + if (len & 2) { + result += *(unsigned short *) buff; + buff += 2; + } + } + if (len & 1) +#ifdef __LITTLE_ENDIAN + result += *buff; +#else + result += (*buff << 8); +#endif + result = from32to16(result); + if (odd) + result = ((result >> 8) & 0xff) | ((result & 0xff) << 8); +out: + return result; +} + +EXPORT_SYMBOL(ip_fast_csum); + +/* + * computes the checksum of a memory block at buff, length len, + * and adds in "sum" (32-bit) + * + * returns a 32-bit number suitable for feeding into itself + * or csum_tcpudp_magic + * + * this function must be called with even lengths, except + * for the last fragment, which may be odd + * + * it's best to have buff aligned on a 32-bit boundary + */ +__wsum csum_partial(const void *buff, int len, __wsum wsum) +{ + unsigned int sum = (__force unsigned int)wsum; + unsigned int result = do_csum(buff, len); + + /* add in old sum, and carry.. */ + result += sum; + if (sum > result) + result += 1; + return (__force __wsum)result; +} +EXPORT_SYMBOL(csum_partial); + +/* + * this routine is used for miscellaneous IP-like checksums, mainly + * in icmp.c + */ +__sum16 ip_compute_csum(const void *buff, int len) +{ + return (__force __sum16)~do_csum(buff, len); +} +EXPORT_SYMBOL(ip_compute_csum); + +/* + * copy from fs while checksumming, otherwise like csum_partial + */ +__wsum +csum_partial_copy_from_user(const void __user *src, void *dst, int len, + __wsum sum, int *csum_err) +{ + int missing; + + missing = __copy_from_user(dst, src, len); + if (missing) { + memset(dst + len - missing, 0, missing); + *csum_err = -EFAULT; + } else + *csum_err = 0; + + return csum_partial(dst, len, sum); +} +EXPORT_SYMBOL(csum_partial_copy_from_user); + +/* + * copy from ds while checksumming, otherwise like csum_partial + */ +__wsum +csum_partial_copy(const void *src, void *dst, int len, __wsum sum) +{ + memcpy(dst, src, len); + return csum_partial(dst, len, sum); +} +EXPORT_SYMBOL(csum_partial_copy); diff --git a/arch/metag/lib/clear_page.S b/arch/metag/lib/clear_page.S new file mode 100644 index 0000000..43144ee --- /dev/null +++ b/arch/metag/lib/clear_page.S @@ -0,0 +1,17 @@ + ! Copyright 2007,2008,2009 Imagination Technologies Ltd. + +#include + + .text + .global _clear_page + .type _clear_page,function + !! D1Ar1 - page +_clear_page: + MOV TXRPT,#((PAGE_SIZE / 8) - 1) + MOV D0Re0,#0 + MOV D1Re0,#0 +$Lclear_page_loop: + SETL [D1Ar1++],D0Re0,D1Re0 + BR $Lclear_page_loop + MOV PC,D1RtP + .size _clear_page,.-_clear_page diff --git a/arch/metag/lib/cmpdi2.S b/arch/metag/lib/cmpdi2.S new file mode 100644 index 0000000..9c5c663 --- /dev/null +++ b/arch/metag/lib/cmpdi2.S @@ -0,0 +1,32 @@ +! Copyright (C) 2012 by Imagination Technologies Ltd. +! +! 64-bit signed compare routine. +! 
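+!
+! For reference, ___cmpdi2 follows the usual libgcc convention: it returns
+! 0, 1 or 2 for a < b, a == b and a > b respectively. A rough C sketch of
+! the algorithm below (illustrative only; the real work is done by the
+! conditional ADD/SUB instructions):
+!
+!	int __cmpdi2(s64 a, s64 b)
+!	{
+!		if (a < b)
+!			return 0;	/* SUBLT / SUBLO path */
+!		if (a > b)
+!			return 2;	/* ADDGT / ADDHI path */
+!		return 1;		/* the start value, left unchanged */
+!	}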
+ + .text + .global ___cmpdi2 + .type ___cmpdi2,function + +! low high +! s64 a (D0Ar2, D1Ar1) +! s64 b (D0Ar4, D1Ar3) +___cmpdi2: + ! start at 1 (equal) and conditionally increment or decrement + MOV D0Re0,#1 + + ! high words differ? + CMP D1Ar1,D1Ar3 + BNE $Lhigh_differ + + ! unsigned compare low words + CMP D0Ar2,D0Ar4 + SUBLO D0Re0,D0Re0,#1 + ADDHI D0Re0,D0Re0,#1 + MOV PC,D1RtP + +$Lhigh_differ: + ! signed compare high words + SUBLT D0Re0,D0Re0,#1 + ADDGT D0Re0,D0Re0,#1 + MOV PC,D1RtP + .size ___cmpdi2,.-___cmpdi2 diff --git a/arch/metag/lib/copy_page.S b/arch/metag/lib/copy_page.S new file mode 100644 index 0000000..91f7d46 --- /dev/null +++ b/arch/metag/lib/copy_page.S @@ -0,0 +1,20 @@ + ! Copyright 2007,2008 Imagination Technologies Ltd. + +#include + + .text + .global _copy_page + .type _copy_page,function + !! D1Ar1 - to + !! D0Ar2 - from +_copy_page: + MOV D0FrT,#PAGE_SIZE +$Lcopy_page_loop: + GETL D0Re0,D1Re0,[D0Ar2++] + GETL D0Ar6,D1Ar5,[D0Ar2++] + SETL [D1Ar1++],D0Re0,D1Re0 + SETL [D1Ar1++],D0Ar6,D1Ar5 + SUBS D0FrT,D0FrT,#16 + BNZ $Lcopy_page_loop + MOV PC,D1RtP + .size _copy_page,.-_copy_page diff --git a/arch/metag/lib/delay.c b/arch/metag/lib/delay.c new file mode 100644 index 0000000..e1cfcbb --- /dev/null +++ b/arch/metag/lib/delay.c @@ -0,0 +1,55 @@ +/* + * Precise Delay Loops for Meta + * + * Copyright (C) 1993 Linus Torvalds + * Copyright (C) 1997 Martin Mares + * Copyright (C) 2007,2009 Imagination Technologies Ltd. + * + */ + +#include +#include +#include + +#include + +/* + * TXTACTCYC is only 24 bits, so on chips with fast clocks it will wrap + * many times per-second. If it does wrap __delay will return prematurely, + * but this is only likely with large delay values. + * + * We also can't implement read_current_timer() with TXTACTCYC due to + * this wrapping behaviour. + */ +#define rdtimer(t) asm volatile ("MOV %0,TXACTCYC\n" : "=r" (t)); + +void __delay(unsigned long loops) +{ + unsigned long bclock, now; + + rdtimer(bclock); + do { + asm("NOP"); + rdtimer(now); + } while ((now-bclock) < loops); +} +EXPORT_SYMBOL(__delay); + +inline void __const_udelay(unsigned long xloops) +{ + u64 loops = (u64)xloops * (u64)loops_per_jiffy * HZ; + __delay(loops >> 32); +} +EXPORT_SYMBOL(__const_udelay); + +void __udelay(unsigned long usecs) +{ + __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */ +} +EXPORT_SYMBOL(__udelay); + +void __ndelay(unsigned long nsecs) +{ + __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */ +} +EXPORT_SYMBOL(__ndelay); diff --git a/arch/metag/lib/div64.S b/arch/metag/lib/div64.S new file mode 100644 index 0000000..1cfc934 --- /dev/null +++ b/arch/metag/lib/div64.S @@ -0,0 +1,108 @@ +! Copyright (C) 2012 Imagination Technologies Ltd. +! +! Signed/unsigned 64-bit division routines. +! 
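+!
+! For reference, _div_u64 below is a plain shift-and-subtract long division:
+! the divisor and a quotient bit are scaled up until the divisor reaches the
+! dividend (or would overflow), then both are shifted back down while
+! subtracting. A rough C equivalent (illustrative only):
+!
+!	u64 div_u64(u64 n, u64 d)
+!	{
+!		u64 q = 0, bit = 1;
+!
+!		if (!d)
+!			return n;	/* mirrors the divisor == 0 early exit */
+!		while (d < n) {
+!			u64 d2 = d << 1;
+!			if (d2 < d)	/* doubling would overflow 64 bits */
+!				break;
+!			d = d2;
+!			bit <<= 1;
+!		}
+!		while (bit) {
+!			if (n >= d) {
+!				q += bit;
+!				n -= d;
+!			}
+!			d >>= 1;
+!			bit >>= 1;
+!		}
+!		return q;
+!	}
+!
+! _div_s64 negates negative operands, calls _div_u64, and then fixes up the
+! sign of the result from the XOR of the two original high words.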
+ + .text + .global _div_u64 + .type _div_u64,function + +_div_u64: +$L1: + ORS A0.3,D1Ar3,D0Ar4 + BNE $L3 +$L2: + MOV D0Re0,D0Ar2 + MOV D1Re0,D1Ar1 + MOV PC,D1RtP +$L3: + CMP D1Ar3,D1Ar1 + CMPEQ D0Ar4,D0Ar2 + MOV D0Re0,#1 + MOV D1Re0,#0 + BHS $L6 +$L4: + ADDS D0Ar6,D0Ar4,D0Ar4 + ADD D1Ar5,D1Ar3,D1Ar3 + ADDCS D1Ar5,D1Ar5,#1 + CMP D1Ar5,D1Ar3 + CMPEQ D0Ar6,D0Ar4 + BLO $L6 +$L5: + MOV D0Ar4,D0Ar6 + MOV D1Ar3,D1Ar5 + ADDS D0Re0,D0Re0,D0Re0 + ADD D1Re0,D1Re0,D1Re0 + ADDCS D1Re0,D1Re0,#1 + CMP D1Ar3,D1Ar1 + CMPEQ D0Ar4,D0Ar2 + BLO $L4 +$L6: + ORS A0.3,D1Re0,D0Re0 + MOV D0Ar6,#0 + MOV D1Ar5,D0Ar6 + BEQ $L10 +$L7: + CMP D1Ar1,D1Ar3 + CMPEQ D0Ar2,D0Ar4 + BLO $L9 +$L8: + ADDS D0Ar6,D0Ar6,D0Re0 + ADD D1Ar5,D1Ar5,D1Re0 + ADDCS D1Ar5,D1Ar5,#1 + + SUBS D0Ar2,D0Ar2,D0Ar4 + SUB D1Ar1,D1Ar1,D1Ar3 + SUBCS D1Ar1,D1Ar1,#1 +$L9: + LSL A0.3,D1Re0,#31 + LSR D0Re0,D0Re0,#1 + LSR D1Re0,D1Re0,#1 + OR D0Re0,D0Re0,A0.3 + LSL A0.3,D1Ar3,#31 + LSR D0Ar4,D0Ar4,#1 + LSR D1Ar3,D1Ar3,#1 + OR D0Ar4,D0Ar4,A0.3 + ORS A0.3,D1Re0,D0Re0 + BNE $L7 +$L10: + MOV D0Re0,D0Ar6 + MOV D1Re0,D1Ar5 + MOV PC,D1RtP + .size _div_u64,.-_div_u64 + + .text + .global _div_s64 + .type _div_s64,function +_div_s64: + MSETL [A0StP],D0FrT,D0.5 + XOR D0.5,D0Ar2,D0Ar4 + XOR D1.5,D1Ar1,D1Ar3 + TSTT D1Ar1,#HI(0x80000000) + BZ $L25 + + NEGS D0Ar2,D0Ar2 + NEG D1Ar1,D1Ar1 + SUBCS D1Ar1,D1Ar1,#1 +$L25: + TSTT D1Ar3,#HI(0x80000000) + BZ $L27 + + NEGS D0Ar4,D0Ar4 + NEG D1Ar3,D1Ar3 + SUBCS D1Ar3,D1Ar3,#1 +$L27: + CALLR D1RtP,_div_u64 + TSTT D1.5,#HI(0x80000000) + BZ $L29 + + NEGS D0Re0,D0Re0 + NEG D1Re0,D1Re0 + SUBCS D1Re0,D1Re0,#1 +$L29: + + GETL D0FrT,D1RtP,[A0StP+#(-16)] + GETL D0.5,D1.5,[A0StP+#(-8)] + SUB A0StP,A0StP,#16 + MOV PC,D1RtP + .size _div_s64,.-_div_s64 diff --git a/arch/metag/lib/divsi3.S b/arch/metag/lib/divsi3.S new file mode 100644 index 0000000..7c8a8ae --- /dev/null +++ b/arch/metag/lib/divsi3.S @@ -0,0 +1,100 @@ +! Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007 +! Imagination Technologies Ltd +! +! Integer divide routines. +! + + .text + .global ___udivsi3 + .type ___udivsi3,function + .align 2 +___udivsi3: +!! +!! Since core is signed divide case, just set control variable +!! + MOV D1Re0,D0Ar2 ! Au already in A1Ar1, Bu -> D1Re0 + MOV D0Re0,#0 ! Result is 0 + MOV D0Ar4,#0 ! Return positive result + B $LIDMCUStart + .size ___udivsi3,.-___udivsi3 + +!! +!! 32-bit division signed i/p - passed signed 32-bit numbers +!! + .global ___divsi3 + .type ___divsi3,function + .align 2 +___divsi3: +!! +!! A already in D1Ar1, B already in D0Ar2 -> make B abs(B) +!! + MOV D1Re0,D0Ar2 ! A already in A1Ar1, B -> D1Re0 + MOV D0Re0,#0 ! Result is 0 + XOR D0Ar4,D1Ar1,D1Re0 ! D0Ar4 -ive if result is -ive + ABS D1Ar1,D1Ar1 ! abs(A) -> Au + ABS D1Re0,D1Re0 ! abs(B) -> Bu +$LIDMCUStart: + CMP D1Ar1,D1Re0 ! Is ( Au > Bu )? + LSR D1Ar3,D1Ar1,#2 ! Calculate (Au & (~3)) >> 2 + CMPHI D1Re0,D1Ar3 ! OR ( (Au & (~3)) <= (Bu << 2) )? + LSLSHI D1Ar3,D1Re0,#1 ! Buq = Bu << 1 + BLS $LIDMCUSetup ! Yes: Do normal divide +!! +!! Quick divide setup can assume that CurBit only needs to start at 2 +!! +$LIDMCQuick: + CMP D1Ar1,D1Ar3 ! ( A >= Buq )? + ADDCC D0Re0,D0Re0,#2 ! If yes result += 2 + SUBCC D1Ar1,D1Ar1,D1Ar3 ! and A -= Buq + CMP D1Ar1,D1Re0 ! ( A >= Bu )? + ADDCC D0Re0,D0Re0,#1 ! If yes result += 1 + SUBCC D1Ar1,D1Ar1,D1Re0 ! and A -= Bu + ORS D0Ar4,D0Ar4,D0Ar4 ! Return neg result? + NEG D0Ar2,D0Re0 ! Calulate neg result + MOVMI D0Re0,D0Ar2 ! Yes: Take neg result +$LIDMCRet: + MOV PC,D1RtP +!! +!! Setup for general unsigned divide code +!! +!! 
D0Re0 is used to form the result, already set to Zero +!! D1Re0 is the input Bu value, this gets trashed +!! D0Ar6 is curbit which is set to 1 at the start and shifted up +!! D0Ar4 is negative if we should return a negative result +!! D1Ar1 is the input Au value, eventually this holds the remainder +!! +$LIDMCUSetup: + CMP D1Ar1,D1Re0 ! Is ( Au < Bu )? + MOV D0Ar6,#1 ! Set curbit to 1 + BCS $LIDMCRet ! Yes: Return 0 remainder Au +!! +!! Calculate alignment using FFB instruction +!! + FFB D1Ar5,D1Ar1 ! Find first bit of Au + ANDN D1Ar5,D1Ar5,#31 ! Handle exceptional case. + ORN D1Ar5,D1Ar5,#31 ! if N bit set, set to 31 + FFB D1Ar3,D1Re0 ! Find first bit of Bu + ANDN D1Ar3,D1Ar3,#31 ! Handle exceptional case. + ORN D1Ar3,D1Ar3,#31 ! if N bit set, set to 31 + SUBS D1Ar3,D1Ar5,D1Ar3 ! calculate diff, ffbA - ffbB + MOV D0Ar2,D1Ar3 ! copy into bank 0 + LSLGT D1Re0,D1Re0,D1Ar3 ! ( > 0) ? left shift B + LSLGT D0Ar6,D0Ar6,D0Ar2 ! ( > 0) ? left shift curbit +!! +!! Now we start the divide proper, logic is +!! +!! if ( A >= B ) add curbit to result and subtract B from A +!! shift curbit and B down by 1 in either case +!! +$LIDMCLoop: + CMP D1Ar1, D1Re0 ! ( A >= B )? + ADDCC D0Re0, D0Re0, D0Ar6 ! If yes result += curbit + SUBCC D1Ar1, D1Ar1, D1Re0 ! and A -= B + LSRS D0Ar6, D0Ar6, #1 ! Shift down curbit, is it zero? + LSR D1Re0, D1Re0, #1 ! Shift down B + BNZ $LIDMCLoop ! Was single bit in curbit lost? + ORS D0Ar4,D0Ar4,D0Ar4 ! Return neg result? + NEG D0Ar2,D0Re0 ! Calulate neg result + MOVMI D0Re0,D0Ar2 ! Yes: Take neg result + MOV PC,D1RtP + .size ___divsi3,.-___divsi3 diff --git a/arch/metag/lib/ip_fast_csum.S b/arch/metag/lib/ip_fast_csum.S new file mode 100644 index 0000000..533b1e7 --- /dev/null +++ b/arch/metag/lib/ip_fast_csum.S @@ -0,0 +1,32 @@ + + .text +/* + * This is a version of ip_compute_csum() optimized for IP headers, + * which always checksum on 4 octet boundaries. + * + * extern __sum16 ip_fast_csum(const void *iph, unsigned int ihl); + * + */ + .global _ip_fast_csum + .type _ip_fast_csum,function +_ip_fast_csum: + !! TXRPT needs loops - 1 + SUBS TXRPT,D0Ar2,#1 + MOV D0Re0,#0 + BLO $Lfast_csum_exit +$Lfast_csum_loop: + GETD D1Ar3,[D1Ar1++] + ADDS D0Re0,D0Re0,D1Ar3 + ADDCS D0Re0,D0Re0,#1 + BR $Lfast_csum_loop + LSR D0Ar4,D0Re0,#16 + AND D0Re0,D0Re0,#0xffff + AND D0Ar4,D0Ar4,#0xffff + ADD D0Re0,D0Re0,D0Ar4 + LSR D0Ar4,D0Re0,#16 + ADD D0Re0,D0Re0,D0Ar4 + XOR D0Re0,D0Re0,#-1 + AND D0Re0,D0Re0,#0xffff +$Lfast_csum_exit: + MOV PC,D1RtP + .size _ip_fast_csum,.-_ip_fast_csum diff --git a/arch/metag/lib/lshrdi3.S b/arch/metag/lib/lshrdi3.S new file mode 100644 index 0000000..47f7202 --- /dev/null +++ b/arch/metag/lib/lshrdi3.S @@ -0,0 +1,33 @@ +! Copyright (C) 2012 by Imagination Technologies Ltd. +! +! 64-bit logical shift right routine. +! + + .text + .global ___lshrdi3 + .type ___lshrdi3,function + +___lshrdi3: + MOV D0Re0,D0Ar2 + MOV D1Re0,D1Ar1 + CMP D1Ar3,#0 ! COUNT == 0 + MOVEQ PC,D1RtP ! Yes, return + + MOV D0Ar4,D1Ar3 + SUBS D1Ar3,D1Ar3,#32 ! N = COUNT - 32 + BGE $L30 + +!! Shift < 32 + NEG D1Ar3,D1Ar3 ! N = - N + LSR D0Re0,D0Re0,D0Ar4 ! LO = LO >> COUNT + LSL D0Ar6,D1Re0,D1Ar3 ! TMP= HI << -(COUNT - 32) + OR D0Re0,D0Re0,D0Ar6 ! LO = LO | TMP + SWAP D1Ar3,D0Ar4 + LSR D1Re0,D1Re0,D1Ar3 ! HI = HI >> COUNT + MOV PC,D1RtP +$L30: +!! Shift >= 32 + LSR D0Re0,D1Re0,D1Ar3 ! LO = HI >> N + MOV D1Re0,#0 ! 
HI = 0 + MOV PC,D1RtP + .size ___lshrdi3,.-___lshrdi3 diff --git a/arch/metag/lib/memcpy.S b/arch/metag/lib/memcpy.S new file mode 100644 index 0000000..46b7a2b --- /dev/null +++ b/arch/metag/lib/memcpy.S @@ -0,0 +1,185 @@ +! Copyright (C) 2008-2012 Imagination Technologies Ltd. + + .text + .global _memcpy + .type _memcpy,function +! D1Ar1 dst +! D0Ar2 src +! D1Ar3 cnt +! D0Re0 dst +_memcpy: + CMP D1Ar3, #16 + MOV A1.2, D0Ar2 ! source pointer + MOV A0.2, D1Ar1 ! destination pointer + MOV A0.3, D1Ar1 ! for return value +! If there are less than 16 bytes to copy use the byte copy loop + BGE $Llong_copy + +$Lbyte_copy: +! Simply copy a byte at a time + SUBS TXRPT, D1Ar3, #1 + BLT $Lend +$Lloop_byte: + GETB D1Re0, [A1.2++] + SETB [A0.2++], D1Re0 + BR $Lloop_byte + +$Lend: +! Finally set return value and return + MOV D0Re0, A0.3 + MOV PC, D1RtP + +$Llong_copy: + ANDS D1Ar5, D1Ar1, #7 ! test destination alignment + BZ $Laligned_dst + +! The destination address is not 8 byte aligned. We will copy bytes from +! the source to the destination until the remaining data has an 8 byte +! destination address alignment (i.e we should never copy more than 7 +! bytes here). +$Lalign_dst: + GETB D0Re0, [A1.2++] + ADD D1Ar5, D1Ar5, #1 ! dest is aligned when D1Ar5 reaches #8 + SUB D1Ar3, D1Ar3, #1 ! decrement count of remaining bytes + SETB [A0.2++], D0Re0 + CMP D1Ar5, #8 + BNE $Lalign_dst + +! We have at least (16 - 7) = 9 bytes to copy - calculate the number of 8 byte +! blocks, then jump to the unaligned copy loop or fall through to the aligned +! copy loop as appropriate. +$Laligned_dst: + MOV D0Ar4, A1.2 + LSR D1Ar5, D1Ar3, #3 ! D1Ar5 = number of 8 byte blocks + ANDS D0Ar4, D0Ar4, #7 ! test source alignment + BNZ $Lunaligned_copy ! if unaligned, use unaligned copy loop + +! Both source and destination are 8 byte aligned - the easy case. +$Laligned_copy: + LSRS D1Ar5, D1Ar3, #5 ! D1Ar5 = number of 32 byte blocks + BZ $Lbyte_copy + SUB TXRPT, D1Ar5, #1 + +$Laligned_32: + GETL D0Re0, D1Re0, [A1.2++] + GETL D0Ar6, D1Ar5, [A1.2++] + SETL [A0.2++], D0Re0, D1Re0 + SETL [A0.2++], D0Ar6, D1Ar5 + GETL D0Re0, D1Re0, [A1.2++] + GETL D0Ar6, D1Ar5, [A1.2++] + SETL [A0.2++], D0Re0, D1Re0 + SETL [A0.2++], D0Ar6, D1Ar5 + BR $Laligned_32 + +! If there are any remaining bytes use the byte copy loop, otherwise we are done + ANDS D1Ar3, D1Ar3, #0x1f + BNZ $Lbyte_copy + B $Lend + +! The destination is 8 byte aligned but the source is not, and there are 8 +! or more bytes to be copied. +$Lunaligned_copy: +! Adjust the source pointer (A1.2) to the 8 byte boundary before its +! current value + MOV D0Ar4, A1.2 + MOV D0Ar6, A1.2 + ANDMB D0Ar4, D0Ar4, #0xfff8 + MOV A1.2, D0Ar4 +! Save the number of bytes of mis-alignment in D0Ar4 for use later + SUBS D0Ar6, D0Ar6, D0Ar4 + MOV D0Ar4, D0Ar6 +! if there is no mis-alignment after all, use the aligned copy loop + BZ $Laligned_copy + +! prefetch 8 bytes + GETL D0Re0, D1Re0, [A1.2] + + SUB TXRPT, D1Ar5, #1 + +! There are 3 mis-alignment cases to be considered. Less than 4 bytes, exactly +! 4 bytes, and more than 4 bytes. + CMP D0Ar6, #4 + BLT $Lunaligned_1_2_3 ! use 1-3 byte mis-alignment loop + BZ $Lunaligned_4 ! use 4 byte mis-alignment loop + +! The mis-alignment is more than 4 bytes +$Lunaligned_5_6_7: + SUB D0Ar6, D0Ar6, #4 +! Calculate the bit offsets required for the shift operations necesssary +! to align the data. +! D0Ar6 = bit offset, D1Ar5 = (32 - bit offset) + MULW D0Ar6, D0Ar6, #8 + MOV D1Ar5, #32 + SUB D1Ar5, D1Ar5, D0Ar6 +! 
Move data 4 bytes before we enter the main loop + MOV D0Re0, D1Re0 + +$Lloop_5_6_7: + GETL D0Ar2, D1Ar1, [++A1.2] +! form 64-bit data in D0Re0, D1Re0 + LSR D0Re0, D0Re0, D0Ar6 + MOV D1Re0, D0Ar2 + LSL D1Re0, D1Re0, D1Ar5 + ADD D0Re0, D0Re0, D1Re0 + + LSR D0Ar2, D0Ar2, D0Ar6 + LSL D1Re0, D1Ar1, D1Ar5 + ADD D1Re0, D1Re0, D0Ar2 + + SETL [A0.2++], D0Re0, D1Re0 + MOV D0Re0, D1Ar1 + BR $Lloop_5_6_7 + + B $Lunaligned_end + +$Lunaligned_1_2_3: +! Calculate the bit offsets required for the shift operations necesssary +! to align the data. +! D0Ar6 = bit offset, D1Ar5 = (32 - bit offset) + MULW D0Ar6, D0Ar6, #8 + MOV D1Ar5, #32 + SUB D1Ar5, D1Ar5, D0Ar6 + +$Lloop_1_2_3: +! form 64-bit data in D0Re0,D1Re0 + LSR D0Re0, D0Re0, D0Ar6 + LSL D1Ar1, D1Re0, D1Ar5 + ADD D0Re0, D0Re0, D1Ar1 + MOV D0Ar2, D1Re0 + LSR D0FrT, D0Ar2, D0Ar6 + GETL D0Ar2, D1Ar1, [++A1.2] + + MOV D1Re0, D0Ar2 + LSL D1Re0, D1Re0, D1Ar5 + ADD D1Re0, D1Re0, D0FrT + + SETL [A0.2++], D0Re0, D1Re0 + MOV D0Re0, D0Ar2 + MOV D1Re0, D1Ar1 + BR $Lloop_1_2_3 + + B $Lunaligned_end + +! The 4 byte mis-alignment case - this does not require any shifting, just a +! shuffling of registers. +$Lunaligned_4: + MOV D0Re0, D1Re0 +$Lloop_4: + GETL D0Ar2, D1Ar1, [++A1.2] + MOV D1Re0, D0Ar2 + SETL [A0.2++], D0Re0, D1Re0 + MOV D0Re0, D1Ar1 + BR $Lloop_4 + +$Lunaligned_end: +! If there are no remaining bytes to copy, we are done. + ANDS D1Ar3, D1Ar3, #7 + BZ $Lend +! Re-adjust the source pointer (A1.2) back to the actual (unaligned) byte +! address of the remaining bytes, and fall through to the byte copy loop. + MOV D0Ar6, A1.2 + ADD D1Ar5, D0Ar4, D0Ar6 + MOV A1.2, D1Ar5 + B $Lbyte_copy + + .size _memcpy,.-_memcpy diff --git a/arch/metag/lib/memmove.S b/arch/metag/lib/memmove.S new file mode 100644 index 0000000..228ea04 --- /dev/null +++ b/arch/metag/lib/memmove.S @@ -0,0 +1,345 @@ +! Copyright (C) 2008-2012 Imagination Technologies Ltd. + + .text + .global _memmove + .type _memmove,function +! D1Ar1 dst +! D0Ar2 src +! D1Ar3 cnt +! D0Re0 dst +_memmove: + CMP D1Ar3, #0 + MOV D0Re0, D1Ar1 + BZ $LEND2 + MSETL [A0StP], D0.5, D0.6, D0.7 + MOV D1Ar5, D0Ar2 + CMP D1Ar1, D1Ar5 + BLT $Lforwards_copy + SUB D0Ar4, D1Ar1, D1Ar3 + ADD D0Ar4, D0Ar4, #1 + CMP D0Ar2, D0Ar4 + BLT $Lforwards_copy + ! should copy backwards + MOV D1Re0, D0Ar2 + ! adjust pointer to the end of mem + ADD D0Ar2, D1Re0, D1Ar3 + ADD D1Ar1, D1Ar1, D1Ar3 + + MOV A1.2, D0Ar2 + MOV A0.2, D1Ar1 + CMP D1Ar3, #8 + BLT $Lbbyte_loop + + MOV D0Ar4, D0Ar2 + MOV D1Ar5, D1Ar1 + + ! test 8 byte alignment + ANDS D1Ar5, D1Ar5, #7 + BNE $Lbdest_unaligned + + ANDS D0Ar4, D0Ar4, #7 + BNE $Lbsrc_unaligned + + LSR D1Ar5, D1Ar3, #3 + +$Lbaligned_loop: + GETL D0Re0, D1Re0, [--A1.2] + SETL [--A0.2], D0Re0, D1Re0 + SUBS D1Ar5, D1Ar5, #1 + BNE $Lbaligned_loop + + ANDS D1Ar3, D1Ar3, #7 + BZ $Lbbyte_loop_exit +$Lbbyte_loop: + GETB D1Re0, [--A1.2] + SETB [--A0.2], D1Re0 + SUBS D1Ar3, D1Ar3, #1 + BNE $Lbbyte_loop +$Lbbyte_loop_exit: + MOV D0Re0, A0.2 +$LEND: + SUB A0.2, A0StP, #24 + MGETL D0.5, D0.6, D0.7, [A0.2] + SUB A0StP, A0StP, #24 +$LEND2: + MOV PC, D1RtP + +$Lbdest_unaligned: + GETB D0Re0, [--A1.2] + SETB [--A0.2], D0Re0 + SUBS D1Ar5, D1Ar5, #1 + SUB D1Ar3, D1Ar3, #1 + BNE $Lbdest_unaligned + CMP D1Ar3, #8 + BLT $Lbbyte_loop +$Lbsrc_unaligned: + LSR D1Ar5, D1Ar3, #3 + ! adjust A1.2 + MOV D0Ar4, A1.2 + ! save original address + MOV D0Ar6, A1.2 + + ADD D0Ar4, D0Ar4, #7 + ANDMB D0Ar4, D0Ar4, #0xfff8 + ! new address is the 8-byte aligned one above the original + MOV A1.2, D0Ar4 + + ! A0.2 dst 64-bit is aligned + ! 
measure the gap size + SUB D0Ar6, D0Ar4, D0Ar6 + MOVS D0Ar4, D0Ar6 + ! keep this information for the later adjustment + ! both aligned + BZ $Lbaligned_loop + + ! prefetch + GETL D0Re0, D1Re0, [--A1.2] + + CMP D0Ar6, #4 + BLT $Lbunaligned_1_2_3 + ! 32-bit aligned + BZ $Lbaligned_4 + + SUB D0Ar6, D0Ar6, #4 + ! D1.6 stores the gap size in bits + MULW D1.6, D0Ar6, #8 + MOV D0.6, #32 + ! D0.6 stores the complement of the gap size + SUB D0.6, D0.6, D1.6 + +$Lbunaligned_5_6_7: + GETL D0.7, D1.7, [--A1.2] + ! form 64-bit data in D0Re0, D1Re0 + MOV D1Re0, D0Re0 + ! D1Re0 << gap-size + LSL D1Re0, D1Re0, D1.6 + MOV D0Re0, D1.7 + ! D0Re0 >> complement + LSR D0Re0, D0Re0, D0.6 + MOV D1.5, D0Re0 + ! combine the both + ADD D1Re0, D1Re0, D1.5 + + MOV D1.5, D1.7 + LSL D1.5, D1.5, D1.6 + MOV D0Re0, D0.7 + LSR D0Re0, D0Re0, D0.6 + MOV D0.5, D1.5 + ADD D0Re0, D0Re0, D0.5 + + SETL [--A0.2], D0Re0, D1Re0 + MOV D0Re0, D0.7 + MOV D1Re0, D1.7 + SUBS D1Ar5, D1Ar5, #1 + BNE $Lbunaligned_5_6_7 + + ANDS D1Ar3, D1Ar3, #7 + BZ $Lbbyte_loop_exit + ! Adjust A1.2 + ! A1.2 <- A1.2 +8 - gapsize + ADD A1.2, A1.2, #8 + SUB A1.2, A1.2, D0Ar4 + B $Lbbyte_loop + +$Lbunaligned_1_2_3: + MULW D1.6, D0Ar6, #8 + MOV D0.6, #32 + SUB D0.6, D0.6, D1.6 + +$Lbunaligned_1_2_3_loop: + GETL D0.7, D1.7, [--A1.2] + ! form 64-bit data in D0Re0, D1Re0 + LSL D1Re0, D1Re0, D1.6 + ! save D0Re0 for later use + MOV D0.5, D0Re0 + LSR D0Re0, D0Re0, D0.6 + MOV D1.5, D0Re0 + ADD D1Re0, D1Re0, D1.5 + + ! orignal data in D0Re0 + MOV D1.5, D0.5 + LSL D1.5, D1.5, D1.6 + MOV D0Re0, D1.7 + LSR D0Re0, D0Re0, D0.6 + MOV D0.5, D1.5 + ADD D0Re0, D0Re0, D0.5 + + SETL [--A0.2], D0Re0, D1Re0 + MOV D0Re0, D0.7 + MOV D1Re0, D1.7 + SUBS D1Ar5, D1Ar5, #1 + BNE $Lbunaligned_1_2_3_loop + + ANDS D1Ar3, D1Ar3, #7 + BZ $Lbbyte_loop_exit + ! Adjust A1.2 + ADD A1.2, A1.2, #8 + SUB A1.2, A1.2, D0Ar4 + B $Lbbyte_loop + +$Lbaligned_4: + GETL D0.7, D1.7, [--A1.2] + MOV D1Re0, D0Re0 + MOV D0Re0, D1.7 + SETL [--A0.2], D0Re0, D1Re0 + MOV D0Re0, D0.7 + MOV D1Re0, D1.7 + SUBS D1Ar5, D1Ar5, #1 + BNE $Lbaligned_4 + ANDS D1Ar3, D1Ar3, #7 + BZ $Lbbyte_loop_exit + ! Adjust A1.2 + ADD A1.2, A1.2, #8 + SUB A1.2, A1.2, D0Ar4 + B $Lbbyte_loop + +$Lforwards_copy: + MOV A1.2, D0Ar2 + MOV A0.2, D1Ar1 + CMP D1Ar3, #8 + BLT $Lfbyte_loop + + MOV D0Ar4, D0Ar2 + MOV D1Ar5, D1Ar1 + + ANDS D1Ar5, D1Ar5, #7 + BNE $Lfdest_unaligned + + ANDS D0Ar4, D0Ar4, #7 + BNE $Lfsrc_unaligned + + LSR D1Ar5, D1Ar3, #3 + +$Lfaligned_loop: + GETL D0Re0, D1Re0, [A1.2++] + SUBS D1Ar5, D1Ar5, #1 + SETL [A0.2++], D0Re0, D1Re0 + BNE $Lfaligned_loop + + ANDS D1Ar3, D1Ar3, #7 + BZ $Lfbyte_loop_exit +$Lfbyte_loop: + GETB D1Re0, [A1.2++] + SETB [A0.2++], D1Re0 + SUBS D1Ar3, D1Ar3, #1 + BNE $Lfbyte_loop +$Lfbyte_loop_exit: + MOV D0Re0, D1Ar1 + B $LEND + +$Lfdest_unaligned: + GETB D0Re0, [A1.2++] + ADD D1Ar5, D1Ar5, #1 + SUB D1Ar3, D1Ar3, #1 + SETB [A0.2++], D0Re0 + CMP D1Ar5, #8 + BNE $Lfdest_unaligned + CMP D1Ar3, #8 + BLT $Lfbyte_loop +$Lfsrc_unaligned: + ! adjust A1.2 + LSR D1Ar5, D1Ar3, #3 + + MOV D0Ar4, A1.2 + MOV D0Ar6, A1.2 + ANDMB D0Ar4, D0Ar4, #0xfff8 + MOV A1.2, D0Ar4 + + ! A0.2 dst 64-bit is aligned + SUB D0Ar6, D0Ar6, D0Ar4 + ! keep the information for the later adjustment + MOVS D0Ar4, D0Ar6 + + ! both aligned + BZ $Lfaligned_loop + + ! prefetch + GETL D0Re0, D1Re0, [A1.2] + + CMP D0Ar6, #4 + BLT $Lfunaligned_1_2_3 + BZ $Lfaligned_4 + + SUB D0Ar6, D0Ar6, #4 + MULW D0.6, D0Ar6, #8 + MOV D1.6, #32 + SUB D1.6, D1.6, D0.6 + +$Lfunaligned_5_6_7: + GETL D0.7, D1.7, [++A1.2] + ! 
form 64-bit data in D0Re0, D1Re0 + MOV D0Re0, D1Re0 + LSR D0Re0, D0Re0, D0.6 + MOV D1Re0, D0.7 + LSL D1Re0, D1Re0, D1.6 + MOV D0.5, D1Re0 + ADD D0Re0, D0Re0, D0.5 + + MOV D0.5, D0.7 + LSR D0.5, D0.5, D0.6 + MOV D1Re0, D1.7 + LSL D1Re0, D1Re0, D1.6 + MOV D1.5, D0.5 + ADD D1Re0, D1Re0, D1.5 + + SETL [A0.2++], D0Re0, D1Re0 + MOV D0Re0, D0.7 + MOV D1Re0, D1.7 + SUBS D1Ar5, D1Ar5, #1 + BNE $Lfunaligned_5_6_7 + + ANDS D1Ar3, D1Ar3, #7 + BZ $Lfbyte_loop_exit + ! Adjust A1.2 + ADD A1.2, A1.2, D0Ar4 + B $Lfbyte_loop + +$Lfunaligned_1_2_3: + MULW D0.6, D0Ar6, #8 + MOV D1.6, #32 + SUB D1.6, D1.6, D0.6 + +$Lfunaligned_1_2_3_loop: + GETL D0.7, D1.7, [++A1.2] + ! form 64-bit data in D0Re0, D1Re0 + LSR D0Re0, D0Re0, D0.6 + MOV D1.5, D1Re0 + LSL D1Re0, D1Re0, D1.6 + MOV D0.5, D1Re0 + ADD D0Re0, D0Re0, D0.5 + + MOV D0.5, D1.5 + LSR D0.5, D0.5, D0.6 + MOV D1Re0, D0.7 + LSL D1Re0, D1Re0, D1.6 + MOV D1.5, D0.5 + ADD D1Re0, D1Re0, D1.5 + + SETL [A0.2++], D0Re0, D1Re0 + MOV D0Re0, D0.7 + MOV D1Re0, D1.7 + SUBS D1Ar5, D1Ar5, #1 + BNE $Lfunaligned_1_2_3_loop + + ANDS D1Ar3, D1Ar3, #7 + BZ $Lfbyte_loop_exit + ! Adjust A1.2 + ADD A1.2, A1.2, D0Ar4 + B $Lfbyte_loop + +$Lfaligned_4: + GETL D0.7, D1.7, [++A1.2] + MOV D0Re0, D1Re0 + MOV D1Re0, D0.7 + SETL [A0.2++], D0Re0, D1Re0 + MOV D0Re0, D0.7 + MOV D1Re0, D1.7 + SUBS D1Ar5, D1Ar5, #1 + BNE $Lfaligned_4 + ANDS D1Ar3, D1Ar3, #7 + BZ $Lfbyte_loop_exit + ! Adjust A1.2 + ADD A1.2, A1.2, D0Ar4 + B $Lfbyte_loop + + .size _memmove,.-_memmove diff --git a/arch/metag/lib/memset.S b/arch/metag/lib/memset.S new file mode 100644 index 0000000..721085b --- /dev/null +++ b/arch/metag/lib/memset.S @@ -0,0 +1,86 @@ +! Copyright (C) 2008-2012 Imagination Technologies Ltd. + + .text + .global _memset + .type _memset,function +! D1Ar1 dst +! D0Ar2 c +! D1Ar3 cnt +! D0Re0 dst +_memset: + AND D0Ar2,D0Ar2,#0xFF ! Ensure a byte input value + MULW D0Ar2,D0Ar2,#0x0101 ! Duplicate byte value into 0-15 + ANDS D0Ar4,D1Ar1,#7 ! Extract bottom LSBs of dst + LSL D0Re0,D0Ar2,#16 ! Duplicate byte value into 16-31 + ADD A0.2,D0Ar2,D0Re0 ! Duplicate byte value into 4 (A0.2) + MOV D0Re0,D1Ar1 ! Return dst + BZ $LLongStub ! if start address is aligned + ! start address is not aligned on an 8 byte boundary, so we + ! need the number of bytes up to the next 8 byte address + ! boundary, or the length of the string if less than 8, in D1Ar5 + MOV D0Ar2,#8 ! Need 8 - N in D1Ar5 ... + SUB D1Ar5,D0Ar2,D0Ar4 ! ... subtract N + CMP D1Ar3,D1Ar5 + MOVMI D1Ar5,D1Ar3 + B $LByteStub ! dst is mis-aligned, do $LByteStub + +! +! Preamble to LongLoop which generates 4*8 bytes per interation (5 cycles) +! +$LLongStub: + LSRS D0Ar2,D1Ar3,#5 + AND D1Ar3,D1Ar3,#0x1F + MOV A1.2,A0.2 + BEQ $LLongishStub + SUB TXRPT,D0Ar2,#1 + CMP D1Ar3,#0 +$LLongLoop: + SETL [D1Ar1++],A0.2,A1.2 + SETL [D1Ar1++],A0.2,A1.2 + SETL [D1Ar1++],A0.2,A1.2 + SETL [D1Ar1++],A0.2,A1.2 + BR $LLongLoop + BZ $Lexit +! +! Preamble to LongishLoop which generates 1*8 bytes per interation (2 cycles) +! +$LLongishStub: + LSRS D0Ar2,D1Ar3,#3 + AND D1Ar3,D1Ar3,#0x7 + MOV D1Ar5,D1Ar3 + BEQ $LByteStub + SUB TXRPT,D0Ar2,#1 + CMP D1Ar3,#0 +$LLongishLoop: + SETL [D1Ar1++],A0.2,A1.2 + BR $LLongishLoop + BZ $Lexit +! +! This does a byte structured burst of up to 7 bytes +! +! D1Ar1 should point to the location required +! D1Ar3 should be the remaining total byte count +! D1Ar5 should be burst size (<= D1Ar3) +! +$LByteStub: + SUBS D1Ar3,D1Ar3,D1Ar5 ! Reduce count + ADD D1Ar1,D1Ar1,D1Ar5 ! Advance pointer to end of area + MULW D1Ar5,D1Ar5,#4 ! 
Scale to (1*4), (2*4), (3*4) + SUB D1Ar5,D1Ar5,#(8*4) ! Rebase to -(7*4), -(6*4), -(5*4), ... + MOV A1.2,D1Ar5 + SUB PC,CPC1,A1.2 ! Jump into table below + SETB [D1Ar1+#(-7)],A0.2 + SETB [D1Ar1+#(-6)],A0.2 + SETB [D1Ar1+#(-5)],A0.2 + SETB [D1Ar1+#(-4)],A0.2 + SETB [D1Ar1+#(-3)],A0.2 + SETB [D1Ar1+#(-2)],A0.2 + SETB [D1Ar1+#(-1)],A0.2 +! +! Return if all data has been output, otherwise do $LLongStub +! + BNZ $LLongStub +$Lexit: + MOV PC,D1RtP + .size _memset,.-_memset + diff --git a/arch/metag/lib/modsi3.S b/arch/metag/lib/modsi3.S new file mode 100644 index 0000000..210cfa8 --- /dev/null +++ b/arch/metag/lib/modsi3.S @@ -0,0 +1,38 @@ +! Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007 +! Imagination Technologies Ltd +! +! Integer modulus routines. +! +!! +!! 32-bit modulus unsigned i/p - passed unsigned 32-bit numbers +!! + .text + .global ___umodsi3 + .type ___umodsi3,function + .align 2 +___umodsi3: + MOV D0FrT,D1RtP ! Save original return address + CALLR D1RtP,___udivsi3 + MOV D1RtP,D0FrT ! Recover return address + MOV D0Re0,D1Ar1 ! Return remainder + MOV PC,D1RtP + .size ___umodsi3,.-___umodsi3 + +!! +!! 32-bit modulus signed i/p - passed signed 32-bit numbers +!! + .global ___modsi3 + .type ___modsi3,function + .align 2 +___modsi3: + MOV D0FrT,D1RtP ! Save original return address + MOV A0.2,D1Ar1 ! Save A in A0.2 + CALLR D1RtP,___divsi3 + MOV D1RtP,D0FrT ! Recover return address + MOV D1Re0,A0.2 ! Recover A + MOV D0Re0,D1Ar1 ! Return remainder + ORS D1Re0,D1Re0,D1Re0 ! Was A negative? + NEG D1Ar1,D1Ar1 ! Negate remainder + MOVMI D0Re0,D1Ar1 ! Return neg remainder + MOV PC, D1RtP + .size ___modsi3,.-___modsi3 diff --git a/arch/metag/lib/muldi3.S b/arch/metag/lib/muldi3.S new file mode 100644 index 0000000..ee66ca8 --- /dev/null +++ b/arch/metag/lib/muldi3.S @@ -0,0 +1,44 @@ +! Copyright (C) 2012 by Imagination Technologies Ltd. +! +! 64-bit multiply routine. +! + +! +! 64-bit signed/unsigned multiply +! +! A = D1Ar1:D0Ar2 = a 2^48 + b 2^32 + c 2^16 + d 2^0 +! +! B = D1Ar3:D0Ar4 = w 2^48 + x 2^32 + y 2^16 + z 2^0 +! + .text + .global ___muldi3 + .type ___muldi3,function + +___muldi3: + MULD D1Re0,D1Ar1,D0Ar4 ! (a 2^48 + b 2^32)(y 2^16 + z 2^0) + MULD D0Re0,D0Ar2,D1Ar3 ! (w 2^48 + x 2^32)(c 2^16 + d 2^0) + ADD D1Re0,D1Re0,D0Re0 + + MULW D0Re0,D0Ar2,D0Ar4 ! (d 2^0) * (z 2^0) + + RTDW D0Ar2,D0Ar2 + MULW D0Ar6,D0Ar2,D0Ar4 ! (c 2^16)(z 2^0) + LSR D1Ar5,D0Ar6,#16 + LSL D0Ar6,D0Ar6,#16 + ADDS D0Re0,D0Re0,D0Ar6 + ADDCS D1Re0,D1Re0,#1 + RTDW D0Ar4,D0Ar4 + ADD D1Re0,D1Re0,D1Ar5 + + MULW D0Ar6,D0Ar2,D0Ar4 ! (c 2^16)(y 2^16) + ADD D1Re0,D1Re0,D0Ar6 + + RTDW D0Ar2,D0Ar2 + MULW D0Ar6,D0Ar2,D0Ar4 ! (d 2^0)(y 2^16) + LSR D1Ar5,D0Ar6,#16 + LSL D0Ar6,D0Ar6,#16 + ADDS D0Re0,D0Re0,D0Ar6 + ADD D1Re0,D1Re0,D1Ar5 + ADDCS D1Re0,D1Re0,#1 + MOV PC, D1RtP + .size ___muldi3,.-___muldi3 diff --git a/arch/metag/lib/ucmpdi2.S b/arch/metag/lib/ucmpdi2.S new file mode 100644 index 0000000..6f3347f --- /dev/null +++ b/arch/metag/lib/ucmpdi2.S @@ -0,0 +1,27 @@ +! Copyright (C) 2012 by Imagination Technologies Ltd. +! +! 64-bit unsigned compare routine. +! + + .text + .global ___ucmpdi2 + .type ___ucmpdi2,function + +! low high +! u64 a (D0Ar2, D1Ar1) +! u64 b (D0Ar4, D1Ar3) +___ucmpdi2: + ! start at 1 (equal) and conditionally increment or decrement + MOV D0Re0,#1 + + ! high words + CMP D1Ar1,D1Ar3 + ! or if equal, low words + CMPEQ D0Ar2,D0Ar4 + + ! 
unsigned compare + SUBLO D0Re0,D0Re0,#1 + ADDHI D0Re0,D0Re0,#1 + + MOV PC,D1RtP + .size ___ucmpdi2,.-___ucmpdi2 diff --git a/arch/metag/lib/usercopy.c b/arch/metag/lib/usercopy.c new file mode 100644 index 0000000..72a4c6d --- /dev/null +++ b/arch/metag/lib/usercopy.c @@ -0,0 +1,1341 @@ +/* + * User address space access functions. + * The non-inlined parts of asm-metag/uaccess.h are here. + * + * Copyright (C) 2006, Imagination Technologies. + * Copyright (C) 2000, Axis Communications AB. + * + * Written by Hans-Peter Nilsson. + * Pieces used from memcpy, originally by Kenny Ranerup long time ago. + * Modified for Meta by Will Newton. + */ + +#include +#include /* def of L1_CACHE_BYTES */ + +#define USE_RAPF +#define RAPF_MIN_BUF_SIZE (3*L1_CACHE_BYTES) + + +/* The "double write" in this code is because the Meta will not fault + * immediately unless the memory pipe is forced to by e.g. a data stall or + * another memory op. The second write should be discarded by the write + * combiner so should have virtually no cost. + */ + +#define __asm_copy_user_cont(to, from, ret, COPY, FIXUP, TENTRY) \ + __asm__ __volatile__ ( \ + COPY \ + "1:\n" \ + " .section .fixup,\"ax\"\n" \ + " MOV D1Ar1,#0\n" \ + FIXUP \ + " MOVT D1Ar1,#HI(1b)\n" \ + " JUMP D1Ar1,#LO(1b)\n" \ + " .previous\n" \ + " .section __ex_table,\"a\"\n" \ + TENTRY \ + " .previous\n" \ + : "=r" (to), "=r" (from), "=r" (ret) \ + : "0" (to), "1" (from), "2" (ret) \ + : "D1Ar1", "memory") + + +#define __asm_copy_to_user_1(to, from, ret) \ + __asm_copy_user_cont(to, from, ret, \ + " GETB D1Ar1,[%1++]\n" \ + " SETB [%0],D1Ar1\n" \ + "2: SETB [%0++],D1Ar1\n", \ + "3: ADD %2,%2,#1\n", \ + " .long 2b,3b\n") + +#define __asm_copy_to_user_2x_cont(to, from, ret, COPY, FIXUP, TENTRY) \ + __asm_copy_user_cont(to, from, ret, \ + " GETW D1Ar1,[%1++]\n" \ + " SETW [%0],D1Ar1\n" \ + "2: SETW [%0++],D1Ar1\n" COPY, \ + "3: ADD %2,%2,#2\n" FIXUP, \ + " .long 2b,3b\n" TENTRY) + +#define __asm_copy_to_user_2(to, from, ret) \ + __asm_copy_to_user_2x_cont(to, from, ret, "", "", "") + +#define __asm_copy_to_user_3(to, from, ret) \ + __asm_copy_to_user_2x_cont(to, from, ret, \ + " GETB D1Ar1,[%1++]\n" \ + " SETB [%0],D1Ar1\n" \ + "4: SETB [%0++],D1Ar1\n", \ + "5: ADD %2,%2,#1\n", \ + " .long 4b,5b\n") + +#define __asm_copy_to_user_4x_cont(to, from, ret, COPY, FIXUP, TENTRY) \ + __asm_copy_user_cont(to, from, ret, \ + " GETD D1Ar1,[%1++]\n" \ + " SETD [%0],D1Ar1\n" \ + "2: SETD [%0++],D1Ar1\n" COPY, \ + "3: ADD %2,%2,#4\n" FIXUP, \ + " .long 2b,3b\n" TENTRY) + +#define __asm_copy_to_user_4(to, from, ret) \ + __asm_copy_to_user_4x_cont(to, from, ret, "", "", "") + +#define __asm_copy_to_user_5(to, from, ret) \ + __asm_copy_to_user_4x_cont(to, from, ret, \ + " GETB D1Ar1,[%1++]\n" \ + " SETB [%0],D1Ar1\n" \ + "4: SETB [%0++],D1Ar1\n", \ + "5: ADD %2,%2,#1\n", \ + " .long 4b,5b\n") + +#define __asm_copy_to_user_6x_cont(to, from, ret, COPY, FIXUP, TENTRY) \ + __asm_copy_to_user_4x_cont(to, from, ret, \ + " GETW D1Ar1,[%1++]\n" \ + " SETW [%0],D1Ar1\n" \ + "4: SETW [%0++],D1Ar1\n" COPY, \ + "5: ADD %2,%2,#2\n" FIXUP, \ + " .long 4b,5b\n" TENTRY) + +#define __asm_copy_to_user_6(to, from, ret) \ + __asm_copy_to_user_6x_cont(to, from, ret, "", "", "") + +#define __asm_copy_to_user_7(to, from, ret) \ + __asm_copy_to_user_6x_cont(to, from, ret, \ + " GETB D1Ar1,[%1++]\n" \ + " SETB [%0],D1Ar1\n" \ + "6: SETB [%0++],D1Ar1\n", \ + "7: ADD %2,%2,#1\n", \ + " .long 6b,7b\n") + +#define __asm_copy_to_user_8x_cont(to, from, ret, COPY, FIXUP, TENTRY) \ + 
__asm_copy_to_user_4x_cont(to, from, ret, \ + " GETD D1Ar1,[%1++]\n" \ + " SETD [%0],D1Ar1\n" \ + "4: SETD [%0++],D1Ar1\n" COPY, \ + "5: ADD %2,%2,#4\n" FIXUP, \ + " .long 4b,5b\n" TENTRY) + +#define __asm_copy_to_user_8(to, from, ret) \ + __asm_copy_to_user_8x_cont(to, from, ret, "", "", "") + +#define __asm_copy_to_user_9(to, from, ret) \ + __asm_copy_to_user_8x_cont(to, from, ret, \ + " GETB D1Ar1,[%1++]\n" \ + " SETB [%0],D1Ar1\n" \ + "6: SETB [%0++],D1Ar1\n", \ + "7: ADD %2,%2,#1\n", \ + " .long 6b,7b\n") + +#define __asm_copy_to_user_10x_cont(to, from, ret, COPY, FIXUP, TENTRY) \ + __asm_copy_to_user_8x_cont(to, from, ret, \ + " GETW D1Ar1,[%1++]\n" \ + " SETW [%0],D1Ar1\n" \ + "6: SETW [%0++],D1Ar1\n" COPY, \ + "7: ADD %2,%2,#2\n" FIXUP, \ + " .long 6b,7b\n" TENTRY) + +#define __asm_copy_to_user_10(to, from, ret) \ + __asm_copy_to_user_10x_cont(to, from, ret, "", "", "") + +#define __asm_copy_to_user_11(to, from, ret) \ + __asm_copy_to_user_10x_cont(to, from, ret, \ + " GETB D1Ar1,[%1++]\n" \ + " SETB [%0],D1Ar1\n" \ + "8: SETB [%0++],D1Ar1\n", \ + "9: ADD %2,%2,#1\n", \ + " .long 8b,9b\n") + +#define __asm_copy_to_user_12x_cont(to, from, ret, COPY, FIXUP, TENTRY) \ + __asm_copy_to_user_8x_cont(to, from, ret, \ + " GETD D1Ar1,[%1++]\n" \ + " SETD [%0],D1Ar1\n" \ + "6: SETD [%0++],D1Ar1\n" COPY, \ + "7: ADD %2,%2,#4\n" FIXUP, \ + " .long 6b,7b\n" TENTRY) +#define __asm_copy_to_user_12(to, from, ret) \ + __asm_copy_to_user_12x_cont(to, from, ret, "", "", "") + +#define __asm_copy_to_user_13(to, from, ret) \ + __asm_copy_to_user_12x_cont(to, from, ret, \ + " GETB D1Ar1,[%1++]\n" \ + " SETB [%0],D1Ar1\n" \ + "8: SETB [%0++],D1Ar1\n", \ + "9: ADD %2,%2,#1\n", \ + " .long 8b,9b\n") + +#define __asm_copy_to_user_14x_cont(to, from, ret, COPY, FIXUP, TENTRY) \ + __asm_copy_to_user_12x_cont(to, from, ret, \ + " GETW D1Ar1,[%1++]\n" \ + " SETW [%0],D1Ar1\n" \ + "8: SETW [%0++],D1Ar1\n" COPY, \ + "9: ADD %2,%2,#2\n" FIXUP, \ + " .long 8b,9b\n" TENTRY) + +#define __asm_copy_to_user_14(to, from, ret) \ + __asm_copy_to_user_14x_cont(to, from, ret, "", "", "") + +#define __asm_copy_to_user_15(to, from, ret) \ + __asm_copy_to_user_14x_cont(to, from, ret, \ + " GETB D1Ar1,[%1++]\n" \ + " SETB [%0],D1Ar1\n" \ + "10: SETB [%0++],D1Ar1\n", \ + "11: ADD %2,%2,#1\n", \ + " .long 10b,11b\n") + +#define __asm_copy_to_user_16x_cont(to, from, ret, COPY, FIXUP, TENTRY) \ + __asm_copy_to_user_12x_cont(to, from, ret, \ + " GETD D1Ar1,[%1++]\n" \ + " SETD [%0],D1Ar1\n" \ + "8: SETD [%0++],D1Ar1\n" COPY, \ + "9: ADD %2,%2,#4\n" FIXUP, \ + " .long 8b,9b\n" TENTRY) + +#define __asm_copy_to_user_16(to, from, ret) \ + __asm_copy_to_user_16x_cont(to, from, ret, "", "", "") + +#define __asm_copy_to_user_8x64(to, from, ret) \ + __asm__ __volatile__ ( \ + " GETL D0Ar2,D1Ar1,[%1++]\n" \ + " SETL [%0],D0Ar2,D1Ar1\n" \ + "2: SETL [%0++],D0Ar2,D1Ar1\n" \ + "1:\n" \ + " .section .fixup,\"ax\"\n" \ + "3: ADD %2,%2,#8\n" \ + " MOVT D0Ar2,#HI(1b)\n" \ + " JUMP D0Ar2,#LO(1b)\n" \ + " .previous\n" \ + " .section __ex_table,\"a\"\n" \ + " .long 2b,3b\n" \ + " .previous\n" \ + : "=r" (to), "=r" (from), "=r" (ret) \ + : "0" (to), "1" (from), "2" (ret) \ + : "D1Ar1", "D0Ar2", "memory") + +/* + * optimized copying loop using RAPF when 64 bit aligned + * + * n will be automatically decremented inside the loop + * ret will be left intact. if error occurs we will rewind + * so that the original non optimized code will fill up + * this value correctly. 
+ * + * on fault: + * > n will hold total number of uncopied bytes + * + * > {'to','from'} will be rewind back so that + * the non-optimized code will do the proper fix up + * + * DCACHE drops the cacheline which helps in reducing cache + * pollution. + * + * We introduce an extra SETL at the end of the loop to + * ensure we don't fall off the loop before we catch all + * erros. + * + * NOTICE: + * LSM_STEP in TXSTATUS must be cleared in fix up code. + * since we're using M{S,G}ETL, a fault might happen at + * any address in the middle of M{S,G}ETL causing + * the value of LSM_STEP to be incorrect which can + * cause subsequent use of M{S,G}ET{L,D} to go wrong. + * ie: if LSM_STEP was 1 when a fault occurs, the + * next call to M{S,G}ET{L,D} will skip the first + * copy/getting as it think that the first 1 has already + * been done. + * + */ +#define __asm_copy_user_64bit_rapf_loop( \ + to, from, ret, n, id, FIXUP) \ + __asm__ __volatile__ ( \ + ".balign 8\n" \ + "MOV RAPF, %1\n" \ + "MSETL [A0StP++], D0Ar6, D0FrT, D0.5, D0.6, D0.7\n" \ + "MOV D0Ar6, #0\n" \ + "LSR D1Ar5, %3, #6\n" \ + "SUB TXRPT, D1Ar5, #2\n" \ + "MOV RAPF, %1\n" \ + "$Lloop"id":\n" \ + "ADD RAPF, %1, #64\n" \ + "21:\n" \ + "MGETL D0FrT, D0.5, D0.6, D0.7, [%1++]\n" \ + "22:\n" \ + "MSETL [%0++], D0FrT, D0.5, D0.6, D0.7\n" \ + "SUB %3, %3, #32\n" \ + "23:\n" \ + "MGETL D0FrT, D0.5, D0.6, D0.7, [%1++]\n" \ + "24:\n" \ + "MSETL [%0++], D0FrT, D0.5, D0.6, D0.7\n" \ + "SUB %3, %3, #32\n" \ + "DCACHE [%1+#-64], D0Ar6\n" \ + "BR $Lloop"id"\n" \ + \ + "MOV RAPF, %1\n" \ + "25:\n" \ + "MGETL D0FrT, D0.5, D0.6, D0.7, [%1++]\n" \ + "26:\n" \ + "MSETL [%0++], D0FrT, D0.5, D0.6, D0.7\n" \ + "SUB %3, %3, #32\n" \ + "27:\n" \ + "MGETL D0FrT, D0.5, D0.6, D0.7, [%1++]\n" \ + "28:\n" \ + "MSETL [%0++], D0FrT, D0.5, D0.6, D0.7\n" \ + "SUB %0, %0, #8\n" \ + "29:\n" \ + "SETL [%0++], D0.7, D1.7\n" \ + "SUB %3, %3, #32\n" \ + "1:" \ + "DCACHE [%1+#-64], D0Ar6\n" \ + "GETL D0Ar6, D1Ar5, [A0StP+#-40]\n" \ + "GETL D0FrT, D1RtP, [A0StP+#-32]\n" \ + "GETL D0.5, D1.5, [A0StP+#-24]\n" \ + "GETL D0.6, D1.6, [A0StP+#-16]\n" \ + "GETL D0.7, D1.7, [A0StP+#-8]\n" \ + "SUB A0StP, A0StP, #40\n" \ + " .section .fixup,\"ax\"\n" \ + "4:\n" \ + " ADD %0, %0, #8\n" \ + "3:\n" \ + " MOV D0Ar2, TXSTATUS\n" \ + " MOV D1Ar1, TXSTATUS\n" \ + " AND D1Ar1, D1Ar1, #0xFFFFF8FF\n" \ + " MOV TXSTATUS, D1Ar1\n" \ + FIXUP \ + " MOVT D0Ar2,#HI(1b)\n" \ + " JUMP D0Ar2,#LO(1b)\n" \ + " .previous\n" \ + " .section __ex_table,\"a\"\n" \ + " .long 21b,3b\n" \ + " .long 22b,3b\n" \ + " .long 23b,3b\n" \ + " .long 24b,3b\n" \ + " .long 25b,3b\n" \ + " .long 26b,3b\n" \ + " .long 27b,3b\n" \ + " .long 28b,3b\n" \ + " .long 29b,4b\n" \ + " .previous\n" \ + : "=r" (to), "=r" (from), "=r" (ret), "=d" (n) \ + : "0" (to), "1" (from), "2" (ret), "3" (n) \ + : "D1Ar1", "D0Ar2", "memory") + +/* rewind 'to' and 'from' pointers when a fault occurs + * + * Rationale: + * A fault always occurs on writing to user buffer. A fault + * is at a single address, so we need to rewind by only 4 + * bytes. + * Since we do a complete read from kernel buffer before + * writing, we need to rewind it also. The amount to be + * rewind equals the number of faulty writes in MSETD + * which is: [4 - (LSM_STEP-1)]*8 + * LSM_STEP is bits 10:8 in TXSTATUS which is already read + * and stored in D0Ar2 + * + * NOTE: If a fault occurs at the last operation in M{G,S}ETL + * LSM_STEP will be 0. ie: we do 4 writes in our case, if + * a fault happens at the 4th write, LSM_STEP will be 0 + * instead of 4. 
The code copes with that. + * + * n is updated by the number of successful writes, which is: + * n = n - (LSM_STEP-1)*8 + */ +#define __asm_copy_to_user_64bit_rapf_loop(to, from, ret, n, id)\ + __asm_copy_user_64bit_rapf_loop(to, from, ret, n, id, \ + "LSR D0Ar2, D0Ar2, #8\n" \ + "AND D0Ar2, D0Ar2, #0x7\n" \ + "ADDZ D0Ar2, D0Ar2, #4\n" \ + "SUB D0Ar2, D0Ar2, #1\n" \ + "MOV D1Ar1, #4\n" \ + "SUB D0Ar2, D1Ar1, D0Ar2\n" \ + "LSL D0Ar2, D0Ar2, #3\n" \ + "LSL D1Ar1, D1Ar1, #3\n" \ + "SUB D1Ar1, D1Ar1, D0Ar2\n" \ + "SUB %0, %0, #8\n" \ + "SUB %1, %1,D0Ar2\n" \ + "SUB %3, %3, D1Ar1\n") + +/* + * optimized copying loop using RAPF when 32 bit aligned + * + * n will be automatically decremented inside the loop + * ret will be left intact. if error occurs we will rewind + * so that the original non optimized code will fill up + * this value correctly. + * + * on fault: + * > n will hold total number of uncopied bytes + * + * > {'to','from'} will be rewind back so that + * the non-optimized code will do the proper fix up + * + * DCACHE drops the cacheline which helps in reducing cache + * pollution. + * + * We introduce an extra SETD at the end of the loop to + * ensure we don't fall off the loop before we catch all + * erros. + * + * NOTICE: + * LSM_STEP in TXSTATUS must be cleared in fix up code. + * since we're using M{S,G}ETL, a fault might happen at + * any address in the middle of M{S,G}ETL causing + * the value of LSM_STEP to be incorrect which can + * cause subsequent use of M{S,G}ET{L,D} to go wrong. + * ie: if LSM_STEP was 1 when a fault occurs, the + * next call to M{S,G}ET{L,D} will skip the first + * copy/getting as it think that the first 1 has already + * been done. + * + */ +#define __asm_copy_user_32bit_rapf_loop( \ + to, from, ret, n, id, FIXUP) \ + __asm__ __volatile__ ( \ + ".balign 8\n" \ + "MOV RAPF, %1\n" \ + "MSETL [A0StP++], D0Ar6, D0FrT, D0.5, D0.6, D0.7\n" \ + "MOV D0Ar6, #0\n" \ + "LSR D1Ar5, %3, #6\n" \ + "SUB TXRPT, D1Ar5, #2\n" \ + "MOV RAPF, %1\n" \ + "$Lloop"id":\n" \ + "ADD RAPF, %1, #64\n" \ + "21:\n" \ + "MGETD D0FrT, D0.5, D0.6, D0.7, [%1++]\n" \ + "22:\n" \ + "MSETD [%0++], D0FrT, D0.5, D0.6, D0.7\n" \ + "SUB %3, %3, #16\n" \ + "23:\n" \ + "MGETD D0FrT, D0.5, D0.6, D0.7, [%1++]\n" \ + "24:\n" \ + "MSETD [%0++], D0FrT, D0.5, D0.6, D0.7\n" \ + "SUB %3, %3, #16\n" \ + "25:\n" \ + "MGETD D0FrT, D0.5, D0.6, D0.7, [%1++]\n" \ + "26:\n" \ + "MSETD [%0++], D0FrT, D0.5, D0.6, D0.7\n" \ + "SUB %3, %3, #16\n" \ + "27:\n" \ + "MGETD D0FrT, D0.5, D0.6, D0.7, [%1++]\n" \ + "28:\n" \ + "MSETD [%0++], D0FrT, D0.5, D0.6, D0.7\n" \ + "SUB %3, %3, #16\n" \ + "DCACHE [%1+#-64], D0Ar6\n" \ + "BR $Lloop"id"\n" \ + \ + "MOV RAPF, %1\n" \ + "29:\n" \ + "MGETD D0FrT, D0.5, D0.6, D0.7, [%1++]\n" \ + "30:\n" \ + "MSETD [%0++], D0FrT, D0.5, D0.6, D0.7\n" \ + "SUB %3, %3, #16\n" \ + "31:\n" \ + "MGETD D0FrT, D0.5, D0.6, D0.7, [%1++]\n" \ + "32:\n" \ + "MSETD [%0++], D0FrT, D0.5, D0.6, D0.7\n" \ + "SUB %3, %3, #16\n" \ + "33:\n" \ + "MGETD D0FrT, D0.5, D0.6, D0.7, [%1++]\n" \ + "34:\n" \ + "MSETD [%0++], D0FrT, D0.5, D0.6, D0.7\n" \ + "SUB %3, %3, #16\n" \ + "35:\n" \ + "MGETD D0FrT, D0.5, D0.6, D0.7, [%1++]\n" \ + "36:\n" \ + "MSETD [%0++], D0FrT, D0.5, D0.6, D0.7\n" \ + "SUB %0, %0, #4\n" \ + "37:\n" \ + "SETD [%0++], D0.7\n" \ + "SUB %3, %3, #16\n" \ + "1:" \ + "DCACHE [%1+#-64], D0Ar6\n" \ + "GETL D0Ar6, D1Ar5, [A0StP+#-40]\n" \ + "GETL D0FrT, D1RtP, [A0StP+#-32]\n" \ + "GETL D0.5, D1.5, [A0StP+#-24]\n" \ + "GETL D0.6, D1.6, [A0StP+#-16]\n" \ + "GETL D0.7, D1.7, [A0StP+#-8]\n" \ + "SUB 
A0StP, A0StP, #40\n" \ + " .section .fixup,\"ax\"\n" \ + "4:\n" \ + " ADD %0, %0, #4\n" \ + "3:\n" \ + " MOV D0Ar2, TXSTATUS\n" \ + " MOV D1Ar1, TXSTATUS\n" \ + " AND D1Ar1, D1Ar1, #0xFFFFF8FF\n" \ + " MOV TXSTATUS, D1Ar1\n" \ + FIXUP \ + " MOVT D0Ar2,#HI(1b)\n" \ + " JUMP D0Ar2,#LO(1b)\n" \ + " .previous\n" \ + " .section __ex_table,\"a\"\n" \ + " .long 21b,3b\n" \ + " .long 22b,3b\n" \ + " .long 23b,3b\n" \ + " .long 24b,3b\n" \ + " .long 25b,3b\n" \ + " .long 26b,3b\n" \ + " .long 27b,3b\n" \ + " .long 28b,3b\n" \ + " .long 29b,3b\n" \ + " .long 30b,3b\n" \ + " .long 31b,3b\n" \ + " .long 32b,3b\n" \ + " .long 33b,3b\n" \ + " .long 34b,3b\n" \ + " .long 35b,3b\n" \ + " .long 36b,3b\n" \ + " .long 37b,4b\n" \ + " .previous\n" \ + : "=r" (to), "=r" (from), "=r" (ret), "=d" (n) \ + : "0" (to), "1" (from), "2" (ret), "3" (n) \ + : "D1Ar1", "D0Ar2", "memory") + +/* rewind 'to' and 'from' pointers when a fault occurs + * + * Rationale: + * A fault always occurs on writing to user buffer. A fault + * is at a single address, so we need to rewind by only 4 + * bytes. + * Since we do a complete read from kernel buffer before + * writing, we need to rewind it also. The amount to be + * rewind equals the number of faulty writes in MSETD + * which is: [4 - (LSM_STEP-1)]*4 + * LSM_STEP is bits 10:8 in TXSTATUS which is already read + * and stored in D0Ar2 + * + * NOTE: If a fault occurs at the last operation in M{G,S}ETL + * LSM_STEP will be 0. ie: we do 4 writes in our case, if + * a fault happens at the 4th write, LSM_STEP will be 0 + * instead of 4. The code copes with that. + * + * n is updated by the number of successful writes, which is: + * n = n - (LSM_STEP-1)*4 + */ +#define __asm_copy_to_user_32bit_rapf_loop(to, from, ret, n, id)\ + __asm_copy_user_32bit_rapf_loop(to, from, ret, n, id, \ + "LSR D0Ar2, D0Ar2, #8\n" \ + "AND D0Ar2, D0Ar2, #0x7\n" \ + "ADDZ D0Ar2, D0Ar2, #4\n" \ + "SUB D0Ar2, D0Ar2, #1\n" \ + "MOV D1Ar1, #4\n" \ + "SUB D0Ar2, D1Ar1, D0Ar2\n" \ + "LSL D0Ar2, D0Ar2, #2\n" \ + "LSL D1Ar1, D1Ar1, #2\n" \ + "SUB D1Ar1, D1Ar1, D0Ar2\n" \ + "SUB %0, %0, #4\n" \ + "SUB %1, %1, D0Ar2\n" \ + "SUB %3, %3, D1Ar1\n") + +unsigned long __copy_user(void __user *pdst, const void *psrc, + unsigned long n) +{ + register char __user *dst __asm__ ("A0.2") = pdst; + register const char *src __asm__ ("A1.2") = psrc; + unsigned long retn = 0; + + if (n == 0) + return 0; + + if ((unsigned long) src & 1) { + __asm_copy_to_user_1(dst, src, retn); + n--; + } + if ((unsigned long) dst & 1) { + /* Worst case - byte copy */ + while (n > 0) { + __asm_copy_to_user_1(dst, src, retn); + n--; + } + } + if (((unsigned long) src & 2) && n >= 2) { + __asm_copy_to_user_2(dst, src, retn); + n -= 2; + } + if ((unsigned long) dst & 2) { + /* Second worst case - word copy */ + while (n >= 2) { + __asm_copy_to_user_2(dst, src, retn); + n -= 2; + } + } + +#ifdef USE_RAPF + /* 64 bit copy loop */ + if (!(((unsigned long) src | (__force unsigned long) dst) & 7)) { + if (n >= RAPF_MIN_BUF_SIZE) { + /* copy user using 64 bit rapf copy */ + __asm_copy_to_user_64bit_rapf_loop(dst, src, retn, + n, "64cu"); + } + while (n >= 8) { + __asm_copy_to_user_8x64(dst, src, retn); + n -= 8; + } + } + if (n >= RAPF_MIN_BUF_SIZE) { + /* copy user using 32 bit rapf copy */ + __asm_copy_to_user_32bit_rapf_loop(dst, src, retn, n, "32cu"); + } +#else + /* 64 bit copy loop */ + if (!(((unsigned long) src | (__force unsigned long) dst) & 7)) { + while (n >= 8) { + __asm_copy_to_user_8x64(dst, src, retn); + n -= 8; + } + } +#endif + + while (n >= 
16) { + __asm_copy_to_user_16(dst, src, retn); + n -= 16; + } + + while (n >= 4) { + __asm_copy_to_user_4(dst, src, retn); + n -= 4; + } + + switch (n) { + case 0: + break; + case 1: + __asm_copy_to_user_1(dst, src, retn); + break; + case 2: + __asm_copy_to_user_2(dst, src, retn); + break; + case 3: + __asm_copy_to_user_3(dst, src, retn); + break; + } + + return retn; +} + +#define __asm_copy_from_user_1(to, from, ret) \ + __asm_copy_user_cont(to, from, ret, \ + " GETB D1Ar1,[%1++]\n" \ + "2: SETB [%0++],D1Ar1\n", \ + "3: ADD %2,%2,#1\n" \ + " SETB [%0++],D1Ar1\n", \ + " .long 2b,3b\n") + +#define __asm_copy_from_user_2x_cont(to, from, ret, COPY, FIXUP, TENTRY) \ + __asm_copy_user_cont(to, from, ret, \ + " GETW D1Ar1,[%1++]\n" \ + "2: SETW [%0++],D1Ar1\n" COPY, \ + "3: ADD %2,%2,#2\n" \ + " SETW [%0++],D1Ar1\n" FIXUP, \ + " .long 2b,3b\n" TENTRY) + +#define __asm_copy_from_user_2(to, from, ret) \ + __asm_copy_from_user_2x_cont(to, from, ret, "", "", "") + +#define __asm_copy_from_user_3(to, from, ret) \ + __asm_copy_from_user_2x_cont(to, from, ret, \ + " GETB D1Ar1,[%1++]\n" \ + "4: SETB [%0++],D1Ar1\n", \ + "5: ADD %2,%2,#1\n" \ + " SETB [%0++],D1Ar1\n", \ + " .long 4b,5b\n") + +#define __asm_copy_from_user_4x_cont(to, from, ret, COPY, FIXUP, TENTRY) \ + __asm_copy_user_cont(to, from, ret, \ + " GETD D1Ar1,[%1++]\n" \ + "2: SETD [%0++],D1Ar1\n" COPY, \ + "3: ADD %2,%2,#4\n" \ + " SETD [%0++],D1Ar1\n" FIXUP, \ + " .long 2b,3b\n" TENTRY) + +#define __asm_copy_from_user_4(to, from, ret) \ + __asm_copy_from_user_4x_cont(to, from, ret, "", "", "") + +#define __asm_copy_from_user_5(to, from, ret) \ + __asm_copy_from_user_4x_cont(to, from, ret, \ + " GETB D1Ar1,[%1++]\n" \ + "4: SETB [%0++],D1Ar1\n", \ + "5: ADD %2,%2,#1\n" \ + " SETB [%0++],D1Ar1\n", \ + " .long 4b,5b\n") + +#define __asm_copy_from_user_6x_cont(to, from, ret, COPY, FIXUP, TENTRY) \ + __asm_copy_from_user_4x_cont(to, from, ret, \ + " GETW D1Ar1,[%1++]\n" \ + "4: SETW [%0++],D1Ar1\n" COPY, \ + "5: ADD %2,%2,#2\n" \ + " SETW [%0++],D1Ar1\n" FIXUP, \ + " .long 4b,5b\n" TENTRY) + +#define __asm_copy_from_user_6(to, from, ret) \ + __asm_copy_from_user_6x_cont(to, from, ret, "", "", "") + +#define __asm_copy_from_user_7(to, from, ret) \ + __asm_copy_from_user_6x_cont(to, from, ret, \ + " GETB D1Ar1,[%1++]\n" \ + "6: SETB [%0++],D1Ar1\n", \ + "7: ADD %2,%2,#1\n" \ + " SETB [%0++],D1Ar1\n", \ + " .long 6b,7b\n") + +#define __asm_copy_from_user_8x_cont(to, from, ret, COPY, FIXUP, TENTRY) \ + __asm_copy_from_user_4x_cont(to, from, ret, \ + " GETD D1Ar1,[%1++]\n" \ + "4: SETD [%0++],D1Ar1\n" COPY, \ + "5: ADD %2,%2,#4\n" \ + " SETD [%0++],D1Ar1\n" FIXUP, \ + " .long 4b,5b\n" TENTRY) + +#define __asm_copy_from_user_8(to, from, ret) \ + __asm_copy_from_user_8x_cont(to, from, ret, "", "", "") + +#define __asm_copy_from_user_9(to, from, ret) \ + __asm_copy_from_user_8x_cont(to, from, ret, \ + " GETB D1Ar1,[%1++]\n" \ + "6: SETB [%0++],D1Ar1\n", \ + "7: ADD %2,%2,#1\n" \ + " SETB [%0++],D1Ar1\n", \ + " .long 6b,7b\n") + +#define __asm_copy_from_user_10x_cont(to, from, ret, COPY, FIXUP, TENTRY) \ + __asm_copy_from_user_8x_cont(to, from, ret, \ + " GETW D1Ar1,[%1++]\n" \ + "6: SETW [%0++],D1Ar1\n" COPY, \ + "7: ADD %2,%2,#2\n" \ + " SETW [%0++],D1Ar1\n" FIXUP, \ + " .long 6b,7b\n" TENTRY) + +#define __asm_copy_from_user_10(to, from, ret) \ + __asm_copy_from_user_10x_cont(to, from, ret, "", "", "") + +#define __asm_copy_from_user_11(to, from, ret) \ + __asm_copy_from_user_10x_cont(to, from, ret, \ + " GETB D1Ar1,[%1++]\n" \ + "8: SETB 
[%0++],D1Ar1\n", \ + "9: ADD %2,%2,#1\n" \ + " SETB [%0++],D1Ar1\n", \ + " .long 8b,9b\n") + +#define __asm_copy_from_user_12x_cont(to, from, ret, COPY, FIXUP, TENTRY) \ + __asm_copy_from_user_8x_cont(to, from, ret, \ + " GETD D1Ar1,[%1++]\n" \ + "6: SETD [%0++],D1Ar1\n" COPY, \ + "7: ADD %2,%2,#4\n" \ + " SETD [%0++],D1Ar1\n" FIXUP, \ + " .long 6b,7b\n" TENTRY) + +#define __asm_copy_from_user_12(to, from, ret) \ + __asm_copy_from_user_12x_cont(to, from, ret, "", "", "") + +#define __asm_copy_from_user_13(to, from, ret) \ + __asm_copy_from_user_12x_cont(to, from, ret, \ + " GETB D1Ar1,[%1++]\n" \ + "8: SETB [%0++],D1Ar1\n", \ + "9: ADD %2,%2,#1\n" \ + " SETB [%0++],D1Ar1\n", \ + " .long 8b,9b\n") + +#define __asm_copy_from_user_14x_cont(to, from, ret, COPY, FIXUP, TENTRY) \ + __asm_copy_from_user_12x_cont(to, from, ret, \ + " GETW D1Ar1,[%1++]\n" \ + "8: SETW [%0++],D1Ar1\n" COPY, \ + "9: ADD %2,%2,#2\n" \ + " SETW [%0++],D1Ar1\n" FIXUP, \ + " .long 8b,9b\n" TENTRY) + +#define __asm_copy_from_user_14(to, from, ret) \ + __asm_copy_from_user_14x_cont(to, from, ret, "", "", "") + +#define __asm_copy_from_user_15(to, from, ret) \ + __asm_copy_from_user_14x_cont(to, from, ret, \ + " GETB D1Ar1,[%1++]\n" \ + "10: SETB [%0++],D1Ar1\n", \ + "11: ADD %2,%2,#1\n" \ + " SETB [%0++],D1Ar1\n", \ + " .long 10b,11b\n") + +#define __asm_copy_from_user_16x_cont(to, from, ret, COPY, FIXUP, TENTRY) \ + __asm_copy_from_user_12x_cont(to, from, ret, \ + " GETD D1Ar1,[%1++]\n" \ + "8: SETD [%0++],D1Ar1\n" COPY, \ + "9: ADD %2,%2,#4\n" \ + " SETD [%0++],D1Ar1\n" FIXUP, \ + " .long 8b,9b\n" TENTRY) + +#define __asm_copy_from_user_16(to, from, ret) \ + __asm_copy_from_user_16x_cont(to, from, ret, "", "", "") + +#define __asm_copy_from_user_8x64(to, from, ret) \ + __asm__ __volatile__ ( \ + " GETL D0Ar2,D1Ar1,[%1++]\n" \ + "2: SETL [%0++],D0Ar2,D1Ar1\n" \ + "1:\n" \ + " .section .fixup,\"ax\"\n" \ + " MOV D1Ar1,#0\n" \ + " MOV D0Ar2,#0\n" \ + "3: ADD %2,%2,#8\n" \ + " SETL [%0++],D0Ar2,D1Ar1\n" \ + " MOVT D0Ar2,#HI(1b)\n" \ + " JUMP D0Ar2,#LO(1b)\n" \ + " .previous\n" \ + " .section __ex_table,\"a\"\n" \ + " .long 2b,3b\n" \ + " .previous\n" \ + : "=a" (to), "=r" (from), "=r" (ret) \ + : "0" (to), "1" (from), "2" (ret) \ + : "D1Ar1", "D0Ar2", "memory") + +/* rewind 'from' pointer when a fault occurs + * + * Rationale: + * A fault occurs while reading from user buffer, which is the + * source. Since the fault is at a single address, we only + * need to rewind by 8 bytes. + * Since we don't write to kernel buffer until we read first, + * the kernel buffer is at the right state and needn't be + * corrected. + */ +#define __asm_copy_from_user_64bit_rapf_loop(to, from, ret, n, id) \ + __asm_copy_user_64bit_rapf_loop(to, from, ret, n, id, \ + "SUB %1, %1, #8\n") + +/* rewind 'from' pointer when a fault occurs + * + * Rationale: + * A fault occurs while reading from user buffer, which is the + * source. Since the fault is at a single address, we only + * need to rewind by 4 bytes. + * Since we don't write to kernel buffer until we read first, + * the kernel buffer is at the right state and needn't be + * corrected. + */ +#define __asm_copy_from_user_32bit_rapf_loop(to, from, ret, n, id) \ + __asm_copy_user_32bit_rapf_loop(to, from, ret, n, id, \ + "SUB %1, %1, #4\n") + + +/* Copy from user to kernel, zeroing the bytes that were inaccessible in + userland. The return-value is the number of bytes that were + inaccessible. 
*/ +unsigned long __copy_user_zeroing(void *pdst, const void __user *psrc, + unsigned long n) +{ + register char *dst __asm__ ("A0.2") = pdst; + register const char __user *src __asm__ ("A1.2") = psrc; + unsigned long retn = 0; + + if (n == 0) + return 0; + + if ((unsigned long) src & 1) { + __asm_copy_from_user_1(dst, src, retn); + n--; + } + if ((unsigned long) dst & 1) { + /* Worst case - byte copy */ + while (n > 0) { + __asm_copy_from_user_1(dst, src, retn); + n--; + if (retn) + goto copy_exception_bytes; + } + } + if (((unsigned long) src & 2) && n >= 2) { + __asm_copy_from_user_2(dst, src, retn); + n -= 2; + } + if ((unsigned long) dst & 2) { + /* Second worst case - word copy */ + while (n >= 2) { + __asm_copy_from_user_2(dst, src, retn); + n -= 2; + if (retn) + goto copy_exception_bytes; + } + } + + /* We only need one check after the unalignment-adjustments, + because if both adjustments were done, either both or + neither reference had an exception. */ + if (retn != 0) + goto copy_exception_bytes; + +#ifdef USE_RAPF + /* 64 bit copy loop */ + if (!(((unsigned long) src | (unsigned long) dst) & 7)) { + if (n >= RAPF_MIN_BUF_SIZE) { + /* Copy using fast 64bit rapf */ + __asm_copy_from_user_64bit_rapf_loop(dst, src, retn, + n, "64cuz"); + } + while (n >= 8) { + __asm_copy_from_user_8x64(dst, src, retn); + n -= 8; + if (retn) + goto copy_exception_bytes; + } + } + + if (n >= RAPF_MIN_BUF_SIZE) { + /* Copy using fast 32bit rapf */ + __asm_copy_from_user_32bit_rapf_loop(dst, src, retn, + n, "32cuz"); + } +#else + /* 64 bit copy loop */ + if (!(((unsigned long) src | (unsigned long) dst) & 7)) { + while (n >= 8) { + __asm_copy_from_user_8x64(dst, src, retn); + n -= 8; + if (retn) + goto copy_exception_bytes; + } + } +#endif + + while (n >= 4) { + __asm_copy_from_user_4(dst, src, retn); + n -= 4; + + if (retn) + goto copy_exception_bytes; + } + + /* If we get here, there were no memory read faults. */ + switch (n) { + /* These copies are at least "naturally aligned" (so we don't + have to check each byte), due to the src alignment code. + The *_3 case *will* get the correct count for retn. */ + case 0: + /* This case deliberately left in (if you have doubts check the + generated assembly code). */ + break; + case 1: + __asm_copy_from_user_1(dst, src, retn); + break; + case 2: + __asm_copy_from_user_2(dst, src, retn); + break; + case 3: + __asm_copy_from_user_3(dst, src, retn); + break; + } + + /* If we get here, retn correctly reflects the number of failing + bytes. */ + return retn; + + copy_exception_bytes: + /* We already have "retn" bytes cleared, and need to clear the + remaining "n" bytes. A non-optimized simple byte-for-byte in-line + memset is preferred here, since this isn't speed-critical code and + we'd rather have this a leaf-function than calling memset. */ + { + char *endp; + for (endp = dst + n; dst < endp; dst++) + *dst = 0; + } + + return retn + n; +} + +#define __asm_clear_8x64(to, ret) \ + __asm__ __volatile__ ( \ + " MOV D0Ar2,#0\n" \ + " MOV D1Ar1,#0\n" \ + " SETL [%0],D0Ar2,D1Ar1\n" \ + "2: SETL [%0++],D0Ar2,D1Ar1\n" \ + "1:\n" \ + " .section .fixup,\"ax\"\n" \ + "3: ADD %1,%1,#8\n" \ + " MOVT D0Ar2,#HI(1b)\n" \ + " JUMP D0Ar2,#LO(1b)\n" \ + " .previous\n" \ + " .section __ex_table,\"a\"\n" \ + " .long 2b,3b\n" \ + " .previous\n" \ + : "=r" (to), "=r" (ret) \ + : "0" (to), "1" (ret) \ + : "D1Ar1", "D0Ar2", "memory") + +/* Zero userspace. 
*/ + +#define __asm_clear(to, ret, CLEAR, FIXUP, TENTRY) \ + __asm__ __volatile__ ( \ + " MOV D1Ar1,#0\n" \ + CLEAR \ + "1:\n" \ + " .section .fixup,\"ax\"\n" \ + FIXUP \ + " MOVT D1Ar1,#HI(1b)\n" \ + " JUMP D1Ar1,#LO(1b)\n" \ + " .previous\n" \ + " .section __ex_table,\"a\"\n" \ + TENTRY \ + " .previous" \ + : "=r" (to), "=r" (ret) \ + : "0" (to), "1" (ret) \ + : "D1Ar1", "memory") + +#define __asm_clear_1(to, ret) \ + __asm_clear(to, ret, \ + " SETB [%0],D1Ar1\n" \ + "2: SETB [%0++],D1Ar1\n", \ + "3: ADD %1,%1,#1\n", \ + " .long 2b,3b\n") + +#define __asm_clear_2(to, ret) \ + __asm_clear(to, ret, \ + " SETW [%0],D1Ar1\n" \ + "2: SETW [%0++],D1Ar1\n", \ + "3: ADD %1,%1,#2\n", \ + " .long 2b,3b\n") + +#define __asm_clear_3(to, ret) \ + __asm_clear(to, ret, \ + "2: SETW [%0++],D1Ar1\n" \ + " SETB [%0],D1Ar1\n" \ + "3: SETB [%0++],D1Ar1\n", \ + "4: ADD %1,%1,#2\n" \ + "5: ADD %1,%1,#1\n", \ + " .long 2b,4b\n" \ + " .long 3b,5b\n") + +#define __asm_clear_4x_cont(to, ret, CLEAR, FIXUP, TENTRY) \ + __asm_clear(to, ret, \ + " SETD [%0],D1Ar1\n" \ + "2: SETD [%0++],D1Ar1\n" CLEAR, \ + "3: ADD %1,%1,#4\n" FIXUP, \ + " .long 2b,3b\n" TENTRY) + +#define __asm_clear_4(to, ret) \ + __asm_clear_4x_cont(to, ret, "", "", "") + +#define __asm_clear_8x_cont(to, ret, CLEAR, FIXUP, TENTRY) \ + __asm_clear_4x_cont(to, ret, \ + " SETD [%0],D1Ar1\n" \ + "4: SETD [%0++],D1Ar1\n" CLEAR, \ + "5: ADD %1,%1,#4\n" FIXUP, \ + " .long 4b,5b\n" TENTRY) + +#define __asm_clear_8(to, ret) \ + __asm_clear_8x_cont(to, ret, "", "", "") + +#define __asm_clear_12x_cont(to, ret, CLEAR, FIXUP, TENTRY) \ + __asm_clear_8x_cont(to, ret, \ + " SETD [%0],D1Ar1\n" \ + "6: SETD [%0++],D1Ar1\n" CLEAR, \ + "7: ADD %1,%1,#4\n" FIXUP, \ + " .long 6b,7b\n" TENTRY) + +#define __asm_clear_12(to, ret) \ + __asm_clear_12x_cont(to, ret, "", "", "") + +#define __asm_clear_16x_cont(to, ret, CLEAR, FIXUP, TENTRY) \ + __asm_clear_12x_cont(to, ret, \ + " SETD [%0],D1Ar1\n" \ + "8: SETD [%0++],D1Ar1\n" CLEAR, \ + "9: ADD %1,%1,#4\n" FIXUP, \ + " .long 8b,9b\n" TENTRY) + +#define __asm_clear_16(to, ret) \ + __asm_clear_16x_cont(to, ret, "", "", "") + +unsigned long __do_clear_user(void __user *pto, unsigned long pn) +{ + register char __user *dst __asm__ ("D0Re0") = pto; + register unsigned long n __asm__ ("D1Re0") = pn; + register unsigned long retn __asm__ ("D0Ar6") = 0; + + if ((unsigned long) dst & 1) { + __asm_clear_1(dst, retn); + n--; + } + + if ((unsigned long) dst & 2) { + __asm_clear_2(dst, retn); + n -= 2; + } + + /* 64 bit copy loop */ + if (!((__force unsigned long) dst & 7)) { + while (n >= 8) { + __asm_clear_8x64(dst, retn); + n -= 8; + } + } + + while (n >= 16) { + __asm_clear_16(dst, retn); + n -= 16; + } + + while (n >= 4) { + __asm_clear_4(dst, retn); + n -= 4; + } + + switch (n) { + case 0: + break; + case 1: + __asm_clear_1(dst, retn); + break; + case 2: + __asm_clear_2(dst, retn); + break; + case 3: + __asm_clear_3(dst, retn); + break; + } + + return retn; +} + +unsigned char __get_user_asm_b(const void __user *addr, long *err) +{ + register unsigned char x __asm__ ("D0Re0") = 0; + __asm__ __volatile__( + " GETB %0,[%2]\n" + "1:\n" + " GETB %0,[%2]\n" + "2:\n" + " .section .fixup,\"ax\"\n" + "3: MOV D0FrT,%3\n" + " SETD [%1],D0FrT\n" + " MOVT D0FrT,#HI(2b)\n" + " JUMP D0FrT,#LO(2b)\n" + " .previous\n" + " .section __ex_table,\"a\"\n" + " .long 1b,3b\n" + " .previous\n" + : "=r" (x) + : "r" (err), "r" (addr), "P" (-EFAULT) + : "D0FrT"); + return x; +} + +unsigned short __get_user_asm_w(const void __user *addr, long *err) +{ + 
register unsigned short x __asm__ ("D0Re0") = 0; + __asm__ __volatile__( + " GETW %0,[%2]\n" + "1:\n" + " GETW %0,[%2]\n" + "2:\n" + " .section .fixup,\"ax\"\n" + "3: MOV D0FrT,%3\n" + " SETD [%1],D0FrT\n" + " MOVT D0FrT,#HI(2b)\n" + " JUMP D0FrT,#LO(2b)\n" + " .previous\n" + " .section __ex_table,\"a\"\n" + " .long 1b,3b\n" + " .previous\n" + : "=r" (x) + : "r" (err), "r" (addr), "P" (-EFAULT) + : "D0FrT"); + return x; +} + +unsigned int __get_user_asm_d(const void __user *addr, long *err) +{ + register unsigned int x __asm__ ("D0Re0") = 0; + __asm__ __volatile__( + " GETD %0,[%2]\n" + "1:\n" + " GETD %0,[%2]\n" + "2:\n" + " .section .fixup,\"ax\"\n" + "3: MOV D0FrT,%3\n" + " SETD [%1],D0FrT\n" + " MOVT D0FrT,#HI(2b)\n" + " JUMP D0FrT,#LO(2b)\n" + " .previous\n" + " .section __ex_table,\"a\"\n" + " .long 1b,3b\n" + " .previous\n" + : "=r" (x) + : "r" (err), "r" (addr), "P" (-EFAULT) + : "D0FrT"); + return x; +} + +long __put_user_asm_b(unsigned int x, void __user *addr) +{ + register unsigned int err __asm__ ("D0Re0") = 0; + __asm__ __volatile__( + " MOV %0,#0\n" + " SETB [%2],%1\n" + "1:\n" + " SETB [%2],%1\n" + "2:\n" + ".section .fixup,\"ax\"\n" + "3: MOV %0,%3\n" + " MOVT D0FrT,#HI(2b)\n" + " JUMP D0FrT,#LO(2b)\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + " .long 1b,3b\n" + ".previous" + : "=r"(err) + : "d" (x), "a" (addr), "P"(-EFAULT) + : "D0FrT"); + return err; +} + +long __put_user_asm_w(unsigned int x, void __user *addr) +{ + register unsigned int err __asm__ ("D0Re0") = 0; + __asm__ __volatile__( + " MOV %0,#0\n" + " SETW [%2],%1\n" + "1:\n" + " SETW [%2],%1\n" + "2:\n" + ".section .fixup,\"ax\"\n" + "3: MOV %0,%3\n" + " MOVT D0FrT,#HI(2b)\n" + " JUMP D0FrT,#LO(2b)\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + " .long 1b,3b\n" + ".previous" + : "=r"(err) + : "d" (x), "a" (addr), "P"(-EFAULT) + : "D0FrT"); + return err; +} + +long __put_user_asm_d(unsigned int x, void __user *addr) +{ + register unsigned int err __asm__ ("D0Re0") = 0; + __asm__ __volatile__( + " MOV %0,#0\n" + " SETD [%2],%1\n" + "1:\n" + " SETD [%2],%1\n" + "2:\n" + ".section .fixup,\"ax\"\n" + "3: MOV %0,%3\n" + " MOVT D0FrT,#HI(2b)\n" + " JUMP D0FrT,#LO(2b)\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + " .long 1b,3b\n" + ".previous" + : "=r"(err) + : "d" (x), "a" (addr), "P"(-EFAULT) + : "D0FrT"); + return err; +} + +long __put_user_asm_l(unsigned long long x, void __user *addr) +{ + register unsigned int err __asm__ ("D0Re0") = 0; + __asm__ __volatile__( + " MOV %0,#0\n" + " SETL [%2],%1,%t1\n" + "1:\n" + " SETL [%2],%1,%t1\n" + "2:\n" + ".section .fixup,\"ax\"\n" + "3: MOV %0,%3\n" + " MOVT D0FrT,#HI(2b)\n" + " JUMP D0FrT,#LO(2b)\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + " .long 1b,3b\n" + ".previous" + : "=r"(err) + : "d" (x), "a" (addr), "P"(-EFAULT) + : "D0FrT"); + return err; +} + +long strnlen_user(const char __user *src, long count) +{ + long res; + + if (!access_ok(VERIFY_READ, src, 0)) + return 0; + + asm volatile (" MOV D0Ar4, %1\n" + " MOV D0Ar6, %2\n" + "0:\n" + " SUBS D0FrT, D0Ar6, #0\n" + " SUB D0Ar6, D0Ar6, #1\n" + " BLE 2f\n" + " GETB D0FrT, [D0Ar4+#1++]\n" + "1:\n" + " TST D0FrT, #255\n" + " BNE 0b\n" + "2:\n" + " SUB %0, %2, D0Ar6\n" + "3:\n" + " .section .fixup,\"ax\"\n" + "4:\n" + " MOV %0, #0\n" + " MOVT D0FrT,#HI(3b)\n" + " JUMP D0FrT,#LO(3b)\n" + " .previous\n" + " .section __ex_table,\"a\"\n" + " .long 1b,4b\n" + " .previous\n" + : "=r" (res) + : "r" (src), "r" (count) + : "D0FrT", "D0Ar4", "D0Ar6", "cc"); + + return res; +} + +long 
__strncpy_from_user(char *dst, const char __user *src, long count) +{ + long res; + + if (count == 0) + return 0; + + /* + * Currently, in 2.4.0-test9, most ports use a simple byte-copy loop. + * So do we. + * + * This code is deduced from: + * + * char tmp2; + * long tmp1, tmp3; + * tmp1 = count; + * while ((*dst++ = (tmp2 = *src++)) != 0 + * && --tmp1) + * ; + * + * res = count - tmp1; + * + * with tweaks. + */ + + __asm__ __volatile__(" MOV %0,%3\n" + "1:\n" + " GETB D0FrT,[%2++]\n" + "2:\n" + " CMP D0FrT,#0\n" + " SETB [%1++],D0FrT\n" + " BEQ 3f\n" + " SUBS %0,%0,#1\n" + " BNZ 1b\n" + "3:\n" + " SUB %0,%3,%0\n" + "4:\n" + " .section .fixup,\"ax\"\n" + "5:\n" + " MOV %0,%7\n" + " MOVT D0FrT,#HI(4b)\n" + " JUMP D0FrT,#LO(4b)\n" + " .previous\n" + " .section __ex_table,\"a\"\n" + " .long 2b,5b\n" + " .previous" + : "=r" (res), "=r" (dst), "=r" (src), "=r" (count) + : "3" (count), "1" (dst), "2" (src), "P" (-EFAULT) + : "D0FrT", "memory", "cc"); + + return res; +} -- 1.7.7.6
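
The LSM_STEP rewind arithmetic described in the usercopy.c fixup comments (for the 32-bit RAPF copy-to-user loop) can be restated in plain C. The sketch below is illustrative only: it is not part of the patch, compute_rapf_rewind() is a made-up helper name, and it assumes the TXSTATUS value has already been captured at fault time and that each MSETD issues four 4-byte writes, as the comment states.

/*
 * Illustrative sketch (not from the patch): rewind amounts derived from
 * LSM_STEP (TXSTATUS bits 10:8) after a fault in the 32-bit RAPF loop.
 */
#include <stdio.h>

static void compute_rapf_rewind(unsigned int txstatus,
                                unsigned int *src_rewind,
                                unsigned int *bytes_done)
{
	unsigned int lsm_step = (txstatus >> 8) & 0x7;	/* bits 10:8 */

	if (lsm_step == 0)		/* fault on the last of the 4 writes */
		lsm_step = 4;

	/* LSM_STEP - 1 writes completed before the faulting one */
	unsigned int done_writes = lsm_step - 1;
	unsigned int failed_writes = 4 - done_writes;

	*src_rewind = failed_writes * 4;	/* rewind 'from' over data read but not written */
	*bytes_done = done_writes * 4;		/* n is reduced by this many bytes */
	/* 'to' is always rewound by 4 bytes, back to the faulting write */
}

int main(void)
{
	unsigned int src_rewind, bytes_done;

	/* Example: fault on the third write, so LSM_STEP reads 3 */
	compute_rapf_rewind(3u << 8, &src_rewind, &bytes_done);
	printf("rewind src by %u bytes, count %u bytes as done\n",
	       src_rewind, bytes_done);
	return 0;
}

With LSM_STEP == 3 this gives a source rewind of 8 bytes and 8 bytes counted as done, matching the comment's formulas [4 - (LSM_STEP-1)]*4 and n = n - (LSM_STEP-1)*4.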
Signed-off-by: James Hogan --- arch/metag/include/asm/checksum.h | 92 +++ arch/metag/include/asm/div64.h | 12 + arch/metag/include/asm/string.h | 13 + arch/metag/lib/ashldi3.S | 33 + arch/metag/lib/ashrdi3.S | 33 + arch/metag/lib/checksum.c | 169 +++++ arch/metag/lib/clear_page.S | 17 + arch/metag/lib/cmpdi2.S | 32 + arch/metag/lib/copy_page.S | 20 + arch/metag/lib/delay.c | 55 ++ arch/metag/lib/div64.S | 108 +++ arch/metag/lib/divsi3.S | 100 +++ arch/metag/lib/ip_fast_csum.S | 32 + arch/metag/lib/lshrdi3.S | 33 + arch/metag/lib/memcpy.S | 185 +++++ arch/metag/lib/memmove.S | 345 ++++++++++ arch/metag/lib/memset.S | 86 +++ arch/metag/lib/modsi3.S | 38 + arch/metag/lib/muldi3.S | 44 ++ arch/metag/lib/ucmpdi2.S | 27 + arch/metag/lib/usercopy.c | 1341 +++++++++++++++++++++++++++++++++++++ 21 files changed, 2815 insertions(+), 0 deletions(-) create mode 100644 arch/metag/include/asm/checksum.h create mode 100644 arch/metag/include/asm/div64.h create mode 100644 arch/metag/include/asm/string.h create mode 100644 arch/metag/lib/ashldi3.S create mode 100644 arch/metag/lib/ashrdi3.S create mode 100644 arch/metag/lib/checksum.c create mode 100644 arch/metag/lib/clear_page.S create mode 100644 arch/metag/lib/cmpdi2.S create mode 100644 arch/metag/lib/copy_page.S create mode 100644 arch/metag/lib/delay.c create mode 100644 arch/metag/lib/div64.S create mode 100644 arch/metag/lib/divsi3.S create mode 100644 arch/metag/lib/ip_fast_csum.S create mode 100644 arch/metag/lib/lshrdi3.S create mode 100644 arch/metag/lib/memcpy.S create mode 100644 arch/metag/lib/memmove.S create mode 100644 arch/metag/lib/memset.S create mode 100644 arch/metag/lib/modsi3.S create mode 100644 arch/metag/lib/muldi3.S create mode 100644 arch/metag/lib/ucmpdi2.S create mode 100644 arch/metag/lib/usercopy.c diff --git a/arch/metag/include/asm/checksum.h b/arch/metag/include/asm/checksum.h new file mode 100644 index 0000000..1113f4b --- /dev/null +++ b/arch/metag/include/asm/checksum.h @@ -0,0 +1,92 @@ +#ifndef _METAG_CHECKSUM_H +#define _METAG_CHECKSUM_H + +/* + * computes the checksum of a memory block at buff, length len, + * and adds in "sum" (32-bit) + * + * returns a 32-bit number suitable for feeding into itself + * or csum_tcpudp_magic + * + * this function must be called with even lengths, except + * for the last fragment, which may be odd + * + * it's best to have buff aligned on a 32-bit boundary + */ +extern __wsum csum_partial(const void *buff, int len, __wsum sum); + +/* + * the same as csum_partial, but copies from src while it + * checksums + * + * here even more important to align src and dst on a 32-bit (or even + * better 64-bit) boundary + */ +extern __wsum csum_partial_copy(const void *src, void *dst, int len, + __wsum sum); + +/* + * the same as csum_partial_copy, but copies from user space. + * + * here even more important to align src and dst on a 32-bit (or even + * better 64-bit) boundary + */ +extern __wsum csum_partial_copy_from_user(const void __user *src, void *dst, + int len, __wsum sum, int *csum_err); + +#define csum_partial_copy_nocheck(src, dst, len, sum) \ + csum_partial_copy((src), (dst), (len), (sum)) + +/* + * Fold a partial checksum + */ +static inline __sum16 csum_fold(__wsum csum) +{ + u32 sum = (__force u32)csum; + sum = (sum & 0xffff) + (sum >> 16); + sum = (sum & 0xffff) + (sum >> 16); + return (__force __sum16)~sum; +} + +/* + * This is a version of ip_compute_csum() optimized for IP headers, + * which always checksum on 4 octet boundaries. 
+ */ +extern __sum16 ip_fast_csum(const void *iph, unsigned int ihl); + +/* + * computes the checksum of the TCP/UDP pseudo-header + * returns a 16-bit checksum, already complemented + */ +static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, + unsigned short len, + unsigned short proto, + __wsum sum) +{ + unsigned long len_proto = (proto + len) << 8; + __asm__("ADD %0, %0, %1\n" + "ADDS %0, %0, %2\n" + "ADDCS %0, %0, #1\n" + "ADDS %0, %0, %3\n" + "ADDCS %0, %0, #1\n" + : "=d"(sum) + : "d"(daddr), "d"(saddr), "d"(len_proto), + "0"(sum) + : "cc"); + return sum; +} + +static inline __sum16 +csum_tcpudp_magic(__be32 saddr, __be32 daddr, unsigned short len, + unsigned short proto, __wsum sum) +{ + return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum)); +} + +/* + * this routine is used for miscellaneous IP-like checksums, mainly + * in icmp.c + */ +extern __sum16 ip_compute_csum(const void *buff, int len); + +#endif /* _METAG_CHECKSUM_H */ diff --git a/arch/metag/include/asm/div64.h b/arch/metag/include/asm/div64.h new file mode 100644 index 0000000..0fdd116 --- /dev/null +++ b/arch/metag/include/asm/div64.h @@ -0,0 +1,12 @@ +#ifndef __ASM_DIV64_H__ +#define __ASM_DIV64_H__ + +#include + +extern u64 div_u64(u64 dividend, u64 divisor); +extern s64 div_s64(s64 dividend, s64 divisor); + +#define div_u64 div_u64 +#define div_s64 div_s64 + +#endif diff --git a/arch/metag/include/asm/string.h b/arch/metag/include/asm/string.h new file mode 100644 index 0000000..53e3806 --- /dev/null +++ b/arch/metag/include/asm/string.h @@ -0,0 +1,13 @@ +#ifndef _METAG_STRING_H_ +#define _METAG_STRING_H_ + +#define __HAVE_ARCH_MEMSET +extern void *memset(void *__s, int __c, size_t __count); + +#define __HAVE_ARCH_MEMCPY +void *memcpy(void *__to, __const__ void *__from, size_t __n); + +#define __HAVE_ARCH_MEMMOVE +extern void *memmove(void *__dest, __const__ void *__src, size_t __n); + +#endif /* _METAG_STRING_H_ */ diff --git a/arch/metag/lib/ashldi3.S b/arch/metag/lib/ashldi3.S new file mode 100644 index 0000000..78d6974 --- /dev/null +++ b/arch/metag/lib/ashldi3.S @@ -0,0 +1,33 @@ +! Copyright (C) 2012 by Imagination Technologies Ltd. +! +! 64-bit arithmetic shift left routine. +! + + .text + .global ___ashldi3 + .type ___ashldi3,function + +___ashldi3: + MOV D0Re0,D0Ar2 + MOV D1Re0,D1Ar1 + CMP D1Ar3,#0 ! COUNT == 0 + MOVEQ PC,D1RtP ! Yes, return + + SUBS D0Ar4,D1Ar3,#32 ! N = COUNT - 32 + BGE $L10 + +!! Shift < 32 + NEG D0Ar4,D0Ar4 ! N = - N + LSL D1Re0,D1Re0,D1Ar3 ! HI = HI << COUNT + LSR D0Ar6,D0Re0,D0Ar4 ! TMP= LO >> -(COUNT - 32) + OR D1Re0,D1Re0,D0Ar6 ! HI = HI | TMP + SWAP D0Ar4,D1Ar3 + LSL D0Re0,D0Re0,D0Ar4 ! LO = LO << COUNT + MOV PC,D1RtP + +$L10: +!! Shift >= 32 + LSL D1Re0,D0Re0,D0Ar4 ! HI = LO << N + MOV D0Re0,#0 ! LO = 0 + MOV PC,D1RtP + .size ___ashldi3,.-___ashldi3 diff --git a/arch/metag/lib/ashrdi3.S b/arch/metag/lib/ashrdi3.S new file mode 100644 index 0000000..7cb7ed3 --- /dev/null +++ b/arch/metag/lib/ashrdi3.S @@ -0,0 +1,33 @@ +! Copyright (C) 2012 by Imagination Technologies Ltd. +! +! 64-bit arithmetic shift right routine. +! + + .text + .global ___ashrdi3 + .type ___ashrdi3,function + +___ashrdi3: + MOV D0Re0,D0Ar2 + MOV D1Re0,D1Ar1 + CMP D1Ar3,#0 ! COUNT == 0 + MOVEQ PC,D1RtP ! Yes, return + + MOV D0Ar4,D1Ar3 + SUBS D1Ar3,D1Ar3,#32 ! N = COUNT - 32 + BGE $L20 + +!! Shift < 32 + NEG D1Ar3,D1Ar3 ! N = - N + LSR D0Re0,D0Re0,D0Ar4 ! LO = LO >> COUNT + LSL D0Ar6,D1Re0,D1Ar3 ! TMP= HI << -(COUNT - 32) + OR D0Re0,D0Re0,D0Ar6 ! 
LO = LO | TMP + SWAP D1Ar3,D0Ar4 + ASR D1Re0,D1Re0,D1Ar3 ! HI = HI >> COUNT + MOV PC,D1RtP +$L20: +!! Shift >= 32 + ASR D0Re0,D1Re0,D1Ar3 ! LO = HI >> N + ASR D1Re0,D1Re0,#31 ! HI = HI >> 31 + MOV PC,D1RtP + .size ___ashrdi3,.-___ashrdi3 diff --git a/arch/metag/lib/checksum.c b/arch/metag/lib/checksum.c new file mode 100644 index 0000000..ac2daa2 --- /dev/null +++ b/arch/metag/lib/checksum.c @@ -0,0 +1,169 @@ +/* + * + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * IP/TCP/UDP checksumming routines + * + * Authors: Jorge Cwik, + * Arnt Gulbrandsen, + * Tom May, + * Andreas Schwab, + * Lots of code moved from tcp.c and ip.c; see those files + * for more names. + * + * 03/02/96 Jes Sorensen, Andreas Schwab, Roman Hodek: + * Fixed some nasty bugs, causing some horrible crashes. + * A: At some points, the sum (%0) was used as + * length-counter instead of the length counter + * (%1). Thanks to Roman Hodek for pointing this out. + * B: GCC seems to mess up if one uses too many + * data-registers to hold input values and one tries to + * specify d0 and d1 as scratch registers. Letting gcc + * choose these registers itself solves the problem. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* Revised by Kenneth Albanowski for m68knommu. Basic problem: unaligned access + kills, so most of the assembly has to go. */ + +#include +#include + +#include + +static inline unsigned short from32to16(unsigned int x) +{ + /* add up 16-bit and 16-bit for 16+c bit */ + x = (x & 0xffff) + (x >> 16); + /* add up carry.. 
*/ + x = (x & 0xffff) + (x >> 16); + return x; +} + +static unsigned int do_csum(const unsigned char *buff, int len) +{ + int odd; + unsigned int result = 0; + + if (len <= 0) + goto out; + odd = 1 & (unsigned long) buff; + if (odd) { +#ifdef __LITTLE_ENDIAN + result += (*buff << 8); +#else + result = *buff; +#endif + len--; + buff++; + } + if (len >= 2) { + if (2 & (unsigned long) buff) { + result += *(unsigned short *) buff; + len -= 2; + buff += 2; + } + if (len >= 4) { + const unsigned char *end = buff + ((unsigned)len & ~3); + unsigned int carry = 0; + do { + unsigned int w = *(unsigned int *) buff; + buff += 4; + result += carry; + result += w; + carry = (w > result); + } while (buff < end); + result += carry; + result = (result & 0xffff) + (result >> 16); + } + if (len & 2) { + result += *(unsigned short *) buff; + buff += 2; + } + } + if (len & 1) +#ifdef __LITTLE_ENDIAN + result += *buff; +#else + result += (*buff << 8); +#endif + result = from32to16(result); + if (odd) + result = ((result >> 8) & 0xff) | ((result & 0xff) << 8); +out: + return result; +} + +EXPORT_SYMBOL(ip_fast_csum); + +/* + * computes the checksum of a memory block at buff, length len, + * and adds in "sum" (32-bit) + * + * returns a 32-bit number suitable for feeding into itself + * or csum_tcpudp_magic + * + * this function must be called with even lengths, except + * for the last fragment, which may be odd + * + * it's best to have buff aligned on a 32-bit boundary + */ +__wsum csum_partial(const void *buff, int len, __wsum wsum) +{ + unsigned int sum = (__force unsigned int)wsum; + unsigned int result = do_csum(buff, len); + + /* add in old sum, and carry.. */ + result += sum; + if (sum > result) + result += 1; + return (__force __wsum)result; +} +EXPORT_SYMBOL(csum_partial); + +/* + * this routine is used for miscellaneous IP-like checksums, mainly + * in icmp.c + */ +__sum16 ip_compute_csum(const void *buff, int len) +{ + return (__force __sum16)~do_csum(buff, len); +} +EXPORT_SYMBOL(ip_compute_csum); + +/* + * copy from fs while checksumming, otherwise like csum_partial + */ +__wsum +csum_partial_copy_from_user(const void __user *src, void *dst, int len, + __wsum sum, int *csum_err) +{ + int missing; + + missing = __copy_from_user(dst, src, len); + if (missing) { + memset(dst + len - missing, 0, missing); + *csum_err = -EFAULT; + } else + *csum_err = 0; + + return csum_partial(dst, len, sum); +} +EXPORT_SYMBOL(csum_partial_copy_from_user); + +/* + * copy from ds while checksumming, otherwise like csum_partial + */ +__wsum +csum_partial_copy(const void *src, void *dst, int len, __wsum sum) +{ + memcpy(dst, src, len); + return csum_partial(dst, len, sum); +} +EXPORT_SYMBOL(csum_partial_copy); diff --git a/arch/metag/lib/clear_page.S b/arch/metag/lib/clear_page.S new file mode 100644 index 0000000..43144ee --- /dev/null +++ b/arch/metag/lib/clear_page.S @@ -0,0 +1,17 @@ + ! Copyright 2007,2008,2009 Imagination Technologies Ltd. + +#include + + .text + .global _clear_page + .type _clear_page,function + !! D1Ar1 - page +_clear_page: + MOV TXRPT,#((PAGE_SIZE / 8) - 1) + MOV D0Re0,#0 + MOV D1Re0,#0 +$Lclear_page_loop: + SETL [D1Ar1++],D0Re0,D1Re0 + BR $Lclear_page_loop + MOV PC,D1RtP + .size _clear_page,.-_clear_page diff --git a/arch/metag/lib/cmpdi2.S b/arch/metag/lib/cmpdi2.S new file mode 100644 index 0000000..9c5c663 --- /dev/null +++ b/arch/metag/lib/cmpdi2.S @@ -0,0 +1,32 @@ +! Copyright (C) 2012 by Imagination Technologies Ltd. +! +! 64-bit signed compare routine. +! 
+ + .text + .global ___cmpdi2 + .type ___cmpdi2,function + +! low high +! s64 a (D0Ar2, D1Ar1) +! s64 b (D0Ar4, D1Ar3) +___cmpdi2: + ! start at 1 (equal) and conditionally increment or decrement + MOV D0Re0,#1 + + ! high words differ? + CMP D1Ar1,D1Ar3 + BNE $Lhigh_differ + + ! unsigned compare low words + CMP D0Ar2,D0Ar4 + SUBLO D0Re0,D0Re0,#1 + ADDHI D0Re0,D0Re0,#1 + MOV PC,D1RtP + +$Lhigh_differ: + ! signed compare high words + SUBLT D0Re0,D0Re0,#1 + ADDGT D0Re0,D0Re0,#1 + MOV PC,D1RtP + .size ___cmpdi2,.-___cmpdi2 diff --git a/arch/metag/lib/copy_page.S b/arch/metag/lib/copy_page.S new file mode 100644 index 0000000..91f7d46 --- /dev/null +++ b/arch/metag/lib/copy_page.S @@ -0,0 +1,20 @@ + ! Copyright 2007,2008 Imagination Technologies Ltd. + +#include + + .text + .global _copy_page + .type _copy_page,function + !! D1Ar1 - to + !! D0Ar2 - from +_copy_page: + MOV D0FrT,#PAGE_SIZE +$Lcopy_page_loop: + GETL D0Re0,D1Re0,[D0Ar2++] + GETL D0Ar6,D1Ar5,[D0Ar2++] + SETL [D1Ar1++],D0Re0,D1Re0 + SETL [D1Ar1++],D0Ar6,D1Ar5 + SUBS D0FrT,D0FrT,#16 + BNZ $Lcopy_page_loop + MOV PC,D1RtP + .size _copy_page,.-_copy_page diff --git a/arch/metag/lib/delay.c b/arch/metag/lib/delay.c new file mode 100644 index 0000000..e1cfcbb --- /dev/null +++ b/arch/metag/lib/delay.c @@ -0,0 +1,55 @@ +/* + * Precise Delay Loops for Meta + * + * Copyright (C) 1993 Linus Torvalds + * Copyright (C) 1997 Martin Mares + * Copyright (C) 2007,2009 Imagination Technologies Ltd. + * + */ + +#include +#include +#include + +#include + +/* + * TXTACTCYC is only 24 bits, so on chips with fast clocks it will wrap + * many times per-second. If it does wrap __delay will return prematurely, + * but this is only likely with large delay values. + * + * We also can't implement read_current_timer() with TXTACTCYC due to + * this wrapping behaviour. + */ +#define rdtimer(t) asm volatile ("MOV %0,TXACTCYC\n" : "=r" (t)); + +void __delay(unsigned long loops) +{ + unsigned long bclock, now; + + rdtimer(bclock); + do { + asm("NOP"); + rdtimer(now); + } while ((now-bclock) < loops); +} +EXPORT_SYMBOL(__delay); + +inline void __const_udelay(unsigned long xloops) +{ + u64 loops = (u64)xloops * (u64)loops_per_jiffy * HZ; + __delay(loops >> 32); +} +EXPORT_SYMBOL(__const_udelay); + +void __udelay(unsigned long usecs) +{ + __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */ +} +EXPORT_SYMBOL(__udelay); + +void __ndelay(unsigned long nsecs) +{ + __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */ +} +EXPORT_SYMBOL(__ndelay); diff --git a/arch/metag/lib/div64.S b/arch/metag/lib/div64.S new file mode 100644 index 0000000..1cfc934 --- /dev/null +++ b/arch/metag/lib/div64.S @@ -0,0 +1,108 @@ +! Copyright (C) 2012 Imagination Technologies Ltd. +! +! Signed/unsigned 64-bit division routines. +! 
+ + .text + .global _div_u64 + .type _div_u64,function + +_div_u64: +$L1: + ORS A0.3,D1Ar3,D0Ar4 + BNE $L3 +$L2: + MOV D0Re0,D0Ar2 + MOV D1Re0,D1Ar1 + MOV PC,D1RtP +$L3: + CMP D1Ar3,D1Ar1 + CMPEQ D0Ar4,D0Ar2 + MOV D0Re0,#1 + MOV D1Re0,#0 + BHS $L6 +$L4: + ADDS D0Ar6,D0Ar4,D0Ar4 + ADD D1Ar5,D1Ar3,D1Ar3 + ADDCS D1Ar5,D1Ar5,#1 + CMP D1Ar5,D1Ar3 + CMPEQ D0Ar6,D0Ar4 + BLO $L6 +$L5: + MOV D0Ar4,D0Ar6 + MOV D1Ar3,D1Ar5 + ADDS D0Re0,D0Re0,D0Re0 + ADD D1Re0,D1Re0,D1Re0 + ADDCS D1Re0,D1Re0,#1 + CMP D1Ar3,D1Ar1 + CMPEQ D0Ar4,D0Ar2 + BLO $L4 +$L6: + ORS A0.3,D1Re0,D0Re0 + MOV D0Ar6,#0 + MOV D1Ar5,D0Ar6 + BEQ $L10 +$L7: + CMP D1Ar1,D1Ar3 + CMPEQ D0Ar2,D0Ar4 + BLO $L9 +$L8: + ADDS D0Ar6,D0Ar6,D0Re0 + ADD D1Ar5,D1Ar5,D1Re0 + ADDCS D1Ar5,D1Ar5,#1 + + SUBS D0Ar2,D0Ar2,D0Ar4 + SUB D1Ar1,D1Ar1,D1Ar3 + SUBCS D1Ar1,D1Ar1,#1 +$L9: + LSL A0.3,D1Re0,#31 + LSR D0Re0,D0Re0,#1 + LSR D1Re0,D1Re0,#1 + OR D0Re0,D0Re0,A0.3 + LSL A0.3,D1Ar3,#31 + LSR D0Ar4,D0Ar4,#1 + LSR D1Ar3,D1Ar3,#1 + OR D0Ar4,D0Ar4,A0.3 + ORS A0.3,D1Re0,D0Re0 + BNE $L7 +$L10: + MOV D0Re0,D0Ar6 + MOV D1Re0,D1Ar5 + MOV PC,D1RtP + .size _div_u64,.-_div_u64 + + .text + .global _div_s64 + .type _div_s64,function +_div_s64: + MSETL [A0StP],D0FrT,D0.5 + XOR D0.5,D0Ar2,D0Ar4 + XOR D1.5,D1Ar1,D1Ar3 + TSTT D1Ar1,#HI(0x80000000) + BZ $L25 + + NEGS D0Ar2,D0Ar2 + NEG D1Ar1,D1Ar1 + SUBCS D1Ar1,D1Ar1,#1 +$L25: + TSTT D1Ar3,#HI(0x80000000) + BZ $L27 + + NEGS D0Ar4,D0Ar4 + NEG D1Ar3,D1Ar3 + SUBCS D1Ar3,D1Ar3,#1 +$L27: + CALLR D1RtP,_div_u64 + TSTT D1.5,#HI(0x80000000) + BZ $L29 + + NEGS D0Re0,D0Re0 + NEG D1Re0,D1Re0 + SUBCS D1Re0,D1Re0,#1 +$L29: + + GETL D0FrT,D1RtP,[A0StP+#(-16)] + GETL D0.5,D1.5,[A0StP+#(-8)] + SUB A0StP,A0StP,#16 + MOV PC,D1RtP + .size _div_s64,.-_div_s64 diff --git a/arch/metag/lib/divsi3.S b/arch/metag/lib/divsi3.S new file mode 100644 index 0000000..7c8a8ae --- /dev/null +++ b/arch/metag/lib/divsi3.S @@ -0,0 +1,100 @@ +! Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007 +! Imagination Technologies Ltd +! +! Integer divide routines. +! + + .text + .global ___udivsi3 + .type ___udivsi3,function + .align 2 +___udivsi3: +!! +!! Since core is signed divide case, just set control variable +!! + MOV D1Re0,D0Ar2 ! Au already in A1Ar1, Bu -> D1Re0 + MOV D0Re0,#0 ! Result is 0 + MOV D0Ar4,#0 ! Return positive result + B $LIDMCUStart + .size ___udivsi3,.-___udivsi3 + +!! +!! 32-bit division signed i/p - passed signed 32-bit numbers +!! + .global ___divsi3 + .type ___divsi3,function + .align 2 +___divsi3: +!! +!! A already in D1Ar1, B already in D0Ar2 -> make B abs(B) +!! + MOV D1Re0,D0Ar2 ! A already in A1Ar1, B -> D1Re0 + MOV D0Re0,#0 ! Result is 0 + XOR D0Ar4,D1Ar1,D1Re0 ! D0Ar4 -ive if result is -ive + ABS D1Ar1,D1Ar1 ! abs(A) -> Au + ABS D1Re0,D1Re0 ! abs(B) -> Bu +$LIDMCUStart: + CMP D1Ar1,D1Re0 ! Is ( Au > Bu )? + LSR D1Ar3,D1Ar1,#2 ! Calculate (Au & (~3)) >> 2 + CMPHI D1Re0,D1Ar3 ! OR ( (Au & (~3)) <= (Bu << 2) )? + LSLSHI D1Ar3,D1Re0,#1 ! Buq = Bu << 1 + BLS $LIDMCUSetup ! Yes: Do normal divide +!! +!! Quick divide setup can assume that CurBit only needs to start at 2 +!! +$LIDMCQuick: + CMP D1Ar1,D1Ar3 ! ( A >= Buq )? + ADDCC D0Re0,D0Re0,#2 ! If yes result += 2 + SUBCC D1Ar1,D1Ar1,D1Ar3 ! and A -= Buq + CMP D1Ar1,D1Re0 ! ( A >= Bu )? + ADDCC D0Re0,D0Re0,#1 ! If yes result += 1 + SUBCC D1Ar1,D1Ar1,D1Re0 ! and A -= Bu + ORS D0Ar4,D0Ar4,D0Ar4 ! Return neg result? + NEG D0Ar2,D0Re0 ! Calulate neg result + MOVMI D0Re0,D0Ar2 ! Yes: Take neg result +$LIDMCRet: + MOV PC,D1RtP +!! +!! Setup for general unsigned divide code +!! +!! 
D0Re0 is used to form the result, already set to Zero +!! D1Re0 is the input Bu value, this gets trashed +!! D0Ar6 is curbit which is set to 1 at the start and shifted up +!! D0Ar4 is negative if we should return a negative result +!! D1Ar1 is the input Au value, eventually this holds the remainder +!! +$LIDMCUSetup: + CMP D1Ar1,D1Re0 ! Is ( Au < Bu )? + MOV D0Ar6,#1 ! Set curbit to 1 + BCS $LIDMCRet ! Yes: Return 0 remainder Au +!! +!! Calculate alignment using FFB instruction +!! + FFB D1Ar5,D1Ar1 ! Find first bit of Au + ANDN D1Ar5,D1Ar5,#31 ! Handle exceptional case. + ORN D1Ar5,D1Ar5,#31 ! if N bit set, set to 31 + FFB D1Ar3,D1Re0 ! Find first bit of Bu + ANDN D1Ar3,D1Ar3,#31 ! Handle exceptional case. + ORN D1Ar3,D1Ar3,#31 ! if N bit set, set to 31 + SUBS D1Ar3,D1Ar5,D1Ar3 ! calculate diff, ffbA - ffbB + MOV D0Ar2,D1Ar3 ! copy into bank 0 + LSLGT D1Re0,D1Re0,D1Ar3 ! ( > 0) ? left shift B + LSLGT D0Ar6,D0Ar6,D0Ar2 ! ( > 0) ? left shift curbit +!! +!! Now we start the divide proper, logic is +!! +!! if ( A >= B ) add curbit to result and subtract B from A +!! shift curbit and B down by 1 in either case +!! +$LIDMCLoop: + CMP D1Ar1, D1Re0 ! ( A >= B )? + ADDCC D0Re0, D0Re0, D0Ar6 ! If yes result += curbit + SUBCC D1Ar1, D1Ar1, D1Re0 ! and A -= B + LSRS D0Ar6, D0Ar6, #1 ! Shift down curbit, is it zero? + LSR D1Re0, D1Re0, #1 ! Shift down B + BNZ $LIDMCLoop ! Was single bit in curbit lost? + ORS D0Ar4,D0Ar4,D0Ar4 ! Return neg result? + NEG D0Ar2,D0Re0 ! Calulate neg result + MOVMI D0Re0,D0Ar2 ! Yes: Take neg result + MOV PC,D1RtP + .size ___divsi3,.-___divsi3 diff --git a/arch/metag/lib/ip_fast_csum.S b/arch/metag/lib/ip_fast_csum.S new file mode 100644 index 0000000..533b1e7 --- /dev/null +++ b/arch/metag/lib/ip_fast_csum.S @@ -0,0 +1,32 @@ + + .text +/* + * This is a version of ip_compute_csum() optimized for IP headers, + * which always checksum on 4 octet boundaries. + * + * extern __sum16 ip_fast_csum(const void *iph, unsigned int ihl); + * + */ + .global _ip_fast_csum + .type _ip_fast_csum,function +_ip_fast_csum: + !! TXRPT needs loops - 1 + SUBS TXRPT,D0Ar2,#1 + MOV D0Re0,#0 + BLO $Lfast_csum_exit +$Lfast_csum_loop: + GETD D1Ar3,[D1Ar1++] + ADDS D0Re0,D0Re0,D1Ar3 + ADDCS D0Re0,D0Re0,#1 + BR $Lfast_csum_loop + LSR D0Ar4,D0Re0,#16 + AND D0Re0,D0Re0,#0xffff + AND D0Ar4,D0Ar4,#0xffff + ADD D0Re0,D0Re0,D0Ar4 + LSR D0Ar4,D0Re0,#16 + ADD D0Re0,D0Re0,D0Ar4 + XOR D0Re0,D0Re0,#-1 + AND D0Re0,D0Re0,#0xffff +$Lfast_csum_exit: + MOV PC,D1RtP + .size _ip_fast_csum,.-_ip_fast_csum diff --git a/arch/metag/lib/lshrdi3.S b/arch/metag/lib/lshrdi3.S new file mode 100644 index 0000000..47f7202 --- /dev/null +++ b/arch/metag/lib/lshrdi3.S @@ -0,0 +1,33 @@ +! Copyright (C) 2012 by Imagination Technologies Ltd. +! +! 64-bit logical shift right routine. +! + + .text + .global ___lshrdi3 + .type ___lshrdi3,function + +___lshrdi3: + MOV D0Re0,D0Ar2 + MOV D1Re0,D1Ar1 + CMP D1Ar3,#0 ! COUNT == 0 + MOVEQ PC,D1RtP ! Yes, return + + MOV D0Ar4,D1Ar3 + SUBS D1Ar3,D1Ar3,#32 ! N = COUNT - 32 + BGE $L30 + +!! Shift < 32 + NEG D1Ar3,D1Ar3 ! N = - N + LSR D0Re0,D0Re0,D0Ar4 ! LO = LO >> COUNT + LSL D0Ar6,D1Re0,D1Ar3 ! TMP= HI << -(COUNT - 32) + OR D0Re0,D0Re0,D0Ar6 ! LO = LO | TMP + SWAP D1Ar3,D0Ar4 + LSR D1Re0,D1Re0,D1Ar3 ! HI = HI >> COUNT + MOV PC,D1RtP +$L30: +!! Shift >= 32 + LSR D0Re0,D1Re0,D1Ar3 ! LO = HI >> N + MOV D1Re0,#0 ! 
HI = 0 + MOV PC,D1RtP + .size ___lshrdi3,.-___lshrdi3 diff --git a/arch/metag/lib/memcpy.S b/arch/metag/lib/memcpy.S new file mode 100644 index 0000000..46b7a2b --- /dev/null +++ b/arch/metag/lib/memcpy.S @@ -0,0 +1,185 @@ +! Copyright (C) 2008-2012 Imagination Technologies Ltd. + + .text + .global _memcpy + .type _memcpy,function +! D1Ar1 dst +! D0Ar2 src +! D1Ar3 cnt +! D0Re0 dst +_memcpy: + CMP D1Ar3, #16 + MOV A1.2, D0Ar2 ! source pointer + MOV A0.2, D1Ar1 ! destination pointer + MOV A0.3, D1Ar1 ! for return value +! If there are less than 16 bytes to copy use the byte copy loop + BGE $Llong_copy + +$Lbyte_copy: +! Simply copy a byte at a time + SUBS TXRPT, D1Ar3, #1 + BLT $Lend +$Lloop_byte: + GETB D1Re0, [A1.2++] + SETB [A0.2++], D1Re0 + BR $Lloop_byte + +$Lend: +! Finally set return value and return + MOV D0Re0, A0.3 + MOV PC, D1RtP + +$Llong_copy: + ANDS D1Ar5, D1Ar1, #7 ! test destination alignment + BZ $Laligned_dst + +! The destination address is not 8 byte aligned. We will copy bytes from +! the source to the destination until the remaining data has an 8 byte +! destination address alignment (i.e we should never copy more than 7 +! bytes here). +$Lalign_dst: + GETB D0Re0, [A1.2++] + ADD D1Ar5, D1Ar5, #1 ! dest is aligned when D1Ar5 reaches #8 + SUB D1Ar3, D1Ar3, #1 ! decrement count of remaining bytes + SETB [A0.2++], D0Re0 + CMP D1Ar5, #8 + BNE $Lalign_dst + +! We have at least (16 - 7) = 9 bytes to copy - calculate the number of 8 byte +! blocks, then jump to the unaligned copy loop or fall through to the aligned +! copy loop as appropriate. +$Laligned_dst: + MOV D0Ar4, A1.2 + LSR D1Ar5, D1Ar3, #3 ! D1Ar5 = number of 8 byte blocks + ANDS D0Ar4, D0Ar4, #7 ! test source alignment + BNZ $Lunaligned_copy ! if unaligned, use unaligned copy loop + +! Both source and destination are 8 byte aligned - the easy case. +$Laligned_copy: + LSRS D1Ar5, D1Ar3, #5 ! D1Ar5 = number of 32 byte blocks + BZ $Lbyte_copy + SUB TXRPT, D1Ar5, #1 + +$Laligned_32: + GETL D0Re0, D1Re0, [A1.2++] + GETL D0Ar6, D1Ar5, [A1.2++] + SETL [A0.2++], D0Re0, D1Re0 + SETL [A0.2++], D0Ar6, D1Ar5 + GETL D0Re0, D1Re0, [A1.2++] + GETL D0Ar6, D1Ar5, [A1.2++] + SETL [A0.2++], D0Re0, D1Re0 + SETL [A0.2++], D0Ar6, D1Ar5 + BR $Laligned_32 + +! If there are any remaining bytes use the byte copy loop, otherwise we are done + ANDS D1Ar3, D1Ar3, #0x1f + BNZ $Lbyte_copy + B $Lend + +! The destination is 8 byte aligned but the source is not, and there are 8 +! or more bytes to be copied. +$Lunaligned_copy: +! Adjust the source pointer (A1.2) to the 8 byte boundary before its +! current value + MOV D0Ar4, A1.2 + MOV D0Ar6, A1.2 + ANDMB D0Ar4, D0Ar4, #0xfff8 + MOV A1.2, D0Ar4 +! Save the number of bytes of mis-alignment in D0Ar4 for use later + SUBS D0Ar6, D0Ar6, D0Ar4 + MOV D0Ar4, D0Ar6 +! if there is no mis-alignment after all, use the aligned copy loop + BZ $Laligned_copy + +! prefetch 8 bytes + GETL D0Re0, D1Re0, [A1.2] + + SUB TXRPT, D1Ar5, #1 + +! There are 3 mis-alignment cases to be considered. Less than 4 bytes, exactly +! 4 bytes, and more than 4 bytes. + CMP D0Ar6, #4 + BLT $Lunaligned_1_2_3 ! use 1-3 byte mis-alignment loop + BZ $Lunaligned_4 ! use 4 byte mis-alignment loop + +! The mis-alignment is more than 4 bytes +$Lunaligned_5_6_7: + SUB D0Ar6, D0Ar6, #4 +! Calculate the bit offsets required for the shift operations necesssary +! to align the data. +! D0Ar6 = bit offset, D1Ar5 = (32 - bit offset) + MULW D0Ar6, D0Ar6, #8 + MOV D1Ar5, #32 + SUB D1Ar5, D1Ar5, D0Ar6 +! 
Move data 4 bytes before we enter the main loop + MOV D0Re0, D1Re0 + +$Lloop_5_6_7: + GETL D0Ar2, D1Ar1, [++A1.2] +! form 64-bit data in D0Re0, D1Re0 + LSR D0Re0, D0Re0, D0Ar6 + MOV D1Re0, D0Ar2 + LSL D1Re0, D1Re0, D1Ar5 + ADD D0Re0, D0Re0, D1Re0 + + LSR D0Ar2, D0Ar2, D0Ar6 + LSL D1Re0, D1Ar1, D1Ar5 + ADD D1Re0, D1Re0, D0Ar2 + + SETL [A0.2++], D0Re0, D1Re0 + MOV D0Re0, D1Ar1 + BR $Lloop_5_6_7 + + B $Lunaligned_end + +$Lunaligned_1_2_3: +! Calculate the bit offsets required for the shift operations necesssary +! to align the data. +! D0Ar6 = bit offset, D1Ar5 = (32 - bit offset) + MULW D0Ar6, D0Ar6, #8 + MOV D1Ar5, #32 + SUB D1Ar5, D1Ar5, D0Ar6 + +$Lloop_1_2_3: +! form 64-bit data in D0Re0,D1Re0 + LSR D0Re0, D0Re0, D0Ar6 + LSL D1Ar1, D1Re0, D1Ar5 + ADD D0Re0, D0Re0, D1Ar1 + MOV D0Ar2, D1Re0 + LSR D0FrT, D0Ar2, D0Ar6 + GETL D0Ar2, D1Ar1, [++A1.2] + + MOV D1Re0, D0Ar2 + LSL D1Re0, D1Re0, D1Ar5 + ADD D1Re0, D1Re0, D0FrT + + SETL [A0.2++], D0Re0, D1Re0 + MOV D0Re0, D0Ar2 + MOV D1Re0, D1Ar1 + BR $Lloop_1_2_3 + + B $Lunaligned_end + +! The 4 byte mis-alignment case - this does not require any shifting, just a +! shuffling of registers. +$Lunaligned_4: + MOV D0Re0, D1Re0 +$Lloop_4: + GETL D0Ar2, D1Ar1, [++A1.2] + MOV D1Re0, D0Ar2 + SETL [A0.2++], D0Re0, D1Re0 + MOV D0Re0, D1Ar1 + BR $Lloop_4 + +$Lunaligned_end: +! If there are no remaining bytes to copy, we are done. + ANDS D1Ar3, D1Ar3, #7 + BZ $Lend +! Re-adjust the source pointer (A1.2) back to the actual (unaligned) byte +! address of the remaining bytes, and fall through to the byte copy loop. + MOV D0Ar6, A1.2 + ADD D1Ar5, D0Ar4, D0Ar6 + MOV A1.2, D1Ar5 + B $Lbyte_copy + + .size _memcpy,.-_memcpy diff --git a/arch/metag/lib/memmove.S b/arch/metag/lib/memmove.S new file mode 100644 index 0000000..228ea04 --- /dev/null +++ b/arch/metag/lib/memmove.S @@ -0,0 +1,345 @@ +! Copyright (C) 2008-2012 Imagination Technologies Ltd. + + .text + .global _memmove + .type _memmove,function +! D1Ar1 dst +! D0Ar2 src +! D1Ar3 cnt +! D0Re0 dst +_memmove: + CMP D1Ar3, #0 + MOV D0Re0, D1Ar1 + BZ $LEND2 + MSETL [A0StP], D0.5, D0.6, D0.7 + MOV D1Ar5, D0Ar2 + CMP D1Ar1, D1Ar5 + BLT $Lforwards_copy + SUB D0Ar4, D1Ar1, D1Ar3 + ADD D0Ar4, D0Ar4, #1 + CMP D0Ar2, D0Ar4 + BLT $Lforwards_copy + ! should copy backwards + MOV D1Re0, D0Ar2 + ! adjust pointer to the end of mem + ADD D0Ar2, D1Re0, D1Ar3 + ADD D1Ar1, D1Ar1, D1Ar3 + + MOV A1.2, D0Ar2 + MOV A0.2, D1Ar1 + CMP D1Ar3, #8 + BLT $Lbbyte_loop + + MOV D0Ar4, D0Ar2 + MOV D1Ar5, D1Ar1 + + ! test 8 byte alignment + ANDS D1Ar5, D1Ar5, #7 + BNE $Lbdest_unaligned + + ANDS D0Ar4, D0Ar4, #7 + BNE $Lbsrc_unaligned + + LSR D1Ar5, D1Ar3, #3 + +$Lbaligned_loop: + GETL D0Re0, D1Re0, [--A1.2] + SETL [--A0.2], D0Re0, D1Re0 + SUBS D1Ar5, D1Ar5, #1 + BNE $Lbaligned_loop + + ANDS D1Ar3, D1Ar3, #7 + BZ $Lbbyte_loop_exit +$Lbbyte_loop: + GETB D1Re0, [--A1.2] + SETB [--A0.2], D1Re0 + SUBS D1Ar3, D1Ar3, #1 + BNE $Lbbyte_loop +$Lbbyte_loop_exit: + MOV D0Re0, A0.2 +$LEND: + SUB A0.2, A0StP, #24 + MGETL D0.5, D0.6, D0.7, [A0.2] + SUB A0StP, A0StP, #24 +$LEND2: + MOV PC, D1RtP + +$Lbdest_unaligned: + GETB D0Re0, [--A1.2] + SETB [--A0.2], D0Re0 + SUBS D1Ar5, D1Ar5, #1 + SUB D1Ar3, D1Ar3, #1 + BNE $Lbdest_unaligned + CMP D1Ar3, #8 + BLT $Lbbyte_loop +$Lbsrc_unaligned: + LSR D1Ar5, D1Ar3, #3 + ! adjust A1.2 + MOV D0Ar4, A1.2 + ! save original address + MOV D0Ar6, A1.2 + + ADD D0Ar4, D0Ar4, #7 + ANDMB D0Ar4, D0Ar4, #0xfff8 + ! new address is the 8-byte aligned one above the original + MOV A1.2, D0Ar4 + + ! A0.2 dst 64-bit is aligned + ! 
measure the gap size + SUB D0Ar6, D0Ar4, D0Ar6 + MOVS D0Ar4, D0Ar6 + ! keep this information for the later adjustment + ! both aligned + BZ $Lbaligned_loop + + ! prefetch + GETL D0Re0, D1Re0, [--A1.2] + + CMP D0Ar6, #4 + BLT $Lbunaligned_1_2_3 + ! 32-bit aligned + BZ $Lbaligned_4 + + SUB D0Ar6, D0Ar6, #4 + ! D1.6 stores the gap size in bits + MULW D1.6, D0Ar6, #8 + MOV D0.6, #32 + ! D0.6 stores the complement of the gap size + SUB D0.6, D0.6, D1.6 + +$Lbunaligned_5_6_7: + GETL D0.7, D1.7, [--A1.2] + ! form 64-bit data in D0Re0, D1Re0 + MOV D1Re0, D0Re0 + ! D1Re0 << gap-size + LSL D1Re0, D1Re0, D1.6 + MOV D0Re0, D1.7 + ! D0Re0 >> complement + LSR D0Re0, D0Re0, D0.6 + MOV D1.5, D0Re0 + ! combine the both + ADD D1Re0, D1Re0, D1.5 + + MOV D1.5, D1.7 + LSL D1.5, D1.5, D1.6 + MOV D0Re0, D0.7 + LSR D0Re0, D0Re0, D0.6 + MOV D0.5, D1.5 + ADD D0Re0, D0Re0, D0.5 + + SETL [--A0.2], D0Re0, D1Re0 + MOV D0Re0, D0.7 + MOV D1Re0, D1.7 + SUBS D1Ar5, D1Ar5, #1 + BNE $Lbunaligned_5_6_7 + + ANDS D1Ar3, D1Ar3, #7 + BZ $Lbbyte_loop_exit + ! Adjust A1.2 + ! A1.2 <- A1.2 +8 - gapsize + ADD A1.2, A1.2, #8 + SUB A1.2, A1.2, D0Ar4 + B $Lbbyte_loop + +$Lbunaligned_1_2_3: + MULW D1.6, D0Ar6, #8 + MOV D0.6, #32 + SUB D0.6, D0.6, D1.6 + +$Lbunaligned_1_2_3_loop: + GETL D0.7, D1.7, [--A1.2] + ! form 64-bit data in D0Re0, D1Re0 + LSL D1Re0, D1Re0, D1.6 + ! save D0Re0 for later use + MOV D0.5, D0Re0 + LSR D0Re0, D0Re0, D0.6 + MOV D1.5, D0Re0 + ADD D1Re0, D1Re0, D1.5 + + ! orignal data in D0Re0 + MOV D1.5, D0.5 + LSL D1.5, D1.5, D1.6 + MOV D0Re0, D1.7 + LSR D0Re0, D0Re0, D0.6 + MOV D0.5, D1.5 + ADD D0Re0, D0Re0, D0.5 + + SETL [--A0.2], D0Re0, D1Re0 + MOV D0Re0, D0.7 + MOV D1Re0, D1.7 + SUBS D1Ar5, D1Ar5, #1 + BNE $Lbunaligned_1_2_3_loop + + ANDS D1Ar3, D1Ar3, #7 + BZ $Lbbyte_loop_exit + ! Adjust A1.2 + ADD A1.2, A1.2, #8 + SUB A1.2, A1.2, D0Ar4 + B $Lbbyte_loop + +$Lbaligned_4: + GETL D0.7, D1.7, [--A1.2] + MOV D1Re0, D0Re0 + MOV D0Re0, D1.7 + SETL [--A0.2], D0Re0, D1Re0 + MOV D0Re0, D0.7 + MOV D1Re0, D1.7 + SUBS D1Ar5, D1Ar5, #1 + BNE $Lbaligned_4 + ANDS D1Ar3, D1Ar3, #7 + BZ $Lbbyte_loop_exit + ! Adjust A1.2 + ADD A1.2, A1.2, #8 + SUB A1.2, A1.2, D0Ar4 + B $Lbbyte_loop + +$Lforwards_copy: + MOV A1.2, D0Ar2 + MOV A0.2, D1Ar1 + CMP D1Ar3, #8 + BLT $Lfbyte_loop + + MOV D0Ar4, D0Ar2 + MOV D1Ar5, D1Ar1 + + ANDS D1Ar5, D1Ar5, #7 + BNE $Lfdest_unaligned + + ANDS D0Ar4, D0Ar4, #7 + BNE $Lfsrc_unaligned + + LSR D1Ar5, D1Ar3, #3 + +$Lfaligned_loop: + GETL D0Re0, D1Re0, [A1.2++] + SUBS D1Ar5, D1Ar5, #1 + SETL [A0.2++], D0Re0, D1Re0 + BNE $Lfaligned_loop + + ANDS D1Ar3, D1Ar3, #7 + BZ $Lfbyte_loop_exit +$Lfbyte_loop: + GETB D1Re0, [A1.2++] + SETB [A0.2++], D1Re0 + SUBS D1Ar3, D1Ar3, #1 + BNE $Lfbyte_loop +$Lfbyte_loop_exit: + MOV D0Re0, D1Ar1 + B $LEND + +$Lfdest_unaligned: + GETB D0Re0, [A1.2++] + ADD D1Ar5, D1Ar5, #1 + SUB D1Ar3, D1Ar3, #1 + SETB [A0.2++], D0Re0 + CMP D1Ar5, #8 + BNE $Lfdest_unaligned + CMP D1Ar3, #8 + BLT $Lfbyte_loop +$Lfsrc_unaligned: + ! adjust A1.2 + LSR D1Ar5, D1Ar3, #3 + + MOV D0Ar4, A1.2 + MOV D0Ar6, A1.2 + ANDMB D0Ar4, D0Ar4, #0xfff8 + MOV A1.2, D0Ar4 + + ! A0.2 dst 64-bit is aligned + SUB D0Ar6, D0Ar6, D0Ar4 + ! keep the information for the later adjustment + MOVS D0Ar4, D0Ar6 + + ! both aligned + BZ $Lfaligned_loop + + ! prefetch + GETL D0Re0, D1Re0, [A1.2] + + CMP D0Ar6, #4 + BLT $Lfunaligned_1_2_3 + BZ $Lfaligned_4 + + SUB D0Ar6, D0Ar6, #4 + MULW D0.6, D0Ar6, #8 + MOV D1.6, #32 + SUB D1.6, D1.6, D0.6 + +$Lfunaligned_5_6_7: + GETL D0.7, D1.7, [++A1.2] + ! 
form 64-bit data in D0Re0, D1Re0 + MOV D0Re0, D1Re0 + LSR D0Re0, D0Re0, D0.6 + MOV D1Re0, D0.7 + LSL D1Re0, D1Re0, D1.6 + MOV D0.5, D1Re0 + ADD D0Re0, D0Re0, D0.5 + + MOV D0.5, D0.7 + LSR D0.5, D0.5, D0.6 + MOV D1Re0, D1.7 + LSL D1Re0, D1Re0, D1.6 + MOV D1.5, D0.5 + ADD D1Re0, D1Re0, D1.5 + + SETL [A0.2++], D0Re0, D1Re0 + MOV D0Re0, D0.7 + MOV D1Re0, D1.7 + SUBS D1Ar5, D1Ar5, #1 + BNE $Lfunaligned_5_6_7 + + ANDS D1Ar3, D1Ar3, #7 + BZ $Lfbyte_loop_exit + ! Adjust A1.2 + ADD A1.2, A1.2, D0Ar4 + B $Lfbyte_loop + +$Lfunaligned_1_2_3: + MULW D0.6, D0Ar6, #8 + MOV D1.6, #32 + SUB D1.6, D1.6, D0.6 + +$Lfunaligned_1_2_3_loop: + GETL D0.7, D1.7, [++A1.2] + ! form 64-bit data in D0Re0, D1Re0 + LSR D0Re0, D0Re0, D0.6 + MOV D1.5, D1Re0 + LSL D1Re0, D1Re0, D1.6 + MOV D0.5, D1Re0 + ADD D0Re0, D0Re0, D0.5 + + MOV D0.5, D1.5 + LSR D0.5, D0.5, D0.6 + MOV D1Re0, D0.7 + LSL D1Re0, D1Re0, D1.6 + MOV D1.5, D0.5 + ADD D1Re0, D1Re0, D1.5 + + SETL [A0.2++], D0Re0, D1Re0 + MOV D0Re0, D0.7 + MOV D1Re0, D1.7 + SUBS D1Ar5, D1Ar5, #1 + BNE $Lfunaligned_1_2_3_loop + + ANDS D1Ar3, D1Ar3, #7 + BZ $Lfbyte_loop_exit + ! Adjust A1.2 + ADD A1.2, A1.2, D0Ar4 + B $Lfbyte_loop + +$Lfaligned_4: + GETL D0.7, D1.7, [++A1.2] + MOV D0Re0, D1Re0 + MOV D1Re0, D0.7 + SETL [A0.2++], D0Re0, D1Re0 + MOV D0Re0, D0.7 + MOV D1Re0, D1.7 + SUBS D1Ar5, D1Ar5, #1 + BNE $Lfaligned_4 + ANDS D1Ar3, D1Ar3, #7 + BZ $Lfbyte_loop_exit + ! Adjust A1.2 + ADD A1.2, A1.2, D0Ar4 + B $Lfbyte_loop + + .size _memmove,.-_memmove diff --git a/arch/metag/lib/memset.S b/arch/metag/lib/memset.S new file mode 100644 index 0000000..721085b --- /dev/null +++ b/arch/metag/lib/memset.S @@ -0,0 +1,86 @@ +! Copyright (C) 2008-2012 Imagination Technologies Ltd. + + .text + .global _memset + .type _memset,function +! D1Ar1 dst +! D0Ar2 c +! D1Ar3 cnt +! D0Re0 dst +_memset: + AND D0Ar2,D0Ar2,#0xFF ! Ensure a byte input value + MULW D0Ar2,D0Ar2,#0x0101 ! Duplicate byte value into 0-15 + ANDS D0Ar4,D1Ar1,#7 ! Extract bottom LSBs of dst + LSL D0Re0,D0Ar2,#16 ! Duplicate byte value into 16-31 + ADD A0.2,D0Ar2,D0Re0 ! Duplicate byte value into 4 (A0.2) + MOV D0Re0,D1Ar1 ! Return dst + BZ $LLongStub ! if start address is aligned + ! start address is not aligned on an 8 byte boundary, so we + ! need the number of bytes up to the next 8 byte address + ! boundary, or the length of the string if less than 8, in D1Ar5 + MOV D0Ar2,#8 ! Need 8 - N in D1Ar5 ... + SUB D1Ar5,D0Ar2,D0Ar4 ! ... subtract N + CMP D1Ar3,D1Ar5 + MOVMI D1Ar5,D1Ar3 + B $LByteStub ! dst is mis-aligned, do $LByteStub + +! +! Preamble to LongLoop which generates 4*8 bytes per interation (5 cycles) +! +$LLongStub: + LSRS D0Ar2,D1Ar3,#5 + AND D1Ar3,D1Ar3,#0x1F + MOV A1.2,A0.2 + BEQ $LLongishStub + SUB TXRPT,D0Ar2,#1 + CMP D1Ar3,#0 +$LLongLoop: + SETL [D1Ar1++],A0.2,A1.2 + SETL [D1Ar1++],A0.2,A1.2 + SETL [D1Ar1++],A0.2,A1.2 + SETL [D1Ar1++],A0.2,A1.2 + BR $LLongLoop + BZ $Lexit +! +! Preamble to LongishLoop which generates 1*8 bytes per interation (2 cycles) +! +$LLongishStub: + LSRS D0Ar2,D1Ar3,#3 + AND D1Ar3,D1Ar3,#0x7 + MOV D1Ar5,D1Ar3 + BEQ $LByteStub + SUB TXRPT,D0Ar2,#1 + CMP D1Ar3,#0 +$LLongishLoop: + SETL [D1Ar1++],A0.2,A1.2 + BR $LLongishLoop + BZ $Lexit +! +! This does a byte structured burst of up to 7 bytes +! +! D1Ar1 should point to the location required +! D1Ar3 should be the remaining total byte count +! D1Ar5 should be burst size (<= D1Ar3) +! +$LByteStub: + SUBS D1Ar3,D1Ar3,D1Ar5 ! Reduce count + ADD D1Ar1,D1Ar1,D1Ar5 ! Advance pointer to end of area + MULW D1Ar5,D1Ar5,#4 ! 
Scale to (1*4), (2*4), (3*4) + SUB D1Ar5,D1Ar5,#(8*4) ! Rebase to -(7*4), -(6*4), -(5*4), ... + MOV A1.2,D1Ar5 + SUB PC,CPC1,A1.2 ! Jump into table below + SETB [D1Ar1+#(-7)],A0.2 + SETB [D1Ar1+#(-6)],A0.2 + SETB [D1Ar1+#(-5)],A0.2 + SETB [D1Ar1+#(-4)],A0.2 + SETB [D1Ar1+#(-3)],A0.2 + SETB [D1Ar1+#(-2)],A0.2 + SETB [D1Ar1+#(-1)],A0.2 +! +! Return if all data has been output, otherwise do $LLongStub +! + BNZ $LLongStub +$Lexit: + MOV PC,D1RtP + .size _memset,.-_memset + diff --git a/arch/metag/lib/modsi3.S b/arch/metag/lib/modsi3.S new file mode 100644 index 0000000..210cfa8 --- /dev/null +++ b/arch/metag/lib/modsi3.S @@ -0,0 +1,38 @@ +! Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007 +! Imagination Technologies Ltd +! +! Integer modulus routines. +! +!! +!! 32-bit modulus unsigned i/p - passed unsigned 32-bit numbers +!! + .text + .global ___umodsi3 + .type ___umodsi3,function + .align 2 +___umodsi3: + MOV D0FrT,D1RtP ! Save original return address + CALLR D1RtP,___udivsi3 + MOV D1RtP,D0FrT ! Recover return address + MOV D0Re0,D1Ar1 ! Return remainder + MOV PC,D1RtP + .size ___umodsi3,.-___umodsi3 + +!! +!! 32-bit modulus signed i/p - passed signed 32-bit numbers +!! + .global ___modsi3 + .type ___modsi3,function + .align 2 +___modsi3: + MOV D0FrT,D1RtP ! Save original return address + MOV A0.2,D1Ar1 ! Save A in A0.2 + CALLR D1RtP,___divsi3 + MOV D1RtP,D0FrT ! Recover return address + MOV D1Re0,A0.2 ! Recover A + MOV D0Re0,D1Ar1 ! Return remainder + ORS D1Re0,D1Re0,D1Re0 ! Was A negative? + NEG D1Ar1,D1Ar1 ! Negate remainder + MOVMI D0Re0,D1Ar1 ! Return neg remainder + MOV PC, D1RtP + .size ___modsi3,.-___modsi3 diff --git a/arch/metag/lib/muldi3.S b/arch/metag/lib/muldi3.S new file mode 100644 index 0000000..ee66ca8 --- /dev/null +++ b/arch/metag/lib/muldi3.S @@ -0,0 +1,44 @@ +! Copyright (C) 2012 by Imagination Technologies Ltd. +! +! 64-bit multiply routine. +! + +! +! 64-bit signed/unsigned multiply +! +! A = D1Ar1:D0Ar2 = a 2^48 + b 2^32 + c 2^16 + d 2^0 +! +! B = D1Ar3:D0Ar4 = w 2^48 + x 2^32 + y 2^16 + z 2^0 +! + .text + .global ___muldi3 + .type ___muldi3,function + +___muldi3: + MULD D1Re0,D1Ar1,D0Ar4 ! (a 2^48 + b 2^32)(y 2^16 + z 2^0) + MULD D0Re0,D0Ar2,D1Ar3 ! (w 2^48 + x 2^32)(c 2^16 + d 2^0) + ADD D1Re0,D1Re0,D0Re0 + + MULW D0Re0,D0Ar2,D0Ar4 ! (d 2^0) * (z 2^0) + + RTDW D0Ar2,D0Ar2 + MULW D0Ar6,D0Ar2,D0Ar4 ! (c 2^16)(z 2^0) + LSR D1Ar5,D0Ar6,#16 + LSL D0Ar6,D0Ar6,#16 + ADDS D0Re0,D0Re0,D0Ar6 + ADDCS D1Re0,D1Re0,#1 + RTDW D0Ar4,D0Ar4 + ADD D1Re0,D1Re0,D1Ar5 + + MULW D0Ar6,D0Ar2,D0Ar4 ! (c 2^16)(y 2^16) + ADD D1Re0,D1Re0,D0Ar6 + + RTDW D0Ar2,D0Ar2 + MULW D0Ar6,D0Ar2,D0Ar4 ! (d 2^0)(y 2^16) + LSR D1Ar5,D0Ar6,#16 + LSL D0Ar6,D0Ar6,#16 + ADDS D0Re0,D0Re0,D0Ar6 + ADD D1Re0,D1Re0,D1Ar5 + ADDCS D1Re0,D1Re0,#1 + MOV PC, D1RtP + .size ___muldi3,.-___muldi3 diff --git a/arch/metag/lib/ucmpdi2.S b/arch/metag/lib/ucmpdi2.S new file mode 100644 index 0000000..6f3347f --- /dev/null +++ b/arch/metag/lib/ucmpdi2.S @@ -0,0 +1,27 @@ +! Copyright (C) 2012 by Imagination Technologies Ltd. +! +! 64-bit unsigned compare routine. +! + + .text + .global ___ucmpdi2 + .type ___ucmpdi2,function + +! low high +! u64 a (D0Ar2, D1Ar1) +! u64 b (D0Ar4, D1Ar3) +___ucmpdi2: + ! start at 1 (equal) and conditionally increment or decrement + MOV D0Re0,#1 + + ! high words + CMP D1Ar1,D1Ar3 + ! or if equal, low words + CMPEQ D0Ar2,D0Ar4 + + ! 
unsigned compare + SUBLO D0Re0,D0Re0,#1 + ADDHI D0Re0,D0Re0,#1 + + MOV PC,D1RtP + .size ___ucmpdi2,.-___ucmpdi2 diff --git a/arch/metag/lib/usercopy.c b/arch/metag/lib/usercopy.c new file mode 100644 index 0000000..72a4c6d --- /dev/null +++ b/arch/metag/lib/usercopy.c @@ -0,0 +1,1341 @@ +/* + * User address space access functions. + * The non-inlined parts of asm-metag/uaccess.h are here. + * + * Copyright (C) 2006, Imagination Technologies. + * Copyright (C) 2000, Axis Communications AB. + * + * Written by Hans-Peter Nilsson. + * Pieces used from memcpy, originally by Kenny Ranerup long time ago. + * Modified for Meta by Will Newton. + */ + +#include +#include /* def of L1_CACHE_BYTES */ + +#define USE_RAPF +#define RAPF_MIN_BUF_SIZE (3*L1_CACHE_BYTES) + + +/* The "double write" in this code is because the Meta will not fault + * immediately unless the memory pipe is forced to by e.g. a data stall or + * another memory op. The second write should be discarded by the write + * combiner so should have virtually no cost. + */ + +#define __asm_copy_user_cont(to, from, ret, COPY, FIXUP, TENTRY) \ + __asm__ __volatile__ ( \ + COPY \ + "1:\n" \ + " .section .fixup,\"ax\"\n" \ + " MOV D1Ar1,#0\n" \ + FIXUP \ + " MOVT D1Ar1,#HI(1b)\n" \ + " JUMP D1Ar1,#LO(1b)\n" \ + " .previous\n" \ + " .section __ex_table,\"a\"\n" \ + TENTRY \ + " .previous\n" \ + : "=r" (to), "=r" (from), "=r" (ret) \ + : "0" (to), "1" (from), "2" (ret) \ + : "D1Ar1", "memory") + + +#define __asm_copy_to_user_1(to, from, ret) \ + __asm_copy_user_cont(to, from, ret, \ + " GETB D1Ar1,[%1++]\n" \ + " SETB [%0],D1Ar1\n" \ + "2: SETB [%0++],D1Ar1\n", \ + "3: ADD %2,%2,#1\n", \ + " .long 2b,3b\n") + +#define __asm_copy_to_user_2x_cont(to, from, ret, COPY, FIXUP, TENTRY) \ + __asm_copy_user_cont(to, from, ret, \ + " GETW D1Ar1,[%1++]\n" \ + " SETW [%0],D1Ar1\n" \ + "2: SETW [%0++],D1Ar1\n" COPY, \ + "3: ADD %2,%2,#2\n" FIXUP, \ + " .long 2b,3b\n" TENTRY) + +#define __asm_copy_to_user_2(to, from, ret) \ + __asm_copy_to_user_2x_cont(to, from, ret, "", "", "") + +#define __asm_copy_to_user_3(to, from, ret) \ + __asm_copy_to_user_2x_cont(to, from, ret, \ + " GETB D1Ar1,[%1++]\n" \ + " SETB [%0],D1Ar1\n" \ + "4: SETB [%0++],D1Ar1\n", \ + "5: ADD %2,%2,#1\n", \ + " .long 4b,5b\n") + +#define __asm_copy_to_user_4x_cont(to, from, ret, COPY, FIXUP, TENTRY) \ + __asm_copy_user_cont(to, from, ret, \ + " GETD D1Ar1,[%1++]\n" \ + " SETD [%0],D1Ar1\n" \ + "2: SETD [%0++],D1Ar1\n" COPY, \ + "3: ADD %2,%2,#4\n" FIXUP, \ + " .long 2b,3b\n" TENTRY) + +#define __asm_copy_to_user_4(to, from, ret) \ + __asm_copy_to_user_4x_cont(to, from, ret, "", "", "") + +#define __asm_copy_to_user_5(to, from, ret) \ + __asm_copy_to_user_4x_cont(to, from, ret, \ + " GETB D1Ar1,[%1++]\n" \ + " SETB [%0],D1Ar1\n" \ + "4: SETB [%0++],D1Ar1\n", \ + "5: ADD %2,%2,#1\n", \ + " .long 4b,5b\n") + +#define __asm_copy_to_user_6x_cont(to, from, ret, COPY, FIXUP, TENTRY) \ + __asm_copy_to_user_4x_cont(to, from, ret, \ + " GETW D1Ar1,[%1++]\n" \ + " SETW [%0],D1Ar1\n" \ + "4: SETW [%0++],D1Ar1\n" COPY, \ + "5: ADD %2,%2,#2\n" FIXUP, \ + " .long 4b,5b\n" TENTRY) + +#define __asm_copy_to_user_6(to, from, ret) \ + __asm_copy_to_user_6x_cont(to, from, ret, "", "", "") + +#define __asm_copy_to_user_7(to, from, ret) \ + __asm_copy_to_user_6x_cont(to, from, ret, \ + " GETB D1Ar1,[%1++]\n" \ + " SETB [%0],D1Ar1\n" \ + "6: SETB [%0++],D1Ar1\n", \ + "7: ADD %2,%2,#1\n", \ + " .long 6b,7b\n") + +#define __asm_copy_to_user_8x_cont(to, from, ret, COPY, FIXUP, TENTRY) \ + 
__asm_copy_to_user_4x_cont(to, from, ret, \ + " GETD D1Ar1,[%1++]\n" \ + " SETD [%0],D1Ar1\n" \ + "4: SETD [%0++],D1Ar1\n" COPY, \ + "5: ADD %2,%2,#4\n" FIXUP, \ + " .long 4b,5b\n" TENTRY) + +#define __asm_copy_to_user_8(to, from, ret) \ + __asm_copy_to_user_8x_cont(to, from, ret, "", "", "") + +#define __asm_copy_to_user_9(to, from, ret) \ + __asm_copy_to_user_8x_cont(to, from, ret, \ + " GETB D1Ar1,[%1++]\n" \ + " SETB [%0],D1Ar1\n" \ + "6: SETB [%0++],D1Ar1\n", \ + "7: ADD %2,%2,#1\n", \ + " .long 6b,7b\n") + +#define __asm_copy_to_user_10x_cont(to, from, ret, COPY, FIXUP, TENTRY) \ + __asm_copy_to_user_8x_cont(to, from, ret, \ + " GETW D1Ar1,[%1++]\n" \ + " SETW [%0],D1Ar1\n" \ + "6: SETW [%0++],D1Ar1\n" COPY, \ + "7: ADD %2,%2,#2\n" FIXUP, \ + " .long 6b,7b\n" TENTRY) + +#define __asm_copy_to_user_10(to, from, ret) \ + __asm_copy_to_user_10x_cont(to, from, ret, "", "", "") + +#define __asm_copy_to_user_11(to, from, ret) \ + __asm_copy_to_user_10x_cont(to, from, ret, \ + " GETB D1Ar1,[%1++]\n" \ + " SETB [%0],D1Ar1\n" \ + "8: SETB [%0++],D1Ar1\n", \ + "9: ADD %2,%2,#1\n", \ + " .long 8b,9b\n") + +#define __asm_copy_to_user_12x_cont(to, from, ret, COPY, FIXUP, TENTRY) \ + __asm_copy_to_user_8x_cont(to, from, ret, \ + " GETD D1Ar1,[%1++]\n" \ + " SETD [%0],D1Ar1\n" \ + "6: SETD [%0++],D1Ar1\n" COPY, \ + "7: ADD %2,%2,#4\n" FIXUP, \ + " .long 6b,7b\n" TENTRY) +#define __asm_copy_to_user_12(to, from, ret) \ + __asm_copy_to_user_12x_cont(to, from, ret, "", "", "") + +#define __asm_copy_to_user_13(to, from, ret) \ + __asm_copy_to_user_12x_cont(to, from, ret, \ + " GETB D1Ar1,[%1++]\n" \ + " SETB [%0],D1Ar1\n" \ + "8: SETB [%0++],D1Ar1\n", \ + "9: ADD %2,%2,#1\n", \ + " .long 8b,9b\n") + +#define __asm_copy_to_user_14x_cont(to, from, ret, COPY, FIXUP, TENTRY) \ + __asm_copy_to_user_12x_cont(to, from, ret, \ + " GETW D1Ar1,[%1++]\n" \ + " SETW [%0],D1Ar1\n" \ + "8: SETW [%0++],D1Ar1\n" COPY, \ + "9: ADD %2,%2,#2\n" FIXUP, \ + " .long 8b,9b\n" TENTRY) + +#define __asm_copy_to_user_14(to, from, ret) \ + __asm_copy_to_user_14x_cont(to, from, ret, "", "", "") + +#define __asm_copy_to_user_15(to, from, ret) \ + __asm_copy_to_user_14x_cont(to, from, ret, \ + " GETB D1Ar1,[%1++]\n" \ + " SETB [%0],D1Ar1\n" \ + "10: SETB [%0++],D1Ar1\n", \ + "11: ADD %2,%2,#1\n", \ + " .long 10b,11b\n") + +#define __asm_copy_to_user_16x_cont(to, from, ret, COPY, FIXUP, TENTRY) \ + __asm_copy_to_user_12x_cont(to, from, ret, \ + " GETD D1Ar1,[%1++]\n" \ + " SETD [%0],D1Ar1\n" \ + "8: SETD [%0++],D1Ar1\n" COPY, \ + "9: ADD %2,%2,#4\n" FIXUP, \ + " .long 8b,9b\n" TENTRY) + +#define __asm_copy_to_user_16(to, from, ret) \ + __asm_copy_to_user_16x_cont(to, from, ret, "", "", "") + +#define __asm_copy_to_user_8x64(to, from, ret) \ + __asm__ __volatile__ ( \ + " GETL D0Ar2,D1Ar1,[%1++]\n" \ + " SETL [%0],D0Ar2,D1Ar1\n" \ + "2: SETL [%0++],D0Ar2,D1Ar1\n" \ + "1:\n" \ + " .section .fixup,\"ax\"\n" \ + "3: ADD %2,%2,#8\n" \ + " MOVT D0Ar2,#HI(1b)\n" \ + " JUMP D0Ar2,#LO(1b)\n" \ + " .previous\n" \ + " .section __ex_table,\"a\"\n" \ + " .long 2b,3b\n" \ + " .previous\n" \ + : "=r" (to), "=r" (from), "=r" (ret) \ + : "0" (to), "1" (from), "2" (ret) \ + : "D1Ar1", "D0Ar2", "memory") + +/* + * optimized copying loop using RAPF when 64 bit aligned + * + * n will be automatically decremented inside the loop + * ret will be left intact. if error occurs we will rewind + * so that the original non optimized code will fill up + * this value correctly. 
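+ * (On such a fault the fixup code winds 'to', 'from' and 'n' back to the
+ * last position known to be consistent and execution falls through to the
+ * non-optimised byte/word copy below, which completes the transfer and
+ * accounts for the failing bytes in 'ret'.)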
+ * + * on fault: + * > n will hold total number of uncopied bytes + * + * > {'to','from'} will be rewind back so that + * the non-optimized code will do the proper fix up + * + * DCACHE drops the cacheline which helps in reducing cache + * pollution. + * + * We introduce an extra SETL at the end of the loop to + * ensure we don't fall off the loop before we catch all + * erros. + * + * NOTICE: + * LSM_STEP in TXSTATUS must be cleared in fix up code. + * since we're using M{S,G}ETL, a fault might happen at + * any address in the middle of M{S,G}ETL causing + * the value of LSM_STEP to be incorrect which can + * cause subsequent use of M{S,G}ET{L,D} to go wrong. + * ie: if LSM_STEP was 1 when a fault occurs, the + * next call to M{S,G}ET{L,D} will skip the first + * copy/getting as it think that the first 1 has already + * been done. + * + */ +#define __asm_copy_user_64bit_rapf_loop( \ + to, from, ret, n, id, FIXUP) \ + __asm__ __volatile__ ( \ + ".balign 8\n" \ + "MOV RAPF, %1\n" \ + "MSETL [A0StP++], D0Ar6, D0FrT, D0.5, D0.6, D0.7\n" \ + "MOV D0Ar6, #0\n" \ + "LSR D1Ar5, %3, #6\n" \ + "SUB TXRPT, D1Ar5, #2\n" \ + "MOV RAPF, %1\n" \ + "$Lloop"id":\n" \ + "ADD RAPF, %1, #64\n" \ + "21:\n" \ + "MGETL D0FrT, D0.5, D0.6, D0.7, [%1++]\n" \ + "22:\n" \ + "MSETL [%0++], D0FrT, D0.5, D0.6, D0.7\n" \ + "SUB %3, %3, #32\n" \ + "23:\n" \ + "MGETL D0FrT, D0.5, D0.6, D0.7, [%1++]\n" \ + "24:\n" \ + "MSETL [%0++], D0FrT, D0.5, D0.6, D0.7\n" \ + "SUB %3, %3, #32\n" \ + "DCACHE [%1+#-64], D0Ar6\n" \ + "BR $Lloop"id"\n" \ + \ + "MOV RAPF, %1\n" \ + "25:\n" \ + "MGETL D0FrT, D0.5, D0.6, D0.7, [%1++]\n" \ + "26:\n" \ + "MSETL [%0++], D0FrT, D0.5, D0.6, D0.7\n" \ + "SUB %3, %3, #32\n" \ + "27:\n" \ + "MGETL D0FrT, D0.5, D0.6, D0.7, [%1++]\n" \ + "28:\n" \ + "MSETL [%0++], D0FrT, D0.5, D0.6, D0.7\n" \ + "SUB %0, %0, #8\n" \ + "29:\n" \ + "SETL [%0++], D0.7, D1.7\n" \ + "SUB %3, %3, #32\n" \ + "1:" \ + "DCACHE [%1+#-64], D0Ar6\n" \ + "GETL D0Ar6, D1Ar5, [A0StP+#-40]\n" \ + "GETL D0FrT, D1RtP, [A0StP+#-32]\n" \ + "GETL D0.5, D1.5, [A0StP+#-24]\n" \ + "GETL D0.6, D1.6, [A0StP+#-16]\n" \ + "GETL D0.7, D1.7, [A0StP+#-8]\n" \ + "SUB A0StP, A0StP, #40\n" \ + " .section .fixup,\"ax\"\n" \ + "4:\n" \ + " ADD %0, %0, #8\n" \ + "3:\n" \ + " MOV D0Ar2, TXSTATUS\n" \ + " MOV D1Ar1, TXSTATUS\n" \ + " AND D1Ar1, D1Ar1, #0xFFFFF8FF\n" \ + " MOV TXSTATUS, D1Ar1\n" \ + FIXUP \ + " MOVT D0Ar2,#HI(1b)\n" \ + " JUMP D0Ar2,#LO(1b)\n" \ + " .previous\n" \ + " .section __ex_table,\"a\"\n" \ + " .long 21b,3b\n" \ + " .long 22b,3b\n" \ + " .long 23b,3b\n" \ + " .long 24b,3b\n" \ + " .long 25b,3b\n" \ + " .long 26b,3b\n" \ + " .long 27b,3b\n" \ + " .long 28b,3b\n" \ + " .long 29b,4b\n" \ + " .previous\n" \ + : "=r" (to), "=r" (from), "=r" (ret), "=d" (n) \ + : "0" (to), "1" (from), "2" (ret), "3" (n) \ + : "D1Ar1", "D0Ar2", "memory") + +/* rewind 'to' and 'from' pointers when a fault occurs + * + * Rationale: + * A fault always occurs on writing to user buffer. A fault + * is at a single address, so we need to rewind by only 4 + * bytes. + * Since we do a complete read from kernel buffer before + * writing, we need to rewind it also. The amount to be + * rewind equals the number of faulty writes in MSETD + * which is: [4 - (LSM_STEP-1)]*8 + * LSM_STEP is bits 10:8 in TXSTATUS which is already read + * and stored in D0Ar2 + * + * NOTE: If a fault occurs at the last operation in M{G,S}ETL + * LSM_STEP will be 0. ie: we do 4 writes in our case, if + * a fault happens at the 4th write, LSM_STEP will be 0 + * instead of 4. 
The code copes with that. + * + * n is updated by the number of successful writes, which is: + * n = n - (LSM_STEP-1)*8 + */ +#define __asm_copy_to_user_64bit_rapf_loop(to, from, ret, n, id)\ + __asm_copy_user_64bit_rapf_loop(to, from, ret, n, id, \ + "LSR D0Ar2, D0Ar2, #8\n" \ + "AND D0Ar2, D0Ar2, #0x7\n" \ + "ADDZ D0Ar2, D0Ar2, #4\n" \ + "SUB D0Ar2, D0Ar2, #1\n" \ + "MOV D1Ar1, #4\n" \ + "SUB D0Ar2, D1Ar1, D0Ar2\n" \ + "LSL D0Ar2, D0Ar2, #3\n" \ + "LSL D1Ar1, D1Ar1, #3\n" \ + "SUB D1Ar1, D1Ar1, D0Ar2\n" \ + "SUB %0, %0, #8\n" \ + "SUB %1, %1,D0Ar2\n" \ + "SUB %3, %3, D1Ar1\n") + +/* + * optimized copying loop using RAPF when 32 bit aligned + * + * n will be automatically decremented inside the loop + * ret will be left intact. if error occurs we will rewind + * so that the original non optimized code will fill up + * this value correctly. + * + * on fault: + * > n will hold total number of uncopied bytes + * + * > {'to','from'} will be rewind back so that + * the non-optimized code will do the proper fix up + * + * DCACHE drops the cacheline which helps in reducing cache + * pollution. + * + * We introduce an extra SETD at the end of the loop to + * ensure we don't fall off the loop before we catch all + * erros. + * + * NOTICE: + * LSM_STEP in TXSTATUS must be cleared in fix up code. + * since we're using M{S,G}ETL, a fault might happen at + * any address in the middle of M{S,G}ETL causing + * the value of LSM_STEP to be incorrect which can + * cause subsequent use of M{S,G}ET{L,D} to go wrong. + * ie: if LSM_STEP was 1 when a fault occurs, the + * next call to M{S,G}ET{L,D} will skip the first + * copy/getting as it think that the first 1 has already + * been done. + * + */ +#define __asm_copy_user_32bit_rapf_loop( \ + to, from, ret, n, id, FIXUP) \ + __asm__ __volatile__ ( \ + ".balign 8\n" \ + "MOV RAPF, %1\n" \ + "MSETL [A0StP++], D0Ar6, D0FrT, D0.5, D0.6, D0.7\n" \ + "MOV D0Ar6, #0\n" \ + "LSR D1Ar5, %3, #6\n" \ + "SUB TXRPT, D1Ar5, #2\n" \ + "MOV RAPF, %1\n" \ + "$Lloop"id":\n" \ + "ADD RAPF, %1, #64\n" \ + "21:\n" \ + "MGETD D0FrT, D0.5, D0.6, D0.7, [%1++]\n" \ + "22:\n" \ + "MSETD [%0++], D0FrT, D0.5, D0.6, D0.7\n" \ + "SUB %3, %3, #16\n" \ + "23:\n" \ + "MGETD D0FrT, D0.5, D0.6, D0.7, [%1++]\n" \ + "24:\n" \ + "MSETD [%0++], D0FrT, D0.5, D0.6, D0.7\n" \ + "SUB %3, %3, #16\n" \ + "25:\n" \ + "MGETD D0FrT, D0.5, D0.6, D0.7, [%1++]\n" \ + "26:\n" \ + "MSETD [%0++], D0FrT, D0.5, D0.6, D0.7\n" \ + "SUB %3, %3, #16\n" \ + "27:\n" \ + "MGETD D0FrT, D0.5, D0.6, D0.7, [%1++]\n" \ + "28:\n" \ + "MSETD [%0++], D0FrT, D0.5, D0.6, D0.7\n" \ + "SUB %3, %3, #16\n" \ + "DCACHE [%1+#-64], D0Ar6\n" \ + "BR $Lloop"id"\n" \ + \ + "MOV RAPF, %1\n" \ + "29:\n" \ + "MGETD D0FrT, D0.5, D0.6, D0.7, [%1++]\n" \ + "30:\n" \ + "MSETD [%0++], D0FrT, D0.5, D0.6, D0.7\n" \ + "SUB %3, %3, #16\n" \ + "31:\n" \ + "MGETD D0FrT, D0.5, D0.6, D0.7, [%1++]\n" \ + "32:\n" \ + "MSETD [%0++], D0FrT, D0.5, D0.6, D0.7\n" \ + "SUB %3, %3, #16\n" \ + "33:\n" \ + "MGETD D0FrT, D0.5, D0.6, D0.7, [%1++]\n" \ + "34:\n" \ + "MSETD [%0++], D0FrT, D0.5, D0.6, D0.7\n" \ + "SUB %3, %3, #16\n" \ + "35:\n" \ + "MGETD D0FrT, D0.5, D0.6, D0.7, [%1++]\n" \ + "36:\n" \ + "MSETD [%0++], D0FrT, D0.5, D0.6, D0.7\n" \ + "SUB %0, %0, #4\n" \ + "37:\n" \ + "SETD [%0++], D0.7\n" \ + "SUB %3, %3, #16\n" \ + "1:" \ + "DCACHE [%1+#-64], D0Ar6\n" \ + "GETL D0Ar6, D1Ar5, [A0StP+#-40]\n" \ + "GETL D0FrT, D1RtP, [A0StP+#-32]\n" \ + "GETL D0.5, D1.5, [A0StP+#-24]\n" \ + "GETL D0.6, D1.6, [A0StP+#-16]\n" \ + "GETL D0.7, D1.7, [A0StP+#-8]\n" \ + "SUB 
A0StP, A0StP, #40\n" \ + " .section .fixup,\"ax\"\n" \ + "4:\n" \ + " ADD %0, %0, #4\n" \ + "3:\n" \ + " MOV D0Ar2, TXSTATUS\n" \ + " MOV D1Ar1, TXSTATUS\n" \ + " AND D1Ar1, D1Ar1, #0xFFFFF8FF\n" \ + " MOV TXSTATUS, D1Ar1\n" \ + FIXUP \ + " MOVT D0Ar2,#HI(1b)\n" \ + " JUMP D0Ar2,#LO(1b)\n" \ + " .previous\n" \ + " .section __ex_table,\"a\"\n" \ + " .long 21b,3b\n" \ + " .long 22b,3b\n" \ + " .long 23b,3b\n" \ + " .long 24b,3b\n" \ + " .long 25b,3b\n" \ + " .long 26b,3b\n" \ + " .long 27b,3b\n" \ + " .long 28b,3b\n" \ + " .long 29b,3b\n" \ + " .long 30b,3b\n" \ + " .long 31b,3b\n" \ + " .long 32b,3b\n" \ + " .long 33b,3b\n" \ + " .long 34b,3b\n" \ + " .long 35b,3b\n" \ + " .long 36b,3b\n" \ + " .long 37b,4b\n" \ + " .previous\n" \ + : "=r" (to), "=r" (from), "=r" (ret), "=d" (n) \ + : "0" (to), "1" (from), "2" (ret), "3" (n) \ + : "D1Ar1", "D0Ar2", "memory") + +/* rewind 'to' and 'from' pointers when a fault occurs + * + * Rationale: + * A fault always occurs on writing to user buffer. A fault + * is at a single address, so we need to rewind by only 4 + * bytes. + * Since we do a complete read from kernel buffer before + * writing, we need to rewind it also. The amount to be + * rewind equals the number of faulty writes in MSETD + * which is: [4 - (LSM_STEP-1)]*4 + * LSM_STEP is bits 10:8 in TXSTATUS which is already read + * and stored in D0Ar2 + * + * NOTE: If a fault occurs at the last operation in M{G,S}ETL + * LSM_STEP will be 0. ie: we do 4 writes in our case, if + * a fault happens at the 4th write, LSM_STEP will be 0 + * instead of 4. The code copes with that. + * + * n is updated by the number of successful writes, which is: + * n = n - (LSM_STEP-1)*4 + */ +#define __asm_copy_to_user_32bit_rapf_loop(to, from, ret, n, id)\ + __asm_copy_user_32bit_rapf_loop(to, from, ret, n, id, \ + "LSR D0Ar2, D0Ar2, #8\n" \ + "AND D0Ar2, D0Ar2, #0x7\n" \ + "ADDZ D0Ar2, D0Ar2, #4\n" \ + "SUB D0Ar2, D0Ar2, #1\n" \ + "MOV D1Ar1, #4\n" \ + "SUB D0Ar2, D1Ar1, D0Ar2\n" \ + "LSL D0Ar2, D0Ar2, #2\n" \ + "LSL D1Ar1, D1Ar1, #2\n" \ + "SUB D1Ar1, D1Ar1, D0Ar2\n" \ + "SUB %0, %0, #4\n" \ + "SUB %1, %1, D0Ar2\n" \ + "SUB %3, %3, D1Ar1\n") + +unsigned long __copy_user(void __user *pdst, const void *psrc, + unsigned long n) +{ + register char __user *dst __asm__ ("A0.2") = pdst; + register const char *src __asm__ ("A1.2") = psrc; + unsigned long retn = 0; + + if (n == 0) + return 0; + + if ((unsigned long) src & 1) { + __asm_copy_to_user_1(dst, src, retn); + n--; + } + if ((unsigned long) dst & 1) { + /* Worst case - byte copy */ + while (n > 0) { + __asm_copy_to_user_1(dst, src, retn); + n--; + } + } + if (((unsigned long) src & 2) && n >= 2) { + __asm_copy_to_user_2(dst, src, retn); + n -= 2; + } + if ((unsigned long) dst & 2) { + /* Second worst case - word copy */ + while (n >= 2) { + __asm_copy_to_user_2(dst, src, retn); + n -= 2; + } + } + +#ifdef USE_RAPF + /* 64 bit copy loop */ + if (!(((unsigned long) src | (__force unsigned long) dst) & 7)) { + if (n >= RAPF_MIN_BUF_SIZE) { + /* copy user using 64 bit rapf copy */ + __asm_copy_to_user_64bit_rapf_loop(dst, src, retn, + n, "64cu"); + } + while (n >= 8) { + __asm_copy_to_user_8x64(dst, src, retn); + n -= 8; + } + } + if (n >= RAPF_MIN_BUF_SIZE) { + /* copy user using 32 bit rapf copy */ + __asm_copy_to_user_32bit_rapf_loop(dst, src, retn, n, "32cu"); + } +#else + /* 64 bit copy loop */ + if (!(((unsigned long) src | (__force unsigned long) dst) & 7)) { + while (n >= 8) { + __asm_copy_to_user_8x64(dst, src, retn); + n -= 8; + } + } +#endif + + while (n >= 
16) { + __asm_copy_to_user_16(dst, src, retn); + n -= 16; + } + + while (n >= 4) { + __asm_copy_to_user_4(dst, src, retn); + n -= 4; + } + + switch (n) { + case 0: + break; + case 1: + __asm_copy_to_user_1(dst, src, retn); + break; + case 2: + __asm_copy_to_user_2(dst, src, retn); + break; + case 3: + __asm_copy_to_user_3(dst, src, retn); + break; + } + + return retn; +} + +#define __asm_copy_from_user_1(to, from, ret) \ + __asm_copy_user_cont(to, from, ret, \ + " GETB D1Ar1,[%1++]\n" \ + "2: SETB [%0++],D1Ar1\n", \ + "3: ADD %2,%2,#1\n" \ + " SETB [%0++],D1Ar1\n", \ + " .long 2b,3b\n") + +#define __asm_copy_from_user_2x_cont(to, from, ret, COPY, FIXUP, TENTRY) \ + __asm_copy_user_cont(to, from, ret, \ + " GETW D1Ar1,[%1++]\n" \ + "2: SETW [%0++],D1Ar1\n" COPY, \ + "3: ADD %2,%2,#2\n" \ + " SETW [%0++],D1Ar1\n" FIXUP, \ + " .long 2b,3b\n" TENTRY) + +#define __asm_copy_from_user_2(to, from, ret) \ + __asm_copy_from_user_2x_cont(to, from, ret, "", "", "") + +#define __asm_copy_from_user_3(to, from, ret) \ + __asm_copy_from_user_2x_cont(to, from, ret, \ + " GETB D1Ar1,[%1++]\n" \ + "4: SETB [%0++],D1Ar1\n", \ + "5: ADD %2,%2,#1\n" \ + " SETB [%0++],D1Ar1\n", \ + " .long 4b,5b\n") + +#define __asm_copy_from_user_4x_cont(to, from, ret, COPY, FIXUP, TENTRY) \ + __asm_copy_user_cont(to, from, ret, \ + " GETD D1Ar1,[%1++]\n" \ + "2: SETD [%0++],D1Ar1\n" COPY, \ + "3: ADD %2,%2,#4\n" \ + " SETD [%0++],D1Ar1\n" FIXUP, \ + " .long 2b,3b\n" TENTRY) + +#define __asm_copy_from_user_4(to, from, ret) \ + __asm_copy_from_user_4x_cont(to, from, ret, "", "", "") + +#define __asm_copy_from_user_5(to, from, ret) \ + __asm_copy_from_user_4x_cont(to, from, ret, \ + " GETB D1Ar1,[%1++]\n" \ + "4: SETB [%0++],D1Ar1\n", \ + "5: ADD %2,%2,#1\n" \ + " SETB [%0++],D1Ar1\n", \ + " .long 4b,5b\n") + +#define __asm_copy_from_user_6x_cont(to, from, ret, COPY, FIXUP, TENTRY) \ + __asm_copy_from_user_4x_cont(to, from, ret, \ + " GETW D1Ar1,[%1++]\n" \ + "4: SETW [%0++],D1Ar1\n" COPY, \ + "5: ADD %2,%2,#2\n" \ + " SETW [%0++],D1Ar1\n" FIXUP, \ + " .long 4b,5b\n" TENTRY) + +#define __asm_copy_from_user_6(to, from, ret) \ + __asm_copy_from_user_6x_cont(to, from, ret, "", "", "") + +#define __asm_copy_from_user_7(to, from, ret) \ + __asm_copy_from_user_6x_cont(to, from, ret, \ + " GETB D1Ar1,[%1++]\n" \ + "6: SETB [%0++],D1Ar1\n", \ + "7: ADD %2,%2,#1\n" \ + " SETB [%0++],D1Ar1\n", \ + " .long 6b,7b\n") + +#define __asm_copy_from_user_8x_cont(to, from, ret, COPY, FIXUP, TENTRY) \ + __asm_copy_from_user_4x_cont(to, from, ret, \ + " GETD D1Ar1,[%1++]\n" \ + "4: SETD [%0++],D1Ar1\n" COPY, \ + "5: ADD %2,%2,#4\n" \ + " SETD [%0++],D1Ar1\n" FIXUP, \ + " .long 4b,5b\n" TENTRY) + +#define __asm_copy_from_user_8(to, from, ret) \ + __asm_copy_from_user_8x_cont(to, from, ret, "", "", "") + +#define __asm_copy_from_user_9(to, from, ret) \ + __asm_copy_from_user_8x_cont(to, from, ret, \ + " GETB D1Ar1,[%1++]\n" \ + "6: SETB [%0++],D1Ar1\n", \ + "7: ADD %2,%2,#1\n" \ + " SETB [%0++],D1Ar1\n", \ + " .long 6b,7b\n") + +#define __asm_copy_from_user_10x_cont(to, from, ret, COPY, FIXUP, TENTRY) \ + __asm_copy_from_user_8x_cont(to, from, ret, \ + " GETW D1Ar1,[%1++]\n" \ + "6: SETW [%0++],D1Ar1\n" COPY, \ + "7: ADD %2,%2,#2\n" \ + " SETW [%0++],D1Ar1\n" FIXUP, \ + " .long 6b,7b\n" TENTRY) + +#define __asm_copy_from_user_10(to, from, ret) \ + __asm_copy_from_user_10x_cont(to, from, ret, "", "", "") + +#define __asm_copy_from_user_11(to, from, ret) \ + __asm_copy_from_user_10x_cont(to, from, ret, \ + " GETB D1Ar1,[%1++]\n" \ + "8: SETB 
[%0++],D1Ar1\n", \ + "9: ADD %2,%2,#1\n" \ + " SETB [%0++],D1Ar1\n", \ + " .long 8b,9b\n") + +#define __asm_copy_from_user_12x_cont(to, from, ret, COPY, FIXUP, TENTRY) \ + __asm_copy_from_user_8x_cont(to, from, ret, \ + " GETD D1Ar1,[%1++]\n" \ + "6: SETD [%0++],D1Ar1\n" COPY, \ + "7: ADD %2,%2,#4\n" \ + " SETD [%0++],D1Ar1\n" FIXUP, \ + " .long 6b,7b\n" TENTRY) + +#define __asm_copy_from_user_12(to, from, ret) \ + __asm_copy_from_user_12x_cont(to, from, ret, "", "", "") + +#define __asm_copy_from_user_13(to, from, ret) \ + __asm_copy_from_user_12x_cont(to, from, ret, \ + " GETB D1Ar1,[%1++]\n" \ + "8: SETB [%0++],D1Ar1\n", \ + "9: ADD %2,%2,#1\n" \ + " SETB [%0++],D1Ar1\n", \ + " .long 8b,9b\n") + +#define __asm_copy_from_user_14x_cont(to, from, ret, COPY, FIXUP, TENTRY) \ + __asm_copy_from_user_12x_cont(to, from, ret, \ + " GETW D1Ar1,[%1++]\n" \ + "8: SETW [%0++],D1Ar1\n" COPY, \ + "9: ADD %2,%2,#2\n" \ + " SETW [%0++],D1Ar1\n" FIXUP, \ + " .long 8b,9b\n" TENTRY) + +#define __asm_copy_from_user_14(to, from, ret) \ + __asm_copy_from_user_14x_cont(to, from, ret, "", "", "") + +#define __asm_copy_from_user_15(to, from, ret) \ + __asm_copy_from_user_14x_cont(to, from, ret, \ + " GETB D1Ar1,[%1++]\n" \ + "10: SETB [%0++],D1Ar1\n", \ + "11: ADD %2,%2,#1\n" \ + " SETB [%0++],D1Ar1\n", \ + " .long 10b,11b\n") + +#define __asm_copy_from_user_16x_cont(to, from, ret, COPY, FIXUP, TENTRY) \ + __asm_copy_from_user_12x_cont(to, from, ret, \ + " GETD D1Ar1,[%1++]\n" \ + "8: SETD [%0++],D1Ar1\n" COPY, \ + "9: ADD %2,%2,#4\n" \ + " SETD [%0++],D1Ar1\n" FIXUP, \ + " .long 8b,9b\n" TENTRY) + +#define __asm_copy_from_user_16(to, from, ret) \ + __asm_copy_from_user_16x_cont(to, from, ret, "", "", "") + +#define __asm_copy_from_user_8x64(to, from, ret) \ + __asm__ __volatile__ ( \ + " GETL D0Ar2,D1Ar1,[%1++]\n" \ + "2: SETL [%0++],D0Ar2,D1Ar1\n" \ + "1:\n" \ + " .section .fixup,\"ax\"\n" \ + " MOV D1Ar1,#0\n" \ + " MOV D0Ar2,#0\n" \ + "3: ADD %2,%2,#8\n" \ + " SETL [%0++],D0Ar2,D1Ar1\n" \ + " MOVT D0Ar2,#HI(1b)\n" \ + " JUMP D0Ar2,#LO(1b)\n" \ + " .previous\n" \ + " .section __ex_table,\"a\"\n" \ + " .long 2b,3b\n" \ + " .previous\n" \ + : "=a" (to), "=r" (from), "=r" (ret) \ + : "0" (to), "1" (from), "2" (ret) \ + : "D1Ar1", "D0Ar2", "memory") + +/* rewind 'from' pointer when a fault occurs + * + * Rationale: + * A fault occurs while reading from user buffer, which is the + * source. Since the fault is at a single address, we only + * need to rewind by 8 bytes. + * Since we don't write to kernel buffer until we read first, + * the kernel buffer is at the right state and needn't be + * corrected. + */ +#define __asm_copy_from_user_64bit_rapf_loop(to, from, ret, n, id) \ + __asm_copy_user_64bit_rapf_loop(to, from, ret, n, id, \ + "SUB %1, %1, #8\n") + +/* rewind 'from' pointer when a fault occurs + * + * Rationale: + * A fault occurs while reading from user buffer, which is the + * source. Since the fault is at a single address, we only + * need to rewind by 4 bytes. + * Since we don't write to kernel buffer until we read first, + * the kernel buffer is at the right state and needn't be + * corrected. + */ +#define __asm_copy_from_user_32bit_rapf_loop(to, from, ret, n, id) \ + __asm_copy_user_32bit_rapf_loop(to, from, ret, n, id, \ + "SUB %1, %1, #4\n") + + +/* Copy from user to kernel, zeroing the bytes that were inaccessible in + userland. The return-value is the number of bytes that were + inaccessible. 
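+   If a fault stops the copy early, the part of the kernel buffer that
+   could not be filled is cleared to zero and included in that count.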
*/ +unsigned long __copy_user_zeroing(void *pdst, const void __user *psrc, + unsigned long n) +{ + register char *dst __asm__ ("A0.2") = pdst; + register const char __user *src __asm__ ("A1.2") = psrc; + unsigned long retn = 0; + + if (n == 0) + return 0; + + if ((unsigned long) src & 1) { + __asm_copy_from_user_1(dst, src, retn); + n--; + } + if ((unsigned long) dst & 1) { + /* Worst case - byte copy */ + while (n > 0) { + __asm_copy_from_user_1(dst, src, retn); + n--; + if (retn) + goto copy_exception_bytes; + } + } + if (((unsigned long) src & 2) && n >= 2) { + __asm_copy_from_user_2(dst, src, retn); + n -= 2; + } + if ((unsigned long) dst & 2) { + /* Second worst case - word copy */ + while (n >= 2) { + __asm_copy_from_user_2(dst, src, retn); + n -= 2; + if (retn) + goto copy_exception_bytes; + } + } + + /* We only need one check after the unalignment-adjustments, + because if both adjustments were done, either both or + neither reference had an exception. */ + if (retn != 0) + goto copy_exception_bytes; + +#ifdef USE_RAPF + /* 64 bit copy loop */ + if (!(((unsigned long) src | (unsigned long) dst) & 7)) { + if (n >= RAPF_MIN_BUF_SIZE) { + /* Copy using fast 64bit rapf */ + __asm_copy_from_user_64bit_rapf_loop(dst, src, retn, + n, "64cuz"); + } + while (n >= 8) { + __asm_copy_from_user_8x64(dst, src, retn); + n -= 8; + if (retn) + goto copy_exception_bytes; + } + } + + if (n >= RAPF_MIN_BUF_SIZE) { + /* Copy using fast 32bit rapf */ + __asm_copy_from_user_32bit_rapf_loop(dst, src, retn, + n, "32cuz"); + } +#else + /* 64 bit copy loop */ + if (!(((unsigned long) src | (unsigned long) dst) & 7)) { + while (n >= 8) { + __asm_copy_from_user_8x64(dst, src, retn); + n -= 8; + if (retn) + goto copy_exception_bytes; + } + } +#endif + + while (n >= 4) { + __asm_copy_from_user_4(dst, src, retn); + n -= 4; + + if (retn) + goto copy_exception_bytes; + } + + /* If we get here, there were no memory read faults. */ + switch (n) { + /* These copies are at least "naturally aligned" (so we don't + have to check each byte), due to the src alignment code. + The *_3 case *will* get the correct count for retn. */ + case 0: + /* This case deliberately left in (if you have doubts check the + generated assembly code). */ + break; + case 1: + __asm_copy_from_user_1(dst, src, retn); + break; + case 2: + __asm_copy_from_user_2(dst, src, retn); + break; + case 3: + __asm_copy_from_user_3(dst, src, retn); + break; + } + + /* If we get here, retn correctly reflects the number of failing + bytes. */ + return retn; + + copy_exception_bytes: + /* We already have "retn" bytes cleared, and need to clear the + remaining "n" bytes. A non-optimized simple byte-for-byte in-line + memset is preferred here, since this isn't speed-critical code and + we'd rather have this a leaf-function than calling memset. */ + { + char *endp; + for (endp = dst + n; dst < endp; dst++) + *dst = 0; + } + + return retn + n; +} + +#define __asm_clear_8x64(to, ret) \ + __asm__ __volatile__ ( \ + " MOV D0Ar2,#0\n" \ + " MOV D1Ar1,#0\n" \ + " SETL [%0],D0Ar2,D1Ar1\n" \ + "2: SETL [%0++],D0Ar2,D1Ar1\n" \ + "1:\n" \ + " .section .fixup,\"ax\"\n" \ + "3: ADD %1,%1,#8\n" \ + " MOVT D0Ar2,#HI(1b)\n" \ + " JUMP D0Ar2,#LO(1b)\n" \ + " .previous\n" \ + " .section __ex_table,\"a\"\n" \ + " .long 2b,3b\n" \ + " .previous\n" \ + : "=r" (to), "=r" (ret) \ + : "0" (to), "1" (ret) \ + : "D1Ar1", "D0Ar2", "memory") + +/* Zero userspace. 
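+   __do_clear_user() below writes zero bytes to the user buffer and
+   returns the number of bytes that could not be cleared.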
*/ + +#define __asm_clear(to, ret, CLEAR, FIXUP, TENTRY) \ + __asm__ __volatile__ ( \ + " MOV D1Ar1,#0\n" \ + CLEAR \ + "1:\n" \ + " .section .fixup,\"ax\"\n" \ + FIXUP \ + " MOVT D1Ar1,#HI(1b)\n" \ + " JUMP D1Ar1,#LO(1b)\n" \ + " .previous\n" \ + " .section __ex_table,\"a\"\n" \ + TENTRY \ + " .previous" \ + : "=r" (to), "=r" (ret) \ + : "0" (to), "1" (ret) \ + : "D1Ar1", "memory") + +#define __asm_clear_1(to, ret) \ + __asm_clear(to, ret, \ + " SETB [%0],D1Ar1\n" \ + "2: SETB [%0++],D1Ar1\n", \ + "3: ADD %1,%1,#1\n", \ + " .long 2b,3b\n") + +#define __asm_clear_2(to, ret) \ + __asm_clear(to, ret, \ + " SETW [%0],D1Ar1\n" \ + "2: SETW [%0++],D1Ar1\n", \ + "3: ADD %1,%1,#2\n", \ + " .long 2b,3b\n") + +#define __asm_clear_3(to, ret) \ + __asm_clear(to, ret, \ + "2: SETW [%0++],D1Ar1\n" \ + " SETB [%0],D1Ar1\n" \ + "3: SETB [%0++],D1Ar1\n", \ + "4: ADD %1,%1,#2\n" \ + "5: ADD %1,%1,#1\n", \ + " .long 2b,4b\n" \ + " .long 3b,5b\n") + +#define __asm_clear_4x_cont(to, ret, CLEAR, FIXUP, TENTRY) \ + __asm_clear(to, ret, \ + " SETD [%0],D1Ar1\n" \ + "2: SETD [%0++],D1Ar1\n" CLEAR, \ + "3: ADD %1,%1,#4\n" FIXUP, \ + " .long 2b,3b\n" TENTRY) + +#define __asm_clear_4(to, ret) \ + __asm_clear_4x_cont(to, ret, "", "", "") + +#define __asm_clear_8x_cont(to, ret, CLEAR, FIXUP, TENTRY) \ + __asm_clear_4x_cont(to, ret, \ + " SETD [%0],D1Ar1\n" \ + "4: SETD [%0++],D1Ar1\n" CLEAR, \ + "5: ADD %1,%1,#4\n" FIXUP, \ + " .long 4b,5b\n" TENTRY) + +#define __asm_clear_8(to, ret) \ + __asm_clear_8x_cont(to, ret, "", "", "") + +#define __asm_clear_12x_cont(to, ret, CLEAR, FIXUP, TENTRY) \ + __asm_clear_8x_cont(to, ret, \ + " SETD [%0],D1Ar1\n" \ + "6: SETD [%0++],D1Ar1\n" CLEAR, \ + "7: ADD %1,%1,#4\n" FIXUP, \ + " .long 6b,7b\n" TENTRY) + +#define __asm_clear_12(to, ret) \ + __asm_clear_12x_cont(to, ret, "", "", "") + +#define __asm_clear_16x_cont(to, ret, CLEAR, FIXUP, TENTRY) \ + __asm_clear_12x_cont(to, ret, \ + " SETD [%0],D1Ar1\n" \ + "8: SETD [%0++],D1Ar1\n" CLEAR, \ + "9: ADD %1,%1,#4\n" FIXUP, \ + " .long 8b,9b\n" TENTRY) + +#define __asm_clear_16(to, ret) \ + __asm_clear_16x_cont(to, ret, "", "", "") + +unsigned long __do_clear_user(void __user *pto, unsigned long pn) +{ + register char __user *dst __asm__ ("D0Re0") = pto; + register unsigned long n __asm__ ("D1Re0") = pn; + register unsigned long retn __asm__ ("D0Ar6") = 0; + + if ((unsigned long) dst & 1) { + __asm_clear_1(dst, retn); + n--; + } + + if ((unsigned long) dst & 2) { + __asm_clear_2(dst, retn); + n -= 2; + } + + /* 64 bit copy loop */ + if (!((__force unsigned long) dst & 7)) { + while (n >= 8) { + __asm_clear_8x64(dst, retn); + n -= 8; + } + } + + while (n >= 16) { + __asm_clear_16(dst, retn); + n -= 16; + } + + while (n >= 4) { + __asm_clear_4(dst, retn); + n -= 4; + } + + switch (n) { + case 0: + break; + case 1: + __asm_clear_1(dst, retn); + break; + case 2: + __asm_clear_2(dst, retn); + break; + case 3: + __asm_clear_3(dst, retn); + break; + } + + return retn; +} + +unsigned char __get_user_asm_b(const void __user *addr, long *err) +{ + register unsigned char x __asm__ ("D0Re0") = 0; + __asm__ __volatile__( + " GETB %0,[%2]\n" + "1:\n" + " GETB %0,[%2]\n" + "2:\n" + " .section .fixup,\"ax\"\n" + "3: MOV D0FrT,%3\n" + " SETD [%1],D0FrT\n" + " MOVT D0FrT,#HI(2b)\n" + " JUMP D0FrT,#LO(2b)\n" + " .previous\n" + " .section __ex_table,\"a\"\n" + " .long 1b,3b\n" + " .previous\n" + : "=r" (x) + : "r" (err), "r" (addr), "P" (-EFAULT) + : "D0FrT"); + return x; +} + +unsigned short __get_user_asm_w(const void __user *addr, long *err) +{ + 
register unsigned short x __asm__ ("D0Re0") = 0; + __asm__ __volatile__( + " GETW %0,[%2]\n" + "1:\n" + " GETW %0,[%2]\n" + "2:\n" + " .section .fixup,\"ax\"\n" + "3: MOV D0FrT,%3\n" + " SETD [%1],D0FrT\n" + " MOVT D0FrT,#HI(2b)\n" + " JUMP D0FrT,#LO(2b)\n" + " .previous\n" + " .section __ex_table,\"a\"\n" + " .long 1b,3b\n" + " .previous\n" + : "=r" (x) + : "r" (err), "r" (addr), "P" (-EFAULT) + : "D0FrT"); + return x; +} + +unsigned int __get_user_asm_d(const void __user *addr, long *err) +{ + register unsigned int x __asm__ ("D0Re0") = 0; + __asm__ __volatile__( + " GETD %0,[%2]\n" + "1:\n" + " GETD %0,[%2]\n" + "2:\n" + " .section .fixup,\"ax\"\n" + "3: MOV D0FrT,%3\n" + " SETD [%1],D0FrT\n" + " MOVT D0FrT,#HI(2b)\n" + " JUMP D0FrT,#LO(2b)\n" + " .previous\n" + " .section __ex_table,\"a\"\n" + " .long 1b,3b\n" + " .previous\n" + : "=r" (x) + : "r" (err), "r" (addr), "P" (-EFAULT) + : "D0FrT"); + return x; +} + +long __put_user_asm_b(unsigned int x, void __user *addr) +{ + register unsigned int err __asm__ ("D0Re0") = 0; + __asm__ __volatile__( + " MOV %0,#0\n" + " SETB [%2],%1\n" + "1:\n" + " SETB [%2],%1\n" + "2:\n" + ".section .fixup,\"ax\"\n" + "3: MOV %0,%3\n" + " MOVT D0FrT,#HI(2b)\n" + " JUMP D0FrT,#LO(2b)\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + " .long 1b,3b\n" + ".previous" + : "=r"(err) + : "d" (x), "a" (addr), "P"(-EFAULT) + : "D0FrT"); + return err; +} + +long __put_user_asm_w(unsigned int x, void __user *addr) +{ + register unsigned int err __asm__ ("D0Re0") = 0; + __asm__ __volatile__( + " MOV %0,#0\n" + " SETW [%2],%1\n" + "1:\n" + " SETW [%2],%1\n" + "2:\n" + ".section .fixup,\"ax\"\n" + "3: MOV %0,%3\n" + " MOVT D0FrT,#HI(2b)\n" + " JUMP D0FrT,#LO(2b)\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + " .long 1b,3b\n" + ".previous" + : "=r"(err) + : "d" (x), "a" (addr), "P"(-EFAULT) + : "D0FrT"); + return err; +} + +long __put_user_asm_d(unsigned int x, void __user *addr) +{ + register unsigned int err __asm__ ("D0Re0") = 0; + __asm__ __volatile__( + " MOV %0,#0\n" + " SETD [%2],%1\n" + "1:\n" + " SETD [%2],%1\n" + "2:\n" + ".section .fixup,\"ax\"\n" + "3: MOV %0,%3\n" + " MOVT D0FrT,#HI(2b)\n" + " JUMP D0FrT,#LO(2b)\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + " .long 1b,3b\n" + ".previous" + : "=r"(err) + : "d" (x), "a" (addr), "P"(-EFAULT) + : "D0FrT"); + return err; +} + +long __put_user_asm_l(unsigned long long x, void __user *addr) +{ + register unsigned int err __asm__ ("D0Re0") = 0; + __asm__ __volatile__( + " MOV %0,#0\n" + " SETL [%2],%1,%t1\n" + "1:\n" + " SETL [%2],%1,%t1\n" + "2:\n" + ".section .fixup,\"ax\"\n" + "3: MOV %0,%3\n" + " MOVT D0FrT,#HI(2b)\n" + " JUMP D0FrT,#LO(2b)\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + " .long 1b,3b\n" + ".previous" + : "=r"(err) + : "d" (x), "a" (addr), "P"(-EFAULT) + : "D0FrT"); + return err; +} + +long strnlen_user(const char __user *src, long count) +{ + long res; + + if (!access_ok(VERIFY_READ, src, 0)) + return 0; + + asm volatile (" MOV D0Ar4, %1\n" + " MOV D0Ar6, %2\n" + "0:\n" + " SUBS D0FrT, D0Ar6, #0\n" + " SUB D0Ar6, D0Ar6, #1\n" + " BLE 2f\n" + " GETB D0FrT, [D0Ar4+#1++]\n" + "1:\n" + " TST D0FrT, #255\n" + " BNE 0b\n" + "2:\n" + " SUB %0, %2, D0Ar6\n" + "3:\n" + " .section .fixup,\"ax\"\n" + "4:\n" + " MOV %0, #0\n" + " MOVT D0FrT,#HI(3b)\n" + " JUMP D0FrT,#LO(3b)\n" + " .previous\n" + " .section __ex_table,\"a\"\n" + " .long 1b,4b\n" + " .previous\n" + : "=r" (res) + : "r" (src), "r" (count) + : "D0FrT", "D0Ar4", "D0Ar6", "cc"); + + return res; +} + +long 
__strncpy_from_user(char *dst, const char __user *src, long count) +{ + long res; + + if (count == 0) + return 0; + + /* + * Currently, in 2.4.0-test9, most ports use a simple byte-copy loop. + * So do we. + * + * This code is deduced from: + * + * char tmp2; + * long tmp1, tmp3; + * tmp1 = count; + * while ((*dst++ = (tmp2 = *src++)) != 0 + * && --tmp1) + * ; + * + * res = count - tmp1; + * + * with tweaks. + */ + + __asm__ __volatile__(" MOV %0,%3\n" + "1:\n" + " GETB D0FrT,[%2++]\n" + "2:\n" + " CMP D0FrT,#0\n" + " SETB [%1++],D0FrT\n" + " BEQ 3f\n" + " SUBS %0,%0,#1\n" + " BNZ 1b\n" + "3:\n" + " SUB %0,%3,%0\n" + "4:\n" + " .section .fixup,\"ax\"\n" + "5:\n" + " MOV %0,%7\n" + " MOVT D0FrT,#HI(4b)\n" + " JUMP D0FrT,#LO(4b)\n" + " .previous\n" + " .section __ex_table,\"a\"\n" + " .long 2b,5b\n" + " .previous" + : "=r" (res), "=r" (dst), "=r" (src), "=r" (count) + : "3" (count), "1" (dst), "2" (src), "P" (-EFAULT) + : "D0FrT", "memory", "cc"); + + return res; +} -- 1.7.7.6
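For reference, the LSM_STEP rewind arithmetic described in the RAPF loop
comments above, expressed as a minimal stand-alone C sketch. This is
illustrative only and not part of the patch; the helper name and the
host-side main() are invented for the example. 'unit' is 8 for the 64-bit
(MSETL) loop and 4 for the 32-bit (MSETD) loop.

#include <stdio.h>

struct rapf_rewind {
	unsigned long src_rewind;	/* bytes to subtract from 'from' */
	unsigned long done_bytes;	/* bytes to subtract from 'n'    */
};

static struct rapf_rewind rapf_rewind_calc(unsigned int txstatus,
					   unsigned int unit)
{
	/* LSM_STEP is bits 10:8 of TXSTATUS */
	unsigned int step = (txstatus >> 8) & 0x7;
	struct rapf_rewind r;

	/* a fault at the last of the 4 writes reports LSM_STEP == 0,
	 * which the fixup's ADDZ maps back to 4 */
	if (step == 0)
		step = 4;

	/* writes that never happened, including the faulting one:
	 * [4 - (LSM_STEP - 1)] * unit */
	r.src_rewind = (4 - (step - 1)) * unit;
	/* writes that completed successfully: (LSM_STEP - 1) * unit */
	r.done_bytes = (step - 1) * unit;
	return r;
}

int main(void)
{
	/* e.g. a fault at the 2nd of the 4 MSETL writes: LSM_STEP == 2 */
	struct rapf_rewind r = rapf_rewind_calc(2 << 8, 8);

	printf("rewind src by %lu, count %lu bytes as done\n",
	       r.src_rewind, r.done_bytes);	/* prints 24 and 8 */
	return 0;
}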