Krait enhancements from caf msm8960: Improve performance of memmove, bcopy, and memmove_words Change-Id: I62b3da046889387f835da741110d35ffd3c8f806 Conflicts: libc/Android.mk msm8960: Improve performance of memcpy Change-Id: I0c8355ae5e92060ad5a0811d33937e6913c8b633 Bionic/libm: fast neon pow() for small x,y Add a fast neon version of pow() suitable for relatively small positive x and y (between 0 and 4). Run the standard implementation in all other cases. Gives approximately 60% performance improvement to AnTuTu FPU score. Change-Id: I9234d37eaa6a815d1e619375f5b049c4ec88f557 msm7627a: Enable neon optimized memove and pow functions. Define SPARROW_NEON_OPTIMIZATION flag so that neon optimized memove and pow functions are used. Also add Corresponding definitions in make files. Change-Id: I12089fc7002e3ec294e63632bd84e395fbd24936 Bionic/libm: Prefer branches and VFP ABI For internal functions set gcc attribute "aapcs-vfp" for ARM and use -fno-if-conversion to prefer branches over predicated instructions (improves performance on architectures with good branch prediction). Change-Id: I365e9508bd3babb0bb06fc5de127c1ae17445bcc Bionic/libm: add assembly versions of sin/cos Add assembly versions of sin/cos with integrated remainder pi/2 calculation. Directly extracted from binary libm.so compiled with __ieee754_rem_pio2 calls inlined. Change-Id: I9a999c01cea92aace9df7be9ad8f90f150040375 Conflicts: libm/Android.mk Bionic/libm: Remove extra vmov from sin/cos Move integer representations of x bits on the integer side rather than moving them to and from the FP registers. Change-Id: I1d0800730d7553a47c462ee2a0cc044ffe62eb20 Bionic/libm: Pow optimizations and bug fixes Use VFP calling convention for pow_neon handoff function by default. Fix register usage collision between two different polynomial coefficients in pow_neon. Remove conditional execution in pow_neon and replace with branching. 
Change-Id: I254617940b2787297aff2ab97dbf45c11e6a2b08 Bionic/libm: Add precision-correct de-serialize sin/cos Modify sin/cos to improve performance while retaining either bit-for-bit agreement with previous algorithm or <1 ulp deviation from arbitrary precision result. Change-Id: Icbd6d66fb1c0ceb53f43fed6541e0c89cc6e7a63

commit: a8c02214555d91fc1df3fdba065a46a4fad15c58 [log] [tgz]
author: Brent DeGraaf <bdegraaf@codeaurora.org> Wed May 30 22:50:19 2012 -0400
committer: Andrew Sutherland <dr3wsuth3rland@gmail.com> Mon Dec 10 19:32:25 2012 -0600
tree: 16af1366567c6716364014465bda88ffb9c6aaf0
parent: a5d721e629e69b83d49468a204e46f560e6f76b7 [diff]
diff --git a/libc/arch-arm/bionic/memcpy.S b/libc/arch-arm/bionic/memcpy.S
index 8453cc0..7e1a799 100644
--- a/libc/arch-arm/bionic/memcpy.S
+++ b/libc/arch-arm/bionic/memcpy.S

@@ -30,6 +30,396 @@
 #include <machine/asm.h>
 
 #if defined(__ARM_NEON__) && !defined(ARCH_ARM_USE_NON_NEON_MEMCPY)
+#if defined(KRAIT_NEON_OPTIMIZATION)
+	/*
+	 * These can be overridden in:
+	 *   device/<vendor>/<board>/BoardConfig.mk
+         * by setting the following:
+	 *   TARGET_USE_KRAIT_BIONIC_OPTIMIZATION := true
+	 *   TARGET_USE_KRAIT_PLD_SET := true
+	 *   TARGET_KRAIT_BIONIC_PLDOFFS := <pldoffset>
+	 *   TARGET_KRAIT_BIONIC_PLDSIZE := <pldsize>
+	 *   TARGET_KRAIT_BIONIC_PLDTHRESH := <pldthreshold>
+	 *   TARGET_KRAIT_BIONIC_BBTHRESH := <bbthreshold>
+	 */
+#ifndef PLDOFFS
+#define PLDOFFS	(10)
+#endif
+#ifndef PLDTHRESH
+#define PLDTHRESH (PLDOFFS)
+#endif
+#ifndef BBTHRESH
+#define BBTHRESH (4096/64)
+#endif
+#if (PLDOFFS < 1)
+#error Routine does not support offsets less than 1
+#endif
+#if (PLDTHRESH < PLDOFFS)
+#error PLD threshold must be greater than or equal to the PLD offset
+#endif
+#ifndef PLDSIZE
+#define PLDSIZE	(64)
+#endif
+#define NOP_OPCODE       (0xe320f000)
+
+        .text
+        .fpu    neon
+	.global memcpy
+	.type memcpy, %function
+	.align 5
+/*
+ * memcpy, Krait NEON variant.
+ *
+ *   In:    r0 = dest, r1 = src, r2 = byte count
+ *   Out:   r0 = dest (saved on entry, restored at .Lneon_exit)
+ *   Uses:  r3 and r12 as scratch; r9, r10 and lr are saved and restored
+ *          on the stack; q0-q3 and q8-q11 carry the data.
+ *
+ * Data is copied front to back in 64-byte blocks, followed by 32-, 16-,
+ * 8-, 4-, 2- and 1-byte tails selected by the low bits of the count.
+ * All block counts below (r9, r12, lr >> 6) are in units of 64 bytes.
+ *
+ * The prime-pump path sets aside (PLDOFFS*PLDSIZE)/64 blocks for the final
+ * no-prefetch loop, so the copy must be longer than that many blocks before
+ * it is entered.  The entry test only guarantees "more than PLDTHRESH
+ * blocks", hence the check below.
+ */
+#if (PLDTHRESH < ((PLDOFFS*PLDSIZE)/64))
+#error PLD threshold must cover the prefetch distance in 64-byte blocks
+#endif
+memcpy:
+	stmfd	sp!, {r0, r9, r10, lr}	/* r0 is kept for the return value */
+	cmp	r2, #4
+	blt	.Lneon_lt4
+	cmp	r2, #16
+	blt	.Lneon_lt16
+	cmp	r2, #32
+	blt	.Lneon_16	/* N is set by this cmp, so .Lneon_16 copies 16 bytes */
+	cmp	r2, #64
+	blt	.Lneon_copy_32_a
+
+	mov	r12, r2, lsr #6	/* r12 = number of 64-byte blocks */
+	cmp	r12, #PLDTHRESH
+	ble	.Lneon_copy_64_loop_nopld	/* short copy: no prefetching */
+
+	cmp	r12, #BBTHRESH
+	ble	.Lneon_prime_pump	/* medium copy: fixed read-ahead distance */
+
+	/*
+	 * Large copy.  Compute the read-ahead distance in bytes:
+	 *   lr = PLDOFFS*PLDSIZE
+	 *        + (((dest + 0x400) - (src + PLDOFFS*PLDSIZE)) mod 2048)
+	 * NOTE(review): presumably this staggers the read-ahead address against
+	 * the destination; the intent is not stated in this file - confirm.
+	 */
+	add	lr, r0, #0x400
+	add	r9, r1, #(PLDOFFS*PLDSIZE)
+	sub	lr, lr, r9
+	lsl	lr, lr, #21
+	lsr	lr, lr, #21	/* keep only the low 11 bits */
+	add	lr, lr, #(PLDOFFS*PLDSIZE)
+	cmp	r12, lr, lsr #6
+	movle	lr, #(PLDOFFS*PLDSIZE)
+
+	/* r9 = lr/64 - PLDOFFS; fall back to prime-pump if that is <= 0 or */
+	/* if the copy is no longer than the read-ahead distance.            */
+	movgt	r9, #(PLDOFFS)
+	rsbgts	r9, r9, lr, lsr #6
+	ble	.Lneon_prime_pump
+
+	add	r10, r1, lr	/* r10 = read-ahead pointer ...          */
+	bic	r10, #0x3F	/* ... rounded down to a 64-byte boundary */
+
+	/*
+	 * Split the blocks that precede the final lr/64:
+	 *   r9  = blocks copied with both pld and a read-ahead load
+	 *   r12 = blocks copied with the read-ahead load only
+	 */
+	sub	r12, lr, lsr #6
+	cmp	r9, r12
+	suble	r12, r12, r9
+	movgt	r9, r12
+	movgt	r12, #0
+
+	pld	[r1, #((PLDOFFS-1)*PLDSIZE)]
+	/* Pad with NOPs to a 64-byte boundary, but only if <= 8 bytes are needed. */
+	.balignl 64, NOP_OPCODE, 4*2
+.Lneon_copy_64_loop_outer_doublepld:
+	pld	[r1, #((PLDOFFS)*PLDSIZE)]
+	vld1.32	{q0, q1}, [r1]!
+	vld1.32	{q2, q3}, [r1]!
+	ldr	r3, [r10]	/* read-ahead load; the value in r3 is not used */
+	subs	r9, r9, #1
+	vst1.32	{q0, q1}, [r0]!
+	vst1.32	{q2, q3}, [r0]!
+	add	r10, #64
+	bne	.Lneon_copy_64_loop_outer_doublepld
+	cmp	r12, #0
+	bne	.Lneon_copy_64_loop_outer
+	mov	r12, lr, lsr #6	/* remaining blocks go through the plain loop */
+	b	.Lneon_copy_64_loop_nopld
+	.balignl 64, NOP_OPCODE, 4*2
+.Lneon_prime_pump:
+	mov	lr, #(PLDOFFS*PLDSIZE)
+	add	r10, r1, #(PLDOFFS*PLDSIZE)
+	bic	r10, #0x3F
+	/* Set aside exactly the lr/64 blocks that the plain loop copies later. */
+	sub	r12, r12, #((PLDOFFS*PLDSIZE)/64)
+	pld	[r10, #(-1*PLDSIZE)]
+	.balignl 64, NOP_OPCODE, 4*2
+.Lneon_copy_64_loop_outer:
+	vld1.32	{q0, q1}, [r1]!
+	vld1.32	{q2, q3}, [r1]!
+	ldr	r3, [r10]	/* read-ahead load; the value in r3 is not used */
+	subs	r12, r12, #1
+	vst1.32	{q0, q1}, [r0]!
+	vst1.32	{q2, q3}, [r0]!
+	add	r10, #64
+	bne	.Lneon_copy_64_loop_outer
+	mov	r12, lr, lsr #6	/* blocks that were set aside above */
+	.balignl 64, NOP_OPCODE, 4*2
+.Lneon_copy_64_loop_nopld:
+	vld1.32	{q8, q9}, [r1]!
+	vld1.32	{q10, q11}, [r1]!
+	subs	r12, r12, #1
+	vst1.32	{q8, q9}, [r0]!
+	vst1.32	{q10, q11}, [r0]!
+	bne	.Lneon_copy_64_loop_nopld
+	ands	r2, r2, #0x3f	/* bytes left after the 64-byte blocks */
+	beq	.Lneon_exit
+	.balignl 64, NOP_OPCODE, 4*2
+.Lneon_copy_32_a:
+	movs	r12, r2, lsl #27	/* C = bit 5 of count (32), N = bit 4 (16) */
+	bcc	.Lneon_16
+	vld1.32	{q0,q1}, [r1]!
+	vst1.32	{q0,q1}, [r0]!
+	.balignl 64, NOP_OPCODE, 4*2
+.Lneon_16:
+	bpl	.Lneon_lt16	/* N clear: no 16-byte chunk to copy */
+	vld1.32	{q8}, [r1]!
+	vst1.32	{q8}, [r0]!
+	ands	r2, r2, #0x0f
+	beq	.Lneon_exit
+	.balignl 64, NOP_OPCODE, 4*2
+.Lneon_lt16:
+	movs	r12, r2, lsl #29	/* C = bit 3 of count (8), N = bit 2 (4) */
+	ldrcs	r3, [r1], #4
+	ldrcs	r12, [r1], #4
+	strcs	r3, [r0], #4
+	strcs	r12, [r0], #4
+	ldrmi	r3, [r1], #4
+	strmi	r3, [r0], #4
+	.balignl 64, NOP_OPCODE, 4*2
+.Lneon_lt4:
+	movs	r2, r2, lsl #31	/* C = bit 1 of count (2), N = bit 0 (1) */
+	ldrcsh	r3, [r1], #2
+	strcsh	r3, [r0], #2
+	ldrmib	r12, [r1]
+	strmib	r12, [r0]
+	.balignl 64, NOP_OPCODE, 4*2
+.Lneon_exit:
+	ldmfd	sp!, {r0, r9, r10, lr}	/* r0 = original dest */
+	bx	lr
+	.end
+#elif defined(SCORPION_NEON_OPTIMIZATION)
+	/*
+	 * These can be overridden in:
+	 *   device/<vendor>/<board>/BoardConfig.mk
+         * by setting the following:
+	 *   TARGET_USE_SCORPION_BIONIC_OPTIMIZATION := true
+	 *   TARGET_USE_SCORPION_PLD_SET := true
+	 *   TARGET_SCORPION_BIONIC_PLDOFFS := <pldoffset>
+	 *   TARGET_SCORPION_BIONIC_PLDSIZE := <pldsize>
+	 */
+#ifndef PLDOFFS
+#define PLDOFFS	(6)
+#endif
+#ifndef PLDSIZE
+#define PLDSIZE	(128)	/* L2 cache line size */
+#endif
+        .code 32
+        .align 5
+        .globl memcpy
+        .func
+/*
+ * memcpy, Scorpion NEON variant.
+ *
+ *   In:      r0 = dest, r1 = src, r2 = byte count
+ *   Out:     r0 = dest (pushed on entry, popped at .Lneon_exit)
+ *   Scratch: r3, r12, q0-q3, q8-q11
+ *
+ * Copies front to back: 128-byte blocks, then 32-byte blocks, then one
+ * optional 16-byte chunk, then 8/4/2/1-byte tails chosen by the low bits
+ * of the count.
+ */
+memcpy:
+	push            {r0}
+	cmp             r2, #4
+	blt             .Lneon_lt4
+	cmp             r2, #16
+	blt             .Lneon_lt16
+	cmp             r2, #32
+	blt             .Lneon_16
+	cmp              r2, #128
+	blt              .Lneon_copy_32_a
+	/* Copy blocks of 128-bytes (word-aligned) at a time*/
+	/* Code below is optimized for PLDSIZE=128 only */
+	mov             r12, r2, lsr #7	/* r12 = number of 128-byte blocks */
+	cmp             r12, #PLDOFFS
+	ble             .Lneon_copy_128_loop_nopld
+	sub             r12, #PLDOFFS	/* the last PLDOFFS blocks are copied without pld */
+	pld             [r1, #(PLDOFFS-1)*PLDSIZE]
+.Lneon_copy_128_loop_outer:
+	pld             [r1, #(PLDOFFS*PLDSIZE)]
+	vld1.32         {q0, q1}, [r1]!
+	vld1.32         {q2, q3}, [r1]!
+	vld1.32         {q8, q9}, [r1]!
+	vld1.32         {q10, q11}, [r1]!
+	subs            r12, r12, #1
+	vst1.32	        {q0, q1}, [r0]!
+	vst1.32         {q2, q3}, [r0]!
+	vst1.32         {q8, q9}, [r0]!
+	vst1.32         {q10, q11}, [r0]!
+	bne             .Lneon_copy_128_loop_outer
+	mov             r12, #PLDOFFS
+.Lneon_copy_128_loop_nopld:
+	vld1.32         {q0, q1}, [r1]!
+	vld1.32         {q2, q3}, [r1]!
+	vld1.32         {q8, q9}, [r1]!
+	vld1.32         {q10, q11}, [r1]!
+	subs            r12, r12, #1
+	vst1.32         {q0, q1}, [r0]!
+	vst1.32         {q2, q3}, [r0]!
+	vst1.32         {q8, q9}, [r0]!
+	vst1.32         {q10, q11}, [r0]!
+	bne             .Lneon_copy_128_loop_nopld
+	ands            r2, r2, #0x7f	/* bytes left after the 128-byte blocks */
+	beq             .Lneon_exit
+	cmp             r2, #32
+	blt             .Lneon_16
+	nop
+	/* Copy blocks of 32-bytes (word aligned) at a time*/
+.Lneon_copy_32_a:
+	mov             r12, r2, lsr #5	/* r12 = number of 32-byte blocks (>= 1 here) */
+.Lneon_copy_32_loop_a:
+	vld1.32         {q0,q1}, [r1]!
+	subs            r12, r12, #1
+	vst1.32         {q0,q1}, [r0]!
+	bne             .Lneon_copy_32_loop_a
+	ands            r2, r2, #0x1f
+	beq             .Lneon_exit
+.Lneon_16:
+	/*
+	 * When fewer than 16 bytes remain, r2 goes negative here, but its low
+	 * four bits are unchanged and those are all the tail code below tests.
+	 */
+	subs            r2, r2, #16
+	blt             .Lneon_lt16
+	vld1.32         {q8}, [r1]!
+	vst1.32         {q8}, [r0]!
+	beq             .Lneon_exit	/* Z still comes from the subs above */
+.Lneon_lt16:
+	movs            r12, r2, lsl #29	/* C = bit 3 of count (8), N = bit 2 (4) */
+	bcc             .Lneon_skip8
+	ldr             r3, [r1], #4
+	ldr             r12, [r1], #4
+	str             r3, [r0], #4
+	str             r12, [r0], #4
+.Lneon_skip8:
+	bpl             .Lneon_lt4	/* N still comes from the movs above */
+	ldr             r3, [r1], #4
+	str             r3, [r0], #4
+.Lneon_lt4:
+	movs            r2, r2, lsl #31	/* C = bit 1 of count (2), N = bit 0 (1) */
+	bcc             .Lneon_lt2
+	ldrh            r3, [r1], #2
+	strh            r3, [r0], #2
+.Lneon_lt2:
+	bpl             .Lneon_exit
+	ldrb            r12, [r1]
+	strb            r12, [r0]
+.Lneon_exit:
+	pop             {r0}	/* r0 = original dest */
+	bx              lr
+	.endfunc
+	.end
+#else /* !SCORPION_NEON_OPTIMIZATION */
+#if defined(CORTEX_CACHE_LINE_32)
+	/*
+	 * This variant can be enabled by setting the flag
+	 *   TARGET_CORTEX_CACHE_LINE_32
+	 * in device/<vendor>/<board>/BoardConfig.mk
+	 */
+        .text
+        .fpu    neon
+
+        .global memcpy
+        .type memcpy, %function
+        .align 4
+
+/* a prefetch distance of 4 cache-lines works best experimentally */
+#define CACHE_LINE_SIZE     32
+/*
+ * memcpy, NEON variant for cores with 32-byte cache lines.
+ *
+ *   In:      r0 = dest, r1 = src, r2 = byte count
+ *   Out:     r0 = dest (saved on entry, restored before returning)
+ *   Scratch: r3, ip, lr (lr is saved and restored), d0-d7, d16-d23
+ *
+ * For counts of 16 or more the destination is first brought to a 16-byte
+ * boundary; the bulk is then copied 128 and 32 bytes at a time with
+ * 128-bit-aligned stores, and the last 0..31 bytes are copied by size bit.
+ */
+memcpy:
+        .fnstart
+        .save       {r0, lr}
+        stmfd       sp!, {r0, lr}
+
+        /* start preloading as early as possible */
+        pld         [r1, #(CACHE_LINE_SIZE*0)]
+        pld         [r1, #(CACHE_LINE_SIZE*1)]
+
+        /* do we have at least 16-bytes to copy (needed for alignment below) */
+        cmp         r2, #16
+        blo         5f
+
+        /* align destination to half cache-line for the write-buffer */
+        rsb         r3, r0, #0
+        ands        r3, r3, #0xF        /* r3 = bytes needed to reach a 16-byte boundary */
+        beq         0f
+
+        /* copy up to 15-bytes (count in r3) */
+        sub         r2, r2, r3
+        movs        ip, r3, lsl #31     /* N = bit 0 of r3 (1 byte), C = bit 1 (2 bytes) */
+        ldrmib      lr, [r1], #1
+        strmib      lr, [r0], #1
+        ldrcsb      ip, [r1], #1
+        ldrcsb      lr, [r1], #1
+        strcsb      ip, [r0], #1
+        strcsb      lr, [r0], #1
+        movs        ip, r3, lsl #29     /* N = bit 2 of r3 (4 bytes), C = bit 3 (8 bytes) */
+        /* NOTE(review): bge tests N == V; movs leaves V unchanged, so this
+         * skips the 4-byte copy when bit 2 is clear provided V is clear
+         * here - confirm. */
+        bge         1f
+        // copies 4 bytes, destination 32-bits aligned
+        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
+        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
+1:      bcc         2f
+        // copies 8 bytes, destination 64-bits aligned
+        vld1.8      {d0}, [r1]!
+        vst1.8      {d0}, [r0, :64]!
+2:
+
+0:      /* preload immediately the next cache line, which we may need */
+        pld         [r1, #(CACHE_LINE_SIZE*0)]
+        pld         [r1, #(CACHE_LINE_SIZE*1)]
+
+        /* make sure we have at least 128 bytes to copy */
+        subs        r2, r2, #128
+        blo         2f
+
+        /* preload all the cache lines we need.
+         * NOTE: the number of pld instructions below depends on the prefetch
+         * distance (this variant writes the offsets out literally rather
+         * than using a PREFETCH_DISTANCE macro). Ideally we would increase
+         * the distance in the main loop to avoid the goofy code below. In
+         * practice this doesn't seem to make a big difference.
+         */
+        pld         [r1, #(CACHE_LINE_SIZE*2)]
+        pld         [r1, #(CACHE_LINE_SIZE*3)]
+        pld         [r1, #(CACHE_LINE_SIZE*4)]
+
+	.align 3
+1:      /* The main loop copies 128 bytes at a time */
+	subs        r2, r2, #128        /* flags are consumed by the bhs at the loop end */
+        vld1.8      {d0  - d3},   [r1]!
+        vld1.8      {d4  - d7},   [r1]!
+        pld         [r1, #(CACHE_LINE_SIZE*1)]
+        pld         [r1, #(CACHE_LINE_SIZE*2)]
+	vld1.8	    {d16 - d19},   [r1]!
+	vld1.8	    {d20 - d23},   [r1]!
+        pld         [r1, #(CACHE_LINE_SIZE*1)]
+        pld         [r1, #(CACHE_LINE_SIZE*2)]
+        vst1.8      {d0  - d3},   [r0, :128]!
+        vst1.8      {d4  - d7},   [r0, :128]!
+        vst1.8      {d16  - d19},   [r0, :128]!
+        vst1.8      {d20  - d23},   [r0, :128]!
+        bhs         1b
+
+2:      /* fix-up the remaining count and make sure we have >= 32 bytes left */
+        add         r2, r2, #128
+        subs        r2, r2, #32
+        blo         4f
+
+3:      /* 32 bytes at a time. These cache lines were already preloaded */
+        vld1.8      {d0 - d3},  [r1]!
+        subs        r2, r2, #32
+        vst1.8      {d0 - d3},  [r0, :128]!
+        bhs         3b
+
+4:      /* less than 32 left */
+        add         r2, r2, #32
+        tst         r2, #0x10
+        beq         5f
+        // copies 16 bytes, 128-bits aligned
+        vld1.8      {d0, d1}, [r1]!
+        vst1.8      {d0, d1}, [r0, :128]!
+
+5:      /* copy up to 15-bytes (count in r2) */
+        movs        ip, r2, lsl #29     /* C = bit 3 of r2 (8 bytes), N = bit 2 (4 bytes) */
+        bcc         1f
+        vld1.8      {d0}, [r1]!
+        vst1.8      {d0}, [r0]!
+1:      bge         2f
+        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
+        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0]!
+2:      movs        ip, r2, lsl #31     /* N = bit 0 of r2 (1 byte), C = bit 1 (2 bytes) */
+        ldrmib      r3, [r1], #1
+        ldrcsb      ip, [r1], #1
+        ldrcsb      lr, [r1], #1
+        strmib      r3, [r0], #1
+        strcsb      ip, [r0], #1
+        strcsb      lr, [r0], #1
+
+        ldmfd       sp!, {r0, lr}       /* r0 = original dest */
+        bx          lr
+        .fnend
+#else /*!CORTEX_CACHE_LINE_32*/
 
         .text
         .fpu    neon
@@ -166,7 +556,8 @@
         bx          lr
 END(memcpy)
 
-
+#endif /*!CORTEX_CACHE_LINE_32*/
+#endif /* SCORPION_NEON_OPTIMIZATION */
 #else   /* __ARM_ARCH__ < 7 */
 
 

diff --git a/libc/arch-arm/bionic/memmove.S b/libc/arch-arm/bionic/memmove.S
new file mode 100644
index 0000000..937d14b
--- /dev/null
+++ b/libc/arch-arm/bionic/memmove.S

@@ -0,0 +1,526 @@
+/***************************************************************************
+ Copyright (c) 2009-2012 Code Aurora Forum. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+     * Redistributions of source code must retain the above copyright
+       notice, this list of conditions and the following disclaimer.
+     * Redistributions in binary form must reproduce the above copyright
+       notice, this list of conditions and the following disclaimer in the
+       documentation and/or other materials provided with the distribution.
+     * Neither the name of Code Aurora nor the names of its contributors may
+       be used to endorse or promote products derived from this software
+       without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+  ***************************************************************************/
+
+/***************************************************************************
+ *  Neon memmove: Moves n bytes from src to dest, using Neon registers
+ *  where possible. The buffers may overlap.
+ *     Inputs:
+ *        dest: The destination buffer (r0)
+ *        src: The source buffer (r1)
+ *        n: The size of the buffer to transfer, in bytes (r2)
+ *     Outputs:
+ *        r0: The original value of dest
+ ***************************************************************************/
+
+#include <machine/cpu-features.h>
+
+#if defined(KRAIT_NEON_OPTIMIZATION) || defined(SPARROW_NEON_OPTIMIZATION)
+	/*
+	 * These can be overridden in:
+	 *   device/<vendor>/<board>/BoardConfig.mk
+	 * by setting the following:
+	 *   TARGET_USE_KRAIT_BIONIC_OPTIMIZATION := true
+	 *   TARGET_USE_KRAIT_PLD_SET := true
+	 *   TARGET_KRAIT_BIONIC_PLDOFFS := <pldoffset>
+	 *   TARGET_KRAIT_BIONIC_PLDSIZE := <pldsize>
+	 *   TARGET_KRAIT_BIONIC_PLDTHRESH := <pldthreshold>
+	 */
+#ifndef PLDOFFS
+#define PLDOFFS	(10)
+#endif
+#ifndef PLDTHRESH
+#define PLDTHRESH (PLDOFFS)
+#endif
+#if (PLDOFFS < 5)
+#error Routine does not support offsets less than 5
+#endif
+#if (PLDTHRESH < PLDOFFS)
+#error PLD threshold must be greater than or equal to the PLD offset
+#endif
+#ifndef PLDSIZE
+#define PLDSIZE (64)
+#endif
+#define NOP_OPCODE (0xe320f000)
+
+	.code 32
+	.align 5
+	.global memmove
+	.type memmove, %function
+
+	.global _memmove_words
+	.type _memmove_words, %function
+
+	.global bcopy
+	.type bcopy, %function
+
+/*
+ * bcopy: exchange r0 and r1 (bcopy takes its two pointer arguments in the
+ * opposite order to memmove) and fall through into memmove.  Any padding
+ * emitted by the .balignl below is NOPs, so falling through is safe.
+ */
+bcopy:
+	mov	r12, r0
+	mov	r0, r1
+	mov	r1, r12
+	.balignl 64, NOP_OPCODE, 4*2
+/*
+ * memmove / _memmove_words, Krait/Sparrow NEON variant.
+ *
+ *   In:      r0 = dest, r1 = src, r2 = byte count
+ *   Out:     r0 = dest
+ *   Scratch: r3, r12, q0-q3, q8-q11
+ *
+ * Only the case "dest above src and closer than the byte count" is handled
+ * here, by copying back to front.  Every other case tail-calls memcpy.
+ */
+memmove:
+_memmove_words:
+.Lneon_memmove_cmf:
+	subs	r12, r0, r1	/* r12 = dest - src */
+	bxeq	lr	/* dest == src: nothing to do, r0 is already dest */
+	cmphi	r2, r12	/* only when dest > src (unsigned): compare count with the gap */
+	/*
+	 * Taken when dest < src (flags still from the subs) or when the count
+	 * does not exceed dest - src.  Note that the dest < src case includes
+	 * overlapping buffers.
+	 * NOTE(review): this relies on memcpy copying front to back and loading
+	 * each chunk before storing it - confirm for every memcpy this can
+	 * link against.
+	 */
+	bls	memcpy
+
+	push	{r0}
+
+.Lneon_back_to_front_copy:
+	add	r0, r0, r2	/* point both registers one past the end; */
+	add	r1, r1, r2	/* everything below pre-decrements        */
+	cmp	r2, #4
+	bgt	.Lneon_b2f_gt4
+	cmp	r2, #0
+.Lneon_b2f_smallcopy_loop:
+	beq	.Lneon_memmove_done
+	ldrb	r12, [r1, #-1]!
+	subs	r2, r2, #1
+	strb	r12, [r0, #-1]!
+	b	.Lneon_b2f_smallcopy_loop
+.Lneon_b2f_gt4:
+	sub	r3, r0, r1	/* r3 = overlap window (dest - src) */
+	cmp	r2, r3
+	movle	r12, r2	/* r12 = min(count, window), signed compare */
+	movgt	r12, r3
+	/* The chunk size used must not exceed r12, so that a store never */
+	/* overwrites source bytes that have not been loaded yet.         */
+	cmp	r12, #64
+	bge	.Lneon_b2f_copy_64
+	cmp	r12, #32
+	bge	.Lneon_b2f_copy_32
+	cmp	r12, #8
+	bge	.Lneon_b2f_copy_8
+	cmp	r12, #4
+	bge	.Lneon_b2f_copy_4
+	b	.Lneon_b2f_copy_1
+.Lneon_b2f_copy_64:
+	sub	r1, r1, #64	/* Predecrement */
+	sub	r0, r0, #64
+	movs	r12, r2, lsr #6	/* r12 = number of 64-byte blocks */
+	cmp	r12, #PLDTHRESH
+	ble	.Lneon_b2f_copy_64_loop_nopld
+	sub	r12, #PLDOFFS	/* the last PLDOFFS blocks are copied without pld */
+	pld	[r1, #-(PLDOFFS-5)*PLDSIZE]
+	pld	[r1, #-(PLDOFFS-4)*PLDSIZE]
+	pld	[r1, #-(PLDOFFS-3)*PLDSIZE]
+	pld	[r1, #-(PLDOFFS-2)*PLDSIZE]
+	pld	[r1, #-(PLDOFFS-1)*PLDSIZE]
+	.balignl 64, NOP_OPCODE, 4*2
+.Lneon_b2f_copy_64_loop_outer:
+	pld	[r1, #-(PLDOFFS)*PLDSIZE]
+	vld1.32	{q0, q1}, [r1]!
+	vld1.32	{q2, q3}, [r1]
+	subs	r12, r12, #1
+	vst1.32	{q0, q1}, [r0]!
+	sub	r1, r1, #96	/* Post-fixup and predecrement */
+	vst1.32	{q2, q3}, [r0]
+	sub	r0, r0, #96
+	bne	.Lneon_b2f_copy_64_loop_outer
+	mov	r12, #PLDOFFS
+	.balignl 64, NOP_OPCODE, 4*2
+.Lneon_b2f_copy_64_loop_nopld:
+	vld1.32	{q8, q9}, [r1]!
+	vld1.32	{q10, q11}, [r1]
+	subs	r12, r12, #1
+	vst1.32	{q8, q9}, [r0]!
+	sub	r1, r1, #96	/* Post-fixup and predecrement */
+	vst1.32	{q10, q11}, [r0]
+	sub	r0, r0, #96
+	bne	.Lneon_b2f_copy_64_loop_nopld
+	ands	r2, r2, #0x3f	/* bytes left after the 64-byte blocks */
+	beq	.Lneon_memmove_done
+	add	r1, r1, #64	/* Post-fixup */
+	add	r0, r0, #64
+	cmp	r2, #32
+	blt	.Lneon_b2f_copy_finish
+.Lneon_b2f_copy_32:
+	mov	r12, r2, lsr #5	/* r12 = number of 32-byte blocks */
+.Lneon_b2f_copy_32_loop:
+	sub	r1, r1, #32	/* Predecrement */
+	sub	r0, r0, #32
+	vld1.32	{q0,q1}, [r1]
+	subs	r12, r12, #1
+	vst1.32	{q0,q1}, [r0]
+	bne	.Lneon_b2f_copy_32_loop
+	ands	r2, r2, #0x1f
+	beq	.Lneon_memmove_done
+.Lneon_b2f_copy_finish:
+.Lneon_b2f_copy_8:
+	movs	r12, r2, lsr #0x3	/* r12 = number of 8-byte chunks */
+	beq	.Lneon_b2f_copy_4
+	.balignl 64, NOP_OPCODE, 4*2
+.Lneon_b2f_copy_8_loop:
+	sub	r1, r1, #8	/* Predecrement */
+	sub	r0, r0, #8
+	vld1.32	{d0}, [r1]
+	subs	r12, r12, #1
+	vst1.32	{d0}, [r0]
+	bne	.Lneon_b2f_copy_8_loop
+	ands	r2, r2, #0x7
+	beq	.Lneon_memmove_done
+.Lneon_b2f_copy_4:
+	movs	r12, r2, lsr #0x2	/* r12 = number of 4-byte words */
+	beq	.Lneon_b2f_copy_1
+.Lneon_b2f_copy_4_loop:
+	ldr	r3, [r1, #-4]!
+	subs	r12, r12, #1
+	str	r3, [r0, #-4]!
+	bne	.Lneon_b2f_copy_4_loop
+	ands	r2, r2, #0x3
+.Lneon_b2f_copy_1:
+	cmp	r2, #0
+	beq	.Lneon_memmove_done
+	.balignl 64, NOP_OPCODE, 4*2
+.Lneon_b2f_copy_1_loop:
+	ldrb	r12, [r1, #-1]!
+	subs	r2, r2, #1
+	strb	r12, [r0, #-1]!
+	bne	.Lneon_b2f_copy_1_loop
+
+.Lneon_memmove_done:
+	pop	{r0}	/* r0 = original dest */
+	bx	lr
+
+	.end
+
+#elif defined(SCORPION_NEON_OPTIMIZATION)
+	/*
+	 * These can be overridden in:
+	 *   device/<vendor>/<board>/BoardConfig.mk
+         * by setting the following:
+	 *   TARGET_USE_SCORPION_BIONIC_OPTIMIZATION := true
+	 *   TARGET_USE_SCORPION_PLD_SET := true
+	 *   TARGET_SCORPION_BIONIC_PLDOFFS := <pldoffset>
+	 *   TARGET_SCORPION_BIONIC_PLDSIZE := <pldsize>
+	 */
+#ifndef PLDOFFS
+#define PLDOFFS	(6)
+#endif
+#ifndef PLDSIZE
+#define PLDSIZE	(128)	/* L2 cache line size */
+#endif
+
+	.code 32
+	.align 5
+	.global memmove
+	.type memmove, %function
+
+	.global bcopy
+	.type bcopy, %function
+
+/*
+ * bcopy: exchange r0 and r1 (bcopy takes its two pointer arguments in the
+ * opposite order to memmove) and fall through into memmove.
+ */
+bcopy:
+	mov		r12, r0
+	mov		r0, r1
+	mov		r1, r12
+/*
+ * memmove, Scorpion NEON variant.
+ *
+ *   In:      r0 = dest, r1 = src, r2 = byte count
+ *   Out:     r0 = dest (pushed on entry, popped at .Lneon_memmove_done)
+ *   Scratch: r3, r12, q0-q3, q8-q11
+ *
+ * Addresses, the overlap window and the byte count are all unsigned
+ * quantities, so every comparison that involves them uses the unsigned
+ * condition codes (lo/hi/ls/hs).  Signed conditions pick the wrong copy
+ * direction when the two buffers lie on opposite sides of 0x80000000.
+ */
+memmove:
+	push            {r0}
+
+	/*
+	 * The requirements for memmove state that the function should
+	 * operate as if data were being copied from the source to a
+	 * buffer, then to the destination.  This is to allow a user
+	 * to copy data from a source and target that overlap.
+	 *
+	 * We can't just do byte copies front-to-back automatically, since
+	 * there's a good chance we may have an overlap (why else would someone
+	 * intentionally use memmove then?).
+	 *
+	 * We'll break this into two parts.  Front-to-back, or back-to-front
+	 * copies.
+	 */
+.Lneon_memmove_cmf:
+	cmp             r0, r1
+	blo             .Lneon_front_to_back_copy	/* dest below src */
+	bhi             .Lneon_back_to_front_copy	/* dest above src */
+	b               .Lneon_memmove_done		/* dest == src */
+
+	/* #############################################################
+	 * Front to Back copy
+	 */
+.Lneon_front_to_back_copy:
+	/*
+	 * For small copies, just do a quick memcpy.  We can do this for
+	 * front-to-back copies, aligned or unaligned, since we're only
+	 * doing 1 byte at a time...
+	 */
+	cmp             r2, #4
+	bhi             .Lneon_f2b_gt4
+	cmp             r2, #0
+.Lneon_f2b_smallcopy_loop:
+	beq             .Lneon_memmove_done
+	ldrb            r12, [r1], #1
+	subs            r2, r2, #1
+	strb            r12, [r0], #1
+	b               .Lneon_f2b_smallcopy_loop
+.Lneon_f2b_gt4:
+	/* The window size (src - dest) is in r3. */
+	sub             r3, r1, r0
+	/*
+	 * Note that we can't just route based on the size in r2.  If that's
+	 * larger than the overlap window in r3, we could potentially
+	 * (and likely!) destroy data we're copying.
+	 * r12 = min(size, window), compared as unsigned values.
+	 */
+	cmp             r2, r3
+	movls           r12, r2
+	movhi           r12, r3
+	cmp             r12, #256
+	bhs             .Lneon_f2b_copy_128
+	cmp             r12, #64
+	bhs             .Lneon_f2b_copy_32
+	cmp             r12, #16
+	bhs             .Lneon_f2b_copy_16
+	cmp             r12, #8
+	bhs             .Lneon_f2b_copy_8
+	cmp             r12, #4
+	bhs             .Lneon_f2b_copy_4
+	b               .Lneon_f2b_copy_1
+	nop
+.Lneon_f2b_copy_128:
+	mov             r12, r2, lsr #7	/* r12 = number of 128-byte blocks */
+	cmp             r12, #PLDOFFS
+	ble             .Lneon_f2b_copy_128_loop_nopld
+	sub             r12, #PLDOFFS	/* the last PLDOFFS blocks are copied without pld */
+	pld             [r1, #(PLDOFFS-1)*PLDSIZE]
+.Lneon_f2b_copy_128_loop_outer:
+	pld             [r1, #(PLDOFFS*PLDSIZE)]
+	vld1.32         {q0,q1}, [r1]!
+	vld1.32         {q2,q3}, [r1]!
+	vld1.32         {q8,q9}, [r1]!
+	vld1.32         {q10,q11}, [r1]!
+	subs            r12, r12, #1
+	vst1.32         {q0,q1}, [r0]!
+	vst1.32         {q2,q3}, [r0]!
+	vst1.32         {q8,q9}, [r0]!
+	vst1.32         {q10,q11}, [r0]!
+	bne             .Lneon_f2b_copy_128_loop_outer
+	mov             r12, #PLDOFFS
+.Lneon_f2b_copy_128_loop_nopld:
+	vld1.32         {q0,q1}, [r1]!
+	vld1.32         {q2,q3}, [r1]!
+	vld1.32         {q8,q9}, [r1]!
+	vld1.32         {q10,q11}, [r1]!
+	subs            r12, r12, #1
+	vst1.32         {q0,q1}, [r0]!
+	vst1.32         {q2,q3}, [r0]!
+	vst1.32         {q8,q9}, [r0]!
+	vst1.32         {q10,q11}, [r0]!
+	bne             .Lneon_f2b_copy_128_loop_nopld
+	ands            r2, r2, #0x7f	/* bytes left after the 128-byte blocks */
+	beq             .Lneon_memmove_done
+	cmp             r2, #32
+	bge             .Lneon_f2b_copy_32
+	b               .Lneon_f2b_copy_finish
+.Lneon_f2b_copy_32:
+	mov             r12, r2, lsr #5	/* r12 = number of 32-byte blocks */
+.Lneon_f2b_copy_32_loop:
+	vld1.32         {q0,q1}, [r1]!
+	subs            r12, r12, #1
+	vst1.32         {q0,q1}, [r0]!
+	bne             .Lneon_f2b_copy_32_loop
+	ands            r2, r2, #0x1f
+	beq             .Lneon_memmove_done
+.Lneon_f2b_copy_finish:
+.Lneon_f2b_copy_16:
+	movs            r12, r2, lsr #4	/* r12 = number of 16-byte chunks */
+	beq             .Lneon_f2b_copy_8
+.Lneon_f2b_copy_16_loop:
+	vld1.32         {q0}, [r1]!
+	subs            r12, r12, #1
+	vst1.32         {q0}, [r0]!
+	bne             .Lneon_f2b_copy_16_loop
+	ands            r2, r2, #0xf
+	beq             .Lneon_memmove_done
+.Lneon_f2b_copy_8:
+	movs            r12, r2, lsr #3	/* r12 = number of 8-byte chunks */
+	beq             .Lneon_f2b_copy_4
+.Lneon_f2b_copy_8_loop:
+	vld1.32         {d0}, [r1]!
+	subs            r12, r12, #1
+	vst1.32         {d0}, [r0]!
+	bne             .Lneon_f2b_copy_8_loop
+	ands            r2, r2, #0x7
+	beq             .Lneon_memmove_done
+.Lneon_f2b_copy_4:
+	movs            r12, r2, lsr #2	/* r12 = number of 4-byte words */
+	beq             .Lneon_f2b_copy_1
+.Lneon_f2b_copy_4_loop:
+	ldr             r3, [r1], #4
+	subs            r12, r12, #1
+	str             r3, [r0], #4
+	bne             .Lneon_f2b_copy_4_loop
+	ands            r2, r2, #0x3
+	nop
+.Lneon_f2b_copy_1:
+	cmp             r2, #0
+	beq             .Lneon_memmove_done
+.Lneon_f2b_copy_1_loop:
+	ldrb            r12, [r1], #1
+	subs            r2, r2, #1
+	strb            r12, [r0], #1
+	bne             .Lneon_f2b_copy_1_loop
+.Lneon_f2b_finish:
+	b               .Lneon_memmove_done
+
+	/* #############################################################
+	 * Back to Front copy
+	 */
+.Lneon_back_to_front_copy:
+	/*
+	 * Here, we'll want to shift to the end of the buffers.  This
+	 * actually points us one past where we need to go, but since
+	 * we'll pre-decrement throughout, this will be fine.
+	 */
+	add             r0, r0, r2
+	add             r1, r1, r2
+	cmp             r2, #4
+	bhi             .Lneon_b2f_gt4
+	cmp             r2, #0
+.Lneon_b2f_smallcopy_loop:
+	beq             .Lneon_memmove_done
+	ldrb            r12, [r1, #-1]!
+	subs            r2, r2, #1
+	strb            r12, [r0, #-1]!
+	b               .Lneon_b2f_smallcopy_loop
+.Lneon_b2f_gt4:
+	/*
+	 * The overlap window size (dest - src) is in r3; the minimum of
+	 * the window size and the copy size goes into r12, compared as
+	 * unsigned values.
+	 */
+	sub             r3, r0, r1
+	cmp             r2, r3
+	movls           r12, r2
+	movhi           r12, r3
+	cmp             r12, #256
+	bhs             .Lneon_b2f_copy_128
+	cmp             r12, #64
+	bhs             .Lneon_b2f_copy_32
+	cmp             r12, #8
+	bhs             .Lneon_b2f_copy_8
+	cmp             r12, #4
+	bhs             .Lneon_b2f_copy_4
+	b               .Lneon_b2f_copy_1
+	nop
+.Lneon_b2f_copy_128:
+	movs            r12, r2, lsr #7	/* r12 = number of 128-byte blocks */
+	cmp             r12, #PLDOFFS
+	ble             .Lneon_b2f_copy_128_loop_nopld
+	sub             r12, #PLDOFFS	/* the last PLDOFFS blocks are copied without pld */
+	pld             [r1, #-(PLDOFFS-1)*PLDSIZE]
+.Lneon_b2f_copy_128_loop_outer:
+	pld             [r1, #-(PLDOFFS*PLDSIZE)]
+	sub             r1, r1, #128
+	sub             r0, r0, #128
+	vld1.32         {q0, q1}, [r1]!
+	vld1.32         {q2, q3}, [r1]!
+	vld1.32         {q8, q9}, [r1]!
+	vld1.32         {q10, q11}, [r1]!
+	subs            r12, r12, #1
+	vst1.32         {q0, q1}, [r0]!
+	vst1.32         {q2, q3}, [r0]!
+	vst1.32         {q8, q9}, [r0]!
+	vst1.32         {q10, q11}, [r0]!
+	sub             r1, r1, #128	/* undo the post-increments */
+	sub             r0, r0, #128
+	bne             .Lneon_b2f_copy_128_loop_outer
+	mov             r12, #PLDOFFS
+.Lneon_b2f_copy_128_loop_nopld:
+	sub             r1, r1, #128
+	sub             r0, r0, #128
+	vld1.32         {q0, q1}, [r1]!
+	vld1.32         {q2, q3}, [r1]!
+	vld1.32         {q8, q9}, [r1]!
+	vld1.32         {q10, q11}, [r1]!
+	subs            r12, r12, #1
+	vst1.32         {q0, q1}, [r0]!
+	vst1.32         {q2, q3}, [r0]!
+	vst1.32         {q8, q9}, [r0]!
+	vst1.32         {q10, q11}, [r0]!
+	sub             r1, r1, #128	/* undo the post-increments */
+	sub             r0, r0, #128
+	bne             .Lneon_b2f_copy_128_loop_nopld
+	ands            r2, r2, #0x7f	/* bytes left after the 128-byte blocks */
+	beq             .Lneon_memmove_done
+	cmp             r2, #32
+	bge             .Lneon_b2f_copy_32
+	b               .Lneon_b2f_copy_finish
+.Lneon_b2f_copy_32:
+	mov             r12, r2, lsr #5	/* r12 = number of 32-byte blocks */
+.Lneon_b2f_copy_32_loop:
+	sub             r1, r1, #32
+	sub             r0, r0, #32
+	vld1.32         {q0,q1}, [r1]
+	subs            r12, r12, #1
+	vst1.32         {q0,q1}, [r0]
+	bne             .Lneon_b2f_copy_32_loop
+	ands            r2, r2, #0x1f
+	beq             .Lneon_memmove_done
+.Lneon_b2f_copy_finish:
+.Lneon_b2f_copy_8:
+	movs            r12, r2, lsr #0x3	/* r12 = number of 8-byte chunks */
+	beq             .Lneon_b2f_copy_4
+.Lneon_b2f_copy_8_loop:
+	sub             r1, r1, #8
+	sub             r0, r0, #8
+	vld1.32         {d0}, [r1]
+	subs            r12, r12, #1
+	vst1.32         {d0}, [r0]
+	bne             .Lneon_b2f_copy_8_loop
+	ands            r2, r2, #0x7
+	beq             .Lneon_memmove_done
+.Lneon_b2f_copy_4:
+	movs            r12, r2, lsr #0x2	/* r12 = number of 4-byte words */
+	beq             .Lneon_b2f_copy_1
+.Lneon_b2f_copy_4_loop:
+	ldr             r3, [r1, #-4]!
+	subs            r12, r12, #1
+	str             r3, [r0, #-4]!
+	bne             .Lneon_b2f_copy_4_loop
+	ands            r2, r2, #0x3
+	nop
+.Lneon_b2f_copy_1:
+	cmp             r2, #0
+	beq             .Lneon_memmove_done
+.Lneon_b2f_copy_1_loop:
+	ldrb            r12, [r1, #-1]!
+	subs            r2, r2, #1
+	strb            r12, [r0, #-1]!
+	bne             .Lneon_b2f_copy_1_loop
+
+.Lneon_memmove_done:
+	pop             {r0}	/* r0 = original dest */
+	bx              lr
+
+	.end
+#endif /* SCORPION_NEON_OPTIMIZATION */
+
commit	a8c02214555d91fc1df3fdba065a46a4fad15c58	[log] [tgz]
author	Brent DeGraaf <bdegraaf@codeaurora.org>	Wed May 30 22:50:19 2012 -0400
committer	Andrew Sutherland <dr3wsuth3rland@gmail.com>	Mon Dec 10 19:32:25 2012 -0600
tree	16af1366567c6716364014465bda88ffb9c6aaf0
parent	a5d721e629e69b83d49468a204e46f560e6f76b7 [diff]