libc: Add NEON optimized memmove ported from cm10.1 bionic-benchmarks on cortex-a9 generic: BM_string_memmove/8 50000000 67 119.35 MiB/s BM_string_memmove/64 10000000 175 364.59 MiB/s BM_string_memmove/512 1000000 1078 474.86 MiB/s BM_string_memmove/1K 1000000 2072 494.20 MiB/s BM_string_memmove/8K 100000 16400 499.50 MiB/s BM_string_memmove/16K 50000 32293 507.35 MiB/s BM_string_memmove/32K 50000 66585 492.12 MiB/s BM_string_memmove/64K 10000 160435 408.49 MiB/s NEON-optimized: BM_string_memmove/8 100000000 25 319.06 MiB/s BM_string_memmove/64 50000000 43 1472.60 MiB/s BM_string_memmove/512 10000000 247 2069.74 MiB/s BM_string_memmove/1K 5000000 463 2210.08 MiB/s BM_string_memmove/8K 500000 3465 2363.69 MiB/s BM_string_memmove/16K 500000 6894 2376.30 MiB/s BM_string_memmove/32K 100000 15490 2115.38 MiB/s BM_string_memmove/64K 50000 42097 1556.75 MiB/s Change-Id: I89253a01fb811438089e16320ac265177a2ca152

commit: 2fbb576e28ad4402b508e60824fd76cb01982f0b [log] [tgz]
author: athurh <athurh@gmail.com> Sat Aug 10 22:21:23 2013 +0200
committer: Andrew Sutherland <dr3wsuth3rland@gmail.com> Wed Aug 28 13:15:07 2013 -0500
tree: 8f5b06af1abe833412cf0c9b64d1b201498698b9
parent: 3ae650c524b760511ce2dc58c85e2e6270c947c2 [diff]
diff --git a/libc/Android.mk b/libc/Android.mk
index c2a2923..0f0affe 100644
--- a/libc/Android.mk
+++ b/libc/Android.mk

@@ -366,8 +366,6 @@
 ifeq ($(TARGET_ARCH),arm)
 libc_common_src_files += \
 	string/strncmp.c \
-#	bionic/memmove.c.arm \
-#	string/bcopy.c \
 
 # These files need to be arm so that gdbserver
 # can set breakpoints in them without messing

diff --git a/libc/arch-arm/bionic/memmove.S b/libc/arch-arm/bionic/memmove.S
new file mode 100644
index 0000000..788ba3c
--- /dev/null
+++ b/libc/arch-arm/bionic/memmove.S

@@ -0,0 +1,193 @@
+/***************************************************************************
+ Copyright (c) 2009-2012 Code Aurora Forum. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+     * Redistributions of source code must retain the above copyright
+       notice, this list of conditions and the following disclaimer.
+     * Redistributions in binary form must reproduce the above copyright
+       notice, this list of conditions and the following disclaimer in the
+       documentation and/or other materials provided with the distribution.
+     * Neither the name of Code Aurora nor the names of its contributors may
+       be used to endorse or promote products derived from this software
+       without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+  ***************************************************************************/
+
+/***************************************************************************
+ *  Neon memmove: Attempts to do a memmove with Neon registers if possible,
+ *     Inputs:
+ *        dest: The destination buffer
+ *        src: The source buffer
+ *        n: The size of the buffer to transfer
+ *     Outputs:
+ *
+ ***************************************************************************/
+
+#include <machine/cpu-features.h>
+
+#ifndef PLDOFFS
+#define PLDOFFS	(10)
+#endif
+#ifndef PLDTHRESH
+#define PLDTHRESH (PLDOFFS)
+#endif
+#if (PLDOFFS < 5)
+#error Routine does not support offsets less than 5
+#endif
+#if (PLDTHRESH < PLDOFFS)
+#error PLD threshold must be greater than or equal to the PLD offset
+#endif
+#ifndef PLDSIZE
+#define PLDSIZE (64)
+#endif
+#define NOP_OPCODE (0xe320f000)
+
+	.code 32
+	.align 5
+	.global memmove
+	.type memmove, %function
+
+	.global bcopy
+	.type bcopy, %function
+
+bcopy:
+	mov	r12, r0
+	mov	r0, r1
+	mov	r1, r12
+	.balignl 64, NOP_OPCODE, 4*2
+memmove:
+.Lneon_memmove_cmf:
+	subs	r12, r0, r1
+	bxeq	lr
+	cmphi	r2, r12
+	bls	memcpy	/* Use memcpy for non-overlapping areas */
+
+	push	{r0}
+
+.Lneon_back_to_front_copy:
+	add	r0, r0, r2
+	add	r1, r1, r2
+	cmp	r2, #4
+	bgt	.Lneon_b2f_gt4
+	cmp	r2, #0
+.Lneon_b2f_smallcopy_loop:
+	beq	.Lneon_memmove_done
+	ldrb	r12, [r1, #-1]!
+	subs	r2, r2, #1
+	strb	r12, [r0, #-1]!
+	b	.Lneon_b2f_smallcopy_loop
+.Lneon_b2f_gt4:
+	sub	r3, r0, r1
+	cmp	r2, r3
+	movle	r12, r2
+	movgt	r12, r3
+	cmp	r12, #64
+	bge	.Lneon_b2f_copy_64
+	cmp	r12, #32
+	bge	.Lneon_b2f_copy_32
+	cmp	r12, #8
+	bge	.Lneon_b2f_copy_8
+	cmp	r12, #4
+	bge	.Lneon_b2f_copy_4
+	b	.Lneon_b2f_copy_1
+.Lneon_b2f_copy_64:
+	sub	r1, r1, #64	/* Predecrement */
+	sub	r0, r0, #64
+	movs	r12, r2, lsr #6
+	cmp	r12, #PLDTHRESH
+	ble	.Lneon_b2f_copy_64_loop_nopld
+	sub	r12, #PLDOFFS
+	pld	[r1, #-(PLDOFFS-5)*PLDSIZE]
+	pld	[r1, #-(PLDOFFS-4)*PLDSIZE]
+	pld	[r1, #-(PLDOFFS-3)*PLDSIZE]
+	pld	[r1, #-(PLDOFFS-2)*PLDSIZE]
+	pld	[r1, #-(PLDOFFS-1)*PLDSIZE]
+	.balignl 64, NOP_OPCODE, 4*2
+.Lneon_b2f_copy_64_loop_outer:
+	pld	[r1, #-(PLDOFFS)*PLDSIZE]
+	vld1.32	{q0, q1}, [r1]!
+	vld1.32	{q2, q3}, [r1]
+	subs	r12, r12, #1
+	vst1.32	{q0, q1}, [r0]!
+	sub	r1, r1, #96	/* Post-fixup and predecrement */
+	vst1.32	{q2, q3}, [r0]
+	sub	r0, r0, #96
+	bne	.Lneon_b2f_copy_64_loop_outer
+	mov	r12, #PLDOFFS
+	.balignl 64, NOP_OPCODE, 4*2
+.Lneon_b2f_copy_64_loop_nopld:
+	vld1.32	{q8, q9}, [r1]!
+	vld1.32	{q10, q11}, [r1]
+	subs	r12, r12, #1
+	vst1.32	{q8, q9}, [r0]!
+	sub	r1, r1, #96	/* Post-fixup and predecrement */
+	vst1.32	{q10, q11}, [r0]
+	sub	r0, r0, #96
+	bne	.Lneon_b2f_copy_64_loop_nopld
+	ands	r2, r2, #0x3f
+	beq	.Lneon_memmove_done
+	add	r1, r1, #64	/* Post-fixup */
+	add	r0, r0, #64
+	cmp	r2, #32
+	blt	.Lneon_b2f_copy_finish
+.Lneon_b2f_copy_32:
+	mov	r12, r2, lsr #5
+.Lneon_b2f_copy_32_loop:
+	sub	r1, r1, #32	/* Predecrement */
+	sub	r0, r0, #32
+	vld1.32	{q0,q1}, [r1]
+	subs	r12, r12, #1
+	vst1.32	{q0,q1}, [r0]
+	bne	.Lneon_b2f_copy_32_loop
+	ands	r2, r2, #0x1f
+	beq	.Lneon_memmove_done
+.Lneon_b2f_copy_finish:
+.Lneon_b2f_copy_8:
+	movs	r12, r2, lsr #0x3
+	beq	.Lneon_b2f_copy_4
+	.balignl 64, NOP_OPCODE, 4*2
+.Lneon_b2f_copy_8_loop:
+	sub	r1, r1, #8	/* Predecrement */
+	sub	r0, r0, #8
+	vld1.32	{d0}, [r1]
+	subs	r12, r12, #1
+	vst1.32	{d0}, [r0]
+	bne	.Lneon_b2f_copy_8_loop
+	ands	r2, r2, #0x7
+	beq	.Lneon_memmove_done
+.Lneon_b2f_copy_4:
+	movs	r12, r2, lsr #0x2
+	beq	.Lneon_b2f_copy_1
+.Lneon_b2f_copy_4_loop:
+	ldr	r3, [r1, #-4]!
+	subs	r12, r12, #1
+	str	r3, [r0, #-4]!
+	bne	.Lneon_b2f_copy_4_loop
+	ands	r2, r2, #0x3
+.Lneon_b2f_copy_1:
+	cmp	r2, #0
+	beq	.Lneon_memmove_done
+	.balignl 64, NOP_OPCODE, 4*2
+.Lneon_b2f_copy_1_loop:
+	ldrb	r12, [r1, #-1]!
+	subs	r2, r2, #1
+	strb	r12, [r0, #-1]!
+	bne	.Lneon_b2f_copy_1_loop
+
+.Lneon_memmove_done:
+	pop	{r0}
+	bx	lr
+
+	.end

diff --git a/libc/arch-arm/cortex-a9/cortex-a9.mk b/libc/arch-arm/cortex-a9/cortex-a9.mk
index 61a52c2..e14f180 100644
--- a/libc/arch-arm/cortex-a9/cortex-a9.mk
+++ b/libc/arch-arm/cortex-a9/cortex-a9.mk

@@ -2,7 +2,13 @@
 $(call libc-add-cpu-variant-src,MEMSET,arch-arm/cortex-a9/bionic/memset.S)
 $(call libc-add-cpu-variant-src,STRCAT,arch-arm/cortex-a9/bionic/strcat.S)
 $(call libc-add-cpu-variant-src,STRCMP,arch-arm/cortex-a9/bionic/strcmp.S)
+ifeq ($(ARCH_ARM_HAVE_NEON),true)
+$(call libc-add-cpu-variant-src,MEMMOVE,arch-arm/bionic/memmove.S)
+else
+$(call libc-add-cpu-variant-src,MEMMOVE,bionic/memmove.c.arm)
+$(call libc-add-cpu-variant-src,BCOPY,string/bcopy.c.arm)
+endif
 $(call libc-add-cpu-variant-src,STRCPY,arch-arm/cortex-a9/bionic/strcpy.S)
 $(call libc-add-cpu-variant-src,STRLEN,arch-arm/cortex-a9/bionic/strlen.S)
 
-include bionic/libc/arch-arm/generic/generic.mk
+#include bionic/libc/arch-arm/generic/generic.mk
commit	2fbb576e28ad4402b508e60824fd76cb01982f0b	[log] [tgz]
author	athurh <athurh@gmail.com>	Sat Aug 10 22:21:23 2013 +0200
committer	Andrew Sutherland <dr3wsuth3rland@gmail.com>	Wed Aug 28 13:15:07 2013 -0500
tree	8f5b06af1abe833412cf0c9b64d1b201498698b9
parent	3ae650c524b760511ce2dc58c85e2e6270c947c2 [diff]