Replace Android mem* routines with CodeAurora versions.

Update the memcpy, memmove, and memset routines to use the
versions from CodeAurora when enabled in the bionic/Android.mk
file (the enabling flag is actually set in the BoardConfig.mk file
under device/<vendor>/<board>).  With this change, the CodeAurora
mem* routines are used only for the msm8660; all other platforms
keep the current Android mem* routines.
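
For example, a board opts in with a line like the following in its
BoardConfig.mk (a sketch; this is the flag tested in libc/Android.mk):

	TARGET_USE_SCORPION_BIONIC_OPTIMIZATION := true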

Future platforms can set the flag in their BoardConfig.mk to pick up
the CodeAurora-based mem* routines as desired.  This has the benefit
of making the CodeAurora-based routines opt-in instead of opt-out.

Also, PLDOFFS and PLDSIZE can be overridden in BoardConfig.mk, so
other platforms with different PLD tunings can use the same code
without modifying the source files themselves.
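
For instance (the values shown are the defaults compiled into the
source, PLDOFFS=6 and PLDSIZE=128; actual tunings are board-specific):

	TARGET_USE_SCORPION_PLD_SET := true
	TARGET_SCORPION_BIONIC_PLDOFFS := 6
	TARGET_SCORPION_BIONIC_PLDSIZE := 128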

Tests with FileCycler-0.3 showed a slight (1.1%) improvement with
these routines on an 8660v2, based on the average of three FileCycler
runs with and without the patch.  Since the min/max ranges did not
overlap and the average score improved, we can consider upstreaming
these modifications.

Change-Id: I6946076bc6a88a2a2c8667b09494e1eb31e01ee0

Conflicts:

	libc/Android.mk

Signed-off-by: Andrew Sutherland <dr3wsuth3rland@gmail.com>
diff --git a/libc/Android.mk b/libc/Android.mk
index 59d0065..3832e24 100644
--- a/libc/Android.mk
+++ b/libc/Android.mk
@@ -362,8 +362,6 @@
 	arch-arm/bionic/strcpy.S \
 	arch-arm/bionic/strcmp.S \
 	arch-arm/bionic/syscall.S \
-	string/memmove.c.arm \
-	string/bcopy.c \
 	string/strncmp.c \
 	unistd/socketcalls.c
 ifeq ($(ARCH_ARM_HAVE_ARMV7A),true)
@@ -372,6 +370,17 @@
 libc_common_src_files += arch-arm/bionic/strlen.c.arm
 endif
 
+# Check if we want the NEON-optimized version of memmove instead of
+# the current ARM version
+ifeq ($(TARGET_USE_SCORPION_BIONIC_OPTIMIZATION),true)
+libc_common_src_files += \
+	arch-arm/bionic/memmove.S
+else # Non-Scorpion-based ARM
+libc_common_src_files += \
+	string/bcopy.c \
+	string/memmove.c.arm
+endif # !TARGET_USE_SCORPION_BIONIC_OPTIMIZATION
+
 # These files need to be arm so that gdbserver
 # can set breakpoints in them without messing
 # up any thumb code.
@@ -516,9 +525,20 @@
   ifeq ($(ARCH_ARM_HAVE_TLS_REGISTER),true)
     libc_common_cflags += -DHAVE_ARM_TLS_REGISTER
   endif
+
   ifeq ($(TARGET_HAVE_TEGRA_ERRATA_657451),true)
     libc_common_cflags += -DHAVE_TEGRA_ERRATA_657451
   endif
+
+  # Add in defines to activate SCORPION_NEON_OPTIMIZATION
+  ifeq ($(TARGET_USE_SCORPION_BIONIC_OPTIMIZATION),true)
+    libc_common_cflags += -DSCORPION_NEON_OPTIMIZATION
+    ifeq ($(TARGET_USE_SCORPION_PLD_SET),true)
+      libc_common_cflags += -DPLDOFFS=$(TARGET_SCORPION_BIONIC_PLDOFFS)
+      libc_common_cflags += -DPLDSIZE=$(TARGET_SCORPION_BIONIC_PLDSIZE)
+    endif
+  endif
+
 else # !arm
   ifeq ($(TARGET_ARCH),x86)
     libc_crt_target_cflags := -m32
diff --git a/libc/arch-arm/bionic/memcpy.S b/libc/arch-arm/bionic/memcpy.S
index 438fa00..90e788a 100644
--- a/libc/arch-arm/bionic/memcpy.S
+++ b/libc/arch-arm/bionic/memcpy.S
@@ -2,6 +2,8 @@
  * Copyright (C) 2008 The Android Open Source Project
  * All rights reserved.
  *
+ * Copyright (c) 2009-2011, Code Aurora Forum. All rights reserved.
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
@@ -30,7 +32,114 @@
 #include <machine/asm.h>
 
 #if defined(__ARM_NEON__)
-
+#if defined(SCORPION_NEON_OPTIMIZATION)
+	/*
+	 * These can be overridden in:
+	 *   device/<vendor>/<board>/BoardConfig.mk
+	 * by setting the following:
+	 *   TARGET_USE_SCORPION_BIONIC_OPTIMIZATION := true
+	 *   TARGET_USE_SCORPION_PLD_SET := true
+	 *   TARGET_SCORPION_BIONIC_PLDOFFS := <pldoffset>
+	 *   TARGET_SCORPION_BIONIC_PLDSIZE := <pldsize>
+	 */
+#ifndef PLDOFFS
+#define PLDOFFS	(6)
+#endif
+#ifndef PLDSIZE
+#define PLDSIZE	(128)	/* L2 cache line size */
+#endif
+        .code 32
+        .align 5
+        .globl memcpy
+        .func
+memcpy:
+	push            {r0}
+	cmp             r2, #4
+	blt             .Lneon_lt4
+	cmp             r2, #16
+	blt             .Lneon_lt16
+	cmp             r2, #32
+	blt             .Lneon_16
+	cmp             r2, #128
+	blt             .Lneon_copy_32_a
+	/* Copy blocks of 128 bytes (word-aligned) at a time */
+	/* Code below is optimized for PLDSIZE=128 only */
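+	/* The outer loop prefetches PLDOFFS cache lines ahead of the
+	 * loads; the final PLDOFFS iterations run in the nopld loop so
+	 * we never prefetch past the end of the source. */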
+	mov             r12, r2, lsr #7
+	cmp             r12, #PLDOFFS
+	ble             .Lneon_copy_128_loop_nopld
+	sub             r12, #PLDOFFS
+	pld             [r1, #(PLDOFFS-1)*PLDSIZE]
+.Lneon_copy_128_loop_outer:
+	pld             [r1, #(PLDOFFS*PLDSIZE)]
+	vld1.32         {q0, q1}, [r1]!
+	vld1.32         {q2, q3}, [r1]!
+	vld1.32         {q8, q9}, [r1]!
+	vld1.32         {q10, q11}, [r1]!
+	subs            r12, r12, #1
+	vst1.32	        {q0, q1}, [r0]!
+	vst1.32         {q2, q3}, [r0]!
+	vst1.32         {q8, q9}, [r0]!
+	vst1.32         {q10, q11}, [r0]!
+	bne             .Lneon_copy_128_loop_outer
+	mov             r12, #PLDOFFS
+.Lneon_copy_128_loop_nopld:
+	vld1.32         {q0, q1}, [r1]!
+	vld1.32         {q2, q3}, [r1]!
+	vld1.32         {q8, q9}, [r1]!
+	vld1.32         {q10, q11}, [r1]!
+	subs            r12, r12, #1
+	vst1.32         {q0, q1}, [r0]!
+	vst1.32         {q2, q3}, [r0]!
+	vst1.32         {q8, q9}, [r0]!
+	vst1.32         {q10, q11}, [r0]!
+	bne             .Lneon_copy_128_loop_nopld
+	ands            r2, r2, #0x7f
+	beq             .Lneon_exit
+	cmp             r2, #32
+	blt             .Lneon_16
+	nop
+	/* Copy blocks of 32 bytes (word-aligned) at a time */
+.Lneon_copy_32_a:
+	mov             r12, r2, lsr #5
+.Lneon_copy_32_loop_a:
+	vld1.32         {q0,q1}, [r1]!
+	subs            r12, r12, #1
+	vst1.32         {q0,q1}, [r0]!
+	bne             .Lneon_copy_32_loop_a
+	ands            r2, r2, #0x1f
+	beq             .Lneon_exit
+.Lneon_16:
+	subs            r2, r2, #16
+	blt             .Lneon_lt16
+	vld1.32         {q8}, [r1]!
+	vst1.32         {q8}, [r0]!
+	beq             .Lneon_exit
+.Lneon_lt16:
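+	/* lsl #29 shifts bit 3 of the count into C and bit 2 into N,
+	 * so C selects an 8-byte copy and N a 4-byte copy; lsl #31
+	 * below does the same for the 2- and 1-byte tails. */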
+	movs            r12, r2, lsl #29
+	bcc             .Lneon_skip8
+	ldr             r3, [r1], #4
+	ldr             r12, [r1], #4
+	str             r3, [r0], #4
+	str             r12, [r0], #4
+.Lneon_skip8:
+	bpl             .Lneon_lt4
+	ldr             r3, [r1], #4
+	str             r3, [r0], #4
+.Lneon_lt4:
+	movs            r2, r2, lsl #31
+	bcc             .Lneon_lt2
+	ldrh            r3, [r1], #2
+	strh            r3, [r0], #2
+.Lneon_lt2:
+	bpl             .Lneon_exit
+	ldrb            r12, [r1]
+	strb            r12, [r0]
+.Lneon_exit:
+	pop             {r0}
+	bx              lr
+	.endfunc
+	.end
+#else /* !SCORPION_NEON_OPTIMIZATION */
         .text
         .fpu    neon
 
@@ -141,7 +250,7 @@
         bx          lr
 END(memcpy)
 
-
+#endif  /* !SCORPION_NEON_OPTIMIZATION */
 #else   /* __ARM_ARCH__ < 7 */
 
 
diff --git a/libc/arch-arm/bionic/memmove.S b/libc/arch-arm/bionic/memmove.S
new file mode 100644
index 0000000..1234195
--- /dev/null
+++ b/libc/arch-arm/bionic/memmove.S
@@ -0,0 +1,356 @@
+/***************************************************************************
+ Copyright (c) 2009-2011 Code Aurora Forum. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+     * Redistributions of source code must retain the above copyright
+       notice, this list of conditions and the following disclaimer.
+     * Redistributions in binary form must reproduce the above copyright
+       notice, this list of conditions and the following disclaimer in the
+       documentation and/or other materials provided with the distribution.
+     * Neither the name of Code Aurora nor the names of its contributors may
+       be used to endorse or promote products derived from this software
+       without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+  ***************************************************************************/
+
+/***************************************************************************
+ *  Neon memmove: Attempts to do a memmove with Neon registers if possible,
+ *     Inputs:
+ *        dest: The destination buffer
+ *        src: The source buffer
+ *        n: The size of the buffer to transfer
+ *     Outputs:
+ *
+ ***************************************************************************/
+
+#include <machine/cpu-features.h>
+
+#if defined(SCORPION_NEON_OPTIMIZATION)
+	/*
+	 * These can be overridden in:
+	 *   device/<vendor>/<board>/BoardConfig.mk
+	 * by setting the following:
+	 *   TARGET_USE_SCORPION_BIONIC_OPTIMIZATION := true
+	 *   TARGET_USE_SCORPION_PLD_SET := true
+	 *   TARGET_SCORPION_BIONIC_PLDOFFS := <pldoffset>
+	 *   TARGET_SCORPION_BIONIC_PLDSIZE := <pldsize>
+	 */
+#ifndef PLDOFFS
+#define PLDOFFS	(6)
+#endif
+#ifndef PLDSIZE
+#define PLDSIZE	(128)	/* L2 cache line size */
+#endif
+
+	.code 32
+	.align 5
+	.global memmove
+	.type memmove, %function
+
+	.global bcopy
+	.type bcopy, %function
+
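+/*
+ * bcopy(src, dst, n) is memmove(dst, src, n) with the first two
+ * arguments swapped, so exchange r0 and r1 and fall through.
+ */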
+bcopy:
+	mov		r12, r0
+	mov		r0, r1
+	mov		r1, r12
+memmove:
+	push            {r0}
+
+	/*
+	 * The requirements for memmove state that the function should
+	 * operate as if data were being copied from the source to a
+	 * buffer, then to the destination.  This is to allow a user
+	 * to copy data from a source and target that overlap.
+	 *
+	 * We can't just do byte copies front-to-back automatically, since
+	 * there's a good chance we may have an overlap (why else would someone
+	 * intentionally use memmove then?).
+	 *
+	 * We'll break this into two parts.  Front-to-back, or back-to-front
+	 * copies.
+	 */
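+	/*
+	 * dest < source: front-to-back copy is safe.
+	 * dest > source: back-to-front copy is safe.
+	 * dest == source: nothing to copy.
+	 */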
+.Lneon_memmove_cmf:
+	cmp             r0, r1
+	blt             .Lneon_front_to_back_copy
+	bgt             .Lneon_back_to_front_copy
+	b               .Lneon_memmove_done
+
+	/* #############################################################
+	 * Front to Back copy
+	 */
+.Lneon_front_to_back_copy:
+	/*
+	 * For small copies, just do a quick memcpy.  We can do this for
+	 * front-to-back copies, aligned or unaligned, since we're only
+	 * doing 1 byte at a time...
+	 */
+	cmp             r2, #4
+	bgt             .Lneon_f2b_gt4
+	cmp             r2, #0
+.Lneon_f2b_smallcopy_loop:
+	beq             .Lneon_memmove_done
+	ldrb            r12, [r1], #1
+	subs            r2, r2, #1
+	strb            r12, [r0], #1
+	b               .Lneon_f2b_smallcopy_loop
+.Lneon_f2b_gt4:
+	/* The window size is in r3. */
+	sub             r3, r1, r0
+	/*
+	 * Note that we can't just route based on the size in r2.  If that's
+	 * larger than the overlap window in r3, we could potentially
+	 * (and likely!) destroy data we're copying.
+	 */
+	cmp             r2, r3
+	movle           r12, r2
+	movgt           r12, r3
+	cmp             r12, #256
+	bge             .Lneon_f2b_copy_128
+	cmp             r12, #64
+	bge             .Lneon_f2b_copy_32
+	cmp             r12, #16
+	bge             .Lneon_f2b_copy_16
+	cmp             r12, #8
+	bge             .Lneon_f2b_copy_8
+	cmp             r12, #4
+	bge             .Lneon_f2b_copy_4
+	b               .Lneon_f2b_copy_1
+	nop
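+	/* Same prefetch pipelining as memcpy: prefetch PLDOFFS cache
+	 * lines ahead in the main loop, then finish the last PLDOFFS
+	 * blocks without pld. */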
+.Lneon_f2b_copy_128:
+	mov             r12, r2, lsr #7
+	cmp             r12, #PLDOFFS
+	ble             .Lneon_f2b_copy_128_loop_nopld
+	sub             r12, #PLDOFFS
+	pld             [r1, #(PLDOFFS-1)*PLDSIZE]
+.Lneon_f2b_copy_128_loop_outer:
+	pld             [r1, #(PLDOFFS*PLDSIZE)]
+	vld1.32         {q0,q1}, [r1]!
+	vld1.32         {q2,q3}, [r1]!
+	vld1.32         {q8,q9}, [r1]!
+	vld1.32         {q10,q11}, [r1]!
+	subs            r12, r12, #1
+	vst1.32         {q0,q1}, [r0]!
+	vst1.32         {q2,q3}, [r0]!
+	vst1.32         {q8,q9}, [r0]!
+	vst1.32         {q10,q11}, [r0]!
+	bne             .Lneon_f2b_copy_128_loop_outer
+	mov             r12, #PLDOFFS
+.Lneon_f2b_copy_128_loop_nopld:
+	vld1.32         {q0,q1}, [r1]!
+	vld1.32         {q2,q3}, [r1]!
+	vld1.32         {q8,q9}, [r1]!
+	vld1.32         {q10,q11}, [r1]!
+	subs            r12, r12, #1
+	vst1.32         {q0,q1}, [r0]!
+	vst1.32         {q2,q3}, [r0]!
+	vst1.32         {q8,q9}, [r0]!
+	vst1.32         {q10,q11}, [r0]!
+	bne             .Lneon_f2b_copy_128_loop_nopld
+	ands            r2, r2, #0x7f
+	beq             .Lneon_memmove_done
+	cmp             r2, #32
+	bge             .Lneon_f2b_copy_32
+	b               .Lneon_f2b_copy_finish
+.Lneon_f2b_copy_32:
+	mov             r12, r2, lsr #5
+.Lneon_f2b_copy_32_loop:
+	vld1.32         {q0,q1}, [r1]!
+	subs            r12, r12, #1
+	vst1.32         {q0,q1}, [r0]!
+	bne             .Lneon_f2b_copy_32_loop
+	ands            r2, r2, #0x1f
+	beq             .Lneon_memmove_done
+.Lneon_f2b_copy_finish:
+.Lneon_f2b_copy_16:
+	movs            r12, r2, lsr #4
+	beq             .Lneon_f2b_copy_8
+.Lneon_f2b_copy_16_loop:
+	vld1.32         {q0}, [r1]!
+	subs            r12, r12, #1
+	vst1.32         {q0}, [r0]!
+	bne             .Lneon_f2b_copy_16_loop
+	ands            r2, r2, #0xf
+	beq             .Lneon_memmove_done
+.Lneon_f2b_copy_8:
+	movs            r12, r2, lsr #3
+	beq             .Lneon_f2b_copy_4
+.Lneon_f2b_copy_8_loop:
+	vld1.32         {d0}, [r1]!
+	subs            r12, r12, #1
+	vst1.32         {d0}, [r0]!
+	bne             .Lneon_f2b_copy_8_loop
+	ands            r2, r2, #0x7
+	beq             .Lneon_memmove_done
+.Lneon_f2b_copy_4:
+	movs            r12, r2, lsr #2
+	beq             .Lneon_f2b_copy_1
+.Lneon_f2b_copy_4_loop:
+	ldr             r3, [r1], #4
+	subs            r12, r12, #1
+	str             r3, [r0], #4
+	bne             .Lneon_f2b_copy_4_loop
+	ands            r2, r2, #0x3
+	nop
+.Lneon_f2b_copy_1:
+	cmp             r2, #0
+	beq             .Lneon_memmove_done
+.Lneon_f2b_copy_1_loop:
+	ldrb            r12, [r1], #1
+	subs            r2, r2, #1
+	strb            r12, [r0], #1
+	bne             .Lneon_f2b_copy_1_loop
+.Lneon_f2b_finish:
+	b               .Lneon_memmove_done
+
+	/* #############################################################
+	 * Back to Front copy
+	 */
+.Lneon_back_to_front_copy:
+	/*
+	 * Here, we'll want to shift to the end of the buffers.  This
+	 * actually points us one past where we need to go, but since
+	 * we'll pre-decrement throughout, this will be fine.
+	 */
+	add             r0, r0, r2
+	add             r1, r1, r2
+	cmp             r2, #4
+	bgt             .Lneon_b2f_gt4
+	cmp             r2, #0
+.Lneon_b2f_smallcopy_loop:
+	beq             .Lneon_memmove_done
+	ldrb            r12, [r1, #-1]!
+	subs            r2, r2, #1
+	strb            r12, [r0, #-1]!
+	b               .Lneon_b2f_smallcopy_loop
+.Lneon_b2f_gt4:
+	/*
+	 * The overlap window size (dest - source) goes in r3; the copy
+	 * is routed on min(r2, r3), computed into r12 below, so we
+	 * never copy in chunks larger than the window.
+	 */
+	sub             r3, r0, r1
+	cmp             r2, r3
+	movle           r12, r2
+	movgt           r12, r3
+	cmp             r12, #256
+	bge             .Lneon_b2f_copy_128
+	cmp             r12, #64
+	bge             .Lneon_b2f_copy_32
+	cmp             r12, #8
+	bge             .Lneon_b2f_copy_8
+	cmp             r12, #4
+	bge             .Lneon_b2f_copy_4
+	b               .Lneon_b2f_copy_1
+	nop
+.Lneon_b2f_copy_128:
+	movs            r12, r2, lsr #7
+	cmp             r12, #PLDOFFS
+	ble             .Lneon_b2f_copy_128_loop_nopld
+	sub             r12, #PLDOFFS
+	pld             [r1, #-(PLDOFFS-1)*PLDSIZE]
+.Lneon_b2f_copy_128_loop_outer:
+	pld             [r1, #-(PLDOFFS*PLDSIZE)]
+	sub             r1, r1, #128
+	sub             r0, r0, #128
+	vld1.32         {q0, q1}, [r1]!
+	vld1.32         {q2, q3}, [r1]!
+	vld1.32         {q8, q9}, [r1]!
+	vld1.32         {q10, q11}, [r1]!
+	subs            r12, r12, #1
+	vst1.32         {q0, q1}, [r0]!
+	vst1.32         {q2, q3}, [r0]!
+	vst1.32         {q8, q9}, [r0]!
+	vst1.32         {q10, q11}, [r0]!
+	sub             r1, r1, #128
+	sub             r0, r0, #128
+	bne             .Lneon_b2f_copy_128_loop_outer
+	mov             r12, #PLDOFFS
+.Lneon_b2f_copy_128_loop_nopld:
+	sub             r1, r1, #128
+	sub             r0, r0, #128
+	vld1.32         {q0, q1}, [r1]!
+	vld1.32         {q2, q3}, [r1]!
+	vld1.32         {q8, q9}, [r1]!
+	vld1.32         {q10, q11}, [r1]!
+	subs            r12, r12, #1
+	vst1.32         {q0, q1}, [r0]!
+	vst1.32         {q2, q3}, [r0]!
+	vst1.32         {q8, q9}, [r0]!
+	vst1.32         {q10, q11}, [r0]!
+	sub             r1, r1, #128
+	sub             r0, r0, #128
+	bne             .Lneon_b2f_copy_128_loop_nopld
+	ands            r2, r2, #0x7f
+	beq             .Lneon_memmove_done
+	cmp             r2, #32
+	bge             .Lneon_b2f_copy_32
+	b               .Lneon_b2f_copy_finish
+.Lneon_b2f_copy_32:
+	mov             r12, r2, lsr #5
+.Lneon_b2f_copy_32_loop:
+	sub             r1, r1, #32
+	sub             r0, r0, #32
+	vld1.32         {q0,q1}, [r1]
+	subs            r12, r12, #1
+	vst1.32         {q0,q1}, [r0]
+	bne             .Lneon_b2f_copy_32_loop
+	ands            r2, r2, #0x1f
+	beq             .Lneon_memmove_done
+.Lneon_b2f_copy_finish:
+.Lneon_b2f_copy_8:
+	movs            r12, r2, lsr #3
+	beq             .Lneon_b2f_copy_4
+.Lneon_b2f_copy_8_loop:
+	sub             r1, r1, #8
+	sub             r0, r0, #8
+	vld1.32         {d0}, [r1]
+	subs            r12, r12, #1
+	vst1.32         {d0}, [r0]
+	bne             .Lneon_b2f_copy_8_loop
+	ands            r2, r2, #0x7
+	beq             .Lneon_memmove_done
+.Lneon_b2f_copy_4:
+	movs            r12, r2, lsr #2
+	beq             .Lneon_b2f_copy_1
+.Lneon_b2f_copy_4_loop:
+	ldr             r3, [r1, #-4]!
+	subs            r12, r12, #1
+	str             r3, [r0, #-4]!
+	bne             .Lneon_b2f_copy_4_loop
+	ands            r2, r2, #0x3
+	nop
+.Lneon_b2f_copy_1:
+	cmp             r2, #0
+	beq             .Lneon_memmove_done
+.Lneon_b2f_copy_1_loop:
+	ldrb            r12, [r1, #-1]!
+	subs            r2, r2, #1
+	strb            r12, [r0, #-1]!
+	bne             .Lneon_b2f_copy_1_loop
+
+.Lneon_memmove_done:
+	pop             {r0}
+	bx              lr
+
+	.end
+#endif /* SCORPION_NEON_OPTIMIZATION */
+
diff --git a/libc/arch-arm/bionic/memset.S b/libc/arch-arm/bionic/memset.S
index 273b9e3..3ea5aef 100644
--- a/libc/arch-arm/bionic/memset.S
+++ b/libc/arch-arm/bionic/memset.S
@@ -2,6 +2,8 @@
  * Copyright (C) 2008 The Android Open Source Project
  * All rights reserved.
  *
+ * Copyright (c) 2009-2011, Code Aurora Forum. All rights reserved.
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
@@ -27,6 +29,90 @@
  */
 
 #include <machine/asm.h>
+
+#if defined(SCORPION_NEON_OPTIMIZATION)
+	.code 32
+	.align 8
+	.global memset
+	.type memset, %function
+
+	.global bzero
+	.type bzero, %function
+
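+/*
+ * bzero(s, n) is memset(s, 0, n): move the length from r1 to r2 and
+ * zero the fill value before falling into memset.
+ */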
+bzero:
+	mov             r2, r1
+	mov             r1, #0
+memset:
+	push            {r0}
+
+	cmp             r2, #6
+	bgt             .Lmemset_gt6
+	cmp             r2, #0
+	beq             .Lmemset_smallcopy_done
+.Lmemset_smallcopy_loop:
+	strb            r1, [r0], #1
+	subs            r2, r2, #1
+	bne             .Lmemset_smallcopy_loop
+.Lmemset_smallcopy_done:
+	pop             {r0}
+	bx              lr
+
+.Lmemset_gt6:
+	vdup.8		q0, r1
+	vmov		r1, s0
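+	/* q0 now holds the fill byte replicated into every lane; r1 holds
+	 * one replicated 32-bit word for the scalar tail stores below. */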
+
+	/*
+	 * Decide where to route for the maximum copy sizes.
+	 */
+	cmp             r2, #4
+	blt             .Lmemset_lt4
+	cmp             r2, #16
+	blt             .Lmemset_lt16
+	vmov            q1, q0
+	cmp             r2, #128
+	blt             .Lmemset_32
+.Lmemset_128:
+	mov             r12, r2, lsr #7
+.Lmemset_128_loop:
+	vst1.32         {q0, q1}, [r0]!
+	vst1.32         {q0, q1}, [r0]!
+	vst1.32         {q0, q1}, [r0]!
+	vst1.32         {q0, q1}, [r0]!
+	subs            r12, r12, #1
+	bne             .Lmemset_128_loop
+	ands            r2, r2, #0x7f
+	beq             .Lmemset_end
+.Lmemset_32:
+	movs            r12, r2, lsr #5
+	beq             .Lmemset_lt32
+.Lmemset_32_loop:
+	subs            r12, r12, #1
+	vst1.32         {q0, q1}, [r0]!
+	bne             .Lmemset_32_loop
+	ands            r2, r2, #0x1f
+	beq             .Lmemset_end
+.Lmemset_lt32:
+	cmp             r2, #16
+	blt             .Lmemset_lt16
+	vst1.64         {q0}, [r0]!
+	subs            r2, r2, #16
+	beq             .Lmemset_end
+.Lmemset_lt16:
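+	/* As in memcpy: lsl #29 shifts bit 3 of the count into C and
+	 * bit 2 into N, selecting the 8- and 4-byte stores; lsl #31
+	 * below does the same for the 2- and 1-byte tails. */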
+	movs            r12, r2, lsl #29
+	strcs           r1, [r0], #4
+	strcs           r1, [r0], #4
+	strmi           r1, [r0], #4
+.Lmemset_lt4:
+	movs            r2, r2, lsl #31
+	strcsh          r1, [r0], #2
+	strmib          r1, [r0]
+.Lmemset_end:
+	pop             {r0}
+	bx		lr
+
+	.end
+#else   /* !SCORPION_NEON_OPTIMIZATION */
+
 	
 		/*
 		 * Optimized memset() for ARM.
@@ -107,3 +193,5 @@
         ldmfd		sp!, {r0, r4-r7, lr}
         bx          lr
 END(memset)
+
+#endif  /* SCORPION_NEON_OPTIMIZATION */