Krait enhancements from caf
msm8960: Improve performance of memmove, bcopy, and memmove_words
Change-Id: I62b3da046889387f835da741110d35ffd3c8f806
Conflicts:
libc/Android.mk
msm8960: Improve performance of memcpy
Change-Id: I0c8355ae5e92060ad5a0811d33937e6913c8b633
Bionic/libm: fast neon pow() for small x,y
Add a fast neon version of pow() suitable for relatively small
positive x and y (between 0 and 4). Run the standard
implementation in all other cases. Gives approximately 60%
performance improvement to AnTuTu FPU score.
Change-Id: I9234d37eaa6a815d1e619375f5b049c4ec88f557
msm7627a: Enable neon optimized memmove and pow functions.
Define SPARROW_NEON_OPTIMIZATION flag so that neon optimized
memmove and pow functions are used. Also add corresponding
definitions in make files.
Change-Id: I12089fc7002e3ec294e63632bd84e395fbd24936
Bionic/libm: Prefer branches and VFP ABI
For internal functions set gcc attribute "aapcs-vfp" for ARM
and use -fno-if-conversion to prefer branches over predicated
instructions (improves performance on architectures with good
branch prediction).
Change-Id: I365e9508bd3babb0bb06fc5de127c1ae17445bcc
Bionic/libm: add assembly versions of sin/cos
Add assembly versions of sin/cos with integrated remainder pi/2
calculation. Directly extracted from binary libm.so compiled with
__ieee754_rem_pio2 calls inlined.
Change-Id: I9a999c01cea92aace9df7be9ad8f90f150040375
Conflicts:
libm/Android.mk
Bionic/libm: Remove extra vmov from sin/cos
Move integer representations of x bits on the integer side rather
than moving them to and from the FP registers.
Change-Id: I1d0800730d7553a47c462ee2a0cc044ffe62eb20
Bionic/libm: Pow optimizations and bug fixes
Use VFP calling convention for pow_neon handoff function by default.
Fix register usage collision between two different polynomial
coefficients in pow_neon. Remove conditional execution in pow_neon
and replace with branching.
Change-Id: I254617940b2787297aff2ab97dbf45c11e6a2b08
Bionic/libm: Add precision-correct de-serialize sin/cos
Modify sin/cos to improve performance while retaining either
bit-for-bit agreement with previous algorithm or <1 ulp
deviation from arbitrary precision result.
Change-Id: Icbd6d66fb1c0ceb53f43fed6541e0c89cc6e7a63
diff --git a/libc/Android.mk b/libc/Android.mk
index 6a77deb..8a11e24 100644
--- a/libc/Android.mk
+++ b/libc/Android.mk
@@ -279,7 +279,6 @@
bionic/libc_init_common.c \
bionic/logd_write.c \
bionic/md5.c \
- bionic/memmove_words.c \
bionic/pututline.c \
bionic/realpath.c \
bionic/sched_getaffinity.c \
@@ -388,11 +387,27 @@
arch-arm/bionic/strcpy.S \
arch-arm/bionic/strcmp.S \
arch-arm/bionic/syscall.S \
- string/memmove.c.arm \
- string/bcopy.c \
string/strncmp.c \
unistd/socketcalls.c
+# Check if we want a neonized version of memmove instead of the
+# current ARM version
+ifeq ($(TARGET_USE_SCORPION_BIONIC_OPTIMIZATION),true)
+libc_common_src_files += \
+ arch-arm/bionic/memmove.S \
+ bionic/memmove_words.c
+else
+ifneq (, $(filter true,$(TARGET_USE_KRAIT_BIONIC_OPTIMIZATION) $(TARGET_USE_SPARROW_BIONIC_OPTIMIZATION)))
+ libc_common_src_files += \
+ arch-arm/bionic/memmove.S
+ else # Other ARM
+ libc_common_src_files += \
+ string/bcopy.c \
+ string/memmove.c.arm \
+ bionic/memmove_words.c
+ endif # !TARGET_USE_KRAIT_BIONIC_OPTIMIZATION
+endif # !TARGET_USE_SCORPION_BIONIC_OPTIMIZATION
+
# These files need to be arm so that gdbserver
# can set breakpoints in them without messing
# up any thumb code.
@@ -555,6 +570,30 @@
ifeq ($(ARCH_ARM_USE_NON_NEON_MEMCPY),true)
libc_common_cflags += -DARCH_ARM_USE_NON_NEON_MEMCPY
endif
+ # Add in defines to activate SCORPION_NEON_OPTIMIZATION
+ ifeq ($(TARGET_USE_SCORPION_BIONIC_OPTIMIZATION),true)
+ libc_common_cflags += -DSCORPION_NEON_OPTIMIZATION
+ ifeq ($(TARGET_USE_SCORPION_PLD_SET),true)
+ libc_common_cflags += -DPLDOFFS=$(TARGET_SCORPION_BIONIC_PLDOFFS)
+ libc_common_cflags += -DPLDSIZE=$(TARGET_SCORPION_BIONIC_PLDSIZE)
+ endif
+ endif
+ # Add in defines to activate KRAIT_NEON_OPTIMIZATION
+ ifeq ($(TARGET_USE_KRAIT_BIONIC_OPTIMIZATION),true)
+ libc_common_cflags += -DKRAIT_NEON_OPTIMIZATION
+ ifeq ($(TARGET_USE_KRAIT_PLD_SET),true)
+ libc_common_cflags += -DPLDOFFS=$(TARGET_KRAIT_BIONIC_PLDOFFS)
+ libc_common_cflags += -DPLDTHRESH=$(TARGET_KRAIT_BIONIC_PLDTHRESH)
+ libc_common_cflags += -DPLDSIZE=$(TARGET_KRAIT_BIONIC_PLDSIZE)
+ libc_common_cflags += -DBBTHRESH=$(TARGET_KRAIT_BIONIC_BBTHRESH)
+ endif
+ endif
+ ifeq ($(TARGET_USE_SPARROW_BIONIC_OPTIMIZATION),true)
+ libc_common_cflags += -DSPARROW_NEON_OPTIMIZATION
+ endif
+ ifeq ($(TARGET_CORTEX_CACHE_LINE_32),true)
+ libc_common_cflags += -DCORTEX_CACHE_LINE_32
+ endif
endif # !arm
ifeq ($(TARGET_ARCH),x86)
diff --git a/libc/arch-arm/bionic/memcpy.S b/libc/arch-arm/bionic/memcpy.S
index 8453cc0..7e1a799 100644
--- a/libc/arch-arm/bionic/memcpy.S
+++ b/libc/arch-arm/bionic/memcpy.S
@@ -30,6 +30,396 @@
#include <machine/asm.h>
#if defined(__ARM_NEON__) && !defined(ARCH_ARM_USE_NON_NEON_MEMCPY)
+#if defined(KRAIT_NEON_OPTIMIZATION)
+ /*
+ * These can be overridden in:
+ * device/<vendor>/<board>/BoardConfig.mk
+ * by setting the following:
+ * TARGET_USE_KRAIT_BIONIC_OPTIMIZATION := true
+ * TARGET_USE_KRAIT_PLD_SET := true
+ * TARGET_KRAIT_BIONIC_PLDOFFS := <pldoffset>
+ * TARGET_KRAIT_BIONIC_PLDSIZE := <pldsize>
+ * TARGET_KRAIT_BIONIC_PLDTHRESH := <pldthreshold>
+ * TARGET_KRAIT_BIONIC_BBTHRESH := <bbthreshold>
+ */
+#ifndef PLDOFFS
+#define PLDOFFS (10)
+#endif
+#ifndef PLDTHRESH
+#define PLDTHRESH (PLDOFFS)
+#endif
+#ifndef BBTHRESH
+#define BBTHRESH (4096/64)
+#endif
+#if (PLDOFFS < 1)
+#error Routine does not support offsets less than 1
+#endif
+#if (PLDTHRESH < PLDOFFS)
+#error PLD threshold must be greater than or equal to the PLD offset
+#endif
+#ifndef PLDSIZE
+#define PLDSIZE (64)
+#endif
+#define NOP_OPCODE (0xe320f000)
+
+ .text
+ .fpu neon
+ .global memcpy
+ .type memcpy, %function
+ .align 5
+memcpy:
+ stmfd sp!, {r0, r9, r10, lr} /* keep dest for the return value; r9, r10 and lr are used as scratch later */
+ cmp r2, #4
+ blo .Lneon_lt4 /* n is unsigned: a signed test would treat n >= 2^31 as a tiny copy */
+ cmp r2, #16
+ blo .Lneon_lt16
+ cmp r2, #32
+ blo .Lneon_16 /* taken for 16 <= n < 32; .Lneon_16 reads the N flag set by this cmp */
+ cmp r2, #64
+ blo .Lneon_copy_32_a /* 32 <= n < 64: no 64-byte block to copy */
+
+ mov r12, r2, lsr #6
+ cmp r12, #PLDTHRESH
+ ble .Lneon_copy_64_loop_nopld
+
+ cmp r12, #BBTHRESH
+ ble .Lneon_prime_pump
+
+ add lr, r0, #0x400
+ add r9, r1, #(PLDOFFS*PLDSIZE)
+ sub lr, lr, r9
+ lsl lr, lr, #21
+ lsr lr, lr, #21
+ add lr, lr, #(PLDOFFS*PLDSIZE)
+ cmp r12, lr, lsr #6
+ movle lr, #(PLDOFFS*PLDSIZE)
+
+ movgt r9, #(PLDOFFS)
+ rsbgts r9, r9, lr, lsr #6
+ ble .Lneon_prime_pump
+
+ add r10, r1, lr
+ bic r10, #0x3F
+
+ sub r12, lr, lsr #6
+ cmp r9, r12
+ suble r12, r12, r9
+ movgt r9, r12
+ movgt r12, #0
+
+ pld [r1, #((PLDOFFS-1)*PLDSIZE)]
+ .balignl 64, NOP_OPCODE, 4*2
+.Lneon_copy_64_loop_outer_doublepld:
+ pld [r1, #((PLDOFFS)*PLDSIZE)]
+ vld1.32 {q0, q1}, [r1]!
+ vld1.32 {q2, q3}, [r1]!
+ ldr r3, [r10]
+ subs r9, r9, #1
+ vst1.32 {q0, q1}, [r0]!
+ vst1.32 {q2, q3}, [r0]!
+ add r10, #64
+ bne .Lneon_copy_64_loop_outer_doublepld
+ cmp r12, #0
+ bne .Lneon_copy_64_loop_outer
+ mov r12, lr, lsr #6
+ b .Lneon_copy_64_loop_nopld
+ .balignl 64, NOP_OPCODE, 4*2
+.Lneon_prime_pump:
+ mov lr, #(PLDOFFS*PLDSIZE)
+ add r10, r1, #(PLDOFFS*PLDSIZE)
+ bic r10, #0x3F
+ sub r12, r12, #PLDOFFS
+ pld [r10, #(-1*PLDSIZE)]
+ .balignl 64, NOP_OPCODE, 4*2
+.Lneon_copy_64_loop_outer:
+ vld1.32 {q0, q1}, [r1]!
+ vld1.32 {q2, q3}, [r1]!
+ ldr r3, [r10]
+ subs r12, r12, #1
+ vst1.32 {q0, q1}, [r0]!
+ vst1.32 {q2, q3}, [r0]!
+ add r10, #64
+ bne .Lneon_copy_64_loop_outer
+ mov r12, lr, lsr #6
+ .balignl 64, NOP_OPCODE, 4*2
+.Lneon_copy_64_loop_nopld:
+ vld1.32 {q8, q9}, [r1]!
+ vld1.32 {q10, q11}, [r1]!
+ subs r12, r12, #1
+ vst1.32 {q8, q9}, [r0]!
+ vst1.32 {q10, q11}, [r0]!
+ bne .Lneon_copy_64_loop_nopld
+ ands r2, r2, #0x3f
+ beq .Lneon_exit
+ .balignl 64, NOP_OPCODE, 4*2
+.Lneon_copy_32_a:
+ movs r12, r2, lsl #27 /* C = bit 5 of n (a 32-byte chunk), N = bit 4 (a 16-byte chunk) */
+ bcc .Lneon_16 /* no 32-byte chunk pending */
+ vld1.32 {q0,q1}, [r1]! /* copy 32 bytes; NEON load/store leave the flags untouched */
+ vst1.32 {q0,q1}, [r0]!
+ .balignl 64, NOP_OPCODE, 4*2 /* pad to a 64-byte boundary with NOP_OPCODE words, only if <= 8 bytes are needed */
+.Lneon_16:
+ bpl .Lneon_lt16 /* N is from the movs above, or from cmp r2, #32 when entered from the size dispatch */
+ vld1.32 {q8}, [r1]! /* copy 16 bytes */
+ vst1.32 {q8}, [r0]!
+ ands r2, r2, #0x0f /* r2 = bytes still to copy (0..15) */
+ beq .Lneon_exit
+ .balignl 64, NOP_OPCODE, 4*2
+.Lneon_lt16:
+ movs r12, r2, lsl #29 /* C = bit 3 of n (8 bytes), N = bit 2 (4 bytes) */
+ ldrcs r3, [r1], #4 /* C set: copy 8 bytes as two words */
+ ldrcs r12, [r1], #4
+ strcs r3, [r0], #4
+ strcs r12, [r0], #4
+ ldrmi r3, [r1], #4 /* N set: copy one more word */
+ strmi r3, [r0], #4
+ .balignl 64, NOP_OPCODE, 4*2
+.Lneon_lt4:
+ movs r2, r2, lsl #31 /* C = bit 1 of n (2 bytes), N = bit 0 (1 byte) */
+ ldrcsh r3, [r1], #2 /* C set: copy a halfword */
+ strcsh r3, [r0], #2
+ ldrmib r12, [r1] /* N set: copy the final byte */
+ strmib r12, [r0]
+ .balignl 64, NOP_OPCODE, 4*2
+.Lneon_exit:
+ ldmfd sp!, {r0, r9, r10, lr} /* restore the registers saved at entry; r0 is the original dest again */
+ bx lr
+ .end
+#elif defined(SCORPION_NEON_OPTIMIZATION)
+ /*
+ * These can be overridden in:
+ * device/<vendor>/<board>/BoardConfig.mk
+ * by setting the following:
+ * TARGET_USE_SCORPION_BIONIC_OPTIMIZATION := true
+ * TARGET_USE_SCORPION_PLD_SET := true
+ * TARGET_SCORPION_BIONIC_PLDOFFS := <pldoffset>
+ * TARGET_SCORPION_BIONIC_PLDSIZE := <pldsize>
+ */
+#ifndef PLDOFFS
+#define PLDOFFS (6)
+#endif
+#ifndef PLDSIZE
+#define PLDSIZE (128) /* L2 cache line size */
+#endif
+ .code 32
+ .align 5
+ .globl memcpy
+ .func
+memcpy:
+ push {r0} /* keep dest; it is popped back into r0 as the return value */
+ cmp r2, #4
+ blo .Lneon_lt4 /* n is unsigned: a signed test would treat n >= 2^31 as a tiny copy */
+ cmp r2, #16
+ blo .Lneon_lt16
+ cmp r2, #32
+ blo .Lneon_16 /* 16 <= n < 32 */
+ cmp r2, #128
+ blo .Lneon_copy_32_a /* 32 <= n < 128: no 128-byte block to copy */
+ /* Copy blocks of 128-bytes (word-aligned) at a time*/
+ /* Code below is optimized for PLDSIZE=128 only */
+ mov r12, r2, lsr #7
+ cmp r12, #PLDOFFS
+ ble .Lneon_copy_128_loop_nopld
+ sub r12, #PLDOFFS
+ pld [r1, #(PLDOFFS-1)*PLDSIZE]
+.Lneon_copy_128_loop_outer:
+ pld [r1, #(PLDOFFS*PLDSIZE)]
+ vld1.32 {q0, q1}, [r1]!
+ vld1.32 {q2, q3}, [r1]!
+ vld1.32 {q8, q9}, [r1]!
+ vld1.32 {q10, q11}, [r1]!
+ subs r12, r12, #1
+ vst1.32 {q0, q1}, [r0]!
+ vst1.32 {q2, q3}, [r0]!
+ vst1.32 {q8, q9}, [r0]!
+ vst1.32 {q10, q11}, [r0]!
+ bne .Lneon_copy_128_loop_outer
+ mov r12, #PLDOFFS
+.Lneon_copy_128_loop_nopld:
+ vld1.32 {q0, q1}, [r1]!
+ vld1.32 {q2, q3}, [r1]!
+ vld1.32 {q8, q9}, [r1]!
+ vld1.32 {q10, q11}, [r1]!
+ subs r12, r12, #1
+ vst1.32 {q0, q1}, [r0]!
+ vst1.32 {q2, q3}, [r0]!
+ vst1.32 {q8, q9}, [r0]!
+ vst1.32 {q10, q11}, [r0]!
+ bne .Lneon_copy_128_loop_nopld
+ ands r2, r2, #0x7f
+ beq .Lneon_exit
+ cmp r2, #32
+ blt .Lneon_16
+ nop
+ /* Copy blocks of 32-bytes (word aligned) at a time*/
+.Lneon_copy_32_a:
+ mov r12, r2, lsr #5
+.Lneon_copy_32_loop_a:
+ vld1.32 {q0,q1}, [r1]!
+ subs r12, r12, #1
+ vst1.32 {q0,q1}, [r0]!
+ bne .Lneon_copy_32_loop_a
+ ands r2, r2, #0x1f
+ beq .Lneon_exit
+.Lneon_16:
+ subs r2, r2, #16 /* the low four bits of r2 are unchanged by this subtraction */
+ blt .Lneon_lt16 /* fewer than 16 bytes were left: skip the 16-byte copy */
+ vld1.32 {q8}, [r1]! /* copy 16 bytes; flags from the subs are preserved */
+ vst1.32 {q8}, [r0]!
+ beq .Lneon_exit /* exactly 16 bytes were left */
+.Lneon_lt16:
+ movs r12, r2, lsl #29 /* C = bit 3 of r2 (8 bytes), N = bit 2 (4 bytes) */
+ bcc .Lneon_skip8
+ ldr r3, [r1], #4 /* copy 8 bytes as two words */
+ ldr r12, [r1], #4
+ str r3, [r0], #4
+ str r12, [r0], #4
+.Lneon_skip8:
+ bpl .Lneon_lt4 /* N is still the value set by the movs above */
+ ldr r3, [r1], #4 /* copy one word */
+ str r3, [r0], #4
+.Lneon_lt4:
+ movs r2, r2, lsl #31 /* C = bit 1 of r2 (2 bytes), N = bit 0 (1 byte) */
+ bcc .Lneon_lt2
+ ldrh r3, [r1], #2 /* copy a halfword */
+ strh r3, [r0], #2
+.Lneon_lt2:
+ bpl .Lneon_exit
+ ldrb r12, [r1] /* copy the final byte */
+ strb r12, [r0]
+.Lneon_exit:
+ pop {r0} /* return the original dest */
+ bx lr
+ .endfunc
+ .end
+#else /* !SCORPION_NEON_OPTIMIZATION */
+#if defined(CORTEX_CACHE_LINE_32)
+ /*
+ *This can be enabled by setting flag
+ *TARGET_CORTEX_CACHE_LINE_32 in
+ *device/<vendor>/<board>/BoardConfig.mk
+ */
+ .text
+ .fpu neon
+
+ .global memcpy
+ .type memcpy, %function
+ .align 4
+
+/* a prefetch distance of 4 cache-lines works best experimentally */
+#define CACHE_LINE_SIZE 32
+memcpy:
+ .fnstart
+ .save {r0, lr}
+ stmfd sp!, {r0, lr}
+
+ /* start preloading as early as possible */
+ pld [r1, #(CACHE_LINE_SIZE*0)]
+ pld [r1, #(CACHE_LINE_SIZE*1)]
+
+ /* do we have at least 16-bytes to copy (needed for alignment below) */
+ cmp r2, #16
+ blo 5f
+
+ /* align destination to half cache-line for the write-buffer */
+ rsb r3, r0, #0
+ ands r3, r3, #0xF
+ beq 0f
+
+ /* copy up to 15-bytes (count in r3) */
+ sub r2, r2, r3
+ movs ip, r3, lsl #31
+ ldrmib lr, [r1], #1
+ strmib lr, [r0], #1
+ ldrcsb ip, [r1], #1
+ ldrcsb lr, [r1], #1
+ strcsb ip, [r0], #1
+ strcsb lr, [r0], #1
+ movs ip, r3, lsl #29
+ bge 1f
+ // copies 4 bytes, destination 32-bits aligned
+ vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]!
+ vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
+1: bcc 2f
+ // copies 8 bytes, destination 64-bits aligned
+ vld1.8 {d0}, [r1]!
+ vst1.8 {d0}, [r0, :64]!
+2:
+
+0: /* preload immediately the next cache line, which we may need */
+ pld [r1, #(CACHE_LINE_SIZE*0)]
+ pld [r1, #(CACHE_LINE_SIZE*1)]
+
+ /* make sure we have at least 128 bytes to copy */
+ subs r2, r2, #128
+ blo 2f
+
+ /* preload all the cache lines we need.
+ * NOTE: the number of pld below depends on the prefetch distance;
+ * ideally we would increase the distance in the main loop to
+ * avoid the goofy code below. In practice this doesn't seem to make
+ * a big difference.
+ */
+ pld [r1, #(CACHE_LINE_SIZE*2)]
+ pld [r1, #(CACHE_LINE_SIZE*3)]
+ pld [r1, #(CACHE_LINE_SIZE*4)]
+
+ .align 3
+1: /* The main loop copies 128 bytes at a time */
+ subs r2, r2, #128
+ vld1.8 {d0 - d3}, [r1]!
+ vld1.8 {d4 - d7}, [r1]!
+ pld [r1, #(CACHE_LINE_SIZE*1)]
+ pld [r1, #(CACHE_LINE_SIZE*2)]
+ vld1.8 {d16 - d19}, [r1]!
+ vld1.8 {d20 - d23}, [r1]!
+ pld [r1, #(CACHE_LINE_SIZE*1)]
+ pld [r1, #(CACHE_LINE_SIZE*2)]
+ vst1.8 {d0 - d3}, [r0, :128]!
+ vst1.8 {d4 - d7}, [r0, :128]!
+ vst1.8 {d16 - d19}, [r0, :128]!
+ vst1.8 {d20 - d23}, [r0, :128]!
+ bhs 1b
+
+2: /* fix-up the remaining count and make sure we have >= 32 bytes left */
+ add r2, r2, #128
+ subs r2, r2, #32
+ blo 4f
+
+3: /* 32 bytes at a time. These cache lines were already preloaded */
+ vld1.8 {d0 - d3}, [r1]!
+ subs r2, r2, #32
+ vst1.8 {d0 - d3}, [r0, :128]!
+ bhs 3b
+
+4: /* less than 32 left */
+ add r2, r2, #32
+ tst r2, #0x10
+ beq 5f
+ // copies 16 bytes, 128-bits aligned
+ vld1.8 {d0, d1}, [r1]!
+ vst1.8 {d0, d1}, [r0, :128]!
+
+5: /* copy up to 15-bytes (count in r2) */
+ movs ip, r2, lsl #29
+ bcc 1f
+ vld1.8 {d0}, [r1]!
+ vst1.8 {d0}, [r0]!
+1: bge 2f
+ vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]!
+ vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]!
+2: movs ip, r2, lsl #31
+ ldrmib r3, [r1], #1
+ ldrcsb ip, [r1], #1
+ ldrcsb lr, [r1], #1
+ strmib r3, [r0], #1
+ strcsb ip, [r0], #1
+ strcsb lr, [r0], #1
+
+ ldmfd sp!, {r0, lr}
+ bx lr
+ .fnend
+#else /*!CORTEX_CACHE_LINE_32*/
.text
.fpu neon
@@ -166,7 +556,8 @@
bx lr
END(memcpy)
-
+#endif /*!CORTEX_CACHE_LINE_32*/
+#endif /* SCORPION_NEON_OPTIMIZATION */
#else /* __ARM_ARCH__ < 7 */
diff --git a/libc/arch-arm/bionic/memmove.S b/libc/arch-arm/bionic/memmove.S
new file mode 100644
index 0000000..937d14b
--- /dev/null
+++ b/libc/arch-arm/bionic/memmove.S
@@ -0,0 +1,526 @@
+/***************************************************************************
+ Copyright (c) 2009-2012 Code Aurora Forum. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of Code Aurora nor the names of its contributors may
+ be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+ ***************************************************************************/
+
+/***************************************************************************
+ * Neon memmove: Attempts to do a memmove with Neon registers if possible.
+ * Inputs:
+ * dest: The destination buffer
+ * src: The source buffer
+ * n: The size of the buffer to transfer
+ * Outputs:
+ * r0: The destination buffer address (dest), returned unchanged
+ ***************************************************************************/
+
+#include <machine/cpu-features.h>
+
+#if defined(KRAIT_NEON_OPTIMIZATION) || defined(SPARROW_NEON_OPTIMIZATION)
+ /*
+ * These can be overridden in:
+ * device/<vendor>/<board>/BoardConfig.mk
+ * by setting the following:
+ * TARGET_USE_KRAIT_BIONIC_OPTIMIZATION := true
+ * TARGET_USE_KRAIT_PLD_SET := true
+ * TARGET_KRAIT_BIONIC_PLDOFFS := <pldoffset>
+ * TARGET_KRAIT_BIONIC_PLDSIZE := <pldsize>
+ * TARGET_KRAIT_BIONIC_PLDTHRESH := <pldthreshold>
+ */
+#ifndef PLDOFFS
+#define PLDOFFS (10)
+#endif
+#ifndef PLDTHRESH
+#define PLDTHRESH (PLDOFFS)
+#endif
+#if (PLDOFFS < 5)
+#error Routine does not support offsets less than 5
+#endif
+#if (PLDTHRESH < PLDOFFS)
+#error PLD threshold must be greater than or equal to the PLD offset
+#endif
+#ifndef PLDSIZE
+#define PLDSIZE (64)
+#endif
+#define NOP_OPCODE (0xe320f000)
+
+ .code 32
+ .align 5
+ .global memmove
+ .type memmove, %function
+
+ .global _memmove_words
+ .type _memmove_words, %function
+
+ .global bcopy
+ .type bcopy, %function
+
+bcopy:
+ mov r12, r0 /* swap r0 and r1, then fall through into memmove */
+ mov r0, r1
+ mov r1, r12
+ .balignl 64, NOP_OPCODE, 4*2 /* pad to a 64-byte boundary with NOP_OPCODE words, only if <= 8 bytes are needed */
+memmove:
+_memmove_words:
+.Lneon_memmove_cmf:
+ subs r12, r0, r1 /* r12 = dest - src; C is set when dest >= src (unsigned) */
+ bxeq lr /* dest == src: nothing to move, r0 already holds dest */
+ cmphi r2, r12 /* only when dest > src: compare n with the distance */
+ bls memcpy /* Tail-call memcpy when dest < src, or when n <= dest - src. NOTE(review): the dest < src case may overlap and presumes memcpy copies in ascending order - confirm */
+
+ push {r0} /* overlapping with dest > src: keep dest for the return value */
+
+.Lneon_back_to_front_copy:
+ add r0, r0, r2
+ add r1, r1, r2
+ cmp r2, #4
+ bgt .Lneon_b2f_gt4
+ cmp r2, #0
+.Lneon_b2f_smallcopy_loop:
+ beq .Lneon_memmove_done
+ ldrb r12, [r1, #-1]!
+ subs r2, r2, #1
+ strb r12, [r0, #-1]!
+ b .Lneon_b2f_smallcopy_loop
+.Lneon_b2f_gt4:
+ sub r3, r0, r1
+ cmp r2, r3
+ movle r12, r2
+ movgt r12, r3
+ cmp r12, #64
+ bge .Lneon_b2f_copy_64
+ cmp r12, #32
+ bge .Lneon_b2f_copy_32
+ cmp r12, #8
+ bge .Lneon_b2f_copy_8
+ cmp r12, #4
+ bge .Lneon_b2f_copy_4
+ b .Lneon_b2f_copy_1
+.Lneon_b2f_copy_64:
+ sub r1, r1, #64 /* Predecrement: step both pointers back by one 64-byte block */
+ sub r0, r0, #64
+ movs r12, r2, lsr #6 /* r12 = number of whole 64-byte blocks */
+ cmp r12, #PLDTHRESH
+ ble .Lneon_b2f_copy_64_loop_nopld /* at or below the threshold: copy without preloads */
+ sub r12, #PLDOFFS /* the last PLDOFFS blocks are left for the no-pld loop */
+ pld [r1, #-(PLDOFFS-5)*PLDSIZE] /* preload five lines at descending addresses below r1 */
+ pld [r1, #-(PLDOFFS-4)*PLDSIZE]
+ pld [r1, #-(PLDOFFS-3)*PLDSIZE]
+ pld [r1, #-(PLDOFFS-2)*PLDSIZE]
+ pld [r1, #-(PLDOFFS-1)*PLDSIZE]
+ .balignl 64, NOP_OPCODE, 4*2 /* pad to a 64-byte boundary with NOP_OPCODE words, only if <= 8 bytes are needed */
+.Lneon_b2f_copy_64_loop_outer:
+ pld [r1, #-(PLDOFFS)*PLDSIZE] /* preload PLDOFFS*PLDSIZE bytes below the current block */
+ vld1.32 {q0, q1}, [r1]! /* load bytes 0..31 of the block; writeback adds 32 to r1 */
+ vld1.32 {q2, q3}, [r1] /* load bytes 32..63, no writeback */
+ subs r12, r12, #1
+ vst1.32 {q0, q1}, [r0]! /* stores are issued only after all 64 bytes have been loaded */
+ sub r1, r1, #96 /* Post-fixup and predecrement: +32 - 96 = one block (64 bytes) lower */
+ vst1.32 {q2, q3}, [r0]
+ sub r0, r0, #96
+ bne .Lneon_b2f_copy_64_loop_outer
+ mov r12, #PLDOFFS /* copy the remaining PLDOFFS blocks without preloads */
+ .balignl 64, NOP_OPCODE, 4*2
+.Lneon_b2f_copy_64_loop_nopld:
+ vld1.32 {q8, q9}, [r1]! /* same 64-byte step as above, without the pld */
+ vld1.32 {q10, q11}, [r1]
+ subs r12, r12, #1
+ vst1.32 {q8, q9}, [r0]!
+ sub r1, r1, #96 /* Post-fixup and predecrement */
+ vst1.32 {q10, q11}, [r0]
+ sub r0, r0, #96
+ bne .Lneon_b2f_copy_64_loop_nopld
+ ands r2, r2, #0x3f /* r2 = bytes left after the 64-byte blocks */
+ beq .Lneon_memmove_done
+ add r1, r1, #64 /* Post-fixup: undo the predecrement taken for a block that was not copied */
+ add r0, r0, #64
+ cmp r2, #32
+ blt .Lneon_b2f_copy_finish /* fewer than 32 bytes left: skip the 32-byte loop */
+.Lneon_b2f_copy_32:
+ mov r12, r2, lsr #5
+.Lneon_b2f_copy_32_loop:
+ sub r1, r1, #32 /* Predecrement */
+ sub r0, r0, #32
+ vld1.32 {q0,q1}, [r1]
+ subs r12, r12, #1
+ vst1.32 {q0,q1}, [r0]
+ bne .Lneon_b2f_copy_32_loop
+ ands r2, r2, #0x1f
+ beq .Lneon_memmove_done
+.Lneon_b2f_copy_finish:
+.Lneon_b2f_copy_8:
+ movs r12, r2, lsr #0x3
+ beq .Lneon_b2f_copy_4
+ .balignl 64, NOP_OPCODE, 4*2
+.Lneon_b2f_copy_8_loop:
+ sub r1, r1, #8 /* Predecrement */
+ sub r0, r0, #8
+ vld1.32 {d0}, [r1]
+ subs r12, r12, #1
+ vst1.32 {d0}, [r0]
+ bne .Lneon_b2f_copy_8_loop
+ ands r2, r2, #0x7
+ beq .Lneon_memmove_done
+.Lneon_b2f_copy_4:
+ movs r12, r2, lsr #0x2
+ beq .Lneon_b2f_copy_1
+.Lneon_b2f_copy_4_loop:
+ ldr r3, [r1, #-4]!
+ subs r12, r12, #1
+ str r3, [r0, #-4]!
+ bne .Lneon_b2f_copy_4_loop
+ ands r2, r2, #0x3
+.Lneon_b2f_copy_1:
+ cmp r2, #0
+ beq .Lneon_memmove_done
+ .balignl 64, NOP_OPCODE, 4*2
+.Lneon_b2f_copy_1_loop:
+ ldrb r12, [r1, #-1]!
+ subs r2, r2, #1
+ strb r12, [r0, #-1]!
+ bne .Lneon_b2f_copy_1_loop
+
+.Lneon_memmove_done:
+ pop {r0}
+ bx lr
+
+ .end
+
+#elif defined(SCORPION_NEON_OPTIMIZATION)
+ /*
+ * These can be overridden in:
+ * device/<vendor>/<board>/BoardConfig.mk
+ * by setting the following:
+ * TARGET_USE_SCORPION_BIONIC_OPTIMIZATION := true
+ * TARGET_USE_SCORPION_PLD_SET := true
+ * TARGET_SCORPION_BIONIC_PLDOFFS := <pldoffset>
+ * TARGET_SCORPION_BIONIC_PLDSIZE := <pldsize>
+ */
+#ifndef PLDOFFS
+#define PLDOFFS (6)
+#endif
+#ifndef PLDSIZE
+#define PLDSIZE (128) /* L2 cache line size */
+#endif
+
+ .code 32
+ .align 5
+ .global memmove
+ .type memmove, %function
+
+ .global bcopy
+ .type bcopy, %function
+
+bcopy:
+ mov r12, r0
+ mov r0, r1
+ mov r1, r12
+memmove:
+ push {r0}
+
+ /*
+ * The requirements for memmove state that the function should
+ * operate as if data were being copied from the source to a
+ * buffer, then to the destination. This is to allow a user
+ * to copy data from a source and target that overlap.
+ *
+ * We can't just do byte copies front-to-back automatically, since
+ * there's a good chance we may have an overlap (why else would someone
+ * intentionally use memmove then?).
+ *
+ * We'll break this into two parts. Front-to-back, or back-to-front
+ * copies.
+ */
+.Lneon_memmove_cmf:
+ cmp r0, r1 /* addresses are unsigned: signed tests pick the wrong direction across 0x80000000 */
+ blo .Lneon_front_to_back_copy /* dest below src: ascending copy is overlap-safe */
+ bhi .Lneon_back_to_front_copy /* dest above src: descending copy is overlap-safe */
+ b .Lneon_memmove_done /* dest == src: nothing to move */
+
+ /* #############################################################
+ * Front to Back copy
+ */
+.Lneon_front_to_back_copy:
+ /*
+ * For small copies, just do a quick memcpy. We can do this for
+ * front-to-back copies, aligned or unaligned, since we're only
+ * doing 1 byte at a time...
+ */
+ cmp r2, #4
+ bgt .Lneon_f2b_gt4
+ cmp r2, #0
+.Lneon_f2b_smallcopy_loop:
+ beq .Lneon_memmove_done
+ ldrb r12, [r1], #1
+ subs r2, r2, #1
+ strb r12, [r0], #1
+ b .Lneon_f2b_smallcopy_loop
+.Lneon_f2b_gt4:
+ /* The window size (src - dest) is in r3. */
+ sub r3, r1, r0
+ /* #############################################################
+ * Front to Back copy
+ */
+ /*
+ * Note that we can't just route based on the size in r2. If that's
+ * larger than the overlap window in r3, we could potentially
+ * (and likely!) destroy data we're copying. Both are unsigned.
+ */
+ cmp r2, r3
+ movls r12, r2 /* r12 = min(size, window), unsigned so values >= 2^31 are not read as negative */
+ movhi r12, r3
+ cmp r12, #256
+ bhs .Lneon_f2b_copy_128
+ cmp r12, #64
+ bhs .Lneon_f2b_copy_32
+ cmp r12, #16
+ bhs .Lneon_f2b_copy_16
+ cmp r12, #8
+ bhs .Lneon_f2b_copy_8
+ cmp r12, #4
+ bhs .Lneon_f2b_copy_4
+ b .Lneon_f2b_copy_1 /* window smaller than 4 bytes: byte copy */
+ nop
+.Lneon_f2b_copy_128:
+ mov r12, r2, lsr #7
+ cmp r12, #PLDOFFS
+ ble .Lneon_f2b_copy_128_loop_nopld
+ sub r12, #PLDOFFS
+ pld [r1, #(PLDOFFS-1)*PLDSIZE]
+.Lneon_f2b_copy_128_loop_outer:
+ pld [r1, #(PLDOFFS*PLDSIZE)]
+ vld1.32 {q0,q1}, [r1]!
+ vld1.32 {q2,q3}, [r1]!
+ vld1.32 {q8,q9}, [r1]!
+ vld1.32 {q10,q11}, [r1]!
+ subs r12, r12, #1
+ vst1.32 {q0,q1}, [r0]!
+ vst1.32 {q2,q3}, [r0]!
+ vst1.32 {q8,q9}, [r0]!
+ vst1.32 {q10,q11}, [r0]!
+ bne .Lneon_f2b_copy_128_loop_outer
+ mov r12, #PLDOFFS
+.Lneon_f2b_copy_128_loop_nopld:
+ vld1.32 {q0,q1}, [r1]!
+ vld1.32 {q2,q3}, [r1]!
+ vld1.32 {q8,q9}, [r1]!
+ vld1.32 {q10,q11}, [r1]!
+ subs r12, r12, #1
+ vst1.32 {q0,q1}, [r0]!
+ vst1.32 {q2,q3}, [r0]!
+ vst1.32 {q8,q9}, [r0]!
+ vst1.32 {q10,q11}, [r0]!
+ bne .Lneon_f2b_copy_128_loop_nopld
+ ands r2, r2, #0x7f
+ beq .Lneon_memmove_done
+ cmp r2, #32
+ bge .Lneon_f2b_copy_32
+ b .Lneon_f2b_copy_finish
+.Lneon_f2b_copy_32:
+ mov r12, r2, lsr #5
+.Lneon_f2b_copy_32_loop:
+ vld1.32 {q0,q1}, [r1]!
+ subs r12, r12, #1
+ vst1.32 {q0,q1}, [r0]!
+ bne .Lneon_f2b_copy_32_loop
+ ands r2, r2, #0x1f
+ beq .Lneon_memmove_done
+.Lneon_f2b_copy_finish:
+.Lneon_f2b_copy_16:
+ movs r12, r2, lsr #4
+ beq .Lneon_f2b_copy_8
+.Lneon_f2b_copy_16_loop:
+ vld1.32 {q0}, [r1]!
+ subs r12, r12, #1
+ vst1.32 {q0}, [r0]!
+ bne .Lneon_f2b_copy_16_loop
+ ands r2, r2, #0xf
+ beq .Lneon_memmove_done
+.Lneon_f2b_copy_8:
+ movs r12, r2, lsr #3
+ beq .Lneon_f2b_copy_4
+.Lneon_f2b_copy_8_loop:
+ vld1.32 {d0}, [r1]!
+ subs r12, r12, #1
+ vst1.32 {d0}, [r0]!
+ bne .Lneon_f2b_copy_8_loop
+ ands r2, r2, #0x7
+ beq .Lneon_memmove_done
+.Lneon_f2b_copy_4:
+ movs r12, r2, lsr #2
+ beq .Lneon_f2b_copy_1
+.Lneon_f2b_copy_4_loop:
+ ldr r3, [r1], #4
+ subs r12, r12, #1
+ str r3, [r0], #4
+ bne .Lneon_f2b_copy_4_loop
+ ands r2, r2, #0x3
+ nop
+.Lneon_f2b_copy_1:
+ cmp r2, #0
+ beq .Lneon_memmove_done
+.Lneon_f2b_copy_1_loop:
+ ldrb r12, [r1], #1
+ subs r2, r2, #1
+ strb r12, [r0], #1
+ bne .Lneon_f2b_copy_1_loop
+.Lneon_f2b_finish:
+ b .Lneon_memmove_done
+
+ /* #############################################################
+ * Back to Front copy
+ */
+.Lneon_back_to_front_copy:
+ /*
+ * Here, we'll want to shift to the end of the buffers. This
+ * actually points us one past where we need to go, but since
+ * we'll pre-decrement throughout, this will be fine.
+ */
+ add r0, r0, r2
+ add r1, r1, r2
+ cmp r2, #4
+ bgt .Lneon_b2f_gt4
+ cmp r2, #0
+.Lneon_b2f_smallcopy_loop:
+ beq .Lneon_memmove_done
+ ldrb r12, [r1, #-1]!
+ subs r2, r2, #1
+ strb r12, [r0, #-1]!
+ b .Lneon_b2f_smallcopy_loop
+.Lneon_b2f_gt4:
+ /*
+ * The overlap window size (dest - src) is in r3; the minimum of
+ * the window and the copy size is selected into r12 below.
+ */
+ sub r3, r0, r1
+ /*
+ * #############################################################
+ * Back to Front copy - route on min(size, window), both unsigned
+ */
+ cmp r2, r3
+ movls r12, r2 /* unsigned min, so values >= 2^31 are not read as negative */
+ movhi r12, r3
+ cmp r12, #256
+ bhs .Lneon_b2f_copy_128
+ cmp r12, #64
+ bhs .Lneon_b2f_copy_32
+ cmp r12, #8
+ bhs .Lneon_b2f_copy_8
+ cmp r12, #4
+ bhs .Lneon_b2f_copy_4
+ b .Lneon_b2f_copy_1 /* window smaller than 4 bytes: byte copy */
+ nop
+.Lneon_b2f_copy_128:
+ movs r12, r2, lsr #7
+ cmp r12, #PLDOFFS
+ ble .Lneon_b2f_copy_128_loop_nopld
+ sub r12, #PLDOFFS
+ pld [r1, #-(PLDOFFS-1)*PLDSIZE]
+.Lneon_b2f_copy_128_loop_outer:
+ pld [r1, #-(PLDOFFS*PLDSIZE)]
+ sub r1, r1, #128
+ sub r0, r0, #128
+ vld1.32 {q0, q1}, [r1]!
+ vld1.32 {q2, q3}, [r1]!
+ vld1.32 {q8, q9}, [r1]!
+ vld1.32 {q10, q11}, [r1]!
+ subs r12, r12, #1
+ vst1.32 {q0, q1}, [r0]!
+ vst1.32 {q2, q3}, [r0]!
+ vst1.32 {q8, q9}, [r0]!
+ vst1.32 {q10, q11}, [r0]!
+ sub r1, r1, #128
+ sub r0, r0, #128
+ bne .Lneon_b2f_copy_128_loop_outer
+ mov r12, #PLDOFFS
+.Lneon_b2f_copy_128_loop_nopld:
+ sub r1, r1, #128
+ sub r0, r0, #128
+ vld1.32 {q0, q1}, [r1]!
+ vld1.32 {q2, q3}, [r1]!
+ vld1.32 {q8, q9}, [r1]!
+ vld1.32 {q10, q11}, [r1]!
+ subs r12, r12, #1
+ vst1.32 {q0, q1}, [r0]!
+ vst1.32 {q2, q3}, [r0]!
+ vst1.32 {q8, q9}, [r0]!
+ vst1.32 {q10, q11}, [r0]!
+ sub r1, r1, #128
+ sub r0, r0, #128
+ bne .Lneon_b2f_copy_128_loop_nopld
+ ands r2, r2, #0x7f
+ beq .Lneon_memmove_done
+ cmp r2, #32
+ bge .Lneon_b2f_copy_32
+ b .Lneon_b2f_copy_finish
+.Lneon_b2f_copy_32:
+ mov r12, r2, lsr #5
+.Lneon_b2f_copy_32_loop:
+ sub r1, r1, #32
+ sub r0, r0, #32
+ vld1.32 {q0,q1}, [r1]
+ subs r12, r12, #1
+ vst1.32 {q0,q1}, [r0]
+ bne .Lneon_b2f_copy_32_loop
+ ands r2, r2, #0x1f
+ beq .Lneon_memmove_done
+.Lneon_b2f_copy_finish:
+.Lneon_b2f_copy_8:
+ movs r12, r2, lsr #0x3
+ beq .Lneon_b2f_copy_4
+.Lneon_b2f_copy_8_loop:
+ sub r1, r1, #8
+ sub r0, r0, #8
+ vld1.32 {d0}, [r1]
+ subs r12, r12, #1
+ vst1.32 {d0}, [r0]
+ bne .Lneon_b2f_copy_8_loop
+ ands r2, r2, #0x7
+ beq .Lneon_memmove_done
+.Lneon_b2f_copy_4:
+ movs r12, r2, lsr #0x2
+ beq .Lneon_b2f_copy_1
+.Lneon_b2f_copy_4_loop:
+ ldr r3, [r1, #-4]!
+ subs r12, r12, #1
+ str r3, [r0, #-4]!
+ bne .Lneon_b2f_copy_4_loop
+ ands r2, r2, #0x3
+ nop
+.Lneon_b2f_copy_1:
+ cmp r2, #0
+ beq .Lneon_memmove_done
+.Lneon_b2f_copy_1_loop:
+ ldrb r12, [r1, #-1]!
+ subs r2, r2, #1
+ strb r12, [r0, #-1]!
+ bne .Lneon_b2f_copy_1_loop
+
+.Lneon_memmove_done:
+ pop {r0}
+ bx lr
+
+ .end
+#endif /* SCORPION_NEON_OPTIMIZATION */
+
diff --git a/libm/Android.mk b/libm/Android.mk
index 9c88798..6cf2342 100644
--- a/libm/Android.mk
+++ b/libm/Android.mk
@@ -72,7 +72,6 @@
src/s_ceill.c \
src/s_copysign.c \
src/s_copysignf.c \
- src/s_cos.c \
src/s_cosf.c \
src/s_erf.c \
src/s_erff.c \
@@ -132,7 +131,6 @@
src/s_signgam.c \
src/s_significand.c \
src/s_significandf.c \
- src/s_sin.c \
src/s_sinf.c \
src/s_tan.c \
src/s_tanf.c \
@@ -162,7 +160,29 @@
src/s_scalbnf.c \
src/e_sqrtf.c
+ ifeq ($(TARGET_USE_KRAIT_BIONIC_OPTIMIZATION),true)
+ libm_common_src_files += \
+ arm/e_pow.S \
+ arm/s_cos.S \
+ arm/s_sin.S
+ libm_common_cflags += -DKRAIT_NEON_OPTIMIZATION -fno-if-conversion
+ else
+ libm_common_src_files += \
+ src/s_cos.c \
+ src/s_sin.c
+ endif
+
+ ifeq ($(TARGET_USE_SPARROW_BIONIC_OPTIMIZATION),true)
+ libm_common_src_files += \
+ arm/e_pow.S
+ libm_common_cflags += -DSPARROW_NEON_OPTIMIZATION
+ endif
+
libm_common_includes = $(LOCAL_PATH)/arm
+else
+ libm_common_src_files += \
+ src/s_cos.c \
+ src/s_sin.c
endif
ifeq ($(TARGET_OS)-$(TARGET_ARCH),linux-x86)
@@ -201,6 +221,8 @@
LOCAL_C_INCLUDES += $(libm_common_includes)
LOCAL_CFLAGS := $(libm_common_cflags)
+LOCAL_CFLAGS:= $(libm_common_cflags)
+
LOCAL_MODULE:= libm
LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/Android.mk
@@ -221,6 +243,8 @@
LOCAL_C_INCLUDES += $(libm_common_includes)
LOCAL_CFLAGS := $(libm_common_cflags)
+LOCAL_CFLAGS:= $(libm_common_cflags)
+
LOCAL_MODULE:= libm
LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/Android.mk
diff --git a/libm/arm/e_pow.S b/libm/arm/e_pow.S
new file mode 100644
index 0000000..1e328f8
--- /dev/null
+++ b/libm/arm/e_pow.S
@@ -0,0 +1,443 @@
+@ Copyright (c) 2012, Code Aurora Forum. All rights reserved.
+@
+@ Redistribution and use in source and binary forms, with or without
+@ modification, are permitted provided that the following conditions are
+@ met:
+@ * Redistributions of source code must retain the above copyright
+@ notice, this list of conditions and the following disclaimer.
+@ * Redistributions in binary form must reproduce the above
+@ copyright notice, this list of conditions and the following
+@ disclaimer in the documentation and/or other materials provided
+@ with the distribution.
+@ * Neither the name of Code Aurora Forum, Inc. nor the names of its
+@ contributors may be used to endorse or promote products derived
+@ from this software without specific prior written permission.
+@
+@ THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED
+@ WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+@ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT
+@ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
+@ BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+@ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+@ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+@ BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+@ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+@ OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+@ IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include <machine/cpu-features.h>
+#include <machine/asm.h>
+
+@ Values which exist for the program lifetime:
+#define HIGH_WORD_MASK d31
+#define EXPONENT_MASK d30
+#define int_1 d29
+#define double_1 d28
+@ sign and 2^int_n fixup:
+#define expadjustment d7
+#define literals r10
+@ Values which exist within both polynomial implementations:
+#define int_n d2
+#define int_n_low s4
+#define int_n_high s5
+#define double_n d3
+#define k1 d27
+#define k2 d26
+#define k3 d25
+#define k4 d24
+@ Values which cross the boundaries between polynomial implementations:
+#define ss d16
+#define ss2 d17
+#define ss4 d18
+#define Result d0
+#define Return_hw r1
+#define Return_lw r0
+#define ylg2x d0
+@ Intermediate values only needed sometimes:
+@ initial (sorted in approximate order of availability for overwriting):
+#define x_hw r1
+#define x_lw r0
+#define y_hw r3
+#define y_lw r2
+#define x d0
+#define bp d4
+#define y d1
+@ log series:
+#define u d19
+#define v d20
+#define lg2coeff d21
+#define bpa d5
+#define bpb d3
+#define lg2const d6
+#define xmantissa r8
+#define twoto1o5 r4
+#define twoto3o5 r5
+#define ix r6
+#define iEXP_MASK r7
+@ exp input setup:
+#define twoto1o8mask d3
+#define twoto1o4mask d4
+#define twoto1o2mask d1
+#define ylg2x_round_offset d16
+#define ylg2x_temp d17
+#define yn_temp d18
+#define yn_round_offset d19
+#define ln2 d5
+@ Careful, overwriting HIGH_WORD_MASK, reset it if you need it again ...
+#define rounded_exponent d31
+@ exp series:
+#define k5 d23
+#define k6 d22
+#define k7 d21
+#define k8 d20
+#define ss3 d19
+@ overwrite double_1 (we're done with it by now)
+#define k0 d28
+#define twoto1o4 d6
+
+@instructions that gas doesn't like to encode correctly:
+#define vmov_f64 fconstd
+#define vmov_f32 fconsts
+#define vmovne_f64 fconstdne
+
+ENTRY(pow_neon)
+#if defined(KRAIT_NO_AAPCS_VFP_MODE)
+ @ ARM ABI has inputs coming in via r registers, lets move to a d register
+ vmov x, x_lw, x_hw
+#endif
+ push {r4, r5, r6, r7, r8, r9, r10, lr} @ pow_neon: NEON fast path for x**y; save callee-saved core registers and the return address
+
+ @ pre-staged bp values
+ vldr bpa, .LbpA
+ vldr bpb, .LbpB
+ @ load two fifths into constant term in case we need it due to offsets
+ vldr lg2const, .Ltwofifths
+
+ @ bp is initially 1.0, may adjust later based on x value
+ vmov_f64 bp, #0x70
+
+ @ extract the mantissa from x for scaled value comparisons
+ @ (x_hw/x_lw are read from r1/r0 in both ABI modes)
+ lsl xmantissa, x_hw, #12
+
+ @ twoto1o5 = 2^(1/5) (input bracketing)
+ movw twoto1o5, #0x186c
+ movt twoto1o5, #0x2611
+ @ twoto3o5 = 2^(3/5) (input bracketing)
+ movw twoto3o5, #0x003b
+ movt twoto3o5, #0x8406
+
+ @ finish extracting xmantissa (top 32 mantissa bits of x)
+ orr xmantissa, xmantissa, x_lw, lsr #20
+
+ @ begin preparing a mask for normalization
+ vmov.i64 HIGH_WORD_MASK, #0xffffffff00000000
+
+ @ double_1 = (double) 1.0
+ vmov_f64 double_1, #0x70
+
+#if defined(KRAIT_NO_AAPCS_VFP_MODE)
+ @ move y from r registers to a d register
+ vmov y, y_lw, y_hw
+#endif
+
+ cmp xmantissa, twoto1o5
+
+ vshl.i64 EXPONENT_MASK, HIGH_WORD_MASK, #20
+ vshr.u64 int_1, HIGH_WORD_MASK, #63
+
+ adr literals, .LliteralTable @ literals walks the coefficient table through vldm writeback
+
+ bhi .Lxgt2to1over5
+ @ zero out the lg2 constant term if we do not offset our input
+ vsub.f64 lg2const, lg2const, lg2const
+ b .Lxle2to1over5
+
+.Lxgt2to1over5:
+ @ if normalized x > 2^(1/5), bp = 1 + (2^(2/5)-1) = 2^(2/5)
+ vadd.f64 bp, bp, bpa
+
+.Lxle2to1over5:
+ @ will need ln2 for various things
+ vldr ln2, .Lln2
+
+ cmp xmantissa, twoto3o5
+@@@@ X Value Normalization @@@@
+
+ @ ss = abs(x) 2^(-1024)
+ vbic.i64 ss, x, EXPONENT_MASK
+
+ @ N = (floor(log2(x)) + 0x3ff) * 2^52
+ vand.i64 int_n, x, EXPONENT_MASK
+
+ bls .Lxle2to3over5
+ @ if normalized x > 2^(3/5), bp = 2^(2/5) + (2^(4/5) - 2^(2/5)) = 2^(4/5)
+ vadd.f64 bp, bp, bpb
+ vadd.f64 lg2const, lg2const, lg2const
+
+.Lxle2to3over5:
+
+ @ load log2 polynomial series constants
+ vldm literals!, {k4, k3, k2, k1}
+
+ @ s = abs(x) 2^(-floor(log2(x))) (normalize abs(x) to around 1)
+ vorr.i64 ss, ss, double_1
+
+@@@@ 3/2 (Log(bp(1+s)/(1-s))) input computation (s = (x-bp)/(x+bp)) @@@@
+
+ vsub.f64 u, ss, bp
+ vadd.f64 v, ss, bp
+
+ @ s = (x-bp)/(x+bp)
+ vdiv.f64 ss, u, v
+
+ @ load 2/(3log2) into lg2coeff
+ vldr lg2coeff, .Ltwooverthreeln2
+
+ @ N = floor(log2(x)) * 2^52
+ vsub.i64 int_n, int_n, double_1
+
+@@@@ 3/2 (Log(bp(1+s)/(1-s))) polynomial series @@@@
+
+ @ ss2 = ((x-bp)/(x+bp))^2
+ vmul.f64 ss2, ss, ss
+ @ ylg2x = 3.0
+ vmov_f64 ylg2x, #8
+ vmul.f64 ss4, ss2, ss2
+
+ @ todo: useful later for two-way clamp
+ vmul.f64 lg2coeff, lg2coeff, y
+
+ @ N = floor(log2(x))
+ vshr.s64 int_n, int_n, #52
+
+ @ k3 = ss^2 * L4 + L3
+ vmla.f64 k3, ss2, k4
+
+ @ k1 = ss^2 * L2 + L1
+ vmla.f64 k1, ss2, k2
+
+ @ scale ss by 2/(3 ln 2)
+ vmul.f64 lg2coeff, ss, lg2coeff
+
+ @ ylg2x = 3.0 + s^2
+ vadd.f64 ylg2x, ylg2x, ss2
+
+ vcvt.f64.s32 double_n, int_n_low
+
+ @ k1 = s^4 (s^2 L4 + L3) + s^2 L2 + L1
+ vmla.f64 k1, ss4, k3
+
+ @ add in constant term
+ vadd.f64 double_n, lg2const
+
+ @ ylg2x = 3.0 + s^2 + s^4 (s^4 (s^2 L4 + L3) + s^2 L2 + L1)
+ vmla.f64 ylg2x, ss4, k1
+
+ @ ylg2x = y 2 s / (3 ln(2)) (3.0 + s^2 + s^4 (s^4(s^2 L4 + L3) + s^2 L2 + L1))
+ vmul.f64 ylg2x, lg2coeff, ylg2x
+
+@@@@ Compute input to Exp(s) (s = y(n + log2(x)) - (floor(8 yn + 1)/8 + floor(8 ylog2(x) + 1)/8) @@@@@
+
+ @ mask to extract bit 1 (2^-2 from our fixed-point representation)
+ vshl.u64 twoto1o4mask, int_1, #1
+
+ @ double_n = y * n
+ vmul.f64 double_n, double_n, y
+
+ @ Load 2^(1/4) for later computations
+ vldr twoto1o4, .Ltwoto1o4
+
+ @ either add or subtract one based on the sign of double_n and ylg2x
+ vshr.s64 ylg2x_round_offset, ylg2x, #62
+ vshr.s64 yn_round_offset, double_n, #62
+
+ @ move unmodified y*lg2x into temp space
+ vmov ylg2x_temp, ylg2x
+ @ compute floor(8 y * n + 1)/8
+ @ and floor(8 y (log2(x)) + 1)/8
+ vcvt.s32.f64 ylg2x, ylg2x, #3
+ @ move unmodified y*n into temp space
+ vmov yn_temp, double_n
+ vcvt.s32.f64 double_n, double_n, #3
+
+ @ load exp polynomial series constants
+ vldm literals!, {k8, k7, k6, k5, k4, k3, k2, k1}
+
+ @ mask to extract bit 2 (2^-1 from our fixed-point representation)
+ vshl.u64 twoto1o2mask, int_1, #2
+
+ @ make rounding offsets either 1 or -1 instead of 0 or -2
+ vorr.u64 ylg2x_round_offset, ylg2x_round_offset, int_1
+ vorr.u64 yn_round_offset, yn_round_offset, int_1
+
+ @ round up to the nearest 1/8th
+ vadd.s32 ylg2x, ylg2x, ylg2x_round_offset
+ vadd.s32 double_n, double_n, yn_round_offset
+
+ @ clear out round-up bit for y log2(x)
+ vbic.s32 ylg2x, ylg2x, int_1
+ @ clear out round-up bit for yn
+ vbic.s32 double_n, double_n, int_1
+ @ add together the (fixed precision) rounded parts
+ vadd.s64 rounded_exponent, double_n, ylg2x
+ @ turn int_n into a double with value 2^int_n
+ vshl.i64 int_n, rounded_exponent, #49
+ @ compute masks for 2^(1/4) and 2^(1/2) fixups for fractional part of fixed-precision rounded values:
+ vand.u64 twoto1o4mask, twoto1o4mask, rounded_exponent
+ vand.u64 twoto1o2mask, twoto1o2mask, rounded_exponent
+
+ @ convert back into floating point, double_n now holds (double) floor(8 y * n + 1)/8
+ @ ylg2x now holds (double) floor(8 y * log2(x) + 1)/8
+ vcvt.f64.s32 ylg2x, ylg2x, #3
+ vcvt.f64.s32 double_n, double_n, #3
+
+ @ put the 2 bit (0.5) through the roof of twoto1o2mask (make it 0x0 or 0xffffffffffffffff)
+ vqshl.u64 twoto1o2mask, twoto1o2mask, #62
+ @ put the 1 bit (0.25) through the roof of twoto1o4mask (make it 0x0 or 0xffffffffffffffff)
+ vqshl.u64 twoto1o4mask, twoto1o4mask, #63
+
+ @ center y*log2(x) fractional part between -0.125 and 0.125 by subtracting (double) floor(8 y * log2(x) + 1)/8
+ vsub.f64 ylg2x_temp, ylg2x_temp, ylg2x
+ @ center y*n fractional part between -0.125 and 0.125 by subtracting (double) floor(8 y * n + 1)/8
+ vsub.f64 yn_temp, yn_temp, double_n
+
+ @ Add fractional parts of yn and y log2(x) together
+ vadd.f64 ss, ylg2x_temp, yn_temp
+
+ @ Result = 1.0 (offset for exp(s) series)
+ vmov_f64 Result, #0x70
+
+ @ multiply fractional part of y * log2(x) by ln(2)
+ vmul.f64 ss, ln2, ss
+
+@@@@ 10th order polynomial series for Exp(s) @@@@
+
+ @ ss2 = (ss)^2
+ vmul.f64 ss2, ss, ss
+
+ @ twoto1o2mask = twoto1o2mask & twoto1o4
+ vand.u64 twoto1o2mask, twoto1o2mask, twoto1o4
+ @ twoto1o4mask = twoto1o4mask & twoto1o4
+ vand.u64 twoto1o4mask, twoto1o4mask, twoto1o4
+
+ @ Result = 1.0 + ss
+ vadd.f64 Result, Result, ss
+
+ @ k7 = ss k8 + k7
+ vmla.f64 k7, ss, k8
+
+ @ ss4 = (ss*ss) * (ss*ss)
+ vmul.f64 ss4, ss2, ss2
+
+ @ twoto1o2mask = twoto1o2mask | (double) 1.0 - results in either 1.0 or 2^(1/4) in twoto1o2mask
+ vorr.u64 twoto1o2mask, twoto1o2mask, double_1
+ @ twoto1o4mask = twoto1o4mask | (double) 1.0 - results in either 1.0 or 2^(1/4) in twoto1o4mask
+ vorr.u64 twoto1o4mask, twoto1o4mask, double_1
+
+ @ TODO: should setup sign here, expadjustment = 1.0
+ vmov_f64 expadjustment, #0x70
+
+ @ ss3 = (ss*ss) * ss
+ vmul.f64 ss3, ss2, ss
+
+ @ k0 = 1/2 (first non-unity coefficient)
+ vmov_f64 k0, #0x60
+
+ @ Mask out non-exponent bits to make sure we have just 2^int_n
+ vand.i64 int_n, int_n, EXPONENT_MASK
+
+ @ square twoto1o2mask to get 1.0 or 2^(1/2)
+ vmul.f64 twoto1o2mask, twoto1o2mask, twoto1o2mask
+ @ multiply twoto1o4mask into the exponent output adjustment value
+ vmul.f64 expadjustment, expadjustment, twoto1o4mask
+
+ @ k5 = ss k6 + k5
+ vmla.f64 k5, ss, k6
+
+ @ k3 = ss k4 + k3
+ vmla.f64 k3, ss, k4
+
+ @ k1 = ss k2 + k1
+ vmla.f64 k1, ss, k2
+
+ @ multiply twoto1o2mask into exponent output adjustment value
+ vmul.f64 expadjustment, expadjustment, twoto1o2mask
+
+ @ k5 = ss^2 ( ss k8 + k7 ) + ss k6 + k5
+ vmla.f64 k5, ss2, k7
+
+ @ k1 = ss^2 ( ss k4 + k3 ) + ss k2 + k1
+ vmla.f64 k1, ss2, k3
+
+ @ Result = 1.0 + ss + 1/2 ss^2
+ vmla.f64 Result, ss2, k0
+
+ @ Adjust int_n so that it's a double precision value that can be multiplied by Result
+ vadd.i64 expadjustment, int_n, expadjustment
+
+ @ k1 = ss^4 ( ss^2 ( ss k8 + k7 ) + ss k6 + k5 ) + ss^2 ( ss k4 + k3 ) + ss k2 + k1
+ vmla.f64 k1, ss4, k5
+
+ @ Result = 1.0 + ss + 1/2 ss^2 + ss^3 ( ss^4 ( ss^2 ( ss k8 + k7 ) + ss k6 + k5 ) + ss^2 ( ss k4 + k3 ) + ss k2 + k1 )
+ vmla.f64 Result, ss3, k1
+
+ @ multiply by adjustment (sign*(rounding ? sqrt(2) : 1) * 2^int_n)
+ vmul.f64 Result, expadjustment, Result
+
+.LleavePow:
+#if defined(KRAIT_NO_AAPCS_VFP_MODE)
+ @ return Result (FP)
+ vmov Return_lw, Return_hw, Result
+#endif
+.LleavePowDirect:
+ @ leave directly returning whatever is in Return_lw and Return_hw
+ pop {r4, r5, r6, r7, r8, r9, r10, pc}
+
+.align 6
+.LliteralTable:
+@ Least-squares tuned constants for 11th order (log2((1+s)/(1-s)):
+.LL4: @ ~3/11
+ .long 0x53a79915, 0x3fd1b108
+.LL3: @ ~1/3
+ .long 0x9ca0567a, 0x3fd554fa
+.LL2: @ ~3/7
+ .long 0x1408e660, 0x3fdb6db7
+.LL1: @ ~3/5
+ .long 0x332D4313, 0x3fe33333
+
+@ Least-squares tuned constants for 10th order exp(s):
+.LE10: @ ~1/3628800
+ .long 0x25c7ba0a, 0x3e92819b
+.LE9: @ ~1/362880
+ .long 0x9499b49c, 0x3ec72294
+.LE8: @ ~1/40320
+ .long 0xabb79d95, 0x3efa019f
+.LE7: @ ~1/5040
+ .long 0x8723aeaa, 0x3f2a019f
+.LE6: @ ~1/720
+ .long 0x16c76a94, 0x3f56c16c
+.LE5: @ ~1/120
+ .long 0x11185da8, 0x3f811111
+.LE4: @ ~1/24
+ .long 0x5555551c, 0x3fa55555
+.LE3: @ ~1/6
+ .long 0x555554db, 0x3fc55555
+
+.LbpA: @ (2^(2/5) - 1)
+ .long 0x4ee54db1, 0x3fd472d1
+
+.LbpB: @ (2^(4/5) - 2^(2/5))
+ .long 0x1c8a36cf, 0x3fdafb62
+
+.Ltwofifths: @
+ .long 0x9999999a, 0x3fd99999
+
+.Ltwooverthreeln2:
+ .long 0xDC3A03FD, 0x3FEEC709
+
+.Lln2: @ ln(2)
+ .long 0xFEFA39EF, 0x3FE62E42
+
+.Ltwoto1o4: @ 2^1/4
+ .long 0x0a31b715, 0x3ff306fe
+END(pow_neon)
diff --git a/libm/arm/s_cos.S b/libm/arm/s_cos.S
new file mode 100644
index 0000000..30a6767
--- /dev/null
+++ b/libm/arm/s_cos.S
@@ -0,0 +1,419 @@
+@ Copyright (c) 2012, The Linux Foundation. All rights reserved.
+@
+@ Redistribution and use in source and binary forms, with or without
+@ modification, are permitted provided that the following conditions are
+@ met:
+@ * Redistributions of source code must retain the above copyright
+@ notice, this list of conditions and the following disclaimer.
+@ * Redistributions in binary form must reproduce the above
+@ copyright notice, this list of conditions and the following
+@ disclaimer in the documentation and/or other materials provided
+@ with the distribution.
+@ * Neither the name of Code Aurora Forum, Inc. nor the names of its
+@ contributors may be used to endorse or promote products derived
+@ from this software without specific prior written permission.
+@
+@ THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED
+@ WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+@ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT
+@ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
+@ BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+@ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+@ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+@ BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+@ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+@ OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+@ IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+@
+@ Additional notices preserved for attributions purposes only.
+@
+@ ====================================================
+@ Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+@
+@ Developed at SunSoft, a Sun Microsystems, Inc. business.
+@ Permission to use, copy, modify, and distribute this
+@ software is freely granted, provided that this notice
+@ is preserved.
+@ ====================================================
+@
+@ ====================================================
+@ Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+@
+@ Developed at SunPro, a Sun Microsystems, Inc. business.
+@ Permission to use, copy, modify, and distribute this
+@ software is freely granted, provided that this notice
+@ is preserved.
+@ ====================================================
+
+#include <machine/cpu-features.h>
+#include <machine/asm.h>
+
+#define vmov_f64 fconstd
+
+ENTRY(cos)
+ push {r4, r6, r7, lr} @ cos(x): save scratch registers and the return address
+ vmov d0, r0, r1 @ d0 = x (argument arrives in the r0:r1 pair)
+ mov r2, r0
+ mov r3, r1
+ movw r1, #0x21fb
+ movt r1, #0x3fe9 @ r1 = 0x3fe921fb, the high-word threshold for |x| <= ~pi/4
+ mov r4, r3 @ r4 = signed high word of x, kept for later sign tests
+ bic r3, r3, #0x80000000 @ r3 = high word of |x| (sign bit cleared)
+ sub sp, sp, #48 @ frame: [sp]/[sp,#4] stack call args, [sp,#8]/[sp,#16] remainder pair, [sp,#24..47] three-term scratch
+ cmp r3, r1
+ bgt .Lxgtpio4
+ cmp r3, #0x3e400000 @ high word of 2^-27
+ bge .Lxnottiny
+ vcvt.s32.f64 s15, d0
+ vmov r3, s15 @ r3 = (int)x
+ cmp r3, #0
+ beq .Lreturnone @ tiny x that truncates to 0: answer is 1.0
+.Lxnottiny:
+ vmov.i64 d1, #0 @ second argument = 0.0 (no remainder tail)
+ bl __kernel_cos
+.Lleave_cos:
+ vmov r0, r1, d0 @ move the double result into the r0:r1 return pair
+.Lleave_cos_direct:
+ add sp, sp, #48
+ pop {r4, r6, r7, pc}
+.Lxgtpio4:
+ movw r2, #0xffff
+ movt r2, #0x7fef @ r2 = 0x7fefffff, high word of the largest finite double
+ cmp r3, r2
+ bgt .LxisNaN @ x is Inf or NaN
+ movw r0, #0xd97b
+ movt r0, #0x4002 @ r0 = 0x4002d97b, high-word threshold near 3*pi/4
+ cmp r3, r0
+ movw r2, #0x21fb @ low half shared by the 0x3ff921fb and 0x413921fb constants built below
+ bgt .Lxge3pio4
+ cmp r4, #0
+ movt r2, #0x3ff9 @ r2 = 0x3ff921fb, high word of pi/2
+ ble .Lsmallxisnegative
+ vldr d16, .Lpio2_1
+ cmp r3, r2
+ vsub.f64 d16, d0, d16 @ z = x - pio2_1
+ beq .Lxnearpio2 @ high word equals that of pi/2: use the pio2_2 terms for extra precision
+ vldr d17, .Lpio2_1t
+.Lfinalizesmallxremainder:
+ vsub.f64 d0, d16, d17 @ y0 = z - tail
+ vsub.f64 d16, d16, d0
+ vstr d0, [sp, #8] @ store y0
+ vsub.f64 d1, d16, d17 @ y1 = (z - y0) - tail
+ vstr d1, [sp, #16] @ store y1
+.Lnmod3is1:
+ mov r0, #1 @ integer argument for __kernel_sin (presumably the "y1 is valid" flag - confirm against k_sin.c)
+ bl __kernel_sin
+ vneg.f64 d0, d0 @ quadrant 1: result = -sin(y0, y1)
+ b .Lleave_cos
+.Lreturnone:
+ mov r0, #0
+ movw r1, #0x0000
+ movt r1, #0x3ff0 @ r1:r0 = bit pattern of 1.0
+ vmov_f64 d0, #0x70 @ d0 = 1.0 as well
+ b .Lleave_cos_direct @ r0:r1 already hold the result, skip the vmov
+.LxisNaN:
+ vsub.f64 d0, d0, d0 @ x - x produces NaN for Inf/NaN input
+ b .Lleave_cos
+.Lxge3pio4:
+ movt r2, #0x4139 @ r2 = 0x413921fb, upper bound for the medium-size path
+ cmp r3, r2
+ bgt .Lxgigantic
+ vmov_f64 d3, #0x60 @ d3 = 0.5
+ vldr d2, .Linvpio2
+ vldr d18, .Lpio2_1
+ vabs.f64 d16, d0 @ d16 = |x|
+ vmla.f64 d3, d16, d2 @ d3 = |x| * invpio2 + 0.5
+ vcvt.s32.f64 s3, d3 @ n = (int)d3
+ vcvt.f64.s32 d17, s3 @ fn = (double)n
+ vmov r0, s3 @ r0 = n
+ cmp r0, #31
+ vmls.f64 d16, d17, d18 @ r = |x| - fn * pio2_1
+ vldr d18, .Lpio2_1t
+ vmul.f64 d18, d17, d18 @ w = fn * pio2_1t
+ bgt .Lcomputeremainder @ n > 31: beyond the 32-entry npio2_hw table
+ ldr r2, .Lnpio2_hw_ptr
+ sub lr, r0, #1 @ table index n - 1
+.LPICnpio2_hw0:
+ add r12, pc, r2 @ r12 = address of npio2_hw (pc-relative)
+ ldr r1, [r12, lr, lsl #2] @ r1 = npio2_hw[n - 1]
+ cmp r3, r1
+ beq .Lcomputeremainder @ high word matches a table entry: take the multi-step path
+.Lfinishthirditeration:
+ vsub.f64 d0, d16, d18 @ y0 = r - w
+ vstr d0, [sp, #8]
+.Lfinishcomputingremainder:
+ vsub.f64 d16, d16, d0
+ cmp r4, #0
+ vsub.f64 d1, d16, d18 @ y1 = (r - y0) - w
+ vstr d1, [sp, #16]
+ blt .Lhandlenegativex @ original x was negative
+.Lselectregion:
+ and r0, r0, #3 @ quadrant = n & 3
+ cmp r0, #1
+ beq .Lnmod3is1
+ cmp r0, #2
+ beq .Lnmod3is2
+ cmp r0, #0
+ bne .Lnmod3is0 @ taken only when r0 == 3 (1 and 2 were handled above), despite the label name
+ bl __kernel_cos @ quadrant 0: result = cos(y0, y1)
+ b .Lleave_cos
+.Lxgigantic:
+ asr r2, r3, #20 @ biased exponent of |x|
+ vmov r6, r7, d0
+ sub r2, r2, #1040
+ mov r0, r6 @ r0 = low word of x
+ sub r2, r2, #6 @ r2 = exponent - 1046; stays live as the third argument of __kernel_rem_pio2
+ vldr d16, .Ltwo24
+ sub r1, r3, r2, lsl #20 @ high word with the exponent field rebased to 1046
+ vmov d18, r0, r1 @ d18 = z, the rescaled |x|
+ vcvt.s32.f64 s15, d18
+ add r1, sp, #48 @ r1 = one past the third scratch slot
+ mov r3, #3 @ start with three terms
+ vcvt.f64.s32 d17, s15 @ first term = (double)(int)z
+ vsub.f64 d18, d18, d17
+ vstr d17, [sp, #24] @ store first term
+ vmul.f64 d18, d18, d16 @ z = (z - term) * 2^24
+ vcvt.s32.f64 s15, d18
+ vcvt.f64.s32 d17, s15 @ second term
+ vsub.f64 d18, d18, d17
+ vstr d17, [sp, #32] @ store second term
+ vmul.f64 d16, d18, d16 @ third term
+ fcmpzd d16
+ vstmdb r1!, {d16} @ store third term at [sp, #40]
+ vmrs APSR_nzcv, fpscr
+ bne .Lprocessnonzeroterm
+.Lskipzeroterms:
+ vldmdb r1!, {d16} @ step back to the previous term
+ sub r3, r3, #1 @ drop one trailing zero term from the count
+ fcmpzd d16
+ vmrs APSR_nzcv, fpscr
+ beq .Lskipzeroterms
+.Lprocessnonzeroterm:
+ ldr r12, .Ltwo_over_pi_ptr
+ add r0, sp, #24 @ arg0 = pointer to the term array
+ add r1, sp, #8 @ arg1 = pointer to the y0/y1 output pair
+.LPICtwo_over_pi0:
+ add lr, pc, r12 @ lr = address of two_over_pi (pc-relative)
+ mov r12, #2
+ str lr, [sp, #4] @ second stack argument = two_over_pi table
+ str r12, [sp] @ first stack argument = 2
+ bl __kernel_rem_pio2 @ r2/r3 still hold the exponent offset and term count computed above
+ cmp r4, #0
+ vldr d0, [sp, #8] @ d0 = y0 written by the call
+ blt .Lhandlenegativxalso
+ vldr d1, [sp, #16] @ d1 = y1
+ b .Lselectregion @ r0 returned by the call is used as n
+.Lxnearpio2:
+ vldr d17, .Lpio2_2
+ vsub.f64 d16, d16, d17 @ z -= pio2_2
+ vldr d17, .Lpio2_2t @ and switch the tail to pio2_2t
+ b .Lfinalizesmallxremainder
+.Lsmallxisnegative:
+ vldr d1, .Lpio2_1
+ cmp r3, r2
+ vadd.f64 d16, d0, d1 @ z = x + pio2_1
+ beq .Lxnearnegpio2
+ vldr d17, .Lpio2_1t
+.Lfinalizesmallnegxremainder:
+ vadd.f64 d0, d16, d17 @ y0 = z + tail
+ vsub.f64 d16, d16, d0
+ vstr d0, [sp, #8]
+ vadd.f64 d1, d16, d17 @ y1 = (z - y0) + tail
+ vstr d1, [sp, #16]
+.Lnmod3is0:
+ mov r0, #1
+ bl __kernel_sin @ quadrant 3 (also the small negative x case): result = sin(y0, y1)
+ b .Lleave_cos
+.Lnmod3is2:
+ bl __kernel_cos
+ vneg.f64 d0, d0 @ quadrant 2: result = -cos(y0, y1)
+ b .Lleave_cos
+.Lcomputeremainder:
+ vsub.f64 d0, d16, d18 @ first-pass y0 = r - w
+ asr r1, r3, #20 @ r1 = exponent field of |x|
+ vmov r2, r3, d0
+ ubfx r3, r3, #20, #11 @ r3 = exponent field of y0
+ rsb r3, r3, r1 @ exponent gap between |x| and y0
+ vstr d0, [sp, #8]
+ cmp r3, #16
+ ble .Lfinishcomputingremainder @ gap <= 16: first pass is accepted
+ vldr d18, .Lpio2_2
+ vmul.f64 d20, d17, d18 @ w = fn * pio2_2
+ vsub.f64 d19, d16, d20 @ new r = t - w (t is the old r)
+ vsub.f64 d16, d16, d19
+ vsub.f64 d18, d16, d20 @ (t - r) - w
+ vldr d16, .Lpio2_2t
+ vnmls.f64 d18, d17, d16 @ w = fn * pio2_2t - ((t - r) - w)
+ vsub.f64 d0, d19, d18 @ second-pass y0 = r - w
+ vmov r2, r3, d0
+ ubfx r3, r3, #20, #11
+ rsb r1, r3, r1 @ exponent gap again
+ vstr d0, [sp, #8]
+ cmp r1, #49
+ ble .Lfinishseconditeration @ gap <= 49: second pass is accepted
+ vldr d5, .Lpio2_3
+ vmul.f64 d20, d17, d5 @ w = fn * pio2_3
+ vsub.f64 d16, d19, d20 @ r = t - w
+ vsub.f64 d4, d19, d16
+ vldr d19, .Lpio2_3t
+ vsub.f64 d18, d4, d20 @ (t - r) - w
+ vnmls.f64 d18, d17, d19 @ w = fn * pio2_3t - ((t - r) - w)
+ b .Lfinishthirditeration
+.Lhandlenegativex:
+ vneg.f64 d0, d0 @ negative x: negate y0,
+ rsb r0, r0, #0 @ n,
+ vneg.f64 d1, d1 @ and y1
+ vstr d0, [sp, #8]
+ vstr d1, [sp, #16]
+ b .Lselectregion
+.Lfinishseconditeration:
+ vmov d16, d19 @ r for the shared tail code lives in d16
+ b .Lfinishcomputingremainder
+.Lxnearnegpio2:
+ vldr d0, .Lpio2_2
+ vldr d17, .Lpio2_2t
+ vadd.f64 d16, d16, d0 @ z += pio2_2
+ b .Lfinalizesmallnegxremainder
+.Lhandlenegativxalso:
+ vldr d6, [sp, #16] @ y1 written by __kernel_rem_pio2
+ vneg.f64 d0, d0
+ rsb r0, r0, #0 @ n = -n
+ vneg.f64 d1, d6
+ vstr d0, [sp, #8]
+ vstr d1, [sp, #16]
+ b .Lselectregion
+
+.align 3
+.Lpio2_1:
+ .word 0x54400000, 0x3ff921fb @ leading part of pi/2 (low mantissa bits zeroed)
+.Lpio2_1t:
+ .word 0x1a626331, 0x3dd0b461
+.Linvpio2:
+ .word 0x6dc9c883, 0x3fe45f30 @ ~0.6366 = 2/pi
+.Ltwo24:
+ .word 0x00000000, 0x41700000 @ 2^24
+.Lpio2_2:
+ .word 0x1a600000, 0x3dd0b461
+.Lpio2_2t:
+ .word 0x2e037073, 0x3ba3198a
+.Lpio2_3:
+ .word 0x2e000000, 0x3ba3198a
+.Lpio2_3t:
+ .word 0x252049c1, 0x397b839a
+.Lnpio2_hw_ptr:
+ .word .Lnpio2_hw-(.LPICnpio2_hw0+8) @ offset consumed by the add-from-pc at .LPICnpio2_hw0
+.Ltwo_over_pi_ptr:
+ .word .Ltwo_over_pi-(.LPICtwo_over_pi0+8) @ offset consumed by the add-from-pc at .LPICtwo_over_pi0
+END(cos)
+
+ .section .rodata.npio2_hw,"a",%progbits
+ .align 2
+.Lnpio2_hw = . + 0
+ .type npio2_hw, %object
+ .size npio2_hw, 128
+npio2_hw:
+ .word 0x3ff921fb
+ .word 0x400921fb
+ .word 0x4012d97c
+ .word 0x401921fb
+ .word 0x401f6a7a
+ .word 0x4022d97c
+ .word 0x4025fdbb
+ .word 0x402921fb
+ .word 0x402c463a
+ .word 0x402f6a7a
+ .word 0x4031475c
+ .word 0x4032d97c
+ .word 0x40346b9c
+ .word 0x4035fdbb
+ .word 0x40378fdb
+ .word 0x403921fb
+ .word 0x403ab41b
+ .word 0x403c463a
+ .word 0x403dd85a
+ .word 0x403f6a7a
+ .word 0x40407e4c
+ .word 0x4041475c
+ .word 0x4042106c
+ .word 0x4042d97c
+ .word 0x4043a28c
+ .word 0x40446b9c
+ .word 0x404534ac
+ .word 0x4045fdbb
+ .word 0x4046c6cb
+ .word 0x40478fdb
+ .word 0x404858eb
+ .word 0x404921fb
+
+ .section .rodata.two_over_pi,"a",%progbits
+ .align 2
+.Ltwo_over_pi = . + 0
+ .type two_over_pi, %object
+ .size two_over_pi, 264
+two_over_pi:
+ .word 0x00a2f983
+ .word 0x006e4e44
+ .word 0x001529fc
+ .word 0x002757d1
+ .word 0x00f534dd
+ .word 0x00c0db62
+ .word 0x0095993c
+ .word 0x00439041
+ .word 0x00fe5163
+ .word 0x00abdebb
+ .word 0x00c561b7
+ .word 0x00246e3a
+ .word 0x00424dd2
+ .word 0x00e00649
+ .word 0x002eea09
+ .word 0x00d1921c
+ .word 0x00fe1deb
+ .word 0x001cb129
+ .word 0x00a73ee8
+ .word 0x008235f5
+ .word 0x002ebb44
+ .word 0x0084e99c
+ .word 0x007026b4
+ .word 0x005f7e41
+ .word 0x003991d6
+ .word 0x00398353
+ .word 0x0039f49c
+ .word 0x00845f8b
+ .word 0x00bdf928
+ .word 0x003b1ff8
+ .word 0x0097ffde
+ .word 0x0005980f
+ .word 0x00ef2f11
+ .word 0x008b5a0a
+ .word 0x006d1f6d
+ .word 0x00367ecf
+ .word 0x0027cb09
+ .word 0x00b74f46
+ .word 0x003f669e
+ .word 0x005fea2d
+ .word 0x007527ba
+ .word 0x00c7ebe5
+ .word 0x00f17b3d
+ .word 0x000739f7
+ .word 0x008a5292
+ .word 0x00ea6bfb
+ .word 0x005fb11f
+ .word 0x008d5d08
+ .word 0x00560330
+ .word 0x0046fc7b
+ .word 0x006babf0
+ .word 0x00cfbc20
+ .word 0x009af436
+ .word 0x001da9e3
+ .word 0x0091615e
+ .word 0x00e61b08
+ .word 0x00659985
+ .word 0x005f14a0
+ .word 0x0068408d
+ .word 0x00ffd880
+ .word 0x004d7327
+ .word 0x00310606
+ .word 0x001556ca
+ .word 0x0073a8c9
+ .word 0x0060e27b
+ .word 0x00c08c6b
diff --git a/libm/arm/s_sin.S b/libm/arm/s_sin.S
new file mode 100644
index 0000000..9c3366c
--- /dev/null
+++ b/libm/arm/s_sin.S
@@ -0,0 +1,414 @@
+@ Copyright (c) 2012, The Linux Foundation. All rights reserved.
+@
+@ Redistribution and use in source and binary forms, with or without
+@ modification, are permitted provided that the following conditions are
+@ met:
+@ * Redistributions of source code must retain the above copyright
+@ notice, this list of conditions and the following disclaimer.
+@ * Redistributions in binary form must reproduce the above
+@ copyright notice, this list of conditions and the following
+@ disclaimer in the documentation and/or other materials provided
+@ with the distribution.
+@ * Neither the name of Code Aurora Forum, Inc. nor the names of its
+@ contributors may be used to endorse or promote products derived
+@ from this software without specific prior written permission.
+@
+@ THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED
+@ WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+@ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT
+@ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
+@ BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+@ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+@ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+@ BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+@ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+@ OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+@ IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+@
+@ Additional notices preserved for attributions purposes only.
+@
+@ ====================================================
+@ Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+@
+@ Developed at SunSoft, a Sun Microsystems, Inc. business.
+@ Permission to use, copy, modify, and distribute this
+@ software is freely granted, provided that this notice
+@ is preserved.
+@ ====================================================
+@
+@ ====================================================
+@ Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+@
+@ Developed at SunPro, a Sun Microsystems, Inc. business.
+@ Permission to use, copy, modify, and distribute this
+@ software is freely granted, provided that this notice
+@ is preserved.
+@ ====================================================
+
+#include <machine/cpu-features.h>
+#include <machine/asm.h>
+
+#define vmov_f64 fconstd
+
+ENTRY(sin)
+ push {r4, r6, r7, lr} @ sin(x): save scratch registers and the return address
+ vmov d0, r0, r1 @ d0 = x (argument arrives in the r0:r1 pair)
+ mov r2, r0
+ mov r3, r1
+ movw r1, #0x21fb
+ movt r1, #0x3fe9 @ r1 = 0x3fe921fb, the high-word threshold for |x| <= ~pi/4
+ mov r4, r3 @ r4 = signed high word of x, kept for later sign tests
+ bic r3, r3, #0x80000000 @ r3 = high word of |x| (sign bit cleared)
+ sub sp, sp, #48 @ frame: [sp]/[sp,#4] stack call args, [sp,#8]/[sp,#16] remainder pair, [sp,#24..47] three-term scratch
+ cmp r3, r1
+ bgt .Lxgtpio4
+ cmp r3, #0x3e400000 @ high word of 2^-27
+ bge .Lxnottiny
+ vcvt.s32.f64 s15, d0
+ vmov r3, s15 @ r3 = (int)x
+ cmp r3, #0
+ bne .Lxnottiny @ otherwise fall through and return x itself
+.Lleave_sin:
+ vmov r0, r1, d0 @ move the double result into the r0:r1 return pair
+ add sp, sp, #48
+ pop {r4, r6, r7, pc}
+.Lxgtpio4:
+ movw r2, #0xffff
+ movt r2, #0x7fef @ r2 = 0x7fefffff, high word of the largest finite double
+ cmp r3, r2
+ bgt .LxisNaN @ x is Inf or NaN
+ movw r0, #0xd97b
+ movt r0, #0x4002 @ r0 = 0x4002d97b, high-word threshold near 3*pi/4
+ cmp r3, r0
+ movw r2, #0x21fb @ low half shared by the 0x3ff921fb and 0x413921fb constants built below
+ bgt .Lxge3pio4
+ cmp r4, #0
+ movt r2, #0x3ff9 @ r2 = 0x3ff921fb, high word of pi/2
+ ble .Lsmallxisnegative
+ vldr d16, .Lpio2_1
+ cmp r3, r2
+ vsub.f64 d16, d0, d16 @ z = x - pio2_1
+ beq .Lxnearpio2 @ high word equals that of pi/2: use the pio2_2 terms for extra precision
+ vldr d17, .Lpio2_1t
+.Lfinalizesmallxremainder:
+ vsub.f64 d0, d16, d17 @ y0 = z - tail
+ vsub.f64 d16, d16, d0
+ vstr d0, [sp, #8] @ store y0
+ vsub.f64 d1, d16, d17 @ y1 = (z - y0) - tail
+ vstr d1, [sp, #16] @ store y1
+.Lnmod3is1:
+ bl __kernel_cos @ quadrant 1: result = cos(y0, y1)
+ b .Lleave_sin
+.Lxnottiny:
+ vmov.i64 d1, #0 @ second argument = 0.0 (no remainder tail)
+ mov r0, #0 @ integer argument for __kernel_sin (presumably the "y1 is valid" flag, clear here - confirm against k_sin.c)
+ bl __kernel_sin
+ b .Lleave_sin
+.LxisNaN:
+ vsub.f64 d0, d0, d0 @ x - x produces NaN for Inf/NaN input
+ b .Lleave_sin
+.Lxge3pio4:
+ movt r2, #0x4139 @ r2 = 0x413921fb, upper bound for the medium-size path
+ cmp r3, r2
+ bgt .Lxgigantic
+ vmov_f64 d3, #0x60 @ d3 = 0.5
+ vldr d2, .Linvpio2
+ vldr d18, .Lpio2_1
+ vabs.f64 d16, d0 @ d16 = |x|
+ vmla.f64 d3, d16, d2 @ d3 = |x| * invpio2 + 0.5
+ vcvt.s32.f64 s3, d3 @ n = (int)d3
+ vcvt.f64.s32 d17, s3 @ fn = (double)n
+ vmov r0, s3 @ r0 = n
+ cmp r0, #31
+ vmls.f64 d16, d17, d18 @ r = |x| - fn * pio2_1
+ vldr d18, .Lpio2_1t
+ vmul.f64 d18, d17, d18 @ w = fn * pio2_1t
+ bgt .Lcomputeremainder @ n > 31: beyond the 32-entry npio2_hw table
+ ldr r2, .Lnpio2_hw_ptr
+ sub lr, r0, #1 @ table index n - 1
+.LPICnpio2_hw0:
+ add r12, pc, r2 @ r12 = address of npio2_hw (pc-relative)
+ ldr r1, [r12, lr, lsl #2] @ r1 = npio2_hw[n - 1]
+ cmp r3, r1
+ beq .Lcomputeremainder @ high word matches a table entry: take the multi-step path
+.Lfinishthirditeration:
+ vsub.f64 d0, d16, d18 @ y0 = r - w
+ vstr d0, [sp, #8]
+.Lfinishcomputingremainder:
+ vsub.f64 d16, d16, d0
+ cmp r4, #0
+ vsub.f64 d1, d16, d18 @ y1 = (r - y0) - w
+ vstr d1, [sp, #16]
+ blt .Lhandlenegativex @ original x was negative
+.Lselectregion:
+ and r0, r0, #3 @ quadrant = n & 3
+ cmp r0, #1
+ beq .Lnmod3is1
+ cmp r0, #2
+ beq .Lnmod3is2
+ cmp r0, #0
+ bne .Lnmod3is0 @ taken only when r0 == 3 (1 and 2 were handled above), despite the label name
+ mov r0, #1
+ bl __kernel_sin @ quadrant 0: result = sin(y0, y1)
+ b .Lleave_sin
+.Lxgigantic:
+ asr r2, r3, #20 @ biased exponent of |x|
+ vmov r6, r7, d0
+ sub r2, r2, #1040
+ mov r0, r6 @ r0 = low word of x
+ sub r2, r2, #6 @ r2 = exponent - 1046; stays live as the third argument of __kernel_rem_pio2
+ vldr d16, .Ltwo24
+ sub r1, r3, r2, lsl #20 @ high word with the exponent field rebased to 1046
+ vmov d18, r0, r1 @ d18 = z, the rescaled |x|
+ vcvt.s32.f64 s15, d18
+ add r1, sp, #48 @ r1 = one past the third scratch slot
+ mov r3, #3 @ start with three terms
+ vcvt.f64.s32 d17, s15 @ first term = (double)(int)z
+ vsub.f64 d18, d18, d17
+ vstr d17, [sp, #24] @ store first term
+ vmul.f64 d18, d18, d16 @ z = (z - term) * 2^24
+ vcvt.s32.f64 s15, d18
+ vcvt.f64.s32 d17, s15 @ second term
+ vsub.f64 d18, d18, d17
+ vstr d17, [sp, #32] @ store second term
+ vmul.f64 d16, d18, d16 @ third term
+ fcmpzd d16
+ vstmdb r1!, {d16} @ store third term at [sp, #40]
+ vmrs APSR_nzcv, fpscr
+ bne .Lprocessnonzeroterm
+.Lskipzeroterms:
+ vldmdb r1!, {d16} @ step back to the previous term
+ sub r3, r3, #1 @ drop one trailing zero term from the count
+ fcmpzd d16
+ vmrs APSR_nzcv, fpscr
+ beq .Lskipzeroterms
+.Lprocessnonzeroterm:
+ ldr r12, .Ltwo_over_pi_ptr
+ add r0, sp, #24 @ arg0 = pointer to the term array
+ add r1, sp, #8 @ arg1 = pointer to the y0/y1 output pair
+.LPICtwo_over_pi0:
+ add lr, pc, r12 @ lr = address of two_over_pi (pc-relative)
+ mov r12, #2
+ str lr, [sp, #4] @ second stack argument = two_over_pi table
+ str r12, [sp] @ first stack argument = 2
+ bl __kernel_rem_pio2 @ r2/r3 still hold the exponent offset and term count computed above
+ cmp r4, #0
+ vldr d0, [sp, #8] @ d0 = y0 written by the call
+ blt .Lhandlenegativexalso
+ vldr d1, [sp, #16] @ d1 = y1
+ b .Lselectregion @ r0 returned by the call is used as n
+.Lxnearpio2:
+ vldr d17, .Lpio2_2
+ vsub.f64 d16, d16, d17 @ z -= pio2_2
+ vldr d17, .Lpio2_2t @ and switch the tail to pio2_2t
+ b .Lfinalizesmallxremainder
+.Lsmallxisnegative:
+ vldr d1, .Lpio2_1
+ cmp r3, r2
+ vadd.f64 d16, d0, d1 @ z = x + pio2_1
+ beq .Lxnearnegpio2
+ vldr d17, .Lpio2_1t
+.Lfinalizesmallnegxremainder:
+ vadd.f64 d0, d16, d17 @ y0 = z + tail
+ vsub.f64 d16, d16, d0
+ vstr d0, [sp, #8]
+ vadd.f64 d1, d16, d17 @ y1 = (z - y0) + tail
+ vstr d1, [sp, #16]
+.Lnmod3is0:
+ bl __kernel_cos
+ vneg.f64 d0, d0 @ quadrant 3 (also the small negative x case): result = -cos(y0, y1)
+ b .Lleave_sin
+.Lnmod3is2:
+ mov r0, #1
+ bl __kernel_sin
+ vneg.f64 d0, d0 @ quadrant 2: result = -sin(y0, y1)
+ b .Lleave_sin
+.Lcomputeremainder:
+ vsub.f64 d0, d16, d18 @ first-pass y0 = r - w
+ asr r1, r3, #20 @ r1 = exponent field of |x|
+ vmov r2, r3, d0
+ ubfx r3, r3, #20, #11 @ r3 = exponent field of y0
+ rsb r3, r3, r1 @ exponent gap between |x| and y0
+ vstr d0, [sp, #8]
+ cmp r3, #16
+ ble .Lfinishcomputingremainder @ gap <= 16: first pass is accepted
+ vldr d18, .Lpio2_2
+ vmul.f64 d20, d17, d18 @ w = fn * pio2_2
+ vsub.f64 d19, d16, d20 @ new r = t - w (t is the old r)
+ vsub.f64 d16, d16, d19
+ vsub.f64 d18, d16, d20 @ (t - r) - w
+ vldr d16, .Lpio2_2t
+ vnmls.f64 d18, d17, d16 @ w = fn * pio2_2t - ((t - r) - w)
+ vsub.f64 d0, d19, d18 @ second-pass y0 = r - w
+ vmov r2, r3, d0
+ ubfx r3, r3, #20, #11
+ rsb r1, r3, r1 @ exponent gap again
+ vstr d0, [sp, #8]
+ cmp r1, #49
+ ble .Lfinishseconditeration @ gap <= 49: second pass is accepted
+ vldr d5, .Lpio2_3
+ vmul.f64 d20, d17, d5 @ w = fn * pio2_3
+ vsub.f64 d16, d19, d20 @ r = t - w
+ vsub.f64 d4, d19, d16
+ vldr d19, .Lpio2_3t
+ vsub.f64 d18, d4, d20 @ (t - r) - w
+ vnmls.f64 d18, d17, d19 @ w = fn * pio2_3t - ((t - r) - w)
+ b .Lfinishthirditeration
+.Lhandlenegativex:
+ vneg.f64 d0, d0 @ negative x: negate y0,
+ rsb r0, r0, #0 @ n,
+ vneg.f64 d1, d1 @ and y1
+ vstr d0, [sp, #8]
+ vstr d1, [sp, #16]
+ b .Lselectregion
+.Lfinishseconditeration:
+ vmov d16, d19 @ r for the shared tail code lives in d16
+ b .Lfinishcomputingremainder
+.Lxnearnegpio2:
+ vldr d0, .Lpio2_2
+ vldr d17, .Lpio2_2t
+ vadd.f64 d16, d16, d0 @ z += pio2_2
+ b .Lfinalizesmallnegxremainder
+.Lhandlenegativexalso:
+ vldr d6, [sp, #16] @ y1 written by __kernel_rem_pio2
+ vneg.f64 d0, d0
+ rsb r0, r0, #0 @ n = -n
+ vneg.f64 d1, d6
+ vstr d0, [sp, #8]
+ vstr d1, [sp, #16]
+ b .Lselectregion
+
+.align 3
+.Lpio2_1:
+ .word 0x54400000, 0x3ff921fb @ leading part of pi/2 (low mantissa bits zeroed)
+.Lpio2_1t:
+ .word 0x1a626331, 0x3dd0b461
+.Linvpio2:
+ .word 0x6dc9c883, 0x3fe45f30 @ ~0.6366 = 2/pi
+.Ltwo24:
+ .word 0x00000000, 0x41700000 @ 2^24
+.Lpio2_2:
+ .word 0x1a600000, 0x3dd0b461
+.Lpio2_2t:
+ .word 0x2e037073, 0x3ba3198a
+.Lpio2_3:
+ .word 0x2e000000, 0x3ba3198a
+.Lpio2_3t:
+ .word 0x252049c1, 0x397b839a
+.Lnpio2_hw_ptr:
+ .word .Lnpio2_hw-(.LPICnpio2_hw0+8) @ offset consumed by the add-from-pc at .LPICnpio2_hw0
+.Ltwo_over_pi_ptr:
+ .word .Ltwo_over_pi-(.LPICtwo_over_pi0+8) @ offset consumed by the add-from-pc at .LPICtwo_over_pi0
+END(sin)
+
+ .section .rodata.npio2_hw,"a",%progbits
+ .align 2
+.Lnpio2_hw = . + 0
+ .type npio2_hw, %object
+ .size npio2_hw, 128
+npio2_hw:
+ .word 0x3ff921fb
+ .word 0x400921fb
+ .word 0x4012d97c
+ .word 0x401921fb
+ .word 0x401f6a7a
+ .word 0x4022d97c
+ .word 0x4025fdbb
+ .word 0x402921fb
+ .word 0x402c463a
+ .word 0x402f6a7a
+ .word 0x4031475c
+ .word 0x4032d97c
+ .word 0x40346b9c
+ .word 0x4035fdbb
+ .word 0x40378fdb
+ .word 0x403921fb
+ .word 0x403ab41b
+ .word 0x403c463a
+ .word 0x403dd85a
+ .word 0x403f6a7a
+ .word 0x40407e4c
+ .word 0x4041475c
+ .word 0x4042106c
+ .word 0x4042d97c
+ .word 0x4043a28c
+ .word 0x40446b9c
+ .word 0x404534ac
+ .word 0x4045fdbb
+ .word 0x4046c6cb
+ .word 0x40478fdb
+ .word 0x404858eb
+ .word 0x404921fb
+
+ .section .rodata.two_over_pi,"a",%progbits
+ .align 2
+.Ltwo_over_pi = . + 0
+ .type two_over_pi, %object
+ .size two_over_pi, 264
+two_over_pi:
+ .word 0x00a2f983
+ .word 0x006e4e44
+ .word 0x001529fc
+ .word 0x002757d1
+ .word 0x00f534dd
+ .word 0x00c0db62
+ .word 0x0095993c
+ .word 0x00439041
+ .word 0x00fe5163
+ .word 0x00abdebb
+ .word 0x00c561b7
+ .word 0x00246e3a
+ .word 0x00424dd2
+ .word 0x00e00649
+ .word 0x002eea09
+ .word 0x00d1921c
+ .word 0x00fe1deb
+ .word 0x001cb129
+ .word 0x00a73ee8
+ .word 0x008235f5
+ .word 0x002ebb44
+ .word 0x0084e99c
+ .word 0x007026b4
+ .word 0x005f7e41
+ .word 0x003991d6
+ .word 0x00398353
+ .word 0x0039f49c
+ .word 0x00845f8b
+ .word 0x00bdf928
+ .word 0x003b1ff8
+ .word 0x0097ffde
+ .word 0x0005980f
+ .word 0x00ef2f11
+ .word 0x008b5a0a
+ .word 0x006d1f6d
+ .word 0x00367ecf
+ .word 0x0027cb09
+ .word 0x00b74f46
+ .word 0x003f669e
+ .word 0x005fea2d
+ .word 0x007527ba
+ .word 0x00c7ebe5
+ .word 0x00f17b3d
+ .word 0x000739f7
+ .word 0x008a5292
+ .word 0x00ea6bfb
+ .word 0x005fb11f
+ .word 0x008d5d08
+ .word 0x00560330
+ .word 0x0046fc7b
+ .word 0x006babf0
+ .word 0x00cfbc20
+ .word 0x009af436
+ .word 0x001da9e3
+ .word 0x0091615e
+ .word 0x00e61b08
+ .word 0x00659985
+ .word 0x005f14a0
+ .word 0x0068408d
+ .word 0x00ffd880
+ .word 0x004d7327
+ .word 0x00310606
+ .word 0x001556ca
+ .word 0x0073a8c9
+ .word 0x0060e27b
+ .word 0x00c08c6b
diff --git a/libm/src/e_pow.c b/libm/src/e_pow.c
index d213132..b0a3f53 100644
--- a/libm/src/e_pow.c
+++ b/libm/src/e_pow.c
@@ -61,6 +61,14 @@
#include "math.h"
#include "math_private.h"
+#if defined(KRAIT_NEON_OPTIMIZATION) || defined(SPARROW_NEON_OPTIMIZATION) /* NEON fast path for pow(), defined outside this file */
+#if defined(KRAIT_NO_AAPCS_VFP_MODE)
+double pow_neon(double x, double y); /* default calling convention: only x and y are passed */
+#else
+double pow_neon(double x, double y, int32_t lx, int32_t hx) __attribute__((pcs("aapcs-vfp"))); /* VFP calling convention; the caller also hands over its lx and hx words of x */
+#endif /* KRAIT_NO_AAPCS_VFP_MODE */
+#endif /* KRAIT_NEON_OPTIMIZATION || SPARROW_NEON_OPTIMIZATION */
+
static const double
bp[] = {1.0, 1.5,},
dp_h[] = { 0.0, 5.84962487220764160156e-01,}, /* 0x3FE2B803, 0x40000000 */
@@ -108,12 +116,32 @@
ix = hx&0x7fffffff; iy = hy&0x7fffffff;
/* y==zero: x**0 = 1 */
- if((iy|ly)==0) return one;
- /* +-NaN return x+y */
- if(ix > 0x7ff00000 || ((ix==0x7ff00000)&&(lx!=0)) ||
- iy > 0x7ff00000 || ((iy==0x7ff00000)&&(ly!=0)))
- return x+y;
+ if (ly == 0) {
+ if (hy == ly) {
+ /* y==0.0, x**0 = 1 */
+ return one;
+ }
+ else if (iy > 0x7ff00000) {
+ /* y is NaN, return x+y (NaN) */
+ return x+y;
+ }
+ }
+ else if (iy >= 0x7ff00000) {
+ /* y is NaN, return x+y (NaN) */
+ return x+y;
+ }
+
+ if (lx == 0) {
+ if (ix > 0x7ff00000) {
+ /* x is NaN, return x+y (NaN) */
+ return x+y;
+ }
+ }
+ else if (ix >= 0x7ff00000) {
+ /* x is NaN, return x+y (NaN) */
+ return x+y;
+ }
/* determine if y is an odd int when x < 0
* yisint = 0 ... y is not an integer
@@ -201,6 +229,14 @@
t1 = u+v;
SET_LOW_WORD(t1,0);
t2 = v-(t1-u);
+#if defined(KRAIT_NEON_OPTIMIZATION) || defined(SPARROW_NEON_OPTIMIZATION)
+ } else if (ix <= 0x40100000 && iy <= 0x40100000 && hy > 0 && hx > 0) { /* x and y both positive with high word <= 0x40100000 (the high word of 4.0): hand off to the NEON fast path; NOTE(review): a non-zero low word lets values just above 4.0 through -- confirm pow_neon accepts them */
+#if defined(KRAIT_NO_AAPCS_VFP_MODE)
+ return pow_neon(x,y);
+#else
+ return pow_neon(x,y,lx,hx); /* VFP-convention variant also receives the words of x */
+#endif /* KRAIT_NO_AAPCS_VFP_MODE */
+#endif /* KRAIT_NEON_OPTIMIZATION || SPARROW_NEON_OPTIMIZATION */
} else {
double ss,s2,s_h,s_l,t_h,t_l;
n = 0;
diff --git a/libm/src/k_cos.c b/libm/src/k_cos.c
index 00916d7..b8cdf8f 100644
--- a/libm/src/k_cos.c
+++ b/libm/src/k_cos.c
@@ -69,6 +69,17 @@
double
__kernel_cos(double x, double y)
{
+#if defined(KRAIT_NEON_OPTIMIZATION) /* alternative body: same formula, polynomial terms regrouped */
+ double hz,z,zz,r,w,k;
+
+ z = x*x;
+ zz = z*z; /* x^4, lets the C5/C6 pair be evaluated independently of the C3/C4 pair */
+ k = x*y; /* product of x and the second argument y, subtracted in the final sum */
+ hz = (float)0.5*z;
+ r = z*(z*(C1+z*(C2+z*((C3+z*C4)+zz*(C5+z*C6))))); /* expands to z^2*(C1 + C2*z + C3*z^2 + C4*z^3 + C5*z^4 + C6*z^5) */
+ w = one-hz;
+ return w + (((one-w)-hz) + (r-k)); /* (one-w)-hz recovers the rounding error of w = one-hz */
+#else /* !KRAIT_NEON_OPTIMIZATION */
double hz,z,r,w;
z = x*x;
@@ -76,4 +87,5 @@
hz = (float)0.5*z;
w = one-hz;
return w + (((one-w)-hz) + (z*r-x*y));
+#endif /* KRAIT_NEON_OPTIMIZATION */
}
diff --git a/libm/src/k_sin.c b/libm/src/k_sin.c
index ae06a9d..ee641d4 100644
--- a/libm/src/k_sin.c
+++ b/libm/src/k_sin.c
@@ -60,6 +60,16 @@
double
__kernel_sin(double x, double y, int iy)
{
+#if defined(KRAIT_NEON_OPTIMIZATION) /* alternative body: same formula, polynomial terms regrouped */
+ double z,zz,r,v;
+
+ z = x*x;
+ zz = z*z; /* x^4, lets the S5/S6 pair be evaluated independently of the S3/S4 pair */
+ v = z*x; /* x^3 */
+ r = S2+z*((S3+z*S4)+zz*(S5+z*S6)); /* expands to S2 + S3*z + S4*z^2 + S5*z^3 + S6*z^4, algebraically the same as the nested form in the #else branch */
+ if(iy==0) return x+v*(S1+z*r); /* iy == 0: y does not enter the result */
+ else return x-((z*(half*y-v*r)-y)-v*S1); /* otherwise y is folded into the result */
+#else /* !KRAIT_NEON_OPTIMIZATION */
double z,r,v;
z = x*x;
@@ -67,4 +77,5 @@
r = S2+z*(S3+z*(S4+z*(S5+z*S6)));
if(iy==0) return x+v*(S1+z*r);
else return x-((z*(half*y-v*r)-y)-v*S1);
+#endif /* KRAIT_NEON_OPTIMIZATION */
}
diff --git a/libm/src/math_private.h b/libm/src/math_private.h
index 5f6e088..7cda2e9 100644
--- a/libm/src/math_private.h
+++ b/libm/src/math_private.h
@@ -257,11 +257,19 @@
#define __ieee754_ldexpf ldexpf
/* fdlibm kernel function */
+#if defined(KRAIT_NEON_OPTIMIZATION) /* same five prototypes as the #else branch, each tagged with the VFP calling convention */
+int __ieee754_rem_pio2(double,double*) __attribute__((pcs("aapcs-vfp")));
+double __kernel_sin(double,double,int) __attribute__((pcs("aapcs-vfp")));
+double __kernel_cos(double,double) __attribute__((pcs("aapcs-vfp")));
+double __kernel_tan(double,double,int) __attribute__((pcs("aapcs-vfp")));
+int __kernel_rem_pio2(double*,double*,int,int,int,const int*) __attribute__((pcs("aapcs-vfp"))); /* takes and returns no floating-point value by value; tagged like the others */
+#else /* !KRAIT_NEON_OPTIMIZATION */
int __ieee754_rem_pio2(double,double*);
double __kernel_sin(double,double,int);
double __kernel_cos(double,double);
double __kernel_tan(double,double,int);
int __kernel_rem_pio2(double*,double*,int,int,int,const int*);
+#endif /* KRAIT_NEON_OPTIMIZATION */
/* float versions of fdlibm kernel functions */
int __ieee754_rem_pio2f(float,float*);