libc: Kryo-specific memory routine
Add a Kryo-specific memcpy implementation.
Change-Id: Id3af7bdbc9d621c56cd26cbc04f9ad116f228550
diff --git a/libc/arch-arm64/kryo/bionic/memcpy_base.S b/libc/arch-arm64/kryo/bionic/memcpy_base.S
new file mode 100644
index 0000000..0096bb7
--- /dev/null
+++ b/libc/arch-arm64/kryo/bionic/memcpy_base.S
@@ -0,0 +1,244 @@
+/* Copyright (c) 2015 The Linux Foundation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of The Linux Foundation nor the names of its contributors may
+ * be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef PLDOFFS
+#undef PLDOFFS
+#endif
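+// PLDOFFS: prefetch distance, in 128-byte blocks, ahead of the current source pointer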
+#define PLDOFFS (16)
+
+#ifdef PLDTHRESH
+#undef PLDTHRESH
+#endif
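+// PLDTHRESH: copies of at most this many 128-byte blocks skip prefetching entirely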
+#define PLDTHRESH (PLDOFFS)
+
+#ifdef BBTHRESH
+#undef BBTHRESH
+#endif
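+// BBTHRESH: copies of at most this many 128-byte blocks (2 KiB) take the short prime-pump path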
+#define BBTHRESH (2048/128)
+
+#if (PLDOFFS < 1)
+#error Routine does not support offsets less than 1
+#endif
+#if (PLDTHRESH < PLDOFFS)
+#error PLD threshold must be greater than or equal to the PLD offset
+#endif
+
+#ifdef PLDSIZE
+#undef PLDSIZE
+#endif
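+// PLDSIZE: bytes prefetched per block, issued as two 64-byte prefetches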
+#define PLDSIZE (128)
+
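+// kryo_bb_memcpy: x0 = destination, x1 = source, x2 = byte count.
+// The original destination is saved in x11 and restored into x0 on exit.
+// No ENTRY directive is provided here; this file is presumably included by a
+// memcpy.S wrapper that defines the exported memcpy symbol.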
+kryo_bb_memcpy:
+ mov x11, x0
+ cmp x2, #4
+ blo kryo_bb_lt4
+ cmp x2, #16
+ blo kryo_bb_lt16
+ cmp x2, #32
+ blo kryo_bb_16
+ cmp x2, #64
+ blo kryo_bb_copy_32_a
+ cmp x2, #128
+ blo kryo_bb_copy_64_a
+
+ // at least 128 bytes remain, so up to 127 bytes may be consumed to 128-byte align the source
+ neg x3, x1 // calculate count to get SOURCE aligned
+ ands x3, x3, #0x7F
+ b.eq kryo_bb_source_aligned // already aligned
+ // alignment fixup, small to large (favorable alignment)
+ tbz x3, #0, 1f
+ ldrb w5, [x1], #1
+ strb w5, [x0], #1
+1: tbz x3, #1, 2f
+ ldrh w6, [x1], #2
+ strh w6, [x0], #2
+2: tbz x3, #2, 3f
+ ldr w8, [x1], #4
+ str w8, [x0], #4
+3: tbz x3, #3, 4f
+ ldr x9, [x1], #8
+ str x9, [x0], #8
+4: tbz x3, #4, 5f
+ ldr q7, [x1], #16
+ str q7, [x0], #16
+5: tbz x3, #5, 55f
+ ldp q0, q1, [x1], #32
+ stp q0, q1, [x0], #32
+55: tbz x3, #6, 6f
+ ldp q0, q1, [x1], #32
+ ldp q2, q3, [x1], #32
+ stp q0, q1, [x0], #32
+ stp q2, q3, [x0], #32
+6: subs x2, x2, x3 // fixup count after alignment
+ b.eq kryo_bb_exit
+ cmp x2, #128
+ blo kryo_bb_copy_64_a
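+// The source is now 128-byte aligned; x12 holds the number of whole 128-byte
+// blocks left, which selects between the no-prefetch, prime-pump and
+// double-prefetch strategies below.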
+kryo_bb_source_aligned:
+ lsr x12, x2, #7
+ cmp x12, #PLDTHRESH
+ bls kryo_bb_copy_128_loop_nopld
+
+ cmp x12, #BBTHRESH
+ bls kryo_bb_prime_pump
+
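+ // Split the bulk copy into three phases: x9 blocks for the double-prefetch
+ // loop, x12 blocks for the single-prefetch (or DDR) loop, and a final x14>>7
+ // blocks copied without prefetching. x14 is derived from the offset between
+ // the destination and the prefetch address modulo 2 KiB, presumably to keep
+ // the prefetch stream from colliding with the store stream in the cache.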
+ add x14, x0, #0x400
+ add x9, x1, #(PLDOFFS*PLDSIZE)
+ sub x14, x14, x9
+ lsl x14, x14, #(21+32)
+ lsr x14, x14, #(21+32)
+ add x14, x14, #(PLDOFFS*PLDSIZE)
+ cmp x12, x14, lsr #7
+ bls kryo_bb_prime_pump
+
+ mov x9, #(PLDOFFS)
+ lsr x13, x14, #7
+ subs x9, x13, x9
+ bls kryo_bb_prime_pump
+
+ add x10, x1, x14
+ bic x10, x10, #0x7F // Round to multiple of PLDSIZE
+
+ sub x12, x12, x14, lsr #7
+ cmp x9, x12
+ sub x13, x12, x9
+ csel x12, x13, x12, LS
+ csel x9, x12, x9, HI
+ csel x12, xzr, x12, HI
+
+ prfm PLDL1STRM, [x1, #((PLDOFFS-1)*PLDSIZE)]
+ prfm PLDL1STRM, [x1, #((PLDOFFS-1)*PLDSIZE+64)]
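+// Double-prefetch loop: each iteration issues streaming prefetches for the next
+// source block plus PLDL1KEEP prefetches through x10, then copies 128 bytes
+// with four ldp/stp quad-register pairs.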
+kryo_bb_copy_128_loop_outer_doublepld:
+ prfm PLDL1STRM, [x1, #((PLDOFFS)*PLDSIZE)]
+ prfm PLDL1STRM, [x1, #((PLDOFFS)*PLDSIZE)+64]
+ subs x9, x9, #1
+ ldp q0, q1, [x1], #32
+ ldp q2, q3, [x1], #32
+ ldp q4, q5, [x1], #32
+ ldp q6, q7, [x1], #32
+ prfm PLDL1KEEP, [x10]
+ prfm PLDL1KEEP, [x10, #64]
+ add x10, x10, #128
+ stp q0, q1, [x0], #32
+ stp q2, q3, [x0], #32
+ stp q4, q5, [x0], #32
+ stp q6, q7, [x0], #32
+ bne kryo_bb_copy_128_loop_outer_doublepld
+ cmp x12, #0
+ beq kryo_bb_pop_before_nopld
+ cmp x12, #(448*1024/128)
+ bls kryo_bb_copy_128_loop_outer
+
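+// DDR loop, taken when more than 448 KiB remain: the ldr from x10 acts as a
+// prefetch by actual load (the value read into x3 is discarded), which is
+// presumably more dependable than a prfm hint for data this far out in memory.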
+kryo_bb_copy_128_loop_ddr:
+ subs x12, x12, #1
+ ldr x3, [x10], #128
+ ldp q0, q1, [x1], #32
+ ldp q2, q3, [x1], #32
+ ldp q4, q5, [x1], #32
+ ldp q6, q7, [x1], #32
+ stp q0, q1, [x0], #32
+ stp q2, q3, [x0], #32
+ stp q4, q5, [x0], #32
+ stp q6, q7, [x0], #32
+ bne kryo_bb_copy_128_loop_ddr
+ b kryo_bb_pop_before_nopld
+
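+// Prime-pump path for modest copies: set up x10 as the prefetch pointer PLDOFFS
+// blocks ahead of the source, issue an initial prefetch pair, and fall into the
+// standard loop (or the DDR loop if more than 448 KiB remain).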
+kryo_bb_prime_pump:
+ mov x14, #(PLDOFFS*PLDSIZE)
+ add x10, x1, #(PLDOFFS*PLDSIZE)
+ bic x10, x10, #0x7F
+ sub x12, x12, #PLDOFFS
+ prfm PLDL1KEEP, [x10, #(-1*PLDSIZE)]
+ prfm PLDL1KEEP, [x10, #(-1*PLDSIZE+64)]
+ cmp x12, #(448*1024/128)
+ bhi kryo_bb_copy_128_loop_ddr
+
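+// Standard prefetching loop: one PLDL1KEEP prefetch pair through x10 per
+// 128-byte block copied.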
+kryo_bb_copy_128_loop_outer:
+ subs x12, x12, #1
+ prfm PLDL1KEEP, [x10]
+ prfm PLDL1KEEP, [x10, #64]
+ ldp q0, q1, [x1], #32
+ ldp q2, q3, [x1], #32
+ ldp q4, q5, [x1], #32
+ ldp q6, q7, [x1], #32
+ add x10, x10, #128
+ stp q0, q1, [x0], #32
+ stp q2, q3, [x0], #32
+ stp q4, q5, [x0], #32
+ stp q6, q7, [x0], #32
+ bne kryo_bb_copy_128_loop_outer
+
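+// Drain the x14 bytes that have already been prefetched ahead, copying them in
+// 128-byte blocks with no further prefetching.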
+kryo_bb_pop_before_nopld:
+ lsr x12, x14, #7
+kryo_bb_copy_128_loop_nopld:
+ ldp q0, q1, [x1], #32
+ ldp q2, q3, [x1], #32
+ ldp q4, q5, [x1], #32
+ ldp q6, q7, [x1], #32
+ subs x12, x12, #1
+ stp q0, q1, [x0], #32
+ stp q2, q3, [x0], #32
+ stp q4, q5, [x0], #32
+ stp q6, q7, [x0], #32
+ bne kryo_bb_copy_128_loop_nopld
+ ands x2, x2, #0x7f
+ beq kryo_bb_exit
+
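+// Tail: copy any remaining 64/32/16/8/4/2/1 bytes according to the bits of the
+// residual count in x2. These labels are also the fast path for small copies
+// dispatched at the top of the routine.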
+kryo_bb_copy_64_a:
+ tbz x2, #6, kryo_bb_copy_32_a
+ ldp q0, q1, [x1], #32
+ ldp q2, q3, [x1], #32
+ stp q0, q1, [x0], #32
+ stp q2, q3, [x0], #32
+kryo_bb_copy_32_a:
+ tbz x2, #5, kryo_bb_16
+ ldp q0, q1, [x1], #32
+ stp q0, q1, [x0], #32
+kryo_bb_16:
+ tbz x2, #4, kryo_bb_lt16
+ ldr q7, [x1], #16
+ str q7, [x0], #16
+ ands x2, x2, #0x0f
+ beq kryo_bb_exit
+kryo_bb_lt16:
+ tbz x2, #3, kryo_bb_lt8
+ ldr x3, [x1], #8
+ str x3, [x0], #8
+kryo_bb_lt8:
+ tbz x2, #2, kryo_bb_lt4
+ ldr w3, [x1], #4
+ str w3, [x0], #4
+kryo_bb_lt4:
+ tbz x2, #1, kryo_bb_lt2
+ ldrh w3, [x1], #2
+ strh w3, [x0], #2
+kryo_bb_lt2:
+ tbz x2, #0, kryo_bb_exit
+ ldrb w3, [x1], #1
+ strb w3, [x0], #1
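+// Return the original destination pointer saved in x11 on entry.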
+kryo_bb_exit:
+ mov x0, x11
+ ret
+