/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/cpu-features.h>
#include <machine/asm.h>

#if defined(__ARM_NEON__) && !defined(ARCH_ARM_USE_NON_NEON_MEMCPY)
#if defined(KRAIT_NEON_OPTIMIZATION)
        /*
         * These can be overridden in:
         *   device/<vendor>/<board>/BoardConfig.mk
         * by setting the following:
         *   TARGET_USE_KRAIT_BIONIC_OPTIMIZATION := true
         *   TARGET_USE_KRAIT_PLD_SET := true
         *   TARGET_KRAIT_BIONIC_PLDOFFS := <pldoffset>
         *   TARGET_KRAIT_BIONIC_PLDSIZE := <pldsize>
         *   TARGET_KRAIT_BIONIC_PLDTHRESH := <pldthreshold>
         *   TARGET_KRAIT_BIONIC_BBTHRESH := <bbthreshold>
         */
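        /*
         * For illustration only: a BoardConfig.mk that enables this path and
         * simply restates the in-file defaults below (placeholder values, not
         * tuned recommendations) could contain:
         *
         *   TARGET_USE_KRAIT_BIONIC_OPTIMIZATION := true
         *   TARGET_USE_KRAIT_PLD_SET := true
         *   TARGET_KRAIT_BIONIC_PLDOFFS := 10
         *   TARGET_KRAIT_BIONIC_PLDSIZE := 64
         *   TARGET_KRAIT_BIONIC_PLDTHRESH := 10
         *   TARGET_KRAIT_BIONIC_BBTHRESH := 64
         */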
#ifndef PLDOFFS
#define PLDOFFS (10)
#endif
#ifndef PLDTHRESH
#define PLDTHRESH (PLDOFFS)
#endif
#ifndef BBTHRESH
#define BBTHRESH (4096/64)
#endif
#if (PLDOFFS < 1)
#error Routine does not support offsets less than 1
#endif
#if (PLDTHRESH < PLDOFFS)
#error PLD threshold must be greater than or equal to the PLD offset
#endif
#ifndef PLDSIZE
#define PLDSIZE (64)
#endif
#define NOP_OPCODE (0xe320f000)
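        /*
         * Note added for clarity: NOP_OPCODE is the ARM encoding of NOP and is
         * used as the fill word for the ".balignl 64, NOP_OPCODE, 4*2"
         * directives below, which pad to the next 64-byte boundary only when
         * at most 8 bytes of padding are required.
         */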

        .text
        .fpu    neon
        .global memcpy
        .type   memcpy, %function
        .align  5
memcpy:
        stmfd   sp!, {r0, r9, r10, lr}
        cmp     r2, #4
        blt     .Lneon_lt4
        cmp     r2, #16
        blt     .Lneon_lt16
        cmp     r2, #32
        blt     .Lneon_16
        cmp     r2, #64
        blt     .Lneon_copy_32_a

        mov     r12, r2, lsr #6
        cmp     r12, #PLDTHRESH
        ble     .Lneon_copy_64_loop_nopld

        cmp     r12, #BBTHRESH
        ble     .Lneon_prime_pump

        add     lr, r0, #0x400
        add     r9, r1, #(PLDOFFS*PLDSIZE)
        sub     lr, lr, r9
        lsl     lr, lr, #21
        lsr     lr, lr, #21
        add     lr, lr, #(PLDOFFS*PLDSIZE)
        cmp     r12, lr, lsr #6
        movle   lr, #(PLDOFFS*PLDSIZE)

        movgt   r9, #(PLDOFFS)
        rsbgts  r9, r9, lr, lsr #6
        ble     .Lneon_prime_pump

        add     r10, r1, lr
        bic     r10, #0x3F

        sub     r12, lr, lsr #6
        cmp     r9, r12
        suble   r12, r12, r9
        movgt   r9, r12
        movgt   r12, #0

        pld     [r1, #((PLDOFFS-1)*PLDSIZE)]
        .balignl 64, NOP_OPCODE, 4*2
.Lneon_copy_64_loop_outer_doublepld:
        pld     [r1, #((PLDOFFS)*PLDSIZE)]
        vld1.32 {q0, q1}, [r1]!
        vld1.32 {q2, q3}, [r1]!
        ldr     r3, [r10]
        subs    r9, r9, #1
        vst1.32 {q0, q1}, [r0]!
        vst1.32 {q2, q3}, [r0]!
        add     r10, #64
        bne     .Lneon_copy_64_loop_outer_doublepld
        cmp     r12, #0
        bne     .Lneon_copy_64_loop_outer
        mov     r12, lr, lsr #6
        b       .Lneon_copy_64_loop_nopld
        .balignl 64, NOP_OPCODE, 4*2
.Lneon_prime_pump:
        mov     lr, #(PLDOFFS*PLDSIZE)
        add     r10, r1, #(PLDOFFS*PLDSIZE)
        bic     r10, #0x3F
        sub     r12, r12, #PLDOFFS
        pld     [r10, #(-1*PLDSIZE)]
        .balignl 64, NOP_OPCODE, 4*2
.Lneon_copy_64_loop_outer:
        vld1.32 {q0, q1}, [r1]!
        vld1.32 {q2, q3}, [r1]!
        ldr     r3, [r10]
        subs    r12, r12, #1
        vst1.32 {q0, q1}, [r0]!
        vst1.32 {q2, q3}, [r0]!
        add     r10, #64
        bne     .Lneon_copy_64_loop_outer
        mov     r12, lr, lsr #6
        .balignl 64, NOP_OPCODE, 4*2
.Lneon_copy_64_loop_nopld:
        vld1.32 {q8, q9}, [r1]!
        vld1.32 {q10, q11}, [r1]!
        subs    r12, r12, #1
        vst1.32 {q8, q9}, [r0]!
        vst1.32 {q10, q11}, [r0]!
        bne     .Lneon_copy_64_loop_nopld
        ands    r2, r2, #0x3f
        beq     .Lneon_exit
        .balignl 64, NOP_OPCODE, 4*2
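        /*
         * Note added for clarity: the tails below dispatch on the remaining
         * length by shifting it into the flags.  "movs rX, r2, lsl #27" puts
         * bit 5 of r2 (a 32-byte chunk) into C and bit 4 (a 16-byte chunk)
         * into N; the later shifts by #29 and #31 do the same for the
         * 8/4-byte and 2/1-byte chunks, so each conditional load/store copies
         * exactly one power-of-two piece of the remainder.
         */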
.Lneon_copy_32_a:
        movs    r12, r2, lsl #27
        bcc     .Lneon_16
        vld1.32 {q0, q1}, [r1]!
        vst1.32 {q0, q1}, [r0]!
        .balignl 64, NOP_OPCODE, 4*2
.Lneon_16:
        bpl     .Lneon_lt16
        vld1.32 {q8}, [r1]!
        vst1.32 {q8}, [r0]!
        ands    r2, r2, #0x0f
        beq     .Lneon_exit
        .balignl 64, NOP_OPCODE, 4*2
.Lneon_lt16:
        movs    r12, r2, lsl #29
        ldrcs   r3, [r1], #4
        ldrcs   r12, [r1], #4
        strcs   r3, [r0], #4
        strcs   r12, [r0], #4
        ldrmi   r3, [r1], #4
        strmi   r3, [r0], #4
        .balignl 64, NOP_OPCODE, 4*2
.Lneon_lt4:
        movs    r2, r2, lsl #31
        ldrcsh  r3, [r1], #2
        strcsh  r3, [r0], #2
        ldrmib  r12, [r1]
        strmib  r12, [r0]
        .balignl 64, NOP_OPCODE, 4*2
.Lneon_exit:
        ldmfd   sp!, {r0, r9, r10, lr}
        bx      lr
        .end
#elif defined(SCORPION_NEON_OPTIMIZATION)
        /*
         * These can be overridden in:
         *   device/<vendor>/<board>/BoardConfig.mk
         * by setting the following:
         *   TARGET_USE_SCORPION_BIONIC_OPTIMIZATION := true
         *   TARGET_USE_SCORPION_PLD_SET := true
         *   TARGET_SCORPION_BIONIC_PLDOFFS := <pldoffset>
         *   TARGET_SCORPION_BIONIC_PLDSIZE := <pldsize>
         */
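        /*
         * For illustration only: a BoardConfig.mk enabling this path with the
         * in-file defaults below (placeholder values, not tuned
         * recommendations) could contain:
         *
         *   TARGET_USE_SCORPION_BIONIC_OPTIMIZATION := true
         *   TARGET_USE_SCORPION_PLD_SET := true
         *   TARGET_SCORPION_BIONIC_PLDOFFS := 6
         *   TARGET_SCORPION_BIONIC_PLDSIZE := 128
         */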
#ifndef PLDOFFS
#define PLDOFFS (6)
#endif
#ifndef PLDSIZE
#define PLDSIZE (128)   /* L2 cache line size */
#endif
        .code 32
        .align 5
        .globl memcpy
        .func
memcpy:
        push    {r0}
        cmp     r2, #4
        blt     .Lneon_lt4
        cmp     r2, #16
        blt     .Lneon_lt16
        cmp     r2, #32
        blt     .Lneon_16
        cmp     r2, #128
        blt     .Lneon_copy_32_a
        /* Copy blocks of 128 bytes (word-aligned) at a time */
        /* Code below is optimized for PLDSIZE=128 only */
        mov     r12, r2, lsr #7
        cmp     r12, #PLDOFFS
        ble     .Lneon_copy_128_loop_nopld
        sub     r12, #PLDOFFS
        pld     [r1, #(PLDOFFS-1)*PLDSIZE]
.Lneon_copy_128_loop_outer:
        pld     [r1, #(PLDOFFS*PLDSIZE)]
        vld1.32 {q0, q1}, [r1]!
        vld1.32 {q2, q3}, [r1]!
        vld1.32 {q8, q9}, [r1]!
        vld1.32 {q10, q11}, [r1]!
        subs    r12, r12, #1
        vst1.32 {q0, q1}, [r0]!
        vst1.32 {q2, q3}, [r0]!
        vst1.32 {q8, q9}, [r0]!
        vst1.32 {q10, q11}, [r0]!
        bne     .Lneon_copy_128_loop_outer
        mov     r12, #PLDOFFS
.Lneon_copy_128_loop_nopld:
        vld1.32 {q0, q1}, [r1]!
        vld1.32 {q2, q3}, [r1]!
        vld1.32 {q8, q9}, [r1]!
        vld1.32 {q10, q11}, [r1]!
        subs    r12, r12, #1
        vst1.32 {q0, q1}, [r0]!
        vst1.32 {q2, q3}, [r0]!
        vst1.32 {q8, q9}, [r0]!
        vst1.32 {q10, q11}, [r0]!
        bne     .Lneon_copy_128_loop_nopld
        ands    r2, r2, #0x7f
        beq     .Lneon_exit
        cmp     r2, #32
        blt     .Lneon_16
        nop
        /* Copy blocks of 32 bytes (word-aligned) at a time */
.Lneon_copy_32_a:
        mov     r12, r2, lsr #5
.Lneon_copy_32_loop_a:
        vld1.32 {q0, q1}, [r1]!
        subs    r12, r12, #1
        vst1.32 {q0, q1}, [r0]!
        bne     .Lneon_copy_32_loop_a
        ands    r2, r2, #0x1f
        beq     .Lneon_exit
.Lneon_16:
        subs    r2, r2, #16
        blt     .Lneon_lt16
        vld1.32 {q8}, [r1]!
        vst1.32 {q8}, [r0]!
        beq     .Lneon_exit
.Lneon_lt16:
        movs    r12, r2, lsl #29
        bcc     .Lneon_skip8
        ldr     r3, [r1], #4
        ldr     r12, [r1], #4
        str     r3, [r0], #4
        str     r12, [r0], #4
.Lneon_skip8:
        bpl     .Lneon_lt4
        ldr     r3, [r1], #4
        str     r3, [r0], #4
.Lneon_lt4:
        movs    r2, r2, lsl #31
        bcc     .Lneon_lt2
        ldrh    r3, [r1], #2
        strh    r3, [r0], #2
.Lneon_lt2:
        bpl     .Lneon_exit
        ldrb    r12, [r1]
        strb    r12, [r0]
.Lneon_exit:
        pop     {r0}
        bx      lr
        .endfunc
        .end
#else   /* !SCORPION_NEON_OPTIMIZATION */
#if defined(CORTEX_CACHE_LINE_32)
        /*
         * This can be enabled by setting flag
         * TARGET_CORTEX_CACHE_LINE_32 in
         * device/<vendor>/<board>/BoardConfig.mk
         */
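        /*
         * For illustration only (assumed BoardConfig.mk syntax, matching the
         * flags documented above):
         *
         *   TARGET_CORTEX_CACHE_LINE_32 := true
         */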
        .text
        .fpu    neon

        .global memcpy
        .type   memcpy, %function
        .align  4

/* a prefetch distance of 4 cache-lines works best experimentally */
#define CACHE_LINE_SIZE     32
memcpy:
        .fnstart
        .save   {r0, lr}
        stmfd   sp!, {r0, lr}

        /* start preloading as early as possible */
        pld     [r1, #(CACHE_LINE_SIZE*0)]
        pld     [r1, #(CACHE_LINE_SIZE*1)]

        /* do we have at least 16-bytes to copy (needed for alignment below) */
        cmp     r2, #16
        blo     5f

        /* align destination to half cache-line for the write-buffer */
        rsb     r3, r0, #0
        ands    r3, r3, #0xF
        beq     0f

        /* copy up to 15-bytes (count in r3) */
        sub     r2, r2, r3
        movs    ip, r3, lsl #31
        ldrmib  lr, [r1], #1
        strmib  lr, [r0], #1
        ldrcsb  ip, [r1], #1
        ldrcsb  lr, [r1], #1
        strcsb  ip, [r0], #1
        strcsb  lr, [r0], #1
        movs    ip, r3, lsl #29
        bge     1f
        // copies 4 bytes, destination 32-bits aligned
        vld4.8  {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8  {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
1:      bcc     2f
        // copies 8 bytes, destination 64-bits aligned
        vld1.8  {d0}, [r1]!
        vst1.8  {d0}, [r0, :64]!
2:

0:      /* preload immediately the next cache line, which we may need */
        pld     [r1, #(CACHE_LINE_SIZE*0)]
        pld     [r1, #(CACHE_LINE_SIZE*1)]

        /* make sure we have at least 128 bytes to copy */
        subs    r2, r2, #128
        blo     2f

        /* preload all the cache lines we need.
         * NOTE: the number of pld below depends on PREFETCH_DISTANCE;
         * ideally we would increase the distance in the main loop to
         * avoid the goofy code below. In practice this doesn't seem to make
         * a big difference.
         */
        pld     [r1, #(CACHE_LINE_SIZE*2)]
        pld     [r1, #(CACHE_LINE_SIZE*3)]
        pld     [r1, #(CACHE_LINE_SIZE*4)]

        .align 3
1:      /* The main loop copies 128 bytes at a time */
        subs    r2, r2, #128
        vld1.8  {d0  - d3},  [r1]!
        vld1.8  {d4  - d7},  [r1]!
        pld     [r1, #(CACHE_LINE_SIZE*1)]
        pld     [r1, #(CACHE_LINE_SIZE*2)]
        vld1.8  {d16 - d19}, [r1]!
        vld1.8  {d20 - d23}, [r1]!
        pld     [r1, #(CACHE_LINE_SIZE*1)]
        pld     [r1, #(CACHE_LINE_SIZE*2)]
        vst1.8  {d0  - d3},  [r0, :128]!
        vst1.8  {d4  - d7},  [r0, :128]!
        vst1.8  {d16 - d19}, [r0, :128]!
        vst1.8  {d20 - d23}, [r0, :128]!
        bhs     1b

2:      /* fix-up the remaining count and make sure we have >= 32 bytes left */
        add     r2, r2, #128
        subs    r2, r2, #32
        blo     4f

3:      /* 32 bytes at a time. These cache lines were already preloaded */
        vld1.8  {d0 - d3}, [r1]!
        subs    r2, r2, #32
        vst1.8  {d0 - d3}, [r0, :128]!
        bhs     3b

4:      /* less than 32 left */
        add     r2, r2, #32
        tst     r2, #0x10
        beq     5f
        // copies 16 bytes, 128-bits aligned
        vld1.8  {d0, d1}, [r1]!
        vst1.8  {d0, d1}, [r0, :128]!

5:      /* copy up to 15-bytes (count in r2) */
        movs    ip, r2, lsl #29
        bcc     1f
        vld1.8  {d0}, [r1]!
        vst1.8  {d0}, [r0]!
1:      bge     2f
        vld4.8  {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8  {d0[0], d1[0], d2[0], d3[0]}, [r0]!
2:      movs    ip, r2, lsl #31
        ldrmib  r3, [r1], #1
        ldrcsb  ip, [r1], #1
        ldrcsb  lr, [r1], #1
        strmib  r3, [r0], #1
        strcsb  ip, [r0], #1
        strcsb  lr, [r0], #1

        ldmfd   sp!, {r0, lr}
        bx      lr
        .fnend
#else   /* !CORTEX_CACHE_LINE_32 */

        .text
        .fpu    neon

#ifdef HAVE_32_BYTE_CACHE_LINE
/* a prefetch distance of 2 cache-lines */
#define CACHE_LINE_SIZE     32
#define PREFETCH_DISTANCE   (CACHE_LINE_SIZE*2)
#else
/* a prefetch distance of 4 cache-lines works best experimentally */
#define CACHE_LINE_SIZE     64
#define PREFETCH_DISTANCE   (CACHE_LINE_SIZE*4)
#endif
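/*
 * Note added for clarity: with the default 64-byte cache line above,
 * PREFETCH_DISTANCE works out to 256 bytes (64*4); with
 * HAVE_32_BYTE_CACHE_LINE it is 64 bytes (32*2).
 */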

ENTRY(memcpy)
        .save   {r0, lr}
        /* start preloading as early as possible */
        pld     [r1, #(CACHE_LINE_SIZE*0)]
        stmfd   sp!, {r0, lr}
        pld     [r1, #(CACHE_LINE_SIZE*1)]

        /* do we have at least 16-bytes to copy (needed for alignment below) */
        cmp     r2, #16
        blo     5f

        /* align destination to cache-line for the write-buffer */
        rsb     r3, r0, #0
        ands    r3, r3, #0xF
        beq     0f

        /* copy up to 15-bytes (count in r3) */
        sub     r2, r2, r3
        movs    ip, r3, lsl #31
        ldrmib  lr, [r1], #1
        strmib  lr, [r0], #1
        ldrcsb  ip, [r1], #1
        ldrcsb  lr, [r1], #1
        strcsb  ip, [r0], #1
        strcsb  lr, [r0], #1
        movs    ip, r3, lsl #29
        bge     1f
        // copies 4 bytes, destination 32-bits aligned
        vld4.8  {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8  {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
1:      bcc     2f
        // copies 8 bytes, destination 64-bits aligned
        vld1.8  {d0}, [r1]!
        vst1.8  {d0}, [r0, :64]!
2:

0:      /* preload immediately the next cache line, which we may need */
        pld     [r1, #(CACHE_LINE_SIZE*0)]
        pld     [r1, #(CACHE_LINE_SIZE*1)]

#ifdef HAVE_32_BYTE_CACHE_LINE
        /* make sure we have at least 32 bytes to copy */
        subs    r2, r2, #32
        blo     4f

        /* preload all the cache lines we need.
         * NOTE: the number of pld below depends on PREFETCH_DISTANCE;
         * ideally we would increase the distance in the main loop to
         * avoid the goofy code below. In practice this doesn't seem to make
         * a big difference.
         */
        pld     [r1, #(PREFETCH_DISTANCE)]

1:      /* The main loop copies 32 bytes at a time */
        vld1.8  {d0 - d3}, [r1]!
        pld     [r1, #(PREFETCH_DISTANCE)]
        subs    r2, r2, #32
        vst1.8  {d0 - d3}, [r0, :128]!
        bhs     1b
#else
        /* make sure we have at least 64 bytes to copy */
        subs    r2, r2, #64
        blo     2f

        /* preload all the cache lines we need.
         * NOTE: the number of pld below depends on PREFETCH_DISTANCE;
         * ideally we would increase the distance in the main loop to
         * avoid the goofy code below. In practice this doesn't seem to make
         * a big difference.
         */
        pld     [r1, #(CACHE_LINE_SIZE*2)]
        pld     [r1, #(CACHE_LINE_SIZE*3)]
        pld     [r1, #(PREFETCH_DISTANCE)]

1:      /* The main loop copies 64 bytes at a time */
        vld1.8  {d0 - d3}, [r1]!
        vld1.8  {d4 - d7}, [r1]!
        pld     [r1, #(PREFETCH_DISTANCE)]
        subs    r2, r2, #64
        vst1.8  {d0 - d3}, [r0, :128]!
        vst1.8  {d4 - d7}, [r0, :128]!
        bhs     1b

2:      /* fix-up the remaining count and make sure we have >= 32 bytes left */
        add     r2, r2, #64
        subs    r2, r2, #32
        blo     4f

3:      /* 32 bytes at a time. These cache lines were already preloaded */
        vld1.8  {d0 - d3}, [r1]!
        subs    r2, r2, #32
        vst1.8  {d0 - d3}, [r0, :128]!
        bhs     3b
#endif
4:      /* less than 32 left */
        add     r2, r2, #32
        tst     r2, #0x10
        beq     5f
        // copies 16 bytes, 128-bits aligned
        vld1.8  {d0, d1}, [r1]!
        vst1.8  {d0, d1}, [r0, :128]!

5:      /* copy up to 15-bytes (count in r2) */
        movs    ip, r2, lsl #29
        bcc     1f
        vld1.8  {d0}, [r1]!
        vst1.8  {d0}, [r0]!
1:      bge     2f
        vld4.8  {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8  {d0[0], d1[0], d2[0], d3[0]}, [r0]!
2:      movs    ip, r2, lsl #31
        ldrmib  r3, [r1], #1
        ldrcsb  ip, [r1], #1
        ldrcsb  lr, [r1], #1
        strmib  r3, [r0], #1
        strcsb  ip, [r0], #1
        strcsb  lr, [r0], #1

        ldmfd   sp!, {r0, lr}
        bx      lr
END(memcpy)

#endif  /* !CORTEX_CACHE_LINE_32 */
#endif  /* SCORPION_NEON_OPTIMIZATION */
#else   /* __ARM_ARCH__ < 7 */

        /*
         * Optimized memcpy() for ARM.
         *
         * note that memcpy() always returns the destination pointer,
         * so we have to preserve R0.
         */

ENTRY(memcpy)
        /* The stack must always be 64-bit aligned to be compliant with the
         * ARM ABI. Since we have to save R0, we might as well save R4,
         * which we can use for better pipelining of the reads below.
         */
        .save   {r0, r4, lr}
        stmfd   sp!, {r0, r4, lr}
        /* Making room for r5-r11 which will be spilled later */
        .pad    #28
        sub     sp, sp, #28

        // preload the destination because we'll align it to a cache line
        // with small writes. Also start the source "pump".
        PLD     (r0, #0)
        PLD     (r1, #0)
        PLD     (r1, #32)

        /* it simplifies things to take care of len<4 early */
        cmp     r2, #4
        blo     copy_last_3_and_return

        /* compute the offset to align the source
         * offset = (4-(src&3))&3 = -src & 3
         */
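        /* worked example (added for clarity): if src & 3 == 3, then
         * offset = (4-3)&3 = 1 and -src & 3 = 1, so one byte is copied
         * before the word-aligned path; if src is already word-aligned the
         * offset is 0 and the small copy below is skipped entirely.
         */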
        rsb     r3, r1, #0
        ands    r3, r3, #3
        beq     src_aligned

        /* align source to 32 bits. We need to insert 2 instructions between
         * a ldr[b|h] and str[b|h] because byte and half-word instructions
         * stall 2 cycles.
         */
        movs    r12, r3, lsl #31
        sub     r2, r2, r3              /* we know that r3 <= r2 because r2 >= 4 */
        ldrmib  r3, [r1], #1
        ldrcsb  r4, [r1], #1
        ldrcsb  r12, [r1], #1
        strmib  r3, [r0], #1
        strcsb  r4, [r0], #1
        strcsb  r12, [r0], #1

src_aligned:

        /* see if src and dst are aligned together (congruent) */
        eor     r12, r0, r1
        tst     r12, #3
        bne     non_congruent

        /* Use post-increment mode for stm to spill r5-r11 to the reserved
         * stack frame. Don't update sp.
         */
        stmea   sp, {r5-r11}

        /* align the destination to a cache-line */
        rsb     r3, r0, #0
        ands    r3, r3, #0x1C
        beq     congruent_aligned32
        cmp     r3, r2
        andhi   r3, r2, #0x1C

        /* conditionally copies 0 to 7 words (length in r3) */
        movs    r12, r3, lsl #28
        ldmcsia r1!, {r4, r5, r6, r7}   /* 16 bytes */
        ldmmiia r1!, {r8, r9}           /*  8 bytes */
        stmcsia r0!, {r4, r5, r6, r7}
        stmmiia r0!, {r8, r9}
        tst     r3, #0x4
        ldrne   r10, [r1], #4           /*  4 bytes */
        strne   r10, [r0], #4
        sub     r2, r2, r3

congruent_aligned32:
        /*
         * here source is aligned to 32 bytes.
         */

cached_aligned32:
        subs    r2, r2, #32
        blo     less_than_32_left

        /*
         * We preload a cache-line up to 64 bytes ahead. On the 926, this will
         * stall only until the requested word is fetched, but the linefill
         * continues in the background.
         * While the linefill is going, we write our previous cache-line
         * into the write-buffer (which should have some free space).
         * When the linefill is done, the writebuffer will
         * start dumping its content into memory.
         *
         * While all this is going on, we then load a full cache line into
         * 8 registers; this cache line should be in the cache by now
         * (or partly in the cache).
         *
         * This code should work well regardless of the source/dest alignment.
         *
         */

        // Align the preload register to a cache-line because the cpu does
        // "critical word first" (the first word requested is loaded first).
        bic     r12, r1, #0x1F
        add     r12, r12, #64

1:      ldmia   r1!, { r4-r11 }
        PLD     (r12, #64)
        subs    r2, r2, #32

        // NOTE: if r12 is more than 64 ahead of r1, the following ldrhi
        // for the ARM9 preload will not be safely guarded by the preceding subs.
        // When it is safely guarded, the only way to get a SIGSEGV here
        // is if the caller overstates the length.
        ldrhi   r3, [r12], #32          /* cheap ARM9 preload */
        stmia   r0!, { r4-r11 }
        bhs     1b

        add     r2, r2, #32

less_than_32_left:
        /*
         * less than 32 bytes left at this point (length in r2)
         */

        /* skip all this if there is nothing to do, which should
         * be a common case (if not executed the code below takes
         * about 16 cycles)
         */
        tst     r2, #0x1F
        beq     1f

        /* conditionally copies 0 to 31 bytes */
        movs    r12, r2, lsl #28
        ldmcsia r1!, {r4, r5, r6, r7}   /* 16 bytes */
        ldmmiia r1!, {r8, r9}           /*  8 bytes */
        stmcsia r0!, {r4, r5, r6, r7}
        stmmiia r0!, {r8, r9}
        movs    r12, r2, lsl #30
        ldrcs   r3, [r1], #4            /*  4 bytes */
        ldrmih  r4, [r1], #2            /*  2 bytes */
        strcs   r3, [r0], #4
        strmih  r4, [r0], #2
        tst     r2, #0x1
        ldrneb  r3, [r1]                /* last byte */
        strneb  r3, [r0]

        /* we're done! restore everything and return */
1:      ldmfd   sp!, {r5-r11}
        ldmfd   sp!, {r0, r4, lr}
        bx      lr

        /********************************************************************/

non_congruent:
        /*
         * here source is aligned to 4 bytes
         * but destination is not.
         *
         * in the code below r2 is the number of bytes read
         * (the number of bytes written is always smaller, because we have
         * partial words in the shift queue)
         */
        cmp     r2, #4
        blo     copy_last_3_and_return

        /* Use post-increment mode for stm to spill r5-r11 to the reserved
         * stack frame. Don't update sp.
         */
        stmea   sp, {r5-r11}

        /* compute shifts needed to align src to dest */
        rsb     r5, r0, #0
        and     r5, r5, #3              /* r5 = # bytes in partial words */
        mov     r12, r5, lsl #3         /* r12 = right */
        rsb     lr, r12, #32            /* lr = left */
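        /* worked example (added for clarity): if the destination needs 2 bytes
         * to reach word alignment, r5 = 2, r12 = 16 and lr = 16, and the
         * dispatch below selects loop16.  Each output word is then assembled
         * as "leftover | (next source word, lsl lr)" while the new leftover
         * becomes "next source word, lsr r12", exactly as in the alignment
         * loop that follows.
         */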

        /* read the first word */
        ldr     r3, [r1], #4
        sub     r2, r2, #4

        /* write a partial word (0 to 3 bytes), such that destination
         * becomes aligned to 32 bits (r5 = number of bytes to copy for alignment)
         */
        movs    r5, r5, lsl #31
        strmib  r3, [r0], #1
        movmi   r3, r3, lsr #8
        strcsb  r3, [r0], #1
        movcs   r3, r3, lsr #8
        strcsb  r3, [r0], #1
        movcs   r3, r3, lsr #8

        cmp     r2, #4
        blo     partial_word_tail

        /* Align destination to 32 bytes (cache line boundary) */
1:      tst     r0, #0x1c
        beq     2f
        ldr     r5, [r1], #4
        sub     r2, r2, #4
        orr     r4, r3, r5, lsl lr
        mov     r3, r5, lsr r12
        str     r4, [r0], #4
        cmp     r2, #4
        bhs     1b
        blo     partial_word_tail

        /* copy 32 bytes at a time */
2:      subs    r2, r2, #32
        blo     less_than_thirtytwo

        /* Use immediate mode for the shifts, because there is an extra cycle
         * for register shifts, which could account for up to 50% of
         * performance hit.
         */

        cmp     r12, #24
        beq     loop24
        cmp     r12, #8
        beq     loop8

loop16:
        ldr     r12, [r1], #4
1:      mov     r4, r12
        ldmia   r1!, { r5, r6, r7, r8, r9, r10, r11 }
        PLD     (r1, #64)
        subs    r2, r2, #32
        ldrhs   r12, [r1], #4
        orr     r3, r3, r4, lsl #16
        mov     r4, r4, lsr #16
        orr     r4, r4, r5, lsl #16
        mov     r5, r5, lsr #16
        orr     r5, r5, r6, lsl #16
        mov     r6, r6, lsr #16
        orr     r6, r6, r7, lsl #16
        mov     r7, r7, lsr #16
        orr     r7, r7, r8, lsl #16
        mov     r8, r8, lsr #16
        orr     r8, r8, r9, lsl #16
        mov     r9, r9, lsr #16
        orr     r9, r9, r10, lsl #16
        mov     r10, r10, lsr #16
        orr     r10, r10, r11, lsl #16
        stmia   r0!, {r3, r4, r5, r6, r7, r8, r9, r10}
        mov     r3, r11, lsr #16
        bhs     1b
        b       less_than_thirtytwo

loop8:
        ldr     r12, [r1], #4
1:      mov     r4, r12
        ldmia   r1!, { r5, r6, r7, r8, r9, r10, r11 }
        PLD     (r1, #64)
        subs    r2, r2, #32
        ldrhs   r12, [r1], #4
        orr     r3, r3, r4, lsl #24
        mov     r4, r4, lsr #8
        orr     r4, r4, r5, lsl #24
        mov     r5, r5, lsr #8
        orr     r5, r5, r6, lsl #24
        mov     r6, r6, lsr #8
        orr     r6, r6, r7, lsl #24
        mov     r7, r7, lsr #8
        orr     r7, r7, r8, lsl #24
        mov     r8, r8, lsr #8
        orr     r8, r8, r9, lsl #24
        mov     r9, r9, lsr #8
        orr     r9, r9, r10, lsl #24
        mov     r10, r10, lsr #8
        orr     r10, r10, r11, lsl #24
        stmia   r0!, {r3, r4, r5, r6, r7, r8, r9, r10}
        mov     r3, r11, lsr #8
        bhs     1b
        b       less_than_thirtytwo

loop24:
        ldr     r12, [r1], #4
1:      mov     r4, r12
        ldmia   r1!, { r5, r6, r7, r8, r9, r10, r11 }
        PLD     (r1, #64)
        subs    r2, r2, #32
        ldrhs   r12, [r1], #4
        orr     r3, r3, r4, lsl #8
        mov     r4, r4, lsr #24
        orr     r4, r4, r5, lsl #8
        mov     r5, r5, lsr #24
        orr     r5, r5, r6, lsl #8
        mov     r6, r6, lsr #24
        orr     r6, r6, r7, lsl #8
        mov     r7, r7, lsr #24
        orr     r7, r7, r8, lsl #8
        mov     r8, r8, lsr #24
        orr     r8, r8, r9, lsl #8
        mov     r9, r9, lsr #24
        orr     r9, r9, r10, lsl #8
        mov     r10, r10, lsr #24
        orr     r10, r10, r11, lsl #8
        stmia   r0!, {r3, r4, r5, r6, r7, r8, r9, r10}
        mov     r3, r11, lsr #24
        bhs     1b


less_than_thirtytwo:
        /* copy the last 0 to 31 bytes of the source */
        rsb     r12, lr, #32            /* we corrupted r12, recompute it */
        add     r2, r2, #32
        cmp     r2, #4
        blo     partial_word_tail

1:      ldr     r5, [r1], #4
        sub     r2, r2, #4
        orr     r4, r3, r5, lsl lr
        mov     r3, r5, lsr r12
        str     r4, [r0], #4
        cmp     r2, #4
        bhs     1b

partial_word_tail:
        /* we have a partial word in the input buffer */
        movs    r5, lr, lsl #(31-3)
        strmib  r3, [r0], #1
        movmi   r3, r3, lsr #8
        strcsb  r3, [r0], #1
        movcs   r3, r3, lsr #8
        strcsb  r3, [r0], #1

        /* Refill spilled registers from the stack. Don't update sp. */
        ldmfd   sp, {r5-r11}

copy_last_3_and_return:
        movs    r2, r2, lsl #31         /* copy remaining 0, 1, 2 or 3 bytes */
        ldrmib  r2, [r1], #1
        ldrcsb  r3, [r1], #1
        ldrcsb  r12, [r1]
        strmib  r2, [r0], #1
        strcsb  r3, [r0], #1
        strcsb  r12, [r0]

        /* we're done! restore sp and spilled registers and return */
        add     sp, sp, #28
        ldmfd   sp!, {r0, r4, lr}
        bx      lr
END(memcpy)


#endif  /* __ARM_ARCH__ < 7 */