/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Copyright (c) 2009-2011, Code Aurora Forum. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/cpu-features.h>
#include <machine/asm.h>

#if defined(__ARM_NEON__)
#if defined(SCORPION_NEON_OPTIMIZATION)
        /*
         * These can be overridden in:
         *   device/<vendor>/<board>/BoardConfig.mk
         * by setting the following:
         *   TARGET_USE_SCORPION_BIONIC_OPTIMIZATION := true
         *   TARGET_USE_SCORPION_PLD_SET := true
         *   TARGET_SCORPION_BIONIC_PLDOFFS := <pldoffset>
         *   TARGET_SCORPION_BIONIC_PLDSIZE := <pldsize>
         */
#ifndef PLDOFFS
#define PLDOFFS (6)
#endif
#ifndef PLDSIZE
#define PLDSIZE (128)   /* L2 cache line size */
#endif
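        /* With the defaults above, the main copy loop below issues its PLD
         * PLDOFFS*PLDSIZE = 6*128 = 768 bytes ahead of the current source
         * pointer, and the final PLDOFFS 128-byte blocks are copied without
         * prefetching. */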
        .code 32
        .align 5
        .globl memcpy
        .func
memcpy:
        push    {r0}
        cmp     r2, #4
        blt     .Lneon_lt4
        cmp     r2, #16
        blt     .Lneon_lt16
        cmp     r2, #32
        blt     .Lneon_16
        cmp     r2, #128
        blt     .Lneon_copy_32_a
        /* Copy blocks of 128 bytes (word-aligned) at a time */
        /* Code below is optimized for PLDSIZE=128 only */
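        /*
         * Rough C sketch of the 128-byte copy below (illustration only;
         * prefetch() and copy128() are placeholders, not helpers in this
         * file). The last PLDOFFS blocks run without PLD so that we never
         * prefetch past the end of the source:
         *
         *     size_t blocks = len >> 7;                   // 128-byte blocks
         *     size_t pld_blocks = blocks > PLDOFFS ? blocks - PLDOFFS : 0;
         *     for (size_t i = 0; i < pld_blocks; i++) {   // prefetching loop
         *         prefetch(src + PLDOFFS * PLDSIZE);
         *         copy128(&dst, &src);
         *     }
         *     for (size_t i = 0; i < blocks - pld_blocks; i++)  // no-PLD tail
         *         copy128(&dst, &src);
         */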
        mov     r12, r2, lsr #7
        cmp     r12, #PLDOFFS
        ble     .Lneon_copy_128_loop_nopld
        sub     r12, #PLDOFFS
        pld     [r1, #(PLDOFFS-1)*PLDSIZE]
.Lneon_copy_128_loop_outer:
        pld     [r1, #(PLDOFFS*PLDSIZE)]
        vld1.32 {q0, q1}, [r1]!
        vld1.32 {q2, q3}, [r1]!
        vld1.32 {q8, q9}, [r1]!
        vld1.32 {q10, q11}, [r1]!
        subs    r12, r12, #1
        vst1.32 {q0, q1}, [r0]!
        vst1.32 {q2, q3}, [r0]!
        vst1.32 {q8, q9}, [r0]!
        vst1.32 {q10, q11}, [r0]!
        bne     .Lneon_copy_128_loop_outer
        mov     r12, #PLDOFFS
.Lneon_copy_128_loop_nopld:
        vld1.32 {q0, q1}, [r1]!
        vld1.32 {q2, q3}, [r1]!
        vld1.32 {q8, q9}, [r1]!
        vld1.32 {q10, q11}, [r1]!
        subs    r12, r12, #1
        vst1.32 {q0, q1}, [r0]!
        vst1.32 {q2, q3}, [r0]!
        vst1.32 {q8, q9}, [r0]!
        vst1.32 {q10, q11}, [r0]!
        bne     .Lneon_copy_128_loop_nopld
        ands    r2, r2, #0x7f
        beq     .Lneon_exit
        cmp     r2, #32
        blt     .Lneon_16
        nop
        /* Copy blocks of 32 bytes (word-aligned) at a time */
.Lneon_copy_32_a:
        mov     r12, r2, lsr #5
.Lneon_copy_32_loop_a:
        vld1.32 {q0, q1}, [r1]!
        subs    r12, r12, #1
        vst1.32 {q0, q1}, [r0]!
        bne     .Lneon_copy_32_loop_a
        ands    r2, r2, #0x1f
        beq     .Lneon_exit
.Lneon_16:
        subs    r2, r2, #16
        blt     .Lneon_lt16
        vld1.32 {q8}, [r1]!
        vst1.32 {q8}, [r0]!
        beq     .Lneon_exit
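        /* The tail below tests the shifted length to set the flags:
         * "movs r12, r2, lsl #29" puts bit 3 of the remaining count into C
         * and bit 2 into N, and "movs r2, r2, lsl #31" does the same for
         * bits 1 and 0, so the conditional copies handle 8, 4, 2 and then
         * 1 byte. */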
.Lneon_lt16:
        movs    r12, r2, lsl #29
        bcc     .Lneon_skip8
        ldr     r3, [r1], #4
        ldr     r12, [r1], #4
        str     r3, [r0], #4
        str     r12, [r0], #4
.Lneon_skip8:
        bpl     .Lneon_lt4
        ldr     r3, [r1], #4
        str     r3, [r0], #4
.Lneon_lt4:
        movs    r2, r2, lsl #31
        bcc     .Lneon_lt2
        ldrh    r3, [r1], #2
        strh    r3, [r0], #2
.Lneon_lt2:
        bpl     .Lneon_exit
        ldrb    r12, [r1]
        strb    r12, [r0]
.Lneon_exit:
        pop     {r0}
        bx      lr
        .endfunc
        .end
#else /* !SCORPION_NEON_OPTIMIZATION */
#if defined(CORTEX_CACHE_LINE_32)
        /*
         * This path can be enabled by setting the flag
         * TARGET_CORTEX_CACHE_LINE_32 in
         * device/<vendor>/<board>/BoardConfig.mk
         */
        .text
        .fpu    neon

        .global memcpy
        .type memcpy, %function
        .align 4

/* a prefetch distance of 4 cache-lines works best experimentally */
#define CACHE_LINE_SIZE     32
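/* with 32-byte lines, a prefetch distance of 4 cache-lines is
 * 4*32 = 128 bytes, i.e. one iteration of the main copy loop below */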
memcpy:
        .fnstart
        .save   {r0, lr}
        stmfd   sp!, {r0, lr}

        /* start preloading as early as possible */
        pld     [r1, #(CACHE_LINE_SIZE*0)]
        pld     [r1, #(CACHE_LINE_SIZE*1)]

        /* do we have at least 16 bytes to copy (needed for alignment below) */
        cmp     r2, #16
        blo     5f

        /* align destination to half cache-line for the write-buffer */
        rsb     r3, r0, #0
        ands    r3, r3, #0xF
        beq     0f

        /* copy up to 15 bytes (count in r3) */
        sub     r2, r2, r3
        movs    ip, r3, lsl #31
        ldrmib  lr, [r1], #1
        strmib  lr, [r0], #1
        ldrcsb  ip, [r1], #1
        ldrcsb  lr, [r1], #1
        strcsb  ip, [r0], #1
        strcsb  lr, [r0], #1
        movs    ip, r3, lsl #29
        bge     1f
        // copies 4 bytes, destination 32-bits aligned
        vld4.8  {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8  {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
1:      bcc     2f
        // copies 8 bytes, destination 64-bits aligned
        vld1.8  {d0}, [r1]!
        vst1.8  {d0}, [r0, :64]!
2:

0:      /* preload immediately the next cache line, which we may need */
        pld     [r1, #(CACHE_LINE_SIZE*0)]
        pld     [r1, #(CACHE_LINE_SIZE*1)]

        /* make sure we have at least 128 bytes to copy */
        subs    r2, r2, #128
        blo     2f

        /* preload all the cache lines we need.
         * NOTE: the number of pld below depends on the prefetch distance;
         * ideally we would increase the distance in the main loop to
         * avoid the goofy code below. In practice this doesn't seem to make
         * a big difference.
         */
        pld     [r1, #(CACHE_LINE_SIZE*2)]
        pld     [r1, #(CACHE_LINE_SIZE*3)]
        pld     [r1, #(CACHE_LINE_SIZE*4)]

        .align 3
1:      /* The main loop copies 128 bytes at a time */
        subs    r2, r2, #128
        vld1.8  {d0 - d3}, [r1]!
        vld1.8  {d4 - d7}, [r1]!
        pld     [r1, #(CACHE_LINE_SIZE*1)]
        pld     [r1, #(CACHE_LINE_SIZE*2)]
        vld1.8  {d16 - d19}, [r1]!
        vld1.8  {d20 - d23}, [r1]!
        pld     [r1, #(CACHE_LINE_SIZE*1)]
        pld     [r1, #(CACHE_LINE_SIZE*2)]
        vst1.8  {d0 - d3}, [r0, :128]!
        vst1.8  {d4 - d7}, [r0, :128]!
        vst1.8  {d16 - d19}, [r0, :128]!
        vst1.8  {d20 - d23}, [r0, :128]!
        bhs     1b

2:      /* fix-up the remaining count and make sure we have >= 32 bytes left */
        add     r2, r2, #128
        subs    r2, r2, #32
        blo     4f

3:      /* 32 bytes at a time. These cache lines were already preloaded */
        vld1.8  {d0 - d3}, [r1]!
        subs    r2, r2, #32
        vst1.8  {d0 - d3}, [r0, :128]!
        bhs     3b

4:      /* less than 32 left */
        add     r2, r2, #32
        tst     r2, #0x10
        beq     5f
        // copies 16 bytes, 128-bits aligned
        vld1.8  {d0, d1}, [r1]!
        vst1.8  {d0, d1}, [r0, :128]!

5:      /* copy up to 15 bytes (count in r2) */
        movs    ip, r2, lsl #29
        bcc     1f
        vld1.8  {d0}, [r1]!
        vst1.8  {d0}, [r0]!
1:      bge     2f
        vld4.8  {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8  {d0[0], d1[0], d2[0], d3[0]}, [r0]!
2:      movs    ip, r2, lsl #31
        ldrmib  r3, [r1], #1
        ldrcsb  ip, [r1], #1
        ldrcsb  lr, [r1], #1
        strmib  r3, [r0], #1
        strcsb  ip, [r0], #1
        strcsb  lr, [r0], #1

        ldmfd   sp!, {r0, lr}
        bx      lr
        .fnend
#else /* !CORTEX_CACHE_LINE_32 */

        .text
        .fpu    neon

/* a prefetch distance of 4 cache-lines works best experimentally */
#define CACHE_LINE_SIZE     64
#define PREFETCH_DISTANCE   (CACHE_LINE_SIZE*4)
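/* i.e. prefetch 4*64 = 256 bytes, four iterations of the 64-byte main
 * loop, ahead of the current read pointer */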

ENTRY(memcpy)
        .save   {r0, lr}
        stmfd   sp!, {r0, lr}

        /* start preloading as early as possible */
        pld     [r1, #(CACHE_LINE_SIZE*0)]
        pld     [r1, #(CACHE_LINE_SIZE*1)]

        /* do we have at least 16 bytes to copy (needed for alignment below) */
        cmp     r2, #16
        blo     5f

        /* align destination to half cache-line for the write-buffer */
        rsb     r3, r0, #0
        ands    r3, r3, #0xF
        beq     0f

        /* copy up to 15 bytes (count in r3) */
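        /*
         * Rough C sketch of this head-alignment step (illustration only;
         * copy2/copy4/copy8 are placeholders, not helpers in this file):
         *
         *     size_t head = -(uintptr_t)dst & 0xF;  // bytes to a 16-byte boundary
         *     len -= head;
         *     if (head & 1) *dst++ = *src++;        // lsl #31: N = bit 0
         *     if (head & 2) copy2(&dst, &src);      // lsl #31: C = bit 1
         *     if (head & 4) copy4(&dst, &src);      // lsl #29: N = bit 2
         *     if (head & 8) copy8(&dst, &src);      // lsl #29: C = bit 3
         */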
        sub     r2, r2, r3
        movs    ip, r3, lsl #31
        ldrmib  lr, [r1], #1
        strmib  lr, [r0], #1
        ldrcsb  ip, [r1], #1
        ldrcsb  lr, [r1], #1
        strcsb  ip, [r0], #1
        strcsb  lr, [r0], #1
        movs    ip, r3, lsl #29
        bge     1f
        // copies 4 bytes, destination 32-bits aligned
        vld4.8  {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8  {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
1:      bcc     2f
        // copies 8 bytes, destination 64-bits aligned
        vld1.8  {d0}, [r1]!
        vst1.8  {d0}, [r0, :64]!
2:

0:      /* preload immediately the next cache line, which we may need */
        pld     [r1, #(CACHE_LINE_SIZE*0)]
        pld     [r1, #(CACHE_LINE_SIZE*1)]

        /* make sure we have at least 64 bytes to copy */
        subs    r2, r2, #64
        blo     2f

        /* preload all the cache lines we need.
         * NOTE: the number of pld below depends on PREFETCH_DISTANCE;
         * ideally we would increase the distance in the main loop to
         * avoid the goofy code below. In practice this doesn't seem to make
         * a big difference.
         */
        pld     [r1, #(CACHE_LINE_SIZE*2)]
        pld     [r1, #(CACHE_LINE_SIZE*3)]
        pld     [r1, #(PREFETCH_DISTANCE)]

1:      /* The main loop copies 64 bytes at a time */
        vld1.8  {d0 - d3}, [r1]!
        vld1.8  {d4 - d7}, [r1]!
        pld     [r1, #(PREFETCH_DISTANCE)]
        subs    r2, r2, #64
        vst1.8  {d0 - d3}, [r0, :128]!
        vst1.8  {d4 - d7}, [r0, :128]!
        bhs     1b

2:      /* fix-up the remaining count and make sure we have >= 32 bytes left */
        add     r2, r2, #64
        subs    r2, r2, #32
        blo     4f

3:      /* 32 bytes at a time. These cache lines were already preloaded */
        vld1.8  {d0 - d3}, [r1]!
        subs    r2, r2, #32
        vst1.8  {d0 - d3}, [r0, :128]!
        bhs     3b

4:      /* less than 32 left */
        add     r2, r2, #32
        tst     r2, #0x10
        beq     5f
        // copies 16 bytes, 128-bits aligned
        vld1.8  {d0, d1}, [r1]!
        vst1.8  {d0, d1}, [r0, :128]!

5:      /* copy up to 15 bytes (count in r2) */
        movs    ip, r2, lsl #29
        bcc     1f
        vld1.8  {d0}, [r1]!
        vst1.8  {d0}, [r0]!
1:      bge     2f
        vld4.8  {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8  {d0[0], d1[0], d2[0], d3[0]}, [r0]!
2:      movs    ip, r2, lsl #31
        ldrmib  r3, [r1], #1
        ldrcsb  ip, [r1], #1
        ldrcsb  lr, [r1], #1
        strmib  r3, [r0], #1
        strcsb  ip, [r0], #1
        strcsb  lr, [r0], #1

        ldmfd   sp!, {r0, lr}
        bx      lr
END(memcpy)
#endif /* CORTEX_CACHE_LINE_32 */
#endif /* !SCORPION_NEON_OPTIMIZATION */
#else /* __ARM_ARCH__ < 7 */

        /*
         * Optimized memcpy() for ARM.
         *
         * note that memcpy() always returns the destination pointer,
         * so we have to preserve R0.
         */

ENTRY(memcpy)
        /* The stack must always be 64-bits aligned to be compliant with the
         * ARM ABI. Since we have to save R0, we might as well save R4
         * which we can use for better pipelining of the reads below
         */
        .save   {r0, r4, lr}
        stmfd   sp!, {r0, r4, lr}
        /* Making room for r5-r11 which will be spilled later */
        .pad    #28
        sub     sp, sp, #28

        // preload the destination because we'll align it to a cache line
        // with small writes. Also start the source "pump".
        PLD     (r0, #0)
        PLD     (r1, #0)
        PLD     (r1, #32)

        /* it simplifies things to take care of len<4 early */
        cmp     r2, #4
        blo     copy_last_3_and_return

        /* compute the offset to align the source
         * offset = (4-(src&3))&3 = -src & 3
         */
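        /* e.g. src&3 == 1 -> offset 3, src&3 == 2 -> offset 2,
         * src&3 == 3 -> offset 1, src&3 == 0 -> offset 0 (already aligned)
         */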
        rsb     r3, r1, #0
        ands    r3, r3, #3
        beq     src_aligned

        /* align source to 32 bits. We need to insert 2 instructions between
         * a ldr[b|h] and str[b|h] because byte and half-word instructions
         * stall 2 cycles.
         */
        movs    r12, r3, lsl #31
        sub     r2, r2, r3              /* we know that r3 <= r2 because r2 >= 4 */
        ldrmib  r3, [r1], #1
        ldrcsb  r4, [r1], #1
        ldrcsb  r12,[r1], #1
        strmib  r3, [r0], #1
        strcsb  r4, [r0], #1
        strcsb  r12,[r0], #1

src_aligned:

        /* see if src and dst are aligned together (congruent) */
        eor     r12, r0, r1
        tst     r12, #3
        bne     non_congruent

        /* Use post-increment mode for stm to spill r5-r11 to the reserved stack
         * frame. Don't update sp.
         */
        stmea   sp, {r5-r11}

        /* align the destination to a cache-line */
        rsb     r3, r0, #0
        ands    r3, r3, #0x1C
        beq     congruent_aligned32
        cmp     r3, r2
        andhi   r3, r2, #0x1C

        /* conditionally copies 0 to 7 words (length in r3) */
Mathias Agopianee223d02009-09-27 17:46:43 -0700453 movs r12, r3, lsl #28
The Android Open Source Project1dc9e472009-03-03 19:28:35 -0800454 ldmcsia r1!, {r4, r5, r6, r7} /* 16 bytes */
455 ldmmiia r1!, {r8, r9} /* 8 bytes */
456 stmcsia r0!, {r4, r5, r6, r7}
457 stmmiia r0!, {r8, r9}
458 tst r3, #0x4
459 ldrne r10,[r1], #4 /* 4 bytes */
460 strne r10,[r0], #4
461 sub r2, r2, r3
462
463congruent_aligned32:
464 /*
465 * here source is aligned to 32 bytes.
466 */
467
468cached_aligned32:
469 subs r2, r2, #32
470 blo less_than_32_left
471
472 /*
473 * We preload a cache-line up to 64 bytes ahead. On the 926, this will
Mathias Agopianee223d02009-09-27 17:46:43 -0700474 * stall only until the requested world is fetched, but the linefill
The Android Open Source Project1dc9e472009-03-03 19:28:35 -0800475 * continues in the the background.
476 * While the linefill is going, we write our previous cache-line
477 * into the write-buffer (which should have some free space).
478 * When the linefill is done, the writebuffer will
479 * start dumping its content into memory
480 *
481 * While all this is going, we then load a full cache line into
482 * 8 registers, this cache line should be in the cache by now
483 * (or partly in the cache).
484 *
485 * This code should work well regardless of the source/dest alignment.
486 *
487 */
488
489 // Align the preload register to a cache-line because the cpu does
490 // "critical word first" (the first word requested is loaded first).
491 bic r12, r1, #0x1F
492 add r12, r12, #64
493
4941: ldmia r1!, { r4-r11 }
495 PLD (r12, #64)
496 subs r2, r2, #32
497
498 // NOTE: if r12 is more than 64 ahead of r1, the following ldrhi
499 // for ARM9 preload will not be safely guarded by the preceding subs.
Mathias Agopianee223d02009-09-27 17:46:43 -0700500 // When it is safely guarded the only possibility to have SIGSEGV here
The Android Open Source Project1dc9e472009-03-03 19:28:35 -0800501 // is because the caller overstates the length.
502 ldrhi r3, [r12], #32 /* cheap ARM9 preload */
503 stmia r0!, { r4-r11 }
504 bhs 1b
Mathias Agopianee223d02009-09-27 17:46:43 -0700505
The Android Open Source Project1dc9e472009-03-03 19:28:35 -0800506 add r2, r2, #32
507
508
509
510
511less_than_32_left:
Mathias Agopianee223d02009-09-27 17:46:43 -0700512 /*
The Android Open Source Project1dc9e472009-03-03 19:28:35 -0800513 * less than 32 bytes left at this point (length in r2)
514 */
515
516 /* skip all this if there is nothing to do, which should
517 * be a common case (if not executed the code below takes
518 * about 16 cycles)
519 */
520 tst r2, #0x1F
521 beq 1f
522
        /* conditionally copies 0 to 31 bytes */
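        /* "movs r12, r2, lsl #28" moves bit 4 of the count into C and bit 3
         * into N (16- and 8-byte chunks); "movs r12, r2, lsl #30" does the
         * same for bits 2 and 1, and "tst r2, #0x1" covers the last byte.
         */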
        movs    r12, r2, lsl #28
        ldmcsia r1!, {r4, r5, r6, r7}   /* 16 bytes */
        ldmmiia r1!, {r8, r9}           /*  8 bytes */
        stmcsia r0!, {r4, r5, r6, r7}
        stmmiia r0!, {r8, r9}
        movs    r12, r2, lsl #30
        ldrcs   r3, [r1], #4            /*  4 bytes */
        ldrmih  r4, [r1], #2            /*  2 bytes */
        strcs   r3, [r0], #4
        strmih  r4, [r0], #2
        tst     r2, #0x1
        ldrneb  r3, [r1]                /* last byte  */
        strneb  r3, [r0]

        /* we're done! restore everything and return */
1:      ldmfd   sp!, {r5-r11}
        ldmfd   sp!, {r0, r4, lr}
        bx      lr

        /********************************************************************/

non_congruent:
        /*
         * here source is aligned to 4 bytes
         * but destination is not.
         *
         * in the code below r2 is the number of bytes read
         * (the number of bytes written is always smaller, because we have
         * partial words in the shift queue)
         */
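        /*
         * Rough C sketch of the merge ("shift queue") used below; this is an
         * illustration only (little-endian), with `right` = r12, `left` = lr
         * (= 32 - right, and right is 8, 16 or 24 here) and `carry` = r3:
         *
         *     uint32_t carry = first_word >> right;  // bytes left over after
         *                                            // the dest-aligning stores
         *     while (len >= 4) {
         *         uint32_t w = *src++;               // aligned 32-bit load
         *         *dst++ = carry | (w << left);      // emit one full word
         *         carry  = w >> right;               // keep the leftover bytes
         *         len   -= 4;
         *     }
         */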
        cmp     r2, #4
        blo     copy_last_3_and_return

        /* Use post-increment mode for stm to spill r5-r11 to the reserved stack
         * frame. Don't update sp.
         */
        stmea   sp, {r5-r11}

        /* compute shifts needed to align src to dest */
        rsb     r5, r0, #0
        and     r5, r5, #3              /* r5 = # bytes in partial words */
        mov     r12, r5, lsl #3         /* r12 = right */
        rsb     lr, r12, #32            /* lr = left  */

        /* read the first word */
        ldr     r3, [r1], #4
        sub     r2, r2, #4

        /* write a partial word (0 to 3 bytes), such that destination
         * becomes aligned to 32 bits (r5 = number of bytes to copy for alignment)
         */
        movs    r5, r5, lsl #31
        strmib  r3, [r0], #1
        movmi   r3, r3, lsr #8
        strcsb  r3, [r0], #1
        movcs   r3, r3, lsr #8
        strcsb  r3, [r0], #1
        movcs   r3, r3, lsr #8

        cmp     r2, #4
        blo     partial_word_tail
Mathias Agopianee223d02009-09-27 17:46:43 -0700585
The Android Open Source Project1dc9e472009-03-03 19:28:35 -0800586 /* Align destination to 32 bytes (cache line boundary) */
5871: tst r0, #0x1c
588 beq 2f
589 ldr r5, [r1], #4
590 sub r2, r2, #4
591 orr r4, r3, r5, lsl lr
592 mov r3, r5, lsr r12
593 str r4, [r0], #4
594 cmp r2, #4
595 bhs 1b
596 blo partial_word_tail
597
598 /* copy 32 bytes at a time */
5992: subs r2, r2, #32
600 blo less_than_thirtytwo
601
602 /* Use immediate mode for the shifts, because there is an extra cycle
603 * for register shifts, which could account for up to 50% of
604 * performance hit.
605 */
606
607 cmp r12, #24
608 beq loop24
609 cmp r12, #8
610 beq loop8
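        /* loop16, loop8 and loop24 are the same merge loop specialized for
         * right-shift amounts of 16, 8 and 24 bits respectively, so the
         * shifts can use immediates.
         */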

loop16:
        ldr     r12, [r1], #4
1:      mov     r4, r12
        ldmia   r1!, { r5,r6,r7, r8,r9,r10,r11}
        PLD     (r1, #64)
        subs    r2, r2, #32
        ldrhs   r12, [r1], #4
        orr     r3, r3, r4, lsl #16
        mov     r4, r4, lsr #16
        orr     r4, r4, r5, lsl #16
        mov     r5, r5, lsr #16
        orr     r5, r5, r6, lsl #16
        mov     r6, r6, lsr #16
        orr     r6, r6, r7, lsl #16
        mov     r7, r7, lsr #16
        orr     r7, r7, r8, lsl #16
        mov     r8, r8, lsr #16
        orr     r8, r8, r9, lsl #16
        mov     r9, r9, lsr #16
        orr     r9, r9, r10, lsl #16
        mov     r10, r10, lsr #16
        orr     r10, r10, r11, lsl #16
        stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
        mov     r3, r11, lsr #16
        bhs     1b
        b       less_than_thirtytwo

loop8:
        ldr     r12, [r1], #4
1:      mov     r4, r12
        ldmia   r1!, { r5,r6,r7, r8,r9,r10,r11}
        PLD     (r1, #64)
        subs    r2, r2, #32
        ldrhs   r12, [r1], #4
        orr     r3, r3, r4, lsl #24
        mov     r4, r4, lsr #8
        orr     r4, r4, r5, lsl #24
        mov     r5, r5, lsr #8
        orr     r5, r5, r6, lsl #24
        mov     r6, r6, lsr #8
        orr     r6, r6, r7, lsl #24
        mov     r7, r7, lsr #8
        orr     r7, r7, r8, lsl #24
        mov     r8, r8, lsr #8
        orr     r8, r8, r9, lsl #24
        mov     r9, r9, lsr #8
        orr     r9, r9, r10, lsl #24
        mov     r10, r10, lsr #8
        orr     r10, r10, r11, lsl #24
        stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
        mov     r3, r11, lsr #8
        bhs     1b
        b       less_than_thirtytwo

loop24:
        ldr     r12, [r1], #4
1:      mov     r4, r12
        ldmia   r1!, { r5,r6,r7, r8,r9,r10,r11}
        PLD     (r1, #64)
        subs    r2, r2, #32
        ldrhs   r12, [r1], #4
        orr     r3, r3, r4, lsl #8
        mov     r4, r4, lsr #24
        orr     r4, r4, r5, lsl #8
        mov     r5, r5, lsr #24
        orr     r5, r5, r6, lsl #8
        mov     r6, r6, lsr #24
        orr     r6, r6, r7, lsl #8
        mov     r7, r7, lsr #24
        orr     r7, r7, r8, lsl #8
        mov     r8, r8, lsr #24
        orr     r8, r8, r9, lsl #8
        mov     r9, r9, lsr #24
        orr     r9, r9, r10, lsl #8
        mov     r10, r10, lsr #24
        orr     r10, r10, r11, lsl #8
        stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
        mov     r3, r11, lsr #24
        bhs     1b

less_than_thirtytwo:
        /* copy the last 0 to 31 bytes of the source */
        rsb     r12, lr, #32            /* we corrupted r12, recompute it  */
        add     r2, r2, #32
        cmp     r2, #4
        blo     partial_word_tail

1:      ldr     r5, [r1], #4
        sub     r2, r2, #4
        orr     r4, r3, r5, lsl lr
        mov     r3, r5, lsr r12
        str     r4, [r0], #4
        cmp     r2, #4
        bhs     1b

partial_word_tail:
        /* we have a partial word in the input buffer */
        movs    r5, lr, lsl #(31-3)
        strmib  r3, [r0], #1
        movmi   r3, r3, lsr #8
        strcsb  r3, [r0], #1
        movcs   r3, r3, lsr #8
        strcsb  r3, [r0], #1

        /* Refill spilled registers from the stack. Don't update sp. */
        ldmfd   sp, {r5-r11}

copy_last_3_and_return:
        movs    r2, r2, lsl #31         /* copy remaining 0, 1, 2 or 3 bytes */
        ldrmib  r2, [r1], #1
        ldrcsb  r3, [r1], #1
        ldrcsb  r12,[r1]
        strmib  r2, [r0], #1
        strcsb  r3, [r0], #1
        strcsb  r12,[r0]

        /* we're done! restore sp and spilled registers and return */
        add     sp, sp, #28
        ldmfd   sp!, {r0, r4, lr}
        bx      lr
END(memcpy)


#endif /* __ARM_ARCH__ < 7 */