/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Copyright (c) 2009-2011, Code Aurora Forum. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/cpu-features.h>
#include <machine/asm.h>

#if defined(__ARM_NEON__)
#if defined(SCORPION_NEON_OPTIMIZATION)
        /*
         * Scorpion-tuned NEON memcpy.
         *
         * These can be overridden in:
         *   device/<vendor>/<board>/BoardConfig.mk
         * by setting the following:
         *   TARGET_USE_SCORPION_BIONIC_OPTIMIZATION := true
         *   TARGET_USE_SCORPION_PLD_SET := true
         *   TARGET_SCORPION_BIONIC_PLDOFFS := <pldoffset>
         *   TARGET_SCORPION_BIONIC_PLDSIZE := <pldsize>
         */
#ifndef PLDOFFS
#define PLDOFFS (6)
#endif
#ifndef PLDSIZE
#define PLDSIZE (128)   /* L2 cache line size */
#endif
        .code 32
        .align 5
        .globl memcpy
        .func
        /*
         * memcpy: r0 = dst, r1 = src, r2 = byte count.
         * memcpy() must return the destination pointer, so r0 is pushed
         * on entry and popped back into r0 before returning.
         */
memcpy:
        push        {r0}
        cmp         r2, #4
        blt         .Lneon_lt4
        cmp         r2, #16
        blt         .Lneon_lt16
        cmp         r2, #32
        blt         .Lneon_16
        cmp         r2, #128
        blt         .Lneon_copy_32_a
        /* Copy blocks of 128-bytes (word-aligned) at a time */
        /* Code below is optimized for PLDSIZE=128 only */
        mov         r12, r2, lsr #7     /* r12 = number of 128-byte blocks */
        cmp         r12, #PLDOFFS
        ble         .Lneon_copy_128_loop_nopld
        sub         r12, #PLDOFFS       /* run the preloading loop until only
                                         * PLDOFFS blocks (already preloaded)
                                         * remain */
        pld         [r1, #(PLDOFFS-1)*PLDSIZE]
.Lneon_copy_128_loop_outer:
        pld         [r1, #(PLDOFFS*PLDSIZE)]
        vld1.32     {q0, q1}, [r1]!
        vld1.32     {q2, q3}, [r1]!
        vld1.32     {q8, q9}, [r1]!
        vld1.32     {q10, q11}, [r1]!
        subs        r12, r12, #1
        vst1.32     {q0, q1}, [r0]!
        vst1.32     {q2, q3}, [r0]!
        vst1.32     {q8, q9}, [r0]!
        vst1.32     {q10, q11}, [r0]!
        bne         .Lneon_copy_128_loop_outer
        mov         r12, #PLDOFFS
.Lneon_copy_128_loop_nopld:
        /* remaining 128-byte blocks, no more preloads issued */
        vld1.32     {q0, q1}, [r1]!
        vld1.32     {q2, q3}, [r1]!
        vld1.32     {q8, q9}, [r1]!
        vld1.32     {q10, q11}, [r1]!
        subs        r12, r12, #1
        vst1.32     {q0, q1}, [r0]!
        vst1.32     {q2, q3}, [r0]!
        vst1.32     {q8, q9}, [r0]!
        vst1.32     {q10, q11}, [r0]!
        bne         .Lneon_copy_128_loop_nopld
        ands        r2, r2, #0x7f       /* leftover bytes after 128-byte blocks */
        beq         .Lneon_exit
        cmp         r2, #32
        blt         .Lneon_16
        nop
        /* Copy blocks of 32-bytes (word aligned) at a time */
.Lneon_copy_32_a:
        mov         r12, r2, lsr #5
.Lneon_copy_32_loop_a:
        vld1.32     {q0,q1}, [r1]!
        subs        r12, r12, #1
        vst1.32     {q0,q1}, [r0]!
        bne         .Lneon_copy_32_loop_a
        ands        r2, r2, #0x1f       /* leftover bytes after 32-byte blocks */
        beq         .Lneon_exit
.Lneon_16:
        subs        r2, r2, #16
        blt         .Lneon_lt16
        vld1.32     {q8}, [r1]!
        vst1.32     {q8}, [r0]!
        beq         .Lneon_exit
.Lneon_lt16:
        /* 0..15 bytes left; the low bits of r2 select 8/4/2/1-byte tails.
         * lsl #29 puts bit 3 in C (copy 8) and bit 2 in N (copy 4). */
        movs        r12, r2, lsl #29
        bcc         .Lneon_skip8
        ldr         r3, [r1], #4
        ldr         r12, [r1], #4
        str         r3, [r0], #4
        str         r12, [r0], #4
.Lneon_skip8:
        bpl         .Lneon_lt4
        ldr         r3, [r1], #4
        str         r3, [r0], #4
.Lneon_lt4:
        /* lsl #31 puts bit 1 in C (copy 2) and bit 0 in N (copy 1) */
        movs        r2, r2, lsl #31
        bcc         .Lneon_lt2
        ldrh        r3, [r1], #2
        strh        r3, [r0], #2
.Lneon_lt2:
        bpl         .Lneon_exit
        ldrb        r12, [r1]
        strb        r12, [r0]
.Lneon_exit:
        pop         {r0}                /* return the original dst pointer */
        bx          lr
        .endfunc
        .end
#else /* !SCORPION_NEON_OPTIMIZATION */
        .text
        .fpu    neon

/* a prefetch distance of 4 cache-lines works best experimentally */
#define CACHE_LINE_SIZE     64
#define PREFETCH_DISTANCE   (CACHE_LINE_SIZE*4)

/*
 * Generic NEON memcpy: r0 = dst, r1 = src, r2 = byte count.
 * r0 is saved/restored because memcpy() returns the destination pointer.
 */
ENTRY(memcpy)
        .save       {r0, lr}
        stmfd       sp!, {r0, lr}

        /* start preloading as early as possible */
        pld         [r1, #(CACHE_LINE_SIZE*0)]
        pld         [r1, #(CACHE_LINE_SIZE*1)]

        /* do we have at least 16-bytes to copy (needed for alignment below) */
        cmp         r2, #16
        blo         5f

        /* align destination to half cache-line for the write-buffer */
        rsb         r3, r0, #0
        ands        r3, r3, #0xF
        beq         0f

        /* copy up to 15-bytes (count in r3) */
        sub         r2, r2, r3
        movs        ip, r3, lsl #31
        ldrmib      lr, [r1], #1
        strmib      lr, [r0], #1
        ldrcsb      ip, [r1], #1
        ldrcsb      lr, [r1], #1
        strcsb      ip, [r0], #1
        strcsb      lr, [r0], #1
        movs        ip, r3, lsl #29
        bge         1f
        // copies 4 bytes, destination 32-bits aligned
        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
1:      bcc         2f
        // copies 8 bytes, destination 64-bits aligned
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0, :64]!
2:

0:      /* preload immediately the next cache line, which we may need */
        pld         [r1, #(CACHE_LINE_SIZE*0)]
        pld         [r1, #(CACHE_LINE_SIZE*1)]

        /* make sure we have at least 64 bytes to copy */
        subs        r2, r2, #64
        blo         2f

        /* preload all the cache lines we need.
         * NOTE: the number of pld below depends on PREFETCH_DISTANCE,
         * ideally we would increase the distance in the main loop to
         * avoid the goofy code below. In practice this doesn't seem to make
         * a big difference.
         */
        pld         [r1, #(CACHE_LINE_SIZE*2)]
        pld         [r1, #(CACHE_LINE_SIZE*3)]
        pld         [r1, #(PREFETCH_DISTANCE)]

1:      /* The main loop copies 64 bytes at a time */
        vld1.8      {d0 - d3}, [r1]!
        vld1.8      {d4 - d7}, [r1]!
        pld         [r1, #(PREFETCH_DISTANCE)]
        subs        r2, r2, #64
        vst1.8      {d0 - d3}, [r0, :128]!
        vst1.8      {d4 - d7}, [r0, :128]!
        bhs         1b

2:      /* fix-up the remaining count and make sure we have >= 32 bytes left */
        add         r2, r2, #64
        subs        r2, r2, #32
        blo         4f

3:      /* 32 bytes at a time. These cache lines were already preloaded */
        vld1.8      {d0 - d3}, [r1]!
        subs        r2, r2, #32
        vst1.8      {d0 - d3}, [r0, :128]!
        bhs         3b

4:      /* less than 32 left */
        add         r2, r2, #32
        tst         r2, #0x10
        beq         5f
        // copies 16 bytes, 128-bits aligned
        vld1.8      {d0, d1}, [r1]!
        vst1.8      {d0, d1}, [r0, :128]!

5:      /* copy up to 15-bytes (count in r2) */
        movs        ip, r2, lsl #29
        bcc         1f
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0]!
1:      bge         2f
        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0]!
2:      movs        ip, r2, lsl #31
        ldrmib      r3, [r1], #1
        ldrcsb      ip, [r1], #1
        ldrcsb      lr, [r1], #1
        strmib      r3, [r0], #1
        strcsb      ip, [r0], #1
        strcsb      lr, [r0], #1

        ldmfd       sp!, {r0, lr}       /* restore dst into r0 and return it */
        bx          lr
END(memcpy)

#endif /* !SCORPION_NEON_OPTIMIZATION */
#else  /* __ARM_ARCH__ < 7 */


        /*
         * Optimized memcpy() for ARM.
         *
         * note that memcpy() always returns the destination pointer,
         * so we have to preserve R0.
         */

ENTRY(memcpy)
        /* The stack must always be 64-bits aligned to be compliant with the
         * ARM ABI. Since we have to save R0, we might as well save R4
         * which we can use for better pipelining of the reads below
         */
        .save       {r0, r4, lr}
        stmfd       sp!, {r0, r4, lr}
        /* Making room for r5-r11 which will be spilled later */
        .pad        #28
        sub         sp, sp, #28

        // preload the destination because we'll align it to a cache line
        // with small writes. Also start the source "pump".
        PLD         (r0, #0)
        PLD         (r1, #0)
        PLD         (r1, #32)

        /* it simplifies things to take care of len<4 early */
        cmp         r2, #4
        blo         copy_last_3_and_return

        /* compute the offset to align the source
         * offset = (4-(src&3))&3 = -src & 3
         */
        rsb         r3, r1, #0
        ands        r3, r3, #3
        beq         src_aligned

        /* align source to 32 bits. We need to insert 2 instructions between
         * a ldr[b|h] and str[b|h] because byte and half-word instructions
         * stall 2 cycles.
         */
        movs        r12, r3, lsl #31
        sub         r2, r2, r3          /* we know that r3 <= r2 because r2 >= 4 */
        ldrmib      r3, [r1], #1
        ldrcsb      r4, [r1], #1
        ldrcsb      r12,[r1], #1
        strmib      r3, [r0], #1
        strcsb      r4, [r0], #1
        strcsb      r12,[r0], #1

src_aligned:

        /* see if src and dst are aligned together (congruent) */
        eor         r12, r0, r1
        tst         r12, #3
        bne         non_congruent

        /* Use post-increment mode for stm to spill r5-r11 to reserved stack
         * frame. Don't update sp.
         */
        stmea       sp, {r5-r11}

        /* align the destination to a cache-line */
        rsb         r3, r0, #0
        ands        r3, r3, #0x1C
        beq         congruent_aligned32
        cmp         r3, r2
        andhi       r3, r2, #0x1C

        /* conditionally copies 0 to 7 words (length in r3) */
        movs        r12, r3, lsl #28
        ldmcsia     r1!, {r4, r5, r6, r7}   /* 16 bytes */
        ldmmiia     r1!, {r8, r9}           /* 8 bytes */
        stmcsia     r0!, {r4, r5, r6, r7}
        stmmiia     r0!, {r8, r9}
        tst         r3, #0x4
        ldrne       r10,[r1], #4            /* 4 bytes */
        strne       r10,[r0], #4
        sub         r2, r2, r3

congruent_aligned32:
        /*
         * here source is aligned to 32 bytes.
         */

cached_aligned32:
        subs        r2, r2, #32
        blo         less_than_32_left

        /*
         * We preload a cache-line up to 64 bytes ahead. On the 926, this will
         * stall only until the requested word is fetched, but the linefill
         * continues in the background.
         * While the linefill is going, we write our previous cache-line
         * into the write-buffer (which should have some free space).
         * When the linefill is done, the writebuffer will
         * start dumping its content into memory
         *
         * While all this is going, we then load a full cache line into
         * 8 registers, this cache line should be in the cache by now
         * (or partly in the cache).
         *
         * This code should work well regardless of the source/dest alignment.
         *
         */

        // Align the preload register to a cache-line because the cpu does
        // "critical word first" (the first word requested is loaded first).
        bic         r12, r1, #0x1F
        add         r12, r12, #64

1:      ldmia       r1!, { r4-r11 }
        PLD         (r12, #64)
        subs        r2, r2, #32

        // NOTE: if r12 is more than 64 ahead of r1, the following ldrhi
        // for ARM9 preload will not be safely guarded by the preceding subs.
        // When it is safely guarded the only possibility to have SIGSEGV here
        // is because the caller overstates the length.
        ldrhi       r3, [r12], #32          /* cheap ARM9 preload */
        stmia       r0!, { r4-r11 }
        bhs         1b

        add         r2, r2, #32




less_than_32_left:
        /*
         * less than 32 bytes left at this point (length in r2)
         */

        /* skip all this if there is nothing to do, which should
         * be a common case (if not executed the code below takes
         * about 16 cycles)
         */
        tst         r2, #0x1F
        beq         1f

        /* conditionally copies 0 to 31 bytes */
        movs        r12, r2, lsl #28
        ldmcsia     r1!, {r4, r5, r6, r7}   /* 16 bytes */
        ldmmiia     r1!, {r8, r9}           /* 8 bytes */
        stmcsia     r0!, {r4, r5, r6, r7}
        stmmiia     r0!, {r8, r9}
        movs        r12, r2, lsl #30
        ldrcs       r3, [r1], #4            /* 4 bytes */
        ldrmih      r4, [r1], #2            /* 2 bytes */
        strcs       r3, [r0], #4
        strmih      r4, [r0], #2
        tst         r2, #0x1
        ldrneb      r3, [r1]                /* last byte */
        strneb      r3, [r0]

        /* we're done! restore everything and return */
1:      ldmfd       sp!, {r5-r11}
        ldmfd       sp!, {r0, r4, lr}
        bx          lr

        /********************************************************************/

non_congruent:
        /*
         * here source is aligned to 4 bytes
         * but destination is not.
         *
         * in the code below r2 is the number of bytes read
         * (the number of bytes written is always smaller, because we have
         * partial words in the shift queue)
         */
        cmp         r2, #4
        blo         copy_last_3_and_return

        /* Use post-increment mode for stm to spill r5-r11 to reserved stack
         * frame. Don't update sp.
         */
        stmea       sp, {r5-r11}

        /* compute shifts needed to align src to dest */
        rsb         r5, r0, #0
        and         r5, r5, #3              /* r5 = # bytes in partial words */
        mov         r12, r5, lsl #3         /* r12 = right */
        rsb         lr, r12, #32            /* lr = left */

        /* read the first word */
        ldr         r3, [r1], #4
        sub         r2, r2, #4

        /* write a partial word (0 to 3 bytes), such that destination
         * becomes aligned to 32 bits (r5 = nb of words to copy for alignment)
         */
        movs        r5, r5, lsl #31
        strmib      r3, [r0], #1
        movmi       r3, r3, lsr #8
        strcsb      r3, [r0], #1
        movcs       r3, r3, lsr #8
        strcsb      r3, [r0], #1
        movcs       r3, r3, lsr #8

        cmp         r2, #4
        blo         partial_word_tail

        /* Align destination to 32 bytes (cache line boundary) */
1:      tst         r0, #0x1c
        beq         2f
        ldr         r5, [r1], #4
        sub         r2, r2, #4
        orr         r4, r3, r5, lsl lr
        mov         r3, r5, lsr r12
        str         r4, [r0], #4
        cmp         r2, #4
        bhs         1b
        blo         partial_word_tail

        /* copy 32 bytes at a time */
2:      subs        r2, r2, #32
        blo         less_than_thirtytwo

        /* Use immediate mode for the shifts, because there is an extra cycle
         * for register shifts, which could account for up to 50% of
         * performance hit.
         */

        cmp         r12, #24
        beq         loop24
        cmp         r12, #8
        beq         loop8

loop16:
        ldr         r12, [r1], #4
1:      mov         r4, r12
        ldmia       r1!, { r5,r6,r7, r8,r9,r10,r11}
        PLD         (r1, #64)
        subs        r2, r2, #32
        ldrhs       r12, [r1], #4
        orr         r3, r3, r4, lsl #16
        mov         r4, r4, lsr #16
        orr         r4, r4, r5, lsl #16
        mov         r5, r5, lsr #16
        orr         r5, r5, r6, lsl #16
        mov         r6, r6, lsr #16
        orr         r6, r6, r7, lsl #16
        mov         r7, r7, lsr #16
        orr         r7, r7, r8, lsl #16
        mov         r8, r8, lsr #16
        orr         r8, r8, r9, lsl #16
        mov         r9, r9, lsr #16
        orr         r9, r9, r10, lsl #16
        mov         r10, r10, lsr #16
        orr         r10, r10, r11, lsl #16
        stmia       r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
        mov         r3, r11, lsr #16
        bhs         1b
        b           less_than_thirtytwo

loop8:
        ldr         r12, [r1], #4
1:      mov         r4, r12
        ldmia       r1!, { r5,r6,r7, r8,r9,r10,r11}
        PLD         (r1, #64)
        subs        r2, r2, #32
        ldrhs       r12, [r1], #4
        orr         r3, r3, r4, lsl #24
        mov         r4, r4, lsr #8
        orr         r4, r4, r5, lsl #24
        mov         r5, r5, lsr #8
        orr         r5, r5, r6, lsl #24
        mov         r6, r6, lsr #8
        orr         r6, r6, r7, lsl #24
        mov         r7, r7, lsr #8
        orr         r7, r7, r8, lsl #24
        mov         r8, r8, lsr #8
        orr         r8, r8, r9, lsl #24
        mov         r9, r9, lsr #8
        orr         r9, r9, r10, lsl #24
        mov         r10, r10, lsr #8
        orr         r10, r10, r11, lsl #24
        stmia       r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
        mov         r3, r11, lsr #8
        bhs         1b
        b           less_than_thirtytwo

loop24:
        ldr         r12, [r1], #4
1:      mov         r4, r12
        ldmia       r1!, { r5,r6,r7, r8,r9,r10,r11}
        PLD         (r1, #64)
        subs        r2, r2, #32
        ldrhs       r12, [r1], #4
        orr         r3, r3, r4, lsl #8
        mov         r4, r4, lsr #24
        orr         r4, r4, r5, lsl #8
        mov         r5, r5, lsr #24
        orr         r5, r5, r6, lsl #8
        mov         r6, r6, lsr #24
        orr         r6, r6, r7, lsl #8
        mov         r7, r7, lsr #24
        orr         r7, r7, r8, lsl #8
        mov         r8, r8, lsr #24
        orr         r8, r8, r9, lsl #8
        mov         r9, r9, lsr #24
        orr         r9, r9, r10, lsl #8
        mov         r10, r10, lsr #24
        orr         r10, r10, r11, lsl #8
        stmia       r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
        mov         r3, r11, lsr #24
        bhs         1b


less_than_thirtytwo:
        /* copy the last 0 to 31 bytes of the source */
        rsb         r12, lr, #32            /* we corrupted r12, recompute it */
        add         r2, r2, #32
        cmp         r2, #4
        blo         partial_word_tail

1:      ldr         r5, [r1], #4
        sub         r2, r2, #4
        orr         r4, r3, r5, lsl lr
        mov         r3, r5, lsr r12
        str         r4, [r0], #4
        cmp         r2, #4
        bhs         1b

partial_word_tail:
        /* we have a partial word in the input buffer */
        movs        r5, lr, lsl #(31-3)
        strmib      r3, [r0], #1
        movmi       r3, r3, lsr #8
        strcsb      r3, [r0], #1
        movcs       r3, r3, lsr #8
        strcsb      r3, [r0], #1

        /* Refill spilled registers from the stack. Don't update sp. */
        ldmfd       sp, {r5-r11}

copy_last_3_and_return:
        movs        r2, r2, lsl #31         /* copy remaining 0, 1, 2 or 3 bytes */
        ldrmib      r2, [r1], #1
        ldrcsb      r3, [r1], #1
        ldrcsb      r12,[r1]
        strmib      r2, [r0], #1
        strcsb      r3, [r0], #1
        strcsb      r12,[r0]

        /* we're done! restore sp and spilled registers and return */
        add         sp, sp, #28
        ldmfd       sp!, {r0, r4, lr}
        bx          lr
END(memcpy)


#endif /* __ARM_ARCH__ < 7 */