/***************************************************************************
2 Copyright (c) 2009-2012 Code Aurora Forum. All rights reserved.
3
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are met:
6 * Redistributions of source code must retain the above copyright
7 notice, this list of conditions and the following disclaimer.
8 * Redistributions in binary form must reproduce the above copyright
9 notice, this list of conditions and the following disclaimer in the
10 documentation and/or other materials provided with the distribution.
11 * Neither the name of Code Aurora nor the names of its contributors may
12 be used to endorse or promote products derived from this software
13 without specific prior written permission.
14
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
19 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25 POSSIBILITY OF SUCH DAMAGE.
26 ***************************************************************************/
27
/***************************************************************************
 *  Neon memmove: Attempts to do a memmove with Neon registers if possible.
 *     Inputs:
 *        dest: The destination buffer
 *        src: The source buffer
 *        n: The size of the buffer to transfer
 *     Outputs:
 *        dest, returned in r0
 ***************************************************************************/
37
38#include <machine/cpu-features.h>
39
40#if defined(KRAIT_NEON_OPTIMIZATION) || defined(SPARROW_NEON_OPTIMIZATION)
41 /*
42 * These can be overridden in:
43 * device/<vendor>/<board>/BoardConfig.mk
44 * by setting the following:
45 * TARGET_USE_KRAIT_BIONIC_OPTIMIZATION := true
46 * TARGET_USE_KRAIT_PLD_SET := true
47 * TARGET_KRAIT_BIONIC_PLDOFFS := <pldoffset>
48 * TARGET_KRAIT_BIONIC_PLDSIZE := <pldsize>
49 * TARGET_KRAIT_BIONIC_PLDTHRESH := <pldthreshold>
50 */
/*
 * Prefetch tuning. Any of these may be predefined by the build; the
 * defaults below are used only for the ones that are not.
 *
 *   PLDSIZE   - byte stride multiplied into every pld offset
 *   PLDOFFS   - how many PLDSIZE strides ahead of the copy pointer the
 *               steady-state pld reaches; the warm-up plds use
 *               PLDOFFS-5 .. PLDOFFS-1, hence the lower bound of 5
 *   PLDTHRESH - count of 64-byte blocks above which the prefetching
 *               loop is used instead of the plain one
 */
#if !defined(PLDSIZE)
#define PLDSIZE		(64)
#endif
#if !defined(PLDOFFS)
#define PLDOFFS		(10)
#endif
#if !defined(PLDTHRESH)
#define PLDTHRESH	(PLDOFFS)
#endif

#if (PLDOFFS < 5)
#error Routine does not support offsets less than 5
#endif
#if (PLDTHRESH < PLDOFFS)
#error PLD threshold must be greater than or equal to the PLD offset
#endif

/* Fill word handed to .balignl when padding up to a 64-byte boundary. */
#define NOP_OPCODE	(0xe320f000)
67
/*
 * Krait / Sparrow NEON memmove, bcopy and _memmove_words (ARM mode).
 *
 *   memmove / _memmove_words:
 *       In:   r0 = dest, r1 = src, r2 = n (bytes)
 *       Out:  r0 = dest
 *   bcopy:
 *       In:   r0 = src, r1 = dest, r2 = n (bytes)
 *       Swaps r0/r1 and falls through into memmove.
 *
 * Scratch: r1, r2, r3, r12, q0-q3, q8-q11 and the condition flags.
 * Stack:   one word (the saved dest) while the local copy runs.
 *
 * Only the overlapping "dest above src" case is copied here, from the
 * end of the buffers towards the start. Every other case is handed to
 * memcpy. All size tests use unsigned conditions, because n and the
 * dest-src distance are unsigned quantities.
 */
	.code 32
	.align 5
	.global memmove
	.type memmove, %function

	.global _memmove_words
	.type _memmove_words, %function

	.global bcopy
	.type bcopy, %function

bcopy:
	mov	r12, r0			/* bcopy(src, dest, n): swap the  */
	mov	r0, r1			/* first two arguments so they    */
	mov	r1, r12			/* match memmove(dest, src, n)    */
	.balignl 64, NOP_OPCODE, 4*2
memmove:
_memmove_words:
.Lneon_memmove_cmf:
	subs	r12, r0, r1		/* r12 = dest - src               */
	bxeq	lr			/* dest == src: nothing to do     */
	cmphi	r2, r12			/* dest > src: compare n, window  */
	/*
	 * LS holds here either because dest < src (cmphi was skipped and
	 * the flags still come from subs) or because n <= dest - src
	 * (no overlap).
	 * NOTE(review): the dest < src case can still overlap, so this
	 * tail call relies on memcpy copying in ascending address order
	 * - confirm against the memcpy this is linked with.
	 */
	bls	memcpy

	push	{r0}			/* keep dest for the return value */

.Lneon_back_to_front_copy:
	add	r0, r0, r2		/* point one past the end of both */
	add	r1, r1, r2		/* buffers; we pre-decrement      */
	cmp	r2, #4
	bhi	.Lneon_b2f_gt4
	cmp	r2, #0
.Lneon_b2f_smallcopy_loop:
	/* Flags come from the cmp above or the subs below. */
	beq	.Lneon_memmove_done
	ldrb	r12, [r1, #-1]!
	subs	r2, r2, #1
	strb	r12, [r0, #-1]!
	b	.Lneon_b2f_smallcopy_loop
.Lneon_b2f_gt4:
	sub	r3, r0, r1		/* r3 = overlap window (dest-src) */
	cmp	r2, r3
	movls	r12, r2			/* r12 = min(n, window), unsigned */
	movhi	r12, r3
	/* Pick the widest copy unit that fits in r12. */
	cmp	r12, #64
	bhs	.Lneon_b2f_copy_64
	cmp	r12, #32
	bhs	.Lneon_b2f_copy_32
	cmp	r12, #8
	bhs	.Lneon_b2f_copy_8
	cmp	r12, #4
	bhs	.Lneon_b2f_copy_4
	b	.Lneon_b2f_copy_1
.Lneon_b2f_copy_64:
	sub	r1, r1, #64		/* Predecrement */
	sub	r0, r0, #64
	mov	r12, r2, lsr #6		/* r12 = number of 64-byte blocks */
	cmp	r12, #PLDTHRESH
	bls	.Lneon_b2f_copy_64_loop_nopld
	/*
	 * Run (blocks - PLDOFFS) iterations with prefetch, then the last
	 * PLDOFFS iterations without it.
	 */
	sub	r12, r12, #PLDOFFS
	pld	[r1, #-(PLDOFFS-5)*PLDSIZE]
	pld	[r1, #-(PLDOFFS-4)*PLDSIZE]
	pld	[r1, #-(PLDOFFS-3)*PLDSIZE]
	pld	[r1, #-(PLDOFFS-2)*PLDSIZE]
	pld	[r1, #-(PLDOFFS-1)*PLDSIZE]
	.balignl 64, NOP_OPCODE, 4*2
.Lneon_b2f_copy_64_loop_outer:
	pld	[r1, #-(PLDOFFS)*PLDSIZE]
	vld1.32	{q0, q1}, [r1]!
	vld1.32	{q2, q3}, [r1]
	subs	r12, r12, #1
	vst1.32	{q0, q1}, [r0]!
	sub	r1, r1, #96		/* Post-fixup and predecrement */
	vst1.32	{q2, q3}, [r0]
	sub	r0, r0, #96
	bne	.Lneon_b2f_copy_64_loop_outer
	mov	r12, #PLDOFFS
	.balignl 64, NOP_OPCODE, 4*2
.Lneon_b2f_copy_64_loop_nopld:
	vld1.32	{q8, q9}, [r1]!
	vld1.32	{q10, q11}, [r1]
	subs	r12, r12, #1
	vst1.32	{q8, q9}, [r0]!
	sub	r1, r1, #96		/* Post-fixup and predecrement */
	vst1.32	{q10, q11}, [r0]
	sub	r0, r0, #96
	bne	.Lneon_b2f_copy_64_loop_nopld
	ands	r2, r2, #0x3f		/* bytes left after 64-byte blocks */
	beq	.Lneon_memmove_done
	add	r1, r1, #64		/* Post-fixup */
	add	r0, r0, #64
	cmp	r2, #32
	blo	.Lneon_b2f_copy_finish
.Lneon_b2f_copy_32:
	mov	r12, r2, lsr #5		/* r12 = number of 32-byte blocks */
.Lneon_b2f_copy_32_loop:
	sub	r1, r1, #32		/* Predecrement */
	sub	r0, r0, #32
	vld1.32	{q0,q1}, [r1]
	subs	r12, r12, #1
	vst1.32	{q0,q1}, [r0]
	bne	.Lneon_b2f_copy_32_loop
	ands	r2, r2, #0x1f
	beq	.Lneon_memmove_done
.Lneon_b2f_copy_finish:
.Lneon_b2f_copy_8:
	movs	r12, r2, lsr #0x3	/* r12 = number of 8-byte blocks  */
	beq	.Lneon_b2f_copy_4
	.balignl 64, NOP_OPCODE, 4*2
.Lneon_b2f_copy_8_loop:
	sub	r1, r1, #8		/* Predecrement */
	sub	r0, r0, #8
	vld1.32	{d0}, [r1]
	subs	r12, r12, #1
	vst1.32	{d0}, [r0]
	bne	.Lneon_b2f_copy_8_loop
	ands	r2, r2, #0x7
	beq	.Lneon_memmove_done
.Lneon_b2f_copy_4:
	movs	r12, r2, lsr #0x2	/* r12 = number of words          */
	beq	.Lneon_b2f_copy_1
.Lneon_b2f_copy_4_loop:
	ldr	r3, [r1, #-4]!
	subs	r12, r12, #1
	str	r3, [r0, #-4]!
	bne	.Lneon_b2f_copy_4_loop
	ands	r2, r2, #0x3
.Lneon_b2f_copy_1:
	cmp	r2, #0
	beq	.Lneon_memmove_done
	.balignl 64, NOP_OPCODE, 4*2
.Lneon_b2f_copy_1_loop:
	ldrb	r12, [r1, #-1]!
	subs	r2, r2, #1
	strb	r12, [r0, #-1]!
	bne	.Lneon_b2f_copy_1_loop

.Lneon_memmove_done:
	pop	{r0}			/* return the original dest */
	bx	lr

	.end
209
210#elif defined(SCORPION_NEON_OPTIMIZATION)
211 /*
212 * These can be overridden in:
213 * device/<vendor>/<board>/BoardConfig.mk
214 * by setting the following:
215 * TARGET_USE_SCORPION_BIONIC_OPTIMIZATION := true
216 * TARGET_USE_SCORPION_PLD_SET := true
217 * TARGET_SCORPION_BIONIC_PLDOFFS := <pldoffset>
218 * TARGET_SCORPION_BIONIC_PLDSIZE := <pldsize>
219 */
/*
 * Prefetch tuning defaults, used only when the build has not already
 * defined the value. pld offsets are formed as multiples of PLDSIZE,
 * and PLDOFFS is how many such strides ahead the steady-state pld
 * reaches.
 */
#if !defined(PLDOFFS)
#define PLDOFFS		(6)
#endif
#if !defined(PLDSIZE)
#define PLDSIZE		(128)	/* L2 cache line size */
#endif
226
/*
 * Scorpion NEON memmove and bcopy (ARM mode): entry and direction
 * dispatch.
 *
 *   memmove:  In: r0 = dest, r1 = src, r2 = n (bytes).  Out: r0 = dest
 *   bcopy:    In: r0 = src, r1 = dest, r2 = n (bytes).
 *             Swaps r0/r1 and falls through into memmove.
 *
 * The original dest is pushed here and popped at .Lneon_memmove_done,
 * so every path that leaves this section must end up at that label.
 */
	.code 32
	.align 5
	.global memmove
	.type memmove, %function

	.global bcopy
	.type bcopy, %function

bcopy:
	mov	r12, r0
	mov	r0, r1
	mov	r1, r12
memmove:
	push	{r0}

	/*
	 * The requirements for memmove state that the function should
	 * operate as if data were being copied from the source to a
	 * buffer, then to the destination, so that source and target
	 * may overlap.
	 *
	 * This is handled by choosing the copy direction from the order
	 * of the two pointers:
	 *   dest below src  -> copy front-to-back
	 *   dest above src  -> copy back-to-front
	 *   dest == src     -> nothing to copy
	 *
	 * The pointers are addresses, so they must be compared as
	 * unsigned values (LO/HI). A signed compare (LT/GT) picks the
	 * wrong direction when the two buffers lie on opposite sides of
	 * 0x80000000, and corrupts the data if they overlap.
	 */
.Lneon_memmove_cmf:
	cmp	r0, r1
	blo	.Lneon_front_to_back_copy
	bhi	.Lneon_back_to_front_copy
	b	.Lneon_memmove_done
260
261 /* #############################################################
262 * Front to Back copy
263 */
.Lneon_front_to_back_copy:
	/*
	 * Front-to-back copy, entered with r0 = dest, r1 = src, r2 = n.
	 * Both pointers advance as bytes are copied. Exits by branching
	 * to .Lneon_memmove_done. Uses r3, r12, q0-q3, q8-q11 as scratch.
	 *
	 * n and the window are unsigned quantities, so every size test
	 * below uses unsigned conditions (HI/LS/HS). With signed
	 * conditions a value of 2 GiB or more looks negative and forces
	 * the whole copy through the byte loop.
	 *
	 * Copies of up to 4 bytes are done one byte at a time.
	 */
	cmp	r2, #4
	bhi	.Lneon_f2b_gt4
	cmp	r2, #0
.Lneon_f2b_smallcopy_loop:
	/* Flags come from the cmp above or the subs below. */
	beq	.Lneon_memmove_done
	ldrb	r12, [r1], #1
	subs	r2, r2, #1
	strb	r12, [r0], #1
	b	.Lneon_f2b_smallcopy_loop
.Lneon_f2b_gt4:
	/* The window size (src - dest) is in r3. */
	sub	r3, r1, r0
	/*
	 * The copy unit is chosen from the smaller of the size in r2
	 * and the overlap window in r3, not from r2 alone.
	 */
	cmp	r2, r3
	movls	r12, r2			/* r12 = min(n, window), unsigned */
	movhi	r12, r3
	cmp	r12, #256
	bhs	.Lneon_f2b_copy_128
	cmp	r12, #64
	bhs	.Lneon_f2b_copy_32
	cmp	r12, #16
	bhs	.Lneon_f2b_copy_16
	cmp	r12, #8
	bhs	.Lneon_f2b_copy_8
	cmp	r12, #4
	bhs	.Lneon_f2b_copy_4
	b	.Lneon_f2b_copy_1
	nop				/* padding, never executed */
.Lneon_f2b_copy_128:
	mov	r12, r2, lsr #7		/* r12 = number of 128-byte blocks */
	cmp	r12, #PLDOFFS
	bls	.Lneon_f2b_copy_128_loop_nopld
	/*
	 * Run (blocks - PLDOFFS) iterations with prefetch, then the last
	 * PLDOFFS iterations without it.
	 */
	sub	r12, r12, #PLDOFFS
	pld	[r1, #(PLDOFFS-1)*PLDSIZE]
.Lneon_f2b_copy_128_loop_outer:
	pld	[r1, #(PLDOFFS*PLDSIZE)]
	vld1.32	{q0,q1}, [r1]!
	vld1.32	{q2,q3}, [r1]!
	vld1.32	{q8,q9}, [r1]!
	vld1.32	{q10,q11}, [r1]!
	subs	r12, r12, #1
	vst1.32	{q0,q1}, [r0]!
	vst1.32	{q2,q3}, [r0]!
	vst1.32	{q8,q9}, [r0]!
	vst1.32	{q10,q11}, [r0]!
	bne	.Lneon_f2b_copy_128_loop_outer
	mov	r12, #PLDOFFS
.Lneon_f2b_copy_128_loop_nopld:
	vld1.32	{q0,q1}, [r1]!
	vld1.32	{q2,q3}, [r1]!
	vld1.32	{q8,q9}, [r1]!
	vld1.32	{q10,q11}, [r1]!
	subs	r12, r12, #1
	vst1.32	{q0,q1}, [r0]!
	vst1.32	{q2,q3}, [r0]!
	vst1.32	{q8,q9}, [r0]!
	vst1.32	{q10,q11}, [r0]!
	bne	.Lneon_f2b_copy_128_loop_nopld
	ands	r2, r2, #0x7f		/* bytes left after 128-byte blocks */
	beq	.Lneon_memmove_done
	cmp	r2, #32
	bhs	.Lneon_f2b_copy_32
	b	.Lneon_f2b_copy_finish
.Lneon_f2b_copy_32:
	mov	r12, r2, lsr #5		/* r12 = number of 32-byte blocks */
.Lneon_f2b_copy_32_loop:
	vld1.32	{q0,q1}, [r1]!
	subs	r12, r12, #1
	vst1.32	{q0,q1}, [r0]!
	bne	.Lneon_f2b_copy_32_loop
	ands	r2, r2, #0x1f
	beq	.Lneon_memmove_done
.Lneon_f2b_copy_finish:
.Lneon_f2b_copy_16:
	movs	r12, r2, lsr #4		/* r12 = number of 16-byte blocks */
	beq	.Lneon_f2b_copy_8
.Lneon_f2b_copy_16_loop:
	vld1.32	{q0}, [r1]!
	subs	r12, r12, #1
	vst1.32	{q0}, [r0]!
	bne	.Lneon_f2b_copy_16_loop
	ands	r2, r2, #0xf
	beq	.Lneon_memmove_done
.Lneon_f2b_copy_8:
	movs	r12, r2, lsr #3		/* r12 = number of 8-byte blocks */
	beq	.Lneon_f2b_copy_4
.Lneon_f2b_copy_8_loop:
	vld1.32	{d0}, [r1]!
	subs	r12, r12, #1
	vst1.32	{d0}, [r0]!
	bne	.Lneon_f2b_copy_8_loop
	ands	r2, r2, #0x7
	beq	.Lneon_memmove_done
.Lneon_f2b_copy_4:
	movs	r12, r2, lsr #2		/* r12 = number of words */
	beq	.Lneon_f2b_copy_1
.Lneon_f2b_copy_4_loop:
	ldr	r3, [r1], #4
	subs	r12, r12, #1
	str	r3, [r0], #4
	bne	.Lneon_f2b_copy_4_loop
	ands	r2, r2, #0x3
	nop				/* no effect; kept from the original layout */
.Lneon_f2b_copy_1:
	cmp	r2, #0
	beq	.Lneon_memmove_done
.Lneon_f2b_copy_1_loop:
	ldrb	r12, [r1], #1
	subs	r2, r2, #1
	strb	r12, [r0], #1
	bne	.Lneon_f2b_copy_1_loop
.Lneon_f2b_finish:
	b	.Lneon_memmove_done
390
391 /* #############################################################
392 * Back to Front copy
393 */
.Lneon_back_to_front_copy:
	/*
	 * Back-to-front copy, entered with r0 = dest, r1 = src, r2 = n.
	 * Falls through to .Lneon_memmove_done when finished. Uses r3,
	 * r12, q0-q3, q8-q11 as scratch.
	 *
	 * Here, we'll want to shift to the end of the buffers. This
	 * actually points us one past where we need to go, but since
	 * we'll pre-decrement throughout, this will be fine.
	 *
	 * n and the window are unsigned quantities, so every size test
	 * below uses unsigned conditions (HI/LS/HS). With signed
	 * conditions a value of 2 GiB or more looks negative and forces
	 * the whole copy through the byte loop.
	 */
	add	r0, r0, r2
	add	r1, r1, r2
	cmp	r2, #4
	bhi	.Lneon_b2f_gt4
	cmp	r2, #0
.Lneon_b2f_smallcopy_loop:
	/* Flags come from the cmp above or the subs below. */
	beq	.Lneon_memmove_done
	ldrb	r12, [r1, #-1]!
	subs	r2, r2, #1
	strb	r12, [r0, #-1]!
	b	.Lneon_b2f_smallcopy_loop
.Lneon_b2f_gt4:
	/*
	 * The overlap window size (dest - src) is in r3. The minimum of
	 * the window size and the copy size goes into r12 and selects
	 * the copy unit.
	 */
	sub	r3, r0, r1
	cmp	r2, r3
	movls	r12, r2			/* r12 = min(n, window), unsigned */
	movhi	r12, r3
	cmp	r12, #256
	bhs	.Lneon_b2f_copy_128
	cmp	r12, #64
	bhs	.Lneon_b2f_copy_32
	cmp	r12, #8
	bhs	.Lneon_b2f_copy_8
	cmp	r12, #4
	bhs	.Lneon_b2f_copy_4
	b	.Lneon_b2f_copy_1
	nop				/* padding, never executed */
.Lneon_b2f_copy_128:
	mov	r12, r2, lsr #7		/* r12 = number of 128-byte blocks */
	cmp	r12, #PLDOFFS
	bls	.Lneon_b2f_copy_128_loop_nopld
	/*
	 * Run (blocks - PLDOFFS) iterations with prefetch, then the last
	 * PLDOFFS iterations without it.
	 */
	sub	r12, r12, #PLDOFFS
	pld	[r1, #-(PLDOFFS-1)*PLDSIZE]
.Lneon_b2f_copy_128_loop_outer:
	pld	[r1, #-(PLDOFFS*PLDSIZE)]
	sub	r1, r1, #128		/* step back to the block start */
	sub	r0, r0, #128
	vld1.32	{q0, q1}, [r1]!
	vld1.32	{q2, q3}, [r1]!
	vld1.32	{q8, q9}, [r1]!
	vld1.32	{q10, q11}, [r1]!
	subs	r12, r12, #1
	vst1.32	{q0, q1}, [r0]!
	vst1.32	{q2, q3}, [r0]!
	vst1.32	{q8, q9}, [r0]!
	vst1.32	{q10, q11}, [r0]!
	sub	r1, r1, #128		/* undo the post-increments */
	sub	r0, r0, #128
	bne	.Lneon_b2f_copy_128_loop_outer
	mov	r12, #PLDOFFS
.Lneon_b2f_copy_128_loop_nopld:
	sub	r1, r1, #128		/* step back to the block start */
	sub	r0, r0, #128
	vld1.32	{q0, q1}, [r1]!
	vld1.32	{q2, q3}, [r1]!
	vld1.32	{q8, q9}, [r1]!
	vld1.32	{q10, q11}, [r1]!
	subs	r12, r12, #1
	vst1.32	{q0, q1}, [r0]!
	vst1.32	{q2, q3}, [r0]!
	vst1.32	{q8, q9}, [r0]!
	vst1.32	{q10, q11}, [r0]!
	sub	r1, r1, #128		/* undo the post-increments */
	sub	r0, r0, #128
	bne	.Lneon_b2f_copy_128_loop_nopld
	ands	r2, r2, #0x7f		/* bytes left after 128-byte blocks */
	beq	.Lneon_memmove_done
	cmp	r2, #32
	bhs	.Lneon_b2f_copy_32
	b	.Lneon_b2f_copy_finish
.Lneon_b2f_copy_32:
	mov	r12, r2, lsr #5		/* r12 = number of 32-byte blocks */
.Lneon_b2f_copy_32_loop:
	sub	r1, r1, #32
	sub	r0, r0, #32
	vld1.32	{q0,q1}, [r1]
	subs	r12, r12, #1
	vst1.32	{q0,q1}, [r0]
	bne	.Lneon_b2f_copy_32_loop
	ands	r2, r2, #0x1f
	beq	.Lneon_memmove_done
.Lneon_b2f_copy_finish:
.Lneon_b2f_copy_8:
	movs	r12, r2, lsr #0x3	/* r12 = number of 8-byte blocks */
	beq	.Lneon_b2f_copy_4
.Lneon_b2f_copy_8_loop:
	sub	r1, r1, #8
	sub	r0, r0, #8
	vld1.32	{d0}, [r1]
	subs	r12, r12, #1
	vst1.32	{d0}, [r0]
	bne	.Lneon_b2f_copy_8_loop
	ands	r2, r2, #0x7
	beq	.Lneon_memmove_done
.Lneon_b2f_copy_4:
	movs	r12, r2, lsr #0x2	/* r12 = number of words */
	beq	.Lneon_b2f_copy_1
.Lneon_b2f_copy_4_loop:
	ldr	r3, [r1, #-4]!
	subs	r12, r12, #1
	str	r3, [r0, #-4]!
	bne	.Lneon_b2f_copy_4_loop
	ands	r2, r2, #0x3
	nop				/* no effect; kept from the original layout */
.Lneon_b2f_copy_1:
	cmp	r2, #0
	beq	.Lneon_memmove_done
.Lneon_b2f_copy_1_loop:
	ldrb	r12, [r1, #-1]!
	subs	r2, r2, #1
	strb	r12, [r0, #-1]!
	bne	.Lneon_b2f_copy_1_loop

	/* Common exit: restore the dest pointer pushed at memmove entry. */
.Lneon_memmove_done:
	pop	{r0}
	bx	lr

	.end
525#endif /* SCORPION_NEON_OPTIMIZATION */
526