/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * __memcmp16 -- compare two buffers of 16-bit units.
 *
 * Target/syntax: 32-bit ARM, GNU as, pre-UAL conditional mnemonics
 * (ldmnefd, eoreqs, ldreqh, ...).
 *
 * In:   r0 = first buffer
 *       r1 = second buffer
 *       r2 = length, counted in half-words (not bytes)
 * Out:  r0 = 0 when r0 == r1 on entry, when r2 == 0, or when no
 *            difference is found; otherwise
 *            (half-word from first buffer) - (half-word from second buffer)
 *            at the first position where they differ.
 * Uses: r1, r2, r3, ip and the flags as scratch.  The large-block path
 *       also uses lr as a data register, so it pushes {r4, lr} and pops
 *       them before every return.  r4 itself is never written here;
 *       presumably it is pushed only to keep sp 8-byte aligned -- confirm.
 *
 * Assumptions (NOTE(review): confirm against callers):
 *   - both pointers are at least half-word aligned; only bit 1 of the
 *     addresses is ever tested;
 *   - data is little-endian: the non-congruent path rebuilds a word as
 *     (previous >> 16) | (next << 16), which matches memory order only
 *     for little-endian loads.
 *
 * Presumed C prototype (not visible in this file):
 *   int __memcmp16(const unsigned short *s0, const unsigned short *s1,
 *                  size_t count);
 */
        .text

        .global __memcmp16
        .type   __memcmp16, %function
        .align  4

/*
 * Optimized memcmp16() for ARM9.
 * This would not be optimal on XScale or ARM11, where more prefetching
 * and use of PLD will be needed.
 * The 2 major optimizations here are
 * (1) The main loop compares 16 bytes at a time
 * (2) The loads are scheduled in a way they won't stall
 */

__memcmp16:
        pld         [r0, #0]
        pld         [r1, #0]

        /* take care of the case where the length is zero or the two
         * buffers are the same: nothing to compare, return 0
         */
        cmp         r0, r1
        cmpne       r2, #0
        moveq       r0, #0
        bxeq        lr

        /* since r0 holds the result, move the first source
         * pointer somewhere else
         */

        mov         r3, r0

        /* make sure we have at least 12 half-words; this simplifies
         * things below and avoids some overhead for small blocks
         */

        cmp         r2, #12
        bpl         0f

        /* small blocks (less than 12 half-words): one half-word at a time */
        pld         [r0, #32]
        pld         [r1, #32]

1:      ldrh        r0, [r3], #2
        ldrh        ip, [r1], #2
        subs        r0, r0, ip          /* r0 = difference, 0 if equal */
        bxne        lr                  /* mismatch: r0 is the result */
        subs        r2, r2, #1
        bne         1b
        bx          lr                  /* all equal: r0 is 0 from the last subs */


        /* save registers (lr is used as a data register below) */
0:      stmfd       sp!, {r4, lr}

        /* align first pointer to word boundary */
        tst         r3, #2
        beq         0f

        /* r3 is only half-word aligned: compare one half-word by hand */
        ldrh        r0, [r3], #2
        ldrh        ip, [r1], #2
        sub         r2, r2, #1
        subs        r0, r0, ip
        /* restore registers and return */
        ldmnefd     sp!, {r4, lr}
        bxne        lr



0:      /* here the first pointer is word aligned, and at least 11
         * half-words remain (12 minus the one possibly consumed above).
         */

        /* see if the pointers are congruent (same bit 1, i.e. r1 is
         * word aligned too)
         */
        eor         r0, r3, r1
        ands        r0, r0, #2
        bne         5f

        /* congruent case, 16 half-words per iteration
         * We need to make sure there are at least 16+2 half-words left
         * because we effectively read ahead one long word from r1, and
         * we could read past the buffer (and segfault) if we're not
         * careful.  From here on r2 = remaining - 18.
         */

        ldr         ip, [r1]            /* prime the read-ahead word */
        subs        r2, r2, #(16 + 2)
        bmi         1f

        /* Main loop: 8 words.  ip and lr alternate as "current word of
         * r1" / "next word of r1"; each [r1, #4]! pre-increments r1 and
         * fetches the word for the following comparison.  Once a word
         * differs the Z flag is clear and every later conditional
         * instruction is skipped, leaving r3 and r1 both one word past
         * the mismatch.
         */
0:
        pld         [r3, #64]
        pld         [r1, #64]
        ldr         r0, [r3], #4
        ldr         lr, [r1, #4]!
        eors        r0, r0, ip
        ldreq       r0, [r3], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r3], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r3], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r3], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r3], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r3], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r3], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        bne         2f
        subs        r2, r2, #16
        bhs         0b

        /* do we have at least 2 half-words left?
         * r2 is (remaining - 18) here; adding 16 leaves (remaining - 2)
         */
1:      adds        r2, r2, #(16 - 2 + 2)
        bmi         4f

        /* finish off 2 half-words (one word) at a time */
3:      ldr         r0, [r3], #4
        ldr         ip, [r1], #4
        eors        r0, r0, ip
        bne         2f
        subs        r2, r2, #2
        bhs         3b

        /* are we done?  Undo the -2 bias: r2 = half-words still to do */
4:      adds        r2, r2, #2
        bne         8f
        /* restore registers and return */
        mov         r0, #0
        ldmfd       sp!, {r4, lr}
        bx          lr

2:      /* the last words compared are different; r3 and r1 both point
         * one word past them.  Redo that word half-word by half-word to
         * produce the difference of the first half-words that differ.
         */
        ldrh        r0, [r3, #-4]
        ldrh        ip, [r1, #-4]
        subs        r0, r0, ip
        ldreqh      r0, [r3, #-2]
        ldreqh      ip, [r1, #-2]
        subeqs      r0, r0, ip
        /* restore registers and return */
        ldmfd       sp!, {r4, lr}
        bx          lr

        /* process the last few half-words, one at a time */
8:      ldrh        r0, [r3], #2
        ldrh        ip, [r1], #2
        subs        r0, r0, ip
        bne         9f
        subs        r2, r2, #1
        bne         8b

9:      /* restore registers and return; r0 already holds the result */
        ldmfd       sp!, {r4, lr}
        bx          lr


5:      /*************** non-congruent case ***************/

        /* r3 is word aligned but r1 is not.  Round r1 down to a word
         * boundary and load that word: its upper 16 bits are the first
         * half-word to compare.  r1 then runs one word ahead.
         * r2 is biased by -8 so the loop can test it with subs/bhs.
         */
        bic         r1, r1, #3
        ldr         lr, [r1], #4
        sub         r2, r2, #8

        /* 4 words (8 half-words) per iteration.  Each word of the
         * second buffer is rebuilt in ip from the upper half of the
         * previous aligned word and the lower half of the next one.
         */
6:
        pld         [r3, #64]
        pld         [r1, #64]
        mov         ip, lr, lsr #16
        ldr         lr, [r1], #4
        ldr         r0, [r3], #4
        orr         ip, ip, lr, lsl #16
        eors        r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r3], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r3], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r3], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        bne         7f
        subs        r2, r2, #8
        bhs         6b
        /* r1 is 2 bytes past the next half-word to compare (rounded
         * down by 2, then read one word ahead); put it back
         */
        sub         r1, r1, #2
        /* are we done?  Undo the -8 bias */
        adds        r2, r2, #8
        moveq       r0, #0
        beq         9b
        /* finish off the remaining half-words */
        b           8b

7:      /* mismatch: fix up r1 so that it is one word past the
         * mismatching word, like r3, then reuse the handler at 2
         */
        sub         r1, r1, #2
        b           2b