/* Copyright (c) 2015 The Linux Foundation. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of The Linux Foundation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
26
// Tuning parameters for the prefetched 128-byte copy loops below.
// All three thresholds are expressed in units of 128-byte (PLDSIZE) blocks.

#ifdef PLDOFFS
#undef PLDOFFS
#endif
#define PLDOFFS (16)            // prefetch distance: lines fetched ahead of the read pointer

#ifdef PLDTHRESH
#undef PLDTHRESH
#endif
#define PLDTHRESH (PLDOFFS)     // copies of <= this many blocks skip prefetching entirely

#ifdef BBTHRESH
#undef BBTHRESH
#endif
#define BBTHRESH (2048/128)     // copies of <= this many blocks (2 KB) use the simple prime-pump path

// Compile-time sanity checks on the tuning values above.
#if (PLDOFFS < 1)
#error Routine does not support offsets less than 1
#endif
#if (PLDTHRESH < PLDOFFS)
#error PLD threshold must be greater than or equal to the PLD offset
#endif

#ifdef PLDSIZE
#undef PLDSIZE
#endif
#define PLDSIZE (128)           // bytes handled per inner-loop iteration / per prefetch step
53
//-----------------------------------------------------------------------------
// void *kryo_bb_memcpy(void *dst, const void *src, size_t n)
//
// Kryo-tuned memcpy using 128-byte blocks with software prefetch.
// ABI:      AAPCS64
// In:       x0 = dst, x1 = src, x2 = n (bytes)
// Out:      x0 = original dst (saved in x11 on entry, restored at exit)
// Clobbers: x3, w5, w6, w8, x9, x10, x12, x13, x14, q0-q7, flags
//           (x11 holds the return value until exit)
//
// Strategy: dispatch small sizes straight to the tails; for >= 128 bytes,
// align src to a 128-byte boundary, then choose between a plain copy loop,
// a prefetching loop, and a "DDR" loop that touches far-ahead lines with
// discarded loads, based on the block count and tuning thresholds above.
//-----------------------------------------------------------------------------
kryo_bb_memcpy:
	mov	x11, x0			// preserve dst; memcpy returns it
	cmp	x2, #4			// size dispatch, smallest classes first
	blo	kryo_bb_lt4
	cmp	x2, #16
	blo	kryo_bb_lt16
	cmp	x2, #32
	blo	kryo_bb_16
	cmp	x2, #64
	blo	kryo_bb_copy_32_a
	cmp	x2, #128
	blo	kryo_bb_copy_64_a

	// we have at least 127 bytes to achieve 128-byte alignment
	neg	x3, x1			// calculate count to get SOURCE aligned
	ands	x3, x3, #0x7F		// x3 = (-src) mod 128 = bytes to alignment
	b.eq	kryo_bb_source_aligned	// already aligned
	// alignment fixup, small to large (favorable alignment):
	// each set bit of x3 selects one copy of that power-of-two size
	tbz	x3, #0, 1f
	ldrb	w5, [x1], #1		// 1 byte
	strb	w5, [x0], #1
1:	tbz	x3, #1, 2f
	ldrh	w6, [x1], #2		// 2 bytes
	strh	w6, [x0], #2
2:	tbz	x3, #2, 3f
	ldr	w8, [x1], #4		// 4 bytes
	str	w8, [x0], #4
3:	tbz	x3, #3, 4f
	ldr	x9, [x1], #8		// 8 bytes
	str	x9, [x0], #8
4:	tbz	x3, #4, 5f
	ldr	q7, [x1], #16		// 16 bytes
	str	q7, [x0], #16
5:	tbz	x3, #5, 55f
	ldp	q0, q1, [x1], #32	// 32 bytes
	stp	q0, q1, [x0], #32
55:	tbz	x3, #6, 6f
	ldp	q0, q1, [x1], #32	// 64 bytes
	ldp	q2, q3, [x1], #32
	stp	q0, q1, [x0], #32
	stp	q2, q3, [x0], #32
6:	subs	x2, x2, x3		// fixup count after alignment
	b.eq	kryo_bb_exit
	cmp	x2, #128		// < 128 left: finish in the tails
	blo	kryo_bb_copy_64_a

kryo_bb_source_aligned:
	lsr	x12, x2, #7		// x12 = number of whole 128-byte blocks
	cmp	x12, #PLDTHRESH
	bls	kryo_bb_copy_128_loop_nopld	// too short to benefit from prefetch

	cmp	x12, #BBTHRESH
	bls	kryo_bb_prime_pump	// modest size: simple prefetch priming

	// NOTE(review): x14 = low 11 bits of ((dst + 1KB) - (src + prefetch
	// distance)) plus the prefetch distance; the lsl/lsr #(21+32) pair
	// zero-extends just those 11 bits. This looks like a distance-to-
	// aliasing computation modulo 2 KB, tuned to Kryo's cache geometry —
	// confirm against the original tuning notes before changing.
	add	x14, x0, #0x400
	add	x9, x1, #(PLDOFFS*PLDSIZE)
	sub	x14, x14, x9
	lsl	x14, x14, #(21+32)
	lsr	x14, x14, #(21+32)	// keep only bits [10:0] of the difference
	add	x14, x14, #(PLDOFFS*PLDSIZE)
	cmp	x12, x14, lsr #7	// not enough blocks to reach that point?
	bls	kryo_bb_prime_pump

	mov	x9, #(PLDOFFS)
	lsr	x13, x14, #7
	subs	x9, x13, x9		// x9 = doublepld iterations before x10 stream takes over
	bls	kryo_bb_prime_pump

	add	x10, x1, x14		// x10 = far-ahead prefetch pointer
	bic	x10, x10, #0x7F		// Round to multiple of PLDSIZE

	// Split the remaining blocks between the doublepld loop (x9) and the
	// follow-on loops (x12); the csel chain clamps when x9 >= x12.
	sub	x12, x12, x14, lsr #7
	cmp	x9, x12
	sub	x13, x12, x9
	csel	x12, x13, x12, LS	// x9 <= x12: x12 -= x9
	csel	x9, x12, x9, HI		// x9 >  x12: run everything in doublepld...
	csel	x12, xzr, x12, HI	// ...and leave nothing for the next loops

	// Prime the streaming prefetcher one line behind the steady state.
	prfm	PLDL1STRM, [x1, #((PLDOFFS-1)*PLDSIZE)]
	prfm	PLDL1STRM, [x1, #((PLDOFFS-1)*PLDSIZE+64)]
kryo_bb_copy_128_loop_outer_doublepld:
	// 128 bytes/iteration with two prefetch streams:
	// PLDL1STRM at src+PLDOFFS lines, PLDL1KEEP at the far pointer x10.
	prfm	PLDL1STRM, [x1, #((PLDOFFS)*PLDSIZE)]
	prfm	PLDL1STRM, [x1, #((PLDOFFS)*PLDSIZE)+64]
	subs	x9, x9, #1
	ldp	q0, q1, [x1], #32
	ldp	q2, q3, [x1], #32
	ldp	q4, q5, [x1], #32
	ldp	q6, q7, [x1], #32
	prfm	PLDL1KEEP, [x10]
	prfm	PLDL1KEEP, [x10, #64]
	add	x10, x10, #128
	stp	q0, q1, [x0], #32
	stp	q2, q3, [x0], #32
	stp	q4, q5, [x0], #32
	stp	q6, q7, [x0], #32
	bne	kryo_bb_copy_128_loop_outer_doublepld
	cmp	x12, #0			// nothing left for the single-stream loops?
	beq	kryo_bb_pop_before_nopld
	cmp	x12, #(448*1024/128)	// NOTE(review): 448 KB — presumably sized to the last-level cache; confirm
	bls	kryo_bb_copy_128_loop_outer

kryo_bb_copy_128_loop_ddr:
	// Copy loop for DDR-bound (larger-than-cache) sizes: instead of prfm,
	// touch one word per far-ahead 128-byte line with a discarded load.
	subs	x12, x12, #1
	ldr	x3, [x10], #128		// load result unused; serves as a prefetch
	ldp	q0, q1, [x1], #32
	ldp	q2, q3, [x1], #32
	ldp	q4, q5, [x1], #32
	ldp	q6, q7, [x1], #32
	stp	q0, q1, [x0], #32
	stp	q2, q3, [x0], #32
	stp	q4, q5, [x0], #32
	stp	q6, q7, [x0], #32
	bne	kryo_bb_copy_128_loop_ddr
	b	kryo_bb_pop_before_nopld

kryo_bb_prime_pump:
	// Simple startup: prefetch the first line PLDOFFS blocks ahead, then
	// fall into the single-stream loops. x14 records the prefetch lead so
	// kryo_bb_pop_before_nopld can drain the unprefetched remainder.
	mov	x14, #(PLDOFFS*PLDSIZE)
	add	x10, x1, #(PLDOFFS*PLDSIZE)
	bic	x10, x10, #0x7F		// align prefetch pointer to a 128-byte line
	sub	x12, x12, #PLDOFFS	// reserve the last PLDOFFS blocks for the nopld drain
	prfm	PLDL1KEEP, [x10, #(-1*PLDSIZE)]
	prfm	PLDL1KEEP, [x10, #(-1*PLDSIZE+64)]
	cmp	x12, #(448*1024/128)	// same cache-size threshold as above
	bhi	kryo_bb_copy_128_loop_ddr

kryo_bb_copy_128_loop_outer:
	// 128 bytes/iteration with a single PLDL1KEEP stream at x10.
	subs	x12, x12, #1
	prfm	PLDL1KEEP, [x10]
	prfm	PLDL1KEEP, [x10, #64]
	ldp	q0, q1, [x1], #32
	ldp	q2, q3, [x1], #32
	ldp	q4, q5, [x1], #32
	ldp	q6, q7, [x1], #32
	add	x10, x10, #128
	stp	q0, q1, [x0], #32
	stp	q2, q3, [x0], #32
	stp	q4, q5, [x0], #32
	stp	q6, q7, [x0], #32
	bne	kryo_bb_copy_128_loop_outer

kryo_bb_pop_before_nopld:
	lsr	x12, x14, #7		// blocks already prefetched but not yet copied
kryo_bb_copy_128_loop_nopld:
	// Plain 128-byte copy loop, no prefetching.
	ldp	q0, q1, [x1], #32
	ldp	q2, q3, [x1], #32
	ldp	q4, q5, [x1], #32
	ldp	q6, q7, [x1], #32
	subs	x12, x12, #1
	stp	q0, q1, [x0], #32
	stp	q2, q3, [x0], #32
	stp	q4, q5, [x0], #32
	stp	q6, q7, [x0], #32
	bne	kryo_bb_copy_128_loop_nopld
	ands	x2, x2, #0x7f		// x2 = tail bytes (< 128)
	beq	kryo_bb_exit

	// Tail: each set bit of x2 selects one copy of that power-of-two size,
	// largest first (mirror image of the alignment fixup above).
kryo_bb_copy_64_a:
	tbz	x2, #6, kryo_bb_copy_32_a
	ldp	q0, q1, [x1], #32	// 64 bytes
	ldp	q2, q3, [x1], #32
	stp	q0, q1, [x0], #32
	stp	q2, q3, [x0], #32
kryo_bb_copy_32_a:
	tbz	x2, #5, kryo_bb_16
	ldp	q0, q1, [x1], #32	// 32 bytes
	stp	q0, q1, [x0], #32
kryo_bb_16:
	tbz	x2, #4, kryo_bb_lt16
	ldr	q7, [x1], #16		// 16 bytes
	str	q7, [x0], #16
	ands	x2, x2, #0x0f
	beq	kryo_bb_exit
kryo_bb_lt16:
	tbz	x2, #3, kryo_bb_lt8
	ldr	x3, [x1], #8		// 8 bytes
	str	x3, [x0], #8
kryo_bb_lt8:
	tbz	x2, #2, kryo_bb_lt4
	ldr	w3, [x1], #4		// 4 bytes
	str	w3, [x0], #4
kryo_bb_lt4:
	tbz	x2, #1, kryo_bb_lt2
	ldrh	w3, [x1], #2		// 2 bytes
	strh	w3, [x0], #2
kryo_bb_lt2:
	tbz	x2, #0, kryo_bb_exit
	ldrb	w3, [x1], #1		// final byte
	strb	w3, [x0], #1
kryo_bb_exit:
	mov	x0, x11			// return original dst
	ret
244