@ Copyright (c) 2012, Code Aurora Forum. All rights reserved.
@
@ Redistribution and use in source and binary forms, with or without
@ modification, are permitted provided that the following conditions are
@ met:
@     * Redistributions of source code must retain the above copyright
@       notice, this list of conditions and the following disclaimer.
@     * Redistributions in binary form must reproduce the above
@       copyright notice, this list of conditions and the following
@       disclaimer in the documentation and/or other materials provided
@       with the distribution.
@     * Neither the name of Code Aurora Forum, Inc. nor the names of its
@       contributors may be used to endorse or promote products derived
@       from this software without specific prior written permission.
@
@ THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED
@ WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
@ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT
@ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
@ BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
@ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
@ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
@ BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
@ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
@ OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
@ IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <machine/cpu-features.h>
#include <machine/asm.h>

@ Values which exist for the program lifetime:
#define HIGH_WORD_MASK d31
#define EXPONENT_MASK d30
#define int_1 d29
#define double_1 d28
@ sign and 2^int_n fixup:
#define expadjustment d7
#define literals r10
@ Values which exist within both polynomial implementations:
#define int_n d2
#define int_n_low s4
#define int_n_high s5
#define double_n d3
#define k1 d27
#define k2 d26
#define k3 d25
#define k4 d24
@ Values which cross the boundaries between polynomial implementations:
#define ss d16
#define ss2 d17
#define ss4 d18
#define Result d0
#define Return_hw r1
#define Return_lw r0
#define ylg2x d0
@ Intermediate values only needed sometimes:
@ initial (sorted in approximate order of availability for overwriting):
#define x_hw r1
#define x_lw r0
#define y_hw r3
#define y_lw r2
#define x d0
#define bp d4
#define y d1
@ log series:
#define u d19
#define v d20
#define lg2coeff d21
#define bpa d5
#define bpb d3
#define lg2const d6
#define xmantissa r8
#define twoto1o5 r4
#define twoto3o5 r5
#define ix r6
#define iEXP_MASK r7
@ exp input setup:
#define twoto1o8mask d3
#define twoto1o4mask d4
#define twoto1o2mask d1
#define ylg2x_round_offset d16
#define ylg2x_temp d17
#define yn_temp d18
#define yn_round_offset d19
#define ln2 d5
@ Careful: this overwrites HIGH_WORD_MASK; reset it if you need it again ...
#define rounded_exponent d31
@ exp series:
#define k5 d23
#define k6 d22
#define k7 d21
#define k8 d20
#define ss3 d19
@ overwrites double_1 (we're done with it by now)
#define k0 d28
#define twoto1o4 d6

@ instructions that gas doesn't like to encode correctly:
#define vmov_f64 fconstd
#define vmov_f32 fconsts
#define vmovne_f64 fconstdne

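@ Overview: pow(x, y) is computed as 2^(y log2(x)).  log2(x) is split into
@ an integer part n plus a fractional part that is fed through an
@ atanh-style series around a breakpoint bp in {1, 2^(2/5), 2^(4/5)};
@ y log2(x) is then re-split into an integer exponent, quarter-power
@ fixups, and a small residual handled by an exp() polynomial.  A rough
@ C sketch of the flow (illustrative only; the helper names are invented
@ here, and the tuned coefficients live in .LliteralTable below):
@
@   double pow_neon(double x, double y) {
@       int n;                                 // floor(log2(x))
@       double m = normalize(fabs(x), &n);     // m in [1, 2), hypothetical helper
@       double bp = pick_breakpoint(m);        // 1, 2^(2/5) or 2^(4/5)
@       double s = (m - bp) / (m + bp);
@       double lg2x = n + log2(bp) + (2.0 / M_LN2) * atanh_series(s);
@       return exp_poly(y * lg2x);             // 2^t computed as e^(t ln 2)
@   }
@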
ENTRY(pow_neon)
#if defined(KRAIT_NO_AAPCS_VFP_MODE)
    @ ARM ABI has inputs coming in via r registers; let's move them to a d register
    vmov x, x_lw, x_hw
#endif
    push {r4, r5, r6, r7, r8, r9, r10, lr}

    @ pre-staged bp values
    vldr bpa, .LbpA
    vldr bpb, .LbpB
    @ load two fifths into constant term in case we need it due to offsets
    vldr lg2const, .Ltwofifths

    @ bp is initially 1.0, may adjust later based on x value
    vmov_f64 bp, #0x70

    @ extract the mantissa from x for scaled value comparisons
    lsl xmantissa, x_hw, #12

    @ twoto1o5 = 2^(1/5) (input bracketing)
    movw twoto1o5, #0x186c
    movt twoto1o5, #0x2611
    @ twoto3o5 = 2^(3/5) (input bracketing)
    movw twoto3o5, #0x003b
    movt twoto3o5, #0x8406

    @ finish extracting xmantissa
    orr xmantissa, xmantissa, x_lw, lsr #20
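    @ (xmantissa now holds the top 32 bits of x's 52-bit mantissa; the
    @ twoto1o5/twoto3o5 constants above hold the matching top-32 mantissa
    @ bits of 2^(1/5) and 2^(3/5), so plain unsigned compares are enough
    @ to bracket the normalized input against the breakpoints)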

    @ begin preparing a mask for normalization
    vmov.i64 HIGH_WORD_MASK, #0xffffffff00000000

    @ double_1 = (double) 1.0
    vmov_f64 double_1, #0x70

#if defined(KRAIT_NO_AAPCS_VFP_MODE)
    @ move y from r registers to a d register
    vmov y, y_lw, y_hw
#endif

    cmp xmantissa, twoto1o5

    vshl.i64 EXPONENT_MASK, HIGH_WORD_MASK, #20
    vshr.u64 int_1, HIGH_WORD_MASK, #63
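    @ (EXPONENT_MASK = 0xfff0000000000000, covering the sign and exponent
    @ fields; int_1 = 1 as a 64-bit integer)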

    adr literals, .LliteralTable

    bhi .Lxgt2to1over5
    @ zero out lg2 constant term if we don't offset our input
    vsub.f64 lg2const, lg2const, lg2const
    b .Lxle2to1over5

.Lxgt2to1over5:
    @ if normalized x > 2^(1/5), bp = 1 + (2^(2/5) - 1) = 2^(2/5)
    vadd.f64 bp, bp, bpa

.Lxle2to1over5:
    @ will need ln2 for various things
    vldr ln2, .Lln2

    cmp xmantissa, twoto3o5
@@@@ X Value Normalization @@@@

    @ ss = abs(x) 2^(-1024)
    vbic.i64 ss, x, EXPONENT_MASK

    @ N = (floor(log2(x)) + 0x3ff) * 2^52
    vand.i64 int_n, x, EXPONENT_MASK

    bls .Lxle2to3over5
    @ if normalized x > 2^(3/5), bp = 2^(2/5) + (2^(4/5) - 2^(2/5)) = 2^(4/5)
    vadd.f64 bp, bp, bpb
    vadd.f64 lg2const, lg2const, lg2const

.Lxle2to3over5:

    @ load log2 polynomial series constants
    vldm literals!, {k4, k3, k2, k1}

    @ s = abs(x) 2^(-floor(log2(x))) (normalize abs(x) to around 1)
    vorr.i64 ss, ss, double_1

@@@@ 3/2 (Log(bp(1+s)/(1-s))) input computation (s = (x-bp)/(x+bp)) @@@@

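@ The identity behind the series: with m = bp (1+s)/(1-s), i.e.
@ s = (m - bp)/(m + bp), we have
@   ln(m/bp) = ln((1+s)/(1-s)) = 2 atanh(s) = 2 (s + s^3/3 + s^5/5 + ...)
@ so log2(m) = log2(bp) + (2/ln(2)) (s + s^3/3 + ...).  The "3/2" in the
@ section names reflects the factor of 3 pulled out of the series below.
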
    vsub.f64 u, ss, bp
    vadd.f64 v, ss, bp

    @ s = (x-bp)/(x+bp)
    vdiv.f64 ss, u, v

    @ load 2/(3 ln(2)) into lg2coeff
    vldr lg2coeff, .Ltwooverthreeln2

    @ N = floor(log2(x)) * 2^52
    vsub.i64 int_n, int_n, double_1

@@@@ 3/2 (Log(bp(1+s)/(1-s))) polynomial series @@@@

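@ Evaluation order (a sketch; L1..L4 are the least-squares constants,
@ roughly 3/5, 3/7, 3/9 and 3/11):
@   ylg2x = (2 y s / (3 ln(2))) (3 + s^2 + s^4 (L1 + s^2 L2 + s^4 (L3 + s^2 L4)))
@ interleaved with the exponent extraction to cover instruction latency.
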
    @ ss2 = ((x-bp)/(x+bp))^2
    vmul.f64 ss2, ss, ss
    @ ylg2x = 3.0
    vmov_f64 ylg2x, #8
    vmul.f64 ss4, ss2, ss2

    @ todo: useful later for two-way clamp
    vmul.f64 lg2coeff, lg2coeff, y

    @ N = floor(log2(x))
    vshr.s64 int_n, int_n, #52

    @ k3 = ss^2 * L4 + L3
    vmla.f64 k3, ss2, k4

    @ k1 = ss^2 * L2 + L1
    vmla.f64 k1, ss2, k2

    @ lg2coeff = 2 y s / (3 ln(2))
    vmul.f64 lg2coeff, ss, lg2coeff

    @ ylg2x = 3.0 + s^2
    vadd.f64 ylg2x, ylg2x, ss2

    vcvt.f64.s32 double_n, int_n_low

    @ k1 = s^4 (s^2 L4 + L3) + s^2 L2 + L1
    vmla.f64 k1, ss4, k3

    @ add in constant term (lg2const = log2(bp): 0, 2/5 or 4/5)
    vadd.f64 double_n, lg2const

    @ ylg2x = 3.0 + s^2 + s^4 (s^4 (s^2 L4 + L3) + s^2 L2 + L1)
    vmla.f64 ylg2x, ss4, k1

    @ ylg2x = 2 y s / (3 ln(2)) (3.0 + s^2 + s^4 (s^4 (s^2 L4 + L3) + s^2 L2 + L1))
    vmul.f64 ylg2x, lg2coeff, ylg2x

@@@@ Compute input to Exp(s) (s = y(n + log2(x)) - (floor(8 y n + 1)/8 + floor(8 y log2(x) + 1)/8)) @@@@

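@ Roughly, each of y*n and y*log2(x) is rounded to a multiple of 1/4;
@ in C (v is either value, q its 3-fractional-bit fixed-point image):
@
@   int q = (int)(v * 8.0);        // vcvt with #3: truncate toward zero
@   q += (v < 0.0) ? -1 : 1;       // signed round-up offset
@   q &= ~1;                       // clear bit 0: q/8 is a multiple of 1/4
@
@ The two q values are summed into rounded_exponent: bits 1 and 2 select
@ the 2^(1/4) and 2^(1/2) fixups, the bits above them form 2^int_n, and
@ the leftover fractional parts (each within 1/8 of zero) feed exp().
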
    @ mask to extract bit 1 (2^-2 from our fixed-point representation)
    vshl.u64 twoto1o4mask, int_1, #1

    @ double_n = y * n
    vmul.f64 double_n, double_n, y

    @ Load 2^(1/4) for later computations
    vldr twoto1o4, .Ltwoto1o4

    @ either add or subtract one based on the sign of double_n and ylg2x
    vshr.s64 ylg2x_round_offset, ylg2x, #62
    vshr.s64 yn_round_offset, double_n, #62

    @ move unmodified y*lg2x into temp space
    vmov ylg2x_temp, ylg2x
    @ compute floor(8 y * n + 1)/8
    @ and floor(8 y * log2(x) + 1)/8
    vcvt.s32.f64 ylg2x, ylg2x, #3
    @ move unmodified y*n into temp space
    vmov yn_temp, double_n
    vcvt.s32.f64 double_n, double_n, #3

    @ load exp polynomial series constants
    vldm literals!, {k8, k7, k6, k5, k4, k3, k2, k1}

    @ mask to extract bit 2 (2^-1 from our fixed-point representation)
    vshl.u64 twoto1o2mask, int_1, #2

    @ make rounding offsets either 1 or -1 instead of 0 or -2
    vorr.u64 ylg2x_round_offset, ylg2x_round_offset, int_1
    vorr.u64 yn_round_offset, yn_round_offset, int_1

    @ round up to the nearest 1/8th
    vadd.s32 ylg2x, ylg2x, ylg2x_round_offset
    vadd.s32 double_n, double_n, yn_round_offset

    @ clear out round-up bit for y log2(x)
    vbic.s32 ylg2x, ylg2x, int_1
    @ clear out round-up bit for y n
    vbic.s32 double_n, double_n, int_1
    @ add together the (fixed precision) rounded parts
    vadd.s64 rounded_exponent, double_n, ylg2x
    @ shift the integer part into the exponent field (int_n becomes 2^int_n after masking below)
    vshl.i64 int_n, rounded_exponent, #49
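    @ (the fixed-point sum has 3 fractional bits, so a left shift by
    @ 49 = 52 - 3 drops its integer part into the double exponent field)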
    @ compute masks for 2^(1/4) and 2^(1/2) fixups for fractional part of fixed-precision rounded values:
    vand.u64 twoto1o4mask, twoto1o4mask, rounded_exponent
    vand.u64 twoto1o2mask, twoto1o2mask, rounded_exponent

    @ convert back into floating point; double_n now holds (double) floor(8 y * n + 1)/8,
    @ ylg2x now holds (double) floor(8 y * log2(x) + 1)/8
    vcvt.f64.s32 ylg2x, ylg2x, #3
    vcvt.f64.s32 double_n, double_n, #3

    @ put the 2 bit (0.5) through the roof of twoto1o2mask (make it 0x0 or 0xffffffffffffffff)
    vqshl.u64 twoto1o2mask, twoto1o2mask, #62
    @ put the 1 bit (0.25) through the roof of twoto1o4mask (make it 0x0 or 0xffffffffffffffff)
    vqshl.u64 twoto1o4mask, twoto1o4mask, #63
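    @ (vqshl is a saturating shift: if the masked bit is set the shift
    @ overflows and the result saturates to all ones, yielding a
    @ full-width select mask; if clear, the result stays zero)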

    @ center y*log2(x) fractional part between -0.125 and 0.125 by subtracting (double) floor(8 y * log2(x) + 1)/8
    vsub.f64 ylg2x_temp, ylg2x_temp, ylg2x
    @ center y*n fractional part between -0.125 and 0.125 by subtracting (double) floor(8 y * n + 1)/8
    vsub.f64 yn_temp, yn_temp, double_n

    @ add the fractional parts of y*n and y*log2(x) together
    vadd.f64 ss, ylg2x_temp, yn_temp

    @ Result = 1.0 (offset for exp(s) series)
    vmov_f64 Result, #0x70

    @ multiply the combined fractional part by ln(2) to get the exp() argument
    vmul.f64 ss, ln2, ss

@@@@ 10th order polynomial series for Exp(s) @@@@

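@ The argument is small (|ss| <= ln(2)/4, about 0.174), so a 10th-order
@ Taylor-style polynomial reaches double precision.  Schematically:
@   exp(ss) ~= 1 + ss + ss^2/2 + ss^3 (k1 + ss k2 + ... + ss^7 k8)
@ where k1..k8 approximate 1/3! .. 1/10! (the .LE* constants), evaluated
@ in an Estrin-like order to keep the multiply-accumulate pipes busy.
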
    @ ss2 = (ss)^2
    vmul.f64 ss2, ss, ss

    @ twoto1o2mask = twoto1o2mask & twoto1o4
    vand.u64 twoto1o2mask, twoto1o2mask, twoto1o4
    @ twoto1o4mask = twoto1o4mask & twoto1o4
    vand.u64 twoto1o4mask, twoto1o4mask, twoto1o4

    @ Result = 1.0 + ss
    vadd.f64 Result, Result, ss

    @ k7 = ss k8 + k7
    vmla.f64 k7, ss, k8

    @ ss4 = (ss*ss) * (ss*ss)
    vmul.f64 ss4, ss2, ss2

    @ twoto1o2mask = twoto1o2mask | (double) 1.0 - results in either 1.0 or 2^(1/4) in twoto1o2mask
    vorr.u64 twoto1o2mask, twoto1o2mask, double_1
    @ twoto1o4mask = twoto1o4mask | (double) 1.0 - results in either 1.0 or 2^(1/4) in twoto1o4mask
    vorr.u64 twoto1o4mask, twoto1o4mask, double_1

    @ TODO: should set up sign here, expadjustment = 1.0
    vmov_f64 expadjustment, #0x70

    @ ss3 = (ss*ss) * ss
    vmul.f64 ss3, ss2, ss

    @ k0 = 1/2 (first non-unity coefficient)
    vmov_f64 k0, #0x60

    @ Mask out non-exponent bits to make sure we have just 2^int_n
    vand.i64 int_n, int_n, EXPONENT_MASK

    @ square twoto1o2mask to get 1.0 or 2^(1/2)
    vmul.f64 twoto1o2mask, twoto1o2mask, twoto1o2mask
    @ multiply twoto1o4mask into the exponent output adjustment value
    vmul.f64 expadjustment, expadjustment, twoto1o4mask

    @ k5 = ss k6 + k5
    vmla.f64 k5, ss, k6

    @ k3 = ss k4 + k3
    vmla.f64 k3, ss, k4

    @ k1 = ss k2 + k1
    vmla.f64 k1, ss, k2

    @ multiply twoto1o2mask into the exponent output adjustment value
    vmul.f64 expadjustment, expadjustment, twoto1o2mask

    @ k5 = ss^2 ( ss k8 + k7 ) + ss k6 + k5
    vmla.f64 k5, ss2, k7

    @ k1 = ss^2 ( ss k4 + k3 ) + ss k2 + k1
    vmla.f64 k1, ss2, k3

    @ Result = 1.0 + ss + 1/2 ss^2
    vmla.f64 Result, ss2, k0

    @ integer-add int_n (2^int_n's exponent bits) into expadjustment, scaling it by 2^int_n
    vadd.i64 expadjustment, int_n, expadjustment
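    @ (an integer add into the exponent field multiplies a double by
    @ 2^int_n without another fp multiply, assuming the final exponent
    @ stays in range)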

    @ k1 = ss^4 ( ss^2 ( ss k8 + k7 ) + ss k6 + k5 ) + ss^2 ( ss k4 + k3 ) + ss k2 + k1
    vmla.f64 k1, ss4, k5

    @ Result = 1.0 + ss + 1/2 ss^2 + ss^3 ( ss^4 ( ss^2 ( ss k8 + k7 ) + ss k6 + k5 ) + ss^2 ( ss k4 + k3 ) + ss k2 + k1 )
    vmla.f64 Result, ss3, k1

    @ multiply by the adjustment: (rounding fixup: 1, 2^(1/4), 2^(1/2) or 2^(3/4)) * 2^int_n
    vmul.f64 Result, expadjustment, Result

.LleavePow:
#if defined(KRAIT_NO_AAPCS_VFP_MODE)
    @ return Result (FP)
    vmov Return_lw, Return_hw, Result
#endif
.LleavePowDirect:
    @ leave directly, returning whatever is in Return_lw and Return_hw
    pop {r4, r5, r6, r7, r8, r9, r10, pc}

.align 6
.LliteralTable:
@ Least-squares tuned constants for 11th order (log2((1+s)/(1-s))):
.LL4: @ ~3/11
    .long 0x53a79915, 0x3fd1b108
.LL3: @ ~1/3
    .long 0x9ca0567a, 0x3fd554fa
.LL2: @ ~3/7
    .long 0x1408e660, 0x3fdb6db7
.LL1: @ ~3/5
    .long 0x332D4313, 0x3fe33333

@ Least-squares tuned constants for 10th order exp(s):
.LE10: @ ~1/3628800
    .long 0x25c7ba0a, 0x3e92819b
.LE9: @ ~1/362880
    .long 0x9499b49c, 0x3ec72294
.LE8: @ ~1/40320
    .long 0xabb79d95, 0x3efa019f
.LE7: @ ~1/5040
    .long 0x8723aeaa, 0x3f2a019f
.LE6: @ ~1/720
    .long 0x16c76a94, 0x3f56c16c
.LE5: @ ~1/120
    .long 0x11185da8, 0x3f811111
.LE4: @ ~1/24
    .long 0x5555551c, 0x3fa55555
.LE3: @ ~1/6
    .long 0x555554db, 0x3fc55555

.LbpA: @ (2^(2/5) - 1)
    .long 0x4ee54db1, 0x3fd472d1

.LbpB: @ (2^(4/5) - 2^(2/5))
    .long 0x1c8a36cf, 0x3fdafb62

.Ltwofifths: @ 2/5
    .long 0x9999999a, 0x3fd99999

.Ltwooverthreeln2: @ 2/(3 ln(2))
    .long 0xDC3A03FD, 0x3FEEC709

.Lln2: @ ln(2)
    .long 0xFEFA39EF, 0x3FE62E42

.Ltwoto1o4: @ 2^(1/4)
    .long 0x0a31b715, 0x3ff306fe
END(pow_neon)