@ Copyright (c) 2012, Code Aurora Forum. All rights reserved.
@
@ Redistribution and use in source and binary forms, with or without
@ modification, are permitted provided that the following conditions are
@ met:
@    * Redistributions of source code must retain the above copyright
@      notice, this list of conditions and the following disclaimer.
@    * Redistributions in binary form must reproduce the above
@      copyright notice, this list of conditions and the following
@      disclaimer in the documentation and/or other materials provided
@      with the distribution.
@    * Neither the name of Code Aurora Forum, Inc. nor the names of its
@      contributors may be used to endorse or promote products derived
@      from this software without specific prior written permission.
@
@ THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED
@ WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
@ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT
@ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
@ BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
@ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
@ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
@ BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
@ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
@ OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
@ IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <machine/cpu-features.h>
#include <machine/asm.h>

@ Values which exist for the program lifetime:
#define HIGH_WORD_MASK      d31
#define EXPONENT_MASK       d30
#define int_1               d29
#define double_1            d28
@ sign and 2^int_n fixup:
#define expadjustment       d7
#define literals            r10
@ Values which exist within both polynomial implementations:
#define int_n               d2
#define int_n_low           s4
#define int_n_high          s5
#define double_n            d3
#define k1                  d27
#define k2                  d26
#define k3                  d25
#define k4                  d24
@ Values which cross the boundaries between polynomial implementations:
#define ss                  d16
#define ss2                 d17
#define ss4                 d18
#define Result              d0
#define Return_hw           r1
#define Return_lw           r0
#define ylg2x               d0
@ Intermediate values only needed sometimes:
@ initial (sorted in approximate order of availability for overwriting):
#define x_hw                r1
#define x_lw                r0
#define y_hw                r3
#define y_lw                r2
#define x                   d0
#define bp                  d4
#define y                   d1
@ log series:
#define u                   d19
#define v                   d20
#define lg2coeff            d21
#define bpa                 d5
#define bpb                 d3
#define lg2const            d6
#define xmantissa           r8
#define twoto1o5            r4
#define twoto3o5            r5
#define ix                  r6
#define iEXP_MASK           r7
@ exp input setup:
#define twoto1o8mask        d3
#define twoto1o4mask        d4
#define twoto1o2mask        d1
#define ylg2x_round_offset  d16
#define ylg2x_temp          d17
#define yn_temp             d18
#define yn_round_offset     d19
#define ln2                 d5
@ Careful, overwriting HIGH_WORD_MASK, reset it if you need it again ...
#define rounded_exponent    d31
@ exp series:
#define k5                  d23
#define k6                  d22
#define k7                  d21
#define k8                  d20
#define ss3                 d19
@ overwrite double_1 (we're done with it by now)
#define k0                  d28
#define twoto1o4            d6

@ instructions that gas doesn't like to encode correctly:
#define vmov_f64            fconstd
#define vmov_f32            fconsts
#define vmovne_f64          fconstdne

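@ Overview (reader's sketch, not part of the original sources): pow_neon
@ computes x^y as 2^(y*log2(x)).  log2(x) splits into an integer part n plus
@ a bracketing constant lg2const, with the remainder estimated by an
@ atanh-style series in s = (m-bp)/(m+bp) for the normalized mantissa m.
@ y*log2(x) is then rounded to eighths (the integer part and quarter-steps
@ are applied exactly via exponent and 2^(1/4)/2^(1/2) fixups) and only the
@ small residual goes through a polynomial for e^s.  Roughly, in C
@ (illustrative only; assumes <math.h>, ignores sign/special cases):
@
@     double pow_sketch(double x, double y) {
@         double l = y * log2(x);                 // y*(n + lg2const + series)
@         double r = floor(8.0 * l + 1.0) / 8.0;  // fixed-point rounding (done
@                                                 // on y*n and y*lg2(x) separately below)
@         return exp2(r) * exp((l - r) * M_LN2);  // 2^r * e^(residual * ln2)
@     }
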
ENTRY(pow_neon)
#if defined(KRAIT_NO_AAPCS_VFP_MODE)
        @ ARM ABI has inputs coming in via r registers; let's move them to a d register
        vmov    x, x_lw, x_hw
#endif
        push    {r4, r5, r6, r7, r8, r9, r10, lr}

        @ pre-staged bp values
        vldr    bpa, .LbpA
        vldr    bpb, .LbpB
        @ load two fifths into constant term in case we need it due to offsets
        vldr    lg2const, .Ltwofifths

        @ bp is initially 1.0, may adjust later based on x value
        vmov_f64 bp, #0x70

        @ extract the mantissa from x for scaled value comparisons
        lsl     xmantissa, x_hw, #12

        @ twoto1o5 = 2^(1/5) (input bracketing)
        movw    twoto1o5, #0x186c
        movt    twoto1o5, #0x2611
        @ twoto3o5 = 2^(3/5) (input bracketing)
        movw    twoto3o5, #0x003b
        movt    twoto3o5, #0x8406

        @ finish extracting xmantissa
        orr     xmantissa, xmantissa, x_lw, lsr #20
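
        @ Reader's note: the lsl/orr pair above assembles the top 32 bits of
        @ x's 52-bit mantissa.  Equivalent C (illustrative; assumes
        @ <stdint.h> and <string.h>):
        @
        @     uint64_t bits; memcpy(&bits, &x, 8);
        @     uint32_t xmantissa = (uint32_t)((bits << 12) >> 32);
        @
        @ twoto1o5 and twoto3o5 hold the same field taken from 2^(1/5) and
        @ 2^(3/5), so plain integer compares bracket the normalized x.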

        @ begin preparing a mask for normalization
        vmov.i64 HIGH_WORD_MASK, #0xffffffff00000000

        @ double_1 = (double) 1.0
        vmov_f64 double_1, #0x70

#if defined(KRAIT_NO_AAPCS_VFP_MODE)
        @ move y from r registers to a d register
        vmov    y, y_lw, y_hw
#endif

        cmp     xmantissa, twoto1o5

        vshl.i64 EXPONENT_MASK, HIGH_WORD_MASK, #20
        vshr.u64 int_1, HIGH_WORD_MASK, #63

        adr     literals, .LliteralTable

        bhi     .Lxgt2to1over5
        @ zero out lg2 constant term if we don't offset our input
        vsub.f64 lg2const, lg2const, lg2const
        b       .Lxle2to1over5

.Lxgt2to1over5:
        @ if normalized x > 2^(1/5), bp = 1 + (2^(2/5)-1) = 2^(2/5)
        vadd.f64 bp, bp, bpa

.Lxle2to1over5:
        @ will need ln2 for various things
        vldr    ln2, .Lln2

        cmp     xmantissa, twoto3o5
@@@@ X Value Normalization @@@@

        @ ss = abs(x) 2^(-1024)
        vbic.i64 ss, x, EXPONENT_MASK

        @ N = (floor(log2(x)) + 0x3ff) * 2^52
        vand.i64 int_n, x, EXPONENT_MASK

        bls     .Lxle2to3over5
        @ if normalized x > 2^(3/5), bp = 2^(2/5) + (2^(4/5) - 2^(2/5)) = 2^(4/5)
        vadd.f64 bp, bp, bpb
        vadd.f64 lg2const, lg2const, lg2const
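
        @ Reader's summary of the bracketing (m = normalized mantissa of x):
        @     m <= 2^(1/5):            bp = 1         lg2const = 0
        @     2^(1/5) < m <= 2^(3/5):  bp = 2^(2/5)   lg2const = 2/5
        @     m > 2^(3/5):             bp = 2^(4/5)   lg2const = 4/5
        @ so s = (m-bp)/(m+bp) stays small and log2(m) = lg2const + series(s).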

.Lxle2to3over5:

        @ load log2 polynomial series constants
        vldm    literals!, {k4, k3, k2, k1}

        @ s = abs(x) 2^(-floor(log2(x))) (normalize abs(x) to around 1)
        vorr.i64 ss, ss, double_1

@@@@ 3/2 (Log(bp(1+s)/(1-s))) input computation (s = (x-bp)/(x+bp)) @@@@

        vsub.f64 u, ss, bp
        vadd.f64 v, ss, bp

        @ s = (x-bp)/(x+bp)
        vdiv.f64 ss, u, v

        @ load 2/(3log2) into lg2coeff
        vldr    lg2coeff, .Ltwooverthreeln2

        @ N = floor(log2(x)) * 2^52
        vsub.i64 int_n, int_n, double_1

@@@@ 3/2 (Log(bp(1+s)/(1-s))) polynomial series @@@@

        @ ss2 = ((x-bp)/(x+bp))^2
        vmul.f64 ss2, ss, ss
        @ ylg2x = 3.0
        vmov_f64 ylg2x, #8
        vmul.f64 ss4, ss2, ss2

        @ todo: useful later for two-way clamp
        vmul.f64 lg2coeff, lg2coeff, y

        @ N = floor(log2(x))
        vshr.s64 int_n, int_n, #52

        @ k3 = ss^2 * L4 + L3
        vmla.f64 k3, ss2, k4

        @ k1 = ss^2 * L2 + L1
        vmla.f64 k1, ss2, k2

        @ scale ss by 2/(3 ln 2)
        vmul.f64 lg2coeff, ss, lg2coeff

        @ ylg2x = 3.0 + s^2
        vadd.f64 ylg2x, ylg2x, ss2

        vcvt.f64.s32 double_n, int_n_low

        @ k1 = s^4 (s^2 L4 + L3) + s^2 L2 + L1
        vmla.f64 k1, ss4, k3

        @ add in constant term
        vadd.f64 double_n, lg2const

        @ ylg2x = 3.0 + s^2 + s^4 (s^4 (s^2 L4 + L3) + s^2 L2 + L1)
        vmla.f64 ylg2x, ss4, k1

        @ ylg2x = y 2 s / (3 ln(2)) (3.0 + s^2 + s^4 (s^4 (s^2 L4 + L3) + s^2 L2 + L1))
        vmul.f64 ylg2x, lg2coeff, ylg2x
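
        @ Reader's restatement of the series just evaluated (illustrative C;
        @ L1..L4 are the .LL1...LL4 constants, n and lg2const as above):
        @
        @     double s2 = s*s, s4 = s2*s2;
        @     double p  = 3.0 + s2 + s4*(L1 + s2*L2 + s4*(L3 + s2*L4));
        @     double ylg2x = y * s * (2.0/(3.0*M_LN2)) * p;  // ~ y*log2(m/bp)
        @     double yn    = y * (n + lg2const);             // computed below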

@@@@ Compute input to Exp(s) (s = y(n + log2(x)) - (floor(8 yn + 1)/8 + floor(8 ylog2(x) + 1)/8)) @@@@

        @ mask to extract bit 1 (2^-2 from our fixed-point representation)
        vshl.u64 twoto1o4mask, int_1, #1

        @ double_n = y * n
        vmul.f64 double_n, double_n, y

        @ Load 2^(1/4) for later computations
        vldr    twoto1o4, .Ltwoto1o4

        @ either add or subtract one based on the sign of double_n and ylg2x
        vshr.s64 ylg2x_round_offset, ylg2x, #62
        vshr.s64 yn_round_offset, double_n, #62

        @ move unmodified y*lg2x into temp space
        vmov    ylg2x_temp, ylg2x
        @ compute floor(8 y * n + 1)/8
        @ and floor(8 y (log2(x)) + 1)/8
        vcvt.s32.f64 ylg2x, ylg2x, #3
        @ move unmodified y*n into temp space
        vmov    yn_temp, double_n
        vcvt.s32.f64 double_n, double_n, #3

        @ load exp polynomial series constants
        vldm    literals!, {k8, k7, k6, k5, k4, k3, k2, k1}

        @ mask to extract bit 2 (2^-1 from our fixed-point representation)
        vshl.u64 twoto1o2mask, int_1, #2

        @ make rounding offsets either 1 or -1 instead of 0 or -2
        vorr.u64 ylg2x_round_offset, ylg2x_round_offset, int_1
        vorr.u64 yn_round_offset, yn_round_offset, int_1

        @ round up to the nearest 1/8th
        vadd.s32 ylg2x, ylg2x, ylg2x_round_offset
        vadd.s32 double_n, double_n, yn_round_offset

        @ clear out round-up bit for y log2(x)
        vbic.s32 ylg2x, ylg2x, int_1
        @ clear out round-up bit for yn
        vbic.s32 double_n, double_n, int_1
        @ add together the (fixed precision) rounded parts
        vadd.s64 rounded_exponent, double_n, ylg2x
        @ turn int_n into a double with value 2^int_n
        vshl.i64 int_n, rounded_exponent, #49
        @ compute masks for 2^(1/4) and 2^(1/2) fixups for fractional part of fixed-precision rounded values:
        vand.u64 twoto1o4mask, twoto1o4mask, rounded_exponent
        vand.u64 twoto1o2mask, twoto1o2mask, rounded_exponent

        @ convert back into floating point; double_n now holds (double) floor(8 y * n + 1)/8
        @ ylg2x now holds (double) floor(8 y * log2(x) + 1)/8
        vcvt.f64.s32 ylg2x, ylg2x, #3
        vcvt.f64.s32 double_n, double_n, #3

        @ put the 2 bit (0.5) through the roof of twoto1o2mask (make it 0x0 or 0xffffffffffffffff)
        vqshl.u64 twoto1o2mask, twoto1o2mask, #62
        @ put the 1 bit (0.25) through the roof of twoto1o4mask (make it 0x0 or 0xffffffffffffffff)
        vqshl.u64 twoto1o4mask, twoto1o4mask, #63
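
        @ The rounding above, restated in C (a sketch; VCVT to fixed-point
        @ rounds toward zero, and v stands for either y*n or y*log2(x)):
        @
        @     int32_t f = (int32_t)(v * 8.0);   // vcvt #3: eighths, toward zero
        @     f += (v < 0.0) ? -1 : 1;          // signed round offset
        @     f &= ~1;                          // clear the round-up bit
        @     // bits 1-2 of f select the 2^(1/4)/2^(1/2) fixups, bits 3 and
        @     // up the integer exponent; f/8.0 is what gets subtracted below.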

        @ center y*log2(x) fractional part between -0.125 and 0.125 by subtracting (double) floor(8 y * log2(x) + 1)/8
        vsub.f64 ylg2x_temp, ylg2x_temp, ylg2x
        @ center y*n fractional part between -0.125 and 0.125 by subtracting (double) floor(8 y * n + 1)/8
        vsub.f64 yn_temp, yn_temp, double_n

        @ Add fractional parts of yn and y log2(x) together
        vadd.f64 ss, ylg2x_temp, yn_temp

        @ Result = 1.0 (offset for exp(s) series)
        vmov_f64 Result, #0x70

        @ multiply fractional part of y * log2(x) by ln(2)
        vmul.f64 ss, ln2, ss
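
        @ Reader's note: multiplying by ln(2) converts the base-2 fractional
        @ exponent f into a natural-log argument via 2^f = e^(f*ln2), so the
        @ series below only has to approximate e^s for small s.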

@@@@ 10th order polynomial series for Exp(s) @@@@

        @ ss2 = (ss)^2
        vmul.f64 ss2, ss, ss

        @ twoto1o2mask = twoto1o2mask & twoto1o4
        vand.u64 twoto1o2mask, twoto1o2mask, twoto1o4
        @ twoto1o4mask = twoto1o4mask & twoto1o4
        vand.u64 twoto1o4mask, twoto1o4mask, twoto1o4

        @ Result = 1.0 + ss
        vadd.f64 Result, Result, ss

        @ k7 = ss k8 + k7
        vmla.f64 k7, ss, k8

        @ ss4 = (ss*ss) * (ss*ss)
        vmul.f64 ss4, ss2, ss2

        @ twoto1o2mask = twoto1o2mask | (double) 1.0 - results in either 1.0 or 2^(1/4) in twoto1o2mask
        vorr.u64 twoto1o2mask, twoto1o2mask, double_1
        @ twoto1o4mask = twoto1o4mask | (double) 1.0 - results in either 1.0 or 2^(1/4) in twoto1o4mask
        vorr.u64 twoto1o4mask, twoto1o4mask, double_1

        @ TODO: should set up sign here; expadjustment = 1.0
        vmov_f64 expadjustment, #0x70

        @ ss3 = (ss*ss) * ss
        vmul.f64 ss3, ss2, ss

        @ k0 = 1/2 (first non-unity coefficient)
        vmov_f64 k0, #0x60

        @ Mask out non-exponent bits to make sure we have just 2^int_n
        vand.i64 int_n, int_n, EXPONENT_MASK

        @ square twoto1o2mask to get 1.0 or 2^(1/2)
        vmul.f64 twoto1o2mask, twoto1o2mask, twoto1o2mask
        @ multiply twoto1o4mask into the exponent output adjustment value
        vmul.f64 expadjustment, expadjustment, twoto1o4mask

        @ k5 = ss k6 + k5
        vmla.f64 k5, ss, k6

        @ k3 = ss k4 + k3
        vmla.f64 k3, ss, k4

        @ k1 = ss k2 + k1
        vmla.f64 k1, ss, k2

        @ multiply twoto1o2mask into exponent output adjustment value
        vmul.f64 expadjustment, expadjustment, twoto1o2mask

        @ k5 = ss^2 ( ss k8 + k7 ) + ss k6 + k5
        vmla.f64 k5, ss2, k7

        @ k1 = ss^2 ( ss k4 + k3 ) + ss k2 + k1
        vmla.f64 k1, ss2, k3

        @ Result = 1.0 + ss + 1/2 ss^2
        vmla.f64 Result, ss2, k0

        @ Adjust int_n so that it's a double precision value that can be multiplied by Result
        vadd.i64 expadjustment, int_n, expadjustment
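
        @ Reader's note: the integer add above scales expadjustment (one of
        @ 1, 2^(1/4), 2^(1/2), 2^(3/4)) by 2^n by bumping its exponent field
        @ directly.  Sketch in C (no overflow/underflow handling):
        @
        @     uint64_t b; memcpy(&b, &adj, 8);
        @     b += (uint64_t)n << 52;           // multiply adj by 2^n
        @     memcpy(&adj, &b, 8);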

        @ k1 = ss^4 ( ss^2 ( ss k8 + k7 ) + ss k6 + k5 ) + ss^2 ( ss k4 + k3 ) + ss k2 + k1
        vmla.f64 k1, ss4, k5

        @ Result = 1.0 + ss + 1/2 ss^2 + ss^3 ( ss^4 ( ss^2 ( ss k8 + k7 ) + ss k6 + k5 ) + ss^2 ( ss k4 + k3 ) + ss k2 + k1 )
        vmla.f64 Result, ss3, k1

        @ multiply by adjustment (sign*(rounding ? sqrt(2) : 1) * 2^int_n)
        vmul.f64 Result, expadjustment, Result
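
        @ Reader's flattening of the e^s evaluation above (illustrative C;
        @ k1..k8 are the .LE3...LE10 constants):
        @
        @     double s2 = s*s, s3 = s2*s, s4 = s2*s2;
        @     double hi = (k8*s + k7)*s2 + k6*s + k5;
        @     double lo = (k4*s + k3)*s2 + k2*s + k1;
        @     double e  = 1.0 + s + 0.5*s2 + s3*(s4*hi + lo);
        @     double result = expadjustment * e;  // fold in 2^n and fixups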

.LleavePow:
#if defined(KRAIT_NO_AAPCS_VFP_MODE)
        @ return Result (FP)
        vmov    Return_lw, Return_hw, Result
#endif
.LleavePowDirect:
        @ leave directly, returning whatever is in Return_lw and Return_hw
        pop     {r4, r5, r6, r7, r8, r9, r10, pc}

.align 6
.LliteralTable:
@ Least-squares tuned constants for 11th order log2((1+s)/(1-s)):
.LL4: @ ~3/11
        .long 0x53a79915, 0x3fd1b108
.LL3: @ ~1/3
        .long 0x9ca0567a, 0x3fd554fa
.LL2: @ ~3/7
        .long 0x1408e660, 0x3fdb6db7
.LL1: @ ~3/5
        .long 0x332D4313, 0x3fe33333

@ Least-squares tuned constants for 10th order exp(s):
.LE10: @ ~1/3628800
        .long 0x25c7ba0a, 0x3e92819b
.LE9: @ ~1/362880
        .long 0x9499b49c, 0x3ec72294
.LE8: @ ~1/40320
        .long 0xabb79d95, 0x3efa019f
.LE7: @ ~1/5040
        .long 0x8723aeaa, 0x3f2a019f
.LE6: @ ~1/720
        .long 0x16c76a94, 0x3f56c16c
.LE5: @ ~1/120
        .long 0x11185da8, 0x3f811111
.LE4: @ ~1/24
        .long 0x5555551c, 0x3fa55555
.LE3: @ ~1/6
        .long 0x555554db, 0x3fc55555

.LbpA: @ (2^(2/5) - 1)
        .long 0x4ee54db1, 0x3fd472d1

.LbpB: @ (2^(4/5) - 2^(2/5))
        .long 0x1c8a36cf, 0x3fdafb62

.Ltwofifths: @ 2/5
        .long 0x9999999a, 0x3fd99999

.Ltwooverthreeln2: @ 2/(3 ln(2))
        .long 0xDC3A03FD, 0x3FEEC709

.Lln2: @ ln(2)
        .long 0xFEFA39EF, 0x3FE62E42

.Ltwoto1o4: @ 2^(1/4)
        .long 0x0a31b715, 0x3ff306fe
END(pow_neon)