@ Copyright (c) 2012, Code Aurora Forum. All rights reserved.
@
@ Redistribution and use in source and binary forms, with or without
@ modification, are permitted provided that the following conditions are
@ met:
@    * Redistributions of source code must retain the above copyright
@      notice, this list of conditions and the following disclaimer.
@    * Redistributions in binary form must reproduce the above
@      copyright notice, this list of conditions and the following
@      disclaimer in the documentation and/or other materials provided
@      with the distribution.
@    * Neither the name of Code Aurora Forum, Inc. nor the names of its
@      contributors may be used to endorse or promote products derived
@      from this software without specific prior written permission.
@
@ THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED
@ WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
@ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT
@ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
@ BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
@ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
@ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
@ BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
@ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
@ OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
@ IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <machine/cpu-features.h>
#include <machine/asm.h>

@ Values which exist for the program lifetime:
#define HIGH_WORD_MASK      d31
#define EXPONENT_MASK       d30
#define int_1               d29
#define double_1            d28
@ sign and 2^int_n fixup:
#define expadjustment       d7
#define literals            r10
@ Values which exist within both polynomial implementations:
#define int_n               d2
#define int_n_low           s4
#define int_n_high          s5
#define double_n            d3
#define k1                  d27
#define k2                  d26
#define k3                  d25
#define k4                  d24
@ Values which cross the boundaries between polynomial implementations:
#define ss                  d16
#define ss2                 d17
#define ss4                 d18
#define Result              d0
#define Return_hw           r1
#define Return_lw           r0
#define ylg2x               d0
@ Intermediate values only needed sometimes:
@ initial (sorted in approximate order of availability for overwriting):
#define x_hw                r1
#define x_lw                r0
#define y_hw                r3
#define y_lw                r2
#define x                   d0
#define bp                  d4
#define y                   d1
@ log series:
#define u                   d19
#define v                   d20
#define lg2coeff            d21
#define bpa                 d5
#define bpb                 d3
#define lg2const            d6
#define xmantissa           r8
#define twoto1o5            r4
#define twoto3o5            r5
#define ix                  r6
#define iEXP_MASK           r7
@ exp input setup:
#define twoto1o8mask        d3
#define twoto1o4mask        d4
#define twoto1o2mask        d1
#define ylg2x_round_offset  d16
#define ylg2x_temp          d17
#define yn_temp             d18
#define yn_round_offset     d19
#define ln2                 d5
@ Careful, overwriting HIGH_WORD_MASK, reset it if you need it again ...
#define rounded_exponent    d31
@ exp series:
#define k5                  d23
#define k6                  d22
#define k7                  d21
#define k8                  d20
#define ss3                 d19
@ overwrite double_1 (we're done with it by now)
#define k0                  d28
#define twoto1o4            d6

@ instructions that gas doesn't like to encode correctly:
#define vmov_f64            fconstd
#define vmov_f32            fconsts
#define vmovne_f64          fconstdne

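@ Overview (reader's sketch, not part of the original sources): pow_neon
@ computes x^y as 2^(y*log2(x)).  log2(x) splits into an integer part n plus
@ a bracketing constant lg2const, with the remainder estimated by an
@ atanh-style series in s = (m-bp)/(m+bp) for the normalized mantissa m.
@ y*log2(x) is then rounded to eighths (the integer part and quarter-steps
@ are applied exactly via exponent and 2^(1/4)/2^(1/2) fixups) and only the
@ small residual goes through a polynomial for e^s.  Roughly, in C
@ (illustrative only; assumes <math.h>, ignores sign/special cases):
@
@     double pow_sketch(double x, double y) {
@         double l = y * log2(x);                 // y*(n + lg2const + series)
@         double r = floor(8.0 * l + 1.0) / 8.0;  // fixed-point rounding (done
@                                                 // on y*n and y*lg2(x) separately below)
@         return exp2(r) * exp((l - r) * M_LN2);  // 2^r * e^(residual * ln2)
@     }
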
ENTRY(pow_neon)
#if defined(KRAIT_NO_AAPCS_VFP_MODE)
        @ ARM ABI has inputs coming in via r registers; let's move them to a d register
        vmov    x, x_lw, x_hw
#endif
        push    {r4, r5, r6, r7, r8, r9, r10, lr}

        @ pre-staged bp values
        vldr    bpa, .LbpA
        vldr    bpb, .LbpB
        @ load two fifths into constant term in case we need it due to offsets
        vldr    lg2const, .Ltwofifths

        @ bp is initially 1.0, may adjust later based on x value
        vmov_f64 bp, #0x70

        @ extract the mantissa from x for scaled value comparisons
        lsl     xmantissa, x_hw, #12

        @ twoto1o5 = 2^(1/5) (input bracketing)
        movw    twoto1o5, #0x186c
        movt    twoto1o5, #0x2611
        @ twoto3o5 = 2^(3/5) (input bracketing)
        movw    twoto3o5, #0x003b
        movt    twoto3o5, #0x8406

        @ finish extracting xmantissa
        orr     xmantissa, xmantissa, x_lw, lsr #20
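
        @ Reader's note: the lsl/orr pair above assembles the top 32 bits of
        @ x's 52-bit mantissa.  Equivalent C (illustrative; assumes
        @ <stdint.h> and <string.h>):
        @
        @     uint64_t bits; memcpy(&bits, &x, 8);
        @     uint32_t xmantissa = (uint32_t)((bits << 12) >> 32);
        @
        @ twoto1o5 and twoto3o5 hold the same field taken from 2^(1/5) and
        @ 2^(3/5), so plain integer compares bracket the normalized x.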

        @ begin preparing a mask for normalization
        vmov.i64 HIGH_WORD_MASK, #0xffffffff00000000

        @ double_1 = (double) 1.0
        vmov_f64 double_1, #0x70

#if defined(KRAIT_NO_AAPCS_VFP_MODE)
        @ move y from r registers to a d register
        vmov    y, y_lw, y_hw
#endif

        cmp     xmantissa, twoto1o5

        vshl.i64 EXPONENT_MASK, HIGH_WORD_MASK, #20
        vshr.u64 int_1, HIGH_WORD_MASK, #63

        adr     literals, .LliteralTable

        bhi     .Lxgt2to1over5
        @ zero out lg2 constant term if we don't offset our input
        vsub.f64 lg2const, lg2const, lg2const
        b       .Lxle2to1over5

.Lxgt2to1over5:
        @ if normalized x > 2^(1/5), bp = 1 + (2^(2/5)-1) = 2^(2/5)
        vadd.f64 bp, bp, bpa

.Lxle2to1over5:
        @ will need ln2 for various things
        vldr    ln2, .Lln2

        cmp     xmantissa, twoto3o5
@@@@ X Value Normalization @@@@

        @ ss = abs(x) 2^(-1024)
        vbic.i64 ss, x, EXPONENT_MASK

        @ N = (floor(log2(x)) + 0x3ff) * 2^52
        vand.i64 int_n, x, EXPONENT_MASK

        bls     .Lxle2to3over5
        @ if normalized x > 2^(3/5), bp = 2^(2/5) + (2^(4/5) - 2^(2/5)) = 2^(4/5)
        vadd.f64 bp, bp, bpb
        vadd.f64 lg2const, lg2const, lg2const
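
        @ Reader's summary of the bracketing (m = normalized mantissa of x):
        @     m <= 2^(1/5):            bp = 1         lg2const = 0
        @     2^(1/5) < m <= 2^(3/5):  bp = 2^(2/5)   lg2const = 2/5
        @     m > 2^(3/5):             bp = 2^(4/5)   lg2const = 4/5
        @ so s = (m-bp)/(m+bp) stays small and log2(m) = lg2const + series(s).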

.Lxle2to3over5:

        @ load log2 polynomial series constants
        vldm    literals!, {k4, k3, k2, k1}

        @ s = abs(x) 2^(-floor(log2(x))) (normalize abs(x) to around 1)
        vorr.i64 ss, ss, double_1

@@@@ 3/2 (Log(bp(1+s)/(1-s))) input computation (s = (x-bp)/(x+bp)) @@@@

        vsub.f64 u, ss, bp
        vadd.f64 v, ss, bp

        @ s = (x-bp)/(x+bp)
        vdiv.f64 ss, u, v

        @ load 2/(3log2) into lg2coeff
        vldr    lg2coeff, .Ltwooverthreeln2

        @ N = floor(log2(x)) * 2^52
        vsub.i64 int_n, int_n, double_1

@@@@ 3/2 (Log(bp(1+s)/(1-s))) polynomial series @@@@

        @ ss2 = ((x-bp)/(x+bp))^2
        vmul.f64 ss2, ss, ss
        @ ylg2x = 3.0
        vmov_f64 ylg2x, #8
        vmul.f64 ss4, ss2, ss2

        @ todo: useful later for two-way clamp
        vmul.f64 lg2coeff, lg2coeff, y

        @ N = floor(log2(x))
        vshr.s64 int_n, int_n, #52

        @ k3 = ss^2 * L4 + L3
        vmla.f64 k3, ss2, k4

        @ k1 = ss^2 * L2 + L1
        vmla.f64 k1, ss2, k2

        @ scale ss by 2/(3 ln 2)
        vmul.f64 lg2coeff, ss, lg2coeff

        @ ylg2x = 3.0 + s^2
        vadd.f64 ylg2x, ylg2x, ss2

        vcvt.f64.s32 double_n, int_n_low

        @ k1 = s^4 (s^2 L4 + L3) + s^2 L2 + L1
        vmla.f64 k1, ss4, k3

        @ add in constant term
        vadd.f64 double_n, lg2const

        @ ylg2x = 3.0 + s^2 + s^4 (s^4 (s^2 L4 + L3) + s^2 L2 + L1)
        vmla.f64 ylg2x, ss4, k1

        @ ylg2x = y 2 s / (3 ln(2)) (3.0 + s^2 + s^4 (s^4 (s^2 L4 + L3) + s^2 L2 + L1))
        vmul.f64 ylg2x, lg2coeff, ylg2x
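
        @ Reader's restatement of the series just evaluated (illustrative C;
        @ L1..L4 are the .LL1...LL4 constants, n and lg2const as above):
        @
        @     double s2 = s*s, s4 = s2*s2;
        @     double p  = 3.0 + s2 + s4*(L1 + s2*L2 + s4*(L3 + s2*L4));
        @     double ylg2x = y * s * (2.0/(3.0*M_LN2)) * p;  // ~ y*log2(m/bp)
        @     double yn    = y * (n + lg2const);             // computed below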

@@@@ Compute input to Exp(s) (s = y(n + log2(x)) - (floor(8 yn + 1)/8 + floor(8 ylog2(x) + 1)/8)) @@@@

        @ mask to extract bit 1 (2^-2 from our fixed-point representation)
        vshl.u64 twoto1o4mask, int_1, #1

        @ double_n = y * n
        vmul.f64 double_n, double_n, y

        @ Load 2^(1/4) for later computations
        vldr    twoto1o4, .Ltwoto1o4

        @ either add or subtract one based on the sign of double_n and ylg2x
        vshr.s64 ylg2x_round_offset, ylg2x, #62
        vshr.s64 yn_round_offset, double_n, #62

        @ move unmodified y*lg2x into temp space
        vmov    ylg2x_temp, ylg2x
        @ compute floor(8 y * n + 1)/8
        @ and floor(8 y (log2(x)) + 1)/8
        vcvt.s32.f64 ylg2x, ylg2x, #3
        @ move unmodified y*n into temp space
        vmov    yn_temp, double_n
        vcvt.s32.f64 double_n, double_n, #3

        @ load exp polynomial series constants
        vldm    literals!, {k8, k7, k6, k5, k4, k3, k2, k1}

        @ mask to extract bit 2 (2^-1 from our fixed-point representation)
        vshl.u64 twoto1o2mask, int_1, #2

        @ make rounding offsets either 1 or -1 instead of 0 or -2
        vorr.u64 ylg2x_round_offset, ylg2x_round_offset, int_1
        vorr.u64 yn_round_offset, yn_round_offset, int_1

        @ round up to the nearest 1/8th
        vadd.s32 ylg2x, ylg2x, ylg2x_round_offset
        vadd.s32 double_n, double_n, yn_round_offset

        @ clear out round-up bit for y log2(x)
        vbic.s32 ylg2x, ylg2x, int_1
        @ clear out round-up bit for yn
        vbic.s32 double_n, double_n, int_1
        @ add together the (fixed precision) rounded parts
        vadd.s64 rounded_exponent, double_n, ylg2x
        @ turn int_n into a double with value 2^int_n
        vshl.i64 int_n, rounded_exponent, #49
        @ compute masks for 2^(1/4) and 2^(1/2) fixups for fractional part of fixed-precision rounded values:
        vand.u64 twoto1o4mask, twoto1o4mask, rounded_exponent
        vand.u64 twoto1o2mask, twoto1o2mask, rounded_exponent

        @ convert back into floating point; double_n now holds (double) floor(8 y * n + 1)/8
        @ ylg2x now holds (double) floor(8 y * log2(x) + 1)/8
        vcvt.f64.s32 ylg2x, ylg2x, #3
        vcvt.f64.s32 double_n, double_n, #3

        @ put the 2 bit (0.5) through the roof of twoto1o2mask (make it 0x0 or 0xffffffffffffffff)
        vqshl.u64 twoto1o2mask, twoto1o2mask, #62
        @ put the 1 bit (0.25) through the roof of twoto1o4mask (make it 0x0 or 0xffffffffffffffff)
        vqshl.u64 twoto1o4mask, twoto1o4mask, #63
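
        @ The rounding above, restated in C (a sketch; VCVT to fixed-point
        @ rounds toward zero, and v stands for either y*n or y*log2(x)):
        @
        @     int32_t f = (int32_t)(v * 8.0);   // vcvt #3: eighths, toward zero
        @     f += (v < 0.0) ? -1 : 1;          // signed round offset
        @     f &= ~1;                          // clear the round-up bit
        @     // bits 1-2 of f select the 2^(1/4)/2^(1/2) fixups, bits 3 and
        @     // up the integer exponent; f/8.0 is what gets subtracted below.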

        @ center y*log2(x) fractional part between -0.125 and 0.125 by subtracting (double) floor(8 y * log2(x) + 1)/8
        vsub.f64 ylg2x_temp, ylg2x_temp, ylg2x
        @ center y*n fractional part between -0.125 and 0.125 by subtracting (double) floor(8 y * n + 1)/8
        vsub.f64 yn_temp, yn_temp, double_n

        @ Add fractional parts of yn and y log2(x) together
        vadd.f64 ss, ylg2x_temp, yn_temp

        @ Result = 1.0 (offset for exp(s) series)
        vmov_f64 Result, #0x70

        @ multiply fractional part of y * log2(x) by ln(2)
        vmul.f64 ss, ln2, ss
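
        @ Reader's note: multiplying by ln(2) converts the base-2 fractional
        @ exponent f into a natural-log argument via 2^f = e^(f*ln2), so the
        @ series below only has to approximate e^s for small s.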

@@@@ 10th order polynomial series for Exp(s) @@@@

        @ ss2 = (ss)^2
        vmul.f64 ss2, ss, ss

        @ twoto1o2mask = twoto1o2mask & twoto1o4
        vand.u64 twoto1o2mask, twoto1o2mask, twoto1o4
        @ twoto1o4mask = twoto1o4mask & twoto1o4
        vand.u64 twoto1o4mask, twoto1o4mask, twoto1o4

        @ Result = 1.0 + ss
        vadd.f64 Result, Result, ss

        @ k7 = ss k8 + k7
        vmla.f64 k7, ss, k8

        @ ss4 = (ss*ss) * (ss*ss)
        vmul.f64 ss4, ss2, ss2

        @ twoto1o2mask = twoto1o2mask | (double) 1.0 - results in either 1.0 or 2^(1/4) in twoto1o2mask
        vorr.u64 twoto1o2mask, twoto1o2mask, double_1
        @ twoto1o4mask = twoto1o4mask | (double) 1.0 - results in either 1.0 or 2^(1/4) in twoto1o4mask
        vorr.u64 twoto1o4mask, twoto1o4mask, double_1

        @ TODO: should set up sign here; expadjustment = 1.0
        vmov_f64 expadjustment, #0x70

        @ ss3 = (ss*ss) * ss
        vmul.f64 ss3, ss2, ss

        @ k0 = 1/2 (first non-unity coefficient)
        vmov_f64 k0, #0x60

        @ Mask out non-exponent bits to make sure we have just 2^int_n
        vand.i64 int_n, int_n, EXPONENT_MASK

        @ square twoto1o2mask to get 1.0 or 2^(1/2)
        vmul.f64 twoto1o2mask, twoto1o2mask, twoto1o2mask
        @ multiply twoto1o4mask into the exponent output adjustment value
        vmul.f64 expadjustment, expadjustment, twoto1o4mask

        @ k5 = ss k6 + k5
        vmla.f64 k5, ss, k6

        @ k3 = ss k4 + k3
        vmla.f64 k3, ss, k4

        @ k1 = ss k2 + k1
        vmla.f64 k1, ss, k2

        @ multiply twoto1o2mask into exponent output adjustment value
        vmul.f64 expadjustment, expadjustment, twoto1o2mask

        @ k5 = ss^2 ( ss k8 + k7 ) + ss k6 + k5
        vmla.f64 k5, ss2, k7

        @ k1 = ss^2 ( ss k4 + k3 ) + ss k2 + k1
        vmla.f64 k1, ss2, k3

        @ Result = 1.0 + ss + 1/2 ss^2
        vmla.f64 Result, ss2, k0

        @ Adjust int_n so that it's a double precision value that can be multiplied by Result
        vadd.i64 expadjustment, int_n, expadjustment
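
        @ Reader's note: the integer add above scales expadjustment (one of
        @ 1, 2^(1/4), 2^(1/2), 2^(3/4)) by 2^n by bumping its exponent field
        @ directly.  Sketch in C (no overflow/underflow handling):
        @
        @     uint64_t b; memcpy(&b, &adj, 8);
        @     b += (uint64_t)n << 52;           // multiply adj by 2^n
        @     memcpy(&adj, &b, 8);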

        @ k1 = ss^4 ( ss^2 ( ss k8 + k7 ) + ss k6 + k5 ) + ss^2 ( ss k4 + k3 ) + ss k2 + k1
        vmla.f64 k1, ss4, k5

        @ Result = 1.0 + ss + 1/2 ss^2 + ss^3 ( ss^4 ( ss^2 ( ss k8 + k7 ) + ss k6 + k5 ) + ss^2 ( ss k4 + k3 ) + ss k2 + k1 )
        vmla.f64 Result, ss3, k1

        @ multiply by adjustment (sign*(rounding ? sqrt(2) : 1) * 2^int_n)
        vmul.f64 Result, expadjustment, Result
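
        @ Reader's flattening of the e^s evaluation above (illustrative C;
        @ k1..k8 are the .LE3...LE10 constants):
        @
        @     double s2 = s*s, s3 = s2*s, s4 = s2*s2;
        @     double hi = (k8*s + k7)*s2 + k6*s + k5;
        @     double lo = (k4*s + k3)*s2 + k2*s + k1;
        @     double e  = 1.0 + s + 0.5*s2 + s3*(s4*hi + lo);
        @     double result = expadjustment * e;  // fold in 2^n and fixups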

.LleavePow:
#if defined(KRAIT_NO_AAPCS_VFP_MODE)
        @ return Result (FP)
        vmov    Return_lw, Return_hw, Result
#endif
.LleavePowDirect:
        @ leave directly, returning whatever is in Return_lw and Return_hw
        pop     {r4, r5, r6, r7, r8, r9, r10, pc}

.align 6
.LliteralTable:
@ Least-squares tuned constants for 11th order log2((1+s)/(1-s)):
.LL4: @ ~3/11
        .long 0x53a79915, 0x3fd1b108
.LL3: @ ~1/3
        .long 0x9ca0567a, 0x3fd554fa
.LL2: @ ~3/7
        .long 0x1408e660, 0x3fdb6db7
.LL1: @ ~3/5
        .long 0x332D4313, 0x3fe33333

@ Least-squares tuned constants for 10th order exp(s):
.LE10: @ ~1/3628800
        .long 0x25c7ba0a, 0x3e92819b
.LE9: @ ~1/362880
        .long 0x9499b49c, 0x3ec72294
.LE8: @ ~1/40320
        .long 0xabb79d95, 0x3efa019f
.LE7: @ ~1/5040
        .long 0x8723aeaa, 0x3f2a019f
.LE6: @ ~1/720
        .long 0x16c76a94, 0x3f56c16c
.LE5: @ ~1/120
        .long 0x11185da8, 0x3f811111
.LE4: @ ~1/24
        .long 0x5555551c, 0x3fa55555
.LE3: @ ~1/6
        .long 0x555554db, 0x3fc55555

.LbpA: @ (2^(2/5) - 1)
        .long 0x4ee54db1, 0x3fd472d1

.LbpB: @ (2^(4/5) - 2^(2/5))
        .long 0x1c8a36cf, 0x3fdafb62

.Ltwofifths: @ 2/5
        .long 0x9999999a, 0x3fd99999

.Ltwooverthreeln2: @ 2/(3 ln(2))
        .long 0xDC3A03FD, 0x3FEEC709

.Lln2: @ ln(2)
        .long 0xFEFA39EF, 0x3FE62E42

.Ltwoto1o4: @ 2^(1/4)
        .long 0x0a31b715, 0x3ff306fe
END(pow_neon)