| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1 | /* IEEE754 floating point arithmetic | 
|  | 2 | * double precision square root | 
|  | 3 | */ | 
|  | 4 | /* | 
|  | 5 | * MIPS floating point support | 
|  | 6 | * Copyright (C) 1994-2000 Algorithmics Ltd. | 
|  | 7 | * http://www.algor.co.uk | 
|  | 8 | * | 
|  | 9 | * ######################################################################## | 
|  | 10 | * | 
|  | 11 | *  This program is free software; you can distribute it and/or modify it | 
|  | 12 | *  under the terms of the GNU General Public License (Version 2) as | 
|  | 13 | *  published by the Free Software Foundation. | 
|  | 14 | * | 
|  | 15 | *  This program is distributed in the hope it will be useful, but WITHOUT | 
|  | 16 | *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | 
|  | 17 | *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License | 
|  | 18 | *  for more details. | 
|  | 19 | * | 
|  | 20 | *  You should have received a copy of the GNU General Public License along | 
|  | 21 | *  with this program; if not, write to the Free Software Foundation, Inc., | 
|  | 22 | *  59 Temple Place - Suite 330, Boston MA 02111-1307, USA. | 
|  | 23 | * | 
|  | 24 | * ######################################################################## | 
|  | 25 | */ | 
|  | 26 |  | 
|  | 27 |  | 
|  | 28 | #include "ieee754dp.h" | 
|  | 29 |  | 
|  | 30 | static const unsigned table[] = { | 
|  | 31 | 0, 1204, 3062, 5746, 9193, 13348, 18162, 23592, | 
|  | 32 | 29598, 36145, 43202, 50740, 58733, 67158, 75992, | 
|  | 33 | 85215, 83599, 71378, 60428, 50647, 41945, 34246, | 
|  | 34 | 27478, 21581, 16499, 12183, 8588, 5674, 3403, | 
|  | 35 | 1742, 661, 130 | 
|  | 36 | }; | 
|  | 37 |  | 
|  | 38 | ieee754dp ieee754dp_sqrt(ieee754dp x) | 
|  | 39 | { | 
| Ralf Baechle | cd21dfc | 2005-04-28 13:39:10 +0000 | [diff] [blame] | 40 | struct _ieee754_csr oldcsr; | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 41 | ieee754dp y, z, t; | 
|  | 42 | unsigned scalx, yh; | 
|  | 43 | COMPXDP; | 
|  | 44 |  | 
|  | 45 | EXPLODEXDP; | 
|  | 46 | CLEARCX; | 
|  | 47 | FLUSHXDP; | 
|  | 48 |  | 
|  | 49 | /* x == INF or NAN? */ | 
|  | 50 | switch (xc) { | 
|  | 51 | case IEEE754_CLASS_QNAN: | 
|  | 52 | /* sqrt(Nan) = Nan */ | 
|  | 53 | return ieee754dp_nanxcpt(x, "sqrt"); | 
|  | 54 | case IEEE754_CLASS_SNAN: | 
|  | 55 | SETCX(IEEE754_INVALID_OPERATION); | 
|  | 56 | return ieee754dp_nanxcpt(ieee754dp_indef(), "sqrt"); | 
|  | 57 | case IEEE754_CLASS_ZERO: | 
|  | 58 | /* sqrt(0) = 0 */ | 
|  | 59 | return x; | 
|  | 60 | case IEEE754_CLASS_INF: | 
|  | 61 | if (xs) { | 
|  | 62 | /* sqrt(-Inf) = Nan */ | 
|  | 63 | SETCX(IEEE754_INVALID_OPERATION); | 
|  | 64 | return ieee754dp_nanxcpt(ieee754dp_indef(), "sqrt"); | 
|  | 65 | } | 
|  | 66 | /* sqrt(+Inf) = Inf */ | 
|  | 67 | return x; | 
|  | 68 | case IEEE754_CLASS_DNORM: | 
|  | 69 | DPDNORMX; | 
|  | 70 | /* fall through */ | 
|  | 71 | case IEEE754_CLASS_NORM: | 
|  | 72 | if (xs) { | 
|  | 73 | /* sqrt(-x) = Nan */ | 
|  | 74 | SETCX(IEEE754_INVALID_OPERATION); | 
|  | 75 | return ieee754dp_nanxcpt(ieee754dp_indef(), "sqrt"); | 
|  | 76 | } | 
|  | 77 | break; | 
|  | 78 | } | 
|  | 79 |  | 
|  | 80 | /* save old csr; switch off INX enable & flag; set RN rounding */ | 
|  | 81 | oldcsr = ieee754_csr; | 
|  | 82 | ieee754_csr.mx &= ~IEEE754_INEXACT; | 
|  | 83 | ieee754_csr.sx &= ~IEEE754_INEXACT; | 
|  | 84 | ieee754_csr.rm = IEEE754_RN; | 
|  | 85 |  | 
|  | 86 | /* adjust exponent to prevent overflow */ | 
|  | 87 | scalx = 0; | 
|  | 88 | if (xe > 512) {		/* x > 2**-512? */ | 
|  | 89 | xe -= 512;	/* x = x / 2**512 */ | 
|  | 90 | scalx += 256; | 
|  | 91 | } else if (xe < -512) {	/* x < 2**-512? */ | 
|  | 92 | xe += 512;	/* x = x * 2**512 */ | 
|  | 93 | scalx -= 256; | 
|  | 94 | } | 
|  | 95 |  | 
|  | 96 | y = x = builddp(0, xe + DP_EBIAS, xm & ~DP_HIDDEN_BIT); | 
|  | 97 |  | 
|  | 98 | /* magic initial approximation to almost 8 sig. bits */ | 
|  | 99 | yh = y.bits >> 32; | 
|  | 100 | yh = (yh >> 1) + 0x1ff80000; | 
|  | 101 | yh = yh - table[(yh >> 15) & 31]; | 
|  | 102 | y.bits = ((u64) yh << 32) | (y.bits & 0xffffffff); | 
|  | 103 |  | 
|  | 104 | /* Heron's rule once with correction to improve to ~18 sig. bits */ | 
|  | 105 | /* t=x/y; y=y+t; py[n0]=py[n0]-0x00100006; py[n1]=0; */ | 
|  | 106 | t = ieee754dp_div(x, y); | 
|  | 107 | y = ieee754dp_add(y, t); | 
|  | 108 | y.bits -= 0x0010000600000000LL; | 
|  | 109 | y.bits &= 0xffffffff00000000LL; | 
|  | 110 |  | 
|  | 111 | /* triple to almost 56 sig. bits: y ~= sqrt(x) to within 1 ulp */ | 
|  | 112 | /* t=y*y; z=t;  pt[n0]+=0x00100000; t+=z; z=(x-z)*y; */ | 
|  | 113 | z = t = ieee754dp_mul(y, y); | 
|  | 114 | t.parts.bexp += 0x001; | 
|  | 115 | t = ieee754dp_add(t, z); | 
|  | 116 | z = ieee754dp_mul(ieee754dp_sub(x, z), y); | 
|  | 117 |  | 
|  | 118 | /* t=z/(t+x) ;  pt[n0]+=0x00100000; y+=t; */ | 
|  | 119 | t = ieee754dp_div(z, ieee754dp_add(t, x)); | 
|  | 120 | t.parts.bexp += 0x001; | 
|  | 121 | y = ieee754dp_add(y, t); | 
|  | 122 |  | 
|  | 123 | /* twiddle last bit to force y correctly rounded */ | 
|  | 124 |  | 
|  | 125 | /* set RZ, clear INEX flag */ | 
|  | 126 | ieee754_csr.rm = IEEE754_RZ; | 
|  | 127 | ieee754_csr.sx &= ~IEEE754_INEXACT; | 
|  | 128 |  | 
|  | 129 | /* t=x/y; ...chopped quotient, possibly inexact */ | 
|  | 130 | t = ieee754dp_div(x, y); | 
|  | 131 |  | 
|  | 132 | if (ieee754_csr.sx & IEEE754_INEXACT || t.bits != y.bits) { | 
|  | 133 |  | 
|  | 134 | if (!(ieee754_csr.sx & IEEE754_INEXACT)) | 
|  | 135 | /* t = t-ulp */ | 
|  | 136 | t.bits -= 1; | 
|  | 137 |  | 
|  | 138 | /* add inexact to result status */ | 
|  | 139 | oldcsr.cx |= IEEE754_INEXACT; | 
|  | 140 | oldcsr.sx |= IEEE754_INEXACT; | 
|  | 141 |  | 
|  | 142 | switch (oldcsr.rm) { | 
|  | 143 | case IEEE754_RP: | 
|  | 144 | y.bits += 1; | 
|  | 145 | /* drop through */ | 
|  | 146 | case IEEE754_RN: | 
|  | 147 | t.bits += 1; | 
|  | 148 | break; | 
|  | 149 | } | 
|  | 150 |  | 
|  | 151 | /* y=y+t; ...chopped sum */ | 
|  | 152 | y = ieee754dp_add(y, t); | 
|  | 153 |  | 
|  | 154 | /* adjust scalx for correctly rounded sqrt(x) */ | 
|  | 155 | scalx -= 1; | 
|  | 156 | } | 
|  | 157 |  | 
|  | 158 | /* py[n0]=py[n0]+scalx; ...scale back y */ | 
|  | 159 | y.parts.bexp += scalx; | 
|  | 160 |  | 
|  | 161 | /* restore rounding mode, possibly set inexact */ | 
|  | 162 | ieee754_csr = oldcsr; | 
|  | 163 |  | 
|  | 164 | return y; | 
|  | 165 | } |