| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1 |  | 
 | 2 | /* | 
 | 3 | =============================================================================== | 
 | 4 |  | 
 | 5 | This C source fragment is part of the SoftFloat IEC/IEEE Floating-point | 
 | 6 | Arithmetic Package, Release 2. | 
 | 7 |  | 
 | 8 | Written by John R. Hauser.  This work was made possible in part by the | 
 | 9 | International Computer Science Institute, located at Suite 600, 1947 Center | 
 | 10 | Street, Berkeley, California 94704.  Funding was partially provided by the | 
 | 11 | National Science Foundation under grant MIP-9311980.  The original version | 
 | 12 | of this code was written as part of a project to build a fixed-point vector | 
 | 13 | processor in collaboration with the University of California at Berkeley, | 
 | 14 | overseen by Profs. Nelson Morgan and John Wawrzynek.  More information | 
 | 15 | is available through the web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ | 
 | 16 | arithmetic/softfloat.html'. | 
 | 17 |  | 
 | 18 | THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort | 
 | 19 | has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT | 
 | 20 | TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO | 
 | 21 | PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY | 
 | 22 | AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. | 
 | 23 |  | 
 | 24 | Derivative works are acceptable, even for commercial purposes, so long as | 
 | 25 | (1) they include prominent notice that the work is derivative, and (2) they | 
 | 26 | include prominent notice akin to these three paragraphs for those parts of | 
 | 27 | this code that are retained. | 
 | 28 |  | 
 | 29 | =============================================================================== | 
 | 30 | */ | 
 | 31 |  | 
 | 32 | /* | 
 | 33 | ------------------------------------------------------------------------------- | 
 | 34 | Shifts `a' right by the number of bits given in `count'.  If any nonzero | 
 | 35 | bits are shifted off, they are ``jammed'' into the least significant bit of | 
 | 36 | the result by setting the least significant bit to 1.  The value of `count' | 
 | 37 | can be arbitrarily large; in particular, if `count' is greater than 32, the | 
 | 38 | result will be either 0 or 1, depending on whether `a' is zero or nonzero. | 
 | 39 | The result is stored in the location pointed to by `zPtr'. | 
 | 40 | ------------------------------------------------------------------------------- | 
 | 41 | */ | 
 | 42 | INLINE void shift32RightJamming( bits32 a, int16 count, bits32 *zPtr ) | 
 | 43 | { | 
 | 44 |     bits32 z; | 
 | 45 |     if ( count == 0 ) { | 
 | 46 |         z = a; | 
 | 47 |     } | 
 | 48 |     else if ( count < 32 ) { | 
 | 49 |         z = ( a>>count ) | ( ( a<<( ( - count ) & 31 ) ) != 0 ); | 
 | 50 |     } | 
 | 51 |     else { | 
 | 52 |         z = ( a != 0 ); | 
 | 53 |     } | 
 | 54 |     *zPtr = z; | 
 | 55 | } | 
 | 56 |  | 
 | 57 | /* | 
 | 58 | ------------------------------------------------------------------------------- | 
 | 59 | Shifts `a' right by the number of bits given in `count'.  If any nonzero | 
 | 60 | bits are shifted off, they are ``jammed'' into the least significant bit of | 
 | 61 | the result by setting the least significant bit to 1.  The value of `count' | 
 | 62 | can be arbitrarily large; in particular, if `count' is greater than 64, the | 
 | 63 | result will be either 0 or 1, depending on whether `a' is zero or nonzero. | 
 | 64 | The result is stored in the location pointed to by `zPtr'. | 
 | 65 | ------------------------------------------------------------------------------- | 
 | 66 | */ | 
 | 67 | INLINE void shift64RightJamming( bits64 a, int16 count, bits64 *zPtr ) | 
 | 68 | { | 
 | 69 |     bits64 z; | 
 | 70 |  | 
 | 71 |  __asm__("@shift64RightJamming -- start");    | 
 | 72 |     if ( count == 0 ) { | 
 | 73 |         z = a; | 
 | 74 |     } | 
 | 75 |     else if ( count < 64 ) { | 
 | 76 |         z = ( a>>count ) | ( ( a<<( ( - count ) & 63 ) ) != 0 ); | 
 | 77 |     } | 
 | 78 |     else { | 
 | 79 |         z = ( a != 0 ); | 
 | 80 |     } | 
 | 81 |  __asm__("@shift64RightJamming -- end");    | 
 | 82 |     *zPtr = z; | 
 | 83 | } | 
 | 84 |  | 
 | 85 | /* | 
 | 86 | ------------------------------------------------------------------------------- | 
 | 87 | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64 | 
 | 88 | _plus_ the number of bits given in `count'.  The shifted result is at most | 
 | 89 | 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'.  The | 
 | 90 | bits shifted off form a second 64-bit result as follows:  The _last_ bit | 
 | 91 | shifted off is the most-significant bit of the extra result, and the other | 
 | 92 | 63 bits of the extra result are all zero if and only if _all_but_the_last_ | 
 | 93 | bits shifted off were all zero.  This extra result is stored in the location | 
 | 94 | pointed to by `z1Ptr'.  The value of `count' can be arbitrarily large. | 
 | 95 |     (This routine makes more sense if `a0' and `a1' are considered to form a | 
 | 96 | fixed-point value with binary point between `a0' and `a1'.  This fixed-point | 
 | 97 | value is shifted right by the number of bits given in `count', and the | 
 | 98 | integer part of the result is returned at the location pointed to by | 
 | 99 | `z0Ptr'.  The fractional part of the result may be slightly corrupted as | 
 | 100 | described above, and is returned at the location pointed to by `z1Ptr'.) | 
 | 101 | ------------------------------------------------------------------------------- | 
 | 102 | */ | 
 | 103 | INLINE void | 
 | 104 |  shift64ExtraRightJamming( | 
 | 105 |      bits64 a0, bits64 a1, int16 count, bits64 *z0Ptr, bits64 *z1Ptr ) | 
 | 106 | { | 
 | 107 |     bits64 z0, z1; | 
 | 108 |     int8 negCount = ( - count ) & 63; | 
 | 109 |  | 
 | 110 |     if ( count == 0 ) { | 
 | 111 |         z1 = a1; | 
 | 112 |         z0 = a0; | 
 | 113 |     } | 
 | 114 |     else if ( count < 64 ) { | 
 | 115 |         z1 = ( a0<<negCount ) | ( a1 != 0 ); | 
 | 116 |         z0 = a0>>count; | 
 | 117 |     } | 
 | 118 |     else { | 
 | 119 |         if ( count == 64 ) { | 
 | 120 |             z1 = a0 | ( a1 != 0 ); | 
 | 121 |         } | 
 | 122 |         else { | 
 | 123 |             z1 = ( ( a0 | a1 ) != 0 ); | 
 | 124 |         } | 
 | 125 |         z0 = 0; | 
 | 126 |     } | 
 | 127 |     *z1Ptr = z1; | 
 | 128 |     *z0Ptr = z0; | 
 | 129 |  | 
 | 130 | } | 
 | 131 |  | 
 | 132 | /* | 
 | 133 | ------------------------------------------------------------------------------- | 
 | 134 | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the | 
 | 135 | number of bits given in `count'.  Any bits shifted off are lost.  The value | 
 | 136 | of `count' can be arbitrarily large; in particular, if `count' is greater | 
 | 137 | than 128, the result will be 0.  The result is broken into two 64-bit pieces | 
 | 138 | which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'. | 
 | 139 | ------------------------------------------------------------------------------- | 
 | 140 | */ | 
 | 141 | INLINE void | 
 | 142 |  shift128Right( | 
 | 143 |      bits64 a0, bits64 a1, int16 count, bits64 *z0Ptr, bits64 *z1Ptr ) | 
 | 144 | { | 
 | 145 |     bits64 z0, z1; | 
 | 146 |     int8 negCount = ( - count ) & 63; | 
 | 147 |  | 
 | 148 |     if ( count == 0 ) { | 
 | 149 |         z1 = a1; | 
 | 150 |         z0 = a0; | 
 | 151 |     } | 
 | 152 |     else if ( count < 64 ) { | 
 | 153 |         z1 = ( a0<<negCount ) | ( a1>>count ); | 
 | 154 |         z0 = a0>>count; | 
 | 155 |     } | 
 | 156 |     else { | 
 | 157 |         z1 = ( count < 64 ) ? ( a0>>( count & 63 ) ) : 0; | 
 | 158 |         z0 = 0; | 
 | 159 |     } | 
 | 160 |     *z1Ptr = z1; | 
 | 161 |     *z0Ptr = z0; | 
 | 162 |  | 
 | 163 | } | 
 | 164 |  | 
 | 165 | /* | 
 | 166 | ------------------------------------------------------------------------------- | 
 | 167 | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the | 
 | 168 | number of bits given in `count'.  If any nonzero bits are shifted off, they | 
 | 169 | are ``jammed'' into the least significant bit of the result by setting the | 
 | 170 | least significant bit to 1.  The value of `count' can be arbitrarily large; | 
 | 171 | in particular, if `count' is greater than 128, the result will be either 0 | 
 | 172 | or 1, depending on whether the concatenation of `a0' and `a1' is zero or | 
 | 173 | nonzero.  The result is broken into two 64-bit pieces which are stored at | 
 | 174 | the locations pointed to by `z0Ptr' and `z1Ptr'. | 
 | 175 | ------------------------------------------------------------------------------- | 
 | 176 | */ | 
 | 177 | INLINE void | 
 | 178 |  shift128RightJamming( | 
 | 179 |      bits64 a0, bits64 a1, int16 count, bits64 *z0Ptr, bits64 *z1Ptr ) | 
 | 180 | { | 
 | 181 |     bits64 z0, z1; | 
 | 182 |     int8 negCount = ( - count ) & 63; | 
 | 183 |  | 
 | 184 |     if ( count == 0 ) { | 
 | 185 |         z1 = a1; | 
 | 186 |         z0 = a0; | 
 | 187 |     } | 
 | 188 |     else if ( count < 64 ) { | 
 | 189 |         z1 = ( a0<<negCount ) | ( a1>>count ) | ( ( a1<<negCount ) != 0 ); | 
 | 190 |         z0 = a0>>count; | 
 | 191 |     } | 
 | 192 |     else { | 
 | 193 |         if ( count == 64 ) { | 
 | 194 |             z1 = a0 | ( a1 != 0 ); | 
 | 195 |         } | 
 | 196 |         else if ( count < 128 ) { | 
 | 197 |             z1 = ( a0>>( count & 63 ) ) | ( ( ( a0<<negCount ) | a1 ) != 0 ); | 
 | 198 |         } | 
 | 199 |         else { | 
 | 200 |             z1 = ( ( a0 | a1 ) != 0 ); | 
 | 201 |         } | 
 | 202 |         z0 = 0; | 
 | 203 |     } | 
 | 204 |     *z1Ptr = z1; | 
 | 205 |     *z0Ptr = z0; | 
 | 206 |  | 
 | 207 | } | 
 | 208 |  | 
 | 209 | /* | 
 | 210 | ------------------------------------------------------------------------------- | 
 | 211 | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right | 
 | 212 | by 64 _plus_ the number of bits given in `count'.  The shifted result is | 
 | 213 | at most 128 nonzero bits; these are broken into two 64-bit pieces which are | 
 | 214 | stored at the locations pointed to by `z0Ptr' and `z1Ptr'.  The bits shifted | 
 | 215 | off form a third 64-bit result as follows:  The _last_ bit shifted off is | 
 | 216 | the most-significant bit of the extra result, and the other 63 bits of the | 
 | 217 | extra result are all zero if and only if _all_but_the_last_ bits shifted off | 
 | 218 | were all zero.  This extra result is stored in the location pointed to by | 
 | 219 | `z2Ptr'.  The value of `count' can be arbitrarily large. | 
 | 220 |     (This routine makes more sense if `a0', `a1', and `a2' are considered | 
 | 221 | to form a fixed-point value with binary point between `a1' and `a2'.  This | 
 | 222 | fixed-point value is shifted right by the number of bits given in `count', | 
 | 223 | and the integer part of the result is returned at the locations pointed to | 
 | 224 | by `z0Ptr' and `z1Ptr'.  The fractional part of the result may be slightly | 
 | 225 | corrupted as described above, and is returned at the location pointed to by | 
 | 226 | `z2Ptr'.) | 
 | 227 | ------------------------------------------------------------------------------- | 
 | 228 | */ | 
 | 229 | INLINE void | 
 | 230 |  shift128ExtraRightJamming( | 
 | 231 |      bits64 a0, | 
 | 232 |      bits64 a1, | 
 | 233 |      bits64 a2, | 
 | 234 |      int16 count, | 
 | 235 |      bits64 *z0Ptr, | 
 | 236 |      bits64 *z1Ptr, | 
 | 237 |      bits64 *z2Ptr | 
 | 238 |  ) | 
 | 239 | { | 
 | 240 |     bits64 z0, z1, z2; | 
 | 241 |     int8 negCount = ( - count ) & 63; | 
 | 242 |  | 
 | 243 |     if ( count == 0 ) { | 
 | 244 |         z2 = a2; | 
 | 245 |         z1 = a1; | 
 | 246 |         z0 = a0; | 
 | 247 |     } | 
 | 248 |     else { | 
 | 249 |         if ( count < 64 ) { | 
 | 250 |             z2 = a1<<negCount; | 
 | 251 |             z1 = ( a0<<negCount ) | ( a1>>count ); | 
 | 252 |             z0 = a0>>count; | 
 | 253 |         } | 
 | 254 |         else { | 
 | 255 |             if ( count == 64 ) { | 
 | 256 |                 z2 = a1; | 
 | 257 |                 z1 = a0; | 
 | 258 |             } | 
 | 259 |             else { | 
 | 260 |                 a2 |= a1; | 
 | 261 |                 if ( count < 128 ) { | 
 | 262 |                     z2 = a0<<negCount; | 
 | 263 |                     z1 = a0>>( count & 63 ); | 
 | 264 |                 } | 
 | 265 |                 else { | 
 | 266 |                     z2 = ( count == 128 ) ? a0 : ( a0 != 0 ); | 
 | 267 |                     z1 = 0; | 
 | 268 |                 } | 
 | 269 |             } | 
 | 270 |             z0 = 0; | 
 | 271 |         } | 
 | 272 |         z2 |= ( a2 != 0 ); | 
 | 273 |     } | 
 | 274 |     *z2Ptr = z2; | 
 | 275 |     *z1Ptr = z1; | 
 | 276 |     *z0Ptr = z0; | 
 | 277 |  | 
 | 278 | } | 
 | 279 |  | 
 | 280 | /* | 
 | 281 | ------------------------------------------------------------------------------- | 
 | 282 | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the | 
 | 283 | number of bits given in `count'.  Any bits shifted off are lost.  The value | 
 | 284 | of `count' must be less than 64.  The result is broken into two 64-bit | 
 | 285 | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'. | 
 | 286 | ------------------------------------------------------------------------------- | 
 | 287 | */ | 
 | 288 | INLINE void | 
 | 289 |  shortShift128Left( | 
 | 290 |      bits64 a0, bits64 a1, int16 count, bits64 *z0Ptr, bits64 *z1Ptr ) | 
 | 291 | { | 
 | 292 |  | 
 | 293 |     *z1Ptr = a1<<count; | 
 | 294 |     *z0Ptr = | 
 | 295 |         ( count == 0 ) ? a0 : ( a0<<count ) | ( a1>>( ( - count ) & 63 ) ); | 
 | 296 |  | 
 | 297 | } | 
 | 298 |  | 
 | 299 | /* | 
 | 300 | ------------------------------------------------------------------------------- | 
 | 301 | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left | 
 | 302 | by the number of bits given in `count'.  Any bits shifted off are lost. | 
 | 303 | The value of `count' must be less than 64.  The result is broken into three | 
 | 304 | 64-bit pieces which are stored at the locations pointed to by `z0Ptr', | 
 | 305 | `z1Ptr', and `z2Ptr'. | 
 | 306 | ------------------------------------------------------------------------------- | 
 | 307 | */ | 
 | 308 | INLINE void | 
 | 309 |  shortShift192Left( | 
 | 310 |      bits64 a0, | 
 | 311 |      bits64 a1, | 
 | 312 |      bits64 a2, | 
 | 313 |      int16 count, | 
 | 314 |      bits64 *z0Ptr, | 
 | 315 |      bits64 *z1Ptr, | 
 | 316 |      bits64 *z2Ptr | 
 | 317 |  ) | 
 | 318 | { | 
 | 319 |     bits64 z0, z1, z2; | 
 | 320 |     int8 negCount; | 
 | 321 |  | 
 | 322 |     z2 = a2<<count; | 
 | 323 |     z1 = a1<<count; | 
 | 324 |     z0 = a0<<count; | 
 | 325 |     if ( 0 < count ) { | 
 | 326 |         negCount = ( ( - count ) & 63 ); | 
 | 327 |         z1 |= a2>>negCount; | 
 | 328 |         z0 |= a1>>negCount; | 
 | 329 |     } | 
 | 330 |     *z2Ptr = z2; | 
 | 331 |     *z1Ptr = z1; | 
 | 332 |     *z0Ptr = z0; | 
 | 333 |  | 
 | 334 | } | 
 | 335 |  | 
 | 336 | /* | 
 | 337 | ------------------------------------------------------------------------------- | 
 | 338 | Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit | 
 | 339 | value formed by concatenating `b0' and `b1'.  Addition is modulo 2^128, so | 
 | 340 | any carry out is lost.  The result is broken into two 64-bit pieces which | 
 | 341 | are stored at the locations pointed to by `z0Ptr' and `z1Ptr'. | 
 | 342 | ------------------------------------------------------------------------------- | 
 | 343 | */ | 
 | 344 | INLINE void | 
 | 345 |  add128( | 
 | 346 |      bits64 a0, bits64 a1, bits64 b0, bits64 b1, bits64 *z0Ptr, bits64 *z1Ptr ) | 
 | 347 | { | 
 | 348 |     bits64 z1; | 
 | 349 |  | 
 | 350 |     z1 = a1 + b1; | 
 | 351 |     *z1Ptr = z1; | 
 | 352 |     *z0Ptr = a0 + b0 + ( z1 < a1 ); | 
 | 353 |  | 
 | 354 | } | 
 | 355 |  | 
 | 356 | /* | 
 | 357 | ------------------------------------------------------------------------------- | 
 | 358 | Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the | 
 | 359 | 192-bit value formed by concatenating `b0', `b1', and `b2'.  Addition is | 
 | 360 | modulo 2^192, so any carry out is lost.  The result is broken into three | 
 | 361 | 64-bit pieces which are stored at the locations pointed to by `z0Ptr', | 
 | 362 | `z1Ptr', and `z2Ptr'. | 
 | 363 | ------------------------------------------------------------------------------- | 
 | 364 | */ | 
 | 365 | INLINE void | 
 | 366 |  add192( | 
 | 367 |      bits64 a0, | 
 | 368 |      bits64 a1, | 
 | 369 |      bits64 a2, | 
 | 370 |      bits64 b0, | 
 | 371 |      bits64 b1, | 
 | 372 |      bits64 b2, | 
 | 373 |      bits64 *z0Ptr, | 
 | 374 |      bits64 *z1Ptr, | 
 | 375 |      bits64 *z2Ptr | 
 | 376 |  ) | 
 | 377 | { | 
 | 378 |     bits64 z0, z1, z2; | 
 | 379 |     int8 carry0, carry1; | 
 | 380 |  | 
 | 381 |     z2 = a2 + b2; | 
 | 382 |     carry1 = ( z2 < a2 ); | 
 | 383 |     z1 = a1 + b1; | 
 | 384 |     carry0 = ( z1 < a1 ); | 
 | 385 |     z0 = a0 + b0; | 
 | 386 |     z1 += carry1; | 
 | 387 |     z0 += ( z1 < carry1 ); | 
 | 388 |     z0 += carry0; | 
 | 389 |     *z2Ptr = z2; | 
 | 390 |     *z1Ptr = z1; | 
 | 391 |     *z0Ptr = z0; | 
 | 392 |  | 
 | 393 | } | 
 | 394 |  | 
 | 395 | /* | 
 | 396 | ------------------------------------------------------------------------------- | 
 | 397 | Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the | 
 | 398 | 128-bit value formed by concatenating `a0' and `a1'.  Subtraction is modulo | 
 | 399 | 2^128, so any borrow out (carry out) is lost.  The result is broken into two | 
 | 400 | 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and | 
 | 401 | `z1Ptr'. | 
 | 402 | ------------------------------------------------------------------------------- | 
 | 403 | */ | 
 | 404 | INLINE void | 
 | 405 |  sub128( | 
 | 406 |      bits64 a0, bits64 a1, bits64 b0, bits64 b1, bits64 *z0Ptr, bits64 *z1Ptr ) | 
 | 407 | { | 
 | 408 |  | 
 | 409 |     *z1Ptr = a1 - b1; | 
 | 410 |     *z0Ptr = a0 - b0 - ( a1 < b1 ); | 
 | 411 |  | 
 | 412 | } | 
 | 413 |  | 
 | 414 | /* | 
 | 415 | ------------------------------------------------------------------------------- | 
 | 416 | Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2' | 
 | 417 | from the 192-bit value formed by concatenating `a0', `a1', and `a2'. | 
 | 418 | Subtraction is modulo 2^192, so any borrow out (carry out) is lost.  The | 
 | 419 | result is broken into three 64-bit pieces which are stored at the locations | 
 | 420 | pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'. | 
 | 421 | ------------------------------------------------------------------------------- | 
 | 422 | */ | 
 | 423 | INLINE void | 
 | 424 |  sub192( | 
 | 425 |      bits64 a0, | 
 | 426 |      bits64 a1, | 
 | 427 |      bits64 a2, | 
 | 428 |      bits64 b0, | 
 | 429 |      bits64 b1, | 
 | 430 |      bits64 b2, | 
 | 431 |      bits64 *z0Ptr, | 
 | 432 |      bits64 *z1Ptr, | 
 | 433 |      bits64 *z2Ptr | 
 | 434 |  ) | 
 | 435 | { | 
 | 436 |     bits64 z0, z1, z2; | 
 | 437 |     int8 borrow0, borrow1; | 
 | 438 |  | 
 | 439 |     z2 = a2 - b2; | 
 | 440 |     borrow1 = ( a2 < b2 ); | 
 | 441 |     z1 = a1 - b1; | 
 | 442 |     borrow0 = ( a1 < b1 ); | 
 | 443 |     z0 = a0 - b0; | 
 | 444 |     z0 -= ( z1 < borrow1 ); | 
 | 445 |     z1 -= borrow1; | 
 | 446 |     z0 -= borrow0; | 
 | 447 |     *z2Ptr = z2; | 
 | 448 |     *z1Ptr = z1; | 
 | 449 |     *z0Ptr = z0; | 
 | 450 |  | 
 | 451 | } | 
 | 452 |  | 
 | 453 | /* | 
 | 454 | ------------------------------------------------------------------------------- | 
 | 455 | Multiplies `a' by `b' to obtain a 128-bit product.  The product is broken | 
 | 456 | into two 64-bit pieces which are stored at the locations pointed to by | 
 | 457 | `z0Ptr' and `z1Ptr'. | 
 | 458 | ------------------------------------------------------------------------------- | 
 | 459 | */ | 
 | 460 | INLINE void mul64To128( bits64 a, bits64 b, bits64 *z0Ptr, bits64 *z1Ptr ) | 
 | 461 | { | 
 | 462 |     bits32 aHigh, aLow, bHigh, bLow; | 
 | 463 |     bits64 z0, zMiddleA, zMiddleB, z1; | 
 | 464 |  | 
 | 465 |     aLow = a; | 
 | 466 |     aHigh = a>>32; | 
 | 467 |     bLow = b; | 
 | 468 |     bHigh = b>>32; | 
 | 469 |     z1 = ( (bits64) aLow ) * bLow; | 
 | 470 |     zMiddleA = ( (bits64) aLow ) * bHigh; | 
 | 471 |     zMiddleB = ( (bits64) aHigh ) * bLow; | 
 | 472 |     z0 = ( (bits64) aHigh ) * bHigh; | 
 | 473 |     zMiddleA += zMiddleB; | 
 | 474 |     z0 += ( ( (bits64) ( zMiddleA < zMiddleB ) )<<32 ) + ( zMiddleA>>32 ); | 
 | 475 |     zMiddleA <<= 32; | 
 | 476 |     z1 += zMiddleA; | 
 | 477 |     z0 += ( z1 < zMiddleA ); | 
 | 478 |     *z1Ptr = z1; | 
 | 479 |     *z0Ptr = z0; | 
 | 480 |  | 
 | 481 | } | 
 | 482 |  | 
 | 483 | /* | 
 | 484 | ------------------------------------------------------------------------------- | 
 | 485 | Multiplies the 128-bit value formed by concatenating `a0' and `a1' by `b' to | 
 | 486 | obtain a 192-bit product.  The product is broken into three 64-bit pieces | 
 | 487 | which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and | 
 | 488 | `z2Ptr'. | 
 | 489 | ------------------------------------------------------------------------------- | 
 | 490 | */ | 
 | 491 | INLINE void | 
 | 492 |  mul128By64To192( | 
 | 493 |      bits64 a0, | 
 | 494 |      bits64 a1, | 
 | 495 |      bits64 b, | 
 | 496 |      bits64 *z0Ptr, | 
 | 497 |      bits64 *z1Ptr, | 
 | 498 |      bits64 *z2Ptr | 
 | 499 |  ) | 
 | 500 | { | 
 | 501 |     bits64 z0, z1, z2, more1; | 
 | 502 |  | 
 | 503 |     mul64To128( a1, b, &z1, &z2 ); | 
 | 504 |     mul64To128( a0, b, &z0, &more1 ); | 
 | 505 |     add128( z0, more1, 0, z1, &z0, &z1 ); | 
 | 506 |     *z2Ptr = z2; | 
 | 507 |     *z1Ptr = z1; | 
 | 508 |     *z0Ptr = z0; | 
 | 509 |  | 
 | 510 | } | 
 | 511 |  | 
 | 512 | /* | 
 | 513 | ------------------------------------------------------------------------------- | 
 | 514 | Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the | 
 | 515 | 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit | 
 | 516 | product.  The product is broken into four 64-bit pieces which are stored at | 
 | 517 | the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'. | 
 | 518 | ------------------------------------------------------------------------------- | 
 | 519 | */ | 
 | 520 | INLINE void | 
 | 521 |  mul128To256( | 
 | 522 |      bits64 a0, | 
 | 523 |      bits64 a1, | 
 | 524 |      bits64 b0, | 
 | 525 |      bits64 b1, | 
 | 526 |      bits64 *z0Ptr, | 
 | 527 |      bits64 *z1Ptr, | 
 | 528 |      bits64 *z2Ptr, | 
 | 529 |      bits64 *z3Ptr | 
 | 530 |  ) | 
 | 531 | { | 
 | 532 |     bits64 z0, z1, z2, z3; | 
 | 533 |     bits64 more1, more2; | 
 | 534 |  | 
 | 535 |     mul64To128( a1, b1, &z2, &z3 ); | 
 | 536 |     mul64To128( a1, b0, &z1, &more2 ); | 
 | 537 |     add128( z1, more2, 0, z2, &z1, &z2 ); | 
 | 538 |     mul64To128( a0, b0, &z0, &more1 ); | 
 | 539 |     add128( z0, more1, 0, z1, &z0, &z1 ); | 
 | 540 |     mul64To128( a0, b1, &more1, &more2 ); | 
 | 541 |     add128( more1, more2, 0, z2, &more1, &z2 ); | 
 | 542 |     add128( z0, z1, 0, more1, &z0, &z1 ); | 
 | 543 |     *z3Ptr = z3; | 
 | 544 |     *z2Ptr = z2; | 
 | 545 |     *z1Ptr = z1; | 
 | 546 |     *z0Ptr = z0; | 
 | 547 |  | 
 | 548 | } | 
 | 549 |  | 
 | 550 | /* | 
 | 551 | ------------------------------------------------------------------------------- | 
 | 552 | Returns an approximation to the 64-bit integer quotient obtained by dividing | 
 | 553 | `b' into the 128-bit value formed by concatenating `a0' and `a1'.  The | 
 | 554 | divisor `b' must be at least 2^63.  If q is the exact quotient truncated | 
 | 555 | toward zero, the approximation returned lies between q and q + 2 inclusive. | 
 | 556 | If the exact quotient q is larger than 64 bits, the maximum positive 64-bit | 
 | 557 | unsigned integer is returned. | 
 | 558 | ------------------------------------------------------------------------------- | 
 | 559 | */ | 
 | 560 | static bits64 estimateDiv128To64( bits64 a0, bits64 a1, bits64 b ) | 
 | 561 | { | 
 | 562 |     bits64 b0, b1; | 
 | 563 |     bits64 rem0, rem1, term0, term1; | 
 | 564 |     bits64 z; | 
 | 565 |     if ( b <= a0 ) return LIT64( 0xFFFFFFFFFFFFFFFF ); | 
| Nicolas Pitre | c1241c4c | 2005-06-23 21:56:46 +0100 | [diff] [blame] | 566 |     b0 = b>>32;  /* hence b0 is 32 bits wide now */ | 
 | 567 |     if ( b0<<32 <= a0 ) { | 
 | 568 |         z = LIT64( 0xFFFFFFFF00000000 ); | 
 | 569 |     }  else { | 
 | 570 |         z = a0; | 
 | 571 |         do_div( z, b0 ); | 
 | 572 |         z <<= 32; | 
 | 573 |     } | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 574 |     mul64To128( b, z, &term0, &term1 ); | 
 | 575 |     sub128( a0, a1, term0, term1, &rem0, &rem1 ); | 
 | 576 |     while ( ( (sbits64) rem0 ) < 0 ) { | 
 | 577 |         z -= LIT64( 0x100000000 ); | 
 | 578 |         b1 = b<<32; | 
 | 579 |         add128( rem0, rem1, b0, b1, &rem0, &rem1 ); | 
 | 580 |     } | 
 | 581 |     rem0 = ( rem0<<32 ) | ( rem1>>32 ); | 
| Nicolas Pitre | c1241c4c | 2005-06-23 21:56:46 +0100 | [diff] [blame] | 582 |     if ( b0<<32 <= rem0 ) { | 
 | 583 |         z |= 0xFFFFFFFF; | 
 | 584 |     } else { | 
 | 585 |         do_div( rem0, b0 ); | 
 | 586 |         z |= rem0; | 
 | 587 |     } | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 588 |     return z; | 
 | 589 |  | 
 | 590 | } | 
 | 591 |  | 
 | 592 | /* | 
 | 593 | ------------------------------------------------------------------------------- | 
 | 594 | Returns an approximation to the square root of the 32-bit significand given | 
 | 595 | by `a'.  Considered as an integer, `a' must be at least 2^31.  If bit 0 of | 
 | 596 | `aExp' (the least significant bit) is 1, the integer returned approximates | 
 | 597 | 2^31*sqrt(`a'/2^31), where `a' is considered an integer.  If bit 0 of `aExp' | 
 | 598 | is 0, the integer returned approximates 2^31*sqrt(`a'/2^30).  In either | 
 | 599 | case, the approximation returned lies strictly within +/-2 of the exact | 
 | 600 | value. | 
 | 601 | ------------------------------------------------------------------------------- | 
 | 602 | */ | 
 | 603 | static bits32 estimateSqrt32( int16 aExp, bits32 a ) | 
 | 604 | { | 
 | 605 |     static const bits16 sqrtOddAdjustments[] = { | 
 | 606 |         0x0004, 0x0022, 0x005D, 0x00B1, 0x011D, 0x019F, 0x0236, 0x02E0, | 
 | 607 |         0x039C, 0x0468, 0x0545, 0x0631, 0x072B, 0x0832, 0x0946, 0x0A67 | 
 | 608 |     }; | 
 | 609 |     static const bits16 sqrtEvenAdjustments[] = { | 
 | 610 |         0x0A2D, 0x08AF, 0x075A, 0x0629, 0x051A, 0x0429, 0x0356, 0x029E, | 
 | 611 |         0x0200, 0x0179, 0x0109, 0x00AF, 0x0068, 0x0034, 0x0012, 0x0002 | 
 | 612 |     }; | 
 | 613 |     int8 index; | 
 | 614 |     bits32 z; | 
| Nicolas Pitre | c1241c4c | 2005-06-23 21:56:46 +0100 | [diff] [blame] | 615 |     bits64 A; | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 616 |  | 
 | 617 |     index = ( a>>27 ) & 15; | 
 | 618 |     if ( aExp & 1 ) { | 
 | 619 |         z = 0x4000 + ( a>>17 ) - sqrtOddAdjustments[ index ]; | 
 | 620 |         z = ( ( a / z )<<14 ) + ( z<<15 ); | 
 | 621 |         a >>= 1; | 
 | 622 |     } | 
 | 623 |     else { | 
 | 624 |         z = 0x8000 + ( a>>17 ) - sqrtEvenAdjustments[ index ]; | 
 | 625 |         z = a / z + z; | 
 | 626 |         z = ( 0x20000 <= z ) ? 0xFFFF8000 : ( z<<15 ); | 
 | 627 |         if ( z <= a ) return (bits32) ( ( (sbits32) a )>>1 ); | 
 | 628 |     } | 
| Nicolas Pitre | c1241c4c | 2005-06-23 21:56:46 +0100 | [diff] [blame] | 629 |     A = ( (bits64) a )<<31; | 
 | 630 |     do_div( A, z ); | 
 | 631 |     return ( (bits32) A ) + ( z>>1 ); | 
| Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 632 |  | 
 | 633 | } | 
 | 634 |  | 
 | 635 | /* | 
 | 636 | ------------------------------------------------------------------------------- | 
 | 637 | Returns the number of leading 0 bits before the most-significant 1 bit | 
 | 638 | of `a'.  If `a' is zero, 32 is returned. | 
 | 639 | ------------------------------------------------------------------------------- | 
 | 640 | */ | 
 | 641 | static int8 countLeadingZeros32( bits32 a ) | 
 | 642 | { | 
 | 643 |     static const int8 countLeadingZerosHigh[] = { | 
 | 644 |         8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, | 
 | 645 |         3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, | 
 | 646 |         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | 
 | 647 |         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | 
 | 648 |         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 
 | 649 |         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 
 | 650 |         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 
 | 651 |         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 
 | 652 |         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 
 | 653 |         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 
 | 654 |         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 
 | 655 |         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 
 | 656 |         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 
 | 657 |         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 
 | 658 |         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 
 | 659 |         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | 
 | 660 |     }; | 
 | 661 |     int8 shiftCount; | 
 | 662 |  | 
 | 663 |     shiftCount = 0; | 
 | 664 |     if ( a < 0x10000 ) { | 
 | 665 |         shiftCount += 16; | 
 | 666 |         a <<= 16; | 
 | 667 |     } | 
 | 668 |     if ( a < 0x1000000 ) { | 
 | 669 |         shiftCount += 8; | 
 | 670 |         a <<= 8; | 
 | 671 |     } | 
 | 672 |     shiftCount += countLeadingZerosHigh[ a>>24 ]; | 
 | 673 |     return shiftCount; | 
 | 674 |  | 
 | 675 | } | 
 | 676 |  | 
 | 677 | /* | 
 | 678 | ------------------------------------------------------------------------------- | 
 | 679 | Returns the number of leading 0 bits before the most-significant 1 bit | 
 | 680 | of `a'.  If `a' is zero, 64 is returned. | 
 | 681 | ------------------------------------------------------------------------------- | 
 | 682 | */ | 
 | 683 | static int8 countLeadingZeros64( bits64 a ) | 
 | 684 | { | 
 | 685 |     int8 shiftCount; | 
 | 686 |  | 
 | 687 |     shiftCount = 0; | 
 | 688 |     if ( a < ( (bits64) 1 )<<32 ) { | 
 | 689 |         shiftCount += 32; | 
 | 690 |     } | 
 | 691 |     else { | 
 | 692 |         a >>= 32; | 
 | 693 |     } | 
 | 694 |     shiftCount += countLeadingZeros32( a ); | 
 | 695 |     return shiftCount; | 
 | 696 |  | 
 | 697 | } | 
 | 698 |  | 
 | 699 | /* | 
 | 700 | ------------------------------------------------------------------------------- | 
 | 701 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' | 
 | 702 | is equal to the 128-bit value formed by concatenating `b0' and `b1'. | 
 | 703 | Otherwise, returns 0. | 
 | 704 | ------------------------------------------------------------------------------- | 
 | 705 | */ | 
 | 706 | INLINE flag eq128( bits64 a0, bits64 a1, bits64 b0, bits64 b1 ) | 
 | 707 | { | 
 | 708 |  | 
 | 709 |     return ( a0 == b0 ) && ( a1 == b1 ); | 
 | 710 |  | 
 | 711 | } | 
 | 712 |  | 
 | 713 | /* | 
 | 714 | ------------------------------------------------------------------------------- | 
 | 715 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less | 
 | 716 | than or equal to the 128-bit value formed by concatenating `b0' and `b1'. | 
 | 717 | Otherwise, returns 0. | 
 | 718 | ------------------------------------------------------------------------------- | 
 | 719 | */ | 
 | 720 | INLINE flag le128( bits64 a0, bits64 a1, bits64 b0, bits64 b1 ) | 
 | 721 | { | 
 | 722 |  | 
 | 723 |     return ( a0 < b0 ) || ( ( a0 == b0 ) && ( a1 <= b1 ) ); | 
 | 724 |  | 
 | 725 | } | 
 | 726 |  | 
 | 727 | /* | 
 | 728 | ------------------------------------------------------------------------------- | 
 | 729 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less | 
 | 730 | than the 128-bit value formed by concatenating `b0' and `b1'.  Otherwise, | 
 | 731 | returns 0. | 
 | 732 | ------------------------------------------------------------------------------- | 
 | 733 | */ | 
 | 734 | INLINE flag lt128( bits64 a0, bits64 a1, bits64 b0, bits64 b1 ) | 
 | 735 | { | 
 | 736 |  | 
 | 737 |     return ( a0 < b0 ) || ( ( a0 == b0 ) && ( a1 < b1 ) ); | 
 | 738 |  | 
 | 739 | } | 
 | 740 |  | 
 | 741 | /* | 
 | 742 | ------------------------------------------------------------------------------- | 
 | 743 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is | 
 | 744 | not equal to the 128-bit value formed by concatenating `b0' and `b1'. | 
 | 745 | Otherwise, returns 0. | 
 | 746 | ------------------------------------------------------------------------------- | 
 | 747 | */ | 
 | 748 | INLINE flag ne128( bits64 a0, bits64 a1, bits64 b0, bits64 b1 ) | 
 | 749 | { | 
 | 750 |  | 
 | 751 |     return ( a0 != b0 ) || ( a1 != b1 ); | 
 | 752 |  | 
 | 753 | } | 
 | 754 |  |