1 /* 2 =============================================================================== 3 4 This C source file is part of the SoftFloat IEC/IEEE Floating-point 5 Arithmetic Package, Release 2. 6 7 Written by John R. Hauser. This work was made possible in part by the 8 International Computer Science Institute, located at Suite 600, 1947 Center 9 Street, Berkeley, California 94704. Funding was partially provided by the 10 National Science Foundation under grant MIP-9311980. The original version 11 of this code was written as part of a project to build a fixed-point vector 12 processor in collaboration with the University of California at Berkeley, 13 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 14 is available through the web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 15 arithmetic/softfloat.html'. 16 17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 18 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 19 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 20 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 21 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 22 23 Derivative works are acceptable, even for commercial purposes, so long as 24 (1) they include prominent notice that the work is derivative, and (2) they 25 include prominent notice akin to these three paragraphs for those parts of 26 this code that are retained. 27 28 =============================================================================== 29 */ 30 31 #include <asm/div64.h> 32 33 #include "fpa11.h" 34 //#include "milieu.h" 35 //#include "softfloat.h" 36 37 /* 38 ------------------------------------------------------------------------------- 39 Floating-point rounding mode, extended double-precision rounding precision, 40 and exception flags. 41 ------------------------------------------------------------------------------- 42 */ 43 int8 float_rounding_mode = float_round_nearest_even; 44 int8 floatx80_rounding_precision = 80; 45 int8 float_exception_flags; 46 47 /* 48 ------------------------------------------------------------------------------- 49 Primitive arithmetic functions, including multi-word arithmetic, and 50 division and square root approximations. (Can be specialized to target if 51 desired.) 52 ------------------------------------------------------------------------------- 53 */ 54 #include "softfloat-macros" 55 56 /* 57 ------------------------------------------------------------------------------- 58 Functions and definitions to determine: (1) whether tininess for underflow 59 is detected before or after rounding by default, (2) what (if anything) 60 happens when exceptions are raised, (3) how signaling NaNs are distinguished 61 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 62 are propagated from function inputs to output. These details are target- 63 specific. 64 ------------------------------------------------------------------------------- 65 */ 66 #include "softfloat-specialize" 67 68 /* 69 ------------------------------------------------------------------------------- 70 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 71 and 7, and returns the properly rounded 32-bit integer corresponding to the 72 input. If `zSign' is nonzero, the input is negated before being converted 73 to an integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point 74 input is simply rounded to an integer, with the inexact exception raised if 75 the input cannot be represented exactly as an integer. If the fixed-point 76 input is too large, however, the invalid exception is raised and the largest 77 positive or negative integer is returned. 78 ------------------------------------------------------------------------------- 79 */ 80 static int32 roundAndPackInt32( flag zSign, bits64 absZ ) 81 { 82 int8 roundingMode; 83 flag roundNearestEven; 84 int8 roundIncrement, roundBits; 85 int32 z; 86 87 roundingMode = float_rounding_mode; 88 roundNearestEven = ( roundingMode == float_round_nearest_even ); 89 roundIncrement = 0x40; 90 if ( ! roundNearestEven ) { 91 if ( roundingMode == float_round_to_zero ) { 92 roundIncrement = 0; 93 } 94 else { 95 roundIncrement = 0x7F; 96 if ( zSign ) { 97 if ( roundingMode == float_round_up ) roundIncrement = 0; 98 } 99 else { 100 if ( roundingMode == float_round_down ) roundIncrement = 0; 101 } 102 } 103 } 104 roundBits = absZ & 0x7F; 105 absZ = ( absZ + roundIncrement )>>7; 106 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 107 z = absZ; 108 if ( zSign ) z = - z; 109 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 110 float_exception_flags |= float_flag_invalid; 111 return zSign ? 0x80000000 : 0x7FFFFFFF; 112 } 113 if ( roundBits ) float_exception_flags |= float_flag_inexact; 114 return z; 115 116 } 117 118 /* 119 ------------------------------------------------------------------------------- 120 Returns the fraction bits of the single-precision floating-point value `a'. 121 ------------------------------------------------------------------------------- 122 */ 123 INLINE bits32 extractFloat32Frac( float32 a ) 124 { 125 126 return a & 0x007FFFFF; 127 128 } 129 130 /* 131 ------------------------------------------------------------------------------- 132 Returns the exponent bits of the single-precision floating-point value `a'. 133 ------------------------------------------------------------------------------- 134 */ 135 INLINE int16 extractFloat32Exp( float32 a ) 136 { 137 138 return ( a>>23 ) & 0xFF; 139 140 } 141 142 /* 143 ------------------------------------------------------------------------------- 144 Returns the sign bit of the single-precision floating-point value `a'. 145 ------------------------------------------------------------------------------- 146 */ 147 #if 0 /* in softfloat.h */ 148 INLINE flag extractFloat32Sign( float32 a ) 149 { 150 151 return a>>31; 152 153 } 154 #endif 155 156 /* 157 ------------------------------------------------------------------------------- 158 Normalizes the subnormal single-precision floating-point value represented 159 by the denormalized significand `aSig'. The normalized exponent and 160 significand are stored at the locations pointed to by `zExpPtr' and 161 `zSigPtr', respectively. 162 ------------------------------------------------------------------------------- 163 */ 164 static void 165 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr ) 166 { 167 int8 shiftCount; 168 169 shiftCount = countLeadingZeros32( aSig ) - 8; 170 *zSigPtr = aSig<<shiftCount; 171 *zExpPtr = 1 - shiftCount; 172 173 } 174 175 /* 176 ------------------------------------------------------------------------------- 177 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 178 single-precision floating-point value, returning the result. After being 179 shifted into the proper positions, the three fields are simply added 180 together to form the result. This means that any integer portion of `zSig' 181 will be added into the exponent. Since a properly normalized significand 182 will have an integer portion equal to 1, the `zExp' input should be 1 less 183 than the desired result exponent whenever `zSig' is a complete, normalized 184 significand. 185 ------------------------------------------------------------------------------- 186 */ 187 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig ) 188 { 189 #if 0 190 float32 f; 191 __asm__("@ packFloat32 \n\ 192 mov %0, %1, asl #31 \n\ 193 orr %0, %2, asl #23 \n\ 194 orr %0, %3" 195 : /* no outputs */ 196 : "g" (f), "g" (zSign), "g" (zExp), "g" (zSig) 197 : "cc"); 198 return f; 199 #else 200 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig; 201 #endif 202 } 203 204 /* 205 ------------------------------------------------------------------------------- 206 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 207 and significand `zSig', and returns the proper single-precision floating- 208 point value corresponding to the abstract input. Ordinarily, the abstract 209 value is simply rounded and packed into the single-precision format, with 210 the inexact exception raised if the abstract input cannot be represented 211 exactly. If the abstract value is too large, however, the overflow and 212 inexact exceptions are raised and an infinity or maximal finite value is 213 returned. If the abstract value is too small, the input value is rounded to 214 a subnormal number, and the underflow and inexact exceptions are raised if 215 the abstract input cannot be represented exactly as a subnormal single- 216 precision floating-point number. 217 The input significand `zSig' has its binary point between bits 30 218 and 29, which is 7 bits to the left of the usual location. This shifted 219 significand must be normalized or smaller. If `zSig' is not normalized, 220 `zExp' must be 0; in that case, the result returned is a subnormal number, 221 and it must not require rounding. In the usual case that `zSig' is 222 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 223 The handling of underflow and overflow follows the IEC/IEEE Standard for 224 Binary Floating-point Arithmetic. 225 ------------------------------------------------------------------------------- 226 */ 227 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) 228 { 229 int8 roundingMode; 230 flag roundNearestEven; 231 int8 roundIncrement, roundBits; 232 flag isTiny; 233 234 roundingMode = float_rounding_mode; 235 roundNearestEven = ( roundingMode == float_round_nearest_even ); 236 roundIncrement = 0x40; 237 if ( ! roundNearestEven ) { 238 if ( roundingMode == float_round_to_zero ) { 239 roundIncrement = 0; 240 } 241 else { 242 roundIncrement = 0x7F; 243 if ( zSign ) { 244 if ( roundingMode == float_round_up ) roundIncrement = 0; 245 } 246 else { 247 if ( roundingMode == float_round_down ) roundIncrement = 0; 248 } 249 } 250 } 251 roundBits = zSig & 0x7F; 252 if ( 0xFD <= (bits16) zExp ) { 253 if ( ( 0xFD < zExp ) 254 || ( ( zExp == 0xFD ) 255 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) ) 256 ) { 257 float_raise( float_flag_overflow | float_flag_inexact ); 258 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 ); 259 } 260 if ( zExp < 0 ) { 261 isTiny = 262 ( float_detect_tininess == float_tininess_before_rounding ) 263 || ( zExp < -1 ) 264 || ( zSig + roundIncrement < 0x80000000 ); 265 shift32RightJamming( zSig, - zExp, &zSig ); 266 zExp = 0; 267 roundBits = zSig & 0x7F; 268 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 269 } 270 } 271 if ( roundBits ) float_exception_flags |= float_flag_inexact; 272 zSig = ( zSig + roundIncrement )>>7; 273 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 274 if ( zSig == 0 ) zExp = 0; 275 return packFloat32( zSign, zExp, zSig ); 276 277 } 278 279 /* 280 ------------------------------------------------------------------------------- 281 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 282 and significand `zSig', and returns the proper single-precision floating- 283 point value corresponding to the abstract input. This routine is just like 284 `roundAndPackFloat32' except that `zSig' does not have to be normalized in 285 any way. In all cases, `zExp' must be 1 less than the ``true'' floating- 286 point exponent. 287 ------------------------------------------------------------------------------- 288 */ 289 static float32 290 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) 291 { 292 int8 shiftCount; 293 294 shiftCount = countLeadingZeros32( zSig ) - 1; 295 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount ); 296 297 } 298 299 /* 300 ------------------------------------------------------------------------------- 301 Returns the fraction bits of the double-precision floating-point value `a'. 302 ------------------------------------------------------------------------------- 303 */ 304 INLINE bits64 extractFloat64Frac( float64 a ) 305 { 306 307 return a & LIT64( 0x000FFFFFFFFFFFFF ); 308 309 } 310 311 /* 312 ------------------------------------------------------------------------------- 313 Returns the exponent bits of the double-precision floating-point value `a'. 314 ------------------------------------------------------------------------------- 315 */ 316 INLINE int16 extractFloat64Exp( float64 a ) 317 { 318 319 return ( a>>52 ) & 0x7FF; 320 321 } 322 323 /* 324 ------------------------------------------------------------------------------- 325 Returns the sign bit of the double-precision floating-point value `a'. 326 ------------------------------------------------------------------------------- 327 */ 328 #if 0 /* in softfloat.h */ 329 INLINE flag extractFloat64Sign( float64 a ) 330 { 331 332 return a>>63; 333 334 } 335 #endif 336 337 /* 338 ------------------------------------------------------------------------------- 339 Normalizes the subnormal double-precision floating-point value represented 340 by the denormalized significand `aSig'. The normalized exponent and 341 significand are stored at the locations pointed to by `zExpPtr' and 342 `zSigPtr', respectively. 343 ------------------------------------------------------------------------------- 344 */ 345 static void 346 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr ) 347 { 348 int8 shiftCount; 349 350 shiftCount = countLeadingZeros64( aSig ) - 11; 351 *zSigPtr = aSig<<shiftCount; 352 *zExpPtr = 1 - shiftCount; 353 354 } 355 356 /* 357 ------------------------------------------------------------------------------- 358 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 359 double-precision floating-point value, returning the result. After being 360 shifted into the proper positions, the three fields are simply added 361 together to form the result. This means that any integer portion of `zSig' 362 will be added into the exponent. Since a properly normalized significand 363 will have an integer portion equal to 1, the `zExp' input should be 1 less 364 than the desired result exponent whenever `zSig' is a complete, normalized 365 significand. 366 ------------------------------------------------------------------------------- 367 */ 368 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig ) 369 { 370 371 return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig; 372 373 } 374 375 /* 376 ------------------------------------------------------------------------------- 377 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 378 and significand `zSig', and returns the proper double-precision floating- 379 point value corresponding to the abstract input. Ordinarily, the abstract 380 value is simply rounded and packed into the double-precision format, with 381 the inexact exception raised if the abstract input cannot be represented 382 exactly. If the abstract value is too large, however, the overflow and 383 inexact exceptions are raised and an infinity or maximal finite value is 384 returned. If the abstract value is too small, the input value is rounded to 385 a subnormal number, and the underflow and inexact exceptions are raised if 386 the abstract input cannot be represented exactly as a subnormal double- 387 precision floating-point number. 388 The input significand `zSig' has its binary point between bits 62 389 and 61, which is 10 bits to the left of the usual location. This shifted 390 significand must be normalized or smaller. If `zSig' is not normalized, 391 `zExp' must be 0; in that case, the result returned is a subnormal number, 392 and it must not require rounding. In the usual case that `zSig' is 393 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 394 The handling of underflow and overflow follows the IEC/IEEE Standard for 395 Binary Floating-point Arithmetic. 396 ------------------------------------------------------------------------------- 397 */ 398 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig ) 399 { 400 int8 roundingMode; 401 flag roundNearestEven; 402 int16 roundIncrement, roundBits; 403 flag isTiny; 404 405 roundingMode = float_rounding_mode; 406 roundNearestEven = ( roundingMode == float_round_nearest_even ); 407 roundIncrement = 0x200; 408 if ( ! roundNearestEven ) { 409 if ( roundingMode == float_round_to_zero ) { 410 roundIncrement = 0; 411 } 412 else { 413 roundIncrement = 0x3FF; 414 if ( zSign ) { 415 if ( roundingMode == float_round_up ) roundIncrement = 0; 416 } 417 else { 418 if ( roundingMode == float_round_down ) roundIncrement = 0; 419 } 420 } 421 } 422 roundBits = zSig & 0x3FF; 423 if ( 0x7FD <= (bits16) zExp ) { 424 if ( ( 0x7FD < zExp ) 425 || ( ( zExp == 0x7FD ) 426 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) ) 427 ) { 428 //register int lr = __builtin_return_address(0); 429 //printk("roundAndPackFloat64 called from 0x%08x\n",lr); 430 float_raise( float_flag_overflow | float_flag_inexact ); 431 return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 ); 432 } 433 if ( zExp < 0 ) { 434 isTiny = 435 ( float_detect_tininess == float_tininess_before_rounding ) 436 || ( zExp < -1 ) 437 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 438 shift64RightJamming( zSig, - zExp, &zSig ); 439 zExp = 0; 440 roundBits = zSig & 0x3FF; 441 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 442 } 443 } 444 if ( roundBits ) float_exception_flags |= float_flag_inexact; 445 zSig = ( zSig + roundIncrement )>>10; 446 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 447 if ( zSig == 0 ) zExp = 0; 448 return packFloat64( zSign, zExp, zSig ); 449 450 } 451 452 /* 453 ------------------------------------------------------------------------------- 454 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 455 and significand `zSig', and returns the proper double-precision floating- 456 point value corresponding to the abstract input. This routine is just like 457 `roundAndPackFloat64' except that `zSig' does not have to be normalized in 458 any way. In all cases, `zExp' must be 1 less than the ``true'' floating- 459 point exponent. 460 ------------------------------------------------------------------------------- 461 */ 462 static float64 463 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig ) 464 { 465 int8 shiftCount; 466 467 shiftCount = countLeadingZeros64( zSig ) - 1; 468 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount ); 469 470 } 471 472 #ifdef FLOATX80 473 474 /* 475 ------------------------------------------------------------------------------- 476 Returns the fraction bits of the extended double-precision floating-point 477 value `a'. 478 ------------------------------------------------------------------------------- 479 */ 480 INLINE bits64 extractFloatx80Frac( floatx80 a ) 481 { 482 483 return a.low; 484 485 } 486 487 /* 488 ------------------------------------------------------------------------------- 489 Returns the exponent bits of the extended double-precision floating-point 490 value `a'. 491 ------------------------------------------------------------------------------- 492 */ 493 INLINE int32 extractFloatx80Exp( floatx80 a ) 494 { 495 496 return a.high & 0x7FFF; 497 498 } 499 500 /* 501 ------------------------------------------------------------------------------- 502 Returns the sign bit of the extended double-precision floating-point value 503 `a'. 504 ------------------------------------------------------------------------------- 505 */ 506 INLINE flag extractFloatx80Sign( floatx80 a ) 507 { 508 509 return a.high>>15; 510 511 } 512 513 /* 514 ------------------------------------------------------------------------------- 515 Normalizes the subnormal extended double-precision floating-point value 516 represented by the denormalized significand `aSig'. The normalized exponent 517 and significand are stored at the locations pointed to by `zExpPtr' and 518 `zSigPtr', respectively. 519 ------------------------------------------------------------------------------- 520 */ 521 static void 522 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr ) 523 { 524 int8 shiftCount; 525 526 shiftCount = countLeadingZeros64( aSig ); 527 *zSigPtr = aSig<<shiftCount; 528 *zExpPtr = 1 - shiftCount; 529 530 } 531 532 /* 533 ------------------------------------------------------------------------------- 534 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an 535 extended double-precision floating-point value, returning the result. 536 ------------------------------------------------------------------------------- 537 */ 538 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig ) 539 { 540 floatx80 z; 541 542 z.low = zSig; 543 z.high = ( ( (bits16) zSign )<<15 ) + zExp; 544 return z; 545 546 } 547 548 /* 549 ------------------------------------------------------------------------------- 550 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 551 and extended significand formed by the concatenation of `zSig0' and `zSig1', 552 and returns the proper extended double-precision floating-point value 553 corresponding to the abstract input. Ordinarily, the abstract value is 554 rounded and packed into the extended double-precision format, with the 555 inexact exception raised if the abstract input cannot be represented 556 exactly. If the abstract value is too large, however, the overflow and 557 inexact exceptions are raised and an infinity or maximal finite value is 558 returned. If the abstract value is too small, the input value is rounded to 559 a subnormal number, and the underflow and inexact exceptions are raised if 560 the abstract input cannot be represented exactly as a subnormal extended 561 double-precision floating-point number. 562 If `roundingPrecision' is 32 or 64, the result is rounded to the same 563 number of bits as single or double precision, respectively. Otherwise, the 564 result is rounded to the full precision of the extended double-precision 565 format. 566 The input significand must be normalized or smaller. If the input 567 significand is not normalized, `zExp' must be 0; in that case, the result 568 returned is a subnormal number, and it must not require rounding. The 569 handling of underflow and overflow follows the IEC/IEEE Standard for Binary 570 Floating-point Arithmetic. 571 ------------------------------------------------------------------------------- 572 */ 573 static floatx80 574 roundAndPackFloatx80( 575 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 576 ) 577 { 578 int8 roundingMode; 579 flag roundNearestEven, increment, isTiny; 580 int64 roundIncrement, roundMask, roundBits; 581 582 roundingMode = float_rounding_mode; 583 roundNearestEven = ( roundingMode == float_round_nearest_even ); 584 if ( roundingPrecision == 80 ) goto precision80; 585 if ( roundingPrecision == 64 ) { 586 roundIncrement = LIT64( 0x0000000000000400 ); 587 roundMask = LIT64( 0x00000000000007FF ); 588 } 589 else if ( roundingPrecision == 32 ) { 590 roundIncrement = LIT64( 0x0000008000000000 ); 591 roundMask = LIT64( 0x000000FFFFFFFFFF ); 592 } 593 else { 594 goto precision80; 595 } 596 zSig0 |= ( zSig1 != 0 ); 597 if ( ! roundNearestEven ) { 598 if ( roundingMode == float_round_to_zero ) { 599 roundIncrement = 0; 600 } 601 else { 602 roundIncrement = roundMask; 603 if ( zSign ) { 604 if ( roundingMode == float_round_up ) roundIncrement = 0; 605 } 606 else { 607 if ( roundingMode == float_round_down ) roundIncrement = 0; 608 } 609 } 610 } 611 roundBits = zSig0 & roundMask; 612 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 613 if ( ( 0x7FFE < zExp ) 614 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 615 ) { 616 goto overflow; 617 } 618 if ( zExp <= 0 ) { 619 isTiny = 620 ( float_detect_tininess == float_tininess_before_rounding ) 621 || ( zExp < 0 ) 622 || ( zSig0 <= zSig0 + roundIncrement ); 623 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 624 zExp = 0; 625 roundBits = zSig0 & roundMask; 626 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 627 if ( roundBits ) float_exception_flags |= float_flag_inexact; 628 zSig0 += roundIncrement; 629 if ( (sbits64) zSig0 < 0 ) zExp = 1; 630 roundIncrement = roundMask + 1; 631 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 632 roundMask |= roundIncrement; 633 } 634 zSig0 &= ~ roundMask; 635 return packFloatx80( zSign, zExp, zSig0 ); 636 } 637 } 638 if ( roundBits ) float_exception_flags |= float_flag_inexact; 639 zSig0 += roundIncrement; 640 if ( zSig0 < roundIncrement ) { 641 ++zExp; 642 zSig0 = LIT64( 0x8000000000000000 ); 643 } 644 roundIncrement = roundMask + 1; 645 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 646 roundMask |= roundIncrement; 647 } 648 zSig0 &= ~ roundMask; 649 if ( zSig0 == 0 ) zExp = 0; 650 return packFloatx80( zSign, zExp, zSig0 ); 651 precision80: 652 increment = ( (sbits64) zSig1 < 0 ); 653 if ( ! roundNearestEven ) { 654 if ( roundingMode == float_round_to_zero ) { 655 increment = 0; 656 } 657 else { 658 if ( zSign ) { 659 increment = ( roundingMode == float_round_down ) && zSig1; 660 } 661 else { 662 increment = ( roundingMode == float_round_up ) && zSig1; 663 } 664 } 665 } 666 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 667 if ( ( 0x7FFE < zExp ) 668 || ( ( zExp == 0x7FFE ) 669 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 670 && increment 671 ) 672 ) { 673 roundMask = 0; 674 overflow: 675 float_raise( float_flag_overflow | float_flag_inexact ); 676 if ( ( roundingMode == float_round_to_zero ) 677 || ( zSign && ( roundingMode == float_round_up ) ) 678 || ( ! zSign && ( roundingMode == float_round_down ) ) 679 ) { 680 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 681 } 682 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 683 } 684 if ( zExp <= 0 ) { 685 isTiny = 686 ( float_detect_tininess == float_tininess_before_rounding ) 687 || ( zExp < 0 ) 688 || ! increment 689 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 690 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 691 zExp = 0; 692 if ( isTiny && zSig1 ) float_raise( float_flag_underflow ); 693 if ( zSig1 ) float_exception_flags |= float_flag_inexact; 694 if ( roundNearestEven ) { 695 increment = ( (sbits64) zSig1 < 0 ); 696 } 697 else { 698 if ( zSign ) { 699 increment = ( roundingMode == float_round_down ) && zSig1; 700 } 701 else { 702 increment = ( roundingMode == float_round_up ) && zSig1; 703 } 704 } 705 if ( increment ) { 706 ++zSig0; 707 zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven ); 708 if ( (sbits64) zSig0 < 0 ) zExp = 1; 709 } 710 return packFloatx80( zSign, zExp, zSig0 ); 711 } 712 } 713 if ( zSig1 ) float_exception_flags |= float_flag_inexact; 714 if ( increment ) { 715 ++zSig0; 716 if ( zSig0 == 0 ) { 717 ++zExp; 718 zSig0 = LIT64( 0x8000000000000000 ); 719 } 720 else { 721 zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven ); 722 } 723 } 724 else { 725 if ( zSig0 == 0 ) zExp = 0; 726 } 727 728 return packFloatx80( zSign, zExp, zSig0 ); 729 } 730 731 /* 732 ------------------------------------------------------------------------------- 733 Takes an abstract floating-point value having sign `zSign', exponent 734 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 735 and returns the proper extended double-precision floating-point value 736 corresponding to the abstract input. This routine is just like 737 `roundAndPackFloatx80' except that the input significand does not have to be 738 normalized. 739 ------------------------------------------------------------------------------- 740 */ 741 static floatx80 742 normalizeRoundAndPackFloatx80( 743 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 744 ) 745 { 746 int8 shiftCount; 747 748 if ( zSig0 == 0 ) { 749 zSig0 = zSig1; 750 zSig1 = 0; 751 zExp -= 64; 752 } 753 shiftCount = countLeadingZeros64( zSig0 ); 754 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 755 zExp -= shiftCount; 756 return 757 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 ); 758 759 } 760 761 #endif 762 763 /* 764 ------------------------------------------------------------------------------- 765 Returns the result of converting the 32-bit two's complement integer `a' to 766 the single-precision floating-point format. The conversion is performed 767 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic. 768 ------------------------------------------------------------------------------- 769 */ 770 float32 int32_to_float32( int32 a ) 771 { 772 flag zSign; 773 774 if ( a == 0 ) return 0; 775 if ( a == 0x80000000 ) return packFloat32( 1, 0x9E, 0 ); 776 zSign = ( a < 0 ); 777 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a ); 778 779 } 780 781 /* 782 ------------------------------------------------------------------------------- 783 Returns the result of converting the 32-bit two's complement integer `a' to 784 the double-precision floating-point format. The conversion is performed 785 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic. 786 ------------------------------------------------------------------------------- 787 */ 788 float64 int32_to_float64( int32 a ) 789 { 790 flag aSign; 791 uint32 absA; 792 int8 shiftCount; 793 bits64 zSig; 794 795 if ( a == 0 ) return 0; 796 aSign = ( a < 0 ); 797 absA = aSign ? - a : a; 798 shiftCount = countLeadingZeros32( absA ) + 21; 799 zSig = absA; 800 return packFloat64( aSign, 0x432 - shiftCount, zSig<<shiftCount ); 801 802 } 803 804 #ifdef FLOATX80 805 806 /* 807 ------------------------------------------------------------------------------- 808 Returns the result of converting the 32-bit two's complement integer `a' 809 to the extended double-precision floating-point format. The conversion 810 is performed according to the IEC/IEEE Standard for Binary Floating-point 811 Arithmetic. 812 ------------------------------------------------------------------------------- 813 */ 814 floatx80 int32_to_floatx80( int32 a ) 815 { 816 flag zSign; 817 uint32 absA; 818 int8 shiftCount; 819 bits64 zSig; 820 821 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 822 zSign = ( a < 0 ); 823 absA = zSign ? - a : a; 824 shiftCount = countLeadingZeros32( absA ) + 32; 825 zSig = absA; 826 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 827 828 } 829 830 #endif 831 832 /* 833 ------------------------------------------------------------------------------- 834 Returns the result of converting the single-precision floating-point value 835 `a' to the 32-bit two's complement integer format. The conversion is 836 performed according to the IEC/IEEE Standard for Binary Floating-point 837 Arithmetic---which means in particular that the conversion is rounded 838 according to the current rounding mode. If `a' is a NaN, the largest 839 positive integer is returned. Otherwise, if the conversion overflows, the 840 largest integer with the same sign as `a' is returned. 841 ------------------------------------------------------------------------------- 842 */ 843 int32 float32_to_int32( float32 a ) 844 { 845 flag aSign; 846 int16 aExp, shiftCount; 847 bits32 aSig; 848 bits64 zSig; 849 850 aSig = extractFloat32Frac( a ); 851 aExp = extractFloat32Exp( a ); 852 aSign = extractFloat32Sign( a ); 853 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 854 if ( aExp ) aSig |= 0x00800000; 855 shiftCount = 0xAF - aExp; 856 zSig = aSig; 857 zSig <<= 32; 858 if ( 0 < shiftCount ) shift64RightJamming( zSig, shiftCount, &zSig ); 859 return roundAndPackInt32( aSign, zSig ); 860 861 } 862 863 /* 864 ------------------------------------------------------------------------------- 865 Returns the result of converting the single-precision floating-point value 866 `a' to the 32-bit two's complement integer format. The conversion is 867 performed according to the IEC/IEEE Standard for Binary Floating-point 868 Arithmetic, except that the conversion is always rounded toward zero. If 869 `a' is a NaN, the largest positive integer is returned. Otherwise, if the 870 conversion overflows, the largest integer with the same sign as `a' is 871 returned. 872 ------------------------------------------------------------------------------- 873 */ 874 int32 float32_to_int32_round_to_zero( float32 a ) 875 { 876 flag aSign; 877 int16 aExp, shiftCount; 878 bits32 aSig; 879 int32 z; 880 881 aSig = extractFloat32Frac( a ); 882 aExp = extractFloat32Exp( a ); 883 aSign = extractFloat32Sign( a ); 884 shiftCount = aExp - 0x9E; 885 if ( 0 <= shiftCount ) { 886 if ( a == 0xCF000000 ) return 0x80000000; 887 float_raise( float_flag_invalid ); 888 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF; 889 return 0x80000000; 890 } 891 else if ( aExp <= 0x7E ) { 892 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact; 893 return 0; 894 } 895 aSig = ( aSig | 0x00800000 )<<8; 896 z = aSig>>( - shiftCount ); 897 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) { 898 float_exception_flags |= float_flag_inexact; 899 } 900 return aSign ? - z : z; 901 902 } 903 904 /* 905 ------------------------------------------------------------------------------- 906 Returns the result of converting the single-precision floating-point value 907 `a' to the double-precision floating-point format. The conversion is 908 performed according to the IEC/IEEE Standard for Binary Floating-point 909 Arithmetic. 910 ------------------------------------------------------------------------------- 911 */ 912 float64 float32_to_float64( float32 a ) 913 { 914 flag aSign; 915 int16 aExp; 916 bits32 aSig; 917 918 aSig = extractFloat32Frac( a ); 919 aExp = extractFloat32Exp( a ); 920 aSign = extractFloat32Sign( a ); 921 if ( aExp == 0xFF ) { 922 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) ); 923 return packFloat64( aSign, 0x7FF, 0 ); 924 } 925 if ( aExp == 0 ) { 926 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); 927 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 928 --aExp; 929 } 930 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 ); 931 932 } 933 934 #ifdef FLOATX80 935 936 /* 937 ------------------------------------------------------------------------------- 938 Returns the result of converting the single-precision floating-point value 939 `a' to the extended double-precision floating-point format. The conversion 940 is performed according to the IEC/IEEE Standard for Binary Floating-point 941 Arithmetic. 942 ------------------------------------------------------------------------------- 943 */ 944 floatx80 float32_to_floatx80( float32 a ) 945 { 946 flag aSign; 947 int16 aExp; 948 bits32 aSig; 949 950 aSig = extractFloat32Frac( a ); 951 aExp = extractFloat32Exp( a ); 952 aSign = extractFloat32Sign( a ); 953 if ( aExp == 0xFF ) { 954 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) ); 955 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 956 } 957 if ( aExp == 0 ) { 958 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 959 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 960 } 961 aSig |= 0x00800000; 962 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 ); 963 964 } 965 966 #endif 967 968 /* 969 ------------------------------------------------------------------------------- 970 Rounds the single-precision floating-point value `a' to an integer, and 971 returns the result as a single-precision floating-point value. The 972 operation is performed according to the IEC/IEEE Standard for Binary 973 Floating-point Arithmetic. 974 ------------------------------------------------------------------------------- 975 */ 976 float32 float32_round_to_int( float32 a ) 977 { 978 flag aSign; 979 int16 aExp; 980 bits32 lastBitMask, roundBitsMask; 981 int8 roundingMode; 982 float32 z; 983 984 aExp = extractFloat32Exp( a ); 985 if ( 0x96 <= aExp ) { 986 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) { 987 return propagateFloat32NaN( a, a ); 988 } 989 return a; 990 } 991 if ( aExp <= 0x7E ) { 992 if ( (bits32) ( a<<1 ) == 0 ) return a; 993 float_exception_flags |= float_flag_inexact; 994 aSign = extractFloat32Sign( a ); 995 switch ( float_rounding_mode ) { 996 case float_round_nearest_even: 997 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) { 998 return packFloat32( aSign, 0x7F, 0 ); 999 } 1000 break; 1001 case float_round_down: 1002 return aSign ? 0xBF800000 : 0; 1003 case float_round_up: 1004 return aSign ? 0x80000000 : 0x3F800000; 1005 } 1006 return packFloat32( aSign, 0, 0 ); 1007 } 1008 lastBitMask = 1; 1009 lastBitMask <<= 0x96 - aExp; 1010 roundBitsMask = lastBitMask - 1; 1011 z = a; 1012 roundingMode = float_rounding_mode; 1013 if ( roundingMode == float_round_nearest_even ) { 1014 z += lastBitMask>>1; 1015 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 1016 } 1017 else if ( roundingMode != float_round_to_zero ) { 1018 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) { 1019 z += roundBitsMask; 1020 } 1021 } 1022 z &= ~ roundBitsMask; 1023 if ( z != a ) float_exception_flags |= float_flag_inexact; 1024 return z; 1025 1026 } 1027 1028 /* 1029 ------------------------------------------------------------------------------- 1030 Returns the result of adding the absolute values of the single-precision 1031 floating-point values `a' and `b'. If `zSign' is true, the sum is negated 1032 before being returned. `zSign' is ignored if the result is a NaN. The 1033 addition is performed according to the IEC/IEEE Standard for Binary 1034 Floating-point Arithmetic. 1035 ------------------------------------------------------------------------------- 1036 */ 1037 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign ) 1038 { 1039 int16 aExp, bExp, zExp; 1040 bits32 aSig, bSig, zSig; 1041 int16 expDiff; 1042 1043 aSig = extractFloat32Frac( a ); 1044 aExp = extractFloat32Exp( a ); 1045 bSig = extractFloat32Frac( b ); 1046 bExp = extractFloat32Exp( b ); 1047 expDiff = aExp - bExp; 1048 aSig <<= 6; 1049 bSig <<= 6; 1050 if ( 0 < expDiff ) { 1051 if ( aExp == 0xFF ) { 1052 if ( aSig ) return propagateFloat32NaN( a, b ); 1053 return a; 1054 } 1055 if ( bExp == 0 ) { 1056 --expDiff; 1057 } 1058 else { 1059 bSig |= 0x20000000; 1060 } 1061 shift32RightJamming( bSig, expDiff, &bSig ); 1062 zExp = aExp; 1063 } 1064 else if ( expDiff < 0 ) { 1065 if ( bExp == 0xFF ) { 1066 if ( bSig ) return propagateFloat32NaN( a, b ); 1067 return packFloat32( zSign, 0xFF, 0 ); 1068 } 1069 if ( aExp == 0 ) { 1070 ++expDiff; 1071 } 1072 else { 1073 aSig |= 0x20000000; 1074 } 1075 shift32RightJamming( aSig, - expDiff, &aSig ); 1076 zExp = bExp; 1077 } 1078 else { 1079 if ( aExp == 0xFF ) { 1080 if ( aSig | bSig ) return propagateFloat32NaN( a, b ); 1081 return a; 1082 } 1083 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 ); 1084 zSig = 0x40000000 + aSig + bSig; 1085 zExp = aExp; 1086 goto roundAndPack; 1087 } 1088 aSig |= 0x20000000; 1089 zSig = ( aSig + bSig )<<1; 1090 --zExp; 1091 if ( (sbits32) zSig < 0 ) { 1092 zSig = aSig + bSig; 1093 ++zExp; 1094 } 1095 roundAndPack: 1096 return roundAndPackFloat32( zSign, zExp, zSig ); 1097 1098 } 1099 1100 /* 1101 ------------------------------------------------------------------------------- 1102 Returns the result of subtracting the absolute values of the single- 1103 precision floating-point values `a' and `b'. If `zSign' is true, the 1104 difference is negated before being returned. `zSign' is ignored if the 1105 result is a NaN. The subtraction is performed according to the IEC/IEEE 1106 Standard for Binary Floating-point Arithmetic. 1107 ------------------------------------------------------------------------------- 1108 */ 1109 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign ) 1110 { 1111 int16 aExp, bExp, zExp; 1112 bits32 aSig, bSig, zSig; 1113 int16 expDiff; 1114 1115 aSig = extractFloat32Frac( a ); 1116 aExp = extractFloat32Exp( a ); 1117 bSig = extractFloat32Frac( b ); 1118 bExp = extractFloat32Exp( b ); 1119 expDiff = aExp - bExp; 1120 aSig <<= 7; 1121 bSig <<= 7; 1122 if ( 0 < expDiff ) goto aExpBigger; 1123 if ( expDiff < 0 ) goto bExpBigger; 1124 if ( aExp == 0xFF ) { 1125 if ( aSig | bSig ) return propagateFloat32NaN( a, b ); 1126 float_raise( float_flag_invalid ); 1127 return float32_default_nan; 1128 } 1129 if ( aExp == 0 ) { 1130 aExp = 1; 1131 bExp = 1; 1132 } 1133 if ( bSig < aSig ) goto aBigger; 1134 if ( aSig < bSig ) goto bBigger; 1135 return packFloat32( float_rounding_mode == float_round_down, 0, 0 ); 1136 bExpBigger: 1137 if ( bExp == 0xFF ) { 1138 if ( bSig ) return propagateFloat32NaN( a, b ); 1139 return packFloat32( zSign ^ 1, 0xFF, 0 ); 1140 } 1141 if ( aExp == 0 ) { 1142 ++expDiff; 1143 } 1144 else { 1145 aSig |= 0x40000000; 1146 } 1147 shift32RightJamming( aSig, - expDiff, &aSig ); 1148 bSig |= 0x40000000; 1149 bBigger: 1150 zSig = bSig - aSig; 1151 zExp = bExp; 1152 zSign ^= 1; 1153 goto normalizeRoundAndPack; 1154 aExpBigger: 1155 if ( aExp == 0xFF ) { 1156 if ( aSig ) return propagateFloat32NaN( a, b ); 1157 return a; 1158 } 1159 if ( bExp == 0 ) { 1160 --expDiff; 1161 } 1162 else { 1163 bSig |= 0x40000000; 1164 } 1165 shift32RightJamming( bSig, expDiff, &bSig ); 1166 aSig |= 0x40000000; 1167 aBigger: 1168 zSig = aSig - bSig; 1169 zExp = aExp; 1170 normalizeRoundAndPack: 1171 --zExp; 1172 return normalizeRoundAndPackFloat32( zSign, zExp, zSig ); 1173 1174 } 1175 1176 /* 1177 ------------------------------------------------------------------------------- 1178 Returns the result of adding the single-precision floating-point values `a' 1179 and `b'. The operation is performed according to the IEC/IEEE Standard for 1180 Binary Floating-point Arithmetic. 1181 ------------------------------------------------------------------------------- 1182 */ 1183 float32 float32_add( float32 a, float32 b ) 1184 { 1185 flag aSign, bSign; 1186 1187 aSign = extractFloat32Sign( a ); 1188 bSign = extractFloat32Sign( b ); 1189 if ( aSign == bSign ) { 1190 return addFloat32Sigs( a, b, aSign ); 1191 } 1192 else { 1193 return subFloat32Sigs( a, b, aSign ); 1194 } 1195 1196 } 1197 1198 /* 1199 ------------------------------------------------------------------------------- 1200 Returns the result of subtracting the single-precision floating-point values 1201 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1202 for Binary Floating-point Arithmetic. 1203 ------------------------------------------------------------------------------- 1204 */ 1205 float32 float32_sub( float32 a, float32 b ) 1206 { 1207 flag aSign, bSign; 1208 1209 aSign = extractFloat32Sign( a ); 1210 bSign = extractFloat32Sign( b ); 1211 if ( aSign == bSign ) { 1212 return subFloat32Sigs( a, b, aSign ); 1213 } 1214 else { 1215 return addFloat32Sigs( a, b, aSign ); 1216 } 1217 1218 } 1219 1220 /* 1221 ------------------------------------------------------------------------------- 1222 Returns the result of multiplying the single-precision floating-point values 1223 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1224 for Binary Floating-point Arithmetic. 1225 ------------------------------------------------------------------------------- 1226 */ 1227 float32 float32_mul( float32 a, float32 b ) 1228 { 1229 flag aSign, bSign, zSign; 1230 int16 aExp, bExp, zExp; 1231 bits32 aSig, bSig; 1232 bits64 zSig64; 1233 bits32 zSig; 1234 1235 aSig = extractFloat32Frac( a ); 1236 aExp = extractFloat32Exp( a ); 1237 aSign = extractFloat32Sign( a ); 1238 bSig = extractFloat32Frac( b ); 1239 bExp = extractFloat32Exp( b ); 1240 bSign = extractFloat32Sign( b ); 1241 zSign = aSign ^ bSign; 1242 if ( aExp == 0xFF ) { 1243 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 1244 return propagateFloat32NaN( a, b ); 1245 } 1246 if ( ( bExp | bSig ) == 0 ) { 1247 float_raise( float_flag_invalid ); 1248 return float32_default_nan; 1249 } 1250 return packFloat32( zSign, 0xFF, 0 ); 1251 } 1252 if ( bExp == 0xFF ) { 1253 if ( bSig ) return propagateFloat32NaN( a, b ); 1254 if ( ( aExp | aSig ) == 0 ) { 1255 float_raise( float_flag_invalid ); 1256 return float32_default_nan; 1257 } 1258 return packFloat32( zSign, 0xFF, 0 ); 1259 } 1260 if ( aExp == 0 ) { 1261 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 1262 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1263 } 1264 if ( bExp == 0 ) { 1265 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 ); 1266 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1267 } 1268 zExp = aExp + bExp - 0x7F; 1269 aSig = ( aSig | 0x00800000 )<<7; 1270 bSig = ( bSig | 0x00800000 )<<8; 1271 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 ); 1272 zSig = zSig64; 1273 if ( 0 <= (sbits32) ( zSig<<1 ) ) { 1274 zSig <<= 1; 1275 --zExp; 1276 } 1277 return roundAndPackFloat32( zSign, zExp, zSig ); 1278 1279 } 1280 1281 /* 1282 ------------------------------------------------------------------------------- 1283 Returns the result of dividing the single-precision floating-point value `a' 1284 by the corresponding value `b'. The operation is performed according to the 1285 IEC/IEEE Standard for Binary Floating-point Arithmetic. 1286 ------------------------------------------------------------------------------- 1287 */ 1288 float32 float32_div( float32 a, float32 b ) 1289 { 1290 flag aSign, bSign, zSign; 1291 int16 aExp, bExp, zExp; 1292 bits32 aSig, bSig, zSig; 1293 1294 aSig = extractFloat32Frac( a ); 1295 aExp = extractFloat32Exp( a ); 1296 aSign = extractFloat32Sign( a ); 1297 bSig = extractFloat32Frac( b ); 1298 bExp = extractFloat32Exp( b ); 1299 bSign = extractFloat32Sign( b ); 1300 zSign = aSign ^ bSign; 1301 if ( aExp == 0xFF ) { 1302 if ( aSig ) return propagateFloat32NaN( a, b ); 1303 if ( bExp == 0xFF ) { 1304 if ( bSig ) return propagateFloat32NaN( a, b ); 1305 float_raise( float_flag_invalid ); 1306 return float32_default_nan; 1307 } 1308 return packFloat32( zSign, 0xFF, 0 ); 1309 } 1310 if ( bExp == 0xFF ) { 1311 if ( bSig ) return propagateFloat32NaN( a, b ); 1312 return packFloat32( zSign, 0, 0 ); 1313 } 1314 if ( bExp == 0 ) { 1315 if ( bSig == 0 ) { 1316 if ( ( aExp | aSig ) == 0 ) { 1317 float_raise( float_flag_invalid ); 1318 return float32_default_nan; 1319 } 1320 float_raise( float_flag_divbyzero ); 1321 return packFloat32( zSign, 0xFF, 0 ); 1322 } 1323 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1324 } 1325 if ( aExp == 0 ) { 1326 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 1327 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1328 } 1329 zExp = aExp - bExp + 0x7D; 1330 aSig = ( aSig | 0x00800000 )<<7; 1331 bSig = ( bSig | 0x00800000 )<<8; 1332 if ( bSig <= ( aSig + aSig ) ) { 1333 aSig >>= 1; 1334 ++zExp; 1335 } 1336 { 1337 bits64 tmp = ( (bits64) aSig )<<32; 1338 do_div( tmp, bSig ); 1339 zSig = tmp; 1340 } 1341 if ( ( zSig & 0x3F ) == 0 ) { 1342 zSig |= ( ( (bits64) bSig ) * zSig != ( (bits64) aSig )<<32 ); 1343 } 1344 return roundAndPackFloat32( zSign, zExp, zSig ); 1345 1346 } 1347 1348 /* 1349 ------------------------------------------------------------------------------- 1350 Returns the remainder of the single-precision floating-point value `a' 1351 with respect to the corresponding value `b'. The operation is performed 1352 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic. 1353 ------------------------------------------------------------------------------- 1354 */ 1355 float32 float32_rem( float32 a, float32 b ) 1356 { 1357 flag aSign, bSign, zSign; 1358 int16 aExp, bExp, expDiff; 1359 bits32 aSig, bSig; 1360 bits32 q; 1361 bits64 aSig64, bSig64, q64; 1362 bits32 alternateASig; 1363 sbits32 sigMean; 1364 1365 aSig = extractFloat32Frac( a ); 1366 aExp = extractFloat32Exp( a ); 1367 aSign = extractFloat32Sign( a ); 1368 bSig = extractFloat32Frac( b ); 1369 bExp = extractFloat32Exp( b ); 1370 bSign = extractFloat32Sign( b ); 1371 if ( aExp == 0xFF ) { 1372 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 1373 return propagateFloat32NaN( a, b ); 1374 } 1375 float_raise( float_flag_invalid ); 1376 return float32_default_nan; 1377 } 1378 if ( bExp == 0xFF ) { 1379 if ( bSig ) return propagateFloat32NaN( a, b ); 1380 return a; 1381 } 1382 if ( bExp == 0 ) { 1383 if ( bSig == 0 ) { 1384 float_raise( float_flag_invalid ); 1385 return float32_default_nan; 1386 } 1387 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1388 } 1389 if ( aExp == 0 ) { 1390 if ( aSig == 0 ) return a; 1391 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1392 } 1393 expDiff = aExp - bExp; 1394 aSig |= 0x00800000; 1395 bSig |= 0x00800000; 1396 if ( expDiff < 32 ) { 1397 aSig <<= 8; 1398 bSig <<= 8; 1399 if ( expDiff < 0 ) { 1400 if ( expDiff < -1 ) return a; 1401 aSig >>= 1; 1402 } 1403 q = ( bSig <= aSig ); 1404 if ( q ) aSig -= bSig; 1405 if ( 0 < expDiff ) { 1406 bits64 tmp = ( (bits64) aSig )<<32; 1407 do_div( tmp, bSig ); 1408 q = tmp; 1409 q >>= 32 - expDiff; 1410 bSig >>= 2; 1411 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 1412 } 1413 else { 1414 aSig >>= 2; 1415 bSig >>= 2; 1416 } 1417 } 1418 else { 1419 if ( bSig <= aSig ) aSig -= bSig; 1420 aSig64 = ( (bits64) aSig )<<40; 1421 bSig64 = ( (bits64) bSig )<<40; 1422 expDiff -= 64; 1423 while ( 0 < expDiff ) { 1424 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 1425 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 1426 aSig64 = - ( ( bSig * q64 )<<38 ); 1427 expDiff -= 62; 1428 } 1429 expDiff += 64; 1430 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 1431 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 1432 q = q64>>( 64 - expDiff ); 1433 bSig <<= 6; 1434 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 1435 } 1436 do { 1437 alternateASig = aSig; 1438 ++q; 1439 aSig -= bSig; 1440 } while ( 0 <= (sbits32) aSig ); 1441 sigMean = aSig + alternateASig; 1442 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 1443 aSig = alternateASig; 1444 } 1445 zSign = ( (sbits32) aSig < 0 ); 1446 if ( zSign ) aSig = - aSig; 1447 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig ); 1448 1449 } 1450 1451 /* 1452 ------------------------------------------------------------------------------- 1453 Returns the square root of the single-precision floating-point value `a'. 1454 The operation is performed according to the IEC/IEEE Standard for Binary 1455 Floating-point Arithmetic. 1456 ------------------------------------------------------------------------------- 1457 */ 1458 float32 float32_sqrt( float32 a ) 1459 { 1460 flag aSign; 1461 int16 aExp, zExp; 1462 bits32 aSig, zSig; 1463 bits64 rem, term; 1464 1465 aSig = extractFloat32Frac( a ); 1466 aExp = extractFloat32Exp( a ); 1467 aSign = extractFloat32Sign( a ); 1468 if ( aExp == 0xFF ) { 1469 if ( aSig ) return propagateFloat32NaN( a, 0 ); 1470 if ( ! aSign ) return a; 1471 float_raise( float_flag_invalid ); 1472 return float32_default_nan; 1473 } 1474 if ( aSign ) { 1475 if ( ( aExp | aSig ) == 0 ) return a; 1476 float_raise( float_flag_invalid ); 1477 return float32_default_nan; 1478 } 1479 if ( aExp == 0 ) { 1480 if ( aSig == 0 ) return 0; 1481 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1482 } 1483 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E; 1484 aSig = ( aSig | 0x00800000 )<<8; 1485 zSig = estimateSqrt32( aExp, aSig ) + 2; 1486 if ( ( zSig & 0x7F ) <= 5 ) { 1487 if ( zSig < 2 ) { 1488 zSig = 0xFFFFFFFF; 1489 } 1490 else { 1491 aSig >>= aExp & 1; 1492 term = ( (bits64) zSig ) * zSig; 1493 rem = ( ( (bits64) aSig )<<32 ) - term; 1494 while ( (sbits64) rem < 0 ) { 1495 --zSig; 1496 rem += ( ( (bits64) zSig )<<1 ) | 1; 1497 } 1498 zSig |= ( rem != 0 ); 1499 } 1500 } 1501 shift32RightJamming( zSig, 1, &zSig ); 1502 return roundAndPackFloat32( 0, zExp, zSig ); 1503 1504 } 1505 1506 /* 1507 ------------------------------------------------------------------------------- 1508 Returns 1 if the single-precision floating-point value `a' is equal to the 1509 corresponding value `b', and 0 otherwise. The comparison is performed 1510 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic. 1511 ------------------------------------------------------------------------------- 1512 */ 1513 flag float32_eq( float32 a, float32 b ) 1514 { 1515 1516 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 1517 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 1518 ) { 1519 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 1520 float_raise( float_flag_invalid ); 1521 } 1522 return 0; 1523 } 1524 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); 1525 1526 } 1527 1528 /* 1529 ------------------------------------------------------------------------------- 1530 Returns 1 if the single-precision floating-point value `a' is less than or 1531 equal to the corresponding value `b', and 0 otherwise. The comparison is 1532 performed according to the IEC/IEEE Standard for Binary Floating-point 1533 Arithmetic. 1534 ------------------------------------------------------------------------------- 1535 */ 1536 flag float32_le( float32 a, float32 b ) 1537 { 1538 flag aSign, bSign; 1539 1540 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 1541 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 1542 ) { 1543 float_raise( float_flag_invalid ); 1544 return 0; 1545 } 1546 aSign = extractFloat32Sign( a ); 1547 bSign = extractFloat32Sign( b ); 1548 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); 1549 return ( a == b ) || ( aSign ^ ( a < b ) ); 1550 1551 } 1552 1553 /* 1554 ------------------------------------------------------------------------------- 1555 Returns 1 if the single-precision floating-point value `a' is less than 1556 the corresponding value `b', and 0 otherwise. The comparison is performed 1557 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic. 1558 ------------------------------------------------------------------------------- 1559 */ 1560 flag float32_lt( float32 a, float32 b ) 1561 { 1562 flag aSign, bSign; 1563 1564 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 1565 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 1566 ) { 1567 float_raise( float_flag_invalid ); 1568 return 0; 1569 } 1570 aSign = extractFloat32Sign( a ); 1571 bSign = extractFloat32Sign( b ); 1572 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); 1573 return ( a != b ) && ( aSign ^ ( a < b ) ); 1574 1575 } 1576 1577 /* 1578 ------------------------------------------------------------------------------- 1579 Returns 1 if the single-precision floating-point value `a' is equal to the 1580 corresponding value `b', and 0 otherwise. The invalid exception is raised 1581 if either operand is a NaN. Otherwise, the comparison is performed 1582 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic. 1583 ------------------------------------------------------------------------------- 1584 */ 1585 flag float32_eq_signaling( float32 a, float32 b ) 1586 { 1587 1588 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 1589 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 1590 ) { 1591 float_raise( float_flag_invalid ); 1592 return 0; 1593 } 1594 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); 1595 1596 } 1597 1598 /* 1599 ------------------------------------------------------------------------------- 1600 Returns 1 if the single-precision floating-point value `a' is less than or 1601 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 1602 cause an exception. Otherwise, the comparison is performed according to the 1603 IEC/IEEE Standard for Binary Floating-point Arithmetic. 1604 ------------------------------------------------------------------------------- 1605 */ 1606 flag float32_le_quiet( float32 a, float32 b ) 1607 { 1608 flag aSign, bSign; 1609 //int16 aExp, bExp; 1610 1611 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 1612 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 1613 ) { 1614 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 1615 float_raise( float_flag_invalid ); 1616 } 1617 return 0; 1618 } 1619 aSign = extractFloat32Sign( a ); 1620 bSign = extractFloat32Sign( b ); 1621 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); 1622 return ( a == b ) || ( aSign ^ ( a < b ) ); 1623 1624 } 1625 1626 /* 1627 ------------------------------------------------------------------------------- 1628 Returns 1 if the single-precision floating-point value `a' is less than 1629 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 1630 exception. Otherwise, the comparison is performed according to the IEC/IEEE 1631 Standard for Binary Floating-point Arithmetic. 1632 ------------------------------------------------------------------------------- 1633 */ 1634 flag float32_lt_quiet( float32 a, float32 b ) 1635 { 1636 flag aSign, bSign; 1637 1638 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 1639 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 1640 ) { 1641 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 1642 float_raise( float_flag_invalid ); 1643 } 1644 return 0; 1645 } 1646 aSign = extractFloat32Sign( a ); 1647 bSign = extractFloat32Sign( b ); 1648 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); 1649 return ( a != b ) && ( aSign ^ ( a < b ) ); 1650 1651 } 1652 1653 /* 1654 ------------------------------------------------------------------------------- 1655 Returns the result of converting the double-precision floating-point value 1656 `a' to the 32-bit two's complement integer format. The conversion is 1657 performed according to the IEC/IEEE Standard for Binary Floating-point 1658 Arithmetic---which means in particular that the conversion is rounded 1659 according to the current rounding mode. If `a' is a NaN, the largest 1660 positive integer is returned. Otherwise, if the conversion overflows, the 1661 largest integer with the same sign as `a' is returned. 1662 ------------------------------------------------------------------------------- 1663 */ 1664 int32 float64_to_int32( float64 a ) 1665 { 1666 flag aSign; 1667 int16 aExp, shiftCount; 1668 bits64 aSig; 1669 1670 aSig = extractFloat64Frac( a ); 1671 aExp = extractFloat64Exp( a ); 1672 aSign = extractFloat64Sign( a ); 1673 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 1674 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 1675 shiftCount = 0x42C - aExp; 1676 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig ); 1677 return roundAndPackInt32( aSign, aSig ); 1678 1679 } 1680 1681 /* 1682 ------------------------------------------------------------------------------- 1683 Returns the result of converting the double-precision floating-point value 1684 `a' to the 32-bit two's complement integer format. The conversion is 1685 performed according to the IEC/IEEE Standard for Binary Floating-point 1686 Arithmetic, except that the conversion is always rounded toward zero. If 1687 `a' is a NaN, the largest positive integer is returned. Otherwise, if the 1688 conversion overflows, the largest integer with the same sign as `a' is 1689 returned. 1690 ------------------------------------------------------------------------------- 1691 */ 1692 int32 float64_to_int32_round_to_zero( float64 a ) 1693 { 1694 flag aSign; 1695 int16 aExp, shiftCount; 1696 bits64 aSig, savedASig; 1697 int32 z; 1698 1699 aSig = extractFloat64Frac( a ); 1700 aExp = extractFloat64Exp( a ); 1701 aSign = extractFloat64Sign( a ); 1702 shiftCount = 0x433 - aExp; 1703 if ( shiftCount < 21 ) { 1704 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 1705 goto invalid; 1706 } 1707 else if ( 52 < shiftCount ) { 1708 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact; 1709 return 0; 1710 } 1711 aSig |= LIT64( 0x0010000000000000 ); 1712 savedASig = aSig; 1713 aSig >>= shiftCount; 1714 z = aSig; 1715 if ( aSign ) z = - z; 1716 if ( ( z < 0 ) ^ aSign ) { 1717 invalid: 1718 float_exception_flags |= float_flag_invalid; 1719 return aSign ? 0x80000000 : 0x7FFFFFFF; 1720 } 1721 if ( ( aSig<<shiftCount ) != savedASig ) { 1722 float_exception_flags |= float_flag_inexact; 1723 } 1724 return z; 1725 1726 } 1727 1728 /* 1729 ------------------------------------------------------------------------------- 1730 Returns the result of converting the double-precision floating-point value 1731 `a' to the 32-bit two's complement unsigned integer format. The conversion 1732 is performed according to the IEC/IEEE Standard for Binary Floating-point 1733 Arithmetic---which means in particular that the conversion is rounded 1734 according to the current rounding mode. If `a' is a NaN, the largest 1735 positive integer is returned. Otherwise, if the conversion overflows, the 1736 largest positive integer is returned. 1737 ------------------------------------------------------------------------------- 1738 */ 1739 int32 float64_to_uint32( float64 a ) 1740 { 1741 flag aSign; 1742 int16 aExp, shiftCount; 1743 bits64 aSig; 1744 1745 aSig = extractFloat64Frac( a ); 1746 aExp = extractFloat64Exp( a ); 1747 aSign = 0; //extractFloat64Sign( a ); 1748 //if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 1749 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 1750 shiftCount = 0x42C - aExp; 1751 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig ); 1752 return roundAndPackInt32( aSign, aSig ); 1753 } 1754 1755 /* 1756 ------------------------------------------------------------------------------- 1757 Returns the result of converting the double-precision floating-point value 1758 `a' to the 32-bit two's complement integer format. The conversion is 1759 performed according to the IEC/IEEE Standard for Binary Floating-point 1760 Arithmetic, except that the conversion is always rounded toward zero. If 1761 `a' is a NaN, the largest positive integer is returned. Otherwise, if the 1762 conversion overflows, the largest positive integer is returned. 1763 ------------------------------------------------------------------------------- 1764 */ 1765 int32 float64_to_uint32_round_to_zero( float64 a ) 1766 { 1767 flag aSign; 1768 int16 aExp, shiftCount; 1769 bits64 aSig, savedASig; 1770 int32 z; 1771 1772 aSig = extractFloat64Frac( a ); 1773 aExp = extractFloat64Exp( a ); 1774 aSign = extractFloat64Sign( a ); 1775 shiftCount = 0x433 - aExp; 1776 if ( shiftCount < 21 ) { 1777 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 1778 goto invalid; 1779 } 1780 else if ( 52 < shiftCount ) { 1781 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact; 1782 return 0; 1783 } 1784 aSig |= LIT64( 0x0010000000000000 ); 1785 savedASig = aSig; 1786 aSig >>= shiftCount; 1787 z = aSig; 1788 if ( aSign ) z = - z; 1789 if ( ( z < 0 ) ^ aSign ) { 1790 invalid: 1791 float_exception_flags |= float_flag_invalid; 1792 return aSign ? 0x80000000 : 0x7FFFFFFF; 1793 } 1794 if ( ( aSig<<shiftCount ) != savedASig ) { 1795 float_exception_flags |= float_flag_inexact; 1796 } 1797 return z; 1798 } 1799 1800 /* 1801 ------------------------------------------------------------------------------- 1802 Returns the result of converting the double-precision floating-point value 1803 `a' to the single-precision floating-point format. The conversion is 1804 performed according to the IEC/IEEE Standard for Binary Floating-point 1805 Arithmetic. 1806 ------------------------------------------------------------------------------- 1807 */ 1808 float32 float64_to_float32( float64 a ) 1809 { 1810 flag aSign; 1811 int16 aExp; 1812 bits64 aSig; 1813 bits32 zSig; 1814 1815 aSig = extractFloat64Frac( a ); 1816 aExp = extractFloat64Exp( a ); 1817 aSign = extractFloat64Sign( a ); 1818 if ( aExp == 0x7FF ) { 1819 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) ); 1820 return packFloat32( aSign, 0xFF, 0 ); 1821 } 1822 shift64RightJamming( aSig, 22, &aSig ); 1823 zSig = aSig; 1824 if ( aExp || zSig ) { 1825 zSig |= 0x40000000; 1826 aExp -= 0x381; 1827 } 1828 return roundAndPackFloat32( aSign, aExp, zSig ); 1829 1830 } 1831 1832 #ifdef FLOATX80 1833 1834 /* 1835 ------------------------------------------------------------------------------- 1836 Returns the result of converting the double-precision floating-point value 1837 `a' to the extended double-precision floating-point format. The conversion 1838 is performed according to the IEC/IEEE Standard for Binary Floating-point 1839 Arithmetic. 1840 ------------------------------------------------------------------------------- 1841 */ 1842 floatx80 float64_to_floatx80( float64 a ) 1843 { 1844 flag aSign; 1845 int16 aExp; 1846 bits64 aSig; 1847 1848 aSig = extractFloat64Frac( a ); 1849 aExp = extractFloat64Exp( a ); 1850 aSign = extractFloat64Sign( a ); 1851 if ( aExp == 0x7FF ) { 1852 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) ); 1853 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 1854 } 1855 if ( aExp == 0 ) { 1856 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 1857 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 1858 } 1859 return 1860 packFloatx80( 1861 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 1862 1863 } 1864 1865 #endif 1866 1867 /* 1868 ------------------------------------------------------------------------------- 1869 Rounds the double-precision floating-point value `a' to an integer, and 1870 returns the result as a double-precision floating-point value. The 1871 operation is performed according to the IEC/IEEE Standard for Binary 1872 Floating-point Arithmetic. 1873 ------------------------------------------------------------------------------- 1874 */ 1875 float64 float64_round_to_int( float64 a ) 1876 { 1877 flag aSign; 1878 int16 aExp; 1879 bits64 lastBitMask, roundBitsMask; 1880 int8 roundingMode; 1881 float64 z; 1882 1883 aExp = extractFloat64Exp( a ); 1884 if ( 0x433 <= aExp ) { 1885 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) { 1886 return propagateFloat64NaN( a, a ); 1887 } 1888 return a; 1889 } 1890 if ( aExp <= 0x3FE ) { 1891 if ( (bits64) ( a<<1 ) == 0 ) return a; 1892 float_exception_flags |= float_flag_inexact; 1893 aSign = extractFloat64Sign( a ); 1894 switch ( float_rounding_mode ) { 1895 case float_round_nearest_even: 1896 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) { 1897 return packFloat64( aSign, 0x3FF, 0 ); 1898 } 1899 break; 1900 case float_round_down: 1901 return aSign ? LIT64( 0xBFF0000000000000 ) : 0; 1902 case float_round_up: 1903 return 1904 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ); 1905 } 1906 return packFloat64( aSign, 0, 0 ); 1907 } 1908 lastBitMask = 1; 1909 lastBitMask <<= 0x433 - aExp; 1910 roundBitsMask = lastBitMask - 1; 1911 z = a; 1912 roundingMode = float_rounding_mode; 1913 if ( roundingMode == float_round_nearest_even ) { 1914 z += lastBitMask>>1; 1915 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 1916 } 1917 else if ( roundingMode != float_round_to_zero ) { 1918 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) { 1919 z += roundBitsMask; 1920 } 1921 } 1922 z &= ~ roundBitsMask; 1923 if ( z != a ) float_exception_flags |= float_flag_inexact; 1924 return z; 1925 1926 } 1927 1928 /* 1929 ------------------------------------------------------------------------------- 1930 Returns the result of adding the absolute values of the double-precision 1931 floating-point values `a' and `b'. If `zSign' is true, the sum is negated 1932 before being returned. `zSign' is ignored if the result is a NaN. The 1933 addition is performed according to the IEC/IEEE Standard for Binary 1934 Floating-point Arithmetic. 1935 ------------------------------------------------------------------------------- 1936 */ 1937 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign ) 1938 { 1939 int16 aExp, bExp, zExp; 1940 bits64 aSig, bSig, zSig; 1941 int16 expDiff; 1942 1943 aSig = extractFloat64Frac( a ); 1944 aExp = extractFloat64Exp( a ); 1945 bSig = extractFloat64Frac( b ); 1946 bExp = extractFloat64Exp( b ); 1947 expDiff = aExp - bExp; 1948 aSig <<= 9; 1949 bSig <<= 9; 1950 if ( 0 < expDiff ) { 1951 if ( aExp == 0x7FF ) { 1952 if ( aSig ) return propagateFloat64NaN( a, b ); 1953 return a; 1954 } 1955 if ( bExp == 0 ) { 1956 --expDiff; 1957 } 1958 else { 1959 bSig |= LIT64( 0x2000000000000000 ); 1960 } 1961 shift64RightJamming( bSig, expDiff, &bSig ); 1962 zExp = aExp; 1963 } 1964 else if ( expDiff < 0 ) { 1965 if ( bExp == 0x7FF ) { 1966 if ( bSig ) return propagateFloat64NaN( a, b ); 1967 return packFloat64( zSign, 0x7FF, 0 ); 1968 } 1969 if ( aExp == 0 ) { 1970 ++expDiff; 1971 } 1972 else { 1973 aSig |= LIT64( 0x2000000000000000 ); 1974 } 1975 shift64RightJamming( aSig, - expDiff, &aSig ); 1976 zExp = bExp; 1977 } 1978 else { 1979 if ( aExp == 0x7FF ) { 1980 if ( aSig | bSig ) return propagateFloat64NaN( a, b ); 1981 return a; 1982 } 1983 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 ); 1984 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig; 1985 zExp = aExp; 1986 goto roundAndPack; 1987 } 1988 aSig |= LIT64( 0x2000000000000000 ); 1989 zSig = ( aSig + bSig )<<1; 1990 --zExp; 1991 if ( (sbits64) zSig < 0 ) { 1992 zSig = aSig + bSig; 1993 ++zExp; 1994 } 1995 roundAndPack: 1996 return roundAndPackFloat64( zSign, zExp, zSig ); 1997 1998 } 1999 2000 /* 2001 ------------------------------------------------------------------------------- 2002 Returns the result of subtracting the absolute values of the double- 2003 precision floating-point values `a' and `b'. If `zSign' is true, the 2004 difference is negated before being returned. `zSign' is ignored if the 2005 result is a NaN. The subtraction is performed according to the IEC/IEEE 2006 Standard for Binary Floating-point Arithmetic. 2007 ------------------------------------------------------------------------------- 2008 */ 2009 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign ) 2010 { 2011 int16 aExp, bExp, zExp; 2012 bits64 aSig, bSig, zSig; 2013 int16 expDiff; 2014 2015 aSig = extractFloat64Frac( a ); 2016 aExp = extractFloat64Exp( a ); 2017 bSig = extractFloat64Frac( b ); 2018 bExp = extractFloat64Exp( b ); 2019 expDiff = aExp - bExp; 2020 aSig <<= 10; 2021 bSig <<= 10; 2022 if ( 0 < expDiff ) goto aExpBigger; 2023 if ( expDiff < 0 ) goto bExpBigger; 2024 if ( aExp == 0x7FF ) { 2025 if ( aSig | bSig ) return propagateFloat64NaN( a, b ); 2026 float_raise( float_flag_invalid ); 2027 return float64_default_nan; 2028 } 2029 if ( aExp == 0 ) { 2030 aExp = 1; 2031 bExp = 1; 2032 } 2033 if ( bSig < aSig ) goto aBigger; 2034 if ( aSig < bSig ) goto bBigger; 2035 return packFloat64( float_rounding_mode == float_round_down, 0, 0 ); 2036 bExpBigger: 2037 if ( bExp == 0x7FF ) { 2038 if ( bSig ) return propagateFloat64NaN( a, b ); 2039 return packFloat64( zSign ^ 1, 0x7FF, 0 ); 2040 } 2041 if ( aExp == 0 ) { 2042 ++expDiff; 2043 } 2044 else { 2045 aSig |= LIT64( 0x4000000000000000 ); 2046 } 2047 shift64RightJamming( aSig, - expDiff, &aSig ); 2048 bSig |= LIT64( 0x4000000000000000 ); 2049 bBigger: 2050 zSig = bSig - aSig; 2051 zExp = bExp; 2052 zSign ^= 1; 2053 goto normalizeRoundAndPack; 2054 aExpBigger: 2055 if ( aExp == 0x7FF ) { 2056 if ( aSig ) return propagateFloat64NaN( a, b ); 2057 return a; 2058 } 2059 if ( bExp == 0 ) { 2060 --expDiff; 2061 } 2062 else { 2063 bSig |= LIT64( 0x4000000000000000 ); 2064 } 2065 shift64RightJamming( bSig, expDiff, &bSig ); 2066 aSig |= LIT64( 0x4000000000000000 ); 2067 aBigger: 2068 zSig = aSig - bSig; 2069 zExp = aExp; 2070 normalizeRoundAndPack: 2071 --zExp; 2072 return normalizeRoundAndPackFloat64( zSign, zExp, zSig ); 2073 2074 } 2075 2076 /* 2077 ------------------------------------------------------------------------------- 2078 Returns the result of adding the double-precision floating-point values `a' 2079 and `b'. The operation is performed according to the IEC/IEEE Standard for 2080 Binary Floating-point Arithmetic. 2081 ------------------------------------------------------------------------------- 2082 */ 2083 float64 float64_add( float64 a, float64 b ) 2084 { 2085 flag aSign, bSign; 2086 2087 aSign = extractFloat64Sign( a ); 2088 bSign = extractFloat64Sign( b ); 2089 if ( aSign == bSign ) { 2090 return addFloat64Sigs( a, b, aSign ); 2091 } 2092 else { 2093 return subFloat64Sigs( a, b, aSign ); 2094 } 2095 2096 } 2097 2098 /* 2099 ------------------------------------------------------------------------------- 2100 Returns the result of subtracting the double-precision floating-point values 2101 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2102 for Binary Floating-point Arithmetic. 2103 ------------------------------------------------------------------------------- 2104 */ 2105 float64 float64_sub( float64 a, float64 b ) 2106 { 2107 flag aSign, bSign; 2108 2109 aSign = extractFloat64Sign( a ); 2110 bSign = extractFloat64Sign( b ); 2111 if ( aSign == bSign ) { 2112 return subFloat64Sigs( a, b, aSign ); 2113 } 2114 else { 2115 return addFloat64Sigs( a, b, aSign ); 2116 } 2117 2118 } 2119 2120 /* 2121 ------------------------------------------------------------------------------- 2122 Returns the result of multiplying the double-precision floating-point values 2123 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2124 for Binary Floating-point Arithmetic. 2125 ------------------------------------------------------------------------------- 2126 */ 2127 float64 float64_mul( float64 a, float64 b ) 2128 { 2129 flag aSign, bSign, zSign; 2130 int16 aExp, bExp, zExp; 2131 bits64 aSig, bSig, zSig0, zSig1; 2132 2133 aSig = extractFloat64Frac( a ); 2134 aExp = extractFloat64Exp( a ); 2135 aSign = extractFloat64Sign( a ); 2136 bSig = extractFloat64Frac( b ); 2137 bExp = extractFloat64Exp( b ); 2138 bSign = extractFloat64Sign( b ); 2139 zSign = aSign ^ bSign; 2140 if ( aExp == 0x7FF ) { 2141 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 2142 return propagateFloat64NaN( a, b ); 2143 } 2144 if ( ( bExp | bSig ) == 0 ) { 2145 float_raise( float_flag_invalid ); 2146 return float64_default_nan; 2147 } 2148 return packFloat64( zSign, 0x7FF, 0 ); 2149 } 2150 if ( bExp == 0x7FF ) { 2151 if ( bSig ) return propagateFloat64NaN( a, b ); 2152 if ( ( aExp | aSig ) == 0 ) { 2153 float_raise( float_flag_invalid ); 2154 return float64_default_nan; 2155 } 2156 return packFloat64( zSign, 0x7FF, 0 ); 2157 } 2158 if ( aExp == 0 ) { 2159 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 2160 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2161 } 2162 if ( bExp == 0 ) { 2163 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 ); 2164 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2165 } 2166 zExp = aExp + bExp - 0x3FF; 2167 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 2168 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2169 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 2170 zSig0 |= ( zSig1 != 0 ); 2171 if ( 0 <= (sbits64) ( zSig0<<1 ) ) { 2172 zSig0 <<= 1; 2173 --zExp; 2174 } 2175 return roundAndPackFloat64( zSign, zExp, zSig0 ); 2176 2177 } 2178 2179 /* 2180 ------------------------------------------------------------------------------- 2181 Returns the result of dividing the double-precision floating-point value `a' 2182 by the corresponding value `b'. The operation is performed according to 2183 the IEC/IEEE Standard for Binary Floating-point Arithmetic. 2184 ------------------------------------------------------------------------------- 2185 */ 2186 float64 float64_div( float64 a, float64 b ) 2187 { 2188 flag aSign, bSign, zSign; 2189 int16 aExp, bExp, zExp; 2190 bits64 aSig, bSig, zSig; 2191 bits64 rem0, rem1; 2192 bits64 term0, term1; 2193 2194 aSig = extractFloat64Frac( a ); 2195 aExp = extractFloat64Exp( a ); 2196 aSign = extractFloat64Sign( a ); 2197 bSig = extractFloat64Frac( b ); 2198 bExp = extractFloat64Exp( b ); 2199 bSign = extractFloat64Sign( b ); 2200 zSign = aSign ^ bSign; 2201 if ( aExp == 0x7FF ) { 2202 if ( aSig ) return propagateFloat64NaN( a, b ); 2203 if ( bExp == 0x7FF ) { 2204 if ( bSig ) return propagateFloat64NaN( a, b ); 2205 float_raise( float_flag_invalid ); 2206 return float64_default_nan; 2207 } 2208 return packFloat64( zSign, 0x7FF, 0 ); 2209 } 2210 if ( bExp == 0x7FF ) { 2211 if ( bSig ) return propagateFloat64NaN( a, b ); 2212 return packFloat64( zSign, 0, 0 ); 2213 } 2214 if ( bExp == 0 ) { 2215 if ( bSig == 0 ) { 2216 if ( ( aExp | aSig ) == 0 ) { 2217 float_raise( float_flag_invalid ); 2218 return float64_default_nan; 2219 } 2220 float_raise( float_flag_divbyzero ); 2221 return packFloat64( zSign, 0x7FF, 0 ); 2222 } 2223 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2224 } 2225 if ( aExp == 0 ) { 2226 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 2227 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2228 } 2229 zExp = aExp - bExp + 0x3FD; 2230 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 2231 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2232 if ( bSig <= ( aSig + aSig ) ) { 2233 aSig >>= 1; 2234 ++zExp; 2235 } 2236 zSig = estimateDiv128To64( aSig, 0, bSig ); 2237 if ( ( zSig & 0x1FF ) <= 2 ) { 2238 mul64To128( bSig, zSig, &term0, &term1 ); 2239 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 2240 while ( (sbits64) rem0 < 0 ) { 2241 --zSig; 2242 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 2243 } 2244 zSig |= ( rem1 != 0 ); 2245 } 2246 return roundAndPackFloat64( zSign, zExp, zSig ); 2247 2248 } 2249 2250 /* 2251 ------------------------------------------------------------------------------- 2252 Returns the remainder of the double-precision floating-point value `a' 2253 with respect to the corresponding value `b'. The operation is performed 2254 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic. 2255 ------------------------------------------------------------------------------- 2256 */ 2257 float64 float64_rem( float64 a, float64 b ) 2258 { 2259 flag aSign, bSign, zSign; 2260 int16 aExp, bExp, expDiff; 2261 bits64 aSig, bSig; 2262 bits64 q, alternateASig; 2263 sbits64 sigMean; 2264 2265 aSig = extractFloat64Frac( a ); 2266 aExp = extractFloat64Exp( a ); 2267 aSign = extractFloat64Sign( a ); 2268 bSig = extractFloat64Frac( b ); 2269 bExp = extractFloat64Exp( b ); 2270 bSign = extractFloat64Sign( b ); 2271 if ( aExp == 0x7FF ) { 2272 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 2273 return propagateFloat64NaN( a, b ); 2274 } 2275 float_raise( float_flag_invalid ); 2276 return float64_default_nan; 2277 } 2278 if ( bExp == 0x7FF ) { 2279 if ( bSig ) return propagateFloat64NaN( a, b ); 2280 return a; 2281 } 2282 if ( bExp == 0 ) { 2283 if ( bSig == 0 ) { 2284 float_raise( float_flag_invalid ); 2285 return float64_default_nan; 2286 } 2287 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2288 } 2289 if ( aExp == 0 ) { 2290 if ( aSig == 0 ) return a; 2291 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2292 } 2293 expDiff = aExp - bExp; 2294 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 2295 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2296 if ( expDiff < 0 ) { 2297 if ( expDiff < -1 ) return a; 2298 aSig >>= 1; 2299 } 2300 q = ( bSig <= aSig ); 2301 if ( q ) aSig -= bSig; 2302 expDiff -= 64; 2303 while ( 0 < expDiff ) { 2304 q = estimateDiv128To64( aSig, 0, bSig ); 2305 q = ( 2 < q ) ? q - 2 : 0; 2306 aSig = - ( ( bSig>>2 ) * q ); 2307 expDiff -= 62; 2308 } 2309 expDiff += 64; 2310 if ( 0 < expDiff ) { 2311 q = estimateDiv128To64( aSig, 0, bSig ); 2312 q = ( 2 < q ) ? q - 2 : 0; 2313 q >>= 64 - expDiff; 2314 bSig >>= 2; 2315 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 2316 } 2317 else { 2318 aSig >>= 2; 2319 bSig >>= 2; 2320 } 2321 do { 2322 alternateASig = aSig; 2323 ++q; 2324 aSig -= bSig; 2325 } while ( 0 <= (sbits64) aSig ); 2326 sigMean = aSig + alternateASig; 2327 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 2328 aSig = alternateASig; 2329 } 2330 zSign = ( (sbits64) aSig < 0 ); 2331 if ( zSign ) aSig = - aSig; 2332 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig ); 2333 2334 } 2335 2336 /* 2337 ------------------------------------------------------------------------------- 2338 Returns the square root of the double-precision floating-point value `a'. 2339 The operation is performed according to the IEC/IEEE Standard for Binary 2340 Floating-point Arithmetic. 2341 ------------------------------------------------------------------------------- 2342 */ 2343 float64 float64_sqrt( float64 a ) 2344 { 2345 flag aSign; 2346 int16 aExp, zExp; 2347 bits64 aSig, zSig; 2348 bits64 rem0, rem1, term0, term1; //, shiftedRem; 2349 //float64 z; 2350 2351 aSig = extractFloat64Frac( a ); 2352 aExp = extractFloat64Exp( a ); 2353 aSign = extractFloat64Sign( a ); 2354 if ( aExp == 0x7FF ) { 2355 if ( aSig ) return propagateFloat64NaN( a, a ); 2356 if ( ! aSign ) return a; 2357 float_raise( float_flag_invalid ); 2358 return float64_default_nan; 2359 } 2360 if ( aSign ) { 2361 if ( ( aExp | aSig ) == 0 ) return a; 2362 float_raise( float_flag_invalid ); 2363 return float64_default_nan; 2364 } 2365 if ( aExp == 0 ) { 2366 if ( aSig == 0 ) return 0; 2367 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2368 } 2369 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; 2370 aSig |= LIT64( 0x0010000000000000 ); 2371 zSig = estimateSqrt32( aExp, aSig>>21 ); 2372 zSig <<= 31; 2373 aSig <<= 9 - ( aExp & 1 ); 2374 zSig = estimateDiv128To64( aSig, 0, zSig ) + zSig + 2; 2375 if ( ( zSig & 0x3FF ) <= 5 ) { 2376 if ( zSig < 2 ) { 2377 zSig = LIT64( 0xFFFFFFFFFFFFFFFF ); 2378 } 2379 else { 2380 aSig <<= 2; 2381 mul64To128( zSig, zSig, &term0, &term1 ); 2382 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 2383 while ( (sbits64) rem0 < 0 ) { 2384 --zSig; 2385 shortShift128Left( 0, zSig, 1, &term0, &term1 ); 2386 term1 |= 1; 2387 add128( rem0, rem1, term0, term1, &rem0, &rem1 ); 2388 } 2389 zSig |= ( ( rem0 | rem1 ) != 0 ); 2390 } 2391 } 2392 shift64RightJamming( zSig, 1, &zSig ); 2393 return roundAndPackFloat64( 0, zExp, zSig ); 2394 2395 } 2396 2397 /* 2398 ------------------------------------------------------------------------------- 2399 Returns 1 if the double-precision floating-point value `a' is equal to the 2400 corresponding value `b', and 0 otherwise. The comparison is performed 2401 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic. 2402 ------------------------------------------------------------------------------- 2403 */ 2404 flag float64_eq( float64 a, float64 b ) 2405 { 2406 2407 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 2408 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 2409 ) { 2410 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 2411 float_raise( float_flag_invalid ); 2412 } 2413 return 0; 2414 } 2415 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 ); 2416 2417 } 2418 2419 /* 2420 ------------------------------------------------------------------------------- 2421 Returns 1 if the double-precision floating-point value `a' is less than or 2422 equal to the corresponding value `b', and 0 otherwise. The comparison is 2423 performed according to the IEC/IEEE Standard for Binary Floating-point 2424 Arithmetic. 2425 ------------------------------------------------------------------------------- 2426 */ 2427 flag float64_le( float64 a, float64 b ) 2428 { 2429 flag aSign, bSign; 2430 2431 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 2432 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 2433 ) { 2434 float_raise( float_flag_invalid ); 2435 return 0; 2436 } 2437 aSign = extractFloat64Sign( a ); 2438 bSign = extractFloat64Sign( b ); 2439 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 ); 2440 return ( a == b ) || ( aSign ^ ( a < b ) ); 2441 2442 } 2443 2444 /* 2445 ------------------------------------------------------------------------------- 2446 Returns 1 if the double-precision floating-point value `a' is less than 2447 the corresponding value `b', and 0 otherwise. The comparison is performed 2448 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic. 2449 ------------------------------------------------------------------------------- 2450 */ 2451 flag float64_lt( float64 a, float64 b ) 2452 { 2453 flag aSign, bSign; 2454 2455 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 2456 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 2457 ) { 2458 float_raise( float_flag_invalid ); 2459 return 0; 2460 } 2461 aSign = extractFloat64Sign( a ); 2462 bSign = extractFloat64Sign( b ); 2463 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 ); 2464 return ( a != b ) && ( aSign ^ ( a < b ) ); 2465 2466 } 2467 2468 /* 2469 ------------------------------------------------------------------------------- 2470 Returns 1 if the double-precision floating-point value `a' is equal to the 2471 corresponding value `b', and 0 otherwise. The invalid exception is raised 2472 if either operand is a NaN. Otherwise, the comparison is performed 2473 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic. 2474 ------------------------------------------------------------------------------- 2475 */ 2476 flag float64_eq_signaling( float64 a, float64 b ) 2477 { 2478 2479 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 2480 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 2481 ) { 2482 float_raise( float_flag_invalid ); 2483 return 0; 2484 } 2485 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 ); 2486 2487 } 2488 2489 /* 2490 ------------------------------------------------------------------------------- 2491 Returns 1 if the double-precision floating-point value `a' is less than or 2492 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 2493 cause an exception. Otherwise, the comparison is performed according to the 2494 IEC/IEEE Standard for Binary Floating-point Arithmetic. 2495 ------------------------------------------------------------------------------- 2496 */ 2497 flag float64_le_quiet( float64 a, float64 b ) 2498 { 2499 flag aSign, bSign; 2500 //int16 aExp, bExp; 2501 2502 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 2503 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 2504 ) { 2505 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 2506 float_raise( float_flag_invalid ); 2507 } 2508 return 0; 2509 } 2510 aSign = extractFloat64Sign( a ); 2511 bSign = extractFloat64Sign( b ); 2512 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 ); 2513 return ( a == b ) || ( aSign ^ ( a < b ) ); 2514 2515 } 2516 2517 /* 2518 ------------------------------------------------------------------------------- 2519 Returns 1 if the double-precision floating-point value `a' is less than 2520 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 2521 exception. Otherwise, the comparison is performed according to the IEC/IEEE 2522 Standard for Binary Floating-point Arithmetic. 2523 ------------------------------------------------------------------------------- 2524 */ 2525 flag float64_lt_quiet( float64 a, float64 b ) 2526 { 2527 flag aSign, bSign; 2528 2529 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 2530 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 2531 ) { 2532 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 2533 float_raise( float_flag_invalid ); 2534 } 2535 return 0; 2536 } 2537 aSign = extractFloat64Sign( a ); 2538 bSign = extractFloat64Sign( b ); 2539 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 ); 2540 return ( a != b ) && ( aSign ^ ( a < b ) ); 2541 2542 } 2543 2544 #ifdef FLOATX80 2545 2546 /* 2547 ------------------------------------------------------------------------------- 2548 Returns the result of converting the extended double-precision floating- 2549 point value `a' to the 32-bit two's complement integer format. The 2550 conversion is performed according to the IEC/IEEE Standard for Binary 2551 Floating-point Arithmetic---which means in particular that the conversion 2552 is rounded according to the current rounding mode. If `a' is a NaN, the 2553 largest positive integer is returned. Otherwise, if the conversion 2554 overflows, the largest integer with the same sign as `a' is returned. 2555 ------------------------------------------------------------------------------- 2556 */ 2557 int32 floatx80_to_int32( floatx80 a ) 2558 { 2559 flag aSign; 2560 int32 aExp, shiftCount; 2561 bits64 aSig; 2562 2563 aSig = extractFloatx80Frac( a ); 2564 aExp = extractFloatx80Exp( a ); 2565 aSign = extractFloatx80Sign( a ); 2566 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; 2567 shiftCount = 0x4037 - aExp; 2568 if ( shiftCount <= 0 ) shiftCount = 1; 2569 shift64RightJamming( aSig, shiftCount, &aSig ); 2570 return roundAndPackInt32( aSign, aSig ); 2571 2572 } 2573 2574 /* 2575 ------------------------------------------------------------------------------- 2576 Returns the result of converting the extended double-precision floating- 2577 point value `a' to the 32-bit two's complement integer format. The 2578 conversion is performed according to the IEC/IEEE Standard for Binary 2579 Floating-point Arithmetic, except that the conversion is always rounded 2580 toward zero. If `a' is a NaN, the largest positive integer is returned. 2581 Otherwise, if the conversion overflows, the largest integer with the same 2582 sign as `a' is returned. 2583 ------------------------------------------------------------------------------- 2584 */ 2585 int32 floatx80_to_int32_round_to_zero( floatx80 a ) 2586 { 2587 flag aSign; 2588 int32 aExp, shiftCount; 2589 bits64 aSig, savedASig; 2590 int32 z; 2591 2592 aSig = extractFloatx80Frac( a ); 2593 aExp = extractFloatx80Exp( a ); 2594 aSign = extractFloatx80Sign( a ); 2595 shiftCount = 0x403E - aExp; 2596 if ( shiftCount < 32 ) { 2597 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; 2598 goto invalid; 2599 } 2600 else if ( 63 < shiftCount ) { 2601 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact; 2602 return 0; 2603 } 2604 savedASig = aSig; 2605 aSig >>= shiftCount; 2606 z = aSig; 2607 if ( aSign ) z = - z; 2608 if ( ( z < 0 ) ^ aSign ) { 2609 invalid: 2610 float_exception_flags |= float_flag_invalid; 2611 return aSign ? 0x80000000 : 0x7FFFFFFF; 2612 } 2613 if ( ( aSig<<shiftCount ) != savedASig ) { 2614 float_exception_flags |= float_flag_inexact; 2615 } 2616 return z; 2617 2618 } 2619 2620 /* 2621 ------------------------------------------------------------------------------- 2622 Returns the result of converting the extended double-precision floating- 2623 point value `a' to the single-precision floating-point format. The 2624 conversion is performed according to the IEC/IEEE Standard for Binary 2625 Floating-point Arithmetic. 2626 ------------------------------------------------------------------------------- 2627 */ 2628 float32 floatx80_to_float32( floatx80 a ) 2629 { 2630 flag aSign; 2631 int32 aExp; 2632 bits64 aSig; 2633 2634 aSig = extractFloatx80Frac( a ); 2635 aExp = extractFloatx80Exp( a ); 2636 aSign = extractFloatx80Sign( a ); 2637 if ( aExp == 0x7FFF ) { 2638 if ( (bits64) ( aSig<<1 ) ) { 2639 return commonNaNToFloat32( floatx80ToCommonNaN( a ) ); 2640 } 2641 return packFloat32( aSign, 0xFF, 0 ); 2642 } 2643 shift64RightJamming( aSig, 33, &aSig ); 2644 if ( aExp || aSig ) aExp -= 0x3F81; 2645 return roundAndPackFloat32( aSign, aExp, aSig ); 2646 2647 } 2648 2649 /* 2650 ------------------------------------------------------------------------------- 2651 Returns the result of converting the extended double-precision floating- 2652 point value `a' to the double-precision floating-point format. The 2653 conversion is performed according to the IEC/IEEE Standard for Binary 2654 Floating-point Arithmetic. 2655 ------------------------------------------------------------------------------- 2656 */ 2657 float64 floatx80_to_float64( floatx80 a ) 2658 { 2659 flag aSign; 2660 int32 aExp; 2661 bits64 aSig, zSig; 2662 2663 aSig = extractFloatx80Frac( a ); 2664 aExp = extractFloatx80Exp( a ); 2665 aSign = extractFloatx80Sign( a ); 2666 if ( aExp == 0x7FFF ) { 2667 if ( (bits64) ( aSig<<1 ) ) { 2668 return commonNaNToFloat64( floatx80ToCommonNaN( a ) ); 2669 } 2670 return packFloat64( aSign, 0x7FF, 0 ); 2671 } 2672 shift64RightJamming( aSig, 1, &zSig ); 2673 if ( aExp || aSig ) aExp -= 0x3C01; 2674 return roundAndPackFloat64( aSign, aExp, zSig ); 2675 2676 } 2677 2678 /* 2679 ------------------------------------------------------------------------------- 2680 Rounds the extended double-precision floating-point value `a' to an integer, 2681 and returns the result as an extended quadruple-precision floating-point 2682 value. The operation is performed according to the IEC/IEEE Standard for 2683 Binary Floating-point Arithmetic. 2684 ------------------------------------------------------------------------------- 2685 */ 2686 floatx80 floatx80_round_to_int( floatx80 a ) 2687 { 2688 flag aSign; 2689 int32 aExp; 2690 bits64 lastBitMask, roundBitsMask; 2691 int8 roundingMode; 2692 floatx80 z; 2693 2694 aExp = extractFloatx80Exp( a ); 2695 if ( 0x403E <= aExp ) { 2696 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) { 2697 return propagateFloatx80NaN( a, a ); 2698 } 2699 return a; 2700 } 2701 if ( aExp <= 0x3FFE ) { 2702 if ( ( aExp == 0 ) 2703 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 2704 return a; 2705 } 2706 float_exception_flags |= float_flag_inexact; 2707 aSign = extractFloatx80Sign( a ); 2708 switch ( float_rounding_mode ) { 2709 case float_round_nearest_even: 2710 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 ) 2711 ) { 2712 return 2713 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 2714 } 2715 break; 2716 case float_round_down: 2717 return 2718 aSign ? 2719 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 2720 : packFloatx80( 0, 0, 0 ); 2721 case float_round_up: 2722 return 2723 aSign ? packFloatx80( 1, 0, 0 ) 2724 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 2725 } 2726 return packFloatx80( aSign, 0, 0 ); 2727 } 2728 lastBitMask = 1; 2729 lastBitMask <<= 0x403E - aExp; 2730 roundBitsMask = lastBitMask - 1; 2731 z = a; 2732 roundingMode = float_rounding_mode; 2733 if ( roundingMode == float_round_nearest_even ) { 2734 z.low += lastBitMask>>1; 2735 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 2736 } 2737 else if ( roundingMode != float_round_to_zero ) { 2738 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) { 2739 z.low += roundBitsMask; 2740 } 2741 } 2742 z.low &= ~ roundBitsMask; 2743 if ( z.low == 0 ) { 2744 ++z.high; 2745 z.low = LIT64( 0x8000000000000000 ); 2746 } 2747 if ( z.low != a.low ) float_exception_flags |= float_flag_inexact; 2748 return z; 2749 2750 } 2751 2752 /* 2753 ------------------------------------------------------------------------------- 2754 Returns the result of adding the absolute values of the extended double- 2755 precision floating-point values `a' and `b'. If `zSign' is true, the sum is 2756 negated before being returned. `zSign' is ignored if the result is a NaN. 2757 The addition is performed according to the IEC/IEEE Standard for Binary 2758 Floating-point Arithmetic. 2759 ------------------------------------------------------------------------------- 2760 */ 2761 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign ) 2762 { 2763 int32 aExp, bExp, zExp; 2764 bits64 aSig, bSig, zSig0, zSig1; 2765 int32 expDiff; 2766 2767 aSig = extractFloatx80Frac( a ); 2768 aExp = extractFloatx80Exp( a ); 2769 bSig = extractFloatx80Frac( b ); 2770 bExp = extractFloatx80Exp( b ); 2771 expDiff = aExp - bExp; 2772 if ( 0 < expDiff ) { 2773 if ( aExp == 0x7FFF ) { 2774 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 2775 return a; 2776 } 2777 if ( bExp == 0 ) --expDiff; 2778 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 2779 zExp = aExp; 2780 } 2781 else if ( expDiff < 0 ) { 2782 if ( bExp == 0x7FFF ) { 2783 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 2784 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 2785 } 2786 if ( aExp == 0 ) ++expDiff; 2787 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 2788 zExp = bExp; 2789 } 2790 else { 2791 if ( aExp == 0x7FFF ) { 2792 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { 2793 return propagateFloatx80NaN( a, b ); 2794 } 2795 return a; 2796 } 2797 zSig1 = 0; 2798 zSig0 = aSig + bSig; 2799 if ( aExp == 0 ) { 2800 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 2801 goto roundAndPack; 2802 } 2803 zExp = aExp; 2804 goto shiftRight1; 2805 } 2806 2807 zSig0 = aSig + bSig; 2808 2809 if ( (sbits64) zSig0 < 0 ) goto roundAndPack; 2810 shiftRight1: 2811 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 2812 zSig0 |= LIT64( 0x8000000000000000 ); 2813 ++zExp; 2814 roundAndPack: 2815 return 2816 roundAndPackFloatx80( 2817 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 2818 2819 } 2820 2821 /* 2822 ------------------------------------------------------------------------------- 2823 Returns the result of subtracting the absolute values of the extended 2824 double-precision floating-point values `a' and `b'. If `zSign' is true, 2825 the difference is negated before being returned. `zSign' is ignored if the 2826 result is a NaN. The subtraction is performed according to the IEC/IEEE 2827 Standard for Binary Floating-point Arithmetic. 2828 ------------------------------------------------------------------------------- 2829 */ 2830 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign ) 2831 { 2832 int32 aExp, bExp, zExp; 2833 bits64 aSig, bSig, zSig0, zSig1; 2834 int32 expDiff; 2835 floatx80 z; 2836 2837 aSig = extractFloatx80Frac( a ); 2838 aExp = extractFloatx80Exp( a ); 2839 bSig = extractFloatx80Frac( b ); 2840 bExp = extractFloatx80Exp( b ); 2841 expDiff = aExp - bExp; 2842 if ( 0 < expDiff ) goto aExpBigger; 2843 if ( expDiff < 0 ) goto bExpBigger; 2844 if ( aExp == 0x7FFF ) { 2845 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { 2846 return propagateFloatx80NaN( a, b ); 2847 } 2848 float_raise( float_flag_invalid ); 2849 z.low = floatx80_default_nan_low; 2850 z.high = floatx80_default_nan_high; 2851 return z; 2852 } 2853 if ( aExp == 0 ) { 2854 aExp = 1; 2855 bExp = 1; 2856 } 2857 zSig1 = 0; 2858 if ( bSig < aSig ) goto aBigger; 2859 if ( aSig < bSig ) goto bBigger; 2860 return packFloatx80( float_rounding_mode == float_round_down, 0, 0 ); 2861 bExpBigger: 2862 if ( bExp == 0x7FFF ) { 2863 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 2864 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); 2865 } 2866 if ( aExp == 0 ) ++expDiff; 2867 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 2868 bBigger: 2869 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 2870 zExp = bExp; 2871 zSign ^= 1; 2872 goto normalizeRoundAndPack; 2873 aExpBigger: 2874 if ( aExp == 0x7FFF ) { 2875 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 2876 return a; 2877 } 2878 if ( bExp == 0 ) --expDiff; 2879 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 2880 aBigger: 2881 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 2882 zExp = aExp; 2883 normalizeRoundAndPack: 2884 return 2885 normalizeRoundAndPackFloatx80( 2886 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 2887 2888 } 2889 2890 /* 2891 ------------------------------------------------------------------------------- 2892 Returns the result of adding the extended double-precision floating-point 2893 values `a' and `b'. The operation is performed according to the IEC/IEEE 2894 Standard for Binary Floating-point Arithmetic. 2895 ------------------------------------------------------------------------------- 2896 */ 2897 floatx80 floatx80_add( floatx80 a, floatx80 b ) 2898 { 2899 flag aSign, bSign; 2900 2901 aSign = extractFloatx80Sign( a ); 2902 bSign = extractFloatx80Sign( b ); 2903 if ( aSign == bSign ) { 2904 return addFloatx80Sigs( a, b, aSign ); 2905 } 2906 else { 2907 return subFloatx80Sigs( a, b, aSign ); 2908 } 2909 2910 } 2911 2912 /* 2913 ------------------------------------------------------------------------------- 2914 Returns the result of subtracting the extended double-precision floating- 2915 point values `a' and `b'. The operation is performed according to the 2916 IEC/IEEE Standard for Binary Floating-point Arithmetic. 2917 ------------------------------------------------------------------------------- 2918 */ 2919 floatx80 floatx80_sub( floatx80 a, floatx80 b ) 2920 { 2921 flag aSign, bSign; 2922 2923 aSign = extractFloatx80Sign( a ); 2924 bSign = extractFloatx80Sign( b ); 2925 if ( aSign == bSign ) { 2926 return subFloatx80Sigs( a, b, aSign ); 2927 } 2928 else { 2929 return addFloatx80Sigs( a, b, aSign ); 2930 } 2931 2932 } 2933 2934 /* 2935 ------------------------------------------------------------------------------- 2936 Returns the result of multiplying the extended double-precision floating- 2937 point values `a' and `b'. The operation is performed according to the 2938 IEC/IEEE Standard for Binary Floating-point Arithmetic. 2939 ------------------------------------------------------------------------------- 2940 */ 2941 floatx80 floatx80_mul( floatx80 a, floatx80 b ) 2942 { 2943 flag aSign, bSign, zSign; 2944 int32 aExp, bExp, zExp; 2945 bits64 aSig, bSig, zSig0, zSig1; 2946 floatx80 z; 2947 2948 aSig = extractFloatx80Frac( a ); 2949 aExp = extractFloatx80Exp( a ); 2950 aSign = extractFloatx80Sign( a ); 2951 bSig = extractFloatx80Frac( b ); 2952 bExp = extractFloatx80Exp( b ); 2953 bSign = extractFloatx80Sign( b ); 2954 zSign = aSign ^ bSign; 2955 if ( aExp == 0x7FFF ) { 2956 if ( (bits64) ( aSig<<1 ) 2957 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { 2958 return propagateFloatx80NaN( a, b ); 2959 } 2960 if ( ( bExp | bSig ) == 0 ) goto invalid; 2961 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 2962 } 2963 if ( bExp == 0x7FFF ) { 2964 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 2965 if ( ( aExp | aSig ) == 0 ) { 2966 invalid: 2967 float_raise( float_flag_invalid ); 2968 z.low = floatx80_default_nan_low; 2969 z.high = floatx80_default_nan_high; 2970 return z; 2971 } 2972 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 2973 } 2974 if ( aExp == 0 ) { 2975 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 2976 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 2977 } 2978 if ( bExp == 0 ) { 2979 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 2980 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 2981 } 2982 zExp = aExp + bExp - 0x3FFE; 2983 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 2984 if ( 0 < (sbits64) zSig0 ) { 2985 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 2986 --zExp; 2987 } 2988 return 2989 roundAndPackFloatx80( 2990 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 2991 2992 } 2993 2994 /* 2995 ------------------------------------------------------------------------------- 2996 Returns the result of dividing the extended double-precision floating-point 2997 value `a' by the corresponding value `b'. The operation is performed 2998 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic. 2999 ------------------------------------------------------------------------------- 3000 */ 3001 floatx80 floatx80_div( floatx80 a, floatx80 b ) 3002 { 3003 flag aSign, bSign, zSign; 3004 int32 aExp, bExp, zExp; 3005 bits64 aSig, bSig, zSig0, zSig1; 3006 bits64 rem0, rem1, rem2, term0, term1, term2; 3007 floatx80 z; 3008 3009 aSig = extractFloatx80Frac( a ); 3010 aExp = extractFloatx80Exp( a ); 3011 aSign = extractFloatx80Sign( a ); 3012 bSig = extractFloatx80Frac( b ); 3013 bExp = extractFloatx80Exp( b ); 3014 bSign = extractFloatx80Sign( b ); 3015 zSign = aSign ^ bSign; 3016 if ( aExp == 0x7FFF ) { 3017 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3018 if ( bExp == 0x7FFF ) { 3019 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3020 goto invalid; 3021 } 3022 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3023 } 3024 if ( bExp == 0x7FFF ) { 3025 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3026 return packFloatx80( zSign, 0, 0 ); 3027 } 3028 if ( bExp == 0 ) { 3029 if ( bSig == 0 ) { 3030 if ( ( aExp | aSig ) == 0 ) { 3031 invalid: 3032 float_raise( float_flag_invalid ); 3033 z.low = floatx80_default_nan_low; 3034 z.high = floatx80_default_nan_high; 3035 return z; 3036 } 3037 float_raise( float_flag_divbyzero ); 3038 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3039 } 3040 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3041 } 3042 if ( aExp == 0 ) { 3043 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3044 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 3045 } 3046 zExp = aExp - bExp + 0x3FFE; 3047 rem1 = 0; 3048 if ( bSig <= aSig ) { 3049 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 3050 ++zExp; 3051 } 3052 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 3053 mul64To128( bSig, zSig0, &term0, &term1 ); 3054 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 3055 while ( (sbits64) rem0 < 0 ) { 3056 --zSig0; 3057 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 3058 } 3059 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 3060 if ( (bits64) ( zSig1<<1 ) <= 8 ) { 3061 mul64To128( bSig, zSig1, &term1, &term2 ); 3062 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 3063 while ( (sbits64) rem1 < 0 ) { 3064 --zSig1; 3065 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 3066 } 3067 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 3068 } 3069 return 3070 roundAndPackFloatx80( 3071 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3072 3073 } 3074 3075 /* 3076 ------------------------------------------------------------------------------- 3077 Returns the remainder of the extended double-precision floating-point value 3078 `a' with respect to the corresponding value `b'. The operation is performed 3079 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic. 3080 ------------------------------------------------------------------------------- 3081 */ 3082 floatx80 floatx80_rem( floatx80 a, floatx80 b ) 3083 { 3084 flag aSign, bSign, zSign; 3085 int32 aExp, bExp, expDiff; 3086 bits64 aSig0, aSig1, bSig; 3087 bits64 q, term0, term1, alternateASig0, alternateASig1; 3088 floatx80 z; 3089 3090 aSig0 = extractFloatx80Frac( a ); 3091 aExp = extractFloatx80Exp( a ); 3092 aSign = extractFloatx80Sign( a ); 3093 bSig = extractFloatx80Frac( b ); 3094 bExp = extractFloatx80Exp( b ); 3095 bSign = extractFloatx80Sign( b ); 3096 if ( aExp == 0x7FFF ) { 3097 if ( (bits64) ( aSig0<<1 ) 3098 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { 3099 return propagateFloatx80NaN( a, b ); 3100 } 3101 goto invalid; 3102 } 3103 if ( bExp == 0x7FFF ) { 3104 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3105 return a; 3106 } 3107 if ( bExp == 0 ) { 3108 if ( bSig == 0 ) { 3109 invalid: 3110 float_raise( float_flag_invalid ); 3111 z.low = floatx80_default_nan_low; 3112 z.high = floatx80_default_nan_high; 3113 return z; 3114 } 3115 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3116 } 3117 if ( aExp == 0 ) { 3118 if ( (bits64) ( aSig0<<1 ) == 0 ) return a; 3119 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 3120 } 3121 bSig |= LIT64( 0x8000000000000000 ); 3122 zSign = aSign; 3123 expDiff = aExp - bExp; 3124 aSig1 = 0; 3125 if ( expDiff < 0 ) { 3126 if ( expDiff < -1 ) return a; 3127 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 3128 expDiff = 0; 3129 } 3130 q = ( bSig <= aSig0 ); 3131 if ( q ) aSig0 -= bSig; 3132 expDiff -= 64; 3133 while ( 0 < expDiff ) { 3134 q = estimateDiv128To64( aSig0, aSig1, bSig ); 3135 q = ( 2 < q ) ? q - 2 : 0; 3136 mul64To128( bSig, q, &term0, &term1 ); 3137 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 3138 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 3139 expDiff -= 62; 3140 } 3141 expDiff += 64; 3142 if ( 0 < expDiff ) { 3143 q = estimateDiv128To64( aSig0, aSig1, bSig ); 3144 q = ( 2 < q ) ? q - 2 : 0; 3145 q >>= 64 - expDiff; 3146 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 3147 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 3148 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 3149 while ( le128( term0, term1, aSig0, aSig1 ) ) { 3150 ++q; 3151 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 3152 } 3153 } 3154 else { 3155 term1 = 0; 3156 term0 = bSig; 3157 } 3158 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 3159 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 3160 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 3161 && ( q & 1 ) ) 3162 ) { 3163 aSig0 = alternateASig0; 3164 aSig1 = alternateASig1; 3165 zSign = ! zSign; 3166 } 3167 return 3168 normalizeRoundAndPackFloatx80( 3169 80, zSign, bExp + expDiff, aSig0, aSig1 ); 3170 3171 } 3172 3173 /* 3174 ------------------------------------------------------------------------------- 3175 Returns the square root of the extended double-precision floating-point 3176 value `a'. The operation is performed according to the IEC/IEEE Standard 3177 for Binary Floating-point Arithmetic. 3178 ------------------------------------------------------------------------------- 3179 */ 3180 floatx80 floatx80_sqrt( floatx80 a ) 3181 { 3182 flag aSign; 3183 int32 aExp, zExp; 3184 bits64 aSig0, aSig1, zSig0, zSig1; 3185 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 3186 bits64 shiftedRem0, shiftedRem1; 3187 floatx80 z; 3188 3189 aSig0 = extractFloatx80Frac( a ); 3190 aExp = extractFloatx80Exp( a ); 3191 aSign = extractFloatx80Sign( a ); 3192 if ( aExp == 0x7FFF ) { 3193 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a ); 3194 if ( ! aSign ) return a; 3195 goto invalid; 3196 } 3197 if ( aSign ) { 3198 if ( ( aExp | aSig0 ) == 0 ) return a; 3199 invalid: 3200 float_raise( float_flag_invalid ); 3201 z.low = floatx80_default_nan_low; 3202 z.high = floatx80_default_nan_high; 3203 return z; 3204 } 3205 if ( aExp == 0 ) { 3206 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 3207 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 3208 } 3209 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 3210 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 3211 zSig0 <<= 31; 3212 aSig1 = 0; 3213 shift128Right( aSig0, 0, ( aExp & 1 ) + 2, &aSig0, &aSig1 ); 3214 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0 ) + zSig0 + 4; 3215 if ( 0 <= (sbits64) zSig0 ) zSig0 = LIT64( 0xFFFFFFFFFFFFFFFF ); 3216 shortShift128Left( aSig0, aSig1, 2, &aSig0, &aSig1 ); 3217 mul64To128( zSig0, zSig0, &term0, &term1 ); 3218 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 3219 while ( (sbits64) rem0 < 0 ) { 3220 --zSig0; 3221 shortShift128Left( 0, zSig0, 1, &term0, &term1 ); 3222 term1 |= 1; 3223 add128( rem0, rem1, term0, term1, &rem0, &rem1 ); 3224 } 3225 shortShift128Left( rem0, rem1, 63, &shiftedRem0, &shiftedRem1 ); 3226 zSig1 = estimateDiv128To64( shiftedRem0, shiftedRem1, zSig0 ); 3227 if ( (bits64) ( zSig1<<1 ) <= 10 ) { 3228 if ( zSig1 == 0 ) zSig1 = 1; 3229 mul64To128( zSig0, zSig1, &term1, &term2 ); 3230 shortShift128Left( term1, term2, 1, &term1, &term2 ); 3231 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 3232 mul64To128( zSig1, zSig1, &term2, &term3 ); 3233 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 3234 while ( (sbits64) rem1 < 0 ) { 3235 --zSig1; 3236 shortShift192Left( 0, zSig0, zSig1, 1, &term1, &term2, &term3 ); 3237 term3 |= 1; 3238 add192( 3239 rem1, rem2, rem3, term1, term2, term3, &rem1, &rem2, &rem3 ); 3240 } 3241 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 3242 } 3243 return 3244 roundAndPackFloatx80( 3245 floatx80_rounding_precision, 0, zExp, zSig0, zSig1 ); 3246 3247 } 3248 3249 /* 3250 ------------------------------------------------------------------------------- 3251 Returns 1 if the extended double-precision floating-point value `a' is 3252 equal to the corresponding value `b', and 0 otherwise. The comparison is 3253 performed according to the IEC/IEEE Standard for Binary Floating-point 3254 Arithmetic. 3255 ------------------------------------------------------------------------------- 3256 */ 3257 flag floatx80_eq( floatx80 a, floatx80 b ) 3258 { 3259 3260 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 3261 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 3262 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 3263 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 3264 ) { 3265 if ( floatx80_is_signaling_nan( a ) 3266 || floatx80_is_signaling_nan( b ) ) { 3267 float_raise( float_flag_invalid ); 3268 } 3269 return 0; 3270 } 3271 return 3272 ( a.low == b.low ) 3273 && ( ( a.high == b.high ) 3274 || ( ( a.low == 0 ) 3275 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) 3276 ); 3277 3278 } 3279 3280 /* 3281 ------------------------------------------------------------------------------- 3282 Returns 1 if the extended double-precision floating-point value `a' is 3283 less than or equal to the corresponding value `b', and 0 otherwise. The 3284 comparison is performed according to the IEC/IEEE Standard for Binary 3285 Floating-point Arithmetic. 3286 ------------------------------------------------------------------------------- 3287 */ 3288 flag floatx80_le( floatx80 a, floatx80 b ) 3289 { 3290 flag aSign, bSign; 3291 3292 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 3293 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 3294 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 3295 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 3296 ) { 3297 float_raise( float_flag_invalid ); 3298 return 0; 3299 } 3300 aSign = extractFloatx80Sign( a ); 3301 bSign = extractFloatx80Sign( b ); 3302 if ( aSign != bSign ) { 3303 return 3304 aSign 3305 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 3306 == 0 ); 3307 } 3308 return 3309 aSign ? le128( b.high, b.low, a.high, a.low ) 3310 : le128( a.high, a.low, b.high, b.low ); 3311 3312 } 3313 3314 /* 3315 ------------------------------------------------------------------------------- 3316 Returns 1 if the extended double-precision floating-point value `a' is 3317 less than the corresponding value `b', and 0 otherwise. The comparison 3318 is performed according to the IEC/IEEE Standard for Binary Floating-point 3319 Arithmetic. 3320 ------------------------------------------------------------------------------- 3321 */ 3322 flag floatx80_lt( floatx80 a, floatx80 b ) 3323 { 3324 flag aSign, bSign; 3325 3326 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 3327 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 3328 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 3329 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 3330 ) { 3331 float_raise( float_flag_invalid ); 3332 return 0; 3333 } 3334 aSign = extractFloatx80Sign( a ); 3335 bSign = extractFloatx80Sign( b ); 3336 if ( aSign != bSign ) { 3337 return 3338 aSign 3339 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 3340 != 0 ); 3341 } 3342 return 3343 aSign ? lt128( b.high, b.low, a.high, a.low ) 3344 : lt128( a.high, a.low, b.high, b.low ); 3345 3346 } 3347 3348 /* 3349 ------------------------------------------------------------------------------- 3350 Returns 1 if the extended double-precision floating-point value `a' is equal 3351 to the corresponding value `b', and 0 otherwise. The invalid exception is 3352 raised if either operand is a NaN. Otherwise, the comparison is performed 3353 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic. 3354 ------------------------------------------------------------------------------- 3355 */ 3356 flag floatx80_eq_signaling( floatx80 a, floatx80 b ) 3357 { 3358 3359 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 3360 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 3361 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 3362 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 3363 ) { 3364 float_raise( float_flag_invalid ); 3365 return 0; 3366 } 3367 return 3368 ( a.low == b.low ) 3369 && ( ( a.high == b.high ) 3370 || ( ( a.low == 0 ) 3371 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) 3372 ); 3373 3374 } 3375 3376 /* 3377 ------------------------------------------------------------------------------- 3378 Returns 1 if the extended double-precision floating-point value `a' is less 3379 than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs 3380 do not cause an exception. Otherwise, the comparison is performed according 3381 to the IEC/IEEE Standard for Binary Floating-point Arithmetic. 3382 ------------------------------------------------------------------------------- 3383 */ 3384 flag floatx80_le_quiet( floatx80 a, floatx80 b ) 3385 { 3386 flag aSign, bSign; 3387 3388 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 3389 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 3390 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 3391 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 3392 ) { 3393 if ( floatx80_is_signaling_nan( a ) 3394 || floatx80_is_signaling_nan( b ) ) { 3395 float_raise( float_flag_invalid ); 3396 } 3397 return 0; 3398 } 3399 aSign = extractFloatx80Sign( a ); 3400 bSign = extractFloatx80Sign( b ); 3401 if ( aSign != bSign ) { 3402 return 3403 aSign 3404 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 3405 == 0 ); 3406 } 3407 return 3408 aSign ? le128( b.high, b.low, a.high, a.low ) 3409 : le128( a.high, a.low, b.high, b.low ); 3410 3411 } 3412 3413 /* 3414 ------------------------------------------------------------------------------- 3415 Returns 1 if the extended double-precision floating-point value `a' is less 3416 than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause 3417 an exception. Otherwise, the comparison is performed according to the 3418 IEC/IEEE Standard for Binary Floating-point Arithmetic. 3419 ------------------------------------------------------------------------------- 3420 */ 3421 flag floatx80_lt_quiet( floatx80 a, floatx80 b ) 3422 { 3423 flag aSign, bSign; 3424 3425 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 3426 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 3427 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 3428 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 3429 ) { 3430 if ( floatx80_is_signaling_nan( a ) 3431 || floatx80_is_signaling_nan( b ) ) { 3432 float_raise( float_flag_invalid ); 3433 } 3434 return 0; 3435 } 3436 aSign = extractFloatx80Sign( a ); 3437 bSign = extractFloatx80Sign( b ); 3438 if ( aSign != bSign ) { 3439 return 3440 aSign 3441 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 3442 != 0 ); 3443 } 3444 return 3445 aSign ? lt128( b.high, b.low, a.high, a.low ) 3446 : lt128( a.high, a.low, b.high, b.low ); 3447 3448 } 3449 3450 #endif 3451 3452