1 /* $NetBSD: softfloat.c,v 1.8 2011/07/10 04:52:23 matt Exp $ */ 2 3 /* 4 * This version hacked for use with gcc -msoft-float by bjh21. 5 * (Mostly a case of #ifdefing out things GCC doesn't need or provides 6 * itself). 7 */ 8 9 /* 10 * Things you may want to define: 11 * 12 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with 13 * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them 14 * properly renamed. 15 */ 16 17 /* 18 =============================================================================== 19 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 #ifdef SOFTFLOAT_FOR_GCC 48 #include "softfloat-for-gcc.h" 49 #endif 50 51 #include "milieu.h" 52 #include "softfloat.h" 53 54 /* 55 * Conversions between floats as stored in memory and floats as 56 * SoftFloat uses them 57 */ 58 #ifndef FLOAT64_DEMANGLE 59 #define FLOAT64_DEMANGLE(a) (a) 60 #endif 61 #ifndef FLOAT64_MANGLE 62 #define FLOAT64_MANGLE(a) (a) 63 #endif 64 65 /* 66 ------------------------------------------------------------------------------- 67 Floating-point rounding mode, extended double-precision rounding precision, 68 and exception flags. 69 ------------------------------------------------------------------------------- 70 */ 71 int float_rounding_mode = float_round_nearest_even; 72 int float_exception_flags = 0; 73 #ifdef FLOATX80 74 int8 floatx80_rounding_precision = 80; 75 #endif 76 77 /* 78 ------------------------------------------------------------------------------- 79 Primitive arithmetic functions, including multi-word arithmetic, and 80 division and square root approximations. (Can be specialized to target if 81 desired.) 82 ------------------------------------------------------------------------------- 83 */ 84 #include "softfloat-macros" 85 86 /* 87 ------------------------------------------------------------------------------- 88 Functions and definitions to determine: (1) whether tininess for underflow 89 is detected before or after rounding by default, (2) what (if anything) 90 happens when exceptions are raised, (3) how signaling NaNs are distinguished 91 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 92 are propagated from function inputs to output. These details are target- 93 specific. 94 ------------------------------------------------------------------------------- 95 */ 96 #include "softfloat-specialize" 97 98 #if !defined(SOFTFLOAT_FOR_GCC) || defined(FLOATX80) || defined(FLOAT128) 99 /* 100 ------------------------------------------------------------------------------- 101 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 102 and 7, and returns the properly rounded 32-bit integer corresponding to the 103 input. If `zSign' is 1, the input is negated before being converted to an 104 integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 105 is simply rounded to an integer, with the inexact exception raised if the 106 input cannot be represented exactly as an integer. However, if the fixed- 107 point input is too large, the invalid exception is raised and the largest 108 positive or negative integer is returned. 109 ------------------------------------------------------------------------------- 110 */ 111 static int32 roundAndPackInt32( flag zSign, bits64 absZ ) 112 { 113 int8 roundingMode; 114 flag roundNearestEven; 115 int8 roundIncrement, roundBits; 116 int32 z; 117 118 roundingMode = float_rounding_mode; 119 roundNearestEven = ( roundingMode == float_round_nearest_even ); 120 roundIncrement = 0x40; 121 if ( ! roundNearestEven ) { 122 if ( roundingMode == float_round_to_zero ) { 123 roundIncrement = 0; 124 } 125 else { 126 roundIncrement = 0x7F; 127 if ( zSign ) { 128 if ( roundingMode == float_round_up ) roundIncrement = 0; 129 } 130 else { 131 if ( roundingMode == float_round_down ) roundIncrement = 0; 132 } 133 } 134 } 135 roundBits = absZ & 0x7F; 136 absZ = ( absZ + roundIncrement )>>7; 137 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 138 z = absZ; 139 if ( zSign ) z = - z; 140 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 141 float_raise( float_flag_invalid ); 142 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 143 } 144 if ( roundBits ) float_exception_flags |= float_flag_inexact; 145 return z; 146 147 } 148 149 /* 150 ------------------------------------------------------------------------------- 151 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 152 `absZ1', with binary point between bits 63 and 64 (between the input words), 153 and returns the properly rounded 64-bit integer corresponding to the input. 154 If `zSign' is 1, the input is negated before being converted to an integer. 155 Ordinarily, the fixed-point input is simply rounded to an integer, with 156 the inexact exception raised if the input cannot be represented exactly as 157 an integer. However, if the fixed-point input is too large, the invalid 158 exception is raised and the largest positive or negative integer is 159 returned. 160 ------------------------------------------------------------------------------- 161 */ 162 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 ) 163 { 164 int8 roundingMode; 165 flag roundNearestEven, increment; 166 int64 z; 167 168 roundingMode = float_rounding_mode; 169 roundNearestEven = ( roundingMode == float_round_nearest_even ); 170 increment = ( (sbits64) absZ1 < 0 ); 171 if ( ! roundNearestEven ) { 172 if ( roundingMode == float_round_to_zero ) { 173 increment = 0; 174 } 175 else { 176 if ( zSign ) { 177 increment = ( roundingMode == float_round_down ) && absZ1; 178 } 179 else { 180 increment = ( roundingMode == float_round_up ) && absZ1; 181 } 182 } 183 } 184 if ( increment ) { 185 ++absZ0; 186 if ( absZ0 == 0 ) goto overflow; 187 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 188 } 189 z = absZ0; 190 if ( zSign ) z = - z; 191 if ( z && ( ( z < 0 ) ^ zSign ) ) { 192 overflow: 193 float_raise( float_flag_invalid ); 194 return 195 zSign ? (sbits64) LIT64( 0x8000000000000000 ) 196 : LIT64( 0x7FFFFFFFFFFFFFFF ); 197 } 198 if ( absZ1 ) float_exception_flags |= float_flag_inexact; 199 return z; 200 201 } 202 #endif 203 204 /* 205 ------------------------------------------------------------------------------- 206 Returns the fraction bits of the single-precision floating-point value `a'. 207 ------------------------------------------------------------------------------- 208 */ 209 INLINE bits32 extractFloat32Frac( float32 a ) 210 { 211 212 return a & 0x007FFFFF; 213 214 } 215 216 /* 217 ------------------------------------------------------------------------------- 218 Returns the exponent bits of the single-precision floating-point value `a'. 219 ------------------------------------------------------------------------------- 220 */ 221 INLINE int16 extractFloat32Exp( float32 a ) 222 { 223 224 return ( a>>23 ) & 0xFF; 225 226 } 227 228 /* 229 ------------------------------------------------------------------------------- 230 Returns the sign bit of the single-precision floating-point value `a'. 231 ------------------------------------------------------------------------------- 232 */ 233 INLINE flag extractFloat32Sign( float32 a ) 234 { 235 236 return a>>31; 237 238 } 239 240 /* 241 ------------------------------------------------------------------------------- 242 Normalizes the subnormal single-precision floating-point value represented 243 by the denormalized significand `aSig'. The normalized exponent and 244 significand are stored at the locations pointed to by `zExpPtr' and 245 `zSigPtr', respectively. 246 ------------------------------------------------------------------------------- 247 */ 248 static void 249 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr ) 250 { 251 int8 shiftCount; 252 253 shiftCount = countLeadingZeros32( aSig ) - 8; 254 *zSigPtr = aSig<<shiftCount; 255 *zExpPtr = 1 - shiftCount; 256 257 } 258 259 /* 260 ------------------------------------------------------------------------------- 261 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 262 single-precision floating-point value, returning the result. After being 263 shifted into the proper positions, the three fields are simply added 264 together to form the result. This means that any integer portion of `zSig' 265 will be added into the exponent. Since a properly normalized significand 266 will have an integer portion equal to 1, the `zExp' input should be 1 less 267 than the desired result exponent whenever `zSig' is a complete, normalized 268 significand. 269 ------------------------------------------------------------------------------- 270 */ 271 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig ) 272 { 273 274 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig; 275 276 } 277 278 /* 279 ------------------------------------------------------------------------------- 280 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 281 and significand `zSig', and returns the proper single-precision floating- 282 point value corresponding to the abstract input. Ordinarily, the abstract 283 value is simply rounded and packed into the single-precision format, with 284 the inexact exception raised if the abstract input cannot be represented 285 exactly. However, if the abstract value is too large, the overflow and 286 inexact exceptions are raised and an infinity or maximal finite value is 287 returned. If the abstract value is too small, the input value is rounded to 288 a subnormal number, and the underflow and inexact exceptions are raised if 289 the abstract input cannot be represented exactly as a subnormal single- 290 precision floating-point number. 291 The input significand `zSig' has its binary point between bits 30 292 and 29, which is 7 bits to the left of the usual location. This shifted 293 significand must be normalized or smaller. If `zSig' is not normalized, 294 `zExp' must be 0; in that case, the result returned is a subnormal number, 295 and it must not require rounding. In the usual case that `zSig' is 296 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 297 The handling of underflow and overflow follows the IEC/IEEE Standard for 298 Binary Floating-Point Arithmetic. 299 ------------------------------------------------------------------------------- 300 */ 301 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) 302 { 303 int8 roundingMode; 304 flag roundNearestEven; 305 int8 roundIncrement, roundBits; 306 flag isTiny; 307 308 roundingMode = float_rounding_mode; 309 roundNearestEven = ( roundingMode == float_round_nearest_even ); 310 roundIncrement = 0x40; 311 if ( ! roundNearestEven ) { 312 if ( roundingMode == float_round_to_zero ) { 313 roundIncrement = 0; 314 } 315 else { 316 roundIncrement = 0x7F; 317 if ( zSign ) { 318 if ( roundingMode == float_round_up ) roundIncrement = 0; 319 } 320 else { 321 if ( roundingMode == float_round_down ) roundIncrement = 0; 322 } 323 } 324 } 325 roundBits = zSig & 0x7F; 326 if ( 0xFD <= (bits16) zExp ) { 327 if ( ( 0xFD < zExp ) 328 || ( ( zExp == 0xFD ) 329 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) ) 330 ) { 331 float_raise( float_flag_overflow | float_flag_inexact ); 332 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 ); 333 } 334 if ( zExp < 0 ) { 335 isTiny = 336 ( float_detect_tininess == float_tininess_before_rounding ) 337 || ( zExp < -1 ) 338 || ( zSig + roundIncrement < 0x80000000 ); 339 shift32RightJamming( zSig, - zExp, &zSig ); 340 zExp = 0; 341 roundBits = zSig & 0x7F; 342 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 343 } 344 } 345 if ( roundBits ) float_exception_flags |= float_flag_inexact; 346 zSig = ( zSig + roundIncrement )>>7; 347 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 348 if ( zSig == 0 ) zExp = 0; 349 return packFloat32( zSign, zExp, zSig ); 350 351 } 352 353 /* 354 ------------------------------------------------------------------------------- 355 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 356 and significand `zSig', and returns the proper single-precision floating- 357 point value corresponding to the abstract input. This routine is just like 358 `roundAndPackFloat32' except that `zSig' does not have to be normalized. 359 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 360 floating-point exponent. 361 ------------------------------------------------------------------------------- 362 */ 363 static float32 364 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) 365 { 366 int8 shiftCount; 367 368 shiftCount = countLeadingZeros32( zSig ) - 1; 369 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount ); 370 371 } 372 373 /* 374 ------------------------------------------------------------------------------- 375 Returns the fraction bits of the double-precision floating-point value `a'. 376 ------------------------------------------------------------------------------- 377 */ 378 INLINE bits64 extractFloat64Frac( float64 a ) 379 { 380 381 return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF ); 382 383 } 384 385 /* 386 ------------------------------------------------------------------------------- 387 Returns the exponent bits of the double-precision floating-point value `a'. 388 ------------------------------------------------------------------------------- 389 */ 390 INLINE int16 extractFloat64Exp( float64 a ) 391 { 392 393 return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF; 394 395 } 396 397 /* 398 ------------------------------------------------------------------------------- 399 Returns the sign bit of the double-precision floating-point value `a'. 400 ------------------------------------------------------------------------------- 401 */ 402 INLINE flag extractFloat64Sign( float64 a ) 403 { 404 405 return FLOAT64_DEMANGLE(a)>>63; 406 407 } 408 409 /* 410 ------------------------------------------------------------------------------- 411 Normalizes the subnormal double-precision floating-point value represented 412 by the denormalized significand `aSig'. The normalized exponent and 413 significand are stored at the locations pointed to by `zExpPtr' and 414 `zSigPtr', respectively. 415 ------------------------------------------------------------------------------- 416 */ 417 static void 418 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr ) 419 { 420 int8 shiftCount; 421 422 shiftCount = countLeadingZeros64( aSig ) - 11; 423 *zSigPtr = aSig<<shiftCount; 424 *zExpPtr = 1 - shiftCount; 425 426 } 427 428 /* 429 ------------------------------------------------------------------------------- 430 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 431 double-precision floating-point value, returning the result. After being 432 shifted into the proper positions, the three fields are simply added 433 together to form the result. This means that any integer portion of `zSig' 434 will be added into the exponent. Since a properly normalized significand 435 will have an integer portion equal to 1, the `zExp' input should be 1 less 436 than the desired result exponent whenever `zSig' is a complete, normalized 437 significand. 438 ------------------------------------------------------------------------------- 439 */ 440 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig ) 441 { 442 443 return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) + 444 ( ( (bits64) zExp )<<52 ) + zSig ); 445 446 } 447 448 /* 449 ------------------------------------------------------------------------------- 450 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 451 and significand `zSig', and returns the proper double-precision floating- 452 point value corresponding to the abstract input. Ordinarily, the abstract 453 value is simply rounded and packed into the double-precision format, with 454 the inexact exception raised if the abstract input cannot be represented 455 exactly. However, if the abstract value is too large, the overflow and 456 inexact exceptions are raised and an infinity or maximal finite value is 457 returned. If the abstract value is too small, the input value is rounded to 458 a subnormal number, and the underflow and inexact exceptions are raised if 459 the abstract input cannot be represented exactly as a subnormal double- 460 precision floating-point number. 461 The input significand `zSig' has its binary point between bits 62 462 and 61, which is 10 bits to the left of the usual location. This shifted 463 significand must be normalized or smaller. If `zSig' is not normalized, 464 `zExp' must be 0; in that case, the result returned is a subnormal number, 465 and it must not require rounding. In the usual case that `zSig' is 466 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 467 The handling of underflow and overflow follows the IEC/IEEE Standard for 468 Binary Floating-Point Arithmetic. 469 ------------------------------------------------------------------------------- 470 */ 471 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig ) 472 { 473 int8 roundingMode; 474 flag roundNearestEven; 475 int16 roundIncrement, roundBits; 476 flag isTiny; 477 478 roundingMode = float_rounding_mode; 479 roundNearestEven = ( roundingMode == float_round_nearest_even ); 480 roundIncrement = 0x200; 481 if ( ! roundNearestEven ) { 482 if ( roundingMode == float_round_to_zero ) { 483 roundIncrement = 0; 484 } 485 else { 486 roundIncrement = 0x3FF; 487 if ( zSign ) { 488 if ( roundingMode == float_round_up ) roundIncrement = 0; 489 } 490 else { 491 if ( roundingMode == float_round_down ) roundIncrement = 0; 492 } 493 } 494 } 495 roundBits = zSig & 0x3FF; 496 if ( 0x7FD <= (bits16) zExp ) { 497 if ( ( 0x7FD < zExp ) 498 || ( ( zExp == 0x7FD ) 499 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) ) 500 ) { 501 float_raise( float_flag_overflow | float_flag_inexact ); 502 return FLOAT64_MANGLE( 503 FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) - 504 ( roundIncrement == 0 )); 505 } 506 if ( zExp < 0 ) { 507 isTiny = 508 ( float_detect_tininess == float_tininess_before_rounding ) 509 || ( zExp < -1 ) 510 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 511 shift64RightJamming( zSig, - zExp, &zSig ); 512 zExp = 0; 513 roundBits = zSig & 0x3FF; 514 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 515 } 516 } 517 if ( roundBits ) float_exception_flags |= float_flag_inexact; 518 zSig = ( zSig + roundIncrement )>>10; 519 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 520 if ( zSig == 0 ) zExp = 0; 521 return packFloat64( zSign, zExp, zSig ); 522 523 } 524 525 /* 526 ------------------------------------------------------------------------------- 527 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 528 and significand `zSig', and returns the proper double-precision floating- 529 point value corresponding to the abstract input. This routine is just like 530 `roundAndPackFloat64' except that `zSig' does not have to be normalized. 531 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 532 floating-point exponent. 533 ------------------------------------------------------------------------------- 534 */ 535 static float64 536 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig ) 537 { 538 int8 shiftCount; 539 540 shiftCount = countLeadingZeros64( zSig ) - 1; 541 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount ); 542 543 } 544 545 #ifdef FLOATX80 546 547 /* 548 ------------------------------------------------------------------------------- 549 Returns the fraction bits of the extended double-precision floating-point 550 value `a'. 551 ------------------------------------------------------------------------------- 552 */ 553 INLINE bits64 extractFloatx80Frac( floatx80 a ) 554 { 555 556 return a.low; 557 558 } 559 560 /* 561 ------------------------------------------------------------------------------- 562 Returns the exponent bits of the extended double-precision floating-point 563 value `a'. 564 ------------------------------------------------------------------------------- 565 */ 566 INLINE int32 extractFloatx80Exp( floatx80 a ) 567 { 568 569 return a.high & 0x7FFF; 570 571 } 572 573 /* 574 ------------------------------------------------------------------------------- 575 Returns the sign bit of the extended double-precision floating-point value 576 `a'. 577 ------------------------------------------------------------------------------- 578 */ 579 INLINE flag extractFloatx80Sign( floatx80 a ) 580 { 581 582 return a.high>>15; 583 584 } 585 586 /* 587 ------------------------------------------------------------------------------- 588 Normalizes the subnormal extended double-precision floating-point value 589 represented by the denormalized significand `aSig'. The normalized exponent 590 and significand are stored at the locations pointed to by `zExpPtr' and 591 `zSigPtr', respectively. 592 ------------------------------------------------------------------------------- 593 */ 594 static void 595 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr ) 596 { 597 int8 shiftCount; 598 599 shiftCount = countLeadingZeros64( aSig ); 600 *zSigPtr = aSig<<shiftCount; 601 *zExpPtr = 1 - shiftCount; 602 603 } 604 605 /* 606 ------------------------------------------------------------------------------- 607 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an 608 extended double-precision floating-point value, returning the result. 609 ------------------------------------------------------------------------------- 610 */ 611 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig ) 612 { 613 floatx80 z; 614 615 z.low = zSig; 616 z.high = ( ( (bits16) zSign )<<15 ) + zExp; 617 return z; 618 619 } 620 621 /* 622 ------------------------------------------------------------------------------- 623 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 624 and extended significand formed by the concatenation of `zSig0' and `zSig1', 625 and returns the proper extended double-precision floating-point value 626 corresponding to the abstract input. Ordinarily, the abstract value is 627 rounded and packed into the extended double-precision format, with the 628 inexact exception raised if the abstract input cannot be represented 629 exactly. However, if the abstract value is too large, the overflow and 630 inexact exceptions are raised and an infinity or maximal finite value is 631 returned. If the abstract value is too small, the input value is rounded to 632 a subnormal number, and the underflow and inexact exceptions are raised if 633 the abstract input cannot be represented exactly as a subnormal extended 634 double-precision floating-point number. 635 If `roundingPrecision' is 32 or 64, the result is rounded to the same 636 number of bits as single or double precision, respectively. Otherwise, the 637 result is rounded to the full precision of the extended double-precision 638 format. 639 The input significand must be normalized or smaller. If the input 640 significand is not normalized, `zExp' must be 0; in that case, the result 641 returned is a subnormal number, and it must not require rounding. The 642 handling of underflow and overflow follows the IEC/IEEE Standard for Binary 643 Floating-Point Arithmetic. 644 ------------------------------------------------------------------------------- 645 */ 646 static floatx80 647 roundAndPackFloatx80( 648 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 649 ) 650 { 651 int8 roundingMode; 652 flag roundNearestEven, increment, isTiny; 653 int64 roundIncrement, roundMask, roundBits; 654 655 roundingMode = float_rounding_mode; 656 roundNearestEven = ( roundingMode == float_round_nearest_even ); 657 if ( roundingPrecision == 80 ) goto precision80; 658 if ( roundingPrecision == 64 ) { 659 roundIncrement = LIT64( 0x0000000000000400 ); 660 roundMask = LIT64( 0x00000000000007FF ); 661 } 662 else if ( roundingPrecision == 32 ) { 663 roundIncrement = LIT64( 0x0000008000000000 ); 664 roundMask = LIT64( 0x000000FFFFFFFFFF ); 665 } 666 else { 667 goto precision80; 668 } 669 zSig0 |= ( zSig1 != 0 ); 670 if ( ! roundNearestEven ) { 671 if ( roundingMode == float_round_to_zero ) { 672 roundIncrement = 0; 673 } 674 else { 675 roundIncrement = roundMask; 676 if ( zSign ) { 677 if ( roundingMode == float_round_up ) roundIncrement = 0; 678 } 679 else { 680 if ( roundingMode == float_round_down ) roundIncrement = 0; 681 } 682 } 683 } 684 roundBits = zSig0 & roundMask; 685 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 686 if ( ( 0x7FFE < zExp ) 687 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 688 ) { 689 goto overflow; 690 } 691 if ( zExp <= 0 ) { 692 isTiny = 693 ( float_detect_tininess == float_tininess_before_rounding ) 694 || ( zExp < 0 ) 695 || ( zSig0 <= zSig0 + roundIncrement ); 696 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 697 zExp = 0; 698 roundBits = zSig0 & roundMask; 699 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 700 if ( roundBits ) float_exception_flags |= float_flag_inexact; 701 zSig0 += roundIncrement; 702 if ( (sbits64) zSig0 < 0 ) zExp = 1; 703 roundIncrement = roundMask + 1; 704 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 705 roundMask |= roundIncrement; 706 } 707 zSig0 &= ~ roundMask; 708 return packFloatx80( zSign, zExp, zSig0 ); 709 } 710 } 711 if ( roundBits ) float_exception_flags |= float_flag_inexact; 712 zSig0 += roundIncrement; 713 if ( zSig0 < roundIncrement ) { 714 ++zExp; 715 zSig0 = LIT64( 0x8000000000000000 ); 716 } 717 roundIncrement = roundMask + 1; 718 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 719 roundMask |= roundIncrement; 720 } 721 zSig0 &= ~ roundMask; 722 if ( zSig0 == 0 ) zExp = 0; 723 return packFloatx80( zSign, zExp, zSig0 ); 724 precision80: 725 increment = ( (sbits64) zSig1 < 0 ); 726 if ( ! roundNearestEven ) { 727 if ( roundingMode == float_round_to_zero ) { 728 increment = 0; 729 } 730 else { 731 if ( zSign ) { 732 increment = ( roundingMode == float_round_down ) && zSig1; 733 } 734 else { 735 increment = ( roundingMode == float_round_up ) && zSig1; 736 } 737 } 738 } 739 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 740 if ( ( 0x7FFE < zExp ) 741 || ( ( zExp == 0x7FFE ) 742 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 743 && increment 744 ) 745 ) { 746 roundMask = 0; 747 overflow: 748 float_raise( float_flag_overflow | float_flag_inexact ); 749 if ( ( roundingMode == float_round_to_zero ) 750 || ( zSign && ( roundingMode == float_round_up ) ) 751 || ( ! zSign && ( roundingMode == float_round_down ) ) 752 ) { 753 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 754 } 755 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 756 } 757 if ( zExp <= 0 ) { 758 isTiny = 759 ( float_detect_tininess == float_tininess_before_rounding ) 760 || ( zExp < 0 ) 761 || ! increment 762 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 763 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 764 zExp = 0; 765 if ( isTiny && zSig1 ) float_raise( float_flag_underflow ); 766 if ( zSig1 ) float_exception_flags |= float_flag_inexact; 767 if ( roundNearestEven ) { 768 increment = ( (sbits64) zSig1 < 0 ); 769 } 770 else { 771 if ( zSign ) { 772 increment = ( roundingMode == float_round_down ) && zSig1; 773 } 774 else { 775 increment = ( roundingMode == float_round_up ) && zSig1; 776 } 777 } 778 if ( increment ) { 779 ++zSig0; 780 zSig0 &= 781 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 782 if ( (sbits64) zSig0 < 0 ) zExp = 1; 783 } 784 return packFloatx80( zSign, zExp, zSig0 ); 785 } 786 } 787 if ( zSig1 ) float_exception_flags |= float_flag_inexact; 788 if ( increment ) { 789 ++zSig0; 790 if ( zSig0 == 0 ) { 791 ++zExp; 792 zSig0 = LIT64( 0x8000000000000000 ); 793 } 794 else { 795 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 796 } 797 } 798 else { 799 if ( zSig0 == 0 ) zExp = 0; 800 } 801 return packFloatx80( zSign, zExp, zSig0 ); 802 803 } 804 805 /* 806 ------------------------------------------------------------------------------- 807 Takes an abstract floating-point value having sign `zSign', exponent 808 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 809 and returns the proper extended double-precision floating-point value 810 corresponding to the abstract input. This routine is just like 811 `roundAndPackFloatx80' except that the input significand does not have to be 812 normalized. 813 ------------------------------------------------------------------------------- 814 */ 815 static floatx80 816 normalizeRoundAndPackFloatx80( 817 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 818 ) 819 { 820 int8 shiftCount; 821 822 if ( zSig0 == 0 ) { 823 zSig0 = zSig1; 824 zSig1 = 0; 825 zExp -= 64; 826 } 827 shiftCount = countLeadingZeros64( zSig0 ); 828 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 829 zExp -= shiftCount; 830 return 831 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 ); 832 833 } 834 835 #endif 836 837 #ifdef FLOAT128 838 839 /* 840 ------------------------------------------------------------------------------- 841 Returns the least-significant 64 fraction bits of the quadruple-precision 842 floating-point value `a'. 843 ------------------------------------------------------------------------------- 844 */ 845 INLINE bits64 extractFloat128Frac1( float128 a ) 846 { 847 848 return a.low; 849 850 } 851 852 /* 853 ------------------------------------------------------------------------------- 854 Returns the most-significant 48 fraction bits of the quadruple-precision 855 floating-point value `a'. 856 ------------------------------------------------------------------------------- 857 */ 858 INLINE bits64 extractFloat128Frac0( float128 a ) 859 { 860 861 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 862 863 } 864 865 /* 866 ------------------------------------------------------------------------------- 867 Returns the exponent bits of the quadruple-precision floating-point value 868 `a'. 869 ------------------------------------------------------------------------------- 870 */ 871 INLINE int32 extractFloat128Exp( float128 a ) 872 { 873 874 return ( a.high>>48 ) & 0x7FFF; 875 876 } 877 878 /* 879 ------------------------------------------------------------------------------- 880 Returns the sign bit of the quadruple-precision floating-point value `a'. 881 ------------------------------------------------------------------------------- 882 */ 883 INLINE flag extractFloat128Sign( float128 a ) 884 { 885 886 return a.high>>63; 887 888 } 889 890 /* 891 ------------------------------------------------------------------------------- 892 Normalizes the subnormal quadruple-precision floating-point value 893 represented by the denormalized significand formed by the concatenation of 894 `aSig0' and `aSig1'. The normalized exponent is stored at the location 895 pointed to by `zExpPtr'. The most significant 49 bits of the normalized 896 significand are stored at the location pointed to by `zSig0Ptr', and the 897 least significant 64 bits of the normalized significand are stored at the 898 location pointed to by `zSig1Ptr'. 899 ------------------------------------------------------------------------------- 900 */ 901 static void 902 normalizeFloat128Subnormal( 903 bits64 aSig0, 904 bits64 aSig1, 905 int32 *zExpPtr, 906 bits64 *zSig0Ptr, 907 bits64 *zSig1Ptr 908 ) 909 { 910 int8 shiftCount; 911 912 if ( aSig0 == 0 ) { 913 shiftCount = countLeadingZeros64( aSig1 ) - 15; 914 if ( shiftCount < 0 ) { 915 *zSig0Ptr = aSig1>>( - shiftCount ); 916 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 917 } 918 else { 919 *zSig0Ptr = aSig1<<shiftCount; 920 *zSig1Ptr = 0; 921 } 922 *zExpPtr = - shiftCount - 63; 923 } 924 else { 925 shiftCount = countLeadingZeros64( aSig0 ) - 15; 926 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 927 *zExpPtr = 1 - shiftCount; 928 } 929 930 } 931 932 /* 933 ------------------------------------------------------------------------------- 934 Packs the sign `zSign', the exponent `zExp', and the significand formed 935 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 936 floating-point value, returning the result. After being shifted into the 937 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 938 added together to form the most significant 32 bits of the result. This 939 means that any integer portion of `zSig0' will be added into the exponent. 940 Since a properly normalized significand will have an integer portion equal 941 to 1, the `zExp' input should be 1 less than the desired result exponent 942 whenever `zSig0' and `zSig1' concatenated form a complete, normalized 943 significand. 944 ------------------------------------------------------------------------------- 945 */ 946 INLINE float128 947 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 ) 948 { 949 float128 z; 950 951 z.low = zSig1; 952 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0; 953 return z; 954 955 } 956 957 /* 958 ------------------------------------------------------------------------------- 959 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 960 and extended significand formed by the concatenation of `zSig0', `zSig1', 961 and `zSig2', and returns the proper quadruple-precision floating-point value 962 corresponding to the abstract input. Ordinarily, the abstract value is 963 simply rounded and packed into the quadruple-precision format, with the 964 inexact exception raised if the abstract input cannot be represented 965 exactly. However, if the abstract value is too large, the overflow and 966 inexact exceptions are raised and an infinity or maximal finite value is 967 returned. If the abstract value is too small, the input value is rounded to 968 a subnormal number, and the underflow and inexact exceptions are raised if 969 the abstract input cannot be represented exactly as a subnormal quadruple- 970 precision floating-point number. 971 The input significand must be normalized or smaller. If the input 972 significand is not normalized, `zExp' must be 0; in that case, the result 973 returned is a subnormal number, and it must not require rounding. In the 974 usual case that the input significand is normalized, `zExp' must be 1 less 975 than the ``true'' floating-point exponent. The handling of underflow and 976 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 977 ------------------------------------------------------------------------------- 978 */ 979 static float128 980 roundAndPackFloat128( 981 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 ) 982 { 983 int8 roundingMode; 984 flag roundNearestEven, increment, isTiny; 985 986 roundingMode = float_rounding_mode; 987 roundNearestEven = ( roundingMode == float_round_nearest_even ); 988 increment = ( (sbits64) zSig2 < 0 ); 989 if ( ! roundNearestEven ) { 990 if ( roundingMode == float_round_to_zero ) { 991 increment = 0; 992 } 993 else { 994 if ( zSign ) { 995 increment = ( roundingMode == float_round_down ) && zSig2; 996 } 997 else { 998 increment = ( roundingMode == float_round_up ) && zSig2; 999 } 1000 } 1001 } 1002 if ( 0x7FFD <= (bits32) zExp ) { 1003 if ( ( 0x7FFD < zExp ) 1004 || ( ( zExp == 0x7FFD ) 1005 && eq128( 1006 LIT64( 0x0001FFFFFFFFFFFF ), 1007 LIT64( 0xFFFFFFFFFFFFFFFF ), 1008 zSig0, 1009 zSig1 1010 ) 1011 && increment 1012 ) 1013 ) { 1014 float_raise( float_flag_overflow | float_flag_inexact ); 1015 if ( ( roundingMode == float_round_to_zero ) 1016 || ( zSign && ( roundingMode == float_round_up ) ) 1017 || ( ! zSign && ( roundingMode == float_round_down ) ) 1018 ) { 1019 return 1020 packFloat128( 1021 zSign, 1022 0x7FFE, 1023 LIT64( 0x0000FFFFFFFFFFFF ), 1024 LIT64( 0xFFFFFFFFFFFFFFFF ) 1025 ); 1026 } 1027 return packFloat128( zSign, 0x7FFF, 0, 0 ); 1028 } 1029 if ( zExp < 0 ) { 1030 isTiny = 1031 ( float_detect_tininess == float_tininess_before_rounding ) 1032 || ( zExp < -1 ) 1033 || ! increment 1034 || lt128( 1035 zSig0, 1036 zSig1, 1037 LIT64( 0x0001FFFFFFFFFFFF ), 1038 LIT64( 0xFFFFFFFFFFFFFFFF ) 1039 ); 1040 shift128ExtraRightJamming( 1041 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 1042 zExp = 0; 1043 if ( isTiny && zSig2 ) float_raise( float_flag_underflow ); 1044 if ( roundNearestEven ) { 1045 increment = ( (sbits64) zSig2 < 0 ); 1046 } 1047 else { 1048 if ( zSign ) { 1049 increment = ( roundingMode == float_round_down ) && zSig2; 1050 } 1051 else { 1052 increment = ( roundingMode == float_round_up ) && zSig2; 1053 } 1054 } 1055 } 1056 } 1057 if ( zSig2 ) float_exception_flags |= float_flag_inexact; 1058 if ( increment ) { 1059 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 1060 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 1061 } 1062 else { 1063 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 1064 } 1065 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1066 1067 } 1068 1069 /* 1070 ------------------------------------------------------------------------------- 1071 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1072 and significand formed by the concatenation of `zSig0' and `zSig1', and 1073 returns the proper quadruple-precision floating-point value corresponding 1074 to the abstract input. This routine is just like `roundAndPackFloat128' 1075 except that the input significand has fewer bits and does not have to be 1076 normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 1077 point exponent. 1078 ------------------------------------------------------------------------------- 1079 */ 1080 static float128 1081 normalizeRoundAndPackFloat128( 1082 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 ) 1083 { 1084 int8 shiftCount; 1085 bits64 zSig2; 1086 1087 if ( zSig0 == 0 ) { 1088 zSig0 = zSig1; 1089 zSig1 = 0; 1090 zExp -= 64; 1091 } 1092 shiftCount = countLeadingZeros64( zSig0 ) - 15; 1093 if ( 0 <= shiftCount ) { 1094 zSig2 = 0; 1095 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1096 } 1097 else { 1098 shift128ExtraRightJamming( 1099 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 1100 } 1101 zExp -= shiftCount; 1102 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 1103 1104 } 1105 1106 #endif 1107 1108 /* 1109 ------------------------------------------------------------------------------- 1110 Returns the result of converting the 32-bit two's complement integer `a' 1111 to the single-precision floating-point format. The conversion is performed 1112 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1113 ------------------------------------------------------------------------------- 1114 */ 1115 float32 int32_to_float32( int32 a ) 1116 { 1117 flag zSign; 1118 1119 if ( a == 0 ) return 0; 1120 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 ); 1121 zSign = ( a < 0 ); 1122 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a ); 1123 1124 } 1125 1126 #ifndef SOFTFLOAT_FOR_GCC /* __floatunsisf is in libgcc */ 1127 float32 uint32_to_float32( uint32 a ) 1128 { 1129 if ( a == 0 ) return 0; 1130 if ( a & (bits32) 0x80000000 ) 1131 return normalizeRoundAndPackFloat32( 0, 0x9D, a >> 1 ); 1132 return normalizeRoundAndPackFloat32( 0, 0x9C, a ); 1133 } 1134 #endif 1135 1136 1137 /* 1138 ------------------------------------------------------------------------------- 1139 Returns the result of converting the 32-bit two's complement integer `a' 1140 to the double-precision floating-point format. The conversion is performed 1141 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1142 ------------------------------------------------------------------------------- 1143 */ 1144 float64 int32_to_float64( int32 a ) 1145 { 1146 flag zSign; 1147 uint32 absA; 1148 int8 shiftCount; 1149 bits64 zSig; 1150 1151 if ( a == 0 ) return 0; 1152 zSign = ( a < 0 ); 1153 absA = zSign ? - a : a; 1154 shiftCount = countLeadingZeros32( absA ) + 21; 1155 zSig = absA; 1156 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount ); 1157 1158 } 1159 1160 #ifndef SOFTFLOAT_FOR_GCC /* __floatunsidf is in libgcc */ 1161 float64 uint32_to_float64( uint32 a ) 1162 { 1163 int8 shiftCount; 1164 bits64 zSig = a; 1165 1166 if ( a == 0 ) return 0; 1167 shiftCount = countLeadingZeros32( a ) + 21; 1168 return packFloat64( 0, 0x432 - shiftCount, zSig<<shiftCount ); 1169 1170 } 1171 #endif 1172 1173 #ifdef FLOATX80 1174 1175 /* 1176 ------------------------------------------------------------------------------- 1177 Returns the result of converting the 32-bit two's complement integer `a' 1178 to the extended double-precision floating-point format. The conversion 1179 is performed according to the IEC/IEEE Standard for Binary Floating-Point 1180 Arithmetic. 1181 ------------------------------------------------------------------------------- 1182 */ 1183 floatx80 int32_to_floatx80( int32 a ) 1184 { 1185 flag zSign; 1186 uint32 absA; 1187 int8 shiftCount; 1188 bits64 zSig; 1189 1190 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1191 zSign = ( a < 0 ); 1192 absA = zSign ? - a : a; 1193 shiftCount = countLeadingZeros32( absA ) + 32; 1194 zSig = absA; 1195 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 1196 1197 } 1198 1199 floatx80 uint32_to_floatx80( uint32 a ) 1200 { 1201 int8 shiftCount; 1202 bits64 zSig = a; 1203 1204 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1205 shiftCount = countLeadingZeros32( a ) + 32; 1206 return packFloatx80( 0, 0x403E - shiftCount, zSig<<shiftCount ); 1207 1208 } 1209 1210 #endif 1211 1212 #ifdef FLOAT128 1213 1214 /* 1215 ------------------------------------------------------------------------------- 1216 Returns the result of converting the 32-bit two's complement integer `a' to 1217 the quadruple-precision floating-point format. The conversion is performed 1218 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1219 ------------------------------------------------------------------------------- 1220 */ 1221 float128 int32_to_float128( int32 a ) 1222 { 1223 flag zSign; 1224 uint32 absA; 1225 int8 shiftCount; 1226 bits64 zSig0; 1227 1228 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1229 zSign = ( a < 0 ); 1230 absA = zSign ? - a : a; 1231 shiftCount = countLeadingZeros32( absA ) + 17; 1232 zSig0 = absA; 1233 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 1234 1235 } 1236 1237 float128 uint32_to_float128( uint32 a ) 1238 { 1239 int8 shiftCount; 1240 bits64 zSig0 = a; 1241 1242 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1243 shiftCount = countLeadingZeros32( a ) + 17; 1244 return packFloat128( 0, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 1245 1246 } 1247 1248 #endif 1249 1250 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */ 1251 /* 1252 ------------------------------------------------------------------------------- 1253 Returns the result of converting the 64-bit two's complement integer `a' 1254 to the single-precision floating-point format. The conversion is performed 1255 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1256 ------------------------------------------------------------------------------- 1257 */ 1258 float32 int64_to_float32( int64 a ) 1259 { 1260 flag zSign; 1261 uint64 absA; 1262 int8 shiftCount; 1263 1264 if ( a == 0 ) return 0; 1265 zSign = ( a < 0 ); 1266 absA = zSign ? - a : a; 1267 shiftCount = countLeadingZeros64( absA ) - 40; 1268 if ( 0 <= shiftCount ) { 1269 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount ); 1270 } 1271 else { 1272 shiftCount += 7; 1273 if ( shiftCount < 0 ) { 1274 shift64RightJamming( absA, - shiftCount, &absA ); 1275 } 1276 else { 1277 absA <<= shiftCount; 1278 } 1279 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA ); 1280 } 1281 1282 } 1283 1284 /* 1285 ------------------------------------------------------------------------------- 1286 Returns the result of converting the 64-bit two's complement integer `a' 1287 to the double-precision floating-point format. The conversion is performed 1288 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1289 ------------------------------------------------------------------------------- 1290 */ 1291 float64 int64_to_float64( int64 a ) 1292 { 1293 flag zSign; 1294 1295 if ( a == 0 ) return 0; 1296 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) { 1297 return packFloat64( 1, 0x43E, 0 ); 1298 } 1299 zSign = ( a < 0 ); 1300 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a ); 1301 1302 } 1303 1304 #ifdef FLOATX80 1305 1306 /* 1307 ------------------------------------------------------------------------------- 1308 Returns the result of converting the 64-bit two's complement integer `a' 1309 to the extended double-precision floating-point format. The conversion 1310 is performed according to the IEC/IEEE Standard for Binary Floating-Point 1311 Arithmetic. 1312 ------------------------------------------------------------------------------- 1313 */ 1314 floatx80 int64_to_floatx80( int64 a ) 1315 { 1316 flag zSign; 1317 uint64 absA; 1318 int8 shiftCount; 1319 1320 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1321 zSign = ( a < 0 ); 1322 absA = zSign ? - a : a; 1323 shiftCount = countLeadingZeros64( absA ); 1324 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 1325 1326 } 1327 1328 #endif 1329 1330 #endif /* !SOFTFLOAT_FOR_GCC */ 1331 1332 #ifdef FLOAT128 1333 1334 /* 1335 ------------------------------------------------------------------------------- 1336 Returns the result of converting the 64-bit two's complement integer `a' to 1337 the quadruple-precision floating-point format. The conversion is performed 1338 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1339 ------------------------------------------------------------------------------- 1340 */ 1341 float128 int64_to_float128( int64 a ) 1342 { 1343 flag zSign; 1344 uint64 absA; 1345 int8 shiftCount; 1346 int32 zExp; 1347 bits64 zSig0, zSig1; 1348 1349 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1350 zSign = ( a < 0 ); 1351 absA = zSign ? - a : a; 1352 shiftCount = countLeadingZeros64( absA ) + 49; 1353 zExp = 0x406E - shiftCount; 1354 if ( 64 <= shiftCount ) { 1355 zSig1 = 0; 1356 zSig0 = absA; 1357 shiftCount -= 64; 1358 } 1359 else { 1360 zSig1 = absA; 1361 zSig0 = 0; 1362 } 1363 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1364 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1365 1366 } 1367 1368 #endif 1369 1370 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 1371 /* 1372 ------------------------------------------------------------------------------- 1373 Returns the result of converting the single-precision floating-point value 1374 `a' to the 32-bit two's complement integer format. The conversion is 1375 performed according to the IEC/IEEE Standard for Binary Floating-Point 1376 Arithmetic---which means in particular that the conversion is rounded 1377 according to the current rounding mode. If `a' is a NaN, the largest 1378 positive integer is returned. Otherwise, if the conversion overflows, the 1379 largest integer with the same sign as `a' is returned. 1380 ------------------------------------------------------------------------------- 1381 */ 1382 int32 float32_to_int32( float32 a ) 1383 { 1384 flag aSign; 1385 int16 aExp, shiftCount; 1386 bits32 aSig; 1387 bits64 aSig64; 1388 1389 aSig = extractFloat32Frac( a ); 1390 aExp = extractFloat32Exp( a ); 1391 aSign = extractFloat32Sign( a ); 1392 if ( ( aExp == 0xFF ) && aSig ) aSign = 0; 1393 if ( aExp ) aSig |= 0x00800000; 1394 shiftCount = 0xAF - aExp; 1395 aSig64 = aSig; 1396 aSig64 <<= 32; 1397 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 ); 1398 return roundAndPackInt32( aSign, aSig64 ); 1399 1400 } 1401 #endif /* !SOFTFLOAT_FOR_GCC */ 1402 1403 /* 1404 ------------------------------------------------------------------------------- 1405 Returns the result of converting the single-precision floating-point value 1406 `a' to the 32-bit two's complement integer format. The conversion is 1407 performed according to the IEC/IEEE Standard for Binary Floating-Point 1408 Arithmetic, except that the conversion is always rounded toward zero. 1409 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 1410 the conversion overflows, the largest integer with the same sign as `a' is 1411 returned. 1412 ------------------------------------------------------------------------------- 1413 */ 1414 int32 float32_to_int32_round_to_zero( float32 a ) 1415 { 1416 flag aSign; 1417 int16 aExp, shiftCount; 1418 bits32 aSig; 1419 int32 z; 1420 1421 aSig = extractFloat32Frac( a ); 1422 aExp = extractFloat32Exp( a ); 1423 aSign = extractFloat32Sign( a ); 1424 shiftCount = aExp - 0x9E; 1425 if ( 0 <= shiftCount ) { 1426 if ( a != 0xCF000000 ) { 1427 float_raise( float_flag_invalid ); 1428 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF; 1429 } 1430 return (sbits32) 0x80000000; 1431 } 1432 else if ( aExp <= 0x7E ) { 1433 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact; 1434 return 0; 1435 } 1436 aSig = ( aSig | 0x00800000 )<<8; 1437 z = aSig>>( - shiftCount ); 1438 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) { 1439 float_exception_flags |= float_flag_inexact; 1440 } 1441 if ( aSign ) z = - z; 1442 return z; 1443 1444 } 1445 1446 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */ 1447 /* 1448 ------------------------------------------------------------------------------- 1449 Returns the result of converting the single-precision floating-point value 1450 `a' to the 64-bit two's complement integer format. The conversion is 1451 performed according to the IEC/IEEE Standard for Binary Floating-Point 1452 Arithmetic---which means in particular that the conversion is rounded 1453 according to the current rounding mode. If `a' is a NaN, the largest 1454 positive integer is returned. Otherwise, if the conversion overflows, the 1455 largest integer with the same sign as `a' is returned. 1456 ------------------------------------------------------------------------------- 1457 */ 1458 int64 float32_to_int64( float32 a ) 1459 { 1460 flag aSign; 1461 int16 aExp, shiftCount; 1462 bits32 aSig; 1463 bits64 aSig64, aSigExtra; 1464 1465 aSig = extractFloat32Frac( a ); 1466 aExp = extractFloat32Exp( a ); 1467 aSign = extractFloat32Sign( a ); 1468 shiftCount = 0xBE - aExp; 1469 if ( shiftCount < 0 ) { 1470 float_raise( float_flag_invalid ); 1471 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1472 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1473 } 1474 return (sbits64) LIT64( 0x8000000000000000 ); 1475 } 1476 if ( aExp ) aSig |= 0x00800000; 1477 aSig64 = aSig; 1478 aSig64 <<= 40; 1479 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra ); 1480 return roundAndPackInt64( aSign, aSig64, aSigExtra ); 1481 1482 } 1483 1484 /* 1485 ------------------------------------------------------------------------------- 1486 Returns the result of converting the single-precision floating-point value 1487 `a' to the 64-bit two's complement integer format. The conversion is 1488 performed according to the IEC/IEEE Standard for Binary Floating-Point 1489 Arithmetic, except that the conversion is always rounded toward zero. If 1490 `a' is a NaN, the largest positive integer is returned. Otherwise, if the 1491 conversion overflows, the largest integer with the same sign as `a' is 1492 returned. 1493 ------------------------------------------------------------------------------- 1494 */ 1495 int64 float32_to_int64_round_to_zero( float32 a ) 1496 { 1497 flag aSign; 1498 int16 aExp, shiftCount; 1499 bits32 aSig; 1500 bits64 aSig64; 1501 int64 z; 1502 1503 aSig = extractFloat32Frac( a ); 1504 aExp = extractFloat32Exp( a ); 1505 aSign = extractFloat32Sign( a ); 1506 shiftCount = aExp - 0xBE; 1507 if ( 0 <= shiftCount ) { 1508 if ( a != 0xDF000000 ) { 1509 float_raise( float_flag_invalid ); 1510 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1511 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1512 } 1513 } 1514 return (sbits64) LIT64( 0x8000000000000000 ); 1515 } 1516 else if ( aExp <= 0x7E ) { 1517 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact; 1518 return 0; 1519 } 1520 aSig64 = aSig | 0x00800000; 1521 aSig64 <<= 40; 1522 z = aSig64>>( - shiftCount ); 1523 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) { 1524 float_exception_flags |= float_flag_inexact; 1525 } 1526 if ( aSign ) z = - z; 1527 return z; 1528 1529 } 1530 #endif /* !SOFTFLOAT_FOR_GCC */ 1531 1532 /* 1533 ------------------------------------------------------------------------------- 1534 Returns the result of converting the single-precision floating-point value 1535 `a' to the double-precision floating-point format. The conversion is 1536 performed according to the IEC/IEEE Standard for Binary Floating-Point 1537 Arithmetic. 1538 ------------------------------------------------------------------------------- 1539 */ 1540 float64 float32_to_float64( float32 a ) 1541 { 1542 flag aSign; 1543 int16 aExp; 1544 bits32 aSig; 1545 1546 aSig = extractFloat32Frac( a ); 1547 aExp = extractFloat32Exp( a ); 1548 aSign = extractFloat32Sign( a ); 1549 if ( aExp == 0xFF ) { 1550 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) ); 1551 return packFloat64( aSign, 0x7FF, 0 ); 1552 } 1553 if ( aExp == 0 ) { 1554 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); 1555 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1556 --aExp; 1557 } 1558 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 ); 1559 1560 } 1561 1562 #ifdef FLOATX80 1563 1564 /* 1565 ------------------------------------------------------------------------------- 1566 Returns the result of converting the single-precision floating-point value 1567 `a' to the extended double-precision floating-point format. The conversion 1568 is performed according to the IEC/IEEE Standard for Binary Floating-Point 1569 Arithmetic. 1570 ------------------------------------------------------------------------------- 1571 */ 1572 floatx80 float32_to_floatx80( float32 a ) 1573 { 1574 flag aSign; 1575 int16 aExp; 1576 bits32 aSig; 1577 1578 aSig = extractFloat32Frac( a ); 1579 aExp = extractFloat32Exp( a ); 1580 aSign = extractFloat32Sign( a ); 1581 if ( aExp == 0xFF ) { 1582 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) ); 1583 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 1584 } 1585 if ( aExp == 0 ) { 1586 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 1587 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1588 } 1589 aSig |= 0x00800000; 1590 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 ); 1591 1592 } 1593 1594 #endif 1595 1596 #ifdef FLOAT128 1597 1598 /* 1599 ------------------------------------------------------------------------------- 1600 Returns the result of converting the single-precision floating-point value 1601 `a' to the double-precision floating-point format. The conversion is 1602 performed according to the IEC/IEEE Standard for Binary Floating-Point 1603 Arithmetic. 1604 ------------------------------------------------------------------------------- 1605 */ 1606 float128 float32_to_float128( float32 a ) 1607 { 1608 flag aSign; 1609 int16 aExp; 1610 bits32 aSig; 1611 1612 aSig = extractFloat32Frac( a ); 1613 aExp = extractFloat32Exp( a ); 1614 aSign = extractFloat32Sign( a ); 1615 if ( aExp == 0xFF ) { 1616 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) ); 1617 return packFloat128( aSign, 0x7FFF, 0, 0 ); 1618 } 1619 if ( aExp == 0 ) { 1620 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 1621 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1622 --aExp; 1623 } 1624 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 ); 1625 1626 } 1627 1628 #endif 1629 1630 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 1631 /* 1632 ------------------------------------------------------------------------------- 1633 Rounds the single-precision floating-point value `a' to an integer, and 1634 returns the result as a single-precision floating-point value. The 1635 operation is performed according to the IEC/IEEE Standard for Binary 1636 Floating-Point Arithmetic. 1637 ------------------------------------------------------------------------------- 1638 */ 1639 float32 float32_round_to_int( float32 a ) 1640 { 1641 flag aSign; 1642 int16 aExp; 1643 bits32 lastBitMask, roundBitsMask; 1644 int8 roundingMode; 1645 float32 z; 1646 1647 aExp = extractFloat32Exp( a ); 1648 if ( 0x96 <= aExp ) { 1649 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) { 1650 return propagateFloat32NaN( a, a ); 1651 } 1652 return a; 1653 } 1654 if ( aExp <= 0x7E ) { 1655 if ( (bits32) ( a<<1 ) == 0 ) return a; 1656 float_exception_flags |= float_flag_inexact; 1657 aSign = extractFloat32Sign( a ); 1658 switch ( float_rounding_mode ) { 1659 case float_round_nearest_even: 1660 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) { 1661 return packFloat32( aSign, 0x7F, 0 ); 1662 } 1663 break; 1664 case float_round_to_zero: 1665 break; 1666 case float_round_down: 1667 return aSign ? 0xBF800000 : 0; 1668 case float_round_up: 1669 return aSign ? 0x80000000 : 0x3F800000; 1670 } 1671 return packFloat32( aSign, 0, 0 ); 1672 } 1673 lastBitMask = 1; 1674 lastBitMask <<= 0x96 - aExp; 1675 roundBitsMask = lastBitMask - 1; 1676 z = a; 1677 roundingMode = float_rounding_mode; 1678 if ( roundingMode == float_round_nearest_even ) { 1679 z += lastBitMask>>1; 1680 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 1681 } 1682 else if ( roundingMode != float_round_to_zero ) { 1683 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) { 1684 z += roundBitsMask; 1685 } 1686 } 1687 z &= ~ roundBitsMask; 1688 if ( z != a ) float_exception_flags |= float_flag_inexact; 1689 return z; 1690 1691 } 1692 #endif /* !SOFTFLOAT_FOR_GCC */ 1693 1694 /* 1695 ------------------------------------------------------------------------------- 1696 Returns the result of adding the absolute values of the single-precision 1697 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 1698 before being returned. `zSign' is ignored if the result is a NaN. 1699 The addition is performed according to the IEC/IEEE Standard for Binary 1700 Floating-Point Arithmetic. 1701 ------------------------------------------------------------------------------- 1702 */ 1703 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign ) 1704 { 1705 int16 aExp, bExp, zExp; 1706 bits32 aSig, bSig, zSig; 1707 int16 expDiff; 1708 1709 aSig = extractFloat32Frac( a ); 1710 aExp = extractFloat32Exp( a ); 1711 bSig = extractFloat32Frac( b ); 1712 bExp = extractFloat32Exp( b ); 1713 expDiff = aExp - bExp; 1714 aSig <<= 6; 1715 bSig <<= 6; 1716 if ( 0 < expDiff ) { 1717 if ( aExp == 0xFF ) { 1718 if ( aSig ) return propagateFloat32NaN( a, b ); 1719 return a; 1720 } 1721 if ( bExp == 0 ) { 1722 --expDiff; 1723 } 1724 else { 1725 bSig |= 0x20000000; 1726 } 1727 shift32RightJamming( bSig, expDiff, &bSig ); 1728 zExp = aExp; 1729 } 1730 else if ( expDiff < 0 ) { 1731 if ( bExp == 0xFF ) { 1732 if ( bSig ) return propagateFloat32NaN( a, b ); 1733 return packFloat32( zSign, 0xFF, 0 ); 1734 } 1735 if ( aExp == 0 ) { 1736 ++expDiff; 1737 } 1738 else { 1739 aSig |= 0x20000000; 1740 } 1741 shift32RightJamming( aSig, - expDiff, &aSig ); 1742 zExp = bExp; 1743 } 1744 else { 1745 if ( aExp == 0xFF ) { 1746 if ( aSig | bSig ) return propagateFloat32NaN( a, b ); 1747 return a; 1748 } 1749 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 ); 1750 zSig = 0x40000000 + aSig + bSig; 1751 zExp = aExp; 1752 goto roundAndPack; 1753 } 1754 aSig |= 0x20000000; 1755 zSig = ( aSig + bSig )<<1; 1756 --zExp; 1757 if ( (sbits32) zSig < 0 ) { 1758 zSig = aSig + bSig; 1759 ++zExp; 1760 } 1761 roundAndPack: 1762 return roundAndPackFloat32( zSign, zExp, zSig ); 1763 1764 } 1765 1766 /* 1767 ------------------------------------------------------------------------------- 1768 Returns the result of subtracting the absolute values of the single- 1769 precision floating-point values `a' and `b'. If `zSign' is 1, the 1770 difference is negated before being returned. `zSign' is ignored if the 1771 result is a NaN. The subtraction is performed according to the IEC/IEEE 1772 Standard for Binary Floating-Point Arithmetic. 1773 ------------------------------------------------------------------------------- 1774 */ 1775 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign ) 1776 { 1777 int16 aExp, bExp, zExp; 1778 bits32 aSig, bSig, zSig; 1779 int16 expDiff; 1780 1781 aSig = extractFloat32Frac( a ); 1782 aExp = extractFloat32Exp( a ); 1783 bSig = extractFloat32Frac( b ); 1784 bExp = extractFloat32Exp( b ); 1785 expDiff = aExp - bExp; 1786 aSig <<= 7; 1787 bSig <<= 7; 1788 if ( 0 < expDiff ) goto aExpBigger; 1789 if ( expDiff < 0 ) goto bExpBigger; 1790 if ( aExp == 0xFF ) { 1791 if ( aSig | bSig ) return propagateFloat32NaN( a, b ); 1792 float_raise( float_flag_invalid ); 1793 return float32_default_nan; 1794 } 1795 if ( aExp == 0 ) { 1796 aExp = 1; 1797 bExp = 1; 1798 } 1799 if ( bSig < aSig ) goto aBigger; 1800 if ( aSig < bSig ) goto bBigger; 1801 return packFloat32( float_rounding_mode == float_round_down, 0, 0 ); 1802 bExpBigger: 1803 if ( bExp == 0xFF ) { 1804 if ( bSig ) return propagateFloat32NaN( a, b ); 1805 return packFloat32( zSign ^ 1, 0xFF, 0 ); 1806 } 1807 if ( aExp == 0 ) { 1808 ++expDiff; 1809 } 1810 else { 1811 aSig |= 0x40000000; 1812 } 1813 shift32RightJamming( aSig, - expDiff, &aSig ); 1814 bSig |= 0x40000000; 1815 bBigger: 1816 zSig = bSig - aSig; 1817 zExp = bExp; 1818 zSign ^= 1; 1819 goto normalizeRoundAndPack; 1820 aExpBigger: 1821 if ( aExp == 0xFF ) { 1822 if ( aSig ) return propagateFloat32NaN( a, b ); 1823 return a; 1824 } 1825 if ( bExp == 0 ) { 1826 --expDiff; 1827 } 1828 else { 1829 bSig |= 0x40000000; 1830 } 1831 shift32RightJamming( bSig, expDiff, &bSig ); 1832 aSig |= 0x40000000; 1833 aBigger: 1834 zSig = aSig - bSig; 1835 zExp = aExp; 1836 normalizeRoundAndPack: 1837 --zExp; 1838 return normalizeRoundAndPackFloat32( zSign, zExp, zSig ); 1839 1840 } 1841 1842 /* 1843 ------------------------------------------------------------------------------- 1844 Returns the result of adding the single-precision floating-point values `a' 1845 and `b'. The operation is performed according to the IEC/IEEE Standard for 1846 Binary Floating-Point Arithmetic. 1847 ------------------------------------------------------------------------------- 1848 */ 1849 float32 float32_add( float32 a, float32 b ) 1850 { 1851 flag aSign, bSign; 1852 1853 aSign = extractFloat32Sign( a ); 1854 bSign = extractFloat32Sign( b ); 1855 if ( aSign == bSign ) { 1856 return addFloat32Sigs( a, b, aSign ); 1857 } 1858 else { 1859 return subFloat32Sigs( a, b, aSign ); 1860 } 1861 1862 } 1863 1864 /* 1865 ------------------------------------------------------------------------------- 1866 Returns the result of subtracting the single-precision floating-point values 1867 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1868 for Binary Floating-Point Arithmetic. 1869 ------------------------------------------------------------------------------- 1870 */ 1871 float32 float32_sub( float32 a, float32 b ) 1872 { 1873 flag aSign, bSign; 1874 1875 aSign = extractFloat32Sign( a ); 1876 bSign = extractFloat32Sign( b ); 1877 if ( aSign == bSign ) { 1878 return subFloat32Sigs( a, b, aSign ); 1879 } 1880 else { 1881 return addFloat32Sigs( a, b, aSign ); 1882 } 1883 1884 } 1885 1886 /* 1887 ------------------------------------------------------------------------------- 1888 Returns the result of multiplying the single-precision floating-point values 1889 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1890 for Binary Floating-Point Arithmetic. 1891 ------------------------------------------------------------------------------- 1892 */ 1893 float32 float32_mul( float32 a, float32 b ) 1894 { 1895 flag aSign, bSign, zSign; 1896 int16 aExp, bExp, zExp; 1897 bits32 aSig, bSig; 1898 bits64 zSig64; 1899 bits32 zSig; 1900 1901 aSig = extractFloat32Frac( a ); 1902 aExp = extractFloat32Exp( a ); 1903 aSign = extractFloat32Sign( a ); 1904 bSig = extractFloat32Frac( b ); 1905 bExp = extractFloat32Exp( b ); 1906 bSign = extractFloat32Sign( b ); 1907 zSign = aSign ^ bSign; 1908 if ( aExp == 0xFF ) { 1909 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 1910 return propagateFloat32NaN( a, b ); 1911 } 1912 if ( ( bExp | bSig ) == 0 ) { 1913 float_raise( float_flag_invalid ); 1914 return float32_default_nan; 1915 } 1916 return packFloat32( zSign, 0xFF, 0 ); 1917 } 1918 if ( bExp == 0xFF ) { 1919 if ( bSig ) return propagateFloat32NaN( a, b ); 1920 if ( ( aExp | aSig ) == 0 ) { 1921 float_raise( float_flag_invalid ); 1922 return float32_default_nan; 1923 } 1924 return packFloat32( zSign, 0xFF, 0 ); 1925 } 1926 if ( aExp == 0 ) { 1927 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 1928 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1929 } 1930 if ( bExp == 0 ) { 1931 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 ); 1932 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1933 } 1934 zExp = aExp + bExp - 0x7F; 1935 aSig = ( aSig | 0x00800000 )<<7; 1936 bSig = ( bSig | 0x00800000 )<<8; 1937 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 ); 1938 zSig = zSig64; 1939 if ( 0 <= (sbits32) ( zSig<<1 ) ) { 1940 zSig <<= 1; 1941 --zExp; 1942 } 1943 return roundAndPackFloat32( zSign, zExp, zSig ); 1944 1945 } 1946 1947 /* 1948 ------------------------------------------------------------------------------- 1949 Returns the result of dividing the single-precision floating-point value `a' 1950 by the corresponding value `b'. The operation is performed according to the 1951 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1952 ------------------------------------------------------------------------------- 1953 */ 1954 float32 float32_div( float32 a, float32 b ) 1955 { 1956 flag aSign, bSign, zSign; 1957 int16 aExp, bExp, zExp; 1958 bits32 aSig, bSig, zSig; 1959 1960 aSig = extractFloat32Frac( a ); 1961 aExp = extractFloat32Exp( a ); 1962 aSign = extractFloat32Sign( a ); 1963 bSig = extractFloat32Frac( b ); 1964 bExp = extractFloat32Exp( b ); 1965 bSign = extractFloat32Sign( b ); 1966 zSign = aSign ^ bSign; 1967 if ( aExp == 0xFF ) { 1968 if ( aSig ) return propagateFloat32NaN( a, b ); 1969 if ( bExp == 0xFF ) { 1970 if ( bSig ) return propagateFloat32NaN( a, b ); 1971 float_raise( float_flag_invalid ); 1972 return float32_default_nan; 1973 } 1974 return packFloat32( zSign, 0xFF, 0 ); 1975 } 1976 if ( bExp == 0xFF ) { 1977 if ( bSig ) return propagateFloat32NaN( a, b ); 1978 return packFloat32( zSign, 0, 0 ); 1979 } 1980 if ( bExp == 0 ) { 1981 if ( bSig == 0 ) { 1982 if ( ( aExp | aSig ) == 0 ) { 1983 float_raise( float_flag_invalid ); 1984 return float32_default_nan; 1985 } 1986 float_raise( float_flag_divbyzero ); 1987 return packFloat32( zSign, 0xFF, 0 ); 1988 } 1989 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1990 } 1991 if ( aExp == 0 ) { 1992 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 1993 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1994 } 1995 zExp = aExp - bExp + 0x7D; 1996 aSig = ( aSig | 0x00800000 )<<7; 1997 bSig = ( bSig | 0x00800000 )<<8; 1998 if ( bSig <= ( aSig + aSig ) ) { 1999 aSig >>= 1; 2000 ++zExp; 2001 } 2002 zSig = ( ( (bits64) aSig )<<32 ) / bSig; 2003 if ( ( zSig & 0x3F ) == 0 ) { 2004 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 ); 2005 } 2006 return roundAndPackFloat32( zSign, zExp, zSig ); 2007 2008 } 2009 2010 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2011 /* 2012 ------------------------------------------------------------------------------- 2013 Returns the remainder of the single-precision floating-point value `a' 2014 with respect to the corresponding value `b'. The operation is performed 2015 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2016 ------------------------------------------------------------------------------- 2017 */ 2018 float32 float32_rem( float32 a, float32 b ) 2019 { 2020 flag aSign, bSign, zSign; 2021 int16 aExp, bExp, expDiff; 2022 bits32 aSig, bSig; 2023 bits32 q; 2024 bits64 aSig64, bSig64, q64; 2025 bits32 alternateASig; 2026 sbits32 sigMean; 2027 2028 aSig = extractFloat32Frac( a ); 2029 aExp = extractFloat32Exp( a ); 2030 aSign = extractFloat32Sign( a ); 2031 bSig = extractFloat32Frac( b ); 2032 bExp = extractFloat32Exp( b ); 2033 bSign = extractFloat32Sign( b ); 2034 if ( aExp == 0xFF ) { 2035 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 2036 return propagateFloat32NaN( a, b ); 2037 } 2038 float_raise( float_flag_invalid ); 2039 return float32_default_nan; 2040 } 2041 if ( bExp == 0xFF ) { 2042 if ( bSig ) return propagateFloat32NaN( a, b ); 2043 return a; 2044 } 2045 if ( bExp == 0 ) { 2046 if ( bSig == 0 ) { 2047 float_raise( float_flag_invalid ); 2048 return float32_default_nan; 2049 } 2050 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 2051 } 2052 if ( aExp == 0 ) { 2053 if ( aSig == 0 ) return a; 2054 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2055 } 2056 expDiff = aExp - bExp; 2057 aSig |= 0x00800000; 2058 bSig |= 0x00800000; 2059 if ( expDiff < 32 ) { 2060 aSig <<= 8; 2061 bSig <<= 8; 2062 if ( expDiff < 0 ) { 2063 if ( expDiff < -1 ) return a; 2064 aSig >>= 1; 2065 } 2066 q = ( bSig <= aSig ); 2067 if ( q ) aSig -= bSig; 2068 if ( 0 < expDiff ) { 2069 q = ( ( (bits64) aSig )<<32 ) / bSig; 2070 q >>= 32 - expDiff; 2071 bSig >>= 2; 2072 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 2073 } 2074 else { 2075 aSig >>= 2; 2076 bSig >>= 2; 2077 } 2078 } 2079 else { 2080 if ( bSig <= aSig ) aSig -= bSig; 2081 aSig64 = ( (bits64) aSig )<<40; 2082 bSig64 = ( (bits64) bSig )<<40; 2083 expDiff -= 64; 2084 while ( 0 < expDiff ) { 2085 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2086 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2087 aSig64 = - ( ( bSig * q64 )<<38 ); 2088 expDiff -= 62; 2089 } 2090 expDiff += 64; 2091 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2092 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2093 q = q64>>( 64 - expDiff ); 2094 bSig <<= 6; 2095 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 2096 } 2097 do { 2098 alternateASig = aSig; 2099 ++q; 2100 aSig -= bSig; 2101 } while ( 0 <= (sbits32) aSig ); 2102 sigMean = aSig + alternateASig; 2103 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 2104 aSig = alternateASig; 2105 } 2106 zSign = ( (sbits32) aSig < 0 ); 2107 if ( zSign ) aSig = - aSig; 2108 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig ); 2109 2110 } 2111 #endif /* !SOFTFLOAT_FOR_GCC */ 2112 2113 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2114 /* 2115 ------------------------------------------------------------------------------- 2116 Returns the square root of the single-precision floating-point value `a'. 2117 The operation is performed according to the IEC/IEEE Standard for Binary 2118 Floating-Point Arithmetic. 2119 ------------------------------------------------------------------------------- 2120 */ 2121 float32 float32_sqrt( float32 a ) 2122 { 2123 flag aSign; 2124 int16 aExp, zExp; 2125 bits32 aSig, zSig; 2126 bits64 rem, term; 2127 2128 aSig = extractFloat32Frac( a ); 2129 aExp = extractFloat32Exp( a ); 2130 aSign = extractFloat32Sign( a ); 2131 if ( aExp == 0xFF ) { 2132 if ( aSig ) return propagateFloat32NaN( a, 0 ); 2133 if ( ! aSign ) return a; 2134 float_raise( float_flag_invalid ); 2135 return float32_default_nan; 2136 } 2137 if ( aSign ) { 2138 if ( ( aExp | aSig ) == 0 ) return a; 2139 float_raise( float_flag_invalid ); 2140 return float32_default_nan; 2141 } 2142 if ( aExp == 0 ) { 2143 if ( aSig == 0 ) return 0; 2144 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2145 } 2146 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E; 2147 aSig = ( aSig | 0x00800000 )<<8; 2148 zSig = estimateSqrt32( aExp, aSig ) + 2; 2149 if ( ( zSig & 0x7F ) <= 5 ) { 2150 if ( zSig < 2 ) { 2151 zSig = 0x7FFFFFFF; 2152 goto roundAndPack; 2153 } 2154 aSig >>= aExp & 1; 2155 term = ( (bits64) zSig ) * zSig; 2156 rem = ( ( (bits64) aSig )<<32 ) - term; 2157 while ( (sbits64) rem < 0 ) { 2158 --zSig; 2159 rem += ( ( (bits64) zSig )<<1 ) | 1; 2160 } 2161 zSig |= ( rem != 0 ); 2162 } 2163 shift32RightJamming( zSig, 1, &zSig ); 2164 roundAndPack: 2165 return roundAndPackFloat32( 0, zExp, zSig ); 2166 2167 } 2168 #endif /* !SOFTFLOAT_FOR_GCC */ 2169 2170 /* 2171 ------------------------------------------------------------------------------- 2172 Returns 1 if the single-precision floating-point value `a' is equal to 2173 the corresponding value `b', and 0 otherwise. The comparison is performed 2174 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2175 ------------------------------------------------------------------------------- 2176 */ 2177 flag float32_eq( float32 a, float32 b ) 2178 { 2179 2180 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2181 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2182 ) { 2183 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2184 float_raise( float_flag_invalid ); 2185 } 2186 return 0; 2187 } 2188 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2189 2190 } 2191 2192 /* 2193 ------------------------------------------------------------------------------- 2194 Returns 1 if the single-precision floating-point value `a' is less than 2195 or equal to the corresponding value `b', and 0 otherwise. The comparison 2196 is performed according to the IEC/IEEE Standard for Binary Floating-Point 2197 Arithmetic. 2198 ------------------------------------------------------------------------------- 2199 */ 2200 flag float32_le( float32 a, float32 b ) 2201 { 2202 flag aSign, bSign; 2203 2204 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2205 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2206 ) { 2207 float_raise( float_flag_invalid ); 2208 return 0; 2209 } 2210 aSign = extractFloat32Sign( a ); 2211 bSign = extractFloat32Sign( b ); 2212 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2213 return ( a == b ) || ( aSign ^ ( a < b ) ); 2214 2215 } 2216 2217 /* 2218 ------------------------------------------------------------------------------- 2219 Returns 1 if the single-precision floating-point value `a' is less than 2220 the corresponding value `b', and 0 otherwise. The comparison is performed 2221 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2222 ------------------------------------------------------------------------------- 2223 */ 2224 flag float32_lt( float32 a, float32 b ) 2225 { 2226 flag aSign, bSign; 2227 2228 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2229 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2230 ) { 2231 float_raise( float_flag_invalid ); 2232 return 0; 2233 } 2234 aSign = extractFloat32Sign( a ); 2235 bSign = extractFloat32Sign( b ); 2236 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); 2237 return ( a != b ) && ( aSign ^ ( a < b ) ); 2238 2239 } 2240 2241 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2242 /* 2243 ------------------------------------------------------------------------------- 2244 Returns 1 if the single-precision floating-point value `a' is equal to 2245 the corresponding value `b', and 0 otherwise. The invalid exception is 2246 raised if either operand is a NaN. Otherwise, the comparison is performed 2247 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2248 ------------------------------------------------------------------------------- 2249 */ 2250 flag float32_eq_signaling( float32 a, float32 b ) 2251 { 2252 2253 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2254 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2255 ) { 2256 float_raise( float_flag_invalid ); 2257 return 0; 2258 } 2259 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2260 2261 } 2262 2263 /* 2264 ------------------------------------------------------------------------------- 2265 Returns 1 if the single-precision floating-point value `a' is less than or 2266 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 2267 cause an exception. Otherwise, the comparison is performed according to the 2268 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2269 ------------------------------------------------------------------------------- 2270 */ 2271 flag float32_le_quiet( float32 a, float32 b ) 2272 { 2273 flag aSign, bSign; 2274 2275 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2276 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2277 ) { 2278 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2279 float_raise( float_flag_invalid ); 2280 } 2281 return 0; 2282 } 2283 aSign = extractFloat32Sign( a ); 2284 bSign = extractFloat32Sign( b ); 2285 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2286 return ( a == b ) || ( aSign ^ ( a < b ) ); 2287 2288 } 2289 2290 /* 2291 ------------------------------------------------------------------------------- 2292 Returns 1 if the single-precision floating-point value `a' is less than 2293 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 2294 exception. Otherwise, the comparison is performed according to the IEC/IEEE 2295 Standard for Binary Floating-Point Arithmetic. 2296 ------------------------------------------------------------------------------- 2297 */ 2298 flag float32_lt_quiet( float32 a, float32 b ) 2299 { 2300 flag aSign, bSign; 2301 2302 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2303 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2304 ) { 2305 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2306 float_raise( float_flag_invalid ); 2307 } 2308 return 0; 2309 } 2310 aSign = extractFloat32Sign( a ); 2311 bSign = extractFloat32Sign( b ); 2312 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); 2313 return ( a != b ) && ( aSign ^ ( a < b ) ); 2314 2315 } 2316 #endif /* !SOFTFLOAT_FOR_GCC */ 2317 2318 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2319 /* 2320 ------------------------------------------------------------------------------- 2321 Returns the result of converting the double-precision floating-point value 2322 `a' to the 32-bit two's complement integer format. The conversion is 2323 performed according to the IEC/IEEE Standard for Binary Floating-Point 2324 Arithmetic---which means in particular that the conversion is rounded 2325 according to the current rounding mode. If `a' is a NaN, the largest 2326 positive integer is returned. Otherwise, if the conversion overflows, the 2327 largest integer with the same sign as `a' is returned. 2328 ------------------------------------------------------------------------------- 2329 */ 2330 int32 float64_to_int32( float64 a ) 2331 { 2332 flag aSign; 2333 int16 aExp, shiftCount; 2334 bits64 aSig; 2335 2336 aSig = extractFloat64Frac( a ); 2337 aExp = extractFloat64Exp( a ); 2338 aSign = extractFloat64Sign( a ); 2339 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 2340 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2341 shiftCount = 0x42C - aExp; 2342 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig ); 2343 return roundAndPackInt32( aSign, aSig ); 2344 2345 } 2346 #endif /* !SOFTFLOAT_FOR_GCC */ 2347 2348 /* 2349 ------------------------------------------------------------------------------- 2350 Returns the result of converting the double-precision floating-point value 2351 `a' to the 32-bit two's complement integer format. The conversion is 2352 performed according to the IEC/IEEE Standard for Binary Floating-Point 2353 Arithmetic, except that the conversion is always rounded toward zero. 2354 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2355 the conversion overflows, the largest integer with the same sign as `a' is 2356 returned. 2357 ------------------------------------------------------------------------------- 2358 */ 2359 int32 float64_to_int32_round_to_zero( float64 a ) 2360 { 2361 flag aSign; 2362 int16 aExp, shiftCount; 2363 bits64 aSig, savedASig; 2364 int32 z; 2365 2366 aSig = extractFloat64Frac( a ); 2367 aExp = extractFloat64Exp( a ); 2368 aSign = extractFloat64Sign( a ); 2369 if ( 0x41E < aExp ) { 2370 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 2371 goto invalid; 2372 } 2373 else if ( aExp < 0x3FF ) { 2374 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact; 2375 return 0; 2376 } 2377 aSig |= LIT64( 0x0010000000000000 ); 2378 shiftCount = 0x433 - aExp; 2379 savedASig = aSig; 2380 aSig >>= shiftCount; 2381 z = aSig; 2382 if ( aSign ) z = - z; 2383 if ( ( z < 0 ) ^ aSign ) { 2384 invalid: 2385 float_raise( float_flag_invalid ); 2386 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 2387 } 2388 if ( ( aSig<<shiftCount ) != savedASig ) { 2389 float_exception_flags |= float_flag_inexact; 2390 } 2391 return z; 2392 2393 } 2394 2395 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2396 /* 2397 ------------------------------------------------------------------------------- 2398 Returns the result of converting the double-precision floating-point value 2399 `a' to the 64-bit two's complement integer format. The conversion is 2400 performed according to the IEC/IEEE Standard for Binary Floating-Point 2401 Arithmetic---which means in particular that the conversion is rounded 2402 according to the current rounding mode. If `a' is a NaN, the largest 2403 positive integer is returned. Otherwise, if the conversion overflows, the 2404 largest integer with the same sign as `a' is returned. 2405 ------------------------------------------------------------------------------- 2406 */ 2407 int64 float64_to_int64( float64 a ) 2408 { 2409 flag aSign; 2410 int16 aExp, shiftCount; 2411 bits64 aSig, aSigExtra; 2412 2413 aSig = extractFloat64Frac( a ); 2414 aExp = extractFloat64Exp( a ); 2415 aSign = extractFloat64Sign( a ); 2416 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2417 shiftCount = 0x433 - aExp; 2418 if ( shiftCount <= 0 ) { 2419 if ( 0x43E < aExp ) { 2420 float_raise( float_flag_invalid ); 2421 if ( ! aSign 2422 || ( ( aExp == 0x7FF ) 2423 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2424 ) { 2425 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2426 } 2427 return (sbits64) LIT64( 0x8000000000000000 ); 2428 } 2429 aSigExtra = 0; 2430 aSig <<= - shiftCount; 2431 } 2432 else { 2433 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 2434 } 2435 return roundAndPackInt64( aSign, aSig, aSigExtra ); 2436 2437 } 2438 2439 /* 2440 ------------------------------------------------------------------------------- 2441 Returns the result of converting the double-precision floating-point value 2442 `a' to the 64-bit two's complement integer format. The conversion is 2443 performed according to the IEC/IEEE Standard for Binary Floating-Point 2444 Arithmetic, except that the conversion is always rounded toward zero. 2445 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2446 the conversion overflows, the largest integer with the same sign as `a' is 2447 returned. 2448 ------------------------------------------------------------------------------- 2449 */ 2450 int64 float64_to_int64_round_to_zero( float64 a ) 2451 { 2452 flag aSign; 2453 int16 aExp, shiftCount; 2454 bits64 aSig; 2455 int64 z; 2456 2457 aSig = extractFloat64Frac( a ); 2458 aExp = extractFloat64Exp( a ); 2459 aSign = extractFloat64Sign( a ); 2460 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2461 shiftCount = aExp - 0x433; 2462 if ( 0 <= shiftCount ) { 2463 if ( 0x43E <= aExp ) { 2464 if ( a != LIT64( 0xC3E0000000000000 ) ) { 2465 float_raise( float_flag_invalid ); 2466 if ( ! aSign 2467 || ( ( aExp == 0x7FF ) 2468 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2469 ) { 2470 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2471 } 2472 } 2473 return (sbits64) LIT64( 0x8000000000000000 ); 2474 } 2475 z = aSig<<shiftCount; 2476 } 2477 else { 2478 if ( aExp < 0x3FE ) { 2479 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact; 2480 return 0; 2481 } 2482 z = aSig>>( - shiftCount ); 2483 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { 2484 float_exception_flags |= float_flag_inexact; 2485 } 2486 } 2487 if ( aSign ) z = - z; 2488 return z; 2489 2490 } 2491 #endif /* !SOFTFLOAT_FOR_GCC */ 2492 2493 /* 2494 ------------------------------------------------------------------------------- 2495 Returns the result of converting the double-precision floating-point value 2496 `a' to the single-precision floating-point format. The conversion is 2497 performed according to the IEC/IEEE Standard for Binary Floating-Point 2498 Arithmetic. 2499 ------------------------------------------------------------------------------- 2500 */ 2501 float32 float64_to_float32( float64 a ) 2502 { 2503 flag aSign; 2504 int16 aExp; 2505 bits64 aSig; 2506 bits32 zSig; 2507 2508 aSig = extractFloat64Frac( a ); 2509 aExp = extractFloat64Exp( a ); 2510 aSign = extractFloat64Sign( a ); 2511 if ( aExp == 0x7FF ) { 2512 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) ); 2513 return packFloat32( aSign, 0xFF, 0 ); 2514 } 2515 shift64RightJamming( aSig, 22, &aSig ); 2516 zSig = aSig; 2517 if ( aExp || zSig ) { 2518 zSig |= 0x40000000; 2519 aExp -= 0x381; 2520 } 2521 return roundAndPackFloat32( aSign, aExp, zSig ); 2522 2523 } 2524 2525 #ifdef FLOATX80 2526 2527 /* 2528 ------------------------------------------------------------------------------- 2529 Returns the result of converting the double-precision floating-point value 2530 `a' to the extended double-precision floating-point format. The conversion 2531 is performed according to the IEC/IEEE Standard for Binary Floating-Point 2532 Arithmetic. 2533 ------------------------------------------------------------------------------- 2534 */ 2535 floatx80 float64_to_floatx80( float64 a ) 2536 { 2537 flag aSign; 2538 int16 aExp; 2539 bits64 aSig; 2540 2541 aSig = extractFloat64Frac( a ); 2542 aExp = extractFloat64Exp( a ); 2543 aSign = extractFloat64Sign( a ); 2544 if ( aExp == 0x7FF ) { 2545 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) ); 2546 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 2547 } 2548 if ( aExp == 0 ) { 2549 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 2550 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2551 } 2552 return 2553 packFloatx80( 2554 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 2555 2556 } 2557 2558 #endif 2559 2560 #ifdef FLOAT128 2561 2562 /* 2563 ------------------------------------------------------------------------------- 2564 Returns the result of converting the double-precision floating-point value 2565 `a' to the quadruple-precision floating-point format. The conversion is 2566 performed according to the IEC/IEEE Standard for Binary Floating-Point 2567 Arithmetic. 2568 ------------------------------------------------------------------------------- 2569 */ 2570 float128 float64_to_float128( float64 a ) 2571 { 2572 flag aSign; 2573 int16 aExp; 2574 bits64 aSig, zSig0, zSig1; 2575 2576 aSig = extractFloat64Frac( a ); 2577 aExp = extractFloat64Exp( a ); 2578 aSign = extractFloat64Sign( a ); 2579 if ( aExp == 0x7FF ) { 2580 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) ); 2581 return packFloat128( aSign, 0x7FFF, 0, 0 ); 2582 } 2583 if ( aExp == 0 ) { 2584 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 2585 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2586 --aExp; 2587 } 2588 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 2589 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 2590 2591 } 2592 2593 #endif 2594 2595 #ifndef SOFTFLOAT_FOR_GCC 2596 /* 2597 ------------------------------------------------------------------------------- 2598 Rounds the double-precision floating-point value `a' to an integer, and 2599 returns the result as a double-precision floating-point value. The 2600 operation is performed according to the IEC/IEEE Standard for Binary 2601 Floating-Point Arithmetic. 2602 ------------------------------------------------------------------------------- 2603 */ 2604 float64 float64_round_to_int( float64 a ) 2605 { 2606 flag aSign; 2607 int16 aExp; 2608 bits64 lastBitMask, roundBitsMask; 2609 int8 roundingMode; 2610 float64 z; 2611 2612 aExp = extractFloat64Exp( a ); 2613 if ( 0x433 <= aExp ) { 2614 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) { 2615 return propagateFloat64NaN( a, a ); 2616 } 2617 return a; 2618 } 2619 if ( aExp < 0x3FF ) { 2620 if ( (bits64) ( a<<1 ) == 0 ) return a; 2621 float_exception_flags |= float_flag_inexact; 2622 aSign = extractFloat64Sign( a ); 2623 switch ( float_rounding_mode ) { 2624 case float_round_nearest_even: 2625 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) { 2626 return packFloat64( aSign, 0x3FF, 0 ); 2627 } 2628 break; 2629 case float_round_to_zero: 2630 break; 2631 case float_round_down: 2632 return aSign ? LIT64( 0xBFF0000000000000 ) : 0; 2633 case float_round_up: 2634 return 2635 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ); 2636 } 2637 return packFloat64( aSign, 0, 0 ); 2638 } 2639 lastBitMask = 1; 2640 lastBitMask <<= 0x433 - aExp; 2641 roundBitsMask = lastBitMask - 1; 2642 z = a; 2643 roundingMode = float_rounding_mode; 2644 if ( roundingMode == float_round_nearest_even ) { 2645 z += lastBitMask>>1; 2646 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 2647 } 2648 else if ( roundingMode != float_round_to_zero ) { 2649 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) { 2650 z += roundBitsMask; 2651 } 2652 } 2653 z &= ~ roundBitsMask; 2654 if ( z != a ) float_exception_flags |= float_flag_inexact; 2655 return z; 2656 2657 } 2658 #endif 2659 2660 /* 2661 ------------------------------------------------------------------------------- 2662 Returns the result of adding the absolute values of the double-precision 2663 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 2664 before being returned. `zSign' is ignored if the result is a NaN. 2665 The addition is performed according to the IEC/IEEE Standard for Binary 2666 Floating-Point Arithmetic. 2667 ------------------------------------------------------------------------------- 2668 */ 2669 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign ) 2670 { 2671 int16 aExp, bExp, zExp; 2672 bits64 aSig, bSig, zSig; 2673 int16 expDiff; 2674 2675 aSig = extractFloat64Frac( a ); 2676 aExp = extractFloat64Exp( a ); 2677 bSig = extractFloat64Frac( b ); 2678 bExp = extractFloat64Exp( b ); 2679 expDiff = aExp - bExp; 2680 aSig <<= 9; 2681 bSig <<= 9; 2682 if ( 0 < expDiff ) { 2683 if ( aExp == 0x7FF ) { 2684 if ( aSig ) return propagateFloat64NaN( a, b ); 2685 return a; 2686 } 2687 if ( bExp == 0 ) { 2688 --expDiff; 2689 } 2690 else { 2691 bSig |= LIT64( 0x2000000000000000 ); 2692 } 2693 shift64RightJamming( bSig, expDiff, &bSig ); 2694 zExp = aExp; 2695 } 2696 else if ( expDiff < 0 ) { 2697 if ( bExp == 0x7FF ) { 2698 if ( bSig ) return propagateFloat64NaN( a, b ); 2699 return packFloat64( zSign, 0x7FF, 0 ); 2700 } 2701 if ( aExp == 0 ) { 2702 ++expDiff; 2703 } 2704 else { 2705 aSig |= LIT64( 0x2000000000000000 ); 2706 } 2707 shift64RightJamming( aSig, - expDiff, &aSig ); 2708 zExp = bExp; 2709 } 2710 else { 2711 if ( aExp == 0x7FF ) { 2712 if ( aSig | bSig ) return propagateFloat64NaN( a, b ); 2713 return a; 2714 } 2715 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 ); 2716 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig; 2717 zExp = aExp; 2718 goto roundAndPack; 2719 } 2720 aSig |= LIT64( 0x2000000000000000 ); 2721 zSig = ( aSig + bSig )<<1; 2722 --zExp; 2723 if ( (sbits64) zSig < 0 ) { 2724 zSig = aSig + bSig; 2725 ++zExp; 2726 } 2727 roundAndPack: 2728 return roundAndPackFloat64( zSign, zExp, zSig ); 2729 2730 } 2731 2732 /* 2733 ------------------------------------------------------------------------------- 2734 Returns the result of subtracting the absolute values of the double- 2735 precision floating-point values `a' and `b'. If `zSign' is 1, the 2736 difference is negated before being returned. `zSign' is ignored if the 2737 result is a NaN. The subtraction is performed according to the IEC/IEEE 2738 Standard for Binary Floating-Point Arithmetic. 2739 ------------------------------------------------------------------------------- 2740 */ 2741 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign ) 2742 { 2743 int16 aExp, bExp, zExp; 2744 bits64 aSig, bSig, zSig; 2745 int16 expDiff; 2746 2747 aSig = extractFloat64Frac( a ); 2748 aExp = extractFloat64Exp( a ); 2749 bSig = extractFloat64Frac( b ); 2750 bExp = extractFloat64Exp( b ); 2751 expDiff = aExp - bExp; 2752 aSig <<= 10; 2753 bSig <<= 10; 2754 if ( 0 < expDiff ) goto aExpBigger; 2755 if ( expDiff < 0 ) goto bExpBigger; 2756 if ( aExp == 0x7FF ) { 2757 if ( aSig | bSig ) return propagateFloat64NaN( a, b ); 2758 float_raise( float_flag_invalid ); 2759 return float64_default_nan; 2760 } 2761 if ( aExp == 0 ) { 2762 aExp = 1; 2763 bExp = 1; 2764 } 2765 if ( bSig < aSig ) goto aBigger; 2766 if ( aSig < bSig ) goto bBigger; 2767 return packFloat64( float_rounding_mode == float_round_down, 0, 0 ); 2768 bExpBigger: 2769 if ( bExp == 0x7FF ) { 2770 if ( bSig ) return propagateFloat64NaN( a, b ); 2771 return packFloat64( zSign ^ 1, 0x7FF, 0 ); 2772 } 2773 if ( aExp == 0 ) { 2774 ++expDiff; 2775 } 2776 else { 2777 aSig |= LIT64( 0x4000000000000000 ); 2778 } 2779 shift64RightJamming( aSig, - expDiff, &aSig ); 2780 bSig |= LIT64( 0x4000000000000000 ); 2781 bBigger: 2782 zSig = bSig - aSig; 2783 zExp = bExp; 2784 zSign ^= 1; 2785 goto normalizeRoundAndPack; 2786 aExpBigger: 2787 if ( aExp == 0x7FF ) { 2788 if ( aSig ) return propagateFloat64NaN( a, b ); 2789 return a; 2790 } 2791 if ( bExp == 0 ) { 2792 --expDiff; 2793 } 2794 else { 2795 bSig |= LIT64( 0x4000000000000000 ); 2796 } 2797 shift64RightJamming( bSig, expDiff, &bSig ); 2798 aSig |= LIT64( 0x4000000000000000 ); 2799 aBigger: 2800 zSig = aSig - bSig; 2801 zExp = aExp; 2802 normalizeRoundAndPack: 2803 --zExp; 2804 return normalizeRoundAndPackFloat64( zSign, zExp, zSig ); 2805 2806 } 2807 2808 /* 2809 ------------------------------------------------------------------------------- 2810 Returns the result of adding the double-precision floating-point values `a' 2811 and `b'. The operation is performed according to the IEC/IEEE Standard for 2812 Binary Floating-Point Arithmetic. 2813 ------------------------------------------------------------------------------- 2814 */ 2815 float64 float64_add( float64 a, float64 b ) 2816 { 2817 flag aSign, bSign; 2818 2819 aSign = extractFloat64Sign( a ); 2820 bSign = extractFloat64Sign( b ); 2821 if ( aSign == bSign ) { 2822 return addFloat64Sigs( a, b, aSign ); 2823 } 2824 else { 2825 return subFloat64Sigs( a, b, aSign ); 2826 } 2827 2828 } 2829 2830 /* 2831 ------------------------------------------------------------------------------- 2832 Returns the result of subtracting the double-precision floating-point values 2833 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2834 for Binary Floating-Point Arithmetic. 2835 ------------------------------------------------------------------------------- 2836 */ 2837 float64 float64_sub( float64 a, float64 b ) 2838 { 2839 flag aSign, bSign; 2840 2841 aSign = extractFloat64Sign( a ); 2842 bSign = extractFloat64Sign( b ); 2843 if ( aSign == bSign ) { 2844 return subFloat64Sigs( a, b, aSign ); 2845 } 2846 else { 2847 return addFloat64Sigs( a, b, aSign ); 2848 } 2849 2850 } 2851 2852 /* 2853 ------------------------------------------------------------------------------- 2854 Returns the result of multiplying the double-precision floating-point values 2855 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2856 for Binary Floating-Point Arithmetic. 2857 ------------------------------------------------------------------------------- 2858 */ 2859 float64 float64_mul( float64 a, float64 b ) 2860 { 2861 flag aSign, bSign, zSign; 2862 int16 aExp, bExp, zExp; 2863 bits64 aSig, bSig, zSig0, zSig1; 2864 2865 aSig = extractFloat64Frac( a ); 2866 aExp = extractFloat64Exp( a ); 2867 aSign = extractFloat64Sign( a ); 2868 bSig = extractFloat64Frac( b ); 2869 bExp = extractFloat64Exp( b ); 2870 bSign = extractFloat64Sign( b ); 2871 zSign = aSign ^ bSign; 2872 if ( aExp == 0x7FF ) { 2873 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 2874 return propagateFloat64NaN( a, b ); 2875 } 2876 if ( ( bExp | bSig ) == 0 ) { 2877 float_raise( float_flag_invalid ); 2878 return float64_default_nan; 2879 } 2880 return packFloat64( zSign, 0x7FF, 0 ); 2881 } 2882 if ( bExp == 0x7FF ) { 2883 if ( bSig ) return propagateFloat64NaN( a, b ); 2884 if ( ( aExp | aSig ) == 0 ) { 2885 float_raise( float_flag_invalid ); 2886 return float64_default_nan; 2887 } 2888 return packFloat64( zSign, 0x7FF, 0 ); 2889 } 2890 if ( aExp == 0 ) { 2891 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 2892 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2893 } 2894 if ( bExp == 0 ) { 2895 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 ); 2896 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2897 } 2898 zExp = aExp + bExp - 0x3FF; 2899 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 2900 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2901 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 2902 zSig0 |= ( zSig1 != 0 ); 2903 if ( 0 <= (sbits64) ( zSig0<<1 ) ) { 2904 zSig0 <<= 1; 2905 --zExp; 2906 } 2907 return roundAndPackFloat64( zSign, zExp, zSig0 ); 2908 2909 } 2910 2911 /* 2912 ------------------------------------------------------------------------------- 2913 Returns the result of dividing the double-precision floating-point value `a' 2914 by the corresponding value `b'. The operation is performed according to 2915 the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2916 ------------------------------------------------------------------------------- 2917 */ 2918 float64 float64_div( float64 a, float64 b ) 2919 { 2920 flag aSign, bSign, zSign; 2921 int16 aExp, bExp, zExp; 2922 bits64 aSig, bSig, zSig; 2923 bits64 rem0, rem1; 2924 bits64 term0, term1; 2925 2926 aSig = extractFloat64Frac( a ); 2927 aExp = extractFloat64Exp( a ); 2928 aSign = extractFloat64Sign( a ); 2929 bSig = extractFloat64Frac( b ); 2930 bExp = extractFloat64Exp( b ); 2931 bSign = extractFloat64Sign( b ); 2932 zSign = aSign ^ bSign; 2933 if ( aExp == 0x7FF ) { 2934 if ( aSig ) return propagateFloat64NaN( a, b ); 2935 if ( bExp == 0x7FF ) { 2936 if ( bSig ) return propagateFloat64NaN( a, b ); 2937 float_raise( float_flag_invalid ); 2938 return float64_default_nan; 2939 } 2940 return packFloat64( zSign, 0x7FF, 0 ); 2941 } 2942 if ( bExp == 0x7FF ) { 2943 if ( bSig ) return propagateFloat64NaN( a, b ); 2944 return packFloat64( zSign, 0, 0 ); 2945 } 2946 if ( bExp == 0 ) { 2947 if ( bSig == 0 ) { 2948 if ( ( aExp | aSig ) == 0 ) { 2949 float_raise( float_flag_invalid ); 2950 return float64_default_nan; 2951 } 2952 float_raise( float_flag_divbyzero ); 2953 return packFloat64( zSign, 0x7FF, 0 ); 2954 } 2955 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2956 } 2957 if ( aExp == 0 ) { 2958 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 2959 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2960 } 2961 zExp = aExp - bExp + 0x3FD; 2962 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 2963 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2964 if ( bSig <= ( aSig + aSig ) ) { 2965 aSig >>= 1; 2966 ++zExp; 2967 } 2968 zSig = estimateDiv128To64( aSig, 0, bSig ); 2969 if ( ( zSig & 0x1FF ) <= 2 ) { 2970 mul64To128( bSig, zSig, &term0, &term1 ); 2971 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 2972 while ( (sbits64) rem0 < 0 ) { 2973 --zSig; 2974 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 2975 } 2976 zSig |= ( rem1 != 0 ); 2977 } 2978 return roundAndPackFloat64( zSign, zExp, zSig ); 2979 2980 } 2981 2982 #ifndef SOFTFLOAT_FOR_GCC 2983 /* 2984 ------------------------------------------------------------------------------- 2985 Returns the remainder of the double-precision floating-point value `a' 2986 with respect to the corresponding value `b'. The operation is performed 2987 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2988 ------------------------------------------------------------------------------- 2989 */ 2990 float64 float64_rem( float64 a, float64 b ) 2991 { 2992 flag aSign, bSign, zSign; 2993 int16 aExp, bExp, expDiff; 2994 bits64 aSig, bSig; 2995 bits64 q, alternateASig; 2996 sbits64 sigMean; 2997 2998 aSig = extractFloat64Frac( a ); 2999 aExp = extractFloat64Exp( a ); 3000 aSign = extractFloat64Sign( a ); 3001 bSig = extractFloat64Frac( b ); 3002 bExp = extractFloat64Exp( b ); 3003 bSign = extractFloat64Sign( b ); 3004 if ( aExp == 0x7FF ) { 3005 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 3006 return propagateFloat64NaN( a, b ); 3007 } 3008 float_raise( float_flag_invalid ); 3009 return float64_default_nan; 3010 } 3011 if ( bExp == 0x7FF ) { 3012 if ( bSig ) return propagateFloat64NaN( a, b ); 3013 return a; 3014 } 3015 if ( bExp == 0 ) { 3016 if ( bSig == 0 ) { 3017 float_raise( float_flag_invalid ); 3018 return float64_default_nan; 3019 } 3020 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 3021 } 3022 if ( aExp == 0 ) { 3023 if ( aSig == 0 ) return a; 3024 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3025 } 3026 expDiff = aExp - bExp; 3027 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 3028 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 3029 if ( expDiff < 0 ) { 3030 if ( expDiff < -1 ) return a; 3031 aSig >>= 1; 3032 } 3033 q = ( bSig <= aSig ); 3034 if ( q ) aSig -= bSig; 3035 expDiff -= 64; 3036 while ( 0 < expDiff ) { 3037 q = estimateDiv128To64( aSig, 0, bSig ); 3038 q = ( 2 < q ) ? q - 2 : 0; 3039 aSig = - ( ( bSig>>2 ) * q ); 3040 expDiff -= 62; 3041 } 3042 expDiff += 64; 3043 if ( 0 < expDiff ) { 3044 q = estimateDiv128To64( aSig, 0, bSig ); 3045 q = ( 2 < q ) ? q - 2 : 0; 3046 q >>= 64 - expDiff; 3047 bSig >>= 2; 3048 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 3049 } 3050 else { 3051 aSig >>= 2; 3052 bSig >>= 2; 3053 } 3054 do { 3055 alternateASig = aSig; 3056 ++q; 3057 aSig -= bSig; 3058 } while ( 0 <= (sbits64) aSig ); 3059 sigMean = aSig + alternateASig; 3060 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 3061 aSig = alternateASig; 3062 } 3063 zSign = ( (sbits64) aSig < 0 ); 3064 if ( zSign ) aSig = - aSig; 3065 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig ); 3066 3067 } 3068 3069 /* 3070 ------------------------------------------------------------------------------- 3071 Returns the square root of the double-precision floating-point value `a'. 3072 The operation is performed according to the IEC/IEEE Standard for Binary 3073 Floating-Point Arithmetic. 3074 ------------------------------------------------------------------------------- 3075 */ 3076 float64 float64_sqrt( float64 a ) 3077 { 3078 flag aSign; 3079 int16 aExp, zExp; 3080 bits64 aSig, zSig, doubleZSig; 3081 bits64 rem0, rem1, term0, term1; 3082 3083 aSig = extractFloat64Frac( a ); 3084 aExp = extractFloat64Exp( a ); 3085 aSign = extractFloat64Sign( a ); 3086 if ( aExp == 0x7FF ) { 3087 if ( aSig ) return propagateFloat64NaN( a, a ); 3088 if ( ! aSign ) return a; 3089 float_raise( float_flag_invalid ); 3090 return float64_default_nan; 3091 } 3092 if ( aSign ) { 3093 if ( ( aExp | aSig ) == 0 ) return a; 3094 float_raise( float_flag_invalid ); 3095 return float64_default_nan; 3096 } 3097 if ( aExp == 0 ) { 3098 if ( aSig == 0 ) return 0; 3099 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3100 } 3101 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; 3102 aSig |= LIT64( 0x0010000000000000 ); 3103 zSig = estimateSqrt32( aExp, aSig>>21 ); 3104 aSig <<= 9 - ( aExp & 1 ); 3105 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 ); 3106 if ( ( zSig & 0x1FF ) <= 5 ) { 3107 doubleZSig = zSig<<1; 3108 mul64To128( zSig, zSig, &term0, &term1 ); 3109 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 3110 while ( (sbits64) rem0 < 0 ) { 3111 --zSig; 3112 doubleZSig -= 2; 3113 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 ); 3114 } 3115 zSig |= ( ( rem0 | rem1 ) != 0 ); 3116 } 3117 return roundAndPackFloat64( 0, zExp, zSig ); 3118 3119 } 3120 #endif 3121 3122 /* 3123 ------------------------------------------------------------------------------- 3124 Returns 1 if the double-precision floating-point value `a' is equal to the 3125 corresponding value `b', and 0 otherwise. The comparison is performed 3126 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3127 ------------------------------------------------------------------------------- 3128 */ 3129 flag float64_eq( float64 a, float64 b ) 3130 { 3131 3132 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3133 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3134 ) { 3135 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3136 float_raise( float_flag_invalid ); 3137 } 3138 return 0; 3139 } 3140 return ( a == b ) || 3141 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 ); 3142 3143 } 3144 3145 /* 3146 ------------------------------------------------------------------------------- 3147 Returns 1 if the double-precision floating-point value `a' is less than or 3148 equal to the corresponding value `b', and 0 otherwise. The comparison is 3149 performed according to the IEC/IEEE Standard for Binary Floating-Point 3150 Arithmetic. 3151 ------------------------------------------------------------------------------- 3152 */ 3153 flag float64_le( float64 a, float64 b ) 3154 { 3155 flag aSign, bSign; 3156 3157 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3158 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3159 ) { 3160 float_raise( float_flag_invalid ); 3161 return 0; 3162 } 3163 aSign = extractFloat64Sign( a ); 3164 bSign = extractFloat64Sign( b ); 3165 if ( aSign != bSign ) 3166 return aSign || 3167 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 3168 0 ); 3169 return ( a == b ) || 3170 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) ); 3171 3172 } 3173 3174 /* 3175 ------------------------------------------------------------------------------- 3176 Returns 1 if the double-precision floating-point value `a' is less than 3177 the corresponding value `b', and 0 otherwise. The comparison is performed 3178 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3179 ------------------------------------------------------------------------------- 3180 */ 3181 flag float64_lt( float64 a, float64 b ) 3182 { 3183 flag aSign, bSign; 3184 3185 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3186 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3187 ) { 3188 float_raise( float_flag_invalid ); 3189 return 0; 3190 } 3191 aSign = extractFloat64Sign( a ); 3192 bSign = extractFloat64Sign( b ); 3193 if ( aSign != bSign ) 3194 return aSign && 3195 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) != 3196 0 ); 3197 return ( a != b ) && 3198 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) ); 3199 3200 } 3201 3202 #ifndef SOFTFLOAT_FOR_GCC 3203 /* 3204 ------------------------------------------------------------------------------- 3205 Returns 1 if the double-precision floating-point value `a' is equal to the 3206 corresponding value `b', and 0 otherwise. The invalid exception is raised 3207 if either operand is a NaN. Otherwise, the comparison is performed 3208 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3209 ------------------------------------------------------------------------------- 3210 */ 3211 flag float64_eq_signaling( float64 a, float64 b ) 3212 { 3213 3214 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3215 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3216 ) { 3217 float_raise( float_flag_invalid ); 3218 return 0; 3219 } 3220 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 ); 3221 3222 } 3223 3224 /* 3225 ------------------------------------------------------------------------------- 3226 Returns 1 if the double-precision floating-point value `a' is less than or 3227 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 3228 cause an exception. Otherwise, the comparison is performed according to the 3229 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3230 ------------------------------------------------------------------------------- 3231 */ 3232 flag float64_le_quiet( float64 a, float64 b ) 3233 { 3234 flag aSign, bSign; 3235 3236 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3237 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3238 ) { 3239 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3240 float_raise( float_flag_invalid ); 3241 } 3242 return 0; 3243 } 3244 aSign = extractFloat64Sign( a ); 3245 bSign = extractFloat64Sign( b ); 3246 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 ); 3247 return ( a == b ) || ( aSign ^ ( a < b ) ); 3248 3249 } 3250 3251 /* 3252 ------------------------------------------------------------------------------- 3253 Returns 1 if the double-precision floating-point value `a' is less than 3254 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3255 exception. Otherwise, the comparison is performed according to the IEC/IEEE 3256 Standard for Binary Floating-Point Arithmetic. 3257 ------------------------------------------------------------------------------- 3258 */ 3259 flag float64_lt_quiet( float64 a, float64 b ) 3260 { 3261 flag aSign, bSign; 3262 3263 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3264 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3265 ) { 3266 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3267 float_raise( float_flag_invalid ); 3268 } 3269 return 0; 3270 } 3271 aSign = extractFloat64Sign( a ); 3272 bSign = extractFloat64Sign( b ); 3273 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 ); 3274 return ( a != b ) && ( aSign ^ ( a < b ) ); 3275 3276 } 3277 #endif 3278 3279 #ifdef FLOATX80 3280 3281 /* 3282 ------------------------------------------------------------------------------- 3283 Returns the result of converting the extended double-precision floating- 3284 point value `a' to the 32-bit two's complement integer format. The 3285 conversion is performed according to the IEC/IEEE Standard for Binary 3286 Floating-Point Arithmetic---which means in particular that the conversion 3287 is rounded according to the current rounding mode. If `a' is a NaN, the 3288 largest positive integer is returned. Otherwise, if the conversion 3289 overflows, the largest integer with the same sign as `a' is returned. 3290 ------------------------------------------------------------------------------- 3291 */ 3292 int32 floatx80_to_int32( floatx80 a ) 3293 { 3294 flag aSign; 3295 int32 aExp, shiftCount; 3296 bits64 aSig; 3297 3298 aSig = extractFloatx80Frac( a ); 3299 aExp = extractFloatx80Exp( a ); 3300 aSign = extractFloatx80Sign( a ); 3301 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; 3302 shiftCount = 0x4037 - aExp; 3303 if ( shiftCount <= 0 ) shiftCount = 1; 3304 shift64RightJamming( aSig, shiftCount, &aSig ); 3305 return roundAndPackInt32( aSign, aSig ); 3306 3307 } 3308 3309 /* 3310 ------------------------------------------------------------------------------- 3311 Returns the result of converting the extended double-precision floating- 3312 point value `a' to the 32-bit two's complement integer format. The 3313 conversion is performed according to the IEC/IEEE Standard for Binary 3314 Floating-Point Arithmetic, except that the conversion is always rounded 3315 toward zero. If `a' is a NaN, the largest positive integer is returned. 3316 Otherwise, if the conversion overflows, the largest integer with the same 3317 sign as `a' is returned. 3318 ------------------------------------------------------------------------------- 3319 */ 3320 int32 floatx80_to_int32_round_to_zero( floatx80 a ) 3321 { 3322 flag aSign; 3323 int32 aExp, shiftCount; 3324 bits64 aSig, savedASig; 3325 int32 z; 3326 3327 aSig = extractFloatx80Frac( a ); 3328 aExp = extractFloatx80Exp( a ); 3329 aSign = extractFloatx80Sign( a ); 3330 if ( 0x401E < aExp ) { 3331 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; 3332 goto invalid; 3333 } 3334 else if ( aExp < 0x3FFF ) { 3335 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact; 3336 return 0; 3337 } 3338 shiftCount = 0x403E - aExp; 3339 savedASig = aSig; 3340 aSig >>= shiftCount; 3341 z = aSig; 3342 if ( aSign ) z = - z; 3343 if ( ( z < 0 ) ^ aSign ) { 3344 invalid: 3345 float_raise( float_flag_invalid ); 3346 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 3347 } 3348 if ( ( aSig<<shiftCount ) != savedASig ) { 3349 float_exception_flags |= float_flag_inexact; 3350 } 3351 return z; 3352 3353 } 3354 3355 /* 3356 ------------------------------------------------------------------------------- 3357 Returns the result of converting the extended double-precision floating- 3358 point value `a' to the 64-bit two's complement integer format. The 3359 conversion is performed according to the IEC/IEEE Standard for Binary 3360 Floating-Point Arithmetic---which means in particular that the conversion 3361 is rounded according to the current rounding mode. If `a' is a NaN, 3362 the largest positive integer is returned. Otherwise, if the conversion 3363 overflows, the largest integer with the same sign as `a' is returned. 3364 ------------------------------------------------------------------------------- 3365 */ 3366 int64 floatx80_to_int64( floatx80 a ) 3367 { 3368 flag aSign; 3369 int32 aExp, shiftCount; 3370 bits64 aSig, aSigExtra; 3371 3372 aSig = extractFloatx80Frac( a ); 3373 aExp = extractFloatx80Exp( a ); 3374 aSign = extractFloatx80Sign( a ); 3375 shiftCount = 0x403E - aExp; 3376 if ( shiftCount <= 0 ) { 3377 if ( shiftCount ) { 3378 float_raise( float_flag_invalid ); 3379 if ( ! aSign 3380 || ( ( aExp == 0x7FFF ) 3381 && ( aSig != LIT64( 0x8000000000000000 ) ) ) 3382 ) { 3383 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3384 } 3385 return (sbits64) LIT64( 0x8000000000000000 ); 3386 } 3387 aSigExtra = 0; 3388 } 3389 else { 3390 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 3391 } 3392 return roundAndPackInt64( aSign, aSig, aSigExtra ); 3393 3394 } 3395 3396 /* 3397 ------------------------------------------------------------------------------- 3398 Returns the result of converting the extended double-precision floating- 3399 point value `a' to the 64-bit two's complement integer format. The 3400 conversion is performed according to the IEC/IEEE Standard for Binary 3401 Floating-Point Arithmetic, except that the conversion is always rounded 3402 toward zero. If `a' is a NaN, the largest positive integer is returned. 3403 Otherwise, if the conversion overflows, the largest integer with the same 3404 sign as `a' is returned. 3405 ------------------------------------------------------------------------------- 3406 */ 3407 int64 floatx80_to_int64_round_to_zero( floatx80 a ) 3408 { 3409 flag aSign; 3410 int32 aExp, shiftCount; 3411 bits64 aSig; 3412 int64 z; 3413 3414 aSig = extractFloatx80Frac( a ); 3415 aExp = extractFloatx80Exp( a ); 3416 aSign = extractFloatx80Sign( a ); 3417 shiftCount = aExp - 0x403E; 3418 if ( 0 <= shiftCount ) { 3419 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 3420 if ( ( a.high != 0xC03E ) || aSig ) { 3421 float_raise( float_flag_invalid ); 3422 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 3423 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3424 } 3425 } 3426 return (sbits64) LIT64( 0x8000000000000000 ); 3427 } 3428 else if ( aExp < 0x3FFF ) { 3429 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact; 3430 return 0; 3431 } 3432 z = aSig>>( - shiftCount ); 3433 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { 3434 float_exception_flags |= float_flag_inexact; 3435 } 3436 if ( aSign ) z = - z; 3437 return z; 3438 3439 } 3440 3441 /* 3442 ------------------------------------------------------------------------------- 3443 Returns the result of converting the extended double-precision floating- 3444 point value `a' to the single-precision floating-point format. The 3445 conversion is performed according to the IEC/IEEE Standard for Binary 3446 Floating-Point Arithmetic. 3447 ------------------------------------------------------------------------------- 3448 */ 3449 float32 floatx80_to_float32( floatx80 a ) 3450 { 3451 flag aSign; 3452 int32 aExp; 3453 bits64 aSig; 3454 3455 aSig = extractFloatx80Frac( a ); 3456 aExp = extractFloatx80Exp( a ); 3457 aSign = extractFloatx80Sign( a ); 3458 if ( aExp == 0x7FFF ) { 3459 if ( (bits64) ( aSig<<1 ) ) { 3460 return commonNaNToFloat32( floatx80ToCommonNaN( a ) ); 3461 } 3462 return packFloat32( aSign, 0xFF, 0 ); 3463 } 3464 shift64RightJamming( aSig, 33, &aSig ); 3465 if ( aExp || aSig ) aExp -= 0x3F81; 3466 return roundAndPackFloat32( aSign, aExp, aSig ); 3467 3468 } 3469 3470 /* 3471 ------------------------------------------------------------------------------- 3472 Returns the result of converting the extended double-precision floating- 3473 point value `a' to the double-precision floating-point format. The 3474 conversion is performed according to the IEC/IEEE Standard for Binary 3475 Floating-Point Arithmetic. 3476 ------------------------------------------------------------------------------- 3477 */ 3478 float64 floatx80_to_float64( floatx80 a ) 3479 { 3480 flag aSign; 3481 int32 aExp; 3482 bits64 aSig, zSig; 3483 3484 aSig = extractFloatx80Frac( a ); 3485 aExp = extractFloatx80Exp( a ); 3486 aSign = extractFloatx80Sign( a ); 3487 if ( aExp == 0x7FFF ) { 3488 if ( (bits64) ( aSig<<1 ) ) { 3489 return commonNaNToFloat64( floatx80ToCommonNaN( a ) ); 3490 } 3491 return packFloat64( aSign, 0x7FF, 0 ); 3492 } 3493 shift64RightJamming( aSig, 1, &zSig ); 3494 if ( aExp || aSig ) aExp -= 0x3C01; 3495 return roundAndPackFloat64( aSign, aExp, zSig ); 3496 3497 } 3498 3499 #ifdef FLOAT128 3500 3501 /* 3502 ------------------------------------------------------------------------------- 3503 Returns the result of converting the extended double-precision floating- 3504 point value `a' to the quadruple-precision floating-point format. The 3505 conversion is performed according to the IEC/IEEE Standard for Binary 3506 Floating-Point Arithmetic. 3507 ------------------------------------------------------------------------------- 3508 */ 3509 float128 floatx80_to_float128( floatx80 a ) 3510 { 3511 flag aSign; 3512 int16 aExp; 3513 bits64 aSig, zSig0, zSig1; 3514 3515 aSig = extractFloatx80Frac( a ); 3516 aExp = extractFloatx80Exp( a ); 3517 aSign = extractFloatx80Sign( a ); 3518 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) { 3519 return commonNaNToFloat128( floatx80ToCommonNaN( a ) ); 3520 } 3521 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 3522 return packFloat128( aSign, aExp, zSig0, zSig1 ); 3523 3524 } 3525 3526 #endif 3527 3528 /* 3529 ------------------------------------------------------------------------------- 3530 Rounds the extended double-precision floating-point value `a' to an integer, 3531 and returns the result as an extended quadruple-precision floating-point 3532 value. The operation is performed according to the IEC/IEEE Standard for 3533 Binary Floating-Point Arithmetic. 3534 ------------------------------------------------------------------------------- 3535 */ 3536 floatx80 floatx80_round_to_int( floatx80 a ) 3537 { 3538 flag aSign; 3539 int32 aExp; 3540 bits64 lastBitMask, roundBitsMask; 3541 int8 roundingMode; 3542 floatx80 z; 3543 3544 aExp = extractFloatx80Exp( a ); 3545 if ( 0x403E <= aExp ) { 3546 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) { 3547 return propagateFloatx80NaN( a, a ); 3548 } 3549 return a; 3550 } 3551 if ( aExp < 0x3FFF ) { 3552 if ( ( aExp == 0 ) 3553 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 3554 return a; 3555 } 3556 float_exception_flags |= float_flag_inexact; 3557 aSign = extractFloatx80Sign( a ); 3558 switch ( float_rounding_mode ) { 3559 case float_round_nearest_even: 3560 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 ) 3561 ) { 3562 return 3563 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 3564 } 3565 break; 3566 case float_round_to_zero: 3567 break; 3568 case float_round_down: 3569 return 3570 aSign ? 3571 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 3572 : packFloatx80( 0, 0, 0 ); 3573 case float_round_up: 3574 return 3575 aSign ? packFloatx80( 1, 0, 0 ) 3576 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 3577 } 3578 return packFloatx80( aSign, 0, 0 ); 3579 } 3580 lastBitMask = 1; 3581 lastBitMask <<= 0x403E - aExp; 3582 roundBitsMask = lastBitMask - 1; 3583 z = a; 3584 roundingMode = float_rounding_mode; 3585 if ( roundingMode == float_round_nearest_even ) { 3586 z.low += lastBitMask>>1; 3587 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 3588 } 3589 else if ( roundingMode != float_round_to_zero ) { 3590 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) { 3591 z.low += roundBitsMask; 3592 } 3593 } 3594 z.low &= ~ roundBitsMask; 3595 if ( z.low == 0 ) { 3596 ++z.high; 3597 z.low = LIT64( 0x8000000000000000 ); 3598 } 3599 if ( z.low != a.low ) float_exception_flags |= float_flag_inexact; 3600 return z; 3601 3602 } 3603 3604 /* 3605 ------------------------------------------------------------------------------- 3606 Returns the result of adding the absolute values of the extended double- 3607 precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 3608 negated before being returned. `zSign' is ignored if the result is a NaN. 3609 The addition is performed according to the IEC/IEEE Standard for Binary 3610 Floating-Point Arithmetic. 3611 ------------------------------------------------------------------------------- 3612 */ 3613 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign ) 3614 { 3615 int32 aExp, bExp, zExp; 3616 bits64 aSig, bSig, zSig0, zSig1; 3617 int32 expDiff; 3618 3619 aSig = extractFloatx80Frac( a ); 3620 aExp = extractFloatx80Exp( a ); 3621 bSig = extractFloatx80Frac( b ); 3622 bExp = extractFloatx80Exp( b ); 3623 expDiff = aExp - bExp; 3624 if ( 0 < expDiff ) { 3625 if ( aExp == 0x7FFF ) { 3626 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3627 return a; 3628 } 3629 if ( bExp == 0 ) --expDiff; 3630 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 3631 zExp = aExp; 3632 } 3633 else if ( expDiff < 0 ) { 3634 if ( bExp == 0x7FFF ) { 3635 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3636 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3637 } 3638 if ( aExp == 0 ) ++expDiff; 3639 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 3640 zExp = bExp; 3641 } 3642 else { 3643 if ( aExp == 0x7FFF ) { 3644 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { 3645 return propagateFloatx80NaN( a, b ); 3646 } 3647 return a; 3648 } 3649 zSig1 = 0; 3650 zSig0 = aSig + bSig; 3651 if ( aExp == 0 ) { 3652 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 3653 goto roundAndPack; 3654 } 3655 zExp = aExp; 3656 goto shiftRight1; 3657 } 3658 zSig0 = aSig + bSig; 3659 if ( (sbits64) zSig0 < 0 ) goto roundAndPack; 3660 shiftRight1: 3661 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 3662 zSig0 |= LIT64( 0x8000000000000000 ); 3663 ++zExp; 3664 roundAndPack: 3665 return 3666 roundAndPackFloatx80( 3667 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3668 3669 } 3670 3671 /* 3672 ------------------------------------------------------------------------------- 3673 Returns the result of subtracting the absolute values of the extended 3674 double-precision floating-point values `a' and `b'. If `zSign' is 1, the 3675 difference is negated before being returned. `zSign' is ignored if the 3676 result is a NaN. The subtraction is performed according to the IEC/IEEE 3677 Standard for Binary Floating-Point Arithmetic. 3678 ------------------------------------------------------------------------------- 3679 */ 3680 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign ) 3681 { 3682 int32 aExp, bExp, zExp; 3683 bits64 aSig, bSig, zSig0, zSig1; 3684 int32 expDiff; 3685 floatx80 z; 3686 3687 aSig = extractFloatx80Frac( a ); 3688 aExp = extractFloatx80Exp( a ); 3689 bSig = extractFloatx80Frac( b ); 3690 bExp = extractFloatx80Exp( b ); 3691 expDiff = aExp - bExp; 3692 if ( 0 < expDiff ) goto aExpBigger; 3693 if ( expDiff < 0 ) goto bExpBigger; 3694 if ( aExp == 0x7FFF ) { 3695 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { 3696 return propagateFloatx80NaN( a, b ); 3697 } 3698 float_raise( float_flag_invalid ); 3699 z.low = floatx80_default_nan_low; 3700 z.high = floatx80_default_nan_high; 3701 return z; 3702 } 3703 if ( aExp == 0 ) { 3704 aExp = 1; 3705 bExp = 1; 3706 } 3707 zSig1 = 0; 3708 if ( bSig < aSig ) goto aBigger; 3709 if ( aSig < bSig ) goto bBigger; 3710 return packFloatx80( float_rounding_mode == float_round_down, 0, 0 ); 3711 bExpBigger: 3712 if ( bExp == 0x7FFF ) { 3713 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3714 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3715 } 3716 if ( aExp == 0 ) ++expDiff; 3717 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 3718 bBigger: 3719 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 3720 zExp = bExp; 3721 zSign ^= 1; 3722 goto normalizeRoundAndPack; 3723 aExpBigger: 3724 if ( aExp == 0x7FFF ) { 3725 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3726 return a; 3727 } 3728 if ( bExp == 0 ) --expDiff; 3729 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 3730 aBigger: 3731 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 3732 zExp = aExp; 3733 normalizeRoundAndPack: 3734 return 3735 normalizeRoundAndPackFloatx80( 3736 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3737 3738 } 3739 3740 /* 3741 ------------------------------------------------------------------------------- 3742 Returns the result of adding the extended double-precision floating-point 3743 values `a' and `b'. The operation is performed according to the IEC/IEEE 3744 Standard for Binary Floating-Point Arithmetic. 3745 ------------------------------------------------------------------------------- 3746 */ 3747 floatx80 floatx80_add( floatx80 a, floatx80 b ) 3748 { 3749 flag aSign, bSign; 3750 3751 aSign = extractFloatx80Sign( a ); 3752 bSign = extractFloatx80Sign( b ); 3753 if ( aSign == bSign ) { 3754 return addFloatx80Sigs( a, b, aSign ); 3755 } 3756 else { 3757 return subFloatx80Sigs( a, b, aSign ); 3758 } 3759 3760 } 3761 3762 /* 3763 ------------------------------------------------------------------------------- 3764 Returns the result of subtracting the extended double-precision floating- 3765 point values `a' and `b'. The operation is performed according to the 3766 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3767 ------------------------------------------------------------------------------- 3768 */ 3769 floatx80 floatx80_sub( floatx80 a, floatx80 b ) 3770 { 3771 flag aSign, bSign; 3772 3773 aSign = extractFloatx80Sign( a ); 3774 bSign = extractFloatx80Sign( b ); 3775 if ( aSign == bSign ) { 3776 return subFloatx80Sigs( a, b, aSign ); 3777 } 3778 else { 3779 return addFloatx80Sigs( a, b, aSign ); 3780 } 3781 3782 } 3783 3784 /* 3785 ------------------------------------------------------------------------------- 3786 Returns the result of multiplying the extended double-precision floating- 3787 point values `a' and `b'. The operation is performed according to the 3788 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3789 ------------------------------------------------------------------------------- 3790 */ 3791 floatx80 floatx80_mul( floatx80 a, floatx80 b ) 3792 { 3793 flag aSign, bSign, zSign; 3794 int32 aExp, bExp, zExp; 3795 bits64 aSig, bSig, zSig0, zSig1; 3796 floatx80 z; 3797 3798 aSig = extractFloatx80Frac( a ); 3799 aExp = extractFloatx80Exp( a ); 3800 aSign = extractFloatx80Sign( a ); 3801 bSig = extractFloatx80Frac( b ); 3802 bExp = extractFloatx80Exp( b ); 3803 bSign = extractFloatx80Sign( b ); 3804 zSign = aSign ^ bSign; 3805 if ( aExp == 0x7FFF ) { 3806 if ( (bits64) ( aSig<<1 ) 3807 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { 3808 return propagateFloatx80NaN( a, b ); 3809 } 3810 if ( ( bExp | bSig ) == 0 ) goto invalid; 3811 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3812 } 3813 if ( bExp == 0x7FFF ) { 3814 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3815 if ( ( aExp | aSig ) == 0 ) { 3816 invalid: 3817 float_raise( float_flag_invalid ); 3818 z.low = floatx80_default_nan_low; 3819 z.high = floatx80_default_nan_high; 3820 return z; 3821 } 3822 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3823 } 3824 if ( aExp == 0 ) { 3825 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3826 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 3827 } 3828 if ( bExp == 0 ) { 3829 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3830 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3831 } 3832 zExp = aExp + bExp - 0x3FFE; 3833 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 3834 if ( 0 < (sbits64) zSig0 ) { 3835 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 3836 --zExp; 3837 } 3838 return 3839 roundAndPackFloatx80( 3840 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3841 3842 } 3843 3844 /* 3845 ------------------------------------------------------------------------------- 3846 Returns the result of dividing the extended double-precision floating-point 3847 value `a' by the corresponding value `b'. The operation is performed 3848 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3849 ------------------------------------------------------------------------------- 3850 */ 3851 floatx80 floatx80_div( floatx80 a, floatx80 b ) 3852 { 3853 flag aSign, bSign, zSign; 3854 int32 aExp, bExp, zExp; 3855 bits64 aSig, bSig, zSig0, zSig1; 3856 bits64 rem0, rem1, rem2, term0, term1, term2; 3857 floatx80 z; 3858 3859 aSig = extractFloatx80Frac( a ); 3860 aExp = extractFloatx80Exp( a ); 3861 aSign = extractFloatx80Sign( a ); 3862 bSig = extractFloatx80Frac( b ); 3863 bExp = extractFloatx80Exp( b ); 3864 bSign = extractFloatx80Sign( b ); 3865 zSign = aSign ^ bSign; 3866 if ( aExp == 0x7FFF ) { 3867 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3868 if ( bExp == 0x7FFF ) { 3869 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3870 goto invalid; 3871 } 3872 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3873 } 3874 if ( bExp == 0x7FFF ) { 3875 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3876 return packFloatx80( zSign, 0, 0 ); 3877 } 3878 if ( bExp == 0 ) { 3879 if ( bSig == 0 ) { 3880 if ( ( aExp | aSig ) == 0 ) { 3881 invalid: 3882 float_raise( float_flag_invalid ); 3883 z.low = floatx80_default_nan_low; 3884 z.high = floatx80_default_nan_high; 3885 return z; 3886 } 3887 float_raise( float_flag_divbyzero ); 3888 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3889 } 3890 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3891 } 3892 if ( aExp == 0 ) { 3893 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3894 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 3895 } 3896 zExp = aExp - bExp + 0x3FFE; 3897 rem1 = 0; 3898 if ( bSig <= aSig ) { 3899 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 3900 ++zExp; 3901 } 3902 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 3903 mul64To128( bSig, zSig0, &term0, &term1 ); 3904 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 3905 while ( (sbits64) rem0 < 0 ) { 3906 --zSig0; 3907 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 3908 } 3909 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 3910 if ( (bits64) ( zSig1<<1 ) <= 8 ) { 3911 mul64To128( bSig, zSig1, &term1, &term2 ); 3912 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 3913 while ( (sbits64) rem1 < 0 ) { 3914 --zSig1; 3915 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 3916 } 3917 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 3918 } 3919 return 3920 roundAndPackFloatx80( 3921 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3922 3923 } 3924 3925 /* 3926 ------------------------------------------------------------------------------- 3927 Returns the remainder of the extended double-precision floating-point value 3928 `a' with respect to the corresponding value `b'. The operation is performed 3929 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3930 ------------------------------------------------------------------------------- 3931 */ 3932 floatx80 floatx80_rem( floatx80 a, floatx80 b ) 3933 { 3934 flag aSign, bSign, zSign; 3935 int32 aExp, bExp, expDiff; 3936 bits64 aSig0, aSig1, bSig; 3937 bits64 q, term0, term1, alternateASig0, alternateASig1; 3938 floatx80 z; 3939 3940 aSig0 = extractFloatx80Frac( a ); 3941 aExp = extractFloatx80Exp( a ); 3942 aSign = extractFloatx80Sign( a ); 3943 bSig = extractFloatx80Frac( b ); 3944 bExp = extractFloatx80Exp( b ); 3945 bSign = extractFloatx80Sign( b ); 3946 if ( aExp == 0x7FFF ) { 3947 if ( (bits64) ( aSig0<<1 ) 3948 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { 3949 return propagateFloatx80NaN( a, b ); 3950 } 3951 goto invalid; 3952 } 3953 if ( bExp == 0x7FFF ) { 3954 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3955 return a; 3956 } 3957 if ( bExp == 0 ) { 3958 if ( bSig == 0 ) { 3959 invalid: 3960 float_raise( float_flag_invalid ); 3961 z.low = floatx80_default_nan_low; 3962 z.high = floatx80_default_nan_high; 3963 return z; 3964 } 3965 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3966 } 3967 if ( aExp == 0 ) { 3968 if ( (bits64) ( aSig0<<1 ) == 0 ) return a; 3969 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 3970 } 3971 bSig |= LIT64( 0x8000000000000000 ); 3972 zSign = aSign; 3973 expDiff = aExp - bExp; 3974 aSig1 = 0; 3975 if ( expDiff < 0 ) { 3976 if ( expDiff < -1 ) return a; 3977 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 3978 expDiff = 0; 3979 } 3980 q = ( bSig <= aSig0 ); 3981 if ( q ) aSig0 -= bSig; 3982 expDiff -= 64; 3983 while ( 0 < expDiff ) { 3984 q = estimateDiv128To64( aSig0, aSig1, bSig ); 3985 q = ( 2 < q ) ? q - 2 : 0; 3986 mul64To128( bSig, q, &term0, &term1 ); 3987 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 3988 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 3989 expDiff -= 62; 3990 } 3991 expDiff += 64; 3992 if ( 0 < expDiff ) { 3993 q = estimateDiv128To64( aSig0, aSig1, bSig ); 3994 q = ( 2 < q ) ? q - 2 : 0; 3995 q >>= 64 - expDiff; 3996 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 3997 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 3998 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 3999 while ( le128( term0, term1, aSig0, aSig1 ) ) { 4000 ++q; 4001 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 4002 } 4003 } 4004 else { 4005 term1 = 0; 4006 term0 = bSig; 4007 } 4008 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 4009 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 4010 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 4011 && ( q & 1 ) ) 4012 ) { 4013 aSig0 = alternateASig0; 4014 aSig1 = alternateASig1; 4015 zSign = ! zSign; 4016 } 4017 return 4018 normalizeRoundAndPackFloatx80( 4019 80, zSign, bExp + expDiff, aSig0, aSig1 ); 4020 4021 } 4022 4023 /* 4024 ------------------------------------------------------------------------------- 4025 Returns the square root of the extended double-precision floating-point 4026 value `a'. The operation is performed according to the IEC/IEEE Standard 4027 for Binary Floating-Point Arithmetic. 4028 ------------------------------------------------------------------------------- 4029 */ 4030 floatx80 floatx80_sqrt( floatx80 a ) 4031 { 4032 flag aSign; 4033 int32 aExp, zExp; 4034 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0; 4035 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 4036 floatx80 z; 4037 4038 aSig0 = extractFloatx80Frac( a ); 4039 aExp = extractFloatx80Exp( a ); 4040 aSign = extractFloatx80Sign( a ); 4041 if ( aExp == 0x7FFF ) { 4042 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a ); 4043 if ( ! aSign ) return a; 4044 goto invalid; 4045 } 4046 if ( aSign ) { 4047 if ( ( aExp | aSig0 ) == 0 ) return a; 4048 invalid: 4049 float_raise( float_flag_invalid ); 4050 z.low = floatx80_default_nan_low; 4051 z.high = floatx80_default_nan_high; 4052 return z; 4053 } 4054 if ( aExp == 0 ) { 4055 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 4056 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 4057 } 4058 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 4059 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 4060 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 4061 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 4062 doubleZSig0 = zSig0<<1; 4063 mul64To128( zSig0, zSig0, &term0, &term1 ); 4064 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 4065 while ( (sbits64) rem0 < 0 ) { 4066 --zSig0; 4067 doubleZSig0 -= 2; 4068 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 4069 } 4070 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 4071 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 4072 if ( zSig1 == 0 ) zSig1 = 1; 4073 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 4074 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 4075 mul64To128( zSig1, zSig1, &term2, &term3 ); 4076 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 4077 while ( (sbits64) rem1 < 0 ) { 4078 --zSig1; 4079 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 4080 term3 |= 1; 4081 term2 |= doubleZSig0; 4082 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 4083 } 4084 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 4085 } 4086 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 4087 zSig0 |= doubleZSig0; 4088 return 4089 roundAndPackFloatx80( 4090 floatx80_rounding_precision, 0, zExp, zSig0, zSig1 ); 4091 4092 } 4093 4094 /* 4095 ------------------------------------------------------------------------------- 4096 Returns 1 if the extended double-precision floating-point value `a' is 4097 equal to the corresponding value `b', and 0 otherwise. The comparison is 4098 performed according to the IEC/IEEE Standard for Binary Floating-Point 4099 Arithmetic. 4100 ------------------------------------------------------------------------------- 4101 */ 4102 flag floatx80_eq( floatx80 a, floatx80 b ) 4103 { 4104 4105 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4106 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4107 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4108 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4109 ) { 4110 if ( floatx80_is_signaling_nan( a ) 4111 || floatx80_is_signaling_nan( b ) ) { 4112 float_raise( float_flag_invalid ); 4113 } 4114 return 0; 4115 } 4116 return 4117 ( a.low == b.low ) 4118 && ( ( a.high == b.high ) 4119 || ( ( a.low == 0 ) 4120 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) 4121 ); 4122 4123 } 4124 4125 /* 4126 ------------------------------------------------------------------------------- 4127 Returns 1 if the extended double-precision floating-point value `a' is 4128 less than or equal to the corresponding value `b', and 0 otherwise. The 4129 comparison is performed according to the IEC/IEEE Standard for Binary 4130 Floating-Point Arithmetic. 4131 ------------------------------------------------------------------------------- 4132 */ 4133 flag floatx80_le( floatx80 a, floatx80 b ) 4134 { 4135 flag aSign, bSign; 4136 4137 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4138 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4139 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4140 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4141 ) { 4142 float_raise( float_flag_invalid ); 4143 return 0; 4144 } 4145 aSign = extractFloatx80Sign( a ); 4146 bSign = extractFloatx80Sign( b ); 4147 if ( aSign != bSign ) { 4148 return 4149 aSign 4150 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4151 == 0 ); 4152 } 4153 return 4154 aSign ? le128( b.high, b.low, a.high, a.low ) 4155 : le128( a.high, a.low, b.high, b.low ); 4156 4157 } 4158 4159 /* 4160 ------------------------------------------------------------------------------- 4161 Returns 1 if the extended double-precision floating-point value `a' is 4162 less than the corresponding value `b', and 0 otherwise. The comparison 4163 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4164 Arithmetic. 4165 ------------------------------------------------------------------------------- 4166 */ 4167 flag floatx80_lt( floatx80 a, floatx80 b ) 4168 { 4169 flag aSign, bSign; 4170 4171 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4172 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4173 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4174 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4175 ) { 4176 float_raise( float_flag_invalid ); 4177 return 0; 4178 } 4179 aSign = extractFloatx80Sign( a ); 4180 bSign = extractFloatx80Sign( b ); 4181 if ( aSign != bSign ) { 4182 return 4183 aSign 4184 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4185 != 0 ); 4186 } 4187 return 4188 aSign ? lt128( b.high, b.low, a.high, a.low ) 4189 : lt128( a.high, a.low, b.high, b.low ); 4190 4191 } 4192 4193 /* 4194 ------------------------------------------------------------------------------- 4195 Returns 1 if the extended double-precision floating-point value `a' is equal 4196 to the corresponding value `b', and 0 otherwise. The invalid exception is 4197 raised if either operand is a NaN. Otherwise, the comparison is performed 4198 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4199 ------------------------------------------------------------------------------- 4200 */ 4201 flag floatx80_eq_signaling( floatx80 a, floatx80 b ) 4202 { 4203 4204 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4205 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4206 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4207 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4208 ) { 4209 float_raise( float_flag_invalid ); 4210 return 0; 4211 } 4212 return 4213 ( a.low == b.low ) 4214 && ( ( a.high == b.high ) 4215 || ( ( a.low == 0 ) 4216 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) 4217 ); 4218 4219 } 4220 4221 /* 4222 ------------------------------------------------------------------------------- 4223 Returns 1 if the extended double-precision floating-point value `a' is less 4224 than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs 4225 do not cause an exception. Otherwise, the comparison is performed according 4226 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4227 ------------------------------------------------------------------------------- 4228 */ 4229 flag floatx80_le_quiet( floatx80 a, floatx80 b ) 4230 { 4231 flag aSign, bSign; 4232 4233 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4234 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4235 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4236 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4237 ) { 4238 if ( floatx80_is_signaling_nan( a ) 4239 || floatx80_is_signaling_nan( b ) ) { 4240 float_raise( float_flag_invalid ); 4241 } 4242 return 0; 4243 } 4244 aSign = extractFloatx80Sign( a ); 4245 bSign = extractFloatx80Sign( b ); 4246 if ( aSign != bSign ) { 4247 return 4248 aSign 4249 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4250 == 0 ); 4251 } 4252 return 4253 aSign ? le128( b.high, b.low, a.high, a.low ) 4254 : le128( a.high, a.low, b.high, b.low ); 4255 4256 } 4257 4258 /* 4259 ------------------------------------------------------------------------------- 4260 Returns 1 if the extended double-precision floating-point value `a' is less 4261 than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause 4262 an exception. Otherwise, the comparison is performed according to the 4263 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4264 ------------------------------------------------------------------------------- 4265 */ 4266 flag floatx80_lt_quiet( floatx80 a, floatx80 b ) 4267 { 4268 flag aSign, bSign; 4269 4270 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4271 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4272 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4273 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4274 ) { 4275 if ( floatx80_is_signaling_nan( a ) 4276 || floatx80_is_signaling_nan( b ) ) { 4277 float_raise( float_flag_invalid ); 4278 } 4279 return 0; 4280 } 4281 aSign = extractFloatx80Sign( a ); 4282 bSign = extractFloatx80Sign( b ); 4283 if ( aSign != bSign ) { 4284 return 4285 aSign 4286 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4287 != 0 ); 4288 } 4289 return 4290 aSign ? lt128( b.high, b.low, a.high, a.low ) 4291 : lt128( a.high, a.low, b.high, b.low ); 4292 4293 } 4294 4295 #endif 4296 4297 #ifdef FLOAT128 4298 4299 /* 4300 ------------------------------------------------------------------------------- 4301 Returns the result of converting the quadruple-precision floating-point 4302 value `a' to the 32-bit two's complement integer format. The conversion 4303 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4304 Arithmetic---which means in particular that the conversion is rounded 4305 according to the current rounding mode. If `a' is a NaN, the largest 4306 positive integer is returned. Otherwise, if the conversion overflows, the 4307 largest integer with the same sign as `a' is returned. 4308 ------------------------------------------------------------------------------- 4309 */ 4310 int32 float128_to_int32( float128 a ) 4311 { 4312 flag aSign; 4313 int32 aExp, shiftCount; 4314 bits64 aSig0, aSig1; 4315 4316 aSig1 = extractFloat128Frac1( a ); 4317 aSig0 = extractFloat128Frac0( a ); 4318 aExp = extractFloat128Exp( a ); 4319 aSign = extractFloat128Sign( a ); 4320 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 4321 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4322 aSig0 |= ( aSig1 != 0 ); 4323 shiftCount = 0x4028 - aExp; 4324 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 4325 return roundAndPackInt32( aSign, aSig0 ); 4326 4327 } 4328 4329 /* 4330 ------------------------------------------------------------------------------- 4331 Returns the result of converting the quadruple-precision floating-point 4332 value `a' to the 32-bit two's complement integer format. The conversion 4333 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4334 Arithmetic, except that the conversion is always rounded toward zero. If 4335 `a' is a NaN, the largest positive integer is returned. Otherwise, if the 4336 conversion overflows, the largest integer with the same sign as `a' is 4337 returned. 4338 ------------------------------------------------------------------------------- 4339 */ 4340 int32 float128_to_int32_round_to_zero( float128 a ) 4341 { 4342 flag aSign; 4343 int32 aExp, shiftCount; 4344 bits64 aSig0, aSig1, savedASig; 4345 int32 z; 4346 4347 aSig1 = extractFloat128Frac1( a ); 4348 aSig0 = extractFloat128Frac0( a ); 4349 aExp = extractFloat128Exp( a ); 4350 aSign = extractFloat128Sign( a ); 4351 aSig0 |= ( aSig1 != 0 ); 4352 if ( 0x401E < aExp ) { 4353 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 4354 goto invalid; 4355 } 4356 else if ( aExp < 0x3FFF ) { 4357 if ( aExp || aSig0 ) float_exception_flags |= float_flag_inexact; 4358 return 0; 4359 } 4360 aSig0 |= LIT64( 0x0001000000000000 ); 4361 shiftCount = 0x402F - aExp; 4362 savedASig = aSig0; 4363 aSig0 >>= shiftCount; 4364 z = aSig0; 4365 if ( aSign ) z = - z; 4366 if ( ( z < 0 ) ^ aSign ) { 4367 invalid: 4368 float_raise( float_flag_invalid ); 4369 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 4370 } 4371 if ( ( aSig0<<shiftCount ) != savedASig ) { 4372 float_exception_flags |= float_flag_inexact; 4373 } 4374 return z; 4375 4376 } 4377 4378 /* 4379 ------------------------------------------------------------------------------- 4380 Returns the result of converting the quadruple-precision floating-point 4381 value `a' to the 64-bit two's complement integer format. The conversion 4382 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4383 Arithmetic---which means in particular that the conversion is rounded 4384 according to the current rounding mode. If `a' is a NaN, the largest 4385 positive integer is returned. Otherwise, if the conversion overflows, the 4386 largest integer with the same sign as `a' is returned. 4387 ------------------------------------------------------------------------------- 4388 */ 4389 int64 float128_to_int64( float128 a ) 4390 { 4391 flag aSign; 4392 int32 aExp, shiftCount; 4393 bits64 aSig0, aSig1; 4394 4395 aSig1 = extractFloat128Frac1( a ); 4396 aSig0 = extractFloat128Frac0( a ); 4397 aExp = extractFloat128Exp( a ); 4398 aSign = extractFloat128Sign( a ); 4399 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4400 shiftCount = 0x402F - aExp; 4401 if ( shiftCount <= 0 ) { 4402 if ( 0x403E < aExp ) { 4403 float_raise( float_flag_invalid ); 4404 if ( ! aSign 4405 || ( ( aExp == 0x7FFF ) 4406 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 4407 ) 4408 ) { 4409 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4410 } 4411 return (sbits64) LIT64( 0x8000000000000000 ); 4412 } 4413 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 4414 } 4415 else { 4416 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 4417 } 4418 return roundAndPackInt64( aSign, aSig0, aSig1 ); 4419 4420 } 4421 4422 /* 4423 ------------------------------------------------------------------------------- 4424 Returns the result of converting the quadruple-precision floating-point 4425 value `a' to the 64-bit two's complement integer format. The conversion 4426 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4427 Arithmetic, except that the conversion is always rounded toward zero. 4428 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 4429 the conversion overflows, the largest integer with the same sign as `a' is 4430 returned. 4431 ------------------------------------------------------------------------------- 4432 */ 4433 int64 float128_to_int64_round_to_zero( float128 a ) 4434 { 4435 flag aSign; 4436 int32 aExp, shiftCount; 4437 bits64 aSig0, aSig1; 4438 int64 z; 4439 4440 aSig1 = extractFloat128Frac1( a ); 4441 aSig0 = extractFloat128Frac0( a ); 4442 aExp = extractFloat128Exp( a ); 4443 aSign = extractFloat128Sign( a ); 4444 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4445 shiftCount = aExp - 0x402F; 4446 if ( 0 < shiftCount ) { 4447 if ( 0x403E <= aExp ) { 4448 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 4449 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 4450 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 4451 if ( aSig1 ) float_exception_flags |= float_flag_inexact; 4452 } 4453 else { 4454 float_raise( float_flag_invalid ); 4455 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 4456 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4457 } 4458 } 4459 return (sbits64) LIT64( 0x8000000000000000 ); 4460 } 4461 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 4462 if ( (bits64) ( aSig1<<shiftCount ) ) { 4463 float_exception_flags |= float_flag_inexact; 4464 } 4465 } 4466 else { 4467 if ( aExp < 0x3FFF ) { 4468 if ( aExp | aSig0 | aSig1 ) { 4469 float_exception_flags |= float_flag_inexact; 4470 } 4471 return 0; 4472 } 4473 z = aSig0>>( - shiftCount ); 4474 if ( aSig1 4475 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) { 4476 float_exception_flags |= float_flag_inexact; 4477 } 4478 } 4479 if ( aSign ) z = - z; 4480 return z; 4481 4482 } 4483 4484 #if (defined(SOFTFLOATSPARC64_FOR_GCC) || defined(SOFTFLOAT_FOR_GCC)) \ 4485 && defined(SOFTFLOAT_NEED_FIXUNS) 4486 /* 4487 * just like above - but do not care for overflow of signed results 4488 */ 4489 uint64 float128_to_uint64_round_to_zero( float128 a ) 4490 { 4491 flag aSign; 4492 int32 aExp, shiftCount; 4493 bits64 aSig0, aSig1; 4494 uint64 z; 4495 4496 aSig1 = extractFloat128Frac1( a ); 4497 aSig0 = extractFloat128Frac0( a ); 4498 aExp = extractFloat128Exp( a ); 4499 aSign = extractFloat128Sign( a ); 4500 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4501 shiftCount = aExp - 0x402F; 4502 if ( 0 < shiftCount ) { 4503 if ( 0x403F <= aExp ) { 4504 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 4505 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 4506 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 4507 if ( aSig1 ) float_exception_flags |= float_flag_inexact; 4508 } 4509 else { 4510 float_raise( float_flag_invalid ); 4511 } 4512 return LIT64( 0xFFFFFFFFFFFFFFFF ); 4513 } 4514 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 4515 if ( (bits64) ( aSig1<<shiftCount ) ) { 4516 float_exception_flags |= float_flag_inexact; 4517 } 4518 } 4519 else { 4520 if ( aExp < 0x3FFF ) { 4521 if ( aExp | aSig0 | aSig1 ) { 4522 float_exception_flags |= float_flag_inexact; 4523 } 4524 return 0; 4525 } 4526 z = aSig0>>( - shiftCount ); 4527 if (aSig1 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) { 4528 float_exception_flags |= float_flag_inexact; 4529 } 4530 } 4531 if ( aSign ) z = - z; 4532 return z; 4533 4534 } 4535 #endif /* (SOFTFLOATSPARC64_FOR_GCC || SOFTFLOAT_FOR_GCC) && SOFTFLOAT_NEED_FIXUNS */ 4536 4537 /* 4538 ------------------------------------------------------------------------------- 4539 Returns the result of converting the quadruple-precision floating-point 4540 value `a' to the single-precision floating-point format. The conversion 4541 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4542 Arithmetic. 4543 ------------------------------------------------------------------------------- 4544 */ 4545 float32 float128_to_float32( float128 a ) 4546 { 4547 flag aSign; 4548 int32 aExp; 4549 bits64 aSig0, aSig1; 4550 bits32 zSig; 4551 4552 aSig1 = extractFloat128Frac1( a ); 4553 aSig0 = extractFloat128Frac0( a ); 4554 aExp = extractFloat128Exp( a ); 4555 aSign = extractFloat128Sign( a ); 4556 if ( aExp == 0x7FFF ) { 4557 if ( aSig0 | aSig1 ) { 4558 return commonNaNToFloat32( float128ToCommonNaN( a ) ); 4559 } 4560 return packFloat32( aSign, 0xFF, 0 ); 4561 } 4562 aSig0 |= ( aSig1 != 0 ); 4563 shift64RightJamming( aSig0, 18, &aSig0 ); 4564 zSig = aSig0; 4565 if ( aExp || zSig ) { 4566 zSig |= 0x40000000; 4567 aExp -= 0x3F81; 4568 } 4569 return roundAndPackFloat32( aSign, aExp, zSig ); 4570 4571 } 4572 4573 /* 4574 ------------------------------------------------------------------------------- 4575 Returns the result of converting the quadruple-precision floating-point 4576 value `a' to the double-precision floating-point format. The conversion 4577 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4578 Arithmetic. 4579 ------------------------------------------------------------------------------- 4580 */ 4581 float64 float128_to_float64( float128 a ) 4582 { 4583 flag aSign; 4584 int32 aExp; 4585 bits64 aSig0, aSig1; 4586 4587 aSig1 = extractFloat128Frac1( a ); 4588 aSig0 = extractFloat128Frac0( a ); 4589 aExp = extractFloat128Exp( a ); 4590 aSign = extractFloat128Sign( a ); 4591 if ( aExp == 0x7FFF ) { 4592 if ( aSig0 | aSig1 ) { 4593 return commonNaNToFloat64( float128ToCommonNaN( a ) ); 4594 } 4595 return packFloat64( aSign, 0x7FF, 0 ); 4596 } 4597 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 4598 aSig0 |= ( aSig1 != 0 ); 4599 if ( aExp || aSig0 ) { 4600 aSig0 |= LIT64( 0x4000000000000000 ); 4601 aExp -= 0x3C01; 4602 } 4603 return roundAndPackFloat64( aSign, aExp, aSig0 ); 4604 4605 } 4606 4607 #ifdef FLOATX80 4608 4609 /* 4610 ------------------------------------------------------------------------------- 4611 Returns the result of converting the quadruple-precision floating-point 4612 value `a' to the extended double-precision floating-point format. The 4613 conversion is performed according to the IEC/IEEE Standard for Binary 4614 Floating-Point Arithmetic. 4615 ------------------------------------------------------------------------------- 4616 */ 4617 floatx80 float128_to_floatx80( float128 a ) 4618 { 4619 flag aSign; 4620 int32 aExp; 4621 bits64 aSig0, aSig1; 4622 4623 aSig1 = extractFloat128Frac1( a ); 4624 aSig0 = extractFloat128Frac0( a ); 4625 aExp = extractFloat128Exp( a ); 4626 aSign = extractFloat128Sign( a ); 4627 if ( aExp == 0x7FFF ) { 4628 if ( aSig0 | aSig1 ) { 4629 return commonNaNToFloatx80( float128ToCommonNaN( a ) ); 4630 } 4631 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4632 } 4633 if ( aExp == 0 ) { 4634 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 4635 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 4636 } 4637 else { 4638 aSig0 |= LIT64( 0x0001000000000000 ); 4639 } 4640 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 4641 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 ); 4642 4643 } 4644 4645 #endif 4646 4647 /* 4648 ------------------------------------------------------------------------------- 4649 Rounds the quadruple-precision floating-point value `a' to an integer, and 4650 returns the result as a quadruple-precision floating-point value. The 4651 operation is performed according to the IEC/IEEE Standard for Binary 4652 Floating-Point Arithmetic. 4653 ------------------------------------------------------------------------------- 4654 */ 4655 float128 float128_round_to_int( float128 a ) 4656 { 4657 flag aSign; 4658 int32 aExp; 4659 bits64 lastBitMask, roundBitsMask; 4660 int8 roundingMode; 4661 float128 z; 4662 4663 aExp = extractFloat128Exp( a ); 4664 if ( 0x402F <= aExp ) { 4665 if ( 0x406F <= aExp ) { 4666 if ( ( aExp == 0x7FFF ) 4667 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 4668 ) { 4669 return propagateFloat128NaN( a, a ); 4670 } 4671 return a; 4672 } 4673 lastBitMask = 1; 4674 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 4675 roundBitsMask = lastBitMask - 1; 4676 z = a; 4677 roundingMode = float_rounding_mode; 4678 if ( roundingMode == float_round_nearest_even ) { 4679 if ( lastBitMask ) { 4680 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 4681 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 4682 } 4683 else { 4684 if ( (sbits64) z.low < 0 ) { 4685 ++z.high; 4686 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1; 4687 } 4688 } 4689 } 4690 else if ( roundingMode != float_round_to_zero ) { 4691 if ( extractFloat128Sign( z ) 4692 ^ ( roundingMode == float_round_up ) ) { 4693 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low ); 4694 } 4695 } 4696 z.low &= ~ roundBitsMask; 4697 } 4698 else { 4699 if ( aExp < 0x3FFF ) { 4700 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 4701 float_exception_flags |= float_flag_inexact; 4702 aSign = extractFloat128Sign( a ); 4703 switch ( float_rounding_mode ) { 4704 case float_round_nearest_even: 4705 if ( ( aExp == 0x3FFE ) 4706 && ( extractFloat128Frac0( a ) 4707 | extractFloat128Frac1( a ) ) 4708 ) { 4709 return packFloat128( aSign, 0x3FFF, 0, 0 ); 4710 } 4711 break; 4712 case float_round_to_zero: 4713 break; 4714 case float_round_down: 4715 return 4716 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 4717 : packFloat128( 0, 0, 0, 0 ); 4718 case float_round_up: 4719 return 4720 aSign ? packFloat128( 1, 0, 0, 0 ) 4721 : packFloat128( 0, 0x3FFF, 0, 0 ); 4722 } 4723 return packFloat128( aSign, 0, 0, 0 ); 4724 } 4725 lastBitMask = 1; 4726 lastBitMask <<= 0x402F - aExp; 4727 roundBitsMask = lastBitMask - 1; 4728 z.low = 0; 4729 z.high = a.high; 4730 roundingMode = float_rounding_mode; 4731 if ( roundingMode == float_round_nearest_even ) { 4732 z.high += lastBitMask>>1; 4733 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 4734 z.high &= ~ lastBitMask; 4735 } 4736 } 4737 else if ( roundingMode != float_round_to_zero ) { 4738 if ( extractFloat128Sign( z ) 4739 ^ ( roundingMode == float_round_up ) ) { 4740 z.high |= ( a.low != 0 ); 4741 z.high += roundBitsMask; 4742 } 4743 } 4744 z.high &= ~ roundBitsMask; 4745 } 4746 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 4747 float_exception_flags |= float_flag_inexact; 4748 } 4749 return z; 4750 4751 } 4752 4753 /* 4754 ------------------------------------------------------------------------------- 4755 Returns the result of adding the absolute values of the quadruple-precision 4756 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 4757 before being returned. `zSign' is ignored if the result is a NaN. 4758 The addition is performed according to the IEC/IEEE Standard for Binary 4759 Floating-Point Arithmetic. 4760 ------------------------------------------------------------------------------- 4761 */ 4762 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign ) 4763 { 4764 int32 aExp, bExp, zExp; 4765 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 4766 int32 expDiff; 4767 4768 aSig1 = extractFloat128Frac1( a ); 4769 aSig0 = extractFloat128Frac0( a ); 4770 aExp = extractFloat128Exp( a ); 4771 bSig1 = extractFloat128Frac1( b ); 4772 bSig0 = extractFloat128Frac0( b ); 4773 bExp = extractFloat128Exp( b ); 4774 expDiff = aExp - bExp; 4775 if ( 0 < expDiff ) { 4776 if ( aExp == 0x7FFF ) { 4777 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 4778 return a; 4779 } 4780 if ( bExp == 0 ) { 4781 --expDiff; 4782 } 4783 else { 4784 bSig0 |= LIT64( 0x0001000000000000 ); 4785 } 4786 shift128ExtraRightJamming( 4787 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 4788 zExp = aExp; 4789 } 4790 else if ( expDiff < 0 ) { 4791 if ( bExp == 0x7FFF ) { 4792 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4793 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4794 } 4795 if ( aExp == 0 ) { 4796 ++expDiff; 4797 } 4798 else { 4799 aSig0 |= LIT64( 0x0001000000000000 ); 4800 } 4801 shift128ExtraRightJamming( 4802 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 4803 zExp = bExp; 4804 } 4805 else { 4806 if ( aExp == 0x7FFF ) { 4807 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 4808 return propagateFloat128NaN( a, b ); 4809 } 4810 return a; 4811 } 4812 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4813 if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 ); 4814 zSig2 = 0; 4815 zSig0 |= LIT64( 0x0002000000000000 ); 4816 zExp = aExp; 4817 goto shiftRight1; 4818 } 4819 aSig0 |= LIT64( 0x0001000000000000 ); 4820 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4821 --zExp; 4822 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; 4823 ++zExp; 4824 shiftRight1: 4825 shift128ExtraRightJamming( 4826 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 4827 roundAndPack: 4828 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 4829 4830 } 4831 4832 /* 4833 ------------------------------------------------------------------------------- 4834 Returns the result of subtracting the absolute values of the quadruple- 4835 precision floating-point values `a' and `b'. If `zSign' is 1, the 4836 difference is negated before being returned. `zSign' is ignored if the 4837 result is a NaN. The subtraction is performed according to the IEC/IEEE 4838 Standard for Binary Floating-Point Arithmetic. 4839 ------------------------------------------------------------------------------- 4840 */ 4841 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign ) 4842 { 4843 int32 aExp, bExp, zExp; 4844 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 4845 int32 expDiff; 4846 float128 z; 4847 4848 aSig1 = extractFloat128Frac1( a ); 4849 aSig0 = extractFloat128Frac0( a ); 4850 aExp = extractFloat128Exp( a ); 4851 bSig1 = extractFloat128Frac1( b ); 4852 bSig0 = extractFloat128Frac0( b ); 4853 bExp = extractFloat128Exp( b ); 4854 expDiff = aExp - bExp; 4855 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 4856 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 4857 if ( 0 < expDiff ) goto aExpBigger; 4858 if ( expDiff < 0 ) goto bExpBigger; 4859 if ( aExp == 0x7FFF ) { 4860 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 4861 return propagateFloat128NaN( a, b ); 4862 } 4863 float_raise( float_flag_invalid ); 4864 z.low = float128_default_nan_low; 4865 z.high = float128_default_nan_high; 4866 return z; 4867 } 4868 if ( aExp == 0 ) { 4869 aExp = 1; 4870 bExp = 1; 4871 } 4872 if ( bSig0 < aSig0 ) goto aBigger; 4873 if ( aSig0 < bSig0 ) goto bBigger; 4874 if ( bSig1 < aSig1 ) goto aBigger; 4875 if ( aSig1 < bSig1 ) goto bBigger; 4876 return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 ); 4877 bExpBigger: 4878 if ( bExp == 0x7FFF ) { 4879 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4880 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 4881 } 4882 if ( aExp == 0 ) { 4883 ++expDiff; 4884 } 4885 else { 4886 aSig0 |= LIT64( 0x4000000000000000 ); 4887 } 4888 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 4889 bSig0 |= LIT64( 0x4000000000000000 ); 4890 bBigger: 4891 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 4892 zExp = bExp; 4893 zSign ^= 1; 4894 goto normalizeRoundAndPack; 4895 aExpBigger: 4896 if ( aExp == 0x7FFF ) { 4897 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 4898 return a; 4899 } 4900 if ( bExp == 0 ) { 4901 --expDiff; 4902 } 4903 else { 4904 bSig0 |= LIT64( 0x4000000000000000 ); 4905 } 4906 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 4907 aSig0 |= LIT64( 0x4000000000000000 ); 4908 aBigger: 4909 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4910 zExp = aExp; 4911 normalizeRoundAndPack: 4912 --zExp; 4913 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 ); 4914 4915 } 4916 4917 /* 4918 ------------------------------------------------------------------------------- 4919 Returns the result of adding the quadruple-precision floating-point values 4920 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 4921 for Binary Floating-Point Arithmetic. 4922 ------------------------------------------------------------------------------- 4923 */ 4924 float128 float128_add( float128 a, float128 b ) 4925 { 4926 flag aSign, bSign; 4927 4928 aSign = extractFloat128Sign( a ); 4929 bSign = extractFloat128Sign( b ); 4930 if ( aSign == bSign ) { 4931 return addFloat128Sigs( a, b, aSign ); 4932 } 4933 else { 4934 return subFloat128Sigs( a, b, aSign ); 4935 } 4936 4937 } 4938 4939 /* 4940 ------------------------------------------------------------------------------- 4941 Returns the result of subtracting the quadruple-precision floating-point 4942 values `a' and `b'. The operation is performed according to the IEC/IEEE 4943 Standard for Binary Floating-Point Arithmetic. 4944 ------------------------------------------------------------------------------- 4945 */ 4946 float128 float128_sub( float128 a, float128 b ) 4947 { 4948 flag aSign, bSign; 4949 4950 aSign = extractFloat128Sign( a ); 4951 bSign = extractFloat128Sign( b ); 4952 if ( aSign == bSign ) { 4953 return subFloat128Sigs( a, b, aSign ); 4954 } 4955 else { 4956 return addFloat128Sigs( a, b, aSign ); 4957 } 4958 4959 } 4960 4961 /* 4962 ------------------------------------------------------------------------------- 4963 Returns the result of multiplying the quadruple-precision floating-point 4964 values `a' and `b'. The operation is performed according to the IEC/IEEE 4965 Standard for Binary Floating-Point Arithmetic. 4966 ------------------------------------------------------------------------------- 4967 */ 4968 float128 float128_mul( float128 a, float128 b ) 4969 { 4970 flag aSign, bSign, zSign; 4971 int32 aExp, bExp, zExp; 4972 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 4973 float128 z; 4974 4975 aSig1 = extractFloat128Frac1( a ); 4976 aSig0 = extractFloat128Frac0( a ); 4977 aExp = extractFloat128Exp( a ); 4978 aSign = extractFloat128Sign( a ); 4979 bSig1 = extractFloat128Frac1( b ); 4980 bSig0 = extractFloat128Frac0( b ); 4981 bExp = extractFloat128Exp( b ); 4982 bSign = extractFloat128Sign( b ); 4983 zSign = aSign ^ bSign; 4984 if ( aExp == 0x7FFF ) { 4985 if ( ( aSig0 | aSig1 ) 4986 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 4987 return propagateFloat128NaN( a, b ); 4988 } 4989 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 4990 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4991 } 4992 if ( bExp == 0x7FFF ) { 4993 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4994 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 4995 invalid: 4996 float_raise( float_flag_invalid ); 4997 z.low = float128_default_nan_low; 4998 z.high = float128_default_nan_high; 4999 return z; 5000 } 5001 return packFloat128( zSign, 0x7FFF, 0, 0 ); 5002 } 5003 if ( aExp == 0 ) { 5004 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 5005 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5006 } 5007 if ( bExp == 0 ) { 5008 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 5009 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 5010 } 5011 zExp = aExp + bExp - 0x4000; 5012 aSig0 |= LIT64( 0x0001000000000000 ); 5013 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 5014 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 5015 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 5016 zSig2 |= ( zSig3 != 0 ); 5017 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { 5018 shift128ExtraRightJamming( 5019 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 5020 ++zExp; 5021 } 5022 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 5023 5024 } 5025 5026 /* 5027 ------------------------------------------------------------------------------- 5028 Returns the result of dividing the quadruple-precision floating-point value 5029 `a' by the corresponding value `b'. The operation is performed according to 5030 the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5031 ------------------------------------------------------------------------------- 5032 */ 5033 float128 float128_div( float128 a, float128 b ) 5034 { 5035 flag aSign, bSign, zSign; 5036 int32 aExp, bExp, zExp; 5037 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 5038 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5039 float128 z; 5040 5041 aSig1 = extractFloat128Frac1( a ); 5042 aSig0 = extractFloat128Frac0( a ); 5043 aExp = extractFloat128Exp( a ); 5044 aSign = extractFloat128Sign( a ); 5045 bSig1 = extractFloat128Frac1( b ); 5046 bSig0 = extractFloat128Frac0( b ); 5047 bExp = extractFloat128Exp( b ); 5048 bSign = extractFloat128Sign( b ); 5049 zSign = aSign ^ bSign; 5050 if ( aExp == 0x7FFF ) { 5051 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 5052 if ( bExp == 0x7FFF ) { 5053 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 5054 goto invalid; 5055 } 5056 return packFloat128( zSign, 0x7FFF, 0, 0 ); 5057 } 5058 if ( bExp == 0x7FFF ) { 5059 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 5060 return packFloat128( zSign, 0, 0, 0 ); 5061 } 5062 if ( bExp == 0 ) { 5063 if ( ( bSig0 | bSig1 ) == 0 ) { 5064 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 5065 invalid: 5066 float_raise( float_flag_invalid ); 5067 z.low = float128_default_nan_low; 5068 z.high = float128_default_nan_high; 5069 return z; 5070 } 5071 float_raise( float_flag_divbyzero ); 5072 return packFloat128( zSign, 0x7FFF, 0, 0 ); 5073 } 5074 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 5075 } 5076 if ( aExp == 0 ) { 5077 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 5078 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5079 } 5080 zExp = aExp - bExp + 0x3FFD; 5081 shortShift128Left( 5082 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); 5083 shortShift128Left( 5084 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 5085 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { 5086 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 5087 ++zExp; 5088 } 5089 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5090 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 5091 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 5092 while ( (sbits64) rem0 < 0 ) { 5093 --zSig0; 5094 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 5095 } 5096 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 5097 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 5098 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 5099 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 5100 while ( (sbits64) rem1 < 0 ) { 5101 --zSig1; 5102 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 5103 } 5104 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5105 } 5106 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 5107 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 5108 5109 } 5110 5111 /* 5112 ------------------------------------------------------------------------------- 5113 Returns the remainder of the quadruple-precision floating-point value `a' 5114 with respect to the corresponding value `b'. The operation is performed 5115 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5116 ------------------------------------------------------------------------------- 5117 */ 5118 float128 float128_rem( float128 a, float128 b ) 5119 { 5120 flag aSign, bSign, zSign; 5121 int32 aExp, bExp, expDiff; 5122 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 5123 bits64 allZero, alternateASig0, alternateASig1, sigMean1; 5124 sbits64 sigMean0; 5125 float128 z; 5126 5127 aSig1 = extractFloat128Frac1( a ); 5128 aSig0 = extractFloat128Frac0( a ); 5129 aExp = extractFloat128Exp( a ); 5130 aSign = extractFloat128Sign( a ); 5131 bSig1 = extractFloat128Frac1( b ); 5132 bSig0 = extractFloat128Frac0( b ); 5133 bExp = extractFloat128Exp( b ); 5134 bSign = extractFloat128Sign( b ); 5135 if ( aExp == 0x7FFF ) { 5136 if ( ( aSig0 | aSig1 ) 5137 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 5138 return propagateFloat128NaN( a, b ); 5139 } 5140 goto invalid; 5141 } 5142 if ( bExp == 0x7FFF ) { 5143 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 5144 return a; 5145 } 5146 if ( bExp == 0 ) { 5147 if ( ( bSig0 | bSig1 ) == 0 ) { 5148 invalid: 5149 float_raise( float_flag_invalid ); 5150 z.low = float128_default_nan_low; 5151 z.high = float128_default_nan_high; 5152 return z; 5153 } 5154 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 5155 } 5156 if ( aExp == 0 ) { 5157 if ( ( aSig0 | aSig1 ) == 0 ) return a; 5158 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5159 } 5160 expDiff = aExp - bExp; 5161 if ( expDiff < -1 ) return a; 5162 shortShift128Left( 5163 aSig0 | LIT64( 0x0001000000000000 ), 5164 aSig1, 5165 15 - ( expDiff < 0 ), 5166 &aSig0, 5167 &aSig1 5168 ); 5169 shortShift128Left( 5170 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 5171 q = le128( bSig0, bSig1, aSig0, aSig1 ); 5172 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 5173 expDiff -= 64; 5174 while ( 0 < expDiff ) { 5175 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5176 q = ( 4 < q ) ? q - 4 : 0; 5177 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 5178 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 5179 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 5180 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 5181 expDiff -= 61; 5182 } 5183 if ( -64 < expDiff ) { 5184 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5185 q = ( 4 < q ) ? q - 4 : 0; 5186 q >>= - expDiff; 5187 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 5188 expDiff += 52; 5189 if ( expDiff < 0 ) { 5190 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 5191 } 5192 else { 5193 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 5194 } 5195 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 5196 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 5197 } 5198 else { 5199 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 5200 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 5201 } 5202 do { 5203 alternateASig0 = aSig0; 5204 alternateASig1 = aSig1; 5205 ++q; 5206 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 5207 } while ( 0 <= (sbits64) aSig0 ); 5208 add128( 5209 aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 ); 5210 if ( ( sigMean0 < 0 ) 5211 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 5212 aSig0 = alternateASig0; 5213 aSig1 = alternateASig1; 5214 } 5215 zSign = ( (sbits64) aSig0 < 0 ); 5216 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 5217 return 5218 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 ); 5219 5220 } 5221 5222 /* 5223 ------------------------------------------------------------------------------- 5224 Returns the square root of the quadruple-precision floating-point value `a'. 5225 The operation is performed according to the IEC/IEEE Standard for Binary 5226 Floating-Point Arithmetic. 5227 ------------------------------------------------------------------------------- 5228 */ 5229 float128 float128_sqrt( float128 a ) 5230 { 5231 flag aSign; 5232 int32 aExp, zExp; 5233 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 5234 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5235 float128 z; 5236 5237 aSig1 = extractFloat128Frac1( a ); 5238 aSig0 = extractFloat128Frac0( a ); 5239 aExp = extractFloat128Exp( a ); 5240 aSign = extractFloat128Sign( a ); 5241 if ( aExp == 0x7FFF ) { 5242 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a ); 5243 if ( ! aSign ) return a; 5244 goto invalid; 5245 } 5246 if ( aSign ) { 5247 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 5248 invalid: 5249 float_raise( float_flag_invalid ); 5250 z.low = float128_default_nan_low; 5251 z.high = float128_default_nan_high; 5252 return z; 5253 } 5254 if ( aExp == 0 ) { 5255 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 5256 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5257 } 5258 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 5259 aSig0 |= LIT64( 0x0001000000000000 ); 5260 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 5261 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 5262 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 5263 doubleZSig0 = zSig0<<1; 5264 mul64To128( zSig0, zSig0, &term0, &term1 ); 5265 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 5266 while ( (sbits64) rem0 < 0 ) { 5267 --zSig0; 5268 doubleZSig0 -= 2; 5269 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 5270 } 5271 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 5272 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 5273 if ( zSig1 == 0 ) zSig1 = 1; 5274 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 5275 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5276 mul64To128( zSig1, zSig1, &term2, &term3 ); 5277 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 5278 while ( (sbits64) rem1 < 0 ) { 5279 --zSig1; 5280 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 5281 term3 |= 1; 5282 term2 |= doubleZSig0; 5283 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 5284 } 5285 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5286 } 5287 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 5288 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 ); 5289 5290 } 5291 5292 /* 5293 ------------------------------------------------------------------------------- 5294 Returns 1 if the quadruple-precision floating-point value `a' is equal to 5295 the corresponding value `b', and 0 otherwise. The comparison is performed 5296 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5297 ------------------------------------------------------------------------------- 5298 */ 5299 flag float128_eq( float128 a, float128 b ) 5300 { 5301 5302 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5303 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5304 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5305 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5306 ) { 5307 if ( float128_is_signaling_nan( a ) 5308 || float128_is_signaling_nan( b ) ) { 5309 float_raise( float_flag_invalid ); 5310 } 5311 return 0; 5312 } 5313 return 5314 ( a.low == b.low ) 5315 && ( ( a.high == b.high ) 5316 || ( ( a.low == 0 ) 5317 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) 5318 ); 5319 5320 } 5321 5322 /* 5323 ------------------------------------------------------------------------------- 5324 Returns 1 if the quadruple-precision floating-point value `a' is less than 5325 or equal to the corresponding value `b', and 0 otherwise. The comparison 5326 is performed according to the IEC/IEEE Standard for Binary Floating-Point 5327 Arithmetic. 5328 ------------------------------------------------------------------------------- 5329 */ 5330 flag float128_le( float128 a, float128 b ) 5331 { 5332 flag aSign, bSign; 5333 5334 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5335 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5336 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5337 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5338 ) { 5339 float_raise( float_flag_invalid ); 5340 return 0; 5341 } 5342 aSign = extractFloat128Sign( a ); 5343 bSign = extractFloat128Sign( b ); 5344 if ( aSign != bSign ) { 5345 return 5346 aSign 5347 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5348 == 0 ); 5349 } 5350 return 5351 aSign ? le128( b.high, b.low, a.high, a.low ) 5352 : le128( a.high, a.low, b.high, b.low ); 5353 5354 } 5355 5356 /* 5357 ------------------------------------------------------------------------------- 5358 Returns 1 if the quadruple-precision floating-point value `a' is less than 5359 the corresponding value `b', and 0 otherwise. The comparison is performed 5360 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5361 ------------------------------------------------------------------------------- 5362 */ 5363 flag float128_lt( float128 a, float128 b ) 5364 { 5365 flag aSign, bSign; 5366 5367 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5368 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5369 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5370 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5371 ) { 5372 float_raise( float_flag_invalid ); 5373 return 0; 5374 } 5375 aSign = extractFloat128Sign( a ); 5376 bSign = extractFloat128Sign( b ); 5377 if ( aSign != bSign ) { 5378 return 5379 aSign 5380 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5381 != 0 ); 5382 } 5383 return 5384 aSign ? lt128( b.high, b.low, a.high, a.low ) 5385 : lt128( a.high, a.low, b.high, b.low ); 5386 5387 } 5388 5389 /* 5390 ------------------------------------------------------------------------------- 5391 Returns 1 if the quadruple-precision floating-point value `a' is equal to 5392 the corresponding value `b', and 0 otherwise. The invalid exception is 5393 raised if either operand is a NaN. Otherwise, the comparison is performed 5394 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5395 ------------------------------------------------------------------------------- 5396 */ 5397 flag float128_eq_signaling( float128 a, float128 b ) 5398 { 5399 5400 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5401 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5402 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5403 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5404 ) { 5405 float_raise( float_flag_invalid ); 5406 return 0; 5407 } 5408 return 5409 ( a.low == b.low ) 5410 && ( ( a.high == b.high ) 5411 || ( ( a.low == 0 ) 5412 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) 5413 ); 5414 5415 } 5416 5417 /* 5418 ------------------------------------------------------------------------------- 5419 Returns 1 if the quadruple-precision floating-point value `a' is less than 5420 or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5421 cause an exception. Otherwise, the comparison is performed according to the 5422 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5423 ------------------------------------------------------------------------------- 5424 */ 5425 flag float128_le_quiet( float128 a, float128 b ) 5426 { 5427 flag aSign, bSign; 5428 5429 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5430 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5431 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5432 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5433 ) { 5434 if ( float128_is_signaling_nan( a ) 5435 || float128_is_signaling_nan( b ) ) { 5436 float_raise( float_flag_invalid ); 5437 } 5438 return 0; 5439 } 5440 aSign = extractFloat128Sign( a ); 5441 bSign = extractFloat128Sign( b ); 5442 if ( aSign != bSign ) { 5443 return 5444 aSign 5445 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5446 == 0 ); 5447 } 5448 return 5449 aSign ? le128( b.high, b.low, a.high, a.low ) 5450 : le128( a.high, a.low, b.high, b.low ); 5451 5452 } 5453 5454 /* 5455 ------------------------------------------------------------------------------- 5456 Returns 1 if the quadruple-precision floating-point value `a' is less than 5457 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 5458 exception. Otherwise, the comparison is performed according to the IEC/IEEE 5459 Standard for Binary Floating-Point Arithmetic. 5460 ------------------------------------------------------------------------------- 5461 */ 5462 flag float128_lt_quiet( float128 a, float128 b ) 5463 { 5464 flag aSign, bSign; 5465 5466 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5467 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5468 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5469 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5470 ) { 5471 if ( float128_is_signaling_nan( a ) 5472 || float128_is_signaling_nan( b ) ) { 5473 float_raise( float_flag_invalid ); 5474 } 5475 return 0; 5476 } 5477 aSign = extractFloat128Sign( a ); 5478 bSign = extractFloat128Sign( b ); 5479 if ( aSign != bSign ) { 5480 return 5481 aSign 5482 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5483 != 0 ); 5484 } 5485 return 5486 aSign ? lt128( b.high, b.low, a.high, a.low ) 5487 : lt128( a.high, a.low, b.high, b.low ); 5488 5489 } 5490 5491 #endif 5492 5493 5494 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS) 5495 5496 /* 5497 * These two routines are not part of the original softfloat distribution. 5498 * 5499 * They are based on the corresponding conversions to integer but return 5500 * unsigned numbers instead since these functions are required by GCC. 5501 * 5502 * Added by Mark Brinicombe <mark@NetBSD.org> 27/09/97 5503 * 5504 * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15] 5505 */ 5506 5507 /* 5508 ------------------------------------------------------------------------------- 5509 Returns the result of converting the double-precision floating-point value 5510 `a' to the 32-bit unsigned integer format. The conversion is 5511 performed according to the IEC/IEEE Standard for Binary Floating-point 5512 Arithmetic, except that the conversion is always rounded toward zero. If 5513 `a' is a NaN, the largest positive integer is returned. If the conversion 5514 overflows, the largest integer positive is returned. 5515 ------------------------------------------------------------------------------- 5516 */ 5517 uint32 float64_to_uint32_round_to_zero( float64 a ) 5518 { 5519 flag aSign; 5520 int16 aExp, shiftCount; 5521 bits64 aSig, savedASig; 5522 uint32 z; 5523 5524 aSig = extractFloat64Frac( a ); 5525 aExp = extractFloat64Exp( a ); 5526 aSign = extractFloat64Sign( a ); 5527 5528 if (aSign) { 5529 float_raise( float_flag_invalid ); 5530 return(0); 5531 } 5532 5533 if ( 0x41E < aExp ) { 5534 float_raise( float_flag_invalid ); 5535 return 0xffffffff; 5536 } 5537 else if ( aExp < 0x3FF ) { 5538 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact; 5539 return 0; 5540 } 5541 aSig |= LIT64( 0x0010000000000000 ); 5542 shiftCount = 0x433 - aExp; 5543 savedASig = aSig; 5544 aSig >>= shiftCount; 5545 z = aSig; 5546 if ( ( aSig<<shiftCount ) != savedASig ) { 5547 float_exception_flags |= float_flag_inexact; 5548 } 5549 return z; 5550 5551 } 5552 5553 /* 5554 ------------------------------------------------------------------------------- 5555 Returns the result of converting the single-precision floating-point value 5556 `a' to the 32-bit unsigned integer format. The conversion is 5557 performed according to the IEC/IEEE Standard for Binary Floating-point 5558 Arithmetic, except that the conversion is always rounded toward zero. If 5559 `a' is a NaN, the largest positive integer is returned. If the conversion 5560 overflows, the largest positive integer is returned. 5561 ------------------------------------------------------------------------------- 5562 */ 5563 uint32 float32_to_uint32_round_to_zero( float32 a ) 5564 { 5565 flag aSign; 5566 int16 aExp, shiftCount; 5567 bits32 aSig; 5568 uint32 z; 5569 5570 aSig = extractFloat32Frac( a ); 5571 aExp = extractFloat32Exp( a ); 5572 aSign = extractFloat32Sign( a ); 5573 shiftCount = aExp - 0x9E; 5574 5575 if (aSign) { 5576 float_raise( float_flag_invalid ); 5577 return(0); 5578 } 5579 if ( 0 < shiftCount ) { 5580 float_raise( float_flag_invalid ); 5581 return 0xFFFFFFFF; 5582 } 5583 else if ( aExp <= 0x7E ) { 5584 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact; 5585 return 0; 5586 } 5587 aSig = ( aSig | 0x800000 )<<8; 5588 z = aSig>>( - shiftCount ); 5589 if ( aSig<<( shiftCount & 31 ) ) { 5590 float_exception_flags |= float_flag_inexact; 5591 } 5592 return z; 5593 5594 } 5595 5596 #endif 5597