1 /* $NetBSD: softfloat.c,v 1.8 2011/07/10 04:52:23 matt Exp $ */ 2 3 /* 4 * This version hacked for use with gcc -msoft-float by bjh21. 5 * (Mostly a case of #ifdefing out things GCC doesn't need or provides 6 * itself). 7 */ 8 9 /* 10 * Things you may want to define: 11 * 12 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with 13 * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them 14 * properly renamed. 15 */ 16 17 /* 18 =============================================================================== 19 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 #include <sys/cdefs.h> 48 #ifdef SOFTFLOAT_FOR_GCC 49 #include "softfloat-for-gcc.h" 50 #endif 51 52 #include "milieu.h" 53 #include "softfloat.h" 54 55 /* 56 * Conversions between floats as stored in memory and floats as 57 * SoftFloat uses them 58 */ 59 #ifndef FLOAT64_DEMANGLE 60 #define FLOAT64_DEMANGLE(a) (a) 61 #endif 62 #ifndef FLOAT64_MANGLE 63 #define FLOAT64_MANGLE(a) (a) 64 #endif 65 66 /* 67 ------------------------------------------------------------------------------- 68 Floating-point rounding mode, extended double-precision rounding precision, 69 and exception flags. 70 ------------------------------------------------------------------------------- 71 */ 72 int float_rounding_mode = float_round_nearest_even; 73 int float_exception_flags = 0; 74 #ifdef FLOATX80 75 int8 floatx80_rounding_precision = 80; 76 #endif 77 78 /* 79 ------------------------------------------------------------------------------- 80 Primitive arithmetic functions, including multi-word arithmetic, and 81 division and square root approximations. (Can be specialized to target if 82 desired.) 83 ------------------------------------------------------------------------------- 84 */ 85 #include "softfloat-macros" 86 87 /* 88 ------------------------------------------------------------------------------- 89 Functions and definitions to determine: (1) whether tininess for underflow 90 is detected before or after rounding by default, (2) what (if anything) 91 happens when exceptions are raised, (3) how signaling NaNs are distinguished 92 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 93 are propagated from function inputs to output. These details are target- 94 specific. 95 ------------------------------------------------------------------------------- 96 */ 97 #include "softfloat-specialize" 98 99 #if !defined(SOFTFLOAT_FOR_GCC) || defined(FLOATX80) || defined(FLOAT128) 100 /* 101 ------------------------------------------------------------------------------- 102 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 103 and 7, and returns the properly rounded 32-bit integer corresponding to the 104 input. If `zSign' is 1, the input is negated before being converted to an 105 integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 106 is simply rounded to an integer, with the inexact exception raised if the 107 input cannot be represented exactly as an integer. However, if the fixed- 108 point input is too large, the invalid exception is raised and the largest 109 positive or negative integer is returned. 110 ------------------------------------------------------------------------------- 111 */ 112 static int32 roundAndPackInt32( flag zSign, bits64 absZ ) 113 { 114 int8 roundingMode; 115 flag roundNearestEven; 116 int8 roundIncrement, roundBits; 117 int32 z; 118 119 roundingMode = float_rounding_mode; 120 roundNearestEven = ( roundingMode == float_round_nearest_even ); 121 roundIncrement = 0x40; 122 if ( ! roundNearestEven ) { 123 if ( roundingMode == float_round_to_zero ) { 124 roundIncrement = 0; 125 } 126 else { 127 roundIncrement = 0x7F; 128 if ( zSign ) { 129 if ( roundingMode == float_round_up ) roundIncrement = 0; 130 } 131 else { 132 if ( roundingMode == float_round_down ) roundIncrement = 0; 133 } 134 } 135 } 136 roundBits = absZ & 0x7F; 137 absZ = ( absZ + roundIncrement )>>7; 138 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 139 z = absZ; 140 if ( zSign ) z = - z; 141 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 142 float_raise( float_flag_invalid ); 143 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 144 } 145 if ( roundBits ) float_exception_flags |= float_flag_inexact; 146 return z; 147 148 } 149 150 /* 151 ------------------------------------------------------------------------------- 152 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 153 `absZ1', with binary point between bits 63 and 64 (between the input words), 154 and returns the properly rounded 64-bit integer corresponding to the input. 155 If `zSign' is 1, the input is negated before being converted to an integer. 156 Ordinarily, the fixed-point input is simply rounded to an integer, with 157 the inexact exception raised if the input cannot be represented exactly as 158 an integer. However, if the fixed-point input is too large, the invalid 159 exception is raised and the largest positive or negative integer is 160 returned. 161 ------------------------------------------------------------------------------- 162 */ 163 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 ) 164 { 165 int8 roundingMode; 166 flag roundNearestEven, increment; 167 int64 z; 168 169 roundingMode = float_rounding_mode; 170 roundNearestEven = ( roundingMode == float_round_nearest_even ); 171 increment = ( (sbits64) absZ1 < 0 ); 172 if ( ! roundNearestEven ) { 173 if ( roundingMode == float_round_to_zero ) { 174 increment = 0; 175 } 176 else { 177 if ( zSign ) { 178 increment = ( roundingMode == float_round_down ) && absZ1; 179 } 180 else { 181 increment = ( roundingMode == float_round_up ) && absZ1; 182 } 183 } 184 } 185 if ( increment ) { 186 ++absZ0; 187 if ( absZ0 == 0 ) goto overflow; 188 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 189 } 190 z = absZ0; 191 if ( zSign ) z = - z; 192 if ( z && ( ( z < 0 ) ^ zSign ) ) { 193 overflow: 194 float_raise( float_flag_invalid ); 195 return 196 zSign ? (sbits64) LIT64( 0x8000000000000000 ) 197 : LIT64( 0x7FFFFFFFFFFFFFFF ); 198 } 199 if ( absZ1 ) float_exception_flags |= float_flag_inexact; 200 return z; 201 202 } 203 #endif 204 205 /* 206 ------------------------------------------------------------------------------- 207 Returns the fraction bits of the single-precision floating-point value `a'. 208 ------------------------------------------------------------------------------- 209 */ 210 INLINE bits32 extractFloat32Frac( float32 a ) 211 { 212 213 return a & 0x007FFFFF; 214 215 } 216 217 /* 218 ------------------------------------------------------------------------------- 219 Returns the exponent bits of the single-precision floating-point value `a'. 220 ------------------------------------------------------------------------------- 221 */ 222 INLINE int16 extractFloat32Exp( float32 a ) 223 { 224 225 return ( a>>23 ) & 0xFF; 226 227 } 228 229 /* 230 ------------------------------------------------------------------------------- 231 Returns the sign bit of the single-precision floating-point value `a'. 232 ------------------------------------------------------------------------------- 233 */ 234 INLINE flag extractFloat32Sign( float32 a ) 235 { 236 237 return a>>31; 238 239 } 240 241 /* 242 ------------------------------------------------------------------------------- 243 Normalizes the subnormal single-precision floating-point value represented 244 by the denormalized significand `aSig'. The normalized exponent and 245 significand are stored at the locations pointed to by `zExpPtr' and 246 `zSigPtr', respectively. 247 ------------------------------------------------------------------------------- 248 */ 249 static void 250 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr ) 251 { 252 int8 shiftCount; 253 254 shiftCount = countLeadingZeros32( aSig ) - 8; 255 *zSigPtr = aSig<<shiftCount; 256 *zExpPtr = 1 - shiftCount; 257 258 } 259 260 /* 261 ------------------------------------------------------------------------------- 262 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 263 single-precision floating-point value, returning the result. After being 264 shifted into the proper positions, the three fields are simply added 265 together to form the result. This means that any integer portion of `zSig' 266 will be added into the exponent. Since a properly normalized significand 267 will have an integer portion equal to 1, the `zExp' input should be 1 less 268 than the desired result exponent whenever `zSig' is a complete, normalized 269 significand. 270 ------------------------------------------------------------------------------- 271 */ 272 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig ) 273 { 274 275 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig; 276 277 } 278 279 /* 280 ------------------------------------------------------------------------------- 281 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 282 and significand `zSig', and returns the proper single-precision floating- 283 point value corresponding to the abstract input. Ordinarily, the abstract 284 value is simply rounded and packed into the single-precision format, with 285 the inexact exception raised if the abstract input cannot be represented 286 exactly. However, if the abstract value is too large, the overflow and 287 inexact exceptions are raised and an infinity or maximal finite value is 288 returned. If the abstract value is too small, the input value is rounded to 289 a subnormal number, and the underflow and inexact exceptions are raised if 290 the abstract input cannot be represented exactly as a subnormal single- 291 precision floating-point number. 292 The input significand `zSig' has its binary point between bits 30 293 and 29, which is 7 bits to the left of the usual location. This shifted 294 significand must be normalized or smaller. If `zSig' is not normalized, 295 `zExp' must be 0; in that case, the result returned is a subnormal number, 296 and it must not require rounding. In the usual case that `zSig' is 297 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 298 The handling of underflow and overflow follows the IEC/IEEE Standard for 299 Binary Floating-Point Arithmetic. 300 ------------------------------------------------------------------------------- 301 */ 302 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) 303 { 304 int8 roundingMode; 305 flag roundNearestEven; 306 int8 roundIncrement, roundBits; 307 flag isTiny; 308 309 roundingMode = float_rounding_mode; 310 roundNearestEven = ( roundingMode == float_round_nearest_even ); 311 roundIncrement = 0x40; 312 if ( ! roundNearestEven ) { 313 if ( roundingMode == float_round_to_zero ) { 314 roundIncrement = 0; 315 } 316 else { 317 roundIncrement = 0x7F; 318 if ( zSign ) { 319 if ( roundingMode == float_round_up ) roundIncrement = 0; 320 } 321 else { 322 if ( roundingMode == float_round_down ) roundIncrement = 0; 323 } 324 } 325 } 326 roundBits = zSig & 0x7F; 327 if ( 0xFD <= (bits16) zExp ) { 328 if ( ( 0xFD < zExp ) 329 || ( ( zExp == 0xFD ) 330 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) ) 331 ) { 332 float_raise( float_flag_overflow | float_flag_inexact ); 333 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 ); 334 } 335 if ( zExp < 0 ) { 336 isTiny = 337 ( float_detect_tininess == float_tininess_before_rounding ) 338 || ( zExp < -1 ) 339 || ( zSig + roundIncrement < 0x80000000 ); 340 shift32RightJamming( zSig, - zExp, &zSig ); 341 zExp = 0; 342 roundBits = zSig & 0x7F; 343 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 344 } 345 } 346 if ( roundBits ) float_exception_flags |= float_flag_inexact; 347 zSig = ( zSig + roundIncrement )>>7; 348 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 349 if ( zSig == 0 ) zExp = 0; 350 return packFloat32( zSign, zExp, zSig ); 351 352 } 353 354 /* 355 ------------------------------------------------------------------------------- 356 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 357 and significand `zSig', and returns the proper single-precision floating- 358 point value corresponding to the abstract input. This routine is just like 359 `roundAndPackFloat32' except that `zSig' does not have to be normalized. 360 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 361 floating-point exponent. 362 ------------------------------------------------------------------------------- 363 */ 364 static float32 365 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) 366 { 367 int8 shiftCount; 368 369 shiftCount = countLeadingZeros32( zSig ) - 1; 370 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount ); 371 372 } 373 374 /* 375 ------------------------------------------------------------------------------- 376 Returns the fraction bits of the double-precision floating-point value `a'. 377 ------------------------------------------------------------------------------- 378 */ 379 INLINE bits64 extractFloat64Frac( float64 a ) 380 { 381 382 return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF ); 383 384 } 385 386 /* 387 ------------------------------------------------------------------------------- 388 Returns the exponent bits of the double-precision floating-point value `a'. 389 ------------------------------------------------------------------------------- 390 */ 391 INLINE int16 extractFloat64Exp( float64 a ) 392 { 393 394 return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF; 395 396 } 397 398 /* 399 ------------------------------------------------------------------------------- 400 Returns the sign bit of the double-precision floating-point value `a'. 401 ------------------------------------------------------------------------------- 402 */ 403 INLINE flag extractFloat64Sign( float64 a ) 404 { 405 406 return FLOAT64_DEMANGLE(a)>>63; 407 408 } 409 410 /* 411 ------------------------------------------------------------------------------- 412 Normalizes the subnormal double-precision floating-point value represented 413 by the denormalized significand `aSig'. The normalized exponent and 414 significand are stored at the locations pointed to by `zExpPtr' and 415 `zSigPtr', respectively. 416 ------------------------------------------------------------------------------- 417 */ 418 static void 419 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr ) 420 { 421 int8 shiftCount; 422 423 shiftCount = countLeadingZeros64( aSig ) - 11; 424 *zSigPtr = aSig<<shiftCount; 425 *zExpPtr = 1 - shiftCount; 426 427 } 428 429 /* 430 ------------------------------------------------------------------------------- 431 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 432 double-precision floating-point value, returning the result. After being 433 shifted into the proper positions, the three fields are simply added 434 together to form the result. This means that any integer portion of `zSig' 435 will be added into the exponent. Since a properly normalized significand 436 will have an integer portion equal to 1, the `zExp' input should be 1 less 437 than the desired result exponent whenever `zSig' is a complete, normalized 438 significand. 439 ------------------------------------------------------------------------------- 440 */ 441 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig ) 442 { 443 444 return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) + 445 ( ( (bits64) zExp )<<52 ) + zSig ); 446 447 } 448 449 /* 450 ------------------------------------------------------------------------------- 451 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 452 and significand `zSig', and returns the proper double-precision floating- 453 point value corresponding to the abstract input. Ordinarily, the abstract 454 value is simply rounded and packed into the double-precision format, with 455 the inexact exception raised if the abstract input cannot be represented 456 exactly. However, if the abstract value is too large, the overflow and 457 inexact exceptions are raised and an infinity or maximal finite value is 458 returned. If the abstract value is too small, the input value is rounded to 459 a subnormal number, and the underflow and inexact exceptions are raised if 460 the abstract input cannot be represented exactly as a subnormal double- 461 precision floating-point number. 462 The input significand `zSig' has its binary point between bits 62 463 and 61, which is 10 bits to the left of the usual location. This shifted 464 significand must be normalized or smaller. If `zSig' is not normalized, 465 `zExp' must be 0; in that case, the result returned is a subnormal number, 466 and it must not require rounding. In the usual case that `zSig' is 467 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 468 The handling of underflow and overflow follows the IEC/IEEE Standard for 469 Binary Floating-Point Arithmetic. 470 ------------------------------------------------------------------------------- 471 */ 472 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig ) 473 { 474 int8 roundingMode; 475 flag roundNearestEven; 476 int16 roundIncrement, roundBits; 477 flag isTiny; 478 479 roundingMode = float_rounding_mode; 480 roundNearestEven = ( roundingMode == float_round_nearest_even ); 481 roundIncrement = 0x200; 482 if ( ! roundNearestEven ) { 483 if ( roundingMode == float_round_to_zero ) { 484 roundIncrement = 0; 485 } 486 else { 487 roundIncrement = 0x3FF; 488 if ( zSign ) { 489 if ( roundingMode == float_round_up ) roundIncrement = 0; 490 } 491 else { 492 if ( roundingMode == float_round_down ) roundIncrement = 0; 493 } 494 } 495 } 496 roundBits = zSig & 0x3FF; 497 if ( 0x7FD <= (bits16) zExp ) { 498 if ( ( 0x7FD < zExp ) 499 || ( ( zExp == 0x7FD ) 500 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) ) 501 ) { 502 float_raise( float_flag_overflow | float_flag_inexact ); 503 return FLOAT64_MANGLE( 504 FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) - 505 ( roundIncrement == 0 )); 506 } 507 if ( zExp < 0 ) { 508 isTiny = 509 ( float_detect_tininess == float_tininess_before_rounding ) 510 || ( zExp < -1 ) 511 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 512 shift64RightJamming( zSig, - zExp, &zSig ); 513 zExp = 0; 514 roundBits = zSig & 0x3FF; 515 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 516 } 517 } 518 if ( roundBits ) float_exception_flags |= float_flag_inexact; 519 zSig = ( zSig + roundIncrement )>>10; 520 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 521 if ( zSig == 0 ) zExp = 0; 522 return packFloat64( zSign, zExp, zSig ); 523 524 } 525 526 /* 527 ------------------------------------------------------------------------------- 528 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 529 and significand `zSig', and returns the proper double-precision floating- 530 point value corresponding to the abstract input. This routine is just like 531 `roundAndPackFloat64' except that `zSig' does not have to be normalized. 532 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 533 floating-point exponent. 534 ------------------------------------------------------------------------------- 535 */ 536 static float64 537 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig ) 538 { 539 int8 shiftCount; 540 541 shiftCount = countLeadingZeros64( zSig ) - 1; 542 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount ); 543 544 } 545 546 #ifdef FLOATX80 547 548 /* 549 ------------------------------------------------------------------------------- 550 Returns the fraction bits of the extended double-precision floating-point 551 value `a'. 552 ------------------------------------------------------------------------------- 553 */ 554 INLINE bits64 extractFloatx80Frac( floatx80 a ) 555 { 556 557 return a.low; 558 559 } 560 561 /* 562 ------------------------------------------------------------------------------- 563 Returns the exponent bits of the extended double-precision floating-point 564 value `a'. 565 ------------------------------------------------------------------------------- 566 */ 567 INLINE int32 extractFloatx80Exp( floatx80 a ) 568 { 569 570 return a.high & 0x7FFF; 571 572 } 573 574 /* 575 ------------------------------------------------------------------------------- 576 Returns the sign bit of the extended double-precision floating-point value 577 `a'. 578 ------------------------------------------------------------------------------- 579 */ 580 INLINE flag extractFloatx80Sign( floatx80 a ) 581 { 582 583 return a.high>>15; 584 585 } 586 587 /* 588 ------------------------------------------------------------------------------- 589 Normalizes the subnormal extended double-precision floating-point value 590 represented by the denormalized significand `aSig'. The normalized exponent 591 and significand are stored at the locations pointed to by `zExpPtr' and 592 `zSigPtr', respectively. 593 ------------------------------------------------------------------------------- 594 */ 595 static void 596 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr ) 597 { 598 int8 shiftCount; 599 600 shiftCount = countLeadingZeros64( aSig ); 601 *zSigPtr = aSig<<shiftCount; 602 *zExpPtr = 1 - shiftCount; 603 604 } 605 606 /* 607 ------------------------------------------------------------------------------- 608 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an 609 extended double-precision floating-point value, returning the result. 610 ------------------------------------------------------------------------------- 611 */ 612 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig ) 613 { 614 floatx80 z; 615 616 z.low = zSig; 617 z.high = ( ( (bits16) zSign )<<15 ) + zExp; 618 return z; 619 620 } 621 622 /* 623 ------------------------------------------------------------------------------- 624 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 625 and extended significand formed by the concatenation of `zSig0' and `zSig1', 626 and returns the proper extended double-precision floating-point value 627 corresponding to the abstract input. Ordinarily, the abstract value is 628 rounded and packed into the extended double-precision format, with the 629 inexact exception raised if the abstract input cannot be represented 630 exactly. However, if the abstract value is too large, the overflow and 631 inexact exceptions are raised and an infinity or maximal finite value is 632 returned. If the abstract value is too small, the input value is rounded to 633 a subnormal number, and the underflow and inexact exceptions are raised if 634 the abstract input cannot be represented exactly as a subnormal extended 635 double-precision floating-point number. 636 If `roundingPrecision' is 32 or 64, the result is rounded to the same 637 number of bits as single or double precision, respectively. Otherwise, the 638 result is rounded to the full precision of the extended double-precision 639 format. 640 The input significand must be normalized or smaller. If the input 641 significand is not normalized, `zExp' must be 0; in that case, the result 642 returned is a subnormal number, and it must not require rounding. The 643 handling of underflow and overflow follows the IEC/IEEE Standard for Binary 644 Floating-Point Arithmetic. 645 ------------------------------------------------------------------------------- 646 */ 647 static floatx80 648 roundAndPackFloatx80( 649 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 650 ) 651 { 652 int8 roundingMode; 653 flag roundNearestEven, increment, isTiny; 654 int64 roundIncrement, roundMask, roundBits; 655 656 roundingMode = float_rounding_mode; 657 roundNearestEven = ( roundingMode == float_round_nearest_even ); 658 if ( roundingPrecision == 80 ) goto precision80; 659 if ( roundingPrecision == 64 ) { 660 roundIncrement = LIT64( 0x0000000000000400 ); 661 roundMask = LIT64( 0x00000000000007FF ); 662 } 663 else if ( roundingPrecision == 32 ) { 664 roundIncrement = LIT64( 0x0000008000000000 ); 665 roundMask = LIT64( 0x000000FFFFFFFFFF ); 666 } 667 else { 668 goto precision80; 669 } 670 zSig0 |= ( zSig1 != 0 ); 671 if ( ! roundNearestEven ) { 672 if ( roundingMode == float_round_to_zero ) { 673 roundIncrement = 0; 674 } 675 else { 676 roundIncrement = roundMask; 677 if ( zSign ) { 678 if ( roundingMode == float_round_up ) roundIncrement = 0; 679 } 680 else { 681 if ( roundingMode == float_round_down ) roundIncrement = 0; 682 } 683 } 684 } 685 roundBits = zSig0 & roundMask; 686 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 687 if ( ( 0x7FFE < zExp ) 688 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 689 ) { 690 goto overflow; 691 } 692 if ( zExp <= 0 ) { 693 isTiny = 694 ( float_detect_tininess == float_tininess_before_rounding ) 695 || ( zExp < 0 ) 696 || ( zSig0 <= zSig0 + roundIncrement ); 697 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 698 zExp = 0; 699 roundBits = zSig0 & roundMask; 700 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 701 if ( roundBits ) float_exception_flags |= float_flag_inexact; 702 zSig0 += roundIncrement; 703 if ( (sbits64) zSig0 < 0 ) zExp = 1; 704 roundIncrement = roundMask + 1; 705 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 706 roundMask |= roundIncrement; 707 } 708 zSig0 &= ~ roundMask; 709 return packFloatx80( zSign, zExp, zSig0 ); 710 } 711 } 712 if ( roundBits ) float_exception_flags |= float_flag_inexact; 713 zSig0 += roundIncrement; 714 if ( zSig0 < roundIncrement ) { 715 ++zExp; 716 zSig0 = LIT64( 0x8000000000000000 ); 717 } 718 roundIncrement = roundMask + 1; 719 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 720 roundMask |= roundIncrement; 721 } 722 zSig0 &= ~ roundMask; 723 if ( zSig0 == 0 ) zExp = 0; 724 return packFloatx80( zSign, zExp, zSig0 ); 725 precision80: 726 increment = ( (sbits64) zSig1 < 0 ); 727 if ( ! roundNearestEven ) { 728 if ( roundingMode == float_round_to_zero ) { 729 increment = 0; 730 } 731 else { 732 if ( zSign ) { 733 increment = ( roundingMode == float_round_down ) && zSig1; 734 } 735 else { 736 increment = ( roundingMode == float_round_up ) && zSig1; 737 } 738 } 739 } 740 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 741 if ( ( 0x7FFE < zExp ) 742 || ( ( zExp == 0x7FFE ) 743 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 744 && increment 745 ) 746 ) { 747 roundMask = 0; 748 overflow: 749 float_raise( float_flag_overflow | float_flag_inexact ); 750 if ( ( roundingMode == float_round_to_zero ) 751 || ( zSign && ( roundingMode == float_round_up ) ) 752 || ( ! zSign && ( roundingMode == float_round_down ) ) 753 ) { 754 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 755 } 756 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 757 } 758 if ( zExp <= 0 ) { 759 isTiny = 760 ( float_detect_tininess == float_tininess_before_rounding ) 761 || ( zExp < 0 ) 762 || ! increment 763 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 764 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 765 zExp = 0; 766 if ( isTiny && zSig1 ) float_raise( float_flag_underflow ); 767 if ( zSig1 ) float_exception_flags |= float_flag_inexact; 768 if ( roundNearestEven ) { 769 increment = ( (sbits64) zSig1 < 0 ); 770 } 771 else { 772 if ( zSign ) { 773 increment = ( roundingMode == float_round_down ) && zSig1; 774 } 775 else { 776 increment = ( roundingMode == float_round_up ) && zSig1; 777 } 778 } 779 if ( increment ) { 780 ++zSig0; 781 zSig0 &= 782 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 783 if ( (sbits64) zSig0 < 0 ) zExp = 1; 784 } 785 return packFloatx80( zSign, zExp, zSig0 ); 786 } 787 } 788 if ( zSig1 ) float_exception_flags |= float_flag_inexact; 789 if ( increment ) { 790 ++zSig0; 791 if ( zSig0 == 0 ) { 792 ++zExp; 793 zSig0 = LIT64( 0x8000000000000000 ); 794 } 795 else { 796 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 797 } 798 } 799 else { 800 if ( zSig0 == 0 ) zExp = 0; 801 } 802 return packFloatx80( zSign, zExp, zSig0 ); 803 804 } 805 806 /* 807 ------------------------------------------------------------------------------- 808 Takes an abstract floating-point value having sign `zSign', exponent 809 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 810 and returns the proper extended double-precision floating-point value 811 corresponding to the abstract input. This routine is just like 812 `roundAndPackFloatx80' except that the input significand does not have to be 813 normalized. 814 ------------------------------------------------------------------------------- 815 */ 816 static floatx80 817 normalizeRoundAndPackFloatx80( 818 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 819 ) 820 { 821 int8 shiftCount; 822 823 if ( zSig0 == 0 ) { 824 zSig0 = zSig1; 825 zSig1 = 0; 826 zExp -= 64; 827 } 828 shiftCount = countLeadingZeros64( zSig0 ); 829 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 830 zExp -= shiftCount; 831 return 832 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 ); 833 834 } 835 836 #endif 837 838 #ifdef FLOAT128 839 840 /* 841 ------------------------------------------------------------------------------- 842 Returns the least-significant 64 fraction bits of the quadruple-precision 843 floating-point value `a'. 844 ------------------------------------------------------------------------------- 845 */ 846 INLINE bits64 extractFloat128Frac1( float128 a ) 847 { 848 849 return a.low; 850 851 } 852 853 /* 854 ------------------------------------------------------------------------------- 855 Returns the most-significant 48 fraction bits of the quadruple-precision 856 floating-point value `a'. 857 ------------------------------------------------------------------------------- 858 */ 859 INLINE bits64 extractFloat128Frac0( float128 a ) 860 { 861 862 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 863 864 } 865 866 /* 867 ------------------------------------------------------------------------------- 868 Returns the exponent bits of the quadruple-precision floating-point value 869 `a'. 870 ------------------------------------------------------------------------------- 871 */ 872 INLINE int32 extractFloat128Exp( float128 a ) 873 { 874 875 return ( a.high>>48 ) & 0x7FFF; 876 877 } 878 879 /* 880 ------------------------------------------------------------------------------- 881 Returns the sign bit of the quadruple-precision floating-point value `a'. 882 ------------------------------------------------------------------------------- 883 */ 884 INLINE flag extractFloat128Sign( float128 a ) 885 { 886 887 return a.high>>63; 888 889 } 890 891 /* 892 ------------------------------------------------------------------------------- 893 Normalizes the subnormal quadruple-precision floating-point value 894 represented by the denormalized significand formed by the concatenation of 895 `aSig0' and `aSig1'. The normalized exponent is stored at the location 896 pointed to by `zExpPtr'. The most significant 49 bits of the normalized 897 significand are stored at the location pointed to by `zSig0Ptr', and the 898 least significant 64 bits of the normalized significand are stored at the 899 location pointed to by `zSig1Ptr'. 900 ------------------------------------------------------------------------------- 901 */ 902 static void 903 normalizeFloat128Subnormal( 904 bits64 aSig0, 905 bits64 aSig1, 906 int32 *zExpPtr, 907 bits64 *zSig0Ptr, 908 bits64 *zSig1Ptr 909 ) 910 { 911 int8 shiftCount; 912 913 if ( aSig0 == 0 ) { 914 shiftCount = countLeadingZeros64( aSig1 ) - 15; 915 if ( shiftCount < 0 ) { 916 *zSig0Ptr = aSig1>>( - shiftCount ); 917 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 918 } 919 else { 920 *zSig0Ptr = aSig1<<shiftCount; 921 *zSig1Ptr = 0; 922 } 923 *zExpPtr = - shiftCount - 63; 924 } 925 else { 926 shiftCount = countLeadingZeros64( aSig0 ) - 15; 927 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 928 *zExpPtr = 1 - shiftCount; 929 } 930 931 } 932 933 /* 934 ------------------------------------------------------------------------------- 935 Packs the sign `zSign', the exponent `zExp', and the significand formed 936 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 937 floating-point value, returning the result. After being shifted into the 938 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 939 added together to form the most significant 32 bits of the result. This 940 means that any integer portion of `zSig0' will be added into the exponent. 941 Since a properly normalized significand will have an integer portion equal 942 to 1, the `zExp' input should be 1 less than the desired result exponent 943 whenever `zSig0' and `zSig1' concatenated form a complete, normalized 944 significand. 945 ------------------------------------------------------------------------------- 946 */ 947 INLINE float128 948 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 ) 949 { 950 float128 z; 951 952 z.low = zSig1; 953 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0; 954 return z; 955 956 } 957 958 /* 959 ------------------------------------------------------------------------------- 960 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 961 and extended significand formed by the concatenation of `zSig0', `zSig1', 962 and `zSig2', and returns the proper quadruple-precision floating-point value 963 corresponding to the abstract input. Ordinarily, the abstract value is 964 simply rounded and packed into the quadruple-precision format, with the 965 inexact exception raised if the abstract input cannot be represented 966 exactly. However, if the abstract value is too large, the overflow and 967 inexact exceptions are raised and an infinity or maximal finite value is 968 returned. If the abstract value is too small, the input value is rounded to 969 a subnormal number, and the underflow and inexact exceptions are raised if 970 the abstract input cannot be represented exactly as a subnormal quadruple- 971 precision floating-point number. 972 The input significand must be normalized or smaller. If the input 973 significand is not normalized, `zExp' must be 0; in that case, the result 974 returned is a subnormal number, and it must not require rounding. In the 975 usual case that the input significand is normalized, `zExp' must be 1 less 976 than the ``true'' floating-point exponent. The handling of underflow and 977 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 978 ------------------------------------------------------------------------------- 979 */ 980 static float128 981 roundAndPackFloat128( 982 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 ) 983 { 984 int8 roundingMode; 985 flag roundNearestEven, increment, isTiny; 986 987 roundingMode = float_rounding_mode; 988 roundNearestEven = ( roundingMode == float_round_nearest_even ); 989 increment = ( (sbits64) zSig2 < 0 ); 990 if ( ! roundNearestEven ) { 991 if ( roundingMode == float_round_to_zero ) { 992 increment = 0; 993 } 994 else { 995 if ( zSign ) { 996 increment = ( roundingMode == float_round_down ) && zSig2; 997 } 998 else { 999 increment = ( roundingMode == float_round_up ) && zSig2; 1000 } 1001 } 1002 } 1003 if ( 0x7FFD <= (bits32) zExp ) { 1004 if ( ( 0x7FFD < zExp ) 1005 || ( ( zExp == 0x7FFD ) 1006 && eq128( 1007 LIT64( 0x0001FFFFFFFFFFFF ), 1008 LIT64( 0xFFFFFFFFFFFFFFFF ), 1009 zSig0, 1010 zSig1 1011 ) 1012 && increment 1013 ) 1014 ) { 1015 float_raise( float_flag_overflow | float_flag_inexact ); 1016 if ( ( roundingMode == float_round_to_zero ) 1017 || ( zSign && ( roundingMode == float_round_up ) ) 1018 || ( ! zSign && ( roundingMode == float_round_down ) ) 1019 ) { 1020 return 1021 packFloat128( 1022 zSign, 1023 0x7FFE, 1024 LIT64( 0x0000FFFFFFFFFFFF ), 1025 LIT64( 0xFFFFFFFFFFFFFFFF ) 1026 ); 1027 } 1028 return packFloat128( zSign, 0x7FFF, 0, 0 ); 1029 } 1030 if ( zExp < 0 ) { 1031 isTiny = 1032 ( float_detect_tininess == float_tininess_before_rounding ) 1033 || ( zExp < -1 ) 1034 || ! increment 1035 || lt128( 1036 zSig0, 1037 zSig1, 1038 LIT64( 0x0001FFFFFFFFFFFF ), 1039 LIT64( 0xFFFFFFFFFFFFFFFF ) 1040 ); 1041 shift128ExtraRightJamming( 1042 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 1043 zExp = 0; 1044 if ( isTiny && zSig2 ) float_raise( float_flag_underflow ); 1045 if ( roundNearestEven ) { 1046 increment = ( (sbits64) zSig2 < 0 ); 1047 } 1048 else { 1049 if ( zSign ) { 1050 increment = ( roundingMode == float_round_down ) && zSig2; 1051 } 1052 else { 1053 increment = ( roundingMode == float_round_up ) && zSig2; 1054 } 1055 } 1056 } 1057 } 1058 if ( zSig2 ) float_exception_flags |= float_flag_inexact; 1059 if ( increment ) { 1060 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 1061 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 1062 } 1063 else { 1064 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 1065 } 1066 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1067 1068 } 1069 1070 /* 1071 ------------------------------------------------------------------------------- 1072 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1073 and significand formed by the concatenation of `zSig0' and `zSig1', and 1074 returns the proper quadruple-precision floating-point value corresponding 1075 to the abstract input. This routine is just like `roundAndPackFloat128' 1076 except that the input significand has fewer bits and does not have to be 1077 normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 1078 point exponent. 1079 ------------------------------------------------------------------------------- 1080 */ 1081 static float128 1082 normalizeRoundAndPackFloat128( 1083 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 ) 1084 { 1085 int8 shiftCount; 1086 bits64 zSig2; 1087 1088 if ( zSig0 == 0 ) { 1089 zSig0 = zSig1; 1090 zSig1 = 0; 1091 zExp -= 64; 1092 } 1093 shiftCount = countLeadingZeros64( zSig0 ) - 15; 1094 if ( 0 <= shiftCount ) { 1095 zSig2 = 0; 1096 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1097 } 1098 else { 1099 shift128ExtraRightJamming( 1100 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 1101 } 1102 zExp -= shiftCount; 1103 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 1104 1105 } 1106 1107 #endif 1108 1109 /* 1110 ------------------------------------------------------------------------------- 1111 Returns the result of converting the 32-bit two's complement integer `a' 1112 to the single-precision floating-point format. The conversion is performed 1113 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1114 ------------------------------------------------------------------------------- 1115 */ 1116 float32 int32_to_float32( int32 a ) 1117 { 1118 flag zSign; 1119 1120 if ( a == 0 ) return 0; 1121 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 ); 1122 zSign = ( a < 0 ); 1123 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a ); 1124 1125 } 1126 1127 #ifndef SOFTFLOAT_FOR_GCC /* __floatunsisf is in libgcc */ 1128 float32 uint32_to_float32( uint32 a ) 1129 { 1130 if ( a == 0 ) return 0; 1131 if ( a & (bits32) 0x80000000 ) 1132 return normalizeRoundAndPackFloat32( 0, 0x9D, a >> 1 ); 1133 return normalizeRoundAndPackFloat32( 0, 0x9C, a ); 1134 } 1135 #endif 1136 1137 1138 /* 1139 ------------------------------------------------------------------------------- 1140 Returns the result of converting the 32-bit two's complement integer `a' 1141 to the double-precision floating-point format. The conversion is performed 1142 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1143 ------------------------------------------------------------------------------- 1144 */ 1145 float64 int32_to_float64( int32 a ) 1146 { 1147 flag zSign; 1148 uint32 absA; 1149 int8 shiftCount; 1150 bits64 zSig; 1151 1152 if ( a == 0 ) return 0; 1153 zSign = ( a < 0 ); 1154 absA = zSign ? - a : a; 1155 shiftCount = countLeadingZeros32( absA ) + 21; 1156 zSig = absA; 1157 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount ); 1158 1159 } 1160 1161 #ifndef SOFTFLOAT_FOR_GCC /* __floatunsidf is in libgcc */ 1162 float64 uint32_to_float64( uint32 a ) 1163 { 1164 int8 shiftCount; 1165 bits64 zSig = a; 1166 1167 if ( a == 0 ) return 0; 1168 shiftCount = countLeadingZeros32( a ) + 21; 1169 return packFloat64( 0, 0x432 - shiftCount, zSig<<shiftCount ); 1170 1171 } 1172 #endif 1173 1174 #ifdef FLOATX80 1175 1176 /* 1177 ------------------------------------------------------------------------------- 1178 Returns the result of converting the 32-bit two's complement integer `a' 1179 to the extended double-precision floating-point format. The conversion 1180 is performed according to the IEC/IEEE Standard for Binary Floating-Point 1181 Arithmetic. 1182 ------------------------------------------------------------------------------- 1183 */ 1184 floatx80 int32_to_floatx80( int32 a ) 1185 { 1186 flag zSign; 1187 uint32 absA; 1188 int8 shiftCount; 1189 bits64 zSig; 1190 1191 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1192 zSign = ( a < 0 ); 1193 absA = zSign ? - a : a; 1194 shiftCount = countLeadingZeros32( absA ) + 32; 1195 zSig = absA; 1196 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 1197 1198 } 1199 1200 floatx80 uint32_to_floatx80( uint32 a ) 1201 { 1202 int8 shiftCount; 1203 bits64 zSig = a; 1204 1205 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1206 shiftCount = countLeadingZeros32( a ) + 32; 1207 return packFloatx80( 0, 0x403E - shiftCount, zSig<<shiftCount ); 1208 1209 } 1210 1211 #endif 1212 1213 #ifdef FLOAT128 1214 1215 /* 1216 ------------------------------------------------------------------------------- 1217 Returns the result of converting the 32-bit two's complement integer `a' to 1218 the quadruple-precision floating-point format. The conversion is performed 1219 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1220 ------------------------------------------------------------------------------- 1221 */ 1222 float128 int32_to_float128( int32 a ) 1223 { 1224 flag zSign; 1225 uint32 absA; 1226 int8 shiftCount; 1227 bits64 zSig0; 1228 1229 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1230 zSign = ( a < 0 ); 1231 absA = zSign ? - a : a; 1232 shiftCount = countLeadingZeros32( absA ) + 17; 1233 zSig0 = absA; 1234 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 1235 1236 } 1237 1238 float128 uint32_to_float128( uint32 a ) 1239 { 1240 int8 shiftCount; 1241 bits64 zSig0 = a; 1242 1243 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1244 shiftCount = countLeadingZeros32( a ) + 17; 1245 return packFloat128( 0, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 1246 1247 } 1248 1249 #endif 1250 1251 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */ 1252 /* 1253 ------------------------------------------------------------------------------- 1254 Returns the result of converting the 64-bit two's complement integer `a' 1255 to the single-precision floating-point format. The conversion is performed 1256 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1257 ------------------------------------------------------------------------------- 1258 */ 1259 float32 int64_to_float32( int64 a ) 1260 { 1261 flag zSign; 1262 uint64 absA; 1263 int8 shiftCount; 1264 1265 if ( a == 0 ) return 0; 1266 zSign = ( a < 0 ); 1267 absA = zSign ? - a : a; 1268 shiftCount = countLeadingZeros64( absA ) - 40; 1269 if ( 0 <= shiftCount ) { 1270 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount ); 1271 } 1272 else { 1273 shiftCount += 7; 1274 if ( shiftCount < 0 ) { 1275 shift64RightJamming( absA, - shiftCount, &absA ); 1276 } 1277 else { 1278 absA <<= shiftCount; 1279 } 1280 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA ); 1281 } 1282 1283 } 1284 1285 /* 1286 ------------------------------------------------------------------------------- 1287 Returns the result of converting the 64-bit two's complement integer `a' 1288 to the double-precision floating-point format. The conversion is performed 1289 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1290 ------------------------------------------------------------------------------- 1291 */ 1292 float64 int64_to_float64( int64 a ) 1293 { 1294 flag zSign; 1295 1296 if ( a == 0 ) return 0; 1297 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) { 1298 return packFloat64( 1, 0x43E, 0 ); 1299 } 1300 zSign = ( a < 0 ); 1301 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a ); 1302 1303 } 1304 1305 #ifdef FLOATX80 1306 1307 /* 1308 ------------------------------------------------------------------------------- 1309 Returns the result of converting the 64-bit two's complement integer `a' 1310 to the extended double-precision floating-point format. The conversion 1311 is performed according to the IEC/IEEE Standard for Binary Floating-Point 1312 Arithmetic. 1313 ------------------------------------------------------------------------------- 1314 */ 1315 floatx80 int64_to_floatx80( int64 a ) 1316 { 1317 flag zSign; 1318 uint64 absA; 1319 int8 shiftCount; 1320 1321 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1322 zSign = ( a < 0 ); 1323 absA = zSign ? - a : a; 1324 shiftCount = countLeadingZeros64( absA ); 1325 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 1326 1327 } 1328 1329 #endif 1330 1331 #endif /* !SOFTFLOAT_FOR_GCC */ 1332 1333 #ifdef FLOAT128 1334 1335 /* 1336 ------------------------------------------------------------------------------- 1337 Returns the result of converting the 64-bit two's complement integer `a' to 1338 the quadruple-precision floating-point format. The conversion is performed 1339 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1340 ------------------------------------------------------------------------------- 1341 */ 1342 float128 int64_to_float128( int64 a ) 1343 { 1344 flag zSign; 1345 uint64 absA; 1346 int8 shiftCount; 1347 int32 zExp; 1348 bits64 zSig0, zSig1; 1349 1350 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1351 zSign = ( a < 0 ); 1352 absA = zSign ? - a : a; 1353 shiftCount = countLeadingZeros64( absA ) + 49; 1354 zExp = 0x406E - shiftCount; 1355 if ( 64 <= shiftCount ) { 1356 zSig1 = 0; 1357 zSig0 = absA; 1358 shiftCount -= 64; 1359 } 1360 else { 1361 zSig1 = absA; 1362 zSig0 = 0; 1363 } 1364 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1365 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1366 1367 } 1368 1369 #endif 1370 1371 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 1372 /* 1373 ------------------------------------------------------------------------------- 1374 Returns the result of converting the single-precision floating-point value 1375 `a' to the 32-bit two's complement integer format. The conversion is 1376 performed according to the IEC/IEEE Standard for Binary Floating-Point 1377 Arithmetic---which means in particular that the conversion is rounded 1378 according to the current rounding mode. If `a' is a NaN, the largest 1379 positive integer is returned. Otherwise, if the conversion overflows, the 1380 largest integer with the same sign as `a' is returned. 1381 ------------------------------------------------------------------------------- 1382 */ 1383 int32 float32_to_int32( float32 a ) 1384 { 1385 flag aSign; 1386 int16 aExp, shiftCount; 1387 bits32 aSig; 1388 bits64 aSig64; 1389 1390 aSig = extractFloat32Frac( a ); 1391 aExp = extractFloat32Exp( a ); 1392 aSign = extractFloat32Sign( a ); 1393 if ( ( aExp == 0xFF ) && aSig ) aSign = 0; 1394 if ( aExp ) aSig |= 0x00800000; 1395 shiftCount = 0xAF - aExp; 1396 aSig64 = aSig; 1397 aSig64 <<= 32; 1398 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 ); 1399 return roundAndPackInt32( aSign, aSig64 ); 1400 1401 } 1402 #endif /* !SOFTFLOAT_FOR_GCC */ 1403 1404 /* 1405 ------------------------------------------------------------------------------- 1406 Returns the result of converting the single-precision floating-point value 1407 `a' to the 32-bit two's complement integer format. The conversion is 1408 performed according to the IEC/IEEE Standard for Binary Floating-Point 1409 Arithmetic, except that the conversion is always rounded toward zero. 1410 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 1411 the conversion overflows, the largest integer with the same sign as `a' is 1412 returned. 1413 ------------------------------------------------------------------------------- 1414 */ 1415 int32 float32_to_int32_round_to_zero( float32 a ) 1416 { 1417 flag aSign; 1418 int16 aExp, shiftCount; 1419 bits32 aSig; 1420 int32 z; 1421 1422 aSig = extractFloat32Frac( a ); 1423 aExp = extractFloat32Exp( a ); 1424 aSign = extractFloat32Sign( a ); 1425 shiftCount = aExp - 0x9E; 1426 if ( 0 <= shiftCount ) { 1427 if ( a != 0xCF000000 ) { 1428 float_raise( float_flag_invalid ); 1429 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF; 1430 } 1431 return (sbits32) 0x80000000; 1432 } 1433 else if ( aExp <= 0x7E ) { 1434 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact; 1435 return 0; 1436 } 1437 aSig = ( aSig | 0x00800000 )<<8; 1438 z = aSig>>( - shiftCount ); 1439 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) { 1440 float_exception_flags |= float_flag_inexact; 1441 } 1442 if ( aSign ) z = - z; 1443 return z; 1444 1445 } 1446 1447 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */ 1448 /* 1449 ------------------------------------------------------------------------------- 1450 Returns the result of converting the single-precision floating-point value 1451 `a' to the 64-bit two's complement integer format. The conversion is 1452 performed according to the IEC/IEEE Standard for Binary Floating-Point 1453 Arithmetic---which means in particular that the conversion is rounded 1454 according to the current rounding mode. If `a' is a NaN, the largest 1455 positive integer is returned. Otherwise, if the conversion overflows, the 1456 largest integer with the same sign as `a' is returned. 1457 ------------------------------------------------------------------------------- 1458 */ 1459 int64 float32_to_int64( float32 a ) 1460 { 1461 flag aSign; 1462 int16 aExp, shiftCount; 1463 bits32 aSig; 1464 bits64 aSig64, aSigExtra; 1465 1466 aSig = extractFloat32Frac( a ); 1467 aExp = extractFloat32Exp( a ); 1468 aSign = extractFloat32Sign( a ); 1469 shiftCount = 0xBE - aExp; 1470 if ( shiftCount < 0 ) { 1471 float_raise( float_flag_invalid ); 1472 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1473 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1474 } 1475 return (sbits64) LIT64( 0x8000000000000000 ); 1476 } 1477 if ( aExp ) aSig |= 0x00800000; 1478 aSig64 = aSig; 1479 aSig64 <<= 40; 1480 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra ); 1481 return roundAndPackInt64( aSign, aSig64, aSigExtra ); 1482 1483 } 1484 1485 /* 1486 ------------------------------------------------------------------------------- 1487 Returns the result of converting the single-precision floating-point value 1488 `a' to the 64-bit two's complement integer format. The conversion is 1489 performed according to the IEC/IEEE Standard for Binary Floating-Point 1490 Arithmetic, except that the conversion is always rounded toward zero. If 1491 `a' is a NaN, the largest positive integer is returned. Otherwise, if the 1492 conversion overflows, the largest integer with the same sign as `a' is 1493 returned. 1494 ------------------------------------------------------------------------------- 1495 */ 1496 int64 float32_to_int64_round_to_zero( float32 a ) 1497 { 1498 flag aSign; 1499 int16 aExp, shiftCount; 1500 bits32 aSig; 1501 bits64 aSig64; 1502 int64 z; 1503 1504 aSig = extractFloat32Frac( a ); 1505 aExp = extractFloat32Exp( a ); 1506 aSign = extractFloat32Sign( a ); 1507 shiftCount = aExp - 0xBE; 1508 if ( 0 <= shiftCount ) { 1509 if ( a != 0xDF000000 ) { 1510 float_raise( float_flag_invalid ); 1511 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1512 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1513 } 1514 } 1515 return (sbits64) LIT64( 0x8000000000000000 ); 1516 } 1517 else if ( aExp <= 0x7E ) { 1518 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact; 1519 return 0; 1520 } 1521 aSig64 = aSig | 0x00800000; 1522 aSig64 <<= 40; 1523 z = aSig64>>( - shiftCount ); 1524 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) { 1525 float_exception_flags |= float_flag_inexact; 1526 } 1527 if ( aSign ) z = - z; 1528 return z; 1529 1530 } 1531 #endif /* !SOFTFLOAT_FOR_GCC */ 1532 1533 /* 1534 ------------------------------------------------------------------------------- 1535 Returns the result of converting the single-precision floating-point value 1536 `a' to the double-precision floating-point format. The conversion is 1537 performed according to the IEC/IEEE Standard for Binary Floating-Point 1538 Arithmetic. 1539 ------------------------------------------------------------------------------- 1540 */ 1541 float64 float32_to_float64( float32 a ) 1542 { 1543 flag aSign; 1544 int16 aExp; 1545 bits32 aSig; 1546 1547 aSig = extractFloat32Frac( a ); 1548 aExp = extractFloat32Exp( a ); 1549 aSign = extractFloat32Sign( a ); 1550 if ( aExp == 0xFF ) { 1551 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) ); 1552 return packFloat64( aSign, 0x7FF, 0 ); 1553 } 1554 if ( aExp == 0 ) { 1555 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); 1556 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1557 --aExp; 1558 } 1559 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 ); 1560 1561 } 1562 1563 #ifdef FLOATX80 1564 1565 /* 1566 ------------------------------------------------------------------------------- 1567 Returns the result of converting the single-precision floating-point value 1568 `a' to the extended double-precision floating-point format. The conversion 1569 is performed according to the IEC/IEEE Standard for Binary Floating-Point 1570 Arithmetic. 1571 ------------------------------------------------------------------------------- 1572 */ 1573 floatx80 float32_to_floatx80( float32 a ) 1574 { 1575 flag aSign; 1576 int16 aExp; 1577 bits32 aSig; 1578 1579 aSig = extractFloat32Frac( a ); 1580 aExp = extractFloat32Exp( a ); 1581 aSign = extractFloat32Sign( a ); 1582 if ( aExp == 0xFF ) { 1583 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) ); 1584 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 1585 } 1586 if ( aExp == 0 ) { 1587 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 1588 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1589 } 1590 aSig |= 0x00800000; 1591 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 ); 1592 1593 } 1594 1595 #endif 1596 1597 #ifdef FLOAT128 1598 1599 /* 1600 ------------------------------------------------------------------------------- 1601 Returns the result of converting the single-precision floating-point value 1602 `a' to the double-precision floating-point format. The conversion is 1603 performed according to the IEC/IEEE Standard for Binary Floating-Point 1604 Arithmetic. 1605 ------------------------------------------------------------------------------- 1606 */ 1607 float128 float32_to_float128( float32 a ) 1608 { 1609 flag aSign; 1610 int16 aExp; 1611 bits32 aSig; 1612 1613 aSig = extractFloat32Frac( a ); 1614 aExp = extractFloat32Exp( a ); 1615 aSign = extractFloat32Sign( a ); 1616 if ( aExp == 0xFF ) { 1617 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) ); 1618 return packFloat128( aSign, 0x7FFF, 0, 0 ); 1619 } 1620 if ( aExp == 0 ) { 1621 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 1622 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1623 --aExp; 1624 } 1625 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 ); 1626 1627 } 1628 1629 #endif 1630 1631 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 1632 /* 1633 ------------------------------------------------------------------------------- 1634 Rounds the single-precision floating-point value `a' to an integer, and 1635 returns the result as a single-precision floating-point value. The 1636 operation is performed according to the IEC/IEEE Standard for Binary 1637 Floating-Point Arithmetic. 1638 ------------------------------------------------------------------------------- 1639 */ 1640 float32 float32_round_to_int( float32 a ) 1641 { 1642 flag aSign; 1643 int16 aExp; 1644 bits32 lastBitMask, roundBitsMask; 1645 int8 roundingMode; 1646 float32 z; 1647 1648 aExp = extractFloat32Exp( a ); 1649 if ( 0x96 <= aExp ) { 1650 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) { 1651 return propagateFloat32NaN( a, a ); 1652 } 1653 return a; 1654 } 1655 if ( aExp <= 0x7E ) { 1656 if ( (bits32) ( a<<1 ) == 0 ) return a; 1657 float_exception_flags |= float_flag_inexact; 1658 aSign = extractFloat32Sign( a ); 1659 switch ( float_rounding_mode ) { 1660 case float_round_nearest_even: 1661 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) { 1662 return packFloat32( aSign, 0x7F, 0 ); 1663 } 1664 break; 1665 case float_round_to_zero: 1666 break; 1667 case float_round_down: 1668 return aSign ? 0xBF800000 : 0; 1669 case float_round_up: 1670 return aSign ? 0x80000000 : 0x3F800000; 1671 } 1672 return packFloat32( aSign, 0, 0 ); 1673 } 1674 lastBitMask = 1; 1675 lastBitMask <<= 0x96 - aExp; 1676 roundBitsMask = lastBitMask - 1; 1677 z = a; 1678 roundingMode = float_rounding_mode; 1679 if ( roundingMode == float_round_nearest_even ) { 1680 z += lastBitMask>>1; 1681 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 1682 } 1683 else if ( roundingMode != float_round_to_zero ) { 1684 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) { 1685 z += roundBitsMask; 1686 } 1687 } 1688 z &= ~ roundBitsMask; 1689 if ( z != a ) float_exception_flags |= float_flag_inexact; 1690 return z; 1691 1692 } 1693 #endif /* !SOFTFLOAT_FOR_GCC */ 1694 1695 /* 1696 ------------------------------------------------------------------------------- 1697 Returns the result of adding the absolute values of the single-precision 1698 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 1699 before being returned. `zSign' is ignored if the result is a NaN. 1700 The addition is performed according to the IEC/IEEE Standard for Binary 1701 Floating-Point Arithmetic. 1702 ------------------------------------------------------------------------------- 1703 */ 1704 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign ) 1705 { 1706 int16 aExp, bExp, zExp; 1707 bits32 aSig, bSig, zSig; 1708 int16 expDiff; 1709 1710 aSig = extractFloat32Frac( a ); 1711 aExp = extractFloat32Exp( a ); 1712 bSig = extractFloat32Frac( b ); 1713 bExp = extractFloat32Exp( b ); 1714 expDiff = aExp - bExp; 1715 aSig <<= 6; 1716 bSig <<= 6; 1717 if ( 0 < expDiff ) { 1718 if ( aExp == 0xFF ) { 1719 if ( aSig ) return propagateFloat32NaN( a, b ); 1720 return a; 1721 } 1722 if ( bExp == 0 ) { 1723 --expDiff; 1724 } 1725 else { 1726 bSig |= 0x20000000; 1727 } 1728 shift32RightJamming( bSig, expDiff, &bSig ); 1729 zExp = aExp; 1730 } 1731 else if ( expDiff < 0 ) { 1732 if ( bExp == 0xFF ) { 1733 if ( bSig ) return propagateFloat32NaN( a, b ); 1734 return packFloat32( zSign, 0xFF, 0 ); 1735 } 1736 if ( aExp == 0 ) { 1737 ++expDiff; 1738 } 1739 else { 1740 aSig |= 0x20000000; 1741 } 1742 shift32RightJamming( aSig, - expDiff, &aSig ); 1743 zExp = bExp; 1744 } 1745 else { 1746 if ( aExp == 0xFF ) { 1747 if ( aSig | bSig ) return propagateFloat32NaN( a, b ); 1748 return a; 1749 } 1750 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 ); 1751 zSig = 0x40000000 + aSig + bSig; 1752 zExp = aExp; 1753 goto roundAndPack; 1754 } 1755 aSig |= 0x20000000; 1756 zSig = ( aSig + bSig )<<1; 1757 --zExp; 1758 if ( (sbits32) zSig < 0 ) { 1759 zSig = aSig + bSig; 1760 ++zExp; 1761 } 1762 roundAndPack: 1763 return roundAndPackFloat32( zSign, zExp, zSig ); 1764 1765 } 1766 1767 /* 1768 ------------------------------------------------------------------------------- 1769 Returns the result of subtracting the absolute values of the single- 1770 precision floating-point values `a' and `b'. If `zSign' is 1, the 1771 difference is negated before being returned. `zSign' is ignored if the 1772 result is a NaN. The subtraction is performed according to the IEC/IEEE 1773 Standard for Binary Floating-Point Arithmetic. 1774 ------------------------------------------------------------------------------- 1775 */ 1776 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign ) 1777 { 1778 int16 aExp, bExp, zExp; 1779 bits32 aSig, bSig, zSig; 1780 int16 expDiff; 1781 1782 aSig = extractFloat32Frac( a ); 1783 aExp = extractFloat32Exp( a ); 1784 bSig = extractFloat32Frac( b ); 1785 bExp = extractFloat32Exp( b ); 1786 expDiff = aExp - bExp; 1787 aSig <<= 7; 1788 bSig <<= 7; 1789 if ( 0 < expDiff ) goto aExpBigger; 1790 if ( expDiff < 0 ) goto bExpBigger; 1791 if ( aExp == 0xFF ) { 1792 if ( aSig | bSig ) return propagateFloat32NaN( a, b ); 1793 float_raise( float_flag_invalid ); 1794 return float32_default_nan; 1795 } 1796 if ( aExp == 0 ) { 1797 aExp = 1; 1798 bExp = 1; 1799 } 1800 if ( bSig < aSig ) goto aBigger; 1801 if ( aSig < bSig ) goto bBigger; 1802 return packFloat32( float_rounding_mode == float_round_down, 0, 0 ); 1803 bExpBigger: 1804 if ( bExp == 0xFF ) { 1805 if ( bSig ) return propagateFloat32NaN( a, b ); 1806 return packFloat32( zSign ^ 1, 0xFF, 0 ); 1807 } 1808 if ( aExp == 0 ) { 1809 ++expDiff; 1810 } 1811 else { 1812 aSig |= 0x40000000; 1813 } 1814 shift32RightJamming( aSig, - expDiff, &aSig ); 1815 bSig |= 0x40000000; 1816 bBigger: 1817 zSig = bSig - aSig; 1818 zExp = bExp; 1819 zSign ^= 1; 1820 goto normalizeRoundAndPack; 1821 aExpBigger: 1822 if ( aExp == 0xFF ) { 1823 if ( aSig ) return propagateFloat32NaN( a, b ); 1824 return a; 1825 } 1826 if ( bExp == 0 ) { 1827 --expDiff; 1828 } 1829 else { 1830 bSig |= 0x40000000; 1831 } 1832 shift32RightJamming( bSig, expDiff, &bSig ); 1833 aSig |= 0x40000000; 1834 aBigger: 1835 zSig = aSig - bSig; 1836 zExp = aExp; 1837 normalizeRoundAndPack: 1838 --zExp; 1839 return normalizeRoundAndPackFloat32( zSign, zExp, zSig ); 1840 1841 } 1842 1843 /* 1844 ------------------------------------------------------------------------------- 1845 Returns the result of adding the single-precision floating-point values `a' 1846 and `b'. The operation is performed according to the IEC/IEEE Standard for 1847 Binary Floating-Point Arithmetic. 1848 ------------------------------------------------------------------------------- 1849 */ 1850 float32 float32_add( float32 a, float32 b ) 1851 { 1852 flag aSign, bSign; 1853 1854 aSign = extractFloat32Sign( a ); 1855 bSign = extractFloat32Sign( b ); 1856 if ( aSign == bSign ) { 1857 return addFloat32Sigs( a, b, aSign ); 1858 } 1859 else { 1860 return subFloat32Sigs( a, b, aSign ); 1861 } 1862 1863 } 1864 1865 /* 1866 ------------------------------------------------------------------------------- 1867 Returns the result of subtracting the single-precision floating-point values 1868 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1869 for Binary Floating-Point Arithmetic. 1870 ------------------------------------------------------------------------------- 1871 */ 1872 float32 float32_sub( float32 a, float32 b ) 1873 { 1874 flag aSign, bSign; 1875 1876 aSign = extractFloat32Sign( a ); 1877 bSign = extractFloat32Sign( b ); 1878 if ( aSign == bSign ) { 1879 return subFloat32Sigs( a, b, aSign ); 1880 } 1881 else { 1882 return addFloat32Sigs( a, b, aSign ); 1883 } 1884 1885 } 1886 1887 /* 1888 ------------------------------------------------------------------------------- 1889 Returns the result of multiplying the single-precision floating-point values 1890 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1891 for Binary Floating-Point Arithmetic. 1892 ------------------------------------------------------------------------------- 1893 */ 1894 float32 float32_mul( float32 a, float32 b ) 1895 { 1896 flag aSign, bSign, zSign; 1897 int16 aExp, bExp, zExp; 1898 bits32 aSig, bSig; 1899 bits64 zSig64; 1900 bits32 zSig; 1901 1902 aSig = extractFloat32Frac( a ); 1903 aExp = extractFloat32Exp( a ); 1904 aSign = extractFloat32Sign( a ); 1905 bSig = extractFloat32Frac( b ); 1906 bExp = extractFloat32Exp( b ); 1907 bSign = extractFloat32Sign( b ); 1908 zSign = aSign ^ bSign; 1909 if ( aExp == 0xFF ) { 1910 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 1911 return propagateFloat32NaN( a, b ); 1912 } 1913 if ( ( bExp | bSig ) == 0 ) { 1914 float_raise( float_flag_invalid ); 1915 return float32_default_nan; 1916 } 1917 return packFloat32( zSign, 0xFF, 0 ); 1918 } 1919 if ( bExp == 0xFF ) { 1920 if ( bSig ) return propagateFloat32NaN( a, b ); 1921 if ( ( aExp | aSig ) == 0 ) { 1922 float_raise( float_flag_invalid ); 1923 return float32_default_nan; 1924 } 1925 return packFloat32( zSign, 0xFF, 0 ); 1926 } 1927 if ( aExp == 0 ) { 1928 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 1929 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1930 } 1931 if ( bExp == 0 ) { 1932 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 ); 1933 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1934 } 1935 zExp = aExp + bExp - 0x7F; 1936 aSig = ( aSig | 0x00800000 )<<7; 1937 bSig = ( bSig | 0x00800000 )<<8; 1938 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 ); 1939 zSig = zSig64; 1940 if ( 0 <= (sbits32) ( zSig<<1 ) ) { 1941 zSig <<= 1; 1942 --zExp; 1943 } 1944 return roundAndPackFloat32( zSign, zExp, zSig ); 1945 1946 } 1947 1948 /* 1949 ------------------------------------------------------------------------------- 1950 Returns the result of dividing the single-precision floating-point value `a' 1951 by the corresponding value `b'. The operation is performed according to the 1952 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1953 ------------------------------------------------------------------------------- 1954 */ 1955 float32 float32_div( float32 a, float32 b ) 1956 { 1957 flag aSign, bSign, zSign; 1958 int16 aExp, bExp, zExp; 1959 bits32 aSig, bSig, zSig; 1960 1961 aSig = extractFloat32Frac( a ); 1962 aExp = extractFloat32Exp( a ); 1963 aSign = extractFloat32Sign( a ); 1964 bSig = extractFloat32Frac( b ); 1965 bExp = extractFloat32Exp( b ); 1966 bSign = extractFloat32Sign( b ); 1967 zSign = aSign ^ bSign; 1968 if ( aExp == 0xFF ) { 1969 if ( aSig ) return propagateFloat32NaN( a, b ); 1970 if ( bExp == 0xFF ) { 1971 if ( bSig ) return propagateFloat32NaN( a, b ); 1972 float_raise( float_flag_invalid ); 1973 return float32_default_nan; 1974 } 1975 return packFloat32( zSign, 0xFF, 0 ); 1976 } 1977 if ( bExp == 0xFF ) { 1978 if ( bSig ) return propagateFloat32NaN( a, b ); 1979 return packFloat32( zSign, 0, 0 ); 1980 } 1981 if ( bExp == 0 ) { 1982 if ( bSig == 0 ) { 1983 if ( ( aExp | aSig ) == 0 ) { 1984 float_raise( float_flag_invalid ); 1985 return float32_default_nan; 1986 } 1987 float_raise( float_flag_divbyzero ); 1988 return packFloat32( zSign, 0xFF, 0 ); 1989 } 1990 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1991 } 1992 if ( aExp == 0 ) { 1993 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 1994 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1995 } 1996 zExp = aExp - bExp + 0x7D; 1997 aSig = ( aSig | 0x00800000 )<<7; 1998 bSig = ( bSig | 0x00800000 )<<8; 1999 if ( bSig <= ( aSig + aSig ) ) { 2000 aSig >>= 1; 2001 ++zExp; 2002 } 2003 zSig = ( ( (bits64) aSig )<<32 ) / bSig; 2004 if ( ( zSig & 0x3F ) == 0 ) { 2005 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 ); 2006 } 2007 return roundAndPackFloat32( zSign, zExp, zSig ); 2008 2009 } 2010 2011 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2012 /* 2013 ------------------------------------------------------------------------------- 2014 Returns the remainder of the single-precision floating-point value `a' 2015 with respect to the corresponding value `b'. The operation is performed 2016 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2017 ------------------------------------------------------------------------------- 2018 */ 2019 float32 float32_rem( float32 a, float32 b ) 2020 { 2021 flag aSign, bSign, zSign; 2022 int16 aExp, bExp, expDiff; 2023 bits32 aSig, bSig; 2024 bits32 q; 2025 bits64 aSig64, bSig64, q64; 2026 bits32 alternateASig; 2027 sbits32 sigMean; 2028 2029 aSig = extractFloat32Frac( a ); 2030 aExp = extractFloat32Exp( a ); 2031 aSign = extractFloat32Sign( a ); 2032 bSig = extractFloat32Frac( b ); 2033 bExp = extractFloat32Exp( b ); 2034 bSign = extractFloat32Sign( b ); 2035 if ( aExp == 0xFF ) { 2036 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 2037 return propagateFloat32NaN( a, b ); 2038 } 2039 float_raise( float_flag_invalid ); 2040 return float32_default_nan; 2041 } 2042 if ( bExp == 0xFF ) { 2043 if ( bSig ) return propagateFloat32NaN( a, b ); 2044 return a; 2045 } 2046 if ( bExp == 0 ) { 2047 if ( bSig == 0 ) { 2048 float_raise( float_flag_invalid ); 2049 return float32_default_nan; 2050 } 2051 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 2052 } 2053 if ( aExp == 0 ) { 2054 if ( aSig == 0 ) return a; 2055 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2056 } 2057 expDiff = aExp - bExp; 2058 aSig |= 0x00800000; 2059 bSig |= 0x00800000; 2060 if ( expDiff < 32 ) { 2061 aSig <<= 8; 2062 bSig <<= 8; 2063 if ( expDiff < 0 ) { 2064 if ( expDiff < -1 ) return a; 2065 aSig >>= 1; 2066 } 2067 q = ( bSig <= aSig ); 2068 if ( q ) aSig -= bSig; 2069 if ( 0 < expDiff ) { 2070 q = ( ( (bits64) aSig )<<32 ) / bSig; 2071 q >>= 32 - expDiff; 2072 bSig >>= 2; 2073 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 2074 } 2075 else { 2076 aSig >>= 2; 2077 bSig >>= 2; 2078 } 2079 } 2080 else { 2081 if ( bSig <= aSig ) aSig -= bSig; 2082 aSig64 = ( (bits64) aSig )<<40; 2083 bSig64 = ( (bits64) bSig )<<40; 2084 expDiff -= 64; 2085 while ( 0 < expDiff ) { 2086 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2087 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2088 aSig64 = - ( ( bSig * q64 )<<38 ); 2089 expDiff -= 62; 2090 } 2091 expDiff += 64; 2092 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2093 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2094 q = q64>>( 64 - expDiff ); 2095 bSig <<= 6; 2096 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 2097 } 2098 do { 2099 alternateASig = aSig; 2100 ++q; 2101 aSig -= bSig; 2102 } while ( 0 <= (sbits32) aSig ); 2103 sigMean = aSig + alternateASig; 2104 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 2105 aSig = alternateASig; 2106 } 2107 zSign = ( (sbits32) aSig < 0 ); 2108 if ( zSign ) aSig = - aSig; 2109 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig ); 2110 2111 } 2112 #endif /* !SOFTFLOAT_FOR_GCC */ 2113 2114 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2115 /* 2116 ------------------------------------------------------------------------------- 2117 Returns the square root of the single-precision floating-point value `a'. 2118 The operation is performed according to the IEC/IEEE Standard for Binary 2119 Floating-Point Arithmetic. 2120 ------------------------------------------------------------------------------- 2121 */ 2122 float32 float32_sqrt( float32 a ) 2123 { 2124 flag aSign; 2125 int16 aExp, zExp; 2126 bits32 aSig, zSig; 2127 bits64 rem, term; 2128 2129 aSig = extractFloat32Frac( a ); 2130 aExp = extractFloat32Exp( a ); 2131 aSign = extractFloat32Sign( a ); 2132 if ( aExp == 0xFF ) { 2133 if ( aSig ) return propagateFloat32NaN( a, 0 ); 2134 if ( ! aSign ) return a; 2135 float_raise( float_flag_invalid ); 2136 return float32_default_nan; 2137 } 2138 if ( aSign ) { 2139 if ( ( aExp | aSig ) == 0 ) return a; 2140 float_raise( float_flag_invalid ); 2141 return float32_default_nan; 2142 } 2143 if ( aExp == 0 ) { 2144 if ( aSig == 0 ) return 0; 2145 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2146 } 2147 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E; 2148 aSig = ( aSig | 0x00800000 )<<8; 2149 zSig = estimateSqrt32( aExp, aSig ) + 2; 2150 if ( ( zSig & 0x7F ) <= 5 ) { 2151 if ( zSig < 2 ) { 2152 zSig = 0x7FFFFFFF; 2153 goto roundAndPack; 2154 } 2155 aSig >>= aExp & 1; 2156 term = ( (bits64) zSig ) * zSig; 2157 rem = ( ( (bits64) aSig )<<32 ) - term; 2158 while ( (sbits64) rem < 0 ) { 2159 --zSig; 2160 rem += ( ( (bits64) zSig )<<1 ) | 1; 2161 } 2162 zSig |= ( rem != 0 ); 2163 } 2164 shift32RightJamming( zSig, 1, &zSig ); 2165 roundAndPack: 2166 return roundAndPackFloat32( 0, zExp, zSig ); 2167 2168 } 2169 #endif /* !SOFTFLOAT_FOR_GCC */ 2170 2171 /* 2172 ------------------------------------------------------------------------------- 2173 Returns 1 if the single-precision floating-point value `a' is equal to 2174 the corresponding value `b', and 0 otherwise. The comparison is performed 2175 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2176 ------------------------------------------------------------------------------- 2177 */ 2178 flag float32_eq( float32 a, float32 b ) 2179 { 2180 2181 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2182 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2183 ) { 2184 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2185 float_raise( float_flag_invalid ); 2186 } 2187 return 0; 2188 } 2189 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2190 2191 } 2192 2193 /* 2194 ------------------------------------------------------------------------------- 2195 Returns 1 if the single-precision floating-point value `a' is less than 2196 or equal to the corresponding value `b', and 0 otherwise. The comparison 2197 is performed according to the IEC/IEEE Standard for Binary Floating-Point 2198 Arithmetic. 2199 ------------------------------------------------------------------------------- 2200 */ 2201 flag float32_le( float32 a, float32 b ) 2202 { 2203 flag aSign, bSign; 2204 2205 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2206 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2207 ) { 2208 float_raise( float_flag_invalid ); 2209 return 0; 2210 } 2211 aSign = extractFloat32Sign( a ); 2212 bSign = extractFloat32Sign( b ); 2213 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2214 return ( a == b ) || ( aSign ^ ( a < b ) ); 2215 2216 } 2217 2218 /* 2219 ------------------------------------------------------------------------------- 2220 Returns 1 if the single-precision floating-point value `a' is less than 2221 the corresponding value `b', and 0 otherwise. The comparison is performed 2222 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2223 ------------------------------------------------------------------------------- 2224 */ 2225 flag float32_lt( float32 a, float32 b ) 2226 { 2227 flag aSign, bSign; 2228 2229 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2230 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2231 ) { 2232 float_raise( float_flag_invalid ); 2233 return 0; 2234 } 2235 aSign = extractFloat32Sign( a ); 2236 bSign = extractFloat32Sign( b ); 2237 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); 2238 return ( a != b ) && ( aSign ^ ( a < b ) ); 2239 2240 } 2241 2242 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2243 /* 2244 ------------------------------------------------------------------------------- 2245 Returns 1 if the single-precision floating-point value `a' is equal to 2246 the corresponding value `b', and 0 otherwise. The invalid exception is 2247 raised if either operand is a NaN. Otherwise, the comparison is performed 2248 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2249 ------------------------------------------------------------------------------- 2250 */ 2251 flag float32_eq_signaling( float32 a, float32 b ) 2252 { 2253 2254 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2255 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2256 ) { 2257 float_raise( float_flag_invalid ); 2258 return 0; 2259 } 2260 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2261 2262 } 2263 2264 /* 2265 ------------------------------------------------------------------------------- 2266 Returns 1 if the single-precision floating-point value `a' is less than or 2267 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 2268 cause an exception. Otherwise, the comparison is performed according to the 2269 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2270 ------------------------------------------------------------------------------- 2271 */ 2272 flag float32_le_quiet( float32 a, float32 b ) 2273 { 2274 flag aSign, bSign; 2275 2276 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2277 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2278 ) { 2279 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2280 float_raise( float_flag_invalid ); 2281 } 2282 return 0; 2283 } 2284 aSign = extractFloat32Sign( a ); 2285 bSign = extractFloat32Sign( b ); 2286 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2287 return ( a == b ) || ( aSign ^ ( a < b ) ); 2288 2289 } 2290 2291 /* 2292 ------------------------------------------------------------------------------- 2293 Returns 1 if the single-precision floating-point value `a' is less than 2294 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 2295 exception. Otherwise, the comparison is performed according to the IEC/IEEE 2296 Standard for Binary Floating-Point Arithmetic. 2297 ------------------------------------------------------------------------------- 2298 */ 2299 flag float32_lt_quiet( float32 a, float32 b ) 2300 { 2301 flag aSign, bSign; 2302 2303 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2304 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2305 ) { 2306 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2307 float_raise( float_flag_invalid ); 2308 } 2309 return 0; 2310 } 2311 aSign = extractFloat32Sign( a ); 2312 bSign = extractFloat32Sign( b ); 2313 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); 2314 return ( a != b ) && ( aSign ^ ( a < b ) ); 2315 2316 } 2317 #endif /* !SOFTFLOAT_FOR_GCC */ 2318 2319 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2320 /* 2321 ------------------------------------------------------------------------------- 2322 Returns the result of converting the double-precision floating-point value 2323 `a' to the 32-bit two's complement integer format. The conversion is 2324 performed according to the IEC/IEEE Standard for Binary Floating-Point 2325 Arithmetic---which means in particular that the conversion is rounded 2326 according to the current rounding mode. If `a' is a NaN, the largest 2327 positive integer is returned. Otherwise, if the conversion overflows, the 2328 largest integer with the same sign as `a' is returned. 2329 ------------------------------------------------------------------------------- 2330 */ 2331 int32 float64_to_int32( float64 a ) 2332 { 2333 flag aSign; 2334 int16 aExp, shiftCount; 2335 bits64 aSig; 2336 2337 aSig = extractFloat64Frac( a ); 2338 aExp = extractFloat64Exp( a ); 2339 aSign = extractFloat64Sign( a ); 2340 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 2341 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2342 shiftCount = 0x42C - aExp; 2343 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig ); 2344 return roundAndPackInt32( aSign, aSig ); 2345 2346 } 2347 #endif /* !SOFTFLOAT_FOR_GCC */ 2348 2349 /* 2350 ------------------------------------------------------------------------------- 2351 Returns the result of converting the double-precision floating-point value 2352 `a' to the 32-bit two's complement integer format. The conversion is 2353 performed according to the IEC/IEEE Standard for Binary Floating-Point 2354 Arithmetic, except that the conversion is always rounded toward zero. 2355 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2356 the conversion overflows, the largest integer with the same sign as `a' is 2357 returned. 2358 ------------------------------------------------------------------------------- 2359 */ 2360 int32 float64_to_int32_round_to_zero( float64 a ) 2361 { 2362 flag aSign; 2363 int16 aExp, shiftCount; 2364 bits64 aSig, savedASig; 2365 int32 z; 2366 2367 aSig = extractFloat64Frac( a ); 2368 aExp = extractFloat64Exp( a ); 2369 aSign = extractFloat64Sign( a ); 2370 if ( 0x41E < aExp ) { 2371 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 2372 goto invalid; 2373 } 2374 else if ( aExp < 0x3FF ) { 2375 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact; 2376 return 0; 2377 } 2378 aSig |= LIT64( 0x0010000000000000 ); 2379 shiftCount = 0x433 - aExp; 2380 savedASig = aSig; 2381 aSig >>= shiftCount; 2382 z = aSig; 2383 if ( aSign ) z = - z; 2384 if ( ( z < 0 ) ^ aSign ) { 2385 invalid: 2386 float_raise( float_flag_invalid ); 2387 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 2388 } 2389 if ( ( aSig<<shiftCount ) != savedASig ) { 2390 float_exception_flags |= float_flag_inexact; 2391 } 2392 return z; 2393 2394 } 2395 2396 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2397 /* 2398 ------------------------------------------------------------------------------- 2399 Returns the result of converting the double-precision floating-point value 2400 `a' to the 64-bit two's complement integer format. The conversion is 2401 performed according to the IEC/IEEE Standard for Binary Floating-Point 2402 Arithmetic---which means in particular that the conversion is rounded 2403 according to the current rounding mode. If `a' is a NaN, the largest 2404 positive integer is returned. Otherwise, if the conversion overflows, the 2405 largest integer with the same sign as `a' is returned. 2406 ------------------------------------------------------------------------------- 2407 */ 2408 int64 float64_to_int64( float64 a ) 2409 { 2410 flag aSign; 2411 int16 aExp, shiftCount; 2412 bits64 aSig, aSigExtra; 2413 2414 aSig = extractFloat64Frac( a ); 2415 aExp = extractFloat64Exp( a ); 2416 aSign = extractFloat64Sign( a ); 2417 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2418 shiftCount = 0x433 - aExp; 2419 if ( shiftCount <= 0 ) { 2420 if ( 0x43E < aExp ) { 2421 float_raise( float_flag_invalid ); 2422 if ( ! aSign 2423 || ( ( aExp == 0x7FF ) 2424 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2425 ) { 2426 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2427 } 2428 return (sbits64) LIT64( 0x8000000000000000 ); 2429 } 2430 aSigExtra = 0; 2431 aSig <<= - shiftCount; 2432 } 2433 else { 2434 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 2435 } 2436 return roundAndPackInt64( aSign, aSig, aSigExtra ); 2437 2438 } 2439 2440 /* 2441 ------------------------------------------------------------------------------- 2442 Returns the result of converting the double-precision floating-point value 2443 `a' to the 64-bit two's complement integer format. The conversion is 2444 performed according to the IEC/IEEE Standard for Binary Floating-Point 2445 Arithmetic, except that the conversion is always rounded toward zero. 2446 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2447 the conversion overflows, the largest integer with the same sign as `a' is 2448 returned. 2449 ------------------------------------------------------------------------------- 2450 */ 2451 int64 float64_to_int64_round_to_zero( float64 a ) 2452 { 2453 flag aSign; 2454 int16 aExp, shiftCount; 2455 bits64 aSig; 2456 int64 z; 2457 2458 aSig = extractFloat64Frac( a ); 2459 aExp = extractFloat64Exp( a ); 2460 aSign = extractFloat64Sign( a ); 2461 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2462 shiftCount = aExp - 0x433; 2463 if ( 0 <= shiftCount ) { 2464 if ( 0x43E <= aExp ) { 2465 if ( a != LIT64( 0xC3E0000000000000 ) ) { 2466 float_raise( float_flag_invalid ); 2467 if ( ! aSign 2468 || ( ( aExp == 0x7FF ) 2469 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2470 ) { 2471 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2472 } 2473 } 2474 return (sbits64) LIT64( 0x8000000000000000 ); 2475 } 2476 z = aSig<<shiftCount; 2477 } 2478 else { 2479 if ( aExp < 0x3FE ) { 2480 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact; 2481 return 0; 2482 } 2483 z = aSig>>( - shiftCount ); 2484 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { 2485 float_exception_flags |= float_flag_inexact; 2486 } 2487 } 2488 if ( aSign ) z = - z; 2489 return z; 2490 2491 } 2492 #endif /* !SOFTFLOAT_FOR_GCC */ 2493 2494 /* 2495 ------------------------------------------------------------------------------- 2496 Returns the result of converting the double-precision floating-point value 2497 `a' to the single-precision floating-point format. The conversion is 2498 performed according to the IEC/IEEE Standard for Binary Floating-Point 2499 Arithmetic. 2500 ------------------------------------------------------------------------------- 2501 */ 2502 float32 float64_to_float32( float64 a ) 2503 { 2504 flag aSign; 2505 int16 aExp; 2506 bits64 aSig; 2507 bits32 zSig; 2508 2509 aSig = extractFloat64Frac( a ); 2510 aExp = extractFloat64Exp( a ); 2511 aSign = extractFloat64Sign( a ); 2512 if ( aExp == 0x7FF ) { 2513 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) ); 2514 return packFloat32( aSign, 0xFF, 0 ); 2515 } 2516 shift64RightJamming( aSig, 22, &aSig ); 2517 zSig = aSig; 2518 if ( aExp || zSig ) { 2519 zSig |= 0x40000000; 2520 aExp -= 0x381; 2521 } 2522 return roundAndPackFloat32( aSign, aExp, zSig ); 2523 2524 } 2525 2526 #ifdef FLOATX80 2527 2528 /* 2529 ------------------------------------------------------------------------------- 2530 Returns the result of converting the double-precision floating-point value 2531 `a' to the extended double-precision floating-point format. The conversion 2532 is performed according to the IEC/IEEE Standard for Binary Floating-Point 2533 Arithmetic. 2534 ------------------------------------------------------------------------------- 2535 */ 2536 floatx80 float64_to_floatx80( float64 a ) 2537 { 2538 flag aSign; 2539 int16 aExp; 2540 bits64 aSig; 2541 2542 aSig = extractFloat64Frac( a ); 2543 aExp = extractFloat64Exp( a ); 2544 aSign = extractFloat64Sign( a ); 2545 if ( aExp == 0x7FF ) { 2546 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) ); 2547 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 2548 } 2549 if ( aExp == 0 ) { 2550 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 2551 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2552 } 2553 return 2554 packFloatx80( 2555 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 2556 2557 } 2558 2559 #endif 2560 2561 #ifdef FLOAT128 2562 2563 /* 2564 ------------------------------------------------------------------------------- 2565 Returns the result of converting the double-precision floating-point value 2566 `a' to the quadruple-precision floating-point format. The conversion is 2567 performed according to the IEC/IEEE Standard for Binary Floating-Point 2568 Arithmetic. 2569 ------------------------------------------------------------------------------- 2570 */ 2571 float128 float64_to_float128( float64 a ) 2572 { 2573 flag aSign; 2574 int16 aExp; 2575 bits64 aSig, zSig0, zSig1; 2576 2577 aSig = extractFloat64Frac( a ); 2578 aExp = extractFloat64Exp( a ); 2579 aSign = extractFloat64Sign( a ); 2580 if ( aExp == 0x7FF ) { 2581 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) ); 2582 return packFloat128( aSign, 0x7FFF, 0, 0 ); 2583 } 2584 if ( aExp == 0 ) { 2585 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 2586 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2587 --aExp; 2588 } 2589 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 2590 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 2591 2592 } 2593 2594 #endif 2595 2596 #ifndef SOFTFLOAT_FOR_GCC 2597 /* 2598 ------------------------------------------------------------------------------- 2599 Rounds the double-precision floating-point value `a' to an integer, and 2600 returns the result as a double-precision floating-point value. The 2601 operation is performed according to the IEC/IEEE Standard for Binary 2602 Floating-Point Arithmetic. 2603 ------------------------------------------------------------------------------- 2604 */ 2605 float64 float64_round_to_int( float64 a ) 2606 { 2607 flag aSign; 2608 int16 aExp; 2609 bits64 lastBitMask, roundBitsMask; 2610 int8 roundingMode; 2611 float64 z; 2612 2613 aExp = extractFloat64Exp( a ); 2614 if ( 0x433 <= aExp ) { 2615 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) { 2616 return propagateFloat64NaN( a, a ); 2617 } 2618 return a; 2619 } 2620 if ( aExp < 0x3FF ) { 2621 if ( (bits64) ( a<<1 ) == 0 ) return a; 2622 float_exception_flags |= float_flag_inexact; 2623 aSign = extractFloat64Sign( a ); 2624 switch ( float_rounding_mode ) { 2625 case float_round_nearest_even: 2626 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) { 2627 return packFloat64( aSign, 0x3FF, 0 ); 2628 } 2629 break; 2630 case float_round_to_zero: 2631 break; 2632 case float_round_down: 2633 return aSign ? LIT64( 0xBFF0000000000000 ) : 0; 2634 case float_round_up: 2635 return 2636 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ); 2637 } 2638 return packFloat64( aSign, 0, 0 ); 2639 } 2640 lastBitMask = 1; 2641 lastBitMask <<= 0x433 - aExp; 2642 roundBitsMask = lastBitMask - 1; 2643 z = a; 2644 roundingMode = float_rounding_mode; 2645 if ( roundingMode == float_round_nearest_even ) { 2646 z += lastBitMask>>1; 2647 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 2648 } 2649 else if ( roundingMode != float_round_to_zero ) { 2650 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) { 2651 z += roundBitsMask; 2652 } 2653 } 2654 z &= ~ roundBitsMask; 2655 if ( z != a ) float_exception_flags |= float_flag_inexact; 2656 return z; 2657 2658 } 2659 #endif 2660 2661 /* 2662 ------------------------------------------------------------------------------- 2663 Returns the result of adding the absolute values of the double-precision 2664 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 2665 before being returned. `zSign' is ignored if the result is a NaN. 2666 The addition is performed according to the IEC/IEEE Standard for Binary 2667 Floating-Point Arithmetic. 2668 ------------------------------------------------------------------------------- 2669 */ 2670 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign ) 2671 { 2672 int16 aExp, bExp, zExp; 2673 bits64 aSig, bSig, zSig; 2674 int16 expDiff; 2675 2676 aSig = extractFloat64Frac( a ); 2677 aExp = extractFloat64Exp( a ); 2678 bSig = extractFloat64Frac( b ); 2679 bExp = extractFloat64Exp( b ); 2680 expDiff = aExp - bExp; 2681 aSig <<= 9; 2682 bSig <<= 9; 2683 if ( 0 < expDiff ) { 2684 if ( aExp == 0x7FF ) { 2685 if ( aSig ) return propagateFloat64NaN( a, b ); 2686 return a; 2687 } 2688 if ( bExp == 0 ) { 2689 --expDiff; 2690 } 2691 else { 2692 bSig |= LIT64( 0x2000000000000000 ); 2693 } 2694 shift64RightJamming( bSig, expDiff, &bSig ); 2695 zExp = aExp; 2696 } 2697 else if ( expDiff < 0 ) { 2698 if ( bExp == 0x7FF ) { 2699 if ( bSig ) return propagateFloat64NaN( a, b ); 2700 return packFloat64( zSign, 0x7FF, 0 ); 2701 } 2702 if ( aExp == 0 ) { 2703 ++expDiff; 2704 } 2705 else { 2706 aSig |= LIT64( 0x2000000000000000 ); 2707 } 2708 shift64RightJamming( aSig, - expDiff, &aSig ); 2709 zExp = bExp; 2710 } 2711 else { 2712 if ( aExp == 0x7FF ) { 2713 if ( aSig | bSig ) return propagateFloat64NaN( a, b ); 2714 return a; 2715 } 2716 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 ); 2717 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig; 2718 zExp = aExp; 2719 goto roundAndPack; 2720 } 2721 aSig |= LIT64( 0x2000000000000000 ); 2722 zSig = ( aSig + bSig )<<1; 2723 --zExp; 2724 if ( (sbits64) zSig < 0 ) { 2725 zSig = aSig + bSig; 2726 ++zExp; 2727 } 2728 roundAndPack: 2729 return roundAndPackFloat64( zSign, zExp, zSig ); 2730 2731 } 2732 2733 /* 2734 ------------------------------------------------------------------------------- 2735 Returns the result of subtracting the absolute values of the double- 2736 precision floating-point values `a' and `b'. If `zSign' is 1, the 2737 difference is negated before being returned. `zSign' is ignored if the 2738 result is a NaN. The subtraction is performed according to the IEC/IEEE 2739 Standard for Binary Floating-Point Arithmetic. 2740 ------------------------------------------------------------------------------- 2741 */ 2742 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign ) 2743 { 2744 int16 aExp, bExp, zExp; 2745 bits64 aSig, bSig, zSig; 2746 int16 expDiff; 2747 2748 aSig = extractFloat64Frac( a ); 2749 aExp = extractFloat64Exp( a ); 2750 bSig = extractFloat64Frac( b ); 2751 bExp = extractFloat64Exp( b ); 2752 expDiff = aExp - bExp; 2753 aSig <<= 10; 2754 bSig <<= 10; 2755 if ( 0 < expDiff ) goto aExpBigger; 2756 if ( expDiff < 0 ) goto bExpBigger; 2757 if ( aExp == 0x7FF ) { 2758 if ( aSig | bSig ) return propagateFloat64NaN( a, b ); 2759 float_raise( float_flag_invalid ); 2760 return float64_default_nan; 2761 } 2762 if ( aExp == 0 ) { 2763 aExp = 1; 2764 bExp = 1; 2765 } 2766 if ( bSig < aSig ) goto aBigger; 2767 if ( aSig < bSig ) goto bBigger; 2768 return packFloat64( float_rounding_mode == float_round_down, 0, 0 ); 2769 bExpBigger: 2770 if ( bExp == 0x7FF ) { 2771 if ( bSig ) return propagateFloat64NaN( a, b ); 2772 return packFloat64( zSign ^ 1, 0x7FF, 0 ); 2773 } 2774 if ( aExp == 0 ) { 2775 ++expDiff; 2776 } 2777 else { 2778 aSig |= LIT64( 0x4000000000000000 ); 2779 } 2780 shift64RightJamming( aSig, - expDiff, &aSig ); 2781 bSig |= LIT64( 0x4000000000000000 ); 2782 bBigger: 2783 zSig = bSig - aSig; 2784 zExp = bExp; 2785 zSign ^= 1; 2786 goto normalizeRoundAndPack; 2787 aExpBigger: 2788 if ( aExp == 0x7FF ) { 2789 if ( aSig ) return propagateFloat64NaN( a, b ); 2790 return a; 2791 } 2792 if ( bExp == 0 ) { 2793 --expDiff; 2794 } 2795 else { 2796 bSig |= LIT64( 0x4000000000000000 ); 2797 } 2798 shift64RightJamming( bSig, expDiff, &bSig ); 2799 aSig |= LIT64( 0x4000000000000000 ); 2800 aBigger: 2801 zSig = aSig - bSig; 2802 zExp = aExp; 2803 normalizeRoundAndPack: 2804 --zExp; 2805 return normalizeRoundAndPackFloat64( zSign, zExp, zSig ); 2806 2807 } 2808 2809 /* 2810 ------------------------------------------------------------------------------- 2811 Returns the result of adding the double-precision floating-point values `a' 2812 and `b'. The operation is performed according to the IEC/IEEE Standard for 2813 Binary Floating-Point Arithmetic. 2814 ------------------------------------------------------------------------------- 2815 */ 2816 float64 float64_add( float64 a, float64 b ) 2817 { 2818 flag aSign, bSign; 2819 2820 aSign = extractFloat64Sign( a ); 2821 bSign = extractFloat64Sign( b ); 2822 if ( aSign == bSign ) { 2823 return addFloat64Sigs( a, b, aSign ); 2824 } 2825 else { 2826 return subFloat64Sigs( a, b, aSign ); 2827 } 2828 2829 } 2830 2831 /* 2832 ------------------------------------------------------------------------------- 2833 Returns the result of subtracting the double-precision floating-point values 2834 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2835 for Binary Floating-Point Arithmetic. 2836 ------------------------------------------------------------------------------- 2837 */ 2838 float64 float64_sub( float64 a, float64 b ) 2839 { 2840 flag aSign, bSign; 2841 2842 aSign = extractFloat64Sign( a ); 2843 bSign = extractFloat64Sign( b ); 2844 if ( aSign == bSign ) { 2845 return subFloat64Sigs( a, b, aSign ); 2846 } 2847 else { 2848 return addFloat64Sigs( a, b, aSign ); 2849 } 2850 2851 } 2852 2853 /* 2854 ------------------------------------------------------------------------------- 2855 Returns the result of multiplying the double-precision floating-point values 2856 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2857 for Binary Floating-Point Arithmetic. 2858 ------------------------------------------------------------------------------- 2859 */ 2860 float64 float64_mul( float64 a, float64 b ) 2861 { 2862 flag aSign, bSign, zSign; 2863 int16 aExp, bExp, zExp; 2864 bits64 aSig, bSig, zSig0, zSig1; 2865 2866 aSig = extractFloat64Frac( a ); 2867 aExp = extractFloat64Exp( a ); 2868 aSign = extractFloat64Sign( a ); 2869 bSig = extractFloat64Frac( b ); 2870 bExp = extractFloat64Exp( b ); 2871 bSign = extractFloat64Sign( b ); 2872 zSign = aSign ^ bSign; 2873 if ( aExp == 0x7FF ) { 2874 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 2875 return propagateFloat64NaN( a, b ); 2876 } 2877 if ( ( bExp | bSig ) == 0 ) { 2878 float_raise( float_flag_invalid ); 2879 return float64_default_nan; 2880 } 2881 return packFloat64( zSign, 0x7FF, 0 ); 2882 } 2883 if ( bExp == 0x7FF ) { 2884 if ( bSig ) return propagateFloat64NaN( a, b ); 2885 if ( ( aExp | aSig ) == 0 ) { 2886 float_raise( float_flag_invalid ); 2887 return float64_default_nan; 2888 } 2889 return packFloat64( zSign, 0x7FF, 0 ); 2890 } 2891 if ( aExp == 0 ) { 2892 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 2893 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2894 } 2895 if ( bExp == 0 ) { 2896 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 ); 2897 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2898 } 2899 zExp = aExp + bExp - 0x3FF; 2900 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 2901 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2902 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 2903 zSig0 |= ( zSig1 != 0 ); 2904 if ( 0 <= (sbits64) ( zSig0<<1 ) ) { 2905 zSig0 <<= 1; 2906 --zExp; 2907 } 2908 return roundAndPackFloat64( zSign, zExp, zSig0 ); 2909 2910 } 2911 2912 /* 2913 ------------------------------------------------------------------------------- 2914 Returns the result of dividing the double-precision floating-point value `a' 2915 by the corresponding value `b'. The operation is performed according to 2916 the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2917 ------------------------------------------------------------------------------- 2918 */ 2919 float64 float64_div( float64 a, float64 b ) 2920 { 2921 flag aSign, bSign, zSign; 2922 int16 aExp, bExp, zExp; 2923 bits64 aSig, bSig, zSig; 2924 bits64 rem0, rem1; 2925 bits64 term0, term1; 2926 2927 aSig = extractFloat64Frac( a ); 2928 aExp = extractFloat64Exp( a ); 2929 aSign = extractFloat64Sign( a ); 2930 bSig = extractFloat64Frac( b ); 2931 bExp = extractFloat64Exp( b ); 2932 bSign = extractFloat64Sign( b ); 2933 zSign = aSign ^ bSign; 2934 if ( aExp == 0x7FF ) { 2935 if ( aSig ) return propagateFloat64NaN( a, b ); 2936 if ( bExp == 0x7FF ) { 2937 if ( bSig ) return propagateFloat64NaN( a, b ); 2938 float_raise( float_flag_invalid ); 2939 return float64_default_nan; 2940 } 2941 return packFloat64( zSign, 0x7FF, 0 ); 2942 } 2943 if ( bExp == 0x7FF ) { 2944 if ( bSig ) return propagateFloat64NaN( a, b ); 2945 return packFloat64( zSign, 0, 0 ); 2946 } 2947 if ( bExp == 0 ) { 2948 if ( bSig == 0 ) { 2949 if ( ( aExp | aSig ) == 0 ) { 2950 float_raise( float_flag_invalid ); 2951 return float64_default_nan; 2952 } 2953 float_raise( float_flag_divbyzero ); 2954 return packFloat64( zSign, 0x7FF, 0 ); 2955 } 2956 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2957 } 2958 if ( aExp == 0 ) { 2959 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 2960 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2961 } 2962 zExp = aExp - bExp + 0x3FD; 2963 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 2964 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2965 if ( bSig <= ( aSig + aSig ) ) { 2966 aSig >>= 1; 2967 ++zExp; 2968 } 2969 zSig = estimateDiv128To64( aSig, 0, bSig ); 2970 if ( ( zSig & 0x1FF ) <= 2 ) { 2971 mul64To128( bSig, zSig, &term0, &term1 ); 2972 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 2973 while ( (sbits64) rem0 < 0 ) { 2974 --zSig; 2975 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 2976 } 2977 zSig |= ( rem1 != 0 ); 2978 } 2979 return roundAndPackFloat64( zSign, zExp, zSig ); 2980 2981 } 2982 2983 #ifndef SOFTFLOAT_FOR_GCC 2984 /* 2985 ------------------------------------------------------------------------------- 2986 Returns the remainder of the double-precision floating-point value `a' 2987 with respect to the corresponding value `b'. The operation is performed 2988 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2989 ------------------------------------------------------------------------------- 2990 */ 2991 float64 float64_rem( float64 a, float64 b ) 2992 { 2993 flag aSign, bSign, zSign; 2994 int16 aExp, bExp, expDiff; 2995 bits64 aSig, bSig; 2996 bits64 q, alternateASig; 2997 sbits64 sigMean; 2998 2999 aSig = extractFloat64Frac( a ); 3000 aExp = extractFloat64Exp( a ); 3001 aSign = extractFloat64Sign( a ); 3002 bSig = extractFloat64Frac( b ); 3003 bExp = extractFloat64Exp( b ); 3004 bSign = extractFloat64Sign( b ); 3005 if ( aExp == 0x7FF ) { 3006 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 3007 return propagateFloat64NaN( a, b ); 3008 } 3009 float_raise( float_flag_invalid ); 3010 return float64_default_nan; 3011 } 3012 if ( bExp == 0x7FF ) { 3013 if ( bSig ) return propagateFloat64NaN( a, b ); 3014 return a; 3015 } 3016 if ( bExp == 0 ) { 3017 if ( bSig == 0 ) { 3018 float_raise( float_flag_invalid ); 3019 return float64_default_nan; 3020 } 3021 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 3022 } 3023 if ( aExp == 0 ) { 3024 if ( aSig == 0 ) return a; 3025 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3026 } 3027 expDiff = aExp - bExp; 3028 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 3029 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 3030 if ( expDiff < 0 ) { 3031 if ( expDiff < -1 ) return a; 3032 aSig >>= 1; 3033 } 3034 q = ( bSig <= aSig ); 3035 if ( q ) aSig -= bSig; 3036 expDiff -= 64; 3037 while ( 0 < expDiff ) { 3038 q = estimateDiv128To64( aSig, 0, bSig ); 3039 q = ( 2 < q ) ? q - 2 : 0; 3040 aSig = - ( ( bSig>>2 ) * q ); 3041 expDiff -= 62; 3042 } 3043 expDiff += 64; 3044 if ( 0 < expDiff ) { 3045 q = estimateDiv128To64( aSig, 0, bSig ); 3046 q = ( 2 < q ) ? q - 2 : 0; 3047 q >>= 64 - expDiff; 3048 bSig >>= 2; 3049 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 3050 } 3051 else { 3052 aSig >>= 2; 3053 bSig >>= 2; 3054 } 3055 do { 3056 alternateASig = aSig; 3057 ++q; 3058 aSig -= bSig; 3059 } while ( 0 <= (sbits64) aSig ); 3060 sigMean = aSig + alternateASig; 3061 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 3062 aSig = alternateASig; 3063 } 3064 zSign = ( (sbits64) aSig < 0 ); 3065 if ( zSign ) aSig = - aSig; 3066 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig ); 3067 3068 } 3069 3070 /* 3071 ------------------------------------------------------------------------------- 3072 Returns the square root of the double-precision floating-point value `a'. 3073 The operation is performed according to the IEC/IEEE Standard for Binary 3074 Floating-Point Arithmetic. 3075 ------------------------------------------------------------------------------- 3076 */ 3077 float64 float64_sqrt( float64 a ) 3078 { 3079 flag aSign; 3080 int16 aExp, zExp; 3081 bits64 aSig, zSig, doubleZSig; 3082 bits64 rem0, rem1, term0, term1; 3083 3084 aSig = extractFloat64Frac( a ); 3085 aExp = extractFloat64Exp( a ); 3086 aSign = extractFloat64Sign( a ); 3087 if ( aExp == 0x7FF ) { 3088 if ( aSig ) return propagateFloat64NaN( a, a ); 3089 if ( ! aSign ) return a; 3090 float_raise( float_flag_invalid ); 3091 return float64_default_nan; 3092 } 3093 if ( aSign ) { 3094 if ( ( aExp | aSig ) == 0 ) return a; 3095 float_raise( float_flag_invalid ); 3096 return float64_default_nan; 3097 } 3098 if ( aExp == 0 ) { 3099 if ( aSig == 0 ) return 0; 3100 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3101 } 3102 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; 3103 aSig |= LIT64( 0x0010000000000000 ); 3104 zSig = estimateSqrt32( aExp, aSig>>21 ); 3105 aSig <<= 9 - ( aExp & 1 ); 3106 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 ); 3107 if ( ( zSig & 0x1FF ) <= 5 ) { 3108 doubleZSig = zSig<<1; 3109 mul64To128( zSig, zSig, &term0, &term1 ); 3110 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 3111 while ( (sbits64) rem0 < 0 ) { 3112 --zSig; 3113 doubleZSig -= 2; 3114 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 ); 3115 } 3116 zSig |= ( ( rem0 | rem1 ) != 0 ); 3117 } 3118 return roundAndPackFloat64( 0, zExp, zSig ); 3119 3120 } 3121 #endif 3122 3123 /* 3124 ------------------------------------------------------------------------------- 3125 Returns 1 if the double-precision floating-point value `a' is equal to the 3126 corresponding value `b', and 0 otherwise. The comparison is performed 3127 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3128 ------------------------------------------------------------------------------- 3129 */ 3130 flag float64_eq( float64 a, float64 b ) 3131 { 3132 3133 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3134 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3135 ) { 3136 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3137 float_raise( float_flag_invalid ); 3138 } 3139 return 0; 3140 } 3141 return ( a == b ) || 3142 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 ); 3143 3144 } 3145 3146 /* 3147 ------------------------------------------------------------------------------- 3148 Returns 1 if the double-precision floating-point value `a' is less than or 3149 equal to the corresponding value `b', and 0 otherwise. The comparison is 3150 performed according to the IEC/IEEE Standard for Binary Floating-Point 3151 Arithmetic. 3152 ------------------------------------------------------------------------------- 3153 */ 3154 flag float64_le( float64 a, float64 b ) 3155 { 3156 flag aSign, bSign; 3157 3158 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3159 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3160 ) { 3161 float_raise( float_flag_invalid ); 3162 return 0; 3163 } 3164 aSign = extractFloat64Sign( a ); 3165 bSign = extractFloat64Sign( b ); 3166 if ( aSign != bSign ) 3167 return aSign || 3168 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 3169 0 ); 3170 return ( a == b ) || 3171 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) ); 3172 3173 } 3174 3175 /* 3176 ------------------------------------------------------------------------------- 3177 Returns 1 if the double-precision floating-point value `a' is less than 3178 the corresponding value `b', and 0 otherwise. The comparison is performed 3179 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3180 ------------------------------------------------------------------------------- 3181 */ 3182 flag float64_lt( float64 a, float64 b ) 3183 { 3184 flag aSign, bSign; 3185 3186 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3187 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3188 ) { 3189 float_raise( float_flag_invalid ); 3190 return 0; 3191 } 3192 aSign = extractFloat64Sign( a ); 3193 bSign = extractFloat64Sign( b ); 3194 if ( aSign != bSign ) 3195 return aSign && 3196 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) != 3197 0 ); 3198 return ( a != b ) && 3199 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) ); 3200 3201 } 3202 3203 #ifndef SOFTFLOAT_FOR_GCC 3204 /* 3205 ------------------------------------------------------------------------------- 3206 Returns 1 if the double-precision floating-point value `a' is equal to the 3207 corresponding value `b', and 0 otherwise. The invalid exception is raised 3208 if either operand is a NaN. Otherwise, the comparison is performed 3209 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3210 ------------------------------------------------------------------------------- 3211 */ 3212 flag float64_eq_signaling( float64 a, float64 b ) 3213 { 3214 3215 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3216 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3217 ) { 3218 float_raise( float_flag_invalid ); 3219 return 0; 3220 } 3221 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 ); 3222 3223 } 3224 3225 /* 3226 ------------------------------------------------------------------------------- 3227 Returns 1 if the double-precision floating-point value `a' is less than or 3228 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 3229 cause an exception. Otherwise, the comparison is performed according to the 3230 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3231 ------------------------------------------------------------------------------- 3232 */ 3233 flag float64_le_quiet( float64 a, float64 b ) 3234 { 3235 flag aSign, bSign; 3236 3237 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3238 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3239 ) { 3240 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3241 float_raise( float_flag_invalid ); 3242 } 3243 return 0; 3244 } 3245 aSign = extractFloat64Sign( a ); 3246 bSign = extractFloat64Sign( b ); 3247 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 ); 3248 return ( a == b ) || ( aSign ^ ( a < b ) ); 3249 3250 } 3251 3252 /* 3253 ------------------------------------------------------------------------------- 3254 Returns 1 if the double-precision floating-point value `a' is less than 3255 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3256 exception. Otherwise, the comparison is performed according to the IEC/IEEE 3257 Standard for Binary Floating-Point Arithmetic. 3258 ------------------------------------------------------------------------------- 3259 */ 3260 flag float64_lt_quiet( float64 a, float64 b ) 3261 { 3262 flag aSign, bSign; 3263 3264 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3265 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3266 ) { 3267 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3268 float_raise( float_flag_invalid ); 3269 } 3270 return 0; 3271 } 3272 aSign = extractFloat64Sign( a ); 3273 bSign = extractFloat64Sign( b ); 3274 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 ); 3275 return ( a != b ) && ( aSign ^ ( a < b ) ); 3276 3277 } 3278 #endif 3279 3280 #ifdef FLOATX80 3281 3282 /* 3283 ------------------------------------------------------------------------------- 3284 Returns the result of converting the extended double-precision floating- 3285 point value `a' to the 32-bit two's complement integer format. The 3286 conversion is performed according to the IEC/IEEE Standard for Binary 3287 Floating-Point Arithmetic---which means in particular that the conversion 3288 is rounded according to the current rounding mode. If `a' is a NaN, the 3289 largest positive integer is returned. Otherwise, if the conversion 3290 overflows, the largest integer with the same sign as `a' is returned. 3291 ------------------------------------------------------------------------------- 3292 */ 3293 int32 floatx80_to_int32( floatx80 a ) 3294 { 3295 flag aSign; 3296 int32 aExp, shiftCount; 3297 bits64 aSig; 3298 3299 aSig = extractFloatx80Frac( a ); 3300 aExp = extractFloatx80Exp( a ); 3301 aSign = extractFloatx80Sign( a ); 3302 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; 3303 shiftCount = 0x4037 - aExp; 3304 if ( shiftCount <= 0 ) shiftCount = 1; 3305 shift64RightJamming( aSig, shiftCount, &aSig ); 3306 return roundAndPackInt32( aSign, aSig ); 3307 3308 } 3309 3310 /* 3311 ------------------------------------------------------------------------------- 3312 Returns the result of converting the extended double-precision floating- 3313 point value `a' to the 32-bit two's complement integer format. The 3314 conversion is performed according to the IEC/IEEE Standard for Binary 3315 Floating-Point Arithmetic, except that the conversion is always rounded 3316 toward zero. If `a' is a NaN, the largest positive integer is returned. 3317 Otherwise, if the conversion overflows, the largest integer with the same 3318 sign as `a' is returned. 3319 ------------------------------------------------------------------------------- 3320 */ 3321 int32 floatx80_to_int32_round_to_zero( floatx80 a ) 3322 { 3323 flag aSign; 3324 int32 aExp, shiftCount; 3325 bits64 aSig, savedASig; 3326 int32 z; 3327 3328 aSig = extractFloatx80Frac( a ); 3329 aExp = extractFloatx80Exp( a ); 3330 aSign = extractFloatx80Sign( a ); 3331 if ( 0x401E < aExp ) { 3332 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; 3333 goto invalid; 3334 } 3335 else if ( aExp < 0x3FFF ) { 3336 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact; 3337 return 0; 3338 } 3339 shiftCount = 0x403E - aExp; 3340 savedASig = aSig; 3341 aSig >>= shiftCount; 3342 z = aSig; 3343 if ( aSign ) z = - z; 3344 if ( ( z < 0 ) ^ aSign ) { 3345 invalid: 3346 float_raise( float_flag_invalid ); 3347 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 3348 } 3349 if ( ( aSig<<shiftCount ) != savedASig ) { 3350 float_exception_flags |= float_flag_inexact; 3351 } 3352 return z; 3353 3354 } 3355 3356 /* 3357 ------------------------------------------------------------------------------- 3358 Returns the result of converting the extended double-precision floating- 3359 point value `a' to the 64-bit two's complement integer format. The 3360 conversion is performed according to the IEC/IEEE Standard for Binary 3361 Floating-Point Arithmetic---which means in particular that the conversion 3362 is rounded according to the current rounding mode. If `a' is a NaN, 3363 the largest positive integer is returned. Otherwise, if the conversion 3364 overflows, the largest integer with the same sign as `a' is returned. 3365 ------------------------------------------------------------------------------- 3366 */ 3367 int64 floatx80_to_int64( floatx80 a ) 3368 { 3369 flag aSign; 3370 int32 aExp, shiftCount; 3371 bits64 aSig, aSigExtra; 3372 3373 aSig = extractFloatx80Frac( a ); 3374 aExp = extractFloatx80Exp( a ); 3375 aSign = extractFloatx80Sign( a ); 3376 shiftCount = 0x403E - aExp; 3377 if ( shiftCount <= 0 ) { 3378 if ( shiftCount ) { 3379 float_raise( float_flag_invalid ); 3380 if ( ! aSign 3381 || ( ( aExp == 0x7FFF ) 3382 && ( aSig != LIT64( 0x8000000000000000 ) ) ) 3383 ) { 3384 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3385 } 3386 return (sbits64) LIT64( 0x8000000000000000 ); 3387 } 3388 aSigExtra = 0; 3389 } 3390 else { 3391 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 3392 } 3393 return roundAndPackInt64( aSign, aSig, aSigExtra ); 3394 3395 } 3396 3397 /* 3398 ------------------------------------------------------------------------------- 3399 Returns the result of converting the extended double-precision floating- 3400 point value `a' to the 64-bit two's complement integer format. The 3401 conversion is performed according to the IEC/IEEE Standard for Binary 3402 Floating-Point Arithmetic, except that the conversion is always rounded 3403 toward zero. If `a' is a NaN, the largest positive integer is returned. 3404 Otherwise, if the conversion overflows, the largest integer with the same 3405 sign as `a' is returned. 3406 ------------------------------------------------------------------------------- 3407 */ 3408 int64 floatx80_to_int64_round_to_zero( floatx80 a ) 3409 { 3410 flag aSign; 3411 int32 aExp, shiftCount; 3412 bits64 aSig; 3413 int64 z; 3414 3415 aSig = extractFloatx80Frac( a ); 3416 aExp = extractFloatx80Exp( a ); 3417 aSign = extractFloatx80Sign( a ); 3418 shiftCount = aExp - 0x403E; 3419 if ( 0 <= shiftCount ) { 3420 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 3421 if ( ( a.high != 0xC03E ) || aSig ) { 3422 float_raise( float_flag_invalid ); 3423 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 3424 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3425 } 3426 } 3427 return (sbits64) LIT64( 0x8000000000000000 ); 3428 } 3429 else if ( aExp < 0x3FFF ) { 3430 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact; 3431 return 0; 3432 } 3433 z = aSig>>( - shiftCount ); 3434 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { 3435 float_exception_flags |= float_flag_inexact; 3436 } 3437 if ( aSign ) z = - z; 3438 return z; 3439 3440 } 3441 3442 /* 3443 ------------------------------------------------------------------------------- 3444 Returns the result of converting the extended double-precision floating- 3445 point value `a' to the single-precision floating-point format. The 3446 conversion is performed according to the IEC/IEEE Standard for Binary 3447 Floating-Point Arithmetic. 3448 ------------------------------------------------------------------------------- 3449 */ 3450 float32 floatx80_to_float32( floatx80 a ) 3451 { 3452 flag aSign; 3453 int32 aExp; 3454 bits64 aSig; 3455 3456 aSig = extractFloatx80Frac( a ); 3457 aExp = extractFloatx80Exp( a ); 3458 aSign = extractFloatx80Sign( a ); 3459 if ( aExp == 0x7FFF ) { 3460 if ( (bits64) ( aSig<<1 ) ) { 3461 return commonNaNToFloat32( floatx80ToCommonNaN( a ) ); 3462 } 3463 return packFloat32( aSign, 0xFF, 0 ); 3464 } 3465 shift64RightJamming( aSig, 33, &aSig ); 3466 if ( aExp || aSig ) aExp -= 0x3F81; 3467 return roundAndPackFloat32( aSign, aExp, aSig ); 3468 3469 } 3470 3471 /* 3472 ------------------------------------------------------------------------------- 3473 Returns the result of converting the extended double-precision floating- 3474 point value `a' to the double-precision floating-point format. The 3475 conversion is performed according to the IEC/IEEE Standard for Binary 3476 Floating-Point Arithmetic. 3477 ------------------------------------------------------------------------------- 3478 */ 3479 float64 floatx80_to_float64( floatx80 a ) 3480 { 3481 flag aSign; 3482 int32 aExp; 3483 bits64 aSig, zSig; 3484 3485 aSig = extractFloatx80Frac( a ); 3486 aExp = extractFloatx80Exp( a ); 3487 aSign = extractFloatx80Sign( a ); 3488 if ( aExp == 0x7FFF ) { 3489 if ( (bits64) ( aSig<<1 ) ) { 3490 return commonNaNToFloat64( floatx80ToCommonNaN( a ) ); 3491 } 3492 return packFloat64( aSign, 0x7FF, 0 ); 3493 } 3494 shift64RightJamming( aSig, 1, &zSig ); 3495 if ( aExp || aSig ) aExp -= 0x3C01; 3496 return roundAndPackFloat64( aSign, aExp, zSig ); 3497 3498 } 3499 3500 #ifdef FLOAT128 3501 3502 /* 3503 ------------------------------------------------------------------------------- 3504 Returns the result of converting the extended double-precision floating- 3505 point value `a' to the quadruple-precision floating-point format. The 3506 conversion is performed according to the IEC/IEEE Standard for Binary 3507 Floating-Point Arithmetic. 3508 ------------------------------------------------------------------------------- 3509 */ 3510 float128 floatx80_to_float128( floatx80 a ) 3511 { 3512 flag aSign; 3513 int16 aExp; 3514 bits64 aSig, zSig0, zSig1; 3515 3516 aSig = extractFloatx80Frac( a ); 3517 aExp = extractFloatx80Exp( a ); 3518 aSign = extractFloatx80Sign( a ); 3519 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) { 3520 return commonNaNToFloat128( floatx80ToCommonNaN( a ) ); 3521 } 3522 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 3523 return packFloat128( aSign, aExp, zSig0, zSig1 ); 3524 3525 } 3526 3527 #endif 3528 3529 /* 3530 ------------------------------------------------------------------------------- 3531 Rounds the extended double-precision floating-point value `a' to an integer, 3532 and returns the result as an extended quadruple-precision floating-point 3533 value. The operation is performed according to the IEC/IEEE Standard for 3534 Binary Floating-Point Arithmetic. 3535 ------------------------------------------------------------------------------- 3536 */ 3537 floatx80 floatx80_round_to_int( floatx80 a ) 3538 { 3539 flag aSign; 3540 int32 aExp; 3541 bits64 lastBitMask, roundBitsMask; 3542 int8 roundingMode; 3543 floatx80 z; 3544 3545 aExp = extractFloatx80Exp( a ); 3546 if ( 0x403E <= aExp ) { 3547 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) { 3548 return propagateFloatx80NaN( a, a ); 3549 } 3550 return a; 3551 } 3552 if ( aExp < 0x3FFF ) { 3553 if ( ( aExp == 0 ) 3554 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 3555 return a; 3556 } 3557 float_exception_flags |= float_flag_inexact; 3558 aSign = extractFloatx80Sign( a ); 3559 switch ( float_rounding_mode ) { 3560 case float_round_nearest_even: 3561 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 ) 3562 ) { 3563 return 3564 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 3565 } 3566 break; 3567 case float_round_to_zero: 3568 break; 3569 case float_round_down: 3570 return 3571 aSign ? 3572 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 3573 : packFloatx80( 0, 0, 0 ); 3574 case float_round_up: 3575 return 3576 aSign ? packFloatx80( 1, 0, 0 ) 3577 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 3578 } 3579 return packFloatx80( aSign, 0, 0 ); 3580 } 3581 lastBitMask = 1; 3582 lastBitMask <<= 0x403E - aExp; 3583 roundBitsMask = lastBitMask - 1; 3584 z = a; 3585 roundingMode = float_rounding_mode; 3586 if ( roundingMode == float_round_nearest_even ) { 3587 z.low += lastBitMask>>1; 3588 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 3589 } 3590 else if ( roundingMode != float_round_to_zero ) { 3591 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) { 3592 z.low += roundBitsMask; 3593 } 3594 } 3595 z.low &= ~ roundBitsMask; 3596 if ( z.low == 0 ) { 3597 ++z.high; 3598 z.low = LIT64( 0x8000000000000000 ); 3599 } 3600 if ( z.low != a.low ) float_exception_flags |= float_flag_inexact; 3601 return z; 3602 3603 } 3604 3605 /* 3606 ------------------------------------------------------------------------------- 3607 Returns the result of adding the absolute values of the extended double- 3608 precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 3609 negated before being returned. `zSign' is ignored if the result is a NaN. 3610 The addition is performed according to the IEC/IEEE Standard for Binary 3611 Floating-Point Arithmetic. 3612 ------------------------------------------------------------------------------- 3613 */ 3614 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign ) 3615 { 3616 int32 aExp, bExp, zExp; 3617 bits64 aSig, bSig, zSig0, zSig1; 3618 int32 expDiff; 3619 3620 aSig = extractFloatx80Frac( a ); 3621 aExp = extractFloatx80Exp( a ); 3622 bSig = extractFloatx80Frac( b ); 3623 bExp = extractFloatx80Exp( b ); 3624 expDiff = aExp - bExp; 3625 if ( 0 < expDiff ) { 3626 if ( aExp == 0x7FFF ) { 3627 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3628 return a; 3629 } 3630 if ( bExp == 0 ) --expDiff; 3631 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 3632 zExp = aExp; 3633 } 3634 else if ( expDiff < 0 ) { 3635 if ( bExp == 0x7FFF ) { 3636 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3637 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3638 } 3639 if ( aExp == 0 ) ++expDiff; 3640 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 3641 zExp = bExp; 3642 } 3643 else { 3644 if ( aExp == 0x7FFF ) { 3645 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { 3646 return propagateFloatx80NaN( a, b ); 3647 } 3648 return a; 3649 } 3650 zSig1 = 0; 3651 zSig0 = aSig + bSig; 3652 if ( aExp == 0 ) { 3653 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 3654 goto roundAndPack; 3655 } 3656 zExp = aExp; 3657 goto shiftRight1; 3658 } 3659 zSig0 = aSig + bSig; 3660 if ( (sbits64) zSig0 < 0 ) goto roundAndPack; 3661 shiftRight1: 3662 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 3663 zSig0 |= LIT64( 0x8000000000000000 ); 3664 ++zExp; 3665 roundAndPack: 3666 return 3667 roundAndPackFloatx80( 3668 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3669 3670 } 3671 3672 /* 3673 ------------------------------------------------------------------------------- 3674 Returns the result of subtracting the absolute values of the extended 3675 double-precision floating-point values `a' and `b'. If `zSign' is 1, the 3676 difference is negated before being returned. `zSign' is ignored if the 3677 result is a NaN. The subtraction is performed according to the IEC/IEEE 3678 Standard for Binary Floating-Point Arithmetic. 3679 ------------------------------------------------------------------------------- 3680 */ 3681 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign ) 3682 { 3683 int32 aExp, bExp, zExp; 3684 bits64 aSig, bSig, zSig0, zSig1; 3685 int32 expDiff; 3686 floatx80 z; 3687 3688 aSig = extractFloatx80Frac( a ); 3689 aExp = extractFloatx80Exp( a ); 3690 bSig = extractFloatx80Frac( b ); 3691 bExp = extractFloatx80Exp( b ); 3692 expDiff = aExp - bExp; 3693 if ( 0 < expDiff ) goto aExpBigger; 3694 if ( expDiff < 0 ) goto bExpBigger; 3695 if ( aExp == 0x7FFF ) { 3696 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { 3697 return propagateFloatx80NaN( a, b ); 3698 } 3699 float_raise( float_flag_invalid ); 3700 z.low = floatx80_default_nan_low; 3701 z.high = floatx80_default_nan_high; 3702 return z; 3703 } 3704 if ( aExp == 0 ) { 3705 aExp = 1; 3706 bExp = 1; 3707 } 3708 zSig1 = 0; 3709 if ( bSig < aSig ) goto aBigger; 3710 if ( aSig < bSig ) goto bBigger; 3711 return packFloatx80( float_rounding_mode == float_round_down, 0, 0 ); 3712 bExpBigger: 3713 if ( bExp == 0x7FFF ) { 3714 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3715 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3716 } 3717 if ( aExp == 0 ) ++expDiff; 3718 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 3719 bBigger: 3720 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 3721 zExp = bExp; 3722 zSign ^= 1; 3723 goto normalizeRoundAndPack; 3724 aExpBigger: 3725 if ( aExp == 0x7FFF ) { 3726 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3727 return a; 3728 } 3729 if ( bExp == 0 ) --expDiff; 3730 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 3731 aBigger: 3732 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 3733 zExp = aExp; 3734 normalizeRoundAndPack: 3735 return 3736 normalizeRoundAndPackFloatx80( 3737 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3738 3739 } 3740 3741 /* 3742 ------------------------------------------------------------------------------- 3743 Returns the result of adding the extended double-precision floating-point 3744 values `a' and `b'. The operation is performed according to the IEC/IEEE 3745 Standard for Binary Floating-Point Arithmetic. 3746 ------------------------------------------------------------------------------- 3747 */ 3748 floatx80 floatx80_add( floatx80 a, floatx80 b ) 3749 { 3750 flag aSign, bSign; 3751 3752 aSign = extractFloatx80Sign( a ); 3753 bSign = extractFloatx80Sign( b ); 3754 if ( aSign == bSign ) { 3755 return addFloatx80Sigs( a, b, aSign ); 3756 } 3757 else { 3758 return subFloatx80Sigs( a, b, aSign ); 3759 } 3760 3761 } 3762 3763 /* 3764 ------------------------------------------------------------------------------- 3765 Returns the result of subtracting the extended double-precision floating- 3766 point values `a' and `b'. The operation is performed according to the 3767 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3768 ------------------------------------------------------------------------------- 3769 */ 3770 floatx80 floatx80_sub( floatx80 a, floatx80 b ) 3771 { 3772 flag aSign, bSign; 3773 3774 aSign = extractFloatx80Sign( a ); 3775 bSign = extractFloatx80Sign( b ); 3776 if ( aSign == bSign ) { 3777 return subFloatx80Sigs( a, b, aSign ); 3778 } 3779 else { 3780 return addFloatx80Sigs( a, b, aSign ); 3781 } 3782 3783 } 3784 3785 /* 3786 ------------------------------------------------------------------------------- 3787 Returns the result of multiplying the extended double-precision floating- 3788 point values `a' and `b'. The operation is performed according to the 3789 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3790 ------------------------------------------------------------------------------- 3791 */ 3792 floatx80 floatx80_mul( floatx80 a, floatx80 b ) 3793 { 3794 flag aSign, bSign, zSign; 3795 int32 aExp, bExp, zExp; 3796 bits64 aSig, bSig, zSig0, zSig1; 3797 floatx80 z; 3798 3799 aSig = extractFloatx80Frac( a ); 3800 aExp = extractFloatx80Exp( a ); 3801 aSign = extractFloatx80Sign( a ); 3802 bSig = extractFloatx80Frac( b ); 3803 bExp = extractFloatx80Exp( b ); 3804 bSign = extractFloatx80Sign( b ); 3805 zSign = aSign ^ bSign; 3806 if ( aExp == 0x7FFF ) { 3807 if ( (bits64) ( aSig<<1 ) 3808 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { 3809 return propagateFloatx80NaN( a, b ); 3810 } 3811 if ( ( bExp | bSig ) == 0 ) goto invalid; 3812 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3813 } 3814 if ( bExp == 0x7FFF ) { 3815 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3816 if ( ( aExp | aSig ) == 0 ) { 3817 invalid: 3818 float_raise( float_flag_invalid ); 3819 z.low = floatx80_default_nan_low; 3820 z.high = floatx80_default_nan_high; 3821 return z; 3822 } 3823 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3824 } 3825 if ( aExp == 0 ) { 3826 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3827 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 3828 } 3829 if ( bExp == 0 ) { 3830 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3831 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3832 } 3833 zExp = aExp + bExp - 0x3FFE; 3834 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 3835 if ( 0 < (sbits64) zSig0 ) { 3836 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 3837 --zExp; 3838 } 3839 return 3840 roundAndPackFloatx80( 3841 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3842 3843 } 3844 3845 /* 3846 ------------------------------------------------------------------------------- 3847 Returns the result of dividing the extended double-precision floating-point 3848 value `a' by the corresponding value `b'. The operation is performed 3849 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3850 ------------------------------------------------------------------------------- 3851 */ 3852 floatx80 floatx80_div( floatx80 a, floatx80 b ) 3853 { 3854 flag aSign, bSign, zSign; 3855 int32 aExp, bExp, zExp; 3856 bits64 aSig, bSig, zSig0, zSig1; 3857 bits64 rem0, rem1, rem2, term0, term1, term2; 3858 floatx80 z; 3859 3860 aSig = extractFloatx80Frac( a ); 3861 aExp = extractFloatx80Exp( a ); 3862 aSign = extractFloatx80Sign( a ); 3863 bSig = extractFloatx80Frac( b ); 3864 bExp = extractFloatx80Exp( b ); 3865 bSign = extractFloatx80Sign( b ); 3866 zSign = aSign ^ bSign; 3867 if ( aExp == 0x7FFF ) { 3868 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3869 if ( bExp == 0x7FFF ) { 3870 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3871 goto invalid; 3872 } 3873 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3874 } 3875 if ( bExp == 0x7FFF ) { 3876 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3877 return packFloatx80( zSign, 0, 0 ); 3878 } 3879 if ( bExp == 0 ) { 3880 if ( bSig == 0 ) { 3881 if ( ( aExp | aSig ) == 0 ) { 3882 invalid: 3883 float_raise( float_flag_invalid ); 3884 z.low = floatx80_default_nan_low; 3885 z.high = floatx80_default_nan_high; 3886 return z; 3887 } 3888 float_raise( float_flag_divbyzero ); 3889 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3890 } 3891 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3892 } 3893 if ( aExp == 0 ) { 3894 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3895 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 3896 } 3897 zExp = aExp - bExp + 0x3FFE; 3898 rem1 = 0; 3899 if ( bSig <= aSig ) { 3900 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 3901 ++zExp; 3902 } 3903 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 3904 mul64To128( bSig, zSig0, &term0, &term1 ); 3905 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 3906 while ( (sbits64) rem0 < 0 ) { 3907 --zSig0; 3908 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 3909 } 3910 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 3911 if ( (bits64) ( zSig1<<1 ) <= 8 ) { 3912 mul64To128( bSig, zSig1, &term1, &term2 ); 3913 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 3914 while ( (sbits64) rem1 < 0 ) { 3915 --zSig1; 3916 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 3917 } 3918 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 3919 } 3920 return 3921 roundAndPackFloatx80( 3922 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3923 3924 } 3925 3926 /* 3927 ------------------------------------------------------------------------------- 3928 Returns the remainder of the extended double-precision floating-point value 3929 `a' with respect to the corresponding value `b'. The operation is performed 3930 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3931 ------------------------------------------------------------------------------- 3932 */ 3933 floatx80 floatx80_rem( floatx80 a, floatx80 b ) 3934 { 3935 flag aSign, bSign, zSign; 3936 int32 aExp, bExp, expDiff; 3937 bits64 aSig0, aSig1, bSig; 3938 bits64 q, term0, term1, alternateASig0, alternateASig1; 3939 floatx80 z; 3940 3941 aSig0 = extractFloatx80Frac( a ); 3942 aExp = extractFloatx80Exp( a ); 3943 aSign = extractFloatx80Sign( a ); 3944 bSig = extractFloatx80Frac( b ); 3945 bExp = extractFloatx80Exp( b ); 3946 bSign = extractFloatx80Sign( b ); 3947 if ( aExp == 0x7FFF ) { 3948 if ( (bits64) ( aSig0<<1 ) 3949 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { 3950 return propagateFloatx80NaN( a, b ); 3951 } 3952 goto invalid; 3953 } 3954 if ( bExp == 0x7FFF ) { 3955 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3956 return a; 3957 } 3958 if ( bExp == 0 ) { 3959 if ( bSig == 0 ) { 3960 invalid: 3961 float_raise( float_flag_invalid ); 3962 z.low = floatx80_default_nan_low; 3963 z.high = floatx80_default_nan_high; 3964 return z; 3965 } 3966 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3967 } 3968 if ( aExp == 0 ) { 3969 if ( (bits64) ( aSig0<<1 ) == 0 ) return a; 3970 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 3971 } 3972 bSig |= LIT64( 0x8000000000000000 ); 3973 zSign = aSign; 3974 expDiff = aExp - bExp; 3975 aSig1 = 0; 3976 if ( expDiff < 0 ) { 3977 if ( expDiff < -1 ) return a; 3978 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 3979 expDiff = 0; 3980 } 3981 q = ( bSig <= aSig0 ); 3982 if ( q ) aSig0 -= bSig; 3983 expDiff -= 64; 3984 while ( 0 < expDiff ) { 3985 q = estimateDiv128To64( aSig0, aSig1, bSig ); 3986 q = ( 2 < q ) ? q - 2 : 0; 3987 mul64To128( bSig, q, &term0, &term1 ); 3988 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 3989 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 3990 expDiff -= 62; 3991 } 3992 expDiff += 64; 3993 if ( 0 < expDiff ) { 3994 q = estimateDiv128To64( aSig0, aSig1, bSig ); 3995 q = ( 2 < q ) ? q - 2 : 0; 3996 q >>= 64 - expDiff; 3997 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 3998 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 3999 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 4000 while ( le128( term0, term1, aSig0, aSig1 ) ) { 4001 ++q; 4002 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 4003 } 4004 } 4005 else { 4006 term1 = 0; 4007 term0 = bSig; 4008 } 4009 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 4010 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 4011 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 4012 && ( q & 1 ) ) 4013 ) { 4014 aSig0 = alternateASig0; 4015 aSig1 = alternateASig1; 4016 zSign = ! zSign; 4017 } 4018 return 4019 normalizeRoundAndPackFloatx80( 4020 80, zSign, bExp + expDiff, aSig0, aSig1 ); 4021 4022 } 4023 4024 /* 4025 ------------------------------------------------------------------------------- 4026 Returns the square root of the extended double-precision floating-point 4027 value `a'. The operation is performed according to the IEC/IEEE Standard 4028 for Binary Floating-Point Arithmetic. 4029 ------------------------------------------------------------------------------- 4030 */ 4031 floatx80 floatx80_sqrt( floatx80 a ) 4032 { 4033 flag aSign; 4034 int32 aExp, zExp; 4035 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0; 4036 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 4037 floatx80 z; 4038 4039 aSig0 = extractFloatx80Frac( a ); 4040 aExp = extractFloatx80Exp( a ); 4041 aSign = extractFloatx80Sign( a ); 4042 if ( aExp == 0x7FFF ) { 4043 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a ); 4044 if ( ! aSign ) return a; 4045 goto invalid; 4046 } 4047 if ( aSign ) { 4048 if ( ( aExp | aSig0 ) == 0 ) return a; 4049 invalid: 4050 float_raise( float_flag_invalid ); 4051 z.low = floatx80_default_nan_low; 4052 z.high = floatx80_default_nan_high; 4053 return z; 4054 } 4055 if ( aExp == 0 ) { 4056 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 4057 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 4058 } 4059 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 4060 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 4061 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 4062 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 4063 doubleZSig0 = zSig0<<1; 4064 mul64To128( zSig0, zSig0, &term0, &term1 ); 4065 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 4066 while ( (sbits64) rem0 < 0 ) { 4067 --zSig0; 4068 doubleZSig0 -= 2; 4069 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 4070 } 4071 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 4072 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 4073 if ( zSig1 == 0 ) zSig1 = 1; 4074 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 4075 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 4076 mul64To128( zSig1, zSig1, &term2, &term3 ); 4077 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 4078 while ( (sbits64) rem1 < 0 ) { 4079 --zSig1; 4080 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 4081 term3 |= 1; 4082 term2 |= doubleZSig0; 4083 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 4084 } 4085 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 4086 } 4087 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 4088 zSig0 |= doubleZSig0; 4089 return 4090 roundAndPackFloatx80( 4091 floatx80_rounding_precision, 0, zExp, zSig0, zSig1 ); 4092 4093 } 4094 4095 /* 4096 ------------------------------------------------------------------------------- 4097 Returns 1 if the extended double-precision floating-point value `a' is 4098 equal to the corresponding value `b', and 0 otherwise. The comparison is 4099 performed according to the IEC/IEEE Standard for Binary Floating-Point 4100 Arithmetic. 4101 ------------------------------------------------------------------------------- 4102 */ 4103 flag floatx80_eq( floatx80 a, floatx80 b ) 4104 { 4105 4106 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4107 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4108 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4109 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4110 ) { 4111 if ( floatx80_is_signaling_nan( a ) 4112 || floatx80_is_signaling_nan( b ) ) { 4113 float_raise( float_flag_invalid ); 4114 } 4115 return 0; 4116 } 4117 return 4118 ( a.low == b.low ) 4119 && ( ( a.high == b.high ) 4120 || ( ( a.low == 0 ) 4121 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) 4122 ); 4123 4124 } 4125 4126 /* 4127 ------------------------------------------------------------------------------- 4128 Returns 1 if the extended double-precision floating-point value `a' is 4129 less than or equal to the corresponding value `b', and 0 otherwise. The 4130 comparison is performed according to the IEC/IEEE Standard for Binary 4131 Floating-Point Arithmetic. 4132 ------------------------------------------------------------------------------- 4133 */ 4134 flag floatx80_le( floatx80 a, floatx80 b ) 4135 { 4136 flag aSign, bSign; 4137 4138 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4139 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4140 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4141 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4142 ) { 4143 float_raise( float_flag_invalid ); 4144 return 0; 4145 } 4146 aSign = extractFloatx80Sign( a ); 4147 bSign = extractFloatx80Sign( b ); 4148 if ( aSign != bSign ) { 4149 return 4150 aSign 4151 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4152 == 0 ); 4153 } 4154 return 4155 aSign ? le128( b.high, b.low, a.high, a.low ) 4156 : le128( a.high, a.low, b.high, b.low ); 4157 4158 } 4159 4160 /* 4161 ------------------------------------------------------------------------------- 4162 Returns 1 if the extended double-precision floating-point value `a' is 4163 less than the corresponding value `b', and 0 otherwise. The comparison 4164 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4165 Arithmetic. 4166 ------------------------------------------------------------------------------- 4167 */ 4168 flag floatx80_lt( floatx80 a, floatx80 b ) 4169 { 4170 flag aSign, bSign; 4171 4172 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4173 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4174 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4175 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4176 ) { 4177 float_raise( float_flag_invalid ); 4178 return 0; 4179 } 4180 aSign = extractFloatx80Sign( a ); 4181 bSign = extractFloatx80Sign( b ); 4182 if ( aSign != bSign ) { 4183 return 4184 aSign 4185 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4186 != 0 ); 4187 } 4188 return 4189 aSign ? lt128( b.high, b.low, a.high, a.low ) 4190 : lt128( a.high, a.low, b.high, b.low ); 4191 4192 } 4193 4194 /* 4195 ------------------------------------------------------------------------------- 4196 Returns 1 if the extended double-precision floating-point value `a' is equal 4197 to the corresponding value `b', and 0 otherwise. The invalid exception is 4198 raised if either operand is a NaN. Otherwise, the comparison is performed 4199 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4200 ------------------------------------------------------------------------------- 4201 */ 4202 flag floatx80_eq_signaling( floatx80 a, floatx80 b ) 4203 { 4204 4205 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4206 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4207 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4208 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4209 ) { 4210 float_raise( float_flag_invalid ); 4211 return 0; 4212 } 4213 return 4214 ( a.low == b.low ) 4215 && ( ( a.high == b.high ) 4216 || ( ( a.low == 0 ) 4217 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) 4218 ); 4219 4220 } 4221 4222 /* 4223 ------------------------------------------------------------------------------- 4224 Returns 1 if the extended double-precision floating-point value `a' is less 4225 than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs 4226 do not cause an exception. Otherwise, the comparison is performed according 4227 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4228 ------------------------------------------------------------------------------- 4229 */ 4230 flag floatx80_le_quiet( floatx80 a, floatx80 b ) 4231 { 4232 flag aSign, bSign; 4233 4234 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4235 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4236 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4237 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4238 ) { 4239 if ( floatx80_is_signaling_nan( a ) 4240 || floatx80_is_signaling_nan( b ) ) { 4241 float_raise( float_flag_invalid ); 4242 } 4243 return 0; 4244 } 4245 aSign = extractFloatx80Sign( a ); 4246 bSign = extractFloatx80Sign( b ); 4247 if ( aSign != bSign ) { 4248 return 4249 aSign 4250 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4251 == 0 ); 4252 } 4253 return 4254 aSign ? le128( b.high, b.low, a.high, a.low ) 4255 : le128( a.high, a.low, b.high, b.low ); 4256 4257 } 4258 4259 /* 4260 ------------------------------------------------------------------------------- 4261 Returns 1 if the extended double-precision floating-point value `a' is less 4262 than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause 4263 an exception. Otherwise, the comparison is performed according to the 4264 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4265 ------------------------------------------------------------------------------- 4266 */ 4267 flag floatx80_lt_quiet( floatx80 a, floatx80 b ) 4268 { 4269 flag aSign, bSign; 4270 4271 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4272 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4273 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4274 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4275 ) { 4276 if ( floatx80_is_signaling_nan( a ) 4277 || floatx80_is_signaling_nan( b ) ) { 4278 float_raise( float_flag_invalid ); 4279 } 4280 return 0; 4281 } 4282 aSign = extractFloatx80Sign( a ); 4283 bSign = extractFloatx80Sign( b ); 4284 if ( aSign != bSign ) { 4285 return 4286 aSign 4287 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4288 != 0 ); 4289 } 4290 return 4291 aSign ? lt128( b.high, b.low, a.high, a.low ) 4292 : lt128( a.high, a.low, b.high, b.low ); 4293 4294 } 4295 4296 #endif 4297 4298 #ifdef FLOAT128 4299 4300 /* 4301 ------------------------------------------------------------------------------- 4302 Returns the result of converting the quadruple-precision floating-point 4303 value `a' to the 32-bit two's complement integer format. The conversion 4304 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4305 Arithmetic---which means in particular that the conversion is rounded 4306 according to the current rounding mode. If `a' is a NaN, the largest 4307 positive integer is returned. Otherwise, if the conversion overflows, the 4308 largest integer with the same sign as `a' is returned. 4309 ------------------------------------------------------------------------------- 4310 */ 4311 int32 float128_to_int32( float128 a ) 4312 { 4313 flag aSign; 4314 int32 aExp, shiftCount; 4315 bits64 aSig0, aSig1; 4316 4317 aSig1 = extractFloat128Frac1( a ); 4318 aSig0 = extractFloat128Frac0( a ); 4319 aExp = extractFloat128Exp( a ); 4320 aSign = extractFloat128Sign( a ); 4321 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 4322 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4323 aSig0 |= ( aSig1 != 0 ); 4324 shiftCount = 0x4028 - aExp; 4325 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 4326 return roundAndPackInt32( aSign, aSig0 ); 4327 4328 } 4329 4330 /* 4331 ------------------------------------------------------------------------------- 4332 Returns the result of converting the quadruple-precision floating-point 4333 value `a' to the 32-bit two's complement integer format. The conversion 4334 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4335 Arithmetic, except that the conversion is always rounded toward zero. If 4336 `a' is a NaN, the largest positive integer is returned. Otherwise, if the 4337 conversion overflows, the largest integer with the same sign as `a' is 4338 returned. 4339 ------------------------------------------------------------------------------- 4340 */ 4341 int32 float128_to_int32_round_to_zero( float128 a ) 4342 { 4343 flag aSign; 4344 int32 aExp, shiftCount; 4345 bits64 aSig0, aSig1, savedASig; 4346 int32 z; 4347 4348 aSig1 = extractFloat128Frac1( a ); 4349 aSig0 = extractFloat128Frac0( a ); 4350 aExp = extractFloat128Exp( a ); 4351 aSign = extractFloat128Sign( a ); 4352 aSig0 |= ( aSig1 != 0 ); 4353 if ( 0x401E < aExp ) { 4354 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 4355 goto invalid; 4356 } 4357 else if ( aExp < 0x3FFF ) { 4358 if ( aExp || aSig0 ) float_exception_flags |= float_flag_inexact; 4359 return 0; 4360 } 4361 aSig0 |= LIT64( 0x0001000000000000 ); 4362 shiftCount = 0x402F - aExp; 4363 savedASig = aSig0; 4364 aSig0 >>= shiftCount; 4365 z = aSig0; 4366 if ( aSign ) z = - z; 4367 if ( ( z < 0 ) ^ aSign ) { 4368 invalid: 4369 float_raise( float_flag_invalid ); 4370 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 4371 } 4372 if ( ( aSig0<<shiftCount ) != savedASig ) { 4373 float_exception_flags |= float_flag_inexact; 4374 } 4375 return z; 4376 4377 } 4378 4379 /* 4380 ------------------------------------------------------------------------------- 4381 Returns the result of converting the quadruple-precision floating-point 4382 value `a' to the 64-bit two's complement integer format. The conversion 4383 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4384 Arithmetic---which means in particular that the conversion is rounded 4385 according to the current rounding mode. If `a' is a NaN, the largest 4386 positive integer is returned. Otherwise, if the conversion overflows, the 4387 largest integer with the same sign as `a' is returned. 4388 ------------------------------------------------------------------------------- 4389 */ 4390 int64 float128_to_int64( float128 a ) 4391 { 4392 flag aSign; 4393 int32 aExp, shiftCount; 4394 bits64 aSig0, aSig1; 4395 4396 aSig1 = extractFloat128Frac1( a ); 4397 aSig0 = extractFloat128Frac0( a ); 4398 aExp = extractFloat128Exp( a ); 4399 aSign = extractFloat128Sign( a ); 4400 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4401 shiftCount = 0x402F - aExp; 4402 if ( shiftCount <= 0 ) { 4403 if ( 0x403E < aExp ) { 4404 float_raise( float_flag_invalid ); 4405 if ( ! aSign 4406 || ( ( aExp == 0x7FFF ) 4407 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 4408 ) 4409 ) { 4410 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4411 } 4412 return (sbits64) LIT64( 0x8000000000000000 ); 4413 } 4414 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 4415 } 4416 else { 4417 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 4418 } 4419 return roundAndPackInt64( aSign, aSig0, aSig1 ); 4420 4421 } 4422 4423 /* 4424 ------------------------------------------------------------------------------- 4425 Returns the result of converting the quadruple-precision floating-point 4426 value `a' to the 64-bit two's complement integer format. The conversion 4427 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4428 Arithmetic, except that the conversion is always rounded toward zero. 4429 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 4430 the conversion overflows, the largest integer with the same sign as `a' is 4431 returned. 4432 ------------------------------------------------------------------------------- 4433 */ 4434 int64 float128_to_int64_round_to_zero( float128 a ) 4435 { 4436 flag aSign; 4437 int32 aExp, shiftCount; 4438 bits64 aSig0, aSig1; 4439 int64 z; 4440 4441 aSig1 = extractFloat128Frac1( a ); 4442 aSig0 = extractFloat128Frac0( a ); 4443 aExp = extractFloat128Exp( a ); 4444 aSign = extractFloat128Sign( a ); 4445 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4446 shiftCount = aExp - 0x402F; 4447 if ( 0 < shiftCount ) { 4448 if ( 0x403E <= aExp ) { 4449 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 4450 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 4451 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 4452 if ( aSig1 ) float_exception_flags |= float_flag_inexact; 4453 } 4454 else { 4455 float_raise( float_flag_invalid ); 4456 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 4457 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4458 } 4459 } 4460 return (sbits64) LIT64( 0x8000000000000000 ); 4461 } 4462 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 4463 if ( (bits64) ( aSig1<<shiftCount ) ) { 4464 float_exception_flags |= float_flag_inexact; 4465 } 4466 } 4467 else { 4468 if ( aExp < 0x3FFF ) { 4469 if ( aExp | aSig0 | aSig1 ) { 4470 float_exception_flags |= float_flag_inexact; 4471 } 4472 return 0; 4473 } 4474 z = aSig0>>( - shiftCount ); 4475 if ( aSig1 4476 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) { 4477 float_exception_flags |= float_flag_inexact; 4478 } 4479 } 4480 if ( aSign ) z = - z; 4481 return z; 4482 4483 } 4484 4485 #if (defined(SOFTFLOATSPARC64_FOR_GCC) || defined(SOFTFLOAT_FOR_GCC)) \ 4486 && defined(SOFTFLOAT_NEED_FIXUNS) 4487 /* 4488 * just like above - but do not care for overflow of signed results 4489 */ 4490 uint64 float128_to_uint64_round_to_zero( float128 a ) 4491 { 4492 flag aSign; 4493 int32 aExp, shiftCount; 4494 bits64 aSig0, aSig1; 4495 uint64 z; 4496 4497 aSig1 = extractFloat128Frac1( a ); 4498 aSig0 = extractFloat128Frac0( a ); 4499 aExp = extractFloat128Exp( a ); 4500 aSign = extractFloat128Sign( a ); 4501 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4502 shiftCount = aExp - 0x402F; 4503 if ( 0 < shiftCount ) { 4504 if ( 0x403F <= aExp ) { 4505 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 4506 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 4507 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 4508 if ( aSig1 ) float_exception_flags |= float_flag_inexact; 4509 } 4510 else { 4511 float_raise( float_flag_invalid ); 4512 } 4513 return LIT64( 0xFFFFFFFFFFFFFFFF ); 4514 } 4515 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 4516 if ( (bits64) ( aSig1<<shiftCount ) ) { 4517 float_exception_flags |= float_flag_inexact; 4518 } 4519 } 4520 else { 4521 if ( aExp < 0x3FFF ) { 4522 if ( aExp | aSig0 | aSig1 ) { 4523 float_exception_flags |= float_flag_inexact; 4524 } 4525 return 0; 4526 } 4527 z = aSig0>>( - shiftCount ); 4528 if (aSig1 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) { 4529 float_exception_flags |= float_flag_inexact; 4530 } 4531 } 4532 if ( aSign ) z = - z; 4533 return z; 4534 4535 } 4536 #endif /* (SOFTFLOATSPARC64_FOR_GCC || SOFTFLOAT_FOR_GCC) && SOFTFLOAT_NEED_FIXUNS */ 4537 4538 /* 4539 ------------------------------------------------------------------------------- 4540 Returns the result of converting the quadruple-precision floating-point 4541 value `a' to the single-precision floating-point format. The conversion 4542 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4543 Arithmetic. 4544 ------------------------------------------------------------------------------- 4545 */ 4546 float32 float128_to_float32( float128 a ) 4547 { 4548 flag aSign; 4549 int32 aExp; 4550 bits64 aSig0, aSig1; 4551 bits32 zSig; 4552 4553 aSig1 = extractFloat128Frac1( a ); 4554 aSig0 = extractFloat128Frac0( a ); 4555 aExp = extractFloat128Exp( a ); 4556 aSign = extractFloat128Sign( a ); 4557 if ( aExp == 0x7FFF ) { 4558 if ( aSig0 | aSig1 ) { 4559 return commonNaNToFloat32( float128ToCommonNaN( a ) ); 4560 } 4561 return packFloat32( aSign, 0xFF, 0 ); 4562 } 4563 aSig0 |= ( aSig1 != 0 ); 4564 shift64RightJamming( aSig0, 18, &aSig0 ); 4565 zSig = aSig0; 4566 if ( aExp || zSig ) { 4567 zSig |= 0x40000000; 4568 aExp -= 0x3F81; 4569 } 4570 return roundAndPackFloat32( aSign, aExp, zSig ); 4571 4572 } 4573 4574 /* 4575 ------------------------------------------------------------------------------- 4576 Returns the result of converting the quadruple-precision floating-point 4577 value `a' to the double-precision floating-point format. The conversion 4578 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4579 Arithmetic. 4580 ------------------------------------------------------------------------------- 4581 */ 4582 float64 float128_to_float64( float128 a ) 4583 { 4584 flag aSign; 4585 int32 aExp; 4586 bits64 aSig0, aSig1; 4587 4588 aSig1 = extractFloat128Frac1( a ); 4589 aSig0 = extractFloat128Frac0( a ); 4590 aExp = extractFloat128Exp( a ); 4591 aSign = extractFloat128Sign( a ); 4592 if ( aExp == 0x7FFF ) { 4593 if ( aSig0 | aSig1 ) { 4594 return commonNaNToFloat64( float128ToCommonNaN( a ) ); 4595 } 4596 return packFloat64( aSign, 0x7FF, 0 ); 4597 } 4598 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 4599 aSig0 |= ( aSig1 != 0 ); 4600 if ( aExp || aSig0 ) { 4601 aSig0 |= LIT64( 0x4000000000000000 ); 4602 aExp -= 0x3C01; 4603 } 4604 return roundAndPackFloat64( aSign, aExp, aSig0 ); 4605 4606 } 4607 4608 #ifdef FLOATX80 4609 4610 /* 4611 ------------------------------------------------------------------------------- 4612 Returns the result of converting the quadruple-precision floating-point 4613 value `a' to the extended double-precision floating-point format. The 4614 conversion is performed according to the IEC/IEEE Standard for Binary 4615 Floating-Point Arithmetic. 4616 ------------------------------------------------------------------------------- 4617 */ 4618 floatx80 float128_to_floatx80( float128 a ) 4619 { 4620 flag aSign; 4621 int32 aExp; 4622 bits64 aSig0, aSig1; 4623 4624 aSig1 = extractFloat128Frac1( a ); 4625 aSig0 = extractFloat128Frac0( a ); 4626 aExp = extractFloat128Exp( a ); 4627 aSign = extractFloat128Sign( a ); 4628 if ( aExp == 0x7FFF ) { 4629 if ( aSig0 | aSig1 ) { 4630 return commonNaNToFloatx80( float128ToCommonNaN( a ) ); 4631 } 4632 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4633 } 4634 if ( aExp == 0 ) { 4635 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 4636 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 4637 } 4638 else { 4639 aSig0 |= LIT64( 0x0001000000000000 ); 4640 } 4641 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 4642 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 ); 4643 4644 } 4645 4646 #endif 4647 4648 /* 4649 ------------------------------------------------------------------------------- 4650 Rounds the quadruple-precision floating-point value `a' to an integer, and 4651 returns the result as a quadruple-precision floating-point value. The 4652 operation is performed according to the IEC/IEEE Standard for Binary 4653 Floating-Point Arithmetic. 4654 ------------------------------------------------------------------------------- 4655 */ 4656 float128 float128_round_to_int( float128 a ) 4657 { 4658 flag aSign; 4659 int32 aExp; 4660 bits64 lastBitMask, roundBitsMask; 4661 int8 roundingMode; 4662 float128 z; 4663 4664 aExp = extractFloat128Exp( a ); 4665 if ( 0x402F <= aExp ) { 4666 if ( 0x406F <= aExp ) { 4667 if ( ( aExp == 0x7FFF ) 4668 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 4669 ) { 4670 return propagateFloat128NaN( a, a ); 4671 } 4672 return a; 4673 } 4674 lastBitMask = 1; 4675 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 4676 roundBitsMask = lastBitMask - 1; 4677 z = a; 4678 roundingMode = float_rounding_mode; 4679 if ( roundingMode == float_round_nearest_even ) { 4680 if ( lastBitMask ) { 4681 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 4682 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 4683 } 4684 else { 4685 if ( (sbits64) z.low < 0 ) { 4686 ++z.high; 4687 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1; 4688 } 4689 } 4690 } 4691 else if ( roundingMode != float_round_to_zero ) { 4692 if ( extractFloat128Sign( z ) 4693 ^ ( roundingMode == float_round_up ) ) { 4694 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low ); 4695 } 4696 } 4697 z.low &= ~ roundBitsMask; 4698 } 4699 else { 4700 if ( aExp < 0x3FFF ) { 4701 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 4702 float_exception_flags |= float_flag_inexact; 4703 aSign = extractFloat128Sign( a ); 4704 switch ( float_rounding_mode ) { 4705 case float_round_nearest_even: 4706 if ( ( aExp == 0x3FFE ) 4707 && ( extractFloat128Frac0( a ) 4708 | extractFloat128Frac1( a ) ) 4709 ) { 4710 return packFloat128( aSign, 0x3FFF, 0, 0 ); 4711 } 4712 break; 4713 case float_round_to_zero: 4714 break; 4715 case float_round_down: 4716 return 4717 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 4718 : packFloat128( 0, 0, 0, 0 ); 4719 case float_round_up: 4720 return 4721 aSign ? packFloat128( 1, 0, 0, 0 ) 4722 : packFloat128( 0, 0x3FFF, 0, 0 ); 4723 } 4724 return packFloat128( aSign, 0, 0, 0 ); 4725 } 4726 lastBitMask = 1; 4727 lastBitMask <<= 0x402F - aExp; 4728 roundBitsMask = lastBitMask - 1; 4729 z.low = 0; 4730 z.high = a.high; 4731 roundingMode = float_rounding_mode; 4732 if ( roundingMode == float_round_nearest_even ) { 4733 z.high += lastBitMask>>1; 4734 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 4735 z.high &= ~ lastBitMask; 4736 } 4737 } 4738 else if ( roundingMode != float_round_to_zero ) { 4739 if ( extractFloat128Sign( z ) 4740 ^ ( roundingMode == float_round_up ) ) { 4741 z.high |= ( a.low != 0 ); 4742 z.high += roundBitsMask; 4743 } 4744 } 4745 z.high &= ~ roundBitsMask; 4746 } 4747 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 4748 float_exception_flags |= float_flag_inexact; 4749 } 4750 return z; 4751 4752 } 4753 4754 /* 4755 ------------------------------------------------------------------------------- 4756 Returns the result of adding the absolute values of the quadruple-precision 4757 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 4758 before being returned. `zSign' is ignored if the result is a NaN. 4759 The addition is performed according to the IEC/IEEE Standard for Binary 4760 Floating-Point Arithmetic. 4761 ------------------------------------------------------------------------------- 4762 */ 4763 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign ) 4764 { 4765 int32 aExp, bExp, zExp; 4766 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 4767 int32 expDiff; 4768 4769 aSig1 = extractFloat128Frac1( a ); 4770 aSig0 = extractFloat128Frac0( a ); 4771 aExp = extractFloat128Exp( a ); 4772 bSig1 = extractFloat128Frac1( b ); 4773 bSig0 = extractFloat128Frac0( b ); 4774 bExp = extractFloat128Exp( b ); 4775 expDiff = aExp - bExp; 4776 if ( 0 < expDiff ) { 4777 if ( aExp == 0x7FFF ) { 4778 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 4779 return a; 4780 } 4781 if ( bExp == 0 ) { 4782 --expDiff; 4783 } 4784 else { 4785 bSig0 |= LIT64( 0x0001000000000000 ); 4786 } 4787 shift128ExtraRightJamming( 4788 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 4789 zExp = aExp; 4790 } 4791 else if ( expDiff < 0 ) { 4792 if ( bExp == 0x7FFF ) { 4793 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4794 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4795 } 4796 if ( aExp == 0 ) { 4797 ++expDiff; 4798 } 4799 else { 4800 aSig0 |= LIT64( 0x0001000000000000 ); 4801 } 4802 shift128ExtraRightJamming( 4803 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 4804 zExp = bExp; 4805 } 4806 else { 4807 if ( aExp == 0x7FFF ) { 4808 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 4809 return propagateFloat128NaN( a, b ); 4810 } 4811 return a; 4812 } 4813 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4814 if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 ); 4815 zSig2 = 0; 4816 zSig0 |= LIT64( 0x0002000000000000 ); 4817 zExp = aExp; 4818 goto shiftRight1; 4819 } 4820 aSig0 |= LIT64( 0x0001000000000000 ); 4821 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4822 --zExp; 4823 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; 4824 ++zExp; 4825 shiftRight1: 4826 shift128ExtraRightJamming( 4827 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 4828 roundAndPack: 4829 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 4830 4831 } 4832 4833 /* 4834 ------------------------------------------------------------------------------- 4835 Returns the result of subtracting the absolute values of the quadruple- 4836 precision floating-point values `a' and `b'. If `zSign' is 1, the 4837 difference is negated before being returned. `zSign' is ignored if the 4838 result is a NaN. The subtraction is performed according to the IEC/IEEE 4839 Standard for Binary Floating-Point Arithmetic. 4840 ------------------------------------------------------------------------------- 4841 */ 4842 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign ) 4843 { 4844 int32 aExp, bExp, zExp; 4845 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 4846 int32 expDiff; 4847 float128 z; 4848 4849 aSig1 = extractFloat128Frac1( a ); 4850 aSig0 = extractFloat128Frac0( a ); 4851 aExp = extractFloat128Exp( a ); 4852 bSig1 = extractFloat128Frac1( b ); 4853 bSig0 = extractFloat128Frac0( b ); 4854 bExp = extractFloat128Exp( b ); 4855 expDiff = aExp - bExp; 4856 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 4857 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 4858 if ( 0 < expDiff ) goto aExpBigger; 4859 if ( expDiff < 0 ) goto bExpBigger; 4860 if ( aExp == 0x7FFF ) { 4861 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 4862 return propagateFloat128NaN( a, b ); 4863 } 4864 float_raise( float_flag_invalid ); 4865 z.low = float128_default_nan_low; 4866 z.high = float128_default_nan_high; 4867 return z; 4868 } 4869 if ( aExp == 0 ) { 4870 aExp = 1; 4871 bExp = 1; 4872 } 4873 if ( bSig0 < aSig0 ) goto aBigger; 4874 if ( aSig0 < bSig0 ) goto bBigger; 4875 if ( bSig1 < aSig1 ) goto aBigger; 4876 if ( aSig1 < bSig1 ) goto bBigger; 4877 return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 ); 4878 bExpBigger: 4879 if ( bExp == 0x7FFF ) { 4880 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4881 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 4882 } 4883 if ( aExp == 0 ) { 4884 ++expDiff; 4885 } 4886 else { 4887 aSig0 |= LIT64( 0x4000000000000000 ); 4888 } 4889 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 4890 bSig0 |= LIT64( 0x4000000000000000 ); 4891 bBigger: 4892 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 4893 zExp = bExp; 4894 zSign ^= 1; 4895 goto normalizeRoundAndPack; 4896 aExpBigger: 4897 if ( aExp == 0x7FFF ) { 4898 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 4899 return a; 4900 } 4901 if ( bExp == 0 ) { 4902 --expDiff; 4903 } 4904 else { 4905 bSig0 |= LIT64( 0x4000000000000000 ); 4906 } 4907 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 4908 aSig0 |= LIT64( 0x4000000000000000 ); 4909 aBigger: 4910 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4911 zExp = aExp; 4912 normalizeRoundAndPack: 4913 --zExp; 4914 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 ); 4915 4916 } 4917 4918 /* 4919 ------------------------------------------------------------------------------- 4920 Returns the result of adding the quadruple-precision floating-point values 4921 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 4922 for Binary Floating-Point Arithmetic. 4923 ------------------------------------------------------------------------------- 4924 */ 4925 float128 float128_add( float128 a, float128 b ) 4926 { 4927 flag aSign, bSign; 4928 4929 aSign = extractFloat128Sign( a ); 4930 bSign = extractFloat128Sign( b ); 4931 if ( aSign == bSign ) { 4932 return addFloat128Sigs( a, b, aSign ); 4933 } 4934 else { 4935 return subFloat128Sigs( a, b, aSign ); 4936 } 4937 4938 } 4939 4940 /* 4941 ------------------------------------------------------------------------------- 4942 Returns the result of subtracting the quadruple-precision floating-point 4943 values `a' and `b'. The operation is performed according to the IEC/IEEE 4944 Standard for Binary Floating-Point Arithmetic. 4945 ------------------------------------------------------------------------------- 4946 */ 4947 float128 float128_sub( float128 a, float128 b ) 4948 { 4949 flag aSign, bSign; 4950 4951 aSign = extractFloat128Sign( a ); 4952 bSign = extractFloat128Sign( b ); 4953 if ( aSign == bSign ) { 4954 return subFloat128Sigs( a, b, aSign ); 4955 } 4956 else { 4957 return addFloat128Sigs( a, b, aSign ); 4958 } 4959 4960 } 4961 4962 /* 4963 ------------------------------------------------------------------------------- 4964 Returns the result of multiplying the quadruple-precision floating-point 4965 values `a' and `b'. The operation is performed according to the IEC/IEEE 4966 Standard for Binary Floating-Point Arithmetic. 4967 ------------------------------------------------------------------------------- 4968 */ 4969 float128 float128_mul( float128 a, float128 b ) 4970 { 4971 flag aSign, bSign, zSign; 4972 int32 aExp, bExp, zExp; 4973 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 4974 float128 z; 4975 4976 aSig1 = extractFloat128Frac1( a ); 4977 aSig0 = extractFloat128Frac0( a ); 4978 aExp = extractFloat128Exp( a ); 4979 aSign = extractFloat128Sign( a ); 4980 bSig1 = extractFloat128Frac1( b ); 4981 bSig0 = extractFloat128Frac0( b ); 4982 bExp = extractFloat128Exp( b ); 4983 bSign = extractFloat128Sign( b ); 4984 zSign = aSign ^ bSign; 4985 if ( aExp == 0x7FFF ) { 4986 if ( ( aSig0 | aSig1 ) 4987 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 4988 return propagateFloat128NaN( a, b ); 4989 } 4990 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 4991 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4992 } 4993 if ( bExp == 0x7FFF ) { 4994 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4995 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 4996 invalid: 4997 float_raise( float_flag_invalid ); 4998 z.low = float128_default_nan_low; 4999 z.high = float128_default_nan_high; 5000 return z; 5001 } 5002 return packFloat128( zSign, 0x7FFF, 0, 0 ); 5003 } 5004 if ( aExp == 0 ) { 5005 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 5006 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5007 } 5008 if ( bExp == 0 ) { 5009 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 5010 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 5011 } 5012 zExp = aExp + bExp - 0x4000; 5013 aSig0 |= LIT64( 0x0001000000000000 ); 5014 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 5015 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 5016 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 5017 zSig2 |= ( zSig3 != 0 ); 5018 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { 5019 shift128ExtraRightJamming( 5020 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 5021 ++zExp; 5022 } 5023 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 5024 5025 } 5026 5027 /* 5028 ------------------------------------------------------------------------------- 5029 Returns the result of dividing the quadruple-precision floating-point value 5030 `a' by the corresponding value `b'. The operation is performed according to 5031 the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5032 ------------------------------------------------------------------------------- 5033 */ 5034 float128 float128_div( float128 a, float128 b ) 5035 { 5036 flag aSign, bSign, zSign; 5037 int32 aExp, bExp, zExp; 5038 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 5039 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5040 float128 z; 5041 5042 aSig1 = extractFloat128Frac1( a ); 5043 aSig0 = extractFloat128Frac0( a ); 5044 aExp = extractFloat128Exp( a ); 5045 aSign = extractFloat128Sign( a ); 5046 bSig1 = extractFloat128Frac1( b ); 5047 bSig0 = extractFloat128Frac0( b ); 5048 bExp = extractFloat128Exp( b ); 5049 bSign = extractFloat128Sign( b ); 5050 zSign = aSign ^ bSign; 5051 if ( aExp == 0x7FFF ) { 5052 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 5053 if ( bExp == 0x7FFF ) { 5054 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 5055 goto invalid; 5056 } 5057 return packFloat128( zSign, 0x7FFF, 0, 0 ); 5058 } 5059 if ( bExp == 0x7FFF ) { 5060 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 5061 return packFloat128( zSign, 0, 0, 0 ); 5062 } 5063 if ( bExp == 0 ) { 5064 if ( ( bSig0 | bSig1 ) == 0 ) { 5065 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 5066 invalid: 5067 float_raise( float_flag_invalid ); 5068 z.low = float128_default_nan_low; 5069 z.high = float128_default_nan_high; 5070 return z; 5071 } 5072 float_raise( float_flag_divbyzero ); 5073 return packFloat128( zSign, 0x7FFF, 0, 0 ); 5074 } 5075 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 5076 } 5077 if ( aExp == 0 ) { 5078 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 5079 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5080 } 5081 zExp = aExp - bExp + 0x3FFD; 5082 shortShift128Left( 5083 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); 5084 shortShift128Left( 5085 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 5086 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { 5087 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 5088 ++zExp; 5089 } 5090 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5091 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 5092 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 5093 while ( (sbits64) rem0 < 0 ) { 5094 --zSig0; 5095 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 5096 } 5097 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 5098 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 5099 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 5100 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 5101 while ( (sbits64) rem1 < 0 ) { 5102 --zSig1; 5103 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 5104 } 5105 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5106 } 5107 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 5108 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 5109 5110 } 5111 5112 /* 5113 ------------------------------------------------------------------------------- 5114 Returns the remainder of the quadruple-precision floating-point value `a' 5115 with respect to the corresponding value `b'. The operation is performed 5116 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5117 ------------------------------------------------------------------------------- 5118 */ 5119 float128 float128_rem( float128 a, float128 b ) 5120 { 5121 flag aSign, bSign, zSign; 5122 int32 aExp, bExp, expDiff; 5123 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 5124 bits64 allZero, alternateASig0, alternateASig1, sigMean1; 5125 sbits64 sigMean0; 5126 float128 z; 5127 5128 aSig1 = extractFloat128Frac1( a ); 5129 aSig0 = extractFloat128Frac0( a ); 5130 aExp = extractFloat128Exp( a ); 5131 aSign = extractFloat128Sign( a ); 5132 bSig1 = extractFloat128Frac1( b ); 5133 bSig0 = extractFloat128Frac0( b ); 5134 bExp = extractFloat128Exp( b ); 5135 bSign = extractFloat128Sign( b ); 5136 if ( aExp == 0x7FFF ) { 5137 if ( ( aSig0 | aSig1 ) 5138 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 5139 return propagateFloat128NaN( a, b ); 5140 } 5141 goto invalid; 5142 } 5143 if ( bExp == 0x7FFF ) { 5144 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 5145 return a; 5146 } 5147 if ( bExp == 0 ) { 5148 if ( ( bSig0 | bSig1 ) == 0 ) { 5149 invalid: 5150 float_raise( float_flag_invalid ); 5151 z.low = float128_default_nan_low; 5152 z.high = float128_default_nan_high; 5153 return z; 5154 } 5155 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 5156 } 5157 if ( aExp == 0 ) { 5158 if ( ( aSig0 | aSig1 ) == 0 ) return a; 5159 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5160 } 5161 expDiff = aExp - bExp; 5162 if ( expDiff < -1 ) return a; 5163 shortShift128Left( 5164 aSig0 | LIT64( 0x0001000000000000 ), 5165 aSig1, 5166 15 - ( expDiff < 0 ), 5167 &aSig0, 5168 &aSig1 5169 ); 5170 shortShift128Left( 5171 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 5172 q = le128( bSig0, bSig1, aSig0, aSig1 ); 5173 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 5174 expDiff -= 64; 5175 while ( 0 < expDiff ) { 5176 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5177 q = ( 4 < q ) ? q - 4 : 0; 5178 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 5179 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 5180 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 5181 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 5182 expDiff -= 61; 5183 } 5184 if ( -64 < expDiff ) { 5185 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5186 q = ( 4 < q ) ? q - 4 : 0; 5187 q >>= - expDiff; 5188 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 5189 expDiff += 52; 5190 if ( expDiff < 0 ) { 5191 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 5192 } 5193 else { 5194 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 5195 } 5196 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 5197 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 5198 } 5199 else { 5200 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 5201 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 5202 } 5203 do { 5204 alternateASig0 = aSig0; 5205 alternateASig1 = aSig1; 5206 ++q; 5207 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 5208 } while ( 0 <= (sbits64) aSig0 ); 5209 add128( 5210 aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 ); 5211 if ( ( sigMean0 < 0 ) 5212 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 5213 aSig0 = alternateASig0; 5214 aSig1 = alternateASig1; 5215 } 5216 zSign = ( (sbits64) aSig0 < 0 ); 5217 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 5218 return 5219 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 ); 5220 5221 } 5222 5223 /* 5224 ------------------------------------------------------------------------------- 5225 Returns the square root of the quadruple-precision floating-point value `a'. 5226 The operation is performed according to the IEC/IEEE Standard for Binary 5227 Floating-Point Arithmetic. 5228 ------------------------------------------------------------------------------- 5229 */ 5230 float128 float128_sqrt( float128 a ) 5231 { 5232 flag aSign; 5233 int32 aExp, zExp; 5234 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 5235 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5236 float128 z; 5237 5238 aSig1 = extractFloat128Frac1( a ); 5239 aSig0 = extractFloat128Frac0( a ); 5240 aExp = extractFloat128Exp( a ); 5241 aSign = extractFloat128Sign( a ); 5242 if ( aExp == 0x7FFF ) { 5243 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a ); 5244 if ( ! aSign ) return a; 5245 goto invalid; 5246 } 5247 if ( aSign ) { 5248 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 5249 invalid: 5250 float_raise( float_flag_invalid ); 5251 z.low = float128_default_nan_low; 5252 z.high = float128_default_nan_high; 5253 return z; 5254 } 5255 if ( aExp == 0 ) { 5256 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 5257 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5258 } 5259 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 5260 aSig0 |= LIT64( 0x0001000000000000 ); 5261 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 5262 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 5263 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 5264 doubleZSig0 = zSig0<<1; 5265 mul64To128( zSig0, zSig0, &term0, &term1 ); 5266 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 5267 while ( (sbits64) rem0 < 0 ) { 5268 --zSig0; 5269 doubleZSig0 -= 2; 5270 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 5271 } 5272 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 5273 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 5274 if ( zSig1 == 0 ) zSig1 = 1; 5275 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 5276 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5277 mul64To128( zSig1, zSig1, &term2, &term3 ); 5278 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 5279 while ( (sbits64) rem1 < 0 ) { 5280 --zSig1; 5281 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 5282 term3 |= 1; 5283 term2 |= doubleZSig0; 5284 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 5285 } 5286 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5287 } 5288 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 5289 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 ); 5290 5291 } 5292 5293 /* 5294 ------------------------------------------------------------------------------- 5295 Returns 1 if the quadruple-precision floating-point value `a' is equal to 5296 the corresponding value `b', and 0 otherwise. The comparison is performed 5297 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5298 ------------------------------------------------------------------------------- 5299 */ 5300 flag float128_eq( float128 a, float128 b ) 5301 { 5302 5303 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5304 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5305 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5306 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5307 ) { 5308 if ( float128_is_signaling_nan( a ) 5309 || float128_is_signaling_nan( b ) ) { 5310 float_raise( float_flag_invalid ); 5311 } 5312 return 0; 5313 } 5314 return 5315 ( a.low == b.low ) 5316 && ( ( a.high == b.high ) 5317 || ( ( a.low == 0 ) 5318 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) 5319 ); 5320 5321 } 5322 5323 /* 5324 ------------------------------------------------------------------------------- 5325 Returns 1 if the quadruple-precision floating-point value `a' is less than 5326 or equal to the corresponding value `b', and 0 otherwise. The comparison 5327 is performed according to the IEC/IEEE Standard for Binary Floating-Point 5328 Arithmetic. 5329 ------------------------------------------------------------------------------- 5330 */ 5331 flag float128_le( float128 a, float128 b ) 5332 { 5333 flag aSign, bSign; 5334 5335 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5336 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5337 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5338 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5339 ) { 5340 float_raise( float_flag_invalid ); 5341 return 0; 5342 } 5343 aSign = extractFloat128Sign( a ); 5344 bSign = extractFloat128Sign( b ); 5345 if ( aSign != bSign ) { 5346 return 5347 aSign 5348 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5349 == 0 ); 5350 } 5351 return 5352 aSign ? le128( b.high, b.low, a.high, a.low ) 5353 : le128( a.high, a.low, b.high, b.low ); 5354 5355 } 5356 5357 /* 5358 ------------------------------------------------------------------------------- 5359 Returns 1 if the quadruple-precision floating-point value `a' is less than 5360 the corresponding value `b', and 0 otherwise. The comparison is performed 5361 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5362 ------------------------------------------------------------------------------- 5363 */ 5364 flag float128_lt( float128 a, float128 b ) 5365 { 5366 flag aSign, bSign; 5367 5368 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5369 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5370 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5371 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5372 ) { 5373 float_raise( float_flag_invalid ); 5374 return 0; 5375 } 5376 aSign = extractFloat128Sign( a ); 5377 bSign = extractFloat128Sign( b ); 5378 if ( aSign != bSign ) { 5379 return 5380 aSign 5381 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5382 != 0 ); 5383 } 5384 return 5385 aSign ? lt128( b.high, b.low, a.high, a.low ) 5386 : lt128( a.high, a.low, b.high, b.low ); 5387 5388 } 5389 5390 /* 5391 ------------------------------------------------------------------------------- 5392 Returns 1 if the quadruple-precision floating-point value `a' is equal to 5393 the corresponding value `b', and 0 otherwise. The invalid exception is 5394 raised if either operand is a NaN. Otherwise, the comparison is performed 5395 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5396 ------------------------------------------------------------------------------- 5397 */ 5398 flag float128_eq_signaling( float128 a, float128 b ) 5399 { 5400 5401 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5402 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5403 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5404 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5405 ) { 5406 float_raise( float_flag_invalid ); 5407 return 0; 5408 } 5409 return 5410 ( a.low == b.low ) 5411 && ( ( a.high == b.high ) 5412 || ( ( a.low == 0 ) 5413 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) 5414 ); 5415 5416 } 5417 5418 /* 5419 ------------------------------------------------------------------------------- 5420 Returns 1 if the quadruple-precision floating-point value `a' is less than 5421 or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5422 cause an exception. Otherwise, the comparison is performed according to the 5423 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5424 ------------------------------------------------------------------------------- 5425 */ 5426 flag float128_le_quiet( float128 a, float128 b ) 5427 { 5428 flag aSign, bSign; 5429 5430 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5431 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5432 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5433 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5434 ) { 5435 if ( float128_is_signaling_nan( a ) 5436 || float128_is_signaling_nan( b ) ) { 5437 float_raise( float_flag_invalid ); 5438 } 5439 return 0; 5440 } 5441 aSign = extractFloat128Sign( a ); 5442 bSign = extractFloat128Sign( b ); 5443 if ( aSign != bSign ) { 5444 return 5445 aSign 5446 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5447 == 0 ); 5448 } 5449 return 5450 aSign ? le128( b.high, b.low, a.high, a.low ) 5451 : le128( a.high, a.low, b.high, b.low ); 5452 5453 } 5454 5455 /* 5456 ------------------------------------------------------------------------------- 5457 Returns 1 if the quadruple-precision floating-point value `a' is less than 5458 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 5459 exception. Otherwise, the comparison is performed according to the IEC/IEEE 5460 Standard for Binary Floating-Point Arithmetic. 5461 ------------------------------------------------------------------------------- 5462 */ 5463 flag float128_lt_quiet( float128 a, float128 b ) 5464 { 5465 flag aSign, bSign; 5466 5467 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5468 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5469 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5470 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5471 ) { 5472 if ( float128_is_signaling_nan( a ) 5473 || float128_is_signaling_nan( b ) ) { 5474 float_raise( float_flag_invalid ); 5475 } 5476 return 0; 5477 } 5478 aSign = extractFloat128Sign( a ); 5479 bSign = extractFloat128Sign( b ); 5480 if ( aSign != bSign ) { 5481 return 5482 aSign 5483 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5484 != 0 ); 5485 } 5486 return 5487 aSign ? lt128( b.high, b.low, a.high, a.low ) 5488 : lt128( a.high, a.low, b.high, b.low ); 5489 5490 } 5491 5492 #endif 5493 5494 5495 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS) 5496 5497 /* 5498 * These two routines are not part of the original softfloat distribution. 5499 * 5500 * They are based on the corresponding conversions to integer but return 5501 * unsigned numbers instead since these functions are required by GCC. 5502 * 5503 * Added by Mark Brinicombe <mark@NetBSD.org> 27/09/97 5504 * 5505 * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15] 5506 */ 5507 5508 /* 5509 ------------------------------------------------------------------------------- 5510 Returns the result of converting the double-precision floating-point value 5511 `a' to the 32-bit unsigned integer format. The conversion is 5512 performed according to the IEC/IEEE Standard for Binary Floating-point 5513 Arithmetic, except that the conversion is always rounded toward zero. If 5514 `a' is a NaN, the largest positive integer is returned. If the conversion 5515 overflows, the largest integer positive is returned. 5516 ------------------------------------------------------------------------------- 5517 */ 5518 uint32 float64_to_uint32_round_to_zero( float64 a ) 5519 { 5520 flag aSign; 5521 int16 aExp, shiftCount; 5522 bits64 aSig, savedASig; 5523 uint32 z; 5524 5525 aSig = extractFloat64Frac( a ); 5526 aExp = extractFloat64Exp( a ); 5527 aSign = extractFloat64Sign( a ); 5528 5529 if (aSign) { 5530 float_raise( float_flag_invalid ); 5531 return(0); 5532 } 5533 5534 if ( 0x41E < aExp ) { 5535 float_raise( float_flag_invalid ); 5536 return 0xffffffff; 5537 } 5538 else if ( aExp < 0x3FF ) { 5539 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact; 5540 return 0; 5541 } 5542 aSig |= LIT64( 0x0010000000000000 ); 5543 shiftCount = 0x433 - aExp; 5544 savedASig = aSig; 5545 aSig >>= shiftCount; 5546 z = aSig; 5547 if ( ( aSig<<shiftCount ) != savedASig ) { 5548 float_exception_flags |= float_flag_inexact; 5549 } 5550 return z; 5551 5552 } 5553 5554 /* 5555 ------------------------------------------------------------------------------- 5556 Returns the result of converting the single-precision floating-point value 5557 `a' to the 32-bit unsigned integer format. The conversion is 5558 performed according to the IEC/IEEE Standard for Binary Floating-point 5559 Arithmetic, except that the conversion is always rounded toward zero. If 5560 `a' is a NaN, the largest positive integer is returned. If the conversion 5561 overflows, the largest positive integer is returned. 5562 ------------------------------------------------------------------------------- 5563 */ 5564 uint32 float32_to_uint32_round_to_zero( float32 a ) 5565 { 5566 flag aSign; 5567 int16 aExp, shiftCount; 5568 bits32 aSig; 5569 uint32 z; 5570 5571 aSig = extractFloat32Frac( a ); 5572 aExp = extractFloat32Exp( a ); 5573 aSign = extractFloat32Sign( a ); 5574 shiftCount = aExp - 0x9E; 5575 5576 if (aSign) { 5577 float_raise( float_flag_invalid ); 5578 return(0); 5579 } 5580 if ( 0 < shiftCount ) { 5581 float_raise( float_flag_invalid ); 5582 return 0xFFFFFFFF; 5583 } 5584 else if ( aExp <= 0x7E ) { 5585 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact; 5586 return 0; 5587 } 5588 aSig = ( aSig | 0x800000 )<<8; 5589 z = aSig>>( - shiftCount ); 5590 if ( aSig<<( shiftCount & 31 ) ) { 5591 float_exception_flags |= float_flag_inexact; 5592 } 5593 return z; 5594 5595 } 5596 5597 #endif 5598