1 /* $NetBSD: softfloat.c,v 1.8 2011/07/10 04:52:23 matt Exp $ */ 2 3 /* 4 * This version hacked for use with gcc -msoft-float by bjh21. 5 * (Mostly a case of #ifdefing out things GCC doesn't need or provides 6 * itself). 7 */ 8 9 /* 10 * Things you may want to define: 11 * 12 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with 13 * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them 14 * properly renamed. 15 */ 16 17 /* 18 =============================================================================== 19 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 #include <sys/cdefs.h> 48 __FBSDID("$FreeBSD$"); 49 50 #ifdef SOFTFLOAT_FOR_GCC 51 #include "softfloat-for-gcc.h" 52 #endif 53 54 #include "milieu.h" 55 #include "softfloat.h" 56 57 /* 58 * Conversions between floats as stored in memory and floats as 59 * SoftFloat uses them 60 */ 61 #ifndef FLOAT64_DEMANGLE 62 #define FLOAT64_DEMANGLE(a) (a) 63 #endif 64 #ifndef FLOAT64_MANGLE 65 #define FLOAT64_MANGLE(a) (a) 66 #endif 67 68 /* 69 ------------------------------------------------------------------------------- 70 Floating-point rounding mode, extended double-precision rounding precision, 71 and exception flags. 72 ------------------------------------------------------------------------------- 73 */ 74 int float_rounding_mode = float_round_nearest_even; 75 int float_exception_flags = 0; 76 #ifdef FLOATX80 77 int8 floatx80_rounding_precision = 80; 78 #endif 79 80 /* 81 ------------------------------------------------------------------------------- 82 Primitive arithmetic functions, including multi-word arithmetic, and 83 division and square root approximations. (Can be specialized to target if 84 desired.) 85 ------------------------------------------------------------------------------- 86 */ 87 #include "softfloat-macros" 88 89 /* 90 ------------------------------------------------------------------------------- 91 Functions and definitions to determine: (1) whether tininess for underflow 92 is detected before or after rounding by default, (2) what (if anything) 93 happens when exceptions are raised, (3) how signaling NaNs are distinguished 94 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 95 are propagated from function inputs to output. These details are target- 96 specific. 97 ------------------------------------------------------------------------------- 98 */ 99 #include "softfloat-specialize" 100 101 #if !defined(SOFTFLOAT_FOR_GCC) || defined(FLOATX80) || defined(FLOAT128) 102 /* 103 ------------------------------------------------------------------------------- 104 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 105 and 7, and returns the properly rounded 32-bit integer corresponding to the 106 input. If `zSign' is 1, the input is negated before being converted to an 107 integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 108 is simply rounded to an integer, with the inexact exception raised if the 109 input cannot be represented exactly as an integer. However, if the fixed- 110 point input is too large, the invalid exception is raised and the largest 111 positive or negative integer is returned. 112 ------------------------------------------------------------------------------- 113 */ 114 static int32 roundAndPackInt32( flag zSign, bits64 absZ ) 115 { 116 int8 roundingMode; 117 flag roundNearestEven; 118 int8 roundIncrement, roundBits; 119 int32 z; 120 121 roundingMode = float_rounding_mode; 122 roundNearestEven = ( roundingMode == float_round_nearest_even ); 123 roundIncrement = 0x40; 124 if ( ! roundNearestEven ) { 125 if ( roundingMode == float_round_to_zero ) { 126 roundIncrement = 0; 127 } 128 else { 129 roundIncrement = 0x7F; 130 if ( zSign ) { 131 if ( roundingMode == float_round_up ) roundIncrement = 0; 132 } 133 else { 134 if ( roundingMode == float_round_down ) roundIncrement = 0; 135 } 136 } 137 } 138 roundBits = absZ & 0x7F; 139 absZ = ( absZ + roundIncrement )>>7; 140 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 141 z = absZ; 142 if ( zSign ) z = - z; 143 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 144 float_raise( float_flag_invalid ); 145 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 146 } 147 if ( roundBits ) float_exception_flags |= float_flag_inexact; 148 return z; 149 150 } 151 152 /* 153 ------------------------------------------------------------------------------- 154 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 155 `absZ1', with binary point between bits 63 and 64 (between the input words), 156 and returns the properly rounded 64-bit integer corresponding to the input. 157 If `zSign' is 1, the input is negated before being converted to an integer. 158 Ordinarily, the fixed-point input is simply rounded to an integer, with 159 the inexact exception raised if the input cannot be represented exactly as 160 an integer. However, if the fixed-point input is too large, the invalid 161 exception is raised and the largest positive or negative integer is 162 returned. 163 ------------------------------------------------------------------------------- 164 */ 165 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 ) 166 { 167 int8 roundingMode; 168 flag roundNearestEven, increment; 169 int64 z; 170 171 roundingMode = float_rounding_mode; 172 roundNearestEven = ( roundingMode == float_round_nearest_even ); 173 increment = ( (sbits64) absZ1 < 0 ); 174 if ( ! roundNearestEven ) { 175 if ( roundingMode == float_round_to_zero ) { 176 increment = 0; 177 } 178 else { 179 if ( zSign ) { 180 increment = ( roundingMode == float_round_down ) && absZ1; 181 } 182 else { 183 increment = ( roundingMode == float_round_up ) && absZ1; 184 } 185 } 186 } 187 if ( increment ) { 188 ++absZ0; 189 if ( absZ0 == 0 ) goto overflow; 190 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 191 } 192 z = absZ0; 193 if ( zSign ) z = - z; 194 if ( z && ( ( z < 0 ) ^ zSign ) ) { 195 overflow: 196 float_raise( float_flag_invalid ); 197 return 198 zSign ? (sbits64) LIT64( 0x8000000000000000 ) 199 : LIT64( 0x7FFFFFFFFFFFFFFF ); 200 } 201 if ( absZ1 ) float_exception_flags |= float_flag_inexact; 202 return z; 203 204 } 205 #endif 206 207 /* 208 ------------------------------------------------------------------------------- 209 Returns the fraction bits of the single-precision floating-point value `a'. 210 ------------------------------------------------------------------------------- 211 */ 212 INLINE bits32 extractFloat32Frac( float32 a ) 213 { 214 215 return a & 0x007FFFFF; 216 217 } 218 219 /* 220 ------------------------------------------------------------------------------- 221 Returns the exponent bits of the single-precision floating-point value `a'. 222 ------------------------------------------------------------------------------- 223 */ 224 INLINE int16 extractFloat32Exp( float32 a ) 225 { 226 227 return ( a>>23 ) & 0xFF; 228 229 } 230 231 /* 232 ------------------------------------------------------------------------------- 233 Returns the sign bit of the single-precision floating-point value `a'. 234 ------------------------------------------------------------------------------- 235 */ 236 INLINE flag extractFloat32Sign( float32 a ) 237 { 238 239 return a>>31; 240 241 } 242 243 /* 244 ------------------------------------------------------------------------------- 245 Normalizes the subnormal single-precision floating-point value represented 246 by the denormalized significand `aSig'. The normalized exponent and 247 significand are stored at the locations pointed to by `zExpPtr' and 248 `zSigPtr', respectively. 249 ------------------------------------------------------------------------------- 250 */ 251 static void 252 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr ) 253 { 254 int8 shiftCount; 255 256 shiftCount = countLeadingZeros32( aSig ) - 8; 257 *zSigPtr = aSig<<shiftCount; 258 *zExpPtr = 1 - shiftCount; 259 260 } 261 262 /* 263 ------------------------------------------------------------------------------- 264 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 265 single-precision floating-point value, returning the result. After being 266 shifted into the proper positions, the three fields are simply added 267 together to form the result. This means that any integer portion of `zSig' 268 will be added into the exponent. Since a properly normalized significand 269 will have an integer portion equal to 1, the `zExp' input should be 1 less 270 than the desired result exponent whenever `zSig' is a complete, normalized 271 significand. 272 ------------------------------------------------------------------------------- 273 */ 274 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig ) 275 { 276 277 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig; 278 279 } 280 281 /* 282 ------------------------------------------------------------------------------- 283 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 284 and significand `zSig', and returns the proper single-precision floating- 285 point value corresponding to the abstract input. Ordinarily, the abstract 286 value is simply rounded and packed into the single-precision format, with 287 the inexact exception raised if the abstract input cannot be represented 288 exactly. However, if the abstract value is too large, the overflow and 289 inexact exceptions are raised and an infinity or maximal finite value is 290 returned. If the abstract value is too small, the input value is rounded to 291 a subnormal number, and the underflow and inexact exceptions are raised if 292 the abstract input cannot be represented exactly as a subnormal single- 293 precision floating-point number. 294 The input significand `zSig' has its binary point between bits 30 295 and 29, which is 7 bits to the left of the usual location. This shifted 296 significand must be normalized or smaller. If `zSig' is not normalized, 297 `zExp' must be 0; in that case, the result returned is a subnormal number, 298 and it must not require rounding. In the usual case that `zSig' is 299 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 300 The handling of underflow and overflow follows the IEC/IEEE Standard for 301 Binary Floating-Point Arithmetic. 302 ------------------------------------------------------------------------------- 303 */ 304 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) 305 { 306 int8 roundingMode; 307 flag roundNearestEven; 308 int8 roundIncrement, roundBits; 309 flag isTiny; 310 311 roundingMode = float_rounding_mode; 312 roundNearestEven = ( roundingMode == float_round_nearest_even ); 313 roundIncrement = 0x40; 314 if ( ! roundNearestEven ) { 315 if ( roundingMode == float_round_to_zero ) { 316 roundIncrement = 0; 317 } 318 else { 319 roundIncrement = 0x7F; 320 if ( zSign ) { 321 if ( roundingMode == float_round_up ) roundIncrement = 0; 322 } 323 else { 324 if ( roundingMode == float_round_down ) roundIncrement = 0; 325 } 326 } 327 } 328 roundBits = zSig & 0x7F; 329 if ( 0xFD <= (bits16) zExp ) { 330 if ( ( 0xFD < zExp ) 331 || ( ( zExp == 0xFD ) 332 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) ) 333 ) { 334 float_raise( float_flag_overflow | float_flag_inexact ); 335 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 ); 336 } 337 if ( zExp < 0 ) { 338 isTiny = 339 ( float_detect_tininess == float_tininess_before_rounding ) 340 || ( zExp < -1 ) 341 || ( zSig + roundIncrement < 0x80000000 ); 342 shift32RightJamming( zSig, - zExp, &zSig ); 343 zExp = 0; 344 roundBits = zSig & 0x7F; 345 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 346 } 347 } 348 if ( roundBits ) float_exception_flags |= float_flag_inexact; 349 zSig = ( zSig + roundIncrement )>>7; 350 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 351 if ( zSig == 0 ) zExp = 0; 352 return packFloat32( zSign, zExp, zSig ); 353 354 } 355 356 /* 357 ------------------------------------------------------------------------------- 358 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 359 and significand `zSig', and returns the proper single-precision floating- 360 point value corresponding to the abstract input. This routine is just like 361 `roundAndPackFloat32' except that `zSig' does not have to be normalized. 362 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 363 floating-point exponent. 364 ------------------------------------------------------------------------------- 365 */ 366 static float32 367 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) 368 { 369 int8 shiftCount; 370 371 shiftCount = countLeadingZeros32( zSig ) - 1; 372 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount ); 373 374 } 375 376 /* 377 ------------------------------------------------------------------------------- 378 Returns the fraction bits of the double-precision floating-point value `a'. 379 ------------------------------------------------------------------------------- 380 */ 381 INLINE bits64 extractFloat64Frac( float64 a ) 382 { 383 384 return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF ); 385 386 } 387 388 /* 389 ------------------------------------------------------------------------------- 390 Returns the exponent bits of the double-precision floating-point value `a'. 391 ------------------------------------------------------------------------------- 392 */ 393 INLINE int16 extractFloat64Exp( float64 a ) 394 { 395 396 return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF; 397 398 } 399 400 /* 401 ------------------------------------------------------------------------------- 402 Returns the sign bit of the double-precision floating-point value `a'. 403 ------------------------------------------------------------------------------- 404 */ 405 INLINE flag extractFloat64Sign( float64 a ) 406 { 407 408 return FLOAT64_DEMANGLE(a)>>63; 409 410 } 411 412 /* 413 ------------------------------------------------------------------------------- 414 Normalizes the subnormal double-precision floating-point value represented 415 by the denormalized significand `aSig'. The normalized exponent and 416 significand are stored at the locations pointed to by `zExpPtr' and 417 `zSigPtr', respectively. 418 ------------------------------------------------------------------------------- 419 */ 420 static void 421 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr ) 422 { 423 int8 shiftCount; 424 425 shiftCount = countLeadingZeros64( aSig ) - 11; 426 *zSigPtr = aSig<<shiftCount; 427 *zExpPtr = 1 - shiftCount; 428 429 } 430 431 /* 432 ------------------------------------------------------------------------------- 433 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 434 double-precision floating-point value, returning the result. After being 435 shifted into the proper positions, the three fields are simply added 436 together to form the result. This means that any integer portion of `zSig' 437 will be added into the exponent. Since a properly normalized significand 438 will have an integer portion equal to 1, the `zExp' input should be 1 less 439 than the desired result exponent whenever `zSig' is a complete, normalized 440 significand. 441 ------------------------------------------------------------------------------- 442 */ 443 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig ) 444 { 445 446 return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) + 447 ( ( (bits64) zExp )<<52 ) + zSig ); 448 449 } 450 451 /* 452 ------------------------------------------------------------------------------- 453 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 454 and significand `zSig', and returns the proper double-precision floating- 455 point value corresponding to the abstract input. Ordinarily, the abstract 456 value is simply rounded and packed into the double-precision format, with 457 the inexact exception raised if the abstract input cannot be represented 458 exactly. However, if the abstract value is too large, the overflow and 459 inexact exceptions are raised and an infinity or maximal finite value is 460 returned. If the abstract value is too small, the input value is rounded to 461 a subnormal number, and the underflow and inexact exceptions are raised if 462 the abstract input cannot be represented exactly as a subnormal double- 463 precision floating-point number. 464 The input significand `zSig' has its binary point between bits 62 465 and 61, which is 10 bits to the left of the usual location. This shifted 466 significand must be normalized or smaller. If `zSig' is not normalized, 467 `zExp' must be 0; in that case, the result returned is a subnormal number, 468 and it must not require rounding. In the usual case that `zSig' is 469 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 470 The handling of underflow and overflow follows the IEC/IEEE Standard for 471 Binary Floating-Point Arithmetic. 472 ------------------------------------------------------------------------------- 473 */ 474 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig ) 475 { 476 int8 roundingMode; 477 flag roundNearestEven; 478 int16 roundIncrement, roundBits; 479 flag isTiny; 480 481 roundingMode = float_rounding_mode; 482 roundNearestEven = ( roundingMode == float_round_nearest_even ); 483 roundIncrement = 0x200; 484 if ( ! roundNearestEven ) { 485 if ( roundingMode == float_round_to_zero ) { 486 roundIncrement = 0; 487 } 488 else { 489 roundIncrement = 0x3FF; 490 if ( zSign ) { 491 if ( roundingMode == float_round_up ) roundIncrement = 0; 492 } 493 else { 494 if ( roundingMode == float_round_down ) roundIncrement = 0; 495 } 496 } 497 } 498 roundBits = zSig & 0x3FF; 499 if ( 0x7FD <= (bits16) zExp ) { 500 if ( ( 0x7FD < zExp ) 501 || ( ( zExp == 0x7FD ) 502 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) ) 503 ) { 504 float_raise( float_flag_overflow | float_flag_inexact ); 505 return FLOAT64_MANGLE( 506 FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) - 507 ( roundIncrement == 0 )); 508 } 509 if ( zExp < 0 ) { 510 isTiny = 511 ( float_detect_tininess == float_tininess_before_rounding ) 512 || ( zExp < -1 ) 513 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 514 shift64RightJamming( zSig, - zExp, &zSig ); 515 zExp = 0; 516 roundBits = zSig & 0x3FF; 517 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 518 } 519 } 520 if ( roundBits ) float_exception_flags |= float_flag_inexact; 521 zSig = ( zSig + roundIncrement )>>10; 522 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 523 if ( zSig == 0 ) zExp = 0; 524 return packFloat64( zSign, zExp, zSig ); 525 526 } 527 528 /* 529 ------------------------------------------------------------------------------- 530 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 531 and significand `zSig', and returns the proper double-precision floating- 532 point value corresponding to the abstract input. This routine is just like 533 `roundAndPackFloat64' except that `zSig' does not have to be normalized. 534 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 535 floating-point exponent. 536 ------------------------------------------------------------------------------- 537 */ 538 static float64 539 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig ) 540 { 541 int8 shiftCount; 542 543 shiftCount = countLeadingZeros64( zSig ) - 1; 544 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount ); 545 546 } 547 548 #ifdef FLOATX80 549 550 /* 551 ------------------------------------------------------------------------------- 552 Returns the fraction bits of the extended double-precision floating-point 553 value `a'. 554 ------------------------------------------------------------------------------- 555 */ 556 INLINE bits64 extractFloatx80Frac( floatx80 a ) 557 { 558 559 return a.low; 560 561 } 562 563 /* 564 ------------------------------------------------------------------------------- 565 Returns the exponent bits of the extended double-precision floating-point 566 value `a'. 567 ------------------------------------------------------------------------------- 568 */ 569 INLINE int32 extractFloatx80Exp( floatx80 a ) 570 { 571 572 return a.high & 0x7FFF; 573 574 } 575 576 /* 577 ------------------------------------------------------------------------------- 578 Returns the sign bit of the extended double-precision floating-point value 579 `a'. 580 ------------------------------------------------------------------------------- 581 */ 582 INLINE flag extractFloatx80Sign( floatx80 a ) 583 { 584 585 return a.high>>15; 586 587 } 588 589 /* 590 ------------------------------------------------------------------------------- 591 Normalizes the subnormal extended double-precision floating-point value 592 represented by the denormalized significand `aSig'. The normalized exponent 593 and significand are stored at the locations pointed to by `zExpPtr' and 594 `zSigPtr', respectively. 595 ------------------------------------------------------------------------------- 596 */ 597 static void 598 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr ) 599 { 600 int8 shiftCount; 601 602 shiftCount = countLeadingZeros64( aSig ); 603 *zSigPtr = aSig<<shiftCount; 604 *zExpPtr = 1 - shiftCount; 605 606 } 607 608 /* 609 ------------------------------------------------------------------------------- 610 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an 611 extended double-precision floating-point value, returning the result. 612 ------------------------------------------------------------------------------- 613 */ 614 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig ) 615 { 616 floatx80 z; 617 618 z.low = zSig; 619 z.high = ( ( (bits16) zSign )<<15 ) + zExp; 620 return z; 621 622 } 623 624 /* 625 ------------------------------------------------------------------------------- 626 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 627 and extended significand formed by the concatenation of `zSig0' and `zSig1', 628 and returns the proper extended double-precision floating-point value 629 corresponding to the abstract input. Ordinarily, the abstract value is 630 rounded and packed into the extended double-precision format, with the 631 inexact exception raised if the abstract input cannot be represented 632 exactly. However, if the abstract value is too large, the overflow and 633 inexact exceptions are raised and an infinity or maximal finite value is 634 returned. If the abstract value is too small, the input value is rounded to 635 a subnormal number, and the underflow and inexact exceptions are raised if 636 the abstract input cannot be represented exactly as a subnormal extended 637 double-precision floating-point number. 638 If `roundingPrecision' is 32 or 64, the result is rounded to the same 639 number of bits as single or double precision, respectively. Otherwise, the 640 result is rounded to the full precision of the extended double-precision 641 format. 642 The input significand must be normalized or smaller. If the input 643 significand is not normalized, `zExp' must be 0; in that case, the result 644 returned is a subnormal number, and it must not require rounding. The 645 handling of underflow and overflow follows the IEC/IEEE Standard for Binary 646 Floating-Point Arithmetic. 647 ------------------------------------------------------------------------------- 648 */ 649 static floatx80 650 roundAndPackFloatx80( 651 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 652 ) 653 { 654 int8 roundingMode; 655 flag roundNearestEven, increment, isTiny; 656 int64 roundIncrement, roundMask, roundBits; 657 658 roundingMode = float_rounding_mode; 659 roundNearestEven = ( roundingMode == float_round_nearest_even ); 660 if ( roundingPrecision == 80 ) goto precision80; 661 if ( roundingPrecision == 64 ) { 662 roundIncrement = LIT64( 0x0000000000000400 ); 663 roundMask = LIT64( 0x00000000000007FF ); 664 } 665 else if ( roundingPrecision == 32 ) { 666 roundIncrement = LIT64( 0x0000008000000000 ); 667 roundMask = LIT64( 0x000000FFFFFFFFFF ); 668 } 669 else { 670 goto precision80; 671 } 672 zSig0 |= ( zSig1 != 0 ); 673 if ( ! roundNearestEven ) { 674 if ( roundingMode == float_round_to_zero ) { 675 roundIncrement = 0; 676 } 677 else { 678 roundIncrement = roundMask; 679 if ( zSign ) { 680 if ( roundingMode == float_round_up ) roundIncrement = 0; 681 } 682 else { 683 if ( roundingMode == float_round_down ) roundIncrement = 0; 684 } 685 } 686 } 687 roundBits = zSig0 & roundMask; 688 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 689 if ( ( 0x7FFE < zExp ) 690 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 691 ) { 692 goto overflow; 693 } 694 if ( zExp <= 0 ) { 695 isTiny = 696 ( float_detect_tininess == float_tininess_before_rounding ) 697 || ( zExp < 0 ) 698 || ( zSig0 <= zSig0 + roundIncrement ); 699 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 700 zExp = 0; 701 roundBits = zSig0 & roundMask; 702 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 703 if ( roundBits ) float_exception_flags |= float_flag_inexact; 704 zSig0 += roundIncrement; 705 if ( (sbits64) zSig0 < 0 ) zExp = 1; 706 roundIncrement = roundMask + 1; 707 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 708 roundMask |= roundIncrement; 709 } 710 zSig0 &= ~ roundMask; 711 return packFloatx80( zSign, zExp, zSig0 ); 712 } 713 } 714 if ( roundBits ) float_exception_flags |= float_flag_inexact; 715 zSig0 += roundIncrement; 716 if ( zSig0 < roundIncrement ) { 717 ++zExp; 718 zSig0 = LIT64( 0x8000000000000000 ); 719 } 720 roundIncrement = roundMask + 1; 721 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 722 roundMask |= roundIncrement; 723 } 724 zSig0 &= ~ roundMask; 725 if ( zSig0 == 0 ) zExp = 0; 726 return packFloatx80( zSign, zExp, zSig0 ); 727 precision80: 728 increment = ( (sbits64) zSig1 < 0 ); 729 if ( ! roundNearestEven ) { 730 if ( roundingMode == float_round_to_zero ) { 731 increment = 0; 732 } 733 else { 734 if ( zSign ) { 735 increment = ( roundingMode == float_round_down ) && zSig1; 736 } 737 else { 738 increment = ( roundingMode == float_round_up ) && zSig1; 739 } 740 } 741 } 742 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 743 if ( ( 0x7FFE < zExp ) 744 || ( ( zExp == 0x7FFE ) 745 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 746 && increment 747 ) 748 ) { 749 roundMask = 0; 750 overflow: 751 float_raise( float_flag_overflow | float_flag_inexact ); 752 if ( ( roundingMode == float_round_to_zero ) 753 || ( zSign && ( roundingMode == float_round_up ) ) 754 || ( ! zSign && ( roundingMode == float_round_down ) ) 755 ) { 756 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 757 } 758 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 759 } 760 if ( zExp <= 0 ) { 761 isTiny = 762 ( float_detect_tininess == float_tininess_before_rounding ) 763 || ( zExp < 0 ) 764 || ! increment 765 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 766 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 767 zExp = 0; 768 if ( isTiny && zSig1 ) float_raise( float_flag_underflow ); 769 if ( zSig1 ) float_exception_flags |= float_flag_inexact; 770 if ( roundNearestEven ) { 771 increment = ( (sbits64) zSig1 < 0 ); 772 } 773 else { 774 if ( zSign ) { 775 increment = ( roundingMode == float_round_down ) && zSig1; 776 } 777 else { 778 increment = ( roundingMode == float_round_up ) && zSig1; 779 } 780 } 781 if ( increment ) { 782 ++zSig0; 783 zSig0 &= 784 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 785 if ( (sbits64) zSig0 < 0 ) zExp = 1; 786 } 787 return packFloatx80( zSign, zExp, zSig0 ); 788 } 789 } 790 if ( zSig1 ) float_exception_flags |= float_flag_inexact; 791 if ( increment ) { 792 ++zSig0; 793 if ( zSig0 == 0 ) { 794 ++zExp; 795 zSig0 = LIT64( 0x8000000000000000 ); 796 } 797 else { 798 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 799 } 800 } 801 else { 802 if ( zSig0 == 0 ) zExp = 0; 803 } 804 return packFloatx80( zSign, zExp, zSig0 ); 805 806 } 807 808 /* 809 ------------------------------------------------------------------------------- 810 Takes an abstract floating-point value having sign `zSign', exponent 811 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 812 and returns the proper extended double-precision floating-point value 813 corresponding to the abstract input. This routine is just like 814 `roundAndPackFloatx80' except that the input significand does not have to be 815 normalized. 816 ------------------------------------------------------------------------------- 817 */ 818 static floatx80 819 normalizeRoundAndPackFloatx80( 820 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 821 ) 822 { 823 int8 shiftCount; 824 825 if ( zSig0 == 0 ) { 826 zSig0 = zSig1; 827 zSig1 = 0; 828 zExp -= 64; 829 } 830 shiftCount = countLeadingZeros64( zSig0 ); 831 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 832 zExp -= shiftCount; 833 return 834 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 ); 835 836 } 837 838 #endif 839 840 #ifdef FLOAT128 841 842 /* 843 ------------------------------------------------------------------------------- 844 Returns the least-significant 64 fraction bits of the quadruple-precision 845 floating-point value `a'. 846 ------------------------------------------------------------------------------- 847 */ 848 INLINE bits64 extractFloat128Frac1( float128 a ) 849 { 850 851 return a.low; 852 853 } 854 855 /* 856 ------------------------------------------------------------------------------- 857 Returns the most-significant 48 fraction bits of the quadruple-precision 858 floating-point value `a'. 859 ------------------------------------------------------------------------------- 860 */ 861 INLINE bits64 extractFloat128Frac0( float128 a ) 862 { 863 864 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 865 866 } 867 868 /* 869 ------------------------------------------------------------------------------- 870 Returns the exponent bits of the quadruple-precision floating-point value 871 `a'. 872 ------------------------------------------------------------------------------- 873 */ 874 INLINE int32 extractFloat128Exp( float128 a ) 875 { 876 877 return ( a.high>>48 ) & 0x7FFF; 878 879 } 880 881 /* 882 ------------------------------------------------------------------------------- 883 Returns the sign bit of the quadruple-precision floating-point value `a'. 884 ------------------------------------------------------------------------------- 885 */ 886 INLINE flag extractFloat128Sign( float128 a ) 887 { 888 889 return a.high>>63; 890 891 } 892 893 /* 894 ------------------------------------------------------------------------------- 895 Normalizes the subnormal quadruple-precision floating-point value 896 represented by the denormalized significand formed by the concatenation of 897 `aSig0' and `aSig1'. The normalized exponent is stored at the location 898 pointed to by `zExpPtr'. The most significant 49 bits of the normalized 899 significand are stored at the location pointed to by `zSig0Ptr', and the 900 least significant 64 bits of the normalized significand are stored at the 901 location pointed to by `zSig1Ptr'. 902 ------------------------------------------------------------------------------- 903 */ 904 static void 905 normalizeFloat128Subnormal( 906 bits64 aSig0, 907 bits64 aSig1, 908 int32 *zExpPtr, 909 bits64 *zSig0Ptr, 910 bits64 *zSig1Ptr 911 ) 912 { 913 int8 shiftCount; 914 915 if ( aSig0 == 0 ) { 916 shiftCount = countLeadingZeros64( aSig1 ) - 15; 917 if ( shiftCount < 0 ) { 918 *zSig0Ptr = aSig1>>( - shiftCount ); 919 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 920 } 921 else { 922 *zSig0Ptr = aSig1<<shiftCount; 923 *zSig1Ptr = 0; 924 } 925 *zExpPtr = - shiftCount - 63; 926 } 927 else { 928 shiftCount = countLeadingZeros64( aSig0 ) - 15; 929 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 930 *zExpPtr = 1 - shiftCount; 931 } 932 933 } 934 935 /* 936 ------------------------------------------------------------------------------- 937 Packs the sign `zSign', the exponent `zExp', and the significand formed 938 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 939 floating-point value, returning the result. After being shifted into the 940 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 941 added together to form the most significant 32 bits of the result. This 942 means that any integer portion of `zSig0' will be added into the exponent. 943 Since a properly normalized significand will have an integer portion equal 944 to 1, the `zExp' input should be 1 less than the desired result exponent 945 whenever `zSig0' and `zSig1' concatenated form a complete, normalized 946 significand. 947 ------------------------------------------------------------------------------- 948 */ 949 INLINE float128 950 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 ) 951 { 952 float128 z; 953 954 z.low = zSig1; 955 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0; 956 return z; 957 958 } 959 960 /* 961 ------------------------------------------------------------------------------- 962 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 963 and extended significand formed by the concatenation of `zSig0', `zSig1', 964 and `zSig2', and returns the proper quadruple-precision floating-point value 965 corresponding to the abstract input. Ordinarily, the abstract value is 966 simply rounded and packed into the quadruple-precision format, with the 967 inexact exception raised if the abstract input cannot be represented 968 exactly. However, if the abstract value is too large, the overflow and 969 inexact exceptions are raised and an infinity or maximal finite value is 970 returned. If the abstract value is too small, the input value is rounded to 971 a subnormal number, and the underflow and inexact exceptions are raised if 972 the abstract input cannot be represented exactly as a subnormal quadruple- 973 precision floating-point number. 974 The input significand must be normalized or smaller. If the input 975 significand is not normalized, `zExp' must be 0; in that case, the result 976 returned is a subnormal number, and it must not require rounding. In the 977 usual case that the input significand is normalized, `zExp' must be 1 less 978 than the ``true'' floating-point exponent. The handling of underflow and 979 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 980 ------------------------------------------------------------------------------- 981 */ 982 static float128 983 roundAndPackFloat128( 984 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 ) 985 { 986 int8 roundingMode; 987 flag roundNearestEven, increment, isTiny; 988 989 roundingMode = float_rounding_mode; 990 roundNearestEven = ( roundingMode == float_round_nearest_even ); 991 increment = ( (sbits64) zSig2 < 0 ); 992 if ( ! roundNearestEven ) { 993 if ( roundingMode == float_round_to_zero ) { 994 increment = 0; 995 } 996 else { 997 if ( zSign ) { 998 increment = ( roundingMode == float_round_down ) && zSig2; 999 } 1000 else { 1001 increment = ( roundingMode == float_round_up ) && zSig2; 1002 } 1003 } 1004 } 1005 if ( 0x7FFD <= (bits32) zExp ) { 1006 if ( ( 0x7FFD < zExp ) 1007 || ( ( zExp == 0x7FFD ) 1008 && eq128( 1009 LIT64( 0x0001FFFFFFFFFFFF ), 1010 LIT64( 0xFFFFFFFFFFFFFFFF ), 1011 zSig0, 1012 zSig1 1013 ) 1014 && increment 1015 ) 1016 ) { 1017 float_raise( float_flag_overflow | float_flag_inexact ); 1018 if ( ( roundingMode == float_round_to_zero ) 1019 || ( zSign && ( roundingMode == float_round_up ) ) 1020 || ( ! zSign && ( roundingMode == float_round_down ) ) 1021 ) { 1022 return 1023 packFloat128( 1024 zSign, 1025 0x7FFE, 1026 LIT64( 0x0000FFFFFFFFFFFF ), 1027 LIT64( 0xFFFFFFFFFFFFFFFF ) 1028 ); 1029 } 1030 return packFloat128( zSign, 0x7FFF, 0, 0 ); 1031 } 1032 if ( zExp < 0 ) { 1033 isTiny = 1034 ( float_detect_tininess == float_tininess_before_rounding ) 1035 || ( zExp < -1 ) 1036 || ! increment 1037 || lt128( 1038 zSig0, 1039 zSig1, 1040 LIT64( 0x0001FFFFFFFFFFFF ), 1041 LIT64( 0xFFFFFFFFFFFFFFFF ) 1042 ); 1043 shift128ExtraRightJamming( 1044 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 1045 zExp = 0; 1046 if ( isTiny && zSig2 ) float_raise( float_flag_underflow ); 1047 if ( roundNearestEven ) { 1048 increment = ( (sbits64) zSig2 < 0 ); 1049 } 1050 else { 1051 if ( zSign ) { 1052 increment = ( roundingMode == float_round_down ) && zSig2; 1053 } 1054 else { 1055 increment = ( roundingMode == float_round_up ) && zSig2; 1056 } 1057 } 1058 } 1059 } 1060 if ( zSig2 ) float_exception_flags |= float_flag_inexact; 1061 if ( increment ) { 1062 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 1063 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 1064 } 1065 else { 1066 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 1067 } 1068 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1069 1070 } 1071 1072 /* 1073 ------------------------------------------------------------------------------- 1074 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1075 and significand formed by the concatenation of `zSig0' and `zSig1', and 1076 returns the proper quadruple-precision floating-point value corresponding 1077 to the abstract input. This routine is just like `roundAndPackFloat128' 1078 except that the input significand has fewer bits and does not have to be 1079 normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 1080 point exponent. 1081 ------------------------------------------------------------------------------- 1082 */ 1083 static float128 1084 normalizeRoundAndPackFloat128( 1085 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 ) 1086 { 1087 int8 shiftCount; 1088 bits64 zSig2; 1089 1090 if ( zSig0 == 0 ) { 1091 zSig0 = zSig1; 1092 zSig1 = 0; 1093 zExp -= 64; 1094 } 1095 shiftCount = countLeadingZeros64( zSig0 ) - 15; 1096 if ( 0 <= shiftCount ) { 1097 zSig2 = 0; 1098 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1099 } 1100 else { 1101 shift128ExtraRightJamming( 1102 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 1103 } 1104 zExp -= shiftCount; 1105 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 1106 1107 } 1108 1109 #endif 1110 1111 /* 1112 ------------------------------------------------------------------------------- 1113 Returns the result of converting the 32-bit two's complement integer `a' 1114 to the single-precision floating-point format. The conversion is performed 1115 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1116 ------------------------------------------------------------------------------- 1117 */ 1118 float32 int32_to_float32( int32 a ) 1119 { 1120 flag zSign; 1121 1122 if ( a == 0 ) return 0; 1123 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 ); 1124 zSign = ( a < 0 ); 1125 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a ); 1126 1127 } 1128 1129 #ifndef SOFTFLOAT_FOR_GCC /* __floatunsisf is in libgcc */ 1130 float32 uint32_to_float32( uint32 a ) 1131 { 1132 if ( a == 0 ) return 0; 1133 if ( a & (bits32) 0x80000000 ) 1134 return normalizeRoundAndPackFloat32( 0, 0x9D, a >> 1 ); 1135 return normalizeRoundAndPackFloat32( 0, 0x9C, a ); 1136 } 1137 #endif 1138 1139 1140 /* 1141 ------------------------------------------------------------------------------- 1142 Returns the result of converting the 32-bit two's complement integer `a' 1143 to the double-precision floating-point format. The conversion is performed 1144 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1145 ------------------------------------------------------------------------------- 1146 */ 1147 float64 int32_to_float64( int32 a ) 1148 { 1149 flag zSign; 1150 uint32 absA; 1151 int8 shiftCount; 1152 bits64 zSig; 1153 1154 if ( a == 0 ) return 0; 1155 zSign = ( a < 0 ); 1156 absA = zSign ? - a : a; 1157 shiftCount = countLeadingZeros32( absA ) + 21; 1158 zSig = absA; 1159 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount ); 1160 1161 } 1162 1163 #ifndef SOFTFLOAT_FOR_GCC /* __floatunsidf is in libgcc */ 1164 float64 uint32_to_float64( uint32 a ) 1165 { 1166 int8 shiftCount; 1167 bits64 zSig = a; 1168 1169 if ( a == 0 ) return 0; 1170 shiftCount = countLeadingZeros32( a ) + 21; 1171 return packFloat64( 0, 0x432 - shiftCount, zSig<<shiftCount ); 1172 1173 } 1174 #endif 1175 1176 #ifdef FLOATX80 1177 1178 /* 1179 ------------------------------------------------------------------------------- 1180 Returns the result of converting the 32-bit two's complement integer `a' 1181 to the extended double-precision floating-point format. The conversion 1182 is performed according to the IEC/IEEE Standard for Binary Floating-Point 1183 Arithmetic. 1184 ------------------------------------------------------------------------------- 1185 */ 1186 floatx80 int32_to_floatx80( int32 a ) 1187 { 1188 flag zSign; 1189 uint32 absA; 1190 int8 shiftCount; 1191 bits64 zSig; 1192 1193 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1194 zSign = ( a < 0 ); 1195 absA = zSign ? - a : a; 1196 shiftCount = countLeadingZeros32( absA ) + 32; 1197 zSig = absA; 1198 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 1199 1200 } 1201 1202 floatx80 uint32_to_floatx80( uint32 a ) 1203 { 1204 int8 shiftCount; 1205 bits64 zSig = a; 1206 1207 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1208 shiftCount = countLeadingZeros32( a ) + 32; 1209 return packFloatx80( 0, 0x403E - shiftCount, zSig<<shiftCount ); 1210 1211 } 1212 1213 #endif 1214 1215 #ifdef FLOAT128 1216 1217 /* 1218 ------------------------------------------------------------------------------- 1219 Returns the result of converting the 32-bit two's complement integer `a' to 1220 the quadruple-precision floating-point format. The conversion is performed 1221 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1222 ------------------------------------------------------------------------------- 1223 */ 1224 float128 int32_to_float128( int32 a ) 1225 { 1226 flag zSign; 1227 uint32 absA; 1228 int8 shiftCount; 1229 bits64 zSig0; 1230 1231 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1232 zSign = ( a < 0 ); 1233 absA = zSign ? - a : a; 1234 shiftCount = countLeadingZeros32( absA ) + 17; 1235 zSig0 = absA; 1236 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 1237 1238 } 1239 1240 float128 uint32_to_float128( uint32 a ) 1241 { 1242 int8 shiftCount; 1243 bits64 zSig0 = a; 1244 1245 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1246 shiftCount = countLeadingZeros32( a ) + 17; 1247 return packFloat128( 0, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 1248 1249 } 1250 1251 #endif 1252 1253 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */ 1254 /* 1255 ------------------------------------------------------------------------------- 1256 Returns the result of converting the 64-bit two's complement integer `a' 1257 to the single-precision floating-point format. The conversion is performed 1258 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1259 ------------------------------------------------------------------------------- 1260 */ 1261 float32 int64_to_float32( int64 a ) 1262 { 1263 flag zSign; 1264 uint64 absA; 1265 int8 shiftCount; 1266 1267 if ( a == 0 ) return 0; 1268 zSign = ( a < 0 ); 1269 absA = zSign ? - a : a; 1270 shiftCount = countLeadingZeros64( absA ) - 40; 1271 if ( 0 <= shiftCount ) { 1272 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount ); 1273 } 1274 else { 1275 shiftCount += 7; 1276 if ( shiftCount < 0 ) { 1277 shift64RightJamming( absA, - shiftCount, &absA ); 1278 } 1279 else { 1280 absA <<= shiftCount; 1281 } 1282 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA ); 1283 } 1284 1285 } 1286 1287 /* 1288 ------------------------------------------------------------------------------- 1289 Returns the result of converting the 64-bit two's complement integer `a' 1290 to the double-precision floating-point format. The conversion is performed 1291 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1292 ------------------------------------------------------------------------------- 1293 */ 1294 float64 int64_to_float64( int64 a ) 1295 { 1296 flag zSign; 1297 1298 if ( a == 0 ) return 0; 1299 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) { 1300 return packFloat64( 1, 0x43E, 0 ); 1301 } 1302 zSign = ( a < 0 ); 1303 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a ); 1304 1305 } 1306 1307 #ifdef FLOATX80 1308 1309 /* 1310 ------------------------------------------------------------------------------- 1311 Returns the result of converting the 64-bit two's complement integer `a' 1312 to the extended double-precision floating-point format. The conversion 1313 is performed according to the IEC/IEEE Standard for Binary Floating-Point 1314 Arithmetic. 1315 ------------------------------------------------------------------------------- 1316 */ 1317 floatx80 int64_to_floatx80( int64 a ) 1318 { 1319 flag zSign; 1320 uint64 absA; 1321 int8 shiftCount; 1322 1323 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1324 zSign = ( a < 0 ); 1325 absA = zSign ? - a : a; 1326 shiftCount = countLeadingZeros64( absA ); 1327 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 1328 1329 } 1330 1331 #endif 1332 1333 #endif /* !SOFTFLOAT_FOR_GCC */ 1334 1335 #ifdef FLOAT128 1336 1337 /* 1338 ------------------------------------------------------------------------------- 1339 Returns the result of converting the 64-bit two's complement integer `a' to 1340 the quadruple-precision floating-point format. The conversion is performed 1341 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1342 ------------------------------------------------------------------------------- 1343 */ 1344 float128 int64_to_float128( int64 a ) 1345 { 1346 flag zSign; 1347 uint64 absA; 1348 int8 shiftCount; 1349 int32 zExp; 1350 bits64 zSig0, zSig1; 1351 1352 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1353 zSign = ( a < 0 ); 1354 absA = zSign ? - a : a; 1355 shiftCount = countLeadingZeros64( absA ) + 49; 1356 zExp = 0x406E - shiftCount; 1357 if ( 64 <= shiftCount ) { 1358 zSig1 = 0; 1359 zSig0 = absA; 1360 shiftCount -= 64; 1361 } 1362 else { 1363 zSig1 = absA; 1364 zSig0 = 0; 1365 } 1366 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1367 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1368 1369 } 1370 1371 #endif 1372 1373 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 1374 /* 1375 ------------------------------------------------------------------------------- 1376 Returns the result of converting the single-precision floating-point value 1377 `a' to the 32-bit two's complement integer format. The conversion is 1378 performed according to the IEC/IEEE Standard for Binary Floating-Point 1379 Arithmetic---which means in particular that the conversion is rounded 1380 according to the current rounding mode. If `a' is a NaN, the largest 1381 positive integer is returned. Otherwise, if the conversion overflows, the 1382 largest integer with the same sign as `a' is returned. 1383 ------------------------------------------------------------------------------- 1384 */ 1385 int32 float32_to_int32( float32 a ) 1386 { 1387 flag aSign; 1388 int16 aExp, shiftCount; 1389 bits32 aSig; 1390 bits64 aSig64; 1391 1392 aSig = extractFloat32Frac( a ); 1393 aExp = extractFloat32Exp( a ); 1394 aSign = extractFloat32Sign( a ); 1395 if ( ( aExp == 0xFF ) && aSig ) aSign = 0; 1396 if ( aExp ) aSig |= 0x00800000; 1397 shiftCount = 0xAF - aExp; 1398 aSig64 = aSig; 1399 aSig64 <<= 32; 1400 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 ); 1401 return roundAndPackInt32( aSign, aSig64 ); 1402 1403 } 1404 #endif /* !SOFTFLOAT_FOR_GCC */ 1405 1406 /* 1407 ------------------------------------------------------------------------------- 1408 Returns the result of converting the single-precision floating-point value 1409 `a' to the 32-bit two's complement integer format. The conversion is 1410 performed according to the IEC/IEEE Standard for Binary Floating-Point 1411 Arithmetic, except that the conversion is always rounded toward zero. 1412 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 1413 the conversion overflows, the largest integer with the same sign as `a' is 1414 returned. 1415 ------------------------------------------------------------------------------- 1416 */ 1417 int32 float32_to_int32_round_to_zero( float32 a ) 1418 { 1419 flag aSign; 1420 int16 aExp, shiftCount; 1421 bits32 aSig; 1422 int32 z; 1423 1424 aSig = extractFloat32Frac( a ); 1425 aExp = extractFloat32Exp( a ); 1426 aSign = extractFloat32Sign( a ); 1427 shiftCount = aExp - 0x9E; 1428 if ( 0 <= shiftCount ) { 1429 if ( a != 0xCF000000 ) { 1430 float_raise( float_flag_invalid ); 1431 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF; 1432 } 1433 return (sbits32) 0x80000000; 1434 } 1435 else if ( aExp <= 0x7E ) { 1436 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact; 1437 return 0; 1438 } 1439 aSig = ( aSig | 0x00800000 )<<8; 1440 z = aSig>>( - shiftCount ); 1441 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) { 1442 float_exception_flags |= float_flag_inexact; 1443 } 1444 if ( aSign ) z = - z; 1445 return z; 1446 1447 } 1448 1449 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */ 1450 /* 1451 ------------------------------------------------------------------------------- 1452 Returns the result of converting the single-precision floating-point value 1453 `a' to the 64-bit two's complement integer format. The conversion is 1454 performed according to the IEC/IEEE Standard for Binary Floating-Point 1455 Arithmetic---which means in particular that the conversion is rounded 1456 according to the current rounding mode. If `a' is a NaN, the largest 1457 positive integer is returned. Otherwise, if the conversion overflows, the 1458 largest integer with the same sign as `a' is returned. 1459 ------------------------------------------------------------------------------- 1460 */ 1461 int64 float32_to_int64( float32 a ) 1462 { 1463 flag aSign; 1464 int16 aExp, shiftCount; 1465 bits32 aSig; 1466 bits64 aSig64, aSigExtra; 1467 1468 aSig = extractFloat32Frac( a ); 1469 aExp = extractFloat32Exp( a ); 1470 aSign = extractFloat32Sign( a ); 1471 shiftCount = 0xBE - aExp; 1472 if ( shiftCount < 0 ) { 1473 float_raise( float_flag_invalid ); 1474 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1475 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1476 } 1477 return (sbits64) LIT64( 0x8000000000000000 ); 1478 } 1479 if ( aExp ) aSig |= 0x00800000; 1480 aSig64 = aSig; 1481 aSig64 <<= 40; 1482 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra ); 1483 return roundAndPackInt64( aSign, aSig64, aSigExtra ); 1484 1485 } 1486 1487 /* 1488 ------------------------------------------------------------------------------- 1489 Returns the result of converting the single-precision floating-point value 1490 `a' to the 64-bit two's complement integer format. The conversion is 1491 performed according to the IEC/IEEE Standard for Binary Floating-Point 1492 Arithmetic, except that the conversion is always rounded toward zero. If 1493 `a' is a NaN, the largest positive integer is returned. Otherwise, if the 1494 conversion overflows, the largest integer with the same sign as `a' is 1495 returned. 1496 ------------------------------------------------------------------------------- 1497 */ 1498 int64 float32_to_int64_round_to_zero( float32 a ) 1499 { 1500 flag aSign; 1501 int16 aExp, shiftCount; 1502 bits32 aSig; 1503 bits64 aSig64; 1504 int64 z; 1505 1506 aSig = extractFloat32Frac( a ); 1507 aExp = extractFloat32Exp( a ); 1508 aSign = extractFloat32Sign( a ); 1509 shiftCount = aExp - 0xBE; 1510 if ( 0 <= shiftCount ) { 1511 if ( a != 0xDF000000 ) { 1512 float_raise( float_flag_invalid ); 1513 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1514 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1515 } 1516 } 1517 return (sbits64) LIT64( 0x8000000000000000 ); 1518 } 1519 else if ( aExp <= 0x7E ) { 1520 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact; 1521 return 0; 1522 } 1523 aSig64 = aSig | 0x00800000; 1524 aSig64 <<= 40; 1525 z = aSig64>>( - shiftCount ); 1526 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) { 1527 float_exception_flags |= float_flag_inexact; 1528 } 1529 if ( aSign ) z = - z; 1530 return z; 1531 1532 } 1533 #endif /* !SOFTFLOAT_FOR_GCC */ 1534 1535 /* 1536 ------------------------------------------------------------------------------- 1537 Returns the result of converting the single-precision floating-point value 1538 `a' to the double-precision floating-point format. The conversion is 1539 performed according to the IEC/IEEE Standard for Binary Floating-Point 1540 Arithmetic. 1541 ------------------------------------------------------------------------------- 1542 */ 1543 float64 float32_to_float64( float32 a ) 1544 { 1545 flag aSign; 1546 int16 aExp; 1547 bits32 aSig; 1548 1549 aSig = extractFloat32Frac( a ); 1550 aExp = extractFloat32Exp( a ); 1551 aSign = extractFloat32Sign( a ); 1552 if ( aExp == 0xFF ) { 1553 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) ); 1554 return packFloat64( aSign, 0x7FF, 0 ); 1555 } 1556 if ( aExp == 0 ) { 1557 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); 1558 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1559 --aExp; 1560 } 1561 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 ); 1562 1563 } 1564 1565 #ifdef FLOATX80 1566 1567 /* 1568 ------------------------------------------------------------------------------- 1569 Returns the result of converting the single-precision floating-point value 1570 `a' to the extended double-precision floating-point format. The conversion 1571 is performed according to the IEC/IEEE Standard for Binary Floating-Point 1572 Arithmetic. 1573 ------------------------------------------------------------------------------- 1574 */ 1575 floatx80 float32_to_floatx80( float32 a ) 1576 { 1577 flag aSign; 1578 int16 aExp; 1579 bits32 aSig; 1580 1581 aSig = extractFloat32Frac( a ); 1582 aExp = extractFloat32Exp( a ); 1583 aSign = extractFloat32Sign( a ); 1584 if ( aExp == 0xFF ) { 1585 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) ); 1586 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 1587 } 1588 if ( aExp == 0 ) { 1589 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 1590 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1591 } 1592 aSig |= 0x00800000; 1593 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 ); 1594 1595 } 1596 1597 #endif 1598 1599 #ifdef FLOAT128 1600 1601 /* 1602 ------------------------------------------------------------------------------- 1603 Returns the result of converting the single-precision floating-point value 1604 `a' to the double-precision floating-point format. The conversion is 1605 performed according to the IEC/IEEE Standard for Binary Floating-Point 1606 Arithmetic. 1607 ------------------------------------------------------------------------------- 1608 */ 1609 float128 float32_to_float128( float32 a ) 1610 { 1611 flag aSign; 1612 int16 aExp; 1613 bits32 aSig; 1614 1615 aSig = extractFloat32Frac( a ); 1616 aExp = extractFloat32Exp( a ); 1617 aSign = extractFloat32Sign( a ); 1618 if ( aExp == 0xFF ) { 1619 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) ); 1620 return packFloat128( aSign, 0x7FFF, 0, 0 ); 1621 } 1622 if ( aExp == 0 ) { 1623 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 1624 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1625 --aExp; 1626 } 1627 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 ); 1628 1629 } 1630 1631 #endif 1632 1633 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 1634 /* 1635 ------------------------------------------------------------------------------- 1636 Rounds the single-precision floating-point value `a' to an integer, and 1637 returns the result as a single-precision floating-point value. The 1638 operation is performed according to the IEC/IEEE Standard for Binary 1639 Floating-Point Arithmetic. 1640 ------------------------------------------------------------------------------- 1641 */ 1642 float32 float32_round_to_int( float32 a ) 1643 { 1644 flag aSign; 1645 int16 aExp; 1646 bits32 lastBitMask, roundBitsMask; 1647 int8 roundingMode; 1648 float32 z; 1649 1650 aExp = extractFloat32Exp( a ); 1651 if ( 0x96 <= aExp ) { 1652 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) { 1653 return propagateFloat32NaN( a, a ); 1654 } 1655 return a; 1656 } 1657 if ( aExp <= 0x7E ) { 1658 if ( (bits32) ( a<<1 ) == 0 ) return a; 1659 float_exception_flags |= float_flag_inexact; 1660 aSign = extractFloat32Sign( a ); 1661 switch ( float_rounding_mode ) { 1662 case float_round_nearest_even: 1663 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) { 1664 return packFloat32( aSign, 0x7F, 0 ); 1665 } 1666 break; 1667 case float_round_to_zero: 1668 break; 1669 case float_round_down: 1670 return aSign ? 0xBF800000 : 0; 1671 case float_round_up: 1672 return aSign ? 0x80000000 : 0x3F800000; 1673 } 1674 return packFloat32( aSign, 0, 0 ); 1675 } 1676 lastBitMask = 1; 1677 lastBitMask <<= 0x96 - aExp; 1678 roundBitsMask = lastBitMask - 1; 1679 z = a; 1680 roundingMode = float_rounding_mode; 1681 if ( roundingMode == float_round_nearest_even ) { 1682 z += lastBitMask>>1; 1683 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 1684 } 1685 else if ( roundingMode != float_round_to_zero ) { 1686 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) { 1687 z += roundBitsMask; 1688 } 1689 } 1690 z &= ~ roundBitsMask; 1691 if ( z != a ) float_exception_flags |= float_flag_inexact; 1692 return z; 1693 1694 } 1695 #endif /* !SOFTFLOAT_FOR_GCC */ 1696 1697 /* 1698 ------------------------------------------------------------------------------- 1699 Returns the result of adding the absolute values of the single-precision 1700 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 1701 before being returned. `zSign' is ignored if the result is a NaN. 1702 The addition is performed according to the IEC/IEEE Standard for Binary 1703 Floating-Point Arithmetic. 1704 ------------------------------------------------------------------------------- 1705 */ 1706 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign ) 1707 { 1708 int16 aExp, bExp, zExp; 1709 bits32 aSig, bSig, zSig; 1710 int16 expDiff; 1711 1712 aSig = extractFloat32Frac( a ); 1713 aExp = extractFloat32Exp( a ); 1714 bSig = extractFloat32Frac( b ); 1715 bExp = extractFloat32Exp( b ); 1716 expDiff = aExp - bExp; 1717 aSig <<= 6; 1718 bSig <<= 6; 1719 if ( 0 < expDiff ) { 1720 if ( aExp == 0xFF ) { 1721 if ( aSig ) return propagateFloat32NaN( a, b ); 1722 return a; 1723 } 1724 if ( bExp == 0 ) { 1725 --expDiff; 1726 } 1727 else { 1728 bSig |= 0x20000000; 1729 } 1730 shift32RightJamming( bSig, expDiff, &bSig ); 1731 zExp = aExp; 1732 } 1733 else if ( expDiff < 0 ) { 1734 if ( bExp == 0xFF ) { 1735 if ( bSig ) return propagateFloat32NaN( a, b ); 1736 return packFloat32( zSign, 0xFF, 0 ); 1737 } 1738 if ( aExp == 0 ) { 1739 ++expDiff; 1740 } 1741 else { 1742 aSig |= 0x20000000; 1743 } 1744 shift32RightJamming( aSig, - expDiff, &aSig ); 1745 zExp = bExp; 1746 } 1747 else { 1748 if ( aExp == 0xFF ) { 1749 if ( aSig | bSig ) return propagateFloat32NaN( a, b ); 1750 return a; 1751 } 1752 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 ); 1753 zSig = 0x40000000 + aSig + bSig; 1754 zExp = aExp; 1755 goto roundAndPack; 1756 } 1757 aSig |= 0x20000000; 1758 zSig = ( aSig + bSig )<<1; 1759 --zExp; 1760 if ( (sbits32) zSig < 0 ) { 1761 zSig = aSig + bSig; 1762 ++zExp; 1763 } 1764 roundAndPack: 1765 return roundAndPackFloat32( zSign, zExp, zSig ); 1766 1767 } 1768 1769 /* 1770 ------------------------------------------------------------------------------- 1771 Returns the result of subtracting the absolute values of the single- 1772 precision floating-point values `a' and `b'. If `zSign' is 1, the 1773 difference is negated before being returned. `zSign' is ignored if the 1774 result is a NaN. The subtraction is performed according to the IEC/IEEE 1775 Standard for Binary Floating-Point Arithmetic. 1776 ------------------------------------------------------------------------------- 1777 */ 1778 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign ) 1779 { 1780 int16 aExp, bExp, zExp; 1781 bits32 aSig, bSig, zSig; 1782 int16 expDiff; 1783 1784 aSig = extractFloat32Frac( a ); 1785 aExp = extractFloat32Exp( a ); 1786 bSig = extractFloat32Frac( b ); 1787 bExp = extractFloat32Exp( b ); 1788 expDiff = aExp - bExp; 1789 aSig <<= 7; 1790 bSig <<= 7; 1791 if ( 0 < expDiff ) goto aExpBigger; 1792 if ( expDiff < 0 ) goto bExpBigger; 1793 if ( aExp == 0xFF ) { 1794 if ( aSig | bSig ) return propagateFloat32NaN( a, b ); 1795 float_raise( float_flag_invalid ); 1796 return float32_default_nan; 1797 } 1798 if ( aExp == 0 ) { 1799 aExp = 1; 1800 bExp = 1; 1801 } 1802 if ( bSig < aSig ) goto aBigger; 1803 if ( aSig < bSig ) goto bBigger; 1804 return packFloat32( float_rounding_mode == float_round_down, 0, 0 ); 1805 bExpBigger: 1806 if ( bExp == 0xFF ) { 1807 if ( bSig ) return propagateFloat32NaN( a, b ); 1808 return packFloat32( zSign ^ 1, 0xFF, 0 ); 1809 } 1810 if ( aExp == 0 ) { 1811 ++expDiff; 1812 } 1813 else { 1814 aSig |= 0x40000000; 1815 } 1816 shift32RightJamming( aSig, - expDiff, &aSig ); 1817 bSig |= 0x40000000; 1818 bBigger: 1819 zSig = bSig - aSig; 1820 zExp = bExp; 1821 zSign ^= 1; 1822 goto normalizeRoundAndPack; 1823 aExpBigger: 1824 if ( aExp == 0xFF ) { 1825 if ( aSig ) return propagateFloat32NaN( a, b ); 1826 return a; 1827 } 1828 if ( bExp == 0 ) { 1829 --expDiff; 1830 } 1831 else { 1832 bSig |= 0x40000000; 1833 } 1834 shift32RightJamming( bSig, expDiff, &bSig ); 1835 aSig |= 0x40000000; 1836 aBigger: 1837 zSig = aSig - bSig; 1838 zExp = aExp; 1839 normalizeRoundAndPack: 1840 --zExp; 1841 return normalizeRoundAndPackFloat32( zSign, zExp, zSig ); 1842 1843 } 1844 1845 /* 1846 ------------------------------------------------------------------------------- 1847 Returns the result of adding the single-precision floating-point values `a' 1848 and `b'. The operation is performed according to the IEC/IEEE Standard for 1849 Binary Floating-Point Arithmetic. 1850 ------------------------------------------------------------------------------- 1851 */ 1852 float32 float32_add( float32 a, float32 b ) 1853 { 1854 flag aSign, bSign; 1855 1856 aSign = extractFloat32Sign( a ); 1857 bSign = extractFloat32Sign( b ); 1858 if ( aSign == bSign ) { 1859 return addFloat32Sigs( a, b, aSign ); 1860 } 1861 else { 1862 return subFloat32Sigs( a, b, aSign ); 1863 } 1864 1865 } 1866 1867 /* 1868 ------------------------------------------------------------------------------- 1869 Returns the result of subtracting the single-precision floating-point values 1870 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1871 for Binary Floating-Point Arithmetic. 1872 ------------------------------------------------------------------------------- 1873 */ 1874 float32 float32_sub( float32 a, float32 b ) 1875 { 1876 flag aSign, bSign; 1877 1878 aSign = extractFloat32Sign( a ); 1879 bSign = extractFloat32Sign( b ); 1880 if ( aSign == bSign ) { 1881 return subFloat32Sigs( a, b, aSign ); 1882 } 1883 else { 1884 return addFloat32Sigs( a, b, aSign ); 1885 } 1886 1887 } 1888 1889 /* 1890 ------------------------------------------------------------------------------- 1891 Returns the result of multiplying the single-precision floating-point values 1892 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1893 for Binary Floating-Point Arithmetic. 1894 ------------------------------------------------------------------------------- 1895 */ 1896 float32 float32_mul( float32 a, float32 b ) 1897 { 1898 flag aSign, bSign, zSign; 1899 int16 aExp, bExp, zExp; 1900 bits32 aSig, bSig; 1901 bits64 zSig64; 1902 bits32 zSig; 1903 1904 aSig = extractFloat32Frac( a ); 1905 aExp = extractFloat32Exp( a ); 1906 aSign = extractFloat32Sign( a ); 1907 bSig = extractFloat32Frac( b ); 1908 bExp = extractFloat32Exp( b ); 1909 bSign = extractFloat32Sign( b ); 1910 zSign = aSign ^ bSign; 1911 if ( aExp == 0xFF ) { 1912 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 1913 return propagateFloat32NaN( a, b ); 1914 } 1915 if ( ( bExp | bSig ) == 0 ) { 1916 float_raise( float_flag_invalid ); 1917 return float32_default_nan; 1918 } 1919 return packFloat32( zSign, 0xFF, 0 ); 1920 } 1921 if ( bExp == 0xFF ) { 1922 if ( bSig ) return propagateFloat32NaN( a, b ); 1923 if ( ( aExp | aSig ) == 0 ) { 1924 float_raise( float_flag_invalid ); 1925 return float32_default_nan; 1926 } 1927 return packFloat32( zSign, 0xFF, 0 ); 1928 } 1929 if ( aExp == 0 ) { 1930 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 1931 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1932 } 1933 if ( bExp == 0 ) { 1934 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 ); 1935 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1936 } 1937 zExp = aExp + bExp - 0x7F; 1938 aSig = ( aSig | 0x00800000 )<<7; 1939 bSig = ( bSig | 0x00800000 )<<8; 1940 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 ); 1941 zSig = zSig64; 1942 if ( 0 <= (sbits32) ( zSig<<1 ) ) { 1943 zSig <<= 1; 1944 --zExp; 1945 } 1946 return roundAndPackFloat32( zSign, zExp, zSig ); 1947 1948 } 1949 1950 /* 1951 ------------------------------------------------------------------------------- 1952 Returns the result of dividing the single-precision floating-point value `a' 1953 by the corresponding value `b'. The operation is performed according to the 1954 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1955 ------------------------------------------------------------------------------- 1956 */ 1957 float32 float32_div( float32 a, float32 b ) 1958 { 1959 flag aSign, bSign, zSign; 1960 int16 aExp, bExp, zExp; 1961 bits32 aSig, bSig, zSig; 1962 1963 aSig = extractFloat32Frac( a ); 1964 aExp = extractFloat32Exp( a ); 1965 aSign = extractFloat32Sign( a ); 1966 bSig = extractFloat32Frac( b ); 1967 bExp = extractFloat32Exp( b ); 1968 bSign = extractFloat32Sign( b ); 1969 zSign = aSign ^ bSign; 1970 if ( aExp == 0xFF ) { 1971 if ( aSig ) return propagateFloat32NaN( a, b ); 1972 if ( bExp == 0xFF ) { 1973 if ( bSig ) return propagateFloat32NaN( a, b ); 1974 float_raise( float_flag_invalid ); 1975 return float32_default_nan; 1976 } 1977 return packFloat32( zSign, 0xFF, 0 ); 1978 } 1979 if ( bExp == 0xFF ) { 1980 if ( bSig ) return propagateFloat32NaN( a, b ); 1981 return packFloat32( zSign, 0, 0 ); 1982 } 1983 if ( bExp == 0 ) { 1984 if ( bSig == 0 ) { 1985 if ( ( aExp | aSig ) == 0 ) { 1986 float_raise( float_flag_invalid ); 1987 return float32_default_nan; 1988 } 1989 float_raise( float_flag_divbyzero ); 1990 return packFloat32( zSign, 0xFF, 0 ); 1991 } 1992 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1993 } 1994 if ( aExp == 0 ) { 1995 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 1996 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1997 } 1998 zExp = aExp - bExp + 0x7D; 1999 aSig = ( aSig | 0x00800000 )<<7; 2000 bSig = ( bSig | 0x00800000 )<<8; 2001 if ( bSig <= ( aSig + aSig ) ) { 2002 aSig >>= 1; 2003 ++zExp; 2004 } 2005 zSig = ( ( (bits64) aSig )<<32 ) / bSig; 2006 if ( ( zSig & 0x3F ) == 0 ) { 2007 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 ); 2008 } 2009 return roundAndPackFloat32( zSign, zExp, zSig ); 2010 2011 } 2012 2013 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2014 /* 2015 ------------------------------------------------------------------------------- 2016 Returns the remainder of the single-precision floating-point value `a' 2017 with respect to the corresponding value `b'. The operation is performed 2018 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2019 ------------------------------------------------------------------------------- 2020 */ 2021 float32 float32_rem( float32 a, float32 b ) 2022 { 2023 flag aSign, bSign, zSign; 2024 int16 aExp, bExp, expDiff; 2025 bits32 aSig, bSig; 2026 bits32 q; 2027 bits64 aSig64, bSig64, q64; 2028 bits32 alternateASig; 2029 sbits32 sigMean; 2030 2031 aSig = extractFloat32Frac( a ); 2032 aExp = extractFloat32Exp( a ); 2033 aSign = extractFloat32Sign( a ); 2034 bSig = extractFloat32Frac( b ); 2035 bExp = extractFloat32Exp( b ); 2036 bSign = extractFloat32Sign( b ); 2037 if ( aExp == 0xFF ) { 2038 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 2039 return propagateFloat32NaN( a, b ); 2040 } 2041 float_raise( float_flag_invalid ); 2042 return float32_default_nan; 2043 } 2044 if ( bExp == 0xFF ) { 2045 if ( bSig ) return propagateFloat32NaN( a, b ); 2046 return a; 2047 } 2048 if ( bExp == 0 ) { 2049 if ( bSig == 0 ) { 2050 float_raise( float_flag_invalid ); 2051 return float32_default_nan; 2052 } 2053 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 2054 } 2055 if ( aExp == 0 ) { 2056 if ( aSig == 0 ) return a; 2057 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2058 } 2059 expDiff = aExp - bExp; 2060 aSig |= 0x00800000; 2061 bSig |= 0x00800000; 2062 if ( expDiff < 32 ) { 2063 aSig <<= 8; 2064 bSig <<= 8; 2065 if ( expDiff < 0 ) { 2066 if ( expDiff < -1 ) return a; 2067 aSig >>= 1; 2068 } 2069 q = ( bSig <= aSig ); 2070 if ( q ) aSig -= bSig; 2071 if ( 0 < expDiff ) { 2072 q = ( ( (bits64) aSig )<<32 ) / bSig; 2073 q >>= 32 - expDiff; 2074 bSig >>= 2; 2075 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 2076 } 2077 else { 2078 aSig >>= 2; 2079 bSig >>= 2; 2080 } 2081 } 2082 else { 2083 if ( bSig <= aSig ) aSig -= bSig; 2084 aSig64 = ( (bits64) aSig )<<40; 2085 bSig64 = ( (bits64) bSig )<<40; 2086 expDiff -= 64; 2087 while ( 0 < expDiff ) { 2088 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2089 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2090 aSig64 = - ( ( bSig * q64 )<<38 ); 2091 expDiff -= 62; 2092 } 2093 expDiff += 64; 2094 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2095 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2096 q = q64>>( 64 - expDiff ); 2097 bSig <<= 6; 2098 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 2099 } 2100 do { 2101 alternateASig = aSig; 2102 ++q; 2103 aSig -= bSig; 2104 } while ( 0 <= (sbits32) aSig ); 2105 sigMean = aSig + alternateASig; 2106 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 2107 aSig = alternateASig; 2108 } 2109 zSign = ( (sbits32) aSig < 0 ); 2110 if ( zSign ) aSig = - aSig; 2111 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig ); 2112 2113 } 2114 #endif /* !SOFTFLOAT_FOR_GCC */ 2115 2116 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2117 /* 2118 ------------------------------------------------------------------------------- 2119 Returns the square root of the single-precision floating-point value `a'. 2120 The operation is performed according to the IEC/IEEE Standard for Binary 2121 Floating-Point Arithmetic. 2122 ------------------------------------------------------------------------------- 2123 */ 2124 float32 float32_sqrt( float32 a ) 2125 { 2126 flag aSign; 2127 int16 aExp, zExp; 2128 bits32 aSig, zSig; 2129 bits64 rem, term; 2130 2131 aSig = extractFloat32Frac( a ); 2132 aExp = extractFloat32Exp( a ); 2133 aSign = extractFloat32Sign( a ); 2134 if ( aExp == 0xFF ) { 2135 if ( aSig ) return propagateFloat32NaN( a, 0 ); 2136 if ( ! aSign ) return a; 2137 float_raise( float_flag_invalid ); 2138 return float32_default_nan; 2139 } 2140 if ( aSign ) { 2141 if ( ( aExp | aSig ) == 0 ) return a; 2142 float_raise( float_flag_invalid ); 2143 return float32_default_nan; 2144 } 2145 if ( aExp == 0 ) { 2146 if ( aSig == 0 ) return 0; 2147 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2148 } 2149 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E; 2150 aSig = ( aSig | 0x00800000 )<<8; 2151 zSig = estimateSqrt32( aExp, aSig ) + 2; 2152 if ( ( zSig & 0x7F ) <= 5 ) { 2153 if ( zSig < 2 ) { 2154 zSig = 0x7FFFFFFF; 2155 goto roundAndPack; 2156 } 2157 aSig >>= aExp & 1; 2158 term = ( (bits64) zSig ) * zSig; 2159 rem = ( ( (bits64) aSig )<<32 ) - term; 2160 while ( (sbits64) rem < 0 ) { 2161 --zSig; 2162 rem += ( ( (bits64) zSig )<<1 ) | 1; 2163 } 2164 zSig |= ( rem != 0 ); 2165 } 2166 shift32RightJamming( zSig, 1, &zSig ); 2167 roundAndPack: 2168 return roundAndPackFloat32( 0, zExp, zSig ); 2169 2170 } 2171 #endif /* !SOFTFLOAT_FOR_GCC */ 2172 2173 /* 2174 ------------------------------------------------------------------------------- 2175 Returns 1 if the single-precision floating-point value `a' is equal to 2176 the corresponding value `b', and 0 otherwise. The comparison is performed 2177 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2178 ------------------------------------------------------------------------------- 2179 */ 2180 flag float32_eq( float32 a, float32 b ) 2181 { 2182 2183 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2184 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2185 ) { 2186 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2187 float_raise( float_flag_invalid ); 2188 } 2189 return 0; 2190 } 2191 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2192 2193 } 2194 2195 /* 2196 ------------------------------------------------------------------------------- 2197 Returns 1 if the single-precision floating-point value `a' is less than 2198 or equal to the corresponding value `b', and 0 otherwise. The comparison 2199 is performed according to the IEC/IEEE Standard for Binary Floating-Point 2200 Arithmetic. 2201 ------------------------------------------------------------------------------- 2202 */ 2203 flag float32_le( float32 a, float32 b ) 2204 { 2205 flag aSign, bSign; 2206 2207 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2208 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2209 ) { 2210 float_raise( float_flag_invalid ); 2211 return 0; 2212 } 2213 aSign = extractFloat32Sign( a ); 2214 bSign = extractFloat32Sign( b ); 2215 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2216 return ( a == b ) || ( aSign ^ ( a < b ) ); 2217 2218 } 2219 2220 /* 2221 ------------------------------------------------------------------------------- 2222 Returns 1 if the single-precision floating-point value `a' is less than 2223 the corresponding value `b', and 0 otherwise. The comparison is performed 2224 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2225 ------------------------------------------------------------------------------- 2226 */ 2227 flag float32_lt( float32 a, float32 b ) 2228 { 2229 flag aSign, bSign; 2230 2231 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2232 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2233 ) { 2234 float_raise( float_flag_invalid ); 2235 return 0; 2236 } 2237 aSign = extractFloat32Sign( a ); 2238 bSign = extractFloat32Sign( b ); 2239 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); 2240 return ( a != b ) && ( aSign ^ ( a < b ) ); 2241 2242 } 2243 2244 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2245 /* 2246 ------------------------------------------------------------------------------- 2247 Returns 1 if the single-precision floating-point value `a' is equal to 2248 the corresponding value `b', and 0 otherwise. The invalid exception is 2249 raised if either operand is a NaN. Otherwise, the comparison is performed 2250 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2251 ------------------------------------------------------------------------------- 2252 */ 2253 flag float32_eq_signaling( float32 a, float32 b ) 2254 { 2255 2256 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2257 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2258 ) { 2259 float_raise( float_flag_invalid ); 2260 return 0; 2261 } 2262 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2263 2264 } 2265 2266 /* 2267 ------------------------------------------------------------------------------- 2268 Returns 1 if the single-precision floating-point value `a' is less than or 2269 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 2270 cause an exception. Otherwise, the comparison is performed according to the 2271 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2272 ------------------------------------------------------------------------------- 2273 */ 2274 flag float32_le_quiet( float32 a, float32 b ) 2275 { 2276 flag aSign, bSign; 2277 2278 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2279 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2280 ) { 2281 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2282 float_raise( float_flag_invalid ); 2283 } 2284 return 0; 2285 } 2286 aSign = extractFloat32Sign( a ); 2287 bSign = extractFloat32Sign( b ); 2288 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2289 return ( a == b ) || ( aSign ^ ( a < b ) ); 2290 2291 } 2292 2293 /* 2294 ------------------------------------------------------------------------------- 2295 Returns 1 if the single-precision floating-point value `a' is less than 2296 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 2297 exception. Otherwise, the comparison is performed according to the IEC/IEEE 2298 Standard for Binary Floating-Point Arithmetic. 2299 ------------------------------------------------------------------------------- 2300 */ 2301 flag float32_lt_quiet( float32 a, float32 b ) 2302 { 2303 flag aSign, bSign; 2304 2305 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2306 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2307 ) { 2308 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2309 float_raise( float_flag_invalid ); 2310 } 2311 return 0; 2312 } 2313 aSign = extractFloat32Sign( a ); 2314 bSign = extractFloat32Sign( b ); 2315 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); 2316 return ( a != b ) && ( aSign ^ ( a < b ) ); 2317 2318 } 2319 #endif /* !SOFTFLOAT_FOR_GCC */ 2320 2321 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2322 /* 2323 ------------------------------------------------------------------------------- 2324 Returns the result of converting the double-precision floating-point value 2325 `a' to the 32-bit two's complement integer format. The conversion is 2326 performed according to the IEC/IEEE Standard for Binary Floating-Point 2327 Arithmetic---which means in particular that the conversion is rounded 2328 according to the current rounding mode. If `a' is a NaN, the largest 2329 positive integer is returned. Otherwise, if the conversion overflows, the 2330 largest integer with the same sign as `a' is returned. 2331 ------------------------------------------------------------------------------- 2332 */ 2333 int32 float64_to_int32( float64 a ) 2334 { 2335 flag aSign; 2336 int16 aExp, shiftCount; 2337 bits64 aSig; 2338 2339 aSig = extractFloat64Frac( a ); 2340 aExp = extractFloat64Exp( a ); 2341 aSign = extractFloat64Sign( a ); 2342 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 2343 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2344 shiftCount = 0x42C - aExp; 2345 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig ); 2346 return roundAndPackInt32( aSign, aSig ); 2347 2348 } 2349 #endif /* !SOFTFLOAT_FOR_GCC */ 2350 2351 /* 2352 ------------------------------------------------------------------------------- 2353 Returns the result of converting the double-precision floating-point value 2354 `a' to the 32-bit two's complement integer format. The conversion is 2355 performed according to the IEC/IEEE Standard for Binary Floating-Point 2356 Arithmetic, except that the conversion is always rounded toward zero. 2357 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2358 the conversion overflows, the largest integer with the same sign as `a' is 2359 returned. 2360 ------------------------------------------------------------------------------- 2361 */ 2362 int32 float64_to_int32_round_to_zero( float64 a ) 2363 { 2364 flag aSign; 2365 int16 aExp, shiftCount; 2366 bits64 aSig, savedASig; 2367 int32 z; 2368 2369 aSig = extractFloat64Frac( a ); 2370 aExp = extractFloat64Exp( a ); 2371 aSign = extractFloat64Sign( a ); 2372 if ( 0x41E < aExp ) { 2373 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 2374 goto invalid; 2375 } 2376 else if ( aExp < 0x3FF ) { 2377 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact; 2378 return 0; 2379 } 2380 aSig |= LIT64( 0x0010000000000000 ); 2381 shiftCount = 0x433 - aExp; 2382 savedASig = aSig; 2383 aSig >>= shiftCount; 2384 z = aSig; 2385 if ( aSign ) z = - z; 2386 if ( ( z < 0 ) ^ aSign ) { 2387 invalid: 2388 float_raise( float_flag_invalid ); 2389 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 2390 } 2391 if ( ( aSig<<shiftCount ) != savedASig ) { 2392 float_exception_flags |= float_flag_inexact; 2393 } 2394 return z; 2395 2396 } 2397 2398 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2399 /* 2400 ------------------------------------------------------------------------------- 2401 Returns the result of converting the double-precision floating-point value 2402 `a' to the 64-bit two's complement integer format. The conversion is 2403 performed according to the IEC/IEEE Standard for Binary Floating-Point 2404 Arithmetic---which means in particular that the conversion is rounded 2405 according to the current rounding mode. If `a' is a NaN, the largest 2406 positive integer is returned. Otherwise, if the conversion overflows, the 2407 largest integer with the same sign as `a' is returned. 2408 ------------------------------------------------------------------------------- 2409 */ 2410 int64 float64_to_int64( float64 a ) 2411 { 2412 flag aSign; 2413 int16 aExp, shiftCount; 2414 bits64 aSig, aSigExtra; 2415 2416 aSig = extractFloat64Frac( a ); 2417 aExp = extractFloat64Exp( a ); 2418 aSign = extractFloat64Sign( a ); 2419 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2420 shiftCount = 0x433 - aExp; 2421 if ( shiftCount <= 0 ) { 2422 if ( 0x43E < aExp ) { 2423 float_raise( float_flag_invalid ); 2424 if ( ! aSign 2425 || ( ( aExp == 0x7FF ) 2426 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2427 ) { 2428 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2429 } 2430 return (sbits64) LIT64( 0x8000000000000000 ); 2431 } 2432 aSigExtra = 0; 2433 aSig <<= - shiftCount; 2434 } 2435 else { 2436 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 2437 } 2438 return roundAndPackInt64( aSign, aSig, aSigExtra ); 2439 2440 } 2441 2442 /* 2443 ------------------------------------------------------------------------------- 2444 Returns the result of converting the double-precision floating-point value 2445 `a' to the 64-bit two's complement integer format. The conversion is 2446 performed according to the IEC/IEEE Standard for Binary Floating-Point 2447 Arithmetic, except that the conversion is always rounded toward zero. 2448 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2449 the conversion overflows, the largest integer with the same sign as `a' is 2450 returned. 2451 ------------------------------------------------------------------------------- 2452 */ 2453 int64 float64_to_int64_round_to_zero( float64 a ) 2454 { 2455 flag aSign; 2456 int16 aExp, shiftCount; 2457 bits64 aSig; 2458 int64 z; 2459 2460 aSig = extractFloat64Frac( a ); 2461 aExp = extractFloat64Exp( a ); 2462 aSign = extractFloat64Sign( a ); 2463 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2464 shiftCount = aExp - 0x433; 2465 if ( 0 <= shiftCount ) { 2466 if ( 0x43E <= aExp ) { 2467 if ( a != LIT64( 0xC3E0000000000000 ) ) { 2468 float_raise( float_flag_invalid ); 2469 if ( ! aSign 2470 || ( ( aExp == 0x7FF ) 2471 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2472 ) { 2473 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2474 } 2475 } 2476 return (sbits64) LIT64( 0x8000000000000000 ); 2477 } 2478 z = aSig<<shiftCount; 2479 } 2480 else { 2481 if ( aExp < 0x3FE ) { 2482 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact; 2483 return 0; 2484 } 2485 z = aSig>>( - shiftCount ); 2486 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { 2487 float_exception_flags |= float_flag_inexact; 2488 } 2489 } 2490 if ( aSign ) z = - z; 2491 return z; 2492 2493 } 2494 #endif /* !SOFTFLOAT_FOR_GCC */ 2495 2496 /* 2497 ------------------------------------------------------------------------------- 2498 Returns the result of converting the double-precision floating-point value 2499 `a' to the single-precision floating-point format. The conversion is 2500 performed according to the IEC/IEEE Standard for Binary Floating-Point 2501 Arithmetic. 2502 ------------------------------------------------------------------------------- 2503 */ 2504 float32 float64_to_float32( float64 a ) 2505 { 2506 flag aSign; 2507 int16 aExp; 2508 bits64 aSig; 2509 bits32 zSig; 2510 2511 aSig = extractFloat64Frac( a ); 2512 aExp = extractFloat64Exp( a ); 2513 aSign = extractFloat64Sign( a ); 2514 if ( aExp == 0x7FF ) { 2515 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) ); 2516 return packFloat32( aSign, 0xFF, 0 ); 2517 } 2518 shift64RightJamming( aSig, 22, &aSig ); 2519 zSig = aSig; 2520 if ( aExp || zSig ) { 2521 zSig |= 0x40000000; 2522 aExp -= 0x381; 2523 } 2524 return roundAndPackFloat32( aSign, aExp, zSig ); 2525 2526 } 2527 2528 #ifdef FLOATX80 2529 2530 /* 2531 ------------------------------------------------------------------------------- 2532 Returns the result of converting the double-precision floating-point value 2533 `a' to the extended double-precision floating-point format. The conversion 2534 is performed according to the IEC/IEEE Standard for Binary Floating-Point 2535 Arithmetic. 2536 ------------------------------------------------------------------------------- 2537 */ 2538 floatx80 float64_to_floatx80( float64 a ) 2539 { 2540 flag aSign; 2541 int16 aExp; 2542 bits64 aSig; 2543 2544 aSig = extractFloat64Frac( a ); 2545 aExp = extractFloat64Exp( a ); 2546 aSign = extractFloat64Sign( a ); 2547 if ( aExp == 0x7FF ) { 2548 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) ); 2549 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 2550 } 2551 if ( aExp == 0 ) { 2552 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 2553 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2554 } 2555 return 2556 packFloatx80( 2557 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 2558 2559 } 2560 2561 #endif 2562 2563 #ifdef FLOAT128 2564 2565 /* 2566 ------------------------------------------------------------------------------- 2567 Returns the result of converting the double-precision floating-point value 2568 `a' to the quadruple-precision floating-point format. The conversion is 2569 performed according to the IEC/IEEE Standard for Binary Floating-Point 2570 Arithmetic. 2571 ------------------------------------------------------------------------------- 2572 */ 2573 float128 float64_to_float128( float64 a ) 2574 { 2575 flag aSign; 2576 int16 aExp; 2577 bits64 aSig, zSig0, zSig1; 2578 2579 aSig = extractFloat64Frac( a ); 2580 aExp = extractFloat64Exp( a ); 2581 aSign = extractFloat64Sign( a ); 2582 if ( aExp == 0x7FF ) { 2583 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) ); 2584 return packFloat128( aSign, 0x7FFF, 0, 0 ); 2585 } 2586 if ( aExp == 0 ) { 2587 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 2588 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2589 --aExp; 2590 } 2591 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 2592 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 2593 2594 } 2595 2596 #endif 2597 2598 #ifndef SOFTFLOAT_FOR_GCC 2599 /* 2600 ------------------------------------------------------------------------------- 2601 Rounds the double-precision floating-point value `a' to an integer, and 2602 returns the result as a double-precision floating-point value. The 2603 operation is performed according to the IEC/IEEE Standard for Binary 2604 Floating-Point Arithmetic. 2605 ------------------------------------------------------------------------------- 2606 */ 2607 float64 float64_round_to_int( float64 a ) 2608 { 2609 flag aSign; 2610 int16 aExp; 2611 bits64 lastBitMask, roundBitsMask; 2612 int8 roundingMode; 2613 float64 z; 2614 2615 aExp = extractFloat64Exp( a ); 2616 if ( 0x433 <= aExp ) { 2617 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) { 2618 return propagateFloat64NaN( a, a ); 2619 } 2620 return a; 2621 } 2622 if ( aExp < 0x3FF ) { 2623 if ( (bits64) ( a<<1 ) == 0 ) return a; 2624 float_exception_flags |= float_flag_inexact; 2625 aSign = extractFloat64Sign( a ); 2626 switch ( float_rounding_mode ) { 2627 case float_round_nearest_even: 2628 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) { 2629 return packFloat64( aSign, 0x3FF, 0 ); 2630 } 2631 break; 2632 case float_round_to_zero: 2633 break; 2634 case float_round_down: 2635 return aSign ? LIT64( 0xBFF0000000000000 ) : 0; 2636 case float_round_up: 2637 return 2638 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ); 2639 } 2640 return packFloat64( aSign, 0, 0 ); 2641 } 2642 lastBitMask = 1; 2643 lastBitMask <<= 0x433 - aExp; 2644 roundBitsMask = lastBitMask - 1; 2645 z = a; 2646 roundingMode = float_rounding_mode; 2647 if ( roundingMode == float_round_nearest_even ) { 2648 z += lastBitMask>>1; 2649 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 2650 } 2651 else if ( roundingMode != float_round_to_zero ) { 2652 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) { 2653 z += roundBitsMask; 2654 } 2655 } 2656 z &= ~ roundBitsMask; 2657 if ( z != a ) float_exception_flags |= float_flag_inexact; 2658 return z; 2659 2660 } 2661 #endif 2662 2663 /* 2664 ------------------------------------------------------------------------------- 2665 Returns the result of adding the absolute values of the double-precision 2666 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 2667 before being returned. `zSign' is ignored if the result is a NaN. 2668 The addition is performed according to the IEC/IEEE Standard for Binary 2669 Floating-Point Arithmetic. 2670 ------------------------------------------------------------------------------- 2671 */ 2672 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign ) 2673 { 2674 int16 aExp, bExp, zExp; 2675 bits64 aSig, bSig, zSig; 2676 int16 expDiff; 2677 2678 aSig = extractFloat64Frac( a ); 2679 aExp = extractFloat64Exp( a ); 2680 bSig = extractFloat64Frac( b ); 2681 bExp = extractFloat64Exp( b ); 2682 expDiff = aExp - bExp; 2683 aSig <<= 9; 2684 bSig <<= 9; 2685 if ( 0 < expDiff ) { 2686 if ( aExp == 0x7FF ) { 2687 if ( aSig ) return propagateFloat64NaN( a, b ); 2688 return a; 2689 } 2690 if ( bExp == 0 ) { 2691 --expDiff; 2692 } 2693 else { 2694 bSig |= LIT64( 0x2000000000000000 ); 2695 } 2696 shift64RightJamming( bSig, expDiff, &bSig ); 2697 zExp = aExp; 2698 } 2699 else if ( expDiff < 0 ) { 2700 if ( bExp == 0x7FF ) { 2701 if ( bSig ) return propagateFloat64NaN( a, b ); 2702 return packFloat64( zSign, 0x7FF, 0 ); 2703 } 2704 if ( aExp == 0 ) { 2705 ++expDiff; 2706 } 2707 else { 2708 aSig |= LIT64( 0x2000000000000000 ); 2709 } 2710 shift64RightJamming( aSig, - expDiff, &aSig ); 2711 zExp = bExp; 2712 } 2713 else { 2714 if ( aExp == 0x7FF ) { 2715 if ( aSig | bSig ) return propagateFloat64NaN( a, b ); 2716 return a; 2717 } 2718 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 ); 2719 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig; 2720 zExp = aExp; 2721 goto roundAndPack; 2722 } 2723 aSig |= LIT64( 0x2000000000000000 ); 2724 zSig = ( aSig + bSig )<<1; 2725 --zExp; 2726 if ( (sbits64) zSig < 0 ) { 2727 zSig = aSig + bSig; 2728 ++zExp; 2729 } 2730 roundAndPack: 2731 return roundAndPackFloat64( zSign, zExp, zSig ); 2732 2733 } 2734 2735 /* 2736 ------------------------------------------------------------------------------- 2737 Returns the result of subtracting the absolute values of the double- 2738 precision floating-point values `a' and `b'. If `zSign' is 1, the 2739 difference is negated before being returned. `zSign' is ignored if the 2740 result is a NaN. The subtraction is performed according to the IEC/IEEE 2741 Standard for Binary Floating-Point Arithmetic. 2742 ------------------------------------------------------------------------------- 2743 */ 2744 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign ) 2745 { 2746 int16 aExp, bExp, zExp; 2747 bits64 aSig, bSig, zSig; 2748 int16 expDiff; 2749 2750 aSig = extractFloat64Frac( a ); 2751 aExp = extractFloat64Exp( a ); 2752 bSig = extractFloat64Frac( b ); 2753 bExp = extractFloat64Exp( b ); 2754 expDiff = aExp - bExp; 2755 aSig <<= 10; 2756 bSig <<= 10; 2757 if ( 0 < expDiff ) goto aExpBigger; 2758 if ( expDiff < 0 ) goto bExpBigger; 2759 if ( aExp == 0x7FF ) { 2760 if ( aSig | bSig ) return propagateFloat64NaN( a, b ); 2761 float_raise( float_flag_invalid ); 2762 return float64_default_nan; 2763 } 2764 if ( aExp == 0 ) { 2765 aExp = 1; 2766 bExp = 1; 2767 } 2768 if ( bSig < aSig ) goto aBigger; 2769 if ( aSig < bSig ) goto bBigger; 2770 return packFloat64( float_rounding_mode == float_round_down, 0, 0 ); 2771 bExpBigger: 2772 if ( bExp == 0x7FF ) { 2773 if ( bSig ) return propagateFloat64NaN( a, b ); 2774 return packFloat64( zSign ^ 1, 0x7FF, 0 ); 2775 } 2776 if ( aExp == 0 ) { 2777 ++expDiff; 2778 } 2779 else { 2780 aSig |= LIT64( 0x4000000000000000 ); 2781 } 2782 shift64RightJamming( aSig, - expDiff, &aSig ); 2783 bSig |= LIT64( 0x4000000000000000 ); 2784 bBigger: 2785 zSig = bSig - aSig; 2786 zExp = bExp; 2787 zSign ^= 1; 2788 goto normalizeRoundAndPack; 2789 aExpBigger: 2790 if ( aExp == 0x7FF ) { 2791 if ( aSig ) return propagateFloat64NaN( a, b ); 2792 return a; 2793 } 2794 if ( bExp == 0 ) { 2795 --expDiff; 2796 } 2797 else { 2798 bSig |= LIT64( 0x4000000000000000 ); 2799 } 2800 shift64RightJamming( bSig, expDiff, &bSig ); 2801 aSig |= LIT64( 0x4000000000000000 ); 2802 aBigger: 2803 zSig = aSig - bSig; 2804 zExp = aExp; 2805 normalizeRoundAndPack: 2806 --zExp; 2807 return normalizeRoundAndPackFloat64( zSign, zExp, zSig ); 2808 2809 } 2810 2811 /* 2812 ------------------------------------------------------------------------------- 2813 Returns the result of adding the double-precision floating-point values `a' 2814 and `b'. The operation is performed according to the IEC/IEEE Standard for 2815 Binary Floating-Point Arithmetic. 2816 ------------------------------------------------------------------------------- 2817 */ 2818 float64 float64_add( float64 a, float64 b ) 2819 { 2820 flag aSign, bSign; 2821 2822 aSign = extractFloat64Sign( a ); 2823 bSign = extractFloat64Sign( b ); 2824 if ( aSign == bSign ) { 2825 return addFloat64Sigs( a, b, aSign ); 2826 } 2827 else { 2828 return subFloat64Sigs( a, b, aSign ); 2829 } 2830 2831 } 2832 2833 /* 2834 ------------------------------------------------------------------------------- 2835 Returns the result of subtracting the double-precision floating-point values 2836 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2837 for Binary Floating-Point Arithmetic. 2838 ------------------------------------------------------------------------------- 2839 */ 2840 float64 float64_sub( float64 a, float64 b ) 2841 { 2842 flag aSign, bSign; 2843 2844 aSign = extractFloat64Sign( a ); 2845 bSign = extractFloat64Sign( b ); 2846 if ( aSign == bSign ) { 2847 return subFloat64Sigs( a, b, aSign ); 2848 } 2849 else { 2850 return addFloat64Sigs( a, b, aSign ); 2851 } 2852 2853 } 2854 2855 /* 2856 ------------------------------------------------------------------------------- 2857 Returns the result of multiplying the double-precision floating-point values 2858 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2859 for Binary Floating-Point Arithmetic. 2860 ------------------------------------------------------------------------------- 2861 */ 2862 float64 float64_mul( float64 a, float64 b ) 2863 { 2864 flag aSign, bSign, zSign; 2865 int16 aExp, bExp, zExp; 2866 bits64 aSig, bSig, zSig0, zSig1; 2867 2868 aSig = extractFloat64Frac( a ); 2869 aExp = extractFloat64Exp( a ); 2870 aSign = extractFloat64Sign( a ); 2871 bSig = extractFloat64Frac( b ); 2872 bExp = extractFloat64Exp( b ); 2873 bSign = extractFloat64Sign( b ); 2874 zSign = aSign ^ bSign; 2875 if ( aExp == 0x7FF ) { 2876 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 2877 return propagateFloat64NaN( a, b ); 2878 } 2879 if ( ( bExp | bSig ) == 0 ) { 2880 float_raise( float_flag_invalid ); 2881 return float64_default_nan; 2882 } 2883 return packFloat64( zSign, 0x7FF, 0 ); 2884 } 2885 if ( bExp == 0x7FF ) { 2886 if ( bSig ) return propagateFloat64NaN( a, b ); 2887 if ( ( aExp | aSig ) == 0 ) { 2888 float_raise( float_flag_invalid ); 2889 return float64_default_nan; 2890 } 2891 return packFloat64( zSign, 0x7FF, 0 ); 2892 } 2893 if ( aExp == 0 ) { 2894 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 2895 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2896 } 2897 if ( bExp == 0 ) { 2898 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 ); 2899 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2900 } 2901 zExp = aExp + bExp - 0x3FF; 2902 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 2903 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2904 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 2905 zSig0 |= ( zSig1 != 0 ); 2906 if ( 0 <= (sbits64) ( zSig0<<1 ) ) { 2907 zSig0 <<= 1; 2908 --zExp; 2909 } 2910 return roundAndPackFloat64( zSign, zExp, zSig0 ); 2911 2912 } 2913 2914 /* 2915 ------------------------------------------------------------------------------- 2916 Returns the result of dividing the double-precision floating-point value `a' 2917 by the corresponding value `b'. The operation is performed according to 2918 the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2919 ------------------------------------------------------------------------------- 2920 */ 2921 float64 float64_div( float64 a, float64 b ) 2922 { 2923 flag aSign, bSign, zSign; 2924 int16 aExp, bExp, zExp; 2925 bits64 aSig, bSig, zSig; 2926 bits64 rem0, rem1; 2927 bits64 term0, term1; 2928 2929 aSig = extractFloat64Frac( a ); 2930 aExp = extractFloat64Exp( a ); 2931 aSign = extractFloat64Sign( a ); 2932 bSig = extractFloat64Frac( b ); 2933 bExp = extractFloat64Exp( b ); 2934 bSign = extractFloat64Sign( b ); 2935 zSign = aSign ^ bSign; 2936 if ( aExp == 0x7FF ) { 2937 if ( aSig ) return propagateFloat64NaN( a, b ); 2938 if ( bExp == 0x7FF ) { 2939 if ( bSig ) return propagateFloat64NaN( a, b ); 2940 float_raise( float_flag_invalid ); 2941 return float64_default_nan; 2942 } 2943 return packFloat64( zSign, 0x7FF, 0 ); 2944 } 2945 if ( bExp == 0x7FF ) { 2946 if ( bSig ) return propagateFloat64NaN( a, b ); 2947 return packFloat64( zSign, 0, 0 ); 2948 } 2949 if ( bExp == 0 ) { 2950 if ( bSig == 0 ) { 2951 if ( ( aExp | aSig ) == 0 ) { 2952 float_raise( float_flag_invalid ); 2953 return float64_default_nan; 2954 } 2955 float_raise( float_flag_divbyzero ); 2956 return packFloat64( zSign, 0x7FF, 0 ); 2957 } 2958 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2959 } 2960 if ( aExp == 0 ) { 2961 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 2962 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2963 } 2964 zExp = aExp - bExp + 0x3FD; 2965 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 2966 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2967 if ( bSig <= ( aSig + aSig ) ) { 2968 aSig >>= 1; 2969 ++zExp; 2970 } 2971 zSig = estimateDiv128To64( aSig, 0, bSig ); 2972 if ( ( zSig & 0x1FF ) <= 2 ) { 2973 mul64To128( bSig, zSig, &term0, &term1 ); 2974 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 2975 while ( (sbits64) rem0 < 0 ) { 2976 --zSig; 2977 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 2978 } 2979 zSig |= ( rem1 != 0 ); 2980 } 2981 return roundAndPackFloat64( zSign, zExp, zSig ); 2982 2983 } 2984 2985 #ifndef SOFTFLOAT_FOR_GCC 2986 /* 2987 ------------------------------------------------------------------------------- 2988 Returns the remainder of the double-precision floating-point value `a' 2989 with respect to the corresponding value `b'. The operation is performed 2990 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2991 ------------------------------------------------------------------------------- 2992 */ 2993 float64 float64_rem( float64 a, float64 b ) 2994 { 2995 flag aSign, bSign, zSign; 2996 int16 aExp, bExp, expDiff; 2997 bits64 aSig, bSig; 2998 bits64 q, alternateASig; 2999 sbits64 sigMean; 3000 3001 aSig = extractFloat64Frac( a ); 3002 aExp = extractFloat64Exp( a ); 3003 aSign = extractFloat64Sign( a ); 3004 bSig = extractFloat64Frac( b ); 3005 bExp = extractFloat64Exp( b ); 3006 bSign = extractFloat64Sign( b ); 3007 if ( aExp == 0x7FF ) { 3008 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 3009 return propagateFloat64NaN( a, b ); 3010 } 3011 float_raise( float_flag_invalid ); 3012 return float64_default_nan; 3013 } 3014 if ( bExp == 0x7FF ) { 3015 if ( bSig ) return propagateFloat64NaN( a, b ); 3016 return a; 3017 } 3018 if ( bExp == 0 ) { 3019 if ( bSig == 0 ) { 3020 float_raise( float_flag_invalid ); 3021 return float64_default_nan; 3022 } 3023 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 3024 } 3025 if ( aExp == 0 ) { 3026 if ( aSig == 0 ) return a; 3027 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3028 } 3029 expDiff = aExp - bExp; 3030 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 3031 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 3032 if ( expDiff < 0 ) { 3033 if ( expDiff < -1 ) return a; 3034 aSig >>= 1; 3035 } 3036 q = ( bSig <= aSig ); 3037 if ( q ) aSig -= bSig; 3038 expDiff -= 64; 3039 while ( 0 < expDiff ) { 3040 q = estimateDiv128To64( aSig, 0, bSig ); 3041 q = ( 2 < q ) ? q - 2 : 0; 3042 aSig = - ( ( bSig>>2 ) * q ); 3043 expDiff -= 62; 3044 } 3045 expDiff += 64; 3046 if ( 0 < expDiff ) { 3047 q = estimateDiv128To64( aSig, 0, bSig ); 3048 q = ( 2 < q ) ? q - 2 : 0; 3049 q >>= 64 - expDiff; 3050 bSig >>= 2; 3051 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 3052 } 3053 else { 3054 aSig >>= 2; 3055 bSig >>= 2; 3056 } 3057 do { 3058 alternateASig = aSig; 3059 ++q; 3060 aSig -= bSig; 3061 } while ( 0 <= (sbits64) aSig ); 3062 sigMean = aSig + alternateASig; 3063 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 3064 aSig = alternateASig; 3065 } 3066 zSign = ( (sbits64) aSig < 0 ); 3067 if ( zSign ) aSig = - aSig; 3068 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig ); 3069 3070 } 3071 3072 /* 3073 ------------------------------------------------------------------------------- 3074 Returns the square root of the double-precision floating-point value `a'. 3075 The operation is performed according to the IEC/IEEE Standard for Binary 3076 Floating-Point Arithmetic. 3077 ------------------------------------------------------------------------------- 3078 */ 3079 float64 float64_sqrt( float64 a ) 3080 { 3081 flag aSign; 3082 int16 aExp, zExp; 3083 bits64 aSig, zSig, doubleZSig; 3084 bits64 rem0, rem1, term0, term1; 3085 3086 aSig = extractFloat64Frac( a ); 3087 aExp = extractFloat64Exp( a ); 3088 aSign = extractFloat64Sign( a ); 3089 if ( aExp == 0x7FF ) { 3090 if ( aSig ) return propagateFloat64NaN( a, a ); 3091 if ( ! aSign ) return a; 3092 float_raise( float_flag_invalid ); 3093 return float64_default_nan; 3094 } 3095 if ( aSign ) { 3096 if ( ( aExp | aSig ) == 0 ) return a; 3097 float_raise( float_flag_invalid ); 3098 return float64_default_nan; 3099 } 3100 if ( aExp == 0 ) { 3101 if ( aSig == 0 ) return 0; 3102 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3103 } 3104 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; 3105 aSig |= LIT64( 0x0010000000000000 ); 3106 zSig = estimateSqrt32( aExp, aSig>>21 ); 3107 aSig <<= 9 - ( aExp & 1 ); 3108 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 ); 3109 if ( ( zSig & 0x1FF ) <= 5 ) { 3110 doubleZSig = zSig<<1; 3111 mul64To128( zSig, zSig, &term0, &term1 ); 3112 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 3113 while ( (sbits64) rem0 < 0 ) { 3114 --zSig; 3115 doubleZSig -= 2; 3116 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 ); 3117 } 3118 zSig |= ( ( rem0 | rem1 ) != 0 ); 3119 } 3120 return roundAndPackFloat64( 0, zExp, zSig ); 3121 3122 } 3123 #endif 3124 3125 /* 3126 ------------------------------------------------------------------------------- 3127 Returns 1 if the double-precision floating-point value `a' is equal to the 3128 corresponding value `b', and 0 otherwise. The comparison is performed 3129 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3130 ------------------------------------------------------------------------------- 3131 */ 3132 flag float64_eq( float64 a, float64 b ) 3133 { 3134 3135 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3136 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3137 ) { 3138 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3139 float_raise( float_flag_invalid ); 3140 } 3141 return 0; 3142 } 3143 return ( a == b ) || 3144 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 ); 3145 3146 } 3147 3148 /* 3149 ------------------------------------------------------------------------------- 3150 Returns 1 if the double-precision floating-point value `a' is less than or 3151 equal to the corresponding value `b', and 0 otherwise. The comparison is 3152 performed according to the IEC/IEEE Standard for Binary Floating-Point 3153 Arithmetic. 3154 ------------------------------------------------------------------------------- 3155 */ 3156 flag float64_le( float64 a, float64 b ) 3157 { 3158 flag aSign, bSign; 3159 3160 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3161 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3162 ) { 3163 float_raise( float_flag_invalid ); 3164 return 0; 3165 } 3166 aSign = extractFloat64Sign( a ); 3167 bSign = extractFloat64Sign( b ); 3168 if ( aSign != bSign ) 3169 return aSign || 3170 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 3171 0 ); 3172 return ( a == b ) || 3173 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) ); 3174 3175 } 3176 3177 /* 3178 ------------------------------------------------------------------------------- 3179 Returns 1 if the double-precision floating-point value `a' is less than 3180 the corresponding value `b', and 0 otherwise. The comparison is performed 3181 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3182 ------------------------------------------------------------------------------- 3183 */ 3184 flag float64_lt( float64 a, float64 b ) 3185 { 3186 flag aSign, bSign; 3187 3188 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3189 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3190 ) { 3191 float_raise( float_flag_invalid ); 3192 return 0; 3193 } 3194 aSign = extractFloat64Sign( a ); 3195 bSign = extractFloat64Sign( b ); 3196 if ( aSign != bSign ) 3197 return aSign && 3198 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) != 3199 0 ); 3200 return ( a != b ) && 3201 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) ); 3202 3203 } 3204 3205 #ifndef SOFTFLOAT_FOR_GCC 3206 /* 3207 ------------------------------------------------------------------------------- 3208 Returns 1 if the double-precision floating-point value `a' is equal to the 3209 corresponding value `b', and 0 otherwise. The invalid exception is raised 3210 if either operand is a NaN. Otherwise, the comparison is performed 3211 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3212 ------------------------------------------------------------------------------- 3213 */ 3214 flag float64_eq_signaling( float64 a, float64 b ) 3215 { 3216 3217 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3218 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3219 ) { 3220 float_raise( float_flag_invalid ); 3221 return 0; 3222 } 3223 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 ); 3224 3225 } 3226 3227 /* 3228 ------------------------------------------------------------------------------- 3229 Returns 1 if the double-precision floating-point value `a' is less than or 3230 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 3231 cause an exception. Otherwise, the comparison is performed according to the 3232 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3233 ------------------------------------------------------------------------------- 3234 */ 3235 flag float64_le_quiet( float64 a, float64 b ) 3236 { 3237 flag aSign, bSign; 3238 3239 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3240 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3241 ) { 3242 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3243 float_raise( float_flag_invalid ); 3244 } 3245 return 0; 3246 } 3247 aSign = extractFloat64Sign( a ); 3248 bSign = extractFloat64Sign( b ); 3249 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 ); 3250 return ( a == b ) || ( aSign ^ ( a < b ) ); 3251 3252 } 3253 3254 /* 3255 ------------------------------------------------------------------------------- 3256 Returns 1 if the double-precision floating-point value `a' is less than 3257 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3258 exception. Otherwise, the comparison is performed according to the IEC/IEEE 3259 Standard for Binary Floating-Point Arithmetic. 3260 ------------------------------------------------------------------------------- 3261 */ 3262 flag float64_lt_quiet( float64 a, float64 b ) 3263 { 3264 flag aSign, bSign; 3265 3266 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3267 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3268 ) { 3269 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3270 float_raise( float_flag_invalid ); 3271 } 3272 return 0; 3273 } 3274 aSign = extractFloat64Sign( a ); 3275 bSign = extractFloat64Sign( b ); 3276 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 ); 3277 return ( a != b ) && ( aSign ^ ( a < b ) ); 3278 3279 } 3280 #endif 3281 3282 #ifdef FLOATX80 3283 3284 /* 3285 ------------------------------------------------------------------------------- 3286 Returns the result of converting the extended double-precision floating- 3287 point value `a' to the 32-bit two's complement integer format. The 3288 conversion is performed according to the IEC/IEEE Standard for Binary 3289 Floating-Point Arithmetic---which means in particular that the conversion 3290 is rounded according to the current rounding mode. If `a' is a NaN, the 3291 largest positive integer is returned. Otherwise, if the conversion 3292 overflows, the largest integer with the same sign as `a' is returned. 3293 ------------------------------------------------------------------------------- 3294 */ 3295 int32 floatx80_to_int32( floatx80 a ) 3296 { 3297 flag aSign; 3298 int32 aExp, shiftCount; 3299 bits64 aSig; 3300 3301 aSig = extractFloatx80Frac( a ); 3302 aExp = extractFloatx80Exp( a ); 3303 aSign = extractFloatx80Sign( a ); 3304 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; 3305 shiftCount = 0x4037 - aExp; 3306 if ( shiftCount <= 0 ) shiftCount = 1; 3307 shift64RightJamming( aSig, shiftCount, &aSig ); 3308 return roundAndPackInt32( aSign, aSig ); 3309 3310 } 3311 3312 /* 3313 ------------------------------------------------------------------------------- 3314 Returns the result of converting the extended double-precision floating- 3315 point value `a' to the 32-bit two's complement integer format. The 3316 conversion is performed according to the IEC/IEEE Standard for Binary 3317 Floating-Point Arithmetic, except that the conversion is always rounded 3318 toward zero. If `a' is a NaN, the largest positive integer is returned. 3319 Otherwise, if the conversion overflows, the largest integer with the same 3320 sign as `a' is returned. 3321 ------------------------------------------------------------------------------- 3322 */ 3323 int32 floatx80_to_int32_round_to_zero( floatx80 a ) 3324 { 3325 flag aSign; 3326 int32 aExp, shiftCount; 3327 bits64 aSig, savedASig; 3328 int32 z; 3329 3330 aSig = extractFloatx80Frac( a ); 3331 aExp = extractFloatx80Exp( a ); 3332 aSign = extractFloatx80Sign( a ); 3333 if ( 0x401E < aExp ) { 3334 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; 3335 goto invalid; 3336 } 3337 else if ( aExp < 0x3FFF ) { 3338 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact; 3339 return 0; 3340 } 3341 shiftCount = 0x403E - aExp; 3342 savedASig = aSig; 3343 aSig >>= shiftCount; 3344 z = aSig; 3345 if ( aSign ) z = - z; 3346 if ( ( z < 0 ) ^ aSign ) { 3347 invalid: 3348 float_raise( float_flag_invalid ); 3349 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 3350 } 3351 if ( ( aSig<<shiftCount ) != savedASig ) { 3352 float_exception_flags |= float_flag_inexact; 3353 } 3354 return z; 3355 3356 } 3357 3358 /* 3359 ------------------------------------------------------------------------------- 3360 Returns the result of converting the extended double-precision floating- 3361 point value `a' to the 64-bit two's complement integer format. The 3362 conversion is performed according to the IEC/IEEE Standard for Binary 3363 Floating-Point Arithmetic---which means in particular that the conversion 3364 is rounded according to the current rounding mode. If `a' is a NaN, 3365 the largest positive integer is returned. Otherwise, if the conversion 3366 overflows, the largest integer with the same sign as `a' is returned. 3367 ------------------------------------------------------------------------------- 3368 */ 3369 int64 floatx80_to_int64( floatx80 a ) 3370 { 3371 flag aSign; 3372 int32 aExp, shiftCount; 3373 bits64 aSig, aSigExtra; 3374 3375 aSig = extractFloatx80Frac( a ); 3376 aExp = extractFloatx80Exp( a ); 3377 aSign = extractFloatx80Sign( a ); 3378 shiftCount = 0x403E - aExp; 3379 if ( shiftCount <= 0 ) { 3380 if ( shiftCount ) { 3381 float_raise( float_flag_invalid ); 3382 if ( ! aSign 3383 || ( ( aExp == 0x7FFF ) 3384 && ( aSig != LIT64( 0x8000000000000000 ) ) ) 3385 ) { 3386 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3387 } 3388 return (sbits64) LIT64( 0x8000000000000000 ); 3389 } 3390 aSigExtra = 0; 3391 } 3392 else { 3393 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 3394 } 3395 return roundAndPackInt64( aSign, aSig, aSigExtra ); 3396 3397 } 3398 3399 /* 3400 ------------------------------------------------------------------------------- 3401 Returns the result of converting the extended double-precision floating- 3402 point value `a' to the 64-bit two's complement integer format. The 3403 conversion is performed according to the IEC/IEEE Standard for Binary 3404 Floating-Point Arithmetic, except that the conversion is always rounded 3405 toward zero. If `a' is a NaN, the largest positive integer is returned. 3406 Otherwise, if the conversion overflows, the largest integer with the same 3407 sign as `a' is returned. 3408 ------------------------------------------------------------------------------- 3409 */ 3410 int64 floatx80_to_int64_round_to_zero( floatx80 a ) 3411 { 3412 flag aSign; 3413 int32 aExp, shiftCount; 3414 bits64 aSig; 3415 int64 z; 3416 3417 aSig = extractFloatx80Frac( a ); 3418 aExp = extractFloatx80Exp( a ); 3419 aSign = extractFloatx80Sign( a ); 3420 shiftCount = aExp - 0x403E; 3421 if ( 0 <= shiftCount ) { 3422 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 3423 if ( ( a.high != 0xC03E ) || aSig ) { 3424 float_raise( float_flag_invalid ); 3425 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 3426 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3427 } 3428 } 3429 return (sbits64) LIT64( 0x8000000000000000 ); 3430 } 3431 else if ( aExp < 0x3FFF ) { 3432 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact; 3433 return 0; 3434 } 3435 z = aSig>>( - shiftCount ); 3436 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { 3437 float_exception_flags |= float_flag_inexact; 3438 } 3439 if ( aSign ) z = - z; 3440 return z; 3441 3442 } 3443 3444 /* 3445 ------------------------------------------------------------------------------- 3446 Returns the result of converting the extended double-precision floating- 3447 point value `a' to the single-precision floating-point format. The 3448 conversion is performed according to the IEC/IEEE Standard for Binary 3449 Floating-Point Arithmetic. 3450 ------------------------------------------------------------------------------- 3451 */ 3452 float32 floatx80_to_float32( floatx80 a ) 3453 { 3454 flag aSign; 3455 int32 aExp; 3456 bits64 aSig; 3457 3458 aSig = extractFloatx80Frac( a ); 3459 aExp = extractFloatx80Exp( a ); 3460 aSign = extractFloatx80Sign( a ); 3461 if ( aExp == 0x7FFF ) { 3462 if ( (bits64) ( aSig<<1 ) ) { 3463 return commonNaNToFloat32( floatx80ToCommonNaN( a ) ); 3464 } 3465 return packFloat32( aSign, 0xFF, 0 ); 3466 } 3467 shift64RightJamming( aSig, 33, &aSig ); 3468 if ( aExp || aSig ) aExp -= 0x3F81; 3469 return roundAndPackFloat32( aSign, aExp, aSig ); 3470 3471 } 3472 3473 /* 3474 ------------------------------------------------------------------------------- 3475 Returns the result of converting the extended double-precision floating- 3476 point value `a' to the double-precision floating-point format. The 3477 conversion is performed according to the IEC/IEEE Standard for Binary 3478 Floating-Point Arithmetic. 3479 ------------------------------------------------------------------------------- 3480 */ 3481 float64 floatx80_to_float64( floatx80 a ) 3482 { 3483 flag aSign; 3484 int32 aExp; 3485 bits64 aSig, zSig; 3486 3487 aSig = extractFloatx80Frac( a ); 3488 aExp = extractFloatx80Exp( a ); 3489 aSign = extractFloatx80Sign( a ); 3490 if ( aExp == 0x7FFF ) { 3491 if ( (bits64) ( aSig<<1 ) ) { 3492 return commonNaNToFloat64( floatx80ToCommonNaN( a ) ); 3493 } 3494 return packFloat64( aSign, 0x7FF, 0 ); 3495 } 3496 shift64RightJamming( aSig, 1, &zSig ); 3497 if ( aExp || aSig ) aExp -= 0x3C01; 3498 return roundAndPackFloat64( aSign, aExp, zSig ); 3499 3500 } 3501 3502 #ifdef FLOAT128 3503 3504 /* 3505 ------------------------------------------------------------------------------- 3506 Returns the result of converting the extended double-precision floating- 3507 point value `a' to the quadruple-precision floating-point format. The 3508 conversion is performed according to the IEC/IEEE Standard for Binary 3509 Floating-Point Arithmetic. 3510 ------------------------------------------------------------------------------- 3511 */ 3512 float128 floatx80_to_float128( floatx80 a ) 3513 { 3514 flag aSign; 3515 int16 aExp; 3516 bits64 aSig, zSig0, zSig1; 3517 3518 aSig = extractFloatx80Frac( a ); 3519 aExp = extractFloatx80Exp( a ); 3520 aSign = extractFloatx80Sign( a ); 3521 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) { 3522 return commonNaNToFloat128( floatx80ToCommonNaN( a ) ); 3523 } 3524 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 3525 return packFloat128( aSign, aExp, zSig0, zSig1 ); 3526 3527 } 3528 3529 #endif 3530 3531 /* 3532 ------------------------------------------------------------------------------- 3533 Rounds the extended double-precision floating-point value `a' to an integer, 3534 and returns the result as an extended quadruple-precision floating-point 3535 value. The operation is performed according to the IEC/IEEE Standard for 3536 Binary Floating-Point Arithmetic. 3537 ------------------------------------------------------------------------------- 3538 */ 3539 floatx80 floatx80_round_to_int( floatx80 a ) 3540 { 3541 flag aSign; 3542 int32 aExp; 3543 bits64 lastBitMask, roundBitsMask; 3544 int8 roundingMode; 3545 floatx80 z; 3546 3547 aExp = extractFloatx80Exp( a ); 3548 if ( 0x403E <= aExp ) { 3549 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) { 3550 return propagateFloatx80NaN( a, a ); 3551 } 3552 return a; 3553 } 3554 if ( aExp < 0x3FFF ) { 3555 if ( ( aExp == 0 ) 3556 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 3557 return a; 3558 } 3559 float_exception_flags |= float_flag_inexact; 3560 aSign = extractFloatx80Sign( a ); 3561 switch ( float_rounding_mode ) { 3562 case float_round_nearest_even: 3563 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 ) 3564 ) { 3565 return 3566 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 3567 } 3568 break; 3569 case float_round_to_zero: 3570 break; 3571 case float_round_down: 3572 return 3573 aSign ? 3574 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 3575 : packFloatx80( 0, 0, 0 ); 3576 case float_round_up: 3577 return 3578 aSign ? packFloatx80( 1, 0, 0 ) 3579 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 3580 } 3581 return packFloatx80( aSign, 0, 0 ); 3582 } 3583 lastBitMask = 1; 3584 lastBitMask <<= 0x403E - aExp; 3585 roundBitsMask = lastBitMask - 1; 3586 z = a; 3587 roundingMode = float_rounding_mode; 3588 if ( roundingMode == float_round_nearest_even ) { 3589 z.low += lastBitMask>>1; 3590 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 3591 } 3592 else if ( roundingMode != float_round_to_zero ) { 3593 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) { 3594 z.low += roundBitsMask; 3595 } 3596 } 3597 z.low &= ~ roundBitsMask; 3598 if ( z.low == 0 ) { 3599 ++z.high; 3600 z.low = LIT64( 0x8000000000000000 ); 3601 } 3602 if ( z.low != a.low ) float_exception_flags |= float_flag_inexact; 3603 return z; 3604 3605 } 3606 3607 /* 3608 ------------------------------------------------------------------------------- 3609 Returns the result of adding the absolute values of the extended double- 3610 precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 3611 negated before being returned. `zSign' is ignored if the result is a NaN. 3612 The addition is performed according to the IEC/IEEE Standard for Binary 3613 Floating-Point Arithmetic. 3614 ------------------------------------------------------------------------------- 3615 */ 3616 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign ) 3617 { 3618 int32 aExp, bExp, zExp; 3619 bits64 aSig, bSig, zSig0, zSig1; 3620 int32 expDiff; 3621 3622 aSig = extractFloatx80Frac( a ); 3623 aExp = extractFloatx80Exp( a ); 3624 bSig = extractFloatx80Frac( b ); 3625 bExp = extractFloatx80Exp( b ); 3626 expDiff = aExp - bExp; 3627 if ( 0 < expDiff ) { 3628 if ( aExp == 0x7FFF ) { 3629 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3630 return a; 3631 } 3632 if ( bExp == 0 ) --expDiff; 3633 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 3634 zExp = aExp; 3635 } 3636 else if ( expDiff < 0 ) { 3637 if ( bExp == 0x7FFF ) { 3638 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3639 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3640 } 3641 if ( aExp == 0 ) ++expDiff; 3642 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 3643 zExp = bExp; 3644 } 3645 else { 3646 if ( aExp == 0x7FFF ) { 3647 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { 3648 return propagateFloatx80NaN( a, b ); 3649 } 3650 return a; 3651 } 3652 zSig1 = 0; 3653 zSig0 = aSig + bSig; 3654 if ( aExp == 0 ) { 3655 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 3656 goto roundAndPack; 3657 } 3658 zExp = aExp; 3659 goto shiftRight1; 3660 } 3661 zSig0 = aSig + bSig; 3662 if ( (sbits64) zSig0 < 0 ) goto roundAndPack; 3663 shiftRight1: 3664 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 3665 zSig0 |= LIT64( 0x8000000000000000 ); 3666 ++zExp; 3667 roundAndPack: 3668 return 3669 roundAndPackFloatx80( 3670 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3671 3672 } 3673 3674 /* 3675 ------------------------------------------------------------------------------- 3676 Returns the result of subtracting the absolute values of the extended 3677 double-precision floating-point values `a' and `b'. If `zSign' is 1, the 3678 difference is negated before being returned. `zSign' is ignored if the 3679 result is a NaN. The subtraction is performed according to the IEC/IEEE 3680 Standard for Binary Floating-Point Arithmetic. 3681 ------------------------------------------------------------------------------- 3682 */ 3683 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign ) 3684 { 3685 int32 aExp, bExp, zExp; 3686 bits64 aSig, bSig, zSig0, zSig1; 3687 int32 expDiff; 3688 floatx80 z; 3689 3690 aSig = extractFloatx80Frac( a ); 3691 aExp = extractFloatx80Exp( a ); 3692 bSig = extractFloatx80Frac( b ); 3693 bExp = extractFloatx80Exp( b ); 3694 expDiff = aExp - bExp; 3695 if ( 0 < expDiff ) goto aExpBigger; 3696 if ( expDiff < 0 ) goto bExpBigger; 3697 if ( aExp == 0x7FFF ) { 3698 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { 3699 return propagateFloatx80NaN( a, b ); 3700 } 3701 float_raise( float_flag_invalid ); 3702 z.low = floatx80_default_nan_low; 3703 z.high = floatx80_default_nan_high; 3704 return z; 3705 } 3706 if ( aExp == 0 ) { 3707 aExp = 1; 3708 bExp = 1; 3709 } 3710 zSig1 = 0; 3711 if ( bSig < aSig ) goto aBigger; 3712 if ( aSig < bSig ) goto bBigger; 3713 return packFloatx80( float_rounding_mode == float_round_down, 0, 0 ); 3714 bExpBigger: 3715 if ( bExp == 0x7FFF ) { 3716 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3717 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3718 } 3719 if ( aExp == 0 ) ++expDiff; 3720 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 3721 bBigger: 3722 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 3723 zExp = bExp; 3724 zSign ^= 1; 3725 goto normalizeRoundAndPack; 3726 aExpBigger: 3727 if ( aExp == 0x7FFF ) { 3728 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3729 return a; 3730 } 3731 if ( bExp == 0 ) --expDiff; 3732 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 3733 aBigger: 3734 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 3735 zExp = aExp; 3736 normalizeRoundAndPack: 3737 return 3738 normalizeRoundAndPackFloatx80( 3739 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3740 3741 } 3742 3743 /* 3744 ------------------------------------------------------------------------------- 3745 Returns the result of adding the extended double-precision floating-point 3746 values `a' and `b'. The operation is performed according to the IEC/IEEE 3747 Standard for Binary Floating-Point Arithmetic. 3748 ------------------------------------------------------------------------------- 3749 */ 3750 floatx80 floatx80_add( floatx80 a, floatx80 b ) 3751 { 3752 flag aSign, bSign; 3753 3754 aSign = extractFloatx80Sign( a ); 3755 bSign = extractFloatx80Sign( b ); 3756 if ( aSign == bSign ) { 3757 return addFloatx80Sigs( a, b, aSign ); 3758 } 3759 else { 3760 return subFloatx80Sigs( a, b, aSign ); 3761 } 3762 3763 } 3764 3765 /* 3766 ------------------------------------------------------------------------------- 3767 Returns the result of subtracting the extended double-precision floating- 3768 point values `a' and `b'. The operation is performed according to the 3769 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3770 ------------------------------------------------------------------------------- 3771 */ 3772 floatx80 floatx80_sub( floatx80 a, floatx80 b ) 3773 { 3774 flag aSign, bSign; 3775 3776 aSign = extractFloatx80Sign( a ); 3777 bSign = extractFloatx80Sign( b ); 3778 if ( aSign == bSign ) { 3779 return subFloatx80Sigs( a, b, aSign ); 3780 } 3781 else { 3782 return addFloatx80Sigs( a, b, aSign ); 3783 } 3784 3785 } 3786 3787 /* 3788 ------------------------------------------------------------------------------- 3789 Returns the result of multiplying the extended double-precision floating- 3790 point values `a' and `b'. The operation is performed according to the 3791 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3792 ------------------------------------------------------------------------------- 3793 */ 3794 floatx80 floatx80_mul( floatx80 a, floatx80 b ) 3795 { 3796 flag aSign, bSign, zSign; 3797 int32 aExp, bExp, zExp; 3798 bits64 aSig, bSig, zSig0, zSig1; 3799 floatx80 z; 3800 3801 aSig = extractFloatx80Frac( a ); 3802 aExp = extractFloatx80Exp( a ); 3803 aSign = extractFloatx80Sign( a ); 3804 bSig = extractFloatx80Frac( b ); 3805 bExp = extractFloatx80Exp( b ); 3806 bSign = extractFloatx80Sign( b ); 3807 zSign = aSign ^ bSign; 3808 if ( aExp == 0x7FFF ) { 3809 if ( (bits64) ( aSig<<1 ) 3810 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { 3811 return propagateFloatx80NaN( a, b ); 3812 } 3813 if ( ( bExp | bSig ) == 0 ) goto invalid; 3814 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3815 } 3816 if ( bExp == 0x7FFF ) { 3817 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3818 if ( ( aExp | aSig ) == 0 ) { 3819 invalid: 3820 float_raise( float_flag_invalid ); 3821 z.low = floatx80_default_nan_low; 3822 z.high = floatx80_default_nan_high; 3823 return z; 3824 } 3825 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3826 } 3827 if ( aExp == 0 ) { 3828 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3829 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 3830 } 3831 if ( bExp == 0 ) { 3832 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3833 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3834 } 3835 zExp = aExp + bExp - 0x3FFE; 3836 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 3837 if ( 0 < (sbits64) zSig0 ) { 3838 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 3839 --zExp; 3840 } 3841 return 3842 roundAndPackFloatx80( 3843 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3844 3845 } 3846 3847 /* 3848 ------------------------------------------------------------------------------- 3849 Returns the result of dividing the extended double-precision floating-point 3850 value `a' by the corresponding value `b'. The operation is performed 3851 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3852 ------------------------------------------------------------------------------- 3853 */ 3854 floatx80 floatx80_div( floatx80 a, floatx80 b ) 3855 { 3856 flag aSign, bSign, zSign; 3857 int32 aExp, bExp, zExp; 3858 bits64 aSig, bSig, zSig0, zSig1; 3859 bits64 rem0, rem1, rem2, term0, term1, term2; 3860 floatx80 z; 3861 3862 aSig = extractFloatx80Frac( a ); 3863 aExp = extractFloatx80Exp( a ); 3864 aSign = extractFloatx80Sign( a ); 3865 bSig = extractFloatx80Frac( b ); 3866 bExp = extractFloatx80Exp( b ); 3867 bSign = extractFloatx80Sign( b ); 3868 zSign = aSign ^ bSign; 3869 if ( aExp == 0x7FFF ) { 3870 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3871 if ( bExp == 0x7FFF ) { 3872 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3873 goto invalid; 3874 } 3875 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3876 } 3877 if ( bExp == 0x7FFF ) { 3878 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3879 return packFloatx80( zSign, 0, 0 ); 3880 } 3881 if ( bExp == 0 ) { 3882 if ( bSig == 0 ) { 3883 if ( ( aExp | aSig ) == 0 ) { 3884 invalid: 3885 float_raise( float_flag_invalid ); 3886 z.low = floatx80_default_nan_low; 3887 z.high = floatx80_default_nan_high; 3888 return z; 3889 } 3890 float_raise( float_flag_divbyzero ); 3891 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3892 } 3893 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3894 } 3895 if ( aExp == 0 ) { 3896 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3897 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 3898 } 3899 zExp = aExp - bExp + 0x3FFE; 3900 rem1 = 0; 3901 if ( bSig <= aSig ) { 3902 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 3903 ++zExp; 3904 } 3905 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 3906 mul64To128( bSig, zSig0, &term0, &term1 ); 3907 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 3908 while ( (sbits64) rem0 < 0 ) { 3909 --zSig0; 3910 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 3911 } 3912 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 3913 if ( (bits64) ( zSig1<<1 ) <= 8 ) { 3914 mul64To128( bSig, zSig1, &term1, &term2 ); 3915 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 3916 while ( (sbits64) rem1 < 0 ) { 3917 --zSig1; 3918 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 3919 } 3920 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 3921 } 3922 return 3923 roundAndPackFloatx80( 3924 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3925 3926 } 3927 3928 /* 3929 ------------------------------------------------------------------------------- 3930 Returns the remainder of the extended double-precision floating-point value 3931 `a' with respect to the corresponding value `b'. The operation is performed 3932 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3933 ------------------------------------------------------------------------------- 3934 */ 3935 floatx80 floatx80_rem( floatx80 a, floatx80 b ) 3936 { 3937 flag aSign, bSign, zSign; 3938 int32 aExp, bExp, expDiff; 3939 bits64 aSig0, aSig1, bSig; 3940 bits64 q, term0, term1, alternateASig0, alternateASig1; 3941 floatx80 z; 3942 3943 aSig0 = extractFloatx80Frac( a ); 3944 aExp = extractFloatx80Exp( a ); 3945 aSign = extractFloatx80Sign( a ); 3946 bSig = extractFloatx80Frac( b ); 3947 bExp = extractFloatx80Exp( b ); 3948 bSign = extractFloatx80Sign( b ); 3949 if ( aExp == 0x7FFF ) { 3950 if ( (bits64) ( aSig0<<1 ) 3951 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { 3952 return propagateFloatx80NaN( a, b ); 3953 } 3954 goto invalid; 3955 } 3956 if ( bExp == 0x7FFF ) { 3957 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3958 return a; 3959 } 3960 if ( bExp == 0 ) { 3961 if ( bSig == 0 ) { 3962 invalid: 3963 float_raise( float_flag_invalid ); 3964 z.low = floatx80_default_nan_low; 3965 z.high = floatx80_default_nan_high; 3966 return z; 3967 } 3968 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3969 } 3970 if ( aExp == 0 ) { 3971 if ( (bits64) ( aSig0<<1 ) == 0 ) return a; 3972 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 3973 } 3974 bSig |= LIT64( 0x8000000000000000 ); 3975 zSign = aSign; 3976 expDiff = aExp - bExp; 3977 aSig1 = 0; 3978 if ( expDiff < 0 ) { 3979 if ( expDiff < -1 ) return a; 3980 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 3981 expDiff = 0; 3982 } 3983 q = ( bSig <= aSig0 ); 3984 if ( q ) aSig0 -= bSig; 3985 expDiff -= 64; 3986 while ( 0 < expDiff ) { 3987 q = estimateDiv128To64( aSig0, aSig1, bSig ); 3988 q = ( 2 < q ) ? q - 2 : 0; 3989 mul64To128( bSig, q, &term0, &term1 ); 3990 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 3991 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 3992 expDiff -= 62; 3993 } 3994 expDiff += 64; 3995 if ( 0 < expDiff ) { 3996 q = estimateDiv128To64( aSig0, aSig1, bSig ); 3997 q = ( 2 < q ) ? q - 2 : 0; 3998 q >>= 64 - expDiff; 3999 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 4000 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 4001 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 4002 while ( le128( term0, term1, aSig0, aSig1 ) ) { 4003 ++q; 4004 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 4005 } 4006 } 4007 else { 4008 term1 = 0; 4009 term0 = bSig; 4010 } 4011 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 4012 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 4013 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 4014 && ( q & 1 ) ) 4015 ) { 4016 aSig0 = alternateASig0; 4017 aSig1 = alternateASig1; 4018 zSign = ! zSign; 4019 } 4020 return 4021 normalizeRoundAndPackFloatx80( 4022 80, zSign, bExp + expDiff, aSig0, aSig1 ); 4023 4024 } 4025 4026 /* 4027 ------------------------------------------------------------------------------- 4028 Returns the square root of the extended double-precision floating-point 4029 value `a'. The operation is performed according to the IEC/IEEE Standard 4030 for Binary Floating-Point Arithmetic. 4031 ------------------------------------------------------------------------------- 4032 */ 4033 floatx80 floatx80_sqrt( floatx80 a ) 4034 { 4035 flag aSign; 4036 int32 aExp, zExp; 4037 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0; 4038 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 4039 floatx80 z; 4040 4041 aSig0 = extractFloatx80Frac( a ); 4042 aExp = extractFloatx80Exp( a ); 4043 aSign = extractFloatx80Sign( a ); 4044 if ( aExp == 0x7FFF ) { 4045 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a ); 4046 if ( ! aSign ) return a; 4047 goto invalid; 4048 } 4049 if ( aSign ) { 4050 if ( ( aExp | aSig0 ) == 0 ) return a; 4051 invalid: 4052 float_raise( float_flag_invalid ); 4053 z.low = floatx80_default_nan_low; 4054 z.high = floatx80_default_nan_high; 4055 return z; 4056 } 4057 if ( aExp == 0 ) { 4058 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 4059 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 4060 } 4061 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 4062 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 4063 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 4064 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 4065 doubleZSig0 = zSig0<<1; 4066 mul64To128( zSig0, zSig0, &term0, &term1 ); 4067 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 4068 while ( (sbits64) rem0 < 0 ) { 4069 --zSig0; 4070 doubleZSig0 -= 2; 4071 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 4072 } 4073 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 4074 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 4075 if ( zSig1 == 0 ) zSig1 = 1; 4076 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 4077 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 4078 mul64To128( zSig1, zSig1, &term2, &term3 ); 4079 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 4080 while ( (sbits64) rem1 < 0 ) { 4081 --zSig1; 4082 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 4083 term3 |= 1; 4084 term2 |= doubleZSig0; 4085 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 4086 } 4087 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 4088 } 4089 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 4090 zSig0 |= doubleZSig0; 4091 return 4092 roundAndPackFloatx80( 4093 floatx80_rounding_precision, 0, zExp, zSig0, zSig1 ); 4094 4095 } 4096 4097 /* 4098 ------------------------------------------------------------------------------- 4099 Returns 1 if the extended double-precision floating-point value `a' is 4100 equal to the corresponding value `b', and 0 otherwise. The comparison is 4101 performed according to the IEC/IEEE Standard for Binary Floating-Point 4102 Arithmetic. 4103 ------------------------------------------------------------------------------- 4104 */ 4105 flag floatx80_eq( floatx80 a, floatx80 b ) 4106 { 4107 4108 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4109 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4110 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4111 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4112 ) { 4113 if ( floatx80_is_signaling_nan( a ) 4114 || floatx80_is_signaling_nan( b ) ) { 4115 float_raise( float_flag_invalid ); 4116 } 4117 return 0; 4118 } 4119 return 4120 ( a.low == b.low ) 4121 && ( ( a.high == b.high ) 4122 || ( ( a.low == 0 ) 4123 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) 4124 ); 4125 4126 } 4127 4128 /* 4129 ------------------------------------------------------------------------------- 4130 Returns 1 if the extended double-precision floating-point value `a' is 4131 less than or equal to the corresponding value `b', and 0 otherwise. The 4132 comparison is performed according to the IEC/IEEE Standard for Binary 4133 Floating-Point Arithmetic. 4134 ------------------------------------------------------------------------------- 4135 */ 4136 flag floatx80_le( floatx80 a, floatx80 b ) 4137 { 4138 flag aSign, bSign; 4139 4140 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4141 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4142 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4143 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4144 ) { 4145 float_raise( float_flag_invalid ); 4146 return 0; 4147 } 4148 aSign = extractFloatx80Sign( a ); 4149 bSign = extractFloatx80Sign( b ); 4150 if ( aSign != bSign ) { 4151 return 4152 aSign 4153 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4154 == 0 ); 4155 } 4156 return 4157 aSign ? le128( b.high, b.low, a.high, a.low ) 4158 : le128( a.high, a.low, b.high, b.low ); 4159 4160 } 4161 4162 /* 4163 ------------------------------------------------------------------------------- 4164 Returns 1 if the extended double-precision floating-point value `a' is 4165 less than the corresponding value `b', and 0 otherwise. The comparison 4166 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4167 Arithmetic. 4168 ------------------------------------------------------------------------------- 4169 */ 4170 flag floatx80_lt( floatx80 a, floatx80 b ) 4171 { 4172 flag aSign, bSign; 4173 4174 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4175 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4176 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4177 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4178 ) { 4179 float_raise( float_flag_invalid ); 4180 return 0; 4181 } 4182 aSign = extractFloatx80Sign( a ); 4183 bSign = extractFloatx80Sign( b ); 4184 if ( aSign != bSign ) { 4185 return 4186 aSign 4187 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4188 != 0 ); 4189 } 4190 return 4191 aSign ? lt128( b.high, b.low, a.high, a.low ) 4192 : lt128( a.high, a.low, b.high, b.low ); 4193 4194 } 4195 4196 /* 4197 ------------------------------------------------------------------------------- 4198 Returns 1 if the extended double-precision floating-point value `a' is equal 4199 to the corresponding value `b', and 0 otherwise. The invalid exception is 4200 raised if either operand is a NaN. Otherwise, the comparison is performed 4201 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4202 ------------------------------------------------------------------------------- 4203 */ 4204 flag floatx80_eq_signaling( floatx80 a, floatx80 b ) 4205 { 4206 4207 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4208 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4209 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4210 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4211 ) { 4212 float_raise( float_flag_invalid ); 4213 return 0; 4214 } 4215 return 4216 ( a.low == b.low ) 4217 && ( ( a.high == b.high ) 4218 || ( ( a.low == 0 ) 4219 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) 4220 ); 4221 4222 } 4223 4224 /* 4225 ------------------------------------------------------------------------------- 4226 Returns 1 if the extended double-precision floating-point value `a' is less 4227 than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs 4228 do not cause an exception. Otherwise, the comparison is performed according 4229 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4230 ------------------------------------------------------------------------------- 4231 */ 4232 flag floatx80_le_quiet( floatx80 a, floatx80 b ) 4233 { 4234 flag aSign, bSign; 4235 4236 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4237 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4238 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4239 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4240 ) { 4241 if ( floatx80_is_signaling_nan( a ) 4242 || floatx80_is_signaling_nan( b ) ) { 4243 float_raise( float_flag_invalid ); 4244 } 4245 return 0; 4246 } 4247 aSign = extractFloatx80Sign( a ); 4248 bSign = extractFloatx80Sign( b ); 4249 if ( aSign != bSign ) { 4250 return 4251 aSign 4252 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4253 == 0 ); 4254 } 4255 return 4256 aSign ? le128( b.high, b.low, a.high, a.low ) 4257 : le128( a.high, a.low, b.high, b.low ); 4258 4259 } 4260 4261 /* 4262 ------------------------------------------------------------------------------- 4263 Returns 1 if the extended double-precision floating-point value `a' is less 4264 than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause 4265 an exception. Otherwise, the comparison is performed according to the 4266 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4267 ------------------------------------------------------------------------------- 4268 */ 4269 flag floatx80_lt_quiet( floatx80 a, floatx80 b ) 4270 { 4271 flag aSign, bSign; 4272 4273 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4274 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4275 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4276 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4277 ) { 4278 if ( floatx80_is_signaling_nan( a ) 4279 || floatx80_is_signaling_nan( b ) ) { 4280 float_raise( float_flag_invalid ); 4281 } 4282 return 0; 4283 } 4284 aSign = extractFloatx80Sign( a ); 4285 bSign = extractFloatx80Sign( b ); 4286 if ( aSign != bSign ) { 4287 return 4288 aSign 4289 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4290 != 0 ); 4291 } 4292 return 4293 aSign ? lt128( b.high, b.low, a.high, a.low ) 4294 : lt128( a.high, a.low, b.high, b.low ); 4295 4296 } 4297 4298 #endif 4299 4300 #ifdef FLOAT128 4301 4302 /* 4303 ------------------------------------------------------------------------------- 4304 Returns the result of converting the quadruple-precision floating-point 4305 value `a' to the 32-bit two's complement integer format. The conversion 4306 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4307 Arithmetic---which means in particular that the conversion is rounded 4308 according to the current rounding mode. If `a' is a NaN, the largest 4309 positive integer is returned. Otherwise, if the conversion overflows, the 4310 largest integer with the same sign as `a' is returned. 4311 ------------------------------------------------------------------------------- 4312 */ 4313 int32 float128_to_int32( float128 a ) 4314 { 4315 flag aSign; 4316 int32 aExp, shiftCount; 4317 bits64 aSig0, aSig1; 4318 4319 aSig1 = extractFloat128Frac1( a ); 4320 aSig0 = extractFloat128Frac0( a ); 4321 aExp = extractFloat128Exp( a ); 4322 aSign = extractFloat128Sign( a ); 4323 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 4324 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4325 aSig0 |= ( aSig1 != 0 ); 4326 shiftCount = 0x4028 - aExp; 4327 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 4328 return roundAndPackInt32( aSign, aSig0 ); 4329 4330 } 4331 4332 /* 4333 ------------------------------------------------------------------------------- 4334 Returns the result of converting the quadruple-precision floating-point 4335 value `a' to the 32-bit two's complement integer format. The conversion 4336 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4337 Arithmetic, except that the conversion is always rounded toward zero. If 4338 `a' is a NaN, the largest positive integer is returned. Otherwise, if the 4339 conversion overflows, the largest integer with the same sign as `a' is 4340 returned. 4341 ------------------------------------------------------------------------------- 4342 */ 4343 int32 float128_to_int32_round_to_zero( float128 a ) 4344 { 4345 flag aSign; 4346 int32 aExp, shiftCount; 4347 bits64 aSig0, aSig1, savedASig; 4348 int32 z; 4349 4350 aSig1 = extractFloat128Frac1( a ); 4351 aSig0 = extractFloat128Frac0( a ); 4352 aExp = extractFloat128Exp( a ); 4353 aSign = extractFloat128Sign( a ); 4354 aSig0 |= ( aSig1 != 0 ); 4355 if ( 0x401E < aExp ) { 4356 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 4357 goto invalid; 4358 } 4359 else if ( aExp < 0x3FFF ) { 4360 if ( aExp || aSig0 ) float_exception_flags |= float_flag_inexact; 4361 return 0; 4362 } 4363 aSig0 |= LIT64( 0x0001000000000000 ); 4364 shiftCount = 0x402F - aExp; 4365 savedASig = aSig0; 4366 aSig0 >>= shiftCount; 4367 z = aSig0; 4368 if ( aSign ) z = - z; 4369 if ( ( z < 0 ) ^ aSign ) { 4370 invalid: 4371 float_raise( float_flag_invalid ); 4372 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 4373 } 4374 if ( ( aSig0<<shiftCount ) != savedASig ) { 4375 float_exception_flags |= float_flag_inexact; 4376 } 4377 return z; 4378 4379 } 4380 4381 /* 4382 ------------------------------------------------------------------------------- 4383 Returns the result of converting the quadruple-precision floating-point 4384 value `a' to the 64-bit two's complement integer format. The conversion 4385 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4386 Arithmetic---which means in particular that the conversion is rounded 4387 according to the current rounding mode. If `a' is a NaN, the largest 4388 positive integer is returned. Otherwise, if the conversion overflows, the 4389 largest integer with the same sign as `a' is returned. 4390 ------------------------------------------------------------------------------- 4391 */ 4392 int64 float128_to_int64( float128 a ) 4393 { 4394 flag aSign; 4395 int32 aExp, shiftCount; 4396 bits64 aSig0, aSig1; 4397 4398 aSig1 = extractFloat128Frac1( a ); 4399 aSig0 = extractFloat128Frac0( a ); 4400 aExp = extractFloat128Exp( a ); 4401 aSign = extractFloat128Sign( a ); 4402 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4403 shiftCount = 0x402F - aExp; 4404 if ( shiftCount <= 0 ) { 4405 if ( 0x403E < aExp ) { 4406 float_raise( float_flag_invalid ); 4407 if ( ! aSign 4408 || ( ( aExp == 0x7FFF ) 4409 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 4410 ) 4411 ) { 4412 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4413 } 4414 return (sbits64) LIT64( 0x8000000000000000 ); 4415 } 4416 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 4417 } 4418 else { 4419 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 4420 } 4421 return roundAndPackInt64( aSign, aSig0, aSig1 ); 4422 4423 } 4424 4425 /* 4426 ------------------------------------------------------------------------------- 4427 Returns the result of converting the quadruple-precision floating-point 4428 value `a' to the 64-bit two's complement integer format. The conversion 4429 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4430 Arithmetic, except that the conversion is always rounded toward zero. 4431 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 4432 the conversion overflows, the largest integer with the same sign as `a' is 4433 returned. 4434 ------------------------------------------------------------------------------- 4435 */ 4436 int64 float128_to_int64_round_to_zero( float128 a ) 4437 { 4438 flag aSign; 4439 int32 aExp, shiftCount; 4440 bits64 aSig0, aSig1; 4441 int64 z; 4442 4443 aSig1 = extractFloat128Frac1( a ); 4444 aSig0 = extractFloat128Frac0( a ); 4445 aExp = extractFloat128Exp( a ); 4446 aSign = extractFloat128Sign( a ); 4447 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4448 shiftCount = aExp - 0x402F; 4449 if ( 0 < shiftCount ) { 4450 if ( 0x403E <= aExp ) { 4451 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 4452 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 4453 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 4454 if ( aSig1 ) float_exception_flags |= float_flag_inexact; 4455 } 4456 else { 4457 float_raise( float_flag_invalid ); 4458 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 4459 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4460 } 4461 } 4462 return (sbits64) LIT64( 0x8000000000000000 ); 4463 } 4464 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 4465 if ( (bits64) ( aSig1<<shiftCount ) ) { 4466 float_exception_flags |= float_flag_inexact; 4467 } 4468 } 4469 else { 4470 if ( aExp < 0x3FFF ) { 4471 if ( aExp | aSig0 | aSig1 ) { 4472 float_exception_flags |= float_flag_inexact; 4473 } 4474 return 0; 4475 } 4476 z = aSig0>>( - shiftCount ); 4477 if ( aSig1 4478 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) { 4479 float_exception_flags |= float_flag_inexact; 4480 } 4481 } 4482 if ( aSign ) z = - z; 4483 return z; 4484 4485 } 4486 4487 #if (defined(SOFTFLOATSPARC64_FOR_GCC) || defined(SOFTFLOAT_FOR_GCC)) \ 4488 && defined(SOFTFLOAT_NEED_FIXUNS) 4489 /* 4490 * just like above - but do not care for overflow of signed results 4491 */ 4492 uint64 float128_to_uint64_round_to_zero( float128 a ) 4493 { 4494 flag aSign; 4495 int32 aExp, shiftCount; 4496 bits64 aSig0, aSig1; 4497 uint64 z; 4498 4499 aSig1 = extractFloat128Frac1( a ); 4500 aSig0 = extractFloat128Frac0( a ); 4501 aExp = extractFloat128Exp( a ); 4502 aSign = extractFloat128Sign( a ); 4503 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4504 shiftCount = aExp - 0x402F; 4505 if ( 0 < shiftCount ) { 4506 if ( 0x403F <= aExp ) { 4507 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 4508 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 4509 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 4510 if ( aSig1 ) float_exception_flags |= float_flag_inexact; 4511 } 4512 else { 4513 float_raise( float_flag_invalid ); 4514 } 4515 return LIT64( 0xFFFFFFFFFFFFFFFF ); 4516 } 4517 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 4518 if ( (bits64) ( aSig1<<shiftCount ) ) { 4519 float_exception_flags |= float_flag_inexact; 4520 } 4521 } 4522 else { 4523 if ( aExp < 0x3FFF ) { 4524 if ( aExp | aSig0 | aSig1 ) { 4525 float_exception_flags |= float_flag_inexact; 4526 } 4527 return 0; 4528 } 4529 z = aSig0>>( - shiftCount ); 4530 if (aSig1 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) { 4531 float_exception_flags |= float_flag_inexact; 4532 } 4533 } 4534 if ( aSign ) z = - z; 4535 return z; 4536 4537 } 4538 #endif /* (SOFTFLOATSPARC64_FOR_GCC || SOFTFLOAT_FOR_GCC) && SOFTFLOAT_NEED_FIXUNS */ 4539 4540 /* 4541 ------------------------------------------------------------------------------- 4542 Returns the result of converting the quadruple-precision floating-point 4543 value `a' to the single-precision floating-point format. The conversion 4544 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4545 Arithmetic. 4546 ------------------------------------------------------------------------------- 4547 */ 4548 float32 float128_to_float32( float128 a ) 4549 { 4550 flag aSign; 4551 int32 aExp; 4552 bits64 aSig0, aSig1; 4553 bits32 zSig; 4554 4555 aSig1 = extractFloat128Frac1( a ); 4556 aSig0 = extractFloat128Frac0( a ); 4557 aExp = extractFloat128Exp( a ); 4558 aSign = extractFloat128Sign( a ); 4559 if ( aExp == 0x7FFF ) { 4560 if ( aSig0 | aSig1 ) { 4561 return commonNaNToFloat32( float128ToCommonNaN( a ) ); 4562 } 4563 return packFloat32( aSign, 0xFF, 0 ); 4564 } 4565 aSig0 |= ( aSig1 != 0 ); 4566 shift64RightJamming( aSig0, 18, &aSig0 ); 4567 zSig = aSig0; 4568 if ( aExp || zSig ) { 4569 zSig |= 0x40000000; 4570 aExp -= 0x3F81; 4571 } 4572 return roundAndPackFloat32( aSign, aExp, zSig ); 4573 4574 } 4575 4576 /* 4577 ------------------------------------------------------------------------------- 4578 Returns the result of converting the quadruple-precision floating-point 4579 value `a' to the double-precision floating-point format. The conversion 4580 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4581 Arithmetic. 4582 ------------------------------------------------------------------------------- 4583 */ 4584 float64 float128_to_float64( float128 a ) 4585 { 4586 flag aSign; 4587 int32 aExp; 4588 bits64 aSig0, aSig1; 4589 4590 aSig1 = extractFloat128Frac1( a ); 4591 aSig0 = extractFloat128Frac0( a ); 4592 aExp = extractFloat128Exp( a ); 4593 aSign = extractFloat128Sign( a ); 4594 if ( aExp == 0x7FFF ) { 4595 if ( aSig0 | aSig1 ) { 4596 return commonNaNToFloat64( float128ToCommonNaN( a ) ); 4597 } 4598 return packFloat64( aSign, 0x7FF, 0 ); 4599 } 4600 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 4601 aSig0 |= ( aSig1 != 0 ); 4602 if ( aExp || aSig0 ) { 4603 aSig0 |= LIT64( 0x4000000000000000 ); 4604 aExp -= 0x3C01; 4605 } 4606 return roundAndPackFloat64( aSign, aExp, aSig0 ); 4607 4608 } 4609 4610 #ifdef FLOATX80 4611 4612 /* 4613 ------------------------------------------------------------------------------- 4614 Returns the result of converting the quadruple-precision floating-point 4615 value `a' to the extended double-precision floating-point format. The 4616 conversion is performed according to the IEC/IEEE Standard for Binary 4617 Floating-Point Arithmetic. 4618 ------------------------------------------------------------------------------- 4619 */ 4620 floatx80 float128_to_floatx80( float128 a ) 4621 { 4622 flag aSign; 4623 int32 aExp; 4624 bits64 aSig0, aSig1; 4625 4626 aSig1 = extractFloat128Frac1( a ); 4627 aSig0 = extractFloat128Frac0( a ); 4628 aExp = extractFloat128Exp( a ); 4629 aSign = extractFloat128Sign( a ); 4630 if ( aExp == 0x7FFF ) { 4631 if ( aSig0 | aSig1 ) { 4632 return commonNaNToFloatx80( float128ToCommonNaN( a ) ); 4633 } 4634 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4635 } 4636 if ( aExp == 0 ) { 4637 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 4638 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 4639 } 4640 else { 4641 aSig0 |= LIT64( 0x0001000000000000 ); 4642 } 4643 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 4644 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 ); 4645 4646 } 4647 4648 #endif 4649 4650 /* 4651 ------------------------------------------------------------------------------- 4652 Rounds the quadruple-precision floating-point value `a' to an integer, and 4653 returns the result as a quadruple-precision floating-point value. The 4654 operation is performed according to the IEC/IEEE Standard for Binary 4655 Floating-Point Arithmetic. 4656 ------------------------------------------------------------------------------- 4657 */ 4658 float128 float128_round_to_int( float128 a ) 4659 { 4660 flag aSign; 4661 int32 aExp; 4662 bits64 lastBitMask, roundBitsMask; 4663 int8 roundingMode; 4664 float128 z; 4665 4666 aExp = extractFloat128Exp( a ); 4667 if ( 0x402F <= aExp ) { 4668 if ( 0x406F <= aExp ) { 4669 if ( ( aExp == 0x7FFF ) 4670 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 4671 ) { 4672 return propagateFloat128NaN( a, a ); 4673 } 4674 return a; 4675 } 4676 lastBitMask = 1; 4677 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 4678 roundBitsMask = lastBitMask - 1; 4679 z = a; 4680 roundingMode = float_rounding_mode; 4681 if ( roundingMode == float_round_nearest_even ) { 4682 if ( lastBitMask ) { 4683 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 4684 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 4685 } 4686 else { 4687 if ( (sbits64) z.low < 0 ) { 4688 ++z.high; 4689 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1; 4690 } 4691 } 4692 } 4693 else if ( roundingMode != float_round_to_zero ) { 4694 if ( extractFloat128Sign( z ) 4695 ^ ( roundingMode == float_round_up ) ) { 4696 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low ); 4697 } 4698 } 4699 z.low &= ~ roundBitsMask; 4700 } 4701 else { 4702 if ( aExp < 0x3FFF ) { 4703 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 4704 float_exception_flags |= float_flag_inexact; 4705 aSign = extractFloat128Sign( a ); 4706 switch ( float_rounding_mode ) { 4707 case float_round_nearest_even: 4708 if ( ( aExp == 0x3FFE ) 4709 && ( extractFloat128Frac0( a ) 4710 | extractFloat128Frac1( a ) ) 4711 ) { 4712 return packFloat128( aSign, 0x3FFF, 0, 0 ); 4713 } 4714 break; 4715 case float_round_to_zero: 4716 break; 4717 case float_round_down: 4718 return 4719 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 4720 : packFloat128( 0, 0, 0, 0 ); 4721 case float_round_up: 4722 return 4723 aSign ? packFloat128( 1, 0, 0, 0 ) 4724 : packFloat128( 0, 0x3FFF, 0, 0 ); 4725 } 4726 return packFloat128( aSign, 0, 0, 0 ); 4727 } 4728 lastBitMask = 1; 4729 lastBitMask <<= 0x402F - aExp; 4730 roundBitsMask = lastBitMask - 1; 4731 z.low = 0; 4732 z.high = a.high; 4733 roundingMode = float_rounding_mode; 4734 if ( roundingMode == float_round_nearest_even ) { 4735 z.high += lastBitMask>>1; 4736 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 4737 z.high &= ~ lastBitMask; 4738 } 4739 } 4740 else if ( roundingMode != float_round_to_zero ) { 4741 if ( extractFloat128Sign( z ) 4742 ^ ( roundingMode == float_round_up ) ) { 4743 z.high |= ( a.low != 0 ); 4744 z.high += roundBitsMask; 4745 } 4746 } 4747 z.high &= ~ roundBitsMask; 4748 } 4749 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 4750 float_exception_flags |= float_flag_inexact; 4751 } 4752 return z; 4753 4754 } 4755 4756 /* 4757 ------------------------------------------------------------------------------- 4758 Returns the result of adding the absolute values of the quadruple-precision 4759 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 4760 before being returned. `zSign' is ignored if the result is a NaN. 4761 The addition is performed according to the IEC/IEEE Standard for Binary 4762 Floating-Point Arithmetic. 4763 ------------------------------------------------------------------------------- 4764 */ 4765 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign ) 4766 { 4767 int32 aExp, bExp, zExp; 4768 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 4769 int32 expDiff; 4770 4771 aSig1 = extractFloat128Frac1( a ); 4772 aSig0 = extractFloat128Frac0( a ); 4773 aExp = extractFloat128Exp( a ); 4774 bSig1 = extractFloat128Frac1( b ); 4775 bSig0 = extractFloat128Frac0( b ); 4776 bExp = extractFloat128Exp( b ); 4777 expDiff = aExp - bExp; 4778 if ( 0 < expDiff ) { 4779 if ( aExp == 0x7FFF ) { 4780 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 4781 return a; 4782 } 4783 if ( bExp == 0 ) { 4784 --expDiff; 4785 } 4786 else { 4787 bSig0 |= LIT64( 0x0001000000000000 ); 4788 } 4789 shift128ExtraRightJamming( 4790 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 4791 zExp = aExp; 4792 } 4793 else if ( expDiff < 0 ) { 4794 if ( bExp == 0x7FFF ) { 4795 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4796 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4797 } 4798 if ( aExp == 0 ) { 4799 ++expDiff; 4800 } 4801 else { 4802 aSig0 |= LIT64( 0x0001000000000000 ); 4803 } 4804 shift128ExtraRightJamming( 4805 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 4806 zExp = bExp; 4807 } 4808 else { 4809 if ( aExp == 0x7FFF ) { 4810 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 4811 return propagateFloat128NaN( a, b ); 4812 } 4813 return a; 4814 } 4815 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4816 if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 ); 4817 zSig2 = 0; 4818 zSig0 |= LIT64( 0x0002000000000000 ); 4819 zExp = aExp; 4820 goto shiftRight1; 4821 } 4822 aSig0 |= LIT64( 0x0001000000000000 ); 4823 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4824 --zExp; 4825 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; 4826 ++zExp; 4827 shiftRight1: 4828 shift128ExtraRightJamming( 4829 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 4830 roundAndPack: 4831 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 4832 4833 } 4834 4835 /* 4836 ------------------------------------------------------------------------------- 4837 Returns the result of subtracting the absolute values of the quadruple- 4838 precision floating-point values `a' and `b'. If `zSign' is 1, the 4839 difference is negated before being returned. `zSign' is ignored if the 4840 result is a NaN. The subtraction is performed according to the IEC/IEEE 4841 Standard for Binary Floating-Point Arithmetic. 4842 ------------------------------------------------------------------------------- 4843 */ 4844 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign ) 4845 { 4846 int32 aExp, bExp, zExp; 4847 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 4848 int32 expDiff; 4849 float128 z; 4850 4851 aSig1 = extractFloat128Frac1( a ); 4852 aSig0 = extractFloat128Frac0( a ); 4853 aExp = extractFloat128Exp( a ); 4854 bSig1 = extractFloat128Frac1( b ); 4855 bSig0 = extractFloat128Frac0( b ); 4856 bExp = extractFloat128Exp( b ); 4857 expDiff = aExp - bExp; 4858 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 4859 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 4860 if ( 0 < expDiff ) goto aExpBigger; 4861 if ( expDiff < 0 ) goto bExpBigger; 4862 if ( aExp == 0x7FFF ) { 4863 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 4864 return propagateFloat128NaN( a, b ); 4865 } 4866 float_raise( float_flag_invalid ); 4867 z.low = float128_default_nan_low; 4868 z.high = float128_default_nan_high; 4869 return z; 4870 } 4871 if ( aExp == 0 ) { 4872 aExp = 1; 4873 bExp = 1; 4874 } 4875 if ( bSig0 < aSig0 ) goto aBigger; 4876 if ( aSig0 < bSig0 ) goto bBigger; 4877 if ( bSig1 < aSig1 ) goto aBigger; 4878 if ( aSig1 < bSig1 ) goto bBigger; 4879 return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 ); 4880 bExpBigger: 4881 if ( bExp == 0x7FFF ) { 4882 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4883 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 4884 } 4885 if ( aExp == 0 ) { 4886 ++expDiff; 4887 } 4888 else { 4889 aSig0 |= LIT64( 0x4000000000000000 ); 4890 } 4891 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 4892 bSig0 |= LIT64( 0x4000000000000000 ); 4893 bBigger: 4894 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 4895 zExp = bExp; 4896 zSign ^= 1; 4897 goto normalizeRoundAndPack; 4898 aExpBigger: 4899 if ( aExp == 0x7FFF ) { 4900 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 4901 return a; 4902 } 4903 if ( bExp == 0 ) { 4904 --expDiff; 4905 } 4906 else { 4907 bSig0 |= LIT64( 0x4000000000000000 ); 4908 } 4909 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 4910 aSig0 |= LIT64( 0x4000000000000000 ); 4911 aBigger: 4912 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4913 zExp = aExp; 4914 normalizeRoundAndPack: 4915 --zExp; 4916 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 ); 4917 4918 } 4919 4920 /* 4921 ------------------------------------------------------------------------------- 4922 Returns the result of adding the quadruple-precision floating-point values 4923 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 4924 for Binary Floating-Point Arithmetic. 4925 ------------------------------------------------------------------------------- 4926 */ 4927 float128 float128_add( float128 a, float128 b ) 4928 { 4929 flag aSign, bSign; 4930 4931 aSign = extractFloat128Sign( a ); 4932 bSign = extractFloat128Sign( b ); 4933 if ( aSign == bSign ) { 4934 return addFloat128Sigs( a, b, aSign ); 4935 } 4936 else { 4937 return subFloat128Sigs( a, b, aSign ); 4938 } 4939 4940 } 4941 4942 /* 4943 ------------------------------------------------------------------------------- 4944 Returns the result of subtracting the quadruple-precision floating-point 4945 values `a' and `b'. The operation is performed according to the IEC/IEEE 4946 Standard for Binary Floating-Point Arithmetic. 4947 ------------------------------------------------------------------------------- 4948 */ 4949 float128 float128_sub( float128 a, float128 b ) 4950 { 4951 flag aSign, bSign; 4952 4953 aSign = extractFloat128Sign( a ); 4954 bSign = extractFloat128Sign( b ); 4955 if ( aSign == bSign ) { 4956 return subFloat128Sigs( a, b, aSign ); 4957 } 4958 else { 4959 return addFloat128Sigs( a, b, aSign ); 4960 } 4961 4962 } 4963 4964 /* 4965 ------------------------------------------------------------------------------- 4966 Returns the result of multiplying the quadruple-precision floating-point 4967 values `a' and `b'. The operation is performed according to the IEC/IEEE 4968 Standard for Binary Floating-Point Arithmetic. 4969 ------------------------------------------------------------------------------- 4970 */ 4971 float128 float128_mul( float128 a, float128 b ) 4972 { 4973 flag aSign, bSign, zSign; 4974 int32 aExp, bExp, zExp; 4975 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 4976 float128 z; 4977 4978 aSig1 = extractFloat128Frac1( a ); 4979 aSig0 = extractFloat128Frac0( a ); 4980 aExp = extractFloat128Exp( a ); 4981 aSign = extractFloat128Sign( a ); 4982 bSig1 = extractFloat128Frac1( b ); 4983 bSig0 = extractFloat128Frac0( b ); 4984 bExp = extractFloat128Exp( b ); 4985 bSign = extractFloat128Sign( b ); 4986 zSign = aSign ^ bSign; 4987 if ( aExp == 0x7FFF ) { 4988 if ( ( aSig0 | aSig1 ) 4989 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 4990 return propagateFloat128NaN( a, b ); 4991 } 4992 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 4993 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4994 } 4995 if ( bExp == 0x7FFF ) { 4996 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4997 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 4998 invalid: 4999 float_raise( float_flag_invalid ); 5000 z.low = float128_default_nan_low; 5001 z.high = float128_default_nan_high; 5002 return z; 5003 } 5004 return packFloat128( zSign, 0x7FFF, 0, 0 ); 5005 } 5006 if ( aExp == 0 ) { 5007 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 5008 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5009 } 5010 if ( bExp == 0 ) { 5011 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 5012 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 5013 } 5014 zExp = aExp + bExp - 0x4000; 5015 aSig0 |= LIT64( 0x0001000000000000 ); 5016 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 5017 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 5018 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 5019 zSig2 |= ( zSig3 != 0 ); 5020 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { 5021 shift128ExtraRightJamming( 5022 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 5023 ++zExp; 5024 } 5025 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 5026 5027 } 5028 5029 /* 5030 ------------------------------------------------------------------------------- 5031 Returns the result of dividing the quadruple-precision floating-point value 5032 `a' by the corresponding value `b'. The operation is performed according to 5033 the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5034 ------------------------------------------------------------------------------- 5035 */ 5036 float128 float128_div( float128 a, float128 b ) 5037 { 5038 flag aSign, bSign, zSign; 5039 int32 aExp, bExp, zExp; 5040 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 5041 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5042 float128 z; 5043 5044 aSig1 = extractFloat128Frac1( a ); 5045 aSig0 = extractFloat128Frac0( a ); 5046 aExp = extractFloat128Exp( a ); 5047 aSign = extractFloat128Sign( a ); 5048 bSig1 = extractFloat128Frac1( b ); 5049 bSig0 = extractFloat128Frac0( b ); 5050 bExp = extractFloat128Exp( b ); 5051 bSign = extractFloat128Sign( b ); 5052 zSign = aSign ^ bSign; 5053 if ( aExp == 0x7FFF ) { 5054 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 5055 if ( bExp == 0x7FFF ) { 5056 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 5057 goto invalid; 5058 } 5059 return packFloat128( zSign, 0x7FFF, 0, 0 ); 5060 } 5061 if ( bExp == 0x7FFF ) { 5062 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 5063 return packFloat128( zSign, 0, 0, 0 ); 5064 } 5065 if ( bExp == 0 ) { 5066 if ( ( bSig0 | bSig1 ) == 0 ) { 5067 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 5068 invalid: 5069 float_raise( float_flag_invalid ); 5070 z.low = float128_default_nan_low; 5071 z.high = float128_default_nan_high; 5072 return z; 5073 } 5074 float_raise( float_flag_divbyzero ); 5075 return packFloat128( zSign, 0x7FFF, 0, 0 ); 5076 } 5077 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 5078 } 5079 if ( aExp == 0 ) { 5080 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 5081 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5082 } 5083 zExp = aExp - bExp + 0x3FFD; 5084 shortShift128Left( 5085 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); 5086 shortShift128Left( 5087 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 5088 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { 5089 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 5090 ++zExp; 5091 } 5092 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5093 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 5094 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 5095 while ( (sbits64) rem0 < 0 ) { 5096 --zSig0; 5097 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 5098 } 5099 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 5100 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 5101 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 5102 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 5103 while ( (sbits64) rem1 < 0 ) { 5104 --zSig1; 5105 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 5106 } 5107 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5108 } 5109 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 5110 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 5111 5112 } 5113 5114 /* 5115 ------------------------------------------------------------------------------- 5116 Returns the remainder of the quadruple-precision floating-point value `a' 5117 with respect to the corresponding value `b'. The operation is performed 5118 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5119 ------------------------------------------------------------------------------- 5120 */ 5121 float128 float128_rem( float128 a, float128 b ) 5122 { 5123 flag aSign, bSign, zSign; 5124 int32 aExp, bExp, expDiff; 5125 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 5126 bits64 allZero, alternateASig0, alternateASig1, sigMean1; 5127 sbits64 sigMean0; 5128 float128 z; 5129 5130 aSig1 = extractFloat128Frac1( a ); 5131 aSig0 = extractFloat128Frac0( a ); 5132 aExp = extractFloat128Exp( a ); 5133 aSign = extractFloat128Sign( a ); 5134 bSig1 = extractFloat128Frac1( b ); 5135 bSig0 = extractFloat128Frac0( b ); 5136 bExp = extractFloat128Exp( b ); 5137 bSign = extractFloat128Sign( b ); 5138 if ( aExp == 0x7FFF ) { 5139 if ( ( aSig0 | aSig1 ) 5140 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 5141 return propagateFloat128NaN( a, b ); 5142 } 5143 goto invalid; 5144 } 5145 if ( bExp == 0x7FFF ) { 5146 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 5147 return a; 5148 } 5149 if ( bExp == 0 ) { 5150 if ( ( bSig0 | bSig1 ) == 0 ) { 5151 invalid: 5152 float_raise( float_flag_invalid ); 5153 z.low = float128_default_nan_low; 5154 z.high = float128_default_nan_high; 5155 return z; 5156 } 5157 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 5158 } 5159 if ( aExp == 0 ) { 5160 if ( ( aSig0 | aSig1 ) == 0 ) return a; 5161 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5162 } 5163 expDiff = aExp - bExp; 5164 if ( expDiff < -1 ) return a; 5165 shortShift128Left( 5166 aSig0 | LIT64( 0x0001000000000000 ), 5167 aSig1, 5168 15 - ( expDiff < 0 ), 5169 &aSig0, 5170 &aSig1 5171 ); 5172 shortShift128Left( 5173 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 5174 q = le128( bSig0, bSig1, aSig0, aSig1 ); 5175 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 5176 expDiff -= 64; 5177 while ( 0 < expDiff ) { 5178 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5179 q = ( 4 < q ) ? q - 4 : 0; 5180 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 5181 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 5182 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 5183 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 5184 expDiff -= 61; 5185 } 5186 if ( -64 < expDiff ) { 5187 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5188 q = ( 4 < q ) ? q - 4 : 0; 5189 q >>= - expDiff; 5190 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 5191 expDiff += 52; 5192 if ( expDiff < 0 ) { 5193 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 5194 } 5195 else { 5196 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 5197 } 5198 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 5199 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 5200 } 5201 else { 5202 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 5203 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 5204 } 5205 do { 5206 alternateASig0 = aSig0; 5207 alternateASig1 = aSig1; 5208 ++q; 5209 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 5210 } while ( 0 <= (sbits64) aSig0 ); 5211 add128( 5212 aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 ); 5213 if ( ( sigMean0 < 0 ) 5214 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 5215 aSig0 = alternateASig0; 5216 aSig1 = alternateASig1; 5217 } 5218 zSign = ( (sbits64) aSig0 < 0 ); 5219 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 5220 return 5221 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 ); 5222 5223 } 5224 5225 /* 5226 ------------------------------------------------------------------------------- 5227 Returns the square root of the quadruple-precision floating-point value `a'. 5228 The operation is performed according to the IEC/IEEE Standard for Binary 5229 Floating-Point Arithmetic. 5230 ------------------------------------------------------------------------------- 5231 */ 5232 float128 float128_sqrt( float128 a ) 5233 { 5234 flag aSign; 5235 int32 aExp, zExp; 5236 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 5237 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5238 float128 z; 5239 5240 aSig1 = extractFloat128Frac1( a ); 5241 aSig0 = extractFloat128Frac0( a ); 5242 aExp = extractFloat128Exp( a ); 5243 aSign = extractFloat128Sign( a ); 5244 if ( aExp == 0x7FFF ) { 5245 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a ); 5246 if ( ! aSign ) return a; 5247 goto invalid; 5248 } 5249 if ( aSign ) { 5250 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 5251 invalid: 5252 float_raise( float_flag_invalid ); 5253 z.low = float128_default_nan_low; 5254 z.high = float128_default_nan_high; 5255 return z; 5256 } 5257 if ( aExp == 0 ) { 5258 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 5259 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5260 } 5261 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 5262 aSig0 |= LIT64( 0x0001000000000000 ); 5263 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 5264 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 5265 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 5266 doubleZSig0 = zSig0<<1; 5267 mul64To128( zSig0, zSig0, &term0, &term1 ); 5268 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 5269 while ( (sbits64) rem0 < 0 ) { 5270 --zSig0; 5271 doubleZSig0 -= 2; 5272 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 5273 } 5274 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 5275 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 5276 if ( zSig1 == 0 ) zSig1 = 1; 5277 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 5278 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5279 mul64To128( zSig1, zSig1, &term2, &term3 ); 5280 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 5281 while ( (sbits64) rem1 < 0 ) { 5282 --zSig1; 5283 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 5284 term3 |= 1; 5285 term2 |= doubleZSig0; 5286 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 5287 } 5288 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5289 } 5290 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 5291 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 ); 5292 5293 } 5294 5295 /* 5296 ------------------------------------------------------------------------------- 5297 Returns 1 if the quadruple-precision floating-point value `a' is equal to 5298 the corresponding value `b', and 0 otherwise. The comparison is performed 5299 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5300 ------------------------------------------------------------------------------- 5301 */ 5302 flag float128_eq( float128 a, float128 b ) 5303 { 5304 5305 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5306 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5307 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5308 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5309 ) { 5310 if ( float128_is_signaling_nan( a ) 5311 || float128_is_signaling_nan( b ) ) { 5312 float_raise( float_flag_invalid ); 5313 } 5314 return 0; 5315 } 5316 return 5317 ( a.low == b.low ) 5318 && ( ( a.high == b.high ) 5319 || ( ( a.low == 0 ) 5320 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) 5321 ); 5322 5323 } 5324 5325 /* 5326 ------------------------------------------------------------------------------- 5327 Returns 1 if the quadruple-precision floating-point value `a' is less than 5328 or equal to the corresponding value `b', and 0 otherwise. The comparison 5329 is performed according to the IEC/IEEE Standard for Binary Floating-Point 5330 Arithmetic. 5331 ------------------------------------------------------------------------------- 5332 */ 5333 flag float128_le( float128 a, float128 b ) 5334 { 5335 flag aSign, bSign; 5336 5337 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5338 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5339 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5340 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5341 ) { 5342 float_raise( float_flag_invalid ); 5343 return 0; 5344 } 5345 aSign = extractFloat128Sign( a ); 5346 bSign = extractFloat128Sign( b ); 5347 if ( aSign != bSign ) { 5348 return 5349 aSign 5350 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5351 == 0 ); 5352 } 5353 return 5354 aSign ? le128( b.high, b.low, a.high, a.low ) 5355 : le128( a.high, a.low, b.high, b.low ); 5356 5357 } 5358 5359 /* 5360 ------------------------------------------------------------------------------- 5361 Returns 1 if the quadruple-precision floating-point value `a' is less than 5362 the corresponding value `b', and 0 otherwise. The comparison is performed 5363 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5364 ------------------------------------------------------------------------------- 5365 */ 5366 flag float128_lt( float128 a, float128 b ) 5367 { 5368 flag aSign, bSign; 5369 5370 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5371 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5372 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5373 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5374 ) { 5375 float_raise( float_flag_invalid ); 5376 return 0; 5377 } 5378 aSign = extractFloat128Sign( a ); 5379 bSign = extractFloat128Sign( b ); 5380 if ( aSign != bSign ) { 5381 return 5382 aSign 5383 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5384 != 0 ); 5385 } 5386 return 5387 aSign ? lt128( b.high, b.low, a.high, a.low ) 5388 : lt128( a.high, a.low, b.high, b.low ); 5389 5390 } 5391 5392 /* 5393 ------------------------------------------------------------------------------- 5394 Returns 1 if the quadruple-precision floating-point value `a' is equal to 5395 the corresponding value `b', and 0 otherwise. The invalid exception is 5396 raised if either operand is a NaN. Otherwise, the comparison is performed 5397 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5398 ------------------------------------------------------------------------------- 5399 */ 5400 flag float128_eq_signaling( float128 a, float128 b ) 5401 { 5402 5403 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5404 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5405 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5406 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5407 ) { 5408 float_raise( float_flag_invalid ); 5409 return 0; 5410 } 5411 return 5412 ( a.low == b.low ) 5413 && ( ( a.high == b.high ) 5414 || ( ( a.low == 0 ) 5415 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) 5416 ); 5417 5418 } 5419 5420 /* 5421 ------------------------------------------------------------------------------- 5422 Returns 1 if the quadruple-precision floating-point value `a' is less than 5423 or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5424 cause an exception. Otherwise, the comparison is performed according to the 5425 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5426 ------------------------------------------------------------------------------- 5427 */ 5428 flag float128_le_quiet( float128 a, float128 b ) 5429 { 5430 flag aSign, bSign; 5431 5432 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5433 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5434 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5435 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5436 ) { 5437 if ( float128_is_signaling_nan( a ) 5438 || float128_is_signaling_nan( b ) ) { 5439 float_raise( float_flag_invalid ); 5440 } 5441 return 0; 5442 } 5443 aSign = extractFloat128Sign( a ); 5444 bSign = extractFloat128Sign( b ); 5445 if ( aSign != bSign ) { 5446 return 5447 aSign 5448 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5449 == 0 ); 5450 } 5451 return 5452 aSign ? le128( b.high, b.low, a.high, a.low ) 5453 : le128( a.high, a.low, b.high, b.low ); 5454 5455 } 5456 5457 /* 5458 ------------------------------------------------------------------------------- 5459 Returns 1 if the quadruple-precision floating-point value `a' is less than 5460 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 5461 exception. Otherwise, the comparison is performed according to the IEC/IEEE 5462 Standard for Binary Floating-Point Arithmetic. 5463 ------------------------------------------------------------------------------- 5464 */ 5465 flag float128_lt_quiet( float128 a, float128 b ) 5466 { 5467 flag aSign, bSign; 5468 5469 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5470 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5471 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5472 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5473 ) { 5474 if ( float128_is_signaling_nan( a ) 5475 || float128_is_signaling_nan( b ) ) { 5476 float_raise( float_flag_invalid ); 5477 } 5478 return 0; 5479 } 5480 aSign = extractFloat128Sign( a ); 5481 bSign = extractFloat128Sign( b ); 5482 if ( aSign != bSign ) { 5483 return 5484 aSign 5485 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5486 != 0 ); 5487 } 5488 return 5489 aSign ? lt128( b.high, b.low, a.high, a.low ) 5490 : lt128( a.high, a.low, b.high, b.low ); 5491 5492 } 5493 5494 #endif 5495 5496 5497 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS) 5498 5499 /* 5500 * These two routines are not part of the original softfloat distribution. 5501 * 5502 * They are based on the corresponding conversions to integer but return 5503 * unsigned numbers instead since these functions are required by GCC. 5504 * 5505 * Added by Mark Brinicombe <mark@NetBSD.org> 27/09/97 5506 * 5507 * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15] 5508 */ 5509 5510 /* 5511 ------------------------------------------------------------------------------- 5512 Returns the result of converting the double-precision floating-point value 5513 `a' to the 32-bit unsigned integer format. The conversion is 5514 performed according to the IEC/IEEE Standard for Binary Floating-point 5515 Arithmetic, except that the conversion is always rounded toward zero. If 5516 `a' is a NaN, the largest positive integer is returned. If the conversion 5517 overflows, the largest integer positive is returned. 5518 ------------------------------------------------------------------------------- 5519 */ 5520 uint32 float64_to_uint32_round_to_zero( float64 a ) 5521 { 5522 flag aSign; 5523 int16 aExp, shiftCount; 5524 bits64 aSig, savedASig; 5525 uint32 z; 5526 5527 aSig = extractFloat64Frac( a ); 5528 aExp = extractFloat64Exp( a ); 5529 aSign = extractFloat64Sign( a ); 5530 5531 if (aSign) { 5532 float_raise( float_flag_invalid ); 5533 return(0); 5534 } 5535 5536 if ( 0x41E < aExp ) { 5537 float_raise( float_flag_invalid ); 5538 return 0xffffffff; 5539 } 5540 else if ( aExp < 0x3FF ) { 5541 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact; 5542 return 0; 5543 } 5544 aSig |= LIT64( 0x0010000000000000 ); 5545 shiftCount = 0x433 - aExp; 5546 savedASig = aSig; 5547 aSig >>= shiftCount; 5548 z = aSig; 5549 if ( ( aSig<<shiftCount ) != savedASig ) { 5550 float_exception_flags |= float_flag_inexact; 5551 } 5552 return z; 5553 5554 } 5555 5556 /* 5557 ------------------------------------------------------------------------------- 5558 Returns the result of converting the single-precision floating-point value 5559 `a' to the 32-bit unsigned integer format. The conversion is 5560 performed according to the IEC/IEEE Standard for Binary Floating-point 5561 Arithmetic, except that the conversion is always rounded toward zero. If 5562 `a' is a NaN, the largest positive integer is returned. If the conversion 5563 overflows, the largest positive integer is returned. 5564 ------------------------------------------------------------------------------- 5565 */ 5566 uint32 float32_to_uint32_round_to_zero( float32 a ) 5567 { 5568 flag aSign; 5569 int16 aExp, shiftCount; 5570 bits32 aSig; 5571 uint32 z; 5572 5573 aSig = extractFloat32Frac( a ); 5574 aExp = extractFloat32Exp( a ); 5575 aSign = extractFloat32Sign( a ); 5576 shiftCount = aExp - 0x9E; 5577 5578 if (aSign) { 5579 float_raise( float_flag_invalid ); 5580 return(0); 5581 } 5582 if ( 0 < shiftCount ) { 5583 float_raise( float_flag_invalid ); 5584 return 0xFFFFFFFF; 5585 } 5586 else if ( aExp <= 0x7E ) { 5587 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact; 5588 return 0; 5589 } 5590 aSig = ( aSig | 0x800000 )<<8; 5591 z = aSig>>( - shiftCount ); 5592 if ( aSig<<( shiftCount & 31 ) ) { 5593 float_exception_flags |= float_flag_inexact; 5594 } 5595 return z; 5596 5597 } 5598 5599 #endif 5600