1 /* $NetBSD: softfloat.c,v 1.2 2003/07/26 19:24:52 salo Exp $ */ 2 3 /* 4 * This version hacked for use with gcc -msoft-float by bjh21. 5 * (Mostly a case of #ifdefing out things GCC doesn't need or provides 6 * itself). 7 */ 8 9 /* 10 * Things you may want to define: 11 * 12 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with 13 * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them 14 * properly renamed. 15 */ 16 17 /* 18 =============================================================================== 19 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 #include <sys/cdefs.h> 48 __FBSDID("$FreeBSD$"); 49 50 #ifdef SOFTFLOAT_FOR_GCC 51 #include "softfloat-for-gcc.h" 52 #endif 53 54 #include "milieu.h" 55 #include "softfloat.h" 56 57 /* 58 * Conversions between floats as stored in memory and floats as 59 * SoftFloat uses them 60 */ 61 #ifndef FLOAT64_DEMANGLE 62 #define FLOAT64_DEMANGLE(a) (a) 63 #endif 64 #ifndef FLOAT64_MANGLE 65 #define FLOAT64_MANGLE(a) (a) 66 #endif 67 68 /* 69 ------------------------------------------------------------------------------- 70 Floating-point rounding mode, extended double-precision rounding precision, 71 and exception flags. 72 ------------------------------------------------------------------------------- 73 */ 74 fp_rnd_t float_rounding_mode = float_round_nearest_even; 75 fp_except float_exception_flags = 0; 76 #ifdef FLOATX80 77 int8 floatx80_rounding_precision = 80; 78 #endif 79 80 /* 81 ------------------------------------------------------------------------------- 82 Primitive arithmetic functions, including multi-word arithmetic, and 83 division and square root approximations. (Can be specialized to target if 84 desired.) 85 ------------------------------------------------------------------------------- 86 */ 87 #include "softfloat-macros" 88 89 /* 90 ------------------------------------------------------------------------------- 91 Functions and definitions to determine: (1) whether tininess for underflow 92 is detected before or after rounding by default, (2) what (if anything) 93 happens when exceptions are raised, (3) how signaling NaNs are distinguished 94 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 95 are propagated from function inputs to output. These details are target- 96 specific. 97 ------------------------------------------------------------------------------- 98 */ 99 #include "softfloat-specialize" 100 101 #if !defined(SOFTFLOAT_FOR_GCC) || defined(FLOATX80) || defined(FLOAT128) 102 /* 103 ------------------------------------------------------------------------------- 104 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 105 and 7, and returns the properly rounded 32-bit integer corresponding to the 106 input. If `zSign' is 1, the input is negated before being converted to an 107 integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 108 is simply rounded to an integer, with the inexact exception raised if the 109 input cannot be represented exactly as an integer. However, if the fixed- 110 point input is too large, the invalid exception is raised and the largest 111 positive or negative integer is returned. 112 ------------------------------------------------------------------------------- 113 */ 114 static int32 roundAndPackInt32( flag zSign, bits64 absZ ) 115 { 116 int8 roundingMode; 117 flag roundNearestEven; 118 int8 roundIncrement, roundBits; 119 int32 z; 120 121 roundingMode = float_rounding_mode; 122 roundNearestEven = ( roundingMode == float_round_nearest_even ); 123 roundIncrement = 0x40; 124 if ( ! roundNearestEven ) { 125 if ( roundingMode == float_round_to_zero ) { 126 roundIncrement = 0; 127 } 128 else { 129 roundIncrement = 0x7F; 130 if ( zSign ) { 131 if ( roundingMode == float_round_up ) roundIncrement = 0; 132 } 133 else { 134 if ( roundingMode == float_round_down ) roundIncrement = 0; 135 } 136 } 137 } 138 roundBits = absZ & 0x7F; 139 absZ = ( absZ + roundIncrement )>>7; 140 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 141 z = absZ; 142 if ( zSign ) z = - z; 143 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 144 float_raise( float_flag_invalid ); 145 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 146 } 147 if ( roundBits ) float_exception_flags |= float_flag_inexact; 148 return z; 149 150 } 151 152 /* 153 ------------------------------------------------------------------------------- 154 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 155 `absZ1', with binary point between bits 63 and 64 (between the input words), 156 and returns the properly rounded 64-bit integer corresponding to the input. 157 If `zSign' is 1, the input is negated before being converted to an integer. 158 Ordinarily, the fixed-point input is simply rounded to an integer, with 159 the inexact exception raised if the input cannot be represented exactly as 160 an integer. However, if the fixed-point input is too large, the invalid 161 exception is raised and the largest positive or negative integer is 162 returned. 163 ------------------------------------------------------------------------------- 164 */ 165 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 ) 166 { 167 int8 roundingMode; 168 flag roundNearestEven, increment; 169 int64 z; 170 171 roundingMode = float_rounding_mode; 172 roundNearestEven = ( roundingMode == float_round_nearest_even ); 173 increment = ( (sbits64) absZ1 < 0 ); 174 if ( ! roundNearestEven ) { 175 if ( roundingMode == float_round_to_zero ) { 176 increment = 0; 177 } 178 else { 179 if ( zSign ) { 180 increment = ( roundingMode == float_round_down ) && absZ1; 181 } 182 else { 183 increment = ( roundingMode == float_round_up ) && absZ1; 184 } 185 } 186 } 187 if ( increment ) { 188 ++absZ0; 189 if ( absZ0 == 0 ) goto overflow; 190 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 191 } 192 z = absZ0; 193 if ( zSign ) z = - z; 194 if ( z && ( ( z < 0 ) ^ zSign ) ) { 195 overflow: 196 float_raise( float_flag_invalid ); 197 return 198 zSign ? (sbits64) LIT64( 0x8000000000000000 ) 199 : LIT64( 0x7FFFFFFFFFFFFFFF ); 200 } 201 if ( absZ1 ) float_exception_flags |= float_flag_inexact; 202 return z; 203 204 } 205 #endif 206 207 /* 208 ------------------------------------------------------------------------------- 209 Returns the fraction bits of the single-precision floating-point value `a'. 210 ------------------------------------------------------------------------------- 211 */ 212 INLINE bits32 extractFloat32Frac( float32 a ) 213 { 214 215 return a & 0x007FFFFF; 216 217 } 218 219 /* 220 ------------------------------------------------------------------------------- 221 Returns the exponent bits of the single-precision floating-point value `a'. 222 ------------------------------------------------------------------------------- 223 */ 224 INLINE int16 extractFloat32Exp( float32 a ) 225 { 226 227 return ( a>>23 ) & 0xFF; 228 229 } 230 231 /* 232 ------------------------------------------------------------------------------- 233 Returns the sign bit of the single-precision floating-point value `a'. 234 ------------------------------------------------------------------------------- 235 */ 236 INLINE flag extractFloat32Sign( float32 a ) 237 { 238 239 return a>>31; 240 241 } 242 243 /* 244 ------------------------------------------------------------------------------- 245 Normalizes the subnormal single-precision floating-point value represented 246 by the denormalized significand `aSig'. The normalized exponent and 247 significand are stored at the locations pointed to by `zExpPtr' and 248 `zSigPtr', respectively. 249 ------------------------------------------------------------------------------- 250 */ 251 static void 252 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr ) 253 { 254 int8 shiftCount; 255 256 shiftCount = countLeadingZeros32( aSig ) - 8; 257 *zSigPtr = aSig<<shiftCount; 258 *zExpPtr = 1 - shiftCount; 259 260 } 261 262 /* 263 ------------------------------------------------------------------------------- 264 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 265 single-precision floating-point value, returning the result. After being 266 shifted into the proper positions, the three fields are simply added 267 together to form the result. This means that any integer portion of `zSig' 268 will be added into the exponent. Since a properly normalized significand 269 will have an integer portion equal to 1, the `zExp' input should be 1 less 270 than the desired result exponent whenever `zSig' is a complete, normalized 271 significand. 272 ------------------------------------------------------------------------------- 273 */ 274 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig ) 275 { 276 277 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig; 278 279 } 280 281 /* 282 ------------------------------------------------------------------------------- 283 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 284 and significand `zSig', and returns the proper single-precision floating- 285 point value corresponding to the abstract input. Ordinarily, the abstract 286 value is simply rounded and packed into the single-precision format, with 287 the inexact exception raised if the abstract input cannot be represented 288 exactly. However, if the abstract value is too large, the overflow and 289 inexact exceptions are raised and an infinity or maximal finite value is 290 returned. If the abstract value is too small, the input value is rounded to 291 a subnormal number, and the underflow and inexact exceptions are raised if 292 the abstract input cannot be represented exactly as a subnormal single- 293 precision floating-point number. 294 The input significand `zSig' has its binary point between bits 30 295 and 29, which is 7 bits to the left of the usual location. This shifted 296 significand must be normalized or smaller. If `zSig' is not normalized, 297 `zExp' must be 0; in that case, the result returned is a subnormal number, 298 and it must not require rounding. In the usual case that `zSig' is 299 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 300 The handling of underflow and overflow follows the IEC/IEEE Standard for 301 Binary Floating-Point Arithmetic. 302 ------------------------------------------------------------------------------- 303 */ 304 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) 305 { 306 int8 roundingMode; 307 flag roundNearestEven; 308 int8 roundIncrement, roundBits; 309 flag isTiny; 310 311 roundingMode = float_rounding_mode; 312 roundNearestEven = ( roundingMode == float_round_nearest_even ); 313 roundIncrement = 0x40; 314 if ( ! roundNearestEven ) { 315 if ( roundingMode == float_round_to_zero ) { 316 roundIncrement = 0; 317 } 318 else { 319 roundIncrement = 0x7F; 320 if ( zSign ) { 321 if ( roundingMode == float_round_up ) roundIncrement = 0; 322 } 323 else { 324 if ( roundingMode == float_round_down ) roundIncrement = 0; 325 } 326 } 327 } 328 roundBits = zSig & 0x7F; 329 if ( 0xFD <= (bits16) zExp ) { 330 if ( ( 0xFD < zExp ) 331 || ( ( zExp == 0xFD ) 332 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) ) 333 ) { 334 float_raise( float_flag_overflow | float_flag_inexact ); 335 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 ); 336 } 337 if ( zExp < 0 ) { 338 isTiny = 339 ( float_detect_tininess == float_tininess_before_rounding ) 340 || ( zExp < -1 ) 341 || ( zSig + roundIncrement < 0x80000000 ); 342 shift32RightJamming( zSig, - zExp, &zSig ); 343 zExp = 0; 344 roundBits = zSig & 0x7F; 345 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 346 } 347 } 348 if ( roundBits ) float_exception_flags |= float_flag_inexact; 349 zSig = ( zSig + roundIncrement )>>7; 350 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 351 if ( zSig == 0 ) zExp = 0; 352 return packFloat32( zSign, zExp, zSig ); 353 354 } 355 356 /* 357 ------------------------------------------------------------------------------- 358 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 359 and significand `zSig', and returns the proper single-precision floating- 360 point value corresponding to the abstract input. This routine is just like 361 `roundAndPackFloat32' except that `zSig' does not have to be normalized. 362 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 363 floating-point exponent. 364 ------------------------------------------------------------------------------- 365 */ 366 static float32 367 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) 368 { 369 int8 shiftCount; 370 371 shiftCount = countLeadingZeros32( zSig ) - 1; 372 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount ); 373 374 } 375 376 /* 377 ------------------------------------------------------------------------------- 378 Returns the fraction bits of the double-precision floating-point value `a'. 379 ------------------------------------------------------------------------------- 380 */ 381 INLINE bits64 extractFloat64Frac( float64 a ) 382 { 383 384 return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF ); 385 386 } 387 388 /* 389 ------------------------------------------------------------------------------- 390 Returns the exponent bits of the double-precision floating-point value `a'. 391 ------------------------------------------------------------------------------- 392 */ 393 INLINE int16 extractFloat64Exp( float64 a ) 394 { 395 396 return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF; 397 398 } 399 400 /* 401 ------------------------------------------------------------------------------- 402 Returns the sign bit of the double-precision floating-point value `a'. 403 ------------------------------------------------------------------------------- 404 */ 405 INLINE flag extractFloat64Sign( float64 a ) 406 { 407 408 return FLOAT64_DEMANGLE(a)>>63; 409 410 } 411 412 /* 413 ------------------------------------------------------------------------------- 414 Normalizes the subnormal double-precision floating-point value represented 415 by the denormalized significand `aSig'. The normalized exponent and 416 significand are stored at the locations pointed to by `zExpPtr' and 417 `zSigPtr', respectively. 418 ------------------------------------------------------------------------------- 419 */ 420 static void 421 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr ) 422 { 423 int8 shiftCount; 424 425 shiftCount = countLeadingZeros64( aSig ) - 11; 426 *zSigPtr = aSig<<shiftCount; 427 *zExpPtr = 1 - shiftCount; 428 429 } 430 431 /* 432 ------------------------------------------------------------------------------- 433 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 434 double-precision floating-point value, returning the result. After being 435 shifted into the proper positions, the three fields are simply added 436 together to form the result. This means that any integer portion of `zSig' 437 will be added into the exponent. Since a properly normalized significand 438 will have an integer portion equal to 1, the `zExp' input should be 1 less 439 than the desired result exponent whenever `zSig' is a complete, normalized 440 significand. 441 ------------------------------------------------------------------------------- 442 */ 443 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig ) 444 { 445 446 return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) + 447 ( ( (bits64) zExp )<<52 ) + zSig ); 448 449 } 450 451 /* 452 ------------------------------------------------------------------------------- 453 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 454 and significand `zSig', and returns the proper double-precision floating- 455 point value corresponding to the abstract input. Ordinarily, the abstract 456 value is simply rounded and packed into the double-precision format, with 457 the inexact exception raised if the abstract input cannot be represented 458 exactly. However, if the abstract value is too large, the overflow and 459 inexact exceptions are raised and an infinity or maximal finite value is 460 returned. If the abstract value is too small, the input value is rounded to 461 a subnormal number, and the underflow and inexact exceptions are raised if 462 the abstract input cannot be represented exactly as a subnormal double- 463 precision floating-point number. 464 The input significand `zSig' has its binary point between bits 62 465 and 61, which is 10 bits to the left of the usual location. This shifted 466 significand must be normalized or smaller. If `zSig' is not normalized, 467 `zExp' must be 0; in that case, the result returned is a subnormal number, 468 and it must not require rounding. In the usual case that `zSig' is 469 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 470 The handling of underflow and overflow follows the IEC/IEEE Standard for 471 Binary Floating-Point Arithmetic. 472 ------------------------------------------------------------------------------- 473 */ 474 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig ) 475 { 476 int8 roundingMode; 477 flag roundNearestEven; 478 int16 roundIncrement, roundBits; 479 flag isTiny; 480 481 roundingMode = float_rounding_mode; 482 roundNearestEven = ( roundingMode == float_round_nearest_even ); 483 roundIncrement = 0x200; 484 if ( ! roundNearestEven ) { 485 if ( roundingMode == float_round_to_zero ) { 486 roundIncrement = 0; 487 } 488 else { 489 roundIncrement = 0x3FF; 490 if ( zSign ) { 491 if ( roundingMode == float_round_up ) roundIncrement = 0; 492 } 493 else { 494 if ( roundingMode == float_round_down ) roundIncrement = 0; 495 } 496 } 497 } 498 roundBits = zSig & 0x3FF; 499 if ( 0x7FD <= (bits16) zExp ) { 500 if ( ( 0x7FD < zExp ) 501 || ( ( zExp == 0x7FD ) 502 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) ) 503 ) { 504 float_raise( float_flag_overflow | float_flag_inexact ); 505 return FLOAT64_MANGLE( 506 FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) - 507 ( roundIncrement == 0 )); 508 } 509 if ( zExp < 0 ) { 510 isTiny = 511 ( float_detect_tininess == float_tininess_before_rounding ) 512 || ( zExp < -1 ) 513 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 514 shift64RightJamming( zSig, - zExp, &zSig ); 515 zExp = 0; 516 roundBits = zSig & 0x3FF; 517 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 518 } 519 } 520 if ( roundBits ) float_exception_flags |= float_flag_inexact; 521 zSig = ( zSig + roundIncrement )>>10; 522 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 523 if ( zSig == 0 ) zExp = 0; 524 return packFloat64( zSign, zExp, zSig ); 525 526 } 527 528 /* 529 ------------------------------------------------------------------------------- 530 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 531 and significand `zSig', and returns the proper double-precision floating- 532 point value corresponding to the abstract input. This routine is just like 533 `roundAndPackFloat64' except that `zSig' does not have to be normalized. 534 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 535 floating-point exponent. 536 ------------------------------------------------------------------------------- 537 */ 538 static float64 539 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig ) 540 { 541 int8 shiftCount; 542 543 shiftCount = countLeadingZeros64( zSig ) - 1; 544 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount ); 545 546 } 547 548 #ifdef FLOATX80 549 550 /* 551 ------------------------------------------------------------------------------- 552 Returns the fraction bits of the extended double-precision floating-point 553 value `a'. 554 ------------------------------------------------------------------------------- 555 */ 556 INLINE bits64 extractFloatx80Frac( floatx80 a ) 557 { 558 559 return a.low; 560 561 } 562 563 /* 564 ------------------------------------------------------------------------------- 565 Returns the exponent bits of the extended double-precision floating-point 566 value `a'. 567 ------------------------------------------------------------------------------- 568 */ 569 INLINE int32 extractFloatx80Exp( floatx80 a ) 570 { 571 572 return a.high & 0x7FFF; 573 574 } 575 576 /* 577 ------------------------------------------------------------------------------- 578 Returns the sign bit of the extended double-precision floating-point value 579 `a'. 580 ------------------------------------------------------------------------------- 581 */ 582 INLINE flag extractFloatx80Sign( floatx80 a ) 583 { 584 585 return a.high>>15; 586 587 } 588 589 /* 590 ------------------------------------------------------------------------------- 591 Normalizes the subnormal extended double-precision floating-point value 592 represented by the denormalized significand `aSig'. The normalized exponent 593 and significand are stored at the locations pointed to by `zExpPtr' and 594 `zSigPtr', respectively. 595 ------------------------------------------------------------------------------- 596 */ 597 static void 598 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr ) 599 { 600 int8 shiftCount; 601 602 shiftCount = countLeadingZeros64( aSig ); 603 *zSigPtr = aSig<<shiftCount; 604 *zExpPtr = 1 - shiftCount; 605 606 } 607 608 /* 609 ------------------------------------------------------------------------------- 610 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an 611 extended double-precision floating-point value, returning the result. 612 ------------------------------------------------------------------------------- 613 */ 614 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig ) 615 { 616 floatx80 z; 617 618 z.low = zSig; 619 z.high = ( ( (bits16) zSign )<<15 ) + zExp; 620 return z; 621 622 } 623 624 /* 625 ------------------------------------------------------------------------------- 626 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 627 and extended significand formed by the concatenation of `zSig0' and `zSig1', 628 and returns the proper extended double-precision floating-point value 629 corresponding to the abstract input. Ordinarily, the abstract value is 630 rounded and packed into the extended double-precision format, with the 631 inexact exception raised if the abstract input cannot be represented 632 exactly. However, if the abstract value is too large, the overflow and 633 inexact exceptions are raised and an infinity or maximal finite value is 634 returned. If the abstract value is too small, the input value is rounded to 635 a subnormal number, and the underflow and inexact exceptions are raised if 636 the abstract input cannot be represented exactly as a subnormal extended 637 double-precision floating-point number. 638 If `roundingPrecision' is 32 or 64, the result is rounded to the same 639 number of bits as single or double precision, respectively. Otherwise, the 640 result is rounded to the full precision of the extended double-precision 641 format. 642 The input significand must be normalized or smaller. If the input 643 significand is not normalized, `zExp' must be 0; in that case, the result 644 returned is a subnormal number, and it must not require rounding. The 645 handling of underflow and overflow follows the IEC/IEEE Standard for Binary 646 Floating-Point Arithmetic. 647 ------------------------------------------------------------------------------- 648 */ 649 static floatx80 650 roundAndPackFloatx80( 651 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 652 ) 653 { 654 int8 roundingMode; 655 flag roundNearestEven, increment, isTiny; 656 int64 roundIncrement, roundMask, roundBits; 657 658 roundingMode = float_rounding_mode; 659 roundNearestEven = ( roundingMode == float_round_nearest_even ); 660 if ( roundingPrecision == 80 ) goto precision80; 661 if ( roundingPrecision == 64 ) { 662 roundIncrement = LIT64( 0x0000000000000400 ); 663 roundMask = LIT64( 0x00000000000007FF ); 664 } 665 else if ( roundingPrecision == 32 ) { 666 roundIncrement = LIT64( 0x0000008000000000 ); 667 roundMask = LIT64( 0x000000FFFFFFFFFF ); 668 } 669 else { 670 goto precision80; 671 } 672 zSig0 |= ( zSig1 != 0 ); 673 if ( ! roundNearestEven ) { 674 if ( roundingMode == float_round_to_zero ) { 675 roundIncrement = 0; 676 } 677 else { 678 roundIncrement = roundMask; 679 if ( zSign ) { 680 if ( roundingMode == float_round_up ) roundIncrement = 0; 681 } 682 else { 683 if ( roundingMode == float_round_down ) roundIncrement = 0; 684 } 685 } 686 } 687 roundBits = zSig0 & roundMask; 688 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 689 if ( ( 0x7FFE < zExp ) 690 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 691 ) { 692 goto overflow; 693 } 694 if ( zExp <= 0 ) { 695 isTiny = 696 ( float_detect_tininess == float_tininess_before_rounding ) 697 || ( zExp < 0 ) 698 || ( zSig0 <= zSig0 + roundIncrement ); 699 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 700 zExp = 0; 701 roundBits = zSig0 & roundMask; 702 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 703 if ( roundBits ) float_exception_flags |= float_flag_inexact; 704 zSig0 += roundIncrement; 705 if ( (sbits64) zSig0 < 0 ) zExp = 1; 706 roundIncrement = roundMask + 1; 707 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 708 roundMask |= roundIncrement; 709 } 710 zSig0 &= ~ roundMask; 711 return packFloatx80( zSign, zExp, zSig0 ); 712 } 713 } 714 if ( roundBits ) float_exception_flags |= float_flag_inexact; 715 zSig0 += roundIncrement; 716 if ( zSig0 < roundIncrement ) { 717 ++zExp; 718 zSig0 = LIT64( 0x8000000000000000 ); 719 } 720 roundIncrement = roundMask + 1; 721 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 722 roundMask |= roundIncrement; 723 } 724 zSig0 &= ~ roundMask; 725 if ( zSig0 == 0 ) zExp = 0; 726 return packFloatx80( zSign, zExp, zSig0 ); 727 precision80: 728 increment = ( (sbits64) zSig1 < 0 ); 729 if ( ! roundNearestEven ) { 730 if ( roundingMode == float_round_to_zero ) { 731 increment = 0; 732 } 733 else { 734 if ( zSign ) { 735 increment = ( roundingMode == float_round_down ) && zSig1; 736 } 737 else { 738 increment = ( roundingMode == float_round_up ) && zSig1; 739 } 740 } 741 } 742 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 743 if ( ( 0x7FFE < zExp ) 744 || ( ( zExp == 0x7FFE ) 745 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 746 && increment 747 ) 748 ) { 749 roundMask = 0; 750 overflow: 751 float_raise( float_flag_overflow | float_flag_inexact ); 752 if ( ( roundingMode == float_round_to_zero ) 753 || ( zSign && ( roundingMode == float_round_up ) ) 754 || ( ! zSign && ( roundingMode == float_round_down ) ) 755 ) { 756 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 757 } 758 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 759 } 760 if ( zExp <= 0 ) { 761 isTiny = 762 ( float_detect_tininess == float_tininess_before_rounding ) 763 || ( zExp < 0 ) 764 || ! increment 765 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 766 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 767 zExp = 0; 768 if ( isTiny && zSig1 ) float_raise( float_flag_underflow ); 769 if ( zSig1 ) float_exception_flags |= float_flag_inexact; 770 if ( roundNearestEven ) { 771 increment = ( (sbits64) zSig1 < 0 ); 772 } 773 else { 774 if ( zSign ) { 775 increment = ( roundingMode == float_round_down ) && zSig1; 776 } 777 else { 778 increment = ( roundingMode == float_round_up ) && zSig1; 779 } 780 } 781 if ( increment ) { 782 ++zSig0; 783 zSig0 &= 784 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 785 if ( (sbits64) zSig0 < 0 ) zExp = 1; 786 } 787 return packFloatx80( zSign, zExp, zSig0 ); 788 } 789 } 790 if ( zSig1 ) float_exception_flags |= float_flag_inexact; 791 if ( increment ) { 792 ++zSig0; 793 if ( zSig0 == 0 ) { 794 ++zExp; 795 zSig0 = LIT64( 0x8000000000000000 ); 796 } 797 else { 798 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 799 } 800 } 801 else { 802 if ( zSig0 == 0 ) zExp = 0; 803 } 804 return packFloatx80( zSign, zExp, zSig0 ); 805 806 } 807 808 /* 809 ------------------------------------------------------------------------------- 810 Takes an abstract floating-point value having sign `zSign', exponent 811 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 812 and returns the proper extended double-precision floating-point value 813 corresponding to the abstract input. This routine is just like 814 `roundAndPackFloatx80' except that the input significand does not have to be 815 normalized. 816 ------------------------------------------------------------------------------- 817 */ 818 static floatx80 819 normalizeRoundAndPackFloatx80( 820 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 821 ) 822 { 823 int8 shiftCount; 824 825 if ( zSig0 == 0 ) { 826 zSig0 = zSig1; 827 zSig1 = 0; 828 zExp -= 64; 829 } 830 shiftCount = countLeadingZeros64( zSig0 ); 831 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 832 zExp -= shiftCount; 833 return 834 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 ); 835 836 } 837 838 #endif 839 840 #ifdef FLOAT128 841 842 /* 843 ------------------------------------------------------------------------------- 844 Returns the least-significant 64 fraction bits of the quadruple-precision 845 floating-point value `a'. 846 ------------------------------------------------------------------------------- 847 */ 848 INLINE bits64 extractFloat128Frac1( float128 a ) 849 { 850 851 return a.low; 852 853 } 854 855 /* 856 ------------------------------------------------------------------------------- 857 Returns the most-significant 48 fraction bits of the quadruple-precision 858 floating-point value `a'. 859 ------------------------------------------------------------------------------- 860 */ 861 INLINE bits64 extractFloat128Frac0( float128 a ) 862 { 863 864 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 865 866 } 867 868 /* 869 ------------------------------------------------------------------------------- 870 Returns the exponent bits of the quadruple-precision floating-point value 871 `a'. 872 ------------------------------------------------------------------------------- 873 */ 874 INLINE int32 extractFloat128Exp( float128 a ) 875 { 876 877 return ( a.high>>48 ) & 0x7FFF; 878 879 } 880 881 /* 882 ------------------------------------------------------------------------------- 883 Returns the sign bit of the quadruple-precision floating-point value `a'. 884 ------------------------------------------------------------------------------- 885 */ 886 INLINE flag extractFloat128Sign( float128 a ) 887 { 888 889 return a.high>>63; 890 891 } 892 893 /* 894 ------------------------------------------------------------------------------- 895 Normalizes the subnormal quadruple-precision floating-point value 896 represented by the denormalized significand formed by the concatenation of 897 `aSig0' and `aSig1'. The normalized exponent is stored at the location 898 pointed to by `zExpPtr'. The most significant 49 bits of the normalized 899 significand are stored at the location pointed to by `zSig0Ptr', and the 900 least significant 64 bits of the normalized significand are stored at the 901 location pointed to by `zSig1Ptr'. 902 ------------------------------------------------------------------------------- 903 */ 904 static void 905 normalizeFloat128Subnormal( 906 bits64 aSig0, 907 bits64 aSig1, 908 int32 *zExpPtr, 909 bits64 *zSig0Ptr, 910 bits64 *zSig1Ptr 911 ) 912 { 913 int8 shiftCount; 914 915 if ( aSig0 == 0 ) { 916 shiftCount = countLeadingZeros64( aSig1 ) - 15; 917 if ( shiftCount < 0 ) { 918 *zSig0Ptr = aSig1>>( - shiftCount ); 919 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 920 } 921 else { 922 *zSig0Ptr = aSig1<<shiftCount; 923 *zSig1Ptr = 0; 924 } 925 *zExpPtr = - shiftCount - 63; 926 } 927 else { 928 shiftCount = countLeadingZeros64( aSig0 ) - 15; 929 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 930 *zExpPtr = 1 - shiftCount; 931 } 932 933 } 934 935 /* 936 ------------------------------------------------------------------------------- 937 Packs the sign `zSign', the exponent `zExp', and the significand formed 938 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 939 floating-point value, returning the result. After being shifted into the 940 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 941 added together to form the most significant 32 bits of the result. This 942 means that any integer portion of `zSig0' will be added into the exponent. 943 Since a properly normalized significand will have an integer portion equal 944 to 1, the `zExp' input should be 1 less than the desired result exponent 945 whenever `zSig0' and `zSig1' concatenated form a complete, normalized 946 significand. 947 ------------------------------------------------------------------------------- 948 */ 949 INLINE float128 950 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 ) 951 { 952 float128 z; 953 954 z.low = zSig1; 955 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0; 956 return z; 957 958 } 959 960 /* 961 ------------------------------------------------------------------------------- 962 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 963 and extended significand formed by the concatenation of `zSig0', `zSig1', 964 and `zSig2', and returns the proper quadruple-precision floating-point value 965 corresponding to the abstract input. Ordinarily, the abstract value is 966 simply rounded and packed into the quadruple-precision format, with the 967 inexact exception raised if the abstract input cannot be represented 968 exactly. However, if the abstract value is too large, the overflow and 969 inexact exceptions are raised and an infinity or maximal finite value is 970 returned. If the abstract value is too small, the input value is rounded to 971 a subnormal number, and the underflow and inexact exceptions are raised if 972 the abstract input cannot be represented exactly as a subnormal quadruple- 973 precision floating-point number. 974 The input significand must be normalized or smaller. If the input 975 significand is not normalized, `zExp' must be 0; in that case, the result 976 returned is a subnormal number, and it must not require rounding. In the 977 usual case that the input significand is normalized, `zExp' must be 1 less 978 than the ``true'' floating-point exponent. The handling of underflow and 979 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 980 ------------------------------------------------------------------------------- 981 */ 982 static float128 983 roundAndPackFloat128( 984 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 ) 985 { 986 int8 roundingMode; 987 flag roundNearestEven, increment, isTiny; 988 989 roundingMode = float_rounding_mode; 990 roundNearestEven = ( roundingMode == float_round_nearest_even ); 991 increment = ( (sbits64) zSig2 < 0 ); 992 if ( ! roundNearestEven ) { 993 if ( roundingMode == float_round_to_zero ) { 994 increment = 0; 995 } 996 else { 997 if ( zSign ) { 998 increment = ( roundingMode == float_round_down ) && zSig2; 999 } 1000 else { 1001 increment = ( roundingMode == float_round_up ) && zSig2; 1002 } 1003 } 1004 } 1005 if ( 0x7FFD <= (bits32) zExp ) { 1006 if ( ( 0x7FFD < zExp ) 1007 || ( ( zExp == 0x7FFD ) 1008 && eq128( 1009 LIT64( 0x0001FFFFFFFFFFFF ), 1010 LIT64( 0xFFFFFFFFFFFFFFFF ), 1011 zSig0, 1012 zSig1 1013 ) 1014 && increment 1015 ) 1016 ) { 1017 float_raise( float_flag_overflow | float_flag_inexact ); 1018 if ( ( roundingMode == float_round_to_zero ) 1019 || ( zSign && ( roundingMode == float_round_up ) ) 1020 || ( ! zSign && ( roundingMode == float_round_down ) ) 1021 ) { 1022 return 1023 packFloat128( 1024 zSign, 1025 0x7FFE, 1026 LIT64( 0x0000FFFFFFFFFFFF ), 1027 LIT64( 0xFFFFFFFFFFFFFFFF ) 1028 ); 1029 } 1030 return packFloat128( zSign, 0x7FFF, 0, 0 ); 1031 } 1032 if ( zExp < 0 ) { 1033 isTiny = 1034 ( float_detect_tininess == float_tininess_before_rounding ) 1035 || ( zExp < -1 ) 1036 || ! increment 1037 || lt128( 1038 zSig0, 1039 zSig1, 1040 LIT64( 0x0001FFFFFFFFFFFF ), 1041 LIT64( 0xFFFFFFFFFFFFFFFF ) 1042 ); 1043 shift128ExtraRightJamming( 1044 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 1045 zExp = 0; 1046 if ( isTiny && zSig2 ) float_raise( float_flag_underflow ); 1047 if ( roundNearestEven ) { 1048 increment = ( (sbits64) zSig2 < 0 ); 1049 } 1050 else { 1051 if ( zSign ) { 1052 increment = ( roundingMode == float_round_down ) && zSig2; 1053 } 1054 else { 1055 increment = ( roundingMode == float_round_up ) && zSig2; 1056 } 1057 } 1058 } 1059 } 1060 if ( zSig2 ) float_exception_flags |= float_flag_inexact; 1061 if ( increment ) { 1062 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 1063 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 1064 } 1065 else { 1066 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 1067 } 1068 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1069 1070 } 1071 1072 /* 1073 ------------------------------------------------------------------------------- 1074 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1075 and significand formed by the concatenation of `zSig0' and `zSig1', and 1076 returns the proper quadruple-precision floating-point value corresponding 1077 to the abstract input. This routine is just like `roundAndPackFloat128' 1078 except that the input significand has fewer bits and does not have to be 1079 normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 1080 point exponent. 1081 ------------------------------------------------------------------------------- 1082 */ 1083 static float128 1084 normalizeRoundAndPackFloat128( 1085 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 ) 1086 { 1087 int8 shiftCount; 1088 bits64 zSig2; 1089 1090 if ( zSig0 == 0 ) { 1091 zSig0 = zSig1; 1092 zSig1 = 0; 1093 zExp -= 64; 1094 } 1095 shiftCount = countLeadingZeros64( zSig0 ) - 15; 1096 if ( 0 <= shiftCount ) { 1097 zSig2 = 0; 1098 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1099 } 1100 else { 1101 shift128ExtraRightJamming( 1102 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 1103 } 1104 zExp -= shiftCount; 1105 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 1106 1107 } 1108 1109 #endif 1110 1111 /* 1112 ------------------------------------------------------------------------------- 1113 Returns the result of converting the 32-bit two's complement integer `a' 1114 to the single-precision floating-point format. The conversion is performed 1115 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1116 ------------------------------------------------------------------------------- 1117 */ 1118 float32 int32_to_float32( int32 a ) 1119 { 1120 flag zSign; 1121 1122 if ( a == 0 ) return 0; 1123 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 ); 1124 zSign = ( a < 0 ); 1125 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a ); 1126 1127 } 1128 1129 /* 1130 ------------------------------------------------------------------------------- 1131 Returns the result of converting the 32-bit two's complement integer `a' 1132 to the double-precision floating-point format. The conversion is performed 1133 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1134 ------------------------------------------------------------------------------- 1135 */ 1136 float64 int32_to_float64( int32 a ) 1137 { 1138 flag zSign; 1139 uint32 absA; 1140 int8 shiftCount; 1141 bits64 zSig; 1142 1143 if ( a == 0 ) return 0; 1144 zSign = ( a < 0 ); 1145 absA = zSign ? - a : a; 1146 shiftCount = countLeadingZeros32( absA ) + 21; 1147 zSig = absA; 1148 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount ); 1149 1150 } 1151 1152 #ifdef FLOATX80 1153 1154 /* 1155 ------------------------------------------------------------------------------- 1156 Returns the result of converting the 32-bit two's complement integer `a' 1157 to the extended double-precision floating-point format. The conversion 1158 is performed according to the IEC/IEEE Standard for Binary Floating-Point 1159 Arithmetic. 1160 ------------------------------------------------------------------------------- 1161 */ 1162 floatx80 int32_to_floatx80( int32 a ) 1163 { 1164 flag zSign; 1165 uint32 absA; 1166 int8 shiftCount; 1167 bits64 zSig; 1168 1169 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1170 zSign = ( a < 0 ); 1171 absA = zSign ? - a : a; 1172 shiftCount = countLeadingZeros32( absA ) + 32; 1173 zSig = absA; 1174 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 1175 1176 } 1177 1178 #endif 1179 1180 #ifdef FLOAT128 1181 1182 /* 1183 ------------------------------------------------------------------------------- 1184 Returns the result of converting the 32-bit two's complement integer `a' to 1185 the quadruple-precision floating-point format. The conversion is performed 1186 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1187 ------------------------------------------------------------------------------- 1188 */ 1189 float128 int32_to_float128( int32 a ) 1190 { 1191 flag zSign; 1192 uint32 absA; 1193 int8 shiftCount; 1194 bits64 zSig0; 1195 1196 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1197 zSign = ( a < 0 ); 1198 absA = zSign ? - a : a; 1199 shiftCount = countLeadingZeros32( absA ) + 17; 1200 zSig0 = absA; 1201 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 1202 1203 } 1204 1205 #endif 1206 1207 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */ 1208 /* 1209 ------------------------------------------------------------------------------- 1210 Returns the result of converting the 64-bit two's complement integer `a' 1211 to the single-precision floating-point format. The conversion is performed 1212 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1213 ------------------------------------------------------------------------------- 1214 */ 1215 float32 int64_to_float32( int64 a ) 1216 { 1217 flag zSign; 1218 uint64 absA; 1219 int8 shiftCount; 1220 1221 if ( a == 0 ) return 0; 1222 zSign = ( a < 0 ); 1223 absA = zSign ? - a : a; 1224 shiftCount = countLeadingZeros64( absA ) - 40; 1225 if ( 0 <= shiftCount ) { 1226 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount ); 1227 } 1228 else { 1229 shiftCount += 7; 1230 if ( shiftCount < 0 ) { 1231 shift64RightJamming( absA, - shiftCount, &absA ); 1232 } 1233 else { 1234 absA <<= shiftCount; 1235 } 1236 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA ); 1237 } 1238 1239 } 1240 1241 /* 1242 ------------------------------------------------------------------------------- 1243 Returns the result of converting the 64-bit two's complement integer `a' 1244 to the double-precision floating-point format. The conversion is performed 1245 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1246 ------------------------------------------------------------------------------- 1247 */ 1248 float64 int64_to_float64( int64 a ) 1249 { 1250 flag zSign; 1251 1252 if ( a == 0 ) return 0; 1253 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) { 1254 return packFloat64( 1, 0x43E, 0 ); 1255 } 1256 zSign = ( a < 0 ); 1257 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a ); 1258 1259 } 1260 1261 #ifdef FLOATX80 1262 1263 /* 1264 ------------------------------------------------------------------------------- 1265 Returns the result of converting the 64-bit two's complement integer `a' 1266 to the extended double-precision floating-point format. The conversion 1267 is performed according to the IEC/IEEE Standard for Binary Floating-Point 1268 Arithmetic. 1269 ------------------------------------------------------------------------------- 1270 */ 1271 floatx80 int64_to_floatx80( int64 a ) 1272 { 1273 flag zSign; 1274 uint64 absA; 1275 int8 shiftCount; 1276 1277 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1278 zSign = ( a < 0 ); 1279 absA = zSign ? - a : a; 1280 shiftCount = countLeadingZeros64( absA ); 1281 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 1282 1283 } 1284 1285 #endif 1286 1287 #endif /* !SOFTFLOAT_FOR_GCC */ 1288 1289 #ifdef FLOAT128 1290 1291 /* 1292 ------------------------------------------------------------------------------- 1293 Returns the result of converting the 64-bit two's complement integer `a' to 1294 the quadruple-precision floating-point format. The conversion is performed 1295 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1296 ------------------------------------------------------------------------------- 1297 */ 1298 float128 int64_to_float128( int64 a ) 1299 { 1300 flag zSign; 1301 uint64 absA; 1302 int8 shiftCount; 1303 int32 zExp; 1304 bits64 zSig0, zSig1; 1305 1306 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1307 zSign = ( a < 0 ); 1308 absA = zSign ? - a : a; 1309 shiftCount = countLeadingZeros64( absA ) + 49; 1310 zExp = 0x406E - shiftCount; 1311 if ( 64 <= shiftCount ) { 1312 zSig1 = 0; 1313 zSig0 = absA; 1314 shiftCount -= 64; 1315 } 1316 else { 1317 zSig1 = absA; 1318 zSig0 = 0; 1319 } 1320 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1321 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1322 1323 } 1324 1325 #endif 1326 1327 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 1328 /* 1329 ------------------------------------------------------------------------------- 1330 Returns the result of converting the single-precision floating-point value 1331 `a' to the 32-bit two's complement integer format. The conversion is 1332 performed according to the IEC/IEEE Standard for Binary Floating-Point 1333 Arithmetic---which means in particular that the conversion is rounded 1334 according to the current rounding mode. If `a' is a NaN, the largest 1335 positive integer is returned. Otherwise, if the conversion overflows, the 1336 largest integer with the same sign as `a' is returned. 1337 ------------------------------------------------------------------------------- 1338 */ 1339 int32 float32_to_int32( float32 a ) 1340 { 1341 flag aSign; 1342 int16 aExp, shiftCount; 1343 bits32 aSig; 1344 bits64 aSig64; 1345 1346 aSig = extractFloat32Frac( a ); 1347 aExp = extractFloat32Exp( a ); 1348 aSign = extractFloat32Sign( a ); 1349 if ( ( aExp == 0xFF ) && aSig ) aSign = 0; 1350 if ( aExp ) aSig |= 0x00800000; 1351 shiftCount = 0xAF - aExp; 1352 aSig64 = aSig; 1353 aSig64 <<= 32; 1354 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 ); 1355 return roundAndPackInt32( aSign, aSig64 ); 1356 1357 } 1358 #endif /* !SOFTFLOAT_FOR_GCC */ 1359 1360 /* 1361 ------------------------------------------------------------------------------- 1362 Returns the result of converting the single-precision floating-point value 1363 `a' to the 32-bit two's complement integer format. The conversion is 1364 performed according to the IEC/IEEE Standard for Binary Floating-Point 1365 Arithmetic, except that the conversion is always rounded toward zero. 1366 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 1367 the conversion overflows, the largest integer with the same sign as `a' is 1368 returned. 1369 ------------------------------------------------------------------------------- 1370 */ 1371 int32 float32_to_int32_round_to_zero( float32 a ) 1372 { 1373 flag aSign; 1374 int16 aExp, shiftCount; 1375 bits32 aSig; 1376 int32 z; 1377 1378 aSig = extractFloat32Frac( a ); 1379 aExp = extractFloat32Exp( a ); 1380 aSign = extractFloat32Sign( a ); 1381 shiftCount = aExp - 0x9E; 1382 if ( 0 <= shiftCount ) { 1383 if ( a != 0xCF000000 ) { 1384 float_raise( float_flag_invalid ); 1385 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF; 1386 } 1387 return (sbits32) 0x80000000; 1388 } 1389 else if ( aExp <= 0x7E ) { 1390 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact; 1391 return 0; 1392 } 1393 aSig = ( aSig | 0x00800000 )<<8; 1394 z = aSig>>( - shiftCount ); 1395 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) { 1396 float_exception_flags |= float_flag_inexact; 1397 } 1398 if ( aSign ) z = - z; 1399 return z; 1400 1401 } 1402 1403 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */ 1404 /* 1405 ------------------------------------------------------------------------------- 1406 Returns the result of converting the single-precision floating-point value 1407 `a' to the 64-bit two's complement integer format. The conversion is 1408 performed according to the IEC/IEEE Standard for Binary Floating-Point 1409 Arithmetic---which means in particular that the conversion is rounded 1410 according to the current rounding mode. If `a' is a NaN, the largest 1411 positive integer is returned. Otherwise, if the conversion overflows, the 1412 largest integer with the same sign as `a' is returned. 1413 ------------------------------------------------------------------------------- 1414 */ 1415 int64 float32_to_int64( float32 a ) 1416 { 1417 flag aSign; 1418 int16 aExp, shiftCount; 1419 bits32 aSig; 1420 bits64 aSig64, aSigExtra; 1421 1422 aSig = extractFloat32Frac( a ); 1423 aExp = extractFloat32Exp( a ); 1424 aSign = extractFloat32Sign( a ); 1425 shiftCount = 0xBE - aExp; 1426 if ( shiftCount < 0 ) { 1427 float_raise( float_flag_invalid ); 1428 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1429 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1430 } 1431 return (sbits64) LIT64( 0x8000000000000000 ); 1432 } 1433 if ( aExp ) aSig |= 0x00800000; 1434 aSig64 = aSig; 1435 aSig64 <<= 40; 1436 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra ); 1437 return roundAndPackInt64( aSign, aSig64, aSigExtra ); 1438 1439 } 1440 1441 /* 1442 ------------------------------------------------------------------------------- 1443 Returns the result of converting the single-precision floating-point value 1444 `a' to the 64-bit two's complement integer format. The conversion is 1445 performed according to the IEC/IEEE Standard for Binary Floating-Point 1446 Arithmetic, except that the conversion is always rounded toward zero. If 1447 `a' is a NaN, the largest positive integer is returned. Otherwise, if the 1448 conversion overflows, the largest integer with the same sign as `a' is 1449 returned. 1450 ------------------------------------------------------------------------------- 1451 */ 1452 int64 float32_to_int64_round_to_zero( float32 a ) 1453 { 1454 flag aSign; 1455 int16 aExp, shiftCount; 1456 bits32 aSig; 1457 bits64 aSig64; 1458 int64 z; 1459 1460 aSig = extractFloat32Frac( a ); 1461 aExp = extractFloat32Exp( a ); 1462 aSign = extractFloat32Sign( a ); 1463 shiftCount = aExp - 0xBE; 1464 if ( 0 <= shiftCount ) { 1465 if ( a != 0xDF000000 ) { 1466 float_raise( float_flag_invalid ); 1467 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1468 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1469 } 1470 } 1471 return (sbits64) LIT64( 0x8000000000000000 ); 1472 } 1473 else if ( aExp <= 0x7E ) { 1474 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact; 1475 return 0; 1476 } 1477 aSig64 = aSig | 0x00800000; 1478 aSig64 <<= 40; 1479 z = aSig64>>( - shiftCount ); 1480 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) { 1481 float_exception_flags |= float_flag_inexact; 1482 } 1483 if ( aSign ) z = - z; 1484 return z; 1485 1486 } 1487 #endif /* !SOFTFLOAT_FOR_GCC */ 1488 1489 /* 1490 ------------------------------------------------------------------------------- 1491 Returns the result of converting the single-precision floating-point value 1492 `a' to the double-precision floating-point format. The conversion is 1493 performed according to the IEC/IEEE Standard for Binary Floating-Point 1494 Arithmetic. 1495 ------------------------------------------------------------------------------- 1496 */ 1497 float64 float32_to_float64( float32 a ) 1498 { 1499 flag aSign; 1500 int16 aExp; 1501 bits32 aSig; 1502 1503 aSig = extractFloat32Frac( a ); 1504 aExp = extractFloat32Exp( a ); 1505 aSign = extractFloat32Sign( a ); 1506 if ( aExp == 0xFF ) { 1507 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) ); 1508 return packFloat64( aSign, 0x7FF, 0 ); 1509 } 1510 if ( aExp == 0 ) { 1511 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); 1512 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1513 --aExp; 1514 } 1515 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 ); 1516 1517 } 1518 1519 #ifdef FLOATX80 1520 1521 /* 1522 ------------------------------------------------------------------------------- 1523 Returns the result of converting the single-precision floating-point value 1524 `a' to the extended double-precision floating-point format. The conversion 1525 is performed according to the IEC/IEEE Standard for Binary Floating-Point 1526 Arithmetic. 1527 ------------------------------------------------------------------------------- 1528 */ 1529 floatx80 float32_to_floatx80( float32 a ) 1530 { 1531 flag aSign; 1532 int16 aExp; 1533 bits32 aSig; 1534 1535 aSig = extractFloat32Frac( a ); 1536 aExp = extractFloat32Exp( a ); 1537 aSign = extractFloat32Sign( a ); 1538 if ( aExp == 0xFF ) { 1539 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) ); 1540 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 1541 } 1542 if ( aExp == 0 ) { 1543 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 1544 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1545 } 1546 aSig |= 0x00800000; 1547 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 ); 1548 1549 } 1550 1551 #endif 1552 1553 #ifdef FLOAT128 1554 1555 /* 1556 ------------------------------------------------------------------------------- 1557 Returns the result of converting the single-precision floating-point value 1558 `a' to the double-precision floating-point format. The conversion is 1559 performed according to the IEC/IEEE Standard for Binary Floating-Point 1560 Arithmetic. 1561 ------------------------------------------------------------------------------- 1562 */ 1563 float128 float32_to_float128( float32 a ) 1564 { 1565 flag aSign; 1566 int16 aExp; 1567 bits32 aSig; 1568 1569 aSig = extractFloat32Frac( a ); 1570 aExp = extractFloat32Exp( a ); 1571 aSign = extractFloat32Sign( a ); 1572 if ( aExp == 0xFF ) { 1573 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) ); 1574 return packFloat128( aSign, 0x7FFF, 0, 0 ); 1575 } 1576 if ( aExp == 0 ) { 1577 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 1578 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1579 --aExp; 1580 } 1581 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 ); 1582 1583 } 1584 1585 #endif 1586 1587 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 1588 /* 1589 ------------------------------------------------------------------------------- 1590 Rounds the single-precision floating-point value `a' to an integer, and 1591 returns the result as a single-precision floating-point value. The 1592 operation is performed according to the IEC/IEEE Standard for Binary 1593 Floating-Point Arithmetic. 1594 ------------------------------------------------------------------------------- 1595 */ 1596 float32 float32_round_to_int( float32 a ) 1597 { 1598 flag aSign; 1599 int16 aExp; 1600 bits32 lastBitMask, roundBitsMask; 1601 int8 roundingMode; 1602 float32 z; 1603 1604 aExp = extractFloat32Exp( a ); 1605 if ( 0x96 <= aExp ) { 1606 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) { 1607 return propagateFloat32NaN( a, a ); 1608 } 1609 return a; 1610 } 1611 if ( aExp <= 0x7E ) { 1612 if ( (bits32) ( a<<1 ) == 0 ) return a; 1613 float_exception_flags |= float_flag_inexact; 1614 aSign = extractFloat32Sign( a ); 1615 switch ( float_rounding_mode ) { 1616 case float_round_nearest_even: 1617 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) { 1618 return packFloat32( aSign, 0x7F, 0 ); 1619 } 1620 break; 1621 case float_round_to_zero: 1622 break; 1623 case float_round_down: 1624 return aSign ? 0xBF800000 : 0; 1625 case float_round_up: 1626 return aSign ? 0x80000000 : 0x3F800000; 1627 } 1628 return packFloat32( aSign, 0, 0 ); 1629 } 1630 lastBitMask = 1; 1631 lastBitMask <<= 0x96 - aExp; 1632 roundBitsMask = lastBitMask - 1; 1633 z = a; 1634 roundingMode = float_rounding_mode; 1635 if ( roundingMode == float_round_nearest_even ) { 1636 z += lastBitMask>>1; 1637 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 1638 } 1639 else if ( roundingMode != float_round_to_zero ) { 1640 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) { 1641 z += roundBitsMask; 1642 } 1643 } 1644 z &= ~ roundBitsMask; 1645 if ( z != a ) float_exception_flags |= float_flag_inexact; 1646 return z; 1647 1648 } 1649 #endif /* !SOFTFLOAT_FOR_GCC */ 1650 1651 /* 1652 ------------------------------------------------------------------------------- 1653 Returns the result of adding the absolute values of the single-precision 1654 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 1655 before being returned. `zSign' is ignored if the result is a NaN. 1656 The addition is performed according to the IEC/IEEE Standard for Binary 1657 Floating-Point Arithmetic. 1658 ------------------------------------------------------------------------------- 1659 */ 1660 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign ) 1661 { 1662 int16 aExp, bExp, zExp; 1663 bits32 aSig, bSig, zSig; 1664 int16 expDiff; 1665 1666 aSig = extractFloat32Frac( a ); 1667 aExp = extractFloat32Exp( a ); 1668 bSig = extractFloat32Frac( b ); 1669 bExp = extractFloat32Exp( b ); 1670 expDiff = aExp - bExp; 1671 aSig <<= 6; 1672 bSig <<= 6; 1673 if ( 0 < expDiff ) { 1674 if ( aExp == 0xFF ) { 1675 if ( aSig ) return propagateFloat32NaN( a, b ); 1676 return a; 1677 } 1678 if ( bExp == 0 ) { 1679 --expDiff; 1680 } 1681 else { 1682 bSig |= 0x20000000; 1683 } 1684 shift32RightJamming( bSig, expDiff, &bSig ); 1685 zExp = aExp; 1686 } 1687 else if ( expDiff < 0 ) { 1688 if ( bExp == 0xFF ) { 1689 if ( bSig ) return propagateFloat32NaN( a, b ); 1690 return packFloat32( zSign, 0xFF, 0 ); 1691 } 1692 if ( aExp == 0 ) { 1693 ++expDiff; 1694 } 1695 else { 1696 aSig |= 0x20000000; 1697 } 1698 shift32RightJamming( aSig, - expDiff, &aSig ); 1699 zExp = bExp; 1700 } 1701 else { 1702 if ( aExp == 0xFF ) { 1703 if ( aSig | bSig ) return propagateFloat32NaN( a, b ); 1704 return a; 1705 } 1706 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 ); 1707 zSig = 0x40000000 + aSig + bSig; 1708 zExp = aExp; 1709 goto roundAndPack; 1710 } 1711 aSig |= 0x20000000; 1712 zSig = ( aSig + bSig )<<1; 1713 --zExp; 1714 if ( (sbits32) zSig < 0 ) { 1715 zSig = aSig + bSig; 1716 ++zExp; 1717 } 1718 roundAndPack: 1719 return roundAndPackFloat32( zSign, zExp, zSig ); 1720 1721 } 1722 1723 /* 1724 ------------------------------------------------------------------------------- 1725 Returns the result of subtracting the absolute values of the single- 1726 precision floating-point values `a' and `b'. If `zSign' is 1, the 1727 difference is negated before being returned. `zSign' is ignored if the 1728 result is a NaN. The subtraction is performed according to the IEC/IEEE 1729 Standard for Binary Floating-Point Arithmetic. 1730 ------------------------------------------------------------------------------- 1731 */ 1732 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign ) 1733 { 1734 int16 aExp, bExp, zExp; 1735 bits32 aSig, bSig, zSig; 1736 int16 expDiff; 1737 1738 aSig = extractFloat32Frac( a ); 1739 aExp = extractFloat32Exp( a ); 1740 bSig = extractFloat32Frac( b ); 1741 bExp = extractFloat32Exp( b ); 1742 expDiff = aExp - bExp; 1743 aSig <<= 7; 1744 bSig <<= 7; 1745 if ( 0 < expDiff ) goto aExpBigger; 1746 if ( expDiff < 0 ) goto bExpBigger; 1747 if ( aExp == 0xFF ) { 1748 if ( aSig | bSig ) return propagateFloat32NaN( a, b ); 1749 float_raise( float_flag_invalid ); 1750 return float32_default_nan; 1751 } 1752 if ( aExp == 0 ) { 1753 aExp = 1; 1754 bExp = 1; 1755 } 1756 if ( bSig < aSig ) goto aBigger; 1757 if ( aSig < bSig ) goto bBigger; 1758 return packFloat32( float_rounding_mode == float_round_down, 0, 0 ); 1759 bExpBigger: 1760 if ( bExp == 0xFF ) { 1761 if ( bSig ) return propagateFloat32NaN( a, b ); 1762 return packFloat32( zSign ^ 1, 0xFF, 0 ); 1763 } 1764 if ( aExp == 0 ) { 1765 ++expDiff; 1766 } 1767 else { 1768 aSig |= 0x40000000; 1769 } 1770 shift32RightJamming( aSig, - expDiff, &aSig ); 1771 bSig |= 0x40000000; 1772 bBigger: 1773 zSig = bSig - aSig; 1774 zExp = bExp; 1775 zSign ^= 1; 1776 goto normalizeRoundAndPack; 1777 aExpBigger: 1778 if ( aExp == 0xFF ) { 1779 if ( aSig ) return propagateFloat32NaN( a, b ); 1780 return a; 1781 } 1782 if ( bExp == 0 ) { 1783 --expDiff; 1784 } 1785 else { 1786 bSig |= 0x40000000; 1787 } 1788 shift32RightJamming( bSig, expDiff, &bSig ); 1789 aSig |= 0x40000000; 1790 aBigger: 1791 zSig = aSig - bSig; 1792 zExp = aExp; 1793 normalizeRoundAndPack: 1794 --zExp; 1795 return normalizeRoundAndPackFloat32( zSign, zExp, zSig ); 1796 1797 } 1798 1799 /* 1800 ------------------------------------------------------------------------------- 1801 Returns the result of adding the single-precision floating-point values `a' 1802 and `b'. The operation is performed according to the IEC/IEEE Standard for 1803 Binary Floating-Point Arithmetic. 1804 ------------------------------------------------------------------------------- 1805 */ 1806 float32 float32_add( float32 a, float32 b ) 1807 { 1808 flag aSign, bSign; 1809 1810 aSign = extractFloat32Sign( a ); 1811 bSign = extractFloat32Sign( b ); 1812 if ( aSign == bSign ) { 1813 return addFloat32Sigs( a, b, aSign ); 1814 } 1815 else { 1816 return subFloat32Sigs( a, b, aSign ); 1817 } 1818 1819 } 1820 1821 /* 1822 ------------------------------------------------------------------------------- 1823 Returns the result of subtracting the single-precision floating-point values 1824 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1825 for Binary Floating-Point Arithmetic. 1826 ------------------------------------------------------------------------------- 1827 */ 1828 float32 float32_sub( float32 a, float32 b ) 1829 { 1830 flag aSign, bSign; 1831 1832 aSign = extractFloat32Sign( a ); 1833 bSign = extractFloat32Sign( b ); 1834 if ( aSign == bSign ) { 1835 return subFloat32Sigs( a, b, aSign ); 1836 } 1837 else { 1838 return addFloat32Sigs( a, b, aSign ); 1839 } 1840 1841 } 1842 1843 /* 1844 ------------------------------------------------------------------------------- 1845 Returns the result of multiplying the single-precision floating-point values 1846 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1847 for Binary Floating-Point Arithmetic. 1848 ------------------------------------------------------------------------------- 1849 */ 1850 float32 float32_mul( float32 a, float32 b ) 1851 { 1852 flag aSign, bSign, zSign; 1853 int16 aExp, bExp, zExp; 1854 bits32 aSig, bSig; 1855 bits64 zSig64; 1856 bits32 zSig; 1857 1858 aSig = extractFloat32Frac( a ); 1859 aExp = extractFloat32Exp( a ); 1860 aSign = extractFloat32Sign( a ); 1861 bSig = extractFloat32Frac( b ); 1862 bExp = extractFloat32Exp( b ); 1863 bSign = extractFloat32Sign( b ); 1864 zSign = aSign ^ bSign; 1865 if ( aExp == 0xFF ) { 1866 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 1867 return propagateFloat32NaN( a, b ); 1868 } 1869 if ( ( bExp | bSig ) == 0 ) { 1870 float_raise( float_flag_invalid ); 1871 return float32_default_nan; 1872 } 1873 return packFloat32( zSign, 0xFF, 0 ); 1874 } 1875 if ( bExp == 0xFF ) { 1876 if ( bSig ) return propagateFloat32NaN( a, b ); 1877 if ( ( aExp | aSig ) == 0 ) { 1878 float_raise( float_flag_invalid ); 1879 return float32_default_nan; 1880 } 1881 return packFloat32( zSign, 0xFF, 0 ); 1882 } 1883 if ( aExp == 0 ) { 1884 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 1885 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1886 } 1887 if ( bExp == 0 ) { 1888 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 ); 1889 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1890 } 1891 zExp = aExp + bExp - 0x7F; 1892 aSig = ( aSig | 0x00800000 )<<7; 1893 bSig = ( bSig | 0x00800000 )<<8; 1894 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 ); 1895 zSig = zSig64; 1896 if ( 0 <= (sbits32) ( zSig<<1 ) ) { 1897 zSig <<= 1; 1898 --zExp; 1899 } 1900 return roundAndPackFloat32( zSign, zExp, zSig ); 1901 1902 } 1903 1904 /* 1905 ------------------------------------------------------------------------------- 1906 Returns the result of dividing the single-precision floating-point value `a' 1907 by the corresponding value `b'. The operation is performed according to the 1908 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1909 ------------------------------------------------------------------------------- 1910 */ 1911 float32 float32_div( float32 a, float32 b ) 1912 { 1913 flag aSign, bSign, zSign; 1914 int16 aExp, bExp, zExp; 1915 bits32 aSig, bSig, zSig; 1916 1917 aSig = extractFloat32Frac( a ); 1918 aExp = extractFloat32Exp( a ); 1919 aSign = extractFloat32Sign( a ); 1920 bSig = extractFloat32Frac( b ); 1921 bExp = extractFloat32Exp( b ); 1922 bSign = extractFloat32Sign( b ); 1923 zSign = aSign ^ bSign; 1924 if ( aExp == 0xFF ) { 1925 if ( aSig ) return propagateFloat32NaN( a, b ); 1926 if ( bExp == 0xFF ) { 1927 if ( bSig ) return propagateFloat32NaN( a, b ); 1928 float_raise( float_flag_invalid ); 1929 return float32_default_nan; 1930 } 1931 return packFloat32( zSign, 0xFF, 0 ); 1932 } 1933 if ( bExp == 0xFF ) { 1934 if ( bSig ) return propagateFloat32NaN( a, b ); 1935 return packFloat32( zSign, 0, 0 ); 1936 } 1937 if ( bExp == 0 ) { 1938 if ( bSig == 0 ) { 1939 if ( ( aExp | aSig ) == 0 ) { 1940 float_raise( float_flag_invalid ); 1941 return float32_default_nan; 1942 } 1943 float_raise( float_flag_divbyzero ); 1944 return packFloat32( zSign, 0xFF, 0 ); 1945 } 1946 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1947 } 1948 if ( aExp == 0 ) { 1949 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 1950 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1951 } 1952 zExp = aExp - bExp + 0x7D; 1953 aSig = ( aSig | 0x00800000 )<<7; 1954 bSig = ( bSig | 0x00800000 )<<8; 1955 if ( bSig <= ( aSig + aSig ) ) { 1956 aSig >>= 1; 1957 ++zExp; 1958 } 1959 zSig = ( ( (bits64) aSig )<<32 ) / bSig; 1960 if ( ( zSig & 0x3F ) == 0 ) { 1961 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 ); 1962 } 1963 return roundAndPackFloat32( zSign, zExp, zSig ); 1964 1965 } 1966 1967 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 1968 /* 1969 ------------------------------------------------------------------------------- 1970 Returns the remainder of the single-precision floating-point value `a' 1971 with respect to the corresponding value `b'. The operation is performed 1972 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1973 ------------------------------------------------------------------------------- 1974 */ 1975 float32 float32_rem( float32 a, float32 b ) 1976 { 1977 flag aSign, bSign, zSign; 1978 int16 aExp, bExp, expDiff; 1979 bits32 aSig, bSig; 1980 bits32 q; 1981 bits64 aSig64, bSig64, q64; 1982 bits32 alternateASig; 1983 sbits32 sigMean; 1984 1985 aSig = extractFloat32Frac( a ); 1986 aExp = extractFloat32Exp( a ); 1987 aSign = extractFloat32Sign( a ); 1988 bSig = extractFloat32Frac( b ); 1989 bExp = extractFloat32Exp( b ); 1990 bSign = extractFloat32Sign( b ); 1991 if ( aExp == 0xFF ) { 1992 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 1993 return propagateFloat32NaN( a, b ); 1994 } 1995 float_raise( float_flag_invalid ); 1996 return float32_default_nan; 1997 } 1998 if ( bExp == 0xFF ) { 1999 if ( bSig ) return propagateFloat32NaN( a, b ); 2000 return a; 2001 } 2002 if ( bExp == 0 ) { 2003 if ( bSig == 0 ) { 2004 float_raise( float_flag_invalid ); 2005 return float32_default_nan; 2006 } 2007 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 2008 } 2009 if ( aExp == 0 ) { 2010 if ( aSig == 0 ) return a; 2011 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2012 } 2013 expDiff = aExp - bExp; 2014 aSig |= 0x00800000; 2015 bSig |= 0x00800000; 2016 if ( expDiff < 32 ) { 2017 aSig <<= 8; 2018 bSig <<= 8; 2019 if ( expDiff < 0 ) { 2020 if ( expDiff < -1 ) return a; 2021 aSig >>= 1; 2022 } 2023 q = ( bSig <= aSig ); 2024 if ( q ) aSig -= bSig; 2025 if ( 0 < expDiff ) { 2026 q = ( ( (bits64) aSig )<<32 ) / bSig; 2027 q >>= 32 - expDiff; 2028 bSig >>= 2; 2029 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 2030 } 2031 else { 2032 aSig >>= 2; 2033 bSig >>= 2; 2034 } 2035 } 2036 else { 2037 if ( bSig <= aSig ) aSig -= bSig; 2038 aSig64 = ( (bits64) aSig )<<40; 2039 bSig64 = ( (bits64) bSig )<<40; 2040 expDiff -= 64; 2041 while ( 0 < expDiff ) { 2042 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2043 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2044 aSig64 = - ( ( bSig * q64 )<<38 ); 2045 expDiff -= 62; 2046 } 2047 expDiff += 64; 2048 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2049 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2050 q = q64>>( 64 - expDiff ); 2051 bSig <<= 6; 2052 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 2053 } 2054 do { 2055 alternateASig = aSig; 2056 ++q; 2057 aSig -= bSig; 2058 } while ( 0 <= (sbits32) aSig ); 2059 sigMean = aSig + alternateASig; 2060 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 2061 aSig = alternateASig; 2062 } 2063 zSign = ( (sbits32) aSig < 0 ); 2064 if ( zSign ) aSig = - aSig; 2065 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig ); 2066 2067 } 2068 #endif /* !SOFTFLOAT_FOR_GCC */ 2069 2070 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2071 /* 2072 ------------------------------------------------------------------------------- 2073 Returns the square root of the single-precision floating-point value `a'. 2074 The operation is performed according to the IEC/IEEE Standard for Binary 2075 Floating-Point Arithmetic. 2076 ------------------------------------------------------------------------------- 2077 */ 2078 float32 float32_sqrt( float32 a ) 2079 { 2080 flag aSign; 2081 int16 aExp, zExp; 2082 bits32 aSig, zSig; 2083 bits64 rem, term; 2084 2085 aSig = extractFloat32Frac( a ); 2086 aExp = extractFloat32Exp( a ); 2087 aSign = extractFloat32Sign( a ); 2088 if ( aExp == 0xFF ) { 2089 if ( aSig ) return propagateFloat32NaN( a, 0 ); 2090 if ( ! aSign ) return a; 2091 float_raise( float_flag_invalid ); 2092 return float32_default_nan; 2093 } 2094 if ( aSign ) { 2095 if ( ( aExp | aSig ) == 0 ) return a; 2096 float_raise( float_flag_invalid ); 2097 return float32_default_nan; 2098 } 2099 if ( aExp == 0 ) { 2100 if ( aSig == 0 ) return 0; 2101 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2102 } 2103 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E; 2104 aSig = ( aSig | 0x00800000 )<<8; 2105 zSig = estimateSqrt32( aExp, aSig ) + 2; 2106 if ( ( zSig & 0x7F ) <= 5 ) { 2107 if ( zSig < 2 ) { 2108 zSig = 0x7FFFFFFF; 2109 goto roundAndPack; 2110 } 2111 aSig >>= aExp & 1; 2112 term = ( (bits64) zSig ) * zSig; 2113 rem = ( ( (bits64) aSig )<<32 ) - term; 2114 while ( (sbits64) rem < 0 ) { 2115 --zSig; 2116 rem += ( ( (bits64) zSig )<<1 ) | 1; 2117 } 2118 zSig |= ( rem != 0 ); 2119 } 2120 shift32RightJamming( zSig, 1, &zSig ); 2121 roundAndPack: 2122 return roundAndPackFloat32( 0, zExp, zSig ); 2123 2124 } 2125 #endif /* !SOFTFLOAT_FOR_GCC */ 2126 2127 /* 2128 ------------------------------------------------------------------------------- 2129 Returns 1 if the single-precision floating-point value `a' is equal to 2130 the corresponding value `b', and 0 otherwise. The comparison is performed 2131 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2132 ------------------------------------------------------------------------------- 2133 */ 2134 flag float32_eq( float32 a, float32 b ) 2135 { 2136 2137 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2138 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2139 ) { 2140 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2141 float_raise( float_flag_invalid ); 2142 } 2143 return 0; 2144 } 2145 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2146 2147 } 2148 2149 /* 2150 ------------------------------------------------------------------------------- 2151 Returns 1 if the single-precision floating-point value `a' is less than 2152 or equal to the corresponding value `b', and 0 otherwise. The comparison 2153 is performed according to the IEC/IEEE Standard for Binary Floating-Point 2154 Arithmetic. 2155 ------------------------------------------------------------------------------- 2156 */ 2157 flag float32_le( float32 a, float32 b ) 2158 { 2159 flag aSign, bSign; 2160 2161 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2162 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2163 ) { 2164 float_raise( float_flag_invalid ); 2165 return 0; 2166 } 2167 aSign = extractFloat32Sign( a ); 2168 bSign = extractFloat32Sign( b ); 2169 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2170 return ( a == b ) || ( aSign ^ ( a < b ) ); 2171 2172 } 2173 2174 /* 2175 ------------------------------------------------------------------------------- 2176 Returns 1 if the single-precision floating-point value `a' is less than 2177 the corresponding value `b', and 0 otherwise. The comparison is performed 2178 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2179 ------------------------------------------------------------------------------- 2180 */ 2181 flag float32_lt( float32 a, float32 b ) 2182 { 2183 flag aSign, bSign; 2184 2185 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2186 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2187 ) { 2188 float_raise( float_flag_invalid ); 2189 return 0; 2190 } 2191 aSign = extractFloat32Sign( a ); 2192 bSign = extractFloat32Sign( b ); 2193 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); 2194 return ( a != b ) && ( aSign ^ ( a < b ) ); 2195 2196 } 2197 2198 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2199 /* 2200 ------------------------------------------------------------------------------- 2201 Returns 1 if the single-precision floating-point value `a' is equal to 2202 the corresponding value `b', and 0 otherwise. The invalid exception is 2203 raised if either operand is a NaN. Otherwise, the comparison is performed 2204 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2205 ------------------------------------------------------------------------------- 2206 */ 2207 flag float32_eq_signaling( float32 a, float32 b ) 2208 { 2209 2210 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2211 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2212 ) { 2213 float_raise( float_flag_invalid ); 2214 return 0; 2215 } 2216 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2217 2218 } 2219 2220 /* 2221 ------------------------------------------------------------------------------- 2222 Returns 1 if the single-precision floating-point value `a' is less than or 2223 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 2224 cause an exception. Otherwise, the comparison is performed according to the 2225 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2226 ------------------------------------------------------------------------------- 2227 */ 2228 flag float32_le_quiet( float32 a, float32 b ) 2229 { 2230 flag aSign, bSign; 2231 2232 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2233 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2234 ) { 2235 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2236 float_raise( float_flag_invalid ); 2237 } 2238 return 0; 2239 } 2240 aSign = extractFloat32Sign( a ); 2241 bSign = extractFloat32Sign( b ); 2242 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); 2243 return ( a == b ) || ( aSign ^ ( a < b ) ); 2244 2245 } 2246 2247 /* 2248 ------------------------------------------------------------------------------- 2249 Returns 1 if the single-precision floating-point value `a' is less than 2250 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 2251 exception. Otherwise, the comparison is performed according to the IEC/IEEE 2252 Standard for Binary Floating-Point Arithmetic. 2253 ------------------------------------------------------------------------------- 2254 */ 2255 flag float32_lt_quiet( float32 a, float32 b ) 2256 { 2257 flag aSign, bSign; 2258 2259 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2260 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2261 ) { 2262 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2263 float_raise( float_flag_invalid ); 2264 } 2265 return 0; 2266 } 2267 aSign = extractFloat32Sign( a ); 2268 bSign = extractFloat32Sign( b ); 2269 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); 2270 return ( a != b ) && ( aSign ^ ( a < b ) ); 2271 2272 } 2273 #endif /* !SOFTFLOAT_FOR_GCC */ 2274 2275 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2276 /* 2277 ------------------------------------------------------------------------------- 2278 Returns the result of converting the double-precision floating-point value 2279 `a' to the 32-bit two's complement integer format. The conversion is 2280 performed according to the IEC/IEEE Standard for Binary Floating-Point 2281 Arithmetic---which means in particular that the conversion is rounded 2282 according to the current rounding mode. If `a' is a NaN, the largest 2283 positive integer is returned. Otherwise, if the conversion overflows, the 2284 largest integer with the same sign as `a' is returned. 2285 ------------------------------------------------------------------------------- 2286 */ 2287 int32 float64_to_int32( float64 a ) 2288 { 2289 flag aSign; 2290 int16 aExp, shiftCount; 2291 bits64 aSig; 2292 2293 aSig = extractFloat64Frac( a ); 2294 aExp = extractFloat64Exp( a ); 2295 aSign = extractFloat64Sign( a ); 2296 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 2297 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2298 shiftCount = 0x42C - aExp; 2299 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig ); 2300 return roundAndPackInt32( aSign, aSig ); 2301 2302 } 2303 #endif /* !SOFTFLOAT_FOR_GCC */ 2304 2305 /* 2306 ------------------------------------------------------------------------------- 2307 Returns the result of converting the double-precision floating-point value 2308 `a' to the 32-bit two's complement integer format. The conversion is 2309 performed according to the IEC/IEEE Standard for Binary Floating-Point 2310 Arithmetic, except that the conversion is always rounded toward zero. 2311 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2312 the conversion overflows, the largest integer with the same sign as `a' is 2313 returned. 2314 ------------------------------------------------------------------------------- 2315 */ 2316 int32 float64_to_int32_round_to_zero( float64 a ) 2317 { 2318 flag aSign; 2319 int16 aExp, shiftCount; 2320 bits64 aSig, savedASig; 2321 int32 z; 2322 2323 aSig = extractFloat64Frac( a ); 2324 aExp = extractFloat64Exp( a ); 2325 aSign = extractFloat64Sign( a ); 2326 if ( 0x41E < aExp ) { 2327 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 2328 goto invalid; 2329 } 2330 else if ( aExp < 0x3FF ) { 2331 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact; 2332 return 0; 2333 } 2334 aSig |= LIT64( 0x0010000000000000 ); 2335 shiftCount = 0x433 - aExp; 2336 savedASig = aSig; 2337 aSig >>= shiftCount; 2338 z = aSig; 2339 if ( aSign ) z = - z; 2340 if ( ( z < 0 ) ^ aSign ) { 2341 invalid: 2342 float_raise( float_flag_invalid ); 2343 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 2344 } 2345 if ( ( aSig<<shiftCount ) != savedASig ) { 2346 float_exception_flags |= float_flag_inexact; 2347 } 2348 return z; 2349 2350 } 2351 2352 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 2353 /* 2354 ------------------------------------------------------------------------------- 2355 Returns the result of converting the double-precision floating-point value 2356 `a' to the 64-bit two's complement integer format. The conversion is 2357 performed according to the IEC/IEEE Standard for Binary Floating-Point 2358 Arithmetic---which means in particular that the conversion is rounded 2359 according to the current rounding mode. If `a' is a NaN, the largest 2360 positive integer is returned. Otherwise, if the conversion overflows, the 2361 largest integer with the same sign as `a' is returned. 2362 ------------------------------------------------------------------------------- 2363 */ 2364 int64 float64_to_int64( float64 a ) 2365 { 2366 flag aSign; 2367 int16 aExp, shiftCount; 2368 bits64 aSig, aSigExtra; 2369 2370 aSig = extractFloat64Frac( a ); 2371 aExp = extractFloat64Exp( a ); 2372 aSign = extractFloat64Sign( a ); 2373 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2374 shiftCount = 0x433 - aExp; 2375 if ( shiftCount <= 0 ) { 2376 if ( 0x43E < aExp ) { 2377 float_raise( float_flag_invalid ); 2378 if ( ! aSign 2379 || ( ( aExp == 0x7FF ) 2380 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2381 ) { 2382 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2383 } 2384 return (sbits64) LIT64( 0x8000000000000000 ); 2385 } 2386 aSigExtra = 0; 2387 aSig <<= - shiftCount; 2388 } 2389 else { 2390 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 2391 } 2392 return roundAndPackInt64( aSign, aSig, aSigExtra ); 2393 2394 } 2395 2396 /* 2397 ------------------------------------------------------------------------------- 2398 Returns the result of converting the double-precision floating-point value 2399 `a' to the 64-bit two's complement integer format. The conversion is 2400 performed according to the IEC/IEEE Standard for Binary Floating-Point 2401 Arithmetic, except that the conversion is always rounded toward zero. 2402 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2403 the conversion overflows, the largest integer with the same sign as `a' is 2404 returned. 2405 ------------------------------------------------------------------------------- 2406 */ 2407 int64 float64_to_int64_round_to_zero( float64 a ) 2408 { 2409 flag aSign; 2410 int16 aExp, shiftCount; 2411 bits64 aSig; 2412 int64 z; 2413 2414 aSig = extractFloat64Frac( a ); 2415 aExp = extractFloat64Exp( a ); 2416 aSign = extractFloat64Sign( a ); 2417 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2418 shiftCount = aExp - 0x433; 2419 if ( 0 <= shiftCount ) { 2420 if ( 0x43E <= aExp ) { 2421 if ( a != LIT64( 0xC3E0000000000000 ) ) { 2422 float_raise( float_flag_invalid ); 2423 if ( ! aSign 2424 || ( ( aExp == 0x7FF ) 2425 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2426 ) { 2427 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2428 } 2429 } 2430 return (sbits64) LIT64( 0x8000000000000000 ); 2431 } 2432 z = aSig<<shiftCount; 2433 } 2434 else { 2435 if ( aExp < 0x3FE ) { 2436 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact; 2437 return 0; 2438 } 2439 z = aSig>>( - shiftCount ); 2440 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { 2441 float_exception_flags |= float_flag_inexact; 2442 } 2443 } 2444 if ( aSign ) z = - z; 2445 return z; 2446 2447 } 2448 #endif /* !SOFTFLOAT_FOR_GCC */ 2449 2450 /* 2451 ------------------------------------------------------------------------------- 2452 Returns the result of converting the double-precision floating-point value 2453 `a' to the single-precision floating-point format. The conversion is 2454 performed according to the IEC/IEEE Standard for Binary Floating-Point 2455 Arithmetic. 2456 ------------------------------------------------------------------------------- 2457 */ 2458 float32 float64_to_float32( float64 a ) 2459 { 2460 flag aSign; 2461 int16 aExp; 2462 bits64 aSig; 2463 bits32 zSig; 2464 2465 aSig = extractFloat64Frac( a ); 2466 aExp = extractFloat64Exp( a ); 2467 aSign = extractFloat64Sign( a ); 2468 if ( aExp == 0x7FF ) { 2469 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) ); 2470 return packFloat32( aSign, 0xFF, 0 ); 2471 } 2472 shift64RightJamming( aSig, 22, &aSig ); 2473 zSig = aSig; 2474 if ( aExp || zSig ) { 2475 zSig |= 0x40000000; 2476 aExp -= 0x381; 2477 } 2478 return roundAndPackFloat32( aSign, aExp, zSig ); 2479 2480 } 2481 2482 #ifdef FLOATX80 2483 2484 /* 2485 ------------------------------------------------------------------------------- 2486 Returns the result of converting the double-precision floating-point value 2487 `a' to the extended double-precision floating-point format. The conversion 2488 is performed according to the IEC/IEEE Standard for Binary Floating-Point 2489 Arithmetic. 2490 ------------------------------------------------------------------------------- 2491 */ 2492 floatx80 float64_to_floatx80( float64 a ) 2493 { 2494 flag aSign; 2495 int16 aExp; 2496 bits64 aSig; 2497 2498 aSig = extractFloat64Frac( a ); 2499 aExp = extractFloat64Exp( a ); 2500 aSign = extractFloat64Sign( a ); 2501 if ( aExp == 0x7FF ) { 2502 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) ); 2503 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 2504 } 2505 if ( aExp == 0 ) { 2506 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 2507 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2508 } 2509 return 2510 packFloatx80( 2511 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 2512 2513 } 2514 2515 #endif 2516 2517 #ifdef FLOAT128 2518 2519 /* 2520 ------------------------------------------------------------------------------- 2521 Returns the result of converting the double-precision floating-point value 2522 `a' to the quadruple-precision floating-point format. The conversion is 2523 performed according to the IEC/IEEE Standard for Binary Floating-Point 2524 Arithmetic. 2525 ------------------------------------------------------------------------------- 2526 */ 2527 float128 float64_to_float128( float64 a ) 2528 { 2529 flag aSign; 2530 int16 aExp; 2531 bits64 aSig, zSig0, zSig1; 2532 2533 aSig = extractFloat64Frac( a ); 2534 aExp = extractFloat64Exp( a ); 2535 aSign = extractFloat64Sign( a ); 2536 if ( aExp == 0x7FF ) { 2537 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) ); 2538 return packFloat128( aSign, 0x7FFF, 0, 0 ); 2539 } 2540 if ( aExp == 0 ) { 2541 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 2542 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2543 --aExp; 2544 } 2545 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 2546 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 2547 2548 } 2549 2550 #endif 2551 2552 #ifndef SOFTFLOAT_FOR_GCC 2553 /* 2554 ------------------------------------------------------------------------------- 2555 Rounds the double-precision floating-point value `a' to an integer, and 2556 returns the result as a double-precision floating-point value. The 2557 operation is performed according to the IEC/IEEE Standard for Binary 2558 Floating-Point Arithmetic. 2559 ------------------------------------------------------------------------------- 2560 */ 2561 float64 float64_round_to_int( float64 a ) 2562 { 2563 flag aSign; 2564 int16 aExp; 2565 bits64 lastBitMask, roundBitsMask; 2566 int8 roundingMode; 2567 float64 z; 2568 2569 aExp = extractFloat64Exp( a ); 2570 if ( 0x433 <= aExp ) { 2571 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) { 2572 return propagateFloat64NaN( a, a ); 2573 } 2574 return a; 2575 } 2576 if ( aExp < 0x3FF ) { 2577 if ( (bits64) ( a<<1 ) == 0 ) return a; 2578 float_exception_flags |= float_flag_inexact; 2579 aSign = extractFloat64Sign( a ); 2580 switch ( float_rounding_mode ) { 2581 case float_round_nearest_even: 2582 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) { 2583 return packFloat64( aSign, 0x3FF, 0 ); 2584 } 2585 break; 2586 case float_round_to_zero: 2587 break; 2588 case float_round_down: 2589 return aSign ? LIT64( 0xBFF0000000000000 ) : 0; 2590 case float_round_up: 2591 return 2592 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ); 2593 } 2594 return packFloat64( aSign, 0, 0 ); 2595 } 2596 lastBitMask = 1; 2597 lastBitMask <<= 0x433 - aExp; 2598 roundBitsMask = lastBitMask - 1; 2599 z = a; 2600 roundingMode = float_rounding_mode; 2601 if ( roundingMode == float_round_nearest_even ) { 2602 z += lastBitMask>>1; 2603 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 2604 } 2605 else if ( roundingMode != float_round_to_zero ) { 2606 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) { 2607 z += roundBitsMask; 2608 } 2609 } 2610 z &= ~ roundBitsMask; 2611 if ( z != a ) float_exception_flags |= float_flag_inexact; 2612 return z; 2613 2614 } 2615 #endif 2616 2617 /* 2618 ------------------------------------------------------------------------------- 2619 Returns the result of adding the absolute values of the double-precision 2620 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 2621 before being returned. `zSign' is ignored if the result is a NaN. 2622 The addition is performed according to the IEC/IEEE Standard for Binary 2623 Floating-Point Arithmetic. 2624 ------------------------------------------------------------------------------- 2625 */ 2626 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign ) 2627 { 2628 int16 aExp, bExp, zExp; 2629 bits64 aSig, bSig, zSig; 2630 int16 expDiff; 2631 2632 aSig = extractFloat64Frac( a ); 2633 aExp = extractFloat64Exp( a ); 2634 bSig = extractFloat64Frac( b ); 2635 bExp = extractFloat64Exp( b ); 2636 expDiff = aExp - bExp; 2637 aSig <<= 9; 2638 bSig <<= 9; 2639 if ( 0 < expDiff ) { 2640 if ( aExp == 0x7FF ) { 2641 if ( aSig ) return propagateFloat64NaN( a, b ); 2642 return a; 2643 } 2644 if ( bExp == 0 ) { 2645 --expDiff; 2646 } 2647 else { 2648 bSig |= LIT64( 0x2000000000000000 ); 2649 } 2650 shift64RightJamming( bSig, expDiff, &bSig ); 2651 zExp = aExp; 2652 } 2653 else if ( expDiff < 0 ) { 2654 if ( bExp == 0x7FF ) { 2655 if ( bSig ) return propagateFloat64NaN( a, b ); 2656 return packFloat64( zSign, 0x7FF, 0 ); 2657 } 2658 if ( aExp == 0 ) { 2659 ++expDiff; 2660 } 2661 else { 2662 aSig |= LIT64( 0x2000000000000000 ); 2663 } 2664 shift64RightJamming( aSig, - expDiff, &aSig ); 2665 zExp = bExp; 2666 } 2667 else { 2668 if ( aExp == 0x7FF ) { 2669 if ( aSig | bSig ) return propagateFloat64NaN( a, b ); 2670 return a; 2671 } 2672 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 ); 2673 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig; 2674 zExp = aExp; 2675 goto roundAndPack; 2676 } 2677 aSig |= LIT64( 0x2000000000000000 ); 2678 zSig = ( aSig + bSig )<<1; 2679 --zExp; 2680 if ( (sbits64) zSig < 0 ) { 2681 zSig = aSig + bSig; 2682 ++zExp; 2683 } 2684 roundAndPack: 2685 return roundAndPackFloat64( zSign, zExp, zSig ); 2686 2687 } 2688 2689 /* 2690 ------------------------------------------------------------------------------- 2691 Returns the result of subtracting the absolute values of the double- 2692 precision floating-point values `a' and `b'. If `zSign' is 1, the 2693 difference is negated before being returned. `zSign' is ignored if the 2694 result is a NaN. The subtraction is performed according to the IEC/IEEE 2695 Standard for Binary Floating-Point Arithmetic. 2696 ------------------------------------------------------------------------------- 2697 */ 2698 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign ) 2699 { 2700 int16 aExp, bExp, zExp; 2701 bits64 aSig, bSig, zSig; 2702 int16 expDiff; 2703 2704 aSig = extractFloat64Frac( a ); 2705 aExp = extractFloat64Exp( a ); 2706 bSig = extractFloat64Frac( b ); 2707 bExp = extractFloat64Exp( b ); 2708 expDiff = aExp - bExp; 2709 aSig <<= 10; 2710 bSig <<= 10; 2711 if ( 0 < expDiff ) goto aExpBigger; 2712 if ( expDiff < 0 ) goto bExpBigger; 2713 if ( aExp == 0x7FF ) { 2714 if ( aSig | bSig ) return propagateFloat64NaN( a, b ); 2715 float_raise( float_flag_invalid ); 2716 return float64_default_nan; 2717 } 2718 if ( aExp == 0 ) { 2719 aExp = 1; 2720 bExp = 1; 2721 } 2722 if ( bSig < aSig ) goto aBigger; 2723 if ( aSig < bSig ) goto bBigger; 2724 return packFloat64( float_rounding_mode == float_round_down, 0, 0 ); 2725 bExpBigger: 2726 if ( bExp == 0x7FF ) { 2727 if ( bSig ) return propagateFloat64NaN( a, b ); 2728 return packFloat64( zSign ^ 1, 0x7FF, 0 ); 2729 } 2730 if ( aExp == 0 ) { 2731 ++expDiff; 2732 } 2733 else { 2734 aSig |= LIT64( 0x4000000000000000 ); 2735 } 2736 shift64RightJamming( aSig, - expDiff, &aSig ); 2737 bSig |= LIT64( 0x4000000000000000 ); 2738 bBigger: 2739 zSig = bSig - aSig; 2740 zExp = bExp; 2741 zSign ^= 1; 2742 goto normalizeRoundAndPack; 2743 aExpBigger: 2744 if ( aExp == 0x7FF ) { 2745 if ( aSig ) return propagateFloat64NaN( a, b ); 2746 return a; 2747 } 2748 if ( bExp == 0 ) { 2749 --expDiff; 2750 } 2751 else { 2752 bSig |= LIT64( 0x4000000000000000 ); 2753 } 2754 shift64RightJamming( bSig, expDiff, &bSig ); 2755 aSig |= LIT64( 0x4000000000000000 ); 2756 aBigger: 2757 zSig = aSig - bSig; 2758 zExp = aExp; 2759 normalizeRoundAndPack: 2760 --zExp; 2761 return normalizeRoundAndPackFloat64( zSign, zExp, zSig ); 2762 2763 } 2764 2765 /* 2766 ------------------------------------------------------------------------------- 2767 Returns the result of adding the double-precision floating-point values `a' 2768 and `b'. The operation is performed according to the IEC/IEEE Standard for 2769 Binary Floating-Point Arithmetic. 2770 ------------------------------------------------------------------------------- 2771 */ 2772 float64 float64_add( float64 a, float64 b ) 2773 { 2774 flag aSign, bSign; 2775 2776 aSign = extractFloat64Sign( a ); 2777 bSign = extractFloat64Sign( b ); 2778 if ( aSign == bSign ) { 2779 return addFloat64Sigs( a, b, aSign ); 2780 } 2781 else { 2782 return subFloat64Sigs( a, b, aSign ); 2783 } 2784 2785 } 2786 2787 /* 2788 ------------------------------------------------------------------------------- 2789 Returns the result of subtracting the double-precision floating-point values 2790 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2791 for Binary Floating-Point Arithmetic. 2792 ------------------------------------------------------------------------------- 2793 */ 2794 float64 float64_sub( float64 a, float64 b ) 2795 { 2796 flag aSign, bSign; 2797 2798 aSign = extractFloat64Sign( a ); 2799 bSign = extractFloat64Sign( b ); 2800 if ( aSign == bSign ) { 2801 return subFloat64Sigs( a, b, aSign ); 2802 } 2803 else { 2804 return addFloat64Sigs( a, b, aSign ); 2805 } 2806 2807 } 2808 2809 /* 2810 ------------------------------------------------------------------------------- 2811 Returns the result of multiplying the double-precision floating-point values 2812 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2813 for Binary Floating-Point Arithmetic. 2814 ------------------------------------------------------------------------------- 2815 */ 2816 float64 float64_mul( float64 a, float64 b ) 2817 { 2818 flag aSign, bSign, zSign; 2819 int16 aExp, bExp, zExp; 2820 bits64 aSig, bSig, zSig0, zSig1; 2821 2822 aSig = extractFloat64Frac( a ); 2823 aExp = extractFloat64Exp( a ); 2824 aSign = extractFloat64Sign( a ); 2825 bSig = extractFloat64Frac( b ); 2826 bExp = extractFloat64Exp( b ); 2827 bSign = extractFloat64Sign( b ); 2828 zSign = aSign ^ bSign; 2829 if ( aExp == 0x7FF ) { 2830 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 2831 return propagateFloat64NaN( a, b ); 2832 } 2833 if ( ( bExp | bSig ) == 0 ) { 2834 float_raise( float_flag_invalid ); 2835 return float64_default_nan; 2836 } 2837 return packFloat64( zSign, 0x7FF, 0 ); 2838 } 2839 if ( bExp == 0x7FF ) { 2840 if ( bSig ) return propagateFloat64NaN( a, b ); 2841 if ( ( aExp | aSig ) == 0 ) { 2842 float_raise( float_flag_invalid ); 2843 return float64_default_nan; 2844 } 2845 return packFloat64( zSign, 0x7FF, 0 ); 2846 } 2847 if ( aExp == 0 ) { 2848 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 2849 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2850 } 2851 if ( bExp == 0 ) { 2852 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 ); 2853 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2854 } 2855 zExp = aExp + bExp - 0x3FF; 2856 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 2857 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2858 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 2859 zSig0 |= ( zSig1 != 0 ); 2860 if ( 0 <= (sbits64) ( zSig0<<1 ) ) { 2861 zSig0 <<= 1; 2862 --zExp; 2863 } 2864 return roundAndPackFloat64( zSign, zExp, zSig0 ); 2865 2866 } 2867 2868 /* 2869 ------------------------------------------------------------------------------- 2870 Returns the result of dividing the double-precision floating-point value `a' 2871 by the corresponding value `b'. The operation is performed according to 2872 the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2873 ------------------------------------------------------------------------------- 2874 */ 2875 float64 float64_div( float64 a, float64 b ) 2876 { 2877 flag aSign, bSign, zSign; 2878 int16 aExp, bExp, zExp; 2879 bits64 aSig, bSig, zSig; 2880 bits64 rem0, rem1; 2881 bits64 term0, term1; 2882 2883 aSig = extractFloat64Frac( a ); 2884 aExp = extractFloat64Exp( a ); 2885 aSign = extractFloat64Sign( a ); 2886 bSig = extractFloat64Frac( b ); 2887 bExp = extractFloat64Exp( b ); 2888 bSign = extractFloat64Sign( b ); 2889 zSign = aSign ^ bSign; 2890 if ( aExp == 0x7FF ) { 2891 if ( aSig ) return propagateFloat64NaN( a, b ); 2892 if ( bExp == 0x7FF ) { 2893 if ( bSig ) return propagateFloat64NaN( a, b ); 2894 float_raise( float_flag_invalid ); 2895 return float64_default_nan; 2896 } 2897 return packFloat64( zSign, 0x7FF, 0 ); 2898 } 2899 if ( bExp == 0x7FF ) { 2900 if ( bSig ) return propagateFloat64NaN( a, b ); 2901 return packFloat64( zSign, 0, 0 ); 2902 } 2903 if ( bExp == 0 ) { 2904 if ( bSig == 0 ) { 2905 if ( ( aExp | aSig ) == 0 ) { 2906 float_raise( float_flag_invalid ); 2907 return float64_default_nan; 2908 } 2909 float_raise( float_flag_divbyzero ); 2910 return packFloat64( zSign, 0x7FF, 0 ); 2911 } 2912 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2913 } 2914 if ( aExp == 0 ) { 2915 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 2916 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2917 } 2918 zExp = aExp - bExp + 0x3FD; 2919 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 2920 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2921 if ( bSig <= ( aSig + aSig ) ) { 2922 aSig >>= 1; 2923 ++zExp; 2924 } 2925 zSig = estimateDiv128To64( aSig, 0, bSig ); 2926 if ( ( zSig & 0x1FF ) <= 2 ) { 2927 mul64To128( bSig, zSig, &term0, &term1 ); 2928 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 2929 while ( (sbits64) rem0 < 0 ) { 2930 --zSig; 2931 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 2932 } 2933 zSig |= ( rem1 != 0 ); 2934 } 2935 return roundAndPackFloat64( zSign, zExp, zSig ); 2936 2937 } 2938 2939 #ifndef SOFTFLOAT_FOR_GCC 2940 /* 2941 ------------------------------------------------------------------------------- 2942 Returns the remainder of the double-precision floating-point value `a' 2943 with respect to the corresponding value `b'. The operation is performed 2944 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2945 ------------------------------------------------------------------------------- 2946 */ 2947 float64 float64_rem( float64 a, float64 b ) 2948 { 2949 flag aSign, bSign, zSign; 2950 int16 aExp, bExp, expDiff; 2951 bits64 aSig, bSig; 2952 bits64 q, alternateASig; 2953 sbits64 sigMean; 2954 2955 aSig = extractFloat64Frac( a ); 2956 aExp = extractFloat64Exp( a ); 2957 aSign = extractFloat64Sign( a ); 2958 bSig = extractFloat64Frac( b ); 2959 bExp = extractFloat64Exp( b ); 2960 bSign = extractFloat64Sign( b ); 2961 if ( aExp == 0x7FF ) { 2962 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 2963 return propagateFloat64NaN( a, b ); 2964 } 2965 float_raise( float_flag_invalid ); 2966 return float64_default_nan; 2967 } 2968 if ( bExp == 0x7FF ) { 2969 if ( bSig ) return propagateFloat64NaN( a, b ); 2970 return a; 2971 } 2972 if ( bExp == 0 ) { 2973 if ( bSig == 0 ) { 2974 float_raise( float_flag_invalid ); 2975 return float64_default_nan; 2976 } 2977 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 2978 } 2979 if ( aExp == 0 ) { 2980 if ( aSig == 0 ) return a; 2981 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2982 } 2983 expDiff = aExp - bExp; 2984 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 2985 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 2986 if ( expDiff < 0 ) { 2987 if ( expDiff < -1 ) return a; 2988 aSig >>= 1; 2989 } 2990 q = ( bSig <= aSig ); 2991 if ( q ) aSig -= bSig; 2992 expDiff -= 64; 2993 while ( 0 < expDiff ) { 2994 q = estimateDiv128To64( aSig, 0, bSig ); 2995 q = ( 2 < q ) ? q - 2 : 0; 2996 aSig = - ( ( bSig>>2 ) * q ); 2997 expDiff -= 62; 2998 } 2999 expDiff += 64; 3000 if ( 0 < expDiff ) { 3001 q = estimateDiv128To64( aSig, 0, bSig ); 3002 q = ( 2 < q ) ? q - 2 : 0; 3003 q >>= 64 - expDiff; 3004 bSig >>= 2; 3005 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 3006 } 3007 else { 3008 aSig >>= 2; 3009 bSig >>= 2; 3010 } 3011 do { 3012 alternateASig = aSig; 3013 ++q; 3014 aSig -= bSig; 3015 } while ( 0 <= (sbits64) aSig ); 3016 sigMean = aSig + alternateASig; 3017 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 3018 aSig = alternateASig; 3019 } 3020 zSign = ( (sbits64) aSig < 0 ); 3021 if ( zSign ) aSig = - aSig; 3022 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig ); 3023 3024 } 3025 3026 /* 3027 ------------------------------------------------------------------------------- 3028 Returns the square root of the double-precision floating-point value `a'. 3029 The operation is performed according to the IEC/IEEE Standard for Binary 3030 Floating-Point Arithmetic. 3031 ------------------------------------------------------------------------------- 3032 */ 3033 float64 float64_sqrt( float64 a ) 3034 { 3035 flag aSign; 3036 int16 aExp, zExp; 3037 bits64 aSig, zSig, doubleZSig; 3038 bits64 rem0, rem1, term0, term1; 3039 3040 aSig = extractFloat64Frac( a ); 3041 aExp = extractFloat64Exp( a ); 3042 aSign = extractFloat64Sign( a ); 3043 if ( aExp == 0x7FF ) { 3044 if ( aSig ) return propagateFloat64NaN( a, a ); 3045 if ( ! aSign ) return a; 3046 float_raise( float_flag_invalid ); 3047 return float64_default_nan; 3048 } 3049 if ( aSign ) { 3050 if ( ( aExp | aSig ) == 0 ) return a; 3051 float_raise( float_flag_invalid ); 3052 return float64_default_nan; 3053 } 3054 if ( aExp == 0 ) { 3055 if ( aSig == 0 ) return 0; 3056 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3057 } 3058 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; 3059 aSig |= LIT64( 0x0010000000000000 ); 3060 zSig = estimateSqrt32( aExp, aSig>>21 ); 3061 aSig <<= 9 - ( aExp & 1 ); 3062 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 ); 3063 if ( ( zSig & 0x1FF ) <= 5 ) { 3064 doubleZSig = zSig<<1; 3065 mul64To128( zSig, zSig, &term0, &term1 ); 3066 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 3067 while ( (sbits64) rem0 < 0 ) { 3068 --zSig; 3069 doubleZSig -= 2; 3070 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 ); 3071 } 3072 zSig |= ( ( rem0 | rem1 ) != 0 ); 3073 } 3074 return roundAndPackFloat64( 0, zExp, zSig ); 3075 3076 } 3077 #endif 3078 3079 /* 3080 ------------------------------------------------------------------------------- 3081 Returns 1 if the double-precision floating-point value `a' is equal to the 3082 corresponding value `b', and 0 otherwise. The comparison is performed 3083 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3084 ------------------------------------------------------------------------------- 3085 */ 3086 flag float64_eq( float64 a, float64 b ) 3087 { 3088 3089 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3090 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3091 ) { 3092 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3093 float_raise( float_flag_invalid ); 3094 } 3095 return 0; 3096 } 3097 return ( a == b ) || 3098 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 ); 3099 3100 } 3101 3102 /* 3103 ------------------------------------------------------------------------------- 3104 Returns 1 if the double-precision floating-point value `a' is less than or 3105 equal to the corresponding value `b', and 0 otherwise. The comparison is 3106 performed according to the IEC/IEEE Standard for Binary Floating-Point 3107 Arithmetic. 3108 ------------------------------------------------------------------------------- 3109 */ 3110 flag float64_le( float64 a, float64 b ) 3111 { 3112 flag aSign, bSign; 3113 3114 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3115 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3116 ) { 3117 float_raise( float_flag_invalid ); 3118 return 0; 3119 } 3120 aSign = extractFloat64Sign( a ); 3121 bSign = extractFloat64Sign( b ); 3122 if ( aSign != bSign ) 3123 return aSign || 3124 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 3125 0 ); 3126 return ( a == b ) || 3127 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) ); 3128 3129 } 3130 3131 /* 3132 ------------------------------------------------------------------------------- 3133 Returns 1 if the double-precision floating-point value `a' is less than 3134 the corresponding value `b', and 0 otherwise. The comparison is performed 3135 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3136 ------------------------------------------------------------------------------- 3137 */ 3138 flag float64_lt( float64 a, float64 b ) 3139 { 3140 flag aSign, bSign; 3141 3142 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3143 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3144 ) { 3145 float_raise( float_flag_invalid ); 3146 return 0; 3147 } 3148 aSign = extractFloat64Sign( a ); 3149 bSign = extractFloat64Sign( b ); 3150 if ( aSign != bSign ) 3151 return aSign && 3152 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) != 3153 0 ); 3154 return ( a != b ) && 3155 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) ); 3156 3157 } 3158 3159 #ifndef SOFTFLOAT_FOR_GCC 3160 /* 3161 ------------------------------------------------------------------------------- 3162 Returns 1 if the double-precision floating-point value `a' is equal to the 3163 corresponding value `b', and 0 otherwise. The invalid exception is raised 3164 if either operand is a NaN. Otherwise, the comparison is performed 3165 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3166 ------------------------------------------------------------------------------- 3167 */ 3168 flag float64_eq_signaling( float64 a, float64 b ) 3169 { 3170 3171 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3172 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3173 ) { 3174 float_raise( float_flag_invalid ); 3175 return 0; 3176 } 3177 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 ); 3178 3179 } 3180 3181 /* 3182 ------------------------------------------------------------------------------- 3183 Returns 1 if the double-precision floating-point value `a' is less than or 3184 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 3185 cause an exception. Otherwise, the comparison is performed according to the 3186 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3187 ------------------------------------------------------------------------------- 3188 */ 3189 flag float64_le_quiet( float64 a, float64 b ) 3190 { 3191 flag aSign, bSign; 3192 3193 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3194 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3195 ) { 3196 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3197 float_raise( float_flag_invalid ); 3198 } 3199 return 0; 3200 } 3201 aSign = extractFloat64Sign( a ); 3202 bSign = extractFloat64Sign( b ); 3203 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 ); 3204 return ( a == b ) || ( aSign ^ ( a < b ) ); 3205 3206 } 3207 3208 /* 3209 ------------------------------------------------------------------------------- 3210 Returns 1 if the double-precision floating-point value `a' is less than 3211 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3212 exception. Otherwise, the comparison is performed according to the IEC/IEEE 3213 Standard for Binary Floating-Point Arithmetic. 3214 ------------------------------------------------------------------------------- 3215 */ 3216 flag float64_lt_quiet( float64 a, float64 b ) 3217 { 3218 flag aSign, bSign; 3219 3220 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3221 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3222 ) { 3223 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3224 float_raise( float_flag_invalid ); 3225 } 3226 return 0; 3227 } 3228 aSign = extractFloat64Sign( a ); 3229 bSign = extractFloat64Sign( b ); 3230 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 ); 3231 return ( a != b ) && ( aSign ^ ( a < b ) ); 3232 3233 } 3234 #endif 3235 3236 #ifdef FLOATX80 3237 3238 /* 3239 ------------------------------------------------------------------------------- 3240 Returns the result of converting the extended double-precision floating- 3241 point value `a' to the 32-bit two's complement integer format. The 3242 conversion is performed according to the IEC/IEEE Standard for Binary 3243 Floating-Point Arithmetic---which means in particular that the conversion 3244 is rounded according to the current rounding mode. If `a' is a NaN, the 3245 largest positive integer is returned. Otherwise, if the conversion 3246 overflows, the largest integer with the same sign as `a' is returned. 3247 ------------------------------------------------------------------------------- 3248 */ 3249 int32 floatx80_to_int32( floatx80 a ) 3250 { 3251 flag aSign; 3252 int32 aExp, shiftCount; 3253 bits64 aSig; 3254 3255 aSig = extractFloatx80Frac( a ); 3256 aExp = extractFloatx80Exp( a ); 3257 aSign = extractFloatx80Sign( a ); 3258 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; 3259 shiftCount = 0x4037 - aExp; 3260 if ( shiftCount <= 0 ) shiftCount = 1; 3261 shift64RightJamming( aSig, shiftCount, &aSig ); 3262 return roundAndPackInt32( aSign, aSig ); 3263 3264 } 3265 3266 /* 3267 ------------------------------------------------------------------------------- 3268 Returns the result of converting the extended double-precision floating- 3269 point value `a' to the 32-bit two's complement integer format. The 3270 conversion is performed according to the IEC/IEEE Standard for Binary 3271 Floating-Point Arithmetic, except that the conversion is always rounded 3272 toward zero. If `a' is a NaN, the largest positive integer is returned. 3273 Otherwise, if the conversion overflows, the largest integer with the same 3274 sign as `a' is returned. 3275 ------------------------------------------------------------------------------- 3276 */ 3277 int32 floatx80_to_int32_round_to_zero( floatx80 a ) 3278 { 3279 flag aSign; 3280 int32 aExp, shiftCount; 3281 bits64 aSig, savedASig; 3282 int32 z; 3283 3284 aSig = extractFloatx80Frac( a ); 3285 aExp = extractFloatx80Exp( a ); 3286 aSign = extractFloatx80Sign( a ); 3287 if ( 0x401E < aExp ) { 3288 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; 3289 goto invalid; 3290 } 3291 else if ( aExp < 0x3FFF ) { 3292 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact; 3293 return 0; 3294 } 3295 shiftCount = 0x403E - aExp; 3296 savedASig = aSig; 3297 aSig >>= shiftCount; 3298 z = aSig; 3299 if ( aSign ) z = - z; 3300 if ( ( z < 0 ) ^ aSign ) { 3301 invalid: 3302 float_raise( float_flag_invalid ); 3303 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 3304 } 3305 if ( ( aSig<<shiftCount ) != savedASig ) { 3306 float_exception_flags |= float_flag_inexact; 3307 } 3308 return z; 3309 3310 } 3311 3312 /* 3313 ------------------------------------------------------------------------------- 3314 Returns the result of converting the extended double-precision floating- 3315 point value `a' to the 64-bit two's complement integer format. The 3316 conversion is performed according to the IEC/IEEE Standard for Binary 3317 Floating-Point Arithmetic---which means in particular that the conversion 3318 is rounded according to the current rounding mode. If `a' is a NaN, 3319 the largest positive integer is returned. Otherwise, if the conversion 3320 overflows, the largest integer with the same sign as `a' is returned. 3321 ------------------------------------------------------------------------------- 3322 */ 3323 int64 floatx80_to_int64( floatx80 a ) 3324 { 3325 flag aSign; 3326 int32 aExp, shiftCount; 3327 bits64 aSig, aSigExtra; 3328 3329 aSig = extractFloatx80Frac( a ); 3330 aExp = extractFloatx80Exp( a ); 3331 aSign = extractFloatx80Sign( a ); 3332 shiftCount = 0x403E - aExp; 3333 if ( shiftCount <= 0 ) { 3334 if ( shiftCount ) { 3335 float_raise( float_flag_invalid ); 3336 if ( ! aSign 3337 || ( ( aExp == 0x7FFF ) 3338 && ( aSig != LIT64( 0x8000000000000000 ) ) ) 3339 ) { 3340 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3341 } 3342 return (sbits64) LIT64( 0x8000000000000000 ); 3343 } 3344 aSigExtra = 0; 3345 } 3346 else { 3347 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 3348 } 3349 return roundAndPackInt64( aSign, aSig, aSigExtra ); 3350 3351 } 3352 3353 /* 3354 ------------------------------------------------------------------------------- 3355 Returns the result of converting the extended double-precision floating- 3356 point value `a' to the 64-bit two's complement integer format. The 3357 conversion is performed according to the IEC/IEEE Standard for Binary 3358 Floating-Point Arithmetic, except that the conversion is always rounded 3359 toward zero. If `a' is a NaN, the largest positive integer is returned. 3360 Otherwise, if the conversion overflows, the largest integer with the same 3361 sign as `a' is returned. 3362 ------------------------------------------------------------------------------- 3363 */ 3364 int64 floatx80_to_int64_round_to_zero( floatx80 a ) 3365 { 3366 flag aSign; 3367 int32 aExp, shiftCount; 3368 bits64 aSig; 3369 int64 z; 3370 3371 aSig = extractFloatx80Frac( a ); 3372 aExp = extractFloatx80Exp( a ); 3373 aSign = extractFloatx80Sign( a ); 3374 shiftCount = aExp - 0x403E; 3375 if ( 0 <= shiftCount ) { 3376 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 3377 if ( ( a.high != 0xC03E ) || aSig ) { 3378 float_raise( float_flag_invalid ); 3379 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 3380 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3381 } 3382 } 3383 return (sbits64) LIT64( 0x8000000000000000 ); 3384 } 3385 else if ( aExp < 0x3FFF ) { 3386 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact; 3387 return 0; 3388 } 3389 z = aSig>>( - shiftCount ); 3390 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { 3391 float_exception_flags |= float_flag_inexact; 3392 } 3393 if ( aSign ) z = - z; 3394 return z; 3395 3396 } 3397 3398 /* 3399 ------------------------------------------------------------------------------- 3400 Returns the result of converting the extended double-precision floating- 3401 point value `a' to the single-precision floating-point format. The 3402 conversion is performed according to the IEC/IEEE Standard for Binary 3403 Floating-Point Arithmetic. 3404 ------------------------------------------------------------------------------- 3405 */ 3406 float32 floatx80_to_float32( floatx80 a ) 3407 { 3408 flag aSign; 3409 int32 aExp; 3410 bits64 aSig; 3411 3412 aSig = extractFloatx80Frac( a ); 3413 aExp = extractFloatx80Exp( a ); 3414 aSign = extractFloatx80Sign( a ); 3415 if ( aExp == 0x7FFF ) { 3416 if ( (bits64) ( aSig<<1 ) ) { 3417 return commonNaNToFloat32( floatx80ToCommonNaN( a ) ); 3418 } 3419 return packFloat32( aSign, 0xFF, 0 ); 3420 } 3421 shift64RightJamming( aSig, 33, &aSig ); 3422 if ( aExp || aSig ) aExp -= 0x3F81; 3423 return roundAndPackFloat32( aSign, aExp, aSig ); 3424 3425 } 3426 3427 /* 3428 ------------------------------------------------------------------------------- 3429 Returns the result of converting the extended double-precision floating- 3430 point value `a' to the double-precision floating-point format. The 3431 conversion is performed according to the IEC/IEEE Standard for Binary 3432 Floating-Point Arithmetic. 3433 ------------------------------------------------------------------------------- 3434 */ 3435 float64 floatx80_to_float64( floatx80 a ) 3436 { 3437 flag aSign; 3438 int32 aExp; 3439 bits64 aSig, zSig; 3440 3441 aSig = extractFloatx80Frac( a ); 3442 aExp = extractFloatx80Exp( a ); 3443 aSign = extractFloatx80Sign( a ); 3444 if ( aExp == 0x7FFF ) { 3445 if ( (bits64) ( aSig<<1 ) ) { 3446 return commonNaNToFloat64( floatx80ToCommonNaN( a ) ); 3447 } 3448 return packFloat64( aSign, 0x7FF, 0 ); 3449 } 3450 shift64RightJamming( aSig, 1, &zSig ); 3451 if ( aExp || aSig ) aExp -= 0x3C01; 3452 return roundAndPackFloat64( aSign, aExp, zSig ); 3453 3454 } 3455 3456 #ifdef FLOAT128 3457 3458 /* 3459 ------------------------------------------------------------------------------- 3460 Returns the result of converting the extended double-precision floating- 3461 point value `a' to the quadruple-precision floating-point format. The 3462 conversion is performed according to the IEC/IEEE Standard for Binary 3463 Floating-Point Arithmetic. 3464 ------------------------------------------------------------------------------- 3465 */ 3466 float128 floatx80_to_float128( floatx80 a ) 3467 { 3468 flag aSign; 3469 int16 aExp; 3470 bits64 aSig, zSig0, zSig1; 3471 3472 aSig = extractFloatx80Frac( a ); 3473 aExp = extractFloatx80Exp( a ); 3474 aSign = extractFloatx80Sign( a ); 3475 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) { 3476 return commonNaNToFloat128( floatx80ToCommonNaN( a ) ); 3477 } 3478 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 3479 return packFloat128( aSign, aExp, zSig0, zSig1 ); 3480 3481 } 3482 3483 #endif 3484 3485 /* 3486 ------------------------------------------------------------------------------- 3487 Rounds the extended double-precision floating-point value `a' to an integer, 3488 and returns the result as an extended quadruple-precision floating-point 3489 value. The operation is performed according to the IEC/IEEE Standard for 3490 Binary Floating-Point Arithmetic. 3491 ------------------------------------------------------------------------------- 3492 */ 3493 floatx80 floatx80_round_to_int( floatx80 a ) 3494 { 3495 flag aSign; 3496 int32 aExp; 3497 bits64 lastBitMask, roundBitsMask; 3498 int8 roundingMode; 3499 floatx80 z; 3500 3501 aExp = extractFloatx80Exp( a ); 3502 if ( 0x403E <= aExp ) { 3503 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) { 3504 return propagateFloatx80NaN( a, a ); 3505 } 3506 return a; 3507 } 3508 if ( aExp < 0x3FFF ) { 3509 if ( ( aExp == 0 ) 3510 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 3511 return a; 3512 } 3513 float_exception_flags |= float_flag_inexact; 3514 aSign = extractFloatx80Sign( a ); 3515 switch ( float_rounding_mode ) { 3516 case float_round_nearest_even: 3517 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 ) 3518 ) { 3519 return 3520 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 3521 } 3522 break; 3523 case float_round_to_zero: 3524 break; 3525 case float_round_down: 3526 return 3527 aSign ? 3528 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 3529 : packFloatx80( 0, 0, 0 ); 3530 case float_round_up: 3531 return 3532 aSign ? packFloatx80( 1, 0, 0 ) 3533 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 3534 } 3535 return packFloatx80( aSign, 0, 0 ); 3536 } 3537 lastBitMask = 1; 3538 lastBitMask <<= 0x403E - aExp; 3539 roundBitsMask = lastBitMask - 1; 3540 z = a; 3541 roundingMode = float_rounding_mode; 3542 if ( roundingMode == float_round_nearest_even ) { 3543 z.low += lastBitMask>>1; 3544 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 3545 } 3546 else if ( roundingMode != float_round_to_zero ) { 3547 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) { 3548 z.low += roundBitsMask; 3549 } 3550 } 3551 z.low &= ~ roundBitsMask; 3552 if ( z.low == 0 ) { 3553 ++z.high; 3554 z.low = LIT64( 0x8000000000000000 ); 3555 } 3556 if ( z.low != a.low ) float_exception_flags |= float_flag_inexact; 3557 return z; 3558 3559 } 3560 3561 /* 3562 ------------------------------------------------------------------------------- 3563 Returns the result of adding the absolute values of the extended double- 3564 precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 3565 negated before being returned. `zSign' is ignored if the result is a NaN. 3566 The addition is performed according to the IEC/IEEE Standard for Binary 3567 Floating-Point Arithmetic. 3568 ------------------------------------------------------------------------------- 3569 */ 3570 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign ) 3571 { 3572 int32 aExp, bExp, zExp; 3573 bits64 aSig, bSig, zSig0, zSig1; 3574 int32 expDiff; 3575 3576 aSig = extractFloatx80Frac( a ); 3577 aExp = extractFloatx80Exp( a ); 3578 bSig = extractFloatx80Frac( b ); 3579 bExp = extractFloatx80Exp( b ); 3580 expDiff = aExp - bExp; 3581 if ( 0 < expDiff ) { 3582 if ( aExp == 0x7FFF ) { 3583 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3584 return a; 3585 } 3586 if ( bExp == 0 ) --expDiff; 3587 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 3588 zExp = aExp; 3589 } 3590 else if ( expDiff < 0 ) { 3591 if ( bExp == 0x7FFF ) { 3592 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3593 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3594 } 3595 if ( aExp == 0 ) ++expDiff; 3596 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 3597 zExp = bExp; 3598 } 3599 else { 3600 if ( aExp == 0x7FFF ) { 3601 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { 3602 return propagateFloatx80NaN( a, b ); 3603 } 3604 return a; 3605 } 3606 zSig1 = 0; 3607 zSig0 = aSig + bSig; 3608 if ( aExp == 0 ) { 3609 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 3610 goto roundAndPack; 3611 } 3612 zExp = aExp; 3613 goto shiftRight1; 3614 } 3615 zSig0 = aSig + bSig; 3616 if ( (sbits64) zSig0 < 0 ) goto roundAndPack; 3617 shiftRight1: 3618 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 3619 zSig0 |= LIT64( 0x8000000000000000 ); 3620 ++zExp; 3621 roundAndPack: 3622 return 3623 roundAndPackFloatx80( 3624 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3625 3626 } 3627 3628 /* 3629 ------------------------------------------------------------------------------- 3630 Returns the result of subtracting the absolute values of the extended 3631 double-precision floating-point values `a' and `b'. If `zSign' is 1, the 3632 difference is negated before being returned. `zSign' is ignored if the 3633 result is a NaN. The subtraction is performed according to the IEC/IEEE 3634 Standard for Binary Floating-Point Arithmetic. 3635 ------------------------------------------------------------------------------- 3636 */ 3637 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign ) 3638 { 3639 int32 aExp, bExp, zExp; 3640 bits64 aSig, bSig, zSig0, zSig1; 3641 int32 expDiff; 3642 floatx80 z; 3643 3644 aSig = extractFloatx80Frac( a ); 3645 aExp = extractFloatx80Exp( a ); 3646 bSig = extractFloatx80Frac( b ); 3647 bExp = extractFloatx80Exp( b ); 3648 expDiff = aExp - bExp; 3649 if ( 0 < expDiff ) goto aExpBigger; 3650 if ( expDiff < 0 ) goto bExpBigger; 3651 if ( aExp == 0x7FFF ) { 3652 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { 3653 return propagateFloatx80NaN( a, b ); 3654 } 3655 float_raise( float_flag_invalid ); 3656 z.low = floatx80_default_nan_low; 3657 z.high = floatx80_default_nan_high; 3658 return z; 3659 } 3660 if ( aExp == 0 ) { 3661 aExp = 1; 3662 bExp = 1; 3663 } 3664 zSig1 = 0; 3665 if ( bSig < aSig ) goto aBigger; 3666 if ( aSig < bSig ) goto bBigger; 3667 return packFloatx80( float_rounding_mode == float_round_down, 0, 0 ); 3668 bExpBigger: 3669 if ( bExp == 0x7FFF ) { 3670 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3671 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3672 } 3673 if ( aExp == 0 ) ++expDiff; 3674 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 3675 bBigger: 3676 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 3677 zExp = bExp; 3678 zSign ^= 1; 3679 goto normalizeRoundAndPack; 3680 aExpBigger: 3681 if ( aExp == 0x7FFF ) { 3682 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3683 return a; 3684 } 3685 if ( bExp == 0 ) --expDiff; 3686 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 3687 aBigger: 3688 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 3689 zExp = aExp; 3690 normalizeRoundAndPack: 3691 return 3692 normalizeRoundAndPackFloatx80( 3693 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3694 3695 } 3696 3697 /* 3698 ------------------------------------------------------------------------------- 3699 Returns the result of adding the extended double-precision floating-point 3700 values `a' and `b'. The operation is performed according to the IEC/IEEE 3701 Standard for Binary Floating-Point Arithmetic. 3702 ------------------------------------------------------------------------------- 3703 */ 3704 floatx80 floatx80_add( floatx80 a, floatx80 b ) 3705 { 3706 flag aSign, bSign; 3707 3708 aSign = extractFloatx80Sign( a ); 3709 bSign = extractFloatx80Sign( b ); 3710 if ( aSign == bSign ) { 3711 return addFloatx80Sigs( a, b, aSign ); 3712 } 3713 else { 3714 return subFloatx80Sigs( a, b, aSign ); 3715 } 3716 3717 } 3718 3719 /* 3720 ------------------------------------------------------------------------------- 3721 Returns the result of subtracting the extended double-precision floating- 3722 point values `a' and `b'. The operation is performed according to the 3723 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3724 ------------------------------------------------------------------------------- 3725 */ 3726 floatx80 floatx80_sub( floatx80 a, floatx80 b ) 3727 { 3728 flag aSign, bSign; 3729 3730 aSign = extractFloatx80Sign( a ); 3731 bSign = extractFloatx80Sign( b ); 3732 if ( aSign == bSign ) { 3733 return subFloatx80Sigs( a, b, aSign ); 3734 } 3735 else { 3736 return addFloatx80Sigs( a, b, aSign ); 3737 } 3738 3739 } 3740 3741 /* 3742 ------------------------------------------------------------------------------- 3743 Returns the result of multiplying the extended double-precision floating- 3744 point values `a' and `b'. The operation is performed according to the 3745 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3746 ------------------------------------------------------------------------------- 3747 */ 3748 floatx80 floatx80_mul( floatx80 a, floatx80 b ) 3749 { 3750 flag aSign, bSign, zSign; 3751 int32 aExp, bExp, zExp; 3752 bits64 aSig, bSig, zSig0, zSig1; 3753 floatx80 z; 3754 3755 aSig = extractFloatx80Frac( a ); 3756 aExp = extractFloatx80Exp( a ); 3757 aSign = extractFloatx80Sign( a ); 3758 bSig = extractFloatx80Frac( b ); 3759 bExp = extractFloatx80Exp( b ); 3760 bSign = extractFloatx80Sign( b ); 3761 zSign = aSign ^ bSign; 3762 if ( aExp == 0x7FFF ) { 3763 if ( (bits64) ( aSig<<1 ) 3764 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { 3765 return propagateFloatx80NaN( a, b ); 3766 } 3767 if ( ( bExp | bSig ) == 0 ) goto invalid; 3768 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3769 } 3770 if ( bExp == 0x7FFF ) { 3771 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3772 if ( ( aExp | aSig ) == 0 ) { 3773 invalid: 3774 float_raise( float_flag_invalid ); 3775 z.low = floatx80_default_nan_low; 3776 z.high = floatx80_default_nan_high; 3777 return z; 3778 } 3779 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3780 } 3781 if ( aExp == 0 ) { 3782 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3783 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 3784 } 3785 if ( bExp == 0 ) { 3786 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3787 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3788 } 3789 zExp = aExp + bExp - 0x3FFE; 3790 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 3791 if ( 0 < (sbits64) zSig0 ) { 3792 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 3793 --zExp; 3794 } 3795 return 3796 roundAndPackFloatx80( 3797 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3798 3799 } 3800 3801 /* 3802 ------------------------------------------------------------------------------- 3803 Returns the result of dividing the extended double-precision floating-point 3804 value `a' by the corresponding value `b'. The operation is performed 3805 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3806 ------------------------------------------------------------------------------- 3807 */ 3808 floatx80 floatx80_div( floatx80 a, floatx80 b ) 3809 { 3810 flag aSign, bSign, zSign; 3811 int32 aExp, bExp, zExp; 3812 bits64 aSig, bSig, zSig0, zSig1; 3813 bits64 rem0, rem1, rem2, term0, term1, term2; 3814 floatx80 z; 3815 3816 aSig = extractFloatx80Frac( a ); 3817 aExp = extractFloatx80Exp( a ); 3818 aSign = extractFloatx80Sign( a ); 3819 bSig = extractFloatx80Frac( b ); 3820 bExp = extractFloatx80Exp( b ); 3821 bSign = extractFloatx80Sign( b ); 3822 zSign = aSign ^ bSign; 3823 if ( aExp == 0x7FFF ) { 3824 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3825 if ( bExp == 0x7FFF ) { 3826 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3827 goto invalid; 3828 } 3829 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3830 } 3831 if ( bExp == 0x7FFF ) { 3832 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3833 return packFloatx80( zSign, 0, 0 ); 3834 } 3835 if ( bExp == 0 ) { 3836 if ( bSig == 0 ) { 3837 if ( ( aExp | aSig ) == 0 ) { 3838 invalid: 3839 float_raise( float_flag_invalid ); 3840 z.low = floatx80_default_nan_low; 3841 z.high = floatx80_default_nan_high; 3842 return z; 3843 } 3844 float_raise( float_flag_divbyzero ); 3845 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3846 } 3847 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3848 } 3849 if ( aExp == 0 ) { 3850 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 3851 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 3852 } 3853 zExp = aExp - bExp + 0x3FFE; 3854 rem1 = 0; 3855 if ( bSig <= aSig ) { 3856 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 3857 ++zExp; 3858 } 3859 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 3860 mul64To128( bSig, zSig0, &term0, &term1 ); 3861 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 3862 while ( (sbits64) rem0 < 0 ) { 3863 --zSig0; 3864 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 3865 } 3866 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 3867 if ( (bits64) ( zSig1<<1 ) <= 8 ) { 3868 mul64To128( bSig, zSig1, &term1, &term2 ); 3869 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 3870 while ( (sbits64) rem1 < 0 ) { 3871 --zSig1; 3872 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 3873 } 3874 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 3875 } 3876 return 3877 roundAndPackFloatx80( 3878 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); 3879 3880 } 3881 3882 /* 3883 ------------------------------------------------------------------------------- 3884 Returns the remainder of the extended double-precision floating-point value 3885 `a' with respect to the corresponding value `b'. The operation is performed 3886 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3887 ------------------------------------------------------------------------------- 3888 */ 3889 floatx80 floatx80_rem( floatx80 a, floatx80 b ) 3890 { 3891 flag aSign, bSign, zSign; 3892 int32 aExp, bExp, expDiff; 3893 bits64 aSig0, aSig1, bSig; 3894 bits64 q, term0, term1, alternateASig0, alternateASig1; 3895 floatx80 z; 3896 3897 aSig0 = extractFloatx80Frac( a ); 3898 aExp = extractFloatx80Exp( a ); 3899 aSign = extractFloatx80Sign( a ); 3900 bSig = extractFloatx80Frac( b ); 3901 bExp = extractFloatx80Exp( b ); 3902 bSign = extractFloatx80Sign( b ); 3903 if ( aExp == 0x7FFF ) { 3904 if ( (bits64) ( aSig0<<1 ) 3905 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { 3906 return propagateFloatx80NaN( a, b ); 3907 } 3908 goto invalid; 3909 } 3910 if ( bExp == 0x7FFF ) { 3911 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); 3912 return a; 3913 } 3914 if ( bExp == 0 ) { 3915 if ( bSig == 0 ) { 3916 invalid: 3917 float_raise( float_flag_invalid ); 3918 z.low = floatx80_default_nan_low; 3919 z.high = floatx80_default_nan_high; 3920 return z; 3921 } 3922 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 3923 } 3924 if ( aExp == 0 ) { 3925 if ( (bits64) ( aSig0<<1 ) == 0 ) return a; 3926 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 3927 } 3928 bSig |= LIT64( 0x8000000000000000 ); 3929 zSign = aSign; 3930 expDiff = aExp - bExp; 3931 aSig1 = 0; 3932 if ( expDiff < 0 ) { 3933 if ( expDiff < -1 ) return a; 3934 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 3935 expDiff = 0; 3936 } 3937 q = ( bSig <= aSig0 ); 3938 if ( q ) aSig0 -= bSig; 3939 expDiff -= 64; 3940 while ( 0 < expDiff ) { 3941 q = estimateDiv128To64( aSig0, aSig1, bSig ); 3942 q = ( 2 < q ) ? q - 2 : 0; 3943 mul64To128( bSig, q, &term0, &term1 ); 3944 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 3945 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 3946 expDiff -= 62; 3947 } 3948 expDiff += 64; 3949 if ( 0 < expDiff ) { 3950 q = estimateDiv128To64( aSig0, aSig1, bSig ); 3951 q = ( 2 < q ) ? q - 2 : 0; 3952 q >>= 64 - expDiff; 3953 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 3954 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 3955 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 3956 while ( le128( term0, term1, aSig0, aSig1 ) ) { 3957 ++q; 3958 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 3959 } 3960 } 3961 else { 3962 term1 = 0; 3963 term0 = bSig; 3964 } 3965 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 3966 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 3967 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 3968 && ( q & 1 ) ) 3969 ) { 3970 aSig0 = alternateASig0; 3971 aSig1 = alternateASig1; 3972 zSign = ! zSign; 3973 } 3974 return 3975 normalizeRoundAndPackFloatx80( 3976 80, zSign, bExp + expDiff, aSig0, aSig1 ); 3977 3978 } 3979 3980 /* 3981 ------------------------------------------------------------------------------- 3982 Returns the square root of the extended double-precision floating-point 3983 value `a'. The operation is performed according to the IEC/IEEE Standard 3984 for Binary Floating-Point Arithmetic. 3985 ------------------------------------------------------------------------------- 3986 */ 3987 floatx80 floatx80_sqrt( floatx80 a ) 3988 { 3989 flag aSign; 3990 int32 aExp, zExp; 3991 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0; 3992 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 3993 floatx80 z; 3994 3995 aSig0 = extractFloatx80Frac( a ); 3996 aExp = extractFloatx80Exp( a ); 3997 aSign = extractFloatx80Sign( a ); 3998 if ( aExp == 0x7FFF ) { 3999 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a ); 4000 if ( ! aSign ) return a; 4001 goto invalid; 4002 } 4003 if ( aSign ) { 4004 if ( ( aExp | aSig0 ) == 0 ) return a; 4005 invalid: 4006 float_raise( float_flag_invalid ); 4007 z.low = floatx80_default_nan_low; 4008 z.high = floatx80_default_nan_high; 4009 return z; 4010 } 4011 if ( aExp == 0 ) { 4012 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 4013 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 4014 } 4015 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 4016 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 4017 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 4018 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 4019 doubleZSig0 = zSig0<<1; 4020 mul64To128( zSig0, zSig0, &term0, &term1 ); 4021 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 4022 while ( (sbits64) rem0 < 0 ) { 4023 --zSig0; 4024 doubleZSig0 -= 2; 4025 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 4026 } 4027 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 4028 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 4029 if ( zSig1 == 0 ) zSig1 = 1; 4030 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 4031 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 4032 mul64To128( zSig1, zSig1, &term2, &term3 ); 4033 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 4034 while ( (sbits64) rem1 < 0 ) { 4035 --zSig1; 4036 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 4037 term3 |= 1; 4038 term2 |= doubleZSig0; 4039 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 4040 } 4041 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 4042 } 4043 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 4044 zSig0 |= doubleZSig0; 4045 return 4046 roundAndPackFloatx80( 4047 floatx80_rounding_precision, 0, zExp, zSig0, zSig1 ); 4048 4049 } 4050 4051 /* 4052 ------------------------------------------------------------------------------- 4053 Returns 1 if the extended double-precision floating-point value `a' is 4054 equal to the corresponding value `b', and 0 otherwise. The comparison is 4055 performed according to the IEC/IEEE Standard for Binary Floating-Point 4056 Arithmetic. 4057 ------------------------------------------------------------------------------- 4058 */ 4059 flag floatx80_eq( floatx80 a, floatx80 b ) 4060 { 4061 4062 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4063 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4064 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4065 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4066 ) { 4067 if ( floatx80_is_signaling_nan( a ) 4068 || floatx80_is_signaling_nan( b ) ) { 4069 float_raise( float_flag_invalid ); 4070 } 4071 return 0; 4072 } 4073 return 4074 ( a.low == b.low ) 4075 && ( ( a.high == b.high ) 4076 || ( ( a.low == 0 ) 4077 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) 4078 ); 4079 4080 } 4081 4082 /* 4083 ------------------------------------------------------------------------------- 4084 Returns 1 if the extended double-precision floating-point value `a' is 4085 less than or equal to the corresponding value `b', and 0 otherwise. The 4086 comparison is performed according to the IEC/IEEE Standard for Binary 4087 Floating-Point Arithmetic. 4088 ------------------------------------------------------------------------------- 4089 */ 4090 flag floatx80_le( floatx80 a, floatx80 b ) 4091 { 4092 flag aSign, bSign; 4093 4094 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4095 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4096 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4097 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4098 ) { 4099 float_raise( float_flag_invalid ); 4100 return 0; 4101 } 4102 aSign = extractFloatx80Sign( a ); 4103 bSign = extractFloatx80Sign( b ); 4104 if ( aSign != bSign ) { 4105 return 4106 aSign 4107 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4108 == 0 ); 4109 } 4110 return 4111 aSign ? le128( b.high, b.low, a.high, a.low ) 4112 : le128( a.high, a.low, b.high, b.low ); 4113 4114 } 4115 4116 /* 4117 ------------------------------------------------------------------------------- 4118 Returns 1 if the extended double-precision floating-point value `a' is 4119 less than the corresponding value `b', and 0 otherwise. The comparison 4120 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4121 Arithmetic. 4122 ------------------------------------------------------------------------------- 4123 */ 4124 flag floatx80_lt( floatx80 a, floatx80 b ) 4125 { 4126 flag aSign, bSign; 4127 4128 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4129 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4130 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4131 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4132 ) { 4133 float_raise( float_flag_invalid ); 4134 return 0; 4135 } 4136 aSign = extractFloatx80Sign( a ); 4137 bSign = extractFloatx80Sign( b ); 4138 if ( aSign != bSign ) { 4139 return 4140 aSign 4141 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4142 != 0 ); 4143 } 4144 return 4145 aSign ? lt128( b.high, b.low, a.high, a.low ) 4146 : lt128( a.high, a.low, b.high, b.low ); 4147 4148 } 4149 4150 /* 4151 ------------------------------------------------------------------------------- 4152 Returns 1 if the extended double-precision floating-point value `a' is equal 4153 to the corresponding value `b', and 0 otherwise. The invalid exception is 4154 raised if either operand is a NaN. Otherwise, the comparison is performed 4155 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4156 ------------------------------------------------------------------------------- 4157 */ 4158 flag floatx80_eq_signaling( floatx80 a, floatx80 b ) 4159 { 4160 4161 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4162 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4163 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4164 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4165 ) { 4166 float_raise( float_flag_invalid ); 4167 return 0; 4168 } 4169 return 4170 ( a.low == b.low ) 4171 && ( ( a.high == b.high ) 4172 || ( ( a.low == 0 ) 4173 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) 4174 ); 4175 4176 } 4177 4178 /* 4179 ------------------------------------------------------------------------------- 4180 Returns 1 if the extended double-precision floating-point value `a' is less 4181 than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs 4182 do not cause an exception. Otherwise, the comparison is performed according 4183 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4184 ------------------------------------------------------------------------------- 4185 */ 4186 flag floatx80_le_quiet( floatx80 a, floatx80 b ) 4187 { 4188 flag aSign, bSign; 4189 4190 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4191 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4192 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4193 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4194 ) { 4195 if ( floatx80_is_signaling_nan( a ) 4196 || floatx80_is_signaling_nan( b ) ) { 4197 float_raise( float_flag_invalid ); 4198 } 4199 return 0; 4200 } 4201 aSign = extractFloatx80Sign( a ); 4202 bSign = extractFloatx80Sign( b ); 4203 if ( aSign != bSign ) { 4204 return 4205 aSign 4206 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4207 == 0 ); 4208 } 4209 return 4210 aSign ? le128( b.high, b.low, a.high, a.low ) 4211 : le128( a.high, a.low, b.high, b.low ); 4212 4213 } 4214 4215 /* 4216 ------------------------------------------------------------------------------- 4217 Returns 1 if the extended double-precision floating-point value `a' is less 4218 than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause 4219 an exception. Otherwise, the comparison is performed according to the 4220 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4221 ------------------------------------------------------------------------------- 4222 */ 4223 flag floatx80_lt_quiet( floatx80 a, floatx80 b ) 4224 { 4225 flag aSign, bSign; 4226 4227 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4228 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4229 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4230 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4231 ) { 4232 if ( floatx80_is_signaling_nan( a ) 4233 || floatx80_is_signaling_nan( b ) ) { 4234 float_raise( float_flag_invalid ); 4235 } 4236 return 0; 4237 } 4238 aSign = extractFloatx80Sign( a ); 4239 bSign = extractFloatx80Sign( b ); 4240 if ( aSign != bSign ) { 4241 return 4242 aSign 4243 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4244 != 0 ); 4245 } 4246 return 4247 aSign ? lt128( b.high, b.low, a.high, a.low ) 4248 : lt128( a.high, a.low, b.high, b.low ); 4249 4250 } 4251 4252 #endif 4253 4254 #ifdef FLOAT128 4255 4256 /* 4257 ------------------------------------------------------------------------------- 4258 Returns the result of converting the quadruple-precision floating-point 4259 value `a' to the 32-bit two's complement integer format. The conversion 4260 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4261 Arithmetic---which means in particular that the conversion is rounded 4262 according to the current rounding mode. If `a' is a NaN, the largest 4263 positive integer is returned. Otherwise, if the conversion overflows, the 4264 largest integer with the same sign as `a' is returned. 4265 ------------------------------------------------------------------------------- 4266 */ 4267 int32 float128_to_int32( float128 a ) 4268 { 4269 flag aSign; 4270 int32 aExp, shiftCount; 4271 bits64 aSig0, aSig1; 4272 4273 aSig1 = extractFloat128Frac1( a ); 4274 aSig0 = extractFloat128Frac0( a ); 4275 aExp = extractFloat128Exp( a ); 4276 aSign = extractFloat128Sign( a ); 4277 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 4278 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4279 aSig0 |= ( aSig1 != 0 ); 4280 shiftCount = 0x4028 - aExp; 4281 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 4282 return roundAndPackInt32( aSign, aSig0 ); 4283 4284 } 4285 4286 /* 4287 ------------------------------------------------------------------------------- 4288 Returns the result of converting the quadruple-precision floating-point 4289 value `a' to the 32-bit two's complement integer format. The conversion 4290 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4291 Arithmetic, except that the conversion is always rounded toward zero. If 4292 `a' is a NaN, the largest positive integer is returned. Otherwise, if the 4293 conversion overflows, the largest integer with the same sign as `a' is 4294 returned. 4295 ------------------------------------------------------------------------------- 4296 */ 4297 int32 float128_to_int32_round_to_zero( float128 a ) 4298 { 4299 flag aSign; 4300 int32 aExp, shiftCount; 4301 bits64 aSig0, aSig1, savedASig; 4302 int32 z; 4303 4304 aSig1 = extractFloat128Frac1( a ); 4305 aSig0 = extractFloat128Frac0( a ); 4306 aExp = extractFloat128Exp( a ); 4307 aSign = extractFloat128Sign( a ); 4308 aSig0 |= ( aSig1 != 0 ); 4309 if ( 0x401E < aExp ) { 4310 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 4311 goto invalid; 4312 } 4313 else if ( aExp < 0x3FFF ) { 4314 if ( aExp || aSig0 ) float_exception_flags |= float_flag_inexact; 4315 return 0; 4316 } 4317 aSig0 |= LIT64( 0x0001000000000000 ); 4318 shiftCount = 0x402F - aExp; 4319 savedASig = aSig0; 4320 aSig0 >>= shiftCount; 4321 z = aSig0; 4322 if ( aSign ) z = - z; 4323 if ( ( z < 0 ) ^ aSign ) { 4324 invalid: 4325 float_raise( float_flag_invalid ); 4326 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 4327 } 4328 if ( ( aSig0<<shiftCount ) != savedASig ) { 4329 float_exception_flags |= float_flag_inexact; 4330 } 4331 return z; 4332 4333 } 4334 4335 /* 4336 ------------------------------------------------------------------------------- 4337 Returns the result of converting the quadruple-precision floating-point 4338 value `a' to the 64-bit two's complement integer format. The conversion 4339 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4340 Arithmetic---which means in particular that the conversion is rounded 4341 according to the current rounding mode. If `a' is a NaN, the largest 4342 positive integer is returned. Otherwise, if the conversion overflows, the 4343 largest integer with the same sign as `a' is returned. 4344 ------------------------------------------------------------------------------- 4345 */ 4346 int64 float128_to_int64( float128 a ) 4347 { 4348 flag aSign; 4349 int32 aExp, shiftCount; 4350 bits64 aSig0, aSig1; 4351 4352 aSig1 = extractFloat128Frac1( a ); 4353 aSig0 = extractFloat128Frac0( a ); 4354 aExp = extractFloat128Exp( a ); 4355 aSign = extractFloat128Sign( a ); 4356 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4357 shiftCount = 0x402F - aExp; 4358 if ( shiftCount <= 0 ) { 4359 if ( 0x403E < aExp ) { 4360 float_raise( float_flag_invalid ); 4361 if ( ! aSign 4362 || ( ( aExp == 0x7FFF ) 4363 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 4364 ) 4365 ) { 4366 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4367 } 4368 return (sbits64) LIT64( 0x8000000000000000 ); 4369 } 4370 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 4371 } 4372 else { 4373 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 4374 } 4375 return roundAndPackInt64( aSign, aSig0, aSig1 ); 4376 4377 } 4378 4379 /* 4380 ------------------------------------------------------------------------------- 4381 Returns the result of converting the quadruple-precision floating-point 4382 value `a' to the 64-bit two's complement integer format. The conversion 4383 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4384 Arithmetic, except that the conversion is always rounded toward zero. 4385 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 4386 the conversion overflows, the largest integer with the same sign as `a' is 4387 returned. 4388 ------------------------------------------------------------------------------- 4389 */ 4390 int64 float128_to_int64_round_to_zero( float128 a ) 4391 { 4392 flag aSign; 4393 int32 aExp, shiftCount; 4394 bits64 aSig0, aSig1; 4395 int64 z; 4396 4397 aSig1 = extractFloat128Frac1( a ); 4398 aSig0 = extractFloat128Frac0( a ); 4399 aExp = extractFloat128Exp( a ); 4400 aSign = extractFloat128Sign( a ); 4401 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4402 shiftCount = aExp - 0x402F; 4403 if ( 0 < shiftCount ) { 4404 if ( 0x403E <= aExp ) { 4405 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 4406 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 4407 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 4408 if ( aSig1 ) float_exception_flags |= float_flag_inexact; 4409 } 4410 else { 4411 float_raise( float_flag_invalid ); 4412 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 4413 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4414 } 4415 } 4416 return (sbits64) LIT64( 0x8000000000000000 ); 4417 } 4418 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 4419 if ( (bits64) ( aSig1<<shiftCount ) ) { 4420 float_exception_flags |= float_flag_inexact; 4421 } 4422 } 4423 else { 4424 if ( aExp < 0x3FFF ) { 4425 if ( aExp | aSig0 | aSig1 ) { 4426 float_exception_flags |= float_flag_inexact; 4427 } 4428 return 0; 4429 } 4430 z = aSig0>>( - shiftCount ); 4431 if ( aSig1 4432 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) { 4433 float_exception_flags |= float_flag_inexact; 4434 } 4435 } 4436 if ( aSign ) z = - z; 4437 return z; 4438 4439 } 4440 4441 /* 4442 ------------------------------------------------------------------------------- 4443 Returns the result of converting the quadruple-precision floating-point 4444 value `a' to the single-precision floating-point format. The conversion 4445 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4446 Arithmetic. 4447 ------------------------------------------------------------------------------- 4448 */ 4449 float32 float128_to_float32( float128 a ) 4450 { 4451 flag aSign; 4452 int32 aExp; 4453 bits64 aSig0, aSig1; 4454 bits32 zSig; 4455 4456 aSig1 = extractFloat128Frac1( a ); 4457 aSig0 = extractFloat128Frac0( a ); 4458 aExp = extractFloat128Exp( a ); 4459 aSign = extractFloat128Sign( a ); 4460 if ( aExp == 0x7FFF ) { 4461 if ( aSig0 | aSig1 ) { 4462 return commonNaNToFloat32( float128ToCommonNaN( a ) ); 4463 } 4464 return packFloat32( aSign, 0xFF, 0 ); 4465 } 4466 aSig0 |= ( aSig1 != 0 ); 4467 shift64RightJamming( aSig0, 18, &aSig0 ); 4468 zSig = aSig0; 4469 if ( aExp || zSig ) { 4470 zSig |= 0x40000000; 4471 aExp -= 0x3F81; 4472 } 4473 return roundAndPackFloat32( aSign, aExp, zSig ); 4474 4475 } 4476 4477 /* 4478 ------------------------------------------------------------------------------- 4479 Returns the result of converting the quadruple-precision floating-point 4480 value `a' to the double-precision floating-point format. The conversion 4481 is performed according to the IEC/IEEE Standard for Binary Floating-Point 4482 Arithmetic. 4483 ------------------------------------------------------------------------------- 4484 */ 4485 float64 float128_to_float64( float128 a ) 4486 { 4487 flag aSign; 4488 int32 aExp; 4489 bits64 aSig0, aSig1; 4490 4491 aSig1 = extractFloat128Frac1( a ); 4492 aSig0 = extractFloat128Frac0( a ); 4493 aExp = extractFloat128Exp( a ); 4494 aSign = extractFloat128Sign( a ); 4495 if ( aExp == 0x7FFF ) { 4496 if ( aSig0 | aSig1 ) { 4497 return commonNaNToFloat64( float128ToCommonNaN( a ) ); 4498 } 4499 return packFloat64( aSign, 0x7FF, 0 ); 4500 } 4501 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 4502 aSig0 |= ( aSig1 != 0 ); 4503 if ( aExp || aSig0 ) { 4504 aSig0 |= LIT64( 0x4000000000000000 ); 4505 aExp -= 0x3C01; 4506 } 4507 return roundAndPackFloat64( aSign, aExp, aSig0 ); 4508 4509 } 4510 4511 #ifdef FLOATX80 4512 4513 /* 4514 ------------------------------------------------------------------------------- 4515 Returns the result of converting the quadruple-precision floating-point 4516 value `a' to the extended double-precision floating-point format. The 4517 conversion is performed according to the IEC/IEEE Standard for Binary 4518 Floating-Point Arithmetic. 4519 ------------------------------------------------------------------------------- 4520 */ 4521 floatx80 float128_to_floatx80( float128 a ) 4522 { 4523 flag aSign; 4524 int32 aExp; 4525 bits64 aSig0, aSig1; 4526 4527 aSig1 = extractFloat128Frac1( a ); 4528 aSig0 = extractFloat128Frac0( a ); 4529 aExp = extractFloat128Exp( a ); 4530 aSign = extractFloat128Sign( a ); 4531 if ( aExp == 0x7FFF ) { 4532 if ( aSig0 | aSig1 ) { 4533 return commonNaNToFloatx80( float128ToCommonNaN( a ) ); 4534 } 4535 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4536 } 4537 if ( aExp == 0 ) { 4538 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 4539 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 4540 } 4541 else { 4542 aSig0 |= LIT64( 0x0001000000000000 ); 4543 } 4544 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 4545 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 ); 4546 4547 } 4548 4549 #endif 4550 4551 /* 4552 ------------------------------------------------------------------------------- 4553 Rounds the quadruple-precision floating-point value `a' to an integer, and 4554 returns the result as a quadruple-precision floating-point value. The 4555 operation is performed according to the IEC/IEEE Standard for Binary 4556 Floating-Point Arithmetic. 4557 ------------------------------------------------------------------------------- 4558 */ 4559 float128 float128_round_to_int( float128 a ) 4560 { 4561 flag aSign; 4562 int32 aExp; 4563 bits64 lastBitMask, roundBitsMask; 4564 int8 roundingMode; 4565 float128 z; 4566 4567 aExp = extractFloat128Exp( a ); 4568 if ( 0x402F <= aExp ) { 4569 if ( 0x406F <= aExp ) { 4570 if ( ( aExp == 0x7FFF ) 4571 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 4572 ) { 4573 return propagateFloat128NaN( a, a ); 4574 } 4575 return a; 4576 } 4577 lastBitMask = 1; 4578 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 4579 roundBitsMask = lastBitMask - 1; 4580 z = a; 4581 roundingMode = float_rounding_mode; 4582 if ( roundingMode == float_round_nearest_even ) { 4583 if ( lastBitMask ) { 4584 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 4585 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 4586 } 4587 else { 4588 if ( (sbits64) z.low < 0 ) { 4589 ++z.high; 4590 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1; 4591 } 4592 } 4593 } 4594 else if ( roundingMode != float_round_to_zero ) { 4595 if ( extractFloat128Sign( z ) 4596 ^ ( roundingMode == float_round_up ) ) { 4597 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low ); 4598 } 4599 } 4600 z.low &= ~ roundBitsMask; 4601 } 4602 else { 4603 if ( aExp < 0x3FFF ) { 4604 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 4605 float_exception_flags |= float_flag_inexact; 4606 aSign = extractFloat128Sign( a ); 4607 switch ( float_rounding_mode ) { 4608 case float_round_nearest_even: 4609 if ( ( aExp == 0x3FFE ) 4610 && ( extractFloat128Frac0( a ) 4611 | extractFloat128Frac1( a ) ) 4612 ) { 4613 return packFloat128( aSign, 0x3FFF, 0, 0 ); 4614 } 4615 break; 4616 case float_round_to_zero: 4617 break; 4618 case float_round_down: 4619 return 4620 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 4621 : packFloat128( 0, 0, 0, 0 ); 4622 case float_round_up: 4623 return 4624 aSign ? packFloat128( 1, 0, 0, 0 ) 4625 : packFloat128( 0, 0x3FFF, 0, 0 ); 4626 } 4627 return packFloat128( aSign, 0, 0, 0 ); 4628 } 4629 lastBitMask = 1; 4630 lastBitMask <<= 0x402F - aExp; 4631 roundBitsMask = lastBitMask - 1; 4632 z.low = 0; 4633 z.high = a.high; 4634 roundingMode = float_rounding_mode; 4635 if ( roundingMode == float_round_nearest_even ) { 4636 z.high += lastBitMask>>1; 4637 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 4638 z.high &= ~ lastBitMask; 4639 } 4640 } 4641 else if ( roundingMode != float_round_to_zero ) { 4642 if ( extractFloat128Sign( z ) 4643 ^ ( roundingMode == float_round_up ) ) { 4644 z.high |= ( a.low != 0 ); 4645 z.high += roundBitsMask; 4646 } 4647 } 4648 z.high &= ~ roundBitsMask; 4649 } 4650 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 4651 float_exception_flags |= float_flag_inexact; 4652 } 4653 return z; 4654 4655 } 4656 4657 /* 4658 ------------------------------------------------------------------------------- 4659 Returns the result of adding the absolute values of the quadruple-precision 4660 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 4661 before being returned. `zSign' is ignored if the result is a NaN. 4662 The addition is performed according to the IEC/IEEE Standard for Binary 4663 Floating-Point Arithmetic. 4664 ------------------------------------------------------------------------------- 4665 */ 4666 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign ) 4667 { 4668 int32 aExp, bExp, zExp; 4669 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 4670 int32 expDiff; 4671 4672 aSig1 = extractFloat128Frac1( a ); 4673 aSig0 = extractFloat128Frac0( a ); 4674 aExp = extractFloat128Exp( a ); 4675 bSig1 = extractFloat128Frac1( b ); 4676 bSig0 = extractFloat128Frac0( b ); 4677 bExp = extractFloat128Exp( b ); 4678 expDiff = aExp - bExp; 4679 if ( 0 < expDiff ) { 4680 if ( aExp == 0x7FFF ) { 4681 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 4682 return a; 4683 } 4684 if ( bExp == 0 ) { 4685 --expDiff; 4686 } 4687 else { 4688 bSig0 |= LIT64( 0x0001000000000000 ); 4689 } 4690 shift128ExtraRightJamming( 4691 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 4692 zExp = aExp; 4693 } 4694 else if ( expDiff < 0 ) { 4695 if ( bExp == 0x7FFF ) { 4696 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4697 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4698 } 4699 if ( aExp == 0 ) { 4700 ++expDiff; 4701 } 4702 else { 4703 aSig0 |= LIT64( 0x0001000000000000 ); 4704 } 4705 shift128ExtraRightJamming( 4706 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 4707 zExp = bExp; 4708 } 4709 else { 4710 if ( aExp == 0x7FFF ) { 4711 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 4712 return propagateFloat128NaN( a, b ); 4713 } 4714 return a; 4715 } 4716 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4717 if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 ); 4718 zSig2 = 0; 4719 zSig0 |= LIT64( 0x0002000000000000 ); 4720 zExp = aExp; 4721 goto shiftRight1; 4722 } 4723 aSig0 |= LIT64( 0x0001000000000000 ); 4724 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4725 --zExp; 4726 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; 4727 ++zExp; 4728 shiftRight1: 4729 shift128ExtraRightJamming( 4730 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 4731 roundAndPack: 4732 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 4733 4734 } 4735 4736 /* 4737 ------------------------------------------------------------------------------- 4738 Returns the result of subtracting the absolute values of the quadruple- 4739 precision floating-point values `a' and `b'. If `zSign' is 1, the 4740 difference is negated before being returned. `zSign' is ignored if the 4741 result is a NaN. The subtraction is performed according to the IEC/IEEE 4742 Standard for Binary Floating-Point Arithmetic. 4743 ------------------------------------------------------------------------------- 4744 */ 4745 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign ) 4746 { 4747 int32 aExp, bExp, zExp; 4748 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 4749 int32 expDiff; 4750 float128 z; 4751 4752 aSig1 = extractFloat128Frac1( a ); 4753 aSig0 = extractFloat128Frac0( a ); 4754 aExp = extractFloat128Exp( a ); 4755 bSig1 = extractFloat128Frac1( b ); 4756 bSig0 = extractFloat128Frac0( b ); 4757 bExp = extractFloat128Exp( b ); 4758 expDiff = aExp - bExp; 4759 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 4760 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 4761 if ( 0 < expDiff ) goto aExpBigger; 4762 if ( expDiff < 0 ) goto bExpBigger; 4763 if ( aExp == 0x7FFF ) { 4764 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 4765 return propagateFloat128NaN( a, b ); 4766 } 4767 float_raise( float_flag_invalid ); 4768 z.low = float128_default_nan_low; 4769 z.high = float128_default_nan_high; 4770 return z; 4771 } 4772 if ( aExp == 0 ) { 4773 aExp = 1; 4774 bExp = 1; 4775 } 4776 if ( bSig0 < aSig0 ) goto aBigger; 4777 if ( aSig0 < bSig0 ) goto bBigger; 4778 if ( bSig1 < aSig1 ) goto aBigger; 4779 if ( aSig1 < bSig1 ) goto bBigger; 4780 return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 ); 4781 bExpBigger: 4782 if ( bExp == 0x7FFF ) { 4783 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4784 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 4785 } 4786 if ( aExp == 0 ) { 4787 ++expDiff; 4788 } 4789 else { 4790 aSig0 |= LIT64( 0x4000000000000000 ); 4791 } 4792 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 4793 bSig0 |= LIT64( 0x4000000000000000 ); 4794 bBigger: 4795 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 4796 zExp = bExp; 4797 zSign ^= 1; 4798 goto normalizeRoundAndPack; 4799 aExpBigger: 4800 if ( aExp == 0x7FFF ) { 4801 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 4802 return a; 4803 } 4804 if ( bExp == 0 ) { 4805 --expDiff; 4806 } 4807 else { 4808 bSig0 |= LIT64( 0x4000000000000000 ); 4809 } 4810 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 4811 aSig0 |= LIT64( 0x4000000000000000 ); 4812 aBigger: 4813 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 4814 zExp = aExp; 4815 normalizeRoundAndPack: 4816 --zExp; 4817 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 ); 4818 4819 } 4820 4821 /* 4822 ------------------------------------------------------------------------------- 4823 Returns the result of adding the quadruple-precision floating-point values 4824 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 4825 for Binary Floating-Point Arithmetic. 4826 ------------------------------------------------------------------------------- 4827 */ 4828 float128 float128_add( float128 a, float128 b ) 4829 { 4830 flag aSign, bSign; 4831 4832 aSign = extractFloat128Sign( a ); 4833 bSign = extractFloat128Sign( b ); 4834 if ( aSign == bSign ) { 4835 return addFloat128Sigs( a, b, aSign ); 4836 } 4837 else { 4838 return subFloat128Sigs( a, b, aSign ); 4839 } 4840 4841 } 4842 4843 /* 4844 ------------------------------------------------------------------------------- 4845 Returns the result of subtracting the quadruple-precision floating-point 4846 values `a' and `b'. The operation is performed according to the IEC/IEEE 4847 Standard for Binary Floating-Point Arithmetic. 4848 ------------------------------------------------------------------------------- 4849 */ 4850 float128 float128_sub( float128 a, float128 b ) 4851 { 4852 flag aSign, bSign; 4853 4854 aSign = extractFloat128Sign( a ); 4855 bSign = extractFloat128Sign( b ); 4856 if ( aSign == bSign ) { 4857 return subFloat128Sigs( a, b, aSign ); 4858 } 4859 else { 4860 return addFloat128Sigs( a, b, aSign ); 4861 } 4862 4863 } 4864 4865 /* 4866 ------------------------------------------------------------------------------- 4867 Returns the result of multiplying the quadruple-precision floating-point 4868 values `a' and `b'. The operation is performed according to the IEC/IEEE 4869 Standard for Binary Floating-Point Arithmetic. 4870 ------------------------------------------------------------------------------- 4871 */ 4872 float128 float128_mul( float128 a, float128 b ) 4873 { 4874 flag aSign, bSign, zSign; 4875 int32 aExp, bExp, zExp; 4876 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 4877 float128 z; 4878 4879 aSig1 = extractFloat128Frac1( a ); 4880 aSig0 = extractFloat128Frac0( a ); 4881 aExp = extractFloat128Exp( a ); 4882 aSign = extractFloat128Sign( a ); 4883 bSig1 = extractFloat128Frac1( b ); 4884 bSig0 = extractFloat128Frac0( b ); 4885 bExp = extractFloat128Exp( b ); 4886 bSign = extractFloat128Sign( b ); 4887 zSign = aSign ^ bSign; 4888 if ( aExp == 0x7FFF ) { 4889 if ( ( aSig0 | aSig1 ) 4890 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 4891 return propagateFloat128NaN( a, b ); 4892 } 4893 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 4894 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4895 } 4896 if ( bExp == 0x7FFF ) { 4897 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4898 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 4899 invalid: 4900 float_raise( float_flag_invalid ); 4901 z.low = float128_default_nan_low; 4902 z.high = float128_default_nan_high; 4903 return z; 4904 } 4905 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4906 } 4907 if ( aExp == 0 ) { 4908 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 4909 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 4910 } 4911 if ( bExp == 0 ) { 4912 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 4913 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 4914 } 4915 zExp = aExp + bExp - 0x4000; 4916 aSig0 |= LIT64( 0x0001000000000000 ); 4917 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 4918 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 4919 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 4920 zSig2 |= ( zSig3 != 0 ); 4921 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { 4922 shift128ExtraRightJamming( 4923 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 4924 ++zExp; 4925 } 4926 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 4927 4928 } 4929 4930 /* 4931 ------------------------------------------------------------------------------- 4932 Returns the result of dividing the quadruple-precision floating-point value 4933 `a' by the corresponding value `b'. The operation is performed according to 4934 the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4935 ------------------------------------------------------------------------------- 4936 */ 4937 float128 float128_div( float128 a, float128 b ) 4938 { 4939 flag aSign, bSign, zSign; 4940 int32 aExp, bExp, zExp; 4941 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 4942 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 4943 float128 z; 4944 4945 aSig1 = extractFloat128Frac1( a ); 4946 aSig0 = extractFloat128Frac0( a ); 4947 aExp = extractFloat128Exp( a ); 4948 aSign = extractFloat128Sign( a ); 4949 bSig1 = extractFloat128Frac1( b ); 4950 bSig0 = extractFloat128Frac0( b ); 4951 bExp = extractFloat128Exp( b ); 4952 bSign = extractFloat128Sign( b ); 4953 zSign = aSign ^ bSign; 4954 if ( aExp == 0x7FFF ) { 4955 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); 4956 if ( bExp == 0x7FFF ) { 4957 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4958 goto invalid; 4959 } 4960 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4961 } 4962 if ( bExp == 0x7FFF ) { 4963 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 4964 return packFloat128( zSign, 0, 0, 0 ); 4965 } 4966 if ( bExp == 0 ) { 4967 if ( ( bSig0 | bSig1 ) == 0 ) { 4968 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 4969 invalid: 4970 float_raise( float_flag_invalid ); 4971 z.low = float128_default_nan_low; 4972 z.high = float128_default_nan_high; 4973 return z; 4974 } 4975 float_raise( float_flag_divbyzero ); 4976 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4977 } 4978 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 4979 } 4980 if ( aExp == 0 ) { 4981 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 4982 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 4983 } 4984 zExp = aExp - bExp + 0x3FFD; 4985 shortShift128Left( 4986 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); 4987 shortShift128Left( 4988 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 4989 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { 4990 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 4991 ++zExp; 4992 } 4993 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 4994 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 4995 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 4996 while ( (sbits64) rem0 < 0 ) { 4997 --zSig0; 4998 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 4999 } 5000 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 5001 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 5002 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 5003 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 5004 while ( (sbits64) rem1 < 0 ) { 5005 --zSig1; 5006 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 5007 } 5008 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5009 } 5010 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 5011 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); 5012 5013 } 5014 5015 /* 5016 ------------------------------------------------------------------------------- 5017 Returns the remainder of the quadruple-precision floating-point value `a' 5018 with respect to the corresponding value `b'. The operation is performed 5019 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5020 ------------------------------------------------------------------------------- 5021 */ 5022 float128 float128_rem( float128 a, float128 b ) 5023 { 5024 flag aSign, bSign, zSign; 5025 int32 aExp, bExp, expDiff; 5026 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 5027 bits64 allZero, alternateASig0, alternateASig1, sigMean1; 5028 sbits64 sigMean0; 5029 float128 z; 5030 5031 aSig1 = extractFloat128Frac1( a ); 5032 aSig0 = extractFloat128Frac0( a ); 5033 aExp = extractFloat128Exp( a ); 5034 aSign = extractFloat128Sign( a ); 5035 bSig1 = extractFloat128Frac1( b ); 5036 bSig0 = extractFloat128Frac0( b ); 5037 bExp = extractFloat128Exp( b ); 5038 bSign = extractFloat128Sign( b ); 5039 if ( aExp == 0x7FFF ) { 5040 if ( ( aSig0 | aSig1 ) 5041 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 5042 return propagateFloat128NaN( a, b ); 5043 } 5044 goto invalid; 5045 } 5046 if ( bExp == 0x7FFF ) { 5047 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); 5048 return a; 5049 } 5050 if ( bExp == 0 ) { 5051 if ( ( bSig0 | bSig1 ) == 0 ) { 5052 invalid: 5053 float_raise( float_flag_invalid ); 5054 z.low = float128_default_nan_low; 5055 z.high = float128_default_nan_high; 5056 return z; 5057 } 5058 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 5059 } 5060 if ( aExp == 0 ) { 5061 if ( ( aSig0 | aSig1 ) == 0 ) return a; 5062 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5063 } 5064 expDiff = aExp - bExp; 5065 if ( expDiff < -1 ) return a; 5066 shortShift128Left( 5067 aSig0 | LIT64( 0x0001000000000000 ), 5068 aSig1, 5069 15 - ( expDiff < 0 ), 5070 &aSig0, 5071 &aSig1 5072 ); 5073 shortShift128Left( 5074 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 5075 q = le128( bSig0, bSig1, aSig0, aSig1 ); 5076 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 5077 expDiff -= 64; 5078 while ( 0 < expDiff ) { 5079 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5080 q = ( 4 < q ) ? q - 4 : 0; 5081 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 5082 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 5083 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 5084 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 5085 expDiff -= 61; 5086 } 5087 if ( -64 < expDiff ) { 5088 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5089 q = ( 4 < q ) ? q - 4 : 0; 5090 q >>= - expDiff; 5091 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 5092 expDiff += 52; 5093 if ( expDiff < 0 ) { 5094 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 5095 } 5096 else { 5097 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 5098 } 5099 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 5100 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 5101 } 5102 else { 5103 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 5104 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 5105 } 5106 do { 5107 alternateASig0 = aSig0; 5108 alternateASig1 = aSig1; 5109 ++q; 5110 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 5111 } while ( 0 <= (sbits64) aSig0 ); 5112 add128( 5113 aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 ); 5114 if ( ( sigMean0 < 0 ) 5115 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 5116 aSig0 = alternateASig0; 5117 aSig1 = alternateASig1; 5118 } 5119 zSign = ( (sbits64) aSig0 < 0 ); 5120 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 5121 return 5122 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 ); 5123 5124 } 5125 5126 /* 5127 ------------------------------------------------------------------------------- 5128 Returns the square root of the quadruple-precision floating-point value `a'. 5129 The operation is performed according to the IEC/IEEE Standard for Binary 5130 Floating-Point Arithmetic. 5131 ------------------------------------------------------------------------------- 5132 */ 5133 float128 float128_sqrt( float128 a ) 5134 { 5135 flag aSign; 5136 int32 aExp, zExp; 5137 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 5138 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5139 float128 z; 5140 5141 aSig1 = extractFloat128Frac1( a ); 5142 aSig0 = extractFloat128Frac0( a ); 5143 aExp = extractFloat128Exp( a ); 5144 aSign = extractFloat128Sign( a ); 5145 if ( aExp == 0x7FFF ) { 5146 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a ); 5147 if ( ! aSign ) return a; 5148 goto invalid; 5149 } 5150 if ( aSign ) { 5151 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 5152 invalid: 5153 float_raise( float_flag_invalid ); 5154 z.low = float128_default_nan_low; 5155 z.high = float128_default_nan_high; 5156 return z; 5157 } 5158 if ( aExp == 0 ) { 5159 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 5160 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5161 } 5162 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 5163 aSig0 |= LIT64( 0x0001000000000000 ); 5164 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 5165 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 5166 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 5167 doubleZSig0 = zSig0<<1; 5168 mul64To128( zSig0, zSig0, &term0, &term1 ); 5169 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 5170 while ( (sbits64) rem0 < 0 ) { 5171 --zSig0; 5172 doubleZSig0 -= 2; 5173 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 5174 } 5175 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 5176 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 5177 if ( zSig1 == 0 ) zSig1 = 1; 5178 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 5179 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5180 mul64To128( zSig1, zSig1, &term2, &term3 ); 5181 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 5182 while ( (sbits64) rem1 < 0 ) { 5183 --zSig1; 5184 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 5185 term3 |= 1; 5186 term2 |= doubleZSig0; 5187 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 5188 } 5189 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5190 } 5191 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 5192 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 ); 5193 5194 } 5195 5196 /* 5197 ------------------------------------------------------------------------------- 5198 Returns 1 if the quadruple-precision floating-point value `a' is equal to 5199 the corresponding value `b', and 0 otherwise. The comparison is performed 5200 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5201 ------------------------------------------------------------------------------- 5202 */ 5203 flag float128_eq( float128 a, float128 b ) 5204 { 5205 5206 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5207 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5208 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5209 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5210 ) { 5211 if ( float128_is_signaling_nan( a ) 5212 || float128_is_signaling_nan( b ) ) { 5213 float_raise( float_flag_invalid ); 5214 } 5215 return 0; 5216 } 5217 return 5218 ( a.low == b.low ) 5219 && ( ( a.high == b.high ) 5220 || ( ( a.low == 0 ) 5221 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) 5222 ); 5223 5224 } 5225 5226 /* 5227 ------------------------------------------------------------------------------- 5228 Returns 1 if the quadruple-precision floating-point value `a' is less than 5229 or equal to the corresponding value `b', and 0 otherwise. The comparison 5230 is performed according to the IEC/IEEE Standard for Binary Floating-Point 5231 Arithmetic. 5232 ------------------------------------------------------------------------------- 5233 */ 5234 flag float128_le( float128 a, float128 b ) 5235 { 5236 flag aSign, bSign; 5237 5238 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5239 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5240 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5241 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5242 ) { 5243 float_raise( float_flag_invalid ); 5244 return 0; 5245 } 5246 aSign = extractFloat128Sign( a ); 5247 bSign = extractFloat128Sign( b ); 5248 if ( aSign != bSign ) { 5249 return 5250 aSign 5251 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5252 == 0 ); 5253 } 5254 return 5255 aSign ? le128( b.high, b.low, a.high, a.low ) 5256 : le128( a.high, a.low, b.high, b.low ); 5257 5258 } 5259 5260 /* 5261 ------------------------------------------------------------------------------- 5262 Returns 1 if the quadruple-precision floating-point value `a' is less than 5263 the corresponding value `b', and 0 otherwise. The comparison is performed 5264 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5265 ------------------------------------------------------------------------------- 5266 */ 5267 flag float128_lt( float128 a, float128 b ) 5268 { 5269 flag aSign, bSign; 5270 5271 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5272 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5273 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5274 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5275 ) { 5276 float_raise( float_flag_invalid ); 5277 return 0; 5278 } 5279 aSign = extractFloat128Sign( a ); 5280 bSign = extractFloat128Sign( b ); 5281 if ( aSign != bSign ) { 5282 return 5283 aSign 5284 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5285 != 0 ); 5286 } 5287 return 5288 aSign ? lt128( b.high, b.low, a.high, a.low ) 5289 : lt128( a.high, a.low, b.high, b.low ); 5290 5291 } 5292 5293 /* 5294 ------------------------------------------------------------------------------- 5295 Returns 1 if the quadruple-precision floating-point value `a' is equal to 5296 the corresponding value `b', and 0 otherwise. The invalid exception is 5297 raised if either operand is a NaN. Otherwise, the comparison is performed 5298 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5299 ------------------------------------------------------------------------------- 5300 */ 5301 flag float128_eq_signaling( float128 a, float128 b ) 5302 { 5303 5304 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5305 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5306 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5307 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5308 ) { 5309 float_raise( float_flag_invalid ); 5310 return 0; 5311 } 5312 return 5313 ( a.low == b.low ) 5314 && ( ( a.high == b.high ) 5315 || ( ( a.low == 0 ) 5316 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) 5317 ); 5318 5319 } 5320 5321 /* 5322 ------------------------------------------------------------------------------- 5323 Returns 1 if the quadruple-precision floating-point value `a' is less than 5324 or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5325 cause an exception. Otherwise, the comparison is performed according to the 5326 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5327 ------------------------------------------------------------------------------- 5328 */ 5329 flag float128_le_quiet( float128 a, float128 b ) 5330 { 5331 flag aSign, bSign; 5332 5333 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5334 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5335 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5336 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5337 ) { 5338 if ( float128_is_signaling_nan( a ) 5339 || float128_is_signaling_nan( b ) ) { 5340 float_raise( float_flag_invalid ); 5341 } 5342 return 0; 5343 } 5344 aSign = extractFloat128Sign( a ); 5345 bSign = extractFloat128Sign( b ); 5346 if ( aSign != bSign ) { 5347 return 5348 aSign 5349 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5350 == 0 ); 5351 } 5352 return 5353 aSign ? le128( b.high, b.low, a.high, a.low ) 5354 : le128( a.high, a.low, b.high, b.low ); 5355 5356 } 5357 5358 /* 5359 ------------------------------------------------------------------------------- 5360 Returns 1 if the quadruple-precision floating-point value `a' is less than 5361 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 5362 exception. Otherwise, the comparison is performed according to the IEC/IEEE 5363 Standard for Binary Floating-Point Arithmetic. 5364 ------------------------------------------------------------------------------- 5365 */ 5366 flag float128_lt_quiet( float128 a, float128 b ) 5367 { 5368 flag aSign, bSign; 5369 5370 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5371 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5372 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5373 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5374 ) { 5375 if ( float128_is_signaling_nan( a ) 5376 || float128_is_signaling_nan( b ) ) { 5377 float_raise( float_flag_invalid ); 5378 } 5379 return 0; 5380 } 5381 aSign = extractFloat128Sign( a ); 5382 bSign = extractFloat128Sign( b ); 5383 if ( aSign != bSign ) { 5384 return 5385 aSign 5386 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5387 != 0 ); 5388 } 5389 return 5390 aSign ? lt128( b.high, b.low, a.high, a.low ) 5391 : lt128( a.high, a.low, b.high, b.low ); 5392 5393 } 5394 5395 #endif 5396 5397 5398 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS) 5399 5400 /* 5401 * These two routines are not part of the original softfloat distribution. 5402 * 5403 * They are based on the corresponding conversions to integer but return 5404 * unsigned numbers instead since these functions are required by GCC. 5405 * 5406 * Added by Mark Brinicombe <mark@NetBSD.org> 27/09/97 5407 * 5408 * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15] 5409 */ 5410 5411 /* 5412 ------------------------------------------------------------------------------- 5413 Returns the result of converting the double-precision floating-point value 5414 `a' to the 32-bit unsigned integer format. The conversion is 5415 performed according to the IEC/IEEE Standard for Binary Floating-point 5416 Arithmetic, except that the conversion is always rounded toward zero. If 5417 `a' is a NaN, the largest positive integer is returned. If the conversion 5418 overflows, the largest integer positive is returned. 5419 ------------------------------------------------------------------------------- 5420 */ 5421 uint32 float64_to_uint32_round_to_zero( float64 a ) 5422 { 5423 flag aSign; 5424 int16 aExp, shiftCount; 5425 bits64 aSig, savedASig; 5426 uint32 z; 5427 5428 aSig = extractFloat64Frac( a ); 5429 aExp = extractFloat64Exp( a ); 5430 aSign = extractFloat64Sign( a ); 5431 5432 if (aSign) { 5433 float_raise( float_flag_invalid ); 5434 return(0); 5435 } 5436 5437 if ( 0x41E < aExp ) { 5438 float_raise( float_flag_invalid ); 5439 return 0xffffffff; 5440 } 5441 else if ( aExp < 0x3FF ) { 5442 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact; 5443 return 0; 5444 } 5445 aSig |= LIT64( 0x0010000000000000 ); 5446 shiftCount = 0x433 - aExp; 5447 savedASig = aSig; 5448 aSig >>= shiftCount; 5449 z = aSig; 5450 if ( ( aSig<<shiftCount ) != savedASig ) { 5451 float_exception_flags |= float_flag_inexact; 5452 } 5453 return z; 5454 5455 } 5456 5457 /* 5458 ------------------------------------------------------------------------------- 5459 Returns the result of converting the single-precision floating-point value 5460 `a' to the 32-bit unsigned integer format. The conversion is 5461 performed according to the IEC/IEEE Standard for Binary Floating-point 5462 Arithmetic, except that the conversion is always rounded toward zero. If 5463 `a' is a NaN, the largest positive integer is returned. If the conversion 5464 overflows, the largest positive integer is returned. 5465 ------------------------------------------------------------------------------- 5466 */ 5467 uint32 float32_to_uint32_round_to_zero( float32 a ) 5468 { 5469 flag aSign; 5470 int16 aExp, shiftCount; 5471 bits32 aSig; 5472 uint32 z; 5473 5474 aSig = extractFloat32Frac( a ); 5475 aExp = extractFloat32Exp( a ); 5476 aSign = extractFloat32Sign( a ); 5477 shiftCount = aExp - 0x9E; 5478 5479 if (aSign) { 5480 float_raise( float_flag_invalid ); 5481 return(0); 5482 } 5483 if ( 0 < shiftCount ) { 5484 float_raise( float_flag_invalid ); 5485 return 0xFFFFFFFF; 5486 } 5487 else if ( aExp <= 0x7E ) { 5488 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact; 5489 return 0; 5490 } 5491 aSig = ( aSig | 0x800000 )<<8; 5492 z = aSig>>( - shiftCount ); 5493 if ( aSig<<( shiftCount & 31 ) ) { 5494 float_exception_flags |= float_flag_inexact; 5495 } 5496 return z; 5497 5498 } 5499 5500 #endif 5501