1 /* $NetBSD: softfloat.c,v 1.1 2002/05/21 23:51:07 bjh21 Exp $ */ 2 3 /* 4 * This version hacked for use with gcc -msoft-float by bjh21. 5 * (Mostly a case of #ifdefing out things GCC doesn't need or provides 6 * itself). 7 */ 8 9 /* 10 * Things you may want to define: 11 * 12 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with 13 * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them 14 * properly renamed. 15 */ 16 17 /* 18 * This differs from the standard bits32/softfloat.c in that float64 19 * is defined to be a 64-bit integer rather than a structure. The 20 * structure is float64s, with translation between the two going via 21 * float64u. 22 */ 23 24 /* 25 =============================================================================== 26 27 This C source file is part of the SoftFloat IEC/IEEE Floating-Point 28 Arithmetic Package, Release 2a. 29 30 Written by John R. Hauser. This work was made possible in part by the 31 International Computer Science Institute, located at Suite 600, 1947 Center 32 Street, Berkeley, California 94704. Funding was partially provided by the 33 National Science Foundation under grant MIP-9311980. The original version 34 of this code was written as part of a project to build a fixed-point vector 35 processor in collaboration with the University of California at Berkeley, 36 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 37 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 38 arithmetic/SoftFloat.html'. 39 40 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 41 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 42 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 43 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 44 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 45 46 Derivative works are acceptable, even for commercial purposes, so long as 47 (1) they include prominent notice that the work is derivative, and (2) they 48 include prominent notice akin to these four paragraphs for those parts of 49 this code that are retained. 50 51 =============================================================================== 52 */ 53 54 #ifdef SOFTFLOAT_FOR_GCC 55 #include "softfloat-for-gcc.h" 56 #endif 57 58 #include "milieu.h" 59 #include "softfloat.h" 60 61 /* 62 * Conversions between floats as stored in memory and floats as 63 * SoftFloat uses them 64 */ 65 #ifndef FLOAT64_DEMANGLE 66 #define FLOAT64_DEMANGLE(a) (a) 67 #endif 68 #ifndef FLOAT64_MANGLE 69 #define FLOAT64_MANGLE(a) (a) 70 #endif 71 72 /* 73 ------------------------------------------------------------------------------- 74 Floating-point rounding mode and exception flags. 75 ------------------------------------------------------------------------------- 76 */ 77 int float_rounding_mode = float_round_nearest_even; 78 int float_exception_flags = 0; 79 80 /* 81 ------------------------------------------------------------------------------- 82 Primitive arithmetic functions, including multi-word arithmetic, and 83 division and square root approximations. (Can be specialized to target if 84 desired.) 85 ------------------------------------------------------------------------------- 86 */ 87 #include "softfloat-macros" 88 89 /* 90 ------------------------------------------------------------------------------- 91 Functions and definitions to determine: (1) whether tininess for underflow 92 is detected before or after rounding by default, (2) what (if anything) 93 happens when exceptions are raised, (3) how signaling NaNs are distinguished 94 from quiet NaNs, (4) the default generated quiet NaNs, and (4) how NaNs 95 are propagated from function inputs to output. These details are target- 96 specific. 97 ------------------------------------------------------------------------------- 98 */ 99 #include "softfloat-specialize" 100 101 /* 102 ------------------------------------------------------------------------------- 103 Returns the fraction bits of the single-precision floating-point value `a'. 104 ------------------------------------------------------------------------------- 105 */ 106 INLINE bits32 extractFloat32Frac( float32 a ) 107 { 108 109 return a & 0x007FFFFF; 110 111 } 112 113 /* 114 ------------------------------------------------------------------------------- 115 Returns the exponent bits of the single-precision floating-point value `a'. 116 ------------------------------------------------------------------------------- 117 */ 118 INLINE int16 extractFloat32Exp( float32 a ) 119 { 120 121 return ( a>>23 ) & 0xFF; 122 123 } 124 125 /* 126 ------------------------------------------------------------------------------- 127 Returns the sign bit of the single-precision floating-point value `a'. 128 ------------------------------------------------------------------------------- 129 */ 130 INLINE flag extractFloat32Sign( float32 a ) 131 { 132 133 return a>>31; 134 135 } 136 137 /* 138 ------------------------------------------------------------------------------- 139 Normalizes the subnormal single-precision floating-point value represented 140 by the denormalized significand `aSig'. The normalized exponent and 141 significand are stored at the locations pointed to by `zExpPtr' and 142 `zSigPtr', respectively. 143 ------------------------------------------------------------------------------- 144 */ 145 static void 146 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr ) 147 { 148 int8 shiftCount; 149 150 shiftCount = countLeadingZeros32( aSig ) - 8; 151 *zSigPtr = aSig<<shiftCount; 152 *zExpPtr = 1 - shiftCount; 153 154 } 155 156 /* 157 ------------------------------------------------------------------------------- 158 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 159 single-precision floating-point value, returning the result. After being 160 shifted into the proper positions, the three fields are simply added 161 together to form the result. This means that any integer portion of `zSig' 162 will be added into the exponent. Since a properly normalized significand 163 will have an integer portion equal to 1, the `zExp' input should be 1 less 164 than the desired result exponent whenever `zSig' is a complete, normalized 165 significand. 166 ------------------------------------------------------------------------------- 167 */ 168 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig ) 169 { 170 171 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig; 172 173 } 174 175 /* 176 ------------------------------------------------------------------------------- 177 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 178 and significand `zSig', and returns the proper single-precision floating- 179 point value corresponding to the abstract input. Ordinarily, the abstract 180 value is simply rounded and packed into the single-precision format, with 181 the inexact exception raised if the abstract input cannot be represented 182 exactly. However, if the abstract value is too large, the overflow and 183 inexact exceptions are raised and an infinity or maximal finite value is 184 returned. If the abstract value is too small, the input value is rounded to 185 a subnormal number, and the underflow and inexact exceptions are raised if 186 the abstract input cannot be represented exactly as a subnormal single- 187 precision floating-point number. 188 The input significand `zSig' has its binary point between bits 30 189 and 29, which is 7 bits to the left of the usual location. This shifted 190 significand must be normalized or smaller. If `zSig' is not normalized, 191 `zExp' must be 0; in that case, the result returned is a subnormal number, 192 and it must not require rounding. In the usual case that `zSig' is 193 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 194 The handling of underflow and overflow follows the IEC/IEEE Standard for 195 Binary Floating-Point Arithmetic. 196 ------------------------------------------------------------------------------- 197 */ 198 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) 199 { 200 int8 roundingMode; 201 flag roundNearestEven; 202 int8 roundIncrement, roundBits; 203 flag isTiny; 204 205 roundingMode = float_rounding_mode; 206 roundNearestEven = roundingMode == float_round_nearest_even; 207 roundIncrement = 0x40; 208 if ( ! roundNearestEven ) { 209 if ( roundingMode == float_round_to_zero ) { 210 roundIncrement = 0; 211 } 212 else { 213 roundIncrement = 0x7F; 214 if ( zSign ) { 215 if ( roundingMode == float_round_up ) roundIncrement = 0; 216 } 217 else { 218 if ( roundingMode == float_round_down ) roundIncrement = 0; 219 } 220 } 221 } 222 roundBits = zSig & 0x7F; 223 if ( 0xFD <= (bits16) zExp ) { 224 if ( ( 0xFD < zExp ) 225 || ( ( zExp == 0xFD ) 226 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) ) 227 ) { 228 float_raise( float_flag_overflow | float_flag_inexact ); 229 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 ); 230 } 231 if ( zExp < 0 ) { 232 isTiny = 233 ( float_detect_tininess == float_tininess_before_rounding ) 234 || ( zExp < -1 ) 235 || ( zSig + roundIncrement < 0x80000000 ); 236 shift32RightJamming( zSig, - zExp, &zSig ); 237 zExp = 0; 238 roundBits = zSig & 0x7F; 239 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 240 } 241 } 242 if ( roundBits ) float_exception_flags |= float_flag_inexact; 243 zSig = ( zSig + roundIncrement )>>7; 244 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 245 if ( zSig == 0 ) zExp = 0; 246 return packFloat32( zSign, zExp, zSig ); 247 248 } 249 250 /* 251 ------------------------------------------------------------------------------- 252 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 253 and significand `zSig', and returns the proper single-precision floating- 254 point value corresponding to the abstract input. This routine is just like 255 `roundAndPackFloat32' except that `zSig' does not have to be normalized. 256 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 257 floating-point exponent. 258 ------------------------------------------------------------------------------- 259 */ 260 static float32 261 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) 262 { 263 int8 shiftCount; 264 265 shiftCount = countLeadingZeros32( zSig ) - 1; 266 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount ); 267 268 } 269 270 /* 271 ------------------------------------------------------------------------------- 272 Returns the least-significant 32 fraction bits of the double-precision 273 floating-point value `a'. 274 ------------------------------------------------------------------------------- 275 */ 276 INLINE bits32 extractFloat64Frac1( float64 a ) 277 { 278 279 return FLOAT64_DEMANGLE(a) & LIT64( 0x00000000FFFFFFFF ); 280 281 } 282 283 /* 284 ------------------------------------------------------------------------------- 285 Returns the most-significant 20 fraction bits of the double-precision 286 floating-point value `a'. 287 ------------------------------------------------------------------------------- 288 */ 289 INLINE bits32 extractFloat64Frac0( float64 a ) 290 { 291 292 return ( FLOAT64_DEMANGLE(a)>>32 ) & 0x000FFFFF; 293 294 } 295 296 /* 297 ------------------------------------------------------------------------------- 298 Returns the exponent bits of the double-precision floating-point value `a'. 299 ------------------------------------------------------------------------------- 300 */ 301 INLINE int16 extractFloat64Exp( float64 a ) 302 { 303 304 return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF; 305 306 } 307 308 /* 309 ------------------------------------------------------------------------------- 310 Returns the sign bit of the double-precision floating-point value `a'. 311 ------------------------------------------------------------------------------- 312 */ 313 INLINE flag extractFloat64Sign( float64 a ) 314 { 315 316 return FLOAT64_DEMANGLE(a)>>63; 317 318 } 319 320 /* 321 ------------------------------------------------------------------------------- 322 Normalizes the subnormal double-precision floating-point value represented 323 by the denormalized significand formed by the concatenation of `aSig0' and 324 `aSig1'. The normalized exponent is stored at the location pointed to by 325 `zExpPtr'. The most significant 21 bits of the normalized significand are 326 stored at the location pointed to by `zSig0Ptr', and the least significant 327 32 bits of the normalized significand are stored at the location pointed to 328 by `zSig1Ptr'. 329 ------------------------------------------------------------------------------- 330 */ 331 static void 332 normalizeFloat64Subnormal( 333 bits32 aSig0, 334 bits32 aSig1, 335 int16 *zExpPtr, 336 bits32 *zSig0Ptr, 337 bits32 *zSig1Ptr 338 ) 339 { 340 int8 shiftCount; 341 342 if ( aSig0 == 0 ) { 343 shiftCount = countLeadingZeros32( aSig1 ) - 11; 344 if ( shiftCount < 0 ) { 345 *zSig0Ptr = aSig1>>( - shiftCount ); 346 *zSig1Ptr = aSig1<<( shiftCount & 31 ); 347 } 348 else { 349 *zSig0Ptr = aSig1<<shiftCount; 350 *zSig1Ptr = 0; 351 } 352 *zExpPtr = - shiftCount - 31; 353 } 354 else { 355 shiftCount = countLeadingZeros32( aSig0 ) - 11; 356 shortShift64Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 357 *zExpPtr = 1 - shiftCount; 358 } 359 360 } 361 362 /* 363 ------------------------------------------------------------------------------- 364 Packs the sign `zSign', the exponent `zExp', and the significand formed by 365 the concatenation of `zSig0' and `zSig1' into a double-precision floating- 366 point value, returning the result. After being shifted into the proper 367 positions, the three fields `zSign', `zExp', and `zSig0' are simply added 368 together to form the most significant 32 bits of the result. This means 369 that any integer portion of `zSig0' will be added into the exponent. Since 370 a properly normalized significand will have an integer portion equal to 1, 371 the `zExp' input should be 1 less than the desired result exponent whenever 372 `zSig0' and `zSig1' concatenated form a complete, normalized significand. 373 ------------------------------------------------------------------------------- 374 */ 375 INLINE float64 376 packFloat64( flag zSign, int16 zExp, bits32 zSig0, bits32 zSig1 ) 377 { 378 379 return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) + 380 ( ( (bits64) zExp )<<52 ) + 381 ( ( (bits64) zSig0 )<<32 ) + zSig1 ); 382 383 384 } 385 386 /* 387 ------------------------------------------------------------------------------- 388 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 389 and extended significand formed by the concatenation of `zSig0', `zSig1', 390 and `zSig2', and returns the proper double-precision floating-point value 391 corresponding to the abstract input. Ordinarily, the abstract value is 392 simply rounded and packed into the double-precision format, with the inexact 393 exception raised if the abstract input cannot be represented exactly. 394 However, if the abstract value is too large, the overflow and inexact 395 exceptions are raised and an infinity or maximal finite value is returned. 396 If the abstract value is too small, the input value is rounded to a 397 subnormal number, and the underflow and inexact exceptions are raised if the 398 abstract input cannot be represented exactly as a subnormal double-precision 399 floating-point number. 400 The input significand must be normalized or smaller. If the input 401 significand is not normalized, `zExp' must be 0; in that case, the result 402 returned is a subnormal number, and it must not require rounding. In the 403 usual case that the input significand is normalized, `zExp' must be 1 less 404 than the ``true'' floating-point exponent. The handling of underflow and 405 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 406 ------------------------------------------------------------------------------- 407 */ 408 static float64 409 roundAndPackFloat64( 410 flag zSign, int16 zExp, bits32 zSig0, bits32 zSig1, bits32 zSig2 ) 411 { 412 int8 roundingMode; 413 flag roundNearestEven, increment, isTiny; 414 415 roundingMode = float_rounding_mode; 416 roundNearestEven = ( roundingMode == float_round_nearest_even ); 417 increment = ( (sbits32) zSig2 < 0 ); 418 if ( ! roundNearestEven ) { 419 if ( roundingMode == float_round_to_zero ) { 420 increment = 0; 421 } 422 else { 423 if ( zSign ) { 424 increment = ( roundingMode == float_round_down ) && zSig2; 425 } 426 else { 427 increment = ( roundingMode == float_round_up ) && zSig2; 428 } 429 } 430 } 431 if ( 0x7FD <= (bits16) zExp ) { 432 if ( ( 0x7FD < zExp ) 433 || ( ( zExp == 0x7FD ) 434 && eq64( 0x001FFFFF, 0xFFFFFFFF, zSig0, zSig1 ) 435 && increment 436 ) 437 ) { 438 float_raise( float_flag_overflow | float_flag_inexact ); 439 if ( ( roundingMode == float_round_to_zero ) 440 || ( zSign && ( roundingMode == float_round_up ) ) 441 || ( ! zSign && ( roundingMode == float_round_down ) ) 442 ) { 443 return packFloat64( zSign, 0x7FE, 0x000FFFFF, 0xFFFFFFFF ); 444 } 445 return packFloat64( zSign, 0x7FF, 0, 0 ); 446 } 447 if ( zExp < 0 ) { 448 isTiny = 449 ( float_detect_tininess == float_tininess_before_rounding ) 450 || ( zExp < -1 ) 451 || ! increment 452 || lt64( zSig0, zSig1, 0x001FFFFF, 0xFFFFFFFF ); 453 shift64ExtraRightJamming( 454 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 455 zExp = 0; 456 if ( isTiny && zSig2 ) float_raise( float_flag_underflow ); 457 if ( roundNearestEven ) { 458 increment = ( (sbits32) zSig2 < 0 ); 459 } 460 else { 461 if ( zSign ) { 462 increment = ( roundingMode == float_round_down ) && zSig2; 463 } 464 else { 465 increment = ( roundingMode == float_round_up ) && zSig2; 466 } 467 } 468 } 469 } 470 if ( zSig2 ) float_exception_flags |= float_flag_inexact; 471 if ( increment ) { 472 add64( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 473 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 474 } 475 else { 476 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 477 } 478 return packFloat64( zSign, zExp, zSig0, zSig1 ); 479 480 } 481 482 /* 483 ------------------------------------------------------------------------------- 484 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 485 and significand formed by the concatenation of `zSig0' and `zSig1', and 486 returns the proper double-precision floating-point value corresponding 487 to the abstract input. This routine is just like `roundAndPackFloat64' 488 except that the input significand has fewer bits and does not have to be 489 normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 490 point exponent. 491 ------------------------------------------------------------------------------- 492 */ 493 static float64 494 normalizeRoundAndPackFloat64( 495 flag zSign, int16 zExp, bits32 zSig0, bits32 zSig1 ) 496 { 497 int8 shiftCount; 498 bits32 zSig2; 499 500 if ( zSig0 == 0 ) { 501 zSig0 = zSig1; 502 zSig1 = 0; 503 zExp -= 32; 504 } 505 shiftCount = countLeadingZeros32( zSig0 ) - 11; 506 if ( 0 <= shiftCount ) { 507 zSig2 = 0; 508 shortShift64Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 509 } 510 else { 511 shift64ExtraRightJamming( 512 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 513 } 514 zExp -= shiftCount; 515 return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 ); 516 517 } 518 519 /* 520 ------------------------------------------------------------------------------- 521 Returns the result of converting the 32-bit two's complement integer `a' to 522 the single-precision floating-point format. The conversion is performed 523 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 524 ------------------------------------------------------------------------------- 525 */ 526 float32 int32_to_float32( int32 a ) 527 { 528 flag zSign; 529 530 if ( a == 0 ) return 0; 531 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 ); 532 zSign = ( a < 0 ); 533 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a ); 534 535 } 536 537 /* 538 ------------------------------------------------------------------------------- 539 Returns the result of converting the 32-bit two's complement integer `a' to 540 the double-precision floating-point format. The conversion is performed 541 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 542 ------------------------------------------------------------------------------- 543 */ 544 float64 int32_to_float64( int32 a ) 545 { 546 flag zSign; 547 bits32 absA; 548 int8 shiftCount; 549 bits32 zSig0, zSig1; 550 551 if ( a == 0 ) return packFloat64( 0, 0, 0, 0 ); 552 zSign = ( a < 0 ); 553 absA = zSign ? - a : a; 554 shiftCount = countLeadingZeros32( absA ) - 11; 555 if ( 0 <= shiftCount ) { 556 zSig0 = absA<<shiftCount; 557 zSig1 = 0; 558 } 559 else { 560 shift64Right( absA, 0, - shiftCount, &zSig0, &zSig1 ); 561 } 562 return packFloat64( zSign, 0x412 - shiftCount, zSig0, zSig1 ); 563 564 } 565 566 #ifndef SOFTFLOAT_FOR_GCC 567 /* 568 ------------------------------------------------------------------------------- 569 Returns the result of converting the single-precision floating-point value 570 `a' to the 32-bit two's complement integer format. The conversion is 571 performed according to the IEC/IEEE Standard for Binary Floating-Point 572 Arithmetic---which means in particular that the conversion is rounded 573 according to the current rounding mode. If `a' is a NaN, the largest 574 positive integer is returned. Otherwise, if the conversion overflows, the 575 largest integer with the same sign as `a' is returned. 576 ------------------------------------------------------------------------------- 577 */ 578 int32 float32_to_int32( float32 a ) 579 { 580 flag aSign; 581 int16 aExp, shiftCount; 582 bits32 aSig, aSigExtra; 583 int32 z; 584 int8 roundingMode; 585 586 aSig = extractFloat32Frac( a ); 587 aExp = extractFloat32Exp( a ); 588 aSign = extractFloat32Sign( a ); 589 shiftCount = aExp - 0x96; 590 if ( 0 <= shiftCount ) { 591 if ( 0x9E <= aExp ) { 592 if ( a != 0xCF000000 ) { 593 float_raise( float_flag_invalid ); 594 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 595 return 0x7FFFFFFF; 596 } 597 } 598 return (sbits32) 0x80000000; 599 } 600 z = ( aSig | 0x00800000 )<<shiftCount; 601 if ( aSign ) z = - z; 602 } 603 else { 604 if ( aExp < 0x7E ) { 605 aSigExtra = aExp | aSig; 606 z = 0; 607 } 608 else { 609 aSig |= 0x00800000; 610 aSigExtra = aSig<<( shiftCount & 31 ); 611 z = aSig>>( - shiftCount ); 612 } 613 if ( aSigExtra ) float_exception_flags |= float_flag_inexact; 614 roundingMode = float_rounding_mode; 615 if ( roundingMode == float_round_nearest_even ) { 616 if ( (sbits32) aSigExtra < 0 ) { 617 ++z; 618 if ( (bits32) ( aSigExtra<<1 ) == 0 ) z &= ~1; 619 } 620 if ( aSign ) z = - z; 621 } 622 else { 623 aSigExtra = ( aSigExtra != 0 ); 624 if ( aSign ) { 625 z += ( roundingMode == float_round_down ) & aSigExtra; 626 z = - z; 627 } 628 else { 629 z += ( roundingMode == float_round_up ) & aSigExtra; 630 } 631 } 632 } 633 return z; 634 635 } 636 #endif 637 638 /* 639 ------------------------------------------------------------------------------- 640 Returns the result of converting the single-precision floating-point value 641 `a' to the 32-bit two's complement integer format. The conversion is 642 performed according to the IEC/IEEE Standard for Binary Floating-Point 643 Arithmetic, except that the conversion is always rounded toward zero. 644 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 645 the conversion overflows, the largest integer with the same sign as `a' is 646 returned. 647 ------------------------------------------------------------------------------- 648 */ 649 int32 float32_to_int32_round_to_zero( float32 a ) 650 { 651 flag aSign; 652 int16 aExp, shiftCount; 653 bits32 aSig; 654 int32 z; 655 656 aSig = extractFloat32Frac( a ); 657 aExp = extractFloat32Exp( a ); 658 aSign = extractFloat32Sign( a ); 659 shiftCount = aExp - 0x9E; 660 if ( 0 <= shiftCount ) { 661 if ( a != 0xCF000000 ) { 662 float_raise( float_flag_invalid ); 663 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF; 664 } 665 return (sbits32) 0x80000000; 666 } 667 else if ( aExp <= 0x7E ) { 668 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact; 669 return 0; 670 } 671 aSig = ( aSig | 0x00800000 )<<8; 672 z = aSig>>( - shiftCount ); 673 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) { 674 float_exception_flags |= float_flag_inexact; 675 } 676 if ( aSign ) z = - z; 677 return z; 678 679 } 680 681 /* 682 ------------------------------------------------------------------------------- 683 Returns the result of converting the single-precision floating-point value 684 `a' to the double-precision floating-point format. The conversion is 685 performed according to the IEC/IEEE Standard for Binary Floating-Point 686 Arithmetic. 687 ------------------------------------------------------------------------------- 688 */ 689 float64 float32_to_float64( float32 a ) 690 { 691 flag aSign; 692 int16 aExp; 693 bits32 aSig, zSig0, zSig1; 694 695 aSig = extractFloat32Frac( a ); 696 aExp = extractFloat32Exp( a ); 697 aSign = extractFloat32Sign( a ); 698 if ( aExp == 0xFF ) { 699 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) ); 700 return packFloat64( aSign, 0x7FF, 0, 0 ); 701 } 702 if ( aExp == 0 ) { 703 if ( aSig == 0 ) return packFloat64( aSign, 0, 0, 0 ); 704 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 705 --aExp; 706 } 707 shift64Right( aSig, 0, 3, &zSig0, &zSig1 ); 708 return packFloat64( aSign, aExp + 0x380, zSig0, zSig1 ); 709 710 } 711 712 #ifndef SOFTFLOAT_FOR_GCC 713 /* 714 ------------------------------------------------------------------------------- 715 Rounds the single-precision floating-point value `a' to an integer, 716 and returns the result as a single-precision floating-point value. The 717 operation is performed according to the IEC/IEEE Standard for Binary 718 Floating-Point Arithmetic. 719 ------------------------------------------------------------------------------- 720 */ 721 float32 float32_round_to_int( float32 a ) 722 { 723 flag aSign; 724 int16 aExp; 725 bits32 lastBitMask, roundBitsMask; 726 int8 roundingMode; 727 float32 z; 728 729 aExp = extractFloat32Exp( a ); 730 if ( 0x96 <= aExp ) { 731 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) { 732 return propagateFloat32NaN( a, a ); 733 } 734 return a; 735 } 736 if ( aExp <= 0x7E ) { 737 if ( (bits32) ( a<<1 ) == 0 ) return a; 738 float_exception_flags |= float_flag_inexact; 739 aSign = extractFloat32Sign( a ); 740 switch ( float_rounding_mode ) { 741 case float_round_nearest_even: 742 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) { 743 return packFloat32( aSign, 0x7F, 0 ); 744 } 745 break; 746 case float_round_to_zero: 747 break; 748 case float_round_down: 749 return aSign ? 0xBF800000 : 0; 750 case float_round_up: 751 return aSign ? 0x80000000 : 0x3F800000; 752 } 753 return packFloat32( aSign, 0, 0 ); 754 } 755 lastBitMask = 1; 756 lastBitMask <<= 0x96 - aExp; 757 roundBitsMask = lastBitMask - 1; 758 z = a; 759 roundingMode = float_rounding_mode; 760 if ( roundingMode == float_round_nearest_even ) { 761 z += lastBitMask>>1; 762 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 763 } 764 else if ( roundingMode != float_round_to_zero ) { 765 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) { 766 z += roundBitsMask; 767 } 768 } 769 z &= ~ roundBitsMask; 770 if ( z != a ) float_exception_flags |= float_flag_inexact; 771 return z; 772 773 } 774 #endif 775 776 /* 777 ------------------------------------------------------------------------------- 778 Returns the result of adding the absolute values of the single-precision 779 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 780 before being returned. `zSign' is ignored if the result is a NaN. 781 The addition is performed according to the IEC/IEEE Standard for Binary 782 Floating-Point Arithmetic. 783 ------------------------------------------------------------------------------- 784 */ 785 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign ) 786 { 787 int16 aExp, bExp, zExp; 788 bits32 aSig, bSig, zSig; 789 int16 expDiff; 790 791 aSig = extractFloat32Frac( a ); 792 aExp = extractFloat32Exp( a ); 793 bSig = extractFloat32Frac( b ); 794 bExp = extractFloat32Exp( b ); 795 expDiff = aExp - bExp; 796 aSig <<= 6; 797 bSig <<= 6; 798 if ( 0 < expDiff ) { 799 if ( aExp == 0xFF ) { 800 if ( aSig ) return propagateFloat32NaN( a, b ); 801 return a; 802 } 803 if ( bExp == 0 ) { 804 --expDiff; 805 } 806 else { 807 bSig |= 0x20000000; 808 } 809 shift32RightJamming( bSig, expDiff, &bSig ); 810 zExp = aExp; 811 } 812 else if ( expDiff < 0 ) { 813 if ( bExp == 0xFF ) { 814 if ( bSig ) return propagateFloat32NaN( a, b ); 815 return packFloat32( zSign, 0xFF, 0 ); 816 } 817 if ( aExp == 0 ) { 818 ++expDiff; 819 } 820 else { 821 aSig |= 0x20000000; 822 } 823 shift32RightJamming( aSig, - expDiff, &aSig ); 824 zExp = bExp; 825 } 826 else { 827 if ( aExp == 0xFF ) { 828 if ( aSig | bSig ) return propagateFloat32NaN( a, b ); 829 return a; 830 } 831 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 ); 832 zSig = 0x40000000 + aSig + bSig; 833 zExp = aExp; 834 goto roundAndPack; 835 } 836 aSig |= 0x20000000; 837 zSig = ( aSig + bSig )<<1; 838 --zExp; 839 if ( (sbits32) zSig < 0 ) { 840 zSig = aSig + bSig; 841 ++zExp; 842 } 843 roundAndPack: 844 return roundAndPackFloat32( zSign, zExp, zSig ); 845 846 } 847 848 /* 849 ------------------------------------------------------------------------------- 850 Returns the result of subtracting the absolute values of the single- 851 precision floating-point values `a' and `b'. If `zSign' is 1, the 852 difference is negated before being returned. `zSign' is ignored if the 853 result is a NaN. The subtraction is performed according to the IEC/IEEE 854 Standard for Binary Floating-Point Arithmetic. 855 ------------------------------------------------------------------------------- 856 */ 857 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign ) 858 { 859 int16 aExp, bExp, zExp; 860 bits32 aSig, bSig, zSig; 861 int16 expDiff; 862 863 aSig = extractFloat32Frac( a ); 864 aExp = extractFloat32Exp( a ); 865 bSig = extractFloat32Frac( b ); 866 bExp = extractFloat32Exp( b ); 867 expDiff = aExp - bExp; 868 aSig <<= 7; 869 bSig <<= 7; 870 if ( 0 < expDiff ) goto aExpBigger; 871 if ( expDiff < 0 ) goto bExpBigger; 872 if ( aExp == 0xFF ) { 873 if ( aSig | bSig ) return propagateFloat32NaN( a, b ); 874 float_raise( float_flag_invalid ); 875 return float32_default_nan; 876 } 877 if ( aExp == 0 ) { 878 aExp = 1; 879 bExp = 1; 880 } 881 if ( bSig < aSig ) goto aBigger; 882 if ( aSig < bSig ) goto bBigger; 883 return packFloat32( float_rounding_mode == float_round_down, 0, 0 ); 884 bExpBigger: 885 if ( bExp == 0xFF ) { 886 if ( bSig ) return propagateFloat32NaN( a, b ); 887 return packFloat32( zSign ^ 1, 0xFF, 0 ); 888 } 889 if ( aExp == 0 ) { 890 ++expDiff; 891 } 892 else { 893 aSig |= 0x40000000; 894 } 895 shift32RightJamming( aSig, - expDiff, &aSig ); 896 bSig |= 0x40000000; 897 bBigger: 898 zSig = bSig - aSig; 899 zExp = bExp; 900 zSign ^= 1; 901 goto normalizeRoundAndPack; 902 aExpBigger: 903 if ( aExp == 0xFF ) { 904 if ( aSig ) return propagateFloat32NaN( a, b ); 905 return a; 906 } 907 if ( bExp == 0 ) { 908 --expDiff; 909 } 910 else { 911 bSig |= 0x40000000; 912 } 913 shift32RightJamming( bSig, expDiff, &bSig ); 914 aSig |= 0x40000000; 915 aBigger: 916 zSig = aSig - bSig; 917 zExp = aExp; 918 normalizeRoundAndPack: 919 --zExp; 920 return normalizeRoundAndPackFloat32( zSign, zExp, zSig ); 921 922 } 923 924 /* 925 ------------------------------------------------------------------------------- 926 Returns the result of adding the single-precision floating-point values `a' 927 and `b'. The operation is performed according to the IEC/IEEE Standard for 928 Binary Floating-Point Arithmetic. 929 ------------------------------------------------------------------------------- 930 */ 931 float32 float32_add( float32 a, float32 b ) 932 { 933 flag aSign, bSign; 934 935 aSign = extractFloat32Sign( a ); 936 bSign = extractFloat32Sign( b ); 937 if ( aSign == bSign ) { 938 return addFloat32Sigs( a, b, aSign ); 939 } 940 else { 941 return subFloat32Sigs( a, b, aSign ); 942 } 943 944 } 945 946 /* 947 ------------------------------------------------------------------------------- 948 Returns the result of subtracting the single-precision floating-point values 949 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 950 for Binary Floating-Point Arithmetic. 951 ------------------------------------------------------------------------------- 952 */ 953 float32 float32_sub( float32 a, float32 b ) 954 { 955 flag aSign, bSign; 956 957 aSign = extractFloat32Sign( a ); 958 bSign = extractFloat32Sign( b ); 959 if ( aSign == bSign ) { 960 return subFloat32Sigs( a, b, aSign ); 961 } 962 else { 963 return addFloat32Sigs( a, b, aSign ); 964 } 965 966 } 967 968 /* 969 ------------------------------------------------------------------------------- 970 Returns the result of multiplying the single-precision floating-point values 971 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 972 for Binary Floating-Point Arithmetic. 973 ------------------------------------------------------------------------------- 974 */ 975 float32 float32_mul( float32 a, float32 b ) 976 { 977 flag aSign, bSign, zSign; 978 int16 aExp, bExp, zExp; 979 bits32 aSig, bSig, zSig0, zSig1; 980 981 aSig = extractFloat32Frac( a ); 982 aExp = extractFloat32Exp( a ); 983 aSign = extractFloat32Sign( a ); 984 bSig = extractFloat32Frac( b ); 985 bExp = extractFloat32Exp( b ); 986 bSign = extractFloat32Sign( b ); 987 zSign = aSign ^ bSign; 988 if ( aExp == 0xFF ) { 989 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 990 return propagateFloat32NaN( a, b ); 991 } 992 if ( ( bExp | bSig ) == 0 ) { 993 float_raise( float_flag_invalid ); 994 return float32_default_nan; 995 } 996 return packFloat32( zSign, 0xFF, 0 ); 997 } 998 if ( bExp == 0xFF ) { 999 if ( bSig ) return propagateFloat32NaN( a, b ); 1000 if ( ( aExp | aSig ) == 0 ) { 1001 float_raise( float_flag_invalid ); 1002 return float32_default_nan; 1003 } 1004 return packFloat32( zSign, 0xFF, 0 ); 1005 } 1006 if ( aExp == 0 ) { 1007 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 1008 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1009 } 1010 if ( bExp == 0 ) { 1011 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 ); 1012 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1013 } 1014 zExp = aExp + bExp - 0x7F; 1015 aSig = ( aSig | 0x00800000 )<<7; 1016 bSig = ( bSig | 0x00800000 )<<8; 1017 mul32To64( aSig, bSig, &zSig0, &zSig1 ); 1018 zSig0 |= ( zSig1 != 0 ); 1019 if ( 0 <= (sbits32) ( zSig0<<1 ) ) { 1020 zSig0 <<= 1; 1021 --zExp; 1022 } 1023 return roundAndPackFloat32( zSign, zExp, zSig0 ); 1024 1025 } 1026 1027 /* 1028 ------------------------------------------------------------------------------- 1029 Returns the result of dividing the single-precision floating-point value `a' 1030 by the corresponding value `b'. The operation is performed according to the 1031 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1032 ------------------------------------------------------------------------------- 1033 */ 1034 float32 float32_div( float32 a, float32 b ) 1035 { 1036 flag aSign, bSign, zSign; 1037 int16 aExp, bExp, zExp; 1038 bits32 aSig, bSig, zSig, rem0, rem1, term0, term1; 1039 1040 aSig = extractFloat32Frac( a ); 1041 aExp = extractFloat32Exp( a ); 1042 aSign = extractFloat32Sign( a ); 1043 bSig = extractFloat32Frac( b ); 1044 bExp = extractFloat32Exp( b ); 1045 bSign = extractFloat32Sign( b ); 1046 zSign = aSign ^ bSign; 1047 if ( aExp == 0xFF ) { 1048 if ( aSig ) return propagateFloat32NaN( a, b ); 1049 if ( bExp == 0xFF ) { 1050 if ( bSig ) return propagateFloat32NaN( a, b ); 1051 float_raise( float_flag_invalid ); 1052 return float32_default_nan; 1053 } 1054 return packFloat32( zSign, 0xFF, 0 ); 1055 } 1056 if ( bExp == 0xFF ) { 1057 if ( bSig ) return propagateFloat32NaN( a, b ); 1058 return packFloat32( zSign, 0, 0 ); 1059 } 1060 if ( bExp == 0 ) { 1061 if ( bSig == 0 ) { 1062 if ( ( aExp | aSig ) == 0 ) { 1063 float_raise( float_flag_invalid ); 1064 return float32_default_nan; 1065 } 1066 float_raise( float_flag_divbyzero ); 1067 return packFloat32( zSign, 0xFF, 0 ); 1068 } 1069 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1070 } 1071 if ( aExp == 0 ) { 1072 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 1073 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1074 } 1075 zExp = aExp - bExp + 0x7D; 1076 aSig = ( aSig | 0x00800000 )<<7; 1077 bSig = ( bSig | 0x00800000 )<<8; 1078 if ( bSig <= ( aSig + aSig ) ) { 1079 aSig >>= 1; 1080 ++zExp; 1081 } 1082 zSig = estimateDiv64To32( aSig, 0, bSig ); 1083 if ( ( zSig & 0x3F ) <= 2 ) { 1084 mul32To64( bSig, zSig, &term0, &term1 ); 1085 sub64( aSig, 0, term0, term1, &rem0, &rem1 ); 1086 while ( (sbits32) rem0 < 0 ) { 1087 --zSig; 1088 add64( rem0, rem1, 0, bSig, &rem0, &rem1 ); 1089 } 1090 zSig |= ( rem1 != 0 ); 1091 } 1092 return roundAndPackFloat32( zSign, zExp, zSig ); 1093 1094 } 1095 1096 #ifndef SOFTFLOAT_FOR_GCC 1097 /* 1098 ------------------------------------------------------------------------------- 1099 Returns the remainder of the single-precision floating-point value `a' 1100 with respect to the corresponding value `b'. The operation is performed 1101 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1102 ------------------------------------------------------------------------------- 1103 */ 1104 float32 float32_rem( float32 a, float32 b ) 1105 { 1106 flag aSign, bSign, zSign; 1107 int16 aExp, bExp, expDiff; 1108 bits32 aSig, bSig, q, allZero, alternateASig; 1109 sbits32 sigMean; 1110 1111 aSig = extractFloat32Frac( a ); 1112 aExp = extractFloat32Exp( a ); 1113 aSign = extractFloat32Sign( a ); 1114 bSig = extractFloat32Frac( b ); 1115 bExp = extractFloat32Exp( b ); 1116 bSign = extractFloat32Sign( b ); 1117 if ( aExp == 0xFF ) { 1118 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 1119 return propagateFloat32NaN( a, b ); 1120 } 1121 float_raise( float_flag_invalid ); 1122 return float32_default_nan; 1123 } 1124 if ( bExp == 0xFF ) { 1125 if ( bSig ) return propagateFloat32NaN( a, b ); 1126 return a; 1127 } 1128 if ( bExp == 0 ) { 1129 if ( bSig == 0 ) { 1130 float_raise( float_flag_invalid ); 1131 return float32_default_nan; 1132 } 1133 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1134 } 1135 if ( aExp == 0 ) { 1136 if ( aSig == 0 ) return a; 1137 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1138 } 1139 expDiff = aExp - bExp; 1140 aSig = ( aSig | 0x00800000 )<<8; 1141 bSig = ( bSig | 0x00800000 )<<8; 1142 if ( expDiff < 0 ) { 1143 if ( expDiff < -1 ) return a; 1144 aSig >>= 1; 1145 } 1146 q = ( bSig <= aSig ); 1147 if ( q ) aSig -= bSig; 1148 expDiff -= 32; 1149 while ( 0 < expDiff ) { 1150 q = estimateDiv64To32( aSig, 0, bSig ); 1151 q = ( 2 < q ) ? q - 2 : 0; 1152 aSig = - ( ( bSig>>2 ) * q ); 1153 expDiff -= 30; 1154 } 1155 expDiff += 32; 1156 if ( 0 < expDiff ) { 1157 q = estimateDiv64To32( aSig, 0, bSig ); 1158 q = ( 2 < q ) ? q - 2 : 0; 1159 q >>= 32 - expDiff; 1160 bSig >>= 2; 1161 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 1162 } 1163 else { 1164 aSig >>= 2; 1165 bSig >>= 2; 1166 } 1167 do { 1168 alternateASig = aSig; 1169 ++q; 1170 aSig -= bSig; 1171 } while ( 0 <= (sbits32) aSig ); 1172 sigMean = aSig + alternateASig; 1173 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 1174 aSig = alternateASig; 1175 } 1176 zSign = ( (sbits32) aSig < 0 ); 1177 if ( zSign ) aSig = - aSig; 1178 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig ); 1179 1180 } 1181 #endif 1182 1183 #ifndef SOFTFLOAT_FOR_GCC 1184 /* 1185 ------------------------------------------------------------------------------- 1186 Returns the square root of the single-precision floating-point value `a'. 1187 The operation is performed according to the IEC/IEEE Standard for Binary 1188 Floating-Point Arithmetic. 1189 ------------------------------------------------------------------------------- 1190 */ 1191 float32 float32_sqrt( float32 a ) 1192 { 1193 flag aSign; 1194 int16 aExp, zExp; 1195 bits32 aSig, zSig, rem0, rem1, term0, term1; 1196 1197 aSig = extractFloat32Frac( a ); 1198 aExp = extractFloat32Exp( a ); 1199 aSign = extractFloat32Sign( a ); 1200 if ( aExp == 0xFF ) { 1201 if ( aSig ) return propagateFloat32NaN( a, 0 ); 1202 if ( ! aSign ) return a; 1203 float_raise( float_flag_invalid ); 1204 return float32_default_nan; 1205 } 1206 if ( aSign ) { 1207 if ( ( aExp | aSig ) == 0 ) return a; 1208 float_raise( float_flag_invalid ); 1209 return float32_default_nan; 1210 } 1211 if ( aExp == 0 ) { 1212 if ( aSig == 0 ) return 0; 1213 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1214 } 1215 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E; 1216 aSig = ( aSig | 0x00800000 )<<8; 1217 zSig = estimateSqrt32( aExp, aSig ) + 2; 1218 if ( ( zSig & 0x7F ) <= 5 ) { 1219 if ( zSig < 2 ) { 1220 zSig = 0x7FFFFFFF; 1221 goto roundAndPack; 1222 } 1223 else { 1224 aSig >>= aExp & 1; 1225 mul32To64( zSig, zSig, &term0, &term1 ); 1226 sub64( aSig, 0, term0, term1, &rem0, &rem1 ); 1227 while ( (sbits32) rem0 < 0 ) { 1228 --zSig; 1229 shortShift64Left( 0, zSig, 1, &term0, &term1 ); 1230 term1 |= 1; 1231 add64( rem0, rem1, term0, term1, &rem0, &rem1 ); 1232 } 1233 zSig |= ( ( rem0 | rem1 ) != 0 ); 1234 } 1235 } 1236 shift32RightJamming( zSig, 1, &zSig ); 1237 roundAndPack: 1238 return roundAndPackFloat32( 0, zExp, zSig ); 1239 1240 } 1241 #endif 1242 1243 /* 1244 ------------------------------------------------------------------------------- 1245 Returns 1 if the single-precision floating-point value `a' is equal to 1246 the corresponding value `b', and 0 otherwise. The comparison is performed 1247 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1248 ------------------------------------------------------------------------------- 1249 */ 1250 flag float32_eq( float32 a, float32 b ) 1251 { 1252 1253 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 1254 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 1255 ) { 1256 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 1257 float_raise( float_flag_invalid ); 1258 } 1259 return 0; 1260 } 1261 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); 1262 1263 } 1264 1265 /* 1266 ------------------------------------------------------------------------------- 1267 Returns 1 if the single-precision floating-point value `a' is less than 1268 or equal to the corresponding value `b', and 0 otherwise. The comparison 1269 is performed according to the IEC/IEEE Standard for Binary Floating-Point 1270 Arithmetic. 1271 ------------------------------------------------------------------------------- 1272 */ 1273 flag float32_le( float32 a, float32 b ) 1274 { 1275 flag aSign, bSign; 1276 1277 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 1278 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 1279 ) { 1280 float_raise( float_flag_invalid ); 1281 return 0; 1282 } 1283 aSign = extractFloat32Sign( a ); 1284 bSign = extractFloat32Sign( b ); 1285 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); 1286 return ( a == b ) || ( aSign ^ ( a < b ) ); 1287 1288 } 1289 1290 /* 1291 ------------------------------------------------------------------------------- 1292 Returns 1 if the single-precision floating-point value `a' is less than 1293 the corresponding value `b', and 0 otherwise. The comparison is performed 1294 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1295 ------------------------------------------------------------------------------- 1296 */ 1297 flag float32_lt( float32 a, float32 b ) 1298 { 1299 flag aSign, bSign; 1300 1301 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 1302 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 1303 ) { 1304 float_raise( float_flag_invalid ); 1305 return 0; 1306 } 1307 aSign = extractFloat32Sign( a ); 1308 bSign = extractFloat32Sign( b ); 1309 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); 1310 return ( a != b ) && ( aSign ^ ( a < b ) ); 1311 1312 } 1313 1314 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 1315 /* 1316 ------------------------------------------------------------------------------- 1317 Returns 1 if the single-precision floating-point value `a' is equal to 1318 the corresponding value `b', and 0 otherwise. The invalid exception is 1319 raised if either operand is a NaN. Otherwise, the comparison is performed 1320 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1321 ------------------------------------------------------------------------------- 1322 */ 1323 flag float32_eq_signaling( float32 a, float32 b ) 1324 { 1325 1326 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 1327 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 1328 ) { 1329 float_raise( float_flag_invalid ); 1330 return 0; 1331 } 1332 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); 1333 1334 } 1335 1336 /* 1337 ------------------------------------------------------------------------------- 1338 Returns 1 if the single-precision floating-point value `a' is less than or 1339 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 1340 cause an exception. Otherwise, the comparison is performed according to the 1341 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1342 ------------------------------------------------------------------------------- 1343 */ 1344 flag float32_le_quiet( float32 a, float32 b ) 1345 { 1346 flag aSign, bSign; 1347 int16 aExp, bExp; 1348 1349 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 1350 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 1351 ) { 1352 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 1353 float_raise( float_flag_invalid ); 1354 } 1355 return 0; 1356 } 1357 aSign = extractFloat32Sign( a ); 1358 bSign = extractFloat32Sign( b ); 1359 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); 1360 return ( a == b ) || ( aSign ^ ( a < b ) ); 1361 1362 } 1363 1364 /* 1365 ------------------------------------------------------------------------------- 1366 Returns 1 if the single-precision floating-point value `a' is less than 1367 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 1368 exception. Otherwise, the comparison is performed according to the IEC/IEEE 1369 Standard for Binary Floating-Point Arithmetic. 1370 ------------------------------------------------------------------------------- 1371 */ 1372 flag float32_lt_quiet( float32 a, float32 b ) 1373 { 1374 flag aSign, bSign; 1375 1376 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 1377 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 1378 ) { 1379 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 1380 float_raise( float_flag_invalid ); 1381 } 1382 return 0; 1383 } 1384 aSign = extractFloat32Sign( a ); 1385 bSign = extractFloat32Sign( b ); 1386 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); 1387 return ( a != b ) && ( aSign ^ ( a < b ) ); 1388 1389 } 1390 #endif /* !SOFTFLOAT_FOR_GCC */ 1391 1392 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 1393 /* 1394 ------------------------------------------------------------------------------- 1395 Returns the result of converting the double-precision floating-point value 1396 `a' to the 32-bit two's complement integer format. The conversion is 1397 performed according to the IEC/IEEE Standard for Binary Floating-Point 1398 Arithmetic---which means in particular that the conversion is rounded 1399 according to the current rounding mode. If `a' is a NaN, the largest 1400 positive integer is returned. Otherwise, if the conversion overflows, the 1401 largest integer with the same sign as `a' is returned. 1402 ------------------------------------------------------------------------------- 1403 */ 1404 int32 float64_to_int32( float64 a ) 1405 { 1406 flag aSign; 1407 int16 aExp, shiftCount; 1408 bits32 aSig0, aSig1, absZ, aSigExtra; 1409 int32 z; 1410 int8 roundingMode; 1411 1412 aSig1 = extractFloat64Frac1( a ); 1413 aSig0 = extractFloat64Frac0( a ); 1414 aExp = extractFloat64Exp( a ); 1415 aSign = extractFloat64Sign( a ); 1416 shiftCount = aExp - 0x413; 1417 if ( 0 <= shiftCount ) { 1418 if ( 0x41E < aExp ) { 1419 if ( ( aExp == 0x7FF ) && ( aSig0 | aSig1 ) ) aSign = 0; 1420 goto invalid; 1421 } 1422 shortShift64Left( 1423 aSig0 | 0x00100000, aSig1, shiftCount, &absZ, &aSigExtra ); 1424 if ( 0x80000000 < absZ ) goto invalid; 1425 } 1426 else { 1427 aSig1 = ( aSig1 != 0 ); 1428 if ( aExp < 0x3FE ) { 1429 aSigExtra = aExp | aSig0 | aSig1; 1430 absZ = 0; 1431 } 1432 else { 1433 aSig0 |= 0x00100000; 1434 aSigExtra = ( aSig0<<( shiftCount & 31 ) ) | aSig1; 1435 absZ = aSig0>>( - shiftCount ); 1436 } 1437 } 1438 roundingMode = float_rounding_mode; 1439 if ( roundingMode == float_round_nearest_even ) { 1440 if ( (sbits32) aSigExtra < 0 ) { 1441 ++absZ; 1442 if ( (bits32) ( aSigExtra<<1 ) == 0 ) absZ &= ~1; 1443 } 1444 z = aSign ? - absZ : absZ; 1445 } 1446 else { 1447 aSigExtra = ( aSigExtra != 0 ); 1448 if ( aSign ) { 1449 z = - ( absZ 1450 + ( ( roundingMode == float_round_down ) & aSigExtra ) ); 1451 } 1452 else { 1453 z = absZ + ( ( roundingMode == float_round_up ) & aSigExtra ); 1454 } 1455 } 1456 if ( ( aSign ^ ( z < 0 ) ) && z ) { 1457 invalid: 1458 float_raise( float_flag_invalid ); 1459 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 1460 } 1461 if ( aSigExtra ) float_exception_flags |= float_flag_inexact; 1462 return z; 1463 1464 } 1465 #endif /* !SOFTFLOAT_FOR_GCC */ 1466 1467 /* 1468 ------------------------------------------------------------------------------- 1469 Returns the result of converting the double-precision floating-point value 1470 `a' to the 32-bit two's complement integer format. The conversion is 1471 performed according to the IEC/IEEE Standard for Binary Floating-Point 1472 Arithmetic, except that the conversion is always rounded toward zero. 1473 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 1474 the conversion overflows, the largest integer with the same sign as `a' is 1475 returned. 1476 ------------------------------------------------------------------------------- 1477 */ 1478 int32 float64_to_int32_round_to_zero( float64 a ) 1479 { 1480 flag aSign; 1481 int16 aExp, shiftCount; 1482 bits32 aSig0, aSig1, absZ, aSigExtra; 1483 int32 z; 1484 1485 aSig1 = extractFloat64Frac1( a ); 1486 aSig0 = extractFloat64Frac0( a ); 1487 aExp = extractFloat64Exp( a ); 1488 aSign = extractFloat64Sign( a ); 1489 shiftCount = aExp - 0x413; 1490 if ( 0 <= shiftCount ) { 1491 if ( 0x41E < aExp ) { 1492 if ( ( aExp == 0x7FF ) && ( aSig0 | aSig1 ) ) aSign = 0; 1493 goto invalid; 1494 } 1495 shortShift64Left( 1496 aSig0 | 0x00100000, aSig1, shiftCount, &absZ, &aSigExtra ); 1497 } 1498 else { 1499 if ( aExp < 0x3FF ) { 1500 if ( aExp | aSig0 | aSig1 ) { 1501 float_exception_flags |= float_flag_inexact; 1502 } 1503 return 0; 1504 } 1505 aSig0 |= 0x00100000; 1506 aSigExtra = ( aSig0<<( shiftCount & 31 ) ) | aSig1; 1507 absZ = aSig0>>( - shiftCount ); 1508 } 1509 z = aSign ? - absZ : absZ; 1510 if ( ( aSign ^ ( z < 0 ) ) && z ) { 1511 invalid: 1512 float_raise( float_flag_invalid ); 1513 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 1514 } 1515 if ( aSigExtra ) float_exception_flags |= float_flag_inexact; 1516 return z; 1517 1518 } 1519 1520 /* 1521 ------------------------------------------------------------------------------- 1522 Returns the result of converting the double-precision floating-point value 1523 `a' to the single-precision floating-point format. The conversion is 1524 performed according to the IEC/IEEE Standard for Binary Floating-Point 1525 Arithmetic. 1526 ------------------------------------------------------------------------------- 1527 */ 1528 float32 float64_to_float32( float64 a ) 1529 { 1530 flag aSign; 1531 int16 aExp; 1532 bits32 aSig0, aSig1, zSig; 1533 bits32 allZero; 1534 1535 aSig1 = extractFloat64Frac1( a ); 1536 aSig0 = extractFloat64Frac0( a ); 1537 aExp = extractFloat64Exp( a ); 1538 aSign = extractFloat64Sign( a ); 1539 if ( aExp == 0x7FF ) { 1540 if ( aSig0 | aSig1 ) { 1541 return commonNaNToFloat32( float64ToCommonNaN( a ) ); 1542 } 1543 return packFloat32( aSign, 0xFF, 0 ); 1544 } 1545 shift64RightJamming( aSig0, aSig1, 22, &allZero, &zSig ); 1546 if ( aExp ) zSig |= 0x40000000; 1547 return roundAndPackFloat32( aSign, aExp - 0x381, zSig ); 1548 1549 } 1550 1551 #ifndef SOFTFLOAT_FOR_GCC 1552 /* 1553 ------------------------------------------------------------------------------- 1554 Rounds the double-precision floating-point value `a' to an integer, 1555 and returns the result as a double-precision floating-point value. The 1556 operation is performed according to the IEC/IEEE Standard for Binary 1557 Floating-Point Arithmetic. 1558 ------------------------------------------------------------------------------- 1559 */ 1560 float64 float64_round_to_int( float64 a ) 1561 { 1562 flag aSign; 1563 int16 aExp; 1564 bits32 lastBitMask, roundBitsMask; 1565 int8 roundingMode; 1566 float64 z; 1567 1568 aExp = extractFloat64Exp( a ); 1569 if ( 0x413 <= aExp ) { 1570 if ( 0x433 <= aExp ) { 1571 if ( ( aExp == 0x7FF ) 1572 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) ) { 1573 return propagateFloat64NaN( a, a ); 1574 } 1575 return a; 1576 } 1577 lastBitMask = 1; 1578 lastBitMask = ( lastBitMask<<( 0x432 - aExp ) )<<1; 1579 roundBitsMask = lastBitMask - 1; 1580 z = a; 1581 roundingMode = float_rounding_mode; 1582 if ( roundingMode == float_round_nearest_even ) { 1583 if ( lastBitMask ) { 1584 add64( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 1585 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 1586 } 1587 else { 1588 if ( (sbits32) z.low < 0 ) { 1589 ++z.high; 1590 if ( (bits32) ( z.low<<1 ) == 0 ) z.high &= ~1; 1591 } 1592 } 1593 } 1594 else if ( roundingMode != float_round_to_zero ) { 1595 if ( extractFloat64Sign( z ) 1596 ^ ( roundingMode == float_round_up ) ) { 1597 add64( z.high, z.low, 0, roundBitsMask, &z.high, &z.low ); 1598 } 1599 } 1600 z.low &= ~ roundBitsMask; 1601 } 1602 else { 1603 if ( aExp <= 0x3FE ) { 1604 if ( ( ( (bits32) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 1605 float_exception_flags |= float_flag_inexact; 1606 aSign = extractFloat64Sign( a ); 1607 switch ( float_rounding_mode ) { 1608 case float_round_nearest_even: 1609 if ( ( aExp == 0x3FE ) 1610 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) 1611 ) { 1612 return packFloat64( aSign, 0x3FF, 0, 0 ); 1613 } 1614 break; 1615 case float_round_down: 1616 return 1617 aSign ? packFloat64( 1, 0x3FF, 0, 0 ) 1618 : packFloat64( 0, 0, 0, 0 ); 1619 case float_round_up: 1620 return 1621 aSign ? packFloat64( 1, 0, 0, 0 ) 1622 : packFloat64( 0, 0x3FF, 0, 0 ); 1623 } 1624 return packFloat64( aSign, 0, 0, 0 ); 1625 } 1626 lastBitMask = 1; 1627 lastBitMask <<= 0x413 - aExp; 1628 roundBitsMask = lastBitMask - 1; 1629 z.low = 0; 1630 z.high = a.high; 1631 roundingMode = float_rounding_mode; 1632 if ( roundingMode == float_round_nearest_even ) { 1633 z.high += lastBitMask>>1; 1634 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 1635 z.high &= ~ lastBitMask; 1636 } 1637 } 1638 else if ( roundingMode != float_round_to_zero ) { 1639 if ( extractFloat64Sign( z ) 1640 ^ ( roundingMode == float_round_up ) ) { 1641 z.high |= ( a.low != 0 ); 1642 z.high += roundBitsMask; 1643 } 1644 } 1645 z.high &= ~ roundBitsMask; 1646 } 1647 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 1648 float_exception_flags |= float_flag_inexact; 1649 } 1650 return z; 1651 1652 } 1653 #endif 1654 1655 /* 1656 ------------------------------------------------------------------------------- 1657 Returns the result of adding the absolute values of the double-precision 1658 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 1659 before being returned. `zSign' is ignored if the result is a NaN. 1660 The addition is performed according to the IEC/IEEE Standard for Binary 1661 Floating-Point Arithmetic. 1662 ------------------------------------------------------------------------------- 1663 */ 1664 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign ) 1665 { 1666 int16 aExp, bExp, zExp; 1667 bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 1668 int16 expDiff; 1669 1670 aSig1 = extractFloat64Frac1( a ); 1671 aSig0 = extractFloat64Frac0( a ); 1672 aExp = extractFloat64Exp( a ); 1673 bSig1 = extractFloat64Frac1( b ); 1674 bSig0 = extractFloat64Frac0( b ); 1675 bExp = extractFloat64Exp( b ); 1676 expDiff = aExp - bExp; 1677 if ( 0 < expDiff ) { 1678 if ( aExp == 0x7FF ) { 1679 if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, b ); 1680 return a; 1681 } 1682 if ( bExp == 0 ) { 1683 --expDiff; 1684 } 1685 else { 1686 bSig0 |= 0x00100000; 1687 } 1688 shift64ExtraRightJamming( 1689 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 1690 zExp = aExp; 1691 } 1692 else if ( expDiff < 0 ) { 1693 if ( bExp == 0x7FF ) { 1694 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b ); 1695 return packFloat64( zSign, 0x7FF, 0, 0 ); 1696 } 1697 if ( aExp == 0 ) { 1698 ++expDiff; 1699 } 1700 else { 1701 aSig0 |= 0x00100000; 1702 } 1703 shift64ExtraRightJamming( 1704 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 1705 zExp = bExp; 1706 } 1707 else { 1708 if ( aExp == 0x7FF ) { 1709 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 1710 return propagateFloat64NaN( a, b ); 1711 } 1712 return a; 1713 } 1714 add64( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 1715 if ( aExp == 0 ) return packFloat64( zSign, 0, zSig0, zSig1 ); 1716 zSig2 = 0; 1717 zSig0 |= 0x00200000; 1718 zExp = aExp; 1719 goto shiftRight1; 1720 } 1721 aSig0 |= 0x00100000; 1722 add64( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 1723 --zExp; 1724 if ( zSig0 < 0x00200000 ) goto roundAndPack; 1725 ++zExp; 1726 shiftRight1: 1727 shift64ExtraRightJamming( zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 1728 roundAndPack: 1729 return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 ); 1730 1731 } 1732 1733 /* 1734 ------------------------------------------------------------------------------- 1735 Returns the result of subtracting the absolute values of the double- 1736 precision floating-point values `a' and `b'. If `zSign' is 1, the 1737 difference is negated before being returned. `zSign' is ignored if the 1738 result is a NaN. The subtraction is performed according to the IEC/IEEE 1739 Standard for Binary Floating-Point Arithmetic. 1740 ------------------------------------------------------------------------------- 1741 */ 1742 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign ) 1743 { 1744 int16 aExp, bExp, zExp; 1745 bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 1746 int16 expDiff; 1747 1748 aSig1 = extractFloat64Frac1( a ); 1749 aSig0 = extractFloat64Frac0( a ); 1750 aExp = extractFloat64Exp( a ); 1751 bSig1 = extractFloat64Frac1( b ); 1752 bSig0 = extractFloat64Frac0( b ); 1753 bExp = extractFloat64Exp( b ); 1754 expDiff = aExp - bExp; 1755 shortShift64Left( aSig0, aSig1, 10, &aSig0, &aSig1 ); 1756 shortShift64Left( bSig0, bSig1, 10, &bSig0, &bSig1 ); 1757 if ( 0 < expDiff ) goto aExpBigger; 1758 if ( expDiff < 0 ) goto bExpBigger; 1759 if ( aExp == 0x7FF ) { 1760 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 1761 return propagateFloat64NaN( a, b ); 1762 } 1763 float_raise( float_flag_invalid ); 1764 return float64_default_nan; 1765 } 1766 if ( aExp == 0 ) { 1767 aExp = 1; 1768 bExp = 1; 1769 } 1770 if ( bSig0 < aSig0 ) goto aBigger; 1771 if ( aSig0 < bSig0 ) goto bBigger; 1772 if ( bSig1 < aSig1 ) goto aBigger; 1773 if ( aSig1 < bSig1 ) goto bBigger; 1774 return packFloat64( float_rounding_mode == float_round_down, 0, 0, 0 ); 1775 bExpBigger: 1776 if ( bExp == 0x7FF ) { 1777 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b ); 1778 return packFloat64( zSign ^ 1, 0x7FF, 0, 0 ); 1779 } 1780 if ( aExp == 0 ) { 1781 ++expDiff; 1782 } 1783 else { 1784 aSig0 |= 0x40000000; 1785 } 1786 shift64RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 1787 bSig0 |= 0x40000000; 1788 bBigger: 1789 sub64( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 1790 zExp = bExp; 1791 zSign ^= 1; 1792 goto normalizeRoundAndPack; 1793 aExpBigger: 1794 if ( aExp == 0x7FF ) { 1795 if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, b ); 1796 return a; 1797 } 1798 if ( bExp == 0 ) { 1799 --expDiff; 1800 } 1801 else { 1802 bSig0 |= 0x40000000; 1803 } 1804 shift64RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 1805 aSig0 |= 0x40000000; 1806 aBigger: 1807 sub64( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 1808 zExp = aExp; 1809 normalizeRoundAndPack: 1810 --zExp; 1811 return normalizeRoundAndPackFloat64( zSign, zExp - 10, zSig0, zSig1 ); 1812 1813 } 1814 1815 /* 1816 ------------------------------------------------------------------------------- 1817 Returns the result of adding the double-precision floating-point values `a' 1818 and `b'. The operation is performed according to the IEC/IEEE Standard for 1819 Binary Floating-Point Arithmetic. 1820 ------------------------------------------------------------------------------- 1821 */ 1822 float64 float64_add( float64 a, float64 b ) 1823 { 1824 flag aSign, bSign; 1825 1826 aSign = extractFloat64Sign( a ); 1827 bSign = extractFloat64Sign( b ); 1828 if ( aSign == bSign ) { 1829 return addFloat64Sigs( a, b, aSign ); 1830 } 1831 else { 1832 return subFloat64Sigs( a, b, aSign ); 1833 } 1834 1835 } 1836 1837 /* 1838 ------------------------------------------------------------------------------- 1839 Returns the result of subtracting the double-precision floating-point values 1840 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1841 for Binary Floating-Point Arithmetic. 1842 ------------------------------------------------------------------------------- 1843 */ 1844 float64 float64_sub( float64 a, float64 b ) 1845 { 1846 flag aSign, bSign; 1847 1848 aSign = extractFloat64Sign( a ); 1849 bSign = extractFloat64Sign( b ); 1850 if ( aSign == bSign ) { 1851 return subFloat64Sigs( a, b, aSign ); 1852 } 1853 else { 1854 return addFloat64Sigs( a, b, aSign ); 1855 } 1856 1857 } 1858 1859 /* 1860 ------------------------------------------------------------------------------- 1861 Returns the result of multiplying the double-precision floating-point values 1862 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1863 for Binary Floating-Point Arithmetic. 1864 ------------------------------------------------------------------------------- 1865 */ 1866 float64 float64_mul( float64 a, float64 b ) 1867 { 1868 flag aSign, bSign, zSign; 1869 int16 aExp, bExp, zExp; 1870 bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 1871 1872 aSig1 = extractFloat64Frac1( a ); 1873 aSig0 = extractFloat64Frac0( a ); 1874 aExp = extractFloat64Exp( a ); 1875 aSign = extractFloat64Sign( a ); 1876 bSig1 = extractFloat64Frac1( b ); 1877 bSig0 = extractFloat64Frac0( b ); 1878 bExp = extractFloat64Exp( b ); 1879 bSign = extractFloat64Sign( b ); 1880 zSign = aSign ^ bSign; 1881 if ( aExp == 0x7FF ) { 1882 if ( ( aSig0 | aSig1 ) 1883 || ( ( bExp == 0x7FF ) && ( bSig0 | bSig1 ) ) ) { 1884 return propagateFloat64NaN( a, b ); 1885 } 1886 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 1887 return packFloat64( zSign, 0x7FF, 0, 0 ); 1888 } 1889 if ( bExp == 0x7FF ) { 1890 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b ); 1891 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 1892 invalid: 1893 float_raise( float_flag_invalid ); 1894 return float64_default_nan; 1895 } 1896 return packFloat64( zSign, 0x7FF, 0, 0 ); 1897 } 1898 if ( aExp == 0 ) { 1899 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat64( zSign, 0, 0, 0 ); 1900 normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 1901 } 1902 if ( bExp == 0 ) { 1903 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat64( zSign, 0, 0, 0 ); 1904 normalizeFloat64Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 1905 } 1906 zExp = aExp + bExp - 0x400; 1907 aSig0 |= 0x00100000; 1908 shortShift64Left( bSig0, bSig1, 12, &bSig0, &bSig1 ); 1909 mul64To128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 1910 add64( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 1911 zSig2 |= ( zSig3 != 0 ); 1912 if ( 0x00200000 <= zSig0 ) { 1913 shift64ExtraRightJamming( 1914 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 1915 ++zExp; 1916 } 1917 return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 ); 1918 1919 } 1920 1921 /* 1922 ------------------------------------------------------------------------------- 1923 Returns the result of dividing the double-precision floating-point value `a' 1924 by the corresponding value `b'. The operation is performed according to the 1925 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1926 ------------------------------------------------------------------------------- 1927 */ 1928 float64 float64_div( float64 a, float64 b ) 1929 { 1930 flag aSign, bSign, zSign; 1931 int16 aExp, bExp, zExp; 1932 bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 1933 bits32 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 1934 1935 aSig1 = extractFloat64Frac1( a ); 1936 aSig0 = extractFloat64Frac0( a ); 1937 aExp = extractFloat64Exp( a ); 1938 aSign = extractFloat64Sign( a ); 1939 bSig1 = extractFloat64Frac1( b ); 1940 bSig0 = extractFloat64Frac0( b ); 1941 bExp = extractFloat64Exp( b ); 1942 bSign = extractFloat64Sign( b ); 1943 zSign = aSign ^ bSign; 1944 if ( aExp == 0x7FF ) { 1945 if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, b ); 1946 if ( bExp == 0x7FF ) { 1947 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b ); 1948 goto invalid; 1949 } 1950 return packFloat64( zSign, 0x7FF, 0, 0 ); 1951 } 1952 if ( bExp == 0x7FF ) { 1953 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b ); 1954 return packFloat64( zSign, 0, 0, 0 ); 1955 } 1956 if ( bExp == 0 ) { 1957 if ( ( bSig0 | bSig1 ) == 0 ) { 1958 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 1959 invalid: 1960 float_raise( float_flag_invalid ); 1961 return float64_default_nan; 1962 } 1963 float_raise( float_flag_divbyzero ); 1964 return packFloat64( zSign, 0x7FF, 0, 0 ); 1965 } 1966 normalizeFloat64Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 1967 } 1968 if ( aExp == 0 ) { 1969 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat64( zSign, 0, 0, 0 ); 1970 normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 1971 } 1972 zExp = aExp - bExp + 0x3FD; 1973 shortShift64Left( aSig0 | 0x00100000, aSig1, 11, &aSig0, &aSig1 ); 1974 shortShift64Left( bSig0 | 0x00100000, bSig1, 11, &bSig0, &bSig1 ); 1975 if ( le64( bSig0, bSig1, aSig0, aSig1 ) ) { 1976 shift64Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 1977 ++zExp; 1978 } 1979 zSig0 = estimateDiv64To32( aSig0, aSig1, bSig0 ); 1980 mul64By32To96( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 1981 sub96( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 1982 while ( (sbits32) rem0 < 0 ) { 1983 --zSig0; 1984 add96( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 1985 } 1986 zSig1 = estimateDiv64To32( rem1, rem2, bSig0 ); 1987 if ( ( zSig1 & 0x3FF ) <= 4 ) { 1988 mul64By32To96( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 1989 sub96( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 1990 while ( (sbits32) rem1 < 0 ) { 1991 --zSig1; 1992 add96( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 1993 } 1994 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 1995 } 1996 shift64ExtraRightJamming( zSig0, zSig1, 0, 11, &zSig0, &zSig1, &zSig2 ); 1997 return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 ); 1998 1999 } 2000 2001 #ifndef SOFTFLOAT_FOR_GCC 2002 /* 2003 ------------------------------------------------------------------------------- 2004 Returns the remainder of the double-precision floating-point value `a' 2005 with respect to the corresponding value `b'. The operation is performed 2006 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2007 ------------------------------------------------------------------------------- 2008 */ 2009 float64 float64_rem( float64 a, float64 b ) 2010 { 2011 flag aSign, bSign, zSign; 2012 int16 aExp, bExp, expDiff; 2013 bits32 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 2014 bits32 allZero, alternateASig0, alternateASig1, sigMean1; 2015 sbits32 sigMean0; 2016 float64 z; 2017 2018 aSig1 = extractFloat64Frac1( a ); 2019 aSig0 = extractFloat64Frac0( a ); 2020 aExp = extractFloat64Exp( a ); 2021 aSign = extractFloat64Sign( a ); 2022 bSig1 = extractFloat64Frac1( b ); 2023 bSig0 = extractFloat64Frac0( b ); 2024 bExp = extractFloat64Exp( b ); 2025 bSign = extractFloat64Sign( b ); 2026 if ( aExp == 0x7FF ) { 2027 if ( ( aSig0 | aSig1 ) 2028 || ( ( bExp == 0x7FF ) && ( bSig0 | bSig1 ) ) ) { 2029 return propagateFloat64NaN( a, b ); 2030 } 2031 goto invalid; 2032 } 2033 if ( bExp == 0x7FF ) { 2034 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b ); 2035 return a; 2036 } 2037 if ( bExp == 0 ) { 2038 if ( ( bSig0 | bSig1 ) == 0 ) { 2039 invalid: 2040 float_raise( float_flag_invalid ); 2041 return float64_default_nan; 2042 } 2043 normalizeFloat64Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 2044 } 2045 if ( aExp == 0 ) { 2046 if ( ( aSig0 | aSig1 ) == 0 ) return a; 2047 normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 2048 } 2049 expDiff = aExp - bExp; 2050 if ( expDiff < -1 ) return a; 2051 shortShift64Left( 2052 aSig0 | 0x00100000, aSig1, 11 - ( expDiff < 0 ), &aSig0, &aSig1 ); 2053 shortShift64Left( bSig0 | 0x00100000, bSig1, 11, &bSig0, &bSig1 ); 2054 q = le64( bSig0, bSig1, aSig0, aSig1 ); 2055 if ( q ) sub64( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 2056 expDiff -= 32; 2057 while ( 0 < expDiff ) { 2058 q = estimateDiv64To32( aSig0, aSig1, bSig0 ); 2059 q = ( 4 < q ) ? q - 4 : 0; 2060 mul64By32To96( bSig0, bSig1, q, &term0, &term1, &term2 ); 2061 shortShift96Left( term0, term1, term2, 29, &term1, &term2, &allZero ); 2062 shortShift64Left( aSig0, aSig1, 29, &aSig0, &allZero ); 2063 sub64( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 2064 expDiff -= 29; 2065 } 2066 if ( -32 < expDiff ) { 2067 q = estimateDiv64To32( aSig0, aSig1, bSig0 ); 2068 q = ( 4 < q ) ? q - 4 : 0; 2069 q >>= - expDiff; 2070 shift64Right( bSig0, bSig1, 8, &bSig0, &bSig1 ); 2071 expDiff += 24; 2072 if ( expDiff < 0 ) { 2073 shift64Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 2074 } 2075 else { 2076 shortShift64Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 2077 } 2078 mul64By32To96( bSig0, bSig1, q, &term0, &term1, &term2 ); 2079 sub64( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 2080 } 2081 else { 2082 shift64Right( aSig0, aSig1, 8, &aSig0, &aSig1 ); 2083 shift64Right( bSig0, bSig1, 8, &bSig0, &bSig1 ); 2084 } 2085 do { 2086 alternateASig0 = aSig0; 2087 alternateASig1 = aSig1; 2088 ++q; 2089 sub64( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 2090 } while ( 0 <= (sbits32) aSig0 ); 2091 add64( 2092 aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 ); 2093 if ( ( sigMean0 < 0 ) 2094 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 2095 aSig0 = alternateASig0; 2096 aSig1 = alternateASig1; 2097 } 2098 zSign = ( (sbits32) aSig0 < 0 ); 2099 if ( zSign ) sub64( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 2100 return 2101 normalizeRoundAndPackFloat64( aSign ^ zSign, bExp - 4, aSig0, aSig1 ); 2102 2103 } 2104 #endif 2105 2106 #ifndef SOFTFLOAT_FOR_GCC 2107 /* 2108 ------------------------------------------------------------------------------- 2109 Returns the square root of the double-precision floating-point value `a'. 2110 The operation is performed according to the IEC/IEEE Standard for Binary 2111 Floating-Point Arithmetic. 2112 ------------------------------------------------------------------------------- 2113 */ 2114 float64 float64_sqrt( float64 a ) 2115 { 2116 flag aSign; 2117 int16 aExp, zExp; 2118 bits32 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 2119 bits32 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 2120 float64 z; 2121 2122 aSig1 = extractFloat64Frac1( a ); 2123 aSig0 = extractFloat64Frac0( a ); 2124 aExp = extractFloat64Exp( a ); 2125 aSign = extractFloat64Sign( a ); 2126 if ( aExp == 0x7FF ) { 2127 if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, a ); 2128 if ( ! aSign ) return a; 2129 goto invalid; 2130 } 2131 if ( aSign ) { 2132 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 2133 invalid: 2134 float_raise( float_flag_invalid ); 2135 return float64_default_nan; 2136 } 2137 if ( aExp == 0 ) { 2138 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat64( 0, 0, 0, 0 ); 2139 normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 2140 } 2141 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; 2142 aSig0 |= 0x00100000; 2143 shortShift64Left( aSig0, aSig1, 11, &term0, &term1 ); 2144 zSig0 = ( estimateSqrt32( aExp, term0 )>>1 ) + 1; 2145 if ( zSig0 == 0 ) zSig0 = 0x7FFFFFFF; 2146 doubleZSig0 = zSig0 + zSig0; 2147 shortShift64Left( aSig0, aSig1, 9 - ( aExp & 1 ), &aSig0, &aSig1 ); 2148 mul32To64( zSig0, zSig0, &term0, &term1 ); 2149 sub64( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 2150 while ( (sbits32) rem0 < 0 ) { 2151 --zSig0; 2152 doubleZSig0 -= 2; 2153 add64( rem0, rem1, 0, doubleZSig0 | 1, &rem0, &rem1 ); 2154 } 2155 zSig1 = estimateDiv64To32( rem1, 0, doubleZSig0 ); 2156 if ( ( zSig1 & 0x1FF ) <= 5 ) { 2157 if ( zSig1 == 0 ) zSig1 = 1; 2158 mul32To64( doubleZSig0, zSig1, &term1, &term2 ); 2159 sub64( rem1, 0, term1, term2, &rem1, &rem2 ); 2160 mul32To64( zSig1, zSig1, &term2, &term3 ); 2161 sub96( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 2162 while ( (sbits32) rem1 < 0 ) { 2163 --zSig1; 2164 shortShift64Left( 0, zSig1, 1, &term2, &term3 ); 2165 term3 |= 1; 2166 term2 |= doubleZSig0; 2167 add96( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 2168 } 2169 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 2170 } 2171 shift64ExtraRightJamming( zSig0, zSig1, 0, 10, &zSig0, &zSig1, &zSig2 ); 2172 return roundAndPackFloat64( 0, zExp, zSig0, zSig1, zSig2 ); 2173 2174 } 2175 #endif 2176 2177 /* 2178 ------------------------------------------------------------------------------- 2179 Returns 1 if the double-precision floating-point value `a' is equal to 2180 the corresponding value `b', and 0 otherwise. The comparison is performed 2181 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2182 ------------------------------------------------------------------------------- 2183 */ 2184 flag float64_eq( float64 a, float64 b ) 2185 { 2186 2187 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) 2188 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) ) 2189 || ( ( extractFloat64Exp( b ) == 0x7FF ) 2190 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) ) 2191 ) { 2192 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 2193 float_raise( float_flag_invalid ); 2194 } 2195 return 0; 2196 } 2197 return ( a == b ) || 2198 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 ); 2199 2200 } 2201 2202 /* 2203 ------------------------------------------------------------------------------- 2204 Returns 1 if the double-precision floating-point value `a' is less than 2205 or equal to the corresponding value `b', and 0 otherwise. The comparison 2206 is performed according to the IEC/IEEE Standard for Binary Floating-Point 2207 Arithmetic. 2208 ------------------------------------------------------------------------------- 2209 */ 2210 flag float64_le( float64 a, float64 b ) 2211 { 2212 flag aSign, bSign; 2213 2214 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) 2215 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) ) 2216 || ( ( extractFloat64Exp( b ) == 0x7FF ) 2217 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) ) 2218 ) { 2219 float_raise( float_flag_invalid ); 2220 return 0; 2221 } 2222 aSign = extractFloat64Sign( a ); 2223 bSign = extractFloat64Sign( b ); 2224 if ( aSign != bSign ) 2225 return aSign || 2226 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 2227 0 ); 2228 return ( a == b ) || 2229 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) ); 2230 } 2231 2232 /* 2233 ------------------------------------------------------------------------------- 2234 Returns 1 if the double-precision floating-point value `a' is less than 2235 the corresponding value `b', and 0 otherwise. The comparison is performed 2236 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2237 ------------------------------------------------------------------------------- 2238 */ 2239 flag float64_lt( float64 a, float64 b ) 2240 { 2241 flag aSign, bSign; 2242 2243 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) 2244 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) ) 2245 || ( ( extractFloat64Exp( b ) == 0x7FF ) 2246 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) ) 2247 ) { 2248 float_raise( float_flag_invalid ); 2249 return 0; 2250 } 2251 aSign = extractFloat64Sign( a ); 2252 bSign = extractFloat64Sign( b ); 2253 if ( aSign != bSign ) 2254 return aSign && 2255 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) != 2256 0 ); 2257 return ( a != b ) && 2258 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) ); 2259 2260 } 2261 2262 #ifndef SOFTFLOAT_FOR_GCC 2263 /* 2264 ------------------------------------------------------------------------------- 2265 Returns 1 if the double-precision floating-point value `a' is equal to 2266 the corresponding value `b', and 0 otherwise. The invalid exception is 2267 raised if either operand is a NaN. Otherwise, the comparison is performed 2268 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2269 ------------------------------------------------------------------------------- 2270 */ 2271 flag float64_eq_signaling( float64 a, float64 b ) 2272 { 2273 2274 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) 2275 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) ) 2276 || ( ( extractFloat64Exp( b ) == 0x7FF ) 2277 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) ) 2278 ) { 2279 float_raise( float_flag_invalid ); 2280 return 0; 2281 } 2282 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 ); 2283 2284 } 2285 2286 /* 2287 ------------------------------------------------------------------------------- 2288 Returns 1 if the double-precision floating-point value `a' is less than or 2289 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 2290 cause an exception. Otherwise, the comparison is performed according to the 2291 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2292 ------------------------------------------------------------------------------- 2293 */ 2294 flag float64_le_quiet( float64 a, float64 b ) 2295 { 2296 flag aSign, bSign; 2297 2298 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) 2299 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) ) 2300 || ( ( extractFloat64Exp( b ) == 0x7FF ) 2301 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) ) 2302 ) { 2303 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 2304 float_raise( float_flag_invalid ); 2305 } 2306 return 0; 2307 } 2308 aSign = extractFloat64Sign( a ); 2309 bSign = extractFloat64Sign( b ); 2310 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 ); 2311 return ( a == b ) || ( aSign ^ ( a < b ) ); 2312 2313 } 2314 2315 /* 2316 ------------------------------------------------------------------------------- 2317 Returns 1 if the double-precision floating-point value `a' is less than 2318 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 2319 exception. Otherwise, the comparison is performed according to the IEC/IEEE 2320 Standard for Binary Floating-Point Arithmetic. 2321 ------------------------------------------------------------------------------- 2322 */ 2323 flag float64_lt_quiet( float64 a, float64 b ) 2324 { 2325 flag aSign, bSign; 2326 2327 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) 2328 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) ) 2329 || ( ( extractFloat64Exp( b ) == 0x7FF ) 2330 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) ) 2331 ) { 2332 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 2333 float_raise( float_flag_invalid ); 2334 } 2335 return 0; 2336 } 2337 aSign = extractFloat64Sign( a ); 2338 bSign = extractFloat64Sign( b ); 2339 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 ); 2340 return ( a != b ) && ( aSign ^ ( a < b ) ); 2341 2342 } 2343 2344 #endif 2345