1 /* $NetBSD: softfloat.c,v 1.1 2002/05/21 23:51:07 bjh21 Exp $ */ 2 3 /* 4 * This version hacked for use with gcc -msoft-float by bjh21. 5 * (Mostly a case of #ifdefing out things GCC doesn't need or provides 6 * itself). 7 */ 8 9 /* 10 * Things you may want to define: 11 * 12 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with 13 * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them 14 * properly renamed. 15 */ 16 17 /* 18 * This differs from the standard bits32/softfloat.c in that float64 19 * is defined to be a 64-bit integer rather than a structure. The 20 * structure is float64s, with translation between the two going via 21 * float64u. 22 */ 23 24 /* 25 =============================================================================== 26 27 This C source file is part of the SoftFloat IEC/IEEE Floating-Point 28 Arithmetic Package, Release 2a. 29 30 Written by John R. Hauser. This work was made possible in part by the 31 International Computer Science Institute, located at Suite 600, 1947 Center 32 Street, Berkeley, California 94704. Funding was partially provided by the 33 National Science Foundation under grant MIP-9311980. The original version 34 of this code was written as part of a project to build a fixed-point vector 35 processor in collaboration with the University of California at Berkeley, 36 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 37 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 38 arithmetic/SoftFloat.html'. 39 40 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 41 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 42 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 43 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 44 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 45 46 Derivative works are acceptable, even for commercial purposes, so long as 47 (1) they include prominent notice that the work is derivative, and (2) they 48 include prominent notice akin to these four paragraphs for those parts of 49 this code that are retained. 50 51 =============================================================================== 52 */ 53 54 #include <sys/cdefs.h> 55 #ifdef SOFTFLOAT_FOR_GCC 56 #include "softfloat-for-gcc.h" 57 #endif 58 59 #include "milieu.h" 60 #include "softfloat.h" 61 62 /* 63 * Conversions between floats as stored in memory and floats as 64 * SoftFloat uses them 65 */ 66 #ifndef FLOAT64_DEMANGLE 67 #define FLOAT64_DEMANGLE(a) (a) 68 #endif 69 #ifndef FLOAT64_MANGLE 70 #define FLOAT64_MANGLE(a) (a) 71 #endif 72 73 /* 74 ------------------------------------------------------------------------------- 75 Floating-point rounding mode and exception flags. 76 ------------------------------------------------------------------------------- 77 */ 78 int float_rounding_mode = float_round_nearest_even; 79 int float_exception_flags = 0; 80 81 /* 82 ------------------------------------------------------------------------------- 83 Primitive arithmetic functions, including multi-word arithmetic, and 84 division and square root approximations. (Can be specialized to target if 85 desired.) 86 ------------------------------------------------------------------------------- 87 */ 88 #include "softfloat-macros" 89 90 /* 91 ------------------------------------------------------------------------------- 92 Functions and definitions to determine: (1) whether tininess for underflow 93 is detected before or after rounding by default, (2) what (if anything) 94 happens when exceptions are raised, (3) how signaling NaNs are distinguished 95 from quiet NaNs, (4) the default generated quiet NaNs, and (4) how NaNs 96 are propagated from function inputs to output. These details are target- 97 specific. 98 ------------------------------------------------------------------------------- 99 */ 100 #include "softfloat-specialize" 101 102 /* 103 ------------------------------------------------------------------------------- 104 Returns the fraction bits of the single-precision floating-point value `a'. 105 ------------------------------------------------------------------------------- 106 */ 107 INLINE bits32 extractFloat32Frac( float32 a ) 108 { 109 110 return a & 0x007FFFFF; 111 112 } 113 114 /* 115 ------------------------------------------------------------------------------- 116 Returns the exponent bits of the single-precision floating-point value `a'. 117 ------------------------------------------------------------------------------- 118 */ 119 INLINE int16 extractFloat32Exp( float32 a ) 120 { 121 122 return ( a>>23 ) & 0xFF; 123 124 } 125 126 /* 127 ------------------------------------------------------------------------------- 128 Returns the sign bit of the single-precision floating-point value `a'. 129 ------------------------------------------------------------------------------- 130 */ 131 INLINE flag extractFloat32Sign( float32 a ) 132 { 133 134 return a>>31; 135 136 } 137 138 /* 139 ------------------------------------------------------------------------------- 140 Normalizes the subnormal single-precision floating-point value represented 141 by the denormalized significand `aSig'. The normalized exponent and 142 significand are stored at the locations pointed to by `zExpPtr' and 143 `zSigPtr', respectively. 144 ------------------------------------------------------------------------------- 145 */ 146 static void 147 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr ) 148 { 149 int8 shiftCount; 150 151 shiftCount = countLeadingZeros32( aSig ) - 8; 152 *zSigPtr = aSig<<shiftCount; 153 *zExpPtr = 1 - shiftCount; 154 155 } 156 157 /* 158 ------------------------------------------------------------------------------- 159 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 160 single-precision floating-point value, returning the result. After being 161 shifted into the proper positions, the three fields are simply added 162 together to form the result. This means that any integer portion of `zSig' 163 will be added into the exponent. Since a properly normalized significand 164 will have an integer portion equal to 1, the `zExp' input should be 1 less 165 than the desired result exponent whenever `zSig' is a complete, normalized 166 significand. 167 ------------------------------------------------------------------------------- 168 */ 169 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig ) 170 { 171 172 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig; 173 174 } 175 176 /* 177 ------------------------------------------------------------------------------- 178 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 179 and significand `zSig', and returns the proper single-precision floating- 180 point value corresponding to the abstract input. Ordinarily, the abstract 181 value is simply rounded and packed into the single-precision format, with 182 the inexact exception raised if the abstract input cannot be represented 183 exactly. However, if the abstract value is too large, the overflow and 184 inexact exceptions are raised and an infinity or maximal finite value is 185 returned. If the abstract value is too small, the input value is rounded to 186 a subnormal number, and the underflow and inexact exceptions are raised if 187 the abstract input cannot be represented exactly as a subnormal single- 188 precision floating-point number. 189 The input significand `zSig' has its binary point between bits 30 190 and 29, which is 7 bits to the left of the usual location. This shifted 191 significand must be normalized or smaller. If `zSig' is not normalized, 192 `zExp' must be 0; in that case, the result returned is a subnormal number, 193 and it must not require rounding. In the usual case that `zSig' is 194 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 195 The handling of underflow and overflow follows the IEC/IEEE Standard for 196 Binary Floating-Point Arithmetic. 197 ------------------------------------------------------------------------------- 198 */ 199 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) 200 { 201 int8 roundingMode; 202 flag roundNearestEven; 203 int8 roundIncrement, roundBits; 204 flag isTiny; 205 206 roundingMode = float_rounding_mode; 207 roundNearestEven = roundingMode == float_round_nearest_even; 208 roundIncrement = 0x40; 209 if ( ! roundNearestEven ) { 210 if ( roundingMode == float_round_to_zero ) { 211 roundIncrement = 0; 212 } 213 else { 214 roundIncrement = 0x7F; 215 if ( zSign ) { 216 if ( roundingMode == float_round_up ) roundIncrement = 0; 217 } 218 else { 219 if ( roundingMode == float_round_down ) roundIncrement = 0; 220 } 221 } 222 } 223 roundBits = zSig & 0x7F; 224 if ( 0xFD <= (bits16) zExp ) { 225 if ( ( 0xFD < zExp ) 226 || ( ( zExp == 0xFD ) 227 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) ) 228 ) { 229 float_raise( float_flag_overflow | float_flag_inexact ); 230 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 ); 231 } 232 if ( zExp < 0 ) { 233 isTiny = 234 ( float_detect_tininess == float_tininess_before_rounding ) 235 || ( zExp < -1 ) 236 || ( zSig + roundIncrement < 0x80000000 ); 237 shift32RightJamming( zSig, - zExp, &zSig ); 238 zExp = 0; 239 roundBits = zSig & 0x7F; 240 if ( isTiny && roundBits ) float_raise( float_flag_underflow ); 241 } 242 } 243 if ( roundBits ) float_exception_flags |= float_flag_inexact; 244 zSig = ( zSig + roundIncrement )>>7; 245 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 246 if ( zSig == 0 ) zExp = 0; 247 return packFloat32( zSign, zExp, zSig ); 248 249 } 250 251 /* 252 ------------------------------------------------------------------------------- 253 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 254 and significand `zSig', and returns the proper single-precision floating- 255 point value corresponding to the abstract input. This routine is just like 256 `roundAndPackFloat32' except that `zSig' does not have to be normalized. 257 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 258 floating-point exponent. 259 ------------------------------------------------------------------------------- 260 */ 261 static float32 262 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) 263 { 264 int8 shiftCount; 265 266 shiftCount = countLeadingZeros32( zSig ) - 1; 267 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount ); 268 269 } 270 271 /* 272 ------------------------------------------------------------------------------- 273 Returns the least-significant 32 fraction bits of the double-precision 274 floating-point value `a'. 275 ------------------------------------------------------------------------------- 276 */ 277 INLINE bits32 extractFloat64Frac1( float64 a ) 278 { 279 280 return FLOAT64_DEMANGLE(a) & LIT64( 0x00000000FFFFFFFF ); 281 282 } 283 284 /* 285 ------------------------------------------------------------------------------- 286 Returns the most-significant 20 fraction bits of the double-precision 287 floating-point value `a'. 288 ------------------------------------------------------------------------------- 289 */ 290 INLINE bits32 extractFloat64Frac0( float64 a ) 291 { 292 293 return ( FLOAT64_DEMANGLE(a)>>32 ) & 0x000FFFFF; 294 295 } 296 297 /* 298 ------------------------------------------------------------------------------- 299 Returns the exponent bits of the double-precision floating-point value `a'. 300 ------------------------------------------------------------------------------- 301 */ 302 INLINE int16 extractFloat64Exp( float64 a ) 303 { 304 305 return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF; 306 307 } 308 309 /* 310 ------------------------------------------------------------------------------- 311 Returns the sign bit of the double-precision floating-point value `a'. 312 ------------------------------------------------------------------------------- 313 */ 314 INLINE flag extractFloat64Sign( float64 a ) 315 { 316 317 return FLOAT64_DEMANGLE(a)>>63; 318 319 } 320 321 /* 322 ------------------------------------------------------------------------------- 323 Normalizes the subnormal double-precision floating-point value represented 324 by the denormalized significand formed by the concatenation of `aSig0' and 325 `aSig1'. The normalized exponent is stored at the location pointed to by 326 `zExpPtr'. The most significant 21 bits of the normalized significand are 327 stored at the location pointed to by `zSig0Ptr', and the least significant 328 32 bits of the normalized significand are stored at the location pointed to 329 by `zSig1Ptr'. 330 ------------------------------------------------------------------------------- 331 */ 332 static void 333 normalizeFloat64Subnormal( 334 bits32 aSig0, 335 bits32 aSig1, 336 int16 *zExpPtr, 337 bits32 *zSig0Ptr, 338 bits32 *zSig1Ptr 339 ) 340 { 341 int8 shiftCount; 342 343 if ( aSig0 == 0 ) { 344 shiftCount = countLeadingZeros32( aSig1 ) - 11; 345 if ( shiftCount < 0 ) { 346 *zSig0Ptr = aSig1>>( - shiftCount ); 347 *zSig1Ptr = aSig1<<( shiftCount & 31 ); 348 } 349 else { 350 *zSig0Ptr = aSig1<<shiftCount; 351 *zSig1Ptr = 0; 352 } 353 *zExpPtr = - shiftCount - 31; 354 } 355 else { 356 shiftCount = countLeadingZeros32( aSig0 ) - 11; 357 shortShift64Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 358 *zExpPtr = 1 - shiftCount; 359 } 360 361 } 362 363 /* 364 ------------------------------------------------------------------------------- 365 Packs the sign `zSign', the exponent `zExp', and the significand formed by 366 the concatenation of `zSig0' and `zSig1' into a double-precision floating- 367 point value, returning the result. After being shifted into the proper 368 positions, the three fields `zSign', `zExp', and `zSig0' are simply added 369 together to form the most significant 32 bits of the result. This means 370 that any integer portion of `zSig0' will be added into the exponent. Since 371 a properly normalized significand will have an integer portion equal to 1, 372 the `zExp' input should be 1 less than the desired result exponent whenever 373 `zSig0' and `zSig1' concatenated form a complete, normalized significand. 374 ------------------------------------------------------------------------------- 375 */ 376 INLINE float64 377 packFloat64( flag zSign, int16 zExp, bits32 zSig0, bits32 zSig1 ) 378 { 379 380 return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) + 381 ( ( (bits64) zExp )<<52 ) + 382 ( ( (bits64) zSig0 )<<32 ) + zSig1 ); 383 384 385 } 386 387 /* 388 ------------------------------------------------------------------------------- 389 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 390 and extended significand formed by the concatenation of `zSig0', `zSig1', 391 and `zSig2', and returns the proper double-precision floating-point value 392 corresponding to the abstract input. Ordinarily, the abstract value is 393 simply rounded and packed into the double-precision format, with the inexact 394 exception raised if the abstract input cannot be represented exactly. 395 However, if the abstract value is too large, the overflow and inexact 396 exceptions are raised and an infinity or maximal finite value is returned. 397 If the abstract value is too small, the input value is rounded to a 398 subnormal number, and the underflow and inexact exceptions are raised if the 399 abstract input cannot be represented exactly as a subnormal double-precision 400 floating-point number. 401 The input significand must be normalized or smaller. If the input 402 significand is not normalized, `zExp' must be 0; in that case, the result 403 returned is a subnormal number, and it must not require rounding. In the 404 usual case that the input significand is normalized, `zExp' must be 1 less 405 than the ``true'' floating-point exponent. The handling of underflow and 406 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 407 ------------------------------------------------------------------------------- 408 */ 409 static float64 410 roundAndPackFloat64( 411 flag zSign, int16 zExp, bits32 zSig0, bits32 zSig1, bits32 zSig2 ) 412 { 413 int8 roundingMode; 414 flag roundNearestEven, increment, isTiny; 415 416 roundingMode = float_rounding_mode; 417 roundNearestEven = ( roundingMode == float_round_nearest_even ); 418 increment = ( (sbits32) zSig2 < 0 ); 419 if ( ! roundNearestEven ) { 420 if ( roundingMode == float_round_to_zero ) { 421 increment = 0; 422 } 423 else { 424 if ( zSign ) { 425 increment = ( roundingMode == float_round_down ) && zSig2; 426 } 427 else { 428 increment = ( roundingMode == float_round_up ) && zSig2; 429 } 430 } 431 } 432 if ( 0x7FD <= (bits16) zExp ) { 433 if ( ( 0x7FD < zExp ) 434 || ( ( zExp == 0x7FD ) 435 && eq64( 0x001FFFFF, 0xFFFFFFFF, zSig0, zSig1 ) 436 && increment 437 ) 438 ) { 439 float_raise( float_flag_overflow | float_flag_inexact ); 440 if ( ( roundingMode == float_round_to_zero ) 441 || ( zSign && ( roundingMode == float_round_up ) ) 442 || ( ! zSign && ( roundingMode == float_round_down ) ) 443 ) { 444 return packFloat64( zSign, 0x7FE, 0x000FFFFF, 0xFFFFFFFF ); 445 } 446 return packFloat64( zSign, 0x7FF, 0, 0 ); 447 } 448 if ( zExp < 0 ) { 449 isTiny = 450 ( float_detect_tininess == float_tininess_before_rounding ) 451 || ( zExp < -1 ) 452 || ! increment 453 || lt64( zSig0, zSig1, 0x001FFFFF, 0xFFFFFFFF ); 454 shift64ExtraRightJamming( 455 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 456 zExp = 0; 457 if ( isTiny && zSig2 ) float_raise( float_flag_underflow ); 458 if ( roundNearestEven ) { 459 increment = ( (sbits32) zSig2 < 0 ); 460 } 461 else { 462 if ( zSign ) { 463 increment = ( roundingMode == float_round_down ) && zSig2; 464 } 465 else { 466 increment = ( roundingMode == float_round_up ) && zSig2; 467 } 468 } 469 } 470 } 471 if ( zSig2 ) float_exception_flags |= float_flag_inexact; 472 if ( increment ) { 473 add64( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 474 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 475 } 476 else { 477 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 478 } 479 return packFloat64( zSign, zExp, zSig0, zSig1 ); 480 481 } 482 483 /* 484 ------------------------------------------------------------------------------- 485 Takes an abstract floating-point value having sign `zSign', exponent `zExp', 486 and significand formed by the concatenation of `zSig0' and `zSig1', and 487 returns the proper double-precision floating-point value corresponding 488 to the abstract input. This routine is just like `roundAndPackFloat64' 489 except that the input significand has fewer bits and does not have to be 490 normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 491 point exponent. 492 ------------------------------------------------------------------------------- 493 */ 494 static float64 495 normalizeRoundAndPackFloat64( 496 flag zSign, int16 zExp, bits32 zSig0, bits32 zSig1 ) 497 { 498 int8 shiftCount; 499 bits32 zSig2; 500 501 if ( zSig0 == 0 ) { 502 zSig0 = zSig1; 503 zSig1 = 0; 504 zExp -= 32; 505 } 506 shiftCount = countLeadingZeros32( zSig0 ) - 11; 507 if ( 0 <= shiftCount ) { 508 zSig2 = 0; 509 shortShift64Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 510 } 511 else { 512 shift64ExtraRightJamming( 513 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 514 } 515 zExp -= shiftCount; 516 return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 ); 517 518 } 519 520 /* 521 ------------------------------------------------------------------------------- 522 Returns the result of converting the 32-bit two's complement integer `a' to 523 the single-precision floating-point format. The conversion is performed 524 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 525 ------------------------------------------------------------------------------- 526 */ 527 float32 int32_to_float32( int32 a ) 528 { 529 flag zSign; 530 531 if ( a == 0 ) return 0; 532 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 ); 533 zSign = ( a < 0 ); 534 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a ); 535 536 } 537 538 /* 539 ------------------------------------------------------------------------------- 540 Returns the result of converting the 32-bit two's complement integer `a' to 541 the double-precision floating-point format. The conversion is performed 542 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 543 ------------------------------------------------------------------------------- 544 */ 545 float64 int32_to_float64( int32 a ) 546 { 547 flag zSign; 548 bits32 absA; 549 int8 shiftCount; 550 bits32 zSig0, zSig1; 551 552 if ( a == 0 ) return packFloat64( 0, 0, 0, 0 ); 553 zSign = ( a < 0 ); 554 absA = zSign ? - a : a; 555 shiftCount = countLeadingZeros32( absA ) - 11; 556 if ( 0 <= shiftCount ) { 557 zSig0 = absA<<shiftCount; 558 zSig1 = 0; 559 } 560 else { 561 shift64Right( absA, 0, - shiftCount, &zSig0, &zSig1 ); 562 } 563 return packFloat64( zSign, 0x412 - shiftCount, zSig0, zSig1 ); 564 565 } 566 567 #ifndef SOFTFLOAT_FOR_GCC 568 /* 569 ------------------------------------------------------------------------------- 570 Returns the result of converting the single-precision floating-point value 571 `a' to the 32-bit two's complement integer format. The conversion is 572 performed according to the IEC/IEEE Standard for Binary Floating-Point 573 Arithmetic---which means in particular that the conversion is rounded 574 according to the current rounding mode. If `a' is a NaN, the largest 575 positive integer is returned. Otherwise, if the conversion overflows, the 576 largest integer with the same sign as `a' is returned. 577 ------------------------------------------------------------------------------- 578 */ 579 int32 float32_to_int32( float32 a ) 580 { 581 flag aSign; 582 int16 aExp, shiftCount; 583 bits32 aSig, aSigExtra; 584 int32 z; 585 int8 roundingMode; 586 587 aSig = extractFloat32Frac( a ); 588 aExp = extractFloat32Exp( a ); 589 aSign = extractFloat32Sign( a ); 590 shiftCount = aExp - 0x96; 591 if ( 0 <= shiftCount ) { 592 if ( 0x9E <= aExp ) { 593 if ( a != 0xCF000000 ) { 594 float_raise( float_flag_invalid ); 595 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 596 return 0x7FFFFFFF; 597 } 598 } 599 return (sbits32) 0x80000000; 600 } 601 z = ( aSig | 0x00800000 )<<shiftCount; 602 if ( aSign ) z = - z; 603 } 604 else { 605 if ( aExp < 0x7E ) { 606 aSigExtra = aExp | aSig; 607 z = 0; 608 } 609 else { 610 aSig |= 0x00800000; 611 aSigExtra = aSig<<( shiftCount & 31 ); 612 z = aSig>>( - shiftCount ); 613 } 614 if ( aSigExtra ) float_exception_flags |= float_flag_inexact; 615 roundingMode = float_rounding_mode; 616 if ( roundingMode == float_round_nearest_even ) { 617 if ( (sbits32) aSigExtra < 0 ) { 618 ++z; 619 if ( (bits32) ( aSigExtra<<1 ) == 0 ) z &= ~1; 620 } 621 if ( aSign ) z = - z; 622 } 623 else { 624 aSigExtra = ( aSigExtra != 0 ); 625 if ( aSign ) { 626 z += ( roundingMode == float_round_down ) & aSigExtra; 627 z = - z; 628 } 629 else { 630 z += ( roundingMode == float_round_up ) & aSigExtra; 631 } 632 } 633 } 634 return z; 635 636 } 637 #endif 638 639 /* 640 ------------------------------------------------------------------------------- 641 Returns the result of converting the single-precision floating-point value 642 `a' to the 32-bit two's complement integer format. The conversion is 643 performed according to the IEC/IEEE Standard for Binary Floating-Point 644 Arithmetic, except that the conversion is always rounded toward zero. 645 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 646 the conversion overflows, the largest integer with the same sign as `a' is 647 returned. 648 ------------------------------------------------------------------------------- 649 */ 650 int32 float32_to_int32_round_to_zero( float32 a ) 651 { 652 flag aSign; 653 int16 aExp, shiftCount; 654 bits32 aSig; 655 int32 z; 656 657 aSig = extractFloat32Frac( a ); 658 aExp = extractFloat32Exp( a ); 659 aSign = extractFloat32Sign( a ); 660 shiftCount = aExp - 0x9E; 661 if ( 0 <= shiftCount ) { 662 if ( a != 0xCF000000 ) { 663 float_raise( float_flag_invalid ); 664 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF; 665 } 666 return (sbits32) 0x80000000; 667 } 668 else if ( aExp <= 0x7E ) { 669 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact; 670 return 0; 671 } 672 aSig = ( aSig | 0x00800000 )<<8; 673 z = aSig>>( - shiftCount ); 674 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) { 675 float_exception_flags |= float_flag_inexact; 676 } 677 if ( aSign ) z = - z; 678 return z; 679 680 } 681 682 /* 683 ------------------------------------------------------------------------------- 684 Returns the result of converting the single-precision floating-point value 685 `a' to the double-precision floating-point format. The conversion is 686 performed according to the IEC/IEEE Standard for Binary Floating-Point 687 Arithmetic. 688 ------------------------------------------------------------------------------- 689 */ 690 float64 float32_to_float64( float32 a ) 691 { 692 flag aSign; 693 int16 aExp; 694 bits32 aSig, zSig0, zSig1; 695 696 aSig = extractFloat32Frac( a ); 697 aExp = extractFloat32Exp( a ); 698 aSign = extractFloat32Sign( a ); 699 if ( aExp == 0xFF ) { 700 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) ); 701 return packFloat64( aSign, 0x7FF, 0, 0 ); 702 } 703 if ( aExp == 0 ) { 704 if ( aSig == 0 ) return packFloat64( aSign, 0, 0, 0 ); 705 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 706 --aExp; 707 } 708 shift64Right( aSig, 0, 3, &zSig0, &zSig1 ); 709 return packFloat64( aSign, aExp + 0x380, zSig0, zSig1 ); 710 711 } 712 713 #ifndef SOFTFLOAT_FOR_GCC 714 /* 715 ------------------------------------------------------------------------------- 716 Rounds the single-precision floating-point value `a' to an integer, 717 and returns the result as a single-precision floating-point value. The 718 operation is performed according to the IEC/IEEE Standard for Binary 719 Floating-Point Arithmetic. 720 ------------------------------------------------------------------------------- 721 */ 722 float32 float32_round_to_int( float32 a ) 723 { 724 flag aSign; 725 int16 aExp; 726 bits32 lastBitMask, roundBitsMask; 727 int8 roundingMode; 728 float32 z; 729 730 aExp = extractFloat32Exp( a ); 731 if ( 0x96 <= aExp ) { 732 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) { 733 return propagateFloat32NaN( a, a ); 734 } 735 return a; 736 } 737 if ( aExp <= 0x7E ) { 738 if ( (bits32) ( a<<1 ) == 0 ) return a; 739 float_exception_flags |= float_flag_inexact; 740 aSign = extractFloat32Sign( a ); 741 switch ( float_rounding_mode ) { 742 case float_round_nearest_even: 743 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) { 744 return packFloat32( aSign, 0x7F, 0 ); 745 } 746 break; 747 case float_round_to_zero: 748 break; 749 case float_round_down: 750 return aSign ? 0xBF800000 : 0; 751 case float_round_up: 752 return aSign ? 0x80000000 : 0x3F800000; 753 } 754 return packFloat32( aSign, 0, 0 ); 755 } 756 lastBitMask = 1; 757 lastBitMask <<= 0x96 - aExp; 758 roundBitsMask = lastBitMask - 1; 759 z = a; 760 roundingMode = float_rounding_mode; 761 if ( roundingMode == float_round_nearest_even ) { 762 z += lastBitMask>>1; 763 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 764 } 765 else if ( roundingMode != float_round_to_zero ) { 766 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) { 767 z += roundBitsMask; 768 } 769 } 770 z &= ~ roundBitsMask; 771 if ( z != a ) float_exception_flags |= float_flag_inexact; 772 return z; 773 774 } 775 #endif 776 777 /* 778 ------------------------------------------------------------------------------- 779 Returns the result of adding the absolute values of the single-precision 780 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 781 before being returned. `zSign' is ignored if the result is a NaN. 782 The addition is performed according to the IEC/IEEE Standard for Binary 783 Floating-Point Arithmetic. 784 ------------------------------------------------------------------------------- 785 */ 786 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign ) 787 { 788 int16 aExp, bExp, zExp; 789 bits32 aSig, bSig, zSig; 790 int16 expDiff; 791 792 aSig = extractFloat32Frac( a ); 793 aExp = extractFloat32Exp( a ); 794 bSig = extractFloat32Frac( b ); 795 bExp = extractFloat32Exp( b ); 796 expDiff = aExp - bExp; 797 aSig <<= 6; 798 bSig <<= 6; 799 if ( 0 < expDiff ) { 800 if ( aExp == 0xFF ) { 801 if ( aSig ) return propagateFloat32NaN( a, b ); 802 return a; 803 } 804 if ( bExp == 0 ) { 805 --expDiff; 806 } 807 else { 808 bSig |= 0x20000000; 809 } 810 shift32RightJamming( bSig, expDiff, &bSig ); 811 zExp = aExp; 812 } 813 else if ( expDiff < 0 ) { 814 if ( bExp == 0xFF ) { 815 if ( bSig ) return propagateFloat32NaN( a, b ); 816 return packFloat32( zSign, 0xFF, 0 ); 817 } 818 if ( aExp == 0 ) { 819 ++expDiff; 820 } 821 else { 822 aSig |= 0x20000000; 823 } 824 shift32RightJamming( aSig, - expDiff, &aSig ); 825 zExp = bExp; 826 } 827 else { 828 if ( aExp == 0xFF ) { 829 if ( aSig | bSig ) return propagateFloat32NaN( a, b ); 830 return a; 831 } 832 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 ); 833 zSig = 0x40000000 + aSig + bSig; 834 zExp = aExp; 835 goto roundAndPack; 836 } 837 aSig |= 0x20000000; 838 zSig = ( aSig + bSig )<<1; 839 --zExp; 840 if ( (sbits32) zSig < 0 ) { 841 zSig = aSig + bSig; 842 ++zExp; 843 } 844 roundAndPack: 845 return roundAndPackFloat32( zSign, zExp, zSig ); 846 847 } 848 849 /* 850 ------------------------------------------------------------------------------- 851 Returns the result of subtracting the absolute values of the single- 852 precision floating-point values `a' and `b'. If `zSign' is 1, the 853 difference is negated before being returned. `zSign' is ignored if the 854 result is a NaN. The subtraction is performed according to the IEC/IEEE 855 Standard for Binary Floating-Point Arithmetic. 856 ------------------------------------------------------------------------------- 857 */ 858 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign ) 859 { 860 int16 aExp, bExp, zExp; 861 bits32 aSig, bSig, zSig; 862 int16 expDiff; 863 864 aSig = extractFloat32Frac( a ); 865 aExp = extractFloat32Exp( a ); 866 bSig = extractFloat32Frac( b ); 867 bExp = extractFloat32Exp( b ); 868 expDiff = aExp - bExp; 869 aSig <<= 7; 870 bSig <<= 7; 871 if ( 0 < expDiff ) goto aExpBigger; 872 if ( expDiff < 0 ) goto bExpBigger; 873 if ( aExp == 0xFF ) { 874 if ( aSig | bSig ) return propagateFloat32NaN( a, b ); 875 float_raise( float_flag_invalid ); 876 return float32_default_nan; 877 } 878 if ( aExp == 0 ) { 879 aExp = 1; 880 bExp = 1; 881 } 882 if ( bSig < aSig ) goto aBigger; 883 if ( aSig < bSig ) goto bBigger; 884 return packFloat32( float_rounding_mode == float_round_down, 0, 0 ); 885 bExpBigger: 886 if ( bExp == 0xFF ) { 887 if ( bSig ) return propagateFloat32NaN( a, b ); 888 return packFloat32( zSign ^ 1, 0xFF, 0 ); 889 } 890 if ( aExp == 0 ) { 891 ++expDiff; 892 } 893 else { 894 aSig |= 0x40000000; 895 } 896 shift32RightJamming( aSig, - expDiff, &aSig ); 897 bSig |= 0x40000000; 898 bBigger: 899 zSig = bSig - aSig; 900 zExp = bExp; 901 zSign ^= 1; 902 goto normalizeRoundAndPack; 903 aExpBigger: 904 if ( aExp == 0xFF ) { 905 if ( aSig ) return propagateFloat32NaN( a, b ); 906 return a; 907 } 908 if ( bExp == 0 ) { 909 --expDiff; 910 } 911 else { 912 bSig |= 0x40000000; 913 } 914 shift32RightJamming( bSig, expDiff, &bSig ); 915 aSig |= 0x40000000; 916 aBigger: 917 zSig = aSig - bSig; 918 zExp = aExp; 919 normalizeRoundAndPack: 920 --zExp; 921 return normalizeRoundAndPackFloat32( zSign, zExp, zSig ); 922 923 } 924 925 /* 926 ------------------------------------------------------------------------------- 927 Returns the result of adding the single-precision floating-point values `a' 928 and `b'. The operation is performed according to the IEC/IEEE Standard for 929 Binary Floating-Point Arithmetic. 930 ------------------------------------------------------------------------------- 931 */ 932 float32 float32_add( float32 a, float32 b ) 933 { 934 flag aSign, bSign; 935 936 aSign = extractFloat32Sign( a ); 937 bSign = extractFloat32Sign( b ); 938 if ( aSign == bSign ) { 939 return addFloat32Sigs( a, b, aSign ); 940 } 941 else { 942 return subFloat32Sigs( a, b, aSign ); 943 } 944 945 } 946 947 /* 948 ------------------------------------------------------------------------------- 949 Returns the result of subtracting the single-precision floating-point values 950 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 951 for Binary Floating-Point Arithmetic. 952 ------------------------------------------------------------------------------- 953 */ 954 float32 float32_sub( float32 a, float32 b ) 955 { 956 flag aSign, bSign; 957 958 aSign = extractFloat32Sign( a ); 959 bSign = extractFloat32Sign( b ); 960 if ( aSign == bSign ) { 961 return subFloat32Sigs( a, b, aSign ); 962 } 963 else { 964 return addFloat32Sigs( a, b, aSign ); 965 } 966 967 } 968 969 /* 970 ------------------------------------------------------------------------------- 971 Returns the result of multiplying the single-precision floating-point values 972 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 973 for Binary Floating-Point Arithmetic. 974 ------------------------------------------------------------------------------- 975 */ 976 float32 float32_mul( float32 a, float32 b ) 977 { 978 flag aSign, bSign, zSign; 979 int16 aExp, bExp, zExp; 980 bits32 aSig, bSig, zSig0, zSig1; 981 982 aSig = extractFloat32Frac( a ); 983 aExp = extractFloat32Exp( a ); 984 aSign = extractFloat32Sign( a ); 985 bSig = extractFloat32Frac( b ); 986 bExp = extractFloat32Exp( b ); 987 bSign = extractFloat32Sign( b ); 988 zSign = aSign ^ bSign; 989 if ( aExp == 0xFF ) { 990 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 991 return propagateFloat32NaN( a, b ); 992 } 993 if ( ( bExp | bSig ) == 0 ) { 994 float_raise( float_flag_invalid ); 995 return float32_default_nan; 996 } 997 return packFloat32( zSign, 0xFF, 0 ); 998 } 999 if ( bExp == 0xFF ) { 1000 if ( bSig ) return propagateFloat32NaN( a, b ); 1001 if ( ( aExp | aSig ) == 0 ) { 1002 float_raise( float_flag_invalid ); 1003 return float32_default_nan; 1004 } 1005 return packFloat32( zSign, 0xFF, 0 ); 1006 } 1007 if ( aExp == 0 ) { 1008 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 1009 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1010 } 1011 if ( bExp == 0 ) { 1012 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 ); 1013 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1014 } 1015 zExp = aExp + bExp - 0x7F; 1016 aSig = ( aSig | 0x00800000 )<<7; 1017 bSig = ( bSig | 0x00800000 )<<8; 1018 mul32To64( aSig, bSig, &zSig0, &zSig1 ); 1019 zSig0 |= ( zSig1 != 0 ); 1020 if ( 0 <= (sbits32) ( zSig0<<1 ) ) { 1021 zSig0 <<= 1; 1022 --zExp; 1023 } 1024 return roundAndPackFloat32( zSign, zExp, zSig0 ); 1025 1026 } 1027 1028 /* 1029 ------------------------------------------------------------------------------- 1030 Returns the result of dividing the single-precision floating-point value `a' 1031 by the corresponding value `b'. The operation is performed according to the 1032 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1033 ------------------------------------------------------------------------------- 1034 */ 1035 float32 float32_div( float32 a, float32 b ) 1036 { 1037 flag aSign, bSign, zSign; 1038 int16 aExp, bExp, zExp; 1039 bits32 aSig, bSig, zSig, rem0, rem1, term0, term1; 1040 1041 aSig = extractFloat32Frac( a ); 1042 aExp = extractFloat32Exp( a ); 1043 aSign = extractFloat32Sign( a ); 1044 bSig = extractFloat32Frac( b ); 1045 bExp = extractFloat32Exp( b ); 1046 bSign = extractFloat32Sign( b ); 1047 zSign = aSign ^ bSign; 1048 if ( aExp == 0xFF ) { 1049 if ( aSig ) return propagateFloat32NaN( a, b ); 1050 if ( bExp == 0xFF ) { 1051 if ( bSig ) return propagateFloat32NaN( a, b ); 1052 float_raise( float_flag_invalid ); 1053 return float32_default_nan; 1054 } 1055 return packFloat32( zSign, 0xFF, 0 ); 1056 } 1057 if ( bExp == 0xFF ) { 1058 if ( bSig ) return propagateFloat32NaN( a, b ); 1059 return packFloat32( zSign, 0, 0 ); 1060 } 1061 if ( bExp == 0 ) { 1062 if ( bSig == 0 ) { 1063 if ( ( aExp | aSig ) == 0 ) { 1064 float_raise( float_flag_invalid ); 1065 return float32_default_nan; 1066 } 1067 float_raise( float_flag_divbyzero ); 1068 return packFloat32( zSign, 0xFF, 0 ); 1069 } 1070 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1071 } 1072 if ( aExp == 0 ) { 1073 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 1074 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1075 } 1076 zExp = aExp - bExp + 0x7D; 1077 aSig = ( aSig | 0x00800000 )<<7; 1078 bSig = ( bSig | 0x00800000 )<<8; 1079 if ( bSig <= ( aSig + aSig ) ) { 1080 aSig >>= 1; 1081 ++zExp; 1082 } 1083 zSig = estimateDiv64To32( aSig, 0, bSig ); 1084 if ( ( zSig & 0x3F ) <= 2 ) { 1085 mul32To64( bSig, zSig, &term0, &term1 ); 1086 sub64( aSig, 0, term0, term1, &rem0, &rem1 ); 1087 while ( (sbits32) rem0 < 0 ) { 1088 --zSig; 1089 add64( rem0, rem1, 0, bSig, &rem0, &rem1 ); 1090 } 1091 zSig |= ( rem1 != 0 ); 1092 } 1093 return roundAndPackFloat32( zSign, zExp, zSig ); 1094 1095 } 1096 1097 #ifndef SOFTFLOAT_FOR_GCC 1098 /* 1099 ------------------------------------------------------------------------------- 1100 Returns the remainder of the single-precision floating-point value `a' 1101 with respect to the corresponding value `b'. The operation is performed 1102 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1103 ------------------------------------------------------------------------------- 1104 */ 1105 float32 float32_rem( float32 a, float32 b ) 1106 { 1107 flag aSign, bSign, zSign; 1108 int16 aExp, bExp, expDiff; 1109 bits32 aSig, bSig, q, allZero, alternateASig; 1110 sbits32 sigMean; 1111 1112 aSig = extractFloat32Frac( a ); 1113 aExp = extractFloat32Exp( a ); 1114 aSign = extractFloat32Sign( a ); 1115 bSig = extractFloat32Frac( b ); 1116 bExp = extractFloat32Exp( b ); 1117 bSign = extractFloat32Sign( b ); 1118 if ( aExp == 0xFF ) { 1119 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 1120 return propagateFloat32NaN( a, b ); 1121 } 1122 float_raise( float_flag_invalid ); 1123 return float32_default_nan; 1124 } 1125 if ( bExp == 0xFF ) { 1126 if ( bSig ) return propagateFloat32NaN( a, b ); 1127 return a; 1128 } 1129 if ( bExp == 0 ) { 1130 if ( bSig == 0 ) { 1131 float_raise( float_flag_invalid ); 1132 return float32_default_nan; 1133 } 1134 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1135 } 1136 if ( aExp == 0 ) { 1137 if ( aSig == 0 ) return a; 1138 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1139 } 1140 expDiff = aExp - bExp; 1141 aSig = ( aSig | 0x00800000 )<<8; 1142 bSig = ( bSig | 0x00800000 )<<8; 1143 if ( expDiff < 0 ) { 1144 if ( expDiff < -1 ) return a; 1145 aSig >>= 1; 1146 } 1147 q = ( bSig <= aSig ); 1148 if ( q ) aSig -= bSig; 1149 expDiff -= 32; 1150 while ( 0 < expDiff ) { 1151 q = estimateDiv64To32( aSig, 0, bSig ); 1152 q = ( 2 < q ) ? q - 2 : 0; 1153 aSig = - ( ( bSig>>2 ) * q ); 1154 expDiff -= 30; 1155 } 1156 expDiff += 32; 1157 if ( 0 < expDiff ) { 1158 q = estimateDiv64To32( aSig, 0, bSig ); 1159 q = ( 2 < q ) ? q - 2 : 0; 1160 q >>= 32 - expDiff; 1161 bSig >>= 2; 1162 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 1163 } 1164 else { 1165 aSig >>= 2; 1166 bSig >>= 2; 1167 } 1168 do { 1169 alternateASig = aSig; 1170 ++q; 1171 aSig -= bSig; 1172 } while ( 0 <= (sbits32) aSig ); 1173 sigMean = aSig + alternateASig; 1174 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 1175 aSig = alternateASig; 1176 } 1177 zSign = ( (sbits32) aSig < 0 ); 1178 if ( zSign ) aSig = - aSig; 1179 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig ); 1180 1181 } 1182 #endif 1183 1184 #ifndef SOFTFLOAT_FOR_GCC 1185 /* 1186 ------------------------------------------------------------------------------- 1187 Returns the square root of the single-precision floating-point value `a'. 1188 The operation is performed according to the IEC/IEEE Standard for Binary 1189 Floating-Point Arithmetic. 1190 ------------------------------------------------------------------------------- 1191 */ 1192 float32 float32_sqrt( float32 a ) 1193 { 1194 flag aSign; 1195 int16 aExp, zExp; 1196 bits32 aSig, zSig, rem0, rem1, term0, term1; 1197 1198 aSig = extractFloat32Frac( a ); 1199 aExp = extractFloat32Exp( a ); 1200 aSign = extractFloat32Sign( a ); 1201 if ( aExp == 0xFF ) { 1202 if ( aSig ) return propagateFloat32NaN( a, 0 ); 1203 if ( ! aSign ) return a; 1204 float_raise( float_flag_invalid ); 1205 return float32_default_nan; 1206 } 1207 if ( aSign ) { 1208 if ( ( aExp | aSig ) == 0 ) return a; 1209 float_raise( float_flag_invalid ); 1210 return float32_default_nan; 1211 } 1212 if ( aExp == 0 ) { 1213 if ( aSig == 0 ) return 0; 1214 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1215 } 1216 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E; 1217 aSig = ( aSig | 0x00800000 )<<8; 1218 zSig = estimateSqrt32( aExp, aSig ) + 2; 1219 if ( ( zSig & 0x7F ) <= 5 ) { 1220 if ( zSig < 2 ) { 1221 zSig = 0x7FFFFFFF; 1222 goto roundAndPack; 1223 } 1224 else { 1225 aSig >>= aExp & 1; 1226 mul32To64( zSig, zSig, &term0, &term1 ); 1227 sub64( aSig, 0, term0, term1, &rem0, &rem1 ); 1228 while ( (sbits32) rem0 < 0 ) { 1229 --zSig; 1230 shortShift64Left( 0, zSig, 1, &term0, &term1 ); 1231 term1 |= 1; 1232 add64( rem0, rem1, term0, term1, &rem0, &rem1 ); 1233 } 1234 zSig |= ( ( rem0 | rem1 ) != 0 ); 1235 } 1236 } 1237 shift32RightJamming( zSig, 1, &zSig ); 1238 roundAndPack: 1239 return roundAndPackFloat32( 0, zExp, zSig ); 1240 1241 } 1242 #endif 1243 1244 /* 1245 ------------------------------------------------------------------------------- 1246 Returns 1 if the single-precision floating-point value `a' is equal to 1247 the corresponding value `b', and 0 otherwise. The comparison is performed 1248 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1249 ------------------------------------------------------------------------------- 1250 */ 1251 flag float32_eq( float32 a, float32 b ) 1252 { 1253 1254 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 1255 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 1256 ) { 1257 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 1258 float_raise( float_flag_invalid ); 1259 } 1260 return 0; 1261 } 1262 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); 1263 1264 } 1265 1266 /* 1267 ------------------------------------------------------------------------------- 1268 Returns 1 if the single-precision floating-point value `a' is less than 1269 or equal to the corresponding value `b', and 0 otherwise. The comparison 1270 is performed according to the IEC/IEEE Standard for Binary Floating-Point 1271 Arithmetic. 1272 ------------------------------------------------------------------------------- 1273 */ 1274 flag float32_le( float32 a, float32 b ) 1275 { 1276 flag aSign, bSign; 1277 1278 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 1279 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 1280 ) { 1281 float_raise( float_flag_invalid ); 1282 return 0; 1283 } 1284 aSign = extractFloat32Sign( a ); 1285 bSign = extractFloat32Sign( b ); 1286 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); 1287 return ( a == b ) || ( aSign ^ ( a < b ) ); 1288 1289 } 1290 1291 /* 1292 ------------------------------------------------------------------------------- 1293 Returns 1 if the single-precision floating-point value `a' is less than 1294 the corresponding value `b', and 0 otherwise. The comparison is performed 1295 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1296 ------------------------------------------------------------------------------- 1297 */ 1298 flag float32_lt( float32 a, float32 b ) 1299 { 1300 flag aSign, bSign; 1301 1302 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 1303 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 1304 ) { 1305 float_raise( float_flag_invalid ); 1306 return 0; 1307 } 1308 aSign = extractFloat32Sign( a ); 1309 bSign = extractFloat32Sign( b ); 1310 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); 1311 return ( a != b ) && ( aSign ^ ( a < b ) ); 1312 1313 } 1314 1315 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 1316 /* 1317 ------------------------------------------------------------------------------- 1318 Returns 1 if the single-precision floating-point value `a' is equal to 1319 the corresponding value `b', and 0 otherwise. The invalid exception is 1320 raised if either operand is a NaN. Otherwise, the comparison is performed 1321 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1322 ------------------------------------------------------------------------------- 1323 */ 1324 flag float32_eq_signaling( float32 a, float32 b ) 1325 { 1326 1327 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 1328 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 1329 ) { 1330 float_raise( float_flag_invalid ); 1331 return 0; 1332 } 1333 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); 1334 1335 } 1336 1337 /* 1338 ------------------------------------------------------------------------------- 1339 Returns 1 if the single-precision floating-point value `a' is less than or 1340 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 1341 cause an exception. Otherwise, the comparison is performed according to the 1342 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1343 ------------------------------------------------------------------------------- 1344 */ 1345 flag float32_le_quiet( float32 a, float32 b ) 1346 { 1347 flag aSign, bSign; 1348 int16 aExp, bExp; 1349 1350 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 1351 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 1352 ) { 1353 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 1354 float_raise( float_flag_invalid ); 1355 } 1356 return 0; 1357 } 1358 aSign = extractFloat32Sign( a ); 1359 bSign = extractFloat32Sign( b ); 1360 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); 1361 return ( a == b ) || ( aSign ^ ( a < b ) ); 1362 1363 } 1364 1365 /* 1366 ------------------------------------------------------------------------------- 1367 Returns 1 if the single-precision floating-point value `a' is less than 1368 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 1369 exception. Otherwise, the comparison is performed according to the IEC/IEEE 1370 Standard for Binary Floating-Point Arithmetic. 1371 ------------------------------------------------------------------------------- 1372 */ 1373 flag float32_lt_quiet( float32 a, float32 b ) 1374 { 1375 flag aSign, bSign; 1376 1377 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 1378 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 1379 ) { 1380 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 1381 float_raise( float_flag_invalid ); 1382 } 1383 return 0; 1384 } 1385 aSign = extractFloat32Sign( a ); 1386 bSign = extractFloat32Sign( b ); 1387 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); 1388 return ( a != b ) && ( aSign ^ ( a < b ) ); 1389 1390 } 1391 #endif /* !SOFTFLOAT_FOR_GCC */ 1392 1393 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ 1394 /* 1395 ------------------------------------------------------------------------------- 1396 Returns the result of converting the double-precision floating-point value 1397 `a' to the 32-bit two's complement integer format. The conversion is 1398 performed according to the IEC/IEEE Standard for Binary Floating-Point 1399 Arithmetic---which means in particular that the conversion is rounded 1400 according to the current rounding mode. If `a' is a NaN, the largest 1401 positive integer is returned. Otherwise, if the conversion overflows, the 1402 largest integer with the same sign as `a' is returned. 1403 ------------------------------------------------------------------------------- 1404 */ 1405 int32 float64_to_int32( float64 a ) 1406 { 1407 flag aSign; 1408 int16 aExp, shiftCount; 1409 bits32 aSig0, aSig1, absZ, aSigExtra; 1410 int32 z; 1411 int8 roundingMode; 1412 1413 aSig1 = extractFloat64Frac1( a ); 1414 aSig0 = extractFloat64Frac0( a ); 1415 aExp = extractFloat64Exp( a ); 1416 aSign = extractFloat64Sign( a ); 1417 shiftCount = aExp - 0x413; 1418 if ( 0 <= shiftCount ) { 1419 if ( 0x41E < aExp ) { 1420 if ( ( aExp == 0x7FF ) && ( aSig0 | aSig1 ) ) aSign = 0; 1421 goto invalid; 1422 } 1423 shortShift64Left( 1424 aSig0 | 0x00100000, aSig1, shiftCount, &absZ, &aSigExtra ); 1425 if ( 0x80000000 < absZ ) goto invalid; 1426 } 1427 else { 1428 aSig1 = ( aSig1 != 0 ); 1429 if ( aExp < 0x3FE ) { 1430 aSigExtra = aExp | aSig0 | aSig1; 1431 absZ = 0; 1432 } 1433 else { 1434 aSig0 |= 0x00100000; 1435 aSigExtra = ( aSig0<<( shiftCount & 31 ) ) | aSig1; 1436 absZ = aSig0>>( - shiftCount ); 1437 } 1438 } 1439 roundingMode = float_rounding_mode; 1440 if ( roundingMode == float_round_nearest_even ) { 1441 if ( (sbits32) aSigExtra < 0 ) { 1442 ++absZ; 1443 if ( (bits32) ( aSigExtra<<1 ) == 0 ) absZ &= ~1; 1444 } 1445 z = aSign ? - absZ : absZ; 1446 } 1447 else { 1448 aSigExtra = ( aSigExtra != 0 ); 1449 if ( aSign ) { 1450 z = - ( absZ 1451 + ( ( roundingMode == float_round_down ) & aSigExtra ) ); 1452 } 1453 else { 1454 z = absZ + ( ( roundingMode == float_round_up ) & aSigExtra ); 1455 } 1456 } 1457 if ( ( aSign ^ ( z < 0 ) ) && z ) { 1458 invalid: 1459 float_raise( float_flag_invalid ); 1460 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 1461 } 1462 if ( aSigExtra ) float_exception_flags |= float_flag_inexact; 1463 return z; 1464 1465 } 1466 #endif /* !SOFTFLOAT_FOR_GCC */ 1467 1468 /* 1469 ------------------------------------------------------------------------------- 1470 Returns the result of converting the double-precision floating-point value 1471 `a' to the 32-bit two's complement integer format. The conversion is 1472 performed according to the IEC/IEEE Standard for Binary Floating-Point 1473 Arithmetic, except that the conversion is always rounded toward zero. 1474 If `a' is a NaN, the largest positive integer is returned. Otherwise, if 1475 the conversion overflows, the largest integer with the same sign as `a' is 1476 returned. 1477 ------------------------------------------------------------------------------- 1478 */ 1479 int32 float64_to_int32_round_to_zero( float64 a ) 1480 { 1481 flag aSign; 1482 int16 aExp, shiftCount; 1483 bits32 aSig0, aSig1, absZ, aSigExtra; 1484 int32 z; 1485 1486 aSig1 = extractFloat64Frac1( a ); 1487 aSig0 = extractFloat64Frac0( a ); 1488 aExp = extractFloat64Exp( a ); 1489 aSign = extractFloat64Sign( a ); 1490 shiftCount = aExp - 0x413; 1491 if ( 0 <= shiftCount ) { 1492 if ( 0x41E < aExp ) { 1493 if ( ( aExp == 0x7FF ) && ( aSig0 | aSig1 ) ) aSign = 0; 1494 goto invalid; 1495 } 1496 shortShift64Left( 1497 aSig0 | 0x00100000, aSig1, shiftCount, &absZ, &aSigExtra ); 1498 } 1499 else { 1500 if ( aExp < 0x3FF ) { 1501 if ( aExp | aSig0 | aSig1 ) { 1502 float_exception_flags |= float_flag_inexact; 1503 } 1504 return 0; 1505 } 1506 aSig0 |= 0x00100000; 1507 aSigExtra = ( aSig0<<( shiftCount & 31 ) ) | aSig1; 1508 absZ = aSig0>>( - shiftCount ); 1509 } 1510 z = aSign ? - absZ : absZ; 1511 if ( ( aSign ^ ( z < 0 ) ) && z ) { 1512 invalid: 1513 float_raise( float_flag_invalid ); 1514 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 1515 } 1516 if ( aSigExtra ) float_exception_flags |= float_flag_inexact; 1517 return z; 1518 1519 } 1520 1521 /* 1522 ------------------------------------------------------------------------------- 1523 Returns the result of converting the double-precision floating-point value 1524 `a' to the single-precision floating-point format. The conversion is 1525 performed according to the IEC/IEEE Standard for Binary Floating-Point 1526 Arithmetic. 1527 ------------------------------------------------------------------------------- 1528 */ 1529 float32 float64_to_float32( float64 a ) 1530 { 1531 flag aSign; 1532 int16 aExp; 1533 bits32 aSig0, aSig1, zSig; 1534 bits32 allZero; 1535 1536 aSig1 = extractFloat64Frac1( a ); 1537 aSig0 = extractFloat64Frac0( a ); 1538 aExp = extractFloat64Exp( a ); 1539 aSign = extractFloat64Sign( a ); 1540 if ( aExp == 0x7FF ) { 1541 if ( aSig0 | aSig1 ) { 1542 return commonNaNToFloat32( float64ToCommonNaN( a ) ); 1543 } 1544 return packFloat32( aSign, 0xFF, 0 ); 1545 } 1546 shift64RightJamming( aSig0, aSig1, 22, &allZero, &zSig ); 1547 if ( aExp ) zSig |= 0x40000000; 1548 return roundAndPackFloat32( aSign, aExp - 0x381, zSig ); 1549 1550 } 1551 1552 #ifndef SOFTFLOAT_FOR_GCC 1553 /* 1554 ------------------------------------------------------------------------------- 1555 Rounds the double-precision floating-point value `a' to an integer, 1556 and returns the result as a double-precision floating-point value. The 1557 operation is performed according to the IEC/IEEE Standard for Binary 1558 Floating-Point Arithmetic. 1559 ------------------------------------------------------------------------------- 1560 */ 1561 float64 float64_round_to_int( float64 a ) 1562 { 1563 flag aSign; 1564 int16 aExp; 1565 bits32 lastBitMask, roundBitsMask; 1566 int8 roundingMode; 1567 float64 z; 1568 1569 aExp = extractFloat64Exp( a ); 1570 if ( 0x413 <= aExp ) { 1571 if ( 0x433 <= aExp ) { 1572 if ( ( aExp == 0x7FF ) 1573 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) ) { 1574 return propagateFloat64NaN( a, a ); 1575 } 1576 return a; 1577 } 1578 lastBitMask = 1; 1579 lastBitMask = ( lastBitMask<<( 0x432 - aExp ) )<<1; 1580 roundBitsMask = lastBitMask - 1; 1581 z = a; 1582 roundingMode = float_rounding_mode; 1583 if ( roundingMode == float_round_nearest_even ) { 1584 if ( lastBitMask ) { 1585 add64( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 1586 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 1587 } 1588 else { 1589 if ( (sbits32) z.low < 0 ) { 1590 ++z.high; 1591 if ( (bits32) ( z.low<<1 ) == 0 ) z.high &= ~1; 1592 } 1593 } 1594 } 1595 else if ( roundingMode != float_round_to_zero ) { 1596 if ( extractFloat64Sign( z ) 1597 ^ ( roundingMode == float_round_up ) ) { 1598 add64( z.high, z.low, 0, roundBitsMask, &z.high, &z.low ); 1599 } 1600 } 1601 z.low &= ~ roundBitsMask; 1602 } 1603 else { 1604 if ( aExp <= 0x3FE ) { 1605 if ( ( ( (bits32) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 1606 float_exception_flags |= float_flag_inexact; 1607 aSign = extractFloat64Sign( a ); 1608 switch ( float_rounding_mode ) { 1609 case float_round_nearest_even: 1610 if ( ( aExp == 0x3FE ) 1611 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) 1612 ) { 1613 return packFloat64( aSign, 0x3FF, 0, 0 ); 1614 } 1615 break; 1616 case float_round_down: 1617 return 1618 aSign ? packFloat64( 1, 0x3FF, 0, 0 ) 1619 : packFloat64( 0, 0, 0, 0 ); 1620 case float_round_up: 1621 return 1622 aSign ? packFloat64( 1, 0, 0, 0 ) 1623 : packFloat64( 0, 0x3FF, 0, 0 ); 1624 } 1625 return packFloat64( aSign, 0, 0, 0 ); 1626 } 1627 lastBitMask = 1; 1628 lastBitMask <<= 0x413 - aExp; 1629 roundBitsMask = lastBitMask - 1; 1630 z.low = 0; 1631 z.high = a.high; 1632 roundingMode = float_rounding_mode; 1633 if ( roundingMode == float_round_nearest_even ) { 1634 z.high += lastBitMask>>1; 1635 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 1636 z.high &= ~ lastBitMask; 1637 } 1638 } 1639 else if ( roundingMode != float_round_to_zero ) { 1640 if ( extractFloat64Sign( z ) 1641 ^ ( roundingMode == float_round_up ) ) { 1642 z.high |= ( a.low != 0 ); 1643 z.high += roundBitsMask; 1644 } 1645 } 1646 z.high &= ~ roundBitsMask; 1647 } 1648 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 1649 float_exception_flags |= float_flag_inexact; 1650 } 1651 return z; 1652 1653 } 1654 #endif 1655 1656 /* 1657 ------------------------------------------------------------------------------- 1658 Returns the result of adding the absolute values of the double-precision 1659 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 1660 before being returned. `zSign' is ignored if the result is a NaN. 1661 The addition is performed according to the IEC/IEEE Standard for Binary 1662 Floating-Point Arithmetic. 1663 ------------------------------------------------------------------------------- 1664 */ 1665 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign ) 1666 { 1667 int16 aExp, bExp, zExp; 1668 bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 1669 int16 expDiff; 1670 1671 aSig1 = extractFloat64Frac1( a ); 1672 aSig0 = extractFloat64Frac0( a ); 1673 aExp = extractFloat64Exp( a ); 1674 bSig1 = extractFloat64Frac1( b ); 1675 bSig0 = extractFloat64Frac0( b ); 1676 bExp = extractFloat64Exp( b ); 1677 expDiff = aExp - bExp; 1678 if ( 0 < expDiff ) { 1679 if ( aExp == 0x7FF ) { 1680 if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, b ); 1681 return a; 1682 } 1683 if ( bExp == 0 ) { 1684 --expDiff; 1685 } 1686 else { 1687 bSig0 |= 0x00100000; 1688 } 1689 shift64ExtraRightJamming( 1690 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 1691 zExp = aExp; 1692 } 1693 else if ( expDiff < 0 ) { 1694 if ( bExp == 0x7FF ) { 1695 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b ); 1696 return packFloat64( zSign, 0x7FF, 0, 0 ); 1697 } 1698 if ( aExp == 0 ) { 1699 ++expDiff; 1700 } 1701 else { 1702 aSig0 |= 0x00100000; 1703 } 1704 shift64ExtraRightJamming( 1705 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 1706 zExp = bExp; 1707 } 1708 else { 1709 if ( aExp == 0x7FF ) { 1710 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 1711 return propagateFloat64NaN( a, b ); 1712 } 1713 return a; 1714 } 1715 add64( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 1716 if ( aExp == 0 ) return packFloat64( zSign, 0, zSig0, zSig1 ); 1717 zSig2 = 0; 1718 zSig0 |= 0x00200000; 1719 zExp = aExp; 1720 goto shiftRight1; 1721 } 1722 aSig0 |= 0x00100000; 1723 add64( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 1724 --zExp; 1725 if ( zSig0 < 0x00200000 ) goto roundAndPack; 1726 ++zExp; 1727 shiftRight1: 1728 shift64ExtraRightJamming( zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 1729 roundAndPack: 1730 return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 ); 1731 1732 } 1733 1734 /* 1735 ------------------------------------------------------------------------------- 1736 Returns the result of subtracting the absolute values of the double- 1737 precision floating-point values `a' and `b'. If `zSign' is 1, the 1738 difference is negated before being returned. `zSign' is ignored if the 1739 result is a NaN. The subtraction is performed according to the IEC/IEEE 1740 Standard for Binary Floating-Point Arithmetic. 1741 ------------------------------------------------------------------------------- 1742 */ 1743 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign ) 1744 { 1745 int16 aExp, bExp, zExp; 1746 bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 1747 int16 expDiff; 1748 1749 aSig1 = extractFloat64Frac1( a ); 1750 aSig0 = extractFloat64Frac0( a ); 1751 aExp = extractFloat64Exp( a ); 1752 bSig1 = extractFloat64Frac1( b ); 1753 bSig0 = extractFloat64Frac0( b ); 1754 bExp = extractFloat64Exp( b ); 1755 expDiff = aExp - bExp; 1756 shortShift64Left( aSig0, aSig1, 10, &aSig0, &aSig1 ); 1757 shortShift64Left( bSig0, bSig1, 10, &bSig0, &bSig1 ); 1758 if ( 0 < expDiff ) goto aExpBigger; 1759 if ( expDiff < 0 ) goto bExpBigger; 1760 if ( aExp == 0x7FF ) { 1761 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 1762 return propagateFloat64NaN( a, b ); 1763 } 1764 float_raise( float_flag_invalid ); 1765 return float64_default_nan; 1766 } 1767 if ( aExp == 0 ) { 1768 aExp = 1; 1769 bExp = 1; 1770 } 1771 if ( bSig0 < aSig0 ) goto aBigger; 1772 if ( aSig0 < bSig0 ) goto bBigger; 1773 if ( bSig1 < aSig1 ) goto aBigger; 1774 if ( aSig1 < bSig1 ) goto bBigger; 1775 return packFloat64( float_rounding_mode == float_round_down, 0, 0, 0 ); 1776 bExpBigger: 1777 if ( bExp == 0x7FF ) { 1778 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b ); 1779 return packFloat64( zSign ^ 1, 0x7FF, 0, 0 ); 1780 } 1781 if ( aExp == 0 ) { 1782 ++expDiff; 1783 } 1784 else { 1785 aSig0 |= 0x40000000; 1786 } 1787 shift64RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 1788 bSig0 |= 0x40000000; 1789 bBigger: 1790 sub64( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 1791 zExp = bExp; 1792 zSign ^= 1; 1793 goto normalizeRoundAndPack; 1794 aExpBigger: 1795 if ( aExp == 0x7FF ) { 1796 if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, b ); 1797 return a; 1798 } 1799 if ( bExp == 0 ) { 1800 --expDiff; 1801 } 1802 else { 1803 bSig0 |= 0x40000000; 1804 } 1805 shift64RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 1806 aSig0 |= 0x40000000; 1807 aBigger: 1808 sub64( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 1809 zExp = aExp; 1810 normalizeRoundAndPack: 1811 --zExp; 1812 return normalizeRoundAndPackFloat64( zSign, zExp - 10, zSig0, zSig1 ); 1813 1814 } 1815 1816 /* 1817 ------------------------------------------------------------------------------- 1818 Returns the result of adding the double-precision floating-point values `a' 1819 and `b'. The operation is performed according to the IEC/IEEE Standard for 1820 Binary Floating-Point Arithmetic. 1821 ------------------------------------------------------------------------------- 1822 */ 1823 float64 float64_add( float64 a, float64 b ) 1824 { 1825 flag aSign, bSign; 1826 1827 aSign = extractFloat64Sign( a ); 1828 bSign = extractFloat64Sign( b ); 1829 if ( aSign == bSign ) { 1830 return addFloat64Sigs( a, b, aSign ); 1831 } 1832 else { 1833 return subFloat64Sigs( a, b, aSign ); 1834 } 1835 1836 } 1837 1838 /* 1839 ------------------------------------------------------------------------------- 1840 Returns the result of subtracting the double-precision floating-point values 1841 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1842 for Binary Floating-Point Arithmetic. 1843 ------------------------------------------------------------------------------- 1844 */ 1845 float64 float64_sub( float64 a, float64 b ) 1846 { 1847 flag aSign, bSign; 1848 1849 aSign = extractFloat64Sign( a ); 1850 bSign = extractFloat64Sign( b ); 1851 if ( aSign == bSign ) { 1852 return subFloat64Sigs( a, b, aSign ); 1853 } 1854 else { 1855 return addFloat64Sigs( a, b, aSign ); 1856 } 1857 1858 } 1859 1860 /* 1861 ------------------------------------------------------------------------------- 1862 Returns the result of multiplying the double-precision floating-point values 1863 `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1864 for Binary Floating-Point Arithmetic. 1865 ------------------------------------------------------------------------------- 1866 */ 1867 float64 float64_mul( float64 a, float64 b ) 1868 { 1869 flag aSign, bSign, zSign; 1870 int16 aExp, bExp, zExp; 1871 bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 1872 1873 aSig1 = extractFloat64Frac1( a ); 1874 aSig0 = extractFloat64Frac0( a ); 1875 aExp = extractFloat64Exp( a ); 1876 aSign = extractFloat64Sign( a ); 1877 bSig1 = extractFloat64Frac1( b ); 1878 bSig0 = extractFloat64Frac0( b ); 1879 bExp = extractFloat64Exp( b ); 1880 bSign = extractFloat64Sign( b ); 1881 zSign = aSign ^ bSign; 1882 if ( aExp == 0x7FF ) { 1883 if ( ( aSig0 | aSig1 ) 1884 || ( ( bExp == 0x7FF ) && ( bSig0 | bSig1 ) ) ) { 1885 return propagateFloat64NaN( a, b ); 1886 } 1887 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 1888 return packFloat64( zSign, 0x7FF, 0, 0 ); 1889 } 1890 if ( bExp == 0x7FF ) { 1891 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b ); 1892 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 1893 invalid: 1894 float_raise( float_flag_invalid ); 1895 return float64_default_nan; 1896 } 1897 return packFloat64( zSign, 0x7FF, 0, 0 ); 1898 } 1899 if ( aExp == 0 ) { 1900 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat64( zSign, 0, 0, 0 ); 1901 normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 1902 } 1903 if ( bExp == 0 ) { 1904 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat64( zSign, 0, 0, 0 ); 1905 normalizeFloat64Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 1906 } 1907 zExp = aExp + bExp - 0x400; 1908 aSig0 |= 0x00100000; 1909 shortShift64Left( bSig0, bSig1, 12, &bSig0, &bSig1 ); 1910 mul64To128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 1911 add64( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 1912 zSig2 |= ( zSig3 != 0 ); 1913 if ( 0x00200000 <= zSig0 ) { 1914 shift64ExtraRightJamming( 1915 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 1916 ++zExp; 1917 } 1918 return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 ); 1919 1920 } 1921 1922 /* 1923 ------------------------------------------------------------------------------- 1924 Returns the result of dividing the double-precision floating-point value `a' 1925 by the corresponding value `b'. The operation is performed according to the 1926 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1927 ------------------------------------------------------------------------------- 1928 */ 1929 float64 float64_div( float64 a, float64 b ) 1930 { 1931 flag aSign, bSign, zSign; 1932 int16 aExp, bExp, zExp; 1933 bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 1934 bits32 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 1935 1936 aSig1 = extractFloat64Frac1( a ); 1937 aSig0 = extractFloat64Frac0( a ); 1938 aExp = extractFloat64Exp( a ); 1939 aSign = extractFloat64Sign( a ); 1940 bSig1 = extractFloat64Frac1( b ); 1941 bSig0 = extractFloat64Frac0( b ); 1942 bExp = extractFloat64Exp( b ); 1943 bSign = extractFloat64Sign( b ); 1944 zSign = aSign ^ bSign; 1945 if ( aExp == 0x7FF ) { 1946 if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, b ); 1947 if ( bExp == 0x7FF ) { 1948 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b ); 1949 goto invalid; 1950 } 1951 return packFloat64( zSign, 0x7FF, 0, 0 ); 1952 } 1953 if ( bExp == 0x7FF ) { 1954 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b ); 1955 return packFloat64( zSign, 0, 0, 0 ); 1956 } 1957 if ( bExp == 0 ) { 1958 if ( ( bSig0 | bSig1 ) == 0 ) { 1959 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 1960 invalid: 1961 float_raise( float_flag_invalid ); 1962 return float64_default_nan; 1963 } 1964 float_raise( float_flag_divbyzero ); 1965 return packFloat64( zSign, 0x7FF, 0, 0 ); 1966 } 1967 normalizeFloat64Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 1968 } 1969 if ( aExp == 0 ) { 1970 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat64( zSign, 0, 0, 0 ); 1971 normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 1972 } 1973 zExp = aExp - bExp + 0x3FD; 1974 shortShift64Left( aSig0 | 0x00100000, aSig1, 11, &aSig0, &aSig1 ); 1975 shortShift64Left( bSig0 | 0x00100000, bSig1, 11, &bSig0, &bSig1 ); 1976 if ( le64( bSig0, bSig1, aSig0, aSig1 ) ) { 1977 shift64Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 1978 ++zExp; 1979 } 1980 zSig0 = estimateDiv64To32( aSig0, aSig1, bSig0 ); 1981 mul64By32To96( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 1982 sub96( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 1983 while ( (sbits32) rem0 < 0 ) { 1984 --zSig0; 1985 add96( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 1986 } 1987 zSig1 = estimateDiv64To32( rem1, rem2, bSig0 ); 1988 if ( ( zSig1 & 0x3FF ) <= 4 ) { 1989 mul64By32To96( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 1990 sub96( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 1991 while ( (sbits32) rem1 < 0 ) { 1992 --zSig1; 1993 add96( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 1994 } 1995 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 1996 } 1997 shift64ExtraRightJamming( zSig0, zSig1, 0, 11, &zSig0, &zSig1, &zSig2 ); 1998 return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 ); 1999 2000 } 2001 2002 #ifndef SOFTFLOAT_FOR_GCC 2003 /* 2004 ------------------------------------------------------------------------------- 2005 Returns the remainder of the double-precision floating-point value `a' 2006 with respect to the corresponding value `b'. The operation is performed 2007 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2008 ------------------------------------------------------------------------------- 2009 */ 2010 float64 float64_rem( float64 a, float64 b ) 2011 { 2012 flag aSign, bSign, zSign; 2013 int16 aExp, bExp, expDiff; 2014 bits32 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 2015 bits32 allZero, alternateASig0, alternateASig1, sigMean1; 2016 sbits32 sigMean0; 2017 float64 z; 2018 2019 aSig1 = extractFloat64Frac1( a ); 2020 aSig0 = extractFloat64Frac0( a ); 2021 aExp = extractFloat64Exp( a ); 2022 aSign = extractFloat64Sign( a ); 2023 bSig1 = extractFloat64Frac1( b ); 2024 bSig0 = extractFloat64Frac0( b ); 2025 bExp = extractFloat64Exp( b ); 2026 bSign = extractFloat64Sign( b ); 2027 if ( aExp == 0x7FF ) { 2028 if ( ( aSig0 | aSig1 ) 2029 || ( ( bExp == 0x7FF ) && ( bSig0 | bSig1 ) ) ) { 2030 return propagateFloat64NaN( a, b ); 2031 } 2032 goto invalid; 2033 } 2034 if ( bExp == 0x7FF ) { 2035 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b ); 2036 return a; 2037 } 2038 if ( bExp == 0 ) { 2039 if ( ( bSig0 | bSig1 ) == 0 ) { 2040 invalid: 2041 float_raise( float_flag_invalid ); 2042 return float64_default_nan; 2043 } 2044 normalizeFloat64Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 2045 } 2046 if ( aExp == 0 ) { 2047 if ( ( aSig0 | aSig1 ) == 0 ) return a; 2048 normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 2049 } 2050 expDiff = aExp - bExp; 2051 if ( expDiff < -1 ) return a; 2052 shortShift64Left( 2053 aSig0 | 0x00100000, aSig1, 11 - ( expDiff < 0 ), &aSig0, &aSig1 ); 2054 shortShift64Left( bSig0 | 0x00100000, bSig1, 11, &bSig0, &bSig1 ); 2055 q = le64( bSig0, bSig1, aSig0, aSig1 ); 2056 if ( q ) sub64( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 2057 expDiff -= 32; 2058 while ( 0 < expDiff ) { 2059 q = estimateDiv64To32( aSig0, aSig1, bSig0 ); 2060 q = ( 4 < q ) ? q - 4 : 0; 2061 mul64By32To96( bSig0, bSig1, q, &term0, &term1, &term2 ); 2062 shortShift96Left( term0, term1, term2, 29, &term1, &term2, &allZero ); 2063 shortShift64Left( aSig0, aSig1, 29, &aSig0, &allZero ); 2064 sub64( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 2065 expDiff -= 29; 2066 } 2067 if ( -32 < expDiff ) { 2068 q = estimateDiv64To32( aSig0, aSig1, bSig0 ); 2069 q = ( 4 < q ) ? q - 4 : 0; 2070 q >>= - expDiff; 2071 shift64Right( bSig0, bSig1, 8, &bSig0, &bSig1 ); 2072 expDiff += 24; 2073 if ( expDiff < 0 ) { 2074 shift64Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 2075 } 2076 else { 2077 shortShift64Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 2078 } 2079 mul64By32To96( bSig0, bSig1, q, &term0, &term1, &term2 ); 2080 sub64( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 2081 } 2082 else { 2083 shift64Right( aSig0, aSig1, 8, &aSig0, &aSig1 ); 2084 shift64Right( bSig0, bSig1, 8, &bSig0, &bSig1 ); 2085 } 2086 do { 2087 alternateASig0 = aSig0; 2088 alternateASig1 = aSig1; 2089 ++q; 2090 sub64( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 2091 } while ( 0 <= (sbits32) aSig0 ); 2092 add64( 2093 aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 ); 2094 if ( ( sigMean0 < 0 ) 2095 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 2096 aSig0 = alternateASig0; 2097 aSig1 = alternateASig1; 2098 } 2099 zSign = ( (sbits32) aSig0 < 0 ); 2100 if ( zSign ) sub64( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 2101 return 2102 normalizeRoundAndPackFloat64( aSign ^ zSign, bExp - 4, aSig0, aSig1 ); 2103 2104 } 2105 #endif 2106 2107 #ifndef SOFTFLOAT_FOR_GCC 2108 /* 2109 ------------------------------------------------------------------------------- 2110 Returns the square root of the double-precision floating-point value `a'. 2111 The operation is performed according to the IEC/IEEE Standard for Binary 2112 Floating-Point Arithmetic. 2113 ------------------------------------------------------------------------------- 2114 */ 2115 float64 float64_sqrt( float64 a ) 2116 { 2117 flag aSign; 2118 int16 aExp, zExp; 2119 bits32 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 2120 bits32 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 2121 float64 z; 2122 2123 aSig1 = extractFloat64Frac1( a ); 2124 aSig0 = extractFloat64Frac0( a ); 2125 aExp = extractFloat64Exp( a ); 2126 aSign = extractFloat64Sign( a ); 2127 if ( aExp == 0x7FF ) { 2128 if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, a ); 2129 if ( ! aSign ) return a; 2130 goto invalid; 2131 } 2132 if ( aSign ) { 2133 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 2134 invalid: 2135 float_raise( float_flag_invalid ); 2136 return float64_default_nan; 2137 } 2138 if ( aExp == 0 ) { 2139 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat64( 0, 0, 0, 0 ); 2140 normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 2141 } 2142 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; 2143 aSig0 |= 0x00100000; 2144 shortShift64Left( aSig0, aSig1, 11, &term0, &term1 ); 2145 zSig0 = ( estimateSqrt32( aExp, term0 )>>1 ) + 1; 2146 if ( zSig0 == 0 ) zSig0 = 0x7FFFFFFF; 2147 doubleZSig0 = zSig0 + zSig0; 2148 shortShift64Left( aSig0, aSig1, 9 - ( aExp & 1 ), &aSig0, &aSig1 ); 2149 mul32To64( zSig0, zSig0, &term0, &term1 ); 2150 sub64( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 2151 while ( (sbits32) rem0 < 0 ) { 2152 --zSig0; 2153 doubleZSig0 -= 2; 2154 add64( rem0, rem1, 0, doubleZSig0 | 1, &rem0, &rem1 ); 2155 } 2156 zSig1 = estimateDiv64To32( rem1, 0, doubleZSig0 ); 2157 if ( ( zSig1 & 0x1FF ) <= 5 ) { 2158 if ( zSig1 == 0 ) zSig1 = 1; 2159 mul32To64( doubleZSig0, zSig1, &term1, &term2 ); 2160 sub64( rem1, 0, term1, term2, &rem1, &rem2 ); 2161 mul32To64( zSig1, zSig1, &term2, &term3 ); 2162 sub96( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 2163 while ( (sbits32) rem1 < 0 ) { 2164 --zSig1; 2165 shortShift64Left( 0, zSig1, 1, &term2, &term3 ); 2166 term3 |= 1; 2167 term2 |= doubleZSig0; 2168 add96( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 2169 } 2170 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 2171 } 2172 shift64ExtraRightJamming( zSig0, zSig1, 0, 10, &zSig0, &zSig1, &zSig2 ); 2173 return roundAndPackFloat64( 0, zExp, zSig0, zSig1, zSig2 ); 2174 2175 } 2176 #endif 2177 2178 /* 2179 ------------------------------------------------------------------------------- 2180 Returns 1 if the double-precision floating-point value `a' is equal to 2181 the corresponding value `b', and 0 otherwise. The comparison is performed 2182 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2183 ------------------------------------------------------------------------------- 2184 */ 2185 flag float64_eq( float64 a, float64 b ) 2186 { 2187 2188 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) 2189 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) ) 2190 || ( ( extractFloat64Exp( b ) == 0x7FF ) 2191 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) ) 2192 ) { 2193 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 2194 float_raise( float_flag_invalid ); 2195 } 2196 return 0; 2197 } 2198 return ( a == b ) || 2199 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 ); 2200 2201 } 2202 2203 /* 2204 ------------------------------------------------------------------------------- 2205 Returns 1 if the double-precision floating-point value `a' is less than 2206 or equal to the corresponding value `b', and 0 otherwise. The comparison 2207 is performed according to the IEC/IEEE Standard for Binary Floating-Point 2208 Arithmetic. 2209 ------------------------------------------------------------------------------- 2210 */ 2211 flag float64_le( float64 a, float64 b ) 2212 { 2213 flag aSign, bSign; 2214 2215 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) 2216 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) ) 2217 || ( ( extractFloat64Exp( b ) == 0x7FF ) 2218 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) ) 2219 ) { 2220 float_raise( float_flag_invalid ); 2221 return 0; 2222 } 2223 aSign = extractFloat64Sign( a ); 2224 bSign = extractFloat64Sign( b ); 2225 if ( aSign != bSign ) 2226 return aSign || 2227 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 2228 0 ); 2229 return ( a == b ) || 2230 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) ); 2231 } 2232 2233 /* 2234 ------------------------------------------------------------------------------- 2235 Returns 1 if the double-precision floating-point value `a' is less than 2236 the corresponding value `b', and 0 otherwise. The comparison is performed 2237 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2238 ------------------------------------------------------------------------------- 2239 */ 2240 flag float64_lt( float64 a, float64 b ) 2241 { 2242 flag aSign, bSign; 2243 2244 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) 2245 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) ) 2246 || ( ( extractFloat64Exp( b ) == 0x7FF ) 2247 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) ) 2248 ) { 2249 float_raise( float_flag_invalid ); 2250 return 0; 2251 } 2252 aSign = extractFloat64Sign( a ); 2253 bSign = extractFloat64Sign( b ); 2254 if ( aSign != bSign ) 2255 return aSign && 2256 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) != 2257 0 ); 2258 return ( a != b ) && 2259 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) ); 2260 2261 } 2262 2263 #ifndef SOFTFLOAT_FOR_GCC 2264 /* 2265 ------------------------------------------------------------------------------- 2266 Returns 1 if the double-precision floating-point value `a' is equal to 2267 the corresponding value `b', and 0 otherwise. The invalid exception is 2268 raised if either operand is a NaN. Otherwise, the comparison is performed 2269 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2270 ------------------------------------------------------------------------------- 2271 */ 2272 flag float64_eq_signaling( float64 a, float64 b ) 2273 { 2274 2275 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) 2276 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) ) 2277 || ( ( extractFloat64Exp( b ) == 0x7FF ) 2278 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) ) 2279 ) { 2280 float_raise( float_flag_invalid ); 2281 return 0; 2282 } 2283 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 ); 2284 2285 } 2286 2287 /* 2288 ------------------------------------------------------------------------------- 2289 Returns 1 if the double-precision floating-point value `a' is less than or 2290 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 2291 cause an exception. Otherwise, the comparison is performed according to the 2292 IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2293 ------------------------------------------------------------------------------- 2294 */ 2295 flag float64_le_quiet( float64 a, float64 b ) 2296 { 2297 flag aSign, bSign; 2298 2299 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) 2300 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) ) 2301 || ( ( extractFloat64Exp( b ) == 0x7FF ) 2302 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) ) 2303 ) { 2304 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 2305 float_raise( float_flag_invalid ); 2306 } 2307 return 0; 2308 } 2309 aSign = extractFloat64Sign( a ); 2310 bSign = extractFloat64Sign( b ); 2311 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 ); 2312 return ( a == b ) || ( aSign ^ ( a < b ) ); 2313 2314 } 2315 2316 /* 2317 ------------------------------------------------------------------------------- 2318 Returns 1 if the double-precision floating-point value `a' is less than 2319 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 2320 exception. Otherwise, the comparison is performed according to the IEC/IEEE 2321 Standard for Binary Floating-Point Arithmetic. 2322 ------------------------------------------------------------------------------- 2323 */ 2324 flag float64_lt_quiet( float64 a, float64 b ) 2325 { 2326 flag aSign, bSign; 2327 2328 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) 2329 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) ) 2330 || ( ( extractFloat64Exp( b ) == 0x7FF ) 2331 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) ) 2332 ) { 2333 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 2334 float_raise( float_flag_invalid ); 2335 } 2336 return 0; 2337 } 2338 aSign = extractFloat64Sign( a ); 2339 bSign = extractFloat64Sign( b ); 2340 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 ); 2341 return ( a != b ) && ( aSign ^ ( a < b ) ); 2342 2343 } 2344 2345 #endif 2346