softfloat.c - OpenGrok cross reference for /freebsd/lib/libc/softfloat/bits64/softfloat.c

Lines Matching +full:high +full:- +full:z
4  * This version hacked for use with gcc -msoft-float by bjh21.
12  * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
13  *   -msoft-float) to work.  Include "softfloat-for-gcc.h" to get them
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
48 #include "softfloat-for-gcc.h"
66 -------------------------------------------------------------------------------
67 Floating-point rounding mode, extended double-precision rounding precision,
69 -------------------------------------------------------------------------------
78 -------------------------------------------------------------------------------
79 Primitive arithmetic functions, including multi-word arithmetic, and
82 -------------------------------------------------------------------------------
84 #include "softfloat-macros"
87 -------------------------------------------------------------------------------
92 are propagated from function inputs to output.  These details are target-
94 -------------------------------------------------------------------------------
96 #include "softfloat-specialize"
100 -------------------------------------------------------------------------------
101 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
102 and 7, and returns the properly rounded 32-bit integer corresponding to the
104 integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
106 input cannot be represented exactly as an integer.  However, if the fixed-
109 -------------------------------------------------------------------------------
116     int32 z;  in roundAndPackInt32()  local
138     z = absZ;  in roundAndPackInt32()
139     if ( zSign ) z = - z;  in roundAndPackInt32()
140     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {  in roundAndPackInt32()
145     return z;  in roundAndPackInt32()
150 -------------------------------------------------------------------------------
151 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
153 and returns the properly rounded 64-bit integer corresponding to the input.
155 Ordinarily, the fixed-point input is simply rounded to an integer, with
157 an integer.  However, if the fixed-point input is too large, the invalid
160 -------------------------------------------------------------------------------
166     int64 z;  in roundAndPackInt64()  local
189     z = absZ0;  in roundAndPackInt64()
190     if ( zSign ) z = - z;  in roundAndPackInt64()
191     if ( z && ( ( z < 0 ) ^ zSign ) ) {  in roundAndPackInt64()
199     return z;  in roundAndPackInt64()
205 -------------------------------------------------------------------------------
206 Returns the fraction bits of the single-precision floating-point value `a'.
207 -------------------------------------------------------------------------------
217 -------------------------------------------------------------------------------
218 Returns the exponent bits of the single-precision floating-point value `a'.
219 -------------------------------------------------------------------------------
229 -------------------------------------------------------------------------------
230 Returns the sign bit of the single-precision floating-point value `a'.
231 -------------------------------------------------------------------------------
241 -------------------------------------------------------------------------------
242 Normalizes the subnormal single-precision floating-point value represented
246 -------------------------------------------------------------------------------
253     shiftCount = countLeadingZeros32( aSig ) - 8;  in normalizeFloat32Subnormal()
255     *zExpPtr = 1 - shiftCount;  in normalizeFloat32Subnormal()
260 -------------------------------------------------------------------------------
262 single-precision floating-point value, returning the result.  After being
269 -------------------------------------------------------------------------------
279 -------------------------------------------------------------------------------
280 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
281 and significand `zSig', and returns the proper single-precision floating-
283 value is simply rounded and packed into the single-precision format, with
289 the abstract input cannot be represented exactly as a subnormal single-
290 precision floating-point number.
296 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
298 Binary Floating-Point Arithmetic.
299 -------------------------------------------------------------------------------
332             return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );  in roundAndPackFloat32()
337                 || ( zExp < -1 )  in roundAndPackFloat32()
339             shift32RightJamming( zSig, - zExp, &zSig );  in roundAndPackFloat32()
354 -------------------------------------------------------------------------------
355 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
356 and significand `zSig', and returns the proper single-precision floating-
360 floating-point exponent.
361 -------------------------------------------------------------------------------
368     shiftCount = countLeadingZeros32( zSig ) - 1;  in normalizeRoundAndPackFloat32()
369     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );  in normalizeRoundAndPackFloat32()
374 -------------------------------------------------------------------------------
375 Returns the fraction bits of the double-precision floating-point value `a'.
376 -------------------------------------------------------------------------------
386 -------------------------------------------------------------------------------
387 Returns the exponent bits of the double-precision floating-point value `a'.
388 -------------------------------------------------------------------------------
398 -------------------------------------------------------------------------------
399 Returns the sign bit of the double-precision floating-point value `a'.
400 -------------------------------------------------------------------------------
410 -------------------------------------------------------------------------------
411 Normalizes the subnormal double-precision floating-point value represented
415 -------------------------------------------------------------------------------
422     shiftCount = countLeadingZeros64( aSig ) - 11;  in normalizeFloat64Subnormal()
424     *zExpPtr = 1 - shiftCount;  in normalizeFloat64Subnormal()
429 -------------------------------------------------------------------------------
431 double-precision floating-point value, returning the result.  After being
438 -------------------------------------------------------------------------------
449 -------------------------------------------------------------------------------
450 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
451 and significand `zSig', and returns the proper double-precision floating-
453 value is simply rounded and packed into the double-precision format, with
459 the abstract input cannot be represented exactly as a subnormal double-
460 precision floating-point number.
466 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
468 Binary Floating-Point Arithmetic.
469 -------------------------------------------------------------------------------
503 		FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -  in roundAndPackFloat64()
509                 || ( zExp < -1 )  in roundAndPackFloat64()
511             shift64RightJamming( zSig, - zExp, &zSig );  in roundAndPackFloat64()
526 -------------------------------------------------------------------------------
527 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
528 and significand `zSig', and returns the proper double-precision floating-
532 floating-point exponent.
533 -------------------------------------------------------------------------------
540     shiftCount = countLeadingZeros64( zSig ) - 1;  in normalizeRoundAndPackFloat64()
541     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );  in normalizeRoundAndPackFloat64()
548 -------------------------------------------------------------------------------
549 Returns the fraction bits of the extended double-precision floating-point
551 -------------------------------------------------------------------------------
561 -------------------------------------------------------------------------------
562 Returns the exponent bits of the extended double-precision floating-point
564 -------------------------------------------------------------------------------
569     return a.high & 0x7FFF;  in extractFloatx80Exp()
574 -------------------------------------------------------------------------------
575 Returns the sign bit of the extended double-precision floating-point value
577 -------------------------------------------------------------------------------
582     return a.high>>15;  in extractFloatx80Sign()
587 -------------------------------------------------------------------------------
588 Normalizes the subnormal extended double-precision floating-point value
592 -------------------------------------------------------------------------------
601     *zExpPtr = 1 - shiftCount;  in normalizeFloatx80Subnormal()
606 -------------------------------------------------------------------------------
608 extended double-precision floating-point value, returning the result.
609 -------------------------------------------------------------------------------
613     floatx80 z;  in packFloatx80()  local
615     z.low = zSig;  in packFloatx80()
616     z.high = ( ( (bits16) zSign )<<15 ) + zExp;  in packFloatx80()
617     return z;  in packFloatx80()
622 -------------------------------------------------------------------------------
623 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
625 and returns the proper extended double-precision floating-point value
627 rounded and packed into the extended double-precision format, with the
634 double-precision floating-point number.
637 result is rounded to the full precision of the extended double-precision
643 Floating-Point Arithmetic.
644 -------------------------------------------------------------------------------
685     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {  in roundAndPackFloatx80()
696             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );  in roundAndPackFloatx80()
739     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {  in roundAndPackFloatx80()
763             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );  in roundAndPackFloatx80()
806 -------------------------------------------------------------------------------
807 Takes an abstract floating-point value having sign `zSign', exponent
809 and returns the proper extended double-precision floating-point value
813 -------------------------------------------------------------------------------
825         zExp -= 64;  in normalizeRoundAndPackFloatx80()
829     zExp -= shiftCount;  in normalizeRoundAndPackFloatx80()
840 -------------------------------------------------------------------------------
841 Returns the least-significant 64 fraction bits of the quadruple-precision
842 floating-point value `a'.
843 -------------------------------------------------------------------------------
853 -------------------------------------------------------------------------------
854 Returns the most-significant 48 fraction bits of the quadruple-precision
855 floating-point value `a'.
856 -------------------------------------------------------------------------------
861     return a.high & LIT64( 0x0000FFFFFFFFFFFF );  in extractFloat128Frac0()
866 -------------------------------------------------------------------------------
867 Returns the exponent bits of the quadruple-precision floating-point value
869 -------------------------------------------------------------------------------
874     return ( a.high>>48 ) & 0x7FFF;  in extractFloat128Exp()
879 -------------------------------------------------------------------------------
880 Returns the sign bit of the quadruple-precision floating-point value `a'.
881 -------------------------------------------------------------------------------
886     return a.high>>63;  in extractFloat128Sign()
891 -------------------------------------------------------------------------------
892 Normalizes the subnormal quadruple-precision floating-point value
899 -------------------------------------------------------------------------------
913         shiftCount = countLeadingZeros64( aSig1 ) - 15;  in normalizeFloat128Subnormal()
915             *zSig0Ptr = aSig1>>( - shiftCount );  in normalizeFloat128Subnormal()
922         *zExpPtr = - shiftCount - 63;  in normalizeFloat128Subnormal()
925         shiftCount = countLeadingZeros64( aSig0 ) - 15;  in normalizeFloat128Subnormal()
927         *zExpPtr = 1 - shiftCount;  in normalizeFloat128Subnormal()
933 -------------------------------------------------------------------------------
935 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
936 floating-point value, returning the result.  After being shifted into the
944 -------------------------------------------------------------------------------
949     float128 z;  in packFloat128()  local
951     z.low = zSig1;  in packFloat128()
952     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;  in packFloat128()
953     return z;  in packFloat128()
958 -------------------------------------------------------------------------------
959 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
961 and `zSig2', and returns the proper quadruple-precision floating-point value
963 simply rounded and packed into the quadruple-precision format, with the
969 the abstract input cannot be represented exactly as a subnormal quadruple-
970 precision floating-point number.
975 than the ``true'' floating-point exponent.  The handling of underflow and
976 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
977 -------------------------------------------------------------------------------
1032                 || ( zExp < -1 )  in roundAndPackFloat128()
1041                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );  in roundAndPackFloat128()
1070 -------------------------------------------------------------------------------
1071 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1073 returns the proper quadruple-precision floating-point value corresponding
1076 normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1078 -------------------------------------------------------------------------------
1090         zExp -= 64;  in normalizeRoundAndPackFloat128()
1092     shiftCount = countLeadingZeros64( zSig0 ) - 15;  in normalizeRoundAndPackFloat128()
1099             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );  in normalizeRoundAndPackFloat128()
1101     zExp -= shiftCount;  in normalizeRoundAndPackFloat128()
1109 -------------------------------------------------------------------------------
1110 Returns the result of converting the 32-bit two's complement integer `a'
1111 to the single-precision floating-point format.  The conversion is performed
1112 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1113 -------------------------------------------------------------------------------
1122     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );  in int32_to_float32()
1138 -------------------------------------------------------------------------------
1139 Returns the result of converting the 32-bit two's complement integer `a'
1140 to the double-precision floating-point format.  The conversion is performed
1141 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1142 -------------------------------------------------------------------------------
1153     absA = zSign ? - a : a;  in int32_to_float64()
1156     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );  in int32_to_float64()
1168     return packFloat64( 0, 0x432 - shiftCount, zSig<<shiftCount );  in uint32_to_float64()
1176 -------------------------------------------------------------------------------
1177 Returns the result of converting the 32-bit two's complement integer `a'
1178 to the extended double-precision floating-point format.  The conversion
1179 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1181 -------------------------------------------------------------------------------
1192     absA = zSign ? - a : a;  in int32_to_floatx80()
1195     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );  in int32_to_floatx80()
1206     return packFloatx80( 0, 0x403E - shiftCount, zSig<<shiftCount );  in uint32_to_floatx80()
1215 -------------------------------------------------------------------------------
1216 Returns the result of converting the 32-bit two's complement integer `a' to
1217 the quadruple-precision floating-point format.  The conversion is performed
1218 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1219 -------------------------------------------------------------------------------
1230     absA = zSign ? - a : a;  in int32_to_float128()
1233     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );  in int32_to_float128()
1244     return packFloat128( 0, 0x402E - shiftCount, zSig0<<shiftCount, 0 );  in uint32_to_float128()
1252 -------------------------------------------------------------------------------
1253 Returns the result of converting the 64-bit two's complement integer `a'
1254 to the single-precision floating-point format.  The conversion is performed
1255 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1256 -------------------------------------------------------------------------------
1266     absA = zSign ? - a : a;  in int64_to_float32()
1267     shiftCount = countLeadingZeros64( absA ) - 40;  in int64_to_float32()
1269         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );  in int64_to_float32()
1274             shift64RightJamming( absA, - shiftCount, &absA );  in int64_to_float32()
1279         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );  in int64_to_float32()
1285 -------------------------------------------------------------------------------
1286 Returns the result of converting the 64-bit two's complement integer `a'
1287 to the double-precision floating-point format.  The conversion is performed
1288 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1289 -------------------------------------------------------------------------------
1300     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );  in int64_to_float64()
1307 -------------------------------------------------------------------------------
1308 Returns the result of converting the 64-bit two's complement integer `a'
1309 to the extended double-precision floating-point format.  The conversion
1310 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1312 -------------------------------------------------------------------------------
1322     absA = zSign ? - a : a;  in int64_to_floatx80()
1324     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );  in int64_to_floatx80()
1335 -------------------------------------------------------------------------------
1336 Returns the result of converting the 64-bit two's complement integer `a' to
1337 the quadruple-precision floating-point format.  The conversion is performed
1338 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1339 -------------------------------------------------------------------------------
1351     absA = zSign ? - a : a;  in int64_to_float128()
1353     zExp = 0x406E - shiftCount;  in int64_to_float128()
1357         shiftCount -= 64;  in int64_to_float128()
1372 -------------------------------------------------------------------------------
1373 Returns the result of converting the single-precision floating-point value
1374 `a' to the 32-bit two's complement integer format.  The conversion is
1375 performed according to the IEC/IEEE Standard for Binary Floating-Point
1376 Arithmetic---which means in particular that the conversion is rounded
1380 -------------------------------------------------------------------------------
1394     shiftCount = 0xAF - aExp;  in float32_to_int32()
1404 -------------------------------------------------------------------------------
1405 Returns the result of converting the single-precision floating-point value
1406 `a' to the 32-bit two's complement integer format.  The conversion is
1407 performed according to the IEC/IEEE Standard for Binary Floating-Point
1412 -------------------------------------------------------------------------------
1419     int32 z;  in float32_to_int32_round_to_zero()  local
1424     shiftCount = aExp - 0x9E;  in float32_to_int32_round_to_zero()
1437     z = aSig>>( - shiftCount );  in float32_to_int32_round_to_zero()
1441     if ( aSign ) z = - z;  in float32_to_int32_round_to_zero()
1442     return z;  in float32_to_int32_round_to_zero()
1448 -------------------------------------------------------------------------------
1449 Returns the result of converting the single-precision floating-point value
1450 `a' to the 64-bit two's complement integer format.  The conversion is
1451 performed according to the IEC/IEEE Standard for Binary Floating-Point
1452 Arithmetic---which means in particular that the conversion is rounded
1456 -------------------------------------------------------------------------------
1468     shiftCount = 0xBE - aExp;  in float32_to_int64()
1485 -------------------------------------------------------------------------------
1486 Returns the result of converting the single-precision floating-point value
1487 `a' to the 64-bit two's complement integer format.  The conversion is
1488 performed according to the IEC/IEEE Standard for Binary Floating-Point
1493 -------------------------------------------------------------------------------
1501     int64 z;  in float32_to_int64_round_to_zero()  local
1506     shiftCount = aExp - 0xBE;  in float32_to_int64_round_to_zero()
1522     z = aSig64>>( - shiftCount );  in float32_to_int64_round_to_zero()
1526     if ( aSign ) z = - z;  in float32_to_int64_round_to_zero()
1527     return z;  in float32_to_int64_round_to_zero()
1533 -------------------------------------------------------------------------------
1534 Returns the result of converting the single-precision floating-point value
1535 `a' to the double-precision floating-point format.  The conversion is
1536 performed according to the IEC/IEEE Standard for Binary Floating-Point
1538 -------------------------------------------------------------------------------
1556         --aExp;  in float32_to_float64()
1565 -------------------------------------------------------------------------------
1566 Returns the result of converting the single-precision floating-point value
1567 `a' to the extended double-precision floating-point format.  The conversion
1568 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1570 -------------------------------------------------------------------------------
1599 -------------------------------------------------------------------------------
1600 Returns the result of converting the single-precision floating-point value
1601 `a' to the quadruple-precision floating-point format.  The conversion is
1602 performed according to the IEC/IEEE Standard for Binary Floating-Point
1604 -------------------------------------------------------------------------------
1622         --aExp;  in float32_to_float128()
1632 -------------------------------------------------------------------------------
1633 Rounds the single-precision floating-point value `a' to an integer, and
1634 returns the result as a single-precision floating-point value.  The
1636 Floating-Point Arithmetic.
1637 -------------------------------------------------------------------------------
1645     float32 z;  in float32_round_to_int()  local
1674     lastBitMask <<= 0x96 - aExp;  in float32_round_to_int()
1675     roundBitsMask = lastBitMask - 1;  in float32_round_to_int()
1676     z = a;  in float32_round_to_int()
1679         z += lastBitMask>>1;  in float32_round_to_int()
1680         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;  in float32_round_to_int()
1683         if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {  in float32_round_to_int()
1684             z += roundBitsMask;  in float32_round_to_int()
1687     z &= ~ roundBitsMask;  in float32_round_to_int()
1688     if ( z != a ) float_exception_flags |= float_flag_inexact;  in float32_round_to_int()
1689     return z;  in float32_round_to_int()
1695 -------------------------------------------------------------------------------
1696 Returns the result of adding the absolute values of the single-precision
1697 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1700 Floating-Point Arithmetic.
1701 -------------------------------------------------------------------------------
1713     expDiff = aExp - bExp;  in addFloat32Sigs()
1722             --expDiff;  in addFloat32Sigs()
1741         shift32RightJamming( aSig, - expDiff, &aSig );  in addFloat32Sigs()
1756     --zExp;  in addFloat32Sigs()
1767 -------------------------------------------------------------------------------
1768 Returns the result of subtracting the absolute values of the single-
1769 precision floating-point values `a' and `b'.  If `zSign' is 1, the
1772 Standard for Binary Floating-Point Arithmetic.
1773 -------------------------------------------------------------------------------
1785     expDiff = aExp - bExp;  in subFloat32Sigs()
1813     shift32RightJamming( aSig, - expDiff, &aSig );  in subFloat32Sigs()
1816     zSig = bSig - aSig;  in subFloat32Sigs()
1826         --expDiff;  in subFloat32Sigs()
1834     zSig = aSig - bSig;  in subFloat32Sigs()
1837     --zExp;  in subFloat32Sigs()
1843 -------------------------------------------------------------------------------
1844 Returns the result of adding the single-precision floating-point values `a'
1846 Binary Floating-Point Arithmetic.
1847 -------------------------------------------------------------------------------
1865 -------------------------------------------------------------------------------
1866 Returns the result of subtracting the single-precision floating-point values
1868 for Binary Floating-Point Arithmetic.
1869 -------------------------------------------------------------------------------
1887 -------------------------------------------------------------------------------
1888 Returns the result of multiplying the single-precision floating-point values
1890 for Binary Floating-Point Arithmetic.
1891 -------------------------------------------------------------------------------
1934     zExp = aExp + bExp - 0x7F;  in float32_mul()
1941         --zExp;  in float32_mul()
1948 -------------------------------------------------------------------------------
1949 Returns the result of dividing the single-precision floating-point value `a'
1951 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1952 -------------------------------------------------------------------------------
1995     zExp = aExp - bExp + 0x7D;  in float32_div()
2012 -------------------------------------------------------------------------------
2013 Returns the remainder of the single-precision floating-point value `a'
2015 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2016 -------------------------------------------------------------------------------
2056     expDiff = aExp - bExp;  in float32_rem()
2063             if ( expDiff < -1 ) return a;  in float32_rem()
2067         if ( q ) aSig -= bSig;  in float32_rem()
2070             q >>= 32 - expDiff;  in float32_rem()
2072             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;  in float32_rem()
2080         if ( bSig <= aSig ) aSig -= bSig;  in float32_rem()
2083         expDiff -= 64;  in float32_rem()
2086             q64 = ( 2 < q64 ) ? q64 - 2 : 0;  in float32_rem()
2087             aSig64 = - ( ( bSig * q64 )<<38 );  in float32_rem()
2088             expDiff -= 62;  in float32_rem()
2092         q64 = ( 2 < q64 ) ? q64 - 2 : 0;  in float32_rem()
2093         q = q64>>( 64 - expDiff );  in float32_rem()
2095         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;  in float32_rem()
2100         aSig -= bSig;  in float32_rem()
2107     if ( zSign ) aSig = - aSig;  in float32_rem()
2115 -------------------------------------------------------------------------------
2116 Returns the square root of the single-precision floating-point value `a'.
2118 Floating-Point Arithmetic.
2119 -------------------------------------------------------------------------------
2146     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;  in float32_sqrt()
2156         rem = ( ( (bits64) aSig )<<32 ) - term;  in float32_sqrt()
2158             --zSig;  in float32_sqrt()
2171 -------------------------------------------------------------------------------
2172 Returns 1 if the single-precision floating-point value `a' is equal to
2174 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2175 -------------------------------------------------------------------------------
2193 -------------------------------------------------------------------------------
2194 Returns 1 if the single-precision floating-point value `a' is less than
2196 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2198 -------------------------------------------------------------------------------
2218 -------------------------------------------------------------------------------
2219 Returns 1 if the single-precision floating-point value `a' is less than
2221 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2222 -------------------------------------------------------------------------------
2243 -------------------------------------------------------------------------------
2244 Returns 1 if the single-precision floating-point value `a' is equal to
2247 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2248 -------------------------------------------------------------------------------
2264 -------------------------------------------------------------------------------
2265 Returns 1 if the single-precision floating-point value `a' is less than or
2268 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2269 -------------------------------------------------------------------------------
2291 -------------------------------------------------------------------------------
2292 Returns 1 if the single-precision floating-point value `a' is less than
2295 Standard for Binary Floating-Point Arithmetic.
2296 -------------------------------------------------------------------------------
2320 -------------------------------------------------------------------------------
2321 Returns the result of converting the double-precision floating-point value
2322 `a' to the 32-bit two's complement integer format.  The conversion is
2323 performed according to the IEC/IEEE Standard for Binary Floating-Point
2324 Arithmetic---which means in particular that the conversion is rounded
2328 -------------------------------------------------------------------------------
2341     shiftCount = 0x42C - aExp;  in float64_to_int32()
2349 -------------------------------------------------------------------------------
2350 Returns the result of converting the double-precision floating-point value
2351 `a' to the 32-bit two's complement integer format.  The conversion is
2352 performed according to the IEC/IEEE Standard for Binary Floating-Point
2357 -------------------------------------------------------------------------------
2364     int32 z;  in float64_to_int32_round_to_zero()  local
2378     shiftCount = 0x433 - aExp;  in float64_to_int32_round_to_zero()
2381     z = aSig;  in float64_to_int32_round_to_zero()
2382     if ( aSign ) z = - z;  in float64_to_int32_round_to_zero()
2383     if ( ( z < 0 ) ^ aSign ) {  in float64_to_int32_round_to_zero()
2391     return z;  in float64_to_int32_round_to_zero()
2397 -------------------------------------------------------------------------------
2398 Returns the result of converting the double-precision floating-point value
2399 `a' to the 64-bit two's complement integer format.  The conversion is
2400 performed according to the IEC/IEEE Standard for Binary Floating-Point
2401 Arithmetic---which means in particular that the conversion is rounded
2405 -------------------------------------------------------------------------------
2417     shiftCount = 0x433 - aExp;  in float64_to_int64()
2430         aSig <<= - shiftCount;  in float64_to_int64()
2440 -------------------------------------------------------------------------------
2441 Returns the result of converting the double-precision floating-point value
2442 `a' to the 64-bit two's complement integer format.  The conversion is
2443 performed according to the IEC/IEEE Standard for Binary Floating-Point
2448 -------------------------------------------------------------------------------
2455     int64 z;  in float64_to_int64_round_to_zero()  local
2461     shiftCount = aExp - 0x433;  in float64_to_int64_round_to_zero()
2475         z = aSig<<shiftCount;  in float64_to_int64_round_to_zero()
2482         z = aSig>>( - shiftCount );  in float64_to_int64_round_to_zero()
2487     if ( aSign ) z = - z;  in float64_to_int64_round_to_zero()
2488     return z;  in float64_to_int64_round_to_zero()
2494 -------------------------------------------------------------------------------
2495 Returns the result of converting the double-precision floating-point value
2496 `a' to the single-precision floating-point format.  The conversion is
2497 performed according to the IEC/IEEE Standard for Binary Floating-Point
2499 -------------------------------------------------------------------------------
2519         aExp -= 0x381;  in float64_to_float32()
2528 -------------------------------------------------------------------------------
2529 Returns the result of converting the double-precision floating-point value
2530 `a' to the extended double-precision floating-point format.  The conversion
2531 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2533 -------------------------------------------------------------------------------
2563 -------------------------------------------------------------------------------
2564 Returns the result of converting the double-precision floating-point value
2565 `a' to the quadruple-precision floating-point format.  The conversion is
2566 performed according to the IEC/IEEE Standard for Binary Floating-Point
2568 -------------------------------------------------------------------------------
2586         --aExp;  in float64_to_float128()
2597 -------------------------------------------------------------------------------
2598 Rounds the double-precision floating-point value `a' to an integer, and
2599 returns the result as a double-precision floating-point value.  The
2601 Floating-Point Arithmetic.
2602 -------------------------------------------------------------------------------
2610     float64 z;  in float64_round_to_int()  local
2640     lastBitMask <<= 0x433 - aExp;  in float64_round_to_int()
2641     roundBitsMask = lastBitMask - 1;  in float64_round_to_int()
2642     z = a;  in float64_round_to_int()
2645         z += lastBitMask>>1;  in float64_round_to_int()
2646         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;  in float64_round_to_int()
2649         if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {  in float64_round_to_int()
2650             z += roundBitsMask;  in float64_round_to_int()
2653     z &= ~ roundBitsMask;  in float64_round_to_int()
2654     if ( z != a ) float_exception_flags |= float_flag_inexact;  in float64_round_to_int()
2655     return z;  in float64_round_to_int()
2661 -------------------------------------------------------------------------------
2662 Returns the result of adding the absolute values of the double-precision
2663 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2666 Floating-Point Arithmetic.
2667 -------------------------------------------------------------------------------
2679     expDiff = aExp - bExp;  in addFloat64Sigs()
2688             --expDiff;  in addFloat64Sigs()
2707         shift64RightJamming( aSig, - expDiff, &aSig );  in addFloat64Sigs()
2722     --zExp;  in addFloat64Sigs()
2733 -------------------------------------------------------------------------------
2734 Returns the result of subtracting the absolute values of the double-
2735 precision floating-point values `a' and `b'.  If `zSign' is 1, the
2738 Standard for Binary Floating-Point Arithmetic.
2739 -------------------------------------------------------------------------------
2751     expDiff = aExp - bExp;  in subFloat64Sigs()
2779     shift64RightJamming( aSig, - expDiff, &aSig );  in subFloat64Sigs()
2782     zSig = bSig - aSig;  in subFloat64Sigs()
2792         --expDiff;  in subFloat64Sigs()
2800     zSig = aSig - bSig;  in subFloat64Sigs()
2803     --zExp;  in subFloat64Sigs()
2809 -------------------------------------------------------------------------------
2810 Returns the result of adding the double-precision floating-point values `a'
2812 Binary Floating-Point Arithmetic.
2813 -------------------------------------------------------------------------------
2831 -------------------------------------------------------------------------------
2832 Returns the result of subtracting the double-precision floating-point values
2834 for Binary Floating-Point Arithmetic.
2835 -------------------------------------------------------------------------------
2853 -------------------------------------------------------------------------------
2854 Returns the result of multiplying the double-precision floating-point values
2856 for Binary Floating-Point Arithmetic.
2857 -------------------------------------------------------------------------------
2898     zExp = aExp + bExp - 0x3FF;  in float64_mul()
2905         --zExp;  in float64_mul()
2912 -------------------------------------------------------------------------------
2913 Returns the result of dividing the double-precision floating-point value `a'
2915 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2916 -------------------------------------------------------------------------------
2961     zExp = aExp - bExp + 0x3FD;  in float64_div()
2973             --zSig;  in float64_div()
2984 -------------------------------------------------------------------------------
2985 Returns the remainder of the double-precision floating-point value `a'
2987 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2988 -------------------------------------------------------------------------------
3026     expDiff = aExp - bExp;  in float64_rem()
3030         if ( expDiff < -1 ) return a;  in float64_rem()
3034     if ( q ) aSig -= bSig;  in float64_rem()
3035     expDiff -= 64;  in float64_rem()
3038         q = ( 2 < q ) ? q - 2 : 0;  in float64_rem()
3039         aSig = - ( ( bSig>>2 ) * q );  in float64_rem()
3040         expDiff -= 62;  in float64_rem()
3045         q = ( 2 < q ) ? q - 2 : 0;  in float64_rem()
3046         q >>= 64 - expDiff;  in float64_rem()
3048         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;  in float64_rem()
3057         aSig -= bSig;  in float64_rem()
3064     if ( zSign ) aSig = - aSig;  in float64_rem()
3070 -------------------------------------------------------------------------------
3071 Returns the square root of the double-precision floating-point value `a'.
3073 Floating-Point Arithmetic.
3074 -------------------------------------------------------------------------------
3101     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;  in float64_sqrt()
3104     aSig <<= 9 - ( aExp & 1 );  in float64_sqrt()
3111             --zSig;  in float64_sqrt()
3112             doubleZSig -= 2;  in float64_sqrt()
3123 -------------------------------------------------------------------------------
3124 Returns 1 if the double-precision floating-point value `a' is equal to the
3126 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3127 -------------------------------------------------------------------------------
3146 -------------------------------------------------------------------------------
3147 Returns 1 if the double-precision floating-point value `a' is less than or
3149 performed according to the IEC/IEEE Standard for Binary Floating-Point
3151 -------------------------------------------------------------------------------
3175 -------------------------------------------------------------------------------
3176 Returns 1 if the double-precision floating-point value `a' is less than
3178 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3179 -------------------------------------------------------------------------------
3204 -------------------------------------------------------------------------------
3205 Returns 1 if the double-precision floating-point value `a' is equal to the
3208 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3209 -------------------------------------------------------------------------------
3225 -------------------------------------------------------------------------------
3226 Returns 1 if the double-precision floating-point value `a' is less than or
3229 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3230 -------------------------------------------------------------------------------
3252 -------------------------------------------------------------------------------
3253 Returns 1 if the double-precision floating-point value `a' is less than
3256 Standard for Binary Floating-Point Arithmetic.
3257 -------------------------------------------------------------------------------
3282 -------------------------------------------------------------------------------
3283 Returns the result of converting the extended double-precision floating-
3284 point value `a' to the 32-bit two's complement integer format.  The
3286 Floating-Point Arithmetic---which means in particular that the conversion
3290 -------------------------------------------------------------------------------
3302     shiftCount = 0x4037 - aExp;  in floatx80_to_int32()
3310 -------------------------------------------------------------------------------
3311 Returns the result of converting the extended double-precision floating-
3312 point value `a' to the 32-bit two's complement integer format.  The
3314 Floating-Point Arithmetic, except that the conversion is always rounded
3318 -------------------------------------------------------------------------------
3325     int32 z;  in floatx80_to_int32_round_to_zero()  local
3338     shiftCount = 0x403E - aExp;  in floatx80_to_int32_round_to_zero()
3341     z = aSig;  in floatx80_to_int32_round_to_zero()
3342     if ( aSign ) z = - z;  in floatx80_to_int32_round_to_zero()
3343     if ( ( z < 0 ) ^ aSign ) {  in floatx80_to_int32_round_to_zero()
3351     return z;  in floatx80_to_int32_round_to_zero()
3356 -------------------------------------------------------------------------------
3357 Returns the result of converting the extended double-precision floating-
3358 point value `a' to the 64-bit two's complement integer format.  The
3360 Floating-Point Arithmetic---which means in particular that the conversion
3364 -------------------------------------------------------------------------------
3375     shiftCount = 0x403E - aExp;  in floatx80_to_int64()
3397 -------------------------------------------------------------------------------
3398 Returns the result of converting the extended double-precision floating-
3399 point value `a' to the 64-bit two's complement integer format.  The
3401 Floating-Point Arithmetic, except that the conversion is always rounded
3405 -------------------------------------------------------------------------------
3412     int64 z;  in floatx80_to_int64_round_to_zero()  local
3417     shiftCount = aExp - 0x403E;  in floatx80_to_int64_round_to_zero()
3420         if ( ( a.high != 0xC03E ) || aSig ) {  in floatx80_to_int64_round_to_zero()
3432     z = aSig>>( - shiftCount );  in floatx80_to_int64_round_to_zero()
3436     if ( aSign ) z = - z;  in floatx80_to_int64_round_to_zero()
3437     return z;  in floatx80_to_int64_round_to_zero()
3442 -------------------------------------------------------------------------------
3443 Returns the result of converting the extended double-precision floating-
3444 point value `a' to the single-precision floating-point format.  The
3446 Floating-Point Arithmetic.
3447 -------------------------------------------------------------------------------
3465     if ( aExp || aSig ) aExp -= 0x3F81;  in floatx80_to_float32()
3471 -------------------------------------------------------------------------------
3472 Returns the result of converting the extended double-precision floating-
3473 point value `a' to the double-precision floating-point format.  The
3475 Floating-Point Arithmetic.
3476 -------------------------------------------------------------------------------
3494     if ( aExp || aSig ) aExp -= 0x3C01;  in floatx80_to_float64()
3502 -------------------------------------------------------------------------------
3503 Returns the result of converting the extended double-precision floating-
3504 point value `a' to the quadruple-precision floating-point format.  The
3506 Floating-Point Arithmetic.
3507 -------------------------------------------------------------------------------
3529 -------------------------------------------------------------------------------
3530 Rounds the extended double-precision floating-point value `a' to an integer,
3531 and returns the result as an extended double-precision floating-point
3533 Binary Floating-Point Arithmetic.
3534 -------------------------------------------------------------------------------
3542     floatx80 z;  in floatx80_round_to_int()  local
3581     lastBitMask <<= 0x403E - aExp;  in floatx80_round_to_int()
3582     roundBitsMask = lastBitMask - 1;  in floatx80_round_to_int()
3583     z = a;  in floatx80_round_to_int()
3586         z.low += lastBitMask>>1;  in floatx80_round_to_int()
3587         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;  in floatx80_round_to_int()
3590         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {  in floatx80_round_to_int()
3591             z.low += roundBitsMask;  in floatx80_round_to_int()
3594     z.low &= ~ roundBitsMask;  in floatx80_round_to_int()
3595     if ( z.low == 0 ) {  in floatx80_round_to_int()
3596         ++z.high;  in floatx80_round_to_int()
3597         z.low = LIT64( 0x8000000000000000 );  in floatx80_round_to_int()
3599     if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;  in floatx80_round_to_int()
3600     return z;  in floatx80_round_to_int()
3605 -------------------------------------------------------------------------------
3606 Returns the result of adding the absolute values of the extended double-
3607 precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3610 Floating-Point Arithmetic.
3611 -------------------------------------------------------------------------------
3623     expDiff = aExp - bExp;  in addFloatx80Sigs()
3629         if ( bExp == 0 ) --expDiff;  in addFloatx80Sigs()
3639         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );  in addFloatx80Sigs()
3672 -------------------------------------------------------------------------------
3674 double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3677 Standard for Binary Floating-Point Arithmetic.
3678 -------------------------------------------------------------------------------
3685     floatx80 z;  in subFloatx80Sigs()  local
3691     expDiff = aExp - bExp;  in subFloatx80Sigs()
3699         z.low = floatx80_default_nan_low;  in subFloatx80Sigs()
3700         z.high = floatx80_default_nan_high;  in subFloatx80Sigs()
3701         return z;  in subFloatx80Sigs()
3717     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );  in subFloatx80Sigs()
3728     if ( bExp == 0 ) --expDiff;  in subFloatx80Sigs()
3741 -------------------------------------------------------------------------------
3742 Returns the result of adding the extended double-precision floating-point
3744 Standard for Binary Floating-Point Arithmetic.
3745 -------------------------------------------------------------------------------
3763 -------------------------------------------------------------------------------
3764 Returns the result of subtracting the extended double-precision floating-
3766 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3767 -------------------------------------------------------------------------------
3785 -------------------------------------------------------------------------------
3786 Returns the result of multiplying the extended double-precision floating-
3788 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3789 -------------------------------------------------------------------------------
3796     floatx80 z;  in floatx80_mul()  local
3818             z.low = floatx80_default_nan_low;  in floatx80_mul()
3819             z.high = floatx80_default_nan_high;  in floatx80_mul()
3820             return z;  in floatx80_mul()
3832     zExp = aExp + bExp - 0x3FFE;  in floatx80_mul()
3836         --zExp;  in floatx80_mul()
3845 -------------------------------------------------------------------------------
3846 Returns the result of dividing the extended double-precision floating-point
3848 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3849 -------------------------------------------------------------------------------
3857     floatx80 z;  in floatx80_div()  local
3883                 z.low = floatx80_default_nan_low;  in floatx80_div()
3884                 z.high = floatx80_default_nan_high;  in floatx80_div()
3885                 return z;  in floatx80_div()
3896     zExp = aExp - bExp + 0x3FFE;  in floatx80_div()
3906         --zSig0;  in floatx80_div()
3914             --zSig1;  in floatx80_div()
3926 -------------------------------------------------------------------------------
3927 Returns the remainder of the extended double-precision floating-point value
3929 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3930 -------------------------------------------------------------------------------
3938     floatx80 z;  in floatx80_rem()  local
3961             z.low = floatx80_default_nan_low;  in floatx80_rem()
3962             z.high = floatx80_default_nan_high;  in floatx80_rem()
3963             return z;  in floatx80_rem()
3973     expDiff = aExp - bExp;  in floatx80_rem()
3976         if ( expDiff < -1 ) return a;  in floatx80_rem()
3981     if ( q ) aSig0 -= bSig;  in floatx80_rem()
3982     expDiff -= 64;  in floatx80_rem()
3985         q = ( 2 < q ) ? q - 2 : 0;  in floatx80_rem()
3989         expDiff -= 62;  in floatx80_rem()
3994         q = ( 2 < q ) ? q - 2 : 0;  in floatx80_rem()
3995         q >>= 64 - expDiff;  in floatx80_rem()
3996         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );  in floatx80_rem()
3998         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );  in floatx80_rem()
4024 -------------------------------------------------------------------------------
4025 Returns the square root of the extended double-precision floating-point
4027 for Binary Floating-Point Arithmetic.
4028 -------------------------------------------------------------------------------
4036     floatx80 z;  in floatx80_sqrt()  local
4050         z.low = floatx80_default_nan_low;  in floatx80_sqrt()
4051         z.high = floatx80_default_nan_high;  in floatx80_sqrt()
4052         return z;  in floatx80_sqrt()
4058     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;  in floatx80_sqrt()
4066         --zSig0;  in floatx80_sqrt()
4067         doubleZSig0 -= 2;  in floatx80_sqrt()
4078             --zSig1;  in floatx80_sqrt()
4095 -------------------------------------------------------------------------------
4096 Returns 1 if the extended double-precision floating-point value `a' is
4098 performed according to the IEC/IEEE Standard for Binary Floating-Point
4100 -------------------------------------------------------------------------------
4118         && (    ( a.high == b.high )  in floatx80_eq()
4120                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )  in floatx80_eq()
4126 -------------------------------------------------------------------------------
4127 Returns 1 if the extended double-precision floating-point value `a' is
4130 Floating-Point Arithmetic.
4131 -------------------------------------------------------------------------------
4150             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )  in floatx80_le()
4154           aSign ? le128( b.high, b.low, a.high, a.low )  in floatx80_le()
4155         : le128( a.high, a.low, b.high, b.low );  in floatx80_le()
4160 -------------------------------------------------------------------------------
4161 Returns 1 if the extended double-precision floating-point value `a' is
4163 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4165 -------------------------------------------------------------------------------
4184             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )  in floatx80_lt()
4188           aSign ? lt128( b.high, b.low, a.high, a.low )  in floatx80_lt()
4189         : lt128( a.high, a.low, b.high, b.low );  in floatx80_lt()
4194 -------------------------------------------------------------------------------
4195 Returns 1 if the extended double-precision floating-point value `a' is equal
4198 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4199 -------------------------------------------------------------------------------
4214         && (    ( a.high == b.high )  in floatx80_eq_signaling()
4216                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )  in floatx80_eq_signaling()
4222 -------------------------------------------------------------------------------
4223 Returns 1 if the extended double-precision floating-point value `a' is less
4226 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4227 -------------------------------------------------------------------------------
4249             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )  in floatx80_le_quiet()
4253           aSign ? le128( b.high, b.low, a.high, a.low )  in floatx80_le_quiet()
4254         : le128( a.high, a.low, b.high, b.low );  in floatx80_le_quiet()
4259 -------------------------------------------------------------------------------
4260 Returns 1 if the extended double-precision floating-point value `a' is less
4263 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4264 -------------------------------------------------------------------------------
4286             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )  in floatx80_lt_quiet()
4290           aSign ? lt128( b.high, b.low, a.high, a.low )  in floatx80_lt_quiet()
4291         : lt128( a.high, a.low, b.high, b.low );  in floatx80_lt_quiet()
4300 -------------------------------------------------------------------------------
4301 Returns the result of converting the quadruple-precision floating-point
4302 value `a' to the 32-bit two's complement integer format.  The conversion
4303 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4304 Arithmetic---which means in particular that the conversion is rounded
4308 -------------------------------------------------------------------------------
4323     shiftCount = 0x4028 - aExp;  in float128_to_int32()
4330 -------------------------------------------------------------------------------
4331 Returns the result of converting the quadruple-precision floating-point
4332 value `a' to the 32-bit two's complement integer format.  The conversion
4333 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4338 -------------------------------------------------------------------------------
4345     int32 z;  in float128_to_int32_round_to_zero()  local
4361     shiftCount = 0x402F - aExp;  in float128_to_int32_round_to_zero()
4364     z = aSig0;  in float128_to_int32_round_to_zero()
4365     if ( aSign ) z = - z;  in float128_to_int32_round_to_zero()
4366     if ( ( z < 0 ) ^ aSign ) {  in float128_to_int32_round_to_zero()
4374     return z;  in float128_to_int32_round_to_zero()
4379 -------------------------------------------------------------------------------
4380 Returns the result of converting the quadruple-precision floating-point
4381 value `a' to the 64-bit two's complement integer format.  The conversion
4382 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4383 Arithmetic---which means in particular that the conversion is rounded
4387 -------------------------------------------------------------------------------
4400     shiftCount = 0x402F - aExp;  in float128_to_int64()
4413         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );  in float128_to_int64()
4423 -------------------------------------------------------------------------------
4424 Returns the result of converting the quadruple-precision floating-point
4425 value `a' to the 64-bit two's complement integer format.  The conversion
4426 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4431 -------------------------------------------------------------------------------
4438     int64 z;  in float128_to_int64_round_to_zero()  local
4445     shiftCount = aExp - 0x402F;  in float128_to_int64_round_to_zero()
4449             if (    ( a.high == LIT64( 0xC03E000000000000 ) )  in float128_to_int64_round_to_zero()
4461         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );  in float128_to_int64_round_to_zero()
4473         z = aSig0>>( - shiftCount );  in float128_to_int64_round_to_zero()
4479     if ( aSign ) z = - z;  in float128_to_int64_round_to_zero()
4480     return z;  in float128_to_int64_round_to_zero()
4487  * just like above - but do not care for overflow of signed results
4494     uint64 z;  in float128_to_uint64_round_to_zero()  local
4501     shiftCount = aExp - 0x402F;  in float128_to_uint64_round_to_zero()
4505             if (    ( a.high == LIT64( 0xC03E000000000000 ) )  in float128_to_uint64_round_to_zero()
4514         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );  in float128_to_uint64_round_to_zero()
4526         z = aSig0>>( - shiftCount );  in float128_to_uint64_round_to_zero()
4531     if ( aSign ) z = - z;  in float128_to_uint64_round_to_zero()
4532     return z;  in float128_to_uint64_round_to_zero()
4538 -------------------------------------------------------------------------------
4539 Returns the result of converting the quadruple-precision floating-point
4540 value `a' to the single-precision floating-point format.  The conversion
4541 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4543 -------------------------------------------------------------------------------
4567         aExp -= 0x3F81;  in float128_to_float32()
4574 -------------------------------------------------------------------------------
4575 Returns the result of converting the quadruple-precision floating-point
4576 value `a' to the double-precision floating-point format.  The conversion
4577 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4579 -------------------------------------------------------------------------------
4601         aExp -= 0x3C01;  in float128_to_float64()
4610 -------------------------------------------------------------------------------
4611 Returns the result of converting the quadruple-precision floating-point
4612 value `a' to the extended double-precision floating-point format.  The
4614 Floating-Point Arithmetic.
4615 -------------------------------------------------------------------------------
4648 -------------------------------------------------------------------------------
4649 Rounds the quadruple-precision floating-point value `a' to an integer, and
4650 returns the result as a quadruple-precision floating-point value.  The
4652 Floating-Point Arithmetic.
4653 -------------------------------------------------------------------------------
4661     float128 z;  in float128_round_to_int()  local
4674         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;  in float128_round_to_int()
4675         roundBitsMask = lastBitMask - 1;  in float128_round_to_int()
4676         z = a;  in float128_round_to_int()
4680                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );  in float128_round_to_int()
4681                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;  in float128_round_to_int()
4684                 if ( (sbits64) z.low < 0 ) {  in float128_round_to_int()
4685                     ++z.high;  in float128_round_to_int()
4686                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;  in float128_round_to_int()
4691             if (   extractFloat128Sign( z )  in float128_round_to_int()
4693                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );  in float128_round_to_int()
4696         z.low &= ~ roundBitsMask;  in float128_round_to_int()
4700             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;  in float128_round_to_int()
4726         lastBitMask <<= 0x402F - aExp;  in float128_round_to_int()
4727         roundBitsMask = lastBitMask - 1;  in float128_round_to_int()
4728         z.low = 0;  in float128_round_to_int()
4729         z.high = a.high;  in float128_round_to_int()
4732             z.high += lastBitMask>>1;  in float128_round_to_int()
4733             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {  in float128_round_to_int()
4734                 z.high &= ~ lastBitMask;  in float128_round_to_int()
4738             if (   extractFloat128Sign( z )  in float128_round_to_int()
4740                 z.high |= ( a.low != 0 );  in float128_round_to_int()
4741                 z.high += roundBitsMask;  in float128_round_to_int()
4744         z.high &= ~ roundBitsMask;  in float128_round_to_int()
4746     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {  in float128_round_to_int()
4749     return z;  in float128_round_to_int()
4754 -------------------------------------------------------------------------------
4755 Returns the result of adding the absolute values of the quadruple-precision
4756 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4759 Floating-Point Arithmetic.
4760 -------------------------------------------------------------------------------
4774     expDiff = aExp - bExp;  in addFloat128Sigs()
4781             --expDiff;  in addFloat128Sigs()
4802             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );  in addFloat128Sigs()
4821     --zExp;  in addFloat128Sigs()
4833 -------------------------------------------------------------------------------
4834 Returns the result of subtracting the absolute values of the quadruple-
4835 precision floating-point values `a' and `b'.  If `zSign' is 1, the
4838 Standard for Binary Floating-Point Arithmetic.
4839 -------------------------------------------------------------------------------
4846     float128 z;  in subFloat128Sigs()  local
4854     expDiff = aExp - bExp;  in subFloat128Sigs()
4864         z.low = float128_default_nan_low;  in subFloat128Sigs()
4865         z.high = float128_default_nan_high;  in subFloat128Sigs()
4866         return z;  in subFloat128Sigs()
4888     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );  in subFloat128Sigs()
4901         --expDiff;  in subFloat128Sigs()
4912     --zExp;  in subFloat128Sigs()
4913     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );  in subFloat128Sigs()
4918 -------------------------------------------------------------------------------
4919 Returns the result of adding the quadruple-precision floating-point values
4921 for Binary Floating-Point Arithmetic.
4922 -------------------------------------------------------------------------------
4940 -------------------------------------------------------------------------------
4941 Returns the result of subtracting the quadruple-precision floating-point
4943 Standard for Binary Floating-Point Arithmetic.
4944 -------------------------------------------------------------------------------
4962 -------------------------------------------------------------------------------
4963 Returns the result of multiplying the quadruple-precision floating-point
4965 Standard for Binary Floating-Point Arithmetic.
4966 -------------------------------------------------------------------------------
4973     float128 z;  in float128_mul()  local
4997             z.low = float128_default_nan_low;  in float128_mul()
4998             z.high = float128_default_nan_high;  in float128_mul()
4999             return z;  in float128_mul()
5011     zExp = aExp + bExp - 0x4000;  in float128_mul()
5027 -------------------------------------------------------------------------------
5028 Returns the result of dividing the quadruple-precision floating-point value
5030 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5031 -------------------------------------------------------------------------------
5039     float128 z;  in float128_div()  local
5067                 z.low = float128_default_nan_low;  in float128_div()
5068                 z.high = float128_default_nan_high;  in float128_div()
5069                 return z;  in float128_div()
5080     zExp = aExp - bExp + 0x3FFD;  in float128_div()
5093         --zSig0;  in float128_div()
5101             --zSig1;  in float128_div()
5112 -------------------------------------------------------------------------------
5113 Returns the remainder of the quadruple-precision floating-point value `a'
5115 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5116 -------------------------------------------------------------------------------
5125     float128 z;  in float128_rem()  local
5150             z.low = float128_default_nan_low;  in float128_rem()
5151             z.high = float128_default_nan_high;  in float128_rem()
5152             return z;  in float128_rem()
5160     expDiff = aExp - bExp;  in float128_rem()
5161     if ( expDiff < -1 ) return a;  in float128_rem()
5165         15 - ( expDiff < 0 ),  in float128_rem()
5173     expDiff -= 64;  in float128_rem()
5176         q = ( 4 < q ) ? q - 4 : 0;  in float128_rem()
5181         expDiff -= 61;  in float128_rem()
5183     if ( -64 < expDiff ) {  in float128_rem()
5185         q = ( 4 < q ) ? q - 4 : 0;  in float128_rem()
5186         q >>= - expDiff;  in float128_rem()
5190             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );  in float128_rem()
5218         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );  in float128_rem()
5223 -------------------------------------------------------------------------------
5224 Returns the square root of the quadruple-precision floating-point value `a'.
5226 Floating-Point Arithmetic.
5227 -------------------------------------------------------------------------------
5235     float128 z;  in float128_sqrt()  local
5250         z.low = float128_default_nan_low;  in float128_sqrt()
5251         z.high = float128_default_nan_high;  in float128_sqrt()
5252         return z;  in float128_sqrt()
5258     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;  in float128_sqrt()
5261     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );  in float128_sqrt()
5267         --zSig0;  in float128_sqrt()
5268         doubleZSig0 -= 2;  in float128_sqrt()
5279             --zSig1;  in float128_sqrt()
5293 -------------------------------------------------------------------------------
5294 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5296 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5297 -------------------------------------------------------------------------------
5315         && (    ( a.high == b.high )  in float128_eq()
5317                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )  in float128_eq()
5323 -------------------------------------------------------------------------------
5324 Returns 1 if the quadruple-precision floating-point value `a' is less than
5326 is performed according to the IEC/IEEE Standard for Binary Floating-Point
5328 -------------------------------------------------------------------------------
5347             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )  in float128_le()
5351           aSign ? le128( b.high, b.low, a.high, a.low )  in float128_le()
5352         : le128( a.high, a.low, b.high, b.low );  in float128_le()
5357 -------------------------------------------------------------------------------
5358 Returns 1 if the quadruple-precision floating-point value `a' is less than
5360 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5361 -------------------------------------------------------------------------------
5380             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )  in float128_lt()
5384           aSign ? lt128( b.high, b.low, a.high, a.low )  in float128_lt()
5385         : lt128( a.high, a.low, b.high, b.low );  in float128_lt()
5390 -------------------------------------------------------------------------------
5391 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5394 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5395 -------------------------------------------------------------------------------
5410         && (    ( a.high == b.high )  in float128_eq_signaling()
5412                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )  in float128_eq_signaling()
5418 -------------------------------------------------------------------------------
5419 Returns 1 if the quadruple-precision floating-point value `a' is less than
5422 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5423 -------------------------------------------------------------------------------
5445             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )  in float128_le_quiet()
5449           aSign ? le128( b.high, b.low, a.high, a.low )  in float128_le_quiet()
5450         : le128( a.high, a.low, b.high, b.low );  in float128_le_quiet()
5455 -------------------------------------------------------------------------------
5456 Returns 1 if the quadruple-precision floating-point value `a' is less than
5459 Standard for Binary Floating-Point Arithmetic.
5460 -------------------------------------------------------------------------------
5482             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )  in float128_lt_quiet()
5486           aSign ? lt128( b.high, b.low, a.high, a.low )  in float128_lt_quiet()
5487         : lt128( a.high, a.low, b.high, b.low );  in float128_lt_quiet()
5504  * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5508 -------------------------------------------------------------------------------
5509 Returns the result of converting the double-precision floating-point value
5510 `a' to the 32-bit unsigned integer format.  The conversion is
5511 performed according to the IEC/IEEE Standard for Binary Floating-point
5515 -------------------------------------------------------------------------------
5522     uint32 z;  in float64_to_uint32_round_to_zero()  local
5542     shiftCount = 0x433 - aExp;  in float64_to_uint32_round_to_zero()
5545     z = aSig;  in float64_to_uint32_round_to_zero()
5549     return z;  in float64_to_uint32_round_to_zero()
5554 -------------------------------------------------------------------------------
5555 Returns the result of converting the single-precision floating-point value
5556 `a' to the 32-bit unsigned integer format.  The conversion is
5557 performed according to the IEC/IEEE Standard for Binary Floating-point
5561 -------------------------------------------------------------------------------
5568     uint32 z;  in float32_to_uint32_round_to_zero()  local
5573     shiftCount = aExp - 0x9E;  in float32_to_uint32_round_to_zero()
5588     z = aSig>>( - shiftCount );  in float32_to_uint32_round_to_zero()
5592     return z;  in float32_to_uint32_round_to_zero()