Lines Matching +full:high +full:- +full:z
4 * This version hacked for use with gcc -msoft-float by bjh21.
12 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
13 * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
48 #include "softfloat-for-gcc.h"
66 -------------------------------------------------------------------------------
67 Floating-point rounding mode, extended double-precision rounding precision,
69 -------------------------------------------------------------------------------
78 -------------------------------------------------------------------------------
79 Primitive arithmetic functions, including multi-word arithmetic, and
82 -------------------------------------------------------------------------------
84 #include "softfloat-macros"
87 -------------------------------------------------------------------------------
92 are propagated from function inputs to output. These details are target-
94 -------------------------------------------------------------------------------
96 #include "softfloat-specialize"
100 -------------------------------------------------------------------------------
101 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
102 and 7, and returns the properly rounded 32-bit integer corresponding to the
104 integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
106 input cannot be represented exactly as an integer. However, if the fixed-
109 -------------------------------------------------------------------------------
116 int32 z; in roundAndPackInt32() local
138 z = absZ; in roundAndPackInt32()
139 if ( zSign ) z = - z; in roundAndPackInt32()
140 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { in roundAndPackInt32()
145 return z; in roundAndPackInt32()
150 -------------------------------------------------------------------------------
151 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
153 and returns the properly rounded 64-bit integer corresponding to the input.
155 Ordinarily, the fixed-point input is simply rounded to an integer, with
157 an integer. However, if the fixed-point input is too large, the invalid
160 -------------------------------------------------------------------------------
166 int64 z; in roundAndPackInt64() local
189 z = absZ0; in roundAndPackInt64()
190 if ( zSign ) z = - z; in roundAndPackInt64()
191 if ( z && ( ( z < 0 ) ^ zSign ) ) { in roundAndPackInt64()
199 return z; in roundAndPackInt64()
205 -------------------------------------------------------------------------------
206 Returns the fraction bits of the single-precision floating-point value `a'.
207 -------------------------------------------------------------------------------
217 -------------------------------------------------------------------------------
218 Returns the exponent bits of the single-precision floating-point value `a'.
219 -------------------------------------------------------------------------------
229 -------------------------------------------------------------------------------
230 Returns the sign bit of the single-precision floating-point value `a'.
231 -------------------------------------------------------------------------------
241 -------------------------------------------------------------------------------
242 Normalizes the subnormal single-precision floating-point value represented
246 -------------------------------------------------------------------------------
253 shiftCount = countLeadingZeros32( aSig ) - 8; in normalizeFloat32Subnormal()
255 *zExpPtr = 1 - shiftCount; in normalizeFloat32Subnormal()
260 -------------------------------------------------------------------------------
262 single-precision floating-point value, returning the result. After being
269 -------------------------------------------------------------------------------
279 -------------------------------------------------------------------------------
280 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
281 and significand `zSig', and returns the proper single-precision floating-
283 value is simply rounded and packed into the single-precision format, with
289 the abstract input cannot be represented exactly as a subnormal single-
290 precision floating-point number.
296 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
298 Binary Floating-Point Arithmetic.
299 -------------------------------------------------------------------------------
332 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 ); in roundAndPackFloat32()
337 || ( zExp < -1 ) in roundAndPackFloat32()
339 shift32RightJamming( zSig, - zExp, &zSig ); in roundAndPackFloat32()
354 -------------------------------------------------------------------------------
355 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
356 and significand `zSig', and returns the proper single-precision floating-
360 floating-point exponent.
361 -------------------------------------------------------------------------------
368 shiftCount = countLeadingZeros32( zSig ) - 1; in normalizeRoundAndPackFloat32()
369 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount ); in normalizeRoundAndPackFloat32()
374 -------------------------------------------------------------------------------
375 Returns the fraction bits of the double-precision floating-point value `a'.
376 -------------------------------------------------------------------------------
386 -------------------------------------------------------------------------------
387 Returns the exponent bits of the double-precision floating-point value `a'.
388 -------------------------------------------------------------------------------
398 -------------------------------------------------------------------------------
399 Returns the sign bit of the double-precision floating-point value `a'.
400 -------------------------------------------------------------------------------
410 -------------------------------------------------------------------------------
411 Normalizes the subnormal double-precision floating-point value represented
415 -------------------------------------------------------------------------------
422 shiftCount = countLeadingZeros64( aSig ) - 11; in normalizeFloat64Subnormal()
424 *zExpPtr = 1 - shiftCount; in normalizeFloat64Subnormal()
429 -------------------------------------------------------------------------------
431 double-precision floating-point value, returning the result. After being
438 -------------------------------------------------------------------------------
449 -------------------------------------------------------------------------------
450 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
451 and significand `zSig', and returns the proper double-precision floating-
453 value is simply rounded and packed into the double-precision format, with
459 the abstract input cannot be represented exactly as a subnormal double-
460 precision floating-point number.
466 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
468 Binary Floating-Point Arithmetic.
469 -------------------------------------------------------------------------------
503 FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) - in roundAndPackFloat64()
509 || ( zExp < -1 ) in roundAndPackFloat64()
511 shift64RightJamming( zSig, - zExp, &zSig ); in roundAndPackFloat64()
526 -------------------------------------------------------------------------------
527 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
528 and significand `zSig', and returns the proper double-precision floating-
532 floating-point exponent.
533 -------------------------------------------------------------------------------
540 shiftCount = countLeadingZeros64( zSig ) - 1; in normalizeRoundAndPackFloat64()
541 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount ); in normalizeRoundAndPackFloat64()
548 -------------------------------------------------------------------------------
549 Returns the fraction bits of the extended double-precision floating-point
551 -------------------------------------------------------------------------------
561 -------------------------------------------------------------------------------
562 Returns the exponent bits of the extended double-precision floating-point
564 -------------------------------------------------------------------------------
569 return a.high & 0x7FFF; in extractFloatx80Exp()
574 -------------------------------------------------------------------------------
575 Returns the sign bit of the extended double-precision floating-point value
577 -------------------------------------------------------------------------------
582 return a.high>>15; in extractFloatx80Sign()
587 -------------------------------------------------------------------------------
588 Normalizes the subnormal extended double-precision floating-point value
592 -------------------------------------------------------------------------------
601 *zExpPtr = 1 - shiftCount; in normalizeFloatx80Subnormal()
606 -------------------------------------------------------------------------------
608 extended double-precision floating-point value, returning the result.
609 -------------------------------------------------------------------------------
613 floatx80 z; in packFloatx80() local
615 z.low = zSig; in packFloatx80()
616 z.high = ( ( (bits16) zSign )<<15 ) + zExp; in packFloatx80()
617 return z; in packFloatx80()
622 -------------------------------------------------------------------------------
623 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
625 and returns the proper extended double-precision floating-point value
627 rounded and packed into the extended double-precision format, with the
634 double-precision floating-point number.
637 result is rounded to the full precision of the extended double-precision
643 Floating-Point Arithmetic.
644 -------------------------------------------------------------------------------
685 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { in roundAndPackFloatx80()
696 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); in roundAndPackFloatx80()
739 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { in roundAndPackFloatx80()
763 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); in roundAndPackFloatx80()
806 -------------------------------------------------------------------------------
807 Takes an abstract floating-point value having sign `zSign', exponent
809 and returns the proper extended double-precision floating-point value
813 -------------------------------------------------------------------------------
825 zExp -= 64; in normalizeRoundAndPackFloatx80()
829 zExp -= shiftCount; in normalizeRoundAndPackFloatx80()
840 -------------------------------------------------------------------------------
841 Returns the least-significant 64 fraction bits of the quadruple-precision
842 floating-point value `a'.
843 -------------------------------------------------------------------------------
853 -------------------------------------------------------------------------------
854 Returns the most-significant 48 fraction bits of the quadruple-precision
855 floating-point value `a'.
856 -------------------------------------------------------------------------------
861 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); in extractFloat128Frac0()
866 -------------------------------------------------------------------------------
867 Returns the exponent bits of the quadruple-precision floating-point value
869 -------------------------------------------------------------------------------
874 return ( a.high>>48 ) & 0x7FFF; in extractFloat128Exp()
879 -------------------------------------------------------------------------------
880 Returns the sign bit of the quadruple-precision floating-point value `a'.
881 -------------------------------------------------------------------------------
886 return a.high>>63; in extractFloat128Sign()
891 -------------------------------------------------------------------------------
892 Normalizes the subnormal quadruple-precision floating-point value
899 -------------------------------------------------------------------------------
913 shiftCount = countLeadingZeros64( aSig1 ) - 15; in normalizeFloat128Subnormal()
915 *zSig0Ptr = aSig1>>( - shiftCount ); in normalizeFloat128Subnormal()
922 *zExpPtr = - shiftCount - 63; in normalizeFloat128Subnormal()
925 shiftCount = countLeadingZeros64( aSig0 ) - 15; in normalizeFloat128Subnormal()
927 *zExpPtr = 1 - shiftCount; in normalizeFloat128Subnormal()
933 -------------------------------------------------------------------------------
935 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
936 floating-point value, returning the result. After being shifted into the
944 -------------------------------------------------------------------------------
949 float128 z; in packFloat128() local
951 z.low = zSig1; in packFloat128()
952 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0; in packFloat128()
953 return z; in packFloat128()
958 -------------------------------------------------------------------------------
959 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
961 and `zSig2', and returns the proper quadruple-precision floating-point value
963 simply rounded and packed into the quadruple-precision format, with the
969 the abstract input cannot be represented exactly as a subnormal quadruple-
970 precision floating-point number.
975 than the ``true'' floating-point exponent. The handling of underflow and
976 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
977 -------------------------------------------------------------------------------
1032 || ( zExp < -1 ) in roundAndPackFloat128()
1041 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); in roundAndPackFloat128()
1070 -------------------------------------------------------------------------------
1071 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1073 returns the proper quadruple-precision floating-point value corresponding
1076 normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1078 -------------------------------------------------------------------------------
1090 zExp -= 64; in normalizeRoundAndPackFloat128()
1092 shiftCount = countLeadingZeros64( zSig0 ) - 15; in normalizeRoundAndPackFloat128()
1099 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); in normalizeRoundAndPackFloat128()
1101 zExp -= shiftCount; in normalizeRoundAndPackFloat128()
1109 -------------------------------------------------------------------------------
1110 Returns the result of converting the 32-bit two's complement integer `a'
1111 to the single-precision floating-point format. The conversion is performed
1112 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1113 -------------------------------------------------------------------------------
1122 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a ); in int32_to_float32()
1138 -------------------------------------------------------------------------------
1139 Returns the result of converting the 32-bit two's complement integer `a'
1140 to the double-precision floating-point format. The conversion is performed
1141 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1142 -------------------------------------------------------------------------------
1153 absA = zSign ? - a : a; in int32_to_float64()
1156 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount ); in int32_to_float64()
1168 return packFloat64( 0, 0x432 - shiftCount, zSig<<shiftCount ); in uint32_to_float64()
1176 -------------------------------------------------------------------------------
1177 Returns the result of converting the 32-bit two's complement integer `a'
1178 to the extended double-precision floating-point format. The conversion
1179 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1181 -------------------------------------------------------------------------------
1192 absA = zSign ? - a : a; in int32_to_floatx80()
1195 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); in int32_to_floatx80()
1206 return packFloatx80( 0, 0x403E - shiftCount, zSig<<shiftCount ); in uint32_to_floatx80()
1215 -------------------------------------------------------------------------------
1216 Returns the result of converting the 32-bit two's complement integer `a' to
1217 the quadruple-precision floating-point format. The conversion is performed
1218 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1219 -------------------------------------------------------------------------------
1230 absA = zSign ? - a : a; in int32_to_float128()
1233 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); in int32_to_float128()
1244 return packFloat128( 0, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); in uint32_to_float128()
1252 -------------------------------------------------------------------------------
1253 Returns the result of converting the 64-bit two's complement integer `a'
1254 to the single-precision floating-point format. The conversion is performed
1255 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1256 -------------------------------------------------------------------------------
1266 absA = zSign ? - a : a; in int64_to_float32()
1267 shiftCount = countLeadingZeros64( absA ) - 40; in int64_to_float32()
1269 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount ); in int64_to_float32()
1274 shift64RightJamming( absA, - shiftCount, &absA ); in int64_to_float32()
1279 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA ); in int64_to_float32()
1285 -------------------------------------------------------------------------------
1286 Returns the result of converting the 64-bit two's complement integer `a'
1287 to the double-precision floating-point format. The conversion is performed
1288 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1289 -------------------------------------------------------------------------------
1300 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a ); in int64_to_float64()
1307 -------------------------------------------------------------------------------
1308 Returns the result of converting the 64-bit two's complement integer `a'
1309 to the extended double-precision floating-point format. The conversion
1310 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1312 -------------------------------------------------------------------------------
1322 absA = zSign ? - a : a; in int64_to_floatx80()
1324 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); in int64_to_floatx80()
1335 -------------------------------------------------------------------------------
1336 Returns the result of converting the 64-bit two's complement integer `a' to
1337 the quadruple-precision floating-point format. The conversion is performed
1338 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1339 -------------------------------------------------------------------------------
1351 absA = zSign ? - a : a; in int64_to_float128()
1353 zExp = 0x406E - shiftCount; in int64_to_float128()
1357 shiftCount -= 64; in int64_to_float128()
1372 -------------------------------------------------------------------------------
1373 Returns the result of converting the single-precision floating-point value
1374 `a' to the 32-bit two's complement integer format. The conversion is
1375 performed according to the IEC/IEEE Standard for Binary Floating-Point
1376 Arithmetic---which means in particular that the conversion is rounded
1380 -------------------------------------------------------------------------------
1394 shiftCount = 0xAF - aExp; in float32_to_int32()
1404 -------------------------------------------------------------------------------
1405 Returns the result of converting the single-precision floating-point value
1406 `a' to the 32-bit two's complement integer format. The conversion is
1407 performed according to the IEC/IEEE Standard for Binary Floating-Point
1412 -------------------------------------------------------------------------------
1419 int32 z; in float32_to_int32_round_to_zero() local
1424 shiftCount = aExp - 0x9E; in float32_to_int32_round_to_zero()
1437 z = aSig>>( - shiftCount ); in float32_to_int32_round_to_zero()
1441 if ( aSign ) z = - z; in float32_to_int32_round_to_zero()
1442 return z; in float32_to_int32_round_to_zero()
1448 -------------------------------------------------------------------------------
1449 Returns the result of converting the single-precision floating-point value
1450 `a' to the 64-bit two's complement integer format. The conversion is
1451 performed according to the IEC/IEEE Standard for Binary Floating-Point
1452 Arithmetic---which means in particular that the conversion is rounded
1456 -------------------------------------------------------------------------------
1468 shiftCount = 0xBE - aExp; in float32_to_int64()
1485 -------------------------------------------------------------------------------
1486 Returns the result of converting the single-precision floating-point value
1487 `a' to the 64-bit two's complement integer format. The conversion is
1488 performed according to the IEC/IEEE Standard for Binary Floating-Point
1493 -------------------------------------------------------------------------------
1501 int64 z; in float32_to_int64_round_to_zero() local
1506 shiftCount = aExp - 0xBE; in float32_to_int64_round_to_zero()
1522 z = aSig64>>( - shiftCount ); in float32_to_int64_round_to_zero()
1526 if ( aSign ) z = - z; in float32_to_int64_round_to_zero()
1527 return z; in float32_to_int64_round_to_zero()
1533 -------------------------------------------------------------------------------
1534 Returns the result of converting the single-precision floating-point value
1535 `a' to the double-precision floating-point format. The conversion is
1536 performed according to the IEC/IEEE Standard for Binary Floating-Point
1538 -------------------------------------------------------------------------------
1556 --aExp; in float32_to_float64()
1565 -------------------------------------------------------------------------------
1566 Returns the result of converting the single-precision floating-point value
1567 `a' to the extended double-precision floating-point format. The conversion
1568 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1570 -------------------------------------------------------------------------------
1599 -------------------------------------------------------------------------------
1600 Returns the result of converting the single-precision floating-point value
1601 `a' to the quadruple-precision floating-point format. The conversion is
1602 performed according to the IEC/IEEE Standard for Binary Floating-Point
1604 -------------------------------------------------------------------------------
1622 --aExp; in float32_to_float128()
1632 -------------------------------------------------------------------------------
1633 Rounds the single-precision floating-point value `a' to an integer, and
1634 returns the result as a single-precision floating-point value. The
1636 Floating-Point Arithmetic.
1637 -------------------------------------------------------------------------------
1645 float32 z; in float32_round_to_int() local
1674 lastBitMask <<= 0x96 - aExp; in float32_round_to_int()
1675 roundBitsMask = lastBitMask - 1; in float32_round_to_int()
1676 z = a; in float32_round_to_int()
1679 z += lastBitMask>>1; in float32_round_to_int()
1680 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; in float32_round_to_int()
1683 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) { in float32_round_to_int()
1684 z += roundBitsMask; in float32_round_to_int()
1687 z &= ~ roundBitsMask; in float32_round_to_int()
1688 if ( z != a ) float_exception_flags |= float_flag_inexact; in float32_round_to_int()
1689 return z; in float32_round_to_int()
1695 -------------------------------------------------------------------------------
1696 Returns the result of adding the absolute values of the single-precision
1697 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1700 Floating-Point Arithmetic.
1701 -------------------------------------------------------------------------------
1713 expDiff = aExp - bExp; in addFloat32Sigs()
1722 --expDiff; in addFloat32Sigs()
1741 shift32RightJamming( aSig, - expDiff, &aSig ); in addFloat32Sigs()
1756 --zExp; in addFloat32Sigs()
1767 -------------------------------------------------------------------------------
1768 Returns the result of subtracting the absolute values of the single-
1769 precision floating-point values `a' and `b'. If `zSign' is 1, the
1772 Standard for Binary Floating-Point Arithmetic.
1773 -------------------------------------------------------------------------------
1785 expDiff = aExp - bExp; in subFloat32Sigs()
1813 shift32RightJamming( aSig, - expDiff, &aSig ); in subFloat32Sigs()
1816 zSig = bSig - aSig; in subFloat32Sigs()
1826 --expDiff; in subFloat32Sigs()
1834 zSig = aSig - bSig; in subFloat32Sigs()
1837 --zExp; in subFloat32Sigs()
1843 -------------------------------------------------------------------------------
1844 Returns the result of adding the single-precision floating-point values `a'
1846 Binary Floating-Point Arithmetic.
1847 -------------------------------------------------------------------------------
1865 -------------------------------------------------------------------------------
1866 Returns the result of subtracting the single-precision floating-point values
1868 for Binary Floating-Point Arithmetic.
1869 -------------------------------------------------------------------------------
1887 -------------------------------------------------------------------------------
1888 Returns the result of multiplying the single-precision floating-point values
1890 for Binary Floating-Point Arithmetic.
1891 -------------------------------------------------------------------------------
1934 zExp = aExp + bExp - 0x7F; in float32_mul()
1941 --zExp; in float32_mul()
1948 -------------------------------------------------------------------------------
1949 Returns the result of dividing the single-precision floating-point value `a'
1951 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1952 -------------------------------------------------------------------------------
1995 zExp = aExp - bExp + 0x7D; in float32_div()
2012 -------------------------------------------------------------------------------
2013 Returns the remainder of the single-precision floating-point value `a'
2015 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2016 -------------------------------------------------------------------------------
2056 expDiff = aExp - bExp; in float32_rem()
2063 if ( expDiff < -1 ) return a; in float32_rem()
2067 if ( q ) aSig -= bSig; in float32_rem()
2070 q >>= 32 - expDiff; in float32_rem()
2072 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; in float32_rem()
2080 if ( bSig <= aSig ) aSig -= bSig; in float32_rem()
2083 expDiff -= 64; in float32_rem()
2086 q64 = ( 2 < q64 ) ? q64 - 2 : 0; in float32_rem()
2087 aSig64 = - ( ( bSig * q64 )<<38 ); in float32_rem()
2088 expDiff -= 62; in float32_rem()
2092 q64 = ( 2 < q64 ) ? q64 - 2 : 0; in float32_rem()
2093 q = q64>>( 64 - expDiff ); in float32_rem()
2095 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; in float32_rem()
2100 aSig -= bSig; in float32_rem()
2107 if ( zSign ) aSig = - aSig; in float32_rem()
2115 -------------------------------------------------------------------------------
2116 Returns the square root of the single-precision floating-point value `a'.
2118 Floating-Point Arithmetic.
2119 -------------------------------------------------------------------------------
2146 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E; in float32_sqrt()
2156 rem = ( ( (bits64) aSig )<<32 ) - term; in float32_sqrt()
2158 --zSig; in float32_sqrt()
2171 -------------------------------------------------------------------------------
2172 Returns 1 if the single-precision floating-point value `a' is equal to
2174 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2175 -------------------------------------------------------------------------------
2193 -------------------------------------------------------------------------------
2194 Returns 1 if the single-precision floating-point value `a' is less than
2196 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2198 -------------------------------------------------------------------------------
2218 -------------------------------------------------------------------------------
2219 Returns 1 if the single-precision floating-point value `a' is less than
2221 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2222 -------------------------------------------------------------------------------
2243 -------------------------------------------------------------------------------
2244 Returns 1 if the single-precision floating-point value `a' is equal to
2247 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2248 -------------------------------------------------------------------------------
2264 -------------------------------------------------------------------------------
2265 Returns 1 if the single-precision floating-point value `a' is less than or
2268 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2269 -------------------------------------------------------------------------------
2291 -------------------------------------------------------------------------------
2292 Returns 1 if the single-precision floating-point value `a' is less than
2295 Standard for Binary Floating-Point Arithmetic.
2296 -------------------------------------------------------------------------------
2320 -------------------------------------------------------------------------------
2321 Returns the result of converting the double-precision floating-point value
2322 `a' to the 32-bit two's complement integer format. The conversion is
2323 performed according to the IEC/IEEE Standard for Binary Floating-Point
2324 Arithmetic---which means in particular that the conversion is rounded
2328 -------------------------------------------------------------------------------
2341 shiftCount = 0x42C - aExp; in float64_to_int32()
2349 -------------------------------------------------------------------------------
2350 Returns the result of converting the double-precision floating-point value
2351 `a' to the 32-bit two's complement integer format. The conversion is
2352 performed according to the IEC/IEEE Standard for Binary Floating-Point
2357 -------------------------------------------------------------------------------
2364 int32 z; in float64_to_int32_round_to_zero() local
2378 shiftCount = 0x433 - aExp; in float64_to_int32_round_to_zero()
2381 z = aSig; in float64_to_int32_round_to_zero()
2382 if ( aSign ) z = - z; in float64_to_int32_round_to_zero()
2383 if ( ( z < 0 ) ^ aSign ) { in float64_to_int32_round_to_zero()
2391 return z; in float64_to_int32_round_to_zero()
2397 -------------------------------------------------------------------------------
2398 Returns the result of converting the double-precision floating-point value
2399 `a' to the 64-bit two's complement integer format. The conversion is
2400 performed according to the IEC/IEEE Standard for Binary Floating-Point
2401 Arithmetic---which means in particular that the conversion is rounded
2405 -------------------------------------------------------------------------------
2417 shiftCount = 0x433 - aExp; in float64_to_int64()
2430 aSig <<= - shiftCount; in float64_to_int64()
2440 -------------------------------------------------------------------------------
2441 Returns the result of converting the double-precision floating-point value
2442 `a' to the 64-bit two's complement integer format. The conversion is
2443 performed according to the IEC/IEEE Standard for Binary Floating-Point
2448 -------------------------------------------------------------------------------
2455 int64 z; in float64_to_int64_round_to_zero() local
2461 shiftCount = aExp - 0x433; in float64_to_int64_round_to_zero()
2475 z = aSig<<shiftCount; in float64_to_int64_round_to_zero()
2482 z = aSig>>( - shiftCount ); in float64_to_int64_round_to_zero()
2487 if ( aSign ) z = - z; in float64_to_int64_round_to_zero()
2488 return z; in float64_to_int64_round_to_zero()
2494 -------------------------------------------------------------------------------
2495 Returns the result of converting the double-precision floating-point value
2496 `a' to the single-precision floating-point format. The conversion is
2497 performed according to the IEC/IEEE Standard for Binary Floating-Point
2499 -------------------------------------------------------------------------------
2519 aExp -= 0x381; in float64_to_float32()
2528 -------------------------------------------------------------------------------
2529 Returns the result of converting the double-precision floating-point value
2530 `a' to the extended double-precision floating-point format. The conversion
2531 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2533 -------------------------------------------------------------------------------
2563 -------------------------------------------------------------------------------
2564 Returns the result of converting the double-precision floating-point value
2565 `a' to the quadruple-precision floating-point format. The conversion is
2566 performed according to the IEC/IEEE Standard for Binary Floating-Point
2568 -------------------------------------------------------------------------------
2586 --aExp; in float64_to_float128()
2597 -------------------------------------------------------------------------------
2598 Rounds the double-precision floating-point value `a' to an integer, and
2599 returns the result as a double-precision floating-point value. The
2601 Floating-Point Arithmetic.
2602 -------------------------------------------------------------------------------
2610 float64 z; in float64_round_to_int() local
2640 lastBitMask <<= 0x433 - aExp; in float64_round_to_int()
2641 roundBitsMask = lastBitMask - 1; in float64_round_to_int()
2642 z = a; in float64_round_to_int()
2645 z += lastBitMask>>1; in float64_round_to_int()
2646 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; in float64_round_to_int()
2649 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) { in float64_round_to_int()
2650 z += roundBitsMask; in float64_round_to_int()
2653 z &= ~ roundBitsMask; in float64_round_to_int()
2654 if ( z != a ) float_exception_flags |= float_flag_inexact; in float64_round_to_int()
2655 return z; in float64_round_to_int()
2661 -------------------------------------------------------------------------------
2662 Returns the result of adding the absolute values of the double-precision
2663 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2666 Floating-Point Arithmetic.
2667 -------------------------------------------------------------------------------
2679 expDiff = aExp - bExp; in addFloat64Sigs()
2688 --expDiff; in addFloat64Sigs()
2707 shift64RightJamming( aSig, - expDiff, &aSig ); in addFloat64Sigs()
2722 --zExp; in addFloat64Sigs()
2733 -------------------------------------------------------------------------------
2734 Returns the result of subtracting the absolute values of the double-
2735 precision floating-point values `a' and `b'. If `zSign' is 1, the
2738 Standard for Binary Floating-Point Arithmetic.
2739 -------------------------------------------------------------------------------
2751 expDiff = aExp - bExp; in subFloat64Sigs()
2779 shift64RightJamming( aSig, - expDiff, &aSig ); in subFloat64Sigs()
2782 zSig = bSig - aSig; in subFloat64Sigs()
2792 --expDiff; in subFloat64Sigs()
2800 zSig = aSig - bSig; in subFloat64Sigs()
2803 --zExp; in subFloat64Sigs()
2809 -------------------------------------------------------------------------------
2810 Returns the result of adding the double-precision floating-point values `a'
2812 Binary Floating-Point Arithmetic.
2813 -------------------------------------------------------------------------------
2831 -------------------------------------------------------------------------------
2832 Returns the result of subtracting the double-precision floating-point values
2834 for Binary Floating-Point Arithmetic.
2835 -------------------------------------------------------------------------------
2853 -------------------------------------------------------------------------------
2854 Returns the result of multiplying the double-precision floating-point values
2856 for Binary Floating-Point Arithmetic.
2857 -------------------------------------------------------------------------------
2898 zExp = aExp + bExp - 0x3FF; in float64_mul()
2905 --zExp; in float64_mul()
2912 -------------------------------------------------------------------------------
2913 Returns the result of dividing the double-precision floating-point value `a'
2915 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2916 -------------------------------------------------------------------------------
2961 zExp = aExp - bExp + 0x3FD; in float64_div()
2973 --zSig; in float64_div()
2984 -------------------------------------------------------------------------------
2985 Returns the remainder of the double-precision floating-point value `a'
2987 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2988 -------------------------------------------------------------------------------
3026 expDiff = aExp - bExp; in float64_rem()
3030 if ( expDiff < -1 ) return a; in float64_rem()
3034 if ( q ) aSig -= bSig; in float64_rem()
3035 expDiff -= 64; in float64_rem()
3038 q = ( 2 < q ) ? q - 2 : 0; in float64_rem()
3039 aSig = - ( ( bSig>>2 ) * q ); in float64_rem()
3040 expDiff -= 62; in float64_rem()
3045 q = ( 2 < q ) ? q - 2 : 0; in float64_rem()
3046 q >>= 64 - expDiff; in float64_rem()
3048 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; in float64_rem()
3057 aSig -= bSig; in float64_rem()
3064 if ( zSign ) aSig = - aSig; in float64_rem()
3070 -------------------------------------------------------------------------------
3071 Returns the square root of the double-precision floating-point value `a'.
3073 Floating-Point Arithmetic.
3074 -------------------------------------------------------------------------------
3101 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; in float64_sqrt()
3104 aSig <<= 9 - ( aExp & 1 ); in float64_sqrt()
3111 --zSig; in float64_sqrt()
3112 doubleZSig -= 2; in float64_sqrt()
3123 -------------------------------------------------------------------------------
3124 Returns 1 if the double-precision floating-point value `a' is equal to the
3126 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3127 -------------------------------------------------------------------------------
3146 -------------------------------------------------------------------------------
3147 Returns 1 if the double-precision floating-point value `a' is less than or
3149 performed according to the IEC/IEEE Standard for Binary Floating-Point
3151 -------------------------------------------------------------------------------
3175 -------------------------------------------------------------------------------
3176 Returns 1 if the double-precision floating-point value `a' is less than
3178 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3179 -------------------------------------------------------------------------------
3204 -------------------------------------------------------------------------------
3205 Returns 1 if the double-precision floating-point value `a' is equal to the
3208 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3209 -------------------------------------------------------------------------------
3225 -------------------------------------------------------------------------------
3226 Returns 1 if the double-precision floating-point value `a' is less than or
3229 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3230 -------------------------------------------------------------------------------
3252 -------------------------------------------------------------------------------
3253 Returns 1 if the double-precision floating-point value `a' is less than
3256 Standard for Binary Floating-Point Arithmetic.
3257 -------------------------------------------------------------------------------
3282 -------------------------------------------------------------------------------
3283 Returns the result of converting the extended double-precision floating-
3284 point value `a' to the 32-bit two's complement integer format. The
3286 Floating-Point Arithmetic---which means in particular that the conversion
3290 -------------------------------------------------------------------------------
3302 shiftCount = 0x4037 - aExp; in floatx80_to_int32()
3310 -------------------------------------------------------------------------------
3311 Returns the result of converting the extended double-precision floating-
3312 point value `a' to the 32-bit two's complement integer format. The
3314 Floating-Point Arithmetic, except that the conversion is always rounded
3318 -------------------------------------------------------------------------------
3325 int32 z; in floatx80_to_int32_round_to_zero() local
3338 shiftCount = 0x403E - aExp; in floatx80_to_int32_round_to_zero()
3341 z = aSig; in floatx80_to_int32_round_to_zero()
3342 if ( aSign ) z = - z; in floatx80_to_int32_round_to_zero()
3343 if ( ( z < 0 ) ^ aSign ) { in floatx80_to_int32_round_to_zero()
3351 return z; in floatx80_to_int32_round_to_zero()
3356 -------------------------------------------------------------------------------
3357 Returns the result of converting the extended double-precision floating-
3358 point value `a' to the 64-bit two's complement integer format. The
3360 Floating-Point Arithmetic---which means in particular that the conversion
3364 -------------------------------------------------------------------------------
3375 shiftCount = 0x403E - aExp; in floatx80_to_int64()
3397 -------------------------------------------------------------------------------
3398 Returns the result of converting the extended double-precision floating-
3399 point value `a' to the 64-bit two's complement integer format. The
3401 Floating-Point Arithmetic, except that the conversion is always rounded
3405 -------------------------------------------------------------------------------
3412 int64 z; in floatx80_to_int64_round_to_zero() local
3417 shiftCount = aExp - 0x403E; in floatx80_to_int64_round_to_zero()
3420 if ( ( a.high != 0xC03E ) || aSig ) { in floatx80_to_int64_round_to_zero()
3432 z = aSig>>( - shiftCount ); in floatx80_to_int64_round_to_zero()
3436 if ( aSign ) z = - z; in floatx80_to_int64_round_to_zero()
3437 return z; in floatx80_to_int64_round_to_zero()
3442 -------------------------------------------------------------------------------
3443 Returns the result of converting the extended double-precision floating-
3444 point value `a' to the single-precision floating-point format. The
3446 Floating-Point Arithmetic.
3447 -------------------------------------------------------------------------------
3465 if ( aExp || aSig ) aExp -= 0x3F81; in floatx80_to_float32()
3471 -------------------------------------------------------------------------------
3472 Returns the result of converting the extended double-precision floating-
3473 point value `a' to the double-precision floating-point format. The
3475 Floating-Point Arithmetic.
3476 -------------------------------------------------------------------------------
3494 if ( aExp || aSig ) aExp -= 0x3C01; in floatx80_to_float64()
3502 -------------------------------------------------------------------------------
3503 Returns the result of converting the extended double-precision floating-
3504 point value `a' to the quadruple-precision floating-point format. The
3506 Floating-Point Arithmetic.
3507 -------------------------------------------------------------------------------
3529 -------------------------------------------------------------------------------
3530 Rounds the extended double-precision floating-point value `a' to an integer,
3531 and returns the result as an extended double-precision floating-point
3533 Binary Floating-Point Arithmetic.
3534 -------------------------------------------------------------------------------
3542 floatx80 z; in floatx80_round_to_int() local
3581 lastBitMask <<= 0x403E - aExp; in floatx80_round_to_int()
3582 roundBitsMask = lastBitMask - 1; in floatx80_round_to_int()
3583 z = a; in floatx80_round_to_int()
3586 z.low += lastBitMask>>1; in floatx80_round_to_int()
3587 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; in floatx80_round_to_int()
3590 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) { in floatx80_round_to_int()
3591 z.low += roundBitsMask; in floatx80_round_to_int()
3594 z.low &= ~ roundBitsMask; in floatx80_round_to_int()
3595 if ( z.low == 0 ) { in floatx80_round_to_int()
3596 ++z.high; in floatx80_round_to_int()
3597 z.low = LIT64( 0x8000000000000000 ); in floatx80_round_to_int()
3599 if ( z.low != a.low ) float_exception_flags |= float_flag_inexact; in floatx80_round_to_int()
3600 return z; in floatx80_round_to_int()
3605 -------------------------------------------------------------------------------
3606 Returns the result of adding the absolute values of the extended double-
3607 precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3610 Floating-Point Arithmetic.
3611 -------------------------------------------------------------------------------
3623 expDiff = aExp - bExp; in addFloatx80Sigs()
3629 if ( bExp == 0 ) --expDiff; in addFloatx80Sigs()
3639 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); in addFloatx80Sigs()
3672 -------------------------------------------------------------------------------
3674 double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3677 Standard for Binary Floating-Point Arithmetic.
3678 -------------------------------------------------------------------------------
3685 floatx80 z; in subFloatx80Sigs() local
3691 expDiff = aExp - bExp; in subFloatx80Sigs()
3699 z.low = floatx80_default_nan_low; in subFloatx80Sigs()
3700 z.high = floatx80_default_nan_high; in subFloatx80Sigs()
3701 return z; in subFloatx80Sigs()
3717 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); in subFloatx80Sigs()
3728 if ( bExp == 0 ) --expDiff; in subFloatx80Sigs()
3741 -------------------------------------------------------------------------------
3742 Returns the result of adding the extended double-precision floating-point
3744 Standard for Binary Floating-Point Arithmetic.
3745 -------------------------------------------------------------------------------
3763 -------------------------------------------------------------------------------
3764 Returns the result of subtracting the extended double-precision floating-
3766 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3767 -------------------------------------------------------------------------------
3785 -------------------------------------------------------------------------------
3786 Returns the result of multiplying the extended double-precision floating-
3788 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3789 -------------------------------------------------------------------------------
3796 floatx80 z; in floatx80_mul() local
3818 z.low = floatx80_default_nan_low; in floatx80_mul()
3819 z.high = floatx80_default_nan_high; in floatx80_mul()
3820 return z; in floatx80_mul()
3832 zExp = aExp + bExp - 0x3FFE; in floatx80_mul()
3836 --zExp; in floatx80_mul()
3845 -------------------------------------------------------------------------------
3846 Returns the result of dividing the extended double-precision floating-point
3848 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3849 -------------------------------------------------------------------------------
3857 floatx80 z; in floatx80_div() local
3883 z.low = floatx80_default_nan_low; in floatx80_div()
3884 z.high = floatx80_default_nan_high; in floatx80_div()
3885 return z; in floatx80_div()
3896 zExp = aExp - bExp + 0x3FFE; in floatx80_div()
3906 --zSig0; in floatx80_div()
3914 --zSig1; in floatx80_div()
3926 -------------------------------------------------------------------------------
3927 Returns the remainder of the extended double-precision floating-point value
3929 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3930 -------------------------------------------------------------------------------
3938 floatx80 z; in floatx80_rem() local
3961 z.low = floatx80_default_nan_low; in floatx80_rem()
3962 z.high = floatx80_default_nan_high; in floatx80_rem()
3963 return z; in floatx80_rem()
3973 expDiff = aExp - bExp; in floatx80_rem()
3976 if ( expDiff < -1 ) return a; in floatx80_rem()
3981 if ( q ) aSig0 -= bSig; in floatx80_rem()
3982 expDiff -= 64; in floatx80_rem()
3985 q = ( 2 < q ) ? q - 2 : 0; in floatx80_rem()
3989 expDiff -= 62; in floatx80_rem()
3994 q = ( 2 < q ) ? q - 2 : 0; in floatx80_rem()
3995 q >>= 64 - expDiff; in floatx80_rem()
3996 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); in floatx80_rem()
3998 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); in floatx80_rem()
4024 -------------------------------------------------------------------------------
4025 Returns the square root of the extended double-precision floating-point
4027 for Binary Floating-Point Arithmetic.
4028 -------------------------------------------------------------------------------
4036 floatx80 z; in floatx80_sqrt() local
4050 z.low = floatx80_default_nan_low; in floatx80_sqrt()
4051 z.high = floatx80_default_nan_high; in floatx80_sqrt()
4052 return z; in floatx80_sqrt()
4058 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; in floatx80_sqrt()
4066 --zSig0; in floatx80_sqrt()
4067 doubleZSig0 -= 2; in floatx80_sqrt()
4078 --zSig1; in floatx80_sqrt()
4095 -------------------------------------------------------------------------------
4096 Returns 1 if the extended double-precision floating-point value `a' is
4098 performed according to the IEC/IEEE Standard for Binary Floating-Point
4100 -------------------------------------------------------------------------------
4118 && ( ( a.high == b.high ) in floatx80_eq()
4120 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) in floatx80_eq()
4126 -------------------------------------------------------------------------------
4127 Returns 1 if the extended double-precision floating-point value `a' is
4130 Floating-Point Arithmetic.
4131 -------------------------------------------------------------------------------
4150 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) in floatx80_le()
4154 aSign ? le128( b.high, b.low, a.high, a.low ) in floatx80_le()
4155 : le128( a.high, a.low, b.high, b.low ); in floatx80_le()
4160 -------------------------------------------------------------------------------
4161 Returns 1 if the extended double-precision floating-point value `a' is
4163 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4165 -------------------------------------------------------------------------------
4184 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) in floatx80_lt()
4188 aSign ? lt128( b.high, b.low, a.high, a.low ) in floatx80_lt()
4189 : lt128( a.high, a.low, b.high, b.low ); in floatx80_lt()
4194 -------------------------------------------------------------------------------
4195 Returns 1 if the extended double-precision floating-point value `a' is equal
4198 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4199 -------------------------------------------------------------------------------
4214 && ( ( a.high == b.high ) in floatx80_eq_signaling()
4216 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) in floatx80_eq_signaling()
4222 -------------------------------------------------------------------------------
4223 Returns 1 if the extended double-precision floating-point value `a' is less
4226 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4227 -------------------------------------------------------------------------------
4249 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) in floatx80_le_quiet()
4253 aSign ? le128( b.high, b.low, a.high, a.low ) in floatx80_le_quiet()
4254 : le128( a.high, a.low, b.high, b.low ); in floatx80_le_quiet()
4259 -------------------------------------------------------------------------------
4260 Returns 1 if the extended double-precision floating-point value `a' is less
4263 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4264 -------------------------------------------------------------------------------
4286 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) in floatx80_lt_quiet()
4290 aSign ? lt128( b.high, b.low, a.high, a.low ) in floatx80_lt_quiet()
4291 : lt128( a.high, a.low, b.high, b.low ); in floatx80_lt_quiet()
4300 -------------------------------------------------------------------------------
4301 Returns the result of converting the quadruple-precision floating-point
4302 value `a' to the 32-bit two's complement integer format. The conversion
4303 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4304 Arithmetic---which means in particular that the conversion is rounded
4308 -------------------------------------------------------------------------------
4323 shiftCount = 0x4028 - aExp; in float128_to_int32()
4330 -------------------------------------------------------------------------------
4331 Returns the result of converting the quadruple-precision floating-point
4332 value `a' to the 32-bit two's complement integer format. The conversion
4333 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4338 -------------------------------------------------------------------------------
4345 int32 z; in float128_to_int32_round_to_zero() local
4361 shiftCount = 0x402F - aExp; in float128_to_int32_round_to_zero()
4364 z = aSig0; in float128_to_int32_round_to_zero()
4365 if ( aSign ) z = - z; in float128_to_int32_round_to_zero()
4366 if ( ( z < 0 ) ^ aSign ) { in float128_to_int32_round_to_zero()
4374 return z; in float128_to_int32_round_to_zero()
4379 -------------------------------------------------------------------------------
4380 Returns the result of converting the quadruple-precision floating-point
4381 value `a' to the 64-bit two's complement integer format. The conversion
4382 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4383 Arithmetic---which means in particular that the conversion is rounded
4387 -------------------------------------------------------------------------------
4400 shiftCount = 0x402F - aExp; in float128_to_int64()
4413 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); in float128_to_int64()
4423 -------------------------------------------------------------------------------
4424 Returns the result of converting the quadruple-precision floating-point
4425 value `a' to the 64-bit two's complement integer format. The conversion
4426 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4431 -------------------------------------------------------------------------------
4438 int64 z; in float128_to_int64_round_to_zero() local
4445 shiftCount = aExp - 0x402F; in float128_to_int64_round_to_zero()
4449 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) in float128_to_int64_round_to_zero()
4461 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); in float128_to_int64_round_to_zero()
4473 z = aSig0>>( - shiftCount ); in float128_to_int64_round_to_zero()
4479 if ( aSign ) z = - z; in float128_to_int64_round_to_zero()
4480 return z; in float128_to_int64_round_to_zero()
4487 * just like above - but do not care for overflow of signed results
4494 uint64 z; in float128_to_uint64_round_to_zero() local
4501 shiftCount = aExp - 0x402F; in float128_to_uint64_round_to_zero()
4505 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) in float128_to_uint64_round_to_zero()
4514 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); in float128_to_uint64_round_to_zero()
4526 z = aSig0>>( - shiftCount ); in float128_to_uint64_round_to_zero()
4531 if ( aSign ) z = - z; in float128_to_uint64_round_to_zero()
4532 return z; in float128_to_uint64_round_to_zero()
4538 -------------------------------------------------------------------------------
4539 Returns the result of converting the quadruple-precision floating-point
4540 value `a' to the single-precision floating-point format. The conversion
4541 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4543 -------------------------------------------------------------------------------
4567 aExp -= 0x3F81; in float128_to_float32()
4574 -------------------------------------------------------------------------------
4575 Returns the result of converting the quadruple-precision floating-point
4576 value `a' to the double-precision floating-point format. The conversion
4577 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4579 -------------------------------------------------------------------------------
4601 aExp -= 0x3C01; in float128_to_float64()
4610 -------------------------------------------------------------------------------
4611 Returns the result of converting the quadruple-precision floating-point
4612 value `a' to the extended double-precision floating-point format. The
4614 Floating-Point Arithmetic.
4615 -------------------------------------------------------------------------------
4648 -------------------------------------------------------------------------------
4649 Rounds the quadruple-precision floating-point value `a' to an integer, and
4650 returns the result as a quadruple-precision floating-point value. The
4652 Floating-Point Arithmetic.
4653 -------------------------------------------------------------------------------
4661 float128 z; in float128_round_to_int() local
4674 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; in float128_round_to_int()
4675 roundBitsMask = lastBitMask - 1; in float128_round_to_int()
4676 z = a; in float128_round_to_int()
4680 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); in float128_round_to_int()
4681 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; in float128_round_to_int()
4684 if ( (sbits64) z.low < 0 ) { in float128_round_to_int()
4685 ++z.high; in float128_round_to_int()
4686 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1; in float128_round_to_int()
4691 if ( extractFloat128Sign( z ) in float128_round_to_int()
4693 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low ); in float128_round_to_int()
4696 z.low &= ~ roundBitsMask; in float128_round_to_int()
4700 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a; in float128_round_to_int()
4726 lastBitMask <<= 0x402F - aExp; in float128_round_to_int()
4727 roundBitsMask = lastBitMask - 1; in float128_round_to_int()
4728 z.low = 0; in float128_round_to_int()
4729 z.high = a.high; in float128_round_to_int()
4732 z.high += lastBitMask>>1; in float128_round_to_int()
4733 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { in float128_round_to_int()
4734 z.high &= ~ lastBitMask; in float128_round_to_int()
4738 if ( extractFloat128Sign( z ) in float128_round_to_int()
4740 z.high |= ( a.low != 0 ); in float128_round_to_int()
4741 z.high += roundBitsMask; in float128_round_to_int()
4744 z.high &= ~ roundBitsMask; in float128_round_to_int()
4746 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { in float128_round_to_int()
4749 return z; in float128_round_to_int()
4754 -------------------------------------------------------------------------------
4755 Returns the result of adding the absolute values of the quadruple-precision
4756 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
4759 Floating-Point Arithmetic.
4760 -------------------------------------------------------------------------------
4774 expDiff = aExp - bExp; in addFloat128Sigs()
4781 --expDiff; in addFloat128Sigs()
4802 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); in addFloat128Sigs()
4821 --zExp; in addFloat128Sigs()
4833 -------------------------------------------------------------------------------
4834 Returns the result of subtracting the absolute values of the quadruple-
4835 precision floating-point values `a' and `b'. If `zSign' is 1, the
4838 Standard for Binary Floating-Point Arithmetic.
4839 -------------------------------------------------------------------------------
4846 float128 z; in subFloat128Sigs() local
4854 expDiff = aExp - bExp; in subFloat128Sigs()
4864 z.low = float128_default_nan_low; in subFloat128Sigs()
4865 z.high = float128_default_nan_high; in subFloat128Sigs()
4866 return z; in subFloat128Sigs()
4888 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); in subFloat128Sigs()
4901 --expDiff; in subFloat128Sigs()
4912 --zExp; in subFloat128Sigs()
4913 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 ); in subFloat128Sigs()
4918 -------------------------------------------------------------------------------
4919 Returns the result of adding the quadruple-precision floating-point values
4921 for Binary Floating-Point Arithmetic.
4922 -------------------------------------------------------------------------------
4940 -------------------------------------------------------------------------------
4941 Returns the result of subtracting the quadruple-precision floating-point
4943 Standard for Binary Floating-Point Arithmetic.
4944 -------------------------------------------------------------------------------
4962 -------------------------------------------------------------------------------
4963 Returns the result of multiplying the quadruple-precision floating-point
4965 Standard for Binary Floating-Point Arithmetic.
4966 -------------------------------------------------------------------------------
4973 float128 z; in float128_mul() local
4997 z.low = float128_default_nan_low; in float128_mul()
4998 z.high = float128_default_nan_high; in float128_mul()
4999 return z; in float128_mul()
5011 zExp = aExp + bExp - 0x4000; in float128_mul()
5027 -------------------------------------------------------------------------------
5028 Returns the result of dividing the quadruple-precision floating-point value
5030 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5031 -------------------------------------------------------------------------------
5039 float128 z; in float128_div() local
5067 z.low = float128_default_nan_low; in float128_div()
5068 z.high = float128_default_nan_high; in float128_div()
5069 return z; in float128_div()
5080 zExp = aExp - bExp + 0x3FFD; in float128_div()
5093 --zSig0; in float128_div()
5101 --zSig1; in float128_div()
5112 -------------------------------------------------------------------------------
5113 Returns the remainder of the quadruple-precision floating-point value `a'
5115 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5116 -------------------------------------------------------------------------------
5125 float128 z; in float128_rem() local
5150 z.low = float128_default_nan_low; in float128_rem()
5151 z.high = float128_default_nan_high; in float128_rem()
5152 return z; in float128_rem()
5160 expDiff = aExp - bExp; in float128_rem()
5161 if ( expDiff < -1 ) return a; in float128_rem()
5165 15 - ( expDiff < 0 ), in float128_rem()
5173 expDiff -= 64; in float128_rem()
5176 q = ( 4 < q ) ? q - 4 : 0; in float128_rem()
5181 expDiff -= 61; in float128_rem()
5183 if ( -64 < expDiff ) { in float128_rem()
5185 q = ( 4 < q ) ? q - 4 : 0; in float128_rem()
5186 q >>= - expDiff; in float128_rem()
5190 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); in float128_rem()
5218 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 ); in float128_rem()
5223 -------------------------------------------------------------------------------
5224 Returns the square root of the quadruple-precision floating-point value `a'.
5226 Floating-Point Arithmetic.
5227 -------------------------------------------------------------------------------
5235 float128 z; in float128_sqrt() local
5250 z.low = float128_default_nan_low; in float128_sqrt()
5251 z.high = float128_default_nan_high; in float128_sqrt()
5252 return z; in float128_sqrt()
5258 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; in float128_sqrt()
5261 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); in float128_sqrt()
5267 --zSig0; in float128_sqrt()
5268 doubleZSig0 -= 2; in float128_sqrt()
5279 --zSig1; in float128_sqrt()
5293 -------------------------------------------------------------------------------
5294 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5296 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5297 -------------------------------------------------------------------------------
5315 && ( ( a.high == b.high ) in float128_eq()
5317 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) in float128_eq()
5323 -------------------------------------------------------------------------------
5324 Returns 1 if the quadruple-precision floating-point value `a' is less than
5326 is performed according to the IEC/IEEE Standard for Binary Floating-Point
5328 -------------------------------------------------------------------------------
5347 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) in float128_le()
5351 aSign ? le128( b.high, b.low, a.high, a.low ) in float128_le()
5352 : le128( a.high, a.low, b.high, b.low ); in float128_le()
5357 -------------------------------------------------------------------------------
5358 Returns 1 if the quadruple-precision floating-point value `a' is less than
5360 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5361 -------------------------------------------------------------------------------
5380 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) in float128_lt()
5384 aSign ? lt128( b.high, b.low, a.high, a.low ) in float128_lt()
5385 : lt128( a.high, a.low, b.high, b.low ); in float128_lt()
5390 -------------------------------------------------------------------------------
5391 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5394 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5395 -------------------------------------------------------------------------------
5410 && ( ( a.high == b.high ) in float128_eq_signaling()
5412 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) in float128_eq_signaling()
5418 -------------------------------------------------------------------------------
5419 Returns 1 if the quadruple-precision floating-point value `a' is less than
5422 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5423 -------------------------------------------------------------------------------
5445 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) in float128_le_quiet()
5449 aSign ? le128( b.high, b.low, a.high, a.low ) in float128_le_quiet()
5450 : le128( a.high, a.low, b.high, b.low ); in float128_le_quiet()
5455 -------------------------------------------------------------------------------
5456 Returns 1 if the quadruple-precision floating-point value `a' is less than
5459 Standard for Binary Floating-Point Arithmetic.
5460 -------------------------------------------------------------------------------
5482 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) in float128_lt_quiet()
5486 aSign ? lt128( b.high, b.low, a.high, a.low ) in float128_lt_quiet()
5487 : lt128( a.high, a.low, b.high, b.low ); in float128_lt_quiet()
5504 * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5508 -------------------------------------------------------------------------------
5509 Returns the result of converting the double-precision floating-point value
5510 `a' to the 32-bit unsigned integer format. The conversion is
5511 performed according to the IEC/IEEE Standard for Binary Floating-point
5515 -------------------------------------------------------------------------------
5522 uint32 z; in float64_to_uint32_round_to_zero() local
5542 shiftCount = 0x433 - aExp; in float64_to_uint32_round_to_zero()
5545 z = aSig; in float64_to_uint32_round_to_zero()
5549 return z; in float64_to_uint32_round_to_zero()
5554 -------------------------------------------------------------------------------
5555 Returns the result of converting the single-precision floating-point value
5556 `a' to the 32-bit unsigned integer format. The conversion is
5557 performed according to the IEC/IEEE Standard for Binary Floating-point
5561 -------------------------------------------------------------------------------
5568 uint32 z; in float32_to_uint32_round_to_zero() local
5573 shiftCount = aExp - 0x9E; in float32_to_uint32_round_to_zero()
5588 z = aSig>>( - shiftCount ); in float32_to_uint32_round_to_zero()
5592 return z; in float32_to_uint32_round_to_zero()