1 /* $NetBSD: softfloat.c,v 1.1 2002/05/21 23:51:07 bjh21 Exp $ */
2
3 /*
4 * This version hacked for use with gcc -msoft-float by bjh21.
5 * (Mostly a case of #ifdefing out things GCC doesn't need or provides
6 * itself).
7 */
8
9 /*
10 * Things you may want to define:
11 *
12 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
13 * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them
14 * properly renamed.
15 */
16
17 /*
18 * This differs from the standard bits32/softfloat.c in that float64
19 * is defined to be a 64-bit integer rather than a structure. The
20 * structure is float64s, with translation between the two going via
21 * float64u.
22 */
23
24 /*
25 ===============================================================================
26
27 This C source file is part of the SoftFloat IEC/IEEE Floating-Point
28 Arithmetic Package, Release 2a.
29
30 Written by John R. Hauser. This work was made possible in part by the
31 International Computer Science Institute, located at Suite 600, 1947 Center
32 Street, Berkeley, California 94704. Funding was partially provided by the
33 National Science Foundation under grant MIP-9311980. The original version
34 of this code was written as part of a project to build a fixed-point vector
35 processor in collaboration with the University of California at Berkeley,
36 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
37 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
38 arithmetic/SoftFloat.html'.
39
40 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
41 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
42 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
43 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
44 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
45
46 Derivative works are acceptable, even for commercial purposes, so long as
47 (1) they include prominent notice that the work is derivative, and (2) they
48 include prominent notice akin to these four paragraphs for those parts of
49 this code that are retained.
50
51 ===============================================================================
52 */
53
54 #ifdef SOFTFLOAT_FOR_GCC
55 #include "softfloat-for-gcc.h"
56 #endif
57
58 #include "milieu.h"
59 #include "softfloat.h"
60
61 /*
62 * Conversions between floats as stored in memory and floats as
63 * SoftFloat uses them
64 */
65 #ifndef FLOAT64_DEMANGLE
66 #define FLOAT64_DEMANGLE(a) (a)
67 #endif
68 #ifndef FLOAT64_MANGLE
69 #define FLOAT64_MANGLE(a) (a)
70 #endif
71
72 /*
73 -------------------------------------------------------------------------------
74 Floating-point rounding mode and exception flags.
75 -------------------------------------------------------------------------------
76 */
77 int float_rounding_mode = float_round_nearest_even;
78 int float_exception_flags = 0;
79
80 /*
81 -------------------------------------------------------------------------------
82 Primitive arithmetic functions, including multi-word arithmetic, and
83 division and square root approximations. (Can be specialized to target if
84 desired.)
85 -------------------------------------------------------------------------------
86 */
87 #include "softfloat-macros"
88
89 /*
90 -------------------------------------------------------------------------------
91 Functions and definitions to determine: (1) whether tininess for underflow
92 is detected before or after rounding by default, (2) what (if anything)
93 happens when exceptions are raised, (3) how signaling NaNs are distinguished
94 from quiet NaNs, (4) the default generated quiet NaNs, and (4) how NaNs
95 are propagated from function inputs to output. These details are target-
96 specific.
97 -------------------------------------------------------------------------------
98 */
99 #include "softfloat-specialize"
100
101 /*
102 -------------------------------------------------------------------------------
103 Returns the fraction bits of the single-precision floating-point value `a'.
104 -------------------------------------------------------------------------------
105 */
extractFloat32Frac(float32 a)106 INLINE bits32 extractFloat32Frac( float32 a )
107 {
108
109 return a & 0x007FFFFF;
110
111 }
112
113 /*
114 -------------------------------------------------------------------------------
115 Returns the exponent bits of the single-precision floating-point value `a'.
116 -------------------------------------------------------------------------------
117 */
extractFloat32Exp(float32 a)118 INLINE int16 extractFloat32Exp( float32 a )
119 {
120
121 return ( a>>23 ) & 0xFF;
122
123 }
124
125 /*
126 -------------------------------------------------------------------------------
127 Returns the sign bit of the single-precision floating-point value `a'.
128 -------------------------------------------------------------------------------
129 */
extractFloat32Sign(float32 a)130 INLINE flag extractFloat32Sign( float32 a )
131 {
132
133 return a>>31;
134
135 }
136
137 /*
138 -------------------------------------------------------------------------------
139 Normalizes the subnormal single-precision floating-point value represented
140 by the denormalized significand `aSig'. The normalized exponent and
141 significand are stored at the locations pointed to by `zExpPtr' and
142 `zSigPtr', respectively.
143 -------------------------------------------------------------------------------
144 */
145 static void
normalizeFloat32Subnormal(bits32 aSig,int16 * zExpPtr,bits32 * zSigPtr)146 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
147 {
148 int8 shiftCount;
149
150 shiftCount = countLeadingZeros32( aSig ) - 8;
151 *zSigPtr = aSig<<shiftCount;
152 *zExpPtr = 1 - shiftCount;
153
154 }
155
156 /*
157 -------------------------------------------------------------------------------
158 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
159 single-precision floating-point value, returning the result. After being
160 shifted into the proper positions, the three fields are simply added
161 together to form the result. This means that any integer portion of `zSig'
162 will be added into the exponent. Since a properly normalized significand
163 will have an integer portion equal to 1, the `zExp' input should be 1 less
164 than the desired result exponent whenever `zSig' is a complete, normalized
165 significand.
166 -------------------------------------------------------------------------------
167 */
packFloat32(flag zSign,int16 zExp,bits32 zSig)168 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
169 {
170
171 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
172
173 }
174
175 /*
176 -------------------------------------------------------------------------------
177 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
178 and significand `zSig', and returns the proper single-precision floating-
179 point value corresponding to the abstract input. Ordinarily, the abstract
180 value is simply rounded and packed into the single-precision format, with
181 the inexact exception raised if the abstract input cannot be represented
182 exactly. However, if the abstract value is too large, the overflow and
183 inexact exceptions are raised and an infinity or maximal finite value is
184 returned. If the abstract value is too small, the input value is rounded to
185 a subnormal number, and the underflow and inexact exceptions are raised if
186 the abstract input cannot be represented exactly as a subnormal single-
187 precision floating-point number.
188 The input significand `zSig' has its binary point between bits 30
189 and 29, which is 7 bits to the left of the usual location. This shifted
190 significand must be normalized or smaller. If `zSig' is not normalized,
191 `zExp' must be 0; in that case, the result returned is a subnormal number,
192 and it must not require rounding. In the usual case that `zSig' is
193 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
194 The handling of underflow and overflow follows the IEC/IEEE Standard for
195 Binary Floating-Point Arithmetic.
196 -------------------------------------------------------------------------------
197 */
roundAndPackFloat32(flag zSign,int16 zExp,bits32 zSig)198 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
199 {
200 int8 roundingMode;
201 flag roundNearestEven;
202 int8 roundIncrement, roundBits;
203 flag isTiny;
204
205 roundingMode = float_rounding_mode;
206 roundNearestEven = roundingMode == float_round_nearest_even;
207 roundIncrement = 0x40;
208 if ( ! roundNearestEven ) {
209 if ( roundingMode == float_round_to_zero ) {
210 roundIncrement = 0;
211 }
212 else {
213 roundIncrement = 0x7F;
214 if ( zSign ) {
215 if ( roundingMode == float_round_up ) roundIncrement = 0;
216 }
217 else {
218 if ( roundingMode == float_round_down ) roundIncrement = 0;
219 }
220 }
221 }
222 roundBits = zSig & 0x7F;
223 if ( 0xFD <= (bits16) zExp ) {
224 if ( ( 0xFD < zExp )
225 || ( ( zExp == 0xFD )
226 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
227 ) {
228 float_raise( float_flag_overflow | float_flag_inexact );
229 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
230 }
231 if ( zExp < 0 ) {
232 isTiny =
233 ( float_detect_tininess == float_tininess_before_rounding )
234 || ( zExp < -1 )
235 || ( zSig + roundIncrement < 0x80000000 );
236 shift32RightJamming( zSig, - zExp, &zSig );
237 zExp = 0;
238 roundBits = zSig & 0x7F;
239 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
240 }
241 }
242 if ( roundBits ) float_exception_flags |= float_flag_inexact;
243 zSig = ( zSig + roundIncrement )>>7;
244 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
245 if ( zSig == 0 ) zExp = 0;
246 return packFloat32( zSign, zExp, zSig );
247
248 }
249
250 /*
251 -------------------------------------------------------------------------------
252 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
253 and significand `zSig', and returns the proper single-precision floating-
254 point value corresponding to the abstract input. This routine is just like
255 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
256 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
257 floating-point exponent.
258 -------------------------------------------------------------------------------
259 */
260 static float32
normalizeRoundAndPackFloat32(flag zSign,int16 zExp,bits32 zSig)261 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
262 {
263 int8 shiftCount;
264
265 shiftCount = countLeadingZeros32( zSig ) - 1;
266 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
267
268 }
269
270 /*
271 -------------------------------------------------------------------------------
272 Returns the least-significant 32 fraction bits of the double-precision
273 floating-point value `a'.
274 -------------------------------------------------------------------------------
275 */
extractFloat64Frac1(float64 a)276 INLINE bits32 extractFloat64Frac1( float64 a )
277 {
278
279 return FLOAT64_DEMANGLE(a) & LIT64( 0x00000000FFFFFFFF );
280
281 }
282
283 /*
284 -------------------------------------------------------------------------------
285 Returns the most-significant 20 fraction bits of the double-precision
286 floating-point value `a'.
287 -------------------------------------------------------------------------------
288 */
extractFloat64Frac0(float64 a)289 INLINE bits32 extractFloat64Frac0( float64 a )
290 {
291
292 return ( FLOAT64_DEMANGLE(a)>>32 ) & 0x000FFFFF;
293
294 }
295
296 /*
297 -------------------------------------------------------------------------------
298 Returns the exponent bits of the double-precision floating-point value `a'.
299 -------------------------------------------------------------------------------
300 */
extractFloat64Exp(float64 a)301 INLINE int16 extractFloat64Exp( float64 a )
302 {
303
304 return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
305
306 }
307
308 /*
309 -------------------------------------------------------------------------------
310 Returns the sign bit of the double-precision floating-point value `a'.
311 -------------------------------------------------------------------------------
312 */
extractFloat64Sign(float64 a)313 INLINE flag extractFloat64Sign( float64 a )
314 {
315
316 return FLOAT64_DEMANGLE(a)>>63;
317
318 }
319
320 /*
321 -------------------------------------------------------------------------------
322 Normalizes the subnormal double-precision floating-point value represented
323 by the denormalized significand formed by the concatenation of `aSig0' and
324 `aSig1'. The normalized exponent is stored at the location pointed to by
325 `zExpPtr'. The most significant 21 bits of the normalized significand are
326 stored at the location pointed to by `zSig0Ptr', and the least significant
327 32 bits of the normalized significand are stored at the location pointed to
328 by `zSig1Ptr'.
329 -------------------------------------------------------------------------------
330 */
331 static void
normalizeFloat64Subnormal(bits32 aSig0,bits32 aSig1,int16 * zExpPtr,bits32 * zSig0Ptr,bits32 * zSig1Ptr)332 normalizeFloat64Subnormal(
333 bits32 aSig0,
334 bits32 aSig1,
335 int16 *zExpPtr,
336 bits32 *zSig0Ptr,
337 bits32 *zSig1Ptr
338 )
339 {
340 int8 shiftCount;
341
342 if ( aSig0 == 0 ) {
343 shiftCount = countLeadingZeros32( aSig1 ) - 11;
344 if ( shiftCount < 0 ) {
345 *zSig0Ptr = aSig1>>( - shiftCount );
346 *zSig1Ptr = aSig1<<( shiftCount & 31 );
347 }
348 else {
349 *zSig0Ptr = aSig1<<shiftCount;
350 *zSig1Ptr = 0;
351 }
352 *zExpPtr = - shiftCount - 31;
353 }
354 else {
355 shiftCount = countLeadingZeros32( aSig0 ) - 11;
356 shortShift64Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
357 *zExpPtr = 1 - shiftCount;
358 }
359
360 }
361
362 /*
363 -------------------------------------------------------------------------------
364 Packs the sign `zSign', the exponent `zExp', and the significand formed by
365 the concatenation of `zSig0' and `zSig1' into a double-precision floating-
366 point value, returning the result. After being shifted into the proper
367 positions, the three fields `zSign', `zExp', and `zSig0' are simply added
368 together to form the most significant 32 bits of the result. This means
369 that any integer portion of `zSig0' will be added into the exponent. Since
370 a properly normalized significand will have an integer portion equal to 1,
371 the `zExp' input should be 1 less than the desired result exponent whenever
372 `zSig0' and `zSig1' concatenated form a complete, normalized significand.
373 -------------------------------------------------------------------------------
374 */
375 INLINE float64
packFloat64(flag zSign,int16 zExp,bits32 zSig0,bits32 zSig1)376 packFloat64( flag zSign, int16 zExp, bits32 zSig0, bits32 zSig1 )
377 {
378
379 return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
380 ( ( (bits64) zExp )<<52 ) +
381 ( ( (bits64) zSig0 )<<32 ) + zSig1 );
382
383
384 }
385
386 /*
387 -------------------------------------------------------------------------------
388 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
389 and extended significand formed by the concatenation of `zSig0', `zSig1',
390 and `zSig2', and returns the proper double-precision floating-point value
391 corresponding to the abstract input. Ordinarily, the abstract value is
392 simply rounded and packed into the double-precision format, with the inexact
393 exception raised if the abstract input cannot be represented exactly.
394 However, if the abstract value is too large, the overflow and inexact
395 exceptions are raised and an infinity or maximal finite value is returned.
396 If the abstract value is too small, the input value is rounded to a
397 subnormal number, and the underflow and inexact exceptions are raised if the
398 abstract input cannot be represented exactly as a subnormal double-precision
399 floating-point number.
400 The input significand must be normalized or smaller. If the input
401 significand is not normalized, `zExp' must be 0; in that case, the result
402 returned is a subnormal number, and it must not require rounding. In the
403 usual case that the input significand is normalized, `zExp' must be 1 less
404 than the ``true'' floating-point exponent. The handling of underflow and
405 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
406 -------------------------------------------------------------------------------
407 */
408 static float64
roundAndPackFloat64(flag zSign,int16 zExp,bits32 zSig0,bits32 zSig1,bits32 zSig2)409 roundAndPackFloat64(
410 flag zSign, int16 zExp, bits32 zSig0, bits32 zSig1, bits32 zSig2 )
411 {
412 int8 roundingMode;
413 flag roundNearestEven, increment, isTiny;
414
415 roundingMode = float_rounding_mode;
416 roundNearestEven = ( roundingMode == float_round_nearest_even );
417 increment = ( (sbits32) zSig2 < 0 );
418 if ( ! roundNearestEven ) {
419 if ( roundingMode == float_round_to_zero ) {
420 increment = 0;
421 }
422 else {
423 if ( zSign ) {
424 increment = ( roundingMode == float_round_down ) && zSig2;
425 }
426 else {
427 increment = ( roundingMode == float_round_up ) && zSig2;
428 }
429 }
430 }
431 if ( 0x7FD <= (bits16) zExp ) {
432 if ( ( 0x7FD < zExp )
433 || ( ( zExp == 0x7FD )
434 && eq64( 0x001FFFFF, 0xFFFFFFFF, zSig0, zSig1 )
435 && increment
436 )
437 ) {
438 float_raise( float_flag_overflow | float_flag_inexact );
439 if ( ( roundingMode == float_round_to_zero )
440 || ( zSign && ( roundingMode == float_round_up ) )
441 || ( ! zSign && ( roundingMode == float_round_down ) )
442 ) {
443 return packFloat64( zSign, 0x7FE, 0x000FFFFF, 0xFFFFFFFF );
444 }
445 return packFloat64( zSign, 0x7FF, 0, 0 );
446 }
447 if ( zExp < 0 ) {
448 isTiny =
449 ( float_detect_tininess == float_tininess_before_rounding )
450 || ( zExp < -1 )
451 || ! increment
452 || lt64( zSig0, zSig1, 0x001FFFFF, 0xFFFFFFFF );
453 shift64ExtraRightJamming(
454 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
455 zExp = 0;
456 if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
457 if ( roundNearestEven ) {
458 increment = ( (sbits32) zSig2 < 0 );
459 }
460 else {
461 if ( zSign ) {
462 increment = ( roundingMode == float_round_down ) && zSig2;
463 }
464 else {
465 increment = ( roundingMode == float_round_up ) && zSig2;
466 }
467 }
468 }
469 }
470 if ( zSig2 ) float_exception_flags |= float_flag_inexact;
471 if ( increment ) {
472 add64( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
473 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
474 }
475 else {
476 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
477 }
478 return packFloat64( zSign, zExp, zSig0, zSig1 );
479
480 }
481
482 /*
483 -------------------------------------------------------------------------------
484 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
485 and significand formed by the concatenation of `zSig0' and `zSig1', and
486 returns the proper double-precision floating-point value corresponding
487 to the abstract input. This routine is just like `roundAndPackFloat64'
488 except that the input significand has fewer bits and does not have to be
489 normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
490 point exponent.
491 -------------------------------------------------------------------------------
492 */
493 static float64
normalizeRoundAndPackFloat64(flag zSign,int16 zExp,bits32 zSig0,bits32 zSig1)494 normalizeRoundAndPackFloat64(
495 flag zSign, int16 zExp, bits32 zSig0, bits32 zSig1 )
496 {
497 int8 shiftCount;
498 bits32 zSig2;
499
500 if ( zSig0 == 0 ) {
501 zSig0 = zSig1;
502 zSig1 = 0;
503 zExp -= 32;
504 }
505 shiftCount = countLeadingZeros32( zSig0 ) - 11;
506 if ( 0 <= shiftCount ) {
507 zSig2 = 0;
508 shortShift64Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
509 }
510 else {
511 shift64ExtraRightJamming(
512 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
513 }
514 zExp -= shiftCount;
515 return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 );
516
517 }
518
519 /*
520 -------------------------------------------------------------------------------
521 Returns the result of converting the 32-bit two's complement integer `a' to
522 the single-precision floating-point format. The conversion is performed
523 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
524 -------------------------------------------------------------------------------
525 */
int32_to_float32(int32 a)526 float32 int32_to_float32( int32 a )
527 {
528 flag zSign;
529
530 if ( a == 0 ) return 0;
531 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
532 zSign = ( a < 0 );
533 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
534
535 }
536
537 /*
538 -------------------------------------------------------------------------------
539 Returns the result of converting the 32-bit two's complement integer `a' to
540 the double-precision floating-point format. The conversion is performed
541 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
542 -------------------------------------------------------------------------------
543 */
int32_to_float64(int32 a)544 float64 int32_to_float64( int32 a )
545 {
546 flag zSign;
547 bits32 absA;
548 int8 shiftCount;
549 bits32 zSig0, zSig1;
550
551 if ( a == 0 ) return packFloat64( 0, 0, 0, 0 );
552 zSign = ( a < 0 );
553 absA = zSign ? - a : a;
554 shiftCount = countLeadingZeros32( absA ) - 11;
555 if ( 0 <= shiftCount ) {
556 zSig0 = absA<<shiftCount;
557 zSig1 = 0;
558 }
559 else {
560 shift64Right( absA, 0, - shiftCount, &zSig0, &zSig1 );
561 }
562 return packFloat64( zSign, 0x412 - shiftCount, zSig0, zSig1 );
563
564 }
565
566 #ifndef SOFTFLOAT_FOR_GCC
567 /*
568 -------------------------------------------------------------------------------
569 Returns the result of converting the single-precision floating-point value
570 `a' to the 32-bit two's complement integer format. The conversion is
571 performed according to the IEC/IEEE Standard for Binary Floating-Point
572 Arithmetic---which means in particular that the conversion is rounded
573 according to the current rounding mode. If `a' is a NaN, the largest
574 positive integer is returned. Otherwise, if the conversion overflows, the
575 largest integer with the same sign as `a' is returned.
576 -------------------------------------------------------------------------------
577 */
float32_to_int32(float32 a)578 int32 float32_to_int32( float32 a )
579 {
580 flag aSign;
581 int16 aExp, shiftCount;
582 bits32 aSig, aSigExtra;
583 int32 z;
584 int8 roundingMode;
585
586 aSig = extractFloat32Frac( a );
587 aExp = extractFloat32Exp( a );
588 aSign = extractFloat32Sign( a );
589 shiftCount = aExp - 0x96;
590 if ( 0 <= shiftCount ) {
591 if ( 0x9E <= aExp ) {
592 if ( a != 0xCF000000 ) {
593 float_raise( float_flag_invalid );
594 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
595 return 0x7FFFFFFF;
596 }
597 }
598 return (sbits32) 0x80000000;
599 }
600 z = ( aSig | 0x00800000 )<<shiftCount;
601 if ( aSign ) z = - z;
602 }
603 else {
604 if ( aExp < 0x7E ) {
605 aSigExtra = aExp | aSig;
606 z = 0;
607 }
608 else {
609 aSig |= 0x00800000;
610 aSigExtra = aSig<<( shiftCount & 31 );
611 z = aSig>>( - shiftCount );
612 }
613 if ( aSigExtra ) float_exception_flags |= float_flag_inexact;
614 roundingMode = float_rounding_mode;
615 if ( roundingMode == float_round_nearest_even ) {
616 if ( (sbits32) aSigExtra < 0 ) {
617 ++z;
618 if ( (bits32) ( aSigExtra<<1 ) == 0 ) z &= ~1;
619 }
620 if ( aSign ) z = - z;
621 }
622 else {
623 aSigExtra = ( aSigExtra != 0 );
624 if ( aSign ) {
625 z += ( roundingMode == float_round_down ) & aSigExtra;
626 z = - z;
627 }
628 else {
629 z += ( roundingMode == float_round_up ) & aSigExtra;
630 }
631 }
632 }
633 return z;
634
635 }
636 #endif
637
638 /*
639 -------------------------------------------------------------------------------
640 Returns the result of converting the single-precision floating-point value
641 `a' to the 32-bit two's complement integer format. The conversion is
642 performed according to the IEC/IEEE Standard for Binary Floating-Point
643 Arithmetic, except that the conversion is always rounded toward zero.
644 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
645 the conversion overflows, the largest integer with the same sign as `a' is
646 returned.
647 -------------------------------------------------------------------------------
648 */
float32_to_int32_round_to_zero(float32 a)649 int32 float32_to_int32_round_to_zero( float32 a )
650 {
651 flag aSign;
652 int16 aExp, shiftCount;
653 bits32 aSig;
654 int32 z;
655
656 aSig = extractFloat32Frac( a );
657 aExp = extractFloat32Exp( a );
658 aSign = extractFloat32Sign( a );
659 shiftCount = aExp - 0x9E;
660 if ( 0 <= shiftCount ) {
661 if ( a != 0xCF000000 ) {
662 float_raise( float_flag_invalid );
663 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
664 }
665 return (sbits32) 0x80000000;
666 }
667 else if ( aExp <= 0x7E ) {
668 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
669 return 0;
670 }
671 aSig = ( aSig | 0x00800000 )<<8;
672 z = aSig>>( - shiftCount );
673 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
674 float_exception_flags |= float_flag_inexact;
675 }
676 if ( aSign ) z = - z;
677 return z;
678
679 }
680
681 /*
682 -------------------------------------------------------------------------------
683 Returns the result of converting the single-precision floating-point value
684 `a' to the double-precision floating-point format. The conversion is
685 performed according to the IEC/IEEE Standard for Binary Floating-Point
686 Arithmetic.
687 -------------------------------------------------------------------------------
688 */
float32_to_float64(float32 a)689 float64 float32_to_float64( float32 a )
690 {
691 flag aSign;
692 int16 aExp;
693 bits32 aSig, zSig0, zSig1;
694
695 aSig = extractFloat32Frac( a );
696 aExp = extractFloat32Exp( a );
697 aSign = extractFloat32Sign( a );
698 if ( aExp == 0xFF ) {
699 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
700 return packFloat64( aSign, 0x7FF, 0, 0 );
701 }
702 if ( aExp == 0 ) {
703 if ( aSig == 0 ) return packFloat64( aSign, 0, 0, 0 );
704 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
705 --aExp;
706 }
707 shift64Right( aSig, 0, 3, &zSig0, &zSig1 );
708 return packFloat64( aSign, aExp + 0x380, zSig0, zSig1 );
709
710 }
711
712 #ifndef SOFTFLOAT_FOR_GCC
713 /*
714 -------------------------------------------------------------------------------
715 Rounds the single-precision floating-point value `a' to an integer,
716 and returns the result as a single-precision floating-point value. The
717 operation is performed according to the IEC/IEEE Standard for Binary
718 Floating-Point Arithmetic.
719 -------------------------------------------------------------------------------
720 */
float32_round_to_int(float32 a)721 float32 float32_round_to_int( float32 a )
722 {
723 flag aSign;
724 int16 aExp;
725 bits32 lastBitMask, roundBitsMask;
726 int8 roundingMode;
727 float32 z;
728
729 aExp = extractFloat32Exp( a );
730 if ( 0x96 <= aExp ) {
731 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
732 return propagateFloat32NaN( a, a );
733 }
734 return a;
735 }
736 if ( aExp <= 0x7E ) {
737 if ( (bits32) ( a<<1 ) == 0 ) return a;
738 float_exception_flags |= float_flag_inexact;
739 aSign = extractFloat32Sign( a );
740 switch ( float_rounding_mode ) {
741 case float_round_nearest_even:
742 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
743 return packFloat32( aSign, 0x7F, 0 );
744 }
745 break;
746 case float_round_to_zero:
747 break;
748 case float_round_down:
749 return aSign ? 0xBF800000 : 0;
750 case float_round_up:
751 return aSign ? 0x80000000 : 0x3F800000;
752 }
753 return packFloat32( aSign, 0, 0 );
754 }
755 lastBitMask = 1;
756 lastBitMask <<= 0x96 - aExp;
757 roundBitsMask = lastBitMask - 1;
758 z = a;
759 roundingMode = float_rounding_mode;
760 if ( roundingMode == float_round_nearest_even ) {
761 z += lastBitMask>>1;
762 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
763 }
764 else if ( roundingMode != float_round_to_zero ) {
765 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
766 z += roundBitsMask;
767 }
768 }
769 z &= ~ roundBitsMask;
770 if ( z != a ) float_exception_flags |= float_flag_inexact;
771 return z;
772
773 }
774 #endif
775
776 /*
777 -------------------------------------------------------------------------------
778 Returns the result of adding the absolute values of the single-precision
779 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
780 before being returned. `zSign' is ignored if the result is a NaN.
781 The addition is performed according to the IEC/IEEE Standard for Binary
782 Floating-Point Arithmetic.
783 -------------------------------------------------------------------------------
784 */
addFloat32Sigs(float32 a,float32 b,flag zSign)785 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
786 {
787 int16 aExp, bExp, zExp;
788 bits32 aSig, bSig, zSig;
789 int16 expDiff;
790
791 aSig = extractFloat32Frac( a );
792 aExp = extractFloat32Exp( a );
793 bSig = extractFloat32Frac( b );
794 bExp = extractFloat32Exp( b );
795 expDiff = aExp - bExp;
796 aSig <<= 6;
797 bSig <<= 6;
798 if ( 0 < expDiff ) {
799 if ( aExp == 0xFF ) {
800 if ( aSig ) return propagateFloat32NaN( a, b );
801 return a;
802 }
803 if ( bExp == 0 ) {
804 --expDiff;
805 }
806 else {
807 bSig |= 0x20000000;
808 }
809 shift32RightJamming( bSig, expDiff, &bSig );
810 zExp = aExp;
811 }
812 else if ( expDiff < 0 ) {
813 if ( bExp == 0xFF ) {
814 if ( bSig ) return propagateFloat32NaN( a, b );
815 return packFloat32( zSign, 0xFF, 0 );
816 }
817 if ( aExp == 0 ) {
818 ++expDiff;
819 }
820 else {
821 aSig |= 0x20000000;
822 }
823 shift32RightJamming( aSig, - expDiff, &aSig );
824 zExp = bExp;
825 }
826 else {
827 if ( aExp == 0xFF ) {
828 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
829 return a;
830 }
831 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
832 zSig = 0x40000000 + aSig + bSig;
833 zExp = aExp;
834 goto roundAndPack;
835 }
836 aSig |= 0x20000000;
837 zSig = ( aSig + bSig )<<1;
838 --zExp;
839 if ( (sbits32) zSig < 0 ) {
840 zSig = aSig + bSig;
841 ++zExp;
842 }
843 roundAndPack:
844 return roundAndPackFloat32( zSign, zExp, zSig );
845
846 }
847
848 /*
849 -------------------------------------------------------------------------------
850 Returns the result of subtracting the absolute values of the single-
851 precision floating-point values `a' and `b'. If `zSign' is 1, the
852 difference is negated before being returned. `zSign' is ignored if the
853 result is a NaN. The subtraction is performed according to the IEC/IEEE
854 Standard for Binary Floating-Point Arithmetic.
855 -------------------------------------------------------------------------------
856 */
subFloat32Sigs(float32 a,float32 b,flag zSign)857 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
858 {
859 int16 aExp, bExp, zExp;
860 bits32 aSig, bSig, zSig;
861 int16 expDiff;
862
863 aSig = extractFloat32Frac( a );
864 aExp = extractFloat32Exp( a );
865 bSig = extractFloat32Frac( b );
866 bExp = extractFloat32Exp( b );
867 expDiff = aExp - bExp;
868 aSig <<= 7;
869 bSig <<= 7;
870 if ( 0 < expDiff ) goto aExpBigger;
871 if ( expDiff < 0 ) goto bExpBigger;
872 if ( aExp == 0xFF ) {
873 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
874 float_raise( float_flag_invalid );
875 return float32_default_nan;
876 }
877 if ( aExp == 0 ) {
878 aExp = 1;
879 bExp = 1;
880 }
881 if ( bSig < aSig ) goto aBigger;
882 if ( aSig < bSig ) goto bBigger;
883 return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
884 bExpBigger:
885 if ( bExp == 0xFF ) {
886 if ( bSig ) return propagateFloat32NaN( a, b );
887 return packFloat32( zSign ^ 1, 0xFF, 0 );
888 }
889 if ( aExp == 0 ) {
890 ++expDiff;
891 }
892 else {
893 aSig |= 0x40000000;
894 }
895 shift32RightJamming( aSig, - expDiff, &aSig );
896 bSig |= 0x40000000;
897 bBigger:
898 zSig = bSig - aSig;
899 zExp = bExp;
900 zSign ^= 1;
901 goto normalizeRoundAndPack;
902 aExpBigger:
903 if ( aExp == 0xFF ) {
904 if ( aSig ) return propagateFloat32NaN( a, b );
905 return a;
906 }
907 if ( bExp == 0 ) {
908 --expDiff;
909 }
910 else {
911 bSig |= 0x40000000;
912 }
913 shift32RightJamming( bSig, expDiff, &bSig );
914 aSig |= 0x40000000;
915 aBigger:
916 zSig = aSig - bSig;
917 zExp = aExp;
918 normalizeRoundAndPack:
919 --zExp;
920 return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
921
922 }
923
924 /*
925 -------------------------------------------------------------------------------
926 Returns the result of adding the single-precision floating-point values `a'
927 and `b'. The operation is performed according to the IEC/IEEE Standard for
928 Binary Floating-Point Arithmetic.
929 -------------------------------------------------------------------------------
930 */
float32_add(float32 a,float32 b)931 float32 float32_add( float32 a, float32 b )
932 {
933 flag aSign, bSign;
934
935 aSign = extractFloat32Sign( a );
936 bSign = extractFloat32Sign( b );
937 if ( aSign == bSign ) {
938 return addFloat32Sigs( a, b, aSign );
939 }
940 else {
941 return subFloat32Sigs( a, b, aSign );
942 }
943
944 }
945
946 /*
947 -------------------------------------------------------------------------------
948 Returns the result of subtracting the single-precision floating-point values
949 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
950 for Binary Floating-Point Arithmetic.
951 -------------------------------------------------------------------------------
952 */
float32_sub(float32 a,float32 b)953 float32 float32_sub( float32 a, float32 b )
954 {
955 flag aSign, bSign;
956
957 aSign = extractFloat32Sign( a );
958 bSign = extractFloat32Sign( b );
959 if ( aSign == bSign ) {
960 return subFloat32Sigs( a, b, aSign );
961 }
962 else {
963 return addFloat32Sigs( a, b, aSign );
964 }
965
966 }
967
968 /*
969 -------------------------------------------------------------------------------
970 Returns the result of multiplying the single-precision floating-point values
971 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
972 for Binary Floating-Point Arithmetic.
973 -------------------------------------------------------------------------------
974 */
float32_mul(float32 a,float32 b)975 float32 float32_mul( float32 a, float32 b )
976 {
977 flag aSign, bSign, zSign;
978 int16 aExp, bExp, zExp;
979 bits32 aSig, bSig, zSig0, zSig1;
980
981 aSig = extractFloat32Frac( a );
982 aExp = extractFloat32Exp( a );
983 aSign = extractFloat32Sign( a );
984 bSig = extractFloat32Frac( b );
985 bExp = extractFloat32Exp( b );
986 bSign = extractFloat32Sign( b );
987 zSign = aSign ^ bSign;
988 if ( aExp == 0xFF ) {
989 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
990 return propagateFloat32NaN( a, b );
991 }
992 if ( ( bExp | bSig ) == 0 ) {
993 float_raise( float_flag_invalid );
994 return float32_default_nan;
995 }
996 return packFloat32( zSign, 0xFF, 0 );
997 }
998 if ( bExp == 0xFF ) {
999 if ( bSig ) return propagateFloat32NaN( a, b );
1000 if ( ( aExp | aSig ) == 0 ) {
1001 float_raise( float_flag_invalid );
1002 return float32_default_nan;
1003 }
1004 return packFloat32( zSign, 0xFF, 0 );
1005 }
1006 if ( aExp == 0 ) {
1007 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1008 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1009 }
1010 if ( bExp == 0 ) {
1011 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1012 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1013 }
1014 zExp = aExp + bExp - 0x7F;
1015 aSig = ( aSig | 0x00800000 )<<7;
1016 bSig = ( bSig | 0x00800000 )<<8;
1017 mul32To64( aSig, bSig, &zSig0, &zSig1 );
1018 zSig0 |= ( zSig1 != 0 );
1019 if ( 0 <= (sbits32) ( zSig0<<1 ) ) {
1020 zSig0 <<= 1;
1021 --zExp;
1022 }
1023 return roundAndPackFloat32( zSign, zExp, zSig0 );
1024
1025 }
1026
1027 /*
1028 -------------------------------------------------------------------------------
1029 Returns the result of dividing the single-precision floating-point value `a'
1030 by the corresponding value `b'. The operation is performed according to the
1031 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1032 -------------------------------------------------------------------------------
1033 */
float32_div(float32 a,float32 b)1034 float32 float32_div( float32 a, float32 b )
1035 {
1036 flag aSign, bSign, zSign;
1037 int16 aExp, bExp, zExp;
1038 bits32 aSig, bSig, zSig, rem0, rem1, term0, term1;
1039
1040 aSig = extractFloat32Frac( a );
1041 aExp = extractFloat32Exp( a );
1042 aSign = extractFloat32Sign( a );
1043 bSig = extractFloat32Frac( b );
1044 bExp = extractFloat32Exp( b );
1045 bSign = extractFloat32Sign( b );
1046 zSign = aSign ^ bSign;
1047 if ( aExp == 0xFF ) {
1048 if ( aSig ) return propagateFloat32NaN( a, b );
1049 if ( bExp == 0xFF ) {
1050 if ( bSig ) return propagateFloat32NaN( a, b );
1051 float_raise( float_flag_invalid );
1052 return float32_default_nan;
1053 }
1054 return packFloat32( zSign, 0xFF, 0 );
1055 }
1056 if ( bExp == 0xFF ) {
1057 if ( bSig ) return propagateFloat32NaN( a, b );
1058 return packFloat32( zSign, 0, 0 );
1059 }
1060 if ( bExp == 0 ) {
1061 if ( bSig == 0 ) {
1062 if ( ( aExp | aSig ) == 0 ) {
1063 float_raise( float_flag_invalid );
1064 return float32_default_nan;
1065 }
1066 float_raise( float_flag_divbyzero );
1067 return packFloat32( zSign, 0xFF, 0 );
1068 }
1069 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1070 }
1071 if ( aExp == 0 ) {
1072 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1073 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1074 }
1075 zExp = aExp - bExp + 0x7D;
1076 aSig = ( aSig | 0x00800000 )<<7;
1077 bSig = ( bSig | 0x00800000 )<<8;
1078 if ( bSig <= ( aSig + aSig ) ) {
1079 aSig >>= 1;
1080 ++zExp;
1081 }
1082 zSig = estimateDiv64To32( aSig, 0, bSig );
1083 if ( ( zSig & 0x3F ) <= 2 ) {
1084 mul32To64( bSig, zSig, &term0, &term1 );
1085 sub64( aSig, 0, term0, term1, &rem0, &rem1 );
1086 while ( (sbits32) rem0 < 0 ) {
1087 --zSig;
1088 add64( rem0, rem1, 0, bSig, &rem0, &rem1 );
1089 }
1090 zSig |= ( rem1 != 0 );
1091 }
1092 return roundAndPackFloat32( zSign, zExp, zSig );
1093
1094 }
1095
1096 #ifndef SOFTFLOAT_FOR_GCC
1097 /*
1098 -------------------------------------------------------------------------------
1099 Returns the remainder of the single-precision floating-point value `a'
1100 with respect to the corresponding value `b'. The operation is performed
1101 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1102 -------------------------------------------------------------------------------
1103 */
float32_rem(float32 a,float32 b)1104 float32 float32_rem( float32 a, float32 b )
1105 {
1106 flag aSign, bSign, zSign;
1107 int16 aExp, bExp, expDiff;
1108 bits32 aSig, bSig, q, allZero, alternateASig;
1109 sbits32 sigMean;
1110
1111 aSig = extractFloat32Frac( a );
1112 aExp = extractFloat32Exp( a );
1113 aSign = extractFloat32Sign( a );
1114 bSig = extractFloat32Frac( b );
1115 bExp = extractFloat32Exp( b );
1116 bSign = extractFloat32Sign( b );
1117 if ( aExp == 0xFF ) {
1118 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1119 return propagateFloat32NaN( a, b );
1120 }
1121 float_raise( float_flag_invalid );
1122 return float32_default_nan;
1123 }
1124 if ( bExp == 0xFF ) {
1125 if ( bSig ) return propagateFloat32NaN( a, b );
1126 return a;
1127 }
1128 if ( bExp == 0 ) {
1129 if ( bSig == 0 ) {
1130 float_raise( float_flag_invalid );
1131 return float32_default_nan;
1132 }
1133 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1134 }
1135 if ( aExp == 0 ) {
1136 if ( aSig == 0 ) return a;
1137 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1138 }
1139 expDiff = aExp - bExp;
1140 aSig = ( aSig | 0x00800000 )<<8;
1141 bSig = ( bSig | 0x00800000 )<<8;
1142 if ( expDiff < 0 ) {
1143 if ( expDiff < -1 ) return a;
1144 aSig >>= 1;
1145 }
1146 q = ( bSig <= aSig );
1147 if ( q ) aSig -= bSig;
1148 expDiff -= 32;
1149 while ( 0 < expDiff ) {
1150 q = estimateDiv64To32( aSig, 0, bSig );
1151 q = ( 2 < q ) ? q - 2 : 0;
1152 aSig = - ( ( bSig>>2 ) * q );
1153 expDiff -= 30;
1154 }
1155 expDiff += 32;
1156 if ( 0 < expDiff ) {
1157 q = estimateDiv64To32( aSig, 0, bSig );
1158 q = ( 2 < q ) ? q - 2 : 0;
1159 q >>= 32 - expDiff;
1160 bSig >>= 2;
1161 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1162 }
1163 else {
1164 aSig >>= 2;
1165 bSig >>= 2;
1166 }
1167 do {
1168 alternateASig = aSig;
1169 ++q;
1170 aSig -= bSig;
1171 } while ( 0 <= (sbits32) aSig );
1172 sigMean = aSig + alternateASig;
1173 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1174 aSig = alternateASig;
1175 }
1176 zSign = ( (sbits32) aSig < 0 );
1177 if ( zSign ) aSig = - aSig;
1178 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
1179
1180 }
1181 #endif
1182
1183 #ifndef SOFTFLOAT_FOR_GCC
1184 /*
1185 -------------------------------------------------------------------------------
1186 Returns the square root of the single-precision floating-point value `a'.
1187 The operation is performed according to the IEC/IEEE Standard for Binary
1188 Floating-Point Arithmetic.
1189 -------------------------------------------------------------------------------
1190 */
float32_sqrt(float32 a)1191 float32 float32_sqrt( float32 a )
1192 {
1193 flag aSign;
1194 int16 aExp, zExp;
1195 bits32 aSig, zSig, rem0, rem1, term0, term1;
1196
1197 aSig = extractFloat32Frac( a );
1198 aExp = extractFloat32Exp( a );
1199 aSign = extractFloat32Sign( a );
1200 if ( aExp == 0xFF ) {
1201 if ( aSig ) return propagateFloat32NaN( a, 0 );
1202 if ( ! aSign ) return a;
1203 float_raise( float_flag_invalid );
1204 return float32_default_nan;
1205 }
1206 if ( aSign ) {
1207 if ( ( aExp | aSig ) == 0 ) return a;
1208 float_raise( float_flag_invalid );
1209 return float32_default_nan;
1210 }
1211 if ( aExp == 0 ) {
1212 if ( aSig == 0 ) return 0;
1213 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1214 }
1215 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1216 aSig = ( aSig | 0x00800000 )<<8;
1217 zSig = estimateSqrt32( aExp, aSig ) + 2;
1218 if ( ( zSig & 0x7F ) <= 5 ) {
1219 if ( zSig < 2 ) {
1220 zSig = 0x7FFFFFFF;
1221 goto roundAndPack;
1222 }
1223 else {
1224 aSig >>= aExp & 1;
1225 mul32To64( zSig, zSig, &term0, &term1 );
1226 sub64( aSig, 0, term0, term1, &rem0, &rem1 );
1227 while ( (sbits32) rem0 < 0 ) {
1228 --zSig;
1229 shortShift64Left( 0, zSig, 1, &term0, &term1 );
1230 term1 |= 1;
1231 add64( rem0, rem1, term0, term1, &rem0, &rem1 );
1232 }
1233 zSig |= ( ( rem0 | rem1 ) != 0 );
1234 }
1235 }
1236 shift32RightJamming( zSig, 1, &zSig );
1237 roundAndPack:
1238 return roundAndPackFloat32( 0, zExp, zSig );
1239
1240 }
1241 #endif
1242
1243 /*
1244 -------------------------------------------------------------------------------
1245 Returns 1 if the single-precision floating-point value `a' is equal to
1246 the corresponding value `b', and 0 otherwise. The comparison is performed
1247 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1248 -------------------------------------------------------------------------------
1249 */
float32_eq(float32 a,float32 b)1250 flag float32_eq( float32 a, float32 b )
1251 {
1252
1253 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1254 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1255 ) {
1256 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1257 float_raise( float_flag_invalid );
1258 }
1259 return 0;
1260 }
1261 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1262
1263 }
1264
1265 /*
1266 -------------------------------------------------------------------------------
1267 Returns 1 if the single-precision floating-point value `a' is less than
1268 or equal to the corresponding value `b', and 0 otherwise. The comparison
1269 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1270 Arithmetic.
1271 -------------------------------------------------------------------------------
1272 */
float32_le(float32 a,float32 b)1273 flag float32_le( float32 a, float32 b )
1274 {
1275 flag aSign, bSign;
1276
1277 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1278 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1279 ) {
1280 float_raise( float_flag_invalid );
1281 return 0;
1282 }
1283 aSign = extractFloat32Sign( a );
1284 bSign = extractFloat32Sign( b );
1285 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1286 return ( a == b ) || ( aSign ^ ( a < b ) );
1287
1288 }
1289
1290 /*
1291 -------------------------------------------------------------------------------
1292 Returns 1 if the single-precision floating-point value `a' is less than
1293 the corresponding value `b', and 0 otherwise. The comparison is performed
1294 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1295 -------------------------------------------------------------------------------
1296 */
float32_lt(float32 a,float32 b)1297 flag float32_lt( float32 a, float32 b )
1298 {
1299 flag aSign, bSign;
1300
1301 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1302 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1303 ) {
1304 float_raise( float_flag_invalid );
1305 return 0;
1306 }
1307 aSign = extractFloat32Sign( a );
1308 bSign = extractFloat32Sign( b );
1309 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1310 return ( a != b ) && ( aSign ^ ( a < b ) );
1311
1312 }
1313
1314 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1315 /*
1316 -------------------------------------------------------------------------------
1317 Returns 1 if the single-precision floating-point value `a' is equal to
1318 the corresponding value `b', and 0 otherwise. The invalid exception is
1319 raised if either operand is a NaN. Otherwise, the comparison is performed
1320 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1321 -------------------------------------------------------------------------------
1322 */
float32_eq_signaling(float32 a,float32 b)1323 flag float32_eq_signaling( float32 a, float32 b )
1324 {
1325
1326 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1327 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1328 ) {
1329 float_raise( float_flag_invalid );
1330 return 0;
1331 }
1332 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1333
1334 }
1335
1336 /*
1337 -------------------------------------------------------------------------------
1338 Returns 1 if the single-precision floating-point value `a' is less than or
1339 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
1340 cause an exception. Otherwise, the comparison is performed according to the
1341 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1342 -------------------------------------------------------------------------------
1343 */
float32_le_quiet(float32 a,float32 b)1344 flag float32_le_quiet( float32 a, float32 b )
1345 {
1346 flag aSign, bSign;
1347 int16 aExp, bExp;
1348
1349 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1350 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1351 ) {
1352 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1353 float_raise( float_flag_invalid );
1354 }
1355 return 0;
1356 }
1357 aSign = extractFloat32Sign( a );
1358 bSign = extractFloat32Sign( b );
1359 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1360 return ( a == b ) || ( aSign ^ ( a < b ) );
1361
1362 }
1363
1364 /*
1365 -------------------------------------------------------------------------------
1366 Returns 1 if the single-precision floating-point value `a' is less than
1367 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
1368 exception. Otherwise, the comparison is performed according to the IEC/IEEE
1369 Standard for Binary Floating-Point Arithmetic.
1370 -------------------------------------------------------------------------------
1371 */
float32_lt_quiet(float32 a,float32 b)1372 flag float32_lt_quiet( float32 a, float32 b )
1373 {
1374 flag aSign, bSign;
1375
1376 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1377 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1378 ) {
1379 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1380 float_raise( float_flag_invalid );
1381 }
1382 return 0;
1383 }
1384 aSign = extractFloat32Sign( a );
1385 bSign = extractFloat32Sign( b );
1386 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1387 return ( a != b ) && ( aSign ^ ( a < b ) );
1388
1389 }
1390 #endif /* !SOFTFLOAT_FOR_GCC */
1391
1392 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1393 /*
1394 -------------------------------------------------------------------------------
1395 Returns the result of converting the double-precision floating-point value
1396 `a' to the 32-bit two's complement integer format. The conversion is
1397 performed according to the IEC/IEEE Standard for Binary Floating-Point
1398 Arithmetic---which means in particular that the conversion is rounded
1399 according to the current rounding mode. If `a' is a NaN, the largest
1400 positive integer is returned. Otherwise, if the conversion overflows, the
1401 largest integer with the same sign as `a' is returned.
1402 -------------------------------------------------------------------------------
1403 */
float64_to_int32(float64 a)1404 int32 float64_to_int32( float64 a )
1405 {
1406 flag aSign;
1407 int16 aExp, shiftCount;
1408 bits32 aSig0, aSig1, absZ, aSigExtra;
1409 int32 z;
1410 int8 roundingMode;
1411
1412 aSig1 = extractFloat64Frac1( a );
1413 aSig0 = extractFloat64Frac0( a );
1414 aExp = extractFloat64Exp( a );
1415 aSign = extractFloat64Sign( a );
1416 shiftCount = aExp - 0x413;
1417 if ( 0 <= shiftCount ) {
1418 if ( 0x41E < aExp ) {
1419 if ( ( aExp == 0x7FF ) && ( aSig0 | aSig1 ) ) aSign = 0;
1420 goto invalid;
1421 }
1422 shortShift64Left(
1423 aSig0 | 0x00100000, aSig1, shiftCount, &absZ, &aSigExtra );
1424 if ( 0x80000000 < absZ ) goto invalid;
1425 }
1426 else {
1427 aSig1 = ( aSig1 != 0 );
1428 if ( aExp < 0x3FE ) {
1429 aSigExtra = aExp | aSig0 | aSig1;
1430 absZ = 0;
1431 }
1432 else {
1433 aSig0 |= 0x00100000;
1434 aSigExtra = ( aSig0<<( shiftCount & 31 ) ) | aSig1;
1435 absZ = aSig0>>( - shiftCount );
1436 }
1437 }
1438 roundingMode = float_rounding_mode;
1439 if ( roundingMode == float_round_nearest_even ) {
1440 if ( (sbits32) aSigExtra < 0 ) {
1441 ++absZ;
1442 if ( (bits32) ( aSigExtra<<1 ) == 0 ) absZ &= ~1;
1443 }
1444 z = aSign ? - absZ : absZ;
1445 }
1446 else {
1447 aSigExtra = ( aSigExtra != 0 );
1448 if ( aSign ) {
1449 z = - ( absZ
1450 + ( ( roundingMode == float_round_down ) & aSigExtra ) );
1451 }
1452 else {
1453 z = absZ + ( ( roundingMode == float_round_up ) & aSigExtra );
1454 }
1455 }
1456 if ( ( aSign ^ ( z < 0 ) ) && z ) {
1457 invalid:
1458 float_raise( float_flag_invalid );
1459 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
1460 }
1461 if ( aSigExtra ) float_exception_flags |= float_flag_inexact;
1462 return z;
1463
1464 }
1465 #endif /* !SOFTFLOAT_FOR_GCC */
1466
1467 /*
1468 -------------------------------------------------------------------------------
1469 Returns the result of converting the double-precision floating-point value
1470 `a' to the 32-bit two's complement integer format. The conversion is
1471 performed according to the IEC/IEEE Standard for Binary Floating-Point
1472 Arithmetic, except that the conversion is always rounded toward zero.
1473 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1474 the conversion overflows, the largest integer with the same sign as `a' is
1475 returned.
1476 -------------------------------------------------------------------------------
1477 */
float64_to_int32_round_to_zero(float64 a)1478 int32 float64_to_int32_round_to_zero( float64 a )
1479 {
1480 flag aSign;
1481 int16 aExp, shiftCount;
1482 bits32 aSig0, aSig1, absZ, aSigExtra;
1483 int32 z;
1484
1485 aSig1 = extractFloat64Frac1( a );
1486 aSig0 = extractFloat64Frac0( a );
1487 aExp = extractFloat64Exp( a );
1488 aSign = extractFloat64Sign( a );
1489 shiftCount = aExp - 0x413;
1490 if ( 0 <= shiftCount ) {
1491 if ( 0x41E < aExp ) {
1492 if ( ( aExp == 0x7FF ) && ( aSig0 | aSig1 ) ) aSign = 0;
1493 goto invalid;
1494 }
1495 shortShift64Left(
1496 aSig0 | 0x00100000, aSig1, shiftCount, &absZ, &aSigExtra );
1497 }
1498 else {
1499 if ( aExp < 0x3FF ) {
1500 if ( aExp | aSig0 | aSig1 ) {
1501 float_exception_flags |= float_flag_inexact;
1502 }
1503 return 0;
1504 }
1505 aSig0 |= 0x00100000;
1506 aSigExtra = ( aSig0<<( shiftCount & 31 ) ) | aSig1;
1507 absZ = aSig0>>( - shiftCount );
1508 }
1509 z = aSign ? - absZ : absZ;
1510 if ( ( aSign ^ ( z < 0 ) ) && z ) {
1511 invalid:
1512 float_raise( float_flag_invalid );
1513 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
1514 }
1515 if ( aSigExtra ) float_exception_flags |= float_flag_inexact;
1516 return z;
1517
1518 }
1519
1520 /*
1521 -------------------------------------------------------------------------------
1522 Returns the result of converting the double-precision floating-point value
1523 `a' to the single-precision floating-point format. The conversion is
1524 performed according to the IEC/IEEE Standard for Binary Floating-Point
1525 Arithmetic.
1526 -------------------------------------------------------------------------------
1527 */
float64_to_float32(float64 a)1528 float32 float64_to_float32( float64 a )
1529 {
1530 flag aSign;
1531 int16 aExp;
1532 bits32 aSig0, aSig1, zSig;
1533 bits32 allZero;
1534
1535 aSig1 = extractFloat64Frac1( a );
1536 aSig0 = extractFloat64Frac0( a );
1537 aExp = extractFloat64Exp( a );
1538 aSign = extractFloat64Sign( a );
1539 if ( aExp == 0x7FF ) {
1540 if ( aSig0 | aSig1 ) {
1541 return commonNaNToFloat32( float64ToCommonNaN( a ) );
1542 }
1543 return packFloat32( aSign, 0xFF, 0 );
1544 }
1545 shift64RightJamming( aSig0, aSig1, 22, &allZero, &zSig );
1546 if ( aExp ) zSig |= 0x40000000;
1547 return roundAndPackFloat32( aSign, aExp - 0x381, zSig );
1548
1549 }
1550
1551 #ifndef SOFTFLOAT_FOR_GCC
1552 /*
1553 -------------------------------------------------------------------------------
1554 Rounds the double-precision floating-point value `a' to an integer,
1555 and returns the result as a double-precision floating-point value. The
1556 operation is performed according to the IEC/IEEE Standard for Binary
1557 Floating-Point Arithmetic.
1558 -------------------------------------------------------------------------------
1559 */
float64_round_to_int(float64 a)1560 float64 float64_round_to_int( float64 a )
1561 {
1562 flag aSign;
1563 int16 aExp;
1564 bits32 lastBitMask, roundBitsMask;
1565 int8 roundingMode;
1566 float64 z;
1567
1568 aExp = extractFloat64Exp( a );
1569 if ( 0x413 <= aExp ) {
1570 if ( 0x433 <= aExp ) {
1571 if ( ( aExp == 0x7FF )
1572 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) ) {
1573 return propagateFloat64NaN( a, a );
1574 }
1575 return a;
1576 }
1577 lastBitMask = 1;
1578 lastBitMask = ( lastBitMask<<( 0x432 - aExp ) )<<1;
1579 roundBitsMask = lastBitMask - 1;
1580 z = a;
1581 roundingMode = float_rounding_mode;
1582 if ( roundingMode == float_round_nearest_even ) {
1583 if ( lastBitMask ) {
1584 add64( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
1585 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
1586 }
1587 else {
1588 if ( (sbits32) z.low < 0 ) {
1589 ++z.high;
1590 if ( (bits32) ( z.low<<1 ) == 0 ) z.high &= ~1;
1591 }
1592 }
1593 }
1594 else if ( roundingMode != float_round_to_zero ) {
1595 if ( extractFloat64Sign( z )
1596 ^ ( roundingMode == float_round_up ) ) {
1597 add64( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
1598 }
1599 }
1600 z.low &= ~ roundBitsMask;
1601 }
1602 else {
1603 if ( aExp <= 0x3FE ) {
1604 if ( ( ( (bits32) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
1605 float_exception_flags |= float_flag_inexact;
1606 aSign = extractFloat64Sign( a );
1607 switch ( float_rounding_mode ) {
1608 case float_round_nearest_even:
1609 if ( ( aExp == 0x3FE )
1610 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) )
1611 ) {
1612 return packFloat64( aSign, 0x3FF, 0, 0 );
1613 }
1614 break;
1615 case float_round_down:
1616 return
1617 aSign ? packFloat64( 1, 0x3FF, 0, 0 )
1618 : packFloat64( 0, 0, 0, 0 );
1619 case float_round_up:
1620 return
1621 aSign ? packFloat64( 1, 0, 0, 0 )
1622 : packFloat64( 0, 0x3FF, 0, 0 );
1623 }
1624 return packFloat64( aSign, 0, 0, 0 );
1625 }
1626 lastBitMask = 1;
1627 lastBitMask <<= 0x413 - aExp;
1628 roundBitsMask = lastBitMask - 1;
1629 z.low = 0;
1630 z.high = a.high;
1631 roundingMode = float_rounding_mode;
1632 if ( roundingMode == float_round_nearest_even ) {
1633 z.high += lastBitMask>>1;
1634 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
1635 z.high &= ~ lastBitMask;
1636 }
1637 }
1638 else if ( roundingMode != float_round_to_zero ) {
1639 if ( extractFloat64Sign( z )
1640 ^ ( roundingMode == float_round_up ) ) {
1641 z.high |= ( a.low != 0 );
1642 z.high += roundBitsMask;
1643 }
1644 }
1645 z.high &= ~ roundBitsMask;
1646 }
1647 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
1648 float_exception_flags |= float_flag_inexact;
1649 }
1650 return z;
1651
1652 }
1653 #endif
1654
1655 /*
1656 -------------------------------------------------------------------------------
1657 Returns the result of adding the absolute values of the double-precision
1658 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1659 before being returned. `zSign' is ignored if the result is a NaN.
1660 The addition is performed according to the IEC/IEEE Standard for Binary
1661 Floating-Point Arithmetic.
1662 -------------------------------------------------------------------------------
1663 */
addFloat64Sigs(float64 a,float64 b,flag zSign)1664 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
1665 {
1666 int16 aExp, bExp, zExp;
1667 bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
1668 int16 expDiff;
1669
1670 aSig1 = extractFloat64Frac1( a );
1671 aSig0 = extractFloat64Frac0( a );
1672 aExp = extractFloat64Exp( a );
1673 bSig1 = extractFloat64Frac1( b );
1674 bSig0 = extractFloat64Frac0( b );
1675 bExp = extractFloat64Exp( b );
1676 expDiff = aExp - bExp;
1677 if ( 0 < expDiff ) {
1678 if ( aExp == 0x7FF ) {
1679 if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, b );
1680 return a;
1681 }
1682 if ( bExp == 0 ) {
1683 --expDiff;
1684 }
1685 else {
1686 bSig0 |= 0x00100000;
1687 }
1688 shift64ExtraRightJamming(
1689 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
1690 zExp = aExp;
1691 }
1692 else if ( expDiff < 0 ) {
1693 if ( bExp == 0x7FF ) {
1694 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
1695 return packFloat64( zSign, 0x7FF, 0, 0 );
1696 }
1697 if ( aExp == 0 ) {
1698 ++expDiff;
1699 }
1700 else {
1701 aSig0 |= 0x00100000;
1702 }
1703 shift64ExtraRightJamming(
1704 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
1705 zExp = bExp;
1706 }
1707 else {
1708 if ( aExp == 0x7FF ) {
1709 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
1710 return propagateFloat64NaN( a, b );
1711 }
1712 return a;
1713 }
1714 add64( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
1715 if ( aExp == 0 ) return packFloat64( zSign, 0, zSig0, zSig1 );
1716 zSig2 = 0;
1717 zSig0 |= 0x00200000;
1718 zExp = aExp;
1719 goto shiftRight1;
1720 }
1721 aSig0 |= 0x00100000;
1722 add64( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
1723 --zExp;
1724 if ( zSig0 < 0x00200000 ) goto roundAndPack;
1725 ++zExp;
1726 shiftRight1:
1727 shift64ExtraRightJamming( zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
1728 roundAndPack:
1729 return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 );
1730
1731 }
1732
1733 /*
1734 -------------------------------------------------------------------------------
1735 Returns the result of subtracting the absolute values of the double-
1736 precision floating-point values `a' and `b'. If `zSign' is 1, the
1737 difference is negated before being returned. `zSign' is ignored if the
1738 result is a NaN. The subtraction is performed according to the IEC/IEEE
1739 Standard for Binary Floating-Point Arithmetic.
1740 -------------------------------------------------------------------------------
1741 */
subFloat64Sigs(float64 a,float64 b,flag zSign)1742 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
1743 {
1744 int16 aExp, bExp, zExp;
1745 bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
1746 int16 expDiff;
1747
1748 aSig1 = extractFloat64Frac1( a );
1749 aSig0 = extractFloat64Frac0( a );
1750 aExp = extractFloat64Exp( a );
1751 bSig1 = extractFloat64Frac1( b );
1752 bSig0 = extractFloat64Frac0( b );
1753 bExp = extractFloat64Exp( b );
1754 expDiff = aExp - bExp;
1755 shortShift64Left( aSig0, aSig1, 10, &aSig0, &aSig1 );
1756 shortShift64Left( bSig0, bSig1, 10, &bSig0, &bSig1 );
1757 if ( 0 < expDiff ) goto aExpBigger;
1758 if ( expDiff < 0 ) goto bExpBigger;
1759 if ( aExp == 0x7FF ) {
1760 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
1761 return propagateFloat64NaN( a, b );
1762 }
1763 float_raise( float_flag_invalid );
1764 return float64_default_nan;
1765 }
1766 if ( aExp == 0 ) {
1767 aExp = 1;
1768 bExp = 1;
1769 }
1770 if ( bSig0 < aSig0 ) goto aBigger;
1771 if ( aSig0 < bSig0 ) goto bBigger;
1772 if ( bSig1 < aSig1 ) goto aBigger;
1773 if ( aSig1 < bSig1 ) goto bBigger;
1774 return packFloat64( float_rounding_mode == float_round_down, 0, 0, 0 );
1775 bExpBigger:
1776 if ( bExp == 0x7FF ) {
1777 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
1778 return packFloat64( zSign ^ 1, 0x7FF, 0, 0 );
1779 }
1780 if ( aExp == 0 ) {
1781 ++expDiff;
1782 }
1783 else {
1784 aSig0 |= 0x40000000;
1785 }
1786 shift64RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
1787 bSig0 |= 0x40000000;
1788 bBigger:
1789 sub64( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
1790 zExp = bExp;
1791 zSign ^= 1;
1792 goto normalizeRoundAndPack;
1793 aExpBigger:
1794 if ( aExp == 0x7FF ) {
1795 if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, b );
1796 return a;
1797 }
1798 if ( bExp == 0 ) {
1799 --expDiff;
1800 }
1801 else {
1802 bSig0 |= 0x40000000;
1803 }
1804 shift64RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
1805 aSig0 |= 0x40000000;
1806 aBigger:
1807 sub64( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
1808 zExp = aExp;
1809 normalizeRoundAndPack:
1810 --zExp;
1811 return normalizeRoundAndPackFloat64( zSign, zExp - 10, zSig0, zSig1 );
1812
1813 }
1814
1815 /*
1816 -------------------------------------------------------------------------------
1817 Returns the result of adding the double-precision floating-point values `a'
1818 and `b'. The operation is performed according to the IEC/IEEE Standard for
1819 Binary Floating-Point Arithmetic.
1820 -------------------------------------------------------------------------------
1821 */
float64_add(float64 a,float64 b)1822 float64 float64_add( float64 a, float64 b )
1823 {
1824 flag aSign, bSign;
1825
1826 aSign = extractFloat64Sign( a );
1827 bSign = extractFloat64Sign( b );
1828 if ( aSign == bSign ) {
1829 return addFloat64Sigs( a, b, aSign );
1830 }
1831 else {
1832 return subFloat64Sigs( a, b, aSign );
1833 }
1834
1835 }
1836
1837 /*
1838 -------------------------------------------------------------------------------
1839 Returns the result of subtracting the double-precision floating-point values
1840 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1841 for Binary Floating-Point Arithmetic.
1842 -------------------------------------------------------------------------------
1843 */
float64_sub(float64 a,float64 b)1844 float64 float64_sub( float64 a, float64 b )
1845 {
1846 flag aSign, bSign;
1847
1848 aSign = extractFloat64Sign( a );
1849 bSign = extractFloat64Sign( b );
1850 if ( aSign == bSign ) {
1851 return subFloat64Sigs( a, b, aSign );
1852 }
1853 else {
1854 return addFloat64Sigs( a, b, aSign );
1855 }
1856
1857 }
1858
1859 /*
1860 -------------------------------------------------------------------------------
1861 Returns the result of multiplying the double-precision floating-point values
1862 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1863 for Binary Floating-Point Arithmetic.
1864 -------------------------------------------------------------------------------
1865 */
float64_mul(float64 a,float64 b)1866 float64 float64_mul( float64 a, float64 b )
1867 {
1868 flag aSign, bSign, zSign;
1869 int16 aExp, bExp, zExp;
1870 bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
1871
1872 aSig1 = extractFloat64Frac1( a );
1873 aSig0 = extractFloat64Frac0( a );
1874 aExp = extractFloat64Exp( a );
1875 aSign = extractFloat64Sign( a );
1876 bSig1 = extractFloat64Frac1( b );
1877 bSig0 = extractFloat64Frac0( b );
1878 bExp = extractFloat64Exp( b );
1879 bSign = extractFloat64Sign( b );
1880 zSign = aSign ^ bSign;
1881 if ( aExp == 0x7FF ) {
1882 if ( ( aSig0 | aSig1 )
1883 || ( ( bExp == 0x7FF ) && ( bSig0 | bSig1 ) ) ) {
1884 return propagateFloat64NaN( a, b );
1885 }
1886 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
1887 return packFloat64( zSign, 0x7FF, 0, 0 );
1888 }
1889 if ( bExp == 0x7FF ) {
1890 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
1891 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
1892 invalid:
1893 float_raise( float_flag_invalid );
1894 return float64_default_nan;
1895 }
1896 return packFloat64( zSign, 0x7FF, 0, 0 );
1897 }
1898 if ( aExp == 0 ) {
1899 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat64( zSign, 0, 0, 0 );
1900 normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
1901 }
1902 if ( bExp == 0 ) {
1903 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat64( zSign, 0, 0, 0 );
1904 normalizeFloat64Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
1905 }
1906 zExp = aExp + bExp - 0x400;
1907 aSig0 |= 0x00100000;
1908 shortShift64Left( bSig0, bSig1, 12, &bSig0, &bSig1 );
1909 mul64To128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
1910 add64( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
1911 zSig2 |= ( zSig3 != 0 );
1912 if ( 0x00200000 <= zSig0 ) {
1913 shift64ExtraRightJamming(
1914 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
1915 ++zExp;
1916 }
1917 return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 );
1918
1919 }
1920
1921 /*
1922 -------------------------------------------------------------------------------
1923 Returns the result of dividing the double-precision floating-point value `a'
1924 by the corresponding value `b'. The operation is performed according to the
1925 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1926 -------------------------------------------------------------------------------
1927 */
float64_div(float64 a,float64 b)1928 float64 float64_div( float64 a, float64 b )
1929 {
1930 flag aSign, bSign, zSign;
1931 int16 aExp, bExp, zExp;
1932 bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
1933 bits32 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
1934
1935 aSig1 = extractFloat64Frac1( a );
1936 aSig0 = extractFloat64Frac0( a );
1937 aExp = extractFloat64Exp( a );
1938 aSign = extractFloat64Sign( a );
1939 bSig1 = extractFloat64Frac1( b );
1940 bSig0 = extractFloat64Frac0( b );
1941 bExp = extractFloat64Exp( b );
1942 bSign = extractFloat64Sign( b );
1943 zSign = aSign ^ bSign;
1944 if ( aExp == 0x7FF ) {
1945 if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, b );
1946 if ( bExp == 0x7FF ) {
1947 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
1948 goto invalid;
1949 }
1950 return packFloat64( zSign, 0x7FF, 0, 0 );
1951 }
1952 if ( bExp == 0x7FF ) {
1953 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
1954 return packFloat64( zSign, 0, 0, 0 );
1955 }
1956 if ( bExp == 0 ) {
1957 if ( ( bSig0 | bSig1 ) == 0 ) {
1958 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
1959 invalid:
1960 float_raise( float_flag_invalid );
1961 return float64_default_nan;
1962 }
1963 float_raise( float_flag_divbyzero );
1964 return packFloat64( zSign, 0x7FF, 0, 0 );
1965 }
1966 normalizeFloat64Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
1967 }
1968 if ( aExp == 0 ) {
1969 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat64( zSign, 0, 0, 0 );
1970 normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
1971 }
1972 zExp = aExp - bExp + 0x3FD;
1973 shortShift64Left( aSig0 | 0x00100000, aSig1, 11, &aSig0, &aSig1 );
1974 shortShift64Left( bSig0 | 0x00100000, bSig1, 11, &bSig0, &bSig1 );
1975 if ( le64( bSig0, bSig1, aSig0, aSig1 ) ) {
1976 shift64Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
1977 ++zExp;
1978 }
1979 zSig0 = estimateDiv64To32( aSig0, aSig1, bSig0 );
1980 mul64By32To96( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
1981 sub96( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
1982 while ( (sbits32) rem0 < 0 ) {
1983 --zSig0;
1984 add96( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
1985 }
1986 zSig1 = estimateDiv64To32( rem1, rem2, bSig0 );
1987 if ( ( zSig1 & 0x3FF ) <= 4 ) {
1988 mul64By32To96( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
1989 sub96( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
1990 while ( (sbits32) rem1 < 0 ) {
1991 --zSig1;
1992 add96( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
1993 }
1994 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
1995 }
1996 shift64ExtraRightJamming( zSig0, zSig1, 0, 11, &zSig0, &zSig1, &zSig2 );
1997 return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 );
1998
1999 }
2000
2001 #ifndef SOFTFLOAT_FOR_GCC
2002 /*
2003 -------------------------------------------------------------------------------
2004 Returns the remainder of the double-precision floating-point value `a'
2005 with respect to the corresponding value `b'. The operation is performed
2006 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2007 -------------------------------------------------------------------------------
2008 */
float64_rem(float64 a,float64 b)2009 float64 float64_rem( float64 a, float64 b )
2010 {
2011 flag aSign, bSign, zSign;
2012 int16 aExp, bExp, expDiff;
2013 bits32 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
2014 bits32 allZero, alternateASig0, alternateASig1, sigMean1;
2015 sbits32 sigMean0;
2016 float64 z;
2017
2018 aSig1 = extractFloat64Frac1( a );
2019 aSig0 = extractFloat64Frac0( a );
2020 aExp = extractFloat64Exp( a );
2021 aSign = extractFloat64Sign( a );
2022 bSig1 = extractFloat64Frac1( b );
2023 bSig0 = extractFloat64Frac0( b );
2024 bExp = extractFloat64Exp( b );
2025 bSign = extractFloat64Sign( b );
2026 if ( aExp == 0x7FF ) {
2027 if ( ( aSig0 | aSig1 )
2028 || ( ( bExp == 0x7FF ) && ( bSig0 | bSig1 ) ) ) {
2029 return propagateFloat64NaN( a, b );
2030 }
2031 goto invalid;
2032 }
2033 if ( bExp == 0x7FF ) {
2034 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
2035 return a;
2036 }
2037 if ( bExp == 0 ) {
2038 if ( ( bSig0 | bSig1 ) == 0 ) {
2039 invalid:
2040 float_raise( float_flag_invalid );
2041 return float64_default_nan;
2042 }
2043 normalizeFloat64Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
2044 }
2045 if ( aExp == 0 ) {
2046 if ( ( aSig0 | aSig1 ) == 0 ) return a;
2047 normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
2048 }
2049 expDiff = aExp - bExp;
2050 if ( expDiff < -1 ) return a;
2051 shortShift64Left(
2052 aSig0 | 0x00100000, aSig1, 11 - ( expDiff < 0 ), &aSig0, &aSig1 );
2053 shortShift64Left( bSig0 | 0x00100000, bSig1, 11, &bSig0, &bSig1 );
2054 q = le64( bSig0, bSig1, aSig0, aSig1 );
2055 if ( q ) sub64( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
2056 expDiff -= 32;
2057 while ( 0 < expDiff ) {
2058 q = estimateDiv64To32( aSig0, aSig1, bSig0 );
2059 q = ( 4 < q ) ? q - 4 : 0;
2060 mul64By32To96( bSig0, bSig1, q, &term0, &term1, &term2 );
2061 shortShift96Left( term0, term1, term2, 29, &term1, &term2, &allZero );
2062 shortShift64Left( aSig0, aSig1, 29, &aSig0, &allZero );
2063 sub64( aSig0, 0, term1, term2, &aSig0, &aSig1 );
2064 expDiff -= 29;
2065 }
2066 if ( -32 < expDiff ) {
2067 q = estimateDiv64To32( aSig0, aSig1, bSig0 );
2068 q = ( 4 < q ) ? q - 4 : 0;
2069 q >>= - expDiff;
2070 shift64Right( bSig0, bSig1, 8, &bSig0, &bSig1 );
2071 expDiff += 24;
2072 if ( expDiff < 0 ) {
2073 shift64Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
2074 }
2075 else {
2076 shortShift64Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
2077 }
2078 mul64By32To96( bSig0, bSig1, q, &term0, &term1, &term2 );
2079 sub64( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
2080 }
2081 else {
2082 shift64Right( aSig0, aSig1, 8, &aSig0, &aSig1 );
2083 shift64Right( bSig0, bSig1, 8, &bSig0, &bSig1 );
2084 }
2085 do {
2086 alternateASig0 = aSig0;
2087 alternateASig1 = aSig1;
2088 ++q;
2089 sub64( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
2090 } while ( 0 <= (sbits32) aSig0 );
2091 add64(
2092 aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
2093 if ( ( sigMean0 < 0 )
2094 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
2095 aSig0 = alternateASig0;
2096 aSig1 = alternateASig1;
2097 }
2098 zSign = ( (sbits32) aSig0 < 0 );
2099 if ( zSign ) sub64( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
2100 return
2101 normalizeRoundAndPackFloat64( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
2102
2103 }
2104 #endif
2105
2106 #ifndef SOFTFLOAT_FOR_GCC
2107 /*
2108 -------------------------------------------------------------------------------
2109 Returns the square root of the double-precision floating-point value `a'.
2110 The operation is performed according to the IEC/IEEE Standard for Binary
2111 Floating-Point Arithmetic.
2112 -------------------------------------------------------------------------------
2113 */
float64_sqrt(float64 a)2114 float64 float64_sqrt( float64 a )
2115 {
2116 flag aSign;
2117 int16 aExp, zExp;
2118 bits32 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
2119 bits32 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
2120 float64 z;
2121
2122 aSig1 = extractFloat64Frac1( a );
2123 aSig0 = extractFloat64Frac0( a );
2124 aExp = extractFloat64Exp( a );
2125 aSign = extractFloat64Sign( a );
2126 if ( aExp == 0x7FF ) {
2127 if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, a );
2128 if ( ! aSign ) return a;
2129 goto invalid;
2130 }
2131 if ( aSign ) {
2132 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
2133 invalid:
2134 float_raise( float_flag_invalid );
2135 return float64_default_nan;
2136 }
2137 if ( aExp == 0 ) {
2138 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat64( 0, 0, 0, 0 );
2139 normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
2140 }
2141 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2142 aSig0 |= 0x00100000;
2143 shortShift64Left( aSig0, aSig1, 11, &term0, &term1 );
2144 zSig0 = ( estimateSqrt32( aExp, term0 )>>1 ) + 1;
2145 if ( zSig0 == 0 ) zSig0 = 0x7FFFFFFF;
2146 doubleZSig0 = zSig0 + zSig0;
2147 shortShift64Left( aSig0, aSig1, 9 - ( aExp & 1 ), &aSig0, &aSig1 );
2148 mul32To64( zSig0, zSig0, &term0, &term1 );
2149 sub64( aSig0, aSig1, term0, term1, &rem0, &rem1 );
2150 while ( (sbits32) rem0 < 0 ) {
2151 --zSig0;
2152 doubleZSig0 -= 2;
2153 add64( rem0, rem1, 0, doubleZSig0 | 1, &rem0, &rem1 );
2154 }
2155 zSig1 = estimateDiv64To32( rem1, 0, doubleZSig0 );
2156 if ( ( zSig1 & 0x1FF ) <= 5 ) {
2157 if ( zSig1 == 0 ) zSig1 = 1;
2158 mul32To64( doubleZSig0, zSig1, &term1, &term2 );
2159 sub64( rem1, 0, term1, term2, &rem1, &rem2 );
2160 mul32To64( zSig1, zSig1, &term2, &term3 );
2161 sub96( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
2162 while ( (sbits32) rem1 < 0 ) {
2163 --zSig1;
2164 shortShift64Left( 0, zSig1, 1, &term2, &term3 );
2165 term3 |= 1;
2166 term2 |= doubleZSig0;
2167 add96( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
2168 }
2169 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
2170 }
2171 shift64ExtraRightJamming( zSig0, zSig1, 0, 10, &zSig0, &zSig1, &zSig2 );
2172 return roundAndPackFloat64( 0, zExp, zSig0, zSig1, zSig2 );
2173
2174 }
2175 #endif
2176
2177 /*
2178 -------------------------------------------------------------------------------
2179 Returns 1 if the double-precision floating-point value `a' is equal to
2180 the corresponding value `b', and 0 otherwise. The comparison is performed
2181 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2182 -------------------------------------------------------------------------------
2183 */
float64_eq(float64 a,float64 b)2184 flag float64_eq( float64 a, float64 b )
2185 {
2186
2187 if ( ( ( extractFloat64Exp( a ) == 0x7FF )
2188 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2189 || ( ( extractFloat64Exp( b ) == 0x7FF )
2190 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2191 ) {
2192 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2193 float_raise( float_flag_invalid );
2194 }
2195 return 0;
2196 }
2197 return ( a == b ) ||
2198 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
2199
2200 }
2201
2202 /*
2203 -------------------------------------------------------------------------------
2204 Returns 1 if the double-precision floating-point value `a' is less than
2205 or equal to the corresponding value `b', and 0 otherwise. The comparison
2206 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2207 Arithmetic.
2208 -------------------------------------------------------------------------------
2209 */
float64_le(float64 a,float64 b)2210 flag float64_le( float64 a, float64 b )
2211 {
2212 flag aSign, bSign;
2213
2214 if ( ( ( extractFloat64Exp( a ) == 0x7FF )
2215 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2216 || ( ( extractFloat64Exp( b ) == 0x7FF )
2217 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2218 ) {
2219 float_raise( float_flag_invalid );
2220 return 0;
2221 }
2222 aSign = extractFloat64Sign( a );
2223 bSign = extractFloat64Sign( b );
2224 if ( aSign != bSign )
2225 return aSign ||
2226 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
2227 0 );
2228 return ( a == b ) ||
2229 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
2230 }
2231
2232 /*
2233 -------------------------------------------------------------------------------
2234 Returns 1 if the double-precision floating-point value `a' is less than
2235 the corresponding value `b', and 0 otherwise. The comparison is performed
2236 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2237 -------------------------------------------------------------------------------
2238 */
float64_lt(float64 a,float64 b)2239 flag float64_lt( float64 a, float64 b )
2240 {
2241 flag aSign, bSign;
2242
2243 if ( ( ( extractFloat64Exp( a ) == 0x7FF )
2244 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2245 || ( ( extractFloat64Exp( b ) == 0x7FF )
2246 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2247 ) {
2248 float_raise( float_flag_invalid );
2249 return 0;
2250 }
2251 aSign = extractFloat64Sign( a );
2252 bSign = extractFloat64Sign( b );
2253 if ( aSign != bSign )
2254 return aSign &&
2255 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
2256 0 );
2257 return ( a != b ) &&
2258 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
2259
2260 }
2261
2262 #ifndef SOFTFLOAT_FOR_GCC
2263 /*
2264 -------------------------------------------------------------------------------
2265 Returns 1 if the double-precision floating-point value `a' is equal to
2266 the corresponding value `b', and 0 otherwise. The invalid exception is
2267 raised if either operand is a NaN. Otherwise, the comparison is performed
2268 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2269 -------------------------------------------------------------------------------
2270 */
float64_eq_signaling(float64 a,float64 b)2271 flag float64_eq_signaling( float64 a, float64 b )
2272 {
2273
2274 if ( ( ( extractFloat64Exp( a ) == 0x7FF )
2275 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2276 || ( ( extractFloat64Exp( b ) == 0x7FF )
2277 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2278 ) {
2279 float_raise( float_flag_invalid );
2280 return 0;
2281 }
2282 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2283
2284 }
2285
2286 /*
2287 -------------------------------------------------------------------------------
2288 Returns 1 if the double-precision floating-point value `a' is less than or
2289 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2290 cause an exception. Otherwise, the comparison is performed according to the
2291 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2292 -------------------------------------------------------------------------------
2293 */
float64_le_quiet(float64 a,float64 b)2294 flag float64_le_quiet( float64 a, float64 b )
2295 {
2296 flag aSign, bSign;
2297
2298 if ( ( ( extractFloat64Exp( a ) == 0x7FF )
2299 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2300 || ( ( extractFloat64Exp( b ) == 0x7FF )
2301 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2302 ) {
2303 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2304 float_raise( float_flag_invalid );
2305 }
2306 return 0;
2307 }
2308 aSign = extractFloat64Sign( a );
2309 bSign = extractFloat64Sign( b );
2310 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2311 return ( a == b ) || ( aSign ^ ( a < b ) );
2312
2313 }
2314
2315 /*
2316 -------------------------------------------------------------------------------
2317 Returns 1 if the double-precision floating-point value `a' is less than
2318 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2319 exception. Otherwise, the comparison is performed according to the IEC/IEEE
2320 Standard for Binary Floating-Point Arithmetic.
2321 -------------------------------------------------------------------------------
2322 */
float64_lt_quiet(float64 a,float64 b)2323 flag float64_lt_quiet( float64 a, float64 b )
2324 {
2325 flag aSign, bSign;
2326
2327 if ( ( ( extractFloat64Exp( a ) == 0x7FF )
2328 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2329 || ( ( extractFloat64Exp( b ) == 0x7FF )
2330 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2331 ) {
2332 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2333 float_raise( float_flag_invalid );
2334 }
2335 return 0;
2336 }
2337 aSign = extractFloat64Sign( a );
2338 bSign = extractFloat64Sign( b );
2339 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2340 return ( a != b ) && ( aSign ^ ( a < b ) );
2341
2342 }
2343
2344 #endif
2345