xref: /freebsd/lib/libc/softfloat/bits64/softfloat.c (revision 559a218c9b257775fb249b67945fe4a05b7a6b9f)
1 /* $NetBSD: softfloat.c,v 1.8 2011/07/10 04:52:23 matt Exp $ */
2 
3 /*
4  * This version hacked for use with gcc -msoft-float by bjh21.
5  * (Mostly a case of #ifdefing out things GCC doesn't need or provides
6  *  itself).
7  */
8 
9 /*
10  * Things you may want to define:
11  *
12  * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
13  *   -msoft-float) to work.  Include "softfloat-for-gcc.h" to get them
14  *   properly renamed.
15  */
16 
17 /*
18 ===============================================================================
19 
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 #ifdef SOFTFLOAT_FOR_GCC
48 #include "softfloat-for-gcc.h"
49 #endif
50 
51 #include "milieu.h"
52 #include "softfloat.h"
53 
54 /*
55  * Conversions between floats as stored in memory and floats as
56  * SoftFloat uses them
57  */
58 #ifndef FLOAT64_DEMANGLE
59 #define FLOAT64_DEMANGLE(a)	(a)
60 #endif
61 #ifndef FLOAT64_MANGLE
62 #define FLOAT64_MANGLE(a)	(a)
63 #endif
64 
65 /*
66 -------------------------------------------------------------------------------
67 Floating-point rounding mode, extended double-precision rounding precision,
68 and exception flags.
69 -------------------------------------------------------------------------------
70 */
71 int float_rounding_mode = float_round_nearest_even;
72 int float_exception_flags = 0;
73 #ifdef FLOATX80
74 int8 floatx80_rounding_precision = 80;
75 #endif
76 
77 /*
78 -------------------------------------------------------------------------------
79 Primitive arithmetic functions, including multi-word arithmetic, and
80 division and square root approximations.  (Can be specialized to target if
81 desired.)
82 -------------------------------------------------------------------------------
83 */
84 #include "softfloat-macros"
85 
86 /*
87 -------------------------------------------------------------------------------
88 Functions and definitions to determine:  (1) whether tininess for underflow
89 is detected before or after rounding by default, (2) what (if anything)
90 happens when exceptions are raised, (3) how signaling NaNs are distinguished
91 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
92 are propagated from function inputs to output.  These details are target-
93 specific.
94 -------------------------------------------------------------------------------
95 */
96 #include "softfloat-specialize"
97 
98 #if !defined(SOFTFLOAT_FOR_GCC) || defined(FLOATX80) || defined(FLOAT128)
99 /*
100 -------------------------------------------------------------------------------
101 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
102 and 7, and returns the properly rounded 32-bit integer corresponding to the
103 input.  If `zSign' is 1, the input is negated before being converted to an
104 integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
105 is simply rounded to an integer, with the inexact exception raised if the
106 input cannot be represented exactly as an integer.  However, if the fixed-
107 point input is too large, the invalid exception is raised and the largest
108 positive or negative integer is returned.
109 -------------------------------------------------------------------------------
110 */
roundAndPackInt32(flag zSign,bits64 absZ)111 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
112 {
113     int8 roundingMode;
114     flag roundNearestEven;
115     int8 roundIncrement, roundBits;
116     int32 z;
117 
118     roundingMode = float_rounding_mode;
119     roundNearestEven = ( roundingMode == float_round_nearest_even );
120     roundIncrement = 0x40;
121     if ( ! roundNearestEven ) {
122         if ( roundingMode == float_round_to_zero ) {
123             roundIncrement = 0;
124         }
125         else {
126             roundIncrement = 0x7F;
127             if ( zSign ) {
128                 if ( roundingMode == float_round_up ) roundIncrement = 0;
129             }
130             else {
131                 if ( roundingMode == float_round_down ) roundIncrement = 0;
132             }
133         }
134     }
135     roundBits = absZ & 0x7F;
136     absZ = ( absZ + roundIncrement )>>7;
137     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
138     z = absZ;
139     if ( zSign ) z = - z;
140     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
141         float_raise( float_flag_invalid );
142         return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
143     }
144     if ( roundBits ) float_exception_flags |= float_flag_inexact;
145     return z;
146 
147 }
148 
149 /*
150 -------------------------------------------------------------------------------
151 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
152 `absZ1', with binary point between bits 63 and 64 (between the input words),
153 and returns the properly rounded 64-bit integer corresponding to the input.
154 If `zSign' is 1, the input is negated before being converted to an integer.
155 Ordinarily, the fixed-point input is simply rounded to an integer, with
156 the inexact exception raised if the input cannot be represented exactly as
157 an integer.  However, if the fixed-point input is too large, the invalid
158 exception is raised and the largest positive or negative integer is
159 returned.
160 -------------------------------------------------------------------------------
161 */
roundAndPackInt64(flag zSign,bits64 absZ0,bits64 absZ1)162 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
163 {
164     int8 roundingMode;
165     flag roundNearestEven, increment;
166     int64 z;
167 
168     roundingMode = float_rounding_mode;
169     roundNearestEven = ( roundingMode == float_round_nearest_even );
170     increment = ( (sbits64) absZ1 < 0 );
171     if ( ! roundNearestEven ) {
172         if ( roundingMode == float_round_to_zero ) {
173             increment = 0;
174         }
175         else {
176             if ( zSign ) {
177                 increment = ( roundingMode == float_round_down ) && absZ1;
178             }
179             else {
180                 increment = ( roundingMode == float_round_up ) && absZ1;
181             }
182         }
183     }
184     if ( increment ) {
185         ++absZ0;
186         if ( absZ0 == 0 ) goto overflow;
187         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
188     }
189     z = absZ0;
190     if ( zSign ) z = - z;
191     if ( z && ( ( z < 0 ) ^ zSign ) ) {
192  overflow:
193         float_raise( float_flag_invalid );
194         return
195               zSign ? (sbits64) LIT64( 0x8000000000000000 )
196             : LIT64( 0x7FFFFFFFFFFFFFFF );
197     }
198     if ( absZ1 ) float_exception_flags |= float_flag_inexact;
199     return z;
200 
201 }
202 #endif
203 
204 /*
205 -------------------------------------------------------------------------------
206 Returns the fraction bits of the single-precision floating-point value `a'.
207 -------------------------------------------------------------------------------
208 */
extractFloat32Frac(float32 a)209 INLINE bits32 extractFloat32Frac( float32 a )
210 {
211 
212     return a & 0x007FFFFF;
213 
214 }
215 
216 /*
217 -------------------------------------------------------------------------------
218 Returns the exponent bits of the single-precision floating-point value `a'.
219 -------------------------------------------------------------------------------
220 */
extractFloat32Exp(float32 a)221 INLINE int16 extractFloat32Exp( float32 a )
222 {
223 
224     return ( a>>23 ) & 0xFF;
225 
226 }
227 
228 /*
229 -------------------------------------------------------------------------------
230 Returns the sign bit of the single-precision floating-point value `a'.
231 -------------------------------------------------------------------------------
232 */
extractFloat32Sign(float32 a)233 INLINE flag extractFloat32Sign( float32 a )
234 {
235 
236     return a>>31;
237 
238 }
239 
240 /*
241 -------------------------------------------------------------------------------
242 Normalizes the subnormal single-precision floating-point value represented
243 by the denormalized significand `aSig'.  The normalized exponent and
244 significand are stored at the locations pointed to by `zExpPtr' and
245 `zSigPtr', respectively.
246 -------------------------------------------------------------------------------
247 */
248 static void
normalizeFloat32Subnormal(bits32 aSig,int16 * zExpPtr,bits32 * zSigPtr)249  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
250 {
251     int8 shiftCount;
252 
253     shiftCount = countLeadingZeros32( aSig ) - 8;
254     *zSigPtr = aSig<<shiftCount;
255     *zExpPtr = 1 - shiftCount;
256 
257 }
258 
259 /*
260 -------------------------------------------------------------------------------
261 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
262 single-precision floating-point value, returning the result.  After being
263 shifted into the proper positions, the three fields are simply added
264 together to form the result.  This means that any integer portion of `zSig'
265 will be added into the exponent.  Since a properly normalized significand
266 will have an integer portion equal to 1, the `zExp' input should be 1 less
267 than the desired result exponent whenever `zSig' is a complete, normalized
268 significand.
269 -------------------------------------------------------------------------------
270 */
packFloat32(flag zSign,int16 zExp,bits32 zSig)271 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
272 {
273 
274     return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
275 
276 }
277 
278 /*
279 -------------------------------------------------------------------------------
280 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
281 and significand `zSig', and returns the proper single-precision floating-
282 point value corresponding to the abstract input.  Ordinarily, the abstract
283 value is simply rounded and packed into the single-precision format, with
284 the inexact exception raised if the abstract input cannot be represented
285 exactly.  However, if the abstract value is too large, the overflow and
286 inexact exceptions are raised and an infinity or maximal finite value is
287 returned.  If the abstract value is too small, the input value is rounded to
288 a subnormal number, and the underflow and inexact exceptions are raised if
289 the abstract input cannot be represented exactly as a subnormal single-
290 precision floating-point number.
291     The input significand `zSig' has its binary point between bits 30
292 and 29, which is 7 bits to the left of the usual location.  This shifted
293 significand must be normalized or smaller.  If `zSig' is not normalized,
294 `zExp' must be 0; in that case, the result returned is a subnormal number,
295 and it must not require rounding.  In the usual case that `zSig' is
296 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
297 The handling of underflow and overflow follows the IEC/IEEE Standard for
298 Binary Floating-Point Arithmetic.
299 -------------------------------------------------------------------------------
300 */
roundAndPackFloat32(flag zSign,int16 zExp,bits32 zSig)301 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
302 {
303     int8 roundingMode;
304     flag roundNearestEven;
305     int8 roundIncrement, roundBits;
306     flag isTiny;
307 
308     roundingMode = float_rounding_mode;
309     roundNearestEven = ( roundingMode == float_round_nearest_even );
310     roundIncrement = 0x40;
311     if ( ! roundNearestEven ) {
312         if ( roundingMode == float_round_to_zero ) {
313             roundIncrement = 0;
314         }
315         else {
316             roundIncrement = 0x7F;
317             if ( zSign ) {
318                 if ( roundingMode == float_round_up ) roundIncrement = 0;
319             }
320             else {
321                 if ( roundingMode == float_round_down ) roundIncrement = 0;
322             }
323         }
324     }
325     roundBits = zSig & 0x7F;
326     if ( 0xFD <= (bits16) zExp ) {
327         if (    ( 0xFD < zExp )
328              || (    ( zExp == 0xFD )
329                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
330            ) {
331             float_raise( float_flag_overflow | float_flag_inexact );
332             return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
333         }
334         if ( zExp < 0 ) {
335             isTiny =
336                    ( float_detect_tininess == float_tininess_before_rounding )
337                 || ( zExp < -1 )
338                 || ( zSig + roundIncrement < 0x80000000 );
339             shift32RightJamming( zSig, - zExp, &zSig );
340             zExp = 0;
341             roundBits = zSig & 0x7F;
342             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
343         }
344     }
345     if ( roundBits ) float_exception_flags |= float_flag_inexact;
346     zSig = ( zSig + roundIncrement )>>7;
347     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
348     if ( zSig == 0 ) zExp = 0;
349     return packFloat32( zSign, zExp, zSig );
350 
351 }
352 
353 /*
354 -------------------------------------------------------------------------------
355 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
356 and significand `zSig', and returns the proper single-precision floating-
357 point value corresponding to the abstract input.  This routine is just like
358 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
359 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
360 floating-point exponent.
361 -------------------------------------------------------------------------------
362 */
363 static float32
normalizeRoundAndPackFloat32(flag zSign,int16 zExp,bits32 zSig)364  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
365 {
366     int8 shiftCount;
367 
368     shiftCount = countLeadingZeros32( zSig ) - 1;
369     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
370 
371 }
372 
373 /*
374 -------------------------------------------------------------------------------
375 Returns the fraction bits of the double-precision floating-point value `a'.
376 -------------------------------------------------------------------------------
377 */
extractFloat64Frac(float64 a)378 INLINE bits64 extractFloat64Frac( float64 a )
379 {
380 
381     return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
382 
383 }
384 
385 /*
386 -------------------------------------------------------------------------------
387 Returns the exponent bits of the double-precision floating-point value `a'.
388 -------------------------------------------------------------------------------
389 */
extractFloat64Exp(float64 a)390 INLINE int16 extractFloat64Exp( float64 a )
391 {
392 
393     return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
394 
395 }
396 
397 /*
398 -------------------------------------------------------------------------------
399 Returns the sign bit of the double-precision floating-point value `a'.
400 -------------------------------------------------------------------------------
401 */
extractFloat64Sign(float64 a)402 INLINE flag extractFloat64Sign( float64 a )
403 {
404 
405     return FLOAT64_DEMANGLE(a)>>63;
406 
407 }
408 
409 /*
410 -------------------------------------------------------------------------------
411 Normalizes the subnormal double-precision floating-point value represented
412 by the denormalized significand `aSig'.  The normalized exponent and
413 significand are stored at the locations pointed to by `zExpPtr' and
414 `zSigPtr', respectively.
415 -------------------------------------------------------------------------------
416 */
417 static void
normalizeFloat64Subnormal(bits64 aSig,int16 * zExpPtr,bits64 * zSigPtr)418  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
419 {
420     int8 shiftCount;
421 
422     shiftCount = countLeadingZeros64( aSig ) - 11;
423     *zSigPtr = aSig<<shiftCount;
424     *zExpPtr = 1 - shiftCount;
425 
426 }
427 
428 /*
429 -------------------------------------------------------------------------------
430 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
431 double-precision floating-point value, returning the result.  After being
432 shifted into the proper positions, the three fields are simply added
433 together to form the result.  This means that any integer portion of `zSig'
434 will be added into the exponent.  Since a properly normalized significand
435 will have an integer portion equal to 1, the `zExp' input should be 1 less
436 than the desired result exponent whenever `zSig' is a complete, normalized
437 significand.
438 -------------------------------------------------------------------------------
439 */
packFloat64(flag zSign,int16 zExp,bits64 zSig)440 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
441 {
442 
443     return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
444 			   ( ( (bits64) zExp )<<52 ) + zSig );
445 
446 }
447 
448 /*
449 -------------------------------------------------------------------------------
450 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
451 and significand `zSig', and returns the proper double-precision floating-
452 point value corresponding to the abstract input.  Ordinarily, the abstract
453 value is simply rounded and packed into the double-precision format, with
454 the inexact exception raised if the abstract input cannot be represented
455 exactly.  However, if the abstract value is too large, the overflow and
456 inexact exceptions are raised and an infinity or maximal finite value is
457 returned.  If the abstract value is too small, the input value is rounded to
458 a subnormal number, and the underflow and inexact exceptions are raised if
459 the abstract input cannot be represented exactly as a subnormal double-
460 precision floating-point number.
461     The input significand `zSig' has its binary point between bits 62
462 and 61, which is 10 bits to the left of the usual location.  This shifted
463 significand must be normalized or smaller.  If `zSig' is not normalized,
464 `zExp' must be 0; in that case, the result returned is a subnormal number,
465 and it must not require rounding.  In the usual case that `zSig' is
466 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
467 The handling of underflow and overflow follows the IEC/IEEE Standard for
468 Binary Floating-Point Arithmetic.
469 -------------------------------------------------------------------------------
470 */
roundAndPackFloat64(flag zSign,int16 zExp,bits64 zSig)471 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
472 {
473     int8 roundingMode;
474     flag roundNearestEven;
475     int16 roundIncrement, roundBits;
476     flag isTiny;
477 
478     roundingMode = float_rounding_mode;
479     roundNearestEven = ( roundingMode == float_round_nearest_even );
480     roundIncrement = 0x200;
481     if ( ! roundNearestEven ) {
482         if ( roundingMode == float_round_to_zero ) {
483             roundIncrement = 0;
484         }
485         else {
486             roundIncrement = 0x3FF;
487             if ( zSign ) {
488                 if ( roundingMode == float_round_up ) roundIncrement = 0;
489             }
490             else {
491                 if ( roundingMode == float_round_down ) roundIncrement = 0;
492             }
493         }
494     }
495     roundBits = zSig & 0x3FF;
496     if ( 0x7FD <= (bits16) zExp ) {
497         if (    ( 0x7FD < zExp )
498              || (    ( zExp == 0x7FD )
499                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
500            ) {
501             float_raise( float_flag_overflow | float_flag_inexact );
502             return FLOAT64_MANGLE(
503 		FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
504 		( roundIncrement == 0 ));
505         }
506         if ( zExp < 0 ) {
507             isTiny =
508                    ( float_detect_tininess == float_tininess_before_rounding )
509                 || ( zExp < -1 )
510                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
511             shift64RightJamming( zSig, - zExp, &zSig );
512             zExp = 0;
513             roundBits = zSig & 0x3FF;
514             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
515         }
516     }
517     if ( roundBits ) float_exception_flags |= float_flag_inexact;
518     zSig = ( zSig + roundIncrement )>>10;
519     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
520     if ( zSig == 0 ) zExp = 0;
521     return packFloat64( zSign, zExp, zSig );
522 
523 }
524 
525 /*
526 -------------------------------------------------------------------------------
527 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
528 and significand `zSig', and returns the proper double-precision floating-
529 point value corresponding to the abstract input.  This routine is just like
530 `roundAndPackFloat64' except that `zSig' does not have to be normalized.
531 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
532 floating-point exponent.
533 -------------------------------------------------------------------------------
534 */
535 static float64
normalizeRoundAndPackFloat64(flag zSign,int16 zExp,bits64 zSig)536  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
537 {
538     int8 shiftCount;
539 
540     shiftCount = countLeadingZeros64( zSig ) - 1;
541     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
542 
543 }
544 
545 #ifdef FLOATX80
546 
547 /*
548 -------------------------------------------------------------------------------
549 Returns the fraction bits of the extended double-precision floating-point
550 value `a'.
551 -------------------------------------------------------------------------------
552 */
extractFloatx80Frac(floatx80 a)553 INLINE bits64 extractFloatx80Frac( floatx80 a )
554 {
555 
556     return a.low;
557 
558 }
559 
560 /*
561 -------------------------------------------------------------------------------
562 Returns the exponent bits of the extended double-precision floating-point
563 value `a'.
564 -------------------------------------------------------------------------------
565 */
extractFloatx80Exp(floatx80 a)566 INLINE int32 extractFloatx80Exp( floatx80 a )
567 {
568 
569     return a.high & 0x7FFF;
570 
571 }
572 
573 /*
574 -------------------------------------------------------------------------------
575 Returns the sign bit of the extended double-precision floating-point value
576 `a'.
577 -------------------------------------------------------------------------------
578 */
extractFloatx80Sign(floatx80 a)579 INLINE flag extractFloatx80Sign( floatx80 a )
580 {
581 
582     return a.high>>15;
583 
584 }
585 
586 /*
587 -------------------------------------------------------------------------------
588 Normalizes the subnormal extended double-precision floating-point value
589 represented by the denormalized significand `aSig'.  The normalized exponent
590 and significand are stored at the locations pointed to by `zExpPtr' and
591 `zSigPtr', respectively.
592 -------------------------------------------------------------------------------
593 */
594 static void
normalizeFloatx80Subnormal(bits64 aSig,int32 * zExpPtr,bits64 * zSigPtr)595  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
596 {
597     int8 shiftCount;
598 
599     shiftCount = countLeadingZeros64( aSig );
600     *zSigPtr = aSig<<shiftCount;
601     *zExpPtr = 1 - shiftCount;
602 
603 }
604 
605 /*
606 -------------------------------------------------------------------------------
607 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
608 extended double-precision floating-point value, returning the result.
609 -------------------------------------------------------------------------------
610 */
packFloatx80(flag zSign,int32 zExp,bits64 zSig)611 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
612 {
613     floatx80 z;
614 
615     z.low = zSig;
616     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
617     return z;
618 
619 }
620 
621 /*
622 -------------------------------------------------------------------------------
623 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
624 and extended significand formed by the concatenation of `zSig0' and `zSig1',
625 and returns the proper extended double-precision floating-point value
626 corresponding to the abstract input.  Ordinarily, the abstract value is
627 rounded and packed into the extended double-precision format, with the
628 inexact exception raised if the abstract input cannot be represented
629 exactly.  However, if the abstract value is too large, the overflow and
630 inexact exceptions are raised and an infinity or maximal finite value is
631 returned.  If the abstract value is too small, the input value is rounded to
632 a subnormal number, and the underflow and inexact exceptions are raised if
633 the abstract input cannot be represented exactly as a subnormal extended
634 double-precision floating-point number.
635     If `roundingPrecision' is 32 or 64, the result is rounded to the same
636 number of bits as single or double precision, respectively.  Otherwise, the
637 result is rounded to the full precision of the extended double-precision
638 format.
639     The input significand must be normalized or smaller.  If the input
640 significand is not normalized, `zExp' must be 0; in that case, the result
641 returned is a subnormal number, and it must not require rounding.  The
642 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
643 Floating-Point Arithmetic.
644 -------------------------------------------------------------------------------
645 */
646 static floatx80
roundAndPackFloatx80(int8 roundingPrecision,flag zSign,int32 zExp,bits64 zSig0,bits64 zSig1)647  roundAndPackFloatx80(
648      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
649  )
650 {
651     int8 roundingMode;
652     flag roundNearestEven, increment, isTiny;
653     int64 roundIncrement, roundMask, roundBits;
654 
655     roundingMode = float_rounding_mode;
656     roundNearestEven = ( roundingMode == float_round_nearest_even );
657     if ( roundingPrecision == 80 ) goto precision80;
658     if ( roundingPrecision == 64 ) {
659         roundIncrement = LIT64( 0x0000000000000400 );
660         roundMask = LIT64( 0x00000000000007FF );
661     }
662     else if ( roundingPrecision == 32 ) {
663         roundIncrement = LIT64( 0x0000008000000000 );
664         roundMask = LIT64( 0x000000FFFFFFFFFF );
665     }
666     else {
667         goto precision80;
668     }
669     zSig0 |= ( zSig1 != 0 );
670     if ( ! roundNearestEven ) {
671         if ( roundingMode == float_round_to_zero ) {
672             roundIncrement = 0;
673         }
674         else {
675             roundIncrement = roundMask;
676             if ( zSign ) {
677                 if ( roundingMode == float_round_up ) roundIncrement = 0;
678             }
679             else {
680                 if ( roundingMode == float_round_down ) roundIncrement = 0;
681             }
682         }
683     }
684     roundBits = zSig0 & roundMask;
685     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
686         if (    ( 0x7FFE < zExp )
687              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
688            ) {
689             goto overflow;
690         }
691         if ( zExp <= 0 ) {
692             isTiny =
693                    ( float_detect_tininess == float_tininess_before_rounding )
694                 || ( zExp < 0 )
695                 || ( zSig0 <= zSig0 + roundIncrement );
696             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
697             zExp = 0;
698             roundBits = zSig0 & roundMask;
699             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
700             if ( roundBits ) float_exception_flags |= float_flag_inexact;
701             zSig0 += roundIncrement;
702             if ( (sbits64) zSig0 < 0 ) zExp = 1;
703             roundIncrement = roundMask + 1;
704             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
705                 roundMask |= roundIncrement;
706             }
707             zSig0 &= ~ roundMask;
708             return packFloatx80( zSign, zExp, zSig0 );
709         }
710     }
711     if ( roundBits ) float_exception_flags |= float_flag_inexact;
712     zSig0 += roundIncrement;
713     if ( zSig0 < roundIncrement ) {
714         ++zExp;
715         zSig0 = LIT64( 0x8000000000000000 );
716     }
717     roundIncrement = roundMask + 1;
718     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
719         roundMask |= roundIncrement;
720     }
721     zSig0 &= ~ roundMask;
722     if ( zSig0 == 0 ) zExp = 0;
723     return packFloatx80( zSign, zExp, zSig0 );
724  precision80:
725     increment = ( (sbits64) zSig1 < 0 );
726     if ( ! roundNearestEven ) {
727         if ( roundingMode == float_round_to_zero ) {
728             increment = 0;
729         }
730         else {
731             if ( zSign ) {
732                 increment = ( roundingMode == float_round_down ) && zSig1;
733             }
734             else {
735                 increment = ( roundingMode == float_round_up ) && zSig1;
736             }
737         }
738     }
739     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
740         if (    ( 0x7FFE < zExp )
741              || (    ( zExp == 0x7FFE )
742                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
743                   && increment
744                 )
745            ) {
746             roundMask = 0;
747  overflow:
748             float_raise( float_flag_overflow | float_flag_inexact );
749             if (    ( roundingMode == float_round_to_zero )
750                  || ( zSign && ( roundingMode == float_round_up ) )
751                  || ( ! zSign && ( roundingMode == float_round_down ) )
752                ) {
753                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
754             }
755             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
756         }
757         if ( zExp <= 0 ) {
758             isTiny =
759                    ( float_detect_tininess == float_tininess_before_rounding )
760                 || ( zExp < 0 )
761                 || ! increment
762                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
763             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
764             zExp = 0;
765             if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
766             if ( zSig1 ) float_exception_flags |= float_flag_inexact;
767             if ( roundNearestEven ) {
768                 increment = ( (sbits64) zSig1 < 0 );
769             }
770             else {
771                 if ( zSign ) {
772                     increment = ( roundingMode == float_round_down ) && zSig1;
773                 }
774                 else {
775                     increment = ( roundingMode == float_round_up ) && zSig1;
776                 }
777             }
778             if ( increment ) {
779                 ++zSig0;
780                 zSig0 &=
781                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
782                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
783             }
784             return packFloatx80( zSign, zExp, zSig0 );
785         }
786     }
787     if ( zSig1 ) float_exception_flags |= float_flag_inexact;
788     if ( increment ) {
789         ++zSig0;
790         if ( zSig0 == 0 ) {
791             ++zExp;
792             zSig0 = LIT64( 0x8000000000000000 );
793         }
794         else {
795             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
796         }
797     }
798     else {
799         if ( zSig0 == 0 ) zExp = 0;
800     }
801     return packFloatx80( zSign, zExp, zSig0 );
802 
803 }
804 
805 /*
806 -------------------------------------------------------------------------------
807 Takes an abstract floating-point value having sign `zSign', exponent
808 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
809 and returns the proper extended double-precision floating-point value
810 corresponding to the abstract input.  This routine is just like
811 `roundAndPackFloatx80' except that the input significand does not have to be
812 normalized.
813 -------------------------------------------------------------------------------
814 */
815 static floatx80
normalizeRoundAndPackFloatx80(int8 roundingPrecision,flag zSign,int32 zExp,bits64 zSig0,bits64 zSig1)816  normalizeRoundAndPackFloatx80(
817      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
818  )
819 {
820     int8 shiftCount;
821 
822     if ( zSig0 == 0 ) {
823         zSig0 = zSig1;
824         zSig1 = 0;
825         zExp -= 64;
826     }
827     shiftCount = countLeadingZeros64( zSig0 );
828     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
829     zExp -= shiftCount;
830     return
831         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
832 
833 }
834 
835 #endif
836 
837 #ifdef FLOAT128
838 
839 /*
840 -------------------------------------------------------------------------------
841 Returns the least-significant 64 fraction bits of the quadruple-precision
842 floating-point value `a'.
843 -------------------------------------------------------------------------------
844 */
extractFloat128Frac1(float128 a)845 INLINE bits64 extractFloat128Frac1( float128 a )
846 {
847 
848     return a.low;
849 
850 }
851 
852 /*
853 -------------------------------------------------------------------------------
854 Returns the most-significant 48 fraction bits of the quadruple-precision
855 floating-point value `a'.
856 -------------------------------------------------------------------------------
857 */
extractFloat128Frac0(float128 a)858 INLINE bits64 extractFloat128Frac0( float128 a )
859 {
860 
861     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
862 
863 }
864 
865 /*
866 -------------------------------------------------------------------------------
867 Returns the exponent bits of the quadruple-precision floating-point value
868 `a'.
869 -------------------------------------------------------------------------------
870 */
extractFloat128Exp(float128 a)871 INLINE int32 extractFloat128Exp( float128 a )
872 {
873 
874     return ( a.high>>48 ) & 0x7FFF;
875 
876 }
877 
878 /*
879 -------------------------------------------------------------------------------
880 Returns the sign bit of the quadruple-precision floating-point value `a'.
881 -------------------------------------------------------------------------------
882 */
extractFloat128Sign(float128 a)883 INLINE flag extractFloat128Sign( float128 a )
884 {
885 
886     return a.high>>63;
887 
888 }
889 
890 /*
891 -------------------------------------------------------------------------------
892 Normalizes the subnormal quadruple-precision floating-point value
893 represented by the denormalized significand formed by the concatenation of
894 `aSig0' and `aSig1'.  The normalized exponent is stored at the location
895 pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
896 significand are stored at the location pointed to by `zSig0Ptr', and the
897 least significant 64 bits of the normalized significand are stored at the
898 location pointed to by `zSig1Ptr'.
899 -------------------------------------------------------------------------------
900 */
901 static void
normalizeFloat128Subnormal(bits64 aSig0,bits64 aSig1,int32 * zExpPtr,bits64 * zSig0Ptr,bits64 * zSig1Ptr)902  normalizeFloat128Subnormal(
903      bits64 aSig0,
904      bits64 aSig1,
905      int32 *zExpPtr,
906      bits64 *zSig0Ptr,
907      bits64 *zSig1Ptr
908  )
909 {
910     int8 shiftCount;
911 
912     if ( aSig0 == 0 ) {
913         shiftCount = countLeadingZeros64( aSig1 ) - 15;
914         if ( shiftCount < 0 ) {
915             *zSig0Ptr = aSig1>>( - shiftCount );
916             *zSig1Ptr = aSig1<<( shiftCount & 63 );
917         }
918         else {
919             *zSig0Ptr = aSig1<<shiftCount;
920             *zSig1Ptr = 0;
921         }
922         *zExpPtr = - shiftCount - 63;
923     }
924     else {
925         shiftCount = countLeadingZeros64( aSig0 ) - 15;
926         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
927         *zExpPtr = 1 - shiftCount;
928     }
929 
930 }
931 
932 /*
933 -------------------------------------------------------------------------------
934 Packs the sign `zSign', the exponent `zExp', and the significand formed
935 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
936 floating-point value, returning the result.  After being shifted into the
937 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
938 added together to form the most significant 32 bits of the result.  This
939 means that any integer portion of `zSig0' will be added into the exponent.
940 Since a properly normalized significand will have an integer portion equal
941 to 1, the `zExp' input should be 1 less than the desired result exponent
942 whenever `zSig0' and `zSig1' concatenated form a complete, normalized
943 significand.
944 -------------------------------------------------------------------------------
945 */
946 INLINE float128
packFloat128(flag zSign,int32 zExp,bits64 zSig0,bits64 zSig1)947  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
948 {
949     float128 z;
950 
951     z.low = zSig1;
952     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
953     return z;
954 
955 }
956 
957 /*
958 -------------------------------------------------------------------------------
959 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
960 and extended significand formed by the concatenation of `zSig0', `zSig1',
961 and `zSig2', and returns the proper quadruple-precision floating-point value
962 corresponding to the abstract input.  Ordinarily, the abstract value is
963 simply rounded and packed into the quadruple-precision format, with the
964 inexact exception raised if the abstract input cannot be represented
965 exactly.  However, if the abstract value is too large, the overflow and
966 inexact exceptions are raised and an infinity or maximal finite value is
967 returned.  If the abstract value is too small, the input value is rounded to
968 a subnormal number, and the underflow and inexact exceptions are raised if
969 the abstract input cannot be represented exactly as a subnormal quadruple-
970 precision floating-point number.
971     The input significand must be normalized or smaller.  If the input
972 significand is not normalized, `zExp' must be 0; in that case, the result
973 returned is a subnormal number, and it must not require rounding.  In the
974 usual case that the input significand is normalized, `zExp' must be 1 less
975 than the ``true'' floating-point exponent.  The handling of underflow and
976 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
977 -------------------------------------------------------------------------------
978 */
979 static float128
roundAndPackFloat128(flag zSign,int32 zExp,bits64 zSig0,bits64 zSig1,bits64 zSig2)980  roundAndPackFloat128(
981      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
982 {
983     int8 roundingMode;
984     flag roundNearestEven, increment, isTiny;
985 
986     roundingMode = float_rounding_mode;
987     roundNearestEven = ( roundingMode == float_round_nearest_even );
988     increment = ( (sbits64) zSig2 < 0 );
989     if ( ! roundNearestEven ) {
990         if ( roundingMode == float_round_to_zero ) {
991             increment = 0;
992         }
993         else {
994             if ( zSign ) {
995                 increment = ( roundingMode == float_round_down ) && zSig2;
996             }
997             else {
998                 increment = ( roundingMode == float_round_up ) && zSig2;
999             }
1000         }
1001     }
1002     if ( 0x7FFD <= (bits32) zExp ) {
1003         if (    ( 0x7FFD < zExp )
1004              || (    ( zExp == 0x7FFD )
1005                   && eq128(
1006                          LIT64( 0x0001FFFFFFFFFFFF ),
1007                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1008                          zSig0,
1009                          zSig1
1010                      )
1011                   && increment
1012                 )
1013            ) {
1014             float_raise( float_flag_overflow | float_flag_inexact );
1015             if (    ( roundingMode == float_round_to_zero )
1016                  || ( zSign && ( roundingMode == float_round_up ) )
1017                  || ( ! zSign && ( roundingMode == float_round_down ) )
1018                ) {
1019                 return
1020                     packFloat128(
1021                         zSign,
1022                         0x7FFE,
1023                         LIT64( 0x0000FFFFFFFFFFFF ),
1024                         LIT64( 0xFFFFFFFFFFFFFFFF )
1025                     );
1026             }
1027             return packFloat128( zSign, 0x7FFF, 0, 0 );
1028         }
1029         if ( zExp < 0 ) {
1030             isTiny =
1031                    ( float_detect_tininess == float_tininess_before_rounding )
1032                 || ( zExp < -1 )
1033                 || ! increment
1034                 || lt128(
1035                        zSig0,
1036                        zSig1,
1037                        LIT64( 0x0001FFFFFFFFFFFF ),
1038                        LIT64( 0xFFFFFFFFFFFFFFFF )
1039                    );
1040             shift128ExtraRightJamming(
1041                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1042             zExp = 0;
1043             if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
1044             if ( roundNearestEven ) {
1045                 increment = ( (sbits64) zSig2 < 0 );
1046             }
1047             else {
1048                 if ( zSign ) {
1049                     increment = ( roundingMode == float_round_down ) && zSig2;
1050                 }
1051                 else {
1052                     increment = ( roundingMode == float_round_up ) && zSig2;
1053                 }
1054             }
1055         }
1056     }
1057     if ( zSig2 ) float_exception_flags |= float_flag_inexact;
1058     if ( increment ) {
1059         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1060         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1061     }
1062     else {
1063         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1064     }
1065     return packFloat128( zSign, zExp, zSig0, zSig1 );
1066 
1067 }
1068 
1069 /*
1070 -------------------------------------------------------------------------------
1071 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1072 and significand formed by the concatenation of `zSig0' and `zSig1', and
1073 returns the proper quadruple-precision floating-point value corresponding
1074 to the abstract input.  This routine is just like `roundAndPackFloat128'
1075 except that the input significand has fewer bits and does not have to be
1076 normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1077 point exponent.
1078 -------------------------------------------------------------------------------
1079 */
1080 static float128
normalizeRoundAndPackFloat128(flag zSign,int32 zExp,bits64 zSig0,bits64 zSig1)1081  normalizeRoundAndPackFloat128(
1082      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1083 {
1084     int8 shiftCount;
1085     bits64 zSig2;
1086 
1087     if ( zSig0 == 0 ) {
1088         zSig0 = zSig1;
1089         zSig1 = 0;
1090         zExp -= 64;
1091     }
1092     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1093     if ( 0 <= shiftCount ) {
1094         zSig2 = 0;
1095         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1096     }
1097     else {
1098         shift128ExtraRightJamming(
1099             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1100     }
1101     zExp -= shiftCount;
1102     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1103 
1104 }
1105 
1106 #endif
1107 
1108 /*
1109 -------------------------------------------------------------------------------
1110 Returns the result of converting the 32-bit two's complement integer `a'
1111 to the single-precision floating-point format.  The conversion is performed
1112 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1113 -------------------------------------------------------------------------------
1114 */
int32_to_float32(int32 a)1115 float32 int32_to_float32( int32 a )
1116 {
1117     flag zSign;
1118 
1119     if ( a == 0 ) return 0;
1120     if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1121     zSign = ( a < 0 );
1122     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
1123 
1124 }
1125 
1126 #ifndef SOFTFLOAT_FOR_GCC /* __floatunsisf is in libgcc */
uint32_to_float32(uint32 a)1127 float32 uint32_to_float32( uint32 a )
1128 {
1129     if ( a == 0 ) return 0;
1130     if ( a & (bits32) 0x80000000 )
1131 	return normalizeRoundAndPackFloat32( 0, 0x9D, a >> 1 );
1132     return normalizeRoundAndPackFloat32( 0, 0x9C, a );
1133 }
1134 #endif
1135 
1136 
1137 /*
1138 -------------------------------------------------------------------------------
1139 Returns the result of converting the 32-bit two's complement integer `a'
1140 to the double-precision floating-point format.  The conversion is performed
1141 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1142 -------------------------------------------------------------------------------
1143 */
int32_to_float64(int32 a)1144 float64 int32_to_float64( int32 a )
1145 {
1146     flag zSign;
1147     uint32 absA;
1148     int8 shiftCount;
1149     bits64 zSig;
1150 
1151     if ( a == 0 ) return 0;
1152     zSign = ( a < 0 );
1153     absA = zSign ? - a : a;
1154     shiftCount = countLeadingZeros32( absA ) + 21;
1155     zSig = absA;
1156     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1157 
1158 }
1159 
1160 #ifndef SOFTFLOAT_FOR_GCC /* __floatunsidf is in libgcc */
uint32_to_float64(uint32 a)1161 float64 uint32_to_float64( uint32 a )
1162 {
1163     int8 shiftCount;
1164     bits64 zSig = a;
1165 
1166     if ( a == 0 ) return 0;
1167     shiftCount = countLeadingZeros32( a ) + 21;
1168     return packFloat64( 0, 0x432 - shiftCount, zSig<<shiftCount );
1169 
1170 }
1171 #endif
1172 
1173 #ifdef FLOATX80
1174 
1175 /*
1176 -------------------------------------------------------------------------------
1177 Returns the result of converting the 32-bit two's complement integer `a'
1178 to the extended double-precision floating-point format.  The conversion
1179 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1180 Arithmetic.
1181 -------------------------------------------------------------------------------
1182 */
int32_to_floatx80(int32 a)1183 floatx80 int32_to_floatx80( int32 a )
1184 {
1185     flag zSign;
1186     uint32 absA;
1187     int8 shiftCount;
1188     bits64 zSig;
1189 
1190     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1191     zSign = ( a < 0 );
1192     absA = zSign ? - a : a;
1193     shiftCount = countLeadingZeros32( absA ) + 32;
1194     zSig = absA;
1195     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1196 
1197 }
1198 
uint32_to_floatx80(uint32 a)1199 floatx80 uint32_to_floatx80( uint32 a )
1200 {
1201     int8 shiftCount;
1202     bits64 zSig = a;
1203 
1204     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1205     shiftCount = countLeadingZeros32( a ) + 32;
1206     return packFloatx80( 0, 0x403E - shiftCount, zSig<<shiftCount );
1207 
1208 }
1209 
1210 #endif
1211 
1212 #ifdef FLOAT128
1213 
1214 /*
1215 -------------------------------------------------------------------------------
1216 Returns the result of converting the 32-bit two's complement integer `a' to
1217 the quadruple-precision floating-point format.  The conversion is performed
1218 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1219 -------------------------------------------------------------------------------
1220 */
int32_to_float128(int32 a)1221 float128 int32_to_float128( int32 a )
1222 {
1223     flag zSign;
1224     uint32 absA;
1225     int8 shiftCount;
1226     bits64 zSig0;
1227 
1228     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1229     zSign = ( a < 0 );
1230     absA = zSign ? - a : a;
1231     shiftCount = countLeadingZeros32( absA ) + 17;
1232     zSig0 = absA;
1233     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1234 
1235 }
1236 
uint32_to_float128(uint32 a)1237 float128 uint32_to_float128( uint32 a )
1238 {
1239     int8 shiftCount;
1240     bits64 zSig0 = a;
1241 
1242     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1243     shiftCount = countLeadingZeros32( a ) + 17;
1244     return packFloat128( 0, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1245 
1246 }
1247 
1248 #endif
1249 
1250 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1251 /*
1252 -------------------------------------------------------------------------------
1253 Returns the result of converting the 64-bit two's complement integer `a'
1254 to the single-precision floating-point format.  The conversion is performed
1255 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1256 -------------------------------------------------------------------------------
1257 */
int64_to_float32(int64 a)1258 float32 int64_to_float32( int64 a )
1259 {
1260     flag zSign;
1261     uint64 absA;
1262     int8 shiftCount;
1263 
1264     if ( a == 0 ) return 0;
1265     zSign = ( a < 0 );
1266     absA = zSign ? - a : a;
1267     shiftCount = countLeadingZeros64( absA ) - 40;
1268     if ( 0 <= shiftCount ) {
1269         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1270     }
1271     else {
1272         shiftCount += 7;
1273         if ( shiftCount < 0 ) {
1274             shift64RightJamming( absA, - shiftCount, &absA );
1275         }
1276         else {
1277             absA <<= shiftCount;
1278         }
1279         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1280     }
1281 
1282 }
1283 
1284 /*
1285 -------------------------------------------------------------------------------
1286 Returns the result of converting the 64-bit two's complement integer `a'
1287 to the double-precision floating-point format.  The conversion is performed
1288 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1289 -------------------------------------------------------------------------------
1290 */
int64_to_float64(int64 a)1291 float64 int64_to_float64( int64 a )
1292 {
1293     flag zSign;
1294 
1295     if ( a == 0 ) return 0;
1296     if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1297         return packFloat64( 1, 0x43E, 0 );
1298     }
1299     zSign = ( a < 0 );
1300     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1301 
1302 }
1303 
1304 #ifdef FLOATX80
1305 
1306 /*
1307 -------------------------------------------------------------------------------
1308 Returns the result of converting the 64-bit two's complement integer `a'
1309 to the extended double-precision floating-point format.  The conversion
1310 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1311 Arithmetic.
1312 -------------------------------------------------------------------------------
1313 */
int64_to_floatx80(int64 a)1314 floatx80 int64_to_floatx80( int64 a )
1315 {
1316     flag zSign;
1317     uint64 absA;
1318     int8 shiftCount;
1319 
1320     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1321     zSign = ( a < 0 );
1322     absA = zSign ? - a : a;
1323     shiftCount = countLeadingZeros64( absA );
1324     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1325 
1326 }
1327 
1328 #endif
1329 
1330 #endif /* !SOFTFLOAT_FOR_GCC */
1331 
1332 #ifdef FLOAT128
1333 
1334 /*
1335 -------------------------------------------------------------------------------
1336 Returns the result of converting the 64-bit two's complement integer `a' to
1337 the quadruple-precision floating-point format.  The conversion is performed
1338 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1339 -------------------------------------------------------------------------------
1340 */
int64_to_float128(int64 a)1341 float128 int64_to_float128( int64 a )
1342 {
1343     flag zSign;
1344     uint64 absA;
1345     int8 shiftCount;
1346     int32 zExp;
1347     bits64 zSig0, zSig1;
1348 
1349     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1350     zSign = ( a < 0 );
1351     absA = zSign ? - a : a;
1352     shiftCount = countLeadingZeros64( absA ) + 49;
1353     zExp = 0x406E - shiftCount;
1354     if ( 64 <= shiftCount ) {
1355         zSig1 = 0;
1356         zSig0 = absA;
1357         shiftCount -= 64;
1358     }
1359     else {
1360         zSig1 = absA;
1361         zSig0 = 0;
1362     }
1363     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1364     return packFloat128( zSign, zExp, zSig0, zSig1 );
1365 
1366 }
1367 
1368 #endif
1369 
1370 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1371 /*
1372 -------------------------------------------------------------------------------
1373 Returns the result of converting the single-precision floating-point value
1374 `a' to the 32-bit two's complement integer format.  The conversion is
1375 performed according to the IEC/IEEE Standard for Binary Floating-Point
1376 Arithmetic---which means in particular that the conversion is rounded
1377 according to the current rounding mode.  If `a' is a NaN, the largest
1378 positive integer is returned.  Otherwise, if the conversion overflows, the
1379 largest integer with the same sign as `a' is returned.
1380 -------------------------------------------------------------------------------
1381 */
float32_to_int32(float32 a)1382 int32 float32_to_int32( float32 a )
1383 {
1384     flag aSign;
1385     int16 aExp, shiftCount;
1386     bits32 aSig;
1387     bits64 aSig64;
1388 
1389     aSig = extractFloat32Frac( a );
1390     aExp = extractFloat32Exp( a );
1391     aSign = extractFloat32Sign( a );
1392     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1393     if ( aExp ) aSig |= 0x00800000;
1394     shiftCount = 0xAF - aExp;
1395     aSig64 = aSig;
1396     aSig64 <<= 32;
1397     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1398     return roundAndPackInt32( aSign, aSig64 );
1399 
1400 }
1401 #endif /* !SOFTFLOAT_FOR_GCC */
1402 
1403 /*
1404 -------------------------------------------------------------------------------
1405 Returns the result of converting the single-precision floating-point value
1406 `a' to the 32-bit two's complement integer format.  The conversion is
1407 performed according to the IEC/IEEE Standard for Binary Floating-Point
1408 Arithmetic, except that the conversion is always rounded toward zero.
1409 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1410 the conversion overflows, the largest integer with the same sign as `a' is
1411 returned.
1412 -------------------------------------------------------------------------------
1413 */
float32_to_int32_round_to_zero(float32 a)1414 int32 float32_to_int32_round_to_zero( float32 a )
1415 {
1416     flag aSign;
1417     int16 aExp, shiftCount;
1418     bits32 aSig;
1419     int32 z;
1420 
1421     aSig = extractFloat32Frac( a );
1422     aExp = extractFloat32Exp( a );
1423     aSign = extractFloat32Sign( a );
1424     shiftCount = aExp - 0x9E;
1425     if ( 0 <= shiftCount ) {
1426         if ( a != 0xCF000000 ) {
1427             float_raise( float_flag_invalid );
1428             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1429         }
1430         return (sbits32) 0x80000000;
1431     }
1432     else if ( aExp <= 0x7E ) {
1433         if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1434         return 0;
1435     }
1436     aSig = ( aSig | 0x00800000 )<<8;
1437     z = aSig>>( - shiftCount );
1438     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1439         float_exception_flags |= float_flag_inexact;
1440     }
1441     if ( aSign ) z = - z;
1442     return z;
1443 
1444 }
1445 
1446 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1447 /*
1448 -------------------------------------------------------------------------------
1449 Returns the result of converting the single-precision floating-point value
1450 `a' to the 64-bit two's complement integer format.  The conversion is
1451 performed according to the IEC/IEEE Standard for Binary Floating-Point
1452 Arithmetic---which means in particular that the conversion is rounded
1453 according to the current rounding mode.  If `a' is a NaN, the largest
1454 positive integer is returned.  Otherwise, if the conversion overflows, the
1455 largest integer with the same sign as `a' is returned.
1456 -------------------------------------------------------------------------------
1457 */
float32_to_int64(float32 a)1458 int64 float32_to_int64( float32 a )
1459 {
1460     flag aSign;
1461     int16 aExp, shiftCount;
1462     bits32 aSig;
1463     bits64 aSig64, aSigExtra;
1464 
1465     aSig = extractFloat32Frac( a );
1466     aExp = extractFloat32Exp( a );
1467     aSign = extractFloat32Sign( a );
1468     shiftCount = 0xBE - aExp;
1469     if ( shiftCount < 0 ) {
1470         float_raise( float_flag_invalid );
1471         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1472             return LIT64( 0x7FFFFFFFFFFFFFFF );
1473         }
1474         return (sbits64) LIT64( 0x8000000000000000 );
1475     }
1476     if ( aExp ) aSig |= 0x00800000;
1477     aSig64 = aSig;
1478     aSig64 <<= 40;
1479     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1480     return roundAndPackInt64( aSign, aSig64, aSigExtra );
1481 
1482 }
1483 
1484 /*
1485 -------------------------------------------------------------------------------
1486 Returns the result of converting the single-precision floating-point value
1487 `a' to the 64-bit two's complement integer format.  The conversion is
1488 performed according to the IEC/IEEE Standard for Binary Floating-Point
1489 Arithmetic, except that the conversion is always rounded toward zero.  If
1490 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1491 conversion overflows, the largest integer with the same sign as `a' is
1492 returned.
1493 -------------------------------------------------------------------------------
1494 */
float32_to_int64_round_to_zero(float32 a)1495 int64 float32_to_int64_round_to_zero( float32 a )
1496 {
1497     flag aSign;
1498     int16 aExp, shiftCount;
1499     bits32 aSig;
1500     bits64 aSig64;
1501     int64 z;
1502 
1503     aSig = extractFloat32Frac( a );
1504     aExp = extractFloat32Exp( a );
1505     aSign = extractFloat32Sign( a );
1506     shiftCount = aExp - 0xBE;
1507     if ( 0 <= shiftCount ) {
1508         if ( a != 0xDF000000 ) {
1509             float_raise( float_flag_invalid );
1510             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1511                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1512             }
1513         }
1514         return (sbits64) LIT64( 0x8000000000000000 );
1515     }
1516     else if ( aExp <= 0x7E ) {
1517         if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1518         return 0;
1519     }
1520     aSig64 = aSig | 0x00800000;
1521     aSig64 <<= 40;
1522     z = aSig64>>( - shiftCount );
1523     if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1524         float_exception_flags |= float_flag_inexact;
1525     }
1526     if ( aSign ) z = - z;
1527     return z;
1528 
1529 }
1530 #endif /* !SOFTFLOAT_FOR_GCC */
1531 
1532 /*
1533 -------------------------------------------------------------------------------
1534 Returns the result of converting the single-precision floating-point value
1535 `a' to the double-precision floating-point format.  The conversion is
1536 performed according to the IEC/IEEE Standard for Binary Floating-Point
1537 Arithmetic.
1538 -------------------------------------------------------------------------------
1539 */
float32_to_float64(float32 a)1540 float64 float32_to_float64( float32 a )
1541 {
1542     flag aSign;
1543     int16 aExp;
1544     bits32 aSig;
1545 
1546     aSig = extractFloat32Frac( a );
1547     aExp = extractFloat32Exp( a );
1548     aSign = extractFloat32Sign( a );
1549     if ( aExp == 0xFF ) {
1550         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1551         return packFloat64( aSign, 0x7FF, 0 );
1552     }
1553     if ( aExp == 0 ) {
1554         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1555         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1556         --aExp;
1557     }
1558     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1559 
1560 }
1561 
1562 #ifdef FLOATX80
1563 
1564 /*
1565 -------------------------------------------------------------------------------
1566 Returns the result of converting the single-precision floating-point value
1567 `a' to the extended double-precision floating-point format.  The conversion
1568 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1569 Arithmetic.
1570 -------------------------------------------------------------------------------
1571 */
float32_to_floatx80(float32 a)1572 floatx80 float32_to_floatx80( float32 a )
1573 {
1574     flag aSign;
1575     int16 aExp;
1576     bits32 aSig;
1577 
1578     aSig = extractFloat32Frac( a );
1579     aExp = extractFloat32Exp( a );
1580     aSign = extractFloat32Sign( a );
1581     if ( aExp == 0xFF ) {
1582         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1583         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1584     }
1585     if ( aExp == 0 ) {
1586         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1587         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1588     }
1589     aSig |= 0x00800000;
1590     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1591 
1592 }
1593 
1594 #endif
1595 
1596 #ifdef FLOAT128
1597 
1598 /*
1599 -------------------------------------------------------------------------------
1600 Returns the result of converting the single-precision floating-point value
1601 `a' to the double-precision floating-point format.  The conversion is
1602 performed according to the IEC/IEEE Standard for Binary Floating-Point
1603 Arithmetic.
1604 -------------------------------------------------------------------------------
1605 */
float32_to_float128(float32 a)1606 float128 float32_to_float128( float32 a )
1607 {
1608     flag aSign;
1609     int16 aExp;
1610     bits32 aSig;
1611 
1612     aSig = extractFloat32Frac( a );
1613     aExp = extractFloat32Exp( a );
1614     aSign = extractFloat32Sign( a );
1615     if ( aExp == 0xFF ) {
1616         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1617         return packFloat128( aSign, 0x7FFF, 0, 0 );
1618     }
1619     if ( aExp == 0 ) {
1620         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1621         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1622         --aExp;
1623     }
1624     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1625 
1626 }
1627 
1628 #endif
1629 
1630 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1631 /*
1632 -------------------------------------------------------------------------------
1633 Rounds the single-precision floating-point value `a' to an integer, and
1634 returns the result as a single-precision floating-point value.  The
1635 operation is performed according to the IEC/IEEE Standard for Binary
1636 Floating-Point Arithmetic.
1637 -------------------------------------------------------------------------------
1638 */
float32_round_to_int(float32 a)1639 float32 float32_round_to_int( float32 a )
1640 {
1641     flag aSign;
1642     int16 aExp;
1643     bits32 lastBitMask, roundBitsMask;
1644     int8 roundingMode;
1645     float32 z;
1646 
1647     aExp = extractFloat32Exp( a );
1648     if ( 0x96 <= aExp ) {
1649         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1650             return propagateFloat32NaN( a, a );
1651         }
1652         return a;
1653     }
1654     if ( aExp <= 0x7E ) {
1655         if ( (bits32) ( a<<1 ) == 0 ) return a;
1656         float_exception_flags |= float_flag_inexact;
1657         aSign = extractFloat32Sign( a );
1658         switch ( float_rounding_mode ) {
1659          case float_round_nearest_even:
1660             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1661                 return packFloat32( aSign, 0x7F, 0 );
1662             }
1663             break;
1664 	 case float_round_to_zero:
1665 	    break;
1666          case float_round_down:
1667             return aSign ? 0xBF800000 : 0;
1668          case float_round_up:
1669             return aSign ? 0x80000000 : 0x3F800000;
1670         }
1671         return packFloat32( aSign, 0, 0 );
1672     }
1673     lastBitMask = 1;
1674     lastBitMask <<= 0x96 - aExp;
1675     roundBitsMask = lastBitMask - 1;
1676     z = a;
1677     roundingMode = float_rounding_mode;
1678     if ( roundingMode == float_round_nearest_even ) {
1679         z += lastBitMask>>1;
1680         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1681     }
1682     else if ( roundingMode != float_round_to_zero ) {
1683         if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1684             z += roundBitsMask;
1685         }
1686     }
1687     z &= ~ roundBitsMask;
1688     if ( z != a ) float_exception_flags |= float_flag_inexact;
1689     return z;
1690 
1691 }
1692 #endif /* !SOFTFLOAT_FOR_GCC */
1693 
1694 /*
1695 -------------------------------------------------------------------------------
1696 Returns the result of adding the absolute values of the single-precision
1697 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1698 before being returned.  `zSign' is ignored if the result is a NaN.
1699 The addition is performed according to the IEC/IEEE Standard for Binary
1700 Floating-Point Arithmetic.
1701 -------------------------------------------------------------------------------
1702 */
addFloat32Sigs(float32 a,float32 b,flag zSign)1703 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1704 {
1705     int16 aExp, bExp, zExp;
1706     bits32 aSig, bSig, zSig;
1707     int16 expDiff;
1708 
1709     aSig = extractFloat32Frac( a );
1710     aExp = extractFloat32Exp( a );
1711     bSig = extractFloat32Frac( b );
1712     bExp = extractFloat32Exp( b );
1713     expDiff = aExp - bExp;
1714     aSig <<= 6;
1715     bSig <<= 6;
1716     if ( 0 < expDiff ) {
1717         if ( aExp == 0xFF ) {
1718             if ( aSig ) return propagateFloat32NaN( a, b );
1719             return a;
1720         }
1721         if ( bExp == 0 ) {
1722             --expDiff;
1723         }
1724         else {
1725             bSig |= 0x20000000;
1726         }
1727         shift32RightJamming( bSig, expDiff, &bSig );
1728         zExp = aExp;
1729     }
1730     else if ( expDiff < 0 ) {
1731         if ( bExp == 0xFF ) {
1732             if ( bSig ) return propagateFloat32NaN( a, b );
1733             return packFloat32( zSign, 0xFF, 0 );
1734         }
1735         if ( aExp == 0 ) {
1736             ++expDiff;
1737         }
1738         else {
1739             aSig |= 0x20000000;
1740         }
1741         shift32RightJamming( aSig, - expDiff, &aSig );
1742         zExp = bExp;
1743     }
1744     else {
1745         if ( aExp == 0xFF ) {
1746             if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1747             return a;
1748         }
1749         if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1750         zSig = 0x40000000 + aSig + bSig;
1751         zExp = aExp;
1752         goto roundAndPack;
1753     }
1754     aSig |= 0x20000000;
1755     zSig = ( aSig + bSig )<<1;
1756     --zExp;
1757     if ( (sbits32) zSig < 0 ) {
1758         zSig = aSig + bSig;
1759         ++zExp;
1760     }
1761  roundAndPack:
1762     return roundAndPackFloat32( zSign, zExp, zSig );
1763 
1764 }
1765 
1766 /*
1767 -------------------------------------------------------------------------------
1768 Returns the result of subtracting the absolute values of the single-
1769 precision floating-point values `a' and `b'.  If `zSign' is 1, the
1770 difference is negated before being returned.  `zSign' is ignored if the
1771 result is a NaN.  The subtraction is performed according to the IEC/IEEE
1772 Standard for Binary Floating-Point Arithmetic.
1773 -------------------------------------------------------------------------------
1774 */
subFloat32Sigs(float32 a,float32 b,flag zSign)1775 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1776 {
1777     int16 aExp, bExp, zExp;
1778     bits32 aSig, bSig, zSig;
1779     int16 expDiff;
1780 
1781     aSig = extractFloat32Frac( a );
1782     aExp = extractFloat32Exp( a );
1783     bSig = extractFloat32Frac( b );
1784     bExp = extractFloat32Exp( b );
1785     expDiff = aExp - bExp;
1786     aSig <<= 7;
1787     bSig <<= 7;
1788     if ( 0 < expDiff ) goto aExpBigger;
1789     if ( expDiff < 0 ) goto bExpBigger;
1790     if ( aExp == 0xFF ) {
1791         if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1792         float_raise( float_flag_invalid );
1793         return float32_default_nan;
1794     }
1795     if ( aExp == 0 ) {
1796         aExp = 1;
1797         bExp = 1;
1798     }
1799     if ( bSig < aSig ) goto aBigger;
1800     if ( aSig < bSig ) goto bBigger;
1801     return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
1802  bExpBigger:
1803     if ( bExp == 0xFF ) {
1804         if ( bSig ) return propagateFloat32NaN( a, b );
1805         return packFloat32( zSign ^ 1, 0xFF, 0 );
1806     }
1807     if ( aExp == 0 ) {
1808         ++expDiff;
1809     }
1810     else {
1811         aSig |= 0x40000000;
1812     }
1813     shift32RightJamming( aSig, - expDiff, &aSig );
1814     bSig |= 0x40000000;
1815  bBigger:
1816     zSig = bSig - aSig;
1817     zExp = bExp;
1818     zSign ^= 1;
1819     goto normalizeRoundAndPack;
1820  aExpBigger:
1821     if ( aExp == 0xFF ) {
1822         if ( aSig ) return propagateFloat32NaN( a, b );
1823         return a;
1824     }
1825     if ( bExp == 0 ) {
1826         --expDiff;
1827     }
1828     else {
1829         bSig |= 0x40000000;
1830     }
1831     shift32RightJamming( bSig, expDiff, &bSig );
1832     aSig |= 0x40000000;
1833  aBigger:
1834     zSig = aSig - bSig;
1835     zExp = aExp;
1836  normalizeRoundAndPack:
1837     --zExp;
1838     return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1839 
1840 }
1841 
1842 /*
1843 -------------------------------------------------------------------------------
1844 Returns the result of adding the single-precision floating-point values `a'
1845 and `b'.  The operation is performed according to the IEC/IEEE Standard for
1846 Binary Floating-Point Arithmetic.
1847 -------------------------------------------------------------------------------
1848 */
float32_add(float32 a,float32 b)1849 float32 float32_add( float32 a, float32 b )
1850 {
1851     flag aSign, bSign;
1852 
1853     aSign = extractFloat32Sign( a );
1854     bSign = extractFloat32Sign( b );
1855     if ( aSign == bSign ) {
1856         return addFloat32Sigs( a, b, aSign );
1857     }
1858     else {
1859         return subFloat32Sigs( a, b, aSign );
1860     }
1861 
1862 }
1863 
1864 /*
1865 -------------------------------------------------------------------------------
1866 Returns the result of subtracting the single-precision floating-point values
1867 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1868 for Binary Floating-Point Arithmetic.
1869 -------------------------------------------------------------------------------
1870 */
float32_sub(float32 a,float32 b)1871 float32 float32_sub( float32 a, float32 b )
1872 {
1873     flag aSign, bSign;
1874 
1875     aSign = extractFloat32Sign( a );
1876     bSign = extractFloat32Sign( b );
1877     if ( aSign == bSign ) {
1878         return subFloat32Sigs( a, b, aSign );
1879     }
1880     else {
1881         return addFloat32Sigs( a, b, aSign );
1882     }
1883 
1884 }
1885 
1886 /*
1887 -------------------------------------------------------------------------------
1888 Returns the result of multiplying the single-precision floating-point values
1889 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1890 for Binary Floating-Point Arithmetic.
1891 -------------------------------------------------------------------------------
1892 */
float32_mul(float32 a,float32 b)1893 float32 float32_mul( float32 a, float32 b )
1894 {
1895     flag aSign, bSign, zSign;
1896     int16 aExp, bExp, zExp;
1897     bits32 aSig, bSig;
1898     bits64 zSig64;
1899     bits32 zSig;
1900 
1901     aSig = extractFloat32Frac( a );
1902     aExp = extractFloat32Exp( a );
1903     aSign = extractFloat32Sign( a );
1904     bSig = extractFloat32Frac( b );
1905     bExp = extractFloat32Exp( b );
1906     bSign = extractFloat32Sign( b );
1907     zSign = aSign ^ bSign;
1908     if ( aExp == 0xFF ) {
1909         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1910             return propagateFloat32NaN( a, b );
1911         }
1912         if ( ( bExp | bSig ) == 0 ) {
1913             float_raise( float_flag_invalid );
1914             return float32_default_nan;
1915         }
1916         return packFloat32( zSign, 0xFF, 0 );
1917     }
1918     if ( bExp == 0xFF ) {
1919         if ( bSig ) return propagateFloat32NaN( a, b );
1920         if ( ( aExp | aSig ) == 0 ) {
1921             float_raise( float_flag_invalid );
1922             return float32_default_nan;
1923         }
1924         return packFloat32( zSign, 0xFF, 0 );
1925     }
1926     if ( aExp == 0 ) {
1927         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1928         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1929     }
1930     if ( bExp == 0 ) {
1931         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1932         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1933     }
1934     zExp = aExp + bExp - 0x7F;
1935     aSig = ( aSig | 0x00800000 )<<7;
1936     bSig = ( bSig | 0x00800000 )<<8;
1937     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1938     zSig = zSig64;
1939     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1940         zSig <<= 1;
1941         --zExp;
1942     }
1943     return roundAndPackFloat32( zSign, zExp, zSig );
1944 
1945 }
1946 
1947 /*
1948 -------------------------------------------------------------------------------
1949 Returns the result of dividing the single-precision floating-point value `a'
1950 by the corresponding value `b'.  The operation is performed according to the
1951 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1952 -------------------------------------------------------------------------------
1953 */
float32_div(float32 a,float32 b)1954 float32 float32_div( float32 a, float32 b )
1955 {
1956     flag aSign, bSign, zSign;
1957     int16 aExp, bExp, zExp;
1958     bits32 aSig, bSig, zSig;
1959 
1960     aSig = extractFloat32Frac( a );
1961     aExp = extractFloat32Exp( a );
1962     aSign = extractFloat32Sign( a );
1963     bSig = extractFloat32Frac( b );
1964     bExp = extractFloat32Exp( b );
1965     bSign = extractFloat32Sign( b );
1966     zSign = aSign ^ bSign;
1967     if ( aExp == 0xFF ) {
1968         if ( aSig ) return propagateFloat32NaN( a, b );
1969         if ( bExp == 0xFF ) {
1970             if ( bSig ) return propagateFloat32NaN( a, b );
1971             float_raise( float_flag_invalid );
1972             return float32_default_nan;
1973         }
1974         return packFloat32( zSign, 0xFF, 0 );
1975     }
1976     if ( bExp == 0xFF ) {
1977         if ( bSig ) return propagateFloat32NaN( a, b );
1978         return packFloat32( zSign, 0, 0 );
1979     }
1980     if ( bExp == 0 ) {
1981         if ( bSig == 0 ) {
1982             if ( ( aExp | aSig ) == 0 ) {
1983                 float_raise( float_flag_invalid );
1984                 return float32_default_nan;
1985             }
1986             float_raise( float_flag_divbyzero );
1987             return packFloat32( zSign, 0xFF, 0 );
1988         }
1989         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1990     }
1991     if ( aExp == 0 ) {
1992         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1993         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1994     }
1995     zExp = aExp - bExp + 0x7D;
1996     aSig = ( aSig | 0x00800000 )<<7;
1997     bSig = ( bSig | 0x00800000 )<<8;
1998     if ( bSig <= ( aSig + aSig ) ) {
1999         aSig >>= 1;
2000         ++zExp;
2001     }
2002     zSig = ( ( (bits64) aSig )<<32 ) / bSig;
2003     if ( ( zSig & 0x3F ) == 0 ) {
2004         zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
2005     }
2006     return roundAndPackFloat32( zSign, zExp, zSig );
2007 
2008 }
2009 
2010 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2011 /*
2012 -------------------------------------------------------------------------------
2013 Returns the remainder of the single-precision floating-point value `a'
2014 with respect to the corresponding value `b'.  The operation is performed
2015 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2016 -------------------------------------------------------------------------------
2017 */
float32_rem(float32 a,float32 b)2018 float32 float32_rem( float32 a, float32 b )
2019 {
2020     flag aSign, bSign, zSign;
2021     int16 aExp, bExp, expDiff;
2022     bits32 aSig, bSig;
2023     bits32 q;
2024     bits64 aSig64, bSig64, q64;
2025     bits32 alternateASig;
2026     sbits32 sigMean;
2027 
2028     aSig = extractFloat32Frac( a );
2029     aExp = extractFloat32Exp( a );
2030     aSign = extractFloat32Sign( a );
2031     bSig = extractFloat32Frac( b );
2032     bExp = extractFloat32Exp( b );
2033     bSign = extractFloat32Sign( b );
2034     if ( aExp == 0xFF ) {
2035         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2036             return propagateFloat32NaN( a, b );
2037         }
2038         float_raise( float_flag_invalid );
2039         return float32_default_nan;
2040     }
2041     if ( bExp == 0xFF ) {
2042         if ( bSig ) return propagateFloat32NaN( a, b );
2043         return a;
2044     }
2045     if ( bExp == 0 ) {
2046         if ( bSig == 0 ) {
2047             float_raise( float_flag_invalid );
2048             return float32_default_nan;
2049         }
2050         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2051     }
2052     if ( aExp == 0 ) {
2053         if ( aSig == 0 ) return a;
2054         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2055     }
2056     expDiff = aExp - bExp;
2057     aSig |= 0x00800000;
2058     bSig |= 0x00800000;
2059     if ( expDiff < 32 ) {
2060         aSig <<= 8;
2061         bSig <<= 8;
2062         if ( expDiff < 0 ) {
2063             if ( expDiff < -1 ) return a;
2064             aSig >>= 1;
2065         }
2066         q = ( bSig <= aSig );
2067         if ( q ) aSig -= bSig;
2068         if ( 0 < expDiff ) {
2069             q = ( ( (bits64) aSig )<<32 ) / bSig;
2070             q >>= 32 - expDiff;
2071             bSig >>= 2;
2072             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2073         }
2074         else {
2075             aSig >>= 2;
2076             bSig >>= 2;
2077         }
2078     }
2079     else {
2080         if ( bSig <= aSig ) aSig -= bSig;
2081         aSig64 = ( (bits64) aSig )<<40;
2082         bSig64 = ( (bits64) bSig )<<40;
2083         expDiff -= 64;
2084         while ( 0 < expDiff ) {
2085             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2086             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2087             aSig64 = - ( ( bSig * q64 )<<38 );
2088             expDiff -= 62;
2089         }
2090         expDiff += 64;
2091         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2092         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2093         q = q64>>( 64 - expDiff );
2094         bSig <<= 6;
2095         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2096     }
2097     do {
2098         alternateASig = aSig;
2099         ++q;
2100         aSig -= bSig;
2101     } while ( 0 <= (sbits32) aSig );
2102     sigMean = aSig + alternateASig;
2103     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2104         aSig = alternateASig;
2105     }
2106     zSign = ( (sbits32) aSig < 0 );
2107     if ( zSign ) aSig = - aSig;
2108     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
2109 
2110 }
2111 #endif /* !SOFTFLOAT_FOR_GCC */
2112 
2113 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2114 /*
2115 -------------------------------------------------------------------------------
2116 Returns the square root of the single-precision floating-point value `a'.
2117 The operation is performed according to the IEC/IEEE Standard for Binary
2118 Floating-Point Arithmetic.
2119 -------------------------------------------------------------------------------
2120 */
float32_sqrt(float32 a)2121 float32 float32_sqrt( float32 a )
2122 {
2123     flag aSign;
2124     int16 aExp, zExp;
2125     bits32 aSig, zSig;
2126     bits64 rem, term;
2127 
2128     aSig = extractFloat32Frac( a );
2129     aExp = extractFloat32Exp( a );
2130     aSign = extractFloat32Sign( a );
2131     if ( aExp == 0xFF ) {
2132         if ( aSig ) return propagateFloat32NaN( a, 0 );
2133         if ( ! aSign ) return a;
2134         float_raise( float_flag_invalid );
2135         return float32_default_nan;
2136     }
2137     if ( aSign ) {
2138         if ( ( aExp | aSig ) == 0 ) return a;
2139         float_raise( float_flag_invalid );
2140         return float32_default_nan;
2141     }
2142     if ( aExp == 0 ) {
2143         if ( aSig == 0 ) return 0;
2144         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2145     }
2146     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2147     aSig = ( aSig | 0x00800000 )<<8;
2148     zSig = estimateSqrt32( aExp, aSig ) + 2;
2149     if ( ( zSig & 0x7F ) <= 5 ) {
2150         if ( zSig < 2 ) {
2151             zSig = 0x7FFFFFFF;
2152             goto roundAndPack;
2153         }
2154         aSig >>= aExp & 1;
2155         term = ( (bits64) zSig ) * zSig;
2156         rem = ( ( (bits64) aSig )<<32 ) - term;
2157         while ( (sbits64) rem < 0 ) {
2158             --zSig;
2159             rem += ( ( (bits64) zSig )<<1 ) | 1;
2160         }
2161         zSig |= ( rem != 0 );
2162     }
2163     shift32RightJamming( zSig, 1, &zSig );
2164  roundAndPack:
2165     return roundAndPackFloat32( 0, zExp, zSig );
2166 
2167 }
2168 #endif /* !SOFTFLOAT_FOR_GCC */
2169 
2170 /*
2171 -------------------------------------------------------------------------------
2172 Returns 1 if the single-precision floating-point value `a' is equal to
2173 the corresponding value `b', and 0 otherwise.  The comparison is performed
2174 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2175 -------------------------------------------------------------------------------
2176 */
float32_eq(float32 a,float32 b)2177 flag float32_eq( float32 a, float32 b )
2178 {
2179 
2180     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2181          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2182        ) {
2183         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2184             float_raise( float_flag_invalid );
2185         }
2186         return 0;
2187     }
2188     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2189 
2190 }
2191 
2192 /*
2193 -------------------------------------------------------------------------------
2194 Returns 1 if the single-precision floating-point value `a' is less than
2195 or equal to the corresponding value `b', and 0 otherwise.  The comparison
2196 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2197 Arithmetic.
2198 -------------------------------------------------------------------------------
2199 */
float32_le(float32 a,float32 b)2200 flag float32_le( float32 a, float32 b )
2201 {
2202     flag aSign, bSign;
2203 
2204     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2205          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2206        ) {
2207         float_raise( float_flag_invalid );
2208         return 0;
2209     }
2210     aSign = extractFloat32Sign( a );
2211     bSign = extractFloat32Sign( b );
2212     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2213     return ( a == b ) || ( aSign ^ ( a < b ) );
2214 
2215 }
2216 
2217 /*
2218 -------------------------------------------------------------------------------
2219 Returns 1 if the single-precision floating-point value `a' is less than
2220 the corresponding value `b', and 0 otherwise.  The comparison is performed
2221 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2222 -------------------------------------------------------------------------------
2223 */
float32_lt(float32 a,float32 b)2224 flag float32_lt( float32 a, float32 b )
2225 {
2226     flag aSign, bSign;
2227 
2228     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2229          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2230        ) {
2231         float_raise( float_flag_invalid );
2232         return 0;
2233     }
2234     aSign = extractFloat32Sign( a );
2235     bSign = extractFloat32Sign( b );
2236     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2237     return ( a != b ) && ( aSign ^ ( a < b ) );
2238 
2239 }
2240 
2241 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2242 /*
2243 -------------------------------------------------------------------------------
2244 Returns 1 if the single-precision floating-point value `a' is equal to
2245 the corresponding value `b', and 0 otherwise.  The invalid exception is
2246 raised if either operand is a NaN.  Otherwise, the comparison is performed
2247 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2248 -------------------------------------------------------------------------------
2249 */
float32_eq_signaling(float32 a,float32 b)2250 flag float32_eq_signaling( float32 a, float32 b )
2251 {
2252 
2253     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2254          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2255        ) {
2256         float_raise( float_flag_invalid );
2257         return 0;
2258     }
2259     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2260 
2261 }
2262 
2263 /*
2264 -------------------------------------------------------------------------------
2265 Returns 1 if the single-precision floating-point value `a' is less than or
2266 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2267 cause an exception.  Otherwise, the comparison is performed according to the
2268 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2269 -------------------------------------------------------------------------------
2270 */
float32_le_quiet(float32 a,float32 b)2271 flag float32_le_quiet( float32 a, float32 b )
2272 {
2273     flag aSign, bSign;
2274 
2275     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2276          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2277        ) {
2278         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2279             float_raise( float_flag_invalid );
2280         }
2281         return 0;
2282     }
2283     aSign = extractFloat32Sign( a );
2284     bSign = extractFloat32Sign( b );
2285     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2286     return ( a == b ) || ( aSign ^ ( a < b ) );
2287 
2288 }
2289 
2290 /*
2291 -------------------------------------------------------------------------------
2292 Returns 1 if the single-precision floating-point value `a' is less than
2293 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2294 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2295 Standard for Binary Floating-Point Arithmetic.
2296 -------------------------------------------------------------------------------
2297 */
float32_lt_quiet(float32 a,float32 b)2298 flag float32_lt_quiet( float32 a, float32 b )
2299 {
2300     flag aSign, bSign;
2301 
2302     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2303          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2304        ) {
2305         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2306             float_raise( float_flag_invalid );
2307         }
2308         return 0;
2309     }
2310     aSign = extractFloat32Sign( a );
2311     bSign = extractFloat32Sign( b );
2312     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2313     return ( a != b ) && ( aSign ^ ( a < b ) );
2314 
2315 }
2316 #endif /* !SOFTFLOAT_FOR_GCC */
2317 
2318 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2319 /*
2320 -------------------------------------------------------------------------------
2321 Returns the result of converting the double-precision floating-point value
2322 `a' to the 32-bit two's complement integer format.  The conversion is
2323 performed according to the IEC/IEEE Standard for Binary Floating-Point
2324 Arithmetic---which means in particular that the conversion is rounded
2325 according to the current rounding mode.  If `a' is a NaN, the largest
2326 positive integer is returned.  Otherwise, if the conversion overflows, the
2327 largest integer with the same sign as `a' is returned.
2328 -------------------------------------------------------------------------------
2329 */
float64_to_int32(float64 a)2330 int32 float64_to_int32( float64 a )
2331 {
2332     flag aSign;
2333     int16 aExp, shiftCount;
2334     bits64 aSig;
2335 
2336     aSig = extractFloat64Frac( a );
2337     aExp = extractFloat64Exp( a );
2338     aSign = extractFloat64Sign( a );
2339     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2340     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2341     shiftCount = 0x42C - aExp;
2342     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2343     return roundAndPackInt32( aSign, aSig );
2344 
2345 }
2346 #endif /* !SOFTFLOAT_FOR_GCC */
2347 
2348 /*
2349 -------------------------------------------------------------------------------
2350 Returns the result of converting the double-precision floating-point value
2351 `a' to the 32-bit two's complement integer format.  The conversion is
2352 performed according to the IEC/IEEE Standard for Binary Floating-Point
2353 Arithmetic, except that the conversion is always rounded toward zero.
2354 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2355 the conversion overflows, the largest integer with the same sign as `a' is
2356 returned.
2357 -------------------------------------------------------------------------------
2358 */
float64_to_int32_round_to_zero(float64 a)2359 int32 float64_to_int32_round_to_zero( float64 a )
2360 {
2361     flag aSign;
2362     int16 aExp, shiftCount;
2363     bits64 aSig, savedASig;
2364     int32 z;
2365 
2366     aSig = extractFloat64Frac( a );
2367     aExp = extractFloat64Exp( a );
2368     aSign = extractFloat64Sign( a );
2369     if ( 0x41E < aExp ) {
2370         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2371         goto invalid;
2372     }
2373     else if ( aExp < 0x3FF ) {
2374         if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2375         return 0;
2376     }
2377     aSig |= LIT64( 0x0010000000000000 );
2378     shiftCount = 0x433 - aExp;
2379     savedASig = aSig;
2380     aSig >>= shiftCount;
2381     z = aSig;
2382     if ( aSign ) z = - z;
2383     if ( ( z < 0 ) ^ aSign ) {
2384  invalid:
2385         float_raise( float_flag_invalid );
2386         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2387     }
2388     if ( ( aSig<<shiftCount ) != savedASig ) {
2389         float_exception_flags |= float_flag_inexact;
2390     }
2391     return z;
2392 
2393 }
2394 
2395 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2396 /*
2397 -------------------------------------------------------------------------------
2398 Returns the result of converting the double-precision floating-point value
2399 `a' to the 64-bit two's complement integer format.  The conversion is
2400 performed according to the IEC/IEEE Standard for Binary Floating-Point
2401 Arithmetic---which means in particular that the conversion is rounded
2402 according to the current rounding mode.  If `a' is a NaN, the largest
2403 positive integer is returned.  Otherwise, if the conversion overflows, the
2404 largest integer with the same sign as `a' is returned.
2405 -------------------------------------------------------------------------------
2406 */
float64_to_int64(float64 a)2407 int64 float64_to_int64( float64 a )
2408 {
2409     flag aSign;
2410     int16 aExp, shiftCount;
2411     bits64 aSig, aSigExtra;
2412 
2413     aSig = extractFloat64Frac( a );
2414     aExp = extractFloat64Exp( a );
2415     aSign = extractFloat64Sign( a );
2416     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2417     shiftCount = 0x433 - aExp;
2418     if ( shiftCount <= 0 ) {
2419         if ( 0x43E < aExp ) {
2420             float_raise( float_flag_invalid );
2421             if (    ! aSign
2422                  || (    ( aExp == 0x7FF )
2423                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2424                ) {
2425                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2426             }
2427             return (sbits64) LIT64( 0x8000000000000000 );
2428         }
2429         aSigExtra = 0;
2430         aSig <<= - shiftCount;
2431     }
2432     else {
2433         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2434     }
2435     return roundAndPackInt64( aSign, aSig, aSigExtra );
2436 
2437 }
2438 
2439 /*
2440 -------------------------------------------------------------------------------
2441 Returns the result of converting the double-precision floating-point value
2442 `a' to the 64-bit two's complement integer format.  The conversion is
2443 performed according to the IEC/IEEE Standard for Binary Floating-Point
2444 Arithmetic, except that the conversion is always rounded toward zero.
2445 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2446 the conversion overflows, the largest integer with the same sign as `a' is
2447 returned.
2448 -------------------------------------------------------------------------------
2449 */
float64_to_int64_round_to_zero(float64 a)2450 int64 float64_to_int64_round_to_zero( float64 a )
2451 {
2452     flag aSign;
2453     int16 aExp, shiftCount;
2454     bits64 aSig;
2455     int64 z;
2456 
2457     aSig = extractFloat64Frac( a );
2458     aExp = extractFloat64Exp( a );
2459     aSign = extractFloat64Sign( a );
2460     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2461     shiftCount = aExp - 0x433;
2462     if ( 0 <= shiftCount ) {
2463         if ( 0x43E <= aExp ) {
2464             if ( a != LIT64( 0xC3E0000000000000 ) ) {
2465                 float_raise( float_flag_invalid );
2466                 if (    ! aSign
2467                      || (    ( aExp == 0x7FF )
2468                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
2469                    ) {
2470                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2471                 }
2472             }
2473             return (sbits64) LIT64( 0x8000000000000000 );
2474         }
2475         z = aSig<<shiftCount;
2476     }
2477     else {
2478         if ( aExp < 0x3FE ) {
2479             if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
2480             return 0;
2481         }
2482         z = aSig>>( - shiftCount );
2483         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2484             float_exception_flags |= float_flag_inexact;
2485         }
2486     }
2487     if ( aSign ) z = - z;
2488     return z;
2489 
2490 }
2491 #endif /* !SOFTFLOAT_FOR_GCC */
2492 
2493 /*
2494 -------------------------------------------------------------------------------
2495 Returns the result of converting the double-precision floating-point value
2496 `a' to the single-precision floating-point format.  The conversion is
2497 performed according to the IEC/IEEE Standard for Binary Floating-Point
2498 Arithmetic.
2499 -------------------------------------------------------------------------------
2500 */
float64_to_float32(float64 a)2501 float32 float64_to_float32( float64 a )
2502 {
2503     flag aSign;
2504     int16 aExp;
2505     bits64 aSig;
2506     bits32 zSig;
2507 
2508     aSig = extractFloat64Frac( a );
2509     aExp = extractFloat64Exp( a );
2510     aSign = extractFloat64Sign( a );
2511     if ( aExp == 0x7FF ) {
2512         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2513         return packFloat32( aSign, 0xFF, 0 );
2514     }
2515     shift64RightJamming( aSig, 22, &aSig );
2516     zSig = aSig;
2517     if ( aExp || zSig ) {
2518         zSig |= 0x40000000;
2519         aExp -= 0x381;
2520     }
2521     return roundAndPackFloat32( aSign, aExp, zSig );
2522 
2523 }
2524 
2525 #ifdef FLOATX80
2526 
2527 /*
2528 -------------------------------------------------------------------------------
2529 Returns the result of converting the double-precision floating-point value
2530 `a' to the extended double-precision floating-point format.  The conversion
2531 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2532 Arithmetic.
2533 -------------------------------------------------------------------------------
2534 */
float64_to_floatx80(float64 a)2535 floatx80 float64_to_floatx80( float64 a )
2536 {
2537     flag aSign;
2538     int16 aExp;
2539     bits64 aSig;
2540 
2541     aSig = extractFloat64Frac( a );
2542     aExp = extractFloat64Exp( a );
2543     aSign = extractFloat64Sign( a );
2544     if ( aExp == 0x7FF ) {
2545         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2546         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2547     }
2548     if ( aExp == 0 ) {
2549         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2550         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2551     }
2552     return
2553         packFloatx80(
2554             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2555 
2556 }
2557 
2558 #endif
2559 
2560 #ifdef FLOAT128
2561 
2562 /*
2563 -------------------------------------------------------------------------------
2564 Returns the result of converting the double-precision floating-point value
2565 `a' to the quadruple-precision floating-point format.  The conversion is
2566 performed according to the IEC/IEEE Standard for Binary Floating-Point
2567 Arithmetic.
2568 -------------------------------------------------------------------------------
2569 */
float64_to_float128(float64 a)2570 float128 float64_to_float128( float64 a )
2571 {
2572     flag aSign;
2573     int16 aExp;
2574     bits64 aSig, zSig0, zSig1;
2575 
2576     aSig = extractFloat64Frac( a );
2577     aExp = extractFloat64Exp( a );
2578     aSign = extractFloat64Sign( a );
2579     if ( aExp == 0x7FF ) {
2580         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2581         return packFloat128( aSign, 0x7FFF, 0, 0 );
2582     }
2583     if ( aExp == 0 ) {
2584         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2585         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2586         --aExp;
2587     }
2588     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2589     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2590 
2591 }
2592 
2593 #endif
2594 
2595 #ifndef SOFTFLOAT_FOR_GCC
2596 /*
2597 -------------------------------------------------------------------------------
2598 Rounds the double-precision floating-point value `a' to an integer, and
2599 returns the result as a double-precision floating-point value.  The
2600 operation is performed according to the IEC/IEEE Standard for Binary
2601 Floating-Point Arithmetic.
2602 -------------------------------------------------------------------------------
2603 */
float64_round_to_int(float64 a)2604 float64 float64_round_to_int( float64 a )
2605 {
2606     flag aSign;
2607     int16 aExp;
2608     bits64 lastBitMask, roundBitsMask;
2609     int8 roundingMode;
2610     float64 z;
2611 
2612     aExp = extractFloat64Exp( a );
2613     if ( 0x433 <= aExp ) {
2614         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2615             return propagateFloat64NaN( a, a );
2616         }
2617         return a;
2618     }
2619     if ( aExp < 0x3FF ) {
2620         if ( (bits64) ( a<<1 ) == 0 ) return a;
2621         float_exception_flags |= float_flag_inexact;
2622         aSign = extractFloat64Sign( a );
2623         switch ( float_rounding_mode ) {
2624          case float_round_nearest_even:
2625             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2626                 return packFloat64( aSign, 0x3FF, 0 );
2627             }
2628             break;
2629 	 case float_round_to_zero:
2630 	    break;
2631          case float_round_down:
2632             return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2633          case float_round_up:
2634             return
2635             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2636         }
2637         return packFloat64( aSign, 0, 0 );
2638     }
2639     lastBitMask = 1;
2640     lastBitMask <<= 0x433 - aExp;
2641     roundBitsMask = lastBitMask - 1;
2642     z = a;
2643     roundingMode = float_rounding_mode;
2644     if ( roundingMode == float_round_nearest_even ) {
2645         z += lastBitMask>>1;
2646         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2647     }
2648     else if ( roundingMode != float_round_to_zero ) {
2649         if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2650             z += roundBitsMask;
2651         }
2652     }
2653     z &= ~ roundBitsMask;
2654     if ( z != a ) float_exception_flags |= float_flag_inexact;
2655     return z;
2656 
2657 }
2658 #endif
2659 
2660 /*
2661 -------------------------------------------------------------------------------
2662 Returns the result of adding the absolute values of the double-precision
2663 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2664 before being returned.  `zSign' is ignored if the result is a NaN.
2665 The addition is performed according to the IEC/IEEE Standard for Binary
2666 Floating-Point Arithmetic.
2667 -------------------------------------------------------------------------------
2668 */
addFloat64Sigs(float64 a,float64 b,flag zSign)2669 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2670 {
2671     int16 aExp, bExp, zExp;
2672     bits64 aSig, bSig, zSig;
2673     int16 expDiff;
2674 
2675     aSig = extractFloat64Frac( a );
2676     aExp = extractFloat64Exp( a );
2677     bSig = extractFloat64Frac( b );
2678     bExp = extractFloat64Exp( b );
2679     expDiff = aExp - bExp;
2680     aSig <<= 9;
2681     bSig <<= 9;
2682     if ( 0 < expDiff ) {
2683         if ( aExp == 0x7FF ) {
2684             if ( aSig ) return propagateFloat64NaN( a, b );
2685             return a;
2686         }
2687         if ( bExp == 0 ) {
2688             --expDiff;
2689         }
2690         else {
2691             bSig |= LIT64( 0x2000000000000000 );
2692         }
2693         shift64RightJamming( bSig, expDiff, &bSig );
2694         zExp = aExp;
2695     }
2696     else if ( expDiff < 0 ) {
2697         if ( bExp == 0x7FF ) {
2698             if ( bSig ) return propagateFloat64NaN( a, b );
2699             return packFloat64( zSign, 0x7FF, 0 );
2700         }
2701         if ( aExp == 0 ) {
2702             ++expDiff;
2703         }
2704         else {
2705             aSig |= LIT64( 0x2000000000000000 );
2706         }
2707         shift64RightJamming( aSig, - expDiff, &aSig );
2708         zExp = bExp;
2709     }
2710     else {
2711         if ( aExp == 0x7FF ) {
2712             if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2713             return a;
2714         }
2715         if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2716         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2717         zExp = aExp;
2718         goto roundAndPack;
2719     }
2720     aSig |= LIT64( 0x2000000000000000 );
2721     zSig = ( aSig + bSig )<<1;
2722     --zExp;
2723     if ( (sbits64) zSig < 0 ) {
2724         zSig = aSig + bSig;
2725         ++zExp;
2726     }
2727  roundAndPack:
2728     return roundAndPackFloat64( zSign, zExp, zSig );
2729 
2730 }
2731 
2732 /*
2733 -------------------------------------------------------------------------------
2734 Returns the result of subtracting the absolute values of the double-
2735 precision floating-point values `a' and `b'.  If `zSign' is 1, the
2736 difference is negated before being returned.  `zSign' is ignored if the
2737 result is a NaN.  The subtraction is performed according to the IEC/IEEE
2738 Standard for Binary Floating-Point Arithmetic.
2739 -------------------------------------------------------------------------------
2740 */
subFloat64Sigs(float64 a,float64 b,flag zSign)2741 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2742 {
2743     int16 aExp, bExp, zExp;
2744     bits64 aSig, bSig, zSig;
2745     int16 expDiff;
2746 
2747     aSig = extractFloat64Frac( a );
2748     aExp = extractFloat64Exp( a );
2749     bSig = extractFloat64Frac( b );
2750     bExp = extractFloat64Exp( b );
2751     expDiff = aExp - bExp;
2752     aSig <<= 10;
2753     bSig <<= 10;
2754     if ( 0 < expDiff ) goto aExpBigger;
2755     if ( expDiff < 0 ) goto bExpBigger;
2756     if ( aExp == 0x7FF ) {
2757         if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2758         float_raise( float_flag_invalid );
2759         return float64_default_nan;
2760     }
2761     if ( aExp == 0 ) {
2762         aExp = 1;
2763         bExp = 1;
2764     }
2765     if ( bSig < aSig ) goto aBigger;
2766     if ( aSig < bSig ) goto bBigger;
2767     return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
2768  bExpBigger:
2769     if ( bExp == 0x7FF ) {
2770         if ( bSig ) return propagateFloat64NaN( a, b );
2771         return packFloat64( zSign ^ 1, 0x7FF, 0 );
2772     }
2773     if ( aExp == 0 ) {
2774         ++expDiff;
2775     }
2776     else {
2777         aSig |= LIT64( 0x4000000000000000 );
2778     }
2779     shift64RightJamming( aSig, - expDiff, &aSig );
2780     bSig |= LIT64( 0x4000000000000000 );
2781  bBigger:
2782     zSig = bSig - aSig;
2783     zExp = bExp;
2784     zSign ^= 1;
2785     goto normalizeRoundAndPack;
2786  aExpBigger:
2787     if ( aExp == 0x7FF ) {
2788         if ( aSig ) return propagateFloat64NaN( a, b );
2789         return a;
2790     }
2791     if ( bExp == 0 ) {
2792         --expDiff;
2793     }
2794     else {
2795         bSig |= LIT64( 0x4000000000000000 );
2796     }
2797     shift64RightJamming( bSig, expDiff, &bSig );
2798     aSig |= LIT64( 0x4000000000000000 );
2799  aBigger:
2800     zSig = aSig - bSig;
2801     zExp = aExp;
2802  normalizeRoundAndPack:
2803     --zExp;
2804     return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2805 
2806 }
2807 
2808 /*
2809 -------------------------------------------------------------------------------
2810 Returns the result of adding the double-precision floating-point values `a'
2811 and `b'.  The operation is performed according to the IEC/IEEE Standard for
2812 Binary Floating-Point Arithmetic.
2813 -------------------------------------------------------------------------------
2814 */
float64_add(float64 a,float64 b)2815 float64 float64_add( float64 a, float64 b )
2816 {
2817     flag aSign, bSign;
2818 
2819     aSign = extractFloat64Sign( a );
2820     bSign = extractFloat64Sign( b );
2821     if ( aSign == bSign ) {
2822         return addFloat64Sigs( a, b, aSign );
2823     }
2824     else {
2825         return subFloat64Sigs( a, b, aSign );
2826     }
2827 
2828 }
2829 
2830 /*
2831 -------------------------------------------------------------------------------
2832 Returns the result of subtracting the double-precision floating-point values
2833 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2834 for Binary Floating-Point Arithmetic.
2835 -------------------------------------------------------------------------------
2836 */
float64_sub(float64 a,float64 b)2837 float64 float64_sub( float64 a, float64 b )
2838 {
2839     flag aSign, bSign;
2840 
2841     aSign = extractFloat64Sign( a );
2842     bSign = extractFloat64Sign( b );
2843     if ( aSign == bSign ) {
2844         return subFloat64Sigs( a, b, aSign );
2845     }
2846     else {
2847         return addFloat64Sigs( a, b, aSign );
2848     }
2849 
2850 }
2851 
2852 /*
2853 -------------------------------------------------------------------------------
2854 Returns the result of multiplying the double-precision floating-point values
2855 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2856 for Binary Floating-Point Arithmetic.
2857 -------------------------------------------------------------------------------
2858 */
float64_mul(float64 a,float64 b)2859 float64 float64_mul( float64 a, float64 b )
2860 {
2861     flag aSign, bSign, zSign;
2862     int16 aExp, bExp, zExp;
2863     bits64 aSig, bSig, zSig0, zSig1;
2864 
2865     aSig = extractFloat64Frac( a );
2866     aExp = extractFloat64Exp( a );
2867     aSign = extractFloat64Sign( a );
2868     bSig = extractFloat64Frac( b );
2869     bExp = extractFloat64Exp( b );
2870     bSign = extractFloat64Sign( b );
2871     zSign = aSign ^ bSign;
2872     if ( aExp == 0x7FF ) {
2873         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2874             return propagateFloat64NaN( a, b );
2875         }
2876         if ( ( bExp | bSig ) == 0 ) {
2877             float_raise( float_flag_invalid );
2878             return float64_default_nan;
2879         }
2880         return packFloat64( zSign, 0x7FF, 0 );
2881     }
2882     if ( bExp == 0x7FF ) {
2883         if ( bSig ) return propagateFloat64NaN( a, b );
2884         if ( ( aExp | aSig ) == 0 ) {
2885             float_raise( float_flag_invalid );
2886             return float64_default_nan;
2887         }
2888         return packFloat64( zSign, 0x7FF, 0 );
2889     }
2890     if ( aExp == 0 ) {
2891         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2892         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2893     }
2894     if ( bExp == 0 ) {
2895         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2896         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2897     }
2898     zExp = aExp + bExp - 0x3FF;
2899     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2900     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2901     mul64To128( aSig, bSig, &zSig0, &zSig1 );
2902     zSig0 |= ( zSig1 != 0 );
2903     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2904         zSig0 <<= 1;
2905         --zExp;
2906     }
2907     return roundAndPackFloat64( zSign, zExp, zSig0 );
2908 
2909 }
2910 
2911 /*
2912 -------------------------------------------------------------------------------
2913 Returns the result of dividing the double-precision floating-point value `a'
2914 by the corresponding value `b'.  The operation is performed according to
2915 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2916 -------------------------------------------------------------------------------
2917 */
float64_div(float64 a,float64 b)2918 float64 float64_div( float64 a, float64 b )
2919 {
2920     flag aSign, bSign, zSign;
2921     int16 aExp, bExp, zExp;
2922     bits64 aSig, bSig, zSig;
2923     bits64 rem0, rem1;
2924     bits64 term0, term1;
2925 
2926     aSig = extractFloat64Frac( a );
2927     aExp = extractFloat64Exp( a );
2928     aSign = extractFloat64Sign( a );
2929     bSig = extractFloat64Frac( b );
2930     bExp = extractFloat64Exp( b );
2931     bSign = extractFloat64Sign( b );
2932     zSign = aSign ^ bSign;
2933     if ( aExp == 0x7FF ) {
2934         if ( aSig ) return propagateFloat64NaN( a, b );
2935         if ( bExp == 0x7FF ) {
2936             if ( bSig ) return propagateFloat64NaN( a, b );
2937             float_raise( float_flag_invalid );
2938             return float64_default_nan;
2939         }
2940         return packFloat64( zSign, 0x7FF, 0 );
2941     }
2942     if ( bExp == 0x7FF ) {
2943         if ( bSig ) return propagateFloat64NaN( a, b );
2944         return packFloat64( zSign, 0, 0 );
2945     }
2946     if ( bExp == 0 ) {
2947         if ( bSig == 0 ) {
2948             if ( ( aExp | aSig ) == 0 ) {
2949                 float_raise( float_flag_invalid );
2950                 return float64_default_nan;
2951             }
2952             float_raise( float_flag_divbyzero );
2953             return packFloat64( zSign, 0x7FF, 0 );
2954         }
2955         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2956     }
2957     if ( aExp == 0 ) {
2958         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2959         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2960     }
2961     zExp = aExp - bExp + 0x3FD;
2962     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2963     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2964     if ( bSig <= ( aSig + aSig ) ) {
2965         aSig >>= 1;
2966         ++zExp;
2967     }
2968     zSig = estimateDiv128To64( aSig, 0, bSig );
2969     if ( ( zSig & 0x1FF ) <= 2 ) {
2970         mul64To128( bSig, zSig, &term0, &term1 );
2971         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2972         while ( (sbits64) rem0 < 0 ) {
2973             --zSig;
2974             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2975         }
2976         zSig |= ( rem1 != 0 );
2977     }
2978     return roundAndPackFloat64( zSign, zExp, zSig );
2979 
2980 }
2981 
2982 #ifndef SOFTFLOAT_FOR_GCC
2983 /*
2984 -------------------------------------------------------------------------------
2985 Returns the remainder of the double-precision floating-point value `a'
2986 with respect to the corresponding value `b'.  The operation is performed
2987 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2988 -------------------------------------------------------------------------------
2989 */
float64_rem(float64 a,float64 b)2990 float64 float64_rem( float64 a, float64 b )
2991 {
2992     flag aSign, bSign, zSign;
2993     int16 aExp, bExp, expDiff;
2994     bits64 aSig, bSig;
2995     bits64 q, alternateASig;
2996     sbits64 sigMean;
2997 
2998     aSig = extractFloat64Frac( a );
2999     aExp = extractFloat64Exp( a );
3000     aSign = extractFloat64Sign( a );
3001     bSig = extractFloat64Frac( b );
3002     bExp = extractFloat64Exp( b );
3003     bSign = extractFloat64Sign( b );
3004     if ( aExp == 0x7FF ) {
3005         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3006             return propagateFloat64NaN( a, b );
3007         }
3008         float_raise( float_flag_invalid );
3009         return float64_default_nan;
3010     }
3011     if ( bExp == 0x7FF ) {
3012         if ( bSig ) return propagateFloat64NaN( a, b );
3013         return a;
3014     }
3015     if ( bExp == 0 ) {
3016         if ( bSig == 0 ) {
3017             float_raise( float_flag_invalid );
3018             return float64_default_nan;
3019         }
3020         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3021     }
3022     if ( aExp == 0 ) {
3023         if ( aSig == 0 ) return a;
3024         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3025     }
3026     expDiff = aExp - bExp;
3027     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3028     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3029     if ( expDiff < 0 ) {
3030         if ( expDiff < -1 ) return a;
3031         aSig >>= 1;
3032     }
3033     q = ( bSig <= aSig );
3034     if ( q ) aSig -= bSig;
3035     expDiff -= 64;
3036     while ( 0 < expDiff ) {
3037         q = estimateDiv128To64( aSig, 0, bSig );
3038         q = ( 2 < q ) ? q - 2 : 0;
3039         aSig = - ( ( bSig>>2 ) * q );
3040         expDiff -= 62;
3041     }
3042     expDiff += 64;
3043     if ( 0 < expDiff ) {
3044         q = estimateDiv128To64( aSig, 0, bSig );
3045         q = ( 2 < q ) ? q - 2 : 0;
3046         q >>= 64 - expDiff;
3047         bSig >>= 2;
3048         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3049     }
3050     else {
3051         aSig >>= 2;
3052         bSig >>= 2;
3053     }
3054     do {
3055         alternateASig = aSig;
3056         ++q;
3057         aSig -= bSig;
3058     } while ( 0 <= (sbits64) aSig );
3059     sigMean = aSig + alternateASig;
3060     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3061         aSig = alternateASig;
3062     }
3063     zSign = ( (sbits64) aSig < 0 );
3064     if ( zSign ) aSig = - aSig;
3065     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
3066 
3067 }
3068 
3069 /*
3070 -------------------------------------------------------------------------------
3071 Returns the square root of the double-precision floating-point value `a'.
3072 The operation is performed according to the IEC/IEEE Standard for Binary
3073 Floating-Point Arithmetic.
3074 -------------------------------------------------------------------------------
3075 */
float64_sqrt(float64 a)3076 float64 float64_sqrt( float64 a )
3077 {
3078     flag aSign;
3079     int16 aExp, zExp;
3080     bits64 aSig, zSig, doubleZSig;
3081     bits64 rem0, rem1, term0, term1;
3082 
3083     aSig = extractFloat64Frac( a );
3084     aExp = extractFloat64Exp( a );
3085     aSign = extractFloat64Sign( a );
3086     if ( aExp == 0x7FF ) {
3087         if ( aSig ) return propagateFloat64NaN( a, a );
3088         if ( ! aSign ) return a;
3089         float_raise( float_flag_invalid );
3090         return float64_default_nan;
3091     }
3092     if ( aSign ) {
3093         if ( ( aExp | aSig ) == 0 ) return a;
3094         float_raise( float_flag_invalid );
3095         return float64_default_nan;
3096     }
3097     if ( aExp == 0 ) {
3098         if ( aSig == 0 ) return 0;
3099         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3100     }
3101     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3102     aSig |= LIT64( 0x0010000000000000 );
3103     zSig = estimateSqrt32( aExp, aSig>>21 );
3104     aSig <<= 9 - ( aExp & 1 );
3105     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3106     if ( ( zSig & 0x1FF ) <= 5 ) {
3107         doubleZSig = zSig<<1;
3108         mul64To128( zSig, zSig, &term0, &term1 );
3109         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3110         while ( (sbits64) rem0 < 0 ) {
3111             --zSig;
3112             doubleZSig -= 2;
3113             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3114         }
3115         zSig |= ( ( rem0 | rem1 ) != 0 );
3116     }
3117     return roundAndPackFloat64( 0, zExp, zSig );
3118 
3119 }
3120 #endif
3121 
3122 /*
3123 -------------------------------------------------------------------------------
3124 Returns 1 if the double-precision floating-point value `a' is equal to the
3125 corresponding value `b', and 0 otherwise.  The comparison is performed
3126 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3127 -------------------------------------------------------------------------------
3128 */
float64_eq(float64 a,float64 b)3129 flag float64_eq( float64 a, float64 b )
3130 {
3131 
3132     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3133          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3134        ) {
3135         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3136             float_raise( float_flag_invalid );
3137         }
3138         return 0;
3139     }
3140     return ( a == b ) ||
3141 	( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
3142 
3143 }
3144 
3145 /*
3146 -------------------------------------------------------------------------------
3147 Returns 1 if the double-precision floating-point value `a' is less than or
3148 equal to the corresponding value `b', and 0 otherwise.  The comparison is
3149 performed according to the IEC/IEEE Standard for Binary Floating-Point
3150 Arithmetic.
3151 -------------------------------------------------------------------------------
3152 */
float64_le(float64 a,float64 b)3153 flag float64_le( float64 a, float64 b )
3154 {
3155     flag aSign, bSign;
3156 
3157     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3158          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3159        ) {
3160         float_raise( float_flag_invalid );
3161         return 0;
3162     }
3163     aSign = extractFloat64Sign( a );
3164     bSign = extractFloat64Sign( b );
3165     if ( aSign != bSign )
3166 	return aSign ||
3167 	    ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
3168 	      0 );
3169     return ( a == b ) ||
3170 	( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3171 
3172 }
3173 
3174 /*
3175 -------------------------------------------------------------------------------
3176 Returns 1 if the double-precision floating-point value `a' is less than
3177 the corresponding value `b', and 0 otherwise.  The comparison is performed
3178 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3179 -------------------------------------------------------------------------------
3180 */
float64_lt(float64 a,float64 b)3181 flag float64_lt( float64 a, float64 b )
3182 {
3183     flag aSign, bSign;
3184 
3185     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3186          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3187        ) {
3188         float_raise( float_flag_invalid );
3189         return 0;
3190     }
3191     aSign = extractFloat64Sign( a );
3192     bSign = extractFloat64Sign( b );
3193     if ( aSign != bSign )
3194 	return aSign &&
3195 	    ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
3196 	      0 );
3197     return ( a != b ) &&
3198 	( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3199 
3200 }
3201 
3202 #ifndef SOFTFLOAT_FOR_GCC
3203 /*
3204 -------------------------------------------------------------------------------
3205 Returns 1 if the double-precision floating-point value `a' is equal to the
3206 corresponding value `b', and 0 otherwise.  The invalid exception is raised
3207 if either operand is a NaN.  Otherwise, the comparison is performed
3208 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3209 -------------------------------------------------------------------------------
3210 */
float64_eq_signaling(float64 a,float64 b)3211 flag float64_eq_signaling( float64 a, float64 b )
3212 {
3213 
3214     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3215          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3216        ) {
3217         float_raise( float_flag_invalid );
3218         return 0;
3219     }
3220     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3221 
3222 }
3223 
3224 /*
3225 -------------------------------------------------------------------------------
3226 Returns 1 if the double-precision floating-point value `a' is less than or
3227 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3228 cause an exception.  Otherwise, the comparison is performed according to the
3229 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3230 -------------------------------------------------------------------------------
3231 */
float64_le_quiet(float64 a,float64 b)3232 flag float64_le_quiet( float64 a, float64 b )
3233 {
3234     flag aSign, bSign;
3235 
3236     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3237          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3238        ) {
3239         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3240             float_raise( float_flag_invalid );
3241         }
3242         return 0;
3243     }
3244     aSign = extractFloat64Sign( a );
3245     bSign = extractFloat64Sign( b );
3246     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3247     return ( a == b ) || ( aSign ^ ( a < b ) );
3248 
3249 }
3250 
3251 /*
3252 -------------------------------------------------------------------------------
3253 Returns 1 if the double-precision floating-point value `a' is less than
3254 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3255 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3256 Standard for Binary Floating-Point Arithmetic.
3257 -------------------------------------------------------------------------------
3258 */
float64_lt_quiet(float64 a,float64 b)3259 flag float64_lt_quiet( float64 a, float64 b )
3260 {
3261     flag aSign, bSign;
3262 
3263     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3264          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3265        ) {
3266         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3267             float_raise( float_flag_invalid );
3268         }
3269         return 0;
3270     }
3271     aSign = extractFloat64Sign( a );
3272     bSign = extractFloat64Sign( b );
3273     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3274     return ( a != b ) && ( aSign ^ ( a < b ) );
3275 
3276 }
3277 #endif
3278 
3279 #ifdef FLOATX80
3280 
3281 /*
3282 -------------------------------------------------------------------------------
3283 Returns the result of converting the extended double-precision floating-
3284 point value `a' to the 32-bit two's complement integer format.  The
3285 conversion is performed according to the IEC/IEEE Standard for Binary
3286 Floating-Point Arithmetic---which means in particular that the conversion
3287 is rounded according to the current rounding mode.  If `a' is a NaN, the
3288 largest positive integer is returned.  Otherwise, if the conversion
3289 overflows, the largest integer with the same sign as `a' is returned.
3290 -------------------------------------------------------------------------------
3291 */
floatx80_to_int32(floatx80 a)3292 int32 floatx80_to_int32( floatx80 a )
3293 {
3294     flag aSign;
3295     int32 aExp, shiftCount;
3296     bits64 aSig;
3297 
3298     aSig = extractFloatx80Frac( a );
3299     aExp = extractFloatx80Exp( a );
3300     aSign = extractFloatx80Sign( a );
3301     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3302     shiftCount = 0x4037 - aExp;
3303     if ( shiftCount <= 0 ) shiftCount = 1;
3304     shift64RightJamming( aSig, shiftCount, &aSig );
3305     return roundAndPackInt32( aSign, aSig );
3306 
3307 }
3308 
3309 /*
3310 -------------------------------------------------------------------------------
3311 Returns the result of converting the extended double-precision floating-
3312 point value `a' to the 32-bit two's complement integer format.  The
3313 conversion is performed according to the IEC/IEEE Standard for Binary
3314 Floating-Point Arithmetic, except that the conversion is always rounded
3315 toward zero.  If `a' is a NaN, the largest positive integer is returned.
3316 Otherwise, if the conversion overflows, the largest integer with the same
3317 sign as `a' is returned.
3318 -------------------------------------------------------------------------------
3319 */
floatx80_to_int32_round_to_zero(floatx80 a)3320 int32 floatx80_to_int32_round_to_zero( floatx80 a )
3321 {
3322     flag aSign;
3323     int32 aExp, shiftCount;
3324     bits64 aSig, savedASig;
3325     int32 z;
3326 
3327     aSig = extractFloatx80Frac( a );
3328     aExp = extractFloatx80Exp( a );
3329     aSign = extractFloatx80Sign( a );
3330     if ( 0x401E < aExp ) {
3331         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3332         goto invalid;
3333     }
3334     else if ( aExp < 0x3FFF ) {
3335         if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
3336         return 0;
3337     }
3338     shiftCount = 0x403E - aExp;
3339     savedASig = aSig;
3340     aSig >>= shiftCount;
3341     z = aSig;
3342     if ( aSign ) z = - z;
3343     if ( ( z < 0 ) ^ aSign ) {
3344  invalid:
3345         float_raise( float_flag_invalid );
3346         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3347     }
3348     if ( ( aSig<<shiftCount ) != savedASig ) {
3349         float_exception_flags |= float_flag_inexact;
3350     }
3351     return z;
3352 
3353 }
3354 
3355 /*
3356 -------------------------------------------------------------------------------
3357 Returns the result of converting the extended double-precision floating-
3358 point value `a' to the 64-bit two's complement integer format.  The
3359 conversion is performed according to the IEC/IEEE Standard for Binary
3360 Floating-Point Arithmetic---which means in particular that the conversion
3361 is rounded according to the current rounding mode.  If `a' is a NaN,
3362 the largest positive integer is returned.  Otherwise, if the conversion
3363 overflows, the largest integer with the same sign as `a' is returned.
3364 -------------------------------------------------------------------------------
3365 */
floatx80_to_int64(floatx80 a)3366 int64 floatx80_to_int64( floatx80 a )
3367 {
3368     flag aSign;
3369     int32 aExp, shiftCount;
3370     bits64 aSig, aSigExtra;
3371 
3372     aSig = extractFloatx80Frac( a );
3373     aExp = extractFloatx80Exp( a );
3374     aSign = extractFloatx80Sign( a );
3375     shiftCount = 0x403E - aExp;
3376     if ( shiftCount <= 0 ) {
3377         if ( shiftCount ) {
3378             float_raise( float_flag_invalid );
3379             if (    ! aSign
3380                  || (    ( aExp == 0x7FFF )
3381                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
3382                ) {
3383                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3384             }
3385             return (sbits64) LIT64( 0x8000000000000000 );
3386         }
3387         aSigExtra = 0;
3388     }
3389     else {
3390         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3391     }
3392     return roundAndPackInt64( aSign, aSig, aSigExtra );
3393 
3394 }
3395 
3396 /*
3397 -------------------------------------------------------------------------------
3398 Returns the result of converting the extended double-precision floating-
3399 point value `a' to the 64-bit two's complement integer format.  The
3400 conversion is performed according to the IEC/IEEE Standard for Binary
3401 Floating-Point Arithmetic, except that the conversion is always rounded
3402 toward zero.  If `a' is a NaN, the largest positive integer is returned.
3403 Otherwise, if the conversion overflows, the largest integer with the same
3404 sign as `a' is returned.
3405 -------------------------------------------------------------------------------
3406 */
floatx80_to_int64_round_to_zero(floatx80 a)3407 int64 floatx80_to_int64_round_to_zero( floatx80 a )
3408 {
3409     flag aSign;
3410     int32 aExp, shiftCount;
3411     bits64 aSig;
3412     int64 z;
3413 
3414     aSig = extractFloatx80Frac( a );
3415     aExp = extractFloatx80Exp( a );
3416     aSign = extractFloatx80Sign( a );
3417     shiftCount = aExp - 0x403E;
3418     if ( 0 <= shiftCount ) {
3419         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3420         if ( ( a.high != 0xC03E ) || aSig ) {
3421             float_raise( float_flag_invalid );
3422             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3423                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3424             }
3425         }
3426         return (sbits64) LIT64( 0x8000000000000000 );
3427     }
3428     else if ( aExp < 0x3FFF ) {
3429         if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
3430         return 0;
3431     }
3432     z = aSig>>( - shiftCount );
3433     if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3434         float_exception_flags |= float_flag_inexact;
3435     }
3436     if ( aSign ) z = - z;
3437     return z;
3438 
3439 }
3440 
3441 /*
3442 -------------------------------------------------------------------------------
3443 Returns the result of converting the extended double-precision floating-
3444 point value `a' to the single-precision floating-point format.  The
3445 conversion is performed according to the IEC/IEEE Standard for Binary
3446 Floating-Point Arithmetic.
3447 -------------------------------------------------------------------------------
3448 */
floatx80_to_float32(floatx80 a)3449 float32 floatx80_to_float32( floatx80 a )
3450 {
3451     flag aSign;
3452     int32 aExp;
3453     bits64 aSig;
3454 
3455     aSig = extractFloatx80Frac( a );
3456     aExp = extractFloatx80Exp( a );
3457     aSign = extractFloatx80Sign( a );
3458     if ( aExp == 0x7FFF ) {
3459         if ( (bits64) ( aSig<<1 ) ) {
3460             return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3461         }
3462         return packFloat32( aSign, 0xFF, 0 );
3463     }
3464     shift64RightJamming( aSig, 33, &aSig );
3465     if ( aExp || aSig ) aExp -= 0x3F81;
3466     return roundAndPackFloat32( aSign, aExp, aSig );
3467 
3468 }
3469 
3470 /*
3471 -------------------------------------------------------------------------------
3472 Returns the result of converting the extended double-precision floating-
3473 point value `a' to the double-precision floating-point format.  The
3474 conversion is performed according to the IEC/IEEE Standard for Binary
3475 Floating-Point Arithmetic.
3476 -------------------------------------------------------------------------------
3477 */
floatx80_to_float64(floatx80 a)3478 float64 floatx80_to_float64( floatx80 a )
3479 {
3480     flag aSign;
3481     int32 aExp;
3482     bits64 aSig, zSig;
3483 
3484     aSig = extractFloatx80Frac( a );
3485     aExp = extractFloatx80Exp( a );
3486     aSign = extractFloatx80Sign( a );
3487     if ( aExp == 0x7FFF ) {
3488         if ( (bits64) ( aSig<<1 ) ) {
3489             return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3490         }
3491         return packFloat64( aSign, 0x7FF, 0 );
3492     }
3493     shift64RightJamming( aSig, 1, &zSig );
3494     if ( aExp || aSig ) aExp -= 0x3C01;
3495     return roundAndPackFloat64( aSign, aExp, zSig );
3496 
3497 }
3498 
3499 #ifdef FLOAT128
3500 
3501 /*
3502 -------------------------------------------------------------------------------
3503 Returns the result of converting the extended double-precision floating-
3504 point value `a' to the quadruple-precision floating-point format.  The
3505 conversion is performed according to the IEC/IEEE Standard for Binary
3506 Floating-Point Arithmetic.
3507 -------------------------------------------------------------------------------
3508 */
floatx80_to_float128(floatx80 a)3509 float128 floatx80_to_float128( floatx80 a )
3510 {
3511     flag aSign;
3512     int16 aExp;
3513     bits64 aSig, zSig0, zSig1;
3514 
3515     aSig = extractFloatx80Frac( a );
3516     aExp = extractFloatx80Exp( a );
3517     aSign = extractFloatx80Sign( a );
3518     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3519         return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3520     }
3521     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3522     return packFloat128( aSign, aExp, zSig0, zSig1 );
3523 
3524 }
3525 
3526 #endif
3527 
3528 /*
3529 -------------------------------------------------------------------------------
3530 Rounds the extended double-precision floating-point value `a' to an integer,
3531 and returns the result as an extended quadruple-precision floating-point
3532 value.  The operation is performed according to the IEC/IEEE Standard for
3533 Binary Floating-Point Arithmetic.
3534 -------------------------------------------------------------------------------
3535 */
floatx80_round_to_int(floatx80 a)3536 floatx80 floatx80_round_to_int( floatx80 a )
3537 {
3538     flag aSign;
3539     int32 aExp;
3540     bits64 lastBitMask, roundBitsMask;
3541     int8 roundingMode;
3542     floatx80 z;
3543 
3544     aExp = extractFloatx80Exp( a );
3545     if ( 0x403E <= aExp ) {
3546         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3547             return propagateFloatx80NaN( a, a );
3548         }
3549         return a;
3550     }
3551     if ( aExp < 0x3FFF ) {
3552         if (    ( aExp == 0 )
3553              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3554             return a;
3555         }
3556         float_exception_flags |= float_flag_inexact;
3557         aSign = extractFloatx80Sign( a );
3558         switch ( float_rounding_mode ) {
3559          case float_round_nearest_even:
3560             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3561                ) {
3562                 return
3563                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3564             }
3565             break;
3566 	 case float_round_to_zero:
3567 	    break;
3568          case float_round_down:
3569             return
3570                   aSign ?
3571                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3572                 : packFloatx80( 0, 0, 0 );
3573          case float_round_up:
3574             return
3575                   aSign ? packFloatx80( 1, 0, 0 )
3576                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3577         }
3578         return packFloatx80( aSign, 0, 0 );
3579     }
3580     lastBitMask = 1;
3581     lastBitMask <<= 0x403E - aExp;
3582     roundBitsMask = lastBitMask - 1;
3583     z = a;
3584     roundingMode = float_rounding_mode;
3585     if ( roundingMode == float_round_nearest_even ) {
3586         z.low += lastBitMask>>1;
3587         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3588     }
3589     else if ( roundingMode != float_round_to_zero ) {
3590         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3591             z.low += roundBitsMask;
3592         }
3593     }
3594     z.low &= ~ roundBitsMask;
3595     if ( z.low == 0 ) {
3596         ++z.high;
3597         z.low = LIT64( 0x8000000000000000 );
3598     }
3599     if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
3600     return z;
3601 
3602 }
3603 
3604 /*
3605 -------------------------------------------------------------------------------
3606 Returns the result of adding the absolute values of the extended double-
3607 precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3608 negated before being returned.  `zSign' is ignored if the result is a NaN.
3609 The addition is performed according to the IEC/IEEE Standard for Binary
3610 Floating-Point Arithmetic.
3611 -------------------------------------------------------------------------------
3612 */
addFloatx80Sigs(floatx80 a,floatx80 b,flag zSign)3613 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3614 {
3615     int32 aExp, bExp, zExp;
3616     bits64 aSig, bSig, zSig0, zSig1;
3617     int32 expDiff;
3618 
3619     aSig = extractFloatx80Frac( a );
3620     aExp = extractFloatx80Exp( a );
3621     bSig = extractFloatx80Frac( b );
3622     bExp = extractFloatx80Exp( b );
3623     expDiff = aExp - bExp;
3624     if ( 0 < expDiff ) {
3625         if ( aExp == 0x7FFF ) {
3626             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3627             return a;
3628         }
3629         if ( bExp == 0 ) --expDiff;
3630         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3631         zExp = aExp;
3632     }
3633     else if ( expDiff < 0 ) {
3634         if ( bExp == 0x7FFF ) {
3635             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3636             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3637         }
3638         if ( aExp == 0 ) ++expDiff;
3639         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3640         zExp = bExp;
3641     }
3642     else {
3643         if ( aExp == 0x7FFF ) {
3644             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3645                 return propagateFloatx80NaN( a, b );
3646             }
3647             return a;
3648         }
3649         zSig1 = 0;
3650         zSig0 = aSig + bSig;
3651         if ( aExp == 0 ) {
3652             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3653             goto roundAndPack;
3654         }
3655         zExp = aExp;
3656         goto shiftRight1;
3657     }
3658     zSig0 = aSig + bSig;
3659     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3660  shiftRight1:
3661     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3662     zSig0 |= LIT64( 0x8000000000000000 );
3663     ++zExp;
3664  roundAndPack:
3665     return
3666         roundAndPackFloatx80(
3667             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3668 
3669 }
3670 
3671 /*
3672 -------------------------------------------------------------------------------
3673 Returns the result of subtracting the absolute values of the extended
3674 double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3675 difference is negated before being returned.  `zSign' is ignored if the
3676 result is a NaN.  The subtraction is performed according to the IEC/IEEE
3677 Standard for Binary Floating-Point Arithmetic.
3678 -------------------------------------------------------------------------------
3679 */
subFloatx80Sigs(floatx80 a,floatx80 b,flag zSign)3680 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3681 {
3682     int32 aExp, bExp, zExp;
3683     bits64 aSig, bSig, zSig0, zSig1;
3684     int32 expDiff;
3685     floatx80 z;
3686 
3687     aSig = extractFloatx80Frac( a );
3688     aExp = extractFloatx80Exp( a );
3689     bSig = extractFloatx80Frac( b );
3690     bExp = extractFloatx80Exp( b );
3691     expDiff = aExp - bExp;
3692     if ( 0 < expDiff ) goto aExpBigger;
3693     if ( expDiff < 0 ) goto bExpBigger;
3694     if ( aExp == 0x7FFF ) {
3695         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3696             return propagateFloatx80NaN( a, b );
3697         }
3698         float_raise( float_flag_invalid );
3699         z.low = floatx80_default_nan_low;
3700         z.high = floatx80_default_nan_high;
3701         return z;
3702     }
3703     if ( aExp == 0 ) {
3704         aExp = 1;
3705         bExp = 1;
3706     }
3707     zSig1 = 0;
3708     if ( bSig < aSig ) goto aBigger;
3709     if ( aSig < bSig ) goto bBigger;
3710     return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
3711  bExpBigger:
3712     if ( bExp == 0x7FFF ) {
3713         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3714         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3715     }
3716     if ( aExp == 0 ) ++expDiff;
3717     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3718  bBigger:
3719     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3720     zExp = bExp;
3721     zSign ^= 1;
3722     goto normalizeRoundAndPack;
3723  aExpBigger:
3724     if ( aExp == 0x7FFF ) {
3725         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3726         return a;
3727     }
3728     if ( bExp == 0 ) --expDiff;
3729     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3730  aBigger:
3731     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3732     zExp = aExp;
3733  normalizeRoundAndPack:
3734     return
3735         normalizeRoundAndPackFloatx80(
3736             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3737 
3738 }
3739 
3740 /*
3741 -------------------------------------------------------------------------------
3742 Returns the result of adding the extended double-precision floating-point
3743 values `a' and `b'.  The operation is performed according to the IEC/IEEE
3744 Standard for Binary Floating-Point Arithmetic.
3745 -------------------------------------------------------------------------------
3746 */
floatx80_add(floatx80 a,floatx80 b)3747 floatx80 floatx80_add( floatx80 a, floatx80 b )
3748 {
3749     flag aSign, bSign;
3750 
3751     aSign = extractFloatx80Sign( a );
3752     bSign = extractFloatx80Sign( b );
3753     if ( aSign == bSign ) {
3754         return addFloatx80Sigs( a, b, aSign );
3755     }
3756     else {
3757         return subFloatx80Sigs( a, b, aSign );
3758     }
3759 
3760 }
3761 
3762 /*
3763 -------------------------------------------------------------------------------
3764 Returns the result of subtracting the extended double-precision floating-
3765 point values `a' and `b'.  The operation is performed according to the
3766 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3767 -------------------------------------------------------------------------------
3768 */
floatx80_sub(floatx80 a,floatx80 b)3769 floatx80 floatx80_sub( floatx80 a, floatx80 b )
3770 {
3771     flag aSign, bSign;
3772 
3773     aSign = extractFloatx80Sign( a );
3774     bSign = extractFloatx80Sign( b );
3775     if ( aSign == bSign ) {
3776         return subFloatx80Sigs( a, b, aSign );
3777     }
3778     else {
3779         return addFloatx80Sigs( a, b, aSign );
3780     }
3781 
3782 }
3783 
3784 /*
3785 -------------------------------------------------------------------------------
3786 Returns the result of multiplying the extended double-precision floating-
3787 point values `a' and `b'.  The operation is performed according to the
3788 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3789 -------------------------------------------------------------------------------
3790 */
floatx80_mul(floatx80 a,floatx80 b)3791 floatx80 floatx80_mul( floatx80 a, floatx80 b )
3792 {
3793     flag aSign, bSign, zSign;
3794     int32 aExp, bExp, zExp;
3795     bits64 aSig, bSig, zSig0, zSig1;
3796     floatx80 z;
3797 
3798     aSig = extractFloatx80Frac( a );
3799     aExp = extractFloatx80Exp( a );
3800     aSign = extractFloatx80Sign( a );
3801     bSig = extractFloatx80Frac( b );
3802     bExp = extractFloatx80Exp( b );
3803     bSign = extractFloatx80Sign( b );
3804     zSign = aSign ^ bSign;
3805     if ( aExp == 0x7FFF ) {
3806         if (    (bits64) ( aSig<<1 )
3807              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3808             return propagateFloatx80NaN( a, b );
3809         }
3810         if ( ( bExp | bSig ) == 0 ) goto invalid;
3811         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3812     }
3813     if ( bExp == 0x7FFF ) {
3814         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3815         if ( ( aExp | aSig ) == 0 ) {
3816  invalid:
3817             float_raise( float_flag_invalid );
3818             z.low = floatx80_default_nan_low;
3819             z.high = floatx80_default_nan_high;
3820             return z;
3821         }
3822         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3823     }
3824     if ( aExp == 0 ) {
3825         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3826         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3827     }
3828     if ( bExp == 0 ) {
3829         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3830         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3831     }
3832     zExp = aExp + bExp - 0x3FFE;
3833     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3834     if ( 0 < (sbits64) zSig0 ) {
3835         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3836         --zExp;
3837     }
3838     return
3839         roundAndPackFloatx80(
3840             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3841 
3842 }
3843 
3844 /*
3845 -------------------------------------------------------------------------------
3846 Returns the result of dividing the extended double-precision floating-point
3847 value `a' by the corresponding value `b'.  The operation is performed
3848 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3849 -------------------------------------------------------------------------------
3850 */
floatx80_div(floatx80 a,floatx80 b)3851 floatx80 floatx80_div( floatx80 a, floatx80 b )
3852 {
3853     flag aSign, bSign, zSign;
3854     int32 aExp, bExp, zExp;
3855     bits64 aSig, bSig, zSig0, zSig1;
3856     bits64 rem0, rem1, rem2, term0, term1, term2;
3857     floatx80 z;
3858 
3859     aSig = extractFloatx80Frac( a );
3860     aExp = extractFloatx80Exp( a );
3861     aSign = extractFloatx80Sign( a );
3862     bSig = extractFloatx80Frac( b );
3863     bExp = extractFloatx80Exp( b );
3864     bSign = extractFloatx80Sign( b );
3865     zSign = aSign ^ bSign;
3866     if ( aExp == 0x7FFF ) {
3867         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3868         if ( bExp == 0x7FFF ) {
3869             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3870             goto invalid;
3871         }
3872         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3873     }
3874     if ( bExp == 0x7FFF ) {
3875         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3876         return packFloatx80( zSign, 0, 0 );
3877     }
3878     if ( bExp == 0 ) {
3879         if ( bSig == 0 ) {
3880             if ( ( aExp | aSig ) == 0 ) {
3881  invalid:
3882                 float_raise( float_flag_invalid );
3883                 z.low = floatx80_default_nan_low;
3884                 z.high = floatx80_default_nan_high;
3885                 return z;
3886             }
3887             float_raise( float_flag_divbyzero );
3888             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3889         }
3890         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3891     }
3892     if ( aExp == 0 ) {
3893         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3894         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3895     }
3896     zExp = aExp - bExp + 0x3FFE;
3897     rem1 = 0;
3898     if ( bSig <= aSig ) {
3899         shift128Right( aSig, 0, 1, &aSig, &rem1 );
3900         ++zExp;
3901     }
3902     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3903     mul64To128( bSig, zSig0, &term0, &term1 );
3904     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3905     while ( (sbits64) rem0 < 0 ) {
3906         --zSig0;
3907         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3908     }
3909     zSig1 = estimateDiv128To64( rem1, 0, bSig );
3910     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3911         mul64To128( bSig, zSig1, &term1, &term2 );
3912         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3913         while ( (sbits64) rem1 < 0 ) {
3914             --zSig1;
3915             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3916         }
3917         zSig1 |= ( ( rem1 | rem2 ) != 0 );
3918     }
3919     return
3920         roundAndPackFloatx80(
3921             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3922 
3923 }
3924 
3925 /*
3926 -------------------------------------------------------------------------------
3927 Returns the remainder of the extended double-precision floating-point value
3928 `a' with respect to the corresponding value `b'.  The operation is performed
3929 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3930 -------------------------------------------------------------------------------
3931 */
floatx80_rem(floatx80 a,floatx80 b)3932 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3933 {
3934     flag aSign, bSign, zSign;
3935     int32 aExp, bExp, expDiff;
3936     bits64 aSig0, aSig1, bSig;
3937     bits64 q, term0, term1, alternateASig0, alternateASig1;
3938     floatx80 z;
3939 
3940     aSig0 = extractFloatx80Frac( a );
3941     aExp = extractFloatx80Exp( a );
3942     aSign = extractFloatx80Sign( a );
3943     bSig = extractFloatx80Frac( b );
3944     bExp = extractFloatx80Exp( b );
3945     bSign = extractFloatx80Sign( b );
3946     if ( aExp == 0x7FFF ) {
3947         if (    (bits64) ( aSig0<<1 )
3948              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3949             return propagateFloatx80NaN( a, b );
3950         }
3951         goto invalid;
3952     }
3953     if ( bExp == 0x7FFF ) {
3954         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3955         return a;
3956     }
3957     if ( bExp == 0 ) {
3958         if ( bSig == 0 ) {
3959  invalid:
3960             float_raise( float_flag_invalid );
3961             z.low = floatx80_default_nan_low;
3962             z.high = floatx80_default_nan_high;
3963             return z;
3964         }
3965         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3966     }
3967     if ( aExp == 0 ) {
3968         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3969         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3970     }
3971     bSig |= LIT64( 0x8000000000000000 );
3972     zSign = aSign;
3973     expDiff = aExp - bExp;
3974     aSig1 = 0;
3975     if ( expDiff < 0 ) {
3976         if ( expDiff < -1 ) return a;
3977         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3978         expDiff = 0;
3979     }
3980     q = ( bSig <= aSig0 );
3981     if ( q ) aSig0 -= bSig;
3982     expDiff -= 64;
3983     while ( 0 < expDiff ) {
3984         q = estimateDiv128To64( aSig0, aSig1, bSig );
3985         q = ( 2 < q ) ? q - 2 : 0;
3986         mul64To128( bSig, q, &term0, &term1 );
3987         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3988         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3989         expDiff -= 62;
3990     }
3991     expDiff += 64;
3992     if ( 0 < expDiff ) {
3993         q = estimateDiv128To64( aSig0, aSig1, bSig );
3994         q = ( 2 < q ) ? q - 2 : 0;
3995         q >>= 64 - expDiff;
3996         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3997         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3998         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3999         while ( le128( term0, term1, aSig0, aSig1 ) ) {
4000             ++q;
4001             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4002         }
4003     }
4004     else {
4005         term1 = 0;
4006         term0 = bSig;
4007     }
4008     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4009     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4010          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4011               && ( q & 1 ) )
4012        ) {
4013         aSig0 = alternateASig0;
4014         aSig1 = alternateASig1;
4015         zSign = ! zSign;
4016     }
4017     return
4018         normalizeRoundAndPackFloatx80(
4019             80, zSign, bExp + expDiff, aSig0, aSig1 );
4020 
4021 }
4022 
4023 /*
4024 -------------------------------------------------------------------------------
4025 Returns the square root of the extended double-precision floating-point
4026 value `a'.  The operation is performed according to the IEC/IEEE Standard
4027 for Binary Floating-Point Arithmetic.
4028 -------------------------------------------------------------------------------
4029 */
floatx80_sqrt(floatx80 a)4030 floatx80 floatx80_sqrt( floatx80 a )
4031 {
4032     flag aSign;
4033     int32 aExp, zExp;
4034     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4035     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4036     floatx80 z;
4037 
4038     aSig0 = extractFloatx80Frac( a );
4039     aExp = extractFloatx80Exp( a );
4040     aSign = extractFloatx80Sign( a );
4041     if ( aExp == 0x7FFF ) {
4042         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
4043         if ( ! aSign ) return a;
4044         goto invalid;
4045     }
4046     if ( aSign ) {
4047         if ( ( aExp | aSig0 ) == 0 ) return a;
4048  invalid:
4049         float_raise( float_flag_invalid );
4050         z.low = floatx80_default_nan_low;
4051         z.high = floatx80_default_nan_high;
4052         return z;
4053     }
4054     if ( aExp == 0 ) {
4055         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4056         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4057     }
4058     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4059     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4060     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4061     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4062     doubleZSig0 = zSig0<<1;
4063     mul64To128( zSig0, zSig0, &term0, &term1 );
4064     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4065     while ( (sbits64) rem0 < 0 ) {
4066         --zSig0;
4067         doubleZSig0 -= 2;
4068         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4069     }
4070     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4071     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4072         if ( zSig1 == 0 ) zSig1 = 1;
4073         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4074         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4075         mul64To128( zSig1, zSig1, &term2, &term3 );
4076         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4077         while ( (sbits64) rem1 < 0 ) {
4078             --zSig1;
4079             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4080             term3 |= 1;
4081             term2 |= doubleZSig0;
4082             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4083         }
4084         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4085     }
4086     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4087     zSig0 |= doubleZSig0;
4088     return
4089         roundAndPackFloatx80(
4090             floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
4091 
4092 }
4093 
4094 /*
4095 -------------------------------------------------------------------------------
4096 Returns 1 if the extended double-precision floating-point value `a' is
4097 equal to the corresponding value `b', and 0 otherwise.  The comparison is
4098 performed according to the IEC/IEEE Standard for Binary Floating-Point
4099 Arithmetic.
4100 -------------------------------------------------------------------------------
4101 */
floatx80_eq(floatx80 a,floatx80 b)4102 flag floatx80_eq( floatx80 a, floatx80 b )
4103 {
4104 
4105     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4106               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4107          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4108               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4109        ) {
4110         if (    floatx80_is_signaling_nan( a )
4111              || floatx80_is_signaling_nan( b ) ) {
4112             float_raise( float_flag_invalid );
4113         }
4114         return 0;
4115     }
4116     return
4117            ( a.low == b.low )
4118         && (    ( a.high == b.high )
4119              || (    ( a.low == 0 )
4120                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4121            );
4122 
4123 }
4124 
4125 /*
4126 -------------------------------------------------------------------------------
4127 Returns 1 if the extended double-precision floating-point value `a' is
4128 less than or equal to the corresponding value `b', and 0 otherwise.  The
4129 comparison is performed according to the IEC/IEEE Standard for Binary
4130 Floating-Point Arithmetic.
4131 -------------------------------------------------------------------------------
4132 */
floatx80_le(floatx80 a,floatx80 b)4133 flag floatx80_le( floatx80 a, floatx80 b )
4134 {
4135     flag aSign, bSign;
4136 
4137     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4138               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4139          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4140               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4141        ) {
4142         float_raise( float_flag_invalid );
4143         return 0;
4144     }
4145     aSign = extractFloatx80Sign( a );
4146     bSign = extractFloatx80Sign( b );
4147     if ( aSign != bSign ) {
4148         return
4149                aSign
4150             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4151                  == 0 );
4152     }
4153     return
4154           aSign ? le128( b.high, b.low, a.high, a.low )
4155         : le128( a.high, a.low, b.high, b.low );
4156 
4157 }
4158 
4159 /*
4160 -------------------------------------------------------------------------------
4161 Returns 1 if the extended double-precision floating-point value `a' is
4162 less than the corresponding value `b', and 0 otherwise.  The comparison
4163 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4164 Arithmetic.
4165 -------------------------------------------------------------------------------
4166 */
floatx80_lt(floatx80 a,floatx80 b)4167 flag floatx80_lt( floatx80 a, floatx80 b )
4168 {
4169     flag aSign, bSign;
4170 
4171     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4172               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4173          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4174               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4175        ) {
4176         float_raise( float_flag_invalid );
4177         return 0;
4178     }
4179     aSign = extractFloatx80Sign( a );
4180     bSign = extractFloatx80Sign( b );
4181     if ( aSign != bSign ) {
4182         return
4183                aSign
4184             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4185                  != 0 );
4186     }
4187     return
4188           aSign ? lt128( b.high, b.low, a.high, a.low )
4189         : lt128( a.high, a.low, b.high, b.low );
4190 
4191 }
4192 
4193 /*
4194 -------------------------------------------------------------------------------
4195 Returns 1 if the extended double-precision floating-point value `a' is equal
4196 to the corresponding value `b', and 0 otherwise.  The invalid exception is
4197 raised if either operand is a NaN.  Otherwise, the comparison is performed
4198 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4199 -------------------------------------------------------------------------------
4200 */
floatx80_eq_signaling(floatx80 a,floatx80 b)4201 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
4202 {
4203 
4204     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4205               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4206          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4207               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4208        ) {
4209         float_raise( float_flag_invalid );
4210         return 0;
4211     }
4212     return
4213            ( a.low == b.low )
4214         && (    ( a.high == b.high )
4215              || (    ( a.low == 0 )
4216                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4217            );
4218 
4219 }
4220 
4221 /*
4222 -------------------------------------------------------------------------------
4223 Returns 1 if the extended double-precision floating-point value `a' is less
4224 than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4225 do not cause an exception.  Otherwise, the comparison is performed according
4226 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4227 -------------------------------------------------------------------------------
4228 */
floatx80_le_quiet(floatx80 a,floatx80 b)4229 flag floatx80_le_quiet( floatx80 a, floatx80 b )
4230 {
4231     flag aSign, bSign;
4232 
4233     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4234               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4235          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4236               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4237        ) {
4238         if (    floatx80_is_signaling_nan( a )
4239              || floatx80_is_signaling_nan( b ) ) {
4240             float_raise( float_flag_invalid );
4241         }
4242         return 0;
4243     }
4244     aSign = extractFloatx80Sign( a );
4245     bSign = extractFloatx80Sign( b );
4246     if ( aSign != bSign ) {
4247         return
4248                aSign
4249             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4250                  == 0 );
4251     }
4252     return
4253           aSign ? le128( b.high, b.low, a.high, a.low )
4254         : le128( a.high, a.low, b.high, b.low );
4255 
4256 }
4257 
4258 /*
4259 -------------------------------------------------------------------------------
4260 Returns 1 if the extended double-precision floating-point value `a' is less
4261 than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4262 an exception.  Otherwise, the comparison is performed according to the
4263 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4264 -------------------------------------------------------------------------------
4265 */
floatx80_lt_quiet(floatx80 a,floatx80 b)4266 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4267 {
4268     flag aSign, bSign;
4269 
4270     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4271               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4272          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4273               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4274        ) {
4275         if (    floatx80_is_signaling_nan( a )
4276              || floatx80_is_signaling_nan( b ) ) {
4277             float_raise( float_flag_invalid );
4278         }
4279         return 0;
4280     }
4281     aSign = extractFloatx80Sign( a );
4282     bSign = extractFloatx80Sign( b );
4283     if ( aSign != bSign ) {
4284         return
4285                aSign
4286             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4287                  != 0 );
4288     }
4289     return
4290           aSign ? lt128( b.high, b.low, a.high, a.low )
4291         : lt128( a.high, a.low, b.high, b.low );
4292 
4293 }
4294 
4295 #endif
4296 
4297 #ifdef FLOAT128
4298 
4299 /*
4300 -------------------------------------------------------------------------------
4301 Returns the result of converting the quadruple-precision floating-point
4302 value `a' to the 32-bit two's complement integer format.  The conversion
4303 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4304 Arithmetic---which means in particular that the conversion is rounded
4305 according to the current rounding mode.  If `a' is a NaN, the largest
4306 positive integer is returned.  Otherwise, if the conversion overflows, the
4307 largest integer with the same sign as `a' is returned.
4308 -------------------------------------------------------------------------------
4309 */
float128_to_int32(float128 a)4310 int32 float128_to_int32( float128 a )
4311 {
4312     flag aSign;
4313     int32 aExp, shiftCount;
4314     bits64 aSig0, aSig1;
4315 
4316     aSig1 = extractFloat128Frac1( a );
4317     aSig0 = extractFloat128Frac0( a );
4318     aExp = extractFloat128Exp( a );
4319     aSign = extractFloat128Sign( a );
4320     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4321     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4322     aSig0 |= ( aSig1 != 0 );
4323     shiftCount = 0x4028 - aExp;
4324     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4325     return roundAndPackInt32( aSign, aSig0 );
4326 
4327 }
4328 
4329 /*
4330 -------------------------------------------------------------------------------
4331 Returns the result of converting the quadruple-precision floating-point
4332 value `a' to the 32-bit two's complement integer format.  The conversion
4333 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4334 Arithmetic, except that the conversion is always rounded toward zero.  If
4335 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4336 conversion overflows, the largest integer with the same sign as `a' is
4337 returned.
4338 -------------------------------------------------------------------------------
4339 */
float128_to_int32_round_to_zero(float128 a)4340 int32 float128_to_int32_round_to_zero( float128 a )
4341 {
4342     flag aSign;
4343     int32 aExp, shiftCount;
4344     bits64 aSig0, aSig1, savedASig;
4345     int32 z;
4346 
4347     aSig1 = extractFloat128Frac1( a );
4348     aSig0 = extractFloat128Frac0( a );
4349     aExp = extractFloat128Exp( a );
4350     aSign = extractFloat128Sign( a );
4351     aSig0 |= ( aSig1 != 0 );
4352     if ( 0x401E < aExp ) {
4353         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4354         goto invalid;
4355     }
4356     else if ( aExp < 0x3FFF ) {
4357         if ( aExp || aSig0 ) float_exception_flags |= float_flag_inexact;
4358         return 0;
4359     }
4360     aSig0 |= LIT64( 0x0001000000000000 );
4361     shiftCount = 0x402F - aExp;
4362     savedASig = aSig0;
4363     aSig0 >>= shiftCount;
4364     z = aSig0;
4365     if ( aSign ) z = - z;
4366     if ( ( z < 0 ) ^ aSign ) {
4367  invalid:
4368         float_raise( float_flag_invalid );
4369         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4370     }
4371     if ( ( aSig0<<shiftCount ) != savedASig ) {
4372         float_exception_flags |= float_flag_inexact;
4373     }
4374     return z;
4375 
4376 }
4377 
4378 /*
4379 -------------------------------------------------------------------------------
4380 Returns the result of converting the quadruple-precision floating-point
4381 value `a' to the 64-bit two's complement integer format.  The conversion
4382 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4383 Arithmetic---which means in particular that the conversion is rounded
4384 according to the current rounding mode.  If `a' is a NaN, the largest
4385 positive integer is returned.  Otherwise, if the conversion overflows, the
4386 largest integer with the same sign as `a' is returned.
4387 -------------------------------------------------------------------------------
4388 */
float128_to_int64(float128 a)4389 int64 float128_to_int64( float128 a )
4390 {
4391     flag aSign;
4392     int32 aExp, shiftCount;
4393     bits64 aSig0, aSig1;
4394 
4395     aSig1 = extractFloat128Frac1( a );
4396     aSig0 = extractFloat128Frac0( a );
4397     aExp = extractFloat128Exp( a );
4398     aSign = extractFloat128Sign( a );
4399     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4400     shiftCount = 0x402F - aExp;
4401     if ( shiftCount <= 0 ) {
4402         if ( 0x403E < aExp ) {
4403             float_raise( float_flag_invalid );
4404             if (    ! aSign
4405                  || (    ( aExp == 0x7FFF )
4406                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4407                     )
4408                ) {
4409                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4410             }
4411             return (sbits64) LIT64( 0x8000000000000000 );
4412         }
4413         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4414     }
4415     else {
4416         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4417     }
4418     return roundAndPackInt64( aSign, aSig0, aSig1 );
4419 
4420 }
4421 
4422 /*
4423 -------------------------------------------------------------------------------
4424 Returns the result of converting the quadruple-precision floating-point
4425 value `a' to the 64-bit two's complement integer format.  The conversion
4426 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4427 Arithmetic, except that the conversion is always rounded toward zero.
4428 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4429 the conversion overflows, the largest integer with the same sign as `a' is
4430 returned.
4431 -------------------------------------------------------------------------------
4432 */
float128_to_int64_round_to_zero(float128 a)4433 int64 float128_to_int64_round_to_zero( float128 a )
4434 {
4435     flag aSign;
4436     int32 aExp, shiftCount;
4437     bits64 aSig0, aSig1;
4438     int64 z;
4439 
4440     aSig1 = extractFloat128Frac1( a );
4441     aSig0 = extractFloat128Frac0( a );
4442     aExp = extractFloat128Exp( a );
4443     aSign = extractFloat128Sign( a );
4444     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4445     shiftCount = aExp - 0x402F;
4446     if ( 0 < shiftCount ) {
4447         if ( 0x403E <= aExp ) {
4448             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4449             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4450                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4451                 if ( aSig1 ) float_exception_flags |= float_flag_inexact;
4452             }
4453             else {
4454                 float_raise( float_flag_invalid );
4455                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4456                     return LIT64( 0x7FFFFFFFFFFFFFFF );
4457                 }
4458             }
4459             return (sbits64) LIT64( 0x8000000000000000 );
4460         }
4461         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4462         if ( (bits64) ( aSig1<<shiftCount ) ) {
4463             float_exception_flags |= float_flag_inexact;
4464         }
4465     }
4466     else {
4467         if ( aExp < 0x3FFF ) {
4468             if ( aExp | aSig0 | aSig1 ) {
4469                 float_exception_flags |= float_flag_inexact;
4470             }
4471             return 0;
4472         }
4473         z = aSig0>>( - shiftCount );
4474         if (    aSig1
4475              || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4476             float_exception_flags |= float_flag_inexact;
4477         }
4478     }
4479     if ( aSign ) z = - z;
4480     return z;
4481 
4482 }
4483 
4484 #if (defined(SOFTFLOATSPARC64_FOR_GCC) || defined(SOFTFLOAT_FOR_GCC)) \
4485     && defined(SOFTFLOAT_NEED_FIXUNS)
4486 /*
4487  * just like above - but do not care for overflow of signed results
4488  */
float128_to_uint64_round_to_zero(float128 a)4489 uint64 float128_to_uint64_round_to_zero( float128 a )
4490 {
4491     flag aSign;
4492     int32 aExp, shiftCount;
4493     bits64 aSig0, aSig1;
4494     uint64 z;
4495 
4496     aSig1 = extractFloat128Frac1( a );
4497     aSig0 = extractFloat128Frac0( a );
4498     aExp = extractFloat128Exp( a );
4499     aSign = extractFloat128Sign( a );
4500     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4501     shiftCount = aExp - 0x402F;
4502     if ( 0 < shiftCount ) {
4503         if ( 0x403F <= aExp ) {
4504             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4505             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4506                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4507                 if ( aSig1 ) float_exception_flags |= float_flag_inexact;
4508             }
4509             else {
4510                 float_raise( float_flag_invalid );
4511             }
4512             return LIT64( 0xFFFFFFFFFFFFFFFF );
4513         }
4514         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4515         if ( (bits64) ( aSig1<<shiftCount ) ) {
4516             float_exception_flags |= float_flag_inexact;
4517         }
4518     }
4519     else {
4520         if ( aExp < 0x3FFF ) {
4521             if ( aExp | aSig0 | aSig1 ) {
4522                 float_exception_flags |= float_flag_inexact;
4523             }
4524             return 0;
4525         }
4526         z = aSig0>>( - shiftCount );
4527         if (aSig1 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4528             float_exception_flags |= float_flag_inexact;
4529         }
4530     }
4531     if ( aSign ) z = - z;
4532     return z;
4533 
4534 }
4535 #endif /* (SOFTFLOATSPARC64_FOR_GCC || SOFTFLOAT_FOR_GCC) && SOFTFLOAT_NEED_FIXUNS */
4536 
4537 /*
4538 -------------------------------------------------------------------------------
4539 Returns the result of converting the quadruple-precision floating-point
4540 value `a' to the single-precision floating-point format.  The conversion
4541 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4542 Arithmetic.
4543 -------------------------------------------------------------------------------
4544 */
float128_to_float32(float128 a)4545 float32 float128_to_float32( float128 a )
4546 {
4547     flag aSign;
4548     int32 aExp;
4549     bits64 aSig0, aSig1;
4550     bits32 zSig;
4551 
4552     aSig1 = extractFloat128Frac1( a );
4553     aSig0 = extractFloat128Frac0( a );
4554     aExp = extractFloat128Exp( a );
4555     aSign = extractFloat128Sign( a );
4556     if ( aExp == 0x7FFF ) {
4557         if ( aSig0 | aSig1 ) {
4558             return commonNaNToFloat32( float128ToCommonNaN( a ) );
4559         }
4560         return packFloat32( aSign, 0xFF, 0 );
4561     }
4562     aSig0 |= ( aSig1 != 0 );
4563     shift64RightJamming( aSig0, 18, &aSig0 );
4564     zSig = aSig0;
4565     if ( aExp || zSig ) {
4566         zSig |= 0x40000000;
4567         aExp -= 0x3F81;
4568     }
4569     return roundAndPackFloat32( aSign, aExp, zSig );
4570 
4571 }
4572 
4573 /*
4574 -------------------------------------------------------------------------------
4575 Returns the result of converting the quadruple-precision floating-point
4576 value `a' to the double-precision floating-point format.  The conversion
4577 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4578 Arithmetic.
4579 -------------------------------------------------------------------------------
4580 */
float128_to_float64(float128 a)4581 float64 float128_to_float64( float128 a )
4582 {
4583     flag aSign;
4584     int32 aExp;
4585     bits64 aSig0, aSig1;
4586 
4587     aSig1 = extractFloat128Frac1( a );
4588     aSig0 = extractFloat128Frac0( a );
4589     aExp = extractFloat128Exp( a );
4590     aSign = extractFloat128Sign( a );
4591     if ( aExp == 0x7FFF ) {
4592         if ( aSig0 | aSig1 ) {
4593             return commonNaNToFloat64( float128ToCommonNaN( a ) );
4594         }
4595         return packFloat64( aSign, 0x7FF, 0 );
4596     }
4597     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4598     aSig0 |= ( aSig1 != 0 );
4599     if ( aExp || aSig0 ) {
4600         aSig0 |= LIT64( 0x4000000000000000 );
4601         aExp -= 0x3C01;
4602     }
4603     return roundAndPackFloat64( aSign, aExp, aSig0 );
4604 
4605 }
4606 
4607 #ifdef FLOATX80
4608 
4609 /*
4610 -------------------------------------------------------------------------------
4611 Returns the result of converting the quadruple-precision floating-point
4612 value `a' to the extended double-precision floating-point format.  The
4613 conversion is performed according to the IEC/IEEE Standard for Binary
4614 Floating-Point Arithmetic.
4615 -------------------------------------------------------------------------------
4616 */
float128_to_floatx80(float128 a)4617 floatx80 float128_to_floatx80( float128 a )
4618 {
4619     flag aSign;
4620     int32 aExp;
4621     bits64 aSig0, aSig1;
4622 
4623     aSig1 = extractFloat128Frac1( a );
4624     aSig0 = extractFloat128Frac0( a );
4625     aExp = extractFloat128Exp( a );
4626     aSign = extractFloat128Sign( a );
4627     if ( aExp == 0x7FFF ) {
4628         if ( aSig0 | aSig1 ) {
4629             return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4630         }
4631         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4632     }
4633     if ( aExp == 0 ) {
4634         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4635         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4636     }
4637     else {
4638         aSig0 |= LIT64( 0x0001000000000000 );
4639     }
4640     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4641     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4642 
4643 }
4644 
4645 #endif
4646 
4647 /*
4648 -------------------------------------------------------------------------------
4649 Rounds the quadruple-precision floating-point value `a' to an integer, and
4650 returns the result as a quadruple-precision floating-point value.  The
4651 operation is performed according to the IEC/IEEE Standard for Binary
4652 Floating-Point Arithmetic.
4653 -------------------------------------------------------------------------------
4654 */
float128_round_to_int(float128 a)4655 float128 float128_round_to_int( float128 a )
4656 {
4657     flag aSign;
4658     int32 aExp;
4659     bits64 lastBitMask, roundBitsMask;
4660     int8 roundingMode;
4661     float128 z;
4662 
4663     aExp = extractFloat128Exp( a );
4664     if ( 0x402F <= aExp ) {
4665         if ( 0x406F <= aExp ) {
4666             if (    ( aExp == 0x7FFF )
4667                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4668                ) {
4669                 return propagateFloat128NaN( a, a );
4670             }
4671             return a;
4672         }
4673         lastBitMask = 1;
4674         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4675         roundBitsMask = lastBitMask - 1;
4676         z = a;
4677         roundingMode = float_rounding_mode;
4678         if ( roundingMode == float_round_nearest_even ) {
4679             if ( lastBitMask ) {
4680                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4681                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4682             }
4683             else {
4684                 if ( (sbits64) z.low < 0 ) {
4685                     ++z.high;
4686                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4687                 }
4688             }
4689         }
4690         else if ( roundingMode != float_round_to_zero ) {
4691             if (   extractFloat128Sign( z )
4692                  ^ ( roundingMode == float_round_up ) ) {
4693                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4694             }
4695         }
4696         z.low &= ~ roundBitsMask;
4697     }
4698     else {
4699         if ( aExp < 0x3FFF ) {
4700             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4701             float_exception_flags |= float_flag_inexact;
4702             aSign = extractFloat128Sign( a );
4703             switch ( float_rounding_mode ) {
4704              case float_round_nearest_even:
4705                 if (    ( aExp == 0x3FFE )
4706                      && (   extractFloat128Frac0( a )
4707                           | extractFloat128Frac1( a ) )
4708                    ) {
4709                     return packFloat128( aSign, 0x3FFF, 0, 0 );
4710                 }
4711                 break;
4712 	     case float_round_to_zero:
4713 		break;
4714              case float_round_down:
4715                 return
4716                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4717                     : packFloat128( 0, 0, 0, 0 );
4718              case float_round_up:
4719                 return
4720                       aSign ? packFloat128( 1, 0, 0, 0 )
4721                     : packFloat128( 0, 0x3FFF, 0, 0 );
4722             }
4723             return packFloat128( aSign, 0, 0, 0 );
4724         }
4725         lastBitMask = 1;
4726         lastBitMask <<= 0x402F - aExp;
4727         roundBitsMask = lastBitMask - 1;
4728         z.low = 0;
4729         z.high = a.high;
4730         roundingMode = float_rounding_mode;
4731         if ( roundingMode == float_round_nearest_even ) {
4732             z.high += lastBitMask>>1;
4733             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4734                 z.high &= ~ lastBitMask;
4735             }
4736         }
4737         else if ( roundingMode != float_round_to_zero ) {
4738             if (   extractFloat128Sign( z )
4739                  ^ ( roundingMode == float_round_up ) ) {
4740                 z.high |= ( a.low != 0 );
4741                 z.high += roundBitsMask;
4742             }
4743         }
4744         z.high &= ~ roundBitsMask;
4745     }
4746     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4747         float_exception_flags |= float_flag_inexact;
4748     }
4749     return z;
4750 
4751 }
4752 
4753 /*
4754 -------------------------------------------------------------------------------
4755 Returns the result of adding the absolute values of the quadruple-precision
4756 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4757 before being returned.  `zSign' is ignored if the result is a NaN.
4758 The addition is performed according to the IEC/IEEE Standard for Binary
4759 Floating-Point Arithmetic.
4760 -------------------------------------------------------------------------------
4761 */
addFloat128Sigs(float128 a,float128 b,flag zSign)4762 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4763 {
4764     int32 aExp, bExp, zExp;
4765     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4766     int32 expDiff;
4767 
4768     aSig1 = extractFloat128Frac1( a );
4769     aSig0 = extractFloat128Frac0( a );
4770     aExp = extractFloat128Exp( a );
4771     bSig1 = extractFloat128Frac1( b );
4772     bSig0 = extractFloat128Frac0( b );
4773     bExp = extractFloat128Exp( b );
4774     expDiff = aExp - bExp;
4775     if ( 0 < expDiff ) {
4776         if ( aExp == 0x7FFF ) {
4777             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4778             return a;
4779         }
4780         if ( bExp == 0 ) {
4781             --expDiff;
4782         }
4783         else {
4784             bSig0 |= LIT64( 0x0001000000000000 );
4785         }
4786         shift128ExtraRightJamming(
4787             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4788         zExp = aExp;
4789     }
4790     else if ( expDiff < 0 ) {
4791         if ( bExp == 0x7FFF ) {
4792             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4793             return packFloat128( zSign, 0x7FFF, 0, 0 );
4794         }
4795         if ( aExp == 0 ) {
4796             ++expDiff;
4797         }
4798         else {
4799             aSig0 |= LIT64( 0x0001000000000000 );
4800         }
4801         shift128ExtraRightJamming(
4802             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4803         zExp = bExp;
4804     }
4805     else {
4806         if ( aExp == 0x7FFF ) {
4807             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4808                 return propagateFloat128NaN( a, b );
4809             }
4810             return a;
4811         }
4812         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4813         if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4814         zSig2 = 0;
4815         zSig0 |= LIT64( 0x0002000000000000 );
4816         zExp = aExp;
4817         goto shiftRight1;
4818     }
4819     aSig0 |= LIT64( 0x0001000000000000 );
4820     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4821     --zExp;
4822     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4823     ++zExp;
4824  shiftRight1:
4825     shift128ExtraRightJamming(
4826         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4827  roundAndPack:
4828     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4829 
4830 }
4831 
4832 /*
4833 -------------------------------------------------------------------------------
4834 Returns the result of subtracting the absolute values of the quadruple-
4835 precision floating-point values `a' and `b'.  If `zSign' is 1, the
4836 difference is negated before being returned.  `zSign' is ignored if the
4837 result is a NaN.  The subtraction is performed according to the IEC/IEEE
4838 Standard for Binary Floating-Point Arithmetic.
4839 -------------------------------------------------------------------------------
4840 */
subFloat128Sigs(float128 a,float128 b,flag zSign)4841 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4842 {
4843     int32 aExp, bExp, zExp;
4844     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4845     int32 expDiff;
4846     float128 z;
4847 
4848     aSig1 = extractFloat128Frac1( a );
4849     aSig0 = extractFloat128Frac0( a );
4850     aExp = extractFloat128Exp( a );
4851     bSig1 = extractFloat128Frac1( b );
4852     bSig0 = extractFloat128Frac0( b );
4853     bExp = extractFloat128Exp( b );
4854     expDiff = aExp - bExp;
4855     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4856     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4857     if ( 0 < expDiff ) goto aExpBigger;
4858     if ( expDiff < 0 ) goto bExpBigger;
4859     if ( aExp == 0x7FFF ) {
4860         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4861             return propagateFloat128NaN( a, b );
4862         }
4863         float_raise( float_flag_invalid );
4864         z.low = float128_default_nan_low;
4865         z.high = float128_default_nan_high;
4866         return z;
4867     }
4868     if ( aExp == 0 ) {
4869         aExp = 1;
4870         bExp = 1;
4871     }
4872     if ( bSig0 < aSig0 ) goto aBigger;
4873     if ( aSig0 < bSig0 ) goto bBigger;
4874     if ( bSig1 < aSig1 ) goto aBigger;
4875     if ( aSig1 < bSig1 ) goto bBigger;
4876     return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 );
4877  bExpBigger:
4878     if ( bExp == 0x7FFF ) {
4879         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4880         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4881     }
4882     if ( aExp == 0 ) {
4883         ++expDiff;
4884     }
4885     else {
4886         aSig0 |= LIT64( 0x4000000000000000 );
4887     }
4888     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4889     bSig0 |= LIT64( 0x4000000000000000 );
4890  bBigger:
4891     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4892     zExp = bExp;
4893     zSign ^= 1;
4894     goto normalizeRoundAndPack;
4895  aExpBigger:
4896     if ( aExp == 0x7FFF ) {
4897         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4898         return a;
4899     }
4900     if ( bExp == 0 ) {
4901         --expDiff;
4902     }
4903     else {
4904         bSig0 |= LIT64( 0x4000000000000000 );
4905     }
4906     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4907     aSig0 |= LIT64( 0x4000000000000000 );
4908  aBigger:
4909     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4910     zExp = aExp;
4911  normalizeRoundAndPack:
4912     --zExp;
4913     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4914 
4915 }
4916 
4917 /*
4918 -------------------------------------------------------------------------------
4919 Returns the result of adding the quadruple-precision floating-point values
4920 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4921 for Binary Floating-Point Arithmetic.
4922 -------------------------------------------------------------------------------
4923 */
float128_add(float128 a,float128 b)4924 float128 float128_add( float128 a, float128 b )
4925 {
4926     flag aSign, bSign;
4927 
4928     aSign = extractFloat128Sign( a );
4929     bSign = extractFloat128Sign( b );
4930     if ( aSign == bSign ) {
4931         return addFloat128Sigs( a, b, aSign );
4932     }
4933     else {
4934         return subFloat128Sigs( a, b, aSign );
4935     }
4936 
4937 }
4938 
4939 /*
4940 -------------------------------------------------------------------------------
4941 Returns the result of subtracting the quadruple-precision floating-point
4942 values `a' and `b'.  The operation is performed according to the IEC/IEEE
4943 Standard for Binary Floating-Point Arithmetic.
4944 -------------------------------------------------------------------------------
4945 */
float128_sub(float128 a,float128 b)4946 float128 float128_sub( float128 a, float128 b )
4947 {
4948     flag aSign, bSign;
4949 
4950     aSign = extractFloat128Sign( a );
4951     bSign = extractFloat128Sign( b );
4952     if ( aSign == bSign ) {
4953         return subFloat128Sigs( a, b, aSign );
4954     }
4955     else {
4956         return addFloat128Sigs( a, b, aSign );
4957     }
4958 
4959 }
4960 
4961 /*
4962 -------------------------------------------------------------------------------
4963 Returns the result of multiplying the quadruple-precision floating-point
4964 values `a' and `b'.  The operation is performed according to the IEC/IEEE
4965 Standard for Binary Floating-Point Arithmetic.
4966 -------------------------------------------------------------------------------
4967 */
float128_mul(float128 a,float128 b)4968 float128 float128_mul( float128 a, float128 b )
4969 {
4970     flag aSign, bSign, zSign;
4971     int32 aExp, bExp, zExp;
4972     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4973     float128 z;
4974 
4975     aSig1 = extractFloat128Frac1( a );
4976     aSig0 = extractFloat128Frac0( a );
4977     aExp = extractFloat128Exp( a );
4978     aSign = extractFloat128Sign( a );
4979     bSig1 = extractFloat128Frac1( b );
4980     bSig0 = extractFloat128Frac0( b );
4981     bExp = extractFloat128Exp( b );
4982     bSign = extractFloat128Sign( b );
4983     zSign = aSign ^ bSign;
4984     if ( aExp == 0x7FFF ) {
4985         if (    ( aSig0 | aSig1 )
4986              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4987             return propagateFloat128NaN( a, b );
4988         }
4989         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4990         return packFloat128( zSign, 0x7FFF, 0, 0 );
4991     }
4992     if ( bExp == 0x7FFF ) {
4993         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4994         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4995  invalid:
4996             float_raise( float_flag_invalid );
4997             z.low = float128_default_nan_low;
4998             z.high = float128_default_nan_high;
4999             return z;
5000         }
5001         return packFloat128( zSign, 0x7FFF, 0, 0 );
5002     }
5003     if ( aExp == 0 ) {
5004         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5005         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5006     }
5007     if ( bExp == 0 ) {
5008         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5009         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5010     }
5011     zExp = aExp + bExp - 0x4000;
5012     aSig0 |= LIT64( 0x0001000000000000 );
5013     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5014     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5015     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5016     zSig2 |= ( zSig3 != 0 );
5017     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5018         shift128ExtraRightJamming(
5019             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5020         ++zExp;
5021     }
5022     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5023 
5024 }
5025 
5026 /*
5027 -------------------------------------------------------------------------------
5028 Returns the result of dividing the quadruple-precision floating-point value
5029 `a' by the corresponding value `b'.  The operation is performed according to
5030 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5031 -------------------------------------------------------------------------------
5032 */
float128_div(float128 a,float128 b)5033 float128 float128_div( float128 a, float128 b )
5034 {
5035     flag aSign, bSign, zSign;
5036     int32 aExp, bExp, zExp;
5037     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5038     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5039     float128 z;
5040 
5041     aSig1 = extractFloat128Frac1( a );
5042     aSig0 = extractFloat128Frac0( a );
5043     aExp = extractFloat128Exp( a );
5044     aSign = extractFloat128Sign( a );
5045     bSig1 = extractFloat128Frac1( b );
5046     bSig0 = extractFloat128Frac0( b );
5047     bExp = extractFloat128Exp( b );
5048     bSign = extractFloat128Sign( b );
5049     zSign = aSign ^ bSign;
5050     if ( aExp == 0x7FFF ) {
5051         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
5052         if ( bExp == 0x7FFF ) {
5053             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5054             goto invalid;
5055         }
5056         return packFloat128( zSign, 0x7FFF, 0, 0 );
5057     }
5058     if ( bExp == 0x7FFF ) {
5059         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5060         return packFloat128( zSign, 0, 0, 0 );
5061     }
5062     if ( bExp == 0 ) {
5063         if ( ( bSig0 | bSig1 ) == 0 ) {
5064             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5065  invalid:
5066                 float_raise( float_flag_invalid );
5067                 z.low = float128_default_nan_low;
5068                 z.high = float128_default_nan_high;
5069                 return z;
5070             }
5071             float_raise( float_flag_divbyzero );
5072             return packFloat128( zSign, 0x7FFF, 0, 0 );
5073         }
5074         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5075     }
5076     if ( aExp == 0 ) {
5077         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5078         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5079     }
5080     zExp = aExp - bExp + 0x3FFD;
5081     shortShift128Left(
5082         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5083     shortShift128Left(
5084         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5085     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5086         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5087         ++zExp;
5088     }
5089     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5090     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5091     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5092     while ( (sbits64) rem0 < 0 ) {
5093         --zSig0;
5094         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5095     }
5096     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5097     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5098         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5099         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5100         while ( (sbits64) rem1 < 0 ) {
5101             --zSig1;
5102             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5103         }
5104         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5105     }
5106     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5107     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5108 
5109 }
5110 
5111 /*
5112 -------------------------------------------------------------------------------
5113 Returns the remainder of the quadruple-precision floating-point value `a'
5114 with respect to the corresponding value `b'.  The operation is performed
5115 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5116 -------------------------------------------------------------------------------
5117 */
float128_rem(float128 a,float128 b)5118 float128 float128_rem( float128 a, float128 b )
5119 {
5120     flag aSign, bSign, zSign;
5121     int32 aExp, bExp, expDiff;
5122     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5123     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5124     sbits64 sigMean0;
5125     float128 z;
5126 
5127     aSig1 = extractFloat128Frac1( a );
5128     aSig0 = extractFloat128Frac0( a );
5129     aExp = extractFloat128Exp( a );
5130     aSign = extractFloat128Sign( a );
5131     bSig1 = extractFloat128Frac1( b );
5132     bSig0 = extractFloat128Frac0( b );
5133     bExp = extractFloat128Exp( b );
5134     bSign = extractFloat128Sign( b );
5135     if ( aExp == 0x7FFF ) {
5136         if (    ( aSig0 | aSig1 )
5137              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5138             return propagateFloat128NaN( a, b );
5139         }
5140         goto invalid;
5141     }
5142     if ( bExp == 0x7FFF ) {
5143         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5144         return a;
5145     }
5146     if ( bExp == 0 ) {
5147         if ( ( bSig0 | bSig1 ) == 0 ) {
5148  invalid:
5149             float_raise( float_flag_invalid );
5150             z.low = float128_default_nan_low;
5151             z.high = float128_default_nan_high;
5152             return z;
5153         }
5154         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5155     }
5156     if ( aExp == 0 ) {
5157         if ( ( aSig0 | aSig1 ) == 0 ) return a;
5158         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5159     }
5160     expDiff = aExp - bExp;
5161     if ( expDiff < -1 ) return a;
5162     shortShift128Left(
5163         aSig0 | LIT64( 0x0001000000000000 ),
5164         aSig1,
5165         15 - ( expDiff < 0 ),
5166         &aSig0,
5167         &aSig1
5168     );
5169     shortShift128Left(
5170         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5171     q = le128( bSig0, bSig1, aSig0, aSig1 );
5172     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5173     expDiff -= 64;
5174     while ( 0 < expDiff ) {
5175         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5176         q = ( 4 < q ) ? q - 4 : 0;
5177         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5178         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5179         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5180         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5181         expDiff -= 61;
5182     }
5183     if ( -64 < expDiff ) {
5184         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5185         q = ( 4 < q ) ? q - 4 : 0;
5186         q >>= - expDiff;
5187         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5188         expDiff += 52;
5189         if ( expDiff < 0 ) {
5190             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5191         }
5192         else {
5193             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5194         }
5195         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5196         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5197     }
5198     else {
5199         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5200         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5201     }
5202     do {
5203         alternateASig0 = aSig0;
5204         alternateASig1 = aSig1;
5205         ++q;
5206         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5207     } while ( 0 <= (sbits64) aSig0 );
5208     add128(
5209         aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
5210     if (    ( sigMean0 < 0 )
5211          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5212         aSig0 = alternateASig0;
5213         aSig1 = alternateASig1;
5214     }
5215     zSign = ( (sbits64) aSig0 < 0 );
5216     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5217     return
5218         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
5219 
5220 }
5221 
5222 /*
5223 -------------------------------------------------------------------------------
5224 Returns the square root of the quadruple-precision floating-point value `a'.
5225 The operation is performed according to the IEC/IEEE Standard for Binary
5226 Floating-Point Arithmetic.
5227 -------------------------------------------------------------------------------
5228 */
float128_sqrt(float128 a)5229 float128 float128_sqrt( float128 a )
5230 {
5231     flag aSign;
5232     int32 aExp, zExp;
5233     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5234     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5235     float128 z;
5236 
5237     aSig1 = extractFloat128Frac1( a );
5238     aSig0 = extractFloat128Frac0( a );
5239     aExp = extractFloat128Exp( a );
5240     aSign = extractFloat128Sign( a );
5241     if ( aExp == 0x7FFF ) {
5242         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
5243         if ( ! aSign ) return a;
5244         goto invalid;
5245     }
5246     if ( aSign ) {
5247         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5248  invalid:
5249         float_raise( float_flag_invalid );
5250         z.low = float128_default_nan_low;
5251         z.high = float128_default_nan_high;
5252         return z;
5253     }
5254     if ( aExp == 0 ) {
5255         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5256         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5257     }
5258     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5259     aSig0 |= LIT64( 0x0001000000000000 );
5260     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5261     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5262     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5263     doubleZSig0 = zSig0<<1;
5264     mul64To128( zSig0, zSig0, &term0, &term1 );
5265     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5266     while ( (sbits64) rem0 < 0 ) {
5267         --zSig0;
5268         doubleZSig0 -= 2;
5269         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5270     }
5271     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5272     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5273         if ( zSig1 == 0 ) zSig1 = 1;
5274         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5275         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5276         mul64To128( zSig1, zSig1, &term2, &term3 );
5277         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5278         while ( (sbits64) rem1 < 0 ) {
5279             --zSig1;
5280             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5281             term3 |= 1;
5282             term2 |= doubleZSig0;
5283             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5284         }
5285         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5286     }
5287     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5288     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
5289 
5290 }
5291 
5292 /*
5293 -------------------------------------------------------------------------------
5294 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5295 the corresponding value `b', and 0 otherwise.  The comparison is performed
5296 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5297 -------------------------------------------------------------------------------
5298 */
float128_eq(float128 a,float128 b)5299 flag float128_eq( float128 a, float128 b )
5300 {
5301 
5302     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5303               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5304          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5305               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5306        ) {
5307         if (    float128_is_signaling_nan( a )
5308              || float128_is_signaling_nan( b ) ) {
5309             float_raise( float_flag_invalid );
5310         }
5311         return 0;
5312     }
5313     return
5314            ( a.low == b.low )
5315         && (    ( a.high == b.high )
5316              || (    ( a.low == 0 )
5317                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5318            );
5319 
5320 }
5321 
5322 /*
5323 -------------------------------------------------------------------------------
5324 Returns 1 if the quadruple-precision floating-point value `a' is less than
5325 or equal to the corresponding value `b', and 0 otherwise.  The comparison
5326 is performed according to the IEC/IEEE Standard for Binary Floating-Point
5327 Arithmetic.
5328 -------------------------------------------------------------------------------
5329 */
float128_le(float128 a,float128 b)5330 flag float128_le( float128 a, float128 b )
5331 {
5332     flag aSign, bSign;
5333 
5334     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5335               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5336          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5337               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5338        ) {
5339         float_raise( float_flag_invalid );
5340         return 0;
5341     }
5342     aSign = extractFloat128Sign( a );
5343     bSign = extractFloat128Sign( b );
5344     if ( aSign != bSign ) {
5345         return
5346                aSign
5347             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5348                  == 0 );
5349     }
5350     return
5351           aSign ? le128( b.high, b.low, a.high, a.low )
5352         : le128( a.high, a.low, b.high, b.low );
5353 
5354 }
5355 
5356 /*
5357 -------------------------------------------------------------------------------
5358 Returns 1 if the quadruple-precision floating-point value `a' is less than
5359 the corresponding value `b', and 0 otherwise.  The comparison is performed
5360 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5361 -------------------------------------------------------------------------------
5362 */
float128_lt(float128 a,float128 b)5363 flag float128_lt( float128 a, float128 b )
5364 {
5365     flag aSign, bSign;
5366 
5367     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5368               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5369          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5370               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5371        ) {
5372         float_raise( float_flag_invalid );
5373         return 0;
5374     }
5375     aSign = extractFloat128Sign( a );
5376     bSign = extractFloat128Sign( b );
5377     if ( aSign != bSign ) {
5378         return
5379                aSign
5380             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5381                  != 0 );
5382     }
5383     return
5384           aSign ? lt128( b.high, b.low, a.high, a.low )
5385         : lt128( a.high, a.low, b.high, b.low );
5386 
5387 }
5388 
5389 /*
5390 -------------------------------------------------------------------------------
5391 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5392 the corresponding value `b', and 0 otherwise.  The invalid exception is
5393 raised if either operand is a NaN.  Otherwise, the comparison is performed
5394 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5395 -------------------------------------------------------------------------------
5396 */
float128_eq_signaling(float128 a,float128 b)5397 flag float128_eq_signaling( float128 a, float128 b )
5398 {
5399 
5400     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5401               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5402          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5403               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5404        ) {
5405         float_raise( float_flag_invalid );
5406         return 0;
5407     }
5408     return
5409            ( a.low == b.low )
5410         && (    ( a.high == b.high )
5411              || (    ( a.low == 0 )
5412                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5413            );
5414 
5415 }
5416 
5417 /*
5418 -------------------------------------------------------------------------------
5419 Returns 1 if the quadruple-precision floating-point value `a' is less than
5420 or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5421 cause an exception.  Otherwise, the comparison is performed according to the
5422 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5423 -------------------------------------------------------------------------------
5424 */
float128_le_quiet(float128 a,float128 b)5425 flag float128_le_quiet( float128 a, float128 b )
5426 {
5427     flag aSign, bSign;
5428 
5429     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5430               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5431          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5432               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5433        ) {
5434         if (    float128_is_signaling_nan( a )
5435              || float128_is_signaling_nan( b ) ) {
5436             float_raise( float_flag_invalid );
5437         }
5438         return 0;
5439     }
5440     aSign = extractFloat128Sign( a );
5441     bSign = extractFloat128Sign( b );
5442     if ( aSign != bSign ) {
5443         return
5444                aSign
5445             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5446                  == 0 );
5447     }
5448     return
5449           aSign ? le128( b.high, b.low, a.high, a.low )
5450         : le128( a.high, a.low, b.high, b.low );
5451 
5452 }
5453 
5454 /*
5455 -------------------------------------------------------------------------------
5456 Returns 1 if the quadruple-precision floating-point value `a' is less than
5457 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5458 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5459 Standard for Binary Floating-Point Arithmetic.
5460 -------------------------------------------------------------------------------
5461 */
float128_lt_quiet(float128 a,float128 b)5462 flag float128_lt_quiet( float128 a, float128 b )
5463 {
5464     flag aSign, bSign;
5465 
5466     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5467               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5468          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5469               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5470        ) {
5471         if (    float128_is_signaling_nan( a )
5472              || float128_is_signaling_nan( b ) ) {
5473             float_raise( float_flag_invalid );
5474         }
5475         return 0;
5476     }
5477     aSign = extractFloat128Sign( a );
5478     bSign = extractFloat128Sign( b );
5479     if ( aSign != bSign ) {
5480         return
5481                aSign
5482             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5483                  != 0 );
5484     }
5485     return
5486           aSign ? lt128( b.high, b.low, a.high, a.low )
5487         : lt128( a.high, a.low, b.high, b.low );
5488 
5489 }
5490 
5491 #endif
5492 
5493 
5494 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5495 
5496 /*
5497  * These two routines are not part of the original softfloat distribution.
5498  *
5499  * They are based on the corresponding conversions to integer but return
5500  * unsigned numbers instead since these functions are required by GCC.
5501  *
5502  * Added by Mark Brinicombe <mark@NetBSD.org>	27/09/97
5503  *
5504  * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5505  */
5506 
5507 /*
5508 -------------------------------------------------------------------------------
5509 Returns the result of converting the double-precision floating-point value
5510 `a' to the 32-bit unsigned integer format.  The conversion is
5511 performed according to the IEC/IEEE Standard for Binary Floating-point
5512 Arithmetic, except that the conversion is always rounded toward zero.  If
5513 `a' is a NaN, the largest positive integer is returned.  If the conversion
5514 overflows, the largest integer positive is returned.
5515 -------------------------------------------------------------------------------
5516 */
float64_to_uint32_round_to_zero(float64 a)5517 uint32 float64_to_uint32_round_to_zero( float64 a )
5518 {
5519     flag aSign;
5520     int16 aExp, shiftCount;
5521     bits64 aSig, savedASig;
5522     uint32 z;
5523 
5524     aSig = extractFloat64Frac( a );
5525     aExp = extractFloat64Exp( a );
5526     aSign = extractFloat64Sign( a );
5527 
5528     if (aSign) {
5529         float_raise( float_flag_invalid );
5530     	return(0);
5531     }
5532 
5533     if ( 0x41E < aExp ) {
5534         float_raise( float_flag_invalid );
5535         return 0xffffffff;
5536     }
5537     else if ( aExp < 0x3FF ) {
5538         if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
5539         return 0;
5540     }
5541     aSig |= LIT64( 0x0010000000000000 );
5542     shiftCount = 0x433 - aExp;
5543     savedASig = aSig;
5544     aSig >>= shiftCount;
5545     z = aSig;
5546     if ( ( aSig<<shiftCount ) != savedASig ) {
5547         float_exception_flags |= float_flag_inexact;
5548     }
5549     return z;
5550 
5551 }
5552 
5553 /*
5554 -------------------------------------------------------------------------------
5555 Returns the result of converting the single-precision floating-point value
5556 `a' to the 32-bit unsigned integer format.  The conversion is
5557 performed according to the IEC/IEEE Standard for Binary Floating-point
5558 Arithmetic, except that the conversion is always rounded toward zero.  If
5559 `a' is a NaN, the largest positive integer is returned.  If the conversion
5560 overflows, the largest positive integer is returned.
5561 -------------------------------------------------------------------------------
5562 */
float32_to_uint32_round_to_zero(float32 a)5563 uint32 float32_to_uint32_round_to_zero( float32 a )
5564 {
5565     flag aSign;
5566     int16 aExp, shiftCount;
5567     bits32 aSig;
5568     uint32 z;
5569 
5570     aSig = extractFloat32Frac( a );
5571     aExp = extractFloat32Exp( a );
5572     aSign = extractFloat32Sign( a );
5573     shiftCount = aExp - 0x9E;
5574 
5575     if (aSign) {
5576         float_raise( float_flag_invalid );
5577     	return(0);
5578     }
5579     if ( 0 < shiftCount ) {
5580         float_raise( float_flag_invalid );
5581         return 0xFFFFFFFF;
5582     }
5583     else if ( aExp <= 0x7E ) {
5584         if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
5585         return 0;
5586     }
5587     aSig = ( aSig | 0x800000 )<<8;
5588     z = aSig>>( - shiftCount );
5589     if ( aSig<<( shiftCount & 31 ) ) {
5590         float_exception_flags |= float_flag_inexact;
5591     }
5592     return z;
5593 
5594 }
5595 
5596 #endif
5597