1 /* $NetBSD: softfloat.c,v 1.8 2011/07/10 04:52:23 matt Exp $ */
2
3 /*
4 * This version hacked for use with gcc -msoft-float by bjh21.
5 * (Mostly a case of #ifdefing out things GCC doesn't need or provides
6 * itself).
7 */
8
9 /*
10 * Things you may want to define:
11 *
12 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
13 * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them
14 * properly renamed.
15 */
16
17 /*
18 ===============================================================================
19
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43
44 ===============================================================================
45 */
46
47 #ifdef SOFTFLOAT_FOR_GCC
48 #include "softfloat-for-gcc.h"
49 #endif
50
51 #include "milieu.h"
52 #include "softfloat.h"
53
54 /*
55 * Conversions between floats as stored in memory and floats as
56 * SoftFloat uses them
57 */
58 #ifndef FLOAT64_DEMANGLE
59 #define FLOAT64_DEMANGLE(a) (a)
60 #endif
61 #ifndef FLOAT64_MANGLE
62 #define FLOAT64_MANGLE(a) (a)
63 #endif
64
65 /*
66 -------------------------------------------------------------------------------
67 Floating-point rounding mode, extended double-precision rounding precision,
68 and exception flags.
69 -------------------------------------------------------------------------------
70 */
71 int float_rounding_mode = float_round_nearest_even;
72 int float_exception_flags = 0;
73 #ifdef FLOATX80
74 int8 floatx80_rounding_precision = 80;
75 #endif
76
77 /*
78 -------------------------------------------------------------------------------
79 Primitive arithmetic functions, including multi-word arithmetic, and
80 division and square root approximations. (Can be specialized to target if
81 desired.)
82 -------------------------------------------------------------------------------
83 */
84 #include "softfloat-macros"
85
86 /*
87 -------------------------------------------------------------------------------
88 Functions and definitions to determine: (1) whether tininess for underflow
89 is detected before or after rounding by default, (2) what (if anything)
90 happens when exceptions are raised, (3) how signaling NaNs are distinguished
91 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
92 are propagated from function inputs to output. These details are target-
93 specific.
94 -------------------------------------------------------------------------------
95 */
96 #include "softfloat-specialize"
97
98 #if !defined(SOFTFLOAT_FOR_GCC) || defined(FLOATX80) || defined(FLOAT128)
99 /*
100 -------------------------------------------------------------------------------
101 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
102 and 7, and returns the properly rounded 32-bit integer corresponding to the
103 input. If `zSign' is 1, the input is negated before being converted to an
104 integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
105 is simply rounded to an integer, with the inexact exception raised if the
106 input cannot be represented exactly as an integer. However, if the fixed-
107 point input is too large, the invalid exception is raised and the largest
108 positive or negative integer is returned.
109 -------------------------------------------------------------------------------
110 */
roundAndPackInt32(flag zSign,bits64 absZ)111 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
112 {
113 int8 roundingMode;
114 flag roundNearestEven;
115 int8 roundIncrement, roundBits;
116 int32 z;
117
118 roundingMode = float_rounding_mode;
119 roundNearestEven = ( roundingMode == float_round_nearest_even );
120 roundIncrement = 0x40;
121 if ( ! roundNearestEven ) {
122 if ( roundingMode == float_round_to_zero ) {
123 roundIncrement = 0;
124 }
125 else {
126 roundIncrement = 0x7F;
127 if ( zSign ) {
128 if ( roundingMode == float_round_up ) roundIncrement = 0;
129 }
130 else {
131 if ( roundingMode == float_round_down ) roundIncrement = 0;
132 }
133 }
134 }
135 roundBits = absZ & 0x7F;
136 absZ = ( absZ + roundIncrement )>>7;
137 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
138 z = absZ;
139 if ( zSign ) z = - z;
140 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
141 float_raise( float_flag_invalid );
142 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
143 }
144 if ( roundBits ) float_exception_flags |= float_flag_inexact;
145 return z;
146
147 }
148
149 /*
150 -------------------------------------------------------------------------------
151 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
152 `absZ1', with binary point between bits 63 and 64 (between the input words),
153 and returns the properly rounded 64-bit integer corresponding to the input.
154 If `zSign' is 1, the input is negated before being converted to an integer.
155 Ordinarily, the fixed-point input is simply rounded to an integer, with
156 the inexact exception raised if the input cannot be represented exactly as
157 an integer. However, if the fixed-point input is too large, the invalid
158 exception is raised and the largest positive or negative integer is
159 returned.
160 -------------------------------------------------------------------------------
161 */
roundAndPackInt64(flag zSign,bits64 absZ0,bits64 absZ1)162 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
163 {
164 int8 roundingMode;
165 flag roundNearestEven, increment;
166 int64 z;
167
168 roundingMode = float_rounding_mode;
169 roundNearestEven = ( roundingMode == float_round_nearest_even );
170 increment = ( (sbits64) absZ1 < 0 );
171 if ( ! roundNearestEven ) {
172 if ( roundingMode == float_round_to_zero ) {
173 increment = 0;
174 }
175 else {
176 if ( zSign ) {
177 increment = ( roundingMode == float_round_down ) && absZ1;
178 }
179 else {
180 increment = ( roundingMode == float_round_up ) && absZ1;
181 }
182 }
183 }
184 if ( increment ) {
185 ++absZ0;
186 if ( absZ0 == 0 ) goto overflow;
187 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
188 }
189 z = absZ0;
190 if ( zSign ) z = - z;
191 if ( z && ( ( z < 0 ) ^ zSign ) ) {
192 overflow:
193 float_raise( float_flag_invalid );
194 return
195 zSign ? (sbits64) LIT64( 0x8000000000000000 )
196 : LIT64( 0x7FFFFFFFFFFFFFFF );
197 }
198 if ( absZ1 ) float_exception_flags |= float_flag_inexact;
199 return z;
200
201 }
202 #endif
203
204 /*
205 -------------------------------------------------------------------------------
206 Returns the fraction bits of the single-precision floating-point value `a'.
207 -------------------------------------------------------------------------------
208 */
extractFloat32Frac(float32 a)209 INLINE bits32 extractFloat32Frac( float32 a )
210 {
211
212 return a & 0x007FFFFF;
213
214 }
215
216 /*
217 -------------------------------------------------------------------------------
218 Returns the exponent bits of the single-precision floating-point value `a'.
219 -------------------------------------------------------------------------------
220 */
extractFloat32Exp(float32 a)221 INLINE int16 extractFloat32Exp( float32 a )
222 {
223
224 return ( a>>23 ) & 0xFF;
225
226 }
227
228 /*
229 -------------------------------------------------------------------------------
230 Returns the sign bit of the single-precision floating-point value `a'.
231 -------------------------------------------------------------------------------
232 */
extractFloat32Sign(float32 a)233 INLINE flag extractFloat32Sign( float32 a )
234 {
235
236 return a>>31;
237
238 }
239
240 /*
241 -------------------------------------------------------------------------------
242 Normalizes the subnormal single-precision floating-point value represented
243 by the denormalized significand `aSig'. The normalized exponent and
244 significand are stored at the locations pointed to by `zExpPtr' and
245 `zSigPtr', respectively.
246 -------------------------------------------------------------------------------
247 */
248 static void
normalizeFloat32Subnormal(bits32 aSig,int16 * zExpPtr,bits32 * zSigPtr)249 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
250 {
251 int8 shiftCount;
252
253 shiftCount = countLeadingZeros32( aSig ) - 8;
254 *zSigPtr = aSig<<shiftCount;
255 *zExpPtr = 1 - shiftCount;
256
257 }
258
259 /*
260 -------------------------------------------------------------------------------
261 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
262 single-precision floating-point value, returning the result. After being
263 shifted into the proper positions, the three fields are simply added
264 together to form the result. This means that any integer portion of `zSig'
265 will be added into the exponent. Since a properly normalized significand
266 will have an integer portion equal to 1, the `zExp' input should be 1 less
267 than the desired result exponent whenever `zSig' is a complete, normalized
268 significand.
269 -------------------------------------------------------------------------------
270 */
packFloat32(flag zSign,int16 zExp,bits32 zSig)271 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
272 {
273
274 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
275
276 }
277
278 /*
279 -------------------------------------------------------------------------------
280 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
281 and significand `zSig', and returns the proper single-precision floating-
282 point value corresponding to the abstract input. Ordinarily, the abstract
283 value is simply rounded and packed into the single-precision format, with
284 the inexact exception raised if the abstract input cannot be represented
285 exactly. However, if the abstract value is too large, the overflow and
286 inexact exceptions are raised and an infinity or maximal finite value is
287 returned. If the abstract value is too small, the input value is rounded to
288 a subnormal number, and the underflow and inexact exceptions are raised if
289 the abstract input cannot be represented exactly as a subnormal single-
290 precision floating-point number.
291 The input significand `zSig' has its binary point between bits 30
292 and 29, which is 7 bits to the left of the usual location. This shifted
293 significand must be normalized or smaller. If `zSig' is not normalized,
294 `zExp' must be 0; in that case, the result returned is a subnormal number,
295 and it must not require rounding. In the usual case that `zSig' is
296 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
297 The handling of underflow and overflow follows the IEC/IEEE Standard for
298 Binary Floating-Point Arithmetic.
299 -------------------------------------------------------------------------------
300 */
roundAndPackFloat32(flag zSign,int16 zExp,bits32 zSig)301 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
302 {
303 int8 roundingMode;
304 flag roundNearestEven;
305 int8 roundIncrement, roundBits;
306 flag isTiny;
307
308 roundingMode = float_rounding_mode;
309 roundNearestEven = ( roundingMode == float_round_nearest_even );
310 roundIncrement = 0x40;
311 if ( ! roundNearestEven ) {
312 if ( roundingMode == float_round_to_zero ) {
313 roundIncrement = 0;
314 }
315 else {
316 roundIncrement = 0x7F;
317 if ( zSign ) {
318 if ( roundingMode == float_round_up ) roundIncrement = 0;
319 }
320 else {
321 if ( roundingMode == float_round_down ) roundIncrement = 0;
322 }
323 }
324 }
325 roundBits = zSig & 0x7F;
326 if ( 0xFD <= (bits16) zExp ) {
327 if ( ( 0xFD < zExp )
328 || ( ( zExp == 0xFD )
329 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
330 ) {
331 float_raise( float_flag_overflow | float_flag_inexact );
332 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
333 }
334 if ( zExp < 0 ) {
335 isTiny =
336 ( float_detect_tininess == float_tininess_before_rounding )
337 || ( zExp < -1 )
338 || ( zSig + roundIncrement < 0x80000000 );
339 shift32RightJamming( zSig, - zExp, &zSig );
340 zExp = 0;
341 roundBits = zSig & 0x7F;
342 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
343 }
344 }
345 if ( roundBits ) float_exception_flags |= float_flag_inexact;
346 zSig = ( zSig + roundIncrement )>>7;
347 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
348 if ( zSig == 0 ) zExp = 0;
349 return packFloat32( zSign, zExp, zSig );
350
351 }
352
353 /*
354 -------------------------------------------------------------------------------
355 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
356 and significand `zSig', and returns the proper single-precision floating-
357 point value corresponding to the abstract input. This routine is just like
358 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
359 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
360 floating-point exponent.
361 -------------------------------------------------------------------------------
362 */
363 static float32
normalizeRoundAndPackFloat32(flag zSign,int16 zExp,bits32 zSig)364 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
365 {
366 int8 shiftCount;
367
368 shiftCount = countLeadingZeros32( zSig ) - 1;
369 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
370
371 }
372
373 /*
374 -------------------------------------------------------------------------------
375 Returns the fraction bits of the double-precision floating-point value `a'.
376 -------------------------------------------------------------------------------
377 */
extractFloat64Frac(float64 a)378 INLINE bits64 extractFloat64Frac( float64 a )
379 {
380
381 return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
382
383 }
384
385 /*
386 -------------------------------------------------------------------------------
387 Returns the exponent bits of the double-precision floating-point value `a'.
388 -------------------------------------------------------------------------------
389 */
extractFloat64Exp(float64 a)390 INLINE int16 extractFloat64Exp( float64 a )
391 {
392
393 return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
394
395 }
396
397 /*
398 -------------------------------------------------------------------------------
399 Returns the sign bit of the double-precision floating-point value `a'.
400 -------------------------------------------------------------------------------
401 */
extractFloat64Sign(float64 a)402 INLINE flag extractFloat64Sign( float64 a )
403 {
404
405 return FLOAT64_DEMANGLE(a)>>63;
406
407 }
408
409 /*
410 -------------------------------------------------------------------------------
411 Normalizes the subnormal double-precision floating-point value represented
412 by the denormalized significand `aSig'. The normalized exponent and
413 significand are stored at the locations pointed to by `zExpPtr' and
414 `zSigPtr', respectively.
415 -------------------------------------------------------------------------------
416 */
417 static void
normalizeFloat64Subnormal(bits64 aSig,int16 * zExpPtr,bits64 * zSigPtr)418 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
419 {
420 int8 shiftCount;
421
422 shiftCount = countLeadingZeros64( aSig ) - 11;
423 *zSigPtr = aSig<<shiftCount;
424 *zExpPtr = 1 - shiftCount;
425
426 }
427
428 /*
429 -------------------------------------------------------------------------------
430 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
431 double-precision floating-point value, returning the result. After being
432 shifted into the proper positions, the three fields are simply added
433 together to form the result. This means that any integer portion of `zSig'
434 will be added into the exponent. Since a properly normalized significand
435 will have an integer portion equal to 1, the `zExp' input should be 1 less
436 than the desired result exponent whenever `zSig' is a complete, normalized
437 significand.
438 -------------------------------------------------------------------------------
439 */
packFloat64(flag zSign,int16 zExp,bits64 zSig)440 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
441 {
442
443 return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
444 ( ( (bits64) zExp )<<52 ) + zSig );
445
446 }
447
448 /*
449 -------------------------------------------------------------------------------
450 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
451 and significand `zSig', and returns the proper double-precision floating-
452 point value corresponding to the abstract input. Ordinarily, the abstract
453 value is simply rounded and packed into the double-precision format, with
454 the inexact exception raised if the abstract input cannot be represented
455 exactly. However, if the abstract value is too large, the overflow and
456 inexact exceptions are raised and an infinity or maximal finite value is
457 returned. If the abstract value is too small, the input value is rounded to
458 a subnormal number, and the underflow and inexact exceptions are raised if
459 the abstract input cannot be represented exactly as a subnormal double-
460 precision floating-point number.
461 The input significand `zSig' has its binary point between bits 62
462 and 61, which is 10 bits to the left of the usual location. This shifted
463 significand must be normalized or smaller. If `zSig' is not normalized,
464 `zExp' must be 0; in that case, the result returned is a subnormal number,
465 and it must not require rounding. In the usual case that `zSig' is
466 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
467 The handling of underflow and overflow follows the IEC/IEEE Standard for
468 Binary Floating-Point Arithmetic.
469 -------------------------------------------------------------------------------
470 */
roundAndPackFloat64(flag zSign,int16 zExp,bits64 zSig)471 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
472 {
473 int8 roundingMode;
474 flag roundNearestEven;
475 int16 roundIncrement, roundBits;
476 flag isTiny;
477
478 roundingMode = float_rounding_mode;
479 roundNearestEven = ( roundingMode == float_round_nearest_even );
480 roundIncrement = 0x200;
481 if ( ! roundNearestEven ) {
482 if ( roundingMode == float_round_to_zero ) {
483 roundIncrement = 0;
484 }
485 else {
486 roundIncrement = 0x3FF;
487 if ( zSign ) {
488 if ( roundingMode == float_round_up ) roundIncrement = 0;
489 }
490 else {
491 if ( roundingMode == float_round_down ) roundIncrement = 0;
492 }
493 }
494 }
495 roundBits = zSig & 0x3FF;
496 if ( 0x7FD <= (bits16) zExp ) {
497 if ( ( 0x7FD < zExp )
498 || ( ( zExp == 0x7FD )
499 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
500 ) {
501 float_raise( float_flag_overflow | float_flag_inexact );
502 return FLOAT64_MANGLE(
503 FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
504 ( roundIncrement == 0 ));
505 }
506 if ( zExp < 0 ) {
507 isTiny =
508 ( float_detect_tininess == float_tininess_before_rounding )
509 || ( zExp < -1 )
510 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
511 shift64RightJamming( zSig, - zExp, &zSig );
512 zExp = 0;
513 roundBits = zSig & 0x3FF;
514 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
515 }
516 }
517 if ( roundBits ) float_exception_flags |= float_flag_inexact;
518 zSig = ( zSig + roundIncrement )>>10;
519 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
520 if ( zSig == 0 ) zExp = 0;
521 return packFloat64( zSign, zExp, zSig );
522
523 }
524
525 /*
526 -------------------------------------------------------------------------------
527 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
528 and significand `zSig', and returns the proper double-precision floating-
529 point value corresponding to the abstract input. This routine is just like
530 `roundAndPackFloat64' except that `zSig' does not have to be normalized.
531 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
532 floating-point exponent.
533 -------------------------------------------------------------------------------
534 */
535 static float64
normalizeRoundAndPackFloat64(flag zSign,int16 zExp,bits64 zSig)536 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
537 {
538 int8 shiftCount;
539
540 shiftCount = countLeadingZeros64( zSig ) - 1;
541 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
542
543 }
544
545 #ifdef FLOATX80
546
547 /*
548 -------------------------------------------------------------------------------
549 Returns the fraction bits of the extended double-precision floating-point
550 value `a'.
551 -------------------------------------------------------------------------------
552 */
extractFloatx80Frac(floatx80 a)553 INLINE bits64 extractFloatx80Frac( floatx80 a )
554 {
555
556 return a.low;
557
558 }
559
560 /*
561 -------------------------------------------------------------------------------
562 Returns the exponent bits of the extended double-precision floating-point
563 value `a'.
564 -------------------------------------------------------------------------------
565 */
extractFloatx80Exp(floatx80 a)566 INLINE int32 extractFloatx80Exp( floatx80 a )
567 {
568
569 return a.high & 0x7FFF;
570
571 }
572
573 /*
574 -------------------------------------------------------------------------------
575 Returns the sign bit of the extended double-precision floating-point value
576 `a'.
577 -------------------------------------------------------------------------------
578 */
extractFloatx80Sign(floatx80 a)579 INLINE flag extractFloatx80Sign( floatx80 a )
580 {
581
582 return a.high>>15;
583
584 }
585
586 /*
587 -------------------------------------------------------------------------------
588 Normalizes the subnormal extended double-precision floating-point value
589 represented by the denormalized significand `aSig'. The normalized exponent
590 and significand are stored at the locations pointed to by `zExpPtr' and
591 `zSigPtr', respectively.
592 -------------------------------------------------------------------------------
593 */
594 static void
normalizeFloatx80Subnormal(bits64 aSig,int32 * zExpPtr,bits64 * zSigPtr)595 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
596 {
597 int8 shiftCount;
598
599 shiftCount = countLeadingZeros64( aSig );
600 *zSigPtr = aSig<<shiftCount;
601 *zExpPtr = 1 - shiftCount;
602
603 }
604
605 /*
606 -------------------------------------------------------------------------------
607 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
608 extended double-precision floating-point value, returning the result.
609 -------------------------------------------------------------------------------
610 */
packFloatx80(flag zSign,int32 zExp,bits64 zSig)611 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
612 {
613 floatx80 z;
614
615 z.low = zSig;
616 z.high = ( ( (bits16) zSign )<<15 ) + zExp;
617 return z;
618
619 }
620
621 /*
622 -------------------------------------------------------------------------------
623 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
624 and extended significand formed by the concatenation of `zSig0' and `zSig1',
625 and returns the proper extended double-precision floating-point value
626 corresponding to the abstract input. Ordinarily, the abstract value is
627 rounded and packed into the extended double-precision format, with the
628 inexact exception raised if the abstract input cannot be represented
629 exactly. However, if the abstract value is too large, the overflow and
630 inexact exceptions are raised and an infinity or maximal finite value is
631 returned. If the abstract value is too small, the input value is rounded to
632 a subnormal number, and the underflow and inexact exceptions are raised if
633 the abstract input cannot be represented exactly as a subnormal extended
634 double-precision floating-point number.
635 If `roundingPrecision' is 32 or 64, the result is rounded to the same
636 number of bits as single or double precision, respectively. Otherwise, the
637 result is rounded to the full precision of the extended double-precision
638 format.
639 The input significand must be normalized or smaller. If the input
640 significand is not normalized, `zExp' must be 0; in that case, the result
641 returned is a subnormal number, and it must not require rounding. The
642 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
643 Floating-Point Arithmetic.
644 -------------------------------------------------------------------------------
645 */
646 static floatx80
roundAndPackFloatx80(int8 roundingPrecision,flag zSign,int32 zExp,bits64 zSig0,bits64 zSig1)647 roundAndPackFloatx80(
648 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
649 )
650 {
651 int8 roundingMode;
652 flag roundNearestEven, increment, isTiny;
653 int64 roundIncrement, roundMask, roundBits;
654
655 roundingMode = float_rounding_mode;
656 roundNearestEven = ( roundingMode == float_round_nearest_even );
657 if ( roundingPrecision == 80 ) goto precision80;
658 if ( roundingPrecision == 64 ) {
659 roundIncrement = LIT64( 0x0000000000000400 );
660 roundMask = LIT64( 0x00000000000007FF );
661 }
662 else if ( roundingPrecision == 32 ) {
663 roundIncrement = LIT64( 0x0000008000000000 );
664 roundMask = LIT64( 0x000000FFFFFFFFFF );
665 }
666 else {
667 goto precision80;
668 }
669 zSig0 |= ( zSig1 != 0 );
670 if ( ! roundNearestEven ) {
671 if ( roundingMode == float_round_to_zero ) {
672 roundIncrement = 0;
673 }
674 else {
675 roundIncrement = roundMask;
676 if ( zSign ) {
677 if ( roundingMode == float_round_up ) roundIncrement = 0;
678 }
679 else {
680 if ( roundingMode == float_round_down ) roundIncrement = 0;
681 }
682 }
683 }
684 roundBits = zSig0 & roundMask;
685 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
686 if ( ( 0x7FFE < zExp )
687 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
688 ) {
689 goto overflow;
690 }
691 if ( zExp <= 0 ) {
692 isTiny =
693 ( float_detect_tininess == float_tininess_before_rounding )
694 || ( zExp < 0 )
695 || ( zSig0 <= zSig0 + roundIncrement );
696 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
697 zExp = 0;
698 roundBits = zSig0 & roundMask;
699 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
700 if ( roundBits ) float_exception_flags |= float_flag_inexact;
701 zSig0 += roundIncrement;
702 if ( (sbits64) zSig0 < 0 ) zExp = 1;
703 roundIncrement = roundMask + 1;
704 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
705 roundMask |= roundIncrement;
706 }
707 zSig0 &= ~ roundMask;
708 return packFloatx80( zSign, zExp, zSig0 );
709 }
710 }
711 if ( roundBits ) float_exception_flags |= float_flag_inexact;
712 zSig0 += roundIncrement;
713 if ( zSig0 < roundIncrement ) {
714 ++zExp;
715 zSig0 = LIT64( 0x8000000000000000 );
716 }
717 roundIncrement = roundMask + 1;
718 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
719 roundMask |= roundIncrement;
720 }
721 zSig0 &= ~ roundMask;
722 if ( zSig0 == 0 ) zExp = 0;
723 return packFloatx80( zSign, zExp, zSig0 );
724 precision80:
725 increment = ( (sbits64) zSig1 < 0 );
726 if ( ! roundNearestEven ) {
727 if ( roundingMode == float_round_to_zero ) {
728 increment = 0;
729 }
730 else {
731 if ( zSign ) {
732 increment = ( roundingMode == float_round_down ) && zSig1;
733 }
734 else {
735 increment = ( roundingMode == float_round_up ) && zSig1;
736 }
737 }
738 }
739 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
740 if ( ( 0x7FFE < zExp )
741 || ( ( zExp == 0x7FFE )
742 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
743 && increment
744 )
745 ) {
746 roundMask = 0;
747 overflow:
748 float_raise( float_flag_overflow | float_flag_inexact );
749 if ( ( roundingMode == float_round_to_zero )
750 || ( zSign && ( roundingMode == float_round_up ) )
751 || ( ! zSign && ( roundingMode == float_round_down ) )
752 ) {
753 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
754 }
755 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
756 }
757 if ( zExp <= 0 ) {
758 isTiny =
759 ( float_detect_tininess == float_tininess_before_rounding )
760 || ( zExp < 0 )
761 || ! increment
762 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
763 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
764 zExp = 0;
765 if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
766 if ( zSig1 ) float_exception_flags |= float_flag_inexact;
767 if ( roundNearestEven ) {
768 increment = ( (sbits64) zSig1 < 0 );
769 }
770 else {
771 if ( zSign ) {
772 increment = ( roundingMode == float_round_down ) && zSig1;
773 }
774 else {
775 increment = ( roundingMode == float_round_up ) && zSig1;
776 }
777 }
778 if ( increment ) {
779 ++zSig0;
780 zSig0 &=
781 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
782 if ( (sbits64) zSig0 < 0 ) zExp = 1;
783 }
784 return packFloatx80( zSign, zExp, zSig0 );
785 }
786 }
787 if ( zSig1 ) float_exception_flags |= float_flag_inexact;
788 if ( increment ) {
789 ++zSig0;
790 if ( zSig0 == 0 ) {
791 ++zExp;
792 zSig0 = LIT64( 0x8000000000000000 );
793 }
794 else {
795 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
796 }
797 }
798 else {
799 if ( zSig0 == 0 ) zExp = 0;
800 }
801 return packFloatx80( zSign, zExp, zSig0 );
802
803 }
804
805 /*
806 -------------------------------------------------------------------------------
807 Takes an abstract floating-point value having sign `zSign', exponent
808 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
809 and returns the proper extended double-precision floating-point value
810 corresponding to the abstract input. This routine is just like
811 `roundAndPackFloatx80' except that the input significand does not have to be
812 normalized.
813 -------------------------------------------------------------------------------
814 */
815 static floatx80
normalizeRoundAndPackFloatx80(int8 roundingPrecision,flag zSign,int32 zExp,bits64 zSig0,bits64 zSig1)816 normalizeRoundAndPackFloatx80(
817 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
818 )
819 {
820 int8 shiftCount;
821
822 if ( zSig0 == 0 ) {
823 zSig0 = zSig1;
824 zSig1 = 0;
825 zExp -= 64;
826 }
827 shiftCount = countLeadingZeros64( zSig0 );
828 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
829 zExp -= shiftCount;
830 return
831 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
832
833 }
834
835 #endif
836
837 #ifdef FLOAT128
838
839 /*
840 -------------------------------------------------------------------------------
841 Returns the least-significant 64 fraction bits of the quadruple-precision
842 floating-point value `a'.
843 -------------------------------------------------------------------------------
844 */
extractFloat128Frac1(float128 a)845 INLINE bits64 extractFloat128Frac1( float128 a )
846 {
847
848 return a.low;
849
850 }
851
852 /*
853 -------------------------------------------------------------------------------
854 Returns the most-significant 48 fraction bits of the quadruple-precision
855 floating-point value `a'.
856 -------------------------------------------------------------------------------
857 */
extractFloat128Frac0(float128 a)858 INLINE bits64 extractFloat128Frac0( float128 a )
859 {
860
861 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
862
863 }
864
865 /*
866 -------------------------------------------------------------------------------
867 Returns the exponent bits of the quadruple-precision floating-point value
868 `a'.
869 -------------------------------------------------------------------------------
870 */
extractFloat128Exp(float128 a)871 INLINE int32 extractFloat128Exp( float128 a )
872 {
873
874 return ( a.high>>48 ) & 0x7FFF;
875
876 }
877
878 /*
879 -------------------------------------------------------------------------------
880 Returns the sign bit of the quadruple-precision floating-point value `a'.
881 -------------------------------------------------------------------------------
882 */
extractFloat128Sign(float128 a)883 INLINE flag extractFloat128Sign( float128 a )
884 {
885
886 return a.high>>63;
887
888 }
889
890 /*
891 -------------------------------------------------------------------------------
892 Normalizes the subnormal quadruple-precision floating-point value
893 represented by the denormalized significand formed by the concatenation of
894 `aSig0' and `aSig1'. The normalized exponent is stored at the location
895 pointed to by `zExpPtr'. The most significant 49 bits of the normalized
896 significand are stored at the location pointed to by `zSig0Ptr', and the
897 least significant 64 bits of the normalized significand are stored at the
898 location pointed to by `zSig1Ptr'.
899 -------------------------------------------------------------------------------
900 */
901 static void
normalizeFloat128Subnormal(bits64 aSig0,bits64 aSig1,int32 * zExpPtr,bits64 * zSig0Ptr,bits64 * zSig1Ptr)902 normalizeFloat128Subnormal(
903 bits64 aSig0,
904 bits64 aSig1,
905 int32 *zExpPtr,
906 bits64 *zSig0Ptr,
907 bits64 *zSig1Ptr
908 )
909 {
910 int8 shiftCount;
911
912 if ( aSig0 == 0 ) {
913 shiftCount = countLeadingZeros64( aSig1 ) - 15;
914 if ( shiftCount < 0 ) {
915 *zSig0Ptr = aSig1>>( - shiftCount );
916 *zSig1Ptr = aSig1<<( shiftCount & 63 );
917 }
918 else {
919 *zSig0Ptr = aSig1<<shiftCount;
920 *zSig1Ptr = 0;
921 }
922 *zExpPtr = - shiftCount - 63;
923 }
924 else {
925 shiftCount = countLeadingZeros64( aSig0 ) - 15;
926 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
927 *zExpPtr = 1 - shiftCount;
928 }
929
930 }
931
932 /*
933 -------------------------------------------------------------------------------
934 Packs the sign `zSign', the exponent `zExp', and the significand formed
935 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
936 floating-point value, returning the result. After being shifted into the
937 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
938 added together to form the most significant 32 bits of the result. This
939 means that any integer portion of `zSig0' will be added into the exponent.
940 Since a properly normalized significand will have an integer portion equal
941 to 1, the `zExp' input should be 1 less than the desired result exponent
942 whenever `zSig0' and `zSig1' concatenated form a complete, normalized
943 significand.
944 -------------------------------------------------------------------------------
945 */
946 INLINE float128
packFloat128(flag zSign,int32 zExp,bits64 zSig0,bits64 zSig1)947 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
948 {
949 float128 z;
950
951 z.low = zSig1;
952 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
953 return z;
954
955 }
956
957 /*
958 -------------------------------------------------------------------------------
959 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
960 and extended significand formed by the concatenation of `zSig0', `zSig1',
961 and `zSig2', and returns the proper quadruple-precision floating-point value
962 corresponding to the abstract input. Ordinarily, the abstract value is
963 simply rounded and packed into the quadruple-precision format, with the
964 inexact exception raised if the abstract input cannot be represented
965 exactly. However, if the abstract value is too large, the overflow and
966 inexact exceptions are raised and an infinity or maximal finite value is
967 returned. If the abstract value is too small, the input value is rounded to
968 a subnormal number, and the underflow and inexact exceptions are raised if
969 the abstract input cannot be represented exactly as a subnormal quadruple-
970 precision floating-point number.
971 The input significand must be normalized or smaller. If the input
972 significand is not normalized, `zExp' must be 0; in that case, the result
973 returned is a subnormal number, and it must not require rounding. In the
974 usual case that the input significand is normalized, `zExp' must be 1 less
975 than the ``true'' floating-point exponent. The handling of underflow and
976 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
977 -------------------------------------------------------------------------------
978 */
979 static float128
roundAndPackFloat128(flag zSign,int32 zExp,bits64 zSig0,bits64 zSig1,bits64 zSig2)980 roundAndPackFloat128(
981 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
982 {
983 int8 roundingMode;
984 flag roundNearestEven, increment, isTiny;
985
986 roundingMode = float_rounding_mode;
987 roundNearestEven = ( roundingMode == float_round_nearest_even );
988 increment = ( (sbits64) zSig2 < 0 );
989 if ( ! roundNearestEven ) {
990 if ( roundingMode == float_round_to_zero ) {
991 increment = 0;
992 }
993 else {
994 if ( zSign ) {
995 increment = ( roundingMode == float_round_down ) && zSig2;
996 }
997 else {
998 increment = ( roundingMode == float_round_up ) && zSig2;
999 }
1000 }
1001 }
1002 if ( 0x7FFD <= (bits32) zExp ) {
1003 if ( ( 0x7FFD < zExp )
1004 || ( ( zExp == 0x7FFD )
1005 && eq128(
1006 LIT64( 0x0001FFFFFFFFFFFF ),
1007 LIT64( 0xFFFFFFFFFFFFFFFF ),
1008 zSig0,
1009 zSig1
1010 )
1011 && increment
1012 )
1013 ) {
1014 float_raise( float_flag_overflow | float_flag_inexact );
1015 if ( ( roundingMode == float_round_to_zero )
1016 || ( zSign && ( roundingMode == float_round_up ) )
1017 || ( ! zSign && ( roundingMode == float_round_down ) )
1018 ) {
1019 return
1020 packFloat128(
1021 zSign,
1022 0x7FFE,
1023 LIT64( 0x0000FFFFFFFFFFFF ),
1024 LIT64( 0xFFFFFFFFFFFFFFFF )
1025 );
1026 }
1027 return packFloat128( zSign, 0x7FFF, 0, 0 );
1028 }
1029 if ( zExp < 0 ) {
1030 isTiny =
1031 ( float_detect_tininess == float_tininess_before_rounding )
1032 || ( zExp < -1 )
1033 || ! increment
1034 || lt128(
1035 zSig0,
1036 zSig1,
1037 LIT64( 0x0001FFFFFFFFFFFF ),
1038 LIT64( 0xFFFFFFFFFFFFFFFF )
1039 );
1040 shift128ExtraRightJamming(
1041 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1042 zExp = 0;
1043 if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
1044 if ( roundNearestEven ) {
1045 increment = ( (sbits64) zSig2 < 0 );
1046 }
1047 else {
1048 if ( zSign ) {
1049 increment = ( roundingMode == float_round_down ) && zSig2;
1050 }
1051 else {
1052 increment = ( roundingMode == float_round_up ) && zSig2;
1053 }
1054 }
1055 }
1056 }
1057 if ( zSig2 ) float_exception_flags |= float_flag_inexact;
1058 if ( increment ) {
1059 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1060 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1061 }
1062 else {
1063 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1064 }
1065 return packFloat128( zSign, zExp, zSig0, zSig1 );
1066
1067 }
1068
1069 /*
1070 -------------------------------------------------------------------------------
1071 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1072 and significand formed by the concatenation of `zSig0' and `zSig1', and
1073 returns the proper quadruple-precision floating-point value corresponding
1074 to the abstract input. This routine is just like `roundAndPackFloat128'
1075 except that the input significand has fewer bits and does not have to be
1076 normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1077 point exponent.
1078 -------------------------------------------------------------------------------
1079 */
1080 static float128
normalizeRoundAndPackFloat128(flag zSign,int32 zExp,bits64 zSig0,bits64 zSig1)1081 normalizeRoundAndPackFloat128(
1082 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1083 {
1084 int8 shiftCount;
1085 bits64 zSig2;
1086
1087 if ( zSig0 == 0 ) {
1088 zSig0 = zSig1;
1089 zSig1 = 0;
1090 zExp -= 64;
1091 }
1092 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1093 if ( 0 <= shiftCount ) {
1094 zSig2 = 0;
1095 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1096 }
1097 else {
1098 shift128ExtraRightJamming(
1099 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1100 }
1101 zExp -= shiftCount;
1102 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1103
1104 }
1105
1106 #endif
1107
1108 /*
1109 -------------------------------------------------------------------------------
1110 Returns the result of converting the 32-bit two's complement integer `a'
1111 to the single-precision floating-point format. The conversion is performed
1112 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1113 -------------------------------------------------------------------------------
1114 */
int32_to_float32(int32 a)1115 float32 int32_to_float32( int32 a )
1116 {
1117 flag zSign;
1118
1119 if ( a == 0 ) return 0;
1120 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1121 zSign = ( a < 0 );
1122 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
1123
1124 }
1125
1126 #ifndef SOFTFLOAT_FOR_GCC /* __floatunsisf is in libgcc */
uint32_to_float32(uint32 a)1127 float32 uint32_to_float32( uint32 a )
1128 {
1129 if ( a == 0 ) return 0;
1130 if ( a & (bits32) 0x80000000 )
1131 return normalizeRoundAndPackFloat32( 0, 0x9D, a >> 1 );
1132 return normalizeRoundAndPackFloat32( 0, 0x9C, a );
1133 }
1134 #endif
1135
1136
1137 /*
1138 -------------------------------------------------------------------------------
1139 Returns the result of converting the 32-bit two's complement integer `a'
1140 to the double-precision floating-point format. The conversion is performed
1141 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1142 -------------------------------------------------------------------------------
1143 */
int32_to_float64(int32 a)1144 float64 int32_to_float64( int32 a )
1145 {
1146 flag zSign;
1147 uint32 absA;
1148 int8 shiftCount;
1149 bits64 zSig;
1150
1151 if ( a == 0 ) return 0;
1152 zSign = ( a < 0 );
1153 absA = zSign ? - a : a;
1154 shiftCount = countLeadingZeros32( absA ) + 21;
1155 zSig = absA;
1156 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1157
1158 }
1159
1160 #ifndef SOFTFLOAT_FOR_GCC /* __floatunsidf is in libgcc */
uint32_to_float64(uint32 a)1161 float64 uint32_to_float64( uint32 a )
1162 {
1163 int8 shiftCount;
1164 bits64 zSig = a;
1165
1166 if ( a == 0 ) return 0;
1167 shiftCount = countLeadingZeros32( a ) + 21;
1168 return packFloat64( 0, 0x432 - shiftCount, zSig<<shiftCount );
1169
1170 }
1171 #endif
1172
1173 #ifdef FLOATX80
1174
1175 /*
1176 -------------------------------------------------------------------------------
1177 Returns the result of converting the 32-bit two's complement integer `a'
1178 to the extended double-precision floating-point format. The conversion
1179 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1180 Arithmetic.
1181 -------------------------------------------------------------------------------
1182 */
int32_to_floatx80(int32 a)1183 floatx80 int32_to_floatx80( int32 a )
1184 {
1185 flag zSign;
1186 uint32 absA;
1187 int8 shiftCount;
1188 bits64 zSig;
1189
1190 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1191 zSign = ( a < 0 );
1192 absA = zSign ? - a : a;
1193 shiftCount = countLeadingZeros32( absA ) + 32;
1194 zSig = absA;
1195 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1196
1197 }
1198
uint32_to_floatx80(uint32 a)1199 floatx80 uint32_to_floatx80( uint32 a )
1200 {
1201 int8 shiftCount;
1202 bits64 zSig = a;
1203
1204 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1205 shiftCount = countLeadingZeros32( a ) + 32;
1206 return packFloatx80( 0, 0x403E - shiftCount, zSig<<shiftCount );
1207
1208 }
1209
1210 #endif
1211
1212 #ifdef FLOAT128
1213
1214 /*
1215 -------------------------------------------------------------------------------
1216 Returns the result of converting the 32-bit two's complement integer `a' to
1217 the quadruple-precision floating-point format. The conversion is performed
1218 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1219 -------------------------------------------------------------------------------
1220 */
int32_to_float128(int32 a)1221 float128 int32_to_float128( int32 a )
1222 {
1223 flag zSign;
1224 uint32 absA;
1225 int8 shiftCount;
1226 bits64 zSig0;
1227
1228 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1229 zSign = ( a < 0 );
1230 absA = zSign ? - a : a;
1231 shiftCount = countLeadingZeros32( absA ) + 17;
1232 zSig0 = absA;
1233 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1234
1235 }
1236
uint32_to_float128(uint32 a)1237 float128 uint32_to_float128( uint32 a )
1238 {
1239 int8 shiftCount;
1240 bits64 zSig0 = a;
1241
1242 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1243 shiftCount = countLeadingZeros32( a ) + 17;
1244 return packFloat128( 0, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1245
1246 }
1247
1248 #endif
1249
1250 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1251 /*
1252 -------------------------------------------------------------------------------
1253 Returns the result of converting the 64-bit two's complement integer `a'
1254 to the single-precision floating-point format. The conversion is performed
1255 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1256 -------------------------------------------------------------------------------
1257 */
int64_to_float32(int64 a)1258 float32 int64_to_float32( int64 a )
1259 {
1260 flag zSign;
1261 uint64 absA;
1262 int8 shiftCount;
1263
1264 if ( a == 0 ) return 0;
1265 zSign = ( a < 0 );
1266 absA = zSign ? - a : a;
1267 shiftCount = countLeadingZeros64( absA ) - 40;
1268 if ( 0 <= shiftCount ) {
1269 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1270 }
1271 else {
1272 shiftCount += 7;
1273 if ( shiftCount < 0 ) {
1274 shift64RightJamming( absA, - shiftCount, &absA );
1275 }
1276 else {
1277 absA <<= shiftCount;
1278 }
1279 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1280 }
1281
1282 }
1283
1284 /*
1285 -------------------------------------------------------------------------------
1286 Returns the result of converting the 64-bit two's complement integer `a'
1287 to the double-precision floating-point format. The conversion is performed
1288 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1289 -------------------------------------------------------------------------------
1290 */
int64_to_float64(int64 a)1291 float64 int64_to_float64( int64 a )
1292 {
1293 flag zSign;
1294
1295 if ( a == 0 ) return 0;
1296 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1297 return packFloat64( 1, 0x43E, 0 );
1298 }
1299 zSign = ( a < 0 );
1300 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1301
1302 }
1303
1304 #ifdef FLOATX80
1305
1306 /*
1307 -------------------------------------------------------------------------------
1308 Returns the result of converting the 64-bit two's complement integer `a'
1309 to the extended double-precision floating-point format. The conversion
1310 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1311 Arithmetic.
1312 -------------------------------------------------------------------------------
1313 */
int64_to_floatx80(int64 a)1314 floatx80 int64_to_floatx80( int64 a )
1315 {
1316 flag zSign;
1317 uint64 absA;
1318 int8 shiftCount;
1319
1320 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1321 zSign = ( a < 0 );
1322 absA = zSign ? - a : a;
1323 shiftCount = countLeadingZeros64( absA );
1324 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1325
1326 }
1327
1328 #endif
1329
1330 #endif /* !SOFTFLOAT_FOR_GCC */
1331
1332 #ifdef FLOAT128
1333
1334 /*
1335 -------------------------------------------------------------------------------
1336 Returns the result of converting the 64-bit two's complement integer `a' to
1337 the quadruple-precision floating-point format. The conversion is performed
1338 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1339 -------------------------------------------------------------------------------
1340 */
int64_to_float128(int64 a)1341 float128 int64_to_float128( int64 a )
1342 {
1343 flag zSign;
1344 uint64 absA;
1345 int8 shiftCount;
1346 int32 zExp;
1347 bits64 zSig0, zSig1;
1348
1349 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1350 zSign = ( a < 0 );
1351 absA = zSign ? - a : a;
1352 shiftCount = countLeadingZeros64( absA ) + 49;
1353 zExp = 0x406E - shiftCount;
1354 if ( 64 <= shiftCount ) {
1355 zSig1 = 0;
1356 zSig0 = absA;
1357 shiftCount -= 64;
1358 }
1359 else {
1360 zSig1 = absA;
1361 zSig0 = 0;
1362 }
1363 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1364 return packFloat128( zSign, zExp, zSig0, zSig1 );
1365
1366 }
1367
1368 #endif
1369
1370 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1371 /*
1372 -------------------------------------------------------------------------------
1373 Returns the result of converting the single-precision floating-point value
1374 `a' to the 32-bit two's complement integer format. The conversion is
1375 performed according to the IEC/IEEE Standard for Binary Floating-Point
1376 Arithmetic---which means in particular that the conversion is rounded
1377 according to the current rounding mode. If `a' is a NaN, the largest
1378 positive integer is returned. Otherwise, if the conversion overflows, the
1379 largest integer with the same sign as `a' is returned.
1380 -------------------------------------------------------------------------------
1381 */
float32_to_int32(float32 a)1382 int32 float32_to_int32( float32 a )
1383 {
1384 flag aSign;
1385 int16 aExp, shiftCount;
1386 bits32 aSig;
1387 bits64 aSig64;
1388
1389 aSig = extractFloat32Frac( a );
1390 aExp = extractFloat32Exp( a );
1391 aSign = extractFloat32Sign( a );
1392 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1393 if ( aExp ) aSig |= 0x00800000;
1394 shiftCount = 0xAF - aExp;
1395 aSig64 = aSig;
1396 aSig64 <<= 32;
1397 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1398 return roundAndPackInt32( aSign, aSig64 );
1399
1400 }
1401 #endif /* !SOFTFLOAT_FOR_GCC */
1402
1403 /*
1404 -------------------------------------------------------------------------------
1405 Returns the result of converting the single-precision floating-point value
1406 `a' to the 32-bit two's complement integer format. The conversion is
1407 performed according to the IEC/IEEE Standard for Binary Floating-Point
1408 Arithmetic, except that the conversion is always rounded toward zero.
1409 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1410 the conversion overflows, the largest integer with the same sign as `a' is
1411 returned.
1412 -------------------------------------------------------------------------------
1413 */
float32_to_int32_round_to_zero(float32 a)1414 int32 float32_to_int32_round_to_zero( float32 a )
1415 {
1416 flag aSign;
1417 int16 aExp, shiftCount;
1418 bits32 aSig;
1419 int32 z;
1420
1421 aSig = extractFloat32Frac( a );
1422 aExp = extractFloat32Exp( a );
1423 aSign = extractFloat32Sign( a );
1424 shiftCount = aExp - 0x9E;
1425 if ( 0 <= shiftCount ) {
1426 if ( a != 0xCF000000 ) {
1427 float_raise( float_flag_invalid );
1428 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1429 }
1430 return (sbits32) 0x80000000;
1431 }
1432 else if ( aExp <= 0x7E ) {
1433 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1434 return 0;
1435 }
1436 aSig = ( aSig | 0x00800000 )<<8;
1437 z = aSig>>( - shiftCount );
1438 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1439 float_exception_flags |= float_flag_inexact;
1440 }
1441 if ( aSign ) z = - z;
1442 return z;
1443
1444 }
1445
1446 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1447 /*
1448 -------------------------------------------------------------------------------
1449 Returns the result of converting the single-precision floating-point value
1450 `a' to the 64-bit two's complement integer format. The conversion is
1451 performed according to the IEC/IEEE Standard for Binary Floating-Point
1452 Arithmetic---which means in particular that the conversion is rounded
1453 according to the current rounding mode. If `a' is a NaN, the largest
1454 positive integer is returned. Otherwise, if the conversion overflows, the
1455 largest integer with the same sign as `a' is returned.
1456 -------------------------------------------------------------------------------
1457 */
float32_to_int64(float32 a)1458 int64 float32_to_int64( float32 a )
1459 {
1460 flag aSign;
1461 int16 aExp, shiftCount;
1462 bits32 aSig;
1463 bits64 aSig64, aSigExtra;
1464
1465 aSig = extractFloat32Frac( a );
1466 aExp = extractFloat32Exp( a );
1467 aSign = extractFloat32Sign( a );
1468 shiftCount = 0xBE - aExp;
1469 if ( shiftCount < 0 ) {
1470 float_raise( float_flag_invalid );
1471 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1472 return LIT64( 0x7FFFFFFFFFFFFFFF );
1473 }
1474 return (sbits64) LIT64( 0x8000000000000000 );
1475 }
1476 if ( aExp ) aSig |= 0x00800000;
1477 aSig64 = aSig;
1478 aSig64 <<= 40;
1479 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1480 return roundAndPackInt64( aSign, aSig64, aSigExtra );
1481
1482 }
1483
1484 /*
1485 -------------------------------------------------------------------------------
1486 Returns the result of converting the single-precision floating-point value
1487 `a' to the 64-bit two's complement integer format. The conversion is
1488 performed according to the IEC/IEEE Standard for Binary Floating-Point
1489 Arithmetic, except that the conversion is always rounded toward zero. If
1490 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1491 conversion overflows, the largest integer with the same sign as `a' is
1492 returned.
1493 -------------------------------------------------------------------------------
1494 */
float32_to_int64_round_to_zero(float32 a)1495 int64 float32_to_int64_round_to_zero( float32 a )
1496 {
1497 flag aSign;
1498 int16 aExp, shiftCount;
1499 bits32 aSig;
1500 bits64 aSig64;
1501 int64 z;
1502
1503 aSig = extractFloat32Frac( a );
1504 aExp = extractFloat32Exp( a );
1505 aSign = extractFloat32Sign( a );
1506 shiftCount = aExp - 0xBE;
1507 if ( 0 <= shiftCount ) {
1508 if ( a != 0xDF000000 ) {
1509 float_raise( float_flag_invalid );
1510 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1511 return LIT64( 0x7FFFFFFFFFFFFFFF );
1512 }
1513 }
1514 return (sbits64) LIT64( 0x8000000000000000 );
1515 }
1516 else if ( aExp <= 0x7E ) {
1517 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1518 return 0;
1519 }
1520 aSig64 = aSig | 0x00800000;
1521 aSig64 <<= 40;
1522 z = aSig64>>( - shiftCount );
1523 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1524 float_exception_flags |= float_flag_inexact;
1525 }
1526 if ( aSign ) z = - z;
1527 return z;
1528
1529 }
1530 #endif /* !SOFTFLOAT_FOR_GCC */
1531
1532 /*
1533 -------------------------------------------------------------------------------
1534 Returns the result of converting the single-precision floating-point value
1535 `a' to the double-precision floating-point format. The conversion is
1536 performed according to the IEC/IEEE Standard for Binary Floating-Point
1537 Arithmetic.
1538 -------------------------------------------------------------------------------
1539 */
float32_to_float64(float32 a)1540 float64 float32_to_float64( float32 a )
1541 {
1542 flag aSign;
1543 int16 aExp;
1544 bits32 aSig;
1545
1546 aSig = extractFloat32Frac( a );
1547 aExp = extractFloat32Exp( a );
1548 aSign = extractFloat32Sign( a );
1549 if ( aExp == 0xFF ) {
1550 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1551 return packFloat64( aSign, 0x7FF, 0 );
1552 }
1553 if ( aExp == 0 ) {
1554 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1555 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1556 --aExp;
1557 }
1558 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1559
1560 }
1561
1562 #ifdef FLOATX80
1563
1564 /*
1565 -------------------------------------------------------------------------------
1566 Returns the result of converting the single-precision floating-point value
1567 `a' to the extended double-precision floating-point format. The conversion
1568 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1569 Arithmetic.
1570 -------------------------------------------------------------------------------
1571 */
float32_to_floatx80(float32 a)1572 floatx80 float32_to_floatx80( float32 a )
1573 {
1574 flag aSign;
1575 int16 aExp;
1576 bits32 aSig;
1577
1578 aSig = extractFloat32Frac( a );
1579 aExp = extractFloat32Exp( a );
1580 aSign = extractFloat32Sign( a );
1581 if ( aExp == 0xFF ) {
1582 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1583 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1584 }
1585 if ( aExp == 0 ) {
1586 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1587 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1588 }
1589 aSig |= 0x00800000;
1590 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1591
1592 }
1593
1594 #endif
1595
1596 #ifdef FLOAT128
1597
1598 /*
1599 -------------------------------------------------------------------------------
1600 Returns the result of converting the single-precision floating-point value
1601 `a' to the double-precision floating-point format. The conversion is
1602 performed according to the IEC/IEEE Standard for Binary Floating-Point
1603 Arithmetic.
1604 -------------------------------------------------------------------------------
1605 */
float32_to_float128(float32 a)1606 float128 float32_to_float128( float32 a )
1607 {
1608 flag aSign;
1609 int16 aExp;
1610 bits32 aSig;
1611
1612 aSig = extractFloat32Frac( a );
1613 aExp = extractFloat32Exp( a );
1614 aSign = extractFloat32Sign( a );
1615 if ( aExp == 0xFF ) {
1616 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1617 return packFloat128( aSign, 0x7FFF, 0, 0 );
1618 }
1619 if ( aExp == 0 ) {
1620 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1621 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1622 --aExp;
1623 }
1624 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1625
1626 }
1627
1628 #endif
1629
1630 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1631 /*
1632 -------------------------------------------------------------------------------
1633 Rounds the single-precision floating-point value `a' to an integer, and
1634 returns the result as a single-precision floating-point value. The
1635 operation is performed according to the IEC/IEEE Standard for Binary
1636 Floating-Point Arithmetic.
1637 -------------------------------------------------------------------------------
1638 */
float32_round_to_int(float32 a)1639 float32 float32_round_to_int( float32 a )
1640 {
1641 flag aSign;
1642 int16 aExp;
1643 bits32 lastBitMask, roundBitsMask;
1644 int8 roundingMode;
1645 float32 z;
1646
1647 aExp = extractFloat32Exp( a );
1648 if ( 0x96 <= aExp ) {
1649 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1650 return propagateFloat32NaN( a, a );
1651 }
1652 return a;
1653 }
1654 if ( aExp <= 0x7E ) {
1655 if ( (bits32) ( a<<1 ) == 0 ) return a;
1656 float_exception_flags |= float_flag_inexact;
1657 aSign = extractFloat32Sign( a );
1658 switch ( float_rounding_mode ) {
1659 case float_round_nearest_even:
1660 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1661 return packFloat32( aSign, 0x7F, 0 );
1662 }
1663 break;
1664 case float_round_to_zero:
1665 break;
1666 case float_round_down:
1667 return aSign ? 0xBF800000 : 0;
1668 case float_round_up:
1669 return aSign ? 0x80000000 : 0x3F800000;
1670 }
1671 return packFloat32( aSign, 0, 0 );
1672 }
1673 lastBitMask = 1;
1674 lastBitMask <<= 0x96 - aExp;
1675 roundBitsMask = lastBitMask - 1;
1676 z = a;
1677 roundingMode = float_rounding_mode;
1678 if ( roundingMode == float_round_nearest_even ) {
1679 z += lastBitMask>>1;
1680 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1681 }
1682 else if ( roundingMode != float_round_to_zero ) {
1683 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1684 z += roundBitsMask;
1685 }
1686 }
1687 z &= ~ roundBitsMask;
1688 if ( z != a ) float_exception_flags |= float_flag_inexact;
1689 return z;
1690
1691 }
1692 #endif /* !SOFTFLOAT_FOR_GCC */
1693
1694 /*
1695 -------------------------------------------------------------------------------
1696 Returns the result of adding the absolute values of the single-precision
1697 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1698 before being returned. `zSign' is ignored if the result is a NaN.
1699 The addition is performed according to the IEC/IEEE Standard for Binary
1700 Floating-Point Arithmetic.
1701 -------------------------------------------------------------------------------
1702 */
addFloat32Sigs(float32 a,float32 b,flag zSign)1703 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1704 {
1705 int16 aExp, bExp, zExp;
1706 bits32 aSig, bSig, zSig;
1707 int16 expDiff;
1708
1709 aSig = extractFloat32Frac( a );
1710 aExp = extractFloat32Exp( a );
1711 bSig = extractFloat32Frac( b );
1712 bExp = extractFloat32Exp( b );
1713 expDiff = aExp - bExp;
1714 aSig <<= 6;
1715 bSig <<= 6;
1716 if ( 0 < expDiff ) {
1717 if ( aExp == 0xFF ) {
1718 if ( aSig ) return propagateFloat32NaN( a, b );
1719 return a;
1720 }
1721 if ( bExp == 0 ) {
1722 --expDiff;
1723 }
1724 else {
1725 bSig |= 0x20000000;
1726 }
1727 shift32RightJamming( bSig, expDiff, &bSig );
1728 zExp = aExp;
1729 }
1730 else if ( expDiff < 0 ) {
1731 if ( bExp == 0xFF ) {
1732 if ( bSig ) return propagateFloat32NaN( a, b );
1733 return packFloat32( zSign, 0xFF, 0 );
1734 }
1735 if ( aExp == 0 ) {
1736 ++expDiff;
1737 }
1738 else {
1739 aSig |= 0x20000000;
1740 }
1741 shift32RightJamming( aSig, - expDiff, &aSig );
1742 zExp = bExp;
1743 }
1744 else {
1745 if ( aExp == 0xFF ) {
1746 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1747 return a;
1748 }
1749 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1750 zSig = 0x40000000 + aSig + bSig;
1751 zExp = aExp;
1752 goto roundAndPack;
1753 }
1754 aSig |= 0x20000000;
1755 zSig = ( aSig + bSig )<<1;
1756 --zExp;
1757 if ( (sbits32) zSig < 0 ) {
1758 zSig = aSig + bSig;
1759 ++zExp;
1760 }
1761 roundAndPack:
1762 return roundAndPackFloat32( zSign, zExp, zSig );
1763
1764 }
1765
1766 /*
1767 -------------------------------------------------------------------------------
1768 Returns the result of subtracting the absolute values of the single-
1769 precision floating-point values `a' and `b'. If `zSign' is 1, the
1770 difference is negated before being returned. `zSign' is ignored if the
1771 result is a NaN. The subtraction is performed according to the IEC/IEEE
1772 Standard for Binary Floating-Point Arithmetic.
1773 -------------------------------------------------------------------------------
1774 */
subFloat32Sigs(float32 a,float32 b,flag zSign)1775 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1776 {
1777 int16 aExp, bExp, zExp;
1778 bits32 aSig, bSig, zSig;
1779 int16 expDiff;
1780
1781 aSig = extractFloat32Frac( a );
1782 aExp = extractFloat32Exp( a );
1783 bSig = extractFloat32Frac( b );
1784 bExp = extractFloat32Exp( b );
1785 expDiff = aExp - bExp;
1786 aSig <<= 7;
1787 bSig <<= 7;
1788 if ( 0 < expDiff ) goto aExpBigger;
1789 if ( expDiff < 0 ) goto bExpBigger;
1790 if ( aExp == 0xFF ) {
1791 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1792 float_raise( float_flag_invalid );
1793 return float32_default_nan;
1794 }
1795 if ( aExp == 0 ) {
1796 aExp = 1;
1797 bExp = 1;
1798 }
1799 if ( bSig < aSig ) goto aBigger;
1800 if ( aSig < bSig ) goto bBigger;
1801 return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
1802 bExpBigger:
1803 if ( bExp == 0xFF ) {
1804 if ( bSig ) return propagateFloat32NaN( a, b );
1805 return packFloat32( zSign ^ 1, 0xFF, 0 );
1806 }
1807 if ( aExp == 0 ) {
1808 ++expDiff;
1809 }
1810 else {
1811 aSig |= 0x40000000;
1812 }
1813 shift32RightJamming( aSig, - expDiff, &aSig );
1814 bSig |= 0x40000000;
1815 bBigger:
1816 zSig = bSig - aSig;
1817 zExp = bExp;
1818 zSign ^= 1;
1819 goto normalizeRoundAndPack;
1820 aExpBigger:
1821 if ( aExp == 0xFF ) {
1822 if ( aSig ) return propagateFloat32NaN( a, b );
1823 return a;
1824 }
1825 if ( bExp == 0 ) {
1826 --expDiff;
1827 }
1828 else {
1829 bSig |= 0x40000000;
1830 }
1831 shift32RightJamming( bSig, expDiff, &bSig );
1832 aSig |= 0x40000000;
1833 aBigger:
1834 zSig = aSig - bSig;
1835 zExp = aExp;
1836 normalizeRoundAndPack:
1837 --zExp;
1838 return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1839
1840 }
1841
1842 /*
1843 -------------------------------------------------------------------------------
1844 Returns the result of adding the single-precision floating-point values `a'
1845 and `b'. The operation is performed according to the IEC/IEEE Standard for
1846 Binary Floating-Point Arithmetic.
1847 -------------------------------------------------------------------------------
1848 */
float32_add(float32 a,float32 b)1849 float32 float32_add( float32 a, float32 b )
1850 {
1851 flag aSign, bSign;
1852
1853 aSign = extractFloat32Sign( a );
1854 bSign = extractFloat32Sign( b );
1855 if ( aSign == bSign ) {
1856 return addFloat32Sigs( a, b, aSign );
1857 }
1858 else {
1859 return subFloat32Sigs( a, b, aSign );
1860 }
1861
1862 }
1863
1864 /*
1865 -------------------------------------------------------------------------------
1866 Returns the result of subtracting the single-precision floating-point values
1867 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1868 for Binary Floating-Point Arithmetic.
1869 -------------------------------------------------------------------------------
1870 */
float32_sub(float32 a,float32 b)1871 float32 float32_sub( float32 a, float32 b )
1872 {
1873 flag aSign, bSign;
1874
1875 aSign = extractFloat32Sign( a );
1876 bSign = extractFloat32Sign( b );
1877 if ( aSign == bSign ) {
1878 return subFloat32Sigs( a, b, aSign );
1879 }
1880 else {
1881 return addFloat32Sigs( a, b, aSign );
1882 }
1883
1884 }
1885
1886 /*
1887 -------------------------------------------------------------------------------
1888 Returns the result of multiplying the single-precision floating-point values
1889 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1890 for Binary Floating-Point Arithmetic.
1891 -------------------------------------------------------------------------------
1892 */
float32_mul(float32 a,float32 b)1893 float32 float32_mul( float32 a, float32 b )
1894 {
1895 flag aSign, bSign, zSign;
1896 int16 aExp, bExp, zExp;
1897 bits32 aSig, bSig;
1898 bits64 zSig64;
1899 bits32 zSig;
1900
1901 aSig = extractFloat32Frac( a );
1902 aExp = extractFloat32Exp( a );
1903 aSign = extractFloat32Sign( a );
1904 bSig = extractFloat32Frac( b );
1905 bExp = extractFloat32Exp( b );
1906 bSign = extractFloat32Sign( b );
1907 zSign = aSign ^ bSign;
1908 if ( aExp == 0xFF ) {
1909 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1910 return propagateFloat32NaN( a, b );
1911 }
1912 if ( ( bExp | bSig ) == 0 ) {
1913 float_raise( float_flag_invalid );
1914 return float32_default_nan;
1915 }
1916 return packFloat32( zSign, 0xFF, 0 );
1917 }
1918 if ( bExp == 0xFF ) {
1919 if ( bSig ) return propagateFloat32NaN( a, b );
1920 if ( ( aExp | aSig ) == 0 ) {
1921 float_raise( float_flag_invalid );
1922 return float32_default_nan;
1923 }
1924 return packFloat32( zSign, 0xFF, 0 );
1925 }
1926 if ( aExp == 0 ) {
1927 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1928 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1929 }
1930 if ( bExp == 0 ) {
1931 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1932 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1933 }
1934 zExp = aExp + bExp - 0x7F;
1935 aSig = ( aSig | 0x00800000 )<<7;
1936 bSig = ( bSig | 0x00800000 )<<8;
1937 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1938 zSig = zSig64;
1939 if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1940 zSig <<= 1;
1941 --zExp;
1942 }
1943 return roundAndPackFloat32( zSign, zExp, zSig );
1944
1945 }
1946
1947 /*
1948 -------------------------------------------------------------------------------
1949 Returns the result of dividing the single-precision floating-point value `a'
1950 by the corresponding value `b'. The operation is performed according to the
1951 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1952 -------------------------------------------------------------------------------
1953 */
float32_div(float32 a,float32 b)1954 float32 float32_div( float32 a, float32 b )
1955 {
1956 flag aSign, bSign, zSign;
1957 int16 aExp, bExp, zExp;
1958 bits32 aSig, bSig, zSig;
1959
1960 aSig = extractFloat32Frac( a );
1961 aExp = extractFloat32Exp( a );
1962 aSign = extractFloat32Sign( a );
1963 bSig = extractFloat32Frac( b );
1964 bExp = extractFloat32Exp( b );
1965 bSign = extractFloat32Sign( b );
1966 zSign = aSign ^ bSign;
1967 if ( aExp == 0xFF ) {
1968 if ( aSig ) return propagateFloat32NaN( a, b );
1969 if ( bExp == 0xFF ) {
1970 if ( bSig ) return propagateFloat32NaN( a, b );
1971 float_raise( float_flag_invalid );
1972 return float32_default_nan;
1973 }
1974 return packFloat32( zSign, 0xFF, 0 );
1975 }
1976 if ( bExp == 0xFF ) {
1977 if ( bSig ) return propagateFloat32NaN( a, b );
1978 return packFloat32( zSign, 0, 0 );
1979 }
1980 if ( bExp == 0 ) {
1981 if ( bSig == 0 ) {
1982 if ( ( aExp | aSig ) == 0 ) {
1983 float_raise( float_flag_invalid );
1984 return float32_default_nan;
1985 }
1986 float_raise( float_flag_divbyzero );
1987 return packFloat32( zSign, 0xFF, 0 );
1988 }
1989 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1990 }
1991 if ( aExp == 0 ) {
1992 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1993 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1994 }
1995 zExp = aExp - bExp + 0x7D;
1996 aSig = ( aSig | 0x00800000 )<<7;
1997 bSig = ( bSig | 0x00800000 )<<8;
1998 if ( bSig <= ( aSig + aSig ) ) {
1999 aSig >>= 1;
2000 ++zExp;
2001 }
2002 zSig = ( ( (bits64) aSig )<<32 ) / bSig;
2003 if ( ( zSig & 0x3F ) == 0 ) {
2004 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
2005 }
2006 return roundAndPackFloat32( zSign, zExp, zSig );
2007
2008 }
2009
2010 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2011 /*
2012 -------------------------------------------------------------------------------
2013 Returns the remainder of the single-precision floating-point value `a'
2014 with respect to the corresponding value `b'. The operation is performed
2015 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2016 -------------------------------------------------------------------------------
2017 */
float32_rem(float32 a,float32 b)2018 float32 float32_rem( float32 a, float32 b )
2019 {
2020 flag aSign, bSign, zSign;
2021 int16 aExp, bExp, expDiff;
2022 bits32 aSig, bSig;
2023 bits32 q;
2024 bits64 aSig64, bSig64, q64;
2025 bits32 alternateASig;
2026 sbits32 sigMean;
2027
2028 aSig = extractFloat32Frac( a );
2029 aExp = extractFloat32Exp( a );
2030 aSign = extractFloat32Sign( a );
2031 bSig = extractFloat32Frac( b );
2032 bExp = extractFloat32Exp( b );
2033 bSign = extractFloat32Sign( b );
2034 if ( aExp == 0xFF ) {
2035 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2036 return propagateFloat32NaN( a, b );
2037 }
2038 float_raise( float_flag_invalid );
2039 return float32_default_nan;
2040 }
2041 if ( bExp == 0xFF ) {
2042 if ( bSig ) return propagateFloat32NaN( a, b );
2043 return a;
2044 }
2045 if ( bExp == 0 ) {
2046 if ( bSig == 0 ) {
2047 float_raise( float_flag_invalid );
2048 return float32_default_nan;
2049 }
2050 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2051 }
2052 if ( aExp == 0 ) {
2053 if ( aSig == 0 ) return a;
2054 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2055 }
2056 expDiff = aExp - bExp;
2057 aSig |= 0x00800000;
2058 bSig |= 0x00800000;
2059 if ( expDiff < 32 ) {
2060 aSig <<= 8;
2061 bSig <<= 8;
2062 if ( expDiff < 0 ) {
2063 if ( expDiff < -1 ) return a;
2064 aSig >>= 1;
2065 }
2066 q = ( bSig <= aSig );
2067 if ( q ) aSig -= bSig;
2068 if ( 0 < expDiff ) {
2069 q = ( ( (bits64) aSig )<<32 ) / bSig;
2070 q >>= 32 - expDiff;
2071 bSig >>= 2;
2072 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2073 }
2074 else {
2075 aSig >>= 2;
2076 bSig >>= 2;
2077 }
2078 }
2079 else {
2080 if ( bSig <= aSig ) aSig -= bSig;
2081 aSig64 = ( (bits64) aSig )<<40;
2082 bSig64 = ( (bits64) bSig )<<40;
2083 expDiff -= 64;
2084 while ( 0 < expDiff ) {
2085 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2086 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2087 aSig64 = - ( ( bSig * q64 )<<38 );
2088 expDiff -= 62;
2089 }
2090 expDiff += 64;
2091 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2092 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2093 q = q64>>( 64 - expDiff );
2094 bSig <<= 6;
2095 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2096 }
2097 do {
2098 alternateASig = aSig;
2099 ++q;
2100 aSig -= bSig;
2101 } while ( 0 <= (sbits32) aSig );
2102 sigMean = aSig + alternateASig;
2103 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2104 aSig = alternateASig;
2105 }
2106 zSign = ( (sbits32) aSig < 0 );
2107 if ( zSign ) aSig = - aSig;
2108 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
2109
2110 }
2111 #endif /* !SOFTFLOAT_FOR_GCC */
2112
2113 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2114 /*
2115 -------------------------------------------------------------------------------
2116 Returns the square root of the single-precision floating-point value `a'.
2117 The operation is performed according to the IEC/IEEE Standard for Binary
2118 Floating-Point Arithmetic.
2119 -------------------------------------------------------------------------------
2120 */
float32_sqrt(float32 a)2121 float32 float32_sqrt( float32 a )
2122 {
2123 flag aSign;
2124 int16 aExp, zExp;
2125 bits32 aSig, zSig;
2126 bits64 rem, term;
2127
2128 aSig = extractFloat32Frac( a );
2129 aExp = extractFloat32Exp( a );
2130 aSign = extractFloat32Sign( a );
2131 if ( aExp == 0xFF ) {
2132 if ( aSig ) return propagateFloat32NaN( a, 0 );
2133 if ( ! aSign ) return a;
2134 float_raise( float_flag_invalid );
2135 return float32_default_nan;
2136 }
2137 if ( aSign ) {
2138 if ( ( aExp | aSig ) == 0 ) return a;
2139 float_raise( float_flag_invalid );
2140 return float32_default_nan;
2141 }
2142 if ( aExp == 0 ) {
2143 if ( aSig == 0 ) return 0;
2144 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2145 }
2146 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2147 aSig = ( aSig | 0x00800000 )<<8;
2148 zSig = estimateSqrt32( aExp, aSig ) + 2;
2149 if ( ( zSig & 0x7F ) <= 5 ) {
2150 if ( zSig < 2 ) {
2151 zSig = 0x7FFFFFFF;
2152 goto roundAndPack;
2153 }
2154 aSig >>= aExp & 1;
2155 term = ( (bits64) zSig ) * zSig;
2156 rem = ( ( (bits64) aSig )<<32 ) - term;
2157 while ( (sbits64) rem < 0 ) {
2158 --zSig;
2159 rem += ( ( (bits64) zSig )<<1 ) | 1;
2160 }
2161 zSig |= ( rem != 0 );
2162 }
2163 shift32RightJamming( zSig, 1, &zSig );
2164 roundAndPack:
2165 return roundAndPackFloat32( 0, zExp, zSig );
2166
2167 }
2168 #endif /* !SOFTFLOAT_FOR_GCC */
2169
2170 /*
2171 -------------------------------------------------------------------------------
2172 Returns 1 if the single-precision floating-point value `a' is equal to
2173 the corresponding value `b', and 0 otherwise. The comparison is performed
2174 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2175 -------------------------------------------------------------------------------
2176 */
float32_eq(float32 a,float32 b)2177 flag float32_eq( float32 a, float32 b )
2178 {
2179
2180 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2181 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2182 ) {
2183 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2184 float_raise( float_flag_invalid );
2185 }
2186 return 0;
2187 }
2188 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2189
2190 }
2191
2192 /*
2193 -------------------------------------------------------------------------------
2194 Returns 1 if the single-precision floating-point value `a' is less than
2195 or equal to the corresponding value `b', and 0 otherwise. The comparison
2196 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2197 Arithmetic.
2198 -------------------------------------------------------------------------------
2199 */
float32_le(float32 a,float32 b)2200 flag float32_le( float32 a, float32 b )
2201 {
2202 flag aSign, bSign;
2203
2204 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2205 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2206 ) {
2207 float_raise( float_flag_invalid );
2208 return 0;
2209 }
2210 aSign = extractFloat32Sign( a );
2211 bSign = extractFloat32Sign( b );
2212 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2213 return ( a == b ) || ( aSign ^ ( a < b ) );
2214
2215 }
2216
2217 /*
2218 -------------------------------------------------------------------------------
2219 Returns 1 if the single-precision floating-point value `a' is less than
2220 the corresponding value `b', and 0 otherwise. The comparison is performed
2221 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2222 -------------------------------------------------------------------------------
2223 */
float32_lt(float32 a,float32 b)2224 flag float32_lt( float32 a, float32 b )
2225 {
2226 flag aSign, bSign;
2227
2228 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2229 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2230 ) {
2231 float_raise( float_flag_invalid );
2232 return 0;
2233 }
2234 aSign = extractFloat32Sign( a );
2235 bSign = extractFloat32Sign( b );
2236 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2237 return ( a != b ) && ( aSign ^ ( a < b ) );
2238
2239 }
2240
2241 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2242 /*
2243 -------------------------------------------------------------------------------
2244 Returns 1 if the single-precision floating-point value `a' is equal to
2245 the corresponding value `b', and 0 otherwise. The invalid exception is
2246 raised if either operand is a NaN. Otherwise, the comparison is performed
2247 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2248 -------------------------------------------------------------------------------
2249 */
float32_eq_signaling(float32 a,float32 b)2250 flag float32_eq_signaling( float32 a, float32 b )
2251 {
2252
2253 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2254 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2255 ) {
2256 float_raise( float_flag_invalid );
2257 return 0;
2258 }
2259 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2260
2261 }
2262
2263 /*
2264 -------------------------------------------------------------------------------
2265 Returns 1 if the single-precision floating-point value `a' is less than or
2266 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2267 cause an exception. Otherwise, the comparison is performed according to the
2268 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2269 -------------------------------------------------------------------------------
2270 */
float32_le_quiet(float32 a,float32 b)2271 flag float32_le_quiet( float32 a, float32 b )
2272 {
2273 flag aSign, bSign;
2274
2275 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2276 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2277 ) {
2278 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2279 float_raise( float_flag_invalid );
2280 }
2281 return 0;
2282 }
2283 aSign = extractFloat32Sign( a );
2284 bSign = extractFloat32Sign( b );
2285 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2286 return ( a == b ) || ( aSign ^ ( a < b ) );
2287
2288 }
2289
2290 /*
2291 -------------------------------------------------------------------------------
2292 Returns 1 if the single-precision floating-point value `a' is less than
2293 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2294 exception. Otherwise, the comparison is performed according to the IEC/IEEE
2295 Standard for Binary Floating-Point Arithmetic.
2296 -------------------------------------------------------------------------------
2297 */
float32_lt_quiet(float32 a,float32 b)2298 flag float32_lt_quiet( float32 a, float32 b )
2299 {
2300 flag aSign, bSign;
2301
2302 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2303 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2304 ) {
2305 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2306 float_raise( float_flag_invalid );
2307 }
2308 return 0;
2309 }
2310 aSign = extractFloat32Sign( a );
2311 bSign = extractFloat32Sign( b );
2312 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2313 return ( a != b ) && ( aSign ^ ( a < b ) );
2314
2315 }
2316 #endif /* !SOFTFLOAT_FOR_GCC */
2317
2318 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2319 /*
2320 -------------------------------------------------------------------------------
2321 Returns the result of converting the double-precision floating-point value
2322 `a' to the 32-bit two's complement integer format. The conversion is
2323 performed according to the IEC/IEEE Standard for Binary Floating-Point
2324 Arithmetic---which means in particular that the conversion is rounded
2325 according to the current rounding mode. If `a' is a NaN, the largest
2326 positive integer is returned. Otherwise, if the conversion overflows, the
2327 largest integer with the same sign as `a' is returned.
2328 -------------------------------------------------------------------------------
2329 */
float64_to_int32(float64 a)2330 int32 float64_to_int32( float64 a )
2331 {
2332 flag aSign;
2333 int16 aExp, shiftCount;
2334 bits64 aSig;
2335
2336 aSig = extractFloat64Frac( a );
2337 aExp = extractFloat64Exp( a );
2338 aSign = extractFloat64Sign( a );
2339 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2340 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2341 shiftCount = 0x42C - aExp;
2342 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2343 return roundAndPackInt32( aSign, aSig );
2344
2345 }
2346 #endif /* !SOFTFLOAT_FOR_GCC */
2347
2348 /*
2349 -------------------------------------------------------------------------------
2350 Returns the result of converting the double-precision floating-point value
2351 `a' to the 32-bit two's complement integer format. The conversion is
2352 performed according to the IEC/IEEE Standard for Binary Floating-Point
2353 Arithmetic, except that the conversion is always rounded toward zero.
2354 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2355 the conversion overflows, the largest integer with the same sign as `a' is
2356 returned.
2357 -------------------------------------------------------------------------------
2358 */
float64_to_int32_round_to_zero(float64 a)2359 int32 float64_to_int32_round_to_zero( float64 a )
2360 {
2361 flag aSign;
2362 int16 aExp, shiftCount;
2363 bits64 aSig, savedASig;
2364 int32 z;
2365
2366 aSig = extractFloat64Frac( a );
2367 aExp = extractFloat64Exp( a );
2368 aSign = extractFloat64Sign( a );
2369 if ( 0x41E < aExp ) {
2370 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2371 goto invalid;
2372 }
2373 else if ( aExp < 0x3FF ) {
2374 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2375 return 0;
2376 }
2377 aSig |= LIT64( 0x0010000000000000 );
2378 shiftCount = 0x433 - aExp;
2379 savedASig = aSig;
2380 aSig >>= shiftCount;
2381 z = aSig;
2382 if ( aSign ) z = - z;
2383 if ( ( z < 0 ) ^ aSign ) {
2384 invalid:
2385 float_raise( float_flag_invalid );
2386 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2387 }
2388 if ( ( aSig<<shiftCount ) != savedASig ) {
2389 float_exception_flags |= float_flag_inexact;
2390 }
2391 return z;
2392
2393 }
2394
2395 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2396 /*
2397 -------------------------------------------------------------------------------
2398 Returns the result of converting the double-precision floating-point value
2399 `a' to the 64-bit two's complement integer format. The conversion is
2400 performed according to the IEC/IEEE Standard for Binary Floating-Point
2401 Arithmetic---which means in particular that the conversion is rounded
2402 according to the current rounding mode. If `a' is a NaN, the largest
2403 positive integer is returned. Otherwise, if the conversion overflows, the
2404 largest integer with the same sign as `a' is returned.
2405 -------------------------------------------------------------------------------
2406 */
float64_to_int64(float64 a)2407 int64 float64_to_int64( float64 a )
2408 {
2409 flag aSign;
2410 int16 aExp, shiftCount;
2411 bits64 aSig, aSigExtra;
2412
2413 aSig = extractFloat64Frac( a );
2414 aExp = extractFloat64Exp( a );
2415 aSign = extractFloat64Sign( a );
2416 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2417 shiftCount = 0x433 - aExp;
2418 if ( shiftCount <= 0 ) {
2419 if ( 0x43E < aExp ) {
2420 float_raise( float_flag_invalid );
2421 if ( ! aSign
2422 || ( ( aExp == 0x7FF )
2423 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2424 ) {
2425 return LIT64( 0x7FFFFFFFFFFFFFFF );
2426 }
2427 return (sbits64) LIT64( 0x8000000000000000 );
2428 }
2429 aSigExtra = 0;
2430 aSig <<= - shiftCount;
2431 }
2432 else {
2433 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2434 }
2435 return roundAndPackInt64( aSign, aSig, aSigExtra );
2436
2437 }
2438
2439 /*
2440 -------------------------------------------------------------------------------
2441 Returns the result of converting the double-precision floating-point value
2442 `a' to the 64-bit two's complement integer format. The conversion is
2443 performed according to the IEC/IEEE Standard for Binary Floating-Point
2444 Arithmetic, except that the conversion is always rounded toward zero.
2445 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2446 the conversion overflows, the largest integer with the same sign as `a' is
2447 returned.
2448 -------------------------------------------------------------------------------
2449 */
float64_to_int64_round_to_zero(float64 a)2450 int64 float64_to_int64_round_to_zero( float64 a )
2451 {
2452 flag aSign;
2453 int16 aExp, shiftCount;
2454 bits64 aSig;
2455 int64 z;
2456
2457 aSig = extractFloat64Frac( a );
2458 aExp = extractFloat64Exp( a );
2459 aSign = extractFloat64Sign( a );
2460 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2461 shiftCount = aExp - 0x433;
2462 if ( 0 <= shiftCount ) {
2463 if ( 0x43E <= aExp ) {
2464 if ( a != LIT64( 0xC3E0000000000000 ) ) {
2465 float_raise( float_flag_invalid );
2466 if ( ! aSign
2467 || ( ( aExp == 0x7FF )
2468 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2469 ) {
2470 return LIT64( 0x7FFFFFFFFFFFFFFF );
2471 }
2472 }
2473 return (sbits64) LIT64( 0x8000000000000000 );
2474 }
2475 z = aSig<<shiftCount;
2476 }
2477 else {
2478 if ( aExp < 0x3FE ) {
2479 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
2480 return 0;
2481 }
2482 z = aSig>>( - shiftCount );
2483 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2484 float_exception_flags |= float_flag_inexact;
2485 }
2486 }
2487 if ( aSign ) z = - z;
2488 return z;
2489
2490 }
2491 #endif /* !SOFTFLOAT_FOR_GCC */
2492
2493 /*
2494 -------------------------------------------------------------------------------
2495 Returns the result of converting the double-precision floating-point value
2496 `a' to the single-precision floating-point format. The conversion is
2497 performed according to the IEC/IEEE Standard for Binary Floating-Point
2498 Arithmetic.
2499 -------------------------------------------------------------------------------
2500 */
float64_to_float32(float64 a)2501 float32 float64_to_float32( float64 a )
2502 {
2503 flag aSign;
2504 int16 aExp;
2505 bits64 aSig;
2506 bits32 zSig;
2507
2508 aSig = extractFloat64Frac( a );
2509 aExp = extractFloat64Exp( a );
2510 aSign = extractFloat64Sign( a );
2511 if ( aExp == 0x7FF ) {
2512 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2513 return packFloat32( aSign, 0xFF, 0 );
2514 }
2515 shift64RightJamming( aSig, 22, &aSig );
2516 zSig = aSig;
2517 if ( aExp || zSig ) {
2518 zSig |= 0x40000000;
2519 aExp -= 0x381;
2520 }
2521 return roundAndPackFloat32( aSign, aExp, zSig );
2522
2523 }
2524
2525 #ifdef FLOATX80
2526
2527 /*
2528 -------------------------------------------------------------------------------
2529 Returns the result of converting the double-precision floating-point value
2530 `a' to the extended double-precision floating-point format. The conversion
2531 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2532 Arithmetic.
2533 -------------------------------------------------------------------------------
2534 */
float64_to_floatx80(float64 a)2535 floatx80 float64_to_floatx80( float64 a )
2536 {
2537 flag aSign;
2538 int16 aExp;
2539 bits64 aSig;
2540
2541 aSig = extractFloat64Frac( a );
2542 aExp = extractFloat64Exp( a );
2543 aSign = extractFloat64Sign( a );
2544 if ( aExp == 0x7FF ) {
2545 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2546 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2547 }
2548 if ( aExp == 0 ) {
2549 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2550 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2551 }
2552 return
2553 packFloatx80(
2554 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2555
2556 }
2557
2558 #endif
2559
2560 #ifdef FLOAT128
2561
2562 /*
2563 -------------------------------------------------------------------------------
2564 Returns the result of converting the double-precision floating-point value
2565 `a' to the quadruple-precision floating-point format. The conversion is
2566 performed according to the IEC/IEEE Standard for Binary Floating-Point
2567 Arithmetic.
2568 -------------------------------------------------------------------------------
2569 */
float64_to_float128(float64 a)2570 float128 float64_to_float128( float64 a )
2571 {
2572 flag aSign;
2573 int16 aExp;
2574 bits64 aSig, zSig0, zSig1;
2575
2576 aSig = extractFloat64Frac( a );
2577 aExp = extractFloat64Exp( a );
2578 aSign = extractFloat64Sign( a );
2579 if ( aExp == 0x7FF ) {
2580 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2581 return packFloat128( aSign, 0x7FFF, 0, 0 );
2582 }
2583 if ( aExp == 0 ) {
2584 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2585 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2586 --aExp;
2587 }
2588 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2589 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2590
2591 }
2592
2593 #endif
2594
2595 #ifndef SOFTFLOAT_FOR_GCC
2596 /*
2597 -------------------------------------------------------------------------------
2598 Rounds the double-precision floating-point value `a' to an integer, and
2599 returns the result as a double-precision floating-point value. The
2600 operation is performed according to the IEC/IEEE Standard for Binary
2601 Floating-Point Arithmetic.
2602 -------------------------------------------------------------------------------
2603 */
float64_round_to_int(float64 a)2604 float64 float64_round_to_int( float64 a )
2605 {
2606 flag aSign;
2607 int16 aExp;
2608 bits64 lastBitMask, roundBitsMask;
2609 int8 roundingMode;
2610 float64 z;
2611
2612 aExp = extractFloat64Exp( a );
2613 if ( 0x433 <= aExp ) {
2614 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2615 return propagateFloat64NaN( a, a );
2616 }
2617 return a;
2618 }
2619 if ( aExp < 0x3FF ) {
2620 if ( (bits64) ( a<<1 ) == 0 ) return a;
2621 float_exception_flags |= float_flag_inexact;
2622 aSign = extractFloat64Sign( a );
2623 switch ( float_rounding_mode ) {
2624 case float_round_nearest_even:
2625 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2626 return packFloat64( aSign, 0x3FF, 0 );
2627 }
2628 break;
2629 case float_round_to_zero:
2630 break;
2631 case float_round_down:
2632 return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2633 case float_round_up:
2634 return
2635 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2636 }
2637 return packFloat64( aSign, 0, 0 );
2638 }
2639 lastBitMask = 1;
2640 lastBitMask <<= 0x433 - aExp;
2641 roundBitsMask = lastBitMask - 1;
2642 z = a;
2643 roundingMode = float_rounding_mode;
2644 if ( roundingMode == float_round_nearest_even ) {
2645 z += lastBitMask>>1;
2646 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2647 }
2648 else if ( roundingMode != float_round_to_zero ) {
2649 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2650 z += roundBitsMask;
2651 }
2652 }
2653 z &= ~ roundBitsMask;
2654 if ( z != a ) float_exception_flags |= float_flag_inexact;
2655 return z;
2656
2657 }
2658 #endif
2659
2660 /*
2661 -------------------------------------------------------------------------------
2662 Returns the result of adding the absolute values of the double-precision
2663 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2664 before being returned. `zSign' is ignored if the result is a NaN.
2665 The addition is performed according to the IEC/IEEE Standard for Binary
2666 Floating-Point Arithmetic.
2667 -------------------------------------------------------------------------------
2668 */
addFloat64Sigs(float64 a,float64 b,flag zSign)2669 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2670 {
2671 int16 aExp, bExp, zExp;
2672 bits64 aSig, bSig, zSig;
2673 int16 expDiff;
2674
2675 aSig = extractFloat64Frac( a );
2676 aExp = extractFloat64Exp( a );
2677 bSig = extractFloat64Frac( b );
2678 bExp = extractFloat64Exp( b );
2679 expDiff = aExp - bExp;
2680 aSig <<= 9;
2681 bSig <<= 9;
2682 if ( 0 < expDiff ) {
2683 if ( aExp == 0x7FF ) {
2684 if ( aSig ) return propagateFloat64NaN( a, b );
2685 return a;
2686 }
2687 if ( bExp == 0 ) {
2688 --expDiff;
2689 }
2690 else {
2691 bSig |= LIT64( 0x2000000000000000 );
2692 }
2693 shift64RightJamming( bSig, expDiff, &bSig );
2694 zExp = aExp;
2695 }
2696 else if ( expDiff < 0 ) {
2697 if ( bExp == 0x7FF ) {
2698 if ( bSig ) return propagateFloat64NaN( a, b );
2699 return packFloat64( zSign, 0x7FF, 0 );
2700 }
2701 if ( aExp == 0 ) {
2702 ++expDiff;
2703 }
2704 else {
2705 aSig |= LIT64( 0x2000000000000000 );
2706 }
2707 shift64RightJamming( aSig, - expDiff, &aSig );
2708 zExp = bExp;
2709 }
2710 else {
2711 if ( aExp == 0x7FF ) {
2712 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2713 return a;
2714 }
2715 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2716 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2717 zExp = aExp;
2718 goto roundAndPack;
2719 }
2720 aSig |= LIT64( 0x2000000000000000 );
2721 zSig = ( aSig + bSig )<<1;
2722 --zExp;
2723 if ( (sbits64) zSig < 0 ) {
2724 zSig = aSig + bSig;
2725 ++zExp;
2726 }
2727 roundAndPack:
2728 return roundAndPackFloat64( zSign, zExp, zSig );
2729
2730 }
2731
2732 /*
2733 -------------------------------------------------------------------------------
2734 Returns the result of subtracting the absolute values of the double-
2735 precision floating-point values `a' and `b'. If `zSign' is 1, the
2736 difference is negated before being returned. `zSign' is ignored if the
2737 result is a NaN. The subtraction is performed according to the IEC/IEEE
2738 Standard for Binary Floating-Point Arithmetic.
2739 -------------------------------------------------------------------------------
2740 */
subFloat64Sigs(float64 a,float64 b,flag zSign)2741 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2742 {
2743 int16 aExp, bExp, zExp;
2744 bits64 aSig, bSig, zSig;
2745 int16 expDiff;
2746
2747 aSig = extractFloat64Frac( a );
2748 aExp = extractFloat64Exp( a );
2749 bSig = extractFloat64Frac( b );
2750 bExp = extractFloat64Exp( b );
2751 expDiff = aExp - bExp;
2752 aSig <<= 10;
2753 bSig <<= 10;
2754 if ( 0 < expDiff ) goto aExpBigger;
2755 if ( expDiff < 0 ) goto bExpBigger;
2756 if ( aExp == 0x7FF ) {
2757 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2758 float_raise( float_flag_invalid );
2759 return float64_default_nan;
2760 }
2761 if ( aExp == 0 ) {
2762 aExp = 1;
2763 bExp = 1;
2764 }
2765 if ( bSig < aSig ) goto aBigger;
2766 if ( aSig < bSig ) goto bBigger;
2767 return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
2768 bExpBigger:
2769 if ( bExp == 0x7FF ) {
2770 if ( bSig ) return propagateFloat64NaN( a, b );
2771 return packFloat64( zSign ^ 1, 0x7FF, 0 );
2772 }
2773 if ( aExp == 0 ) {
2774 ++expDiff;
2775 }
2776 else {
2777 aSig |= LIT64( 0x4000000000000000 );
2778 }
2779 shift64RightJamming( aSig, - expDiff, &aSig );
2780 bSig |= LIT64( 0x4000000000000000 );
2781 bBigger:
2782 zSig = bSig - aSig;
2783 zExp = bExp;
2784 zSign ^= 1;
2785 goto normalizeRoundAndPack;
2786 aExpBigger:
2787 if ( aExp == 0x7FF ) {
2788 if ( aSig ) return propagateFloat64NaN( a, b );
2789 return a;
2790 }
2791 if ( bExp == 0 ) {
2792 --expDiff;
2793 }
2794 else {
2795 bSig |= LIT64( 0x4000000000000000 );
2796 }
2797 shift64RightJamming( bSig, expDiff, &bSig );
2798 aSig |= LIT64( 0x4000000000000000 );
2799 aBigger:
2800 zSig = aSig - bSig;
2801 zExp = aExp;
2802 normalizeRoundAndPack:
2803 --zExp;
2804 return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2805
2806 }
2807
2808 /*
2809 -------------------------------------------------------------------------------
2810 Returns the result of adding the double-precision floating-point values `a'
2811 and `b'. The operation is performed according to the IEC/IEEE Standard for
2812 Binary Floating-Point Arithmetic.
2813 -------------------------------------------------------------------------------
2814 */
float64_add(float64 a,float64 b)2815 float64 float64_add( float64 a, float64 b )
2816 {
2817 flag aSign, bSign;
2818
2819 aSign = extractFloat64Sign( a );
2820 bSign = extractFloat64Sign( b );
2821 if ( aSign == bSign ) {
2822 return addFloat64Sigs( a, b, aSign );
2823 }
2824 else {
2825 return subFloat64Sigs( a, b, aSign );
2826 }
2827
2828 }
2829
2830 /*
2831 -------------------------------------------------------------------------------
2832 Returns the result of subtracting the double-precision floating-point values
2833 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2834 for Binary Floating-Point Arithmetic.
2835 -------------------------------------------------------------------------------
2836 */
float64_sub(float64 a,float64 b)2837 float64 float64_sub( float64 a, float64 b )
2838 {
2839 flag aSign, bSign;
2840
2841 aSign = extractFloat64Sign( a );
2842 bSign = extractFloat64Sign( b );
2843 if ( aSign == bSign ) {
2844 return subFloat64Sigs( a, b, aSign );
2845 }
2846 else {
2847 return addFloat64Sigs( a, b, aSign );
2848 }
2849
2850 }
2851
2852 /*
2853 -------------------------------------------------------------------------------
2854 Returns the result of multiplying the double-precision floating-point values
2855 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2856 for Binary Floating-Point Arithmetic.
2857 -------------------------------------------------------------------------------
2858 */
float64_mul(float64 a,float64 b)2859 float64 float64_mul( float64 a, float64 b )
2860 {
2861 flag aSign, bSign, zSign;
2862 int16 aExp, bExp, zExp;
2863 bits64 aSig, bSig, zSig0, zSig1;
2864
2865 aSig = extractFloat64Frac( a );
2866 aExp = extractFloat64Exp( a );
2867 aSign = extractFloat64Sign( a );
2868 bSig = extractFloat64Frac( b );
2869 bExp = extractFloat64Exp( b );
2870 bSign = extractFloat64Sign( b );
2871 zSign = aSign ^ bSign;
2872 if ( aExp == 0x7FF ) {
2873 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2874 return propagateFloat64NaN( a, b );
2875 }
2876 if ( ( bExp | bSig ) == 0 ) {
2877 float_raise( float_flag_invalid );
2878 return float64_default_nan;
2879 }
2880 return packFloat64( zSign, 0x7FF, 0 );
2881 }
2882 if ( bExp == 0x7FF ) {
2883 if ( bSig ) return propagateFloat64NaN( a, b );
2884 if ( ( aExp | aSig ) == 0 ) {
2885 float_raise( float_flag_invalid );
2886 return float64_default_nan;
2887 }
2888 return packFloat64( zSign, 0x7FF, 0 );
2889 }
2890 if ( aExp == 0 ) {
2891 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2892 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2893 }
2894 if ( bExp == 0 ) {
2895 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2896 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2897 }
2898 zExp = aExp + bExp - 0x3FF;
2899 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2900 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2901 mul64To128( aSig, bSig, &zSig0, &zSig1 );
2902 zSig0 |= ( zSig1 != 0 );
2903 if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2904 zSig0 <<= 1;
2905 --zExp;
2906 }
2907 return roundAndPackFloat64( zSign, zExp, zSig0 );
2908
2909 }
2910
2911 /*
2912 -------------------------------------------------------------------------------
2913 Returns the result of dividing the double-precision floating-point value `a'
2914 by the corresponding value `b'. The operation is performed according to
2915 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2916 -------------------------------------------------------------------------------
2917 */
float64_div(float64 a,float64 b)2918 float64 float64_div( float64 a, float64 b )
2919 {
2920 flag aSign, bSign, zSign;
2921 int16 aExp, bExp, zExp;
2922 bits64 aSig, bSig, zSig;
2923 bits64 rem0, rem1;
2924 bits64 term0, term1;
2925
2926 aSig = extractFloat64Frac( a );
2927 aExp = extractFloat64Exp( a );
2928 aSign = extractFloat64Sign( a );
2929 bSig = extractFloat64Frac( b );
2930 bExp = extractFloat64Exp( b );
2931 bSign = extractFloat64Sign( b );
2932 zSign = aSign ^ bSign;
2933 if ( aExp == 0x7FF ) {
2934 if ( aSig ) return propagateFloat64NaN( a, b );
2935 if ( bExp == 0x7FF ) {
2936 if ( bSig ) return propagateFloat64NaN( a, b );
2937 float_raise( float_flag_invalid );
2938 return float64_default_nan;
2939 }
2940 return packFloat64( zSign, 0x7FF, 0 );
2941 }
2942 if ( bExp == 0x7FF ) {
2943 if ( bSig ) return propagateFloat64NaN( a, b );
2944 return packFloat64( zSign, 0, 0 );
2945 }
2946 if ( bExp == 0 ) {
2947 if ( bSig == 0 ) {
2948 if ( ( aExp | aSig ) == 0 ) {
2949 float_raise( float_flag_invalid );
2950 return float64_default_nan;
2951 }
2952 float_raise( float_flag_divbyzero );
2953 return packFloat64( zSign, 0x7FF, 0 );
2954 }
2955 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2956 }
2957 if ( aExp == 0 ) {
2958 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2959 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2960 }
2961 zExp = aExp - bExp + 0x3FD;
2962 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2963 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2964 if ( bSig <= ( aSig + aSig ) ) {
2965 aSig >>= 1;
2966 ++zExp;
2967 }
2968 zSig = estimateDiv128To64( aSig, 0, bSig );
2969 if ( ( zSig & 0x1FF ) <= 2 ) {
2970 mul64To128( bSig, zSig, &term0, &term1 );
2971 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2972 while ( (sbits64) rem0 < 0 ) {
2973 --zSig;
2974 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2975 }
2976 zSig |= ( rem1 != 0 );
2977 }
2978 return roundAndPackFloat64( zSign, zExp, zSig );
2979
2980 }
2981
2982 #ifndef SOFTFLOAT_FOR_GCC
2983 /*
2984 -------------------------------------------------------------------------------
2985 Returns the remainder of the double-precision floating-point value `a'
2986 with respect to the corresponding value `b'. The operation is performed
2987 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2988 -------------------------------------------------------------------------------
2989 */
float64_rem(float64 a,float64 b)2990 float64 float64_rem( float64 a, float64 b )
2991 {
2992 flag aSign, bSign, zSign;
2993 int16 aExp, bExp, expDiff;
2994 bits64 aSig, bSig;
2995 bits64 q, alternateASig;
2996 sbits64 sigMean;
2997
2998 aSig = extractFloat64Frac( a );
2999 aExp = extractFloat64Exp( a );
3000 aSign = extractFloat64Sign( a );
3001 bSig = extractFloat64Frac( b );
3002 bExp = extractFloat64Exp( b );
3003 bSign = extractFloat64Sign( b );
3004 if ( aExp == 0x7FF ) {
3005 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3006 return propagateFloat64NaN( a, b );
3007 }
3008 float_raise( float_flag_invalid );
3009 return float64_default_nan;
3010 }
3011 if ( bExp == 0x7FF ) {
3012 if ( bSig ) return propagateFloat64NaN( a, b );
3013 return a;
3014 }
3015 if ( bExp == 0 ) {
3016 if ( bSig == 0 ) {
3017 float_raise( float_flag_invalid );
3018 return float64_default_nan;
3019 }
3020 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3021 }
3022 if ( aExp == 0 ) {
3023 if ( aSig == 0 ) return a;
3024 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3025 }
3026 expDiff = aExp - bExp;
3027 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3028 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3029 if ( expDiff < 0 ) {
3030 if ( expDiff < -1 ) return a;
3031 aSig >>= 1;
3032 }
3033 q = ( bSig <= aSig );
3034 if ( q ) aSig -= bSig;
3035 expDiff -= 64;
3036 while ( 0 < expDiff ) {
3037 q = estimateDiv128To64( aSig, 0, bSig );
3038 q = ( 2 < q ) ? q - 2 : 0;
3039 aSig = - ( ( bSig>>2 ) * q );
3040 expDiff -= 62;
3041 }
3042 expDiff += 64;
3043 if ( 0 < expDiff ) {
3044 q = estimateDiv128To64( aSig, 0, bSig );
3045 q = ( 2 < q ) ? q - 2 : 0;
3046 q >>= 64 - expDiff;
3047 bSig >>= 2;
3048 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3049 }
3050 else {
3051 aSig >>= 2;
3052 bSig >>= 2;
3053 }
3054 do {
3055 alternateASig = aSig;
3056 ++q;
3057 aSig -= bSig;
3058 } while ( 0 <= (sbits64) aSig );
3059 sigMean = aSig + alternateASig;
3060 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3061 aSig = alternateASig;
3062 }
3063 zSign = ( (sbits64) aSig < 0 );
3064 if ( zSign ) aSig = - aSig;
3065 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
3066
3067 }
3068
3069 /*
3070 -------------------------------------------------------------------------------
3071 Returns the square root of the double-precision floating-point value `a'.
3072 The operation is performed according to the IEC/IEEE Standard for Binary
3073 Floating-Point Arithmetic.
3074 -------------------------------------------------------------------------------
3075 */
float64_sqrt(float64 a)3076 float64 float64_sqrt( float64 a )
3077 {
3078 flag aSign;
3079 int16 aExp, zExp;
3080 bits64 aSig, zSig, doubleZSig;
3081 bits64 rem0, rem1, term0, term1;
3082
3083 aSig = extractFloat64Frac( a );
3084 aExp = extractFloat64Exp( a );
3085 aSign = extractFloat64Sign( a );
3086 if ( aExp == 0x7FF ) {
3087 if ( aSig ) return propagateFloat64NaN( a, a );
3088 if ( ! aSign ) return a;
3089 float_raise( float_flag_invalid );
3090 return float64_default_nan;
3091 }
3092 if ( aSign ) {
3093 if ( ( aExp | aSig ) == 0 ) return a;
3094 float_raise( float_flag_invalid );
3095 return float64_default_nan;
3096 }
3097 if ( aExp == 0 ) {
3098 if ( aSig == 0 ) return 0;
3099 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3100 }
3101 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3102 aSig |= LIT64( 0x0010000000000000 );
3103 zSig = estimateSqrt32( aExp, aSig>>21 );
3104 aSig <<= 9 - ( aExp & 1 );
3105 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3106 if ( ( zSig & 0x1FF ) <= 5 ) {
3107 doubleZSig = zSig<<1;
3108 mul64To128( zSig, zSig, &term0, &term1 );
3109 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3110 while ( (sbits64) rem0 < 0 ) {
3111 --zSig;
3112 doubleZSig -= 2;
3113 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3114 }
3115 zSig |= ( ( rem0 | rem1 ) != 0 );
3116 }
3117 return roundAndPackFloat64( 0, zExp, zSig );
3118
3119 }
3120 #endif
3121
3122 /*
3123 -------------------------------------------------------------------------------
3124 Returns 1 if the double-precision floating-point value `a' is equal to the
3125 corresponding value `b', and 0 otherwise. The comparison is performed
3126 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3127 -------------------------------------------------------------------------------
3128 */
float64_eq(float64 a,float64 b)3129 flag float64_eq( float64 a, float64 b )
3130 {
3131
3132 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3133 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3134 ) {
3135 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3136 float_raise( float_flag_invalid );
3137 }
3138 return 0;
3139 }
3140 return ( a == b ) ||
3141 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
3142
3143 }
3144
3145 /*
3146 -------------------------------------------------------------------------------
3147 Returns 1 if the double-precision floating-point value `a' is less than or
3148 equal to the corresponding value `b', and 0 otherwise. The comparison is
3149 performed according to the IEC/IEEE Standard for Binary Floating-Point
3150 Arithmetic.
3151 -------------------------------------------------------------------------------
3152 */
float64_le(float64 a,float64 b)3153 flag float64_le( float64 a, float64 b )
3154 {
3155 flag aSign, bSign;
3156
3157 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3158 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3159 ) {
3160 float_raise( float_flag_invalid );
3161 return 0;
3162 }
3163 aSign = extractFloat64Sign( a );
3164 bSign = extractFloat64Sign( b );
3165 if ( aSign != bSign )
3166 return aSign ||
3167 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
3168 0 );
3169 return ( a == b ) ||
3170 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3171
3172 }
3173
3174 /*
3175 -------------------------------------------------------------------------------
3176 Returns 1 if the double-precision floating-point value `a' is less than
3177 the corresponding value `b', and 0 otherwise. The comparison is performed
3178 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3179 -------------------------------------------------------------------------------
3180 */
float64_lt(float64 a,float64 b)3181 flag float64_lt( float64 a, float64 b )
3182 {
3183 flag aSign, bSign;
3184
3185 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3186 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3187 ) {
3188 float_raise( float_flag_invalid );
3189 return 0;
3190 }
3191 aSign = extractFloat64Sign( a );
3192 bSign = extractFloat64Sign( b );
3193 if ( aSign != bSign )
3194 return aSign &&
3195 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
3196 0 );
3197 return ( a != b ) &&
3198 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3199
3200 }
3201
3202 #ifndef SOFTFLOAT_FOR_GCC
3203 /*
3204 -------------------------------------------------------------------------------
3205 Returns 1 if the double-precision floating-point value `a' is equal to the
3206 corresponding value `b', and 0 otherwise. The invalid exception is raised
3207 if either operand is a NaN. Otherwise, the comparison is performed
3208 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3209 -------------------------------------------------------------------------------
3210 */
float64_eq_signaling(float64 a,float64 b)3211 flag float64_eq_signaling( float64 a, float64 b )
3212 {
3213
3214 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3215 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3216 ) {
3217 float_raise( float_flag_invalid );
3218 return 0;
3219 }
3220 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3221
3222 }
3223
3224 /*
3225 -------------------------------------------------------------------------------
3226 Returns 1 if the double-precision floating-point value `a' is less than or
3227 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3228 cause an exception. Otherwise, the comparison is performed according to the
3229 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3230 -------------------------------------------------------------------------------
3231 */
float64_le_quiet(float64 a,float64 b)3232 flag float64_le_quiet( float64 a, float64 b )
3233 {
3234 flag aSign, bSign;
3235
3236 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3237 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3238 ) {
3239 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3240 float_raise( float_flag_invalid );
3241 }
3242 return 0;
3243 }
3244 aSign = extractFloat64Sign( a );
3245 bSign = extractFloat64Sign( b );
3246 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3247 return ( a == b ) || ( aSign ^ ( a < b ) );
3248
3249 }
3250
3251 /*
3252 -------------------------------------------------------------------------------
3253 Returns 1 if the double-precision floating-point value `a' is less than
3254 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3255 exception. Otherwise, the comparison is performed according to the IEC/IEEE
3256 Standard for Binary Floating-Point Arithmetic.
3257 -------------------------------------------------------------------------------
3258 */
float64_lt_quiet(float64 a,float64 b)3259 flag float64_lt_quiet( float64 a, float64 b )
3260 {
3261 flag aSign, bSign;
3262
3263 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3264 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3265 ) {
3266 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3267 float_raise( float_flag_invalid );
3268 }
3269 return 0;
3270 }
3271 aSign = extractFloat64Sign( a );
3272 bSign = extractFloat64Sign( b );
3273 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3274 return ( a != b ) && ( aSign ^ ( a < b ) );
3275
3276 }
3277 #endif
3278
3279 #ifdef FLOATX80
3280
3281 /*
3282 -------------------------------------------------------------------------------
3283 Returns the result of converting the extended double-precision floating-
3284 point value `a' to the 32-bit two's complement integer format. The
3285 conversion is performed according to the IEC/IEEE Standard for Binary
3286 Floating-Point Arithmetic---which means in particular that the conversion
3287 is rounded according to the current rounding mode. If `a' is a NaN, the
3288 largest positive integer is returned. Otherwise, if the conversion
3289 overflows, the largest integer with the same sign as `a' is returned.
3290 -------------------------------------------------------------------------------
3291 */
floatx80_to_int32(floatx80 a)3292 int32 floatx80_to_int32( floatx80 a )
3293 {
3294 flag aSign;
3295 int32 aExp, shiftCount;
3296 bits64 aSig;
3297
3298 aSig = extractFloatx80Frac( a );
3299 aExp = extractFloatx80Exp( a );
3300 aSign = extractFloatx80Sign( a );
3301 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3302 shiftCount = 0x4037 - aExp;
3303 if ( shiftCount <= 0 ) shiftCount = 1;
3304 shift64RightJamming( aSig, shiftCount, &aSig );
3305 return roundAndPackInt32( aSign, aSig );
3306
3307 }
3308
3309 /*
3310 -------------------------------------------------------------------------------
3311 Returns the result of converting the extended double-precision floating-
3312 point value `a' to the 32-bit two's complement integer format. The
3313 conversion is performed according to the IEC/IEEE Standard for Binary
3314 Floating-Point Arithmetic, except that the conversion is always rounded
3315 toward zero. If `a' is a NaN, the largest positive integer is returned.
3316 Otherwise, if the conversion overflows, the largest integer with the same
3317 sign as `a' is returned.
3318 -------------------------------------------------------------------------------
3319 */
floatx80_to_int32_round_to_zero(floatx80 a)3320 int32 floatx80_to_int32_round_to_zero( floatx80 a )
3321 {
3322 flag aSign;
3323 int32 aExp, shiftCount;
3324 bits64 aSig, savedASig;
3325 int32 z;
3326
3327 aSig = extractFloatx80Frac( a );
3328 aExp = extractFloatx80Exp( a );
3329 aSign = extractFloatx80Sign( a );
3330 if ( 0x401E < aExp ) {
3331 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3332 goto invalid;
3333 }
3334 else if ( aExp < 0x3FFF ) {
3335 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
3336 return 0;
3337 }
3338 shiftCount = 0x403E - aExp;
3339 savedASig = aSig;
3340 aSig >>= shiftCount;
3341 z = aSig;
3342 if ( aSign ) z = - z;
3343 if ( ( z < 0 ) ^ aSign ) {
3344 invalid:
3345 float_raise( float_flag_invalid );
3346 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3347 }
3348 if ( ( aSig<<shiftCount ) != savedASig ) {
3349 float_exception_flags |= float_flag_inexact;
3350 }
3351 return z;
3352
3353 }
3354
3355 /*
3356 -------------------------------------------------------------------------------
3357 Returns the result of converting the extended double-precision floating-
3358 point value `a' to the 64-bit two's complement integer format. The
3359 conversion is performed according to the IEC/IEEE Standard for Binary
3360 Floating-Point Arithmetic---which means in particular that the conversion
3361 is rounded according to the current rounding mode. If `a' is a NaN,
3362 the largest positive integer is returned. Otherwise, if the conversion
3363 overflows, the largest integer with the same sign as `a' is returned.
3364 -------------------------------------------------------------------------------
3365 */
floatx80_to_int64(floatx80 a)3366 int64 floatx80_to_int64( floatx80 a )
3367 {
3368 flag aSign;
3369 int32 aExp, shiftCount;
3370 bits64 aSig, aSigExtra;
3371
3372 aSig = extractFloatx80Frac( a );
3373 aExp = extractFloatx80Exp( a );
3374 aSign = extractFloatx80Sign( a );
3375 shiftCount = 0x403E - aExp;
3376 if ( shiftCount <= 0 ) {
3377 if ( shiftCount ) {
3378 float_raise( float_flag_invalid );
3379 if ( ! aSign
3380 || ( ( aExp == 0x7FFF )
3381 && ( aSig != LIT64( 0x8000000000000000 ) ) )
3382 ) {
3383 return LIT64( 0x7FFFFFFFFFFFFFFF );
3384 }
3385 return (sbits64) LIT64( 0x8000000000000000 );
3386 }
3387 aSigExtra = 0;
3388 }
3389 else {
3390 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3391 }
3392 return roundAndPackInt64( aSign, aSig, aSigExtra );
3393
3394 }
3395
3396 /*
3397 -------------------------------------------------------------------------------
3398 Returns the result of converting the extended double-precision floating-
3399 point value `a' to the 64-bit two's complement integer format. The
3400 conversion is performed according to the IEC/IEEE Standard for Binary
3401 Floating-Point Arithmetic, except that the conversion is always rounded
3402 toward zero. If `a' is a NaN, the largest positive integer is returned.
3403 Otherwise, if the conversion overflows, the largest integer with the same
3404 sign as `a' is returned.
3405 -------------------------------------------------------------------------------
3406 */
floatx80_to_int64_round_to_zero(floatx80 a)3407 int64 floatx80_to_int64_round_to_zero( floatx80 a )
3408 {
3409 flag aSign;
3410 int32 aExp, shiftCount;
3411 bits64 aSig;
3412 int64 z;
3413
3414 aSig = extractFloatx80Frac( a );
3415 aExp = extractFloatx80Exp( a );
3416 aSign = extractFloatx80Sign( a );
3417 shiftCount = aExp - 0x403E;
3418 if ( 0 <= shiftCount ) {
3419 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3420 if ( ( a.high != 0xC03E ) || aSig ) {
3421 float_raise( float_flag_invalid );
3422 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3423 return LIT64( 0x7FFFFFFFFFFFFFFF );
3424 }
3425 }
3426 return (sbits64) LIT64( 0x8000000000000000 );
3427 }
3428 else if ( aExp < 0x3FFF ) {
3429 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
3430 return 0;
3431 }
3432 z = aSig>>( - shiftCount );
3433 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3434 float_exception_flags |= float_flag_inexact;
3435 }
3436 if ( aSign ) z = - z;
3437 return z;
3438
3439 }
3440
3441 /*
3442 -------------------------------------------------------------------------------
3443 Returns the result of converting the extended double-precision floating-
3444 point value `a' to the single-precision floating-point format. The
3445 conversion is performed according to the IEC/IEEE Standard for Binary
3446 Floating-Point Arithmetic.
3447 -------------------------------------------------------------------------------
3448 */
floatx80_to_float32(floatx80 a)3449 float32 floatx80_to_float32( floatx80 a )
3450 {
3451 flag aSign;
3452 int32 aExp;
3453 bits64 aSig;
3454
3455 aSig = extractFloatx80Frac( a );
3456 aExp = extractFloatx80Exp( a );
3457 aSign = extractFloatx80Sign( a );
3458 if ( aExp == 0x7FFF ) {
3459 if ( (bits64) ( aSig<<1 ) ) {
3460 return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3461 }
3462 return packFloat32( aSign, 0xFF, 0 );
3463 }
3464 shift64RightJamming( aSig, 33, &aSig );
3465 if ( aExp || aSig ) aExp -= 0x3F81;
3466 return roundAndPackFloat32( aSign, aExp, aSig );
3467
3468 }
3469
3470 /*
3471 -------------------------------------------------------------------------------
3472 Returns the result of converting the extended double-precision floating-
3473 point value `a' to the double-precision floating-point format. The
3474 conversion is performed according to the IEC/IEEE Standard for Binary
3475 Floating-Point Arithmetic.
3476 -------------------------------------------------------------------------------
3477 */
floatx80_to_float64(floatx80 a)3478 float64 floatx80_to_float64( floatx80 a )
3479 {
3480 flag aSign;
3481 int32 aExp;
3482 bits64 aSig, zSig;
3483
3484 aSig = extractFloatx80Frac( a );
3485 aExp = extractFloatx80Exp( a );
3486 aSign = extractFloatx80Sign( a );
3487 if ( aExp == 0x7FFF ) {
3488 if ( (bits64) ( aSig<<1 ) ) {
3489 return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3490 }
3491 return packFloat64( aSign, 0x7FF, 0 );
3492 }
3493 shift64RightJamming( aSig, 1, &zSig );
3494 if ( aExp || aSig ) aExp -= 0x3C01;
3495 return roundAndPackFloat64( aSign, aExp, zSig );
3496
3497 }
3498
3499 #ifdef FLOAT128
3500
3501 /*
3502 -------------------------------------------------------------------------------
3503 Returns the result of converting the extended double-precision floating-
3504 point value `a' to the quadruple-precision floating-point format. The
3505 conversion is performed according to the IEC/IEEE Standard for Binary
3506 Floating-Point Arithmetic.
3507 -------------------------------------------------------------------------------
3508 */
floatx80_to_float128(floatx80 a)3509 float128 floatx80_to_float128( floatx80 a )
3510 {
3511 flag aSign;
3512 int16 aExp;
3513 bits64 aSig, zSig0, zSig1;
3514
3515 aSig = extractFloatx80Frac( a );
3516 aExp = extractFloatx80Exp( a );
3517 aSign = extractFloatx80Sign( a );
3518 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3519 return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3520 }
3521 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3522 return packFloat128( aSign, aExp, zSig0, zSig1 );
3523
3524 }
3525
3526 #endif
3527
3528 /*
3529 -------------------------------------------------------------------------------
3530 Rounds the extended double-precision floating-point value `a' to an integer,
3531 and returns the result as an extended quadruple-precision floating-point
3532 value. The operation is performed according to the IEC/IEEE Standard for
3533 Binary Floating-Point Arithmetic.
3534 -------------------------------------------------------------------------------
3535 */
floatx80_round_to_int(floatx80 a)3536 floatx80 floatx80_round_to_int( floatx80 a )
3537 {
3538 flag aSign;
3539 int32 aExp;
3540 bits64 lastBitMask, roundBitsMask;
3541 int8 roundingMode;
3542 floatx80 z;
3543
3544 aExp = extractFloatx80Exp( a );
3545 if ( 0x403E <= aExp ) {
3546 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3547 return propagateFloatx80NaN( a, a );
3548 }
3549 return a;
3550 }
3551 if ( aExp < 0x3FFF ) {
3552 if ( ( aExp == 0 )
3553 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3554 return a;
3555 }
3556 float_exception_flags |= float_flag_inexact;
3557 aSign = extractFloatx80Sign( a );
3558 switch ( float_rounding_mode ) {
3559 case float_round_nearest_even:
3560 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3561 ) {
3562 return
3563 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3564 }
3565 break;
3566 case float_round_to_zero:
3567 break;
3568 case float_round_down:
3569 return
3570 aSign ?
3571 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3572 : packFloatx80( 0, 0, 0 );
3573 case float_round_up:
3574 return
3575 aSign ? packFloatx80( 1, 0, 0 )
3576 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3577 }
3578 return packFloatx80( aSign, 0, 0 );
3579 }
3580 lastBitMask = 1;
3581 lastBitMask <<= 0x403E - aExp;
3582 roundBitsMask = lastBitMask - 1;
3583 z = a;
3584 roundingMode = float_rounding_mode;
3585 if ( roundingMode == float_round_nearest_even ) {
3586 z.low += lastBitMask>>1;
3587 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3588 }
3589 else if ( roundingMode != float_round_to_zero ) {
3590 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3591 z.low += roundBitsMask;
3592 }
3593 }
3594 z.low &= ~ roundBitsMask;
3595 if ( z.low == 0 ) {
3596 ++z.high;
3597 z.low = LIT64( 0x8000000000000000 );
3598 }
3599 if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
3600 return z;
3601
3602 }
3603
3604 /*
3605 -------------------------------------------------------------------------------
3606 Returns the result of adding the absolute values of the extended double-
3607 precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3608 negated before being returned. `zSign' is ignored if the result is a NaN.
3609 The addition is performed according to the IEC/IEEE Standard for Binary
3610 Floating-Point Arithmetic.
3611 -------------------------------------------------------------------------------
3612 */
addFloatx80Sigs(floatx80 a,floatx80 b,flag zSign)3613 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3614 {
3615 int32 aExp, bExp, zExp;
3616 bits64 aSig, bSig, zSig0, zSig1;
3617 int32 expDiff;
3618
3619 aSig = extractFloatx80Frac( a );
3620 aExp = extractFloatx80Exp( a );
3621 bSig = extractFloatx80Frac( b );
3622 bExp = extractFloatx80Exp( b );
3623 expDiff = aExp - bExp;
3624 if ( 0 < expDiff ) {
3625 if ( aExp == 0x7FFF ) {
3626 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3627 return a;
3628 }
3629 if ( bExp == 0 ) --expDiff;
3630 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3631 zExp = aExp;
3632 }
3633 else if ( expDiff < 0 ) {
3634 if ( bExp == 0x7FFF ) {
3635 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3636 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3637 }
3638 if ( aExp == 0 ) ++expDiff;
3639 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3640 zExp = bExp;
3641 }
3642 else {
3643 if ( aExp == 0x7FFF ) {
3644 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3645 return propagateFloatx80NaN( a, b );
3646 }
3647 return a;
3648 }
3649 zSig1 = 0;
3650 zSig0 = aSig + bSig;
3651 if ( aExp == 0 ) {
3652 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3653 goto roundAndPack;
3654 }
3655 zExp = aExp;
3656 goto shiftRight1;
3657 }
3658 zSig0 = aSig + bSig;
3659 if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3660 shiftRight1:
3661 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3662 zSig0 |= LIT64( 0x8000000000000000 );
3663 ++zExp;
3664 roundAndPack:
3665 return
3666 roundAndPackFloatx80(
3667 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3668
3669 }
3670
3671 /*
3672 -------------------------------------------------------------------------------
3673 Returns the result of subtracting the absolute values of the extended
3674 double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3675 difference is negated before being returned. `zSign' is ignored if the
3676 result is a NaN. The subtraction is performed according to the IEC/IEEE
3677 Standard for Binary Floating-Point Arithmetic.
3678 -------------------------------------------------------------------------------
3679 */
subFloatx80Sigs(floatx80 a,floatx80 b,flag zSign)3680 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3681 {
3682 int32 aExp, bExp, zExp;
3683 bits64 aSig, bSig, zSig0, zSig1;
3684 int32 expDiff;
3685 floatx80 z;
3686
3687 aSig = extractFloatx80Frac( a );
3688 aExp = extractFloatx80Exp( a );
3689 bSig = extractFloatx80Frac( b );
3690 bExp = extractFloatx80Exp( b );
3691 expDiff = aExp - bExp;
3692 if ( 0 < expDiff ) goto aExpBigger;
3693 if ( expDiff < 0 ) goto bExpBigger;
3694 if ( aExp == 0x7FFF ) {
3695 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3696 return propagateFloatx80NaN( a, b );
3697 }
3698 float_raise( float_flag_invalid );
3699 z.low = floatx80_default_nan_low;
3700 z.high = floatx80_default_nan_high;
3701 return z;
3702 }
3703 if ( aExp == 0 ) {
3704 aExp = 1;
3705 bExp = 1;
3706 }
3707 zSig1 = 0;
3708 if ( bSig < aSig ) goto aBigger;
3709 if ( aSig < bSig ) goto bBigger;
3710 return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
3711 bExpBigger:
3712 if ( bExp == 0x7FFF ) {
3713 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3714 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3715 }
3716 if ( aExp == 0 ) ++expDiff;
3717 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3718 bBigger:
3719 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3720 zExp = bExp;
3721 zSign ^= 1;
3722 goto normalizeRoundAndPack;
3723 aExpBigger:
3724 if ( aExp == 0x7FFF ) {
3725 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3726 return a;
3727 }
3728 if ( bExp == 0 ) --expDiff;
3729 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3730 aBigger:
3731 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3732 zExp = aExp;
3733 normalizeRoundAndPack:
3734 return
3735 normalizeRoundAndPackFloatx80(
3736 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3737
3738 }
3739
3740 /*
3741 -------------------------------------------------------------------------------
3742 Returns the result of adding the extended double-precision floating-point
3743 values `a' and `b'. The operation is performed according to the IEC/IEEE
3744 Standard for Binary Floating-Point Arithmetic.
3745 -------------------------------------------------------------------------------
3746 */
floatx80_add(floatx80 a,floatx80 b)3747 floatx80 floatx80_add( floatx80 a, floatx80 b )
3748 {
3749 flag aSign, bSign;
3750
3751 aSign = extractFloatx80Sign( a );
3752 bSign = extractFloatx80Sign( b );
3753 if ( aSign == bSign ) {
3754 return addFloatx80Sigs( a, b, aSign );
3755 }
3756 else {
3757 return subFloatx80Sigs( a, b, aSign );
3758 }
3759
3760 }
3761
3762 /*
3763 -------------------------------------------------------------------------------
3764 Returns the result of subtracting the extended double-precision floating-
3765 point values `a' and `b'. The operation is performed according to the
3766 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3767 -------------------------------------------------------------------------------
3768 */
floatx80_sub(floatx80 a,floatx80 b)3769 floatx80 floatx80_sub( floatx80 a, floatx80 b )
3770 {
3771 flag aSign, bSign;
3772
3773 aSign = extractFloatx80Sign( a );
3774 bSign = extractFloatx80Sign( b );
3775 if ( aSign == bSign ) {
3776 return subFloatx80Sigs( a, b, aSign );
3777 }
3778 else {
3779 return addFloatx80Sigs( a, b, aSign );
3780 }
3781
3782 }
3783
3784 /*
3785 -------------------------------------------------------------------------------
3786 Returns the result of multiplying the extended double-precision floating-
3787 point values `a' and `b'. The operation is performed according to the
3788 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3789 -------------------------------------------------------------------------------
3790 */
floatx80_mul(floatx80 a,floatx80 b)3791 floatx80 floatx80_mul( floatx80 a, floatx80 b )
3792 {
3793 flag aSign, bSign, zSign;
3794 int32 aExp, bExp, zExp;
3795 bits64 aSig, bSig, zSig0, zSig1;
3796 floatx80 z;
3797
3798 aSig = extractFloatx80Frac( a );
3799 aExp = extractFloatx80Exp( a );
3800 aSign = extractFloatx80Sign( a );
3801 bSig = extractFloatx80Frac( b );
3802 bExp = extractFloatx80Exp( b );
3803 bSign = extractFloatx80Sign( b );
3804 zSign = aSign ^ bSign;
3805 if ( aExp == 0x7FFF ) {
3806 if ( (bits64) ( aSig<<1 )
3807 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3808 return propagateFloatx80NaN( a, b );
3809 }
3810 if ( ( bExp | bSig ) == 0 ) goto invalid;
3811 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3812 }
3813 if ( bExp == 0x7FFF ) {
3814 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3815 if ( ( aExp | aSig ) == 0 ) {
3816 invalid:
3817 float_raise( float_flag_invalid );
3818 z.low = floatx80_default_nan_low;
3819 z.high = floatx80_default_nan_high;
3820 return z;
3821 }
3822 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3823 }
3824 if ( aExp == 0 ) {
3825 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3826 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3827 }
3828 if ( bExp == 0 ) {
3829 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3830 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3831 }
3832 zExp = aExp + bExp - 0x3FFE;
3833 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3834 if ( 0 < (sbits64) zSig0 ) {
3835 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3836 --zExp;
3837 }
3838 return
3839 roundAndPackFloatx80(
3840 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3841
3842 }
3843
3844 /*
3845 -------------------------------------------------------------------------------
3846 Returns the result of dividing the extended double-precision floating-point
3847 value `a' by the corresponding value `b'. The operation is performed
3848 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3849 -------------------------------------------------------------------------------
3850 */
floatx80_div(floatx80 a,floatx80 b)3851 floatx80 floatx80_div( floatx80 a, floatx80 b )
3852 {
3853 flag aSign, bSign, zSign;
3854 int32 aExp, bExp, zExp;
3855 bits64 aSig, bSig, zSig0, zSig1;
3856 bits64 rem0, rem1, rem2, term0, term1, term2;
3857 floatx80 z;
3858
3859 aSig = extractFloatx80Frac( a );
3860 aExp = extractFloatx80Exp( a );
3861 aSign = extractFloatx80Sign( a );
3862 bSig = extractFloatx80Frac( b );
3863 bExp = extractFloatx80Exp( b );
3864 bSign = extractFloatx80Sign( b );
3865 zSign = aSign ^ bSign;
3866 if ( aExp == 0x7FFF ) {
3867 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3868 if ( bExp == 0x7FFF ) {
3869 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3870 goto invalid;
3871 }
3872 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3873 }
3874 if ( bExp == 0x7FFF ) {
3875 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3876 return packFloatx80( zSign, 0, 0 );
3877 }
3878 if ( bExp == 0 ) {
3879 if ( bSig == 0 ) {
3880 if ( ( aExp | aSig ) == 0 ) {
3881 invalid:
3882 float_raise( float_flag_invalid );
3883 z.low = floatx80_default_nan_low;
3884 z.high = floatx80_default_nan_high;
3885 return z;
3886 }
3887 float_raise( float_flag_divbyzero );
3888 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3889 }
3890 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3891 }
3892 if ( aExp == 0 ) {
3893 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3894 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3895 }
3896 zExp = aExp - bExp + 0x3FFE;
3897 rem1 = 0;
3898 if ( bSig <= aSig ) {
3899 shift128Right( aSig, 0, 1, &aSig, &rem1 );
3900 ++zExp;
3901 }
3902 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3903 mul64To128( bSig, zSig0, &term0, &term1 );
3904 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3905 while ( (sbits64) rem0 < 0 ) {
3906 --zSig0;
3907 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3908 }
3909 zSig1 = estimateDiv128To64( rem1, 0, bSig );
3910 if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3911 mul64To128( bSig, zSig1, &term1, &term2 );
3912 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3913 while ( (sbits64) rem1 < 0 ) {
3914 --zSig1;
3915 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3916 }
3917 zSig1 |= ( ( rem1 | rem2 ) != 0 );
3918 }
3919 return
3920 roundAndPackFloatx80(
3921 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3922
3923 }
3924
3925 /*
3926 -------------------------------------------------------------------------------
3927 Returns the remainder of the extended double-precision floating-point value
3928 `a' with respect to the corresponding value `b'. The operation is performed
3929 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3930 -------------------------------------------------------------------------------
3931 */
floatx80_rem(floatx80 a,floatx80 b)3932 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3933 {
3934 flag aSign, bSign, zSign;
3935 int32 aExp, bExp, expDiff;
3936 bits64 aSig0, aSig1, bSig;
3937 bits64 q, term0, term1, alternateASig0, alternateASig1;
3938 floatx80 z;
3939
3940 aSig0 = extractFloatx80Frac( a );
3941 aExp = extractFloatx80Exp( a );
3942 aSign = extractFloatx80Sign( a );
3943 bSig = extractFloatx80Frac( b );
3944 bExp = extractFloatx80Exp( b );
3945 bSign = extractFloatx80Sign( b );
3946 if ( aExp == 0x7FFF ) {
3947 if ( (bits64) ( aSig0<<1 )
3948 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3949 return propagateFloatx80NaN( a, b );
3950 }
3951 goto invalid;
3952 }
3953 if ( bExp == 0x7FFF ) {
3954 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3955 return a;
3956 }
3957 if ( bExp == 0 ) {
3958 if ( bSig == 0 ) {
3959 invalid:
3960 float_raise( float_flag_invalid );
3961 z.low = floatx80_default_nan_low;
3962 z.high = floatx80_default_nan_high;
3963 return z;
3964 }
3965 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3966 }
3967 if ( aExp == 0 ) {
3968 if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3969 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3970 }
3971 bSig |= LIT64( 0x8000000000000000 );
3972 zSign = aSign;
3973 expDiff = aExp - bExp;
3974 aSig1 = 0;
3975 if ( expDiff < 0 ) {
3976 if ( expDiff < -1 ) return a;
3977 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3978 expDiff = 0;
3979 }
3980 q = ( bSig <= aSig0 );
3981 if ( q ) aSig0 -= bSig;
3982 expDiff -= 64;
3983 while ( 0 < expDiff ) {
3984 q = estimateDiv128To64( aSig0, aSig1, bSig );
3985 q = ( 2 < q ) ? q - 2 : 0;
3986 mul64To128( bSig, q, &term0, &term1 );
3987 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3988 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3989 expDiff -= 62;
3990 }
3991 expDiff += 64;
3992 if ( 0 < expDiff ) {
3993 q = estimateDiv128To64( aSig0, aSig1, bSig );
3994 q = ( 2 < q ) ? q - 2 : 0;
3995 q >>= 64 - expDiff;
3996 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3997 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3998 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3999 while ( le128( term0, term1, aSig0, aSig1 ) ) {
4000 ++q;
4001 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4002 }
4003 }
4004 else {
4005 term1 = 0;
4006 term0 = bSig;
4007 }
4008 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4009 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4010 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4011 && ( q & 1 ) )
4012 ) {
4013 aSig0 = alternateASig0;
4014 aSig1 = alternateASig1;
4015 zSign = ! zSign;
4016 }
4017 return
4018 normalizeRoundAndPackFloatx80(
4019 80, zSign, bExp + expDiff, aSig0, aSig1 );
4020
4021 }
4022
4023 /*
4024 -------------------------------------------------------------------------------
4025 Returns the square root of the extended double-precision floating-point
4026 value `a'. The operation is performed according to the IEC/IEEE Standard
4027 for Binary Floating-Point Arithmetic.
4028 -------------------------------------------------------------------------------
4029 */
floatx80_sqrt(floatx80 a)4030 floatx80 floatx80_sqrt( floatx80 a )
4031 {
4032 flag aSign;
4033 int32 aExp, zExp;
4034 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4035 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4036 floatx80 z;
4037
4038 aSig0 = extractFloatx80Frac( a );
4039 aExp = extractFloatx80Exp( a );
4040 aSign = extractFloatx80Sign( a );
4041 if ( aExp == 0x7FFF ) {
4042 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
4043 if ( ! aSign ) return a;
4044 goto invalid;
4045 }
4046 if ( aSign ) {
4047 if ( ( aExp | aSig0 ) == 0 ) return a;
4048 invalid:
4049 float_raise( float_flag_invalid );
4050 z.low = floatx80_default_nan_low;
4051 z.high = floatx80_default_nan_high;
4052 return z;
4053 }
4054 if ( aExp == 0 ) {
4055 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4056 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4057 }
4058 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4059 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4060 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4061 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4062 doubleZSig0 = zSig0<<1;
4063 mul64To128( zSig0, zSig0, &term0, &term1 );
4064 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4065 while ( (sbits64) rem0 < 0 ) {
4066 --zSig0;
4067 doubleZSig0 -= 2;
4068 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4069 }
4070 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4071 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4072 if ( zSig1 == 0 ) zSig1 = 1;
4073 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4074 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4075 mul64To128( zSig1, zSig1, &term2, &term3 );
4076 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4077 while ( (sbits64) rem1 < 0 ) {
4078 --zSig1;
4079 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4080 term3 |= 1;
4081 term2 |= doubleZSig0;
4082 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4083 }
4084 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4085 }
4086 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4087 zSig0 |= doubleZSig0;
4088 return
4089 roundAndPackFloatx80(
4090 floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
4091
4092 }
4093
4094 /*
4095 -------------------------------------------------------------------------------
4096 Returns 1 if the extended double-precision floating-point value `a' is
4097 equal to the corresponding value `b', and 0 otherwise. The comparison is
4098 performed according to the IEC/IEEE Standard for Binary Floating-Point
4099 Arithmetic.
4100 -------------------------------------------------------------------------------
4101 */
floatx80_eq(floatx80 a,floatx80 b)4102 flag floatx80_eq( floatx80 a, floatx80 b )
4103 {
4104
4105 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4106 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4107 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4108 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4109 ) {
4110 if ( floatx80_is_signaling_nan( a )
4111 || floatx80_is_signaling_nan( b ) ) {
4112 float_raise( float_flag_invalid );
4113 }
4114 return 0;
4115 }
4116 return
4117 ( a.low == b.low )
4118 && ( ( a.high == b.high )
4119 || ( ( a.low == 0 )
4120 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4121 );
4122
4123 }
4124
4125 /*
4126 -------------------------------------------------------------------------------
4127 Returns 1 if the extended double-precision floating-point value `a' is
4128 less than or equal to the corresponding value `b', and 0 otherwise. The
4129 comparison is performed according to the IEC/IEEE Standard for Binary
4130 Floating-Point Arithmetic.
4131 -------------------------------------------------------------------------------
4132 */
floatx80_le(floatx80 a,floatx80 b)4133 flag floatx80_le( floatx80 a, floatx80 b )
4134 {
4135 flag aSign, bSign;
4136
4137 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4138 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4139 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4140 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4141 ) {
4142 float_raise( float_flag_invalid );
4143 return 0;
4144 }
4145 aSign = extractFloatx80Sign( a );
4146 bSign = extractFloatx80Sign( b );
4147 if ( aSign != bSign ) {
4148 return
4149 aSign
4150 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4151 == 0 );
4152 }
4153 return
4154 aSign ? le128( b.high, b.low, a.high, a.low )
4155 : le128( a.high, a.low, b.high, b.low );
4156
4157 }
4158
4159 /*
4160 -------------------------------------------------------------------------------
4161 Returns 1 if the extended double-precision floating-point value `a' is
4162 less than the corresponding value `b', and 0 otherwise. The comparison
4163 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4164 Arithmetic.
4165 -------------------------------------------------------------------------------
4166 */
floatx80_lt(floatx80 a,floatx80 b)4167 flag floatx80_lt( floatx80 a, floatx80 b )
4168 {
4169 flag aSign, bSign;
4170
4171 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4172 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4173 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4174 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4175 ) {
4176 float_raise( float_flag_invalid );
4177 return 0;
4178 }
4179 aSign = extractFloatx80Sign( a );
4180 bSign = extractFloatx80Sign( b );
4181 if ( aSign != bSign ) {
4182 return
4183 aSign
4184 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4185 != 0 );
4186 }
4187 return
4188 aSign ? lt128( b.high, b.low, a.high, a.low )
4189 : lt128( a.high, a.low, b.high, b.low );
4190
4191 }
4192
4193 /*
4194 -------------------------------------------------------------------------------
4195 Returns 1 if the extended double-precision floating-point value `a' is equal
4196 to the corresponding value `b', and 0 otherwise. The invalid exception is
4197 raised if either operand is a NaN. Otherwise, the comparison is performed
4198 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4199 -------------------------------------------------------------------------------
4200 */
floatx80_eq_signaling(floatx80 a,floatx80 b)4201 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
4202 {
4203
4204 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4205 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4206 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4207 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4208 ) {
4209 float_raise( float_flag_invalid );
4210 return 0;
4211 }
4212 return
4213 ( a.low == b.low )
4214 && ( ( a.high == b.high )
4215 || ( ( a.low == 0 )
4216 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4217 );
4218
4219 }
4220
4221 /*
4222 -------------------------------------------------------------------------------
4223 Returns 1 if the extended double-precision floating-point value `a' is less
4224 than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4225 do not cause an exception. Otherwise, the comparison is performed according
4226 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4227 -------------------------------------------------------------------------------
4228 */
floatx80_le_quiet(floatx80 a,floatx80 b)4229 flag floatx80_le_quiet( floatx80 a, floatx80 b )
4230 {
4231 flag aSign, bSign;
4232
4233 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4234 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4235 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4236 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4237 ) {
4238 if ( floatx80_is_signaling_nan( a )
4239 || floatx80_is_signaling_nan( b ) ) {
4240 float_raise( float_flag_invalid );
4241 }
4242 return 0;
4243 }
4244 aSign = extractFloatx80Sign( a );
4245 bSign = extractFloatx80Sign( b );
4246 if ( aSign != bSign ) {
4247 return
4248 aSign
4249 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4250 == 0 );
4251 }
4252 return
4253 aSign ? le128( b.high, b.low, a.high, a.low )
4254 : le128( a.high, a.low, b.high, b.low );
4255
4256 }
4257
4258 /*
4259 -------------------------------------------------------------------------------
4260 Returns 1 if the extended double-precision floating-point value `a' is less
4261 than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4262 an exception. Otherwise, the comparison is performed according to the
4263 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4264 -------------------------------------------------------------------------------
4265 */
floatx80_lt_quiet(floatx80 a,floatx80 b)4266 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4267 {
4268 flag aSign, bSign;
4269
4270 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4271 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4272 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4273 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4274 ) {
4275 if ( floatx80_is_signaling_nan( a )
4276 || floatx80_is_signaling_nan( b ) ) {
4277 float_raise( float_flag_invalid );
4278 }
4279 return 0;
4280 }
4281 aSign = extractFloatx80Sign( a );
4282 bSign = extractFloatx80Sign( b );
4283 if ( aSign != bSign ) {
4284 return
4285 aSign
4286 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4287 != 0 );
4288 }
4289 return
4290 aSign ? lt128( b.high, b.low, a.high, a.low )
4291 : lt128( a.high, a.low, b.high, b.low );
4292
4293 }
4294
4295 #endif
4296
4297 #ifdef FLOAT128
4298
4299 /*
4300 -------------------------------------------------------------------------------
4301 Returns the result of converting the quadruple-precision floating-point
4302 value `a' to the 32-bit two's complement integer format. The conversion
4303 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4304 Arithmetic---which means in particular that the conversion is rounded
4305 according to the current rounding mode. If `a' is a NaN, the largest
4306 positive integer is returned. Otherwise, if the conversion overflows, the
4307 largest integer with the same sign as `a' is returned.
4308 -------------------------------------------------------------------------------
4309 */
float128_to_int32(float128 a)4310 int32 float128_to_int32( float128 a )
4311 {
4312 flag aSign;
4313 int32 aExp, shiftCount;
4314 bits64 aSig0, aSig1;
4315
4316 aSig1 = extractFloat128Frac1( a );
4317 aSig0 = extractFloat128Frac0( a );
4318 aExp = extractFloat128Exp( a );
4319 aSign = extractFloat128Sign( a );
4320 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4321 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4322 aSig0 |= ( aSig1 != 0 );
4323 shiftCount = 0x4028 - aExp;
4324 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4325 return roundAndPackInt32( aSign, aSig0 );
4326
4327 }
4328
4329 /*
4330 -------------------------------------------------------------------------------
4331 Returns the result of converting the quadruple-precision floating-point
4332 value `a' to the 32-bit two's complement integer format. The conversion
4333 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4334 Arithmetic, except that the conversion is always rounded toward zero. If
4335 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4336 conversion overflows, the largest integer with the same sign as `a' is
4337 returned.
4338 -------------------------------------------------------------------------------
4339 */
float128_to_int32_round_to_zero(float128 a)4340 int32 float128_to_int32_round_to_zero( float128 a )
4341 {
4342 flag aSign;
4343 int32 aExp, shiftCount;
4344 bits64 aSig0, aSig1, savedASig;
4345 int32 z;
4346
4347 aSig1 = extractFloat128Frac1( a );
4348 aSig0 = extractFloat128Frac0( a );
4349 aExp = extractFloat128Exp( a );
4350 aSign = extractFloat128Sign( a );
4351 aSig0 |= ( aSig1 != 0 );
4352 if ( 0x401E < aExp ) {
4353 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4354 goto invalid;
4355 }
4356 else if ( aExp < 0x3FFF ) {
4357 if ( aExp || aSig0 ) float_exception_flags |= float_flag_inexact;
4358 return 0;
4359 }
4360 aSig0 |= LIT64( 0x0001000000000000 );
4361 shiftCount = 0x402F - aExp;
4362 savedASig = aSig0;
4363 aSig0 >>= shiftCount;
4364 z = aSig0;
4365 if ( aSign ) z = - z;
4366 if ( ( z < 0 ) ^ aSign ) {
4367 invalid:
4368 float_raise( float_flag_invalid );
4369 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4370 }
4371 if ( ( aSig0<<shiftCount ) != savedASig ) {
4372 float_exception_flags |= float_flag_inexact;
4373 }
4374 return z;
4375
4376 }
4377
4378 /*
4379 -------------------------------------------------------------------------------
4380 Returns the result of converting the quadruple-precision floating-point
4381 value `a' to the 64-bit two's complement integer format. The conversion
4382 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4383 Arithmetic---which means in particular that the conversion is rounded
4384 according to the current rounding mode. If `a' is a NaN, the largest
4385 positive integer is returned. Otherwise, if the conversion overflows, the
4386 largest integer with the same sign as `a' is returned.
4387 -------------------------------------------------------------------------------
4388 */
float128_to_int64(float128 a)4389 int64 float128_to_int64( float128 a )
4390 {
4391 flag aSign;
4392 int32 aExp, shiftCount;
4393 bits64 aSig0, aSig1;
4394
4395 aSig1 = extractFloat128Frac1( a );
4396 aSig0 = extractFloat128Frac0( a );
4397 aExp = extractFloat128Exp( a );
4398 aSign = extractFloat128Sign( a );
4399 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4400 shiftCount = 0x402F - aExp;
4401 if ( shiftCount <= 0 ) {
4402 if ( 0x403E < aExp ) {
4403 float_raise( float_flag_invalid );
4404 if ( ! aSign
4405 || ( ( aExp == 0x7FFF )
4406 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4407 )
4408 ) {
4409 return LIT64( 0x7FFFFFFFFFFFFFFF );
4410 }
4411 return (sbits64) LIT64( 0x8000000000000000 );
4412 }
4413 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4414 }
4415 else {
4416 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4417 }
4418 return roundAndPackInt64( aSign, aSig0, aSig1 );
4419
4420 }
4421
4422 /*
4423 -------------------------------------------------------------------------------
4424 Returns the result of converting the quadruple-precision floating-point
4425 value `a' to the 64-bit two's complement integer format. The conversion
4426 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4427 Arithmetic, except that the conversion is always rounded toward zero.
4428 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4429 the conversion overflows, the largest integer with the same sign as `a' is
4430 returned.
4431 -------------------------------------------------------------------------------
4432 */
float128_to_int64_round_to_zero(float128 a)4433 int64 float128_to_int64_round_to_zero( float128 a )
4434 {
4435 flag aSign;
4436 int32 aExp, shiftCount;
4437 bits64 aSig0, aSig1;
4438 int64 z;
4439
4440 aSig1 = extractFloat128Frac1( a );
4441 aSig0 = extractFloat128Frac0( a );
4442 aExp = extractFloat128Exp( a );
4443 aSign = extractFloat128Sign( a );
4444 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4445 shiftCount = aExp - 0x402F;
4446 if ( 0 < shiftCount ) {
4447 if ( 0x403E <= aExp ) {
4448 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4449 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4450 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4451 if ( aSig1 ) float_exception_flags |= float_flag_inexact;
4452 }
4453 else {
4454 float_raise( float_flag_invalid );
4455 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4456 return LIT64( 0x7FFFFFFFFFFFFFFF );
4457 }
4458 }
4459 return (sbits64) LIT64( 0x8000000000000000 );
4460 }
4461 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4462 if ( (bits64) ( aSig1<<shiftCount ) ) {
4463 float_exception_flags |= float_flag_inexact;
4464 }
4465 }
4466 else {
4467 if ( aExp < 0x3FFF ) {
4468 if ( aExp | aSig0 | aSig1 ) {
4469 float_exception_flags |= float_flag_inexact;
4470 }
4471 return 0;
4472 }
4473 z = aSig0>>( - shiftCount );
4474 if ( aSig1
4475 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4476 float_exception_flags |= float_flag_inexact;
4477 }
4478 }
4479 if ( aSign ) z = - z;
4480 return z;
4481
4482 }
4483
4484 #if (defined(SOFTFLOATSPARC64_FOR_GCC) || defined(SOFTFLOAT_FOR_GCC)) \
4485 && defined(SOFTFLOAT_NEED_FIXUNS)
4486 /*
4487 * just like above - but do not care for overflow of signed results
4488 */
float128_to_uint64_round_to_zero(float128 a)4489 uint64 float128_to_uint64_round_to_zero( float128 a )
4490 {
4491 flag aSign;
4492 int32 aExp, shiftCount;
4493 bits64 aSig0, aSig1;
4494 uint64 z;
4495
4496 aSig1 = extractFloat128Frac1( a );
4497 aSig0 = extractFloat128Frac0( a );
4498 aExp = extractFloat128Exp( a );
4499 aSign = extractFloat128Sign( a );
4500 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4501 shiftCount = aExp - 0x402F;
4502 if ( 0 < shiftCount ) {
4503 if ( 0x403F <= aExp ) {
4504 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4505 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4506 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4507 if ( aSig1 ) float_exception_flags |= float_flag_inexact;
4508 }
4509 else {
4510 float_raise( float_flag_invalid );
4511 }
4512 return LIT64( 0xFFFFFFFFFFFFFFFF );
4513 }
4514 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4515 if ( (bits64) ( aSig1<<shiftCount ) ) {
4516 float_exception_flags |= float_flag_inexact;
4517 }
4518 }
4519 else {
4520 if ( aExp < 0x3FFF ) {
4521 if ( aExp | aSig0 | aSig1 ) {
4522 float_exception_flags |= float_flag_inexact;
4523 }
4524 return 0;
4525 }
4526 z = aSig0>>( - shiftCount );
4527 if (aSig1 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4528 float_exception_flags |= float_flag_inexact;
4529 }
4530 }
4531 if ( aSign ) z = - z;
4532 return z;
4533
4534 }
4535 #endif /* (SOFTFLOATSPARC64_FOR_GCC || SOFTFLOAT_FOR_GCC) && SOFTFLOAT_NEED_FIXUNS */
4536
4537 /*
4538 -------------------------------------------------------------------------------
4539 Returns the result of converting the quadruple-precision floating-point
4540 value `a' to the single-precision floating-point format. The conversion
4541 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4542 Arithmetic.
4543 -------------------------------------------------------------------------------
4544 */
float128_to_float32(float128 a)4545 float32 float128_to_float32( float128 a )
4546 {
4547 flag aSign;
4548 int32 aExp;
4549 bits64 aSig0, aSig1;
4550 bits32 zSig;
4551
4552 aSig1 = extractFloat128Frac1( a );
4553 aSig0 = extractFloat128Frac0( a );
4554 aExp = extractFloat128Exp( a );
4555 aSign = extractFloat128Sign( a );
4556 if ( aExp == 0x7FFF ) {
4557 if ( aSig0 | aSig1 ) {
4558 return commonNaNToFloat32( float128ToCommonNaN( a ) );
4559 }
4560 return packFloat32( aSign, 0xFF, 0 );
4561 }
4562 aSig0 |= ( aSig1 != 0 );
4563 shift64RightJamming( aSig0, 18, &aSig0 );
4564 zSig = aSig0;
4565 if ( aExp || zSig ) {
4566 zSig |= 0x40000000;
4567 aExp -= 0x3F81;
4568 }
4569 return roundAndPackFloat32( aSign, aExp, zSig );
4570
4571 }
4572
4573 /*
4574 -------------------------------------------------------------------------------
4575 Returns the result of converting the quadruple-precision floating-point
4576 value `a' to the double-precision floating-point format. The conversion
4577 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4578 Arithmetic.
4579 -------------------------------------------------------------------------------
4580 */
float128_to_float64(float128 a)4581 float64 float128_to_float64( float128 a )
4582 {
4583 flag aSign;
4584 int32 aExp;
4585 bits64 aSig0, aSig1;
4586
4587 aSig1 = extractFloat128Frac1( a );
4588 aSig0 = extractFloat128Frac0( a );
4589 aExp = extractFloat128Exp( a );
4590 aSign = extractFloat128Sign( a );
4591 if ( aExp == 0x7FFF ) {
4592 if ( aSig0 | aSig1 ) {
4593 return commonNaNToFloat64( float128ToCommonNaN( a ) );
4594 }
4595 return packFloat64( aSign, 0x7FF, 0 );
4596 }
4597 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4598 aSig0 |= ( aSig1 != 0 );
4599 if ( aExp || aSig0 ) {
4600 aSig0 |= LIT64( 0x4000000000000000 );
4601 aExp -= 0x3C01;
4602 }
4603 return roundAndPackFloat64( aSign, aExp, aSig0 );
4604
4605 }
4606
4607 #ifdef FLOATX80
4608
4609 /*
4610 -------------------------------------------------------------------------------
4611 Returns the result of converting the quadruple-precision floating-point
4612 value `a' to the extended double-precision floating-point format. The
4613 conversion is performed according to the IEC/IEEE Standard for Binary
4614 Floating-Point Arithmetic.
4615 -------------------------------------------------------------------------------
4616 */
float128_to_floatx80(float128 a)4617 floatx80 float128_to_floatx80( float128 a )
4618 {
4619 flag aSign;
4620 int32 aExp;
4621 bits64 aSig0, aSig1;
4622
4623 aSig1 = extractFloat128Frac1( a );
4624 aSig0 = extractFloat128Frac0( a );
4625 aExp = extractFloat128Exp( a );
4626 aSign = extractFloat128Sign( a );
4627 if ( aExp == 0x7FFF ) {
4628 if ( aSig0 | aSig1 ) {
4629 return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4630 }
4631 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4632 }
4633 if ( aExp == 0 ) {
4634 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4635 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4636 }
4637 else {
4638 aSig0 |= LIT64( 0x0001000000000000 );
4639 }
4640 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4641 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4642
4643 }
4644
4645 #endif
4646
4647 /*
4648 -------------------------------------------------------------------------------
4649 Rounds the quadruple-precision floating-point value `a' to an integer, and
4650 returns the result as a quadruple-precision floating-point value. The
4651 operation is performed according to the IEC/IEEE Standard for Binary
4652 Floating-Point Arithmetic.
4653 -------------------------------------------------------------------------------
4654 */
float128_round_to_int(float128 a)4655 float128 float128_round_to_int( float128 a )
4656 {
4657 flag aSign;
4658 int32 aExp;
4659 bits64 lastBitMask, roundBitsMask;
4660 int8 roundingMode;
4661 float128 z;
4662
4663 aExp = extractFloat128Exp( a );
4664 if ( 0x402F <= aExp ) {
4665 if ( 0x406F <= aExp ) {
4666 if ( ( aExp == 0x7FFF )
4667 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4668 ) {
4669 return propagateFloat128NaN( a, a );
4670 }
4671 return a;
4672 }
4673 lastBitMask = 1;
4674 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4675 roundBitsMask = lastBitMask - 1;
4676 z = a;
4677 roundingMode = float_rounding_mode;
4678 if ( roundingMode == float_round_nearest_even ) {
4679 if ( lastBitMask ) {
4680 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4681 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4682 }
4683 else {
4684 if ( (sbits64) z.low < 0 ) {
4685 ++z.high;
4686 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4687 }
4688 }
4689 }
4690 else if ( roundingMode != float_round_to_zero ) {
4691 if ( extractFloat128Sign( z )
4692 ^ ( roundingMode == float_round_up ) ) {
4693 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4694 }
4695 }
4696 z.low &= ~ roundBitsMask;
4697 }
4698 else {
4699 if ( aExp < 0x3FFF ) {
4700 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4701 float_exception_flags |= float_flag_inexact;
4702 aSign = extractFloat128Sign( a );
4703 switch ( float_rounding_mode ) {
4704 case float_round_nearest_even:
4705 if ( ( aExp == 0x3FFE )
4706 && ( extractFloat128Frac0( a )
4707 | extractFloat128Frac1( a ) )
4708 ) {
4709 return packFloat128( aSign, 0x3FFF, 0, 0 );
4710 }
4711 break;
4712 case float_round_to_zero:
4713 break;
4714 case float_round_down:
4715 return
4716 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4717 : packFloat128( 0, 0, 0, 0 );
4718 case float_round_up:
4719 return
4720 aSign ? packFloat128( 1, 0, 0, 0 )
4721 : packFloat128( 0, 0x3FFF, 0, 0 );
4722 }
4723 return packFloat128( aSign, 0, 0, 0 );
4724 }
4725 lastBitMask = 1;
4726 lastBitMask <<= 0x402F - aExp;
4727 roundBitsMask = lastBitMask - 1;
4728 z.low = 0;
4729 z.high = a.high;
4730 roundingMode = float_rounding_mode;
4731 if ( roundingMode == float_round_nearest_even ) {
4732 z.high += lastBitMask>>1;
4733 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4734 z.high &= ~ lastBitMask;
4735 }
4736 }
4737 else if ( roundingMode != float_round_to_zero ) {
4738 if ( extractFloat128Sign( z )
4739 ^ ( roundingMode == float_round_up ) ) {
4740 z.high |= ( a.low != 0 );
4741 z.high += roundBitsMask;
4742 }
4743 }
4744 z.high &= ~ roundBitsMask;
4745 }
4746 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4747 float_exception_flags |= float_flag_inexact;
4748 }
4749 return z;
4750
4751 }
4752
4753 /*
4754 -------------------------------------------------------------------------------
4755 Returns the result of adding the absolute values of the quadruple-precision
4756 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
4757 before being returned. `zSign' is ignored if the result is a NaN.
4758 The addition is performed according to the IEC/IEEE Standard for Binary
4759 Floating-Point Arithmetic.
4760 -------------------------------------------------------------------------------
4761 */
addFloat128Sigs(float128 a,float128 b,flag zSign)4762 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4763 {
4764 int32 aExp, bExp, zExp;
4765 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4766 int32 expDiff;
4767
4768 aSig1 = extractFloat128Frac1( a );
4769 aSig0 = extractFloat128Frac0( a );
4770 aExp = extractFloat128Exp( a );
4771 bSig1 = extractFloat128Frac1( b );
4772 bSig0 = extractFloat128Frac0( b );
4773 bExp = extractFloat128Exp( b );
4774 expDiff = aExp - bExp;
4775 if ( 0 < expDiff ) {
4776 if ( aExp == 0x7FFF ) {
4777 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4778 return a;
4779 }
4780 if ( bExp == 0 ) {
4781 --expDiff;
4782 }
4783 else {
4784 bSig0 |= LIT64( 0x0001000000000000 );
4785 }
4786 shift128ExtraRightJamming(
4787 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4788 zExp = aExp;
4789 }
4790 else if ( expDiff < 0 ) {
4791 if ( bExp == 0x7FFF ) {
4792 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4793 return packFloat128( zSign, 0x7FFF, 0, 0 );
4794 }
4795 if ( aExp == 0 ) {
4796 ++expDiff;
4797 }
4798 else {
4799 aSig0 |= LIT64( 0x0001000000000000 );
4800 }
4801 shift128ExtraRightJamming(
4802 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4803 zExp = bExp;
4804 }
4805 else {
4806 if ( aExp == 0x7FFF ) {
4807 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4808 return propagateFloat128NaN( a, b );
4809 }
4810 return a;
4811 }
4812 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4813 if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4814 zSig2 = 0;
4815 zSig0 |= LIT64( 0x0002000000000000 );
4816 zExp = aExp;
4817 goto shiftRight1;
4818 }
4819 aSig0 |= LIT64( 0x0001000000000000 );
4820 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4821 --zExp;
4822 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4823 ++zExp;
4824 shiftRight1:
4825 shift128ExtraRightJamming(
4826 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4827 roundAndPack:
4828 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4829
4830 }
4831
4832 /*
4833 -------------------------------------------------------------------------------
4834 Returns the result of subtracting the absolute values of the quadruple-
4835 precision floating-point values `a' and `b'. If `zSign' is 1, the
4836 difference is negated before being returned. `zSign' is ignored if the
4837 result is a NaN. The subtraction is performed according to the IEC/IEEE
4838 Standard for Binary Floating-Point Arithmetic.
4839 -------------------------------------------------------------------------------
4840 */
subFloat128Sigs(float128 a,float128 b,flag zSign)4841 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4842 {
4843 int32 aExp, bExp, zExp;
4844 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4845 int32 expDiff;
4846 float128 z;
4847
4848 aSig1 = extractFloat128Frac1( a );
4849 aSig0 = extractFloat128Frac0( a );
4850 aExp = extractFloat128Exp( a );
4851 bSig1 = extractFloat128Frac1( b );
4852 bSig0 = extractFloat128Frac0( b );
4853 bExp = extractFloat128Exp( b );
4854 expDiff = aExp - bExp;
4855 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4856 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4857 if ( 0 < expDiff ) goto aExpBigger;
4858 if ( expDiff < 0 ) goto bExpBigger;
4859 if ( aExp == 0x7FFF ) {
4860 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4861 return propagateFloat128NaN( a, b );
4862 }
4863 float_raise( float_flag_invalid );
4864 z.low = float128_default_nan_low;
4865 z.high = float128_default_nan_high;
4866 return z;
4867 }
4868 if ( aExp == 0 ) {
4869 aExp = 1;
4870 bExp = 1;
4871 }
4872 if ( bSig0 < aSig0 ) goto aBigger;
4873 if ( aSig0 < bSig0 ) goto bBigger;
4874 if ( bSig1 < aSig1 ) goto aBigger;
4875 if ( aSig1 < bSig1 ) goto bBigger;
4876 return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 );
4877 bExpBigger:
4878 if ( bExp == 0x7FFF ) {
4879 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4880 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4881 }
4882 if ( aExp == 0 ) {
4883 ++expDiff;
4884 }
4885 else {
4886 aSig0 |= LIT64( 0x4000000000000000 );
4887 }
4888 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4889 bSig0 |= LIT64( 0x4000000000000000 );
4890 bBigger:
4891 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4892 zExp = bExp;
4893 zSign ^= 1;
4894 goto normalizeRoundAndPack;
4895 aExpBigger:
4896 if ( aExp == 0x7FFF ) {
4897 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4898 return a;
4899 }
4900 if ( bExp == 0 ) {
4901 --expDiff;
4902 }
4903 else {
4904 bSig0 |= LIT64( 0x4000000000000000 );
4905 }
4906 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4907 aSig0 |= LIT64( 0x4000000000000000 );
4908 aBigger:
4909 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4910 zExp = aExp;
4911 normalizeRoundAndPack:
4912 --zExp;
4913 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4914
4915 }
4916
4917 /*
4918 -------------------------------------------------------------------------------
4919 Returns the result of adding the quadruple-precision floating-point values
4920 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4921 for Binary Floating-Point Arithmetic.
4922 -------------------------------------------------------------------------------
4923 */
float128_add(float128 a,float128 b)4924 float128 float128_add( float128 a, float128 b )
4925 {
4926 flag aSign, bSign;
4927
4928 aSign = extractFloat128Sign( a );
4929 bSign = extractFloat128Sign( b );
4930 if ( aSign == bSign ) {
4931 return addFloat128Sigs( a, b, aSign );
4932 }
4933 else {
4934 return subFloat128Sigs( a, b, aSign );
4935 }
4936
4937 }
4938
4939 /*
4940 -------------------------------------------------------------------------------
4941 Returns the result of subtracting the quadruple-precision floating-point
4942 values `a' and `b'. The operation is performed according to the IEC/IEEE
4943 Standard for Binary Floating-Point Arithmetic.
4944 -------------------------------------------------------------------------------
4945 */
float128_sub(float128 a,float128 b)4946 float128 float128_sub( float128 a, float128 b )
4947 {
4948 flag aSign, bSign;
4949
4950 aSign = extractFloat128Sign( a );
4951 bSign = extractFloat128Sign( b );
4952 if ( aSign == bSign ) {
4953 return subFloat128Sigs( a, b, aSign );
4954 }
4955 else {
4956 return addFloat128Sigs( a, b, aSign );
4957 }
4958
4959 }
4960
4961 /*
4962 -------------------------------------------------------------------------------
4963 Returns the result of multiplying the quadruple-precision floating-point
4964 values `a' and `b'. The operation is performed according to the IEC/IEEE
4965 Standard for Binary Floating-Point Arithmetic.
4966 -------------------------------------------------------------------------------
4967 */
float128_mul(float128 a,float128 b)4968 float128 float128_mul( float128 a, float128 b )
4969 {
4970 flag aSign, bSign, zSign;
4971 int32 aExp, bExp, zExp;
4972 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4973 float128 z;
4974
4975 aSig1 = extractFloat128Frac1( a );
4976 aSig0 = extractFloat128Frac0( a );
4977 aExp = extractFloat128Exp( a );
4978 aSign = extractFloat128Sign( a );
4979 bSig1 = extractFloat128Frac1( b );
4980 bSig0 = extractFloat128Frac0( b );
4981 bExp = extractFloat128Exp( b );
4982 bSign = extractFloat128Sign( b );
4983 zSign = aSign ^ bSign;
4984 if ( aExp == 0x7FFF ) {
4985 if ( ( aSig0 | aSig1 )
4986 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4987 return propagateFloat128NaN( a, b );
4988 }
4989 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4990 return packFloat128( zSign, 0x7FFF, 0, 0 );
4991 }
4992 if ( bExp == 0x7FFF ) {
4993 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4994 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4995 invalid:
4996 float_raise( float_flag_invalid );
4997 z.low = float128_default_nan_low;
4998 z.high = float128_default_nan_high;
4999 return z;
5000 }
5001 return packFloat128( zSign, 0x7FFF, 0, 0 );
5002 }
5003 if ( aExp == 0 ) {
5004 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5005 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5006 }
5007 if ( bExp == 0 ) {
5008 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5009 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5010 }
5011 zExp = aExp + bExp - 0x4000;
5012 aSig0 |= LIT64( 0x0001000000000000 );
5013 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5014 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5015 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5016 zSig2 |= ( zSig3 != 0 );
5017 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5018 shift128ExtraRightJamming(
5019 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5020 ++zExp;
5021 }
5022 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5023
5024 }
5025
5026 /*
5027 -------------------------------------------------------------------------------
5028 Returns the result of dividing the quadruple-precision floating-point value
5029 `a' by the corresponding value `b'. The operation is performed according to
5030 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5031 -------------------------------------------------------------------------------
5032 */
float128_div(float128 a,float128 b)5033 float128 float128_div( float128 a, float128 b )
5034 {
5035 flag aSign, bSign, zSign;
5036 int32 aExp, bExp, zExp;
5037 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5038 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5039 float128 z;
5040
5041 aSig1 = extractFloat128Frac1( a );
5042 aSig0 = extractFloat128Frac0( a );
5043 aExp = extractFloat128Exp( a );
5044 aSign = extractFloat128Sign( a );
5045 bSig1 = extractFloat128Frac1( b );
5046 bSig0 = extractFloat128Frac0( b );
5047 bExp = extractFloat128Exp( b );
5048 bSign = extractFloat128Sign( b );
5049 zSign = aSign ^ bSign;
5050 if ( aExp == 0x7FFF ) {
5051 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
5052 if ( bExp == 0x7FFF ) {
5053 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5054 goto invalid;
5055 }
5056 return packFloat128( zSign, 0x7FFF, 0, 0 );
5057 }
5058 if ( bExp == 0x7FFF ) {
5059 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5060 return packFloat128( zSign, 0, 0, 0 );
5061 }
5062 if ( bExp == 0 ) {
5063 if ( ( bSig0 | bSig1 ) == 0 ) {
5064 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5065 invalid:
5066 float_raise( float_flag_invalid );
5067 z.low = float128_default_nan_low;
5068 z.high = float128_default_nan_high;
5069 return z;
5070 }
5071 float_raise( float_flag_divbyzero );
5072 return packFloat128( zSign, 0x7FFF, 0, 0 );
5073 }
5074 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5075 }
5076 if ( aExp == 0 ) {
5077 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5078 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5079 }
5080 zExp = aExp - bExp + 0x3FFD;
5081 shortShift128Left(
5082 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5083 shortShift128Left(
5084 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5085 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5086 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5087 ++zExp;
5088 }
5089 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5090 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5091 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5092 while ( (sbits64) rem0 < 0 ) {
5093 --zSig0;
5094 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5095 }
5096 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5097 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5098 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5099 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5100 while ( (sbits64) rem1 < 0 ) {
5101 --zSig1;
5102 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5103 }
5104 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5105 }
5106 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5107 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5108
5109 }
5110
5111 /*
5112 -------------------------------------------------------------------------------
5113 Returns the remainder of the quadruple-precision floating-point value `a'
5114 with respect to the corresponding value `b'. The operation is performed
5115 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5116 -------------------------------------------------------------------------------
5117 */
float128_rem(float128 a,float128 b)5118 float128 float128_rem( float128 a, float128 b )
5119 {
5120 flag aSign, bSign, zSign;
5121 int32 aExp, bExp, expDiff;
5122 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5123 bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5124 sbits64 sigMean0;
5125 float128 z;
5126
5127 aSig1 = extractFloat128Frac1( a );
5128 aSig0 = extractFloat128Frac0( a );
5129 aExp = extractFloat128Exp( a );
5130 aSign = extractFloat128Sign( a );
5131 bSig1 = extractFloat128Frac1( b );
5132 bSig0 = extractFloat128Frac0( b );
5133 bExp = extractFloat128Exp( b );
5134 bSign = extractFloat128Sign( b );
5135 if ( aExp == 0x7FFF ) {
5136 if ( ( aSig0 | aSig1 )
5137 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5138 return propagateFloat128NaN( a, b );
5139 }
5140 goto invalid;
5141 }
5142 if ( bExp == 0x7FFF ) {
5143 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5144 return a;
5145 }
5146 if ( bExp == 0 ) {
5147 if ( ( bSig0 | bSig1 ) == 0 ) {
5148 invalid:
5149 float_raise( float_flag_invalid );
5150 z.low = float128_default_nan_low;
5151 z.high = float128_default_nan_high;
5152 return z;
5153 }
5154 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5155 }
5156 if ( aExp == 0 ) {
5157 if ( ( aSig0 | aSig1 ) == 0 ) return a;
5158 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5159 }
5160 expDiff = aExp - bExp;
5161 if ( expDiff < -1 ) return a;
5162 shortShift128Left(
5163 aSig0 | LIT64( 0x0001000000000000 ),
5164 aSig1,
5165 15 - ( expDiff < 0 ),
5166 &aSig0,
5167 &aSig1
5168 );
5169 shortShift128Left(
5170 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5171 q = le128( bSig0, bSig1, aSig0, aSig1 );
5172 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5173 expDiff -= 64;
5174 while ( 0 < expDiff ) {
5175 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5176 q = ( 4 < q ) ? q - 4 : 0;
5177 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5178 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5179 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5180 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5181 expDiff -= 61;
5182 }
5183 if ( -64 < expDiff ) {
5184 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5185 q = ( 4 < q ) ? q - 4 : 0;
5186 q >>= - expDiff;
5187 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5188 expDiff += 52;
5189 if ( expDiff < 0 ) {
5190 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5191 }
5192 else {
5193 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5194 }
5195 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5196 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5197 }
5198 else {
5199 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5200 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5201 }
5202 do {
5203 alternateASig0 = aSig0;
5204 alternateASig1 = aSig1;
5205 ++q;
5206 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5207 } while ( 0 <= (sbits64) aSig0 );
5208 add128(
5209 aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
5210 if ( ( sigMean0 < 0 )
5211 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5212 aSig0 = alternateASig0;
5213 aSig1 = alternateASig1;
5214 }
5215 zSign = ( (sbits64) aSig0 < 0 );
5216 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5217 return
5218 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
5219
5220 }
5221
5222 /*
5223 -------------------------------------------------------------------------------
5224 Returns the square root of the quadruple-precision floating-point value `a'.
5225 The operation is performed according to the IEC/IEEE Standard for Binary
5226 Floating-Point Arithmetic.
5227 -------------------------------------------------------------------------------
5228 */
float128_sqrt(float128 a)5229 float128 float128_sqrt( float128 a )
5230 {
5231 flag aSign;
5232 int32 aExp, zExp;
5233 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5234 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5235 float128 z;
5236
5237 aSig1 = extractFloat128Frac1( a );
5238 aSig0 = extractFloat128Frac0( a );
5239 aExp = extractFloat128Exp( a );
5240 aSign = extractFloat128Sign( a );
5241 if ( aExp == 0x7FFF ) {
5242 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
5243 if ( ! aSign ) return a;
5244 goto invalid;
5245 }
5246 if ( aSign ) {
5247 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5248 invalid:
5249 float_raise( float_flag_invalid );
5250 z.low = float128_default_nan_low;
5251 z.high = float128_default_nan_high;
5252 return z;
5253 }
5254 if ( aExp == 0 ) {
5255 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5256 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5257 }
5258 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5259 aSig0 |= LIT64( 0x0001000000000000 );
5260 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5261 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5262 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5263 doubleZSig0 = zSig0<<1;
5264 mul64To128( zSig0, zSig0, &term0, &term1 );
5265 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5266 while ( (sbits64) rem0 < 0 ) {
5267 --zSig0;
5268 doubleZSig0 -= 2;
5269 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5270 }
5271 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5272 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5273 if ( zSig1 == 0 ) zSig1 = 1;
5274 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5275 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5276 mul64To128( zSig1, zSig1, &term2, &term3 );
5277 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5278 while ( (sbits64) rem1 < 0 ) {
5279 --zSig1;
5280 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5281 term3 |= 1;
5282 term2 |= doubleZSig0;
5283 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5284 }
5285 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5286 }
5287 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5288 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
5289
5290 }
5291
5292 /*
5293 -------------------------------------------------------------------------------
5294 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5295 the corresponding value `b', and 0 otherwise. The comparison is performed
5296 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5297 -------------------------------------------------------------------------------
5298 */
float128_eq(float128 a,float128 b)5299 flag float128_eq( float128 a, float128 b )
5300 {
5301
5302 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5303 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5304 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5305 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5306 ) {
5307 if ( float128_is_signaling_nan( a )
5308 || float128_is_signaling_nan( b ) ) {
5309 float_raise( float_flag_invalid );
5310 }
5311 return 0;
5312 }
5313 return
5314 ( a.low == b.low )
5315 && ( ( a.high == b.high )
5316 || ( ( a.low == 0 )
5317 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5318 );
5319
5320 }
5321
5322 /*
5323 -------------------------------------------------------------------------------
5324 Returns 1 if the quadruple-precision floating-point value `a' is less than
5325 or equal to the corresponding value `b', and 0 otherwise. The comparison
5326 is performed according to the IEC/IEEE Standard for Binary Floating-Point
5327 Arithmetic.
5328 -------------------------------------------------------------------------------
5329 */
float128_le(float128 a,float128 b)5330 flag float128_le( float128 a, float128 b )
5331 {
5332 flag aSign, bSign;
5333
5334 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5335 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5336 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5337 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5338 ) {
5339 float_raise( float_flag_invalid );
5340 return 0;
5341 }
5342 aSign = extractFloat128Sign( a );
5343 bSign = extractFloat128Sign( b );
5344 if ( aSign != bSign ) {
5345 return
5346 aSign
5347 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5348 == 0 );
5349 }
5350 return
5351 aSign ? le128( b.high, b.low, a.high, a.low )
5352 : le128( a.high, a.low, b.high, b.low );
5353
5354 }
5355
5356 /*
5357 -------------------------------------------------------------------------------
5358 Returns 1 if the quadruple-precision floating-point value `a' is less than
5359 the corresponding value `b', and 0 otherwise. The comparison is performed
5360 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5361 -------------------------------------------------------------------------------
5362 */
float128_lt(float128 a,float128 b)5363 flag float128_lt( float128 a, float128 b )
5364 {
5365 flag aSign, bSign;
5366
5367 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5368 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5369 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5370 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5371 ) {
5372 float_raise( float_flag_invalid );
5373 return 0;
5374 }
5375 aSign = extractFloat128Sign( a );
5376 bSign = extractFloat128Sign( b );
5377 if ( aSign != bSign ) {
5378 return
5379 aSign
5380 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5381 != 0 );
5382 }
5383 return
5384 aSign ? lt128( b.high, b.low, a.high, a.low )
5385 : lt128( a.high, a.low, b.high, b.low );
5386
5387 }
5388
5389 /*
5390 -------------------------------------------------------------------------------
5391 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5392 the corresponding value `b', and 0 otherwise. The invalid exception is
5393 raised if either operand is a NaN. Otherwise, the comparison is performed
5394 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5395 -------------------------------------------------------------------------------
5396 */
float128_eq_signaling(float128 a,float128 b)5397 flag float128_eq_signaling( float128 a, float128 b )
5398 {
5399
5400 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5401 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5402 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5403 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5404 ) {
5405 float_raise( float_flag_invalid );
5406 return 0;
5407 }
5408 return
5409 ( a.low == b.low )
5410 && ( ( a.high == b.high )
5411 || ( ( a.low == 0 )
5412 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5413 );
5414
5415 }
5416
5417 /*
5418 -------------------------------------------------------------------------------
5419 Returns 1 if the quadruple-precision floating-point value `a' is less than
5420 or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5421 cause an exception. Otherwise, the comparison is performed according to the
5422 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5423 -------------------------------------------------------------------------------
5424 */
float128_le_quiet(float128 a,float128 b)5425 flag float128_le_quiet( float128 a, float128 b )
5426 {
5427 flag aSign, bSign;
5428
5429 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5430 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5431 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5432 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5433 ) {
5434 if ( float128_is_signaling_nan( a )
5435 || float128_is_signaling_nan( b ) ) {
5436 float_raise( float_flag_invalid );
5437 }
5438 return 0;
5439 }
5440 aSign = extractFloat128Sign( a );
5441 bSign = extractFloat128Sign( b );
5442 if ( aSign != bSign ) {
5443 return
5444 aSign
5445 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5446 == 0 );
5447 }
5448 return
5449 aSign ? le128( b.high, b.low, a.high, a.low )
5450 : le128( a.high, a.low, b.high, b.low );
5451
5452 }
5453
5454 /*
5455 -------------------------------------------------------------------------------
5456 Returns 1 if the quadruple-precision floating-point value `a' is less than
5457 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5458 exception. Otherwise, the comparison is performed according to the IEC/IEEE
5459 Standard for Binary Floating-Point Arithmetic.
5460 -------------------------------------------------------------------------------
5461 */
float128_lt_quiet(float128 a,float128 b)5462 flag float128_lt_quiet( float128 a, float128 b )
5463 {
5464 flag aSign, bSign;
5465
5466 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5467 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5468 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5469 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5470 ) {
5471 if ( float128_is_signaling_nan( a )
5472 || float128_is_signaling_nan( b ) ) {
5473 float_raise( float_flag_invalid );
5474 }
5475 return 0;
5476 }
5477 aSign = extractFloat128Sign( a );
5478 bSign = extractFloat128Sign( b );
5479 if ( aSign != bSign ) {
5480 return
5481 aSign
5482 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5483 != 0 );
5484 }
5485 return
5486 aSign ? lt128( b.high, b.low, a.high, a.low )
5487 : lt128( a.high, a.low, b.high, b.low );
5488
5489 }
5490
5491 #endif
5492
5493
5494 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5495
5496 /*
5497 * These two routines are not part of the original softfloat distribution.
5498 *
5499 * They are based on the corresponding conversions to integer but return
5500 * unsigned numbers instead since these functions are required by GCC.
5501 *
5502 * Added by Mark Brinicombe <mark@NetBSD.org> 27/09/97
5503 *
5504 * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5505 */
5506
5507 /*
5508 -------------------------------------------------------------------------------
5509 Returns the result of converting the double-precision floating-point value
5510 `a' to the 32-bit unsigned integer format. The conversion is
5511 performed according to the IEC/IEEE Standard for Binary Floating-point
5512 Arithmetic, except that the conversion is always rounded toward zero. If
5513 `a' is a NaN, the largest positive integer is returned. If the conversion
5514 overflows, the largest integer positive is returned.
5515 -------------------------------------------------------------------------------
5516 */
float64_to_uint32_round_to_zero(float64 a)5517 uint32 float64_to_uint32_round_to_zero( float64 a )
5518 {
5519 flag aSign;
5520 int16 aExp, shiftCount;
5521 bits64 aSig, savedASig;
5522 uint32 z;
5523
5524 aSig = extractFloat64Frac( a );
5525 aExp = extractFloat64Exp( a );
5526 aSign = extractFloat64Sign( a );
5527
5528 if (aSign) {
5529 float_raise( float_flag_invalid );
5530 return(0);
5531 }
5532
5533 if ( 0x41E < aExp ) {
5534 float_raise( float_flag_invalid );
5535 return 0xffffffff;
5536 }
5537 else if ( aExp < 0x3FF ) {
5538 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
5539 return 0;
5540 }
5541 aSig |= LIT64( 0x0010000000000000 );
5542 shiftCount = 0x433 - aExp;
5543 savedASig = aSig;
5544 aSig >>= shiftCount;
5545 z = aSig;
5546 if ( ( aSig<<shiftCount ) != savedASig ) {
5547 float_exception_flags |= float_flag_inexact;
5548 }
5549 return z;
5550
5551 }
5552
5553 /*
5554 -------------------------------------------------------------------------------
5555 Returns the result of converting the single-precision floating-point value
5556 `a' to the 32-bit unsigned integer format. The conversion is
5557 performed according to the IEC/IEEE Standard for Binary Floating-point
5558 Arithmetic, except that the conversion is always rounded toward zero. If
5559 `a' is a NaN, the largest positive integer is returned. If the conversion
5560 overflows, the largest positive integer is returned.
5561 -------------------------------------------------------------------------------
5562 */
float32_to_uint32_round_to_zero(float32 a)5563 uint32 float32_to_uint32_round_to_zero( float32 a )
5564 {
5565 flag aSign;
5566 int16 aExp, shiftCount;
5567 bits32 aSig;
5568 uint32 z;
5569
5570 aSig = extractFloat32Frac( a );
5571 aExp = extractFloat32Exp( a );
5572 aSign = extractFloat32Sign( a );
5573 shiftCount = aExp - 0x9E;
5574
5575 if (aSign) {
5576 float_raise( float_flag_invalid );
5577 return(0);
5578 }
5579 if ( 0 < shiftCount ) {
5580 float_raise( float_flag_invalid );
5581 return 0xFFFFFFFF;
5582 }
5583 else if ( aExp <= 0x7E ) {
5584 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
5585 return 0;
5586 }
5587 aSig = ( aSig | 0x800000 )<<8;
5588 z = aSig>>( - shiftCount );
5589 if ( aSig<<( shiftCount & 31 ) ) {
5590 float_exception_flags |= float_flag_inexact;
5591 }
5592 return z;
5593
5594 }
5595
5596 #endif
5597