xref: /illumos-gate/usr/src/lib/libm/common/m9x/fma.c (revision 4b5c8e93cab28d3c65ba9d407fd8f46e3be1db1c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
24  */
25 /*
26  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
27  * Use is subject to license terms.
28  */
29 
30 #pragma weak fma = __fma
31 
32 #include "libm.h"
33 #include "fma.h"
34 #include "fenv_inlines.h"
35 
36 #if defined(__sparc)
37 
38 static const union {
39 	unsigned i[2];
40 	double d;
41 } C[] = {
42 	{ 0x3fe00000u, 0 },
43 	{ 0x40000000u, 0 },
44 	{ 0x43300000u, 0 },
45 	{ 0x41a00000u, 0 },
46 	{ 0x3e500000u, 0 },
47 	{ 0x3df00000u, 0 },
48 	{ 0x3bf00000u, 0 },
49 	{ 0x7fe00000u, 0 },
50 	{ 0x00100000u, 0 },
51 	{ 0x00100001u, 0 }
52 };
53 
54 #define	half	C[0].d
55 #define	two	C[1].d
56 #define	two52	C[2].d
57 #define	two27	C[3].d
58 #define	twom26	C[4].d
59 #define	twom32	C[5].d
60 #define	twom64	C[6].d
61 #define	huge	C[7].d
62 #define	tiny	C[8].d
63 #define	tiny2	C[9].d
64 
65 static const unsigned int fsr_rm = 0xc0000000u;
66 
67 /*
68  * fma for SPARC: 64-bit double precision, big-endian
69  */
70 double
71 __fma(double x, double y, double z) {
72 	union {
73 		unsigned i[2];
74 		double d;
75 	} xx, yy, zz;
76 	double xhi, yhi, xlo, ylo, t;
77 	unsigned int xy0, xy1, xy2, xy3, z0, z1, z2, z3, fsr, rm, sticky;
78 	int hx, hy, hz, ex, ey, ez, exy, sxy, sz, e, ibit;
79 	volatile double	dummy;
80 
81 	/* extract the high order words of the arguments */
82 	xx.d = x;
83 	yy.d = y;
84 	zz.d = z;
85 	hx = xx.i[0] & ~0x80000000;
86 	hy = yy.i[0] & ~0x80000000;
87 	hz = zz.i[0] & ~0x80000000;
88 
89 	/* dispense with inf, nan, and zero cases */
90 	if (hx >= 0x7ff00000 || hy >= 0x7ff00000 || (hx | xx.i[1]) == 0 ||
91 		(hy | yy.i[1]) == 0)	/* x or y is inf, nan, or zero */
92 		return (x * y + z);
93 
94 	if (hz >= 0x7ff00000)	/* z is inf or nan */
95 		return (x + z);	/* avoid spurious under/overflow in x * y */
96 
97 	if ((hz | zz.i[1]) == 0)	/* z is zero */
98 		/*
99 		 * x * y isn't zero but could underflow to zero,
100 		 * so don't add z, lest we perturb the sign
101 		 */
102 		return (x * y);
103 
104 	/*
105 	 * now x, y, and z are all finite and nonzero; save the fsr and
106 	 * set round-to-negative-infinity mode (and clear nonstandard
107 	 * mode before we try to scale subnormal operands)
108 	 */
109 	__fenv_getfsr32(&fsr);
110 	__fenv_setfsr32(&fsr_rm);
111 
112 	/* extract signs and exponents, and normalize subnormals */
113 	sxy = (xx.i[0] ^ yy.i[0]) & 0x80000000;
114 	sz = zz.i[0] & 0x80000000;
115 	ex = hx >> 20;
116 	if (!ex) {
117 		xx.d = x * two52;
118 		ex = ((xx.i[0] & ~0x80000000) >> 20) - 52;
119 	}
120 	ey = hy >> 20;
121 	if (!ey) {
122 		yy.d = y * two52;
123 		ey = ((yy.i[0] & ~0x80000000) >> 20) - 52;
124 	}
125 	ez = hz >> 20;
126 	if (!ez) {
127 		zz.d = z * two52;
128 		ez = ((zz.i[0] & ~0x80000000) >> 20) - 52;
129 	}
130 
131 	/* multiply x*y to 106 bits */
132 	exy = ex + ey - 0x3ff;
133 	xx.i[0] = (xx.i[0] & 0xfffff) | 0x3ff00000;
134 	yy.i[0] = (yy.i[0] & 0xfffff) | 0x3ff00000;
135 	x = xx.d;
136 	y = yy.d;
137 	xhi = ((x + twom26) + two27) - two27;
138 	yhi = ((y + twom26) + two27) - two27;
139 	xlo = x - xhi;
140 	ylo = y - yhi;
141 	x *= y;
142 	y = ((xhi * yhi - x) + xhi * ylo + xlo * yhi) + xlo * ylo;
143 	if (x >= two) {
144 		x *= half;
145 		y *= half;
146 		exy++;
147 	}
148 
149 	/* extract the significands */
150 	xx.d = x;
151 	xy0 = (xx.i[0] & 0xfffff) | 0x100000;
152 	xy1 = xx.i[1];
153 	yy.d = t = y + twom32;
154 	xy2 = yy.i[1];
155 	yy.d = (y - (t - twom32)) + twom64;
156 	xy3 = yy.i[1];
157 	z0 = (zz.i[0] & 0xfffff) | 0x100000;
158 	z1 = zz.i[1];
159 	z2 = z3 = 0;
160 
161 	/*
162 	 * now x*y is represented by sxy, exy, and xy[0-3], and z is
163 	 * represented likewise; swap if need be so |xy| <= |z|
164 	 */
165 	if (exy > ez || (exy == ez && (xy0 > z0 || (xy0 == z0 &&
166 		(xy1 > z1 || (xy1 == z1 && (xy2 | xy3) != 0)))))) {
167 		e = sxy; sxy = sz; sz = e;
168 		e = exy; exy = ez; ez = e;
169 		e = xy0; xy0 = z0; z0 = e;
170 		e = xy1; xy1 = z1; z1 = e;
171 		z2 = xy2; xy2 = 0;
172 		z3 = xy3; xy3 = 0;
173 	}
174 
175 	/* shift the significand of xy keeping a sticky bit */
176 	e = ez - exy;
177 	if (e > 116) {
178 		xy0 = xy1 = xy2 = 0;
179 		xy3 = 1;
180 	} else if (e >= 96) {
181 		sticky = xy3 | xy2 | xy1 | ((xy0 << 1) << (127 - e));
182 		xy3 = xy0 >> (e - 96);
183 		if (sticky)
184 			xy3 |= 1;
185 		xy0 = xy1 = xy2 = 0;
186 	} else if (e >= 64) {
187 		sticky = xy3 | xy2 | ((xy1 << 1) << (95 - e));
188 		xy3 = (xy1 >> (e - 64)) | ((xy0 << 1) << (95 - e));
189 		if (sticky)
190 			xy3 |= 1;
191 		xy2 = xy0 >> (e - 64);
192 		xy0 = xy1 = 0;
193 	} else if (e >= 32) {
194 		sticky = xy3 | ((xy2 << 1) << (63 - e));
195 		xy3 = (xy2 >> (e - 32)) | ((xy1 << 1) << (63 - e));
196 		if (sticky)
197 			xy3 |= 1;
198 		xy2 = (xy1 >> (e - 32)) | ((xy0 << 1) << (63 - e));
199 		xy1 = xy0 >> (e - 32);
200 		xy0 = 0;
201 	} else if (e) {
202 		sticky = (xy3 << 1) << (31 - e);
203 		xy3 = (xy3 >> e) | ((xy2 << 1) << (31 - e));
204 		if (sticky)
205 			xy3 |= 1;
206 		xy2 = (xy2 >> e) | ((xy1 << 1) << (31 - e));
207 		xy1 = (xy1 >> e) | ((xy0 << 1) << (31 - e));
208 		xy0 >>= e;
209 	}
210 
211 	/* if this is a magnitude subtract, negate the significand of xy */
212 	if (sxy ^ sz) {
213 		xy0 = ~xy0;
214 		xy1 = ~xy1;
215 		xy2 = ~xy2;
216 		xy3 = -xy3;
217 		if (xy3 == 0)
218 			if (++xy2 == 0)
219 				if (++xy1 == 0)
220 					xy0++;
221 	}
222 
223 	/* add, propagating carries */
224 	z3 += xy3;
225 	e = (z3 < xy3);
226 	z2 += xy2;
227 	if (e) {
228 		z2++;
229 		e = (z2 <= xy2);
230 	} else
231 		e = (z2 < xy2);
232 	z1 += xy1;
233 	if (e) {
234 		z1++;
235 		e = (z1 <= xy1);
236 	} else
237 		e = (z1 < xy1);
238 	z0 += xy0;
239 	if (e)
240 		z0++;
241 
242 	/* postnormalize and collect rounding information into z2 */
243 	if (ez < 1) {
244 		/* result is tiny; shift right until exponent is within range */
245 		e = 1 - ez;
246 		if (e > 56) {
247 			z2 = 1;	/* result can't be exactly zero */
248 			z0 = z1 = 0;
249 		} else if (e >= 32) {
250 			sticky = z3 | z2 | ((z1 << 1) << (63 - e));
251 			z2 = (z1 >> (e - 32)) | ((z0 << 1) << (63 - e));
252 			if (sticky)
253 				z2 |= 1;
254 			z1 = z0 >> (e - 32);
255 			z0 = 0;
256 		} else {
257 			sticky = z3 | (z2 << 1) << (31 - e);
258 			z2 = (z2 >> e) | ((z1 << 1) << (31 - e));
259 			if (sticky)
260 				z2 |= 1;
261 			z1 = (z1 >> e) | ((z0 << 1) << (31 - e));
262 			z0 >>= e;
263 		}
264 		ez = 1;
265 	} else if (z0 >= 0x200000) {
266 		/* carry out; shift right by one */
267 		sticky = (z2 & 1) | z3;
268 		z2 = (z2 >> 1) | (z1 << 31);
269 		if (sticky)
270 			z2 |= 1;
271 		z1 = (z1 >> 1) | (z0 << 31);
272 		z0 >>= 1;
273 		ez++;
274 	} else {
275 		if (z0 < 0x100000 && (z0 | z1 | z2 | z3) != 0) {
276 			/*
277 			 * borrow/cancellation; shift left as much as
278 			 * exponent allows
279 			 */
280 			while (!(z0 | (z1 & 0xffe00000)) && ez >= 33) {
281 				z0 = z1;
282 				z1 = z2;
283 				z2 = z3;
284 				z3 = 0;
285 				ez -= 32;
286 			}
287 			while (z0 < 0x100000 && ez > 1) {
288 				z0 = (z0 << 1) | (z1 >> 31);
289 				z1 = (z1 << 1) | (z2 >> 31);
290 				z2 = (z2 << 1) | (z3 >> 31);
291 				z3 <<= 1;
292 				ez--;
293 			}
294 		}
295 		if (z3)
296 			z2 |= 1;
297 	}
298 
299 	/* get the rounding mode and clear current exceptions */
300 	rm = fsr >> 30;
301 	fsr &= ~FSR_CEXC;
302 
303 	/* strip off the integer bit, if there is one */
304 	ibit = z0 & 0x100000;
305 	if (ibit)
306 		z0 -= 0x100000;
307 	else {
308 		ez = 0;
309 		if (!(z0 | z1 | z2)) { /* exact zero */
310 			zz.i[0] = rm == FSR_RM ? 0x80000000 : 0;
311 			zz.i[1] = 0;
312 			__fenv_setfsr32(&fsr);
313 			return (zz.d);
314 		}
315 	}
316 
317 	/*
318 	 * flip the sense of directed roundings if the result is negative;
319 	 * the logic below applies to a positive result
320 	 */
321 	if (sz)
322 		rm ^= rm >> 1;
323 
324 	/* round and raise exceptions */
325 	if (z2) {
326 		fsr |= FSR_NXC;
327 
328 		/* decide whether to round the fraction up */
329 		if (rm == FSR_RP || (rm == FSR_RN && (z2 > 0x80000000u ||
330 			(z2 == 0x80000000u && (z1 & 1))))) {
331 			/* round up and renormalize if necessary */
332 			if (++z1 == 0) {
333 				if (++z0 == 0x100000) {
334 					z0 = 0;
335 					ez++;
336 				}
337 			}
338 		}
339 	}
340 
341 	/* check for under/overflow */
342 	if (ez >= 0x7ff) {
343 		if (rm == FSR_RN || rm == FSR_RP) {
344 			zz.i[0] = sz | 0x7ff00000;
345 			zz.i[1] = 0;
346 		} else {
347 			zz.i[0] = sz | 0x7fefffff;
348 			zz.i[1] = 0xffffffff;
349 		}
350 		fsr |= FSR_OFC | FSR_NXC;
351 	} else {
352 		zz.i[0] = sz | (ez << 20) | z0;
353 		zz.i[1] = z1;
354 
355 		/*
356 		 * !ibit => exact result was tiny before rounding,
357 		 * z2 nonzero => result delivered is inexact
358 		 */
359 		if (!ibit) {
360 			if (z2)
361 				fsr |= FSR_UFC | FSR_NXC;
362 			else if (fsr & FSR_UFM)
363 				fsr |= FSR_UFC;
364 		}
365 	}
366 
367 	/* restore the fsr and emulate exceptions as needed */
368 	if ((fsr & FSR_CEXC) & (fsr >> 23)) {
369 		__fenv_setfsr32(&fsr);
370 		if (fsr & FSR_OFC) {
371 			dummy = huge;
372 			dummy *= huge;
373 		} else if (fsr & FSR_UFC) {
374 			dummy = tiny;
375 			if (fsr & FSR_NXC)
376 				dummy *= tiny;
377 			else
378 				dummy -= tiny2;
379 		} else {
380 			dummy = huge;
381 			dummy += tiny;
382 		}
383 	} else {
384 		fsr |= (fsr & 0x1f) << 5;
385 		__fenv_setfsr32(&fsr);
386 	}
387 	return (zz.d);
388 }
389 
390 #elif defined(__x86)
391 
392 #if defined(__amd64)
393 #define	NI	4
394 #else
395 #define	NI	3
396 #endif
397 
398 /*
399  *  fma for x86: 64-bit double precision, little-endian
400  */
401 double
402 __fma(double x, double y, double z) {
403 	union {
404 		unsigned i[NI];
405 		long double e;
406 	} xx, yy, zz;
407 	long double xe, ye, xhi, xlo, yhi, ylo;
408 	int ex, ey, ez;
409 	unsigned cwsw, oldcwsw, rm;
410 
411 	/* convert the operands to double extended */
412 	xx.e = (long double) x;
413 	yy.e = (long double) y;
414 	zz.e = (long double) z;
415 
416 	/* extract the exponents of the arguments */
417 	ex = xx.i[2] & 0x7fff;
418 	ey = yy.i[2] & 0x7fff;
419 	ez = zz.i[2] & 0x7fff;
420 
421 	/* dispense with inf, nan, and zero cases */
422 	if (ex == 0x7fff || ey == 0x7fff || ex == 0 || ey == 0)
423 		/* x or y is inf, nan, or zero */
424 		return ((double) (xx.e * yy.e + zz.e));
425 
426 	if (ez >= 0x7fff) /* z is inf or nan */
427 		return ((double) (xx.e + zz.e));
428 					/* avoid spurious inexact in x * y */
429 
430 	/*
431 	 * save the control and status words, mask all exceptions, and
432 	 * set rounding to 64-bit precision and to-nearest
433 	 */
434 	__fenv_getcwsw(&oldcwsw);
435 	cwsw = (oldcwsw & 0xf0c0ffff) | 0x033f0000;
436 	__fenv_setcwsw(&cwsw);
437 
438 	/* multiply x*y to 106 bits */
439 	xe = xx.e;
440 	xx.i[0] = 0;
441 	xhi = xx.e; /* hi 32 bits */
442 	xlo = xe - xhi; /* lo 21 bits */
443 	ye = yy.e;
444 	yy.i[0] = 0;
445 	yhi = yy.e;
446 	ylo = ye - yhi;
447 	xe = xe * ye;
448 	ye = ((xhi * yhi - xe) + xhi * ylo + xlo * yhi) + xlo * ylo;
449 
450 	/* distill the sum of xe, ye, and z */
451 	xhi = ye + zz.e;
452 	yhi = xhi - ye;
453 	xlo = (zz.e - yhi) + (ye - (xhi - yhi));
454 						/* now (xhi,xlo) = ye + z */
455 
456 	yhi = xe + xhi;
457 	ye = yhi - xe;
458 	ylo = (xhi - ye) + (xe - (yhi - ye));	/* now (yhi,ylo) = xe + xhi */
459 
460 	xhi = xlo + ylo;
461 	xe = xhi - xlo;
462 	xlo = (ylo - xe) + (xlo - (xhi - xe));	/* now (xhi,xlo) = xlo + ylo */
463 
464 	yy.e = yhi + xhi;
465 	ylo = (yhi - yy.e) + xhi;		/* now (yy.e,ylo) = xhi + yhi */
466 
467 	if (yy.i[1] != 0) {	/* yy.e is nonzero */
468 		/* perturb yy.e if its least significant 10 bits are zero */
469 		if (!(yy.i[0] & 0x3ff)) {
470 			xx.e = ylo + xlo;
471 			if (xx.i[1] != 0) {
472 				xx.i[2] = (xx.i[2] & 0x8000) |
473 					((yy.i[2] & 0x7fff) - 63);
474 				xx.i[1] = 0x80000000;
475 				xx.i[0] = 0;
476 				yy.e += xx.e;
477 			}
478 		}
479 	} else {
480 		/* set sign of zero result according to rounding direction */
481 		rm = oldcwsw & 0x0c000000;
482 		yy.i[2] = ((rm == FCW_RM)? 0x8000 : 0);
483 	}
484 
485 	/*
486 	 * restore the control and status words and convert the result
487 	 * to double
488 	 */
489 	__fenv_setcwsw(&oldcwsw);
490 	return ((double) yy.e);
491 }
492 
493 #else
494 #error Unknown architecture
495 #endif
496