xref: /titanic_52/usr/src/lib/libm/common/m9x/fmaf.c (revision 1ec68d336ba97cd53f46053ac10401d16014d075)
1*25c28e83SPiotr Jasiukajtis /*
2*25c28e83SPiotr Jasiukajtis  * CDDL HEADER START
3*25c28e83SPiotr Jasiukajtis  *
4*25c28e83SPiotr Jasiukajtis  * The contents of this file are subject to the terms of the
5*25c28e83SPiotr Jasiukajtis  * Common Development and Distribution License (the "License").
6*25c28e83SPiotr Jasiukajtis  * You may not use this file except in compliance with the License.
7*25c28e83SPiotr Jasiukajtis  *
8*25c28e83SPiotr Jasiukajtis  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9*25c28e83SPiotr Jasiukajtis  * or http://www.opensolaris.org/os/licensing.
10*25c28e83SPiotr Jasiukajtis  * See the License for the specific language governing permissions
11*25c28e83SPiotr Jasiukajtis  * and limitations under the License.
12*25c28e83SPiotr Jasiukajtis  *
13*25c28e83SPiotr Jasiukajtis  * When distributing Covered Code, include this CDDL HEADER in each
14*25c28e83SPiotr Jasiukajtis  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15*25c28e83SPiotr Jasiukajtis  * If applicable, add the following below this CDDL HEADER, with the
16*25c28e83SPiotr Jasiukajtis  * fields enclosed by brackets "[]" replaced with your own identifying
17*25c28e83SPiotr Jasiukajtis  * information: Portions Copyright [yyyy] [name of copyright owner]
18*25c28e83SPiotr Jasiukajtis  *
19*25c28e83SPiotr Jasiukajtis  * CDDL HEADER END
20*25c28e83SPiotr Jasiukajtis  */
21*25c28e83SPiotr Jasiukajtis 
22*25c28e83SPiotr Jasiukajtis /*
23*25c28e83SPiotr Jasiukajtis  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
24*25c28e83SPiotr Jasiukajtis  */
25*25c28e83SPiotr Jasiukajtis /*
26*25c28e83SPiotr Jasiukajtis  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
27*25c28e83SPiotr Jasiukajtis  * Use is subject to license terms.
28*25c28e83SPiotr Jasiukajtis  */
29*25c28e83SPiotr Jasiukajtis 
30*25c28e83SPiotr Jasiukajtis #pragma weak fmaf = __fmaf
31*25c28e83SPiotr Jasiukajtis 
32*25c28e83SPiotr Jasiukajtis #include "libm.h"
33*25c28e83SPiotr Jasiukajtis #include "fma.h"
34*25c28e83SPiotr Jasiukajtis #include "fenv_inlines.h"
35*25c28e83SPiotr Jasiukajtis 
36*25c28e83SPiotr Jasiukajtis #if defined(__sparc)
37*25c28e83SPiotr Jasiukajtis 
38*25c28e83SPiotr Jasiukajtis /*
39*25c28e83SPiotr Jasiukajtis  * fmaf for SPARC: 32-bit single precision, big-endian
40*25c28e83SPiotr Jasiukajtis  */
41*25c28e83SPiotr Jasiukajtis float
42*25c28e83SPiotr Jasiukajtis __fmaf(float x, float y, float z) {
43*25c28e83SPiotr Jasiukajtis 	union {
44*25c28e83SPiotr Jasiukajtis 		unsigned i[2];
45*25c28e83SPiotr Jasiukajtis 		double d;
46*25c28e83SPiotr Jasiukajtis 	} xy, zz;
47*25c28e83SPiotr Jasiukajtis 	unsigned u, s;
48*25c28e83SPiotr Jasiukajtis 	int exy, ez;
49*25c28e83SPiotr Jasiukajtis 
50*25c28e83SPiotr Jasiukajtis 	/*
51*25c28e83SPiotr Jasiukajtis 	 * the following operations can only raise the invalid exception,
52*25c28e83SPiotr Jasiukajtis 	 * and then only if either x*y is of the form Inf*0 or one of x,
53*25c28e83SPiotr Jasiukajtis 	 * y, or z is a signaling NaN
54*25c28e83SPiotr Jasiukajtis 	 */
55*25c28e83SPiotr Jasiukajtis 	xy.d = (double) x * y;
56*25c28e83SPiotr Jasiukajtis 	zz.d = (double) z;
57*25c28e83SPiotr Jasiukajtis 
58*25c28e83SPiotr Jasiukajtis 	/*
59*25c28e83SPiotr Jasiukajtis 	 * if the sum xy + z will be exact, just compute it and cast the
60*25c28e83SPiotr Jasiukajtis 	 * result to float
61*25c28e83SPiotr Jasiukajtis 	 */
62*25c28e83SPiotr Jasiukajtis 	exy = (xy.i[0] >> 20) & 0x7ff;
63*25c28e83SPiotr Jasiukajtis 	ez = (zz.i[0] >> 20) & 0x7ff;
64*25c28e83SPiotr Jasiukajtis 	if ((ez - exy <= 4 && exy - ez <= 28) || exy == 0x7ff || exy == 0 ||
65*25c28e83SPiotr Jasiukajtis 		ez == 0x7ff || ez == 0) {
66*25c28e83SPiotr Jasiukajtis 		return ((float) (xy.d + zz.d));
67*25c28e83SPiotr Jasiukajtis 	}
68*25c28e83SPiotr Jasiukajtis 
69*25c28e83SPiotr Jasiukajtis 	/*
70*25c28e83SPiotr Jasiukajtis 	 * collapse the tail of the smaller summand into a "sticky bit"
71*25c28e83SPiotr Jasiukajtis 	 * so that the sum can be computed without error
72*25c28e83SPiotr Jasiukajtis 	 */
73*25c28e83SPiotr Jasiukajtis 	if (ez > exy) {
74*25c28e83SPiotr Jasiukajtis 		if (ez - exy < 31) {
75*25c28e83SPiotr Jasiukajtis 			u = xy.i[1];
76*25c28e83SPiotr Jasiukajtis 			s = 2 << (ez - exy);
77*25c28e83SPiotr Jasiukajtis 			if (u & (s - 1))
78*25c28e83SPiotr Jasiukajtis 				u |= s;
79*25c28e83SPiotr Jasiukajtis 			xy.i[1] = u & ~(s - 1);
80*25c28e83SPiotr Jasiukajtis 		} else if (ez - exy < 51) {
81*25c28e83SPiotr Jasiukajtis 			u = xy.i[0];
82*25c28e83SPiotr Jasiukajtis 			s = 1 << (ez - exy - 31);
83*25c28e83SPiotr Jasiukajtis 			if ((u & (s - 1)) | xy.i[1])
84*25c28e83SPiotr Jasiukajtis 				u |= s;
85*25c28e83SPiotr Jasiukajtis 			xy.i[0] = u & ~(s - 1);
86*25c28e83SPiotr Jasiukajtis 			xy.i[1] = 0;
87*25c28e83SPiotr Jasiukajtis 		} else {
88*25c28e83SPiotr Jasiukajtis 			/* collapse all of xy into a single bit */
89*25c28e83SPiotr Jasiukajtis 			xy.i[0] = (xy.i[0] & 0x80000000) | ((ez - 51) << 20);
90*25c28e83SPiotr Jasiukajtis 			xy.i[1] = 0;
91*25c28e83SPiotr Jasiukajtis 		}
92*25c28e83SPiotr Jasiukajtis 	} else {
93*25c28e83SPiotr Jasiukajtis 		if (exy - ez < 31) {
94*25c28e83SPiotr Jasiukajtis 			u = zz.i[1];
95*25c28e83SPiotr Jasiukajtis 			s = 2 << (exy - ez);
96*25c28e83SPiotr Jasiukajtis 			if (u & (s - 1))
97*25c28e83SPiotr Jasiukajtis 				u |= s;
98*25c28e83SPiotr Jasiukajtis 			zz.i[1] = u & ~(s - 1);
99*25c28e83SPiotr Jasiukajtis 		} else if (exy - ez < 51) {
100*25c28e83SPiotr Jasiukajtis 			u = zz.i[0];
101*25c28e83SPiotr Jasiukajtis 			s = 1 << (exy - ez - 31);
102*25c28e83SPiotr Jasiukajtis 			if ((u & (s - 1)) | zz.i[1])
103*25c28e83SPiotr Jasiukajtis 				u |= s;
104*25c28e83SPiotr Jasiukajtis 			zz.i[0] = u & ~(s - 1);
105*25c28e83SPiotr Jasiukajtis 			zz.i[1] = 0;
106*25c28e83SPiotr Jasiukajtis 		} else {
107*25c28e83SPiotr Jasiukajtis 			/* collapse all of zz into a single bit */
108*25c28e83SPiotr Jasiukajtis 			zz.i[0] = (zz.i[0] & 0x80000000) | ((exy - 51) << 20);
109*25c28e83SPiotr Jasiukajtis 			zz.i[1] = 0;
110*25c28e83SPiotr Jasiukajtis 		}
111*25c28e83SPiotr Jasiukajtis 	}
112*25c28e83SPiotr Jasiukajtis 
113*25c28e83SPiotr Jasiukajtis 	return ((float) (xy.d + zz.d));
114*25c28e83SPiotr Jasiukajtis }
115*25c28e83SPiotr Jasiukajtis 
116*25c28e83SPiotr Jasiukajtis #elif defined(__x86)
117*25c28e83SPiotr Jasiukajtis 
118*25c28e83SPiotr Jasiukajtis #if defined(__amd64)
119*25c28e83SPiotr Jasiukajtis #define	NI	4
120*25c28e83SPiotr Jasiukajtis #else
121*25c28e83SPiotr Jasiukajtis #define	NI	3
122*25c28e83SPiotr Jasiukajtis #endif
123*25c28e83SPiotr Jasiukajtis 
124*25c28e83SPiotr Jasiukajtis /*
125*25c28e83SPiotr Jasiukajtis  * fmaf for x86: 32-bit single precision, little-endian
126*25c28e83SPiotr Jasiukajtis  */
127*25c28e83SPiotr Jasiukajtis float
128*25c28e83SPiotr Jasiukajtis __fmaf(float x, float y, float z) {
129*25c28e83SPiotr Jasiukajtis 	union {
130*25c28e83SPiotr Jasiukajtis 		unsigned i[NI];
131*25c28e83SPiotr Jasiukajtis 		long double e;
132*25c28e83SPiotr Jasiukajtis 	} xy, zz;
133*25c28e83SPiotr Jasiukajtis 	unsigned u, s, cwsw, oldcwsw;
134*25c28e83SPiotr Jasiukajtis 	int exy, ez;
135*25c28e83SPiotr Jasiukajtis 
136*25c28e83SPiotr Jasiukajtis 	/* set rounding precision to 64 bits */
137*25c28e83SPiotr Jasiukajtis 	__fenv_getcwsw(&oldcwsw);
138*25c28e83SPiotr Jasiukajtis 	cwsw = (oldcwsw & 0xfcffffff) | 0x03000000;
139*25c28e83SPiotr Jasiukajtis 	__fenv_setcwsw(&cwsw);
140*25c28e83SPiotr Jasiukajtis 
141*25c28e83SPiotr Jasiukajtis 	/*
142*25c28e83SPiotr Jasiukajtis 	 * the following operations can only raise the invalid exception,
143*25c28e83SPiotr Jasiukajtis 	 * and then only if either x*y is of the form Inf*0 or one of x,
144*25c28e83SPiotr Jasiukajtis 	 * y, or z is a signaling NaN
145*25c28e83SPiotr Jasiukajtis 	 */
146*25c28e83SPiotr Jasiukajtis 	xy.e = (long double) x * y;
147*25c28e83SPiotr Jasiukajtis 	zz.e = (long double) z;
148*25c28e83SPiotr Jasiukajtis 
149*25c28e83SPiotr Jasiukajtis 	/*
150*25c28e83SPiotr Jasiukajtis 	 * if the sum xy + z will be exact, just compute it and cast the
151*25c28e83SPiotr Jasiukajtis 	 * result to float
152*25c28e83SPiotr Jasiukajtis 	 */
153*25c28e83SPiotr Jasiukajtis 	exy = xy.i[2] & 0x7fff;
154*25c28e83SPiotr Jasiukajtis 	ez = zz.i[2] & 0x7fff;
155*25c28e83SPiotr Jasiukajtis 	if ((ez - exy <= 15 && exy - ez <= 39) || exy == 0x7fff || exy == 0 ||
156*25c28e83SPiotr Jasiukajtis 		ez == 0x7fff || ez == 0) {
157*25c28e83SPiotr Jasiukajtis 		goto cont;
158*25c28e83SPiotr Jasiukajtis 	}
159*25c28e83SPiotr Jasiukajtis 
160*25c28e83SPiotr Jasiukajtis 	/*
161*25c28e83SPiotr Jasiukajtis 	 * collapse the tail of the smaller summand into a "sticky bit"
162*25c28e83SPiotr Jasiukajtis 	 * so that the sum can be computed without error
163*25c28e83SPiotr Jasiukajtis 	 */
164*25c28e83SPiotr Jasiukajtis 	if (ez > exy) {
165*25c28e83SPiotr Jasiukajtis 		if (ez - exy < 31) {
166*25c28e83SPiotr Jasiukajtis 			u = xy.i[0];
167*25c28e83SPiotr Jasiukajtis 			s = 2 << (ez - exy);
168*25c28e83SPiotr Jasiukajtis 			if (u & (s - 1))
169*25c28e83SPiotr Jasiukajtis 				u |= s;
170*25c28e83SPiotr Jasiukajtis 			xy.i[0] = u & ~(s - 1);
171*25c28e83SPiotr Jasiukajtis 		} else if (ez - exy < 62) {
172*25c28e83SPiotr Jasiukajtis 			u = xy.i[1];
173*25c28e83SPiotr Jasiukajtis 			s = 1 << (ez - exy - 31);
174*25c28e83SPiotr Jasiukajtis 			if ((u & (s - 1)) | xy.i[0])
175*25c28e83SPiotr Jasiukajtis 				u |= s;
176*25c28e83SPiotr Jasiukajtis 			xy.i[1] = u & ~(s - 1);
177*25c28e83SPiotr Jasiukajtis 			xy.i[0] = 0;
178*25c28e83SPiotr Jasiukajtis 		} else {
179*25c28e83SPiotr Jasiukajtis 			/* collapse all of xy into a single bit */
180*25c28e83SPiotr Jasiukajtis 			xy.i[0] = 0;
181*25c28e83SPiotr Jasiukajtis 			xy.i[1] = 0x80000000;
182*25c28e83SPiotr Jasiukajtis 			xy.i[2] = (xy.i[2] & 0x8000) | (ez - 62);
183*25c28e83SPiotr Jasiukajtis 		}
184*25c28e83SPiotr Jasiukajtis 	} else {
185*25c28e83SPiotr Jasiukajtis 		if (exy - ez < 62) {
186*25c28e83SPiotr Jasiukajtis 			u = zz.i[1];
187*25c28e83SPiotr Jasiukajtis 			s = 1 << (exy - ez - 31);
188*25c28e83SPiotr Jasiukajtis 			if ((u & (s - 1)) | zz.i[0])
189*25c28e83SPiotr Jasiukajtis 				u |= s;
190*25c28e83SPiotr Jasiukajtis 			zz.i[1] = u & ~(s - 1);
191*25c28e83SPiotr Jasiukajtis 			zz.i[0] = 0;
192*25c28e83SPiotr Jasiukajtis 		} else {
193*25c28e83SPiotr Jasiukajtis 			/* collapse all of zz into a single bit */
194*25c28e83SPiotr Jasiukajtis 			zz.i[0] = 0;
195*25c28e83SPiotr Jasiukajtis 			zz.i[1] = 0x80000000;
196*25c28e83SPiotr Jasiukajtis 			zz.i[2] = (zz.i[2] & 0x8000) | (exy - 62);
197*25c28e83SPiotr Jasiukajtis 		}
198*25c28e83SPiotr Jasiukajtis 	}
199*25c28e83SPiotr Jasiukajtis 
200*25c28e83SPiotr Jasiukajtis cont:
201*25c28e83SPiotr Jasiukajtis 	xy.e += zz.e;
202*25c28e83SPiotr Jasiukajtis 
203*25c28e83SPiotr Jasiukajtis 	/* restore the rounding precision */
204*25c28e83SPiotr Jasiukajtis 	__fenv_getcwsw(&cwsw);
205*25c28e83SPiotr Jasiukajtis 	cwsw = (cwsw & 0xfcffffff) | (oldcwsw & 0x03000000);
206*25c28e83SPiotr Jasiukajtis 	__fenv_setcwsw(&cwsw);
207*25c28e83SPiotr Jasiukajtis 
208*25c28e83SPiotr Jasiukajtis 	return ((float) xy.e);
209*25c28e83SPiotr Jasiukajtis }
210*25c28e83SPiotr Jasiukajtis 
211*25c28e83SPiotr Jasiukajtis #if 0
212*25c28e83SPiotr Jasiukajtis /*
213*25c28e83SPiotr Jasiukajtis  * another fmaf for x86: assumes return value will be left in
214*25c28e83SPiotr Jasiukajtis  * long double (80-bit double extended) precision
215*25c28e83SPiotr Jasiukajtis  */
216*25c28e83SPiotr Jasiukajtis long double
217*25c28e83SPiotr Jasiukajtis __fmaf(float x, float y, float z) {
218*25c28e83SPiotr Jasiukajtis 	/*
219*25c28e83SPiotr Jasiukajtis 	 * Note: This implementation assumes the rounding precision mode
220*25c28e83SPiotr Jasiukajtis 	 * is set to the default, rounding to 64 bit precision.  If this
221*25c28e83SPiotr Jasiukajtis 	 * routine must work in non-default rounding precision modes, do
222*25c28e83SPiotr Jasiukajtis 	 * the following instead:
223*25c28e83SPiotr Jasiukajtis 	 *
224*25c28e83SPiotr Jasiukajtis 	 *   long double t;
225*25c28e83SPiotr Jasiukajtis 	 *
226*25c28e83SPiotr Jasiukajtis 	 *   <set rp mode to round to 64 bit precision>
227*25c28e83SPiotr Jasiukajtis 	 *   t = x * y;
228*25c28e83SPiotr Jasiukajtis 	 *   <restore rp mode>
229*25c28e83SPiotr Jasiukajtis 	 *   return t + z;
230*25c28e83SPiotr Jasiukajtis 	 *
231*25c28e83SPiotr Jasiukajtis 	 * Note that the code to change rounding precision must not alter
232*25c28e83SPiotr Jasiukajtis 	 * the exception masks or flags, since the product x * y may raise
233*25c28e83SPiotr Jasiukajtis 	 * an invalid operation exception.
234*25c28e83SPiotr Jasiukajtis 	 */
235*25c28e83SPiotr Jasiukajtis 	return ((long double) x * y + z);
236*25c28e83SPiotr Jasiukajtis }
237*25c28e83SPiotr Jasiukajtis #endif
238*25c28e83SPiotr Jasiukajtis 
239*25c28e83SPiotr Jasiukajtis #else
240*25c28e83SPiotr Jasiukajtis #error Unknown architecture
241*25c28e83SPiotr Jasiukajtis #endif
242