xref: /illumos-gate/usr/src/lib/libm/common/m9x/fmaf.c (revision cffcfaee1e6b29ef9ceb7d80e4e053ffd029906b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
24  */
25 /*
26  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
27  * Use is subject to license terms.
28  */
29 
30 #if defined(ELFOBJ)
31 #pragma weak fmaf = __fmaf
32 #endif
33 
34 #include "libm.h"
35 #include "fma.h"
36 #include "fenv_inlines.h"
37 
38 #if defined(__sparc)
39 
40 /*
41  * fmaf for SPARC: 32-bit single precision, big-endian
42  */
43 float
44 __fmaf(float x, float y, float z) {
45 	union {
46 		unsigned i[2];
47 		double d;
48 	} xy, zz;
49 	unsigned u, s;
50 	int exy, ez;
51 
52 	/*
53 	 * the following operations can only raise the invalid exception,
54 	 * and then only if either x*y is of the form Inf*0 or one of x,
55 	 * y, or z is a signaling NaN
56 	 */
57 	xy.d = (double) x * y;
58 	zz.d = (double) z;
59 
60 	/*
61 	 * if the sum xy + z will be exact, just compute it and cast the
62 	 * result to float
63 	 */
64 	exy = (xy.i[0] >> 20) & 0x7ff;
65 	ez = (zz.i[0] >> 20) & 0x7ff;
66 	if ((ez - exy <= 4 && exy - ez <= 28) || exy == 0x7ff || exy == 0 ||
67 		ez == 0x7ff || ez == 0) {
68 		return ((float) (xy.d + zz.d));
69 	}
70 
71 	/*
72 	 * collapse the tail of the smaller summand into a "sticky bit"
73 	 * so that the sum can be computed without error
74 	 */
75 	if (ez > exy) {
76 		if (ez - exy < 31) {
77 			u = xy.i[1];
78 			s = 2 << (ez - exy);
79 			if (u & (s - 1))
80 				u |= s;
81 			xy.i[1] = u & ~(s - 1);
82 		} else if (ez - exy < 51) {
83 			u = xy.i[0];
84 			s = 1 << (ez - exy - 31);
85 			if ((u & (s - 1)) | xy.i[1])
86 				u |= s;
87 			xy.i[0] = u & ~(s - 1);
88 			xy.i[1] = 0;
89 		} else {
90 			/* collapse all of xy into a single bit */
91 			xy.i[0] = (xy.i[0] & 0x80000000) | ((ez - 51) << 20);
92 			xy.i[1] = 0;
93 		}
94 	} else {
95 		if (exy - ez < 31) {
96 			u = zz.i[1];
97 			s = 2 << (exy - ez);
98 			if (u & (s - 1))
99 				u |= s;
100 			zz.i[1] = u & ~(s - 1);
101 		} else if (exy - ez < 51) {
102 			u = zz.i[0];
103 			s = 1 << (exy - ez - 31);
104 			if ((u & (s - 1)) | zz.i[1])
105 				u |= s;
106 			zz.i[0] = u & ~(s - 1);
107 			zz.i[1] = 0;
108 		} else {
109 			/* collapse all of zz into a single bit */
110 			zz.i[0] = (zz.i[0] & 0x80000000) | ((exy - 51) << 20);
111 			zz.i[1] = 0;
112 		}
113 	}
114 
115 	return ((float) (xy.d + zz.d));
116 }
117 
118 #elif defined(__x86)
119 
120 #if defined(__amd64)
121 #define	NI	4
122 #else
123 #define	NI	3
124 #endif
125 
126 /*
127  * fmaf for x86: 32-bit single precision, little-endian
128  */
129 float
130 __fmaf(float x, float y, float z) {
131 	union {
132 		unsigned i[NI];
133 		long double e;
134 	} xy, zz;
135 	unsigned u, s, cwsw, oldcwsw;
136 	int exy, ez;
137 
138 	/* set rounding precision to 64 bits */
139 	__fenv_getcwsw(&oldcwsw);
140 	cwsw = (oldcwsw & 0xfcffffff) | 0x03000000;
141 	__fenv_setcwsw(&cwsw);
142 
143 	/*
144 	 * the following operations can only raise the invalid exception,
145 	 * and then only if either x*y is of the form Inf*0 or one of x,
146 	 * y, or z is a signaling NaN
147 	 */
148 	xy.e = (long double) x * y;
149 	zz.e = (long double) z;
150 
151 	/*
152 	 * if the sum xy + z will be exact, just compute it and cast the
153 	 * result to float
154 	 */
155 	exy = xy.i[2] & 0x7fff;
156 	ez = zz.i[2] & 0x7fff;
157 	if ((ez - exy <= 15 && exy - ez <= 39) || exy == 0x7fff || exy == 0 ||
158 		ez == 0x7fff || ez == 0) {
159 		goto cont;
160 	}
161 
162 	/*
163 	 * collapse the tail of the smaller summand into a "sticky bit"
164 	 * so that the sum can be computed without error
165 	 */
166 	if (ez > exy) {
167 		if (ez - exy < 31) {
168 			u = xy.i[0];
169 			s = 2 << (ez - exy);
170 			if (u & (s - 1))
171 				u |= s;
172 			xy.i[0] = u & ~(s - 1);
173 		} else if (ez - exy < 62) {
174 			u = xy.i[1];
175 			s = 1 << (ez - exy - 31);
176 			if ((u & (s - 1)) | xy.i[0])
177 				u |= s;
178 			xy.i[1] = u & ~(s - 1);
179 			xy.i[0] = 0;
180 		} else {
181 			/* collapse all of xy into a single bit */
182 			xy.i[0] = 0;
183 			xy.i[1] = 0x80000000;
184 			xy.i[2] = (xy.i[2] & 0x8000) | (ez - 62);
185 		}
186 	} else {
187 		if (exy - ez < 62) {
188 			u = zz.i[1];
189 			s = 1 << (exy - ez - 31);
190 			if ((u & (s - 1)) | zz.i[0])
191 				u |= s;
192 			zz.i[1] = u & ~(s - 1);
193 			zz.i[0] = 0;
194 		} else {
195 			/* collapse all of zz into a single bit */
196 			zz.i[0] = 0;
197 			zz.i[1] = 0x80000000;
198 			zz.i[2] = (zz.i[2] & 0x8000) | (exy - 62);
199 		}
200 	}
201 
202 cont:
203 	xy.e += zz.e;
204 
205 	/* restore the rounding precision */
206 	__fenv_getcwsw(&cwsw);
207 	cwsw = (cwsw & 0xfcffffff) | (oldcwsw & 0x03000000);
208 	__fenv_setcwsw(&cwsw);
209 
210 	return ((float) xy.e);
211 }
212 
213 #if 0
214 /*
215  * another fmaf for x86: assumes return value will be left in
216  * long double (80-bit double extended) precision
217  */
218 long double
219 __fmaf(float x, float y, float z) {
220 	/*
221 	 * Note: This implementation assumes the rounding precision mode
222 	 * is set to the default, rounding to 64 bit precision.  If this
223 	 * routine must work in non-default rounding precision modes, do
224 	 * the following instead:
225 	 *
226 	 *   long double t;
227 	 *
228 	 *   <set rp mode to round to 64 bit precision>
229 	 *   t = x * y;
230 	 *   <restore rp mode>
231 	 *   return t + z;
232 	 *
233 	 * Note that the code to change rounding precision must not alter
234 	 * the exception masks or flags, since the product x * y may raise
235 	 * an invalid operation exception.
236 	 */
237 	return ((long double) x * y + z);
238 }
239 #endif
240 
241 #else
242 #error Unknown architecture
243 #endif
244