1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
23 */
24 /*
25 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
26 * Use is subject to license terms.
27 */
28
29 /*
30 * __vcosf: single precision vector cos
31 *
32 * Algorithm:
33 *
34 * For |x| < pi/4, approximate sin(x) by a polynomial x+x*z*(S0+
35 * z*(S1+z*S2)) and cos(x) by a polynomial 1+z*(-1/2+z*(C0+z*(C1+
36 * z*C2))), where z = x*x, all evaluated in double precision.
37 *
38 * Accuracy:
39 *
40 * The largest error is less than 0.6 ulps.
41 */
42
43 #include <sys/isa_defs.h>
44
45 #ifdef _LITTLE_ENDIAN
46 #define HI(x) *(1+(int *)&x)
47 #define LO(x) *(unsigned *)&x
48 #else
49 #define HI(x) *(int *)&x
50 #define LO(x) *(1+(unsigned *)&x)
51 #endif
52
53 #ifdef __RESTRICT
54 #define restrict _Restrict
55 #else
56 #define restrict
57 #endif
58
59 extern int __vlibm_rem_pio2m(double *, double *, int, int, int);
60
61 static const double C[] = {
62 -1.66666552424430847168e-01, /* 2^ -3 * -1.5555460000000 */
63 8.33219196647405624390e-03, /* 2^ -7 * 1.11077E0000000 */
64 -1.95187909412197768688e-04, /* 2^-13 * -1.9956B60000000 */
65 1.0,
66 -0.5,
67 4.16666455566883087158e-02, /* 2^ -5 * 1.55554A0000000 */
68 -1.38873036485165357590e-03, /* 2^-10 * -1.6C0C1E0000000 */
69 2.44309903791872784495e-05, /* 2^-16 * 1.99E24E0000000 */
70 0.636619772367581343075535, /* 2^ -1 * 1.45F306DC9C883 */
71 6755399441055744.0, /* 2^ 52 * 1.8000000000000 */
72 1.570796326734125614166, /* 2^ 0 * 1.921FB54400000 */
73 6.077100506506192601475e-11, /* 2^-34 * 1.0B4611A626331 */
74 };
75
76 #define S0 C[0]
77 #define S1 C[1]
78 #define S2 C[2]
79 #define one C[3]
80 #define mhalf C[4]
81 #define C0 C[5]
82 #define C1 C[6]
83 #define C2 C[7]
84 #define invpio2 C[8]
85 #define c3two51 C[9]
86 #define pio2_1 C[10]
87 #define pio2_t C[11]
88
89 #define PREPROCESS(N, index, label) \
90 hx = *(int *)x; \
91 ix = hx & 0x7fffffff; \
92 t = *x; \
93 x += stridex; \
94 if (ix <= 0x3f490fdb) { /* |x| < pi/4 */ \
95 if (ix == 0) { \
96 y[index] = one; \
97 goto label; \
98 } \
99 y##N = (double)t; \
100 n##N = 1; \
101 } else if (ix <= 0x49c90fdb) { /* |x| < 2^19*pi */ \
102 y##N = (double)t; \
103 medium = 1; \
104 } else { \
105 if (ix >= 0x7f800000) { /* inf or nan */ \
106 y[index] = t / t; \
107 goto label; \
108 } \
109 z##N = y##N = (double)t; \
110 hx = HI(y##N); \
111 n##N = ((hx >> 20) & 0x7ff) - 1046; \
112 HI(z##N) = (hx & 0xfffff) | 0x41600000; \
113 n##N = __vlibm_rem_pio2m(&z##N, &y##N, n##N, 1, 0) + 1; \
114 z##N = y##N * y##N; \
115 if (n##N & 1) { /* compute cos y */ \
116 f##N = (float)(one + z##N * (mhalf + z##N * \
117 (C0 + z##N * (C1 + z##N * C2)))); \
118 } else { /* compute sin y */ \
119 f##N = (float)(y##N + y##N * z##N * (S0 + \
120 z##N * (S1 + z##N * S2))); \
121 } \
122 y[index] = (n##N & 2)? -f##N : f##N; \
123 goto label; \
124 }
125
126 #define PROCESS(N) \
127 if (medium) { \
128 z##N = y##N * invpio2 + c3two51; \
129 n##N = LO(z##N) + 1; \
130 z##N -= c3two51; \
131 y##N = (y##N - z##N * pio2_1) - z##N * pio2_t; \
132 } \
133 z##N = y##N * y##N; \
134 if (n##N & 1) { /* compute cos y */ \
135 f##N = (float)(one + z##N * (mhalf + z##N * (C0 + \
136 z##N * (C1 + z##N * C2)))); \
137 } else { /* compute sin y */ \
138 f##N = (float)(y##N + y##N * z##N * (S0 + z##N * (S1 + \
139 z##N * S2))); \
140 } \
141 *y = (n##N & 2)? -f##N : f##N; \
142 y += stridey
143
144 void
__vcosf(int n,float * restrict x,int stridex,float * restrict y,int stridey)145 __vcosf(int n, float *restrict x, int stridex, float *restrict y,
146 int stridey)
147 {
148 double y0, y1, y2, y3;
149 double z0, z1, z2, z3;
150 float f0, f1, f2, f3, t;
151 int n0 = 0, n1 = 0, n2 = 0, n3, hx, ix, medium;
152
153 y -= stridey;
154
155 for (;;) {
156 begin:
157 y += stridey;
158
159 if (--n < 0)
160 break;
161
162 medium = 0;
163 PREPROCESS(0, 0, begin);
164
165 if (--n < 0)
166 goto process1;
167
168 PREPROCESS(1, stridey, process1);
169
170 if (--n < 0)
171 goto process2;
172
173 PREPROCESS(2, (stridey << 1), process2);
174
175 if (--n < 0)
176 goto process3;
177
178 PREPROCESS(3, (stridey << 1) + stridey, process3);
179
180 if (medium) {
181 z0 = y0 * invpio2 + c3two51;
182 z1 = y1 * invpio2 + c3two51;
183 z2 = y2 * invpio2 + c3two51;
184 z3 = y3 * invpio2 + c3two51;
185
186 n0 = LO(z0) + 1;
187 n1 = LO(z1) + 1;
188 n2 = LO(z2) + 1;
189 n3 = LO(z3) + 1;
190
191 z0 -= c3two51;
192 z1 -= c3two51;
193 z2 -= c3two51;
194 z3 -= c3two51;
195
196 y0 = (y0 - z0 * pio2_1) - z0 * pio2_t;
197 y1 = (y1 - z1 * pio2_1) - z1 * pio2_t;
198 y2 = (y2 - z2 * pio2_1) - z2 * pio2_t;
199 y3 = (y3 - z3 * pio2_1) - z3 * pio2_t;
200 }
201
202 z0 = y0 * y0;
203 z1 = y1 * y1;
204 z2 = y2 * y2;
205 z3 = y3 * y3;
206
207 hx = (n0 & 1) | ((n1 & 1) << 1) | ((n2 & 1) << 2) |
208 ((n3 & 1) << 3);
209 switch (hx) {
210 case 0:
211 f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
212 f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
213 f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
214 f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
215 break;
216
217 case 1:
218 f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
219 z0 * (C1 + z0 * C2))));
220 f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
221 f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
222 f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
223 break;
224
225 case 2:
226 f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
227 f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
228 z1 * (C1 + z1 * C2))));
229 f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
230 f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
231 break;
232
233 case 3:
234 f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
235 z0 * (C1 + z0 * C2))));
236 f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
237 z1 * (C1 + z1 * C2))));
238 f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
239 f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
240 break;
241
242 case 4:
243 f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
244 f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
245 f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
246 z2 * (C1 + z2 * C2))));
247 f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
248 break;
249
250 case 5:
251 f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
252 z0 * (C1 + z0 * C2))));
253 f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
254 f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
255 z2 * (C1 + z2 * C2))));
256 f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
257 break;
258
259 case 6:
260 f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
261 f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
262 z1 * (C1 + z1 * C2))));
263 f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
264 z2 * (C1 + z2 * C2))));
265 f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
266 break;
267
268 case 7:
269 f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
270 z0 * (C1 + z0 * C2))));
271 f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
272 z1 * (C1 + z1 * C2))));
273 f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
274 z2 * (C1 + z2 * C2))));
275 f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
276 break;
277
278 case 8:
279 f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
280 f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
281 f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
282 f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
283 z3 * (C1 + z3 * C2))));
284 break;
285
286 case 9:
287 f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
288 z0 * (C1 + z0 * C2))));
289 f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
290 f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
291 f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
292 z3 * (C1 + z3 * C2))));
293 break;
294
295 case 10:
296 f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
297 f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
298 z1 * (C1 + z1 * C2))));
299 f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
300 f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
301 z3 * (C1 + z3 * C2))));
302 break;
303
304 case 11:
305 f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
306 z0 * (C1 + z0 * C2))));
307 f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
308 z1 * (C1 + z1 * C2))));
309 f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
310 f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
311 z3 * (C1 + z3 * C2))));
312 break;
313
314 case 12:
315 f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
316 f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
317 f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
318 z2 * (C1 + z2 * C2))));
319 f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
320 z3 * (C1 + z3 * C2))));
321 break;
322
323 case 13:
324 f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
325 z0 * (C1 + z0 * C2))));
326 f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
327 f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
328 z2 * (C1 + z2 * C2))));
329 f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
330 z3 * (C1 + z3 * C2))));
331 break;
332
333 case 14:
334 f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
335 f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
336 z1 * (C1 + z1 * C2))));
337 f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
338 z2 * (C1 + z2 * C2))));
339 f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
340 z3 * (C1 + z3 * C2))));
341 break;
342
343 default:
344 f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
345 z0 * (C1 + z0 * C2))));
346 f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
347 z1 * (C1 + z1 * C2))));
348 f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
349 z2 * (C1 + z2 * C2))));
350 f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
351 z3 * (C1 + z3 * C2))));
352 }
353
354 *y = (n0 & 2)? -f0 : f0;
355 y += stridey;
356 *y = (n1 & 2)? -f1 : f1;
357 y += stridey;
358 *y = (n2 & 2)? -f2 : f2;
359 y += stridey;
360 *y = (n3 & 2)? -f3 : f3;
361 continue;
362
363 process1:
364 PROCESS(0);
365 continue;
366
367 process2:
368 PROCESS(0);
369 PROCESS(1);
370 continue;
371
372 process3:
373 PROCESS(0);
374 PROCESS(1);
375 PROCESS(2);
376 }
377 }
378