1*25c28e83SPiotr Jasiukajtis /*
2*25c28e83SPiotr Jasiukajtis * CDDL HEADER START
3*25c28e83SPiotr Jasiukajtis *
4*25c28e83SPiotr Jasiukajtis * The contents of this file are subject to the terms of the
5*25c28e83SPiotr Jasiukajtis * Common Development and Distribution License (the "License").
6*25c28e83SPiotr Jasiukajtis * You may not use this file except in compliance with the License.
7*25c28e83SPiotr Jasiukajtis *
8*25c28e83SPiotr Jasiukajtis * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9*25c28e83SPiotr Jasiukajtis * or http://www.opensolaris.org/os/licensing.
10*25c28e83SPiotr Jasiukajtis * See the License for the specific language governing permissions
11*25c28e83SPiotr Jasiukajtis * and limitations under the License.
12*25c28e83SPiotr Jasiukajtis *
13*25c28e83SPiotr Jasiukajtis * When distributing Covered Code, include this CDDL HEADER in each
14*25c28e83SPiotr Jasiukajtis * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15*25c28e83SPiotr Jasiukajtis * If applicable, add the following below this CDDL HEADER, with the
16*25c28e83SPiotr Jasiukajtis * fields enclosed by brackets "[]" replaced with your own identifying
17*25c28e83SPiotr Jasiukajtis * information: Portions Copyright [yyyy] [name of copyright owner]
18*25c28e83SPiotr Jasiukajtis *
19*25c28e83SPiotr Jasiukajtis * CDDL HEADER END
20*25c28e83SPiotr Jasiukajtis */
21*25c28e83SPiotr Jasiukajtis /*
22*25c28e83SPiotr Jasiukajtis * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
23*25c28e83SPiotr Jasiukajtis */
24*25c28e83SPiotr Jasiukajtis /*
25*25c28e83SPiotr Jasiukajtis * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
26*25c28e83SPiotr Jasiukajtis * Use is subject to license terms.
27*25c28e83SPiotr Jasiukajtis */
28*25c28e83SPiotr Jasiukajtis
29*25c28e83SPiotr Jasiukajtis /*
30*25c28e83SPiotr Jasiukajtis * __vsincosf: single precision vector sincos
31*25c28e83SPiotr Jasiukajtis *
32*25c28e83SPiotr Jasiukajtis * Algorithm:
33*25c28e83SPiotr Jasiukajtis *
34*25c28e83SPiotr Jasiukajtis * For |x| < pi/4, approximate sin(x) by a polynomial x+x*z*(S0+
35*25c28e83SPiotr Jasiukajtis * z*(S1+z*S2)) and cos(x) by a polynomial 1+z*(-1/2+z*(C0+z*(C1+
36*25c28e83SPiotr Jasiukajtis * z*C2))), where z = x*x, all evaluated in double precision.
37*25c28e83SPiotr Jasiukajtis *
38*25c28e83SPiotr Jasiukajtis * Accuracy:
39*25c28e83SPiotr Jasiukajtis *
40*25c28e83SPiotr Jasiukajtis * The largest error is less than 0.6 ulps.
41*25c28e83SPiotr Jasiukajtis */
42*25c28e83SPiotr Jasiukajtis
43*25c28e83SPiotr Jasiukajtis #include <sys/isa_defs.h>
44*25c28e83SPiotr Jasiukajtis
45*25c28e83SPiotr Jasiukajtis #ifdef _LITTLE_ENDIAN
46*25c28e83SPiotr Jasiukajtis #define HI(x) *(1+(int *)&x)
47*25c28e83SPiotr Jasiukajtis #define LO(x) *(unsigned *)&x
48*25c28e83SPiotr Jasiukajtis #else
49*25c28e83SPiotr Jasiukajtis #define HI(x) *(int *)&x
50*25c28e83SPiotr Jasiukajtis #define LO(x) *(1+(unsigned *)&x)
51*25c28e83SPiotr Jasiukajtis #endif
52*25c28e83SPiotr Jasiukajtis
53*25c28e83SPiotr Jasiukajtis #ifdef __RESTRICT
54*25c28e83SPiotr Jasiukajtis #define restrict _Restrict
55*25c28e83SPiotr Jasiukajtis #else
56*25c28e83SPiotr Jasiukajtis #define restrict
57*25c28e83SPiotr Jasiukajtis #endif
58*25c28e83SPiotr Jasiukajtis
59*25c28e83SPiotr Jasiukajtis extern int __vlibm_rem_pio2m(double *, double *, int, int, int);
60*25c28e83SPiotr Jasiukajtis
61*25c28e83SPiotr Jasiukajtis static const double C[] = {
62*25c28e83SPiotr Jasiukajtis -1.66666552424430847168e-01, /* 2^ -3 * -1.5555460000000 */
63*25c28e83SPiotr Jasiukajtis 8.33219196647405624390e-03, /* 2^ -7 * 1.11077E0000000 */
64*25c28e83SPiotr Jasiukajtis -1.95187909412197768688e-04, /* 2^-13 * -1.9956B60000000 */
65*25c28e83SPiotr Jasiukajtis 1.0,
66*25c28e83SPiotr Jasiukajtis -0.5,
67*25c28e83SPiotr Jasiukajtis 4.16666455566883087158e-02, /* 2^ -5 * 1.55554A0000000 */
68*25c28e83SPiotr Jasiukajtis -1.38873036485165357590e-03, /* 2^-10 * -1.6C0C1E0000000 */
69*25c28e83SPiotr Jasiukajtis 2.44309903791872784495e-05, /* 2^-16 * 1.99E24E0000000 */
70*25c28e83SPiotr Jasiukajtis 0.636619772367581343075535, /* 2^ -1 * 1.45F306DC9C883 */
71*25c28e83SPiotr Jasiukajtis 6755399441055744.0, /* 2^ 52 * 1.8000000000000 */
72*25c28e83SPiotr Jasiukajtis 1.570796326734125614166, /* 2^ 0 * 1.921FB54400000 */
73*25c28e83SPiotr Jasiukajtis 6.077100506506192601475e-11, /* 2^-34 * 1.0B4611A626331 */
74*25c28e83SPiotr Jasiukajtis };
75*25c28e83SPiotr Jasiukajtis
76*25c28e83SPiotr Jasiukajtis #define S0 C[0]
77*25c28e83SPiotr Jasiukajtis #define S1 C[1]
78*25c28e83SPiotr Jasiukajtis #define S2 C[2]
79*25c28e83SPiotr Jasiukajtis #define one C[3]
80*25c28e83SPiotr Jasiukajtis #define mhalf C[4]
81*25c28e83SPiotr Jasiukajtis #define C0 C[5]
82*25c28e83SPiotr Jasiukajtis #define C1 C[6]
83*25c28e83SPiotr Jasiukajtis #define C2 C[7]
84*25c28e83SPiotr Jasiukajtis #define invpio2 C[8]
85*25c28e83SPiotr Jasiukajtis #define c3two51 C[9]
86*25c28e83SPiotr Jasiukajtis #define pio2_1 C[10]
87*25c28e83SPiotr Jasiukajtis #define pio2_t C[11]
88*25c28e83SPiotr Jasiukajtis
89*25c28e83SPiotr Jasiukajtis #define PREPROCESS(N, sindex, cindex, label) \
90*25c28e83SPiotr Jasiukajtis hx = *(int *)x; \
91*25c28e83SPiotr Jasiukajtis ix = hx & 0x7fffffff; \
92*25c28e83SPiotr Jasiukajtis t = *x; \
93*25c28e83SPiotr Jasiukajtis x += stridex; \
94*25c28e83SPiotr Jasiukajtis if (ix <= 0x3f490fdb) { /* |x| < pi/4 */ \
95*25c28e83SPiotr Jasiukajtis if (ix == 0) { \
96*25c28e83SPiotr Jasiukajtis s[sindex] = t; \
97*25c28e83SPiotr Jasiukajtis c[cindex] = one; \
98*25c28e83SPiotr Jasiukajtis goto label; \
99*25c28e83SPiotr Jasiukajtis } \
100*25c28e83SPiotr Jasiukajtis y##N = (double)t; \
101*25c28e83SPiotr Jasiukajtis n##N = 0; \
102*25c28e83SPiotr Jasiukajtis } else if (ix <= 0x49c90fdb) { /* |x| < 2^19*pi */ \
103*25c28e83SPiotr Jasiukajtis y##N = (double)t; \
104*25c28e83SPiotr Jasiukajtis medium = 1; \
105*25c28e83SPiotr Jasiukajtis } else { \
106*25c28e83SPiotr Jasiukajtis if (ix >= 0x7f800000) { /* inf or nan */ \
107*25c28e83SPiotr Jasiukajtis s[sindex] = c[cindex] = t / t; \
108*25c28e83SPiotr Jasiukajtis goto label; \
109*25c28e83SPiotr Jasiukajtis } \
110*25c28e83SPiotr Jasiukajtis z##N = y##N = (double)t; \
111*25c28e83SPiotr Jasiukajtis hx = HI(y##N); \
112*25c28e83SPiotr Jasiukajtis n##N = ((hx >> 20) & 0x7ff) - 1046; \
113*25c28e83SPiotr Jasiukajtis HI(z##N) = (hx & 0xfffff) | 0x41600000; \
114*25c28e83SPiotr Jasiukajtis n##N = __vlibm_rem_pio2m(&z##N, &y##N, n##N, 1, 0); \
115*25c28e83SPiotr Jasiukajtis if (hx < 0) { \
116*25c28e83SPiotr Jasiukajtis y##N = -y##N; \
117*25c28e83SPiotr Jasiukajtis n##N = -n##N; \
118*25c28e83SPiotr Jasiukajtis } \
119*25c28e83SPiotr Jasiukajtis z##N = y##N * y##N; \
120*25c28e83SPiotr Jasiukajtis f##N = (float)(y##N + y##N * z##N * (S0 + z##N * \
121*25c28e83SPiotr Jasiukajtis (S1 + z##N * S2))); \
122*25c28e83SPiotr Jasiukajtis g##N = (float)(one + z##N * (mhalf + z##N * (C0 + \
123*25c28e83SPiotr Jasiukajtis z##N * (C1 + z##N * C2)))); \
124*25c28e83SPiotr Jasiukajtis if (n##N & 2) { \
125*25c28e83SPiotr Jasiukajtis f##N = -f##N; \
126*25c28e83SPiotr Jasiukajtis g##N = -g##N; \
127*25c28e83SPiotr Jasiukajtis } \
128*25c28e83SPiotr Jasiukajtis if (n##N & 1) { \
129*25c28e83SPiotr Jasiukajtis s[sindex] = g##N; \
130*25c28e83SPiotr Jasiukajtis c[cindex] = -f##N; \
131*25c28e83SPiotr Jasiukajtis } else { \
132*25c28e83SPiotr Jasiukajtis s[sindex] = f##N; \
133*25c28e83SPiotr Jasiukajtis c[cindex] = g##N; \
134*25c28e83SPiotr Jasiukajtis } \
135*25c28e83SPiotr Jasiukajtis goto label; \
136*25c28e83SPiotr Jasiukajtis }
137*25c28e83SPiotr Jasiukajtis
138*25c28e83SPiotr Jasiukajtis #define PROCESS(N) \
139*25c28e83SPiotr Jasiukajtis if (medium) { \
140*25c28e83SPiotr Jasiukajtis z##N = y##N * invpio2 + c3two51; \
141*25c28e83SPiotr Jasiukajtis n##N = LO(z##N); \
142*25c28e83SPiotr Jasiukajtis z##N -= c3two51; \
143*25c28e83SPiotr Jasiukajtis y##N = (y##N - z##N * pio2_1) - z##N * pio2_t; \
144*25c28e83SPiotr Jasiukajtis } \
145*25c28e83SPiotr Jasiukajtis z##N = y##N * y##N; \
146*25c28e83SPiotr Jasiukajtis f##N = (float)(y##N + y##N * z##N * (S0 + z##N * (S1 + z##N * S2)));\
147*25c28e83SPiotr Jasiukajtis g##N = (float)(one + z##N * (mhalf + z##N * (C0 + z##N * \
148*25c28e83SPiotr Jasiukajtis (C1 + z##N * C2)))); \
149*25c28e83SPiotr Jasiukajtis if (n##N & 2) { \
150*25c28e83SPiotr Jasiukajtis f##N = -f##N; \
151*25c28e83SPiotr Jasiukajtis g##N = -g##N; \
152*25c28e83SPiotr Jasiukajtis } \
153*25c28e83SPiotr Jasiukajtis if (n##N & 1) { \
154*25c28e83SPiotr Jasiukajtis *s = g##N; \
155*25c28e83SPiotr Jasiukajtis *c = -f##N; \
156*25c28e83SPiotr Jasiukajtis } else { \
157*25c28e83SPiotr Jasiukajtis *s = f##N; \
158*25c28e83SPiotr Jasiukajtis *c = g##N; \
159*25c28e83SPiotr Jasiukajtis } \
160*25c28e83SPiotr Jasiukajtis s += strides; \
161*25c28e83SPiotr Jasiukajtis c += stridec
162*25c28e83SPiotr Jasiukajtis
163*25c28e83SPiotr Jasiukajtis void
__vsincosf(int n,float * restrict x,int stridex,float * restrict s,int strides,float * restrict c,int stridec)164*25c28e83SPiotr Jasiukajtis __vsincosf(int n, float *restrict x, int stridex,
165*25c28e83SPiotr Jasiukajtis float *restrict s, int strides, float *restrict c, int stridec)
166*25c28e83SPiotr Jasiukajtis {
167*25c28e83SPiotr Jasiukajtis double y0, y1, y2, y3;
168*25c28e83SPiotr Jasiukajtis double z0, z1, z2, z3;
169*25c28e83SPiotr Jasiukajtis float f0, f1, f2, f3, t;
170*25c28e83SPiotr Jasiukajtis float g0, g1, g2, g3;
171*25c28e83SPiotr Jasiukajtis int n0 = 0, n1 = 0, n2 = 0, n3, hx, ix, medium;
172*25c28e83SPiotr Jasiukajtis
173*25c28e83SPiotr Jasiukajtis s -= strides;
174*25c28e83SPiotr Jasiukajtis c -= stridec;
175*25c28e83SPiotr Jasiukajtis
176*25c28e83SPiotr Jasiukajtis for (;;) {
177*25c28e83SPiotr Jasiukajtis begin:
178*25c28e83SPiotr Jasiukajtis s += strides;
179*25c28e83SPiotr Jasiukajtis c += stridec;
180*25c28e83SPiotr Jasiukajtis
181*25c28e83SPiotr Jasiukajtis if (--n < 0)
182*25c28e83SPiotr Jasiukajtis break;
183*25c28e83SPiotr Jasiukajtis
184*25c28e83SPiotr Jasiukajtis medium = 0;
185*25c28e83SPiotr Jasiukajtis PREPROCESS(0, 0, 0, begin);
186*25c28e83SPiotr Jasiukajtis
187*25c28e83SPiotr Jasiukajtis if (--n < 0)
188*25c28e83SPiotr Jasiukajtis goto process1;
189*25c28e83SPiotr Jasiukajtis
190*25c28e83SPiotr Jasiukajtis PREPROCESS(1, strides, stridec, process1);
191*25c28e83SPiotr Jasiukajtis
192*25c28e83SPiotr Jasiukajtis if (--n < 0)
193*25c28e83SPiotr Jasiukajtis goto process2;
194*25c28e83SPiotr Jasiukajtis
195*25c28e83SPiotr Jasiukajtis PREPROCESS(2, (strides << 1), (stridec << 1), process2);
196*25c28e83SPiotr Jasiukajtis
197*25c28e83SPiotr Jasiukajtis if (--n < 0)
198*25c28e83SPiotr Jasiukajtis goto process3;
199*25c28e83SPiotr Jasiukajtis
200*25c28e83SPiotr Jasiukajtis PREPROCESS(3, (strides << 1) + strides,
201*25c28e83SPiotr Jasiukajtis (stridec << 1) + stridec, process3);
202*25c28e83SPiotr Jasiukajtis
203*25c28e83SPiotr Jasiukajtis if (medium) {
204*25c28e83SPiotr Jasiukajtis z0 = y0 * invpio2 + c3two51;
205*25c28e83SPiotr Jasiukajtis z1 = y1 * invpio2 + c3two51;
206*25c28e83SPiotr Jasiukajtis z2 = y2 * invpio2 + c3two51;
207*25c28e83SPiotr Jasiukajtis z3 = y3 * invpio2 + c3two51;
208*25c28e83SPiotr Jasiukajtis
209*25c28e83SPiotr Jasiukajtis n0 = LO(z0);
210*25c28e83SPiotr Jasiukajtis n1 = LO(z1);
211*25c28e83SPiotr Jasiukajtis n2 = LO(z2);
212*25c28e83SPiotr Jasiukajtis n3 = LO(z3);
213*25c28e83SPiotr Jasiukajtis
214*25c28e83SPiotr Jasiukajtis z0 -= c3two51;
215*25c28e83SPiotr Jasiukajtis z1 -= c3two51;
216*25c28e83SPiotr Jasiukajtis z2 -= c3two51;
217*25c28e83SPiotr Jasiukajtis z3 -= c3two51;
218*25c28e83SPiotr Jasiukajtis
219*25c28e83SPiotr Jasiukajtis y0 = (y0 - z0 * pio2_1) - z0 * pio2_t;
220*25c28e83SPiotr Jasiukajtis y1 = (y1 - z1 * pio2_1) - z1 * pio2_t;
221*25c28e83SPiotr Jasiukajtis y2 = (y2 - z2 * pio2_1) - z2 * pio2_t;
222*25c28e83SPiotr Jasiukajtis y3 = (y3 - z3 * pio2_1) - z3 * pio2_t;
223*25c28e83SPiotr Jasiukajtis }
224*25c28e83SPiotr Jasiukajtis
225*25c28e83SPiotr Jasiukajtis z0 = y0 * y0;
226*25c28e83SPiotr Jasiukajtis z1 = y1 * y1;
227*25c28e83SPiotr Jasiukajtis z2 = y2 * y2;
228*25c28e83SPiotr Jasiukajtis z3 = y3 * y3;
229*25c28e83SPiotr Jasiukajtis
230*25c28e83SPiotr Jasiukajtis f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
231*25c28e83SPiotr Jasiukajtis f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
232*25c28e83SPiotr Jasiukajtis f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
233*25c28e83SPiotr Jasiukajtis f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
234*25c28e83SPiotr Jasiukajtis
235*25c28e83SPiotr Jasiukajtis g0 = (float)(one + z0 * (mhalf + z0 * (C0 + z0 *
236*25c28e83SPiotr Jasiukajtis (C1 + z0 * C2))));
237*25c28e83SPiotr Jasiukajtis g1 = (float)(one + z1 * (mhalf + z1 * (C0 + z1 *
238*25c28e83SPiotr Jasiukajtis (C1 + z1 * C2))));
239*25c28e83SPiotr Jasiukajtis g2 = (float)(one + z2 * (mhalf + z2 * (C0 + z2 *
240*25c28e83SPiotr Jasiukajtis (C1 + z2 * C2))));
241*25c28e83SPiotr Jasiukajtis g3 = (float)(one + z3 * (mhalf + z3 * (C0 + z3 *
242*25c28e83SPiotr Jasiukajtis (C1 + z3 * C2))));
243*25c28e83SPiotr Jasiukajtis
244*25c28e83SPiotr Jasiukajtis if (n0 & 2) {
245*25c28e83SPiotr Jasiukajtis f0 = -f0;
246*25c28e83SPiotr Jasiukajtis g0 = -g0;
247*25c28e83SPiotr Jasiukajtis }
248*25c28e83SPiotr Jasiukajtis if (n1 & 2) {
249*25c28e83SPiotr Jasiukajtis f1 = -f1;
250*25c28e83SPiotr Jasiukajtis g1 = -g1;
251*25c28e83SPiotr Jasiukajtis }
252*25c28e83SPiotr Jasiukajtis if (n2 & 2) {
253*25c28e83SPiotr Jasiukajtis f2 = -f2;
254*25c28e83SPiotr Jasiukajtis g2 = -g2;
255*25c28e83SPiotr Jasiukajtis }
256*25c28e83SPiotr Jasiukajtis if (n3 & 2) {
257*25c28e83SPiotr Jasiukajtis f3 = -f3;
258*25c28e83SPiotr Jasiukajtis g3 = -g3;
259*25c28e83SPiotr Jasiukajtis }
260*25c28e83SPiotr Jasiukajtis
261*25c28e83SPiotr Jasiukajtis if (n0 & 1) {
262*25c28e83SPiotr Jasiukajtis *s = g0;
263*25c28e83SPiotr Jasiukajtis *c = -f0;
264*25c28e83SPiotr Jasiukajtis } else {
265*25c28e83SPiotr Jasiukajtis *s = f0;
266*25c28e83SPiotr Jasiukajtis *c = g0;
267*25c28e83SPiotr Jasiukajtis }
268*25c28e83SPiotr Jasiukajtis s += strides;
269*25c28e83SPiotr Jasiukajtis c += stridec;
270*25c28e83SPiotr Jasiukajtis
271*25c28e83SPiotr Jasiukajtis if (n1 & 1) {
272*25c28e83SPiotr Jasiukajtis *s = g1;
273*25c28e83SPiotr Jasiukajtis *c = -f1;
274*25c28e83SPiotr Jasiukajtis } else {
275*25c28e83SPiotr Jasiukajtis *s = f1;
276*25c28e83SPiotr Jasiukajtis *c = g1;
277*25c28e83SPiotr Jasiukajtis }
278*25c28e83SPiotr Jasiukajtis s += strides;
279*25c28e83SPiotr Jasiukajtis c += stridec;
280*25c28e83SPiotr Jasiukajtis
281*25c28e83SPiotr Jasiukajtis if (n2 & 1) {
282*25c28e83SPiotr Jasiukajtis *s = g2;
283*25c28e83SPiotr Jasiukajtis *c = -f2;
284*25c28e83SPiotr Jasiukajtis } else {
285*25c28e83SPiotr Jasiukajtis *s = f2;
286*25c28e83SPiotr Jasiukajtis *c = g2;
287*25c28e83SPiotr Jasiukajtis }
288*25c28e83SPiotr Jasiukajtis s += strides;
289*25c28e83SPiotr Jasiukajtis c += stridec;
290*25c28e83SPiotr Jasiukajtis
291*25c28e83SPiotr Jasiukajtis if (n3 & 1) {
292*25c28e83SPiotr Jasiukajtis *s = g3;
293*25c28e83SPiotr Jasiukajtis *c = -f3;
294*25c28e83SPiotr Jasiukajtis } else {
295*25c28e83SPiotr Jasiukajtis *s = f3;
296*25c28e83SPiotr Jasiukajtis *c = g3;
297*25c28e83SPiotr Jasiukajtis }
298*25c28e83SPiotr Jasiukajtis continue;
299*25c28e83SPiotr Jasiukajtis
300*25c28e83SPiotr Jasiukajtis process1:
301*25c28e83SPiotr Jasiukajtis PROCESS(0);
302*25c28e83SPiotr Jasiukajtis continue;
303*25c28e83SPiotr Jasiukajtis
304*25c28e83SPiotr Jasiukajtis process2:
305*25c28e83SPiotr Jasiukajtis PROCESS(0);
306*25c28e83SPiotr Jasiukajtis PROCESS(1);
307*25c28e83SPiotr Jasiukajtis continue;
308*25c28e83SPiotr Jasiukajtis
309*25c28e83SPiotr Jasiukajtis process3:
310*25c28e83SPiotr Jasiukajtis PROCESS(0);
311*25c28e83SPiotr Jasiukajtis PROCESS(1);
312*25c28e83SPiotr Jasiukajtis PROCESS(2);
313*25c28e83SPiotr Jasiukajtis }
314*25c28e83SPiotr Jasiukajtis }
315