1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 23 */ 24 /* 25 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 26 * Use is subject to license terms. 27 */ 28 29 /* 30 * __vsincosf: single precision vector sincos 31 * 32 * Algorithm: 33 * 34 * For |x| < pi/4, approximate sin(x) by a polynomial x+x*z*(S0+ 35 * z*(S1+z*S2)) and cos(x) by a polynomial 1+z*(-1/2+z*(C0+z*(C1+ 36 * z*C2))), where z = x*x, all evaluated in double precision. 37 * 38 * Accuracy: 39 * 40 * The largest error is less than 0.6 ulps. 41 */ 42 43 #include <sys/isa_defs.h> 44 45 #ifdef _LITTLE_ENDIAN 46 #define HI(x) *(1+(int *)&x) 47 #define LO(x) *(unsigned *)&x 48 #else 49 #define HI(x) *(int *)&x 50 #define LO(x) *(1+(unsigned *)&x) 51 #endif 52 53 #ifdef __RESTRICT 54 #define restrict _Restrict 55 #else 56 #define restrict 57 #endif 58 59 extern int __vlibm_rem_pio2m(double *, double *, int, int, int); 60 61 static const double C[] = { 62 -1.66666552424430847168e-01, /* 2^ -3 * -1.5555460000000 */ 63 8.33219196647405624390e-03, /* 2^ -7 * 1.11077E0000000 */ 64 -1.95187909412197768688e-04, /* 2^-13 * -1.9956B60000000 */ 65 1.0, 66 -0.5, 67 4.16666455566883087158e-02, /* 2^ -5 * 1.55554A0000000 */ 68 -1.38873036485165357590e-03, /* 2^-10 * -1.6C0C1E0000000 */ 69 2.44309903791872784495e-05, /* 2^-16 * 1.99E24E0000000 */ 70 0.636619772367581343075535, /* 2^ -1 * 1.45F306DC9C883 */ 71 6755399441055744.0, /* 2^ 52 * 1.8000000000000 */ 72 1.570796326734125614166, /* 2^ 0 * 1.921FB54400000 */ 73 6.077100506506192601475e-11, /* 2^-34 * 1.0B4611A626331 */ 74 }; 75 76 #define S0 C[0] 77 #define S1 C[1] 78 #define S2 C[2] 79 #define one C[3] 80 #define mhalf C[4] 81 #define C0 C[5] 82 #define C1 C[6] 83 #define C2 C[7] 84 #define invpio2 C[8] 85 #define c3two51 C[9] 86 #define pio2_1 C[10] 87 #define pio2_t C[11] 88 89 #define PREPROCESS(N, sindex, cindex, label) \ 90 hx = *(int *)x; \ 91 ix = hx & 0x7fffffff; \ 92 t = *x; \ 93 x += stridex; \ 94 if (ix <= 0x3f490fdb) { /* |x| < pi/4 */ \ 95 if (ix == 0) { \ 96 s[sindex] = t; \ 97 c[cindex] = one; \ 98 goto label; \ 99 } \ 100 y##N = (double)t; \ 101 n##N = 0; \ 102 } else if (ix <= 0x49c90fdb) { /* |x| < 2^19*pi */ \ 103 y##N = (double)t; \ 104 medium = 1; \ 105 } else { \ 106 if (ix >= 0x7f800000) { /* inf or nan */ \ 107 s[sindex] = c[cindex] = t / t; \ 108 goto label; \ 109 } \ 110 z##N = y##N = (double)t; \ 111 hx = HI(y##N); \ 112 n##N = ((hx >> 20) & 0x7ff) - 1046; \ 113 HI(z##N) = (hx & 0xfffff) | 0x41600000; \ 114 n##N = __vlibm_rem_pio2m(&z##N, &y##N, n##N, 1, 0); \ 115 if (hx < 0) { \ 116 y##N = -y##N; \ 117 n##N = -n##N; \ 118 } \ 119 z##N = y##N * y##N; \ 120 f##N = (float)(y##N + y##N * z##N * (S0 + z##N * \ 121 (S1 + z##N * S2))); \ 122 g##N = (float)(one + z##N * (mhalf + z##N * (C0 + \ 123 z##N * (C1 + z##N * C2)))); \ 124 if (n##N & 2) { \ 125 f##N = -f##N; \ 126 g##N = -g##N; \ 127 } \ 128 if (n##N & 1) { \ 129 s[sindex] = g##N; \ 130 c[cindex] = -f##N; \ 131 } else { \ 132 s[sindex] = f##N; \ 133 c[cindex] = g##N; \ 134 } \ 135 goto label; \ 136 } 137 138 #define PROCESS(N) \ 139 if (medium) { \ 140 z##N = y##N * invpio2 + c3two51; \ 141 n##N = LO(z##N); \ 142 z##N -= c3two51; \ 143 y##N = (y##N - z##N * pio2_1) - z##N * pio2_t; \ 144 } \ 145 z##N = y##N * y##N; \ 146 f##N = (float)(y##N + y##N * z##N * (S0 + z##N * (S1 + z##N * S2)));\ 147 g##N = (float)(one + z##N * (mhalf + z##N * (C0 + z##N * \ 148 (C1 + z##N * C2)))); \ 149 if (n##N & 2) { \ 150 f##N = -f##N; \ 151 g##N = -g##N; \ 152 } \ 153 if (n##N & 1) { \ 154 *s = g##N; \ 155 *c = -f##N; \ 156 } else { \ 157 *s = f##N; \ 158 *c = g##N; \ 159 } \ 160 s += strides; \ 161 c += stridec 162 163 void 164 __vsincosf(int n, float *restrict x, int stridex, 165 float *restrict s, int strides, float *restrict c, int stridec) 166 { 167 double y0, y1, y2, y3; 168 double z0, z1, z2, z3; 169 float f0, f1, f2, f3, t; 170 float g0, g1, g2, g3; 171 int n0 = 0, n1 = 0, n2 = 0, n3, hx, ix, medium; 172 173 s -= strides; 174 c -= stridec; 175 176 for (;;) { 177 begin: 178 s += strides; 179 c += stridec; 180 181 if (--n < 0) 182 break; 183 184 medium = 0; 185 PREPROCESS(0, 0, 0, begin); 186 187 if (--n < 0) 188 goto process1; 189 190 PREPROCESS(1, strides, stridec, process1); 191 192 if (--n < 0) 193 goto process2; 194 195 PREPROCESS(2, (strides << 1), (stridec << 1), process2); 196 197 if (--n < 0) 198 goto process3; 199 200 PREPROCESS(3, (strides << 1) + strides, 201 (stridec << 1) + stridec, process3); 202 203 if (medium) { 204 z0 = y0 * invpio2 + c3two51; 205 z1 = y1 * invpio2 + c3two51; 206 z2 = y2 * invpio2 + c3two51; 207 z3 = y3 * invpio2 + c3two51; 208 209 n0 = LO(z0); 210 n1 = LO(z1); 211 n2 = LO(z2); 212 n3 = LO(z3); 213 214 z0 -= c3two51; 215 z1 -= c3two51; 216 z2 -= c3two51; 217 z3 -= c3two51; 218 219 y0 = (y0 - z0 * pio2_1) - z0 * pio2_t; 220 y1 = (y1 - z1 * pio2_1) - z1 * pio2_t; 221 y2 = (y2 - z2 * pio2_1) - z2 * pio2_t; 222 y3 = (y3 - z3 * pio2_1) - z3 * pio2_t; 223 } 224 225 z0 = y0 * y0; 226 z1 = y1 * y1; 227 z2 = y2 * y2; 228 z3 = y3 * y3; 229 230 f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2))); 231 f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2))); 232 f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2))); 233 f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2))); 234 235 g0 = (float)(one + z0 * (mhalf + z0 * (C0 + z0 * 236 (C1 + z0 * C2)))); 237 g1 = (float)(one + z1 * (mhalf + z1 * (C0 + z1 * 238 (C1 + z1 * C2)))); 239 g2 = (float)(one + z2 * (mhalf + z2 * (C0 + z2 * 240 (C1 + z2 * C2)))); 241 g3 = (float)(one + z3 * (mhalf + z3 * (C0 + z3 * 242 (C1 + z3 * C2)))); 243 244 if (n0 & 2) { 245 f0 = -f0; 246 g0 = -g0; 247 } 248 if (n1 & 2) { 249 f1 = -f1; 250 g1 = -g1; 251 } 252 if (n2 & 2) { 253 f2 = -f2; 254 g2 = -g2; 255 } 256 if (n3 & 2) { 257 f3 = -f3; 258 g3 = -g3; 259 } 260 261 if (n0 & 1) { 262 *s = g0; 263 *c = -f0; 264 } else { 265 *s = f0; 266 *c = g0; 267 } 268 s += strides; 269 c += stridec; 270 271 if (n1 & 1) { 272 *s = g1; 273 *c = -f1; 274 } else { 275 *s = f1; 276 *c = g1; 277 } 278 s += strides; 279 c += stridec; 280 281 if (n2 & 1) { 282 *s = g2; 283 *c = -f2; 284 } else { 285 *s = f2; 286 *c = g2; 287 } 288 s += strides; 289 c += stridec; 290 291 if (n3 & 1) { 292 *s = g3; 293 *c = -f3; 294 } else { 295 *s = f3; 296 *c = g3; 297 } 298 continue; 299 300 process1: 301 PROCESS(0); 302 continue; 303 304 process2: 305 PROCESS(0); 306 PROCESS(1); 307 continue; 308 309 process3: 310 PROCESS(0); 311 PROCESS(1); 312 PROCESS(2); 313 } 314 } 315