1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 23 */ 24 /* 25 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 26 * Use is subject to license terms. 27 */ 28 29 /* 30 * __vcosf: single precision vector cos 31 * 32 * Algorithm: 33 * 34 * For |x| < pi/4, approximate sin(x) by a polynomial x+x*z*(S0+ 35 * z*(S1+z*S2)) and cos(x) by a polynomial 1+z*(-1/2+z*(C0+z*(C1+ 36 * z*C2))), where z = x*x, all evaluated in double precision. 37 * 38 * Accuracy: 39 * 40 * The largest error is less than 0.6 ulps. 41 */ 42 43 #include <sys/isa_defs.h> 44 45 #ifdef _LITTLE_ENDIAN 46 #define HI(x) *(1+(int *)&x) 47 #define LO(x) *(unsigned *)&x 48 #else 49 #define HI(x) *(int *)&x 50 #define LO(x) *(1+(unsigned *)&x) 51 #endif 52 53 #ifdef __RESTRICT 54 #define restrict _Restrict 55 #else 56 #define restrict 57 #endif 58 59 extern int __vlibm_rem_pio2m(double *, double *, int, int, int); 60 61 static const double C[] = { 62 -1.66666552424430847168e-01, /* 2^ -3 * -1.5555460000000 */ 63 8.33219196647405624390e-03, /* 2^ -7 * 1.11077E0000000 */ 64 -1.95187909412197768688e-04, /* 2^-13 * -1.9956B60000000 */ 65 1.0, 66 -0.5, 67 4.16666455566883087158e-02, /* 2^ -5 * 1.55554A0000000 */ 68 -1.38873036485165357590e-03, /* 2^-10 * -1.6C0C1E0000000 */ 69 2.44309903791872784495e-05, /* 2^-16 * 1.99E24E0000000 */ 70 0.636619772367581343075535, /* 2^ -1 * 1.45F306DC9C883 */ 71 6755399441055744.0, /* 2^ 52 * 1.8000000000000 */ 72 1.570796326734125614166, /* 2^ 0 * 1.921FB54400000 */ 73 6.077100506506192601475e-11, /* 2^-34 * 1.0B4611A626331 */ 74 }; 75 76 #define S0 C[0] 77 #define S1 C[1] 78 #define S2 C[2] 79 #define one C[3] 80 #define mhalf C[4] 81 #define C0 C[5] 82 #define C1 C[6] 83 #define C2 C[7] 84 #define invpio2 C[8] 85 #define c3two51 C[9] 86 #define pio2_1 C[10] 87 #define pio2_t C[11] 88 89 #define PREPROCESS(N, index, label) \ 90 hx = *(int *)x; \ 91 ix = hx & 0x7fffffff; \ 92 t = *x; \ 93 x += stridex; \ 94 if (ix <= 0x3f490fdb) { /* |x| < pi/4 */ \ 95 if (ix == 0) { \ 96 y[index] = one; \ 97 goto label; \ 98 } \ 99 y##N = (double)t; \ 100 n##N = 1; \ 101 } else if (ix <= 0x49c90fdb) { /* |x| < 2^19*pi */ \ 102 y##N = (double)t; \ 103 medium = 1; \ 104 } else { \ 105 if (ix >= 0x7f800000) { /* inf or nan */ \ 106 y[index] = t / t; \ 107 goto label; \ 108 } \ 109 z##N = y##N = (double)t; \ 110 hx = HI(y##N); \ 111 n##N = ((hx >> 20) & 0x7ff) - 1046; \ 112 HI(z##N) = (hx & 0xfffff) | 0x41600000; \ 113 n##N = __vlibm_rem_pio2m(&z##N, &y##N, n##N, 1, 0) + 1; \ 114 z##N = y##N * y##N; \ 115 if (n##N & 1) { /* compute cos y */ \ 116 f##N = (float)(one + z##N * (mhalf + z##N * \ 117 (C0 + z##N * (C1 + z##N * C2)))); \ 118 } else { /* compute sin y */ \ 119 f##N = (float)(y##N + y##N * z##N * (S0 + \ 120 z##N * (S1 + z##N * S2))); \ 121 } \ 122 y[index] = (n##N & 2)? -f##N : f##N; \ 123 goto label; \ 124 } 125 126 #define PROCESS(N) \ 127 if (medium) { \ 128 z##N = y##N * invpio2 + c3two51; \ 129 n##N = LO(z##N) + 1; \ 130 z##N -= c3two51; \ 131 y##N = (y##N - z##N * pio2_1) - z##N * pio2_t; \ 132 } \ 133 z##N = y##N * y##N; \ 134 if (n##N & 1) { /* compute cos y */ \ 135 f##N = (float)(one + z##N * (mhalf + z##N * (C0 + \ 136 z##N * (C1 + z##N * C2)))); \ 137 } else { /* compute sin y */ \ 138 f##N = (float)(y##N + y##N * z##N * (S0 + z##N * (S1 + \ 139 z##N * S2))); \ 140 } \ 141 *y = (n##N & 2)? -f##N : f##N; \ 142 y += stridey 143 144 void 145 __vcosf(int n, float *restrict x, int stridex, float *restrict y, 146 int stridey) 147 { 148 double y0, y1, y2, y3; 149 double z0, z1, z2, z3; 150 float f0, f1, f2, f3, t; 151 int n0 = 0, n1 = 0, n2 = 0, n3, hx, ix, medium; 152 153 y -= stridey; 154 155 for (;;) { 156 begin: 157 y += stridey; 158 159 if (--n < 0) 160 break; 161 162 medium = 0; 163 PREPROCESS(0, 0, begin); 164 165 if (--n < 0) 166 goto process1; 167 168 PREPROCESS(1, stridey, process1); 169 170 if (--n < 0) 171 goto process2; 172 173 PREPROCESS(2, (stridey << 1), process2); 174 175 if (--n < 0) 176 goto process3; 177 178 PREPROCESS(3, (stridey << 1) + stridey, process3); 179 180 if (medium) { 181 z0 = y0 * invpio2 + c3two51; 182 z1 = y1 * invpio2 + c3two51; 183 z2 = y2 * invpio2 + c3two51; 184 z3 = y3 * invpio2 + c3two51; 185 186 n0 = LO(z0) + 1; 187 n1 = LO(z1) + 1; 188 n2 = LO(z2) + 1; 189 n3 = LO(z3) + 1; 190 191 z0 -= c3two51; 192 z1 -= c3two51; 193 z2 -= c3two51; 194 z3 -= c3two51; 195 196 y0 = (y0 - z0 * pio2_1) - z0 * pio2_t; 197 y1 = (y1 - z1 * pio2_1) - z1 * pio2_t; 198 y2 = (y2 - z2 * pio2_1) - z2 * pio2_t; 199 y3 = (y3 - z3 * pio2_1) - z3 * pio2_t; 200 } 201 202 z0 = y0 * y0; 203 z1 = y1 * y1; 204 z2 = y2 * y2; 205 z3 = y3 * y3; 206 207 hx = (n0 & 1) | ((n1 & 1) << 1) | ((n2 & 1) << 2) | 208 ((n3 & 1) << 3); 209 switch (hx) { 210 case 0: 211 f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2))); 212 f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2))); 213 f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2))); 214 f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2))); 215 break; 216 217 case 1: 218 f0 = (float)(one + z0 * (mhalf + z0 * (C0 + 219 z0 * (C1 + z0 * C2)))); 220 f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2))); 221 f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2))); 222 f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2))); 223 break; 224 225 case 2: 226 f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2))); 227 f1 = (float)(one + z1 * (mhalf + z1 * (C0 + 228 z1 * (C1 + z1 * C2)))); 229 f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2))); 230 f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2))); 231 break; 232 233 case 3: 234 f0 = (float)(one + z0 * (mhalf + z0 * (C0 + 235 z0 * (C1 + z0 * C2)))); 236 f1 = (float)(one + z1 * (mhalf + z1 * (C0 + 237 z1 * (C1 + z1 * C2)))); 238 f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2))); 239 f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2))); 240 break; 241 242 case 4: 243 f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2))); 244 f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2))); 245 f2 = (float)(one + z2 * (mhalf + z2 * (C0 + 246 z2 * (C1 + z2 * C2)))); 247 f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2))); 248 break; 249 250 case 5: 251 f0 = (float)(one + z0 * (mhalf + z0 * (C0 + 252 z0 * (C1 + z0 * C2)))); 253 f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2))); 254 f2 = (float)(one + z2 * (mhalf + z2 * (C0 + 255 z2 * (C1 + z2 * C2)))); 256 f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2))); 257 break; 258 259 case 6: 260 f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2))); 261 f1 = (float)(one + z1 * (mhalf + z1 * (C0 + 262 z1 * (C1 + z1 * C2)))); 263 f2 = (float)(one + z2 * (mhalf + z2 * (C0 + 264 z2 * (C1 + z2 * C2)))); 265 f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2))); 266 break; 267 268 case 7: 269 f0 = (float)(one + z0 * (mhalf + z0 * (C0 + 270 z0 * (C1 + z0 * C2)))); 271 f1 = (float)(one + z1 * (mhalf + z1 * (C0 + 272 z1 * (C1 + z1 * C2)))); 273 f2 = (float)(one + z2 * (mhalf + z2 * (C0 + 274 z2 * (C1 + z2 * C2)))); 275 f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2))); 276 break; 277 278 case 8: 279 f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2))); 280 f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2))); 281 f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2))); 282 f3 = (float)(one + z3 * (mhalf + z3 * (C0 + 283 z3 * (C1 + z3 * C2)))); 284 break; 285 286 case 9: 287 f0 = (float)(one + z0 * (mhalf + z0 * (C0 + 288 z0 * (C1 + z0 * C2)))); 289 f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2))); 290 f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2))); 291 f3 = (float)(one + z3 * (mhalf + z3 * (C0 + 292 z3 * (C1 + z3 * C2)))); 293 break; 294 295 case 10: 296 f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2))); 297 f1 = (float)(one + z1 * (mhalf + z1 * (C0 + 298 z1 * (C1 + z1 * C2)))); 299 f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2))); 300 f3 = (float)(one + z3 * (mhalf + z3 * (C0 + 301 z3 * (C1 + z3 * C2)))); 302 break; 303 304 case 11: 305 f0 = (float)(one + z0 * (mhalf + z0 * (C0 + 306 z0 * (C1 + z0 * C2)))); 307 f1 = (float)(one + z1 * (mhalf + z1 * (C0 + 308 z1 * (C1 + z1 * C2)))); 309 f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2))); 310 f3 = (float)(one + z3 * (mhalf + z3 * (C0 + 311 z3 * (C1 + z3 * C2)))); 312 break; 313 314 case 12: 315 f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2))); 316 f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2))); 317 f2 = (float)(one + z2 * (mhalf + z2 * (C0 + 318 z2 * (C1 + z2 * C2)))); 319 f3 = (float)(one + z3 * (mhalf + z3 * (C0 + 320 z3 * (C1 + z3 * C2)))); 321 break; 322 323 case 13: 324 f0 = (float)(one + z0 * (mhalf + z0 * (C0 + 325 z0 * (C1 + z0 * C2)))); 326 f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2))); 327 f2 = (float)(one + z2 * (mhalf + z2 * (C0 + 328 z2 * (C1 + z2 * C2)))); 329 f3 = (float)(one + z3 * (mhalf + z3 * (C0 + 330 z3 * (C1 + z3 * C2)))); 331 break; 332 333 case 14: 334 f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2))); 335 f1 = (float)(one + z1 * (mhalf + z1 * (C0 + 336 z1 * (C1 + z1 * C2)))); 337 f2 = (float)(one + z2 * (mhalf + z2 * (C0 + 338 z2 * (C1 + z2 * C2)))); 339 f3 = (float)(one + z3 * (mhalf + z3 * (C0 + 340 z3 * (C1 + z3 * C2)))); 341 break; 342 343 default: 344 f0 = (float)(one + z0 * (mhalf + z0 * (C0 + 345 z0 * (C1 + z0 * C2)))); 346 f1 = (float)(one + z1 * (mhalf + z1 * (C0 + 347 z1 * (C1 + z1 * C2)))); 348 f2 = (float)(one + z2 * (mhalf + z2 * (C0 + 349 z2 * (C1 + z2 * C2)))); 350 f3 = (float)(one + z3 * (mhalf + z3 * (C0 + 351 z3 * (C1 + z3 * C2)))); 352 } 353 354 *y = (n0 & 2)? -f0 : f0; 355 y += stridey; 356 *y = (n1 & 2)? -f1 : f1; 357 y += stridey; 358 *y = (n2 & 2)? -f2 : f2; 359 y += stridey; 360 *y = (n3 & 2)? -f3 : f3; 361 continue; 362 363 process1: 364 PROCESS(0); 365 continue; 366 367 process2: 368 PROCESS(0); 369 PROCESS(1); 370 continue; 371 372 process3: 373 PROCESS(0); 374 PROCESS(1); 375 PROCESS(2); 376 } 377 } 378