xref: /illumos-gate/usr/src/lib/libmvec/common/__vatan2f.c (revision 2f443e27e5988131d8b57bec58ee15f9227e0899)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
24  */
25 /*
26  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
27  * Use is subject to license terms.
28  */
29 
30 #ifdef __RESTRICT
31 #define restrict _Restrict
32 #else
33 #define restrict
34 #endif
35 
36 extern const double __vlibm_TBL_atan1[];
37 
38 static const double
39 pio4	=  7.8539816339744827900e-01,
40 pio2	=  1.5707963267948965580e+00,
41 pi	=  3.1415926535897931160e+00;
42 
43 static const float
44 zero	=  0.0f,
45 one	=  1.0f,
46 q1      = -3.3333333333296428046e-01f,
47 q2      =  1.9999999186853752618e-01f,
48 twop24  =  16777216.0f;
49 
50 void
51 __vatan2f(int n, float * restrict y, int stridey, float * restrict x,
52 	int stridex, float * restrict z, int stridez)
53 {
54 	float		x0, x1, x2, y0, y1, y2, *pz0 = 0, *pz1, *pz2;
55 	double		ah0, ah1, ah2;
56 	double		t0, t1, t2;
57 	double		sx0, sx1, sx2;
58 	double		sign0, sign1, sign2;
59 	int		i, k0 = 0, k1, k2, hx, sx, sy;
60 	int		hy0, hy1, hy2;
61 	float		base0 = 0.0, base1, base2;
62 	double		num0, num1, num2;
63 	double		den0, den1, den2;
64 	double		dx0, dx1, dx2;
65 	double		dy0, dy1, dy2;
66 	double		db0, db1, db2;
67 
68 	do
69 	{
70 loop0:
71 		hy0 = *(int*)y;
72 		hx = *(int*)x;
73 		sign0 = one;
74 		sy = hy0 & 0x80000000;
75 		hy0 &= ~0x80000000;
76 
77 		sx = hx & 0x80000000;
78 		hx &= ~0x80000000;
79 
80 		if (hy0 > hx)
81 		{
82 			x0 = *y;
83 			y0 = *x;
84 			i = hx;
85 			hx = hy0;
86 			hy0 = i;
87 			if (sy)
88 			{
89 				x0 = -x0;
90 				sign0 = -sign0;
91 			}
92 			if (sx)
93 			{
94 				y0 = -y0;
95 				ah0 = pio2;
96 			}
97 			else
98 			{
99 				ah0 = -pio2;
100 				sign0 = -sign0;
101 			}
102 		}
103 		else
104 		{
105 			y0 = *y;
106 			x0 = *x;
107 			if (sy)
108 			{
109 				y0 = -y0;
110 				sign0 = -sign0;
111 			}
112 			if (sx)
113 			{
114 				x0 = -x0;
115 				ah0 = -pi;
116 				sign0 = -sign0;
117 			}
118 			else
119 				ah0 = zero;
120 		}
121 
122 		if (hx >= 0x7f800000 || hx - hy0 >= 0x0c800000)
123 		{
124 			if (hx >= 0x7f800000)
125 			{
126 				if (hx ^ 0x7f800000) /* nan */
127 					ah0 =  x0 + y0;
128 				else if (hy0 >= 0x7f800000)
129 					ah0 += pio4;
130 			}
131 			else if ((int) ah0 == 0)
132 				ah0 = y0 / x0;
133 			*z = (sign0 == one) ? ah0 : -ah0;
134 /* sign0*ah0 would change nan behavior relative to previous release */
135 			x += stridex;
136 			y += stridey;
137 			z += stridez;
138 			i = 0;
139 			if (--n <= 0)
140 				break;
141 			goto loop0;
142 		}
143 		if (hy0 < 0x00800000) {
144 			if (hy0 == 0)
145 			{
146 				*z = sign0 * (float) ah0;
147 				x += stridex;
148 				y += stridey;
149 				z += stridez;
150 				i = 0;
151 				if (--n <= 0)
152 					break;
153 				goto loop0;
154 			}
155 			y0 *= twop24; /* scale subnormal y */
156 			x0 *= twop24; /* scale possibly subnormal x */
157 			hy0 = *(int*)&y0;
158                         hx = *(int*)&x0;
159 		}
160 		pz0 = z;
161 
162 		k0 = (hy0 - hx + 0x3f800000) & 0xfff80000;
163 		if (k0 >= 0x3C800000)          /* if |x| >= (1/64)... */
164     		{
165 			*(int*)&base0 = k0;
166        		 	k0 = (k0 - 0x3C800000) >> 18; /* (index >> 19) << 1) */
167 			k0 += 4;
168 				/* skip over 0,0,pi/2,pi/2 */
169     		}
170     		else                            /* |x| < 1/64 */
171     		{
172 			k0 = 0;
173 			base0 = zero;
174     		}
175 
176 		x += stridex;
177 		y += stridey;
178 		z += stridez;
179 		i = 1;
180 		if (--n <= 0)
181 			break;
182 
183 
184 loop1:
185 		hy1 = *(int*)y;
186 		hx = *(int*)x;
187 		sign1 = one;
188 		sy = hy1 & 0x80000000;
189 		hy1 &= ~0x80000000;
190 
191 		sx = hx & 0x80000000;
192 		hx &= ~0x80000000;
193 
194 		if (hy1 > hx)
195 		{
196 			x1 = *y;
197 			y1 = *x;
198 			i = hx;
199 			hx = hy1;
200 			hy1 = i;
201 			if (sy)
202 			{
203 				x1 = -x1;
204 				sign1 = -sign1;
205 			}
206 			if (sx)
207 			{
208 				y1 = -y1;
209 				ah1 = pio2;
210 			}
211 			else
212 			{
213 				ah1 = -pio2;
214 				sign1 = -sign1;
215 			}
216 		}
217 		else
218 		{
219 			y1 = *y;
220 			x1 = *x;
221 			if (sy)
222 			{
223 				y1 = -y1;
224 				sign1 = -sign1;
225 			}
226 			if (sx)
227 			{
228 				x1 = -x1;
229 				ah1 = -pi;
230 				sign1 = -sign1;
231 			}
232 			else
233 				ah1 = zero;
234 		}
235 
236 		if (hx >= 0x7f800000 || hx - hy1 >= 0x0c800000)
237 		{
238 			if (hx >= 0x7f800000)
239 			{
240 				if (hx ^ 0x7f800000) /* nan */
241 					ah1 =  x1 + y1;
242 				else if (hy1 >= 0x7f800000)
243 					ah1 += pio4;
244 			}
245 			else if ((int) ah1 == 0)
246 				ah1 = y1 / x1;
247 			*z = (sign1 == one)? ah1 : -ah1;
248 			x += stridex;
249 			y += stridey;
250 			z += stridez;
251 			i = 1;
252 			if (--n <= 0)
253 				break;
254 			goto loop1;
255 		}
256 		if (hy1 < 0x00800000) {
257 			if (hy1 == 0)
258 			{
259 				*z = sign1 * (float) ah1;
260 				x += stridex;
261 				y += stridey;
262 				z += stridez;
263 				i = 1;
264 				if (--n <= 0)
265 					break;
266 				goto loop1;
267 			}
268 			y1 *= twop24; /* scale subnormal y */
269 			x1 *= twop24; /* scale possibly subnormal x */
270 			hy1 = *(int*)&y1;
271                         hx = *(int*)&x1;
272 		}
273 		pz1 = z;
274 
275 		k1 = (hy1 - hx + 0x3f800000) & 0xfff80000;
276 		if (k1 >= 0x3C800000)          /* if |x| >= (1/64)... */
277     		{
278 			*(int*)&base1 = k1;
279        		 	k1 = (k1 - 0x3C800000) >> 18; /* (index >> 19) << 1) */
280 			k1 += 4;
281 				/* skip over 0,0,pi/2,pi/2 */
282     		}
283     		else                            /* |x| < 1/64 */
284     		{
285        			k1 = 0;
286 			base1 = zero;
287     		}
288 
289 		x += stridex;
290 		y += stridey;
291 		z += stridez;
292 		i = 2;
293 		if (--n <= 0)
294 			break;
295 
296 loop2:
297 		hy2 = *(int*)y;
298 		hx = *(int*)x;
299 		sign2 = one;
300 		sy = hy2 & 0x80000000;
301 		hy2 &= ~0x80000000;
302 
303 		sx = hx & 0x80000000;
304 		hx &= ~0x80000000;
305 
306 		if (hy2 > hx)
307 		{
308 			x2 = *y;
309 			y2 = *x;
310 			i = hx;
311 			hx = hy2;
312 			hy2 = i;
313 			if (sy)
314 			{
315 				x2 = -x2;
316 				sign2 = -sign2;
317 			}
318 			if (sx)
319 			{
320 				y2 = -y2;
321 				ah2 = pio2;
322 			}
323 			else
324 			{
325 				ah2 = -pio2;
326 				sign2 = -sign2;
327 			}
328 		}
329 		else
330 		{
331 			y2 = *y;
332 			x2 = *x;
333 			if (sy)
334 			{
335 				y2 = -y2;
336 				sign2 = -sign2;
337 			}
338 			if (sx)
339 			{
340 				x2 = -x2;
341 				ah2 = -pi;
342 				sign2 = -sign2;
343 			}
344 			else
345 				ah2 = zero;
346 		}
347 
348 		if (hx >= 0x7f800000 || hx - hy2 >= 0x0c800000)
349 		{
350 			if (hx >= 0x7f800000)
351 			{
352 				if (hx ^ 0x7f800000) /* nan */
353 					ah2 =  x2 + y2;
354 				else if (hy2 >= 0x7f800000)
355 					ah2 += pio4;
356 			}
357 			else if ((int) ah2 == 0)
358 				ah2 = y2 / x2;
359 			*z = (sign2 == one)? ah2 : -ah2;
360 			x += stridex;
361 			y += stridey;
362 			z += stridez;
363 			i = 2;
364 			if (--n <= 0)
365 				break;
366 			goto loop2;
367 		}
368 		if (hy2 < 0x00800000) {
369 			if (hy2 == 0)
370 			{
371 				*z = sign2 * (float) ah2;
372 				x += stridex;
373 				y += stridey;
374 				z += stridez;
375 				i = 2;
376 				if (--n <= 0)
377 					break;
378 				goto loop2;
379 			}
380 			y2 *= twop24; /* scale subnormal y */
381 			x2 *= twop24; /* scale possibly subnormal x */
382 			hy2 = *(int*)&y2;
383                         hx = *(int*)&x2;
384 		}
385 
386 		pz2 = z;
387 
388 		k2 = (hy2 - hx + 0x3f800000) & 0xfff80000;
389 		if (k2 >= 0x3C800000)          /* if |x| >= (1/64)... */
390     		{
391 			*(int*)&base2 = k2;
392        		 	k2 = (k2 - 0x3C800000) >> 18; /* (index >> 19) << 1) */
393 			k2 += 4;
394 				/* skip over 0,0,pi/2,pi/2 */
395     		}
396     		else                            /* |x| < 1/64 */
397     		{
398 			k2 = 0;
399 			base2 = zero;
400     		}
401 
402 		goto endloop;
403 
404 endloop:
405 
406 		ah2 += __vlibm_TBL_atan1[k2];
407 		ah1 += __vlibm_TBL_atan1[k1];
408 		ah0 += __vlibm_TBL_atan1[k0];
409 
410 		db2 = base2;
411 		db1 = base1;
412 		db0 = base0;
413 		dy2 = y2;
414 		dy1 = y1;
415 		dy0 = y0;
416 		dx2 = x2;
417 		dx1 = x1;
418 		dx0 = x0;
419 
420 		num2 = dy2 - dx2 * db2;
421 		den2 = dx2 + dy2 * db2;
422 
423 		num1 = dy1 - dx1 * db1;
424 		den1 = dx1 + dy1 * db1;
425 
426 		num0 = dy0 - dx0 * db0;
427 		den0 = dx0 + dy0 * db0;
428 
429 		t2 = num2 / den2;
430 		t1 = num1 / den1;
431 		t0 = num0 / den0;
432 
433 		sx2 = t2 * t2;
434 		sx1 = t1 * t1;
435 		sx0 = t0 * t0;
436 
437 		t2 += t2 * sx2 * (q1 + sx2 * q2);
438  		t1 += t1 * sx1 * (q1 + sx1 * q2);
439  		t0 += t0 * sx0 * (q1 + sx0 * q2);
440 
441 		t2 += ah2;
442 		t1 += ah1;
443 		t0 += ah0;
444 
445 		*pz2 = sign2 * t2;
446 		*pz1 = sign1 * t1;
447 		*pz0 = sign0 * t0;
448 
449 		x += stridex;
450 		y += stridey;
451 		z += stridez;
452 		i = 0;
453 	} while (--n > 0);
454 
455 	if (i > 1)
456 	{
457 		ah1 += __vlibm_TBL_atan1[k1];
458 		t1 = (y1 - x1 * (double)base1) /
459 			(x1 + y1 * (double)base1);
460 		sx1 = t1 * t1;
461  		t1 += t1 * sx1 * (q1 + sx1 * q2);
462 		t1 += ah1;
463 		*pz1 = sign1 * t1;
464 	}
465 
466 	if (i > 0)
467 	{
468 		ah0 += __vlibm_TBL_atan1[k0];
469 		t0 = (y0 - x0 * (double)base0) /
470 			(x0 + y0 * (double)base0);
471 		sx0 = t0 * t0;
472  		t0 += t0 * sx0 * (q1 + sx0 * q2);
473 		t0 += ah0;
474 		*pz0 = sign0 * t0;
475 	}
476 }
477