xref: /illumos-gate/usr/src/lib/libmvec/common/__vrhypot.c (revision 66582b606a8194f7f3ba5b3a3a6dca5b0d346361)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
24  */
25 /*
26  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
27  * Use is subject to license terms.
28  */
29 
30 #include <sys/isa_defs.h>
31 #include "libm_inlines.h"
32 
33 #ifdef _LITTLE_ENDIAN
34 #define HI(x)	*(1+(int*)x)
35 #define LO(x)	*(unsigned*)x
36 #else
37 #define HI(x)	*(int*)x
38 #define LO(x)	*(1+(unsigned*)x)
39 #endif
40 
41 #ifdef __RESTRICT
42 #define restrict _Restrict
43 #else
44 #define restrict
45 #endif
46 
47 /* double rhypot(double x, double y)
48  *
49  * Method :
50  *	1. Special cases:
51  *		x or  y = Inf					=> 0
52  *		x or  y = NaN					=> QNaN
53  *		x and y = 0					=> Inf + divide-by-zero
54  *	2. Computes rhypot(x,y):
55  *		rhypot(x,y) = m * sqrt(1/(xnm * xnm + ynm * ynm))
56  *	Where:
57  *		m = 1/max(|x|,|y|)
58  *		xnm = x * m
59  *		ynm = y * m
60  *
61  *	Compute 1/(xnm * xnm + ynm * ynm) by simulating
62  *	muti-precision arithmetic.
63  *
64  * Accuracy:
65  *	Maximum error observed: less than 0.869 ulp after 1.000.000.000
66  *	results.
67  */
68 
69 extern double sqrt(double);
70 extern double fabs(double);
71 
72 static const int __vlibm_TBL_rhypot[] = {
73 /* i = [0,127]
74  * TBL[i] = 0x3ff00000 + *(int*)&(1.0 / *(double*)&(0x3ff0000000000000ULL + (i << 45))); */
75  0x7fe00000,  0x7fdfc07f, 0x7fdf81f8,  0x7fdf4465,
76  0x7fdf07c1,  0x7fdecc07, 0x7fde9131,  0x7fde573a,
77  0x7fde1e1e,  0x7fdde5d6, 0x7fddae60,  0x7fdd77b6,
78  0x7fdd41d4,  0x7fdd0cb5, 0x7fdcd856,  0x7fdca4b3,
79  0x7fdc71c7,  0x7fdc3f8f, 0x7fdc0e07,  0x7fdbdd2b,
80  0x7fdbacf9,  0x7fdb7d6c, 0x7fdb4e81,  0x7fdb2036,
81  0x7fdaf286,  0x7fdac570, 0x7fda98ef,  0x7fda6d01,
82  0x7fda41a4,  0x7fda16d3, 0x7fd9ec8e,  0x7fd9c2d1,
83  0x7fd99999,  0x7fd970e4, 0x7fd948b0,  0x7fd920fb,
84  0x7fd8f9c1,  0x7fd8d301, 0x7fd8acb9,  0x7fd886e5,
85  0x7fd86186,  0x7fd83c97, 0x7fd81818,  0x7fd7f405,
86  0x7fd7d05f,  0x7fd7ad22, 0x7fd78a4c,  0x7fd767dc,
87  0x7fd745d1,  0x7fd72428, 0x7fd702e0,  0x7fd6e1f7,
88  0x7fd6c16c,  0x7fd6a13c, 0x7fd68168,  0x7fd661ec,
89  0x7fd642c8,  0x7fd623fa, 0x7fd60581,  0x7fd5e75b,
90  0x7fd5c988,  0x7fd5ac05, 0x7fd58ed2,  0x7fd571ed,
91  0x7fd55555,  0x7fd53909, 0x7fd51d07,  0x7fd50150,
92  0x7fd4e5e0,  0x7fd4cab8, 0x7fd4afd6,  0x7fd49539,
93  0x7fd47ae1,  0x7fd460cb, 0x7fd446f8,  0x7fd42d66,
94  0x7fd41414,  0x7fd3fb01, 0x7fd3e22c,  0x7fd3c995,
95  0x7fd3b13b,  0x7fd3991c, 0x7fd38138,  0x7fd3698d,
96  0x7fd3521c,  0x7fd33ae4, 0x7fd323e3,  0x7fd30d19,
97  0x7fd2f684,  0x7fd2e025, 0x7fd2c9fb,  0x7fd2b404,
98  0x7fd29e41,  0x7fd288b0, 0x7fd27350,  0x7fd25e22,
99  0x7fd24924,  0x7fd23456, 0x7fd21fb7,  0x7fd20b47,
100  0x7fd1f704,  0x7fd1e2ef, 0x7fd1cf06,  0x7fd1bb4a,
101  0x7fd1a7b9,  0x7fd19453, 0x7fd18118,  0x7fd16e06,
102  0x7fd15b1e,  0x7fd1485f, 0x7fd135c8,  0x7fd12358,
103  0x7fd11111,  0x7fd0fef0, 0x7fd0ecf5,  0x7fd0db20,
104  0x7fd0c971,  0x7fd0b7e6, 0x7fd0a681,  0x7fd0953f,
105  0x7fd08421,  0x7fd07326, 0x7fd0624d,  0x7fd05197,
106  0x7fd04104,  0x7fd03091, 0x7fd02040,  0x7fd01010,
107 };
108 
109 static const unsigned long long LCONST[] = {
110 0x3ff0000000000000ULL,	/* DONE = 1.0		*/
111 0x4000000000000000ULL,	/* DTWO = 2.0		*/
112 0x4230000000000000ULL,	/* D2ON36 = 2**36	*/
113 0x7fd0000000000000ULL,	/* D2ON1022 = 2**1022	*/
114 0x3cb0000000000000ULL,	/* D2ONM52 = 2**-52	*/
115 };
116 
117 #define RET_SC(I)										\
118 	px += stridex;										\
119 	py += stridey;										\
120 	pz += stridez;										\
121 	if (--n <= 0)										\
122 		break;										\
123 	goto start##I;
124 
125 #define RETURN(I, ret)										\
126 {												\
127 	pz[0] = (ret);										\
128 	RET_SC(I)										\
129 }
130 
131 #define PREP(I)											\
132 hx##I = HI(px);										\
133 hy##I = HI(py);										\
134 hx##I &= 0x7fffffff;										\
135 hy##I &= 0x7fffffff;										\
136 pz##I = pz;											\
137 if (hx##I >= 0x7ff00000 || hy##I >= 0x7ff00000)	/* |X| or |Y| = Inf,NaN */		\
138 {												\
139 	lx = LO(px);									\
140 	ly = LO(py);									\
141 	x = *px;										\
142 	y = *py;										\
143 	if (hx##I == 0x7ff00000 && lx == 0) res0 = 0.0;		/* |X| = Inf */		\
144 	else if (hy##I == 0x7ff00000 && ly == 0) res0 = 0.0;	/* |Y| = Inf */		\
145 	else res0 = fabs(x) + fabs(y);								\
146 												\
147 	RETURN (I, res0)									\
148 }												\
149 x##I = *px;											\
150 y##I = *py;											\
151 diff0 = hy##I - hx##I;										\
152 j0 = diff0 >> 31;										\
153 if (hx##I < 0x00100000 && hy##I < 0x00100000)	/* |X| and |Y| = subnormal or zero */		\
154 {												\
155 	lx = LO(px);									\
156 	ly = LO(py);									\
157 	x = x##I;										\
158 	y = y##I;										\
159 												\
160 	if ((hx##I | hy##I | lx | ly) == 0)	/* |X| and |Y| = 0 */				\
161 		RETURN (I, DONE / 0.0)							\
162 												\
163 	x = fabs(x);										\
164 	y = fabs(y);										\
165 												\
166 	x = *(long long*)&x;									\
167 	y = *(long long*)&y;									\
168 												\
169 	x *= D2ONM52;										\
170 	y *= D2ONM52;										\
171 												\
172 	x_hi0 = (x + D2ON36) - D2ON36;							\
173 	y_hi0 = (y + D2ON36) - D2ON36;							\
174 	x_lo0 = x - x_hi0;									\
175 	y_lo0 = y - y_hi0;									\
176 	res0_hi = (x_hi0 * x_hi0 + y_hi0 * y_hi0);						\
177 	res0_lo = ((x + x_hi0) * x_lo0 + (y + y_hi0) * y_lo0);					\
178 												\
179 	dres0 = res0_hi + res0_lo;								\
180 												\
181 	iarr0 = HI(&dres0);								\
182 	iexp0 = iarr0 & 0xfff00000;								\
183 												\
184 	iarr0 = (iarr0 >> 11) & 0x1fc;								\
185 	itbl0 = ((int*)((char*)__vlibm_TBL_rhypot + iarr0))[0];					\
186 	itbl0 -= iexp0;										\
187 	HI(&dd0) = itbl0;						\
188 	LO(&dd0) = 0;								\
189 												\
190 	dd0 = dd0 * (DTWO - dd0 * dres0);							\
191 	dd0 = dd0 * (DTWO - dd0 * dres0);							\
192 	dres0 = dd0 * (DTWO - dd0 * dres0);							\
193 												\
194 	HI(&res0) = HI(&dres0) & 0xffffff00;					\
195 	LO(&res0) = 0;								\
196 	res0 += (DONE - res0_hi * res0 - res0_lo * res0) * dres0;				\
197 	res0 = sqrt (res0);									\
198 												\
199 	res0 = D2ON1022 * res0;									\
200 	RETURN (I, res0)									\
201 }												\
202 j0 = hy##I - (diff0 & j0);									\
203 j0 &= 0x7ff00000;										\
204 HI(&scl##I) = 0x7ff00000 - j0;
205 
206 void
207 __vrhypot(int n, double * restrict px, int stridex, double * restrict py,
208 	int stridey, double * restrict pz, int stridez)
209 {
210 	int		i = 0;
211 	double		x, y;
212 	double		x_hi0, x_lo0, y_hi0, y_lo0, scl0 = 0;
213 	double		x0, y0, res0, dd0;
214 	double		res0_hi,res0_lo, dres0;
215 	double		x_hi1, x_lo1, y_hi1, y_lo1, scl1 = 0;
216 	double		x1 = 0.0L, y1 = 0.0L, res1, dd1;
217 	double		res1_hi,res1_lo, dres1;
218 	double		x_hi2, x_lo2, y_hi2, y_lo2, scl2 = 0;
219 	double		x2, y2, res2, dd2;
220 	double		res2_hi,res2_lo, dres2;
221 
222 	int		hx0, hy0, j0, diff0;
223 	int		iarr0, iexp0, itbl0;
224 	int		hx1, hy1;
225 	int		iarr1, iexp1, itbl1;
226 	int		hx2, hy2;
227 	int		iarr2, iexp2, itbl2;
228 
229 	int		lx, ly;
230 
231 	double		DONE = ((double*)LCONST)[0];
232 	double		DTWO = ((double*)LCONST)[1];
233 	double		D2ON36 = ((double*)LCONST)[2];
234 	double		D2ON1022 = ((double*)LCONST)[3];
235 	double		D2ONM52 = ((double*)LCONST)[4];
236 
237 	double		*pz0, *pz1 = 0, *pz2;
238 
239 	do
240 	{
241 start0:
242 		PREP(0)
243 		px += stridex;
244 		py += stridey;
245 		pz += stridez;
246 		i = 1;
247 		if (--n <= 0)
248 			break;
249 
250 start1:
251 		PREP(1)
252 		px += stridex;
253 		py += stridey;
254 		pz += stridez;
255 		i = 2;
256 		if (--n <= 0)
257 			break;
258 
259 start2:
260 		PREP(2)
261 
262 		x0 *= scl0;
263 		y0 *= scl0;
264 		x1 *= scl1;
265 		y1 *= scl1;
266 		x2 *= scl2;
267 		y2 *= scl2;
268 
269 		x_hi0 = (x0 + D2ON36) - D2ON36;
270 		y_hi0 = (y0 + D2ON36) - D2ON36;
271 		x_hi1 = (x1 + D2ON36) - D2ON36;
272 		y_hi1 = (y1 + D2ON36) - D2ON36;
273 		x_hi2 = (x2 + D2ON36) - D2ON36;
274 		y_hi2 = (y2 + D2ON36) - D2ON36;
275 		x_lo0 = x0 - x_hi0;
276 		y_lo0 = y0 - y_hi0;
277 		x_lo1 = x1 - x_hi1;
278 		y_lo1 = y1 - y_hi1;
279 		x_lo2 = x2 - x_hi2;
280 		y_lo2 = y2 - y_hi2;
281 		res0_hi = (x_hi0 * x_hi0 + y_hi0 * y_hi0);
282 		res1_hi = (x_hi1 * x_hi1 + y_hi1 * y_hi1);
283 		res2_hi = (x_hi2 * x_hi2 + y_hi2 * y_hi2);
284 		res0_lo = ((x0 + x_hi0) * x_lo0 + (y0 + y_hi0) * y_lo0);
285 		res1_lo = ((x1 + x_hi1) * x_lo1 + (y1 + y_hi1) * y_lo1);
286 		res2_lo = ((x2 + x_hi2) * x_lo2 + (y2 + y_hi2) * y_lo2);
287 
288 		dres0 = res0_hi + res0_lo;
289 		dres1 = res1_hi + res1_lo;
290 		dres2 = res2_hi + res2_lo;
291 
292 		iarr0 = HI(&dres0);
293 		iarr1 = HI(&dres1);
294 		iarr2 = HI(&dres2);
295 		iexp0 = iarr0 & 0xfff00000;
296 		iexp1 = iarr1 & 0xfff00000;
297 		iexp2 = iarr2 & 0xfff00000;
298 
299 		iarr0 = (iarr0 >> 11) & 0x1fc;
300 		iarr1 = (iarr1 >> 11) & 0x1fc;
301 		iarr2 = (iarr2 >> 11) & 0x1fc;
302 		itbl0 = ((int*)((char*)__vlibm_TBL_rhypot + iarr0))[0];
303 		itbl1 = ((int*)((char*)__vlibm_TBL_rhypot + iarr1))[0];
304 		itbl2 = ((int*)((char*)__vlibm_TBL_rhypot + iarr2))[0];
305 		itbl0 -= iexp0;
306 		itbl1 -= iexp1;
307 		itbl2 -= iexp2;
308 		HI(&dd0) = itbl0;
309 		HI(&dd1) = itbl1;
310 		HI(&dd2) = itbl2;
311 		LO(&dd0) = 0;
312 		LO(&dd1) = 0;
313 		LO(&dd2) = 0;
314 
315 		dd0 = dd0 * (DTWO - dd0 * dres0);
316 		dd1 = dd1 * (DTWO - dd1 * dres1);
317 		dd2 = dd2 * (DTWO - dd2 * dres2);
318 		dd0 = dd0 * (DTWO - dd0 * dres0);
319 		dd1 = dd1 * (DTWO - dd1 * dres1);
320 		dd2 = dd2 * (DTWO - dd2 * dres2);
321 		dres0 = dd0 * (DTWO - dd0 * dres0);
322 		dres1 = dd1 * (DTWO - dd1 * dres1);
323 		dres2 = dd2 * (DTWO - dd2 * dres2);
324 
325 		HI(&res0) = HI(&dres0) & 0xffffff00;
326 		HI(&res1) = HI(&dres1) & 0xffffff00;
327 		HI(&res2) = HI(&dres2) & 0xffffff00;
328 		LO(&res0) = 0;
329 		LO(&res1) = 0;
330 		LO(&res2) = 0;
331 		res0 += (DONE - res0_hi * res0 - res0_lo * res0) * dres0;
332 		res1 += (DONE - res1_hi * res1 - res1_lo * res1) * dres1;
333 		res2 += (DONE - res2_hi * res2 - res2_lo * res2) * dres2;
334 		res0 = sqrt (res0);
335 		res1 = sqrt (res1);
336 		res2 = sqrt (res2);
337 
338 		res0 = scl0 * res0;
339 		res1 = scl1 * res1;
340 		res2 = scl2 * res2;
341 
342 		*pz0 = res0;
343 		*pz1 = res1;
344 		*pz2 = res2;
345 
346 		px += stridex;
347 		py += stridey;
348 		pz += stridez;
349 		i = 0;
350 
351 	} while (--n > 0);
352 
353 	if (i > 0)
354 	{
355 		x0 *= scl0;
356 		y0 *= scl0;
357 
358 		x_hi0 = (x0 + D2ON36) - D2ON36;
359 		y_hi0 = (y0 + D2ON36) - D2ON36;
360 		x_lo0 = x0 - x_hi0;
361 		y_lo0 = y0 - y_hi0;
362 		res0_hi = (x_hi0 * x_hi0 + y_hi0 * y_hi0);
363 		res0_lo = ((x0 + x_hi0) * x_lo0 + (y0 + y_hi0) * y_lo0);
364 
365 		dres0 = res0_hi + res0_lo;
366 
367 		iarr0 = HI(&dres0);
368 		iexp0 = iarr0 & 0xfff00000;
369 
370 		iarr0 = (iarr0 >> 11) & 0x1fc;
371 		itbl0 = ((int*)((char*)__vlibm_TBL_rhypot + iarr0))[0];
372 		itbl0 -= iexp0;
373 		HI(&dd0) = itbl0;
374 		LO(&dd0) = 0;
375 
376 		dd0 = dd0 * (DTWO - dd0 * dres0);
377 		dd0 = dd0 * (DTWO - dd0 * dres0);
378 		dres0 = dd0 * (DTWO - dd0 * dres0);
379 
380 		HI(&res0) = HI(&dres0) & 0xffffff00;
381 		LO(&res0) = 0;
382 		res0 += (DONE - res0_hi * res0 - res0_lo * res0) * dres0;
383 		res0 = sqrt (res0);
384 
385 		res0 = scl0 * res0;
386 
387 		*pz0 = res0;
388 
389 		if (i > 1)
390 		{
391 			x1 *= scl1;
392 			y1 *= scl1;
393 
394 			x_hi1 = (x1 + D2ON36) - D2ON36;
395 			y_hi1 = (y1 + D2ON36) - D2ON36;
396 			x_lo1 = x1 - x_hi1;
397 			y_lo1 = y1 - y_hi1;
398 			res1_hi = (x_hi1 * x_hi1 + y_hi1 * y_hi1);
399 			res1_lo = ((x1 + x_hi1) * x_lo1 + (y1 + y_hi1) * y_lo1);
400 
401 			dres1 = res1_hi + res1_lo;
402 
403 			iarr1 = HI(&dres1);
404 			iexp1 = iarr1 & 0xfff00000;
405 
406 			iarr1 = (iarr1 >> 11) & 0x1fc;
407 			itbl1 = ((int*)((char*)__vlibm_TBL_rhypot + iarr1))[0];
408 			itbl1 -= iexp1;
409 			HI(&dd1) = itbl1;
410 			LO(&dd1) = 0;
411 
412 			dd1 = dd1 * (DTWO - dd1 * dres1);
413 			dd1 = dd1 * (DTWO - dd1 * dres1);
414 			dres1 = dd1 * (DTWO - dd1 * dres1);
415 
416 			HI(&res1) = HI(&dres1) & 0xffffff00;
417 			LO(&res1) = 0;
418 			res1 += (DONE - res1_hi * res1 - res1_lo * res1) * dres1;
419 			res1 = sqrt (res1);
420 
421 			res1 = scl1 * res1;
422 
423 			*pz1 = res1;
424 		}
425 	}
426 }
427