xref: /illumos-gate/usr/src/lib/libmvec/common/__vhypot.c (revision fcdb3229a31dd4ff700c69238814e326aad49098)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
24  */
25 /*
26  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
27  * Use is subject to license terms.
28  */
29 
30 #include <sys/isa_defs.h>
31 #include "libm_inlines.h"
32 
33 #ifdef _LITTLE_ENDIAN
34 #define	HI(x)	*(1+(int *)x)
35 #define	LO(x)	*(unsigned *)x
36 #else
37 #define	HI(x)	*(int *)x
38 #define	LO(x)	*(1+(unsigned *)x)
39 #endif
40 
41 #ifdef __RESTRICT
42 #define	restrict _Restrict
43 #else
44 #define	restrict
45 #endif
46 
47 /*
48  * double hypot(double x, double y)
49  *
50  * Method :
51  *	1. Special cases:
52  *		x or y is +Inf or -Inf				=> +Inf
53  *		x or y is NaN					=> QNaN
54  *	2. Computes hypot(x,y):
55  *		hypot(x,y) = m * sqrt(xnm * xnm + ynm * ynm)
56  *	Where:
57  *		m = max(|x|,|y|)
58  *		xnm = x * (1/m)
59  *		ynm = y * (1/m)
60  *
61  *	Compute xnm * xnm + ynm * ynm by simulating
62  *	muti-precision arithmetic.
63  *
64  * Accuracy:
65  *	Maximum error observed: less than 0.872 ulp after 16.777.216.000
66  *	results.
67  */
68 
69 extern double sqrt(double);
70 extern double fabs(double);
71 
72 static const unsigned long long LCONST[] = {
73 0x41b0000000000000ULL,	/* D2ON28 = 2 ** 28		*/
74 0x0010000000000000ULL,	/* D2ONM1022 = 2 ** -1022	*/
75 0x7fd0000000000000ULL	/* D2ONP1022 = 2 **  1022	*/
76 };
77 
78 static void
79 __vhypot_n(int n, double *restrict px, int stridex, double *restrict py,
80     int stridey, double *restrict pz, int stridez);
81 
82 #define	RETURN(ret)						\
83 {								\
84 	*pz = (ret);						\
85 	py += stridey;						\
86 	pz += stridez;						\
87 	if (n_n == 0)						\
88 	{							\
89 		hx0 = HI(px);					\
90 		hy0 = HI(py);					\
91 		spx = px;					\
92 		spy = py;					\
93 		spz = pz;					\
94 		continue;					\
95 	}							\
96 	n--;							\
97 	break;							\
98 }
99 
100 void
__vhypot(int n,double * restrict px,int stridex,double * restrict py,int stridey,double * restrict pz,int stridez)101 __vhypot(int n, double *restrict px, int stridex, double *restrict py,
102     int stridey, double *restrict pz, int stridez)
103 {
104 	int		hx0, hx1, hy0, j0, diff;
105 	double		x_hi, x_lo, y_hi, y_lo;
106 	double		scl = 0;
107 	double		x, y, res;
108 	double		*spx, *spy, *spz;
109 	int		n_n;
110 	double		D2ON28 = ((double *)LCONST)[0];	 /* 2 ** 28	*/
111 	double		D2ONM1022 = ((double *)LCONST)[1]; /* 2 **-1022	*/
112 	double		D2ONP1022 = ((double *)LCONST)[2]; /* 2 ** 1022	*/
113 
114 	while (n > 1) {
115 		n_n = 0;
116 		spx = px;
117 		spy = py;
118 		spz = pz;
119 		hx0 = HI(px);
120 		hy0 = HI(py);
121 		for (; n > 1; n--) {
122 			px += stridex;
123 			hx0 &= 0x7fffffff;
124 			hy0 &= 0x7fffffff;
125 
126 			if (hx0 >= 0x7fe00000) {
127 				/* |X| >= 2**1023 or Inf or NaN */
128 				diff = hy0 - hx0;
129 				j0 = diff >> 31;
130 				j0 = hy0 - (diff & j0);
131 				j0 &= 0x7ff00000;
132 				x = *(px - stridex);
133 				y = *py;
134 				x = fabs(x);
135 				y = fabs(y);
136 				if (j0 >= 0x7ff00000) {
137 					/* |X| or |Y| = Inf or NaN */
138 					int lx = LO((px - stridex));
139 					int ly = LO(py);
140 					if (hx0 == 0x7ff00000 && lx == 0)
141 						res = x == y ? y : x;
142 					else if (hy0 == 0x7ff00000 && ly == 0)
143 						res = x == y ? x : y;
144 					else
145 						res = x + y;
146 					RETURN(res)
147 				} else {
148 					j0 = diff >> 31;
149 					if (((diff ^ j0) - j0) < 0x03600000) {
150 						/*
151 						 * max(|X|,|Y|)/min(|X|,|Y|) <
152 						 * 2**54
153 						 */
154 						x *= D2ONM1022;
155 						y *= D2ONM1022;
156 
157 						x_hi = (x + D2ON28) - D2ON28;
158 						x_lo = x - x_hi;
159 						y_hi = (y + D2ON28) - D2ON28;
160 						y_lo = y - y_hi;
161 						res = x_hi * x_hi + y_hi * y_hi;
162 						res += ((x + x_hi) * x_lo +
163 						    (y + y_hi) * y_lo);
164 
165 						res = sqrt(res);
166 
167 						res = D2ONP1022 * res;
168 						RETURN(res)
169 					} else {
170 						RETURN(x + y)
171 					}
172 				}
173 			}
174 			if (hy0 >= 0x7fe00000) {
175 				/* |Y| >= 2**1023 or Inf or NaN */
176 				diff = hy0 - hx0;
177 				j0 = diff >> 31;
178 				j0 = hy0 - (diff & j0);
179 				j0 &= 0x7ff00000;
180 				x = *(px - stridex);
181 				y = *py;
182 				x = fabs(x);
183 				y = fabs(y);
184 				if (j0 >= 0x7ff00000) {
185 					/* |X| or |Y| = Inf or NaN */
186 					int lx = LO((px - stridex));
187 					int ly = LO(py);
188 					if (hx0 == 0x7ff00000 && lx == 0)
189 						res = x == y ? y : x;
190 					else if (hy0 == 0x7ff00000 && ly == 0)
191 						res = x == y ? x : y;
192 					else
193 						res = x + y;
194 					RETURN(res)
195 				} else {
196 					j0 = diff >> 31;
197 					if (((diff ^ j0) - j0) < 0x03600000) {
198 						/*
199 						 * max(|X|,|Y|)/min(|X|,|Y|) <
200 						 * 2**54
201 						 */
202 						x *= D2ONM1022;
203 						y *= D2ONM1022;
204 
205 						x_hi = (x + D2ON28) - D2ON28;
206 						x_lo = x - x_hi;
207 						y_hi = (y + D2ON28) - D2ON28;
208 						y_lo = y - y_hi;
209 						res = x_hi * x_hi + y_hi * y_hi;
210 						res += ((x + x_hi) * x_lo +
211 						    (y + y_hi) * y_lo);
212 
213 						res = sqrt(res);
214 
215 						res = D2ONP1022 * res;
216 						RETURN(res)
217 					} else {
218 						RETURN(x + y)
219 					}
220 				}
221 			}
222 
223 			hx1 = HI(px);
224 
225 			if (hx0 < 0x00100000 && hy0 < 0x00100000) {
226 				/* X and Y are subnormal */
227 				x = *(px - stridex);
228 				y = *py;
229 
230 				x *= D2ONP1022;
231 				y *= D2ONP1022;
232 
233 				x_hi = (x + D2ON28) - D2ON28;
234 				x_lo = x - x_hi;
235 				y_hi = (y + D2ON28) - D2ON28;
236 				y_lo = y - y_hi;
237 				res = (x_hi * x_hi + y_hi * y_hi);
238 				res += ((x + x_hi) * x_lo + (y + y_hi) * y_lo);
239 
240 				res = sqrt(res);
241 
242 				res = D2ONM1022 * res;
243 				RETURN(res)
244 			}
245 
246 			hx0 = hx1;
247 			py += stridey;
248 			pz += stridez;
249 			n_n++;
250 			hy0 = HI(py);
251 		}
252 		if (n_n > 0)
253 			__vhypot_n(n_n, spx, stridex, spy, stridey, spz,
254 			    stridez);
255 	}
256 
257 	if (n > 0) {
258 		x = *px;
259 		y = *py;
260 		hx0 = HI(px);
261 		hy0 = HI(py);
262 
263 		hx0 &= 0x7fffffff;
264 		hy0 &= 0x7fffffff;
265 
266 		diff = hy0 - hx0;
267 		j0 = diff >> 31;
268 		j0 = hy0 - (diff & j0);
269 		j0 &= 0x7ff00000;
270 
271 		if (j0 >= 0x7fe00000) {
272 			/* max(|X|,|Y|) >= 2**1023 or X or Y = Inf or NaN */
273 			x = fabs(x);
274 			y = fabs(y);
275 			if (j0 >= 0x7ff00000)	/* |X| or |Y| = Inf or NaN */
276 			{
277 				int lx = LO(px);
278 				int ly = LO(py);
279 				if (hx0 == 0x7ff00000 && lx == 0)
280 					res = x == y ? y : x;
281 				else if (hy0 == 0x7ff00000 && ly == 0)
282 					res = x == y ? x : y;
283 				else
284 					res = x + y;
285 				*pz = res;
286 				return;
287 			} else {
288 				j0 = diff >> 31;
289 				if (((diff ^ j0) - j0) < 0x03600000) {
290 					/* max(|X|,|Y|)/min(|X|,|Y|) < 2**54 */
291 					x *= D2ONM1022;
292 					y *= D2ONM1022;
293 
294 					x_hi = (x + D2ON28) - D2ON28;
295 					x_lo = x - x_hi;
296 					y_hi = (y + D2ON28) - D2ON28;
297 					y_lo = y - y_hi;
298 					res = (x_hi * x_hi + y_hi * y_hi);
299 					res += ((x + x_hi) * x_lo +
300 					    (y + y_hi) * y_lo);
301 
302 					res = sqrt(res);
303 
304 					res = D2ONP1022 * res;
305 					*pz = res;
306 					return;
307 				} else {
308 					*pz = x + y;
309 					return;
310 				}
311 			}
312 		}
313 
314 		if (j0 < 0x00100000) {	/* X and Y are subnormal */
315 			x *= D2ONP1022;
316 			y *= D2ONP1022;
317 
318 			x_hi = (x + D2ON28) - D2ON28;
319 			x_lo = x - x_hi;
320 			y_hi = (y + D2ON28) - D2ON28;
321 			y_lo = y - y_hi;
322 			res = (x_hi * x_hi + y_hi * y_hi);
323 			res += ((x + x_hi) * x_lo + (y + y_hi) * y_lo);
324 
325 			res = sqrt(res);
326 
327 			res = D2ONM1022 * res;
328 			*pz = res;
329 			return;
330 		}
331 
332 		HI(&scl) = (0x7fe00000 - j0);
333 
334 		x *= scl;
335 		y *= scl;
336 
337 		x_hi = (x + D2ON28) - D2ON28;
338 		y_hi = (y + D2ON28) - D2ON28;
339 		x_lo = x - x_hi;
340 		y_lo = y - y_hi;
341 
342 		res = (x_hi * x_hi + y_hi * y_hi);
343 		res += ((x + x_hi) * x_lo + (y + y_hi) * y_lo);
344 
345 		res = sqrt(res);
346 
347 		HI(&scl) = j0;
348 
349 		res = scl * res;
350 		*pz = res;
351 	}
352 }
353 
354 static void
__vhypot_n(int n,double * restrict px,int stridex,double * restrict py,int stridey,double * restrict pz,int stridez)355 __vhypot_n(int n, double *restrict px, int stridex, double *restrict py,
356     int stridey, double *restrict pz, int stridez)
357 {
358 	int		hx0, hy0, j0, diff0;
359 	double		x_hi0, x_lo0, y_hi0, y_lo0, scl0 = 0;
360 	double		x0, y0, res0;
361 	double		D2ON28 = ((double *)LCONST)[0]; /* 2 ** 28 */
362 
363 	for (; n > 0; n--) {
364 		x0 = *px;
365 		y0 = *py;
366 		hx0 = HI(px);
367 		hy0 = HI(py);
368 
369 		hx0 &= 0x7fffffff;
370 		hy0 &= 0x7fffffff;
371 
372 		diff0 = hy0 - hx0;
373 		j0 = diff0 >> 31;
374 		j0 = hy0 - (diff0 & j0);
375 		j0 &= 0x7ff00000;
376 
377 		px += stridex;
378 		py += stridey;
379 
380 		HI(&scl0) = (0x7fe00000 - j0);
381 
382 		x0 *= scl0;
383 		y0 *= scl0;
384 
385 		x_hi0 = (x0 + D2ON28) - D2ON28;
386 		y_hi0 = (y0 + D2ON28) - D2ON28;
387 		x_lo0 = x0 - x_hi0;
388 		y_lo0 = y0 - y_hi0;
389 
390 		res0 = (x_hi0 * x_hi0 + y_hi0 * y_hi0);
391 		res0 += ((x0 + x_hi0) * x_lo0 + (y0 + y_hi0) * y_lo0);
392 
393 		res0 = sqrt(res0);
394 
395 		HI(&scl0) = j0;
396 
397 		res0 = scl0 * res0;
398 		*pz = res0;
399 
400 		pz += stridez;
401 	}
402 }
403