xref: /illumos-gate/usr/src/lib/libmvec/common/__vrsqrt.c (revision fcdb3229a31dd4ff700c69238814e326aad49098)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
24  */
25 /*
26  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
27  * Use is subject to license terms.
28  */
29 
30 #include <sys/isa_defs.h>
31 #include "libm_inlines.h"
32 
33 #ifdef _LITTLE_ENDIAN
34 #define	HI(x)	*(1+(int *)x)
35 #define	LO(x)	*(unsigned *)x
36 #else
37 #define	HI(x)	*(int *)x
38 #define	LO(x)	*(1+(unsigned *)x)
39 #endif
40 
41 #ifdef __RESTRICT
42 #define	restrict _Restrict
43 #else
44 #define	restrict
45 #endif
46 
47 /*
48  * double rsqrt(double x)
49  *
50  * Method :
51  *	1. Special cases:
52  *		for x = NaN			=> QNaN;
53  *		for x = +Inf			=> 0;
54  *		for x is negative, -Inf		=> QNaN + invalid;
55  *		for x = +0			=> +Inf + divide-by-zero;
56  *		for x = -0			=> -Inf + divide-by-zero.
57  *	2. Computes reciprocal square root from:
58  *		x = m * 2**n
59  *	Where:
60  *		m = [0.5, 2),
61  *		n = ((exponent + 1) & ~1).
62  *	Then:
63  *		rsqrt(x) = 1/sqrt( m * 2**n ) = (2 ** (-n/2)) * (1/sqrt(m))
64  *	2. Computes 1/sqrt(m) from:
65  *		1/sqrt(m) = (1/sqrt(m0)) * (1/sqrt(1 + (1/m0)*dm))
66  *	Where:
67  *		m = m0 + dm,
68  *		m0 = 0.5 * (1 + k/64) for m = [0.5, 0.5+127/256), k = [0, 63];
69  *		m0 = 1.0 * (0 + k/64) for m = [0.5+127/256, 1.0+127/128),
70  *		    k = [64, 127];
71  *		m0 = 2.0              for m = [1.0+127/128, 2.0), k = 128.
72  *	Then:
73  *		1/sqrt(m0) is looked up in a table,
74  *		1/m0 is computed as (1/sqrt(m0)) * (1/sqrt(m0)).
75  *		1/sqrt(1 + (1/m0)*dm) is computed using approximation:
76  *			1/sqrt(1 + z) = (((((a6 * z + a5) * z + a4) * z + a3)
77  *						* z + a2) * z + a1) * z + a0
78  *			where z = [-1/128, 1/128].
79  *
80  * Accuracy:
81  *	The maximum relative error for the approximating
82  *	polynomial is 2**(-56.26).
83  *	Maximum error observed: less than 0.563 ulp after 1.500.000.000
84  *	results.
85  */
86 
87 extern double sqrt(double);
88 extern const double __vlibm_TBL_rsqrt[];
89 
90 static void
91 __vrsqrt_n(int n, double *restrict px, int stridex, double *restrict py,
92     int stridey);
93 
94 #define	RETURN(ret)						\
95 {								\
96 	*py = (ret);						\
97 	py += stridey;						\
98 	if (n_n == 0)						\
99 	{							\
100 		spx = px;					\
101 		spy = py;					\
102 		hx = HI(px);					\
103 		continue;					\
104 	}							\
105 	n--;							\
106 	break;							\
107 }
108 
109 static const double
110 	DONE = 1.0,
111 	K1 = -5.00000000000005209867e-01,
112 	K2 =  3.75000000000004884257e-01,
113 	K3 = -3.12499999317136886551e-01,
114 	K4 =  2.73437499359815081532e-01,
115 	K5 = -2.46116125605037803130e-01,
116 	K6 =  2.25606914648617522896e-01;
117 
118 void
__vrsqrt(int n,double * restrict px,int stridex,double * restrict py,int stridey)119 __vrsqrt(int n, double *restrict px, int stridex, double *restrict py,
120     int stridey)
121 {
122 	double		*spx, *spy;
123 	int		ax, lx, hx, n_n;
124 	double		res;
125 
126 	while (n > 1) {
127 		n_n = 0;
128 		spx = px;
129 		spy = py;
130 		hx = HI(px);
131 		for (; n > 1; n--) {
132 			px += stridex;
133 			if (hx >= 0x7ff00000) {	/* X = NaN or Inf */
134 				res = *(px - stridex);
135 				RETURN(DONE / res)
136 			}
137 
138 			py += stridey;
139 
140 			if (hx < 0x00100000) {
141 				/* X = denormal, zero or negative */
142 				py -= stridey;
143 				ax = hx & 0x7fffffff;
144 				lx = LO((px - stridex));
145 				res = *(px - stridex);
146 
147 				if ((ax | lx) == 0) {	/* |X| = zero	*/
148 					RETURN(DONE / res)
149 				} else if (hx >= 0) {	/* X = denormal	*/
150 					double		res_c0, dsqrt_exp0;
151 					int		ind0, sqrt_exp0;
152 					double		xx0, dexp_hi0, dexp_lo0;
153 					int		hx0, resh0, res_ch0;
154 
155 					res = *(long long *)&res;
156 
157 					hx0 = HI(&res);
158 					sqrt_exp0 = (0x817 - (hx0 >> 21)) << 20;
159 					ind0 = (((hx0 >> 10) & 0x7f8) + 8) &
160 					    -16;
161 
162 					resh0 = (hx0 & 0x001fffff) | 0x3fe00000;
163 					res_ch0 = (resh0 + 0x00002000) &
164 					    0x7fffc000;
165 					HI(&res) = resh0;
166 					HI(&res_c0) = res_ch0;
167 					LO(&res_c0) = 0;
168 
169 					dexp_hi0 = ((double *)((char *)
170 					    __vlibm_TBL_rsqrt + ind0))[0];
171 					dexp_lo0 = ((double *)((char *)
172 					    __vlibm_TBL_rsqrt + ind0))[1];
173 					xx0 = dexp_hi0 * dexp_hi0;
174 					xx0 = (res - res_c0) * xx0;
175 					res = (((((K6 * xx0 + K5) * xx0 + K4) *
176 					    xx0 + K3) * xx0 + K2) * xx0 + K1) *
177 					    xx0;
178 
179 					res = dexp_hi0 * res + dexp_lo0 +
180 					    dexp_hi0;
181 
182 					HI(&dsqrt_exp0) = sqrt_exp0;
183 					LO(&dsqrt_exp0) = 0;
184 					res *= dsqrt_exp0;
185 
186 					RETURN(res)
187 				} else {	/* X = negative	*/
188 					RETURN(sqrt(res))
189 				}
190 			}
191 			n_n++;
192 			hx = HI(px);
193 		}
194 		if (n_n > 0)
195 			__vrsqrt_n(n_n, spx, stridex, spy, stridey);
196 	}
197 	if (n > 0) {
198 		hx = HI(px);
199 
200 		if (hx >= 0x7ff00000) {		/* X = NaN or Inf	*/
201 			res = *px;
202 			*py = DONE / res;
203 		} else if (hx < 0x00100000) {
204 			/* X = denormal, zero or negative */
205 			ax = hx & 0x7fffffff;
206 			lx = LO(px);
207 			res = *px;
208 
209 			if ((ax | lx) == 0) {	/* |X| = zero	*/
210 				*py = DONE / res;
211 			} else if (hx >= 0) {	/* X = denormal	*/
212 				double		res_c0, dsqrt_exp0;
213 				int		ind0, sqrt_exp0;
214 				double		xx0, dexp_hi0, dexp_lo0;
215 				int		hx0, resh0, res_ch0;
216 
217 				res = *(long long *)&res;
218 
219 				hx0 = HI(&res);
220 				sqrt_exp0 = (0x817 - (hx0 >> 21)) << 20;
221 				ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16;
222 
223 				resh0 = (hx0 & 0x001fffff) | 0x3fe00000;
224 				res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
225 				HI(&res) = resh0;
226 				HI(&res_c0) = res_ch0;
227 				LO(&res_c0) = 0;
228 
229 				dexp_hi0 = ((double *)((char *)
230 				    __vlibm_TBL_rsqrt + ind0))[0];
231 				dexp_lo0 = ((double *)((char *)
232 				    __vlibm_TBL_rsqrt + ind0))[1];
233 				xx0 = dexp_hi0 * dexp_hi0;
234 				xx0 = (res - res_c0) * xx0;
235 				res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 +
236 				    K3) * xx0 + K2) * xx0 + K1) * xx0;
237 
238 				res = dexp_hi0 * res + dexp_lo0 + dexp_hi0;
239 
240 				HI(&dsqrt_exp0) = sqrt_exp0;
241 				LO(&dsqrt_exp0) = 0;
242 				res *= dsqrt_exp0;
243 
244 				*py = res;
245 			} else { /* X = negative	*/
246 				*py = sqrt(res);
247 			}
248 		} else {
249 			double		res_c0, dsqrt_exp0;
250 			int		ind0, sqrt_exp0;
251 			double		xx0, dexp_hi0, dexp_lo0;
252 			int		resh0, res_ch0;
253 
254 			sqrt_exp0 = (0x5fe - (hx >> 21)) << 20;
255 			ind0 = (((hx >> 10) & 0x7f8) + 8) & -16;
256 
257 			resh0 = (hx & 0x001fffff) | 0x3fe00000;
258 			res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
259 			HI(&res) = resh0;
260 			LO(&res) = LO(px);
261 			HI(&res_c0) = res_ch0;
262 			LO(&res_c0) = 0;
263 
264 			dexp_hi0 = ((double *)((char *)
265 			    __vlibm_TBL_rsqrt + ind0))[0];
266 			dexp_lo0 = ((double *)((char *)
267 			    __vlibm_TBL_rsqrt + ind0))[1];
268 			xx0 = dexp_hi0 * dexp_hi0;
269 			xx0 = (res - res_c0) * xx0;
270 			res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) *
271 			    xx0 + K2) * xx0 + K1) * xx0;
272 
273 			res = dexp_hi0 * res + dexp_lo0 + dexp_hi0;
274 
275 			HI(&dsqrt_exp0) = sqrt_exp0;
276 			LO(&dsqrt_exp0) = 0;
277 			res *= dsqrt_exp0;
278 
279 			*py = res;
280 		}
281 	}
282 }
283 
284 static void
__vrsqrt_n(int n,double * restrict px,int stridex,double * restrict py,int stridey)285 __vrsqrt_n(int n, double *restrict px, int stridex, double *restrict py,
286     int stridey)
287 {
288 	double		res0, res_c0, dsqrt_exp0;
289 	double		res1, res_c1, dsqrt_exp1;
290 	double		res2, res_c2, dsqrt_exp2;
291 	int		ind0, sqrt_exp0;
292 	int		ind1, sqrt_exp1;
293 	int		ind2, sqrt_exp2;
294 	double		xx0, dexp_hi0, dexp_lo0;
295 	double		xx1, dexp_hi1, dexp_lo1;
296 	double		xx2, dexp_hi2, dexp_lo2;
297 	int		hx0, resh0, res_ch0;
298 	int		hx1, resh1, res_ch1;
299 	int		hx2, resh2, res_ch2;
300 
301 	LO(&dsqrt_exp0) = 0;
302 	LO(&dsqrt_exp1) = 0;
303 	LO(&dsqrt_exp2) = 0;
304 	LO(&res_c0) = 0;
305 	LO(&res_c1) = 0;
306 	LO(&res_c2) = 0;
307 
308 	for (; n > 2; n -= 3) {
309 		hx0 = HI(px);
310 		LO(&res0) = LO(px);
311 		px += stridex;
312 
313 		hx1 = HI(px);
314 		LO(&res1) = LO(px);
315 		px += stridex;
316 
317 		hx2 = HI(px);
318 		LO(&res2) = LO(px);
319 		px += stridex;
320 
321 		sqrt_exp0 = (0x5fe - (hx0 >> 21)) << 20;
322 		sqrt_exp1 = (0x5fe - (hx1 >> 21)) << 20;
323 		sqrt_exp2 = (0x5fe - (hx2 >> 21)) << 20;
324 		ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16;
325 		ind1 = (((hx1 >> 10) & 0x7f8) + 8) & -16;
326 		ind2 = (((hx2 >> 10) & 0x7f8) + 8) & -16;
327 
328 		resh0 = (hx0 & 0x001fffff) | 0x3fe00000;
329 		resh1 = (hx1 & 0x001fffff) | 0x3fe00000;
330 		resh2 = (hx2 & 0x001fffff) | 0x3fe00000;
331 		res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
332 		res_ch1 = (resh1 + 0x00002000) & 0x7fffc000;
333 		res_ch2 = (resh2 + 0x00002000) & 0x7fffc000;
334 		HI(&res0) = resh0;
335 		HI(&res1) = resh1;
336 		HI(&res2) = resh2;
337 		HI(&res_c0) = res_ch0;
338 		HI(&res_c1) = res_ch1;
339 		HI(&res_c2) = res_ch2;
340 
341 		dexp_hi0 = ((double *)((char *)__vlibm_TBL_rsqrt + ind0))[0];
342 		dexp_hi1 = ((double *)((char *)__vlibm_TBL_rsqrt + ind1))[0];
343 		dexp_hi2 = ((double *)((char *)__vlibm_TBL_rsqrt + ind2))[0];
344 		dexp_lo0 = ((double *)((char *)__vlibm_TBL_rsqrt + ind0))[1];
345 		dexp_lo1 = ((double *)((char *)__vlibm_TBL_rsqrt + ind1))[1];
346 		dexp_lo2 = ((double *)((char *)__vlibm_TBL_rsqrt + ind2))[1];
347 		xx0 = dexp_hi0 * dexp_hi0;
348 		xx1 = dexp_hi1 * dexp_hi1;
349 		xx2 = dexp_hi2 * dexp_hi2;
350 		xx0 = (res0 - res_c0) * xx0;
351 		xx1 = (res1 - res_c1) * xx1;
352 		xx2 = (res2 - res_c2) * xx2;
353 		res0 = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) *
354 		    xx0 + K2) * xx0 + K1) * xx0;
355 		res1 = (((((K6 * xx1 + K5) * xx1 + K4) * xx1 + K3) *
356 		    xx1 + K2) * xx1 + K1) * xx1;
357 		res2 = (((((K6 * xx2 + K5) * xx2 + K4) * xx2 + K3) *
358 		    xx2 + K2) * xx2 + K1) * xx2;
359 
360 		res0 = dexp_hi0 * res0 + dexp_lo0 + dexp_hi0;
361 		res1 = dexp_hi1 * res1 + dexp_lo1 + dexp_hi1;
362 		res2 = dexp_hi2 * res2 + dexp_lo2 + dexp_hi2;
363 
364 		HI(&dsqrt_exp0) = sqrt_exp0;
365 		HI(&dsqrt_exp1) = sqrt_exp1;
366 		HI(&dsqrt_exp2) = sqrt_exp2;
367 		res0 *= dsqrt_exp0;
368 		res1 *= dsqrt_exp1;
369 		res2 *= dsqrt_exp2;
370 
371 		*py = res0;
372 		py += stridey;
373 
374 		*py = res1;
375 		py += stridey;
376 
377 		*py = res2;
378 		py += stridey;
379 	}
380 
381 	for (; n > 0; n--) {
382 		hx0 = HI(px);
383 
384 		sqrt_exp0 = (0x5fe - (hx0 >> 21)) << 20;
385 		ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16;
386 
387 		resh0 = (hx0 & 0x001fffff) | 0x3fe00000;
388 		res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
389 		HI(&res0) = resh0;
390 		LO(&res0) = LO(px);
391 		HI(&res_c0) = res_ch0;
392 		LO(&res_c0) = 0;
393 
394 		px += stridex;
395 
396 		dexp_hi0 = ((double *)((char *)__vlibm_TBL_rsqrt + ind0))[0];
397 		dexp_lo0 = ((double *)((char *)__vlibm_TBL_rsqrt + ind0))[1];
398 		xx0 = dexp_hi0 * dexp_hi0;
399 		xx0 = (res0 - res_c0) * xx0;
400 		res0 = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) *
401 		    xx0 + K2) * xx0 + K1) * xx0;
402 
403 		res0 = dexp_hi0 * res0 + dexp_lo0 + dexp_hi0;
404 
405 		HI(&dsqrt_exp0) = sqrt_exp0;
406 		LO(&dsqrt_exp0) = 0;
407 		res0 *= dsqrt_exp0;
408 
409 		*py = res0;
410 		py += stridey;
411 	}
412 }
413