xref: /illumos-gate/usr/src/lib/libmvec/common/__vrsqrt.c (revision 1ed6b69a5ca1ca3ee5e9a4931f74e2237c7e1c9f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
24  */
25 /*
26  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
27  * Use is subject to license terms.
28  */
29 
30 #include <sys/isa_defs.h>
31 #include "libm_synonyms.h"
32 #include "libm_inlines.h"
33 
34 #ifdef _LITTLE_ENDIAN
35 #define HI(x)	*(1+(int*)x)
36 #define LO(x)	*(unsigned*)x
37 #else
38 #define HI(x)	*(int*)x
39 #define LO(x)	*(1+(unsigned*)x)
40 #endif
41 
42 #ifdef __RESTRICT
43 #define restrict _Restrict
44 #else
45 #define restrict
46 #endif
47 
48 /* double rsqrt(double x)
49  *
50  * Method :
51  *	1. Special cases:
52  *		for x = NaN				=> QNaN;
53  *		for x = +Inf				=> 0;
54  *		for x is negative, -Inf			=> QNaN + invalid;
55  *		for x = +0				=> +Inf + divide-by-zero;
56  *		for x = -0				=> -Inf + divide-by-zero.
57  *	2. Computes reciprocal square root from:
58  *		x = m * 2**n
59  *	Where:
60  *		m = [0.5, 2),
61  *		n = ((exponent + 1) & ~1).
62  *	Then:
63  *		rsqrt(x) = 1/sqrt( m * 2**n ) = (2 ** (-n/2)) * (1/sqrt(m))
64  *	2. Computes 1/sqrt(m) from:
65  *		1/sqrt(m) = (1/sqrt(m0)) * (1/sqrt(1 + (1/m0)*dm))
66  *	Where:
67  *		m = m0 + dm,
68  *		m0 = 0.5 * (1 + k/64) for m = [0.5,         0.5+127/256), k = [0, 63];
69  *		m0 = 1.0 * (0 + k/64) for m = [0.5+127/256, 1.0+127/128), k = [64, 127];
70  *		m0 = 2.0              for m = [1.0+127/128, 2.0),         k = 128.
71  *	Then:
72  *		1/sqrt(m0) is looked up in a table,
73  *		1/m0 is computed as (1/sqrt(m0)) * (1/sqrt(m0)).
74  *		1/sqrt(1 + (1/m0)*dm) is computed using approximation:
75  *			1/sqrt(1 + z) = (((((a6 * z + a5) * z + a4) * z + a3)
76  *						* z + a2) * z + a1) * z + a0
77  *			where z = [-1/128, 1/128].
78  *
79  * Accuracy:
80  *	The maximum relative error for the approximating
81  *	polynomial is 2**(-56.26).
82  *	Maximum error observed: less than 0.563 ulp after 1.500.000.000
83  *	results.
84  */
85 
86 #define sqrt __sqrt
87 
88 extern double sqrt (double);
89 extern const double __vlibm_TBL_rsqrt[];
90 
91 static void
92 __vrsqrt_n(int n, double * restrict px, int stridex, double * restrict py, int stridey);
93 
94 #pragma no_inline(__vrsqrt_n)
95 
96 #define RETURN(ret)						\
97 {								\
98 	*py = (ret);						\
99 	py += stridey;						\
100 	if (n_n == 0)						\
101 	{							\
102 		spx = px; spy = py;				\
103 		hx = HI(px);					\
104 		continue;					\
105 	}							\
106 	n--;							\
107 	break;							\
108 }
109 
110 static const double
111 	DONE = 1.0,
112 	K1 = -5.00000000000005209867e-01,
113 	K2 =  3.75000000000004884257e-01,
114 	K3 = -3.12499999317136886551e-01,
115 	K4 =  2.73437499359815081532e-01,
116 	K5 = -2.46116125605037803130e-01,
117 	K6 =  2.25606914648617522896e-01;
118 
119 void
120 __vrsqrt(int n, double * restrict px, int stridex, double * restrict py, int stridey)
121 {
122 	double		*spx, *spy;
123 	int		ax, lx, hx, n_n;
124 	double		res;
125 
126 	while (n > 1)
127 	{
128 		n_n = 0;
129 		spx = px;
130 		spy = py;
131 		hx = HI(px);
132 		for (; n > 1 ; n--)
133 		{
134 			px += stridex;
135 			if (hx >= 0x7ff00000)		/* X = NaN or Inf	*/
136 			{
137 				res = *(px - stridex);
138 				RETURN (DONE / res)
139 			}
140 
141 			py += stridey;
142 
143 			if (hx < 0x00100000)		/* X = denormal, zero or negative	*/
144 			{
145 				py -= stridey;
146 				ax = hx & 0x7fffffff;
147 				lx = LO((px - stridex));
148 				res = *(px - stridex);
149 
150 				if ((ax | lx) == 0)	/* |X| = zero	*/
151 				{
152 					RETURN (DONE / res)
153 				}
154 				else if (hx >= 0)	/* X = denormal	*/
155 				{
156 					double		res_c0, dsqrt_exp0;
157 					int		ind0, sqrt_exp0;
158 					double		xx0, dexp_hi0, dexp_lo0;
159 					int		hx0, resh0, res_ch0;
160 
161 					res = *(long long*)&res;
162 
163 					hx0 = HI(&res);
164 					sqrt_exp0 = (0x817 - (hx0 >> 21)) << 20;
165 					ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16;
166 
167 					resh0 = (hx0 & 0x001fffff) | 0x3fe00000;
168 					res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
169 					HI(&res) = resh0;
170 					HI(&res_c0) = res_ch0;
171 					LO(&res_c0) = 0;
172 
173 					dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
174 					dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
175 					xx0 = dexp_hi0 * dexp_hi0;
176 					xx0 = (res - res_c0) * xx0;
177 					res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
178 
179 					res = dexp_hi0 * res + dexp_lo0 + dexp_hi0;
180 
181 					HI(&dsqrt_exp0) = sqrt_exp0;
182 					LO(&dsqrt_exp0) = 0;
183 					res *= dsqrt_exp0;
184 
185 					RETURN (res)
186 				}
187 				else	/* X = negative	*/
188 				{
189 					RETURN (sqrt(res))
190 				}
191 			}
192 			n_n++;
193 			hx = HI(px);
194 		}
195 		if (n_n > 0)
196 			__vrsqrt_n(n_n, spx, stridex, spy, stridey);
197 	}
198 	if (n > 0)
199 	{
200 		hx = HI(px);
201 
202 		if (hx >= 0x7ff00000)		/* X = NaN or Inf	*/
203 		{
204 			res = *px;
205 			*py = DONE / res;
206 		}
207 		else if (hx < 0x00100000)	/* X = denormal, zero or negative	*/
208 		{
209 			ax = hx & 0x7fffffff;
210 			lx = LO(px);
211 			res = *px;
212 
213 			if ((ax | lx) == 0)	/* |X| = zero	*/
214 			{
215 				*py = DONE / res;
216 			}
217 			else if (hx >= 0)	/* X = denormal	*/
218 			{
219 				double		res_c0, dsqrt_exp0;
220 				int		ind0, sqrt_exp0;
221 				double		xx0, dexp_hi0, dexp_lo0;
222 				int		hx0, resh0, res_ch0;
223 
224 				res = *(long long*)&res;
225 
226 				hx0 = HI(&res);
227 				sqrt_exp0 = (0x817 - (hx0 >> 21)) << 20;
228 				ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16;
229 
230 				resh0 = (hx0 & 0x001fffff) | 0x3fe00000;
231 				res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
232 				HI(&res) = resh0;
233 				HI(&res_c0) = res_ch0;
234 				LO(&res_c0) = 0;
235 
236 				dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
237 				dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
238 				xx0 = dexp_hi0 * dexp_hi0;
239 				xx0 = (res - res_c0) * xx0;
240 				res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
241 
242 				res = dexp_hi0 * res + dexp_lo0 + dexp_hi0;
243 
244 				HI(&dsqrt_exp0) = sqrt_exp0;
245 				LO(&dsqrt_exp0) = 0;
246 				res *= dsqrt_exp0;
247 
248 				*py = res;
249 			}
250 			else	/* X = negative	*/
251 			{
252 				*py = sqrt(res);
253 			}
254 		}
255 		else
256 		{
257 			double		res_c0, dsqrt_exp0;
258 			int		ind0, sqrt_exp0;
259 			double		xx0, dexp_hi0, dexp_lo0;
260 			int		resh0, res_ch0;
261 
262 			sqrt_exp0 = (0x5fe - (hx >> 21)) << 20;
263 			ind0 = (((hx >> 10) & 0x7f8) + 8) & -16;
264 
265 			resh0 = (hx & 0x001fffff) | 0x3fe00000;
266 			res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
267 			HI(&res) = resh0;
268 			LO(&res) = LO(px);
269 			HI(&res_c0) = res_ch0;
270 			LO(&res_c0) = 0;
271 
272 			dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
273 			dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
274 			xx0 = dexp_hi0 * dexp_hi0;
275 			xx0 = (res - res_c0) * xx0;
276 			res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
277 
278 			res = dexp_hi0 * res + dexp_lo0 + dexp_hi0;
279 
280 			HI(&dsqrt_exp0) = sqrt_exp0;
281 			LO(&dsqrt_exp0) = 0;
282 			res *= dsqrt_exp0;
283 
284 			*py = res;
285 		}
286 	}
287 }
288 
289 static void
290 __vrsqrt_n(int n, double * restrict px, int stridex, double * restrict py, int stridey)
291 {
292 	double		res0, res_c0, dsqrt_exp0;
293 	double		res1, res_c1, dsqrt_exp1;
294 	double		res2, res_c2, dsqrt_exp2;
295 	int		ind0, sqrt_exp0;
296 	int		ind1, sqrt_exp1;
297 	int		ind2, sqrt_exp2;
298 	double		xx0, dexp_hi0, dexp_lo0;
299 	double		xx1, dexp_hi1, dexp_lo1;
300 	double		xx2, dexp_hi2, dexp_lo2;
301 	int		hx0, resh0, res_ch0;
302 	int		hx1, resh1, res_ch1;
303 	int		hx2, resh2, res_ch2;
304 
305 	LO(&dsqrt_exp0) = 0;
306 	LO(&dsqrt_exp1) = 0;
307 	LO(&dsqrt_exp2) = 0;
308 	LO(&res_c0) = 0;
309 	LO(&res_c1) = 0;
310 	LO(&res_c2) = 0;
311 
312 	for(; n > 2 ; n -= 3)
313 	{
314 		hx0 = HI(px);
315 		LO(&res0) = LO(px);
316 		px += stridex;
317 
318 		hx1 = HI(px);
319 		LO(&res1) = LO(px);
320 		px += stridex;
321 
322 		hx2 = HI(px);
323 		LO(&res2) = LO(px);
324 		px += stridex;
325 
326 		sqrt_exp0 = (0x5fe - (hx0 >> 21)) << 20;
327 		sqrt_exp1 = (0x5fe - (hx1 >> 21)) << 20;
328 		sqrt_exp2 = (0x5fe - (hx2 >> 21)) << 20;
329 		ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16;
330 		ind1 = (((hx1 >> 10) & 0x7f8) + 8) & -16;
331 		ind2 = (((hx2 >> 10) & 0x7f8) + 8) & -16;
332 
333 		resh0 = (hx0 & 0x001fffff) | 0x3fe00000;
334 		resh1 = (hx1 & 0x001fffff) | 0x3fe00000;
335 		resh2 = (hx2 & 0x001fffff) | 0x3fe00000;
336 		res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
337 		res_ch1 = (resh1 + 0x00002000) & 0x7fffc000;
338 		res_ch2 = (resh2 + 0x00002000) & 0x7fffc000;
339 		HI(&res0) = resh0;
340 		HI(&res1) = resh1;
341 		HI(&res2) = resh2;
342 		HI(&res_c0) = res_ch0;
343 		HI(&res_c1) = res_ch1;
344 		HI(&res_c2) = res_ch2;
345 
346 		dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
347 		dexp_hi1 = ((double*)((char*)__vlibm_TBL_rsqrt + ind1))[0];
348 		dexp_hi2 = ((double*)((char*)__vlibm_TBL_rsqrt + ind2))[0];
349 		dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
350 		dexp_lo1 = ((double*)((char*)__vlibm_TBL_rsqrt + ind1))[1];
351 		dexp_lo2 = ((double*)((char*)__vlibm_TBL_rsqrt + ind2))[1];
352 		xx0 = dexp_hi0 * dexp_hi0;
353 		xx1 = dexp_hi1 * dexp_hi1;
354 		xx2 = dexp_hi2 * dexp_hi2;
355 		xx0 = (res0 - res_c0) * xx0;
356 		xx1 = (res1 - res_c1) * xx1;
357 		xx2 = (res2 - res_c2) * xx2;
358 		res0 = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
359 		res1 = (((((K6 * xx1 + K5) * xx1 + K4) * xx1 + K3) * xx1 + K2) * xx1 + K1) * xx1;
360 		res2 = (((((K6 * xx2 + K5) * xx2 + K4) * xx2 + K3) * xx2 + K2) * xx2 + K1) * xx2;
361 
362 		res0 = dexp_hi0 * res0 + dexp_lo0 + dexp_hi0;
363 		res1 = dexp_hi1 * res1 + dexp_lo1 + dexp_hi1;
364 		res2 = dexp_hi2 * res2 + dexp_lo2 + dexp_hi2;
365 
366 		HI(&dsqrt_exp0) = sqrt_exp0;
367 		HI(&dsqrt_exp1) = sqrt_exp1;
368 		HI(&dsqrt_exp2) = sqrt_exp2;
369 		res0 *= dsqrt_exp0;
370 		res1 *= dsqrt_exp1;
371 		res2 *= dsqrt_exp2;
372 
373 		*py = res0;
374 		py += stridey;
375 
376 		*py = res1;
377 		py += stridey;
378 
379 		*py = res2;
380 		py += stridey;
381 	}
382 
383 	for(; n > 0 ; n--)
384 	{
385 		hx0 = HI(px);
386 
387 		sqrt_exp0 = (0x5fe - (hx0 >> 21)) << 20;
388 		ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16;
389 
390 		resh0 = (hx0 & 0x001fffff) | 0x3fe00000;
391 		res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
392 		HI(&res0) = resh0;
393 		LO(&res0) = LO(px);
394 		HI(&res_c0) = res_ch0;
395 		LO(&res_c0) = 0;
396 
397 		px += stridex;
398 
399 		dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
400 		dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
401 		xx0 = dexp_hi0 * dexp_hi0;
402 		xx0 = (res0 - res_c0) * xx0;
403 		res0 = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
404 
405 		res0 = dexp_hi0 * res0 + dexp_lo0 + dexp_hi0;
406 
407 		HI(&dsqrt_exp0) = sqrt_exp0;
408 		LO(&dsqrt_exp0) = 0;
409 		res0 *= dsqrt_exp0;
410 
411 		*py = res0;
412 		py += stridey;
413 	}
414 }
415 
416