xref: /titanic_44/usr/src/lib/libmvec/common/__vrsqrt.c (revision ddc0e0b53c661f6e439e3b7072b3ef353eadb4af)
1*25c28e83SPiotr Jasiukajtis /*
2*25c28e83SPiotr Jasiukajtis  * CDDL HEADER START
3*25c28e83SPiotr Jasiukajtis  *
4*25c28e83SPiotr Jasiukajtis  * The contents of this file are subject to the terms of the
5*25c28e83SPiotr Jasiukajtis  * Common Development and Distribution License (the "License").
6*25c28e83SPiotr Jasiukajtis  * You may not use this file except in compliance with the License.
7*25c28e83SPiotr Jasiukajtis  *
8*25c28e83SPiotr Jasiukajtis  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9*25c28e83SPiotr Jasiukajtis  * or http://www.opensolaris.org/os/licensing.
10*25c28e83SPiotr Jasiukajtis  * See the License for the specific language governing permissions
11*25c28e83SPiotr Jasiukajtis  * and limitations under the License.
12*25c28e83SPiotr Jasiukajtis  *
13*25c28e83SPiotr Jasiukajtis  * When distributing Covered Code, include this CDDL HEADER in each
14*25c28e83SPiotr Jasiukajtis  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15*25c28e83SPiotr Jasiukajtis  * If applicable, add the following below this CDDL HEADER, with the
16*25c28e83SPiotr Jasiukajtis  * fields enclosed by brackets "[]" replaced with your own identifying
17*25c28e83SPiotr Jasiukajtis  * information: Portions Copyright [yyyy] [name of copyright owner]
18*25c28e83SPiotr Jasiukajtis  *
19*25c28e83SPiotr Jasiukajtis  * CDDL HEADER END
20*25c28e83SPiotr Jasiukajtis  */
21*25c28e83SPiotr Jasiukajtis 
22*25c28e83SPiotr Jasiukajtis /*
23*25c28e83SPiotr Jasiukajtis  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
24*25c28e83SPiotr Jasiukajtis  */
25*25c28e83SPiotr Jasiukajtis /*
26*25c28e83SPiotr Jasiukajtis  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
27*25c28e83SPiotr Jasiukajtis  * Use is subject to license terms.
28*25c28e83SPiotr Jasiukajtis  */
29*25c28e83SPiotr Jasiukajtis 
30*25c28e83SPiotr Jasiukajtis #include <sys/isa_defs.h>
31*25c28e83SPiotr Jasiukajtis #include "libm_inlines.h"
32*25c28e83SPiotr Jasiukajtis 
33*25c28e83SPiotr Jasiukajtis #ifdef _LITTLE_ENDIAN
34*25c28e83SPiotr Jasiukajtis #define HI(x)	*(1+(int*)x)
35*25c28e83SPiotr Jasiukajtis #define LO(x)	*(unsigned*)x
36*25c28e83SPiotr Jasiukajtis #else
37*25c28e83SPiotr Jasiukajtis #define HI(x)	*(int*)x
38*25c28e83SPiotr Jasiukajtis #define LO(x)	*(1+(unsigned*)x)
39*25c28e83SPiotr Jasiukajtis #endif
40*25c28e83SPiotr Jasiukajtis 
41*25c28e83SPiotr Jasiukajtis #ifdef __RESTRICT
42*25c28e83SPiotr Jasiukajtis #define restrict _Restrict
43*25c28e83SPiotr Jasiukajtis #else
44*25c28e83SPiotr Jasiukajtis #define restrict
45*25c28e83SPiotr Jasiukajtis #endif
46*25c28e83SPiotr Jasiukajtis 
47*25c28e83SPiotr Jasiukajtis /* double rsqrt(double x)
48*25c28e83SPiotr Jasiukajtis  *
49*25c28e83SPiotr Jasiukajtis  * Method :
50*25c28e83SPiotr Jasiukajtis  *	1. Special cases:
51*25c28e83SPiotr Jasiukajtis  *		for x = NaN				=> QNaN;
52*25c28e83SPiotr Jasiukajtis  *		for x = +Inf				=> 0;
53*25c28e83SPiotr Jasiukajtis  *		for x is negative, -Inf			=> QNaN + invalid;
54*25c28e83SPiotr Jasiukajtis  *		for x = +0				=> +Inf + divide-by-zero;
55*25c28e83SPiotr Jasiukajtis  *		for x = -0				=> -Inf + divide-by-zero.
56*25c28e83SPiotr Jasiukajtis  *	2. Computes reciprocal square root from:
57*25c28e83SPiotr Jasiukajtis  *		x = m * 2**n
58*25c28e83SPiotr Jasiukajtis  *	Where:
59*25c28e83SPiotr Jasiukajtis  *		m = [0.5, 2),
60*25c28e83SPiotr Jasiukajtis  *		n = ((exponent + 1) & ~1).
61*25c28e83SPiotr Jasiukajtis  *	Then:
62*25c28e83SPiotr Jasiukajtis  *		rsqrt(x) = 1/sqrt( m * 2**n ) = (2 ** (-n/2)) * (1/sqrt(m))
63*25c28e83SPiotr Jasiukajtis  *	2. Computes 1/sqrt(m) from:
64*25c28e83SPiotr Jasiukajtis  *		1/sqrt(m) = (1/sqrt(m0)) * (1/sqrt(1 + (1/m0)*dm))
65*25c28e83SPiotr Jasiukajtis  *	Where:
66*25c28e83SPiotr Jasiukajtis  *		m = m0 + dm,
67*25c28e83SPiotr Jasiukajtis  *		m0 = 0.5 * (1 + k/64) for m = [0.5,         0.5+127/256), k = [0, 63];
68*25c28e83SPiotr Jasiukajtis  *		m0 = 1.0 * (0 + k/64) for m = [0.5+127/256, 1.0+127/128), k = [64, 127];
69*25c28e83SPiotr Jasiukajtis  *		m0 = 2.0              for m = [1.0+127/128, 2.0),         k = 128.
70*25c28e83SPiotr Jasiukajtis  *	Then:
71*25c28e83SPiotr Jasiukajtis  *		1/sqrt(m0) is looked up in a table,
72*25c28e83SPiotr Jasiukajtis  *		1/m0 is computed as (1/sqrt(m0)) * (1/sqrt(m0)).
73*25c28e83SPiotr Jasiukajtis  *		1/sqrt(1 + (1/m0)*dm) is computed using approximation:
74*25c28e83SPiotr Jasiukajtis  *			1/sqrt(1 + z) = (((((a6 * z + a5) * z + a4) * z + a3)
75*25c28e83SPiotr Jasiukajtis  *						* z + a2) * z + a1) * z + a0
76*25c28e83SPiotr Jasiukajtis  *			where z = [-1/128, 1/128].
77*25c28e83SPiotr Jasiukajtis  *
78*25c28e83SPiotr Jasiukajtis  * Accuracy:
79*25c28e83SPiotr Jasiukajtis  *	The maximum relative error for the approximating
80*25c28e83SPiotr Jasiukajtis  *	polynomial is 2**(-56.26).
81*25c28e83SPiotr Jasiukajtis  *	Maximum error observed: less than 0.563 ulp after 1.500.000.000
82*25c28e83SPiotr Jasiukajtis  *	results.
83*25c28e83SPiotr Jasiukajtis  */
84*25c28e83SPiotr Jasiukajtis 
85*25c28e83SPiotr Jasiukajtis extern double sqrt (double);
86*25c28e83SPiotr Jasiukajtis extern const double __vlibm_TBL_rsqrt[];
87*25c28e83SPiotr Jasiukajtis 
88*25c28e83SPiotr Jasiukajtis static void
89*25c28e83SPiotr Jasiukajtis __vrsqrt_n(int n, double * restrict px, int stridex, double * restrict py, int stridey);
90*25c28e83SPiotr Jasiukajtis 
91*25c28e83SPiotr Jasiukajtis #pragma no_inline(__vrsqrt_n)
92*25c28e83SPiotr Jasiukajtis 
93*25c28e83SPiotr Jasiukajtis #define RETURN(ret)						\
94*25c28e83SPiotr Jasiukajtis {								\
95*25c28e83SPiotr Jasiukajtis 	*py = (ret);						\
96*25c28e83SPiotr Jasiukajtis 	py += stridey;						\
97*25c28e83SPiotr Jasiukajtis 	if (n_n == 0)						\
98*25c28e83SPiotr Jasiukajtis 	{							\
99*25c28e83SPiotr Jasiukajtis 		spx = px; spy = py;				\
100*25c28e83SPiotr Jasiukajtis 		hx = HI(px);					\
101*25c28e83SPiotr Jasiukajtis 		continue;					\
102*25c28e83SPiotr Jasiukajtis 	}							\
103*25c28e83SPiotr Jasiukajtis 	n--;							\
104*25c28e83SPiotr Jasiukajtis 	break;							\
105*25c28e83SPiotr Jasiukajtis }
106*25c28e83SPiotr Jasiukajtis 
107*25c28e83SPiotr Jasiukajtis static const double
108*25c28e83SPiotr Jasiukajtis 	DONE = 1.0,
109*25c28e83SPiotr Jasiukajtis 	K1 = -5.00000000000005209867e-01,
110*25c28e83SPiotr Jasiukajtis 	K2 =  3.75000000000004884257e-01,
111*25c28e83SPiotr Jasiukajtis 	K3 = -3.12499999317136886551e-01,
112*25c28e83SPiotr Jasiukajtis 	K4 =  2.73437499359815081532e-01,
113*25c28e83SPiotr Jasiukajtis 	K5 = -2.46116125605037803130e-01,
114*25c28e83SPiotr Jasiukajtis 	K6 =  2.25606914648617522896e-01;
115*25c28e83SPiotr Jasiukajtis 
116*25c28e83SPiotr Jasiukajtis void
__vrsqrt(int n,double * restrict px,int stridex,double * restrict py,int stridey)117*25c28e83SPiotr Jasiukajtis __vrsqrt(int n, double * restrict px, int stridex, double * restrict py, int stridey)
118*25c28e83SPiotr Jasiukajtis {
119*25c28e83SPiotr Jasiukajtis 	double		*spx, *spy;
120*25c28e83SPiotr Jasiukajtis 	int		ax, lx, hx, n_n;
121*25c28e83SPiotr Jasiukajtis 	double		res;
122*25c28e83SPiotr Jasiukajtis 
123*25c28e83SPiotr Jasiukajtis 	while (n > 1)
124*25c28e83SPiotr Jasiukajtis 	{
125*25c28e83SPiotr Jasiukajtis 		n_n = 0;
126*25c28e83SPiotr Jasiukajtis 		spx = px;
127*25c28e83SPiotr Jasiukajtis 		spy = py;
128*25c28e83SPiotr Jasiukajtis 		hx = HI(px);
129*25c28e83SPiotr Jasiukajtis 		for (; n > 1 ; n--)
130*25c28e83SPiotr Jasiukajtis 		{
131*25c28e83SPiotr Jasiukajtis 			px += stridex;
132*25c28e83SPiotr Jasiukajtis 			if (hx >= 0x7ff00000)		/* X = NaN or Inf	*/
133*25c28e83SPiotr Jasiukajtis 			{
134*25c28e83SPiotr Jasiukajtis 				res = *(px - stridex);
135*25c28e83SPiotr Jasiukajtis 				RETURN (DONE / res)
136*25c28e83SPiotr Jasiukajtis 			}
137*25c28e83SPiotr Jasiukajtis 
138*25c28e83SPiotr Jasiukajtis 			py += stridey;
139*25c28e83SPiotr Jasiukajtis 
140*25c28e83SPiotr Jasiukajtis 			if (hx < 0x00100000)		/* X = denormal, zero or negative	*/
141*25c28e83SPiotr Jasiukajtis 			{
142*25c28e83SPiotr Jasiukajtis 				py -= stridey;
143*25c28e83SPiotr Jasiukajtis 				ax = hx & 0x7fffffff;
144*25c28e83SPiotr Jasiukajtis 				lx = LO((px - stridex));
145*25c28e83SPiotr Jasiukajtis 				res = *(px - stridex);
146*25c28e83SPiotr Jasiukajtis 
147*25c28e83SPiotr Jasiukajtis 				if ((ax | lx) == 0)	/* |X| = zero	*/
148*25c28e83SPiotr Jasiukajtis 				{
149*25c28e83SPiotr Jasiukajtis 					RETURN (DONE / res)
150*25c28e83SPiotr Jasiukajtis 				}
151*25c28e83SPiotr Jasiukajtis 				else if (hx >= 0)	/* X = denormal	*/
152*25c28e83SPiotr Jasiukajtis 				{
153*25c28e83SPiotr Jasiukajtis 					double		res_c0, dsqrt_exp0;
154*25c28e83SPiotr Jasiukajtis 					int		ind0, sqrt_exp0;
155*25c28e83SPiotr Jasiukajtis 					double		xx0, dexp_hi0, dexp_lo0;
156*25c28e83SPiotr Jasiukajtis 					int		hx0, resh0, res_ch0;
157*25c28e83SPiotr Jasiukajtis 
158*25c28e83SPiotr Jasiukajtis 					res = *(long long*)&res;
159*25c28e83SPiotr Jasiukajtis 
160*25c28e83SPiotr Jasiukajtis 					hx0 = HI(&res);
161*25c28e83SPiotr Jasiukajtis 					sqrt_exp0 = (0x817 - (hx0 >> 21)) << 20;
162*25c28e83SPiotr Jasiukajtis 					ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16;
163*25c28e83SPiotr Jasiukajtis 
164*25c28e83SPiotr Jasiukajtis 					resh0 = (hx0 & 0x001fffff) | 0x3fe00000;
165*25c28e83SPiotr Jasiukajtis 					res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
166*25c28e83SPiotr Jasiukajtis 					HI(&res) = resh0;
167*25c28e83SPiotr Jasiukajtis 					HI(&res_c0) = res_ch0;
168*25c28e83SPiotr Jasiukajtis 					LO(&res_c0) = 0;
169*25c28e83SPiotr Jasiukajtis 
170*25c28e83SPiotr Jasiukajtis 					dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
171*25c28e83SPiotr Jasiukajtis 					dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
172*25c28e83SPiotr Jasiukajtis 					xx0 = dexp_hi0 * dexp_hi0;
173*25c28e83SPiotr Jasiukajtis 					xx0 = (res - res_c0) * xx0;
174*25c28e83SPiotr Jasiukajtis 					res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
175*25c28e83SPiotr Jasiukajtis 
176*25c28e83SPiotr Jasiukajtis 					res = dexp_hi0 * res + dexp_lo0 + dexp_hi0;
177*25c28e83SPiotr Jasiukajtis 
178*25c28e83SPiotr Jasiukajtis 					HI(&dsqrt_exp0) = sqrt_exp0;
179*25c28e83SPiotr Jasiukajtis 					LO(&dsqrt_exp0) = 0;
180*25c28e83SPiotr Jasiukajtis 					res *= dsqrt_exp0;
181*25c28e83SPiotr Jasiukajtis 
182*25c28e83SPiotr Jasiukajtis 					RETURN (res)
183*25c28e83SPiotr Jasiukajtis 				}
184*25c28e83SPiotr Jasiukajtis 				else	/* X = negative	*/
185*25c28e83SPiotr Jasiukajtis 				{
186*25c28e83SPiotr Jasiukajtis 					RETURN (sqrt(res))
187*25c28e83SPiotr Jasiukajtis 				}
188*25c28e83SPiotr Jasiukajtis 			}
189*25c28e83SPiotr Jasiukajtis 			n_n++;
190*25c28e83SPiotr Jasiukajtis 			hx = HI(px);
191*25c28e83SPiotr Jasiukajtis 		}
192*25c28e83SPiotr Jasiukajtis 		if (n_n > 0)
193*25c28e83SPiotr Jasiukajtis 			__vrsqrt_n(n_n, spx, stridex, spy, stridey);
194*25c28e83SPiotr Jasiukajtis 	}
195*25c28e83SPiotr Jasiukajtis 	if (n > 0)
196*25c28e83SPiotr Jasiukajtis 	{
197*25c28e83SPiotr Jasiukajtis 		hx = HI(px);
198*25c28e83SPiotr Jasiukajtis 
199*25c28e83SPiotr Jasiukajtis 		if (hx >= 0x7ff00000)		/* X = NaN or Inf	*/
200*25c28e83SPiotr Jasiukajtis 		{
201*25c28e83SPiotr Jasiukajtis 			res = *px;
202*25c28e83SPiotr Jasiukajtis 			*py = DONE / res;
203*25c28e83SPiotr Jasiukajtis 		}
204*25c28e83SPiotr Jasiukajtis 		else if (hx < 0x00100000)	/* X = denormal, zero or negative	*/
205*25c28e83SPiotr Jasiukajtis 		{
206*25c28e83SPiotr Jasiukajtis 			ax = hx & 0x7fffffff;
207*25c28e83SPiotr Jasiukajtis 			lx = LO(px);
208*25c28e83SPiotr Jasiukajtis 			res = *px;
209*25c28e83SPiotr Jasiukajtis 
210*25c28e83SPiotr Jasiukajtis 			if ((ax | lx) == 0)	/* |X| = zero	*/
211*25c28e83SPiotr Jasiukajtis 			{
212*25c28e83SPiotr Jasiukajtis 				*py = DONE / res;
213*25c28e83SPiotr Jasiukajtis 			}
214*25c28e83SPiotr Jasiukajtis 			else if (hx >= 0)	/* X = denormal	*/
215*25c28e83SPiotr Jasiukajtis 			{
216*25c28e83SPiotr Jasiukajtis 				double		res_c0, dsqrt_exp0;
217*25c28e83SPiotr Jasiukajtis 				int		ind0, sqrt_exp0;
218*25c28e83SPiotr Jasiukajtis 				double		xx0, dexp_hi0, dexp_lo0;
219*25c28e83SPiotr Jasiukajtis 				int		hx0, resh0, res_ch0;
220*25c28e83SPiotr Jasiukajtis 
221*25c28e83SPiotr Jasiukajtis 				res = *(long long*)&res;
222*25c28e83SPiotr Jasiukajtis 
223*25c28e83SPiotr Jasiukajtis 				hx0 = HI(&res);
224*25c28e83SPiotr Jasiukajtis 				sqrt_exp0 = (0x817 - (hx0 >> 21)) << 20;
225*25c28e83SPiotr Jasiukajtis 				ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16;
226*25c28e83SPiotr Jasiukajtis 
227*25c28e83SPiotr Jasiukajtis 				resh0 = (hx0 & 0x001fffff) | 0x3fe00000;
228*25c28e83SPiotr Jasiukajtis 				res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
229*25c28e83SPiotr Jasiukajtis 				HI(&res) = resh0;
230*25c28e83SPiotr Jasiukajtis 				HI(&res_c0) = res_ch0;
231*25c28e83SPiotr Jasiukajtis 				LO(&res_c0) = 0;
232*25c28e83SPiotr Jasiukajtis 
233*25c28e83SPiotr Jasiukajtis 				dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
234*25c28e83SPiotr Jasiukajtis 				dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
235*25c28e83SPiotr Jasiukajtis 				xx0 = dexp_hi0 * dexp_hi0;
236*25c28e83SPiotr Jasiukajtis 				xx0 = (res - res_c0) * xx0;
237*25c28e83SPiotr Jasiukajtis 				res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
238*25c28e83SPiotr Jasiukajtis 
239*25c28e83SPiotr Jasiukajtis 				res = dexp_hi0 * res + dexp_lo0 + dexp_hi0;
240*25c28e83SPiotr Jasiukajtis 
241*25c28e83SPiotr Jasiukajtis 				HI(&dsqrt_exp0) = sqrt_exp0;
242*25c28e83SPiotr Jasiukajtis 				LO(&dsqrt_exp0) = 0;
243*25c28e83SPiotr Jasiukajtis 				res *= dsqrt_exp0;
244*25c28e83SPiotr Jasiukajtis 
245*25c28e83SPiotr Jasiukajtis 				*py = res;
246*25c28e83SPiotr Jasiukajtis 			}
247*25c28e83SPiotr Jasiukajtis 			else	/* X = negative	*/
248*25c28e83SPiotr Jasiukajtis 			{
249*25c28e83SPiotr Jasiukajtis 				*py = sqrt(res);
250*25c28e83SPiotr Jasiukajtis 			}
251*25c28e83SPiotr Jasiukajtis 		}
252*25c28e83SPiotr Jasiukajtis 		else
253*25c28e83SPiotr Jasiukajtis 		{
254*25c28e83SPiotr Jasiukajtis 			double		res_c0, dsqrt_exp0;
255*25c28e83SPiotr Jasiukajtis 			int		ind0, sqrt_exp0;
256*25c28e83SPiotr Jasiukajtis 			double		xx0, dexp_hi0, dexp_lo0;
257*25c28e83SPiotr Jasiukajtis 			int		resh0, res_ch0;
258*25c28e83SPiotr Jasiukajtis 
259*25c28e83SPiotr Jasiukajtis 			sqrt_exp0 = (0x5fe - (hx >> 21)) << 20;
260*25c28e83SPiotr Jasiukajtis 			ind0 = (((hx >> 10) & 0x7f8) + 8) & -16;
261*25c28e83SPiotr Jasiukajtis 
262*25c28e83SPiotr Jasiukajtis 			resh0 = (hx & 0x001fffff) | 0x3fe00000;
263*25c28e83SPiotr Jasiukajtis 			res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
264*25c28e83SPiotr Jasiukajtis 			HI(&res) = resh0;
265*25c28e83SPiotr Jasiukajtis 			LO(&res) = LO(px);
266*25c28e83SPiotr Jasiukajtis 			HI(&res_c0) = res_ch0;
267*25c28e83SPiotr Jasiukajtis 			LO(&res_c0) = 0;
268*25c28e83SPiotr Jasiukajtis 
269*25c28e83SPiotr Jasiukajtis 			dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
270*25c28e83SPiotr Jasiukajtis 			dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
271*25c28e83SPiotr Jasiukajtis 			xx0 = dexp_hi0 * dexp_hi0;
272*25c28e83SPiotr Jasiukajtis 			xx0 = (res - res_c0) * xx0;
273*25c28e83SPiotr Jasiukajtis 			res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
274*25c28e83SPiotr Jasiukajtis 
275*25c28e83SPiotr Jasiukajtis 			res = dexp_hi0 * res + dexp_lo0 + dexp_hi0;
276*25c28e83SPiotr Jasiukajtis 
277*25c28e83SPiotr Jasiukajtis 			HI(&dsqrt_exp0) = sqrt_exp0;
278*25c28e83SPiotr Jasiukajtis 			LO(&dsqrt_exp0) = 0;
279*25c28e83SPiotr Jasiukajtis 			res *= dsqrt_exp0;
280*25c28e83SPiotr Jasiukajtis 
281*25c28e83SPiotr Jasiukajtis 			*py = res;
282*25c28e83SPiotr Jasiukajtis 		}
283*25c28e83SPiotr Jasiukajtis 	}
284*25c28e83SPiotr Jasiukajtis }
285*25c28e83SPiotr Jasiukajtis 
286*25c28e83SPiotr Jasiukajtis static void
__vrsqrt_n(int n,double * restrict px,int stridex,double * restrict py,int stridey)287*25c28e83SPiotr Jasiukajtis __vrsqrt_n(int n, double * restrict px, int stridex, double * restrict py, int stridey)
288*25c28e83SPiotr Jasiukajtis {
289*25c28e83SPiotr Jasiukajtis 	double		res0, res_c0, dsqrt_exp0;
290*25c28e83SPiotr Jasiukajtis 	double		res1, res_c1, dsqrt_exp1;
291*25c28e83SPiotr Jasiukajtis 	double		res2, res_c2, dsqrt_exp2;
292*25c28e83SPiotr Jasiukajtis 	int		ind0, sqrt_exp0;
293*25c28e83SPiotr Jasiukajtis 	int		ind1, sqrt_exp1;
294*25c28e83SPiotr Jasiukajtis 	int		ind2, sqrt_exp2;
295*25c28e83SPiotr Jasiukajtis 	double		xx0, dexp_hi0, dexp_lo0;
296*25c28e83SPiotr Jasiukajtis 	double		xx1, dexp_hi1, dexp_lo1;
297*25c28e83SPiotr Jasiukajtis 	double		xx2, dexp_hi2, dexp_lo2;
298*25c28e83SPiotr Jasiukajtis 	int		hx0, resh0, res_ch0;
299*25c28e83SPiotr Jasiukajtis 	int		hx1, resh1, res_ch1;
300*25c28e83SPiotr Jasiukajtis 	int		hx2, resh2, res_ch2;
301*25c28e83SPiotr Jasiukajtis 
302*25c28e83SPiotr Jasiukajtis 	LO(&dsqrt_exp0) = 0;
303*25c28e83SPiotr Jasiukajtis 	LO(&dsqrt_exp1) = 0;
304*25c28e83SPiotr Jasiukajtis 	LO(&dsqrt_exp2) = 0;
305*25c28e83SPiotr Jasiukajtis 	LO(&res_c0) = 0;
306*25c28e83SPiotr Jasiukajtis 	LO(&res_c1) = 0;
307*25c28e83SPiotr Jasiukajtis 	LO(&res_c2) = 0;
308*25c28e83SPiotr Jasiukajtis 
309*25c28e83SPiotr Jasiukajtis 	for(; n > 2 ; n -= 3)
310*25c28e83SPiotr Jasiukajtis 	{
311*25c28e83SPiotr Jasiukajtis 		hx0 = HI(px);
312*25c28e83SPiotr Jasiukajtis 		LO(&res0) = LO(px);
313*25c28e83SPiotr Jasiukajtis 		px += stridex;
314*25c28e83SPiotr Jasiukajtis 
315*25c28e83SPiotr Jasiukajtis 		hx1 = HI(px);
316*25c28e83SPiotr Jasiukajtis 		LO(&res1) = LO(px);
317*25c28e83SPiotr Jasiukajtis 		px += stridex;
318*25c28e83SPiotr Jasiukajtis 
319*25c28e83SPiotr Jasiukajtis 		hx2 = HI(px);
320*25c28e83SPiotr Jasiukajtis 		LO(&res2) = LO(px);
321*25c28e83SPiotr Jasiukajtis 		px += stridex;
322*25c28e83SPiotr Jasiukajtis 
323*25c28e83SPiotr Jasiukajtis 		sqrt_exp0 = (0x5fe - (hx0 >> 21)) << 20;
324*25c28e83SPiotr Jasiukajtis 		sqrt_exp1 = (0x5fe - (hx1 >> 21)) << 20;
325*25c28e83SPiotr Jasiukajtis 		sqrt_exp2 = (0x5fe - (hx2 >> 21)) << 20;
326*25c28e83SPiotr Jasiukajtis 		ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16;
327*25c28e83SPiotr Jasiukajtis 		ind1 = (((hx1 >> 10) & 0x7f8) + 8) & -16;
328*25c28e83SPiotr Jasiukajtis 		ind2 = (((hx2 >> 10) & 0x7f8) + 8) & -16;
329*25c28e83SPiotr Jasiukajtis 
330*25c28e83SPiotr Jasiukajtis 		resh0 = (hx0 & 0x001fffff) | 0x3fe00000;
331*25c28e83SPiotr Jasiukajtis 		resh1 = (hx1 & 0x001fffff) | 0x3fe00000;
332*25c28e83SPiotr Jasiukajtis 		resh2 = (hx2 & 0x001fffff) | 0x3fe00000;
333*25c28e83SPiotr Jasiukajtis 		res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
334*25c28e83SPiotr Jasiukajtis 		res_ch1 = (resh1 + 0x00002000) & 0x7fffc000;
335*25c28e83SPiotr Jasiukajtis 		res_ch2 = (resh2 + 0x00002000) & 0x7fffc000;
336*25c28e83SPiotr Jasiukajtis 		HI(&res0) = resh0;
337*25c28e83SPiotr Jasiukajtis 		HI(&res1) = resh1;
338*25c28e83SPiotr Jasiukajtis 		HI(&res2) = resh2;
339*25c28e83SPiotr Jasiukajtis 		HI(&res_c0) = res_ch0;
340*25c28e83SPiotr Jasiukajtis 		HI(&res_c1) = res_ch1;
341*25c28e83SPiotr Jasiukajtis 		HI(&res_c2) = res_ch2;
342*25c28e83SPiotr Jasiukajtis 
343*25c28e83SPiotr Jasiukajtis 		dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
344*25c28e83SPiotr Jasiukajtis 		dexp_hi1 = ((double*)((char*)__vlibm_TBL_rsqrt + ind1))[0];
345*25c28e83SPiotr Jasiukajtis 		dexp_hi2 = ((double*)((char*)__vlibm_TBL_rsqrt + ind2))[0];
346*25c28e83SPiotr Jasiukajtis 		dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
347*25c28e83SPiotr Jasiukajtis 		dexp_lo1 = ((double*)((char*)__vlibm_TBL_rsqrt + ind1))[1];
348*25c28e83SPiotr Jasiukajtis 		dexp_lo2 = ((double*)((char*)__vlibm_TBL_rsqrt + ind2))[1];
349*25c28e83SPiotr Jasiukajtis 		xx0 = dexp_hi0 * dexp_hi0;
350*25c28e83SPiotr Jasiukajtis 		xx1 = dexp_hi1 * dexp_hi1;
351*25c28e83SPiotr Jasiukajtis 		xx2 = dexp_hi2 * dexp_hi2;
352*25c28e83SPiotr Jasiukajtis 		xx0 = (res0 - res_c0) * xx0;
353*25c28e83SPiotr Jasiukajtis 		xx1 = (res1 - res_c1) * xx1;
354*25c28e83SPiotr Jasiukajtis 		xx2 = (res2 - res_c2) * xx2;
355*25c28e83SPiotr Jasiukajtis 		res0 = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
356*25c28e83SPiotr Jasiukajtis 		res1 = (((((K6 * xx1 + K5) * xx1 + K4) * xx1 + K3) * xx1 + K2) * xx1 + K1) * xx1;
357*25c28e83SPiotr Jasiukajtis 		res2 = (((((K6 * xx2 + K5) * xx2 + K4) * xx2 + K3) * xx2 + K2) * xx2 + K1) * xx2;
358*25c28e83SPiotr Jasiukajtis 
359*25c28e83SPiotr Jasiukajtis 		res0 = dexp_hi0 * res0 + dexp_lo0 + dexp_hi0;
360*25c28e83SPiotr Jasiukajtis 		res1 = dexp_hi1 * res1 + dexp_lo1 + dexp_hi1;
361*25c28e83SPiotr Jasiukajtis 		res2 = dexp_hi2 * res2 + dexp_lo2 + dexp_hi2;
362*25c28e83SPiotr Jasiukajtis 
363*25c28e83SPiotr Jasiukajtis 		HI(&dsqrt_exp0) = sqrt_exp0;
364*25c28e83SPiotr Jasiukajtis 		HI(&dsqrt_exp1) = sqrt_exp1;
365*25c28e83SPiotr Jasiukajtis 		HI(&dsqrt_exp2) = sqrt_exp2;
366*25c28e83SPiotr Jasiukajtis 		res0 *= dsqrt_exp0;
367*25c28e83SPiotr Jasiukajtis 		res1 *= dsqrt_exp1;
368*25c28e83SPiotr Jasiukajtis 		res2 *= dsqrt_exp2;
369*25c28e83SPiotr Jasiukajtis 
370*25c28e83SPiotr Jasiukajtis 		*py = res0;
371*25c28e83SPiotr Jasiukajtis 		py += stridey;
372*25c28e83SPiotr Jasiukajtis 
373*25c28e83SPiotr Jasiukajtis 		*py = res1;
374*25c28e83SPiotr Jasiukajtis 		py += stridey;
375*25c28e83SPiotr Jasiukajtis 
376*25c28e83SPiotr Jasiukajtis 		*py = res2;
377*25c28e83SPiotr Jasiukajtis 		py += stridey;
378*25c28e83SPiotr Jasiukajtis 	}
379*25c28e83SPiotr Jasiukajtis 
380*25c28e83SPiotr Jasiukajtis 	for(; n > 0 ; n--)
381*25c28e83SPiotr Jasiukajtis 	{
382*25c28e83SPiotr Jasiukajtis 		hx0 = HI(px);
383*25c28e83SPiotr Jasiukajtis 
384*25c28e83SPiotr Jasiukajtis 		sqrt_exp0 = (0x5fe - (hx0 >> 21)) << 20;
385*25c28e83SPiotr Jasiukajtis 		ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16;
386*25c28e83SPiotr Jasiukajtis 
387*25c28e83SPiotr Jasiukajtis 		resh0 = (hx0 & 0x001fffff) | 0x3fe00000;
388*25c28e83SPiotr Jasiukajtis 		res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
389*25c28e83SPiotr Jasiukajtis 		HI(&res0) = resh0;
390*25c28e83SPiotr Jasiukajtis 		LO(&res0) = LO(px);
391*25c28e83SPiotr Jasiukajtis 		HI(&res_c0) = res_ch0;
392*25c28e83SPiotr Jasiukajtis 		LO(&res_c0) = 0;
393*25c28e83SPiotr Jasiukajtis 
394*25c28e83SPiotr Jasiukajtis 		px += stridex;
395*25c28e83SPiotr Jasiukajtis 
396*25c28e83SPiotr Jasiukajtis 		dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
397*25c28e83SPiotr Jasiukajtis 		dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
398*25c28e83SPiotr Jasiukajtis 		xx0 = dexp_hi0 * dexp_hi0;
399*25c28e83SPiotr Jasiukajtis 		xx0 = (res0 - res_c0) * xx0;
400*25c28e83SPiotr Jasiukajtis 		res0 = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
401*25c28e83SPiotr Jasiukajtis 
402*25c28e83SPiotr Jasiukajtis 		res0 = dexp_hi0 * res0 + dexp_lo0 + dexp_hi0;
403*25c28e83SPiotr Jasiukajtis 
404*25c28e83SPiotr Jasiukajtis 		HI(&dsqrt_exp0) = sqrt_exp0;
405*25c28e83SPiotr Jasiukajtis 		LO(&dsqrt_exp0) = 0;
406*25c28e83SPiotr Jasiukajtis 		res0 *= dsqrt_exp0;
407*25c28e83SPiotr Jasiukajtis 
408*25c28e83SPiotr Jasiukajtis 		*py = res0;
409*25c28e83SPiotr Jasiukajtis 		py += stridey;
410*25c28e83SPiotr Jasiukajtis 	}
411*25c28e83SPiotr Jasiukajtis }
412