xref: /illumos-gate/usr/src/lib/libmvec/common/__vrsqrtf.c (revision fcdb3229a31dd4ff700c69238814e326aad49098)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
24  */
25 /*
26  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
27  * Use is subject to license terms.
28  */
29 
30 #include "libm_inlines.h"
31 
32 #ifdef __RESTRICT
33 #define	restrict _Restrict
34 #else
35 #define	restrict
36 #endif
37 
38 /*
39  * float rsqrtf(float x)
40  *
41  * Method :
42  *	1. Special cases:
43  *		for x = NaN			=> QNaN;
44  *		for x = +Inf			=> 0;
45  *		for x is negative, -Inf		=> QNaN + invalid;
46  *		for x = +0			=> +Inf + divide-by-zero;
47  *		for x = -0			=> -Inf + divide-by-zero.
48  *	2. Computes reciprocal square root from:
49  *		x = m * 2**n
50  *	Where:
51  *		m = [0.5, 2),
52  *		n = ((exponent + 1) & ~1).
53  *	Then:
54  *		rsqrtf(x) = 1/sqrt( m * 2**n ) = (2 ** (-n/2)) * (1/sqrt(m))
55  *	2. Computes 1/sqrt(m) from:
56  *		1/sqrt(m) = (1/sqrt(m0)) * (1/sqrt(1 + (1/m0)*dm))
57  *	Where:
58  *		m = m0 + dm,
59  *		m0 = 0.5 * (1 + k/64) for m = [0.5, 0.5+127/256), k = [0, 63];
60  *		m0 = 1.0 * (0 + k/64) for m = [0.5+127/256, 1.0+127/128),
61  *		    k = [64, 127];
62  *	Then:
63  *		1/sqrt(m0), 1/m0 are looked up in a table,
64  *		1/sqrt(1 + (1/m0)*dm) is computed using approximation:
65  *			1/sqrt(1 + z) = ((a3 * z + a2) * z + a1) * z + a0
66  *			where z = [-1/64, 1/64].
67  *
68  * Accuracy:
69  *	The maximum relative error for the approximating
70  *	polynomial is 2**(-27.87).
71  *	Maximum error observed: less than 0.534 ulp for the
72  *	whole float type range.
73  */
74 
75 extern float sqrtf(float);
76 
77 static const double __TBL_rsqrtf[] = {
78 /*
79  * i = [0,63]
80  * TBL[2*i  ] = 1 / (*(double*)&(0x3fe0000000000000ULL + (i << 46))) * 2**-24;
81  * TBL[2*i+1] = 1 / sqrtl(*(double*)&(0x3fe0000000000000ULL + (i << 46)));
82  * i = [64,127]
83  * TBL[2*i  ] = 1 / (*(double*)&(0x3fe0000000000000ULL + (i << 46))) * 2**-23;
84  * TBL[2*i+1] = 1 / sqrtl(*(double*)&(0x3fe0000000000000ULL + (i << 46)));
85  */
86 	1.1920928955078125000e-07, 1.4142135623730951455e+00,
87 	1.1737530048076923728e-07, 1.4032928308912466786e+00,
88 	1.1559688683712121533e-07, 1.3926212476455828160e+00,
89 	1.1387156016791044559e-07, 1.3821894809301762397e+00,
90 	1.1219697840073529256e-07, 1.3719886811400707760e+00,
91 	1.1057093523550724772e-07, 1.3620104492139977204e+00,
92 	1.0899135044642856803e-07, 1.3522468075656264297e+00,
93 	1.0745626100352112918e-07, 1.3426901732747025253e+00,
94 	1.0596381293402777190e-07, 1.3333333333333332593e+00,
95 	1.0451225385273972023e-07, 1.3241694217637887121e+00,
96 	1.0309992609797297870e-07, 1.3151918984428583315e+00,
97 	1.0172526041666667320e-07, 1.3063945294843617440e+00,
98 	1.0038677014802631022e-07, 1.2977713690461003537e+00,
99 	9.9083045860389616921e-08, 1.2893167424406084542e+00,
100 	9.7812750400641022247e-08, 1.2810252304406970492e+00,
101 	9.6574614319620251657e-08, 1.2728916546811681609e+00,
102 	9.5367431640625005294e-08, 1.2649110640673517647e+00,
103 	9.4190055941358019463e-08, 1.2570787221094177344e+00,
104 	9.3041396722560978838e-08, 1.2493900951088485751e+00,
105 	9.1920416039156631290e-08, 1.2418408411301324890e+00,
106 	9.0826125372023804482e-08, 1.2344267996967352996e+00,
107 	8.9757582720588234048e-08, 1.2271439821557927896e+00,
108 	8.8713889898255812722e-08, 1.2199885626608373279e+00,
109 	8.7694190014367814875e-08, 1.2129568697262453902e+00,
110 	8.6697665127840911497e-08, 1.2060453783110545167e+00,
111 	8.5723534058988761666e-08, 1.1992507023933782762e+00,
112 	8.4771050347222225457e-08, 1.1925695879998878812e+00,
113 	8.3839500343406599951e-08, 1.1859989066577618644e+00,
114 	8.2928201426630432481e-08, 1.1795356492391770864e+00,
115 	8.2036500336021511923e-08, 1.1731769201708264205e+00,
116 	8.1163771609042551220e-08, 1.1669199319831564665e+00,
117 	8.0309416118421050820e-08, 1.1607620001760186046e+00,
118 	7.9472859700520828922e-08, 1.1547005383792514621e+00,
119 	7.8653551868556699530e-08, 1.1487330537883810866e+00,
120 	7.7850964604591830522e-08, 1.1428571428571427937e+00,
121 	7.7064591224747481298e-08, 1.1370704872299222110e+00,
122 	7.6293945312500001588e-08, 1.1313708498984760276e+00,
123 	7.5538559715346535571e-08, 1.1257560715684669095e+00,
124 	7.4797985600490195040e-08, 1.1202240672224077489e+00,
125 	7.4071791565533974158e-08, 1.1147728228665882977e+00,
126 	7.3359562800480773303e-08, 1.1094003924504582947e+00,
127 	7.2660900297619054173e-08, 1.1041048949477667573e+00,
128 	7.1975420106132072725e-08, 1.0988845115895122806e+00,
129 	7.1302752628504667579e-08, 1.0937374832394612945e+00,
130 	7.0642541956018514597e-08, 1.0886621079036347126e+00,
131 	6.9994445240825691959e-08, 1.0836567383657542685e+00,
132 	6.9358132102272723904e-08, 1.0787197799411873955e+00,
133 	6.8733284065315314719e-08, 1.0738496883424388795e+00,
134 	6.8119594029017853361e-08, 1.0690449676496975862e+00,
135 	6.7516765763274335346e-08, 1.0643041683803828867e+00,
136 	6.6924513432017540145e-08, 1.0596258856520350822e+00,
137 	6.6342561141304348632e-08, 1.0550087574332591700e+00,
138 	6.5770642510775861156e-08, 1.0504514628777803509e+00,
139 	6.5208500267094023655e-08, 1.0459527207369814228e+00,
140 	6.4655885858050847233e-08, 1.0415112878465908608e+00,
141 	6.4112559086134451001e-08, 1.0371259576834630511e+00,
142 	6.3578287760416665784e-08, 1.0327955589886446131e+00,
143 	6.3052847365702481089e-08, 1.0285189544531601058e+00,
144 	6.2536020747950822927e-08, 1.0242950394631678002e+00,
145 	6.2027597815040656970e-08, 1.0201227409013413627e+00,
146 	6.1527375252016127325e-08, 1.0160010160015240377e+00,
147 	6.1035156250000001271e-08, 1.0119288512538813229e+00,
148 	6.0550750248015869655e-08, 1.0079052613579393416e+00,
149 	6.0073972687007873182e-08, 1.0039292882210537616e+00,
150 	1.1920928955078125000e-07, 1.0000000000000000000e+00,
151 	1.1737530048076923728e-07, 9.9227787671366762812e-01,
152 	1.1559688683712121533e-07, 9.8473192783466190203e-01,
153 	1.1387156016791044559e-07, 9.7735555485044178781e-01,
154 	1.1219697840073529256e-07, 9.7014250014533187638e-01,
155 	1.1057093523550724772e-07, 9.6308682468615358641e-01,
156 	1.0899135044642856803e-07, 9.5618288746751489704e-01,
157 	1.0745626100352112918e-07, 9.4942532655508271588e-01,
158 	1.0596381293402777190e-07, 9.4280904158206335630e-01,
159 	1.0451225385273972023e-07, 9.3632917756904454620e-01,
160 	1.0309992609797297870e-07, 9.2998110995055427441e-01,
161 	1.0172526041666667320e-07, 9.2376043070340119190e-01,
162 	1.0038677014802631022e-07, 9.1766293548224708854e-01,
163 	9.9083045860389616921e-08, 9.1168461167710357351e-01,
164 	9.7812750400641022247e-08, 9.0582162731567661407e-01,
165 	9.6574614319620251657e-08, 9.0007032074081916306e-01,
166 	9.5367431640625005294e-08, 8.9442719099991585541e-01,
167 	9.4190055941358019463e-08, 8.8888888888888883955e-01,
168 	9.3041396722560978838e-08, 8.8345220859877238162e-01,
169 	9.1920416039156631290e-08, 8.7811407991752277180e-01,
170 	9.0826125372023804482e-08, 8.7287156094396955996e-01,
171 	8.9757582720588234048e-08, 8.6772183127462465535e-01,
172 	8.8713889898255812722e-08, 8.6266218562750729415e-01,
173 	8.7694190014367814875e-08, 8.5769002787023584933e-01,
174 	8.6697665127840911497e-08, 8.5280286542244176928e-01,
175 	8.5723534058988761666e-08, 8.4799830400508802164e-01,
176 	8.4771050347222225457e-08, 8.4327404271156780613e-01,
177 	8.3839500343406599951e-08, 8.3862786937753464045e-01,
178 	8.2928201426630432481e-08, 8.3405765622829908246e-01,
179 	8.2036500336021511923e-08, 8.2956135578434020417e-01,
180 	8.1163771609042551220e-08, 8.2513699700703468931e-01,
181 	8.0309416118421050820e-08, 8.2078268166812329287e-01,
182 	7.9472859700520828922e-08, 8.1649658092772603446e-01,
183 	7.8653551868556699530e-08, 8.1227693210689522196e-01,
184 	7.7850964604591830522e-08, 8.0812203564176865456e-01,
185 	7.7064591224747481298e-08, 8.0403025220736967782e-01,
186 	7.6293945312500001588e-08, 8.0000000000000004441e-01,
187 	7.5538559715346535571e-08, 7.9602975216799132241e-01,
188 	7.4797985600490195040e-08, 7.9211803438133943089e-01,
189 	7.4071791565533974158e-08, 7.8826342253143455441e-01,
190 	7.3359562800480773303e-08, 7.8446454055273617811e-01,
191 	7.2660900297619054173e-08, 7.8072005835882651859e-01,
192 	7.1975420106132072725e-08, 7.7702868988581130782e-01,
193 	7.1302752628504667579e-08, 7.7338919123653082632e-01,
194 	7.0642541956018514597e-08, 7.6980035891950104876e-01,
195 	6.9994445240825691959e-08, 7.6626102817692109959e-01,
196 	6.9358132102272723904e-08, 7.6277007139647390321e-01,
197 	6.8733284065315314719e-08, 7.5932639660199918730e-01,
198 	6.8119594029017853361e-08, 7.5592894601845450619e-01,
199 	6.7516765763274335346e-08, 7.5257669470687782454e-01,
200 	6.6924513432017540145e-08, 7.4926864926535519107e-01,
201 	6.6342561141304348632e-08, 7.4600384659225105199e-01,
202 	6.5770642510775861156e-08, 7.4278135270820744296e-01,
203 	6.5208500267094023655e-08, 7.3960026163363878915e-01,
204 	6.4655885858050847233e-08, 7.3645969431865865307e-01,
205 	6.4112559086134451001e-08, 7.3335879762256905856e-01,
206 	6.3578287760416665784e-08, 7.3029674334022143256e-01,
207 	6.3052847365702481089e-08, 7.2727272727272729291e-01,
208 	6.2536020747950822927e-08, 7.2428596834014824513e-01,
209 	6.2027597815040656970e-08, 7.2133570773394584119e-01,
210 	6.1527375252016127325e-08, 7.1842120810709964029e-01,
211 	6.1035156250000001271e-08, 7.1554175279993270653e-01,
212 	6.0550750248015869655e-08, 7.1269664509979835376e-01,
213 	6.0073972687007873182e-08, 7.0988520753289097165e-01,
214 };
215 
216 static const unsigned long long LCONST[] = {
217 0x3feffffffee7f18fULL,	/* A0 = 9.99999997962321453275e-01	*/
218 0xbfdffffffe07e52fULL,	/* A1 =-4.99999998166077580600e-01	*/
219 0x3fd801180ca296d9ULL,	/* A2 = 3.75066768969515586277e-01	*/
220 0xbfd400fc0bbb8e78ULL,	/* A3 =-3.12560092408808548438e-01	*/
221 };
222 
223 static void
224 __vrsqrtf_n(int n, float *restrict px, int stridex, float *restrict py,
225     int stridey);
226 
227 #define	RETURN(ret)						\
228 {								\
229 	*py = (ret);						\
230 	py += stridey;						\
231 	if (n_n == 0) {						\
232 		spx = px;					\
233 		spy = py;					\
234 		ax0 = *(int *)px;				\
235 		continue;					\
236 	}							\
237 	n--;							\
238 	break;							\
239 }
240 
241 void
__vrsqrtf(int n,float * restrict px,int stridex,float * restrict py,int stridey)242 __vrsqrtf(int n, float *restrict px, int stridex, float *restrict py,
243     int stridey)
244 {
245 	float		*spx, *spy;
246 	int		ax0, n_n;
247 	float		res;
248 	float		FONE = 1.0f, FTWO = 2.0f;
249 
250 	while (n > 1) {
251 		n_n = 0;
252 		spx = px;
253 		spy = py;
254 		ax0 = *(int *)px;
255 		for (; n > 1; n--) {
256 			px += stridex;
257 			if (ax0 >= 0x7f800000) { /* X = NaN or Inf */
258 				res = *(px - stridex);
259 				RETURN(FONE / res)
260 			}
261 
262 			py += stridey;
263 
264 			if (ax0 < 0x00800000) {
265 				/* X = denormal, zero or negative */
266 				py -= stridey;
267 				res = *(px - stridex);
268 
269 				if ((ax0 & 0x7fffffff) == 0) { /* |X| = zero */
270 					RETURN(FONE / res)
271 				} else if (ax0 >= 0) {	/* X = denormal	*/
272 					/*  9.99999997962321453275e-01	*/
273 					double A0 = ((double *)LCONST)[0];
274 					/* -4.99999998166077580600e-01	*/
275 					double A1 = ((double *)LCONST)[1];
276 					/*  3.75066768969515586277e-01	*/
277 					double A2 = ((double *)LCONST)[2];
278 					/* -3.12560092408808548438e-01	*/
279 					double A3 = ((double *)LCONST)[3];
280 
281 					double res0, xx0, tbl_div0, tbl_sqrt0;
282 					float	fres0;
283 					int	iax0, si0, iexp0;
284 
285 					res = *(int *)&res;
286 					res *= FTWO;
287 					ax0 = *(int *)&res;
288 					iexp0 = ax0 >> 24;
289 					iexp0 = 0x3f + 0x4b - iexp0;
290 					iexp0 = iexp0 << 23;
291 
292 					si0 = (ax0 >> 13) & 0x7f0;
293 
294 					tbl_div0 = ((double *)
295 					    ((char *)__TBL_rsqrtf + si0))[0];
296 					tbl_sqrt0 = ((double *)
297 					    ((char *)__TBL_rsqrtf + si0))[1];
298 					iax0 = ax0 & 0x7ffe0000;
299 					iax0 = ax0 - iax0;
300 					xx0 = iax0 * tbl_div0;
301 					res0 = tbl_sqrt0 *
302 					    (((A3 * xx0 + A2) * xx0 + A1) *
303 					    xx0 + A0);
304 
305 					fres0 = res0;
306 					iexp0 += *(int *)&fres0;
307 					RETURN(*(float *)&iexp0)
308 				} else {	/* X = negative	*/
309 					RETURN(sqrtf(res))
310 				}
311 			}
312 			n_n++;
313 			ax0 = *(int *)px;
314 		}
315 		if (n_n > 0)
316 			__vrsqrtf_n(n_n, spx, stridex, spy, stridey);
317 	}
318 
319 	if (n > 0) {
320 		ax0 = *(int *)px;
321 
322 		if (ax0 >= 0x7f800000) {	/* X = NaN or Inf	*/
323 			res = *px;
324 			*py = FONE / res;
325 		} else if (ax0 < 0x00800000) {
326 			/* X = denormal, zero or negative */
327 			res = *px;
328 
329 			if ((ax0 & 0x7fffffff) == 0) {	/* |X| = zero	*/
330 				*py = FONE / res;
331 			} else if (ax0 >= 0) {	/* X = denormal	*/
332 				/*  9.99999997962321453275e-01	*/
333 				double A0 = ((double *)LCONST)[0];
334 				/* -4.99999998166077580600e-01	*/
335 				double A1 = ((double *)LCONST)[1];
336 				/*  3.75066768969515586277e-01	*/
337 				double A2 = ((double *)LCONST)[2];
338 				/* -3.12560092408808548438e-01	*/
339 				double A3 = ((double *)LCONST)[3];
340 				double		res0, xx0, tbl_div0, tbl_sqrt0;
341 				float		fres0;
342 				int		iax0, si0, iexp0;
343 
344 				res = *(int *)&res;
345 				res *= FTWO;
346 				ax0 = *(int *)&res;
347 				iexp0 = ax0 >> 24;
348 				iexp0 = 0x3f + 0x4b - iexp0;
349 				iexp0 = iexp0 << 23;
350 
351 				si0 = (ax0 >> 13) & 0x7f0;
352 
353 				tbl_div0 = ((double *)((char *)__TBL_rsqrtf +
354 				    si0))[0];
355 				tbl_sqrt0 = ((double *)((char *)__TBL_rsqrtf +
356 				    si0))[1];
357 				iax0 = ax0 & 0x7ffe0000;
358 				iax0 = ax0 - iax0;
359 				xx0 = iax0 * tbl_div0;
360 				res0 = tbl_sqrt0 *
361 				    (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0);
362 
363 				fres0 = res0;
364 				iexp0 += *(int *)&fres0;
365 
366 				*(int *)py = iexp0;
367 			} else {	/* X = negative	*/
368 				*py = sqrtf(res);
369 			}
370 		} else {
371 			/*  9.99999997962321453275e-01	*/
372 			double A0 = ((double *)LCONST)[0];
373 			/* -4.99999998166077580600e-01	*/
374 			double A1 = ((double *)LCONST)[1];
375 			/*  3.75066768969515586277e-01	*/
376 			double A2 = ((double *)LCONST)[2];
377 			/* -3.12560092408808548438e-01	*/
378 			double A3 = ((double *)LCONST)[3];
379 			double		res0, xx0, tbl_div0, tbl_sqrt0;
380 			float		fres0;
381 			int		iax0, si0, iexp0;
382 
383 			iexp0 = ax0 >> 24;
384 			iexp0 = 0x3f - iexp0;
385 			iexp0 = iexp0 << 23;
386 
387 			si0 = (ax0 >> 13) & 0x7f0;
388 
389 			tbl_div0 = ((double *)((char *)__TBL_rsqrtf + si0))[0];
390 			tbl_sqrt0 = ((double *)((char *)__TBL_rsqrtf + si0))[1];
391 			iax0 = ax0 & 0x7ffe0000;
392 			iax0 = ax0 - iax0;
393 			xx0 = iax0 * tbl_div0;
394 			res0 = tbl_sqrt0 * (((A3 * xx0 + A2) * xx0 + A1) *
395 			    xx0 + A0);
396 
397 			fres0 = res0;
398 			iexp0 += *(int *)&fres0;
399 
400 			*(int *)py = iexp0;
401 		}
402 	}
403 }
404 
405 void
__vrsqrtf_n(int n,float * restrict px,int stridex,float * restrict py,int stridey)406 __vrsqrtf_n(int n, float *restrict px, int stridex, float *restrict py,
407     int stridey)
408 {
409 	double A0 = ((double *)LCONST)[0]; /*  9.99999997962321453275e-01 */
410 	double A1 = ((double *)LCONST)[1]; /* -4.99999998166077580600e-01 */
411 	double A2 = ((double *)LCONST)[2]; /*  3.75066768969515586277e-01 */
412 	double A3 = ((double *)LCONST)[3]; /* -3.12560092408808548438e-01 */
413 	double		res0, xx0, tbl_div0, tbl_sqrt0;
414 	float		fres0;
415 	int		iax0, ax0, si0, iexp0;
416 
417 #if defined(ARCH_v7) || defined(ARCH_v8)
418 	double		res1, xx1, tbl_div1, tbl_sqrt1;
419 	double		res2, xx2, tbl_div2, tbl_sqrt2;
420 	float		fres1, fres2;
421 	int		iax1, ax1, si1, iexp1;
422 	int		iax2, ax2, si2, iexp2;
423 
424 	for (; n > 2; n -= 3) {
425 		ax0 = *(int *)px;
426 		px += stridex;
427 
428 		ax1 = *(int *)px;
429 		px += stridex;
430 
431 		ax2 = *(int *)px;
432 		px += stridex;
433 
434 		iexp0 = ax0 >> 24;
435 		iexp1 = ax1 >> 24;
436 		iexp2 = ax2 >> 24;
437 		iexp0 = 0x3f - iexp0;
438 		iexp1 = 0x3f - iexp1;
439 		iexp2 = 0x3f - iexp2;
440 
441 		iexp0 = iexp0 << 23;
442 		iexp1 = iexp1 << 23;
443 		iexp2 = iexp2 << 23;
444 
445 		si0 = (ax0 >> 13) & 0x7f0;
446 		si1 = (ax1 >> 13) & 0x7f0;
447 		si2 = (ax2 >> 13) & 0x7f0;
448 
449 		tbl_div0 = ((double *)((char *)__TBL_rsqrtf + si0))[0];
450 		tbl_div1 = ((double *)((char *)__TBL_rsqrtf + si1))[0];
451 		tbl_div2 = ((double *)((char *)__TBL_rsqrtf + si2))[0];
452 		tbl_sqrt0 = ((double *)((char *)__TBL_rsqrtf + si0))[1];
453 		tbl_sqrt1 = ((double *)((char *)__TBL_rsqrtf + si1))[1];
454 		tbl_sqrt2 = ((double *)((char *)__TBL_rsqrtf + si2))[1];
455 		iax0 = ax0 & 0x7ffe0000;
456 		iax1 = ax1 & 0x7ffe0000;
457 		iax2 = ax2 & 0x7ffe0000;
458 		iax0 = ax0 - iax0;
459 		iax1 = ax1 - iax1;
460 		iax2 = ax2 - iax2;
461 		xx0 = iax0 * tbl_div0;
462 		xx1 = iax1 * tbl_div1;
463 		xx2 = iax2 * tbl_div2;
464 		res0 = tbl_sqrt0 * (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0);
465 		res1 = tbl_sqrt1 * (((A3 * xx1 + A2) * xx1 + A1) * xx1 + A0);
466 		res2 = tbl_sqrt2 * (((A3 * xx2 + A2) * xx2 + A1) * xx2 + A0);
467 
468 		fres0 = res0;
469 		fres1 = res1;
470 		fres2 = res2;
471 
472 		iexp0 += *(int *)&fres0;
473 		iexp1 += *(int *)&fres1;
474 		iexp2 += *(int *)&fres2;
475 		*(int *)py = iexp0;
476 		py += stridey;
477 		*(int *)py = iexp1;
478 		py += stridey;
479 		*(int *)py = iexp2;
480 		py += stridey;
481 	}
482 #endif
483 	for (; n > 0; n--) {
484 		ax0 = *(int *)px;
485 		px += stridex;
486 
487 		iexp0 = ax0 >> 24;
488 		iexp0 = 0x3f - iexp0;
489 		iexp0 = iexp0 << 23;
490 
491 		si0 = (ax0 >> 13) & 0x7f0;
492 
493 		tbl_div0 = ((double *)((char *)__TBL_rsqrtf + si0))[0];
494 		tbl_sqrt0 = ((double *)((char *)__TBL_rsqrtf + si0))[1];
495 		iax0 = ax0 & 0x7ffe0000;
496 		iax0 = ax0 - iax0;
497 		xx0 = iax0 * tbl_div0;
498 		res0 = tbl_sqrt0 * (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0);
499 
500 		fres0 = res0;
501 		iexp0 += *(int *)&fres0;
502 		*(int *)py = iexp0;
503 		py += stridey;
504 	}
505 }
506