xref: /titanic_52/usr/src/lib/libmvec/common/__vrsqrtf.c (revision ddc0e0b53c661f6e439e3b7072b3ef353eadb4af)
1*25c28e83SPiotr Jasiukajtis /*
2*25c28e83SPiotr Jasiukajtis  * CDDL HEADER START
3*25c28e83SPiotr Jasiukajtis  *
4*25c28e83SPiotr Jasiukajtis  * The contents of this file are subject to the terms of the
5*25c28e83SPiotr Jasiukajtis  * Common Development and Distribution License (the "License").
6*25c28e83SPiotr Jasiukajtis  * You may not use this file except in compliance with the License.
7*25c28e83SPiotr Jasiukajtis  *
8*25c28e83SPiotr Jasiukajtis  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9*25c28e83SPiotr Jasiukajtis  * or http://www.opensolaris.org/os/licensing.
10*25c28e83SPiotr Jasiukajtis  * See the License for the specific language governing permissions
11*25c28e83SPiotr Jasiukajtis  * and limitations under the License.
12*25c28e83SPiotr Jasiukajtis  *
13*25c28e83SPiotr Jasiukajtis  * When distributing Covered Code, include this CDDL HEADER in each
14*25c28e83SPiotr Jasiukajtis  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15*25c28e83SPiotr Jasiukajtis  * If applicable, add the following below this CDDL HEADER, with the
16*25c28e83SPiotr Jasiukajtis  * fields enclosed by brackets "[]" replaced with your own identifying
17*25c28e83SPiotr Jasiukajtis  * information: Portions Copyright [yyyy] [name of copyright owner]
18*25c28e83SPiotr Jasiukajtis  *
19*25c28e83SPiotr Jasiukajtis  * CDDL HEADER END
20*25c28e83SPiotr Jasiukajtis  */
21*25c28e83SPiotr Jasiukajtis 
22*25c28e83SPiotr Jasiukajtis /*
23*25c28e83SPiotr Jasiukajtis  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
24*25c28e83SPiotr Jasiukajtis  */
25*25c28e83SPiotr Jasiukajtis /*
26*25c28e83SPiotr Jasiukajtis  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
27*25c28e83SPiotr Jasiukajtis  * Use is subject to license terms.
28*25c28e83SPiotr Jasiukajtis  */
29*25c28e83SPiotr Jasiukajtis 
30*25c28e83SPiotr Jasiukajtis #include "libm_inlines.h"
31*25c28e83SPiotr Jasiukajtis 
32*25c28e83SPiotr Jasiukajtis #ifdef __RESTRICT
33*25c28e83SPiotr Jasiukajtis #define restrict _Restrict
34*25c28e83SPiotr Jasiukajtis #else
35*25c28e83SPiotr Jasiukajtis #define restrict
36*25c28e83SPiotr Jasiukajtis #endif
37*25c28e83SPiotr Jasiukajtis 
38*25c28e83SPiotr Jasiukajtis /* float rsqrtf(float x)
39*25c28e83SPiotr Jasiukajtis  *
40*25c28e83SPiotr Jasiukajtis  * Method :
41*25c28e83SPiotr Jasiukajtis  *	1. Special cases:
42*25c28e83SPiotr Jasiukajtis  *		for x = NaN				=> QNaN;
43*25c28e83SPiotr Jasiukajtis  *		for x = +Inf				=> 0;
44*25c28e83SPiotr Jasiukajtis  *		for x is negative, -Inf			=> QNaN + invalid;
45*25c28e83SPiotr Jasiukajtis  *		for x = +0				=> +Inf + divide-by-zero;
46*25c28e83SPiotr Jasiukajtis  *		for x = -0				=> -Inf + divide-by-zero.
47*25c28e83SPiotr Jasiukajtis  *	2. Computes reciprocal square root from:
48*25c28e83SPiotr Jasiukajtis  *		x = m * 2**n
49*25c28e83SPiotr Jasiukajtis  *	Where:
50*25c28e83SPiotr Jasiukajtis  *		m = [0.5, 2),
51*25c28e83SPiotr Jasiukajtis  *		n = ((exponent + 1) & ~1).
52*25c28e83SPiotr Jasiukajtis  *	Then:
53*25c28e83SPiotr Jasiukajtis  *		rsqrtf(x) = 1/sqrt( m * 2**n ) = (2 ** (-n/2)) * (1/sqrt(m))
54*25c28e83SPiotr Jasiukajtis  *	2. Computes 1/sqrt(m) from:
55*25c28e83SPiotr Jasiukajtis  *		1/sqrt(m) = (1/sqrt(m0)) * (1/sqrt(1 + (1/m0)*dm))
56*25c28e83SPiotr Jasiukajtis  *	Where:
57*25c28e83SPiotr Jasiukajtis  *		m = m0 + dm,
58*25c28e83SPiotr Jasiukajtis  *		m0 = 0.5 * (1 + k/64) for m = [0.5,         0.5+127/256), k = [0, 63];
59*25c28e83SPiotr Jasiukajtis  *		m0 = 1.0 * (0 + k/64) for m = [0.5+127/256, 1.0+127/128), k = [64, 127];
60*25c28e83SPiotr Jasiukajtis  *	Then:
61*25c28e83SPiotr Jasiukajtis  *		1/sqrt(m0), 1/m0 are looked up in a table,
62*25c28e83SPiotr Jasiukajtis  *		1/sqrt(1 + (1/m0)*dm) is computed using approximation:
63*25c28e83SPiotr Jasiukajtis  *			1/sqrt(1 + z) = ((a3 * z + a2) * z + a1) * z + a0
64*25c28e83SPiotr Jasiukajtis  *			where z = [-1/64, 1/64].
65*25c28e83SPiotr Jasiukajtis  *
66*25c28e83SPiotr Jasiukajtis  * Accuracy:
67*25c28e83SPiotr Jasiukajtis  *	The maximum relative error for the approximating
68*25c28e83SPiotr Jasiukajtis  *	polynomial is 2**(-27.87).
69*25c28e83SPiotr Jasiukajtis  *	Maximum error observed: less than 0.534 ulp for the
70*25c28e83SPiotr Jasiukajtis  *	whole float type range.
71*25c28e83SPiotr Jasiukajtis  */
72*25c28e83SPiotr Jasiukajtis 
73*25c28e83SPiotr Jasiukajtis extern float sqrtf(float);
74*25c28e83SPiotr Jasiukajtis 
75*25c28e83SPiotr Jasiukajtis static const double __TBL_rsqrtf[] = {
76*25c28e83SPiotr Jasiukajtis /*
77*25c28e83SPiotr Jasiukajtis i = [0,63]
78*25c28e83SPiotr Jasiukajtis  TBL[2*i  ] = 1 / (*(double*)&(0x3fe0000000000000ULL + (i << 46))) * 2**-24;
79*25c28e83SPiotr Jasiukajtis  TBL[2*i+1] = 1 / sqrtl(*(double*)&(0x3fe0000000000000ULL + (i << 46)));
80*25c28e83SPiotr Jasiukajtis i = [64,127]
81*25c28e83SPiotr Jasiukajtis  TBL[2*i  ] = 1 / (*(double*)&(0x3fe0000000000000ULL + (i << 46))) * 2**-23;
82*25c28e83SPiotr Jasiukajtis  TBL[2*i+1] = 1 / sqrtl(*(double*)&(0x3fe0000000000000ULL + (i << 46)));
83*25c28e83SPiotr Jasiukajtis */
84*25c28e83SPiotr Jasiukajtis  1.1920928955078125000e-07, 1.4142135623730951455e+00,
85*25c28e83SPiotr Jasiukajtis  1.1737530048076923728e-07, 1.4032928308912466786e+00,
86*25c28e83SPiotr Jasiukajtis  1.1559688683712121533e-07, 1.3926212476455828160e+00,
87*25c28e83SPiotr Jasiukajtis  1.1387156016791044559e-07, 1.3821894809301762397e+00,
88*25c28e83SPiotr Jasiukajtis  1.1219697840073529256e-07, 1.3719886811400707760e+00,
89*25c28e83SPiotr Jasiukajtis  1.1057093523550724772e-07, 1.3620104492139977204e+00,
90*25c28e83SPiotr Jasiukajtis  1.0899135044642856803e-07, 1.3522468075656264297e+00,
91*25c28e83SPiotr Jasiukajtis  1.0745626100352112918e-07, 1.3426901732747025253e+00,
92*25c28e83SPiotr Jasiukajtis  1.0596381293402777190e-07, 1.3333333333333332593e+00,
93*25c28e83SPiotr Jasiukajtis  1.0451225385273972023e-07, 1.3241694217637887121e+00,
94*25c28e83SPiotr Jasiukajtis  1.0309992609797297870e-07, 1.3151918984428583315e+00,
95*25c28e83SPiotr Jasiukajtis  1.0172526041666667320e-07, 1.3063945294843617440e+00,
96*25c28e83SPiotr Jasiukajtis  1.0038677014802631022e-07, 1.2977713690461003537e+00,
97*25c28e83SPiotr Jasiukajtis  9.9083045860389616921e-08, 1.2893167424406084542e+00,
98*25c28e83SPiotr Jasiukajtis  9.7812750400641022247e-08, 1.2810252304406970492e+00,
99*25c28e83SPiotr Jasiukajtis  9.6574614319620251657e-08, 1.2728916546811681609e+00,
100*25c28e83SPiotr Jasiukajtis  9.5367431640625005294e-08, 1.2649110640673517647e+00,
101*25c28e83SPiotr Jasiukajtis  9.4190055941358019463e-08, 1.2570787221094177344e+00,
102*25c28e83SPiotr Jasiukajtis  9.3041396722560978838e-08, 1.2493900951088485751e+00,
103*25c28e83SPiotr Jasiukajtis  9.1920416039156631290e-08, 1.2418408411301324890e+00,
104*25c28e83SPiotr Jasiukajtis  9.0826125372023804482e-08, 1.2344267996967352996e+00,
105*25c28e83SPiotr Jasiukajtis  8.9757582720588234048e-08, 1.2271439821557927896e+00,
106*25c28e83SPiotr Jasiukajtis  8.8713889898255812722e-08, 1.2199885626608373279e+00,
107*25c28e83SPiotr Jasiukajtis  8.7694190014367814875e-08, 1.2129568697262453902e+00,
108*25c28e83SPiotr Jasiukajtis  8.6697665127840911497e-08, 1.2060453783110545167e+00,
109*25c28e83SPiotr Jasiukajtis  8.5723534058988761666e-08, 1.1992507023933782762e+00,
110*25c28e83SPiotr Jasiukajtis  8.4771050347222225457e-08, 1.1925695879998878812e+00,
111*25c28e83SPiotr Jasiukajtis  8.3839500343406599951e-08, 1.1859989066577618644e+00,
112*25c28e83SPiotr Jasiukajtis  8.2928201426630432481e-08, 1.1795356492391770864e+00,
113*25c28e83SPiotr Jasiukajtis  8.2036500336021511923e-08, 1.1731769201708264205e+00,
114*25c28e83SPiotr Jasiukajtis  8.1163771609042551220e-08, 1.1669199319831564665e+00,
115*25c28e83SPiotr Jasiukajtis  8.0309416118421050820e-08, 1.1607620001760186046e+00,
116*25c28e83SPiotr Jasiukajtis  7.9472859700520828922e-08, 1.1547005383792514621e+00,
117*25c28e83SPiotr Jasiukajtis  7.8653551868556699530e-08, 1.1487330537883810866e+00,
118*25c28e83SPiotr Jasiukajtis  7.7850964604591830522e-08, 1.1428571428571427937e+00,
119*25c28e83SPiotr Jasiukajtis  7.7064591224747481298e-08, 1.1370704872299222110e+00,
120*25c28e83SPiotr Jasiukajtis  7.6293945312500001588e-08, 1.1313708498984760276e+00,
121*25c28e83SPiotr Jasiukajtis  7.5538559715346535571e-08, 1.1257560715684669095e+00,
122*25c28e83SPiotr Jasiukajtis  7.4797985600490195040e-08, 1.1202240672224077489e+00,
123*25c28e83SPiotr Jasiukajtis  7.4071791565533974158e-08, 1.1147728228665882977e+00,
124*25c28e83SPiotr Jasiukajtis  7.3359562800480773303e-08, 1.1094003924504582947e+00,
125*25c28e83SPiotr Jasiukajtis  7.2660900297619054173e-08, 1.1041048949477667573e+00,
126*25c28e83SPiotr Jasiukajtis  7.1975420106132072725e-08, 1.0988845115895122806e+00,
127*25c28e83SPiotr Jasiukajtis  7.1302752628504667579e-08, 1.0937374832394612945e+00,
128*25c28e83SPiotr Jasiukajtis  7.0642541956018514597e-08, 1.0886621079036347126e+00,
129*25c28e83SPiotr Jasiukajtis  6.9994445240825691959e-08, 1.0836567383657542685e+00,
130*25c28e83SPiotr Jasiukajtis  6.9358132102272723904e-08, 1.0787197799411873955e+00,
131*25c28e83SPiotr Jasiukajtis  6.8733284065315314719e-08, 1.0738496883424388795e+00,
132*25c28e83SPiotr Jasiukajtis  6.8119594029017853361e-08, 1.0690449676496975862e+00,
133*25c28e83SPiotr Jasiukajtis  6.7516765763274335346e-08, 1.0643041683803828867e+00,
134*25c28e83SPiotr Jasiukajtis  6.6924513432017540145e-08, 1.0596258856520350822e+00,
135*25c28e83SPiotr Jasiukajtis  6.6342561141304348632e-08, 1.0550087574332591700e+00,
136*25c28e83SPiotr Jasiukajtis  6.5770642510775861156e-08, 1.0504514628777803509e+00,
137*25c28e83SPiotr Jasiukajtis  6.5208500267094023655e-08, 1.0459527207369814228e+00,
138*25c28e83SPiotr Jasiukajtis  6.4655885858050847233e-08, 1.0415112878465908608e+00,
139*25c28e83SPiotr Jasiukajtis  6.4112559086134451001e-08, 1.0371259576834630511e+00,
140*25c28e83SPiotr Jasiukajtis  6.3578287760416665784e-08, 1.0327955589886446131e+00,
141*25c28e83SPiotr Jasiukajtis  6.3052847365702481089e-08, 1.0285189544531601058e+00,
142*25c28e83SPiotr Jasiukajtis  6.2536020747950822927e-08, 1.0242950394631678002e+00,
143*25c28e83SPiotr Jasiukajtis  6.2027597815040656970e-08, 1.0201227409013413627e+00,
144*25c28e83SPiotr Jasiukajtis  6.1527375252016127325e-08, 1.0160010160015240377e+00,
145*25c28e83SPiotr Jasiukajtis  6.1035156250000001271e-08, 1.0119288512538813229e+00,
146*25c28e83SPiotr Jasiukajtis  6.0550750248015869655e-08, 1.0079052613579393416e+00,
147*25c28e83SPiotr Jasiukajtis  6.0073972687007873182e-08, 1.0039292882210537616e+00,
148*25c28e83SPiotr Jasiukajtis  1.1920928955078125000e-07, 1.0000000000000000000e+00,
149*25c28e83SPiotr Jasiukajtis  1.1737530048076923728e-07, 9.9227787671366762812e-01,
150*25c28e83SPiotr Jasiukajtis  1.1559688683712121533e-07, 9.8473192783466190203e-01,
151*25c28e83SPiotr Jasiukajtis  1.1387156016791044559e-07, 9.7735555485044178781e-01,
152*25c28e83SPiotr Jasiukajtis  1.1219697840073529256e-07, 9.7014250014533187638e-01,
153*25c28e83SPiotr Jasiukajtis  1.1057093523550724772e-07, 9.6308682468615358641e-01,
154*25c28e83SPiotr Jasiukajtis  1.0899135044642856803e-07, 9.5618288746751489704e-01,
155*25c28e83SPiotr Jasiukajtis  1.0745626100352112918e-07, 9.4942532655508271588e-01,
156*25c28e83SPiotr Jasiukajtis  1.0596381293402777190e-07, 9.4280904158206335630e-01,
157*25c28e83SPiotr Jasiukajtis  1.0451225385273972023e-07, 9.3632917756904454620e-01,
158*25c28e83SPiotr Jasiukajtis  1.0309992609797297870e-07, 9.2998110995055427441e-01,
159*25c28e83SPiotr Jasiukajtis  1.0172526041666667320e-07, 9.2376043070340119190e-01,
160*25c28e83SPiotr Jasiukajtis  1.0038677014802631022e-07, 9.1766293548224708854e-01,
161*25c28e83SPiotr Jasiukajtis  9.9083045860389616921e-08, 9.1168461167710357351e-01,
162*25c28e83SPiotr Jasiukajtis  9.7812750400641022247e-08, 9.0582162731567661407e-01,
163*25c28e83SPiotr Jasiukajtis  9.6574614319620251657e-08, 9.0007032074081916306e-01,
164*25c28e83SPiotr Jasiukajtis  9.5367431640625005294e-08, 8.9442719099991585541e-01,
165*25c28e83SPiotr Jasiukajtis  9.4190055941358019463e-08, 8.8888888888888883955e-01,
166*25c28e83SPiotr Jasiukajtis  9.3041396722560978838e-08, 8.8345220859877238162e-01,
167*25c28e83SPiotr Jasiukajtis  9.1920416039156631290e-08, 8.7811407991752277180e-01,
168*25c28e83SPiotr Jasiukajtis  9.0826125372023804482e-08, 8.7287156094396955996e-01,
169*25c28e83SPiotr Jasiukajtis  8.9757582720588234048e-08, 8.6772183127462465535e-01,
170*25c28e83SPiotr Jasiukajtis  8.8713889898255812722e-08, 8.6266218562750729415e-01,
171*25c28e83SPiotr Jasiukajtis  8.7694190014367814875e-08, 8.5769002787023584933e-01,
172*25c28e83SPiotr Jasiukajtis  8.6697665127840911497e-08, 8.5280286542244176928e-01,
173*25c28e83SPiotr Jasiukajtis  8.5723534058988761666e-08, 8.4799830400508802164e-01,
174*25c28e83SPiotr Jasiukajtis  8.4771050347222225457e-08, 8.4327404271156780613e-01,
175*25c28e83SPiotr Jasiukajtis  8.3839500343406599951e-08, 8.3862786937753464045e-01,
176*25c28e83SPiotr Jasiukajtis  8.2928201426630432481e-08, 8.3405765622829908246e-01,
177*25c28e83SPiotr Jasiukajtis  8.2036500336021511923e-08, 8.2956135578434020417e-01,
178*25c28e83SPiotr Jasiukajtis  8.1163771609042551220e-08, 8.2513699700703468931e-01,
179*25c28e83SPiotr Jasiukajtis  8.0309416118421050820e-08, 8.2078268166812329287e-01,
180*25c28e83SPiotr Jasiukajtis  7.9472859700520828922e-08, 8.1649658092772603446e-01,
181*25c28e83SPiotr Jasiukajtis  7.8653551868556699530e-08, 8.1227693210689522196e-01,
182*25c28e83SPiotr Jasiukajtis  7.7850964604591830522e-08, 8.0812203564176865456e-01,
183*25c28e83SPiotr Jasiukajtis  7.7064591224747481298e-08, 8.0403025220736967782e-01,
184*25c28e83SPiotr Jasiukajtis  7.6293945312500001588e-08, 8.0000000000000004441e-01,
185*25c28e83SPiotr Jasiukajtis  7.5538559715346535571e-08, 7.9602975216799132241e-01,
186*25c28e83SPiotr Jasiukajtis  7.4797985600490195040e-08, 7.9211803438133943089e-01,
187*25c28e83SPiotr Jasiukajtis  7.4071791565533974158e-08, 7.8826342253143455441e-01,
188*25c28e83SPiotr Jasiukajtis  7.3359562800480773303e-08, 7.8446454055273617811e-01,
189*25c28e83SPiotr Jasiukajtis  7.2660900297619054173e-08, 7.8072005835882651859e-01,
190*25c28e83SPiotr Jasiukajtis  7.1975420106132072725e-08, 7.7702868988581130782e-01,
191*25c28e83SPiotr Jasiukajtis  7.1302752628504667579e-08, 7.7338919123653082632e-01,
192*25c28e83SPiotr Jasiukajtis  7.0642541956018514597e-08, 7.6980035891950104876e-01,
193*25c28e83SPiotr Jasiukajtis  6.9994445240825691959e-08, 7.6626102817692109959e-01,
194*25c28e83SPiotr Jasiukajtis  6.9358132102272723904e-08, 7.6277007139647390321e-01,
195*25c28e83SPiotr Jasiukajtis  6.8733284065315314719e-08, 7.5932639660199918730e-01,
196*25c28e83SPiotr Jasiukajtis  6.8119594029017853361e-08, 7.5592894601845450619e-01,
197*25c28e83SPiotr Jasiukajtis  6.7516765763274335346e-08, 7.5257669470687782454e-01,
198*25c28e83SPiotr Jasiukajtis  6.6924513432017540145e-08, 7.4926864926535519107e-01,
199*25c28e83SPiotr Jasiukajtis  6.6342561141304348632e-08, 7.4600384659225105199e-01,
200*25c28e83SPiotr Jasiukajtis  6.5770642510775861156e-08, 7.4278135270820744296e-01,
201*25c28e83SPiotr Jasiukajtis  6.5208500267094023655e-08, 7.3960026163363878915e-01,
202*25c28e83SPiotr Jasiukajtis  6.4655885858050847233e-08, 7.3645969431865865307e-01,
203*25c28e83SPiotr Jasiukajtis  6.4112559086134451001e-08, 7.3335879762256905856e-01,
204*25c28e83SPiotr Jasiukajtis  6.3578287760416665784e-08, 7.3029674334022143256e-01,
205*25c28e83SPiotr Jasiukajtis  6.3052847365702481089e-08, 7.2727272727272729291e-01,
206*25c28e83SPiotr Jasiukajtis  6.2536020747950822927e-08, 7.2428596834014824513e-01,
207*25c28e83SPiotr Jasiukajtis  6.2027597815040656970e-08, 7.2133570773394584119e-01,
208*25c28e83SPiotr Jasiukajtis  6.1527375252016127325e-08, 7.1842120810709964029e-01,
209*25c28e83SPiotr Jasiukajtis  6.1035156250000001271e-08, 7.1554175279993270653e-01,
210*25c28e83SPiotr Jasiukajtis  6.0550750248015869655e-08, 7.1269664509979835376e-01,
211*25c28e83SPiotr Jasiukajtis  6.0073972687007873182e-08, 7.0988520753289097165e-01,
212*25c28e83SPiotr Jasiukajtis };
213*25c28e83SPiotr Jasiukajtis 
214*25c28e83SPiotr Jasiukajtis static const unsigned long long LCONST[] = {
215*25c28e83SPiotr Jasiukajtis 0x3feffffffee7f18fULL,	/* A0 = 9.99999997962321453275e-01	*/
216*25c28e83SPiotr Jasiukajtis 0xbfdffffffe07e52fULL,	/* A1 =-4.99999998166077580600e-01	*/
217*25c28e83SPiotr Jasiukajtis 0x3fd801180ca296d9ULL,	/* A2 = 3.75066768969515586277e-01	*/
218*25c28e83SPiotr Jasiukajtis 0xbfd400fc0bbb8e78ULL,	/* A3 =-3.12560092408808548438e-01	*/
219*25c28e83SPiotr Jasiukajtis };
220*25c28e83SPiotr Jasiukajtis 
221*25c28e83SPiotr Jasiukajtis static void
222*25c28e83SPiotr Jasiukajtis __vrsqrtf_n(int n, float * restrict px, int stridex, float * restrict py, int stridey);
223*25c28e83SPiotr Jasiukajtis 
224*25c28e83SPiotr Jasiukajtis #pragma no_inline(__vrsqrtf_n)
225*25c28e83SPiotr Jasiukajtis 
226*25c28e83SPiotr Jasiukajtis #define RETURN(ret)						\
227*25c28e83SPiotr Jasiukajtis {								\
228*25c28e83SPiotr Jasiukajtis 	*py = (ret);						\
229*25c28e83SPiotr Jasiukajtis 	py += stridey;						\
230*25c28e83SPiotr Jasiukajtis 	if (n_n == 0)						\
231*25c28e83SPiotr Jasiukajtis 	{							\
232*25c28e83SPiotr Jasiukajtis 		spx = px; spy = py;				\
233*25c28e83SPiotr Jasiukajtis 		ax0 = *(int*)px;				\
234*25c28e83SPiotr Jasiukajtis 		continue;					\
235*25c28e83SPiotr Jasiukajtis 	}							\
236*25c28e83SPiotr Jasiukajtis 	n--;							\
237*25c28e83SPiotr Jasiukajtis 	break;							\
238*25c28e83SPiotr Jasiukajtis }
239*25c28e83SPiotr Jasiukajtis 
240*25c28e83SPiotr Jasiukajtis void
241*25c28e83SPiotr Jasiukajtis __vrsqrtf(int n, float * restrict px, int stridex, float * restrict py, int stridey)
242*25c28e83SPiotr Jasiukajtis {
243*25c28e83SPiotr Jasiukajtis 	float		*spx, *spy;
244*25c28e83SPiotr Jasiukajtis 	int		ax0, n_n;
245*25c28e83SPiotr Jasiukajtis 	float		res;
246*25c28e83SPiotr Jasiukajtis 	float		FONE = 1.0f, FTWO = 2.0f;
247*25c28e83SPiotr Jasiukajtis 
248*25c28e83SPiotr Jasiukajtis 	while (n > 1)
249*25c28e83SPiotr Jasiukajtis 	{
250*25c28e83SPiotr Jasiukajtis 		n_n = 0;
251*25c28e83SPiotr Jasiukajtis 		spx = px;
252*25c28e83SPiotr Jasiukajtis 		spy = py;
253*25c28e83SPiotr Jasiukajtis 		ax0 = *(int*)px;
254*25c28e83SPiotr Jasiukajtis 		for (; n > 1 ; n--)
255*25c28e83SPiotr Jasiukajtis 		{
256*25c28e83SPiotr Jasiukajtis 			px += stridex;
257*25c28e83SPiotr Jasiukajtis 			if (ax0 >= 0x7f800000)	/* X = NaN or Inf	*/
258*25c28e83SPiotr Jasiukajtis 			{
259*25c28e83SPiotr Jasiukajtis 				res = *(px - stridex);
260*25c28e83SPiotr Jasiukajtis 				RETURN (FONE / res)
261*25c28e83SPiotr Jasiukajtis 			}
262*25c28e83SPiotr Jasiukajtis 
263*25c28e83SPiotr Jasiukajtis 			py += stridey;
264*25c28e83SPiotr Jasiukajtis 
265*25c28e83SPiotr Jasiukajtis 			if (ax0 < 0x00800000)		/* X = denormal, zero or negative	*/
266*25c28e83SPiotr Jasiukajtis 			{
267*25c28e83SPiotr Jasiukajtis 				py -= stridey;
268*25c28e83SPiotr Jasiukajtis 				res = *(px - stridex);
269*25c28e83SPiotr Jasiukajtis 
270*25c28e83SPiotr Jasiukajtis 				if ((ax0 & 0x7fffffff) == 0)	/* |X| = zero	*/
271*25c28e83SPiotr Jasiukajtis 				{
272*25c28e83SPiotr Jasiukajtis 					RETURN (FONE / res)
273*25c28e83SPiotr Jasiukajtis 				}
274*25c28e83SPiotr Jasiukajtis 				else if (ax0 >= 0)	/* X = denormal	*/
275*25c28e83SPiotr Jasiukajtis 				{
276*25c28e83SPiotr Jasiukajtis 					double		A0 = ((double*)LCONST)[0];	/*  9.99999997962321453275e-01	*/
277*25c28e83SPiotr Jasiukajtis 					double		A1 = ((double*)LCONST)[1];	/* -4.99999998166077580600e-01	*/
278*25c28e83SPiotr Jasiukajtis 					double		A2 = ((double*)LCONST)[2];	/*  3.75066768969515586277e-01	*/
279*25c28e83SPiotr Jasiukajtis 					double		A3 = ((double*)LCONST)[3];	/* -3.12560092408808548438e-01	*/
280*25c28e83SPiotr Jasiukajtis 
281*25c28e83SPiotr Jasiukajtis 					double		res0, xx0, tbl_div0, tbl_sqrt0;
282*25c28e83SPiotr Jasiukajtis 					float		fres0;
283*25c28e83SPiotr Jasiukajtis 					int		iax0, si0, iexp0;
284*25c28e83SPiotr Jasiukajtis 
285*25c28e83SPiotr Jasiukajtis 					res = *(int*)&res;
286*25c28e83SPiotr Jasiukajtis 					res *= FTWO;
287*25c28e83SPiotr Jasiukajtis 					ax0 = *(int*)&res;
288*25c28e83SPiotr Jasiukajtis 					iexp0 = ax0 >> 24;
289*25c28e83SPiotr Jasiukajtis 					iexp0 = 0x3f + 0x4b - iexp0;
290*25c28e83SPiotr Jasiukajtis 					iexp0 = iexp0 << 23;
291*25c28e83SPiotr Jasiukajtis 
292*25c28e83SPiotr Jasiukajtis 					si0 = (ax0 >> 13) & 0x7f0;
293*25c28e83SPiotr Jasiukajtis 
294*25c28e83SPiotr Jasiukajtis 					tbl_div0 = ((double*)((char*)__TBL_rsqrtf + si0))[0];
295*25c28e83SPiotr Jasiukajtis 					tbl_sqrt0 = ((double*)((char*)__TBL_rsqrtf + si0))[1];
296*25c28e83SPiotr Jasiukajtis 					iax0 = ax0 & 0x7ffe0000;
297*25c28e83SPiotr Jasiukajtis 					iax0 = ax0 - iax0;
298*25c28e83SPiotr Jasiukajtis 					xx0 = iax0 * tbl_div0;
299*25c28e83SPiotr Jasiukajtis 					res0 = tbl_sqrt0 * (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0);
300*25c28e83SPiotr Jasiukajtis 
301*25c28e83SPiotr Jasiukajtis 					fres0 = res0;
302*25c28e83SPiotr Jasiukajtis 					iexp0 += *(int*)&fres0;
303*25c28e83SPiotr Jasiukajtis 					RETURN(*(float*)&iexp0)
304*25c28e83SPiotr Jasiukajtis 				}
305*25c28e83SPiotr Jasiukajtis 				else	/* X = negative	*/
306*25c28e83SPiotr Jasiukajtis 				{
307*25c28e83SPiotr Jasiukajtis 					RETURN (sqrtf(res))
308*25c28e83SPiotr Jasiukajtis 				}
309*25c28e83SPiotr Jasiukajtis 			}
310*25c28e83SPiotr Jasiukajtis 			n_n++;
311*25c28e83SPiotr Jasiukajtis 			ax0 = *(int*)px;
312*25c28e83SPiotr Jasiukajtis 		}
313*25c28e83SPiotr Jasiukajtis 		if (n_n > 0)
314*25c28e83SPiotr Jasiukajtis 			__vrsqrtf_n(n_n, spx, stridex, spy, stridey);
315*25c28e83SPiotr Jasiukajtis 	}
316*25c28e83SPiotr Jasiukajtis 
317*25c28e83SPiotr Jasiukajtis 	if (n > 0)
318*25c28e83SPiotr Jasiukajtis 	{
319*25c28e83SPiotr Jasiukajtis 		ax0 = *(int*)px;
320*25c28e83SPiotr Jasiukajtis 
321*25c28e83SPiotr Jasiukajtis 		if (ax0 >= 0x7f800000)	/* X = NaN or Inf	*/
322*25c28e83SPiotr Jasiukajtis 		{
323*25c28e83SPiotr Jasiukajtis 			res = *px;
324*25c28e83SPiotr Jasiukajtis 			*py = FONE / res;
325*25c28e83SPiotr Jasiukajtis 		}
326*25c28e83SPiotr Jasiukajtis 		else if (ax0 < 0x00800000)	/* X = denormal, zero or negative	*/
327*25c28e83SPiotr Jasiukajtis 		{
328*25c28e83SPiotr Jasiukajtis 			res = *px;
329*25c28e83SPiotr Jasiukajtis 
330*25c28e83SPiotr Jasiukajtis 			if ((ax0 & 0x7fffffff) == 0)	/* |X| = zero	*/
331*25c28e83SPiotr Jasiukajtis 			{
332*25c28e83SPiotr Jasiukajtis 				*py = FONE / res;
333*25c28e83SPiotr Jasiukajtis 			}
334*25c28e83SPiotr Jasiukajtis 			else if (ax0 >= 0)	/* X = denormal	*/
335*25c28e83SPiotr Jasiukajtis 			{
336*25c28e83SPiotr Jasiukajtis 				double		A0 = ((double*)LCONST)[0];	/*  9.99999997962321453275e-01	*/
337*25c28e83SPiotr Jasiukajtis 				double		A1 = ((double*)LCONST)[1];	/* -4.99999998166077580600e-01	*/
338*25c28e83SPiotr Jasiukajtis 				double		A2 = ((double*)LCONST)[2];	/*  3.75066768969515586277e-01	*/
339*25c28e83SPiotr Jasiukajtis 				double		A3 = ((double*)LCONST)[3];	/* -3.12560092408808548438e-01	*/
340*25c28e83SPiotr Jasiukajtis 				double		res0, xx0, tbl_div0, tbl_sqrt0;
341*25c28e83SPiotr Jasiukajtis 				float		fres0;
342*25c28e83SPiotr Jasiukajtis 				int		iax0, si0, iexp0;
343*25c28e83SPiotr Jasiukajtis 
344*25c28e83SPiotr Jasiukajtis 				res = *(int*)&res;
345*25c28e83SPiotr Jasiukajtis 				res *= FTWO;
346*25c28e83SPiotr Jasiukajtis 				ax0 = *(int*)&res;
347*25c28e83SPiotr Jasiukajtis 				iexp0 = ax0 >> 24;
348*25c28e83SPiotr Jasiukajtis 				iexp0 = 0x3f + 0x4b - iexp0;
349*25c28e83SPiotr Jasiukajtis 				iexp0 = iexp0 << 23;
350*25c28e83SPiotr Jasiukajtis 
351*25c28e83SPiotr Jasiukajtis 				si0 = (ax0 >> 13) & 0x7f0;
352*25c28e83SPiotr Jasiukajtis 
353*25c28e83SPiotr Jasiukajtis 				tbl_div0 = ((double*)((char*)__TBL_rsqrtf + si0))[0];
354*25c28e83SPiotr Jasiukajtis 				tbl_sqrt0 = ((double*)((char*)__TBL_rsqrtf + si0))[1];
355*25c28e83SPiotr Jasiukajtis 				iax0 = ax0 & 0x7ffe0000;
356*25c28e83SPiotr Jasiukajtis 				iax0 = ax0 - iax0;
357*25c28e83SPiotr Jasiukajtis 				xx0 = iax0 * tbl_div0;
358*25c28e83SPiotr Jasiukajtis 				res0 = tbl_sqrt0 * (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0);
359*25c28e83SPiotr Jasiukajtis 
360*25c28e83SPiotr Jasiukajtis 				fres0 = res0;
361*25c28e83SPiotr Jasiukajtis 				iexp0 += *(int*)&fres0;
362*25c28e83SPiotr Jasiukajtis 
363*25c28e83SPiotr Jasiukajtis 				*(int*)py = iexp0;
364*25c28e83SPiotr Jasiukajtis 			}
365*25c28e83SPiotr Jasiukajtis 			else	/* X = negative	*/
366*25c28e83SPiotr Jasiukajtis 			{
367*25c28e83SPiotr Jasiukajtis 				*py = sqrtf(res);
368*25c28e83SPiotr Jasiukajtis 			}
369*25c28e83SPiotr Jasiukajtis 		}
370*25c28e83SPiotr Jasiukajtis 		else
371*25c28e83SPiotr Jasiukajtis 		{
372*25c28e83SPiotr Jasiukajtis 			double		A0 = ((double*)LCONST)[0];	/*  9.99999997962321453275e-01	*/
373*25c28e83SPiotr Jasiukajtis 			double		A1 = ((double*)LCONST)[1];	/* -4.99999998166077580600e-01	*/
374*25c28e83SPiotr Jasiukajtis 			double		A2 = ((double*)LCONST)[2];	/*  3.75066768969515586277e-01	*/
375*25c28e83SPiotr Jasiukajtis 			double		A3 = ((double*)LCONST)[3];	/* -3.12560092408808548438e-01	*/
376*25c28e83SPiotr Jasiukajtis 			double		res0, xx0, tbl_div0, tbl_sqrt0;
377*25c28e83SPiotr Jasiukajtis 			float		fres0;
378*25c28e83SPiotr Jasiukajtis 			int		iax0, si0, iexp0;
379*25c28e83SPiotr Jasiukajtis 
380*25c28e83SPiotr Jasiukajtis 			iexp0 = ax0 >> 24;
381*25c28e83SPiotr Jasiukajtis 			iexp0 = 0x3f - iexp0;
382*25c28e83SPiotr Jasiukajtis 			iexp0 = iexp0 << 23;
383*25c28e83SPiotr Jasiukajtis 
384*25c28e83SPiotr Jasiukajtis 			si0 = (ax0 >> 13) & 0x7f0;
385*25c28e83SPiotr Jasiukajtis 
386*25c28e83SPiotr Jasiukajtis 			tbl_div0 = ((double*)((char*)__TBL_rsqrtf + si0))[0];
387*25c28e83SPiotr Jasiukajtis 			tbl_sqrt0 = ((double*)((char*)__TBL_rsqrtf + si0))[1];
388*25c28e83SPiotr Jasiukajtis 			iax0 = ax0 & 0x7ffe0000;
389*25c28e83SPiotr Jasiukajtis 			iax0 = ax0 - iax0;
390*25c28e83SPiotr Jasiukajtis 			xx0 = iax0 * tbl_div0;
391*25c28e83SPiotr Jasiukajtis 			res0 = tbl_sqrt0 * (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0);
392*25c28e83SPiotr Jasiukajtis 
393*25c28e83SPiotr Jasiukajtis 			fres0 = res0;
394*25c28e83SPiotr Jasiukajtis 			iexp0 += *(int*)&fres0;
395*25c28e83SPiotr Jasiukajtis 
396*25c28e83SPiotr Jasiukajtis 			*(int*)py = iexp0;
397*25c28e83SPiotr Jasiukajtis 		}
398*25c28e83SPiotr Jasiukajtis 	}
399*25c28e83SPiotr Jasiukajtis }
400*25c28e83SPiotr Jasiukajtis 
401*25c28e83SPiotr Jasiukajtis void
402*25c28e83SPiotr Jasiukajtis __vrsqrtf_n(int n, float * restrict px, int stridex, float * restrict py, int stridey)
403*25c28e83SPiotr Jasiukajtis {
404*25c28e83SPiotr Jasiukajtis 	double		A0 = ((double*)LCONST)[0];	/*  9.99999997962321453275e-01	*/
405*25c28e83SPiotr Jasiukajtis 	double		A1 = ((double*)LCONST)[1];	/* -4.99999998166077580600e-01	*/
406*25c28e83SPiotr Jasiukajtis 	double		A2 = ((double*)LCONST)[2];	/*  3.75066768969515586277e-01	*/
407*25c28e83SPiotr Jasiukajtis 	double		A3 = ((double*)LCONST)[3];	/* -3.12560092408808548438e-01	*/
408*25c28e83SPiotr Jasiukajtis 	double		res0, xx0, tbl_div0, tbl_sqrt0;
409*25c28e83SPiotr Jasiukajtis 	float		fres0;
410*25c28e83SPiotr Jasiukajtis 	int		iax0, ax0, si0, iexp0;
411*25c28e83SPiotr Jasiukajtis 
412*25c28e83SPiotr Jasiukajtis #if defined(ARCH_v7) || defined(ARCH_v8)
413*25c28e83SPiotr Jasiukajtis 	double		res1, xx1, tbl_div1, tbl_sqrt1;
414*25c28e83SPiotr Jasiukajtis 	double		res2, xx2, tbl_div2, tbl_sqrt2;
415*25c28e83SPiotr Jasiukajtis 	float		fres1, fres2;
416*25c28e83SPiotr Jasiukajtis 	int		iax1, ax1, si1, iexp1;
417*25c28e83SPiotr Jasiukajtis 	int		iax2, ax2, si2, iexp2;
418*25c28e83SPiotr Jasiukajtis 
419*25c28e83SPiotr Jasiukajtis 	for(; n > 2 ; n -= 3)
420*25c28e83SPiotr Jasiukajtis 	{
421*25c28e83SPiotr Jasiukajtis 		ax0 = *(int*)px;
422*25c28e83SPiotr Jasiukajtis 		px += stridex;
423*25c28e83SPiotr Jasiukajtis 
424*25c28e83SPiotr Jasiukajtis 		ax1 = *(int*)px;
425*25c28e83SPiotr Jasiukajtis 		px += stridex;
426*25c28e83SPiotr Jasiukajtis 
427*25c28e83SPiotr Jasiukajtis 		ax2 = *(int*)px;
428*25c28e83SPiotr Jasiukajtis 		px += stridex;
429*25c28e83SPiotr Jasiukajtis 
430*25c28e83SPiotr Jasiukajtis 		iexp0 = ax0 >> 24;
431*25c28e83SPiotr Jasiukajtis 		iexp1 = ax1 >> 24;
432*25c28e83SPiotr Jasiukajtis 		iexp2 = ax2 >> 24;
433*25c28e83SPiotr Jasiukajtis 		iexp0 = 0x3f - iexp0;
434*25c28e83SPiotr Jasiukajtis 		iexp1 = 0x3f - iexp1;
435*25c28e83SPiotr Jasiukajtis 		iexp2 = 0x3f - iexp2;
436*25c28e83SPiotr Jasiukajtis 
437*25c28e83SPiotr Jasiukajtis 		iexp0 = iexp0 << 23;
438*25c28e83SPiotr Jasiukajtis 		iexp1 = iexp1 << 23;
439*25c28e83SPiotr Jasiukajtis 		iexp2 = iexp2 << 23;
440*25c28e83SPiotr Jasiukajtis 
441*25c28e83SPiotr Jasiukajtis 		si0 = (ax0 >> 13) & 0x7f0;
442*25c28e83SPiotr Jasiukajtis 		si1 = (ax1 >> 13) & 0x7f0;
443*25c28e83SPiotr Jasiukajtis 		si2 = (ax2 >> 13) & 0x7f0;
444*25c28e83SPiotr Jasiukajtis 
445*25c28e83SPiotr Jasiukajtis 		tbl_div0 = ((double*)((char*)__TBL_rsqrtf + si0))[0];
446*25c28e83SPiotr Jasiukajtis 		tbl_div1 = ((double*)((char*)__TBL_rsqrtf + si1))[0];
447*25c28e83SPiotr Jasiukajtis 		tbl_div2 = ((double*)((char*)__TBL_rsqrtf + si2))[0];
448*25c28e83SPiotr Jasiukajtis 		tbl_sqrt0 = ((double*)((char*)__TBL_rsqrtf + si0))[1];
449*25c28e83SPiotr Jasiukajtis 		tbl_sqrt1 = ((double*)((char*)__TBL_rsqrtf + si1))[1];
450*25c28e83SPiotr Jasiukajtis 		tbl_sqrt2 = ((double*)((char*)__TBL_rsqrtf + si2))[1];
451*25c28e83SPiotr Jasiukajtis 		iax0 = ax0 & 0x7ffe0000;
452*25c28e83SPiotr Jasiukajtis 		iax1 = ax1 & 0x7ffe0000;
453*25c28e83SPiotr Jasiukajtis 		iax2 = ax2 & 0x7ffe0000;
454*25c28e83SPiotr Jasiukajtis 		iax0 = ax0 - iax0;
455*25c28e83SPiotr Jasiukajtis 		iax1 = ax1 - iax1;
456*25c28e83SPiotr Jasiukajtis 		iax2 = ax2 - iax2;
457*25c28e83SPiotr Jasiukajtis 		xx0 = iax0 * tbl_div0;
458*25c28e83SPiotr Jasiukajtis 		xx1 = iax1 * tbl_div1;
459*25c28e83SPiotr Jasiukajtis 		xx2 = iax2 * tbl_div2;
460*25c28e83SPiotr Jasiukajtis 		res0 = tbl_sqrt0 * (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0);
461*25c28e83SPiotr Jasiukajtis 		res1 = tbl_sqrt1 * (((A3 * xx1 + A2) * xx1 + A1) * xx1 + A0);
462*25c28e83SPiotr Jasiukajtis 		res2 = tbl_sqrt2 * (((A3 * xx2 + A2) * xx2 + A1) * xx2 + A0);
463*25c28e83SPiotr Jasiukajtis 
464*25c28e83SPiotr Jasiukajtis 		fres0 = res0;
465*25c28e83SPiotr Jasiukajtis 		fres1 = res1;
466*25c28e83SPiotr Jasiukajtis 		fres2 = res2;
467*25c28e83SPiotr Jasiukajtis 
468*25c28e83SPiotr Jasiukajtis 		iexp0 += *(int*)&fres0;
469*25c28e83SPiotr Jasiukajtis 		iexp1 += *(int*)&fres1;
470*25c28e83SPiotr Jasiukajtis 		iexp2 += *(int*)&fres2;
471*25c28e83SPiotr Jasiukajtis 		*(int*)py = iexp0;
472*25c28e83SPiotr Jasiukajtis 		py += stridey;
473*25c28e83SPiotr Jasiukajtis 		*(int*)py = iexp1;
474*25c28e83SPiotr Jasiukajtis 		py += stridey;
475*25c28e83SPiotr Jasiukajtis 		*(int*)py = iexp2;
476*25c28e83SPiotr Jasiukajtis 		py += stridey;
477*25c28e83SPiotr Jasiukajtis 	}
478*25c28e83SPiotr Jasiukajtis #endif
479*25c28e83SPiotr Jasiukajtis 	for(; n > 0 ; n--)
480*25c28e83SPiotr Jasiukajtis 	{
481*25c28e83SPiotr Jasiukajtis 		ax0 = *(int*)px;
482*25c28e83SPiotr Jasiukajtis 		px += stridex;
483*25c28e83SPiotr Jasiukajtis 
484*25c28e83SPiotr Jasiukajtis 		iexp0 = ax0 >> 24;
485*25c28e83SPiotr Jasiukajtis 		iexp0 = 0x3f - iexp0;
486*25c28e83SPiotr Jasiukajtis 		iexp0 = iexp0 << 23;
487*25c28e83SPiotr Jasiukajtis 
488*25c28e83SPiotr Jasiukajtis 		si0 = (ax0 >> 13) & 0x7f0;
489*25c28e83SPiotr Jasiukajtis 
490*25c28e83SPiotr Jasiukajtis 		tbl_div0 = ((double*)((char*)__TBL_rsqrtf + si0))[0];
491*25c28e83SPiotr Jasiukajtis 		tbl_sqrt0 = ((double*)((char*)__TBL_rsqrtf + si0))[1];
492*25c28e83SPiotr Jasiukajtis 		iax0 = ax0 & 0x7ffe0000;
493*25c28e83SPiotr Jasiukajtis 		iax0 = ax0 - iax0;
494*25c28e83SPiotr Jasiukajtis 		xx0 = iax0 * tbl_div0;
495*25c28e83SPiotr Jasiukajtis 		res0 = tbl_sqrt0 * (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0);
496*25c28e83SPiotr Jasiukajtis 
497*25c28e83SPiotr Jasiukajtis 		fres0 = res0;
498*25c28e83SPiotr Jasiukajtis 		iexp0 += *(int*)&fres0;
499*25c28e83SPiotr Jasiukajtis 		*(int*)py = iexp0;
500*25c28e83SPiotr Jasiukajtis 		py += stridey;
501*25c28e83SPiotr Jasiukajtis 	}
502*25c28e83SPiotr Jasiukajtis }
503