1*25c28e83SPiotr Jasiukajtis /* 2*25c28e83SPiotr Jasiukajtis * CDDL HEADER START 3*25c28e83SPiotr Jasiukajtis * 4*25c28e83SPiotr Jasiukajtis * The contents of this file are subject to the terms of the 5*25c28e83SPiotr Jasiukajtis * Common Development and Distribution License (the "License"). 6*25c28e83SPiotr Jasiukajtis * You may not use this file except in compliance with the License. 7*25c28e83SPiotr Jasiukajtis * 8*25c28e83SPiotr Jasiukajtis * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9*25c28e83SPiotr Jasiukajtis * or http://www.opensolaris.org/os/licensing. 10*25c28e83SPiotr Jasiukajtis * See the License for the specific language governing permissions 11*25c28e83SPiotr Jasiukajtis * and limitations under the License. 12*25c28e83SPiotr Jasiukajtis * 13*25c28e83SPiotr Jasiukajtis * When distributing Covered Code, include this CDDL HEADER in each 14*25c28e83SPiotr Jasiukajtis * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15*25c28e83SPiotr Jasiukajtis * If applicable, add the following below this CDDL HEADER, with the 16*25c28e83SPiotr Jasiukajtis * fields enclosed by brackets "[]" replaced with your own identifying 17*25c28e83SPiotr Jasiukajtis * information: Portions Copyright [yyyy] [name of copyright owner] 18*25c28e83SPiotr Jasiukajtis * 19*25c28e83SPiotr Jasiukajtis * CDDL HEADER END 20*25c28e83SPiotr Jasiukajtis */ 21*25c28e83SPiotr Jasiukajtis 22*25c28e83SPiotr Jasiukajtis /* 23*25c28e83SPiotr Jasiukajtis * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 24*25c28e83SPiotr Jasiukajtis */ 25*25c28e83SPiotr Jasiukajtis /* 26*25c28e83SPiotr Jasiukajtis * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 27*25c28e83SPiotr Jasiukajtis * Use is subject to license terms. 28*25c28e83SPiotr Jasiukajtis */ 29*25c28e83SPiotr Jasiukajtis 30*25c28e83SPiotr Jasiukajtis #include "libm_inlines.h" 31*25c28e83SPiotr Jasiukajtis 32*25c28e83SPiotr Jasiukajtis #ifdef __RESTRICT 33*25c28e83SPiotr Jasiukajtis #define restrict _Restrict 34*25c28e83SPiotr Jasiukajtis #else 35*25c28e83SPiotr Jasiukajtis #define restrict 36*25c28e83SPiotr Jasiukajtis #endif 37*25c28e83SPiotr Jasiukajtis 38*25c28e83SPiotr Jasiukajtis /* float rsqrtf(float x) 39*25c28e83SPiotr Jasiukajtis * 40*25c28e83SPiotr Jasiukajtis * Method : 41*25c28e83SPiotr Jasiukajtis * 1. Special cases: 42*25c28e83SPiotr Jasiukajtis * for x = NaN => QNaN; 43*25c28e83SPiotr Jasiukajtis * for x = +Inf => 0; 44*25c28e83SPiotr Jasiukajtis * for x is negative, -Inf => QNaN + invalid; 45*25c28e83SPiotr Jasiukajtis * for x = +0 => +Inf + divide-by-zero; 46*25c28e83SPiotr Jasiukajtis * for x = -0 => -Inf + divide-by-zero. 47*25c28e83SPiotr Jasiukajtis * 2. Computes reciprocal square root from: 48*25c28e83SPiotr Jasiukajtis * x = m * 2**n 49*25c28e83SPiotr Jasiukajtis * Where: 50*25c28e83SPiotr Jasiukajtis * m = [0.5, 2), 51*25c28e83SPiotr Jasiukajtis * n = ((exponent + 1) & ~1). 52*25c28e83SPiotr Jasiukajtis * Then: 53*25c28e83SPiotr Jasiukajtis * rsqrtf(x) = 1/sqrt( m * 2**n ) = (2 ** (-n/2)) * (1/sqrt(m)) 54*25c28e83SPiotr Jasiukajtis * 2. Computes 1/sqrt(m) from: 55*25c28e83SPiotr Jasiukajtis * 1/sqrt(m) = (1/sqrt(m0)) * (1/sqrt(1 + (1/m0)*dm)) 56*25c28e83SPiotr Jasiukajtis * Where: 57*25c28e83SPiotr Jasiukajtis * m = m0 + dm, 58*25c28e83SPiotr Jasiukajtis * m0 = 0.5 * (1 + k/64) for m = [0.5, 0.5+127/256), k = [0, 63]; 59*25c28e83SPiotr Jasiukajtis * m0 = 1.0 * (0 + k/64) for m = [0.5+127/256, 1.0+127/128), k = [64, 127]; 60*25c28e83SPiotr Jasiukajtis * Then: 61*25c28e83SPiotr Jasiukajtis * 1/sqrt(m0), 1/m0 are looked up in a table, 62*25c28e83SPiotr Jasiukajtis * 1/sqrt(1 + (1/m0)*dm) is computed using approximation: 63*25c28e83SPiotr Jasiukajtis * 1/sqrt(1 + z) = ((a3 * z + a2) * z + a1) * z + a0 64*25c28e83SPiotr Jasiukajtis * where z = [-1/64, 1/64]. 65*25c28e83SPiotr Jasiukajtis * 66*25c28e83SPiotr Jasiukajtis * Accuracy: 67*25c28e83SPiotr Jasiukajtis * The maximum relative error for the approximating 68*25c28e83SPiotr Jasiukajtis * polynomial is 2**(-27.87). 69*25c28e83SPiotr Jasiukajtis * Maximum error observed: less than 0.534 ulp for the 70*25c28e83SPiotr Jasiukajtis * whole float type range. 71*25c28e83SPiotr Jasiukajtis */ 72*25c28e83SPiotr Jasiukajtis 73*25c28e83SPiotr Jasiukajtis extern float sqrtf(float); 74*25c28e83SPiotr Jasiukajtis 75*25c28e83SPiotr Jasiukajtis static const double __TBL_rsqrtf[] = { 76*25c28e83SPiotr Jasiukajtis /* 77*25c28e83SPiotr Jasiukajtis i = [0,63] 78*25c28e83SPiotr Jasiukajtis TBL[2*i ] = 1 / (*(double*)&(0x3fe0000000000000ULL + (i << 46))) * 2**-24; 79*25c28e83SPiotr Jasiukajtis TBL[2*i+1] = 1 / sqrtl(*(double*)&(0x3fe0000000000000ULL + (i << 46))); 80*25c28e83SPiotr Jasiukajtis i = [64,127] 81*25c28e83SPiotr Jasiukajtis TBL[2*i ] = 1 / (*(double*)&(0x3fe0000000000000ULL + (i << 46))) * 2**-23; 82*25c28e83SPiotr Jasiukajtis TBL[2*i+1] = 1 / sqrtl(*(double*)&(0x3fe0000000000000ULL + (i << 46))); 83*25c28e83SPiotr Jasiukajtis */ 84*25c28e83SPiotr Jasiukajtis 1.1920928955078125000e-07, 1.4142135623730951455e+00, 85*25c28e83SPiotr Jasiukajtis 1.1737530048076923728e-07, 1.4032928308912466786e+00, 86*25c28e83SPiotr Jasiukajtis 1.1559688683712121533e-07, 1.3926212476455828160e+00, 87*25c28e83SPiotr Jasiukajtis 1.1387156016791044559e-07, 1.3821894809301762397e+00, 88*25c28e83SPiotr Jasiukajtis 1.1219697840073529256e-07, 1.3719886811400707760e+00, 89*25c28e83SPiotr Jasiukajtis 1.1057093523550724772e-07, 1.3620104492139977204e+00, 90*25c28e83SPiotr Jasiukajtis 1.0899135044642856803e-07, 1.3522468075656264297e+00, 91*25c28e83SPiotr Jasiukajtis 1.0745626100352112918e-07, 1.3426901732747025253e+00, 92*25c28e83SPiotr Jasiukajtis 1.0596381293402777190e-07, 1.3333333333333332593e+00, 93*25c28e83SPiotr Jasiukajtis 1.0451225385273972023e-07, 1.3241694217637887121e+00, 94*25c28e83SPiotr Jasiukajtis 1.0309992609797297870e-07, 1.3151918984428583315e+00, 95*25c28e83SPiotr Jasiukajtis 1.0172526041666667320e-07, 1.3063945294843617440e+00, 96*25c28e83SPiotr Jasiukajtis 1.0038677014802631022e-07, 1.2977713690461003537e+00, 97*25c28e83SPiotr Jasiukajtis 9.9083045860389616921e-08, 1.2893167424406084542e+00, 98*25c28e83SPiotr Jasiukajtis 9.7812750400641022247e-08, 1.2810252304406970492e+00, 99*25c28e83SPiotr Jasiukajtis 9.6574614319620251657e-08, 1.2728916546811681609e+00, 100*25c28e83SPiotr Jasiukajtis 9.5367431640625005294e-08, 1.2649110640673517647e+00, 101*25c28e83SPiotr Jasiukajtis 9.4190055941358019463e-08, 1.2570787221094177344e+00, 102*25c28e83SPiotr Jasiukajtis 9.3041396722560978838e-08, 1.2493900951088485751e+00, 103*25c28e83SPiotr Jasiukajtis 9.1920416039156631290e-08, 1.2418408411301324890e+00, 104*25c28e83SPiotr Jasiukajtis 9.0826125372023804482e-08, 1.2344267996967352996e+00, 105*25c28e83SPiotr Jasiukajtis 8.9757582720588234048e-08, 1.2271439821557927896e+00, 106*25c28e83SPiotr Jasiukajtis 8.8713889898255812722e-08, 1.2199885626608373279e+00, 107*25c28e83SPiotr Jasiukajtis 8.7694190014367814875e-08, 1.2129568697262453902e+00, 108*25c28e83SPiotr Jasiukajtis 8.6697665127840911497e-08, 1.2060453783110545167e+00, 109*25c28e83SPiotr Jasiukajtis 8.5723534058988761666e-08, 1.1992507023933782762e+00, 110*25c28e83SPiotr Jasiukajtis 8.4771050347222225457e-08, 1.1925695879998878812e+00, 111*25c28e83SPiotr Jasiukajtis 8.3839500343406599951e-08, 1.1859989066577618644e+00, 112*25c28e83SPiotr Jasiukajtis 8.2928201426630432481e-08, 1.1795356492391770864e+00, 113*25c28e83SPiotr Jasiukajtis 8.2036500336021511923e-08, 1.1731769201708264205e+00, 114*25c28e83SPiotr Jasiukajtis 8.1163771609042551220e-08, 1.1669199319831564665e+00, 115*25c28e83SPiotr Jasiukajtis 8.0309416118421050820e-08, 1.1607620001760186046e+00, 116*25c28e83SPiotr Jasiukajtis 7.9472859700520828922e-08, 1.1547005383792514621e+00, 117*25c28e83SPiotr Jasiukajtis 7.8653551868556699530e-08, 1.1487330537883810866e+00, 118*25c28e83SPiotr Jasiukajtis 7.7850964604591830522e-08, 1.1428571428571427937e+00, 119*25c28e83SPiotr Jasiukajtis 7.7064591224747481298e-08, 1.1370704872299222110e+00, 120*25c28e83SPiotr Jasiukajtis 7.6293945312500001588e-08, 1.1313708498984760276e+00, 121*25c28e83SPiotr Jasiukajtis 7.5538559715346535571e-08, 1.1257560715684669095e+00, 122*25c28e83SPiotr Jasiukajtis 7.4797985600490195040e-08, 1.1202240672224077489e+00, 123*25c28e83SPiotr Jasiukajtis 7.4071791565533974158e-08, 1.1147728228665882977e+00, 124*25c28e83SPiotr Jasiukajtis 7.3359562800480773303e-08, 1.1094003924504582947e+00, 125*25c28e83SPiotr Jasiukajtis 7.2660900297619054173e-08, 1.1041048949477667573e+00, 126*25c28e83SPiotr Jasiukajtis 7.1975420106132072725e-08, 1.0988845115895122806e+00, 127*25c28e83SPiotr Jasiukajtis 7.1302752628504667579e-08, 1.0937374832394612945e+00, 128*25c28e83SPiotr Jasiukajtis 7.0642541956018514597e-08, 1.0886621079036347126e+00, 129*25c28e83SPiotr Jasiukajtis 6.9994445240825691959e-08, 1.0836567383657542685e+00, 130*25c28e83SPiotr Jasiukajtis 6.9358132102272723904e-08, 1.0787197799411873955e+00, 131*25c28e83SPiotr Jasiukajtis 6.8733284065315314719e-08, 1.0738496883424388795e+00, 132*25c28e83SPiotr Jasiukajtis 6.8119594029017853361e-08, 1.0690449676496975862e+00, 133*25c28e83SPiotr Jasiukajtis 6.7516765763274335346e-08, 1.0643041683803828867e+00, 134*25c28e83SPiotr Jasiukajtis 6.6924513432017540145e-08, 1.0596258856520350822e+00, 135*25c28e83SPiotr Jasiukajtis 6.6342561141304348632e-08, 1.0550087574332591700e+00, 136*25c28e83SPiotr Jasiukajtis 6.5770642510775861156e-08, 1.0504514628777803509e+00, 137*25c28e83SPiotr Jasiukajtis 6.5208500267094023655e-08, 1.0459527207369814228e+00, 138*25c28e83SPiotr Jasiukajtis 6.4655885858050847233e-08, 1.0415112878465908608e+00, 139*25c28e83SPiotr Jasiukajtis 6.4112559086134451001e-08, 1.0371259576834630511e+00, 140*25c28e83SPiotr Jasiukajtis 6.3578287760416665784e-08, 1.0327955589886446131e+00, 141*25c28e83SPiotr Jasiukajtis 6.3052847365702481089e-08, 1.0285189544531601058e+00, 142*25c28e83SPiotr Jasiukajtis 6.2536020747950822927e-08, 1.0242950394631678002e+00, 143*25c28e83SPiotr Jasiukajtis 6.2027597815040656970e-08, 1.0201227409013413627e+00, 144*25c28e83SPiotr Jasiukajtis 6.1527375252016127325e-08, 1.0160010160015240377e+00, 145*25c28e83SPiotr Jasiukajtis 6.1035156250000001271e-08, 1.0119288512538813229e+00, 146*25c28e83SPiotr Jasiukajtis 6.0550750248015869655e-08, 1.0079052613579393416e+00, 147*25c28e83SPiotr Jasiukajtis 6.0073972687007873182e-08, 1.0039292882210537616e+00, 148*25c28e83SPiotr Jasiukajtis 1.1920928955078125000e-07, 1.0000000000000000000e+00, 149*25c28e83SPiotr Jasiukajtis 1.1737530048076923728e-07, 9.9227787671366762812e-01, 150*25c28e83SPiotr Jasiukajtis 1.1559688683712121533e-07, 9.8473192783466190203e-01, 151*25c28e83SPiotr Jasiukajtis 1.1387156016791044559e-07, 9.7735555485044178781e-01, 152*25c28e83SPiotr Jasiukajtis 1.1219697840073529256e-07, 9.7014250014533187638e-01, 153*25c28e83SPiotr Jasiukajtis 1.1057093523550724772e-07, 9.6308682468615358641e-01, 154*25c28e83SPiotr Jasiukajtis 1.0899135044642856803e-07, 9.5618288746751489704e-01, 155*25c28e83SPiotr Jasiukajtis 1.0745626100352112918e-07, 9.4942532655508271588e-01, 156*25c28e83SPiotr Jasiukajtis 1.0596381293402777190e-07, 9.4280904158206335630e-01, 157*25c28e83SPiotr Jasiukajtis 1.0451225385273972023e-07, 9.3632917756904454620e-01, 158*25c28e83SPiotr Jasiukajtis 1.0309992609797297870e-07, 9.2998110995055427441e-01, 159*25c28e83SPiotr Jasiukajtis 1.0172526041666667320e-07, 9.2376043070340119190e-01, 160*25c28e83SPiotr Jasiukajtis 1.0038677014802631022e-07, 9.1766293548224708854e-01, 161*25c28e83SPiotr Jasiukajtis 9.9083045860389616921e-08, 9.1168461167710357351e-01, 162*25c28e83SPiotr Jasiukajtis 9.7812750400641022247e-08, 9.0582162731567661407e-01, 163*25c28e83SPiotr Jasiukajtis 9.6574614319620251657e-08, 9.0007032074081916306e-01, 164*25c28e83SPiotr Jasiukajtis 9.5367431640625005294e-08, 8.9442719099991585541e-01, 165*25c28e83SPiotr Jasiukajtis 9.4190055941358019463e-08, 8.8888888888888883955e-01, 166*25c28e83SPiotr Jasiukajtis 9.3041396722560978838e-08, 8.8345220859877238162e-01, 167*25c28e83SPiotr Jasiukajtis 9.1920416039156631290e-08, 8.7811407991752277180e-01, 168*25c28e83SPiotr Jasiukajtis 9.0826125372023804482e-08, 8.7287156094396955996e-01, 169*25c28e83SPiotr Jasiukajtis 8.9757582720588234048e-08, 8.6772183127462465535e-01, 170*25c28e83SPiotr Jasiukajtis 8.8713889898255812722e-08, 8.6266218562750729415e-01, 171*25c28e83SPiotr Jasiukajtis 8.7694190014367814875e-08, 8.5769002787023584933e-01, 172*25c28e83SPiotr Jasiukajtis 8.6697665127840911497e-08, 8.5280286542244176928e-01, 173*25c28e83SPiotr Jasiukajtis 8.5723534058988761666e-08, 8.4799830400508802164e-01, 174*25c28e83SPiotr Jasiukajtis 8.4771050347222225457e-08, 8.4327404271156780613e-01, 175*25c28e83SPiotr Jasiukajtis 8.3839500343406599951e-08, 8.3862786937753464045e-01, 176*25c28e83SPiotr Jasiukajtis 8.2928201426630432481e-08, 8.3405765622829908246e-01, 177*25c28e83SPiotr Jasiukajtis 8.2036500336021511923e-08, 8.2956135578434020417e-01, 178*25c28e83SPiotr Jasiukajtis 8.1163771609042551220e-08, 8.2513699700703468931e-01, 179*25c28e83SPiotr Jasiukajtis 8.0309416118421050820e-08, 8.2078268166812329287e-01, 180*25c28e83SPiotr Jasiukajtis 7.9472859700520828922e-08, 8.1649658092772603446e-01, 181*25c28e83SPiotr Jasiukajtis 7.8653551868556699530e-08, 8.1227693210689522196e-01, 182*25c28e83SPiotr Jasiukajtis 7.7850964604591830522e-08, 8.0812203564176865456e-01, 183*25c28e83SPiotr Jasiukajtis 7.7064591224747481298e-08, 8.0403025220736967782e-01, 184*25c28e83SPiotr Jasiukajtis 7.6293945312500001588e-08, 8.0000000000000004441e-01, 185*25c28e83SPiotr Jasiukajtis 7.5538559715346535571e-08, 7.9602975216799132241e-01, 186*25c28e83SPiotr Jasiukajtis 7.4797985600490195040e-08, 7.9211803438133943089e-01, 187*25c28e83SPiotr Jasiukajtis 7.4071791565533974158e-08, 7.8826342253143455441e-01, 188*25c28e83SPiotr Jasiukajtis 7.3359562800480773303e-08, 7.8446454055273617811e-01, 189*25c28e83SPiotr Jasiukajtis 7.2660900297619054173e-08, 7.8072005835882651859e-01, 190*25c28e83SPiotr Jasiukajtis 7.1975420106132072725e-08, 7.7702868988581130782e-01, 191*25c28e83SPiotr Jasiukajtis 7.1302752628504667579e-08, 7.7338919123653082632e-01, 192*25c28e83SPiotr Jasiukajtis 7.0642541956018514597e-08, 7.6980035891950104876e-01, 193*25c28e83SPiotr Jasiukajtis 6.9994445240825691959e-08, 7.6626102817692109959e-01, 194*25c28e83SPiotr Jasiukajtis 6.9358132102272723904e-08, 7.6277007139647390321e-01, 195*25c28e83SPiotr Jasiukajtis 6.8733284065315314719e-08, 7.5932639660199918730e-01, 196*25c28e83SPiotr Jasiukajtis 6.8119594029017853361e-08, 7.5592894601845450619e-01, 197*25c28e83SPiotr Jasiukajtis 6.7516765763274335346e-08, 7.5257669470687782454e-01, 198*25c28e83SPiotr Jasiukajtis 6.6924513432017540145e-08, 7.4926864926535519107e-01, 199*25c28e83SPiotr Jasiukajtis 6.6342561141304348632e-08, 7.4600384659225105199e-01, 200*25c28e83SPiotr Jasiukajtis 6.5770642510775861156e-08, 7.4278135270820744296e-01, 201*25c28e83SPiotr Jasiukajtis 6.5208500267094023655e-08, 7.3960026163363878915e-01, 202*25c28e83SPiotr Jasiukajtis 6.4655885858050847233e-08, 7.3645969431865865307e-01, 203*25c28e83SPiotr Jasiukajtis 6.4112559086134451001e-08, 7.3335879762256905856e-01, 204*25c28e83SPiotr Jasiukajtis 6.3578287760416665784e-08, 7.3029674334022143256e-01, 205*25c28e83SPiotr Jasiukajtis 6.3052847365702481089e-08, 7.2727272727272729291e-01, 206*25c28e83SPiotr Jasiukajtis 6.2536020747950822927e-08, 7.2428596834014824513e-01, 207*25c28e83SPiotr Jasiukajtis 6.2027597815040656970e-08, 7.2133570773394584119e-01, 208*25c28e83SPiotr Jasiukajtis 6.1527375252016127325e-08, 7.1842120810709964029e-01, 209*25c28e83SPiotr Jasiukajtis 6.1035156250000001271e-08, 7.1554175279993270653e-01, 210*25c28e83SPiotr Jasiukajtis 6.0550750248015869655e-08, 7.1269664509979835376e-01, 211*25c28e83SPiotr Jasiukajtis 6.0073972687007873182e-08, 7.0988520753289097165e-01, 212*25c28e83SPiotr Jasiukajtis }; 213*25c28e83SPiotr Jasiukajtis 214*25c28e83SPiotr Jasiukajtis static const unsigned long long LCONST[] = { 215*25c28e83SPiotr Jasiukajtis 0x3feffffffee7f18fULL, /* A0 = 9.99999997962321453275e-01 */ 216*25c28e83SPiotr Jasiukajtis 0xbfdffffffe07e52fULL, /* A1 =-4.99999998166077580600e-01 */ 217*25c28e83SPiotr Jasiukajtis 0x3fd801180ca296d9ULL, /* A2 = 3.75066768969515586277e-01 */ 218*25c28e83SPiotr Jasiukajtis 0xbfd400fc0bbb8e78ULL, /* A3 =-3.12560092408808548438e-01 */ 219*25c28e83SPiotr Jasiukajtis }; 220*25c28e83SPiotr Jasiukajtis 221*25c28e83SPiotr Jasiukajtis static void 222*25c28e83SPiotr Jasiukajtis __vrsqrtf_n(int n, float * restrict px, int stridex, float * restrict py, int stridey); 223*25c28e83SPiotr Jasiukajtis 224*25c28e83SPiotr Jasiukajtis #pragma no_inline(__vrsqrtf_n) 225*25c28e83SPiotr Jasiukajtis 226*25c28e83SPiotr Jasiukajtis #define RETURN(ret) \ 227*25c28e83SPiotr Jasiukajtis { \ 228*25c28e83SPiotr Jasiukajtis *py = (ret); \ 229*25c28e83SPiotr Jasiukajtis py += stridey; \ 230*25c28e83SPiotr Jasiukajtis if (n_n == 0) \ 231*25c28e83SPiotr Jasiukajtis { \ 232*25c28e83SPiotr Jasiukajtis spx = px; spy = py; \ 233*25c28e83SPiotr Jasiukajtis ax0 = *(int*)px; \ 234*25c28e83SPiotr Jasiukajtis continue; \ 235*25c28e83SPiotr Jasiukajtis } \ 236*25c28e83SPiotr Jasiukajtis n--; \ 237*25c28e83SPiotr Jasiukajtis break; \ 238*25c28e83SPiotr Jasiukajtis } 239*25c28e83SPiotr Jasiukajtis 240*25c28e83SPiotr Jasiukajtis void 241*25c28e83SPiotr Jasiukajtis __vrsqrtf(int n, float * restrict px, int stridex, float * restrict py, int stridey) 242*25c28e83SPiotr Jasiukajtis { 243*25c28e83SPiotr Jasiukajtis float *spx, *spy; 244*25c28e83SPiotr Jasiukajtis int ax0, n_n; 245*25c28e83SPiotr Jasiukajtis float res; 246*25c28e83SPiotr Jasiukajtis float FONE = 1.0f, FTWO = 2.0f; 247*25c28e83SPiotr Jasiukajtis 248*25c28e83SPiotr Jasiukajtis while (n > 1) 249*25c28e83SPiotr Jasiukajtis { 250*25c28e83SPiotr Jasiukajtis n_n = 0; 251*25c28e83SPiotr Jasiukajtis spx = px; 252*25c28e83SPiotr Jasiukajtis spy = py; 253*25c28e83SPiotr Jasiukajtis ax0 = *(int*)px; 254*25c28e83SPiotr Jasiukajtis for (; n > 1 ; n--) 255*25c28e83SPiotr Jasiukajtis { 256*25c28e83SPiotr Jasiukajtis px += stridex; 257*25c28e83SPiotr Jasiukajtis if (ax0 >= 0x7f800000) /* X = NaN or Inf */ 258*25c28e83SPiotr Jasiukajtis { 259*25c28e83SPiotr Jasiukajtis res = *(px - stridex); 260*25c28e83SPiotr Jasiukajtis RETURN (FONE / res) 261*25c28e83SPiotr Jasiukajtis } 262*25c28e83SPiotr Jasiukajtis 263*25c28e83SPiotr Jasiukajtis py += stridey; 264*25c28e83SPiotr Jasiukajtis 265*25c28e83SPiotr Jasiukajtis if (ax0 < 0x00800000) /* X = denormal, zero or negative */ 266*25c28e83SPiotr Jasiukajtis { 267*25c28e83SPiotr Jasiukajtis py -= stridey; 268*25c28e83SPiotr Jasiukajtis res = *(px - stridex); 269*25c28e83SPiotr Jasiukajtis 270*25c28e83SPiotr Jasiukajtis if ((ax0 & 0x7fffffff) == 0) /* |X| = zero */ 271*25c28e83SPiotr Jasiukajtis { 272*25c28e83SPiotr Jasiukajtis RETURN (FONE / res) 273*25c28e83SPiotr Jasiukajtis } 274*25c28e83SPiotr Jasiukajtis else if (ax0 >= 0) /* X = denormal */ 275*25c28e83SPiotr Jasiukajtis { 276*25c28e83SPiotr Jasiukajtis double A0 = ((double*)LCONST)[0]; /* 9.99999997962321453275e-01 */ 277*25c28e83SPiotr Jasiukajtis double A1 = ((double*)LCONST)[1]; /* -4.99999998166077580600e-01 */ 278*25c28e83SPiotr Jasiukajtis double A2 = ((double*)LCONST)[2]; /* 3.75066768969515586277e-01 */ 279*25c28e83SPiotr Jasiukajtis double A3 = ((double*)LCONST)[3]; /* -3.12560092408808548438e-01 */ 280*25c28e83SPiotr Jasiukajtis 281*25c28e83SPiotr Jasiukajtis double res0, xx0, tbl_div0, tbl_sqrt0; 282*25c28e83SPiotr Jasiukajtis float fres0; 283*25c28e83SPiotr Jasiukajtis int iax0, si0, iexp0; 284*25c28e83SPiotr Jasiukajtis 285*25c28e83SPiotr Jasiukajtis res = *(int*)&res; 286*25c28e83SPiotr Jasiukajtis res *= FTWO; 287*25c28e83SPiotr Jasiukajtis ax0 = *(int*)&res; 288*25c28e83SPiotr Jasiukajtis iexp0 = ax0 >> 24; 289*25c28e83SPiotr Jasiukajtis iexp0 = 0x3f + 0x4b - iexp0; 290*25c28e83SPiotr Jasiukajtis iexp0 = iexp0 << 23; 291*25c28e83SPiotr Jasiukajtis 292*25c28e83SPiotr Jasiukajtis si0 = (ax0 >> 13) & 0x7f0; 293*25c28e83SPiotr Jasiukajtis 294*25c28e83SPiotr Jasiukajtis tbl_div0 = ((double*)((char*)__TBL_rsqrtf + si0))[0]; 295*25c28e83SPiotr Jasiukajtis tbl_sqrt0 = ((double*)((char*)__TBL_rsqrtf + si0))[1]; 296*25c28e83SPiotr Jasiukajtis iax0 = ax0 & 0x7ffe0000; 297*25c28e83SPiotr Jasiukajtis iax0 = ax0 - iax0; 298*25c28e83SPiotr Jasiukajtis xx0 = iax0 * tbl_div0; 299*25c28e83SPiotr Jasiukajtis res0 = tbl_sqrt0 * (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0); 300*25c28e83SPiotr Jasiukajtis 301*25c28e83SPiotr Jasiukajtis fres0 = res0; 302*25c28e83SPiotr Jasiukajtis iexp0 += *(int*)&fres0; 303*25c28e83SPiotr Jasiukajtis RETURN(*(float*)&iexp0) 304*25c28e83SPiotr Jasiukajtis } 305*25c28e83SPiotr Jasiukajtis else /* X = negative */ 306*25c28e83SPiotr Jasiukajtis { 307*25c28e83SPiotr Jasiukajtis RETURN (sqrtf(res)) 308*25c28e83SPiotr Jasiukajtis } 309*25c28e83SPiotr Jasiukajtis } 310*25c28e83SPiotr Jasiukajtis n_n++; 311*25c28e83SPiotr Jasiukajtis ax0 = *(int*)px; 312*25c28e83SPiotr Jasiukajtis } 313*25c28e83SPiotr Jasiukajtis if (n_n > 0) 314*25c28e83SPiotr Jasiukajtis __vrsqrtf_n(n_n, spx, stridex, spy, stridey); 315*25c28e83SPiotr Jasiukajtis } 316*25c28e83SPiotr Jasiukajtis 317*25c28e83SPiotr Jasiukajtis if (n > 0) 318*25c28e83SPiotr Jasiukajtis { 319*25c28e83SPiotr Jasiukajtis ax0 = *(int*)px; 320*25c28e83SPiotr Jasiukajtis 321*25c28e83SPiotr Jasiukajtis if (ax0 >= 0x7f800000) /* X = NaN or Inf */ 322*25c28e83SPiotr Jasiukajtis { 323*25c28e83SPiotr Jasiukajtis res = *px; 324*25c28e83SPiotr Jasiukajtis *py = FONE / res; 325*25c28e83SPiotr Jasiukajtis } 326*25c28e83SPiotr Jasiukajtis else if (ax0 < 0x00800000) /* X = denormal, zero or negative */ 327*25c28e83SPiotr Jasiukajtis { 328*25c28e83SPiotr Jasiukajtis res = *px; 329*25c28e83SPiotr Jasiukajtis 330*25c28e83SPiotr Jasiukajtis if ((ax0 & 0x7fffffff) == 0) /* |X| = zero */ 331*25c28e83SPiotr Jasiukajtis { 332*25c28e83SPiotr Jasiukajtis *py = FONE / res; 333*25c28e83SPiotr Jasiukajtis } 334*25c28e83SPiotr Jasiukajtis else if (ax0 >= 0) /* X = denormal */ 335*25c28e83SPiotr Jasiukajtis { 336*25c28e83SPiotr Jasiukajtis double A0 = ((double*)LCONST)[0]; /* 9.99999997962321453275e-01 */ 337*25c28e83SPiotr Jasiukajtis double A1 = ((double*)LCONST)[1]; /* -4.99999998166077580600e-01 */ 338*25c28e83SPiotr Jasiukajtis double A2 = ((double*)LCONST)[2]; /* 3.75066768969515586277e-01 */ 339*25c28e83SPiotr Jasiukajtis double A3 = ((double*)LCONST)[3]; /* -3.12560092408808548438e-01 */ 340*25c28e83SPiotr Jasiukajtis double res0, xx0, tbl_div0, tbl_sqrt0; 341*25c28e83SPiotr Jasiukajtis float fres0; 342*25c28e83SPiotr Jasiukajtis int iax0, si0, iexp0; 343*25c28e83SPiotr Jasiukajtis 344*25c28e83SPiotr Jasiukajtis res = *(int*)&res; 345*25c28e83SPiotr Jasiukajtis res *= FTWO; 346*25c28e83SPiotr Jasiukajtis ax0 = *(int*)&res; 347*25c28e83SPiotr Jasiukajtis iexp0 = ax0 >> 24; 348*25c28e83SPiotr Jasiukajtis iexp0 = 0x3f + 0x4b - iexp0; 349*25c28e83SPiotr Jasiukajtis iexp0 = iexp0 << 23; 350*25c28e83SPiotr Jasiukajtis 351*25c28e83SPiotr Jasiukajtis si0 = (ax0 >> 13) & 0x7f0; 352*25c28e83SPiotr Jasiukajtis 353*25c28e83SPiotr Jasiukajtis tbl_div0 = ((double*)((char*)__TBL_rsqrtf + si0))[0]; 354*25c28e83SPiotr Jasiukajtis tbl_sqrt0 = ((double*)((char*)__TBL_rsqrtf + si0))[1]; 355*25c28e83SPiotr Jasiukajtis iax0 = ax0 & 0x7ffe0000; 356*25c28e83SPiotr Jasiukajtis iax0 = ax0 - iax0; 357*25c28e83SPiotr Jasiukajtis xx0 = iax0 * tbl_div0; 358*25c28e83SPiotr Jasiukajtis res0 = tbl_sqrt0 * (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0); 359*25c28e83SPiotr Jasiukajtis 360*25c28e83SPiotr Jasiukajtis fres0 = res0; 361*25c28e83SPiotr Jasiukajtis iexp0 += *(int*)&fres0; 362*25c28e83SPiotr Jasiukajtis 363*25c28e83SPiotr Jasiukajtis *(int*)py = iexp0; 364*25c28e83SPiotr Jasiukajtis } 365*25c28e83SPiotr Jasiukajtis else /* X = negative */ 366*25c28e83SPiotr Jasiukajtis { 367*25c28e83SPiotr Jasiukajtis *py = sqrtf(res); 368*25c28e83SPiotr Jasiukajtis } 369*25c28e83SPiotr Jasiukajtis } 370*25c28e83SPiotr Jasiukajtis else 371*25c28e83SPiotr Jasiukajtis { 372*25c28e83SPiotr Jasiukajtis double A0 = ((double*)LCONST)[0]; /* 9.99999997962321453275e-01 */ 373*25c28e83SPiotr Jasiukajtis double A1 = ((double*)LCONST)[1]; /* -4.99999998166077580600e-01 */ 374*25c28e83SPiotr Jasiukajtis double A2 = ((double*)LCONST)[2]; /* 3.75066768969515586277e-01 */ 375*25c28e83SPiotr Jasiukajtis double A3 = ((double*)LCONST)[3]; /* -3.12560092408808548438e-01 */ 376*25c28e83SPiotr Jasiukajtis double res0, xx0, tbl_div0, tbl_sqrt0; 377*25c28e83SPiotr Jasiukajtis float fres0; 378*25c28e83SPiotr Jasiukajtis int iax0, si0, iexp0; 379*25c28e83SPiotr Jasiukajtis 380*25c28e83SPiotr Jasiukajtis iexp0 = ax0 >> 24; 381*25c28e83SPiotr Jasiukajtis iexp0 = 0x3f - iexp0; 382*25c28e83SPiotr Jasiukajtis iexp0 = iexp0 << 23; 383*25c28e83SPiotr Jasiukajtis 384*25c28e83SPiotr Jasiukajtis si0 = (ax0 >> 13) & 0x7f0; 385*25c28e83SPiotr Jasiukajtis 386*25c28e83SPiotr Jasiukajtis tbl_div0 = ((double*)((char*)__TBL_rsqrtf + si0))[0]; 387*25c28e83SPiotr Jasiukajtis tbl_sqrt0 = ((double*)((char*)__TBL_rsqrtf + si0))[1]; 388*25c28e83SPiotr Jasiukajtis iax0 = ax0 & 0x7ffe0000; 389*25c28e83SPiotr Jasiukajtis iax0 = ax0 - iax0; 390*25c28e83SPiotr Jasiukajtis xx0 = iax0 * tbl_div0; 391*25c28e83SPiotr Jasiukajtis res0 = tbl_sqrt0 * (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0); 392*25c28e83SPiotr Jasiukajtis 393*25c28e83SPiotr Jasiukajtis fres0 = res0; 394*25c28e83SPiotr Jasiukajtis iexp0 += *(int*)&fres0; 395*25c28e83SPiotr Jasiukajtis 396*25c28e83SPiotr Jasiukajtis *(int*)py = iexp0; 397*25c28e83SPiotr Jasiukajtis } 398*25c28e83SPiotr Jasiukajtis } 399*25c28e83SPiotr Jasiukajtis } 400*25c28e83SPiotr Jasiukajtis 401*25c28e83SPiotr Jasiukajtis void 402*25c28e83SPiotr Jasiukajtis __vrsqrtf_n(int n, float * restrict px, int stridex, float * restrict py, int stridey) 403*25c28e83SPiotr Jasiukajtis { 404*25c28e83SPiotr Jasiukajtis double A0 = ((double*)LCONST)[0]; /* 9.99999997962321453275e-01 */ 405*25c28e83SPiotr Jasiukajtis double A1 = ((double*)LCONST)[1]; /* -4.99999998166077580600e-01 */ 406*25c28e83SPiotr Jasiukajtis double A2 = ((double*)LCONST)[2]; /* 3.75066768969515586277e-01 */ 407*25c28e83SPiotr Jasiukajtis double A3 = ((double*)LCONST)[3]; /* -3.12560092408808548438e-01 */ 408*25c28e83SPiotr Jasiukajtis double res0, xx0, tbl_div0, tbl_sqrt0; 409*25c28e83SPiotr Jasiukajtis float fres0; 410*25c28e83SPiotr Jasiukajtis int iax0, ax0, si0, iexp0; 411*25c28e83SPiotr Jasiukajtis 412*25c28e83SPiotr Jasiukajtis #if defined(ARCH_v7) || defined(ARCH_v8) 413*25c28e83SPiotr Jasiukajtis double res1, xx1, tbl_div1, tbl_sqrt1; 414*25c28e83SPiotr Jasiukajtis double res2, xx2, tbl_div2, tbl_sqrt2; 415*25c28e83SPiotr Jasiukajtis float fres1, fres2; 416*25c28e83SPiotr Jasiukajtis int iax1, ax1, si1, iexp1; 417*25c28e83SPiotr Jasiukajtis int iax2, ax2, si2, iexp2; 418*25c28e83SPiotr Jasiukajtis 419*25c28e83SPiotr Jasiukajtis for(; n > 2 ; n -= 3) 420*25c28e83SPiotr Jasiukajtis { 421*25c28e83SPiotr Jasiukajtis ax0 = *(int*)px; 422*25c28e83SPiotr Jasiukajtis px += stridex; 423*25c28e83SPiotr Jasiukajtis 424*25c28e83SPiotr Jasiukajtis ax1 = *(int*)px; 425*25c28e83SPiotr Jasiukajtis px += stridex; 426*25c28e83SPiotr Jasiukajtis 427*25c28e83SPiotr Jasiukajtis ax2 = *(int*)px; 428*25c28e83SPiotr Jasiukajtis px += stridex; 429*25c28e83SPiotr Jasiukajtis 430*25c28e83SPiotr Jasiukajtis iexp0 = ax0 >> 24; 431*25c28e83SPiotr Jasiukajtis iexp1 = ax1 >> 24; 432*25c28e83SPiotr Jasiukajtis iexp2 = ax2 >> 24; 433*25c28e83SPiotr Jasiukajtis iexp0 = 0x3f - iexp0; 434*25c28e83SPiotr Jasiukajtis iexp1 = 0x3f - iexp1; 435*25c28e83SPiotr Jasiukajtis iexp2 = 0x3f - iexp2; 436*25c28e83SPiotr Jasiukajtis 437*25c28e83SPiotr Jasiukajtis iexp0 = iexp0 << 23; 438*25c28e83SPiotr Jasiukajtis iexp1 = iexp1 << 23; 439*25c28e83SPiotr Jasiukajtis iexp2 = iexp2 << 23; 440*25c28e83SPiotr Jasiukajtis 441*25c28e83SPiotr Jasiukajtis si0 = (ax0 >> 13) & 0x7f0; 442*25c28e83SPiotr Jasiukajtis si1 = (ax1 >> 13) & 0x7f0; 443*25c28e83SPiotr Jasiukajtis si2 = (ax2 >> 13) & 0x7f0; 444*25c28e83SPiotr Jasiukajtis 445*25c28e83SPiotr Jasiukajtis tbl_div0 = ((double*)((char*)__TBL_rsqrtf + si0))[0]; 446*25c28e83SPiotr Jasiukajtis tbl_div1 = ((double*)((char*)__TBL_rsqrtf + si1))[0]; 447*25c28e83SPiotr Jasiukajtis tbl_div2 = ((double*)((char*)__TBL_rsqrtf + si2))[0]; 448*25c28e83SPiotr Jasiukajtis tbl_sqrt0 = ((double*)((char*)__TBL_rsqrtf + si0))[1]; 449*25c28e83SPiotr Jasiukajtis tbl_sqrt1 = ((double*)((char*)__TBL_rsqrtf + si1))[1]; 450*25c28e83SPiotr Jasiukajtis tbl_sqrt2 = ((double*)((char*)__TBL_rsqrtf + si2))[1]; 451*25c28e83SPiotr Jasiukajtis iax0 = ax0 & 0x7ffe0000; 452*25c28e83SPiotr Jasiukajtis iax1 = ax1 & 0x7ffe0000; 453*25c28e83SPiotr Jasiukajtis iax2 = ax2 & 0x7ffe0000; 454*25c28e83SPiotr Jasiukajtis iax0 = ax0 - iax0; 455*25c28e83SPiotr Jasiukajtis iax1 = ax1 - iax1; 456*25c28e83SPiotr Jasiukajtis iax2 = ax2 - iax2; 457*25c28e83SPiotr Jasiukajtis xx0 = iax0 * tbl_div0; 458*25c28e83SPiotr Jasiukajtis xx1 = iax1 * tbl_div1; 459*25c28e83SPiotr Jasiukajtis xx2 = iax2 * tbl_div2; 460*25c28e83SPiotr Jasiukajtis res0 = tbl_sqrt0 * (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0); 461*25c28e83SPiotr Jasiukajtis res1 = tbl_sqrt1 * (((A3 * xx1 + A2) * xx1 + A1) * xx1 + A0); 462*25c28e83SPiotr Jasiukajtis res2 = tbl_sqrt2 * (((A3 * xx2 + A2) * xx2 + A1) * xx2 + A0); 463*25c28e83SPiotr Jasiukajtis 464*25c28e83SPiotr Jasiukajtis fres0 = res0; 465*25c28e83SPiotr Jasiukajtis fres1 = res1; 466*25c28e83SPiotr Jasiukajtis fres2 = res2; 467*25c28e83SPiotr Jasiukajtis 468*25c28e83SPiotr Jasiukajtis iexp0 += *(int*)&fres0; 469*25c28e83SPiotr Jasiukajtis iexp1 += *(int*)&fres1; 470*25c28e83SPiotr Jasiukajtis iexp2 += *(int*)&fres2; 471*25c28e83SPiotr Jasiukajtis *(int*)py = iexp0; 472*25c28e83SPiotr Jasiukajtis py += stridey; 473*25c28e83SPiotr Jasiukajtis *(int*)py = iexp1; 474*25c28e83SPiotr Jasiukajtis py += stridey; 475*25c28e83SPiotr Jasiukajtis *(int*)py = iexp2; 476*25c28e83SPiotr Jasiukajtis py += stridey; 477*25c28e83SPiotr Jasiukajtis } 478*25c28e83SPiotr Jasiukajtis #endif 479*25c28e83SPiotr Jasiukajtis for(; n > 0 ; n--) 480*25c28e83SPiotr Jasiukajtis { 481*25c28e83SPiotr Jasiukajtis ax0 = *(int*)px; 482*25c28e83SPiotr Jasiukajtis px += stridex; 483*25c28e83SPiotr Jasiukajtis 484*25c28e83SPiotr Jasiukajtis iexp0 = ax0 >> 24; 485*25c28e83SPiotr Jasiukajtis iexp0 = 0x3f - iexp0; 486*25c28e83SPiotr Jasiukajtis iexp0 = iexp0 << 23; 487*25c28e83SPiotr Jasiukajtis 488*25c28e83SPiotr Jasiukajtis si0 = (ax0 >> 13) & 0x7f0; 489*25c28e83SPiotr Jasiukajtis 490*25c28e83SPiotr Jasiukajtis tbl_div0 = ((double*)((char*)__TBL_rsqrtf + si0))[0]; 491*25c28e83SPiotr Jasiukajtis tbl_sqrt0 = ((double*)((char*)__TBL_rsqrtf + si0))[1]; 492*25c28e83SPiotr Jasiukajtis iax0 = ax0 & 0x7ffe0000; 493*25c28e83SPiotr Jasiukajtis iax0 = ax0 - iax0; 494*25c28e83SPiotr Jasiukajtis xx0 = iax0 * tbl_div0; 495*25c28e83SPiotr Jasiukajtis res0 = tbl_sqrt0 * (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0); 496*25c28e83SPiotr Jasiukajtis 497*25c28e83SPiotr Jasiukajtis fres0 = res0; 498*25c28e83SPiotr Jasiukajtis iexp0 += *(int*)&fres0; 499*25c28e83SPiotr Jasiukajtis *(int*)py = iexp0; 500*25c28e83SPiotr Jasiukajtis py += stridey; 501*25c28e83SPiotr Jasiukajtis } 502*25c28e83SPiotr Jasiukajtis } 503