1*25c28e83SPiotr Jasiukajtis /* 2*25c28e83SPiotr Jasiukajtis * CDDL HEADER START 3*25c28e83SPiotr Jasiukajtis * 4*25c28e83SPiotr Jasiukajtis * The contents of this file are subject to the terms of the 5*25c28e83SPiotr Jasiukajtis * Common Development and Distribution License (the "License"). 6*25c28e83SPiotr Jasiukajtis * You may not use this file except in compliance with the License. 7*25c28e83SPiotr Jasiukajtis * 8*25c28e83SPiotr Jasiukajtis * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9*25c28e83SPiotr Jasiukajtis * or http://www.opensolaris.org/os/licensing. 10*25c28e83SPiotr Jasiukajtis * See the License for the specific language governing permissions 11*25c28e83SPiotr Jasiukajtis * and limitations under the License. 12*25c28e83SPiotr Jasiukajtis * 13*25c28e83SPiotr Jasiukajtis * When distributing Covered Code, include this CDDL HEADER in each 14*25c28e83SPiotr Jasiukajtis * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15*25c28e83SPiotr Jasiukajtis * If applicable, add the following below this CDDL HEADER, with the 16*25c28e83SPiotr Jasiukajtis * fields enclosed by brackets "[]" replaced with your own identifying 17*25c28e83SPiotr Jasiukajtis * information: Portions Copyright [yyyy] [name of copyright owner] 18*25c28e83SPiotr Jasiukajtis * 19*25c28e83SPiotr Jasiukajtis * CDDL HEADER END 20*25c28e83SPiotr Jasiukajtis */ 21*25c28e83SPiotr Jasiukajtis 22*25c28e83SPiotr Jasiukajtis /* 23*25c28e83SPiotr Jasiukajtis * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 24*25c28e83SPiotr Jasiukajtis */ 25*25c28e83SPiotr Jasiukajtis /* 26*25c28e83SPiotr Jasiukajtis * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 27*25c28e83SPiotr Jasiukajtis * Use is subject to license terms. 28*25c28e83SPiotr Jasiukajtis */ 29*25c28e83SPiotr Jasiukajtis 30*25c28e83SPiotr Jasiukajtis #include <sys/isa_defs.h> 31*25c28e83SPiotr Jasiukajtis #include "libm_inlines.h" 32*25c28e83SPiotr Jasiukajtis 33*25c28e83SPiotr Jasiukajtis #ifdef _LITTLE_ENDIAN 34*25c28e83SPiotr Jasiukajtis #define HI(x) *(1+(int*)x) 35*25c28e83SPiotr Jasiukajtis #define LO(x) *(unsigned*)x 36*25c28e83SPiotr Jasiukajtis #else 37*25c28e83SPiotr Jasiukajtis #define HI(x) *(int*)x 38*25c28e83SPiotr Jasiukajtis #define LO(x) *(1+(unsigned*)x) 39*25c28e83SPiotr Jasiukajtis #endif 40*25c28e83SPiotr Jasiukajtis 41*25c28e83SPiotr Jasiukajtis #ifdef __RESTRICT 42*25c28e83SPiotr Jasiukajtis #define restrict _Restrict 43*25c28e83SPiotr Jasiukajtis #else 44*25c28e83SPiotr Jasiukajtis #define restrict 45*25c28e83SPiotr Jasiukajtis #endif 46*25c28e83SPiotr Jasiukajtis 47*25c28e83SPiotr Jasiukajtis /* float rhypotf(float x, float y) 48*25c28e83SPiotr Jasiukajtis * 49*25c28e83SPiotr Jasiukajtis * Method : 50*25c28e83SPiotr Jasiukajtis * 1. Special cases: 51*25c28e83SPiotr Jasiukajtis * for x or y = Inf => 0; 52*25c28e83SPiotr Jasiukajtis * for x or y = NaN => QNaN; 53*25c28e83SPiotr Jasiukajtis * for x and y = 0 => +Inf + divide-by-zero; 54*25c28e83SPiotr Jasiukajtis * 2. Computes d = x * x + y * y; 55*25c28e83SPiotr Jasiukajtis * 3. Computes reciprocal square root from: 56*25c28e83SPiotr Jasiukajtis * d = m * 2**n 57*25c28e83SPiotr Jasiukajtis * Where: 58*25c28e83SPiotr Jasiukajtis * m = [0.5, 2), 59*25c28e83SPiotr Jasiukajtis * n = ((exponent + 1) & ~1). 60*25c28e83SPiotr Jasiukajtis * Then: 61*25c28e83SPiotr Jasiukajtis * rsqrtf(d) = 1/sqrt( m * 2**n ) = (2 ** (-n/2)) * (1/sqrt(m)) 62*25c28e83SPiotr Jasiukajtis * 4. Computes 1/sqrt(m) from: 63*25c28e83SPiotr Jasiukajtis * 1/sqrt(m) = (1/sqrt(m0)) * (1/sqrt(1 + (1/m0)*dm)) 64*25c28e83SPiotr Jasiukajtis * Where: 65*25c28e83SPiotr Jasiukajtis * m = m0 + dm, 66*25c28e83SPiotr Jasiukajtis * m0 = 0.5 * (1 + k/64) for m = [0.5, 0.5+127/256), k = [0, 63]; 67*25c28e83SPiotr Jasiukajtis * m0 = 1.0 * (0 + k/64) for m = [0.5+127/256, 1.0+127/128), k = [64, 127]; 68*25c28e83SPiotr Jasiukajtis * Then: 69*25c28e83SPiotr Jasiukajtis * 1/sqrt(m0), 1/m0 are looked up in a table, 70*25c28e83SPiotr Jasiukajtis * 1/sqrt(1 + (1/m0)*dm) is computed using approximation: 71*25c28e83SPiotr Jasiukajtis * 1/sqrt(1 + z) = ((a3 * z + a2) * z + a1) * z + a0 72*25c28e83SPiotr Jasiukajtis * where z = [-1/64, 1/64]. 73*25c28e83SPiotr Jasiukajtis * 74*25c28e83SPiotr Jasiukajtis * Accuracy: 75*25c28e83SPiotr Jasiukajtis * The maximum relative error for the approximating 76*25c28e83SPiotr Jasiukajtis * polynomial is 2**(-27.87). 77*25c28e83SPiotr Jasiukajtis * Maximum error observed: less than 0.535 ulp after 3.000.000.000 78*25c28e83SPiotr Jasiukajtis * results. 79*25c28e83SPiotr Jasiukajtis */ 80*25c28e83SPiotr Jasiukajtis 81*25c28e83SPiotr Jasiukajtis #pragma align 32 (__vlibm_TBL_rhypotf) 82*25c28e83SPiotr Jasiukajtis 83*25c28e83SPiotr Jasiukajtis static const double __vlibm_TBL_rhypotf[] = { 84*25c28e83SPiotr Jasiukajtis /* 85*25c28e83SPiotr Jasiukajtis i = [0,63] 86*25c28e83SPiotr Jasiukajtis TBL[2*i+0] = 1.0 / (*(double*)&(0x3ff0000000000000LL + (i << 46))); 87*25c28e83SPiotr Jasiukajtis TBL[2*i+1] = (double)(0.5/sqrtl(2) / sqrtl(*(double*)&(0x3ff0000000000000LL + (i << 46)))); 88*25c28e83SPiotr Jasiukajtis TBL[128+2*i+0] = 1.0 / (*(double*)&(0x3ff0000000000000LL + (i << 46))); 89*25c28e83SPiotr Jasiukajtis TBL[128+2*i+1] = (double)(0.25 / sqrtl(*(double*)&(0x3ff0000000000000LL + (i << 46)))); 90*25c28e83SPiotr Jasiukajtis */ 91*25c28e83SPiotr Jasiukajtis 1.0000000000000000000e+00, 3.5355339059327378637e-01, 92*25c28e83SPiotr Jasiukajtis 9.8461538461538467004e-01, 3.5082320772281166965e-01, 93*25c28e83SPiotr Jasiukajtis 9.6969696969696972388e-01, 3.4815531191139570399e-01, 94*25c28e83SPiotr Jasiukajtis 9.5522388059701490715e-01, 3.4554737023254405992e-01, 95*25c28e83SPiotr Jasiukajtis 9.4117647058823528106e-01, 3.4299717028501769400e-01, 96*25c28e83SPiotr Jasiukajtis 9.2753623188405798228e-01, 3.4050261230349943009e-01, 97*25c28e83SPiotr Jasiukajtis 9.1428571428571425717e-01, 3.3806170189140660742e-01, 98*25c28e83SPiotr Jasiukajtis 9.0140845070422537244e-01, 3.3567254331867563133e-01, 99*25c28e83SPiotr Jasiukajtis 8.8888888888888883955e-01, 3.3333333333333331483e-01, 100*25c28e83SPiotr Jasiukajtis 8.7671232876712323900e-01, 3.3104235544094717802e-01, 101*25c28e83SPiotr Jasiukajtis 8.6486486486486491287e-01, 3.2879797461071458287e-01, 102*25c28e83SPiotr Jasiukajtis 8.5333333333333338810e-01, 3.2659863237109043599e-01, 103*25c28e83SPiotr Jasiukajtis 8.4210526315789469010e-01, 3.2444284226152508843e-01, 104*25c28e83SPiotr Jasiukajtis 8.3116883116883122362e-01, 3.2232918561015211356e-01, 105*25c28e83SPiotr Jasiukajtis 8.2051282051282048435e-01, 3.2025630761017426229e-01, 106*25c28e83SPiotr Jasiukajtis 8.1012658227848100001e-01, 3.1822291367029204023e-01, 107*25c28e83SPiotr Jasiukajtis 8.0000000000000004441e-01, 3.1622776601683794118e-01, 108*25c28e83SPiotr Jasiukajtis 7.9012345679012341293e-01, 3.1426968052735443360e-01, 109*25c28e83SPiotr Jasiukajtis 7.8048780487804880757e-01, 3.1234752377721214378e-01, 110*25c28e83SPiotr Jasiukajtis 7.7108433734939763049e-01, 3.1046021028253312224e-01, 111*25c28e83SPiotr Jasiukajtis 7.6190476190476186247e-01, 3.0860669992418382490e-01, 112*25c28e83SPiotr Jasiukajtis 7.5294117647058822484e-01, 3.0678599553894819740e-01, 113*25c28e83SPiotr Jasiukajtis 7.4418604651162789665e-01, 3.0499714066520933198e-01, 114*25c28e83SPiotr Jasiukajtis 7.3563218390804596680e-01, 3.0323921743156134756e-01, 115*25c28e83SPiotr Jasiukajtis 7.2727272727272729291e-01, 3.0151134457776362918e-01, 116*25c28e83SPiotr Jasiukajtis 7.1910112359550559802e-01, 2.9981267559834456904e-01, 117*25c28e83SPiotr Jasiukajtis 7.1111111111111113825e-01, 2.9814239699997197031e-01, 118*25c28e83SPiotr Jasiukajtis 7.0329670329670335160e-01, 2.9649972666444046610e-01, 119*25c28e83SPiotr Jasiukajtis 6.9565217391304345895e-01, 2.9488391230979427160e-01, 120*25c28e83SPiotr Jasiukajtis 6.8817204301075274309e-01, 2.9329423004270660513e-01, 121*25c28e83SPiotr Jasiukajtis 6.8085106382978721751e-01, 2.9172998299578911663e-01, 122*25c28e83SPiotr Jasiukajtis 6.7368421052631577428e-01, 2.9019050004400465115e-01, 123*25c28e83SPiotr Jasiukajtis 6.6666666666666662966e-01, 2.8867513459481286553e-01, 124*25c28e83SPiotr Jasiukajtis 6.5979381443298967813e-01, 2.8718326344709527165e-01, 125*25c28e83SPiotr Jasiukajtis 6.5306122448979586625e-01, 2.8571428571428569843e-01, 126*25c28e83SPiotr Jasiukajtis 6.4646464646464651960e-01, 2.8426762180748055275e-01, 127*25c28e83SPiotr Jasiukajtis 6.4000000000000001332e-01, 2.8284271247461900689e-01, 128*25c28e83SPiotr Jasiukajtis 6.3366336633663367106e-01, 2.8143901789211672737e-01, 129*25c28e83SPiotr Jasiukajtis 6.2745098039215685404e-01, 2.8005601680560193723e-01, 130*25c28e83SPiotr Jasiukajtis 6.2135922330097081989e-01, 2.7869320571664707442e-01, 131*25c28e83SPiotr Jasiukajtis 6.1538461538461541878e-01, 2.7735009811261457369e-01, 132*25c28e83SPiotr Jasiukajtis 6.0952380952380957879e-01, 2.7602622373694168934e-01, 133*25c28e83SPiotr Jasiukajtis 6.0377358490566035432e-01, 2.7472112789737807015e-01, 134*25c28e83SPiotr Jasiukajtis 5.9813084112149528249e-01, 2.7343437080986532361e-01, 135*25c28e83SPiotr Jasiukajtis 5.9259259259259255970e-01, 2.7216552697590867815e-01, 136*25c28e83SPiotr Jasiukajtis 5.8715596330275232617e-01, 2.7091418459143856712e-01, 137*25c28e83SPiotr Jasiukajtis 5.8181818181818178992e-01, 2.6967994498529684888e-01, 138*25c28e83SPiotr Jasiukajtis 5.7657657657657657158e-01, 2.6846242208560971987e-01, 139*25c28e83SPiotr Jasiukajtis 5.7142857142857139685e-01, 2.6726124191242439654e-01, 140*25c28e83SPiotr Jasiukajtis 5.6637168141592919568e-01, 2.6607604209509572168e-01, 141*25c28e83SPiotr Jasiukajtis 5.6140350877192979340e-01, 2.6490647141300877054e-01, 142*25c28e83SPiotr Jasiukajtis 5.5652173913043478937e-01, 2.6375218935831479250e-01, 143*25c28e83SPiotr Jasiukajtis 5.5172413793103447510e-01, 2.6261286571944508772e-01, 144*25c28e83SPiotr Jasiukajtis 5.4700854700854706358e-01, 2.6148818018424535570e-01, 145*25c28e83SPiotr Jasiukajtis 5.4237288135593220151e-01, 2.6037782196164771520e-01, 146*25c28e83SPiotr Jasiukajtis 5.3781512605042014474e-01, 2.5928148942086576278e-01, 147*25c28e83SPiotr Jasiukajtis 5.3333333333333332593e-01, 2.5819888974716115326e-01, 148*25c28e83SPiotr Jasiukajtis 5.2892561983471075848e-01, 2.5712973861329002645e-01, 149*25c28e83SPiotr Jasiukajtis 5.2459016393442625681e-01, 2.5607375986579195004e-01, 150*25c28e83SPiotr Jasiukajtis 5.2032520325203257539e-01, 2.5503068522533534068e-01, 151*25c28e83SPiotr Jasiukajtis 5.1612903225806450180e-01, 2.5400025400038100942e-01, 152*25c28e83SPiotr Jasiukajtis 5.1200000000000001066e-01, 2.5298221281347033074e-01, 153*25c28e83SPiotr Jasiukajtis 5.0793650793650790831e-01, 2.5197631533948483540e-01, 154*25c28e83SPiotr Jasiukajtis 5.0393700787401574104e-01, 2.5098232205526344041e-01, 155*25c28e83SPiotr Jasiukajtis 1.0000000000000000000e+00, 2.5000000000000000000e-01, 156*25c28e83SPiotr Jasiukajtis 9.8461538461538467004e-01, 2.4806946917841690703e-01, 157*25c28e83SPiotr Jasiukajtis 9.6969696969696972388e-01, 2.4618298195866547551e-01, 158*25c28e83SPiotr Jasiukajtis 9.5522388059701490715e-01, 2.4433888871261044695e-01, 159*25c28e83SPiotr Jasiukajtis 9.4117647058823528106e-01, 2.4253562503633296910e-01, 160*25c28e83SPiotr Jasiukajtis 9.2753623188405798228e-01, 2.4077170617153839660e-01, 161*25c28e83SPiotr Jasiukajtis 9.1428571428571425717e-01, 2.3904572186687872426e-01, 162*25c28e83SPiotr Jasiukajtis 9.0140845070422537244e-01, 2.3735633163877067897e-01, 163*25c28e83SPiotr Jasiukajtis 8.8888888888888883955e-01, 2.3570226039551583908e-01, 164*25c28e83SPiotr Jasiukajtis 8.7671232876712323900e-01, 2.3408229439226113655e-01, 165*25c28e83SPiotr Jasiukajtis 8.6486486486486491287e-01, 2.3249527748763856860e-01, 166*25c28e83SPiotr Jasiukajtis 8.5333333333333338810e-01, 2.3094010767585029797e-01, 167*25c28e83SPiotr Jasiukajtis 8.4210526315789469010e-01, 2.2941573387056177213e-01, 168*25c28e83SPiotr Jasiukajtis 8.3116883116883122362e-01, 2.2792115291927589338e-01, 169*25c28e83SPiotr Jasiukajtis 8.2051282051282048435e-01, 2.2645540682891915352e-01, 170*25c28e83SPiotr Jasiukajtis 8.1012658227848100001e-01, 2.2501758018520479077e-01, 171*25c28e83SPiotr Jasiukajtis 8.0000000000000004441e-01, 2.2360679774997896385e-01, 172*25c28e83SPiotr Jasiukajtis 7.9012345679012341293e-01, 2.2222222222222220989e-01, 173*25c28e83SPiotr Jasiukajtis 7.8048780487804880757e-01, 2.2086305214969309541e-01, 174*25c28e83SPiotr Jasiukajtis 7.7108433734939763049e-01, 2.1952851997938069295e-01, 175*25c28e83SPiotr Jasiukajtis 7.6190476190476186247e-01, 2.1821789023599238999e-01, 176*25c28e83SPiotr Jasiukajtis 7.5294117647058822484e-01, 2.1693045781865616384e-01, 177*25c28e83SPiotr Jasiukajtis 7.4418604651162789665e-01, 2.1566554640687682354e-01, 178*25c28e83SPiotr Jasiukajtis 7.3563218390804596680e-01, 2.1442250696755896233e-01, 179*25c28e83SPiotr Jasiukajtis 7.2727272727272729291e-01, 2.1320071635561044232e-01, 180*25c28e83SPiotr Jasiukajtis 7.1910112359550559802e-01, 2.1199957600127200541e-01, 181*25c28e83SPiotr Jasiukajtis 7.1111111111111113825e-01, 2.1081851067789195153e-01, 182*25c28e83SPiotr Jasiukajtis 7.0329670329670335160e-01, 2.0965696734438366011e-01, 183*25c28e83SPiotr Jasiukajtis 6.9565217391304345895e-01, 2.0851441405707477061e-01, 184*25c28e83SPiotr Jasiukajtis 6.8817204301075274309e-01, 2.0739033894608505104e-01, 185*25c28e83SPiotr Jasiukajtis 6.8085106382978721751e-01, 2.0628424925175867233e-01, 186*25c28e83SPiotr Jasiukajtis 6.7368421052631577428e-01, 2.0519567041703082322e-01, 187*25c28e83SPiotr Jasiukajtis 6.6666666666666662966e-01, 2.0412414523193150862e-01, 188*25c28e83SPiotr Jasiukajtis 6.5979381443298967813e-01, 2.0306923302672380549e-01, 189*25c28e83SPiotr Jasiukajtis 6.5306122448979586625e-01, 2.0203050891044216364e-01, 190*25c28e83SPiotr Jasiukajtis 6.4646464646464651960e-01, 2.0100756305184241945e-01, 191*25c28e83SPiotr Jasiukajtis 6.4000000000000001332e-01, 2.0000000000000001110e-01, 192*25c28e83SPiotr Jasiukajtis 6.3366336633663367106e-01, 1.9900743804199783060e-01, 193*25c28e83SPiotr Jasiukajtis 6.2745098039215685404e-01, 1.9802950859533485772e-01, 194*25c28e83SPiotr Jasiukajtis 6.2135922330097081989e-01, 1.9706585563285863860e-01, 195*25c28e83SPiotr Jasiukajtis 6.1538461538461541878e-01, 1.9611613513818404453e-01, 196*25c28e83SPiotr Jasiukajtis 6.0952380952380957879e-01, 1.9518001458970662965e-01, 197*25c28e83SPiotr Jasiukajtis 6.0377358490566035432e-01, 1.9425717247145282696e-01, 198*25c28e83SPiotr Jasiukajtis 5.9813084112149528249e-01, 1.9334729780913270658e-01, 199*25c28e83SPiotr Jasiukajtis 5.9259259259259255970e-01, 1.9245008972987526219e-01, 200*25c28e83SPiotr Jasiukajtis 5.8715596330275232617e-01, 1.9156525704423027490e-01, 201*25c28e83SPiotr Jasiukajtis 5.8181818181818178992e-01, 1.9069251784911847580e-01, 202*25c28e83SPiotr Jasiukajtis 5.7657657657657657158e-01, 1.8983159915049979682e-01, 203*25c28e83SPiotr Jasiukajtis 5.7142857142857139685e-01, 1.8898223650461362655e-01, 204*25c28e83SPiotr Jasiukajtis 5.6637168141592919568e-01, 1.8814417367671945613e-01, 205*25c28e83SPiotr Jasiukajtis 5.6140350877192979340e-01, 1.8731716231633879777e-01, 206*25c28e83SPiotr Jasiukajtis 5.5652173913043478937e-01, 1.8650096164806276300e-01, 207*25c28e83SPiotr Jasiukajtis 5.5172413793103447510e-01, 1.8569533817705186074e-01, 208*25c28e83SPiotr Jasiukajtis 5.4700854700854706358e-01, 1.8490006540840969729e-01, 209*25c28e83SPiotr Jasiukajtis 5.4237288135593220151e-01, 1.8411492357966466327e-01, 210*25c28e83SPiotr Jasiukajtis 5.3781512605042014474e-01, 1.8333969940564226464e-01, 211*25c28e83SPiotr Jasiukajtis 5.3333333333333332593e-01, 1.8257418583505535814e-01, 212*25c28e83SPiotr Jasiukajtis 5.2892561983471075848e-01, 1.8181818181818182323e-01, 213*25c28e83SPiotr Jasiukajtis 5.2459016393442625681e-01, 1.8107149208503706128e-01, 214*25c28e83SPiotr Jasiukajtis 5.2032520325203257539e-01, 1.8033392693348646030e-01, 215*25c28e83SPiotr Jasiukajtis 5.1612903225806450180e-01, 1.7960530202677491007e-01, 216*25c28e83SPiotr Jasiukajtis 5.1200000000000001066e-01, 1.7888543819998317663e-01, 217*25c28e83SPiotr Jasiukajtis 5.0793650793650790831e-01, 1.7817416127494958844e-01, 218*25c28e83SPiotr Jasiukajtis 5.0393700787401574104e-01, 1.7747130188322274291e-01, 219*25c28e83SPiotr Jasiukajtis }; 220*25c28e83SPiotr Jasiukajtis 221*25c28e83SPiotr Jasiukajtis extern float fabsf(float); 222*25c28e83SPiotr Jasiukajtis 223*25c28e83SPiotr Jasiukajtis static const double 224*25c28e83SPiotr Jasiukajtis A0 = 9.99999997962321453275e-01, 225*25c28e83SPiotr Jasiukajtis A1 =-4.99999998166077580600e-01, 226*25c28e83SPiotr Jasiukajtis A2 = 3.75066768969515586277e-01, 227*25c28e83SPiotr Jasiukajtis A3 =-3.12560092408808548438e-01; 228*25c28e83SPiotr Jasiukajtis 229*25c28e83SPiotr Jasiukajtis static void 230*25c28e83SPiotr Jasiukajtis __vrhypotf_n(int n, float * restrict px, int stridex, float * restrict py, 231*25c28e83SPiotr Jasiukajtis int stridey, float * restrict pz, int stridez); 232*25c28e83SPiotr Jasiukajtis 233*25c28e83SPiotr Jasiukajtis #pragma no_inline(__vrhypotf_n) 234*25c28e83SPiotr Jasiukajtis 235*25c28e83SPiotr Jasiukajtis #define RETURN(ret) \ 236*25c28e83SPiotr Jasiukajtis { \ 237*25c28e83SPiotr Jasiukajtis *pz = (ret); \ 238*25c28e83SPiotr Jasiukajtis pz += stridez; \ 239*25c28e83SPiotr Jasiukajtis if (n_n == 0) \ 240*25c28e83SPiotr Jasiukajtis { \ 241*25c28e83SPiotr Jasiukajtis spx = px; spy = py; spz = pz; \ 242*25c28e83SPiotr Jasiukajtis ay0 = *(int*)py; \ 243*25c28e83SPiotr Jasiukajtis continue; \ 244*25c28e83SPiotr Jasiukajtis } \ 245*25c28e83SPiotr Jasiukajtis n--; \ 246*25c28e83SPiotr Jasiukajtis break; \ 247*25c28e83SPiotr Jasiukajtis } 248*25c28e83SPiotr Jasiukajtis 249*25c28e83SPiotr Jasiukajtis 250*25c28e83SPiotr Jasiukajtis void 251*25c28e83SPiotr Jasiukajtis __vrhypotf(int n, float * restrict px, int stridex, float * restrict py, 252*25c28e83SPiotr Jasiukajtis int stridey, float * restrict pz, int stridez) 253*25c28e83SPiotr Jasiukajtis { 254*25c28e83SPiotr Jasiukajtis float *spx, *spy, *spz; 255*25c28e83SPiotr Jasiukajtis int ax0, ay0, n_n; 256*25c28e83SPiotr Jasiukajtis float res, x0, y0; 257*25c28e83SPiotr Jasiukajtis 258*25c28e83SPiotr Jasiukajtis while (n > 1) 259*25c28e83SPiotr Jasiukajtis { 260*25c28e83SPiotr Jasiukajtis n_n = 0; 261*25c28e83SPiotr Jasiukajtis spx = px; 262*25c28e83SPiotr Jasiukajtis spy = py; 263*25c28e83SPiotr Jasiukajtis spz = pz; 264*25c28e83SPiotr Jasiukajtis ax0 = *(int*)px; 265*25c28e83SPiotr Jasiukajtis ay0 = *(int*)py; 266*25c28e83SPiotr Jasiukajtis for (; n > 1 ; n--) 267*25c28e83SPiotr Jasiukajtis { 268*25c28e83SPiotr Jasiukajtis ax0 &= 0x7fffffff; 269*25c28e83SPiotr Jasiukajtis ay0 &= 0x7fffffff; 270*25c28e83SPiotr Jasiukajtis 271*25c28e83SPiotr Jasiukajtis px += stridex; 272*25c28e83SPiotr Jasiukajtis 273*25c28e83SPiotr Jasiukajtis if (ax0 >= 0x7f800000 || ay0 >= 0x7f800000) /* X or Y = NaN or Inf */ 274*25c28e83SPiotr Jasiukajtis { 275*25c28e83SPiotr Jasiukajtis x0 = *(px - stridex); 276*25c28e83SPiotr Jasiukajtis y0 = *py; 277*25c28e83SPiotr Jasiukajtis res = fabsf(x0) + fabsf(y0); 278*25c28e83SPiotr Jasiukajtis if (ax0 == 0x7f800000) res = 0.0f; 279*25c28e83SPiotr Jasiukajtis else if (ay0 == 0x7f800000) res = 0.0f; 280*25c28e83SPiotr Jasiukajtis ax0 = *(int*)px; 281*25c28e83SPiotr Jasiukajtis py += stridey; 282*25c28e83SPiotr Jasiukajtis RETURN (res) 283*25c28e83SPiotr Jasiukajtis } 284*25c28e83SPiotr Jasiukajtis ax0 = *(int*)px; 285*25c28e83SPiotr Jasiukajtis py += stridey; 286*25c28e83SPiotr Jasiukajtis if (ay0 == 0) /* Y = 0 */ 287*25c28e83SPiotr Jasiukajtis { 288*25c28e83SPiotr Jasiukajtis int tx = *(int*)(px - stridex) & 0x7fffffff; 289*25c28e83SPiotr Jasiukajtis if (tx == 0) /* X = 0 */ 290*25c28e83SPiotr Jasiukajtis { 291*25c28e83SPiotr Jasiukajtis RETURN (1.0f / 0.0f) 292*25c28e83SPiotr Jasiukajtis } 293*25c28e83SPiotr Jasiukajtis } 294*25c28e83SPiotr Jasiukajtis pz += stridez; 295*25c28e83SPiotr Jasiukajtis n_n++; 296*25c28e83SPiotr Jasiukajtis ay0 = *(int*)py; 297*25c28e83SPiotr Jasiukajtis } 298*25c28e83SPiotr Jasiukajtis if (n_n > 0) 299*25c28e83SPiotr Jasiukajtis __vrhypotf_n(n_n, spx, stridex, spy, stridey, spz, stridez); 300*25c28e83SPiotr Jasiukajtis } 301*25c28e83SPiotr Jasiukajtis if (n > 0) 302*25c28e83SPiotr Jasiukajtis { 303*25c28e83SPiotr Jasiukajtis ax0 = *(int*)px; 304*25c28e83SPiotr Jasiukajtis ay0 = *(int*)py; 305*25c28e83SPiotr Jasiukajtis x0 = *px; 306*25c28e83SPiotr Jasiukajtis y0 = *py; 307*25c28e83SPiotr Jasiukajtis 308*25c28e83SPiotr Jasiukajtis ax0 &= 0x7fffffff; 309*25c28e83SPiotr Jasiukajtis ay0 &= 0x7fffffff; 310*25c28e83SPiotr Jasiukajtis 311*25c28e83SPiotr Jasiukajtis if (ax0 >= 0x7f800000 || ay0 >= 0x7f800000) /* X or Y = NaN or Inf */ 312*25c28e83SPiotr Jasiukajtis { 313*25c28e83SPiotr Jasiukajtis res = fabsf(x0) + fabsf(y0); 314*25c28e83SPiotr Jasiukajtis if (ax0 == 0x7f800000) res = 0.0f; 315*25c28e83SPiotr Jasiukajtis else if (ay0 == 0x7f800000) res = 0.0f; 316*25c28e83SPiotr Jasiukajtis *pz = res; 317*25c28e83SPiotr Jasiukajtis } 318*25c28e83SPiotr Jasiukajtis else if (ax0 == 0 && ay0 == 0) /* X and Y = 0 */ 319*25c28e83SPiotr Jasiukajtis { 320*25c28e83SPiotr Jasiukajtis *pz = 1.0f / 0.0f; 321*25c28e83SPiotr Jasiukajtis } 322*25c28e83SPiotr Jasiukajtis else 323*25c28e83SPiotr Jasiukajtis { 324*25c28e83SPiotr Jasiukajtis double xx0, res0, hyp0, h_hi0 = 0, dbase0 = 0; 325*25c28e83SPiotr Jasiukajtis int ibase0, si0, hyp0h; 326*25c28e83SPiotr Jasiukajtis 327*25c28e83SPiotr Jasiukajtis hyp0 = x0 * (double)x0 + y0 * (double)y0; 328*25c28e83SPiotr Jasiukajtis 329*25c28e83SPiotr Jasiukajtis ibase0 = HI(&hyp0); 330*25c28e83SPiotr Jasiukajtis 331*25c28e83SPiotr Jasiukajtis HI(&dbase0) = (0x60000000 - ((ibase0 & 0x7fe00000) >> 1)); 332*25c28e83SPiotr Jasiukajtis 333*25c28e83SPiotr Jasiukajtis hyp0h = (ibase0 & 0x000fffff) | 0x3ff00000; 334*25c28e83SPiotr Jasiukajtis HI(&hyp0) = hyp0h; 335*25c28e83SPiotr Jasiukajtis HI(&h_hi0) = hyp0h & 0x7fffc000; 336*25c28e83SPiotr Jasiukajtis 337*25c28e83SPiotr Jasiukajtis ibase0 >>= 10; 338*25c28e83SPiotr Jasiukajtis si0 = ibase0 & 0x7f0; 339*25c28e83SPiotr Jasiukajtis xx0 = ((double*)((char*)__vlibm_TBL_rhypotf + si0))[0]; 340*25c28e83SPiotr Jasiukajtis 341*25c28e83SPiotr Jasiukajtis xx0 = (hyp0 - h_hi0) * xx0; 342*25c28e83SPiotr Jasiukajtis res0 = ((double*)((char*)__vlibm_TBL_rhypotf + si0))[1]; 343*25c28e83SPiotr Jasiukajtis res0 *= (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0); 344*25c28e83SPiotr Jasiukajtis res0 *= dbase0; 345*25c28e83SPiotr Jasiukajtis *pz = res0; 346*25c28e83SPiotr Jasiukajtis } 347*25c28e83SPiotr Jasiukajtis } 348*25c28e83SPiotr Jasiukajtis } 349*25c28e83SPiotr Jasiukajtis 350*25c28e83SPiotr Jasiukajtis static void 351*25c28e83SPiotr Jasiukajtis __vrhypotf_n(int n, float * restrict px, int stridex, float * restrict py, 352*25c28e83SPiotr Jasiukajtis int stridey, float * restrict pz, int stridez) 353*25c28e83SPiotr Jasiukajtis { 354*25c28e83SPiotr Jasiukajtis double xx0, res0, hyp0, h_hi0 = 0, dbase0 = 0; 355*25c28e83SPiotr Jasiukajtis double xx1, res1, hyp1, h_hi1 = 0, dbase1 = 0; 356*25c28e83SPiotr Jasiukajtis double xx2, res2, hyp2, h_hi2 = 0, dbase2 = 0; 357*25c28e83SPiotr Jasiukajtis float x0, y0; 358*25c28e83SPiotr Jasiukajtis float x1, y1; 359*25c28e83SPiotr Jasiukajtis float x2, y2; 360*25c28e83SPiotr Jasiukajtis int ibase0, si0, hyp0h; 361*25c28e83SPiotr Jasiukajtis int ibase1, si1, hyp1h; 362*25c28e83SPiotr Jasiukajtis int ibase2, si2, hyp2h; 363*25c28e83SPiotr Jasiukajtis 364*25c28e83SPiotr Jasiukajtis for (; n > 2 ; n -= 3) 365*25c28e83SPiotr Jasiukajtis { 366*25c28e83SPiotr Jasiukajtis x0 = *px; 367*25c28e83SPiotr Jasiukajtis px += stridex; 368*25c28e83SPiotr Jasiukajtis x1 = *px; 369*25c28e83SPiotr Jasiukajtis px += stridex; 370*25c28e83SPiotr Jasiukajtis x2 = *px; 371*25c28e83SPiotr Jasiukajtis px += stridex; 372*25c28e83SPiotr Jasiukajtis 373*25c28e83SPiotr Jasiukajtis y0 = *py; 374*25c28e83SPiotr Jasiukajtis py += stridey; 375*25c28e83SPiotr Jasiukajtis y1 = *py; 376*25c28e83SPiotr Jasiukajtis py += stridey; 377*25c28e83SPiotr Jasiukajtis y2 = *py; 378*25c28e83SPiotr Jasiukajtis py += stridey; 379*25c28e83SPiotr Jasiukajtis 380*25c28e83SPiotr Jasiukajtis hyp0 = x0 * (double)x0 + y0 * (double)y0; 381*25c28e83SPiotr Jasiukajtis hyp1 = x1 * (double)x1 + y1 * (double)y1; 382*25c28e83SPiotr Jasiukajtis hyp2 = x2 * (double)x2 + y2 * (double)y2; 383*25c28e83SPiotr Jasiukajtis 384*25c28e83SPiotr Jasiukajtis ibase0 = HI(&hyp0); 385*25c28e83SPiotr Jasiukajtis ibase1 = HI(&hyp1); 386*25c28e83SPiotr Jasiukajtis ibase2 = HI(&hyp2); 387*25c28e83SPiotr Jasiukajtis 388*25c28e83SPiotr Jasiukajtis HI(&dbase0) = (0x60000000 - ((ibase0 & 0x7fe00000) >> 1)); 389*25c28e83SPiotr Jasiukajtis HI(&dbase1) = (0x60000000 - ((ibase1 & 0x7fe00000) >> 1)); 390*25c28e83SPiotr Jasiukajtis HI(&dbase2) = (0x60000000 - ((ibase2 & 0x7fe00000) >> 1)); 391*25c28e83SPiotr Jasiukajtis 392*25c28e83SPiotr Jasiukajtis hyp0h = (ibase0 & 0x000fffff) | 0x3ff00000; 393*25c28e83SPiotr Jasiukajtis hyp1h = (ibase1 & 0x000fffff) | 0x3ff00000; 394*25c28e83SPiotr Jasiukajtis hyp2h = (ibase2 & 0x000fffff) | 0x3ff00000; 395*25c28e83SPiotr Jasiukajtis HI(&hyp0) = hyp0h; 396*25c28e83SPiotr Jasiukajtis HI(&hyp1) = hyp1h; 397*25c28e83SPiotr Jasiukajtis HI(&hyp2) = hyp2h; 398*25c28e83SPiotr Jasiukajtis HI(&h_hi0) = hyp0h & 0x7fffc000; 399*25c28e83SPiotr Jasiukajtis HI(&h_hi1) = hyp1h & 0x7fffc000; 400*25c28e83SPiotr Jasiukajtis HI(&h_hi2) = hyp2h & 0x7fffc000; 401*25c28e83SPiotr Jasiukajtis 402*25c28e83SPiotr Jasiukajtis ibase0 >>= 10; 403*25c28e83SPiotr Jasiukajtis ibase1 >>= 10; 404*25c28e83SPiotr Jasiukajtis ibase2 >>= 10; 405*25c28e83SPiotr Jasiukajtis si0 = ibase0 & 0x7f0; 406*25c28e83SPiotr Jasiukajtis si1 = ibase1 & 0x7f0; 407*25c28e83SPiotr Jasiukajtis si2 = ibase2 & 0x7f0; 408*25c28e83SPiotr Jasiukajtis xx0 = ((double*)((char*)__vlibm_TBL_rhypotf + si0))[0]; 409*25c28e83SPiotr Jasiukajtis xx1 = ((double*)((char*)__vlibm_TBL_rhypotf + si1))[0]; 410*25c28e83SPiotr Jasiukajtis xx2 = ((double*)((char*)__vlibm_TBL_rhypotf + si2))[0]; 411*25c28e83SPiotr Jasiukajtis 412*25c28e83SPiotr Jasiukajtis xx0 = (hyp0 - h_hi0) * xx0; 413*25c28e83SPiotr Jasiukajtis xx1 = (hyp1 - h_hi1) * xx1; 414*25c28e83SPiotr Jasiukajtis xx2 = (hyp2 - h_hi2) * xx2; 415*25c28e83SPiotr Jasiukajtis res0 = ((double*)((char*)__vlibm_TBL_rhypotf + si0))[1]; 416*25c28e83SPiotr Jasiukajtis res1 = ((double*)((char*)__vlibm_TBL_rhypotf + si1))[1]; 417*25c28e83SPiotr Jasiukajtis res2 = ((double*)((char*)__vlibm_TBL_rhypotf + si2))[1]; 418*25c28e83SPiotr Jasiukajtis res0 *= (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0); 419*25c28e83SPiotr Jasiukajtis res1 *= (((A3 * xx1 + A2) * xx1 + A1) * xx1 + A0); 420*25c28e83SPiotr Jasiukajtis res2 *= (((A3 * xx2 + A2) * xx2 + A1) * xx2 + A0); 421*25c28e83SPiotr Jasiukajtis res0 *= dbase0; 422*25c28e83SPiotr Jasiukajtis res1 *= dbase1; 423*25c28e83SPiotr Jasiukajtis res2 *= dbase2; 424*25c28e83SPiotr Jasiukajtis *pz = res0; 425*25c28e83SPiotr Jasiukajtis pz += stridez; 426*25c28e83SPiotr Jasiukajtis *pz = res1; 427*25c28e83SPiotr Jasiukajtis pz += stridez; 428*25c28e83SPiotr Jasiukajtis *pz = res2; 429*25c28e83SPiotr Jasiukajtis pz += stridez; 430*25c28e83SPiotr Jasiukajtis } 431*25c28e83SPiotr Jasiukajtis 432*25c28e83SPiotr Jasiukajtis for (; n > 0 ; n--) 433*25c28e83SPiotr Jasiukajtis { 434*25c28e83SPiotr Jasiukajtis x0 = *px; 435*25c28e83SPiotr Jasiukajtis px += stridex; 436*25c28e83SPiotr Jasiukajtis 437*25c28e83SPiotr Jasiukajtis y0 = *py; 438*25c28e83SPiotr Jasiukajtis py += stridey; 439*25c28e83SPiotr Jasiukajtis 440*25c28e83SPiotr Jasiukajtis hyp0 = x0 * (double)x0 + y0 * (double)y0; 441*25c28e83SPiotr Jasiukajtis 442*25c28e83SPiotr Jasiukajtis ibase0 = HI(&hyp0); 443*25c28e83SPiotr Jasiukajtis 444*25c28e83SPiotr Jasiukajtis HI(&dbase0) = (0x60000000 - ((ibase0 & 0x7fe00000) >> 1)); 445*25c28e83SPiotr Jasiukajtis 446*25c28e83SPiotr Jasiukajtis hyp0h = (ibase0 & 0x000fffff) | 0x3ff00000; 447*25c28e83SPiotr Jasiukajtis HI(&hyp0) = hyp0h; 448*25c28e83SPiotr Jasiukajtis HI(&h_hi0) = hyp0h & 0x7fffc000; 449*25c28e83SPiotr Jasiukajtis 450*25c28e83SPiotr Jasiukajtis ibase0 >>= 10; 451*25c28e83SPiotr Jasiukajtis si0 = ibase0 & 0x7f0; 452*25c28e83SPiotr Jasiukajtis xx0 = ((double*)((char*)__vlibm_TBL_rhypotf + si0))[0]; 453*25c28e83SPiotr Jasiukajtis 454*25c28e83SPiotr Jasiukajtis xx0 = (hyp0 - h_hi0) * xx0; 455*25c28e83SPiotr Jasiukajtis res0 = ((double*)((char*)__vlibm_TBL_rhypotf + si0))[1]; 456*25c28e83SPiotr Jasiukajtis res0 *= (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0); 457*25c28e83SPiotr Jasiukajtis res0 *= dbase0; 458*25c28e83SPiotr Jasiukajtis *pz = res0; 459*25c28e83SPiotr Jasiukajtis pz += stridez; 460*25c28e83SPiotr Jasiukajtis } 461*25c28e83SPiotr Jasiukajtis } 462