1*25c28e83SPiotr Jasiukajtis /* 2*25c28e83SPiotr Jasiukajtis * CDDL HEADER START 3*25c28e83SPiotr Jasiukajtis * 4*25c28e83SPiotr Jasiukajtis * The contents of this file are subject to the terms of the 5*25c28e83SPiotr Jasiukajtis * Common Development and Distribution License (the "License"). 6*25c28e83SPiotr Jasiukajtis * You may not use this file except in compliance with the License. 7*25c28e83SPiotr Jasiukajtis * 8*25c28e83SPiotr Jasiukajtis * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9*25c28e83SPiotr Jasiukajtis * or http://www.opensolaris.org/os/licensing. 10*25c28e83SPiotr Jasiukajtis * See the License for the specific language governing permissions 11*25c28e83SPiotr Jasiukajtis * and limitations under the License. 12*25c28e83SPiotr Jasiukajtis * 13*25c28e83SPiotr Jasiukajtis * When distributing Covered Code, include this CDDL HEADER in each 14*25c28e83SPiotr Jasiukajtis * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15*25c28e83SPiotr Jasiukajtis * If applicable, add the following below this CDDL HEADER, with the 16*25c28e83SPiotr Jasiukajtis * fields enclosed by brackets "[]" replaced with your own identifying 17*25c28e83SPiotr Jasiukajtis * information: Portions Copyright [yyyy] [name of copyright owner] 18*25c28e83SPiotr Jasiukajtis * 19*25c28e83SPiotr Jasiukajtis * CDDL HEADER END 20*25c28e83SPiotr Jasiukajtis */ 21*25c28e83SPiotr Jasiukajtis 22*25c28e83SPiotr Jasiukajtis /* 23*25c28e83SPiotr Jasiukajtis * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 24*25c28e83SPiotr Jasiukajtis */ 25*25c28e83SPiotr Jasiukajtis /* 26*25c28e83SPiotr Jasiukajtis * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 27*25c28e83SPiotr Jasiukajtis * Use is subject to license terms. 28*25c28e83SPiotr Jasiukajtis */ 29*25c28e83SPiotr Jasiukajtis 30*25c28e83SPiotr Jasiukajtis #include <sys/isa_defs.h> 31*25c28e83SPiotr Jasiukajtis #include "libm_inlines.h" 32*25c28e83SPiotr Jasiukajtis 33*25c28e83SPiotr Jasiukajtis #ifdef _LITTLE_ENDIAN 34*25c28e83SPiotr Jasiukajtis #define HI(x) *(1+(int*)x) 35*25c28e83SPiotr Jasiukajtis #define LO(x) *(unsigned*)x 36*25c28e83SPiotr Jasiukajtis #else 37*25c28e83SPiotr Jasiukajtis #define HI(x) *(int*)x 38*25c28e83SPiotr Jasiukajtis #define LO(x) *(1+(unsigned*)x) 39*25c28e83SPiotr Jasiukajtis #endif 40*25c28e83SPiotr Jasiukajtis 41*25c28e83SPiotr Jasiukajtis #ifdef __RESTRICT 42*25c28e83SPiotr Jasiukajtis #define restrict _Restrict 43*25c28e83SPiotr Jasiukajtis #else 44*25c28e83SPiotr Jasiukajtis #define restrict 45*25c28e83SPiotr Jasiukajtis #endif 46*25c28e83SPiotr Jasiukajtis 47*25c28e83SPiotr Jasiukajtis /* double rsqrt(double x) 48*25c28e83SPiotr Jasiukajtis * 49*25c28e83SPiotr Jasiukajtis * Method : 50*25c28e83SPiotr Jasiukajtis * 1. Special cases: 51*25c28e83SPiotr Jasiukajtis * for x = NaN => QNaN; 52*25c28e83SPiotr Jasiukajtis * for x = +Inf => 0; 53*25c28e83SPiotr Jasiukajtis * for x is negative, -Inf => QNaN + invalid; 54*25c28e83SPiotr Jasiukajtis * for x = +0 => +Inf + divide-by-zero; 55*25c28e83SPiotr Jasiukajtis * for x = -0 => -Inf + divide-by-zero. 56*25c28e83SPiotr Jasiukajtis * 2. Computes reciprocal square root from: 57*25c28e83SPiotr Jasiukajtis * x = m * 2**n 58*25c28e83SPiotr Jasiukajtis * Where: 59*25c28e83SPiotr Jasiukajtis * m = [0.5, 2), 60*25c28e83SPiotr Jasiukajtis * n = ((exponent + 1) & ~1). 61*25c28e83SPiotr Jasiukajtis * Then: 62*25c28e83SPiotr Jasiukajtis * rsqrt(x) = 1/sqrt( m * 2**n ) = (2 ** (-n/2)) * (1/sqrt(m)) 63*25c28e83SPiotr Jasiukajtis * 2. Computes 1/sqrt(m) from: 64*25c28e83SPiotr Jasiukajtis * 1/sqrt(m) = (1/sqrt(m0)) * (1/sqrt(1 + (1/m0)*dm)) 65*25c28e83SPiotr Jasiukajtis * Where: 66*25c28e83SPiotr Jasiukajtis * m = m0 + dm, 67*25c28e83SPiotr Jasiukajtis * m0 = 0.5 * (1 + k/64) for m = [0.5, 0.5+127/256), k = [0, 63]; 68*25c28e83SPiotr Jasiukajtis * m0 = 1.0 * (0 + k/64) for m = [0.5+127/256, 1.0+127/128), k = [64, 127]; 69*25c28e83SPiotr Jasiukajtis * m0 = 2.0 for m = [1.0+127/128, 2.0), k = 128. 70*25c28e83SPiotr Jasiukajtis * Then: 71*25c28e83SPiotr Jasiukajtis * 1/sqrt(m0) is looked up in a table, 72*25c28e83SPiotr Jasiukajtis * 1/m0 is computed as (1/sqrt(m0)) * (1/sqrt(m0)). 73*25c28e83SPiotr Jasiukajtis * 1/sqrt(1 + (1/m0)*dm) is computed using approximation: 74*25c28e83SPiotr Jasiukajtis * 1/sqrt(1 + z) = (((((a6 * z + a5) * z + a4) * z + a3) 75*25c28e83SPiotr Jasiukajtis * * z + a2) * z + a1) * z + a0 76*25c28e83SPiotr Jasiukajtis * where z = [-1/128, 1/128]. 77*25c28e83SPiotr Jasiukajtis * 78*25c28e83SPiotr Jasiukajtis * Accuracy: 79*25c28e83SPiotr Jasiukajtis * The maximum relative error for the approximating 80*25c28e83SPiotr Jasiukajtis * polynomial is 2**(-56.26). 81*25c28e83SPiotr Jasiukajtis * Maximum error observed: less than 0.563 ulp after 1.500.000.000 82*25c28e83SPiotr Jasiukajtis * results. 83*25c28e83SPiotr Jasiukajtis */ 84*25c28e83SPiotr Jasiukajtis 85*25c28e83SPiotr Jasiukajtis extern double sqrt (double); 86*25c28e83SPiotr Jasiukajtis extern const double __vlibm_TBL_rsqrt[]; 87*25c28e83SPiotr Jasiukajtis 88*25c28e83SPiotr Jasiukajtis static void 89*25c28e83SPiotr Jasiukajtis __vrsqrt_n(int n, double * restrict px, int stridex, double * restrict py, int stridey); 90*25c28e83SPiotr Jasiukajtis 91*25c28e83SPiotr Jasiukajtis #pragma no_inline(__vrsqrt_n) 92*25c28e83SPiotr Jasiukajtis 93*25c28e83SPiotr Jasiukajtis #define RETURN(ret) \ 94*25c28e83SPiotr Jasiukajtis { \ 95*25c28e83SPiotr Jasiukajtis *py = (ret); \ 96*25c28e83SPiotr Jasiukajtis py += stridey; \ 97*25c28e83SPiotr Jasiukajtis if (n_n == 0) \ 98*25c28e83SPiotr Jasiukajtis { \ 99*25c28e83SPiotr Jasiukajtis spx = px; spy = py; \ 100*25c28e83SPiotr Jasiukajtis hx = HI(px); \ 101*25c28e83SPiotr Jasiukajtis continue; \ 102*25c28e83SPiotr Jasiukajtis } \ 103*25c28e83SPiotr Jasiukajtis n--; \ 104*25c28e83SPiotr Jasiukajtis break; \ 105*25c28e83SPiotr Jasiukajtis } 106*25c28e83SPiotr Jasiukajtis 107*25c28e83SPiotr Jasiukajtis static const double 108*25c28e83SPiotr Jasiukajtis DONE = 1.0, 109*25c28e83SPiotr Jasiukajtis K1 = -5.00000000000005209867e-01, 110*25c28e83SPiotr Jasiukajtis K2 = 3.75000000000004884257e-01, 111*25c28e83SPiotr Jasiukajtis K3 = -3.12499999317136886551e-01, 112*25c28e83SPiotr Jasiukajtis K4 = 2.73437499359815081532e-01, 113*25c28e83SPiotr Jasiukajtis K5 = -2.46116125605037803130e-01, 114*25c28e83SPiotr Jasiukajtis K6 = 2.25606914648617522896e-01; 115*25c28e83SPiotr Jasiukajtis 116*25c28e83SPiotr Jasiukajtis void 117*25c28e83SPiotr Jasiukajtis __vrsqrt(int n, double * restrict px, int stridex, double * restrict py, int stridey) 118*25c28e83SPiotr Jasiukajtis { 119*25c28e83SPiotr Jasiukajtis double *spx, *spy; 120*25c28e83SPiotr Jasiukajtis int ax, lx, hx, n_n; 121*25c28e83SPiotr Jasiukajtis double res; 122*25c28e83SPiotr Jasiukajtis 123*25c28e83SPiotr Jasiukajtis while (n > 1) 124*25c28e83SPiotr Jasiukajtis { 125*25c28e83SPiotr Jasiukajtis n_n = 0; 126*25c28e83SPiotr Jasiukajtis spx = px; 127*25c28e83SPiotr Jasiukajtis spy = py; 128*25c28e83SPiotr Jasiukajtis hx = HI(px); 129*25c28e83SPiotr Jasiukajtis for (; n > 1 ; n--) 130*25c28e83SPiotr Jasiukajtis { 131*25c28e83SPiotr Jasiukajtis px += stridex; 132*25c28e83SPiotr Jasiukajtis if (hx >= 0x7ff00000) /* X = NaN or Inf */ 133*25c28e83SPiotr Jasiukajtis { 134*25c28e83SPiotr Jasiukajtis res = *(px - stridex); 135*25c28e83SPiotr Jasiukajtis RETURN (DONE / res) 136*25c28e83SPiotr Jasiukajtis } 137*25c28e83SPiotr Jasiukajtis 138*25c28e83SPiotr Jasiukajtis py += stridey; 139*25c28e83SPiotr Jasiukajtis 140*25c28e83SPiotr Jasiukajtis if (hx < 0x00100000) /* X = denormal, zero or negative */ 141*25c28e83SPiotr Jasiukajtis { 142*25c28e83SPiotr Jasiukajtis py -= stridey; 143*25c28e83SPiotr Jasiukajtis ax = hx & 0x7fffffff; 144*25c28e83SPiotr Jasiukajtis lx = LO((px - stridex)); 145*25c28e83SPiotr Jasiukajtis res = *(px - stridex); 146*25c28e83SPiotr Jasiukajtis 147*25c28e83SPiotr Jasiukajtis if ((ax | lx) == 0) /* |X| = zero */ 148*25c28e83SPiotr Jasiukajtis { 149*25c28e83SPiotr Jasiukajtis RETURN (DONE / res) 150*25c28e83SPiotr Jasiukajtis } 151*25c28e83SPiotr Jasiukajtis else if (hx >= 0) /* X = denormal */ 152*25c28e83SPiotr Jasiukajtis { 153*25c28e83SPiotr Jasiukajtis double res_c0, dsqrt_exp0; 154*25c28e83SPiotr Jasiukajtis int ind0, sqrt_exp0; 155*25c28e83SPiotr Jasiukajtis double xx0, dexp_hi0, dexp_lo0; 156*25c28e83SPiotr Jasiukajtis int hx0, resh0, res_ch0; 157*25c28e83SPiotr Jasiukajtis 158*25c28e83SPiotr Jasiukajtis res = *(long long*)&res; 159*25c28e83SPiotr Jasiukajtis 160*25c28e83SPiotr Jasiukajtis hx0 = HI(&res); 161*25c28e83SPiotr Jasiukajtis sqrt_exp0 = (0x817 - (hx0 >> 21)) << 20; 162*25c28e83SPiotr Jasiukajtis ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16; 163*25c28e83SPiotr Jasiukajtis 164*25c28e83SPiotr Jasiukajtis resh0 = (hx0 & 0x001fffff) | 0x3fe00000; 165*25c28e83SPiotr Jasiukajtis res_ch0 = (resh0 + 0x00002000) & 0x7fffc000; 166*25c28e83SPiotr Jasiukajtis HI(&res) = resh0; 167*25c28e83SPiotr Jasiukajtis HI(&res_c0) = res_ch0; 168*25c28e83SPiotr Jasiukajtis LO(&res_c0) = 0; 169*25c28e83SPiotr Jasiukajtis 170*25c28e83SPiotr Jasiukajtis dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0]; 171*25c28e83SPiotr Jasiukajtis dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1]; 172*25c28e83SPiotr Jasiukajtis xx0 = dexp_hi0 * dexp_hi0; 173*25c28e83SPiotr Jasiukajtis xx0 = (res - res_c0) * xx0; 174*25c28e83SPiotr Jasiukajtis res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0; 175*25c28e83SPiotr Jasiukajtis 176*25c28e83SPiotr Jasiukajtis res = dexp_hi0 * res + dexp_lo0 + dexp_hi0; 177*25c28e83SPiotr Jasiukajtis 178*25c28e83SPiotr Jasiukajtis HI(&dsqrt_exp0) = sqrt_exp0; 179*25c28e83SPiotr Jasiukajtis LO(&dsqrt_exp0) = 0; 180*25c28e83SPiotr Jasiukajtis res *= dsqrt_exp0; 181*25c28e83SPiotr Jasiukajtis 182*25c28e83SPiotr Jasiukajtis RETURN (res) 183*25c28e83SPiotr Jasiukajtis } 184*25c28e83SPiotr Jasiukajtis else /* X = negative */ 185*25c28e83SPiotr Jasiukajtis { 186*25c28e83SPiotr Jasiukajtis RETURN (sqrt(res)) 187*25c28e83SPiotr Jasiukajtis } 188*25c28e83SPiotr Jasiukajtis } 189*25c28e83SPiotr Jasiukajtis n_n++; 190*25c28e83SPiotr Jasiukajtis hx = HI(px); 191*25c28e83SPiotr Jasiukajtis } 192*25c28e83SPiotr Jasiukajtis if (n_n > 0) 193*25c28e83SPiotr Jasiukajtis __vrsqrt_n(n_n, spx, stridex, spy, stridey); 194*25c28e83SPiotr Jasiukajtis } 195*25c28e83SPiotr Jasiukajtis if (n > 0) 196*25c28e83SPiotr Jasiukajtis { 197*25c28e83SPiotr Jasiukajtis hx = HI(px); 198*25c28e83SPiotr Jasiukajtis 199*25c28e83SPiotr Jasiukajtis if (hx >= 0x7ff00000) /* X = NaN or Inf */ 200*25c28e83SPiotr Jasiukajtis { 201*25c28e83SPiotr Jasiukajtis res = *px; 202*25c28e83SPiotr Jasiukajtis *py = DONE / res; 203*25c28e83SPiotr Jasiukajtis } 204*25c28e83SPiotr Jasiukajtis else if (hx < 0x00100000) /* X = denormal, zero or negative */ 205*25c28e83SPiotr Jasiukajtis { 206*25c28e83SPiotr Jasiukajtis ax = hx & 0x7fffffff; 207*25c28e83SPiotr Jasiukajtis lx = LO(px); 208*25c28e83SPiotr Jasiukajtis res = *px; 209*25c28e83SPiotr Jasiukajtis 210*25c28e83SPiotr Jasiukajtis if ((ax | lx) == 0) /* |X| = zero */ 211*25c28e83SPiotr Jasiukajtis { 212*25c28e83SPiotr Jasiukajtis *py = DONE / res; 213*25c28e83SPiotr Jasiukajtis } 214*25c28e83SPiotr Jasiukajtis else if (hx >= 0) /* X = denormal */ 215*25c28e83SPiotr Jasiukajtis { 216*25c28e83SPiotr Jasiukajtis double res_c0, dsqrt_exp0; 217*25c28e83SPiotr Jasiukajtis int ind0, sqrt_exp0; 218*25c28e83SPiotr Jasiukajtis double xx0, dexp_hi0, dexp_lo0; 219*25c28e83SPiotr Jasiukajtis int hx0, resh0, res_ch0; 220*25c28e83SPiotr Jasiukajtis 221*25c28e83SPiotr Jasiukajtis res = *(long long*)&res; 222*25c28e83SPiotr Jasiukajtis 223*25c28e83SPiotr Jasiukajtis hx0 = HI(&res); 224*25c28e83SPiotr Jasiukajtis sqrt_exp0 = (0x817 - (hx0 >> 21)) << 20; 225*25c28e83SPiotr Jasiukajtis ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16; 226*25c28e83SPiotr Jasiukajtis 227*25c28e83SPiotr Jasiukajtis resh0 = (hx0 & 0x001fffff) | 0x3fe00000; 228*25c28e83SPiotr Jasiukajtis res_ch0 = (resh0 + 0x00002000) & 0x7fffc000; 229*25c28e83SPiotr Jasiukajtis HI(&res) = resh0; 230*25c28e83SPiotr Jasiukajtis HI(&res_c0) = res_ch0; 231*25c28e83SPiotr Jasiukajtis LO(&res_c0) = 0; 232*25c28e83SPiotr Jasiukajtis 233*25c28e83SPiotr Jasiukajtis dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0]; 234*25c28e83SPiotr Jasiukajtis dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1]; 235*25c28e83SPiotr Jasiukajtis xx0 = dexp_hi0 * dexp_hi0; 236*25c28e83SPiotr Jasiukajtis xx0 = (res - res_c0) * xx0; 237*25c28e83SPiotr Jasiukajtis res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0; 238*25c28e83SPiotr Jasiukajtis 239*25c28e83SPiotr Jasiukajtis res = dexp_hi0 * res + dexp_lo0 + dexp_hi0; 240*25c28e83SPiotr Jasiukajtis 241*25c28e83SPiotr Jasiukajtis HI(&dsqrt_exp0) = sqrt_exp0; 242*25c28e83SPiotr Jasiukajtis LO(&dsqrt_exp0) = 0; 243*25c28e83SPiotr Jasiukajtis res *= dsqrt_exp0; 244*25c28e83SPiotr Jasiukajtis 245*25c28e83SPiotr Jasiukajtis *py = res; 246*25c28e83SPiotr Jasiukajtis } 247*25c28e83SPiotr Jasiukajtis else /* X = negative */ 248*25c28e83SPiotr Jasiukajtis { 249*25c28e83SPiotr Jasiukajtis *py = sqrt(res); 250*25c28e83SPiotr Jasiukajtis } 251*25c28e83SPiotr Jasiukajtis } 252*25c28e83SPiotr Jasiukajtis else 253*25c28e83SPiotr Jasiukajtis { 254*25c28e83SPiotr Jasiukajtis double res_c0, dsqrt_exp0; 255*25c28e83SPiotr Jasiukajtis int ind0, sqrt_exp0; 256*25c28e83SPiotr Jasiukajtis double xx0, dexp_hi0, dexp_lo0; 257*25c28e83SPiotr Jasiukajtis int resh0, res_ch0; 258*25c28e83SPiotr Jasiukajtis 259*25c28e83SPiotr Jasiukajtis sqrt_exp0 = (0x5fe - (hx >> 21)) << 20; 260*25c28e83SPiotr Jasiukajtis ind0 = (((hx >> 10) & 0x7f8) + 8) & -16; 261*25c28e83SPiotr Jasiukajtis 262*25c28e83SPiotr Jasiukajtis resh0 = (hx & 0x001fffff) | 0x3fe00000; 263*25c28e83SPiotr Jasiukajtis res_ch0 = (resh0 + 0x00002000) & 0x7fffc000; 264*25c28e83SPiotr Jasiukajtis HI(&res) = resh0; 265*25c28e83SPiotr Jasiukajtis LO(&res) = LO(px); 266*25c28e83SPiotr Jasiukajtis HI(&res_c0) = res_ch0; 267*25c28e83SPiotr Jasiukajtis LO(&res_c0) = 0; 268*25c28e83SPiotr Jasiukajtis 269*25c28e83SPiotr Jasiukajtis dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0]; 270*25c28e83SPiotr Jasiukajtis dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1]; 271*25c28e83SPiotr Jasiukajtis xx0 = dexp_hi0 * dexp_hi0; 272*25c28e83SPiotr Jasiukajtis xx0 = (res - res_c0) * xx0; 273*25c28e83SPiotr Jasiukajtis res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0; 274*25c28e83SPiotr Jasiukajtis 275*25c28e83SPiotr Jasiukajtis res = dexp_hi0 * res + dexp_lo0 + dexp_hi0; 276*25c28e83SPiotr Jasiukajtis 277*25c28e83SPiotr Jasiukajtis HI(&dsqrt_exp0) = sqrt_exp0; 278*25c28e83SPiotr Jasiukajtis LO(&dsqrt_exp0) = 0; 279*25c28e83SPiotr Jasiukajtis res *= dsqrt_exp0; 280*25c28e83SPiotr Jasiukajtis 281*25c28e83SPiotr Jasiukajtis *py = res; 282*25c28e83SPiotr Jasiukajtis } 283*25c28e83SPiotr Jasiukajtis } 284*25c28e83SPiotr Jasiukajtis } 285*25c28e83SPiotr Jasiukajtis 286*25c28e83SPiotr Jasiukajtis static void 287*25c28e83SPiotr Jasiukajtis __vrsqrt_n(int n, double * restrict px, int stridex, double * restrict py, int stridey) 288*25c28e83SPiotr Jasiukajtis { 289*25c28e83SPiotr Jasiukajtis double res0, res_c0, dsqrt_exp0; 290*25c28e83SPiotr Jasiukajtis double res1, res_c1, dsqrt_exp1; 291*25c28e83SPiotr Jasiukajtis double res2, res_c2, dsqrt_exp2; 292*25c28e83SPiotr Jasiukajtis int ind0, sqrt_exp0; 293*25c28e83SPiotr Jasiukajtis int ind1, sqrt_exp1; 294*25c28e83SPiotr Jasiukajtis int ind2, sqrt_exp2; 295*25c28e83SPiotr Jasiukajtis double xx0, dexp_hi0, dexp_lo0; 296*25c28e83SPiotr Jasiukajtis double xx1, dexp_hi1, dexp_lo1; 297*25c28e83SPiotr Jasiukajtis double xx2, dexp_hi2, dexp_lo2; 298*25c28e83SPiotr Jasiukajtis int hx0, resh0, res_ch0; 299*25c28e83SPiotr Jasiukajtis int hx1, resh1, res_ch1; 300*25c28e83SPiotr Jasiukajtis int hx2, resh2, res_ch2; 301*25c28e83SPiotr Jasiukajtis 302*25c28e83SPiotr Jasiukajtis LO(&dsqrt_exp0) = 0; 303*25c28e83SPiotr Jasiukajtis LO(&dsqrt_exp1) = 0; 304*25c28e83SPiotr Jasiukajtis LO(&dsqrt_exp2) = 0; 305*25c28e83SPiotr Jasiukajtis LO(&res_c0) = 0; 306*25c28e83SPiotr Jasiukajtis LO(&res_c1) = 0; 307*25c28e83SPiotr Jasiukajtis LO(&res_c2) = 0; 308*25c28e83SPiotr Jasiukajtis 309*25c28e83SPiotr Jasiukajtis for(; n > 2 ; n -= 3) 310*25c28e83SPiotr Jasiukajtis { 311*25c28e83SPiotr Jasiukajtis hx0 = HI(px); 312*25c28e83SPiotr Jasiukajtis LO(&res0) = LO(px); 313*25c28e83SPiotr Jasiukajtis px += stridex; 314*25c28e83SPiotr Jasiukajtis 315*25c28e83SPiotr Jasiukajtis hx1 = HI(px); 316*25c28e83SPiotr Jasiukajtis LO(&res1) = LO(px); 317*25c28e83SPiotr Jasiukajtis px += stridex; 318*25c28e83SPiotr Jasiukajtis 319*25c28e83SPiotr Jasiukajtis hx2 = HI(px); 320*25c28e83SPiotr Jasiukajtis LO(&res2) = LO(px); 321*25c28e83SPiotr Jasiukajtis px += stridex; 322*25c28e83SPiotr Jasiukajtis 323*25c28e83SPiotr Jasiukajtis sqrt_exp0 = (0x5fe - (hx0 >> 21)) << 20; 324*25c28e83SPiotr Jasiukajtis sqrt_exp1 = (0x5fe - (hx1 >> 21)) << 20; 325*25c28e83SPiotr Jasiukajtis sqrt_exp2 = (0x5fe - (hx2 >> 21)) << 20; 326*25c28e83SPiotr Jasiukajtis ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16; 327*25c28e83SPiotr Jasiukajtis ind1 = (((hx1 >> 10) & 0x7f8) + 8) & -16; 328*25c28e83SPiotr Jasiukajtis ind2 = (((hx2 >> 10) & 0x7f8) + 8) & -16; 329*25c28e83SPiotr Jasiukajtis 330*25c28e83SPiotr Jasiukajtis resh0 = (hx0 & 0x001fffff) | 0x3fe00000; 331*25c28e83SPiotr Jasiukajtis resh1 = (hx1 & 0x001fffff) | 0x3fe00000; 332*25c28e83SPiotr Jasiukajtis resh2 = (hx2 & 0x001fffff) | 0x3fe00000; 333*25c28e83SPiotr Jasiukajtis res_ch0 = (resh0 + 0x00002000) & 0x7fffc000; 334*25c28e83SPiotr Jasiukajtis res_ch1 = (resh1 + 0x00002000) & 0x7fffc000; 335*25c28e83SPiotr Jasiukajtis res_ch2 = (resh2 + 0x00002000) & 0x7fffc000; 336*25c28e83SPiotr Jasiukajtis HI(&res0) = resh0; 337*25c28e83SPiotr Jasiukajtis HI(&res1) = resh1; 338*25c28e83SPiotr Jasiukajtis HI(&res2) = resh2; 339*25c28e83SPiotr Jasiukajtis HI(&res_c0) = res_ch0; 340*25c28e83SPiotr Jasiukajtis HI(&res_c1) = res_ch1; 341*25c28e83SPiotr Jasiukajtis HI(&res_c2) = res_ch2; 342*25c28e83SPiotr Jasiukajtis 343*25c28e83SPiotr Jasiukajtis dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0]; 344*25c28e83SPiotr Jasiukajtis dexp_hi1 = ((double*)((char*)__vlibm_TBL_rsqrt + ind1))[0]; 345*25c28e83SPiotr Jasiukajtis dexp_hi2 = ((double*)((char*)__vlibm_TBL_rsqrt + ind2))[0]; 346*25c28e83SPiotr Jasiukajtis dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1]; 347*25c28e83SPiotr Jasiukajtis dexp_lo1 = ((double*)((char*)__vlibm_TBL_rsqrt + ind1))[1]; 348*25c28e83SPiotr Jasiukajtis dexp_lo2 = ((double*)((char*)__vlibm_TBL_rsqrt + ind2))[1]; 349*25c28e83SPiotr Jasiukajtis xx0 = dexp_hi0 * dexp_hi0; 350*25c28e83SPiotr Jasiukajtis xx1 = dexp_hi1 * dexp_hi1; 351*25c28e83SPiotr Jasiukajtis xx2 = dexp_hi2 * dexp_hi2; 352*25c28e83SPiotr Jasiukajtis xx0 = (res0 - res_c0) * xx0; 353*25c28e83SPiotr Jasiukajtis xx1 = (res1 - res_c1) * xx1; 354*25c28e83SPiotr Jasiukajtis xx2 = (res2 - res_c2) * xx2; 355*25c28e83SPiotr Jasiukajtis res0 = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0; 356*25c28e83SPiotr Jasiukajtis res1 = (((((K6 * xx1 + K5) * xx1 + K4) * xx1 + K3) * xx1 + K2) * xx1 + K1) * xx1; 357*25c28e83SPiotr Jasiukajtis res2 = (((((K6 * xx2 + K5) * xx2 + K4) * xx2 + K3) * xx2 + K2) * xx2 + K1) * xx2; 358*25c28e83SPiotr Jasiukajtis 359*25c28e83SPiotr Jasiukajtis res0 = dexp_hi0 * res0 + dexp_lo0 + dexp_hi0; 360*25c28e83SPiotr Jasiukajtis res1 = dexp_hi1 * res1 + dexp_lo1 + dexp_hi1; 361*25c28e83SPiotr Jasiukajtis res2 = dexp_hi2 * res2 + dexp_lo2 + dexp_hi2; 362*25c28e83SPiotr Jasiukajtis 363*25c28e83SPiotr Jasiukajtis HI(&dsqrt_exp0) = sqrt_exp0; 364*25c28e83SPiotr Jasiukajtis HI(&dsqrt_exp1) = sqrt_exp1; 365*25c28e83SPiotr Jasiukajtis HI(&dsqrt_exp2) = sqrt_exp2; 366*25c28e83SPiotr Jasiukajtis res0 *= dsqrt_exp0; 367*25c28e83SPiotr Jasiukajtis res1 *= dsqrt_exp1; 368*25c28e83SPiotr Jasiukajtis res2 *= dsqrt_exp2; 369*25c28e83SPiotr Jasiukajtis 370*25c28e83SPiotr Jasiukajtis *py = res0; 371*25c28e83SPiotr Jasiukajtis py += stridey; 372*25c28e83SPiotr Jasiukajtis 373*25c28e83SPiotr Jasiukajtis *py = res1; 374*25c28e83SPiotr Jasiukajtis py += stridey; 375*25c28e83SPiotr Jasiukajtis 376*25c28e83SPiotr Jasiukajtis *py = res2; 377*25c28e83SPiotr Jasiukajtis py += stridey; 378*25c28e83SPiotr Jasiukajtis } 379*25c28e83SPiotr Jasiukajtis 380*25c28e83SPiotr Jasiukajtis for(; n > 0 ; n--) 381*25c28e83SPiotr Jasiukajtis { 382*25c28e83SPiotr Jasiukajtis hx0 = HI(px); 383*25c28e83SPiotr Jasiukajtis 384*25c28e83SPiotr Jasiukajtis sqrt_exp0 = (0x5fe - (hx0 >> 21)) << 20; 385*25c28e83SPiotr Jasiukajtis ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16; 386*25c28e83SPiotr Jasiukajtis 387*25c28e83SPiotr Jasiukajtis resh0 = (hx0 & 0x001fffff) | 0x3fe00000; 388*25c28e83SPiotr Jasiukajtis res_ch0 = (resh0 + 0x00002000) & 0x7fffc000; 389*25c28e83SPiotr Jasiukajtis HI(&res0) = resh0; 390*25c28e83SPiotr Jasiukajtis LO(&res0) = LO(px); 391*25c28e83SPiotr Jasiukajtis HI(&res_c0) = res_ch0; 392*25c28e83SPiotr Jasiukajtis LO(&res_c0) = 0; 393*25c28e83SPiotr Jasiukajtis 394*25c28e83SPiotr Jasiukajtis px += stridex; 395*25c28e83SPiotr Jasiukajtis 396*25c28e83SPiotr Jasiukajtis dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0]; 397*25c28e83SPiotr Jasiukajtis dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1]; 398*25c28e83SPiotr Jasiukajtis xx0 = dexp_hi0 * dexp_hi0; 399*25c28e83SPiotr Jasiukajtis xx0 = (res0 - res_c0) * xx0; 400*25c28e83SPiotr Jasiukajtis res0 = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0; 401*25c28e83SPiotr Jasiukajtis 402*25c28e83SPiotr Jasiukajtis res0 = dexp_hi0 * res0 + dexp_lo0 + dexp_hi0; 403*25c28e83SPiotr Jasiukajtis 404*25c28e83SPiotr Jasiukajtis HI(&dsqrt_exp0) = sqrt_exp0; 405*25c28e83SPiotr Jasiukajtis LO(&dsqrt_exp0) = 0; 406*25c28e83SPiotr Jasiukajtis res0 *= dsqrt_exp0; 407*25c28e83SPiotr Jasiukajtis 408*25c28e83SPiotr Jasiukajtis *py = res0; 409*25c28e83SPiotr Jasiukajtis py += stridey; 410*25c28e83SPiotr Jasiukajtis } 411*25c28e83SPiotr Jasiukajtis } 412