1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
24 */
25 /*
26 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
27 * Use is subject to license terms.
28 */
29
30 #include <sys/isa_defs.h>
31 #include "libm_inlines.h"
32
33 #ifdef _LITTLE_ENDIAN
34 #define HI(x) *(1+(int *)x)
35 #define LO(x) *(unsigned *)x
36 #else
37 #define HI(x) *(int *)x
38 #define LO(x) *(1+(unsigned *)x)
39 #endif
40
41 #ifdef __RESTRICT
42 #define restrict _Restrict
43 #else
44 #define restrict
45 #endif
46
47 /*
48 * double rsqrt(double x)
49 *
50 * Method :
51 * 1. Special cases:
52 * for x = NaN => QNaN;
53 * for x = +Inf => 0;
54 * for x is negative, -Inf => QNaN + invalid;
55 * for x = +0 => +Inf + divide-by-zero;
56 * for x = -0 => -Inf + divide-by-zero.
57 * 2. Computes reciprocal square root from:
58 * x = m * 2**n
59 * Where:
60 * m = [0.5, 2),
61 * n = ((exponent + 1) & ~1).
62 * Then:
63 * rsqrt(x) = 1/sqrt( m * 2**n ) = (2 ** (-n/2)) * (1/sqrt(m))
64 * 2. Computes 1/sqrt(m) from:
65 * 1/sqrt(m) = (1/sqrt(m0)) * (1/sqrt(1 + (1/m0)*dm))
66 * Where:
67 * m = m0 + dm,
68 * m0 = 0.5 * (1 + k/64) for m = [0.5, 0.5+127/256), k = [0, 63];
69 * m0 = 1.0 * (0 + k/64) for m = [0.5+127/256, 1.0+127/128),
70 * k = [64, 127];
71 * m0 = 2.0 for m = [1.0+127/128, 2.0), k = 128.
72 * Then:
73 * 1/sqrt(m0) is looked up in a table,
74 * 1/m0 is computed as (1/sqrt(m0)) * (1/sqrt(m0)).
75 * 1/sqrt(1 + (1/m0)*dm) is computed using approximation:
76 * 1/sqrt(1 + z) = (((((a6 * z + a5) * z + a4) * z + a3)
77 * * z + a2) * z + a1) * z + a0
78 * where z = [-1/128, 1/128].
79 *
80 * Accuracy:
81 * The maximum relative error for the approximating
82 * polynomial is 2**(-56.26).
83 * Maximum error observed: less than 0.563 ulp after 1.500.000.000
84 * results.
85 */
86
87 extern double sqrt(double);
88 extern const double __vlibm_TBL_rsqrt[];
89
90 static void
91 __vrsqrt_n(int n, double *restrict px, int stridex, double *restrict py,
92 int stridey);
93
94 #define RETURN(ret) \
95 { \
96 *py = (ret); \
97 py += stridey; \
98 if (n_n == 0) \
99 { \
100 spx = px; \
101 spy = py; \
102 hx = HI(px); \
103 continue; \
104 } \
105 n--; \
106 break; \
107 }
108
109 static const double
110 DONE = 1.0,
111 K1 = -5.00000000000005209867e-01,
112 K2 = 3.75000000000004884257e-01,
113 K3 = -3.12499999317136886551e-01,
114 K4 = 2.73437499359815081532e-01,
115 K5 = -2.46116125605037803130e-01,
116 K6 = 2.25606914648617522896e-01;
117
118 void
__vrsqrt(int n,double * restrict px,int stridex,double * restrict py,int stridey)119 __vrsqrt(int n, double *restrict px, int stridex, double *restrict py,
120 int stridey)
121 {
122 double *spx, *spy;
123 int ax, lx, hx, n_n;
124 double res;
125
126 while (n > 1) {
127 n_n = 0;
128 spx = px;
129 spy = py;
130 hx = HI(px);
131 for (; n > 1; n--) {
132 px += stridex;
133 if (hx >= 0x7ff00000) { /* X = NaN or Inf */
134 res = *(px - stridex);
135 RETURN(DONE / res)
136 }
137
138 py += stridey;
139
140 if (hx < 0x00100000) {
141 /* X = denormal, zero or negative */
142 py -= stridey;
143 ax = hx & 0x7fffffff;
144 lx = LO((px - stridex));
145 res = *(px - stridex);
146
147 if ((ax | lx) == 0) { /* |X| = zero */
148 RETURN(DONE / res)
149 } else if (hx >= 0) { /* X = denormal */
150 double res_c0, dsqrt_exp0;
151 int ind0, sqrt_exp0;
152 double xx0, dexp_hi0, dexp_lo0;
153 int hx0, resh0, res_ch0;
154
155 res = *(long long *)&res;
156
157 hx0 = HI(&res);
158 sqrt_exp0 = (0x817 - (hx0 >> 21)) << 20;
159 ind0 = (((hx0 >> 10) & 0x7f8) + 8) &
160 -16;
161
162 resh0 = (hx0 & 0x001fffff) | 0x3fe00000;
163 res_ch0 = (resh0 + 0x00002000) &
164 0x7fffc000;
165 HI(&res) = resh0;
166 HI(&res_c0) = res_ch0;
167 LO(&res_c0) = 0;
168
169 dexp_hi0 = ((double *)((char *)
170 __vlibm_TBL_rsqrt + ind0))[0];
171 dexp_lo0 = ((double *)((char *)
172 __vlibm_TBL_rsqrt + ind0))[1];
173 xx0 = dexp_hi0 * dexp_hi0;
174 xx0 = (res - res_c0) * xx0;
175 res = (((((K6 * xx0 + K5) * xx0 + K4) *
176 xx0 + K3) * xx0 + K2) * xx0 + K1) *
177 xx0;
178
179 res = dexp_hi0 * res + dexp_lo0 +
180 dexp_hi0;
181
182 HI(&dsqrt_exp0) = sqrt_exp0;
183 LO(&dsqrt_exp0) = 0;
184 res *= dsqrt_exp0;
185
186 RETURN(res)
187 } else { /* X = negative */
188 RETURN(sqrt(res))
189 }
190 }
191 n_n++;
192 hx = HI(px);
193 }
194 if (n_n > 0)
195 __vrsqrt_n(n_n, spx, stridex, spy, stridey);
196 }
197 if (n > 0) {
198 hx = HI(px);
199
200 if (hx >= 0x7ff00000) { /* X = NaN or Inf */
201 res = *px;
202 *py = DONE / res;
203 } else if (hx < 0x00100000) {
204 /* X = denormal, zero or negative */
205 ax = hx & 0x7fffffff;
206 lx = LO(px);
207 res = *px;
208
209 if ((ax | lx) == 0) { /* |X| = zero */
210 *py = DONE / res;
211 } else if (hx >= 0) { /* X = denormal */
212 double res_c0, dsqrt_exp0;
213 int ind0, sqrt_exp0;
214 double xx0, dexp_hi0, dexp_lo0;
215 int hx0, resh0, res_ch0;
216
217 res = *(long long *)&res;
218
219 hx0 = HI(&res);
220 sqrt_exp0 = (0x817 - (hx0 >> 21)) << 20;
221 ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16;
222
223 resh0 = (hx0 & 0x001fffff) | 0x3fe00000;
224 res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
225 HI(&res) = resh0;
226 HI(&res_c0) = res_ch0;
227 LO(&res_c0) = 0;
228
229 dexp_hi0 = ((double *)((char *)
230 __vlibm_TBL_rsqrt + ind0))[0];
231 dexp_lo0 = ((double *)((char *)
232 __vlibm_TBL_rsqrt + ind0))[1];
233 xx0 = dexp_hi0 * dexp_hi0;
234 xx0 = (res - res_c0) * xx0;
235 res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 +
236 K3) * xx0 + K2) * xx0 + K1) * xx0;
237
238 res = dexp_hi0 * res + dexp_lo0 + dexp_hi0;
239
240 HI(&dsqrt_exp0) = sqrt_exp0;
241 LO(&dsqrt_exp0) = 0;
242 res *= dsqrt_exp0;
243
244 *py = res;
245 } else { /* X = negative */
246 *py = sqrt(res);
247 }
248 } else {
249 double res_c0, dsqrt_exp0;
250 int ind0, sqrt_exp0;
251 double xx0, dexp_hi0, dexp_lo0;
252 int resh0, res_ch0;
253
254 sqrt_exp0 = (0x5fe - (hx >> 21)) << 20;
255 ind0 = (((hx >> 10) & 0x7f8) + 8) & -16;
256
257 resh0 = (hx & 0x001fffff) | 0x3fe00000;
258 res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
259 HI(&res) = resh0;
260 LO(&res) = LO(px);
261 HI(&res_c0) = res_ch0;
262 LO(&res_c0) = 0;
263
264 dexp_hi0 = ((double *)((char *)
265 __vlibm_TBL_rsqrt + ind0))[0];
266 dexp_lo0 = ((double *)((char *)
267 __vlibm_TBL_rsqrt + ind0))[1];
268 xx0 = dexp_hi0 * dexp_hi0;
269 xx0 = (res - res_c0) * xx0;
270 res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) *
271 xx0 + K2) * xx0 + K1) * xx0;
272
273 res = dexp_hi0 * res + dexp_lo0 + dexp_hi0;
274
275 HI(&dsqrt_exp0) = sqrt_exp0;
276 LO(&dsqrt_exp0) = 0;
277 res *= dsqrt_exp0;
278
279 *py = res;
280 }
281 }
282 }
283
284 static void
__vrsqrt_n(int n,double * restrict px,int stridex,double * restrict py,int stridey)285 __vrsqrt_n(int n, double *restrict px, int stridex, double *restrict py,
286 int stridey)
287 {
288 double res0, res_c0, dsqrt_exp0;
289 double res1, res_c1, dsqrt_exp1;
290 double res2, res_c2, dsqrt_exp2;
291 int ind0, sqrt_exp0;
292 int ind1, sqrt_exp1;
293 int ind2, sqrt_exp2;
294 double xx0, dexp_hi0, dexp_lo0;
295 double xx1, dexp_hi1, dexp_lo1;
296 double xx2, dexp_hi2, dexp_lo2;
297 int hx0, resh0, res_ch0;
298 int hx1, resh1, res_ch1;
299 int hx2, resh2, res_ch2;
300
301 LO(&dsqrt_exp0) = 0;
302 LO(&dsqrt_exp1) = 0;
303 LO(&dsqrt_exp2) = 0;
304 LO(&res_c0) = 0;
305 LO(&res_c1) = 0;
306 LO(&res_c2) = 0;
307
308 for (; n > 2; n -= 3) {
309 hx0 = HI(px);
310 LO(&res0) = LO(px);
311 px += stridex;
312
313 hx1 = HI(px);
314 LO(&res1) = LO(px);
315 px += stridex;
316
317 hx2 = HI(px);
318 LO(&res2) = LO(px);
319 px += stridex;
320
321 sqrt_exp0 = (0x5fe - (hx0 >> 21)) << 20;
322 sqrt_exp1 = (0x5fe - (hx1 >> 21)) << 20;
323 sqrt_exp2 = (0x5fe - (hx2 >> 21)) << 20;
324 ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16;
325 ind1 = (((hx1 >> 10) & 0x7f8) + 8) & -16;
326 ind2 = (((hx2 >> 10) & 0x7f8) + 8) & -16;
327
328 resh0 = (hx0 & 0x001fffff) | 0x3fe00000;
329 resh1 = (hx1 & 0x001fffff) | 0x3fe00000;
330 resh2 = (hx2 & 0x001fffff) | 0x3fe00000;
331 res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
332 res_ch1 = (resh1 + 0x00002000) & 0x7fffc000;
333 res_ch2 = (resh2 + 0x00002000) & 0x7fffc000;
334 HI(&res0) = resh0;
335 HI(&res1) = resh1;
336 HI(&res2) = resh2;
337 HI(&res_c0) = res_ch0;
338 HI(&res_c1) = res_ch1;
339 HI(&res_c2) = res_ch2;
340
341 dexp_hi0 = ((double *)((char *)__vlibm_TBL_rsqrt + ind0))[0];
342 dexp_hi1 = ((double *)((char *)__vlibm_TBL_rsqrt + ind1))[0];
343 dexp_hi2 = ((double *)((char *)__vlibm_TBL_rsqrt + ind2))[0];
344 dexp_lo0 = ((double *)((char *)__vlibm_TBL_rsqrt + ind0))[1];
345 dexp_lo1 = ((double *)((char *)__vlibm_TBL_rsqrt + ind1))[1];
346 dexp_lo2 = ((double *)((char *)__vlibm_TBL_rsqrt + ind2))[1];
347 xx0 = dexp_hi0 * dexp_hi0;
348 xx1 = dexp_hi1 * dexp_hi1;
349 xx2 = dexp_hi2 * dexp_hi2;
350 xx0 = (res0 - res_c0) * xx0;
351 xx1 = (res1 - res_c1) * xx1;
352 xx2 = (res2 - res_c2) * xx2;
353 res0 = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) *
354 xx0 + K2) * xx0 + K1) * xx0;
355 res1 = (((((K6 * xx1 + K5) * xx1 + K4) * xx1 + K3) *
356 xx1 + K2) * xx1 + K1) * xx1;
357 res2 = (((((K6 * xx2 + K5) * xx2 + K4) * xx2 + K3) *
358 xx2 + K2) * xx2 + K1) * xx2;
359
360 res0 = dexp_hi0 * res0 + dexp_lo0 + dexp_hi0;
361 res1 = dexp_hi1 * res1 + dexp_lo1 + dexp_hi1;
362 res2 = dexp_hi2 * res2 + dexp_lo2 + dexp_hi2;
363
364 HI(&dsqrt_exp0) = sqrt_exp0;
365 HI(&dsqrt_exp1) = sqrt_exp1;
366 HI(&dsqrt_exp2) = sqrt_exp2;
367 res0 *= dsqrt_exp0;
368 res1 *= dsqrt_exp1;
369 res2 *= dsqrt_exp2;
370
371 *py = res0;
372 py += stridey;
373
374 *py = res1;
375 py += stridey;
376
377 *py = res2;
378 py += stridey;
379 }
380
381 for (; n > 0; n--) {
382 hx0 = HI(px);
383
384 sqrt_exp0 = (0x5fe - (hx0 >> 21)) << 20;
385 ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16;
386
387 resh0 = (hx0 & 0x001fffff) | 0x3fe00000;
388 res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
389 HI(&res0) = resh0;
390 LO(&res0) = LO(px);
391 HI(&res_c0) = res_ch0;
392 LO(&res_c0) = 0;
393
394 px += stridex;
395
396 dexp_hi0 = ((double *)((char *)__vlibm_TBL_rsqrt + ind0))[0];
397 dexp_lo0 = ((double *)((char *)__vlibm_TBL_rsqrt + ind0))[1];
398 xx0 = dexp_hi0 * dexp_hi0;
399 xx0 = (res0 - res_c0) * xx0;
400 res0 = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) *
401 xx0 + K2) * xx0 + K1) * xx0;
402
403 res0 = dexp_hi0 * res0 + dexp_lo0 + dexp_hi0;
404
405 HI(&dsqrt_exp0) = sqrt_exp0;
406 LO(&dsqrt_exp0) = 0;
407 res0 *= dsqrt_exp0;
408
409 *py = res0;
410 py += stridey;
411 }
412 }
413