1*3085fc9dSSteve Kargl /*-
2*3085fc9dSSteve Kargl * Copyright (c) 2026 Steven G. Kargl
3*3085fc9dSSteve Kargl * All rights reserved.
4*3085fc9dSSteve Kargl *
5*3085fc9dSSteve Kargl * Redistribution and use in source and binary forms, with or without
6*3085fc9dSSteve Kargl * modification, are permitted provided that the following conditions
7*3085fc9dSSteve Kargl * are met:
8*3085fc9dSSteve Kargl * 1. Redistributions of source code must retain the above copyright
9*3085fc9dSSteve Kargl * notice unmodified, this list of conditions, and the following
10*3085fc9dSSteve Kargl * disclaimer.
11*3085fc9dSSteve Kargl * 2. Redistributions in binary form must reproduce the above copyright
12*3085fc9dSSteve Kargl * notice, this list of conditions and the following disclaimer in the
13*3085fc9dSSteve Kargl * documentation and/or other materials provided with the distribution.
14*3085fc9dSSteve Kargl *
15*3085fc9dSSteve Kargl * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16*3085fc9dSSteve Kargl * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17*3085fc9dSSteve Kargl * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18*3085fc9dSSteve Kargl * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19*3085fc9dSSteve Kargl * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20*3085fc9dSSteve Kargl * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21*3085fc9dSSteve Kargl * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22*3085fc9dSSteve Kargl * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23*3085fc9dSSteve Kargl * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24*3085fc9dSSteve Kargl * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25*3085fc9dSSteve Kargl */
26*3085fc9dSSteve Kargl
27*3085fc9dSSteve Kargl /**
28*3085fc9dSSteve Kargl * Compute the inverse sqrt of x, i.e., rsqrt(x) = 1 / sqrt(x).
29*3085fc9dSSteve Kargl *
30*3085fc9dSSteve Kargl * First, filter out special cases:
31*3085fc9dSSteve Kargl *
32*3085fc9dSSteve Kargl * 1. rsqrt(+-0) = +-inf, and raise FE_DIVBYZERO exception.
33*3085fc9dSSteve Kargl * 2. rsqrt(nan) = NaN.
34*3085fc9dSSteve Kargl * 3. rsqrt(+inf) returns +0.
35*3085fc9dSSteve Kargl * 2. rsqrt(x<0) = NaN, and raises FE_INVALID.
36*3085fc9dSSteve Kargl *
37*3085fc9dSSteve Kargl * If x is a subnormal, scale x into the normal range by x*0x1pN; while
38*3085fc9dSSteve Kargl * recording the exponent of the scale factor N. Split the possibly
39*3085fc9dSSteve Kargl * scaled x into f*2^n with f in [0.5,1). Set m=n or m=n-N (subnormal).
40*3085fc9dSSteve Kargl * If n is odd, then set f = f/2 and increase n to n+1. Thus, f is
41*3085fc9dSSteve Kargl * in [0.25,1) with n even.
42*3085fc9dSSteve Kargl *
43*3085fc9dSSteve Kargl * An initial estimate of y = rqrt[f](x) is 1 / sqrt[f](x). Exhaustive
44*3085fc9dSSteve Kargl * testing of rsqrtf() gave a max ULP of 1.49; while testing 500M x in
45*3085fc9dSSteve Kargl * [0,1000] gave a max ULP of 1.24 for rsqrt(). The value of y is then
46*3085fc9dSSteve Kargl * used with one iteration of Goldschmidt's algorithm:
47*3085fc9dSSteve Kargl *
48*3085fc9dSSteve Kargl * z = x * y
49*3085fc9dSSteve Kargl * h = y / 2
50*3085fc9dSSteve Kargl * r = 0.5 - h * z
51*3085fc9dSSteve Kargl * y = h * r + h
52*3085fc9dSSteve Kargl *
53*3085fc9dSSteve Kargl * A factor of 2 appears missing in the above, but it is included in the
54*3085fc9dSSteve Kargl * exponent m.
55*3085fc9dSSteve Kargl */
56*3085fc9dSSteve Kargl #include <fenv.h>
57*3085fc9dSSteve Kargl #include <float.h>
58*3085fc9dSSteve Kargl #include "math.h"
59*3085fc9dSSteve Kargl #include "math_private.h"
60*3085fc9dSSteve Kargl
61*3085fc9dSSteve Kargl #pragma STDC FENV_ACCESS ON
62*3085fc9dSSteve Kargl
63*3085fc9dSSteve Kargl #ifdef _CC
64*3085fc9dSSteve Kargl #undef _CC
65*3085fc9dSSteve Kargl #endif
66*3085fc9dSSteve Kargl #define _CC (0x1p12F + 1)
67*3085fc9dSSteve Kargl
68*3085fc9dSSteve Kargl float
rsqrtf(float x)69*3085fc9dSSteve Kargl rsqrtf(float x)
70*3085fc9dSSteve Kargl {
71*3085fc9dSSteve Kargl volatile static const float vzero = 0;
72*3085fc9dSSteve Kargl static const float half = 0.5;
73*3085fc9dSSteve Kargl uint32_t ix, ux;
74*3085fc9dSSteve Kargl int m, rnd;
75*3085fc9dSSteve Kargl float h, ph, pl, rh, rl, y, zh, zl;
76*3085fc9dSSteve Kargl
77*3085fc9dSSteve Kargl GET_FLOAT_WORD(ix, x);
78*3085fc9dSSteve Kargl ux = ix & 0x7fffffff;
79*3085fc9dSSteve Kargl
80*3085fc9dSSteve Kargl /* x = +-0. Raise exception. */
81*3085fc9dSSteve Kargl if (ux == 0)
82*3085fc9dSSteve Kargl return (1 / x);
83*3085fc9dSSteve Kargl
84*3085fc9dSSteve Kargl /* x is NaN. */
85*3085fc9dSSteve Kargl if (ux > 0x7f800000)
86*3085fc9dSSteve Kargl return (x + x);
87*3085fc9dSSteve Kargl
88*3085fc9dSSteve Kargl /* x is +-inf. */
89*3085fc9dSSteve Kargl if (ux == 0x7f800000)
90*3085fc9dSSteve Kargl return (ix & 0x80000000 ? vzero / vzero : 0.F);
91*3085fc9dSSteve Kargl
92*3085fc9dSSteve Kargl /* x < 0. Raise exception. */
93*3085fc9dSSteve Kargl if (ix & 0x80000000)
94*3085fc9dSSteve Kargl return (vzero / vzero);
95*3085fc9dSSteve Kargl
96*3085fc9dSSteve Kargl /*
97*3085fc9dSSteve Kargl * If x is subnormal, then scale it into the normal range.
98*3085fc9dSSteve Kargl * Split x into significand and exponent, x = f * 2^m, with
99*3085fc9dSSteve Kargl * f in [0.5,1) and m a biased exponent.
100*3085fc9dSSteve Kargl */
101*3085fc9dSSteve Kargl m = 0;
102*3085fc9dSSteve Kargl if (ux < 0x00800000) { /* Subnormal */
103*3085fc9dSSteve Kargl x *= 0x1p25f;
104*3085fc9dSSteve Kargl GET_FLOAT_WORD(ix, x);
105*3085fc9dSSteve Kargl m = -25;
106*3085fc9dSSteve Kargl }
107*3085fc9dSSteve Kargl m += (ix >> 23) - 126; /* Unbiased exponent */
108*3085fc9dSSteve Kargl ix = (ix & 0x007fffff) | 0x3f000000;
109*3085fc9dSSteve Kargl SET_FLOAT_WORD(x, ix); /* x is in [0.5,1). */
110*3085fc9dSSteve Kargl
111*3085fc9dSSteve Kargl /* m is odd. Put x into [0.25,5) and increase m. */
112*3085fc9dSSteve Kargl if (m & 1) {
113*3085fc9dSSteve Kargl x /= 2;
114*3085fc9dSSteve Kargl m += 1;
115*3085fc9dSSteve Kargl }
116*3085fc9dSSteve Kargl m = -(m >> 1); /* Prepare for 2^(-m/2). */
117*3085fc9dSSteve Kargl
118*3085fc9dSSteve Kargl /*
119*3085fc9dSSteve Kargl * Exhaustive testing of rsqrtf(x) = 1 / sqrtf(x) with x in
120*3085fc9dSSteve Kargl * [0x1p-127,0x1p126] shows the this approximation gives a
121*3085fc9dSSteve Kargl * 22- to 23-bit estimate of rsqrt(f). This is equivalent to
122*3085fc9dSSteve Kargl * a max ulp of ~1.49.
123*3085fc9dSSteve Kargl */
124*3085fc9dSSteve Kargl y = 1 / sqrtf(x);
125*3085fc9dSSteve Kargl
126*3085fc9dSSteve Kargl h = y / 2;
127*3085fc9dSSteve Kargl
128*3085fc9dSSteve Kargl /*
129*3085fc9dSSteve Kargl * For values of x with a representation of 0x1.fffffcpN with
130*3085fc9dSSteve Kargl * N an odd integer, the computed rsqrtf() is not correctly
131*3085fc9dSSteve Kargl * rounded in round-to-nearest without toggling the rounding
132*3085fc9dSSteve Kargl * mode to FE_TOWARDZERO. Note, FE_DOWNWARD also works.
133*3085fc9dSSteve Kargl * However, messing with the rounding mode is expensive, so
134*3085fc9dSSteve Kargl * only do it when necessary. Example, x = 0x1.fffffcp3 gives
135*3085fc9dSSteve Kargl * y --> 0x3f800001.
136*3085fc9dSSteve Kargl */
137*3085fc9dSSteve Kargl GET_FLOAT_WORD(ix, y);
138*3085fc9dSSteve Kargl if ((ix & 0x000fffff) == 1) {
139*3085fc9dSSteve Kargl rnd = fegetround();
140*3085fc9dSSteve Kargl fesetround(FE_TOWARDZERO);
141*3085fc9dSSteve Kargl _MUL(x, y, zh, zl);
142*3085fc9dSSteve Kargl _XMUL(zh, zl, h, 0, ph, pl);
143*3085fc9dSSteve Kargl fesetround(rnd);
144*3085fc9dSSteve Kargl } else {
145*3085fc9dSSteve Kargl _MUL(x, y, zh, zl);
146*3085fc9dSSteve Kargl _XMUL(zh, zl, h, 0, ph, pl);
147*3085fc9dSSteve Kargl }
148*3085fc9dSSteve Kargl
149*3085fc9dSSteve Kargl _XADD(-ph, -pl, half, 0, rh, rl);
150*3085fc9dSSteve Kargl y = h * rh + h;
151*3085fc9dSSteve Kargl
152*3085fc9dSSteve Kargl ix = (uint32_t)(m + 128) << 23;
153*3085fc9dSSteve Kargl SET_FLOAT_WORD(x, ix);
154*3085fc9dSSteve Kargl return (y *= x);
155*3085fc9dSSteve Kargl }
156