xref: /freebsd/contrib/arm-optimized-routines/pl/math/erfinvf_4u7.c (revision 5a02ffc32e777041dd2dad4e651ed2a0865a0a5d)
1*5a02ffc3SAndrew Turner /*
2*5a02ffc3SAndrew Turner  * Single-precision inverse error function.
3*5a02ffc3SAndrew Turner  *
4*5a02ffc3SAndrew Turner  * Copyright (c) 2023, Arm Limited.
5*5a02ffc3SAndrew Turner  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6*5a02ffc3SAndrew Turner  */
7*5a02ffc3SAndrew Turner #include "poly_scalar_f32.h"
8*5a02ffc3SAndrew Turner #include "math_config.h"
9*5a02ffc3SAndrew Turner #include "pl_sig.h"
10*5a02ffc3SAndrew Turner #include "pl_test.h"
11*5a02ffc3SAndrew Turner 
12*5a02ffc3SAndrew Turner const static struct
13*5a02ffc3SAndrew Turner {
14*5a02ffc3SAndrew Turner   /*  We use P_N and Q_N to refer to arrays of coefficients, where P_N is the
15*5a02ffc3SAndrew Turner       coeffs of the numerator in table N of Blair et al, and Q_N is the coeffs
16*5a02ffc3SAndrew Turner       of the denominator.  */
17*5a02ffc3SAndrew Turner   float P_10[3], Q_10[4], P_29[4], Q_29[4], P_50[6], Q_50[3];
18*5a02ffc3SAndrew Turner } data = { .P_10 = { -0x1.a31268p+3, 0x1.ac9048p+4, -0x1.293ff6p+3 },
19*5a02ffc3SAndrew Turner 	   .Q_10 = { -0x1.8265eep+3, 0x1.ef5eaep+4, -0x1.12665p+4, 0x1p+0 },
20*5a02ffc3SAndrew Turner 	   .P_29
21*5a02ffc3SAndrew Turner 	   = { -0x1.fc0252p-4, 0x1.119d44p+0, -0x1.f59ee2p+0, 0x1.b13626p-2 },
22*5a02ffc3SAndrew Turner 	   .Q_29 = { -0x1.69952p-4, 0x1.c7b7d2p-1, -0x1.167d7p+1, 0x1p+0 },
23*5a02ffc3SAndrew Turner 	   .P_50 = { 0x1.3d8948p-3, 0x1.61f9eap+0, 0x1.61c6bcp-1,
24*5a02ffc3SAndrew Turner 		     -0x1.20c9f2p+0, 0x1.5c704cp-1, -0x1.50c6bep-3 },
25*5a02ffc3SAndrew Turner 	   .Q_50 = { 0x1.3d7dacp-3, 0x1.629e5p+0, 0x1p+0 } };
26*5a02ffc3SAndrew Turner 
27*5a02ffc3SAndrew Turner /* Inverse error function approximation, based on rational approximation as
28*5a02ffc3SAndrew Turner    described in
29*5a02ffc3SAndrew Turner    J. M. Blair, C. A. Edwards, and J. H. Johnson,
30*5a02ffc3SAndrew Turner    "Rational Chebyshev approximations for the inverse of the error function",
31*5a02ffc3SAndrew Turner    Math. Comp. 30, pp. 827--830 (1976).
32*5a02ffc3SAndrew Turner    https://doi.org/10.1090/S0025-5718-1976-0421040-7
33*5a02ffc3SAndrew Turner    Largest error is 4.71 ULP, in the tail region:
34*5a02ffc3SAndrew Turner    erfinvf(0x1.f84e9ap-1) got 0x1.b8326ap+0
35*5a02ffc3SAndrew Turner 			 want 0x1.b83274p+0.  */
36*5a02ffc3SAndrew Turner float
erfinvf(float x)37*5a02ffc3SAndrew Turner erfinvf (float x)
38*5a02ffc3SAndrew Turner {
39*5a02ffc3SAndrew Turner   if (x == 1.0f)
40*5a02ffc3SAndrew Turner     return __math_oflowf (0);
41*5a02ffc3SAndrew Turner   if (x == -1.0f)
42*5a02ffc3SAndrew Turner     return __math_oflowf (1);
43*5a02ffc3SAndrew Turner 
44*5a02ffc3SAndrew Turner   float a = fabsf (x);
45*5a02ffc3SAndrew Turner   if (a > 1.0f)
46*5a02ffc3SAndrew Turner     return __math_invalidf (x);
47*5a02ffc3SAndrew Turner 
48*5a02ffc3SAndrew Turner   if (a <= 0.75f)
49*5a02ffc3SAndrew Turner     {
50*5a02ffc3SAndrew Turner       /* Greatest error in this region is 4.60 ULP:
51*5a02ffc3SAndrew Turner 	 erfinvf(0x1.0a98bap-5) got 0x1.d8a93ep-6
52*5a02ffc3SAndrew Turner 			       want 0x1.d8a948p-6.  */
53*5a02ffc3SAndrew Turner       float t = x * x - 0.5625f;
54*5a02ffc3SAndrew Turner       return x * horner_2_f32 (t, data.P_10) / horner_3_f32 (t, data.Q_10);
55*5a02ffc3SAndrew Turner     }
56*5a02ffc3SAndrew Turner   if (a < 0.9375f)
57*5a02ffc3SAndrew Turner     {
58*5a02ffc3SAndrew Turner       /* Greatest error in this region is 3.79 ULP:
59*5a02ffc3SAndrew Turner 	 erfinvf(0x1.ac82d6p-1) got 0x1.f8fc54p-1
60*5a02ffc3SAndrew Turner 			       want 0x1.f8fc5cp-1.  */
61*5a02ffc3SAndrew Turner       float t = x * x - 0.87890625f;
62*5a02ffc3SAndrew Turner       return x * horner_3_f32 (t, data.P_29) / horner_3_f32 (t, data.Q_29);
63*5a02ffc3SAndrew Turner     }
64*5a02ffc3SAndrew Turner 
65*5a02ffc3SAndrew Turner   /* Tail region, where error is greatest (and sensitive to sqrt and log1p
66*5a02ffc3SAndrew Turner      implementations.  */
67*5a02ffc3SAndrew Turner   float t = 1.0 / sqrtf (-log1pf (-a));
68*5a02ffc3SAndrew Turner   return horner_5_f32 (t, data.P_50)
69*5a02ffc3SAndrew Turner 	 / (copysignf (t, x) * horner_2_f32 (t, data.Q_50));
70*5a02ffc3SAndrew Turner }
71*5a02ffc3SAndrew Turner 
72*5a02ffc3SAndrew Turner PL_SIG (S, F, 1, erfinv, -0.99, 0.99)
73*5a02ffc3SAndrew Turner PL_TEST_ULP (erfinvf, 4.09)
74*5a02ffc3SAndrew Turner PL_TEST_SYM_INTERVAL (erfinvf, 0, 1, 40000)
75