xref: /freebsd/contrib/arm-optimized-routines/pl/math/v_erfinv_25u.c (revision 357378bbdedf24ce2b90e9bd831af4a9db3ec70a)
1 /*
2  * Double-precision inverse error function (AdvSIMD variant).
3  *
4  * Copyright (c) 2023, Arm Limited.
5  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6  */
7 #include "v_math.h"
8 #include "pl_test.h"
9 #include "mathlib.h"
10 #include "math_config.h"
11 #include "pl_sig.h"
12 #include "poly_advsimd_f64.h"
13 #define V_LOG_INLINE_POLY_ORDER 4
14 #include "v_log_inline.h"
15 
16 const static struct data
17 {
18   /*  We use P_N and Q_N to refer to arrays of coefficients, where P_N is the
19       coeffs of the numerator in table N of Blair et al, and Q_N is the coeffs
20       of the denominator. P is interleaved P_17 and P_37, similar for Q. P17
21       and Q17 are provided as homogenous vectors as well for when the shortcut
22       can be taken.  */
23   double P[8][2], Q[7][2];
24   float64x2_t tailshift;
25   uint8x16_t idx;
26   struct v_log_inline_data log_tbl;
27   float64x2_t P_57[9], Q_57[10], P_17[7], Q_17[6];
28 } data = { .P = { { 0x1.007ce8f01b2e8p+4, -0x1.f3596123109edp-7 },
29 		  { -0x1.6b23cc5c6c6d7p+6, 0x1.60b8fe375999ep-2 },
30 		  { 0x1.74e5f6ceb3548p+7, -0x1.779bb9bef7c0fp+1 },
31 		  { -0x1.5200bb15cc6bbp+7, 0x1.786ea384470a2p+3 },
32 		  { 0x1.05d193233a849p+6, -0x1.6a7c1453c85d3p+4 },
33 		  { -0x1.148c5474ee5e1p+3, 0x1.31f0fc5613142p+4 },
34 		  { 0x1.689181bbafd0cp-3, -0x1.5ea6c007d4dbbp+2 },
35 		  { 0, 0x1.e66f265ce9e5p-3 } },
36 	   .Q = { { 0x1.d8fb0f913bd7bp+3, -0x1.636b2dcf4edbep-7 },
37 		  { -0x1.6d7f25a3f1c24p+6, 0x1.0b5411e2acf29p-2 },
38 		  { 0x1.a450d8e7f4cbbp+7, -0x1.3413109467a0bp+1 },
39 		  { -0x1.bc3480485857p+7, 0x1.563e8136c554ap+3 },
40 		  { 0x1.ae6b0c504ee02p+6, -0x1.7b77aab1dcafbp+4 },
41 		  { -0x1.499dfec1a7f5fp+4, 0x1.8a3e174e05ddcp+4 },
42 		  { 0x1p+0, -0x1.4075c56404eecp+3 } },
43 	   .P_57 = { V2 (0x1.b874f9516f7f1p-14), V2 (0x1.5921f2916c1c4p-7),
44 		     V2 (0x1.145ae7d5b8fa4p-2), V2 (0x1.29d6dcc3b2fb7p+1),
45 		     V2 (0x1.cabe2209a7985p+2), V2 (0x1.11859f0745c4p+3),
46 		     V2 (0x1.b7ec7bc6a2ce5p+2), V2 (0x1.d0419e0bb42aep+1),
47 		     V2 (0x1.c5aa03eef7258p-1) },
48 	   .Q_57 = { V2 (0x1.b8747e12691f1p-14), V2 (0x1.59240d8ed1e0ap-7),
49 		     V2 (0x1.14aef2b181e2p-2), V2 (0x1.2cd181bcea52p+1),
50 		     V2 (0x1.e6e63e0b7aa4cp+2), V2 (0x1.65cf8da94aa3ap+3),
51 		     V2 (0x1.7e5c787b10a36p+3), V2 (0x1.0626d68b6cea3p+3),
52 		     V2 (0x1.065c5f193abf6p+2), V2 (0x1p+0) },
53 	   .P_17 = { V2 (0x1.007ce8f01b2e8p+4), V2 (-0x1.6b23cc5c6c6d7p+6),
54 		     V2 (0x1.74e5f6ceb3548p+7), V2 (-0x1.5200bb15cc6bbp+7),
55 		     V2 (0x1.05d193233a849p+6), V2 (-0x1.148c5474ee5e1p+3),
56 		     V2 (0x1.689181bbafd0cp-3) },
57 	   .Q_17 = { V2 (0x1.d8fb0f913bd7bp+3), V2 (-0x1.6d7f25a3f1c24p+6),
58 		     V2 (0x1.a450d8e7f4cbbp+7), V2 (-0x1.bc3480485857p+7),
59 		     V2 (0x1.ae6b0c504ee02p+6), V2 (-0x1.499dfec1a7f5fp+4) },
60 	   .tailshift = V2 (-0.87890625),
61 	   .idx = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
62 	   .log_tbl = V_LOG_CONSTANTS };
63 
64 static inline float64x2_t
65 special (float64x2_t x, const struct data *d)
66 {
67   /* Note erfinv(inf) should return NaN, and erfinv(1) should return Inf.
68      By using log here, instead of log1p, we return finite values for both
69      these inputs, and values outside [-1, 1]. This is non-compliant, but is an
70      acceptable optimisation at Ofast. To get correct behaviour for all finite
71      values use the log1p_inline helper on -abs(x) - note that erfinv(inf)
72      will still be finite.  */
73   float64x2_t t = vnegq_f64 (
74       v_log_inline (vsubq_f64 (v_f64 (1), vabsq_f64 (x)), &d->log_tbl));
75   t = vdivq_f64 (v_f64 (1), vsqrtq_f64 (t));
76   float64x2_t ts = vbslq_f64 (v_u64 (0x7fffffffffffffff), t, x);
77   return vdivq_f64 (v_horner_8_f64 (t, d->P_57),
78 		    vmulq_f64 (ts, v_horner_9_f64 (t, d->Q_57)));
79 }
80 
81 static inline float64x2_t
82 lookup (const double *c, uint8x16_t idx)
83 {
84   float64x2_t x = vld1q_f64 (c);
85   return vreinterpretq_f64_u8 (vqtbl1q_u8 (vreinterpretq_u8_f64 (x), idx));
86 }
87 
88 static inline float64x2_t VPCS_ATTR
89 notails (float64x2_t x, const struct data *d)
90 {
91   /* Shortcut when no input is in a tail region - no need to gather shift or
92      coefficients.  */
93   float64x2_t t = vfmaq_f64 (v_f64 (-0.5625), x, x);
94   float64x2_t p = vmulq_f64 (v_horner_6_f64 (t, d->P_17), x);
95   float64x2_t q = vaddq_f64 (d->Q_17[5], t);
96   for (int i = 4; i >= 0; i--)
97     q = vfmaq_f64 (d->Q_17[i], q, t);
98   return vdivq_f64 (p, q);
99 }
100 
101 /* Vector implementation of Blair et al's rational approximation to inverse
102    error function in single-precision. Largest observed error is 24.75 ULP:
103    _ZGVnN2v_erfinv(0x1.fc861d81c2ba8p-1) got 0x1.ea05472686625p+0
104 					want 0x1.ea0547268660cp+0.  */
105 float64x2_t VPCS_ATTR V_NAME_D1 (erfinv) (float64x2_t x)
106 {
107   const struct data *d = ptr_barrier (&data);
108   /* Calculate inverse error using algorithm described in
109      J. M. Blair, C. A. Edwards, and J. H. Johnson,
110      "Rational Chebyshev approximations for the inverse of the error function",
111      Math. Comp. 30, pp. 827--830 (1976).
112      https://doi.org/10.1090/S0025-5718-1976-0421040-7.
113 
114      Algorithm has 3 intervals:
115      - 'Normal' region [-0.75, 0.75]
116      - Tail region [0.75, 0.9375] U [-0.9375, -0.75]
117      - Extreme tail [-1, -0.9375] U [0.9375, 1]
118      Normal and tail are both rational approximation of similar order on
119      shifted input - these are typically performed in parallel using gather
120      loads to obtain correct coefficients depending on interval.  */
121   uint64x2_t is_tail = vcagtq_f64 (x, v_f64 (0.75));
122 
123   if (unlikely (!v_any_u64 (is_tail)))
124     /* If input is normally distributed in [-1, 1] then likelihood of this is
125        0.75^2 ~= 0.56.  */
126     return notails (x, d);
127 
128   uint64x2_t extreme_tail = vcagtq_f64 (x, v_f64 (0.9375));
129 
130   uint8x16_t off = vandq_u8 (vreinterpretq_u8_u64 (is_tail), vdupq_n_u8 (8));
131   uint8x16_t idx = vaddq_u8 (d->idx, off);
132 
133   float64x2_t t = vbslq_f64 (is_tail, d->tailshift, v_f64 (-0.5625));
134   t = vfmaq_f64 (t, x, x);
135 
136   float64x2_t p = lookup (&d->P[7][0], idx);
137   /* Last coeff of q is either 0 or 1 - use mask instead of load.  */
138   float64x2_t q = vreinterpretq_f64_u64 (
139       vandq_u64 (is_tail, vreinterpretq_u64_f64 (v_f64 (1))));
140   for (int i = 6; i >= 0; i--)
141     {
142       p = vfmaq_f64 (lookup (&d->P[i][0], idx), p, t);
143       q = vfmaq_f64 (lookup (&d->Q[i][0], idx), q, t);
144     }
145   p = vmulq_f64 (p, x);
146 
147   if (unlikely (v_any_u64 (extreme_tail)))
148     return vbslq_f64 (extreme_tail, special (x, d), vdivq_f64 (p, q));
149 
150   return vdivq_f64 (p, q);
151 }
152 
153 PL_SIG (V, D, 1, erfinv, -0.99, 0.99)
154 PL_TEST_ULP (V_NAME_D1 (erfinv), 24.8)
155 /* Test with control lane in each interval.  */
156 PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (erfinv), 0, 0x1.fffffffffffffp-1, 100000,
157 			0.5)
158 PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (erfinv), 0, 0x1.fffffffffffffp-1, 100000,
159 			0.8)
160 PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (erfinv), 0, 0x1.fffffffffffffp-1, 100000,
161 			0.95)
162