1 /*
2 * Single-precision inverse error function (SVE variant).
3 *
4 * Copyright (c) 2024, Arm Limited.
5 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6 */
7 #include "sv_math.h"
8 #include "test_sig.h"
9 #include "test_defs.h"
10 #include "sv_poly_f32.h"
11 #include "sv_logf_inline.h"
12
13 const static struct data
14 {
15 /* We use P_N and Q_N to refer to arrays of coefficients, where P_N
16 is the coeffs of the numerator in table N of Blair et al, and
17 Q_N is the coeffs of the denominator. Coefficients stored in
18 interleaved format to support lookup scheme. */
19 float P10_2, P29_3, Q10_2, Q29_2;
20 float P10_0, P29_1, P10_1, P29_2;
21 float Q10_0, Q29_0, Q10_1, Q29_1;
22 float P29_0, P_50[6], Q_50[2], tailshift;
23 struct sv_logf_data logf_tbl;
24 } data = { .P10_0 = -0x1.a31268p+3,
25 .P10_1 = 0x1.ac9048p+4,
26 .P10_2 = -0x1.293ff6p+3,
27 .P29_0 = -0x1.fc0252p-4,
28 .P29_1 = 0x1.119d44p+0,
29 .P29_2 = -0x1.f59ee2p+0,
30 .P29_3 = 0x1.b13626p-2,
31 .Q10_0 = -0x1.8265eep+3,
32 .Q10_1 = 0x1.ef5eaep+4,
33 .Q10_2 = -0x1.12665p+4,
34 .Q29_0 = -0x1.69952p-4,
35 .Q29_1 = 0x1.c7b7d2p-1,
36 .Q29_2 = -0x1.167d7p+1,
37 .P_50 = { 0x1.3d8948p-3, 0x1.61f9eap+0, 0x1.61c6bcp-1,
38 -0x1.20c9f2p+0, 0x1.5c704cp-1, -0x1.50c6bep-3 },
39 .Q_50 = { 0x1.3d7dacp-3, 0x1.629e5p+0 },
40 .tailshift = -0.87890625,
41 .logf_tbl = SV_LOGF_CONSTANTS };
42
43 static inline svfloat32_t
special(svbool_t pg,svfloat32_t x,const struct data * d)44 special (svbool_t pg, svfloat32_t x, const struct data *d)
45 {
46 svfloat32_t ax = svabs_x (pg, x);
47 svfloat32_t t = svdivr_x (
48 pg,
49 svsqrt_x (pg, svneg_x (pg, sv_logf_inline (pg, svsubr_x (pg, ax, 1),
50 &d->logf_tbl))),
51 1);
52 svuint32_t sign
53 = sveor_x (pg, svreinterpret_u32 (ax), svreinterpret_u32 (x));
54 svfloat32_t ts
55 = svreinterpret_f32 (svorr_x (pg, sign, svreinterpret_u32 (t)));
56 svfloat32_t q
57 = svmla_x (pg, sv_f32 (d->Q_50[0]), svadd_x (pg, t, d->Q_50[1]), t);
58 return svdiv_x (pg, sv_horner_5_f32_x (pg, t, d->P_50), svmul_x (pg, ts, q));
59 }
60
61 static inline svfloat32_t
notails(svbool_t pg,svfloat32_t x,const struct data * d)62 notails (svbool_t pg, svfloat32_t x, const struct data *d)
63 {
64 /* Shortcut when no input is in a tail region - no need to gather shift or
65 coefficients. */
66 svfloat32_t t = svmad_x (pg, x, x, -0.5625);
67 svfloat32_t q = svadd_x (pg, t, d->Q10_2);
68 q = svmad_x (pg, t, q, d->Q10_1);
69 q = svmad_x (pg, t, q, d->Q10_0);
70
71 svfloat32_t p = svmla_x (pg, sv_f32 (d->P10_1), t, d->P10_2);
72 p = svmad_x (pg, p, t, d->P10_0);
73
74 return svdiv_x (pg, svmul_x (pg, x, p), q);
75 }
76
77 /* Vector implementation of Blair et al's rational approximation to inverse
78 error function in single-precision. Worst-case error is 4.71 ULP, in the
79 tail region:
80 _ZGVsMxv_erfinvf(0x1.f84e9ap-1) got 0x1.b8326ap+0
81 want 0x1.b83274p+0. */
SV_NAME_F1(erfinv)82 svfloat32_t SV_NAME_F1 (erfinv) (svfloat32_t x, svbool_t pg)
83 {
84 const struct data *d = ptr_barrier (&data);
85
86 /* Calculate inverse error using algorithm described in
87 J. M. Blair, C. A. Edwards, and J. H. Johnson,
88 "Rational Chebyshev approximations for the inverse of the error function",
89 Math. Comp. 30, pp. 827--830 (1976).
90 https://doi.org/10.1090/S0025-5718-1976-0421040-7. */
91
92 /* Algorithm has 3 intervals:
93 - 'Normal' region [-0.75, 0.75]
94 - Tail region [0.75, 0.9375] U [-0.9375, -0.75]
95 - Extreme tail [-1, -0.9375] U [0.9375, 1]
96 Normal and tail are both rational approximation of similar order on
97 shifted input - these are typically performed in parallel using gather
98 loads to obtain correct coefficients depending on interval. */
99 svbool_t is_tail = svacge (pg, x, 0.75);
100 svbool_t extreme_tail = svacge (pg, x, 0.9375);
101
102 if (likely (!svptest_any (pg, is_tail)))
103 return notails (pg, x, d);
104
105 /* Select requisite shift depending on interval: polynomial is evaluated on
106 x * x - shift.
107 Normal shift = 0.5625
108 Tail shift = 0.87890625. */
109 svfloat32_t t = svmla_x (
110 pg, svsel (is_tail, sv_f32 (d->tailshift), sv_f32 (-0.5625)), x, x);
111
112 svuint32_t idx = svdup_u32_z (is_tail, 1);
113 svuint32_t idxhi = svadd_x (pg, idx, 2);
114
115 /* Load coeffs in quadwords and select them according to interval. */
116 svfloat32_t pqhi = svld1rq (svptrue_b32 (), &d->P10_2);
117 svfloat32_t plo = svld1rq (svptrue_b32 (), &d->P10_0);
118 svfloat32_t qlo = svld1rq (svptrue_b32 (), &d->Q10_0);
119
120 svfloat32_t p2 = svtbl (pqhi, idx);
121 svfloat32_t p1 = svtbl (plo, idxhi);
122 svfloat32_t p0 = svtbl (plo, idx);
123 svfloat32_t q0 = svtbl (qlo, idx);
124 svfloat32_t q1 = svtbl (qlo, idxhi);
125 svfloat32_t q2 = svtbl (pqhi, idxhi);
126
127 svfloat32_t p = svmla_x (pg, p1, p2, t);
128 p = svmla_x (pg, p0, p, t);
129 /* Tail polynomial has higher order - merge with normal lanes. */
130 p = svmad_m (is_tail, p, t, d->P29_0);
131 svfloat32_t y = svmul_x (pg, x, p);
132
133 /* Least significant term of both Q polynomials is 1, so no need to generate
134 it. */
135 svfloat32_t q = svadd_x (pg, t, q2);
136 q = svmla_x (pg, q1, q, t);
137 q = svmla_x (pg, q0, q, t);
138
139 if (unlikely (svptest_any (pg, extreme_tail)))
140 return svsel (extreme_tail, special (extreme_tail, x, d),
141 svdiv_x (pg, y, q));
142 return svdiv_x (pg, y, q);
143 }
144
145 #if USE_MPFR
146 # warning Not generating tests for _ZGVsMxv_erfinvf, as MPFR has no suitable reference
147 #else
148 TEST_SIG (SV, F, 1, erfinv, -0.99, 0.99)
149 TEST_ULP (SV_NAME_F1 (erfinv), 4.09)
150 TEST_DISABLE_FENV (SV_NAME_F1 (erfinv))
151 TEST_SYM_INTERVAL (SV_NAME_F1 (erfinv), 0, 1, 40000)
152 TEST_CONTROL_VALUE (SV_NAME_F1 (erfinv), 0.5)
153 TEST_CONTROL_VALUE (SV_NAME_F1 (erfinv), 0.8)
154 TEST_CONTROL_VALUE (SV_NAME_F1 (erfinv), 0.95)
155 #endif
156 CLOSE_SVE_ATTR
157