1 /* 2 * Single-precision inverse error function (SVE variant). 3 * 4 * Copyright (c) 2024, Arm Limited. 5 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception 6 */ 7 #include "sv_math.h" 8 #include "test_sig.h" 9 #include "test_defs.h" 10 #include "sv_poly_f32.h" 11 #include "sv_logf_inline.h" 12 13 const static struct data 14 { 15 /* We use P_N and Q_N to refer to arrays of coefficients, where P_N 16 is the coeffs of the numerator in table N of Blair et al, and 17 Q_N is the coeffs of the denominator. Coefficients stored in 18 interleaved format to support lookup scheme. */ 19 float P10_2, P29_3, Q10_2, Q29_2; 20 float P10_0, P29_1, P10_1, P29_2; 21 float Q10_0, Q29_0, Q10_1, Q29_1; 22 float P29_0, P_50[6], Q_50[2], tailshift; 23 struct sv_logf_data logf_tbl; 24 } data = { .P10_0 = -0x1.a31268p+3, 25 .P10_1 = 0x1.ac9048p+4, 26 .P10_2 = -0x1.293ff6p+3, 27 .P29_0 = -0x1.fc0252p-4, 28 .P29_1 = 0x1.119d44p+0, 29 .P29_2 = -0x1.f59ee2p+0, 30 .P29_3 = 0x1.b13626p-2, 31 .Q10_0 = -0x1.8265eep+3, 32 .Q10_1 = 0x1.ef5eaep+4, 33 .Q10_2 = -0x1.12665p+4, 34 .Q29_0 = -0x1.69952p-4, 35 .Q29_1 = 0x1.c7b7d2p-1, 36 .Q29_2 = -0x1.167d7p+1, 37 .P_50 = { 0x1.3d8948p-3, 0x1.61f9eap+0, 0x1.61c6bcp-1, 38 -0x1.20c9f2p+0, 0x1.5c704cp-1, -0x1.50c6bep-3 }, 39 .Q_50 = { 0x1.3d7dacp-3, 0x1.629e5p+0 }, 40 .tailshift = -0.87890625, 41 .logf_tbl = SV_LOGF_CONSTANTS }; 42 43 static inline svfloat32_t 44 special (svbool_t pg, svfloat32_t x, const struct data *d) 45 { 46 svfloat32_t ax = svabs_x (pg, x); 47 svfloat32_t t = svdivr_x ( 48 pg, 49 svsqrt_x (pg, svneg_x (pg, sv_logf_inline (pg, svsubr_x (pg, ax, 1), 50 &d->logf_tbl))), 51 1); 52 svuint32_t sign 53 = sveor_x (pg, svreinterpret_u32 (ax), svreinterpret_u32 (x)); 54 svfloat32_t ts 55 = svreinterpret_f32 (svorr_x (pg, sign, svreinterpret_u32 (t))); 56 svfloat32_t q 57 = svmla_x (pg, sv_f32 (d->Q_50[0]), svadd_x (pg, t, d->Q_50[1]), t); 58 return svdiv_x (pg, sv_horner_5_f32_x (pg, t, d->P_50), svmul_x (pg, ts, q)); 59 } 60 61 static inline svfloat32_t 62 notails (svbool_t pg, svfloat32_t x, const struct data *d) 63 { 64 /* Shortcut when no input is in a tail region - no need to gather shift or 65 coefficients. */ 66 svfloat32_t t = svmad_x (pg, x, x, -0.5625); 67 svfloat32_t q = svadd_x (pg, t, d->Q10_2); 68 q = svmad_x (pg, t, q, d->Q10_1); 69 q = svmad_x (pg, t, q, d->Q10_0); 70 71 svfloat32_t p = svmla_x (pg, sv_f32 (d->P10_1), t, d->P10_2); 72 p = svmad_x (pg, p, t, d->P10_0); 73 74 return svdiv_x (pg, svmul_x (pg, x, p), q); 75 } 76 77 /* Vector implementation of Blair et al's rational approximation to inverse 78 error function in single-precision. Worst-case error is 4.71 ULP, in the 79 tail region: 80 _ZGVsMxv_erfinvf(0x1.f84e9ap-1) got 0x1.b8326ap+0 81 want 0x1.b83274p+0. */ 82 svfloat32_t SV_NAME_F1 (erfinv) (svfloat32_t x, svbool_t pg) 83 { 84 const struct data *d = ptr_barrier (&data); 85 86 /* Calculate inverse error using algorithm described in 87 J. M. Blair, C. A. Edwards, and J. H. Johnson, 88 "Rational Chebyshev approximations for the inverse of the error function", 89 Math. Comp. 30, pp. 827--830 (1976). 90 https://doi.org/10.1090/S0025-5718-1976-0421040-7. */ 91 92 /* Algorithm has 3 intervals: 93 - 'Normal' region [-0.75, 0.75] 94 - Tail region [0.75, 0.9375] U [-0.9375, -0.75] 95 - Extreme tail [-1, -0.9375] U [0.9375, 1] 96 Normal and tail are both rational approximation of similar order on 97 shifted input - these are typically performed in parallel using gather 98 loads to obtain correct coefficients depending on interval. */ 99 svbool_t is_tail = svacge (pg, x, 0.75); 100 svbool_t extreme_tail = svacge (pg, x, 0.9375); 101 102 if (likely (!svptest_any (pg, is_tail))) 103 return notails (pg, x, d); 104 105 /* Select requisite shift depending on interval: polynomial is evaluated on 106 x * x - shift. 107 Normal shift = 0.5625 108 Tail shift = 0.87890625. */ 109 svfloat32_t t = svmla_x ( 110 pg, svsel (is_tail, sv_f32 (d->tailshift), sv_f32 (-0.5625)), x, x); 111 112 svuint32_t idx = svdup_u32_z (is_tail, 1); 113 svuint32_t idxhi = svadd_x (pg, idx, 2); 114 115 /* Load coeffs in quadwords and select them according to interval. */ 116 svfloat32_t pqhi = svld1rq (svptrue_b32 (), &d->P10_2); 117 svfloat32_t plo = svld1rq (svptrue_b32 (), &d->P10_0); 118 svfloat32_t qlo = svld1rq (svptrue_b32 (), &d->Q10_0); 119 120 svfloat32_t p2 = svtbl (pqhi, idx); 121 svfloat32_t p1 = svtbl (plo, idxhi); 122 svfloat32_t p0 = svtbl (plo, idx); 123 svfloat32_t q0 = svtbl (qlo, idx); 124 svfloat32_t q1 = svtbl (qlo, idxhi); 125 svfloat32_t q2 = svtbl (pqhi, idxhi); 126 127 svfloat32_t p = svmla_x (pg, p1, p2, t); 128 p = svmla_x (pg, p0, p, t); 129 /* Tail polynomial has higher order - merge with normal lanes. */ 130 p = svmad_m (is_tail, p, t, d->P29_0); 131 svfloat32_t y = svmul_x (pg, x, p); 132 133 /* Least significant term of both Q polynomials is 1, so no need to generate 134 it. */ 135 svfloat32_t q = svadd_x (pg, t, q2); 136 q = svmla_x (pg, q1, q, t); 137 q = svmla_x (pg, q0, q, t); 138 139 if (unlikely (svptest_any (pg, extreme_tail))) 140 return svsel (extreme_tail, special (extreme_tail, x, d), 141 svdiv_x (pg, y, q)); 142 return svdiv_x (pg, y, q); 143 } 144 145 #if USE_MPFR 146 # warning Not generating tests for _ZGVsMxv_erfinvf, as MPFR has no suitable reference 147 #else 148 TEST_SIG (SV, F, 1, erfinv, -0.99, 0.99) 149 TEST_ULP (SV_NAME_F1 (erfinv), 4.09) 150 TEST_DISABLE_FENV (SV_NAME_F1 (erfinv)) 151 TEST_SYM_INTERVAL (SV_NAME_F1 (erfinv), 0, 1, 40000) 152 TEST_CONTROL_VALUE (SV_NAME_F1 (erfinv), 0.5) 153 TEST_CONTROL_VALUE (SV_NAME_F1 (erfinv), 0.8) 154 TEST_CONTROL_VALUE (SV_NAME_F1 (erfinv), 0.95) 155 #endif 156 CLOSE_SVE_ATTR 157