1 /* 2 * Single-precision inverse error function (AdvSIMD variant). 3 * 4 * Copyright (c) 2023-2024, Arm Limited. 5 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception 6 */ 7 #include "v_math.h" 8 #include "test_sig.h" 9 #include "test_defs.h" 10 #include "v_poly_f32.h" 11 #include "v_logf_inline.h" 12 13 const static struct data 14 { 15 /* We use P_N and Q_N to refer to arrays of coefficients, where P_N is the 16 coeffs of the numerator in table N of Blair et al, and Q_N is the coeffs 17 of the denominator. Coefficients are stored in various interleaved 18 formats to allow for table-based (vector-to-vector) lookup. 19 20 Plo is first two coefficients of P_10 and P_29 interleaved. 21 PQ is third coeff of P_10 and first of Q_29 interleaved. 22 Qhi is second and third coeffs of Q_29 interleaved. 23 P29_3 is a homogenous vector with fourth coeff of P_29. 24 25 P_10 and Q_10 are also stored in homogenous vectors to allow better 26 memory access when no lanes are in a tail region. */ 27 float Plo[4], PQ[4], Qhi[4]; 28 float32x4_t P29_3, tailshift; 29 float32x4_t P_50[6], Q_50[2]; 30 float32x4_t P_10[3], Q_10[3]; 31 uint8_t idxhi[16], idxlo[16]; 32 struct v_logf_data logf_tbl; 33 } data = { 34 .idxlo = { 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 }, 35 .idxhi = { 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11 }, 36 .P29_3 = V4 (0x1.b13626p-2), 37 .tailshift = V4 (-0.87890625), 38 .Plo = { -0x1.a31268p+3, -0x1.fc0252p-4, 0x1.ac9048p+4, 0x1.119d44p+0 }, 39 .PQ = { -0x1.293ff6p+3, -0x1.f59ee2p+0, -0x1.8265eep+3, -0x1.69952p-4 }, 40 .Qhi = { 0x1.ef5eaep+4, 0x1.c7b7d2p-1, -0x1.12665p+4, -0x1.167d7p+1 }, 41 .P_50 = { V4 (0x1.3d8948p-3), V4 (0x1.61f9eap+0), V4 (0x1.61c6bcp-1), 42 V4 (-0x1.20c9f2p+0), V4 (0x1.5c704cp-1), V4 (-0x1.50c6bep-3) }, 43 .Q_50 = { V4 (0x1.3d7dacp-3), V4 (0x1.629e5p+0) }, 44 .P_10 = { V4 (-0x1.a31268p+3), V4 (0x1.ac9048p+4), V4 (-0x1.293ff6p+3) }, 45 .Q_10 = { V4 (-0x1.8265eep+3), V4 (0x1.ef5eaep+4), V4 (-0x1.12665p+4) }, 46 .logf_tbl = V_LOGF_CONSTANTS 47 }; 48 49 static inline float32x4_t 50 special (float32x4_t x, const struct data *d) 51 { 52 /* Note erfinvf(inf) should return NaN, and erfinvf(1) should return Inf. 53 By using log here, instead of log1p, we return finite values for both 54 these inputs, and values outside [-1, 1]. This is non-compliant, but is an 55 acceptable optimisation at Ofast. To get correct behaviour for all finite 56 values use the log1pf_inline helper on -abs(x) - note that erfinvf(inf) 57 will still be finite. */ 58 float32x4_t t = vdivq_f32 ( 59 v_f32 (1), vsqrtq_f32 (vnegq_f32 (v_logf_inline ( 60 vsubq_f32 (v_f32 (1), vabsq_f32 (x)), &d->logf_tbl)))); 61 float32x4_t ts = vbslq_f32 (v_u32 (0x7fffffff), t, x); 62 float32x4_t q = vfmaq_f32 (d->Q_50[0], vaddq_f32 (t, d->Q_50[1]), t); 63 return vdivq_f32 (v_horner_5_f32 (t, d->P_50), vmulq_f32 (ts, q)); 64 } 65 66 static inline float32x4_t 67 notails (float32x4_t x, const struct data *d) 68 { 69 /* Shortcut when no input is in a tail region - no need to gather shift or 70 coefficients. */ 71 float32x4_t t = vfmaq_f32 (v_f32 (-0.5625), x, x); 72 float32x4_t q = vaddq_f32 (t, d->Q_10[2]); 73 q = vfmaq_f32 (d->Q_10[1], t, q); 74 q = vfmaq_f32 (d->Q_10[0], t, q); 75 76 return vdivq_f32 (vmulq_f32 (x, v_horner_2_f32 (t, d->P_10)), q); 77 } 78 79 static inline float32x4_t 80 lookup (float32x4_t tbl, uint8x16_t idx) 81 { 82 return vreinterpretq_f32_u8 (vqtbl1q_u8 (vreinterpretq_u8_f32 (tbl), idx)); 83 } 84 85 /* Vector implementation of Blair et al's rational approximation to inverse 86 error function in single-precision. Worst-case error is 4.98 ULP, in the 87 tail region: 88 _ZGVnN4v_erfinvf(0x1.f7dbeep-1) got 0x1.b4793p+0 89 want 0x1.b4793ap+0 . */ 90 float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (erfinv) (float32x4_t x) 91 { 92 const struct data *d = ptr_barrier (&data); 93 94 /* Calculate inverse error using algorithm described in 95 J. M. Blair, C. A. Edwards, and J. H. Johnson, 96 "Rational Chebyshev approximations for the inverse of the error 97 function", Math. Comp. 30, pp. 827--830 (1976). 98 https://doi.org/10.1090/S0025-5718-1976-0421040-7. 99 100 Algorithm has 3 intervals: 101 - 'Normal' region [-0.75, 0.75] 102 - Tail region [0.75, 0.9375] U [-0.9375, -0.75] 103 - Extreme tail [-1, -0.9375] U [0.9375, 1] 104 Normal and tail are both rational approximation of similar order on 105 shifted input - these are typically performed in parallel using gather 106 loads to obtain correct coefficients depending on interval. */ 107 uint32x4_t is_tail = vcageq_f32 (x, v_f32 (0.75)); 108 uint32x4_t extreme_tail = vcageq_f32 (x, v_f32 (0.9375)); 109 110 if (unlikely (!v_any_u32 (is_tail))) 111 /* Shortcut for if all lanes are in [-0.75, 0.75] - can avoid having to 112 gather coefficients. If input is uniform in [-1, 1] then likelihood of 113 this is 0.75^4 ~= 0.31. */ 114 return notails (x, d); 115 116 /* Select requisite shift depending on interval: polynomial is evaluated on 117 x * x - shift. 118 Normal shift = 0.5625 119 Tail shift = 0.87890625. */ 120 float32x4_t t 121 = vfmaq_f32 (vbslq_f32 (is_tail, d->tailshift, v_f32 (-0.5625)), x, x); 122 123 /* Calculate indexes for tbl: tbl is byte-wise, so: 124 [0, 1, 2, 3, 4, 5, 6, ....] copies the vector 125 Add 4 * i to a group of 4 lanes to copy 32-bit lane i. Each vector stores 126 two pairs of coeffs, so we need two idx vectors - one for each pair. */ 127 uint8x16_t off = vandq_u8 (vreinterpretq_u8_u32 (is_tail), vdupq_n_u8 (4)); 128 uint8x16_t idx_lo = vaddq_u8 (vld1q_u8 (d->idxlo), off); 129 uint8x16_t idx_hi = vaddq_u8 (vld1q_u8 (d->idxhi), off); 130 131 /* Load the tables. */ 132 float32x4_t plo = vld1q_f32 (d->Plo); 133 float32x4_t pq = vld1q_f32 (d->PQ); 134 float32x4_t qhi = vld1q_f32 (d->Qhi); 135 136 /* Do the lookup (and calculate p3 by masking non-tail lanes). */ 137 float32x4_t p3 = vreinterpretq_f32_u32 ( 138 vandq_u32 (is_tail, vreinterpretq_u32_f32 (d->P29_3))); 139 float32x4_t p0 = lookup (plo, idx_lo), p1 = lookup (plo, idx_hi), 140 p2 = lookup (pq, idx_lo), q0 = lookup (pq, idx_hi), 141 q1 = lookup (qhi, idx_lo), q2 = lookup (qhi, idx_hi); 142 143 float32x4_t p = vfmaq_f32 (p2, p3, t); 144 p = vfmaq_f32 (p1, p, t); 145 p = vfmaq_f32 (p0, p, t); 146 p = vmulq_f32 (x, p); 147 148 float32x4_t q = vfmaq_f32 (q1, vaddq_f32 (q2, t), t); 149 q = vfmaq_f32 (q0, q, t); 150 151 if (unlikely (v_any_u32 (extreme_tail))) 152 /* At least one lane is in the extreme tail - if input is uniform in 153 [-1, 1] the likelihood of this is ~0.23. */ 154 return vbslq_f32 (extreme_tail, special (x, d), vdivq_f32 (p, q)); 155 156 return vdivq_f32 (p, q); 157 } 158 159 HALF_WIDTH_ALIAS_F1 (erfinv) 160 161 #if USE_MPFR 162 # warning Not generating tests for _ZGVnN4v_erfinvf, as MPFR has no suitable reference 163 #else 164 TEST_SIG (V, F, 1, erfinv, -0.99, 0.99) 165 TEST_DISABLE_FENV (V_NAME_F1 (erfinv)) 166 TEST_ULP (V_NAME_F1 (erfinv), 4.49) 167 TEST_SYM_INTERVAL (V_NAME_F1 (erfinv), 0, 0x1.fffffep-1, 40000) 168 /* Test with control lane in each interval. */ 169 TEST_CONTROL_VALUE (V_NAME_F1 (erfinv), 0.5) 170 TEST_CONTROL_VALUE (V_NAME_F1 (erfinv), 0.8) 171 TEST_CONTROL_VALUE (V_NAME_F1 (erfinv), 0.95) 172 #endif 173