/*
 * Single-precision vector erfc(x) function.
 *
 * Copyright (c) 2023, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

#include "v_math.h"
#include "pl_sig.h"
#include "pl_test.h"

static const struct data
{
  uint32x4_t offset, table_scale;
  float32x4_t max, shift;
  float32x4_t coeffs, third, two_over_five, tenth;
#if WANT_SIMD_EXCEPT
  float32x4_t uflow_bound;
#endif
} data = {
  /* Set an offset so the range of the index used for lookup is 644, and it
     can be clamped using a saturated add.  */
  .offset = V4 (0xb7fffd7b),	       /* 0xffffffff - asuint(shift) - 644.  */
  .table_scale = V4 (0x28000000 << 1), /* asuint (2^-47) << 1.  */
  .max = V4 (10.0625f),		       /* 10 + 1/16 = 644/64.  */
  .shift = V4 (0x1p17f),
  /* Store 1/3, 2/3 and 2/15 in a single register for use with indexed muls
     and fmas.  */
  .coeffs = (float32x4_t){ 0x1.555556p-2f, 0x1.555556p-1f, 0x1.111112p-3f, 0 },
  .third = V4 (0x1.555556p-2f),
  .two_over_five = V4 (-0x1.99999ap-2f),
  .tenth = V4 (-0x1.99999ap-4f),
#if WANT_SIMD_EXCEPT
  .uflow_bound = V4 (0x1.2639cp+3f),
#endif
};

#define TinyBound 0x41000000 /* 0x1p-62f << 1.  */
#define Thres 0xbe000000     /* (asuint(infinity) << 1) - TinyBound.  */
#define Off 0xfffffd7b	     /* 0xffffffff - 644.  */

struct entry
{
  float32x4_t erfc;
  float32x4_t scale;
};

static inline struct entry
lookup (uint32x4_t i)
{
  struct entry e;
  float64_t t0 = *((float64_t *) (__erfcf_data.tab - Off + i[0]));
  float64_t t1 = *((float64_t *) (__erfcf_data.tab - Off + i[1]));
  float64_t t2 = *((float64_t *) (__erfcf_data.tab - Off + i[2]));
  float64_t t3 = *((float64_t *) (__erfcf_data.tab - Off + i[3]));
  float32x4_t e1 = vreinterpretq_f32_f64 ((float64x2_t){ t0, t1 });
  float32x4_t e2 = vreinterpretq_f32_f64 ((float64x2_t){ t2, t3 });
  e.erfc = vuzp1q_f32 (e1, e2);
  e.scale = vuzp2q_f32 (e1, e2);
  return e;
}
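/* A minimal scalar sketch of the index computation feeding the lookup above,
   for illustration only (hence the #if 0 guard). It assumes the table layout
   from erfcf_data.c - 645 entries of paired (erfc, scale) floats - and the
   asuint helper from math_config.h; the function name is hypothetical and
   not part of this file.  */
#if 0
static void
lookup_scalar_sketch (float a, float *erfc_r, float *scale_r)
{
  /* Adding the large shift (2^17, whose ulp is 1/64) rounds a to the nearest
     multiple of 1/64 and leaves round (a * 64) in the low mantissa bits.  */
  float z = a + 0x1p17f;
  uint32_t idx = asuint (z) - asuint (0x1p17f);
  /* The vector code achieves this clamp without a compare, by offsetting the
     table address and the index and using a saturating add. It also keeps
     NaN lanes in bounds.  */
  if (idx > 644)
    idx = 644;
  *erfc_r = __erfcf_data.tab[idx].erfc;
  *scale_r = __erfcf_data.tab[idx].scale;
}
#endif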
#if WANT_SIMD_EXCEPT
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
{
  return v_call_f32 (erfcf, x, y, cmp);
}
#endif

/* Optimized single-precision vector erfcf(x).
   Approximation based on series expansion near x rounded to the
   nearest multiple of 1/64.
   Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r,

   erfc(x) ~ erfc(r) - scale * d * poly(r, d), with

   poly(r, d) = 1 - r d + (2/3 r^2 - 1/3) d^2 - r (1/3 r^2 - 1/2) d^3
	      + (2/15 r^4 - 2/5 r^2 + 1/10) d^4

   Values of erfc(r) and scale are read from lookup tables. Stored values
   are scaled to avoid hitting the subnormal range.

   Note that for x < 0, erfc(x) = 2.0 - erfc(-x).
   Maximum error: 1.63 ULP (~1.0 ULP for x < 0.0).
   _ZGVnN4v_erfcf(0x1.1dbf7ap+3) got 0x1.f51212p-120
				 want 0x1.f51216p-120.  */
VPCS_ATTR
float32x4_t V_NAME_F1 (erfc) (float32x4_t x)
{
  const struct data *dat = ptr_barrier (&data);

#if WANT_SIMD_EXCEPT
  /* |x| < 2^-62. Avoid fabs by left-shifting by 1.  */
  uint32x4_t ix = vreinterpretq_u32_f32 (x);
  uint32x4_t cmp = vcltq_u32 (vaddq_u32 (ix, ix), v_u32 (TinyBound));
  /* x >= ~9.19 (into the subnormal and underflow cases). The comparison is
     done in the integer domain to avoid raising exceptions in the presence
     of NaNs.  */
  uint32x4_t uflow = vcgeq_s32 (vreinterpretq_s32_f32 (x),
				vreinterpretq_s32_f32 (dat->uflow_bound));
  cmp = vorrq_u32 (cmp, uflow);
  float32x4_t xm = x;
  /* If any lanes are special, mask them with 0 and retain a copy of x to
     allow the special-case handler to fix those lanes later. This is only
     necessary if fenv exceptions are to be triggered correctly.  */
  if (unlikely (v_any_u32 (cmp)))
    x = v_zerofy_f32 (x, cmp);
#endif

  float32x4_t a = vabsq_f32 (x);
  a = vminq_f32 (a, dat->max);

  /* Lookup erfc(r) and scale(r) in tables, e.g. set erfc(r) to 1 and scale
     to 2/sqrt(pi), when x is reduced to r = 0.  */
  float32x4_t shift = dat->shift;
  float32x4_t z = vaddq_f32 (a, shift);

  /* Clamp the index to a range of 644. A naive approach would use a subtract
     and min. Instead we offset the table address and the index, then use a
     saturating add.  */
  uint32x4_t i = vqaddq_u32 (vreinterpretq_u32_f32 (z), dat->offset);

  struct entry e = lookup (i);

  /* erfc(x) ~ erfc(r) - scale * d * poly(r, d).  */
  float32x4_t r = vsubq_f32 (z, shift);
  float32x4_t d = vsubq_f32 (a, r);
  float32x4_t d2 = vmulq_f32 (d, d);
  float32x4_t r2 = vmulq_f32 (r, r);

  float32x4_t p1 = r;
  float32x4_t p2 = vfmsq_laneq_f32 (dat->third, r2, dat->coeffs, 1);
  float32x4_t p3
      = vmulq_f32 (r, vfmaq_laneq_f32 (v_f32 (-0.5), r2, dat->coeffs, 0));
  float32x4_t p4 = vfmaq_laneq_f32 (dat->two_over_five, r2, dat->coeffs, 2);
  p4 = vfmsq_f32 (dat->tenth, r2, p4);

  float32x4_t y = vfmaq_f32 (p3, d, p4);
  y = vfmaq_f32 (p2, d, y);
  y = vfmaq_f32 (p1, d, y);
  y = vfmsq_f32 (e.erfc, e.scale, vfmsq_f32 (d, d2, y));

  /* Offset equals 2.0f if sign, else 0.0f.  */
  uint32x4_t sign = vshrq_n_u32 (vreinterpretq_u32_f32 (x), 31);
  float32x4_t off = vreinterpretq_f32_u32 (vshlq_n_u32 (sign, 30));
  /* Copy the sign and scale back in a single fma. Since the bit patterns do
     not overlap, a logical OR and an addition are equivalent here.  */
  float32x4_t fac = vreinterpretq_f32_u32 (
      vsraq_n_u32 (vshlq_n_u32 (sign, 31), dat->table_scale, 1));

#if WANT_SIMD_EXCEPT
  if (unlikely (v_any_u32 (cmp)))
    return special_case (xm, vfmaq_f32 (off, fac, y), cmp);
#endif

  return vfmaq_f32 (off, fac, y);
}

PL_SIG (V, F, 1, erfc, -4.0, 10.0)
PL_TEST_ULP (V_NAME_F1 (erfc), 1.14)
PL_TEST_SYM_INTERVAL (V_NAME_F1 (erfc), 0, 0x1p-26, 40000)
PL_TEST_INTERVAL (V_NAME_F1 (erfc), 0x1p-26, 10.0625, 40000)
PL_TEST_INTERVAL (V_NAME_F1 (erfc), -0x1p-26, -4.0, 40000)
PL_TEST_INTERVAL (V_NAME_F1 (erfc), 10.0625, inf, 40000)
PL_TEST_INTERVAL (V_NAME_F1 (erfc), -4.0, -inf, 40000)
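/* A minimal usage sketch, illustrative only and excluded from the build with
   #if 0. It assumes an AArch64 NEON target. _ZGVnN4v_erfcf is the symbol
   V_NAME_F1 (erfc) resolves to (it appears in the error report above); the
   prototype is declared by hand here, with the vector PCS attribute omitted
   for brevity.  */
#if 0
#include <arm_neon.h>
#include <stdio.h>

float32x4_t _ZGVnN4v_erfcf (float32x4_t);

int
main (void)
{
  float32x4_t x = { -1.0f, 0.0f, 1.0f, 4.0f };
  float32x4_t y = _ZGVnN4v_erfcf (x);
  /* Expected: erfc(-1) ~ 1.8427, erfc(0) = 1, erfc(1) ~ 0.1573,
     erfc(4) ~ 1.54e-8.  */
  for (int i = 0; i < 4; i++)
    printf ("erfc(%g) = %g\n", x[i], y[i]);
  return 0;
}
#endif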