1 /* 2 * Single-precision vector erfc(x) function. 3 * 4 * Copyright (c) 2023-2024, Arm Limited. 5 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception 6 */ 7 8 #include "v_math.h" 9 #include "test_sig.h" 10 #include "test_defs.h" 11 12 static const struct data 13 { 14 uint32x4_t offset, table_scale; 15 float32x4_t max, shift; 16 float coeffs[4]; 17 float32x4_t third, two_over_five, tenth; 18 #if WANT_SIMD_EXCEPT 19 float32x4_t uflow_bound; 20 #endif 21 22 } data = { 23 /* Set an offset so the range of the index used for lookup is 644, and it can 24 be clamped using a saturated add. */ 25 .offset = V4 (0xb7fffd7b), /* 0xffffffff - asuint(shift) - 644. */ 26 .table_scale = V4 (0x28000000 << 1), /* asuint (2^-47) << 1. */ 27 .max = V4 (10.0625f), /* 10 + 1/16 = 644/64. */ 28 .shift = V4 (0x1p17f), 29 /* Store 1/3, 2/3 and 2/15 in a single register for use with indexed muls and 30 fmas. */ 31 .coeffs = { 0x1.555556p-2f, 0x1.555556p-1f, 0x1.111112p-3f, 0 }, 32 .third = V4 (0x1.555556p-2f), 33 .two_over_five = V4 (-0x1.99999ap-2f), 34 .tenth = V4 (-0x1.99999ap-4f), 35 #if WANT_SIMD_EXCEPT 36 .uflow_bound = V4 (0x1.2639cp+3f), 37 #endif 38 }; 39 40 #define TinyBound 0x41000000 /* 0x1p-62f << 1. */ 41 #define Thres 0xbe000000 /* asuint(infinity) << 1 - TinyBound. */ 42 #define Off 0xfffffd7b /* 0xffffffff - 644. */ 43 44 struct entry 45 { 46 float32x4_t erfc; 47 float32x4_t scale; 48 }; 49 50 static inline struct entry 51 lookup (uint32x4_t i) 52 { 53 struct entry e; 54 float32x2_t t0 55 = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 0) - Off].erfc); 56 float32x2_t t1 57 = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 1) - Off].erfc); 58 float32x2_t t2 59 = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 2) - Off].erfc); 60 float32x2_t t3 61 = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 3) - Off].erfc); 62 float32x4_t e1 = vcombine_f32 (t0, t1); 63 float32x4_t e2 = vcombine_f32 (t2, t3); 64 e.erfc = vuzp1q_f32 (e1, e2); 65 e.scale = vuzp2q_f32 (e1, e2); 66 return e; 67 } 68 69 #if WANT_SIMD_EXCEPT 70 static float32x4_t VPCS_ATTR NOINLINE 71 special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) 72 { 73 return v_call_f32 (erfcf, x, y, cmp); 74 } 75 #endif 76 77 /* Optimized single-precision vector erfcf(x). 78 Approximation based on series expansion near x rounded to 79 nearest multiple of 1/64. 80 Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r, 81 82 erfc(x) ~ erfc(r) - scale * d * poly(r, d), with 83 84 poly(r, d) = 1 - r d + (2/3 r^2 - 1/3) d^2 - r (1/3 r^2 - 1/2) d^3 85 + (2/15 r^4 - 2/5 r^2 + 1/10) d^4 86 87 Values of erfc(r) and scale are read from lookup tables. Stored values 88 are scaled to avoid hitting the subnormal range. 89 90 Note that for x < 0, erfc(x) = 2.0 - erfc(-x). 91 Maximum error: 1.63 ULP (~1.0 ULP for x < 0.0). 92 _ZGVnN4v_erfcf(0x1.1dbf7ap+3) got 0x1.f51212p-120 93 want 0x1.f51216p-120. */ 94 NOINLINE VPCS_ATTR float32x4_t V_NAME_F1 (erfc) (float32x4_t x) 95 { 96 const struct data *dat = ptr_barrier (&data); 97 98 #if WANT_SIMD_EXCEPT 99 /* |x| < 2^-62. Avoid fabs by left-shifting by 1. */ 100 uint32x4_t ix = vreinterpretq_u32_f32 (x); 101 uint32x4_t cmp = vcltq_u32 (vaddq_u32 (ix, ix), v_u32 (TinyBound)); 102 /* x >= ~9.19 (into subnormal case and uflow case). Comparison is done in 103 integer domain to avoid raising exceptions in presence of nans. */ 104 uint32x4_t uflow = vcgeq_s32 (vreinterpretq_s32_f32 (x), 105 vreinterpretq_s32_f32 (dat->uflow_bound)); 106 cmp = vorrq_u32 (cmp, uflow); 107 float32x4_t xm = x; 108 /* If any lanes are special, mask them with 0 and retain a copy of x to allow 109 special case handler to fix special lanes later. This is only necessary if 110 fenv exceptions are to be triggered correctly. */ 111 if (unlikely (v_any_u32 (cmp))) 112 x = v_zerofy_f32 (x, cmp); 113 #endif 114 115 float32x4_t a = vabsq_f32 (x); 116 a = vminq_f32 (a, dat->max); 117 118 /* Lookup erfc(r) and scale(r) in tables, e.g. set erfc(r) to 0 and scale to 119 2/sqrt(pi), when x reduced to r = 0. */ 120 float32x4_t shift = dat->shift; 121 float32x4_t z = vaddq_f32 (a, shift); 122 123 /* Clamp index to a range of 644. A naive approach would use a subtract and 124 min. Instead we offset the table address and the index, then use a 125 saturating add. */ 126 uint32x4_t i = vqaddq_u32 (vreinterpretq_u32_f32 (z), dat->offset); 127 128 struct entry e = lookup (i); 129 130 /* erfc(x) ~ erfc(r) - scale * d * poly(r, d). */ 131 float32x4_t r = vsubq_f32 (z, shift); 132 float32x4_t d = vsubq_f32 (a, r); 133 float32x4_t d2 = vmulq_f32 (d, d); 134 float32x4_t r2 = vmulq_f32 (r, r); 135 136 float32x4_t p1 = r; 137 float32x4_t coeffs = vld1q_f32 (dat->coeffs); 138 float32x4_t p2 = vfmsq_laneq_f32 (dat->third, r2, coeffs, 1); 139 float32x4_t p3 140 = vmulq_f32 (r, vfmaq_laneq_f32 (v_f32 (-0.5), r2, coeffs, 0)); 141 float32x4_t p4 = vfmaq_laneq_f32 (dat->two_over_five, r2, coeffs, 2); 142 p4 = vfmsq_f32 (dat->tenth, r2, p4); 143 144 float32x4_t y = vfmaq_f32 (p3, d, p4); 145 y = vfmaq_f32 (p2, d, y); 146 y = vfmaq_f32 (p1, d, y); 147 y = vfmsq_f32 (e.erfc, e.scale, vfmsq_f32 (d, d2, y)); 148 149 /* Offset equals 2.0f if sign, else 0.0f. */ 150 uint32x4_t sign = vshrq_n_u32 (vreinterpretq_u32_f32 (x), 31); 151 float32x4_t off = vreinterpretq_f32_u32 (vshlq_n_u32 (sign, 30)); 152 /* Copy sign and scale back in a single fma. Since the bit patterns do not 153 overlap, then logical or and addition are equivalent here. */ 154 float32x4_t fac = vreinterpretq_f32_u32 ( 155 vsraq_n_u32 (vshlq_n_u32 (sign, 31), dat->table_scale, 1)); 156 157 #if WANT_SIMD_EXCEPT 158 if (unlikely (v_any_u32 (cmp))) 159 return special_case (xm, vfmaq_f32 (off, fac, y), cmp); 160 #endif 161 162 return vfmaq_f32 (off, fac, y); 163 } 164 165 HALF_WIDTH_ALIAS_F1 (erfc) 166 167 TEST_SIG (V, F, 1, erfc, -4.0, 10.0) 168 TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (erfc), WANT_SIMD_EXCEPT) 169 TEST_ULP (V_NAME_F1 (erfc), 1.14) 170 TEST_SYM_INTERVAL (V_NAME_F1 (erfc), 0, 0x1p-26, 40000) 171 TEST_INTERVAL (V_NAME_F1 (erfc), 0x1p-26, 10.0625, 40000) 172 TEST_INTERVAL (V_NAME_F1 (erfc), -0x1p-26, -4.0, 40000) 173 TEST_INTERVAL (V_NAME_F1 (erfc), 10.0625, inf, 40000) 174 TEST_INTERVAL (V_NAME_F1 (erfc), -4.0, -inf, 40000) 175