1 /* 2 * Double-precision vector log(1+x) function. 3 * 4 * Copyright (c) 2022-2023, Arm Limited. 5 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception 6 */ 7 8 #include "v_math.h" 9 #include "estrin.h" 10 #include "pl_sig.h" 11 #include "pl_test.h" 12 13 #if V_SUPPORTED 14 15 #define Ln2Hi v_f64 (0x1.62e42fefa3800p-1) 16 #define Ln2Lo v_f64 (0x1.ef35793c76730p-45) 17 #define HfRt2Top 0x3fe6a09e00000000 /* top32(asuint64(sqrt(2)/2)) << 32. */ 18 #define OneMHfRt2Top \ 19 0x00095f6200000000 /* (top32(asuint64(1)) - top32(asuint64(sqrt(2)/2))) \ 20 << 32. */ 21 #define OneTop12 0x3ff 22 #define BottomMask 0xffffffff 23 #define AbsMask 0x7fffffffffffffff 24 #define C(i) v_f64 (__log1p_data.coeffs[i]) 25 26 static inline v_f64_t 27 eval_poly (v_f64_t f) 28 { 29 v_f64_t f2 = f * f; 30 v_f64_t f4 = f2 * f2; 31 v_f64_t f8 = f4 * f4; 32 return ESTRIN_18 (f, f2, f4, f8, f8 * f8, C); 33 } 34 35 VPCS_ATTR 36 NOINLINE static v_f64_t 37 specialcase (v_f64_t x, v_f64_t y, v_u64_t special) 38 { 39 return v_call_f64 (log1p, x, y, special); 40 } 41 42 /* Vector log1p approximation using polynomial on reduced interval. Routine is a 43 modification of the algorithm used in scalar log1p, with no shortcut for k=0 44 and no narrowing for f and k. Maximum observed error is 2.46 ULP: 45 __v_log1p(0x1.654a1307242a4p+11) got 0x1.fd5565fb590f4p+2 46 want 0x1.fd5565fb590f6p+2 . */ 47 VPCS_ATTR v_f64_t V_NAME (log1p) (v_f64_t x) 48 { 49 v_u64_t ix = v_as_u64_f64 (x); 50 v_u64_t ia = ix & AbsMask; 51 v_u64_t special 52 = v_cond_u64 ((ia >= v_u64 (0x7ff0000000000000)) 53 | (ix >= 0xbff0000000000000) | (ix == 0x8000000000000000)); 54 55 #if WANT_SIMD_EXCEPT 56 if (unlikely (v_any_u64 (special))) 57 x = v_sel_f64 (special, v_f64 (0), x); 58 #endif 59 60 /* With x + 1 = t * 2^k (where t = f + 1 and k is chosen such that f 61 is in [sqrt(2)/2, sqrt(2)]): 62 log1p(x) = k*log(2) + log1p(f). 63 64 f may not be representable exactly, so we need a correction term: 65 let m = round(1 + x), c = (1 + x) - m. 66 c << m: at very small x, log1p(x) ~ x, hence: 67 log(1+x) - log(m) ~ c/m. 68 69 We therefore calculate log1p(x) by k*log2 + log1p(f) + c/m. */ 70 71 /* Obtain correctly scaled k by manipulation in the exponent. 72 The scalar algorithm casts down to 32-bit at this point to calculate k and 73 u_red. We stay in double-width to obtain f and k, using the same constants 74 as the scalar algorithm but shifted left by 32. */ 75 v_f64_t m = x + 1; 76 v_u64_t mi = v_as_u64_f64 (m); 77 v_u64_t u = mi + OneMHfRt2Top; 78 79 v_s64_t ki = v_as_s64_u64 (u >> 52) - OneTop12; 80 v_f64_t k = v_to_f64_s64 (ki); 81 82 /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */ 83 v_u64_t utop = (u & 0x000fffff00000000) + HfRt2Top; 84 v_u64_t u_red = utop | (mi & BottomMask); 85 v_f64_t f = v_as_f64_u64 (u_red) - 1; 86 87 /* Correction term c/m. */ 88 v_f64_t cm = (x - (m - 1)) / m; 89 90 /* Approximate log1p(x) on the reduced input using a polynomial. Because 91 log1p(0)=0 we choose an approximation of the form: 92 x + C0*x^2 + C1*x^3 + C2x^4 + ... 93 Hence approximation has the form f + f^2 * P(f) 94 where P(x) = C0 + C1*x + C2x^2 + ... 95 Assembling this all correctly is dealt with at the final step. */ 96 v_f64_t p = eval_poly (f); 97 98 v_f64_t ylo = v_fma_f64 (k, Ln2Lo, cm); 99 v_f64_t yhi = v_fma_f64 (k, Ln2Hi, f); 100 v_f64_t y = v_fma_f64 (f * f, p, ylo + yhi); 101 102 if (unlikely (v_any_u64 (special))) 103 return specialcase (v_as_f64_u64 (ix), y, special); 104 105 return y; 106 } 107 VPCS_ALIAS 108 109 PL_SIG (V, D, 1, log1p, -0.9, 10.0) 110 PL_TEST_ULP (V_NAME (log1p), 1.97) 111 PL_TEST_EXPECT_FENV (V_NAME (log1p), WANT_SIMD_EXCEPT) 112 PL_TEST_INTERVAL (V_NAME (log1p), -10.0, 10.0, 10000) 113 PL_TEST_INTERVAL (V_NAME (log1p), 0.0, 0x1p-23, 50000) 114 PL_TEST_INTERVAL (V_NAME (log1p), 0x1p-23, 0.001, 50000) 115 PL_TEST_INTERVAL (V_NAME (log1p), 0.001, 1.0, 50000) 116 PL_TEST_INTERVAL (V_NAME (log1p), 0.0, -0x1p-23, 50000) 117 PL_TEST_INTERVAL (V_NAME (log1p), -0x1p-23, -0.001, 50000) 118 PL_TEST_INTERVAL (V_NAME (log1p), -0.001, -1.0, 50000) 119 PL_TEST_INTERVAL (V_NAME (log1p), -1.0, inf, 5000) 120 #endif 121