1*072a4ba8SAndrew Turner /* 2*072a4ba8SAndrew Turner * Single-precision vector e^x function. 3*072a4ba8SAndrew Turner * 4*072a4ba8SAndrew Turner * Copyright (c) 2019-2023, Arm Limited. 5*072a4ba8SAndrew Turner * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception 6*072a4ba8SAndrew Turner */ 7*072a4ba8SAndrew Turner 8*072a4ba8SAndrew Turner #include "sv_math.h" 9*072a4ba8SAndrew Turner #include "pl_sig.h" 10*072a4ba8SAndrew Turner #include "pl_test.h" 11*072a4ba8SAndrew Turner 12*072a4ba8SAndrew Turner #if SV_SUPPORTED 13*072a4ba8SAndrew Turner 14*072a4ba8SAndrew Turner #define C(i) __sv_expf_poly[i] 15*072a4ba8SAndrew Turner 16*072a4ba8SAndrew Turner #define InvLn2 (0x1.715476p+0f) 17*072a4ba8SAndrew Turner #define Ln2hi (0x1.62e4p-1f) 18*072a4ba8SAndrew Turner #define Ln2lo (0x1.7f7d1cp-20f) 19*072a4ba8SAndrew Turner 20*072a4ba8SAndrew Turner #if SV_EXPF_USE_FEXPA 21*072a4ba8SAndrew Turner 22*072a4ba8SAndrew Turner #define Shift (0x1.903f8p17f) /* 1.5*2^17 + 127. */ 23*072a4ba8SAndrew Turner #define Thres \ 24*072a4ba8SAndrew Turner (0x1.5d5e2ap+6f) /* Roughly 87.3. For x < -Thres, the result is subnormal \ 25*072a4ba8SAndrew Turner and not handled correctly by FEXPA. */ 26*072a4ba8SAndrew Turner 27*072a4ba8SAndrew Turner static NOINLINE sv_f32_t 28*072a4ba8SAndrew Turner special_case (sv_f32_t x, sv_f32_t y, svbool_t special) 29*072a4ba8SAndrew Turner { 30*072a4ba8SAndrew Turner /* The special-case handler from the Neon routine does not handle subnormals 31*072a4ba8SAndrew Turner in a way that is compatible with FEXPA. For the FEXPA variant we just fall 32*072a4ba8SAndrew Turner back to scalar expf. */ 33*072a4ba8SAndrew Turner return sv_call_f32 (expf, x, y, special); 34*072a4ba8SAndrew Turner } 35*072a4ba8SAndrew Turner 36*072a4ba8SAndrew Turner #else 37*072a4ba8SAndrew Turner 38*072a4ba8SAndrew Turner #define Shift (0x1.8p23f) /* 1.5 * 2^23. */ 39*072a4ba8SAndrew Turner #define Thres (126.0f) 40*072a4ba8SAndrew Turner 41*072a4ba8SAndrew Turner /* Special-case handler adapted from Neon variant. Uses s, y and n to produce 42*072a4ba8SAndrew Turner the final result (normal cases included). It performs an update of all lanes! 43*072a4ba8SAndrew Turner Therefore: 44*072a4ba8SAndrew Turner - all previous computation need to be done on all lanes indicated by input 45*072a4ba8SAndrew Turner pg 46*072a4ba8SAndrew Turner - we cannot simply apply the special case to the special-case-activated 47*072a4ba8SAndrew Turner lanes. Besides it is likely that this would not increase performance (no 48*072a4ba8SAndrew Turner scatter/gather). */ 49*072a4ba8SAndrew Turner static inline sv_f32_t 50*072a4ba8SAndrew Turner specialcase (svbool_t pg, sv_f32_t poly, sv_f32_t n, sv_u32_t e, 51*072a4ba8SAndrew Turner svbool_t p_cmp1, sv_f32_t scale) 52*072a4ba8SAndrew Turner { 53*072a4ba8SAndrew Turner /* s=2^(n/N) may overflow, break it up into s=s1*s2, 54*072a4ba8SAndrew Turner such that exp = s + s*y can be computed as s1*(s2+s2*y) 55*072a4ba8SAndrew Turner and s1*s1 overflows only if n>0. */ 56*072a4ba8SAndrew Turner 57*072a4ba8SAndrew Turner /* If n<=0 then set b to 0x820...0, 0 otherwise. */ 58*072a4ba8SAndrew Turner svbool_t p_sign = svcmple_n_f32 (pg, n, 0.0f); /* n <= 0. */ 59*072a4ba8SAndrew Turner sv_u32_t b 60*072a4ba8SAndrew Turner = svdup_n_u32_z (p_sign, 0x82000000); /* Inactive lanes set to 0. */ 61*072a4ba8SAndrew Turner 62*072a4ba8SAndrew Turner /* Set s1 to generate overflow depending on sign of exponent n. */ 63*072a4ba8SAndrew Turner sv_f32_t s1 64*072a4ba8SAndrew Turner = sv_as_f32_u32 (svadd_n_u32_x (pg, b, 0x7f000000)); /* b + 0x7f000000. */ 65*072a4ba8SAndrew Turner /* Offset s to avoid overflow in final result if n is below threshold. */ 66*072a4ba8SAndrew Turner sv_f32_t s2 = sv_as_f32_u32 ( 67*072a4ba8SAndrew Turner svsub_u32_x (pg, e, b)); /* as_u32 (s) - 0x3010...0 + b. */ 68*072a4ba8SAndrew Turner 69*072a4ba8SAndrew Turner /* |n| > 192 => 2^(n/N) overflows. */ 70*072a4ba8SAndrew Turner svbool_t p_cmp2 = svacgt_n_f32 (pg, n, 192.0f); 71*072a4ba8SAndrew Turner 72*072a4ba8SAndrew Turner sv_f32_t r2 = svmul_f32_x (pg, s1, s1); 73*072a4ba8SAndrew Turner sv_f32_t r1 = sv_fma_f32_x (pg, poly, s2, s2); 74*072a4ba8SAndrew Turner r1 = svmul_f32_x (pg, r1, s1); 75*072a4ba8SAndrew Turner sv_f32_t r0 = sv_fma_f32_x (pg, poly, scale, scale); 76*072a4ba8SAndrew Turner 77*072a4ba8SAndrew Turner /* Apply condition 1 then 2. 78*072a4ba8SAndrew Turner Returns r2 if cond2 is true, otherwise 79*072a4ba8SAndrew Turner if cond1 is true then return r1, otherwise return r0. */ 80*072a4ba8SAndrew Turner sv_f32_t r = svsel_f32 (p_cmp1, r1, r0); 81*072a4ba8SAndrew Turner 82*072a4ba8SAndrew Turner return svsel_f32 (p_cmp2, r2, r); 83*072a4ba8SAndrew Turner } 84*072a4ba8SAndrew Turner 85*072a4ba8SAndrew Turner #endif 86*072a4ba8SAndrew Turner 87*072a4ba8SAndrew Turner /* Optimised single-precision SVE exp function. By default this is an SVE port 88*072a4ba8SAndrew Turner of the Neon algorithm from math/. Alternatively, enable a modification of 89*072a4ba8SAndrew Turner that algorithm that looks up scale using SVE FEXPA instruction with 90*072a4ba8SAndrew Turner SV_EXPF_USE_FEXPA. 91*072a4ba8SAndrew Turner 92*072a4ba8SAndrew Turner Worst-case error of the default algorithm is 1.95 ulp: 93*072a4ba8SAndrew Turner __sv_expf(-0x1.4cb74ap+2) got 0x1.6a022cp-8 94*072a4ba8SAndrew Turner want 0x1.6a023p-8. 95*072a4ba8SAndrew Turner 96*072a4ba8SAndrew Turner Worst-case error when using FEXPA is 1.04 ulp: 97*072a4ba8SAndrew Turner __sv_expf(0x1.a8eda4p+1) got 0x1.ba74bcp+4 98*072a4ba8SAndrew Turner want 0x1.ba74bap+4. */ 99*072a4ba8SAndrew Turner sv_f32_t 100*072a4ba8SAndrew Turner __sv_expf_x (sv_f32_t x, const svbool_t pg) 101*072a4ba8SAndrew Turner { 102*072a4ba8SAndrew Turner /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] 103*072a4ba8SAndrew Turner x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ 104*072a4ba8SAndrew Turner 105*072a4ba8SAndrew Turner /* n = round(x/(ln2/N)). */ 106*072a4ba8SAndrew Turner sv_f32_t z = sv_fma_n_f32_x (pg, InvLn2, x, sv_f32 (Shift)); 107*072a4ba8SAndrew Turner sv_f32_t n = svsub_n_f32_x (pg, z, Shift); 108*072a4ba8SAndrew Turner 109*072a4ba8SAndrew Turner /* r = x - n*ln2/N. */ 110*072a4ba8SAndrew Turner sv_f32_t r = sv_fma_n_f32_x (pg, -Ln2hi, n, x); 111*072a4ba8SAndrew Turner r = sv_fma_n_f32_x (pg, -Ln2lo, n, r); 112*072a4ba8SAndrew Turner 113*072a4ba8SAndrew Turner /* scale = 2^(n/N). */ 114*072a4ba8SAndrew Turner #if SV_EXPF_USE_FEXPA 115*072a4ba8SAndrew Turner /* NaNs also need special handling with FEXPA. */ 116*072a4ba8SAndrew Turner svbool_t is_special_case 117*072a4ba8SAndrew Turner = svorr_b_z (pg, svacgt_n_f32 (pg, x, Thres), svcmpne_f32 (pg, x, x)); 118*072a4ba8SAndrew Turner sv_f32_t scale = svexpa_f32 (sv_as_u32_f32 (z)); 119*072a4ba8SAndrew Turner #else 120*072a4ba8SAndrew Turner sv_u32_t e = svlsl_n_u32_x (pg, sv_as_u32_f32 (z), 23); 121*072a4ba8SAndrew Turner svbool_t is_special_case = svacgt_n_f32 (pg, n, Thres); 122*072a4ba8SAndrew Turner sv_f32_t scale = sv_as_f32_u32 (svadd_n_u32_x (pg, e, 0x3f800000)); 123*072a4ba8SAndrew Turner #endif 124*072a4ba8SAndrew Turner 125*072a4ba8SAndrew Turner /* y = exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4. */ 126*072a4ba8SAndrew Turner sv_f32_t r2 = svmul_f32_x (pg, r, r); 127*072a4ba8SAndrew Turner sv_f32_t p = sv_fma_n_f32_x (pg, C (0), r, sv_f32 (C (1))); 128*072a4ba8SAndrew Turner sv_f32_t q = sv_fma_n_f32_x (pg, C (2), r, sv_f32 (C (3))); 129*072a4ba8SAndrew Turner q = sv_fma_f32_x (pg, p, r2, q); 130*072a4ba8SAndrew Turner p = svmul_n_f32_x (pg, r, C (4)); 131*072a4ba8SAndrew Turner sv_f32_t poly = sv_fma_f32_x (pg, q, r2, p); 132*072a4ba8SAndrew Turner 133*072a4ba8SAndrew Turner if (unlikely (svptest_any (pg, is_special_case))) 134*072a4ba8SAndrew Turner #if SV_EXPF_USE_FEXPA 135*072a4ba8SAndrew Turner return special_case (x, sv_fma_f32_x (pg, poly, scale, scale), 136*072a4ba8SAndrew Turner is_special_case); 137*072a4ba8SAndrew Turner #else 138*072a4ba8SAndrew Turner return specialcase (pg, poly, n, e, is_special_case, scale); 139*072a4ba8SAndrew Turner #endif 140*072a4ba8SAndrew Turner 141*072a4ba8SAndrew Turner return sv_fma_f32_x (pg, poly, scale, scale); 142*072a4ba8SAndrew Turner } 143*072a4ba8SAndrew Turner 144*072a4ba8SAndrew Turner PL_ALIAS (__sv_expf_x, _ZGVsMxv_expf) 145*072a4ba8SAndrew Turner 146*072a4ba8SAndrew Turner PL_SIG (SV, F, 1, exp, -9.9, 9.9) 147*072a4ba8SAndrew Turner PL_TEST_ULP (__sv_expf, 1.46) 148*072a4ba8SAndrew Turner PL_TEST_INTERVAL (__sv_expf, 0, 0x1p-23, 40000) 149*072a4ba8SAndrew Turner PL_TEST_INTERVAL (__sv_expf, 0x1p-23, 1, 50000) 150*072a4ba8SAndrew Turner PL_TEST_INTERVAL (__sv_expf, 1, 0x1p23, 50000) 151*072a4ba8SAndrew Turner PL_TEST_INTERVAL (__sv_expf, 0x1p23, inf, 50000) 152*072a4ba8SAndrew Turner PL_TEST_INTERVAL (__sv_expf, -0, -0x1p-23, 40000) 153*072a4ba8SAndrew Turner PL_TEST_INTERVAL (__sv_expf, -0x1p-23, -1, 50000) 154*072a4ba8SAndrew Turner PL_TEST_INTERVAL (__sv_expf, -1, -0x1p23, 50000) 155*072a4ba8SAndrew Turner PL_TEST_INTERVAL (__sv_expf, -0x1p23, -inf, 50000) 156*072a4ba8SAndrew Turner #endif // SV_SUPPORTED 157