xref: /freebsd/contrib/arm-optimized-routines/pl/math/sv_expf_2u.c (revision 072a4ba82a01476eaee33781ccd241033eefcf0b)
1*072a4ba8SAndrew Turner /*
2*072a4ba8SAndrew Turner  * Single-precision vector e^x function.
3*072a4ba8SAndrew Turner  *
4*072a4ba8SAndrew Turner  * Copyright (c) 2019-2023, Arm Limited.
5*072a4ba8SAndrew Turner  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6*072a4ba8SAndrew Turner  */
7*072a4ba8SAndrew Turner 
8*072a4ba8SAndrew Turner #include "sv_math.h"
9*072a4ba8SAndrew Turner #include "pl_sig.h"
10*072a4ba8SAndrew Turner #include "pl_test.h"
11*072a4ba8SAndrew Turner 
12*072a4ba8SAndrew Turner #if SV_SUPPORTED
13*072a4ba8SAndrew Turner 
14*072a4ba8SAndrew Turner #define C(i) __sv_expf_poly[i]
15*072a4ba8SAndrew Turner 
16*072a4ba8SAndrew Turner #define InvLn2 (0x1.715476p+0f)
17*072a4ba8SAndrew Turner #define Ln2hi (0x1.62e4p-1f)
18*072a4ba8SAndrew Turner #define Ln2lo (0x1.7f7d1cp-20f)
19*072a4ba8SAndrew Turner 
20*072a4ba8SAndrew Turner #if SV_EXPF_USE_FEXPA
21*072a4ba8SAndrew Turner 
22*072a4ba8SAndrew Turner #define Shift (0x1.903f8p17f) /* 1.5*2^17 + 127.  */
23*072a4ba8SAndrew Turner #define Thres                                                                  \
24*072a4ba8SAndrew Turner   (0x1.5d5e2ap+6f) /* Roughly 87.3. For x < -Thres, the result is subnormal    \
25*072a4ba8SAndrew Turner 		      and not handled correctly by FEXPA.  */
26*072a4ba8SAndrew Turner 
27*072a4ba8SAndrew Turner static NOINLINE sv_f32_t
28*072a4ba8SAndrew Turner special_case (sv_f32_t x, sv_f32_t y, svbool_t special)
29*072a4ba8SAndrew Turner {
30*072a4ba8SAndrew Turner   /* The special-case handler from the Neon routine does not handle subnormals
31*072a4ba8SAndrew Turner      in a way that is compatible with FEXPA. For the FEXPA variant we just fall
32*072a4ba8SAndrew Turner      back to scalar expf.  */
33*072a4ba8SAndrew Turner   return sv_call_f32 (expf, x, y, special);
34*072a4ba8SAndrew Turner }
35*072a4ba8SAndrew Turner 
36*072a4ba8SAndrew Turner #else
37*072a4ba8SAndrew Turner 
38*072a4ba8SAndrew Turner #define Shift (0x1.8p23f) /* 1.5 * 2^23.  */
39*072a4ba8SAndrew Turner #define Thres (126.0f)
40*072a4ba8SAndrew Turner 
41*072a4ba8SAndrew Turner /* Special-case handler adapted from Neon variant. Uses s, y and n to produce
42*072a4ba8SAndrew Turner    the final result (normal cases included). It performs an update of all lanes!
43*072a4ba8SAndrew Turner    Therefore:
44*072a4ba8SAndrew Turner    - all previous computation need to be done on all lanes indicated by input
45*072a4ba8SAndrew Turner      pg
46*072a4ba8SAndrew Turner    - we cannot simply apply the special case to the special-case-activated
47*072a4ba8SAndrew Turner      lanes. Besides it is likely that this would not increase performance (no
48*072a4ba8SAndrew Turner      scatter/gather).  */
49*072a4ba8SAndrew Turner static inline sv_f32_t
50*072a4ba8SAndrew Turner specialcase (svbool_t pg, sv_f32_t poly, sv_f32_t n, sv_u32_t e,
51*072a4ba8SAndrew Turner 	     svbool_t p_cmp1, sv_f32_t scale)
52*072a4ba8SAndrew Turner {
53*072a4ba8SAndrew Turner   /* s=2^(n/N) may overflow, break it up into s=s1*s2,
54*072a4ba8SAndrew Turner      such that exp = s + s*y can be computed as s1*(s2+s2*y)
55*072a4ba8SAndrew Turner      and s1*s1 overflows only if n>0.  */
56*072a4ba8SAndrew Turner 
57*072a4ba8SAndrew Turner   /* If n<=0 then set b to 0x820...0, 0 otherwise.  */
58*072a4ba8SAndrew Turner   svbool_t p_sign = svcmple_n_f32 (pg, n, 0.0f); /* n <= 0.  */
59*072a4ba8SAndrew Turner   sv_u32_t b
60*072a4ba8SAndrew Turner     = svdup_n_u32_z (p_sign, 0x82000000); /* Inactive lanes set to 0.  */
61*072a4ba8SAndrew Turner 
62*072a4ba8SAndrew Turner   /* Set s1 to generate overflow depending on sign of exponent n.  */
63*072a4ba8SAndrew Turner   sv_f32_t s1
64*072a4ba8SAndrew Turner     = sv_as_f32_u32 (svadd_n_u32_x (pg, b, 0x7f000000)); /* b + 0x7f000000.  */
65*072a4ba8SAndrew Turner   /* Offset s to avoid overflow in final result if n is below threshold.  */
66*072a4ba8SAndrew Turner   sv_f32_t s2 = sv_as_f32_u32 (
67*072a4ba8SAndrew Turner     svsub_u32_x (pg, e, b)); /* as_u32 (s) - 0x3010...0 + b.  */
68*072a4ba8SAndrew Turner 
69*072a4ba8SAndrew Turner   /* |n| > 192 => 2^(n/N) overflows.  */
70*072a4ba8SAndrew Turner   svbool_t p_cmp2 = svacgt_n_f32 (pg, n, 192.0f);
71*072a4ba8SAndrew Turner 
72*072a4ba8SAndrew Turner   sv_f32_t r2 = svmul_f32_x (pg, s1, s1);
73*072a4ba8SAndrew Turner   sv_f32_t r1 = sv_fma_f32_x (pg, poly, s2, s2);
74*072a4ba8SAndrew Turner   r1 = svmul_f32_x (pg, r1, s1);
75*072a4ba8SAndrew Turner   sv_f32_t r0 = sv_fma_f32_x (pg, poly, scale, scale);
76*072a4ba8SAndrew Turner 
77*072a4ba8SAndrew Turner   /* Apply condition 1 then 2.
78*072a4ba8SAndrew Turner      Returns r2 if cond2 is true, otherwise
79*072a4ba8SAndrew Turner      if cond1 is true then return r1, otherwise return r0.  */
80*072a4ba8SAndrew Turner   sv_f32_t r = svsel_f32 (p_cmp1, r1, r0);
81*072a4ba8SAndrew Turner 
82*072a4ba8SAndrew Turner   return svsel_f32 (p_cmp2, r2, r);
83*072a4ba8SAndrew Turner }
84*072a4ba8SAndrew Turner 
85*072a4ba8SAndrew Turner #endif
86*072a4ba8SAndrew Turner 
87*072a4ba8SAndrew Turner /* Optimised single-precision SVE exp function. By default this is an SVE port
88*072a4ba8SAndrew Turner    of the Neon algorithm from math/. Alternatively, enable a modification of
89*072a4ba8SAndrew Turner    that algorithm that looks up scale using SVE FEXPA instruction with
90*072a4ba8SAndrew Turner    SV_EXPF_USE_FEXPA.
91*072a4ba8SAndrew Turner 
92*072a4ba8SAndrew Turner    Worst-case error of the default algorithm is 1.95 ulp:
93*072a4ba8SAndrew Turner    __sv_expf(-0x1.4cb74ap+2) got 0x1.6a022cp-8
94*072a4ba8SAndrew Turner 			     want 0x1.6a023p-8.
95*072a4ba8SAndrew Turner 
96*072a4ba8SAndrew Turner    Worst-case error when using FEXPA is 1.04 ulp:
97*072a4ba8SAndrew Turner    __sv_expf(0x1.a8eda4p+1) got 0x1.ba74bcp+4
98*072a4ba8SAndrew Turner 			   want 0x1.ba74bap+4.  */
99*072a4ba8SAndrew Turner sv_f32_t
100*072a4ba8SAndrew Turner __sv_expf_x (sv_f32_t x, const svbool_t pg)
101*072a4ba8SAndrew Turner {
102*072a4ba8SAndrew Turner   /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
103*072a4ba8SAndrew Turner      x = ln2*n + r, with r in [-ln2/2, ln2/2].  */
104*072a4ba8SAndrew Turner 
105*072a4ba8SAndrew Turner   /* n = round(x/(ln2/N)).  */
106*072a4ba8SAndrew Turner   sv_f32_t z = sv_fma_n_f32_x (pg, InvLn2, x, sv_f32 (Shift));
107*072a4ba8SAndrew Turner   sv_f32_t n = svsub_n_f32_x (pg, z, Shift);
108*072a4ba8SAndrew Turner 
109*072a4ba8SAndrew Turner   /* r = x - n*ln2/N.  */
110*072a4ba8SAndrew Turner   sv_f32_t r = sv_fma_n_f32_x (pg, -Ln2hi, n, x);
111*072a4ba8SAndrew Turner   r = sv_fma_n_f32_x (pg, -Ln2lo, n, r);
112*072a4ba8SAndrew Turner 
113*072a4ba8SAndrew Turner /* scale = 2^(n/N).  */
114*072a4ba8SAndrew Turner #if SV_EXPF_USE_FEXPA
115*072a4ba8SAndrew Turner   /* NaNs also need special handling with FEXPA.  */
116*072a4ba8SAndrew Turner   svbool_t is_special_case
117*072a4ba8SAndrew Turner     = svorr_b_z (pg, svacgt_n_f32 (pg, x, Thres), svcmpne_f32 (pg, x, x));
118*072a4ba8SAndrew Turner   sv_f32_t scale = svexpa_f32 (sv_as_u32_f32 (z));
119*072a4ba8SAndrew Turner #else
120*072a4ba8SAndrew Turner   sv_u32_t e = svlsl_n_u32_x (pg, sv_as_u32_f32 (z), 23);
121*072a4ba8SAndrew Turner   svbool_t is_special_case = svacgt_n_f32 (pg, n, Thres);
122*072a4ba8SAndrew Turner   sv_f32_t scale = sv_as_f32_u32 (svadd_n_u32_x (pg, e, 0x3f800000));
123*072a4ba8SAndrew Turner #endif
124*072a4ba8SAndrew Turner 
125*072a4ba8SAndrew Turner   /* y = exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4.  */
126*072a4ba8SAndrew Turner   sv_f32_t r2 = svmul_f32_x (pg, r, r);
127*072a4ba8SAndrew Turner   sv_f32_t p = sv_fma_n_f32_x (pg, C (0), r, sv_f32 (C (1)));
128*072a4ba8SAndrew Turner   sv_f32_t q = sv_fma_n_f32_x (pg, C (2), r, sv_f32 (C (3)));
129*072a4ba8SAndrew Turner   q = sv_fma_f32_x (pg, p, r2, q);
130*072a4ba8SAndrew Turner   p = svmul_n_f32_x (pg, r, C (4));
131*072a4ba8SAndrew Turner   sv_f32_t poly = sv_fma_f32_x (pg, q, r2, p);
132*072a4ba8SAndrew Turner 
133*072a4ba8SAndrew Turner   if (unlikely (svptest_any (pg, is_special_case)))
134*072a4ba8SAndrew Turner #if SV_EXPF_USE_FEXPA
135*072a4ba8SAndrew Turner     return special_case (x, sv_fma_f32_x (pg, poly, scale, scale),
136*072a4ba8SAndrew Turner 			 is_special_case);
137*072a4ba8SAndrew Turner #else
138*072a4ba8SAndrew Turner     return specialcase (pg, poly, n, e, is_special_case, scale);
139*072a4ba8SAndrew Turner #endif
140*072a4ba8SAndrew Turner 
141*072a4ba8SAndrew Turner   return sv_fma_f32_x (pg, poly, scale, scale);
142*072a4ba8SAndrew Turner }
143*072a4ba8SAndrew Turner 
144*072a4ba8SAndrew Turner PL_ALIAS (__sv_expf_x, _ZGVsMxv_expf)
145*072a4ba8SAndrew Turner 
146*072a4ba8SAndrew Turner PL_SIG (SV, F, 1, exp, -9.9, 9.9)
147*072a4ba8SAndrew Turner PL_TEST_ULP (__sv_expf, 1.46)
148*072a4ba8SAndrew Turner PL_TEST_INTERVAL (__sv_expf, 0, 0x1p-23, 40000)
149*072a4ba8SAndrew Turner PL_TEST_INTERVAL (__sv_expf, 0x1p-23, 1, 50000)
150*072a4ba8SAndrew Turner PL_TEST_INTERVAL (__sv_expf, 1, 0x1p23, 50000)
151*072a4ba8SAndrew Turner PL_TEST_INTERVAL (__sv_expf, 0x1p23, inf, 50000)
152*072a4ba8SAndrew Turner PL_TEST_INTERVAL (__sv_expf, -0, -0x1p-23, 40000)
153*072a4ba8SAndrew Turner PL_TEST_INTERVAL (__sv_expf, -0x1p-23, -1, 50000)
154*072a4ba8SAndrew Turner PL_TEST_INTERVAL (__sv_expf, -1, -0x1p23, 50000)
155*072a4ba8SAndrew Turner PL_TEST_INTERVAL (__sv_expf, -0x1p23, -inf, 50000)
156*072a4ba8SAndrew Turner #endif // SV_SUPPORTED
157