xref: /freebsd/contrib/arm-optimized-routines/math/aarch64/advsimd/v_math.h (revision f3087bef11543b42e0d69b708f367097a4118d24)
/*
 * Vector math abstractions.
 *
 * Copyright (c) 2019-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */
7*f3087befSAndrew Turner 
8*f3087befSAndrew Turner #ifndef _V_MATH_H
9*f3087befSAndrew Turner #define _V_MATH_H
10*f3087befSAndrew Turner 
11*f3087befSAndrew Turner #if !__aarch64__
12*f3087befSAndrew Turner # error "Cannot build without AArch64"
13*f3087befSAndrew Turner #endif
14*f3087befSAndrew Turner 
/* Function attribute selecting the AArch64 vector procedure call
   standard.  */
#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs))

/* Build the vector-ABI symbol name of a routine from its scalar name FUN.
   F variants name the 4-lane single-precision routine (and append the "f"
   suffix to FUN); D variants name the 2-lane double-precision routine.
   1 and 2 give the number of vector arguments ("v" / "vv").  The _L1
   variants add one linear argument with step 4 (float) or 8 (double).  */
#define V_NAME_F1(fun) _ZGVnN4v_##fun##f
#define V_NAME_D1(fun) _ZGVnN2v_##fun
#define V_NAME_F2(fun) _ZGVnN4vv_##fun##f
#define V_NAME_D2(fun) _ZGVnN2vv_##fun
#define V_NAME_F1_L1(fun) _ZGVnN4vl4_##fun##f
#define V_NAME_D1_L1(fun) _ZGVnN2vl8_##fun
23*f3087befSAndrew Turner 
/* When USE_GLIBC_ABI is non-zero, HALF_WIDTH_ALIAS_F1/F2 (fun) define a
   2-lane single-precision entry point for FUN.  It duplicates each 64-bit
   argument into both halves of a 128-bit vector, calls the 4-lane routine
   and returns the low half of the result.  Otherwise both macros expand to
   nothing.  */
#if USE_GLIBC_ABI

# define HALF_WIDTH_ALIAS_F1(fun)                                             \
    float32x2_t VPCS_ATTR _ZGVnN2v_##fun##f (float32x2_t x)                   \
    {                                                                         \
      return vget_low_f32 (_ZGVnN4v_##fun##f (vcombine_f32 (x, x)));          \
    }

# define HALF_WIDTH_ALIAS_F2(fun)                                             \
    float32x2_t VPCS_ATTR _ZGVnN2vv_##fun##f (float32x2_t x, float32x2_t y)   \
    {                                                                         \
      return vget_low_f32 (                                                   \
	  _ZGVnN4vv_##fun##f (vcombine_f32 (x, x), vcombine_f32 (y, y)));     \
    }

#else
# define HALF_WIDTH_ALIAS_F1(fun)
# define HALF_WIDTH_ALIAS_F2(fun)
#endif
43*f3087befSAndrew Turner 
44*f3087befSAndrew Turner #include <stdint.h>
45*f3087befSAndrew Turner #include "math_config.h"
46*f3087befSAndrew Turner #include <arm_neon.h>
47*f3087befSAndrew Turner 
/* Shorthand helpers for declaring constants: a brace-enclosed initializer
   list that repeats X for 2, 4 or 8 elements.  X is pasted once per
   element, so an expression with side effects is evaluated that many
   times.  */
#define V2(X) { X, X }
#define V4(X) { X, X, X, X }
#define V8(X) { X, X, X, X, X, X, X, X }
61*f3087befSAndrew Turner 
62*f3087befSAndrew Turner static inline int
v_any_u16h(uint16x4_t x)63*f3087befSAndrew Turner v_any_u16h (uint16x4_t x)
64*f3087befSAndrew Turner {
65*f3087befSAndrew Turner   return vget_lane_u64 (vreinterpret_u64_u16 (x), 0) != 0;
66*f3087befSAndrew Turner }
67*f3087befSAndrew Turner 
/* Number of lanes in the 32-bit-element vectors used by this header.  */
static inline int
v_lanes32 (void)
{
  return 4;
}
73*f3087befSAndrew Turner 
74*f3087befSAndrew Turner static inline float32x4_t
v_f32(float x)75*f3087befSAndrew Turner v_f32 (float x)
76*f3087befSAndrew Turner {
77*f3087befSAndrew Turner   return (float32x4_t) V4 (x);
78*f3087befSAndrew Turner }
79*f3087befSAndrew Turner static inline uint32x4_t
v_u32(uint32_t x)80*f3087befSAndrew Turner v_u32 (uint32_t x)
81*f3087befSAndrew Turner {
82*f3087befSAndrew Turner   return (uint32x4_t) V4 (x);
83*f3087befSAndrew Turner }
84*f3087befSAndrew Turner static inline int32x4_t
v_s32(int32_t x)85*f3087befSAndrew Turner v_s32 (int32_t x)
86*f3087befSAndrew Turner {
87*f3087befSAndrew Turner   return (int32x4_t) V4 (x);
88*f3087befSAndrew Turner }
89*f3087befSAndrew Turner 
90*f3087befSAndrew Turner /* true if any elements of a v_cond result is non-zero.  */
91*f3087befSAndrew Turner static inline int
v_any_u32(uint32x4_t x)92*f3087befSAndrew Turner v_any_u32 (uint32x4_t x)
93*f3087befSAndrew Turner {
94*f3087befSAndrew Turner   /* assume elements in x are either 0 or -1u.  */
95*f3087befSAndrew Turner   return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0;
96*f3087befSAndrew Turner }
97*f3087befSAndrew Turner static inline int
v_any_u32h(uint32x2_t x)98*f3087befSAndrew Turner v_any_u32h (uint32x2_t x)
99*f3087befSAndrew Turner {
100*f3087befSAndrew Turner   return vget_lane_u64 (vreinterpret_u64_u32 (x), 0) != 0;
101*f3087befSAndrew Turner }
102*f3087befSAndrew Turner static inline float32x4_t
v_lookup_f32(const float * tab,uint32x4_t idx)103*f3087befSAndrew Turner v_lookup_f32 (const float *tab, uint32x4_t idx)
104*f3087befSAndrew Turner {
105*f3087befSAndrew Turner   return (float32x4_t){ tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]] };
106*f3087befSAndrew Turner }
107*f3087befSAndrew Turner static inline uint32x4_t
v_lookup_u32(const uint32_t * tab,uint32x4_t idx)108*f3087befSAndrew Turner v_lookup_u32 (const uint32_t *tab, uint32x4_t idx)
109*f3087befSAndrew Turner {
110*f3087befSAndrew Turner   return (uint32x4_t){ tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]] };
111*f3087befSAndrew Turner }
112*f3087befSAndrew Turner static inline float32x4_t
v_call_f32(float (* f)(float),float32x4_t x,float32x4_t y,uint32x4_t p)113*f3087befSAndrew Turner v_call_f32 (float (*f) (float), float32x4_t x, float32x4_t y, uint32x4_t p)
114*f3087befSAndrew Turner {
115*f3087befSAndrew Turner   return (float32x4_t){ p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1],
116*f3087befSAndrew Turner 			p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3] };
117*f3087befSAndrew Turner }
118*f3087befSAndrew Turner static inline float32x4_t
v_call2_f32(float (* f)(float,float),float32x4_t x1,float32x4_t x2,float32x4_t y,uint32x4_t p)119*f3087befSAndrew Turner v_call2_f32 (float (*f) (float, float), float32x4_t x1, float32x4_t x2,
120*f3087befSAndrew Turner 	     float32x4_t y, uint32x4_t p)
121*f3087befSAndrew Turner {
122*f3087befSAndrew Turner   return (float32x4_t){ p[0] ? f (x1[0], x2[0]) : y[0],
123*f3087befSAndrew Turner 			p[1] ? f (x1[1], x2[1]) : y[1],
124*f3087befSAndrew Turner 			p[2] ? f (x1[2], x2[2]) : y[2],
125*f3087befSAndrew Turner 			p[3] ? f (x1[3], x2[3]) : y[3] };
126*f3087befSAndrew Turner }
127*f3087befSAndrew Turner static inline float32x4_t
v_zerofy_f32(float32x4_t x,uint32x4_t mask)128*f3087befSAndrew Turner v_zerofy_f32 (float32x4_t x, uint32x4_t mask)
129*f3087befSAndrew Turner {
130*f3087befSAndrew Turner   return vreinterpretq_f32_u32 (vbicq_u32 (vreinterpretq_u32_f32 (x), mask));
131*f3087befSAndrew Turner }
132*f3087befSAndrew Turner 
/* Number of lanes in the 64-bit-element vectors used by this header.  */
static inline int
v_lanes64 (void)
{
  return 2;
}
138*f3087befSAndrew Turner static inline float64x2_t
v_f64(double x)139*f3087befSAndrew Turner v_f64 (double x)
140*f3087befSAndrew Turner {
141*f3087befSAndrew Turner   return (float64x2_t) V2 (x);
142*f3087befSAndrew Turner }
143*f3087befSAndrew Turner static inline uint64x2_t
v_u64(uint64_t x)144*f3087befSAndrew Turner v_u64 (uint64_t x)
145*f3087befSAndrew Turner {
146*f3087befSAndrew Turner   return (uint64x2_t) V2 (x);
147*f3087befSAndrew Turner }
148*f3087befSAndrew Turner static inline int64x2_t
v_s64(int64_t x)149*f3087befSAndrew Turner v_s64 (int64_t x)
150*f3087befSAndrew Turner {
151*f3087befSAndrew Turner   return (int64x2_t) V2 (x);
152*f3087befSAndrew Turner }
153*f3087befSAndrew Turner 
154*f3087befSAndrew Turner /* true if any elements of a v_cond result is non-zero.  */
155*f3087befSAndrew Turner static inline int
v_any_u64(uint64x2_t x)156*f3087befSAndrew Turner v_any_u64 (uint64x2_t x)
157*f3087befSAndrew Turner {
158*f3087befSAndrew Turner   /* assume elements in x are either 0 or -1u.  */
159*f3087befSAndrew Turner   return vpaddd_u64 (x) != 0;
160*f3087befSAndrew Turner }
161*f3087befSAndrew Turner static inline float64x2_t
v_lookup_f64(const double * tab,uint64x2_t idx)162*f3087befSAndrew Turner v_lookup_f64 (const double *tab, uint64x2_t idx)
163*f3087befSAndrew Turner {
164*f3087befSAndrew Turner   return (float64x2_t){ tab[idx[0]], tab[idx[1]] };
165*f3087befSAndrew Turner }
166*f3087befSAndrew Turner static inline uint64x2_t
v_lookup_u64(const uint64_t * tab,uint64x2_t idx)167*f3087befSAndrew Turner v_lookup_u64 (const uint64_t *tab, uint64x2_t idx)
168*f3087befSAndrew Turner {
169*f3087befSAndrew Turner   return (uint64x2_t){ tab[idx[0]], tab[idx[1]] };
170*f3087befSAndrew Turner }
171*f3087befSAndrew Turner static inline float64x2_t
v_call_f64(double (* f)(double),float64x2_t x,float64x2_t y,uint64x2_t p)172*f3087befSAndrew Turner v_call_f64 (double (*f) (double), float64x2_t x, float64x2_t y, uint64x2_t p)
173*f3087befSAndrew Turner {
174*f3087befSAndrew Turner   double p1 = p[1];
175*f3087befSAndrew Turner   double x1 = x[1];
176*f3087befSAndrew Turner   if (likely (p[0]))
177*f3087befSAndrew Turner     y[0] = f (x[0]);
178*f3087befSAndrew Turner   if (likely (p1))
179*f3087befSAndrew Turner     y[1] = f (x1);
180*f3087befSAndrew Turner   return y;
181*f3087befSAndrew Turner }
182*f3087befSAndrew Turner 
183*f3087befSAndrew Turner static inline float64x2_t
v_call2_f64(double (* f)(double,double),float64x2_t x1,float64x2_t x2,float64x2_t y,uint64x2_t p)184*f3087befSAndrew Turner v_call2_f64 (double (*f) (double, double), float64x2_t x1, float64x2_t x2,
185*f3087befSAndrew Turner 	     float64x2_t y, uint64x2_t p)
186*f3087befSAndrew Turner {
187*f3087befSAndrew Turner   double p1 = p[1];
188*f3087befSAndrew Turner   double x1h = x1[1];
189*f3087befSAndrew Turner   double x2h = x2[1];
190*f3087befSAndrew Turner   if (likely (p[0]))
191*f3087befSAndrew Turner     y[0] = f (x1[0], x2[0]);
192*f3087befSAndrew Turner   if (likely (p1))
193*f3087befSAndrew Turner     y[1] = f (x1h, x2h);
194*f3087befSAndrew Turner   return y;
195*f3087befSAndrew Turner }
196*f3087befSAndrew Turner static inline float64x2_t
v_zerofy_f64(float64x2_t x,uint64x2_t mask)197*f3087befSAndrew Turner v_zerofy_f64 (float64x2_t x, uint64x2_t mask)
198*f3087befSAndrew Turner {
199*f3087befSAndrew Turner   return vreinterpretq_f64_u64 (vbicq_u64 (vreinterpretq_u64_f64 (x), mask));
200*f3087befSAndrew Turner }
201*f3087befSAndrew Turner 
202*f3087befSAndrew Turner #endif
203