xref: /freebsd/contrib/arm-optimized-routines/math/aarch64/sve/sv_poly_generic.h (revision f3087bef11543b42e0d69b708f367097a4118d24)
1 /*
2  * Helpers for evaluating polynomials with various schemes - specific to SVE
3  * but precision-agnostic.
4  *
5  * Copyright (c) 2023-2024, Arm Limited.
6  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
7  */
8 
9 #ifndef VTYPE
10 # error Cannot use poly_generic without defining VTYPE
11 #endif
12 #ifndef STYPE
13 # error Cannot use poly_generic without defining STYPE
14 #endif
15 #ifndef VWRAP
16 # error Cannot use poly_generic without defining VWRAP
17 #endif
18 #ifndef DUP
19 # error Cannot use poly_generic without defining DUP
20 #endif
21 
VWRAP(pairwise_poly_3)22 static inline VTYPE VWRAP (pairwise_poly_3) (svbool_t pg, VTYPE x, VTYPE x2,
23 					     const STYPE *poly)
24 {
25   /* At order 3, Estrin and Pairwise Horner are identical.  */
26   VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
27   VTYPE p23 = svmla_x (pg, DUP (poly[2]), x, poly[3]);
28   return svmla_x (pg, p01, p23, x2);
29 }
30 
VWRAP(estrin_4)31 static inline VTYPE VWRAP (estrin_4) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4,
32 				      const STYPE *poly)
33 {
34   VTYPE p03 = VWRAP (pairwise_poly_3) (pg, x, x2, poly);
35   return svmla_x (pg, p03, x4, poly[4]);
36 }
VWRAP(estrin_5)37 static inline VTYPE VWRAP (estrin_5) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4,
38 				      const STYPE *poly)
39 {
40   VTYPE p03 = VWRAP (pairwise_poly_3) (pg, x, x2, poly);
41   VTYPE p45 = svmla_x (pg, DUP (poly[4]), x, poly[5]);
42   return svmla_x (pg, p03, p45, x4);
43 }
VWRAP(estrin_6)44 static inline VTYPE VWRAP (estrin_6) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4,
45 				      const STYPE *poly)
46 {
47   VTYPE p03 = VWRAP (pairwise_poly_3) (pg, x, x2, poly);
48   VTYPE p45 = svmla_x (pg, DUP (poly[4]), x, poly[5]);
49   VTYPE p46 = svmla_x (pg, p45, x, poly[6]);
50   return svmla_x (pg, p03, p46, x4);
51 }
VWRAP(estrin_7)52 static inline VTYPE VWRAP (estrin_7) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4,
53 				      const STYPE *poly)
54 {
55   VTYPE p03 = VWRAP (pairwise_poly_3) (pg, x, x2, poly);
56   VTYPE p47 = VWRAP (pairwise_poly_3) (pg, x, x2, poly + 4);
57   return svmla_x (pg, p03, p47, x4);
58 }
VWRAP(estrin_8)59 static inline VTYPE VWRAP (estrin_8) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4,
60 				      VTYPE x8, const STYPE *poly)
61 {
62   return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), x8, poly[8]);
63 }
VWRAP(estrin_9)64 static inline VTYPE VWRAP (estrin_9) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4,
65 				      VTYPE x8, const STYPE *poly)
66 {
67   VTYPE p89 = svmla_x (pg, DUP (poly[8]), x, poly[9]);
68   return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), p89, x8);
69 }
VWRAP(estrin_10)70 static inline VTYPE VWRAP (estrin_10) (svbool_t pg, VTYPE x, VTYPE x2,
71 				       VTYPE x4, VTYPE x8, const STYPE *poly)
72 {
73   VTYPE p89 = svmla_x (pg, DUP (poly[8]), x, poly[9]);
74   VTYPE p8_10 = svmla_x (pg, p89, x2, poly[10]);
75   return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), p8_10, x8);
76 }
VWRAP(estrin_11)77 static inline VTYPE VWRAP (estrin_11) (svbool_t pg, VTYPE x, VTYPE x2,
78 				       VTYPE x4, VTYPE x8, const STYPE *poly)
79 {
80   VTYPE p8_11 = VWRAP (pairwise_poly_3) (pg, x, x2, poly + 8);
81   return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), p8_11, x8);
82 }
VWRAP(estrin_12)83 static inline VTYPE VWRAP (estrin_12) (svbool_t pg, VTYPE x, VTYPE x2,
84 				       VTYPE x4, VTYPE x8, const STYPE *poly)
85 {
86   return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly),
87 		  VWRAP (estrin_4) (pg, x, x2, x4, poly + 8), x8);
88 }
VWRAP(estrin_13)89 static inline VTYPE VWRAP (estrin_13) (svbool_t pg, VTYPE x, VTYPE x2,
90 				       VTYPE x4, VTYPE x8, const STYPE *poly)
91 {
92   return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly),
93 		  VWRAP (estrin_5) (pg, x, x2, x4, poly + 8), x8);
94 }
VWRAP(estrin_14)95 static inline VTYPE VWRAP (estrin_14) (svbool_t pg, VTYPE x, VTYPE x2,
96 				       VTYPE x4, VTYPE x8, const STYPE *poly)
97 {
98   return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly),
99 		  VWRAP (estrin_6) (pg, x, x2, x4, poly + 8), x8);
100 }
VWRAP(estrin_15)101 static inline VTYPE VWRAP (estrin_15) (svbool_t pg, VTYPE x, VTYPE x2,
102 				       VTYPE x4, VTYPE x8, const STYPE *poly)
103 {
104   return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly),
105 		  VWRAP (estrin_7) (pg, x, x2, x4, poly + 8), x8);
106 }
VWRAP(estrin_16)107 static inline VTYPE VWRAP (estrin_16) (svbool_t pg, VTYPE x, VTYPE x2,
108 				       VTYPE x4, VTYPE x8, VTYPE x16,
109 				       const STYPE *poly)
110 {
111   return svmla_x (pg, VWRAP (estrin_15) (pg, x, x2, x4, x8, poly), x16,
112 		  poly[16]);
113 }
VWRAP(estrin_17)114 static inline VTYPE VWRAP (estrin_17) (svbool_t pg, VTYPE x, VTYPE x2,
115 				       VTYPE x4, VTYPE x8, VTYPE x16,
116 				       const STYPE *poly)
117 {
118   VTYPE p16_17 = svmla_x (pg, DUP (poly[16]), x, poly[17]);
119   return svmla_x (pg, VWRAP (estrin_15) (pg, x, x2, x4, x8, poly), p16_17,
120 		  x16);
121 }
VWRAP(estrin_18)122 static inline VTYPE VWRAP (estrin_18) (svbool_t pg, VTYPE x, VTYPE x2,
123 				       VTYPE x4, VTYPE x8, VTYPE x16,
124 				       const STYPE *poly)
125 {
126   VTYPE p16_17 = svmla_x (pg, DUP (poly[16]), x, poly[17]);
127   VTYPE p16_18 = svmla_x (pg, p16_17, x2, poly[18]);
128   return svmla_x (pg, VWRAP (estrin_15) (pg, x, x2, x4, x8, poly), p16_18,
129 		  x16);
130 }
VWRAP(estrin_19)131 static inline VTYPE VWRAP (estrin_19) (svbool_t pg, VTYPE x, VTYPE x2,
132 				       VTYPE x4, VTYPE x8, VTYPE x16,
133 				       const STYPE *poly)
134 {
135   return svmla_x (pg, VWRAP (estrin_15) (pg, x, x2, x4, x8, poly),
136 		  VWRAP (pairwise_poly_3) (pg, x, x2, poly + 16), x16);
137 }
138 
VWRAP(horner_3)139 static inline VTYPE VWRAP (horner_3) (svbool_t pg, VTYPE x, const STYPE *poly)
140 {
141   VTYPE p = svmla_x (pg, DUP (poly[2]), x, poly[3]);
142   p = svmad_x (pg, x, p, poly[1]);
143   p = svmad_x (pg, x, p, poly[0]);
144   return p;
145 }
VWRAP(horner_4)146 static inline VTYPE VWRAP (horner_4) (svbool_t pg, VTYPE x, const STYPE *poly)
147 {
148   VTYPE p = svmla_x (pg, DUP (poly[3]), x, poly[4]);
149   p = svmad_x (pg, x, p, poly[2]);
150   p = svmad_x (pg, x, p, poly[1]);
151   p = svmad_x (pg, x, p, poly[0]);
152   return p;
153 }
VWRAP(horner_5)154 static inline VTYPE VWRAP (horner_5) (svbool_t pg, VTYPE x, const STYPE *poly)
155 {
156   return svmad_x (pg, x, VWRAP (horner_4) (pg, x, poly + 1), poly[0]);
157 }
VWRAP(horner_6)158 static inline VTYPE VWRAP (horner_6) (svbool_t pg, VTYPE x, const STYPE *poly)
159 {
160   return svmad_x (pg, x, VWRAP (horner_5) (pg, x, poly + 1), poly[0]);
161 }
VWRAP(horner_7)162 static inline VTYPE VWRAP (horner_7) (svbool_t pg, VTYPE x, const STYPE *poly)
163 {
164   return svmad_x (pg, x, VWRAP (horner_6) (pg, x, poly + 1), poly[0]);
165 }
VWRAP(horner_8)166 static inline VTYPE VWRAP (horner_8) (svbool_t pg, VTYPE x, const STYPE *poly)
167 {
168   return svmad_x (pg, x, VWRAP (horner_7) (pg, x, poly + 1), poly[0]);
169 }
VWRAP(horner_9)170 static inline VTYPE VWRAP (horner_9) (svbool_t pg, VTYPE x, const STYPE *poly)
171 {
172   return svmad_x (pg, x, VWRAP (horner_8) (pg, x, poly + 1), poly[0]);
173 }
174 static inline VTYPE
sv_horner_10_f32_x(svbool_t pg,VTYPE x,const STYPE * poly)175 sv_horner_10_f32_x (svbool_t pg, VTYPE x, const STYPE *poly)
176 {
177   return svmad_x (pg, x, VWRAP (horner_9) (pg, x, poly + 1), poly[0]);
178 }
179 static inline VTYPE
sv_horner_11_f32_x(svbool_t pg,VTYPE x,const STYPE * poly)180 sv_horner_11_f32_x (svbool_t pg, VTYPE x, const STYPE *poly)
181 {
182   return svmad_x (pg, x, sv_horner_10_f32_x (pg, x, poly + 1), poly[0]);
183 }
184 static inline VTYPE
sv_horner_12_f32_x(svbool_t pg,VTYPE x,const STYPE * poly)185 sv_horner_12_f32_x (svbool_t pg, VTYPE x, const STYPE *poly)
186 {
187   return svmad_x (pg, x, sv_horner_11_f32_x (pg, x, poly + 1), poly[0]);
188 }
189 
VWRAP(pw_horner_4)190 static inline VTYPE VWRAP (pw_horner_4) (svbool_t pg, VTYPE x, VTYPE x2,
191 					 const STYPE *poly)
192 {
193   VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
194   VTYPE p23 = svmla_x (pg, DUP (poly[2]), x, poly[3]);
195   VTYPE p;
196   p = svmla_x (pg, p23, x2, poly[4]);
197   p = svmla_x (pg, p01, x2, p);
198   return p;
199 }
VWRAP(pw_horner_5)200 static inline VTYPE VWRAP (pw_horner_5) (svbool_t pg, VTYPE x, VTYPE x2,
201 					 const STYPE *poly)
202 {
203   VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
204   VTYPE p23 = svmla_x (pg, DUP (poly[2]), x, poly[3]);
205   VTYPE p45 = svmla_x (pg, DUP (poly[4]), x, poly[5]);
206   VTYPE p;
207   p = svmla_x (pg, p23, x2, p45);
208   p = svmla_x (pg, p01, x2, p);
209   return p;
210 }
VWRAP(pw_horner_6)211 static inline VTYPE VWRAP (pw_horner_6) (svbool_t pg, VTYPE x, VTYPE x2,
212 					 const STYPE *poly)
213 {
214   VTYPE p26 = VWRAP (pw_horner_4) (pg, x, x2, poly + 2);
215   VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
216   return svmla_x (pg, p01, x2, p26);
217 }
VWRAP(pw_horner_7)218 static inline VTYPE VWRAP (pw_horner_7) (svbool_t pg, VTYPE x, VTYPE x2,
219 					 const STYPE *poly)
220 {
221   VTYPE p27 = VWRAP (pw_horner_5) (pg, x, x2, poly + 2);
222   VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
223   return svmla_x (pg, p01, x2, p27);
224 }
VWRAP(pw_horner_8)225 static inline VTYPE VWRAP (pw_horner_8) (svbool_t pg, VTYPE x, VTYPE x2,
226 					 const STYPE *poly)
227 {
228   VTYPE p28 = VWRAP (pw_horner_6) (pg, x, x2, poly + 2);
229   VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
230   return svmla_x (pg, p01, x2, p28);
231 }
VWRAP(pw_horner_9)232 static inline VTYPE VWRAP (pw_horner_9) (svbool_t pg, VTYPE x, VTYPE x2,
233 					 const STYPE *poly)
234 {
235   VTYPE p29 = VWRAP (pw_horner_7) (pg, x, x2, poly + 2);
236   VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
237   return svmla_x (pg, p01, x2, p29);
238 }
VWRAP(pw_horner_10)239 static inline VTYPE VWRAP (pw_horner_10) (svbool_t pg, VTYPE x, VTYPE x2,
240 					  const STYPE *poly)
241 {
242   VTYPE p2_10 = VWRAP (pw_horner_8) (pg, x, x2, poly + 2);
243   VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
244   return svmla_x (pg, p01, x2, p2_10);
245 }
VWRAP(pw_horner_11)246 static inline VTYPE VWRAP (pw_horner_11) (svbool_t pg, VTYPE x, VTYPE x2,
247 					  const STYPE *poly)
248 {
249   VTYPE p2_11 = VWRAP (pw_horner_9) (pg, x, x2, poly + 2);
250   VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
251   return svmla_x (pg, p01, x2, p2_11);
252 }
VWRAP(pw_horner_12)253 static inline VTYPE VWRAP (pw_horner_12) (svbool_t pg, VTYPE x, VTYPE x2,
254 					  const STYPE *poly)
255 {
256   VTYPE p2_12 = VWRAP (pw_horner_10) (pg, x, x2, poly + 2);
257   VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
258   return svmla_x (pg, p01, x2, p2_12);
259 }
VWRAP(pw_horner_13)260 static inline VTYPE VWRAP (pw_horner_13) (svbool_t pg, VTYPE x, VTYPE x2,
261 					  const STYPE *poly)
262 {
263   VTYPE p2_13 = VWRAP (pw_horner_11) (pg, x, x2, poly + 2);
264   VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
265   return svmla_x (pg, p01, x2, p2_13);
266 }
VWRAP(pw_horner_14)267 static inline VTYPE VWRAP (pw_horner_14) (svbool_t pg, VTYPE x, VTYPE x2,
268 					  const STYPE *poly)
269 {
270   VTYPE p2_14 = VWRAP (pw_horner_12) (pg, x, x2, poly + 2);
271   VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
272   return svmla_x (pg, p01, x2, p2_14);
273 }
VWRAP(pw_horner_15)274 static inline VTYPE VWRAP (pw_horner_15) (svbool_t pg, VTYPE x, VTYPE x2,
275 					  const STYPE *poly)
276 {
277   VTYPE p2_15 = VWRAP (pw_horner_13) (pg, x, x2, poly + 2);
278   VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
279   return svmla_x (pg, p01, x2, p2_15);
280 }
VWRAP(pw_horner_16)281 static inline VTYPE VWRAP (pw_horner_16) (svbool_t pg, VTYPE x, VTYPE x2,
282 					  const STYPE *poly)
283 {
284   VTYPE p2_16 = VWRAP (pw_horner_14) (pg, x, x2, poly + 2);
285   VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
286   return svmla_x (pg, p01, x2, p2_16);
287 }
VWRAP(pw_horner_17)288 static inline VTYPE VWRAP (pw_horner_17) (svbool_t pg, VTYPE x, VTYPE x2,
289 					  const STYPE *poly)
290 {
291   VTYPE p2_17 = VWRAP (pw_horner_15) (pg, x, x2, poly + 2);
292   VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
293   return svmla_x (pg, p01, x2, p2_17);
294 }
VWRAP(pw_horner_18)295 static inline VTYPE VWRAP (pw_horner_18) (svbool_t pg, VTYPE x, VTYPE x2,
296 					  const STYPE *poly)
297 {
298   VTYPE p2_18 = VWRAP (pw_horner_16) (pg, x, x2, poly + 2);
299   VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
300   return svmla_x (pg, p01, x2, p2_18);
301 }
302 
VWRAP(lw_pw_horner_5)303 static inline VTYPE VWRAP (lw_pw_horner_5) (svbool_t pg, VTYPE x, VTYPE x2,
304 					    const STYPE *poly_even,
305 					    const STYPE *poly_odd)
306 {
307   VTYPE c13 = svld1rq (pg, poly_odd);
308 
309   VTYPE p01 = svmla_lane (DUP (poly_even[0]), x, c13, 0);
310   VTYPE p23 = svmla_lane (DUP (poly_even[1]), x, c13, 1);
311   VTYPE p45 = svmla_x (pg, DUP (poly_even[2]), x, poly_odd[2]);
312 
313   VTYPE p;
314   p = svmla_x (pg, p23, x2, p45);
315   p = svmla_x (pg, p01, x2, p);
316   return p;
317 }
VWRAP(lw_pw_horner_9)318 static inline VTYPE VWRAP (lw_pw_horner_9) (svbool_t pg, VTYPE x, VTYPE x2,
319 					    const STYPE *poly_even,
320 					    const STYPE *poly_odd)
321 {
322   VTYPE c13 = svld1rq (pg, poly_odd);
323 
324   VTYPE p49 = VWRAP (lw_pw_horner_5) (pg, x, x2, poly_even + 2, poly_odd + 2);
325   VTYPE p23 = svmla_lane (DUP (poly_even[1]), x, c13, 1);
326 
327   VTYPE p29 = svmla_x (pg, p23, x2, p49);
328   VTYPE p01 = svmla_lane (DUP (poly_even[0]), x, c13, 0);
329 
330   return svmla_x (pg, p01, x2, p29);
331 }
332