1 /*
2 * Helpers for evaluating polynomials with various schemes - specific to SVE
3 * but precision-agnostic.
4 *
5 * Copyright (c) 2023-2024, Arm Limited.
6 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
7 */
8
9 #ifndef VTYPE
10 # error Cannot use poly_generic without defining VTYPE
11 #endif
12 #ifndef STYPE
13 # error Cannot use poly_generic without defining STYPE
14 #endif
15 #ifndef VWRAP
16 # error Cannot use poly_generic without defining VWRAP
17 #endif
18 #ifndef DUP
19 # error Cannot use poly_generic without defining DUP
20 #endif
21
VWRAP(pairwise_poly_3)22 static inline VTYPE VWRAP (pairwise_poly_3) (svbool_t pg, VTYPE x, VTYPE x2,
23 const STYPE *poly)
24 {
25 /* At order 3, Estrin and Pairwise Horner are identical. */
26 VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
27 VTYPE p23 = svmla_x (pg, DUP (poly[2]), x, poly[3]);
28 return svmla_x (pg, p01, p23, x2);
29 }
30
VWRAP(estrin_4)31 static inline VTYPE VWRAP (estrin_4) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4,
32 const STYPE *poly)
33 {
34 VTYPE p03 = VWRAP (pairwise_poly_3) (pg, x, x2, poly);
35 return svmla_x (pg, p03, x4, poly[4]);
36 }
VWRAP(estrin_5)37 static inline VTYPE VWRAP (estrin_5) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4,
38 const STYPE *poly)
39 {
40 VTYPE p03 = VWRAP (pairwise_poly_3) (pg, x, x2, poly);
41 VTYPE p45 = svmla_x (pg, DUP (poly[4]), x, poly[5]);
42 return svmla_x (pg, p03, p45, x4);
43 }
VWRAP(estrin_6)44 static inline VTYPE VWRAP (estrin_6) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4,
45 const STYPE *poly)
46 {
47 VTYPE p03 = VWRAP (pairwise_poly_3) (pg, x, x2, poly);
48 VTYPE p45 = svmla_x (pg, DUP (poly[4]), x, poly[5]);
49 VTYPE p46 = svmla_x (pg, p45, x, poly[6]);
50 return svmla_x (pg, p03, p46, x4);
51 }
VWRAP(estrin_7)52 static inline VTYPE VWRAP (estrin_7) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4,
53 const STYPE *poly)
54 {
55 VTYPE p03 = VWRAP (pairwise_poly_3) (pg, x, x2, poly);
56 VTYPE p47 = VWRAP (pairwise_poly_3) (pg, x, x2, poly + 4);
57 return svmla_x (pg, p03, p47, x4);
58 }
VWRAP(estrin_8)59 static inline VTYPE VWRAP (estrin_8) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4,
60 VTYPE x8, const STYPE *poly)
61 {
62 return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), x8, poly[8]);
63 }
VWRAP(estrin_9)64 static inline VTYPE VWRAP (estrin_9) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4,
65 VTYPE x8, const STYPE *poly)
66 {
67 VTYPE p89 = svmla_x (pg, DUP (poly[8]), x, poly[9]);
68 return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), p89, x8);
69 }
VWRAP(estrin_10)70 static inline VTYPE VWRAP (estrin_10) (svbool_t pg, VTYPE x, VTYPE x2,
71 VTYPE x4, VTYPE x8, const STYPE *poly)
72 {
73 VTYPE p89 = svmla_x (pg, DUP (poly[8]), x, poly[9]);
74 VTYPE p8_10 = svmla_x (pg, p89, x2, poly[10]);
75 return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), p8_10, x8);
76 }
VWRAP(estrin_11)77 static inline VTYPE VWRAP (estrin_11) (svbool_t pg, VTYPE x, VTYPE x2,
78 VTYPE x4, VTYPE x8, const STYPE *poly)
79 {
80 VTYPE p8_11 = VWRAP (pairwise_poly_3) (pg, x, x2, poly + 8);
81 return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), p8_11, x8);
82 }
VWRAP(estrin_12)83 static inline VTYPE VWRAP (estrin_12) (svbool_t pg, VTYPE x, VTYPE x2,
84 VTYPE x4, VTYPE x8, const STYPE *poly)
85 {
86 return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly),
87 VWRAP (estrin_4) (pg, x, x2, x4, poly + 8), x8);
88 }
VWRAP(estrin_13)89 static inline VTYPE VWRAP (estrin_13) (svbool_t pg, VTYPE x, VTYPE x2,
90 VTYPE x4, VTYPE x8, const STYPE *poly)
91 {
92 return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly),
93 VWRAP (estrin_5) (pg, x, x2, x4, poly + 8), x8);
94 }
VWRAP(estrin_14)95 static inline VTYPE VWRAP (estrin_14) (svbool_t pg, VTYPE x, VTYPE x2,
96 VTYPE x4, VTYPE x8, const STYPE *poly)
97 {
98 return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly),
99 VWRAP (estrin_6) (pg, x, x2, x4, poly + 8), x8);
100 }
VWRAP(estrin_15)101 static inline VTYPE VWRAP (estrin_15) (svbool_t pg, VTYPE x, VTYPE x2,
102 VTYPE x4, VTYPE x8, const STYPE *poly)
103 {
104 return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly),
105 VWRAP (estrin_7) (pg, x, x2, x4, poly + 8), x8);
106 }
VWRAP(estrin_16)107 static inline VTYPE VWRAP (estrin_16) (svbool_t pg, VTYPE x, VTYPE x2,
108 VTYPE x4, VTYPE x8, VTYPE x16,
109 const STYPE *poly)
110 {
111 return svmla_x (pg, VWRAP (estrin_15) (pg, x, x2, x4, x8, poly), x16,
112 poly[16]);
113 }
VWRAP(estrin_17)114 static inline VTYPE VWRAP (estrin_17) (svbool_t pg, VTYPE x, VTYPE x2,
115 VTYPE x4, VTYPE x8, VTYPE x16,
116 const STYPE *poly)
117 {
118 VTYPE p16_17 = svmla_x (pg, DUP (poly[16]), x, poly[17]);
119 return svmla_x (pg, VWRAP (estrin_15) (pg, x, x2, x4, x8, poly), p16_17,
120 x16);
121 }
VWRAP(estrin_18)122 static inline VTYPE VWRAP (estrin_18) (svbool_t pg, VTYPE x, VTYPE x2,
123 VTYPE x4, VTYPE x8, VTYPE x16,
124 const STYPE *poly)
125 {
126 VTYPE p16_17 = svmla_x (pg, DUP (poly[16]), x, poly[17]);
127 VTYPE p16_18 = svmla_x (pg, p16_17, x2, poly[18]);
128 return svmla_x (pg, VWRAP (estrin_15) (pg, x, x2, x4, x8, poly), p16_18,
129 x16);
130 }
VWRAP(estrin_19)131 static inline VTYPE VWRAP (estrin_19) (svbool_t pg, VTYPE x, VTYPE x2,
132 VTYPE x4, VTYPE x8, VTYPE x16,
133 const STYPE *poly)
134 {
135 return svmla_x (pg, VWRAP (estrin_15) (pg, x, x2, x4, x8, poly),
136 VWRAP (pairwise_poly_3) (pg, x, x2, poly + 16), x16);
137 }
138
VWRAP(horner_3)139 static inline VTYPE VWRAP (horner_3) (svbool_t pg, VTYPE x, const STYPE *poly)
140 {
141 VTYPE p = svmla_x (pg, DUP (poly[2]), x, poly[3]);
142 p = svmad_x (pg, x, p, poly[1]);
143 p = svmad_x (pg, x, p, poly[0]);
144 return p;
145 }
VWRAP(horner_4)146 static inline VTYPE VWRAP (horner_4) (svbool_t pg, VTYPE x, const STYPE *poly)
147 {
148 VTYPE p = svmla_x (pg, DUP (poly[3]), x, poly[4]);
149 p = svmad_x (pg, x, p, poly[2]);
150 p = svmad_x (pg, x, p, poly[1]);
151 p = svmad_x (pg, x, p, poly[0]);
152 return p;
153 }
VWRAP(horner_5)154 static inline VTYPE VWRAP (horner_5) (svbool_t pg, VTYPE x, const STYPE *poly)
155 {
156 return svmad_x (pg, x, VWRAP (horner_4) (pg, x, poly + 1), poly[0]);
157 }
VWRAP(horner_6)158 static inline VTYPE VWRAP (horner_6) (svbool_t pg, VTYPE x, const STYPE *poly)
159 {
160 return svmad_x (pg, x, VWRAP (horner_5) (pg, x, poly + 1), poly[0]);
161 }
VWRAP(horner_7)162 static inline VTYPE VWRAP (horner_7) (svbool_t pg, VTYPE x, const STYPE *poly)
163 {
164 return svmad_x (pg, x, VWRAP (horner_6) (pg, x, poly + 1), poly[0]);
165 }
VWRAP(horner_8)166 static inline VTYPE VWRAP (horner_8) (svbool_t pg, VTYPE x, const STYPE *poly)
167 {
168 return svmad_x (pg, x, VWRAP (horner_7) (pg, x, poly + 1), poly[0]);
169 }
VWRAP(horner_9)170 static inline VTYPE VWRAP (horner_9) (svbool_t pg, VTYPE x, const STYPE *poly)
171 {
172 return svmad_x (pg, x, VWRAP (horner_8) (pg, x, poly + 1), poly[0]);
173 }
174 static inline VTYPE
sv_horner_10_f32_x(svbool_t pg,VTYPE x,const STYPE * poly)175 sv_horner_10_f32_x (svbool_t pg, VTYPE x, const STYPE *poly)
176 {
177 return svmad_x (pg, x, VWRAP (horner_9) (pg, x, poly + 1), poly[0]);
178 }
179 static inline VTYPE
sv_horner_11_f32_x(svbool_t pg,VTYPE x,const STYPE * poly)180 sv_horner_11_f32_x (svbool_t pg, VTYPE x, const STYPE *poly)
181 {
182 return svmad_x (pg, x, sv_horner_10_f32_x (pg, x, poly + 1), poly[0]);
183 }
184 static inline VTYPE
sv_horner_12_f32_x(svbool_t pg,VTYPE x,const STYPE * poly)185 sv_horner_12_f32_x (svbool_t pg, VTYPE x, const STYPE *poly)
186 {
187 return svmad_x (pg, x, sv_horner_11_f32_x (pg, x, poly + 1), poly[0]);
188 }
189
VWRAP(pw_horner_4)190 static inline VTYPE VWRAP (pw_horner_4) (svbool_t pg, VTYPE x, VTYPE x2,
191 const STYPE *poly)
192 {
193 VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
194 VTYPE p23 = svmla_x (pg, DUP (poly[2]), x, poly[3]);
195 VTYPE p;
196 p = svmla_x (pg, p23, x2, poly[4]);
197 p = svmla_x (pg, p01, x2, p);
198 return p;
199 }
VWRAP(pw_horner_5)200 static inline VTYPE VWRAP (pw_horner_5) (svbool_t pg, VTYPE x, VTYPE x2,
201 const STYPE *poly)
202 {
203 VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
204 VTYPE p23 = svmla_x (pg, DUP (poly[2]), x, poly[3]);
205 VTYPE p45 = svmla_x (pg, DUP (poly[4]), x, poly[5]);
206 VTYPE p;
207 p = svmla_x (pg, p23, x2, p45);
208 p = svmla_x (pg, p01, x2, p);
209 return p;
210 }
VWRAP(pw_horner_6)211 static inline VTYPE VWRAP (pw_horner_6) (svbool_t pg, VTYPE x, VTYPE x2,
212 const STYPE *poly)
213 {
214 VTYPE p26 = VWRAP (pw_horner_4) (pg, x, x2, poly + 2);
215 VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
216 return svmla_x (pg, p01, x2, p26);
217 }
VWRAP(pw_horner_7)218 static inline VTYPE VWRAP (pw_horner_7) (svbool_t pg, VTYPE x, VTYPE x2,
219 const STYPE *poly)
220 {
221 VTYPE p27 = VWRAP (pw_horner_5) (pg, x, x2, poly + 2);
222 VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
223 return svmla_x (pg, p01, x2, p27);
224 }
VWRAP(pw_horner_8)225 static inline VTYPE VWRAP (pw_horner_8) (svbool_t pg, VTYPE x, VTYPE x2,
226 const STYPE *poly)
227 {
228 VTYPE p28 = VWRAP (pw_horner_6) (pg, x, x2, poly + 2);
229 VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
230 return svmla_x (pg, p01, x2, p28);
231 }
VWRAP(pw_horner_9)232 static inline VTYPE VWRAP (pw_horner_9) (svbool_t pg, VTYPE x, VTYPE x2,
233 const STYPE *poly)
234 {
235 VTYPE p29 = VWRAP (pw_horner_7) (pg, x, x2, poly + 2);
236 VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
237 return svmla_x (pg, p01, x2, p29);
238 }
VWRAP(pw_horner_10)239 static inline VTYPE VWRAP (pw_horner_10) (svbool_t pg, VTYPE x, VTYPE x2,
240 const STYPE *poly)
241 {
242 VTYPE p2_10 = VWRAP (pw_horner_8) (pg, x, x2, poly + 2);
243 VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
244 return svmla_x (pg, p01, x2, p2_10);
245 }
VWRAP(pw_horner_11)246 static inline VTYPE VWRAP (pw_horner_11) (svbool_t pg, VTYPE x, VTYPE x2,
247 const STYPE *poly)
248 {
249 VTYPE p2_11 = VWRAP (pw_horner_9) (pg, x, x2, poly + 2);
250 VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
251 return svmla_x (pg, p01, x2, p2_11);
252 }
VWRAP(pw_horner_12)253 static inline VTYPE VWRAP (pw_horner_12) (svbool_t pg, VTYPE x, VTYPE x2,
254 const STYPE *poly)
255 {
256 VTYPE p2_12 = VWRAP (pw_horner_10) (pg, x, x2, poly + 2);
257 VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
258 return svmla_x (pg, p01, x2, p2_12);
259 }
VWRAP(pw_horner_13)260 static inline VTYPE VWRAP (pw_horner_13) (svbool_t pg, VTYPE x, VTYPE x2,
261 const STYPE *poly)
262 {
263 VTYPE p2_13 = VWRAP (pw_horner_11) (pg, x, x2, poly + 2);
264 VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
265 return svmla_x (pg, p01, x2, p2_13);
266 }
VWRAP(pw_horner_14)267 static inline VTYPE VWRAP (pw_horner_14) (svbool_t pg, VTYPE x, VTYPE x2,
268 const STYPE *poly)
269 {
270 VTYPE p2_14 = VWRAP (pw_horner_12) (pg, x, x2, poly + 2);
271 VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
272 return svmla_x (pg, p01, x2, p2_14);
273 }
VWRAP(pw_horner_15)274 static inline VTYPE VWRAP (pw_horner_15) (svbool_t pg, VTYPE x, VTYPE x2,
275 const STYPE *poly)
276 {
277 VTYPE p2_15 = VWRAP (pw_horner_13) (pg, x, x2, poly + 2);
278 VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
279 return svmla_x (pg, p01, x2, p2_15);
280 }
VWRAP(pw_horner_16)281 static inline VTYPE VWRAP (pw_horner_16) (svbool_t pg, VTYPE x, VTYPE x2,
282 const STYPE *poly)
283 {
284 VTYPE p2_16 = VWRAP (pw_horner_14) (pg, x, x2, poly + 2);
285 VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
286 return svmla_x (pg, p01, x2, p2_16);
287 }
VWRAP(pw_horner_17)288 static inline VTYPE VWRAP (pw_horner_17) (svbool_t pg, VTYPE x, VTYPE x2,
289 const STYPE *poly)
290 {
291 VTYPE p2_17 = VWRAP (pw_horner_15) (pg, x, x2, poly + 2);
292 VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
293 return svmla_x (pg, p01, x2, p2_17);
294 }
VWRAP(pw_horner_18)295 static inline VTYPE VWRAP (pw_horner_18) (svbool_t pg, VTYPE x, VTYPE x2,
296 const STYPE *poly)
297 {
298 VTYPE p2_18 = VWRAP (pw_horner_16) (pg, x, x2, poly + 2);
299 VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
300 return svmla_x (pg, p01, x2, p2_18);
301 }
302
VWRAP(lw_pw_horner_5)303 static inline VTYPE VWRAP (lw_pw_horner_5) (svbool_t pg, VTYPE x, VTYPE x2,
304 const STYPE *poly_even,
305 const STYPE *poly_odd)
306 {
307 VTYPE c13 = svld1rq (pg, poly_odd);
308
309 VTYPE p01 = svmla_lane (DUP (poly_even[0]), x, c13, 0);
310 VTYPE p23 = svmla_lane (DUP (poly_even[1]), x, c13, 1);
311 VTYPE p45 = svmla_x (pg, DUP (poly_even[2]), x, poly_odd[2]);
312
313 VTYPE p;
314 p = svmla_x (pg, p23, x2, p45);
315 p = svmla_x (pg, p01, x2, p);
316 return p;
317 }
VWRAP(lw_pw_horner_9)318 static inline VTYPE VWRAP (lw_pw_horner_9) (svbool_t pg, VTYPE x, VTYPE x2,
319 const STYPE *poly_even,
320 const STYPE *poly_odd)
321 {
322 VTYPE c13 = svld1rq (pg, poly_odd);
323
324 VTYPE p49 = VWRAP (lw_pw_horner_5) (pg, x, x2, poly_even + 2, poly_odd + 2);
325 VTYPE p23 = svmla_lane (DUP (poly_even[1]), x, c13, 1);
326
327 VTYPE p29 = svmla_x (pg, p23, x2, p49);
328 VTYPE p01 = svmla_lane (DUP (poly_even[0]), x, c13, 0);
329
330 return svmla_x (pg, p01, x2, p29);
331 }
332