/*
 * Helpers for evaluating polynomials with various schemes - specific to SVE
 * but precision-agnostic.
 *
 * Copyright (c) 2023-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */
8*f3087befSAndrew Turner
9*f3087befSAndrew Turner #ifndef VTYPE
10*f3087befSAndrew Turner # error Cannot use poly_generic without defining VTYPE
11*f3087befSAndrew Turner #endif
12*f3087befSAndrew Turner #ifndef STYPE
13*f3087befSAndrew Turner # error Cannot use poly_generic without defining STYPE
14*f3087befSAndrew Turner #endif
15*f3087befSAndrew Turner #ifndef VWRAP
16*f3087befSAndrew Turner # error Cannot use poly_generic without defining VWRAP
17*f3087befSAndrew Turner #endif
18*f3087befSAndrew Turner #ifndef DUP
19*f3087befSAndrew Turner # error Cannot use poly_generic without defining DUP
20*f3087befSAndrew Turner #endif
21*f3087befSAndrew Turner
VWRAP(pairwise_poly_3)22*f3087befSAndrew Turner static inline VTYPE VWRAP (pairwise_poly_3) (svbool_t pg, VTYPE x, VTYPE x2,
23*f3087befSAndrew Turner const STYPE *poly)
24*f3087befSAndrew Turner {
25*f3087befSAndrew Turner /* At order 3, Estrin and Pairwise Horner are identical. */
26*f3087befSAndrew Turner VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
27*f3087befSAndrew Turner VTYPE p23 = svmla_x (pg, DUP (poly[2]), x, poly[3]);
28*f3087befSAndrew Turner return svmla_x (pg, p01, p23, x2);
29*f3087befSAndrew Turner }
30*f3087befSAndrew Turner
VWRAP(estrin_4)31*f3087befSAndrew Turner static inline VTYPE VWRAP (estrin_4) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4,
32*f3087befSAndrew Turner const STYPE *poly)
33*f3087befSAndrew Turner {
34*f3087befSAndrew Turner VTYPE p03 = VWRAP (pairwise_poly_3) (pg, x, x2, poly);
35*f3087befSAndrew Turner return svmla_x (pg, p03, x4, poly[4]);
36*f3087befSAndrew Turner }
VWRAP(estrin_5)37*f3087befSAndrew Turner static inline VTYPE VWRAP (estrin_5) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4,
38*f3087befSAndrew Turner const STYPE *poly)
39*f3087befSAndrew Turner {
40*f3087befSAndrew Turner VTYPE p03 = VWRAP (pairwise_poly_3) (pg, x, x2, poly);
41*f3087befSAndrew Turner VTYPE p45 = svmla_x (pg, DUP (poly[4]), x, poly[5]);
42*f3087befSAndrew Turner return svmla_x (pg, p03, p45, x4);
43*f3087befSAndrew Turner }
VWRAP(estrin_6)44*f3087befSAndrew Turner static inline VTYPE VWRAP (estrin_6) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4,
45*f3087befSAndrew Turner const STYPE *poly)
46*f3087befSAndrew Turner {
47*f3087befSAndrew Turner VTYPE p03 = VWRAP (pairwise_poly_3) (pg, x, x2, poly);
48*f3087befSAndrew Turner VTYPE p45 = svmla_x (pg, DUP (poly[4]), x, poly[5]);
49*f3087befSAndrew Turner VTYPE p46 = svmla_x (pg, p45, x, poly[6]);
50*f3087befSAndrew Turner return svmla_x (pg, p03, p46, x4);
51*f3087befSAndrew Turner }
VWRAP(estrin_7)52*f3087befSAndrew Turner static inline VTYPE VWRAP (estrin_7) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4,
53*f3087befSAndrew Turner const STYPE *poly)
54*f3087befSAndrew Turner {
55*f3087befSAndrew Turner VTYPE p03 = VWRAP (pairwise_poly_3) (pg, x, x2, poly);
56*f3087befSAndrew Turner VTYPE p47 = VWRAP (pairwise_poly_3) (pg, x, x2, poly + 4);
57*f3087befSAndrew Turner return svmla_x (pg, p03, p47, x4);
58*f3087befSAndrew Turner }
VWRAP(estrin_8)59*f3087befSAndrew Turner static inline VTYPE VWRAP (estrin_8) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4,
60*f3087befSAndrew Turner VTYPE x8, const STYPE *poly)
61*f3087befSAndrew Turner {
62*f3087befSAndrew Turner return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), x8, poly[8]);
63*f3087befSAndrew Turner }
VWRAP(estrin_9)64*f3087befSAndrew Turner static inline VTYPE VWRAP (estrin_9) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4,
65*f3087befSAndrew Turner VTYPE x8, const STYPE *poly)
66*f3087befSAndrew Turner {
67*f3087befSAndrew Turner VTYPE p89 = svmla_x (pg, DUP (poly[8]), x, poly[9]);
68*f3087befSAndrew Turner return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), p89, x8);
69*f3087befSAndrew Turner }
VWRAP(estrin_10)70*f3087befSAndrew Turner static inline VTYPE VWRAP (estrin_10) (svbool_t pg, VTYPE x, VTYPE x2,
71*f3087befSAndrew Turner VTYPE x4, VTYPE x8, const STYPE *poly)
72*f3087befSAndrew Turner {
73*f3087befSAndrew Turner VTYPE p89 = svmla_x (pg, DUP (poly[8]), x, poly[9]);
74*f3087befSAndrew Turner VTYPE p8_10 = svmla_x (pg, p89, x2, poly[10]);
75*f3087befSAndrew Turner return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), p8_10, x8);
76*f3087befSAndrew Turner }
VWRAP(estrin_11)77*f3087befSAndrew Turner static inline VTYPE VWRAP (estrin_11) (svbool_t pg, VTYPE x, VTYPE x2,
78*f3087befSAndrew Turner VTYPE x4, VTYPE x8, const STYPE *poly)
79*f3087befSAndrew Turner {
80*f3087befSAndrew Turner VTYPE p8_11 = VWRAP (pairwise_poly_3) (pg, x, x2, poly + 8);
81*f3087befSAndrew Turner return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), p8_11, x8);
82*f3087befSAndrew Turner }
VWRAP(estrin_12)83*f3087befSAndrew Turner static inline VTYPE VWRAP (estrin_12) (svbool_t pg, VTYPE x, VTYPE x2,
84*f3087befSAndrew Turner VTYPE x4, VTYPE x8, const STYPE *poly)
85*f3087befSAndrew Turner {
86*f3087befSAndrew Turner return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly),
87*f3087befSAndrew Turner VWRAP (estrin_4) (pg, x, x2, x4, poly + 8), x8);
88*f3087befSAndrew Turner }
VWRAP(estrin_13)89*f3087befSAndrew Turner static inline VTYPE VWRAP (estrin_13) (svbool_t pg, VTYPE x, VTYPE x2,
90*f3087befSAndrew Turner VTYPE x4, VTYPE x8, const STYPE *poly)
91*f3087befSAndrew Turner {
92*f3087befSAndrew Turner return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly),
93*f3087befSAndrew Turner VWRAP (estrin_5) (pg, x, x2, x4, poly + 8), x8);
94*f3087befSAndrew Turner }
VWRAP(estrin_14)95*f3087befSAndrew Turner static inline VTYPE VWRAP (estrin_14) (svbool_t pg, VTYPE x, VTYPE x2,
96*f3087befSAndrew Turner VTYPE x4, VTYPE x8, const STYPE *poly)
97*f3087befSAndrew Turner {
98*f3087befSAndrew Turner return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly),
99*f3087befSAndrew Turner VWRAP (estrin_6) (pg, x, x2, x4, poly + 8), x8);
100*f3087befSAndrew Turner }
VWRAP(estrin_15)101*f3087befSAndrew Turner static inline VTYPE VWRAP (estrin_15) (svbool_t pg, VTYPE x, VTYPE x2,
102*f3087befSAndrew Turner VTYPE x4, VTYPE x8, const STYPE *poly)
103*f3087befSAndrew Turner {
104*f3087befSAndrew Turner return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly),
105*f3087befSAndrew Turner VWRAP (estrin_7) (pg, x, x2, x4, poly + 8), x8);
106*f3087befSAndrew Turner }
VWRAP(estrin_16)107*f3087befSAndrew Turner static inline VTYPE VWRAP (estrin_16) (svbool_t pg, VTYPE x, VTYPE x2,
108*f3087befSAndrew Turner VTYPE x4, VTYPE x8, VTYPE x16,
109*f3087befSAndrew Turner const STYPE *poly)
110*f3087befSAndrew Turner {
111*f3087befSAndrew Turner return svmla_x (pg, VWRAP (estrin_15) (pg, x, x2, x4, x8, poly), x16,
112*f3087befSAndrew Turner poly[16]);
113*f3087befSAndrew Turner }
VWRAP(estrin_17)114*f3087befSAndrew Turner static inline VTYPE VWRAP (estrin_17) (svbool_t pg, VTYPE x, VTYPE x2,
115*f3087befSAndrew Turner VTYPE x4, VTYPE x8, VTYPE x16,
116*f3087befSAndrew Turner const STYPE *poly)
117*f3087befSAndrew Turner {
118*f3087befSAndrew Turner VTYPE p16_17 = svmla_x (pg, DUP (poly[16]), x, poly[17]);
119*f3087befSAndrew Turner return svmla_x (pg, VWRAP (estrin_15) (pg, x, x2, x4, x8, poly), p16_17,
120*f3087befSAndrew Turner x16);
121*f3087befSAndrew Turner }
VWRAP(estrin_18)122*f3087befSAndrew Turner static inline VTYPE VWRAP (estrin_18) (svbool_t pg, VTYPE x, VTYPE x2,
123*f3087befSAndrew Turner VTYPE x4, VTYPE x8, VTYPE x16,
124*f3087befSAndrew Turner const STYPE *poly)
125*f3087befSAndrew Turner {
126*f3087befSAndrew Turner VTYPE p16_17 = svmla_x (pg, DUP (poly[16]), x, poly[17]);
127*f3087befSAndrew Turner VTYPE p16_18 = svmla_x (pg, p16_17, x2, poly[18]);
128*f3087befSAndrew Turner return svmla_x (pg, VWRAP (estrin_15) (pg, x, x2, x4, x8, poly), p16_18,
129*f3087befSAndrew Turner x16);
130*f3087befSAndrew Turner }
VWRAP(estrin_19)131*f3087befSAndrew Turner static inline VTYPE VWRAP (estrin_19) (svbool_t pg, VTYPE x, VTYPE x2,
132*f3087befSAndrew Turner VTYPE x4, VTYPE x8, VTYPE x16,
133*f3087befSAndrew Turner const STYPE *poly)
134*f3087befSAndrew Turner {
135*f3087befSAndrew Turner return svmla_x (pg, VWRAP (estrin_15) (pg, x, x2, x4, x8, poly),
136*f3087befSAndrew Turner VWRAP (pairwise_poly_3) (pg, x, x2, poly + 16), x16);
137*f3087befSAndrew Turner }
138*f3087befSAndrew Turner
VWRAP(horner_3)139*f3087befSAndrew Turner static inline VTYPE VWRAP (horner_3) (svbool_t pg, VTYPE x, const STYPE *poly)
140*f3087befSAndrew Turner {
141*f3087befSAndrew Turner VTYPE p = svmla_x (pg, DUP (poly[2]), x, poly[3]);
142*f3087befSAndrew Turner p = svmad_x (pg, x, p, poly[1]);
143*f3087befSAndrew Turner p = svmad_x (pg, x, p, poly[0]);
144*f3087befSAndrew Turner return p;
145*f3087befSAndrew Turner }
VWRAP(horner_4)146*f3087befSAndrew Turner static inline VTYPE VWRAP (horner_4) (svbool_t pg, VTYPE x, const STYPE *poly)
147*f3087befSAndrew Turner {
148*f3087befSAndrew Turner VTYPE p = svmla_x (pg, DUP (poly[3]), x, poly[4]);
149*f3087befSAndrew Turner p = svmad_x (pg, x, p, poly[2]);
150*f3087befSAndrew Turner p = svmad_x (pg, x, p, poly[1]);
151*f3087befSAndrew Turner p = svmad_x (pg, x, p, poly[0]);
152*f3087befSAndrew Turner return p;
153*f3087befSAndrew Turner }
VWRAP(horner_5)154*f3087befSAndrew Turner static inline VTYPE VWRAP (horner_5) (svbool_t pg, VTYPE x, const STYPE *poly)
155*f3087befSAndrew Turner {
156*f3087befSAndrew Turner return svmad_x (pg, x, VWRAP (horner_4) (pg, x, poly + 1), poly[0]);
157*f3087befSAndrew Turner }
VWRAP(horner_6)158*f3087befSAndrew Turner static inline VTYPE VWRAP (horner_6) (svbool_t pg, VTYPE x, const STYPE *poly)
159*f3087befSAndrew Turner {
160*f3087befSAndrew Turner return svmad_x (pg, x, VWRAP (horner_5) (pg, x, poly + 1), poly[0]);
161*f3087befSAndrew Turner }
VWRAP(horner_7)162*f3087befSAndrew Turner static inline VTYPE VWRAP (horner_7) (svbool_t pg, VTYPE x, const STYPE *poly)
163*f3087befSAndrew Turner {
164*f3087befSAndrew Turner return svmad_x (pg, x, VWRAP (horner_6) (pg, x, poly + 1), poly[0]);
165*f3087befSAndrew Turner }
VWRAP(horner_8)166*f3087befSAndrew Turner static inline VTYPE VWRAP (horner_8) (svbool_t pg, VTYPE x, const STYPE *poly)
167*f3087befSAndrew Turner {
168*f3087befSAndrew Turner return svmad_x (pg, x, VWRAP (horner_7) (pg, x, poly + 1), poly[0]);
169*f3087befSAndrew Turner }
VWRAP(horner_9)170*f3087befSAndrew Turner static inline VTYPE VWRAP (horner_9) (svbool_t pg, VTYPE x, const STYPE *poly)
171*f3087befSAndrew Turner {
172*f3087befSAndrew Turner return svmad_x (pg, x, VWRAP (horner_8) (pg, x, poly + 1), poly[0]);
173*f3087befSAndrew Turner }
174*f3087befSAndrew Turner static inline VTYPE
sv_horner_10_f32_x(svbool_t pg,VTYPE x,const STYPE * poly)175*f3087befSAndrew Turner sv_horner_10_f32_x (svbool_t pg, VTYPE x, const STYPE *poly)
176*f3087befSAndrew Turner {
177*f3087befSAndrew Turner return svmad_x (pg, x, VWRAP (horner_9) (pg, x, poly + 1), poly[0]);
178*f3087befSAndrew Turner }
179*f3087befSAndrew Turner static inline VTYPE
sv_horner_11_f32_x(svbool_t pg,VTYPE x,const STYPE * poly)180*f3087befSAndrew Turner sv_horner_11_f32_x (svbool_t pg, VTYPE x, const STYPE *poly)
181*f3087befSAndrew Turner {
182*f3087befSAndrew Turner return svmad_x (pg, x, sv_horner_10_f32_x (pg, x, poly + 1), poly[0]);
183*f3087befSAndrew Turner }
184*f3087befSAndrew Turner static inline VTYPE
sv_horner_12_f32_x(svbool_t pg,VTYPE x,const STYPE * poly)185*f3087befSAndrew Turner sv_horner_12_f32_x (svbool_t pg, VTYPE x, const STYPE *poly)
186*f3087befSAndrew Turner {
187*f3087befSAndrew Turner return svmad_x (pg, x, sv_horner_11_f32_x (pg, x, poly + 1), poly[0]);
188*f3087befSAndrew Turner }
189*f3087befSAndrew Turner
VWRAP(pw_horner_4)190*f3087befSAndrew Turner static inline VTYPE VWRAP (pw_horner_4) (svbool_t pg, VTYPE x, VTYPE x2,
191*f3087befSAndrew Turner const STYPE *poly)
192*f3087befSAndrew Turner {
193*f3087befSAndrew Turner VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
194*f3087befSAndrew Turner VTYPE p23 = svmla_x (pg, DUP (poly[2]), x, poly[3]);
195*f3087befSAndrew Turner VTYPE p;
196*f3087befSAndrew Turner p = svmla_x (pg, p23, x2, poly[4]);
197*f3087befSAndrew Turner p = svmla_x (pg, p01, x2, p);
198*f3087befSAndrew Turner return p;
199*f3087befSAndrew Turner }
VWRAP(pw_horner_5)200*f3087befSAndrew Turner static inline VTYPE VWRAP (pw_horner_5) (svbool_t pg, VTYPE x, VTYPE x2,
201*f3087befSAndrew Turner const STYPE *poly)
202*f3087befSAndrew Turner {
203*f3087befSAndrew Turner VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
204*f3087befSAndrew Turner VTYPE p23 = svmla_x (pg, DUP (poly[2]), x, poly[3]);
205*f3087befSAndrew Turner VTYPE p45 = svmla_x (pg, DUP (poly[4]), x, poly[5]);
206*f3087befSAndrew Turner VTYPE p;
207*f3087befSAndrew Turner p = svmla_x (pg, p23, x2, p45);
208*f3087befSAndrew Turner p = svmla_x (pg, p01, x2, p);
209*f3087befSAndrew Turner return p;
210*f3087befSAndrew Turner }
VWRAP(pw_horner_6)211*f3087befSAndrew Turner static inline VTYPE VWRAP (pw_horner_6) (svbool_t pg, VTYPE x, VTYPE x2,
212*f3087befSAndrew Turner const STYPE *poly)
213*f3087befSAndrew Turner {
214*f3087befSAndrew Turner VTYPE p26 = VWRAP (pw_horner_4) (pg, x, x2, poly + 2);
215*f3087befSAndrew Turner VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
216*f3087befSAndrew Turner return svmla_x (pg, p01, x2, p26);
217*f3087befSAndrew Turner }
VWRAP(pw_horner_7)218*f3087befSAndrew Turner static inline VTYPE VWRAP (pw_horner_7) (svbool_t pg, VTYPE x, VTYPE x2,
219*f3087befSAndrew Turner const STYPE *poly)
220*f3087befSAndrew Turner {
221*f3087befSAndrew Turner VTYPE p27 = VWRAP (pw_horner_5) (pg, x, x2, poly + 2);
222*f3087befSAndrew Turner VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
223*f3087befSAndrew Turner return svmla_x (pg, p01, x2, p27);
224*f3087befSAndrew Turner }
VWRAP(pw_horner_8)225*f3087befSAndrew Turner static inline VTYPE VWRAP (pw_horner_8) (svbool_t pg, VTYPE x, VTYPE x2,
226*f3087befSAndrew Turner const STYPE *poly)
227*f3087befSAndrew Turner {
228*f3087befSAndrew Turner VTYPE p28 = VWRAP (pw_horner_6) (pg, x, x2, poly + 2);
229*f3087befSAndrew Turner VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
230*f3087befSAndrew Turner return svmla_x (pg, p01, x2, p28);
231*f3087befSAndrew Turner }
VWRAP(pw_horner_9)232*f3087befSAndrew Turner static inline VTYPE VWRAP (pw_horner_9) (svbool_t pg, VTYPE x, VTYPE x2,
233*f3087befSAndrew Turner const STYPE *poly)
234*f3087befSAndrew Turner {
235*f3087befSAndrew Turner VTYPE p29 = VWRAP (pw_horner_7) (pg, x, x2, poly + 2);
236*f3087befSAndrew Turner VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
237*f3087befSAndrew Turner return svmla_x (pg, p01, x2, p29);
238*f3087befSAndrew Turner }
VWRAP(pw_horner_10)239*f3087befSAndrew Turner static inline VTYPE VWRAP (pw_horner_10) (svbool_t pg, VTYPE x, VTYPE x2,
240*f3087befSAndrew Turner const STYPE *poly)
241*f3087befSAndrew Turner {
242*f3087befSAndrew Turner VTYPE p2_10 = VWRAP (pw_horner_8) (pg, x, x2, poly + 2);
243*f3087befSAndrew Turner VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
244*f3087befSAndrew Turner return svmla_x (pg, p01, x2, p2_10);
245*f3087befSAndrew Turner }
VWRAP(pw_horner_11)246*f3087befSAndrew Turner static inline VTYPE VWRAP (pw_horner_11) (svbool_t pg, VTYPE x, VTYPE x2,
247*f3087befSAndrew Turner const STYPE *poly)
248*f3087befSAndrew Turner {
249*f3087befSAndrew Turner VTYPE p2_11 = VWRAP (pw_horner_9) (pg, x, x2, poly + 2);
250*f3087befSAndrew Turner VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
251*f3087befSAndrew Turner return svmla_x (pg, p01, x2, p2_11);
252*f3087befSAndrew Turner }
VWRAP(pw_horner_12)253*f3087befSAndrew Turner static inline VTYPE VWRAP (pw_horner_12) (svbool_t pg, VTYPE x, VTYPE x2,
254*f3087befSAndrew Turner const STYPE *poly)
255*f3087befSAndrew Turner {
256*f3087befSAndrew Turner VTYPE p2_12 = VWRAP (pw_horner_10) (pg, x, x2, poly + 2);
257*f3087befSAndrew Turner VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
258*f3087befSAndrew Turner return svmla_x (pg, p01, x2, p2_12);
259*f3087befSAndrew Turner }
VWRAP(pw_horner_13)260*f3087befSAndrew Turner static inline VTYPE VWRAP (pw_horner_13) (svbool_t pg, VTYPE x, VTYPE x2,
261*f3087befSAndrew Turner const STYPE *poly)
262*f3087befSAndrew Turner {
263*f3087befSAndrew Turner VTYPE p2_13 = VWRAP (pw_horner_11) (pg, x, x2, poly + 2);
264*f3087befSAndrew Turner VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
265*f3087befSAndrew Turner return svmla_x (pg, p01, x2, p2_13);
266*f3087befSAndrew Turner }
VWRAP(pw_horner_14)267*f3087befSAndrew Turner static inline VTYPE VWRAP (pw_horner_14) (svbool_t pg, VTYPE x, VTYPE x2,
268*f3087befSAndrew Turner const STYPE *poly)
269*f3087befSAndrew Turner {
270*f3087befSAndrew Turner VTYPE p2_14 = VWRAP (pw_horner_12) (pg, x, x2, poly + 2);
271*f3087befSAndrew Turner VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
272*f3087befSAndrew Turner return svmla_x (pg, p01, x2, p2_14);
273*f3087befSAndrew Turner }
VWRAP(pw_horner_15)274*f3087befSAndrew Turner static inline VTYPE VWRAP (pw_horner_15) (svbool_t pg, VTYPE x, VTYPE x2,
275*f3087befSAndrew Turner const STYPE *poly)
276*f3087befSAndrew Turner {
277*f3087befSAndrew Turner VTYPE p2_15 = VWRAP (pw_horner_13) (pg, x, x2, poly + 2);
278*f3087befSAndrew Turner VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
279*f3087befSAndrew Turner return svmla_x (pg, p01, x2, p2_15);
280*f3087befSAndrew Turner }
VWRAP(pw_horner_16)281*f3087befSAndrew Turner static inline VTYPE VWRAP (pw_horner_16) (svbool_t pg, VTYPE x, VTYPE x2,
282*f3087befSAndrew Turner const STYPE *poly)
283*f3087befSAndrew Turner {
284*f3087befSAndrew Turner VTYPE p2_16 = VWRAP (pw_horner_14) (pg, x, x2, poly + 2);
285*f3087befSAndrew Turner VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
286*f3087befSAndrew Turner return svmla_x (pg, p01, x2, p2_16);
287*f3087befSAndrew Turner }
VWRAP(pw_horner_17)288*f3087befSAndrew Turner static inline VTYPE VWRAP (pw_horner_17) (svbool_t pg, VTYPE x, VTYPE x2,
289*f3087befSAndrew Turner const STYPE *poly)
290*f3087befSAndrew Turner {
291*f3087befSAndrew Turner VTYPE p2_17 = VWRAP (pw_horner_15) (pg, x, x2, poly + 2);
292*f3087befSAndrew Turner VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
293*f3087befSAndrew Turner return svmla_x (pg, p01, x2, p2_17);
294*f3087befSAndrew Turner }
VWRAP(pw_horner_18)295*f3087befSAndrew Turner static inline VTYPE VWRAP (pw_horner_18) (svbool_t pg, VTYPE x, VTYPE x2,
296*f3087befSAndrew Turner const STYPE *poly)
297*f3087befSAndrew Turner {
298*f3087befSAndrew Turner VTYPE p2_18 = VWRAP (pw_horner_16) (pg, x, x2, poly + 2);
299*f3087befSAndrew Turner VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
300*f3087befSAndrew Turner return svmla_x (pg, p01, x2, p2_18);
301*f3087befSAndrew Turner }
302*f3087befSAndrew Turner
VWRAP(lw_pw_horner_5)303*f3087befSAndrew Turner static inline VTYPE VWRAP (lw_pw_horner_5) (svbool_t pg, VTYPE x, VTYPE x2,
304*f3087befSAndrew Turner const STYPE *poly_even,
305*f3087befSAndrew Turner const STYPE *poly_odd)
306*f3087befSAndrew Turner {
307*f3087befSAndrew Turner VTYPE c13 = svld1rq (pg, poly_odd);
308*f3087befSAndrew Turner
309*f3087befSAndrew Turner VTYPE p01 = svmla_lane (DUP (poly_even[0]), x, c13, 0);
310*f3087befSAndrew Turner VTYPE p23 = svmla_lane (DUP (poly_even[1]), x, c13, 1);
311*f3087befSAndrew Turner VTYPE p45 = svmla_x (pg, DUP (poly_even[2]), x, poly_odd[2]);
312*f3087befSAndrew Turner
313*f3087befSAndrew Turner VTYPE p;
314*f3087befSAndrew Turner p = svmla_x (pg, p23, x2, p45);
315*f3087befSAndrew Turner p = svmla_x (pg, p01, x2, p);
316*f3087befSAndrew Turner return p;
317*f3087befSAndrew Turner }
VWRAP(lw_pw_horner_9)318*f3087befSAndrew Turner static inline VTYPE VWRAP (lw_pw_horner_9) (svbool_t pg, VTYPE x, VTYPE x2,
319*f3087befSAndrew Turner const STYPE *poly_even,
320*f3087befSAndrew Turner const STYPE *poly_odd)
321*f3087befSAndrew Turner {
322*f3087befSAndrew Turner VTYPE c13 = svld1rq (pg, poly_odd);
323*f3087befSAndrew Turner
324*f3087befSAndrew Turner VTYPE p49 = VWRAP (lw_pw_horner_5) (pg, x, x2, poly_even + 2, poly_odd + 2);
325*f3087befSAndrew Turner VTYPE p23 = svmla_lane (DUP (poly_even[1]), x, c13, 1);
326*f3087befSAndrew Turner
327*f3087befSAndrew Turner VTYPE p29 = svmla_x (pg, p23, x2, p49);
328*f3087befSAndrew Turner VTYPE p01 = svmla_lane (DUP (poly_even[0]), x, c13, 0);
329*f3087befSAndrew Turner
330*f3087befSAndrew Turner return svmla_x (pg, p01, x2, p29);
331*f3087befSAndrew Turner }
332