xref: /freebsd/contrib/arm-optimized-routines/math/poly_generic.h (revision f3087bef11543b42e0d69b708f367097a4118d24)
1*f3087befSAndrew Turner /*
2*f3087befSAndrew Turner  * Generic helpers for evaluating polynomials with various schemes.
3*f3087befSAndrew Turner  *
4*f3087befSAndrew Turner  * Copyright (c) 2023-2024, Arm Limited.
5*f3087befSAndrew Turner  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6*f3087befSAndrew Turner  */
7*f3087befSAndrew Turner 
8*f3087befSAndrew Turner #ifndef VTYPE
9*f3087befSAndrew Turner # error Cannot use poly_generic without defining VTYPE
10*f3087befSAndrew Turner #endif
11*f3087befSAndrew Turner #ifndef VWRAP
12*f3087befSAndrew Turner # error Cannot use poly_generic without defining VWRAP
13*f3087befSAndrew Turner #endif
14*f3087befSAndrew Turner #ifndef FMA
15*f3087befSAndrew Turner # error Cannot use poly_generic without defining FMA
16*f3087befSAndrew Turner #endif
17*f3087befSAndrew Turner 
VWRAP(pairwise_poly_3)18*f3087befSAndrew Turner static inline VTYPE VWRAP (pairwise_poly_3) (VTYPE x, VTYPE x2,
19*f3087befSAndrew Turner 					     const VTYPE *poly)
20*f3087befSAndrew Turner {
21*f3087befSAndrew Turner   /* At order 3, Estrin and Pairwise Horner are identical.  */
22*f3087befSAndrew Turner   VTYPE p01 = FMA (poly[1], x, poly[0]);
23*f3087befSAndrew Turner   VTYPE p23 = FMA (poly[3], x, poly[2]);
24*f3087befSAndrew Turner   return FMA (p23, x2, p01);
25*f3087befSAndrew Turner }
26*f3087befSAndrew Turner 
VWRAP(estrin_4)27*f3087befSAndrew Turner static inline VTYPE VWRAP (estrin_4) (VTYPE x, VTYPE x2, VTYPE x4,
28*f3087befSAndrew Turner 				      const VTYPE *poly)
29*f3087befSAndrew Turner {
30*f3087befSAndrew Turner   VTYPE p03 = VWRAP (pairwise_poly_3) (x, x2, poly);
31*f3087befSAndrew Turner   return FMA (poly[4], x4, p03);
32*f3087befSAndrew Turner }
VWRAP(estrin_5)33*f3087befSAndrew Turner static inline VTYPE VWRAP (estrin_5) (VTYPE x, VTYPE x2, VTYPE x4,
34*f3087befSAndrew Turner 				      const VTYPE *poly)
35*f3087befSAndrew Turner {
36*f3087befSAndrew Turner   VTYPE p03 = VWRAP (pairwise_poly_3) (x, x2, poly);
37*f3087befSAndrew Turner   VTYPE p45 = FMA (poly[5], x, poly[4]);
38*f3087befSAndrew Turner   return FMA (p45, x4, p03);
39*f3087befSAndrew Turner }
VWRAP(estrin_6)40*f3087befSAndrew Turner static inline VTYPE VWRAP (estrin_6) (VTYPE x, VTYPE x2, VTYPE x4,
41*f3087befSAndrew Turner 				      const VTYPE *poly)
42*f3087befSAndrew Turner {
43*f3087befSAndrew Turner   VTYPE p03 = VWRAP (pairwise_poly_3) (x, x2, poly);
44*f3087befSAndrew Turner   VTYPE p45 = FMA (poly[5], x, poly[4]);
45*f3087befSAndrew Turner   VTYPE p46 = FMA (poly[6], x2, p45);
46*f3087befSAndrew Turner   return FMA (p46, x4, p03);
47*f3087befSAndrew Turner }
VWRAP(estrin_7)48*f3087befSAndrew Turner static inline VTYPE VWRAP (estrin_7) (VTYPE x, VTYPE x2, VTYPE x4,
49*f3087befSAndrew Turner 				      const VTYPE *poly)
50*f3087befSAndrew Turner {
51*f3087befSAndrew Turner   VTYPE p03 = VWRAP (pairwise_poly_3) (x, x2, poly);
52*f3087befSAndrew Turner   VTYPE p47 = VWRAP (pairwise_poly_3) (x, x2, poly + 4);
53*f3087befSAndrew Turner   return FMA (p47, x4, p03);
54*f3087befSAndrew Turner }
VWRAP(estrin_8)55*f3087befSAndrew Turner static inline VTYPE VWRAP (estrin_8) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8,
56*f3087befSAndrew Turner 				      const VTYPE *poly)
57*f3087befSAndrew Turner {
58*f3087befSAndrew Turner   return FMA (poly[8], x8, VWRAP (estrin_7) (x, x2, x4, poly));
59*f3087befSAndrew Turner }
VWRAP(estrin_9)60*f3087befSAndrew Turner static inline VTYPE VWRAP (estrin_9) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8,
61*f3087befSAndrew Turner 				      const VTYPE *poly)
62*f3087befSAndrew Turner {
63*f3087befSAndrew Turner   VTYPE p89 = FMA (poly[9], x, poly[8]);
64*f3087befSAndrew Turner   return FMA (p89, x8, VWRAP (estrin_7) (x, x2, x4, poly));
65*f3087befSAndrew Turner }
VWRAP(estrin_10)66*f3087befSAndrew Turner static inline VTYPE VWRAP (estrin_10) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8,
67*f3087befSAndrew Turner 				       const VTYPE *poly)
68*f3087befSAndrew Turner {
69*f3087befSAndrew Turner   VTYPE p89 = FMA (poly[9], x, poly[8]);
70*f3087befSAndrew Turner   VTYPE p8_10 = FMA (poly[10], x2, p89);
71*f3087befSAndrew Turner   return FMA (p8_10, x8, VWRAP (estrin_7) (x, x2, x4, poly));
72*f3087befSAndrew Turner }
VWRAP(estrin_11)73*f3087befSAndrew Turner static inline VTYPE VWRAP (estrin_11) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8,
74*f3087befSAndrew Turner 				       const VTYPE *poly)
75*f3087befSAndrew Turner {
76*f3087befSAndrew Turner   VTYPE p8_11 = VWRAP (pairwise_poly_3) (x, x2, poly + 8);
77*f3087befSAndrew Turner   return FMA (p8_11, x8, VWRAP (estrin_7) (x, x2, x4, poly));
78*f3087befSAndrew Turner }
VWRAP(estrin_12)79*f3087befSAndrew Turner static inline VTYPE VWRAP (estrin_12) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8,
80*f3087befSAndrew Turner 				       const VTYPE *poly)
81*f3087befSAndrew Turner {
82*f3087befSAndrew Turner   return FMA (VWRAP (estrin_4) (x, x2, x4, poly + 8), x8,
83*f3087befSAndrew Turner 	      VWRAP (estrin_7) (x, x2, x4, poly));
84*f3087befSAndrew Turner }
VWRAP(estrin_13)85*f3087befSAndrew Turner static inline VTYPE VWRAP (estrin_13) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8,
86*f3087befSAndrew Turner 				       const VTYPE *poly)
87*f3087befSAndrew Turner {
88*f3087befSAndrew Turner   return FMA (VWRAP (estrin_5) (x, x2, x4, poly + 8), x8,
89*f3087befSAndrew Turner 	      VWRAP (estrin_7) (x, x2, x4, poly));
90*f3087befSAndrew Turner }
VWRAP(estrin_14)91*f3087befSAndrew Turner static inline VTYPE VWRAP (estrin_14) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8,
92*f3087befSAndrew Turner 				       const VTYPE *poly)
93*f3087befSAndrew Turner {
94*f3087befSAndrew Turner   return FMA (VWRAP (estrin_6) (x, x2, x4, poly + 8), x8,
95*f3087befSAndrew Turner 	      VWRAP (estrin_7) (x, x2, x4, poly));
96*f3087befSAndrew Turner }
VWRAP(estrin_15)97*f3087befSAndrew Turner static inline VTYPE VWRAP (estrin_15) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8,
98*f3087befSAndrew Turner 				       const VTYPE *poly)
99*f3087befSAndrew Turner {
100*f3087befSAndrew Turner   return FMA (VWRAP (estrin_7) (x, x2, x4, poly + 8), x8,
101*f3087befSAndrew Turner 	      VWRAP (estrin_7) (x, x2, x4, poly));
102*f3087befSAndrew Turner }
VWRAP(estrin_16)103*f3087befSAndrew Turner static inline VTYPE VWRAP (estrin_16) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8,
104*f3087befSAndrew Turner 				       VTYPE x16, const VTYPE *poly)
105*f3087befSAndrew Turner {
106*f3087befSAndrew Turner   return FMA (poly[16], x16, VWRAP (estrin_15) (x, x2, x4, x8, poly));
107*f3087befSAndrew Turner }
VWRAP(estrin_17)108*f3087befSAndrew Turner static inline VTYPE VWRAP (estrin_17) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8,
109*f3087befSAndrew Turner 				       VTYPE x16, const VTYPE *poly)
110*f3087befSAndrew Turner {
111*f3087befSAndrew Turner   VTYPE p16_17 = FMA (poly[17], x, poly[16]);
112*f3087befSAndrew Turner   return FMA (p16_17, x16, VWRAP (estrin_15) (x, x2, x4, x8, poly));
113*f3087befSAndrew Turner }
VWRAP(estrin_18)114*f3087befSAndrew Turner static inline VTYPE VWRAP (estrin_18) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8,
115*f3087befSAndrew Turner 				       VTYPE x16, const VTYPE *poly)
116*f3087befSAndrew Turner {
117*f3087befSAndrew Turner   VTYPE p16_17 = FMA (poly[17], x, poly[16]);
118*f3087befSAndrew Turner   VTYPE p16_18 = FMA (poly[18], x2, p16_17);
119*f3087befSAndrew Turner   return FMA (p16_18, x16, VWRAP (estrin_15) (x, x2, x4, x8, poly));
120*f3087befSAndrew Turner }
VWRAP(estrin_19)121*f3087befSAndrew Turner static inline VTYPE VWRAP (estrin_19) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8,
122*f3087befSAndrew Turner 				       VTYPE x16, const VTYPE *poly)
123*f3087befSAndrew Turner {
124*f3087befSAndrew Turner   VTYPE p16_19 = VWRAP (pairwise_poly_3) (x, x2, poly + 16);
125*f3087befSAndrew Turner   return FMA (p16_19, x16, VWRAP (estrin_15) (x, x2, x4, x8, poly));
126*f3087befSAndrew Turner }
127*f3087befSAndrew Turner 
VWRAP(horner_2)128*f3087befSAndrew Turner static inline VTYPE VWRAP (horner_2) (VTYPE x, const VTYPE *poly)
129*f3087befSAndrew Turner {
130*f3087befSAndrew Turner   VTYPE p = FMA (poly[2], x, poly[1]);
131*f3087befSAndrew Turner   return FMA (x, p, poly[0]);
132*f3087befSAndrew Turner }
VWRAP(horner_3)133*f3087befSAndrew Turner static inline VTYPE VWRAP (horner_3) (VTYPE x, const VTYPE *poly)
134*f3087befSAndrew Turner {
135*f3087befSAndrew Turner   VTYPE p = FMA (poly[3], x, poly[2]);
136*f3087befSAndrew Turner   p = FMA (x, p, poly[1]);
137*f3087befSAndrew Turner   p = FMA (x, p, poly[0]);
138*f3087befSAndrew Turner   return p;
139*f3087befSAndrew Turner }
VWRAP(horner_4)140*f3087befSAndrew Turner static inline VTYPE VWRAP (horner_4) (VTYPE x, const VTYPE *poly)
141*f3087befSAndrew Turner {
142*f3087befSAndrew Turner   VTYPE p = FMA (poly[4], x, poly[3]);
143*f3087befSAndrew Turner   p = FMA (x, p, poly[2]);
144*f3087befSAndrew Turner   p = FMA (x, p, poly[1]);
145*f3087befSAndrew Turner   p = FMA (x, p, poly[0]);
146*f3087befSAndrew Turner   return p;
147*f3087befSAndrew Turner }
VWRAP(horner_5)148*f3087befSAndrew Turner static inline VTYPE VWRAP (horner_5) (VTYPE x, const VTYPE *poly)
149*f3087befSAndrew Turner {
150*f3087befSAndrew Turner   return FMA (x, VWRAP (horner_4) (x, poly + 1), poly[0]);
151*f3087befSAndrew Turner }
VWRAP(horner_6)152*f3087befSAndrew Turner static inline VTYPE VWRAP (horner_6) (VTYPE x, const VTYPE *poly)
153*f3087befSAndrew Turner {
154*f3087befSAndrew Turner   return FMA (x, VWRAP (horner_5) (x, poly + 1), poly[0]);
155*f3087befSAndrew Turner }
VWRAP(horner_7)156*f3087befSAndrew Turner static inline VTYPE VWRAP (horner_7) (VTYPE x, const VTYPE *poly)
157*f3087befSAndrew Turner {
158*f3087befSAndrew Turner   return FMA (x, VWRAP (horner_6) (x, poly + 1), poly[0]);
159*f3087befSAndrew Turner }
VWRAP(horner_8)160*f3087befSAndrew Turner static inline VTYPE VWRAP (horner_8) (VTYPE x, const VTYPE *poly)
161*f3087befSAndrew Turner {
162*f3087befSAndrew Turner   return FMA (x, VWRAP (horner_7) (x, poly + 1), poly[0]);
163*f3087befSAndrew Turner }
VWRAP(horner_9)164*f3087befSAndrew Turner static inline VTYPE VWRAP (horner_9) (VTYPE x, const VTYPE *poly)
165*f3087befSAndrew Turner {
166*f3087befSAndrew Turner   return FMA (x, VWRAP (horner_8) (x, poly + 1), poly[0]);
167*f3087befSAndrew Turner }
VWRAP(horner_10)168*f3087befSAndrew Turner static inline VTYPE VWRAP (horner_10) (VTYPE x, const VTYPE *poly)
169*f3087befSAndrew Turner {
170*f3087befSAndrew Turner   return FMA (x, VWRAP (horner_9) (x, poly + 1), poly[0]);
171*f3087befSAndrew Turner }
VWRAP(horner_11)172*f3087befSAndrew Turner static inline VTYPE VWRAP (horner_11) (VTYPE x, const VTYPE *poly)
173*f3087befSAndrew Turner {
174*f3087befSAndrew Turner   return FMA (x, VWRAP (horner_10) (x, poly + 1), poly[0]);
175*f3087befSAndrew Turner }
VWRAP(horner_12)176*f3087befSAndrew Turner static inline VTYPE VWRAP (horner_12) (VTYPE x, const VTYPE *poly)
177*f3087befSAndrew Turner {
178*f3087befSAndrew Turner   return FMA (x, VWRAP (horner_11) (x, poly + 1), poly[0]);
179*f3087befSAndrew Turner }
180*f3087befSAndrew Turner 
VWRAP(pw_horner_4)181*f3087befSAndrew Turner static inline VTYPE VWRAP (pw_horner_4) (VTYPE x, VTYPE x2, const VTYPE *poly)
182*f3087befSAndrew Turner {
183*f3087befSAndrew Turner   VTYPE p01 = FMA (poly[1], x, poly[0]);
184*f3087befSAndrew Turner   VTYPE p23 = FMA (poly[3], x, poly[2]);
185*f3087befSAndrew Turner   VTYPE p;
186*f3087befSAndrew Turner   p = FMA (x2, poly[4], p23);
187*f3087befSAndrew Turner   p = FMA (x2, p, p01);
188*f3087befSAndrew Turner   return p;
189*f3087befSAndrew Turner }
VWRAP(pw_horner_5)190*f3087befSAndrew Turner static inline VTYPE VWRAP (pw_horner_5) (VTYPE x, VTYPE x2, const VTYPE *poly)
191*f3087befSAndrew Turner {
192*f3087befSAndrew Turner   VTYPE p01 = FMA (poly[1], x, poly[0]);
193*f3087befSAndrew Turner   VTYPE p23 = FMA (poly[3], x, poly[2]);
194*f3087befSAndrew Turner   VTYPE p45 = FMA (poly[5], x, poly[4]);
195*f3087befSAndrew Turner   VTYPE p;
196*f3087befSAndrew Turner   p = FMA (x2, p45, p23);
197*f3087befSAndrew Turner   p = FMA (x2, p, p01);
198*f3087befSAndrew Turner   return p;
199*f3087befSAndrew Turner }
VWRAP(pw_horner_6)200*f3087befSAndrew Turner static inline VTYPE VWRAP (pw_horner_6) (VTYPE x, VTYPE x2, const VTYPE *poly)
201*f3087befSAndrew Turner {
202*f3087befSAndrew Turner   VTYPE p26 = VWRAP (pw_horner_4) (x, x2, poly + 2);
203*f3087befSAndrew Turner   VTYPE p01 = FMA (poly[1], x, poly[0]);
204*f3087befSAndrew Turner   return FMA (x2, p26, p01);
205*f3087befSAndrew Turner }
VWRAP(pw_horner_7)206*f3087befSAndrew Turner static inline VTYPE VWRAP (pw_horner_7) (VTYPE x, VTYPE x2, const VTYPE *poly)
207*f3087befSAndrew Turner {
208*f3087befSAndrew Turner   VTYPE p27 = VWRAP (pw_horner_5) (x, x2, poly + 2);
209*f3087befSAndrew Turner   VTYPE p01 = FMA (poly[1], x, poly[0]);
210*f3087befSAndrew Turner   return FMA (x2, p27, p01);
211*f3087befSAndrew Turner }
VWRAP(pw_horner_8)212*f3087befSAndrew Turner static inline VTYPE VWRAP (pw_horner_8) (VTYPE x, VTYPE x2, const VTYPE *poly)
213*f3087befSAndrew Turner {
214*f3087befSAndrew Turner   VTYPE p28 = VWRAP (pw_horner_6) (x, x2, poly + 2);
215*f3087befSAndrew Turner   VTYPE p01 = FMA (poly[1], x, poly[0]);
216*f3087befSAndrew Turner   return FMA (x2, p28, p01);
217*f3087befSAndrew Turner }
VWRAP(pw_horner_9)218*f3087befSAndrew Turner static inline VTYPE VWRAP (pw_horner_9) (VTYPE x, VTYPE x2, const VTYPE *poly)
219*f3087befSAndrew Turner {
220*f3087befSAndrew Turner   VTYPE p29 = VWRAP (pw_horner_7) (x, x2, poly + 2);
221*f3087befSAndrew Turner   VTYPE p01 = FMA (poly[1], x, poly[0]);
222*f3087befSAndrew Turner   return FMA (x2, p29, p01);
223*f3087befSAndrew Turner }
VWRAP(pw_horner_10)224*f3087befSAndrew Turner static inline VTYPE VWRAP (pw_horner_10) (VTYPE x, VTYPE x2, const VTYPE *poly)
225*f3087befSAndrew Turner {
226*f3087befSAndrew Turner   VTYPE p2_10 = VWRAP (pw_horner_8) (x, x2, poly + 2);
227*f3087befSAndrew Turner   VTYPE p01 = FMA (poly[1], x, poly[0]);
228*f3087befSAndrew Turner   return FMA (x2, p2_10, p01);
229*f3087befSAndrew Turner }
VWRAP(pw_horner_11)230*f3087befSAndrew Turner static inline VTYPE VWRAP (pw_horner_11) (VTYPE x, VTYPE x2, const VTYPE *poly)
231*f3087befSAndrew Turner {
232*f3087befSAndrew Turner   VTYPE p2_11 = VWRAP (pw_horner_9) (x, x2, poly + 2);
233*f3087befSAndrew Turner   VTYPE p01 = FMA (poly[1], x, poly[0]);
234*f3087befSAndrew Turner   return FMA (x2, p2_11, p01);
235*f3087befSAndrew Turner }
VWRAP(pw_horner_12)236*f3087befSAndrew Turner static inline VTYPE VWRAP (pw_horner_12) (VTYPE x, VTYPE x2, const VTYPE *poly)
237*f3087befSAndrew Turner {
238*f3087befSAndrew Turner   VTYPE p2_12 = VWRAP (pw_horner_10) (x, x2, poly + 2);
239*f3087befSAndrew Turner   VTYPE p01 = FMA (poly[1], x, poly[0]);
240*f3087befSAndrew Turner   return FMA (x2, p2_12, p01);
241*f3087befSAndrew Turner }
VWRAP(pw_horner_13)242*f3087befSAndrew Turner static inline VTYPE VWRAP (pw_horner_13) (VTYPE x, VTYPE x2, const VTYPE *poly)
243*f3087befSAndrew Turner {
244*f3087befSAndrew Turner   VTYPE p2_13 = VWRAP (pw_horner_11) (x, x2, poly + 2);
245*f3087befSAndrew Turner   VTYPE p01 = FMA (poly[1], x, poly[0]);
246*f3087befSAndrew Turner   return FMA (x2, p2_13, p01);
247*f3087befSAndrew Turner }
VWRAP(pw_horner_14)248*f3087befSAndrew Turner static inline VTYPE VWRAP (pw_horner_14) (VTYPE x, VTYPE x2, const VTYPE *poly)
249*f3087befSAndrew Turner {
250*f3087befSAndrew Turner   VTYPE p2_14 = VWRAP (pw_horner_12) (x, x2, poly + 2);
251*f3087befSAndrew Turner   VTYPE p01 = FMA (poly[1], x, poly[0]);
252*f3087befSAndrew Turner   return FMA (x2, p2_14, p01);
253*f3087befSAndrew Turner }
VWRAP(pw_horner_15)254*f3087befSAndrew Turner static inline VTYPE VWRAP (pw_horner_15) (VTYPE x, VTYPE x2, const VTYPE *poly)
255*f3087befSAndrew Turner {
256*f3087befSAndrew Turner   VTYPE p2_15 = VWRAP (pw_horner_13) (x, x2, poly + 2);
257*f3087befSAndrew Turner   VTYPE p01 = FMA (poly[1], x, poly[0]);
258*f3087befSAndrew Turner   return FMA (x2, p2_15, p01);
259*f3087befSAndrew Turner }
VWRAP(pw_horner_16)260*f3087befSAndrew Turner static inline VTYPE VWRAP (pw_horner_16) (VTYPE x, VTYPE x2, const VTYPE *poly)
261*f3087befSAndrew Turner {
262*f3087befSAndrew Turner   VTYPE p2_16 = VWRAP (pw_horner_14) (x, x2, poly + 2);
263*f3087befSAndrew Turner   VTYPE p01 = FMA (poly[1], x, poly[0]);
264*f3087befSAndrew Turner   return FMA (x2, p2_16, p01);
265*f3087befSAndrew Turner }
VWRAP(pw_horner_17)266*f3087befSAndrew Turner static inline VTYPE VWRAP (pw_horner_17) (VTYPE x, VTYPE x2, const VTYPE *poly)
267*f3087befSAndrew Turner {
268*f3087befSAndrew Turner   VTYPE p2_17 = VWRAP (pw_horner_15) (x, x2, poly + 2);
269*f3087befSAndrew Turner   VTYPE p01 = FMA (poly[1], x, poly[0]);
270*f3087befSAndrew Turner   return FMA (x2, p2_17, p01);
271*f3087befSAndrew Turner }
VWRAP(pw_horner_18)272*f3087befSAndrew Turner static inline VTYPE VWRAP (pw_horner_18) (VTYPE x, VTYPE x2, const VTYPE *poly)
273*f3087befSAndrew Turner {
274*f3087befSAndrew Turner   VTYPE p2_18 = VWRAP (pw_horner_16) (x, x2, poly + 2);
275*f3087befSAndrew Turner   VTYPE p01 = FMA (poly[1], x, poly[0]);
276*f3087befSAndrew Turner   return FMA (x2, p2_18, p01);
277*f3087befSAndrew Turner }
278