1 /* 2 * Helpers for evaluating polynomials with various schemes - specific to SVE 3 * but precision-agnostic. 4 * 5 * Copyright (c) 2023-2024, Arm Limited. 6 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception 7 */ 8 9 #ifndef VTYPE 10 # error Cannot use poly_generic without defining VTYPE 11 #endif 12 #ifndef STYPE 13 # error Cannot use poly_generic without defining STYPE 14 #endif 15 #ifndef VWRAP 16 # error Cannot use poly_generic without defining VWRAP 17 #endif 18 #ifndef DUP 19 # error Cannot use poly_generic without defining DUP 20 #endif 21 22 static inline VTYPE VWRAP (pairwise_poly_3) (svbool_t pg, VTYPE x, VTYPE x2, 23 const STYPE *poly) 24 { 25 /* At order 3, Estrin and Pairwise Horner are identical. */ 26 VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); 27 VTYPE p23 = svmla_x (pg, DUP (poly[2]), x, poly[3]); 28 return svmla_x (pg, p01, p23, x2); 29 } 30 31 static inline VTYPE VWRAP (estrin_4) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4, 32 const STYPE *poly) 33 { 34 VTYPE p03 = VWRAP (pairwise_poly_3) (pg, x, x2, poly); 35 return svmla_x (pg, p03, x4, poly[4]); 36 } 37 static inline VTYPE VWRAP (estrin_5) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4, 38 const STYPE *poly) 39 { 40 VTYPE p03 = VWRAP (pairwise_poly_3) (pg, x, x2, poly); 41 VTYPE p45 = svmla_x (pg, DUP (poly[4]), x, poly[5]); 42 return svmla_x (pg, p03, p45, x4); 43 } 44 static inline VTYPE VWRAP (estrin_6) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4, 45 const STYPE *poly) 46 { 47 VTYPE p03 = VWRAP (pairwise_poly_3) (pg, x, x2, poly); 48 VTYPE p45 = svmla_x (pg, DUP (poly[4]), x, poly[5]); 49 VTYPE p46 = svmla_x (pg, p45, x, poly[6]); 50 return svmla_x (pg, p03, p46, x4); 51 } 52 static inline VTYPE VWRAP (estrin_7) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4, 53 const STYPE *poly) 54 { 55 VTYPE p03 = VWRAP (pairwise_poly_3) (pg, x, x2, poly); 56 VTYPE p47 = VWRAP (pairwise_poly_3) (pg, x, x2, poly + 4); 57 return svmla_x (pg, p03, p47, x4); 58 } 59 static inline VTYPE VWRAP (estrin_8) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4, 60 VTYPE x8, const STYPE *poly) 61 { 62 return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), x8, poly[8]); 63 } 64 static inline VTYPE VWRAP (estrin_9) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4, 65 VTYPE x8, const STYPE *poly) 66 { 67 VTYPE p89 = svmla_x (pg, DUP (poly[8]), x, poly[9]); 68 return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), p89, x8); 69 } 70 static inline VTYPE VWRAP (estrin_10) (svbool_t pg, VTYPE x, VTYPE x2, 71 VTYPE x4, VTYPE x8, const STYPE *poly) 72 { 73 VTYPE p89 = svmla_x (pg, DUP (poly[8]), x, poly[9]); 74 VTYPE p8_10 = svmla_x (pg, p89, x2, poly[10]); 75 return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), p8_10, x8); 76 } 77 static inline VTYPE VWRAP (estrin_11) (svbool_t pg, VTYPE x, VTYPE x2, 78 VTYPE x4, VTYPE x8, const STYPE *poly) 79 { 80 VTYPE p8_11 = VWRAP (pairwise_poly_3) (pg, x, x2, poly + 8); 81 return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), p8_11, x8); 82 } 83 static inline VTYPE VWRAP (estrin_12) (svbool_t pg, VTYPE x, VTYPE x2, 84 VTYPE x4, VTYPE x8, const STYPE *poly) 85 { 86 return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), 87 VWRAP (estrin_4) (pg, x, x2, x4, poly + 8), x8); 88 } 89 static inline VTYPE VWRAP (estrin_13) (svbool_t pg, VTYPE x, VTYPE x2, 90 VTYPE x4, VTYPE x8, const STYPE *poly) 91 { 92 return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), 93 VWRAP (estrin_5) (pg, x, x2, x4, poly + 8), x8); 94 } 95 static inline VTYPE VWRAP (estrin_14) (svbool_t pg, VTYPE x, VTYPE x2, 96 VTYPE x4, VTYPE x8, const STYPE *poly) 97 { 98 return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), 99 VWRAP (estrin_6) (pg, x, x2, x4, poly + 8), x8); 100 } 101 static inline VTYPE VWRAP (estrin_15) (svbool_t pg, VTYPE x, VTYPE x2, 102 VTYPE x4, VTYPE x8, const STYPE *poly) 103 { 104 return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), 105 VWRAP (estrin_7) (pg, x, x2, x4, poly + 8), x8); 106 } 107 static inline VTYPE VWRAP (estrin_16) (svbool_t pg, VTYPE x, VTYPE x2, 108 VTYPE x4, VTYPE x8, VTYPE x16, 109 const STYPE *poly) 110 { 111 return svmla_x (pg, VWRAP (estrin_15) (pg, x, x2, x4, x8, poly), x16, 112 poly[16]); 113 } 114 static inline VTYPE VWRAP (estrin_17) (svbool_t pg, VTYPE x, VTYPE x2, 115 VTYPE x4, VTYPE x8, VTYPE x16, 116 const STYPE *poly) 117 { 118 VTYPE p16_17 = svmla_x (pg, DUP (poly[16]), x, poly[17]); 119 return svmla_x (pg, VWRAP (estrin_15) (pg, x, x2, x4, x8, poly), p16_17, 120 x16); 121 } 122 static inline VTYPE VWRAP (estrin_18) (svbool_t pg, VTYPE x, VTYPE x2, 123 VTYPE x4, VTYPE x8, VTYPE x16, 124 const STYPE *poly) 125 { 126 VTYPE p16_17 = svmla_x (pg, DUP (poly[16]), x, poly[17]); 127 VTYPE p16_18 = svmla_x (pg, p16_17, x2, poly[18]); 128 return svmla_x (pg, VWRAP (estrin_15) (pg, x, x2, x4, x8, poly), p16_18, 129 x16); 130 } 131 static inline VTYPE VWRAP (estrin_19) (svbool_t pg, VTYPE x, VTYPE x2, 132 VTYPE x4, VTYPE x8, VTYPE x16, 133 const STYPE *poly) 134 { 135 return svmla_x (pg, VWRAP (estrin_15) (pg, x, x2, x4, x8, poly), 136 VWRAP (pairwise_poly_3) (pg, x, x2, poly + 16), x16); 137 } 138 139 static inline VTYPE VWRAP (horner_3) (svbool_t pg, VTYPE x, const STYPE *poly) 140 { 141 VTYPE p = svmla_x (pg, DUP (poly[2]), x, poly[3]); 142 p = svmad_x (pg, x, p, poly[1]); 143 p = svmad_x (pg, x, p, poly[0]); 144 return p; 145 } 146 static inline VTYPE VWRAP (horner_4) (svbool_t pg, VTYPE x, const STYPE *poly) 147 { 148 VTYPE p = svmla_x (pg, DUP (poly[3]), x, poly[4]); 149 p = svmad_x (pg, x, p, poly[2]); 150 p = svmad_x (pg, x, p, poly[1]); 151 p = svmad_x (pg, x, p, poly[0]); 152 return p; 153 } 154 static inline VTYPE VWRAP (horner_5) (svbool_t pg, VTYPE x, const STYPE *poly) 155 { 156 return svmad_x (pg, x, VWRAP (horner_4) (pg, x, poly + 1), poly[0]); 157 } 158 static inline VTYPE VWRAP (horner_6) (svbool_t pg, VTYPE x, const STYPE *poly) 159 { 160 return svmad_x (pg, x, VWRAP (horner_5) (pg, x, poly + 1), poly[0]); 161 } 162 static inline VTYPE VWRAP (horner_7) (svbool_t pg, VTYPE x, const STYPE *poly) 163 { 164 return svmad_x (pg, x, VWRAP (horner_6) (pg, x, poly + 1), poly[0]); 165 } 166 static inline VTYPE VWRAP (horner_8) (svbool_t pg, VTYPE x, const STYPE *poly) 167 { 168 return svmad_x (pg, x, VWRAP (horner_7) (pg, x, poly + 1), poly[0]); 169 } 170 static inline VTYPE VWRAP (horner_9) (svbool_t pg, VTYPE x, const STYPE *poly) 171 { 172 return svmad_x (pg, x, VWRAP (horner_8) (pg, x, poly + 1), poly[0]); 173 } 174 static inline VTYPE 175 sv_horner_10_f32_x (svbool_t pg, VTYPE x, const STYPE *poly) 176 { 177 return svmad_x (pg, x, VWRAP (horner_9) (pg, x, poly + 1), poly[0]); 178 } 179 static inline VTYPE 180 sv_horner_11_f32_x (svbool_t pg, VTYPE x, const STYPE *poly) 181 { 182 return svmad_x (pg, x, sv_horner_10_f32_x (pg, x, poly + 1), poly[0]); 183 } 184 static inline VTYPE 185 sv_horner_12_f32_x (svbool_t pg, VTYPE x, const STYPE *poly) 186 { 187 return svmad_x (pg, x, sv_horner_11_f32_x (pg, x, poly + 1), poly[0]); 188 } 189 190 static inline VTYPE VWRAP (pw_horner_4) (svbool_t pg, VTYPE x, VTYPE x2, 191 const STYPE *poly) 192 { 193 VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); 194 VTYPE p23 = svmla_x (pg, DUP (poly[2]), x, poly[3]); 195 VTYPE p; 196 p = svmla_x (pg, p23, x2, poly[4]); 197 p = svmla_x (pg, p01, x2, p); 198 return p; 199 } 200 static inline VTYPE VWRAP (pw_horner_5) (svbool_t pg, VTYPE x, VTYPE x2, 201 const STYPE *poly) 202 { 203 VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); 204 VTYPE p23 = svmla_x (pg, DUP (poly[2]), x, poly[3]); 205 VTYPE p45 = svmla_x (pg, DUP (poly[4]), x, poly[5]); 206 VTYPE p; 207 p = svmla_x (pg, p23, x2, p45); 208 p = svmla_x (pg, p01, x2, p); 209 return p; 210 } 211 static inline VTYPE VWRAP (pw_horner_6) (svbool_t pg, VTYPE x, VTYPE x2, 212 const STYPE *poly) 213 { 214 VTYPE p26 = VWRAP (pw_horner_4) (pg, x, x2, poly + 2); 215 VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); 216 return svmla_x (pg, p01, x2, p26); 217 } 218 static inline VTYPE VWRAP (pw_horner_7) (svbool_t pg, VTYPE x, VTYPE x2, 219 const STYPE *poly) 220 { 221 VTYPE p27 = VWRAP (pw_horner_5) (pg, x, x2, poly + 2); 222 VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); 223 return svmla_x (pg, p01, x2, p27); 224 } 225 static inline VTYPE VWRAP (pw_horner_8) (svbool_t pg, VTYPE x, VTYPE x2, 226 const STYPE *poly) 227 { 228 VTYPE p28 = VWRAP (pw_horner_6) (pg, x, x2, poly + 2); 229 VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); 230 return svmla_x (pg, p01, x2, p28); 231 } 232 static inline VTYPE VWRAP (pw_horner_9) (svbool_t pg, VTYPE x, VTYPE x2, 233 const STYPE *poly) 234 { 235 VTYPE p29 = VWRAP (pw_horner_7) (pg, x, x2, poly + 2); 236 VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); 237 return svmla_x (pg, p01, x2, p29); 238 } 239 static inline VTYPE VWRAP (pw_horner_10) (svbool_t pg, VTYPE x, VTYPE x2, 240 const STYPE *poly) 241 { 242 VTYPE p2_10 = VWRAP (pw_horner_8) (pg, x, x2, poly + 2); 243 VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); 244 return svmla_x (pg, p01, x2, p2_10); 245 } 246 static inline VTYPE VWRAP (pw_horner_11) (svbool_t pg, VTYPE x, VTYPE x2, 247 const STYPE *poly) 248 { 249 VTYPE p2_11 = VWRAP (pw_horner_9) (pg, x, x2, poly + 2); 250 VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); 251 return svmla_x (pg, p01, x2, p2_11); 252 } 253 static inline VTYPE VWRAP (pw_horner_12) (svbool_t pg, VTYPE x, VTYPE x2, 254 const STYPE *poly) 255 { 256 VTYPE p2_12 = VWRAP (pw_horner_10) (pg, x, x2, poly + 2); 257 VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); 258 return svmla_x (pg, p01, x2, p2_12); 259 } 260 static inline VTYPE VWRAP (pw_horner_13) (svbool_t pg, VTYPE x, VTYPE x2, 261 const STYPE *poly) 262 { 263 VTYPE p2_13 = VWRAP (pw_horner_11) (pg, x, x2, poly + 2); 264 VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); 265 return svmla_x (pg, p01, x2, p2_13); 266 } 267 static inline VTYPE VWRAP (pw_horner_14) (svbool_t pg, VTYPE x, VTYPE x2, 268 const STYPE *poly) 269 { 270 VTYPE p2_14 = VWRAP (pw_horner_12) (pg, x, x2, poly + 2); 271 VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); 272 return svmla_x (pg, p01, x2, p2_14); 273 } 274 static inline VTYPE VWRAP (pw_horner_15) (svbool_t pg, VTYPE x, VTYPE x2, 275 const STYPE *poly) 276 { 277 VTYPE p2_15 = VWRAP (pw_horner_13) (pg, x, x2, poly + 2); 278 VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); 279 return svmla_x (pg, p01, x2, p2_15); 280 } 281 static inline VTYPE VWRAP (pw_horner_16) (svbool_t pg, VTYPE x, VTYPE x2, 282 const STYPE *poly) 283 { 284 VTYPE p2_16 = VWRAP (pw_horner_14) (pg, x, x2, poly + 2); 285 VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); 286 return svmla_x (pg, p01, x2, p2_16); 287 } 288 static inline VTYPE VWRAP (pw_horner_17) (svbool_t pg, VTYPE x, VTYPE x2, 289 const STYPE *poly) 290 { 291 VTYPE p2_17 = VWRAP (pw_horner_15) (pg, x, x2, poly + 2); 292 VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); 293 return svmla_x (pg, p01, x2, p2_17); 294 } 295 static inline VTYPE VWRAP (pw_horner_18) (svbool_t pg, VTYPE x, VTYPE x2, 296 const STYPE *poly) 297 { 298 VTYPE p2_18 = VWRAP (pw_horner_16) (pg, x, x2, poly + 2); 299 VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]); 300 return svmla_x (pg, p01, x2, p2_18); 301 } 302 303 static inline VTYPE VWRAP (lw_pw_horner_5) (svbool_t pg, VTYPE x, VTYPE x2, 304 const STYPE *poly_even, 305 const STYPE *poly_odd) 306 { 307 VTYPE c13 = svld1rq (pg, poly_odd); 308 309 VTYPE p01 = svmla_lane (DUP (poly_even[0]), x, c13, 0); 310 VTYPE p23 = svmla_lane (DUP (poly_even[1]), x, c13, 1); 311 VTYPE p45 = svmla_x (pg, DUP (poly_even[2]), x, poly_odd[2]); 312 313 VTYPE p; 314 p = svmla_x (pg, p23, x2, p45); 315 p = svmla_x (pg, p01, x2, p); 316 return p; 317 } 318 static inline VTYPE VWRAP (lw_pw_horner_9) (svbool_t pg, VTYPE x, VTYPE x2, 319 const STYPE *poly_even, 320 const STYPE *poly_odd) 321 { 322 VTYPE c13 = svld1rq (pg, poly_odd); 323 324 VTYPE p49 = VWRAP (lw_pw_horner_5) (pg, x, x2, poly_even + 2, poly_odd + 2); 325 VTYPE p23 = svmla_lane (DUP (poly_even[1]), x, c13, 1); 326 327 VTYPE p29 = svmla_x (pg, p23, x2, p49); 328 VTYPE p01 = svmla_lane (DUP (poly_even[0]), x, c13, 0); 329 330 return svmla_x (pg, p01, x2, p29); 331 } 332