xref: /freebsd/contrib/llvm-project/clang/lib/Headers/fmaintrin.h (revision 02e9120893770924227138ba49df1edb3896112a)
1 /*===---- fmaintrin.h - FMA intrinsics -------------------------------------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 
10 #ifndef __IMMINTRIN_H
11 #error "Never use <fmaintrin.h> directly; include <immintrin.h> instead."
12 #endif
13 
14 #ifndef __FMAINTRIN_H
15 #define __FMAINTRIN_H
16 
/* Attribute sets applied to every intrinsic defined in this header. Both
 * request __always_inline__ and __nodebug__ and enable the "fma" target
 * feature; they differ only in the __min_vector_width__ value (128 or 256).
 */
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("fma"),            \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("fma"),            \
                 __min_vector_width__(256)))
20 
21 /// Computes a multiply-add of 128-bit vectors of [4 x float].
22 ///    For each element, computes <c> (__A * __B) + __C </c>.
23 ///
24 /// \headerfile <immintrin.h>
25 ///
26 /// This intrinsic corresponds to the \c VFMADD213PS instruction.
27 ///
28 /// \param __A
29 ///    A 128-bit vector of [4 x float] containing the multiplicand.
30 /// \param __B
31 ///    A 128-bit vector of [4 x float] containing the multiplier.
32 /// \param __C
33 ///    A 128-bit vector of [4 x float] containing the addend.
34 /// \returns A 128-bit vector of [4 x float] containing the result.
35 static __inline__ __m128 __DEFAULT_FN_ATTRS128
36 _mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
37 {
38   return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
39 }
40 
41 /// Computes a multiply-add of 128-bit vectors of [2 x double].
42 ///    For each element, computes <c> (__A * __B) + __C </c>.
43 ///
44 /// \headerfile <immintrin.h>
45 ///
46 /// This intrinsic corresponds to the \c VFMADD213PD instruction.
47 ///
48 /// \param __A
49 ///    A 128-bit vector of [2 x double] containing the multiplicand.
50 /// \param __B
51 ///    A 128-bit vector of [2 x double] containing the multiplier.
52 /// \param __C
53 ///    A 128-bit vector of [2 x double] containing the addend.
54 /// \returns A 128-bit [2 x double] vector containing the result.
55 static __inline__ __m128d __DEFAULT_FN_ATTRS128
56 _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
57 {
58   return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
59 }
60 
61 /// Computes a scalar multiply-add of the single-precision values in the
62 ///    low 32 bits of 128-bit vectors of [4 x float].
63 /// \code
64 /// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
65 /// result[127:32] = __A[127:32]
66 /// \endcode
67 ///
68 /// \headerfile <immintrin.h>
69 ///
70 /// This intrinsic corresponds to the \c VFMADD213SS instruction.
71 ///
72 /// \param __A
73 ///    A 128-bit vector of [4 x float] containing the multiplicand in the low
74 ///    32 bits.
75 /// \param __B
76 ///    A 128-bit vector of [4 x float] containing the multiplier in the low
77 ///    32 bits.
78 /// \param __C
79 ///    A 128-bit vector of [4 x float] containing the addend in the low
80 ///    32 bits.
81 /// \returns A 128-bit vector of [4 x float] containing the result in the low
82 ///    32 bits and a copy of \a __A[127:32] in the upper 96 bits.
83 static __inline__ __m128 __DEFAULT_FN_ATTRS128
84 _mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
85 {
86   return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
87 }
88 
89 /// Computes a scalar multiply-add of the double-precision values in the
90 ///    low 64 bits of 128-bit vectors of [2 x double].
91 /// \code
92 /// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
93 /// result[127:64] = __A[127:64]
94 /// \endcode
95 ///
96 /// \headerfile <immintrin.h>
97 ///
98 /// This intrinsic corresponds to the \c VFMADD213SD instruction.
99 ///
100 /// \param __A
101 ///    A 128-bit vector of [2 x double] containing the multiplicand in the low
102 ///    64 bits.
103 /// \param __B
104 ///    A 128-bit vector of [2 x double] containing the multiplier in the low
105 ///    64 bits.
106 /// \param __C
107 ///    A 128-bit vector of [2 x double] containing the addend in the low
108 ///    64 bits.
109 /// \returns A 128-bit vector of [2 x double] containing the result in the low
110 ///    64 bits and a copy of \a __A[127:64] in the upper 64 bits.
111 static __inline__ __m128d __DEFAULT_FN_ATTRS128
112 _mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
113 {
114   return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, (__v2df)__C);
115 }
116 
117 /// Computes a multiply-subtract of 128-bit vectors of [4 x float].
118 ///    For each element, computes <c> (__A * __B) - __C </c>.
119 ///
120 /// \headerfile <immintrin.h>
121 ///
122 /// This intrinsic corresponds to the \c VFMSUB213PS instruction.
123 ///
124 /// \param __A
125 ///    A 128-bit vector of [4 x float] containing the multiplicand.
126 /// \param __B
127 ///    A 128-bit vector of [4 x float] containing the multiplier.
128 /// \param __C
129 ///    A 128-bit vector of [4 x float] containing the subtrahend.
130 /// \returns A 128-bit vector of [4 x float] containing the result.
131 static __inline__ __m128 __DEFAULT_FN_ATTRS128
132 _mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
133 {
134   return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
135 }
136 
137 /// Computes a multiply-subtract of 128-bit vectors of [2 x double].
138 ///    For each element, computes <c> (__A * __B) - __C </c>.
139 ///
140 /// \headerfile <immintrin.h>
141 ///
142 /// This intrinsic corresponds to the \c VFMSUB213PD instruction.
143 ///
144 /// \param __A
145 ///    A 128-bit vector of [2 x double] containing the multiplicand.
146 /// \param __B
147 ///    A 128-bit vector of [2 x double] containing the multiplier.
148 /// \param __C
149 ///    A 128-bit vector of [2 x double] containing the addend.
150 /// \returns A 128-bit vector of [2 x double] containing the result.
151 static __inline__ __m128d __DEFAULT_FN_ATTRS128
152 _mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
153 {
154   return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
155 }
156 
157 /// Computes a scalar multiply-subtract of the single-precision values in
158 ///    the low 32 bits of 128-bit vectors of [4 x float].
159 /// \code
160 /// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
161 /// result[127:32] = __A[127:32]
162 /// \endcode
163 ///
164 /// \headerfile <immintrin.h>
165 ///
166 /// This intrinsic corresponds to the \c VFMSUB213SS instruction.
167 ///
168 /// \param __A
169 ///    A 128-bit vector of [4 x float] containing the multiplicand in the low
170 ///    32 bits.
171 /// \param __B
172 ///    A 128-bit vector of [4 x float] containing the multiplier in the low
173 ///    32 bits.
174 /// \param __C
175 ///    A 128-bit vector of [4 x float] containing the subtrahend in the low
176 ///   32 bits.
177 /// \returns A 128-bit vector of [4 x float] containing the result in the low
178 ///    32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
179 static __inline__ __m128 __DEFAULT_FN_ATTRS128
180 _mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
181 {
182   return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
183 }
184 
185 /// Computes a scalar multiply-subtract of the double-precision values in
186 ///    the low 64 bits of 128-bit vectors of [2 x double].
187 /// \code
188 /// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
189 /// result[127:64] = __A[127:64]
190 /// \endcode
191 ///
192 /// \headerfile <immintrin.h>
193 ///
194 /// This intrinsic corresponds to the \c VFMSUB213SD instruction.
195 ///
196 /// \param __A
197 ///    A 128-bit vector of [2 x double] containing the multiplicand in the low
198 ///    64 bits.
199 /// \param __B
200 ///    A 128-bit vector of [2 x double] containing the multiplier in the low
201 ///    64 bits.
202 /// \param __C
203 ///    A 128-bit vector of [2 x double] containing the subtrahend in the low
204 ///    64 bits.
205 /// \returns A 128-bit vector of [2 x double] containing the result in the low
206 ///    64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
207 static __inline__ __m128d __DEFAULT_FN_ATTRS128
208 _mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
209 {
210   return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, -(__v2df)__C);
211 }
212 
213 /// Computes a negated multiply-add of 128-bit vectors of [4 x float].
214 ///    For each element, computes <c> -(__A * __B) + __C </c>.
215 ///
216 /// \headerfile <immintrin.h>
217 ///
218 /// This intrinsic corresponds to the \c VFNMADD213DPS instruction.
219 ///
220 /// \param __A
221 ///    A 128-bit vector of [4 x float] containing the multiplicand.
222 /// \param __B
223 ///    A 128-bit vector of [4 x float] containing the multiplier.
224 /// \param __C
225 ///    A 128-bit vector of [4 x float] containing the addend.
226 /// \returns A 128-bit [4 x float] vector containing the result.
227 static __inline__ __m128 __DEFAULT_FN_ATTRS128
228 _mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
229 {
230   return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
231 }
232 
233 /// Computes a negated multiply-add of 128-bit vectors of [2 x double].
234 ///    For each element, computes <c> -(__A * __B) + __C </c>.
235 ///
236 /// \headerfile <immintrin.h>
237 ///
238 /// This intrinsic corresponds to the \c VFNMADD213PD instruction.
239 ///
240 /// \param __A
241 ///    A 128-bit vector of [2 x double] containing the multiplicand.
242 /// \param __B
243 ///    A 128-bit vector of [2 x double] containing the multiplier.
244 /// \param __C
245 ///    A 128-bit vector of [2 x double] containing the addend.
246 /// \returns A 128-bit vector of [2 x double] containing the result.
247 static __inline__ __m128d __DEFAULT_FN_ATTRS128
248 _mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
249 {
250   return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
251 }
252 
253 /// Computes a scalar negated multiply-add of the single-precision values in
254 ///    the low 32 bits of 128-bit vectors of [4 x float].
255 /// \code
256 /// result[31:0] = -(__A[31:0] * __B[31:0]) + __C[31:0]
257 /// result[127:32] = __A[127:32]
258 /// \endcode
259 ///
260 /// \headerfile <immintrin.h>
261 ///
262 /// This intrinsic corresponds to the \c VFNMADD213SS instruction.
263 ///
264 /// \param __A
265 ///    A 128-bit vector of [4 x float] containing the multiplicand in the low
266 ///    32 bits.
267 /// \param __B
268 ///    A 128-bit vector of [4 x float] containing the multiplier in the low
269 ///    32 bits.
270 /// \param __C
271 ///    A 128-bit vector of [4 x float] containing the addend in the low
272 ///    32 bits.
273 /// \returns A 128-bit vector of [4 x float] containing the result in the low
274 ///    32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
275 static __inline__ __m128 __DEFAULT_FN_ATTRS128
276 _mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
277 {
278   return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C);
279 }
280 
281 /// Computes a scalar negated multiply-add of the double-precision values
282 ///    in the low 64 bits of 128-bit vectors of [2 x double].
283 /// \code
284 /// result[63:0] = -(__A[63:0] * __B[63:0]) + __C[63:0]
285 /// result[127:64] = __A[127:64]
286 /// \endcode
287 ///
288 /// \headerfile <immintrin.h>
289 ///
290 /// This intrinsic corresponds to the \c VFNMADD213SD instruction.
291 ///
292 /// \param __A
293 ///    A 128-bit vector of [2 x double] containing the multiplicand in the low
294 ///    64 bits.
295 /// \param __B
296 ///    A 128-bit vector of [2 x double] containing the multiplier in the low
297 ///    64 bits.
298 /// \param __C
299 ///    A 128-bit vector of [2 x double] containing the addend in the low
300 ///    64 bits.
301 /// \returns A 128-bit vector of [2 x double] containing the result in the low
302 ///    64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
303 static __inline__ __m128d __DEFAULT_FN_ATTRS128
304 _mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
305 {
306   return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, (__v2df)__C);
307 }
308 
309 /// Computes a negated multiply-subtract of 128-bit vectors of [4 x float].
310 ///    For each element, computes <c> -(__A * __B) - __C </c>.
311 ///
312 /// \headerfile <immintrin.h>
313 ///
314 /// This intrinsic corresponds to the \c VFNMSUB213PS instruction.
315 ///
316 /// \param __A
317 ///    A 128-bit vector of [4 x float] containing the multiplicand.
318 /// \param __B
319 ///    A 128-bit vector of [4 x float] containing the multiplier.
320 /// \param __C
321 ///    A 128-bit vector of [4 x float] containing the subtrahend.
322 /// \returns A 128-bit vector of [4 x float] containing the result.
323 static __inline__ __m128 __DEFAULT_FN_ATTRS128
324 _mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
325 {
326   return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
327 }
328 
329 /// Computes a negated multiply-subtract of 128-bit vectors of [2 x double].
330 ///    For each element, computes <c> -(__A * __B) - __C </c>.
331 ///
332 /// \headerfile <immintrin.h>
333 ///
334 /// This intrinsic corresponds to the \c VFNMSUB213PD instruction.
335 ///
336 /// \param __A
337 ///    A 128-bit vector of [2 x double] containing the multiplicand.
338 /// \param __B
339 ///    A 128-bit vector of [2 x double] containing the multiplier.
340 /// \param __C
341 ///    A 128-bit vector of [2 x double] containing the subtrahend.
342 /// \returns A 128-bit vector of [2 x double] containing the result.
343 static __inline__ __m128d __DEFAULT_FN_ATTRS128
344 _mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
345 {
346   return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
347 }
348 
349 /// Computes a scalar negated multiply-subtract of the single-precision
350 ///    values in the low 32 bits of 128-bit vectors of [4 x float].
351 /// \code
352 /// result[31:0] = -(__A[31:0] * __B[31:0]) - __C[31:0]
353 /// result[127:32] = __A[127:32]
354 /// \endcode
355 ///
356 /// \headerfile <immintrin.h>
357 ///
358 /// This intrinsic corresponds to the \c VFNMSUB213SS instruction.
359 ///
360 /// \param __A
361 ///    A 128-bit vector of [4 x float] containing the multiplicand in the low
362 ///    32 bits.
363 /// \param __B
364 ///    A 128-bit vector of [4 x float] containing the multiplier in the low
365 ///    32 bits.
366 /// \param __C
367 ///    A 128-bit vector of [4 x float] containing the subtrahend in the low
368 ///    32 bits.
369 /// \returns A 128-bit vector of [4 x float] containing the result in the low
370 ///    32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
371 static __inline__ __m128 __DEFAULT_FN_ATTRS128
372 _mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
373 {
374   return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C);
375 }
376 
377 /// Computes a scalar negated multiply-subtract of the double-precision
378 ///    values in the low 64 bits of 128-bit vectors of [2 x double].
379 /// \code
380 /// result[63:0] = -(__A[63:0] * __B[63:0]) - __C[63:0]
381 /// result[127:64] = __A[127:64]
382 /// \endcode
383 ///
384 /// \headerfile <immintrin.h>
385 ///
386 /// This intrinsic corresponds to the \c VFNMSUB213SD instruction.
387 ///
388 /// \param __A
389 ///    A 128-bit vector of [2 x double] containing the multiplicand in the low
390 ///    64 bits.
391 /// \param __B
392 ///    A 128-bit vector of [2 x double] containing the multiplier in the low
393 ///    64 bits.
394 /// \param __C
395 ///    A 128-bit vector of [2 x double] containing the subtrahend in the low
396 ///    64 bits.
397 /// \returns A 128-bit vector of [2 x double] containing the result in the low
398 ///    64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
399 static __inline__ __m128d __DEFAULT_FN_ATTRS128
400 _mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
401 {
402   return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, -(__v2df)__C);
403 }
404 
405 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
406 ///    [4 x float].
407 /// \code
408 /// result[31:0]  = (__A[31:0] * __B[31:0]) - __C[31:0]
409 /// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32]
410 /// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64]
411 /// result[127:96] = (__A[127:96] * __B[127:96]) + __C[127:96]
412 /// \endcode
413 ///
414 /// \headerfile <immintrin.h>
415 ///
416 /// This intrinsic corresponds to the \c VFMADDSUB213PS instruction.
417 ///
418 /// \param __A
419 ///    A 128-bit vector of [4 x float] containing the multiplicand.
420 /// \param __B
421 ///    A 128-bit vector of [4 x float] containing the multiplier.
422 /// \param __C
423 ///    A 128-bit vector of [4 x float] containing the addend/subtrahend.
424 /// \returns A 128-bit vector of [4 x float] containing the result.
425 static __inline__ __m128 __DEFAULT_FN_ATTRS128
426 _mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C)
427 {
428   return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
429 }
430 
431 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
432 ///    [2 x double].
433 /// \code
434 /// result[63:0]  = (__A[63:0] * __B[63:0]) - __C[63:0]
435 /// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64]
436 /// \endcode
437 ///
438 /// \headerfile <immintrin.h>
439 ///
440 /// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
441 ///
442 /// \param __A
443 ///    A 128-bit vector of [2 x double] containing the multiplicand.
444 /// \param __B
445 ///    A 128-bit vector of [2 x double] containing the multiplier.
446 /// \param __C
447 ///    A 128-bit vector of [2 x double] containing the addend/subtrahend.
448 /// \returns A 128-bit vector of [2 x double] containing the result.
449 static __inline__ __m128d __DEFAULT_FN_ATTRS128
450 _mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C)
451 {
452   return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
453 }
454 
455 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
456 ///    [4 x float].
457 /// \code
458 /// result[31:0]  = (__A[31:0] * __B[31:0]) + __C[31:0]
459 /// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32]
460 /// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64]
461 /// result[127:96 = (__A[127:96] * __B[127:96]) - __C[127:96]
462 /// \endcode
463 ///
464 /// \headerfile <immintrin.h>
465 ///
466 /// This intrinsic corresponds to the \c VFMSUBADD213PS instruction.
467 ///
468 /// \param __A
469 ///    A 128-bit vector of [4 x float] containing the multiplicand.
470 /// \param __B
471 ///    A 128-bit vector of [4 x float] containing the multiplier.
472 /// \param __C
473 ///    A 128-bit vector of [4 x float] containing the addend/subtrahend.
474 /// \returns A 128-bit vector of [4 x float] containing the result.
475 static __inline__ __m128 __DEFAULT_FN_ATTRS128
476 _mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C)
477 {
478   return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
479 }
480 
481 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
482 ///    [2 x double].
483 /// \code
484 /// result[63:0]  = (__A[63:0] * __B[63:0]) + __C[63:0]
485 /// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64]
486 /// \endcode
487 ///
488 /// \headerfile <immintrin.h>
489 ///
490 /// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
491 ///
492 /// \param __A
493 ///    A 128-bit vector of [2 x double] containing the multiplicand.
494 /// \param __B
495 ///    A 128-bit vector of [2 x double] containing the multiplier.
496 /// \param __C
497 ///    A 128-bit vector of [2 x double] containing the addend/subtrahend.
498 /// \returns A 128-bit vector of [2 x double] containing the result.
499 static __inline__ __m128d __DEFAULT_FN_ATTRS128
500 _mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
501 {
502   return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
503 }
504 
505 /// Computes a multiply-add of 256-bit vectors of [8 x float].
506 ///    For each element, computes <c> (__A * __B) + __C </c>.
507 ///
508 /// \headerfile <immintrin.h>
509 ///
510 /// This intrinsic corresponds to the \c VFMADD213PS instruction.
511 ///
512 /// \param __A
513 ///    A 256-bit vector of [8 x float] containing the multiplicand.
514 /// \param __B
515 ///    A 256-bit vector of [8 x float] containing the multiplier.
516 /// \param __C
517 ///    A 256-bit vector of [8 x float] containing the addend.
518 /// \returns A 256-bit vector of [8 x float] containing the result.
519 static __inline__ __m256 __DEFAULT_FN_ATTRS256
520 _mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
521 {
522   return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
523 }
524 
525 /// Computes a multiply-add of 256-bit vectors of [4 x double].
526 ///    For each element, computes <c> (__A * __B) + __C </c>.
527 ///
528 /// \headerfile <immintrin.h>
529 ///
530 /// This intrinsic corresponds to the \c VFMADD213PD instruction.
531 ///
532 /// \param __A
533 ///    A 256-bit vector of [4 x double] containing the multiplicand.
534 /// \param __B
535 ///    A 256-bit vector of [4 x double] containing the multiplier.
536 /// \param __C
537 ///    A 256-bit vector of [4 x double] containing the addend.
538 /// \returns A 256-bit vector of [4 x double] containing the result.
539 static __inline__ __m256d __DEFAULT_FN_ATTRS256
540 _mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
541 {
542   return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
543 }
544 
545 /// Computes a multiply-subtract of 256-bit vectors of [8 x float].
546 ///    For each element, computes <c> (__A * __B) - __C </c>.
547 ///
548 /// \headerfile <immintrin.h>
549 ///
550 /// This intrinsic corresponds to the \c VFMSUB213PS instruction.
551 ///
552 /// \param __A
553 ///    A 256-bit vector of [8 x float] containing the multiplicand.
554 /// \param __B
555 ///    A 256-bit vector of [8 x float] containing the multiplier.
556 /// \param __C
557 ///    A 256-bit vector of [8 x float] containing the subtrahend.
558 /// \returns A 256-bit vector of [8 x float] containing the result.
559 static __inline__ __m256 __DEFAULT_FN_ATTRS256
560 _mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
561 {
562   return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
563 }
564 
565 /// Computes a multiply-subtract of 256-bit vectors of [4 x double].
566 ///    For each element, computes <c> (__A * __B) - __C </c>.
567 ///
568 /// \headerfile <immintrin.h>
569 ///
570 /// This intrinsic corresponds to the \c VFMSUB213PD instruction.
571 ///
572 /// \param __A
573 ///    A 256-bit vector of [4 x double] containing the multiplicand.
574 /// \param __B
575 ///    A 256-bit vector of [4 x double] containing the multiplier.
576 /// \param __C
577 ///    A 256-bit vector of [4 x double] containing the subtrahend.
578 /// \returns A 256-bit vector of [4 x double] containing the result.
579 static __inline__ __m256d __DEFAULT_FN_ATTRS256
580 _mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
581 {
582   return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
583 }
584 
585 /// Computes a negated multiply-add of 256-bit vectors of [8 x float].
586 ///    For each element, computes <c> -(__A * __B) + __C </c>.
587 ///
588 /// \headerfile <immintrin.h>
589 ///
590 /// This intrinsic corresponds to the \c VFNMADD213PS instruction.
591 ///
592 /// \param __A
593 ///    A 256-bit vector of [8 x float] containing the multiplicand.
594 /// \param __B
595 ///    A 256-bit vector of [8 x float] containing the multiplier.
596 /// \param __C
597 ///    A 256-bit vector of [8 x float] containing the addend.
598 /// \returns A 256-bit vector of [8 x float] containing the result.
599 static __inline__ __m256 __DEFAULT_FN_ATTRS256
600 _mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
601 {
602   return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
603 }
604 
605 /// Computes a negated multiply-add of 256-bit vectors of [4 x double].
606 ///    For each element, computes <c> -(__A * __B) + __C </c>.
607 ///
608 /// \headerfile <immintrin.h>
609 ///
610 /// This intrinsic corresponds to the \c VFNMADD213PD instruction.
611 ///
612 /// \param __A
613 ///    A 256-bit vector of [4 x double] containing the multiplicand.
614 /// \param __B
615 ///    A 256-bit vector of [4 x double] containing the multiplier.
616 /// \param __C
617 ///    A 256-bit vector of [4 x double] containing the addend.
618 /// \returns A 256-bit vector of [4 x double] containing the result.
619 static __inline__ __m256d __DEFAULT_FN_ATTRS256
620 _mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
621 {
622   return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C);
623 }
624 
625 /// Computes a negated multiply-subtract of 256-bit vectors of [8 x float].
626 ///    For each element, computes <c> -(__A * __B) - __C </c>.
627 ///
628 /// \headerfile <immintrin.h>
629 ///
630 /// This intrinsic corresponds to the \c VFNMSUB213PS instruction.
631 ///
632 /// \param __A
633 ///    A 256-bit vector of [8 x float] containing the multiplicand.
634 /// \param __B
635 ///    A 256-bit vector of [8 x float] containing the multiplier.
636 /// \param __C
637 ///    A 256-bit vector of [8 x float] containing the subtrahend.
638 /// \returns A 256-bit vector of [8 x float] containing the result.
639 static __inline__ __m256 __DEFAULT_FN_ATTRS256
640 _mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
641 {
642   return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
643 }
644 
645 /// Computes a negated multiply-subtract of 256-bit vectors of [4 x double].
646 ///    For each element, computes <c> -(__A * __B) - __C </c>.
647 ///
648 /// \headerfile <immintrin.h>
649 ///
650 /// This intrinsic corresponds to the \c VFNMSUB213PD instruction.
651 ///
652 /// \param __A
653 ///    A 256-bit vector of [4 x double] containing the multiplicand.
654 /// \param __B
655 ///    A 256-bit vector of [4 x double] containing the multiplier.
656 /// \param __C
657 ///    A 256-bit vector of [4 x double] containing the subtrahend.
658 /// \returns A 256-bit vector of [4 x double] containing the result.
659 static __inline__ __m256d __DEFAULT_FN_ATTRS256
660 _mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
661 {
662   return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
663 }
664 
665 /// Computes a multiply with alternating add/subtract of 256-bit vectors of
666 ///    [8 x float].
667 /// \code
668 /// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
669 /// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32]
670 /// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64]
671 /// result[127:96] = (__A[127:96] * __B[127:96]) + __C[127:96]
672 /// result[159:128] = (__A[159:128] * __B[159:128]) - __C[159:128]
673 /// result[191:160] = (__A[191:160] * __B[191:160]) + __C[191:160]
674 /// result[223:192] = (__A[223:192] * __B[223:192]) - __C[223:192]
675 /// result[255:224] = (__A[255:224] * __B[255:224]) + __C[255:224]
676 /// \endcode
677 ///
678 /// \headerfile <immintrin.h>
679 ///
680 /// This intrinsic corresponds to the \c VFMADDSUB213PS instruction.
681 ///
682 /// \param __A
683 ///    A 256-bit vector of [8 x float] containing the multiplicand.
684 /// \param __B
685 ///    A 256-bit vector of [8 x float] containing the multiplier.
686 /// \param __C
687 ///    A 256-bit vector of [8 x float] containing the addend/subtrahend.
688 /// \returns A 256-bit vector of [8 x float] containing the result.
689 static __inline__ __m256 __DEFAULT_FN_ATTRS256
690 _mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C)
691 {
692   return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
693 }
694 
695 /// Computes a multiply with alternating add/subtract of 256-bit vectors of
696 ///    [4 x double].
697 /// \code
698 /// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
699 /// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64]
700 /// result[191:128] = (__A[191:128] * __B[191:128]) - __C[191:128]
701 /// result[255:192] = (__A[255:192] * __B[255:192]) + __C[255:192]
702 /// \endcode
703 ///
704 /// \headerfile <immintrin.h>
705 ///
706 /// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
707 ///
708 /// \param __A
709 ///    A 256-bit vector of [4 x double] containing the multiplicand.
710 /// \param __B
711 ///    A 256-bit vector of [4 x double] containing the multiplier.
712 /// \param __C
713 ///    A 256-bit vector of [4 x double] containing the addend/subtrahend.
714 /// \returns A 256-bit vector of [4 x double] containing the result.
715 static __inline__ __m256d __DEFAULT_FN_ATTRS256
716 _mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C)
717 {
718   return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
719 }
720 
721 /// Computes a vector multiply with alternating add/subtract of 256-bit
722 ///    vectors of [8 x float].
723 /// \code
724 /// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
725 /// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32]
726 /// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64]
727 /// result[127:96] = (__A[127:96] * __B[127:96]) - __C[127:96]
728 /// result[159:128] = (__A[159:128] * __B[159:128]) + __C[159:128]
729 /// result[191:160] = (__A[191:160] * __B[191:160]) - __C[191:160]
730 /// result[223:192] = (__A[223:192] * __B[223:192]) + __C[223:192]
731 /// result[255:224] = (__A[255:224] * __B[255:224]) - __C[255:224]
732 /// \endcode
733 ///
734 /// \headerfile <immintrin.h>
735 ///
736 /// This intrinsic corresponds to the \c VFMSUBADD213PS instruction.
737 ///
738 /// \param __A
739 ///    A 256-bit vector of [8 x float] containing the multiplicand.
740 /// \param __B
741 ///    A 256-bit vector of [8 x float] containing the multiplier.
742 /// \param __C
743 ///    A 256-bit vector of [8 x float] containing the addend/subtrahend.
744 /// \returns A 256-bit vector of [8 x float] containing the result.
745 static __inline__ __m256 __DEFAULT_FN_ATTRS256
746 _mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C)
747 {
748   return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
749 }
750 
751 /// Computes a vector multiply with alternating add/subtract of 256-bit
752 ///    vectors of [4 x double].
753 /// \code
754 /// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
755 /// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64]
756 /// result[191:128] = (__A[191:128] * __B[191:128]) + __C[191:128]
757 /// result[255:192] = (__A[255:192] * __B[255:192]) - __C[255:192]
758 /// \endcode
759 ///
760 /// \headerfile <immintrin.h>
761 ///
762 /// This intrinsic corresponds to the \c VFMSUBADD213PD instruction.
763 ///
764 /// \param __A
765 ///    A 256-bit vector of [4 x double] containing the multiplicand.
766 /// \param __B
767 ///    A 256-bit vector of [4 x double] containing the multiplier.
768 /// \param __C
769 ///    A 256-bit vector of [4 x double] containing the addend/subtrahend.
770 /// \returns A 256-bit vector of [4 x double] containing the result.
771 static __inline__ __m256d __DEFAULT_FN_ATTRS256
772 _mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C)
773 {
774   return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
775 }
776 
777 #undef __DEFAULT_FN_ATTRS128
778 #undef __DEFAULT_FN_ATTRS256
779 
780 #endif /* __FMAINTRIN_H */
781