xref: /freebsd/contrib/llvm-project/clang/lib/Headers/fmaintrin.h (revision 06c3fb2749bda94cb5201f81ffdb8fa6c3161b2e)
10b57cec5SDimitry Andric /*===---- fmaintrin.h - FMA intrinsics -------------------------------------===
20b57cec5SDimitry Andric  *
30b57cec5SDimitry Andric  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric  * See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric  *
70b57cec5SDimitry Andric  *===-----------------------------------------------------------------------===
80b57cec5SDimitry Andric  */
90b57cec5SDimitry Andric 
100b57cec5SDimitry Andric #ifndef __IMMINTRIN_H
110b57cec5SDimitry Andric #error "Never use <fmaintrin.h> directly; include <immintrin.h> instead."
120b57cec5SDimitry Andric #endif
130b57cec5SDimitry Andric 
140b57cec5SDimitry Andric #ifndef __FMAINTRIN_H
150b57cec5SDimitry Andric #define __FMAINTRIN_H
160b57cec5SDimitry Andric 
/* Attribute sets shared by the intrinsics in this file. Each one requests
 * forced inlining, suppresses debug info, enables the "fma" target feature,
 * and declares the minimum vector width (in bits) named by the macro suffix.
 */
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("fma"),            \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("fma"),            \
                 __min_vector_width__(256)))
200b57cec5SDimitry Andric 
21*06c3fb27SDimitry Andric /// Computes a multiply-add of 128-bit vectors of [4 x float].
22*06c3fb27SDimitry Andric ///    For each element, computes <c> (__A * __B) + __C </c>.
23*06c3fb27SDimitry Andric ///
24*06c3fb27SDimitry Andric /// \headerfile <immintrin.h>
25*06c3fb27SDimitry Andric ///
26*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMADD213PS instruction.
27*06c3fb27SDimitry Andric ///
28*06c3fb27SDimitry Andric /// \param __A
29*06c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the multiplicand.
30*06c3fb27SDimitry Andric /// \param __B
31*06c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the multiplier.
32*06c3fb27SDimitry Andric /// \param __C
33*06c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the addend.
34*06c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x float] containing the result.
350b57cec5SDimitry Andric static __inline__ __m128 __DEFAULT_FN_ATTRS128
360b57cec5SDimitry Andric _mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
370b57cec5SDimitry Andric {
380b57cec5SDimitry Andric   return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
390b57cec5SDimitry Andric }
400b57cec5SDimitry Andric 
41*06c3fb27SDimitry Andric /// Computes a multiply-add of 128-bit vectors of [2 x double].
42*06c3fb27SDimitry Andric ///    For each element, computes <c> (__A * __B) + __C </c>.
43*06c3fb27SDimitry Andric ///
44*06c3fb27SDimitry Andric /// \headerfile <immintrin.h>
45*06c3fb27SDimitry Andric ///
46*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMADD213PD instruction.
47*06c3fb27SDimitry Andric ///
48*06c3fb27SDimitry Andric /// \param __A
49*06c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the multiplicand.
50*06c3fb27SDimitry Andric /// \param __B
51*06c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the multiplier.
52*06c3fb27SDimitry Andric /// \param __C
53*06c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the addend.
54*06c3fb27SDimitry Andric /// \returns A 128-bit [2 x double] vector containing the result.
550b57cec5SDimitry Andric static __inline__ __m128d __DEFAULT_FN_ATTRS128
560b57cec5SDimitry Andric _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
570b57cec5SDimitry Andric {
580b57cec5SDimitry Andric   return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
590b57cec5SDimitry Andric }
600b57cec5SDimitry Andric 
61*06c3fb27SDimitry Andric /// Computes a scalar multiply-add of the single-precision values in the
62*06c3fb27SDimitry Andric ///    low 32 bits of 128-bit vectors of [4 x float].
63*06c3fb27SDimitry Andric /// \code
64*06c3fb27SDimitry Andric /// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
65*06c3fb27SDimitry Andric /// result[127:32] = __A[127:32]
66*06c3fb27SDimitry Andric /// \endcode
67*06c3fb27SDimitry Andric ///
68*06c3fb27SDimitry Andric /// \headerfile <immintrin.h>
69*06c3fb27SDimitry Andric ///
70*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMADD213SS instruction.
71*06c3fb27SDimitry Andric ///
72*06c3fb27SDimitry Andric /// \param __A
73*06c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the multiplicand in the low
74*06c3fb27SDimitry Andric ///    32 bits.
75*06c3fb27SDimitry Andric /// \param __B
76*06c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the multiplier in the low
77*06c3fb27SDimitry Andric ///    32 bits.
78*06c3fb27SDimitry Andric /// \param __C
79*06c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the addend in the low
80*06c3fb27SDimitry Andric ///    32 bits.
81*06c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x float] containing the result in the low
82*06c3fb27SDimitry Andric ///    32 bits and a copy of \a __A[127:32] in the upper 96 bits.
830b57cec5SDimitry Andric static __inline__ __m128 __DEFAULT_FN_ATTRS128
840b57cec5SDimitry Andric _mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
850b57cec5SDimitry Andric {
860b57cec5SDimitry Andric   return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
870b57cec5SDimitry Andric }
880b57cec5SDimitry Andric 
89*06c3fb27SDimitry Andric /// Computes a scalar multiply-add of the double-precision values in the
90*06c3fb27SDimitry Andric ///    low 64 bits of 128-bit vectors of [2 x double].
91*06c3fb27SDimitry Andric /// \code
92*06c3fb27SDimitry Andric /// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
93*06c3fb27SDimitry Andric /// result[127:64] = __A[127:64]
94*06c3fb27SDimitry Andric /// \endcode
95*06c3fb27SDimitry Andric ///
96*06c3fb27SDimitry Andric /// \headerfile <immintrin.h>
97*06c3fb27SDimitry Andric ///
98*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMADD213SD instruction.
99*06c3fb27SDimitry Andric ///
100*06c3fb27SDimitry Andric /// \param __A
101*06c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the multiplicand in the low
102*06c3fb27SDimitry Andric ///    64 bits.
103*06c3fb27SDimitry Andric /// \param __B
104*06c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the multiplier in the low
105*06c3fb27SDimitry Andric ///    64 bits.
106*06c3fb27SDimitry Andric /// \param __C
107*06c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the addend in the low
108*06c3fb27SDimitry Andric ///    64 bits.
109*06c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x double] containing the result in the low
110*06c3fb27SDimitry Andric ///    64 bits and a copy of \a __A[127:64] in the upper 64 bits.
1110b57cec5SDimitry Andric static __inline__ __m128d __DEFAULT_FN_ATTRS128
1120b57cec5SDimitry Andric _mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
1130b57cec5SDimitry Andric {
1140b57cec5SDimitry Andric   return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, (__v2df)__C);
1150b57cec5SDimitry Andric }
1160b57cec5SDimitry Andric 
117*06c3fb27SDimitry Andric /// Computes a multiply-subtract of 128-bit vectors of [4 x float].
118*06c3fb27SDimitry Andric ///    For each element, computes <c> (__A * __B) - __C </c>.
119*06c3fb27SDimitry Andric ///
120*06c3fb27SDimitry Andric /// \headerfile <immintrin.h>
121*06c3fb27SDimitry Andric ///
122*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMSUB213PS instruction.
123*06c3fb27SDimitry Andric ///
124*06c3fb27SDimitry Andric /// \param __A
125*06c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the multiplicand.
126*06c3fb27SDimitry Andric /// \param __B
127*06c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the multiplier.
128*06c3fb27SDimitry Andric /// \param __C
129*06c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the subtrahend.
130*06c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x float] containing the result.
1310b57cec5SDimitry Andric static __inline__ __m128 __DEFAULT_FN_ATTRS128
1320b57cec5SDimitry Andric _mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
1330b57cec5SDimitry Andric {
1340b57cec5SDimitry Andric   return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
1350b57cec5SDimitry Andric }
1360b57cec5SDimitry Andric 
137*06c3fb27SDimitry Andric /// Computes a multiply-subtract of 128-bit vectors of [2 x double].
138*06c3fb27SDimitry Andric ///    For each element, computes <c> (__A * __B) - __C </c>.
139*06c3fb27SDimitry Andric ///
140*06c3fb27SDimitry Andric /// \headerfile <immintrin.h>
141*06c3fb27SDimitry Andric ///
142*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMSUB213PD instruction.
143*06c3fb27SDimitry Andric ///
144*06c3fb27SDimitry Andric /// \param __A
145*06c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the multiplicand.
146*06c3fb27SDimitry Andric /// \param __B
147*06c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the multiplier.
148*06c3fb27SDimitry Andric /// \param __C
149*06c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the addend.
150*06c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x double] containing the result.
1510b57cec5SDimitry Andric static __inline__ __m128d __DEFAULT_FN_ATTRS128
1520b57cec5SDimitry Andric _mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
1530b57cec5SDimitry Andric {
1540b57cec5SDimitry Andric   return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
1550b57cec5SDimitry Andric }
1560b57cec5SDimitry Andric 
157*06c3fb27SDimitry Andric /// Computes a scalar multiply-subtract of the single-precision values in
158*06c3fb27SDimitry Andric ///    the low 32 bits of 128-bit vectors of [4 x float].
159*06c3fb27SDimitry Andric /// \code
160*06c3fb27SDimitry Andric /// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
161*06c3fb27SDimitry Andric /// result[127:32] = __A[127:32]
162*06c3fb27SDimitry Andric /// \endcode
163*06c3fb27SDimitry Andric ///
164*06c3fb27SDimitry Andric /// \headerfile <immintrin.h>
165*06c3fb27SDimitry Andric ///
166*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMSUB213SS instruction.
167*06c3fb27SDimitry Andric ///
168*06c3fb27SDimitry Andric /// \param __A
169*06c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the multiplicand in the low
170*06c3fb27SDimitry Andric ///    32 bits.
171*06c3fb27SDimitry Andric /// \param __B
172*06c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the multiplier in the low
173*06c3fb27SDimitry Andric ///    32 bits.
174*06c3fb27SDimitry Andric /// \param __C
175*06c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the subtrahend in the low
176*06c3fb27SDimitry Andric ///   32 bits.
177*06c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x float] containing the result in the low
178*06c3fb27SDimitry Andric ///    32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
1790b57cec5SDimitry Andric static __inline__ __m128 __DEFAULT_FN_ATTRS128
1800b57cec5SDimitry Andric _mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
1810b57cec5SDimitry Andric {
1820b57cec5SDimitry Andric   return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
1830b57cec5SDimitry Andric }
1840b57cec5SDimitry Andric 
185*06c3fb27SDimitry Andric /// Computes a scalar multiply-subtract of the double-precision values in
186*06c3fb27SDimitry Andric ///    the low 64 bits of 128-bit vectors of [2 x double].
187*06c3fb27SDimitry Andric /// \code
188*06c3fb27SDimitry Andric /// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
189*06c3fb27SDimitry Andric /// result[127:64] = __A[127:64]
190*06c3fb27SDimitry Andric /// \endcode
191*06c3fb27SDimitry Andric ///
192*06c3fb27SDimitry Andric /// \headerfile <immintrin.h>
193*06c3fb27SDimitry Andric ///
194*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMSUB213SD instruction.
195*06c3fb27SDimitry Andric ///
196*06c3fb27SDimitry Andric /// \param __A
197*06c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the multiplicand in the low
198*06c3fb27SDimitry Andric ///    64 bits.
199*06c3fb27SDimitry Andric /// \param __B
200*06c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the multiplier in the low
201*06c3fb27SDimitry Andric ///    64 bits.
202*06c3fb27SDimitry Andric /// \param __C
203*06c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the subtrahend in the low
204*06c3fb27SDimitry Andric ///    64 bits.
205*06c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x double] containing the result in the low
206*06c3fb27SDimitry Andric ///    64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
2070b57cec5SDimitry Andric static __inline__ __m128d __DEFAULT_FN_ATTRS128
2080b57cec5SDimitry Andric _mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
2090b57cec5SDimitry Andric {
2100b57cec5SDimitry Andric   return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, -(__v2df)__C);
2110b57cec5SDimitry Andric }
2120b57cec5SDimitry Andric 
213*06c3fb27SDimitry Andric /// Computes a negated multiply-add of 128-bit vectors of [4 x float].
214*06c3fb27SDimitry Andric ///    For each element, computes <c> -(__A * __B) + __C </c>.
215*06c3fb27SDimitry Andric ///
216*06c3fb27SDimitry Andric /// \headerfile <immintrin.h>
217*06c3fb27SDimitry Andric ///
218*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFNMADD213DPS instruction.
219*06c3fb27SDimitry Andric ///
220*06c3fb27SDimitry Andric /// \param __A
221*06c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the multiplicand.
222*06c3fb27SDimitry Andric /// \param __B
223*06c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the multiplier.
224*06c3fb27SDimitry Andric /// \param __C
225*06c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the addend.
226*06c3fb27SDimitry Andric /// \returns A 128-bit [4 x float] vector containing the result.
2270b57cec5SDimitry Andric static __inline__ __m128 __DEFAULT_FN_ATTRS128
2280b57cec5SDimitry Andric _mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
2290b57cec5SDimitry Andric {
2300b57cec5SDimitry Andric   return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
2310b57cec5SDimitry Andric }
2320b57cec5SDimitry Andric 
233*06c3fb27SDimitry Andric /// Computes a negated multiply-add of 128-bit vectors of [2 x double].
234*06c3fb27SDimitry Andric ///    For each element, computes <c> -(__A * __B) + __C </c>.
235*06c3fb27SDimitry Andric ///
236*06c3fb27SDimitry Andric /// \headerfile <immintrin.h>
237*06c3fb27SDimitry Andric ///
238*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFNMADD213PD instruction.
239*06c3fb27SDimitry Andric ///
240*06c3fb27SDimitry Andric /// \param __A
241*06c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the multiplicand.
242*06c3fb27SDimitry Andric /// \param __B
243*06c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the multiplier.
244*06c3fb27SDimitry Andric /// \param __C
245*06c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the addend.
246*06c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x double] containing the result.
2470b57cec5SDimitry Andric static __inline__ __m128d __DEFAULT_FN_ATTRS128
2480b57cec5SDimitry Andric _mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
2490b57cec5SDimitry Andric {
2500b57cec5SDimitry Andric   return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
2510b57cec5SDimitry Andric }
2520b57cec5SDimitry Andric 
253*06c3fb27SDimitry Andric /// Computes a scalar negated multiply-add of the single-precision values in
254*06c3fb27SDimitry Andric ///    the low 32 bits of 128-bit vectors of [4 x float].
255*06c3fb27SDimitry Andric /// \code
256*06c3fb27SDimitry Andric /// result[31:0] = -(__A[31:0] * __B[31:0]) + __C[31:0]
257*06c3fb27SDimitry Andric /// result[127:32] = __A[127:32]
258*06c3fb27SDimitry Andric /// \endcode
259*06c3fb27SDimitry Andric ///
260*06c3fb27SDimitry Andric /// \headerfile <immintrin.h>
261*06c3fb27SDimitry Andric ///
262*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFNMADD213SS instruction.
263*06c3fb27SDimitry Andric ///
264*06c3fb27SDimitry Andric /// \param __A
265*06c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the multiplicand in the low
266*06c3fb27SDimitry Andric ///    32 bits.
267*06c3fb27SDimitry Andric /// \param __B
268*06c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the multiplier in the low
269*06c3fb27SDimitry Andric ///    32 bits.
270*06c3fb27SDimitry Andric /// \param __C
271*06c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the addend in the low
272*06c3fb27SDimitry Andric ///    32 bits.
273*06c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x float] containing the result in the low
274*06c3fb27SDimitry Andric ///    32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
2750b57cec5SDimitry Andric static __inline__ __m128 __DEFAULT_FN_ATTRS128
2760b57cec5SDimitry Andric _mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
2770b57cec5SDimitry Andric {
2780b57cec5SDimitry Andric   return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C);
2790b57cec5SDimitry Andric }
2800b57cec5SDimitry Andric 
281*06c3fb27SDimitry Andric /// Computes a scalar negated multiply-add of the double-precision values
282*06c3fb27SDimitry Andric ///    in the low 64 bits of 128-bit vectors of [2 x double].
283*06c3fb27SDimitry Andric /// \code
284*06c3fb27SDimitry Andric /// result[63:0] = -(__A[63:0] * __B[63:0]) + __C[63:0]
285*06c3fb27SDimitry Andric /// result[127:64] = __A[127:64]
286*06c3fb27SDimitry Andric /// \endcode
287*06c3fb27SDimitry Andric ///
288*06c3fb27SDimitry Andric /// \headerfile <immintrin.h>
289*06c3fb27SDimitry Andric ///
290*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFNMADD213SD instruction.
291*06c3fb27SDimitry Andric ///
292*06c3fb27SDimitry Andric /// \param __A
293*06c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the multiplicand in the low
294*06c3fb27SDimitry Andric ///    64 bits.
295*06c3fb27SDimitry Andric /// \param __B
296*06c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the multiplier in the low
297*06c3fb27SDimitry Andric ///    64 bits.
298*06c3fb27SDimitry Andric /// \param __C
299*06c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the addend in the low
300*06c3fb27SDimitry Andric ///    64 bits.
301*06c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x double] containing the result in the low
302*06c3fb27SDimitry Andric ///    64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
3030b57cec5SDimitry Andric static __inline__ __m128d __DEFAULT_FN_ATTRS128
3040b57cec5SDimitry Andric _mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
3050b57cec5SDimitry Andric {
3060b57cec5SDimitry Andric   return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, (__v2df)__C);
3070b57cec5SDimitry Andric }
3080b57cec5SDimitry Andric 
309*06c3fb27SDimitry Andric /// Computes a negated multiply-subtract of 128-bit vectors of [4 x float].
310*06c3fb27SDimitry Andric ///    For each element, computes <c> -(__A * __B) - __C </c>.
311*06c3fb27SDimitry Andric ///
312*06c3fb27SDimitry Andric /// \headerfile <immintrin.h>
313*06c3fb27SDimitry Andric ///
314*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFNMSUB213PS instruction.
315*06c3fb27SDimitry Andric ///
316*06c3fb27SDimitry Andric /// \param __A
317*06c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the multiplicand.
318*06c3fb27SDimitry Andric /// \param __B
319*06c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the multiplier.
320*06c3fb27SDimitry Andric /// \param __C
321*06c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the subtrahend.
322*06c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x float] containing the result.
3230b57cec5SDimitry Andric static __inline__ __m128 __DEFAULT_FN_ATTRS128
3240b57cec5SDimitry Andric _mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
3250b57cec5SDimitry Andric {
3260b57cec5SDimitry Andric   return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
3270b57cec5SDimitry Andric }
3280b57cec5SDimitry Andric 
329*06c3fb27SDimitry Andric /// Computes a negated multiply-subtract of 128-bit vectors of [2 x double].
330*06c3fb27SDimitry Andric ///    For each element, computes <c> -(__A * __B) - __C </c>.
331*06c3fb27SDimitry Andric ///
332*06c3fb27SDimitry Andric /// \headerfile <immintrin.h>
333*06c3fb27SDimitry Andric ///
334*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFNMSUB213PD instruction.
335*06c3fb27SDimitry Andric ///
336*06c3fb27SDimitry Andric /// \param __A
337*06c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the multiplicand.
338*06c3fb27SDimitry Andric /// \param __B
339*06c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the multiplier.
340*06c3fb27SDimitry Andric /// \param __C
341*06c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the subtrahend.
342*06c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x double] containing the result.
3430b57cec5SDimitry Andric static __inline__ __m128d __DEFAULT_FN_ATTRS128
3440b57cec5SDimitry Andric _mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
3450b57cec5SDimitry Andric {
3460b57cec5SDimitry Andric   return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
3470b57cec5SDimitry Andric }
3480b57cec5SDimitry Andric 
349*06c3fb27SDimitry Andric /// Computes a scalar negated multiply-subtract of the single-precision
350*06c3fb27SDimitry Andric ///    values in the low 32 bits of 128-bit vectors of [4 x float].
351*06c3fb27SDimitry Andric /// \code
352*06c3fb27SDimitry Andric /// result[31:0] = -(__A[31:0] * __B[31:0]) - __C[31:0]
353*06c3fb27SDimitry Andric /// result[127:32] = __A[127:32]
354*06c3fb27SDimitry Andric /// \endcode
355*06c3fb27SDimitry Andric ///
356*06c3fb27SDimitry Andric /// \headerfile <immintrin.h>
357*06c3fb27SDimitry Andric ///
358*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFNMSUB213SS instruction.
359*06c3fb27SDimitry Andric ///
360*06c3fb27SDimitry Andric /// \param __A
361*06c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the multiplicand in the low
362*06c3fb27SDimitry Andric ///    32 bits.
363*06c3fb27SDimitry Andric /// \param __B
364*06c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the multiplier in the low
365*06c3fb27SDimitry Andric ///    32 bits.
366*06c3fb27SDimitry Andric /// \param __C
367*06c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the subtrahend in the low
368*06c3fb27SDimitry Andric ///    32 bits.
369*06c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x float] containing the result in the low
370*06c3fb27SDimitry Andric ///    32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
3710b57cec5SDimitry Andric static __inline__ __m128 __DEFAULT_FN_ATTRS128
3720b57cec5SDimitry Andric _mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
3730b57cec5SDimitry Andric {
3740b57cec5SDimitry Andric   return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C);
3750b57cec5SDimitry Andric }
3760b57cec5SDimitry Andric 
377*06c3fb27SDimitry Andric /// Computes a scalar negated multiply-subtract of the double-precision
378*06c3fb27SDimitry Andric ///    values in the low 64 bits of 128-bit vectors of [2 x double].
379*06c3fb27SDimitry Andric /// \code
380*06c3fb27SDimitry Andric /// result[63:0] = -(__A[63:0] * __B[63:0]) - __C[63:0]
381*06c3fb27SDimitry Andric /// result[127:64] = __A[127:64]
382*06c3fb27SDimitry Andric /// \endcode
383*06c3fb27SDimitry Andric ///
384*06c3fb27SDimitry Andric /// \headerfile <immintrin.h>
385*06c3fb27SDimitry Andric ///
386*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFNMSUB213SD instruction.
387*06c3fb27SDimitry Andric ///
388*06c3fb27SDimitry Andric /// \param __A
389*06c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the multiplicand in the low
390*06c3fb27SDimitry Andric ///    64 bits.
391*06c3fb27SDimitry Andric /// \param __B
392*06c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the multiplier in the low
393*06c3fb27SDimitry Andric ///    64 bits.
394*06c3fb27SDimitry Andric /// \param __C
395*06c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the subtrahend in the low
396*06c3fb27SDimitry Andric ///    64 bits.
397*06c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x double] containing the result in the low
398*06c3fb27SDimitry Andric ///    64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
3990b57cec5SDimitry Andric static __inline__ __m128d __DEFAULT_FN_ATTRS128
4000b57cec5SDimitry Andric _mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
4010b57cec5SDimitry Andric {
4020b57cec5SDimitry Andric   return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, -(__v2df)__C);
4030b57cec5SDimitry Andric }
4040b57cec5SDimitry Andric 
405*06c3fb27SDimitry Andric /// Computes a multiply with alternating add/subtract of 128-bit vectors of
406*06c3fb27SDimitry Andric ///    [4 x float].
407*06c3fb27SDimitry Andric /// \code
408*06c3fb27SDimitry Andric /// result[31:0]  = (__A[31:0] * __B[31:0]) - __C[31:0]
409*06c3fb27SDimitry Andric /// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32]
410*06c3fb27SDimitry Andric /// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64]
411*06c3fb27SDimitry Andric /// result[127:96] = (__A[127:96] * __B[127:96]) + __C[127:96]
412*06c3fb27SDimitry Andric /// \endcode
413*06c3fb27SDimitry Andric ///
414*06c3fb27SDimitry Andric /// \headerfile <immintrin.h>
415*06c3fb27SDimitry Andric ///
416*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMADDSUB213PS instruction.
417*06c3fb27SDimitry Andric ///
418*06c3fb27SDimitry Andric /// \param __A
419*06c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the multiplicand.
420*06c3fb27SDimitry Andric /// \param __B
421*06c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the multiplier.
422*06c3fb27SDimitry Andric /// \param __C
423*06c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the addend/subtrahend.
424*06c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x float] containing the result.
4250b57cec5SDimitry Andric static __inline__ __m128 __DEFAULT_FN_ATTRS128
4260b57cec5SDimitry Andric _mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C)
4270b57cec5SDimitry Andric {
4280b57cec5SDimitry Andric   return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
4290b57cec5SDimitry Andric }
4300b57cec5SDimitry Andric 
431*06c3fb27SDimitry Andric /// Computes a multiply with alternating add/subtract of 128-bit vectors of
432*06c3fb27SDimitry Andric ///    [2 x double].
433*06c3fb27SDimitry Andric /// \code
434*06c3fb27SDimitry Andric /// result[63:0]  = (__A[63:0] * __B[63:0]) - __C[63:0]
435*06c3fb27SDimitry Andric /// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64]
436*06c3fb27SDimitry Andric /// \endcode
437*06c3fb27SDimitry Andric ///
438*06c3fb27SDimitry Andric /// \headerfile <immintrin.h>
439*06c3fb27SDimitry Andric ///
440*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
441*06c3fb27SDimitry Andric ///
442*06c3fb27SDimitry Andric /// \param __A
443*06c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the multiplicand.
444*06c3fb27SDimitry Andric /// \param __B
445*06c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the multiplier.
446*06c3fb27SDimitry Andric /// \param __C
447*06c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the addend/subtrahend.
448*06c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x double] containing the result.
4490b57cec5SDimitry Andric static __inline__ __m128d __DEFAULT_FN_ATTRS128
4500b57cec5SDimitry Andric _mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C)
4510b57cec5SDimitry Andric {
4520b57cec5SDimitry Andric   return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
4530b57cec5SDimitry Andric }
4540b57cec5SDimitry Andric 
455*06c3fb27SDimitry Andric /// Computes a multiply with alternating add/subtract of 128-bit vectors of
456*06c3fb27SDimitry Andric ///    [4 x float].
457*06c3fb27SDimitry Andric /// \code
458*06c3fb27SDimitry Andric /// result[31:0]  = (__A[31:0] * __B[31:0]) + __C[31:0]
459*06c3fb27SDimitry Andric /// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32]
460*06c3fb27SDimitry Andric /// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64]
461*06c3fb27SDimitry Andric /// result[127:96 = (__A[127:96] * __B[127:96]) - __C[127:96]
462*06c3fb27SDimitry Andric /// \endcode
463*06c3fb27SDimitry Andric ///
464*06c3fb27SDimitry Andric /// \headerfile <immintrin.h>
465*06c3fb27SDimitry Andric ///
466*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMSUBADD213PS instruction.
467*06c3fb27SDimitry Andric ///
468*06c3fb27SDimitry Andric /// \param __A
469*06c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the multiplicand.
470*06c3fb27SDimitry Andric /// \param __B
471*06c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the multiplier.
472*06c3fb27SDimitry Andric /// \param __C
473*06c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the addend/subtrahend.
474*06c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x float] containing the result.
4750b57cec5SDimitry Andric static __inline__ __m128 __DEFAULT_FN_ATTRS128
4760b57cec5SDimitry Andric _mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C)
4770b57cec5SDimitry Andric {
4780b57cec5SDimitry Andric   return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
4790b57cec5SDimitry Andric }
4800b57cec5SDimitry Andric 
481*06c3fb27SDimitry Andric /// Computes a multiply with alternating add/subtract of 128-bit vectors of
482*06c3fb27SDimitry Andric ///    [2 x double].
483*06c3fb27SDimitry Andric /// \code
484*06c3fb27SDimitry Andric /// result[63:0]  = (__A[63:0] * __B[63:0]) + __C[63:0]
485*06c3fb27SDimitry Andric /// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64]
486*06c3fb27SDimitry Andric /// \endcode
487*06c3fb27SDimitry Andric ///
488*06c3fb27SDimitry Andric /// \headerfile <immintrin.h>
489*06c3fb27SDimitry Andric ///
490*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
491*06c3fb27SDimitry Andric ///
492*06c3fb27SDimitry Andric /// \param __A
493*06c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the multiplicand.
494*06c3fb27SDimitry Andric /// \param __B
495*06c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the multiplier.
496*06c3fb27SDimitry Andric /// \param __C
497*06c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the addend/subtrahend.
498*06c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x double] containing the result.
4990b57cec5SDimitry Andric static __inline__ __m128d __DEFAULT_FN_ATTRS128
5000b57cec5SDimitry Andric _mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
5010b57cec5SDimitry Andric {
5020b57cec5SDimitry Andric   return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
5030b57cec5SDimitry Andric }
5040b57cec5SDimitry Andric 
505*06c3fb27SDimitry Andric /// Computes a multiply-add of 256-bit vectors of [8 x float].
506*06c3fb27SDimitry Andric ///    For each element, computes <c> (__A * __B) + __C </c>.
507*06c3fb27SDimitry Andric ///
508*06c3fb27SDimitry Andric /// \headerfile <immintrin.h>
509*06c3fb27SDimitry Andric ///
510*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMADD213PS instruction.
511*06c3fb27SDimitry Andric ///
512*06c3fb27SDimitry Andric /// \param __A
513*06c3fb27SDimitry Andric ///    A 256-bit vector of [8 x float] containing the multiplicand.
514*06c3fb27SDimitry Andric /// \param __B
515*06c3fb27SDimitry Andric ///    A 256-bit vector of [8 x float] containing the multiplier.
516*06c3fb27SDimitry Andric /// \param __C
517*06c3fb27SDimitry Andric ///    A 256-bit vector of [8 x float] containing the addend.
518*06c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x float] containing the result.
5190b57cec5SDimitry Andric static __inline__ __m256 __DEFAULT_FN_ATTRS256
5200b57cec5SDimitry Andric _mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
5210b57cec5SDimitry Andric {
5220b57cec5SDimitry Andric   return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
5230b57cec5SDimitry Andric }
5240b57cec5SDimitry Andric 
525*06c3fb27SDimitry Andric /// Computes a multiply-add of 256-bit vectors of [4 x double].
526*06c3fb27SDimitry Andric ///    For each element, computes <c> (__A * __B) + __C </c>.
527*06c3fb27SDimitry Andric ///
528*06c3fb27SDimitry Andric /// \headerfile <immintrin.h>
529*06c3fb27SDimitry Andric ///
530*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMADD213PD instruction.
531*06c3fb27SDimitry Andric ///
532*06c3fb27SDimitry Andric /// \param __A
533*06c3fb27SDimitry Andric ///    A 256-bit vector of [4 x double] containing the multiplicand.
534*06c3fb27SDimitry Andric /// \param __B
535*06c3fb27SDimitry Andric ///    A 256-bit vector of [4 x double] containing the multiplier.
536*06c3fb27SDimitry Andric /// \param __C
537*06c3fb27SDimitry Andric ///    A 256-bit vector of [4 x double] containing the addend.
538*06c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x double] containing the result.
5390b57cec5SDimitry Andric static __inline__ __m256d __DEFAULT_FN_ATTRS256
5400b57cec5SDimitry Andric _mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
5410b57cec5SDimitry Andric {
5420b57cec5SDimitry Andric   return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
5430b57cec5SDimitry Andric }
5440b57cec5SDimitry Andric 
545*06c3fb27SDimitry Andric /// Computes a multiply-subtract of 256-bit vectors of [8 x float].
546*06c3fb27SDimitry Andric ///    For each element, computes <c> (__A * __B) - __C </c>.
547*06c3fb27SDimitry Andric ///
548*06c3fb27SDimitry Andric /// \headerfile <immintrin.h>
549*06c3fb27SDimitry Andric ///
550*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMSUB213PS instruction.
551*06c3fb27SDimitry Andric ///
552*06c3fb27SDimitry Andric /// \param __A
553*06c3fb27SDimitry Andric ///    A 256-bit vector of [8 x float] containing the multiplicand.
554*06c3fb27SDimitry Andric /// \param __B
555*06c3fb27SDimitry Andric ///    A 256-bit vector of [8 x float] containing the multiplier.
556*06c3fb27SDimitry Andric /// \param __C
557*06c3fb27SDimitry Andric ///    A 256-bit vector of [8 x float] containing the subtrahend.
558*06c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x float] containing the result.
5590b57cec5SDimitry Andric static __inline__ __m256 __DEFAULT_FN_ATTRS256
5600b57cec5SDimitry Andric _mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
5610b57cec5SDimitry Andric {
5620b57cec5SDimitry Andric   return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
5630b57cec5SDimitry Andric }
5640b57cec5SDimitry Andric 
565*06c3fb27SDimitry Andric /// Computes a multiply-subtract of 256-bit vectors of [4 x double].
566*06c3fb27SDimitry Andric ///    For each element, computes <c> (__A * __B) - __C </c>.
567*06c3fb27SDimitry Andric ///
568*06c3fb27SDimitry Andric /// \headerfile <immintrin.h>
569*06c3fb27SDimitry Andric ///
570*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMSUB213PD instruction.
571*06c3fb27SDimitry Andric ///
572*06c3fb27SDimitry Andric /// \param __A
573*06c3fb27SDimitry Andric ///    A 256-bit vector of [4 x double] containing the multiplicand.
574*06c3fb27SDimitry Andric /// \param __B
575*06c3fb27SDimitry Andric ///    A 256-bit vector of [4 x double] containing the multiplier.
576*06c3fb27SDimitry Andric /// \param __C
577*06c3fb27SDimitry Andric ///    A 256-bit vector of [4 x double] containing the subtrahend.
578*06c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x double] containing the result.
5790b57cec5SDimitry Andric static __inline__ __m256d __DEFAULT_FN_ATTRS256
5800b57cec5SDimitry Andric _mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
5810b57cec5SDimitry Andric {
5820b57cec5SDimitry Andric   return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
5830b57cec5SDimitry Andric }
5840b57cec5SDimitry Andric 
585*06c3fb27SDimitry Andric /// Computes a negated multiply-add of 256-bit vectors of [8 x float].
586*06c3fb27SDimitry Andric ///    For each element, computes <c> -(__A * __B) + __C </c>.
587*06c3fb27SDimitry Andric ///
588*06c3fb27SDimitry Andric /// \headerfile <immintrin.h>
589*06c3fb27SDimitry Andric ///
590*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFNMADD213PS instruction.
591*06c3fb27SDimitry Andric ///
592*06c3fb27SDimitry Andric /// \param __A
593*06c3fb27SDimitry Andric ///    A 256-bit vector of [8 x float] containing the multiplicand.
594*06c3fb27SDimitry Andric /// \param __B
595*06c3fb27SDimitry Andric ///    A 256-bit vector of [8 x float] containing the multiplier.
596*06c3fb27SDimitry Andric /// \param __C
597*06c3fb27SDimitry Andric ///    A 256-bit vector of [8 x float] containing the addend.
598*06c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x float] containing the result.
5990b57cec5SDimitry Andric static __inline__ __m256 __DEFAULT_FN_ATTRS256
6000b57cec5SDimitry Andric _mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
6010b57cec5SDimitry Andric {
6020b57cec5SDimitry Andric   return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
6030b57cec5SDimitry Andric }
6040b57cec5SDimitry Andric 
605*06c3fb27SDimitry Andric /// Computes a negated multiply-add of 256-bit vectors of [4 x double].
606*06c3fb27SDimitry Andric ///    For each element, computes <c> -(__A * __B) + __C </c>.
607*06c3fb27SDimitry Andric ///
608*06c3fb27SDimitry Andric /// \headerfile <immintrin.h>
609*06c3fb27SDimitry Andric ///
610*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFNMADD213PD instruction.
611*06c3fb27SDimitry Andric ///
612*06c3fb27SDimitry Andric /// \param __A
613*06c3fb27SDimitry Andric ///    A 256-bit vector of [4 x double] containing the multiplicand.
614*06c3fb27SDimitry Andric /// \param __B
615*06c3fb27SDimitry Andric ///    A 256-bit vector of [4 x double] containing the multiplier.
616*06c3fb27SDimitry Andric /// \param __C
617*06c3fb27SDimitry Andric ///    A 256-bit vector of [4 x double] containing the addend.
618*06c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x double] containing the result.
6190b57cec5SDimitry Andric static __inline__ __m256d __DEFAULT_FN_ATTRS256
6200b57cec5SDimitry Andric _mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
6210b57cec5SDimitry Andric {
6220b57cec5SDimitry Andric   return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C);
6230b57cec5SDimitry Andric }
6240b57cec5SDimitry Andric 
625*06c3fb27SDimitry Andric /// Computes a negated multiply-subtract of 256-bit vectors of [8 x float].
626*06c3fb27SDimitry Andric ///    For each element, computes <c> -(__A * __B) - __C </c>.
627*06c3fb27SDimitry Andric ///
628*06c3fb27SDimitry Andric /// \headerfile <immintrin.h>
629*06c3fb27SDimitry Andric ///
630*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFNMSUB213PS instruction.
631*06c3fb27SDimitry Andric ///
632*06c3fb27SDimitry Andric /// \param __A
633*06c3fb27SDimitry Andric ///    A 256-bit vector of [8 x float] containing the multiplicand.
634*06c3fb27SDimitry Andric /// \param __B
635*06c3fb27SDimitry Andric ///    A 256-bit vector of [8 x float] containing the multiplier.
636*06c3fb27SDimitry Andric /// \param __C
637*06c3fb27SDimitry Andric ///    A 256-bit vector of [8 x float] containing the subtrahend.
638*06c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x float] containing the result.
6390b57cec5SDimitry Andric static __inline__ __m256 __DEFAULT_FN_ATTRS256
6400b57cec5SDimitry Andric _mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
6410b57cec5SDimitry Andric {
6420b57cec5SDimitry Andric   return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
6430b57cec5SDimitry Andric }
6440b57cec5SDimitry Andric 
645*06c3fb27SDimitry Andric /// Computes a negated multiply-subtract of 256-bit vectors of [4 x double].
646*06c3fb27SDimitry Andric ///    For each element, computes <c> -(__A * __B) - __C </c>.
647*06c3fb27SDimitry Andric ///
648*06c3fb27SDimitry Andric /// \headerfile <immintrin.h>
649*06c3fb27SDimitry Andric ///
650*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFNMSUB213PD instruction.
651*06c3fb27SDimitry Andric ///
652*06c3fb27SDimitry Andric /// \param __A
653*06c3fb27SDimitry Andric ///    A 256-bit vector of [4 x double] containing the multiplicand.
654*06c3fb27SDimitry Andric /// \param __B
655*06c3fb27SDimitry Andric ///    A 256-bit vector of [4 x double] containing the multiplier.
656*06c3fb27SDimitry Andric /// \param __C
657*06c3fb27SDimitry Andric ///    A 256-bit vector of [4 x double] containing the subtrahend.
658*06c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x double] containing the result.
6590b57cec5SDimitry Andric static __inline__ __m256d __DEFAULT_FN_ATTRS256
6600b57cec5SDimitry Andric _mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
6610b57cec5SDimitry Andric {
6620b57cec5SDimitry Andric   return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
6630b57cec5SDimitry Andric }
6640b57cec5SDimitry Andric 
665*06c3fb27SDimitry Andric /// Computes a multiply with alternating add/subtract of 256-bit vectors of
666*06c3fb27SDimitry Andric ///    [8 x float].
667*06c3fb27SDimitry Andric /// \code
668*06c3fb27SDimitry Andric /// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
669*06c3fb27SDimitry Andric /// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32]
670*06c3fb27SDimitry Andric /// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64]
671*06c3fb27SDimitry Andric /// result[127:96] = (__A[127:96] * __B[127:96]) + __C[127:96]
672*06c3fb27SDimitry Andric /// result[159:128] = (__A[159:128] * __B[159:128]) - __C[159:128]
673*06c3fb27SDimitry Andric /// result[191:160] = (__A[191:160] * __B[191:160]) + __C[191:160]
674*06c3fb27SDimitry Andric /// result[223:192] = (__A[223:192] * __B[223:192]) - __C[223:192]
675*06c3fb27SDimitry Andric /// result[255:224] = (__A[255:224] * __B[255:224]) + __C[255:224]
676*06c3fb27SDimitry Andric /// \endcode
677*06c3fb27SDimitry Andric ///
678*06c3fb27SDimitry Andric /// \headerfile <immintrin.h>
679*06c3fb27SDimitry Andric ///
680*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMADDSUB213PS instruction.
681*06c3fb27SDimitry Andric ///
682*06c3fb27SDimitry Andric /// \param __A
683*06c3fb27SDimitry Andric ///    A 256-bit vector of [8 x float] containing the multiplicand.
684*06c3fb27SDimitry Andric /// \param __B
685*06c3fb27SDimitry Andric ///    A 256-bit vector of [8 x float] containing the multiplier.
686*06c3fb27SDimitry Andric /// \param __C
687*06c3fb27SDimitry Andric ///    A 256-bit vector of [8 x float] containing the addend/subtrahend.
688*06c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x float] containing the result.
6890b57cec5SDimitry Andric static __inline__ __m256 __DEFAULT_FN_ATTRS256
6900b57cec5SDimitry Andric _mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C)
6910b57cec5SDimitry Andric {
6920b57cec5SDimitry Andric   return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
6930b57cec5SDimitry Andric }
6940b57cec5SDimitry Andric 
695*06c3fb27SDimitry Andric /// Computes a multiply with alternating add/subtract of 256-bit vectors of
696*06c3fb27SDimitry Andric ///    [4 x double].
697*06c3fb27SDimitry Andric /// \code
698*06c3fb27SDimitry Andric /// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
699*06c3fb27SDimitry Andric /// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64]
700*06c3fb27SDimitry Andric /// result[191:128] = (__A[191:128] * __B[191:128]) - __C[191:128]
701*06c3fb27SDimitry Andric /// result[255:192] = (__A[255:192] * __B[255:192]) + __C[255:192]
702*06c3fb27SDimitry Andric /// \endcode
703*06c3fb27SDimitry Andric ///
704*06c3fb27SDimitry Andric /// \headerfile <immintrin.h>
705*06c3fb27SDimitry Andric ///
706*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
707*06c3fb27SDimitry Andric ///
708*06c3fb27SDimitry Andric /// \param __A
709*06c3fb27SDimitry Andric ///    A 256-bit vector of [4 x double] containing the multiplicand.
710*06c3fb27SDimitry Andric /// \param __B
711*06c3fb27SDimitry Andric ///    A 256-bit vector of [4 x double] containing the multiplier.
712*06c3fb27SDimitry Andric /// \param __C
713*06c3fb27SDimitry Andric ///    A 256-bit vector of [4 x double] containing the addend/subtrahend.
714*06c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x double] containing the result.
7150b57cec5SDimitry Andric static __inline__ __m256d __DEFAULT_FN_ATTRS256
7160b57cec5SDimitry Andric _mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C)
7170b57cec5SDimitry Andric {
7180b57cec5SDimitry Andric   return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
7190b57cec5SDimitry Andric }
7200b57cec5SDimitry Andric 
721*06c3fb27SDimitry Andric /// Computes a vector multiply with alternating add/subtract of 256-bit
722*06c3fb27SDimitry Andric ///    vectors of [8 x float].
723*06c3fb27SDimitry Andric /// \code
724*06c3fb27SDimitry Andric /// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
725*06c3fb27SDimitry Andric /// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32]
726*06c3fb27SDimitry Andric /// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64]
727*06c3fb27SDimitry Andric /// result[127:96] = (__A[127:96] * __B[127:96]) - __C[127:96]
728*06c3fb27SDimitry Andric /// result[159:128] = (__A[159:128] * __B[159:128]) + __C[159:128]
729*06c3fb27SDimitry Andric /// result[191:160] = (__A[191:160] * __B[191:160]) - __C[191:160]
730*06c3fb27SDimitry Andric /// result[223:192] = (__A[223:192] * __B[223:192]) + __C[223:192]
731*06c3fb27SDimitry Andric /// result[255:224] = (__A[255:224] * __B[255:224]) - __C[255:224]
732*06c3fb27SDimitry Andric /// \endcode
733*06c3fb27SDimitry Andric ///
734*06c3fb27SDimitry Andric /// \headerfile <immintrin.h>
735*06c3fb27SDimitry Andric ///
736*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMSUBADD213PS instruction.
737*06c3fb27SDimitry Andric ///
738*06c3fb27SDimitry Andric /// \param __A
739*06c3fb27SDimitry Andric ///    A 256-bit vector of [8 x float] containing the multiplicand.
740*06c3fb27SDimitry Andric /// \param __B
741*06c3fb27SDimitry Andric ///    A 256-bit vector of [8 x float] containing the multiplier.
742*06c3fb27SDimitry Andric /// \param __C
743*06c3fb27SDimitry Andric ///    A 256-bit vector of [8 x float] containing the addend/subtrahend.
744*06c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x float] containing the result.
7450b57cec5SDimitry Andric static __inline__ __m256 __DEFAULT_FN_ATTRS256
7460b57cec5SDimitry Andric _mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C)
7470b57cec5SDimitry Andric {
7480b57cec5SDimitry Andric   return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
7490b57cec5SDimitry Andric }
7500b57cec5SDimitry Andric 
751*06c3fb27SDimitry Andric /// Computes a vector multiply with alternating add/subtract of 256-bit
752*06c3fb27SDimitry Andric ///    vectors of [4 x double].
753*06c3fb27SDimitry Andric /// \code
754*06c3fb27SDimitry Andric /// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
755*06c3fb27SDimitry Andric /// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64]
756*06c3fb27SDimitry Andric /// result[191:128] = (__A[191:128] * __B[191:128]) + __C[191:128]
757*06c3fb27SDimitry Andric /// result[255:192] = (__A[255:192] * __B[255:192]) - __C[255:192]
758*06c3fb27SDimitry Andric /// \endcode
759*06c3fb27SDimitry Andric ///
760*06c3fb27SDimitry Andric /// \headerfile <immintrin.h>
761*06c3fb27SDimitry Andric ///
762*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMSUBADD213PD instruction.
763*06c3fb27SDimitry Andric ///
764*06c3fb27SDimitry Andric /// \param __A
765*06c3fb27SDimitry Andric ///    A 256-bit vector of [4 x double] containing the multiplicand.
766*06c3fb27SDimitry Andric /// \param __B
767*06c3fb27SDimitry Andric ///    A 256-bit vector of [4 x double] containing the multiplier.
768*06c3fb27SDimitry Andric /// \param __C
769*06c3fb27SDimitry Andric ///    A 256-bit vector of [4 x double] containing the addend/subtrahend.
770*06c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x double] containing the result.
7710b57cec5SDimitry Andric static __inline__ __m256d __DEFAULT_FN_ATTRS256
7720b57cec5SDimitry Andric _mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C)
7730b57cec5SDimitry Andric {
7740b57cec5SDimitry Andric   return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
7750b57cec5SDimitry Andric }
7760b57cec5SDimitry Andric 
/* Remove the attribute shorthand macros defined at the top of this file now
 * that every function using them has been declared. */
#undef __DEFAULT_FN_ATTRS256
#undef __DEFAULT_FN_ATTRS128
7790b57cec5SDimitry Andric 
7800b57cec5SDimitry Andric #endif /* __FMAINTRIN_H */
781