10b57cec5SDimitry Andric /*===---- fmaintrin.h - FMA intrinsics -------------------------------------=== 20b57cec5SDimitry Andric * 30b57cec5SDimitry Andric * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 40b57cec5SDimitry Andric * See https://llvm.org/LICENSE.txt for license information. 50b57cec5SDimitry Andric * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 60b57cec5SDimitry Andric * 70b57cec5SDimitry Andric *===-----------------------------------------------------------------------=== 80b57cec5SDimitry Andric */ 90b57cec5SDimitry Andric 100b57cec5SDimitry Andric #ifndef __IMMINTRIN_H 110b57cec5SDimitry Andric #error "Never use <fmaintrin.h> directly; include <immintrin.h> instead." 120b57cec5SDimitry Andric #endif 130b57cec5SDimitry Andric 140b57cec5SDimitry Andric #ifndef __FMAINTRIN_H 150b57cec5SDimitry Andric #define __FMAINTRIN_H 160b57cec5SDimitry Andric 170b57cec5SDimitry Andric /* Define the default attributes for the functions in this file. */ 180b57cec5SDimitry Andric #define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(128))) 190b57cec5SDimitry Andric #define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(256))) 200b57cec5SDimitry Andric 21*06c3fb27SDimitry Andric /// Computes a multiply-add of 128-bit vectors of [4 x float]. 22*06c3fb27SDimitry Andric /// For each element, computes <c> (__A * __B) + __C </c>. 23*06c3fb27SDimitry Andric /// 24*06c3fb27SDimitry Andric /// \headerfile <immintrin.h> 25*06c3fb27SDimitry Andric /// 26*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMADD213PS instruction. 27*06c3fb27SDimitry Andric /// 28*06c3fb27SDimitry Andric /// \param __A 29*06c3fb27SDimitry Andric /// A 128-bit vector of [4 x float] containing the multiplicand. 30*06c3fb27SDimitry Andric /// \param __B 31*06c3fb27SDimitry Andric /// A 128-bit vector of [4 x float] containing the multiplier. 32*06c3fb27SDimitry Andric /// \param __C 33*06c3fb27SDimitry Andric /// A 128-bit vector of [4 x float] containing the addend. 34*06c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x float] containing the result. 350b57cec5SDimitry Andric static __inline__ __m128 __DEFAULT_FN_ATTRS128 360b57cec5SDimitry Andric _mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C) 370b57cec5SDimitry Andric { 380b57cec5SDimitry Andric return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); 390b57cec5SDimitry Andric } 400b57cec5SDimitry Andric 41*06c3fb27SDimitry Andric /// Computes a multiply-add of 128-bit vectors of [2 x double]. 42*06c3fb27SDimitry Andric /// For each element, computes <c> (__A * __B) + __C </c>. 43*06c3fb27SDimitry Andric /// 44*06c3fb27SDimitry Andric /// \headerfile <immintrin.h> 45*06c3fb27SDimitry Andric /// 46*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMADD213PD instruction. 47*06c3fb27SDimitry Andric /// 48*06c3fb27SDimitry Andric /// \param __A 49*06c3fb27SDimitry Andric /// A 128-bit vector of [2 x double] containing the multiplicand. 50*06c3fb27SDimitry Andric /// \param __B 51*06c3fb27SDimitry Andric /// A 128-bit vector of [2 x double] containing the multiplier. 52*06c3fb27SDimitry Andric /// \param __C 53*06c3fb27SDimitry Andric /// A 128-bit vector of [2 x double] containing the addend. 54*06c3fb27SDimitry Andric /// \returns A 128-bit [2 x double] vector containing the result. 550b57cec5SDimitry Andric static __inline__ __m128d __DEFAULT_FN_ATTRS128 560b57cec5SDimitry Andric _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C) 570b57cec5SDimitry Andric { 580b57cec5SDimitry Andric return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C); 590b57cec5SDimitry Andric } 600b57cec5SDimitry Andric 61*06c3fb27SDimitry Andric /// Computes a scalar multiply-add of the single-precision values in the 62*06c3fb27SDimitry Andric /// low 32 bits of 128-bit vectors of [4 x float]. 63*06c3fb27SDimitry Andric /// \code 64*06c3fb27SDimitry Andric /// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0] 65*06c3fb27SDimitry Andric /// result[127:32] = __A[127:32] 66*06c3fb27SDimitry Andric /// \endcode 67*06c3fb27SDimitry Andric /// 68*06c3fb27SDimitry Andric /// \headerfile <immintrin.h> 69*06c3fb27SDimitry Andric /// 70*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMADD213SS instruction. 71*06c3fb27SDimitry Andric /// 72*06c3fb27SDimitry Andric /// \param __A 73*06c3fb27SDimitry Andric /// A 128-bit vector of [4 x float] containing the multiplicand in the low 74*06c3fb27SDimitry Andric /// 32 bits. 75*06c3fb27SDimitry Andric /// \param __B 76*06c3fb27SDimitry Andric /// A 128-bit vector of [4 x float] containing the multiplier in the low 77*06c3fb27SDimitry Andric /// 32 bits. 78*06c3fb27SDimitry Andric /// \param __C 79*06c3fb27SDimitry Andric /// A 128-bit vector of [4 x float] containing the addend in the low 80*06c3fb27SDimitry Andric /// 32 bits. 81*06c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x float] containing the result in the low 82*06c3fb27SDimitry Andric /// 32 bits and a copy of \a __A[127:32] in the upper 96 bits. 830b57cec5SDimitry Andric static __inline__ __m128 __DEFAULT_FN_ATTRS128 840b57cec5SDimitry Andric _mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C) 850b57cec5SDimitry Andric { 860b57cec5SDimitry Andric return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); 870b57cec5SDimitry Andric } 880b57cec5SDimitry Andric 89*06c3fb27SDimitry Andric /// Computes a scalar multiply-add of the double-precision values in the 90*06c3fb27SDimitry Andric /// low 64 bits of 128-bit vectors of [2 x double]. 91*06c3fb27SDimitry Andric /// \code 92*06c3fb27SDimitry Andric /// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0] 93*06c3fb27SDimitry Andric /// result[127:64] = __A[127:64] 94*06c3fb27SDimitry Andric /// \endcode 95*06c3fb27SDimitry Andric /// 96*06c3fb27SDimitry Andric /// \headerfile <immintrin.h> 97*06c3fb27SDimitry Andric /// 98*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMADD213SD instruction. 99*06c3fb27SDimitry Andric /// 100*06c3fb27SDimitry Andric /// \param __A 101*06c3fb27SDimitry Andric /// A 128-bit vector of [2 x double] containing the multiplicand in the low 102*06c3fb27SDimitry Andric /// 64 bits. 103*06c3fb27SDimitry Andric /// \param __B 104*06c3fb27SDimitry Andric /// A 128-bit vector of [2 x double] containing the multiplier in the low 105*06c3fb27SDimitry Andric /// 64 bits. 106*06c3fb27SDimitry Andric /// \param __C 107*06c3fb27SDimitry Andric /// A 128-bit vector of [2 x double] containing the addend in the low 108*06c3fb27SDimitry Andric /// 64 bits. 109*06c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x double] containing the result in the low 110*06c3fb27SDimitry Andric /// 64 bits and a copy of \a __A[127:64] in the upper 64 bits. 1110b57cec5SDimitry Andric static __inline__ __m128d __DEFAULT_FN_ATTRS128 1120b57cec5SDimitry Andric _mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C) 1130b57cec5SDimitry Andric { 1140b57cec5SDimitry Andric return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, (__v2df)__C); 1150b57cec5SDimitry Andric } 1160b57cec5SDimitry Andric 117*06c3fb27SDimitry Andric /// Computes a multiply-subtract of 128-bit vectors of [4 x float]. 118*06c3fb27SDimitry Andric /// For each element, computes <c> (__A * __B) - __C </c>. 119*06c3fb27SDimitry Andric /// 120*06c3fb27SDimitry Andric /// \headerfile <immintrin.h> 121*06c3fb27SDimitry Andric /// 122*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMSUB213PS instruction. 123*06c3fb27SDimitry Andric /// 124*06c3fb27SDimitry Andric /// \param __A 125*06c3fb27SDimitry Andric /// A 128-bit vector of [4 x float] containing the multiplicand. 126*06c3fb27SDimitry Andric /// \param __B 127*06c3fb27SDimitry Andric /// A 128-bit vector of [4 x float] containing the multiplier. 128*06c3fb27SDimitry Andric /// \param __C 129*06c3fb27SDimitry Andric /// A 128-bit vector of [4 x float] containing the subtrahend. 130*06c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x float] containing the result. 1310b57cec5SDimitry Andric static __inline__ __m128 __DEFAULT_FN_ATTRS128 1320b57cec5SDimitry Andric _mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C) 1330b57cec5SDimitry Andric { 1340b57cec5SDimitry Andric return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); 1350b57cec5SDimitry Andric } 1360b57cec5SDimitry Andric 137*06c3fb27SDimitry Andric /// Computes a multiply-subtract of 128-bit vectors of [2 x double]. 138*06c3fb27SDimitry Andric /// For each element, computes <c> (__A * __B) - __C </c>. 139*06c3fb27SDimitry Andric /// 140*06c3fb27SDimitry Andric /// \headerfile <immintrin.h> 141*06c3fb27SDimitry Andric /// 142*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMSUB213PD instruction. 143*06c3fb27SDimitry Andric /// 144*06c3fb27SDimitry Andric /// \param __A 145*06c3fb27SDimitry Andric /// A 128-bit vector of [2 x double] containing the multiplicand. 146*06c3fb27SDimitry Andric /// \param __B 147*06c3fb27SDimitry Andric /// A 128-bit vector of [2 x double] containing the multiplier. 148*06c3fb27SDimitry Andric /// \param __C 149*06c3fb27SDimitry Andric /// A 128-bit vector of [2 x double] containing the addend. 150*06c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x double] containing the result. 1510b57cec5SDimitry Andric static __inline__ __m128d __DEFAULT_FN_ATTRS128 1520b57cec5SDimitry Andric _mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C) 1530b57cec5SDimitry Andric { 1540b57cec5SDimitry Andric return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C); 1550b57cec5SDimitry Andric } 1560b57cec5SDimitry Andric 157*06c3fb27SDimitry Andric /// Computes a scalar multiply-subtract of the single-precision values in 158*06c3fb27SDimitry Andric /// the low 32 bits of 128-bit vectors of [4 x float]. 159*06c3fb27SDimitry Andric /// \code 160*06c3fb27SDimitry Andric /// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0] 161*06c3fb27SDimitry Andric /// result[127:32] = __A[127:32] 162*06c3fb27SDimitry Andric /// \endcode 163*06c3fb27SDimitry Andric /// 164*06c3fb27SDimitry Andric /// \headerfile <immintrin.h> 165*06c3fb27SDimitry Andric /// 166*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMSUB213SS instruction. 167*06c3fb27SDimitry Andric /// 168*06c3fb27SDimitry Andric /// \param __A 169*06c3fb27SDimitry Andric /// A 128-bit vector of [4 x float] containing the multiplicand in the low 170*06c3fb27SDimitry Andric /// 32 bits. 171*06c3fb27SDimitry Andric /// \param __B 172*06c3fb27SDimitry Andric /// A 128-bit vector of [4 x float] containing the multiplier in the low 173*06c3fb27SDimitry Andric /// 32 bits. 174*06c3fb27SDimitry Andric /// \param __C 175*06c3fb27SDimitry Andric /// A 128-bit vector of [4 x float] containing the subtrahend in the low 176*06c3fb27SDimitry Andric /// 32 bits. 177*06c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x float] containing the result in the low 178*06c3fb27SDimitry Andric /// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits. 1790b57cec5SDimitry Andric static __inline__ __m128 __DEFAULT_FN_ATTRS128 1800b57cec5SDimitry Andric _mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C) 1810b57cec5SDimitry Andric { 1820b57cec5SDimitry Andric return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); 1830b57cec5SDimitry Andric } 1840b57cec5SDimitry Andric 185*06c3fb27SDimitry Andric /// Computes a scalar multiply-subtract of the double-precision values in 186*06c3fb27SDimitry Andric /// the low 64 bits of 128-bit vectors of [2 x double]. 187*06c3fb27SDimitry Andric /// \code 188*06c3fb27SDimitry Andric /// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0] 189*06c3fb27SDimitry Andric /// result[127:64] = __A[127:64] 190*06c3fb27SDimitry Andric /// \endcode 191*06c3fb27SDimitry Andric /// 192*06c3fb27SDimitry Andric /// \headerfile <immintrin.h> 193*06c3fb27SDimitry Andric /// 194*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMSUB213SD instruction. 195*06c3fb27SDimitry Andric /// 196*06c3fb27SDimitry Andric /// \param __A 197*06c3fb27SDimitry Andric /// A 128-bit vector of [2 x double] containing the multiplicand in the low 198*06c3fb27SDimitry Andric /// 64 bits. 199*06c3fb27SDimitry Andric /// \param __B 200*06c3fb27SDimitry Andric /// A 128-bit vector of [2 x double] containing the multiplier in the low 201*06c3fb27SDimitry Andric /// 64 bits. 202*06c3fb27SDimitry Andric /// \param __C 203*06c3fb27SDimitry Andric /// A 128-bit vector of [2 x double] containing the subtrahend in the low 204*06c3fb27SDimitry Andric /// 64 bits. 205*06c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x double] containing the result in the low 206*06c3fb27SDimitry Andric /// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits. 2070b57cec5SDimitry Andric static __inline__ __m128d __DEFAULT_FN_ATTRS128 2080b57cec5SDimitry Andric _mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C) 2090b57cec5SDimitry Andric { 2100b57cec5SDimitry Andric return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, -(__v2df)__C); 2110b57cec5SDimitry Andric } 2120b57cec5SDimitry Andric 213*06c3fb27SDimitry Andric /// Computes a negated multiply-add of 128-bit vectors of [4 x float]. 214*06c3fb27SDimitry Andric /// For each element, computes <c> -(__A * __B) + __C </c>. 215*06c3fb27SDimitry Andric /// 216*06c3fb27SDimitry Andric /// \headerfile <immintrin.h> 217*06c3fb27SDimitry Andric /// 218*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFNMADD213DPS instruction. 219*06c3fb27SDimitry Andric /// 220*06c3fb27SDimitry Andric /// \param __A 221*06c3fb27SDimitry Andric /// A 128-bit vector of [4 x float] containing the multiplicand. 222*06c3fb27SDimitry Andric /// \param __B 223*06c3fb27SDimitry Andric /// A 128-bit vector of [4 x float] containing the multiplier. 224*06c3fb27SDimitry Andric /// \param __C 225*06c3fb27SDimitry Andric /// A 128-bit vector of [4 x float] containing the addend. 226*06c3fb27SDimitry Andric /// \returns A 128-bit [4 x float] vector containing the result. 2270b57cec5SDimitry Andric static __inline__ __m128 __DEFAULT_FN_ATTRS128 2280b57cec5SDimitry Andric _mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C) 2290b57cec5SDimitry Andric { 2300b57cec5SDimitry Andric return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C); 2310b57cec5SDimitry Andric } 2320b57cec5SDimitry Andric 233*06c3fb27SDimitry Andric /// Computes a negated multiply-add of 128-bit vectors of [2 x double]. 234*06c3fb27SDimitry Andric /// For each element, computes <c> -(__A * __B) + __C </c>. 235*06c3fb27SDimitry Andric /// 236*06c3fb27SDimitry Andric /// \headerfile <immintrin.h> 237*06c3fb27SDimitry Andric /// 238*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFNMADD213PD instruction. 239*06c3fb27SDimitry Andric /// 240*06c3fb27SDimitry Andric /// \param __A 241*06c3fb27SDimitry Andric /// A 128-bit vector of [2 x double] containing the multiplicand. 242*06c3fb27SDimitry Andric /// \param __B 243*06c3fb27SDimitry Andric /// A 128-bit vector of [2 x double] containing the multiplier. 244*06c3fb27SDimitry Andric /// \param __C 245*06c3fb27SDimitry Andric /// A 128-bit vector of [2 x double] containing the addend. 246*06c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x double] containing the result. 2470b57cec5SDimitry Andric static __inline__ __m128d __DEFAULT_FN_ATTRS128 2480b57cec5SDimitry Andric _mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C) 2490b57cec5SDimitry Andric { 2500b57cec5SDimitry Andric return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C); 2510b57cec5SDimitry Andric } 2520b57cec5SDimitry Andric 253*06c3fb27SDimitry Andric /// Computes a scalar negated multiply-add of the single-precision values in 254*06c3fb27SDimitry Andric /// the low 32 bits of 128-bit vectors of [4 x float]. 255*06c3fb27SDimitry Andric /// \code 256*06c3fb27SDimitry Andric /// result[31:0] = -(__A[31:0] * __B[31:0]) + __C[31:0] 257*06c3fb27SDimitry Andric /// result[127:32] = __A[127:32] 258*06c3fb27SDimitry Andric /// \endcode 259*06c3fb27SDimitry Andric /// 260*06c3fb27SDimitry Andric /// \headerfile <immintrin.h> 261*06c3fb27SDimitry Andric /// 262*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFNMADD213SS instruction. 263*06c3fb27SDimitry Andric /// 264*06c3fb27SDimitry Andric /// \param __A 265*06c3fb27SDimitry Andric /// A 128-bit vector of [4 x float] containing the multiplicand in the low 266*06c3fb27SDimitry Andric /// 32 bits. 267*06c3fb27SDimitry Andric /// \param __B 268*06c3fb27SDimitry Andric /// A 128-bit vector of [4 x float] containing the multiplier in the low 269*06c3fb27SDimitry Andric /// 32 bits. 270*06c3fb27SDimitry Andric /// \param __C 271*06c3fb27SDimitry Andric /// A 128-bit vector of [4 x float] containing the addend in the low 272*06c3fb27SDimitry Andric /// 32 bits. 273*06c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x float] containing the result in the low 274*06c3fb27SDimitry Andric /// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits. 2750b57cec5SDimitry Andric static __inline__ __m128 __DEFAULT_FN_ATTRS128 2760b57cec5SDimitry Andric _mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C) 2770b57cec5SDimitry Andric { 2780b57cec5SDimitry Andric return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C); 2790b57cec5SDimitry Andric } 2800b57cec5SDimitry Andric 281*06c3fb27SDimitry Andric /// Computes a scalar negated multiply-add of the double-precision values 282*06c3fb27SDimitry Andric /// in the low 64 bits of 128-bit vectors of [2 x double]. 283*06c3fb27SDimitry Andric /// \code 284*06c3fb27SDimitry Andric /// result[63:0] = -(__A[63:0] * __B[63:0]) + __C[63:0] 285*06c3fb27SDimitry Andric /// result[127:64] = __A[127:64] 286*06c3fb27SDimitry Andric /// \endcode 287*06c3fb27SDimitry Andric /// 288*06c3fb27SDimitry Andric /// \headerfile <immintrin.h> 289*06c3fb27SDimitry Andric /// 290*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFNMADD213SD instruction. 291*06c3fb27SDimitry Andric /// 292*06c3fb27SDimitry Andric /// \param __A 293*06c3fb27SDimitry Andric /// A 128-bit vector of [2 x double] containing the multiplicand in the low 294*06c3fb27SDimitry Andric /// 64 bits. 295*06c3fb27SDimitry Andric /// \param __B 296*06c3fb27SDimitry Andric /// A 128-bit vector of [2 x double] containing the multiplier in the low 297*06c3fb27SDimitry Andric /// 64 bits. 298*06c3fb27SDimitry Andric /// \param __C 299*06c3fb27SDimitry Andric /// A 128-bit vector of [2 x double] containing the addend in the low 300*06c3fb27SDimitry Andric /// 64 bits. 301*06c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x double] containing the result in the low 302*06c3fb27SDimitry Andric /// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits. 3030b57cec5SDimitry Andric static __inline__ __m128d __DEFAULT_FN_ATTRS128 3040b57cec5SDimitry Andric _mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C) 3050b57cec5SDimitry Andric { 3060b57cec5SDimitry Andric return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, (__v2df)__C); 3070b57cec5SDimitry Andric } 3080b57cec5SDimitry Andric 309*06c3fb27SDimitry Andric /// Computes a negated multiply-subtract of 128-bit vectors of [4 x float]. 310*06c3fb27SDimitry Andric /// For each element, computes <c> -(__A * __B) - __C </c>. 311*06c3fb27SDimitry Andric /// 312*06c3fb27SDimitry Andric /// \headerfile <immintrin.h> 313*06c3fb27SDimitry Andric /// 314*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFNMSUB213PS instruction. 315*06c3fb27SDimitry Andric /// 316*06c3fb27SDimitry Andric /// \param __A 317*06c3fb27SDimitry Andric /// A 128-bit vector of [4 x float] containing the multiplicand. 318*06c3fb27SDimitry Andric /// \param __B 319*06c3fb27SDimitry Andric /// A 128-bit vector of [4 x float] containing the multiplier. 320*06c3fb27SDimitry Andric /// \param __C 321*06c3fb27SDimitry Andric /// A 128-bit vector of [4 x float] containing the subtrahend. 322*06c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x float] containing the result. 3230b57cec5SDimitry Andric static __inline__ __m128 __DEFAULT_FN_ATTRS128 3240b57cec5SDimitry Andric _mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C) 3250b57cec5SDimitry Andric { 3260b57cec5SDimitry Andric return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); 3270b57cec5SDimitry Andric } 3280b57cec5SDimitry Andric 329*06c3fb27SDimitry Andric /// Computes a negated multiply-subtract of 128-bit vectors of [2 x double]. 330*06c3fb27SDimitry Andric /// For each element, computes <c> -(__A * __B) - __C </c>. 331*06c3fb27SDimitry Andric /// 332*06c3fb27SDimitry Andric /// \headerfile <immintrin.h> 333*06c3fb27SDimitry Andric /// 334*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFNMSUB213PD instruction. 335*06c3fb27SDimitry Andric /// 336*06c3fb27SDimitry Andric /// \param __A 337*06c3fb27SDimitry Andric /// A 128-bit vector of [2 x double] containing the multiplicand. 338*06c3fb27SDimitry Andric /// \param __B 339*06c3fb27SDimitry Andric /// A 128-bit vector of [2 x double] containing the multiplier. 340*06c3fb27SDimitry Andric /// \param __C 341*06c3fb27SDimitry Andric /// A 128-bit vector of [2 x double] containing the subtrahend. 342*06c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x double] containing the result. 3430b57cec5SDimitry Andric static __inline__ __m128d __DEFAULT_FN_ATTRS128 3440b57cec5SDimitry Andric _mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C) 3450b57cec5SDimitry Andric { 3460b57cec5SDimitry Andric return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C); 3470b57cec5SDimitry Andric } 3480b57cec5SDimitry Andric 349*06c3fb27SDimitry Andric /// Computes a scalar negated multiply-subtract of the single-precision 350*06c3fb27SDimitry Andric /// values in the low 32 bits of 128-bit vectors of [4 x float]. 351*06c3fb27SDimitry Andric /// \code 352*06c3fb27SDimitry Andric /// result[31:0] = -(__A[31:0] * __B[31:0]) - __C[31:0] 353*06c3fb27SDimitry Andric /// result[127:32] = __A[127:32] 354*06c3fb27SDimitry Andric /// \endcode 355*06c3fb27SDimitry Andric /// 356*06c3fb27SDimitry Andric /// \headerfile <immintrin.h> 357*06c3fb27SDimitry Andric /// 358*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFNMSUB213SS instruction. 359*06c3fb27SDimitry Andric /// 360*06c3fb27SDimitry Andric /// \param __A 361*06c3fb27SDimitry Andric /// A 128-bit vector of [4 x float] containing the multiplicand in the low 362*06c3fb27SDimitry Andric /// 32 bits. 363*06c3fb27SDimitry Andric /// \param __B 364*06c3fb27SDimitry Andric /// A 128-bit vector of [4 x float] containing the multiplier in the low 365*06c3fb27SDimitry Andric /// 32 bits. 366*06c3fb27SDimitry Andric /// \param __C 367*06c3fb27SDimitry Andric /// A 128-bit vector of [4 x float] containing the subtrahend in the low 368*06c3fb27SDimitry Andric /// 32 bits. 369*06c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x float] containing the result in the low 370*06c3fb27SDimitry Andric /// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits. 3710b57cec5SDimitry Andric static __inline__ __m128 __DEFAULT_FN_ATTRS128 3720b57cec5SDimitry Andric _mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C) 3730b57cec5SDimitry Andric { 3740b57cec5SDimitry Andric return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C); 3750b57cec5SDimitry Andric } 3760b57cec5SDimitry Andric 377*06c3fb27SDimitry Andric /// Computes a scalar negated multiply-subtract of the double-precision 378*06c3fb27SDimitry Andric /// values in the low 64 bits of 128-bit vectors of [2 x double]. 379*06c3fb27SDimitry Andric /// \code 380*06c3fb27SDimitry Andric /// result[63:0] = -(__A[63:0] * __B[63:0]) - __C[63:0] 381*06c3fb27SDimitry Andric /// result[127:64] = __A[127:64] 382*06c3fb27SDimitry Andric /// \endcode 383*06c3fb27SDimitry Andric /// 384*06c3fb27SDimitry Andric /// \headerfile <immintrin.h> 385*06c3fb27SDimitry Andric /// 386*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFNMSUB213SD instruction. 387*06c3fb27SDimitry Andric /// 388*06c3fb27SDimitry Andric /// \param __A 389*06c3fb27SDimitry Andric /// A 128-bit vector of [2 x double] containing the multiplicand in the low 390*06c3fb27SDimitry Andric /// 64 bits. 391*06c3fb27SDimitry Andric /// \param __B 392*06c3fb27SDimitry Andric /// A 128-bit vector of [2 x double] containing the multiplier in the low 393*06c3fb27SDimitry Andric /// 64 bits. 394*06c3fb27SDimitry Andric /// \param __C 395*06c3fb27SDimitry Andric /// A 128-bit vector of [2 x double] containing the subtrahend in the low 396*06c3fb27SDimitry Andric /// 64 bits. 397*06c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x double] containing the result in the low 398*06c3fb27SDimitry Andric /// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits. 3990b57cec5SDimitry Andric static __inline__ __m128d __DEFAULT_FN_ATTRS128 4000b57cec5SDimitry Andric _mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C) 4010b57cec5SDimitry Andric { 4020b57cec5SDimitry Andric return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, -(__v2df)__C); 4030b57cec5SDimitry Andric } 4040b57cec5SDimitry Andric 405*06c3fb27SDimitry Andric /// Computes a multiply with alternating add/subtract of 128-bit vectors of 406*06c3fb27SDimitry Andric /// [4 x float]. 407*06c3fb27SDimitry Andric /// \code 408*06c3fb27SDimitry Andric /// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0] 409*06c3fb27SDimitry Andric /// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32] 410*06c3fb27SDimitry Andric /// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64] 411*06c3fb27SDimitry Andric /// result[127:96] = (__A[127:96] * __B[127:96]) + __C[127:96] 412*06c3fb27SDimitry Andric /// \endcode 413*06c3fb27SDimitry Andric /// 414*06c3fb27SDimitry Andric /// \headerfile <immintrin.h> 415*06c3fb27SDimitry Andric /// 416*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMADDSUB213PS instruction. 417*06c3fb27SDimitry Andric /// 418*06c3fb27SDimitry Andric /// \param __A 419*06c3fb27SDimitry Andric /// A 128-bit vector of [4 x float] containing the multiplicand. 420*06c3fb27SDimitry Andric /// \param __B 421*06c3fb27SDimitry Andric /// A 128-bit vector of [4 x float] containing the multiplier. 422*06c3fb27SDimitry Andric /// \param __C 423*06c3fb27SDimitry Andric /// A 128-bit vector of [4 x float] containing the addend/subtrahend. 424*06c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x float] containing the result. 4250b57cec5SDimitry Andric static __inline__ __m128 __DEFAULT_FN_ATTRS128 4260b57cec5SDimitry Andric _mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C) 4270b57cec5SDimitry Andric { 4280b57cec5SDimitry Andric return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); 4290b57cec5SDimitry Andric } 4300b57cec5SDimitry Andric 431*06c3fb27SDimitry Andric /// Computes a multiply with alternating add/subtract of 128-bit vectors of 432*06c3fb27SDimitry Andric /// [2 x double]. 433*06c3fb27SDimitry Andric /// \code 434*06c3fb27SDimitry Andric /// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0] 435*06c3fb27SDimitry Andric /// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64] 436*06c3fb27SDimitry Andric /// \endcode 437*06c3fb27SDimitry Andric /// 438*06c3fb27SDimitry Andric /// \headerfile <immintrin.h> 439*06c3fb27SDimitry Andric /// 440*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMADDSUB213PD instruction. 441*06c3fb27SDimitry Andric /// 442*06c3fb27SDimitry Andric /// \param __A 443*06c3fb27SDimitry Andric /// A 128-bit vector of [2 x double] containing the multiplicand. 444*06c3fb27SDimitry Andric /// \param __B 445*06c3fb27SDimitry Andric /// A 128-bit vector of [2 x double] containing the multiplier. 446*06c3fb27SDimitry Andric /// \param __C 447*06c3fb27SDimitry Andric /// A 128-bit vector of [2 x double] containing the addend/subtrahend. 448*06c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x double] containing the result. 4490b57cec5SDimitry Andric static __inline__ __m128d __DEFAULT_FN_ATTRS128 4500b57cec5SDimitry Andric _mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C) 4510b57cec5SDimitry Andric { 4520b57cec5SDimitry Andric return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C); 4530b57cec5SDimitry Andric } 4540b57cec5SDimitry Andric 455*06c3fb27SDimitry Andric /// Computes a multiply with alternating add/subtract of 128-bit vectors of 456*06c3fb27SDimitry Andric /// [4 x float]. 457*06c3fb27SDimitry Andric /// \code 458*06c3fb27SDimitry Andric /// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0] 459*06c3fb27SDimitry Andric /// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32] 460*06c3fb27SDimitry Andric /// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64] 461*06c3fb27SDimitry Andric /// result[127:96 = (__A[127:96] * __B[127:96]) - __C[127:96] 462*06c3fb27SDimitry Andric /// \endcode 463*06c3fb27SDimitry Andric /// 464*06c3fb27SDimitry Andric /// \headerfile <immintrin.h> 465*06c3fb27SDimitry Andric /// 466*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMSUBADD213PS instruction. 467*06c3fb27SDimitry Andric /// 468*06c3fb27SDimitry Andric /// \param __A 469*06c3fb27SDimitry Andric /// A 128-bit vector of [4 x float] containing the multiplicand. 470*06c3fb27SDimitry Andric /// \param __B 471*06c3fb27SDimitry Andric /// A 128-bit vector of [4 x float] containing the multiplier. 472*06c3fb27SDimitry Andric /// \param __C 473*06c3fb27SDimitry Andric /// A 128-bit vector of [4 x float] containing the addend/subtrahend. 474*06c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x float] containing the result. 4750b57cec5SDimitry Andric static __inline__ __m128 __DEFAULT_FN_ATTRS128 4760b57cec5SDimitry Andric _mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C) 4770b57cec5SDimitry Andric { 4780b57cec5SDimitry Andric return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); 4790b57cec5SDimitry Andric } 4800b57cec5SDimitry Andric 481*06c3fb27SDimitry Andric /// Computes a multiply with alternating add/subtract of 128-bit vectors of 482*06c3fb27SDimitry Andric /// [2 x double]. 483*06c3fb27SDimitry Andric /// \code 484*06c3fb27SDimitry Andric /// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0] 485*06c3fb27SDimitry Andric /// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64] 486*06c3fb27SDimitry Andric /// \endcode 487*06c3fb27SDimitry Andric /// 488*06c3fb27SDimitry Andric /// \headerfile <immintrin.h> 489*06c3fb27SDimitry Andric /// 490*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMADDSUB213PD instruction. 491*06c3fb27SDimitry Andric /// 492*06c3fb27SDimitry Andric /// \param __A 493*06c3fb27SDimitry Andric /// A 128-bit vector of [2 x double] containing the multiplicand. 494*06c3fb27SDimitry Andric /// \param __B 495*06c3fb27SDimitry Andric /// A 128-bit vector of [2 x double] containing the multiplier. 496*06c3fb27SDimitry Andric /// \param __C 497*06c3fb27SDimitry Andric /// A 128-bit vector of [2 x double] containing the addend/subtrahend. 498*06c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x double] containing the result. 4990b57cec5SDimitry Andric static __inline__ __m128d __DEFAULT_FN_ATTRS128 5000b57cec5SDimitry Andric _mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C) 5010b57cec5SDimitry Andric { 5020b57cec5SDimitry Andric return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C); 5030b57cec5SDimitry Andric } 5040b57cec5SDimitry Andric 505*06c3fb27SDimitry Andric /// Computes a multiply-add of 256-bit vectors of [8 x float]. 506*06c3fb27SDimitry Andric /// For each element, computes <c> (__A * __B) + __C </c>. 507*06c3fb27SDimitry Andric /// 508*06c3fb27SDimitry Andric /// \headerfile <immintrin.h> 509*06c3fb27SDimitry Andric /// 510*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMADD213PS instruction. 511*06c3fb27SDimitry Andric /// 512*06c3fb27SDimitry Andric /// \param __A 513*06c3fb27SDimitry Andric /// A 256-bit vector of [8 x float] containing the multiplicand. 514*06c3fb27SDimitry Andric /// \param __B 515*06c3fb27SDimitry Andric /// A 256-bit vector of [8 x float] containing the multiplier. 516*06c3fb27SDimitry Andric /// \param __C 517*06c3fb27SDimitry Andric /// A 256-bit vector of [8 x float] containing the addend. 518*06c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x float] containing the result. 5190b57cec5SDimitry Andric static __inline__ __m256 __DEFAULT_FN_ATTRS256 5200b57cec5SDimitry Andric _mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C) 5210b57cec5SDimitry Andric { 5220b57cec5SDimitry Andric return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); 5230b57cec5SDimitry Andric } 5240b57cec5SDimitry Andric 525*06c3fb27SDimitry Andric /// Computes a multiply-add of 256-bit vectors of [4 x double]. 526*06c3fb27SDimitry Andric /// For each element, computes <c> (__A * __B) + __C </c>. 527*06c3fb27SDimitry Andric /// 528*06c3fb27SDimitry Andric /// \headerfile <immintrin.h> 529*06c3fb27SDimitry Andric /// 530*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMADD213PD instruction. 531*06c3fb27SDimitry Andric /// 532*06c3fb27SDimitry Andric /// \param __A 533*06c3fb27SDimitry Andric /// A 256-bit vector of [4 x double] containing the multiplicand. 534*06c3fb27SDimitry Andric /// \param __B 535*06c3fb27SDimitry Andric /// A 256-bit vector of [4 x double] containing the multiplier. 536*06c3fb27SDimitry Andric /// \param __C 537*06c3fb27SDimitry Andric /// A 256-bit vector of [4 x double] containing the addend. 538*06c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x double] containing the result. 5390b57cec5SDimitry Andric static __inline__ __m256d __DEFAULT_FN_ATTRS256 5400b57cec5SDimitry Andric _mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C) 5410b57cec5SDimitry Andric { 5420b57cec5SDimitry Andric return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C); 5430b57cec5SDimitry Andric } 5440b57cec5SDimitry Andric 545*06c3fb27SDimitry Andric /// Computes a multiply-subtract of 256-bit vectors of [8 x float]. 546*06c3fb27SDimitry Andric /// For each element, computes <c> (__A * __B) - __C </c>. 547*06c3fb27SDimitry Andric /// 548*06c3fb27SDimitry Andric /// \headerfile <immintrin.h> 549*06c3fb27SDimitry Andric /// 550*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMSUB213PS instruction. 551*06c3fb27SDimitry Andric /// 552*06c3fb27SDimitry Andric /// \param __A 553*06c3fb27SDimitry Andric /// A 256-bit vector of [8 x float] containing the multiplicand. 554*06c3fb27SDimitry Andric /// \param __B 555*06c3fb27SDimitry Andric /// A 256-bit vector of [8 x float] containing the multiplier. 556*06c3fb27SDimitry Andric /// \param __C 557*06c3fb27SDimitry Andric /// A 256-bit vector of [8 x float] containing the subtrahend. 558*06c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x float] containing the result. 5590b57cec5SDimitry Andric static __inline__ __m256 __DEFAULT_FN_ATTRS256 5600b57cec5SDimitry Andric _mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C) 5610b57cec5SDimitry Andric { 5620b57cec5SDimitry Andric return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); 5630b57cec5SDimitry Andric } 5640b57cec5SDimitry Andric 565*06c3fb27SDimitry Andric /// Computes a multiply-subtract of 256-bit vectors of [4 x double]. 566*06c3fb27SDimitry Andric /// For each element, computes <c> (__A * __B) - __C </c>. 567*06c3fb27SDimitry Andric /// 568*06c3fb27SDimitry Andric /// \headerfile <immintrin.h> 569*06c3fb27SDimitry Andric /// 570*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMSUB213PD instruction. 571*06c3fb27SDimitry Andric /// 572*06c3fb27SDimitry Andric /// \param __A 573*06c3fb27SDimitry Andric /// A 256-bit vector of [4 x double] containing the multiplicand. 574*06c3fb27SDimitry Andric /// \param __B 575*06c3fb27SDimitry Andric /// A 256-bit vector of [4 x double] containing the multiplier. 576*06c3fb27SDimitry Andric /// \param __C 577*06c3fb27SDimitry Andric /// A 256-bit vector of [4 x double] containing the subtrahend. 578*06c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x double] containing the result. 5790b57cec5SDimitry Andric static __inline__ __m256d __DEFAULT_FN_ATTRS256 5800b57cec5SDimitry Andric _mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C) 5810b57cec5SDimitry Andric { 5820b57cec5SDimitry Andric return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C); 5830b57cec5SDimitry Andric } 5840b57cec5SDimitry Andric 585*06c3fb27SDimitry Andric /// Computes a negated multiply-add of 256-bit vectors of [8 x float]. 586*06c3fb27SDimitry Andric /// For each element, computes <c> -(__A * __B) + __C </c>. 587*06c3fb27SDimitry Andric /// 588*06c3fb27SDimitry Andric /// \headerfile <immintrin.h> 589*06c3fb27SDimitry Andric /// 590*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFNMADD213PS instruction. 591*06c3fb27SDimitry Andric /// 592*06c3fb27SDimitry Andric /// \param __A 593*06c3fb27SDimitry Andric /// A 256-bit vector of [8 x float] containing the multiplicand. 594*06c3fb27SDimitry Andric /// \param __B 595*06c3fb27SDimitry Andric /// A 256-bit vector of [8 x float] containing the multiplier. 596*06c3fb27SDimitry Andric /// \param __C 597*06c3fb27SDimitry Andric /// A 256-bit vector of [8 x float] containing the addend. 598*06c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x float] containing the result. 5990b57cec5SDimitry Andric static __inline__ __m256 __DEFAULT_FN_ATTRS256 6000b57cec5SDimitry Andric _mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C) 6010b57cec5SDimitry Andric { 6020b57cec5SDimitry Andric return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C); 6030b57cec5SDimitry Andric } 6040b57cec5SDimitry Andric 605*06c3fb27SDimitry Andric /// Computes a negated multiply-add of 256-bit vectors of [4 x double]. 606*06c3fb27SDimitry Andric /// For each element, computes <c> -(__A * __B) + __C </c>. 607*06c3fb27SDimitry Andric /// 608*06c3fb27SDimitry Andric /// \headerfile <immintrin.h> 609*06c3fb27SDimitry Andric /// 610*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFNMADD213PD instruction. 611*06c3fb27SDimitry Andric /// 612*06c3fb27SDimitry Andric /// \param __A 613*06c3fb27SDimitry Andric /// A 256-bit vector of [4 x double] containing the multiplicand. 614*06c3fb27SDimitry Andric /// \param __B 615*06c3fb27SDimitry Andric /// A 256-bit vector of [4 x double] containing the multiplier. 616*06c3fb27SDimitry Andric /// \param __C 617*06c3fb27SDimitry Andric /// A 256-bit vector of [4 x double] containing the addend. 618*06c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x double] containing the result. 6190b57cec5SDimitry Andric static __inline__ __m256d __DEFAULT_FN_ATTRS256 6200b57cec5SDimitry Andric _mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C) 6210b57cec5SDimitry Andric { 6220b57cec5SDimitry Andric return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C); 6230b57cec5SDimitry Andric } 6240b57cec5SDimitry Andric 625*06c3fb27SDimitry Andric /// Computes a negated multiply-subtract of 256-bit vectors of [8 x float]. 626*06c3fb27SDimitry Andric /// For each element, computes <c> -(__A * __B) - __C </c>. 627*06c3fb27SDimitry Andric /// 628*06c3fb27SDimitry Andric /// \headerfile <immintrin.h> 629*06c3fb27SDimitry Andric /// 630*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFNMSUB213PS instruction. 631*06c3fb27SDimitry Andric /// 632*06c3fb27SDimitry Andric /// \param __A 633*06c3fb27SDimitry Andric /// A 256-bit vector of [8 x float] containing the multiplicand. 634*06c3fb27SDimitry Andric /// \param __B 635*06c3fb27SDimitry Andric /// A 256-bit vector of [8 x float] containing the multiplier. 636*06c3fb27SDimitry Andric /// \param __C 637*06c3fb27SDimitry Andric /// A 256-bit vector of [8 x float] containing the subtrahend. 638*06c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x float] containing the result. 6390b57cec5SDimitry Andric static __inline__ __m256 __DEFAULT_FN_ATTRS256 6400b57cec5SDimitry Andric _mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C) 6410b57cec5SDimitry Andric { 6420b57cec5SDimitry Andric return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); 6430b57cec5SDimitry Andric } 6440b57cec5SDimitry Andric 645*06c3fb27SDimitry Andric /// Computes a negated multiply-subtract of 256-bit vectors of [4 x double]. 646*06c3fb27SDimitry Andric /// For each element, computes <c> -(__A * __B) - __C </c>. 647*06c3fb27SDimitry Andric /// 648*06c3fb27SDimitry Andric /// \headerfile <immintrin.h> 649*06c3fb27SDimitry Andric /// 650*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFNMSUB213PD instruction. 651*06c3fb27SDimitry Andric /// 652*06c3fb27SDimitry Andric /// \param __A 653*06c3fb27SDimitry Andric /// A 256-bit vector of [4 x double] containing the multiplicand. 654*06c3fb27SDimitry Andric /// \param __B 655*06c3fb27SDimitry Andric /// A 256-bit vector of [4 x double] containing the multiplier. 656*06c3fb27SDimitry Andric /// \param __C 657*06c3fb27SDimitry Andric /// A 256-bit vector of [4 x double] containing the subtrahend. 658*06c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x double] containing the result. 6590b57cec5SDimitry Andric static __inline__ __m256d __DEFAULT_FN_ATTRS256 6600b57cec5SDimitry Andric _mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C) 6610b57cec5SDimitry Andric { 6620b57cec5SDimitry Andric return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C); 6630b57cec5SDimitry Andric } 6640b57cec5SDimitry Andric 665*06c3fb27SDimitry Andric /// Computes a multiply with alternating add/subtract of 256-bit vectors of 666*06c3fb27SDimitry Andric /// [8 x float]. 667*06c3fb27SDimitry Andric /// \code 668*06c3fb27SDimitry Andric /// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0] 669*06c3fb27SDimitry Andric /// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32] 670*06c3fb27SDimitry Andric /// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64] 671*06c3fb27SDimitry Andric /// result[127:96] = (__A[127:96] * __B[127:96]) + __C[127:96] 672*06c3fb27SDimitry Andric /// result[159:128] = (__A[159:128] * __B[159:128]) - __C[159:128] 673*06c3fb27SDimitry Andric /// result[191:160] = (__A[191:160] * __B[191:160]) + __C[191:160] 674*06c3fb27SDimitry Andric /// result[223:192] = (__A[223:192] * __B[223:192]) - __C[223:192] 675*06c3fb27SDimitry Andric /// result[255:224] = (__A[255:224] * __B[255:224]) + __C[255:224] 676*06c3fb27SDimitry Andric /// \endcode 677*06c3fb27SDimitry Andric /// 678*06c3fb27SDimitry Andric /// \headerfile <immintrin.h> 679*06c3fb27SDimitry Andric /// 680*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMADDSUB213PS instruction. 681*06c3fb27SDimitry Andric /// 682*06c3fb27SDimitry Andric /// \param __A 683*06c3fb27SDimitry Andric /// A 256-bit vector of [8 x float] containing the multiplicand. 684*06c3fb27SDimitry Andric /// \param __B 685*06c3fb27SDimitry Andric /// A 256-bit vector of [8 x float] containing the multiplier. 686*06c3fb27SDimitry Andric /// \param __C 687*06c3fb27SDimitry Andric /// A 256-bit vector of [8 x float] containing the addend/subtrahend. 688*06c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x float] containing the result. 6890b57cec5SDimitry Andric static __inline__ __m256 __DEFAULT_FN_ATTRS256 6900b57cec5SDimitry Andric _mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C) 6910b57cec5SDimitry Andric { 6920b57cec5SDimitry Andric return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); 6930b57cec5SDimitry Andric } 6940b57cec5SDimitry Andric 695*06c3fb27SDimitry Andric /// Computes a multiply with alternating add/subtract of 256-bit vectors of 696*06c3fb27SDimitry Andric /// [4 x double]. 697*06c3fb27SDimitry Andric /// \code 698*06c3fb27SDimitry Andric /// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0] 699*06c3fb27SDimitry Andric /// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64] 700*06c3fb27SDimitry Andric /// result[191:128] = (__A[191:128] * __B[191:128]) - __C[191:128] 701*06c3fb27SDimitry Andric /// result[255:192] = (__A[255:192] * __B[255:192]) + __C[255:192] 702*06c3fb27SDimitry Andric /// \endcode 703*06c3fb27SDimitry Andric /// 704*06c3fb27SDimitry Andric /// \headerfile <immintrin.h> 705*06c3fb27SDimitry Andric /// 706*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMADDSUB213PD instruction. 707*06c3fb27SDimitry Andric /// 708*06c3fb27SDimitry Andric /// \param __A 709*06c3fb27SDimitry Andric /// A 256-bit vector of [4 x double] containing the multiplicand. 710*06c3fb27SDimitry Andric /// \param __B 711*06c3fb27SDimitry Andric /// A 256-bit vector of [4 x double] containing the multiplier. 712*06c3fb27SDimitry Andric /// \param __C 713*06c3fb27SDimitry Andric /// A 256-bit vector of [4 x double] containing the addend/subtrahend. 714*06c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x double] containing the result. 7150b57cec5SDimitry Andric static __inline__ __m256d __DEFAULT_FN_ATTRS256 7160b57cec5SDimitry Andric _mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C) 7170b57cec5SDimitry Andric { 7180b57cec5SDimitry Andric return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C); 7190b57cec5SDimitry Andric } 7200b57cec5SDimitry Andric 721*06c3fb27SDimitry Andric /// Computes a vector multiply with alternating add/subtract of 256-bit 722*06c3fb27SDimitry Andric /// vectors of [8 x float]. 723*06c3fb27SDimitry Andric /// \code 724*06c3fb27SDimitry Andric /// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0] 725*06c3fb27SDimitry Andric /// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32] 726*06c3fb27SDimitry Andric /// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64] 727*06c3fb27SDimitry Andric /// result[127:96] = (__A[127:96] * __B[127:96]) - __C[127:96] 728*06c3fb27SDimitry Andric /// result[159:128] = (__A[159:128] * __B[159:128]) + __C[159:128] 729*06c3fb27SDimitry Andric /// result[191:160] = (__A[191:160] * __B[191:160]) - __C[191:160] 730*06c3fb27SDimitry Andric /// result[223:192] = (__A[223:192] * __B[223:192]) + __C[223:192] 731*06c3fb27SDimitry Andric /// result[255:224] = (__A[255:224] * __B[255:224]) - __C[255:224] 732*06c3fb27SDimitry Andric /// \endcode 733*06c3fb27SDimitry Andric /// 734*06c3fb27SDimitry Andric /// \headerfile <immintrin.h> 735*06c3fb27SDimitry Andric /// 736*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMSUBADD213PS instruction. 737*06c3fb27SDimitry Andric /// 738*06c3fb27SDimitry Andric /// \param __A 739*06c3fb27SDimitry Andric /// A 256-bit vector of [8 x float] containing the multiplicand. 740*06c3fb27SDimitry Andric /// \param __B 741*06c3fb27SDimitry Andric /// A 256-bit vector of [8 x float] containing the multiplier. 742*06c3fb27SDimitry Andric /// \param __C 743*06c3fb27SDimitry Andric /// A 256-bit vector of [8 x float] containing the addend/subtrahend. 744*06c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x float] containing the result. 7450b57cec5SDimitry Andric static __inline__ __m256 __DEFAULT_FN_ATTRS256 7460b57cec5SDimitry Andric _mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C) 7470b57cec5SDimitry Andric { 7480b57cec5SDimitry Andric return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); 7490b57cec5SDimitry Andric } 7500b57cec5SDimitry Andric 751*06c3fb27SDimitry Andric /// Computes a vector multiply with alternating add/subtract of 256-bit 752*06c3fb27SDimitry Andric /// vectors of [4 x double]. 753*06c3fb27SDimitry Andric /// \code 754*06c3fb27SDimitry Andric /// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0] 755*06c3fb27SDimitry Andric /// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64] 756*06c3fb27SDimitry Andric /// result[191:128] = (__A[191:128] * __B[191:128]) + __C[191:128] 757*06c3fb27SDimitry Andric /// result[255:192] = (__A[255:192] * __B[255:192]) - __C[255:192] 758*06c3fb27SDimitry Andric /// \endcode 759*06c3fb27SDimitry Andric /// 760*06c3fb27SDimitry Andric /// \headerfile <immintrin.h> 761*06c3fb27SDimitry Andric /// 762*06c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMSUBADD213PD instruction. 763*06c3fb27SDimitry Andric /// 764*06c3fb27SDimitry Andric /// \param __A 765*06c3fb27SDimitry Andric /// A 256-bit vector of [4 x double] containing the multiplicand. 766*06c3fb27SDimitry Andric /// \param __B 767*06c3fb27SDimitry Andric /// A 256-bit vector of [4 x double] containing the multiplier. 768*06c3fb27SDimitry Andric /// \param __C 769*06c3fb27SDimitry Andric /// A 256-bit vector of [4 x double] containing the addend/subtrahend. 770*06c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x double] containing the result. 7710b57cec5SDimitry Andric static __inline__ __m256d __DEFAULT_FN_ATTRS256 7720b57cec5SDimitry Andric _mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C) 7730b57cec5SDimitry Andric { 7740b57cec5SDimitry Andric return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C); 7750b57cec5SDimitry Andric } 7760b57cec5SDimitry Andric 7770b57cec5SDimitry Andric #undef __DEFAULT_FN_ATTRS128 7780b57cec5SDimitry Andric #undef __DEFAULT_FN_ATTRS256 7790b57cec5SDimitry Andric 7800b57cec5SDimitry Andric #endif /* __FMAINTRIN_H */ 781