1 /*===---- fmaintrin.h - FMA intrinsics -------------------------------------=== 2 * 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 * 7 *===-----------------------------------------------------------------------=== 8 */ 9 10 #ifndef __IMMINTRIN_H 11 #error "Never use <fmaintrin.h> directly; include <immintrin.h> instead." 12 #endif 13 14 #ifndef __FMAINTRIN_H 15 #define __FMAINTRIN_H 16 17 /* Define the default attributes for the functions in this file. */ 18 #define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(128))) 19 #define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(256))) 20 21 /// Computes a multiply-add of 128-bit vectors of [4 x float]. 22 /// For each element, computes <c> (__A * __B) + __C </c>. 23 /// 24 /// \headerfile <immintrin.h> 25 /// 26 /// This intrinsic corresponds to the \c VFMADD213PS instruction. 27 /// 28 /// \param __A 29 /// A 128-bit vector of [4 x float] containing the multiplicand. 30 /// \param __B 31 /// A 128-bit vector of [4 x float] containing the multiplier. 32 /// \param __C 33 /// A 128-bit vector of [4 x float] containing the addend. 34 /// \returns A 128-bit vector of [4 x float] containing the result. 35 static __inline__ __m128 __DEFAULT_FN_ATTRS128 36 _mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C) 37 { 38 return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); 39 } 40 41 /// Computes a multiply-add of 128-bit vectors of [2 x double]. 42 /// For each element, computes <c> (__A * __B) + __C </c>. 43 /// 44 /// \headerfile <immintrin.h> 45 /// 46 /// This intrinsic corresponds to the \c VFMADD213PD instruction. 47 /// 48 /// \param __A 49 /// A 128-bit vector of [2 x double] containing the multiplicand. 50 /// \param __B 51 /// A 128-bit vector of [2 x double] containing the multiplier. 52 /// \param __C 53 /// A 128-bit vector of [2 x double] containing the addend. 54 /// \returns A 128-bit [2 x double] vector containing the result. 55 static __inline__ __m128d __DEFAULT_FN_ATTRS128 56 _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C) 57 { 58 return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C); 59 } 60 61 /// Computes a scalar multiply-add of the single-precision values in the 62 /// low 32 bits of 128-bit vectors of [4 x float]. 63 /// \code 64 /// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0] 65 /// result[127:32] = __A[127:32] 66 /// \endcode 67 /// 68 /// \headerfile <immintrin.h> 69 /// 70 /// This intrinsic corresponds to the \c VFMADD213SS instruction. 71 /// 72 /// \param __A 73 /// A 128-bit vector of [4 x float] containing the multiplicand in the low 74 /// 32 bits. 75 /// \param __B 76 /// A 128-bit vector of [4 x float] containing the multiplier in the low 77 /// 32 bits. 78 /// \param __C 79 /// A 128-bit vector of [4 x float] containing the addend in the low 80 /// 32 bits. 81 /// \returns A 128-bit vector of [4 x float] containing the result in the low 82 /// 32 bits and a copy of \a __A[127:32] in the upper 96 bits. 83 static __inline__ __m128 __DEFAULT_FN_ATTRS128 84 _mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C) 85 { 86 return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); 87 } 88 89 /// Computes a scalar multiply-add of the double-precision values in the 90 /// low 64 bits of 128-bit vectors of [2 x double]. 91 /// \code 92 /// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0] 93 /// result[127:64] = __A[127:64] 94 /// \endcode 95 /// 96 /// \headerfile <immintrin.h> 97 /// 98 /// This intrinsic corresponds to the \c VFMADD213SD instruction. 99 /// 100 /// \param __A 101 /// A 128-bit vector of [2 x double] containing the multiplicand in the low 102 /// 64 bits. 103 /// \param __B 104 /// A 128-bit vector of [2 x double] containing the multiplier in the low 105 /// 64 bits. 106 /// \param __C 107 /// A 128-bit vector of [2 x double] containing the addend in the low 108 /// 64 bits. 109 /// \returns A 128-bit vector of [2 x double] containing the result in the low 110 /// 64 bits and a copy of \a __A[127:64] in the upper 64 bits. 111 static __inline__ __m128d __DEFAULT_FN_ATTRS128 112 _mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C) 113 { 114 return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, (__v2df)__C); 115 } 116 117 /// Computes a multiply-subtract of 128-bit vectors of [4 x float]. 118 /// For each element, computes <c> (__A * __B) - __C </c>. 119 /// 120 /// \headerfile <immintrin.h> 121 /// 122 /// This intrinsic corresponds to the \c VFMSUB213PS instruction. 123 /// 124 /// \param __A 125 /// A 128-bit vector of [4 x float] containing the multiplicand. 126 /// \param __B 127 /// A 128-bit vector of [4 x float] containing the multiplier. 128 /// \param __C 129 /// A 128-bit vector of [4 x float] containing the subtrahend. 130 /// \returns A 128-bit vector of [4 x float] containing the result. 131 static __inline__ __m128 __DEFAULT_FN_ATTRS128 132 _mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C) 133 { 134 return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); 135 } 136 137 /// Computes a multiply-subtract of 128-bit vectors of [2 x double]. 138 /// For each element, computes <c> (__A * __B) - __C </c>. 139 /// 140 /// \headerfile <immintrin.h> 141 /// 142 /// This intrinsic corresponds to the \c VFMSUB213PD instruction. 143 /// 144 /// \param __A 145 /// A 128-bit vector of [2 x double] containing the multiplicand. 146 /// \param __B 147 /// A 128-bit vector of [2 x double] containing the multiplier. 148 /// \param __C 149 /// A 128-bit vector of [2 x double] containing the addend. 150 /// \returns A 128-bit vector of [2 x double] containing the result. 151 static __inline__ __m128d __DEFAULT_FN_ATTRS128 152 _mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C) 153 { 154 return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C); 155 } 156 157 /// Computes a scalar multiply-subtract of the single-precision values in 158 /// the low 32 bits of 128-bit vectors of [4 x float]. 159 /// \code 160 /// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0] 161 /// result[127:32] = __A[127:32] 162 /// \endcode 163 /// 164 /// \headerfile <immintrin.h> 165 /// 166 /// This intrinsic corresponds to the \c VFMSUB213SS instruction. 167 /// 168 /// \param __A 169 /// A 128-bit vector of [4 x float] containing the multiplicand in the low 170 /// 32 bits. 171 /// \param __B 172 /// A 128-bit vector of [4 x float] containing the multiplier in the low 173 /// 32 bits. 174 /// \param __C 175 /// A 128-bit vector of [4 x float] containing the subtrahend in the low 176 /// 32 bits. 177 /// \returns A 128-bit vector of [4 x float] containing the result in the low 178 /// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits. 179 static __inline__ __m128 __DEFAULT_FN_ATTRS128 180 _mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C) 181 { 182 return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); 183 } 184 185 /// Computes a scalar multiply-subtract of the double-precision values in 186 /// the low 64 bits of 128-bit vectors of [2 x double]. 187 /// \code 188 /// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0] 189 /// result[127:64] = __A[127:64] 190 /// \endcode 191 /// 192 /// \headerfile <immintrin.h> 193 /// 194 /// This intrinsic corresponds to the \c VFMSUB213SD instruction. 195 /// 196 /// \param __A 197 /// A 128-bit vector of [2 x double] containing the multiplicand in the low 198 /// 64 bits. 199 /// \param __B 200 /// A 128-bit vector of [2 x double] containing the multiplier in the low 201 /// 64 bits. 202 /// \param __C 203 /// A 128-bit vector of [2 x double] containing the subtrahend in the low 204 /// 64 bits. 205 /// \returns A 128-bit vector of [2 x double] containing the result in the low 206 /// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits. 207 static __inline__ __m128d __DEFAULT_FN_ATTRS128 208 _mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C) 209 { 210 return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, -(__v2df)__C); 211 } 212 213 /// Computes a negated multiply-add of 128-bit vectors of [4 x float]. 214 /// For each element, computes <c> -(__A * __B) + __C </c>. 215 /// 216 /// \headerfile <immintrin.h> 217 /// 218 /// This intrinsic corresponds to the \c VFNMADD213DPS instruction. 219 /// 220 /// \param __A 221 /// A 128-bit vector of [4 x float] containing the multiplicand. 222 /// \param __B 223 /// A 128-bit vector of [4 x float] containing the multiplier. 224 /// \param __C 225 /// A 128-bit vector of [4 x float] containing the addend. 226 /// \returns A 128-bit [4 x float] vector containing the result. 227 static __inline__ __m128 __DEFAULT_FN_ATTRS128 228 _mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C) 229 { 230 return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C); 231 } 232 233 /// Computes a negated multiply-add of 128-bit vectors of [2 x double]. 234 /// For each element, computes <c> -(__A * __B) + __C </c>. 235 /// 236 /// \headerfile <immintrin.h> 237 /// 238 /// This intrinsic corresponds to the \c VFNMADD213PD instruction. 239 /// 240 /// \param __A 241 /// A 128-bit vector of [2 x double] containing the multiplicand. 242 /// \param __B 243 /// A 128-bit vector of [2 x double] containing the multiplier. 244 /// \param __C 245 /// A 128-bit vector of [2 x double] containing the addend. 246 /// \returns A 128-bit vector of [2 x double] containing the result. 247 static __inline__ __m128d __DEFAULT_FN_ATTRS128 248 _mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C) 249 { 250 return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C); 251 } 252 253 /// Computes a scalar negated multiply-add of the single-precision values in 254 /// the low 32 bits of 128-bit vectors of [4 x float]. 255 /// \code 256 /// result[31:0] = -(__A[31:0] * __B[31:0]) + __C[31:0] 257 /// result[127:32] = __A[127:32] 258 /// \endcode 259 /// 260 /// \headerfile <immintrin.h> 261 /// 262 /// This intrinsic corresponds to the \c VFNMADD213SS instruction. 263 /// 264 /// \param __A 265 /// A 128-bit vector of [4 x float] containing the multiplicand in the low 266 /// 32 bits. 267 /// \param __B 268 /// A 128-bit vector of [4 x float] containing the multiplier in the low 269 /// 32 bits. 270 /// \param __C 271 /// A 128-bit vector of [4 x float] containing the addend in the low 272 /// 32 bits. 273 /// \returns A 128-bit vector of [4 x float] containing the result in the low 274 /// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits. 275 static __inline__ __m128 __DEFAULT_FN_ATTRS128 276 _mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C) 277 { 278 return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C); 279 } 280 281 /// Computes a scalar negated multiply-add of the double-precision values 282 /// in the low 64 bits of 128-bit vectors of [2 x double]. 283 /// \code 284 /// result[63:0] = -(__A[63:0] * __B[63:0]) + __C[63:0] 285 /// result[127:64] = __A[127:64] 286 /// \endcode 287 /// 288 /// \headerfile <immintrin.h> 289 /// 290 /// This intrinsic corresponds to the \c VFNMADD213SD instruction. 291 /// 292 /// \param __A 293 /// A 128-bit vector of [2 x double] containing the multiplicand in the low 294 /// 64 bits. 295 /// \param __B 296 /// A 128-bit vector of [2 x double] containing the multiplier in the low 297 /// 64 bits. 298 /// \param __C 299 /// A 128-bit vector of [2 x double] containing the addend in the low 300 /// 64 bits. 301 /// \returns A 128-bit vector of [2 x double] containing the result in the low 302 /// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits. 303 static __inline__ __m128d __DEFAULT_FN_ATTRS128 304 _mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C) 305 { 306 return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, (__v2df)__C); 307 } 308 309 /// Computes a negated multiply-subtract of 128-bit vectors of [4 x float]. 310 /// For each element, computes <c> -(__A * __B) - __C </c>. 311 /// 312 /// \headerfile <immintrin.h> 313 /// 314 /// This intrinsic corresponds to the \c VFNMSUB213PS instruction. 315 /// 316 /// \param __A 317 /// A 128-bit vector of [4 x float] containing the multiplicand. 318 /// \param __B 319 /// A 128-bit vector of [4 x float] containing the multiplier. 320 /// \param __C 321 /// A 128-bit vector of [4 x float] containing the subtrahend. 322 /// \returns A 128-bit vector of [4 x float] containing the result. 323 static __inline__ __m128 __DEFAULT_FN_ATTRS128 324 _mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C) 325 { 326 return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); 327 } 328 329 /// Computes a negated multiply-subtract of 128-bit vectors of [2 x double]. 330 /// For each element, computes <c> -(__A * __B) - __C </c>. 331 /// 332 /// \headerfile <immintrin.h> 333 /// 334 /// This intrinsic corresponds to the \c VFNMSUB213PD instruction. 335 /// 336 /// \param __A 337 /// A 128-bit vector of [2 x double] containing the multiplicand. 338 /// \param __B 339 /// A 128-bit vector of [2 x double] containing the multiplier. 340 /// \param __C 341 /// A 128-bit vector of [2 x double] containing the subtrahend. 342 /// \returns A 128-bit vector of [2 x double] containing the result. 343 static __inline__ __m128d __DEFAULT_FN_ATTRS128 344 _mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C) 345 { 346 return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C); 347 } 348 349 /// Computes a scalar negated multiply-subtract of the single-precision 350 /// values in the low 32 bits of 128-bit vectors of [4 x float]. 351 /// \code 352 /// result[31:0] = -(__A[31:0] * __B[31:0]) - __C[31:0] 353 /// result[127:32] = __A[127:32] 354 /// \endcode 355 /// 356 /// \headerfile <immintrin.h> 357 /// 358 /// This intrinsic corresponds to the \c VFNMSUB213SS instruction. 359 /// 360 /// \param __A 361 /// A 128-bit vector of [4 x float] containing the multiplicand in the low 362 /// 32 bits. 363 /// \param __B 364 /// A 128-bit vector of [4 x float] containing the multiplier in the low 365 /// 32 bits. 366 /// \param __C 367 /// A 128-bit vector of [4 x float] containing the subtrahend in the low 368 /// 32 bits. 369 /// \returns A 128-bit vector of [4 x float] containing the result in the low 370 /// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits. 371 static __inline__ __m128 __DEFAULT_FN_ATTRS128 372 _mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C) 373 { 374 return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C); 375 } 376 377 /// Computes a scalar negated multiply-subtract of the double-precision 378 /// values in the low 64 bits of 128-bit vectors of [2 x double]. 379 /// \code 380 /// result[63:0] = -(__A[63:0] * __B[63:0]) - __C[63:0] 381 /// result[127:64] = __A[127:64] 382 /// \endcode 383 /// 384 /// \headerfile <immintrin.h> 385 /// 386 /// This intrinsic corresponds to the \c VFNMSUB213SD instruction. 387 /// 388 /// \param __A 389 /// A 128-bit vector of [2 x double] containing the multiplicand in the low 390 /// 64 bits. 391 /// \param __B 392 /// A 128-bit vector of [2 x double] containing the multiplier in the low 393 /// 64 bits. 394 /// \param __C 395 /// A 128-bit vector of [2 x double] containing the subtrahend in the low 396 /// 64 bits. 397 /// \returns A 128-bit vector of [2 x double] containing the result in the low 398 /// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits. 399 static __inline__ __m128d __DEFAULT_FN_ATTRS128 400 _mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C) 401 { 402 return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, -(__v2df)__C); 403 } 404 405 /// Computes a multiply with alternating add/subtract of 128-bit vectors of 406 /// [4 x float]. 407 /// \code 408 /// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0] 409 /// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32] 410 /// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64] 411 /// result[127:96] = (__A[127:96] * __B[127:96]) + __C[127:96] 412 /// \endcode 413 /// 414 /// \headerfile <immintrin.h> 415 /// 416 /// This intrinsic corresponds to the \c VFMADDSUB213PS instruction. 417 /// 418 /// \param __A 419 /// A 128-bit vector of [4 x float] containing the multiplicand. 420 /// \param __B 421 /// A 128-bit vector of [4 x float] containing the multiplier. 422 /// \param __C 423 /// A 128-bit vector of [4 x float] containing the addend/subtrahend. 424 /// \returns A 128-bit vector of [4 x float] containing the result. 425 static __inline__ __m128 __DEFAULT_FN_ATTRS128 426 _mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C) 427 { 428 return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); 429 } 430 431 /// Computes a multiply with alternating add/subtract of 128-bit vectors of 432 /// [2 x double]. 433 /// \code 434 /// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0] 435 /// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64] 436 /// \endcode 437 /// 438 /// \headerfile <immintrin.h> 439 /// 440 /// This intrinsic corresponds to the \c VFMADDSUB213PD instruction. 441 /// 442 /// \param __A 443 /// A 128-bit vector of [2 x double] containing the multiplicand. 444 /// \param __B 445 /// A 128-bit vector of [2 x double] containing the multiplier. 446 /// \param __C 447 /// A 128-bit vector of [2 x double] containing the addend/subtrahend. 448 /// \returns A 128-bit vector of [2 x double] containing the result. 449 static __inline__ __m128d __DEFAULT_FN_ATTRS128 450 _mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C) 451 { 452 return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C); 453 } 454 455 /// Computes a multiply with alternating add/subtract of 128-bit vectors of 456 /// [4 x float]. 457 /// \code 458 /// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0] 459 /// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32] 460 /// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64] 461 /// result[127:96 = (__A[127:96] * __B[127:96]) - __C[127:96] 462 /// \endcode 463 /// 464 /// \headerfile <immintrin.h> 465 /// 466 /// This intrinsic corresponds to the \c VFMSUBADD213PS instruction. 467 /// 468 /// \param __A 469 /// A 128-bit vector of [4 x float] containing the multiplicand. 470 /// \param __B 471 /// A 128-bit vector of [4 x float] containing the multiplier. 472 /// \param __C 473 /// A 128-bit vector of [4 x float] containing the addend/subtrahend. 474 /// \returns A 128-bit vector of [4 x float] containing the result. 475 static __inline__ __m128 __DEFAULT_FN_ATTRS128 476 _mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C) 477 { 478 return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); 479 } 480 481 /// Computes a multiply with alternating add/subtract of 128-bit vectors of 482 /// [2 x double]. 483 /// \code 484 /// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0] 485 /// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64] 486 /// \endcode 487 /// 488 /// \headerfile <immintrin.h> 489 /// 490 /// This intrinsic corresponds to the \c VFMADDSUB213PD instruction. 491 /// 492 /// \param __A 493 /// A 128-bit vector of [2 x double] containing the multiplicand. 494 /// \param __B 495 /// A 128-bit vector of [2 x double] containing the multiplier. 496 /// \param __C 497 /// A 128-bit vector of [2 x double] containing the addend/subtrahend. 498 /// \returns A 128-bit vector of [2 x double] containing the result. 499 static __inline__ __m128d __DEFAULT_FN_ATTRS128 500 _mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C) 501 { 502 return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C); 503 } 504 505 /// Computes a multiply-add of 256-bit vectors of [8 x float]. 506 /// For each element, computes <c> (__A * __B) + __C </c>. 507 /// 508 /// \headerfile <immintrin.h> 509 /// 510 /// This intrinsic corresponds to the \c VFMADD213PS instruction. 511 /// 512 /// \param __A 513 /// A 256-bit vector of [8 x float] containing the multiplicand. 514 /// \param __B 515 /// A 256-bit vector of [8 x float] containing the multiplier. 516 /// \param __C 517 /// A 256-bit vector of [8 x float] containing the addend. 518 /// \returns A 256-bit vector of [8 x float] containing the result. 519 static __inline__ __m256 __DEFAULT_FN_ATTRS256 520 _mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C) 521 { 522 return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); 523 } 524 525 /// Computes a multiply-add of 256-bit vectors of [4 x double]. 526 /// For each element, computes <c> (__A * __B) + __C </c>. 527 /// 528 /// \headerfile <immintrin.h> 529 /// 530 /// This intrinsic corresponds to the \c VFMADD213PD instruction. 531 /// 532 /// \param __A 533 /// A 256-bit vector of [4 x double] containing the multiplicand. 534 /// \param __B 535 /// A 256-bit vector of [4 x double] containing the multiplier. 536 /// \param __C 537 /// A 256-bit vector of [4 x double] containing the addend. 538 /// \returns A 256-bit vector of [4 x double] containing the result. 539 static __inline__ __m256d __DEFAULT_FN_ATTRS256 540 _mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C) 541 { 542 return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C); 543 } 544 545 /// Computes a multiply-subtract of 256-bit vectors of [8 x float]. 546 /// For each element, computes <c> (__A * __B) - __C </c>. 547 /// 548 /// \headerfile <immintrin.h> 549 /// 550 /// This intrinsic corresponds to the \c VFMSUB213PS instruction. 551 /// 552 /// \param __A 553 /// A 256-bit vector of [8 x float] containing the multiplicand. 554 /// \param __B 555 /// A 256-bit vector of [8 x float] containing the multiplier. 556 /// \param __C 557 /// A 256-bit vector of [8 x float] containing the subtrahend. 558 /// \returns A 256-bit vector of [8 x float] containing the result. 559 static __inline__ __m256 __DEFAULT_FN_ATTRS256 560 _mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C) 561 { 562 return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); 563 } 564 565 /// Computes a multiply-subtract of 256-bit vectors of [4 x double]. 566 /// For each element, computes <c> (__A * __B) - __C </c>. 567 /// 568 /// \headerfile <immintrin.h> 569 /// 570 /// This intrinsic corresponds to the \c VFMSUB213PD instruction. 571 /// 572 /// \param __A 573 /// A 256-bit vector of [4 x double] containing the multiplicand. 574 /// \param __B 575 /// A 256-bit vector of [4 x double] containing the multiplier. 576 /// \param __C 577 /// A 256-bit vector of [4 x double] containing the subtrahend. 578 /// \returns A 256-bit vector of [4 x double] containing the result. 579 static __inline__ __m256d __DEFAULT_FN_ATTRS256 580 _mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C) 581 { 582 return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C); 583 } 584 585 /// Computes a negated multiply-add of 256-bit vectors of [8 x float]. 586 /// For each element, computes <c> -(__A * __B) + __C </c>. 587 /// 588 /// \headerfile <immintrin.h> 589 /// 590 /// This intrinsic corresponds to the \c VFNMADD213PS instruction. 591 /// 592 /// \param __A 593 /// A 256-bit vector of [8 x float] containing the multiplicand. 594 /// \param __B 595 /// A 256-bit vector of [8 x float] containing the multiplier. 596 /// \param __C 597 /// A 256-bit vector of [8 x float] containing the addend. 598 /// \returns A 256-bit vector of [8 x float] containing the result. 599 static __inline__ __m256 __DEFAULT_FN_ATTRS256 600 _mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C) 601 { 602 return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C); 603 } 604 605 /// Computes a negated multiply-add of 256-bit vectors of [4 x double]. 606 /// For each element, computes <c> -(__A * __B) + __C </c>. 607 /// 608 /// \headerfile <immintrin.h> 609 /// 610 /// This intrinsic corresponds to the \c VFNMADD213PD instruction. 611 /// 612 /// \param __A 613 /// A 256-bit vector of [4 x double] containing the multiplicand. 614 /// \param __B 615 /// A 256-bit vector of [4 x double] containing the multiplier. 616 /// \param __C 617 /// A 256-bit vector of [4 x double] containing the addend. 618 /// \returns A 256-bit vector of [4 x double] containing the result. 619 static __inline__ __m256d __DEFAULT_FN_ATTRS256 620 _mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C) 621 { 622 return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C); 623 } 624 625 /// Computes a negated multiply-subtract of 256-bit vectors of [8 x float]. 626 /// For each element, computes <c> -(__A * __B) - __C </c>. 627 /// 628 /// \headerfile <immintrin.h> 629 /// 630 /// This intrinsic corresponds to the \c VFNMSUB213PS instruction. 631 /// 632 /// \param __A 633 /// A 256-bit vector of [8 x float] containing the multiplicand. 634 /// \param __B 635 /// A 256-bit vector of [8 x float] containing the multiplier. 636 /// \param __C 637 /// A 256-bit vector of [8 x float] containing the subtrahend. 638 /// \returns A 256-bit vector of [8 x float] containing the result. 639 static __inline__ __m256 __DEFAULT_FN_ATTRS256 640 _mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C) 641 { 642 return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); 643 } 644 645 /// Computes a negated multiply-subtract of 256-bit vectors of [4 x double]. 646 /// For each element, computes <c> -(__A * __B) - __C </c>. 647 /// 648 /// \headerfile <immintrin.h> 649 /// 650 /// This intrinsic corresponds to the \c VFNMSUB213PD instruction. 651 /// 652 /// \param __A 653 /// A 256-bit vector of [4 x double] containing the multiplicand. 654 /// \param __B 655 /// A 256-bit vector of [4 x double] containing the multiplier. 656 /// \param __C 657 /// A 256-bit vector of [4 x double] containing the subtrahend. 658 /// \returns A 256-bit vector of [4 x double] containing the result. 659 static __inline__ __m256d __DEFAULT_FN_ATTRS256 660 _mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C) 661 { 662 return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C); 663 } 664 665 /// Computes a multiply with alternating add/subtract of 256-bit vectors of 666 /// [8 x float]. 667 /// \code 668 /// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0] 669 /// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32] 670 /// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64] 671 /// result[127:96] = (__A[127:96] * __B[127:96]) + __C[127:96] 672 /// result[159:128] = (__A[159:128] * __B[159:128]) - __C[159:128] 673 /// result[191:160] = (__A[191:160] * __B[191:160]) + __C[191:160] 674 /// result[223:192] = (__A[223:192] * __B[223:192]) - __C[223:192] 675 /// result[255:224] = (__A[255:224] * __B[255:224]) + __C[255:224] 676 /// \endcode 677 /// 678 /// \headerfile <immintrin.h> 679 /// 680 /// This intrinsic corresponds to the \c VFMADDSUB213PS instruction. 681 /// 682 /// \param __A 683 /// A 256-bit vector of [8 x float] containing the multiplicand. 684 /// \param __B 685 /// A 256-bit vector of [8 x float] containing the multiplier. 686 /// \param __C 687 /// A 256-bit vector of [8 x float] containing the addend/subtrahend. 688 /// \returns A 256-bit vector of [8 x float] containing the result. 689 static __inline__ __m256 __DEFAULT_FN_ATTRS256 690 _mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C) 691 { 692 return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); 693 } 694 695 /// Computes a multiply with alternating add/subtract of 256-bit vectors of 696 /// [4 x double]. 697 /// \code 698 /// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0] 699 /// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64] 700 /// result[191:128] = (__A[191:128] * __B[191:128]) - __C[191:128] 701 /// result[255:192] = (__A[255:192] * __B[255:192]) + __C[255:192] 702 /// \endcode 703 /// 704 /// \headerfile <immintrin.h> 705 /// 706 /// This intrinsic corresponds to the \c VFMADDSUB213PD instruction. 707 /// 708 /// \param __A 709 /// A 256-bit vector of [4 x double] containing the multiplicand. 710 /// \param __B 711 /// A 256-bit vector of [4 x double] containing the multiplier. 712 /// \param __C 713 /// A 256-bit vector of [4 x double] containing the addend/subtrahend. 714 /// \returns A 256-bit vector of [4 x double] containing the result. 715 static __inline__ __m256d __DEFAULT_FN_ATTRS256 716 _mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C) 717 { 718 return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C); 719 } 720 721 /// Computes a vector multiply with alternating add/subtract of 256-bit 722 /// vectors of [8 x float]. 723 /// \code 724 /// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0] 725 /// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32] 726 /// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64] 727 /// result[127:96] = (__A[127:96] * __B[127:96]) - __C[127:96] 728 /// result[159:128] = (__A[159:128] * __B[159:128]) + __C[159:128] 729 /// result[191:160] = (__A[191:160] * __B[191:160]) - __C[191:160] 730 /// result[223:192] = (__A[223:192] * __B[223:192]) + __C[223:192] 731 /// result[255:224] = (__A[255:224] * __B[255:224]) - __C[255:224] 732 /// \endcode 733 /// 734 /// \headerfile <immintrin.h> 735 /// 736 /// This intrinsic corresponds to the \c VFMSUBADD213PS instruction. 737 /// 738 /// \param __A 739 /// A 256-bit vector of [8 x float] containing the multiplicand. 740 /// \param __B 741 /// A 256-bit vector of [8 x float] containing the multiplier. 742 /// \param __C 743 /// A 256-bit vector of [8 x float] containing the addend/subtrahend. 744 /// \returns A 256-bit vector of [8 x float] containing the result. 745 static __inline__ __m256 __DEFAULT_FN_ATTRS256 746 _mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C) 747 { 748 return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); 749 } 750 751 /// Computes a vector multiply with alternating add/subtract of 256-bit 752 /// vectors of [4 x double]. 753 /// \code 754 /// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0] 755 /// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64] 756 /// result[191:128] = (__A[191:128] * __B[191:128]) + __C[191:128] 757 /// result[255:192] = (__A[255:192] * __B[255:192]) - __C[255:192] 758 /// \endcode 759 /// 760 /// \headerfile <immintrin.h> 761 /// 762 /// This intrinsic corresponds to the \c VFMSUBADD213PD instruction. 763 /// 764 /// \param __A 765 /// A 256-bit vector of [4 x double] containing the multiplicand. 766 /// \param __B 767 /// A 256-bit vector of [4 x double] containing the multiplier. 768 /// \param __C 769 /// A 256-bit vector of [4 x double] containing the addend/subtrahend. 770 /// \returns A 256-bit vector of [4 x double] containing the result. 771 static __inline__ __m256d __DEFAULT_FN_ATTRS256 772 _mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C) 773 { 774 return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C); 775 } 776 777 #undef __DEFAULT_FN_ATTRS128 778 #undef __DEFAULT_FN_ATTRS256 779 780 #endif /* __FMAINTRIN_H */ 781