1 /*===---- fmaintrin.h - FMA intrinsics -------------------------------------=== 2 * 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 * 7 *===-----------------------------------------------------------------------=== 8 */ 9 10 #ifndef __IMMINTRIN_H 11 #error "Never use <fmaintrin.h> directly; include <immintrin.h> instead." 12 #endif 13 14 #ifndef __FMAINTRIN_H 15 #define __FMAINTRIN_H 16 17 /* Define the default attributes for the functions in this file. */ 18 #define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(128))) 19 #define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(256))) 20 21 /// Computes a multiply-add of 128-bit vectors of [4 x float]. 22 /// For each element, computes <c> (__A * __B) + __C </c>. 23 /// 24 /// \headerfile <immintrin.h> 25 /// 26 /// This intrinsic corresponds to the \c VFMADD213PS instruction. 27 /// 28 /// \param __A 29 /// A 128-bit vector of [4 x float] containing the multiplicand. 30 /// \param __B 31 /// A 128-bit vector of [4 x float] containing the multiplier. 32 /// \param __C 33 /// A 128-bit vector of [4 x float] containing the addend. 34 /// \returns A 128-bit vector of [4 x float] containing the result. 35 static __inline__ __m128 __DEFAULT_FN_ATTRS128 36 _mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C) 37 { 38 return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); 39 } 40 41 /// Computes a multiply-add of 128-bit vectors of [2 x double]. 42 /// For each element, computes <c> (__A * __B) + __C </c>. 43 /// 44 /// \headerfile <immintrin.h> 45 /// 46 /// This intrinsic corresponds to the \c VFMADD213PD instruction. 47 /// 48 /// \param __A 49 /// A 128-bit vector of [2 x double] containing the multiplicand. 50 /// \param __B 51 /// A 128-bit vector of [2 x double] containing the multiplier. 52 /// \param __C 53 /// A 128-bit vector of [2 x double] containing the addend. 54 /// \returns A 128-bit [2 x double] vector containing the result. 55 static __inline__ __m128d __DEFAULT_FN_ATTRS128 56 _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C) 57 { 58 return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C); 59 } 60 61 /// Computes a scalar multiply-add of the single-precision values in the 62 /// low 32 bits of 128-bit vectors of [4 x float]. 63 /// 64 /// \code{.operation} 65 /// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0] 66 /// result[127:32] = __A[127:32] 67 /// \endcode 68 /// 69 /// \headerfile <immintrin.h> 70 /// 71 /// This intrinsic corresponds to the \c VFMADD213SS instruction. 72 /// 73 /// \param __A 74 /// A 128-bit vector of [4 x float] containing the multiplicand in the low 75 /// 32 bits. 76 /// \param __B 77 /// A 128-bit vector of [4 x float] containing the multiplier in the low 78 /// 32 bits. 79 /// \param __C 80 /// A 128-bit vector of [4 x float] containing the addend in the low 81 /// 32 bits. 82 /// \returns A 128-bit vector of [4 x float] containing the result in the low 83 /// 32 bits and a copy of \a __A[127:32] in the upper 96 bits. 84 static __inline__ __m128 __DEFAULT_FN_ATTRS128 85 _mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C) 86 { 87 return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); 88 } 89 90 /// Computes a scalar multiply-add of the double-precision values in the 91 /// low 64 bits of 128-bit vectors of [2 x double]. 92 /// 93 /// \code{.operation} 94 /// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0] 95 /// result[127:64] = __A[127:64] 96 /// \endcode 97 /// 98 /// \headerfile <immintrin.h> 99 /// 100 /// This intrinsic corresponds to the \c VFMADD213SD instruction. 101 /// 102 /// \param __A 103 /// A 128-bit vector of [2 x double] containing the multiplicand in the low 104 /// 64 bits. 105 /// \param __B 106 /// A 128-bit vector of [2 x double] containing the multiplier in the low 107 /// 64 bits. 108 /// \param __C 109 /// A 128-bit vector of [2 x double] containing the addend in the low 110 /// 64 bits. 111 /// \returns A 128-bit vector of [2 x double] containing the result in the low 112 /// 64 bits and a copy of \a __A[127:64] in the upper 64 bits. 113 static __inline__ __m128d __DEFAULT_FN_ATTRS128 114 _mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C) 115 { 116 return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, (__v2df)__C); 117 } 118 119 /// Computes a multiply-subtract of 128-bit vectors of [4 x float]. 120 /// For each element, computes <c> (__A * __B) - __C </c>. 121 /// 122 /// \headerfile <immintrin.h> 123 /// 124 /// This intrinsic corresponds to the \c VFMSUB213PS instruction. 125 /// 126 /// \param __A 127 /// A 128-bit vector of [4 x float] containing the multiplicand. 128 /// \param __B 129 /// A 128-bit vector of [4 x float] containing the multiplier. 130 /// \param __C 131 /// A 128-bit vector of [4 x float] containing the subtrahend. 132 /// \returns A 128-bit vector of [4 x float] containing the result. 133 static __inline__ __m128 __DEFAULT_FN_ATTRS128 134 _mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C) 135 { 136 return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); 137 } 138 139 /// Computes a multiply-subtract of 128-bit vectors of [2 x double]. 140 /// For each element, computes <c> (__A * __B) - __C </c>. 141 /// 142 /// \headerfile <immintrin.h> 143 /// 144 /// This intrinsic corresponds to the \c VFMSUB213PD instruction. 145 /// 146 /// \param __A 147 /// A 128-bit vector of [2 x double] containing the multiplicand. 148 /// \param __B 149 /// A 128-bit vector of [2 x double] containing the multiplier. 150 /// \param __C 151 /// A 128-bit vector of [2 x double] containing the addend. 152 /// \returns A 128-bit vector of [2 x double] containing the result. 153 static __inline__ __m128d __DEFAULT_FN_ATTRS128 154 _mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C) 155 { 156 return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C); 157 } 158 159 /// Computes a scalar multiply-subtract of the single-precision values in 160 /// the low 32 bits of 128-bit vectors of [4 x float]. 161 /// 162 /// \code{.operation} 163 /// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0] 164 /// result[127:32] = __A[127:32] 165 /// \endcode 166 /// 167 /// \headerfile <immintrin.h> 168 /// 169 /// This intrinsic corresponds to the \c VFMSUB213SS instruction. 170 /// 171 /// \param __A 172 /// A 128-bit vector of [4 x float] containing the multiplicand in the low 173 /// 32 bits. 174 /// \param __B 175 /// A 128-bit vector of [4 x float] containing the multiplier in the low 176 /// 32 bits. 177 /// \param __C 178 /// A 128-bit vector of [4 x float] containing the subtrahend in the low 179 /// 32 bits. 180 /// \returns A 128-bit vector of [4 x float] containing the result in the low 181 /// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits. 182 static __inline__ __m128 __DEFAULT_FN_ATTRS128 183 _mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C) 184 { 185 return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); 186 } 187 188 /// Computes a scalar multiply-subtract of the double-precision values in 189 /// the low 64 bits of 128-bit vectors of [2 x double]. 190 /// 191 /// \code{.operation} 192 /// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0] 193 /// result[127:64] = __A[127:64] 194 /// \endcode 195 /// 196 /// \headerfile <immintrin.h> 197 /// 198 /// This intrinsic corresponds to the \c VFMSUB213SD instruction. 199 /// 200 /// \param __A 201 /// A 128-bit vector of [2 x double] containing the multiplicand in the low 202 /// 64 bits. 203 /// \param __B 204 /// A 128-bit vector of [2 x double] containing the multiplier in the low 205 /// 64 bits. 206 /// \param __C 207 /// A 128-bit vector of [2 x double] containing the subtrahend in the low 208 /// 64 bits. 209 /// \returns A 128-bit vector of [2 x double] containing the result in the low 210 /// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits. 211 static __inline__ __m128d __DEFAULT_FN_ATTRS128 212 _mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C) 213 { 214 return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, -(__v2df)__C); 215 } 216 217 /// Computes a negated multiply-add of 128-bit vectors of [4 x float]. 218 /// For each element, computes <c> -(__A * __B) + __C </c>. 219 /// 220 /// \headerfile <immintrin.h> 221 /// 222 /// This intrinsic corresponds to the \c VFNMADD213DPS instruction. 223 /// 224 /// \param __A 225 /// A 128-bit vector of [4 x float] containing the multiplicand. 226 /// \param __B 227 /// A 128-bit vector of [4 x float] containing the multiplier. 228 /// \param __C 229 /// A 128-bit vector of [4 x float] containing the addend. 230 /// \returns A 128-bit [4 x float] vector containing the result. 231 static __inline__ __m128 __DEFAULT_FN_ATTRS128 232 _mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C) 233 { 234 return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C); 235 } 236 237 /// Computes a negated multiply-add of 128-bit vectors of [2 x double]. 238 /// For each element, computes <c> -(__A * __B) + __C </c>. 239 /// 240 /// \headerfile <immintrin.h> 241 /// 242 /// This intrinsic corresponds to the \c VFNMADD213PD instruction. 243 /// 244 /// \param __A 245 /// A 128-bit vector of [2 x double] containing the multiplicand. 246 /// \param __B 247 /// A 128-bit vector of [2 x double] containing the multiplier. 248 /// \param __C 249 /// A 128-bit vector of [2 x double] containing the addend. 250 /// \returns A 128-bit vector of [2 x double] containing the result. 251 static __inline__ __m128d __DEFAULT_FN_ATTRS128 252 _mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C) 253 { 254 return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C); 255 } 256 257 /// Computes a scalar negated multiply-add of the single-precision values in 258 /// the low 32 bits of 128-bit vectors of [4 x float]. 259 /// 260 /// \code{.operation} 261 /// result[31:0] = -(__A[31:0] * __B[31:0]) + __C[31:0] 262 /// result[127:32] = __A[127:32] 263 /// \endcode 264 /// 265 /// \headerfile <immintrin.h> 266 /// 267 /// This intrinsic corresponds to the \c VFNMADD213SS instruction. 268 /// 269 /// \param __A 270 /// A 128-bit vector of [4 x float] containing the multiplicand in the low 271 /// 32 bits. 272 /// \param __B 273 /// A 128-bit vector of [4 x float] containing the multiplier in the low 274 /// 32 bits. 275 /// \param __C 276 /// A 128-bit vector of [4 x float] containing the addend in the low 277 /// 32 bits. 278 /// \returns A 128-bit vector of [4 x float] containing the result in the low 279 /// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits. 280 static __inline__ __m128 __DEFAULT_FN_ATTRS128 281 _mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C) 282 { 283 return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C); 284 } 285 286 /// Computes a scalar negated multiply-add of the double-precision values 287 /// in the low 64 bits of 128-bit vectors of [2 x double]. 288 /// 289 /// \code{.operation} 290 /// result[63:0] = -(__A[63:0] * __B[63:0]) + __C[63:0] 291 /// result[127:64] = __A[127:64] 292 /// \endcode 293 /// 294 /// \headerfile <immintrin.h> 295 /// 296 /// This intrinsic corresponds to the \c VFNMADD213SD instruction. 297 /// 298 /// \param __A 299 /// A 128-bit vector of [2 x double] containing the multiplicand in the low 300 /// 64 bits. 301 /// \param __B 302 /// A 128-bit vector of [2 x double] containing the multiplier in the low 303 /// 64 bits. 304 /// \param __C 305 /// A 128-bit vector of [2 x double] containing the addend in the low 306 /// 64 bits. 307 /// \returns A 128-bit vector of [2 x double] containing the result in the low 308 /// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits. 309 static __inline__ __m128d __DEFAULT_FN_ATTRS128 310 _mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C) 311 { 312 return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, (__v2df)__C); 313 } 314 315 /// Computes a negated multiply-subtract of 128-bit vectors of [4 x float]. 316 /// For each element, computes <c> -(__A * __B) - __C </c>. 317 /// 318 /// \headerfile <immintrin.h> 319 /// 320 /// This intrinsic corresponds to the \c VFNMSUB213PS instruction. 321 /// 322 /// \param __A 323 /// A 128-bit vector of [4 x float] containing the multiplicand. 324 /// \param __B 325 /// A 128-bit vector of [4 x float] containing the multiplier. 326 /// \param __C 327 /// A 128-bit vector of [4 x float] containing the subtrahend. 328 /// \returns A 128-bit vector of [4 x float] containing the result. 329 static __inline__ __m128 __DEFAULT_FN_ATTRS128 330 _mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C) 331 { 332 return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); 333 } 334 335 /// Computes a negated multiply-subtract of 128-bit vectors of [2 x double]. 336 /// For each element, computes <c> -(__A * __B) - __C </c>. 337 /// 338 /// \headerfile <immintrin.h> 339 /// 340 /// This intrinsic corresponds to the \c VFNMSUB213PD instruction. 341 /// 342 /// \param __A 343 /// A 128-bit vector of [2 x double] containing the multiplicand. 344 /// \param __B 345 /// A 128-bit vector of [2 x double] containing the multiplier. 346 /// \param __C 347 /// A 128-bit vector of [2 x double] containing the subtrahend. 348 /// \returns A 128-bit vector of [2 x double] containing the result. 349 static __inline__ __m128d __DEFAULT_FN_ATTRS128 350 _mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C) 351 { 352 return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C); 353 } 354 355 /// Computes a scalar negated multiply-subtract of the single-precision 356 /// values in the low 32 bits of 128-bit vectors of [4 x float]. 357 /// 358 /// \code{.operation} 359 /// result[31:0] = -(__A[31:0] * __B[31:0]) - __C[31:0] 360 /// result[127:32] = __A[127:32] 361 /// \endcode 362 /// 363 /// \headerfile <immintrin.h> 364 /// 365 /// This intrinsic corresponds to the \c VFNMSUB213SS instruction. 366 /// 367 /// \param __A 368 /// A 128-bit vector of [4 x float] containing the multiplicand in the low 369 /// 32 bits. 370 /// \param __B 371 /// A 128-bit vector of [4 x float] containing the multiplier in the low 372 /// 32 bits. 373 /// \param __C 374 /// A 128-bit vector of [4 x float] containing the subtrahend in the low 375 /// 32 bits. 376 /// \returns A 128-bit vector of [4 x float] containing the result in the low 377 /// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits. 378 static __inline__ __m128 __DEFAULT_FN_ATTRS128 379 _mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C) 380 { 381 return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C); 382 } 383 384 /// Computes a scalar negated multiply-subtract of the double-precision 385 /// values in the low 64 bits of 128-bit vectors of [2 x double]. 386 /// 387 /// \code{.operation} 388 /// result[63:0] = -(__A[63:0] * __B[63:0]) - __C[63:0] 389 /// result[127:64] = __A[127:64] 390 /// \endcode 391 /// 392 /// \headerfile <immintrin.h> 393 /// 394 /// This intrinsic corresponds to the \c VFNMSUB213SD instruction. 395 /// 396 /// \param __A 397 /// A 128-bit vector of [2 x double] containing the multiplicand in the low 398 /// 64 bits. 399 /// \param __B 400 /// A 128-bit vector of [2 x double] containing the multiplier in the low 401 /// 64 bits. 402 /// \param __C 403 /// A 128-bit vector of [2 x double] containing the subtrahend in the low 404 /// 64 bits. 405 /// \returns A 128-bit vector of [2 x double] containing the result in the low 406 /// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits. 407 static __inline__ __m128d __DEFAULT_FN_ATTRS128 408 _mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C) 409 { 410 return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, -(__v2df)__C); 411 } 412 413 /// Computes a multiply with alternating add/subtract of 128-bit vectors of 414 /// [4 x float]. 415 /// 416 /// \code{.operation} 417 /// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0] 418 /// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32] 419 /// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64] 420 /// result[127:96] = (__A[127:96] * __B[127:96]) + __C[127:96] 421 /// \endcode 422 /// 423 /// \headerfile <immintrin.h> 424 /// 425 /// This intrinsic corresponds to the \c VFMADDSUB213PS instruction. 426 /// 427 /// \param __A 428 /// A 128-bit vector of [4 x float] containing the multiplicand. 429 /// \param __B 430 /// A 128-bit vector of [4 x float] containing the multiplier. 431 /// \param __C 432 /// A 128-bit vector of [4 x float] containing the addend/subtrahend. 433 /// \returns A 128-bit vector of [4 x float] containing the result. 434 static __inline__ __m128 __DEFAULT_FN_ATTRS128 435 _mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C) 436 { 437 return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); 438 } 439 440 /// Computes a multiply with alternating add/subtract of 128-bit vectors of 441 /// [2 x double]. 442 /// 443 /// \code{.operation} 444 /// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0] 445 /// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64] 446 /// \endcode 447 /// 448 /// \headerfile <immintrin.h> 449 /// 450 /// This intrinsic corresponds to the \c VFMADDSUB213PD instruction. 451 /// 452 /// \param __A 453 /// A 128-bit vector of [2 x double] containing the multiplicand. 454 /// \param __B 455 /// A 128-bit vector of [2 x double] containing the multiplier. 456 /// \param __C 457 /// A 128-bit vector of [2 x double] containing the addend/subtrahend. 458 /// \returns A 128-bit vector of [2 x double] containing the result. 459 static __inline__ __m128d __DEFAULT_FN_ATTRS128 460 _mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C) 461 { 462 return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C); 463 } 464 465 /// Computes a multiply with alternating add/subtract of 128-bit vectors of 466 /// [4 x float]. 467 /// 468 /// \code{.operation} 469 /// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0] 470 /// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32] 471 /// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64] 472 /// result[127:96 = (__A[127:96] * __B[127:96]) - __C[127:96] 473 /// \endcode 474 /// 475 /// \headerfile <immintrin.h> 476 /// 477 /// This intrinsic corresponds to the \c VFMSUBADD213PS instruction. 478 /// 479 /// \param __A 480 /// A 128-bit vector of [4 x float] containing the multiplicand. 481 /// \param __B 482 /// A 128-bit vector of [4 x float] containing the multiplier. 483 /// \param __C 484 /// A 128-bit vector of [4 x float] containing the addend/subtrahend. 485 /// \returns A 128-bit vector of [4 x float] containing the result. 486 static __inline__ __m128 __DEFAULT_FN_ATTRS128 487 _mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C) 488 { 489 return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); 490 } 491 492 /// Computes a multiply with alternating add/subtract of 128-bit vectors of 493 /// [2 x double]. 494 /// 495 /// \code{.operation} 496 /// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0] 497 /// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64] 498 /// \endcode 499 /// 500 /// \headerfile <immintrin.h> 501 /// 502 /// This intrinsic corresponds to the \c VFMADDSUB213PD instruction. 503 /// 504 /// \param __A 505 /// A 128-bit vector of [2 x double] containing the multiplicand. 506 /// \param __B 507 /// A 128-bit vector of [2 x double] containing the multiplier. 508 /// \param __C 509 /// A 128-bit vector of [2 x double] containing the addend/subtrahend. 510 /// \returns A 128-bit vector of [2 x double] containing the result. 511 static __inline__ __m128d __DEFAULT_FN_ATTRS128 512 _mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C) 513 { 514 return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C); 515 } 516 517 /// Computes a multiply-add of 256-bit vectors of [8 x float]. 518 /// For each element, computes <c> (__A * __B) + __C </c>. 519 /// 520 /// \headerfile <immintrin.h> 521 /// 522 /// This intrinsic corresponds to the \c VFMADD213PS instruction. 523 /// 524 /// \param __A 525 /// A 256-bit vector of [8 x float] containing the multiplicand. 526 /// \param __B 527 /// A 256-bit vector of [8 x float] containing the multiplier. 528 /// \param __C 529 /// A 256-bit vector of [8 x float] containing the addend. 530 /// \returns A 256-bit vector of [8 x float] containing the result. 531 static __inline__ __m256 __DEFAULT_FN_ATTRS256 532 _mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C) 533 { 534 return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); 535 } 536 537 /// Computes a multiply-add of 256-bit vectors of [4 x double]. 538 /// For each element, computes <c> (__A * __B) + __C </c>. 539 /// 540 /// \headerfile <immintrin.h> 541 /// 542 /// This intrinsic corresponds to the \c VFMADD213PD instruction. 543 /// 544 /// \param __A 545 /// A 256-bit vector of [4 x double] containing the multiplicand. 546 /// \param __B 547 /// A 256-bit vector of [4 x double] containing the multiplier. 548 /// \param __C 549 /// A 256-bit vector of [4 x double] containing the addend. 550 /// \returns A 256-bit vector of [4 x double] containing the result. 551 static __inline__ __m256d __DEFAULT_FN_ATTRS256 552 _mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C) 553 { 554 return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C); 555 } 556 557 /// Computes a multiply-subtract of 256-bit vectors of [8 x float]. 558 /// For each element, computes <c> (__A * __B) - __C </c>. 559 /// 560 /// \headerfile <immintrin.h> 561 /// 562 /// This intrinsic corresponds to the \c VFMSUB213PS instruction. 563 /// 564 /// \param __A 565 /// A 256-bit vector of [8 x float] containing the multiplicand. 566 /// \param __B 567 /// A 256-bit vector of [8 x float] containing the multiplier. 568 /// \param __C 569 /// A 256-bit vector of [8 x float] containing the subtrahend. 570 /// \returns A 256-bit vector of [8 x float] containing the result. 571 static __inline__ __m256 __DEFAULT_FN_ATTRS256 572 _mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C) 573 { 574 return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); 575 } 576 577 /// Computes a multiply-subtract of 256-bit vectors of [4 x double]. 578 /// For each element, computes <c> (__A * __B) - __C </c>. 579 /// 580 /// \headerfile <immintrin.h> 581 /// 582 /// This intrinsic corresponds to the \c VFMSUB213PD instruction. 583 /// 584 /// \param __A 585 /// A 256-bit vector of [4 x double] containing the multiplicand. 586 /// \param __B 587 /// A 256-bit vector of [4 x double] containing the multiplier. 588 /// \param __C 589 /// A 256-bit vector of [4 x double] containing the subtrahend. 590 /// \returns A 256-bit vector of [4 x double] containing the result. 591 static __inline__ __m256d __DEFAULT_FN_ATTRS256 592 _mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C) 593 { 594 return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C); 595 } 596 597 /// Computes a negated multiply-add of 256-bit vectors of [8 x float]. 598 /// For each element, computes <c> -(__A * __B) + __C </c>. 599 /// 600 /// \headerfile <immintrin.h> 601 /// 602 /// This intrinsic corresponds to the \c VFNMADD213PS instruction. 603 /// 604 /// \param __A 605 /// A 256-bit vector of [8 x float] containing the multiplicand. 606 /// \param __B 607 /// A 256-bit vector of [8 x float] containing the multiplier. 608 /// \param __C 609 /// A 256-bit vector of [8 x float] containing the addend. 610 /// \returns A 256-bit vector of [8 x float] containing the result. 611 static __inline__ __m256 __DEFAULT_FN_ATTRS256 612 _mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C) 613 { 614 return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C); 615 } 616 617 /// Computes a negated multiply-add of 256-bit vectors of [4 x double]. 618 /// For each element, computes <c> -(__A * __B) + __C </c>. 619 /// 620 /// \headerfile <immintrin.h> 621 /// 622 /// This intrinsic corresponds to the \c VFNMADD213PD instruction. 623 /// 624 /// \param __A 625 /// A 256-bit vector of [4 x double] containing the multiplicand. 626 /// \param __B 627 /// A 256-bit vector of [4 x double] containing the multiplier. 628 /// \param __C 629 /// A 256-bit vector of [4 x double] containing the addend. 630 /// \returns A 256-bit vector of [4 x double] containing the result. 631 static __inline__ __m256d __DEFAULT_FN_ATTRS256 632 _mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C) 633 { 634 return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C); 635 } 636 637 /// Computes a negated multiply-subtract of 256-bit vectors of [8 x float]. 638 /// For each element, computes <c> -(__A * __B) - __C </c>. 639 /// 640 /// \headerfile <immintrin.h> 641 /// 642 /// This intrinsic corresponds to the \c VFNMSUB213PS instruction. 643 /// 644 /// \param __A 645 /// A 256-bit vector of [8 x float] containing the multiplicand. 646 /// \param __B 647 /// A 256-bit vector of [8 x float] containing the multiplier. 648 /// \param __C 649 /// A 256-bit vector of [8 x float] containing the subtrahend. 650 /// \returns A 256-bit vector of [8 x float] containing the result. 651 static __inline__ __m256 __DEFAULT_FN_ATTRS256 652 _mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C) 653 { 654 return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); 655 } 656 657 /// Computes a negated multiply-subtract of 256-bit vectors of [4 x double]. 658 /// For each element, computes <c> -(__A * __B) - __C </c>. 659 /// 660 /// \headerfile <immintrin.h> 661 /// 662 /// This intrinsic corresponds to the \c VFNMSUB213PD instruction. 663 /// 664 /// \param __A 665 /// A 256-bit vector of [4 x double] containing the multiplicand. 666 /// \param __B 667 /// A 256-bit vector of [4 x double] containing the multiplier. 668 /// \param __C 669 /// A 256-bit vector of [4 x double] containing the subtrahend. 670 /// \returns A 256-bit vector of [4 x double] containing the result. 671 static __inline__ __m256d __DEFAULT_FN_ATTRS256 672 _mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C) 673 { 674 return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C); 675 } 676 677 /// Computes a multiply with alternating add/subtract of 256-bit vectors of 678 /// [8 x float]. 679 /// 680 /// \code{.operation} 681 /// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0] 682 /// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32] 683 /// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64] 684 /// result[127:96] = (__A[127:96] * __B[127:96]) + __C[127:96] 685 /// result[159:128] = (__A[159:128] * __B[159:128]) - __C[159:128] 686 /// result[191:160] = (__A[191:160] * __B[191:160]) + __C[191:160] 687 /// result[223:192] = (__A[223:192] * __B[223:192]) - __C[223:192] 688 /// result[255:224] = (__A[255:224] * __B[255:224]) + __C[255:224] 689 /// \endcode 690 /// 691 /// \headerfile <immintrin.h> 692 /// 693 /// This intrinsic corresponds to the \c VFMADDSUB213PS instruction. 694 /// 695 /// \param __A 696 /// A 256-bit vector of [8 x float] containing the multiplicand. 697 /// \param __B 698 /// A 256-bit vector of [8 x float] containing the multiplier. 699 /// \param __C 700 /// A 256-bit vector of [8 x float] containing the addend/subtrahend. 701 /// \returns A 256-bit vector of [8 x float] containing the result. 702 static __inline__ __m256 __DEFAULT_FN_ATTRS256 703 _mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C) 704 { 705 return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); 706 } 707 708 /// Computes a multiply with alternating add/subtract of 256-bit vectors of 709 /// [4 x double]. 710 /// 711 /// \code{.operation} 712 /// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0] 713 /// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64] 714 /// result[191:128] = (__A[191:128] * __B[191:128]) - __C[191:128] 715 /// result[255:192] = (__A[255:192] * __B[255:192]) + __C[255:192] 716 /// \endcode 717 /// 718 /// \headerfile <immintrin.h> 719 /// 720 /// This intrinsic corresponds to the \c VFMADDSUB213PD instruction. 721 /// 722 /// \param __A 723 /// A 256-bit vector of [4 x double] containing the multiplicand. 724 /// \param __B 725 /// A 256-bit vector of [4 x double] containing the multiplier. 726 /// \param __C 727 /// A 256-bit vector of [4 x double] containing the addend/subtrahend. 728 /// \returns A 256-bit vector of [4 x double] containing the result. 729 static __inline__ __m256d __DEFAULT_FN_ATTRS256 730 _mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C) 731 { 732 return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C); 733 } 734 735 /// Computes a vector multiply with alternating add/subtract of 256-bit 736 /// vectors of [8 x float]. 737 /// 738 /// \code{.operation} 739 /// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0] 740 /// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32] 741 /// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64] 742 /// result[127:96] = (__A[127:96] * __B[127:96]) - __C[127:96] 743 /// result[159:128] = (__A[159:128] * __B[159:128]) + __C[159:128] 744 /// result[191:160] = (__A[191:160] * __B[191:160]) - __C[191:160] 745 /// result[223:192] = (__A[223:192] * __B[223:192]) + __C[223:192] 746 /// result[255:224] = (__A[255:224] * __B[255:224]) - __C[255:224] 747 /// \endcode 748 /// 749 /// \headerfile <immintrin.h> 750 /// 751 /// This intrinsic corresponds to the \c VFMSUBADD213PS instruction. 752 /// 753 /// \param __A 754 /// A 256-bit vector of [8 x float] containing the multiplicand. 755 /// \param __B 756 /// A 256-bit vector of [8 x float] containing the multiplier. 757 /// \param __C 758 /// A 256-bit vector of [8 x float] containing the addend/subtrahend. 759 /// \returns A 256-bit vector of [8 x float] containing the result. 760 static __inline__ __m256 __DEFAULT_FN_ATTRS256 761 _mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C) 762 { 763 return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); 764 } 765 766 /// Computes a vector multiply with alternating add/subtract of 256-bit 767 /// vectors of [4 x double]. 768 /// 769 /// \code{.operation} 770 /// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0] 771 /// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64] 772 /// result[191:128] = (__A[191:128] * __B[191:128]) + __C[191:128] 773 /// result[255:192] = (__A[255:192] * __B[255:192]) - __C[255:192] 774 /// \endcode 775 /// 776 /// \headerfile <immintrin.h> 777 /// 778 /// This intrinsic corresponds to the \c VFMSUBADD213PD instruction. 779 /// 780 /// \param __A 781 /// A 256-bit vector of [4 x double] containing the multiplicand. 782 /// \param __B 783 /// A 256-bit vector of [4 x double] containing the multiplier. 784 /// \param __C 785 /// A 256-bit vector of [4 x double] containing the addend/subtrahend. 786 /// \returns A 256-bit vector of [4 x double] containing the result. 787 static __inline__ __m256d __DEFAULT_FN_ATTRS256 788 _mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C) 789 { 790 return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C); 791 } 792 793 #undef __DEFAULT_FN_ATTRS128 794 #undef __DEFAULT_FN_ATTRS256 795 796 #endif /* __FMAINTRIN_H */ 797