xref: /freebsd/contrib/llvm-project/clang/lib/Headers/avx2intrin.h (revision 78cd75393ec79565c63927bf200f06f839a1dc05)
1 /*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 
10 #ifndef __IMMINTRIN_H
11 #error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
12 #endif
13 
14 #ifndef __AVX2INTRIN_H
15 #define __AVX2INTRIN_H
16 
/* Attribute sets shared by the intrinsics in this file. Both request forced
   inlining, no debug info and the "avx2" target feature; they differ only in
   the minimum vector width they declare (256 or 128 bits). */
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx2"),          \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx2"),          \
                 __min_vector_width__(128)))
20 
21 /* SSE4 Multiple Packed Sums of Absolute Difference.  */
22 /// Computes sixteen sum of absolute difference (SAD) operations on sets of
23 ///    four unsigned 8-bit integers from the 256-bit integer vectors \a X and
24 ///    \a Y.
25 ///
26 ///    Eight SAD results are computed using the lower half of the input
27 ///    vectors, and another eight using the upper half. These 16-bit values
28 ///    are returned in the lower and upper halves of the 256-bit result,
29 ///    respectively.
30 ///
31 ///    A single SAD operation selects four bytes from \a X and four bytes from
32 ///    \a Y as input. It computes the differences between each \a X byte and
33 ///    the corresponding \a Y byte, takes the absolute value of each
34 ///    difference, and sums these four values to form one 16-bit result. The
35 ///    intrinsic computes 16 of these results with different sets of input
36 ///    bytes.
37 ///
38 ///    For each set of eight results, the SAD operations use the same four
39 ///    bytes from \a Y, starting at bit \a M[1:0] times 32 (lower half) or
40 ///    \a M[4:3] times 32 (upper half). The eight operations use successive
41 ///    sets of four bytes from \a X, the first set starting at bit \a M[2]
42 ///    times 32 (lower half) or \a M[5] times 32 (upper half). These positions
43 ///    are all relative to the 128-bit lane for each set of eight operations.
44 ///
45 /// \code{.operation}
46 /// r := 0
47 /// FOR i := 0 TO 1
48 ///   j := i*3
49 ///   Ybase := M[j+1:j]*32 + i*128
50 ///   Xbase := M[j+2]*32 + i*128
51 ///   FOR k := 0 TO 7
52 ///     temp0 := ABS(X[Xbase+7:Xbase] - Y[Ybase+7:Ybase])
53 ///     temp1 := ABS(X[Xbase+15:Xbase+8] - Y[Ybase+15:Ybase+8])
54 ///     temp2 := ABS(X[Xbase+23:Xbase+16] - Y[Ybase+23:Ybase+16])
55 ///     temp3 := ABS(X[Xbase+31:Xbase+24] - Y[Ybase+31:Ybase+24])
56 ///     result[r+15:r] := temp0 + temp1 + temp2 + temp3
57 ///     Xbase := Xbase + 8
58 ///     r := r + 16
59 ///   ENDFOR
60 /// ENDFOR
61 /// \endcode
62 ///
63 /// \headerfile <immintrin.h>
64 ///
65 /// \code
66 /// __m256i _mm256_mpsadbw_epu8(__m256i X, __m256i Y, const int M);
67 /// \endcode
68 ///
69 /// This intrinsic corresponds to the \c VMPSADBW instruction.
70 ///
71 /// \param X
72 ///    A 256-bit integer vector containing one of the inputs.
73 /// \param Y
74 ///    A 256-bit integer vector containing one of the inputs.
75 /// \param M
76 ///     An unsigned immediate value specifying the starting positions of the
77 ///     bytes to operate on.
78 /// \returns A 256-bit vector of [16 x i16] containing the result.
#define _mm256_mpsadbw_epu8(X, Y, M)                                           \
  ((__m256i)__builtin_ia32_mpsadbw256(                                         \
      /* both vector operands are passed to the builtin as 32 bytes */        \
      (__v32qi)(__m256i)(X), (__v32qi)(__m256i)(Y),                            \
      /* position selector, forwarded as an int */                            \
      (int)(M)))
82 
83 /// Computes the absolute value of each signed byte in the 256-bit integer
84 ///    vector \a __a and returns each value in the corresponding byte of
85 ///    the result.
86 ///
87 /// \headerfile <immintrin.h>
88 ///
89 /// This intrinsic corresponds to the \c VPABSB instruction.
90 ///
91 /// \param __a
92 ///    A 256-bit integer vector.
93 /// \returns A 256-bit integer vector containing the result.
94 static __inline__ __m256i __DEFAULT_FN_ATTRS256
95 _mm256_abs_epi8(__m256i __a)
96 {
97     return (__m256i)__builtin_elementwise_abs((__v32qs)__a);
98 }
99 
100 /// Computes the absolute value of each signed 16-bit element in the 256-bit
101 ///    vector of [16 x i16] in \a __a and returns each value in the
102 ///    corresponding element of the result.
103 ///
104 /// \headerfile <immintrin.h>
105 ///
106 /// This intrinsic corresponds to the \c VPABSW instruction.
107 ///
108 /// \param __a
109 ///    A 256-bit vector of [16 x i16].
110 /// \returns A 256-bit vector of [16 x i16] containing the result.
111 static __inline__ __m256i __DEFAULT_FN_ATTRS256
112 _mm256_abs_epi16(__m256i __a)
113 {
114     return (__m256i)__builtin_elementwise_abs((__v16hi)__a);
115 }
116 
117 /// Computes the absolute value of each signed 32-bit element in the 256-bit
118 ///    vector of [8 x i32] in \a __a and returns each value in the
119 ///    corresponding element of the result.
120 ///
121 /// \headerfile <immintrin.h>
122 ///
123 /// This intrinsic corresponds to the \c VPABSD instruction.
124 ///
125 /// \param __a
126 ///    A 256-bit vector of [8 x i32].
127 /// \returns A 256-bit vector of [8 x i32] containing the result.
128 static __inline__ __m256i __DEFAULT_FN_ATTRS256
129 _mm256_abs_epi32(__m256i __a)
130 {
131     return (__m256i)__builtin_elementwise_abs((__v8si)__a);
132 }
133 
134 /// Converts the elements of two 256-bit vectors of [16 x i16] to 8-bit
135 ///    integers using signed saturation, and returns the 256-bit result.
136 ///
137 /// \code{.operation}
138 /// FOR i := 0 TO 7
139 ///   j := i*16
140 ///   k := i*8
141 ///   result[7+k:k] := SATURATE8(__a[15+j:j])
142 ///   result[71+k:64+k] := SATURATE8(__b[15+j:j])
143 ///   result[135+k:128+k] := SATURATE8(__a[143+j:128+j])
144 ///   result[199+k:192+k] := SATURATE8(__b[143+j:128+j])
145 /// ENDFOR
146 /// \endcode
147 ///
148 /// \headerfile <immintrin.h>
149 ///
150 /// This intrinsic corresponds to the \c VPACKSSWB instruction.
151 ///
152 /// \param __a
153 ///    A 256-bit vector of [16 x i16] used to generate result[63:0] and
154 ///    result[191:128].
155 /// \param __b
156 ///    A 256-bit vector of [16 x i16] used to generate result[127:64] and
157 ///    result[255:192].
158 /// \returns A 256-bit integer vector containing the result.
159 static __inline__ __m256i __DEFAULT_FN_ATTRS256
160 _mm256_packs_epi16(__m256i __a, __m256i __b)
161 {
162   return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b);
163 }
164 
165 /// Converts the elements of two 256-bit vectors of [8 x i32] to 16-bit
166 ///    integers using signed saturation, and returns the resulting 256-bit
167 ///    vector of [16 x i16].
168 ///
169 /// \code{.operation}
170 /// FOR i := 0 TO 3
171 ///   j := i*32
172 ///   k := i*16
173 ///   result[15+k:k] := SATURATE16(__a[31+j:j])
174 ///   result[79+k:64+k] := SATURATE16(__b[31+j:j])
175 ///   result[143+k:128+k] := SATURATE16(__a[159+j:128+j])
176 ///   result[207+k:192+k] := SATURATE16(__b[159+j:128+j])
177 /// ENDFOR
178 /// \endcode
179 ///
180 /// \headerfile <immintrin.h>
181 ///
182 /// This intrinsic corresponds to the \c VPACKSSDW instruction.
183 ///
184 /// \param __a
185 ///    A 256-bit vector of [8 x i32] used to generate result[63:0] and
186 ///    result[191:128].
187 /// \param __b
188 ///    A 256-bit vector of [8 x i32] used to generate result[127:64] and
189 ///    result[255:192].
190 /// \returns A 256-bit vector of [16 x i16] containing the result.
191 static __inline__ __m256i __DEFAULT_FN_ATTRS256
192 _mm256_packs_epi32(__m256i __a, __m256i __b)
193 {
194   return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b);
195 }
196 
197 /// Converts elements from two 256-bit vectors of [16 x i16] to 8-bit integers
198 ///    using unsigned saturation, and returns the 256-bit result.
199 ///
200 /// \code{.operation}
201 /// FOR i := 0 TO 7
202 ///   j := i*16
203 ///   k := i*8
204 ///   result[7+k:k] := SATURATE8U(__a[15+j:j])
205 ///   result[71+k:64+k] := SATURATE8U(__b[15+j:j])
206 ///   result[135+k:128+k] := SATURATE8U(__a[143+j:128+j])
207 ///   result[199+k:192+k] := SATURATE8U(__b[143+j:128+j])
208 /// ENDFOR
209 /// \endcode
210 ///
211 /// \headerfile <immintrin.h>
212 ///
213 /// This intrinsic corresponds to the \c VPACKUSWB instruction.
214 ///
215 /// \param __a
216 ///    A 256-bit vector of [16 x i16] used to generate result[63:0] and
217 ///    result[191:128].
218 /// \param __b
219 ///    A 256-bit vector of [16 x i16] used to generate result[127:64] and
220 ///    result[255:192].
221 /// \returns A 256-bit integer vector containing the result.
222 static __inline__ __m256i __DEFAULT_FN_ATTRS256
223 _mm256_packus_epi16(__m256i __a, __m256i __b)
224 {
225   return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b);
226 }
227 
228 /// Converts elements from two 256-bit vectors of [8 x i32] to 16-bit integers
229 ///    using unsigned saturation, and returns the resulting 256-bit vector of
230 ///    [16 x i16].
231 ///
232 /// \code{.operation}
233 /// FOR i := 0 TO 3
234 ///   j := i*32
235 ///   k := i*16
236 ///   result[15+k:k] := SATURATE16U(__V1[31+j:j])
237 ///   result[79+k:64+k] := SATURATE16U(__V2[31+j:j])
238 ///   result[143+k:128+k] := SATURATE16U(__V1[159+j:128+j])
239 ///   result[207+k:192+k] := SATURATE16U(__V2[159+j:128+j])
240 /// ENDFOR
241 /// \endcode
242 ///
243 /// \headerfile <immintrin.h>
244 ///
245 /// This intrinsic corresponds to the \c VPACKUSDW instruction.
246 ///
247 /// \param __V1
248 ///    A 256-bit vector of [8 x i32] used to generate result[63:0] and
249 ///    result[191:128].
250 /// \param __V2
251 ///    A 256-bit vector of [8 x i32] used to generate result[127:64] and
252 ///    result[255:192].
253 /// \returns A 256-bit vector of [16 x i16] containing the result.
254 static __inline__ __m256i __DEFAULT_FN_ATTRS256
255 _mm256_packus_epi32(__m256i __V1, __m256i __V2)
256 {
257   return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2);
258 }
259 
260 /// Adds 8-bit integers from corresponding bytes of two 256-bit integer
261 ///    vectors and returns the lower 8 bits of each sum in the corresponding
262 ///    byte of the 256-bit integer vector result (overflow is ignored).
263 ///
264 /// \headerfile <immintrin.h>
265 ///
266 /// This intrinsic corresponds to the \c VPADDB instruction.
267 ///
268 /// \param __a
269 ///    A 256-bit integer vector containing one of the source operands.
270 /// \param __b
271 ///    A 256-bit integer vector containing one of the source operands.
272 /// \returns A 256-bit integer vector containing the sums.
273 static __inline__ __m256i __DEFAULT_FN_ATTRS256
274 _mm256_add_epi8(__m256i __a, __m256i __b)
275 {
276   return (__m256i)((__v32qu)__a + (__v32qu)__b);
277 }
278 
279 /// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
280 ///    [16 x i16] and returns the lower 16 bits of each sum in the
281 ///    corresponding element of the [16 x i16] result (overflow is ignored).
282 ///
283 /// \headerfile <immintrin.h>
284 ///
285 /// This intrinsic corresponds to the \c VPADDW instruction.
286 ///
287 /// \param __a
288 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
289 /// \param __b
290 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
291 /// \returns A 256-bit vector of [16 x i16] containing the sums.
292 static __inline__ __m256i __DEFAULT_FN_ATTRS256
293 _mm256_add_epi16(__m256i __a, __m256i __b)
294 {
295   return (__m256i)((__v16hu)__a + (__v16hu)__b);
296 }
297 
298 /// Adds 32-bit integers from corresponding elements of two 256-bit vectors of
299 ///    [8 x i32] and returns the lower 32 bits of each sum in the corresponding
300 ///    element of the [8 x i32] result (overflow is ignored).
301 ///
302 /// \headerfile <immintrin.h>
303 ///
304 /// This intrinsic corresponds to the \c VPADDD instruction.
305 ///
306 /// \param __a
307 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
308 /// \param __b
309 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
310 /// \returns A 256-bit vector of [8 x i32] containing the sums.
311 static __inline__ __m256i __DEFAULT_FN_ATTRS256
312 _mm256_add_epi32(__m256i __a, __m256i __b)
313 {
314   return (__m256i)((__v8su)__a + (__v8su)__b);
315 }
316 
317 /// Adds 64-bit integers from corresponding elements of two 256-bit vectors of
318 ///    [4 x i64] and returns the lower 64 bits of each sum in the corresponding
319 ///    element of the [4 x i64] result (overflow is ignored).
320 ///
321 /// \headerfile <immintrin.h>
322 ///
323 /// This intrinsic corresponds to the \c VPADDQ instruction.
324 ///
325 /// \param __a
326 ///    A 256-bit vector of [4 x i64] containing one of the source operands.
327 /// \param __b
328 ///    A 256-bit vector of [4 x i64] containing one of the source operands.
329 /// \returns A 256-bit vector of [4 x i64] containing the sums.
330 static __inline__ __m256i __DEFAULT_FN_ATTRS256
331 _mm256_add_epi64(__m256i __a, __m256i __b)
332 {
333   return (__m256i)((__v4du)__a + (__v4du)__b);
334 }
335 
336 /// Adds 8-bit integers from corresponding bytes of two 256-bit integer
337 ///    vectors using signed saturation, and returns each sum in the
338 ///    corresponding byte of the 256-bit integer vector result.
339 ///
340 /// \headerfile <immintrin.h>
341 ///
342 /// This intrinsic corresponds to the \c VPADDSB instruction.
343 ///
344 /// \param __a
345 ///    A 256-bit integer vector containing one of the source operands.
346 /// \param __b
347 ///    A 256-bit integer vector containing one of the source operands.
348 /// \returns A 256-bit integer vector containing the sums.
349 static __inline__ __m256i __DEFAULT_FN_ATTRS256
350 _mm256_adds_epi8(__m256i __a, __m256i __b)
351 {
352   return (__m256i)__builtin_elementwise_add_sat((__v32qs)__a, (__v32qs)__b);
353 }
354 
355 /// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
356 ///    [16 x i16] using signed saturation, and returns the [16 x i16] result.
357 ///
358 /// \headerfile <immintrin.h>
359 ///
360 /// This intrinsic corresponds to the \c VPADDSW instruction.
361 ///
362 /// \param __a
363 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
364 /// \param __b
365 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
366 /// \returns A 256-bit vector of [16 x i16] containing the sums.
367 static __inline__ __m256i __DEFAULT_FN_ATTRS256
368 _mm256_adds_epi16(__m256i __a, __m256i __b)
369 {
370   return (__m256i)__builtin_elementwise_add_sat((__v16hi)__a, (__v16hi)__b);
371 }
372 
373 /// Adds 8-bit integers from corresponding bytes of two 256-bit integer
374 ///    vectors using unsigned saturation, and returns each sum in the
375 ///    corresponding byte of the 256-bit integer vector result.
376 ///
377 /// \headerfile <immintrin.h>
378 ///
379 /// This intrinsic corresponds to the \c VPADDUSB instruction.
380 ///
381 /// \param __a
382 ///    A 256-bit integer vector containing one of the source operands.
383 /// \param __b
384 ///    A 256-bit integer vector containing one of the source operands.
385 /// \returns A 256-bit integer vector containing the sums.
386 static __inline__ __m256i __DEFAULT_FN_ATTRS256
387 _mm256_adds_epu8(__m256i __a, __m256i __b)
388 {
389   return (__m256i)__builtin_elementwise_add_sat((__v32qu)__a, (__v32qu)__b);
390 }
391 
392 /// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
393 ///    [16 x i16] using unsigned saturation, and returns the [16 x i16] result.
394 ///
395 /// \headerfile <immintrin.h>
396 ///
397 /// This intrinsic corresponds to the \c VPADDUSW instruction.
398 ///
399 /// \param __a
400 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
401 /// \param __b
402 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
403 /// \returns A 256-bit vector of [16 x i16] containing the sums.
404 static __inline__ __m256i __DEFAULT_FN_ATTRS256
405 _mm256_adds_epu16(__m256i __a, __m256i __b)
406 {
407   return (__m256i)__builtin_elementwise_add_sat((__v16hu)__a, (__v16hu)__b);
408 }
409 
410 /// Uses the lower half of the 256-bit vector \a a as the upper half of a
411 ///    temporary 256-bit value, and the lower half of the 256-bit vector \a b
412 ///    as the lower half of the temporary value. Right-shifts the temporary
413 ///    value by \a n bytes, and uses the lower 16 bytes of the shifted value
414 ///    as the lower 16 bytes of the result. Uses the upper halves of \a a and
415 ///    \a b to make another temporary value, right shifts by \a n, and uses
416 ///    the lower 16 bytes of the shifted value as the upper 16 bytes of the
417 ///    result.
418 ///
419 /// \headerfile <immintrin.h>
420 ///
421 /// \code
422 /// __m256i _mm256_alignr_epi8(__m256i a, __m256i b, const int n);
423 /// \endcode
424 ///
425 /// This intrinsic corresponds to the \c VPALIGNR instruction.
426 ///
427 /// \param a
428 ///    A 256-bit integer vector containing source values.
429 /// \param b
430 ///    A 256-bit integer vector containing source values.
431 /// \param n
432 ///    An immediate value specifying the number of bytes to shift.
433 /// \returns A 256-bit integer vector containing the result.
#define _mm256_alignr_epi8(a, b, n)                                            \
  ((__m256i)__builtin_ia32_palignr256(                                         \
      /* both vector operands are passed to the builtin as 32 bytes */        \
      (__v32qi)(__m256i)(a), (__v32qi)(__m256i)(b),                            \
      /* byte shift count, forwarded without a cast */                        \
      (n)))
437 
438 /// Computes the bitwise AND of the 256-bit integer vectors in \a __a and
439 ///    \a __b.
440 ///
441 /// \headerfile <immintrin.h>
442 ///
443 /// This intrinsic corresponds to the \c VPAND instruction.
444 ///
445 /// \param __a
446 ///    A 256-bit integer vector.
447 /// \param __b
448 ///    A 256-bit integer vector.
449 /// \returns A 256-bit integer vector containing the result.
450 static __inline__ __m256i __DEFAULT_FN_ATTRS256
451 _mm256_and_si256(__m256i __a, __m256i __b)
452 {
453   return (__m256i)((__v4du)__a & (__v4du)__b);
454 }
455 
456 /// Computes the bitwise AND of the 256-bit integer vector in \a __b with
457 ///    the bitwise NOT of the 256-bit integer vector in \a __a.
458 ///
459 /// \headerfile <immintrin.h>
460 ///
461 /// This intrinsic corresponds to the \c VPANDN instruction.
462 ///
463 /// \param __a
464 ///    A 256-bit integer vector.
465 /// \param __b
466 ///    A 256-bit integer vector.
467 /// \returns A 256-bit integer vector containing the result.
468 static __inline__ __m256i __DEFAULT_FN_ATTRS256
469 _mm256_andnot_si256(__m256i __a, __m256i __b)
470 {
471   return (__m256i)(~(__v4du)__a & (__v4du)__b);
472 }
473 
474 /// Computes the averages of the corresponding unsigned bytes in the two
475 ///    256-bit integer vectors in \a __a and \a __b and returns each
476 ///    average in the corresponding byte of the 256-bit result.
477 ///
478 /// \code{.operation}
479 /// FOR i := 0 TO 31
480 ///   j := i*8
481 ///   result[j+7:j] := (__a[j+7:j] + __b[j+7:j] + 1) >> 1
482 /// ENDFOR
483 /// \endcode
484 ///
485 /// \headerfile <immintrin.h>
486 ///
487 /// This intrinsic corresponds to the \c VPAVGB instruction.
488 ///
489 /// \param __a
490 ///    A 256-bit integer vector.
491 /// \param __b
492 ///    A 256-bit integer vector.
493 /// \returns A 256-bit integer vector containing the result.
494 static __inline__ __m256i __DEFAULT_FN_ATTRS256
495 _mm256_avg_epu8(__m256i __a, __m256i __b)
496 {
497   return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b);
498 }
499 
500 /// Computes the averages of the corresponding unsigned 16-bit integers in
501 ///    the two 256-bit vectors of [16 x i16] in \a __a and \a __b and returns
502 ///    each average in the corresponding element of the 256-bit result.
503 ///
504 /// \code{.operation}
505 /// FOR i := 0 TO 15
506 ///   j := i*16
507 ///   result[j+15:j] := (__a[j+15:j] + __b[j+15:j] + 1) >> 1
508 /// ENDFOR
509 /// \endcode
510 ///
511 /// \headerfile <immintrin.h>
512 ///
513 /// This intrinsic corresponds to the \c VPAVGW instruction.
514 ///
515 /// \param __a
516 ///    A 256-bit vector of [16 x i16].
517 /// \param __b
518 ///    A 256-bit vector of [16 x i16].
519 /// \returns A 256-bit vector of [16 x i16] containing the result.
520 static __inline__ __m256i __DEFAULT_FN_ATTRS256
521 _mm256_avg_epu16(__m256i __a, __m256i __b)
522 {
523   return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b);
524 }
525 
526 /// Merges 8-bit integer values from either of the two 256-bit vectors
527 ///    \a __V1 or \a __V2, as specified by the 256-bit mask \a __M and returns
528 ///    the resulting 256-bit integer vector.
529 ///
530 /// \code{.operation}
531 /// FOR i := 0 TO 31
532 ///   j := i*8
533 ///   IF __M[7+j] == 0
534 ///     result[7+j:j] := __V1[7+j:j]
535 ///   ELSE
536 ///     result[7+j:j] := __V2[7+j:j]
537 ///   FI
538 /// ENDFOR
539 /// \endcode
540 ///
541 /// \headerfile <immintrin.h>
542 ///
543 /// This intrinsic corresponds to the \c VPBLENDVB instruction.
544 ///
545 /// \param __V1
546 ///    A 256-bit integer vector containing source values.
547 /// \param __V2
548 ///    A 256-bit integer vector containing source values.
549 /// \param __M
550 ///    A 256-bit integer vector, with bit [7] of each byte specifying the
551 ///    source for each corresponding byte of the result. When the mask bit
552 ///    is 0, the byte is copied from \a __V1; otherwise, it is copied from
553 ///    \a __V2.
554 /// \returns A 256-bit integer vector containing the result.
555 static __inline__ __m256i __DEFAULT_FN_ATTRS256
556 _mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
557 {
558   return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2,
559                                               (__v32qi)__M);
560 }
561 
562 /// Merges 16-bit integer values from either of the two 256-bit vectors
563 ///    \a V1 or \a V2, as specified by the immediate integer operand \a M,
564 ///    and returns the resulting 256-bit vector of [16 x i16].
565 ///
566 /// \code{.operation}
567 /// FOR i := 0 TO 7
568 ///   j := i*16
569 ///   IF M[i] == 0
570 ///     result[15+j:j] := V1[15+j:j]
571 ///     result[143+j:128+j] := V1[143+j:128+j]
572 ///   ELSE
573 ///     result[15+j:j] := V2[15+j:j]
574 ///     result[143+j:128+j] := V2[143+j:128+j]
575 ///   FI
576 /// ENDFOR
577 /// \endcode
578 ///
579 /// \headerfile <immintrin.h>
580 ///
581 /// \code
582 /// __m256i _mm256_blend_epi16(__m256i V1, __m256i V2, const int M);
583 /// \endcode
584 ///
585 /// This intrinsic corresponds to the \c VPBLENDW instruction.
586 ///
587 /// \param V1
588 ///    A 256-bit vector of [16 x i16] containing source values.
589 /// \param V2
590 ///    A 256-bit vector of [16 x i16] containing source values.
591 /// \param M
592 ///    An immediate 8-bit integer operand, with bits [7:0] specifying the
593 ///    source for each element of the result. The position of the mask bit
594 ///    corresponds to the index of a copied value. When a mask bit is 0, the
595 ///    element is copied from \a V1; otherwise, it is copied from \a V2.
596 ///    \a M[0] determines the source for elements 0 and 8, \a M[1] for
597 ///    elements 1 and 9, and so forth.
598 /// \returns A 256-bit vector of [16 x i16] containing the result.
#define _mm256_blend_epi16(V1, V2, M)                                          \
  ((__m256i)__builtin_ia32_pblendw256(                                         \
      /* both vector operands are passed to the builtin as [16 x i16] */      \
      (__v16hi)(__m256i)(V1), (__v16hi)(__m256i)(V2),                          \
      /* selection mask, forwarded as an int */                               \
      (int)(M)))
602 
603 /// Compares corresponding bytes in the 256-bit integer vectors in \a __a and
604 ///    \a __b for equality and returns the outcomes in the corresponding
605 ///    bytes of the 256-bit result.
606 ///
607 /// \code{.operation}
608 /// FOR i := 0 TO 31
609 ///   j := i*8
610 ///   result[j+7:j] := (__a[j+7:j] == __b[j+7:j]) ? 0xFF : 0
611 /// ENDFOR
612 /// \endcode
613 ///
614 /// \headerfile <immintrin.h>
615 ///
616 /// This intrinsic corresponds to the \c VPCMPEQB instruction.
617 ///
618 /// \param __a
619 ///    A 256-bit integer vector containing one of the inputs.
620 /// \param __b
621 ///    A 256-bit integer vector containing one of the inputs.
622 /// \returns A 256-bit integer vector containing the result.
623 static __inline__ __m256i __DEFAULT_FN_ATTRS256
624 _mm256_cmpeq_epi8(__m256i __a, __m256i __b)
625 {
626   return (__m256i)((__v32qi)__a == (__v32qi)__b);
627 }
628 
629 /// Compares corresponding elements in the 256-bit vectors of [16 x i16] in
630 ///    \a __a and \a __b for equality and returns the outcomes in the
631 ///    corresponding elements of the 256-bit result.
632 ///
633 /// \code{.operation}
634 /// FOR i := 0 TO 15
635 ///   j := i*16
636 ///   result[j+15:j] := (__a[j+15:j] == __b[j+15:j]) ? 0xFFFF : 0
637 /// ENDFOR
638 /// \endcode
639 ///
640 /// \headerfile <immintrin.h>
641 ///
642 /// This intrinsic corresponds to the \c VPCMPEQW instruction.
643 ///
644 /// \param __a
645 ///    A 256-bit vector of [16 x i16] containing one of the inputs.
646 /// \param __b
647 ///    A 256-bit vector of [16 x i16] containing one of the inputs.
648 /// \returns A 256-bit vector of [16 x i16] containing the result.
649 static __inline__ __m256i __DEFAULT_FN_ATTRS256
650 _mm256_cmpeq_epi16(__m256i __a, __m256i __b)
651 {
652   return (__m256i)((__v16hi)__a == (__v16hi)__b);
653 }
654 
655 /// Compares corresponding elements in the 256-bit vectors of [8 x i32] in
656 ///    \a __a and \a __b for equality and returns the outcomes in the
657 ///    corresponding elements of the 256-bit result.
658 ///
659 /// \code{.operation}
660 /// FOR i := 0 TO 7
661 ///   j := i*32
662 ///   result[j+31:j] := (__a[j+31:j] == __b[j+31:j]) ? 0xFFFFFFFF : 0
663 /// ENDFOR
664 /// \endcode
665 ///
666 /// \headerfile <immintrin.h>
667 ///
668 /// This intrinsic corresponds to the \c VPCMPEQD instruction.
669 ///
670 /// \param __a
671 ///    A 256-bit vector of [8 x i32] containing one of the inputs.
672 /// \param __b
673 ///    A 256-bit vector of [8 x i32] containing one of the inputs.
674 /// \returns A 256-bit vector of [8 x i32] containing the result.
675 static __inline__ __m256i __DEFAULT_FN_ATTRS256
676 _mm256_cmpeq_epi32(__m256i __a, __m256i __b)
677 {
678   return (__m256i)((__v8si)__a == (__v8si)__b);
679 }
680 
681 /// Compares corresponding elements in the 256-bit vectors of [4 x i64] in
682 ///    \a __a and \a __b for equality and returns the outcomes in the
683 ///    corresponding elements of the 256-bit result.
684 ///
685 /// \code{.operation}
686 /// FOR i := 0 TO 3
687 ///   j := i*64
688 ///   result[j+63:j] := (__a[j+63:j] == __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
689 /// ENDFOR
690 /// \endcode
691 ///
692 /// \headerfile <immintrin.h>
693 ///
694 /// This intrinsic corresponds to the \c VPCMPEQQ instruction.
695 ///
696 /// \param __a
697 ///    A 256-bit vector of [4 x i64] containing one of the inputs.
698 /// \param __b
699 ///    A 256-bit vector of [4 x i64] containing one of the inputs.
700 /// \returns A 256-bit vector of [4 x i64] containing the result.
701 static __inline__ __m256i __DEFAULT_FN_ATTRS256
702 _mm256_cmpeq_epi64(__m256i __a, __m256i __b)
703 {
704   return (__m256i)((__v4di)__a == (__v4di)__b);
705 }
706 
707 /// Compares corresponding signed bytes in the 256-bit integer vectors in
708 ///    \a __a and \a __b for greater-than and returns the outcomes in the
709 ///    corresponding bytes of the 256-bit result.
710 ///
711 /// \code{.operation}
712 /// FOR i := 0 TO 31
713 ///   j := i*8
714 ///   result[j+7:j] := (__a[j+7:j] > __b[j+7:j]) ? 0xFF : 0
715 /// ENDFOR
716 /// \endcode
717 ///
718 /// \headerfile <immintrin.h>
719 ///
720 /// This intrinsic corresponds to the \c VPCMPGTB instruction.
721 ///
722 /// \param __a
723 ///    A 256-bit integer vector containing one of the inputs.
724 /// \param __b
725 ///    A 256-bit integer vector containing one of the inputs.
726 /// \returns A 256-bit integer vector containing the result.
727 static __inline__ __m256i __DEFAULT_FN_ATTRS256
728 _mm256_cmpgt_epi8(__m256i __a, __m256i __b)
729 {
730   /* This function always performs a signed comparison, but __v32qi is a char
731      which may be signed or unsigned, so use __v32qs. */
732   return (__m256i)((__v32qs)__a > (__v32qs)__b);
733 }
734 
735 /// Compares corresponding signed elements in the 256-bit vectors of
736 ///    [16 x i16] in \a __a and \a __b for greater-than and returns the
737 ///    outcomes in the corresponding elements of the 256-bit result.
738 ///
739 /// \code{.operation}
740 /// FOR i := 0 TO 15
741 ///   j := i*16
742 ///   result[j+15:j] := (__a[j+15:j] > __b[j+15:j]) ? 0xFFFF : 0
743 /// ENDFOR
744 /// \endcode
745 ///
746 /// \headerfile <immintrin.h>
747 ///
748 /// This intrinsic corresponds to the \c VPCMPGTW instruction.
749 ///
750 /// \param __a
751 ///    A 256-bit vector of [16 x i16] containing one of the inputs.
752 /// \param __b
753 ///    A 256-bit vector of [16 x i16] containing one of the inputs.
754 /// \returns A 256-bit vector of [16 x i16] containing the result.
755 static __inline__ __m256i __DEFAULT_FN_ATTRS256
756 _mm256_cmpgt_epi16(__m256i __a, __m256i __b)
757 {
758   return (__m256i)((__v16hi)__a > (__v16hi)__b);
759 }
760 
761 /// Compares corresponding signed elements in the 256-bit vectors of
762 ///    [8 x i32] in \a __a and \a __b for greater-than and returns the
763 ///    outcomes in the corresponding elements of the 256-bit result.
764 ///
765 /// \code{.operation}
766 /// FOR i := 0 TO 7
767 ///   j := i*32
768 ///   result[j+31:j] := (__a[j+31:j] > __b[j+31:j]) ? 0xFFFFFFFF : 0
769 /// ENDFOR
770 /// \endcode
771 ///
772 /// \headerfile <immintrin.h>
773 ///
774 /// This intrinsic corresponds to the \c VPCMPGTD instruction.
775 ///
776 /// \param __a
777 ///    A 256-bit vector of [8 x i32] containing one of the inputs.
778 /// \param __b
779 ///    A 256-bit vector of [8 x i32] containing one of the inputs.
780 /// \returns A 256-bit vector of [8 x i32] containing the result.
781 static __inline__ __m256i __DEFAULT_FN_ATTRS256
782 _mm256_cmpgt_epi32(__m256i __a, __m256i __b)
783 {
784   return (__m256i)((__v8si)__a > (__v8si)__b);
785 }
786 
787 /// Compares corresponding signed elements in the 256-bit vectors of
788 ///    [4 x i64] in \a __a and \a __b for greater-than and returns the
789 ///    outcomes in the corresponding elements of the 256-bit result.
790 ///
791 /// \code{.operation}
792 /// FOR i := 0 TO 3
793 ///   j := i*64
794 ///   result[j+63:j] := (__a[j+63:j] > __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
795 /// ENDFOR
796 /// \endcode
797 ///
798 /// \headerfile <immintrin.h>
799 ///
800 /// This intrinsic corresponds to the \c VPCMPGTQ instruction.
801 ///
802 /// \param __a
803 ///    A 256-bit vector of [4 x i64] containing one of the inputs.
804 /// \param __b
805 ///    A 256-bit vector of [4 x i64] containing one of the inputs.
806 /// \returns A 256-bit vector of [4 x i64] containing the result.
807 static __inline__ __m256i __DEFAULT_FN_ATTRS256
808 _mm256_cmpgt_epi64(__m256i __a, __m256i __b)
809 {
810   return (__m256i)((__v4di)__a > (__v4di)__b);
811 }
812 
813 /// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
814 ///    vectors of [16 x i16] and returns the lower 16 bits of each sum in an
815 ///    element of the [16 x i16] result (overflow is ignored). Sums from
816 ///    \a __a are returned in the lower 64 bits of each 128-bit half of the
817 ///    result; sums from \a __b are returned in the upper 64 bits of each
818 ///    128-bit half of the result.
819 ///
820 /// \code{.operation}
821 /// FOR i := 0 TO 1
822 ///   j := i*128
823 ///   result[j+15:j] := __a[j+15:j] + __a[j+31:j+16]
824 ///   result[j+31:j+16] := __a[j+47:j+32] + __a[j+63:j+48]
825 ///   result[j+47:j+32] := __a[j+79:j+64] + __a[j+95:j+80]
826 ///   result[j+63:j+48] := __a[j+111:j+96] + __a[j+127:j+112]
827 ///   result[j+79:j+64] := __b[j+15:j] + __b[j+31:j+16]
828 ///   result[j+95:j+80] := __b[j+47:j+32] + __b[j+63:j+48]
829 ///   result[j+111:j+96] := __b[j+79:j+64] + __b[j+95:j+80]
830 ///   result[j+127:j+112] := __b[j+111:j+96] + __b[j+127:j+112]
831 /// ENDFOR
832 /// \endcode
833 ///
834 /// \headerfile <immintrin.h>
835 ///
836 /// This intrinsic corresponds to the \c VPHADDW instruction.
837 ///
838 /// \param __a
839 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
840 /// \param __b
841 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
842 /// \returns A 256-bit vector of [16 x i16] containing the sums.
843 static __inline__ __m256i __DEFAULT_FN_ATTRS256
844 _mm256_hadd_epi16(__m256i __a, __m256i __b)
845 {
846     return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
847 }
848 
849 /// Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit
850 ///    vectors of [8 x i32] and returns the lower 32 bits of each sum in an
851 ///    element of the [8 x i32] result (overflow is ignored). Sums from \a __a
852 ///    are returned in the lower 64 bits of each 128-bit half of the result;
853 ///    sums from \a __b are returned in the upper 64 bits of each 128-bit half
854 ///    of the result.
855 ///
856 /// \code{.operation}
857 /// FOR i := 0 TO 1
858 ///   j := i*128
859 ///   result[j+31:j] := __a[j+31:j] + __a[j+63:j+32]
860 ///   result[j+63:j+32] := __a[j+95:j+64] + __a[j+127:j+96]
861 ///   result[j+95:j+64] := __b[j+31:j] + __b[j+63:j+32]
862 ///   result[j+127:j+96] := __b[j+95:j+64] + __b[j+127:j+96]
863 /// ENDFOR
864 /// \endcode
865 ///
866 /// \headerfile <immintrin.h>
867 ///
868 /// This intrinsic corresponds to the \c VPHADDD instruction.
869 ///
870 /// \param __a
871 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
872 /// \param __b
873 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
874 /// \returns A 256-bit vector of [8 x i32] containing the sums.
875 static __inline__ __m256i __DEFAULT_FN_ATTRS256
876 _mm256_hadd_epi32(__m256i __a, __m256i __b)
877 {
878     return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
879 }
880 
881 /// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
882 ///    vectors of [16 x i16] using signed saturation and returns each sum in
883 ///    an element of the [16 x i16] result. Sums from \a __a are returned in
884 ///    the lower 64 bits of each 128-bit half of the result; sums from \a __b
885 ///    are returned in the upper 64 bits of each 128-bit half of the result.
886 ///
887 /// \code{.operation}
888 /// FOR i := 0 TO 1
889 ///   j := i*128
890 ///   result[j+15:j] := SATURATE16(__a[j+15:j] + __a[j+31:j+16])
891 ///   result[j+31:j+16] := SATURATE16(__a[j+47:j+32] + __a[j+63:j+48])
892 ///   result[j+47:j+32] := SATURATE16(__a[j+79:j+64] + __a[j+95:j+80])
893 ///   result[j+63:j+48] := SATURATE16(__a[j+111:j+96] + __a[j+127:j+112])
894 ///   result[j+79:j+64] := SATURATE16(__b[j+15:j] + __b[j+31:j+16])
895 ///   result[j+95:j+80] := SATURATE16(__b[j+47:j+32] + __b[j+63:j+48])
896 ///   result[j+111:j+96] := SATURATE16(__b[j+79:j+64] + __b[j+95:j+80])
897 ///   result[j+127:j+112] := SATURATE16(__b[j+111:j+96] + __b[j+127:j+112])
898 /// ENDFOR
899 /// \endcode
900 ///
901 /// \headerfile <immintrin.h>
902 ///
903 /// This intrinsic corresponds to the \c VPHADDSW instruction.
904 ///
905 /// \param __a
906 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
907 /// \param __b
908 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
909 /// \returns A 256-bit vector of [16 x i16] containing the sums.
910 static __inline__ __m256i __DEFAULT_FN_ATTRS256
911 _mm256_hadds_epi16(__m256i __a, __m256i __b)
912 {
913     return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
914 }
915 
916 /// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
917 ///    vectors of [16 x i16] and returns the lower 16 bits of each difference
918 ///    in an element of the [16 x i16] result (overflow is ignored).
919 ///    Differences from \a __a are returned in the lower 64 bits of each
920 ///    128-bit half of the result; differences from \a __b are returned in the
921 ///    upper 64 bits of each 128-bit half of the result.
922 ///
923 /// \code{.operation}
924 /// FOR i := 0 TO 1
925 ///   j := i*128
926 ///   result[j+15:j] := __a[j+15:j] - __a[j+31:j+16]
927 ///   result[j+31:j+16] := __a[j+47:j+32] - __a[j+63:j+48]
928 ///   result[j+47:j+32] := __a[j+79:j+64] - __a[j+95:j+80]
929 ///   result[j+63:j+48] := __a[j+111:j+96] - __a[j+127:j+112]
930 ///   result[j+79:j+64] := __b[j+15:j] - __b[j+31:j+16]
931 ///   result[j+95:j+80] := __b[j+47:j+32] - __b[j+63:j+48]
932 ///   result[j+111:j+96] := __b[j+79:j+64] - __b[j+95:j+80]
933 ///   result[j+127:j+112] := __b[j+111:j+96] - __b[j+127:j+112]
934 /// ENDFOR
935 /// \endcode
936 ///
937 /// \headerfile <immintrin.h>
938 ///
939 /// This intrinsic corresponds to the \c VPHSUBW instruction.
940 ///
941 /// \param __a
942 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
943 /// \param __b
944 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
945 /// \returns A 256-bit vector of [16 x i16] containing the differences.
946 static __inline__ __m256i __DEFAULT_FN_ATTRS256
947 _mm256_hsub_epi16(__m256i __a, __m256i __b)
948 {
949     return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
950 }
951 
952 /// Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit
953 ///    vectors of [8 x i32] and returns the lower 32 bits of each difference in
954 ///    an element of the [8 x i32] result (overflow is ignored). Differences
955 ///    from \a __a are returned in the lower 64 bits of each 128-bit half of
956 ///    the result; differences from \a __b are returned in the upper 64 bits
957 ///    of each 128-bit half of the result.
958 ///
959 /// \code{.operation}
960 /// FOR i := 0 TO 1
961 ///   j := i*128
962 ///   result[j+31:j] := __a[j+31:j] - __a[j+63:j+32]
963 ///   result[j+63:j+32] := __a[j+95:j+64] - __a[j+127:j+96]
964 ///   result[j+95:j+64] := __b[j+31:j] - __b[j+63:j+32]
965 ///   result[j+127:j+96] := __b[j+95:j+64] - __b[j+127:j+96]
966 /// ENDFOR
967 /// \endcode
968 ///
969 /// \headerfile <immintrin.h>
970 ///
971 /// This intrinsic corresponds to the \c VPHSUBD instruction.
972 ///
973 /// \param __a
974 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
975 /// \param __b
976 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
977 /// \returns A 256-bit vector of [8 x i32] containing the differences.
978 static __inline__ __m256i __DEFAULT_FN_ATTRS256
979 _mm256_hsub_epi32(__m256i __a, __m256i __b)
980 {
981     return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
982 }
983 
/// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
///    vectors of [16 x i16] using signed saturation and returns each
///    difference in an element of the [16 x i16] result. Differences from
///    \a __a are returned in the lower 64 bits of each 128-bit half of the
///    result; differences from \a __b are returned in the upper 64 bits of
///    each 128-bit half of the result.
990 ///
991 /// \code{.operation}
992 /// FOR i := 0 TO 1
993 ///   j := i*128
994 ///   result[j+15:j] := SATURATE16(__a[j+15:j] - __a[j+31:j+16])
995 ///   result[j+31:j+16] := SATURATE16(__a[j+47:j+32] - __a[j+63:j+48])
996 ///   result[j+47:j+32] := SATURATE16(__a[j+79:j+64] - __a[j+95:j+80])
997 ///   result[j+63:j+48] := SATURATE16(__a[j+111:j+96] - __a[j+127:j+112])
998 ///   result[j+79:j+64] := SATURATE16(__b[j+15:j] - __b[j+31:j+16])
999 ///   result[j+95:j+80] := SATURATE16(__b[j+47:j+32] - __b[j+63:j+48])
1000 ///   result[j+111:j+96] := SATURATE16(__b[j+79:j+64] - __b[j+95:j+80])
1001 ///   result[j+127:j+112] := SATURATE16(__b[j+111:j+96] - __b[j+127:j+112])
1002 /// ENDFOR
1003 /// \endcode
1004 ///
1005 /// \headerfile <immintrin.h>
1006 ///
1007 /// This intrinsic corresponds to the \c VPHSUBSW instruction.
1008 ///
1009 /// \param __a
1010 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1011 /// \param __b
1012 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1013 /// \returns A 256-bit vector of [16 x i16] containing the differences.
1014 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1015 _mm256_hsubs_epi16(__m256i __a, __m256i __b)
1016 {
1017     return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
1018 }
1019 
1020 /// Multiplies each unsigned byte from the 256-bit integer vector in \a __a
1021 ///    with the corresponding signed byte from the 256-bit integer vector in
1022 ///    \a __b, forming signed 16-bit intermediate products. Adds adjacent
1023 ///    pairs of those products using signed saturation to form 16-bit sums
1024 ///    returned as elements of the [16 x i16] result.
1025 ///
1026 /// \code{.operation}
1027 /// FOR i := 0 TO 15
1028 ///   j := i*16
1029 ///   temp1 := __a[j+7:j] * __b[j+7:j]
1030 ///   temp2 := __a[j+15:j+8] * __b[j+15:j+8]
1031 ///   result[j+15:j] := SATURATE16(temp1 + temp2)
1032 /// ENDFOR
1033 /// \endcode
1034 ///
1035 /// \headerfile <immintrin.h>
1036 ///
1037 /// This intrinsic corresponds to the \c VPMADDUBSW instruction.
1038 ///
1039 /// \param __a
1040 ///    A 256-bit vector containing one of the source operands.
1041 /// \param __b
1042 ///    A 256-bit vector containing one of the source operands.
1043 /// \returns A 256-bit vector of [16 x i16] containing the result.
1044 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1045 _mm256_maddubs_epi16(__m256i __a, __m256i __b)
1046 {
1047     return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
1048 }
1049 
1050 /// Multiplies corresponding 16-bit elements of two 256-bit vectors of
1051 ///    [16 x i16], forming 32-bit intermediate products, and adds pairs of
1052 ///    those products to form 32-bit sums returned as elements of the
1053 ///    [8 x i32] result.
1054 ///
1055 ///    There is only one wraparound case: when all four of the 16-bit sources
1056 ///    are \c 0x8000, the result will be \c 0x80000000.
1057 ///
1058 /// \code{.operation}
1059 /// FOR i := 0 TO 7
1060 ///   j := i*32
1061 ///   temp1 := __a[j+15:j] * __b[j+15:j]
1062 ///   temp2 := __a[j+31:j+16] * __b[j+31:j+16]
1063 ///   result[j+31:j] := temp1 + temp2
1064 /// ENDFOR
1065 /// \endcode
1066 ///
1067 /// \headerfile <immintrin.h>
1068 ///
1069 /// This intrinsic corresponds to the \c VPMADDWD instruction.
1070 ///
1071 /// \param __a
1072 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1073 /// \param __b
1074 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1075 /// \returns A 256-bit vector of [8 x i32] containing the result.
1076 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1077 _mm256_madd_epi16(__m256i __a, __m256i __b)
1078 {
1079   return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b);
1080 }
1081 
1082 /// Compares the corresponding signed bytes in the two 256-bit integer vectors
1083 ///     in \a __a and \a __b and returns the larger of each pair in the
1084 ///     corresponding byte of the 256-bit result.
1085 ///
1086 /// \headerfile <immintrin.h>
1087 ///
1088 /// This intrinsic corresponds to the \c VPMAXSB instruction.
1089 ///
1090 /// \param __a
1091 ///    A 256-bit integer vector.
1092 /// \param __b
1093 ///    A 256-bit integer vector.
1094 /// \returns A 256-bit integer vector containing the result.
1095 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1096 _mm256_max_epi8(__m256i __a, __m256i __b)
1097 {
1098   return (__m256i)__builtin_elementwise_max((__v32qs)__a, (__v32qs)__b);
1099 }
1100 
1101 /// Compares the corresponding signed 16-bit integers in the two 256-bit
1102 ///    vectors of [16 x i16] in \a __a and \a __b and returns the larger of
1103 ///    each pair in the corresponding element of the 256-bit result.
1104 ///
1105 /// \headerfile <immintrin.h>
1106 ///
1107 /// This intrinsic corresponds to the \c VPMAXSW instruction.
1108 ///
1109 /// \param __a
1110 ///    A 256-bit vector of [16 x i16].
1111 /// \param __b
1112 ///    A 256-bit vector of [16 x i16].
1113 /// \returns A 256-bit vector of [16 x i16] containing the result.
1114 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1115 _mm256_max_epi16(__m256i __a, __m256i __b)
1116 {
1117   return (__m256i)__builtin_elementwise_max((__v16hi)__a, (__v16hi)__b);
1118 }
1119 
1120 /// Compares the corresponding signed 32-bit integers in the two 256-bit
1121 ///    vectors of [8 x i32] in \a __a and \a __b and returns the larger of
1122 ///    each pair in the corresponding element of the 256-bit result.
1123 ///
1124 /// \headerfile <immintrin.h>
1125 ///
1126 /// This intrinsic corresponds to the \c VPMAXSD instruction.
1127 ///
1128 /// \param __a
1129 ///    A 256-bit vector of [8 x i32].
1130 /// \param __b
1131 ///    A 256-bit vector of [8 x i32].
1132 /// \returns A 256-bit vector of [8 x i32] containing the result.
1133 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1134 _mm256_max_epi32(__m256i __a, __m256i __b)
1135 {
1136   return (__m256i)__builtin_elementwise_max((__v8si)__a, (__v8si)__b);
1137 }
1138 
1139 /// Compares the corresponding unsigned bytes in the two 256-bit integer
1140 ///     vectors in \a __a and \a __b and returns the larger of each pair in
1141 ///     the corresponding byte of the 256-bit result.
1142 ///
1143 /// \headerfile <immintrin.h>
1144 ///
1145 /// This intrinsic corresponds to the \c VPMAXUB instruction.
1146 ///
1147 /// \param __a
1148 ///    A 256-bit integer vector.
1149 /// \param __b
1150 ///    A 256-bit integer vector.
1151 /// \returns A 256-bit integer vector containing the result.
1152 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1153 _mm256_max_epu8(__m256i __a, __m256i __b)
1154 {
1155   return (__m256i)__builtin_elementwise_max((__v32qu)__a, (__v32qu)__b);
1156 }
1157 
1158 /// Compares the corresponding unsigned 16-bit integers in the two 256-bit
1159 ///    vectors of [16 x i16] in \a __a and \a __b and returns the larger of
1160 ///    each pair in the corresponding element of the 256-bit result.
1161 ///
1162 /// \headerfile <immintrin.h>
1163 ///
1164 /// This intrinsic corresponds to the \c VPMAXUW instruction.
1165 ///
1166 /// \param __a
1167 ///    A 256-bit vector of [16 x i16].
1168 /// \param __b
1169 ///    A 256-bit vector of [16 x i16].
1170 /// \returns A 256-bit vector of [16 x i16] containing the result.
1171 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1172 _mm256_max_epu16(__m256i __a, __m256i __b)
1173 {
1174   return (__m256i)__builtin_elementwise_max((__v16hu)__a, (__v16hu)__b);
1175 }
1176 
1177 /// Compares the corresponding unsigned 32-bit integers in the two 256-bit
1178 ///    vectors of [8 x i32] in \a __a and \a __b and returns the larger of
1179 ///    each pair in the corresponding element of the 256-bit result.
1180 ///
1181 /// \headerfile <immintrin.h>
1182 ///
1183 /// This intrinsic corresponds to the \c VPMAXUD instruction.
1184 ///
1185 /// \param __a
1186 ///    A 256-bit vector of [8 x i32].
1187 /// \param __b
1188 ///    A 256-bit vector of [8 x i32].
1189 /// \returns A 256-bit vector of [8 x i32] containing the result.
1190 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1191 _mm256_max_epu32(__m256i __a, __m256i __b)
1192 {
1193   return (__m256i)__builtin_elementwise_max((__v8su)__a, (__v8su)__b);
1194 }
1195 
1196 /// Compares the corresponding signed bytes in the two 256-bit integer vectors
1197 ///     in \a __a and \a __b and returns the smaller of each pair in the
1198 ///     corresponding byte of the 256-bit result.
1199 ///
1200 /// \headerfile <immintrin.h>
1201 ///
1202 /// This intrinsic corresponds to the \c VPMINSB instruction.
1203 ///
1204 /// \param __a
1205 ///    A 256-bit integer vector.
1206 /// \param __b
1207 ///    A 256-bit integer vector.
1208 /// \returns A 256-bit integer vector containing the result.
1209 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1210 _mm256_min_epi8(__m256i __a, __m256i __b)
1211 {
1212   return (__m256i)__builtin_elementwise_min((__v32qs)__a, (__v32qs)__b);
1213 }
1214 
1215 /// Compares the corresponding signed 16-bit integers in the two 256-bit
1216 ///    vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
1217 ///    each pair in the corresponding element of the 256-bit result.
1218 ///
1219 /// \headerfile <immintrin.h>
1220 ///
1221 /// This intrinsic corresponds to the \c VPMINSW instruction.
1222 ///
1223 /// \param __a
1224 ///    A 256-bit vector of [16 x i16].
1225 /// \param __b
1226 ///    A 256-bit vector of [16 x i16].
1227 /// \returns A 256-bit vector of [16 x i16] containing the result.
1228 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1229 _mm256_min_epi16(__m256i __a, __m256i __b)
1230 {
1231   return (__m256i)__builtin_elementwise_min((__v16hi)__a, (__v16hi)__b);
1232 }
1233 
1234 /// Compares the corresponding signed 32-bit integers in the two 256-bit
1235 ///    vectors of [8 x i32] in \a __a and \a __b and returns the smaller of
1236 ///    each pair in the corresponding element of the 256-bit result.
1237 ///
1238 /// \headerfile <immintrin.h>
1239 ///
1240 /// This intrinsic corresponds to the \c VPMINSD instruction.
1241 ///
1242 /// \param __a
1243 ///    A 256-bit vector of [8 x i32].
1244 /// \param __b
1245 ///    A 256-bit vector of [8 x i32].
1246 /// \returns A 256-bit vector of [8 x i32] containing the result.
1247 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1248 _mm256_min_epi32(__m256i __a, __m256i __b)
1249 {
1250   return (__m256i)__builtin_elementwise_min((__v8si)__a, (__v8si)__b);
1251 }
1252 
1253 /// Compares the corresponding unsigned bytes in the two 256-bit integer
1254 ///     vectors in \a __a and \a __b and returns the smaller of each pair in
1255 ///     the corresponding byte of the 256-bit result.
1256 ///
1257 /// \headerfile <immintrin.h>
1258 ///
1259 /// This intrinsic corresponds to the \c VPMINUB instruction.
1260 ///
1261 /// \param __a
1262 ///    A 256-bit integer vector.
1263 /// \param __b
1264 ///    A 256-bit integer vector.
1265 /// \returns A 256-bit integer vector containing the result.
1266 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1267 _mm256_min_epu8(__m256i __a, __m256i __b)
1268 {
1269   return (__m256i)__builtin_elementwise_min((__v32qu)__a, (__v32qu)__b);
1270 }
1271 
1272 /// Compares the corresponding unsigned 16-bit integers in the two 256-bit
1273 ///    vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
1274 ///    each pair in the corresponding element of the 256-bit result.
1275 ///
1276 /// \headerfile <immintrin.h>
1277 ///
1278 /// This intrinsic corresponds to the \c VPMINUW instruction.
1279 ///
1280 /// \param __a
1281 ///    A 256-bit vector of [16 x i16].
1282 /// \param __b
1283 ///    A 256-bit vector of [16 x i16].
1284 /// \returns A 256-bit vector of [16 x i16] containing the result.
1285 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1286 _mm256_min_epu16(__m256i __a, __m256i __b)
1287 {
1288   return (__m256i)__builtin_elementwise_min((__v16hu)__a, (__v16hu)__b);
1289 }
1290 
1291 /// Compares the corresponding unsigned 32-bit integers in the two 256-bit
1292 ///    vectors of [8 x i32] in \a __a and \a __b and returns the smaller of
1293 ///    each pair in the corresponding element of the 256-bit result.
1294 ///
1295 /// \headerfile <immintrin.h>
1296 ///
1297 /// This intrinsic corresponds to the \c VPMINUD instruction.
1298 ///
1299 /// \param __a
1300 ///    A 256-bit vector of [8 x i32].
1301 /// \param __b
1302 ///    A 256-bit vector of [8 x i32].
1303 /// \returns A 256-bit vector of [8 x i32] containing the result.
1304 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1305 _mm256_min_epu32(__m256i __a, __m256i __b)
1306 {
1307   return (__m256i)__builtin_elementwise_min((__v8su)__a, (__v8su)__b);
1308 }
1309 
1310 static __inline__ int __DEFAULT_FN_ATTRS256
1311 _mm256_movemask_epi8(__m256i __a)
1312 {
1313   return __builtin_ia32_pmovmskb256((__v32qi)__a);
1314 }
1315 
1316 /// Sign-extends bytes from the 128-bit integer vector in \a __V and returns
1317 ///    the 16-bit values in the corresponding elements of a 256-bit vector
1318 ///    of [16 x i16].
1319 ///
1320 /// \code{.operation}
1321 /// FOR i := 0 TO 15
1322 ///   j := i*8
1323 ///   k := i*16
1324 ///   result[k+15:k] := SignExtend(__V[j+7:j])
1325 /// ENDFOR
1326 /// \endcode
1327 ///
1328 /// \headerfile <immintrin.h>
1329 ///
1330 /// This intrinsic corresponds to the \c VPMOVSXBW instruction.
1331 ///
1332 /// \param __V
1333 ///    A 128-bit integer vector containing the source bytes.
1334 /// \returns A 256-bit vector of [16 x i16] containing the sign-extended
1335 ///    values.
1336 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1337 _mm256_cvtepi8_epi16(__m128i __V)
1338 {
1339   /* This function always performs a signed extension, but __v16qi is a char
1340      which may be signed or unsigned, so use __v16qs. */
1341   return (__m256i)__builtin_convertvector((__v16qs)__V, __v16hi);
1342 }
1343 
1344 /// Sign-extends bytes from the lower half of the 128-bit integer vector in
1345 ///    \a __V and returns the 32-bit values in the corresponding elements of a
1346 ///    256-bit vector of [8 x i32].
1347 ///
1348 /// \code{.operation}
1349 /// FOR i := 0 TO 7
1350 ///   j := i*8
1351 ///   k := i*32
1352 ///   result[k+31:k] := SignExtend(__V[j+7:j])
1353 /// ENDFOR
1354 /// \endcode
1355 ///
1356 /// \headerfile <immintrin.h>
1357 ///
1358 /// This intrinsic corresponds to the \c VPMOVSXBD instruction.
1359 ///
1360 /// \param __V
1361 ///    A 128-bit integer vector containing the source bytes.
1362 /// \returns A 256-bit vector of [8 x i32] containing the sign-extended
1363 ///    values.
1364 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1365 _mm256_cvtepi8_epi32(__m128i __V)
1366 {
1367   /* This function always performs a signed extension, but __v16qi is a char
1368      which may be signed or unsigned, so use __v16qs. */
1369   return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
1370 }
1371 
1372 /// Sign-extends the first four bytes from the 128-bit integer vector in
1373 ///    \a __V and returns the 64-bit values in the corresponding elements of a
1374 ///    256-bit vector of [4 x i64].
1375 ///
1376 /// \code{.operation}
1377 /// result[63:0] := SignExtend(__V[7:0])
1378 /// result[127:64] := SignExtend(__V[15:8])
1379 /// result[191:128] := SignExtend(__V[23:16])
1380 /// result[255:192] := SignExtend(__V[31:24])
1381 /// \endcode
1382 ///
1383 /// \headerfile <immintrin.h>
1384 ///
1385 /// This intrinsic corresponds to the \c VPMOVSXBQ instruction.
1386 ///
1387 /// \param __V
1388 ///    A 128-bit integer vector containing the source bytes.
1389 /// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1390 ///    values.
1391 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1392 _mm256_cvtepi8_epi64(__m128i __V)
1393 {
1394   /* This function always performs a signed extension, but __v16qi is a char
1395      which may be signed or unsigned, so use __v16qs. */
1396   return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4di);
1397 }
1398 
1399 /// Sign-extends 16-bit elements from the 128-bit vector of [8 x i16] in
1400 ///    \a __V and returns the 32-bit values in the corresponding elements of a
1401 ///    256-bit vector of [8 x i32].
1402 ///
1403 /// \code{.operation}
1404 /// FOR i := 0 TO 7
1405 ///   j := i*16
1406 ///   k := i*32
1407 ///   result[k+31:k] := SignExtend(__V[j+15:j])
1408 /// ENDFOR
1409 /// \endcode
1410 ///
1411 /// \headerfile <immintrin.h>
1412 ///
1413 /// This intrinsic corresponds to the \c VPMOVSXWD instruction.
1414 ///
1415 /// \param __V
1416 ///    A 128-bit vector of [8 x i16] containing the source values.
1417 /// \returns A 256-bit vector of [8 x i32] containing the sign-extended
1418 ///    values.
1419 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1420 _mm256_cvtepi16_epi32(__m128i __V)
1421 {
1422   return (__m256i)__builtin_convertvector((__v8hi)__V, __v8si);
1423 }
1424 
1425 /// Sign-extends 16-bit elements from the lower half of the 128-bit vector of
1426 ///    [8 x i16] in \a __V and returns the 64-bit values in the corresponding
1427 ///    elements of a 256-bit vector of [4 x i64].
1428 ///
1429 /// \code{.operation}
1430 /// result[63:0] := SignExtend(__V[15:0])
1431 /// result[127:64] := SignExtend(__V[31:16])
1432 /// result[191:128] := SignExtend(__V[47:32])
/// result[255:192] := SignExtend(__V[63:48])
1434 /// \endcode
1435 ///
1436 /// \headerfile <immintrin.h>
1437 ///
1438 /// This intrinsic corresponds to the \c VPMOVSXWQ instruction.
1439 ///
1440 /// \param __V
1441 ///    A 128-bit vector of [8 x i16] containing the source values.
1442 /// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1443 ///    values.
1444 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1445 _mm256_cvtepi16_epi64(__m128i __V)
1446 {
1447   return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4di);
1448 }
1449 
1450 /// Sign-extends 32-bit elements from the 128-bit vector of [4 x i32] in
1451 ///    \a __V and returns the 64-bit values in the corresponding elements of a
1452 ///    256-bit vector of [4 x i64].
1453 ///
1454 /// \code{.operation}
1455 /// result[63:0] := SignExtend(__V[31:0])
1456 /// result[127:64] := SignExtend(__V[63:32])
1457 /// result[191:128] := SignExtend(__V[95:64])
1458 /// result[255:192] := SignExtend(__V[127:96])
1459 /// \endcode
1460 ///
1461 /// \headerfile <immintrin.h>
1462 ///
1463 /// This intrinsic corresponds to the \c VPMOVSXDQ instruction.
1464 ///
1465 /// \param __V
1466 ///    A 128-bit vector of [4 x i32] containing the source values.
1467 /// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1468 ///    values.
1469 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1470 _mm256_cvtepi32_epi64(__m128i __V)
1471 {
1472   return (__m256i)__builtin_convertvector((__v4si)__V, __v4di);
1473 }
1474 
1475 /// Zero-extends bytes from the 128-bit integer vector in \a __V and returns
1476 ///    the 16-bit values in the corresponding elements of a 256-bit vector
1477 ///    of [16 x i16].
1478 ///
1479 /// \code{.operation}
1480 /// FOR i := 0 TO 15
1481 ///   j := i*8
1482 ///   k := i*16
1483 ///   result[k+15:k] := ZeroExtend(__V[j+7:j])
1484 /// ENDFOR
1485 /// \endcode
1486 ///
1487 /// \headerfile <immintrin.h>
1488 ///
1489 /// This intrinsic corresponds to the \c VPMOVZXBW instruction.
1490 ///
1491 /// \param __V
1492 ///    A 128-bit integer vector containing the source bytes.
1493 /// \returns A 256-bit vector of [16 x i16] containing the zero-extended
1494 ///    values.
1495 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1496 _mm256_cvtepu8_epi16(__m128i __V)
1497 {
1498   return (__m256i)__builtin_convertvector((__v16qu)__V, __v16hi);
1499 }
1500 
1501 /// Zero-extends bytes from the lower half of the 128-bit integer vector in
1502 ///    \a __V and returns the 32-bit values in the corresponding elements of a
1503 ///    256-bit vector of [8 x i32].
1504 ///
1505 /// \code{.operation}
1506 /// FOR i := 0 TO 7
1507 ///   j := i*8
1508 ///   k := i*32
1509 ///   result[k+31:k] := ZeroExtend(__V[j+7:j])
1510 /// ENDFOR
1511 /// \endcode
1512 ///
1513 /// \headerfile <immintrin.h>
1514 ///
1515 /// This intrinsic corresponds to the \c VPMOVZXBD instruction.
1516 ///
1517 /// \param __V
1518 ///    A 128-bit integer vector containing the source bytes.
1519 /// \returns A 256-bit vector of [8 x i32] containing the zero-extended
1520 ///    values.
1521 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1522 _mm256_cvtepu8_epi32(__m128i __V)
1523 {
1524   return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
1525 }
1526 
1527 /// Zero-extends the first four bytes from the 128-bit integer vector in
1528 ///    \a __V and returns the 64-bit values in the corresponding elements of a
1529 ///    256-bit vector of [4 x i64].
1530 ///
1531 /// \code{.operation}
1532 /// result[63:0] := ZeroExtend(__V[7:0])
1533 /// result[127:64] := ZeroExtend(__V[15:8])
1534 /// result[191:128] := ZeroExtend(__V[23:16])
1535 /// result[255:192] := ZeroExtend(__V[31:24])
1536 /// \endcode
1537 ///
1538 /// \headerfile <immintrin.h>
1539 ///
1540 /// This intrinsic corresponds to the \c VPMOVZXBQ instruction.
1541 ///
1542 /// \param __V
1543 ///    A 128-bit integer vector containing the source bytes.
1544 /// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1545 ///    values.
1546 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1547 _mm256_cvtepu8_epi64(__m128i __V)
1548 {
1549   return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4di);
1550 }
1551 
1552 /// Zero-extends 16-bit elements from the 128-bit vector of [8 x i16] in
1553 ///    \a __V and returns the 32-bit values in the corresponding elements of a
1554 ///    256-bit vector of [8 x i32].
1555 ///
1556 /// \code{.operation}
1557 /// FOR i := 0 TO 7
1558 ///   j := i*16
1559 ///   k := i*32
1560 ///   result[k+31:k] := ZeroExtend(__V[j+15:j])
1561 /// ENDFOR
1562 /// \endcode
1563 ///
1564 /// \headerfile <immintrin.h>
1565 ///
1566 /// This intrinsic corresponds to the \c VPMOVZXWD instruction.
1567 ///
1568 /// \param __V
1569 ///    A 128-bit vector of [8 x i16] containing the source values.
1570 /// \returns A 256-bit vector of [8 x i32] containing the zero-extended
1571 ///    values.
1572 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1573 _mm256_cvtepu16_epi32(__m128i __V)
1574 {
1575   return (__m256i)__builtin_convertvector((__v8hu)__V, __v8si);
1576 }
1577 
1578 /// Zero-extends 16-bit elements from the lower half of the 128-bit vector of
1579 ///    [8 x i16] in \a __V and returns the 64-bit values in the corresponding
1580 ///    elements of a 256-bit vector of [4 x i64].
1581 ///
1582 /// \code{.operation}
1583 /// result[63:0] := ZeroExtend(__V[15:0])
1584 /// result[127:64] := ZeroExtend(__V[31:16])
1585 /// result[191:128] := ZeroExtend(__V[47:32])
/// result[255:192] := ZeroExtend(__V[63:48])
1587 /// \endcode
1588 ///
1589 /// \headerfile <immintrin.h>
1590 ///
/// This intrinsic corresponds to the \c VPMOVZXWQ instruction.
1592 ///
1593 /// \param __V
1594 ///    A 128-bit vector of [8 x i16] containing the source values.
1595 /// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1596 ///    values.
1597 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1598 _mm256_cvtepu16_epi64(__m128i __V)
1599 {
1600   return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4di);
1601 }
1602 
1603 /// Zero-extends 32-bit elements from the 128-bit vector of [4 x i32] in
1604 ///    \a __V and returns the 64-bit values in the corresponding elements of a
1605 ///    256-bit vector of [4 x i64].
1606 ///
1607 /// \code{.operation}
1608 /// result[63:0] := ZeroExtend(__V[31:0])
1609 /// result[127:64] := ZeroExtend(__V[63:32])
1610 /// result[191:128] := ZeroExtend(__V[95:64])
1611 /// result[255:192] := ZeroExtend(__V[127:96])
1612 /// \endcode
1613 ///
1614 /// \headerfile <immintrin.h>
1615 ///
1616 /// This intrinsic corresponds to the \c VPMOVZXDQ instruction.
1617 ///
1618 /// \param __V
1619 ///    A 128-bit vector of [4 x i32] containing the source values.
1620 /// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1621 ///    values.
1622 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1623 _mm256_cvtepu32_epi64(__m128i __V)
1624 {
1625   return (__m256i)__builtin_convertvector((__v4su)__V, __v4di); /* converting from the __v4su view is what zero-extends */
1626 }
1627 
1628 /// Multiplies signed 32-bit integers from even-numbered elements of two
1629 ///    256-bit vectors of [8 x i32] and returns the 64-bit products in the
1630 ///    [4 x i64] result.
1631 ///
1632 /// \code{.operation}
1633 /// result[63:0] := __a[31:0] * __b[31:0]
1634 /// result[127:64] := __a[95:64] * __b[95:64]
1635 /// result[191:128] := __a[159:128] * __b[159:128]
1636 /// result[255:192] := __a[223:192] * __b[223:192]
1637 /// \endcode
1638 ///
1639 /// \headerfile <immintrin.h>
1640 ///
1641 /// This intrinsic corresponds to the \c VPMULDQ instruction.
1642 ///
1643 /// \param __a
1644 ///    A 256-bit vector of [8 x i32]; only elements 0, 2, 4, and 6 are used.
1645 /// \param __b
1646 ///    A 256-bit vector of [8 x i32]; only elements 0, 2, 4, and 6 are used.
1647 /// \returns A 256-bit vector of [4 x i64] containing the products.
1648 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1649 _mm256_mul_epi32(__m256i __a, __m256i __b)
1650 {
1651   return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b);
1652 }
1653 
1654 /// Multiplies signed 16-bit elements of two 256-bit vectors of [16 x i16],
1655 ///    keeps the most significant 18 bits of each 32-bit product, rounds by
1656 ///    adding 1, and returns bits [16:1] of each rounded value in the result.
1657 ///
1658 /// \code{.operation}
1659 /// FOR i := 0 TO 15
1660 ///   j := i*16
1661 ///   temp := ((__a[j+15:j] * __b[j+15:j]) >> 14) + 1
1662 ///   result[j+15:j] := temp[16:1]
1663 /// ENDFOR
1664 /// \endcode
1665 ///
1666 /// \headerfile <immintrin.h>
1667 ///
1668 /// This intrinsic corresponds to the \c VPMULHRSW instruction.
1669 ///
1670 /// \param __a
1671 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1672 /// \param __b
1673 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1674 /// \returns A 256-bit vector of [16 x i16] containing the rounded products.
1675 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1676 _mm256_mulhrs_epi16(__m256i __a, __m256i __b)
1677 {
1678   return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b);
1679 }
1680 
1681 /// Multiplies unsigned 16-bit integer elements of two 256-bit vectors of
1682 ///    [16 x i16], and returns the upper 16 bits of each 32-bit product in the
1683 ///    [16 x i16] result.
1684 ///
1685 /// \headerfile <immintrin.h>
1686 ///
1687 /// This intrinsic corresponds to the \c VPMULHUW instruction.
1688 ///
1689 /// \param __a
1690 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1691 /// \param __b
1692 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1693 /// \returns A 256-bit vector of [16 x i16] containing the upper product halves.
1694 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1695 _mm256_mulhi_epu16(__m256i __a, __m256i __b)
1696 {
1697   return (__m256i)__builtin_ia32_pmulhuw256((__v16hi)__a, (__v16hi)__b);
1698 }
1699 
1700 /// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1701 ///    [16 x i16], and returns the upper 16 bits of each 32-bit product in the
1702 ///    [16 x i16] result.
1703 ///
1704 /// \headerfile <immintrin.h>
1705 ///
1706 /// This intrinsic corresponds to the \c VPMULHW instruction.
1707 ///
1708 /// \param __a
1709 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1710 /// \param __b
1711 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1712 /// \returns A 256-bit vector of [16 x i16] containing the upper product halves.
1713 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1714 _mm256_mulhi_epi16(__m256i __a, __m256i __b)
1715 {
1716   return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b);
1717 }
1718 
1719 /// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1720 ///    [16 x i16], and returns the lower 16 bits of each 32-bit product in the
1721 ///    [16 x i16] result.
1722 ///
1723 /// \headerfile <immintrin.h>
1724 ///
1725 /// This intrinsic corresponds to the \c VPMULLW instruction.
1726 ///
1727 /// \param __a
1728 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1729 /// \param __b
1730 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
1731 /// \returns A 256-bit vector of [16 x i16] containing the lower product halves.
1732 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1733 _mm256_mullo_epi16(__m256i __a, __m256i __b)
1734 {
1735   return (__m256i)((__v16hu)__a * (__v16hu)__b); /* unsigned lanes wrap; the low 16 bits equal those of the signed product */
1736 }
1737 
1738 /// Multiplies signed 32-bit integer elements of two 256-bit vectors of
1739 ///    [8 x i32], and returns the lower 32 bits of each 64-bit product in the
1740 ///    [8 x i32] result.
1741 ///
1742 /// \headerfile <immintrin.h>
1743 ///
1744 /// This intrinsic corresponds to the \c VPMULLD instruction.
1745 ///
1746 /// \param __a
1747 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
1748 /// \param __b
1749 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
1750 /// \returns A 256-bit vector of [8 x i32] containing the lower product halves.
1751 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1752 _mm256_mullo_epi32(__m256i __a, __m256i __b)
1753 {
1754   return (__m256i)((__v8su)__a * (__v8su)__b); /* unsigned lanes wrap; the low 32 bits equal those of the signed product */
1755 }
1756 
1757 /// Multiplies unsigned 32-bit integers from even-numbered elements of two
1758 ///    256-bit vectors of [8 x i32] and returns the 64-bit products in the
1759 ///    [4 x i64] result.
1760 ///
1761 /// \code{.operation}
1762 /// result[63:0] := __a[31:0] * __b[31:0]
1763 /// result[127:64] := __a[95:64] * __b[95:64]
1764 /// result[191:128] := __a[159:128] * __b[159:128]
1765 /// result[255:192] := __a[223:192] * __b[223:192]
1766 /// \endcode
1767 ///
1768 /// \headerfile <immintrin.h>
1769 ///
1770 /// This intrinsic corresponds to the \c VPMULUDQ instruction.
1771 ///
1772 /// \param __a
1773 ///    A 256-bit vector of [8 x i32]; only elements 0, 2, 4, and 6 are used.
1774 /// \param __b
1775 ///    A 256-bit vector of [8 x i32]; only elements 0, 2, 4, and 6 are used.
1776 /// \returns A 256-bit vector of [4 x i64] containing the products.
1777 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1778 _mm256_mul_epu32(__m256i __a, __m256i __b)
1779 {
1780   return (__m256i)__builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b);
1781 }
1782 
1783 /// Computes the bitwise OR of the 256-bit integer vectors in \a __a and
1784 ///    \a __b.
1785 ///
1786 /// \headerfile <immintrin.h>
1787 ///
1788 /// This intrinsic corresponds to the \c VPOR instruction.
1789 ///
1790 /// \param __a
1791 ///    A 256-bit integer vector.
1792 /// \param __b
1793 ///    A 256-bit integer vector.
1794 /// \returns A 256-bit integer vector containing the bitwise OR of the inputs.
1795 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1796 _mm256_or_si256(__m256i __a, __m256i __b)
1797 {
1798   return (__m256i)((__v4du)__a | (__v4du)__b); /* element width does not affect a bitwise OR */
1799 }
1800 
1801 /// Computes four sum of absolute difference (SAD) operations on sets of eight
1802 ///    unsigned 8-bit integers from the 256-bit integer vectors \a __a and
1803 ///    \a __b.
1804 ///
1805 ///    One SAD result is computed for each set of eight bytes from \a __a and
1806 ///    eight bytes from \a __b. The zero-extended SAD value is returned in the
1807 ///    corresponding 64-bit element of the result.
1808 ///
1809 ///    A single SAD operation takes the differences between the corresponding
1810 ///    bytes of \a __a and \a __b, takes the absolute value of each difference,
1811 ///    and sums these eight values to form one 16-bit result. This operation
1812 ///    is repeated four times with successive sets of eight bytes.
1813 ///
1814 /// \code{.operation}
1815 /// FOR i := 0 TO 3
1816 ///   j := i*64
1817 ///   temp0 := ABS(__a[j+7:j] - __b[j+7:j])
1818 ///   temp1 := ABS(__a[j+15:j+8] - __b[j+15:j+8])
1819 ///   temp2 := ABS(__a[j+23:j+16] - __b[j+23:j+16])
1820 ///   temp3 := ABS(__a[j+31:j+24] - __b[j+31:j+24])
1821 ///   temp4 := ABS(__a[j+39:j+32] - __b[j+39:j+32])
1822 ///   temp5 := ABS(__a[j+47:j+40] - __b[j+47:j+40])
1823 ///   temp6 := ABS(__a[j+55:j+48] - __b[j+55:j+48])
1824 ///   temp7 := ABS(__a[j+63:j+56] - __b[j+63:j+56])
1825 ///   result[j+15:j] := temp0 + temp1 + temp2 + temp3 +
1826 ///                     temp4 + temp5 + temp6 + temp7
1827 ///   result[j+63:j+16] := 0
1828 /// ENDFOR
1829 /// \endcode
1830 ///
1831 /// \headerfile <immintrin.h>
1832 ///
1833 /// This intrinsic corresponds to the \c VPSADBW instruction.
1834 ///
1835 /// \param __a
1836 ///    A 256-bit integer vector.
1837 /// \param __b
1838 ///    A 256-bit integer vector.
1839 /// \returns A 256-bit integer vector containing the result.
1840 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1841 _mm256_sad_epu8(__m256i __a, __m256i __b)
1842 {
1843   return (__m256i)__builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b);
1844 }
1845 
1846 /// Shuffles 8-bit integers in the 256-bit integer vector \a __a according
1847 ///    to control information in the 256-bit integer vector \a __b, and
1848 ///    returns the 256-bit result. In effect there are two separate 128-bit
1849 ///    shuffles in the lower and upper halves.
1850 ///
1851 /// \code{.operation}
1852 /// FOR i := 0 TO 31
1853 ///   j := i*8
1854 ///   IF __b[j+7] == 1
1855 ///     result[j+7:j] := 0
1856 ///   ELSE
1857 ///     k := __b[j+3:j] * 8
1858 ///     IF i > 15
1859 ///       k := k + 128
1860 ///     FI
1861 ///     result[j+7:j] := __a[k+7:k]
1862 ///   FI
1863 /// ENDFOR
1864 /// \endcode
1865 ///
1866 /// \headerfile <immintrin.h>
1867 ///
1868 /// This intrinsic corresponds to the \c VPSHUFB instruction.
1869 ///
1870 /// \param __a
1871 ///    A 256-bit integer vector containing source values.
1872 /// \param __b
1873 ///    A 256-bit integer vector containing control information to determine
1874 ///    what goes into the corresponding byte of the result. If bit 7 of the
1875 ///    control byte is 1, the result byte is 0; otherwise, bits 3:0 of the
1876 ///    control byte specify the index (within the same 128-bit half) of \a __a
1877 ///    to copy to the result byte. Bits 6:4 of the control byte are not used.
1878 /// \returns A 256-bit integer vector containing the result.
1879 static __inline__ __m256i __DEFAULT_FN_ATTRS256
1880 _mm256_shuffle_epi8(__m256i __a, __m256i __b)
1881 {
1882   return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b);
1883 }
1884 
1885 /// Shuffles 32-bit integers from the 256-bit vector of [8 x i32] in \a a
1886 ///    according to control information in the integer literal \a imm, and
1887 ///    returns the 256-bit result. In effect there are two parallel 128-bit
1888 ///    shuffles in the lower and upper halves.
1889 ///
1890 /// \code{.operation}
1891 /// FOR i := 0 TO 3
1892 ///   j := i*32
1893 ///   k := (imm >> i*2)[1:0] * 32
1894 ///   result[j+31:j] := a[k+31:k]
1895 ///   result[128+j+31:128+j] := a[128+k+31:128+k]
1896 /// ENDFOR
1897 /// \endcode
1898 ///
1899 /// \headerfile <immintrin.h>
1900 ///
1901 /// \code
1902 /// __m256i _mm256_shuffle_epi32(__m256i a, const int imm);
1903 /// \endcode
1904 ///
1905 /// This intrinsic corresponds to the \c VPSHUFD instruction.
1906 ///
1907 /// \param a
1908 ///    A 256-bit vector of [8 x i32] containing source values.
1909 /// \param imm
1910 ///    An immediate 8-bit value specifying which elements to copy from \a a.
1911 ///    \a imm[1:0] specifies the index in \a a for elements 0 and 4 of the
1912 ///    result, \a imm[3:2] specifies the index for elements 1 and 5, and so
1913 ///    forth.
1914 /// \returns A 256-bit vector of [8 x i32] containing the result.
1915 #define _mm256_shuffle_epi32(a, imm) \
1916   ((__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm)))
1917 
1918 /// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in \a a
1919 ///    according to control information in the integer literal \a imm, and
1920 ///    returns the 256-bit result. The upper 64 bits of each 128-bit half
1921 ///    are shuffled in parallel; the lower 64 bits of each 128-bit half are
1922 ///    copied from \a a unchanged.
1923 ///
1924 /// \code{.operation}
1925 /// result[63:0] := a[63:0]
1926 /// result[191:128] := a[191:128]
1927 /// FOR i := 0 TO 3
1928 ///   j := i * 16 + 64
1929 ///   k := (imm >> i*2)[1:0] * 16 + 64
1930 ///   result[j+15:j] := a[k+15:k]
1931 ///   result[128+j+15:128+j] := a[128+k+15:128+k]
1932 /// ENDFOR
1933 /// \endcode
1934 ///
1935 /// \headerfile <immintrin.h>
1936 ///
1937 /// \code
1938 /// __m256i _mm256_shufflehi_epi16(__m256i a, const int imm);
1939 /// \endcode
1940 ///
1941 /// This intrinsic corresponds to the \c VPSHUFHW instruction.
1942 ///
1943 /// \param a
1944 ///    A 256-bit vector of [16 x i16] containing source values.
1945 /// \param imm
1946 ///    An immediate 8-bit value specifying which elements to copy from \a a.
1947 ///    \a imm[1:0] specifies the index in \a a for elements 4 and 12 of the
1948 ///    result, \a imm[3:2] specifies the index for elements 5 and 13, and so
1949 ///    forth. Indexes are offset by 4 within the same 128-bit half.
1950 /// \returns A 256-bit vector of [16 x i16] containing the result.
1951 #define _mm256_shufflehi_epi16(a, imm) \
1952   ((__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm)))
1953 
1954 /// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in \a a
1955 ///    according to control information in the integer literal \a imm, and
1956 ///    returns the 256-bit [16 x i16] result. The lower 64 bits of each
1957 ///    128-bit half are shuffled; the upper 64 bits of each 128-bit half are
1958 ///    copied from \a a unchanged.
1959 ///
1960 /// \code{.operation}
1961 /// result[127:64] := a[127:64]
1962 /// result[255:192] := a[255:192]
1963 /// FOR i := 0 TO 3
1964 ///   j := i * 16
1965 ///   k := (imm >> i*2)[1:0] * 16
1966 ///   result[j+15:j] := a[k+15:k]
1967 ///   result[128+j+15:128+j] := a[128+k+15:128+k]
1968 /// ENDFOR
1969 /// \endcode
1970 ///
1971 /// \headerfile <immintrin.h>
1972 ///
1973 /// \code
1974 /// __m256i _mm256_shufflelo_epi16(__m256i a, const int imm);
1975 /// \endcode
1976 ///
1977 /// This intrinsic corresponds to the \c VPSHUFLW instruction.
1978 ///
1979 /// \param a
1980 ///    A 256-bit vector of [16 x i16] to use as a source of data for the
1981 ///    result.
1982 /// \param imm
1983 ///    An immediate 8-bit value specifying which elements to copy from \a a.
1984 ///    \a imm[1:0] specifies the index in \a a for elements 0 and 8 of the
1985 ///    result, \a imm[3:2] specifies the index for elements 1 and 9, and so
1986 ///    forth.
1987 /// \returns A 256-bit vector of [16 x i16] containing the result.
1988 #define _mm256_shufflelo_epi16(a, imm) \
1989   ((__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm)))
1990 
1991 /// Sets each byte of the result to the corresponding byte of the 256-bit
1992 ///    integer vector in \a __a, the negative of that byte, or zero, depending
1993 ///    on whether the corresponding byte of the 256-bit integer vector in
1994 ///    \a __b is greater than zero, less than zero, or equal to zero,
1995 ///    respectively.
1996 ///
1997 /// \headerfile <immintrin.h>
1998 ///
1999 /// This intrinsic corresponds to the \c VPSIGNB instruction.
2000 ///
2001 /// \param __a
2002 ///    A 256-bit integer vector.
2003 /// \param __b
2004 ///    A 256-bit integer vector.
2005 /// \returns A 256-bit integer vector containing the result.
2006 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2007 _mm256_sign_epi8(__m256i __a, __m256i __b)
2008 {
2009   return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b);
2010 }
2011 
2012 /// Sets each element of the result to the corresponding element of the
2013 ///    256-bit vector of [16 x i16] in \a __a, the negative of that element,
2014 ///    or zero, depending on whether the corresponding element of the 256-bit
2015 ///    vector of [16 x i16] in \a __b is greater than zero, less than zero, or
2016 ///    equal to zero, respectively.
2017 ///
2018 /// \headerfile <immintrin.h>
2019 ///
2020 /// This intrinsic corresponds to the \c VPSIGNW instruction.
2021 ///
2022 /// \param __a
2023 ///    A 256-bit vector of [16 x i16].
2024 /// \param __b
2025 ///    A 256-bit vector of [16 x i16].
2026 /// \returns A 256-bit vector of [16 x i16] containing the result.
2027 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2028 _mm256_sign_epi16(__m256i __a, __m256i __b)
2029 {
2030   return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b);
2031 }
2032 
2033 /// Sets each element of the result to the corresponding element of the
2034 ///    256-bit vector of [8 x i32] in \a __a, the negative of that element, or
2035 ///    zero, depending on whether the corresponding element of the 256-bit
2036 ///    vector of [8 x i32] in \a __b is greater than zero, less than zero, or
2037 ///    equal to zero, respectively.
2038 ///
2039 /// \headerfile <immintrin.h>
2040 ///
2041 /// This intrinsic corresponds to the \c VPSIGND instruction.
2042 ///
2043 /// \param __a
2044 ///    A 256-bit vector of [8 x i32].
2045 /// \param __b
2046 ///    A 256-bit vector of [8 x i32].
2047 /// \returns A 256-bit vector of [8 x i32] containing the result.
2048 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2049 _mm256_sign_epi32(__m256i __a, __m256i __b)
2050 {
2051   return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b);
2052 }
2053 
2054 /// Shifts each 128-bit half of the 256-bit integer vector \a a left by
2055 ///    \a imm bytes, shifting in zero bytes, and returns the result. If \a imm
2056 ///    is greater than 15, the returned result is all zeroes.
2057 ///
2058 /// \headerfile <immintrin.h>
2059 ///
2060 /// \code
2061 /// __m256i _mm256_slli_si256(__m256i a, const int imm);
2062 /// \endcode
2063 ///
2064 /// This intrinsic corresponds to the \c VPSLLDQ instruction.
2065 ///
2066 /// \param a
2067 ///    A 256-bit integer vector to be shifted.
2068 /// \param imm
2069 ///    An unsigned immediate value specifying the shift count (in bytes).
2070 /// \returns A 256-bit integer vector containing the result.
2071 #define _mm256_slli_si256(a, imm) \
2072   ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))
2073 
2074 /// Shifts each 128-bit half of the 256-bit integer vector \a a left by \a imm
2075 ///    bytes, shifting in zeroes; the result is all zeroes if \a imm exceeds 15.
2076 ///    Expands to the same builtin call as \c _mm256_slli_si256.
2077 ///
2078 /// \headerfile <immintrin.h>
2079 ///
2080 /// \code
2081 /// __m256i _mm256_bslli_epi128(__m256i a, const int imm);
2082 /// \endcode
2083 ///
2084 /// This intrinsic corresponds to the \c VPSLLDQ instruction.
2085 ///
2086 /// \param a
2087 ///    A 256-bit integer vector to be shifted.
2088 /// \param imm
2089 ///    An unsigned immediate value specifying the shift count (in bytes).
2090 /// \returns A 256-bit integer vector containing the result.
2091 #define _mm256_bslli_epi128(a, imm) \
2092   ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))
2093 
2094 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2095 ///    left by \a __count bits, shifting in zero bits, and returns the result.
2096 ///    If \a __count is greater than 15, the returned result is all zeroes.
2097 ///
2098 /// \headerfile <immintrin.h>
2099 ///
2100 /// This intrinsic corresponds to the \c VPSLLW instruction.
2101 ///
2102 /// \param __a
2103 ///    A 256-bit vector of [16 x i16] to be shifted.
2104 /// \param __count
2105 ///    An integer value specifying the unsigned shift count (in bits).
2106 /// \returns A 256-bit vector of [16 x i16] containing the result.
2107 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2108 _mm256_slli_epi16(__m256i __a, int __count)
2109 {
2110   return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count);
2111 }
2112 
2113 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2114 ///    left by the number of bits given in the lower 64 bits of \a __count,
2115 ///    shifting in zero bits, and returns the result. If that count is greater
2116 ///    than 15, the returned result is all zeroes.
2117 ///
2118 /// \headerfile <immintrin.h>
2119 ///
2120 /// This intrinsic corresponds to the \c VPSLLW instruction.
2121 ///
2122 /// \param __a
2123 ///    A 256-bit vector of [16 x i16] to be shifted.
2124 /// \param __count
2125 ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2126 ///    shift count (in bits). The upper element is ignored.
2127 /// \returns A 256-bit vector of [16 x i16] containing the result.
2128 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2129 _mm256_sll_epi16(__m256i __a, __m128i __count)
2130 {
2131   return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count);
2132 }
2133 
2134 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2135 ///    left by \a __count bits, shifting in zero bits, and returns the result.
2136 ///    If \a __count is greater than 31, the returned result is all zeroes.
2137 ///
2138 /// \headerfile <immintrin.h>
2139 ///
2140 /// This intrinsic corresponds to the \c VPSLLD instruction.
2141 ///
2142 /// \param __a
2143 ///    A 256-bit vector of [8 x i32] to be shifted.
2144 /// \param __count
2145 ///    An integer value specifying the unsigned shift count (in bits).
2146 /// \returns A 256-bit vector of [8 x i32] containing the result.
2147 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2148 _mm256_slli_epi32(__m256i __a, int __count)
2149 {
2150   return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count);
2151 }
2152 
2153 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2154 ///    left by the number of bits given in the lower 64 bits of \a __count,
2155 ///    shifting in zero bits, and returns the result. If that count is greater
2156 ///    than 31, the returned result is all zeroes.
2157 ///
2158 /// \headerfile <immintrin.h>
2159 ///
2160 /// This intrinsic corresponds to the \c VPSLLD instruction.
2161 ///
2162 /// \param __a
2163 ///    A 256-bit vector of [8 x i32] to be shifted.
2164 /// \param __count
2165 ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2166 ///    shift count (in bits). The upper element is ignored.
2167 /// \returns A 256-bit vector of [8 x i32] containing the result.
2168 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2169 _mm256_sll_epi32(__m256i __a, __m128i __count)
2170 {
2171   return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count);
2172 }
2173 
2174 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2175 ///    left by \a __count bits, shifting in zero bits, and returns the result.
2176 ///    If \a __count is greater than 63, the returned result is all zeroes.
2177 ///
2178 /// \headerfile <immintrin.h>
2179 ///
2180 /// This intrinsic corresponds to the \c VPSLLQ instruction.
2181 ///
2182 /// \param __a
2183 ///    A 256-bit vector of [4 x i64] to be shifted.
2184 /// \param __count
2185 ///    An integer value specifying the unsigned shift count (in bits).
2186 /// \returns A 256-bit vector of [4 x i64] containing the result.
2187 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2188 _mm256_slli_epi64(__m256i __a, int __count)
2189 {
2190   return (__m256i)__builtin_ia32_psllqi256((__v4di)__a, __count);
2191 }
2192 
2193 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2194 ///    left by the number of bits given in the lower 64 bits of \a __count,
2195 ///    shifting in zero bits, and returns the result. If that count is greater
2196 ///    than 63, the returned result is all zeroes.
2197 ///
2198 /// \headerfile <immintrin.h>
2199 ///
2200 /// This intrinsic corresponds to the \c VPSLLQ instruction.
2201 ///
2202 /// \param __a
2203 ///    A 256-bit vector of [4 x i64] to be shifted.
2204 /// \param __count
2205 ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2206 ///    shift count (in bits). The upper element is ignored.
2207 /// \returns A 256-bit vector of [4 x i64] containing the result.
2208 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2209 _mm256_sll_epi64(__m256i __a, __m128i __count)
2210 {
2211   return (__m256i)__builtin_ia32_psllq256((__v4di)__a, __count);
2212 }
2213 
2214 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2215 ///    right by \a __count bits, shifting in sign bits, and returns the result.
2216 ///    If \a __count is greater than 15, each element of the result is either
2217 ///    0 or -1 according to the corresponding input sign bit.
2218 ///
2219 /// \headerfile <immintrin.h>
2220 ///
2221 /// This intrinsic corresponds to the \c VPSRAW instruction.
2222 ///
2223 /// \param __a
2224 ///    A 256-bit vector of [16 x i16] to be shifted.
2225 /// \param __count
2226 ///    An integer value specifying the unsigned shift count (in bits).
2227 /// \returns A 256-bit vector of [16 x i16] containing the result.
2228 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2229 _mm256_srai_epi16(__m256i __a, int __count)
2230 {
2231   return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count);
2232 }
2233 
2234 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2235 ///    right by the number of bits given in the lower 64 bits of \a __count,
2236 ///    shifting in sign bits, and returns the result. If that count is greater
2237 ///    than 15, each element of the result is either 0 or -1 according to the
2238 ///    corresponding input sign bit.
2239 ///
2240 /// \headerfile <immintrin.h>
2241 ///
2242 /// This intrinsic corresponds to the \c VPSRAW instruction.
2243 ///
2244 /// \param __a
2245 ///    A 256-bit vector of [16 x i16] to be shifted.
2246 /// \param __count
2247 ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2248 ///    shift count (in bits). The upper element is ignored.
2249 /// \returns A 256-bit vector of [16 x i16] containing the result.
2250 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2251 _mm256_sra_epi16(__m256i __a, __m128i __count)
2252 {
2253   return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count);
2254 }
2255 
2256 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2257 ///    right by \a __count bits, shifting in sign bits, and returns the result.
2258 ///    If \a __count is greater than 31, each element of the result is either
2259 ///    0 or -1 according to the corresponding input sign bit.
2260 ///
2261 /// \headerfile <immintrin.h>
2262 ///
2263 /// This intrinsic corresponds to the \c VPSRAD instruction.
2264 ///
2265 /// \param __a
2266 ///    A 256-bit vector of [8 x i32] to be shifted.
2267 /// \param __count
2268 ///    An integer value specifying the unsigned shift count (in bits).
2269 /// \returns A 256-bit vector of [8 x i32] containing the result.
2270 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2271 _mm256_srai_epi32(__m256i __a, int __count)
2272 {
2273   return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count);
2274 }
2275 
2276 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2277 ///    right by the number of bits given in the lower 64 bits of \a __count,
2278 ///    shifting in sign bits, and returns the result. If that count is greater
2279 ///    than 31, each element of the result is either 0 or -1 according to the
2280 ///    corresponding input sign bit.
2281 ///
2282 /// \headerfile <immintrin.h>
2283 ///
2284 /// This intrinsic corresponds to the \c VPSRAD instruction.
2285 ///
2286 /// \param __a
2287 ///    A 256-bit vector of [8 x i32] to be shifted.
2288 /// \param __count
2289 ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2290 ///    shift count (in bits). The upper element is ignored.
2291 /// \returns A 256-bit vector of [8 x i32] containing the result.
2292 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2293 _mm256_sra_epi32(__m256i __a, __m128i __count)
2294 {
2295   return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count);
2296 }
2297 
2298 /// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
2299 ///    \a imm bytes, shifting in zero bytes, and returns the result. If
2300 ///    \a imm is greater than 15, the returned result is all zeroes.
2301 ///
2302 /// \headerfile <immintrin.h>
2303 ///
2304 /// \code
2305 /// __m256i _mm256_srli_si256(__m256i a, const int imm);
2306 /// \endcode
2307 ///
2308 /// This intrinsic corresponds to the \c VPSRLDQ instruction.
2309 ///
2310 /// \param a
2311 ///    A 256-bit integer vector to be shifted.
2312 /// \param imm
2313 ///    An unsigned immediate value specifying the shift count (in bytes).
2314 /// \returns A 256-bit integer vector containing the result.
2315 #define _mm256_srli_si256(a, imm) \
2316   ((__m256i)__builtin_ia32_psrldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))
2317 
2318 /// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
2319 ///    \a imm bytes, shifting in zero bytes, and returns the result. If
2320 ///    \a imm is greater than 15, the returned result is all zeroes.
2321 ///
2322 /// \headerfile <immintrin.h>
2323 ///
2324 /// \code
2325 /// __m256i _mm256_bsrli_epi128(__m256i a, const int imm);
2326 /// \endcode
2327 ///
2328 /// This intrinsic corresponds to the \c VPSRLDQ instruction.
2329 ///
2330 /// \param a
2331 ///    A 256-bit integer vector to be shifted.
2332 /// \param imm
2333 ///    An unsigned immediate value specifying the shift count (in bytes).
2334 /// \returns A 256-bit integer vector containing the result.
2335 #define _mm256_bsrli_epi128(a, imm) \
2336   ((__m256i)__builtin_ia32_psrldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))
2337 
2338 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2339 ///    right by \a __count bits, shifting in zero bits, and returns the result.
2340 ///    If \a __count is greater than 15, the returned result is all zeroes.
2341 ///
2342 /// \headerfile <immintrin.h>
2343 ///
2344 /// This intrinsic corresponds to the \c VPSRLW instruction.
2345 ///
2346 /// \param __a
2347 ///    A 256-bit vector of [16 x i16] to be shifted.
2348 /// \param __count
2349 ///    An integer value specifying the unsigned shift count (in bits).
2350 /// \returns A 256-bit vector of [16 x i16] containing the result.
2351 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2352 _mm256_srli_epi16(__m256i __a, int __count)
2353 {
2354   return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count);
2355 }
2356 
2357 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2358 ///    right by the number of bits given in the lower 64 bits of \a __count,
2359 ///    shifting in zero bits, and returns the result. If that count is greater
2360 ///    than 15, the returned result is all zeroes.
2361 ///
2362 /// \headerfile <immintrin.h>
2363 ///
2364 /// This intrinsic corresponds to the \c VPSRLW instruction.
2365 ///
2366 /// \param __a
2367 ///    A 256-bit vector of [16 x i16] to be shifted.
2368 /// \param __count
2369 ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2370 ///    shift count (in bits). The upper element is ignored.
2371 /// \returns A 256-bit vector of [16 x i16] containing the result.
2372 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2373 _mm256_srl_epi16(__m256i __a, __m128i __count)
2374 {
2375   return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count);
2376 }
2377 
2378 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2379 ///    right by \a __count bits, shifting in zero bits, and returns the result.
2380 ///    If \a __count is greater than 31, the returned result is all zeroes.
2381 ///
2382 /// \headerfile <immintrin.h>
2383 ///
2384 /// This intrinsic corresponds to the \c VPSRLD instruction.
2385 ///
2386 /// \param __a
2387 ///    A 256-bit vector of [8 x i32] to be shifted.
2388 /// \param __count
2389 ///    An unsigned integer value specifying the shift count (in bits).
2390 /// \returns A 256-bit vector of [8 x i32] containing the result.
2391 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2392 _mm256_srli_epi32(__m256i __a, int __count)
2393 {
2394   return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count);
2395 }
2396 
2397 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2398 ///    right by the number of bits given in the lower 64 bits of \a __count,
2399 ///    shifting in zero bits, and returns the result. If \a __count is greater
2400 ///    than 31, the returned result is all zeroes.
2401 ///
2402 /// \headerfile <immintrin.h>
2403 ///
2404 /// This intrinsic corresponds to the \c VPSRLD instruction.
2405 ///
2406 /// \param __a
2407 ///    A 256-bit vector of [8 x i32] to be shifted.
2408 /// \param __count
2409 ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2410 ///    shift count (in bits). The upper element is ignored.
2411 /// \returns A 256-bit vector of [8 x i32] containing the result.
2412 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2413 _mm256_srl_epi32(__m256i __a, __m128i __count)
2414 {
2415   return (__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count);
2416 }
2417 
2418 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2419 ///    right by \a __count bits, shifting in zero bits, and returns the result.
2420 ///    If \a __count is greater than 63, the returned result is all zeroes.
2421 ///
2422 /// \headerfile <immintrin.h>
2423 ///
2424 /// This intrinsic corresponds to the \c VPSRLQ instruction.
2425 ///
2426 /// \param __a
2427 ///    A 256-bit vector of [4 x i64] to be shifted.
2428 /// \param __count
2429 ///    An unsigned integer value specifying the shift count (in bits).
2430 /// \returns A 256-bit vector of [4 x i64] containing the result.
2431 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2432 _mm256_srli_epi64(__m256i __a, int __count)
2433 {
2434   return __builtin_ia32_psrlqi256((__v4di)__a, __count);
2435 }
2436 
2437 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2438 ///    right by the number of bits given in the lower 64 bits of \a __count,
2439 ///    shifting in zero bits, and returns the result. If \a __count is greater
2440 ///    than 63, the returned result is all zeroes.
2441 ///
2442 /// \headerfile <immintrin.h>
2443 ///
2444 /// This intrinsic corresponds to the \c VPSRLQ instruction.
2445 ///
2446 /// \param __a
2447 ///    A 256-bit vector of [4 x i64] to be shifted.
2448 /// \param __count
2449 ///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2450 ///    shift count (in bits). The upper element is ignored.
2451 /// \returns A 256-bit vector of [4 x i64] containing the result.
2452 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2453 _mm256_srl_epi64(__m256i __a, __m128i __count)
2454 {
2455   return __builtin_ia32_psrlq256((__v4di)__a, __count);
2456 }
2457 
2458 /// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2459 ///    vectors. Returns the lower 8 bits of each difference in the
2460 ///    corresponding byte of the 256-bit integer vector result (overflow is
2461 ///    ignored).
2462 ///
2463 /// \code{.operation}
2464 /// FOR i := 0 TO 31
2465 ///   j := i*8
2466 ///   result[j+7:j] := __a[j+7:j] - __b[j+7:j]
2467 /// ENDFOR
2468 /// \endcode
2469 ///
2470 /// \headerfile <immintrin.h>
2471 ///
2472 /// This intrinsic corresponds to the \c VPSUBB instruction.
2473 ///
2474 /// \param __a
2475 ///    A 256-bit integer vector containing the minuends.
2476 /// \param __b
2477 ///    A 256-bit integer vector containing the subtrahends.
2478 /// \returns A 256-bit integer vector containing the differences.
2479 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2480 _mm256_sub_epi8(__m256i __a, __m256i __b)
2481 {
2482   return (__m256i)((__v32qu)__a - (__v32qu)__b);
2483 }
2484 
2485 /// Subtracts 16-bit integers from corresponding elements of two 256-bit
2486 ///    vectors of [16 x i16]. Returns the lower 16 bits of each difference in
2487 ///    the corresponding element of the [16 x i16] result (overflow is
2488 ///    ignored).
2489 ///
2490 /// \code{.operation}
2491 /// FOR i := 0 TO 15
2492 ///   j := i*16
2493 ///   result[j+15:j] := __a[j+15:j] - __b[j+15:j]
2494 /// ENDFOR
2495 /// \endcode
2496 ///
2497 /// \headerfile <immintrin.h>
2498 ///
2499 /// This intrinsic corresponds to the \c VPSUBW instruction.
2500 ///
2501 /// \param __a
2502 ///    A 256-bit vector of [16 x i16] containing the minuends.
2503 /// \param __b
2504 ///    A 256-bit vector of [16 x i16] containing the subtrahends.
2505 /// \returns A 256-bit vector of [16 x i16] containing the differences.
2506 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2507 _mm256_sub_epi16(__m256i __a, __m256i __b)
2508 {
2509   return (__m256i)((__v16hu)__a - (__v16hu)__b);
2510 }
2511 
2512 /// Subtracts 32-bit integers from corresponding elements of two 256-bit
2513 ///    vectors of [8 x i32]. Returns the lower 32 bits of each difference in
2514 ///    the corresponding element of the [8 x i32] result (overflow is ignored).
2515 ///
2516 /// \code{.operation}
2517 /// FOR i := 0 TO 7
2518 ///   j := i*32
2519 ///   result[j+31:j] := __a[j+31:j] - __b[j+31:j]
2520 /// ENDFOR
2521 /// \endcode
2522 ///
2523 /// \headerfile <immintrin.h>
2524 ///
2525 /// This intrinsic corresponds to the \c VPSUBD instruction.
2526 ///
2527 /// \param __a
2528 ///    A 256-bit vector of [8 x i32] containing the minuends.
2529 /// \param __b
2530 ///    A 256-bit vector of [8 x i32] containing the subtrahends.
2531 /// \returns A 256-bit vector of [8 x i32] containing the differences.
2532 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2533 _mm256_sub_epi32(__m256i __a, __m256i __b)
2534 {
2535   return (__m256i)((__v8su)__a - (__v8su)__b);
2536 }
2537 
2538 /// Subtracts 64-bit integers from corresponding elements of two 256-bit
2539 ///    vectors of [4 x i64]. Returns the lower 64 bits of each difference in
2540 ///    the corresponding element of the [4 x i64] result (overflow is ignored).
2541 ///
2542 /// \code{.operation}
2543 /// FOR i := 0 TO 3
2544 ///   j := i*64
2545 ///   result[j+63:j] := __a[j+63:j] - __b[j+63:j]
2546 /// ENDFOR
2547 /// \endcode
2548 ///
2549 /// \headerfile <immintrin.h>
2550 ///
2551 /// This intrinsic corresponds to the \c VPSUBQ instruction.
2552 ///
2553 /// \param __a
2554 ///    A 256-bit vector of [4 x i64] containing the minuends.
2555 /// \param __b
2556 ///    A 256-bit vector of [4 x i64] containing the subtrahends.
2557 /// \returns A 256-bit vector of [4 x i64] containing the differences.
2558 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2559 _mm256_sub_epi64(__m256i __a, __m256i __b)
2560 {
2561   return (__m256i)((__v4du)__a - (__v4du)__b);
2562 }
2563 
2564 /// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2565 ///    vectors using signed saturation, and returns each difference in the
2566 ///    corresponding byte of the 256-bit integer vector result.
2567 ///
2568 /// \code{.operation}
2569 /// FOR i := 0 TO 31
2570 ///   j := i*8
2571 ///   result[j+7:j] := SATURATE8(__a[j+7:j] - __b[j+7:j])
2572 /// ENDFOR
2573 /// \endcode
2574 ///
2575 /// \headerfile <immintrin.h>
2576 ///
2577 /// This intrinsic corresponds to the \c VPSUBSB instruction.
2578 ///
2579 /// \param __a
2580 ///    A 256-bit integer vector containing the minuends.
2581 /// \param __b
2582 ///    A 256-bit integer vector containing the subtrahends.
2583 /// \returns A 256-bit integer vector containing the differences.
2584 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2585 _mm256_subs_epi8(__m256i __a, __m256i __b)
2586 {
2587   return (__m256i)__builtin_elementwise_sub_sat((__v32qs)__a, (__v32qs)__b);
2588 }
2589 
2590 /// Subtracts 16-bit integers from corresponding elements of two 256-bit
2591 ///    vectors of [16 x i16] using signed saturation, and returns each
2592 ///    difference in the corresponding element of the [16 x i16] result.
2593 ///
2594 /// \code{.operation}
2595 /// FOR i := 0 TO 15
2596 ///   j := i*16
2597 ///   result[j+15:j] := SATURATE16(__a[j+15:j] - __b[j+15:j])
2598 /// ENDFOR
2599 /// \endcode
2600 ///
2601 /// \headerfile <immintrin.h>
2602 ///
2603 /// This intrinsic corresponds to the \c VPSUBSW instruction.
2604 ///
2605 /// \param __a
2606 ///    A 256-bit vector of [16 x i16] containing the minuends.
2607 /// \param __b
2608 ///    A 256-bit vector of [16 x i16] containing the subtrahends.
2609 /// \returns A 256-bit vector of [16 x i16] containing the differences.
2610 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2611 _mm256_subs_epi16(__m256i __a, __m256i __b)
2612 {
2613   return (__m256i)__builtin_elementwise_sub_sat((__v16hi)__a, (__v16hi)__b);
2614 }
2615 
2616 /// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2617 ///    vectors using unsigned saturation, and returns each difference in the
2618 ///    corresponding byte of the 256-bit integer vector result. For each byte,
2619 ///    computes <c> result = __a - __b </c>.
2620 ///
2621 /// \code{.operation}
2622 /// FOR i := 0 TO 31
2623 ///   j := i*8
2624 ///   result[j+7:j] := SATURATE8U(__a[j+7:j] - __b[j+7:j])
2625 /// ENDFOR
2626 /// \endcode
2627 ///
2628 /// \headerfile <immintrin.h>
2629 ///
2630 /// This intrinsic corresponds to the \c VPSUBUSB instruction.
2631 ///
2632 /// \param __a
2633 ///    A 256-bit integer vector containing the minuends.
2634 /// \param __b
2635 ///    A 256-bit integer vector containing the subtrahends.
2636 /// \returns A 256-bit integer vector containing the differences.
2637 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2638 _mm256_subs_epu8(__m256i __a, __m256i __b)
2639 {
2640   return (__m256i)__builtin_elementwise_sub_sat((__v32qu)__a, (__v32qu)__b);
2641 }
2642 
2643 /// Subtracts 16-bit integers from corresponding elements of two 256-bit
2644 ///    vectors of [16 x i16] using unsigned saturation, and returns each
2645 ///    difference in the corresponding element of the [16 x i16] result.
2646 ///
2647 /// \code{.operation}
2648 /// FOR i := 0 TO 15
2649 ///   j := i*16
2650 ///   result[j+15:j] := SATURATE16U(__a[j+15:j] - __b[j+15:j])
2651 /// ENDFOR
2652 /// \endcode
2653 ///
2654 /// \headerfile <immintrin.h>
2655 ///
2656 /// This intrinsic corresponds to the \c VPSUBUSW instruction.
2657 ///
2658 /// \param __a
2659 ///    A 256-bit vector of [16 x i16] containing the minuends.
2660 /// \param __b
2661 ///    A 256-bit vector of [16 x i16] containing the subtrahends.
2662 /// \returns A 256-bit vector of [16 x i16] containing the differences.
2663 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2664 _mm256_subs_epu16(__m256i __a, __m256i __b)
2665 {
2666   return (__m256i)__builtin_elementwise_sub_sat((__v16hu)__a, (__v16hu)__b);
2667 }
2668 
2669 /// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
2670 ///    vectors in \a __a and \a __b to form the 256-bit result. Specifically,
2671 ///    uses the upper 64 bits of each 128-bit half of \a __a and \a __b as
2672 ///    input; other bits in these parameters are ignored.
2673 ///
2674 /// \code{.operation}
2675 /// result[7:0] := __a[71:64]
2676 /// result[15:8] := __b[71:64]
2677 /// result[23:16] := __a[79:72]
2678 /// result[31:24] := __b[79:72]
2679 /// . . .
2680 /// result[127:120] := __b[127:120]
2681 /// result[135:128] := __a[199:192]
2682 /// . . .
2683 /// result[255:248] := __b[255:248]
2684 /// \endcode
2685 ///
2686 /// \headerfile <immintrin.h>
2687 ///
2688 /// This intrinsic corresponds to the \c VPUNPCKHBW instruction.
2689 ///
2690 /// \param __a
2691 ///    A 256-bit integer vector used as the source for the even-numbered bytes
2692 ///    of the result.
2693 /// \param __b
2694 ///    A 256-bit integer vector used as the source for the odd-numbered bytes
2695 ///    of the result.
2696 /// \returns A 256-bit integer vector containing the result.
2697 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2698 _mm256_unpackhi_epi8(__m256i __a, __m256i __b)
2699 {
2700   return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31);
2701 }
2702 
2703 /// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
2704 ///    of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
2705 ///    vector of [16 x i16]. Specifically, uses the upper 64 bits of each
2706 ///    128-bit half of \a __a and \a __b as input; other bits in these
2707 ///    parameters are ignored.
2708 ///
2709 /// \code{.operation}
2710 /// result[15:0] := __a[79:64]
2711 /// result[31:16] := __b[79:64]
2712 /// result[47:32] := __a[95:80]
2713 /// result[63:48] := __b[95:80]
2714 /// . . .
2715 /// result[127:112] := __b[127:112]
2716 /// result[143:128] := __a[207:192]
2717 /// . . .
2718 /// result[255:240] := __b[255:240]
2719 /// \endcode
2720 ///
2721 /// \headerfile <immintrin.h>
2722 ///
2723 /// This intrinsic corresponds to the \c VPUNPCKHWD instruction.
2724 ///
2725 /// \param __a
2726 ///    A 256-bit vector of [16 x i16] used as the source for the even-numbered
2727 ///    elements of the result.
2728 /// \param __b
2729 ///    A 256-bit vector of [16 x i16] used as the source for the odd-numbered
2730 ///    elements of the result.
2731 /// \returns A 256-bit vector of [16 x i16] containing the result.
2732 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2733 _mm256_unpackhi_epi16(__m256i __a, __m256i __b)
2734 {
2735   return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
2736 }
2737 
2738 /// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
2739 ///    of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
2740 ///    of [8 x i32]. Specifically, uses the upper 64 bits of each 128-bit half
2741 ///    of \a __a and \a __b as input; other bits in these parameters are
2742 ///    ignored.
2743 ///
2744 /// \code{.operation}
2745 /// result[31:0] := __a[95:64]
2746 /// result[63:32] := __b[95:64]
2747 /// result[95:64] := __a[127:96]
2748 /// result[127:96] := __b[127:96]
2749 /// result[159:128] := __a[223:192]
2750 /// result[191:160] := __b[223:192]
2751 /// result[223:192] := __a[255:224]
2752 /// result[255:224] := __b[255:224]
2753 /// \endcode
2754 ///
2755 /// \headerfile <immintrin.h>
2756 ///
2757 /// This intrinsic corresponds to the \c VPUNPCKHDQ instruction.
2758 ///
2759 /// \param __a
2760 ///    A 256-bit vector of [8 x i32] used as the source for the even-numbered
2761 ///    elements of the result.
2762 /// \param __b
2763 ///    A 256-bit vector of [8 x i32] used as the source for the odd-numbered
2764 ///    elements of the result.
2765 /// \returns A 256-bit vector of [8 x i32] containing the result.
2766 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2767 _mm256_unpackhi_epi32(__m256i __a, __m256i __b)
2768 {
2769   return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7);
2770 }
2771 
2772 /// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
2773 ///    of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
2774 ///    of [4 x i64]. Specifically, uses the upper 64 bits of each 128-bit half
2775 ///    of \a __a and \a __b as input; other bits in these parameters are
2776 ///    ignored.
2777 ///
2778 /// \code{.operation}
2779 /// result[63:0] := __a[127:64]
2780 /// result[127:64] := __b[127:64]
2781 /// result[191:128] := __a[255:192]
2782 /// result[255:192] := __b[255:192]
2783 /// \endcode
2784 ///
2785 /// \headerfile <immintrin.h>
2786 ///
2787 /// This intrinsic corresponds to the \c VPUNPCKHQDQ instruction.
2788 ///
2789 /// \param __a
2790 ///    A 256-bit vector of [4 x i64] used as the source for the even-numbered
2791 ///    elements of the result.
2792 /// \param __b
2793 ///    A 256-bit vector of [4 x i64] used as the source for the odd-numbered
2794 ///    elements of the result.
2795 /// \returns A 256-bit vector of [4 x i64] containing the result.
2796 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2797 _mm256_unpackhi_epi64(__m256i __a, __m256i __b)
2798 {
2799   return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 1, 4+1, 3, 4+3);
2800 }
2801 
2802 /// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
2803 ///    vectors in \a __a and \a __b to form the 256-bit result. Specifically,
2804 ///    uses the lower 64 bits of each 128-bit half of \a __a and \a __b as
2805 ///    input; other bits in these parameters are ignored.
2806 ///
2807 /// \code{.operation}
2808 /// result[7:0] := __a[7:0]
2809 /// result[15:8] := __b[7:0]
2810 /// result[23:16] := __a[15:8]
2811 /// result[31:24] := __b[15:8]
2812 /// . . .
2813 /// result[127:120] := __b[63:56]
2814 /// result[135:128] := __a[135:128]
2815 /// . . .
2816 /// result[255:248] := __b[191:184]
2817 /// \endcode
2818 ///
2819 /// \headerfile <immintrin.h>
2820 ///
2821 /// This intrinsic corresponds to the \c VPUNPCKLBW instruction.
2822 ///
2823 /// \param __a
2824 ///    A 256-bit integer vector used as the source for the even-numbered bytes
2825 ///    of the result.
2826 /// \param __b
2827 ///    A 256-bit integer vector used as the source for the odd-numbered bytes
2828 ///    of the result.
2829 /// \returns A 256-bit integer vector containing the result.
2830 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2831 _mm256_unpacklo_epi8(__m256i __a, __m256i __b)
2832 {
2833   return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23);
2834 }
2835 
2836 /// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
2837 ///    of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
2838 ///    vector of [16 x i16]. Specifically, uses the lower 64 bits of each
2839 ///    128-bit half of \a __a and \a __b as input; other bits in these
2840 ///    parameters are ignored.
2841 ///
2842 /// \code{.operation}
2843 /// result[15:0] := __a[15:0]
2844 /// result[31:16] := __b[15:0]
2845 /// result[47:32] := __a[31:16]
2846 /// result[63:48] := __b[31:16]
2847 /// . . .
2848 /// result[127:112] := __b[63:48]
2849 /// result[143:128] := __a[143:128]
2850 /// . . .
2851 /// result[255:240] := __b[191:176]
2852 /// \endcode
2853 ///
2854 /// \headerfile <immintrin.h>
2855 ///
2856 /// This intrinsic corresponds to the \c VPUNPCKLWD instruction.
2857 ///
2858 /// \param __a
2859 ///    A 256-bit vector of [16 x i16] used as the source for the even-numbered
2860 ///    elements of the result.
2861 /// \param __b
2862 ///    A 256-bit vector of [16 x i16] used as the source for the odd-numbered
2863 ///    elements of the result.
2864 /// \returns A 256-bit vector of [16 x i16] containing the result.
2865 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2866 _mm256_unpacklo_epi16(__m256i __a, __m256i __b)
2867 {
2868   return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11);
2869 }
2870 
2871 /// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
2872 ///    of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
2873 ///    of [8 x i32]. Specifically, uses the lower 64 bits of each 128-bit half
2874 ///    of \a __a and \a __b as input; other bits in these parameters are
2875 ///    ignored.
2876 ///
2877 /// \code{.operation}
2878 /// result[31:0] := __a[31:0]
2879 /// result[63:32] := __b[31:0]
2880 /// result[95:64] := __a[63:32]
2881 /// result[127:96] := __b[63:32]
2882 /// result[159:128] := __a[159:128]
2883 /// result[191:160] := __b[159:128]
2884 /// result[223:192] := __a[191:160]
2885 /// result[255:224] := __b[191:160]
2886 /// \endcode
2887 ///
2888 /// \headerfile <immintrin.h>
2889 ///
2890 /// This intrinsic corresponds to the \c VPUNPCKLDQ instruction.
2891 ///
2892 /// \param __a
2893 ///    A 256-bit vector of [8 x i32] used as the source for the even-numbered
2894 ///    elements of the result.
2895 /// \param __b
2896 ///    A 256-bit vector of [8 x i32] used as the source for the odd-numbered
2897 ///    elements of the result.
2898 /// \returns A 256-bit vector of [8 x i32] containing the result.
2899 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2900 _mm256_unpacklo_epi32(__m256i __a, __m256i __b)
2901 {
2902   return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5);
2903 }
2904 
2905 /// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
2906 ///    of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
2907 ///    of [4 x i64]. Specifically, uses the lower 64 bits of each 128-bit half
2908 ///    of \a __a and \a __b as input; other bits in these parameters are
2909 ///    ignored.
2910 ///
2911 /// \code{.operation}
2912 /// result[63:0] := __a[63:0]
2913 /// result[127:64] := __b[63:0]
2914 /// result[191:128] := __a[191:128]
2915 /// result[255:192] := __b[191:128]
2916 /// \endcode
2917 ///
2918 /// \headerfile <immintrin.h>
2919 ///
2920 /// This intrinsic corresponds to the \c VPUNPCKLQDQ instruction.
2921 ///
2922 /// \param __a
2923 ///    A 256-bit vector of [4 x i64] used as the source for the even-numbered
2924 ///    elements of the result.
2925 /// \param __b
2926 ///    A 256-bit vector of [4 x i64] used as the source for the odd-numbered
2927 ///    elements of the result.
2928 /// \returns A 256-bit vector of [4 x i64] containing the result.
2929 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2930 _mm256_unpacklo_epi64(__m256i __a, __m256i __b)
2931 {
2932   return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 0, 4+0, 2, 4+2);
2933 }
2934 
2935 /// Computes the bitwise XOR of the 256-bit integer vectors in \a __a and
2936 ///    \a __b.
2937 ///
2938 /// \headerfile <immintrin.h>
2939 ///
2940 /// This intrinsic corresponds to the \c VPXOR instruction.
2941 ///
2942 /// \param __a
2943 ///    A 256-bit integer vector.
2944 /// \param __b
2945 ///    A 256-bit integer vector.
2946 /// \returns A 256-bit integer vector containing the result.
2947 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2948 _mm256_xor_si256(__m256i __a, __m256i __b)
2949 {
2950   return (__m256i)((__v4du)__a ^ (__v4du)__b);
2951 }
2952 
2953 /// Loads the 256-bit integer vector from memory \a __V using a non-temporal
2954 ///   memory hint and returns the vector. \a __V must be aligned on a 32-byte
2955 ///   boundary.
2956 ///
2957 /// \headerfile <immintrin.h>
2958 ///
2959 /// This intrinsic corresponds to the \c VMOVNTDQA instruction.
2960 ///
2961 /// \param __V
2962 ///    A pointer to the 32-byte aligned memory containing the vector to load.
2963 /// \returns A 256-bit integer vector loaded from memory.
2964 static __inline__ __m256i __DEFAULT_FN_ATTRS256
2965 _mm256_stream_load_si256(__m256i const *__V)
2966 {
2967   typedef __v4di __v4di_aligned __attribute__((aligned(32)));
2968   return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V);
2969 }
2970 
2971 /// Broadcasts the 32-bit floating-point value from the low element of the
2972 ///    128-bit vector of [4 x float] in \a __X to all elements of the result's
2973 ///    128-bit vector of [4 x float].
2974 ///
2975 /// \headerfile <immintrin.h>
2976 ///
2977 /// This intrinsic corresponds to the \c VBROADCASTSS instruction.
2978 ///
2979 /// \param __X
2980 ///    A 128-bit vector of [4 x float] whose low element will be broadcast.
2981 /// \returns A 128-bit vector of [4 x float] containing the result.
2982 static __inline__ __m128 __DEFAULT_FN_ATTRS128
2983 _mm_broadcastss_ps(__m128 __X)
2984 {
2985   return (__m128)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0);
2986 }
2987 
2988 /// Broadcasts the 64-bit floating-point value from the low element of the
2989 ///    128-bit vector of [2 x double] in \a __a to both elements of the
2990 ///    result's 128-bit vector of [2 x double].
2991 ///
2992 /// \headerfile <immintrin.h>
2993 ///
2994 /// This intrinsic corresponds to the \c MOVDDUP instruction.
2995 ///
2996 /// \param __a
2997 ///    A 128-bit vector of [2 x double] whose low element will be broadcast.
2998 /// \returns A 128-bit vector of [2 x double] containing the result.
2999 static __inline__ __m128d __DEFAULT_FN_ATTRS128
3000 _mm_broadcastsd_pd(__m128d __a)
3001 {
3002   return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
3003 }
3004 
3005 /// Broadcasts the 32-bit floating-point value from the low element of the
3006 ///    128-bit vector of [4 x float] in \a __X to all elements of the
3007 ///    result's 256-bit vector of [8 x float].
3008 ///
3009 /// \headerfile <immintrin.h>
3010 ///
3011 /// This intrinsic corresponds to the \c VBROADCASTSS instruction.
3012 ///
3013 /// \param __X
3014 ///    A 128-bit vector of [4 x float] whose low element will be broadcast.
3015 /// \returns A 256-bit vector of [8 x float] containing the result.
3016 static __inline__ __m256 __DEFAULT_FN_ATTRS256
3017 _mm256_broadcastss_ps(__m128 __X)
3018 {
3019   return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0, 0, 0, 0, 0);
3020 }
3021 
3022 /// Broadcasts the 64-bit floating-point value from the low element of the
3023 ///    128-bit vector of [2 x double] in \a __X to all elements of the
3024 ///    result's 256-bit vector of [4 x double].
3025 ///
3026 /// \headerfile <immintrin.h>
3027 ///
3028 /// This intrinsic corresponds to the \c VBROADCASTSD instruction.
3029 ///
3030 /// \param __X
3031 ///    A 128-bit vector of [2 x double] whose low element will be broadcast.
3032 /// \returns A 256-bit vector of [4 x double] containing the result.
3033 static __inline__ __m256d __DEFAULT_FN_ATTRS256
3034 _mm256_broadcastsd_pd(__m128d __X)
3035 {
3036   return (__m256d)__builtin_shufflevector((__v2df)__X, (__v2df)__X, 0, 0, 0, 0);
3037 }
3038 
3039 /// Broadcasts the 128-bit integer data from \a __X to both the lower and
3040 ///    upper halves of the 256-bit result.
3041 ///
3042 /// \headerfile <immintrin.h>
3043 ///
3044 /// This intrinsic corresponds to the \c VBROADCASTI128 instruction.
3045 ///
3046 /// \param __X
3047 ///    A 128-bit integer vector to be broadcast.
3048 /// \returns A 256-bit integer vector containing the result.
3049 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3050 _mm256_broadcastsi128_si256(__m128i __X)
3051 {
3052   return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 1, 0, 1);
3053 }
3054 
/* Alternate spelling: expands directly to _mm256_broadcastsi128_si256(X). */
#define _mm_broadcastsi128_si256(X) \
  _mm256_broadcastsi128_si256(X)
3056 
3057 /// Merges 32-bit integer elements from either of the two 128-bit vectors of
3058 ///    [4 x i32] in \a V1 or \a V2 to the result's 128-bit vector of [4 x i32],
3059 ///    as specified by the immediate integer operand \a M.
3060 ///
3061 /// \code{.operation}
3062 /// FOR i := 0 TO 3
3063 ///   j := i*32
3064 ///   IF M[i] == 0
3065 ///     result[31+j:j] := V1[31+j:j]
3066 ///   ELSE
3067 ///     result[31+j:j] := V2[31+j:j]
3068 ///   FI
3069 /// ENDFOR
3070 /// \endcode
3071 ///
3072 /// \headerfile <immintrin.h>
3073 ///
3074 /// \code
3075 /// __m128i _mm_blend_epi32(__m128i V1, __m128i V2, const int M);
3076 /// \endcode
3077 ///
3078 /// This intrinsic corresponds to the \c VPBLENDD instruction.
3079 ///
3080 /// \param V1
3081 ///    A 128-bit vector of [4 x i32] containing source values.
3082 /// \param V2
3083 ///    A 128-bit vector of [4 x i32] containing source values.
3084 /// \param M
3085 ///    An immediate 8-bit integer operand, with bits [3:0] specifying the
3086 ///    source for each element of the result. The position of the mask bit
3087 ///    corresponds to the index of a copied value. When a mask bit is 0, the
3088 ///    element is copied from \a V1; otherwise, it is copied from \a V2.
3089 /// \returns A 128-bit vector of [4 x i32] containing the result.
#define _mm_blend_epi32(V1, V2, M) \
  ((__m128i)__builtin_ia32_pblendd128( \
      (__v4si)(__m128i)(V1), \
      (__v4si)(__m128i)(V2), \
      (int)(M)))
3093 
3094 /// Merges 32-bit integer elements from either of the two 256-bit vectors of
3095 ///    [8 x i32] in \a V1 or \a V2 to return a 256-bit vector of [8 x i32],
3096 ///    as specified by the immediate integer operand \a M.
3097 ///
3098 /// \code{.operation}
3099 /// FOR i := 0 TO 7
3100 ///   j := i*32
3101 ///   IF M[i] == 0
3102 ///     result[31+j:j] := V1[31+j:j]
3103 ///   ELSE
3104 ///     result[31+j:j] := V2[31+j:j]
3105 ///   FI
3106 /// ENDFOR
3107 /// \endcode
3108 ///
3109 /// \headerfile <immintrin.h>
3110 ///
3111 /// \code
3112 /// __m256i _mm256_blend_epi32(__m256i V1, __m256i V2, const int M);
3113 /// \endcode
3114 ///
3115 /// This intrinsic corresponds to the \c VPBLENDD instruction.
3116 ///
3117 /// \param V1
3118 ///    A 256-bit vector of [8 x i32] containing source values.
3119 /// \param V2
3120 ///    A 256-bit vector of [8 x i32] containing source values.
3121 /// \param M
3122 ///    An immediate 8-bit integer operand, with bits [7:0] specifying the
3123 ///    source for each element of the result. The position of the mask bit
3124 ///    corresponds to the index of a copied value. When a mask bit is 0, the
3125 ///    element is copied from \a V1; otherwise, it is copied from \a V2.
3126 /// \returns A 256-bit vector of [8 x i32] containing the result.
#define _mm256_blend_epi32(V1, V2, M) \
  ((__m256i)__builtin_ia32_pblendd256( \
      (__v8si)(__m256i)(V1), \
      (__v8si)(__m256i)(V2), \
      (int)(M)))
3130 
3131 /// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
3132 ///    bytes of the 256-bit result.
3133 ///
3134 /// \headerfile <immintrin.h>
3135 ///
3136 /// This intrinsic corresponds to the \c VPBROADCASTB instruction.
3137 ///
3138 /// \param __X
3139 ///    A 128-bit integer vector whose low byte will be broadcast.
3140 /// \returns A 256-bit integer vector containing the result.
3141 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3142 _mm256_broadcastb_epi8(__m128i __X)
3143 {
3144   return (__m256i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3145 }
3146 
3147 /// Broadcasts the low element from the 128-bit vector of [8 x i16] in \a __X
3148 ///    to all elements of the result's 256-bit vector of [16 x i16].
3149 ///
3150 /// \headerfile <immintrin.h>
3151 ///
3152 /// This intrinsic corresponds to the \c VPBROADCASTW instruction.
3153 ///
3154 /// \param __X
3155 ///    A 128-bit vector of [8 x i16] whose low element will be broadcast.
3156 /// \returns A 256-bit vector of [16 x i16] containing the result.
3157 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3158 _mm256_broadcastw_epi16(__m128i __X) {
3159   __v8hi __src = (__v8hi)__X; /* cast once; feeds both shuffle operands */
3160   return (__m256i)__builtin_shufflevector(__src, __src, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3161 }
3162 
3163 /// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
3164 ///    to all elements of the result's 256-bit vector of [8 x i32].
3165 ///
3166 /// \headerfile <immintrin.h>
3167 ///
3168 /// This intrinsic corresponds to the \c VPBROADCASTD instruction.
3169 ///
3170 /// \param __X
3171 ///    A 128-bit vector of [4 x i32] whose low element will be broadcast.
3172 /// \returns A 256-bit vector of [8 x i32] containing the result.
3173 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3174 _mm256_broadcastd_epi32(__m128i __X) {
3175   __v4si __src = (__v4si)__X; /* cast once; feeds both shuffle operands */
3176   return (__m256i)__builtin_shufflevector(__src, __src, 0, 0, 0, 0, 0, 0, 0, 0);
3177 }
3178 
3179 /// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
3180 ///    to all elements of the result's 256-bit vector of [4 x i64].
3181 ///
3182 /// \headerfile <immintrin.h>
3183 ///
3184 /// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
3185 ///
3186 /// \param __X
3187 ///    A 128-bit vector of [2 x i64] whose low element will be broadcast.
3188 /// \returns A 256-bit vector of [4 x i64] containing the result.
3189 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3190 _mm256_broadcastq_epi64(__m128i __X) {
3191   __v2di __src = (__v2di)__X; /* cast once; feeds both shuffle operands */
3192   return (__m256i)__builtin_shufflevector(__src, __src, 0, 0, 0, 0);
3193 }
3194 
3195 /// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
3196 ///    bytes of the 128-bit result.
3197 ///
3198 /// \headerfile <immintrin.h>
3199 ///
3200 /// This intrinsic corresponds to the \c VPBROADCASTB instruction.
3201 ///
3202 /// \param __X
3203 ///    A 128-bit integer vector whose low byte will be broadcast.
3204 /// \returns A 128-bit integer vector containing the result.
3205 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3206 _mm_broadcastb_epi8(__m128i __X) {
3207   __v16qi __src = (__v16qi)__X; /* cast once; feeds both shuffle operands */
3208   return (__m128i)__builtin_shufflevector(__src, __src, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3209 }
3210 
3211 /// Broadcasts the low element from the 128-bit vector of [8 x i16] in
3212 ///    \a __X to all elements of the result's 128-bit vector of [8 x i16].
3213 ///
3214 /// \headerfile <immintrin.h>
3215 ///
3216 /// This intrinsic corresponds to the \c VPBROADCASTW instruction.
3217 ///
3218 /// \param __X
3219 ///    A 128-bit vector of [8 x i16] whose low element will be broadcast.
3220 /// \returns A 128-bit vector of [8 x i16] containing the result.
3221 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3222 _mm_broadcastw_epi16(__m128i __X) {
3223   __v8hi __src = (__v8hi)__X; /* cast once; feeds both shuffle operands */
3224   return (__m128i)__builtin_shufflevector(__src, __src, 0, 0, 0, 0, 0, 0, 0, 0);
3225 }
3226 
3227 /// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
3228 ///    to all elements of the result's 128-bit vector of [4 x i32].
3229 ///
3230 /// \headerfile <immintrin.h>
3231 ///
3232 /// This intrinsic corresponds to the \c VPBROADCASTD instruction.
3233 ///
3234 /// \param __X
3235 ///    A 128-bit vector of [4 x i32] whose low element will be broadcast.
3236 /// \returns A 128-bit vector of [4 x i32] containing the result.
3237 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3238 _mm_broadcastd_epi32(__m128i __X) {
3239   __v4si __src = (__v4si)__X; /* cast once; feeds both shuffle operands */
3240   return (__m128i)__builtin_shufflevector(__src, __src, 0, 0, 0, 0);
3241 }
3242 
3243 /// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
3244 ///    to both elements of the result's 128-bit vector of [2 x i64].
3245 ///
3246 /// \headerfile <immintrin.h>
3247 ///
3248 /// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
3249 ///
3250 /// \param __X
3251 ///    A 128-bit vector of [2 x i64] whose low element will be broadcast.
3252 /// \returns A 128-bit vector of [2 x i64] containing the result.
3253 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3254 _mm_broadcastq_epi64(__m128i __X) {
3255   __v2di __src = (__v2di)__X; /* cast once; feeds both shuffle operands */
3256   return (__m128i)__builtin_shufflevector(__src, __src, 0, 0);
3257 }
3258 
3259 /// Sets the result's 256-bit vector of [8 x i32] to copies of elements of the
3260 ///    256-bit vector of [8 x i32] in \a __a as specified by indexes in the
3261 ///    elements of the 256-bit vector of [8 x i32] in \a __b.
3262 ///
3263 /// \code{.operation}
3264 /// FOR i := 0 TO 7
3265 ///   j := i*32
3266 ///   k := __b[j+2:j] * 32
3267 ///   result[j+31:j] := __a[k+31:k]
3268 /// ENDFOR
3269 /// \endcode
3270 ///
3271 /// \headerfile <immintrin.h>
3272 ///
3273 /// This intrinsic corresponds to the \c VPERMD instruction.
3274 ///
3275 /// \param __a
3276 ///    A 256-bit vector of [8 x i32] containing the source values.
3277 /// \param __b
3278 ///    A 256-bit vector of [8 x i32] containing indexes of values to use from
3279 ///    \a __a.
3280 /// \returns A 256-bit vector of [8 x i32] containing the result.
3281 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3282 _mm256_permutevar8x32_epi32(__m256i __a, __m256i __b) {
3283   __v8si __idx = (__v8si)__b; /* per-element source indexes */
3284   return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, __idx);
3285 }
3286 
3287 /// Sets the result's 256-bit vector of [4 x double] to copies of elements of
3288 ///    the 256-bit vector of [4 x double] in \a V as specified by the
3289 ///    immediate value \a M.
3290 ///
3291 /// \code{.operation}
3292 /// FOR i := 0 TO 3
3293 ///   j := i*64
3294 ///   k := (M >> i*2)[1:0] * 64
3295 ///   result[j+63:j] := V[k+63:k]
3296 /// ENDFOR
3297 /// \endcode
3298 ///
3299 /// \headerfile <immintrin.h>
3300 ///
3301 /// \code
3302 /// __m256d _mm256_permute4x64_pd(__m256d V, const int M);
3303 /// \endcode
3304 ///
3305 /// This intrinsic corresponds to the \c VPERMPD instruction.
3306 ///
3307 /// \param V
3308 ///    A 256-bit vector of [4 x double] containing the source values.
3309 /// \param M
3310 ///    An immediate 8-bit value specifying which elements to copy from \a V.
3311 ///    \a M[1:0] specifies the index in \a V for element 0 of the result,
3312 ///    \a M[3:2] specifies the index for element 1, and so forth.
3313 /// \returns A 256-bit vector of [4 x double] containing the result.
3314 #define _mm256_permute4x64_pd(V, M)                                            \
3315   ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M)))
3316 
3317 /// Sets the result's 256-bit vector of [8 x float] to copies of elements of
3318 ///    the 256-bit vector of [8 x float] in \a __a as specified by indexes in
3319 ///    the elements of the 256-bit vector of [8 x i32] in \a __b.
3320 ///
3321 /// \code{.operation}
3322 /// FOR i := 0 TO 7
3323 ///   j := i*32
3324 ///   k := __b[j+2:j] * 32
3325 ///   result[j+31:j] := __a[k+31:k]
3326 /// ENDFOR
3327 /// \endcode
3328 ///
3329 /// \headerfile <immintrin.h>
3330 ///
3331 /// This intrinsic corresponds to the \c VPERMPS instruction.
3332 ///
3333 /// \param __a
3334 ///    A 256-bit vector of [8 x float] containing the source values.
3335 /// \param __b
3336 ///    A 256-bit vector of [8 x i32] containing indexes of values to use from
3337 ///    \a __a.
3338 /// \returns A 256-bit vector of [8 x float] containing the result.
3339 static __inline__ __m256 __DEFAULT_FN_ATTRS256
3340 _mm256_permutevar8x32_ps(__m256 __a, __m256i __b) {
3341   __v8si __idx = (__v8si)__b; /* per-element source indexes */
3342   return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, __idx);
3343 }
3344 
3345 /// Sets the result's 256-bit vector of [4 x i64] to copies of elements
3346 ///    of the 256-bit vector of [4 x i64] in \a V as specified by the
3347 ///    immediate value \a M.
3348 ///
3349 /// \code{.operation}
3350 /// FOR i := 0 TO 3
3351 ///   j := i*64
3352 ///   k := (M >> i*2)[1:0] * 64
3353 ///   result[j+63:j] := V[k+63:k]
3354 /// ENDFOR
3355 /// \endcode
3356 ///
3357 /// \headerfile <immintrin.h>
3358 ///
3359 /// \code
3360 /// __m256i _mm256_permute4x64_epi64(__m256i V, const int M);
3361 /// \endcode
3362 ///
3363 /// This intrinsic corresponds to the \c VPERMQ instruction.
3364 ///
3365 /// \param V
3366 ///    A 256-bit vector of [4 x i64] containing the source values.
3367 /// \param M
3368 ///    An immediate 8-bit value specifying which elements to copy from \a V.
3369 ///    \a M[1:0] specifies the index in \a V for element 0 of the result,
3370 ///    \a M[3:2] specifies the index for element 1, and so forth.
3371 /// \returns A 256-bit vector of [4 x i64] containing the result.
3372 #define _mm256_permute4x64_epi64(V, M)                                         \
3373   ((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M)))
3374 
3375 /// Sets each half of the 256-bit result either to zero or to one of the
3376 ///    four possible 128-bit halves of the 256-bit vectors \a V1 and \a V2,
3377 ///    as specified by the immediate value \a M.
3378 ///
3379 /// \code{.operation}
3380 /// FOR i := 0 TO 1
3381 ///   j := i*128
3382 ///   k := M >> (i*4)
3383 ///   IF k[3] == 0
3384 ///     CASE (k[1:0]) OF
3385 ///     0: result[127+j:j] := V1[127:0]
3386 ///     1: result[127+j:j] := V1[255:128]
3387 ///     2: result[127+j:j] := V2[127:0]
3388 ///     3: result[127+j:j] := V2[255:128]
3389 ///     ESAC
3390 ///   ELSE
3391 ///     result[127+j:j] := 0
3392 ///   FI
3393 /// ENDFOR
3394 /// \endcode
3395 ///
3396 /// \headerfile <immintrin.h>
3397 ///
3398 /// \code
3399 /// __m256i _mm256_permute2x128_si256(__m256i V1, __m256i V2, const int M);
3400 /// \endcode
3401 ///
3402 /// This intrinsic corresponds to the \c VPERM2I128 instruction.
3403 ///
3404 /// \param V1
3405 ///    A 256-bit integer vector containing source values.
3406 /// \param V2
3407 ///    A 256-bit integer vector containing source values.
3408 /// \param M
3409 ///    An immediate value specifying how to form the result. Bits [3:0]
3410 ///    control the lower half of the result, bits [7:4] control the upper half.
3411 ///    Within each 4-bit control value, if bit 3 is 1, the result is zero,
3412 ///    otherwise bits [1:0] determine the source as follows. \n
3413 ///    0: the lower half of \a V1 \n
3414 ///    1: the upper half of \a V1 \n
3415 ///    2: the lower half of \a V2 \n
3416 ///    3: the upper half of \a V2
3417 /// \returns A 256-bit integer vector containing the result.
3418 #define _mm256_permute2x128_si256(V1, V2, M)                                   \
3419   ((__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (int)(M)))
3420 
3421 /// Extracts half of the 256-bit vector \a V to the 128-bit result. If bit 0
3422 ///     of the immediate \a M is zero, extracts the lower half of \a V;
3423 ///     otherwise, extracts the upper half.
3424 ///
3425 /// \headerfile <immintrin.h>
3426 ///
3427 /// \code
3428 /// __m128i _mm256_extracti128_si256(__m256i V, const int M);
3429 /// \endcode
3430 ///
3431 /// This intrinsic corresponds to the \c VEXTRACTI128 instruction.
3432 ///
3433 /// \param V
3434 ///    A 256-bit integer vector containing the source values.
3435 /// \param M
3436 ///    An immediate value specifying which half of \a V to extract.
3437 /// \returns A 128-bit integer vector containing the result.
3438 #define _mm256_extracti128_si256(V, M)                                         \
3439   ((__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M)))
3440 
3441 /// Copies the 256-bit vector \a V1 to the result, then overwrites half of the
3442 ///     result with the 128-bit vector \a V2. If bit 0 of the immediate \a M
3443 ///     is zero, overwrites the lower half of the result; otherwise,
3444 ///     overwrites the upper half.
3445 ///
3446 /// \headerfile <immintrin.h>
3447 ///
3448 /// \code
3449 /// __m256i _mm256_inserti128_si256(__m256i V1, __m128i V2, const int M);
3450 /// \endcode
3451 ///
3452 /// This intrinsic corresponds to the \c VINSERTI128 instruction.
3453 ///
3454 /// \param V1
3455 ///    A 256-bit integer vector containing a source value.
3456 /// \param V2
3457 ///    A 128-bit integer vector containing a source value.
3458 /// \param M
3459 ///    An immediate value specifying where to put \a V2 in the result.
3460 /// \returns A 256-bit integer vector containing the result.
3461 #define _mm256_inserti128_si256(V1, V2, M)                                     \
3462   ((__m256i)__builtin_ia32_insert128i256(                                      \
3463       (__v4di)(__m256i)(V1), (__v2di)(__m128i)(V2), (int)(M)))
3464 
3465 /// Conditionally loads eight 32-bit integer elements from memory \a __X, if
3466 ///    the most significant bit of the corresponding element in the mask
3467 ///    \a __M is set; otherwise, sets that element of the result to zero.
3468 ///    Returns the 256-bit [8 x i32] result.
3469 ///
3470 /// \code{.operation}
3471 /// FOR i := 0 TO 7
3472 ///   j := i*32
3473 ///   IF __M[j+31] == 1
3474 ///     result[j+31:j] := Load32(__X+(i*4))
3475 ///   ELSE
3476 ///     result[j+31:j] := 0
3477 ///   FI
3478 /// ENDFOR
3479 /// \endcode
3480 ///
3481 /// \headerfile <immintrin.h>
3482 ///
3483 /// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3484 ///
3485 /// \param __X
3486 ///    A pointer to the memory used for loading values.
3487 /// \param __M
3488 ///    A 256-bit vector of [8 x i32] containing the mask bits.
3489 /// \returns A 256-bit vector of [8 x i32] containing the loaded or zeroed
3490 ///    elements.
3491 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3492 _mm256_maskload_epi32(int const *__X, __m256i __M) {
3493   const __v8si *__mem = (const __v8si *)__X; /* builtin takes a vector pointer */
3494   return (__m256i)__builtin_ia32_maskloadd256(__mem, (__v8si)__M);
3495 }
3496 
3497 /// Conditionally loads four 64-bit integer elements from memory \a __X, if
3498 ///    the most significant bit of the corresponding element in the mask
3499 ///    \a __M is set; otherwise, sets that element of the result to zero.
3500 ///    Returns the 256-bit [4 x i64] result.
3501 ///
3502 /// \code{.operation}
3503 /// FOR i := 0 TO 3
3504 ///   j := i*64
3505 ///   IF __M[j+63] == 1
3506 ///     result[j+63:j] := Load64(__X+(i*8))
3507 ///   ELSE
3508 ///     result[j+63:j] := 0
3509 ///   FI
3510 /// ENDFOR
3511 /// \endcode
3512 ///
3513 /// \headerfile <immintrin.h>
3514 ///
3515 /// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3516 ///
3517 /// \param __X
3518 ///    A pointer to the memory used for loading values.
3519 /// \param __M
3520 ///    A 256-bit vector of [4 x i64] containing the mask bits.
3521 /// \returns A 256-bit vector of [4 x i64] containing the loaded or zeroed
3522 ///    elements.
3523 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3524 _mm256_maskload_epi64(long long const *__X, __m256i __M) {
3525   const __v4di *__mem = (const __v4di *)__X; /* builtin takes a vector pointer */
3526   return (__m256i)__builtin_ia32_maskloadq256(__mem, (__v4di)__M);
3527 }
3528 
3529 /// Conditionally loads four 32-bit integer elements from memory \a __X, if
3530 ///    the most significant bit of the corresponding element in the mask
3531 ///    \a __M is set; otherwise, sets that element of the result to zero.
3532 ///    Returns the 128-bit [4 x i32] result.
3533 ///
3534 /// \code{.operation}
3535 /// FOR i := 0 TO 3
3536 ///   j := i*32
3537 ///   IF __M[j+31] == 1
3538 ///     result[j+31:j] := Load32(__X+(i*4))
3539 ///   ELSE
3540 ///     result[j+31:j] := 0
3541 ///   FI
3542 /// ENDFOR
3543 /// \endcode
3544 ///
3545 /// \headerfile <immintrin.h>
3546 ///
3547 /// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3548 ///
3549 /// \param __X
3550 ///    A pointer to the memory used for loading values.
3551 /// \param __M
3552 ///    A 128-bit vector of [4 x i32] containing the mask bits.
3553 /// \returns A 128-bit vector of [4 x i32] containing the loaded or zeroed
3554 ///    elements.
3555 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3556 _mm_maskload_epi32(int const *__X, __m128i __M) {
3557   const __v4si *__mem = (const __v4si *)__X; /* builtin takes a vector pointer */
3558   return (__m128i)__builtin_ia32_maskloadd(__mem, (__v4si)__M);
3559 }
3560 
3561 /// Conditionally loads two 64-bit integer elements from memory \a __X, if
3562 ///    the most significant bit of the corresponding element in the mask
3563 ///    \a __M is set; otherwise, sets that element of the result to zero.
3564 ///    Returns the 128-bit [2 x i64] result.
3565 ///
3566 /// \code{.operation}
3567 /// FOR i := 0 TO 1
3568 ///   j := i*64
3569 ///   IF __M[j+63] == 1
3570 ///     result[j+63:j] := Load64(__X+(i*8))
3571 ///   ELSE
3572 ///     result[j+63:j] := 0
3573 ///   FI
3574 /// ENDFOR
3575 /// \endcode
3576 ///
3577 /// \headerfile <immintrin.h>
3578 ///
3579 /// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3580 ///
3581 /// \param __X
3582 ///    A pointer to the memory used for loading values.
3583 /// \param __M
3584 ///    A 128-bit vector of [2 x i64] containing the mask bits.
3585 /// \returns A 128-bit vector of [2 x i64] containing the loaded or zeroed
3586 ///    elements.
3587 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3588 _mm_maskload_epi64(long long const *__X, __m128i __M) {
3589   const __v2di *__mem = (const __v2di *)__X; /* builtin takes a vector pointer */
3590   return (__m128i)__builtin_ia32_maskloadq(__mem, (__v2di)__M);
3591 }
3592 
3593 /// Conditionally stores eight 32-bit integer elements from the 256-bit vector
3594 ///    of [8 x i32] in \a __Y to memory \a __X, if the most significant bit of
3595 ///    the corresponding element in the mask \a __M is set; otherwise, the
3596 ///    memory element is unchanged.
3597 ///
3598 /// \code{.operation}
3599 /// FOR i := 0 TO 7
3600 ///   j := i*32
3601 ///   IF __M[j+31] == 1
3602 ///     Store32(__X+(i*4), __Y[j+31:j])
3603 ///   FI
3604 /// ENDFOR
3605 /// \endcode
3606 ///
3607 /// \headerfile <immintrin.h>
3608 ///
3609 /// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3610 ///
3611 /// \param __X
3612 ///    A pointer to the memory used for storing values.
3613 /// \param __M
3614 ///    A 256-bit vector of [8 x i32] containing the mask bits.
3615 /// \param __Y
3616 ///    A 256-bit vector of [8 x i32] containing the values to store.
3617 static __inline__ void __DEFAULT_FN_ATTRS256
3618 _mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y) {
3619   __v8si *__mem = (__v8si *)__X; /* builtin takes a vector pointer */
3620   __builtin_ia32_maskstored256(__mem, (__v8si)__M, (__v8si)__Y);
3621 }
3622 
3623 /// Conditionally stores four 64-bit integer elements from the 256-bit vector
3624 ///    of [4 x i64] in \a __Y to memory \a __X, if the most significant bit of
3625 ///    the corresponding element in the mask \a __M is set; otherwise, the
3626 ///    memory element is unchanged.
3627 ///
3628 /// \code{.operation}
3629 /// FOR i := 0 TO 3
3630 ///   j := i*64
3631 ///   IF __M[j+63] == 1
3632 ///     Store64(__X+(i*8), __Y[j+63:j])
3633 ///   FI
3634 /// ENDFOR
3635 /// \endcode
3636 ///
3637 /// \headerfile <immintrin.h>
3638 ///
3639 /// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3640 ///
3641 /// \param __X
3642 ///    A pointer to the memory used for storing values.
3643 /// \param __M
3644 ///    A 256-bit vector of [4 x i64] containing the mask bits.
3645 /// \param __Y
3646 ///    A 256-bit vector of [4 x i64] containing the values to store.
3647 static __inline__ void __DEFAULT_FN_ATTRS256
3648 _mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y) {
3649   __v4di *__mem = (__v4di *)__X; /* builtin takes a vector pointer */
3650   __builtin_ia32_maskstoreq256(__mem, (__v4di)__M, (__v4di)__Y);
3651 }
3652 
3653 /// Conditionally stores four 32-bit integer elements from the 128-bit vector
3654 ///    of [4 x i32] in \a __Y to memory \a __X, if the most significant bit of
3655 ///    the corresponding element in the mask \a __M is set; otherwise, the
3656 ///    memory element is unchanged.
3657 ///
3658 /// \code{.operation}
3659 /// FOR i := 0 TO 3
3660 ///   j := i*32
3661 ///   IF __M[j+31] == 1
3662 ///     Store32(__X+(i*4), __Y[j+31:j])
3663 ///   FI
3664 /// ENDFOR
3665 /// \endcode
3666 ///
3667 /// \headerfile <immintrin.h>
3668 ///
3669 /// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3670 ///
3671 /// \param __X
3672 ///    A pointer to the memory used for storing values.
3673 /// \param __M
3674 ///    A 128-bit vector of [4 x i32] containing the mask bits.
3675 /// \param __Y
3676 ///    A 128-bit vector of [4 x i32] containing the values to store.
3677 static __inline__ void __DEFAULT_FN_ATTRS128
3678 _mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y) {
3679   __v4si *__mem = (__v4si *)__X; /* builtin takes a vector pointer */
3680   __builtin_ia32_maskstored(__mem, (__v4si)__M, (__v4si)__Y);
3681 }
3682 
3683 /// Conditionally stores two 64-bit integer elements from the 128-bit vector
3684 ///    of [2 x i64] in \a __Y to memory \a __X, if the most significant bit of
3685 ///    the corresponding element in the mask \a __M is set; otherwise, the
3686 ///    memory element is unchanged.
3687 ///
3688 /// \code{.operation}
3689 /// FOR i := 0 TO 1
3690 ///   j := i*64
3691 ///   IF __M[j+63] == 1
3692 ///     Store64(__X+(i*8), __Y[j+63:j])
3693 ///   FI
3694 /// ENDFOR
3695 /// \endcode
3696 ///
3697 /// \headerfile <immintrin.h>
3698 ///
3699 /// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3700 ///
3701 /// \param __X
3702 ///    A pointer to the memory used for storing values.
3703 /// \param __M
3704 ///    A 128-bit vector of [2 x i64] containing the mask bits.
3705 /// \param __Y
3706 ///    A 128-bit vector of [2 x i64] containing the values to store.
3707 static __inline__ void __DEFAULT_FN_ATTRS128
3708 _mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y) {
3709   __v2di *__mem = (__v2di *)__X; /* builtin takes a vector pointer */
3710   __builtin_ia32_maskstoreq(__mem, (__v2di)__M, (__v2di)__Y);
3711 }
3712 
3713 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3714 ///    left by the number of bits given in the corresponding element of the
3715 ///    256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
3716 ///    returns the result. If the shift count for any element is greater than
3717 ///    31, the result for that element is zero.
3718 ///
3719 /// \headerfile <immintrin.h>
3720 ///
3721 /// This intrinsic corresponds to the \c VPSLLVD instruction.
3722 ///
3723 /// \param __X
3724 ///    A 256-bit vector of [8 x i32] to be shifted.
3725 /// \param __Y
3726 ///    A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3727 ///    bits).
3728 /// \returns A 256-bit vector of [8 x i32] containing the result.
3729 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3730 _mm256_sllv_epi32(__m256i __X, __m256i __Y) {
3731   __v8si __cnt = (__v8si)__Y; /* per-element shift counts */
3732   return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, __cnt);
3733 }
3734 
3735 /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3736 ///    left by the number of bits given in the corresponding element of the
3737 ///    128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
3738 ///    returns the result. If the shift count for any element is greater than
3739 ///    31, the result for that element is zero.
3740 ///
3741 /// \headerfile <immintrin.h>
3742 ///
3743 /// This intrinsic corresponds to the \c VPSLLVD instruction.
3744 ///
3745 /// \param __X
3746 ///    A 128-bit vector of [4 x i32] to be shifted.
3747 /// \param __Y
3748 ///    A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3749 ///    bits).
3750 /// \returns A 128-bit vector of [4 x i32] containing the result.
3751 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3752 _mm_sllv_epi32(__m128i __X, __m128i __Y) {
3753   __v4si __cnt = (__v4si)__Y; /* per-element shift counts */
3754   return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, __cnt);
3755 }
3756 
3757 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
3758 ///    left by the number of bits given in the corresponding element of the
3759 ///    256-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
3760 ///    returns the result. If the shift count for any element is greater than
3761 ///    63, the result for that element is zero.
3762 ///
3763 /// \headerfile <immintrin.h>
3764 ///
3765 /// This intrinsic corresponds to the \c VPSLLVQ instruction.
3766 ///
3767 /// \param __X
3768 ///    A 256-bit vector of [4 x i64] to be shifted.
3769 /// \param __Y
3770 ///    A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
3771 ///    bits).
3772 /// \returns A 256-bit vector of [4 x i64] containing the result.
3773 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3774 _mm256_sllv_epi64(__m256i __X, __m256i __Y) {
3775   __v4di __cnt = (__v4di)__Y; /* per-element shift counts */
3776   return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, __cnt);
3777 }
3778 
3779 /// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
3780 ///    left by the number of bits given in the corresponding element of the
3781 ///    128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
3782 ///    returns the result. If the shift count for any element is greater than
3783 ///    63, the result for that element is zero.
3784 ///
3785 /// \headerfile <immintrin.h>
3786 ///
3787 /// This intrinsic corresponds to the \c VPSLLVQ instruction.
3788 ///
3789 /// \param __X
3790 ///    A 128-bit vector of [2 x i64] to be shifted.
3791 /// \param __Y
3792 ///    A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
3793 ///    bits).
3794 /// \returns A 128-bit vector of [2 x i64] containing the result.
3795 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3796 _mm_sllv_epi64(__m128i __X, __m128i __Y) {
3797   __v2di __cnt = (__v2di)__Y; /* per-element shift counts */
3798   return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, __cnt);
3799 }
3800 
3801 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3802 ///    right by the number of bits given in the corresponding element of the
3803 ///    256-bit vector of [8 x i32] in \a __Y, shifting in sign bits, and
3804 ///    returns the result. If the shift count for any element is greater than
3805 ///    31, the result for that element is 0 or -1 according to the sign bit
3806 ///    for that element.
3807 ///
3808 /// \headerfile <immintrin.h>
3809 ///
3810 /// This intrinsic corresponds to the \c VPSRAVD instruction.
3811 ///
3812 /// \param __X
3813 ///    A 256-bit vector of [8 x i32] to be shifted.
3814 /// \param __Y
3815 ///    A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3816 ///    bits).
3817 /// \returns A 256-bit vector of [8 x i32] containing the result.
3818 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3819 _mm256_srav_epi32(__m256i __X, __m256i __Y) {
3820   __v8si __cnt = (__v8si)__Y; /* per-element shift counts */
3821   return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, __cnt);
3822 }
3823 
3824 /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3825 ///    right by the number of bits given in the corresponding element of the
3826 ///    128-bit vector of [4 x i32] in \a __Y, shifting in sign bits, and
3827 ///    returns the result. If the shift count for any element is greater than
3828 ///    31, the result for that element is 0 or -1 according to the sign bit
3829 ///    for that element.
3830 ///
3831 /// \headerfile <immintrin.h>
3832 ///
3833 /// This intrinsic corresponds to the \c VPSRAVD instruction.
3834 ///
3835 /// \param __X
3836 ///    A 128-bit vector of [4 x i32] to be shifted.
3837 /// \param __Y
3838 ///    A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3839 ///    bits).
3840 /// \returns A 128-bit vector of [4 x i32] containing the result.
3841 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3842 _mm_srav_epi32(__m128i __X, __m128i __Y) {
3843   __v4si __cnt = (__v4si)__Y; /* per-element shift counts */
3844   return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, __cnt);
3845 }
3846 
3847 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3848 ///    right by the number of bits given in the corresponding element of the
3849 ///    256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
3850 ///    returns the result. If the shift count for any element is greater than
3851 ///    31, the result for that element is zero.
3852 ///
3853 /// \headerfile <immintrin.h>
3854 ///
3855 /// This intrinsic corresponds to the \c VPSRLVD instruction.
3856 ///
3857 /// \param __X
3858 ///    A 256-bit vector of [8 x i32] to be shifted.
3859 /// \param __Y
3860 ///    A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3861 ///    bits).
3862 /// \returns A 256-bit vector of [8 x i32] containing the result.
3863 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3864 _mm256_srlv_epi32(__m256i __X, __m256i __Y) {
3865   __v8si __cnt = (__v8si)__Y; /* per-element shift counts */
3866   return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, __cnt);
3867 }
3868 
3869 /// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3870 ///    right by the number of bits given in the corresponding element of the
3871 ///    128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
3872 ///    returns the result. If the shift count for any element is greater than
3873 ///    31, the result for that element is zero.
3874 ///
3875 /// \headerfile <immintrin.h>
3876 ///
3877 /// This intrinsic corresponds to the \c VPSRLVD instruction.
3878 ///
3879 /// \param __X
3880 ///    A 128-bit vector of [4 x i32] to be shifted.
3881 /// \param __Y
3882 ///    A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3883 ///    bits).
3884 /// \returns A 128-bit vector of [4 x i32] containing the result.
3885 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3886 _mm_srlv_epi32(__m128i __X, __m128i __Y) {
3887   __v4si __cnt = (__v4si)__Y; /* per-element shift counts */
3888   return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, __cnt);
3889 }
3890 
3891 /// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
3892 ///    right by the number of bits given in the corresponding element of the
3893 ///    256-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
3894 ///    returns the result. If the shift count for any element is greater than
3895 ///    63, the result for that element is zero.
3896 ///
3897 /// \headerfile <immintrin.h>
3898 ///
3899 /// This intrinsic corresponds to the \c VPSRLVQ instruction.
3900 ///
3901 /// \param __X
3902 ///    A 256-bit vector of [4 x i64] to be shifted.
3903 /// \param __Y
3904 ///    A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
3905 ///    bits).
3906 /// \returns A 256-bit vector of [4 x i64] containing the result.
3907 static __inline__ __m256i __DEFAULT_FN_ATTRS256
3908 _mm256_srlv_epi64(__m256i __X, __m256i __Y) {
3909   __v4di __cnt = (__v4di)__Y; /* per-element shift counts */
3910   return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, __cnt);
3911 }
3912 
3913 /// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
3914 ///    right by the number of bits given in the corresponding element of the
3915 ///    128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
3916 ///    returns the result. If the shift count for any element is greater than
3917 ///    63, the result for that element is zero.
3918 ///
3919 /// \headerfile <immintrin.h>
3920 ///
3921 /// This intrinsic corresponds to the \c VPSRLVQ instruction.
3922 ///
3923 /// \param __X
3924 ///    A 128-bit vector of [2 x i64] to be shifted.
3925 /// \param __Y
3926 ///    A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
3927 ///    bits).
3928 /// \returns A 128-bit vector of [2 x i64] containing the result.
3929 static __inline__ __m128i __DEFAULT_FN_ATTRS128
3930 _mm_srlv_epi64(__m128i __X, __m128i __Y) {
3931   __v2di __cnt = (__v2di)__Y; /* per-element shift counts */
3932   return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, __cnt);
3933 }
3934 
/// Conditionally gathers two 64-bit floating-point values, either from the
///    128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
///    of [2 x double] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128d _mm_mask_i32gather_pd(__m128d a, const double *m, __m128i i,
///                               __m128d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param a
///    A 128-bit vector of [2 x double] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
///    the first two elements are used.
/// \param mask
///    A 128-bit vector of [2 x double] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is gathered; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
#define _mm_mask_i32gather_pd(a, m, i, mask, s) \
  ((__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128d)(a), \
                                      (double const *)(m), \
                                      (__v4si)(__m128i)(i), \
                                      (__v2df)(__m128d)(mask), (s)))
3983 
/// Assembles a 256-bit vector of [4 x double] element by element. Each
///    64-bit floating-point result is either copied from the 256-bit vector
///    of [4 x double] in \a a or loaded from memory \a m at a scaled index
///    taken from the 128-bit vector of [4 x i32] in \a i; the 256-bit vector
///    of [4 x double] in \a mask selects between the two per element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_mask_i32gather_pd(__m256d a, const double *m, __m128i i,
///                                  __m256d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param a
///    A 256-bit vector of [4 x double] supplying the result element wherever
///    the mask bit is zero.
/// \param m
///    The base pointer for the memory loads.
/// \param i
///    A 128-bit vector of [4 x i32] holding the signed indexes applied to
///    \a m.
/// \param mask
///    A 256-bit vector of [4 x double] serving as the mask. Only the most
///    significant bit of each element is examined: a zero bit takes the
///    element from \a a, a one bit loads it from memory.
/// \param s
///    A literal constant that scales each index in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x double] holding the selected values.
#define _mm256_mask_i32gather_pd(a, m, i, mask, s) \
  ((__m256d)__builtin_ia32_gatherd_pd256( \
      (__v4df)(__m256d)(a), (double const *)(m), (__v4si)(__m128i)(i), \
      (__v4df)(__m256d)(mask), (s)))
4031 
/// Assembles a 128-bit vector of [2 x double] element by element. Each
///    64-bit floating-point result is either copied from the 128-bit vector
///    of [2 x double] in \a a or loaded from memory \a m at a scaled index
///    taken from the 128-bit vector of [2 x i64] in \a i; the 128-bit vector
///    of [2 x double] in \a mask selects between the two per element.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128d _mm_mask_i64gather_pd(__m128d a, const double *m, __m128i i,
///                               __m128d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
///
/// \param a
///    A 128-bit vector of [2 x double] supplying the result element wherever
///    the mask bit is zero.
/// \param m
///    The base pointer for the memory loads.
/// \param i
///    A 128-bit vector of [2 x i64] holding the signed indexes applied to
///    \a m.
/// \param mask
///    A 128-bit vector of [2 x double] serving as the mask. Only the most
///    significant bit of each element is examined: a zero bit takes the
///    element from \a a, a one bit loads it from memory.
/// \param s
///    A literal constant that scales each index in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] holding the selected values.
#define _mm_mask_i64gather_pd(a, m, i, mask, s) \
  ((__m128d)__builtin_ia32_gatherq_pd( \
      (__v2df)(__m128d)(a), (double const *)(m), (__v2di)(__m128i)(i), \
      (__v2df)(__m128d)(mask), (s)))
4079 
/// Assembles a 256-bit vector of [4 x double] element by element. Each
///    64-bit floating-point result is either copied from the 256-bit vector
///    of [4 x double] in \a a or loaded from memory \a m at a scaled index
///    taken from the 256-bit vector of [4 x i64] in \a i; the 256-bit vector
///    of [4 x double] in \a mask selects between the two per element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_mask_i64gather_pd(__m256d a, const double *m, __m256i i,
///                                  __m256d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPD instruction.
///
/// \param a
///    A 256-bit vector of [4 x double] supplying the result element wherever
///    the mask bit is zero.
/// \param m
///    The base pointer for the memory loads.
/// \param i
///    A 256-bit vector of [4 x i64] holding the signed indexes applied to
///    \a m.
/// \param mask
///    A 256-bit vector of [4 x double] serving as the mask. Only the most
///    significant bit of each element is examined: a zero bit takes the
///    element from \a a, a one bit loads it from memory.
/// \param s
///    A literal constant that scales each index in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x double] holding the selected values.
#define _mm256_mask_i64gather_pd(a, m, i, mask, s) \
  ((__m256d)__builtin_ia32_gatherq_pd256( \
      (__v4df)(__m256d)(a), (double const *)(m), (__v4di)(__m256i)(i), \
      (__v4df)(__m256d)(mask), (s)))
4127 
/// Assembles a 128-bit vector of [4 x float] element by element. Each
///    32-bit floating-point result is either copied from the 128-bit vector
///    of [4 x float] in \a a or loaded from memory \a m at a scaled index
///    taken from the 128-bit vector of [4 x i32] in \a i; the 128-bit vector
///    of [4 x float] in \a mask selects between the two per element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm_mask_i32gather_ps(__m128 a, const float *m, __m128i i,
///                              __m128 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float] supplying the result element wherever
///    the mask bit is zero.
/// \param m
///    The base pointer for the memory loads.
/// \param i
///    A 128-bit vector of [4 x i32] holding the signed indexes applied to
///    \a m.
/// \param mask
///    A 128-bit vector of [4 x float] serving as the mask. Only the most
///    significant bit of each element is examined: a zero bit takes the
///    element from \a a, a one bit loads it from memory.
/// \param s
///    A literal constant that scales each index in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] holding the selected values.
#define _mm_mask_i32gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherd_ps( \
      (__v4sf)(__m128)(a), (float const *)(m), (__v4si)(__m128i)(i), \
      (__v4sf)(__m128)(mask), (s)))
4175 
/// Assembles a 256-bit vector of [8 x float] element by element. Each
///    32-bit floating-point result is either copied from the 256-bit vector
///    of [8 x float] in \a a or loaded from memory \a m at a scaled index
///    taken from the 256-bit vector of [8 x i32] in \a i; the 256-bit vector
///    of [8 x float] in \a mask selects between the two per element.
///
/// \code{.operation}
/// FOR element := 0 to 7
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256 _mm256_mask_i32gather_ps(__m256 a, const float *m, __m256i i,
///                                 __m256 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPS instruction.
///
/// \param a
///    A 256-bit vector of [8 x float] supplying the result element wherever
///    the mask bit is zero.
/// \param m
///    The base pointer for the memory loads.
/// \param i
///    A 256-bit vector of [8 x i32] holding the signed indexes applied to
///    \a m.
/// \param mask
///    A 256-bit vector of [8 x float] serving as the mask. Only the most
///    significant bit of each element is examined: a zero bit takes the
///    element from \a a, a one bit loads it from memory.
/// \param s
///    A literal constant that scales each index in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [8 x float] holding the selected values.
#define _mm256_mask_i32gather_ps(a, m, i, mask, s) \
  ((__m256)__builtin_ia32_gatherd_ps256( \
      (__v8sf)(__m256)(a), (float const *)(m), (__v8si)(__m256i)(i), \
      (__v8sf)(__m256)(mask), (s)))
4223 
/// Assembles the lower two elements of a 128-bit vector of [4 x float]. Each
///    of these 32-bit floating-point results is either copied from the
///    128-bit vector of [4 x float] in \a a or loaded from memory \a m at a
///    scaled index taken from the 128-bit vector of [2 x i64] in \a i; the
///    128-bit vector of [4 x float] in \a mask selects between the two per
///    element. The upper two elements of the result are set to zero.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*32
///   k := element*64
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// result[127:64] := 0
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm_mask_i64gather_ps(__m128 a, const float *m, __m128i i,
///                              __m128 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float] supplying the result element wherever
///    the mask bit is zero. Only the first two elements are used.
/// \param m
///    The base pointer for the memory loads.
/// \param i
///    A 128-bit vector of [2 x i64] holding the signed indexes applied to
///    \a m.
/// \param mask
///    A 128-bit vector of [4 x float] serving as the mask. Only the most
///    significant bit of each element is examined: a zero bit takes the
///    element from \a a, a one bit loads it from memory. Only the first two
///    elements are used.
/// \param s
///    A literal constant that scales each index in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] holding the selected values.
#define _mm_mask_i64gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherq_ps( \
      (__v4sf)(__m128)(a), (float const *)(m), (__v2di)(__m128i)(i), \
      (__v4sf)(__m128)(mask), (s)))
4274 
/// Assembles a 128-bit vector of [4 x float] element by element. Each
///    32-bit floating-point result is either copied from the 128-bit vector
///    of [4 x float] in \a a or loaded from memory \a m at a scaled index
///    taken from the 256-bit vector of [4 x i64] in \a i; the 128-bit vector
///    of [4 x float] in \a mask selects between the two per element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*64
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128 _mm256_mask_i64gather_ps(__m128 a, const float *m, __m256i i,
///                                 __m128 mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERQPS instruction.
///
/// \param a
///    A 128-bit vector of [4 x float] supplying the result element wherever
///    the mask bit is zero.
/// \param m
///    The base pointer for the memory loads.
/// \param i
///    A 256-bit vector of [4 x i64] holding the signed indexes applied to
///    \a m.
/// \param mask
///    A 128-bit vector of [4 x float] serving as the mask. Only the most
///    significant bit of each element is examined: a zero bit takes the
///    element from \a a, a one bit loads it from memory.
/// \param s
///    A literal constant that scales each index in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x float] holding the selected values.
#define _mm256_mask_i64gather_ps(a, m, i, mask, s) \
  ((__m128)__builtin_ia32_gatherq_ps256( \
      (__v4sf)(__m128)(a), (float const *)(m), (__v4di)(__m256i)(i), \
      (__v4sf)(__m128)(mask), (s)))
4322 
/// Assembles a 128-bit vector of [4 x i32] element by element. Each 32-bit
///    integer result is either copied from the 128-bit vector of [4 x i32]
///    in \a a or loaded from memory \a m at a scaled index taken from the
///    128-bit vector of [4 x i32] in \a i; the 128-bit vector of [4 x i32]
///    in \a mask selects between the two per element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_mask_i32gather_epi32(__m128i a, const int *m, __m128i i,
///                                  __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDD instruction.
///
/// \param a
///    A 128-bit vector of [4 x i32] supplying the result element wherever
///    the mask bit is zero.
/// \param m
///    The base pointer for the memory loads.
/// \param i
///    A 128-bit vector of [4 x i32] holding the signed indexes applied to
///    \a m.
/// \param mask
///    A 128-bit vector of [4 x i32] serving as the mask. Only the most
///    significant bit of each element is examined: a zero bit takes the
///    element from \a a, a one bit loads it from memory.
/// \param s
///    A literal constant that scales each index in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] holding the selected values.
#define _mm_mask_i32gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherd_d( \
      (__v4si)(__m128i)(a), (int const *)(m), (__v4si)(__m128i)(i), \
      (__v4si)(__m128i)(mask), (s)))
4370 
/// Assembles a 256-bit vector of [8 x i32] element by element. Each 32-bit
///    integer result is either copied from the 256-bit vector of [8 x i32]
///    in \a a or loaded from memory \a m at a scaled index taken from the
///    256-bit vector of [8 x i32] in \a i; the 256-bit vector of [8 x i32]
///    in \a mask selects between the two per element.
///
/// \code{.operation}
/// FOR element := 0 to 7
///   j := element*32
///   k := element*32
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_mask_i32gather_epi32(__m256i a, const int *m, __m256i i,
///                                     __m256i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDD instruction.
///
/// \param a
///    A 256-bit vector of [8 x i32] supplying the result element wherever
///    the mask bit is zero.
/// \param m
///    The base pointer for the memory loads.
/// \param i
///    A 256-bit vector of [8 x i32] holding the signed indexes applied to
///    \a m.
/// \param mask
///    A 256-bit vector of [8 x i32] serving as the mask. Only the most
///    significant bit of each element is examined: a zero bit takes the
///    element from \a a, a one bit loads it from memory.
/// \param s
///    A literal constant that scales each index in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [8 x i32] holding the selected values.
#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherd_d256( \
      (__v8si)(__m256i)(a), (int const *)(m), (__v8si)(__m256i)(i), \
      (__v8si)(__m256i)(mask), (s)))
4418 
/// Assembles the lower two elements of a 128-bit vector of [4 x i32]. Each
///    of these 32-bit integer results is either copied from the 128-bit
///    vector of [4 x i32] in \a a or loaded from memory \a m at a scaled
///    index taken from the 128-bit vector of [2 x i64] in \a i; the 128-bit
///    vector of [4 x i32] in \a mask selects between the two per element.
///    The upper two elements of the result are set to zero.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*32
///   k := element*64
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// result[127:64] := 0
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_mask_i64gather_epi32(__m128i a, const int *m, __m128i i,
///                                  __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQD instruction.
///
/// \param a
///    A 128-bit vector of [4 x i32] supplying the result element wherever
///    the mask bit is zero. Only the first two elements are used.
/// \param m
///    The base pointer for the memory loads.
/// \param i
///    A 128-bit vector of [2 x i64] holding the signed indexes applied to
///    \a m.
/// \param mask
///    A 128-bit vector of [4 x i32] serving as the mask. Only the most
///    significant bit of each element is examined: a zero bit takes the
///    element from \a a, a one bit loads it from memory. Only the first two
///    elements are used.
/// \param s
///    A literal constant that scales each index in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] holding the selected values.
#define _mm_mask_i64gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_d( \
      (__v4si)(__m128i)(a), (int const *)(m), (__v2di)(__m128i)(i), \
      (__v4si)(__m128i)(mask), (s)))
4469 
/// Assembles a 128-bit vector of [4 x i32] element by element. Each 32-bit
///    integer result is either copied from the 128-bit vector of [4 x i32]
///    in \a a or loaded from memory \a m at a scaled index taken from the
///    256-bit vector of [4 x i64] in \a i; the 128-bit vector of [4 x i32]
///    in \a mask selects between the two per element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*32
///   k := element*64
///   IF mask[j+31] == 0
///     result[j+31:j] := a[j+31:j]
///   ELSE
///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm256_mask_i64gather_epi32(__m128i a, const int *m, __m256i i,
///                                     __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQD instruction.
///
/// \param a
///    A 128-bit vector of [4 x i32] supplying the result element wherever
///    the mask bit is zero.
/// \param m
///    The base pointer for the memory loads.
/// \param i
///    A 256-bit vector of [4 x i64] holding the signed indexes applied to
///    \a m.
/// \param mask
///    A 128-bit vector of [4 x i32] serving as the mask. Only the most
///    significant bit of each element is examined: a zero bit takes the
///    element from \a a, a one bit loads it from memory.
/// \param s
///    A literal constant that scales each index in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [4 x i32] holding the selected values.
#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_d256( \
      (__v4si)(__m128i)(a), (int const *)(m), (__v4di)(__m256i)(i), \
      (__v4si)(__m128i)(mask), (s)))
4517 
/// Assembles a 128-bit vector of [2 x i64] element by element. Each 64-bit
///    integer result is either copied from the 128-bit vector of [2 x i64]
///    in \a a or loaded from memory \a m at a scaled index taken from the
///    128-bit vector of [4 x i32] in \a i; the 128-bit vector of [2 x i64]
///    in \a mask selects between the two per element.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_mask_i32gather_epi64(__m128i a, const long long *m, __m128i i,
///                                  __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
///
/// \param a
///    A 128-bit vector of [2 x i64] supplying the result element wherever
///    the mask bit is zero.
/// \param m
///    The base pointer for the memory loads.
/// \param i
///    A 128-bit vector of [4 x i32] holding the signed indexes applied to
///    \a m. Only the first two elements are used.
/// \param mask
///    A 128-bit vector of [2 x i64] serving as the mask. Only the most
///    significant bit of each element is examined: a zero bit takes the
///    element from \a a, a one bit loads it from memory.
/// \param s
///    A literal constant that scales each index in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x i64] holding the selected values.
#define _mm_mask_i32gather_epi64(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherd_q( \
      (__v2di)(__m128i)(a), (long long const *)(m), (__v4si)(__m128i)(i), \
      (__v2di)(__m128i)(mask), (s)))
4566 
/// Assembles a 256-bit vector of [4 x i64] element by element. Each 64-bit
///    integer result is either copied from the 256-bit vector of [4 x i64]
///    in \a a or loaded from memory \a m at a scaled index taken from the
///    128-bit vector of [4 x i32] in \a i; the 256-bit vector of [4 x i64]
///    in \a mask selects between the two per element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_mask_i32gather_epi64(__m256i a, const long long *m,
///                                     __m128i i, __m256i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
///
/// \param a
///    A 256-bit vector of [4 x i64] supplying the result element wherever
///    the mask bit is zero.
/// \param m
///    The base pointer for the memory loads.
/// \param i
///    A 128-bit vector of [4 x i32] holding the signed indexes applied to
///    \a m.
/// \param mask
///    A 256-bit vector of [4 x i64] serving as the mask. Only the most
///    significant bit of each element is examined: a zero bit takes the
///    element from \a a, a one bit loads it from memory.
/// \param s
///    A literal constant that scales each index in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x i64] holding the selected values.
#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherd_q256( \
      (__v4di)(__m256i)(a), (long long const *)(m), (__v4si)(__m128i)(i), \
      (__v4di)(__m256i)(mask), (s)))
4614 
/// Assembles a 128-bit vector of [2 x i64] element by element. Each 64-bit
///    integer result is either copied from the 128-bit vector of [2 x i64]
///    in \a a or loaded from memory \a m at a scaled index taken from the
///    128-bit vector of [2 x i64] in \a i; the 128-bit vector of [2 x i64]
///    in \a mask selects between the two per element.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_mask_i64gather_epi64(__m128i a, const long long *m, __m128i i,
///                                  __m128i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
///
/// \param a
///    A 128-bit vector of [2 x i64] supplying the result element wherever
///    the mask bit is zero.
/// \param m
///    The base pointer for the memory loads.
/// \param i
///    A 128-bit vector of [2 x i64] holding the signed indexes applied to
///    \a m.
/// \param mask
///    A 128-bit vector of [2 x i64] serving as the mask. Only the most
///    significant bit of each element is examined: a zero bit takes the
///    element from \a a, a one bit loads it from memory.
/// \param s
///    A literal constant that scales each index in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x i64] holding the selected values.
#define _mm_mask_i64gather_epi64(a, m, i, mask, s) \
  ((__m128i)__builtin_ia32_gatherq_q( \
      (__v2di)(__m128i)(a), (long long const *)(m), (__v2di)(__m128i)(i), \
      (__v2di)(__m128i)(mask), (s)))
4662 
/// Assembles a 256-bit vector of [4 x i64] element by element. Each 64-bit
///    integer result is either copied from the 256-bit vector of [4 x i64]
///    in \a a or loaded from memory \a m at a scaled index taken from the
///    256-bit vector of [4 x i64] in \a i; the 256-bit vector of [4 x i64]
///    in \a mask selects between the two per element.
///
/// \code{.operation}
/// FOR element := 0 to 3
///   j := element*64
///   k := element*64
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_mask_i64gather_epi64(__m256i a, const long long *m,
///                                     __m256i i, __m256i mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
///
/// \param a
///    A 256-bit vector of [4 x i64] supplying the result element wherever
///    the mask bit is zero.
/// \param m
///    The base pointer for the memory loads.
/// \param i
///    A 256-bit vector of [4 x i64] holding the signed indexes applied to
///    \a m.
/// \param mask
///    A 256-bit vector of [4 x i64] serving as the mask. Only the most
///    significant bit of each element is examined: a zero bit takes the
///    element from \a a, a one bit loads it from memory.
/// \param s
///    A literal constant that scales each index in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 256-bit vector of [4 x i64] holding the selected values.
#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \
  ((__m256i)__builtin_ia32_gatherq_q256( \
      (__v4di)(__m256i)(a), (long long const *)(m), (__v4di)(__m256i)(i), \
      (__v4di)(__m256i)(mask), (s)))
4710 
4711 /// Gathers two 64-bit floating-point values from memory \a m using scaled
4712 ///    indexes from the 128-bit vector of [4 x i32] in \a i.
4713 ///
4714 /// \code{.operation}
4715 /// FOR element := 0 to 1
4716 ///   j := element*64
4717 ///   k := element*32
4718 ///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4719 /// ENDFOR
4720 /// \endcode
4721 ///
4722 /// \headerfile <immintrin.h>
4723 ///
4724 /// \code
4725 /// __m128d _mm_i32gather_pd(const double *m, __m128i i, const int s);
4726 /// \endcode
4727 ///
4728 /// This intrinsic corresponds to the \c VGATHERDPD instruction.
4729 ///
4730 /// \param m
4731 ///    A pointer to the memory used for loading values.
4732 /// \param i
4733 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
4734 ///    the first two elements are used.
4735 /// \param s
4736 ///    A literal constant scale factor for the indexes in \a i. Must be
4737 ///    1, 2, 4, or 8.
4738 /// \returns A 128-bit vector of [2 x double] containing the gathered values.
/* Comparing zero with itself is true in every element, which yields an
   all-ones mask: every element is loaded from memory and the undefined
   source operand is never selected. */
4739 #define _mm_i32gather_pd(m, i, s) \
4740   ((__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \
4741                                       (double const *)(m), \
4742                                       (__v4si)(__m128i)(i), \
4743                                       (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
4744                                                            _mm_setzero_pd()), \
4745                                       (s)))
4746 
4747 /// Gathers four 64-bit floating-point values from memory \a m using scaled
4748 ///    indexes from the 128-bit vector of [4 x i32] in \a i.
4749 ///
4750 /// \code{.operation}
4751 /// FOR element := 0 to 3
4752 ///   j := element*64
4753 ///   k := element*32
4754 ///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4755 /// ENDFOR
4756 /// \endcode
4757 ///
4758 /// \headerfile <immintrin.h>
4759 ///
4760 /// \code
4761 /// __m256d _mm256_i32gather_pd(const double *m, __m128i i, const int s);
4762 /// \endcode
4763 ///
4764 /// This intrinsic corresponds to the \c VGATHERDPD instruction.
4765 ///
4766 /// \param m
4767 ///    A pointer to the memory used for loading values.
4768 /// \param i
4769 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4770 /// \param s
4771 ///    A literal constant scale factor for the indexes in \a i. Must be
4772 ///    1, 2, 4, or 8.
4773 /// \returns A 256-bit vector of [4 x double] containing the gathered values.
/* Comparing zero with itself (ordered, equal) is true in every element, which
   yields an all-ones mask: every element is loaded from memory and the
   undefined source operand is never selected. */
4774 #define _mm256_i32gather_pd(m, i, s) \
4775   ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \
4776                                          (double const *)(m), \
4777                                          (__v4si)(__m128i)(i), \
4778                                          (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
4779                                                                _mm256_setzero_pd(), \
4780                                                                _CMP_EQ_OQ), \
4781                                          (s)))
4782 
4783 /// Gathers two 64-bit floating-point values from memory \a m using scaled
4784 ///    indexes from the 128-bit vector of [2 x i64] in \a i.
4785 ///
4786 /// \code{.operation}
4787 /// FOR element := 0 to 1
4788 ///   j := element*64
4789 ///   k := element*64
4790 ///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4791 /// ENDFOR
4792 /// \endcode
4793 ///
4794 /// \headerfile <immintrin.h>
4795 ///
4796 /// \code
4797 /// __m128d _mm_i64gather_pd(const double *m, __m128i i, const int s);
4798 /// \endcode
4799 ///
4800 /// This intrinsic corresponds to the \c VGATHERQPD instruction.
4801 ///
4802 /// \param m
4803 ///    A pointer to the memory used for loading values.
4804 /// \param i
4805 ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4806 /// \param s
4807 ///    A literal constant scale factor for the indexes in \a i. Must be
4808 ///    1, 2, 4, or 8.
4809 /// \returns A 128-bit vector of [2 x double] containing the gathered values.
/* Comparing zero with itself is true in every element, which yields an
   all-ones mask: every element is loaded from memory and the undefined
   source operand is never selected. */
4810 #define _mm_i64gather_pd(m, i, s) \
4811   ((__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \
4812                                       (double const *)(m), \
4813                                       (__v2di)(__m128i)(i), \
4814                                       (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
4815                                                            _mm_setzero_pd()), \
4816                                       (s)))
4817 
4818 /// Gathers four 64-bit floating-point values from memory \a m using scaled
4819 ///    indexes from the 256-bit vector of [4 x i64] in \a i.
4820 ///
4821 /// \code{.operation}
4822 /// FOR element := 0 to 3
4823 ///   j := element*64
4824 ///   k := element*64
4825 ///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4826 /// ENDFOR
4827 /// \endcode
4828 ///
4829 /// \headerfile <immintrin.h>
4830 ///
4831 /// \code
4832 /// __m256d _mm256_i64gather_pd(const double *m, __m256i i, const int s);
4833 /// \endcode
4834 ///
4835 /// This intrinsic corresponds to the \c VGATHERQPD instruction.
4836 ///
4837 /// \param m
4838 ///    A pointer to the memory used for loading values.
4839 /// \param i
4840 ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4841 /// \param s
4842 ///    A literal constant scale factor for the indexes in \a i. Must be
4843 ///    1, 2, 4, or 8.
4844 /// \returns A 256-bit vector of [4 x double] containing the gathered values.
/* Comparing zero with itself (ordered, equal) is true in every element, which
   yields an all-ones mask: every element is loaded from memory and the
   undefined source operand is never selected. */
4845 #define _mm256_i64gather_pd(m, i, s) \
4846   ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \
4847                                          (double const *)(m), \
4848                                          (__v4di)(__m256i)(i), \
4849                                          (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
4850                                                                _mm256_setzero_pd(), \
4851                                                                _CMP_EQ_OQ), \
4852                                          (s)))
4853 
4854 /// Gathers four 32-bit floating-point values from memory \a m using scaled
4855 ///    indexes from the 128-bit vector of [4 x i32] in \a i.
4856 ///
4857 /// \code{.operation}
4858 /// FOR element := 0 to 3
4859 ///   j := element*32
4860 ///   k := element*32
4861 ///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4862 /// ENDFOR
4863 /// \endcode
4864 ///
4865 /// \headerfile <immintrin.h>
4866 ///
4867 /// \code
4868 /// __m128 _mm_i32gather_ps(const float *m, __m128i i, const int s);
4869 /// \endcode
4870 ///
4871 /// This intrinsic corresponds to the \c VGATHERDPS instruction.
4872 ///
4873 /// \param m
4874 ///    A pointer to the memory used for loading values.
4875 /// \param i
4876 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4877 /// \param s
4878 ///    A literal constant scale factor for the indexes in \a i. Must be
4879 ///    1, 2, 4, or 8.
4880 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
/* Comparing zero with itself is true in every element, which yields an
   all-ones mask: every element is loaded from memory and the undefined
   source operand is never selected. */
4881 #define _mm_i32gather_ps(m, i, s) \
4882   ((__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \
4883                                      (float const *)(m), \
4884                                      (__v4si)(__m128i)(i), \
4885                                      (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
4886                                                           _mm_setzero_ps()), \
4887                                      (s)))
4888 
4889 /// Gathers eight 32-bit floating-point values from memory \a m using scaled
4890 ///    indexes from the 256-bit vector of [8 x i32] in \a i.
4891 ///
4892 /// \code{.operation}
4893 /// FOR element := 0 to 7
4894 ///   j := element*32
4895 ///   k := element*32
4896 ///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4897 /// ENDFOR
4898 /// \endcode
4899 ///
4900 /// \headerfile <immintrin.h>
4901 ///
4902 /// \code
4903 /// __m256 _mm256_i32gather_ps(const float *m, __m256i i, const int s);
4904 /// \endcode
4905 ///
4906 /// This intrinsic corresponds to the \c VGATHERDPS instruction.
4907 ///
4908 /// \param m
4909 ///    A pointer to the memory used for loading values.
4910 /// \param i
4911 ///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4912 /// \param s
4913 ///    A literal constant scale factor for the indexes in \a i. Must be
4914 ///    1, 2, 4, or 8.
4915 /// \returns A 256-bit vector of [8 x float] containing the gathered values.
/* Comparing zero with itself (ordered, equal) is true in every element, which
   yields an all-ones mask: every element is loaded from memory and the
   undefined source operand is never selected. */
4916 #define _mm256_i32gather_ps(m, i, s) \
4917   ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \
4918                                         (float const *)(m), \
4919                                         (__v8si)(__m256i)(i), \
4920                                         (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \
4921                                                               _mm256_setzero_ps(), \
4922                                                               _CMP_EQ_OQ), \
4923                                         (s)))
4924 
4925 /// Gathers two 32-bit floating-point values from memory \a m using scaled
4926 ///    indexes from the 128-bit vector of [2 x i64] in \a i. The upper two
4927 ///    elements of the result are zeroed.
4928 ///
4929 /// \code{.operation}
4930 /// FOR element := 0 to 1
4931 ///   j := element*32
4932 ///   k := element*64
4933 ///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4934 /// ENDFOR
4935 /// result[127:64] := 0
4936 /// \endcode
4937 ///
4938 /// \headerfile <immintrin.h>
4939 ///
4940 /// \code
4941 /// __m128 _mm_i64gather_ps(const float *m, __m128i i, const int s);
4942 /// \endcode
4943 ///
4944 /// This intrinsic corresponds to the \c VGATHERQPS instruction.
4945 ///
4946 /// \param m
4947 ///    A pointer to the memory used for loading values.
4948 /// \param i
4949 ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4950 /// \param s
4951 ///    A literal constant scale factor for the indexes in \a i. Must be
4952 ///    1, 2, 4, or 8.
4953 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
/* Comparing zero with itself is true in every element, which yields an
   all-ones mask: the undefined source operand is never selected. */
4954 #define _mm_i64gather_ps(m, i, s) \
4955   ((__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \
4956                                      (float const *)(m), \
4957                                      (__v2di)(__m128i)(i), \
4958                                      (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
4959                                                           _mm_setzero_ps()), \
4960                                      (s)))
4961 
4962 /// Gathers four 32-bit floating-point values from memory \a m using scaled
4963 ///    indexes from the 256-bit vector of [4 x i64] in \a i.
4964 ///
4965 /// \code{.operation}
4966 /// FOR element := 0 to 3
4967 ///   j := element*32
4968 ///   k := element*64
4969 ///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4970 /// ENDFOR
4971 /// \endcode
4972 ///
4973 /// \headerfile <immintrin.h>
4974 ///
4975 /// \code
4976 /// __m128 _mm256_i64gather_ps(const float *m, __m256i i, const int s);
4977 /// \endcode
4978 ///
4979 /// This intrinsic corresponds to the \c VGATHERQPS instruction.
4980 ///
4981 /// \param m
4982 ///    A pointer to the memory used for loading values.
4983 /// \param i
4984 ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4985 /// \param s
4986 ///    A literal constant scale factor for the indexes in \a i. Must be
4987 ///    1, 2, 4, or 8.
4988 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
/* The result is only 128 bits wide, so the source and mask operands are 128-bit
   vectors. Comparing zero with itself is true in every element, which yields
   an all-ones mask: every element is loaded from memory and the undefined
   source operand is never selected. */
4989 #define _mm256_i64gather_ps(m, i, s) \
4990   ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \
4991                                         (float const *)(m), \
4992                                         (__v4di)(__m256i)(i), \
4993                                         (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
4994                                                              _mm_setzero_ps()), \
4995                                         (s)))
4996 
4997 /// Gathers four 32-bit integer values from memory \a m using scaled
4998 ///    indexes from the 128-bit vector of [4 x i32] in \a i.
4999 ///
5000 /// \code{.operation}
5001 /// FOR element := 0 to 3
5002 ///   j := element*32
5003 ///   k := element*32
5004 ///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
5005 /// ENDFOR
5006 /// \endcode
5007 ///
5008 /// \headerfile <immintrin.h>
5009 ///
5010 /// \code
5011 /// __m128i _mm_i32gather_epi32(const int *m, __m128i i, const int s);
5012 /// \endcode
5013 ///
5014 /// This intrinsic corresponds to the \c VPGATHERDD instruction.
5015 ///
5016 /// \param m
5017 ///    A pointer to the memory used for loading values.
5018 /// \param i
5019 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
5020 /// \param s
5021 ///    A literal constant scale factor for the indexes in \a i. Must be
5022 ///    1, 2, 4, or 8.
5023 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
/* The -1 mask sets every element's mask bit: every element is loaded from
   memory and the undefined source operand is never selected. */
5024 #define _mm_i32gather_epi32(m, i, s) \
5025   ((__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \
5026                                      (int const *)(m), (__v4si)(__m128i)(i), \
5027                                      (__v4si)_mm_set1_epi32(-1), (s)))
5028 
5029 /// Gathers eight 32-bit integer values from memory \a m using scaled
5030 ///    indexes from the 256-bit vector of [8 x i32] in \a i.
5031 ///
5032 /// \code{.operation}
5033 /// FOR element := 0 to 7
5034 ///   j := element*32
5035 ///   k := element*32
5036 ///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
5037 /// ENDFOR
5038 /// \endcode
5039 ///
5040 /// \headerfile <immintrin.h>
5041 ///
5042 /// \code
5043 /// __m256i _mm256_i32gather_epi32(const int *m, __m256i i, const int s);
5044 /// \endcode
5045 ///
5046 /// This intrinsic corresponds to the \c VPGATHERDD instruction.
5047 ///
5048 /// \param m
5049 ///    A pointer to the memory used for loading values.
5050 /// \param i
5051 ///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
5052 /// \param s
5053 ///    A literal constant scale factor for the indexes in \a i. Must be
5054 ///    1, 2, 4, or 8.
5055 /// \returns A 256-bit vector of [8 x i32] containing the gathered values.
/* The -1 mask sets every element's mask bit: every element is loaded from
   memory and the undefined source operand is never selected. */
5056 #define _mm256_i32gather_epi32(m, i, s) \
5057   ((__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
5058                                         (int const *)(m), (__v8si)(__m256i)(i), \
5059                                         (__v8si)_mm256_set1_epi32(-1), (s)))
5060 
5061 /// Gathers two 32-bit integer values from memory \a m using scaled indexes
5062 ///    from the 128-bit vector of [2 x i64] in \a i. The upper two elements
5063 ///    of the result are zeroed.
5064 ///
5065 /// \code{.operation}
5066 /// FOR element := 0 to 1
5067 ///   j := element*32
5068 ///   k := element*64
5069 ///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
5070 /// ENDFOR
5071 /// result[127:64] := 0
5072 /// \endcode
5073 ///
5074 /// \headerfile <immintrin.h>
5075 ///
5076 /// \code
5077 /// __m128i _mm_i64gather_epi32(const int *m, __m128i i, const int s);
5078 /// \endcode
5079 ///
5080 /// This intrinsic corresponds to the \c VPGATHERQD instruction.
5081 ///
5082 /// \param m
5083 ///    A pointer to the memory used for loading values.
5084 /// \param i
5085 ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
5086 /// \param s
5087 ///    A literal constant scale factor for the indexes in \a i. Must be
5088 ///    1, 2, 4, or 8.
5089 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
/* The -1 mask sets every element's mask bit, so the undefined source operand
   is never selected. */
5090 #define _mm_i64gather_epi32(m, i, s) \
5091   ((__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \
5092                                      (int const *)(m), (__v2di)(__m128i)(i), \
5093                                      (__v4si)_mm_set1_epi32(-1), (s)))
5094 
5095 /// Gathers four 32-bit integer values from memory \a m using scaled indexes
5096 ///    from the 256-bit vector of [4 x i64] in \a i.
5097 ///
5098 /// \code{.operation}
5099 /// FOR element := 0 to 3
5100 ///   j := element*32
5101 ///   k := element*64
5102 ///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
5103 /// ENDFOR
5104 /// \endcode
5105 ///
5106 /// \headerfile <immintrin.h>
5107 ///
5108 /// \code
5109 /// __m128i _mm256_i64gather_epi32(const int *m, __m256i i, const int s);
5110 /// \endcode
5111 ///
5112 /// This intrinsic corresponds to the \c VPGATHERQD instruction.
5113 ///
5114 /// \param m
5115 ///    A pointer to the memory used for loading values.
5116 /// \param i
5117 ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
5118 /// \param s
5119 ///    A literal constant scale factor for the indexes in \a i. Must be
5120 ///    1, 2, 4, or 8.
5121 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
/* The result is only 128 bits wide, so the source and mask operands are 128-bit
   vectors. The -1 mask sets every element's mask bit: every element is loaded
   from memory and the undefined source operand is never selected. */
5122 #define _mm256_i64gather_epi32(m, i, s) \
5123   ((__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \
5124                                         (int const *)(m), (__v4di)(__m256i)(i), \
5125                                         (__v4si)_mm_set1_epi32(-1), (s)))
5126 
5127 /// Gathers two 64-bit integer values from memory \a m using scaled indexes
5128 ///    from the 128-bit vector of [4 x i32] in \a i.
5129 ///
5130 /// \code{.operation}
5131 /// FOR element := 0 to 1
5132 ///   j := element*64
5133 ///   k := element*32
5134 ///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
5135 /// ENDFOR
5136 /// \endcode
5137 ///
5138 /// \headerfile <immintrin.h>
5139 ///
5140 /// \code
5141 /// __m128i _mm_i32gather_epi64(const long long *m, __m128i i, const int s);
5142 /// \endcode
5143 ///
5144 /// This intrinsic corresponds to the \c VPGATHERDQ instruction.
5145 ///
5146 /// \param m
5147 ///    A pointer to the memory used for loading values.
5148 /// \param i
5149 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
5150 ///    the first two elements are used.
5151 /// \param s
5152 ///    A literal constant scale factor for the indexes in \a i. Must be
5153 ///    1, 2, 4, or 8.
5154 /// \returns A 128-bit vector of [2 x i64] containing the gathered values.
/* The -1 mask sets every element's mask bit: every element is loaded from
   memory and the undefined source operand is never selected. */
5155 #define _mm_i32gather_epi64(m, i, s) \
5156   ((__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \
5157                                      (long long const *)(m), \
5158                                      (__v4si)(__m128i)(i), \
5159                                      (__v2di)_mm_set1_epi64x(-1), (s)))
5160 
5161 /// Gathers four 64-bit integer values from memory \a m using scaled indexes
5162 ///    from the 128-bit vector of [4 x i32] in \a i.
5163 ///
5164 /// \code{.operation}
5165 /// FOR element := 0 to 3
5166 ///   j := element*64
5167 ///   k := element*32
5168 ///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
5169 /// ENDFOR
5170 /// \endcode
5171 ///
5172 /// \headerfile <immintrin.h>
5173 ///
5174 /// \code
5175 /// __m256i _mm256_i32gather_epi64(const long long *m, __m128i i, const int s);
5176 /// \endcode
5177 ///
5178 /// This intrinsic corresponds to the \c VPGATHERDQ instruction.
5179 ///
5180 /// \param m
5181 ///    A pointer to the memory used for loading values.
5182 /// \param i
5183 ///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
5184 /// \param s
5185 ///    A literal constant scale factor for the indexes in \a i. Must be
5186 ///    1, 2, 4, or 8.
5187 /// \returns A 256-bit vector of [4 x i64] containing the gathered values.
/* The -1 mask sets every element's mask bit: every element is loaded from
   memory and the undefined source operand is never selected. */
5188 #define _mm256_i32gather_epi64(m, i, s) \
5189   ((__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \
5190                                         (long long const *)(m), \
5191                                         (__v4si)(__m128i)(i), \
5192                                         (__v4di)_mm256_set1_epi64x(-1), (s)))
5193 
5194 /// Gathers two 64-bit integer values from memory \a m using scaled indexes
5195 ///    from the 128-bit vector of [2 x i64] in \a i.
5196 ///
5197 /// \code{.operation}
5198 /// FOR element := 0 to 1
5199 ///   j := element*64
5200 ///   k := element*64
5201 ///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
5202 /// ENDFOR
5203 /// \endcode
5204 ///
5205 /// \headerfile <immintrin.h>
5206 ///
5207 /// \code
5208 /// __m128i _mm_i64gather_epi64(const long long *m, __m128i i, const int s);
5209 /// \endcode
5210 ///
5211 /// This intrinsic corresponds to the \c VPGATHERQQ instruction.
5212 ///
5213 /// \param m
5214 ///    A pointer to the memory used for loading values.
5215 /// \param i
5216 ///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
5217 /// \param s
5218 ///    A literal constant scale factor for the indexes in \a i. Must be
5219 ///    1, 2, 4, or 8.
5220 /// \returns A 128-bit vector of [2 x i64] containing the gathered values.
/* The -1 mask sets every element's mask bit: every element is loaded from
   memory and the undefined source operand is never selected. */
5221 #define _mm_i64gather_epi64(m, i, s) \
5222   ((__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \
5223                                      (long long const *)(m), \
5224                                      (__v2di)(__m128i)(i), \
5225                                      (__v2di)_mm_set1_epi64x(-1), (s)))
5226 
5227 /// Gathers four 64-bit integer values from memory \a m using scaled indexes
5228 ///    from the 256-bit vector of [4 x i64] in \a i.
5229 ///
5230 /// \code{.operation}
5231 /// FOR element := 0 to 3
5232 ///   j := element*64
5233 ///   k := element*64
5234 ///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
5235 /// ENDFOR
5236 /// \endcode
5237 ///
5238 /// \headerfile <immintrin.h>
5239 ///
5240 /// \code
5241 /// __m256i _mm256_i64gather_epi64(const long long *m, __m256i i, const int s);
5242 /// \endcode
5243 ///
5244 /// This intrinsic corresponds to the \c VPGATHERQQ instruction.
5245 ///
5246 /// \param m
5247 ///    A pointer to the memory used for loading values.
5248 /// \param i
5249 ///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
5250 /// \param s
5251 ///    A literal constant scale factor for the indexes in \a i. Must be
5252 ///    1, 2, 4, or 8.
5253 /// \returns A 256-bit vector of [4 x i64] containing the gathered values.
/* The -1 mask sets every element's mask bit: every element is loaded from
   memory and the undefined source operand is never selected. */
5254 #define _mm256_i64gather_epi64(m, i, s) \
5255   ((__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \
5256                                         (long long const *)(m), \
5257                                         (__v4di)(__m256i)(i), \
5258                                         (__v4di)_mm256_set1_epi64x(-1), (s)))
5259 
/* Remove the file-local default-attribute macros defined at the top of this
   header so they do not remain defined after it has been included. */
5260 #undef __DEFAULT_FN_ATTRS256
5261 #undef __DEFAULT_FN_ATTRS128
5262 
5263 #endif /* __AVX2INTRIN_H */
5264