xref: /freebsd/contrib/llvm-project/clang/lib/Headers/immintrin.h (revision b64c5a0ace59af62eff52bfe110a521dc73c937b)
1 /*===---- immintrin.h - Intel intrinsics -----------------------------------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 
10 #ifndef __IMMINTRIN_H
11 #define __IMMINTRIN_H
12 
13 #if !defined(__i386__) && !defined(__x86_64__)
14 #error "This header is only meant to be used on x86 and x64 architecture"
15 #endif
16 
17 #include <x86gprintrin.h>
18 
19 #if !defined(__SCE__) || __has_feature(modules) || defined(__MMX__)
20 #include <mmintrin.h>
21 #endif
22 
23 #if !defined(__SCE__) || __has_feature(modules) || defined(__SSE__)
24 #include <xmmintrin.h>
25 #endif
26 
27 #if !defined(__SCE__) || __has_feature(modules) || defined(__SSE2__)
28 #include <emmintrin.h>
29 #endif
30 
31 #if !defined(__SCE__) || __has_feature(modules) || defined(__SSE3__)
32 #include <pmmintrin.h>
33 #endif
34 
35 #if !defined(__SCE__) || __has_feature(modules) || defined(__SSSE3__)
36 #include <tmmintrin.h>
37 #endif
38 
39 #if !defined(__SCE__) || __has_feature(modules) ||                             \
40     (defined(__SSE4_2__) || defined(__SSE4_1__))
41 #include <smmintrin.h>
42 #endif
43 
44 #if !defined(__SCE__) || __has_feature(modules) ||                             \
45     (defined(__AES__) || defined(__PCLMUL__))
46 #include <wmmintrin.h>
47 #endif
48 
49 #if !defined(__SCE__) || __has_feature(modules) || defined(__CLFLUSHOPT__)
50 #include <clflushoptintrin.h>
51 #endif
52 
53 #if !defined(__SCE__) || __has_feature(modules) || defined(__CLWB__)
54 #include <clwbintrin.h>
55 #endif
56 
57 #if !defined(__SCE__) || __has_feature(modules) || defined(__AVX__)
58 #include <avxintrin.h>
59 #endif
60 
61 #if !defined(__SCE__) || __has_feature(modules) || defined(__AVX2__)
62 #include <avx2intrin.h>
63 #endif
64 
65 #if !defined(__SCE__) || __has_feature(modules) || defined(__F16C__)
66 #include <f16cintrin.h>
67 #endif
68 
69 /* No feature check desired due to internal checks */
70 #include <bmiintrin.h>
71 
72 #if !defined(__SCE__) || __has_feature(modules) || defined(__BMI2__)
73 #include <bmi2intrin.h>
74 #endif
75 
76 #if !defined(__SCE__) || __has_feature(modules) || defined(__LZCNT__)
77 #include <lzcntintrin.h>
78 #endif
79 
80 #if !defined(__SCE__) || __has_feature(modules) || defined(__POPCNT__)
81 #include <popcntintrin.h>
82 #endif
83 
84 #if !defined(__SCE__) || __has_feature(modules) || defined(__FMA__)
85 #include <fmaintrin.h>
86 #endif
87 
88 #if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512F__)
89 #include <avx512fintrin.h>
90 #endif
91 
92 #if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512VL__)
93 #include <avx512vlintrin.h>
94 #endif
95 
96 #if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512BW__)
97 #include <avx512bwintrin.h>
98 #endif
99 
100 #if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512BITALG__)
101 #include <avx512bitalgintrin.h>
102 #endif
103 
104 #if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512CD__)
105 #include <avx512cdintrin.h>
106 #endif
107 
108 #if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512VPOPCNTDQ__)
109 #include <avx512vpopcntdqintrin.h>
110 #endif
111 
112 #if !defined(__SCE__) || __has_feature(modules) ||                             \
113     (defined(__AVX512VL__) && defined(__AVX512VPOPCNTDQ__))
114 #include <avx512vpopcntdqvlintrin.h>
115 #endif
116 
117 #if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512VNNI__)
118 #include <avx512vnniintrin.h>
119 #endif
120 
121 #if !defined(__SCE__) || __has_feature(modules) ||                             \
122     (defined(__AVX512VL__) && defined(__AVX512VNNI__))
123 #include <avx512vlvnniintrin.h>
124 #endif
125 
126 #if !defined(__SCE__) || __has_feature(modules) || defined(__AVXVNNI__)
127 #include <avxvnniintrin.h>
128 #endif
129 
130 #if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512DQ__)
131 #include <avx512dqintrin.h>
132 #endif
133 
134 #if !defined(__SCE__) || __has_feature(modules) ||                             \
135     (defined(__AVX512VL__) && defined(__AVX512BITALG__))
136 #include <avx512vlbitalgintrin.h>
137 #endif
138 
139 #if !defined(__SCE__) || __has_feature(modules) ||                             \
140     (defined(__AVX512VL__) && defined(__AVX512BW__))
141 #include <avx512vlbwintrin.h>
142 #endif
143 
144 #if !defined(__SCE__) || __has_feature(modules) ||                             \
145     (defined(__AVX512VL__) && defined(__AVX512CD__))
146 #include <avx512vlcdintrin.h>
147 #endif
148 
149 #if !defined(__SCE__) || __has_feature(modules) ||                             \
150     (defined(__AVX512VL__) && defined(__AVX512DQ__))
151 #include <avx512vldqintrin.h>
152 #endif
153 
154 #if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512IFMA__)
155 #include <avx512ifmaintrin.h>
156 #endif
157 
158 #if !defined(__SCE__) || __has_feature(modules) ||                             \
159     (defined(__AVX512IFMA__) && defined(__AVX512VL__))
160 #include <avx512ifmavlintrin.h>
161 #endif
162 
163 #if !defined(__SCE__) || __has_feature(modules) || defined(__AVXIFMA__)
164 #include <avxifmaintrin.h>
165 #endif
166 
167 #if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512VBMI__)
168 #include <avx512vbmiintrin.h>
169 #endif
170 
171 #if !defined(__SCE__) || __has_feature(modules) ||                             \
172     (defined(__AVX512VBMI__) && defined(__AVX512VL__))
173 #include <avx512vbmivlintrin.h>
174 #endif
175 
176 #if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512VBMI2__)
177 #include <avx512vbmi2intrin.h>
178 #endif
179 
180 #if !defined(__SCE__) || __has_feature(modules) ||                             \
181     (defined(__AVX512VBMI2__) && defined(__AVX512VL__))
182 #include <avx512vlvbmi2intrin.h>
183 #endif
184 
185 #if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512FP16__)
186 #include <avx512fp16intrin.h>
187 #endif
188 
189 #if !defined(__SCE__) || __has_feature(modules) ||                             \
190     (defined(__AVX512VL__) && defined(__AVX512FP16__))
191 #include <avx512vlfp16intrin.h>
192 #endif
193 
194 #if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512BF16__)
195 #include <avx512bf16intrin.h>
196 #endif
197 
198 #if !defined(__SCE__) || __has_feature(modules) ||                             \
199     (defined(__AVX512VL__) && defined(__AVX512BF16__))
200 #include <avx512vlbf16intrin.h>
201 #endif
202 
203 #if !defined(__SCE__) || __has_feature(modules) || defined(__PKU__)
204 #include <pkuintrin.h>
205 #endif
206 
207 #if !defined(__SCE__) || __has_feature(modules) || defined(__VPCLMULQDQ__)
208 #include <vpclmulqdqintrin.h>
209 #endif
210 
211 #if !defined(__SCE__) || __has_feature(modules) || defined(__VAES__)
212 #include <vaesintrin.h>
213 #endif
214 
215 #if !defined(__SCE__) || __has_feature(modules) || defined(__GFNI__)
216 #include <gfniintrin.h>
217 #endif
218 
219 #if !defined(__SCE__) || __has_feature(modules) || defined(__AVXVNNIINT8__)
220 #include <avxvnniint8intrin.h>
221 #endif
222 
223 #if !defined(__SCE__) || __has_feature(modules) || defined(__AVXNECONVERT__)
224 #include <avxneconvertintrin.h>
225 #endif
226 
227 #if !defined(__SCE__) || __has_feature(modules) || defined(__SHA512__)
228 #include <sha512intrin.h>
229 #endif
230 
231 #if !defined(__SCE__) || __has_feature(modules) || defined(__SM3__)
232 #include <sm3intrin.h>
233 #endif
234 
235 #if !defined(__SCE__) || __has_feature(modules) || defined(__SM4__)
236 #include <sm4intrin.h>
237 #endif
238 
239 #if !defined(__SCE__) || __has_feature(modules) || defined(__AVXVNNIINT16__)
240 #include <avxvnniint16intrin.h>
241 #endif
242 
243 #if !defined(__SCE__) || __has_feature(modules) || defined(__RDPID__)
/// Fetches the contents of the IA32_TSC_AUX MSR (0xc0000103).
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> RDPID </c> instruction.
///
/// \returns The 32-bit value held in the MSR.
static __inline__ unsigned int
__attribute__((__always_inline__, __nodebug__, __target__("rdpid")))
_rdpid_u32(void)
{
  unsigned int __aux = __builtin_ia32_rdpid();
  return __aux;
}
255 #endif // __RDPID__
256 
257 #if !defined(__SCE__) || __has_feature(modules) || defined(__RDRND__)
/// Requests a 16-bit random value from the hardware generator.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> RDRAND </c> instruction.
///
/// \param __p
///    Address of the 16-bit location that receives the random value.
/// \returns 1 when a value was generated successfully, 0 otherwise.
static __inline__ int
__attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
_rdrand16_step(unsigned short *__p) {
  int __ok = (int)__builtin_ia32_rdrand16_step(__p);
  return __ok;
}
272 
/// Requests a 32-bit random value from the hardware generator.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> RDRAND </c> instruction.
///
/// \param __p
///    Address of the 32-bit location that receives the random value.
/// \returns 1 when a value was generated successfully, 0 otherwise.
static __inline__ int
__attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
_rdrand32_step(unsigned int *__p) {
  int __ok = (int)__builtin_ia32_rdrand32_step(__p);
  return __ok;
}
287 
/// Requests a 64-bit random value from the hardware generator.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> RDRAND </c> instruction.
///
/// \param __p
///    Address of the 64-bit location that receives the random value.
/// \returns 1 when a value was generated successfully, 0 otherwise.
static __inline__ int
__attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
_rdrand64_step(unsigned long long *__p) {
#ifdef __x86_64__
  int __ok = (int)__builtin_ia32_rdrand64_step(__p);
  return __ok;
#else
  // No 64-bit rdrand here: build the result from two 32-bit requests.
  // Both requests are always issued, low half first.
  unsigned int __lo, __hi;
  unsigned int __ok_lo = __builtin_ia32_rdrand32_step(&__lo);
  unsigned int __ok_hi = __builtin_ia32_rdrand32_step(&__hi);
  if (!(__ok_lo && __ok_hi)) {
    // Either half failed: report failure and store zero.
    *__p = 0;
    return 0;
  }
  *__p = ((unsigned long long)__hi << 32) | (unsigned long long)__lo;
  return 1;
#endif
}
317 #endif /* __RDRND__ */
318 
319 #if !defined(__SCE__) || __has_feature(modules) || defined(__FSGSBASE__)
320 #ifdef __x86_64__
/// Fetches the FS base register, truncated to 32 bits.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> RDFSBASE </c> instruction.
///
/// \returns The lower 32 bits of the FS base register.
static __inline__ unsigned int
__attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
_readfsbase_u32(void) {
  unsigned int __base = __builtin_ia32_rdfsbase32();
  return __base;
}
333 
/// Fetches the full FS base register.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> RDFSBASE </c> instruction.
///
/// \returns The contents of the FS base register.
static __inline__ unsigned long long
__attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
_readfsbase_u64(void) {
  unsigned long long __base = __builtin_ia32_rdfsbase64();
  return __base;
}
346 
/// Fetches the GS base register, truncated to 32 bits.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> RDGSBASE </c> instruction.
///
/// \returns The lower 32 bits of the GS base register.
static __inline__ unsigned int
__attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
_readgsbase_u32(void) {
  unsigned int __base = __builtin_ia32_rdgsbase32();
  return __base;
}
359 
/// Fetches the full GS base register.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> RDGSBASE </c> instruction.
///
/// \returns The contents of the GS base register.
static __inline__ unsigned long long
__attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
_readgsbase_u64(void) {
  unsigned long long __base = __builtin_ia32_rdgsbase64();
  return __base;
}
372 
/// Updates the FS base register from a 32-bit value.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> WRFSBASE </c> instruction.
///
/// \param __V
///    Value to use for the lower 32 bits of the FS base register.
static __inline__ void
__attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
_writefsbase_u32(unsigned int __V) {
  const unsigned int __base = __V;
  __builtin_ia32_wrfsbase32(__base);
}
386 
/// Updates the FS base register from a 64-bit value.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> WRFSBASE </c> instruction.
///
/// \param __V
///    Value to use for the FS base register.
static __inline__ void
__attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
_writefsbase_u64(unsigned long long __V) {
  const unsigned long long __base = __V;
  __builtin_ia32_wrfsbase64(__base);
}
400 
/// Updates the GS base register from a 32-bit value.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> WRGSBASE </c> instruction.
///
/// \param __V
///    Value to use for the lower 32 bits of the GS base register.
static __inline__ void
__attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
_writegsbase_u32(unsigned int __V) {
  const unsigned int __base = __V;
  __builtin_ia32_wrgsbase32(__base);
}
414 
/// Modifies the GS base register.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> WRGSBASE </c> instruction.
///
/// \param __V
///    Value to use for the GS base register.
static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
_writegsbase_u64(unsigned long long __V)
{
  __builtin_ia32_wrgsbase64(__V);
}
428 
429 #endif
430 #endif /* __FSGSBASE__ */
431 
432 #if !defined(__SCE__) || __has_feature(modules) || defined(__MOVBE__)
433 
434 /* The structs used below are to force the load/store to be unaligned. This
435  * is accomplished with the __packed__ attribute. The __may_alias__ prevents
436  * tbaa metadata from being generated based on the struct and the type of the
437  * field inside of it.
438  */
439 
/// Reads a 16-bit value from memory and returns it with its bytes reversed.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the MOVBE instruction.
///
/// \param __P
///    A pointer to the 16-bit value to load.
/// \returns The byte-swapped value.
static __inline__ short
__attribute__((__always_inline__, __nodebug__, __target__("movbe")))
_loadbe_i16(void const * __P)
{
  /* Packed wrapper forces an unaligned load; may_alias keeps type-based
   * alias metadata off the access. */
  struct __be_src_i16 {
    unsigned short __v;
  } __attribute__((__packed__, __may_alias__));
  const struct __be_src_i16 *__src = (const struct __be_src_i16 *)__P;
  unsigned short __raw = __src->__v;
  return (short)__builtin_bswap16(__raw);
}
456 
/// Reverses the bytes of a 16-bit value and writes the result to memory.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the MOVBE instruction.
///
/// \param __P
///    A pointer to the memory for storing the swapped value.
/// \param __D
///    The 16-bit value to be byte-swapped.
static __inline__ void
__attribute__((__always_inline__, __nodebug__, __target__("movbe")))
_storebe_i16(void * __P, short __D)
{
  /* Packed wrapper forces an unaligned store; may_alias keeps type-based
   * alias metadata off the access. */
  struct __be_dst_i16 {
    unsigned short __v;
  } __attribute__((__packed__, __may_alias__));
  struct __be_dst_i16 *__dst = (struct __be_dst_i16 *)__P;
  unsigned short __swapped = __builtin_bswap16((unsigned short)__D);
  __dst->__v = __swapped;
}
474 
/// Reads a 32-bit value from memory and returns it with its bytes reversed.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the MOVBE instruction.
///
/// \param __P
///    A pointer to the 32-bit value to load.
/// \returns The byte-swapped value.
static __inline__ int
__attribute__((__always_inline__, __nodebug__, __target__("movbe")))
_loadbe_i32(void const * __P)
{
  /* Packed wrapper forces an unaligned load; may_alias keeps type-based
   * alias metadata off the access. */
  struct __be_src_i32 {
    unsigned int __v;
  } __attribute__((__packed__, __may_alias__));
  const struct __be_src_i32 *__src = (const struct __be_src_i32 *)__P;
  unsigned int __raw = __src->__v;
  return (int)__builtin_bswap32(__raw);
}
491 
/// Reverses the bytes of a 32-bit value and writes the result to memory.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the MOVBE instruction.
///
/// \param __P
///    A pointer to the memory for storing the swapped value.
/// \param __D
///    The 32-bit value to be byte-swapped.
static __inline__ void
__attribute__((__always_inline__, __nodebug__, __target__("movbe")))
_storebe_i32(void * __P, int __D)
{
  /* Packed wrapper forces an unaligned store; may_alias keeps type-based
   * alias metadata off the access. */
  struct __be_dst_i32 {
    unsigned int __v;
  } __attribute__((__packed__, __may_alias__));
  struct __be_dst_i32 *__dst = (struct __be_dst_i32 *)__P;
  unsigned int __swapped = __builtin_bswap32((unsigned int)__D);
  __dst->__v = __swapped;
}
509 
510 #ifdef __x86_64__
/// Reads a 64-bit value from memory and returns it with its bytes reversed.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the MOVBE instruction.
///
/// \param __P
///    A pointer to the 64-bit value to load.
/// \returns The byte-swapped value.
static __inline__ long long
__attribute__((__always_inline__, __nodebug__, __target__("movbe")))
_loadbe_i64(void const * __P)
{
  /* Packed wrapper forces an unaligned load; may_alias keeps type-based
   * alias metadata off the access. */
  struct __be_src_i64 {
    unsigned long long __v;
  } __attribute__((__packed__, __may_alias__));
  const struct __be_src_i64 *__src = (const struct __be_src_i64 *)__P;
  unsigned long long __raw = __src->__v;
  return (long long)__builtin_bswap64(__raw);
}
527 
/// Reverses the bytes of a 64-bit value and writes the result to memory.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the MOVBE instruction.
///
/// \param __P
///    A pointer to the memory for storing the swapped value.
/// \param __D
///    The 64-bit value to be byte-swapped.
static __inline__ void
__attribute__((__always_inline__, __nodebug__, __target__("movbe")))
_storebe_i64(void * __P, long long __D)
{
  /* Packed wrapper forces an unaligned store; may_alias keeps type-based
   * alias metadata off the access. */
  struct __be_dst_i64 {
    unsigned long long __v;
  } __attribute__((__packed__, __may_alias__));
  struct __be_dst_i64 *__dst = (struct __be_dst_i64 *)__P;
  unsigned long long __swapped = __builtin_bswap64((unsigned long long)__D);
  __dst->__v = __swapped;
}
545 #endif
#endif /* __MOVBE__ */
547 
548 #if !defined(__SCE__) || __has_feature(modules) || defined(__RTM__)
549 #include <rtmintrin.h>
550 #include <xtestintrin.h>
551 #endif
552 
553 #if !defined(__SCE__) || __has_feature(modules) || defined(__SHA__)
554 #include <shaintrin.h>
555 #endif
556 
557 #if !defined(__SCE__) || __has_feature(modules) || defined(__FXSR__)
558 #include <fxsrintrin.h>
559 #endif
560 
561 /* No feature check desired due to internal MSC_VER checks */
562 #include <xsaveintrin.h>
563 
564 #if !defined(__SCE__) || __has_feature(modules) || defined(__XSAVEOPT__)
565 #include <xsaveoptintrin.h>
566 #endif
567 
568 #if !defined(__SCE__) || __has_feature(modules) || defined(__XSAVEC__)
569 #include <xsavecintrin.h>
570 #endif
571 
572 #if !defined(__SCE__) || __has_feature(modules) || defined(__XSAVES__)
573 #include <xsavesintrin.h>
574 #endif
575 
576 #if !defined(__SCE__) || __has_feature(modules) || defined(__SHSTK__)
577 #include <cetintrin.h>
578 #endif
579 
580 /* Intrinsics inside adcintrin.h are available at all times. */
581 #include <adcintrin.h>
582 
583 #if !defined(__SCE__) || __has_feature(modules) || defined(__ADX__)
584 #include <adxintrin.h>
585 #endif
586 
587 #if !defined(__SCE__) || __has_feature(modules) || defined(__RDSEED__)
588 #include <rdseedintrin.h>
589 #endif
590 
591 #if !defined(__SCE__) || __has_feature(modules) || defined(__WBNOINVD__)
592 #include <wbnoinvdintrin.h>
593 #endif
594 
595 #if !defined(__SCE__) || __has_feature(modules) || defined(__CLDEMOTE__)
596 #include <cldemoteintrin.h>
597 #endif
598 
599 #if !defined(__SCE__) || __has_feature(modules) || defined(__WAITPKG__)
600 #include <waitpkgintrin.h>
601 #endif
602 
603 #if !defined(__SCE__) || __has_feature(modules) || defined(__MOVDIRI__) ||     \
604     defined(__MOVDIR64B__)
605 #include <movdirintrin.h>
606 #endif
607 
608 #if !defined(__SCE__) || __has_feature(modules) || defined(__PCONFIG__)
609 #include <pconfigintrin.h>
610 #endif
611 
612 #if !defined(__SCE__) || __has_feature(modules) || defined(__SGX__)
613 #include <sgxintrin.h>
614 #endif
615 
616 #if !defined(__SCE__) || __has_feature(modules) || defined(__PTWRITE__)
617 #include <ptwriteintrin.h>
618 #endif
619 
620 #if !defined(__SCE__) || __has_feature(modules) || defined(__INVPCID__)
621 #include <invpcidintrin.h>
622 #endif
623 #if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_FP16__)
624 #include <amxfp16intrin.h>
625 #endif
626 
627 #if !defined(__SCE__) || __has_feature(modules) || defined(__KL__) ||          \
628     defined(__WIDEKL__)
629 #include <keylockerintrin.h>
630 #endif
631 
632 #if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_TILE__) ||    \
633     defined(__AMX_INT8__) || defined(__AMX_BF16__)
634 #include <amxintrin.h>
635 #endif
636 
637 #if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_COMPLEX__)
638 #include <amxcomplexintrin.h>
639 #endif
640 
641 #if !defined(__SCE__) || __has_feature(modules) ||                             \
642     defined(__AVX512VP2INTERSECT__)
643 #include <avx512vp2intersectintrin.h>
644 #endif
645 
646 #if !defined(__SCE__) || __has_feature(modules) ||                             \
647     (defined(__AVX512VL__) && defined(__AVX512VP2INTERSECT__))
648 #include <avx512vlvp2intersectintrin.h>
649 #endif
650 
651 #if !defined(__SCE__) || __has_feature(modules) || defined(__ENQCMD__)
652 #include <enqcmdintrin.h>
653 #endif
654 
655 #if !defined(__SCE__) || __has_feature(modules) || defined(__SERIALIZE__)
656 #include <serializeintrin.h>
657 #endif
658 
659 #if !defined(__SCE__) || __has_feature(modules) || defined(__TSXLDTRK__)
660 #include <tsxldtrkintrin.h>
661 #endif
662 
663 #if defined(_MSC_VER) && __has_extension(gnu_asm)
664 /* Define the default attributes for these intrinsics */
665 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
666 #ifdef __cplusplus
667 extern "C" {
668 #endif
669 /*----------------------------------------------------------------------------*\
670 |* Interlocked Exchange HLE
671 \*----------------------------------------------------------------------------*/
672 #if defined(__i386__) || defined(__x86_64__)
673 static __inline__ long __DEFAULT_FN_ATTRS
674 _InterlockedExchange_HLEAcquire(long volatile *_Target, long _Value) {
675   __asm__ __volatile__(".byte 0xf2 ; lock ; xchg {%0, %1|%1, %0}"
676                        : "+r" (_Value), "+m" (*_Target) :: "memory");
677   return _Value;
678 }
679 static __inline__ long __DEFAULT_FN_ATTRS
680 _InterlockedExchange_HLERelease(long volatile *_Target, long _Value) {
681   __asm__ __volatile__(".byte 0xf3 ; lock ; xchg {%0, %1|%1, %0}"
682                        : "+r" (_Value), "+m" (*_Target) :: "memory");
683   return _Value;
684 }
685 #endif
686 #if defined(__x86_64__)
687 static __inline__ __int64 __DEFAULT_FN_ATTRS
688 _InterlockedExchange64_HLEAcquire(__int64 volatile *_Target, __int64 _Value) {
689   __asm__ __volatile__(".byte 0xf2 ; lock ; xchg {%0, %1|%1, %0}"
690                        : "+r" (_Value), "+m" (*_Target) :: "memory");
691   return _Value;
692 }
693 static __inline__ __int64 __DEFAULT_FN_ATTRS
694 _InterlockedExchange64_HLERelease(__int64 volatile *_Target, __int64 _Value) {
695   __asm__ __volatile__(".byte 0xf3 ; lock ; xchg {%0, %1|%1, %0}"
696                        : "+r" (_Value), "+m" (*_Target) :: "memory");
697   return _Value;
698 }
699 #endif
700 /*----------------------------------------------------------------------------*\
701 |* Interlocked Compare Exchange HLE
702 \*----------------------------------------------------------------------------*/
703 #if defined(__i386__) || defined(__x86_64__)
704 static __inline__ long __DEFAULT_FN_ATTRS
705 _InterlockedCompareExchange_HLEAcquire(long volatile *_Destination,
706                               long _Exchange, long _Comparand) {
707   __asm__ __volatile__(".byte 0xf2 ; lock ; cmpxchg {%2, %1|%1, %2}"
708                        : "+a" (_Comparand), "+m" (*_Destination)
709                        : "r" (_Exchange) : "memory");
710   return _Comparand;
711 }
712 static __inline__ long __DEFAULT_FN_ATTRS
713 _InterlockedCompareExchange_HLERelease(long volatile *_Destination,
714                               long _Exchange, long _Comparand) {
715   __asm__ __volatile__(".byte 0xf3 ; lock ; cmpxchg {%2, %1|%1, %2}"
716                        : "+a" (_Comparand), "+m" (*_Destination)
717                        : "r" (_Exchange) : "memory");
718   return _Comparand;
719 }
720 #endif
721 #if defined(__x86_64__)
722 static __inline__ __int64 __DEFAULT_FN_ATTRS
723 _InterlockedCompareExchange64_HLEAcquire(__int64 volatile *_Destination,
724                               __int64 _Exchange, __int64 _Comparand) {
725   __asm__ __volatile__(".byte 0xf2 ; lock ; cmpxchg {%2, %1|%1, %2}"
726                        : "+a" (_Comparand), "+m" (*_Destination)
727                        : "r" (_Exchange) : "memory");
728   return _Comparand;
729 }
730 static __inline__ __int64 __DEFAULT_FN_ATTRS
731 _InterlockedCompareExchange64_HLERelease(__int64 volatile *_Destination,
732                               __int64 _Exchange, __int64 _Comparand) {
733   __asm__ __volatile__(".byte 0xf3 ; lock ; cmpxchg {%2, %1|%1, %2}"
734                        : "+a" (_Comparand), "+m" (*_Destination)
735                        : "r" (_Exchange) : "memory");
736   return _Comparand;
737 }
738 #endif
739 #ifdef __cplusplus
740 }
741 #endif
742 
743 #undef __DEFAULT_FN_ATTRS
744 
745 #endif /* defined(_MSC_VER) && __has_extension(gnu_asm) */
746 
747 #endif /* __IMMINTRIN_H */
748