xmmintrin.h - OpenGrok cross reference for /freebsd/contrib/llvm-project/clang/lib/Headers/xmmintrin.h

Lines Matching +full:64 +full:- +full:bit
1 /*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7  *===-----------------------------------------------------------------------===
36   __attribute__((__always_inline__, __nodebug__, __target__("sse,no-evex512"), \
40                  __target__("mmx,sse,no-evex512"), __min_vector_width__(64)))
42 /// Adds the 32-bit float values in the low-order bits of the operands.
49 ///    A 128-bit vector of [4 x float] containing one of the source operands.
52 ///    A 128-bit vector of [4 x float] containing one of the source operands.
54 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
64 /// Adds two 128-bit vectors of [4 x float], and returns the results of
72 ///    A 128-bit vector of [4 x float] containing one of the source operands.
74 ///    A 128-bit vector of [4 x float] containing one of the source operands.
75 /// \returns A 128-bit vector of [4 x float] containing the sums of both
83 /// Subtracts the 32-bit float value in the low-order bits of the second
91 ///    A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
94 ///    A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
96 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
102   __a[0] -= __b[0];  in _mm_sub_ss()
107 ///    operand, both of which are 128-bit vectors of [4 x float] and returns
115 ///    A 128-bit vector of [4 x float] containing the minuend.
117 ///    A 128-bit vector of [4 x float] containing the subtrahend.
118 /// \returns A 128-bit vector of [4 x float] containing the differences between
123   return (__m128)((__v4sf)__a - (__v4sf)__b);  in _mm_sub_ps()
126 /// Multiplies two 32-bit float values in the low-order bits of the
134 ///    A 128-bit vector of [4 x float] containing one of the source operands.
137 ///    A 128-bit vector of [4 x float] containing one of the source operands.
139 /// \returns A 128-bit vector of [4 x float] containing the product of the lower
149 /// Multiplies two 128-bit vectors of [4 x float] and returns the
157 ///    A 128-bit vector of [4 x float] containing one of the source operands.
159 ///    A 128-bit vector of [4 x float] containing one of the source operands.
160 /// \returns A 128-bit vector of [4 x float] containing the products of both
168 /// Divides the value in the low-order 32 bits of the first operand by
176 ///    A 128-bit vector of [4 x float] containing the dividend. The lower 32
179 ///    A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
181 /// \returns A 128-bit vector of [4 x float] containing the quotients of the
191 /// Divides two 128-bit vectors of [4 x float].
198 ///    A 128-bit vector of [4 x float] containing the dividend.
200 ///    A 128-bit vector of [4 x float] containing the divisor.
201 /// \returns A 128-bit vector of [4 x float] containing the quotients of both
209 /// Calculates the square root of the value stored in the low-order bits
210 ///    of a 128-bit vector of [4 x float].
217 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
219 /// \returns A 128-bit vector of [4 x float] containing the square root of the
220 ///    value in the low-order bits of the operand.
227 /// Calculates the square roots of the values stored in a 128-bit vector
235 ///    A 128-bit vector of [4 x float].
236 /// \returns A 128-bit vector of [4 x float] containing the square roots of the
245 ///    low-order bits of a 128-bit vector of [4 x float].
252 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
254 /// \returns A 128-bit vector of [4 x float] containing the approximate
255 ///    reciprocal of the value in the low-order bits of the operand.
263 ///    128-bit vector of [4 x float].
270 ///    A 128-bit vector of [4 x float].
271 /// \returns A 128-bit vector of [4 x float] containing the approximate
280 ///    stored in the low-order bits of a 128-bit vector of [4 x float].
287 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
289 /// \returns A 128-bit vector of [4 x float] containing the approximate
290 ///    reciprocal of the square root of the value in the low-order bits of the
299 ///    values stored in a 128-bit vector of [4 x float].
306 ///    A 128-bit vector of [4 x float].
307 /// \returns A 128-bit vector of [4 x float] containing the approximate
315 /// Compares two 32-bit float values in the low-order bits of both
316 ///    operands and returns the lesser value in the low-order bits of the
326 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
329 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
331 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
340 /// Compares two 128-bit vectors of [4 x float] and returns the lesser
350 ///    A 128-bit vector of [4 x float] containing one of the operands.
352 ///    A 128-bit vector of [4 x float] containing one of the operands.
353 /// \returns A 128-bit vector of [4 x float] containing the minimum values
361 /// Compares two 32-bit float values in the low-order bits of both
362 ///    operands and returns the greater value in the low-order bits of a 128-bit
372 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
375 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
377 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
386 /// Compares two 128-bit vectors of [4 x float] and returns the greater
396 ///    A 128-bit vector of [4 x float] containing one of the operands.
398 ///    A 128-bit vector of [4 x float] containing one of the operands.
399 /// \returns A 128-bit vector of [4 x float] containing the maximum values
407 /// Performs a bitwise AND of two 128-bit vectors of [4 x float].
414 ///    A 128-bit vector containing one of the source operands.
416 ///    A 128-bit vector containing one of the source operands.
417 /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
425 /// Performs a bitwise AND of two 128-bit vectors of [4 x float], using
434 ///    A 128-bit vector of [4 x float] containing the first source operand. The
437 ///    A 128-bit vector of [4 x float] containing the second source operand.
438 /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
447 /// Performs a bitwise OR of two 128-bit vectors of [4 x float].
454 ///    A 128-bit vector of [4 x float] containing one of the source operands.
456 ///    A 128-bit vector of [4 x float] containing one of the source operands.
457 /// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
465 /// Performs a bitwise exclusive OR of two 128-bit vectors of
473 ///    A 128-bit vector of [4 x float] containing one of the source operands.
475 ///    A 128-bit vector of [4 x float] containing one of the source operands.
476 /// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
484 /// Compares two 32-bit float values in the low-order bits of both
488 ///    low-order bits of a vector of [4 x float].
496 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
499 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
501 /// \returns A 128-bit vector of [4 x float] containing the comparison results
502 ///    in the low-order bits.
509 /// Compares each of the corresponding 32-bit float values of the
510 ///    128-bit vectors of [4 x float] for equality.
520 ///    A 128-bit vector of [4 x float].
522 ///    A 128-bit vector of [4 x float].
523 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
530 /// Compares two 32-bit float values in the low-order bits of both
535 ///    low-order bits of a vector of [4 x float].
543 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
546 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
548 /// \returns A 128-bit vector of [4 x float] containing the comparison results
549 ///    in the low-order bits.
556 /// Compares each of the corresponding 32-bit float values of the
557 ///    128-bit vectors of [4 x float] to determine if the values in the first
568 ///    A 128-bit vector of [4 x float].
570 ///    A 128-bit vector of [4 x float].
571 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
578 /// Compares two 32-bit float values in the low-order bits of both
583 ///    the low-order bits of a vector of [4 x float].
591 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
594 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
596 /// \returns A 128-bit vector of [4 x float] containing the comparison results
597 ///    in the low-order bits.
604 /// Compares each of the corresponding 32-bit float values of the
605 ///    128-bit vectors of [4 x float] to determine if the values in the first
616 ///    A 128-bit vector of [4 x float].
618 ///    A 128-bit vector of [4 x float].
619 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
626 /// Compares two 32-bit float values in the low-order bits of both
631 ///    low-order bits of a vector of [4 x float].
639 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
642 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
644 /// \returns A 128-bit vector of [4 x float] containing the comparison results
645 ///    in the low-order bits.
654 /// Compares each of the corresponding 32-bit float values of the
655 ///    128-bit vectors of [4 x float] to determine if the values in the first
666 ///    A 128-bit vector of [4 x float].
668 ///    A 128-bit vector of [4 x float].
669 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
676 /// Compares two 32-bit float values in the low-order bits of both
681 ///    low-order bits of a vector of [4 x float].
689 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
692 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
694 /// \returns A 128-bit vector of [4 x float] containing the comparison results
695 ///    in the low-order bits.
704 /// Compares each of the corresponding 32-bit float values of the
705 ///    128-bit vectors of [4 x float] to determine if the values in the first
716 ///    A 128-bit vector of [4 x float].
718 ///    A 128-bit vector of [4 x float].
719 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
726 /// Compares two 32-bit float values in the low-order bits of both operands
730 ///    low-order bits of a vector of [4 x float].
739 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
742 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
744 /// \returns A 128-bit vector of [4 x float] containing the comparison results
745 ///    in the low-order bits.
752 /// Compares each of the corresponding 32-bit float values of the
753 ///    128-bit vectors of [4 x float] for inequality.
764 ///    A 128-bit vector of [4 x float].
766 ///    A 128-bit vector of [4 x float].
767 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
774 /// Compares two 32-bit float values in the low-order bits of both
779 ///    low-order bits of a vector of [4 x float].
788 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
791 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
793 /// \returns A 128-bit vector of [4 x float] containing the comparison results
794 ///    in the low-order bits.
801 /// Compares each of the corresponding 32-bit float values of the
802 ///    128-bit vectors of [4 x float] to determine if the values in the first
814 ///    A 128-bit vector of [4 x float].
816 ///    A 128-bit vector of [4 x float].
817 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
824 /// Compares two 32-bit float values in the low-order bits of both
829 ///    low-order bits of a vector of [4 x float].
838 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
841 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
843 /// \returns A 128-bit vector of [4 x float] containing the comparison results
844 ///    in the low-order bits.
851 /// Compares each of the corresponding 32-bit float values of the
852 ///    128-bit vectors of [4 x float] to determine if the values in the first
864 ///    A 128-bit vector of [4 x float].
866 ///    A 128-bit vector of [4 x float].
867 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
874 /// Compares two 32-bit float values in the low-order bits of both
879 ///    low-order bits of a vector of [4 x float].
888 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
891 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
893 /// \returns A 128-bit vector of [4 x float] containing the comparison results
894 ///    in the low-order bits.
903 /// Compares each of the corresponding 32-bit float values of the
904 ///    128-bit vectors of [4 x float] to determine if the values in the first
916 ///    A 128-bit vector of [4 x float].
918 ///    A 128-bit vector of [4 x float].
919 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
926 /// Compares two 32-bit float values in the low-order bits of both
931 ///    low-order bits of a vector of [4 x float].
940 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
943 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
945 /// \returns A 128-bit vector of [4 x float] containing the comparison results
946 ///    in the low-order bits.
955 /// Compares each of the corresponding 32-bit float values of the
956 ///    128-bit vectors of [4 x float] to determine if the values in the first
968 ///    A 128-bit vector of [4 x float].
970 ///    A 128-bit vector of [4 x float].
971 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
978 /// Compares two 32-bit float values in the low-order bits of both
982 ///    A pair of floating-point values are ordered with respect to each
992 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
995 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
997 /// \returns A 128-bit vector of [4 x float] containing the comparison results
998 ///    in the low-order bits.
1005 /// Compares each of the corresponding 32-bit float values of the
1006 ///    128-bit vectors of [4 x float] to determine if the values in the first
1009 ///    A pair of floating-point values are ordered with respect to each
1019 ///    A 128-bit vector of [4 x float].
1021 ///    A 128-bit vector of [4 x float].
1022 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
1029 /// Compares two 32-bit float values in the low-order bits of both
1033 ///    A pair of floating-point values are unordered with respect to each
1043 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
1046 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
1048 /// \returns A 128-bit vector of [4 x float] containing the comparison results
1049 ///    in the low-order bits.
1056 /// Compares each of the corresponding 32-bit float values of the
1057 ///    128-bit vectors of [4 x float] to determine if the values in the first
1060 ///    A pair of floating-point values are unordered with respect to each
1070 ///    A 128-bit vector of [4 x float].
1072 ///    A 128-bit vector of [4 x float].
1073 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
1080 /// Compares two 32-bit float values in the low-order bits of both
1092 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1095 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1104 /// Compares two 32-bit float values in the low-order bits of both
1117 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1120 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1129 /// Compares two 32-bit float values in the low-order bits of both
1141 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1144 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1153 /// Compares two 32-bit float values in the low-order bits of both
1165 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1168 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1177 /// Compares two 32-bit float values in the low-order bits of both
1189 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1192 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1201 /// Compares two 32-bit float values in the low-order bits of both
1213 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1216 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1225 /// Performs an unordered comparison of two 32-bit float values using
1226 ///    the low-order bits of both operands to determine equality.
1236 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1239 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1248 /// Performs an unordered comparison of two 32-bit float values using
1249 ///    the low-order bits of both operands to determine if the first operand is
1260 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1263 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1272 /// Performs an unordered comparison of two 32-bit float values using
1273 ///    the low-order bits of both operands to determine if the first operand is
1284 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1287 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1296 /// Performs an unordered comparison of two 32-bit float values using
1297 ///    the low-order bits of both operands to determine if the first operand is
1308 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1311 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1320 /// Performs an unordered comparison of two 32-bit float values using
1321 ///    the low-order bits of both operands to determine if the first operand is
1332 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1335 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1344 /// Performs an unordered comparison of two 32-bit float values using
1345 ///    the low-order bits of both operands to determine inequality.
1355 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1358 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1368 ///    [4 x float] into a 32-bit integer.
1370 ///    If the converted value does not fit in a 32-bit integer, raises a
1371 ///    floating-point invalid exception. If the exception is masked, returns
1380 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1382 /// \returns A 32-bit integer containing the converted value.
1390 ///    [4 x float] into a 32-bit integer.
1392 ///    If the converted value does not fit in a 32-bit integer, raises a
1393 ///    floating-point invalid exception. If the exception is masked, returns
1402 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1404 /// \returns A 32-bit integer containing the converted value.
1414 ///    [4 x float] into a 64-bit integer.
1416 ///    If the converted value does not fit in a 64-bit integer, raises a
1417 ///    floating-point invalid exception. If the exception is masked, returns
1426 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1428 /// \returns A 64-bit integer containing the converted value.
1437 /// Converts two low-order float values in a 128-bit vector of
1438 ///    [4 x float] into a 64-bit vector of [2 x i32].
1440 ///    If a converted value does not fit in a 32-bit integer, raises a
1441 ///    floating-point invalid exception. If the exception is masked, returns
1449 ///    A 128-bit vector of [4 x float].
1450 /// \returns A 64-bit integer vector containing the converted values.
1457 /// Converts two low-order float values in a 128-bit vector of
1458 ///    [4 x float] into a 64-bit vector of [2 x i32].
1460 ///    If a converted value does not fit in a 32-bit integer, raises a
1461 ///    floating-point invalid exception. If the exception is masked, returns
1469 ///    A 128-bit vector of [4 x float].
1470 /// \returns A 64-bit integer vector containing the converted values.
1478 ///    truncated (rounded toward zero) 32-bit integer.
1480 ///    If the converted value does not fit in a 32-bit integer, raises a
1481 ///    floating-point invalid exception. If the exception is masked, returns
1490 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1492 /// \returns A 32-bit integer containing the converted value.
1500 ///    truncated (rounded toward zero) 32-bit integer.
1502 ///    If the converted value does not fit in a 32-bit integer, raises a
1503 ///    floating-point invalid exception. If the exception is masked, returns
1512 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1514 /// \returns A 32-bit integer containing the converted value.
1523 ///    truncated (rounded toward zero) 64-bit integer.
1525 ///    If the converted value does not fit in a 64-bit integer, raises a
1526 ///    floating-point invalid exception. If the exception is masked, returns
1535 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1537 /// \returns A 64-bit integer containing the converted value.
1545 /// Converts the lower (first) two elements of a 128-bit vector of [4 x float]
1546 ///    into two signed truncated (rounded toward zero) 32-bit integers,
1547 ///    returned in a 64-bit vector of [2 x i32].
1549 ///    If a converted value does not fit in a 32-bit integer, raises a
1550 ///    floating-point invalid exception. If the exception is masked, returns
1559 ///    A 128-bit vector of [4 x float].
1560 /// \returns A 64-bit integer vector containing the converted values.
1567 /// Converts the lower (first) two elements of a 128-bit vector of [4 x float]
1568 ///    into two signed truncated (rounded toward zero) 32-bit integers,
1569 ///    returned in a 64-bit vector of [2 x i32].
1571 ///    If a converted value does not fit in a 32-bit integer, raises a
1572 ///    floating-point invalid exception. If the exception is masked, returns
1580 ///    A 128-bit vector of [4 x float].
1581 /// \returns A 64-bit integer vector containing the converted values.
1588 /// Converts a 32-bit signed integer value into a floating point value
1598 ///    A 128-bit vector of [4 x float].
1600 ///    A 32-bit signed integer operand containing the value to be converted.
1601 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1611 /// Converts a 32-bit signed integer value into a floating point value
1621 ///    A 128-bit vector of [4 x float].
1623 ///    A 32-bit signed integer operand containing the value to be converted.
1624 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1635 /// Converts a 64-bit signed integer value into a floating point value
1645 ///    A 128-bit vector of [4 x float].
1647 ///    A 64-bit signed integer operand containing the value to be converted.
1648 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1660 /// Converts two elements of a 64-bit vector of [2 x i32] into two
1661 ///    floating point values and writes them to the lower 64 bits of the
1670 ///    A 128-bit vector of [4 x float].
1672 ///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
1673 ///    and written to the corresponding low-order elements in the destination.
1674 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1675 ///    converted value of the second operand. The upper 64 bits are copied from
1676 ///    the upper 64 bits of the first operand.
1683 /// Converts two elements of a 64-bit vector of [2 x i32] into two
1684 ///    floating point values and writes them to the lower 64 bits of the
1693 ///    A 128-bit vector of [4 x float].
1695 ///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
1696 ///    and written to the corresponding low-order elements in the destination.
1697 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1698 ///    converted value from the second operand. The upper 64 bits are copied
1699 ///    from the upper 64 bits of the first operand.
1714 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1716 /// \returns A 32-bit float containing the extracted value.
1724 ///     high-order bits of a 128-bit vector of [4 x float]. The low-order bits
1725 ///     are copied from the low-order bits of the first operand.
1732 ///    A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
1736 ///    [127:64] of the destination.
1737 /// \returns A 128-bit vector of [4 x float] containing the moved values.
1745   __mm_loadh_pi_v2f32 __b = ((const struct __mm_loadh_pi_struct*)__p)->__u;  in _mm_loadh_pi()
1751 ///    low-order bits of a 128-bit vector of [4 x float]. The high-order bits
1752 ///    are copied from the high-order bits of the first operand.
1759 ///    A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
1760 ///    [127:64] of the destination.
1764 /// \returns A 128-bit vector of [4 x float] containing the moved values.
1772   __mm_loadl_pi_v2f32 __b = ((const struct __mm_loadl_pi_struct*)__p)->__u;  in _mm_loadl_pi()
1777 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
1778 ///    32 bits of the vector are initialized with the single-precision
1779 ///    floating-point value loaded from a specified memory location. The upper
1787 ///    A pointer to a 32-bit memory location containing a single-precision
1788 ///    floating-point value.
1789 /// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1798   float __u = ((const struct __mm_load_ss_struct*)__p)->__u;  in _mm_load_ss()
1802 /// Loads a 32-bit float value and duplicates it to all four vector
1803 ///    elements of a 128-bit vector of [4 x float].
1812 /// \returns A 128-bit vector of [4 x float] containing the loaded and
1820   float __u = ((const struct __mm_load1_ps_struct*)__p)->__u;  in _mm_load1_ps()
1826 /// Loads a 128-bit floating-point vector of [4 x float] from an aligned
1834 ///    A pointer to a 128-bit memory location. The address of the memory
1835 ///    location has to be 128-bit aligned.
1836 /// \returns A 128-bit vector of [4 x float] containing the loaded values.
1843 /// Loads a 128-bit floating-point vector of [4 x float] from an
1851 ///    A pointer to a 128-bit memory location. The address of the memory
1853 /// \returns A 128-bit vector of [4 x float] containing the loaded values.
1860   return ((const struct __loadu_ps*)__p)->__v;  in _mm_loadu_ps()
1864 ///    memory location to 32-bit elements in a 128-bit vector of [4 x float].
1872 ///    A pointer to a 128-bit memory location. The address of the memory
1873 ///    location has to be 128-bit aligned.
1874 /// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
1883 /// Create a 128-bit vector of [4 x float] with undefined values.
1889 /// \returns A 128-bit vector of [4 x float] containing undefined values.
1896 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
1897 ///    32 bits of the vector are initialized with the specified single-precision
1898 ///    floating-point value. The upper 96 bits are set to zero.
1905 ///    A single-precision floating-point value used to initialize the lower 32
1907 /// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1916 /// Constructs a 128-bit floating-point vector of [4 x float], with each
1917 ///    of the four single-precision floating-point vector elements set to the
1918 ///    specified single-precision floating-point value.
1925 ///    A single-precision floating-point value used to initialize each vector
1927 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1935 /// Constructs a 128-bit floating-point vector of [4 x float], with each
1936 ///    of the four single-precision floating-point vector elements set to the
1937 ///    specified single-precision floating-point value.
1944 ///    A single-precision floating-point value used to initialize each vector
1946 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1953 /// Constructs a 128-bit floating-point vector of [4 x float]
1954 ///    initialized with the specified single-precision floating-point values.
1962 ///    A single-precision floating-point value used to initialize bits [127:96]
1965 ///    A single-precision floating-point value used to initialize bits [95:64]
1968 ///    A single-precision floating-point value used to initialize bits [63:32]
1971 ///    A single-precision floating-point value used to initialize bits [31:0]
1973 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1980 /// Constructs a 128-bit floating-point vector of [4 x float],
1981 ///    initialized in reverse order with the specified 32-bit single-precision
1982 ///    floating-point values.
1990 ///    A single-precision floating-point value used to initialize bits [31:0]
1993 ///    A single-precision floating-point value used to initialize bits [63:32]
1996 ///    A single-precision floating-point value used to initialize bits [95:64]
1999 ///    A single-precision floating-point value used to initialize bits [127:96]
2001 /// \returns An initialized 128-bit floating-point vector of [4 x float].
2008 /// Constructs a 128-bit floating-point vector of [4 x float] initialized
2015 /// \returns An initialized 128-bit floating-point vector of [4 x float] with
2023 /// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
2031 ///    A pointer to a 64-bit memory location.
2033 ///    A 128-bit vector of [4 x float] containing the values to be stored.
2041   ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 2, 3);  in _mm_storeh_pi()
2044 /// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
2054 ///    A 128-bit vector of [4 x float] containing the values to be stored.
2062   ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 0, 1);  in _mm_storel_pi()
2065 /// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
2073 ///    A pointer to a 32-bit memory location.
2075 ///    A 128-bit vector of [4 x float] containing the value to be stored.
2082   ((struct __mm_store_ss_struct*)__p)->__u = __a[0];  in _mm_store_ss()
2085 /// Stores a 128-bit vector of [4 x float] to an unaligned memory
2093 ///    A pointer to a 128-bit memory location. The address of the memory
2096 ///    A 128-bit vector of [4 x float] containing the values to be stored.
2103   ((struct __storeu_ps*)__p)->__v = __a;  in _mm_storeu_ps()
2106 /// Stores a 128-bit vector of [4 x float] into an aligned memory
2114 ///    A pointer to a 128-bit memory location. The address of the memory
2115 ///    location has to be 16-byte aligned.
2117 ///    A 128-bit vector of [4 x float] containing the values to be stored.
2124 /// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
2133 ///    A pointer to a 128-bit memory location.
2135 ///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
2144 /// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
2153 ///    A pointer to a 128-bit memory location.
2155 ///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
2163 /// Stores float values from a 128-bit vector of [4 x float] to an
2172 ///    A pointer to a 128-bit memory location. The address of the memory
2173 ///    location has to be 128-bit aligned.
2175 ///    A 128-bit vector of [4 x float] containing the values to be stored.
2210 ///    _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The
2222 /// Stores a 64-bit integer in the specified aligned memory location. To
2223 ///    minimize caching, the data is flagged as non-temporal (unlikely to be
2233 ///    A 64-bit integer containing the value to be stored.
2240 /// Moves packed float values from a 128-bit vector of [4 x float] to a
2241 ///    128-bit aligned memory location. To minimize caching, the data is flagged
2242 ///    as non-temporal (unlikely to be used again soon).
2249 ///    A pointer to a 128-bit aligned memory location that will receive the
2250 ///    single-precision floating-point values.
2252 ///    A 128-bit vector of [4 x float] containing the values to be moved.
2278 /// Extracts a 16-bit element from a 64-bit vector of [4 x i16] and
2290 ///    A 64-bit vector of [4 x i16].
2297 /// \returns A 16-bit integer containing the extracted 16 bits of packed data.
2301 /// Copies data from the 64-bit vector of [4 x i16] to the destination,
2302 ///    and inserts the lower 16 bits of an integer operand at the 16-bit offset
2314 ///    A 64-bit vector of [4 x i16].
2316 ///    An integer. The lower 16-bit value from this operand is written to the
2327 /// \returns A 64-bit integer vector containing the copied packed data from the
2332 /// Compares each of the corresponding packed 16-bit integer values of
2333 ///    the 64-bit integer vectors, and writes the greater value to the
2341 ///    A 64-bit integer vector containing one of the source operands.
2343 ///    A 64-bit integer vector containing one of the source operands.
2344 /// \returns A 64-bit integer vector containing the comparison results.
2351 /// Compares each of the corresponding packed 8-bit unsigned integer
2352 ///    values of the 64-bit integer vectors, and writes the greater value to the
2360 ///    A 64-bit integer vector containing one of the source operands.
2362 ///    A 64-bit integer vector containing one of the source operands.
2363 /// \returns A 64-bit integer vector containing the comparison results.
2370 /// Compares each of the corresponding packed 16-bit integer values of
2371 ///    the 64-bit integer vectors, and writes the lesser value to the
2379 ///    A 64-bit integer vector containing one of the source operands.
2381 ///    A 64-bit integer vector containing one of the source operands.
2382 /// \returns A 64-bit integer vector containing the comparison results.
2389 /// Compares each of the corresponding packed 8-bit unsigned integer
2390 ///    values of the 64-bit integer vectors, and writes the lesser value to the
2398 ///    A 64-bit integer vector containing one of the source operands.
2400 ///    A 64-bit integer vector containing one of the source operands.
2401 /// \returns A 64-bit integer vector containing the comparison results.
2408 /// Takes the most significant bit from each 8-bit element in a 64-bit
2409 ///    integer vector to create an 8-bit mask value. Zero-extends the value to
2410 ///    32-bit integer and writes it to the destination.
2417 ///    A 64-bit integer vector containing the values with bits to be extracted.
2418 /// \returns The most significant bit from each 8-bit element in \a __a,
2426 /// Multiplies packed 16-bit unsigned integer values and writes the
2427 ///    high-order 16 bits of each 32-bit product to the corresponding bits in
2435 ///    A 64-bit integer vector containing one of the source operands.
2437 ///    A 64-bit integer vector containing one of the source operands.
2438 /// \returns A 64-bit integer vector containing the products of both operands.
2445 /// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
2457 ///    A 64-bit integer vector containing the values to be shuffled.
2459 ///    An immediate value containing an 8-bit value specifying which elements to
2460 ///    copy from \a a. The destinations within the 64-bit destination are
2470 ///    Bit value assignments: \n
2476 ///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
2478 /// \returns A 64-bit integer vector containing the shuffled values.
2482 /// Conditionally copies the values from each 8-bit element in the first
2483 ///    64-bit integer vector operand to the specified memory location, as
2484 ///    specified by the most significant bit in the corresponding element in the
2485 ///    second 64-bit integer vector operand.
2487 ///    To minimize caching, the data is flagged as non-temporal
2495 ///    A 64-bit integer vector containing the values with elements to be copied.
2497 ///    A 64-bit integer vector operand. The most significant bit from each 8-bit
2499 ///    is copied. If the most significant bit of a given element is 1, the
2502 ///    A pointer to a 64-bit memory location that will receive the conditionally
2511 /// Computes the rounded averages of the packed unsigned 8-bit integer
2520 ///    A 64-bit integer vector containing one of the source operands.
2522 ///    A 64-bit integer vector containing one of the source operands.
2523 /// \returns A 64-bit integer vector containing the averages of both operands.
2530 /// Computes the rounded averages of the packed unsigned 16-bit integer
2539 ///    A 64-bit integer vector containing one of the source operands.
2541 ///    A 64-bit integer vector containing one of the source operands.
2542 /// \returns A 64-bit integer vector containing the averages of both operands.
2549 /// Subtracts the corresponding 8-bit unsigned integer values of the two
2550 ///    64-bit vector operands and computes the absolute value for each of the
2559 ///    A 64-bit integer vector containing one of the source operands.
2561 ///    A 64-bit integer vector containing one of the source operands.
2562 /// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
2575 /// Returns the contents of the MXCSR register as a 32-bit unsigned
2598 ///      For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
2602 ///      For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
2623 /// \returns A 32-bit unsigned integer containing the contents of the MXCSR
2627 /// Sets the MXCSR register with the 32-bit unsigned integer value.
2650 ///      For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
2655 ///      For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
2661 ///    For example, the following expression causes subsequent floating-point
2678 ///    A 32-bit unsigned integer value to be written to the MXCSR register.
2685 /// Selects 4 float values from the 128-bit operands of [4 x float], as
2697 ///    A 128-bit vector of [4 x float].
2699 ///    A 128-bit vector of [4 x float].
2701 ///    An immediate value containing an 8-bit value specifying which elements to
2705 ///    The destinations within the 128-bit destination are assigned values as
2711 ///    Bits [5:4] are used to assign values to bits [95:64] in the
2715 ///    Bit value assignments: \n
2718 ///    10: Bits [95:64] copied from the specified operand. \n
2721 ///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
2723 /// \returns A 128-bit vector of [4 x float] containing the shuffled values.
2728 /// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
2729 ///    [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2736 ///    A 128-bit vector of [4 x float]. \n
2737 ///    Bits [95:64] are written to bits [31:0] of the destination. \n
2738 ///    Bits [127:96] are written to bits [95:64] of the destination.
2740 ///    A 128-bit vector of [4 x float]. \n
2741 ///    Bits [95:64] are written to bits [63:32] of the destination. \n
2743 /// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2750 /// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
2751 ///    [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2758 ///    A 128-bit vector of [4 x float]. \n
2760 ///    Bits [63:32] are written to bits [95:64] of the destination.
2762 ///    A 128-bit vector of [4 x float]. \n
2765 /// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2772 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2782 ///    A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
2785 ///    A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
2787 /// \returns A 128-bit floating-point vector of [4 x float].
2795 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2796 ///    64 bits are set to the upper 64 bits of the second parameter. The upper
2797 ///    64 bits are set to the upper 64 bits of the first parameter.
2804 ///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2805 ///    written to the upper 64 bits of the result.
2807 ///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2808 ///    written to the lower 64 bits of the result.
2809 /// \returns A 128-bit floating-point vector of [4 x float].
2816 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2817 ///    64 bits are set to the lower 64 bits of the first parameter. The upper
2818 ///    64 bits are set to the lower 64 bits of the second parameter.
2825 ///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2826 ///    written to the lower 64 bits of the result.
2828 ///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2829 ///    written to the upper 64 bits of the result.
2830 /// \returns A 128-bit floating-point vector of [4 x float].
2837 /// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
2845 ///    A 64-bit vector of [4 x i16]. The elements of the destination are copied
2847 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2867 /// Converts a 64-bit vector of 16-bit unsigned integer values into a
2868 ///    128-bit vector of [4 x float].
2875 ///    A 64-bit vector of 16-bit unsigned integer values. The elements of the
2877 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2896 /// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
2897 ///    into a 128-bit vector of [4 x float].
2904 ///    A 64-bit vector of [8 x i8]. The elements of the destination are copied
2906 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2920 /// Converts the lower four unsigned 8-bit integer values from a 64-bit
2921 ///    vector of [8 x u8] into a 128-bit vector of [4 x float].
2928 ///    A 64-bit vector of unsigned 8-bit integer values. The elements of the
2931 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2944 /// Converts the two 32-bit signed integer values from each 64-bit vector
2945 ///    operand of [2 x i32] into a 128-bit vector of [4 x float].
2952 ///    A 64-bit vector of [2 x i32]. The lower elements of the destination are
2955 ///    A 64-bit vector of [2 x i32]. The upper elements of the destination are
2957 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
2958 ///    copied and converted values from the first operand. The upper 64 bits
2972 /// Converts each single-precision floating-point element of a 128-bit
2973 ///    floating-point vector of [4 x float] into a 16-bit signed integer, and
2974 ///    packs the results into a 64-bit integer vector of [4 x i16].
2976 ///    If the floating-point element is NaN or infinity, or if the
2977 ///    floating-point element is greater than 0x7FFFFFFF or less than -0x8000,
2978 ///    it is converted to 0x8000. Otherwise if the floating-point element is
2986 ///    A 128-bit floating-point vector of [4 x float].
2987 /// \returns A 64-bit integer vector of [4 x i16] containing the converted
3001 /// Converts each single-precision floating-point element of a 128-bit
3002 ///    floating-point vector of [4 x float] into an 8-bit signed integer, and
3003 ///    packs the results into the lower 32 bits of a 64-bit integer vector of
3006 ///    If the floating-point element is NaN or infinity, or if the
3007 ///    floating-point element is greater than 0x7FFFFFFF or less than -0x80, it
3008 ///    is converted to 0x80. Otherwise if the floating-point element is greater
3016 ///    A 128-bit floating-point vector of [4 x float].
3017 /// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
3030 /// Extracts the sign bits from each single-precision floating-point
3031 ///    element of a 128-bit floating-point vector of [4 x float] and returns the
3040 ///    A 128-bit floating-point vector of [4 x float].
3041 /// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
3042 ///    single-precision floating-point element of the parameter. Bits [31:4] are
3051 #define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
3052 #define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
3053 #define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
3054 #define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
3055 #define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
3056 #define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
3057 #define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
3058 #define _CMP_ORD_Q    0x07 /* Ordered (non-signaling)   */
3060 /// Compares each of the corresponding values of two 128-bit vectors of
3077 ///    A 128-bit vector of [4 x float].
3079 ///    A 128-bit vector of [4 x float].
3083 ///    0x00: Equal (ordered, non-signaling) \n
3084 ///    0x01: Less-than (ordered, signaling) \n
3085 ///    0x02: Less-than-or-equal (ordered, signaling) \n
3086 ///    0x03: Unordered (non-signaling) \n
3087 ///    0x04: Not-equal (unordered, non-signaling) \n
3088 ///    0x05: Not-less-than (unordered, signaling) \n
3089 ///    0x06: Not-less-than-or-equal (unordered, signaling) \n
3090 ///    0x07: Ordered (non-signaling) \n
3091 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
3095 /// Compares each of the corresponding scalar values of two 128-bit
3112 ///    A 128-bit vector of [4 x float].
3114 ///    A 128-bit vector of [4 x float].
3118 ///    0x00: Equal (ordered, non-signaling) \n
3119 ///    0x01: Less-than (ordered, signaling) \n
3120 ///    0x02: Less-than-or-equal (ordered, signaling) \n
3121 ///    0x03: Unordered (non-signaling) \n
3122 ///    0x04: Not-equal (unordered, non-signaling) \n
3123 ///    0x05: Not-less-than (unordered, signaling) \n
3124 ///    0x06: Not-less-than-or-equal (unordered, signaling) \n
3125 ///    0x07: Ordered (non-signaling) \n
3126 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
3202 /* Ugly hack for backwards-compatibility (compatible with gcc) */