//===-- X86ISelLowering.h - X86 DAG Lowering Interface ----------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
#define LLVM_LIB_TARGET_X86_X86ISELLOWERING_H

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/TargetLowering.h"

namespace llvm {
  class X86Subtarget;
  class X86TargetMachine;

  namespace X86ISD {
    // X86 Specific DAG Nodes
    enum NodeType : unsigned {
      // Start the numbering where the builtin ops leave off.
      FIRST_NUMBER = ISD::BUILTIN_OP_END,

      /// Bit scan forward.
      BSF,
      /// Bit scan reverse.
      BSR,

      /// X86 funnel/double shift i16 instructions. These correspond to
      /// X86::SHLDW and X86::SHRDW instructions which have different amt
      /// modulo rules to generic funnel shifts.
      /// NOTE: The operand order matches ISD::FSHL/FSHR not SHLD/SHRD.
      FSHL,
      FSHR,

      /// Bitwise logical AND of floating point values. This corresponds
      /// to X86::ANDPS or X86::ANDPD.
      FAND,

      /// Bitwise logical OR of floating point values. This corresponds
      /// to X86::ORPS or X86::ORPD.
      FOR,

      /// Bitwise logical XOR of floating point values. This corresponds
      /// to X86::XORPS or X86::XORPD.
      FXOR,

      /// Bitwise logical ANDNOT of floating point values. This
      /// corresponds to X86::ANDNPS or X86::ANDNPD.
      FANDN,

      /// These operations represent an abstract X86 call
      /// instruction, which includes a bunch of information. In particular the
      /// operands of these nodes are:
      ///
      /// #0 - The incoming token chain
      /// #1 - The callee
      /// #2 - The number of arg bytes the caller pushes on the stack.
      /// #3 - The number of arg bytes the callee pops off the stack.
      /// #4 - The value to pass in AL/AX/EAX (optional)
      /// #5 - The value to pass in DL/DX/EDX (optional)
      ///
      /// The result values of these nodes are:
      ///
      /// #0 - The outgoing token chain
      /// #1 - The first register result value (optional)
      /// #2 - The second register result value (optional)
      ///
      CALL,

      /// Same as call except it adds the NoTrack prefix.
      NT_CALL,

      // Pseudo for an ObjC call that gets emitted together with a special
      // marker instruction.
      CALL_RVMARKER,

      /// X86 compare and logical compare instructions.
      CMP,
      FCMP,
      COMI,
      UCOMI,

      /// X86 bit-test instructions.
      BT,

      /// X86 SetCC. Operand 0 is condition code, and operand 1 is the EFLAGS
      /// operand, usually produced by a CMP instruction.
      SETCC,

      /// X86 Select
      SELECTS,

      // Same as SETCC except it's materialized with a sbb and the value is all
      // ones or all zeros.
      SETCC_CARRY, // R = carry_bit ? ~0 : 0

      /// X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
      /// Operands are two FP values to compare; result is a mask of
      /// 0s or 1s. Generally DTRT for C/C++ with NaNs.
      FSETCC,

      /// X86 FP SETCC, similar to above, but with output as an i1 mask and
      /// a version with SAE.
      FSETCCM,
      FSETCCM_SAE,

      /// X86 conditional moves. Operand 0 and operand 1 are the two values
      /// to select from. Operand 2 is the condition code, and operand 3 is the
      /// flag operand produced by a CMP or TEST instruction.
      CMOV,

      /// X86 conditional branches. Operand 0 is the chain operand, operand 1
      /// is the block to branch if condition is true, operand 2 is the
      /// condition code, and operand 3 is the flag operand produced by a CMP
      /// or TEST instruction.
      BRCOND,

      /// BRIND node with NoTrack prefix. Operand 0 is the chain operand and
      /// operand 1 is the target address.
      NT_BRIND,

      /// Return with a flag operand. Operand 0 is the chain operand, operand
      /// 1 is the number of bytes of stack to pop.
      RET_FLAG,

      /// Return from interrupt. Operand 0 is the number of bytes to pop.
      IRET,

      /// Repeat fill, corresponds to X86::REP_STOSx.
      REP_STOS,

      /// Repeat move, corresponds to X86::REP_MOVSx.
      REP_MOVS,

      /// On Darwin, this node represents the result of the popl
      /// at function entry, used for PIC code.
      GlobalBaseReg,

      /// A wrapper node for TargetConstantPool, TargetJumpTable,
      /// TargetExternalSymbol, TargetGlobalAddress, TargetGlobalTLSAddress,
      /// MCSymbol and TargetBlockAddress.
      Wrapper,

      /// Special wrapper used under X86-64 PIC mode for RIP
      /// relative displacements.
      WrapperRIP,

      /// Copies a 64-bit value from an MMX vector to the low word
      /// of an XMM vector, with the high word zero filled.
      MOVQ2DQ,

      /// Copies a 64-bit value from the low word of an XMM vector
      /// to an MMX vector.
      MOVDQ2Q,

      /// Copies a 32-bit value from the low word of a MMX
      /// vector to a GPR.
      MMX_MOVD2W,

      /// Copies a GPR into the low 32-bit word of a MMX vector
      /// and zero out the high word.
      MMX_MOVW2D,

      /// Extract an 8-bit value from a vector and zero extend it to
      /// i32, corresponds to X86::PEXTRB.
      PEXTRB,

      /// Extract a 16-bit value from a vector and zero extend it to
      /// i32, corresponds to X86::PEXTRW.
      PEXTRW,

      /// Insert any element of a 4 x float vector into any element
      /// of a destination 4 x float vector.
      INSERTPS,

      /// Insert the lower 8-bits of a 32-bit value to a vector,
      /// corresponds to X86::PINSRB.
      PINSRB,

      /// Insert the lower 16-bits of a 32-bit value to a vector,
      /// corresponds to X86::PINSRW.
      PINSRW,

      /// Shuffle 16 8-bit values within a vector.
      PSHUFB,

      /// Compute Sum of Absolute Differences.
      PSADBW,
      /// Compute Double Block Packed Sum-Absolute-Differences
      DBPSADBW,

      /// Bitwise Logical AND NOT of Packed FP values.
      ANDNP,

      /// Blend where the selector is an immediate.
      BLENDI,

      /// Dynamic (non-constant condition) vector blend where only the sign bits
      /// of the condition elements are used. This is used to enforce that the
      /// condition mask is not valid for generic VSELECT optimizations. This
      /// is also used to implement the intrinsics.
      /// Operands are in VSELECT order: MASK, TRUE, FALSE
      BLENDV,

      /// Combined add and sub on an FP vector.
      ADDSUB,

      // FP vector ops with rounding mode.
      FADD_RND,
      FADDS,
      FADDS_RND,
      FSUB_RND,
      FSUBS,
      FSUBS_RND,
      FMUL_RND,
      FMULS,
      FMULS_RND,
      FDIV_RND,
      FDIVS,
      FDIVS_RND,
      FMAX_SAE,
      FMAXS_SAE,
      FMIN_SAE,
      FMINS_SAE,
      FSQRT_RND,
      FSQRTS,
      FSQRTS_RND,

      // FP vector get exponent.
      FGETEXP,
      FGETEXP_SAE,
      FGETEXPS,
      FGETEXPS_SAE,
      // Extract Normalized Mantissas.
      VGETMANT,
      VGETMANT_SAE,
      VGETMANTS,
      VGETMANTS_SAE,
      // FP Scale.
      SCALEF,
      SCALEF_RND,
      SCALEFS,
      SCALEFS_RND,

      // Unsigned Integer average.
      AVG,

      /// Integer horizontal add/sub.
      HADD,
      HSUB,

      /// Floating point horizontal add/sub.
      FHADD,
      FHSUB,

      // Detect Conflicts Within a Vector
      CONFLICT,

      /// Floating point max and min.
      FMAX,
      FMIN,

      /// Commutative FMIN and FMAX.
      FMAXC,
      FMINC,

      /// Scalar intrinsic floating point max and min.
      FMAXS,
      FMINS,

      /// Floating point reciprocal-sqrt and reciprocal approximation.
      /// Note that these typically require refinement
      /// in order to obtain suitable precision.
      FRSQRT,
      FRCP,

      // AVX-512 reciprocal approximations with a little more precision.
      RSQRT14,
      RSQRT14S,
      RCP14,
      RCP14S,

      // Thread Local Storage.
      TLSADDR,

      // Thread Local Storage. A call to get the start address
      // of the TLS block for the current module.
      TLSBASEADDR,

      // Thread Local Storage. When calling to an OS provided
      // thunk at the address from an earlier relocation.
      TLSCALL,

      // Exception Handling helpers.
      EH_RETURN,

      // SjLj exception handling setjmp.
      EH_SJLJ_SETJMP,

      // SjLj exception handling longjmp.
      EH_SJLJ_LONGJMP,

      // SjLj exception handling dispatch.
      EH_SJLJ_SETUP_DISPATCH,

      /// Tail call return. See X86TargetLowering::LowerCall for
      /// the list of operands.
      TC_RETURN,

      // Vector move to low scalar and zero higher vector elements.
      VZEXT_MOVL,

      // Vector integer truncate.
      VTRUNC,
      // Vector integer truncate with unsigned/signed saturation.
      VTRUNCUS,
      VTRUNCS,

      // Masked version of the above. Used when less than a 128-bit result is
      // produced since the mask only applies to the lower elements and can't
      // be represented by a select.
      // SRC, PASSTHRU, MASK
      VMTRUNC,
      VMTRUNCUS,
      VMTRUNCS,

      // Vector FP extend.
      VFPEXT,
      VFPEXT_SAE,
      VFPEXTS,
      VFPEXTS_SAE,

      // Vector FP round.
      VFPROUND,
      VFPROUND_RND,
      VFPROUNDS,
      VFPROUNDS_RND,

      // Masked version of above. Used for v2f64->v4f32.
      // SRC, PASSTHRU, MASK
      VMFPROUND,

      // 128-bit vector logical left / right shift
      VSHLDQ,
      VSRLDQ,

      // Vector shift elements
      VSHL,
      VSRL,
      VSRA,

      // Vector variable shift
      VSHLV,
      VSRLV,
      VSRAV,

      // Vector shift elements by immediate
      VSHLI,
      VSRLI,
      VSRAI,

      // Shifts of mask registers.
      KSHIFTL,
      KSHIFTR,

      // Bit rotate by immediate
      VROTLI,
      VROTRI,

      // Vector packed double/float comparison.
      CMPP,

      // Vector integer comparisons.
      PCMPEQ,
      PCMPGT,

      // v8i16 Horizontal minimum and position.
      PHMINPOS,

      MULTISHIFT,

      /// Vector comparison generating mask bits for fp and
      /// integer signed and unsigned data types.
      CMPM,
      // Vector mask comparison generating mask bits for FP values.
      CMPMM,
      // Vector mask comparison with SAE for FP values.
      CMPMM_SAE,

      // Arithmetic operations with FLAGS results.
      ADD,
      SUB,
      ADC,
      SBB,
      SMUL,
      UMUL,
      OR,
      XOR,
      AND,

      // Bit field extract.
      BEXTR,
      BEXTRI,

      // Zero High Bits Starting with Specified Bit Position.
      BZHI,

      // Parallel extract and deposit.
      PDEP,
      PEXT,

      // X86-specific multiply by immediate.
      MUL_IMM,

      // Vector sign bit extraction.
      MOVMSK,

      // Vector bitwise comparisons.
      PTEST,

      // Vector packed fp sign bitwise comparisons.
      TESTP,

      // OR/AND test for masks.
      KORTEST,
      KTEST,

      // ADD for masks.
      KADD,

      // Several flavors of instructions with vector shuffle behaviors.
      // Saturated signed/unsigned packing.
      PACKSS,
      PACKUS,
      // Intra-lane alignr.
      PALIGNR,
      // AVX512 inter-lane alignr.
      VALIGN,
      PSHUFD,
      PSHUFHW,
      PSHUFLW,
      SHUFP,
      // VBMI2 Concat & Shift.
      VSHLD,
      VSHRD,
      VSHLDV,
      VSHRDV,
      // Shuffle Packed Values at 128-bit granularity.
      SHUF128,
      MOVDDUP,
      MOVSHDUP,
      MOVSLDUP,
      MOVLHPS,
      MOVHLPS,
      MOVSD,
      MOVSS,
      MOVSH,
      UNPCKL,
      UNPCKH,
      VPERMILPV,
      VPERMILPI,
      VPERMI,
      VPERM2X128,

      // Variable Permute (VPERM).
      // Res = VPERMV MaskV, V0
      VPERMV,

      // 3-op Variable Permute (VPERMT2).
      // Res = VPERMV3 V0, MaskV, V1
      VPERMV3,

      // Bitwise ternary logic.
      VPTERNLOG,
      // Fix Up Special Packed Float32/64 values.
      VFIXUPIMM,
      VFIXUPIMM_SAE,
      VFIXUPIMMS,
      VFIXUPIMMS_SAE,
      // Range Restriction Calculation For Packed Pairs of Float32/64 values.
      VRANGE,
      VRANGE_SAE,
      VRANGES,
      VRANGES_SAE,
      // Reduce - Perform Reduction Transformation on scalar/packed FP.
      VREDUCE,
      VREDUCE_SAE,
      VREDUCES,
      VREDUCES_SAE,
      // RndScale - Round FP Values To Include A Given Number Of Fraction Bits.
      // Also used by the legacy (V)ROUND intrinsics where we mask out the
      // scaling part of the immediate.
      VRNDSCALE,
      VRNDSCALE_SAE,
      VRNDSCALES,
      VRNDSCALES_SAE,
      // Tests Types Of a FP Values for packed types.
      VFPCLASS,
      // Tests Types Of a FP Values for scalar types.
      VFPCLASSS,

      // Broadcast (splat) scalar or element 0 of a vector. If the operand is
      // a vector, this node may change the vector length as part of the splat.
      VBROADCAST,
      // Broadcast mask to vector.
      VBROADCASTM,

      /// SSE4A Extraction and Insertion.
      EXTRQI,
      INSERTQI,

      // XOP arithmetic/logical shifts.
      VPSHA,
      VPSHL,
      // XOP signed/unsigned integer comparisons.
      VPCOM,
      VPCOMU,
      // XOP packed permute bytes.
      VPPERM,
      // XOP two source permutation.
      VPERMIL2,

      // Vector multiply packed unsigned doubleword integers.
      PMULUDQ,
      // Vector multiply packed signed doubleword integers.
      PMULDQ,
      // Vector Multiply Packed Unsigned Integers with Round and Scale.
      MULHRS,

      // Multiply and Add Packed Integers.
      VPMADDUBSW,
      VPMADDWD,

      // AVX512IFMA multiply and add.
      // NOTE: These are different than the instruction and perform
      // op0 x op1 + op2.
      VPMADD52L,
      VPMADD52H,

      // VNNI
      VPDPBUSD,
      VPDPBUSDS,
      VPDPWSSD,
      VPDPWSSDS,

      // FMA nodes.
      // We use the target independent ISD::FMA for the non-inverted case.
      FNMADD,
      FMSUB,
      FNMSUB,
      FMADDSUB,
      FMSUBADD,

      // FMA with rounding mode.
      FMADD_RND,
      FNMADD_RND,
      FMSUB_RND,
      FNMSUB_RND,
      FMADDSUB_RND,
      FMSUBADD_RND,

      // AVX512-FP16 complex addition and multiplication.
      VFMADDC,
      VFMADDC_RND,
      VFCMADDC,
      VFCMADDC_RND,

      VFMULC,
      VFMULC_RND,
      VFCMULC,
      VFCMULC_RND,

      VFMADDCSH,
      VFMADDCSH_RND,
      VFCMADDCSH,
      VFCMADDCSH_RND,

      VFMULCSH,
      VFMULCSH_RND,
      VFCMULCSH,
      VFCMULCSH_RND,

      // Compress and expand.
      COMPRESS,
      EXPAND,

      // Bits shuffle
      VPSHUFBITQMB,

      // Convert Unsigned/Integer to Floating-Point Value with rounding mode.
      SINT_TO_FP_RND,
      UINT_TO_FP_RND,
      SCALAR_SINT_TO_FP,
      SCALAR_UINT_TO_FP,
      SCALAR_SINT_TO_FP_RND,
      SCALAR_UINT_TO_FP_RND,

      // Vector float/double to signed/unsigned integer.
      CVTP2SI,
      CVTP2UI,
      CVTP2SI_RND,
      CVTP2UI_RND,
      // Scalar float/double to signed/unsigned integer.
      CVTS2SI,
      CVTS2UI,
      CVTS2SI_RND,
      CVTS2UI_RND,

      // Vector float/double to signed/unsigned integer with truncation.
      CVTTP2SI,
      CVTTP2UI,
      CVTTP2SI_SAE,
      CVTTP2UI_SAE,
      // Scalar float/double to signed/unsigned integer with truncation.
      CVTTS2SI,
      CVTTS2UI,
      CVTTS2SI_SAE,
      CVTTS2UI_SAE,

      // Vector signed/unsigned integer to float/double.
      CVTSI2P,
      CVTUI2P,

      // Masked versions of above. Used for v2f64->v4f32.
      // SRC, PASSTHRU, MASK
      MCVTP2SI,
      MCVTP2UI,
      MCVTTP2SI,
      MCVTTP2UI,
      MCVTSI2P,
      MCVTUI2P,

      // Vector float to bfloat16.
      // Convert TWO packed single data to one packed BF16 data
      CVTNE2PS2BF16,
      // Convert packed single data to packed BF16 data
      CVTNEPS2BF16,
      // Masked version of above.
      // SRC, PASSTHRU, MASK
      MCVTNEPS2BF16,

      // Dot product of BF16 pairs accumulated into
      // packed single precision.
      DPBF16PS,

      // A stack checking function call. On Windows it's _chkstk call.
      DYN_ALLOCA,

      // For allocating variable amounts of stack space when using
      // segmented stacks. Check if the current stacklet has enough space, and
      // falls back to heap allocation if not.
      SEG_ALLOCA,

      // For allocating stack space when using stack clash protector.
      // Allocation is performed by block, and each block is probed.
      PROBED_ALLOCA,

      // Memory barriers.
      MEMBARRIER,
      MFENCE,

      // Get a random integer and indicate whether it is valid in CF.
      RDRAND,

      // Get a NIST SP800-90B & C compliant random integer and
      // indicate whether it is valid in CF.
      RDSEED,

      // Protection keys
      // RDPKRU - Operand 0 is chain. Operand 1 is value for ECX.
      // WRPKRU - Operand 0 is chain. Operand 1 is value for EDX. Operand 2 is
      // value for ECX.
      RDPKRU,
      WRPKRU,

      // SSE42 string comparisons.
      // These nodes produce 3 results, index, mask, and flags. X86ISelDAGToDAG
      // will emit one or two instructions based on which results are used. If
      // flags and index/mask this allows us to use a single instruction since
      // we won't have to pick an opcode for flags. Instead we can rely on the
      // DAG to CSE everything and decide at isel.
      PCMPISTR,
      PCMPESTR,

      // Test if in transactional execution.
      XTEST,

      // ERI instructions.
      RSQRT28,
      RSQRT28_SAE,
      RSQRT28S,
      RSQRT28S_SAE,
      RCP28,
      RCP28_SAE,
      RCP28S,
      RCP28S_SAE,
      EXP2,
      EXP2_SAE,

      // Conversions between float and half-float.
      CVTPS2PH,
      CVTPH2PS,
      CVTPH2PS_SAE,

      // Masked version of above.
      // SRC, RND, PASSTHRU, MASK
      MCVTPS2PH,

      // Galois Field Arithmetic Instructions
      GF2P8AFFINEINVQB,
      GF2P8AFFINEQB,
      GF2P8MULB,

      // LWP insert record.
      LWPINS,

      // User level wait
      UMWAIT,
      TPAUSE,

      // Enqueue Stores Instructions
      ENQCMD,
      ENQCMDS,

      // For avx512-vp2intersect
      VP2INTERSECT,

      // User level interrupts - testui
      TESTUI,

      /// X86 strict FP compare instructions.
      STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE,
      STRICT_FCMPS,

      // Vector packed double/float comparison.
      STRICT_CMPP,

      /// Vector comparison generating mask bits for fp and
      /// integer signed and unsigned data types.
      STRICT_CMPM,

      // Vector float/double to signed/unsigned integer with truncation.
      STRICT_CVTTP2SI,
      STRICT_CVTTP2UI,

      // Vector FP extend.
      STRICT_VFPEXT,

      // Vector FP round.
      STRICT_VFPROUND,

      // RndScale - Round FP Values To Include A Given Number Of Fraction Bits.
      // Also used by the legacy (V)ROUND intrinsics where we mask out the
      // scaling part of the immediate.
      STRICT_VRNDSCALE,

      // Vector signed/unsigned integer to float/double.
      STRICT_CVTSI2P,
      STRICT_CVTUI2P,

      // Strict FMA nodes.
      STRICT_FNMADD,
      STRICT_FMSUB,
      STRICT_FNMSUB,

      // Conversions between float and half-float.
      STRICT_CVTPS2PH,
      STRICT_CVTPH2PS,

      // WARNING: Only add nodes here if they are strict FP nodes. Non-memory
      // and non-strict FP nodes should be above FIRST_TARGET_STRICTFP_OPCODE.

      // Compare and swap.
      LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
      LCMPXCHG8_DAG,
      LCMPXCHG16_DAG,
      LCMPXCHG16_SAVE_RBX_DAG,

      /// LOCK-prefixed arithmetic read-modify-write instructions.
      /// EFLAGS, OUTCHAIN = LADD(INCHAIN, PTR, RHS)
      LADD,
      LSUB,
      LOR,
      LXOR,
      LAND,

      // Load, scalar_to_vector, and zero extend.
      VZEXT_LOAD,

      // extract_vector_elt, store.
      VEXTRACT_STORE,

      // scalar broadcast from memory.
      VBROADCAST_LOAD,

      // subvector broadcast from memory.
      SUBV_BROADCAST_LOAD,

      // Store FP control word into i16 memory.
      FNSTCW16m,

      // Load FP control word from i16 memory.
      FLDCW16m,

      /// This instruction implements FP_TO_SINT with the
      /// integer destination in memory and a FP reg source. This corresponds
      /// to the X86::FIST*m instructions and the rounding mode change stuff. It
      /// has two inputs (token chain and address) and two outputs (int value
      /// and token chain). Memory VT specifies the type to store to.
      FP_TO_INT_IN_MEM,

      /// This instruction implements SINT_TO_FP with the
      /// integer source in memory and FP reg result. This corresponds to the
      /// X86::FILD*m instructions. It has two inputs (token chain and address)
      /// and two outputs (FP value and token chain). The integer source type is
      /// specified by the memory VT.
      FILD,

      /// This instruction implements a fp->int store from FP stack
      /// slots. This corresponds to the fist instruction. It takes a
      /// chain operand, value to store, address, and glue. The memory VT
      /// specifies the type to store as.
      FIST,

      /// This instruction implements an extending load to FP stack slots.
      /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain
      /// operand, and ptr to load from. The memory VT specifies the type to
      /// load from.
      FLD,

      /// This instruction implements a truncating store from FP stack
      /// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a
      /// chain operand, value to store, address, and glue. The memory VT
      /// specifies the type to store as.
      FST,

      /// These instructions grab the address of the next argument
      /// from a va_list. (reads and modifies the va_list in memory)
      VAARG_64,
      VAARG_X32,

      // Vector truncating store with unsigned/signed saturation
      VTRUNCSTOREUS,
      VTRUNCSTORES,
      // Vector truncating masked store with unsigned/signed saturation
      VMTRUNCSTOREUS,
      VMTRUNCSTORES,

      // X86 specific gather and scatter
      MGATHER,
      MSCATTER,

      // Key locker nodes that produce flags.
      AESENC128KL,
      AESDEC128KL,
      AESENC256KL,
      AESDEC256KL,
      AESENCWIDE128KL,
      AESDECWIDE128KL,
      AESENCWIDE256KL,
      AESDECWIDE256KL,

      // Save xmm argument registers to the stack, according to %al. An operator
      // is needed so that this can be expanded with control flow.
      VASTART_SAVE_XMM_REGS,

      // WARNING: Do not add anything in the end unless you want the node to
      // have memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE all
      // opcodes will be thought as target memory ops!
    };
  } // end namespace X86ISD

  namespace X86 {
    /// Current rounding mode is represented in bits 11:10 of FPSR. These
    /// values are same as corresponding constants for rounding mode used
    /// in glibc.
    enum RoundingMode {
      rmToNearest = 0,        // FE_TONEAREST
      rmDownward = 1 << 10,   // FE_DOWNWARD
      rmUpward = 2 << 10,     // FE_UPWARD
      rmTowardZero = 3 << 10, // FE_TOWARDZERO
      rmMask = 3 << 10        // Bit mask selecting rounding mode
    };
  }

  /// Define some predicates that are used for node matching.
  namespace X86 {
    /// Returns true if Elt is a constant zero or floating point constant +0.0.
    bool isZeroNode(SDValue Elt);

    /// Returns true if the given offset can
    /// fit into the displacement field of the instruction.
    bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
                                      bool hasSymbolicDisplacement);

    /// Determines whether the callee is required to pop its
    /// own arguments. Callee pop is necessary to support tail calls.
    bool isCalleePop(CallingConv::ID CallingConv,
                     bool is64Bit, bool IsVarArg, bool GuaranteeTCO);

    /// If Op is a constant whose elements are all the same constant or
    /// undefined, return true and return the constant value in \p SplatVal.
    /// If we have undef bits that don't cover an entire element, we treat these
    /// as zero if AllowPartialUndefs is set, else we fail and return false.
    bool isConstantSplat(SDValue Op, APInt &SplatVal,
                         bool AllowPartialUndefs = true);

    /// Check if Op is a load operation that could be folded into some other x86
    /// instruction as a memory operand. Example: vpaddd (%rdi), %xmm0, %xmm0.
    bool mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget,
                     bool AssumeSingleUse = false);

    /// Check if Op is a load operation that could be folded into a vector splat
    /// instruction as a memory operand. Example: vbroadcastss 16(%rdi), %xmm2.
    bool mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT,
                                         const X86Subtarget &Subtarget,
                                         bool AssumeSingleUse = false);

    /// Check if Op is a value that could be used to fold a store into some
    /// other x86 instruction as a memory operand. Ex: pextrb $0, %xmm0, (%rdi).
    bool mayFoldIntoStore(SDValue Op);

    /// Check if Op is an operation that could be folded into a zero extend x86
    /// instruction.
932 bool mayFoldIntoZeroExtend(SDValue Op); 933 } // end namespace X86 934 935 //===--------------------------------------------------------------------===// 936 // X86 Implementation of the TargetLowering interface 937 class X86TargetLowering final : public TargetLowering { 938 public: 939 explicit X86TargetLowering(const X86TargetMachine &TM, 940 const X86Subtarget &STI); 941 942 unsigned getJumpTableEncoding() const override; 943 bool useSoftFloat() const override; 944 945 void markLibCallAttributes(MachineFunction *MF, unsigned CC, 946 ArgListTy &Args) const override; 947 948 MVT getScalarShiftAmountTy(const DataLayout &, EVT VT) const override { 949 return MVT::i8; 950 } 951 952 const MCExpr * 953 LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, 954 const MachineBasicBlock *MBB, unsigned uid, 955 MCContext &Ctx) const override; 956 957 /// Returns relocation base for the given PIC jumptable. 958 SDValue getPICJumpTableRelocBase(SDValue Table, 959 SelectionDAG &DAG) const override; 960 const MCExpr * 961 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, 962 unsigned JTI, MCContext &Ctx) const override; 963 964 /// Return the desired alignment for ByVal aggregate 965 /// function arguments in the caller parameter area. For X86, aggregates 966 /// that contains are placed at 16-byte boundaries while the rest are at 967 /// 4-byte boundaries. 968 uint64_t getByValTypeAlignment(Type *Ty, 969 const DataLayout &DL) const override; 970 971 EVT getOptimalMemOpType(const MemOp &Op, 972 const AttributeList &FuncAttributes) const override; 973 974 /// Returns true if it's safe to use load / store of the 975 /// specified type to expand memcpy / memset inline. This is mostly true 976 /// for all types except for some special cases. For example, on X86 977 /// targets without SSE2 f64 load / store are done with fldl / fstpl which 978 /// also does type conversion. 
Note the specified type doesn't have to be 979 /// legal as the hook is used before type legalization. 980 bool isSafeMemOpType(MVT VT) const override; 981 982 /// Returns true if the target allows unaligned memory accesses of the 983 /// specified type. Returns whether it is "fast" in the last argument. 984 bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, Align Alignment, 985 MachineMemOperand::Flags Flags, 986 bool *Fast) const override; 987 988 /// Provide custom lowering hooks for some operations. 989 /// 990 SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; 991 992 /// Replace the results of node with an illegal result 993 /// type with new values built out of custom code. 994 /// 995 void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results, 996 SelectionDAG &DAG) const override; 997 998 SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; 999 1000 /// Return true if the target has native support for 1001 /// the specified value type and it is 'desirable' to use the type for the 1002 /// given node type. e.g. On x86 i16 is legal, but undesirable since i16 1003 /// instruction encodings are longer and some i16 instructions are slow. 1004 bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override; 1005 1006 /// Return true if the target has native support for the 1007 /// specified value type and it is 'desirable' to use the type. e.g. On x86 1008 /// i16 is legal, but undesirable since i16 instruction encodings are longer 1009 /// and some i16 instructions are slow. 1010 bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override; 1011 1012 /// Return the newly negated expression if the cost is not expensive and 1013 /// set the cost in \p Cost to indicate that if it is cheaper or neutral to 1014 /// do the negation. 
1015 SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, 1016 bool LegalOperations, bool ForCodeSize, 1017 NegatibleCost &Cost, 1018 unsigned Depth) const override; 1019 1020 MachineBasicBlock * 1021 EmitInstrWithCustomInserter(MachineInstr &MI, 1022 MachineBasicBlock *MBB) const override; 1023 1024 /// This method returns the name of a target specific DAG node. 1025 const char *getTargetNodeName(unsigned Opcode) const override; 1026 1027 /// Do not merge vector stores after legalization because that may conflict 1028 /// with x86-specific store splitting optimizations. 1029 bool mergeStoresAfterLegalization(EVT MemVT) const override { 1030 return !MemVT.isVector(); 1031 } 1032 1033 bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, 1034 const MachineFunction &MF) const override; 1035 1036 bool isCheapToSpeculateCttz() const override; 1037 1038 bool isCheapToSpeculateCtlz() const override; 1039 1040 bool isCtlzFast() const override; 1041 1042 bool hasBitPreservingFPLogic(EVT VT) const override { 1043 return VT == MVT::f32 || VT == MVT::f64 || VT.isVector() || 1044 (VT == MVT::f16 && X86ScalarSSEf16); 1045 } 1046 1047 bool isMultiStoresCheaperThanBitsMerge(EVT LTy, EVT HTy) const override { 1048 // If the pair to store is a mixture of float and int values, we will 1049 // save two bitwise instructions and one float-to-int instruction and 1050 // increase one store instruction. There is potentially a more 1051 // significant benefit because it avoids the float->int domain switch 1052 // for input value. So It is more likely a win. 1053 if ((LTy.isFloatingPoint() && HTy.isInteger()) || 1054 (LTy.isInteger() && HTy.isFloatingPoint())) 1055 return true; 1056 // If the pair only contains int values, we will save two bitwise 1057 // instructions and increase one store instruction (costing one more 1058 // store buffer). Since the benefit is more blurred so we leave 1059 // such pair out until we get testcase to prove it is a win. 
1060 return false; 1061 } 1062 1063 bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override; 1064 1065 bool hasAndNotCompare(SDValue Y) const override; 1066 1067 bool hasAndNot(SDValue Y) const override; 1068 1069 bool hasBitTest(SDValue X, SDValue Y) const override; 1070 1071 bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd( 1072 SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, 1073 unsigned OldShiftOpcode, unsigned NewShiftOpcode, 1074 SelectionDAG &DAG) const override; 1075 1076 bool shouldFoldConstantShiftPairToMask(const SDNode *N, 1077 CombineLevel Level) const override; 1078 1079 bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override; 1080 1081 bool 1082 shouldTransformSignedTruncationCheck(EVT XVT, 1083 unsigned KeptBits) const override { 1084 // For vectors, we don't have a preference.. 1085 if (XVT.isVector()) 1086 return false; 1087 1088 auto VTIsOk = [](EVT VT) -> bool { 1089 return VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || 1090 VT == MVT::i64; 1091 }; 1092 1093 // We are ok with KeptBitsVT being byte/word/dword, what MOVS supports. 1094 // XVT will be larger than KeptBitsVT. 1095 MVT KeptBitsVT = MVT::getIntegerVT(KeptBits); 1096 return VTIsOk(XVT) && VTIsOk(KeptBitsVT); 1097 } 1098 1099 bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override; 1100 1101 bool shouldSplatInsEltVarIndex(EVT VT) const override; 1102 1103 bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override { 1104 // Converting to sat variants holds little benefit on X86 as we will just 1105 // need to saturate the value back using fp arithmatic. 1106 return Op != ISD::FP_TO_UINT_SAT && isOperationLegalOrCustom(Op, VT); 1107 } 1108 1109 bool convertSetCCLogicToBitwiseLogic(EVT VT) const override { 1110 return VT.isScalarInteger(); 1111 } 1112 1113 /// Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST. 
  MVT hasFastEqualityCompare(unsigned NumBits) const override;

  /// Return the value type to use for ISD::SETCC.
  EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
                         EVT VT) const override;

  bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits,
                                    const APInt &DemandedElts,
                                    TargetLoweringOpt &TLO) const override;

  /// Determine which of the bits specified in Mask are known to be either
  /// zero or one and return them in the KnownZero/KnownOne bitsets.
  void computeKnownBitsForTargetNode(const SDValue Op,
                                     KnownBits &Known,
                                     const APInt &DemandedElts,
                                     const SelectionDAG &DAG,
                                     unsigned Depth = 0) const override;

  /// Determine the number of bits in the operation that are sign bits.
  unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
                                           const APInt &DemandedElts,
                                           const SelectionDAG &DAG,
                                           unsigned Depth) const override;

  bool SimplifyDemandedVectorEltsForTargetNode(SDValue Op,
                                               const APInt &DemandedElts,
                                               APInt &KnownUndef,
                                               APInt &KnownZero,
                                               TargetLoweringOpt &TLO,
                                               unsigned Depth) const override;

  /// X86-local helper (not a TargetLowering override): simplify the demanded
  /// elements of operand MaskIndex of a target shuffle node.
  bool SimplifyDemandedVectorEltsForTargetShuffle(SDValue Op,
                                                  const APInt &DemandedElts,
                                                  unsigned MaskIndex,
                                                  TargetLoweringOpt &TLO,
                                                  unsigned Depth) const;

  bool SimplifyDemandedBitsForTargetNode(SDValue Op,
                                         const APInt &DemandedBits,
                                         const APInt &DemandedElts,
                                         KnownBits &Known,
                                         TargetLoweringOpt &TLO,
                                         unsigned Depth) const override;

  SDValue SimplifyMultipleUseDemandedBitsForTargetNode(
      SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
      SelectionDAG &DAG, unsigned Depth) const override;

  bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts,
                                 APInt &UndefElts,
                                 unsigned Depth) const override;

  const Constant *getTargetConstantFromLoad(LoadSDNode *LD) const override;

  SDValue unwrapAddress(SDValue N) const override;

  /// Non-override helper: materialize a frame index for the return address.
  SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;

  bool ExpandInlineAsm(CallInst *CI) const override;

  ConstraintType getConstraintType(StringRef Constraint) const override;

  /// Examine constraint string and operand type and determine a weight value.
  /// The operand object must already have been set up with the operand type.
  ConstraintWeight
  getSingleConstraintMatchWeight(AsmOperandInfo &info,
                                 const char *constraint) const override;

  const char *LowerXConstraint(EVT ConstraintVT) const override;

  /// Lower the specified operand into the Ops vector. If it is invalid, don't
  /// add anything to Ops. If hasMemory is true it means one of the asm
  /// constraint of the inline asm instruction being processed is 'm'.
  void LowerAsmOperandForConstraint(SDValue Op,
                                    std::string &Constraint,
                                    std::vector<SDValue> &Ops,
                                    SelectionDAG &DAG) const override;

  /// Map the "v" inline-asm memory constraint to its X86 code; defer all
  /// other constraint codes to the generic TargetLowering handling.
  unsigned
  getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
    if (ConstraintCode == "v")
      return InlineAsm::Constraint_v;
    return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
  }

  /// Handle Lowering flag assembly outputs.
  SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag,
                                      const SDLoc &DL,
                                      const AsmOperandInfo &Constraint,
                                      SelectionDAG &DAG) const override;

  /// Given a physical register constraint
  /// (e.g. {edx}), return the register number and the register class for the
  /// register. This should only be used for C_Register constraints. On
  /// error, this returns a register number of 0.
  std::pair<unsigned, const TargetRegisterClass *>
  getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                               StringRef Constraint, MVT VT) const override;

  /// Return true if the addressing mode represented
  /// by AM is legal for this target, for a load/store of the specified type.
  bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
                             Type *Ty, unsigned AS,
                             Instruction *I = nullptr) const override;

  /// Return true if the specified immediate is legal
  /// icmp immediate, that is the target has icmp instructions which can
  /// compare a register against the immediate without having to materialize
  /// the immediate into a register.
  bool isLegalICmpImmediate(int64_t Imm) const override;

  /// Return true if the specified immediate is legal
  /// add immediate, that is the target has add instructions which can
  /// add a register and the immediate without having to materialize
  /// the immediate into a register.
  bool isLegalAddImmediate(int64_t Imm) const override;

  bool isLegalStoreImmediate(int64_t Imm) const override;

  /// Return the cost of the scaling factor used in the addressing
  /// mode represented by AM for this target, for a load/store
  /// of the specified type.
  /// If the AM is supported, the return value must be >= 0.
  /// If the AM is not supported, it returns a negative value.
  InstructionCost getScalingFactorCost(const DataLayout &DL,
                                       const AddrMode &AM, Type *Ty,
                                       unsigned AS) const override;

  /// This is used to enable splatted operand transforms for vector shifts
  /// and vector funnel shifts.
  bool isVectorShiftByScalarCheap(Type *Ty) const override;

  /// Add x86-specific opcodes to the default list.
  bool isBinOp(unsigned Opcode) const override;

  /// Returns true if the opcode is a commutative binary operation.
  bool isCommutativeBinOp(unsigned Opcode) const override;

  /// Return true if it's free to truncate a value of
  /// type Ty1 to type Ty2. e.g. On x86 it's free to truncate a i32 value in
  /// register EAX to i16 by referencing its sub-register AX.
  bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
  bool isTruncateFree(EVT VT1, EVT VT2) const override;

  bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override;

  /// Return true if any actual instruction that defines a
  /// value of type Ty1 implicit zero-extends the value to Ty2 in the result
  /// register. This does not necessarily include registers defined in
  /// unknown ways, such as incoming arguments, or copies from unknown
  /// virtual registers. Also, if isTruncateFree(Ty2, Ty1) is true, this
  /// does not necessarily apply to truncate instructions. e.g. on x86-64,
  /// all instructions that define 32-bit values implicit zero-extend the
  /// result out to 64 bits.
  bool isZExtFree(Type *Ty1, Type *Ty2) const override;
  bool isZExtFree(EVT VT1, EVT VT2) const override;
  bool isZExtFree(SDValue Val, EVT VT2) const override;

  bool shouldSinkOperands(Instruction *I,
                          SmallVectorImpl<Use *> &Ops) const override;
  bool shouldConvertPhiType(Type *From, Type *To) const override;

  /// Return true if folding a vector load into ExtVal (a sign, zero, or any
  /// extend node) is profitable.
  bool isVectorLoadExtDesirable(SDValue) const override;

  /// Return true if an FMA operation is faster than a pair of fmul and fadd
  /// instructions. fmuladd intrinsics will be expanded to FMAs when this
  /// method returns true, otherwise fmuladd is expanded to fmul + fadd.
  bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
                                  EVT VT) const override;

  /// Return true if it's profitable to narrow
  /// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow
  /// from i32 to i8 but not from i32 to i16.
  bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;

  bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode,
                                            EVT VT) const override;

  /// Given an intrinsic, checks if on the target the intrinsic will need to map
  /// to a MemIntrinsicNode (touches memory). If this is the case, it returns
  /// true and stores the intrinsic information into the IntrinsicInfo that was
  /// passed to the function.
  bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
                          MachineFunction &MF,
                          unsigned Intrinsic) const override;

  /// Returns true if the target can instruction select the
  /// specified FP immediate natively. If false, the legalizer will
  /// materialize the FP immediate as a load from a constant pool.
  bool isFPImmLegal(const APFloat &Imm, EVT VT,
                    bool ForCodeSize) const override;

  /// Targets can use this to indicate that they only support *some*
  /// VECTOR_SHUFFLE operations, those with specific masks. By default, if a
  /// target supports the VECTOR_SHUFFLE node, all mask values are assumed to
  /// be legal.
  bool isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const override;

  /// Similar to isShuffleMaskLegal. Targets can use this to indicate if there
  /// is a suitable VECTOR_SHUFFLE that can be used to replace a VAND with a
  /// constant pool entry.
  bool isVectorClearMaskLegal(ArrayRef<int> Mask, EVT VT) const override;

  /// Returns true if lowering to a jump table is allowed.
  bool areJTsAllowed(const Function *Fn) const override;

  /// If true, then instruction selection should
  /// seek to shrink the FP constant of the specified type to a smaller type
  /// in order to save space and / or reduce runtime.
  bool ShouldShrinkFPConstant(EVT VT) const override {
    // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
    // expensive than a straight movsd. On the other hand, it's important to
    // shrink long double fp constant since fldt is very slow.
    return !X86ScalarSSEf64 || VT == MVT::f80;
  }

  /// Return true if we believe it is correct and profitable to reduce the
  /// load node to a smaller type.
  bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy,
                             EVT NewVT) const override;

  /// Return true if the specified scalar FP type is computed in an SSE
  /// register, not on the X87 floating point stack.
  bool isScalarFPTypeInSSEReg(EVT VT) const {
    return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2
           (VT == MVT::f32 && X86ScalarSSEf32) || // f32 is when SSE1
           (VT == MVT::f16 && X86ScalarSSEf16);   // f16 is when AVX512FP16
  }

  /// Returns true if it is beneficial to convert a load of a constant
  /// to just the constant itself.
  bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                         Type *Ty) const override;

  bool reduceSelectOfFPConstantLoads(EVT CmpOpVT) const override;

  bool convertSelectOfConstantsToMath(EVT VT) const override;

  bool decomposeMulByConstant(LLVMContext &Context, EVT VT,
                              SDValue C) const override;

  /// Return true if EXTRACT_SUBVECTOR is cheap for this result type
  /// with this index.
  bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
                               unsigned Index) const override;

  /// Scalar ops always have equal or better analysis/performance/power than
  /// the vector equivalent, so this always makes sense if the scalar op is
  /// supported.
  bool shouldScalarizeBinop(SDValue) const override;

  /// Extract of a scalar FP value from index 0 of a vector is free.
1368 bool isExtractVecEltCheap(EVT VT, unsigned Index) const override { 1369 EVT EltVT = VT.getScalarType(); 1370 return (EltVT == MVT::f32 || EltVT == MVT::f64) && Index == 0; 1371 } 1372 1373 /// Overflow nodes should get combined/lowered to optimal instructions 1374 /// (they should allow eliminating explicit compares by getting flags from 1375 /// math ops). 1376 bool shouldFormOverflowOp(unsigned Opcode, EVT VT, 1377 bool MathUsed) const override; 1378 1379 bool storeOfVectorConstantIsCheap(EVT MemVT, unsigned NumElem, 1380 unsigned AddrSpace) const override { 1381 // If we can replace more than 2 scalar stores, there will be a reduction 1382 // in instructions even after we add a vector constant load. 1383 return NumElem > 2; 1384 } 1385 1386 bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, 1387 const SelectionDAG &DAG, 1388 const MachineMemOperand &MMO) const override; 1389 1390 /// Intel processors have a unified instruction and data cache 1391 const char * getClearCacheBuiltinName() const override { 1392 return nullptr; // nothing to do, move along. 1393 } 1394 1395 Register getRegisterByName(const char* RegName, LLT VT, 1396 const MachineFunction &MF) const override; 1397 1398 /// If a physical register, this returns the register that receives the 1399 /// exception address on entry to an EH pad. 1400 Register 1401 getExceptionPointerRegister(const Constant *PersonalityFn) const override; 1402 1403 /// If a physical register, this returns the register that receives the 1404 /// exception typeid on entry to a landing pad. 1405 Register 1406 getExceptionSelectorRegister(const Constant *PersonalityFn) const override; 1407 1408 virtual bool needsFixedCatchObjects() const override; 1409 1410 /// This method returns a target specific FastISel object, 1411 /// or null if the target does not support "fast" ISel. 
  FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
                           const TargetLibraryInfo *libInfo) const override;

  /// If the target has a standard location for the stack protector cookie,
  /// returns the address of that location. Otherwise, returns nullptr.
  Value *getIRStackGuard(IRBuilderBase &IRB) const override;

  bool useLoadStackGuardNode() const override;
  bool useStackGuardXorFP() const override;
  void insertSSPDeclarations(Module &M) const override;
  Value *getSDagStackGuard(const Module &M) const override;
  Function *getSSPStackGuardCheck(const Module &M) const override;
  SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
                              const SDLoc &DL) const override;

  /// Return true if the target stores SafeStack pointer at a fixed offset in
  /// some non-standard address space, and populates the address space and
  /// offset as appropriate.
  Value *getSafeStackPointerLocation(IRBuilderBase &IRB) const override;

  /// Non-override helper: build an X87 FILD (int-to-fp load) node pair
  /// (result value, output chain) for the given source memory operand.
  std::pair<SDValue, SDValue> BuildFILD(EVT DstVT, EVT SrcVT, const SDLoc &DL,
                                        SDValue Chain, SDValue Pointer,
                                        MachinePointerInfo PtrInfo,
                                        Align Alignment,
                                        SelectionDAG &DAG) const;

  /// Customize the preferred legalization strategy for certain types.
  LegalizeTypeAction getPreferredVectorAction(MVT VT) const override;

  bool softPromoteHalfType() const override { return true; }

  MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC,
                                    EVT VT) const override;

  unsigned getNumRegistersForCallingConv(LLVMContext &Context,
                                         CallingConv::ID CC,
                                         EVT VT) const override;

  unsigned getVectorTypeBreakdownForCallingConv(
      LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
      unsigned &NumIntermediates, MVT &RegisterVT) const override;

  bool isIntDivCheap(EVT VT, AttributeList Attr) const override;

  bool supportSwiftError() const override;

  bool hasStackProbeSymbol(MachineFunction &MF) const override;
  bool hasInlineStackProbe(MachineFunction &MF) const override;
  StringRef getStackProbeSymbolName(MachineFunction &MF) const override;

  unsigned getStackProbeSize(MachineFunction &MF) const;

  bool hasVectorBlend() const override { return true; }

  unsigned getMaxSupportedInterleaveFactor() const override { return 4; }

  /// Lower interleaved load(s) into target specific
  /// instructions/intrinsics.
  bool lowerInterleavedLoad(LoadInst *LI,
                            ArrayRef<ShuffleVectorInst *> Shuffles,
                            ArrayRef<unsigned> Indices,
                            unsigned Factor) const override;

  /// Lower interleaved store(s) into target specific
  /// instructions/intrinsics.
  bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
                             unsigned Factor) const override;

  SDValue expandIndirectJTBranch(const SDLoc& dl, SDValue Value,
                                 SDValue Addr, SelectionDAG &DAG)
                                 const override;

  Align getPrefLoopAlignment(MachineLoop *ML) const override;

protected:
  std::pair<const TargetRegisterClass *, uint8_t>
  findRepresentativeClass(const TargetRegisterInfo *TRI,
                          MVT VT) const override;

private:
  /// Keep a reference to the X86Subtarget around so that we can
  /// make the right decision when generating code for different targets.
  const X86Subtarget &Subtarget;

  /// Select between SSE or x87 floating point ops.
  /// When SSE is available, use it for f32 operations.
  /// When SSE2 is available, use it for f64 operations.
  bool X86ScalarSSEf32;
  bool X86ScalarSSEf64;
  bool X86ScalarSSEf16;

  /// A list of legal FP immediates.
  std::vector<APFloat> LegalFPImmediates;

  /// Indicate that this x86 target can instruction
  /// select the specified FP immediate natively.
  void addLegalFPImmediate(const APFloat& Imm) {
    LegalFPImmediates.push_back(Imm);
  }

  SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
                          CallingConv::ID CallConv, bool isVarArg,
                          const SmallVectorImpl<ISD::InputArg> &Ins,
                          const SDLoc &dl, SelectionDAG &DAG,
                          SmallVectorImpl<SDValue> &InVals,
                          uint32_t *RegMask) const;
  SDValue LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
                           const SmallVectorImpl<ISD::InputArg> &ArgInfo,
                           const SDLoc &dl, SelectionDAG &DAG,
                           const CCValAssign &VA, MachineFrameInfo &MFI,
                           unsigned i) const;
  SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
                           const SDLoc &dl, SelectionDAG &DAG,
                           const CCValAssign &VA,
                           ISD::ArgFlagsTy Flags, bool isByval) const;

  // Call lowering helpers.

  /// Check whether the call is eligible for tail call optimization. Targets
  /// that want to do tail call optimization should implement this function.
  bool IsEligibleForTailCallOptimization(
      SDValue Callee, CallingConv::ID CalleeCC, bool IsCalleeStackStructRet,
      bool isVarArg, Type *RetTy, const SmallVectorImpl<ISD::OutputArg> &Outs,
      const SmallVectorImpl<SDValue> &OutVals,
      const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const;
  SDValue EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr,
                                  SDValue Chain, bool IsTailCall,
                                  bool Is64Bit, int FPDiff,
                                  const SDLoc &dl) const;

  unsigned GetAlignedArgumentStackSize(unsigned StackSize,
                                       SelectionDAG &DAG) const;

  unsigned getAddressSpace() const;

  SDValue FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned,
                          SDValue &Chain) const;
  SDValue LRINT_LLRINTHelper(SDNode *N, SelectionDAG &DAG) const;

  SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;

  unsigned getGlobalWrapperKind(const GlobalValue *GV = nullptr,
                                const unsigned char OpFlags = 0) const;
  SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;

  /// Creates target global address or external symbol nodes for calls or
  /// other uses.
  SDValue LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
                                bool ForCall) const;

  SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerLRINT_LLRINT(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerADDROFRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
  SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
  SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
  SDValue lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerWin64_FP_TO_INT128(SDValue Op, SelectionDAG &DAG,
                                  SDValue &Chain) const;
  SDValue LowerWin64_INT128_TO_FP(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerGC_TRANSITION(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
  SDValue lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;

  SDValue
  LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
                       const SmallVectorImpl<ISD::InputArg> &Ins,
                       const SDLoc &dl, SelectionDAG &DAG,
                       SmallVectorImpl<SDValue> &InVals) const override;
  SDValue LowerCall(CallLoweringInfo &CLI,
                    SmallVectorImpl<SDValue> &InVals) const override;

  SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
                      const SmallVectorImpl<ISD::OutputArg> &Outs,
                      const SmallVectorImpl<SDValue> &OutVals,
                      const SDLoc &dl, SelectionDAG &DAG) const override;

  /// Split callee-saved register handling only for CXX_FAST_TLS functions
  /// that cannot unwind.
  bool supportSplitCSR(MachineFunction *MF) const override {
    return MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS &&
           MF->getFunction().hasFnAttribute(Attribute::NoUnwind);
  }
  void initializeSplitCSR(MachineBasicBlock *Entry) const override;
  void insertCopiesSplitCSR(
      MachineBasicBlock *Entry,
      const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;

  bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;

  bool mayBeEmittedAsTailCall(const CallInst *CI) const override;

  EVT getTypeForExtReturn(LLVMContext &Context, EVT VT,
                          ISD::NodeType ExtendKind) const override;

  bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
                      bool isVarArg,
                      const SmallVectorImpl<ISD::OutputArg> &Outs,
                      LLVMContext &Context) const override;

  const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;

  TargetLoweringBase::AtomicExpansionKind
  shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
  bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
  TargetLoweringBase::AtomicExpansionKind
  shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;

  LoadInst *
  lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;

  bool lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const override;
  bool lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const override;

  /// Non-override helper: whether this memory type requires the
  /// CMPXCHG8B/CMPXCHG16B ("NB") expansion.
  bool needsCmpXchgNb(Type *MemType) const;

  void SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB,
                              MachineBasicBlock *DispatchBB, int FI) const;

  // Utility function to emit the low-level va_arg code for X86-64.
  MachineBasicBlock *
  EmitVAARGWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const;

  /// Emit a cascaded pair of selects (MI1, MI2) starting in BB.
  /// NOTE(review): previous comment ("xmm reg save portion of va_start")
  /// appeared stale for this function — verify against the .cpp definition.
  MachineBasicBlock *EmitLoweredCascadedSelect(MachineInstr &MI1,
                                               MachineInstr &MI2,
                                               MachineBasicBlock *BB) const;

  MachineBasicBlock *EmitLoweredSelect(MachineInstr &I,
                                       MachineBasicBlock *BB) const;

  MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI,
                                         MachineBasicBlock *BB) const;

  MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr &MI,
                                          MachineBasicBlock *BB) const;

  MachineBasicBlock *EmitLoweredProbedAlloca(MachineInstr &MI,
                                             MachineBasicBlock *BB) const;

  MachineBasicBlock *EmitLoweredTLSAddr(MachineInstr &MI,
                                        MachineBasicBlock *BB) const;

  MachineBasicBlock *EmitLoweredTLSCall(MachineInstr &MI,
                                        MachineBasicBlock *BB) const;

  MachineBasicBlock *EmitLoweredIndirectThunk(MachineInstr &MI,
                                              MachineBasicBlock *BB) const;

  MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr &MI,
                                      MachineBasicBlock *MBB) const;

  void emitSetJmpShadowStackFix(MachineInstr &MI,
                                MachineBasicBlock *MBB) const;

  MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr &MI,
                                       MachineBasicBlock *MBB) const;

  MachineBasicBlock *emitLongJmpShadowStackFix(MachineInstr &MI,
                                               MachineBasicBlock *MBB) const;

  MachineBasicBlock *EmitSjLjDispatchBlock(MachineInstr &MI,
                                           MachineBasicBlock *MBB) const;

  /// Emit flags for the given setcc condition and operands. Also returns the
  /// corresponding X86 condition code constant in X86CC.
  SDValue emitFlagsForSetcc(SDValue Op0, SDValue Op1, ISD::CondCode CC,
                            const SDLoc &dl, SelectionDAG &DAG,
                            SDValue &X86CC) const;

  /// Check if replacement of SQRT with RSQRT should be disabled.
  bool isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const override;

  /// Use rsqrt* to speed up sqrt calculations.
  SDValue getSqrtEstimate(SDValue Op, SelectionDAG &DAG, int Enabled,
                          int &RefinementSteps, bool &UseOneConstNR,
                          bool Reciprocal) const override;

  /// Use rcp* to speed up fdiv calculations.
  SDValue getRecipEstimate(SDValue Op, SelectionDAG &DAG, int Enabled,
                           int &RefinementSteps) const override;

  /// Reassociate floating point divisions into multiply by reciprocal.
  unsigned combineRepeatedFPDivisors() const override;

  SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
                        SmallVectorImpl<SDNode *> &Created) const override;
};

namespace X86 {
  FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
                           const TargetLibraryInfo *libInfo);
} // end namespace X86

// X86 specific Gather/Scatter nodes.
// The class has the same order of operands as MaskedGatherScatterSDNode for
// convenience.
class X86MaskedGatherScatterSDNode : public MemIntrinsicSDNode {
public:
  // This is intended as a utility and should never be directly created.
  X86MaskedGatherScatterSDNode() = delete;
  ~X86MaskedGatherScatterSDNode() = delete;

  const SDValue &getBasePtr() const { return getOperand(3); }
  const SDValue &getIndex() const { return getOperand(4); }
  const SDValue &getMask() const { return getOperand(2); }
  const SDValue &getScale() const { return getOperand(5); }

  static bool classof(const SDNode *N) {
    return N->getOpcode() == X86ISD::MGATHER ||
           N->getOpcode() == X86ISD::MSCATTER;
  }
};

class X86MaskedGatherSDNode : public X86MaskedGatherScatterSDNode {
public:
  /// Pass-through value (operand 1) used for masked-off gather lanes.
  const SDValue &getPassThru() const { return getOperand(1); }

  static bool classof(const SDNode *N) {
    return N->getOpcode() == X86ISD::MGATHER;
  }
};

class X86MaskedScatterSDNode : public X86MaskedGatherScatterSDNode {
public:
  /// Value being scattered (operand 1).
  const SDValue &getValue() const { return getOperand(1); }

  static bool classof(const SDNode *N) {
    return N->getOpcode() == X86ISD::MSCATTER;
  }
};

/// Generate unpacklo/unpackhi shuffle mask.
void createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask, bool Lo,
                             bool Unary);

/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
/// imposed by AVX and specific to the unary pattern. Example:
/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
void createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo);

} // end namespace llvm

#endif // LLVM_LIB_TARGET_X86_X86ISELLOWERING_H