1 //===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file defines a DAG pattern matching instruction selector for X86, 10 // converting from a legalized dag to a X86 dag. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "X86.h" 15 #include "X86MachineFunctionInfo.h" 16 #include "X86RegisterInfo.h" 17 #include "X86Subtarget.h" 18 #include "X86TargetMachine.h" 19 #include "llvm/ADT/Statistic.h" 20 #include "llvm/CodeGen/MachineModuleInfo.h" 21 #include "llvm/CodeGen/SelectionDAGISel.h" 22 #include "llvm/Config/llvm-config.h" 23 #include "llvm/IR/ConstantRange.h" 24 #include "llvm/IR/Function.h" 25 #include "llvm/IR/Instructions.h" 26 #include "llvm/IR/Intrinsics.h" 27 #include "llvm/IR/IntrinsicsX86.h" 28 #include "llvm/IR/Type.h" 29 #include "llvm/Support/Debug.h" 30 #include "llvm/Support/ErrorHandling.h" 31 #include "llvm/Support/KnownBits.h" 32 #include "llvm/Support/MathExtras.h" 33 #include <cstdint> 34 35 using namespace llvm; 36 37 #define DEBUG_TYPE "x86-isel" 38 #define PASS_NAME "X86 DAG->DAG Instruction Selection" 39 40 STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor"); 41 42 static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(true), 43 cl::desc("Enable setting constant bits to reduce size of mask immediates"), 44 cl::Hidden); 45 46 static cl::opt<bool> EnablePromoteAnyextLoad( 47 "x86-promote-anyext-load", cl::init(true), 48 cl::desc("Enable promoting aligned anyext load to wider load"), cl::Hidden); 49 50 extern cl::opt<bool> IndirectBranchTracking; 51 52 //===----------------------------------------------------------------------===// 53 // Pattern Matcher Implementation 54 //===----------------------------------------------------------------------===// 55 56 namespace { 57 /// This corresponds to X86AddressMode, but uses SDValue's instead of register 58 /// numbers for the leaves of the matched tree. 59 struct X86ISelAddressMode { 60 enum { 61 RegBase, 62 FrameIndexBase 63 } BaseType = RegBase; 64 65 // This is really a union, discriminated by BaseType! 66 SDValue Base_Reg; 67 int Base_FrameIndex = 0; 68 69 unsigned Scale = 1; 70 SDValue IndexReg; 71 int32_t Disp = 0; 72 SDValue Segment; 73 const GlobalValue *GV = nullptr; 74 const Constant *CP = nullptr; 75 const BlockAddress *BlockAddr = nullptr; 76 const char *ES = nullptr; 77 MCSymbol *MCSym = nullptr; 78 int JT = -1; 79 Align Alignment; // CP alignment. 80 unsigned char SymbolFlags = X86II::MO_NO_FLAG; // X86II::MO_* 81 bool NegateIndex = false; 82 83 X86ISelAddressMode() = default; 84 85 bool hasSymbolicDisplacement() const { 86 return GV != nullptr || CP != nullptr || ES != nullptr || 87 MCSym != nullptr || JT != -1 || BlockAddr != nullptr; 88 } 89 90 bool hasBaseOrIndexReg() const { 91 return BaseType == FrameIndexBase || 92 IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr; 93 } 94 95 /// Return true if this addressing mode is already RIP-relative. 
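  /// That is, the base register has been set to X86::RIP (e.g. while matching
  /// an X86ISD::WrapperRIP operand), so no further base or index register can
  /// be added to this addressing mode.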
96 bool isRIPRelative() const { 97 if (BaseType != RegBase) return false; 98 if (RegisterSDNode *RegNode = 99 dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode())) 100 return RegNode->getReg() == X86::RIP; 101 return false; 102 } 103 104 void setBaseReg(SDValue Reg) { 105 BaseType = RegBase; 106 Base_Reg = Reg; 107 } 108 109 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 110 void dump(SelectionDAG *DAG = nullptr) { 111 dbgs() << "X86ISelAddressMode " << this << '\n'; 112 dbgs() << "Base_Reg "; 113 if (Base_Reg.getNode()) 114 Base_Reg.getNode()->dump(DAG); 115 else 116 dbgs() << "nul\n"; 117 if (BaseType == FrameIndexBase) 118 dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n'; 119 dbgs() << " Scale " << Scale << '\n' 120 << "IndexReg "; 121 if (NegateIndex) 122 dbgs() << "negate "; 123 if (IndexReg.getNode()) 124 IndexReg.getNode()->dump(DAG); 125 else 126 dbgs() << "nul\n"; 127 dbgs() << " Disp " << Disp << '\n' 128 << "GV "; 129 if (GV) 130 GV->dump(); 131 else 132 dbgs() << "nul"; 133 dbgs() << " CP "; 134 if (CP) 135 CP->dump(); 136 else 137 dbgs() << "nul"; 138 dbgs() << '\n' 139 << "ES "; 140 if (ES) 141 dbgs() << ES; 142 else 143 dbgs() << "nul"; 144 dbgs() << " MCSym "; 145 if (MCSym) 146 dbgs() << MCSym; 147 else 148 dbgs() << "nul"; 149 dbgs() << " JT" << JT << " Align" << Alignment.value() << '\n'; 150 } 151 #endif 152 }; 153 } 154 155 namespace { 156 //===--------------------------------------------------------------------===// 157 /// ISel - X86-specific code to select X86 machine instructions for 158 /// SelectionDAG operations. 159 /// 160 class X86DAGToDAGISel final : public SelectionDAGISel { 161 /// Keep a pointer to the X86Subtarget around so that we can 162 /// make the right decision when generating code for different targets. 163 const X86Subtarget *Subtarget; 164 165 /// If true, selector should try to optimize for minimum code size. 166 bool OptForMinSize; 167 168 /// Disable direct TLS access through segment registers. 169 bool IndirectTlsSegRefs; 170 171 public: 172 static char ID; 173 174 X86DAGToDAGISel() = delete; 175 176 explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel) 177 : SelectionDAGISel(ID, tm, OptLevel), Subtarget(nullptr), 178 OptForMinSize(false), IndirectTlsSegRefs(false) {} 179 180 bool runOnMachineFunction(MachineFunction &MF) override { 181 // Reset the subtarget each time through. 182 Subtarget = &MF.getSubtarget<X86Subtarget>(); 183 IndirectTlsSegRefs = MF.getFunction().hasFnAttribute( 184 "indirect-tls-seg-refs"); 185 186 // OptFor[Min]Size are used in pattern predicates that isel is matching. 187 OptForMinSize = MF.getFunction().hasMinSize(); 188 assert((!OptForMinSize || MF.getFunction().hasOptSize()) && 189 "OptForMinSize implies OptForSize"); 190 191 SelectionDAGISel::runOnMachineFunction(MF); 192 return true; 193 } 194 195 void emitFunctionEntryCode() override; 196 197 bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override; 198 199 void PreprocessISelDAG() override; 200 void PostprocessISelDAG() override; 201 202 // Include the pieces autogenerated from the target description. 
203 #include "X86GenDAGISel.inc" 204 205 private: 206 void Select(SDNode *N) override; 207 208 bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM); 209 bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM, 210 bool AllowSegmentRegForX32 = false); 211 bool matchWrapper(SDValue N, X86ISelAddressMode &AM); 212 bool matchAddress(SDValue N, X86ISelAddressMode &AM); 213 bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM); 214 bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth); 215 bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, 216 unsigned Depth); 217 bool matchVectorAddressRecursively(SDValue N, X86ISelAddressMode &AM, 218 unsigned Depth); 219 bool matchAddressBase(SDValue N, X86ISelAddressMode &AM); 220 bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base, 221 SDValue &Scale, SDValue &Index, SDValue &Disp, 222 SDValue &Segment); 223 bool selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, SDValue IndexOp, 224 SDValue ScaleOp, SDValue &Base, SDValue &Scale, 225 SDValue &Index, SDValue &Disp, SDValue &Segment); 226 bool selectMOV64Imm32(SDValue N, SDValue &Imm); 227 bool selectLEAAddr(SDValue N, SDValue &Base, 228 SDValue &Scale, SDValue &Index, SDValue &Disp, 229 SDValue &Segment); 230 bool selectLEA64_32Addr(SDValue N, SDValue &Base, 231 SDValue &Scale, SDValue &Index, SDValue &Disp, 232 SDValue &Segment); 233 bool selectTLSADDRAddr(SDValue N, SDValue &Base, 234 SDValue &Scale, SDValue &Index, SDValue &Disp, 235 SDValue &Segment); 236 bool selectRelocImm(SDValue N, SDValue &Op); 237 238 bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N, 239 SDValue &Base, SDValue &Scale, 240 SDValue &Index, SDValue &Disp, 241 SDValue &Segment); 242 243 // Convenience method where P is also root. 244 bool tryFoldLoad(SDNode *P, SDValue N, 245 SDValue &Base, SDValue &Scale, 246 SDValue &Index, SDValue &Disp, 247 SDValue &Segment) { 248 return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment); 249 } 250 251 bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N, 252 SDValue &Base, SDValue &Scale, 253 SDValue &Index, SDValue &Disp, 254 SDValue &Segment); 255 256 bool isProfitableToFormMaskedOp(SDNode *N) const; 257 258 /// Implement addressing mode selection for inline asm expressions. 259 bool SelectInlineAsmMemoryOperand(const SDValue &Op, 260 unsigned ConstraintID, 261 std::vector<SDValue> &OutOps) override; 262 263 void emitSpecialCodeForMain(); 264 265 inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL, 266 MVT VT, SDValue &Base, SDValue &Scale, 267 SDValue &Index, SDValue &Disp, 268 SDValue &Segment) { 269 if (AM.BaseType == X86ISelAddressMode::FrameIndexBase) 270 Base = CurDAG->getTargetFrameIndex( 271 AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout())); 272 else if (AM.Base_Reg.getNode()) 273 Base = AM.Base_Reg; 274 else 275 Base = CurDAG->getRegister(0, VT); 276 277 Scale = getI8Imm(AM.Scale, DL); 278 279 // Negate the index if needed. 280 if (AM.NegateIndex) { 281 unsigned NegOpc = VT == MVT::i64 ? X86::NEG64r : X86::NEG32r; 282 SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32, 283 AM.IndexReg), 0); 284 AM.IndexReg = Neg; 285 } 286 287 if (AM.IndexReg.getNode()) 288 Index = AM.IndexReg; 289 else 290 Index = CurDAG->getRegister(0, VT); 291 292 // These are 32-bit even in 64-bit mode since RIP-relative offset 293 // is 32-bit. 
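    // For example, a matched address of the form sym+disp(%rip) is emitted
    // here as a TargetGlobalAddress operand with the constant part folded
    // into its 32-bit displacement.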
294 if (AM.GV) 295 Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(), 296 MVT::i32, AM.Disp, 297 AM.SymbolFlags); 298 else if (AM.CP) 299 Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, AM.Alignment, 300 AM.Disp, AM.SymbolFlags); 301 else if (AM.ES) { 302 assert(!AM.Disp && "Non-zero displacement is ignored with ES."); 303 Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags); 304 } else if (AM.MCSym) { 305 assert(!AM.Disp && "Non-zero displacement is ignored with MCSym."); 306 assert(AM.SymbolFlags == 0 && "oo"); 307 Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32); 308 } else if (AM.JT != -1) { 309 assert(!AM.Disp && "Non-zero displacement is ignored with JT."); 310 Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags); 311 } else if (AM.BlockAddr) 312 Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp, 313 AM.SymbolFlags); 314 else 315 Disp = CurDAG->getTargetConstant(AM.Disp, DL, MVT::i32); 316 317 if (AM.Segment.getNode()) 318 Segment = AM.Segment; 319 else 320 Segment = CurDAG->getRegister(0, MVT::i16); 321 } 322 323 // Utility function to determine whether we should avoid selecting 324 // immediate forms of instructions for better code size or not. 325 // At a high level, we'd like to avoid such instructions when 326 // we have similar constants used within the same basic block 327 // that can be kept in a register. 328 // 329 bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const { 330 uint32_t UseCount = 0; 331 332 // Do not want to hoist if we're not optimizing for size. 333 // TODO: We'd like to remove this restriction. 334 // See the comment in X86InstrInfo.td for more info. 335 if (!CurDAG->shouldOptForSize()) 336 return false; 337 338 // Walk all the users of the immediate. 339 for (const SDNode *User : N->uses()) { 340 if (UseCount >= 2) 341 break; 342 343 // This user is already selected. Count it as a legitimate use and 344 // move on. 345 if (User->isMachineOpcode()) { 346 UseCount++; 347 continue; 348 } 349 350 // We want to count stores of immediates as real uses. 351 if (User->getOpcode() == ISD::STORE && 352 User->getOperand(1).getNode() == N) { 353 UseCount++; 354 continue; 355 } 356 357 // We don't currently match users that have > 2 operands (except 358 // for stores, which are handled above) 359 // Those instruction won't match in ISEL, for now, and would 360 // be counted incorrectly. 361 // This may change in the future as we add additional instruction 362 // types. 363 if (User->getNumOperands() != 2) 364 continue; 365 366 // If this is a sign-extended 8-bit integer immediate used in an ALU 367 // instruction, there is probably an opcode encoding to save space. 368 auto *C = dyn_cast<ConstantSDNode>(N); 369 if (C && isInt<8>(C->getSExtValue())) 370 continue; 371 372 // Immediates that are used for offsets as part of stack 373 // manipulation should be left alone. These are typically 374 // used to indicate SP offsets for argument passing and 375 // will get pulled into stores/pushes (implicitly). 376 if (User->getOpcode() == X86ISD::ADD || 377 User->getOpcode() == ISD::ADD || 378 User->getOpcode() == X86ISD::SUB || 379 User->getOpcode() == ISD::SUB) { 380 381 // Find the other operand of the add/sub. 382 SDValue OtherOp = User->getOperand(0); 383 if (OtherOp.getNode() == N) 384 OtherOp = User->getOperand(1); 385 386 // Don't count if the other operand is SP. 
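        // A typical example is the immediate in (add (CopyFromReg %rsp), imm)
        // used to address an outgoing argument slot.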
387 RegisterSDNode *RegNode; 388 if (OtherOp->getOpcode() == ISD::CopyFromReg && 389 (RegNode = dyn_cast_or_null<RegisterSDNode>( 390 OtherOp->getOperand(1).getNode()))) 391 if ((RegNode->getReg() == X86::ESP) || 392 (RegNode->getReg() == X86::RSP)) 393 continue; 394 } 395 396 // ... otherwise, count this and move on. 397 UseCount++; 398 } 399 400 // If we have more than 1 use, then recommend for hoisting. 401 return (UseCount > 1); 402 } 403 404 /// Return a target constant with the specified value of type i8. 405 inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) { 406 return CurDAG->getTargetConstant(Imm, DL, MVT::i8); 407 } 408 409 /// Return a target constant with the specified value, of type i32. 410 inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) { 411 return CurDAG->getTargetConstant(Imm, DL, MVT::i32); 412 } 413 414 /// Return a target constant with the specified value, of type i64. 415 inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) { 416 return CurDAG->getTargetConstant(Imm, DL, MVT::i64); 417 } 418 419 SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth, 420 const SDLoc &DL) { 421 assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width"); 422 uint64_t Index = N->getConstantOperandVal(1); 423 MVT VecVT = N->getOperand(0).getSimpleValueType(); 424 return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL); 425 } 426 427 SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth, 428 const SDLoc &DL) { 429 assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width"); 430 uint64_t Index = N->getConstantOperandVal(2); 431 MVT VecVT = N->getSimpleValueType(0); 432 return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL); 433 } 434 435 SDValue getPermuteVINSERTCommutedImmediate(SDNode *N, unsigned VecWidth, 436 const SDLoc &DL) { 437 assert(VecWidth == 128 && "Unexpected vector width"); 438 uint64_t Index = N->getConstantOperandVal(2); 439 MVT VecVT = N->getSimpleValueType(0); 440 uint64_t InsertIdx = (Index * VecVT.getScalarSizeInBits()) / VecWidth; 441 assert((InsertIdx == 0 || InsertIdx == 1) && "Bad insertf128 index"); 442 // vinsert(0,sub,vec) -> [sub0][vec1] -> vperm2x128(0x30,vec,sub) 443 // vinsert(1,sub,vec) -> [vec0][sub0] -> vperm2x128(0x02,vec,sub) 444 return getI8Imm(InsertIdx ? 0x02 : 0x30, DL); 445 } 446 447 SDValue getSBBZero(SDNode *N) { 448 SDLoc dl(N); 449 MVT VT = N->getSimpleValueType(0); 450 451 // Create zero. 452 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32); 453 SDValue Zero = SDValue( 454 CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, std::nullopt), 0); 455 if (VT == MVT::i64) { 456 Zero = SDValue( 457 CurDAG->getMachineNode( 458 TargetOpcode::SUBREG_TO_REG, dl, MVT::i64, 459 CurDAG->getTargetConstant(0, dl, MVT::i64), Zero, 460 CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)), 461 0); 462 } 463 464 // Copy flags to the EFLAGS register and glue it to next node. 465 unsigned Opcode = N->getOpcode(); 466 assert((Opcode == X86ISD::SBB || Opcode == X86ISD::SETCC_CARRY) && 467 "Unexpected opcode for SBB materialization"); 468 unsigned FlagOpIndex = Opcode == X86ISD::SBB ? 2 : 1; 469 SDValue EFLAGS = 470 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS, 471 N->getOperand(FlagOpIndex), SDValue()); 472 473 // Create a 64-bit instruction if the result is 64-bits otherwise use the 474 // 32-bit version. 475 unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr; 476 MVT SBBVT = VT == MVT::i64 ? 
MVT::i64 : MVT::i32; 477 VTs = CurDAG->getVTList(SBBVT, MVT::i32); 478 return SDValue( 479 CurDAG->getMachineNode(Opc, dl, VTs, 480 {Zero, Zero, EFLAGS, EFLAGS.getValue(1)}), 481 0); 482 } 483 484 // Helper to detect unneeded and instructions on shift amounts. Called 485 // from PatFrags in tablegen. 486 bool isUnneededShiftMask(SDNode *N, unsigned Width) const { 487 assert(N->getOpcode() == ISD::AND && "Unexpected opcode"); 488 const APInt &Val = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue(); 489 490 if (Val.countTrailingOnes() >= Width) 491 return true; 492 493 APInt Mask = Val | CurDAG->computeKnownBits(N->getOperand(0)).Zero; 494 return Mask.countTrailingOnes() >= Width; 495 } 496 497 /// Return an SDNode that returns the value of the global base register. 498 /// Output instructions required to initialize the global base register, 499 /// if necessary. 500 SDNode *getGlobalBaseReg(); 501 502 /// Return a reference to the TargetMachine, casted to the target-specific 503 /// type. 504 const X86TargetMachine &getTargetMachine() const { 505 return static_cast<const X86TargetMachine &>(TM); 506 } 507 508 /// Return a reference to the TargetInstrInfo, casted to the target-specific 509 /// type. 510 const X86InstrInfo *getInstrInfo() const { 511 return Subtarget->getInstrInfo(); 512 } 513 514 /// Return a condition code of the given SDNode 515 X86::CondCode getCondFromNode(SDNode *N) const; 516 517 /// Address-mode matching performs shift-of-and to and-of-shift 518 /// reassociation in order to expose more scaled addressing 519 /// opportunities. 520 bool ComplexPatternFuncMutatesDAG() const override { 521 return true; 522 } 523 524 bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const; 525 526 // Indicates we should prefer to use a non-temporal load for this load. 
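  // That requires the !nontemporal hint, an alignment of at least the memory
  // width, and a suitable vector non-temporal load (e.g. MOVNTDQA with SSE4.1
  // for 16 bytes, VMOVNTDQA with AVX2/AVX-512 for 32/64 bytes); 4- and 8-byte
  // loads always use regular loads.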
527 bool useNonTemporalLoad(LoadSDNode *N) const { 528 if (!N->isNonTemporal()) 529 return false; 530 531 unsigned StoreSize = N->getMemoryVT().getStoreSize(); 532 533 if (N->getAlign().value() < StoreSize) 534 return false; 535 536 switch (StoreSize) { 537 default: llvm_unreachable("Unsupported store size"); 538 case 4: 539 case 8: 540 return false; 541 case 16: 542 return Subtarget->hasSSE41(); 543 case 32: 544 return Subtarget->hasAVX2(); 545 case 64: 546 return Subtarget->hasAVX512(); 547 } 548 } 549 550 bool foldLoadStoreIntoMemOperand(SDNode *Node); 551 MachineSDNode *matchBEXTRFromAndImm(SDNode *Node); 552 bool matchBitExtract(SDNode *Node); 553 bool shrinkAndImmediate(SDNode *N); 554 bool isMaskZeroExtended(SDNode *N) const; 555 bool tryShiftAmountMod(SDNode *N); 556 bool tryShrinkShlLogicImm(SDNode *N); 557 bool tryVPTERNLOG(SDNode *N); 558 bool matchVPTERNLOG(SDNode *Root, SDNode *ParentA, SDNode *ParentB, 559 SDNode *ParentC, SDValue A, SDValue B, SDValue C, 560 uint8_t Imm); 561 bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask); 562 bool tryMatchBitSelect(SDNode *N); 563 564 MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad, 565 const SDLoc &dl, MVT VT, SDNode *Node); 566 MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad, 567 const SDLoc &dl, MVT VT, SDNode *Node, 568 SDValue &InFlag); 569 570 bool tryOptimizeRem8Extend(SDNode *N); 571 572 bool onlyUsesZeroFlag(SDValue Flags) const; 573 bool hasNoSignFlagUses(SDValue Flags) const; 574 bool hasNoCarryFlagUses(SDValue Flags) const; 575 }; 576 } 577 578 char X86DAGToDAGISel::ID = 0; 579 580 INITIALIZE_PASS(X86DAGToDAGISel, DEBUG_TYPE, PASS_NAME, false, false) 581 582 // Returns true if this masked compare can be implemented legally with this 583 // type. 584 static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) { 585 unsigned Opcode = N->getOpcode(); 586 if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMM || 587 Opcode == X86ISD::STRICT_CMPM || Opcode == ISD::SETCC || 588 Opcode == X86ISD::CMPMM_SAE || Opcode == X86ISD::VFPCLASS) { 589 // We can get 256-bit 8 element types here without VLX being enabled. When 590 // this happens we will use 512-bit operations and the mask will not be 591 // zero extended. 592 EVT OpVT = N->getOperand(0).getValueType(); 593 // The first operand of X86ISD::STRICT_CMPM is chain, so we need to get the 594 // second operand. 595 if (Opcode == X86ISD::STRICT_CMPM) 596 OpVT = N->getOperand(1).getValueType(); 597 if (OpVT.is256BitVector() || OpVT.is128BitVector()) 598 return Subtarget->hasVLX(); 599 600 return true; 601 } 602 // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check. 603 if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM || 604 Opcode == X86ISD::FSETCCM_SAE) 605 return true; 606 607 return false; 608 } 609 610 // Returns true if we can assume the writer of the mask has zero extended it 611 // for us. 612 bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const { 613 // If this is an AND, check if we have a compare on either side. As long as 614 // one side guarantees the mask is zero extended, the AND will preserve those 615 // zeros. 
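  // For example, in (and (X86ISD::CMPM a, b, cc), y) the compare already
  // produces a zero-extended mask, and AND-ing with y cannot set any of the
  // cleared upper bits.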
616 if (N->getOpcode() == ISD::AND) 617 return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) || 618 isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget); 619 620 return isLegalMaskCompare(N, Subtarget); 621 } 622 623 bool 624 X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const { 625 if (OptLevel == CodeGenOpt::None) return false; 626 627 if (!N.hasOneUse()) 628 return false; 629 630 if (N.getOpcode() != ISD::LOAD) 631 return true; 632 633 // Don't fold non-temporal loads if we have an instruction for them. 634 if (useNonTemporalLoad(cast<LoadSDNode>(N))) 635 return false; 636 637 // If N is a load, do additional profitability checks. 638 if (U == Root) { 639 switch (U->getOpcode()) { 640 default: break; 641 case X86ISD::ADD: 642 case X86ISD::ADC: 643 case X86ISD::SUB: 644 case X86ISD::SBB: 645 case X86ISD::AND: 646 case X86ISD::XOR: 647 case X86ISD::OR: 648 case ISD::ADD: 649 case ISD::ADDCARRY: 650 case ISD::AND: 651 case ISD::OR: 652 case ISD::XOR: { 653 SDValue Op1 = U->getOperand(1); 654 655 // If the other operand is a 8-bit immediate we should fold the immediate 656 // instead. This reduces code size. 657 // e.g. 658 // movl 4(%esp), %eax 659 // addl $4, %eax 660 // vs. 661 // movl $4, %eax 662 // addl 4(%esp), %eax 663 // The former is 2 bytes shorter. In case where the increment is 1, then 664 // the saving can be 4 bytes (by using incl %eax). 665 if (auto *Imm = dyn_cast<ConstantSDNode>(Op1)) { 666 if (Imm->getAPIntValue().isSignedIntN(8)) 667 return false; 668 669 // If this is a 64-bit AND with an immediate that fits in 32-bits, 670 // prefer using the smaller and over folding the load. This is needed to 671 // make sure immediates created by shrinkAndImmediate are always folded. 672 // Ideally we would narrow the load during DAG combine and get the 673 // best of both worlds. 674 if (U->getOpcode() == ISD::AND && 675 Imm->getAPIntValue().getBitWidth() == 64 && 676 Imm->getAPIntValue().isIntN(32)) 677 return false; 678 679 // If this really a zext_inreg that can be represented with a movzx 680 // instruction, prefer that. 681 // TODO: We could shrink the load and fold if it is non-volatile. 682 if (U->getOpcode() == ISD::AND && 683 (Imm->getAPIntValue() == UINT8_MAX || 684 Imm->getAPIntValue() == UINT16_MAX || 685 Imm->getAPIntValue() == UINT32_MAX)) 686 return false; 687 688 // ADD/SUB with can negate the immediate and use the opposite operation 689 // to fit 128 into a sign extended 8 bit immediate. 690 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) && 691 (-Imm->getAPIntValue()).isSignedIntN(8)) 692 return false; 693 694 if ((U->getOpcode() == X86ISD::ADD || U->getOpcode() == X86ISD::SUB) && 695 (-Imm->getAPIntValue()).isSignedIntN(8) && 696 hasNoCarryFlagUses(SDValue(U, 1))) 697 return false; 698 } 699 700 // If the other operand is a TLS address, we should fold it instead. 701 // This produces 702 // movl %gs:0, %eax 703 // leal i@NTPOFF(%eax), %eax 704 // instead of 705 // movl $i@NTPOFF, %eax 706 // addl %gs:0, %eax 707 // if the block also has an access to a second TLS address this will save 708 // a load. 709 // FIXME: This is probably also true for non-TLS addresses. 710 if (Op1.getOpcode() == X86ISD::Wrapper) { 711 SDValue Val = Op1.getOperand(0); 712 if (Val.getOpcode() == ISD::TargetGlobalTLSAddress) 713 return false; 714 } 715 716 // Don't fold load if this matches the BTS/BTR/BTC patterns. 
717 // BTS: (or X, (shl 1, n)) 718 // BTR: (and X, (rotl -2, n)) 719 // BTC: (xor X, (shl 1, n)) 720 if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) { 721 if (U->getOperand(0).getOpcode() == ISD::SHL && 722 isOneConstant(U->getOperand(0).getOperand(0))) 723 return false; 724 725 if (U->getOperand(1).getOpcode() == ISD::SHL && 726 isOneConstant(U->getOperand(1).getOperand(0))) 727 return false; 728 } 729 if (U->getOpcode() == ISD::AND) { 730 SDValue U0 = U->getOperand(0); 731 SDValue U1 = U->getOperand(1); 732 if (U0.getOpcode() == ISD::ROTL) { 733 auto *C = dyn_cast<ConstantSDNode>(U0.getOperand(0)); 734 if (C && C->getSExtValue() == -2) 735 return false; 736 } 737 738 if (U1.getOpcode() == ISD::ROTL) { 739 auto *C = dyn_cast<ConstantSDNode>(U1.getOperand(0)); 740 if (C && C->getSExtValue() == -2) 741 return false; 742 } 743 } 744 745 break; 746 } 747 case ISD::SHL: 748 case ISD::SRA: 749 case ISD::SRL: 750 // Don't fold a load into a shift by immediate. The BMI2 instructions 751 // support folding a load, but not an immediate. The legacy instructions 752 // support folding an immediate, but can't fold a load. Folding an 753 // immediate is preferable to folding a load. 754 if (isa<ConstantSDNode>(U->getOperand(1))) 755 return false; 756 757 break; 758 } 759 } 760 761 // Prevent folding a load if this can be implemented with an insert_subreg or 762 // a move that implicitly zeroes. 763 if (Root->getOpcode() == ISD::INSERT_SUBVECTOR && 764 isNullConstant(Root->getOperand(2)) && 765 (Root->getOperand(0).isUndef() || 766 ISD::isBuildVectorAllZeros(Root->getOperand(0).getNode()))) 767 return false; 768 769 return true; 770 } 771 772 // Indicates it is profitable to form an AVX512 masked operation. Returning 773 // false will favor a register-register masked move or vblendm and the 774 // operation will be selected separately. 775 bool X86DAGToDAGISel::isProfitableToFormMaskedOp(SDNode *N) const { 776 assert( 777 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::SELECTS) && 778 "Unexpected opcode!"); 779 780 // If the operation has additional users, the operation will be duplicated. 781 // Check the use count to prevent that. 782 // FIXME: Are there cheap opcodes we might want to duplicate? 783 return N->getOperand(1).hasOneUse(); 784 } 785 786 /// Replace the original chain operand of the call with 787 /// load's chain operand and move load below the call's chain operand.
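/// After the update, the load's chain input is the call's original chain
/// operand and the call's chain input is the load's output chain, so the
/// load sits directly below the call and can be folded into it.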
788 static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load, 789 SDValue Call, SDValue OrigChain) { 790 SmallVector<SDValue, 8> Ops; 791 SDValue Chain = OrigChain.getOperand(0); 792 if (Chain.getNode() == Load.getNode()) 793 Ops.push_back(Load.getOperand(0)); 794 else { 795 assert(Chain.getOpcode() == ISD::TokenFactor && 796 "Unexpected chain operand"); 797 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) 798 if (Chain.getOperand(i).getNode() == Load.getNode()) 799 Ops.push_back(Load.getOperand(0)); 800 else 801 Ops.push_back(Chain.getOperand(i)); 802 SDValue NewChain = 803 CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops); 804 Ops.clear(); 805 Ops.push_back(NewChain); 806 } 807 Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end()); 808 CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops); 809 CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0), 810 Load.getOperand(1), Load.getOperand(2)); 811 812 Ops.clear(); 813 Ops.push_back(SDValue(Load.getNode(), 1)); 814 Ops.append(Call->op_begin() + 1, Call->op_end()); 815 CurDAG->UpdateNodeOperands(Call.getNode(), Ops); 816 } 817 818 /// Return true if call address is a load and it can be 819 /// moved below CALLSEQ_START and the chains leading up to the call. 820 /// Return the CALLSEQ_START by reference as a second output. 821 /// In the case of a tail call, there isn't a callseq node between the call 822 /// chain and the load. 823 static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) { 824 // The transformation is somewhat dangerous if the call's chain was glued to 825 // the call. After MoveBelowOrigChain the load is moved between the call and 826 // the chain, this can create a cycle if the load is not folded. So it is 827 // *really* important that we are sure the load will be folded. 828 if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse()) 829 return false; 830 auto *LD = dyn_cast<LoadSDNode>(Callee.getNode()); 831 if (!LD || 832 !LD->isSimple() || 833 LD->getAddressingMode() != ISD::UNINDEXED || 834 LD->getExtensionType() != ISD::NON_EXTLOAD) 835 return false; 836 837 // Now let's find the callseq_start. 838 while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) { 839 if (!Chain.hasOneUse()) 840 return false; 841 Chain = Chain.getOperand(0); 842 } 843 844 if (!Chain.getNumOperands()) 845 return false; 846 // Since we are not checking for AA here, conservatively abort if the chain 847 // writes to memory. It's not safe to move the callee (a load) across a store. 848 if (isa<MemSDNode>(Chain.getNode()) && 849 cast<MemSDNode>(Chain.getNode())->writeMem()) 850 return false; 851 if (Chain.getOperand(0).getNode() == Callee.getNode()) 852 return true; 853 if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor && 854 Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) && 855 Callee.getValue(1).hasOneUse()) 856 return true; 857 return false; 858 } 859 860 static bool isEndbrImm64(uint64_t Imm) { 861 // There may be some other prefix bytes between 0xF3 and 0x0F1EFA. 
862 // i.g: 0xF3660F1EFA, 0xF3670F1EFA 863 if ((Imm & 0x00FFFFFF) != 0x0F1EFA) 864 return false; 865 866 uint8_t OptionalPrefixBytes [] = {0x26, 0x2e, 0x36, 0x3e, 0x64, 867 0x65, 0x66, 0x67, 0xf0, 0xf2}; 868 int i = 24; // 24bit 0x0F1EFA has matched 869 while (i < 64) { 870 uint8_t Byte = (Imm >> i) & 0xFF; 871 if (Byte == 0xF3) 872 return true; 873 if (!llvm::is_contained(OptionalPrefixBytes, Byte)) 874 return false; 875 i += 8; 876 } 877 878 return false; 879 } 880 881 void X86DAGToDAGISel::PreprocessISelDAG() { 882 bool MadeChange = false; 883 for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), 884 E = CurDAG->allnodes_end(); I != E; ) { 885 SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues. 886 887 // This is for CET enhancement. 888 // 889 // ENDBR32 and ENDBR64 have specific opcodes: 890 // ENDBR32: F3 0F 1E FB 891 // ENDBR64: F3 0F 1E FA 892 // And we want that attackers won’t find unintended ENDBR32/64 893 // opcode matches in the binary 894 // Here’s an example: 895 // If the compiler had to generate asm for the following code: 896 // a = 0xF30F1EFA 897 // it could, for example, generate: 898 // mov 0xF30F1EFA, dword ptr[a] 899 // In such a case, the binary would include a gadget that starts 900 // with a fake ENDBR64 opcode. Therefore, we split such generation 901 // into multiple operations, let it not shows in the binary 902 if (N->getOpcode() == ISD::Constant) { 903 MVT VT = N->getSimpleValueType(0); 904 int64_t Imm = cast<ConstantSDNode>(N)->getSExtValue(); 905 int32_t EndbrImm = Subtarget->is64Bit() ? 0xF30F1EFA : 0xF30F1EFB; 906 if (Imm == EndbrImm || isEndbrImm64(Imm)) { 907 // Check that the cf-protection-branch is enabled. 908 Metadata *CFProtectionBranch = 909 MF->getMMI().getModule()->getModuleFlag("cf-protection-branch"); 910 if (CFProtectionBranch || IndirectBranchTracking) { 911 SDLoc dl(N); 912 SDValue Complement = CurDAG->getConstant(~Imm, dl, VT, false, true); 913 Complement = CurDAG->getNOT(dl, Complement, VT); 914 --I; 915 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Complement); 916 ++I; 917 MadeChange = true; 918 continue; 919 } 920 } 921 } 922 923 // If this is a target specific AND node with no flag usages, turn it back 924 // into ISD::AND to enable test instruction matching. 925 if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) { 926 SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0), 927 N->getOperand(0), N->getOperand(1)); 928 --I; 929 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); 930 ++I; 931 MadeChange = true; 932 continue; 933 } 934 935 // Convert vector increment or decrement to sub/add with an all-ones 936 // constant: 937 // add X, <1, 1...> --> sub X, <-1, -1...> 938 // sub X, <1, 1...> --> add X, <-1, -1...> 939 // The all-ones vector constant can be materialized using a pcmpeq 940 // instruction that is commonly recognized as an idiom (has no register 941 // dependency), so that's better/smaller than loading a splat 1 constant. 942 // 943 // But don't do this if it would inhibit a potentially profitable load 944 // folding opportunity for the other operand. That only occurs with the 945 // intersection of: 946 // (1) The other operand (op0) is load foldable. 947 // (2) The op is an add (otherwise, we are *creating* an add and can still 948 // load fold the other op). 949 // (3) The target has AVX (otherwise, we have a destructive add and can't 950 // load fold the other op without killing the constant op). 
951 // (4) The constant 1 vector has multiple uses (so it is profitable to load 952 // into a register anyway). 953 auto mayPreventLoadFold = [&]() { 954 return X86::mayFoldLoad(N->getOperand(0), *Subtarget) && 955 N->getOpcode() == ISD::ADD && Subtarget->hasAVX() && 956 !N->getOperand(1).hasOneUse(); 957 }; 958 if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) && 959 N->getSimpleValueType(0).isVector() && !mayPreventLoadFold()) { 960 APInt SplatVal; 961 if (X86::isConstantSplat(N->getOperand(1), SplatVal) && 962 SplatVal.isOne()) { 963 SDLoc DL(N); 964 965 MVT VT = N->getSimpleValueType(0); 966 unsigned NumElts = VT.getSizeInBits() / 32; 967 SDValue AllOnes = 968 CurDAG->getAllOnesConstant(DL, MVT::getVectorVT(MVT::i32, NumElts)); 969 AllOnes = CurDAG->getBitcast(VT, AllOnes); 970 971 unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD; 972 SDValue Res = 973 CurDAG->getNode(NewOpcode, DL, VT, N->getOperand(0), AllOnes); 974 --I; 975 CurDAG->ReplaceAllUsesWith(N, Res.getNode()); 976 ++I; 977 MadeChange = true; 978 continue; 979 } 980 } 981 982 switch (N->getOpcode()) { 983 case X86ISD::VBROADCAST: { 984 MVT VT = N->getSimpleValueType(0); 985 // Emulate v32i16/v64i8 broadcast without BWI. 986 if (!Subtarget->hasBWI() && (VT == MVT::v32i16 || VT == MVT::v64i8)) { 987 MVT NarrowVT = VT == MVT::v32i16 ? MVT::v16i16 : MVT::v32i8; 988 SDLoc dl(N); 989 SDValue NarrowBCast = 990 CurDAG->getNode(X86ISD::VBROADCAST, dl, NarrowVT, N->getOperand(0)); 991 SDValue Res = 992 CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT), 993 NarrowBCast, CurDAG->getIntPtrConstant(0, dl)); 994 unsigned Index = VT == MVT::v32i16 ? 16 : 32; 995 Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast, 996 CurDAG->getIntPtrConstant(Index, dl)); 997 998 --I; 999 CurDAG->ReplaceAllUsesWith(N, Res.getNode()); 1000 ++I; 1001 MadeChange = true; 1002 continue; 1003 } 1004 1005 break; 1006 } 1007 case X86ISD::VBROADCAST_LOAD: { 1008 MVT VT = N->getSimpleValueType(0); 1009 // Emulate v32i16/v64i8 broadcast without BWI. 1010 if (!Subtarget->hasBWI() && (VT == MVT::v32i16 || VT == MVT::v64i8)) { 1011 MVT NarrowVT = VT == MVT::v32i16 ? MVT::v16i16 : MVT::v32i8; 1012 auto *MemNode = cast<MemSDNode>(N); 1013 SDLoc dl(N); 1014 SDVTList VTs = CurDAG->getVTList(NarrowVT, MVT::Other); 1015 SDValue Ops[] = {MemNode->getChain(), MemNode->getBasePtr()}; 1016 SDValue NarrowBCast = CurDAG->getMemIntrinsicNode( 1017 X86ISD::VBROADCAST_LOAD, dl, VTs, Ops, MemNode->getMemoryVT(), 1018 MemNode->getMemOperand()); 1019 SDValue Res = 1020 CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT), 1021 NarrowBCast, CurDAG->getIntPtrConstant(0, dl)); 1022 unsigned Index = VT == MVT::v32i16 ? 16 : 32; 1023 Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast, 1024 CurDAG->getIntPtrConstant(Index, dl)); 1025 1026 --I; 1027 SDValue To[] = {Res, NarrowBCast.getValue(1)}; 1028 CurDAG->ReplaceAllUsesWith(N, To); 1029 ++I; 1030 MadeChange = true; 1031 continue; 1032 } 1033 1034 break; 1035 } 1036 case ISD::VSELECT: { 1037 // Replace VSELECT with non-mask conditions with with BLENDV. 
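      // That is, when the condition is a full-width vector (e.g. a setcc
      // result) rather than a vXi1 mask, select on the sign bit of each
      // condition element via X86ISD::BLENDV; vXi1 conditions are left for
      // the masked-select patterns.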
1038 if (N->getOperand(0).getValueType().getVectorElementType() == MVT::i1) 1039 break; 1040 1041 assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!"); 1042 SDValue Blendv = 1043 CurDAG->getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), 1044 N->getOperand(0), N->getOperand(1), N->getOperand(2)); 1045 --I; 1046 CurDAG->ReplaceAllUsesWith(N, Blendv.getNode()); 1047 ++I; 1048 MadeChange = true; 1049 continue; 1050 } 1051 case ISD::FP_ROUND: 1052 case ISD::STRICT_FP_ROUND: 1053 case ISD::FP_TO_SINT: 1054 case ISD::FP_TO_UINT: 1055 case ISD::STRICT_FP_TO_SINT: 1056 case ISD::STRICT_FP_TO_UINT: { 1057 // Replace vector fp_to_s/uint with their X86 specific equivalent so we 1058 // don't need 2 sets of patterns. 1059 if (!N->getSimpleValueType(0).isVector()) 1060 break; 1061 1062 unsigned NewOpc; 1063 switch (N->getOpcode()) { 1064 default: llvm_unreachable("Unexpected opcode!"); 1065 case ISD::FP_ROUND: NewOpc = X86ISD::VFPROUND; break; 1066 case ISD::STRICT_FP_ROUND: NewOpc = X86ISD::STRICT_VFPROUND; break; 1067 case ISD::STRICT_FP_TO_SINT: NewOpc = X86ISD::STRICT_CVTTP2SI; break; 1068 case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break; 1069 case ISD::STRICT_FP_TO_UINT: NewOpc = X86ISD::STRICT_CVTTP2UI; break; 1070 case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break; 1071 } 1072 SDValue Res; 1073 if (N->isStrictFPOpcode()) 1074 Res = 1075 CurDAG->getNode(NewOpc, SDLoc(N), {N->getValueType(0), MVT::Other}, 1076 {N->getOperand(0), N->getOperand(1)}); 1077 else 1078 Res = 1079 CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0), 1080 N->getOperand(0)); 1081 --I; 1082 CurDAG->ReplaceAllUsesWith(N, Res.getNode()); 1083 ++I; 1084 MadeChange = true; 1085 continue; 1086 } 1087 case ISD::SHL: 1088 case ISD::SRA: 1089 case ISD::SRL: { 1090 // Replace vector shifts with their X86 specific equivalent so we don't 1091 // need 2 sets of patterns. 1092 if (!N->getValueType(0).isVector()) 1093 break; 1094 1095 unsigned NewOpc; 1096 switch (N->getOpcode()) { 1097 default: llvm_unreachable("Unexpected opcode!"); 1098 case ISD::SHL: NewOpc = X86ISD::VSHLV; break; 1099 case ISD::SRA: NewOpc = X86ISD::VSRAV; break; 1100 case ISD::SRL: NewOpc = X86ISD::VSRLV; break; 1101 } 1102 SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0), 1103 N->getOperand(0), N->getOperand(1)); 1104 --I; 1105 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); 1106 ++I; 1107 MadeChange = true; 1108 continue; 1109 } 1110 case ISD::ANY_EXTEND: 1111 case ISD::ANY_EXTEND_VECTOR_INREG: { 1112 // Replace vector any extend with the zero extend equivalents so we don't 1113 // need 2 sets of patterns. Ignore vXi1 extensions. 1114 if (!N->getValueType(0).isVector()) 1115 break; 1116 1117 unsigned NewOpc; 1118 if (N->getOperand(0).getScalarValueSizeInBits() == 1) { 1119 assert(N->getOpcode() == ISD::ANY_EXTEND && 1120 "Unexpected opcode for mask vector!"); 1121 NewOpc = ISD::SIGN_EXTEND; 1122 } else { 1123 NewOpc = N->getOpcode() == ISD::ANY_EXTEND 1124 ? 
ISD::ZERO_EXTEND 1125 : ISD::ZERO_EXTEND_VECTOR_INREG; 1126 } 1127 1128 SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0), 1129 N->getOperand(0)); 1130 --I; 1131 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); 1132 ++I; 1133 MadeChange = true; 1134 continue; 1135 } 1136 case ISD::FCEIL: 1137 case ISD::STRICT_FCEIL: 1138 case ISD::FFLOOR: 1139 case ISD::STRICT_FFLOOR: 1140 case ISD::FTRUNC: 1141 case ISD::STRICT_FTRUNC: 1142 case ISD::FROUNDEVEN: 1143 case ISD::STRICT_FROUNDEVEN: 1144 case ISD::FNEARBYINT: 1145 case ISD::STRICT_FNEARBYINT: 1146 case ISD::FRINT: 1147 case ISD::STRICT_FRINT: { 1148 // Replace fp rounding with their X86 specific equivalent so we don't 1149 // need 2 sets of patterns. 1150 unsigned Imm; 1151 switch (N->getOpcode()) { 1152 default: llvm_unreachable("Unexpected opcode!"); 1153 case ISD::STRICT_FCEIL: 1154 case ISD::FCEIL: Imm = 0xA; break; 1155 case ISD::STRICT_FFLOOR: 1156 case ISD::FFLOOR: Imm = 0x9; break; 1157 case ISD::STRICT_FTRUNC: 1158 case ISD::FTRUNC: Imm = 0xB; break; 1159 case ISD::STRICT_FROUNDEVEN: 1160 case ISD::FROUNDEVEN: Imm = 0x8; break; 1161 case ISD::STRICT_FNEARBYINT: 1162 case ISD::FNEARBYINT: Imm = 0xC; break; 1163 case ISD::STRICT_FRINT: 1164 case ISD::FRINT: Imm = 0x4; break; 1165 } 1166 SDLoc dl(N); 1167 bool IsStrict = N->isStrictFPOpcode(); 1168 SDValue Res; 1169 if (IsStrict) 1170 Res = CurDAG->getNode(X86ISD::STRICT_VRNDSCALE, dl, 1171 {N->getValueType(0), MVT::Other}, 1172 {N->getOperand(0), N->getOperand(1), 1173 CurDAG->getTargetConstant(Imm, dl, MVT::i32)}); 1174 else 1175 Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, N->getValueType(0), 1176 N->getOperand(0), 1177 CurDAG->getTargetConstant(Imm, dl, MVT::i32)); 1178 --I; 1179 CurDAG->ReplaceAllUsesWith(N, Res.getNode()); 1180 ++I; 1181 MadeChange = true; 1182 continue; 1183 } 1184 case X86ISD::FANDN: 1185 case X86ISD::FAND: 1186 case X86ISD::FOR: 1187 case X86ISD::FXOR: { 1188 // Widen scalar fp logic ops to vector to reduce isel patterns. 1189 // FIXME: Can we do this during lowering/combine. 1190 MVT VT = N->getSimpleValueType(0); 1191 if (VT.isVector() || VT == MVT::f128) 1192 break; 1193 1194 MVT VecVT = VT == MVT::f64 ? MVT::v2f64 1195 : VT == MVT::f32 ? 
MVT::v4f32 1196 : MVT::v8f16; 1197 1198 SDLoc dl(N); 1199 SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, 1200 N->getOperand(0)); 1201 SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, 1202 N->getOperand(1)); 1203 1204 SDValue Res; 1205 if (Subtarget->hasSSE2()) { 1206 EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger(); 1207 Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0); 1208 Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1); 1209 unsigned Opc; 1210 switch (N->getOpcode()) { 1211 default: llvm_unreachable("Unexpected opcode!"); 1212 case X86ISD::FANDN: Opc = X86ISD::ANDNP; break; 1213 case X86ISD::FAND: Opc = ISD::AND; break; 1214 case X86ISD::FOR: Opc = ISD::OR; break; 1215 case X86ISD::FXOR: Opc = ISD::XOR; break; 1216 } 1217 Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1); 1218 Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res); 1219 } else { 1220 Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1); 1221 } 1222 Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, 1223 CurDAG->getIntPtrConstant(0, dl)); 1224 --I; 1225 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); 1226 ++I; 1227 MadeChange = true; 1228 continue; 1229 } 1230 } 1231 1232 if (OptLevel != CodeGenOpt::None && 1233 // Only do this when the target can fold the load into the call or 1234 // jmp. 1235 !Subtarget->useIndirectThunkCalls() && 1236 ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) || 1237 (N->getOpcode() == X86ISD::TC_RETURN && 1238 (Subtarget->is64Bit() || 1239 !getTargetMachine().isPositionIndependent())))) { 1240 /// Also try moving call address load from outside callseq_start to just 1241 /// before the call to allow it to be folded. 1242 /// 1243 /// [Load chain] 1244 /// ^ 1245 /// | 1246 /// [Load] 1247 /// ^ ^ 1248 /// | | 1249 /// / \-- 1250 /// / | 1251 ///[CALLSEQ_START] | 1252 /// ^ | 1253 /// | | 1254 /// [LOAD/C2Reg] | 1255 /// | | 1256 /// \ / 1257 /// \ / 1258 /// [CALL] 1259 bool HasCallSeq = N->getOpcode() == X86ISD::CALL; 1260 SDValue Chain = N->getOperand(0); 1261 SDValue Load = N->getOperand(1); 1262 if (!isCalleeLoad(Load, Chain, HasCallSeq)) 1263 continue; 1264 moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain); 1265 ++NumLoadMoved; 1266 MadeChange = true; 1267 continue; 1268 } 1269 1270 // Lower fpround and fpextend nodes that target the FP stack to be store and 1271 // load to the stack. This is a gross hack. We would like to simply mark 1272 // these as being illegal, but when we do that, legalize produces these when 1273 // it expands calls, then expands these in the same legalize pass. We would 1274 // like dag combine to be able to hack on these between the call expansion 1275 // and the node legalization. As such this pass basically does "really 1276 // late" legalization of these inline with the X86 isel pass. 1277 // FIXME: This should only happen when not compiled with -O0. 1278 switch (N->getOpcode()) { 1279 default: continue; 1280 case ISD::FP_ROUND: 1281 case ISD::FP_EXTEND: 1282 { 1283 MVT SrcVT = N->getOperand(0).getSimpleValueType(); 1284 MVT DstVT = N->getSimpleValueType(0); 1285 1286 // If any of the sources are vectors, no fp stack involved. 1287 if (SrcVT.isVector() || DstVT.isVector()) 1288 continue; 1289 1290 // If the source and destination are SSE registers, then this is a legal 1291 // conversion that should not be lowered. 
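      // Conversions that do involve the FP stack (and are not no-ops) are
      // rewritten below as a truncating store to a stack temporary followed
      // by an extending load.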
1292 const X86TargetLowering *X86Lowering = 1293 static_cast<const X86TargetLowering *>(TLI); 1294 bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT); 1295 bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT); 1296 if (SrcIsSSE && DstIsSSE) 1297 continue; 1298 1299 if (!SrcIsSSE && !DstIsSSE) { 1300 // If this is an FPStack extension, it is a noop. 1301 if (N->getOpcode() == ISD::FP_EXTEND) 1302 continue; 1303 // If this is a value-preserving FPStack truncation, it is a noop. 1304 if (N->getConstantOperandVal(1)) 1305 continue; 1306 } 1307 1308 // Here we could have an FP stack truncation or an FPStack <-> SSE convert. 1309 // FPStack has extload and truncstore. SSE can fold direct loads into other 1310 // operations. Based on this, decide what we want to do. 1311 MVT MemVT = (N->getOpcode() == ISD::FP_ROUND) ? DstVT : SrcVT; 1312 SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT); 1313 int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex(); 1314 MachinePointerInfo MPI = 1315 MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI); 1316 SDLoc dl(N); 1317 1318 // FIXME: optimize the case where the src/dest is a load or store? 1319 1320 SDValue Store = CurDAG->getTruncStore( 1321 CurDAG->getEntryNode(), dl, N->getOperand(0), MemTmp, MPI, MemVT); 1322 SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, 1323 MemTmp, MPI, MemVT); 1324 1325 // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the 1326 // extload we created. This will cause general havok on the dag because 1327 // anything below the conversion could be folded into other existing nodes. 1328 // To avoid invalidating 'I', back it up to the convert node. 1329 --I; 1330 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result); 1331 break; 1332 } 1333 1334 //The sequence of events for lowering STRICT_FP versions of these nodes requires 1335 //dealing with the chain differently, as there is already a preexisting chain. 1336 case ISD::STRICT_FP_ROUND: 1337 case ISD::STRICT_FP_EXTEND: 1338 { 1339 MVT SrcVT = N->getOperand(1).getSimpleValueType(); 1340 MVT DstVT = N->getSimpleValueType(0); 1341 1342 // If any of the sources are vectors, no fp stack involved. 1343 if (SrcVT.isVector() || DstVT.isVector()) 1344 continue; 1345 1346 // If the source and destination are SSE registers, then this is a legal 1347 // conversion that should not be lowered. 1348 const X86TargetLowering *X86Lowering = 1349 static_cast<const X86TargetLowering *>(TLI); 1350 bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT); 1351 bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT); 1352 if (SrcIsSSE && DstIsSSE) 1353 continue; 1354 1355 if (!SrcIsSSE && !DstIsSSE) { 1356 // If this is an FPStack extension, it is a noop. 1357 if (N->getOpcode() == ISD::STRICT_FP_EXTEND) 1358 continue; 1359 // If this is a value-preserving FPStack truncation, it is a noop. 1360 if (N->getConstantOperandVal(2)) 1361 continue; 1362 } 1363 1364 // Here we could have an FP stack truncation or an FPStack <-> SSE convert. 1365 // FPStack has extload and truncstore. SSE can fold direct loads into other 1366 // operations. Based on this, decide what we want to do. 1367 MVT MemVT = (N->getOpcode() == ISD::STRICT_FP_ROUND) ? 
DstVT : SrcVT; 1368 SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT); 1369 int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex(); 1370 MachinePointerInfo MPI = 1371 MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI); 1372 SDLoc dl(N); 1373 1374 // FIXME: optimize the case where the src/dest is a load or store? 1375 1376 //Since the operation is StrictFP, use the preexisting chain. 1377 SDValue Store, Result; 1378 if (!SrcIsSSE) { 1379 SDVTList VTs = CurDAG->getVTList(MVT::Other); 1380 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), MemTmp}; 1381 Store = CurDAG->getMemIntrinsicNode(X86ISD::FST, dl, VTs, Ops, MemVT, 1382 MPI, /*Align*/ std::nullopt, 1383 MachineMemOperand::MOStore); 1384 if (N->getFlags().hasNoFPExcept()) { 1385 SDNodeFlags Flags = Store->getFlags(); 1386 Flags.setNoFPExcept(true); 1387 Store->setFlags(Flags); 1388 } 1389 } else { 1390 assert(SrcVT == MemVT && "Unexpected VT!"); 1391 Store = CurDAG->getStore(N->getOperand(0), dl, N->getOperand(1), MemTmp, 1392 MPI); 1393 } 1394 1395 if (!DstIsSSE) { 1396 SDVTList VTs = CurDAG->getVTList(DstVT, MVT::Other); 1397 SDValue Ops[] = {Store, MemTmp}; 1398 Result = CurDAG->getMemIntrinsicNode( 1399 X86ISD::FLD, dl, VTs, Ops, MemVT, MPI, 1400 /*Align*/ std::nullopt, MachineMemOperand::MOLoad); 1401 if (N->getFlags().hasNoFPExcept()) { 1402 SDNodeFlags Flags = Result->getFlags(); 1403 Flags.setNoFPExcept(true); 1404 Result->setFlags(Flags); 1405 } 1406 } else { 1407 assert(DstVT == MemVT && "Unexpected VT!"); 1408 Result = CurDAG->getLoad(DstVT, dl, Store, MemTmp, MPI); 1409 } 1410 1411 // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the 1412 // extload we created. This will cause general havok on the dag because 1413 // anything below the conversion could be folded into other existing nodes. 1414 // To avoid invalidating 'I', back it up to the convert node. 1415 --I; 1416 CurDAG->ReplaceAllUsesWith(N, Result.getNode()); 1417 break; 1418 } 1419 } 1420 1421 1422 // Now that we did that, the node is dead. Increment the iterator to the 1423 // next node to process, then delete N. 1424 ++I; 1425 MadeChange = true; 1426 } 1427 1428 // Remove any dead nodes that may have been left behind. 1429 if (MadeChange) 1430 CurDAG->RemoveDeadNodes(); 1431 } 1432 1433 // Look for a redundant movzx/movsx that can occur after an 8-bit divrem. 1434 bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) { 1435 unsigned Opc = N->getMachineOpcode(); 1436 if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 && 1437 Opc != X86::MOVSX64rr8) 1438 return false; 1439 1440 SDValue N0 = N->getOperand(0); 1441 1442 // We need to be extracting the lower bit of an extend. 1443 if (!N0.isMachineOpcode() || 1444 N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG || 1445 N0.getConstantOperandVal(1) != X86::sub_8bit) 1446 return false; 1447 1448 // We're looking for either a movsx or movzx to match the original opcode. 1449 unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX 1450 : X86::MOVSX32rr8_NOREX; 1451 SDValue N00 = N0.getOperand(0); 1452 if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc) 1453 return false; 1454 1455 if (Opc == X86::MOVSX64rr8) { 1456 // If we had a sign extend from 8 to 64 bits. We still need to go from 32 1457 // to 64. 1458 MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N), 1459 MVT::i64, N00); 1460 ReplaceUses(N, Extend); 1461 } else { 1462 // Ok we can drop this extend and just use the original extend. 
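    // e.g. (MOVZX32rr8 (EXTRACT_SUBREG (MOVZX32rr8_NOREX x), sub_8bit))
    // simply reuses the existing MOVZX32rr8_NOREX result.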
1463 ReplaceUses(N, N00.getNode()); 1464 } 1465 1466 return true; 1467 } 1468 1469 void X86DAGToDAGISel::PostprocessISelDAG() { 1470 // Skip peepholes at -O0. 1471 if (TM.getOptLevel() == CodeGenOpt::None) 1472 return; 1473 1474 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end(); 1475 1476 bool MadeChange = false; 1477 while (Position != CurDAG->allnodes_begin()) { 1478 SDNode *N = &*--Position; 1479 // Skip dead nodes and any non-machine opcodes. 1480 if (N->use_empty() || !N->isMachineOpcode()) 1481 continue; 1482 1483 if (tryOptimizeRem8Extend(N)) { 1484 MadeChange = true; 1485 continue; 1486 } 1487 1488 // Look for a TESTrr+ANDrr pattern where both operands of the test are 1489 // the same. Rewrite to remove the AND. 1490 unsigned Opc = N->getMachineOpcode(); 1491 if ((Opc == X86::TEST8rr || Opc == X86::TEST16rr || 1492 Opc == X86::TEST32rr || Opc == X86::TEST64rr) && 1493 N->getOperand(0) == N->getOperand(1) && 1494 N->getOperand(0)->hasNUsesOfValue(2, N->getOperand(0).getResNo()) && 1495 N->getOperand(0).isMachineOpcode()) { 1496 SDValue And = N->getOperand(0); 1497 unsigned N0Opc = And.getMachineOpcode(); 1498 if ((N0Opc == X86::AND8rr || N0Opc == X86::AND16rr || 1499 N0Opc == X86::AND32rr || N0Opc == X86::AND64rr) && 1500 !And->hasAnyUseOfValue(1)) { 1501 MachineSDNode *Test = CurDAG->getMachineNode(Opc, SDLoc(N), 1502 MVT::i32, 1503 And.getOperand(0), 1504 And.getOperand(1)); 1505 ReplaceUses(N, Test); 1506 MadeChange = true; 1507 continue; 1508 } 1509 if ((N0Opc == X86::AND8rm || N0Opc == X86::AND16rm || 1510 N0Opc == X86::AND32rm || N0Opc == X86::AND64rm) && 1511 !And->hasAnyUseOfValue(1)) { 1512 unsigned NewOpc; 1513 switch (N0Opc) { 1514 case X86::AND8rm: NewOpc = X86::TEST8mr; break; 1515 case X86::AND16rm: NewOpc = X86::TEST16mr; break; 1516 case X86::AND32rm: NewOpc = X86::TEST32mr; break; 1517 case X86::AND64rm: NewOpc = X86::TEST64mr; break; 1518 } 1519 1520 // Need to swap the memory and register operand. 1521 SDValue Ops[] = { And.getOperand(1), 1522 And.getOperand(2), 1523 And.getOperand(3), 1524 And.getOperand(4), 1525 And.getOperand(5), 1526 And.getOperand(0), 1527 And.getOperand(6) /* Chain */ }; 1528 MachineSDNode *Test = CurDAG->getMachineNode(NewOpc, SDLoc(N), 1529 MVT::i32, MVT::Other, Ops); 1530 CurDAG->setNodeMemRefs( 1531 Test, cast<MachineSDNode>(And.getNode())->memoperands()); 1532 ReplaceUses(And.getValue(2), SDValue(Test, 1)); 1533 ReplaceUses(SDValue(N, 0), SDValue(Test, 0)); 1534 MadeChange = true; 1535 continue; 1536 } 1537 } 1538 1539 // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is 1540 // used. We're doing this late so we can prefer to fold the AND into masked 1541 // comparisons. Doing that can be better for the live range of the mask 1542 // register. 1543 if ((Opc == X86::KORTESTBrr || Opc == X86::KORTESTWrr || 1544 Opc == X86::KORTESTDrr || Opc == X86::KORTESTQrr) && 1545 N->getOperand(0) == N->getOperand(1) && 1546 N->isOnlyUserOf(N->getOperand(0).getNode()) && 1547 N->getOperand(0).isMachineOpcode() && 1548 onlyUsesZeroFlag(SDValue(N, 0))) { 1549 SDValue And = N->getOperand(0); 1550 unsigned N0Opc = And.getMachineOpcode(); 1551 // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other 1552 // KAND instructions and KTEST use the same ISA feature. 
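      // e.g. (KORTESTWrr (KANDWrr k1, k2), (KANDWrr k1, k2)) becomes
      // (KTESTWrr k1, k2) when only the zero flag is consumed.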
1553 if (N0Opc == X86::KANDBrr || 1554 (N0Opc == X86::KANDWrr && Subtarget->hasDQI()) || 1555 N0Opc == X86::KANDDrr || N0Opc == X86::KANDQrr) { 1556 unsigned NewOpc; 1557 switch (Opc) { 1558 default: llvm_unreachable("Unexpected opcode!"); 1559 case X86::KORTESTBrr: NewOpc = X86::KTESTBrr; break; 1560 case X86::KORTESTWrr: NewOpc = X86::KTESTWrr; break; 1561 case X86::KORTESTDrr: NewOpc = X86::KTESTDrr; break; 1562 case X86::KORTESTQrr: NewOpc = X86::KTESTQrr; break; 1563 } 1564 MachineSDNode *KTest = CurDAG->getMachineNode(NewOpc, SDLoc(N), 1565 MVT::i32, 1566 And.getOperand(0), 1567 And.getOperand(1)); 1568 ReplaceUses(N, KTest); 1569 MadeChange = true; 1570 continue; 1571 } 1572 } 1573 1574 // Attempt to remove vectors moves that were inserted to zero upper bits. 1575 if (Opc != TargetOpcode::SUBREG_TO_REG) 1576 continue; 1577 1578 unsigned SubRegIdx = N->getConstantOperandVal(2); 1579 if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm) 1580 continue; 1581 1582 SDValue Move = N->getOperand(1); 1583 if (!Move.isMachineOpcode()) 1584 continue; 1585 1586 // Make sure its one of the move opcodes we recognize. 1587 switch (Move.getMachineOpcode()) { 1588 default: 1589 continue; 1590 case X86::VMOVAPDrr: case X86::VMOVUPDrr: 1591 case X86::VMOVAPSrr: case X86::VMOVUPSrr: 1592 case X86::VMOVDQArr: case X86::VMOVDQUrr: 1593 case X86::VMOVAPDYrr: case X86::VMOVUPDYrr: 1594 case X86::VMOVAPSYrr: case X86::VMOVUPSYrr: 1595 case X86::VMOVDQAYrr: case X86::VMOVDQUYrr: 1596 case X86::VMOVAPDZ128rr: case X86::VMOVUPDZ128rr: 1597 case X86::VMOVAPSZ128rr: case X86::VMOVUPSZ128rr: 1598 case X86::VMOVDQA32Z128rr: case X86::VMOVDQU32Z128rr: 1599 case X86::VMOVDQA64Z128rr: case X86::VMOVDQU64Z128rr: 1600 case X86::VMOVAPDZ256rr: case X86::VMOVUPDZ256rr: 1601 case X86::VMOVAPSZ256rr: case X86::VMOVUPSZ256rr: 1602 case X86::VMOVDQA32Z256rr: case X86::VMOVDQU32Z256rr: 1603 case X86::VMOVDQA64Z256rr: case X86::VMOVDQU64Z256rr: 1604 break; 1605 } 1606 1607 SDValue In = Move.getOperand(0); 1608 if (!In.isMachineOpcode() || 1609 In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END) 1610 continue; 1611 1612 // Make sure the instruction has a VEX, XOP, or EVEX prefix. This covers 1613 // the SHA instructions which use a legacy encoding. 1614 uint64_t TSFlags = getInstrInfo()->get(In.getMachineOpcode()).TSFlags; 1615 if ((TSFlags & X86II::EncodingMask) != X86II::VEX && 1616 (TSFlags & X86II::EncodingMask) != X86II::EVEX && 1617 (TSFlags & X86II::EncodingMask) != X86II::XOP) 1618 continue; 1619 1620 // Producing instruction is another vector instruction. We can drop the 1621 // move. 1622 CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2)); 1623 MadeChange = true; 1624 } 1625 1626 if (MadeChange) 1627 CurDAG->RemoveDeadNodes(); 1628 } 1629 1630 1631 /// Emit any code that needs to be executed only in the main function. 
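/// On Cygwin/MinGW targets this lowers to a call to __main at the start of
/// main() so the runtime can perform its initialization (e.g. running global
/// constructors) before user code executes.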
1632 void X86DAGToDAGISel::emitSpecialCodeForMain() {
1633   if (Subtarget->isTargetCygMing()) {
1634     TargetLowering::ArgListTy Args;
1635     auto &DL = CurDAG->getDataLayout();
1636 
1637     TargetLowering::CallLoweringInfo CLI(*CurDAG);
1638     CLI.setChain(CurDAG->getRoot())
1639         .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()),
1640                    CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)),
1641                    std::move(Args));
1642     const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
1643     std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
1644     CurDAG->setRoot(Result.second);
1645   }
1646 }
1647 
1648 void X86DAGToDAGISel::emitFunctionEntryCode() {
1649   // If this is main, emit special code for main.
1650   const Function &F = MF->getFunction();
1651   if (F.hasExternalLinkage() && F.getName() == "main")
1652     emitSpecialCodeForMain();
1653 }
1654 
1655 static bool isDispSafeForFrameIndex(int64_t Val) {
1656   // On 64-bit platforms, we can run into an issue where a frame index
1657   // includes a displacement that, when added to the explicit displacement,
1658   // will overflow the displacement field. Assuming that the frame index
1659   // displacement fits into a 31-bit integer (which is only slightly more
1660   // aggressive than the current fundamental assumption that it fits into
1661   // a 32-bit integer), a 31-bit disp should always be safe.
1662   return isInt<31>(Val);
1663 }
1664 
1665 bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
1666                                             X86ISelAddressMode &AM) {
1667   // We may have already matched a displacement and the caller just added the
1668   // symbolic displacement. So we still need to do the checks even if Offset
1669   // is zero.
1670 
1671   int64_t Val = AM.Disp + Offset;
1672 
1673   // Cannot combine ExternalSymbol displacements with integer offsets.
1674   if (Val != 0 && (AM.ES || AM.MCSym))
1675     return true;
1676 
1677   CodeModel::Model M = TM.getCodeModel();
1678   if (Subtarget->is64Bit()) {
1679     if (Val != 0 &&
1680         !X86::isOffsetSuitableForCodeModel(Val, M,
1681                                            AM.hasSymbolicDisplacement()))
1682       return true;
1683     // In addition to the checks required for a register base, check that
1684     // we do not try to use an unsafe Disp with a frame index.
1685     if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
1686         !isDispSafeForFrameIndex(Val))
1687       return true;
1688   }
1689   AM.Disp = Val;
1690   return false;
1691 
1692 }
1693 
1694 bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
1695                                          bool AllowSegmentRegForX32) {
1696   SDValue Address = N->getOperand(1);
1697 
1698   // load gs:0 -> GS segment register.
1699   // load fs:0 -> FS segment register.
1700   //
1701   // This optimization is generally valid because the GNU TLS model defines that
1702   // gs:0 (or fs:0 on X86-64) contains its own address. However, for X86-64 mode
1703   // with 32-bit registers, as we get in ILP32 mode, those registers are first
1704   // zero-extended to 64 bits and then added to the base address, which gives
1705   // unwanted results when the register holds a negative value.
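  // Because of that, for x32 the fold is only attempted when the caller
  // explicitly allows it (AllowSegmentRegForX32), as matchAddress does in its
  // post-processing step once it knows no other register will be used.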
1706   // For more information see http://people.redhat.com/drepper/tls.pdf
1707   if (auto *C = dyn_cast<ConstantSDNode>(Address)) {
1708     if (C->getSExtValue() == 0 && AM.Segment.getNode() == nullptr &&
1709         !IndirectTlsSegRefs &&
1710         (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
1711          Subtarget->isTargetFuchsia())) {
1712       if (Subtarget->isTarget64BitILP32() && !AllowSegmentRegForX32)
1713         return true;
1714       switch (N->getPointerInfo().getAddrSpace()) {
1715       case X86AS::GS:
1716         AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
1717         return false;
1718       case X86AS::FS:
1719         AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
1720         return false;
1721       // Address space X86AS::SS is not handled here, because it is not used to
1722       // address TLS areas.
1723       }
1724     }
1725   }
1726 
1727   return true;
1728 }
1729 
1730 /// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
1731 /// mode. These wrap things that will resolve down into a symbol reference.
1732 /// If no match is possible, this returns true, otherwise it returns false.
1733 bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
1734   // If the addressing mode already has a symbol as the displacement, we can
1735   // never match another symbol.
1736   if (AM.hasSymbolicDisplacement())
1737     return true;
1738 
1739   bool IsRIPRelTLS = false;
1740   bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
1741   if (IsRIPRel) {
1742     SDValue Val = N.getOperand(0);
1743     if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
1744       IsRIPRelTLS = true;
1745   }
1746 
1747   // We can't use an addressing mode in the 64-bit large code model.
1748   // Global TLS addressing is an exception. In the medium code model,
1749   // we can use a mode when RIP wrappers are present.
1750   // That signifies access to globals that are known to be "near",
1751   // such as the GOT itself.
1752   CodeModel::Model M = TM.getCodeModel();
1753   if (Subtarget->is64Bit() &&
1754       ((M == CodeModel::Large && !IsRIPRelTLS) ||
1755        (M == CodeModel::Medium && !IsRIPRel)))
1756     return true;
1757 
1758   // Base and index reg must be 0 in order to use %rip as base.
1759   if (IsRIPRel && AM.hasBaseOrIndexReg())
1760     return true;
1761 
1762   // Make a local copy in case we can't do this fold.
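  // (foldOffsetIntoAddress below may still reject the symbol's offset, in
  // which case AM is restored from this backup.)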
1763 X86ISelAddressMode Backup = AM; 1764 1765 int64_t Offset = 0; 1766 SDValue N0 = N.getOperand(0); 1767 if (auto *G = dyn_cast<GlobalAddressSDNode>(N0)) { 1768 AM.GV = G->getGlobal(); 1769 AM.SymbolFlags = G->getTargetFlags(); 1770 Offset = G->getOffset(); 1771 } else if (auto *CP = dyn_cast<ConstantPoolSDNode>(N0)) { 1772 AM.CP = CP->getConstVal(); 1773 AM.Alignment = CP->getAlign(); 1774 AM.SymbolFlags = CP->getTargetFlags(); 1775 Offset = CP->getOffset(); 1776 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(N0)) { 1777 AM.ES = S->getSymbol(); 1778 AM.SymbolFlags = S->getTargetFlags(); 1779 } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) { 1780 AM.MCSym = S->getMCSymbol(); 1781 } else if (auto *J = dyn_cast<JumpTableSDNode>(N0)) { 1782 AM.JT = J->getIndex(); 1783 AM.SymbolFlags = J->getTargetFlags(); 1784 } else if (auto *BA = dyn_cast<BlockAddressSDNode>(N0)) { 1785 AM.BlockAddr = BA->getBlockAddress(); 1786 AM.SymbolFlags = BA->getTargetFlags(); 1787 Offset = BA->getOffset(); 1788 } else 1789 llvm_unreachable("Unhandled symbol reference node."); 1790 1791 if (foldOffsetIntoAddress(Offset, AM)) { 1792 AM = Backup; 1793 return true; 1794 } 1795 1796 if (IsRIPRel) 1797 AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64)); 1798 1799 // Commit the changes now that we know this fold is safe. 1800 return false; 1801 } 1802 1803 /// Add the specified node to the specified addressing mode, returning true if 1804 /// it cannot be done. This just pattern matches for the addressing mode. 1805 bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) { 1806 if (matchAddressRecursively(N, AM, 0)) 1807 return true; 1808 1809 // Post-processing: Make a second attempt to fold a load, if we now know 1810 // that there will not be any other register. This is only performed for 1811 // 64-bit ILP32 mode since 32-bit mode and 64-bit LP64 mode will have folded 1812 // any foldable load the first time. 1813 if (Subtarget->isTarget64BitILP32() && 1814 AM.BaseType == X86ISelAddressMode::RegBase && 1815 AM.Base_Reg.getNode() != nullptr && AM.IndexReg.getNode() == nullptr) { 1816 SDValue Save_Base_Reg = AM.Base_Reg; 1817 if (auto *LoadN = dyn_cast<LoadSDNode>(Save_Base_Reg)) { 1818 AM.Base_Reg = SDValue(); 1819 if (matchLoadInAddress(LoadN, AM, /*AllowSegmentRegForX32=*/true)) 1820 AM.Base_Reg = Save_Base_Reg; 1821 } 1822 } 1823 1824 // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has 1825 // a smaller encoding and avoids a scaled-index. 1826 if (AM.Scale == 2 && 1827 AM.BaseType == X86ISelAddressMode::RegBase && 1828 AM.Base_Reg.getNode() == nullptr) { 1829 AM.Base_Reg = AM.IndexReg; 1830 AM.Scale = 1; 1831 } 1832 1833 // Post-processing: Convert foo to foo(%rip), even in non-PIC mode, 1834 // because it has a smaller encoding. 1835 // TODO: Which other code models can use this? 
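  // In 64-bit mode an absolute [disp32] address requires a SIB byte, whereas
  // the RIP-relative form does not, so "foo(%rip)" is one byte shorter.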
1836   switch (TM.getCodeModel()) {
1837   default: break;
1838   case CodeModel::Small:
1839   case CodeModel::Kernel:
1840     if (Subtarget->is64Bit() &&
1841         AM.Scale == 1 &&
1842         AM.BaseType == X86ISelAddressMode::RegBase &&
1843         AM.Base_Reg.getNode() == nullptr &&
1844         AM.IndexReg.getNode() == nullptr &&
1845         AM.SymbolFlags == X86II::MO_NO_FLAG &&
1846         AM.hasSymbolicDisplacement())
1847       AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
1848     break;
1849   }
1850 
1851   return false;
1852 }
1853 
1854 bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
1855                                unsigned Depth) {
1856   // Add an artificial use to this node so that we can keep track of
1857   // it if it gets CSE'd with a different node.
1858   HandleSDNode Handle(N);
1859 
1860   X86ISelAddressMode Backup = AM;
1861   if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
1862       !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1))
1863     return false;
1864   AM = Backup;
1865 
1866   // Try again after commuting the operands.
1867   if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM,
1868                                Depth + 1) &&
1869       !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth + 1))
1870     return false;
1871   AM = Backup;
1872 
1873   // If we couldn't fold both operands into the address at the same time,
1874   // see if we can just put each operand into a register and fold at least
1875   // the add.
1876   if (AM.BaseType == X86ISelAddressMode::RegBase &&
1877       !AM.Base_Reg.getNode() &&
1878       !AM.IndexReg.getNode()) {
1879     N = Handle.getValue();
1880     AM.Base_Reg = N.getOperand(0);
1881     AM.IndexReg = N.getOperand(1);
1882     AM.Scale = 1;
1883     return false;
1884   }
1885   N = Handle.getValue();
1886   return true;
1887 }
1888 
1889 // Insert a node into the DAG at least before the Pos node's position. This
1890 // will reposition the node as needed, and will assign it a node ID that is <=
1891 // the Pos node's ID. Note that this does *not* preserve the uniqueness of node
1892 // IDs! The selection DAG must no longer depend on their uniqueness when this
1893 // is used.
1894 static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
1895   if (N->getNodeId() == -1 ||
1896       (SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) >
1897        SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode()))) {
1898     DAG.RepositionNode(Pos->getIterator(), N.getNode());
1899     // Mark Node as invalid for pruning, since after this it may be a successor
1900     // to a selected node but otherwise be in the same position as Pos.
1901     // Conservatively mark it with the same -abs(Id) to ensure the node id
1902     // invariant is preserved.
1903     N->setNodeId(Pos->getNodeId());
1904     SelectionDAGISel::InvalidateNodeId(N.getNode());
1905   }
1906 }
1907 
1908 // Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
1909 // safe. This allows us to convert the shift and AND into an h-register
1910 // extract and a scaled index. Returns false if the simplification is
1911 // performed.
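// For example, with C1 == 2 the input is "(X >> 6) & 0x3fc"; it is rewritten
// to "((X >> 8) & 0xff) << 2", so the AND becomes the index register and the
// shl is absorbed into Scale = 4.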
1912 static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N, 1913 uint64_t Mask, 1914 SDValue Shift, SDValue X, 1915 X86ISelAddressMode &AM) { 1916 if (Shift.getOpcode() != ISD::SRL || 1917 !isa<ConstantSDNode>(Shift.getOperand(1)) || 1918 !Shift.hasOneUse()) 1919 return true; 1920 1921 int ScaleLog = 8 - Shift.getConstantOperandVal(1); 1922 if (ScaleLog <= 0 || ScaleLog >= 4 || 1923 Mask != (0xffu << ScaleLog)) 1924 return true; 1925 1926 MVT VT = N.getSimpleValueType(); 1927 SDLoc DL(N); 1928 SDValue Eight = DAG.getConstant(8, DL, MVT::i8); 1929 SDValue NewMask = DAG.getConstant(0xff, DL, VT); 1930 SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, X, Eight); 1931 SDValue And = DAG.getNode(ISD::AND, DL, VT, Srl, NewMask); 1932 SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8); 1933 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And, ShlCount); 1934 1935 // Insert the new nodes into the topological ordering. We must do this in 1936 // a valid topological ordering as nothing is going to go back and re-sort 1937 // these nodes. We continually insert before 'N' in sequence as this is 1938 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no 1939 // hierarchy left to express. 1940 insertDAGNode(DAG, N, Eight); 1941 insertDAGNode(DAG, N, Srl); 1942 insertDAGNode(DAG, N, NewMask); 1943 insertDAGNode(DAG, N, And); 1944 insertDAGNode(DAG, N, ShlCount); 1945 insertDAGNode(DAG, N, Shl); 1946 DAG.ReplaceAllUsesWith(N, Shl); 1947 DAG.RemoveDeadNode(N.getNode()); 1948 AM.IndexReg = And; 1949 AM.Scale = (1 << ScaleLog); 1950 return false; 1951 } 1952 1953 // Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this 1954 // allows us to fold the shift into this addressing mode. Returns false if the 1955 // transform succeeded. 1956 static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N, 1957 X86ISelAddressMode &AM) { 1958 SDValue Shift = N.getOperand(0); 1959 1960 // Use a signed mask so that shifting right will insert sign bits. These 1961 // bits will be removed when we shift the result left so it doesn't matter 1962 // what we use. This might allow a smaller immediate encoding. 1963 int64_t Mask = cast<ConstantSDNode>(N->getOperand(1))->getSExtValue(); 1964 1965 // If we have an any_extend feeding the AND, look through it to see if there 1966 // is a shift behind it. But only if the AND doesn't use the extended bits. 1967 // FIXME: Generalize this to other ANY_EXTEND than i32 to i64? 1968 bool FoundAnyExtend = false; 1969 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() && 1970 Shift.getOperand(0).getSimpleValueType() == MVT::i32 && 1971 isUInt<32>(Mask)) { 1972 FoundAnyExtend = true; 1973 Shift = Shift.getOperand(0); 1974 } 1975 1976 if (Shift.getOpcode() != ISD::SHL || 1977 !isa<ConstantSDNode>(Shift.getOperand(1))) 1978 return true; 1979 1980 SDValue X = Shift.getOperand(0); 1981 1982 // Not likely to be profitable if either the AND or SHIFT node has more 1983 // than one use (unless all uses are for address computation). Besides, 1984 // isel mechanism requires their node ids to be reused. 1985 if (!N.hasOneUse() || !Shift.hasOneUse()) 1986 return true; 1987 1988 // Verify that the shift amount is something we can fold. 
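  // Only shift amounts of 1, 2, or 3 can be folded, matching the scale factors
  // 2, 4, and 8 that the addressing mode supports.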
1989 unsigned ShiftAmt = Shift.getConstantOperandVal(1); 1990 if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3) 1991 return true; 1992 1993 MVT VT = N.getSimpleValueType(); 1994 SDLoc DL(N); 1995 if (FoundAnyExtend) { 1996 SDValue NewX = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X); 1997 insertDAGNode(DAG, N, NewX); 1998 X = NewX; 1999 } 2000 2001 SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, DL, VT); 2002 SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask); 2003 SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1)); 2004 2005 // Insert the new nodes into the topological ordering. We must do this in 2006 // a valid topological ordering as nothing is going to go back and re-sort 2007 // these nodes. We continually insert before 'N' in sequence as this is 2008 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no 2009 // hierarchy left to express. 2010 insertDAGNode(DAG, N, NewMask); 2011 insertDAGNode(DAG, N, NewAnd); 2012 insertDAGNode(DAG, N, NewShift); 2013 DAG.ReplaceAllUsesWith(N, NewShift); 2014 DAG.RemoveDeadNode(N.getNode()); 2015 2016 AM.Scale = 1 << ShiftAmt; 2017 AM.IndexReg = NewAnd; 2018 return false; 2019 } 2020 2021 // Implement some heroics to detect shifts of masked values where the mask can 2022 // be replaced by extending the shift and undoing that in the addressing mode 2023 // scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and 2024 // (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in 2025 // the addressing mode. This results in code such as: 2026 // 2027 // int f(short *y, int *lookup_table) { 2028 // ... 2029 // return *y + lookup_table[*y >> 11]; 2030 // } 2031 // 2032 // Turning into: 2033 // movzwl (%rdi), %eax 2034 // movl %eax, %ecx 2035 // shrl $11, %ecx 2036 // addl (%rsi,%rcx,4), %eax 2037 // 2038 // Instead of: 2039 // movzwl (%rdi), %eax 2040 // movl %eax, %ecx 2041 // shrl $9, %ecx 2042 // andl $124, %rcx 2043 // addl (%rsi,%rcx), %eax 2044 // 2045 // Note that this function assumes the mask is provided as a mask *after* the 2046 // value is shifted. The input chain may or may not match that, but computing 2047 // such a mask is trivial. 2048 static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, 2049 uint64_t Mask, 2050 SDValue Shift, SDValue X, 2051 X86ISelAddressMode &AM) { 2052 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() || 2053 !isa<ConstantSDNode>(Shift.getOperand(1))) 2054 return true; 2055 2056 unsigned ShiftAmt = Shift.getConstantOperandVal(1); 2057 unsigned MaskLZ = countLeadingZeros(Mask); 2058 unsigned MaskTZ = countTrailingZeros(Mask); 2059 2060 // The amount of shift we're trying to fit into the addressing mode is taken 2061 // from the trailing zeros of the mask. 2062 unsigned AMShiftAmt = MaskTZ; 2063 2064 // There is nothing we can do here unless the mask is removing some bits. 2065 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits. 2066 if (AMShiftAmt == 0 || AMShiftAmt > 3) return true; 2067 2068 // We also need to ensure that mask is a continuous run of bits. 2069 if (countTrailingOnes(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true; 2070 2071 // Scale the leading zero count down based on the actual size of the value. 2072 // Also scale it down based on the size of the shift. 
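  // MaskLZ was computed on the 64-bit mask, so discount the bits that don't
  // exist in X's type as well as the bits already consumed by the right shift.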
2073 unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt; 2074 if (MaskLZ < ScaleDown) 2075 return true; 2076 MaskLZ -= ScaleDown; 2077 2078 // The final check is to ensure that any masked out high bits of X are 2079 // already known to be zero. Otherwise, the mask has a semantic impact 2080 // other than masking out a couple of low bits. Unfortunately, because of 2081 // the mask, zero extensions will be removed from operands in some cases. 2082 // This code works extra hard to look through extensions because we can 2083 // replace them with zero extensions cheaply if necessary. 2084 bool ReplacingAnyExtend = false; 2085 if (X.getOpcode() == ISD::ANY_EXTEND) { 2086 unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() - 2087 X.getOperand(0).getSimpleValueType().getSizeInBits(); 2088 // Assume that we'll replace the any-extend with a zero-extend, and 2089 // narrow the search to the extended value. 2090 X = X.getOperand(0); 2091 MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits; 2092 ReplacingAnyExtend = true; 2093 } 2094 APInt MaskedHighBits = 2095 APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ); 2096 KnownBits Known = DAG.computeKnownBits(X); 2097 if (MaskedHighBits != Known.Zero) return true; 2098 2099 // We've identified a pattern that can be transformed into a single shift 2100 // and an addressing mode. Make it so. 2101 MVT VT = N.getSimpleValueType(); 2102 if (ReplacingAnyExtend) { 2103 assert(X.getValueType() != VT); 2104 // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND. 2105 SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X); 2106 insertDAGNode(DAG, N, NewX); 2107 X = NewX; 2108 } 2109 SDLoc DL(N); 2110 SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8); 2111 SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt); 2112 SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8); 2113 SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewSRL, NewSHLAmt); 2114 2115 // Insert the new nodes into the topological ordering. We must do this in 2116 // a valid topological ordering as nothing is going to go back and re-sort 2117 // these nodes. We continually insert before 'N' in sequence as this is 2118 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no 2119 // hierarchy left to express. 2120 insertDAGNode(DAG, N, NewSRLAmt); 2121 insertDAGNode(DAG, N, NewSRL); 2122 insertDAGNode(DAG, N, NewSHLAmt); 2123 insertDAGNode(DAG, N, NewSHL); 2124 DAG.ReplaceAllUsesWith(N, NewSHL); 2125 DAG.RemoveDeadNode(N.getNode()); 2126 2127 AM.Scale = 1 << AMShiftAmt; 2128 AM.IndexReg = NewSRL; 2129 return false; 2130 } 2131 2132 // Transform "(X >> SHIFT) & (MASK << C1)" to 2133 // "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be 2134 // matched to a BEXTR later. Returns false if the simplification is performed. 2135 static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N, 2136 uint64_t Mask, 2137 SDValue Shift, SDValue X, 2138 X86ISelAddressMode &AM, 2139 const X86Subtarget &Subtarget) { 2140 if (Shift.getOpcode() != ISD::SRL || 2141 !isa<ConstantSDNode>(Shift.getOperand(1)) || 2142 !Shift.hasOneUse() || !N.hasOneUse()) 2143 return true; 2144 2145 // Only do this if BEXTR will be matched by matchBEXTRFromAndImm. 2146 if (!Subtarget.hasTBM() && 2147 !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR())) 2148 return true; 2149 2150 // We need to ensure that mask is a continuous run of bits. 
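  // e.g. 0x0ff0 is a shifted mask, 0x0f0f is not.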
2151 if (!isShiftedMask_64(Mask)) return true; 2152 2153 unsigned ShiftAmt = Shift.getConstantOperandVal(1); 2154 2155 // The amount of shift we're trying to fit into the addressing mode is taken 2156 // from the trailing zeros of the mask. 2157 unsigned AMShiftAmt = countTrailingZeros(Mask); 2158 2159 // There is nothing we can do here unless the mask is removing some bits. 2160 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits. 2161 if (AMShiftAmt == 0 || AMShiftAmt > 3) return true; 2162 2163 MVT VT = N.getSimpleValueType(); 2164 SDLoc DL(N); 2165 SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8); 2166 SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt); 2167 SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, VT); 2168 SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, NewSRL, NewMask); 2169 SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8); 2170 SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewAnd, NewSHLAmt); 2171 2172 // Insert the new nodes into the topological ordering. We must do this in 2173 // a valid topological ordering as nothing is going to go back and re-sort 2174 // these nodes. We continually insert before 'N' in sequence as this is 2175 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no 2176 // hierarchy left to express. 2177 insertDAGNode(DAG, N, NewSRLAmt); 2178 insertDAGNode(DAG, N, NewSRL); 2179 insertDAGNode(DAG, N, NewMask); 2180 insertDAGNode(DAG, N, NewAnd); 2181 insertDAGNode(DAG, N, NewSHLAmt); 2182 insertDAGNode(DAG, N, NewSHL); 2183 DAG.ReplaceAllUsesWith(N, NewSHL); 2184 DAG.RemoveDeadNode(N.getNode()); 2185 2186 AM.Scale = 1 << AMShiftAmt; 2187 AM.IndexReg = NewAnd; 2188 return false; 2189 } 2190 2191 bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, 2192 unsigned Depth) { 2193 SDLoc dl(N); 2194 LLVM_DEBUG({ 2195 dbgs() << "MatchAddress: "; 2196 AM.dump(CurDAG); 2197 }); 2198 // Limit recursion. 2199 if (Depth > 5) 2200 return matchAddressBase(N, AM); 2201 2202 // If this is already a %rip relative address, we can only merge immediates 2203 // into it. Instead of handling this in every case, we handle it here. 2204 // RIP relative addressing: %rip + 32-bit displacement! 2205 if (AM.isRIPRelative()) { 2206 // FIXME: JumpTable and ExternalSymbol address currently don't like 2207 // displacements. It isn't very important, but this should be fixed for 2208 // consistency. 2209 if (!(AM.ES || AM.MCSym) && AM.JT != -1) 2210 return true; 2211 2212 if (auto *Cst = dyn_cast<ConstantSDNode>(N)) 2213 if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM)) 2214 return false; 2215 return true; 2216 } 2217 2218 switch (N.getOpcode()) { 2219 default: break; 2220 case ISD::LOCAL_RECOVER: { 2221 if (!AM.hasSymbolicDisplacement() && AM.Disp == 0) 2222 if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) { 2223 // Use the symbol and don't prefix it. 
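        // (The MCSymbol comes from llvm.localescape/localrecover and is used
        // verbatim, with no MO_* flag applied.)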
2224 AM.MCSym = ESNode->getMCSymbol(); 2225 return false; 2226 } 2227 break; 2228 } 2229 case ISD::Constant: { 2230 uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue(); 2231 if (!foldOffsetIntoAddress(Val, AM)) 2232 return false; 2233 break; 2234 } 2235 2236 case X86ISD::Wrapper: 2237 case X86ISD::WrapperRIP: 2238 if (!matchWrapper(N, AM)) 2239 return false; 2240 break; 2241 2242 case ISD::LOAD: 2243 if (!matchLoadInAddress(cast<LoadSDNode>(N), AM)) 2244 return false; 2245 break; 2246 2247 case ISD::FrameIndex: 2248 if (AM.BaseType == X86ISelAddressMode::RegBase && 2249 AM.Base_Reg.getNode() == nullptr && 2250 (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) { 2251 AM.BaseType = X86ISelAddressMode::FrameIndexBase; 2252 AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex(); 2253 return false; 2254 } 2255 break; 2256 2257 case ISD::SHL: 2258 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) 2259 break; 2260 2261 if (auto *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) { 2262 unsigned Val = CN->getZExtValue(); 2263 // Note that we handle x<<1 as (,x,2) rather than (x,x) here so 2264 // that the base operand remains free for further matching. If 2265 // the base doesn't end up getting used, a post-processing step 2266 // in MatchAddress turns (,x,2) into (x,x), which is cheaper. 2267 if (Val == 1 || Val == 2 || Val == 3) { 2268 AM.Scale = 1 << Val; 2269 SDValue ShVal = N.getOperand(0); 2270 2271 // Okay, we know that we have a scale by now. However, if the scaled 2272 // value is an add of something and a constant, we can fold the 2273 // constant into the disp field here. 2274 if (CurDAG->isBaseWithConstantOffset(ShVal)) { 2275 AM.IndexReg = ShVal.getOperand(0); 2276 auto *AddVal = cast<ConstantSDNode>(ShVal.getOperand(1)); 2277 uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val; 2278 if (!foldOffsetIntoAddress(Disp, AM)) 2279 return false; 2280 } 2281 2282 AM.IndexReg = ShVal; 2283 return false; 2284 } 2285 } 2286 break; 2287 2288 case ISD::SRL: { 2289 // Scale must not be used already. 2290 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break; 2291 2292 // We only handle up to 64-bit values here as those are what matter for 2293 // addressing mode optimizations. 2294 assert(N.getSimpleValueType().getSizeInBits() <= 64 && 2295 "Unexpected value size!"); 2296 2297 SDValue And = N.getOperand(0); 2298 if (And.getOpcode() != ISD::AND) break; 2299 SDValue X = And.getOperand(0); 2300 2301 // The mask used for the transform is expected to be post-shift, but we 2302 // found the shift first so just apply the shift to the mask before passing 2303 // it down. 2304 if (!isa<ConstantSDNode>(N.getOperand(1)) || 2305 !isa<ConstantSDNode>(And.getOperand(1))) 2306 break; 2307 uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1); 2308 2309 // Try to fold the mask and shift into the scale, and return false if we 2310 // succeed. 2311 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM)) 2312 return false; 2313 break; 2314 } 2315 2316 case ISD::SMUL_LOHI: 2317 case ISD::UMUL_LOHI: 2318 // A mul_lohi where we need the low part can be folded as a plain multiply. 
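    // (Result 0 is the low half of the product; the high half in result 1
    // cannot be expressed with an addressing-mode multiply.)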
2319 if (N.getResNo() != 0) break; 2320 [[fallthrough]]; 2321 case ISD::MUL: 2322 case X86ISD::MUL_IMM: 2323 // X*[3,5,9] -> X+X*[2,4,8] 2324 if (AM.BaseType == X86ISelAddressMode::RegBase && 2325 AM.Base_Reg.getNode() == nullptr && 2326 AM.IndexReg.getNode() == nullptr) { 2327 if (auto *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) 2328 if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 || 2329 CN->getZExtValue() == 9) { 2330 AM.Scale = unsigned(CN->getZExtValue())-1; 2331 2332 SDValue MulVal = N.getOperand(0); 2333 SDValue Reg; 2334 2335 // Okay, we know that we have a scale by now. However, if the scaled 2336 // value is an add of something and a constant, we can fold the 2337 // constant into the disp field here. 2338 if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() && 2339 isa<ConstantSDNode>(MulVal.getOperand(1))) { 2340 Reg = MulVal.getOperand(0); 2341 auto *AddVal = cast<ConstantSDNode>(MulVal.getOperand(1)); 2342 uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue(); 2343 if (foldOffsetIntoAddress(Disp, AM)) 2344 Reg = N.getOperand(0); 2345 } else { 2346 Reg = N.getOperand(0); 2347 } 2348 2349 AM.IndexReg = AM.Base_Reg = Reg; 2350 return false; 2351 } 2352 } 2353 break; 2354 2355 case ISD::SUB: { 2356 // Given A-B, if A can be completely folded into the address and 2357 // the index field with the index field unused, use -B as the index. 2358 // This is a win if a has multiple parts that can be folded into 2359 // the address. Also, this saves a mov if the base register has 2360 // other uses, since it avoids a two-address sub instruction, however 2361 // it costs an additional mov if the index register has other uses. 2362 2363 // Add an artificial use to this node so that we can keep track of 2364 // it if it gets CSE'd with a different node. 2365 HandleSDNode Handle(N); 2366 2367 // Test if the LHS of the sub can be folded. 2368 X86ISelAddressMode Backup = AM; 2369 if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) { 2370 N = Handle.getValue(); 2371 AM = Backup; 2372 break; 2373 } 2374 N = Handle.getValue(); 2375 // Test if the index field is free for use. 2376 if (AM.IndexReg.getNode() || AM.isRIPRelative()) { 2377 AM = Backup; 2378 break; 2379 } 2380 2381 int Cost = 0; 2382 SDValue RHS = N.getOperand(1); 2383 // If the RHS involves a register with multiple uses, this 2384 // transformation incurs an extra mov, due to the neg instruction 2385 // clobbering its operand. 2386 if (!RHS.getNode()->hasOneUse() || 2387 RHS.getNode()->getOpcode() == ISD::CopyFromReg || 2388 RHS.getNode()->getOpcode() == ISD::TRUNCATE || 2389 RHS.getNode()->getOpcode() == ISD::ANY_EXTEND || 2390 (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND && 2391 RHS.getOperand(0).getValueType() == MVT::i32)) 2392 ++Cost; 2393 // If the base is a register with multiple uses, this 2394 // transformation may save a mov. 2395 if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() && 2396 !AM.Base_Reg.getNode()->hasOneUse()) || 2397 AM.BaseType == X86ISelAddressMode::FrameIndexBase) 2398 --Cost; 2399 // If the folded LHS was interesting, this transformation saves 2400 // address arithmetic. 2401 if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) + 2402 ((AM.Disp != 0) && (Backup.Disp == 0)) + 2403 (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2) 2404 --Cost; 2405 // If it doesn't look like it may be an overall win, don't do it. 
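    // Only proceed when the accumulated cost is negative, i.e. the fold is a
    // strict win.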
2406 if (Cost >= 0) { 2407 AM = Backup; 2408 break; 2409 } 2410 2411 // Ok, the transformation is legal and appears profitable. Go for it. 2412 // Negation will be emitted later to avoid creating dangling nodes if this 2413 // was an unprofitable LEA. 2414 AM.IndexReg = RHS; 2415 AM.NegateIndex = true; 2416 AM.Scale = 1; 2417 return false; 2418 } 2419 2420 case ISD::ADD: 2421 if (!matchAdd(N, AM, Depth)) 2422 return false; 2423 break; 2424 2425 case ISD::OR: 2426 // We want to look through a transform in InstCombine and DAGCombiner that 2427 // turns 'add' into 'or', so we can treat this 'or' exactly like an 'add'. 2428 // Example: (or (and x, 1), (shl y, 3)) --> (add (and x, 1), (shl y, 3)) 2429 // An 'lea' can then be used to match the shift (multiply) and add: 2430 // and $1, %esi 2431 // lea (%rsi, %rdi, 8), %rax 2432 if (CurDAG->haveNoCommonBitsSet(N.getOperand(0), N.getOperand(1)) && 2433 !matchAdd(N, AM, Depth)) 2434 return false; 2435 break; 2436 2437 case ISD::XOR: 2438 // We want to look through a transform in InstCombine that 2439 // turns 'add' with min_signed_val into 'xor', so we can treat this 'xor' 2440 // exactly like an 'add'. 2441 if (isMinSignedConstant(N.getOperand(1)) && !matchAdd(N, AM, Depth)) 2442 return false; 2443 break; 2444 2445 case ISD::AND: { 2446 // Perform some heroic transforms on an and of a constant-count shift 2447 // with a constant to enable use of the scaled offset field. 2448 2449 // Scale must not be used already. 2450 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break; 2451 2452 // We only handle up to 64-bit values here as those are what matter for 2453 // addressing mode optimizations. 2454 assert(N.getSimpleValueType().getSizeInBits() <= 64 && 2455 "Unexpected value size!"); 2456 2457 if (!isa<ConstantSDNode>(N.getOperand(1))) 2458 break; 2459 2460 if (N.getOperand(0).getOpcode() == ISD::SRL) { 2461 SDValue Shift = N.getOperand(0); 2462 SDValue X = Shift.getOperand(0); 2463 2464 uint64_t Mask = N.getConstantOperandVal(1); 2465 2466 // Try to fold the mask and shift into an extract and scale. 2467 if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM)) 2468 return false; 2469 2470 // Try to fold the mask and shift directly into the scale. 2471 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM)) 2472 return false; 2473 2474 // Try to fold the mask and shift into BEXTR and scale. 2475 if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget)) 2476 return false; 2477 } 2478 2479 // Try to swap the mask and shift to place shifts which can be done as 2480 // a scale on the outside of the mask. 2481 if (!foldMaskedShiftToScaledMask(*CurDAG, N, AM)) 2482 return false; 2483 2484 break; 2485 } 2486 case ISD::ZERO_EXTEND: { 2487 // Try to widen a zexted shift left to the same size as its use, so we can 2488 // match the shift as a scale factor. 2489 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) 2490 break; 2491 if (N.getOperand(0).getOpcode() != ISD::SHL || !N.getOperand(0).hasOneUse()) 2492 break; 2493 2494 // Give up if the shift is not a valid scale factor [1,2,3]. 2495 SDValue Shl = N.getOperand(0); 2496 auto *ShAmtC = dyn_cast<ConstantSDNode>(Shl.getOperand(1)); 2497 if (!ShAmtC || ShAmtC->getZExtValue() > 3) 2498 break; 2499 2500 // The narrow shift must only shift out zero bits (it must be 'nuw'). 2501 // That makes it safe to widen to the destination type. 
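    // Proving the top ShAmtC bits of the narrow source are zero is equivalent
    // to the shift being 'nuw'.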
2502 APInt HighZeros = APInt::getHighBitsSet(Shl.getValueSizeInBits(), 2503 ShAmtC->getZExtValue()); 2504 if (!CurDAG->MaskedValueIsZero(Shl.getOperand(0), HighZeros)) 2505 break; 2506 2507 // zext (shl nuw i8 %x, C) to i32 --> shl (zext i8 %x to i32), (zext C) 2508 MVT VT = N.getSimpleValueType(); 2509 SDLoc DL(N); 2510 SDValue Zext = CurDAG->getNode(ISD::ZERO_EXTEND, DL, VT, Shl.getOperand(0)); 2511 SDValue NewShl = CurDAG->getNode(ISD::SHL, DL, VT, Zext, Shl.getOperand(1)); 2512 2513 // Convert the shift to scale factor. 2514 AM.Scale = 1 << ShAmtC->getZExtValue(); 2515 AM.IndexReg = Zext; 2516 2517 insertDAGNode(*CurDAG, N, Zext); 2518 insertDAGNode(*CurDAG, N, NewShl); 2519 CurDAG->ReplaceAllUsesWith(N, NewShl); 2520 CurDAG->RemoveDeadNode(N.getNode()); 2521 return false; 2522 } 2523 } 2524 2525 return matchAddressBase(N, AM); 2526 } 2527 2528 /// Helper for MatchAddress. Add the specified node to the 2529 /// specified addressing mode without any further recursion. 2530 bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) { 2531 // Is the base register already occupied? 2532 if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) { 2533 // If so, check to see if the scale index register is set. 2534 if (!AM.IndexReg.getNode()) { 2535 AM.IndexReg = N; 2536 AM.Scale = 1; 2537 return false; 2538 } 2539 2540 // Otherwise, we cannot select it. 2541 return true; 2542 } 2543 2544 // Default, generate it as a register. 2545 AM.BaseType = X86ISelAddressMode::RegBase; 2546 AM.Base_Reg = N; 2547 return false; 2548 } 2549 2550 bool X86DAGToDAGISel::matchVectorAddressRecursively(SDValue N, 2551 X86ISelAddressMode &AM, 2552 unsigned Depth) { 2553 SDLoc dl(N); 2554 LLVM_DEBUG({ 2555 dbgs() << "MatchVectorAddress: "; 2556 AM.dump(CurDAG); 2557 }); 2558 // Limit recursion. 2559 if (Depth > 5) 2560 return matchAddressBase(N, AM); 2561 2562 // TODO: Support other operations. 2563 switch (N.getOpcode()) { 2564 case ISD::Constant: { 2565 uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue(); 2566 if (!foldOffsetIntoAddress(Val, AM)) 2567 return false; 2568 break; 2569 } 2570 case X86ISD::Wrapper: 2571 if (!matchWrapper(N, AM)) 2572 return false; 2573 break; 2574 case ISD::ADD: { 2575 // Add an artificial use to this node so that we can keep track of 2576 // it if it gets CSE'd with a different node. 2577 HandleSDNode Handle(N); 2578 2579 X86ISelAddressMode Backup = AM; 2580 if (!matchVectorAddressRecursively(N.getOperand(0), AM, Depth + 1) && 2581 !matchVectorAddressRecursively(Handle.getValue().getOperand(1), AM, 2582 Depth + 1)) 2583 return false; 2584 AM = Backup; 2585 2586 // Try again after commuting the operands. 2587 if (!matchVectorAddressRecursively(Handle.getValue().getOperand(1), AM, 2588 Depth + 1) && 2589 !matchVectorAddressRecursively(Handle.getValue().getOperand(0), AM, 2590 Depth + 1)) 2591 return false; 2592 AM = Backup; 2593 2594 N = Handle.getValue(); 2595 break; 2596 } 2597 } 2598 2599 return matchAddressBase(N, AM); 2600 } 2601 2602 /// Helper for selectVectorAddr. Handles things that can be folded into a 2603 /// gather/scatter address. The index register and scale should have already 2604 /// been handled. 
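/// Only the scalar base pointer is matched here; the vector index and the
/// scale were taken directly from the gather/scatter operands by
/// selectVectorAddr.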
2605 bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) { 2606 return matchVectorAddressRecursively(N, AM, 0); 2607 } 2608 2609 bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, 2610 SDValue IndexOp, SDValue ScaleOp, 2611 SDValue &Base, SDValue &Scale, 2612 SDValue &Index, SDValue &Disp, 2613 SDValue &Segment) { 2614 X86ISelAddressMode AM; 2615 AM.IndexReg = IndexOp; 2616 AM.Scale = cast<ConstantSDNode>(ScaleOp)->getZExtValue(); 2617 2618 unsigned AddrSpace = Parent->getPointerInfo().getAddrSpace(); 2619 if (AddrSpace == X86AS::GS) 2620 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16); 2621 if (AddrSpace == X86AS::FS) 2622 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16); 2623 if (AddrSpace == X86AS::SS) 2624 AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16); 2625 2626 SDLoc DL(BasePtr); 2627 MVT VT = BasePtr.getSimpleValueType(); 2628 2629 // Try to match into the base and displacement fields. 2630 if (matchVectorAddress(BasePtr, AM)) 2631 return false; 2632 2633 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment); 2634 return true; 2635 } 2636 2637 /// Returns true if it is able to pattern match an addressing mode. 2638 /// It returns the operands which make up the maximal addressing mode it can 2639 /// match by reference. 2640 /// 2641 /// Parent is the parent node of the addr operand that is being matched. It 2642 /// is always a load, store, atomic node, or null. It is only null when 2643 /// checking memory operands for inline asm nodes. 2644 bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base, 2645 SDValue &Scale, SDValue &Index, 2646 SDValue &Disp, SDValue &Segment) { 2647 X86ISelAddressMode AM; 2648 2649 if (Parent && 2650 // This list of opcodes are all the nodes that have an "addr:$ptr" operand 2651 // that are not a MemSDNode, and thus don't have proper addrspace info. 2652 Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme 2653 Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores 2654 Parent->getOpcode() != X86ISD::TLSCALL && // Fixme 2655 Parent->getOpcode() != X86ISD::ENQCMD && // Fixme 2656 Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme 2657 Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp 2658 Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp 2659 unsigned AddrSpace = 2660 cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace(); 2661 if (AddrSpace == X86AS::GS) 2662 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16); 2663 if (AddrSpace == X86AS::FS) 2664 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16); 2665 if (AddrSpace == X86AS::SS) 2666 AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16); 2667 } 2668 2669 // Save the DL and VT before calling matchAddress, it can invalidate N. 2670 SDLoc DL(N); 2671 MVT VT = N.getSimpleValueType(); 2672 2673 if (matchAddress(N, AM)) 2674 return false; 2675 2676 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment); 2677 return true; 2678 } 2679 2680 bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) { 2681 // In static codegen with small code model, we can get the address of a label 2682 // into a register with 'movl' 2683 if (N->getOpcode() != X86ISD::Wrapper) 2684 return false; 2685 2686 N = N.getOperand(0); 2687 2688 // At least GNU as does not accept 'movl' for TPOFF relocations. 2689 // FIXME: We could use 'movl' when we know we are targeting MC. 
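  // ("Targeting MC" here means using the integrated assembler rather than an
  // external GNU as.)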
2690 if (N->getOpcode() == ISD::TargetGlobalTLSAddress) 2691 return false; 2692 2693 Imm = N; 2694 if (N->getOpcode() != ISD::TargetGlobalAddress) 2695 return TM.getCodeModel() == CodeModel::Small; 2696 2697 std::optional<ConstantRange> CR = 2698 cast<GlobalAddressSDNode>(N)->getGlobal()->getAbsoluteSymbolRange(); 2699 if (!CR) 2700 return TM.getCodeModel() == CodeModel::Small; 2701 2702 return CR->getUnsignedMax().ult(1ull << 32); 2703 } 2704 2705 bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base, 2706 SDValue &Scale, SDValue &Index, 2707 SDValue &Disp, SDValue &Segment) { 2708 // Save the debug loc before calling selectLEAAddr, in case it invalidates N. 2709 SDLoc DL(N); 2710 2711 if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment)) 2712 return false; 2713 2714 auto *RN = dyn_cast<RegisterSDNode>(Base); 2715 if (RN && RN->getReg() == 0) 2716 Base = CurDAG->getRegister(0, MVT::i64); 2717 else if (Base.getValueType() == MVT::i32 && !isa<FrameIndexSDNode>(Base)) { 2718 // Base could already be %rip, particularly in the x32 ABI. 2719 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL, 2720 MVT::i64), 0); 2721 Base = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef, 2722 Base); 2723 } 2724 2725 RN = dyn_cast<RegisterSDNode>(Index); 2726 if (RN && RN->getReg() == 0) 2727 Index = CurDAG->getRegister(0, MVT::i64); 2728 else { 2729 assert(Index.getValueType() == MVT::i32 && 2730 "Expect to be extending 32-bit registers for use in LEA"); 2731 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL, 2732 MVT::i64), 0); 2733 Index = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef, 2734 Index); 2735 } 2736 2737 return true; 2738 } 2739 2740 /// Calls SelectAddr and determines if the maximal addressing 2741 /// mode it matches can be cost effectively emitted as an LEA instruction. 2742 bool X86DAGToDAGISel::selectLEAAddr(SDValue N, 2743 SDValue &Base, SDValue &Scale, 2744 SDValue &Index, SDValue &Disp, 2745 SDValue &Segment) { 2746 X86ISelAddressMode AM; 2747 2748 // Save the DL and VT before calling matchAddress, it can invalidate N. 2749 SDLoc DL(N); 2750 MVT VT = N.getSimpleValueType(); 2751 2752 // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support 2753 // segments. 2754 SDValue Copy = AM.Segment; 2755 SDValue T = CurDAG->getRegister(0, MVT::i32); 2756 AM.Segment = T; 2757 if (matchAddress(N, AM)) 2758 return false; 2759 assert (T == AM.Segment); 2760 AM.Segment = Copy; 2761 2762 unsigned Complexity = 0; 2763 if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode()) 2764 Complexity = 1; 2765 else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase) 2766 Complexity = 4; 2767 2768 if (AM.IndexReg.getNode()) 2769 Complexity++; 2770 2771 // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with 2772 // a simple shift. 2773 if (AM.Scale > 1) 2774 Complexity++; 2775 2776 // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA 2777 // to a LEA. This is determined with some experimentation but is by no means 2778 // optimal (especially for code size consideration). LEA is nice because of 2779 // its three-address nature. Tweak the cost function again when we can run 2780 // convertToThreeAddress() at register allocation time. 2781 if (AM.hasSymbolicDisplacement()) { 2782 // For X86-64, always use LEA to materialize RIP-relative addresses. 
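    // (A MOV immediate cannot encode a %rip-relative address, so LEA is the
    // natural way to materialize one.)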
2783 if (Subtarget->is64Bit()) 2784 Complexity = 4; 2785 else 2786 Complexity += 2; 2787 } 2788 2789 // Heuristic: try harder to form an LEA from ADD if the operands set flags. 2790 // Unlike ADD, LEA does not affect flags, so we will be less likely to require 2791 // duplicating flag-producing instructions later in the pipeline. 2792 if (N.getOpcode() == ISD::ADD) { 2793 auto isMathWithFlags = [](SDValue V) { 2794 switch (V.getOpcode()) { 2795 case X86ISD::ADD: 2796 case X86ISD::SUB: 2797 case X86ISD::ADC: 2798 case X86ISD::SBB: 2799 case X86ISD::SMUL: 2800 case X86ISD::UMUL: 2801 /* TODO: These opcodes can be added safely, but we may want to justify 2802 their inclusion for different reasons (better for reg-alloc). 2803 case X86ISD::OR: 2804 case X86ISD::XOR: 2805 case X86ISD::AND: 2806 */ 2807 // Value 1 is the flag output of the node - verify it's not dead. 2808 return !SDValue(V.getNode(), 1).use_empty(); 2809 default: 2810 return false; 2811 } 2812 }; 2813 // TODO: We might want to factor in whether there's a load folding 2814 // opportunity for the math op that disappears with LEA. 2815 if (isMathWithFlags(N.getOperand(0)) || isMathWithFlags(N.getOperand(1))) 2816 Complexity++; 2817 } 2818 2819 if (AM.Disp) 2820 Complexity++; 2821 2822 // If it isn't worth using an LEA, reject it. 2823 if (Complexity <= 2) 2824 return false; 2825 2826 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment); 2827 return true; 2828 } 2829 2830 /// This is only run on TargetGlobalTLSAddress nodes. 2831 bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base, 2832 SDValue &Scale, SDValue &Index, 2833 SDValue &Disp, SDValue &Segment) { 2834 assert(N.getOpcode() == ISD::TargetGlobalTLSAddress); 2835 auto *GA = cast<GlobalAddressSDNode>(N); 2836 2837 X86ISelAddressMode AM; 2838 AM.GV = GA->getGlobal(); 2839 AM.Disp += GA->getOffset(); 2840 AM.SymbolFlags = GA->getTargetFlags(); 2841 2842 if (Subtarget->is32Bit()) { 2843 AM.Scale = 1; 2844 AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32); 2845 } 2846 2847 MVT VT = N.getSimpleValueType(); 2848 getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment); 2849 return true; 2850 } 2851 2852 bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) { 2853 // Keep track of the original value type and whether this value was 2854 // truncated. If we see a truncation from pointer type to VT that truncates 2855 // bits that are known to be zero, we can use a narrow reference. 2856 EVT VT = N.getValueType(); 2857 bool WasTruncated = false; 2858 if (N.getOpcode() == ISD::TRUNCATE) { 2859 WasTruncated = true; 2860 N = N.getOperand(0); 2861 } 2862 2863 if (N.getOpcode() != X86ISD::Wrapper) 2864 return false; 2865 2866 // We can only use non-GlobalValues as immediates if they were not truncated, 2867 // as we do not have any range information. If we have a GlobalValue and the 2868 // address was not truncated, we can select it as an operand directly. 2869 unsigned Opc = N.getOperand(0)->getOpcode(); 2870 if (Opc != ISD::TargetGlobalAddress || !WasTruncated) { 2871 Op = N.getOperand(0); 2872 // We can only select the operand directly if we didn't have to look past a 2873 // truncate. 2874 return !WasTruncated; 2875 } 2876 2877 // Check that the global's range fits into VT. 
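  // The range is supplied by !absolute_symbol metadata on the global; without
  // it we cannot prove that the truncated immediate is still correct.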
2878 auto *GA = cast<GlobalAddressSDNode>(N.getOperand(0)); 2879 std::optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange(); 2880 if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits())) 2881 return false; 2882 2883 // Okay, we can use a narrow reference. 2884 Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT, 2885 GA->getOffset(), GA->getTargetFlags()); 2886 return true; 2887 } 2888 2889 bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N, 2890 SDValue &Base, SDValue &Scale, 2891 SDValue &Index, SDValue &Disp, 2892 SDValue &Segment) { 2893 assert(Root && P && "Unknown root/parent nodes"); 2894 if (!ISD::isNON_EXTLoad(N.getNode()) || 2895 !IsProfitableToFold(N, P, Root) || 2896 !IsLegalToFold(N, P, Root, OptLevel)) 2897 return false; 2898 2899 return selectAddr(N.getNode(), 2900 N.getOperand(1), Base, Scale, Index, Disp, Segment); 2901 } 2902 2903 bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N, 2904 SDValue &Base, SDValue &Scale, 2905 SDValue &Index, SDValue &Disp, 2906 SDValue &Segment) { 2907 assert(Root && P && "Unknown root/parent nodes"); 2908 if (N->getOpcode() != X86ISD::VBROADCAST_LOAD || 2909 !IsProfitableToFold(N, P, Root) || 2910 !IsLegalToFold(N, P, Root, OptLevel)) 2911 return false; 2912 2913 return selectAddr(N.getNode(), 2914 N.getOperand(1), Base, Scale, Index, Disp, Segment); 2915 } 2916 2917 /// Return an SDNode that returns the value of the global base register. 2918 /// Output instructions required to initialize the global base register, 2919 /// if necessary. 2920 SDNode *X86DAGToDAGISel::getGlobalBaseReg() { 2921 unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF); 2922 auto &DL = MF->getDataLayout(); 2923 return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode(); 2924 } 2925 2926 bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const { 2927 if (N->getOpcode() == ISD::TRUNCATE) 2928 N = N->getOperand(0).getNode(); 2929 if (N->getOpcode() != X86ISD::Wrapper) 2930 return false; 2931 2932 auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0)); 2933 if (!GA) 2934 return false; 2935 2936 std::optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange(); 2937 if (!CR) 2938 return Width == 32 && TM.getCodeModel() == CodeModel::Small; 2939 2940 return CR->getSignedMin().sge(-1ull << Width) && 2941 CR->getSignedMax().slt(1ull << Width); 2942 } 2943 2944 X86::CondCode X86DAGToDAGISel::getCondFromNode(SDNode *N) const { 2945 assert(N->isMachineOpcode() && "Unexpected node"); 2946 unsigned Opc = N->getMachineOpcode(); 2947 const MCInstrDesc &MCID = getInstrInfo()->get(Opc); 2948 int CondNo = X86::getCondSrcNoFromDesc(MCID); 2949 if (CondNo < 0) 2950 return X86::COND_INVALID; 2951 2952 return static_cast<X86::CondCode>(N->getConstantOperandVal(CondNo)); 2953 } 2954 2955 /// Test whether the given X86ISD::CMP node has any users that use a flag 2956 /// other than ZF. 2957 bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const { 2958 // Examine each user of the node. 2959 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end(); 2960 UI != UE; ++UI) { 2961 // Only check things that use the flags. 2962 if (UI.getUse().getResNo() != Flags.getResNo()) 2963 continue; 2964 // Only examine CopyToReg uses that copy to EFLAGS. 2965 if (UI->getOpcode() != ISD::CopyToReg || 2966 cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS) 2967 return false; 2968 // Examine each user of the CopyToReg use. 
2969 for (SDNode::use_iterator FlagUI = UI->use_begin(), 2970 FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) { 2971 // Only examine the Flag result. 2972 if (FlagUI.getUse().getResNo() != 1) continue; 2973 // Anything unusual: assume conservatively. 2974 if (!FlagUI->isMachineOpcode()) return false; 2975 // Examine the condition code of the user. 2976 X86::CondCode CC = getCondFromNode(*FlagUI); 2977 2978 switch (CC) { 2979 // Comparisons which only use the zero flag. 2980 case X86::COND_E: case X86::COND_NE: 2981 continue; 2982 // Anything else: assume conservatively. 2983 default: 2984 return false; 2985 } 2986 } 2987 } 2988 return true; 2989 } 2990 2991 /// Test whether the given X86ISD::CMP node has any uses which require the SF 2992 /// flag to be accurate. 2993 bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const { 2994 // Examine each user of the node. 2995 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end(); 2996 UI != UE; ++UI) { 2997 // Only check things that use the flags. 2998 if (UI.getUse().getResNo() != Flags.getResNo()) 2999 continue; 3000 // Only examine CopyToReg uses that copy to EFLAGS. 3001 if (UI->getOpcode() != ISD::CopyToReg || 3002 cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS) 3003 return false; 3004 // Examine each user of the CopyToReg use. 3005 for (SDNode::use_iterator FlagUI = UI->use_begin(), 3006 FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) { 3007 // Only examine the Flag result. 3008 if (FlagUI.getUse().getResNo() != 1) continue; 3009 // Anything unusual: assume conservatively. 3010 if (!FlagUI->isMachineOpcode()) return false; 3011 // Examine the condition code of the user. 3012 X86::CondCode CC = getCondFromNode(*FlagUI); 3013 3014 switch (CC) { 3015 // Comparisons which don't examine the SF flag. 3016 case X86::COND_A: case X86::COND_AE: 3017 case X86::COND_B: case X86::COND_BE: 3018 case X86::COND_E: case X86::COND_NE: 3019 case X86::COND_O: case X86::COND_NO: 3020 case X86::COND_P: case X86::COND_NP: 3021 continue; 3022 // Anything else: assume conservatively. 3023 default: 3024 return false; 3025 } 3026 } 3027 } 3028 return true; 3029 } 3030 3031 static bool mayUseCarryFlag(X86::CondCode CC) { 3032 switch (CC) { 3033 // Comparisons which don't examine the CF flag. 3034 case X86::COND_O: case X86::COND_NO: 3035 case X86::COND_E: case X86::COND_NE: 3036 case X86::COND_S: case X86::COND_NS: 3037 case X86::COND_P: case X86::COND_NP: 3038 case X86::COND_L: case X86::COND_GE: 3039 case X86::COND_G: case X86::COND_LE: 3040 return false; 3041 // Anything else: assume conservatively. 3042 default: 3043 return true; 3044 } 3045 } 3046 3047 /// Test whether the given node which sets flags has any uses which require the 3048 /// CF flag to be accurate. 3049 bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const { 3050 // Examine each user of the node. 3051 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end(); 3052 UI != UE; ++UI) { 3053 // Only check things that use the flags. 3054 if (UI.getUse().getResNo() != Flags.getResNo()) 3055 continue; 3056 3057 unsigned UIOpc = UI->getOpcode(); 3058 3059 if (UIOpc == ISD::CopyToReg) { 3060 // Only examine CopyToReg uses that copy to EFLAGS. 3061 if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS) 3062 return false; 3063 // Examine each user of the CopyToReg use. 3064 for (SDNode::use_iterator FlagUI = UI->use_begin(), FlagUE = UI->use_end(); 3065 FlagUI != FlagUE; ++FlagUI) { 3066 // Only examine the Flag result. 
3067 if (FlagUI.getUse().getResNo() != 1) 3068 continue; 3069 // Anything unusual: assume conservatively. 3070 if (!FlagUI->isMachineOpcode()) 3071 return false; 3072 // Examine the condition code of the user. 3073 X86::CondCode CC = getCondFromNode(*FlagUI); 3074 3075 if (mayUseCarryFlag(CC)) 3076 return false; 3077 } 3078 3079 // This CopyToReg is ok. Move on to the next user. 3080 continue; 3081 } 3082 3083 // This might be an unselected node. So look for the pre-isel opcodes that 3084 // use flags. 3085 unsigned CCOpNo; 3086 switch (UIOpc) { 3087 default: 3088 // Something unusual. Be conservative. 3089 return false; 3090 case X86ISD::SETCC: CCOpNo = 0; break; 3091 case X86ISD::SETCC_CARRY: CCOpNo = 0; break; 3092 case X86ISD::CMOV: CCOpNo = 2; break; 3093 case X86ISD::BRCOND: CCOpNo = 2; break; 3094 } 3095 3096 X86::CondCode CC = (X86::CondCode)UI->getConstantOperandVal(CCOpNo); 3097 if (mayUseCarryFlag(CC)) 3098 return false; 3099 } 3100 return true; 3101 } 3102 3103 /// Check whether or not the chain ending in StoreNode is suitable for doing 3104 /// the {load; op; store} to modify transformation. 3105 static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode, 3106 SDValue StoredVal, SelectionDAG *CurDAG, 3107 unsigned LoadOpNo, 3108 LoadSDNode *&LoadNode, 3109 SDValue &InputChain) { 3110 // Is the stored value result 0 of the operation? 3111 if (StoredVal.getResNo() != 0) return false; 3112 3113 // Are there other uses of the operation other than the store? 3114 if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false; 3115 3116 // Is the store non-extending and non-indexed? 3117 if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal()) 3118 return false; 3119 3120 SDValue Load = StoredVal->getOperand(LoadOpNo); 3121 // Is the stored value a non-extending and non-indexed load? 3122 if (!ISD::isNormalLoad(Load.getNode())) return false; 3123 3124 // Return LoadNode by reference. 3125 LoadNode = cast<LoadSDNode>(Load); 3126 3127 // Is store the only read of the loaded value? 3128 if (!Load.hasOneUse()) 3129 return false; 3130 3131 // Is the address of the store the same as the load? 3132 if (LoadNode->getBasePtr() != StoreNode->getBasePtr() || 3133 LoadNode->getOffset() != StoreNode->getOffset()) 3134 return false; 3135 3136 bool FoundLoad = false; 3137 SmallVector<SDValue, 4> ChainOps; 3138 SmallVector<const SDNode *, 4> LoopWorklist; 3139 SmallPtrSet<const SDNode *, 16> Visited; 3140 const unsigned int Max = 1024; 3141 3142 // Visualization of Load-Op-Store fusion: 3143 // ------------------------- 3144 // Legend: 3145 // *-lines = Chain operand dependencies. 3146 // |-lines = Normal operand dependencies. 3147 // Dependencies flow down and right. n-suffix references multiple nodes. 3148 // 3149 // C Xn C 3150 // * * * 3151 // * * * 3152 // Xn A-LD Yn TF Yn 3153 // * * \ | * | 3154 // * * \ | * | 3155 // * * \ | => A--LD_OP_ST 3156 // * * \| \ 3157 // TF OP \ 3158 // * | \ Zn 3159 // * | \ 3160 // A-ST Zn 3161 // 3162 3163 // This merge induced dependences from: #1: Xn -> LD, OP, Zn 3164 // #2: Yn -> LD 3165 // #3: ST -> Zn 3166 3167 // Ensure the transform is safe by checking for the dual 3168 // dependencies to make sure we do not induce a loop. 3169 3170 // As LD is a predecessor to both OP and ST we can do this by checking: 3171 // a). if LD is a predecessor to a member of Xn or Yn. 3172 // b). if a Zn is a predecessor to ST. 
3173 3174 // However, (b) can only occur through being a chain predecessor to 3175 // ST, which is the same as Zn being a member or predecessor of Xn, 3176 // which is a subset of LD being a predecessor of Xn. So it's 3177 // subsumed by check (a). 3178 3179 SDValue Chain = StoreNode->getChain(); 3180 3181 // Gather X elements in ChainOps. 3182 if (Chain == Load.getValue(1)) { 3183 FoundLoad = true; 3184 ChainOps.push_back(Load.getOperand(0)); 3185 } else if (Chain.getOpcode() == ISD::TokenFactor) { 3186 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) { 3187 SDValue Op = Chain.getOperand(i); 3188 if (Op == Load.getValue(1)) { 3189 FoundLoad = true; 3190 // Drop Load, but keep its chain. No cycle check necessary. 3191 ChainOps.push_back(Load.getOperand(0)); 3192 continue; 3193 } 3194 LoopWorklist.push_back(Op.getNode()); 3195 ChainOps.push_back(Op); 3196 } 3197 } 3198 3199 if (!FoundLoad) 3200 return false; 3201 3202 // Worklist is currently Xn. Add Yn to worklist. 3203 for (SDValue Op : StoredVal->ops()) 3204 if (Op.getNode() != LoadNode) 3205 LoopWorklist.push_back(Op.getNode()); 3206 3207 // Check (a) if Load is a predecessor to Xn + Yn 3208 if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max, 3209 true)) 3210 return false; 3211 3212 InputChain = 3213 CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps); 3214 return true; 3215 } 3216 3217 // Change a chain of {load; op; store} of the same value into a simple op 3218 // through memory of that value, if the uses of the modified value and its 3219 // address are suitable. 3220 // 3221 // The tablegen pattern memory operand pattern is currently not able to match 3222 // the case where the EFLAGS on the original operation are used. 3223 // 3224 // To move this to tablegen, we'll need to improve tablegen to allow flags to 3225 // be transferred from a node in the pattern to the result node, probably with 3226 // a new keyword. For example, we have this 3227 // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", 3228 // [(store (add (loadi64 addr:$dst), -1), addr:$dst), 3229 // (implicit EFLAGS)]>; 3230 // but maybe need something like this 3231 // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", 3232 // [(store (add (loadi64 addr:$dst), -1), addr:$dst), 3233 // (transferrable EFLAGS)]>; 3234 // 3235 // Until then, we manually fold these and instruction select the operation 3236 // here. 3237 bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) { 3238 auto *StoreNode = cast<StoreSDNode>(Node); 3239 SDValue StoredVal = StoreNode->getOperand(1); 3240 unsigned Opc = StoredVal->getOpcode(); 3241 3242 // Before we try to select anything, make sure this is memory operand size 3243 // and opcode we can handle. Note that this must match the code below that 3244 // actually lowers the opcodes. 3245 EVT MemVT = StoreNode->getMemoryVT(); 3246 if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 && 3247 MemVT != MVT::i8) 3248 return false; 3249 3250 bool IsCommutable = false; 3251 bool IsNegate = false; 3252 switch (Opc) { 3253 default: 3254 return false; 3255 case X86ISD::SUB: 3256 IsNegate = isNullConstant(StoredVal.getOperand(0)); 3257 break; 3258 case X86ISD::SBB: 3259 break; 3260 case X86ISD::ADD: 3261 case X86ISD::ADC: 3262 case X86ISD::AND: 3263 case X86ISD::OR: 3264 case X86ISD::XOR: 3265 IsCommutable = true; 3266 break; 3267 } 3268 3269 unsigned LoadOpNo = IsNegate ? 
1 : 0;
3270   LoadSDNode *LoadNode = nullptr;
3271   SDValue InputChain;
3272   if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
3273                                    LoadNode, InputChain)) {
3274     if (!IsCommutable)
3275       return false;
3276
3277     // This operation is commutable, try the other operand.
3278     LoadOpNo = 1;
3279     if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
3280                                      LoadNode, InputChain))
3281       return false;
3282   }
3283
3284   SDValue Base, Scale, Index, Disp, Segment;
3285   if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp,
3286                   Segment))
3287     return false;
3288
3289   auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
3290                           unsigned Opc8) {
3291     switch (MemVT.getSimpleVT().SimpleTy) {
3292     case MVT::i64:
3293       return Opc64;
3294     case MVT::i32:
3295       return Opc32;
3296     case MVT::i16:
3297       return Opc16;
3298     case MVT::i8:
3299       return Opc8;
3300     default:
3301       llvm_unreachable("Invalid size!");
3302     }
3303   };
3304
3305   MachineSDNode *Result;
3306   switch (Opc) {
3307   case X86ISD::SUB:
3308     // Handle negate.
3309     if (IsNegate) {
3310       unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m,
3311                                      X86::NEG8m);
3312       const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3313       Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3314                                       MVT::Other, Ops);
3315       break;
3316     }
3317     [[fallthrough]];
3318   case X86ISD::ADD:
3319     // Try to match inc/dec.
3320     if (!Subtarget->slowIncDec() || CurDAG->shouldOptForSize()) {
3321       bool IsOne = isOneConstant(StoredVal.getOperand(1));
3322       bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1));
3323       // An ADD/SUB by 1/-1 whose carry flag is unused can use inc/dec.
3324       if ((IsOne || IsNegOne) && hasNoCarryFlagUses(StoredVal.getValue(1))) {
3325         unsigned NewOpc =
3326             ((Opc == X86ISD::ADD) == IsOne)
3327                 ?
SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m) 3328 : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m); 3329 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain}; 3330 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, 3331 MVT::Other, Ops); 3332 break; 3333 } 3334 } 3335 [[fallthrough]]; 3336 case X86ISD::ADC: 3337 case X86ISD::SBB: 3338 case X86ISD::AND: 3339 case X86ISD::OR: 3340 case X86ISD::XOR: { 3341 auto SelectRegOpcode = [SelectOpcode](unsigned Opc) { 3342 switch (Opc) { 3343 case X86ISD::ADD: 3344 return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr, 3345 X86::ADD8mr); 3346 case X86ISD::ADC: 3347 return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr, 3348 X86::ADC8mr); 3349 case X86ISD::SUB: 3350 return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr, 3351 X86::SUB8mr); 3352 case X86ISD::SBB: 3353 return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr, 3354 X86::SBB8mr); 3355 case X86ISD::AND: 3356 return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr, 3357 X86::AND8mr); 3358 case X86ISD::OR: 3359 return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr); 3360 case X86ISD::XOR: 3361 return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr, 3362 X86::XOR8mr); 3363 default: 3364 llvm_unreachable("Invalid opcode!"); 3365 } 3366 }; 3367 auto SelectImm8Opcode = [SelectOpcode](unsigned Opc) { 3368 switch (Opc) { 3369 case X86ISD::ADD: 3370 return SelectOpcode(X86::ADD64mi8, X86::ADD32mi8, X86::ADD16mi8, 0); 3371 case X86ISD::ADC: 3372 return SelectOpcode(X86::ADC64mi8, X86::ADC32mi8, X86::ADC16mi8, 0); 3373 case X86ISD::SUB: 3374 return SelectOpcode(X86::SUB64mi8, X86::SUB32mi8, X86::SUB16mi8, 0); 3375 case X86ISD::SBB: 3376 return SelectOpcode(X86::SBB64mi8, X86::SBB32mi8, X86::SBB16mi8, 0); 3377 case X86ISD::AND: 3378 return SelectOpcode(X86::AND64mi8, X86::AND32mi8, X86::AND16mi8, 0); 3379 case X86ISD::OR: 3380 return SelectOpcode(X86::OR64mi8, X86::OR32mi8, X86::OR16mi8, 0); 3381 case X86ISD::XOR: 3382 return SelectOpcode(X86::XOR64mi8, X86::XOR32mi8, X86::XOR16mi8, 0); 3383 default: 3384 llvm_unreachable("Invalid opcode!"); 3385 } 3386 }; 3387 auto SelectImmOpcode = [SelectOpcode](unsigned Opc) { 3388 switch (Opc) { 3389 case X86ISD::ADD: 3390 return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi, 3391 X86::ADD8mi); 3392 case X86ISD::ADC: 3393 return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi, 3394 X86::ADC8mi); 3395 case X86ISD::SUB: 3396 return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi, 3397 X86::SUB8mi); 3398 case X86ISD::SBB: 3399 return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi, 3400 X86::SBB8mi); 3401 case X86ISD::AND: 3402 return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi, 3403 X86::AND8mi); 3404 case X86ISD::OR: 3405 return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi, 3406 X86::OR8mi); 3407 case X86ISD::XOR: 3408 return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi, 3409 X86::XOR8mi); 3410 default: 3411 llvm_unreachable("Invalid opcode!"); 3412 } 3413 }; 3414 3415 unsigned NewOpc = SelectRegOpcode(Opc); 3416 SDValue Operand = StoredVal->getOperand(1-LoadOpNo); 3417 3418 // See if the operand is a constant that we can fold into an immediate 3419 // operand. 
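    // Illustrative example (not part of the original comments): for a 64-bit
    // RMW "add qword ptr [mem], 128" the immediate 128 does not fit in a
    // sign-extended imm8, but its negation does, so the code below prefers
    // "sub qword ptr [mem], -128" whenever the carry flag is unused.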
3420 if (auto *OperandC = dyn_cast<ConstantSDNode>(Operand)) { 3421 int64_t OperandV = OperandC->getSExtValue(); 3422 3423 // Check if we can shrink the operand enough to fit in an immediate (or 3424 // fit into a smaller immediate) by negating it and switching the 3425 // operation. 3426 if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) && 3427 ((MemVT != MVT::i8 && !isInt<8>(OperandV) && isInt<8>(-OperandV)) || 3428 (MemVT == MVT::i64 && !isInt<32>(OperandV) && 3429 isInt<32>(-OperandV))) && 3430 hasNoCarryFlagUses(StoredVal.getValue(1))) { 3431 OperandV = -OperandV; 3432 Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD; 3433 } 3434 3435 // First try to fit this into an Imm8 operand. If it doesn't fit, then try 3436 // the larger immediate operand. 3437 if (MemVT != MVT::i8 && isInt<8>(OperandV)) { 3438 Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT); 3439 NewOpc = SelectImm8Opcode(Opc); 3440 } else if (MemVT != MVT::i64 || isInt<32>(OperandV)) { 3441 Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT); 3442 NewOpc = SelectImmOpcode(Opc); 3443 } 3444 } 3445 3446 if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) { 3447 SDValue CopyTo = 3448 CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS, 3449 StoredVal.getOperand(2), SDValue()); 3450 3451 const SDValue Ops[] = {Base, Scale, Index, Disp, 3452 Segment, Operand, CopyTo, CopyTo.getValue(1)}; 3453 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other, 3454 Ops); 3455 } else { 3456 const SDValue Ops[] = {Base, Scale, Index, Disp, 3457 Segment, Operand, InputChain}; 3458 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other, 3459 Ops); 3460 } 3461 break; 3462 } 3463 default: 3464 llvm_unreachable("Invalid opcode!"); 3465 } 3466 3467 MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(), 3468 LoadNode->getMemOperand()}; 3469 CurDAG->setNodeMemRefs(Result, MemOps); 3470 3471 // Update Load Chain uses as well. 3472 ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1)); 3473 ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1)); 3474 ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0)); 3475 CurDAG->RemoveDeadNode(Node); 3476 return true; 3477 } 3478 3479 // See if this is an X & Mask that we can match to BEXTR/BZHI. 3480 // Where Mask is one of the following patterns: 3481 // a) x & (1 << nbits) - 1 3482 // b) x & ~(-1 << nbits) 3483 // c) x & (-1 >> (32 - y)) 3484 // d) x << (32 - y) >> (32 - y) 3485 bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) { 3486 assert( 3487 (Node->getOpcode() == ISD::AND || Node->getOpcode() == ISD::SRL) && 3488 "Should be either an and-mask, or right-shift after clearing high bits."); 3489 3490 // BEXTR is BMI instruction, BZHI is BMI2 instruction. We need at least one. 3491 if (!Subtarget->hasBMI() && !Subtarget->hasBMI2()) 3492 return false; 3493 3494 MVT NVT = Node->getSimpleValueType(0); 3495 3496 // Only supported for 32 and 64 bits. 3497 if (NVT != MVT::i32 && NVT != MVT::i64) 3498 return false; 3499 3500 SDValue NBits; 3501 bool NegateNBits; 3502 3503 // If we have BMI2's BZHI, we are ok with muti-use patterns. 3504 // Else, if we only have BMI1's BEXTR, we require one-use. 
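  // Illustrative example (added commentary, not original): pattern a) with an
  // i32 'x' and nbits == 8 is
  //   x & ((1 << 8) - 1)  ==  x & 0xff,
  // i.e. "keep the low 8 bits", which maps to BZHI with a bit count of 8, or
  // to BEXTR with control 0x0800 (count 8, shift 0) when only BMI1 is present.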
3505 const bool AllowExtraUsesByDefault = Subtarget->hasBMI2(); 3506 auto checkUses = [AllowExtraUsesByDefault]( 3507 SDValue Op, unsigned NUses, 3508 std::optional<bool> AllowExtraUses) { 3509 return AllowExtraUses.value_or(AllowExtraUsesByDefault) || 3510 Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo()); 3511 }; 3512 auto checkOneUse = [checkUses](SDValue Op, 3513 std::optional<bool> AllowExtraUses = 3514 std::nullopt) { 3515 return checkUses(Op, 1, AllowExtraUses); 3516 }; 3517 auto checkTwoUse = [checkUses](SDValue Op, 3518 std::optional<bool> AllowExtraUses = 3519 std::nullopt) { 3520 return checkUses(Op, 2, AllowExtraUses); 3521 }; 3522 3523 auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) { 3524 if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) { 3525 assert(V.getSimpleValueType() == MVT::i32 && 3526 V.getOperand(0).getSimpleValueType() == MVT::i64 && 3527 "Expected i64 -> i32 truncation"); 3528 V = V.getOperand(0); 3529 } 3530 return V; 3531 }; 3532 3533 // a) x & ((1 << nbits) + (-1)) 3534 auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation, &NBits, 3535 &NegateNBits](SDValue Mask) -> bool { 3536 // Match `add`. Must only have one use! 3537 if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask)) 3538 return false; 3539 // We should be adding all-ones constant (i.e. subtracting one.) 3540 if (!isAllOnesConstant(Mask->getOperand(1))) 3541 return false; 3542 // Match `1 << nbits`. Might be truncated. Must only have one use! 3543 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0)); 3544 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0)) 3545 return false; 3546 if (!isOneConstant(M0->getOperand(0))) 3547 return false; 3548 NBits = M0->getOperand(1); 3549 NegateNBits = false; 3550 return true; 3551 }; 3552 3553 auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) { 3554 V = peekThroughOneUseTruncation(V); 3555 return CurDAG->MaskedValueIsAllOnes( 3556 V, APInt::getLowBitsSet(V.getSimpleValueType().getSizeInBits(), 3557 NVT.getSizeInBits())); 3558 }; 3559 3560 // b) x & ~(-1 << nbits) 3561 auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation, 3562 &NBits, &NegateNBits](SDValue Mask) -> bool { 3563 // Match `~()`. Must only have one use! 3564 if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask)) 3565 return false; 3566 // The -1 only has to be all-ones for the final Node's NVT. 3567 if (!isAllOnes(Mask->getOperand(1))) 3568 return false; 3569 // Match `-1 << nbits`. Might be truncated. Must only have one use! 3570 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0)); 3571 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0)) 3572 return false; 3573 // The -1 only has to be all-ones for the final Node's NVT. 3574 if (!isAllOnes(M0->getOperand(0))) 3575 return false; 3576 NBits = M0->getOperand(1); 3577 NegateNBits = false; 3578 return true; 3579 }; 3580 3581 // Try to match potentially-truncated shift amount as `(bitwidth - y)`, 3582 // or leave the shift amount as-is, but then we'll have to negate it. 3583 auto canonicalizeShiftAmt = [&NBits, &NegateNBits](SDValue ShiftAmt, 3584 unsigned Bitwidth) { 3585 NBits = ShiftAmt; 3586 NegateNBits = true; 3587 // Skip over a truncate of the shift amount, if any. 3588 if (NBits.getOpcode() == ISD::TRUNCATE) 3589 NBits = NBits.getOperand(0); 3590 // Try to match the shift amount as (bitwidth - y). It should go away, too. 3591 // If it doesn't match, that's fine, we'll just negate it ourselves. 
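    // Hypothetical example for illustration: for pattern c) on i32,
    //   x & (-1u >> (32 - y))
    // the shift amount is the SUB (32 - y); the match below strips it so NBits
    // becomes y and NegateNBits ends up false. For a plain x & (-1u >> z) there
    // is no such SUB, so NBits stays z and NegateNBits remains true, meaning
    // the caller must materialize (bitwidth - z) itself later.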
3592 if (NBits.getOpcode() != ISD::SUB) 3593 return; 3594 auto *V0 = dyn_cast<ConstantSDNode>(NBits.getOperand(0)); 3595 if (!V0 || V0->getZExtValue() != Bitwidth) 3596 return; 3597 NBits = NBits.getOperand(1); 3598 NegateNBits = false; 3599 }; 3600 3601 // c) x & (-1 >> z) but then we'll have to subtract z from bitwidth 3602 // or 3603 // c) x & (-1 >> (32 - y)) 3604 auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation, &NegateNBits, 3605 canonicalizeShiftAmt](SDValue Mask) -> bool { 3606 // The mask itself may be truncated. 3607 Mask = peekThroughOneUseTruncation(Mask); 3608 unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits(); 3609 // Match `l>>`. Must only have one use! 3610 if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask)) 3611 return false; 3612 // We should be shifting truly all-ones constant. 3613 if (!isAllOnesConstant(Mask.getOperand(0))) 3614 return false; 3615 SDValue M1 = Mask.getOperand(1); 3616 // The shift amount should not be used externally. 3617 if (!checkOneUse(M1)) 3618 return false; 3619 canonicalizeShiftAmt(M1, Bitwidth); 3620 // Pattern c. is non-canonical, and is expanded into pattern d. iff there 3621 // is no extra use of the mask. Clearly, there was one since we are here. 3622 // But at the same time, if we need to negate the shift amount, 3623 // then we don't want the mask to stick around, else it's unprofitable. 3624 return !NegateNBits; 3625 }; 3626 3627 SDValue X; 3628 3629 // d) x << z >> z but then we'll have to subtract z from bitwidth 3630 // or 3631 // d) x << (32 - y) >> (32 - y) 3632 auto matchPatternD = [checkOneUse, checkTwoUse, canonicalizeShiftAmt, 3633 AllowExtraUsesByDefault, &NegateNBits, 3634 &X](SDNode *Node) -> bool { 3635 if (Node->getOpcode() != ISD::SRL) 3636 return false; 3637 SDValue N0 = Node->getOperand(0); 3638 if (N0->getOpcode() != ISD::SHL) 3639 return false; 3640 unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits(); 3641 SDValue N1 = Node->getOperand(1); 3642 SDValue N01 = N0->getOperand(1); 3643 // Both of the shifts must be by the exact same value. 3644 if (N1 != N01) 3645 return false; 3646 canonicalizeShiftAmt(N1, Bitwidth); 3647 // There should not be any external uses of the inner shift / shift amount. 3648 // Note that while we are generally okay with external uses given BMI2, 3649 // iff we need to negate the shift amount, we are not okay with extra uses. 3650 const bool AllowExtraUses = AllowExtraUsesByDefault && !NegateNBits; 3651 if (!checkOneUse(N0, AllowExtraUses) || !checkTwoUse(N1, AllowExtraUses)) 3652 return false; 3653 X = N0->getOperand(0); 3654 return true; 3655 }; 3656 3657 auto matchLowBitMask = [matchPatternA, matchPatternB, 3658 matchPatternC](SDValue Mask) -> bool { 3659 return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask); 3660 }; 3661 3662 if (Node->getOpcode() == ISD::AND) { 3663 X = Node->getOperand(0); 3664 SDValue Mask = Node->getOperand(1); 3665 3666 if (matchLowBitMask(Mask)) { 3667 // Great. 3668 } else { 3669 std::swap(X, Mask); 3670 if (!matchLowBitMask(Mask)) 3671 return false; 3672 } 3673 } else if (!matchPatternD(Node)) 3674 return false; 3675 3676 // If we need to negate the shift amount, require BMI2 BZHI support. 3677 // It's just too unprofitable for BMI1 BEXTR. 3678 if (NegateNBits && !Subtarget->hasBMI2()) 3679 return false; 3680 3681 SDLoc DL(Node); 3682 3683 // Truncate the shift amount. 
3684   NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
3685   insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3686
3687   // Insert 8-bit NBits into lowest 8 bits of 32-bit register.
3688   // All the other bits are undefined, we do not care about them.
3689   SDValue ImplDef = SDValue(
3690       CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0);
3691   insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef);
3692
3693   SDValue SRIdxVal = CurDAG->getTargetConstant(X86::sub_8bit, DL, MVT::i32);
3694   insertDAGNode(*CurDAG, SDValue(Node, 0), SRIdxVal);
3695   NBits = SDValue(CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
3696                                          MVT::i32, ImplDef, NBits, SRIdxVal),
3697                   0);
3698   insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3699
3700   // We might have matched the amount of high bits to be cleared,
3701   // but we want the amount of low bits to be kept, so negate it then.
3702   if (NegateNBits) {
3703     SDValue BitWidthC = CurDAG->getConstant(NVT.getSizeInBits(), DL, MVT::i32);
3704     insertDAGNode(*CurDAG, SDValue(Node, 0), BitWidthC);
3705
3706     NBits = CurDAG->getNode(ISD::SUB, DL, MVT::i32, BitWidthC, NBits);
3707     insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3708   }
3709
3710   if (Subtarget->hasBMI2()) {
3711     // Great, just emit the BZHI.
3712     if (NVT != MVT::i32) {
3713       // But have to place the bit count into the wide-enough register first.
3714       NBits = CurDAG->getNode(ISD::ANY_EXTEND, DL, NVT, NBits);
3715       insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3716     }
3717
3718     SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits);
3719     ReplaceNode(Node, Extract.getNode());
3720     SelectCode(Extract.getNode());
3721     return true;
3722   }
3723
3724   // Else, if we do *NOT* have BMI2, let's find out if the 'X' is
3725   // *logically* shifted (potentially with a one-use trunc in between),
3726   // and the truncation was the only use of the shift,
3727   // and if so look past one-use truncation.
3728   {
3729     SDValue RealX = peekThroughOneUseTruncation(X);
3730     // FIXME: only if the shift is one-use?
3731     if (RealX != X && RealX.getOpcode() == ISD::SRL)
3732       X = RealX;
3733   }
3734
3735   MVT XVT = X.getSimpleValueType();
3736
3737   // Else, emitting BEXTR requires one more step.
3738   // The 'control' of BEXTR has the pattern of:
3739   // [15...8 bit][ 7...0 bit] location
3740   // [ bit count][ shift] name
3741   // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11
3742
3743   // Shift NBits left by 8 bits, thus producing 'control'.
3744   // This makes the low 8 bits zero.
3745   SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);
3746   insertDAGNode(*CurDAG, SDValue(Node, 0), C8);
3747   SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8);
3748   insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
3749
3750   // If the 'X' is *logically* shifted, we can fold that shift into 'control'.
3751   // FIXME: only if the shift is one-use?
3752   if (X.getOpcode() == ISD::SRL) {
3753     SDValue ShiftAmt = X.getOperand(1);
3754     X = X.getOperand(0);
3755
3756     assert(ShiftAmt.getValueType() == MVT::i8 &&
3757            "Expected shift amount to be i8");
3758
3759     // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
3760     // We could zext to i16 in some form, but we intentionally don't do that.
3761     SDValue OrigShiftAmt = ShiftAmt;
3762     ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt);
3763     insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt);
3764
3765     // And now 'or' these low 8 bits of shift amount into the 'control'.
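    // Worked example (illustrative only): extracting 8 bits starting at bit 4,
    // i.e. (x >> 4) & 0xff, needs control = (8 << 8) | 4 = 0x0804; the OR below
    // merges the shift amount (4) into the low byte of 'control'.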
3766 Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt); 3767 insertDAGNode(*CurDAG, SDValue(Node, 0), Control); 3768 } 3769 3770 // But have to place the 'control' into the wide-enough register first. 3771 if (XVT != MVT::i32) { 3772 Control = CurDAG->getNode(ISD::ANY_EXTEND, DL, XVT, Control); 3773 insertDAGNode(*CurDAG, SDValue(Node, 0), Control); 3774 } 3775 3776 // And finally, form the BEXTR itself. 3777 SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, XVT, X, Control); 3778 3779 // The 'X' was originally truncated. Do that now. 3780 if (XVT != NVT) { 3781 insertDAGNode(*CurDAG, SDValue(Node, 0), Extract); 3782 Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract); 3783 } 3784 3785 ReplaceNode(Node, Extract.getNode()); 3786 SelectCode(Extract.getNode()); 3787 3788 return true; 3789 } 3790 3791 // See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI. 3792 MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) { 3793 MVT NVT = Node->getSimpleValueType(0); 3794 SDLoc dl(Node); 3795 3796 SDValue N0 = Node->getOperand(0); 3797 SDValue N1 = Node->getOperand(1); 3798 3799 // If we have TBM we can use an immediate for the control. If we have BMI 3800 // we should only do this if the BEXTR instruction is implemented well. 3801 // Otherwise moving the control into a register makes this more costly. 3802 // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM 3803 // hoisting the move immediate would make it worthwhile with a less optimal 3804 // BEXTR? 3805 bool PreferBEXTR = 3806 Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR()); 3807 if (!PreferBEXTR && !Subtarget->hasBMI2()) 3808 return nullptr; 3809 3810 // Must have a shift right. 3811 if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA) 3812 return nullptr; 3813 3814 // Shift can't have additional users. 3815 if (!N0->hasOneUse()) 3816 return nullptr; 3817 3818 // Only supported for 32 and 64 bits. 3819 if (NVT != MVT::i32 && NVT != MVT::i64) 3820 return nullptr; 3821 3822 // Shift amount and RHS of and must be constant. 3823 auto *MaskCst = dyn_cast<ConstantSDNode>(N1); 3824 auto *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1)); 3825 if (!MaskCst || !ShiftCst) 3826 return nullptr; 3827 3828 // And RHS must be a mask. 3829 uint64_t Mask = MaskCst->getZExtValue(); 3830 if (!isMask_64(Mask)) 3831 return nullptr; 3832 3833 uint64_t Shift = ShiftCst->getZExtValue(); 3834 uint64_t MaskSize = llvm::popcount(Mask); 3835 3836 // Don't interfere with something that can be handled by extracting AH. 3837 // TODO: If we are able to fold a load, BEXTR might still be better than AH. 3838 if (Shift == 8 && MaskSize == 8) 3839 return nullptr; 3840 3841 // Make sure we are only using bits that were in the original value, not 3842 // shifted in. 3843 if (Shift + MaskSize > NVT.getSizeInBits()) 3844 return nullptr; 3845 3846 // BZHI, if available, is always fast, unlike BEXTR. But even if we decide 3847 // that we can't use BEXTR, it is only worthwhile using BZHI if the mask 3848 // does not fit into 32 bits. Load folding is not a sufficient reason. 3849 if (!PreferBEXTR && MaskSize <= 32) 3850 return nullptr; 3851 3852 SDValue Control; 3853 unsigned ROpc, MOpc; 3854 3855 if (!PreferBEXTR) { 3856 assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then."); 3857 // If we can't make use of BEXTR then we can't fuse shift+mask stages. 3858 // Let's perform the mask first, and apply shift later. 
Note that we need to
3859   // widen the mask to account for the fact that we'll apply shift afterwards!
3860     Control = CurDAG->getTargetConstant(Shift + MaskSize, dl, NVT);
3861     ROpc = NVT == MVT::i64 ? X86::BZHI64rr : X86::BZHI32rr;
3862     MOpc = NVT == MVT::i64 ? X86::BZHI64rm : X86::BZHI32rm;
3863     unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
3864     Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
3865   } else {
3866     // The 'control' of BEXTR has the pattern of:
3867     // [15...8 bit][ 7...0 bit] location
3868     // [ bit count][ shift] name
3869     // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11
3870     Control = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
3871     if (Subtarget->hasTBM()) {
3872       ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
3873       MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
3874     } else {
3875       assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then.");
3876       // BMI requires the immediate to be placed in a register.
3877       ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr;
3878       MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm;
3879       unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
3880       Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
3881     }
3882   }
3883
3884   MachineSDNode *NewNode;
3885   SDValue Input = N0->getOperand(0);
3886   SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
3887   if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
3888     SDValue Ops[] = {
3889         Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(0)};
3890     SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
3891     NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
3892     // Update the chain.
3893     ReplaceUses(Input.getValue(1), SDValue(NewNode, 2));
3894     // Record the mem-refs
3895     CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
3896   } else {
3897     NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, Control);
3898   }
3899
3900   if (!PreferBEXTR) {
3901     // We still need to apply the shift.
3902     SDValue ShAmt = CurDAG->getTargetConstant(Shift, dl, NVT);
3903     unsigned NewOpc = NVT == MVT::i64 ? X86::SHR64ri : X86::SHR32ri;
3904     NewNode =
3905         CurDAG->getMachineNode(NewOpc, dl, NVT, SDValue(NewNode, 0), ShAmt);
3906   }
3907
3908   return NewNode;
3909 }
3910
3911 // Emit a PCMPISTR(I/M) instruction.
3912 MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
3913                                              bool MayFoldLoad, const SDLoc &dl,
3914                                              MVT VT, SDNode *Node) {
3915   SDValue N0 = Node->getOperand(0);
3916   SDValue N1 = Node->getOperand(1);
3917   SDValue Imm = Node->getOperand(2);
3918   auto *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
3919   Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
3920
3921   // Try to fold a load. No need to check alignment.
3922   SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
3923   if (MayFoldLoad && tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
3924     SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
3925                       N1.getOperand(0) };
3926     SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
3927     MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
3928     // Update the chain.
3929 ReplaceUses(N1.getValue(1), SDValue(CNode, 2)); 3930 // Record the mem-refs 3931 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()}); 3932 return CNode; 3933 } 3934 3935 SDValue Ops[] = { N0, N1, Imm }; 3936 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32); 3937 MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops); 3938 return CNode; 3939 } 3940 3941 // Emit a PCMESTR(I/M) instruction. Also return the Glue result in case we need 3942 // to emit a second instruction after this one. This is needed since we have two 3943 // copyToReg nodes glued before this and we need to continue that glue through. 3944 MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc, 3945 bool MayFoldLoad, const SDLoc &dl, 3946 MVT VT, SDNode *Node, 3947 SDValue &InFlag) { 3948 SDValue N0 = Node->getOperand(0); 3949 SDValue N2 = Node->getOperand(2); 3950 SDValue Imm = Node->getOperand(4); 3951 auto *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue(); 3952 Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType()); 3953 3954 // Try to fold a load. No need to check alignment. 3955 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; 3956 if (MayFoldLoad && tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { 3957 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm, 3958 N2.getOperand(0), InFlag }; 3959 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue); 3960 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); 3961 InFlag = SDValue(CNode, 3); 3962 // Update the chain. 3963 ReplaceUses(N2.getValue(1), SDValue(CNode, 2)); 3964 // Record the mem-refs 3965 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N2)->getMemOperand()}); 3966 return CNode; 3967 } 3968 3969 SDValue Ops[] = { N0, N2, Imm, InFlag }; 3970 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue); 3971 MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops); 3972 InFlag = SDValue(CNode, 2); 3973 return CNode; 3974 } 3975 3976 bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) { 3977 EVT VT = N->getValueType(0); 3978 3979 // Only handle scalar shifts. 3980 if (VT.isVector()) 3981 return false; 3982 3983 // Narrower shifts only mask to 5 bits in hardware. 3984 unsigned Size = VT == MVT::i64 ? 64 : 32; 3985 3986 SDValue OrigShiftAmt = N->getOperand(1); 3987 SDValue ShiftAmt = OrigShiftAmt; 3988 SDLoc DL(N); 3989 3990 // Skip over a truncate of the shift amount. 3991 if (ShiftAmt->getOpcode() == ISD::TRUNCATE) 3992 ShiftAmt = ShiftAmt->getOperand(0); 3993 3994 // This function is called after X86DAGToDAGISel::matchBitExtract(), 3995 // so we are not afraid that we might mess up BZHI/BEXTR pattern. 3996 3997 SDValue NewShiftAmt; 3998 if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB || 3999 ShiftAmt->getOpcode() == ISD::XOR) { 4000 SDValue Add0 = ShiftAmt->getOperand(0); 4001 SDValue Add1 = ShiftAmt->getOperand(1); 4002 auto *Add0C = dyn_cast<ConstantSDNode>(Add0); 4003 auto *Add1C = dyn_cast<ConstantSDNode>(Add1); 4004 // If we are shifting by X+/-/^N where N == 0 mod Size, then just shift by X 4005 // to avoid the ADD/SUB/XOR. 4006 if (Add1C && Add1C->getAPIntValue().urem(Size) == 0) { 4007 NewShiftAmt = Add0; 4008 4009 } else if (ShiftAmt->getOpcode() != ISD::ADD && 4010 ((Add0C && Add0C->getAPIntValue().urem(Size) == Size - 1) || 4011 (Add1C && Add1C->getAPIntValue().urem(Size) == Size - 1))) { 4012 // If we are doing a NOT on just the lower bits with (Size*N-1) -/^ X 4013 // we can replace it with a NOT. 
In the XOR case it may save some code 4014 // size, in the SUB case it also may save a move. 4015 assert(Add0C == nullptr || Add1C == nullptr); 4016 4017 // We can only do N-X, not X-N 4018 if (ShiftAmt->getOpcode() == ISD::SUB && Add0C == nullptr) 4019 return false; 4020 4021 EVT OpVT = ShiftAmt.getValueType(); 4022 4023 SDValue AllOnes = CurDAG->getAllOnesConstant(DL, OpVT); 4024 NewShiftAmt = CurDAG->getNode(ISD::XOR, DL, OpVT, 4025 Add0C == nullptr ? Add0 : Add1, AllOnes); 4026 insertDAGNode(*CurDAG, OrigShiftAmt, AllOnes); 4027 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt); 4028 // If we are shifting by N-X where N == 0 mod Size, then just shift by 4029 // -X to generate a NEG instead of a SUB of a constant. 4030 } else if (ShiftAmt->getOpcode() == ISD::SUB && Add0C && 4031 Add0C->getZExtValue() != 0) { 4032 EVT SubVT = ShiftAmt.getValueType(); 4033 SDValue X; 4034 if (Add0C->getZExtValue() % Size == 0) 4035 X = Add1; 4036 else if (ShiftAmt.hasOneUse() && Size == 64 && 4037 Add0C->getZExtValue() % 32 == 0) { 4038 // We have a 64-bit shift by (n*32-x), turn it into -(x+n*32). 4039 // This is mainly beneficial if we already compute (x+n*32). 4040 if (Add1.getOpcode() == ISD::TRUNCATE) { 4041 Add1 = Add1.getOperand(0); 4042 SubVT = Add1.getValueType(); 4043 } 4044 if (Add0.getValueType() != SubVT) { 4045 Add0 = CurDAG->getZExtOrTrunc(Add0, DL, SubVT); 4046 insertDAGNode(*CurDAG, OrigShiftAmt, Add0); 4047 } 4048 4049 X = CurDAG->getNode(ISD::ADD, DL, SubVT, Add1, Add0); 4050 insertDAGNode(*CurDAG, OrigShiftAmt, X); 4051 } else 4052 return false; 4053 // Insert a negate op. 4054 // TODO: This isn't guaranteed to replace the sub if there is a logic cone 4055 // that uses it that's not a shift. 4056 SDValue Zero = CurDAG->getConstant(0, DL, SubVT); 4057 SDValue Neg = CurDAG->getNode(ISD::SUB, DL, SubVT, Zero, X); 4058 NewShiftAmt = Neg; 4059 4060 // Insert these operands into a valid topological order so they can 4061 // get selected independently. 4062 insertDAGNode(*CurDAG, OrigShiftAmt, Zero); 4063 insertDAGNode(*CurDAG, OrigShiftAmt, Neg); 4064 } else 4065 return false; 4066 } else 4067 return false; 4068 4069 if (NewShiftAmt.getValueType() != MVT::i8) { 4070 // Need to truncate the shift amount. 4071 NewShiftAmt = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NewShiftAmt); 4072 // Add to a correct topological ordering. 4073 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt); 4074 } 4075 4076 // Insert a new mask to keep the shift amount legal. This should be removed 4077 // by isel patterns. 4078 NewShiftAmt = CurDAG->getNode(ISD::AND, DL, MVT::i8, NewShiftAmt, 4079 CurDAG->getConstant(Size - 1, DL, MVT::i8)); 4080 // Place in a correct topological ordering. 4081 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt); 4082 4083 SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, N->getOperand(0), 4084 NewShiftAmt); 4085 if (UpdatedNode != N) { 4086 // If we found an existing node, we should replace ourselves with that node 4087 // and wait for it to be selected after its other users. 4088 ReplaceNode(N, UpdatedNode); 4089 return true; 4090 } 4091 4092 // If the original shift amount is now dead, delete it so that we don't run 4093 // it through isel. 4094 if (OrigShiftAmt.getNode()->use_empty()) 4095 CurDAG->RemoveDeadNode(OrigShiftAmt.getNode()); 4096 4097 // Now that we've optimized the shift amount, defer to normal isel to get 4098 // load folding and legacy vs BMI2 selection without repeating it here. 
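  // Illustrative recap (not original text): for an i64 shift by (64 - n) the
  // constant 64 is 0 mod 64, so the SUB is rewritten above into a NEG of n;
  // since the hardware masks the amount to 6 bits, "shl rax, cl" with
  // cl == -n behaves exactly like cl == 64 - n.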
4099   SelectCode(N);
4100   return true;
4101 }
4102
4103 bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
4104   MVT NVT = N->getSimpleValueType(0);
4105   unsigned Opcode = N->getOpcode();
4106   SDLoc dl(N);
4107
4108   // For operations of the form (x << C1) op C2, check if we can use a smaller
4109   // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
4110   SDValue Shift = N->getOperand(0);
4111   SDValue N1 = N->getOperand(1);
4112
4113   auto *Cst = dyn_cast<ConstantSDNode>(N1);
4114   if (!Cst)
4115     return false;
4116
4117   int64_t Val = Cst->getSExtValue();
4118
4119   // If we have an any_extend feeding the AND, look through it to see if there
4120   // is a shift behind it. But only if the AND doesn't use the extended bits.
4121   // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
4122   bool FoundAnyExtend = false;
4123   if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
4124       Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
4125       isUInt<32>(Val)) {
4126     FoundAnyExtend = true;
4127     Shift = Shift.getOperand(0);
4128   }
4129
4130   if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
4131     return false;
4132
4133   // i8 is unshrinkable, i16 should be promoted to i32.
4134   if (NVT != MVT::i32 && NVT != MVT::i64)
4135     return false;
4136
4137   auto *ShlCst = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
4138   if (!ShlCst)
4139     return false;
4140
4141   uint64_t ShAmt = ShlCst->getZExtValue();
4142
4143   // Make sure that we don't change the operation by removing bits.
4144   // This only matters for OR and XOR, AND is unaffected.
4145   uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
4146   if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
4147     return false;
4148
4149   // Check the minimum bitwidth for the new constant.
4150   // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
4151   auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
4152     if (Opcode == ISD::AND) {
4153       // AND32ri is the same as AND64ri32 with zext imm.
4154       // Try this before sign extended immediates below.
4155       ShiftedVal = (uint64_t)Val >> ShAmt;
4156       if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
4157         return true;
4158       // Also swap order when the AND can become MOVZX.
4159       if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX)
4160         return true;
4161     }
4162     ShiftedVal = Val >> ShAmt;
4163     if ((!isInt<8>(Val) && isInt<8>(ShiftedVal)) ||
4164         (!isInt<32>(Val) && isInt<32>(ShiftedVal)))
4165       return true;
4166     if (Opcode != ISD::AND) {
4167       // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
4168       ShiftedVal = (uint64_t)Val >> ShAmt;
4169       if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
4170         return true;
4171     }
4172     return false;
4173   };
4174
4175   int64_t ShiftedVal;
4176   if (!CanShrinkImmediate(ShiftedVal))
4177     return false;
4178
4179   // Ok, we can reorder to get a smaller immediate.
4180
4181   // But, it's possible the original immediate allowed an AND to become MOVZX.
4182   // Do this check late so that the MaskedValueIsZero call happens as late as
4183   // possible.
4184   if (Opcode == ISD::AND) {
4185     // Find the smallest zext this could possibly be.
4186     unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
4187     ZExtWidth = PowerOf2Ceil(std::max(ZExtWidth, 8U));
4188
4189     // Figure out which bits need to be zero to achieve that mask.
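    // Worked example (illustrative, not from the original source): for a
    // 32-bit AND with mask 0x7FF0, getActiveBits() is 15, so ZExtWidth rounds
    // up to 16. NeededMask below becomes 0xFFFF & ~0x7FF0 = 0x800F; if those
    // bits of the other operand are already known zero, the AND is equivalent
    // to a 16-bit zero-extend (MOVZX) and we bail out instead of reordering.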
4190 APInt NeededMask = APInt::getLowBitsSet(NVT.getSizeInBits(), 4191 ZExtWidth); 4192 NeededMask &= ~Cst->getAPIntValue(); 4193 4194 if (CurDAG->MaskedValueIsZero(N->getOperand(0), NeededMask)) 4195 return false; 4196 } 4197 4198 SDValue X = Shift.getOperand(0); 4199 if (FoundAnyExtend) { 4200 SDValue NewX = CurDAG->getNode(ISD::ANY_EXTEND, dl, NVT, X); 4201 insertDAGNode(*CurDAG, SDValue(N, 0), NewX); 4202 X = NewX; 4203 } 4204 4205 SDValue NewCst = CurDAG->getConstant(ShiftedVal, dl, NVT); 4206 insertDAGNode(*CurDAG, SDValue(N, 0), NewCst); 4207 SDValue NewBinOp = CurDAG->getNode(Opcode, dl, NVT, X, NewCst); 4208 insertDAGNode(*CurDAG, SDValue(N, 0), NewBinOp); 4209 SDValue NewSHL = CurDAG->getNode(ISD::SHL, dl, NVT, NewBinOp, 4210 Shift.getOperand(1)); 4211 ReplaceNode(N, NewSHL.getNode()); 4212 SelectCode(NewSHL.getNode()); 4213 return true; 4214 } 4215 4216 bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA, 4217 SDNode *ParentB, SDNode *ParentC, 4218 SDValue A, SDValue B, SDValue C, 4219 uint8_t Imm) { 4220 assert(A.isOperandOf(ParentA) && B.isOperandOf(ParentB) && 4221 C.isOperandOf(ParentC) && "Incorrect parent node"); 4222 4223 auto tryFoldLoadOrBCast = 4224 [this](SDNode *Root, SDNode *P, SDValue &L, SDValue &Base, SDValue &Scale, 4225 SDValue &Index, SDValue &Disp, SDValue &Segment) { 4226 if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment)) 4227 return true; 4228 4229 // Not a load, check for broadcast which may be behind a bitcast. 4230 if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) { 4231 P = L.getNode(); 4232 L = L.getOperand(0); 4233 } 4234 4235 if (L.getOpcode() != X86ISD::VBROADCAST_LOAD) 4236 return false; 4237 4238 // Only 32 and 64 bit broadcasts are supported. 4239 auto *MemIntr = cast<MemIntrinsicSDNode>(L); 4240 unsigned Size = MemIntr->getMemoryVT().getSizeInBits(); 4241 if (Size != 32 && Size != 64) 4242 return false; 4243 4244 return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment); 4245 }; 4246 4247 bool FoldedLoad = false; 4248 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; 4249 if (tryFoldLoadOrBCast(Root, ParentC, C, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { 4250 FoldedLoad = true; 4251 } else if (tryFoldLoadOrBCast(Root, ParentA, A, Tmp0, Tmp1, Tmp2, Tmp3, 4252 Tmp4)) { 4253 FoldedLoad = true; 4254 std::swap(A, C); 4255 // Swap bits 1/4 and 3/6. 4256 uint8_t OldImm = Imm; 4257 Imm = OldImm & 0xa5; 4258 if (OldImm & 0x02) Imm |= 0x10; 4259 if (OldImm & 0x10) Imm |= 0x02; 4260 if (OldImm & 0x08) Imm |= 0x40; 4261 if (OldImm & 0x40) Imm |= 0x08; 4262 } else if (tryFoldLoadOrBCast(Root, ParentB, B, Tmp0, Tmp1, Tmp2, Tmp3, 4263 Tmp4)) { 4264 FoldedLoad = true; 4265 std::swap(B, C); 4266 // Swap bits 1/2 and 5/6. 4267 uint8_t OldImm = Imm; 4268 Imm = OldImm & 0x99; 4269 if (OldImm & 0x02) Imm |= 0x04; 4270 if (OldImm & 0x04) Imm |= 0x02; 4271 if (OldImm & 0x20) Imm |= 0x40; 4272 if (OldImm & 0x40) Imm |= 0x20; 4273 } 4274 4275 SDLoc DL(Root); 4276 4277 SDValue TImm = CurDAG->getTargetConstant(Imm, DL, MVT::i8); 4278 4279 MVT NVT = Root->getSimpleValueType(0); 4280 4281 MachineSDNode *MNode; 4282 if (FoldedLoad) { 4283 SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other); 4284 4285 unsigned Opc; 4286 if (C.getOpcode() == X86ISD::VBROADCAST_LOAD) { 4287 auto *MemIntr = cast<MemIntrinsicSDNode>(C); 4288 unsigned EltSize = MemIntr->getMemoryVT().getSizeInBits(); 4289 assert((EltSize == 32 || EltSize == 64) && "Unexpected broadcast size!"); 4290 4291 bool UseD = EltSize == 32; 4292 if (NVT.is128BitVector()) 4293 Opc = UseD ? 
X86::VPTERNLOGDZ128rmbi : X86::VPTERNLOGQZ128rmbi; 4294 else if (NVT.is256BitVector()) 4295 Opc = UseD ? X86::VPTERNLOGDZ256rmbi : X86::VPTERNLOGQZ256rmbi; 4296 else if (NVT.is512BitVector()) 4297 Opc = UseD ? X86::VPTERNLOGDZrmbi : X86::VPTERNLOGQZrmbi; 4298 else 4299 llvm_unreachable("Unexpected vector size!"); 4300 } else { 4301 bool UseD = NVT.getVectorElementType() == MVT::i32; 4302 if (NVT.is128BitVector()) 4303 Opc = UseD ? X86::VPTERNLOGDZ128rmi : X86::VPTERNLOGQZ128rmi; 4304 else if (NVT.is256BitVector()) 4305 Opc = UseD ? X86::VPTERNLOGDZ256rmi : X86::VPTERNLOGQZ256rmi; 4306 else if (NVT.is512BitVector()) 4307 Opc = UseD ? X86::VPTERNLOGDZrmi : X86::VPTERNLOGQZrmi; 4308 else 4309 llvm_unreachable("Unexpected vector size!"); 4310 } 4311 4312 SDValue Ops[] = {A, B, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, TImm, C.getOperand(0)}; 4313 MNode = CurDAG->getMachineNode(Opc, DL, VTs, Ops); 4314 4315 // Update the chain. 4316 ReplaceUses(C.getValue(1), SDValue(MNode, 1)); 4317 // Record the mem-refs 4318 CurDAG->setNodeMemRefs(MNode, {cast<MemSDNode>(C)->getMemOperand()}); 4319 } else { 4320 bool UseD = NVT.getVectorElementType() == MVT::i32; 4321 unsigned Opc; 4322 if (NVT.is128BitVector()) 4323 Opc = UseD ? X86::VPTERNLOGDZ128rri : X86::VPTERNLOGQZ128rri; 4324 else if (NVT.is256BitVector()) 4325 Opc = UseD ? X86::VPTERNLOGDZ256rri : X86::VPTERNLOGQZ256rri; 4326 else if (NVT.is512BitVector()) 4327 Opc = UseD ? X86::VPTERNLOGDZrri : X86::VPTERNLOGQZrri; 4328 else 4329 llvm_unreachable("Unexpected vector size!"); 4330 4331 MNode = CurDAG->getMachineNode(Opc, DL, NVT, {A, B, C, TImm}); 4332 } 4333 4334 ReplaceUses(SDValue(Root, 0), SDValue(MNode, 0)); 4335 CurDAG->RemoveDeadNode(Root); 4336 return true; 4337 } 4338 4339 // Try to match two logic ops to a VPTERNLOG. 4340 // FIXME: Handle more complex patterns that use an operand more than once? 4341 bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) { 4342 MVT NVT = N->getSimpleValueType(0); 4343 4344 // Make sure we support VPTERNLOG. 4345 if (!NVT.isVector() || !Subtarget->hasAVX512() || 4346 NVT.getVectorElementType() == MVT::i1) 4347 return false; 4348 4349 // We need VLX for 128/256-bit. 4350 if (!(Subtarget->hasVLX() || NVT.is512BitVector())) 4351 return false; 4352 4353 SDValue N0 = N->getOperand(0); 4354 SDValue N1 = N->getOperand(1); 4355 4356 auto getFoldableLogicOp = [](SDValue Op) { 4357 // Peek through single use bitcast. 4358 if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse()) 4359 Op = Op.getOperand(0); 4360 4361 if (!Op.hasOneUse()) 4362 return SDValue(); 4363 4364 unsigned Opc = Op.getOpcode(); 4365 if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR || 4366 Opc == X86ISD::ANDNP) 4367 return Op; 4368 4369 return SDValue(); 4370 }; 4371 4372 SDValue A, FoldableOp; 4373 if ((FoldableOp = getFoldableLogicOp(N1))) { 4374 A = N0; 4375 } else if ((FoldableOp = getFoldableLogicOp(N0))) { 4376 A = N1; 4377 } else 4378 return false; 4379 4380 SDValue B = FoldableOp.getOperand(0); 4381 SDValue C = FoldableOp.getOperand(1); 4382 SDNode *ParentA = N; 4383 SDNode *ParentB = FoldableOp.getNode(); 4384 SDNode *ParentC = FoldableOp.getNode(); 4385 4386 // We can build the appropriate control immediate by performing the logic 4387 // operation we're matching using these constants for A, B, and C. 4388 uint8_t TernlogMagicA = 0xf0; 4389 uint8_t TernlogMagicB = 0xcc; 4390 uint8_t TernlogMagicC = 0xaa; 4391 4392 // Some of the inputs may be inverted, peek through them and invert the 4393 // magic values accordingly. 
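  // Worked example (illustrative): with the truth-table constants above
  // (A = 0xf0, B = 0xcc, C = 0xaa), matching A & (B | C) composes to
  //   Imm = 0xf0 & (0xcc | 0xaa) = 0xf0 & 0xee = 0xe0,
  // the control byte VPTERNLOG needs for that expression; an inverted input
  // such as ~B simply flips its magic constant to 0x33 before composing.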
4394 // TODO: There may be a bitcast before the xor that we should peek through. 4395 auto PeekThroughNot = [](SDValue &Op, SDNode *&Parent, uint8_t &Magic) { 4396 if (Op.getOpcode() == ISD::XOR && Op.hasOneUse() && 4397 ISD::isBuildVectorAllOnes(Op.getOperand(1).getNode())) { 4398 Magic = ~Magic; 4399 Parent = Op.getNode(); 4400 Op = Op.getOperand(0); 4401 } 4402 }; 4403 4404 PeekThroughNot(A, ParentA, TernlogMagicA); 4405 PeekThroughNot(B, ParentB, TernlogMagicB); 4406 PeekThroughNot(C, ParentC, TernlogMagicC); 4407 4408 uint8_t Imm; 4409 switch (FoldableOp.getOpcode()) { 4410 default: llvm_unreachable("Unexpected opcode!"); 4411 case ISD::AND: Imm = TernlogMagicB & TernlogMagicC; break; 4412 case ISD::OR: Imm = TernlogMagicB | TernlogMagicC; break; 4413 case ISD::XOR: Imm = TernlogMagicB ^ TernlogMagicC; break; 4414 case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break; 4415 } 4416 4417 switch (N->getOpcode()) { 4418 default: llvm_unreachable("Unexpected opcode!"); 4419 case X86ISD::ANDNP: 4420 if (A == N0) 4421 Imm &= ~TernlogMagicA; 4422 else 4423 Imm = ~(Imm) & TernlogMagicA; 4424 break; 4425 case ISD::AND: Imm &= TernlogMagicA; break; 4426 case ISD::OR: Imm |= TernlogMagicA; break; 4427 case ISD::XOR: Imm ^= TernlogMagicA; break; 4428 } 4429 4430 return matchVPTERNLOG(N, ParentA, ParentB, ParentC, A, B, C, Imm); 4431 } 4432 4433 /// If the high bits of an 'and' operand are known zero, try setting the 4434 /// high bits of an 'and' constant operand to produce a smaller encoding by 4435 /// creating a small, sign-extended negative immediate rather than a large 4436 /// positive one. This reverses a transform in SimplifyDemandedBits that 4437 /// shrinks mask constants by clearing bits. There is also a possibility that 4438 /// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that 4439 /// case, just replace the 'and'. Return 'true' if the node is replaced. 4440 bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) { 4441 // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't 4442 // have immediate operands. 4443 MVT VT = And->getSimpleValueType(0); 4444 if (VT != MVT::i32 && VT != MVT::i64) 4445 return false; 4446 4447 auto *And1C = dyn_cast<ConstantSDNode>(And->getOperand(1)); 4448 if (!And1C) 4449 return false; 4450 4451 // Bail out if the mask constant is already negative. It's can't shrink more. 4452 // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel 4453 // patterns to use a 32-bit and instead of a 64-bit and by relying on the 4454 // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits 4455 // are negative too. 4456 APInt MaskVal = And1C->getAPIntValue(); 4457 unsigned MaskLZ = MaskVal.countLeadingZeros(); 4458 if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32)) 4459 return false; 4460 4461 // Don't extend into the upper 32 bits of a 64 bit mask. 4462 if (VT == MVT::i64 && MaskLZ >= 32) { 4463 MaskLZ -= 32; 4464 MaskVal = MaskVal.trunc(32); 4465 } 4466 4467 SDValue And0 = And->getOperand(0); 4468 APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ); 4469 APInt NegMaskVal = MaskVal | HighZeros; 4470 4471 // If a negative constant would not allow a smaller encoding, there's no need 4472 // to continue. Only change the constant when we know it's a win. 4473 unsigned MinWidth = NegMaskVal.getMinSignedBits(); 4474 if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getMinSignedBits() <= 32)) 4475 return false; 4476 4477 // Extend masks if we truncated above. 
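  // Illustrative example (assumption, not original commentary): for
  // "and eax, 0x0FFFFFF0" where the top 4 bits of the source are known zero,
  // this transform rewrites the mask to 0xFFFFFFF0 (-16), which encodes as a
  // sign-extended imm8 instead of a full imm32.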
4478 if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) { 4479 NegMaskVal = NegMaskVal.zext(64); 4480 HighZeros = HighZeros.zext(64); 4481 } 4482 4483 // The variable operand must be all zeros in the top bits to allow using the 4484 // new, negative constant as the mask. 4485 if (!CurDAG->MaskedValueIsZero(And0, HighZeros)) 4486 return false; 4487 4488 // Check if the mask is -1. In that case, this is an unnecessary instruction 4489 // that escaped earlier analysis. 4490 if (NegMaskVal.isAllOnes()) { 4491 ReplaceNode(And, And0.getNode()); 4492 return true; 4493 } 4494 4495 // A negative mask allows a smaller encoding. Create a new 'and' node. 4496 SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT); 4497 insertDAGNode(*CurDAG, SDValue(And, 0), NewMask); 4498 SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask); 4499 ReplaceNode(And, NewAnd.getNode()); 4500 SelectCode(NewAnd.getNode()); 4501 return true; 4502 } 4503 4504 static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad, 4505 bool FoldedBCast, bool Masked) { 4506 #define VPTESTM_CASE(VT, SUFFIX) \ 4507 case MVT::VT: \ 4508 if (Masked) \ 4509 return IsTestN ? X86::VPTESTNM##SUFFIX##k: X86::VPTESTM##SUFFIX##k; \ 4510 return IsTestN ? X86::VPTESTNM##SUFFIX : X86::VPTESTM##SUFFIX; 4511 4512 4513 #define VPTESTM_BROADCAST_CASES(SUFFIX) \ 4514 default: llvm_unreachable("Unexpected VT!"); \ 4515 VPTESTM_CASE(v4i32, DZ128##SUFFIX) \ 4516 VPTESTM_CASE(v2i64, QZ128##SUFFIX) \ 4517 VPTESTM_CASE(v8i32, DZ256##SUFFIX) \ 4518 VPTESTM_CASE(v4i64, QZ256##SUFFIX) \ 4519 VPTESTM_CASE(v16i32, DZ##SUFFIX) \ 4520 VPTESTM_CASE(v8i64, QZ##SUFFIX) 4521 4522 #define VPTESTM_FULL_CASES(SUFFIX) \ 4523 VPTESTM_BROADCAST_CASES(SUFFIX) \ 4524 VPTESTM_CASE(v16i8, BZ128##SUFFIX) \ 4525 VPTESTM_CASE(v8i16, WZ128##SUFFIX) \ 4526 VPTESTM_CASE(v32i8, BZ256##SUFFIX) \ 4527 VPTESTM_CASE(v16i16, WZ256##SUFFIX) \ 4528 VPTESTM_CASE(v64i8, BZ##SUFFIX) \ 4529 VPTESTM_CASE(v32i16, WZ##SUFFIX) 4530 4531 if (FoldedBCast) { 4532 switch (TestVT.SimpleTy) { 4533 VPTESTM_BROADCAST_CASES(rmb) 4534 } 4535 } 4536 4537 if (FoldedLoad) { 4538 switch (TestVT.SimpleTy) { 4539 VPTESTM_FULL_CASES(rm) 4540 } 4541 } 4542 4543 switch (TestVT.SimpleTy) { 4544 VPTESTM_FULL_CASES(rr) 4545 } 4546 4547 #undef VPTESTM_FULL_CASES 4548 #undef VPTESTM_BROADCAST_CASES 4549 #undef VPTESTM_CASE 4550 } 4551 4552 // Try to create VPTESTM instruction. If InMask is not null, it will be used 4553 // to form a masked operation. 4554 bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc, 4555 SDValue InMask) { 4556 assert(Subtarget->hasAVX512() && "Expected AVX512!"); 4557 assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 && 4558 "Unexpected VT!"); 4559 4560 // Look for equal and not equal compares. 4561 ISD::CondCode CC = cast<CondCodeSDNode>(Setcc.getOperand(2))->get(); 4562 if (CC != ISD::SETEQ && CC != ISD::SETNE) 4563 return false; 4564 4565 SDValue SetccOp0 = Setcc.getOperand(0); 4566 SDValue SetccOp1 = Setcc.getOperand(1); 4567 4568 // Canonicalize the all zero vector to the RHS. 4569 if (ISD::isBuildVectorAllZeros(SetccOp0.getNode())) 4570 std::swap(SetccOp0, SetccOp1); 4571 4572 // See if we're comparing against zero. 4573 if (!ISD::isBuildVectorAllZeros(SetccOp1.getNode())) 4574 return false; 4575 4576 SDValue N0 = SetccOp0; 4577 4578 MVT CmpVT = N0.getSimpleValueType(); 4579 MVT CmpSVT = CmpVT.getVectorElementType(); 4580 4581 // Start with both operands the same. We'll try to refine this. 
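  // Illustrative note (added, with the instruction semantics assumed as
  // documented): VPTESTMD sets k[i] = ((Src0[i] & Src1[i]) != 0), so
  //   setcc (and X, Y), 0, setne  ->  vptestm  X, Y
  //   setcc (and X, Y), 0, seteq  ->  vptestnm X, Y
  // and when there is no AND we test X against itself, which is why both
  // sources start out as N0 below.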
4582 SDValue Src0 = N0; 4583 SDValue Src1 = N0; 4584 4585 { 4586 // Look through single use bitcasts. 4587 SDValue N0Temp = N0; 4588 if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse()) 4589 N0Temp = N0.getOperand(0); 4590 4591 // Look for single use AND. 4592 if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) { 4593 Src0 = N0Temp.getOperand(0); 4594 Src1 = N0Temp.getOperand(1); 4595 } 4596 } 4597 4598 // Without VLX we need to widen the operation. 4599 bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector(); 4600 4601 auto tryFoldLoadOrBCast = [&](SDNode *Root, SDNode *P, SDValue &L, 4602 SDValue &Base, SDValue &Scale, SDValue &Index, 4603 SDValue &Disp, SDValue &Segment) { 4604 // If we need to widen, we can't fold the load. 4605 if (!Widen) 4606 if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment)) 4607 return true; 4608 4609 // If we didn't fold a load, try to match broadcast. No widening limitation 4610 // for this. But only 32 and 64 bit types are supported. 4611 if (CmpSVT != MVT::i32 && CmpSVT != MVT::i64) 4612 return false; 4613 4614 // Look through single use bitcasts. 4615 if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) { 4616 P = L.getNode(); 4617 L = L.getOperand(0); 4618 } 4619 4620 if (L.getOpcode() != X86ISD::VBROADCAST_LOAD) 4621 return false; 4622 4623 auto *MemIntr = cast<MemIntrinsicSDNode>(L); 4624 if (MemIntr->getMemoryVT().getSizeInBits() != CmpSVT.getSizeInBits()) 4625 return false; 4626 4627 return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment); 4628 }; 4629 4630 // We can only fold loads if the sources are unique. 4631 bool CanFoldLoads = Src0 != Src1; 4632 4633 bool FoldedLoad = false; 4634 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; 4635 if (CanFoldLoads) { 4636 FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src1, Tmp0, Tmp1, Tmp2, 4637 Tmp3, Tmp4); 4638 if (!FoldedLoad) { 4639 // And is commutative. 4640 FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src0, Tmp0, Tmp1, 4641 Tmp2, Tmp3, Tmp4); 4642 if (FoldedLoad) 4643 std::swap(Src0, Src1); 4644 } 4645 } 4646 4647 bool FoldedBCast = FoldedLoad && Src1.getOpcode() == X86ISD::VBROADCAST_LOAD; 4648 4649 bool IsMasked = InMask.getNode() != nullptr; 4650 4651 SDLoc dl(Root); 4652 4653 MVT ResVT = Setcc.getSimpleValueType(); 4654 MVT MaskVT = ResVT; 4655 if (Widen) { 4656 // Widen the inputs using insert_subreg or copy_to_regclass. 4657 unsigned Scale = CmpVT.is128BitVector() ? 4 : 2; 4658 unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm; 4659 unsigned NumElts = CmpVT.getVectorNumElements() * Scale; 4660 CmpVT = MVT::getVectorVT(CmpSVT, NumElts); 4661 MaskVT = MVT::getVectorVT(MVT::i1, NumElts); 4662 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl, 4663 CmpVT), 0); 4664 Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0); 4665 4666 if (!FoldedBCast) 4667 Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1); 4668 4669 if (IsMasked) { 4670 // Widen the mask. 
4671 unsigned RegClass = TLI->getRegClassFor(MaskVT)->getID(); 4672 SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32); 4673 InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, 4674 dl, MaskVT, InMask, RC), 0); 4675 } 4676 } 4677 4678 bool IsTestN = CC == ISD::SETEQ; 4679 unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast, 4680 IsMasked); 4681 4682 MachineSDNode *CNode; 4683 if (FoldedLoad) { 4684 SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other); 4685 4686 if (IsMasked) { 4687 SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, 4688 Src1.getOperand(0) }; 4689 CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); 4690 } else { 4691 SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, 4692 Src1.getOperand(0) }; 4693 CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); 4694 } 4695 4696 // Update the chain. 4697 ReplaceUses(Src1.getValue(1), SDValue(CNode, 1)); 4698 // Record the mem-refs 4699 CurDAG->setNodeMemRefs(CNode, {cast<MemSDNode>(Src1)->getMemOperand()}); 4700 } else { 4701 if (IsMasked) 4702 CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1); 4703 else 4704 CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, Src0, Src1); 4705 } 4706 4707 // If we widened, we need to shrink the mask VT. 4708 if (Widen) { 4709 unsigned RegClass = TLI->getRegClassFor(ResVT)->getID(); 4710 SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32); 4711 CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, 4712 dl, ResVT, SDValue(CNode, 0), RC); 4713 } 4714 4715 ReplaceUses(SDValue(Root, 0), SDValue(CNode, 0)); 4716 CurDAG->RemoveDeadNode(Root); 4717 return true; 4718 } 4719 4720 // Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it 4721 // into vpternlog. 4722 bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) { 4723 assert(N->getOpcode() == ISD::OR && "Unexpected opcode!"); 4724 4725 MVT NVT = N->getSimpleValueType(0); 4726 4727 // Make sure we support VPTERNLOG. 4728 if (!NVT.isVector() || !Subtarget->hasAVX512()) 4729 return false; 4730 4731 // We need VLX for 128/256-bit. 4732 if (!(Subtarget->hasVLX() || NVT.is512BitVector())) 4733 return false; 4734 4735 SDValue N0 = N->getOperand(0); 4736 SDValue N1 = N->getOperand(1); 4737 4738 // Canonicalize AND to LHS. 4739 if (N1.getOpcode() == ISD::AND) 4740 std::swap(N0, N1); 4741 4742 if (N0.getOpcode() != ISD::AND || 4743 N1.getOpcode() != X86ISD::ANDNP || 4744 !N0.hasOneUse() || !N1.hasOneUse()) 4745 return false; 4746 4747 // ANDN is not commutable, use it to pick down A and C. 4748 SDValue A = N1.getOperand(0); 4749 SDValue C = N1.getOperand(1); 4750 4751 // AND is commutable, if one operand matches A, the other operand is B. 4752 // Otherwise this isn't a match. 
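  // Worked example (for illustration only): the bit-select
  //   (A & B) | (~A & C)
  // evaluated on the ternlog truth-table constants A=0xf0, B=0xcc, C=0xaa is
  //   (0xf0 & 0xcc) | (0x0f & 0xaa) = 0xc0 | 0x0a = 0xca,
  // which is where the VPTERNLOG immediate 0xCA below comes from.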
4753 SDValue B; 4754 if (N0.getOperand(0) == A) 4755 B = N0.getOperand(1); 4756 else if (N0.getOperand(1) == A) 4757 B = N0.getOperand(0); 4758 else 4759 return false; 4760 4761 SDLoc dl(N); 4762 SDValue Imm = CurDAG->getTargetConstant(0xCA, dl, MVT::i8); 4763 SDValue Ternlog = CurDAG->getNode(X86ISD::VPTERNLOG, dl, NVT, A, B, C, Imm); 4764 ReplaceNode(N, Ternlog.getNode()); 4765 4766 return matchVPTERNLOG(Ternlog.getNode(), Ternlog.getNode(), Ternlog.getNode(), 4767 Ternlog.getNode(), A, B, C, 0xCA); 4768 } 4769 4770 void X86DAGToDAGISel::Select(SDNode *Node) { 4771 MVT NVT = Node->getSimpleValueType(0); 4772 unsigned Opcode = Node->getOpcode(); 4773 SDLoc dl(Node); 4774 4775 if (Node->isMachineOpcode()) { 4776 LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n'); 4777 Node->setNodeId(-1); 4778 return; // Already selected. 4779 } 4780 4781 switch (Opcode) { 4782 default: break; 4783 case ISD::INTRINSIC_W_CHAIN: { 4784 unsigned IntNo = Node->getConstantOperandVal(1); 4785 switch (IntNo) { 4786 default: break; 4787 case Intrinsic::x86_encodekey128: 4788 case Intrinsic::x86_encodekey256: { 4789 if (!Subtarget->hasKL()) 4790 break; 4791 4792 unsigned Opcode; 4793 switch (IntNo) { 4794 default: llvm_unreachable("Impossible intrinsic"); 4795 case Intrinsic::x86_encodekey128: Opcode = X86::ENCODEKEY128; break; 4796 case Intrinsic::x86_encodekey256: Opcode = X86::ENCODEKEY256; break; 4797 } 4798 4799 SDValue Chain = Node->getOperand(0); 4800 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(3), 4801 SDValue()); 4802 if (Opcode == X86::ENCODEKEY256) 4803 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(4), 4804 Chain.getValue(1)); 4805 4806 MachineSDNode *Res = CurDAG->getMachineNode( 4807 Opcode, dl, Node->getVTList(), 4808 {Node->getOperand(2), Chain, Chain.getValue(1)}); 4809 ReplaceNode(Node, Res); 4810 return; 4811 } 4812 case Intrinsic::x86_tileloadd64_internal: 4813 case Intrinsic::x86_tileloaddt164_internal: { 4814 if (!Subtarget->hasAMXTILE()) 4815 break; 4816 unsigned Opc = IntNo == Intrinsic::x86_tileloadd64_internal 4817 ? X86::PTILELOADDV 4818 : X86::PTILELOADDT1V; 4819 // _tile_loadd_internal(row, col, buf, STRIDE) 4820 SDValue Base = Node->getOperand(4); 4821 SDValue Scale = getI8Imm(1, dl); 4822 SDValue Index = Node->getOperand(5); 4823 SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32); 4824 SDValue Segment = CurDAG->getRegister(0, MVT::i16); 4825 SDValue Chain = Node->getOperand(0); 4826 MachineSDNode *CNode; 4827 SDValue Ops[] = {Node->getOperand(2), 4828 Node->getOperand(3), 4829 Base, 4830 Scale, 4831 Index, 4832 Disp, 4833 Segment, 4834 Chain}; 4835 CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops); 4836 ReplaceNode(Node, CNode); 4837 return; 4838 } 4839 } 4840 break; 4841 } 4842 case ISD::INTRINSIC_VOID: { 4843 unsigned IntNo = Node->getConstantOperandVal(1); 4844 switch (IntNo) { 4845 default: break; 4846 case Intrinsic::x86_sse3_monitor: 4847 case Intrinsic::x86_monitorx: 4848 case Intrinsic::x86_clzero: { 4849 bool Use64BitPtr = Node->getOperand(2).getValueType() == MVT::i64; 4850 4851 unsigned Opc = 0; 4852 switch (IntNo) { 4853 default: llvm_unreachable("Unexpected intrinsic!"); 4854 case Intrinsic::x86_sse3_monitor: 4855 if (!Subtarget->hasSSE3()) 4856 break; 4857 Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr; 4858 break; 4859 case Intrinsic::x86_monitorx: 4860 if (!Subtarget->hasMWAITX()) 4861 break; 4862 Opc = Use64BitPtr ? 
X86::MONITORX64rrr : X86::MONITORX32rrr; 4863 break; 4864 case Intrinsic::x86_clzero: 4865 if (!Subtarget->hasCLZERO()) 4866 break; 4867 Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r; 4868 break; 4869 } 4870 4871 if (Opc) { 4872 unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX; 4873 SDValue Chain = CurDAG->getCopyToReg(Node->getOperand(0), dl, PtrReg, 4874 Node->getOperand(2), SDValue()); 4875 SDValue InFlag = Chain.getValue(1); 4876 4877 if (IntNo == Intrinsic::x86_sse3_monitor || 4878 IntNo == Intrinsic::x86_monitorx) { 4879 // Copy the other two operands to ECX and EDX. 4880 Chain = CurDAG->getCopyToReg(Chain, dl, X86::ECX, Node->getOperand(3), 4881 InFlag); 4882 InFlag = Chain.getValue(1); 4883 Chain = CurDAG->getCopyToReg(Chain, dl, X86::EDX, Node->getOperand(4), 4884 InFlag); 4885 InFlag = Chain.getValue(1); 4886 } 4887 4888 MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, 4889 { Chain, InFlag}); 4890 ReplaceNode(Node, CNode); 4891 return; 4892 } 4893 4894 break; 4895 } 4896 case Intrinsic::x86_tilestored64_internal: { 4897 unsigned Opc = X86::PTILESTOREDV; 4898 // _tile_stored_internal(row, col, buf, STRIDE, c) 4899 SDValue Base = Node->getOperand(4); 4900 SDValue Scale = getI8Imm(1, dl); 4901 SDValue Index = Node->getOperand(5); 4902 SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32); 4903 SDValue Segment = CurDAG->getRegister(0, MVT::i16); 4904 SDValue Chain = Node->getOperand(0); 4905 MachineSDNode *CNode; 4906 SDValue Ops[] = {Node->getOperand(2), 4907 Node->getOperand(3), 4908 Base, 4909 Scale, 4910 Index, 4911 Disp, 4912 Segment, 4913 Node->getOperand(6), 4914 Chain}; 4915 CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); 4916 ReplaceNode(Node, CNode); 4917 return; 4918 } 4919 case Intrinsic::x86_tileloadd64: 4920 case Intrinsic::x86_tileloaddt164: 4921 case Intrinsic::x86_tilestored64: { 4922 if (!Subtarget->hasAMXTILE()) 4923 break; 4924 unsigned Opc; 4925 switch (IntNo) { 4926 default: llvm_unreachable("Unexpected intrinsic!"); 4927 case Intrinsic::x86_tileloadd64: Opc = X86::PTILELOADD; break; 4928 case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break; 4929 case Intrinsic::x86_tilestored64: Opc = X86::PTILESTORED; break; 4930 } 4931 // FIXME: Match displacement and scale. 4932 unsigned TIndex = Node->getConstantOperandVal(2); 4933 SDValue TReg = getI8Imm(TIndex, dl); 4934 SDValue Base = Node->getOperand(3); 4935 SDValue Scale = getI8Imm(1, dl); 4936 SDValue Index = Node->getOperand(4); 4937 SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32); 4938 SDValue Segment = CurDAG->getRegister(0, MVT::i16); 4939 SDValue Chain = Node->getOperand(0); 4940 MachineSDNode *CNode; 4941 if (Opc == X86::PTILESTORED) { 4942 SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain }; 4943 CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); 4944 } else { 4945 SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain }; 4946 CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); 4947 } 4948 ReplaceNode(Node, CNode); 4949 return; 4950 } 4951 } 4952 break; 4953 } 4954 case ISD::BRIND: 4955 case X86ISD::NT_BRIND: { 4956 if (Subtarget->isTargetNaCl()) 4957 // NaCl has its own pass where jmp %r32 are converted to jmp %r64. We 4958 // leave the instruction alone. 4959 break; 4960 if (Subtarget->isTarget64BitILP32()) { 4961 // Converts a 32-bit register to a 64-bit, zero-extended version of 4962 // it. This is needed because x86-64 can do many things, but jmp %r32 4963 // ain't one of them. 
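      // In 64-bit mode an indirect jump only accepts a 64-bit register or
      // memory operand, so the 32-bit pointer must be zero-extended first.
      // Illustratively (exact register choice aside), "jmp *%eax" becomes:
      //   movl %eax, %eax   # implicit zero of the upper 32 bits
      //   jmpq *%rax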
4964 SDValue Target = Node->getOperand(1); 4965 assert(Target.getValueType() == MVT::i32 && "Unexpected VT!"); 4966 SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, MVT::i64); 4967 SDValue Brind = CurDAG->getNode(Opcode, dl, MVT::Other, 4968 Node->getOperand(0), ZextTarget); 4969 ReplaceNode(Node, Brind.getNode()); 4970 SelectCode(ZextTarget.getNode()); 4971 SelectCode(Brind.getNode()); 4972 return; 4973 } 4974 break; 4975 } 4976 case X86ISD::GlobalBaseReg: 4977 ReplaceNode(Node, getGlobalBaseReg()); 4978 return; 4979 4980 case ISD::BITCAST: 4981 // Just drop all 128/256/512-bit bitcasts. 4982 if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() || 4983 NVT == MVT::f128) { 4984 ReplaceUses(SDValue(Node, 0), Node->getOperand(0)); 4985 CurDAG->RemoveDeadNode(Node); 4986 return; 4987 } 4988 break; 4989 4990 case ISD::SRL: 4991 if (matchBitExtract(Node)) 4992 return; 4993 [[fallthrough]]; 4994 case ISD::SRA: 4995 case ISD::SHL: 4996 if (tryShiftAmountMod(Node)) 4997 return; 4998 break; 4999 5000 case X86ISD::VPTERNLOG: { 5001 uint8_t Imm = cast<ConstantSDNode>(Node->getOperand(3))->getZExtValue(); 5002 if (matchVPTERNLOG(Node, Node, Node, Node, Node->getOperand(0), 5003 Node->getOperand(1), Node->getOperand(2), Imm)) 5004 return; 5005 break; 5006 } 5007 5008 case X86ISD::ANDNP: 5009 if (tryVPTERNLOG(Node)) 5010 return; 5011 break; 5012 5013 case ISD::AND: 5014 if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) { 5015 // Try to form a masked VPTESTM. Operands can be in either order. 5016 SDValue N0 = Node->getOperand(0); 5017 SDValue N1 = Node->getOperand(1); 5018 if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() && 5019 tryVPTESTM(Node, N0, N1)) 5020 return; 5021 if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() && 5022 tryVPTESTM(Node, N1, N0)) 5023 return; 5024 } 5025 5026 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) { 5027 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0)); 5028 CurDAG->RemoveDeadNode(Node); 5029 return; 5030 } 5031 if (matchBitExtract(Node)) 5032 return; 5033 if (AndImmShrink && shrinkAndImmediate(Node)) 5034 return; 5035 5036 [[fallthrough]]; 5037 case ISD::OR: 5038 case ISD::XOR: 5039 if (tryShrinkShlLogicImm(Node)) 5040 return; 5041 if (Opcode == ISD::OR && tryMatchBitSelect(Node)) 5042 return; 5043 if (tryVPTERNLOG(Node)) 5044 return; 5045 5046 [[fallthrough]]; 5047 case ISD::ADD: 5048 case ISD::SUB: { 5049 // Try to avoid folding immediates with multiple uses for optsize. 5050 // This code tries to select to register form directly to avoid going 5051 // through the isel table which might fold the immediate. We can't change 5052 // the patterns on the add/sub/and/or/xor with immediate paterns in the 5053 // tablegen files to check immediate use count without making the patterns 5054 // unavailable to the fast-isel table. 5055 if (!CurDAG->shouldOptForSize()) 5056 break; 5057 5058 // Only handle i8/i16/i32/i64. 5059 if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64) 5060 break; 5061 5062 SDValue N0 = Node->getOperand(0); 5063 SDValue N1 = Node->getOperand(1); 5064 5065 auto *Cst = dyn_cast<ConstantSDNode>(N1); 5066 if (!Cst) 5067 break; 5068 5069 int64_t Val = Cst->getSExtValue(); 5070 5071 // Make sure its an immediate that is considered foldable. 5072 // FIXME: Handle unsigned 32 bit immediates for 64-bit AND. 5073 if (!isInt<8>(Val) && !isInt<32>(Val)) 5074 break; 5075 5076 // If this can match to INC/DEC, let it go. 
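    // (INC/DEC encode no immediate at all, so there is nothing to shrink and
    // the ordinary patterns already produce the smallest form.)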
5077     if (Opcode == ISD::ADD && (Val == 1 || Val == -1))
5078       break;
5079
5080     // Check if we should avoid folding this immediate.
5081     if (!shouldAvoidImmediateInstFormsForSize(N1.getNode()))
5082       break;
5083
5084     // We should not fold the immediate. So we need a register form instead.
5085     unsigned ROpc, MOpc;
5086     switch (NVT.SimpleTy) {
5087     default: llvm_unreachable("Unexpected VT!");
5088     case MVT::i8:
5089       switch (Opcode) {
5090       default: llvm_unreachable("Unexpected opcode!");
5091       case ISD::ADD: ROpc = X86::ADD8rr; MOpc = X86::ADD8rm; break;
5092       case ISD::SUB: ROpc = X86::SUB8rr; MOpc = X86::SUB8rm; break;
5093       case ISD::AND: ROpc = X86::AND8rr; MOpc = X86::AND8rm; break;
5094       case ISD::OR: ROpc = X86::OR8rr; MOpc = X86::OR8rm; break;
5095       case ISD::XOR: ROpc = X86::XOR8rr; MOpc = X86::XOR8rm; break;
5096       }
5097       break;
5098     case MVT::i16:
5099       switch (Opcode) {
5100       default: llvm_unreachable("Unexpected opcode!");
5101       case ISD::ADD: ROpc = X86::ADD16rr; MOpc = X86::ADD16rm; break;
5102       case ISD::SUB: ROpc = X86::SUB16rr; MOpc = X86::SUB16rm; break;
5103       case ISD::AND: ROpc = X86::AND16rr; MOpc = X86::AND16rm; break;
5104       case ISD::OR: ROpc = X86::OR16rr; MOpc = X86::OR16rm; break;
5105       case ISD::XOR: ROpc = X86::XOR16rr; MOpc = X86::XOR16rm; break;
5106       }
5107       break;
5108     case MVT::i32:
5109       switch (Opcode) {
5110       default: llvm_unreachable("Unexpected opcode!");
5111       case ISD::ADD: ROpc = X86::ADD32rr; MOpc = X86::ADD32rm; break;
5112       case ISD::SUB: ROpc = X86::SUB32rr; MOpc = X86::SUB32rm; break;
5113       case ISD::AND: ROpc = X86::AND32rr; MOpc = X86::AND32rm; break;
5114       case ISD::OR: ROpc = X86::OR32rr; MOpc = X86::OR32rm; break;
5115       case ISD::XOR: ROpc = X86::XOR32rr; MOpc = X86::XOR32rm; break;
5116       }
5117       break;
5118     case MVT::i64:
5119       switch (Opcode) {
5120       default: llvm_unreachable("Unexpected opcode!");
5121       case ISD::ADD: ROpc = X86::ADD64rr; MOpc = X86::ADD64rm; break;
5122       case ISD::SUB: ROpc = X86::SUB64rr; MOpc = X86::SUB64rm; break;
5123       case ISD::AND: ROpc = X86::AND64rr; MOpc = X86::AND64rm; break;
5124       case ISD::OR: ROpc = X86::OR64rr; MOpc = X86::OR64rm; break;
5125       case ISD::XOR: ROpc = X86::XOR64rr; MOpc = X86::XOR64rm; break;
5126       }
5127       break;
5128     }
5129
5130     // OK, this is an AND/OR/XOR/ADD/SUB with a constant.
5131
5132     // If this is not a subtract, we can still try to fold a load.
5133     if (Opcode != ISD::SUB) {
5134       SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5135       if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
5136         SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
5137         SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
5138         MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5139         // Update the chain.
5140         ReplaceUses(N0.getValue(1), SDValue(CNode, 2));
5141         // Record the mem-refs
5142         CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N0)->getMemOperand()});
5143         ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
5144         CurDAG->RemoveDeadNode(Node);
5145         return;
5146       }
5147     }
5148
5149     CurDAG->SelectNodeTo(Node, ROpc, NVT, MVT::i32, N0, N1);
5150     return;
5151   }
5152
5153   case X86ISD::SMUL:
5154     // i16/i32/i64 are handled with isel patterns.
5155     if (NVT != MVT::i8)
5156       break;
5157     [[fallthrough]];
5158   case X86ISD::UMUL: {
5159     SDValue N0 = Node->getOperand(0);
5160     SDValue N1 = Node->getOperand(1);
5161
5162     unsigned LoReg, ROpc, MOpc;
5163     switch (NVT.SimpleTy) {
5164     default: llvm_unreachable("Unsupported VT!");
5165     case MVT::i8:
5166       LoReg = X86::AL;
5167       ROpc = Opcode == X86ISD::SMUL ?
X86::IMUL8r : X86::MUL8r; 5168 MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m; 5169 break; 5170 case MVT::i16: 5171 LoReg = X86::AX; 5172 ROpc = X86::MUL16r; 5173 MOpc = X86::MUL16m; 5174 break; 5175 case MVT::i32: 5176 LoReg = X86::EAX; 5177 ROpc = X86::MUL32r; 5178 MOpc = X86::MUL32m; 5179 break; 5180 case MVT::i64: 5181 LoReg = X86::RAX; 5182 ROpc = X86::MUL64r; 5183 MOpc = X86::MUL64m; 5184 break; 5185 } 5186 5187 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; 5188 bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); 5189 // Multiply is commutative. 5190 if (!FoldedLoad) { 5191 FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); 5192 if (FoldedLoad) 5193 std::swap(N0, N1); 5194 } 5195 5196 SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg, 5197 N0, SDValue()).getValue(1); 5198 5199 MachineSDNode *CNode; 5200 if (FoldedLoad) { 5201 // i16/i32/i64 use an instruction that produces a low and high result even 5202 // though only the low result is used. 5203 SDVTList VTs; 5204 if (NVT == MVT::i8) 5205 VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other); 5206 else 5207 VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other); 5208 5209 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0), 5210 InFlag }; 5211 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); 5212 5213 // Update the chain. 5214 ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 2 : 3)); 5215 // Record the mem-refs 5216 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()}); 5217 } else { 5218 // i16/i32/i64 use an instruction that produces a low and high result even 5219 // though only the low result is used. 5220 SDVTList VTs; 5221 if (NVT == MVT::i8) 5222 VTs = CurDAG->getVTList(NVT, MVT::i32); 5223 else 5224 VTs = CurDAG->getVTList(NVT, NVT, MVT::i32); 5225 5226 CNode = CurDAG->getMachineNode(ROpc, dl, VTs, {N1, InFlag}); 5227 } 5228 5229 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0)); 5230 ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 1 : 2)); 5231 CurDAG->RemoveDeadNode(Node); 5232 return; 5233 } 5234 5235 case ISD::SMUL_LOHI: 5236 case ISD::UMUL_LOHI: { 5237 SDValue N0 = Node->getOperand(0); 5238 SDValue N1 = Node->getOperand(1); 5239 5240 unsigned Opc, MOpc; 5241 unsigned LoReg, HiReg; 5242 bool IsSigned = Opcode == ISD::SMUL_LOHI; 5243 bool UseMULX = !IsSigned && Subtarget->hasBMI2(); 5244 bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty(); 5245 switch (NVT.SimpleTy) { 5246 default: llvm_unreachable("Unsupported VT!"); 5247 case MVT::i32: 5248 Opc = UseMULXHi ? X86::MULX32Hrr : 5249 UseMULX ? X86::MULX32rr : 5250 IsSigned ? X86::IMUL32r : X86::MUL32r; 5251 MOpc = UseMULXHi ? X86::MULX32Hrm : 5252 UseMULX ? X86::MULX32rm : 5253 IsSigned ? X86::IMUL32m : X86::MUL32m; 5254 LoReg = UseMULX ? X86::EDX : X86::EAX; 5255 HiReg = X86::EDX; 5256 break; 5257 case MVT::i64: 5258 Opc = UseMULXHi ? X86::MULX64Hrr : 5259 UseMULX ? X86::MULX64rr : 5260 IsSigned ? X86::IMUL64r : X86::MUL64r; 5261 MOpc = UseMULXHi ? X86::MULX64Hrm : 5262 UseMULX ? X86::MULX64rm : 5263 IsSigned ? X86::IMUL64m : X86::MUL64m; 5264 LoReg = UseMULX ? X86::RDX : X86::RAX; 5265 HiReg = X86::RDX; 5266 break; 5267 } 5268 5269 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; 5270 bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); 5271 // Multiply is commutative. 
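    // Note on the MULX forms chosen above: MULX (BMI2) reads its second source
    // implicitly from EDX/RDX -- which is why LoReg was switched to EDX/RDX --
    // and writes both halves to explicit destinations without clobbering
    // EFLAGS. The "H" forms (e.g. MULX32Hrr) are selected when the low half of
    // the result is dead, so only the high half needs to be defined.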
5272 if (!foldedLoad) { 5273 foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); 5274 if (foldedLoad) 5275 std::swap(N0, N1); 5276 } 5277 5278 SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg, 5279 N0, SDValue()).getValue(1); 5280 SDValue ResHi, ResLo; 5281 if (foldedLoad) { 5282 SDValue Chain; 5283 MachineSDNode *CNode = nullptr; 5284 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0), 5285 InFlag }; 5286 if (UseMULXHi) { 5287 SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other); 5288 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); 5289 ResHi = SDValue(CNode, 0); 5290 Chain = SDValue(CNode, 1); 5291 } else if (UseMULX) { 5292 SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other); 5293 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); 5294 ResHi = SDValue(CNode, 0); 5295 ResLo = SDValue(CNode, 1); 5296 Chain = SDValue(CNode, 2); 5297 } else { 5298 SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue); 5299 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); 5300 Chain = SDValue(CNode, 0); 5301 InFlag = SDValue(CNode, 1); 5302 } 5303 5304 // Update the chain. 5305 ReplaceUses(N1.getValue(1), Chain); 5306 // Record the mem-refs 5307 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()}); 5308 } else { 5309 SDValue Ops[] = { N1, InFlag }; 5310 if (UseMULXHi) { 5311 SDVTList VTs = CurDAG->getVTList(NVT); 5312 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); 5313 ResHi = SDValue(CNode, 0); 5314 } else if (UseMULX) { 5315 SDVTList VTs = CurDAG->getVTList(NVT, NVT); 5316 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); 5317 ResHi = SDValue(CNode, 0); 5318 ResLo = SDValue(CNode, 1); 5319 } else { 5320 SDVTList VTs = CurDAG->getVTList(MVT::Glue); 5321 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); 5322 InFlag = SDValue(CNode, 0); 5323 } 5324 } 5325 5326 // Copy the low half of the result, if it is needed. 5327 if (!SDValue(Node, 0).use_empty()) { 5328 if (!ResLo) { 5329 assert(LoReg && "Register for low half is not defined!"); 5330 ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg, 5331 NVT, InFlag); 5332 InFlag = ResLo.getValue(2); 5333 } 5334 ReplaceUses(SDValue(Node, 0), ResLo); 5335 LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG); 5336 dbgs() << '\n'); 5337 } 5338 // Copy the high half of the result, if it is needed. 
5339 if (!SDValue(Node, 1).use_empty()) { 5340 if (!ResHi) { 5341 assert(HiReg && "Register for high half is not defined!"); 5342 ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg, 5343 NVT, InFlag); 5344 InFlag = ResHi.getValue(2); 5345 } 5346 ReplaceUses(SDValue(Node, 1), ResHi); 5347 LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG); 5348 dbgs() << '\n'); 5349 } 5350 5351 CurDAG->RemoveDeadNode(Node); 5352 return; 5353 } 5354 5355 case ISD::SDIVREM: 5356 case ISD::UDIVREM: { 5357 SDValue N0 = Node->getOperand(0); 5358 SDValue N1 = Node->getOperand(1); 5359 5360 unsigned ROpc, MOpc; 5361 bool isSigned = Opcode == ISD::SDIVREM; 5362 if (!isSigned) { 5363 switch (NVT.SimpleTy) { 5364 default: llvm_unreachable("Unsupported VT!"); 5365 case MVT::i8: ROpc = X86::DIV8r; MOpc = X86::DIV8m; break; 5366 case MVT::i16: ROpc = X86::DIV16r; MOpc = X86::DIV16m; break; 5367 case MVT::i32: ROpc = X86::DIV32r; MOpc = X86::DIV32m; break; 5368 case MVT::i64: ROpc = X86::DIV64r; MOpc = X86::DIV64m; break; 5369 } 5370 } else { 5371 switch (NVT.SimpleTy) { 5372 default: llvm_unreachable("Unsupported VT!"); 5373 case MVT::i8: ROpc = X86::IDIV8r; MOpc = X86::IDIV8m; break; 5374 case MVT::i16: ROpc = X86::IDIV16r; MOpc = X86::IDIV16m; break; 5375 case MVT::i32: ROpc = X86::IDIV32r; MOpc = X86::IDIV32m; break; 5376 case MVT::i64: ROpc = X86::IDIV64r; MOpc = X86::IDIV64m; break; 5377 } 5378 } 5379 5380 unsigned LoReg, HiReg, ClrReg; 5381 unsigned SExtOpcode; 5382 switch (NVT.SimpleTy) { 5383 default: llvm_unreachable("Unsupported VT!"); 5384 case MVT::i8: 5385 LoReg = X86::AL; ClrReg = HiReg = X86::AH; 5386 SExtOpcode = 0; // Not used. 5387 break; 5388 case MVT::i16: 5389 LoReg = X86::AX; HiReg = X86::DX; 5390 ClrReg = X86::DX; 5391 SExtOpcode = X86::CWD; 5392 break; 5393 case MVT::i32: 5394 LoReg = X86::EAX; ClrReg = HiReg = X86::EDX; 5395 SExtOpcode = X86::CDQ; 5396 break; 5397 case MVT::i64: 5398 LoReg = X86::RAX; ClrReg = HiReg = X86::RDX; 5399 SExtOpcode = X86::CQO; 5400 break; 5401 } 5402 5403 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; 5404 bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); 5405 bool signBitIsZero = CurDAG->SignBitIsZero(N0); 5406 5407 SDValue InFlag; 5408 if (NVT == MVT::i8) { 5409 // Special case for div8, just use a move with zero extension to AX to 5410 // clear the upper 8 bits (AH). 5411 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain; 5412 MachineSDNode *Move; 5413 if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { 5414 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) }; 5415 unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8 5416 : X86::MOVZX16rm8; 5417 Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, MVT::Other, Ops); 5418 Chain = SDValue(Move, 1); 5419 ReplaceUses(N0.getValue(1), Chain); 5420 // Record the mem-refs 5421 CurDAG->setNodeMemRefs(Move, {cast<LoadSDNode>(N0)->getMemOperand()}); 5422 } else { 5423 unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8 5424 : X86::MOVZX16rr8; 5425 Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, N0); 5426 Chain = CurDAG->getEntryNode(); 5427 } 5428 Chain = CurDAG->getCopyToReg(Chain, dl, X86::AX, SDValue(Move, 0), 5429 SDValue()); 5430 InFlag = Chain.getValue(1); 5431 } else { 5432 InFlag = 5433 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, 5434 LoReg, N0, SDValue()).getValue(1); 5435 if (isSigned && !signBitIsZero) { 5436 // Sign extend the low part into the high part. 
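        // CWD/CDQ/CQO sign-extend AX/EAX/RAX into DX/EDX/RDX, producing the
        // double-width dividend IDIV expects. The else branch below handles
        // the unsigned (or known-nonnegative) case by zeroing the high
        // register with MOV32r0 instead.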
5437 InFlag = 5438 SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0); 5439 } else { 5440 // Zero out the high part, effectively zero extending the input. 5441 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32); 5442 SDValue ClrNode = SDValue( 5443 CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, std::nullopt), 0); 5444 switch (NVT.SimpleTy) { 5445 case MVT::i16: 5446 ClrNode = 5447 SDValue(CurDAG->getMachineNode( 5448 TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode, 5449 CurDAG->getTargetConstant(X86::sub_16bit, dl, 5450 MVT::i32)), 5451 0); 5452 break; 5453 case MVT::i32: 5454 break; 5455 case MVT::i64: 5456 ClrNode = 5457 SDValue(CurDAG->getMachineNode( 5458 TargetOpcode::SUBREG_TO_REG, dl, MVT::i64, 5459 CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode, 5460 CurDAG->getTargetConstant(X86::sub_32bit, dl, 5461 MVT::i32)), 5462 0); 5463 break; 5464 default: 5465 llvm_unreachable("Unexpected division source"); 5466 } 5467 5468 InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg, 5469 ClrNode, InFlag).getValue(1); 5470 } 5471 } 5472 5473 if (foldedLoad) { 5474 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0), 5475 InFlag }; 5476 MachineSDNode *CNode = 5477 CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops); 5478 InFlag = SDValue(CNode, 1); 5479 // Update the chain. 5480 ReplaceUses(N1.getValue(1), SDValue(CNode, 0)); 5481 // Record the mem-refs 5482 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()}); 5483 } else { 5484 InFlag = 5485 SDValue(CurDAG->getMachineNode(ROpc, dl, MVT::Glue, N1, InFlag), 0); 5486 } 5487 5488 // Prevent use of AH in a REX instruction by explicitly copying it to 5489 // an ABCD_L register. 5490 // 5491 // The current assumption of the register allocator is that isel 5492 // won't generate explicit references to the GR8_ABCD_H registers. If 5493 // the allocator and/or the backend get enhanced to be more robust in 5494 // that regard, this can be, and should be, removed. 5495 if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) { 5496 SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8); 5497 unsigned AHExtOpcode = 5498 isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX; 5499 5500 SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32, 5501 MVT::Glue, AHCopy, InFlag); 5502 SDValue Result(RNode, 0); 5503 InFlag = SDValue(RNode, 1); 5504 5505 Result = 5506 CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result); 5507 5508 ReplaceUses(SDValue(Node, 1), Result); 5509 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); 5510 dbgs() << '\n'); 5511 } 5512 // Copy the division (low) result, if it is needed. 5513 if (!SDValue(Node, 0).use_empty()) { 5514 SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, 5515 LoReg, NVT, InFlag); 5516 InFlag = Result.getValue(2); 5517 ReplaceUses(SDValue(Node, 0), Result); 5518 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); 5519 dbgs() << '\n'); 5520 } 5521 // Copy the remainder (high) result, if it is needed. 
5522 if (!SDValue(Node, 1).use_empty()) { 5523 SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, 5524 HiReg, NVT, InFlag); 5525 InFlag = Result.getValue(2); 5526 ReplaceUses(SDValue(Node, 1), Result); 5527 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); 5528 dbgs() << '\n'); 5529 } 5530 CurDAG->RemoveDeadNode(Node); 5531 return; 5532 } 5533 5534 case X86ISD::FCMP: 5535 case X86ISD::STRICT_FCMP: 5536 case X86ISD::STRICT_FCMPS: { 5537 bool IsStrictCmp = Node->getOpcode() == X86ISD::STRICT_FCMP || 5538 Node->getOpcode() == X86ISD::STRICT_FCMPS; 5539 SDValue N0 = Node->getOperand(IsStrictCmp ? 1 : 0); 5540 SDValue N1 = Node->getOperand(IsStrictCmp ? 2 : 1); 5541 5542 // Save the original VT of the compare. 5543 MVT CmpVT = N0.getSimpleValueType(); 5544 5545 // Floating point needs special handling if we don't have FCOMI. 5546 if (Subtarget->canUseCMOV()) 5547 break; 5548 5549 bool IsSignaling = Node->getOpcode() == X86ISD::STRICT_FCMPS; 5550 5551 unsigned Opc; 5552 switch (CmpVT.SimpleTy) { 5553 default: llvm_unreachable("Unexpected type!"); 5554 case MVT::f32: 5555 Opc = IsSignaling ? X86::COM_Fpr32 : X86::UCOM_Fpr32; 5556 break; 5557 case MVT::f64: 5558 Opc = IsSignaling ? X86::COM_Fpr64 : X86::UCOM_Fpr64; 5559 break; 5560 case MVT::f80: 5561 Opc = IsSignaling ? X86::COM_Fpr80 : X86::UCOM_Fpr80; 5562 break; 5563 } 5564 5565 SDValue Chain = 5566 IsStrictCmp ? Node->getOperand(0) : CurDAG->getEntryNode(); 5567 SDValue Glue; 5568 if (IsStrictCmp) { 5569 SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue); 5570 Chain = SDValue(CurDAG->getMachineNode(Opc, dl, VTs, {N0, N1, Chain}), 0); 5571 Glue = Chain.getValue(1); 5572 } else { 5573 Glue = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N0, N1), 0); 5574 } 5575 5576 // Move FPSW to AX. 5577 SDValue FNSTSW = 5578 SDValue(CurDAG->getMachineNode(X86::FNSTSW16r, dl, MVT::i16, Glue), 0); 5579 5580 // Extract upper 8-bits of AX. 5581 SDValue Extract = 5582 CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl, MVT::i8, FNSTSW); 5583 5584 // Move AH into flags. 5585 // Some 64-bit targets lack SAHF support, but they do support FCOMI. 5586 assert(Subtarget->canUseLAHFSAHF() && 5587 "Target doesn't support SAHF or FCOMI?"); 5588 SDValue AH = CurDAG->getCopyToReg(Chain, dl, X86::AH, Extract, SDValue()); 5589 Chain = AH; 5590 SDValue SAHF = SDValue( 5591 CurDAG->getMachineNode(X86::SAHF, dl, MVT::i32, AH.getValue(1)), 0); 5592 5593 if (IsStrictCmp) 5594 ReplaceUses(SDValue(Node, 1), Chain); 5595 5596 ReplaceUses(SDValue(Node, 0), SAHF); 5597 CurDAG->RemoveDeadNode(Node); 5598 return; 5599 } 5600 5601 case X86ISD::CMP: { 5602 SDValue N0 = Node->getOperand(0); 5603 SDValue N1 = Node->getOperand(1); 5604 5605 // Optimizations for TEST compares. 5606 if (!isNullConstant(N1)) 5607 break; 5608 5609 // Save the original VT of the compare. 5610 MVT CmpVT = N0.getSimpleValueType(); 5611 5612 // If we are comparing (and (shr X, C, Mask) with 0, emit a BEXTR followed 5613 // by a test instruction. The test should be removed later by 5614 // analyzeCompare if we are using only the zero flag. 5615 // TODO: Should we check the users and use the BEXTR flags directly? 5616 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) { 5617 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(N0.getNode())) { 5618 unsigned TestOpc = CmpVT == MVT::i64 ? 
X86::TEST64rr 5619 : X86::TEST32rr; 5620 SDValue BEXTR = SDValue(NewNode, 0); 5621 NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR); 5622 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0)); 5623 CurDAG->RemoveDeadNode(Node); 5624 return; 5625 } 5626 } 5627 5628 // We can peek through truncates, but we need to be careful below. 5629 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse()) 5630 N0 = N0.getOperand(0); 5631 5632 // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to 5633 // use a smaller encoding. 5634 // Look past the truncate if CMP is the only use of it. 5635 if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() && 5636 N0.getValueType() != MVT::i8) { 5637 auto *MaskC = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 5638 if (!MaskC) 5639 break; 5640 5641 // We may have looked through a truncate so mask off any bits that 5642 // shouldn't be part of the compare. 5643 uint64_t Mask = MaskC->getZExtValue(); 5644 Mask &= maskTrailingOnes<uint64_t>(CmpVT.getScalarSizeInBits()); 5645 5646 // Check if we can replace AND+IMM{32,64} with a shift. This is possible 5647 // for masks like 0xFF000000 or 0x00FFFFFF and if we care only about the 5648 // zero flag. 5649 if (CmpVT == MVT::i64 && !isInt<8>(Mask) && isShiftedMask_64(Mask) && 5650 onlyUsesZeroFlag(SDValue(Node, 0))) { 5651 unsigned ShiftOpcode = ISD::DELETED_NODE; 5652 unsigned ShiftAmt; 5653 unsigned SubRegIdx; 5654 MVT SubRegVT; 5655 unsigned TestOpcode; 5656 unsigned LeadingZeros = countLeadingZeros(Mask); 5657 unsigned TrailingZeros = countTrailingZeros(Mask); 5658 5659 // With leading/trailing zeros, the transform is profitable if we can 5660 // eliminate a movabsq or shrink a 32-bit immediate to 8-bit without 5661 // incurring any extra register moves. 5662 bool SavesBytes = !isInt<32>(Mask) || N0.getOperand(0).hasOneUse(); 5663 if (LeadingZeros == 0 && SavesBytes) { 5664 // If the mask covers the most significant bit, then we can replace 5665 // TEST+AND with a SHR and check eflags. 5666 // This emits a redundant TEST which is subsequently eliminated. 5667 ShiftOpcode = X86::SHR64ri; 5668 ShiftAmt = TrailingZeros; 5669 SubRegIdx = 0; 5670 TestOpcode = X86::TEST64rr; 5671 } else if (TrailingZeros == 0 && SavesBytes) { 5672 // If the mask covers the least significant bit, then we can replace 5673 // TEST+AND with a SHL and check eflags. 5674 // This emits a redundant TEST which is subsequently eliminated. 5675 ShiftOpcode = X86::SHL64ri; 5676 ShiftAmt = LeadingZeros; 5677 SubRegIdx = 0; 5678 TestOpcode = X86::TEST64rr; 5679 } else if (MaskC->hasOneUse() && !isInt<32>(Mask)) { 5680 // If the shifted mask extends into the high half and is 8/16/32 bits 5681 // wide, then replace it with a SHR and a TEST8rr/TEST16rr/TEST32rr. 
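            // For example, for Mask == 0x0000FFFF00000000 (16 set bits, 32
            // trailing zeros) the 64-bit immediate AND+TEST becomes, roughly:
            //   shrq $32, %rax
            //   testw %ax, %ax
            // i.e. SHR64ri by TrailingZeros followed by a TEST of the
            // sub_16bit subregister.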
5682 unsigned PopCount = 64 - LeadingZeros - TrailingZeros; 5683 if (PopCount == 8) { 5684 ShiftOpcode = X86::SHR64ri; 5685 ShiftAmt = TrailingZeros; 5686 SubRegIdx = X86::sub_8bit; 5687 SubRegVT = MVT::i8; 5688 TestOpcode = X86::TEST8rr; 5689 } else if (PopCount == 16) { 5690 ShiftOpcode = X86::SHR64ri; 5691 ShiftAmt = TrailingZeros; 5692 SubRegIdx = X86::sub_16bit; 5693 SubRegVT = MVT::i16; 5694 TestOpcode = X86::TEST16rr; 5695 } else if (PopCount == 32) { 5696 ShiftOpcode = X86::SHR64ri; 5697 ShiftAmt = TrailingZeros; 5698 SubRegIdx = X86::sub_32bit; 5699 SubRegVT = MVT::i32; 5700 TestOpcode = X86::TEST32rr; 5701 } 5702 } 5703 if (ShiftOpcode != ISD::DELETED_NODE) { 5704 SDValue ShiftC = CurDAG->getTargetConstant(ShiftAmt, dl, MVT::i64); 5705 SDValue Shift = SDValue( 5706 CurDAG->getMachineNode(ShiftOpcode, dl, MVT::i64, MVT::i32, 5707 N0.getOperand(0), ShiftC), 5708 0); 5709 if (SubRegIdx != 0) { 5710 Shift = 5711 CurDAG->getTargetExtractSubreg(SubRegIdx, dl, SubRegVT, Shift); 5712 } 5713 MachineSDNode *Test = 5714 CurDAG->getMachineNode(TestOpcode, dl, MVT::i32, Shift, Shift); 5715 ReplaceNode(Node, Test); 5716 return; 5717 } 5718 } 5719 5720 MVT VT; 5721 int SubRegOp; 5722 unsigned ROpc, MOpc; 5723 5724 // For each of these checks we need to be careful if the sign flag is 5725 // being used. It is only safe to use the sign flag in two conditions, 5726 // either the sign bit in the shrunken mask is zero or the final test 5727 // size is equal to the original compare size. 5728 5729 if (isUInt<8>(Mask) && 5730 (!(Mask & 0x80) || CmpVT == MVT::i8 || 5731 hasNoSignFlagUses(SDValue(Node, 0)))) { 5732 // For example, convert "testl %eax, $8" to "testb %al, $8" 5733 VT = MVT::i8; 5734 SubRegOp = X86::sub_8bit; 5735 ROpc = X86::TEST8ri; 5736 MOpc = X86::TEST8mi; 5737 } else if (OptForMinSize && isUInt<16>(Mask) && 5738 (!(Mask & 0x8000) || CmpVT == MVT::i16 || 5739 hasNoSignFlagUses(SDValue(Node, 0)))) { 5740 // For example, "testl %eax, $32776" to "testw %ax, $32776". 5741 // NOTE: We only want to form TESTW instructions if optimizing for 5742 // min size. Otherwise we only save one byte and possibly get a length 5743 // changing prefix penalty in the decoders. 5744 VT = MVT::i16; 5745 SubRegOp = X86::sub_16bit; 5746 ROpc = X86::TEST16ri; 5747 MOpc = X86::TEST16mi; 5748 } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 && 5749 ((!(Mask & 0x80000000) && 5750 // Without minsize 16-bit Cmps can get here so we need to 5751 // be sure we calculate the correct sign flag if needed. 5752 (CmpVT != MVT::i16 || !(Mask & 0x8000))) || 5753 CmpVT == MVT::i32 || 5754 hasNoSignFlagUses(SDValue(Node, 0)))) { 5755 // For example, "testq %rax, $268468232" to "testl %eax, $268468232". 5756 // NOTE: We only want to run that transform if N0 is 32 or 64 bits. 5757 // Otherwize, we find ourselves in a position where we have to do 5758 // promotion. If previous passes did not promote the and, we assume 5759 // they had a good reason not to and do not promote here. 5760 VT = MVT::i32; 5761 SubRegOp = X86::sub_32bit; 5762 ROpc = X86::TEST32ri; 5763 MOpc = X86::TEST32mi; 5764 } else { 5765 // No eligible transformation was found. 5766 break; 5767 } 5768 5769 SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT); 5770 SDValue Reg = N0.getOperand(0); 5771 5772 // Emit a testl or testw. 
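      // Why the sign-flag restrictions above matter: "testb $0x80, %al" sets SF
      // from bit 7 of the result, while "testl $0x80, %eax" always leaves SF
      // clear, so the narrowed test is only safe when the shrunken mask's sign
      // bit is zero, the test width matches the original compare width, or no
      // user of the flags reads SF.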
5773 MachineSDNode *NewNode; 5774 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; 5775 if (tryFoldLoad(Node, N0.getNode(), Reg, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { 5776 if (auto *LoadN = dyn_cast<LoadSDNode>(N0.getOperand(0).getNode())) { 5777 if (!LoadN->isSimple()) { 5778 unsigned NumVolBits = LoadN->getValueType(0).getSizeInBits(); 5779 if ((MOpc == X86::TEST8mi && NumVolBits != 8) || 5780 (MOpc == X86::TEST16mi && NumVolBits != 16) || 5781 (MOpc == X86::TEST32mi && NumVolBits != 32)) 5782 break; 5783 } 5784 } 5785 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm, 5786 Reg.getOperand(0) }; 5787 NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops); 5788 // Update the chain. 5789 ReplaceUses(Reg.getValue(1), SDValue(NewNode, 1)); 5790 // Record the mem-refs 5791 CurDAG->setNodeMemRefs(NewNode, 5792 {cast<LoadSDNode>(Reg)->getMemOperand()}); 5793 } else { 5794 // Extract the subregister if necessary. 5795 if (N0.getValueType() != VT) 5796 Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg); 5797 5798 NewNode = CurDAG->getMachineNode(ROpc, dl, MVT::i32, Reg, Imm); 5799 } 5800 // Replace CMP with TEST. 5801 ReplaceNode(Node, NewNode); 5802 return; 5803 } 5804 break; 5805 } 5806 case X86ISD::PCMPISTR: { 5807 if (!Subtarget->hasSSE42()) 5808 break; 5809 5810 bool NeedIndex = !SDValue(Node, 0).use_empty(); 5811 bool NeedMask = !SDValue(Node, 1).use_empty(); 5812 // We can't fold a load if we are going to make two instructions. 5813 bool MayFoldLoad = !NeedIndex || !NeedMask; 5814 5815 MachineSDNode *CNode; 5816 if (NeedMask) { 5817 unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrr : X86::PCMPISTRMrr; 5818 unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrm : X86::PCMPISTRMrm; 5819 CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node); 5820 ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0)); 5821 } 5822 if (NeedIndex || !NeedMask) { 5823 unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrr : X86::PCMPISTRIrr; 5824 unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrm : X86::PCMPISTRIrm; 5825 CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node); 5826 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0)); 5827 } 5828 5829 // Connect the flag usage to the last instruction created. 5830 ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1)); 5831 CurDAG->RemoveDeadNode(Node); 5832 return; 5833 } 5834 case X86ISD::PCMPESTR: { 5835 if (!Subtarget->hasSSE42()) 5836 break; 5837 5838 // Copy the two implicit register inputs. 5839 SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX, 5840 Node->getOperand(1), 5841 SDValue()).getValue(1); 5842 InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX, 5843 Node->getOperand(3), InFlag).getValue(1); 5844 5845 bool NeedIndex = !SDValue(Node, 0).use_empty(); 5846 bool NeedMask = !SDValue(Node, 1).use_empty(); 5847 // We can't fold a load if we are going to make two instructions. 5848 bool MayFoldLoad = !NeedIndex || !NeedMask; 5849 5850 MachineSDNode *CNode; 5851 if (NeedMask) { 5852 unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrr : X86::PCMPESTRMrr; 5853 unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrm : X86::PCMPESTRMrm; 5854 CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node, 5855 InFlag); 5856 ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0)); 5857 } 5858 if (NeedIndex || !NeedMask) { 5859 unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrr : X86::PCMPESTRIrr; 5860 unsigned MOpc = Subtarget->hasAVX() ? 
X86::VPCMPESTRIrm : X86::PCMPESTRIrm; 5861 CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InFlag); 5862 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0)); 5863 } 5864 // Connect the flag usage to the last instruction created. 5865 ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1)); 5866 CurDAG->RemoveDeadNode(Node); 5867 return; 5868 } 5869 5870 case ISD::SETCC: { 5871 if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue())) 5872 return; 5873 5874 break; 5875 } 5876 5877 case ISD::STORE: 5878 if (foldLoadStoreIntoMemOperand(Node)) 5879 return; 5880 break; 5881 5882 case X86ISD::SETCC_CARRY: { 5883 MVT VT = Node->getSimpleValueType(0); 5884 SDValue Result; 5885 if (Subtarget->hasSBBDepBreaking()) { 5886 // We have to do this manually because tblgen will put the eflags copy in 5887 // the wrong place if we use an extract_subreg in the pattern. 5888 // Copy flags to the EFLAGS register and glue it to next node. 5889 SDValue EFLAGS = 5890 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS, 5891 Node->getOperand(1), SDValue()); 5892 5893 // Create a 64-bit instruction if the result is 64-bits otherwise use the 5894 // 32-bit version. 5895 unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r; 5896 MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32; 5897 Result = SDValue( 5898 CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)), 5899 0); 5900 } else { 5901 // The target does not recognize sbb with the same reg operand as a 5902 // no-source idiom, so we explicitly zero the input values. 5903 Result = getSBBZero(Node); 5904 } 5905 5906 // For less than 32-bits we need to extract from the 32-bit node. 5907 if (VT == MVT::i8 || VT == MVT::i16) { 5908 int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit; 5909 Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result); 5910 } 5911 5912 ReplaceUses(SDValue(Node, 0), Result); 5913 CurDAG->RemoveDeadNode(Node); 5914 return; 5915 } 5916 case X86ISD::SBB: { 5917 if (isNullConstant(Node->getOperand(0)) && 5918 isNullConstant(Node->getOperand(1))) { 5919 SDValue Result = getSBBZero(Node); 5920 5921 // Replace the flag use. 5922 ReplaceUses(SDValue(Node, 1), Result.getValue(1)); 5923 5924 // Replace the result use. 5925 if (!SDValue(Node, 0).use_empty()) { 5926 // For less than 32-bits we need to extract from the 32-bit node. 5927 MVT VT = Node->getSimpleValueType(0); 5928 if (VT == MVT::i8 || VT == MVT::i16) { 5929 int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit; 5930 Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result); 5931 } 5932 ReplaceUses(SDValue(Node, 0), Result); 5933 } 5934 5935 CurDAG->RemoveDeadNode(Node); 5936 return; 5937 } 5938 break; 5939 } 5940 case X86ISD::MGATHER: { 5941 auto *Mgt = cast<X86MaskedGatherSDNode>(Node); 5942 SDValue IndexOp = Mgt->getIndex(); 5943 SDValue Mask = Mgt->getMask(); 5944 MVT IndexVT = IndexOp.getSimpleValueType(); 5945 MVT ValueVT = Node->getSimpleValueType(0); 5946 MVT MaskVT = Mask.getSimpleValueType(); 5947 5948 // This is just to prevent crashes if the nodes are malformed somehow. We're 5949 // otherwise only doing loose type checking in here based on type what 5950 // a type constraint would say just like table based isel. 
5951 if (!ValueVT.isVector() || !MaskVT.isVector()) 5952 break; 5953 5954 unsigned NumElts = ValueVT.getVectorNumElements(); 5955 MVT ValueSVT = ValueVT.getVectorElementType(); 5956 5957 bool IsFP = ValueSVT.isFloatingPoint(); 5958 unsigned EltSize = ValueSVT.getSizeInBits(); 5959 5960 unsigned Opc = 0; 5961 bool AVX512Gather = MaskVT.getVectorElementType() == MVT::i1; 5962 if (AVX512Gather) { 5963 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32) 5964 Opc = IsFP ? X86::VGATHERDPSZ128rm : X86::VPGATHERDDZ128rm; 5965 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32) 5966 Opc = IsFP ? X86::VGATHERDPSZ256rm : X86::VPGATHERDDZ256rm; 5967 else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32) 5968 Opc = IsFP ? X86::VGATHERDPSZrm : X86::VPGATHERDDZrm; 5969 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64) 5970 Opc = IsFP ? X86::VGATHERDPDZ128rm : X86::VPGATHERDQZ128rm; 5971 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64) 5972 Opc = IsFP ? X86::VGATHERDPDZ256rm : X86::VPGATHERDQZ256rm; 5973 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64) 5974 Opc = IsFP ? X86::VGATHERDPDZrm : X86::VPGATHERDQZrm; 5975 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32) 5976 Opc = IsFP ? X86::VGATHERQPSZ128rm : X86::VPGATHERQDZ128rm; 5977 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32) 5978 Opc = IsFP ? X86::VGATHERQPSZ256rm : X86::VPGATHERQDZ256rm; 5979 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32) 5980 Opc = IsFP ? X86::VGATHERQPSZrm : X86::VPGATHERQDZrm; 5981 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64) 5982 Opc = IsFP ? X86::VGATHERQPDZ128rm : X86::VPGATHERQQZ128rm; 5983 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64) 5984 Opc = IsFP ? X86::VGATHERQPDZ256rm : X86::VPGATHERQQZ256rm; 5985 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64) 5986 Opc = IsFP ? X86::VGATHERQPDZrm : X86::VPGATHERQQZrm; 5987 } else { 5988 assert(EVT(MaskVT) == EVT(ValueVT).changeVectorElementTypeToInteger() && 5989 "Unexpected mask VT!"); 5990 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32) 5991 Opc = IsFP ? X86::VGATHERDPSrm : X86::VPGATHERDDrm; 5992 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32) 5993 Opc = IsFP ? X86::VGATHERDPSYrm : X86::VPGATHERDDYrm; 5994 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64) 5995 Opc = IsFP ? X86::VGATHERDPDrm : X86::VPGATHERDQrm; 5996 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64) 5997 Opc = IsFP ? X86::VGATHERDPDYrm : X86::VPGATHERDQYrm; 5998 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32) 5999 Opc = IsFP ? X86::VGATHERQPSrm : X86::VPGATHERQDrm; 6000 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32) 6001 Opc = IsFP ? X86::VGATHERQPSYrm : X86::VPGATHERQDYrm; 6002 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64) 6003 Opc = IsFP ? X86::VGATHERQPDrm : X86::VPGATHERQQrm; 6004 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64) 6005 Opc = IsFP ? X86::VGATHERQPDYrm : X86::VPGATHERQQYrm; 6006 } 6007 6008 if (!Opc) 6009 break; 6010 6011 SDValue Base, Scale, Index, Disp, Segment; 6012 if (!selectVectorAddr(Mgt, Mgt->getBasePtr(), IndexOp, Mgt->getScale(), 6013 Base, Scale, Index, Disp, Segment)) 6014 break; 6015 6016 SDValue PassThru = Mgt->getPassThru(); 6017 SDValue Chain = Mgt->getChain(); 6018 // Gather instructions have a mask output not in the ISD node. 
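    // Concretely, the machine gather also defines the mask register (the
    // hardware clears mask bits as elements complete), so the VT list below is
    // {ValueVT, MaskVT, Other} and the ISD node's chain result (result 1) maps
    // to result 2 of the new machine node.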
6019     SDVTList VTs = CurDAG->getVTList(ValueVT, MaskVT, MVT::Other);
6020
6021     MachineSDNode *NewNode;
6022     if (AVX512Gather) {
6023       SDValue Ops[] = {PassThru, Mask, Base, Scale,
6024                        Index, Disp, Segment, Chain};
6025       NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
6026     } else {
6027       SDValue Ops[] = {PassThru, Base, Scale, Index,
6028                        Disp, Segment, Mask, Chain};
6029       NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
6030     }
6031     CurDAG->setNodeMemRefs(NewNode, {Mgt->getMemOperand()});
6032     ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
6033     ReplaceUses(SDValue(Node, 1), SDValue(NewNode, 2));
6034     CurDAG->RemoveDeadNode(Node);
6035     return;
6036   }
6037   case X86ISD::MSCATTER: {
6038     auto *Sc = cast<X86MaskedScatterSDNode>(Node);
6039     SDValue Value = Sc->getValue();
6040     SDValue IndexOp = Sc->getIndex();
6041     MVT IndexVT = IndexOp.getSimpleValueType();
6042     MVT ValueVT = Value.getSimpleValueType();
6043
6044     // This is just to prevent crashes if the nodes are malformed somehow. We're
6045     // otherwise only doing loose type checking here, based on what a type
6046     // constraint would say, just like table-based isel.
6047     if (!ValueVT.isVector())
6048       break;
6049
6050     unsigned NumElts = ValueVT.getVectorNumElements();
6051     MVT ValueSVT = ValueVT.getVectorElementType();
6052
6053     bool IsFP = ValueSVT.isFloatingPoint();
6054     unsigned EltSize = ValueSVT.getSizeInBits();
6055
6056     unsigned Opc;
6057     if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6058       Opc = IsFP ? X86::VSCATTERDPSZ128mr : X86::VPSCATTERDDZ128mr;
6059     else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6060       Opc = IsFP ? X86::VSCATTERDPSZ256mr : X86::VPSCATTERDDZ256mr;
6061     else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
6062       Opc = IsFP ? X86::VSCATTERDPSZmr : X86::VPSCATTERDDZmr;
6063     else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6064       Opc = IsFP ? X86::VSCATTERDPDZ128mr : X86::VPSCATTERDQZ128mr;
6065     else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6066       Opc = IsFP ? X86::VSCATTERDPDZ256mr : X86::VPSCATTERDQZ256mr;
6067     else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
6068       Opc = IsFP ? X86::VSCATTERDPDZmr : X86::VPSCATTERDQZmr;
6069     else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6070       Opc = IsFP ? X86::VSCATTERQPSZ128mr : X86::VPSCATTERQDZ128mr;
6071     else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6072       Opc = IsFP ? X86::VSCATTERQPSZ256mr : X86::VPSCATTERQDZ256mr;
6073     else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
6074       Opc = IsFP ? X86::VSCATTERQPSZmr : X86::VPSCATTERQDZmr;
6075     else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6076       Opc = IsFP ? X86::VSCATTERQPDZ128mr : X86::VPSCATTERQQZ128mr;
6077     else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6078       Opc = IsFP ? X86::VSCATTERQPDZ256mr : X86::VPSCATTERQQZ256mr;
6079     else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
6080       Opc = IsFP ? X86::VSCATTERQPDZmr : X86::VPSCATTERQQZmr;
6081     else
6082       break;
6083
6084     SDValue Base, Scale, Index, Disp, Segment;
6085     if (!selectVectorAddr(Sc, Sc->getBasePtr(), IndexOp, Sc->getScale(),
6086                           Base, Scale, Index, Disp, Segment))
6087       break;
6088
6089     SDValue Mask = Sc->getMask();
6090     SDValue Chain = Sc->getChain();
6091     // Scatter instructions have a mask output not in the ISD node.
6092 SDVTList VTs = CurDAG->getVTList(Mask.getValueType(), MVT::Other); 6093 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Value, Chain}; 6094 6095 MachineSDNode *NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops); 6096 CurDAG->setNodeMemRefs(NewNode, {Sc->getMemOperand()}); 6097 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 1)); 6098 CurDAG->RemoveDeadNode(Node); 6099 return; 6100 } 6101 case ISD::PREALLOCATED_SETUP: { 6102 auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); 6103 auto CallId = MFI->getPreallocatedIdForCallSite( 6104 cast<SrcValueSDNode>(Node->getOperand(1))->getValue()); 6105 SDValue Chain = Node->getOperand(0); 6106 SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32); 6107 MachineSDNode *New = CurDAG->getMachineNode( 6108 TargetOpcode::PREALLOCATED_SETUP, dl, MVT::Other, CallIdValue, Chain); 6109 ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Chain 6110 CurDAG->RemoveDeadNode(Node); 6111 return; 6112 } 6113 case ISD::PREALLOCATED_ARG: { 6114 auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); 6115 auto CallId = MFI->getPreallocatedIdForCallSite( 6116 cast<SrcValueSDNode>(Node->getOperand(1))->getValue()); 6117 SDValue Chain = Node->getOperand(0); 6118 SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32); 6119 SDValue ArgIndex = Node->getOperand(2); 6120 SDValue Ops[3]; 6121 Ops[0] = CallIdValue; 6122 Ops[1] = ArgIndex; 6123 Ops[2] = Chain; 6124 MachineSDNode *New = CurDAG->getMachineNode( 6125 TargetOpcode::PREALLOCATED_ARG, dl, 6126 CurDAG->getVTList(TLI->getPointerTy(CurDAG->getDataLayout()), 6127 MVT::Other), 6128 Ops); 6129 ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Arg pointer 6130 ReplaceUses(SDValue(Node, 1), SDValue(New, 1)); // Chain 6131 CurDAG->RemoveDeadNode(Node); 6132 return; 6133 } 6134 case X86ISD::AESENCWIDE128KL: 6135 case X86ISD::AESDECWIDE128KL: 6136 case X86ISD::AESENCWIDE256KL: 6137 case X86ISD::AESDECWIDE256KL: { 6138 if (!Subtarget->hasWIDEKL()) 6139 break; 6140 6141 unsigned Opcode; 6142 switch (Node->getOpcode()) { 6143 default: 6144 llvm_unreachable("Unexpected opcode!"); 6145 case X86ISD::AESENCWIDE128KL: 6146 Opcode = X86::AESENCWIDE128KL; 6147 break; 6148 case X86ISD::AESDECWIDE128KL: 6149 Opcode = X86::AESDECWIDE128KL; 6150 break; 6151 case X86ISD::AESENCWIDE256KL: 6152 Opcode = X86::AESENCWIDE256KL; 6153 break; 6154 case X86ISD::AESDECWIDE256KL: 6155 Opcode = X86::AESDECWIDE256KL; 6156 break; 6157 } 6158 6159 SDValue Chain = Node->getOperand(0); 6160 SDValue Addr = Node->getOperand(1); 6161 6162 SDValue Base, Scale, Index, Disp, Segment; 6163 if (!selectAddr(Node, Addr, Base, Scale, Index, Disp, Segment)) 6164 break; 6165 6166 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(2), 6167 SDValue()); 6168 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(3), 6169 Chain.getValue(1)); 6170 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM2, Node->getOperand(4), 6171 Chain.getValue(1)); 6172 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM3, Node->getOperand(5), 6173 Chain.getValue(1)); 6174 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM4, Node->getOperand(6), 6175 Chain.getValue(1)); 6176 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM5, Node->getOperand(7), 6177 Chain.getValue(1)); 6178 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM6, Node->getOperand(8), 6179 Chain.getValue(1)); 6180 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM7, Node->getOperand(9), 6181 Chain.getValue(1)); 6182 6183 
MachineSDNode *Res = CurDAG->getMachineNode( 6184 Opcode, dl, Node->getVTList(), 6185 {Base, Scale, Index, Disp, Segment, Chain, Chain.getValue(1)}); 6186 CurDAG->setNodeMemRefs(Res, cast<MemSDNode>(Node)->getMemOperand()); 6187 ReplaceNode(Node, Res); 6188 return; 6189 } 6190 } 6191 6192 SelectCode(Node); 6193 } 6194 6195 bool X86DAGToDAGISel:: 6196 SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, 6197 std::vector<SDValue> &OutOps) { 6198 SDValue Op0, Op1, Op2, Op3, Op4; 6199 switch (ConstraintID) { 6200 default: 6201 llvm_unreachable("Unexpected asm memory constraint"); 6202 case InlineAsm::Constraint_o: // offsetable ?? 6203 case InlineAsm::Constraint_v: // not offsetable ?? 6204 case InlineAsm::Constraint_m: // memory 6205 case InlineAsm::Constraint_X: 6206 case InlineAsm::Constraint_p: // address 6207 if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4)) 6208 return true; 6209 break; 6210 } 6211 6212 OutOps.push_back(Op0); 6213 OutOps.push_back(Op1); 6214 OutOps.push_back(Op2); 6215 OutOps.push_back(Op3); 6216 OutOps.push_back(Op4); 6217 return false; 6218 } 6219 6220 /// This pass converts a legalized DAG into a X86-specific DAG, 6221 /// ready for instruction scheduling. 6222 FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM, 6223 CodeGenOpt::Level OptLevel) { 6224 return new X86DAGToDAGISel(TM, OptLevel); 6225 } 6226
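// For reference, the X86 pass pipeline creates this selector from its
// addInstSelector() hook with a call along the lines of (illustrative):
//   addPass(createX86ISelDag(getX86TargetMachine(), getOptLevel()));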