1 //===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file defines a DAG pattern matching instruction selector for X86, 10 // converting from a legalized dag to a X86 dag. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "X86.h" 15 #include "X86MachineFunctionInfo.h" 16 #include "X86RegisterInfo.h" 17 #include "X86Subtarget.h" 18 #include "X86TargetMachine.h" 19 #include "llvm/ADT/Statistic.h" 20 #include "llvm/CodeGen/SelectionDAGISel.h" 21 #include "llvm/Config/llvm-config.h" 22 #include "llvm/IR/ConstantRange.h" 23 #include "llvm/IR/Function.h" 24 #include "llvm/IR/Instructions.h" 25 #include "llvm/IR/Intrinsics.h" 26 #include "llvm/IR/IntrinsicsX86.h" 27 #include "llvm/IR/Type.h" 28 #include "llvm/Support/Debug.h" 29 #include "llvm/Support/ErrorHandling.h" 30 #include "llvm/Support/KnownBits.h" 31 #include "llvm/Support/MathExtras.h" 32 #include <stdint.h> 33 using namespace llvm; 34 35 #define DEBUG_TYPE "x86-isel" 36 37 STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor"); 38 39 static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(true), 40 cl::desc("Enable setting constant bits to reduce size of mask immediates"), 41 cl::Hidden); 42 43 static cl::opt<bool> EnablePromoteAnyextLoad( 44 "x86-promote-anyext-load", cl::init(true), 45 cl::desc("Enable promoting aligned anyext load to wider load"), cl::Hidden); 46 47 //===----------------------------------------------------------------------===// 48 // Pattern Matcher Implementation 49 //===----------------------------------------------------------------------===// 50 51 namespace { 52 /// This corresponds to X86AddressMode, but uses SDValue's instead of register 53 /// numbers for the leaves of the matched tree. 54 struct X86ISelAddressMode { 55 enum { 56 RegBase, 57 FrameIndexBase 58 } BaseType; 59 60 // This is really a union, discriminated by BaseType! 61 SDValue Base_Reg; 62 int Base_FrameIndex; 63 64 unsigned Scale; 65 SDValue IndexReg; 66 int32_t Disp; 67 SDValue Segment; 68 const GlobalValue *GV; 69 const Constant *CP; 70 const BlockAddress *BlockAddr; 71 const char *ES; 72 MCSymbol *MCSym; 73 int JT; 74 Align Alignment; // CP alignment. 75 unsigned char SymbolFlags; // X86II::MO_* 76 bool NegateIndex = false; 77 78 X86ISelAddressMode() 79 : BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0), 80 Segment(), GV(nullptr), CP(nullptr), BlockAddr(nullptr), ES(nullptr), 81 MCSym(nullptr), JT(-1), SymbolFlags(X86II::MO_NO_FLAG) {} 82 83 bool hasSymbolicDisplacement() const { 84 return GV != nullptr || CP != nullptr || ES != nullptr || 85 MCSym != nullptr || JT != -1 || BlockAddr != nullptr; 86 } 87 88 bool hasBaseOrIndexReg() const { 89 return BaseType == FrameIndexBase || 90 IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr; 91 } 92 93 /// Return true if this addressing mode is already RIP-relative. 
94 bool isRIPRelative() const { 95 if (BaseType != RegBase) return false; 96 if (RegisterSDNode *RegNode = 97 dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode())) 98 return RegNode->getReg() == X86::RIP; 99 return false; 100 } 101 102 void setBaseReg(SDValue Reg) { 103 BaseType = RegBase; 104 Base_Reg = Reg; 105 } 106 107 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 108 void dump(SelectionDAG *DAG = nullptr) { 109 dbgs() << "X86ISelAddressMode " << this << '\n'; 110 dbgs() << "Base_Reg "; 111 if (Base_Reg.getNode()) 112 Base_Reg.getNode()->dump(DAG); 113 else 114 dbgs() << "nul\n"; 115 if (BaseType == FrameIndexBase) 116 dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n'; 117 dbgs() << " Scale " << Scale << '\n' 118 << "IndexReg "; 119 if (NegateIndex) 120 dbgs() << "negate "; 121 if (IndexReg.getNode()) 122 IndexReg.getNode()->dump(DAG); 123 else 124 dbgs() << "nul\n"; 125 dbgs() << " Disp " << Disp << '\n' 126 << "GV "; 127 if (GV) 128 GV->dump(); 129 else 130 dbgs() << "nul"; 131 dbgs() << " CP "; 132 if (CP) 133 CP->dump(); 134 else 135 dbgs() << "nul"; 136 dbgs() << '\n' 137 << "ES "; 138 if (ES) 139 dbgs() << ES; 140 else 141 dbgs() << "nul"; 142 dbgs() << " MCSym "; 143 if (MCSym) 144 dbgs() << MCSym; 145 else 146 dbgs() << "nul"; 147 dbgs() << " JT" << JT << " Align" << Alignment.value() << '\n'; 148 } 149 #endif 150 }; 151 } 152 153 namespace { 154 //===--------------------------------------------------------------------===// 155 /// ISel - X86-specific code to select X86 machine instructions for 156 /// SelectionDAG operations. 157 /// 158 class X86DAGToDAGISel final : public SelectionDAGISel { 159 /// Keep a pointer to the X86Subtarget around so that we can 160 /// make the right decision when generating code for different targets. 161 const X86Subtarget *Subtarget; 162 163 /// If true, selector should try to optimize for minimum code size. 164 bool OptForMinSize; 165 166 /// Disable direct TLS access through segment registers. 167 bool IndirectTlsSegRefs; 168 169 public: 170 explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel) 171 : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr), 172 OptForMinSize(false), IndirectTlsSegRefs(false) {} 173 174 StringRef getPassName() const override { 175 return "X86 DAG->DAG Instruction Selection"; 176 } 177 178 bool runOnMachineFunction(MachineFunction &MF) override { 179 // Reset the subtarget each time through. 180 Subtarget = &MF.getSubtarget<X86Subtarget>(); 181 IndirectTlsSegRefs = MF.getFunction().hasFnAttribute( 182 "indirect-tls-seg-refs"); 183 184 // OptFor[Min]Size are used in pattern predicates that isel is matching. 185 OptForMinSize = MF.getFunction().hasMinSize(); 186 assert((!OptForMinSize || MF.getFunction().hasOptSize()) && 187 "OptForMinSize implies OptForSize"); 188 189 SelectionDAGISel::runOnMachineFunction(MF); 190 return true; 191 } 192 193 void emitFunctionEntryCode() override; 194 195 bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override; 196 197 void PreprocessISelDAG() override; 198 void PostprocessISelDAG() override; 199 200 // Include the pieces autogenerated from the target description. 
201 #include "X86GenDAGISel.inc" 202 203 private: 204 void Select(SDNode *N) override; 205 206 bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM); 207 bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM); 208 bool matchWrapper(SDValue N, X86ISelAddressMode &AM); 209 bool matchAddress(SDValue N, X86ISelAddressMode &AM); 210 bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM); 211 bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth); 212 bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, 213 unsigned Depth); 214 bool matchAddressBase(SDValue N, X86ISelAddressMode &AM); 215 bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base, 216 SDValue &Scale, SDValue &Index, SDValue &Disp, 217 SDValue &Segment); 218 bool selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, SDValue IndexOp, 219 SDValue ScaleOp, SDValue &Base, SDValue &Scale, 220 SDValue &Index, SDValue &Disp, SDValue &Segment); 221 bool selectMOV64Imm32(SDValue N, SDValue &Imm); 222 bool selectLEAAddr(SDValue N, SDValue &Base, 223 SDValue &Scale, SDValue &Index, SDValue &Disp, 224 SDValue &Segment); 225 bool selectLEA64_32Addr(SDValue N, SDValue &Base, 226 SDValue &Scale, SDValue &Index, SDValue &Disp, 227 SDValue &Segment); 228 bool selectTLSADDRAddr(SDValue N, SDValue &Base, 229 SDValue &Scale, SDValue &Index, SDValue &Disp, 230 SDValue &Segment); 231 bool selectRelocImm(SDValue N, SDValue &Op); 232 233 bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N, 234 SDValue &Base, SDValue &Scale, 235 SDValue &Index, SDValue &Disp, 236 SDValue &Segment); 237 238 // Convenience method where P is also root. 239 bool tryFoldLoad(SDNode *P, SDValue N, 240 SDValue &Base, SDValue &Scale, 241 SDValue &Index, SDValue &Disp, 242 SDValue &Segment) { 243 return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment); 244 } 245 246 bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N, 247 SDValue &Base, SDValue &Scale, 248 SDValue &Index, SDValue &Disp, 249 SDValue &Segment); 250 251 bool isProfitableToFormMaskedOp(SDNode *N) const; 252 253 /// Implement addressing mode selection for inline asm expressions. 254 bool SelectInlineAsmMemoryOperand(const SDValue &Op, 255 unsigned ConstraintID, 256 std::vector<SDValue> &OutOps) override; 257 258 void emitSpecialCodeForMain(); 259 260 inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL, 261 MVT VT, SDValue &Base, SDValue &Scale, 262 SDValue &Index, SDValue &Disp, 263 SDValue &Segment) { 264 if (AM.BaseType == X86ISelAddressMode::FrameIndexBase) 265 Base = CurDAG->getTargetFrameIndex( 266 AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout())); 267 else if (AM.Base_Reg.getNode()) 268 Base = AM.Base_Reg; 269 else 270 Base = CurDAG->getRegister(0, VT); 271 272 Scale = getI8Imm(AM.Scale, DL); 273 274 // Negate the index if needed. 275 if (AM.NegateIndex) { 276 unsigned NegOpc = VT == MVT::i64 ? X86::NEG64r : X86::NEG32r; 277 SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32, 278 AM.IndexReg), 0); 279 AM.IndexReg = Neg; 280 } 281 282 if (AM.IndexReg.getNode()) 283 Index = AM.IndexReg; 284 else 285 Index = CurDAG->getRegister(0, VT); 286 287 // These are 32-bit even in 64-bit mode since RIP-relative offset 288 // is 32-bit. 
289 if (AM.GV) 290 Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(), 291 MVT::i32, AM.Disp, 292 AM.SymbolFlags); 293 else if (AM.CP) 294 Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, AM.Alignment, 295 AM.Disp, AM.SymbolFlags); 296 else if (AM.ES) { 297 assert(!AM.Disp && "Non-zero displacement is ignored with ES."); 298 Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags); 299 } else if (AM.MCSym) { 300 assert(!AM.Disp && "Non-zero displacement is ignored with MCSym."); 301 assert(AM.SymbolFlags == 0 && "oo"); 302 Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32); 303 } else if (AM.JT != -1) { 304 assert(!AM.Disp && "Non-zero displacement is ignored with JT."); 305 Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags); 306 } else if (AM.BlockAddr) 307 Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp, 308 AM.SymbolFlags); 309 else 310 Disp = CurDAG->getTargetConstant(AM.Disp, DL, MVT::i32); 311 312 if (AM.Segment.getNode()) 313 Segment = AM.Segment; 314 else 315 Segment = CurDAG->getRegister(0, MVT::i16); 316 } 317 318 // Utility function to determine whether we should avoid selecting 319 // immediate forms of instructions for better code size or not. 320 // At a high level, we'd like to avoid such instructions when 321 // we have similar constants used within the same basic block 322 // that can be kept in a register. 323 // 324 bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const { 325 uint32_t UseCount = 0; 326 327 // Do not want to hoist if we're not optimizing for size. 328 // TODO: We'd like to remove this restriction. 329 // See the comment in X86InstrInfo.td for more info. 330 if (!CurDAG->shouldOptForSize()) 331 return false; 332 333 // Walk all the users of the immediate. 334 for (SDNode::use_iterator UI = N->use_begin(), 335 UE = N->use_end(); (UI != UE) && (UseCount < 2); ++UI) { 336 337 SDNode *User = *UI; 338 339 // This user is already selected. Count it as a legitimate use and 340 // move on. 341 if (User->isMachineOpcode()) { 342 UseCount++; 343 continue; 344 } 345 346 // We want to count stores of immediates as real uses. 347 if (User->getOpcode() == ISD::STORE && 348 User->getOperand(1).getNode() == N) { 349 UseCount++; 350 continue; 351 } 352 353 // We don't currently match users that have > 2 operands (except 354 // for stores, which are handled above) 355 // Those instruction won't match in ISEL, for now, and would 356 // be counted incorrectly. 357 // This may change in the future as we add additional instruction 358 // types. 359 if (User->getNumOperands() != 2) 360 continue; 361 362 // If this is a sign-extended 8-bit integer immediate used in an ALU 363 // instruction, there is probably an opcode encoding to save space. 364 auto *C = dyn_cast<ConstantSDNode>(N); 365 if (C && isInt<8>(C->getSExtValue())) 366 continue; 367 368 // Immediates that are used for offsets as part of stack 369 // manipulation should be left alone. These are typically 370 // used to indicate SP offsets for argument passing and 371 // will get pulled into stores/pushes (implicitly). 372 if (User->getOpcode() == X86ISD::ADD || 373 User->getOpcode() == ISD::ADD || 374 User->getOpcode() == X86ISD::SUB || 375 User->getOpcode() == ISD::SUB) { 376 377 // Find the other operand of the add/sub. 378 SDValue OtherOp = User->getOperand(0); 379 if (OtherOp.getNode() == N) 380 OtherOp = User->getOperand(1); 381 382 // Don't count if the other operand is SP. 
383 RegisterSDNode *RegNode; 384 if (OtherOp->getOpcode() == ISD::CopyFromReg && 385 (RegNode = dyn_cast_or_null<RegisterSDNode>( 386 OtherOp->getOperand(1).getNode()))) 387 if ((RegNode->getReg() == X86::ESP) || 388 (RegNode->getReg() == X86::RSP)) 389 continue; 390 } 391 392 // ... otherwise, count this and move on. 393 UseCount++; 394 } 395 396 // If we have more than 1 use, then recommend for hoisting. 397 return (UseCount > 1); 398 } 399 400 /// Return a target constant with the specified value of type i8. 401 inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) { 402 return CurDAG->getTargetConstant(Imm, DL, MVT::i8); 403 } 404 405 /// Return a target constant with the specified value, of type i32. 406 inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) { 407 return CurDAG->getTargetConstant(Imm, DL, MVT::i32); 408 } 409 410 /// Return a target constant with the specified value, of type i64. 411 inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) { 412 return CurDAG->getTargetConstant(Imm, DL, MVT::i64); 413 } 414 415 SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth, 416 const SDLoc &DL) { 417 assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width"); 418 uint64_t Index = N->getConstantOperandVal(1); 419 MVT VecVT = N->getOperand(0).getSimpleValueType(); 420 return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL); 421 } 422 423 SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth, 424 const SDLoc &DL) { 425 assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width"); 426 uint64_t Index = N->getConstantOperandVal(2); 427 MVT VecVT = N->getSimpleValueType(0); 428 return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL); 429 } 430 431 // Helper to detect unneeded and instructions on shift amounts. Called 432 // from PatFrags in tablegen. 433 bool isUnneededShiftMask(SDNode *N, unsigned Width) const { 434 assert(N->getOpcode() == ISD::AND && "Unexpected opcode"); 435 const APInt &Val = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue(); 436 437 if (Val.countTrailingOnes() >= Width) 438 return true; 439 440 APInt Mask = Val | CurDAG->computeKnownBits(N->getOperand(0)).Zero; 441 return Mask.countTrailingOnes() >= Width; 442 } 443 444 /// Return an SDNode that returns the value of the global base register. 445 /// Output instructions required to initialize the global base register, 446 /// if necessary. 447 SDNode *getGlobalBaseReg(); 448 449 /// Return a reference to the TargetMachine, casted to the target-specific 450 /// type. 451 const X86TargetMachine &getTargetMachine() const { 452 return static_cast<const X86TargetMachine &>(TM); 453 } 454 455 /// Return a reference to the TargetInstrInfo, casted to the target-specific 456 /// type. 457 const X86InstrInfo *getInstrInfo() const { 458 return Subtarget->getInstrInfo(); 459 } 460 461 /// Address-mode matching performs shift-of-and to and-of-shift 462 /// reassociation in order to expose more scaled addressing 463 /// opportunities. 464 bool ComplexPatternFuncMutatesDAG() const override { 465 return true; 466 } 467 468 bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const; 469 470 // Indicates we should prefer to use a non-temporal load for this load. 
471 bool useNonTemporalLoad(LoadSDNode *N) const { 472 if (!N->isNonTemporal()) 473 return false; 474 475 unsigned StoreSize = N->getMemoryVT().getStoreSize(); 476 477 if (N->getAlignment() < StoreSize) 478 return false; 479 480 switch (StoreSize) { 481 default: llvm_unreachable("Unsupported store size"); 482 case 4: 483 case 8: 484 return false; 485 case 16: 486 return Subtarget->hasSSE41(); 487 case 32: 488 return Subtarget->hasAVX2(); 489 case 64: 490 return Subtarget->hasAVX512(); 491 } 492 } 493 494 bool foldLoadStoreIntoMemOperand(SDNode *Node); 495 MachineSDNode *matchBEXTRFromAndImm(SDNode *Node); 496 bool matchBitExtract(SDNode *Node); 497 bool shrinkAndImmediate(SDNode *N); 498 bool isMaskZeroExtended(SDNode *N) const; 499 bool tryShiftAmountMod(SDNode *N); 500 bool tryShrinkShlLogicImm(SDNode *N); 501 bool tryVPTERNLOG(SDNode *N); 502 bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask); 503 bool tryMatchBitSelect(SDNode *N); 504 505 MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad, 506 const SDLoc &dl, MVT VT, SDNode *Node); 507 MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad, 508 const SDLoc &dl, MVT VT, SDNode *Node, 509 SDValue &InFlag); 510 511 bool tryOptimizeRem8Extend(SDNode *N); 512 513 bool onlyUsesZeroFlag(SDValue Flags) const; 514 bool hasNoSignFlagUses(SDValue Flags) const; 515 bool hasNoCarryFlagUses(SDValue Flags) const; 516 }; 517 } 518 519 520 // Returns true if this masked compare can be implemented legally with this 521 // type. 522 static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) { 523 unsigned Opcode = N->getOpcode(); 524 if (Opcode == X86ISD::CMPM || Opcode == X86ISD::STRICT_CMPM || 525 Opcode == ISD::SETCC || Opcode == X86ISD::CMPM_SAE || 526 Opcode == X86ISD::VFPCLASS) { 527 // We can get 256-bit 8 element types here without VLX being enabled. When 528 // this happens we will use 512-bit operations and the mask will not be 529 // zero extended. 530 EVT OpVT = N->getOperand(0).getValueType(); 531 // The first operand of X86ISD::STRICT_CMPM is chain, so we need to get the 532 // second operand. 533 if (Opcode == X86ISD::STRICT_CMPM) 534 OpVT = N->getOperand(1).getValueType(); 535 if (OpVT.is256BitVector() || OpVT.is128BitVector()) 536 return Subtarget->hasVLX(); 537 538 return true; 539 } 540 // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check. 541 if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM || 542 Opcode == X86ISD::FSETCCM_SAE) 543 return true; 544 545 return false; 546 } 547 548 // Returns true if we can assume the writer of the mask has zero extended it 549 // for us. 550 bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const { 551 // If this is an AND, check if we have a compare on either side. As long as 552 // one side guarantees the mask is zero extended, the AND will preserve those 553 // zeros. 554 if (N->getOpcode() == ISD::AND) 555 return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) || 556 isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget); 557 558 return isLegalMaskCompare(N, Subtarget); 559 } 560 561 bool 562 X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const { 563 if (OptLevel == CodeGenOpt::None) return false; 564 565 if (!N.hasOneUse()) 566 return false; 567 568 if (N.getOpcode() != ISD::LOAD) 569 return true; 570 571 // Don't fold non-temporal loads if we have an instruction for them. 
572 if (useNonTemporalLoad(cast<LoadSDNode>(N))) 573 return false; 574 575 // If N is a load, do additional profitability checks. 576 if (U == Root) { 577 switch (U->getOpcode()) { 578 default: break; 579 case X86ISD::ADD: 580 case X86ISD::ADC: 581 case X86ISD::SUB: 582 case X86ISD::SBB: 583 case X86ISD::AND: 584 case X86ISD::XOR: 585 case X86ISD::OR: 586 case ISD::ADD: 587 case ISD::ADDCARRY: 588 case ISD::AND: 589 case ISD::OR: 590 case ISD::XOR: { 591 SDValue Op1 = U->getOperand(1); 592 593 // If the other operand is a 8-bit immediate we should fold the immediate 594 // instead. This reduces code size. 595 // e.g. 596 // movl 4(%esp), %eax 597 // addl $4, %eax 598 // vs. 599 // movl $4, %eax 600 // addl 4(%esp), %eax 601 // The former is 2 bytes shorter. In case where the increment is 1, then 602 // the saving can be 4 bytes (by using incl %eax). 603 if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Op1)) { 604 if (Imm->getAPIntValue().isSignedIntN(8)) 605 return false; 606 607 // If this is a 64-bit AND with an immediate that fits in 32-bits, 608 // prefer using the smaller and over folding the load. This is needed to 609 // make sure immediates created by shrinkAndImmediate are always folded. 610 // Ideally we would narrow the load during DAG combine and get the 611 // best of both worlds. 612 if (U->getOpcode() == ISD::AND && 613 Imm->getAPIntValue().getBitWidth() == 64 && 614 Imm->getAPIntValue().isIntN(32)) 615 return false; 616 617 // If this really a zext_inreg that can be represented with a movzx 618 // instruction, prefer that. 619 // TODO: We could shrink the load and fold if it is non-volatile. 620 if (U->getOpcode() == ISD::AND && 621 (Imm->getAPIntValue() == UINT8_MAX || 622 Imm->getAPIntValue() == UINT16_MAX || 623 Imm->getAPIntValue() == UINT32_MAX)) 624 return false; 625 626 // ADD/SUB with can negate the immediate and use the opposite operation 627 // to fit 128 into a sign extended 8 bit immediate. 628 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) && 629 (-Imm->getAPIntValue()).isSignedIntN(8)) 630 return false; 631 632 if ((U->getOpcode() == X86ISD::ADD || U->getOpcode() == X86ISD::SUB) && 633 (-Imm->getAPIntValue()).isSignedIntN(8) && 634 hasNoCarryFlagUses(SDValue(U, 1))) 635 return false; 636 } 637 638 // If the other operand is a TLS address, we should fold it instead. 639 // This produces 640 // movl %gs:0, %eax 641 // leal i@NTPOFF(%eax), %eax 642 // instead of 643 // movl $i@NTPOFF, %eax 644 // addl %gs:0, %eax 645 // if the block also has an access to a second TLS address this will save 646 // a load. 647 // FIXME: This is probably also true for non-TLS addresses. 648 if (Op1.getOpcode() == X86ISD::Wrapper) { 649 SDValue Val = Op1.getOperand(0); 650 if (Val.getOpcode() == ISD::TargetGlobalTLSAddress) 651 return false; 652 } 653 654 // Don't fold load if this matches the BTS/BTR/BTC patterns. 
655 // BTS: (or X, (shl 1, n)) 656 // BTR: (and X, (rotl -2, n)) 657 // BTC: (xor X, (shl 1, n)) 658 if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) { 659 if (U->getOperand(0).getOpcode() == ISD::SHL && 660 isOneConstant(U->getOperand(0).getOperand(0))) 661 return false; 662 663 if (U->getOperand(1).getOpcode() == ISD::SHL && 664 isOneConstant(U->getOperand(1).getOperand(0))) 665 return false; 666 } 667 if (U->getOpcode() == ISD::AND) { 668 SDValue U0 = U->getOperand(0); 669 SDValue U1 = U->getOperand(1); 670 if (U0.getOpcode() == ISD::ROTL) { 671 auto *C = dyn_cast<ConstantSDNode>(U0.getOperand(0)); 672 if (C && C->getSExtValue() == -2) 673 return false; 674 } 675 676 if (U1.getOpcode() == ISD::ROTL) { 677 auto *C = dyn_cast<ConstantSDNode>(U1.getOperand(0)); 678 if (C && C->getSExtValue() == -2) 679 return false; 680 } 681 } 682 683 break; 684 } 685 case ISD::SHL: 686 case ISD::SRA: 687 case ISD::SRL: 688 // Don't fold a load into a shift by immediate. The BMI2 instructions 689 // support folding a load, but not an immediate. The legacy instructions 690 // support folding an immediate, but can't fold a load. Folding an 691 // immediate is preferable to folding a load. 692 if (isa<ConstantSDNode>(U->getOperand(1))) 693 return false; 694 695 break; 696 } 697 } 698 699 // Prevent folding a load if this can implemented with an insert_subreg or 700 // a move that implicitly zeroes. 701 if (Root->getOpcode() == ISD::INSERT_SUBVECTOR && 702 isNullConstant(Root->getOperand(2)) && 703 (Root->getOperand(0).isUndef() || 704 ISD::isBuildVectorAllZeros(Root->getOperand(0).getNode()))) 705 return false; 706 707 return true; 708 } 709 710 // Indicates it is profitable to form an AVX512 masked operation. Returning 711 // false will favor a masked register-register masked move or vblendm and the 712 // operation will be selected separately. 713 bool X86DAGToDAGISel::isProfitableToFormMaskedOp(SDNode *N) const { 714 assert( 715 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::SELECTS) && 716 "Unexpected opcode!"); 717 718 // If the operation has additional users, the operation will be duplicated. 719 // Check the use count to prevent that. 720 // FIXME: Are there cheap opcodes we might want to duplicate? 721 return N->getOperand(1).hasOneUse(); 722 } 723 724 /// Replace the original chain operand of the call with 725 /// load's chain operand and move load below the call's chain operand. 
726 static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load, 727 SDValue Call, SDValue OrigChain) { 728 SmallVector<SDValue, 8> Ops; 729 SDValue Chain = OrigChain.getOperand(0); 730 if (Chain.getNode() == Load.getNode()) 731 Ops.push_back(Load.getOperand(0)); 732 else { 733 assert(Chain.getOpcode() == ISD::TokenFactor && 734 "Unexpected chain operand"); 735 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) 736 if (Chain.getOperand(i).getNode() == Load.getNode()) 737 Ops.push_back(Load.getOperand(0)); 738 else 739 Ops.push_back(Chain.getOperand(i)); 740 SDValue NewChain = 741 CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops); 742 Ops.clear(); 743 Ops.push_back(NewChain); 744 } 745 Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end()); 746 CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops); 747 CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0), 748 Load.getOperand(1), Load.getOperand(2)); 749 750 Ops.clear(); 751 Ops.push_back(SDValue(Load.getNode(), 1)); 752 Ops.append(Call->op_begin() + 1, Call->op_end()); 753 CurDAG->UpdateNodeOperands(Call.getNode(), Ops); 754 } 755 756 /// Return true if call address is a load and it can be 757 /// moved below CALLSEQ_START and the chains leading up to the call. 758 /// Return the CALLSEQ_START by reference as a second output. 759 /// In the case of a tail call, there isn't a callseq node between the call 760 /// chain and the load. 761 static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) { 762 // The transformation is somewhat dangerous if the call's chain was glued to 763 // the call. After MoveBelowOrigChain the load is moved between the call and 764 // the chain, this can create a cycle if the load is not folded. So it is 765 // *really* important that we are sure the load will be folded. 766 if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse()) 767 return false; 768 LoadSDNode *LD = dyn_cast<LoadSDNode>(Callee.getNode()); 769 if (!LD || 770 !LD->isSimple() || 771 LD->getAddressingMode() != ISD::UNINDEXED || 772 LD->getExtensionType() != ISD::NON_EXTLOAD) 773 return false; 774 775 // Now let's find the callseq_start. 776 while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) { 777 if (!Chain.hasOneUse()) 778 return false; 779 Chain = Chain.getOperand(0); 780 } 781 782 if (!Chain.getNumOperands()) 783 return false; 784 // Since we are not checking for AA here, conservatively abort if the chain 785 // writes to memory. It's not safe to move the callee (a load) across a store. 786 if (isa<MemSDNode>(Chain.getNode()) && 787 cast<MemSDNode>(Chain.getNode())->writeMem()) 788 return false; 789 if (Chain.getOperand(0).getNode() == Callee.getNode()) 790 return true; 791 if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor && 792 Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) && 793 Callee.getValue(1).hasOneUse()) 794 return true; 795 return false; 796 } 797 798 void X86DAGToDAGISel::PreprocessISelDAG() { 799 bool MadeChange = false; 800 for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), 801 E = CurDAG->allnodes_end(); I != E; ) { 802 SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues. 803 804 // If this is a target specific AND node with no flag usages, turn it back 805 // into ISD::AND to enable test instruction matching. 
806 if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) { 807 SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0), 808 N->getOperand(0), N->getOperand(1)); 809 --I; 810 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); 811 ++I; 812 MadeChange = true; 813 continue; 814 } 815 816 /// Convert vector increment or decrement to sub/add with an all-ones 817 /// constant: 818 /// add X, <1, 1...> --> sub X, <-1, -1...> 819 /// sub X, <1, 1...> --> add X, <-1, -1...> 820 /// The all-ones vector constant can be materialized using a pcmpeq 821 /// instruction that is commonly recognized as an idiom (has no register 822 /// dependency), so that's better/smaller than loading a splat 1 constant. 823 if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) && 824 N->getSimpleValueType(0).isVector()) { 825 826 APInt SplatVal; 827 if (X86::isConstantSplat(N->getOperand(1), SplatVal) && 828 SplatVal.isOneValue()) { 829 SDLoc DL(N); 830 831 MVT VT = N->getSimpleValueType(0); 832 unsigned NumElts = VT.getSizeInBits() / 32; 833 SDValue AllOnes = 834 CurDAG->getAllOnesConstant(DL, MVT::getVectorVT(MVT::i32, NumElts)); 835 AllOnes = CurDAG->getBitcast(VT, AllOnes); 836 837 unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD; 838 SDValue Res = 839 CurDAG->getNode(NewOpcode, DL, VT, N->getOperand(0), AllOnes); 840 --I; 841 CurDAG->ReplaceAllUsesWith(N, Res.getNode()); 842 ++I; 843 MadeChange = true; 844 continue; 845 } 846 } 847 848 switch (N->getOpcode()) { 849 case X86ISD::VBROADCAST: { 850 MVT VT = N->getSimpleValueType(0); 851 // Emulate v32i16/v64i8 broadcast without BWI. 852 if (!Subtarget->hasBWI() && (VT == MVT::v32i16 || VT == MVT::v64i8)) { 853 MVT NarrowVT = VT == MVT::v32i16 ? MVT::v16i16 : MVT::v32i8; 854 SDLoc dl(N); 855 SDValue NarrowBCast = 856 CurDAG->getNode(X86ISD::VBROADCAST, dl, NarrowVT, N->getOperand(0)); 857 SDValue Res = 858 CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT), 859 NarrowBCast, CurDAG->getIntPtrConstant(0, dl)); 860 unsigned Index = VT == MVT::v32i16 ? 16 : 32; 861 Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast, 862 CurDAG->getIntPtrConstant(Index, dl)); 863 864 --I; 865 CurDAG->ReplaceAllUsesWith(N, Res.getNode()); 866 ++I; 867 MadeChange = true; 868 continue; 869 } 870 871 break; 872 } 873 case X86ISD::VBROADCAST_LOAD: { 874 MVT VT = N->getSimpleValueType(0); 875 // Emulate v32i16/v64i8 broadcast without BWI. 876 if (!Subtarget->hasBWI() && (VT == MVT::v32i16 || VT == MVT::v64i8)) { 877 MVT NarrowVT = VT == MVT::v32i16 ? MVT::v16i16 : MVT::v32i8; 878 auto *MemNode = cast<MemSDNode>(N); 879 SDLoc dl(N); 880 SDVTList VTs = CurDAG->getVTList(NarrowVT, MVT::Other); 881 SDValue Ops[] = {MemNode->getChain(), MemNode->getBasePtr()}; 882 SDValue NarrowBCast = CurDAG->getMemIntrinsicNode( 883 X86ISD::VBROADCAST_LOAD, dl, VTs, Ops, MemNode->getMemoryVT(), 884 MemNode->getMemOperand()); 885 SDValue Res = 886 CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT), 887 NarrowBCast, CurDAG->getIntPtrConstant(0, dl)); 888 unsigned Index = VT == MVT::v32i16 ? 16 : 32; 889 Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast, 890 CurDAG->getIntPtrConstant(Index, dl)); 891 892 --I; 893 SDValue To[] = {Res, NarrowBCast.getValue(1)}; 894 CurDAG->ReplaceAllUsesWith(N, To); 895 ++I; 896 MadeChange = true; 897 continue; 898 } 899 900 break; 901 } 902 case ISD::VSELECT: { 903 // Replace VSELECT with non-mask conditions with with BLENDV. 
904 if (N->getOperand(0).getValueType().getVectorElementType() == MVT::i1) 905 break; 906 907 assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!"); 908 SDValue Blendv = 909 CurDAG->getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), 910 N->getOperand(0), N->getOperand(1), N->getOperand(2)); 911 --I; 912 CurDAG->ReplaceAllUsesWith(N, Blendv.getNode()); 913 ++I; 914 MadeChange = true; 915 continue; 916 } 917 case ISD::FP_ROUND: 918 case ISD::STRICT_FP_ROUND: 919 case ISD::FP_TO_SINT: 920 case ISD::FP_TO_UINT: 921 case ISD::STRICT_FP_TO_SINT: 922 case ISD::STRICT_FP_TO_UINT: { 923 // Replace vector fp_to_s/uint with their X86 specific equivalent so we 924 // don't need 2 sets of patterns. 925 if (!N->getSimpleValueType(0).isVector()) 926 break; 927 928 unsigned NewOpc; 929 switch (N->getOpcode()) { 930 default: llvm_unreachable("Unexpected opcode!"); 931 case ISD::FP_ROUND: NewOpc = X86ISD::VFPROUND; break; 932 case ISD::STRICT_FP_ROUND: NewOpc = X86ISD::STRICT_VFPROUND; break; 933 case ISD::STRICT_FP_TO_SINT: NewOpc = X86ISD::STRICT_CVTTP2SI; break; 934 case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break; 935 case ISD::STRICT_FP_TO_UINT: NewOpc = X86ISD::STRICT_CVTTP2UI; break; 936 case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break; 937 } 938 SDValue Res; 939 if (N->isStrictFPOpcode()) 940 Res = 941 CurDAG->getNode(NewOpc, SDLoc(N), {N->getValueType(0), MVT::Other}, 942 {N->getOperand(0), N->getOperand(1)}); 943 else 944 Res = 945 CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0), 946 N->getOperand(0)); 947 --I; 948 CurDAG->ReplaceAllUsesWith(N, Res.getNode()); 949 ++I; 950 MadeChange = true; 951 continue; 952 } 953 case ISD::SHL: 954 case ISD::SRA: 955 case ISD::SRL: { 956 // Replace vector shifts with their X86 specific equivalent so we don't 957 // need 2 sets of patterns. 958 if (!N->getValueType(0).isVector()) 959 break; 960 961 unsigned NewOpc; 962 switch (N->getOpcode()) { 963 default: llvm_unreachable("Unexpected opcode!"); 964 case ISD::SHL: NewOpc = X86ISD::VSHLV; break; 965 case ISD::SRA: NewOpc = X86ISD::VSRAV; break; 966 case ISD::SRL: NewOpc = X86ISD::VSRLV; break; 967 } 968 SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0), 969 N->getOperand(0), N->getOperand(1)); 970 --I; 971 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); 972 ++I; 973 MadeChange = true; 974 continue; 975 } 976 case ISD::ANY_EXTEND: 977 case ISD::ANY_EXTEND_VECTOR_INREG: { 978 // Replace vector any extend with the zero extend equivalents so we don't 979 // need 2 sets of patterns. Ignore vXi1 extensions. 980 if (!N->getValueType(0).isVector()) 981 break; 982 983 unsigned NewOpc; 984 if (N->getOperand(0).getScalarValueSizeInBits() == 1) { 985 assert(N->getOpcode() == ISD::ANY_EXTEND && 986 "Unexpected opcode for mask vector!"); 987 NewOpc = ISD::SIGN_EXTEND; 988 } else { 989 NewOpc = N->getOpcode() == ISD::ANY_EXTEND 990 ? 
ISD::ZERO_EXTEND 991 : ISD::ZERO_EXTEND_VECTOR_INREG; 992 } 993 994 SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0), 995 N->getOperand(0)); 996 --I; 997 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); 998 ++I; 999 MadeChange = true; 1000 continue; 1001 } 1002 case ISD::FCEIL: 1003 case ISD::STRICT_FCEIL: 1004 case ISD::FFLOOR: 1005 case ISD::STRICT_FFLOOR: 1006 case ISD::FTRUNC: 1007 case ISD::STRICT_FTRUNC: 1008 case ISD::FNEARBYINT: 1009 case ISD::STRICT_FNEARBYINT: 1010 case ISD::FRINT: 1011 case ISD::STRICT_FRINT: { 1012 // Replace fp rounding with their X86 specific equivalent so we don't 1013 // need 2 sets of patterns. 1014 unsigned Imm; 1015 switch (N->getOpcode()) { 1016 default: llvm_unreachable("Unexpected opcode!"); 1017 case ISD::STRICT_FCEIL: 1018 case ISD::FCEIL: Imm = 0xA; break; 1019 case ISD::STRICT_FFLOOR: 1020 case ISD::FFLOOR: Imm = 0x9; break; 1021 case ISD::STRICT_FTRUNC: 1022 case ISD::FTRUNC: Imm = 0xB; break; 1023 case ISD::STRICT_FNEARBYINT: 1024 case ISD::FNEARBYINT: Imm = 0xC; break; 1025 case ISD::STRICT_FRINT: 1026 case ISD::FRINT: Imm = 0x4; break; 1027 } 1028 SDLoc dl(N); 1029 bool IsStrict = N->isStrictFPOpcode(); 1030 SDValue Res; 1031 if (IsStrict) 1032 Res = CurDAG->getNode(X86ISD::STRICT_VRNDSCALE, dl, 1033 {N->getValueType(0), MVT::Other}, 1034 {N->getOperand(0), N->getOperand(1), 1035 CurDAG->getTargetConstant(Imm, dl, MVT::i8)}); 1036 else 1037 Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, N->getValueType(0), 1038 N->getOperand(0), 1039 CurDAG->getTargetConstant(Imm, dl, MVT::i8)); 1040 --I; 1041 CurDAG->ReplaceAllUsesWith(N, Res.getNode()); 1042 ++I; 1043 MadeChange = true; 1044 continue; 1045 } 1046 case X86ISD::FANDN: 1047 case X86ISD::FAND: 1048 case X86ISD::FOR: 1049 case X86ISD::FXOR: { 1050 // Widen scalar fp logic ops to vector to reduce isel patterns. 1051 // FIXME: Can we do this during lowering/combine. 1052 MVT VT = N->getSimpleValueType(0); 1053 if (VT.isVector() || VT == MVT::f128) 1054 break; 1055 1056 MVT VecVT = VT == MVT::f64 ? MVT::v2f64 : MVT::v4f32; 1057 SDLoc dl(N); 1058 SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, 1059 N->getOperand(0)); 1060 SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, 1061 N->getOperand(1)); 1062 1063 SDValue Res; 1064 if (Subtarget->hasSSE2()) { 1065 EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger(); 1066 Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0); 1067 Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1); 1068 unsigned Opc; 1069 switch (N->getOpcode()) { 1070 default: llvm_unreachable("Unexpected opcode!"); 1071 case X86ISD::FANDN: Opc = X86ISD::ANDNP; break; 1072 case X86ISD::FAND: Opc = ISD::AND; break; 1073 case X86ISD::FOR: Opc = ISD::OR; break; 1074 case X86ISD::FXOR: Opc = ISD::XOR; break; 1075 } 1076 Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1); 1077 Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res); 1078 } else { 1079 Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1); 1080 } 1081 Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, 1082 CurDAG->getIntPtrConstant(0, dl)); 1083 --I; 1084 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); 1085 ++I; 1086 MadeChange = true; 1087 continue; 1088 } 1089 } 1090 1091 if (OptLevel != CodeGenOpt::None && 1092 // Only do this when the target can fold the load into the call or 1093 // jmp. 
1094 !Subtarget->useIndirectThunkCalls() && 1095 ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) || 1096 (N->getOpcode() == X86ISD::TC_RETURN && 1097 (Subtarget->is64Bit() || 1098 !getTargetMachine().isPositionIndependent())))) { 1099 /// Also try moving call address load from outside callseq_start to just 1100 /// before the call to allow it to be folded. 1101 /// 1102 /// [Load chain] 1103 /// ^ 1104 /// | 1105 /// [Load] 1106 /// ^ ^ 1107 /// | | 1108 /// / \-- 1109 /// / | 1110 ///[CALLSEQ_START] | 1111 /// ^ | 1112 /// | | 1113 /// [LOAD/C2Reg] | 1114 /// | | 1115 /// \ / 1116 /// \ / 1117 /// [CALL] 1118 bool HasCallSeq = N->getOpcode() == X86ISD::CALL; 1119 SDValue Chain = N->getOperand(0); 1120 SDValue Load = N->getOperand(1); 1121 if (!isCalleeLoad(Load, Chain, HasCallSeq)) 1122 continue; 1123 moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain); 1124 ++NumLoadMoved; 1125 MadeChange = true; 1126 continue; 1127 } 1128 1129 // Lower fpround and fpextend nodes that target the FP stack to be store and 1130 // load to the stack. This is a gross hack. We would like to simply mark 1131 // these as being illegal, but when we do that, legalize produces these when 1132 // it expands calls, then expands these in the same legalize pass. We would 1133 // like dag combine to be able to hack on these between the call expansion 1134 // and the node legalization. As such this pass basically does "really 1135 // late" legalization of these inline with the X86 isel pass. 1136 // FIXME: This should only happen when not compiled with -O0. 1137 switch (N->getOpcode()) { 1138 default: continue; 1139 case ISD::FP_ROUND: 1140 case ISD::FP_EXTEND: 1141 { 1142 MVT SrcVT = N->getOperand(0).getSimpleValueType(); 1143 MVT DstVT = N->getSimpleValueType(0); 1144 1145 // If any of the sources are vectors, no fp stack involved. 1146 if (SrcVT.isVector() || DstVT.isVector()) 1147 continue; 1148 1149 // If the source and destination are SSE registers, then this is a legal 1150 // conversion that should not be lowered. 1151 const X86TargetLowering *X86Lowering = 1152 static_cast<const X86TargetLowering *>(TLI); 1153 bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT); 1154 bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT); 1155 if (SrcIsSSE && DstIsSSE) 1156 continue; 1157 1158 if (!SrcIsSSE && !DstIsSSE) { 1159 // If this is an FPStack extension, it is a noop. 1160 if (N->getOpcode() == ISD::FP_EXTEND) 1161 continue; 1162 // If this is a value-preserving FPStack truncation, it is a noop. 1163 if (N->getConstantOperandVal(1)) 1164 continue; 1165 } 1166 1167 // Here we could have an FP stack truncation or an FPStack <-> SSE convert. 1168 // FPStack has extload and truncstore. SSE can fold direct loads into other 1169 // operations. Based on this, decide what we want to do. 1170 MVT MemVT = (N->getOpcode() == ISD::FP_ROUND) ? DstVT : SrcVT; 1171 SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT); 1172 int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex(); 1173 MachinePointerInfo MPI = 1174 MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI); 1175 SDLoc dl(N); 1176 1177 // FIXME: optimize the case where the src/dest is a load or store? 1178 1179 SDValue Store = CurDAG->getTruncStore( 1180 CurDAG->getEntryNode(), dl, N->getOperand(0), MemTmp, MPI, MemVT); 1181 SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, 1182 MemTmp, MPI, MemVT); 1183 1184 // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the 1185 // extload we created. 
This will cause general havok on the dag because 1186 // anything below the conversion could be folded into other existing nodes. 1187 // To avoid invalidating 'I', back it up to the convert node. 1188 --I; 1189 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result); 1190 break; 1191 } 1192 1193 //The sequence of events for lowering STRICT_FP versions of these nodes requires 1194 //dealing with the chain differently, as there is already a preexisting chain. 1195 case ISD::STRICT_FP_ROUND: 1196 case ISD::STRICT_FP_EXTEND: 1197 { 1198 MVT SrcVT = N->getOperand(1).getSimpleValueType(); 1199 MVT DstVT = N->getSimpleValueType(0); 1200 1201 // If any of the sources are vectors, no fp stack involved. 1202 if (SrcVT.isVector() || DstVT.isVector()) 1203 continue; 1204 1205 // If the source and destination are SSE registers, then this is a legal 1206 // conversion that should not be lowered. 1207 const X86TargetLowering *X86Lowering = 1208 static_cast<const X86TargetLowering *>(TLI); 1209 bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT); 1210 bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT); 1211 if (SrcIsSSE && DstIsSSE) 1212 continue; 1213 1214 if (!SrcIsSSE && !DstIsSSE) { 1215 // If this is an FPStack extension, it is a noop. 1216 if (N->getOpcode() == ISD::STRICT_FP_EXTEND) 1217 continue; 1218 // If this is a value-preserving FPStack truncation, it is a noop. 1219 if (N->getConstantOperandVal(2)) 1220 continue; 1221 } 1222 1223 // Here we could have an FP stack truncation or an FPStack <-> SSE convert. 1224 // FPStack has extload and truncstore. SSE can fold direct loads into other 1225 // operations. Based on this, decide what we want to do. 1226 MVT MemVT = (N->getOpcode() == ISD::STRICT_FP_ROUND) ? DstVT : SrcVT; 1227 SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT); 1228 int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex(); 1229 MachinePointerInfo MPI = 1230 MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI); 1231 SDLoc dl(N); 1232 1233 // FIXME: optimize the case where the src/dest is a load or store? 1234 1235 //Since the operation is StrictFP, use the preexisting chain. 1236 SDValue Store, Result; 1237 if (!SrcIsSSE) { 1238 SDVTList VTs = CurDAG->getVTList(MVT::Other); 1239 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), MemTmp}; 1240 Store = CurDAG->getMemIntrinsicNode(X86ISD::FST, dl, VTs, Ops, MemVT, 1241 MPI, /*Align*/ None, 1242 MachineMemOperand::MOStore); 1243 if (N->getFlags().hasNoFPExcept()) { 1244 SDNodeFlags Flags = Store->getFlags(); 1245 Flags.setNoFPExcept(true); 1246 Store->setFlags(Flags); 1247 } 1248 } else { 1249 assert(SrcVT == MemVT && "Unexpected VT!"); 1250 Store = CurDAG->getStore(N->getOperand(0), dl, N->getOperand(1), MemTmp, 1251 MPI); 1252 } 1253 1254 if (!DstIsSSE) { 1255 SDVTList VTs = CurDAG->getVTList(DstVT, MVT::Other); 1256 SDValue Ops[] = {Store, MemTmp}; 1257 Result = CurDAG->getMemIntrinsicNode( 1258 X86ISD::FLD, dl, VTs, Ops, MemVT, MPI, 1259 /*Align*/ None, MachineMemOperand::MOLoad); 1260 if (N->getFlags().hasNoFPExcept()) { 1261 SDNodeFlags Flags = Result->getFlags(); 1262 Flags.setNoFPExcept(true); 1263 Result->setFlags(Flags); 1264 } 1265 } else { 1266 assert(DstVT == MemVT && "Unexpected VT!"); 1267 Result = CurDAG->getLoad(DstVT, dl, Store, MemTmp, MPI); 1268 } 1269 1270 // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the 1271 // extload we created. 
This will cause general havok on the dag because 1272 // anything below the conversion could be folded into other existing nodes. 1273 // To avoid invalidating 'I', back it up to the convert node. 1274 --I; 1275 CurDAG->ReplaceAllUsesWith(N, Result.getNode()); 1276 break; 1277 } 1278 } 1279 1280 1281 // Now that we did that, the node is dead. Increment the iterator to the 1282 // next node to process, then delete N. 1283 ++I; 1284 MadeChange = true; 1285 } 1286 1287 // Remove any dead nodes that may have been left behind. 1288 if (MadeChange) 1289 CurDAG->RemoveDeadNodes(); 1290 } 1291 1292 // Look for a redundant movzx/movsx that can occur after an 8-bit divrem. 1293 bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) { 1294 unsigned Opc = N->getMachineOpcode(); 1295 if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 && 1296 Opc != X86::MOVSX64rr8) 1297 return false; 1298 1299 SDValue N0 = N->getOperand(0); 1300 1301 // We need to be extracting the lower bit of an extend. 1302 if (!N0.isMachineOpcode() || 1303 N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG || 1304 N0.getConstantOperandVal(1) != X86::sub_8bit) 1305 return false; 1306 1307 // We're looking for either a movsx or movzx to match the original opcode. 1308 unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX 1309 : X86::MOVSX32rr8_NOREX; 1310 SDValue N00 = N0.getOperand(0); 1311 if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc) 1312 return false; 1313 1314 if (Opc == X86::MOVSX64rr8) { 1315 // If we had a sign extend from 8 to 64 bits. We still need to go from 32 1316 // to 64. 1317 MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N), 1318 MVT::i64, N00); 1319 ReplaceUses(N, Extend); 1320 } else { 1321 // Ok we can drop this extend and just use the original extend. 1322 ReplaceUses(N, N00.getNode()); 1323 } 1324 1325 return true; 1326 } 1327 1328 void X86DAGToDAGISel::PostprocessISelDAG() { 1329 // Skip peepholes at -O0. 1330 if (TM.getOptLevel() == CodeGenOpt::None) 1331 return; 1332 1333 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end(); 1334 1335 bool MadeChange = false; 1336 while (Position != CurDAG->allnodes_begin()) { 1337 SDNode *N = &*--Position; 1338 // Skip dead nodes and any non-machine opcodes. 1339 if (N->use_empty() || !N->isMachineOpcode()) 1340 continue; 1341 1342 if (tryOptimizeRem8Extend(N)) { 1343 MadeChange = true; 1344 continue; 1345 } 1346 1347 // Look for a TESTrr+ANDrr pattern where both operands of the test are 1348 // the same. Rewrite to remove the AND. 
1349 unsigned Opc = N->getMachineOpcode(); 1350 if ((Opc == X86::TEST8rr || Opc == X86::TEST16rr || 1351 Opc == X86::TEST32rr || Opc == X86::TEST64rr) && 1352 N->getOperand(0) == N->getOperand(1) && 1353 N->isOnlyUserOf(N->getOperand(0).getNode()) && 1354 N->getOperand(0).isMachineOpcode()) { 1355 SDValue And = N->getOperand(0); 1356 unsigned N0Opc = And.getMachineOpcode(); 1357 if (N0Opc == X86::AND8rr || N0Opc == X86::AND16rr || 1358 N0Opc == X86::AND32rr || N0Opc == X86::AND64rr) { 1359 MachineSDNode *Test = CurDAG->getMachineNode(Opc, SDLoc(N), 1360 MVT::i32, 1361 And.getOperand(0), 1362 And.getOperand(1)); 1363 ReplaceUses(N, Test); 1364 MadeChange = true; 1365 continue; 1366 } 1367 if (N0Opc == X86::AND8rm || N0Opc == X86::AND16rm || 1368 N0Opc == X86::AND32rm || N0Opc == X86::AND64rm) { 1369 unsigned NewOpc; 1370 switch (N0Opc) { 1371 case X86::AND8rm: NewOpc = X86::TEST8mr; break; 1372 case X86::AND16rm: NewOpc = X86::TEST16mr; break; 1373 case X86::AND32rm: NewOpc = X86::TEST32mr; break; 1374 case X86::AND64rm: NewOpc = X86::TEST64mr; break; 1375 } 1376 1377 // Need to swap the memory and register operand. 1378 SDValue Ops[] = { And.getOperand(1), 1379 And.getOperand(2), 1380 And.getOperand(3), 1381 And.getOperand(4), 1382 And.getOperand(5), 1383 And.getOperand(0), 1384 And.getOperand(6) /* Chain */ }; 1385 MachineSDNode *Test = CurDAG->getMachineNode(NewOpc, SDLoc(N), 1386 MVT::i32, MVT::Other, Ops); 1387 CurDAG->setNodeMemRefs( 1388 Test, cast<MachineSDNode>(And.getNode())->memoperands()); 1389 ReplaceUses(N, Test); 1390 MadeChange = true; 1391 continue; 1392 } 1393 } 1394 1395 // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is 1396 // used. We're doing this late so we can prefer to fold the AND into masked 1397 // comparisons. Doing that can be better for the live range of the mask 1398 // register. 1399 if ((Opc == X86::KORTESTBrr || Opc == X86::KORTESTWrr || 1400 Opc == X86::KORTESTDrr || Opc == X86::KORTESTQrr) && 1401 N->getOperand(0) == N->getOperand(1) && 1402 N->isOnlyUserOf(N->getOperand(0).getNode()) && 1403 N->getOperand(0).isMachineOpcode() && 1404 onlyUsesZeroFlag(SDValue(N, 0))) { 1405 SDValue And = N->getOperand(0); 1406 unsigned N0Opc = And.getMachineOpcode(); 1407 // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other 1408 // KAND instructions and KTEST use the same ISA feature. 1409 if (N0Opc == X86::KANDBrr || 1410 (N0Opc == X86::KANDWrr && Subtarget->hasDQI()) || 1411 N0Opc == X86::KANDDrr || N0Opc == X86::KANDQrr) { 1412 unsigned NewOpc; 1413 switch (Opc) { 1414 default: llvm_unreachable("Unexpected opcode!"); 1415 case X86::KORTESTBrr: NewOpc = X86::KTESTBrr; break; 1416 case X86::KORTESTWrr: NewOpc = X86::KTESTWrr; break; 1417 case X86::KORTESTDrr: NewOpc = X86::KTESTDrr; break; 1418 case X86::KORTESTQrr: NewOpc = X86::KTESTQrr; break; 1419 } 1420 MachineSDNode *KTest = CurDAG->getMachineNode(NewOpc, SDLoc(N), 1421 MVT::i32, 1422 And.getOperand(0), 1423 And.getOperand(1)); 1424 ReplaceUses(N, KTest); 1425 MadeChange = true; 1426 continue; 1427 } 1428 } 1429 1430 // Attempt to remove vectors moves that were inserted to zero upper bits. 1431 if (Opc != TargetOpcode::SUBREG_TO_REG) 1432 continue; 1433 1434 unsigned SubRegIdx = N->getConstantOperandVal(2); 1435 if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm) 1436 continue; 1437 1438 SDValue Move = N->getOperand(1); 1439 if (!Move.isMachineOpcode()) 1440 continue; 1441 1442 // Make sure its one of the move opcodes we recognize. 
1443 switch (Move.getMachineOpcode()) { 1444 default: 1445 continue; 1446 case X86::VMOVAPDrr: case X86::VMOVUPDrr: 1447 case X86::VMOVAPSrr: case X86::VMOVUPSrr: 1448 case X86::VMOVDQArr: case X86::VMOVDQUrr: 1449 case X86::VMOVAPDYrr: case X86::VMOVUPDYrr: 1450 case X86::VMOVAPSYrr: case X86::VMOVUPSYrr: 1451 case X86::VMOVDQAYrr: case X86::VMOVDQUYrr: 1452 case X86::VMOVAPDZ128rr: case X86::VMOVUPDZ128rr: 1453 case X86::VMOVAPSZ128rr: case X86::VMOVUPSZ128rr: 1454 case X86::VMOVDQA32Z128rr: case X86::VMOVDQU32Z128rr: 1455 case X86::VMOVDQA64Z128rr: case X86::VMOVDQU64Z128rr: 1456 case X86::VMOVAPDZ256rr: case X86::VMOVUPDZ256rr: 1457 case X86::VMOVAPSZ256rr: case X86::VMOVUPSZ256rr: 1458 case X86::VMOVDQA32Z256rr: case X86::VMOVDQU32Z256rr: 1459 case X86::VMOVDQA64Z256rr: case X86::VMOVDQU64Z256rr: 1460 break; 1461 } 1462 1463 SDValue In = Move.getOperand(0); 1464 if (!In.isMachineOpcode() || 1465 In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END) 1466 continue; 1467 1468 // Make sure the instruction has a VEX, XOP, or EVEX prefix. This covers 1469 // the SHA instructions which use a legacy encoding. 1470 uint64_t TSFlags = getInstrInfo()->get(In.getMachineOpcode()).TSFlags; 1471 if ((TSFlags & X86II::EncodingMask) != X86II::VEX && 1472 (TSFlags & X86II::EncodingMask) != X86II::EVEX && 1473 (TSFlags & X86II::EncodingMask) != X86II::XOP) 1474 continue; 1475 1476 // Producing instruction is another vector instruction. We can drop the 1477 // move. 1478 CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2)); 1479 MadeChange = true; 1480 } 1481 1482 if (MadeChange) 1483 CurDAG->RemoveDeadNodes(); 1484 } 1485 1486 1487 /// Emit any code that needs to be executed only in the main function. 1488 void X86DAGToDAGISel::emitSpecialCodeForMain() { 1489 if (Subtarget->isTargetCygMing()) { 1490 TargetLowering::ArgListTy Args; 1491 auto &DL = CurDAG->getDataLayout(); 1492 1493 TargetLowering::CallLoweringInfo CLI(*CurDAG); 1494 CLI.setChain(CurDAG->getRoot()) 1495 .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()), 1496 CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)), 1497 std::move(Args)); 1498 const TargetLowering &TLI = CurDAG->getTargetLoweringInfo(); 1499 std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI); 1500 CurDAG->setRoot(Result.second); 1501 } 1502 } 1503 1504 void X86DAGToDAGISel::emitFunctionEntryCode() { 1505 // If this is main, emit special code for main. 1506 const Function &F = MF->getFunction(); 1507 if (F.hasExternalLinkage() && F.getName() == "main") 1508 emitSpecialCodeForMain(); 1509 } 1510 1511 static bool isDispSafeForFrameIndex(int64_t Val) { 1512 // On 64-bit platforms, we can run into an issue where a frame index 1513 // includes a displacement that, when added to the explicit displacement, 1514 // will overflow the displacement field. Assuming that the frame index 1515 // displacement fits into a 31-bit integer (which is only slightly more 1516 // aggressive than the current fundamental assumption that it fits into 1517 // a 32-bit integer), a 31-bit disp should always be safe. 1518 return isInt<31>(Val); 1519 } 1520 1521 bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset, 1522 X86ISelAddressMode &AM) { 1523 // We may have already matched a displacement and the caller just added the 1524 // symbolic displacement. So we still need to do the checks even if Offset 1525 // is zero. 1526 1527 int64_t Val = AM.Disp + Offset; 1528 1529 // Cannot combine ExternalSymbol displacements with integer offsets. 
1530 if (Val != 0 && (AM.ES || AM.MCSym)) 1531 return true; 1532 1533 CodeModel::Model M = TM.getCodeModel(); 1534 if (Subtarget->is64Bit()) { 1535 if (Val != 0 && 1536 !X86::isOffsetSuitableForCodeModel(Val, M, 1537 AM.hasSymbolicDisplacement())) 1538 return true; 1539 // In addition to the checks required for a register base, check that 1540 // we do not try to use an unsafe Disp with a frame index. 1541 if (AM.BaseType == X86ISelAddressMode::FrameIndexBase && 1542 !isDispSafeForFrameIndex(Val)) 1543 return true; 1544 } 1545 AM.Disp = Val; 1546 return false; 1547 1548 } 1549 1550 bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){ 1551 SDValue Address = N->getOperand(1); 1552 1553 // load gs:0 -> GS segment register. 1554 // load fs:0 -> FS segment register. 1555 // 1556 // This optimization is valid because the GNU TLS model defines that 1557 // gs:0 (or fs:0 on X86-64) contains its own address. 1558 // For more information see http://people.redhat.com/drepper/tls.pdf 1559 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address)) 1560 if (C->getSExtValue() == 0 && AM.Segment.getNode() == nullptr && 1561 !IndirectTlsSegRefs && 1562 (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() || 1563 Subtarget->isTargetFuchsia())) 1564 switch (N->getPointerInfo().getAddrSpace()) { 1565 case X86AS::GS: 1566 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16); 1567 return false; 1568 case X86AS::FS: 1569 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16); 1570 return false; 1571 // Address space X86AS::SS is not handled here, because it is not used to 1572 // address TLS areas. 1573 } 1574 1575 return true; 1576 } 1577 1578 /// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing 1579 /// mode. These wrap things that will resolve down into a symbol reference. 1580 /// If no match is possible, this returns true, otherwise it returns false. 1581 bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) { 1582 // If the addressing mode already has a symbol as the displacement, we can 1583 // never match another symbol. 1584 if (AM.hasSymbolicDisplacement()) 1585 return true; 1586 1587 bool IsRIPRelTLS = false; 1588 bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP; 1589 if (IsRIPRel) { 1590 SDValue Val = N.getOperand(0); 1591 if (Val.getOpcode() == ISD::TargetGlobalTLSAddress) 1592 IsRIPRelTLS = true; 1593 } 1594 1595 // We can't use an addressing mode in the 64-bit large code model. 1596 // Global TLS addressing is an exception. In the medium code model, 1597 // we use can use a mode when RIP wrappers are present. 1598 // That signifies access to globals that are known to be "near", 1599 // such as the GOT itself. 1600 CodeModel::Model M = TM.getCodeModel(); 1601 if (Subtarget->is64Bit() && 1602 ((M == CodeModel::Large && !IsRIPRelTLS) || 1603 (M == CodeModel::Medium && !IsRIPRel))) 1604 return true; 1605 1606 // Base and index reg must be 0 in order to use %rip as base. 1607 if (IsRIPRel && AM.hasBaseOrIndexReg()) 1608 return true; 1609 1610 // Make a local copy in case we can't do this fold. 
1611 X86ISelAddressMode Backup = AM; 1612 1613 int64_t Offset = 0; 1614 SDValue N0 = N.getOperand(0); 1615 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) { 1616 AM.GV = G->getGlobal(); 1617 AM.SymbolFlags = G->getTargetFlags(); 1618 Offset = G->getOffset(); 1619 } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) { 1620 AM.CP = CP->getConstVal(); 1621 AM.Alignment = CP->getAlign(); 1622 AM.SymbolFlags = CP->getTargetFlags(); 1623 Offset = CP->getOffset(); 1624 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) { 1625 AM.ES = S->getSymbol(); 1626 AM.SymbolFlags = S->getTargetFlags(); 1627 } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) { 1628 AM.MCSym = S->getMCSymbol(); 1629 } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) { 1630 AM.JT = J->getIndex(); 1631 AM.SymbolFlags = J->getTargetFlags(); 1632 } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) { 1633 AM.BlockAddr = BA->getBlockAddress(); 1634 AM.SymbolFlags = BA->getTargetFlags(); 1635 Offset = BA->getOffset(); 1636 } else 1637 llvm_unreachable("Unhandled symbol reference node."); 1638 1639 if (foldOffsetIntoAddress(Offset, AM)) { 1640 AM = Backup; 1641 return true; 1642 } 1643 1644 if (IsRIPRel) 1645 AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64)); 1646 1647 // Commit the changes now that we know this fold is safe. 1648 return false; 1649 } 1650 1651 /// Add the specified node to the specified addressing mode, returning true if 1652 /// it cannot be done. This just pattern matches for the addressing mode. 1653 bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) { 1654 if (matchAddressRecursively(N, AM, 0)) 1655 return true; 1656 1657 // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has 1658 // a smaller encoding and avoids a scaled-index. 1659 if (AM.Scale == 2 && 1660 AM.BaseType == X86ISelAddressMode::RegBase && 1661 AM.Base_Reg.getNode() == nullptr) { 1662 AM.Base_Reg = AM.IndexReg; 1663 AM.Scale = 1; 1664 } 1665 1666 // Post-processing: Convert foo to foo(%rip), even in non-PIC mode, 1667 // because it has a smaller encoding. 1668 // TODO: Which other code models can use this? 1669 switch (TM.getCodeModel()) { 1670 default: break; 1671 case CodeModel::Small: 1672 case CodeModel::Kernel: 1673 if (Subtarget->is64Bit() && 1674 AM.Scale == 1 && 1675 AM.BaseType == X86ISelAddressMode::RegBase && 1676 AM.Base_Reg.getNode() == nullptr && 1677 AM.IndexReg.getNode() == nullptr && 1678 AM.SymbolFlags == X86II::MO_NO_FLAG && 1679 AM.hasSymbolicDisplacement()) 1680 AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64); 1681 break; 1682 } 1683 1684 return false; 1685 } 1686 1687 bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM, 1688 unsigned Depth) { 1689 // Add an artificial use to this node so that we can keep track of 1690 // it if it gets CSE'd with a different node. 1691 HandleSDNode Handle(N); 1692 1693 X86ISelAddressMode Backup = AM; 1694 if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) && 1695 !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1)) 1696 return false; 1697 AM = Backup; 1698 1699 // Try again after commutating the operands. 
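// The match is order-dependent: whichever operand is folded first gets first
// claim on the base, index and displacement fields, so retry with operand 1
// folded before operand 0.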
1700 if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM, 1701 Depth + 1) && 1702 !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth + 1)) 1703 return false; 1704 AM = Backup; 1705 1706 // If we couldn't fold both operands into the address at the same time, 1707 // see if we can just put each operand into a register and fold at least 1708 // the add. 1709 if (AM.BaseType == X86ISelAddressMode::RegBase && 1710 !AM.Base_Reg.getNode() && 1711 !AM.IndexReg.getNode()) { 1712 N = Handle.getValue(); 1713 AM.Base_Reg = N.getOperand(0); 1714 AM.IndexReg = N.getOperand(1); 1715 AM.Scale = 1; 1716 return false; 1717 } 1718 N = Handle.getValue(); 1719 return true; 1720 } 1721 1722 // Insert a node into the DAG at least before the Pos node's position. This 1723 // will reposition the node as needed, and will assign it a node ID that is <= 1724 // the Pos node's ID. Note that this does *not* preserve the uniqueness of node 1725 // IDs! The selection DAG must no longer depend on their uniqueness when this 1726 // is used. 1727 static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) { 1728 if (N->getNodeId() == -1 || 1729 (SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) > 1730 SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode()))) { 1731 DAG.RepositionNode(Pos->getIterator(), N.getNode()); 1732 // Mark Node as invalid for pruning as after this it may be a successor to a 1733 // selected node but otherwise be in the same position of Pos. 1734 // Conservatively mark it with the same -abs(Id) to assure node id 1735 // invariant is preserved. 1736 N->setNodeId(Pos->getNodeId()); 1737 SelectionDAGISel::InvalidateNodeId(N.getNode()); 1738 } 1739 } 1740 1741 // Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if 1742 // safe. This allows us to convert the shift and and into an h-register 1743 // extract and a scaled index. Returns false if the simplification is 1744 // performed. 1745 static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N, 1746 uint64_t Mask, 1747 SDValue Shift, SDValue X, 1748 X86ISelAddressMode &AM) { 1749 if (Shift.getOpcode() != ISD::SRL || 1750 !isa<ConstantSDNode>(Shift.getOperand(1)) || 1751 !Shift.hasOneUse()) 1752 return true; 1753 1754 int ScaleLog = 8 - Shift.getConstantOperandVal(1); 1755 if (ScaleLog <= 0 || ScaleLog >= 4 || 1756 Mask != (0xffu << ScaleLog)) 1757 return true; 1758 1759 MVT VT = N.getSimpleValueType(); 1760 SDLoc DL(N); 1761 SDValue Eight = DAG.getConstant(8, DL, MVT::i8); 1762 SDValue NewMask = DAG.getConstant(0xff, DL, VT); 1763 SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, X, Eight); 1764 SDValue And = DAG.getNode(ISD::AND, DL, VT, Srl, NewMask); 1765 SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8); 1766 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And, ShlCount); 1767 1768 // Insert the new nodes into the topological ordering. We must do this in 1769 // a valid topological ordering as nothing is going to go back and re-sort 1770 // these nodes. We continually insert before 'N' in sequence as this is 1771 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no 1772 // hierarchy left to express. 
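// For instance, with C1 == 2 the nodes created above rewrite
// "(X >> 6) & 0x3FC" as "((X >> 8) & 0xFF) << 2": the AND becomes the index
// register and the scale becomes 4.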
1773 insertDAGNode(DAG, N, Eight); 1774 insertDAGNode(DAG, N, Srl); 1775 insertDAGNode(DAG, N, NewMask); 1776 insertDAGNode(DAG, N, And); 1777 insertDAGNode(DAG, N, ShlCount); 1778 insertDAGNode(DAG, N, Shl); 1779 DAG.ReplaceAllUsesWith(N, Shl); 1780 DAG.RemoveDeadNode(N.getNode()); 1781 AM.IndexReg = And; 1782 AM.Scale = (1 << ScaleLog); 1783 return false; 1784 } 1785 1786 // Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this 1787 // allows us to fold the shift into this addressing mode. Returns false if the 1788 // transform succeeded. 1789 static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N, 1790 X86ISelAddressMode &AM) { 1791 SDValue Shift = N.getOperand(0); 1792 1793 // Use a signed mask so that shifting right will insert sign bits. These 1794 // bits will be removed when we shift the result left so it doesn't matter 1795 // what we use. This might allow a smaller immediate encoding. 1796 int64_t Mask = cast<ConstantSDNode>(N->getOperand(1))->getSExtValue(); 1797 1798 // If we have an any_extend feeding the AND, look through it to see if there 1799 // is a shift behind it. But only if the AND doesn't use the extended bits. 1800 // FIXME: Generalize this to other ANY_EXTEND than i32 to i64? 1801 bool FoundAnyExtend = false; 1802 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() && 1803 Shift.getOperand(0).getSimpleValueType() == MVT::i32 && 1804 isUInt<32>(Mask)) { 1805 FoundAnyExtend = true; 1806 Shift = Shift.getOperand(0); 1807 } 1808 1809 if (Shift.getOpcode() != ISD::SHL || 1810 !isa<ConstantSDNode>(Shift.getOperand(1))) 1811 return true; 1812 1813 SDValue X = Shift.getOperand(0); 1814 1815 // Not likely to be profitable if either the AND or SHIFT node has more 1816 // than one use (unless all uses are for address computation). Besides, 1817 // isel mechanism requires their node ids to be reused. 1818 if (!N.hasOneUse() || !Shift.hasOneUse()) 1819 return true; 1820 1821 // Verify that the shift amount is something we can fold. 1822 unsigned ShiftAmt = Shift.getConstantOperandVal(1); 1823 if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3) 1824 return true; 1825 1826 MVT VT = N.getSimpleValueType(); 1827 SDLoc DL(N); 1828 if (FoundAnyExtend) { 1829 SDValue NewX = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X); 1830 insertDAGNode(DAG, N, NewX); 1831 X = NewX; 1832 } 1833 1834 SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, DL, VT); 1835 SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask); 1836 SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1)); 1837 1838 // Insert the new nodes into the topological ordering. We must do this in 1839 // a valid topological ordering as nothing is going to go back and re-sort 1840 // these nodes. We continually insert before 'N' in sequence as this is 1841 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no 1842 // hierarchy left to express. 1843 insertDAGNode(DAG, N, NewMask); 1844 insertDAGNode(DAG, N, NewAnd); 1845 insertDAGNode(DAG, N, NewShift); 1846 DAG.ReplaceAllUsesWith(N, NewShift); 1847 DAG.RemoveDeadNode(N.getNode()); 1848 1849 AM.Scale = 1 << ShiftAmt; 1850 AM.IndexReg = NewAnd; 1851 return false; 1852 } 1853 1854 // Implement some heroics to detect shifts of masked values where the mask can 1855 // be replaced by extending the shift and undoing that in the addressing mode 1856 // scale. 
Patterns such as (shl (srl x, c1), c2) are canonicalized into (and 1857 // (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in 1858 // the addressing mode. This results in code such as: 1859 // 1860 // int f(short *y, int *lookup_table) { 1861 // ... 1862 // return *y + lookup_table[*y >> 11]; 1863 // } 1864 // 1865 // Turning into: 1866 // movzwl (%rdi), %eax 1867 // movl %eax, %ecx 1868 // shrl $11, %ecx 1869 // addl (%rsi,%rcx,4), %eax 1870 // 1871 // Instead of: 1872 // movzwl (%rdi), %eax 1873 // movl %eax, %ecx 1874 // shrl $9, %ecx 1875 // andl $124, %rcx 1876 // addl (%rsi,%rcx), %eax 1877 // 1878 // Note that this function assumes the mask is provided as a mask *after* the 1879 // value is shifted. The input chain may or may not match that, but computing 1880 // such a mask is trivial. 1881 static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, 1882 uint64_t Mask, 1883 SDValue Shift, SDValue X, 1884 X86ISelAddressMode &AM) { 1885 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() || 1886 !isa<ConstantSDNode>(Shift.getOperand(1))) 1887 return true; 1888 1889 unsigned ShiftAmt = Shift.getConstantOperandVal(1); 1890 unsigned MaskLZ = countLeadingZeros(Mask); 1891 unsigned MaskTZ = countTrailingZeros(Mask); 1892 1893 // The amount of shift we're trying to fit into the addressing mode is taken 1894 // from the trailing zeros of the mask. 1895 unsigned AMShiftAmt = MaskTZ; 1896 1897 // There is nothing we can do here unless the mask is removing some bits. 1898 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits. 1899 if (AMShiftAmt == 0 || AMShiftAmt > 3) return true; 1900 1901 // We also need to ensure that mask is a continuous run of bits. 1902 if (countTrailingOnes(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true; 1903 1904 // Scale the leading zero count down based on the actual size of the value. 1905 // Also scale it down based on the size of the shift. 1906 unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt; 1907 if (MaskLZ < ScaleDown) 1908 return true; 1909 MaskLZ -= ScaleDown; 1910 1911 // The final check is to ensure that any masked out high bits of X are 1912 // already known to be zero. Otherwise, the mask has a semantic impact 1913 // other than masking out a couple of low bits. Unfortunately, because of 1914 // the mask, zero extensions will be removed from operands in some cases. 1915 // This code works extra hard to look through extensions because we can 1916 // replace them with zero extensions cheaply if necessary. 1917 bool ReplacingAnyExtend = false; 1918 if (X.getOpcode() == ISD::ANY_EXTEND) { 1919 unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() - 1920 X.getOperand(0).getSimpleValueType().getSizeInBits(); 1921 // Assume that we'll replace the any-extend with a zero-extend, and 1922 // narrow the search to the extended value. 1923 X = X.getOperand(0); 1924 MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits; 1925 ReplacingAnyExtend = true; 1926 } 1927 APInt MaskedHighBits = 1928 APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ); 1929 KnownBits Known = DAG.computeKnownBits(X); 1930 if (MaskedHighBits != Known.Zero) return true; 1931 1932 // We've identified a pattern that can be transformed into a single shift 1933 // and an addressing mode. Make it so. 1934 MVT VT = N.getSimpleValueType(); 1935 if (ReplacingAnyExtend) { 1936 assert(X.getValueType() != VT); 1937 // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND. 
1938 SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X); 1939 insertDAGNode(DAG, N, NewX); 1940 X = NewX; 1941 } 1942 SDLoc DL(N); 1943 SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8); 1944 SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt); 1945 SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8); 1946 SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewSRL, NewSHLAmt); 1947 1948 // Insert the new nodes into the topological ordering. We must do this in 1949 // a valid topological ordering as nothing is going to go back and re-sort 1950 // these nodes. We continually insert before 'N' in sequence as this is 1951 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no 1952 // hierarchy left to express. 1953 insertDAGNode(DAG, N, NewSRLAmt); 1954 insertDAGNode(DAG, N, NewSRL); 1955 insertDAGNode(DAG, N, NewSHLAmt); 1956 insertDAGNode(DAG, N, NewSHL); 1957 DAG.ReplaceAllUsesWith(N, NewSHL); 1958 DAG.RemoveDeadNode(N.getNode()); 1959 1960 AM.Scale = 1 << AMShiftAmt; 1961 AM.IndexReg = NewSRL; 1962 return false; 1963 } 1964 1965 // Transform "(X >> SHIFT) & (MASK << C1)" to 1966 // "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be 1967 // matched to a BEXTR later. Returns false if the simplification is performed. 1968 static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N, 1969 uint64_t Mask, 1970 SDValue Shift, SDValue X, 1971 X86ISelAddressMode &AM, 1972 const X86Subtarget &Subtarget) { 1973 if (Shift.getOpcode() != ISD::SRL || 1974 !isa<ConstantSDNode>(Shift.getOperand(1)) || 1975 !Shift.hasOneUse() || !N.hasOneUse()) 1976 return true; 1977 1978 // Only do this if BEXTR will be matched by matchBEXTRFromAndImm. 1979 if (!Subtarget.hasTBM() && 1980 !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR())) 1981 return true; 1982 1983 // We need to ensure that mask is a continuous run of bits. 1984 if (!isShiftedMask_64(Mask)) return true; 1985 1986 unsigned ShiftAmt = Shift.getConstantOperandVal(1); 1987 1988 // The amount of shift we're trying to fit into the addressing mode is taken 1989 // from the trailing zeros of the mask. 1990 unsigned AMShiftAmt = countTrailingZeros(Mask); 1991 1992 // There is nothing we can do here unless the mask is removing some bits. 1993 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits. 1994 if (AMShiftAmt == 0 || AMShiftAmt > 3) return true; 1995 1996 MVT VT = N.getSimpleValueType(); 1997 SDLoc DL(N); 1998 SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8); 1999 SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt); 2000 SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, VT); 2001 SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, NewSRL, NewMask); 2002 SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8); 2003 SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewAnd, NewSHLAmt); 2004 2005 // Insert the new nodes into the topological ordering. We must do this in 2006 // a valid topological ordering as nothing is going to go back and re-sort 2007 // these nodes. We continually insert before 'N' in sequence as this is 2008 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no 2009 // hierarchy left to express. 
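// For instance, with SHIFT == 2 and C1 == 3 the nodes created above rewrite
// "(X >> 2) & (0xFF << 3)" as "(((X >> 5) & 0xFF) << 3)": the AND becomes the
// index register with scale 8, and the srl+and is expected to match a BEXTR.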
2010 insertDAGNode(DAG, N, NewSRLAmt); 2011 insertDAGNode(DAG, N, NewSRL); 2012 insertDAGNode(DAG, N, NewMask); 2013 insertDAGNode(DAG, N, NewAnd); 2014 insertDAGNode(DAG, N, NewSHLAmt); 2015 insertDAGNode(DAG, N, NewSHL); 2016 DAG.ReplaceAllUsesWith(N, NewSHL); 2017 DAG.RemoveDeadNode(N.getNode()); 2018 2019 AM.Scale = 1 << AMShiftAmt; 2020 AM.IndexReg = NewAnd; 2021 return false; 2022 } 2023 2024 bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, 2025 unsigned Depth) { 2026 SDLoc dl(N); 2027 LLVM_DEBUG({ 2028 dbgs() << "MatchAddress: "; 2029 AM.dump(CurDAG); 2030 }); 2031 // Limit recursion. 2032 if (Depth > 5) 2033 return matchAddressBase(N, AM); 2034 2035 // If this is already a %rip relative address, we can only merge immediates 2036 // into it. Instead of handling this in every case, we handle it here. 2037 // RIP relative addressing: %rip + 32-bit displacement! 2038 if (AM.isRIPRelative()) { 2039 // FIXME: JumpTable and ExternalSymbol address currently don't like 2040 // displacements. It isn't very important, but this should be fixed for 2041 // consistency. 2042 if (!(AM.ES || AM.MCSym) && AM.JT != -1) 2043 return true; 2044 2045 if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N)) 2046 if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM)) 2047 return false; 2048 return true; 2049 } 2050 2051 switch (N.getOpcode()) { 2052 default: break; 2053 case ISD::LOCAL_RECOVER: { 2054 if (!AM.hasSymbolicDisplacement() && AM.Disp == 0) 2055 if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) { 2056 // Use the symbol and don't prefix it. 2057 AM.MCSym = ESNode->getMCSymbol(); 2058 return false; 2059 } 2060 break; 2061 } 2062 case ISD::Constant: { 2063 uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue(); 2064 if (!foldOffsetIntoAddress(Val, AM)) 2065 return false; 2066 break; 2067 } 2068 2069 case X86ISD::Wrapper: 2070 case X86ISD::WrapperRIP: 2071 if (!matchWrapper(N, AM)) 2072 return false; 2073 break; 2074 2075 case ISD::LOAD: 2076 if (!matchLoadInAddress(cast<LoadSDNode>(N), AM)) 2077 return false; 2078 break; 2079 2080 case ISD::FrameIndex: 2081 if (AM.BaseType == X86ISelAddressMode::RegBase && 2082 AM.Base_Reg.getNode() == nullptr && 2083 (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) { 2084 AM.BaseType = X86ISelAddressMode::FrameIndexBase; 2085 AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex(); 2086 return false; 2087 } 2088 break; 2089 2090 case ISD::SHL: 2091 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) 2092 break; 2093 2094 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) { 2095 unsigned Val = CN->getZExtValue(); 2096 // Note that we handle x<<1 as (,x,2) rather than (x,x) here so 2097 // that the base operand remains free for further matching. If 2098 // the base doesn't end up getting used, a post-processing step 2099 // in MatchAddress turns (,x,2) into (x,x), which is cheaper. 2100 if (Val == 1 || Val == 2 || Val == 3) { 2101 AM.Scale = 1 << Val; 2102 SDValue ShVal = N.getOperand(0); 2103 2104 // Okay, we know that we have a scale by now. However, if the scaled 2105 // value is an add of something and a constant, we can fold the 2106 // constant into the disp field here. 
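// For example, if ShVal is (add %x, 4) and the shift amount is 1, the match
// becomes index = %x, scale = 2, with 8 added to the displacement.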
2107 if (CurDAG->isBaseWithConstantOffset(ShVal)) { 2108 AM.IndexReg = ShVal.getOperand(0); 2109 ConstantSDNode *AddVal = cast<ConstantSDNode>(ShVal.getOperand(1)); 2110 uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val; 2111 if (!foldOffsetIntoAddress(Disp, AM)) 2112 return false; 2113 } 2114 2115 AM.IndexReg = ShVal; 2116 return false; 2117 } 2118 } 2119 break; 2120 2121 case ISD::SRL: { 2122 // Scale must not be used already. 2123 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break; 2124 2125 // We only handle up to 64-bit values here as those are what matter for 2126 // addressing mode optimizations. 2127 assert(N.getSimpleValueType().getSizeInBits() <= 64 && 2128 "Unexpected value size!"); 2129 2130 SDValue And = N.getOperand(0); 2131 if (And.getOpcode() != ISD::AND) break; 2132 SDValue X = And.getOperand(0); 2133 2134 // The mask used for the transform is expected to be post-shift, but we 2135 // found the shift first so just apply the shift to the mask before passing 2136 // it down. 2137 if (!isa<ConstantSDNode>(N.getOperand(1)) || 2138 !isa<ConstantSDNode>(And.getOperand(1))) 2139 break; 2140 uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1); 2141 2142 // Try to fold the mask and shift into the scale, and return false if we 2143 // succeed. 2144 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM)) 2145 return false; 2146 break; 2147 } 2148 2149 case ISD::SMUL_LOHI: 2150 case ISD::UMUL_LOHI: 2151 // A mul_lohi where we need the low part can be folded as a plain multiply. 2152 if (N.getResNo() != 0) break; 2153 LLVM_FALLTHROUGH; 2154 case ISD::MUL: 2155 case X86ISD::MUL_IMM: 2156 // X*[3,5,9] -> X+X*[2,4,8] 2157 if (AM.BaseType == X86ISelAddressMode::RegBase && 2158 AM.Base_Reg.getNode() == nullptr && 2159 AM.IndexReg.getNode() == nullptr) { 2160 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) 2161 if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 || 2162 CN->getZExtValue() == 9) { 2163 AM.Scale = unsigned(CN->getZExtValue())-1; 2164 2165 SDValue MulVal = N.getOperand(0); 2166 SDValue Reg; 2167 2168 // Okay, we know that we have a scale by now. However, if the scaled 2169 // value is an add of something and a constant, we can fold the 2170 // constant into the disp field here. 2171 if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() && 2172 isa<ConstantSDNode>(MulVal.getOperand(1))) { 2173 Reg = MulVal.getOperand(0); 2174 ConstantSDNode *AddVal = 2175 cast<ConstantSDNode>(MulVal.getOperand(1)); 2176 uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue(); 2177 if (foldOffsetIntoAddress(Disp, AM)) 2178 Reg = N.getOperand(0); 2179 } else { 2180 Reg = N.getOperand(0); 2181 } 2182 2183 AM.IndexReg = AM.Base_Reg = Reg; 2184 return false; 2185 } 2186 } 2187 break; 2188 2189 case ISD::SUB: { 2190 // Given A-B, if A can be completely folded into the address and 2191 // the index field with the index field unused, use -B as the index. 2192 // This is a win if a has multiple parts that can be folded into 2193 // the address. Also, this saves a mov if the base register has 2194 // other uses, since it avoids a two-address sub instruction, however 2195 // it costs an additional mov if the index register has other uses. 2196 2197 // Add an artificial use to this node so that we can keep track of 2198 // it if it gets CSE'd with a different node. 2199 HandleSDNode Handle(N); 2200 2201 // Test if the LHS of the sub can be folded. 
2202 X86ISelAddressMode Backup = AM; 2203 if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) { 2204 N = Handle.getValue(); 2205 AM = Backup; 2206 break; 2207 } 2208 N = Handle.getValue(); 2209 // Test if the index field is free for use. 2210 if (AM.IndexReg.getNode() || AM.isRIPRelative()) { 2211 AM = Backup; 2212 break; 2213 } 2214 2215 int Cost = 0; 2216 SDValue RHS = N.getOperand(1); 2217 // If the RHS involves a register with multiple uses, this 2218 // transformation incurs an extra mov, due to the neg instruction 2219 // clobbering its operand. 2220 if (!RHS.getNode()->hasOneUse() || 2221 RHS.getNode()->getOpcode() == ISD::CopyFromReg || 2222 RHS.getNode()->getOpcode() == ISD::TRUNCATE || 2223 RHS.getNode()->getOpcode() == ISD::ANY_EXTEND || 2224 (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND && 2225 RHS.getOperand(0).getValueType() == MVT::i32)) 2226 ++Cost; 2227 // If the base is a register with multiple uses, this 2228 // transformation may save a mov. 2229 if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() && 2230 !AM.Base_Reg.getNode()->hasOneUse()) || 2231 AM.BaseType == X86ISelAddressMode::FrameIndexBase) 2232 --Cost; 2233 // If the folded LHS was interesting, this transformation saves 2234 // address arithmetic. 2235 if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) + 2236 ((AM.Disp != 0) && (Backup.Disp == 0)) + 2237 (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2) 2238 --Cost; 2239 // If it doesn't look like it may be an overall win, don't do it. 2240 if (Cost >= 0) { 2241 AM = Backup; 2242 break; 2243 } 2244 2245 // Ok, the transformation is legal and appears profitable. Go for it. 2246 // Negation will be emitted later to avoid creating dangling nodes if this 2247 // was an unprofitable LEA. 2248 AM.IndexReg = RHS; 2249 AM.NegateIndex = true; 2250 AM.Scale = 1; 2251 return false; 2252 } 2253 2254 case ISD::ADD: 2255 if (!matchAdd(N, AM, Depth)) 2256 return false; 2257 break; 2258 2259 case ISD::OR: 2260 // We want to look through a transform in InstCombine and DAGCombiner that 2261 // turns 'add' into 'or', so we can treat this 'or' exactly like an 'add'. 2262 // Example: (or (and x, 1), (shl y, 3)) --> (add (and x, 1), (shl y, 3)) 2263 // An 'lea' can then be used to match the shift (multiply) and add: 2264 // and $1, %esi 2265 // lea (%rsi, %rdi, 8), %rax 2266 if (CurDAG->haveNoCommonBitsSet(N.getOperand(0), N.getOperand(1)) && 2267 !matchAdd(N, AM, Depth)) 2268 return false; 2269 break; 2270 2271 case ISD::AND: { 2272 // Perform some heroic transforms on an and of a constant-count shift 2273 // with a constant to enable use of the scaled offset field. 2274 2275 // Scale must not be used already. 2276 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break; 2277 2278 // We only handle up to 64-bit values here as those are what matter for 2279 // addressing mode optimizations. 2280 assert(N.getSimpleValueType().getSizeInBits() <= 64 && 2281 "Unexpected value size!"); 2282 2283 if (!isa<ConstantSDNode>(N.getOperand(1))) 2284 break; 2285 2286 if (N.getOperand(0).getOpcode() == ISD::SRL) { 2287 SDValue Shift = N.getOperand(0); 2288 SDValue X = Shift.getOperand(0); 2289 2290 uint64_t Mask = N.getConstantOperandVal(1); 2291 2292 // Try to fold the mask and shift into an extract and scale. 2293 if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM)) 2294 return false; 2295 2296 // Try to fold the mask and shift directly into the scale. 
2297 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM)) 2298 return false; 2299 2300 // Try to fold the mask and shift into BEXTR and scale. 2301 if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget)) 2302 return false; 2303 } 2304 2305 // Try to swap the mask and shift to place shifts which can be done as 2306 // a scale on the outside of the mask. 2307 if (!foldMaskedShiftToScaledMask(*CurDAG, N, AM)) 2308 return false; 2309 2310 break; 2311 } 2312 case ISD::ZERO_EXTEND: { 2313 // Try to widen a zexted shift left to the same size as its use, so we can 2314 // match the shift as a scale factor. 2315 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) 2316 break; 2317 if (N.getOperand(0).getOpcode() != ISD::SHL || !N.getOperand(0).hasOneUse()) 2318 break; 2319 2320 // Give up if the shift is not a valid scale factor [1,2,3]. 2321 SDValue Shl = N.getOperand(0); 2322 auto *ShAmtC = dyn_cast<ConstantSDNode>(Shl.getOperand(1)); 2323 if (!ShAmtC || ShAmtC->getZExtValue() > 3) 2324 break; 2325 2326 // The narrow shift must only shift out zero bits (it must be 'nuw'). 2327 // That makes it safe to widen to the destination type. 2328 APInt HighZeros = APInt::getHighBitsSet(Shl.getValueSizeInBits(), 2329 ShAmtC->getZExtValue()); 2330 if (!CurDAG->MaskedValueIsZero(Shl.getOperand(0), HighZeros)) 2331 break; 2332 2333 // zext (shl nuw i8 %x, C) to i32 --> shl (zext i8 %x to i32), (zext C) 2334 MVT VT = N.getSimpleValueType(); 2335 SDLoc DL(N); 2336 SDValue Zext = CurDAG->getNode(ISD::ZERO_EXTEND, DL, VT, Shl.getOperand(0)); 2337 SDValue NewShl = CurDAG->getNode(ISD::SHL, DL, VT, Zext, Shl.getOperand(1)); 2338 2339 // Convert the shift to scale factor. 2340 AM.Scale = 1 << ShAmtC->getZExtValue(); 2341 AM.IndexReg = Zext; 2342 2343 insertDAGNode(*CurDAG, N, Zext); 2344 insertDAGNode(*CurDAG, N, NewShl); 2345 CurDAG->ReplaceAllUsesWith(N, NewShl); 2346 CurDAG->RemoveDeadNode(N.getNode()); 2347 return false; 2348 } 2349 } 2350 2351 return matchAddressBase(N, AM); 2352 } 2353 2354 /// Helper for MatchAddress. Add the specified node to the 2355 /// specified addressing mode without any further recursion. 2356 bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) { 2357 // Is the base register already occupied? 2358 if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) { 2359 // If so, check to see if the scale index register is set. 2360 if (!AM.IndexReg.getNode()) { 2361 AM.IndexReg = N; 2362 AM.Scale = 1; 2363 return false; 2364 } 2365 2366 // Otherwise, we cannot select it. 2367 return true; 2368 } 2369 2370 // Default, generate it as a register. 2371 AM.BaseType = X86ISelAddressMode::RegBase; 2372 AM.Base_Reg = N; 2373 return false; 2374 } 2375 2376 /// Helper for selectVectorAddr. Handles things that can be folded into a 2377 /// gather scatter address. The index register and scale should have already 2378 /// been handled. 2379 bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) { 2380 // TODO: Support other operations. 
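// Only constant offsets and (non-RIP-relative) X86ISD::Wrapper symbols are
// folded here; anything that cannot be folded falls through to
// matchAddressBase below.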
2381 switch (N.getOpcode()) { 2382 case ISD::Constant: { 2383 uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue(); 2384 if (!foldOffsetIntoAddress(Val, AM)) 2385 return false; 2386 break; 2387 } 2388 case X86ISD::Wrapper: 2389 if (!matchWrapper(N, AM)) 2390 return false; 2391 break; 2392 } 2393 2394 return matchAddressBase(N, AM); 2395 } 2396 2397 bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, 2398 SDValue IndexOp, SDValue ScaleOp, 2399 SDValue &Base, SDValue &Scale, 2400 SDValue &Index, SDValue &Disp, 2401 SDValue &Segment) { 2402 X86ISelAddressMode AM; 2403 AM.IndexReg = IndexOp; 2404 AM.Scale = cast<ConstantSDNode>(ScaleOp)->getZExtValue(); 2405 2406 unsigned AddrSpace = Parent->getPointerInfo().getAddrSpace(); 2407 if (AddrSpace == X86AS::GS) 2408 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16); 2409 if (AddrSpace == X86AS::FS) 2410 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16); 2411 if (AddrSpace == X86AS::SS) 2412 AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16); 2413 2414 SDLoc DL(BasePtr); 2415 MVT VT = BasePtr.getSimpleValueType(); 2416 2417 // Try to match into the base and displacement fields. 2418 if (matchVectorAddress(BasePtr, AM)) 2419 return false; 2420 2421 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment); 2422 return true; 2423 } 2424 2425 /// Returns true if it is able to pattern match an addressing mode. 2426 /// It returns the operands which make up the maximal addressing mode it can 2427 /// match by reference. 2428 /// 2429 /// Parent is the parent node of the addr operand that is being matched. It 2430 /// is always a load, store, atomic node, or null. It is only null when 2431 /// checking memory operands for inline asm nodes. 2432 bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base, 2433 SDValue &Scale, SDValue &Index, 2434 SDValue &Disp, SDValue &Segment) { 2435 X86ISelAddressMode AM; 2436 2437 if (Parent && 2438 // This list of opcodes are all the nodes that have an "addr:$ptr" operand 2439 // that are not a MemSDNode, and thus don't have proper addrspace info. 2440 Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme 2441 Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores 2442 Parent->getOpcode() != X86ISD::TLSCALL && // Fixme 2443 Parent->getOpcode() != X86ISD::ENQCMD && // Fixme 2444 Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme 2445 Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp 2446 Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp 2447 unsigned AddrSpace = 2448 cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace(); 2449 if (AddrSpace == X86AS::GS) 2450 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16); 2451 if (AddrSpace == X86AS::FS) 2452 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16); 2453 if (AddrSpace == X86AS::SS) 2454 AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16); 2455 } 2456 2457 // Save the DL and VT before calling matchAddress, it can invalidate N. 
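// Note the inverted conventions: matchAddress returns true when it fails,
// while selectAddr returns true only when an addressing mode was matched.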
2458 SDLoc DL(N); 2459 MVT VT = N.getSimpleValueType(); 2460 2461 if (matchAddress(N, AM)) 2462 return false; 2463 2464 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment); 2465 return true; 2466 } 2467 2468 bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) { 2469 // In static codegen with small code model, we can get the address of a label 2470 // into a register with 'movl' 2471 if (N->getOpcode() != X86ISD::Wrapper) 2472 return false; 2473 2474 N = N.getOperand(0); 2475 2476 // At least GNU as does not accept 'movl' for TPOFF relocations. 2477 // FIXME: We could use 'movl' when we know we are targeting MC. 2478 if (N->getOpcode() == ISD::TargetGlobalTLSAddress) 2479 return false; 2480 2481 Imm = N; 2482 if (N->getOpcode() != ISD::TargetGlobalAddress) 2483 return TM.getCodeModel() == CodeModel::Small; 2484 2485 Optional<ConstantRange> CR = 2486 cast<GlobalAddressSDNode>(N)->getGlobal()->getAbsoluteSymbolRange(); 2487 if (!CR) 2488 return TM.getCodeModel() == CodeModel::Small; 2489 2490 return CR->getUnsignedMax().ult(1ull << 32); 2491 } 2492 2493 bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base, 2494 SDValue &Scale, SDValue &Index, 2495 SDValue &Disp, SDValue &Segment) { 2496 // Save the debug loc before calling selectLEAAddr, in case it invalidates N. 2497 SDLoc DL(N); 2498 2499 if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment)) 2500 return false; 2501 2502 RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Base); 2503 if (RN && RN->getReg() == 0) 2504 Base = CurDAG->getRegister(0, MVT::i64); 2505 else if (Base.getValueType() == MVT::i32 && !isa<FrameIndexSDNode>(Base)) { 2506 // Base could already be %rip, particularly in the x32 ABI. 2507 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL, 2508 MVT::i64), 0); 2509 Base = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef, 2510 Base); 2511 } 2512 2513 RN = dyn_cast<RegisterSDNode>(Index); 2514 if (RN && RN->getReg() == 0) 2515 Index = CurDAG->getRegister(0, MVT::i64); 2516 else { 2517 assert(Index.getValueType() == MVT::i32 && 2518 "Expect to be extending 32-bit registers for use in LEA"); 2519 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL, 2520 MVT::i64), 0); 2521 Index = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef, 2522 Index); 2523 } 2524 2525 return true; 2526 } 2527 2528 /// Calls SelectAddr and determines if the maximal addressing 2529 /// mode it matches can be cost effectively emitted as an LEA instruction. 2530 bool X86DAGToDAGISel::selectLEAAddr(SDValue N, 2531 SDValue &Base, SDValue &Scale, 2532 SDValue &Index, SDValue &Disp, 2533 SDValue &Segment) { 2534 X86ISelAddressMode AM; 2535 2536 // Save the DL and VT before calling matchAddress, it can invalidate N. 2537 SDLoc DL(N); 2538 MVT VT = N.getSimpleValueType(); 2539 2540 // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support 2541 // segments. 2542 SDValue Copy = AM.Segment; 2543 SDValue T = CurDAG->getRegister(0, MVT::i32); 2544 AM.Segment = T; 2545 if (matchAddress(N, AM)) 2546 return false; 2547 assert (T == AM.Segment); 2548 AM.Segment = Copy; 2549 2550 unsigned Complexity = 0; 2551 if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode()) 2552 Complexity = 1; 2553 else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase) 2554 Complexity = 4; 2555 2556 if (AM.IndexReg.getNode()) 2557 Complexity++; 2558 2559 // Don't match just leal(,%reg,2). 
It's cheaper to do addl %reg, %reg, or with 2560 // a simple shift. 2561 if (AM.Scale > 1) 2562 Complexity++; 2563 2564 // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA 2565 // to a LEA. This is determined with some experimentation but is by no means 2566 // optimal (especially for code size consideration). LEA is nice because of 2567 // its three-address nature. Tweak the cost function again when we can run 2568 // convertToThreeAddress() at register allocation time. 2569 if (AM.hasSymbolicDisplacement()) { 2570 // For X86-64, always use LEA to materialize RIP-relative addresses. 2571 if (Subtarget->is64Bit()) 2572 Complexity = 4; 2573 else 2574 Complexity += 2; 2575 } 2576 2577 // Heuristic: try harder to form an LEA from ADD if the operands set flags. 2578 // Unlike ADD, LEA does not affect flags, so we will be less likely to require 2579 // duplicating flag-producing instructions later in the pipeline. 2580 if (N.getOpcode() == ISD::ADD) { 2581 auto isMathWithFlags = [](SDValue V) { 2582 switch (V.getOpcode()) { 2583 case X86ISD::ADD: 2584 case X86ISD::SUB: 2585 case X86ISD::ADC: 2586 case X86ISD::SBB: 2587 /* TODO: These opcodes can be added safely, but we may want to justify 2588 their inclusion for different reasons (better for reg-alloc). 2589 case X86ISD::SMUL: 2590 case X86ISD::UMUL: 2591 case X86ISD::OR: 2592 case X86ISD::XOR: 2593 case X86ISD::AND: 2594 */ 2595 // Value 1 is the flag output of the node - verify it's not dead. 2596 return !SDValue(V.getNode(), 1).use_empty(); 2597 default: 2598 return false; 2599 } 2600 }; 2601 // TODO: This could be an 'or' rather than 'and' to make the transform more 2602 // likely to happen. We might want to factor in whether there's a 2603 // load folding opportunity for the math op that disappears with LEA. 2604 if (isMathWithFlags(N.getOperand(0)) && isMathWithFlags(N.getOperand(1))) 2605 Complexity++; 2606 } 2607 2608 if (AM.Disp) 2609 Complexity++; 2610 2611 // If it isn't worth using an LEA, reject it. 2612 if (Complexity <= 2) 2613 return false; 2614 2615 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment); 2616 return true; 2617 } 2618 2619 /// This is only run on TargetGlobalTLSAddress nodes. 2620 bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base, 2621 SDValue &Scale, SDValue &Index, 2622 SDValue &Disp, SDValue &Segment) { 2623 assert(N.getOpcode() == ISD::TargetGlobalTLSAddress); 2624 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N); 2625 2626 X86ISelAddressMode AM; 2627 AM.GV = GA->getGlobal(); 2628 AM.Disp += GA->getOffset(); 2629 AM.SymbolFlags = GA->getTargetFlags(); 2630 2631 MVT VT = N.getSimpleValueType(); 2632 if (VT == MVT::i32) { 2633 AM.Scale = 1; 2634 AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32); 2635 } 2636 2637 getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment); 2638 return true; 2639 } 2640 2641 bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) { 2642 // Keep track of the original value type and whether this value was 2643 // truncated. If we see a truncation from pointer type to VT that truncates 2644 // bits that are known to be zero, we can use a narrow reference. 
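// For example, a global annotated with !absolute_symbol metadata whose range
// fits in VT can be referenced with the narrower immediate.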
2645 EVT VT = N.getValueType(); 2646 bool WasTruncated = false; 2647 if (N.getOpcode() == ISD::TRUNCATE) { 2648 WasTruncated = true; 2649 N = N.getOperand(0); 2650 } 2651 2652 if (N.getOpcode() != X86ISD::Wrapper) 2653 return false; 2654 2655 // We can only use non-GlobalValues as immediates if they were not truncated, 2656 // as we do not have any range information. If we have a GlobalValue and the 2657 // address was not truncated, we can select it as an operand directly. 2658 unsigned Opc = N.getOperand(0)->getOpcode(); 2659 if (Opc != ISD::TargetGlobalAddress || !WasTruncated) { 2660 Op = N.getOperand(0); 2661 // We can only select the operand directly if we didn't have to look past a 2662 // truncate. 2663 return !WasTruncated; 2664 } 2665 2666 // Check that the global's range fits into VT. 2667 auto *GA = cast<GlobalAddressSDNode>(N.getOperand(0)); 2668 Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange(); 2669 if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits())) 2670 return false; 2671 2672 // Okay, we can use a narrow reference. 2673 Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT, 2674 GA->getOffset(), GA->getTargetFlags()); 2675 return true; 2676 } 2677 2678 bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N, 2679 SDValue &Base, SDValue &Scale, 2680 SDValue &Index, SDValue &Disp, 2681 SDValue &Segment) { 2682 assert(Root && P && "Unknown root/parent nodes"); 2683 if (!ISD::isNON_EXTLoad(N.getNode()) || 2684 !IsProfitableToFold(N, P, Root) || 2685 !IsLegalToFold(N, P, Root, OptLevel)) 2686 return false; 2687 2688 return selectAddr(N.getNode(), 2689 N.getOperand(1), Base, Scale, Index, Disp, Segment); 2690 } 2691 2692 bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N, 2693 SDValue &Base, SDValue &Scale, 2694 SDValue &Index, SDValue &Disp, 2695 SDValue &Segment) { 2696 assert(Root && P && "Unknown root/parent nodes"); 2697 if (N->getOpcode() != X86ISD::VBROADCAST_LOAD || 2698 !IsProfitableToFold(N, P, Root) || 2699 !IsLegalToFold(N, P, Root, OptLevel)) 2700 return false; 2701 2702 return selectAddr(N.getNode(), 2703 N.getOperand(1), Base, Scale, Index, Disp, Segment); 2704 } 2705 2706 /// Return an SDNode that returns the value of the global base register. 2707 /// Output instructions required to initialize the global base register, 2708 /// if necessary. 
2709 SDNode *X86DAGToDAGISel::getGlobalBaseReg() { 2710 unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF); 2711 auto &DL = MF->getDataLayout(); 2712 return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode(); 2713 } 2714 2715 bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const { 2716 if (N->getOpcode() == ISD::TRUNCATE) 2717 N = N->getOperand(0).getNode(); 2718 if (N->getOpcode() != X86ISD::Wrapper) 2719 return false; 2720 2721 auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0)); 2722 if (!GA) 2723 return false; 2724 2725 Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange(); 2726 return CR && CR->getSignedMin().sge(-1ull << Width) && 2727 CR->getSignedMax().slt(1ull << Width); 2728 } 2729 2730 static X86::CondCode getCondFromNode(SDNode *N) { 2731 assert(N->isMachineOpcode() && "Unexpected node"); 2732 X86::CondCode CC = X86::COND_INVALID; 2733 unsigned Opc = N->getMachineOpcode(); 2734 if (Opc == X86::JCC_1) 2735 CC = static_cast<X86::CondCode>(N->getConstantOperandVal(1)); 2736 else if (Opc == X86::SETCCr) 2737 CC = static_cast<X86::CondCode>(N->getConstantOperandVal(0)); 2738 else if (Opc == X86::SETCCm) 2739 CC = static_cast<X86::CondCode>(N->getConstantOperandVal(5)); 2740 else if (Opc == X86::CMOV16rr || Opc == X86::CMOV32rr || 2741 Opc == X86::CMOV64rr) 2742 CC = static_cast<X86::CondCode>(N->getConstantOperandVal(2)); 2743 else if (Opc == X86::CMOV16rm || Opc == X86::CMOV32rm || 2744 Opc == X86::CMOV64rm) 2745 CC = static_cast<X86::CondCode>(N->getConstantOperandVal(6)); 2746 2747 return CC; 2748 } 2749 2750 /// Test whether the given X86ISD::CMP node has any users that use a flag 2751 /// other than ZF. 2752 bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const { 2753 // Examine each user of the node. 2754 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end(); 2755 UI != UE; ++UI) { 2756 // Only check things that use the flags. 2757 if (UI.getUse().getResNo() != Flags.getResNo()) 2758 continue; 2759 // Only examine CopyToReg uses that copy to EFLAGS. 2760 if (UI->getOpcode() != ISD::CopyToReg || 2761 cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS) 2762 return false; 2763 // Examine each user of the CopyToReg use. 2764 for (SDNode::use_iterator FlagUI = UI->use_begin(), 2765 FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) { 2766 // Only examine the Flag result. 2767 if (FlagUI.getUse().getResNo() != 1) continue; 2768 // Anything unusual: assume conservatively. 2769 if (!FlagUI->isMachineOpcode()) return false; 2770 // Examine the condition code of the user. 2771 X86::CondCode CC = getCondFromNode(*FlagUI); 2772 2773 switch (CC) { 2774 // Comparisons which only use the zero flag. 2775 case X86::COND_E: case X86::COND_NE: 2776 continue; 2777 // Anything else: assume conservatively. 2778 default: 2779 return false; 2780 } 2781 } 2782 } 2783 return true; 2784 } 2785 2786 /// Test whether the given X86ISD::CMP node has any uses which require the SF 2787 /// flag to be accurate. 2788 bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const { 2789 // Examine each user of the node. 2790 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end(); 2791 UI != UE; ++UI) { 2792 // Only check things that use the flags. 2793 if (UI.getUse().getResNo() != Flags.getResNo()) 2794 continue; 2795 // Only examine CopyToReg uses that copy to EFLAGS. 
2796 if (UI->getOpcode() != ISD::CopyToReg || 2797 cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS) 2798 return false; 2799 // Examine each user of the CopyToReg use. 2800 for (SDNode::use_iterator FlagUI = UI->use_begin(), 2801 FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) { 2802 // Only examine the Flag result. 2803 if (FlagUI.getUse().getResNo() != 1) continue; 2804 // Anything unusual: assume conservatively. 2805 if (!FlagUI->isMachineOpcode()) return false; 2806 // Examine the condition code of the user. 2807 X86::CondCode CC = getCondFromNode(*FlagUI); 2808 2809 switch (CC) { 2810 // Comparisons which don't examine the SF flag. 2811 case X86::COND_A: case X86::COND_AE: 2812 case X86::COND_B: case X86::COND_BE: 2813 case X86::COND_E: case X86::COND_NE: 2814 case X86::COND_O: case X86::COND_NO: 2815 case X86::COND_P: case X86::COND_NP: 2816 continue; 2817 // Anything else: assume conservatively. 2818 default: 2819 return false; 2820 } 2821 } 2822 } 2823 return true; 2824 } 2825 2826 static bool mayUseCarryFlag(X86::CondCode CC) { 2827 switch (CC) { 2828 // Comparisons which don't examine the CF flag. 2829 case X86::COND_O: case X86::COND_NO: 2830 case X86::COND_E: case X86::COND_NE: 2831 case X86::COND_S: case X86::COND_NS: 2832 case X86::COND_P: case X86::COND_NP: 2833 case X86::COND_L: case X86::COND_GE: 2834 case X86::COND_G: case X86::COND_LE: 2835 return false; 2836 // Anything else: assume conservatively. 2837 default: 2838 return true; 2839 } 2840 } 2841 2842 /// Test whether the given node which sets flags has any uses which require the 2843 /// CF flag to be accurate. 2844 bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const { 2845 // Examine each user of the node. 2846 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end(); 2847 UI != UE; ++UI) { 2848 // Only check things that use the flags. 2849 if (UI.getUse().getResNo() != Flags.getResNo()) 2850 continue; 2851 2852 unsigned UIOpc = UI->getOpcode(); 2853 2854 if (UIOpc == ISD::CopyToReg) { 2855 // Only examine CopyToReg uses that copy to EFLAGS. 2856 if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS) 2857 return false; 2858 // Examine each user of the CopyToReg use. 2859 for (SDNode::use_iterator FlagUI = UI->use_begin(), FlagUE = UI->use_end(); 2860 FlagUI != FlagUE; ++FlagUI) { 2861 // Only examine the Flag result. 2862 if (FlagUI.getUse().getResNo() != 1) 2863 continue; 2864 // Anything unusual: assume conservatively. 2865 if (!FlagUI->isMachineOpcode()) 2866 return false; 2867 // Examine the condition code of the user. 2868 X86::CondCode CC = getCondFromNode(*FlagUI); 2869 2870 if (mayUseCarryFlag(CC)) 2871 return false; 2872 } 2873 2874 // This CopyToReg is ok. Move on to the next user. 2875 continue; 2876 } 2877 2878 // This might be an unselected node. So look for the pre-isel opcodes that 2879 // use flags. 2880 unsigned CCOpNo; 2881 switch (UIOpc) { 2882 default: 2883 // Something unusual. Be conservative. 2884 return false; 2885 case X86ISD::SETCC: CCOpNo = 0; break; 2886 case X86ISD::SETCC_CARRY: CCOpNo = 0; break; 2887 case X86ISD::CMOV: CCOpNo = 2; break; 2888 case X86ISD::BRCOND: CCOpNo = 2; break; 2889 } 2890 2891 X86::CondCode CC = (X86::CondCode)UI->getConstantOperandVal(CCOpNo); 2892 if (mayUseCarryFlag(CC)) 2893 return false; 2894 } 2895 return true; 2896 } 2897 2898 /// Check whether or not the chain ending in StoreNode is suitable for doing 2899 /// the {load; op; store} to modify transformation. 
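/// For example, (store (add (loadi64 addr), -1), addr) can later be selected
/// as a single DEC64m, provided the loaded value has no other uses, the load
/// and store use the same address, and merging their chains does not create a
/// cycle.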
2900 static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode, 2901 SDValue StoredVal, SelectionDAG *CurDAG, 2902 unsigned LoadOpNo, 2903 LoadSDNode *&LoadNode, 2904 SDValue &InputChain) { 2905 // Is the stored value result 0 of the operation? 2906 if (StoredVal.getResNo() != 0) return false; 2907 2908 // Are there other uses of the operation other than the store? 2909 if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false; 2910 2911 // Is the store non-extending and non-indexed? 2912 if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal()) 2913 return false; 2914 2915 SDValue Load = StoredVal->getOperand(LoadOpNo); 2916 // Is the stored value a non-extending and non-indexed load? 2917 if (!ISD::isNormalLoad(Load.getNode())) return false; 2918 2919 // Return LoadNode by reference. 2920 LoadNode = cast<LoadSDNode>(Load); 2921 2922 // Is store the only read of the loaded value? 2923 if (!Load.hasOneUse()) 2924 return false; 2925 2926 // Is the address of the store the same as the load? 2927 if (LoadNode->getBasePtr() != StoreNode->getBasePtr() || 2928 LoadNode->getOffset() != StoreNode->getOffset()) 2929 return false; 2930 2931 bool FoundLoad = false; 2932 SmallVector<SDValue, 4> ChainOps; 2933 SmallVector<const SDNode *, 4> LoopWorklist; 2934 SmallPtrSet<const SDNode *, 16> Visited; 2935 const unsigned int Max = 1024; 2936 2937 // Visualization of Load-Op-Store fusion: 2938 // ------------------------- 2939 // Legend: 2940 // *-lines = Chain operand dependencies. 2941 // |-lines = Normal operand dependencies. 2942 // Dependencies flow down and right. n-suffix references multiple nodes. 2943 // 2944 // C Xn C 2945 // * * * 2946 // * * * 2947 // Xn A-LD Yn TF Yn 2948 // * * \ | * | 2949 // * * \ | * | 2950 // * * \ | => A--LD_OP_ST 2951 // * * \| \ 2952 // TF OP \ 2953 // * | \ Zn 2954 // * | \ 2955 // A-ST Zn 2956 // 2957 2958 // This merge induced dependences from: #1: Xn -> LD, OP, Zn 2959 // #2: Yn -> LD 2960 // #3: ST -> Zn 2961 2962 // Ensure the transform is safe by checking for the dual 2963 // dependencies to make sure we do not induce a loop. 2964 2965 // As LD is a predecessor to both OP and ST we can do this by checking: 2966 // a). if LD is a predecessor to a member of Xn or Yn. 2967 // b). if a Zn is a predecessor to ST. 2968 2969 // However, (b) can only occur through being a chain predecessor to 2970 // ST, which is the same as Zn being a member or predecessor of Xn, 2971 // which is a subset of LD being a predecessor of Xn. So it's 2972 // subsumed by check (a). 2973 2974 SDValue Chain = StoreNode->getChain(); 2975 2976 // Gather X elements in ChainOps. 2977 if (Chain == Load.getValue(1)) { 2978 FoundLoad = true; 2979 ChainOps.push_back(Load.getOperand(0)); 2980 } else if (Chain.getOpcode() == ISD::TokenFactor) { 2981 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) { 2982 SDValue Op = Chain.getOperand(i); 2983 if (Op == Load.getValue(1)) { 2984 FoundLoad = true; 2985 // Drop Load, but keep its chain. No cycle check necessary. 2986 ChainOps.push_back(Load.getOperand(0)); 2987 continue; 2988 } 2989 LoopWorklist.push_back(Op.getNode()); 2990 ChainOps.push_back(Op); 2991 } 2992 } 2993 2994 if (!FoundLoad) 2995 return false; 2996 2997 // Worklist is currently Xn. Add Yn to worklist. 
2998 for (SDValue Op : StoredVal->ops()) 2999 if (Op.getNode() != LoadNode) 3000 LoopWorklist.push_back(Op.getNode()); 3001 3002 // Check (a) if Load is a predecessor to Xn + Yn 3003 if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max, 3004 true)) 3005 return false; 3006 3007 InputChain = 3008 CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps); 3009 return true; 3010 } 3011 3012 // Change a chain of {load; op; store} of the same value into a simple op 3013 // through memory of that value, if the uses of the modified value and its 3014 // address are suitable. 3015 // 3016 // The tablegen pattern memory operand pattern is currently not able to match 3017 // the case where the EFLAGS on the original operation are used. 3018 // 3019 // To move this to tablegen, we'll need to improve tablegen to allow flags to 3020 // be transferred from a node in the pattern to the result node, probably with 3021 // a new keyword. For example, we have this 3022 // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", 3023 // [(store (add (loadi64 addr:$dst), -1), addr:$dst), 3024 // (implicit EFLAGS)]>; 3025 // but maybe need something like this 3026 // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", 3027 // [(store (add (loadi64 addr:$dst), -1), addr:$dst), 3028 // (transferrable EFLAGS)]>; 3029 // 3030 // Until then, we manually fold these and instruction select the operation 3031 // here. 3032 bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) { 3033 StoreSDNode *StoreNode = cast<StoreSDNode>(Node); 3034 SDValue StoredVal = StoreNode->getOperand(1); 3035 unsigned Opc = StoredVal->getOpcode(); 3036 3037 // Before we try to select anything, make sure this is memory operand size 3038 // and opcode we can handle. Note that this must match the code below that 3039 // actually lowers the opcodes. 3040 EVT MemVT = StoreNode->getMemoryVT(); 3041 if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 && 3042 MemVT != MVT::i8) 3043 return false; 3044 3045 bool IsCommutable = false; 3046 bool IsNegate = false; 3047 switch (Opc) { 3048 default: 3049 return false; 3050 case X86ISD::SUB: 3051 IsNegate = isNullConstant(StoredVal.getOperand(0)); 3052 break; 3053 case X86ISD::SBB: 3054 break; 3055 case X86ISD::ADD: 3056 case X86ISD::ADC: 3057 case X86ISD::AND: 3058 case X86ISD::OR: 3059 case X86ISD::XOR: 3060 IsCommutable = true; 3061 break; 3062 } 3063 3064 unsigned LoadOpNo = IsNegate ? 1 : 0; 3065 LoadSDNode *LoadNode = nullptr; 3066 SDValue InputChain; 3067 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo, 3068 LoadNode, InputChain)) { 3069 if (!IsCommutable) 3070 return false; 3071 3072 // This operation is commutable, try the other operand. 
3073 LoadOpNo = 1; 3074 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo, 3075 LoadNode, InputChain)) 3076 return false; 3077 } 3078 3079 SDValue Base, Scale, Index, Disp, Segment; 3080 if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp, 3081 Segment)) 3082 return false; 3083 3084 auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16, 3085 unsigned Opc8) { 3086 switch (MemVT.getSimpleVT().SimpleTy) { 3087 case MVT::i64: 3088 return Opc64; 3089 case MVT::i32: 3090 return Opc32; 3091 case MVT::i16: 3092 return Opc16; 3093 case MVT::i8: 3094 return Opc8; 3095 default: 3096 llvm_unreachable("Invalid size!"); 3097 } 3098 }; 3099 3100 MachineSDNode *Result; 3101 switch (Opc) { 3102 case X86ISD::SUB: 3103 // Handle negate. 3104 if (IsNegate) { 3105 unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m, 3106 X86::NEG8m); 3107 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain}; 3108 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, 3109 MVT::Other, Ops); 3110 break; 3111 } 3112 LLVM_FALLTHROUGH; 3113 case X86ISD::ADD: 3114 // Try to match inc/dec. 3115 if (!Subtarget->slowIncDec() || CurDAG->shouldOptForSize()) { 3116 bool IsOne = isOneConstant(StoredVal.getOperand(1)); 3117 bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1)); 3118 // ADD/SUB with 1/-1 and carry flag isn't used can use inc/dec. 3119 if ((IsOne || IsNegOne) && hasNoCarryFlagUses(StoredVal.getValue(1))) { 3120 unsigned NewOpc = 3121 ((Opc == X86ISD::ADD) == IsOne) 3122 ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m) 3123 : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m); 3124 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain}; 3125 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, 3126 MVT::Other, Ops); 3127 break; 3128 } 3129 } 3130 LLVM_FALLTHROUGH; 3131 case X86ISD::ADC: 3132 case X86ISD::SBB: 3133 case X86ISD::AND: 3134 case X86ISD::OR: 3135 case X86ISD::XOR: { 3136 auto SelectRegOpcode = [SelectOpcode](unsigned Opc) { 3137 switch (Opc) { 3138 case X86ISD::ADD: 3139 return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr, 3140 X86::ADD8mr); 3141 case X86ISD::ADC: 3142 return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr, 3143 X86::ADC8mr); 3144 case X86ISD::SUB: 3145 return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr, 3146 X86::SUB8mr); 3147 case X86ISD::SBB: 3148 return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr, 3149 X86::SBB8mr); 3150 case X86ISD::AND: 3151 return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr, 3152 X86::AND8mr); 3153 case X86ISD::OR: 3154 return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr); 3155 case X86ISD::XOR: 3156 return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr, 3157 X86::XOR8mr); 3158 default: 3159 llvm_unreachable("Invalid opcode!"); 3160 } 3161 }; 3162 auto SelectImm8Opcode = [SelectOpcode](unsigned Opc) { 3163 switch (Opc) { 3164 case X86ISD::ADD: 3165 return SelectOpcode(X86::ADD64mi8, X86::ADD32mi8, X86::ADD16mi8, 0); 3166 case X86ISD::ADC: 3167 return SelectOpcode(X86::ADC64mi8, X86::ADC32mi8, X86::ADC16mi8, 0); 3168 case X86ISD::SUB: 3169 return SelectOpcode(X86::SUB64mi8, X86::SUB32mi8, X86::SUB16mi8, 0); 3170 case X86ISD::SBB: 3171 return SelectOpcode(X86::SBB64mi8, X86::SBB32mi8, X86::SBB16mi8, 0); 3172 case X86ISD::AND: 3173 return SelectOpcode(X86::AND64mi8, X86::AND32mi8, X86::AND16mi8, 0); 3174 case X86ISD::OR: 3175 return 
SelectOpcode(X86::OR64mi8, X86::OR32mi8, X86::OR16mi8, 0); 3176 case X86ISD::XOR: 3177 return SelectOpcode(X86::XOR64mi8, X86::XOR32mi8, X86::XOR16mi8, 0); 3178 default: 3179 llvm_unreachable("Invalid opcode!"); 3180 } 3181 }; 3182 auto SelectImmOpcode = [SelectOpcode](unsigned Opc) { 3183 switch (Opc) { 3184 case X86ISD::ADD: 3185 return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi, 3186 X86::ADD8mi); 3187 case X86ISD::ADC: 3188 return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi, 3189 X86::ADC8mi); 3190 case X86ISD::SUB: 3191 return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi, 3192 X86::SUB8mi); 3193 case X86ISD::SBB: 3194 return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi, 3195 X86::SBB8mi); 3196 case X86ISD::AND: 3197 return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi, 3198 X86::AND8mi); 3199 case X86ISD::OR: 3200 return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi, 3201 X86::OR8mi); 3202 case X86ISD::XOR: 3203 return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi, 3204 X86::XOR8mi); 3205 default: 3206 llvm_unreachable("Invalid opcode!"); 3207 } 3208 }; 3209 3210 unsigned NewOpc = SelectRegOpcode(Opc); 3211 SDValue Operand = StoredVal->getOperand(1-LoadOpNo); 3212 3213 // See if the operand is a constant that we can fold into an immediate 3214 // operand. 3215 if (auto *OperandC = dyn_cast<ConstantSDNode>(Operand)) { 3216 int64_t OperandV = OperandC->getSExtValue(); 3217 3218 // Check if we can shrink the operand enough to fit in an immediate (or 3219 // fit into a smaller immediate) by negating it and switching the 3220 // operation. 3221 if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) && 3222 ((MemVT != MVT::i8 && !isInt<8>(OperandV) && isInt<8>(-OperandV)) || 3223 (MemVT == MVT::i64 && !isInt<32>(OperandV) && 3224 isInt<32>(-OperandV))) && 3225 hasNoCarryFlagUses(StoredVal.getValue(1))) { 3226 OperandV = -OperandV; 3227 Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD; 3228 } 3229 3230 // First try to fit this into an Imm8 operand. If it doesn't fit, then try 3231 // the larger immediate operand. 3232 if (MemVT != MVT::i8 && isInt<8>(OperandV)) { 3233 Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT); 3234 NewOpc = SelectImm8Opcode(Opc); 3235 } else if (MemVT != MVT::i64 || isInt<32>(OperandV)) { 3236 Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT); 3237 NewOpc = SelectImmOpcode(Opc); 3238 } 3239 } 3240 3241 if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) { 3242 SDValue CopyTo = 3243 CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS, 3244 StoredVal.getOperand(2), SDValue()); 3245 3246 const SDValue Ops[] = {Base, Scale, Index, Disp, 3247 Segment, Operand, CopyTo, CopyTo.getValue(1)}; 3248 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other, 3249 Ops); 3250 } else { 3251 const SDValue Ops[] = {Base, Scale, Index, Disp, 3252 Segment, Operand, InputChain}; 3253 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other, 3254 Ops); 3255 } 3256 break; 3257 } 3258 default: 3259 llvm_unreachable("Invalid opcode!"); 3260 } 3261 3262 MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(), 3263 LoadNode->getMemOperand()}; 3264 CurDAG->setNodeMemRefs(Result, MemOps); 3265 3266 // Update Load Chain uses as well. 
3267 ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1)); 3268 ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1)); 3269 ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0)); 3270 CurDAG->RemoveDeadNode(Node); 3271 return true; 3272 } 3273 3274 // See if this is an X & Mask that we can match to BEXTR/BZHI. 3275 // Where Mask is one of the following patterns: 3276 // a) x & (1 << nbits) - 1 3277 // b) x & ~(-1 << nbits) 3278 // c) x & (-1 >> (32 - y)) 3279 // d) x << (32 - y) >> (32 - y) 3280 bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) { 3281 assert( 3282 (Node->getOpcode() == ISD::AND || Node->getOpcode() == ISD::SRL) && 3283 "Should be either an and-mask, or right-shift after clearing high bits."); 3284 3285 // BEXTR is BMI instruction, BZHI is BMI2 instruction. We need at least one. 3286 if (!Subtarget->hasBMI() && !Subtarget->hasBMI2()) 3287 return false; 3288 3289 MVT NVT = Node->getSimpleValueType(0); 3290 3291 // Only supported for 32 and 64 bits. 3292 if (NVT != MVT::i32 && NVT != MVT::i64) 3293 return false; 3294 3295 SDValue NBits; 3296 3297 // If we have BMI2's BZHI, we are ok with muti-use patterns. 3298 // Else, if we only have BMI1's BEXTR, we require one-use. 3299 const bool CanHaveExtraUses = Subtarget->hasBMI2(); 3300 auto checkUses = [CanHaveExtraUses](SDValue Op, unsigned NUses) { 3301 return CanHaveExtraUses || 3302 Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo()); 3303 }; 3304 auto checkOneUse = [checkUses](SDValue Op) { return checkUses(Op, 1); }; 3305 auto checkTwoUse = [checkUses](SDValue Op) { return checkUses(Op, 2); }; 3306 3307 auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) { 3308 if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) { 3309 assert(V.getSimpleValueType() == MVT::i32 && 3310 V.getOperand(0).getSimpleValueType() == MVT::i64 && 3311 "Expected i64 -> i32 truncation"); 3312 V = V.getOperand(0); 3313 } 3314 return V; 3315 }; 3316 3317 // a) x & ((1 << nbits) + (-1)) 3318 auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation, 3319 &NBits](SDValue Mask) -> bool { 3320 // Match `add`. Must only have one use! 3321 if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask)) 3322 return false; 3323 // We should be adding all-ones constant (i.e. subtracting one.) 3324 if (!isAllOnesConstant(Mask->getOperand(1))) 3325 return false; 3326 // Match `1 << nbits`. Might be truncated. Must only have one use! 3327 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0)); 3328 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0)) 3329 return false; 3330 if (!isOneConstant(M0->getOperand(0))) 3331 return false; 3332 NBits = M0->getOperand(1); 3333 return true; 3334 }; 3335 3336 auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) { 3337 V = peekThroughOneUseTruncation(V); 3338 return CurDAG->MaskedValueIsAllOnes( 3339 V, APInt::getLowBitsSet(V.getSimpleValueType().getSizeInBits(), 3340 NVT.getSizeInBits())); 3341 }; 3342 3343 // b) x & ~(-1 << nbits) 3344 auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation, 3345 &NBits](SDValue Mask) -> bool { 3346 // Match `~()`. Must only have one use! 3347 if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask)) 3348 return false; 3349 // The -1 only has to be all-ones for the final Node's NVT. 3350 if (!isAllOnes(Mask->getOperand(1))) 3351 return false; 3352 // Match `-1 << nbits`. Might be truncated. Must only have one use! 
3353 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0)); 3354 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0)) 3355 return false; 3356 // The -1 only has to be all-ones for the final Node's NVT. 3357 if (!isAllOnes(M0->getOperand(0))) 3358 return false; 3359 NBits = M0->getOperand(1); 3360 return true; 3361 }; 3362 3363 // Match potentially-truncated (bitwidth - y) 3364 auto matchShiftAmt = [checkOneUse, &NBits](SDValue ShiftAmt, 3365 unsigned Bitwidth) { 3366 // Skip over a truncate of the shift amount. 3367 if (ShiftAmt.getOpcode() == ISD::TRUNCATE) { 3368 ShiftAmt = ShiftAmt.getOperand(0); 3369 // The trunc should have been the only user of the real shift amount. 3370 if (!checkOneUse(ShiftAmt)) 3371 return false; 3372 } 3373 // Match the shift amount as: (bitwidth - y). It should go away, too. 3374 if (ShiftAmt.getOpcode() != ISD::SUB) 3375 return false; 3376 auto V0 = dyn_cast<ConstantSDNode>(ShiftAmt.getOperand(0)); 3377 if (!V0 || V0->getZExtValue() != Bitwidth) 3378 return false; 3379 NBits = ShiftAmt.getOperand(1); 3380 return true; 3381 }; 3382 3383 // c) x & (-1 >> (32 - y)) 3384 auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation, 3385 matchShiftAmt](SDValue Mask) -> bool { 3386 // The mask itself may be truncated. 3387 Mask = peekThroughOneUseTruncation(Mask); 3388 unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits(); 3389 // Match `l>>`. Must only have one use! 3390 if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask)) 3391 return false; 3392 // We should be shifting truly all-ones constant. 3393 if (!isAllOnesConstant(Mask.getOperand(0))) 3394 return false; 3395 SDValue M1 = Mask.getOperand(1); 3396 // The shift amount should not be used externally. 3397 if (!checkOneUse(M1)) 3398 return false; 3399 return matchShiftAmt(M1, Bitwidth); 3400 }; 3401 3402 SDValue X; 3403 3404 // d) x << (32 - y) >> (32 - y) 3405 auto matchPatternD = [checkOneUse, checkTwoUse, matchShiftAmt, 3406 &X](SDNode *Node) -> bool { 3407 if (Node->getOpcode() != ISD::SRL) 3408 return false; 3409 SDValue N0 = Node->getOperand(0); 3410 if (N0->getOpcode() != ISD::SHL || !checkOneUse(N0)) 3411 return false; 3412 unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits(); 3413 SDValue N1 = Node->getOperand(1); 3414 SDValue N01 = N0->getOperand(1); 3415 // Both of the shifts must be by the exact same value. 3416 // There should not be any uses of the shift amount outside of the pattern. 3417 if (N1 != N01 || !checkTwoUse(N1)) 3418 return false; 3419 if (!matchShiftAmt(N1, Bitwidth)) 3420 return false; 3421 X = N0->getOperand(0); 3422 return true; 3423 }; 3424 3425 auto matchLowBitMask = [matchPatternA, matchPatternB, 3426 matchPatternC](SDValue Mask) -> bool { 3427 return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask); 3428 }; 3429 3430 if (Node->getOpcode() == ISD::AND) { 3431 X = Node->getOperand(0); 3432 SDValue Mask = Node->getOperand(1); 3433 3434 if (matchLowBitMask(Mask)) { 3435 // Great. 3436 } else { 3437 std::swap(X, Mask); 3438 if (!matchLowBitMask(Mask)) 3439 return false; 3440 } 3441 } else if (!matchPatternD(Node)) 3442 return false; 3443 3444 SDLoc DL(Node); 3445 3446 // Truncate the shift amount. 3447 NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits); 3448 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits); 3449 3450 // Insert 8-bit NBits into lowest 8 bits of 32-bit register. 3451 // All the other bits are undefined, we do not care about them. 
3452 SDValue ImplDef = SDValue( 3453 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0); 3454 insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef); 3455 3456 SDValue SRIdxVal = CurDAG->getTargetConstant(X86::sub_8bit, DL, MVT::i32); 3457 insertDAGNode(*CurDAG, SDValue(Node, 0), SRIdxVal); 3458 NBits = SDValue( 3459 CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::i32, ImplDef, 3460 NBits, SRIdxVal), 0); 3461 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits); 3462 3463 if (Subtarget->hasBMI2()) { 3464 // Great, just emit the the BZHI.. 3465 if (NVT != MVT::i32) { 3466 // But have to place the bit count into the wide-enough register first. 3467 NBits = CurDAG->getNode(ISD::ANY_EXTEND, DL, NVT, NBits); 3468 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits); 3469 } 3470 3471 SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits); 3472 ReplaceNode(Node, Extract.getNode()); 3473 SelectCode(Extract.getNode()); 3474 return true; 3475 } 3476 3477 // Else, if we do *NOT* have BMI2, let's find out if the if the 'X' is 3478 // *logically* shifted (potentially with one-use trunc inbetween), 3479 // and the truncation was the only use of the shift, 3480 // and if so look past one-use truncation. 3481 { 3482 SDValue RealX = peekThroughOneUseTruncation(X); 3483 // FIXME: only if the shift is one-use? 3484 if (RealX != X && RealX.getOpcode() == ISD::SRL) 3485 X = RealX; 3486 } 3487 3488 MVT XVT = X.getSimpleValueType(); 3489 3490 // Else, emitting BEXTR requires one more step. 3491 // The 'control' of BEXTR has the pattern of: 3492 // [15...8 bit][ 7...0 bit] location 3493 // [ bit count][ shift] name 3494 // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11 3495 3496 // Shift NBits left by 8 bits, thus producing 'control'. 3497 // This makes the low 8 bits to be zero. 3498 SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8); 3499 insertDAGNode(*CurDAG, SDValue(Node, 0), C8); 3500 SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8); 3501 insertDAGNode(*CurDAG, SDValue(Node, 0), Control); 3502 3503 // If the 'X' is *logically* shifted, we can fold that shift into 'control'. 3504 // FIXME: only if the shift is one-use? 3505 if (X.getOpcode() == ISD::SRL) { 3506 SDValue ShiftAmt = X.getOperand(1); 3507 X = X.getOperand(0); 3508 3509 assert(ShiftAmt.getValueType() == MVT::i8 && 3510 "Expected shift amount to be i8"); 3511 3512 // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero! 3513 // We could zext to i16 in some form, but we intentionally don't do that. 3514 SDValue OrigShiftAmt = ShiftAmt; 3515 ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt); 3516 insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt); 3517 3518 // And now 'or' these low 8 bits of shift amount into the 'control'. 3519 Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt); 3520 insertDAGNode(*CurDAG, SDValue(Node, 0), Control); 3521 } 3522 3523 // But have to place the 'control' into the wide-enough register first. 3524 if (XVT != MVT::i32) { 3525 Control = CurDAG->getNode(ISD::ANY_EXTEND, DL, XVT, Control); 3526 insertDAGNode(*CurDAG, SDValue(Node, 0), Control); 3527 } 3528 3529 // And finally, form the BEXTR itself. 3530 SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, XVT, X, Control); 3531 3532 // The 'X' was originally truncated. Do that now. 
3533 if (XVT != NVT) { 3534 insertDAGNode(*CurDAG, SDValue(Node, 0), Extract); 3535 Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract); 3536 } 3537 3538 ReplaceNode(Node, Extract.getNode()); 3539 SelectCode(Extract.getNode()); 3540 3541 return true; 3542 } 3543 3544 // See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI. 3545 MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) { 3546 MVT NVT = Node->getSimpleValueType(0); 3547 SDLoc dl(Node); 3548 3549 SDValue N0 = Node->getOperand(0); 3550 SDValue N1 = Node->getOperand(1); 3551 3552 // If we have TBM we can use an immediate for the control. If we have BMI 3553 // we should only do this if the BEXTR instruction is implemented well. 3554 // Otherwise moving the control into a register makes this more costly. 3555 // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM 3556 // hoisting the move immediate would make it worthwhile with a less optimal 3557 // BEXTR? 3558 bool PreferBEXTR = 3559 Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR()); 3560 if (!PreferBEXTR && !Subtarget->hasBMI2()) 3561 return nullptr; 3562 3563 // Must have a shift right. 3564 if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA) 3565 return nullptr; 3566 3567 // Shift can't have additional users. 3568 if (!N0->hasOneUse()) 3569 return nullptr; 3570 3571 // Only supported for 32 and 64 bits. 3572 if (NVT != MVT::i32 && NVT != MVT::i64) 3573 return nullptr; 3574 3575 // Shift amount and RHS of and must be constant. 3576 ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(N1); 3577 ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1)); 3578 if (!MaskCst || !ShiftCst) 3579 return nullptr; 3580 3581 // And RHS must be a mask. 3582 uint64_t Mask = MaskCst->getZExtValue(); 3583 if (!isMask_64(Mask)) 3584 return nullptr; 3585 3586 uint64_t Shift = ShiftCst->getZExtValue(); 3587 uint64_t MaskSize = countPopulation(Mask); 3588 3589 // Don't interfere with something that can be handled by extracting AH. 3590 // TODO: If we are able to fold a load, BEXTR might still be better than AH. 3591 if (Shift == 8 && MaskSize == 8) 3592 return nullptr; 3593 3594 // Make sure we are only using bits that were in the original value, not 3595 // shifted in. 3596 if (Shift + MaskSize > NVT.getSizeInBits()) 3597 return nullptr; 3598 3599 // BZHI, if available, is always fast, unlike BEXTR. But even if we decide 3600 // that we can't use BEXTR, it is only worthwhile using BZHI if the mask 3601 // does not fit into 32 bits. Load folding is not a sufficient reason. 3602 if (!PreferBEXTR && MaskSize <= 32) 3603 return nullptr; 3604 3605 SDValue Control; 3606 unsigned ROpc, MOpc; 3607 3608 if (!PreferBEXTR) { 3609 assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then."); 3610 // If we can't make use of BEXTR then we can't fuse shift+mask stages. 3611 // Let's perform the mask first, and apply shift later. Note that we need to 3612 // widen the mask to account for the fact that we'll apply shift afterwards! 3613 Control = CurDAG->getTargetConstant(Shift + MaskSize, dl, NVT); 3614 ROpc = NVT == MVT::i64 ? X86::BZHI64rr : X86::BZHI32rr; 3615 MOpc = NVT == MVT::i64 ? X86::BZHI64rm : X86::BZHI32rm; 3616 unsigned NewOpc = NVT == MVT::i64 ? 
X86::MOV32ri64 : X86::MOV32ri; 3617 Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0); 3618 } else { 3619 // The 'control' of BEXTR has the pattern of: 3620 // [15...8 bit][ 7...0 bit] location 3621 // [ bit count][ shift] name 3622 // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11 3623 Control = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT); 3624 if (Subtarget->hasTBM()) { 3625 ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri; 3626 MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi; 3627 } else { 3628 assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then."); 3629 // BMI requires the immediate to placed in a register. 3630 ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr; 3631 MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm; 3632 unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri; 3633 Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0); 3634 } 3635 } 3636 3637 MachineSDNode *NewNode; 3638 SDValue Input = N0->getOperand(0); 3639 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; 3640 if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { 3641 SDValue Ops[] = { 3642 Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(0)}; 3643 SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other); 3644 NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); 3645 // Update the chain. 3646 ReplaceUses(Input.getValue(1), SDValue(NewNode, 2)); 3647 // Record the mem-refs 3648 CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()}); 3649 } else { 3650 NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, Control); 3651 } 3652 3653 if (!PreferBEXTR) { 3654 // We still need to apply the shift. 3655 SDValue ShAmt = CurDAG->getTargetConstant(Shift, dl, NVT); 3656 unsigned NewOpc = NVT == MVT::i64 ? X86::SHR64ri : X86::SHR32ri; 3657 NewNode = 3658 CurDAG->getMachineNode(NewOpc, dl, NVT, SDValue(NewNode, 0), ShAmt); 3659 } 3660 3661 return NewNode; 3662 } 3663 3664 // Emit a PCMISTR(I/M) instruction. 3665 MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc, 3666 bool MayFoldLoad, const SDLoc &dl, 3667 MVT VT, SDNode *Node) { 3668 SDValue N0 = Node->getOperand(0); 3669 SDValue N1 = Node->getOperand(1); 3670 SDValue Imm = Node->getOperand(2); 3671 const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue(); 3672 Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType()); 3673 3674 // Try to fold a load. No need to check alignment. 3675 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; 3676 if (MayFoldLoad && tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { 3677 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm, 3678 N1.getOperand(0) }; 3679 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other); 3680 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); 3681 // Update the chain. 3682 ReplaceUses(N1.getValue(1), SDValue(CNode, 2)); 3683 // Record the mem-refs 3684 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()}); 3685 return CNode; 3686 } 3687 3688 SDValue Ops[] = { N0, N1, Imm }; 3689 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32); 3690 MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops); 3691 return CNode; 3692 } 3693 3694 // Emit a PCMESTR(I/M) instruction. Also return the Glue result in case we need 3695 // to emit a second instruction after this one. 
This is needed since we have two 3696 // copyToReg nodes glued before this and we need to continue that glue through. 3697 MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc, 3698 bool MayFoldLoad, const SDLoc &dl, 3699 MVT VT, SDNode *Node, 3700 SDValue &InFlag) { 3701 SDValue N0 = Node->getOperand(0); 3702 SDValue N2 = Node->getOperand(2); 3703 SDValue Imm = Node->getOperand(4); 3704 const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue(); 3705 Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType()); 3706 3707 // Try to fold a load. No need to check alignment. 3708 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; 3709 if (MayFoldLoad && tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { 3710 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm, 3711 N2.getOperand(0), InFlag }; 3712 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue); 3713 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); 3714 InFlag = SDValue(CNode, 3); 3715 // Update the chain. 3716 ReplaceUses(N2.getValue(1), SDValue(CNode, 2)); 3717 // Record the mem-refs 3718 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N2)->getMemOperand()}); 3719 return CNode; 3720 } 3721 3722 SDValue Ops[] = { N0, N2, Imm, InFlag }; 3723 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue); 3724 MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops); 3725 InFlag = SDValue(CNode, 2); 3726 return CNode; 3727 } 3728 3729 bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) { 3730 EVT VT = N->getValueType(0); 3731 3732 // Only handle scalar shifts. 3733 if (VT.isVector()) 3734 return false; 3735 3736 // Narrower shifts only mask to 5 bits in hardware. 3737 unsigned Size = VT == MVT::i64 ? 64 : 32; 3738 3739 SDValue OrigShiftAmt = N->getOperand(1); 3740 SDValue ShiftAmt = OrigShiftAmt; 3741 SDLoc DL(N); 3742 3743 // Skip over a truncate of the shift amount. 3744 if (ShiftAmt->getOpcode() == ISD::TRUNCATE) 3745 ShiftAmt = ShiftAmt->getOperand(0); 3746 3747 // This function is called after X86DAGToDAGISel::matchBitExtract(), 3748 // so we are not afraid that we might mess up BZHI/BEXTR pattern. 3749 3750 SDValue NewShiftAmt; 3751 if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB) { 3752 SDValue Add0 = ShiftAmt->getOperand(0); 3753 SDValue Add1 = ShiftAmt->getOperand(1); 3754 // If we are shifting by X+/-N where N == 0 mod Size, then just shift by X 3755 // to avoid the ADD/SUB. 3756 if (isa<ConstantSDNode>(Add1) && 3757 cast<ConstantSDNode>(Add1)->getZExtValue() % Size == 0) { 3758 NewShiftAmt = Add0; 3759 // If we are shifting by N-X where N == 0 mod Size, then just shift by -X to 3760 // generate a NEG instead of a SUB of a constant. 3761 } else if (ShiftAmt->getOpcode() == ISD::SUB && 3762 isa<ConstantSDNode>(Add0) && 3763 cast<ConstantSDNode>(Add0)->getZExtValue() != 0 && 3764 cast<ConstantSDNode>(Add0)->getZExtValue() % Size == 0) { 3765 // Insert a negate op. 3766 // TODO: This isn't guaranteed to replace the sub if there is a logic cone 3767 // that uses it that's not a shift. 3768 EVT SubVT = ShiftAmt.getValueType(); 3769 SDValue Zero = CurDAG->getConstant(0, DL, SubVT); 3770 SDValue Neg = CurDAG->getNode(ISD::SUB, DL, SubVT, Zero, Add1); 3771 NewShiftAmt = Neg; 3772 3773 // Insert these operands into a valid topological order so they can 3774 // get selected independently. 
3775 insertDAGNode(*CurDAG, OrigShiftAmt, Zero); 3776 insertDAGNode(*CurDAG, OrigShiftAmt, Neg); 3777 } else 3778 return false; 3779 } else 3780 return false; 3781 3782 if (NewShiftAmt.getValueType() != MVT::i8) { 3783 // Need to truncate the shift amount. 3784 NewShiftAmt = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NewShiftAmt); 3785 // Add to a correct topological ordering. 3786 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt); 3787 } 3788 3789 // Insert a new mask to keep the shift amount legal. This should be removed 3790 // by isel patterns. 3791 NewShiftAmt = CurDAG->getNode(ISD::AND, DL, MVT::i8, NewShiftAmt, 3792 CurDAG->getConstant(Size - 1, DL, MVT::i8)); 3793 // Place in a correct topological ordering. 3794 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt); 3795 3796 SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, N->getOperand(0), 3797 NewShiftAmt); 3798 if (UpdatedNode != N) { 3799 // If we found an existing node, we should replace ourselves with that node 3800 // and wait for it to be selected after its other users. 3801 ReplaceNode(N, UpdatedNode); 3802 return true; 3803 } 3804 3805 // If the original shift amount is now dead, delete it so that we don't run 3806 // it through isel. 3807 if (OrigShiftAmt.getNode()->use_empty()) 3808 CurDAG->RemoveDeadNode(OrigShiftAmt.getNode()); 3809 3810 // Now that we've optimized the shift amount, defer to normal isel to get 3811 // load folding and legacy vs BMI2 selection without repeating it here. 3812 SelectCode(N); 3813 return true; 3814 } 3815 3816 bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) { 3817 MVT NVT = N->getSimpleValueType(0); 3818 unsigned Opcode = N->getOpcode(); 3819 SDLoc dl(N); 3820 3821 // For operations of the form (x << C1) op C2, check if we can use a smaller 3822 // encoding for C2 by transforming it into (x op (C2>>C1)) << C1. 3823 SDValue Shift = N->getOperand(0); 3824 SDValue N1 = N->getOperand(1); 3825 3826 ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1); 3827 if (!Cst) 3828 return false; 3829 3830 int64_t Val = Cst->getSExtValue(); 3831 3832 // If we have an any_extend feeding the AND, look through it to see if there 3833 // is a shift behind it. But only if the AND doesn't use the extended bits. 3834 // FIXME: Generalize this to other ANY_EXTEND than i32 to i64? 3835 bool FoundAnyExtend = false; 3836 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() && 3837 Shift.getOperand(0).getSimpleValueType() == MVT::i32 && 3838 isUInt<32>(Val)) { 3839 FoundAnyExtend = true; 3840 Shift = Shift.getOperand(0); 3841 } 3842 3843 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse()) 3844 return false; 3845 3846 // i8 is unshrinkable, i16 should be promoted to i32. 3847 if (NVT != MVT::i32 && NVT != MVT::i64) 3848 return false; 3849 3850 ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(Shift.getOperand(1)); 3851 if (!ShlCst) 3852 return false; 3853 3854 uint64_t ShAmt = ShlCst->getZExtValue(); 3855 3856 // Make sure that we don't change the operation by removing bits. 3857 // This only matters for OR and XOR, AND is unaffected. 3858 uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1; 3859 if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0) 3860 return false; 3861 3862 // Check the minimum bitwidth for the new constant. 3863 // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32. 3864 auto CanShrinkImmediate = [&](int64_t &ShiftedVal) { 3865 if (Opcode == ISD::AND) { 3866 // AND32ri is the same as AND64ri32 with zext imm. 
3867 // Try this before sign extended immediates below. 3868 ShiftedVal = (uint64_t)Val >> ShAmt; 3869 if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal)) 3870 return true; 3871 // Also swap order when the AND can become MOVZX. 3872 if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX) 3873 return true; 3874 } 3875 ShiftedVal = Val >> ShAmt; 3876 if ((!isInt<8>(Val) && isInt<8>(ShiftedVal)) || 3877 (!isInt<32>(Val) && isInt<32>(ShiftedVal))) 3878 return true; 3879 if (Opcode != ISD::AND) { 3880 // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr 3881 ShiftedVal = (uint64_t)Val >> ShAmt; 3882 if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal)) 3883 return true; 3884 } 3885 return false; 3886 }; 3887 3888 int64_t ShiftedVal; 3889 if (!CanShrinkImmediate(ShiftedVal)) 3890 return false; 3891 3892 // Ok, we can reorder to get a smaller immediate. 3893 3894 // But, its possible the original immediate allowed an AND to become MOVZX. 3895 // Doing this late due to avoid the MakedValueIsZero call as late as 3896 // possible. 3897 if (Opcode == ISD::AND) { 3898 // Find the smallest zext this could possibly be. 3899 unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits(); 3900 ZExtWidth = PowerOf2Ceil(std::max(ZExtWidth, 8U)); 3901 3902 // Figure out which bits need to be zero to achieve that mask. 3903 APInt NeededMask = APInt::getLowBitsSet(NVT.getSizeInBits(), 3904 ZExtWidth); 3905 NeededMask &= ~Cst->getAPIntValue(); 3906 3907 if (CurDAG->MaskedValueIsZero(N->getOperand(0), NeededMask)) 3908 return false; 3909 } 3910 3911 SDValue X = Shift.getOperand(0); 3912 if (FoundAnyExtend) { 3913 SDValue NewX = CurDAG->getNode(ISD::ANY_EXTEND, dl, NVT, X); 3914 insertDAGNode(*CurDAG, SDValue(N, 0), NewX); 3915 X = NewX; 3916 } 3917 3918 SDValue NewCst = CurDAG->getConstant(ShiftedVal, dl, NVT); 3919 insertDAGNode(*CurDAG, SDValue(N, 0), NewCst); 3920 SDValue NewBinOp = CurDAG->getNode(Opcode, dl, NVT, X, NewCst); 3921 insertDAGNode(*CurDAG, SDValue(N, 0), NewBinOp); 3922 SDValue NewSHL = CurDAG->getNode(ISD::SHL, dl, NVT, NewBinOp, 3923 Shift.getOperand(1)); 3924 ReplaceNode(N, NewSHL.getNode()); 3925 SelectCode(NewSHL.getNode()); 3926 return true; 3927 } 3928 3929 // Try to match two logic ops to a VPTERNLOG. 3930 // FIXME: Handle inverted inputs? 3931 // FIXME: Handle more complex patterns that use an operand more than once? 3932 bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) { 3933 MVT NVT = N->getSimpleValueType(0); 3934 3935 // Make sure we support VPTERNLOG. 3936 if (!NVT.isVector() || !Subtarget->hasAVX512() || 3937 NVT.getVectorElementType() == MVT::i1) 3938 return false; 3939 3940 // We need VLX for 128/256-bit. 
3941 if (!(Subtarget->hasVLX() || NVT.is512BitVector())) 3942 return false; 3943 3944 unsigned Opc1 = N->getOpcode(); 3945 SDValue N0 = N->getOperand(0); 3946 SDValue N1 = N->getOperand(1); 3947 3948 auto isLogicOp = [](unsigned Opc) { 3949 return Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR || 3950 Opc == X86ISD::ANDNP; 3951 }; 3952 3953 SDValue A, B, C; 3954 unsigned Opc2; 3955 if (isLogicOp(N1.getOpcode()) && N1.hasOneUse()) { 3956 Opc2 = N1.getOpcode(); 3957 A = N0; 3958 B = N1.getOperand(0); 3959 C = N1.getOperand(1); 3960 } else if (isLogicOp(N0.getOpcode()) && N0.hasOneUse()) { 3961 Opc2 = N0.getOpcode(); 3962 A = N1; 3963 B = N0.getOperand(0); 3964 C = N0.getOperand(1); 3965 } else 3966 return false; 3967 3968 uint64_t Imm; 3969 switch (Opc1) { 3970 default: llvm_unreachable("Unexpected opcode!"); 3971 case ISD::AND: 3972 switch (Opc2) { 3973 default: llvm_unreachable("Unexpected opcode!"); 3974 case ISD::AND: Imm = 0x80; break; 3975 case ISD::OR: Imm = 0xe0; break; 3976 case ISD::XOR: Imm = 0x60; break; 3977 case X86ISD::ANDNP: Imm = 0x20; break; 3978 } 3979 break; 3980 case ISD::OR: 3981 switch (Opc2) { 3982 default: llvm_unreachable("Unexpected opcode!"); 3983 case ISD::AND: Imm = 0xf8; break; 3984 case ISD::OR: Imm = 0xfe; break; 3985 case ISD::XOR: Imm = 0xf6; break; 3986 case X86ISD::ANDNP: Imm = 0xf2; break; 3987 } 3988 break; 3989 case ISD::XOR: 3990 switch (Opc2) { 3991 default: llvm_unreachable("Unexpected opcode!"); 3992 case ISD::AND: Imm = 0x78; break; 3993 case ISD::OR: Imm = 0x1e; break; 3994 case ISD::XOR: Imm = 0x96; break; 3995 case X86ISD::ANDNP: Imm = 0xd2; break; 3996 } 3997 break; 3998 } 3999 4000 SDLoc DL(N); 4001 SDValue New = CurDAG->getNode(X86ISD::VPTERNLOG, DL, NVT, A, B, C, 4002 CurDAG->getTargetConstant(Imm, DL, MVT::i8)); 4003 ReplaceNode(N, New.getNode()); 4004 SelectCode(New.getNode()); 4005 return true; 4006 } 4007 4008 /// If the high bits of an 'and' operand are known zero, try setting the 4009 /// high bits of an 'and' constant operand to produce a smaller encoding by 4010 /// creating a small, sign-extended negative immediate rather than a large 4011 /// positive one. This reverses a transform in SimplifyDemandedBits that 4012 /// shrinks mask constants by clearing bits. There is also a possibility that 4013 /// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that 4014 /// case, just replace the 'and'. Return 'true' if the node is replaced. 4015 bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) { 4016 // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't 4017 // have immediate operands. 4018 MVT VT = And->getSimpleValueType(0); 4019 if (VT != MVT::i32 && VT != MVT::i64) 4020 return false; 4021 4022 auto *And1C = dyn_cast<ConstantSDNode>(And->getOperand(1)); 4023 if (!And1C) 4024 return false; 4025 4026 // Bail out if the mask constant is already negative. It's can't shrink more. 4027 // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel 4028 // patterns to use a 32-bit and instead of a 64-bit and by relying on the 4029 // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits 4030 // are negative too. 4031 APInt MaskVal = And1C->getAPIntValue(); 4032 unsigned MaskLZ = MaskVal.countLeadingZeros(); 4033 if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32)) 4034 return false; 4035 4036 // Don't extend into the upper 32 bits of a 64 bit mask. 
4037 if (VT == MVT::i64 && MaskLZ >= 32) { 4038 MaskLZ -= 32; 4039 MaskVal = MaskVal.trunc(32); 4040 } 4041 4042 SDValue And0 = And->getOperand(0); 4043 APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ); 4044 APInt NegMaskVal = MaskVal | HighZeros; 4045 4046 // If a negative constant would not allow a smaller encoding, there's no need 4047 // to continue. Only change the constant when we know it's a win. 4048 unsigned MinWidth = NegMaskVal.getMinSignedBits(); 4049 if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getMinSignedBits() <= 32)) 4050 return false; 4051 4052 // Extend masks if we truncated above. 4053 if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) { 4054 NegMaskVal = NegMaskVal.zext(64); 4055 HighZeros = HighZeros.zext(64); 4056 } 4057 4058 // The variable operand must be all zeros in the top bits to allow using the 4059 // new, negative constant as the mask. 4060 if (!CurDAG->MaskedValueIsZero(And0, HighZeros)) 4061 return false; 4062 4063 // Check if the mask is -1. In that case, this is an unnecessary instruction 4064 // that escaped earlier analysis. 4065 if (NegMaskVal.isAllOnesValue()) { 4066 ReplaceNode(And, And0.getNode()); 4067 return true; 4068 } 4069 4070 // A negative mask allows a smaller encoding. Create a new 'and' node. 4071 SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT); 4072 SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask); 4073 ReplaceNode(And, NewAnd.getNode()); 4074 SelectCode(NewAnd.getNode()); 4075 return true; 4076 } 4077 4078 static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad, 4079 bool FoldedBCast, bool Masked) { 4080 #define VPTESTM_CASE(VT, SUFFIX) \ 4081 case MVT::VT: \ 4082 if (Masked) \ 4083 return IsTestN ? X86::VPTESTNM##SUFFIX##k: X86::VPTESTM##SUFFIX##k; \ 4084 return IsTestN ? X86::VPTESTNM##SUFFIX : X86::VPTESTM##SUFFIX; 4085 4086 4087 #define VPTESTM_BROADCAST_CASES(SUFFIX) \ 4088 default: llvm_unreachable("Unexpected VT!"); \ 4089 VPTESTM_CASE(v4i32, DZ128##SUFFIX) \ 4090 VPTESTM_CASE(v2i64, QZ128##SUFFIX) \ 4091 VPTESTM_CASE(v8i32, DZ256##SUFFIX) \ 4092 VPTESTM_CASE(v4i64, QZ256##SUFFIX) \ 4093 VPTESTM_CASE(v16i32, DZ##SUFFIX) \ 4094 VPTESTM_CASE(v8i64, QZ##SUFFIX) 4095 4096 #define VPTESTM_FULL_CASES(SUFFIX) \ 4097 VPTESTM_BROADCAST_CASES(SUFFIX) \ 4098 VPTESTM_CASE(v16i8, BZ128##SUFFIX) \ 4099 VPTESTM_CASE(v8i16, WZ128##SUFFIX) \ 4100 VPTESTM_CASE(v32i8, BZ256##SUFFIX) \ 4101 VPTESTM_CASE(v16i16, WZ256##SUFFIX) \ 4102 VPTESTM_CASE(v64i8, BZ##SUFFIX) \ 4103 VPTESTM_CASE(v32i16, WZ##SUFFIX) 4104 4105 if (FoldedLoad) { 4106 switch (TestVT.SimpleTy) { 4107 VPTESTM_FULL_CASES(rm) 4108 } 4109 } 4110 4111 if (FoldedBCast) { 4112 switch (TestVT.SimpleTy) { 4113 VPTESTM_BROADCAST_CASES(rmb) 4114 } 4115 } 4116 4117 switch (TestVT.SimpleTy) { 4118 VPTESTM_FULL_CASES(rr) 4119 } 4120 4121 #undef VPTESTM_FULL_CASES 4122 #undef VPTESTM_BROADCAST_CASES 4123 #undef VPTESTM_CASE 4124 } 4125 4126 // Try to create VPTESTM instruction. If InMask is not null, it will be used 4127 // to form a masked operation. 4128 bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc, 4129 SDValue InMask) { 4130 assert(Subtarget->hasAVX512() && "Expected AVX512!"); 4131 assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 && 4132 "Unexpected VT!"); 4133 4134 // Look for equal and not equal compares. 
4135 ISD::CondCode CC = cast<CondCodeSDNode>(Setcc.getOperand(2))->get(); 4136 if (CC != ISD::SETEQ && CC != ISD::SETNE) 4137 return false; 4138 4139 SDValue SetccOp0 = Setcc.getOperand(0); 4140 SDValue SetccOp1 = Setcc.getOperand(1); 4141 4142 // Canonicalize the all zero vector to the RHS. 4143 if (ISD::isBuildVectorAllZeros(SetccOp0.getNode())) 4144 std::swap(SetccOp0, SetccOp1); 4145 4146 // See if we're comparing against zero. 4147 if (!ISD::isBuildVectorAllZeros(SetccOp1.getNode())) 4148 return false; 4149 4150 SDValue N0 = SetccOp0; 4151 4152 MVT CmpVT = N0.getSimpleValueType(); 4153 MVT CmpSVT = CmpVT.getVectorElementType(); 4154 4155 // Start with both operands the same. We'll try to refine this. 4156 SDValue Src0 = N0; 4157 SDValue Src1 = N0; 4158 4159 { 4160 // Look through single use bitcasts. 4161 SDValue N0Temp = N0; 4162 if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse()) 4163 N0Temp = N0.getOperand(0); 4164 4165 // Look for single use AND. 4166 if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) { 4167 Src0 = N0Temp.getOperand(0); 4168 Src1 = N0Temp.getOperand(1); 4169 } 4170 } 4171 4172 // Without VLX we need to widen the load. 4173 bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector(); 4174 4175 // We can only fold loads if the sources are unique. 4176 bool CanFoldLoads = Src0 != Src1; 4177 4178 // Try to fold loads unless we need to widen. 4179 bool FoldedLoad = false; 4180 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Load; 4181 if (!Widen && CanFoldLoads) { 4182 Load = Src1; 4183 FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2, Tmp3, 4184 Tmp4); 4185 if (!FoldedLoad) { 4186 // And is computative. 4187 Load = Src0; 4188 FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2, 4189 Tmp3, Tmp4); 4190 if (FoldedLoad) 4191 std::swap(Src0, Src1); 4192 } 4193 } 4194 4195 auto findBroadcastedOp = [](SDValue Src, MVT CmpSVT, SDNode *&Parent) { 4196 // Look through single use bitcasts. 4197 if (Src.getOpcode() == ISD::BITCAST && Src.hasOneUse()) { 4198 Parent = Src.getNode(); 4199 Src = Src.getOperand(0); 4200 } 4201 4202 if (Src.getOpcode() == X86ISD::VBROADCAST_LOAD && Src.hasOneUse()) { 4203 auto *MemIntr = cast<MemIntrinsicSDNode>(Src); 4204 if (MemIntr->getMemoryVT().getSizeInBits() == CmpSVT.getSizeInBits()) 4205 return Src; 4206 } 4207 4208 return SDValue(); 4209 }; 4210 4211 // If we didn't fold a load, try to match broadcast. No widening limitation 4212 // for this. But only 32 and 64 bit types are supported. 4213 bool FoldedBCast = false; 4214 if (!FoldedLoad && CanFoldLoads && 4215 (CmpSVT == MVT::i32 || CmpSVT == MVT::i64)) { 4216 SDNode *ParentNode = N0.getNode(); 4217 if ((Load = findBroadcastedOp(Src1, CmpSVT, ParentNode))) { 4218 FoldedBCast = tryFoldBroadcast(Root, ParentNode, Load, Tmp0, 4219 Tmp1, Tmp2, Tmp3, Tmp4); 4220 } 4221 4222 // Try the other operand. 
4223 if (!FoldedBCast) { 4224 SDNode *ParentNode = N0.getNode(); 4225 if ((Load = findBroadcastedOp(Src0, CmpSVT, ParentNode))) { 4226 FoldedBCast = tryFoldBroadcast(Root, ParentNode, Load, Tmp0, 4227 Tmp1, Tmp2, Tmp3, Tmp4); 4228 if (FoldedBCast) 4229 std::swap(Src0, Src1); 4230 } 4231 } 4232 } 4233 4234 auto getMaskRC = [](MVT MaskVT) { 4235 switch (MaskVT.SimpleTy) { 4236 default: llvm_unreachable("Unexpected VT!"); 4237 case MVT::v2i1: return X86::VK2RegClassID; 4238 case MVT::v4i1: return X86::VK4RegClassID; 4239 case MVT::v8i1: return X86::VK8RegClassID; 4240 case MVT::v16i1: return X86::VK16RegClassID; 4241 case MVT::v32i1: return X86::VK32RegClassID; 4242 case MVT::v64i1: return X86::VK64RegClassID; 4243 } 4244 }; 4245 4246 bool IsMasked = InMask.getNode() != nullptr; 4247 4248 SDLoc dl(Root); 4249 4250 MVT ResVT = Setcc.getSimpleValueType(); 4251 MVT MaskVT = ResVT; 4252 if (Widen) { 4253 // Widen the inputs using insert_subreg or copy_to_regclass. 4254 unsigned Scale = CmpVT.is128BitVector() ? 4 : 2; 4255 unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm; 4256 unsigned NumElts = CmpVT.getVectorNumElements() * Scale; 4257 CmpVT = MVT::getVectorVT(CmpSVT, NumElts); 4258 MaskVT = MVT::getVectorVT(MVT::i1, NumElts); 4259 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl, 4260 CmpVT), 0); 4261 Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0); 4262 4263 assert(!FoldedLoad && "Shouldn't have folded the load"); 4264 if (!FoldedBCast) 4265 Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1); 4266 4267 if (IsMasked) { 4268 // Widen the mask. 4269 unsigned RegClass = getMaskRC(MaskVT); 4270 SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32); 4271 InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, 4272 dl, MaskVT, InMask, RC), 0); 4273 } 4274 } 4275 4276 bool IsTestN = CC == ISD::SETEQ; 4277 unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast, 4278 IsMasked); 4279 4280 MachineSDNode *CNode; 4281 if (FoldedLoad || FoldedBCast) { 4282 SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other); 4283 4284 if (IsMasked) { 4285 SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, 4286 Load.getOperand(0) }; 4287 CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); 4288 } else { 4289 SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, 4290 Load.getOperand(0) }; 4291 CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); 4292 } 4293 4294 // Update the chain. 4295 ReplaceUses(Load.getValue(1), SDValue(CNode, 1)); 4296 // Record the mem-refs 4297 CurDAG->setNodeMemRefs(CNode, {cast<MemSDNode>(Load)->getMemOperand()}); 4298 } else { 4299 if (IsMasked) 4300 CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1); 4301 else 4302 CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, Src0, Src1); 4303 } 4304 4305 // If we widened, we need to shrink the mask VT. 4306 if (Widen) { 4307 unsigned RegClass = getMaskRC(ResVT); 4308 SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32); 4309 CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, 4310 dl, ResVT, SDValue(CNode, 0), RC); 4311 } 4312 4313 ReplaceUses(SDValue(Root, 0), SDValue(CNode, 0)); 4314 CurDAG->RemoveDeadNode(Root); 4315 return true; 4316 } 4317 4318 // Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it 4319 // into vpternlog. 
4320 bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) { 4321 assert(N->getOpcode() == ISD::OR && "Unexpected opcode!"); 4322 4323 MVT NVT = N->getSimpleValueType(0); 4324 4325 // Make sure we support VPTERNLOG. 4326 if (!NVT.isVector() || !Subtarget->hasAVX512()) 4327 return false; 4328 4329 // We need VLX for 128/256-bit. 4330 if (!(Subtarget->hasVLX() || NVT.is512BitVector())) 4331 return false; 4332 4333 SDValue N0 = N->getOperand(0); 4334 SDValue N1 = N->getOperand(1); 4335 4336 // Canonicalize AND to LHS. 4337 if (N1.getOpcode() == ISD::AND) 4338 std::swap(N0, N1); 4339 4340 if (N0.getOpcode() != ISD::AND || 4341 N1.getOpcode() != X86ISD::ANDNP || 4342 !N0.hasOneUse() || !N1.hasOneUse()) 4343 return false; 4344 4345 // ANDN is not commutable, use it to pick down A and C. 4346 SDValue A = N1.getOperand(0); 4347 SDValue C = N1.getOperand(1); 4348 4349 // AND is commutable, if one operand matches A, the other operand is B. 4350 // Otherwise this isn't a match. 4351 SDValue B; 4352 if (N0.getOperand(0) == A) 4353 B = N0.getOperand(1); 4354 else if (N0.getOperand(1) == A) 4355 B = N0.getOperand(0); 4356 else 4357 return false; 4358 4359 SDLoc dl(N); 4360 SDValue Imm = CurDAG->getTargetConstant(0xCA, dl, MVT::i8); 4361 SDValue Ternlog = CurDAG->getNode(X86ISD::VPTERNLOG, dl, NVT, A, B, C, Imm); 4362 ReplaceNode(N, Ternlog.getNode()); 4363 SelectCode(Ternlog.getNode()); 4364 return true; 4365 } 4366 4367 void X86DAGToDAGISel::Select(SDNode *Node) { 4368 MVT NVT = Node->getSimpleValueType(0); 4369 unsigned Opcode = Node->getOpcode(); 4370 SDLoc dl(Node); 4371 4372 if (Node->isMachineOpcode()) { 4373 LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n'); 4374 Node->setNodeId(-1); 4375 return; // Already selected. 4376 } 4377 4378 switch (Opcode) { 4379 default: break; 4380 case ISD::INTRINSIC_VOID: { 4381 unsigned IntNo = Node->getConstantOperandVal(1); 4382 switch (IntNo) { 4383 default: break; 4384 case Intrinsic::x86_sse3_monitor: 4385 case Intrinsic::x86_monitorx: 4386 case Intrinsic::x86_clzero: { 4387 bool Use64BitPtr = Node->getOperand(2).getValueType() == MVT::i64; 4388 4389 unsigned Opc = 0; 4390 switch (IntNo) { 4391 default: llvm_unreachable("Unexpected intrinsic!"); 4392 case Intrinsic::x86_sse3_monitor: 4393 if (!Subtarget->hasSSE3()) 4394 break; 4395 Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr; 4396 break; 4397 case Intrinsic::x86_monitorx: 4398 if (!Subtarget->hasMWAITX()) 4399 break; 4400 Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr; 4401 break; 4402 case Intrinsic::x86_clzero: 4403 if (!Subtarget->hasCLZERO()) 4404 break; 4405 Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r; 4406 break; 4407 } 4408 4409 if (Opc) { 4410 unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX; 4411 SDValue Chain = CurDAG->getCopyToReg(Node->getOperand(0), dl, PtrReg, 4412 Node->getOperand(2), SDValue()); 4413 SDValue InFlag = Chain.getValue(1); 4414 4415 if (IntNo == Intrinsic::x86_sse3_monitor || 4416 IntNo == Intrinsic::x86_monitorx) { 4417 // Copy the other two operands to ECX and EDX. 
4418 Chain = CurDAG->getCopyToReg(Chain, dl, X86::ECX, Node->getOperand(3), 4419 InFlag); 4420 InFlag = Chain.getValue(1); 4421 Chain = CurDAG->getCopyToReg(Chain, dl, X86::EDX, Node->getOperand(4), 4422 InFlag); 4423 InFlag = Chain.getValue(1); 4424 } 4425 4426 MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, 4427 { Chain, InFlag}); 4428 ReplaceNode(Node, CNode); 4429 return; 4430 } 4431 4432 break; 4433 } 4434 case Intrinsic::x86_tileloadd64: 4435 case Intrinsic::x86_tileloaddt164: 4436 case Intrinsic::x86_tilestored64: { 4437 if (!Subtarget->hasAMXTILE()) 4438 break; 4439 unsigned Opc; 4440 switch (IntNo) { 4441 default: llvm_unreachable("Unexpected intrinsic!"); 4442 case Intrinsic::x86_tileloadd64: Opc = X86::PTILELOADD; break; 4443 case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break; 4444 case Intrinsic::x86_tilestored64: Opc = X86::PTILESTORED; break; 4445 } 4446 // FIXME: Match displacement and scale. 4447 unsigned TIndex = Node->getConstantOperandVal(2); 4448 SDValue TReg = getI8Imm(TIndex, dl); 4449 SDValue Base = Node->getOperand(3); 4450 SDValue Scale = getI8Imm(1, dl); 4451 SDValue Index = Node->getOperand(4); 4452 SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32); 4453 SDValue Segment = CurDAG->getRegister(0, MVT::i16); 4454 SDValue Chain = Node->getOperand(0); 4455 MachineSDNode *CNode; 4456 if (Opc == X86::PTILESTORED) { 4457 SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain }; 4458 CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); 4459 } else { 4460 SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain }; 4461 CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); 4462 } 4463 ReplaceNode(Node, CNode); 4464 return; 4465 } 4466 } 4467 break; 4468 } 4469 case ISD::BRIND: { 4470 if (Subtarget->isTargetNaCl()) 4471 // NaCl has its own pass where jmp %r32 are converted to jmp %r64. We 4472 // leave the instruction alone. 4473 break; 4474 if (Subtarget->isTarget64BitILP32()) { 4475 // Converts a 32-bit register to a 64-bit, zero-extended version of 4476 // it. This is needed because x86-64 can do many things, but jmp %r32 4477 // ain't one of them. 4478 SDValue Target = Node->getOperand(1); 4479 assert(Target.getValueType() == MVT::i32 && "Unexpected VT!"); 4480 SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, MVT::i64); 4481 SDValue Brind = CurDAG->getNode(ISD::BRIND, dl, MVT::Other, 4482 Node->getOperand(0), ZextTarget); 4483 ReplaceNode(Node, Brind.getNode()); 4484 SelectCode(ZextTarget.getNode()); 4485 SelectCode(Brind.getNode()); 4486 return; 4487 } 4488 break; 4489 } 4490 case X86ISD::GlobalBaseReg: 4491 ReplaceNode(Node, getGlobalBaseReg()); 4492 return; 4493 4494 case ISD::BITCAST: 4495 // Just drop all 128/256/512-bit bitcasts. 4496 if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() || 4497 NVT == MVT::f128) { 4498 ReplaceUses(SDValue(Node, 0), Node->getOperand(0)); 4499 CurDAG->RemoveDeadNode(Node); 4500 return; 4501 } 4502 break; 4503 4504 case ISD::SRL: 4505 if (matchBitExtract(Node)) 4506 return; 4507 LLVM_FALLTHROUGH; 4508 case ISD::SRA: 4509 case ISD::SHL: 4510 if (tryShiftAmountMod(Node)) 4511 return; 4512 break; 4513 4514 case ISD::AND: 4515 if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) { 4516 // Try to form a masked VPTESTM. Operands can be in either order. 
4517 SDValue N0 = Node->getOperand(0); 4518 SDValue N1 = Node->getOperand(1); 4519 if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() && 4520 tryVPTESTM(Node, N0, N1)) 4521 return; 4522 if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() && 4523 tryVPTESTM(Node, N1, N0)) 4524 return; 4525 } 4526 4527 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) { 4528 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0)); 4529 CurDAG->RemoveDeadNode(Node); 4530 return; 4531 } 4532 if (matchBitExtract(Node)) 4533 return; 4534 if (AndImmShrink && shrinkAndImmediate(Node)) 4535 return; 4536 4537 LLVM_FALLTHROUGH; 4538 case ISD::OR: 4539 case ISD::XOR: 4540 if (tryShrinkShlLogicImm(Node)) 4541 return; 4542 if (Opcode == ISD::OR && tryMatchBitSelect(Node)) 4543 return; 4544 if (tryVPTERNLOG(Node)) 4545 return; 4546 4547 LLVM_FALLTHROUGH; 4548 case ISD::ADD: 4549 case ISD::SUB: { 4550 // Try to avoid folding immediates with multiple uses for optsize. 4551 // This code tries to select to register form directly to avoid going 4552 // through the isel table which might fold the immediate. We can't change 4553 // the patterns on the add/sub/and/or/xor with immediate paterns in the 4554 // tablegen files to check immediate use count without making the patterns 4555 // unavailable to the fast-isel table. 4556 if (!CurDAG->shouldOptForSize()) 4557 break; 4558 4559 // Only handle i8/i16/i32/i64. 4560 if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64) 4561 break; 4562 4563 SDValue N0 = Node->getOperand(0); 4564 SDValue N1 = Node->getOperand(1); 4565 4566 ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1); 4567 if (!Cst) 4568 break; 4569 4570 int64_t Val = Cst->getSExtValue(); 4571 4572 // Make sure its an immediate that is considered foldable. 4573 // FIXME: Handle unsigned 32 bit immediates for 64-bit AND. 4574 if (!isInt<8>(Val) && !isInt<32>(Val)) 4575 break; 4576 4577 // If this can match to INC/DEC, let it go. 4578 if (Opcode == ISD::ADD && (Val == 1 || Val == -1)) 4579 break; 4580 4581 // Check if we should avoid folding this immediate. 4582 if (!shouldAvoidImmediateInstFormsForSize(N1.getNode())) 4583 break; 4584 4585 // We should not fold the immediate. So we need a register form instead. 
4586 unsigned ROpc, MOpc; 4587 switch (NVT.SimpleTy) { 4588 default: llvm_unreachable("Unexpected VT!"); 4589 case MVT::i8: 4590 switch (Opcode) { 4591 default: llvm_unreachable("Unexpected opcode!"); 4592 case ISD::ADD: ROpc = X86::ADD8rr; MOpc = X86::ADD8rm; break; 4593 case ISD::SUB: ROpc = X86::SUB8rr; MOpc = X86::SUB8rm; break; 4594 case ISD::AND: ROpc = X86::AND8rr; MOpc = X86::AND8rm; break; 4595 case ISD::OR: ROpc = X86::OR8rr; MOpc = X86::OR8rm; break; 4596 case ISD::XOR: ROpc = X86::XOR8rr; MOpc = X86::XOR8rm; break; 4597 } 4598 break; 4599 case MVT::i16: 4600 switch (Opcode) { 4601 default: llvm_unreachable("Unexpected opcode!"); 4602 case ISD::ADD: ROpc = X86::ADD16rr; MOpc = X86::ADD16rm; break; 4603 case ISD::SUB: ROpc = X86::SUB16rr; MOpc = X86::SUB16rm; break; 4604 case ISD::AND: ROpc = X86::AND16rr; MOpc = X86::AND16rm; break; 4605 case ISD::OR: ROpc = X86::OR16rr; MOpc = X86::OR16rm; break; 4606 case ISD::XOR: ROpc = X86::XOR16rr; MOpc = X86::XOR16rm; break; 4607 } 4608 break; 4609 case MVT::i32: 4610 switch (Opcode) { 4611 default: llvm_unreachable("Unexpected opcode!"); 4612 case ISD::ADD: ROpc = X86::ADD32rr; MOpc = X86::ADD32rm; break; 4613 case ISD::SUB: ROpc = X86::SUB32rr; MOpc = X86::SUB32rm; break; 4614 case ISD::AND: ROpc = X86::AND32rr; MOpc = X86::AND32rm; break; 4615 case ISD::OR: ROpc = X86::OR32rr; MOpc = X86::OR32rm; break; 4616 case ISD::XOR: ROpc = X86::XOR32rr; MOpc = X86::XOR32rm; break; 4617 } 4618 break; 4619 case MVT::i64: 4620 switch (Opcode) { 4621 default: llvm_unreachable("Unexpected opcode!"); 4622 case ISD::ADD: ROpc = X86::ADD64rr; MOpc = X86::ADD64rm; break; 4623 case ISD::SUB: ROpc = X86::SUB64rr; MOpc = X86::SUB64rm; break; 4624 case ISD::AND: ROpc = X86::AND64rr; MOpc = X86::AND64rm; break; 4625 case ISD::OR: ROpc = X86::OR64rr; MOpc = X86::OR64rm; break; 4626 case ISD::XOR: ROpc = X86::XOR64rr; MOpc = X86::XOR64rm; break; 4627 } 4628 break; 4629 } 4630 4631 // Ok this is a AND/OR/XOR/ADD/SUB with constant. 4632 4633 // If this is a not a subtract, we can still try to fold a load. 4634 if (Opcode != ISD::SUB) { 4635 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; 4636 if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { 4637 SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) }; 4638 SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other); 4639 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); 4640 // Update the chain. 4641 ReplaceUses(N0.getValue(1), SDValue(CNode, 2)); 4642 // Record the mem-refs 4643 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N0)->getMemOperand()}); 4644 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0)); 4645 CurDAG->RemoveDeadNode(Node); 4646 return; 4647 } 4648 } 4649 4650 CurDAG->SelectNodeTo(Node, ROpc, NVT, MVT::i32, N0, N1); 4651 return; 4652 } 4653 4654 case X86ISD::SMUL: 4655 // i16/i32/i64 are handled with isel patterns. 4656 if (NVT != MVT::i8) 4657 break; 4658 LLVM_FALLTHROUGH; 4659 case X86ISD::UMUL: { 4660 SDValue N0 = Node->getOperand(0); 4661 SDValue N1 = Node->getOperand(1); 4662 4663 unsigned LoReg, ROpc, MOpc; 4664 switch (NVT.SimpleTy) { 4665 default: llvm_unreachable("Unsupported VT!"); 4666 case MVT::i8: 4667 LoReg = X86::AL; 4668 ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r; 4669 MOpc = Opcode == X86ISD::SMUL ? 
X86::IMUL8m : X86::MUL8m; 4670 break; 4671 case MVT::i16: 4672 LoReg = X86::AX; 4673 ROpc = X86::MUL16r; 4674 MOpc = X86::MUL16m; 4675 break; 4676 case MVT::i32: 4677 LoReg = X86::EAX; 4678 ROpc = X86::MUL32r; 4679 MOpc = X86::MUL32m; 4680 break; 4681 case MVT::i64: 4682 LoReg = X86::RAX; 4683 ROpc = X86::MUL64r; 4684 MOpc = X86::MUL64m; 4685 break; 4686 } 4687 4688 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; 4689 bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); 4690 // Multiply is commutative. 4691 if (!FoldedLoad) { 4692 FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); 4693 if (FoldedLoad) 4694 std::swap(N0, N1); 4695 } 4696 4697 SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg, 4698 N0, SDValue()).getValue(1); 4699 4700 MachineSDNode *CNode; 4701 if (FoldedLoad) { 4702 // i16/i32/i64 use an instruction that produces a low and high result even 4703 // though only the low result is used. 4704 SDVTList VTs; 4705 if (NVT == MVT::i8) 4706 VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other); 4707 else 4708 VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other); 4709 4710 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0), 4711 InFlag }; 4712 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); 4713 4714 // Update the chain. 4715 ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 2 : 3)); 4716 // Record the mem-refs 4717 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()}); 4718 } else { 4719 // i16/i32/i64 use an instruction that produces a low and high result even 4720 // though only the low result is used. 4721 SDVTList VTs; 4722 if (NVT == MVT::i8) 4723 VTs = CurDAG->getVTList(NVT, MVT::i32); 4724 else 4725 VTs = CurDAG->getVTList(NVT, NVT, MVT::i32); 4726 4727 CNode = CurDAG->getMachineNode(ROpc, dl, VTs, {N1, InFlag}); 4728 } 4729 4730 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0)); 4731 ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 1 : 2)); 4732 CurDAG->RemoveDeadNode(Node); 4733 return; 4734 } 4735 4736 case ISD::SMUL_LOHI: 4737 case ISD::UMUL_LOHI: { 4738 SDValue N0 = Node->getOperand(0); 4739 SDValue N1 = Node->getOperand(1); 4740 4741 unsigned Opc, MOpc; 4742 unsigned LoReg, HiReg; 4743 bool IsSigned = Opcode == ISD::SMUL_LOHI; 4744 bool UseMULX = !IsSigned && Subtarget->hasBMI2(); 4745 bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty(); 4746 switch (NVT.SimpleTy) { 4747 default: llvm_unreachable("Unsupported VT!"); 4748 case MVT::i32: 4749 Opc = UseMULXHi ? X86::MULX32Hrr : 4750 UseMULX ? X86::MULX32rr : 4751 IsSigned ? X86::IMUL32r : X86::MUL32r; 4752 MOpc = UseMULXHi ? X86::MULX32Hrm : 4753 UseMULX ? X86::MULX32rm : 4754 IsSigned ? X86::IMUL32m : X86::MUL32m; 4755 LoReg = UseMULX ? X86::EDX : X86::EAX; 4756 HiReg = X86::EDX; 4757 break; 4758 case MVT::i64: 4759 Opc = UseMULXHi ? X86::MULX64Hrr : 4760 UseMULX ? X86::MULX64rr : 4761 IsSigned ? X86::IMUL64r : X86::MUL64r; 4762 MOpc = UseMULXHi ? X86::MULX64Hrm : 4763 UseMULX ? X86::MULX64rm : 4764 IsSigned ? X86::IMUL64m : X86::MUL64m; 4765 LoReg = UseMULX ? X86::RDX : X86::RAX; 4766 HiReg = X86::RDX; 4767 break; 4768 } 4769 4770 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; 4771 bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); 4772 // Multiply is commmutative. 
4773 if (!foldedLoad) { 4774 foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); 4775 if (foldedLoad) 4776 std::swap(N0, N1); 4777 } 4778 4779 SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg, 4780 N0, SDValue()).getValue(1); 4781 SDValue ResHi, ResLo; 4782 if (foldedLoad) { 4783 SDValue Chain; 4784 MachineSDNode *CNode = nullptr; 4785 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0), 4786 InFlag }; 4787 if (UseMULXHi) { 4788 SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other); 4789 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); 4790 ResHi = SDValue(CNode, 0); 4791 Chain = SDValue(CNode, 1); 4792 } else if (UseMULX) { 4793 SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other); 4794 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); 4795 ResHi = SDValue(CNode, 0); 4796 ResLo = SDValue(CNode, 1); 4797 Chain = SDValue(CNode, 2); 4798 } else { 4799 SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue); 4800 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); 4801 Chain = SDValue(CNode, 0); 4802 InFlag = SDValue(CNode, 1); 4803 } 4804 4805 // Update the chain. 4806 ReplaceUses(N1.getValue(1), Chain); 4807 // Record the mem-refs 4808 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()}); 4809 } else { 4810 SDValue Ops[] = { N1, InFlag }; 4811 if (UseMULXHi) { 4812 SDVTList VTs = CurDAG->getVTList(NVT); 4813 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); 4814 ResHi = SDValue(CNode, 0); 4815 } else if (UseMULX) { 4816 SDVTList VTs = CurDAG->getVTList(NVT, NVT); 4817 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); 4818 ResHi = SDValue(CNode, 0); 4819 ResLo = SDValue(CNode, 1); 4820 } else { 4821 SDVTList VTs = CurDAG->getVTList(MVT::Glue); 4822 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); 4823 InFlag = SDValue(CNode, 0); 4824 } 4825 } 4826 4827 // Copy the low half of the result, if it is needed. 4828 if (!SDValue(Node, 0).use_empty()) { 4829 if (!ResLo) { 4830 assert(LoReg && "Register for low half is not defined!"); 4831 ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg, 4832 NVT, InFlag); 4833 InFlag = ResLo.getValue(2); 4834 } 4835 ReplaceUses(SDValue(Node, 0), ResLo); 4836 LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG); 4837 dbgs() << '\n'); 4838 } 4839 // Copy the high half of the result, if it is needed. 
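    // For plain MUL/IMUL the high half is left in the implicit EDX/RDX
    // register (HiReg), so it has to be copied out here; with MULX the high
    // half is an explicit def and ResHi was already populated above.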
4840 if (!SDValue(Node, 1).use_empty()) { 4841 if (!ResHi) { 4842 assert(HiReg && "Register for high half is not defined!"); 4843 ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg, 4844 NVT, InFlag); 4845 InFlag = ResHi.getValue(2); 4846 } 4847 ReplaceUses(SDValue(Node, 1), ResHi); 4848 LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG); 4849 dbgs() << '\n'); 4850 } 4851 4852 CurDAG->RemoveDeadNode(Node); 4853 return; 4854 } 4855 4856 case ISD::SDIVREM: 4857 case ISD::UDIVREM: { 4858 SDValue N0 = Node->getOperand(0); 4859 SDValue N1 = Node->getOperand(1); 4860 4861 unsigned ROpc, MOpc; 4862 bool isSigned = Opcode == ISD::SDIVREM; 4863 if (!isSigned) { 4864 switch (NVT.SimpleTy) { 4865 default: llvm_unreachable("Unsupported VT!"); 4866 case MVT::i8: ROpc = X86::DIV8r; MOpc = X86::DIV8m; break; 4867 case MVT::i16: ROpc = X86::DIV16r; MOpc = X86::DIV16m; break; 4868 case MVT::i32: ROpc = X86::DIV32r; MOpc = X86::DIV32m; break; 4869 case MVT::i64: ROpc = X86::DIV64r; MOpc = X86::DIV64m; break; 4870 } 4871 } else { 4872 switch (NVT.SimpleTy) { 4873 default: llvm_unreachable("Unsupported VT!"); 4874 case MVT::i8: ROpc = X86::IDIV8r; MOpc = X86::IDIV8m; break; 4875 case MVT::i16: ROpc = X86::IDIV16r; MOpc = X86::IDIV16m; break; 4876 case MVT::i32: ROpc = X86::IDIV32r; MOpc = X86::IDIV32m; break; 4877 case MVT::i64: ROpc = X86::IDIV64r; MOpc = X86::IDIV64m; break; 4878 } 4879 } 4880 4881 unsigned LoReg, HiReg, ClrReg; 4882 unsigned SExtOpcode; 4883 switch (NVT.SimpleTy) { 4884 default: llvm_unreachable("Unsupported VT!"); 4885 case MVT::i8: 4886 LoReg = X86::AL; ClrReg = HiReg = X86::AH; 4887 SExtOpcode = 0; // Not used. 4888 break; 4889 case MVT::i16: 4890 LoReg = X86::AX; HiReg = X86::DX; 4891 ClrReg = X86::DX; 4892 SExtOpcode = X86::CWD; 4893 break; 4894 case MVT::i32: 4895 LoReg = X86::EAX; ClrReg = HiReg = X86::EDX; 4896 SExtOpcode = X86::CDQ; 4897 break; 4898 case MVT::i64: 4899 LoReg = X86::RAX; ClrReg = HiReg = X86::RDX; 4900 SExtOpcode = X86::CQO; 4901 break; 4902 } 4903 4904 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; 4905 bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); 4906 bool signBitIsZero = CurDAG->SignBitIsZero(N0); 4907 4908 SDValue InFlag; 4909 if (NVT == MVT::i8) { 4910 // Special case for div8, just use a move with zero extension to AX to 4911 // clear the upper 8 bits (AH). 4912 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain; 4913 MachineSDNode *Move; 4914 if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { 4915 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) }; 4916 unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8 4917 : X86::MOVZX16rm8; 4918 Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, MVT::Other, Ops); 4919 Chain = SDValue(Move, 1); 4920 ReplaceUses(N0.getValue(1), Chain); 4921 // Record the mem-refs 4922 CurDAG->setNodeMemRefs(Move, {cast<LoadSDNode>(N0)->getMemOperand()}); 4923 } else { 4924 unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8 4925 : X86::MOVZX16rr8; 4926 Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, N0); 4927 Chain = CurDAG->getEntryNode(); 4928 } 4929 Chain = CurDAG->getCopyToReg(Chain, dl, X86::AX, SDValue(Move, 0), 4930 SDValue()); 4931 InFlag = Chain.getValue(1); 4932 } else { 4933 InFlag = 4934 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, 4935 LoReg, N0, SDValue()).getValue(1); 4936 if (isSigned && !signBitIsZero) { 4937 // Sign extend the low part into the high part. 
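        // CWD/CDQ/CQO replicate the sign bit of AX/EAX/RAX into DX/EDX/RDX,
        // which is what IDIV expects in the high half of the dividend.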
4938 InFlag = 4939 SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0); 4940 } else { 4941 // Zero out the high part, effectively zero extending the input. 4942 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32); 4943 SDValue ClrNode = 4944 SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, None), 0); 4945 switch (NVT.SimpleTy) { 4946 case MVT::i16: 4947 ClrNode = 4948 SDValue(CurDAG->getMachineNode( 4949 TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode, 4950 CurDAG->getTargetConstant(X86::sub_16bit, dl, 4951 MVT::i32)), 4952 0); 4953 break; 4954 case MVT::i32: 4955 break; 4956 case MVT::i64: 4957 ClrNode = 4958 SDValue(CurDAG->getMachineNode( 4959 TargetOpcode::SUBREG_TO_REG, dl, MVT::i64, 4960 CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode, 4961 CurDAG->getTargetConstant(X86::sub_32bit, dl, 4962 MVT::i32)), 4963 0); 4964 break; 4965 default: 4966 llvm_unreachable("Unexpected division source"); 4967 } 4968 4969 InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg, 4970 ClrNode, InFlag).getValue(1); 4971 } 4972 } 4973 4974 if (foldedLoad) { 4975 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0), 4976 InFlag }; 4977 MachineSDNode *CNode = 4978 CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops); 4979 InFlag = SDValue(CNode, 1); 4980 // Update the chain. 4981 ReplaceUses(N1.getValue(1), SDValue(CNode, 0)); 4982 // Record the mem-refs 4983 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()}); 4984 } else { 4985 InFlag = 4986 SDValue(CurDAG->getMachineNode(ROpc, dl, MVT::Glue, N1, InFlag), 0); 4987 } 4988 4989 // Prevent use of AH in a REX instruction by explicitly copying it to 4990 // an ABCD_L register. 4991 // 4992 // The current assumption of the register allocator is that isel 4993 // won't generate explicit references to the GR8_ABCD_H registers. If 4994 // the allocator and/or the backend get enhanced to be more robust in 4995 // that regard, this can be, and should be, removed. 4996 if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) { 4997 SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8); 4998 unsigned AHExtOpcode = 4999 isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX; 5000 5001 SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32, 5002 MVT::Glue, AHCopy, InFlag); 5003 SDValue Result(RNode, 0); 5004 InFlag = SDValue(RNode, 1); 5005 5006 Result = 5007 CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result); 5008 5009 ReplaceUses(SDValue(Node, 1), Result); 5010 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); 5011 dbgs() << '\n'); 5012 } 5013 // Copy the division (low) result, if it is needed. 5014 if (!SDValue(Node, 0).use_empty()) { 5015 SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, 5016 LoReg, NVT, InFlag); 5017 InFlag = Result.getValue(2); 5018 ReplaceUses(SDValue(Node, 0), Result); 5019 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); 5020 dbgs() << '\n'); 5021 } 5022 // Copy the remainder (high) result, if it is needed. 
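    // The remainder is left in the high register (AH/DX/EDX/RDX). If it was
    // not already extracted through the AH special case above, copy it out of
    // HiReg here. Roughly, (udivrem i32 %a, %b) with a register divisor
    // becomes:
    //   $eax = COPY %a
    //   $edx = MOV32r0        ; zero the high half of the dividend
    //   DIV32r %b             ; quotient in EAX, remainder in EDX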
    if (!SDValue(Node, 1).use_empty()) {
      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
                                              HiReg, NVT, InFlag);
      InFlag = Result.getValue(2);
      ReplaceUses(SDValue(Node, 1), Result);
      LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }
    CurDAG->RemoveDeadNode(Node);
    return;
  }

  case X86ISD::FCMP:
  case X86ISD::STRICT_FCMP:
  case X86ISD::STRICT_FCMPS: {
    bool IsStrictCmp = Node->getOpcode() == X86ISD::STRICT_FCMP ||
                       Node->getOpcode() == X86ISD::STRICT_FCMPS;
    SDValue N0 = Node->getOperand(IsStrictCmp ? 1 : 0);
    SDValue N1 = Node->getOperand(IsStrictCmp ? 2 : 1);

    // Save the original VT of the compare.
    MVT CmpVT = N0.getSimpleValueType();

    // Floating point needs special handling if we don't have FCOMI
    // (FCOMI is available whenever CMOV is, hence the hasCMov() check below).
    if (Subtarget->hasCMov())
      break;

    bool IsSignaling = Node->getOpcode() == X86ISD::STRICT_FCMPS;

    unsigned Opc;
    switch (CmpVT.SimpleTy) {
    default: llvm_unreachable("Unexpected type!");
    case MVT::f32:
      Opc = IsSignaling ? X86::COM_Fpr32 : X86::UCOM_Fpr32;
      break;
    case MVT::f64:
      Opc = IsSignaling ? X86::COM_Fpr64 : X86::UCOM_Fpr64;
      break;
    case MVT::f80:
      Opc = IsSignaling ? X86::COM_Fpr80 : X86::UCOM_Fpr80;
      break;
    }

    SDValue Cmp;
    SDValue Chain =
        IsStrictCmp ? Node->getOperand(0) : CurDAG->getEntryNode();
    if (IsStrictCmp) {
      SDVTList VTs = CurDAG->getVTList(MVT::i16, MVT::Other);
      Cmp = SDValue(CurDAG->getMachineNode(Opc, dl, VTs, {N0, N1, Chain}), 0);
      Chain = Cmp.getValue(1);
    } else {
      Cmp = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::i16, N0, N1), 0);
    }

    // Move FPSW to AX.
    SDValue FPSW = CurDAG->getCopyToReg(Chain, dl, X86::FPSW, Cmp, SDValue());
    Chain = FPSW;
    SDValue FNSTSW =
        SDValue(CurDAG->getMachineNode(X86::FNSTSW16r, dl, MVT::i16, FPSW,
                                       FPSW.getValue(1)),
                0);

    // Extract the upper 8 bits of AX.
    SDValue Extract =
        CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl, MVT::i8, FNSTSW);

    // Move AH into flags.
    // Some 64-bit targets lack SAHF support, but they do support FCOMI.
    assert(Subtarget->hasLAHFSAHF() &&
           "Target doesn't support SAHF or FCOMI?");
    SDValue AH = CurDAG->getCopyToReg(Chain, dl, X86::AH, Extract, SDValue());
    Chain = AH;
    SDValue SAHF = SDValue(
        CurDAG->getMachineNode(X86::SAHF, dl, MVT::i32, AH.getValue(1)), 0);

    if (IsStrictCmp)
      ReplaceUses(SDValue(Node, 1), Chain);

    ReplaceUses(SDValue(Node, 0), SAHF);
    CurDAG->RemoveDeadNode(Node);
    return;
  }

  case X86ISD::CMP: {
    SDValue N0 = Node->getOperand(0);
    SDValue N1 = Node->getOperand(1);

    // Optimizations for TEST compares.
    if (!isNullConstant(N1))
      break;

    // Save the original VT of the compare.
    MVT CmpVT = N0.getSimpleValueType();

    // If we are comparing (and (shr X, C), Mask) with 0, emit a BEXTR followed
    // by a test instruction. The test should be removed later by
    // analyzeCompare if we are using only the zero flag.
    // TODO: Should we check the users and use the BEXTR flags directly?
    if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
      if (MachineSDNode *NewNode = matchBEXTRFromAndImm(N0.getNode())) {
        unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
                                             : X86::TEST32rr;
        SDValue BEXTR = SDValue(NewNode, 0);
        NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR);
        ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
        CurDAG->RemoveDeadNode(Node);
        return;
      }
    }

    // We can peek through truncates, but we need to be careful below.
    if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
      N0 = N0.getOperand(0);

    // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
    // use a smaller encoding.
    // Look past the truncate if CMP is the only use of it.
    if (N0.getOpcode() == ISD::AND &&
        N0.getNode()->hasOneUse() &&
        N0.getValueType() != MVT::i8) {
      ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
      if (!C) break;
      uint64_t Mask = C->getZExtValue();

      // Check if we can replace AND+IMM64 with a shift. This is possible for
      // masks like 0xFF000000 or 0x00FFFFFF and if we care only about the zero
      // flag.
      if (CmpVT == MVT::i64 && !isInt<32>(Mask) &&
          onlyUsesZeroFlag(SDValue(Node, 0))) {
        if (isMask_64(~Mask)) {
          unsigned TrailingZeros = countTrailingZeros(Mask);
          SDValue Imm = CurDAG->getTargetConstant(TrailingZeros, dl, MVT::i64);
          SDValue Shift =
              SDValue(CurDAG->getMachineNode(X86::SHR64ri, dl, MVT::i64,
                                             MVT::i32, N0.getOperand(0), Imm),
                      0);
          MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
                                                       MVT::i32, Shift, Shift);
          ReplaceNode(Node, Test);
          return;
        }
        if (isMask_64(Mask)) {
          unsigned LeadingZeros = countLeadingZeros(Mask);
          SDValue Imm = CurDAG->getTargetConstant(LeadingZeros, dl, MVT::i64);
          SDValue Shift =
              SDValue(CurDAG->getMachineNode(X86::SHL64ri, dl, MVT::i64,
                                             MVT::i32, N0.getOperand(0), Imm),
                      0);
          MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
                                                       MVT::i32, Shift, Shift);
          ReplaceNode(Node, Test);
          return;
        }
      }

      MVT VT;
      int SubRegOp;
      unsigned ROpc, MOpc;

      // For each of these checks we need to be careful if the sign flag is
      // being used. It is only safe to use the sign flag in two conditions,
      // either the sign bit in the shrunken mask is zero or the final test
      // size is equal to the original compare size.

      if (isUInt<8>(Mask) &&
          (!(Mask & 0x80) || CmpVT == MVT::i8 ||
           hasNoSignFlagUses(SDValue(Node, 0)))) {
        // For example, convert "testl %eax, $8" to "testb %al, $8".
        VT = MVT::i8;
        SubRegOp = X86::sub_8bit;
        ROpc = X86::TEST8ri;
        MOpc = X86::TEST8mi;
      } else if (OptForMinSize && isUInt<16>(Mask) &&
                 (!(Mask & 0x8000) || CmpVT == MVT::i16 ||
                  hasNoSignFlagUses(SDValue(Node, 0)))) {
        // For example, "testl %eax, $32776" to "testw %ax, $32776".
        // NOTE: We only want to form TESTW instructions if optimizing for
        // min size. Otherwise we only save one byte and possibly get a length
        // changing prefix penalty in the decoders.
        VT = MVT::i16;
        SubRegOp = X86::sub_16bit;
        ROpc = X86::TEST16ri;
        MOpc = X86::TEST16mi;
      } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 &&
                 ((!(Mask & 0x80000000) &&
                   // Without minsize 16-bit Cmps can get here so we need to
                   // be sure we calculate the correct sign flag if needed.
                   (CmpVT != MVT::i16 || !(Mask & 0x8000))) ||
                  CmpVT == MVT::i32 ||
                  hasNoSignFlagUses(SDValue(Node, 0)))) {
        // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
        // NOTE: We only want to run that transform if N0 is 32 or 64 bits.
        // Otherwise, we find ourselves in a position where we have to do
        // promotion. If previous passes did not promote the and, we assume
        // they had a good reason not to and do not promote here.
        VT = MVT::i32;
        SubRegOp = X86::sub_32bit;
        ROpc = X86::TEST32ri;
        MOpc = X86::TEST32mi;
      } else {
        // No eligible transformation was found.
        break;
      }

      SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
      SDValue Reg = N0.getOperand(0);

      // Emit a testl or testw.
      MachineSDNode *NewNode;
      SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
      if (tryFoldLoad(Node, N0.getNode(), Reg, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
        if (auto *LoadN = dyn_cast<LoadSDNode>(N0.getOperand(0).getNode())) {
          if (!LoadN->isSimple()) {
            unsigned NumVolBits = LoadN->getValueType(0).getSizeInBits();
            if (MOpc == X86::TEST8mi && NumVolBits != 8)
              break;
            else if (MOpc == X86::TEST16mi && NumVolBits != 16)
              break;
            else if (MOpc == X86::TEST32mi && NumVolBits != 32)
              break;
          }
        }
        SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
                          Reg.getOperand(0) };
        NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops);
        // Update the chain.
        ReplaceUses(Reg.getValue(1), SDValue(NewNode, 1));
        // Record the mem-refs
        CurDAG->setNodeMemRefs(NewNode,
                               {cast<LoadSDNode>(Reg)->getMemOperand()});
      } else {
        // Extract the subregister if necessary.
        if (N0.getValueType() != VT)
          Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);

        NewNode = CurDAG->getMachineNode(ROpc, dl, MVT::i32, Reg, Imm);
      }
      // Replace CMP with TEST.
      ReplaceNode(Node, NewNode);
      return;
    }
    break;
  }
  case X86ISD::PCMPISTR: {
    if (!Subtarget->hasSSE42())
      break;

    bool NeedIndex = !SDValue(Node, 0).use_empty();
    bool NeedMask = !SDValue(Node, 1).use_empty();
    // We can't fold a load if we are going to make two instructions.
    bool MayFoldLoad = !NeedIndex || !NeedMask;

    MachineSDNode *CNode;
    if (NeedMask) {
      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrr : X86::PCMPISTRMrr;
      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrm : X86::PCMPISTRMrm;
      CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
      ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
    }
    if (NeedIndex || !NeedMask) {
      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrr : X86::PCMPISTRIrr;
      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrm : X86::PCMPISTRIrm;
      CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
      ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
    }

    // Connect the flag usage to the last instruction created.
    ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case X86ISD::PCMPESTR: {
    if (!Subtarget->hasSSE42())
      break;

    // Copy the two implicit register inputs.
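    // PCMPESTRI/PCMPESTRM take the explicit string lengths in EAX and EDX, so
    // those two values are copied into place first and glued to the compare.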
5297 SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX, 5298 Node->getOperand(1), 5299 SDValue()).getValue(1); 5300 InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX, 5301 Node->getOperand(3), InFlag).getValue(1); 5302 5303 bool NeedIndex = !SDValue(Node, 0).use_empty(); 5304 bool NeedMask = !SDValue(Node, 1).use_empty(); 5305 // We can't fold a load if we are going to make two instructions. 5306 bool MayFoldLoad = !NeedIndex || !NeedMask; 5307 5308 MachineSDNode *CNode; 5309 if (NeedMask) { 5310 unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrr : X86::PCMPESTRMrr; 5311 unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrm : X86::PCMPESTRMrm; 5312 CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node, 5313 InFlag); 5314 ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0)); 5315 } 5316 if (NeedIndex || !NeedMask) { 5317 unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrr : X86::PCMPESTRIrr; 5318 unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrm : X86::PCMPESTRIrm; 5319 CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InFlag); 5320 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0)); 5321 } 5322 // Connect the flag usage to the last instruction created. 5323 ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1)); 5324 CurDAG->RemoveDeadNode(Node); 5325 return; 5326 } 5327 5328 case ISD::SETCC: { 5329 if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue())) 5330 return; 5331 5332 break; 5333 } 5334 5335 case ISD::STORE: 5336 if (foldLoadStoreIntoMemOperand(Node)) 5337 return; 5338 break; 5339 5340 case X86ISD::SETCC_CARRY: { 5341 // We have to do this manually because tblgen will put the eflags copy in 5342 // the wrong place if we use an extract_subreg in the pattern. 5343 MVT VT = Node->getSimpleValueType(0); 5344 5345 // Copy flags to the EFLAGS register and glue it to next node. 5346 SDValue EFLAGS = 5347 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS, 5348 Node->getOperand(1), SDValue()); 5349 5350 // Create a 64-bit instruction if the result is 64-bits otherwise use the 5351 // 32-bit version. 5352 unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r; 5353 MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32; 5354 SDValue Result = SDValue( 5355 CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)), 0); 5356 5357 // For less than 32-bits we need to extract from the 32-bit node. 5358 if (VT == MVT::i8 || VT == MVT::i16) { 5359 int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit; 5360 Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result); 5361 } 5362 5363 ReplaceUses(SDValue(Node, 0), Result); 5364 CurDAG->RemoveDeadNode(Node); 5365 return; 5366 } 5367 case X86ISD::SBB: { 5368 if (isNullConstant(Node->getOperand(0)) && 5369 isNullConstant(Node->getOperand(1))) { 5370 MVT VT = Node->getSimpleValueType(0); 5371 5372 // Create zero. 5373 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32); 5374 SDValue Zero = 5375 SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, None), 0); 5376 if (VT == MVT::i64) { 5377 Zero = SDValue( 5378 CurDAG->getMachineNode( 5379 TargetOpcode::SUBREG_TO_REG, dl, MVT::i64, 5380 CurDAG->getTargetConstant(0, dl, MVT::i64), Zero, 5381 CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)), 5382 0); 5383 } 5384 5385 // Copy flags to the EFLAGS register and glue it to next node. 
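      // With both register operands zero, SBB computes 0 - 0 - CF, i.e. it
      // materializes 0 or -1 depending on the incoming carry flag.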
      SDValue EFLAGS =
          CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
                               Node->getOperand(2), SDValue());

      // Create a 64-bit instruction if the result is 64 bits, otherwise use
      // the 32-bit version.
      unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr;
      MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
      VTs = CurDAG->getVTList(SBBVT, MVT::i32);
      SDValue Result =
          SDValue(CurDAG->getMachineNode(Opc, dl, VTs, {Zero, Zero, EFLAGS,
                                                        EFLAGS.getValue(1)}),
                  0);

      // Replace the flag use.
      ReplaceUses(SDValue(Node, 1), Result.getValue(1));

      // Replace the result use.
      if (!SDValue(Node, 0).use_empty()) {
        // For less than 32-bits we need to extract from the 32-bit node.
        if (VT == MVT::i8 || VT == MVT::i16) {
          int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
          Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
        }
        ReplaceUses(SDValue(Node, 0), Result);
      }

      CurDAG->RemoveDeadNode(Node);
      return;
    }
    break;
  }
  case X86ISD::MGATHER: {
    auto *Mgt = cast<X86MaskedGatherSDNode>(Node);
    SDValue IndexOp = Mgt->getIndex();
    SDValue Mask = Mgt->getMask();
    MVT IndexVT = IndexOp.getSimpleValueType();
    MVT ValueVT = Node->getSimpleValueType(0);
    MVT MaskVT = Mask.getSimpleValueType();

    // This is just to prevent crashes if the nodes are malformed somehow.
    // We're otherwise only doing loose type checking here, based on what a
    // type constraint would say, just like table-based isel.
    if (!ValueVT.isVector() || !MaskVT.isVector())
      break;

    unsigned NumElts = ValueVT.getVectorNumElements();
    MVT ValueSVT = ValueVT.getVectorElementType();

    bool IsFP = ValueSVT.isFloatingPoint();
    unsigned EltSize = ValueSVT.getSizeInBits();

    unsigned Opc = 0;
    bool AVX512Gather = MaskVT.getVectorElementType() == MVT::i1;
    if (AVX512Gather) {
      if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSZ128rm : X86::VPGATHERDDZ128rm;
      else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSZ256rm : X86::VPGATHERDDZ256rm;
      else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSZrm : X86::VPGATHERDDZrm;
      else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDZ128rm : X86::VPGATHERDQZ128rm;
      else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDZ256rm : X86::VPGATHERDQZ256rm;
      else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDZrm : X86::VPGATHERDQZrm;
      else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSZ128rm : X86::VPGATHERQDZ128rm;
      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSZ256rm : X86::VPGATHERQDZ256rm;
      else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSZrm : X86::VPGATHERQDZrm;
      else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDZ128rm : X86::VPGATHERQQZ128rm;
      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDZ256rm : X86::VPGATHERQQZ256rm;
      else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDZrm : X86::VPGATHERQQZrm;
    } else {
      assert(EVT(MaskVT) == EVT(ValueVT).changeVectorElementTypeToInteger() &&
             "Unexpected mask VT!");
      if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSrm : X86::VPGATHERDDrm;
      else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSYrm : X86::VPGATHERDDYrm;
      else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDrm : X86::VPGATHERDQrm;
      else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDYrm : X86::VPGATHERDQYrm;
      else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSrm : X86::VPGATHERQDrm;
      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSYrm : X86::VPGATHERQDYrm;
      else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDrm : X86::VPGATHERQQrm;
      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDYrm : X86::VPGATHERQQYrm;
    }

    if (!Opc)
      break;

    SDValue Base, Scale, Index, Disp, Segment;
    if (!selectVectorAddr(Mgt, Mgt->getBasePtr(), IndexOp, Mgt->getScale(),
                          Base, Scale, Index, Disp, Segment))
      break;

    SDValue PassThru = Mgt->getPassThru();
    SDValue Chain = Mgt->getChain();
    // Gather instructions have a mask output not in the ISD node.
    SDVTList VTs = CurDAG->getVTList(ValueVT, MaskVT, MVT::Other);

    MachineSDNode *NewNode;
    if (AVX512Gather) {
      SDValue Ops[] = {PassThru, Mask, Base, Scale,
                       Index, Disp, Segment, Chain};
      NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
    } else {
      SDValue Ops[] = {PassThru, Base, Scale, Index,
                       Disp, Segment, Mask, Chain};
      NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
    }
    CurDAG->setNodeMemRefs(NewNode, {Mgt->getMemOperand()});
    ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
    ReplaceUses(SDValue(Node, 1), SDValue(NewNode, 2));
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case X86ISD::MSCATTER: {
    auto *Sc = cast<X86MaskedScatterSDNode>(Node);
    SDValue Value = Sc->getValue();
    SDValue IndexOp = Sc->getIndex();
    MVT IndexVT = IndexOp.getSimpleValueType();
    MVT ValueVT = Value.getSimpleValueType();

    // This is just to prevent crashes if the nodes are malformed somehow.
    // We're otherwise only doing loose type checking here, based on what a
    // type constraint would say, just like table-based isel.
    if (!ValueVT.isVector())
      break;

    unsigned NumElts = ValueVT.getVectorNumElements();
    MVT ValueSVT = ValueVT.getVectorElementType();

    bool IsFP = ValueSVT.isFloatingPoint();
    unsigned EltSize = ValueSVT.getSizeInBits();

    unsigned Opc;
    if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERDPSZ128mr : X86::VPSCATTERDDZ128mr;
    else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERDPSZ256mr : X86::VPSCATTERDDZ256mr;
    else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
      Opc = IsFP ?
X86::VSCATTERDPSZmr : X86::VPSCATTERDDZmr; 5541 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64) 5542 Opc = IsFP ? X86::VSCATTERDPDZ128mr : X86::VPSCATTERDQZ128mr; 5543 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64) 5544 Opc = IsFP ? X86::VSCATTERDPDZ256mr : X86::VPSCATTERDQZ256mr; 5545 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64) 5546 Opc = IsFP ? X86::VSCATTERDPDZmr : X86::VPSCATTERDQZmr; 5547 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32) 5548 Opc = IsFP ? X86::VSCATTERQPSZ128mr : X86::VPSCATTERQDZ128mr; 5549 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32) 5550 Opc = IsFP ? X86::VSCATTERQPSZ256mr : X86::VPSCATTERQDZ256mr; 5551 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32) 5552 Opc = IsFP ? X86::VSCATTERQPSZmr : X86::VPSCATTERQDZmr; 5553 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64) 5554 Opc = IsFP ? X86::VSCATTERQPDZ128mr : X86::VPSCATTERQQZ128mr; 5555 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64) 5556 Opc = IsFP ? X86::VSCATTERQPDZ256mr : X86::VPSCATTERQQZ256mr; 5557 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64) 5558 Opc = IsFP ? X86::VSCATTERQPDZmr : X86::VPSCATTERQQZmr; 5559 else 5560 break; 5561 5562 SDValue Base, Scale, Index, Disp, Segment; 5563 if (!selectVectorAddr(Sc, Sc->getBasePtr(), IndexOp, Sc->getScale(), 5564 Base, Scale, Index, Disp, Segment)) 5565 break; 5566 5567 SDValue Mask = Sc->getMask(); 5568 SDValue Chain = Sc->getChain(); 5569 // Scatter instructions have a mask output not in the ISD node. 5570 SDVTList VTs = CurDAG->getVTList(Mask.getValueType(), MVT::Other); 5571 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Value, Chain}; 5572 5573 MachineSDNode *NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops); 5574 CurDAG->setNodeMemRefs(NewNode, {Sc->getMemOperand()}); 5575 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 1)); 5576 CurDAG->RemoveDeadNode(Node); 5577 return; 5578 } 5579 case ISD::PREALLOCATED_SETUP: { 5580 auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); 5581 auto CallId = MFI->getPreallocatedIdForCallSite( 5582 cast<SrcValueSDNode>(Node->getOperand(1))->getValue()); 5583 SDValue Chain = Node->getOperand(0); 5584 SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32); 5585 MachineSDNode *New = CurDAG->getMachineNode( 5586 TargetOpcode::PREALLOCATED_SETUP, dl, MVT::Other, CallIdValue, Chain); 5587 ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Chain 5588 CurDAG->RemoveDeadNode(Node); 5589 return; 5590 } 5591 case ISD::PREALLOCATED_ARG: { 5592 auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); 5593 auto CallId = MFI->getPreallocatedIdForCallSite( 5594 cast<SrcValueSDNode>(Node->getOperand(1))->getValue()); 5595 SDValue Chain = Node->getOperand(0); 5596 SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32); 5597 SDValue ArgIndex = Node->getOperand(2); 5598 SDValue Ops[3]; 5599 Ops[0] = CallIdValue; 5600 Ops[1] = ArgIndex; 5601 Ops[2] = Chain; 5602 MachineSDNode *New = CurDAG->getMachineNode( 5603 TargetOpcode::PREALLOCATED_ARG, dl, 5604 CurDAG->getVTList(TLI->getPointerTy(CurDAG->getDataLayout()), 5605 MVT::Other), 5606 Ops); 5607 ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Arg pointer 5608 ReplaceUses(SDValue(Node, 1), SDValue(New, 1)); // Chain 5609 CurDAG->RemoveDeadNode(Node); 5610 return; 5611 } 5612 } 5613 5614 SelectCode(Node); 5615 } 5616 5617 bool X86DAGToDAGISel:: 5618 
SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, 5619 std::vector<SDValue> &OutOps) { 5620 SDValue Op0, Op1, Op2, Op3, Op4; 5621 switch (ConstraintID) { 5622 default: 5623 llvm_unreachable("Unexpected asm memory constraint"); 5624 case InlineAsm::Constraint_o: // offsetable ?? 5625 case InlineAsm::Constraint_v: // not offsetable ?? 5626 case InlineAsm::Constraint_m: // memory 5627 case InlineAsm::Constraint_X: 5628 if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4)) 5629 return true; 5630 break; 5631 } 5632 5633 OutOps.push_back(Op0); 5634 OutOps.push_back(Op1); 5635 OutOps.push_back(Op2); 5636 OutOps.push_back(Op3); 5637 OutOps.push_back(Op4); 5638 return false; 5639 } 5640 5641 /// This pass converts a legalized DAG into a X86-specific DAG, 5642 /// ready for instruction scheduling. 5643 FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM, 5644 CodeGenOpt::Level OptLevel) { 5645 return new X86DAGToDAGISel(TM, OptLevel); 5646 } 5647