1 //===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file defines a DAG pattern matching instruction selector for X86, 10 // converting from a legalized dag to a X86 dag. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "X86.h" 15 #include "X86MachineFunctionInfo.h" 16 #include "X86RegisterInfo.h" 17 #include "X86Subtarget.h" 18 #include "X86TargetMachine.h" 19 #include "llvm/ADT/Statistic.h" 20 #include "llvm/CodeGen/SelectionDAGISel.h" 21 #include "llvm/Config/llvm-config.h" 22 #include "llvm/IR/ConstantRange.h" 23 #include "llvm/IR/Function.h" 24 #include "llvm/IR/Instructions.h" 25 #include "llvm/IR/Intrinsics.h" 26 #include "llvm/IR/IntrinsicsX86.h" 27 #include "llvm/IR/Type.h" 28 #include "llvm/Support/Debug.h" 29 #include "llvm/Support/ErrorHandling.h" 30 #include "llvm/Support/KnownBits.h" 31 #include "llvm/Support/MathExtras.h" 32 #include <stdint.h> 33 using namespace llvm; 34 35 #define DEBUG_TYPE "x86-isel" 36 37 STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor"); 38 39 static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(true), 40 cl::desc("Enable setting constant bits to reduce size of mask immediates"), 41 cl::Hidden); 42 43 static cl::opt<bool> EnablePromoteAnyextLoad( 44 "x86-promote-anyext-load", cl::init(true), 45 cl::desc("Enable promoting aligned anyext load to wider load"), cl::Hidden); 46 47 //===----------------------------------------------------------------------===// 48 // Pattern Matcher Implementation 49 //===----------------------------------------------------------------------===// 50 51 namespace { 52 /// This 
corresponds to X86AddressMode, but uses SDValue's instead of register 53 /// numbers for the leaves of the matched tree. 54 struct X86ISelAddressMode { 55 enum { 56 RegBase, 57 FrameIndexBase 58 } BaseType; 59 60 // This is really a union, discriminated by BaseType! 61 SDValue Base_Reg; 62 int Base_FrameIndex; 63 64 unsigned Scale; 65 SDValue IndexReg; 66 int32_t Disp; 67 SDValue Segment; 68 const GlobalValue *GV; 69 const Constant *CP; 70 const BlockAddress *BlockAddr; 71 const char *ES; 72 MCSymbol *MCSym; 73 int JT; 74 Align Alignment; // CP alignment. 75 unsigned char SymbolFlags; // X86II::MO_* 76 bool NegateIndex = false; 77 78 X86ISelAddressMode() 79 : BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0), 80 Segment(), GV(nullptr), CP(nullptr), BlockAddr(nullptr), ES(nullptr), 81 MCSym(nullptr), JT(-1), SymbolFlags(X86II::MO_NO_FLAG) {} 82 83 bool hasSymbolicDisplacement() const { 84 return GV != nullptr || CP != nullptr || ES != nullptr || 85 MCSym != nullptr || JT != -1 || BlockAddr != nullptr; 86 } 87 88 bool hasBaseOrIndexReg() const { 89 return BaseType == FrameIndexBase || 90 IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr; 91 } 92 93 /// Return true if this addressing mode is already RIP-relative. 
94 bool isRIPRelative() const { 95 if (BaseType != RegBase) return false; 96 if (RegisterSDNode *RegNode = 97 dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode())) 98 return RegNode->getReg() == X86::RIP; 99 return false; 100 } 101 102 void setBaseReg(SDValue Reg) { 103 BaseType = RegBase; 104 Base_Reg = Reg; 105 } 106 107 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 108 void dump(SelectionDAG *DAG = nullptr) { 109 dbgs() << "X86ISelAddressMode " << this << '\n'; 110 dbgs() << "Base_Reg "; 111 if (Base_Reg.getNode()) 112 Base_Reg.getNode()->dump(DAG); 113 else 114 dbgs() << "nul\n"; 115 if (BaseType == FrameIndexBase) 116 dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n'; 117 dbgs() << " Scale " << Scale << '\n' 118 << "IndexReg "; 119 if (NegateIndex) 120 dbgs() << "negate "; 121 if (IndexReg.getNode()) 122 IndexReg.getNode()->dump(DAG); 123 else 124 dbgs() << "nul\n"; 125 dbgs() << " Disp " << Disp << '\n' 126 << "GV "; 127 if (GV) 128 GV->dump(); 129 else 130 dbgs() << "nul"; 131 dbgs() << " CP "; 132 if (CP) 133 CP->dump(); 134 else 135 dbgs() << "nul"; 136 dbgs() << '\n' 137 << "ES "; 138 if (ES) 139 dbgs() << ES; 140 else 141 dbgs() << "nul"; 142 dbgs() << " MCSym "; 143 if (MCSym) 144 dbgs() << MCSym; 145 else 146 dbgs() << "nul"; 147 dbgs() << " JT" << JT << " Align" << Alignment.value() << '\n'; 148 } 149 #endif 150 }; 151 } 152 153 namespace { 154 //===--------------------------------------------------------------------===// 155 /// ISel - X86-specific code to select X86 machine instructions for 156 /// SelectionDAG operations. 157 /// 158 class X86DAGToDAGISel final : public SelectionDAGISel { 159 /// Keep a pointer to the X86Subtarget around so that we can 160 /// make the right decision when generating code for different targets. 161 const X86Subtarget *Subtarget; 162 163 /// If true, selector should try to optimize for minimum code size. 164 bool OptForMinSize; 165 166 /// Disable direct TLS access through segment registers. 
167 bool IndirectTlsSegRefs; 168 169 public: 170 explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel) 171 : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr), 172 OptForMinSize(false), IndirectTlsSegRefs(false) {} 173 174 StringRef getPassName() const override { 175 return "X86 DAG->DAG Instruction Selection"; 176 } 177 178 bool runOnMachineFunction(MachineFunction &MF) override { 179 // Reset the subtarget each time through. 180 Subtarget = &MF.getSubtarget<X86Subtarget>(); 181 IndirectTlsSegRefs = MF.getFunction().hasFnAttribute( 182 "indirect-tls-seg-refs"); 183 184 // OptFor[Min]Size are used in pattern predicates that isel is matching. 185 OptForMinSize = MF.getFunction().hasMinSize(); 186 assert((!OptForMinSize || MF.getFunction().hasOptSize()) && 187 "OptForMinSize implies OptForSize"); 188 189 SelectionDAGISel::runOnMachineFunction(MF); 190 return true; 191 } 192 193 void emitFunctionEntryCode() override; 194 195 bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override; 196 197 void PreprocessISelDAG() override; 198 void PostprocessISelDAG() override; 199 200 // Include the pieces autogenerated from the target description. 
201 #include "X86GenDAGISel.inc" 202 203 private: 204 void Select(SDNode *N) override; 205 206 bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM); 207 bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM); 208 bool matchWrapper(SDValue N, X86ISelAddressMode &AM); 209 bool matchAddress(SDValue N, X86ISelAddressMode &AM); 210 bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM); 211 bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth); 212 bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, 213 unsigned Depth); 214 bool matchAddressBase(SDValue N, X86ISelAddressMode &AM); 215 bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base, 216 SDValue &Scale, SDValue &Index, SDValue &Disp, 217 SDValue &Segment); 218 bool selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, SDValue IndexOp, 219 SDValue ScaleOp, SDValue &Base, SDValue &Scale, 220 SDValue &Index, SDValue &Disp, SDValue &Segment); 221 bool selectMOV64Imm32(SDValue N, SDValue &Imm); 222 bool selectLEAAddr(SDValue N, SDValue &Base, 223 SDValue &Scale, SDValue &Index, SDValue &Disp, 224 SDValue &Segment); 225 bool selectLEA64_32Addr(SDValue N, SDValue &Base, 226 SDValue &Scale, SDValue &Index, SDValue &Disp, 227 SDValue &Segment); 228 bool selectTLSADDRAddr(SDValue N, SDValue &Base, 229 SDValue &Scale, SDValue &Index, SDValue &Disp, 230 SDValue &Segment); 231 bool selectRelocImm(SDValue N, SDValue &Op); 232 233 bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N, 234 SDValue &Base, SDValue &Scale, 235 SDValue &Index, SDValue &Disp, 236 SDValue &Segment); 237 238 // Convenience method where P is also root. 
239 bool tryFoldLoad(SDNode *P, SDValue N, 240 SDValue &Base, SDValue &Scale, 241 SDValue &Index, SDValue &Disp, 242 SDValue &Segment) { 243 return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment); 244 } 245 246 bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N, 247 SDValue &Base, SDValue &Scale, 248 SDValue &Index, SDValue &Disp, 249 SDValue &Segment); 250 251 bool isProfitableToFormMaskedOp(SDNode *N) const; 252 253 /// Implement addressing mode selection for inline asm expressions. 254 bool SelectInlineAsmMemoryOperand(const SDValue &Op, 255 unsigned ConstraintID, 256 std::vector<SDValue> &OutOps) override; 257 258 void emitSpecialCodeForMain(); 259 260 inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL, 261 MVT VT, SDValue &Base, SDValue &Scale, 262 SDValue &Index, SDValue &Disp, 263 SDValue &Segment) { 264 if (AM.BaseType == X86ISelAddressMode::FrameIndexBase) 265 Base = CurDAG->getTargetFrameIndex( 266 AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout())); 267 else if (AM.Base_Reg.getNode()) 268 Base = AM.Base_Reg; 269 else 270 Base = CurDAG->getRegister(0, VT); 271 272 Scale = getI8Imm(AM.Scale, DL); 273 274 // Negate the index if needed. 275 if (AM.NegateIndex) { 276 unsigned NegOpc = VT == MVT::i64 ? X86::NEG64r : X86::NEG32r; 277 SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32, 278 AM.IndexReg), 0); 279 AM.IndexReg = Neg; 280 } 281 282 if (AM.IndexReg.getNode()) 283 Index = AM.IndexReg; 284 else 285 Index = CurDAG->getRegister(0, VT); 286 287 // These are 32-bit even in 64-bit mode since RIP-relative offset 288 // is 32-bit. 
289 if (AM.GV) 290 Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(), 291 MVT::i32, AM.Disp, 292 AM.SymbolFlags); 293 else if (AM.CP) 294 Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, AM.Alignment, 295 AM.Disp, AM.SymbolFlags); 296 else if (AM.ES) { 297 assert(!AM.Disp && "Non-zero displacement is ignored with ES."); 298 Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags); 299 } else if (AM.MCSym) { 300 assert(!AM.Disp && "Non-zero displacement is ignored with MCSym."); 301 assert(AM.SymbolFlags == 0 && "oo"); 302 Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32); 303 } else if (AM.JT != -1) { 304 assert(!AM.Disp && "Non-zero displacement is ignored with JT."); 305 Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags); 306 } else if (AM.BlockAddr) 307 Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp, 308 AM.SymbolFlags); 309 else 310 Disp = CurDAG->getTargetConstant(AM.Disp, DL, MVT::i32); 311 312 if (AM.Segment.getNode()) 313 Segment = AM.Segment; 314 else 315 Segment = CurDAG->getRegister(0, MVT::i16); 316 } 317 318 // Utility function to determine whether we should avoid selecting 319 // immediate forms of instructions for better code size or not. 320 // At a high level, we'd like to avoid such instructions when 321 // we have similar constants used within the same basic block 322 // that can be kept in a register. 323 // 324 bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const { 325 uint32_t UseCount = 0; 326 327 // Do not want to hoist if we're not optimizing for size. 328 // TODO: We'd like to remove this restriction. 329 // See the comment in X86InstrInfo.td for more info. 330 if (!CurDAG->shouldOptForSize()) 331 return false; 332 333 // Walk all the users of the immediate. 334 for (SDNode::use_iterator UI = N->use_begin(), 335 UE = N->use_end(); (UI != UE) && (UseCount < 2); ++UI) { 336 337 SDNode *User = *UI; 338 339 // This user is already selected. 
Count it as a legitimate use and 340 // move on. 341 if (User->isMachineOpcode()) { 342 UseCount++; 343 continue; 344 } 345 346 // We want to count stores of immediates as real uses. 347 if (User->getOpcode() == ISD::STORE && 348 User->getOperand(1).getNode() == N) { 349 UseCount++; 350 continue; 351 } 352 353 // We don't currently match users that have > 2 operands (except 354 // for stores, which are handled above) 355 // Those instruction won't match in ISEL, for now, and would 356 // be counted incorrectly. 357 // This may change in the future as we add additional instruction 358 // types. 359 if (User->getNumOperands() != 2) 360 continue; 361 362 // If this is a sign-extended 8-bit integer immediate used in an ALU 363 // instruction, there is probably an opcode encoding to save space. 364 auto *C = dyn_cast<ConstantSDNode>(N); 365 if (C && isInt<8>(C->getSExtValue())) 366 continue; 367 368 // Immediates that are used for offsets as part of stack 369 // manipulation should be left alone. These are typically 370 // used to indicate SP offsets for argument passing and 371 // will get pulled into stores/pushes (implicitly). 372 if (User->getOpcode() == X86ISD::ADD || 373 User->getOpcode() == ISD::ADD || 374 User->getOpcode() == X86ISD::SUB || 375 User->getOpcode() == ISD::SUB) { 376 377 // Find the other operand of the add/sub. 378 SDValue OtherOp = User->getOperand(0); 379 if (OtherOp.getNode() == N) 380 OtherOp = User->getOperand(1); 381 382 // Don't count if the other operand is SP. 383 RegisterSDNode *RegNode; 384 if (OtherOp->getOpcode() == ISD::CopyFromReg && 385 (RegNode = dyn_cast_or_null<RegisterSDNode>( 386 OtherOp->getOperand(1).getNode()))) 387 if ((RegNode->getReg() == X86::ESP) || 388 (RegNode->getReg() == X86::RSP)) 389 continue; 390 } 391 392 // ... otherwise, count this and move on. 393 UseCount++; 394 } 395 396 // If we have more than 1 use, then recommend for hoisting. 
397 return (UseCount > 1); 398 } 399 400 /// Return a target constant with the specified value of type i8. 401 inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) { 402 return CurDAG->getTargetConstant(Imm, DL, MVT::i8); 403 } 404 405 /// Return a target constant with the specified value, of type i32. 406 inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) { 407 return CurDAG->getTargetConstant(Imm, DL, MVT::i32); 408 } 409 410 /// Return a target constant with the specified value, of type i64. 411 inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) { 412 return CurDAG->getTargetConstant(Imm, DL, MVT::i64); 413 } 414 415 SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth, 416 const SDLoc &DL) { 417 assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width"); 418 uint64_t Index = N->getConstantOperandVal(1); 419 MVT VecVT = N->getOperand(0).getSimpleValueType(); 420 return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL); 421 } 422 423 SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth, 424 const SDLoc &DL) { 425 assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width"); 426 uint64_t Index = N->getConstantOperandVal(2); 427 MVT VecVT = N->getSimpleValueType(0); 428 return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL); 429 } 430 431 // Helper to detect unneeded and instructions on shift amounts. Called 432 // from PatFrags in tablegen. 433 bool isUnneededShiftMask(SDNode *N, unsigned Width) const { 434 assert(N->getOpcode() == ISD::AND && "Unexpected opcode"); 435 const APInt &Val = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue(); 436 437 if (Val.countTrailingOnes() >= Width) 438 return true; 439 440 APInt Mask = Val | CurDAG->computeKnownBits(N->getOperand(0)).Zero; 441 return Mask.countTrailingOnes() >= Width; 442 } 443 444 /// Return an SDNode that returns the value of the global base register. 
445 /// Output instructions required to initialize the global base register, 446 /// if necessary. 447 SDNode *getGlobalBaseReg(); 448 449 /// Return a reference to the TargetMachine, casted to the target-specific 450 /// type. 451 const X86TargetMachine &getTargetMachine() const { 452 return static_cast<const X86TargetMachine &>(TM); 453 } 454 455 /// Return a reference to the TargetInstrInfo, casted to the target-specific 456 /// type. 457 const X86InstrInfo *getInstrInfo() const { 458 return Subtarget->getInstrInfo(); 459 } 460 461 /// Address-mode matching performs shift-of-and to and-of-shift 462 /// reassociation in order to expose more scaled addressing 463 /// opportunities. 464 bool ComplexPatternFuncMutatesDAG() const override { 465 return true; 466 } 467 468 bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const; 469 470 // Indicates we should prefer to use a non-temporal load for this load. 471 bool useNonTemporalLoad(LoadSDNode *N) const { 472 if (!N->isNonTemporal()) 473 return false; 474 475 unsigned StoreSize = N->getMemoryVT().getStoreSize(); 476 477 if (N->getAlignment() < StoreSize) 478 return false; 479 480 switch (StoreSize) { 481 default: llvm_unreachable("Unsupported store size"); 482 case 4: 483 case 8: 484 return false; 485 case 16: 486 return Subtarget->hasSSE41(); 487 case 32: 488 return Subtarget->hasAVX2(); 489 case 64: 490 return Subtarget->hasAVX512(); 491 } 492 } 493 494 bool foldLoadStoreIntoMemOperand(SDNode *Node); 495 MachineSDNode *matchBEXTRFromAndImm(SDNode *Node); 496 bool matchBitExtract(SDNode *Node); 497 bool shrinkAndImmediate(SDNode *N); 498 bool isMaskZeroExtended(SDNode *N) const; 499 bool tryShiftAmountMod(SDNode *N); 500 bool tryShrinkShlLogicImm(SDNode *N); 501 bool tryVPTERNLOG(SDNode *N); 502 bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask); 503 bool tryMatchBitSelect(SDNode *N); 504 505 MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad, 506 const SDLoc &dl, MVT VT, SDNode 
*Node); 507 MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad, 508 const SDLoc &dl, MVT VT, SDNode *Node, 509 SDValue &InFlag); 510 511 bool tryOptimizeRem8Extend(SDNode *N); 512 513 bool onlyUsesZeroFlag(SDValue Flags) const; 514 bool hasNoSignFlagUses(SDValue Flags) const; 515 bool hasNoCarryFlagUses(SDValue Flags) const; 516 }; 517 } 518 519 520 // Returns true if this masked compare can be implemented legally with this 521 // type. 522 static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) { 523 unsigned Opcode = N->getOpcode(); 524 if (Opcode == X86ISD::CMPM || Opcode == X86ISD::STRICT_CMPM || 525 Opcode == ISD::SETCC || Opcode == X86ISD::CMPM_SAE || 526 Opcode == X86ISD::VFPCLASS) { 527 // We can get 256-bit 8 element types here without VLX being enabled. When 528 // this happens we will use 512-bit operations and the mask will not be 529 // zero extended. 530 EVT OpVT = N->getOperand(0).getValueType(); 531 // The first operand of X86ISD::STRICT_CMPM is chain, so we need to get the 532 // second operand. 533 if (Opcode == X86ISD::STRICT_CMPM) 534 OpVT = N->getOperand(1).getValueType(); 535 if (OpVT.is256BitVector() || OpVT.is128BitVector()) 536 return Subtarget->hasVLX(); 537 538 return true; 539 } 540 // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check. 541 if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM || 542 Opcode == X86ISD::FSETCCM_SAE) 543 return true; 544 545 return false; 546 } 547 548 // Returns true if we can assume the writer of the mask has zero extended it 549 // for us. 550 bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const { 551 // If this is an AND, check if we have a compare on either side. As long as 552 // one side guarantees the mask is zero extended, the AND will preserve those 553 // zeros. 
554 if (N->getOpcode() == ISD::AND) 555 return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) || 556 isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget); 557 558 return isLegalMaskCompare(N, Subtarget); 559 } 560 561 bool 562 X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const { 563 if (OptLevel == CodeGenOpt::None) return false; 564 565 if (!N.hasOneUse()) 566 return false; 567 568 if (N.getOpcode() != ISD::LOAD) 569 return true; 570 571 // Don't fold non-temporal loads if we have an instruction for them. 572 if (useNonTemporalLoad(cast<LoadSDNode>(N))) 573 return false; 574 575 // If N is a load, do additional profitability checks. 576 if (U == Root) { 577 switch (U->getOpcode()) { 578 default: break; 579 case X86ISD::ADD: 580 case X86ISD::ADC: 581 case X86ISD::SUB: 582 case X86ISD::SBB: 583 case X86ISD::AND: 584 case X86ISD::XOR: 585 case X86ISD::OR: 586 case ISD::ADD: 587 case ISD::ADDCARRY: 588 case ISD::AND: 589 case ISD::OR: 590 case ISD::XOR: { 591 SDValue Op1 = U->getOperand(1); 592 593 // If the other operand is a 8-bit immediate we should fold the immediate 594 // instead. This reduces code size. 595 // e.g. 596 // movl 4(%esp), %eax 597 // addl $4, %eax 598 // vs. 599 // movl $4, %eax 600 // addl 4(%esp), %eax 601 // The former is 2 bytes shorter. In case where the increment is 1, then 602 // the saving can be 4 bytes (by using incl %eax). 603 if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Op1)) { 604 if (Imm->getAPIntValue().isSignedIntN(8)) 605 return false; 606 607 // If this is a 64-bit AND with an immediate that fits in 32-bits, 608 // prefer using the smaller and over folding the load. This is needed to 609 // make sure immediates created by shrinkAndImmediate are always folded. 610 // Ideally we would narrow the load during DAG combine and get the 611 // best of both worlds. 
612 if (U->getOpcode() == ISD::AND && 613 Imm->getAPIntValue().getBitWidth() == 64 && 614 Imm->getAPIntValue().isIntN(32)) 615 return false; 616 617 // If this really a zext_inreg that can be represented with a movzx 618 // instruction, prefer that. 619 // TODO: We could shrink the load and fold if it is non-volatile. 620 if (U->getOpcode() == ISD::AND && 621 (Imm->getAPIntValue() == UINT8_MAX || 622 Imm->getAPIntValue() == UINT16_MAX || 623 Imm->getAPIntValue() == UINT32_MAX)) 624 return false; 625 626 // ADD/SUB with can negate the immediate and use the opposite operation 627 // to fit 128 into a sign extended 8 bit immediate. 628 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) && 629 (-Imm->getAPIntValue()).isSignedIntN(8)) 630 return false; 631 632 if ((U->getOpcode() == X86ISD::ADD || U->getOpcode() == X86ISD::SUB) && 633 (-Imm->getAPIntValue()).isSignedIntN(8) && 634 hasNoCarryFlagUses(SDValue(U, 1))) 635 return false; 636 } 637 638 // If the other operand is a TLS address, we should fold it instead. 639 // This produces 640 // movl %gs:0, %eax 641 // leal i@NTPOFF(%eax), %eax 642 // instead of 643 // movl $i@NTPOFF, %eax 644 // addl %gs:0, %eax 645 // if the block also has an access to a second TLS address this will save 646 // a load. 647 // FIXME: This is probably also true for non-TLS addresses. 648 if (Op1.getOpcode() == X86ISD::Wrapper) { 649 SDValue Val = Op1.getOperand(0); 650 if (Val.getOpcode() == ISD::TargetGlobalTLSAddress) 651 return false; 652 } 653 654 // Don't fold load if this matches the BTS/BTR/BTC patterns. 
655 // BTS: (or X, (shl 1, n)) 656 // BTR: (and X, (rotl -2, n)) 657 // BTC: (xor X, (shl 1, n)) 658 if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) { 659 if (U->getOperand(0).getOpcode() == ISD::SHL && 660 isOneConstant(U->getOperand(0).getOperand(0))) 661 return false; 662 663 if (U->getOperand(1).getOpcode() == ISD::SHL && 664 isOneConstant(U->getOperand(1).getOperand(0))) 665 return false; 666 } 667 if (U->getOpcode() == ISD::AND) { 668 SDValue U0 = U->getOperand(0); 669 SDValue U1 = U->getOperand(1); 670 if (U0.getOpcode() == ISD::ROTL) { 671 auto *C = dyn_cast<ConstantSDNode>(U0.getOperand(0)); 672 if (C && C->getSExtValue() == -2) 673 return false; 674 } 675 676 if (U1.getOpcode() == ISD::ROTL) { 677 auto *C = dyn_cast<ConstantSDNode>(U1.getOperand(0)); 678 if (C && C->getSExtValue() == -2) 679 return false; 680 } 681 } 682 683 break; 684 } 685 case ISD::SHL: 686 case ISD::SRA: 687 case ISD::SRL: 688 // Don't fold a load into a shift by immediate. The BMI2 instructions 689 // support folding a load, but not an immediate. The legacy instructions 690 // support folding an immediate, but can't fold a load. Folding an 691 // immediate is preferable to folding a load. 692 if (isa<ConstantSDNode>(U->getOperand(1))) 693 return false; 694 695 break; 696 } 697 } 698 699 // Prevent folding a load if this can implemented with an insert_subreg or 700 // a move that implicitly zeroes. 701 if (Root->getOpcode() == ISD::INSERT_SUBVECTOR && 702 isNullConstant(Root->getOperand(2)) && 703 (Root->getOperand(0).isUndef() || 704 ISD::isBuildVectorAllZeros(Root->getOperand(0).getNode()))) 705 return false; 706 707 return true; 708 } 709 710 // Indicates it is profitable to form an AVX512 masked operation. Returning 711 // false will favor a masked register-register masked move or vblendm and the 712 // operation will be selected separately. 
713 bool X86DAGToDAGISel::isProfitableToFormMaskedOp(SDNode *N) const { 714 assert( 715 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::SELECTS) && 716 "Unexpected opcode!"); 717 718 // If the operation has additional users, the operation will be duplicated. 719 // Check the use count to prevent that. 720 // FIXME: Are there cheap opcodes we might want to duplicate? 721 return N->getOperand(1).hasOneUse(); 722 } 723 724 /// Replace the original chain operand of the call with 725 /// load's chain operand and move load below the call's chain operand. 726 static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load, 727 SDValue Call, SDValue OrigChain) { 728 SmallVector<SDValue, 8> Ops; 729 SDValue Chain = OrigChain.getOperand(0); 730 if (Chain.getNode() == Load.getNode()) 731 Ops.push_back(Load.getOperand(0)); 732 else { 733 assert(Chain.getOpcode() == ISD::TokenFactor && 734 "Unexpected chain operand"); 735 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) 736 if (Chain.getOperand(i).getNode() == Load.getNode()) 737 Ops.push_back(Load.getOperand(0)); 738 else 739 Ops.push_back(Chain.getOperand(i)); 740 SDValue NewChain = 741 CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops); 742 Ops.clear(); 743 Ops.push_back(NewChain); 744 } 745 Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end()); 746 CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops); 747 CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0), 748 Load.getOperand(1), Load.getOperand(2)); 749 750 Ops.clear(); 751 Ops.push_back(SDValue(Load.getNode(), 1)); 752 Ops.append(Call->op_begin() + 1, Call->op_end()); 753 CurDAG->UpdateNodeOperands(Call.getNode(), Ops); 754 } 755 756 /// Return true if call address is a load and it can be 757 /// moved below CALLSEQ_START and the chains leading up to the call. 758 /// Return the CALLSEQ_START by reference as a second output. 
759 /// In the case of a tail call, there isn't a callseq node between the call 760 /// chain and the load. 761 static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) { 762 // The transformation is somewhat dangerous if the call's chain was glued to 763 // the call. After MoveBelowOrigChain the load is moved between the call and 764 // the chain, this can create a cycle if the load is not folded. So it is 765 // *really* important that we are sure the load will be folded. 766 if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse()) 767 return false; 768 LoadSDNode *LD = dyn_cast<LoadSDNode>(Callee.getNode()); 769 if (!LD || 770 !LD->isSimple() || 771 LD->getAddressingMode() != ISD::UNINDEXED || 772 LD->getExtensionType() != ISD::NON_EXTLOAD) 773 return false; 774 775 // Now let's find the callseq_start. 776 while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) { 777 if (!Chain.hasOneUse()) 778 return false; 779 Chain = Chain.getOperand(0); 780 } 781 782 if (!Chain.getNumOperands()) 783 return false; 784 // Since we are not checking for AA here, conservatively abort if the chain 785 // writes to memory. It's not safe to move the callee (a load) across a store. 786 if (isa<MemSDNode>(Chain.getNode()) && 787 cast<MemSDNode>(Chain.getNode())->writeMem()) 788 return false; 789 if (Chain.getOperand(0).getNode() == Callee.getNode()) 790 return true; 791 if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor && 792 Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) && 793 Callee.getValue(1).hasOneUse()) 794 return true; 795 return false; 796 } 797 798 void X86DAGToDAGISel::PreprocessISelDAG() { 799 bool MadeChange = false; 800 for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), 801 E = CurDAG->allnodes_end(); I != E; ) { 802 SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues. 
803 804 // If this is a target specific AND node with no flag usages, turn it back 805 // into ISD::AND to enable test instruction matching. 806 if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) { 807 SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0), 808 N->getOperand(0), N->getOperand(1)); 809 --I; 810 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); 811 ++I; 812 MadeChange = true; 813 continue; 814 } 815 816 /// Convert vector increment or decrement to sub/add with an all-ones 817 /// constant: 818 /// add X, <1, 1...> --> sub X, <-1, -1...> 819 /// sub X, <1, 1...> --> add X, <-1, -1...> 820 /// The all-ones vector constant can be materialized using a pcmpeq 821 /// instruction that is commonly recognized as an idiom (has no register 822 /// dependency), so that's better/smaller than loading a splat 1 constant. 823 if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) && 824 N->getSimpleValueType(0).isVector()) { 825 826 APInt SplatVal; 827 if (X86::isConstantSplat(N->getOperand(1), SplatVal) && 828 SplatVal.isOneValue()) { 829 SDLoc DL(N); 830 831 MVT VT = N->getSimpleValueType(0); 832 unsigned NumElts = VT.getSizeInBits() / 32; 833 SDValue AllOnes = 834 CurDAG->getAllOnesConstant(DL, MVT::getVectorVT(MVT::i32, NumElts)); 835 AllOnes = CurDAG->getBitcast(VT, AllOnes); 836 837 unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD; 838 SDValue Res = 839 CurDAG->getNode(NewOpcode, DL, VT, N->getOperand(0), AllOnes); 840 --I; 841 CurDAG->ReplaceAllUsesWith(N, Res.getNode()); 842 ++I; 843 MadeChange = true; 844 continue; 845 } 846 } 847 848 switch (N->getOpcode()) { 849 case X86ISD::VBROADCAST: { 850 MVT VT = N->getSimpleValueType(0); 851 // Emulate v32i16/v64i8 broadcast without BWI. 852 if (!Subtarget->hasBWI() && (VT == MVT::v32i16 || VT == MVT::v64i8)) { 853 MVT NarrowVT = VT == MVT::v32i16 ? 
MVT::v16i16 : MVT::v32i8; 854 SDLoc dl(N); 855 SDValue NarrowBCast = 856 CurDAG->getNode(X86ISD::VBROADCAST, dl, NarrowVT, N->getOperand(0)); 857 SDValue Res = 858 CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT), 859 NarrowBCast, CurDAG->getIntPtrConstant(0, dl)); 860 unsigned Index = VT == MVT::v32i16 ? 16 : 32; 861 Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast, 862 CurDAG->getIntPtrConstant(Index, dl)); 863 864 --I; 865 CurDAG->ReplaceAllUsesWith(N, Res.getNode()); 866 ++I; 867 MadeChange = true; 868 continue; 869 } 870 871 break; 872 } 873 case X86ISD::VBROADCAST_LOAD: { 874 MVT VT = N->getSimpleValueType(0); 875 // Emulate v32i16/v64i8 broadcast without BWI. 876 if (!Subtarget->hasBWI() && (VT == MVT::v32i16 || VT == MVT::v64i8)) { 877 MVT NarrowVT = VT == MVT::v32i16 ? MVT::v16i16 : MVT::v32i8; 878 auto *MemNode = cast<MemSDNode>(N); 879 SDLoc dl(N); 880 SDVTList VTs = CurDAG->getVTList(NarrowVT, MVT::Other); 881 SDValue Ops[] = {MemNode->getChain(), MemNode->getBasePtr()}; 882 SDValue NarrowBCast = CurDAG->getMemIntrinsicNode( 883 X86ISD::VBROADCAST_LOAD, dl, VTs, Ops, MemNode->getMemoryVT(), 884 MemNode->getMemOperand()); 885 SDValue Res = 886 CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT), 887 NarrowBCast, CurDAG->getIntPtrConstant(0, dl)); 888 unsigned Index = VT == MVT::v32i16 ? 16 : 32; 889 Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast, 890 CurDAG->getIntPtrConstant(Index, dl)); 891 892 --I; 893 SDValue To[] = {Res, NarrowBCast.getValue(1)}; 894 CurDAG->ReplaceAllUsesWith(N, To); 895 ++I; 896 MadeChange = true; 897 continue; 898 } 899 900 break; 901 } 902 case ISD::VSELECT: { 903 // Replace VSELECT with non-mask conditions with with BLENDV. 
904 if (N->getOperand(0).getValueType().getVectorElementType() == MVT::i1) 905 break; 906 907 assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!"); 908 SDValue Blendv = 909 CurDAG->getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), 910 N->getOperand(0), N->getOperand(1), N->getOperand(2)); 911 --I; 912 CurDAG->ReplaceAllUsesWith(N, Blendv.getNode()); 913 ++I; 914 MadeChange = true; 915 continue; 916 } 917 case ISD::FP_ROUND: 918 case ISD::STRICT_FP_ROUND: 919 case ISD::FP_TO_SINT: 920 case ISD::FP_TO_UINT: 921 case ISD::STRICT_FP_TO_SINT: 922 case ISD::STRICT_FP_TO_UINT: { 923 // Replace vector fp_to_s/uint with their X86 specific equivalent so we 924 // don't need 2 sets of patterns. 925 if (!N->getSimpleValueType(0).isVector()) 926 break; 927 928 unsigned NewOpc; 929 switch (N->getOpcode()) { 930 default: llvm_unreachable("Unexpected opcode!"); 931 case ISD::FP_ROUND: NewOpc = X86ISD::VFPROUND; break; 932 case ISD::STRICT_FP_ROUND: NewOpc = X86ISD::STRICT_VFPROUND; break; 933 case ISD::STRICT_FP_TO_SINT: NewOpc = X86ISD::STRICT_CVTTP2SI; break; 934 case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break; 935 case ISD::STRICT_FP_TO_UINT: NewOpc = X86ISD::STRICT_CVTTP2UI; break; 936 case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break; 937 } 938 SDValue Res; 939 if (N->isStrictFPOpcode()) 940 Res = 941 CurDAG->getNode(NewOpc, SDLoc(N), {N->getValueType(0), MVT::Other}, 942 {N->getOperand(0), N->getOperand(1)}); 943 else 944 Res = 945 CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0), 946 N->getOperand(0)); 947 --I; 948 CurDAG->ReplaceAllUsesWith(N, Res.getNode()); 949 ++I; 950 MadeChange = true; 951 continue; 952 } 953 case ISD::SHL: 954 case ISD::SRA: 955 case ISD::SRL: { 956 // Replace vector shifts with their X86 specific equivalent so we don't 957 // need 2 sets of patterns. 
958 if (!N->getValueType(0).isVector()) 959 break; 960 961 unsigned NewOpc; 962 switch (N->getOpcode()) { 963 default: llvm_unreachable("Unexpected opcode!"); 964 case ISD::SHL: NewOpc = X86ISD::VSHLV; break; 965 case ISD::SRA: NewOpc = X86ISD::VSRAV; break; 966 case ISD::SRL: NewOpc = X86ISD::VSRLV; break; 967 } 968 SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0), 969 N->getOperand(0), N->getOperand(1)); 970 --I; 971 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); 972 ++I; 973 MadeChange = true; 974 continue; 975 } 976 case ISD::ANY_EXTEND: 977 case ISD::ANY_EXTEND_VECTOR_INREG: { 978 // Replace vector any extend with the zero extend equivalents so we don't 979 // need 2 sets of patterns. Ignore vXi1 extensions. 980 if (!N->getValueType(0).isVector()) 981 break; 982 983 unsigned NewOpc; 984 if (N->getOperand(0).getScalarValueSizeInBits() == 1) { 985 assert(N->getOpcode() == ISD::ANY_EXTEND && 986 "Unexpected opcode for mask vector!"); 987 NewOpc = ISD::SIGN_EXTEND; 988 } else { 989 NewOpc = N->getOpcode() == ISD::ANY_EXTEND 990 ? ISD::ZERO_EXTEND 991 : ISD::ZERO_EXTEND_VECTOR_INREG; 992 } 993 994 SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0), 995 N->getOperand(0)); 996 --I; 997 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); 998 ++I; 999 MadeChange = true; 1000 continue; 1001 } 1002 case ISD::FCEIL: 1003 case ISD::STRICT_FCEIL: 1004 case ISD::FFLOOR: 1005 case ISD::STRICT_FFLOOR: 1006 case ISD::FTRUNC: 1007 case ISD::STRICT_FTRUNC: 1008 case ISD::FNEARBYINT: 1009 case ISD::STRICT_FNEARBYINT: 1010 case ISD::FRINT: 1011 case ISD::STRICT_FRINT: { 1012 // Replace fp rounding with their X86 specific equivalent so we don't 1013 // need 2 sets of patterns. 
1014 unsigned Imm; 1015 switch (N->getOpcode()) { 1016 default: llvm_unreachable("Unexpected opcode!"); 1017 case ISD::STRICT_FCEIL: 1018 case ISD::FCEIL: Imm = 0xA; break; 1019 case ISD::STRICT_FFLOOR: 1020 case ISD::FFLOOR: Imm = 0x9; break; 1021 case ISD::STRICT_FTRUNC: 1022 case ISD::FTRUNC: Imm = 0xB; break; 1023 case ISD::STRICT_FNEARBYINT: 1024 case ISD::FNEARBYINT: Imm = 0xC; break; 1025 case ISD::STRICT_FRINT: 1026 case ISD::FRINT: Imm = 0x4; break; 1027 } 1028 SDLoc dl(N); 1029 bool IsStrict = N->isStrictFPOpcode(); 1030 SDValue Res; 1031 if (IsStrict) 1032 Res = CurDAG->getNode(X86ISD::STRICT_VRNDSCALE, dl, 1033 {N->getValueType(0), MVT::Other}, 1034 {N->getOperand(0), N->getOperand(1), 1035 CurDAG->getTargetConstant(Imm, dl, MVT::i8)}); 1036 else 1037 Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, N->getValueType(0), 1038 N->getOperand(0), 1039 CurDAG->getTargetConstant(Imm, dl, MVT::i8)); 1040 --I; 1041 CurDAG->ReplaceAllUsesWith(N, Res.getNode()); 1042 ++I; 1043 MadeChange = true; 1044 continue; 1045 } 1046 case X86ISD::FANDN: 1047 case X86ISD::FAND: 1048 case X86ISD::FOR: 1049 case X86ISD::FXOR: { 1050 // Widen scalar fp logic ops to vector to reduce isel patterns. 1051 // FIXME: Can we do this during lowering/combine. 1052 MVT VT = N->getSimpleValueType(0); 1053 if (VT.isVector() || VT == MVT::f128) 1054 break; 1055 1056 MVT VecVT = VT == MVT::f64 ? 
MVT::v2f64 : MVT::v4f32; 1057 SDLoc dl(N); 1058 SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, 1059 N->getOperand(0)); 1060 SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, 1061 N->getOperand(1)); 1062 1063 SDValue Res; 1064 if (Subtarget->hasSSE2()) { 1065 EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger(); 1066 Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0); 1067 Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1); 1068 unsigned Opc; 1069 switch (N->getOpcode()) { 1070 default: llvm_unreachable("Unexpected opcode!"); 1071 case X86ISD::FANDN: Opc = X86ISD::ANDNP; break; 1072 case X86ISD::FAND: Opc = ISD::AND; break; 1073 case X86ISD::FOR: Opc = ISD::OR; break; 1074 case X86ISD::FXOR: Opc = ISD::XOR; break; 1075 } 1076 Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1); 1077 Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res); 1078 } else { 1079 Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1); 1080 } 1081 Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, 1082 CurDAG->getIntPtrConstant(0, dl)); 1083 --I; 1084 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); 1085 ++I; 1086 MadeChange = true; 1087 continue; 1088 } 1089 } 1090 1091 if (OptLevel != CodeGenOpt::None && 1092 // Only do this when the target can fold the load into the call or 1093 // jmp. 1094 !Subtarget->useIndirectThunkCalls() && 1095 ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) || 1096 (N->getOpcode() == X86ISD::TC_RETURN && 1097 (Subtarget->is64Bit() || 1098 !getTargetMachine().isPositionIndependent())))) { 1099 /// Also try moving call address load from outside callseq_start to just 1100 /// before the call to allow it to be folded. 
1101 /// 1102 /// [Load chain] 1103 /// ^ 1104 /// | 1105 /// [Load] 1106 /// ^ ^ 1107 /// | | 1108 /// / \-- 1109 /// / | 1110 ///[CALLSEQ_START] | 1111 /// ^ | 1112 /// | | 1113 /// [LOAD/C2Reg] | 1114 /// | | 1115 /// \ / 1116 /// \ / 1117 /// [CALL] 1118 bool HasCallSeq = N->getOpcode() == X86ISD::CALL; 1119 SDValue Chain = N->getOperand(0); 1120 SDValue Load = N->getOperand(1); 1121 if (!isCalleeLoad(Load, Chain, HasCallSeq)) 1122 continue; 1123 moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain); 1124 ++NumLoadMoved; 1125 MadeChange = true; 1126 continue; 1127 } 1128 1129 // Lower fpround and fpextend nodes that target the FP stack to be store and 1130 // load to the stack. This is a gross hack. We would like to simply mark 1131 // these as being illegal, but when we do that, legalize produces these when 1132 // it expands calls, then expands these in the same legalize pass. We would 1133 // like dag combine to be able to hack on these between the call expansion 1134 // and the node legalization. As such this pass basically does "really 1135 // late" legalization of these inline with the X86 isel pass. 1136 // FIXME: This should only happen when not compiled with -O0. 1137 switch (N->getOpcode()) { 1138 default: continue; 1139 case ISD::FP_ROUND: 1140 case ISD::FP_EXTEND: 1141 { 1142 MVT SrcVT = N->getOperand(0).getSimpleValueType(); 1143 MVT DstVT = N->getSimpleValueType(0); 1144 1145 // If any of the sources are vectors, no fp stack involved. 1146 if (SrcVT.isVector() || DstVT.isVector()) 1147 continue; 1148 1149 // If the source and destination are SSE registers, then this is a legal 1150 // conversion that should not be lowered. 
1151 const X86TargetLowering *X86Lowering = 1152 static_cast<const X86TargetLowering *>(TLI); 1153 bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT); 1154 bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT); 1155 if (SrcIsSSE && DstIsSSE) 1156 continue; 1157 1158 if (!SrcIsSSE && !DstIsSSE) { 1159 // If this is an FPStack extension, it is a noop. 1160 if (N->getOpcode() == ISD::FP_EXTEND) 1161 continue; 1162 // If this is a value-preserving FPStack truncation, it is a noop. 1163 if (N->getConstantOperandVal(1)) 1164 continue; 1165 } 1166 1167 // Here we could have an FP stack truncation or an FPStack <-> SSE convert. 1168 // FPStack has extload and truncstore. SSE can fold direct loads into other 1169 // operations. Based on this, decide what we want to do. 1170 MVT MemVT = (N->getOpcode() == ISD::FP_ROUND) ? DstVT : SrcVT; 1171 SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT); 1172 int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex(); 1173 MachinePointerInfo MPI = 1174 MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI); 1175 SDLoc dl(N); 1176 1177 // FIXME: optimize the case where the src/dest is a load or store? 1178 1179 SDValue Store = CurDAG->getTruncStore( 1180 CurDAG->getEntryNode(), dl, N->getOperand(0), MemTmp, MPI, MemVT); 1181 SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, 1182 MemTmp, MPI, MemVT); 1183 1184 // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the 1185 // extload we created. This will cause general havok on the dag because 1186 // anything below the conversion could be folded into other existing nodes. 1187 // To avoid invalidating 'I', back it up to the convert node. 1188 --I; 1189 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result); 1190 break; 1191 } 1192 1193 //The sequence of events for lowering STRICT_FP versions of these nodes requires 1194 //dealing with the chain differently, as there is already a preexisting chain. 
1195 case ISD::STRICT_FP_ROUND: 1196 case ISD::STRICT_FP_EXTEND: 1197 { 1198 MVT SrcVT = N->getOperand(1).getSimpleValueType(); 1199 MVT DstVT = N->getSimpleValueType(0); 1200 1201 // If any of the sources are vectors, no fp stack involved. 1202 if (SrcVT.isVector() || DstVT.isVector()) 1203 continue; 1204 1205 // If the source and destination are SSE registers, then this is a legal 1206 // conversion that should not be lowered. 1207 const X86TargetLowering *X86Lowering = 1208 static_cast<const X86TargetLowering *>(TLI); 1209 bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT); 1210 bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT); 1211 if (SrcIsSSE && DstIsSSE) 1212 continue; 1213 1214 if (!SrcIsSSE && !DstIsSSE) { 1215 // If this is an FPStack extension, it is a noop. 1216 if (N->getOpcode() == ISD::STRICT_FP_EXTEND) 1217 continue; 1218 // If this is a value-preserving FPStack truncation, it is a noop. 1219 if (N->getConstantOperandVal(2)) 1220 continue; 1221 } 1222 1223 // Here we could have an FP stack truncation or an FPStack <-> SSE convert. 1224 // FPStack has extload and truncstore. SSE can fold direct loads into other 1225 // operations. Based on this, decide what we want to do. 1226 MVT MemVT = (N->getOpcode() == ISD::STRICT_FP_ROUND) ? DstVT : SrcVT; 1227 SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT); 1228 int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex(); 1229 MachinePointerInfo MPI = 1230 MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI); 1231 SDLoc dl(N); 1232 1233 // FIXME: optimize the case where the src/dest is a load or store? 1234 1235 //Since the operation is StrictFP, use the preexisting chain. 
1236 SDValue Store, Result; 1237 if (!SrcIsSSE) { 1238 SDVTList VTs = CurDAG->getVTList(MVT::Other); 1239 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), MemTmp}; 1240 Store = CurDAG->getMemIntrinsicNode(X86ISD::FST, dl, VTs, Ops, MemVT, 1241 MPI, /*Align*/ None, 1242 MachineMemOperand::MOStore); 1243 if (N->getFlags().hasNoFPExcept()) { 1244 SDNodeFlags Flags = Store->getFlags(); 1245 Flags.setNoFPExcept(true); 1246 Store->setFlags(Flags); 1247 } 1248 } else { 1249 assert(SrcVT == MemVT && "Unexpected VT!"); 1250 Store = CurDAG->getStore(N->getOperand(0), dl, N->getOperand(1), MemTmp, 1251 MPI); 1252 } 1253 1254 if (!DstIsSSE) { 1255 SDVTList VTs = CurDAG->getVTList(DstVT, MVT::Other); 1256 SDValue Ops[] = {Store, MemTmp}; 1257 Result = CurDAG->getMemIntrinsicNode( 1258 X86ISD::FLD, dl, VTs, Ops, MemVT, MPI, 1259 /*Align*/ None, MachineMemOperand::MOLoad); 1260 if (N->getFlags().hasNoFPExcept()) { 1261 SDNodeFlags Flags = Result->getFlags(); 1262 Flags.setNoFPExcept(true); 1263 Result->setFlags(Flags); 1264 } 1265 } else { 1266 assert(DstVT == MemVT && "Unexpected VT!"); 1267 Result = CurDAG->getLoad(DstVT, dl, Store, MemTmp, MPI); 1268 } 1269 1270 // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the 1271 // extload we created. This will cause general havok on the dag because 1272 // anything below the conversion could be folded into other existing nodes. 1273 // To avoid invalidating 'I', back it up to the convert node. 1274 --I; 1275 CurDAG->ReplaceAllUsesWith(N, Result.getNode()); 1276 break; 1277 } 1278 } 1279 1280 1281 // Now that we did that, the node is dead. Increment the iterator to the 1282 // next node to process, then delete N. 1283 ++I; 1284 MadeChange = true; 1285 } 1286 1287 // Remove any dead nodes that may have been left behind. 1288 if (MadeChange) 1289 CurDAG->RemoveDeadNodes(); 1290 } 1291 1292 // Look for a redundant movzx/movsx that can occur after an 8-bit divrem. 
1293 bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) { 1294 unsigned Opc = N->getMachineOpcode(); 1295 if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 && 1296 Opc != X86::MOVSX64rr8) 1297 return false; 1298 1299 SDValue N0 = N->getOperand(0); 1300 1301 // We need to be extracting the lower bit of an extend. 1302 if (!N0.isMachineOpcode() || 1303 N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG || 1304 N0.getConstantOperandVal(1) != X86::sub_8bit) 1305 return false; 1306 1307 // We're looking for either a movsx or movzx to match the original opcode. 1308 unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX 1309 : X86::MOVSX32rr8_NOREX; 1310 SDValue N00 = N0.getOperand(0); 1311 if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc) 1312 return false; 1313 1314 if (Opc == X86::MOVSX64rr8) { 1315 // If we had a sign extend from 8 to 64 bits. We still need to go from 32 1316 // to 64. 1317 MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N), 1318 MVT::i64, N00); 1319 ReplaceUses(N, Extend); 1320 } else { 1321 // Ok we can drop this extend and just use the original extend. 1322 ReplaceUses(N, N00.getNode()); 1323 } 1324 1325 return true; 1326 } 1327 1328 void X86DAGToDAGISel::PostprocessISelDAG() { 1329 // Skip peepholes at -O0. 1330 if (TM.getOptLevel() == CodeGenOpt::None) 1331 return; 1332 1333 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end(); 1334 1335 bool MadeChange = false; 1336 while (Position != CurDAG->allnodes_begin()) { 1337 SDNode *N = &*--Position; 1338 // Skip dead nodes and any non-machine opcodes. 1339 if (N->use_empty() || !N->isMachineOpcode()) 1340 continue; 1341 1342 if (tryOptimizeRem8Extend(N)) { 1343 MadeChange = true; 1344 continue; 1345 } 1346 1347 // Look for a TESTrr+ANDrr pattern where both operands of the test are 1348 // the same. Rewrite to remove the AND. 
1349 unsigned Opc = N->getMachineOpcode(); 1350 if ((Opc == X86::TEST8rr || Opc == X86::TEST16rr || 1351 Opc == X86::TEST32rr || Opc == X86::TEST64rr) && 1352 N->getOperand(0) == N->getOperand(1) && 1353 N->isOnlyUserOf(N->getOperand(0).getNode()) && 1354 N->getOperand(0).isMachineOpcode()) { 1355 SDValue And = N->getOperand(0); 1356 unsigned N0Opc = And.getMachineOpcode(); 1357 if (N0Opc == X86::AND8rr || N0Opc == X86::AND16rr || 1358 N0Opc == X86::AND32rr || N0Opc == X86::AND64rr) { 1359 MachineSDNode *Test = CurDAG->getMachineNode(Opc, SDLoc(N), 1360 MVT::i32, 1361 And.getOperand(0), 1362 And.getOperand(1)); 1363 ReplaceUses(N, Test); 1364 MadeChange = true; 1365 continue; 1366 } 1367 if (N0Opc == X86::AND8rm || N0Opc == X86::AND16rm || 1368 N0Opc == X86::AND32rm || N0Opc == X86::AND64rm) { 1369 unsigned NewOpc; 1370 switch (N0Opc) { 1371 case X86::AND8rm: NewOpc = X86::TEST8mr; break; 1372 case X86::AND16rm: NewOpc = X86::TEST16mr; break; 1373 case X86::AND32rm: NewOpc = X86::TEST32mr; break; 1374 case X86::AND64rm: NewOpc = X86::TEST64mr; break; 1375 } 1376 1377 // Need to swap the memory and register operand. 1378 SDValue Ops[] = { And.getOperand(1), 1379 And.getOperand(2), 1380 And.getOperand(3), 1381 And.getOperand(4), 1382 And.getOperand(5), 1383 And.getOperand(0), 1384 And.getOperand(6) /* Chain */ }; 1385 MachineSDNode *Test = CurDAG->getMachineNode(NewOpc, SDLoc(N), 1386 MVT::i32, MVT::Other, Ops); 1387 CurDAG->setNodeMemRefs( 1388 Test, cast<MachineSDNode>(And.getNode())->memoperands()); 1389 ReplaceUses(N, Test); 1390 MadeChange = true; 1391 continue; 1392 } 1393 } 1394 1395 // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is 1396 // used. We're doing this late so we can prefer to fold the AND into masked 1397 // comparisons. Doing that can be better for the live range of the mask 1398 // register. 
1399 if ((Opc == X86::KORTESTBrr || Opc == X86::KORTESTWrr || 1400 Opc == X86::KORTESTDrr || Opc == X86::KORTESTQrr) && 1401 N->getOperand(0) == N->getOperand(1) && 1402 N->isOnlyUserOf(N->getOperand(0).getNode()) && 1403 N->getOperand(0).isMachineOpcode() && 1404 onlyUsesZeroFlag(SDValue(N, 0))) { 1405 SDValue And = N->getOperand(0); 1406 unsigned N0Opc = And.getMachineOpcode(); 1407 // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other 1408 // KAND instructions and KTEST use the same ISA feature. 1409 if (N0Opc == X86::KANDBrr || 1410 (N0Opc == X86::KANDWrr && Subtarget->hasDQI()) || 1411 N0Opc == X86::KANDDrr || N0Opc == X86::KANDQrr) { 1412 unsigned NewOpc; 1413 switch (Opc) { 1414 default: llvm_unreachable("Unexpected opcode!"); 1415 case X86::KORTESTBrr: NewOpc = X86::KTESTBrr; break; 1416 case X86::KORTESTWrr: NewOpc = X86::KTESTWrr; break; 1417 case X86::KORTESTDrr: NewOpc = X86::KTESTDrr; break; 1418 case X86::KORTESTQrr: NewOpc = X86::KTESTQrr; break; 1419 } 1420 MachineSDNode *KTest = CurDAG->getMachineNode(NewOpc, SDLoc(N), 1421 MVT::i32, 1422 And.getOperand(0), 1423 And.getOperand(1)); 1424 ReplaceUses(N, KTest); 1425 MadeChange = true; 1426 continue; 1427 } 1428 } 1429 1430 // Attempt to remove vectors moves that were inserted to zero upper bits. 1431 if (Opc != TargetOpcode::SUBREG_TO_REG) 1432 continue; 1433 1434 unsigned SubRegIdx = N->getConstantOperandVal(2); 1435 if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm) 1436 continue; 1437 1438 SDValue Move = N->getOperand(1); 1439 if (!Move.isMachineOpcode()) 1440 continue; 1441 1442 // Make sure its one of the move opcodes we recognize. 
1443 switch (Move.getMachineOpcode()) { 1444 default: 1445 continue; 1446 case X86::VMOVAPDrr: case X86::VMOVUPDrr: 1447 case X86::VMOVAPSrr: case X86::VMOVUPSrr: 1448 case X86::VMOVDQArr: case X86::VMOVDQUrr: 1449 case X86::VMOVAPDYrr: case X86::VMOVUPDYrr: 1450 case X86::VMOVAPSYrr: case X86::VMOVUPSYrr: 1451 case X86::VMOVDQAYrr: case X86::VMOVDQUYrr: 1452 case X86::VMOVAPDZ128rr: case X86::VMOVUPDZ128rr: 1453 case X86::VMOVAPSZ128rr: case X86::VMOVUPSZ128rr: 1454 case X86::VMOVDQA32Z128rr: case X86::VMOVDQU32Z128rr: 1455 case X86::VMOVDQA64Z128rr: case X86::VMOVDQU64Z128rr: 1456 case X86::VMOVAPDZ256rr: case X86::VMOVUPDZ256rr: 1457 case X86::VMOVAPSZ256rr: case X86::VMOVUPSZ256rr: 1458 case X86::VMOVDQA32Z256rr: case X86::VMOVDQU32Z256rr: 1459 case X86::VMOVDQA64Z256rr: case X86::VMOVDQU64Z256rr: 1460 break; 1461 } 1462 1463 SDValue In = Move.getOperand(0); 1464 if (!In.isMachineOpcode() || 1465 In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END) 1466 continue; 1467 1468 // Make sure the instruction has a VEX, XOP, or EVEX prefix. This covers 1469 // the SHA instructions which use a legacy encoding. 1470 uint64_t TSFlags = getInstrInfo()->get(In.getMachineOpcode()).TSFlags; 1471 if ((TSFlags & X86II::EncodingMask) != X86II::VEX && 1472 (TSFlags & X86II::EncodingMask) != X86II::EVEX && 1473 (TSFlags & X86II::EncodingMask) != X86II::XOP) 1474 continue; 1475 1476 // Producing instruction is another vector instruction. We can drop the 1477 // move. 1478 CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2)); 1479 MadeChange = true; 1480 } 1481 1482 if (MadeChange) 1483 CurDAG->RemoveDeadNodes(); 1484 } 1485 1486 1487 /// Emit any code that needs to be executed only in the main function. 
1488 void X86DAGToDAGISel::emitSpecialCodeForMain() { 1489 if (Subtarget->isTargetCygMing()) { 1490 TargetLowering::ArgListTy Args; 1491 auto &DL = CurDAG->getDataLayout(); 1492 1493 TargetLowering::CallLoweringInfo CLI(*CurDAG); 1494 CLI.setChain(CurDAG->getRoot()) 1495 .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()), 1496 CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)), 1497 std::move(Args)); 1498 const TargetLowering &TLI = CurDAG->getTargetLoweringInfo(); 1499 std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI); 1500 CurDAG->setRoot(Result.second); 1501 } 1502 } 1503 1504 void X86DAGToDAGISel::emitFunctionEntryCode() { 1505 // If this is main, emit special code for main. 1506 const Function &F = MF->getFunction(); 1507 if (F.hasExternalLinkage() && F.getName() == "main") 1508 emitSpecialCodeForMain(); 1509 } 1510 1511 static bool isDispSafeForFrameIndex(int64_t Val) { 1512 // On 64-bit platforms, we can run into an issue where a frame index 1513 // includes a displacement that, when added to the explicit displacement, 1514 // will overflow the displacement field. Assuming that the frame index 1515 // displacement fits into a 31-bit integer (which is only slightly more 1516 // aggressive than the current fundamental assumption that it fits into 1517 // a 32-bit integer), a 31-bit disp should always be safe. 1518 return isInt<31>(Val); 1519 } 1520 1521 bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset, 1522 X86ISelAddressMode &AM) { 1523 // We may have already matched a displacement and the caller just added the 1524 // symbolic displacement. So we still need to do the checks even if Offset 1525 // is zero. 1526 1527 int64_t Val = AM.Disp + Offset; 1528 1529 // Cannot combine ExternalSymbol displacements with integer offsets. 
1530 if (Val != 0 && (AM.ES || AM.MCSym)) 1531 return true; 1532 1533 CodeModel::Model M = TM.getCodeModel(); 1534 if (Subtarget->is64Bit()) { 1535 if (Val != 0 && 1536 !X86::isOffsetSuitableForCodeModel(Val, M, 1537 AM.hasSymbolicDisplacement())) 1538 return true; 1539 // In addition to the checks required for a register base, check that 1540 // we do not try to use an unsafe Disp with a frame index. 1541 if (AM.BaseType == X86ISelAddressMode::FrameIndexBase && 1542 !isDispSafeForFrameIndex(Val)) 1543 return true; 1544 } 1545 AM.Disp = Val; 1546 return false; 1547 1548 } 1549 1550 bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){ 1551 SDValue Address = N->getOperand(1); 1552 1553 // load gs:0 -> GS segment register. 1554 // load fs:0 -> FS segment register. 1555 // 1556 // This optimization is valid because the GNU TLS model defines that 1557 // gs:0 (or fs:0 on X86-64) contains its own address. 1558 // For more information see http://people.redhat.com/drepper/tls.pdf 1559 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address)) 1560 if (C->getSExtValue() == 0 && AM.Segment.getNode() == nullptr && 1561 !IndirectTlsSegRefs && 1562 (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() || 1563 Subtarget->isTargetFuchsia())) 1564 switch (N->getPointerInfo().getAddrSpace()) { 1565 case X86AS::GS: 1566 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16); 1567 return false; 1568 case X86AS::FS: 1569 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16); 1570 return false; 1571 // Address space X86AS::SS is not handled here, because it is not used to 1572 // address TLS areas. 1573 } 1574 1575 return true; 1576 } 1577 1578 /// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing 1579 /// mode. These wrap things that will resolve down into a symbol reference. 1580 /// If no match is possible, this returns true, otherwise it returns false. 
1581 bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) { 1582 // If the addressing mode already has a symbol as the displacement, we can 1583 // never match another symbol. 1584 if (AM.hasSymbolicDisplacement()) 1585 return true; 1586 1587 bool IsRIPRelTLS = false; 1588 bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP; 1589 if (IsRIPRel) { 1590 SDValue Val = N.getOperand(0); 1591 if (Val.getOpcode() == ISD::TargetGlobalTLSAddress) 1592 IsRIPRelTLS = true; 1593 } 1594 1595 // We can't use an addressing mode in the 64-bit large code model. 1596 // Global TLS addressing is an exception. In the medium code model, 1597 // we use can use a mode when RIP wrappers are present. 1598 // That signifies access to globals that are known to be "near", 1599 // such as the GOT itself. 1600 CodeModel::Model M = TM.getCodeModel(); 1601 if (Subtarget->is64Bit() && 1602 ((M == CodeModel::Large && !IsRIPRelTLS) || 1603 (M == CodeModel::Medium && !IsRIPRel))) 1604 return true; 1605 1606 // Base and index reg must be 0 in order to use %rip as base. 1607 if (IsRIPRel && AM.hasBaseOrIndexReg()) 1608 return true; 1609 1610 // Make a local copy in case we can't do this fold. 
1611 X86ISelAddressMode Backup = AM; 1612 1613 int64_t Offset = 0; 1614 SDValue N0 = N.getOperand(0); 1615 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) { 1616 AM.GV = G->getGlobal(); 1617 AM.SymbolFlags = G->getTargetFlags(); 1618 Offset = G->getOffset(); 1619 } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) { 1620 AM.CP = CP->getConstVal(); 1621 AM.Alignment = CP->getAlign(); 1622 AM.SymbolFlags = CP->getTargetFlags(); 1623 Offset = CP->getOffset(); 1624 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) { 1625 AM.ES = S->getSymbol(); 1626 AM.SymbolFlags = S->getTargetFlags(); 1627 } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) { 1628 AM.MCSym = S->getMCSymbol(); 1629 } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) { 1630 AM.JT = J->getIndex(); 1631 AM.SymbolFlags = J->getTargetFlags(); 1632 } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) { 1633 AM.BlockAddr = BA->getBlockAddress(); 1634 AM.SymbolFlags = BA->getTargetFlags(); 1635 Offset = BA->getOffset(); 1636 } else 1637 llvm_unreachable("Unhandled symbol reference node."); 1638 1639 if (foldOffsetIntoAddress(Offset, AM)) { 1640 AM = Backup; 1641 return true; 1642 } 1643 1644 if (IsRIPRel) 1645 AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64)); 1646 1647 // Commit the changes now that we know this fold is safe. 1648 return false; 1649 } 1650 1651 /// Add the specified node to the specified addressing mode, returning true if 1652 /// it cannot be done. This just pattern matches for the addressing mode. 1653 bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) { 1654 if (matchAddressRecursively(N, AM, 0)) 1655 return true; 1656 1657 // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has 1658 // a smaller encoding and avoids a scaled-index. 
1659 if (AM.Scale == 2 && 1660 AM.BaseType == X86ISelAddressMode::RegBase && 1661 AM.Base_Reg.getNode() == nullptr) { 1662 AM.Base_Reg = AM.IndexReg; 1663 AM.Scale = 1; 1664 } 1665 1666 // Post-processing: Convert foo to foo(%rip), even in non-PIC mode, 1667 // because it has a smaller encoding. 1668 // TODO: Which other code models can use this? 1669 switch (TM.getCodeModel()) { 1670 default: break; 1671 case CodeModel::Small: 1672 case CodeModel::Kernel: 1673 if (Subtarget->is64Bit() && 1674 AM.Scale == 1 && 1675 AM.BaseType == X86ISelAddressMode::RegBase && 1676 AM.Base_Reg.getNode() == nullptr && 1677 AM.IndexReg.getNode() == nullptr && 1678 AM.SymbolFlags == X86II::MO_NO_FLAG && 1679 AM.hasSymbolicDisplacement()) 1680 AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64); 1681 break; 1682 } 1683 1684 return false; 1685 } 1686 1687 bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM, 1688 unsigned Depth) { 1689 // Add an artificial use to this node so that we can keep track of 1690 // it if it gets CSE'd with a different node. 1691 HandleSDNode Handle(N); 1692 1693 X86ISelAddressMode Backup = AM; 1694 if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) && 1695 !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1)) 1696 return false; 1697 AM = Backup; 1698 1699 // Try again after commutating the operands. 1700 if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM, 1701 Depth + 1) && 1702 !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth + 1)) 1703 return false; 1704 AM = Backup; 1705 1706 // If we couldn't fold both operands into the address at the same time, 1707 // see if we can just put each operand into a register and fold at least 1708 // the add. 
1709 if (AM.BaseType == X86ISelAddressMode::RegBase && 1710 !AM.Base_Reg.getNode() && 1711 !AM.IndexReg.getNode()) { 1712 N = Handle.getValue(); 1713 AM.Base_Reg = N.getOperand(0); 1714 AM.IndexReg = N.getOperand(1); 1715 AM.Scale = 1; 1716 return false; 1717 } 1718 N = Handle.getValue(); 1719 return true; 1720 } 1721 1722 // Insert a node into the DAG at least before the Pos node's position. This 1723 // will reposition the node as needed, and will assign it a node ID that is <= 1724 // the Pos node's ID. Note that this does *not* preserve the uniqueness of node 1725 // IDs! The selection DAG must no longer depend on their uniqueness when this 1726 // is used. 1727 static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) { 1728 if (N->getNodeId() == -1 || 1729 (SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) > 1730 SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode()))) { 1731 DAG.RepositionNode(Pos->getIterator(), N.getNode()); 1732 // Mark Node as invalid for pruning as after this it may be a successor to a 1733 // selected node but otherwise be in the same position of Pos. 1734 // Conservatively mark it with the same -abs(Id) to assure node id 1735 // invariant is preserved. 1736 N->setNodeId(Pos->getNodeId()); 1737 SelectionDAGISel::InvalidateNodeId(N.getNode()); 1738 } 1739 } 1740 1741 // Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if 1742 // safe. This allows us to convert the shift and and into an h-register 1743 // extract and a scaled index. Returns false if the simplification is 1744 // performed. 
1745 static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N, 1746 uint64_t Mask, 1747 SDValue Shift, SDValue X, 1748 X86ISelAddressMode &AM) { 1749 if (Shift.getOpcode() != ISD::SRL || 1750 !isa<ConstantSDNode>(Shift.getOperand(1)) || 1751 !Shift.hasOneUse()) 1752 return true; 1753 1754 int ScaleLog = 8 - Shift.getConstantOperandVal(1); 1755 if (ScaleLog <= 0 || ScaleLog >= 4 || 1756 Mask != (0xffu << ScaleLog)) 1757 return true; 1758 1759 MVT VT = N.getSimpleValueType(); 1760 SDLoc DL(N); 1761 SDValue Eight = DAG.getConstant(8, DL, MVT::i8); 1762 SDValue NewMask = DAG.getConstant(0xff, DL, VT); 1763 SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, X, Eight); 1764 SDValue And = DAG.getNode(ISD::AND, DL, VT, Srl, NewMask); 1765 SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8); 1766 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And, ShlCount); 1767 1768 // Insert the new nodes into the topological ordering. We must do this in 1769 // a valid topological ordering as nothing is going to go back and re-sort 1770 // these nodes. We continually insert before 'N' in sequence as this is 1771 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no 1772 // hierarchy left to express. 1773 insertDAGNode(DAG, N, Eight); 1774 insertDAGNode(DAG, N, Srl); 1775 insertDAGNode(DAG, N, NewMask); 1776 insertDAGNode(DAG, N, And); 1777 insertDAGNode(DAG, N, ShlCount); 1778 insertDAGNode(DAG, N, Shl); 1779 DAG.ReplaceAllUsesWith(N, Shl); 1780 DAG.RemoveDeadNode(N.getNode()); 1781 AM.IndexReg = And; 1782 AM.Scale = (1 << ScaleLog); 1783 return false; 1784 } 1785 1786 // Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this 1787 // allows us to fold the shift into this addressing mode. Returns false if the 1788 // transform succeeded. 
1789 static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N, 1790 X86ISelAddressMode &AM) { 1791 SDValue Shift = N.getOperand(0); 1792 1793 // Use a signed mask so that shifting right will insert sign bits. These 1794 // bits will be removed when we shift the result left so it doesn't matter 1795 // what we use. This might allow a smaller immediate encoding. 1796 int64_t Mask = cast<ConstantSDNode>(N->getOperand(1))->getSExtValue(); 1797 1798 // If we have an any_extend feeding the AND, look through it to see if there 1799 // is a shift behind it. But only if the AND doesn't use the extended bits. 1800 // FIXME: Generalize this to other ANY_EXTEND than i32 to i64? 1801 bool FoundAnyExtend = false; 1802 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() && 1803 Shift.getOperand(0).getSimpleValueType() == MVT::i32 && 1804 isUInt<32>(Mask)) { 1805 FoundAnyExtend = true; 1806 Shift = Shift.getOperand(0); 1807 } 1808 1809 if (Shift.getOpcode() != ISD::SHL || 1810 !isa<ConstantSDNode>(Shift.getOperand(1))) 1811 return true; 1812 1813 SDValue X = Shift.getOperand(0); 1814 1815 // Not likely to be profitable if either the AND or SHIFT node has more 1816 // than one use (unless all uses are for address computation). Besides, 1817 // isel mechanism requires their node ids to be reused. 1818 if (!N.hasOneUse() || !Shift.hasOneUse()) 1819 return true; 1820 1821 // Verify that the shift amount is something we can fold. 
1822 unsigned ShiftAmt = Shift.getConstantOperandVal(1); 1823 if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3) 1824 return true; 1825 1826 MVT VT = N.getSimpleValueType(); 1827 SDLoc DL(N); 1828 if (FoundAnyExtend) { 1829 SDValue NewX = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X); 1830 insertDAGNode(DAG, N, NewX); 1831 X = NewX; 1832 } 1833 1834 SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, DL, VT); 1835 SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask); 1836 SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1)); 1837 1838 // Insert the new nodes into the topological ordering. We must do this in 1839 // a valid topological ordering as nothing is going to go back and re-sort 1840 // these nodes. We continually insert before 'N' in sequence as this is 1841 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no 1842 // hierarchy left to express. 1843 insertDAGNode(DAG, N, NewMask); 1844 insertDAGNode(DAG, N, NewAnd); 1845 insertDAGNode(DAG, N, NewShift); 1846 DAG.ReplaceAllUsesWith(N, NewShift); 1847 DAG.RemoveDeadNode(N.getNode()); 1848 1849 AM.Scale = 1 << ShiftAmt; 1850 AM.IndexReg = NewAnd; 1851 return false; 1852 } 1853 1854 // Implement some heroics to detect shifts of masked values where the mask can 1855 // be replaced by extending the shift and undoing that in the addressing mode 1856 // scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and 1857 // (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in 1858 // the addressing mode. This results in code such as: 1859 // 1860 // int f(short *y, int *lookup_table) { 1861 // ... 
1862 // return *y + lookup_table[*y >> 11]; 1863 // } 1864 // 1865 // Turning into: 1866 // movzwl (%rdi), %eax 1867 // movl %eax, %ecx 1868 // shrl $11, %ecx 1869 // addl (%rsi,%rcx,4), %eax 1870 // 1871 // Instead of: 1872 // movzwl (%rdi), %eax 1873 // movl %eax, %ecx 1874 // shrl $9, %ecx 1875 // andl $124, %rcx 1876 // addl (%rsi,%rcx), %eax 1877 // 1878 // Note that this function assumes the mask is provided as a mask *after* the 1879 // value is shifted. The input chain may or may not match that, but computing 1880 // such a mask is trivial. 1881 static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, 1882 uint64_t Mask, 1883 SDValue Shift, SDValue X, 1884 X86ISelAddressMode &AM) { 1885 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() || 1886 !isa<ConstantSDNode>(Shift.getOperand(1))) 1887 return true; 1888 1889 unsigned ShiftAmt = Shift.getConstantOperandVal(1); 1890 unsigned MaskLZ = countLeadingZeros(Mask); 1891 unsigned MaskTZ = countTrailingZeros(Mask); 1892 1893 // The amount of shift we're trying to fit into the addressing mode is taken 1894 // from the trailing zeros of the mask. 1895 unsigned AMShiftAmt = MaskTZ; 1896 1897 // There is nothing we can do here unless the mask is removing some bits. 1898 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits. 1899 if (AMShiftAmt == 0 || AMShiftAmt > 3) return true; 1900 1901 // We also need to ensure that mask is a continuous run of bits. 1902 if (countTrailingOnes(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true; 1903 1904 // Scale the leading zero count down based on the actual size of the value. 1905 // Also scale it down based on the size of the shift. 1906 unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt; 1907 if (MaskLZ < ScaleDown) 1908 return true; 1909 MaskLZ -= ScaleDown; 1910 1911 // The final check is to ensure that any masked out high bits of X are 1912 // already known to be zero. 
Otherwise, the mask has a semantic impact 1913 // other than masking out a couple of low bits. Unfortunately, because of 1914 // the mask, zero extensions will be removed from operands in some cases. 1915 // This code works extra hard to look through extensions because we can 1916 // replace them with zero extensions cheaply if necessary. 1917 bool ReplacingAnyExtend = false; 1918 if (X.getOpcode() == ISD::ANY_EXTEND) { 1919 unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() - 1920 X.getOperand(0).getSimpleValueType().getSizeInBits(); 1921 // Assume that we'll replace the any-extend with a zero-extend, and 1922 // narrow the search to the extended value. 1923 X = X.getOperand(0); 1924 MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits; 1925 ReplacingAnyExtend = true; 1926 } 1927 APInt MaskedHighBits = 1928 APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ); 1929 KnownBits Known = DAG.computeKnownBits(X); 1930 if (MaskedHighBits != Known.Zero) return true; 1931 1932 // We've identified a pattern that can be transformed into a single shift 1933 // and an addressing mode. Make it so. 1934 MVT VT = N.getSimpleValueType(); 1935 if (ReplacingAnyExtend) { 1936 assert(X.getValueType() != VT); 1937 // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND. 1938 SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X); 1939 insertDAGNode(DAG, N, NewX); 1940 X = NewX; 1941 } 1942 SDLoc DL(N); 1943 SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8); 1944 SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt); 1945 SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8); 1946 SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewSRL, NewSHLAmt); 1947 1948 // Insert the new nodes into the topological ordering. We must do this in 1949 // a valid topological ordering as nothing is going to go back and re-sort 1950 // these nodes. 
We continually insert before 'N' in sequence as this is 1951 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no 1952 // hierarchy left to express. 1953 insertDAGNode(DAG, N, NewSRLAmt); 1954 insertDAGNode(DAG, N, NewSRL); 1955 insertDAGNode(DAG, N, NewSHLAmt); 1956 insertDAGNode(DAG, N, NewSHL); 1957 DAG.ReplaceAllUsesWith(N, NewSHL); 1958 DAG.RemoveDeadNode(N.getNode()); 1959 1960 AM.Scale = 1 << AMShiftAmt; 1961 AM.IndexReg = NewSRL; 1962 return false; 1963 } 1964 1965 // Transform "(X >> SHIFT) & (MASK << C1)" to 1966 // "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be 1967 // matched to a BEXTR later. Returns false if the simplification is performed. 1968 static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N, 1969 uint64_t Mask, 1970 SDValue Shift, SDValue X, 1971 X86ISelAddressMode &AM, 1972 const X86Subtarget &Subtarget) { 1973 if (Shift.getOpcode() != ISD::SRL || 1974 !isa<ConstantSDNode>(Shift.getOperand(1)) || 1975 !Shift.hasOneUse() || !N.hasOneUse()) 1976 return true; 1977 1978 // Only do this if BEXTR will be matched by matchBEXTRFromAndImm. 1979 if (!Subtarget.hasTBM() && 1980 !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR())) 1981 return true; 1982 1983 // We need to ensure that mask is a continuous run of bits. 1984 if (!isShiftedMask_64(Mask)) return true; 1985 1986 unsigned ShiftAmt = Shift.getConstantOperandVal(1); 1987 1988 // The amount of shift we're trying to fit into the addressing mode is taken 1989 // from the trailing zeros of the mask. 1990 unsigned AMShiftAmt = countTrailingZeros(Mask); 1991 1992 // There is nothing we can do here unless the mask is removing some bits. 1993 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits. 
1994 if (AMShiftAmt == 0 || AMShiftAmt > 3) return true; 1995 1996 MVT VT = N.getSimpleValueType(); 1997 SDLoc DL(N); 1998 SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8); 1999 SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt); 2000 SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, VT); 2001 SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, NewSRL, NewMask); 2002 SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8); 2003 SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewAnd, NewSHLAmt); 2004 2005 // Insert the new nodes into the topological ordering. We must do this in 2006 // a valid topological ordering as nothing is going to go back and re-sort 2007 // these nodes. We continually insert before 'N' in sequence as this is 2008 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no 2009 // hierarchy left to express. 2010 insertDAGNode(DAG, N, NewSRLAmt); 2011 insertDAGNode(DAG, N, NewSRL); 2012 insertDAGNode(DAG, N, NewMask); 2013 insertDAGNode(DAG, N, NewAnd); 2014 insertDAGNode(DAG, N, NewSHLAmt); 2015 insertDAGNode(DAG, N, NewSHL); 2016 DAG.ReplaceAllUsesWith(N, NewSHL); 2017 DAG.RemoveDeadNode(N.getNode()); 2018 2019 AM.Scale = 1 << AMShiftAmt; 2020 AM.IndexReg = NewAnd; 2021 return false; 2022 } 2023 2024 bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, 2025 unsigned Depth) { 2026 SDLoc dl(N); 2027 LLVM_DEBUG({ 2028 dbgs() << "MatchAddress: "; 2029 AM.dump(CurDAG); 2030 }); 2031 // Limit recursion. 2032 if (Depth > 5) 2033 return matchAddressBase(N, AM); 2034 2035 // If this is already a %rip relative address, we can only merge immediates 2036 // into it. Instead of handling this in every case, we handle it here. 2037 // RIP relative addressing: %rip + 32-bit displacement! 2038 if (AM.isRIPRelative()) { 2039 // FIXME: JumpTable and ExternalSymbol address currently don't like 2040 // displacements. 
It isn't very important, but this should be fixed for 2041 // consistency. 2042 if (!(AM.ES || AM.MCSym) && AM.JT != -1) 2043 return true; 2044 2045 if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N)) 2046 if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM)) 2047 return false; 2048 return true; 2049 } 2050 2051 switch (N.getOpcode()) { 2052 default: break; 2053 case ISD::LOCAL_RECOVER: { 2054 if (!AM.hasSymbolicDisplacement() && AM.Disp == 0) 2055 if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) { 2056 // Use the symbol and don't prefix it. 2057 AM.MCSym = ESNode->getMCSymbol(); 2058 return false; 2059 } 2060 break; 2061 } 2062 case ISD::Constant: { 2063 uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue(); 2064 if (!foldOffsetIntoAddress(Val, AM)) 2065 return false; 2066 break; 2067 } 2068 2069 case X86ISD::Wrapper: 2070 case X86ISD::WrapperRIP: 2071 if (!matchWrapper(N, AM)) 2072 return false; 2073 break; 2074 2075 case ISD::LOAD: 2076 if (!matchLoadInAddress(cast<LoadSDNode>(N), AM)) 2077 return false; 2078 break; 2079 2080 case ISD::FrameIndex: 2081 if (AM.BaseType == X86ISelAddressMode::RegBase && 2082 AM.Base_Reg.getNode() == nullptr && 2083 (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) { 2084 AM.BaseType = X86ISelAddressMode::FrameIndexBase; 2085 AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex(); 2086 return false; 2087 } 2088 break; 2089 2090 case ISD::SHL: 2091 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) 2092 break; 2093 2094 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) { 2095 unsigned Val = CN->getZExtValue(); 2096 // Note that we handle x<<1 as (,x,2) rather than (x,x) here so 2097 // that the base operand remains free for further matching. If 2098 // the base doesn't end up getting used, a post-processing step 2099 // in MatchAddress turns (,x,2) into (x,x), which is cheaper. 
2100 if (Val == 1 || Val == 2 || Val == 3) { 2101 AM.Scale = 1 << Val; 2102 SDValue ShVal = N.getOperand(0); 2103 2104 // Okay, we know that we have a scale by now. However, if the scaled 2105 // value is an add of something and a constant, we can fold the 2106 // constant into the disp field here. 2107 if (CurDAG->isBaseWithConstantOffset(ShVal)) { 2108 AM.IndexReg = ShVal.getOperand(0); 2109 ConstantSDNode *AddVal = cast<ConstantSDNode>(ShVal.getOperand(1)); 2110 uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val; 2111 if (!foldOffsetIntoAddress(Disp, AM)) 2112 return false; 2113 } 2114 2115 AM.IndexReg = ShVal; 2116 return false; 2117 } 2118 } 2119 break; 2120 2121 case ISD::SRL: { 2122 // Scale must not be used already. 2123 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break; 2124 2125 // We only handle up to 64-bit values here as those are what matter for 2126 // addressing mode optimizations. 2127 assert(N.getSimpleValueType().getSizeInBits() <= 64 && 2128 "Unexpected value size!"); 2129 2130 SDValue And = N.getOperand(0); 2131 if (And.getOpcode() != ISD::AND) break; 2132 SDValue X = And.getOperand(0); 2133 2134 // The mask used for the transform is expected to be post-shift, but we 2135 // found the shift first so just apply the shift to the mask before passing 2136 // it down. 2137 if (!isa<ConstantSDNode>(N.getOperand(1)) || 2138 !isa<ConstantSDNode>(And.getOperand(1))) 2139 break; 2140 uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1); 2141 2142 // Try to fold the mask and shift into the scale, and return false if we 2143 // succeed. 2144 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM)) 2145 return false; 2146 break; 2147 } 2148 2149 case ISD::SMUL_LOHI: 2150 case ISD::UMUL_LOHI: 2151 // A mul_lohi where we need the low part can be folded as a plain multiply. 
2152 if (N.getResNo() != 0) break; 2153 LLVM_FALLTHROUGH; 2154 case ISD::MUL: 2155 case X86ISD::MUL_IMM: 2156 // X*[3,5,9] -> X+X*[2,4,8] 2157 if (AM.BaseType == X86ISelAddressMode::RegBase && 2158 AM.Base_Reg.getNode() == nullptr && 2159 AM.IndexReg.getNode() == nullptr) { 2160 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) 2161 if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 || 2162 CN->getZExtValue() == 9) { 2163 AM.Scale = unsigned(CN->getZExtValue())-1; 2164 2165 SDValue MulVal = N.getOperand(0); 2166 SDValue Reg; 2167 2168 // Okay, we know that we have a scale by now. However, if the scaled 2169 // value is an add of something and a constant, we can fold the 2170 // constant into the disp field here. 2171 if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() && 2172 isa<ConstantSDNode>(MulVal.getOperand(1))) { 2173 Reg = MulVal.getOperand(0); 2174 ConstantSDNode *AddVal = 2175 cast<ConstantSDNode>(MulVal.getOperand(1)); 2176 uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue(); 2177 if (foldOffsetIntoAddress(Disp, AM)) 2178 Reg = N.getOperand(0); 2179 } else { 2180 Reg = N.getOperand(0); 2181 } 2182 2183 AM.IndexReg = AM.Base_Reg = Reg; 2184 return false; 2185 } 2186 } 2187 break; 2188 2189 case ISD::SUB: { 2190 // Given A-B, if A can be completely folded into the address and 2191 // the index field with the index field unused, use -B as the index. 2192 // This is a win if a has multiple parts that can be folded into 2193 // the address. Also, this saves a mov if the base register has 2194 // other uses, since it avoids a two-address sub instruction, however 2195 // it costs an additional mov if the index register has other uses. 2196 2197 // Add an artificial use to this node so that we can keep track of 2198 // it if it gets CSE'd with a different node. 2199 HandleSDNode Handle(N); 2200 2201 // Test if the LHS of the sub can be folded. 
2202 X86ISelAddressMode Backup = AM; 2203 if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) { 2204 N = Handle.getValue(); 2205 AM = Backup; 2206 break; 2207 } 2208 N = Handle.getValue(); 2209 // Test if the index field is free for use. 2210 if (AM.IndexReg.getNode() || AM.isRIPRelative()) { 2211 AM = Backup; 2212 break; 2213 } 2214 2215 int Cost = 0; 2216 SDValue RHS = N.getOperand(1); 2217 // If the RHS involves a register with multiple uses, this 2218 // transformation incurs an extra mov, due to the neg instruction 2219 // clobbering its operand. 2220 if (!RHS.getNode()->hasOneUse() || 2221 RHS.getNode()->getOpcode() == ISD::CopyFromReg || 2222 RHS.getNode()->getOpcode() == ISD::TRUNCATE || 2223 RHS.getNode()->getOpcode() == ISD::ANY_EXTEND || 2224 (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND && 2225 RHS.getOperand(0).getValueType() == MVT::i32)) 2226 ++Cost; 2227 // If the base is a register with multiple uses, this 2228 // transformation may save a mov. 2229 if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() && 2230 !AM.Base_Reg.getNode()->hasOneUse()) || 2231 AM.BaseType == X86ISelAddressMode::FrameIndexBase) 2232 --Cost; 2233 // If the folded LHS was interesting, this transformation saves 2234 // address arithmetic. 2235 if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) + 2236 ((AM.Disp != 0) && (Backup.Disp == 0)) + 2237 (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2) 2238 --Cost; 2239 // If it doesn't look like it may be an overall win, don't do it. 2240 if (Cost >= 0) { 2241 AM = Backup; 2242 break; 2243 } 2244 2245 // Ok, the transformation is legal and appears profitable. Go for it. 2246 // Negation will be emitted later to avoid creating dangling nodes if this 2247 // was an unprofitable LEA. 
2248 AM.IndexReg = RHS; 2249 AM.NegateIndex = true; 2250 AM.Scale = 1; 2251 return false; 2252 } 2253 2254 case ISD::ADD: 2255 if (!matchAdd(N, AM, Depth)) 2256 return false; 2257 break; 2258 2259 case ISD::OR: 2260 // We want to look through a transform in InstCombine and DAGCombiner that 2261 // turns 'add' into 'or', so we can treat this 'or' exactly like an 'add'. 2262 // Example: (or (and x, 1), (shl y, 3)) --> (add (and x, 1), (shl y, 3)) 2263 // An 'lea' can then be used to match the shift (multiply) and add: 2264 // and $1, %esi 2265 // lea (%rsi, %rdi, 8), %rax 2266 if (CurDAG->haveNoCommonBitsSet(N.getOperand(0), N.getOperand(1)) && 2267 !matchAdd(N, AM, Depth)) 2268 return false; 2269 break; 2270 2271 case ISD::AND: { 2272 // Perform some heroic transforms on an and of a constant-count shift 2273 // with a constant to enable use of the scaled offset field. 2274 2275 // Scale must not be used already. 2276 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break; 2277 2278 // We only handle up to 64-bit values here as those are what matter for 2279 // addressing mode optimizations. 2280 assert(N.getSimpleValueType().getSizeInBits() <= 64 && 2281 "Unexpected value size!"); 2282 2283 if (!isa<ConstantSDNode>(N.getOperand(1))) 2284 break; 2285 2286 if (N.getOperand(0).getOpcode() == ISD::SRL) { 2287 SDValue Shift = N.getOperand(0); 2288 SDValue X = Shift.getOperand(0); 2289 2290 uint64_t Mask = N.getConstantOperandVal(1); 2291 2292 // Try to fold the mask and shift into an extract and scale. 2293 if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM)) 2294 return false; 2295 2296 // Try to fold the mask and shift directly into the scale. 2297 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM)) 2298 return false; 2299 2300 // Try to fold the mask and shift into BEXTR and scale. 
2301 if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget)) 2302 return false; 2303 } 2304 2305 // Try to swap the mask and shift to place shifts which can be done as 2306 // a scale on the outside of the mask. 2307 if (!foldMaskedShiftToScaledMask(*CurDAG, N, AM)) 2308 return false; 2309 2310 break; 2311 } 2312 case ISD::ZERO_EXTEND: { 2313 // Try to widen a zexted shift left to the same size as its use, so we can 2314 // match the shift as a scale factor. 2315 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) 2316 break; 2317 if (N.getOperand(0).getOpcode() != ISD::SHL || !N.getOperand(0).hasOneUse()) 2318 break; 2319 2320 // Give up if the shift is not a valid scale factor [1,2,3]. 2321 SDValue Shl = N.getOperand(0); 2322 auto *ShAmtC = dyn_cast<ConstantSDNode>(Shl.getOperand(1)); 2323 if (!ShAmtC || ShAmtC->getZExtValue() > 3) 2324 break; 2325 2326 // The narrow shift must only shift out zero bits (it must be 'nuw'). 2327 // That makes it safe to widen to the destination type. 2328 APInt HighZeros = APInt::getHighBitsSet(Shl.getValueSizeInBits(), 2329 ShAmtC->getZExtValue()); 2330 if (!CurDAG->MaskedValueIsZero(Shl.getOperand(0), HighZeros)) 2331 break; 2332 2333 // zext (shl nuw i8 %x, C) to i32 --> shl (zext i8 %x to i32), (zext C) 2334 MVT VT = N.getSimpleValueType(); 2335 SDLoc DL(N); 2336 SDValue Zext = CurDAG->getNode(ISD::ZERO_EXTEND, DL, VT, Shl.getOperand(0)); 2337 SDValue NewShl = CurDAG->getNode(ISD::SHL, DL, VT, Zext, Shl.getOperand(1)); 2338 2339 // Convert the shift to scale factor. 2340 AM.Scale = 1 << ShAmtC->getZExtValue(); 2341 AM.IndexReg = Zext; 2342 2343 insertDAGNode(*CurDAG, N, Zext); 2344 insertDAGNode(*CurDAG, N, NewShl); 2345 CurDAG->ReplaceAllUsesWith(N, NewShl); 2346 CurDAG->RemoveDeadNode(N.getNode()); 2347 return false; 2348 } 2349 } 2350 2351 return matchAddressBase(N, AM); 2352 } 2353 2354 /// Helper for MatchAddress. 
Add the specified node to the 2355 /// specified addressing mode without any further recursion. 2356 bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) { 2357 // Is the base register already occupied? 2358 if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) { 2359 // If so, check to see if the scale index register is set. 2360 if (!AM.IndexReg.getNode()) { 2361 AM.IndexReg = N; 2362 AM.Scale = 1; 2363 return false; 2364 } 2365 2366 // Otherwise, we cannot select it. 2367 return true; 2368 } 2369 2370 // Default, generate it as a register. 2371 AM.BaseType = X86ISelAddressMode::RegBase; 2372 AM.Base_Reg = N; 2373 return false; 2374 } 2375 2376 /// Helper for selectVectorAddr. Handles things that can be folded into a 2377 /// gather scatter address. The index register and scale should have already 2378 /// been handled. 2379 bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) { 2380 // TODO: Support other operations. 2381 switch (N.getOpcode()) { 2382 case ISD::Constant: { 2383 uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue(); 2384 if (!foldOffsetIntoAddress(Val, AM)) 2385 return false; 2386 break; 2387 } 2388 case X86ISD::Wrapper: 2389 if (!matchWrapper(N, AM)) 2390 return false; 2391 break; 2392 } 2393 2394 return matchAddressBase(N, AM); 2395 } 2396 2397 bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, 2398 SDValue IndexOp, SDValue ScaleOp, 2399 SDValue &Base, SDValue &Scale, 2400 SDValue &Index, SDValue &Disp, 2401 SDValue &Segment) { 2402 X86ISelAddressMode AM; 2403 AM.IndexReg = IndexOp; 2404 AM.Scale = cast<ConstantSDNode>(ScaleOp)->getZExtValue(); 2405 2406 unsigned AddrSpace = Parent->getPointerInfo().getAddrSpace(); 2407 if (AddrSpace == X86AS::GS) 2408 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16); 2409 if (AddrSpace == X86AS::FS) 2410 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16); 2411 if (AddrSpace == X86AS::SS) 2412 AM.Segment = 
CurDAG->getRegister(X86::SS, MVT::i16); 2413 2414 SDLoc DL(BasePtr); 2415 MVT VT = BasePtr.getSimpleValueType(); 2416 2417 // Try to match into the base and displacement fields. 2418 if (matchVectorAddress(BasePtr, AM)) 2419 return false; 2420 2421 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment); 2422 return true; 2423 } 2424 2425 /// Returns true if it is able to pattern match an addressing mode. 2426 /// It returns the operands which make up the maximal addressing mode it can 2427 /// match by reference. 2428 /// 2429 /// Parent is the parent node of the addr operand that is being matched. It 2430 /// is always a load, store, atomic node, or null. It is only null when 2431 /// checking memory operands for inline asm nodes. 2432 bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base, 2433 SDValue &Scale, SDValue &Index, 2434 SDValue &Disp, SDValue &Segment) { 2435 X86ISelAddressMode AM; 2436 2437 if (Parent && 2438 // This list of opcodes are all the nodes that have an "addr:$ptr" operand 2439 // that are not a MemSDNode, and thus don't have proper addrspace info. 
2440 Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme 2441 Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores 2442 Parent->getOpcode() != X86ISD::TLSCALL && // Fixme 2443 Parent->getOpcode() != X86ISD::ENQCMD && // Fixme 2444 Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme 2445 Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp 2446 Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp 2447 unsigned AddrSpace = 2448 cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace(); 2449 if (AddrSpace == X86AS::GS) 2450 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16); 2451 if (AddrSpace == X86AS::FS) 2452 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16); 2453 if (AddrSpace == X86AS::SS) 2454 AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16); 2455 } 2456 2457 // Save the DL and VT before calling matchAddress, it can invalidate N. 2458 SDLoc DL(N); 2459 MVT VT = N.getSimpleValueType(); 2460 2461 if (matchAddress(N, AM)) 2462 return false; 2463 2464 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment); 2465 return true; 2466 } 2467 2468 bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) { 2469 // In static codegen with small code model, we can get the address of a label 2470 // into a register with 'movl' 2471 if (N->getOpcode() != X86ISD::Wrapper) 2472 return false; 2473 2474 N = N.getOperand(0); 2475 2476 // At least GNU as does not accept 'movl' for TPOFF relocations. 2477 // FIXME: We could use 'movl' when we know we are targeting MC. 
2478 if (N->getOpcode() == ISD::TargetGlobalTLSAddress) 2479 return false; 2480 2481 Imm = N; 2482 if (N->getOpcode() != ISD::TargetGlobalAddress) 2483 return TM.getCodeModel() == CodeModel::Small; 2484 2485 Optional<ConstantRange> CR = 2486 cast<GlobalAddressSDNode>(N)->getGlobal()->getAbsoluteSymbolRange(); 2487 if (!CR) 2488 return TM.getCodeModel() == CodeModel::Small; 2489 2490 return CR->getUnsignedMax().ult(1ull << 32); 2491 } 2492 2493 bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base, 2494 SDValue &Scale, SDValue &Index, 2495 SDValue &Disp, SDValue &Segment) { 2496 // Save the debug loc before calling selectLEAAddr, in case it invalidates N. 2497 SDLoc DL(N); 2498 2499 if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment)) 2500 return false; 2501 2502 RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Base); 2503 if (RN && RN->getReg() == 0) 2504 Base = CurDAG->getRegister(0, MVT::i64); 2505 else if (Base.getValueType() == MVT::i32 && !isa<FrameIndexSDNode>(Base)) { 2506 // Base could already be %rip, particularly in the x32 ABI. 2507 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL, 2508 MVT::i64), 0); 2509 Base = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef, 2510 Base); 2511 } 2512 2513 RN = dyn_cast<RegisterSDNode>(Index); 2514 if (RN && RN->getReg() == 0) 2515 Index = CurDAG->getRegister(0, MVT::i64); 2516 else { 2517 assert(Index.getValueType() == MVT::i32 && 2518 "Expect to be extending 32-bit registers for use in LEA"); 2519 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL, 2520 MVT::i64), 0); 2521 Index = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef, 2522 Index); 2523 } 2524 2525 return true; 2526 } 2527 2528 /// Calls SelectAddr and determines if the maximal addressing 2529 /// mode it matches can be cost effectively emitted as an LEA instruction. 
2530 bool X86DAGToDAGISel::selectLEAAddr(SDValue N, 2531 SDValue &Base, SDValue &Scale, 2532 SDValue &Index, SDValue &Disp, 2533 SDValue &Segment) { 2534 X86ISelAddressMode AM; 2535 2536 // Save the DL and VT before calling matchAddress, it can invalidate N. 2537 SDLoc DL(N); 2538 MVT VT = N.getSimpleValueType(); 2539 2540 // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support 2541 // segments. 2542 SDValue Copy = AM.Segment; 2543 SDValue T = CurDAG->getRegister(0, MVT::i32); 2544 AM.Segment = T; 2545 if (matchAddress(N, AM)) 2546 return false; 2547 assert (T == AM.Segment); 2548 AM.Segment = Copy; 2549 2550 unsigned Complexity = 0; 2551 if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode()) 2552 Complexity = 1; 2553 else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase) 2554 Complexity = 4; 2555 2556 if (AM.IndexReg.getNode()) 2557 Complexity++; 2558 2559 // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with 2560 // a simple shift. 2561 if (AM.Scale > 1) 2562 Complexity++; 2563 2564 // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA 2565 // to a LEA. This is determined with some experimentation but is by no means 2566 // optimal (especially for code size consideration). LEA is nice because of 2567 // its three-address nature. Tweak the cost function again when we can run 2568 // convertToThreeAddress() at register allocation time. 2569 if (AM.hasSymbolicDisplacement()) { 2570 // For X86-64, always use LEA to materialize RIP-relative addresses. 2571 if (Subtarget->is64Bit()) 2572 Complexity = 4; 2573 else 2574 Complexity += 2; 2575 } 2576 2577 // Heuristic: try harder to form an LEA from ADD if the operands set flags. 2578 // Unlike ADD, LEA does not affect flags, so we will be less likely to require 2579 // duplicating flag-producing instructions later in the pipeline. 
2580 if (N.getOpcode() == ISD::ADD) { 2581 auto isMathWithFlags = [](SDValue V) { 2582 switch (V.getOpcode()) { 2583 case X86ISD::ADD: 2584 case X86ISD::SUB: 2585 case X86ISD::ADC: 2586 case X86ISD::SBB: 2587 /* TODO: These opcodes can be added safely, but we may want to justify 2588 their inclusion for different reasons (better for reg-alloc). 2589 case X86ISD::SMUL: 2590 case X86ISD::UMUL: 2591 case X86ISD::OR: 2592 case X86ISD::XOR: 2593 case X86ISD::AND: 2594 */ 2595 // Value 1 is the flag output of the node - verify it's not dead. 2596 return !SDValue(V.getNode(), 1).use_empty(); 2597 default: 2598 return false; 2599 } 2600 }; 2601 // TODO: This could be an 'or' rather than 'and' to make the transform more 2602 // likely to happen. We might want to factor in whether there's a 2603 // load folding opportunity for the math op that disappears with LEA. 2604 if (isMathWithFlags(N.getOperand(0)) && isMathWithFlags(N.getOperand(1))) 2605 Complexity++; 2606 } 2607 2608 if (AM.Disp) 2609 Complexity++; 2610 2611 // If it isn't worth using an LEA, reject it. 2612 if (Complexity <= 2) 2613 return false; 2614 2615 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment); 2616 return true; 2617 } 2618 2619 /// This is only run on TargetGlobalTLSAddress nodes. 
2620 bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base, 2621 SDValue &Scale, SDValue &Index, 2622 SDValue &Disp, SDValue &Segment) { 2623 assert(N.getOpcode() == ISD::TargetGlobalTLSAddress); 2624 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N); 2625 2626 X86ISelAddressMode AM; 2627 AM.GV = GA->getGlobal(); 2628 AM.Disp += GA->getOffset(); 2629 AM.SymbolFlags = GA->getTargetFlags(); 2630 2631 MVT VT = N.getSimpleValueType(); 2632 if (VT == MVT::i32) { 2633 AM.Scale = 1; 2634 AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32); 2635 } 2636 2637 getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment); 2638 return true; 2639 } 2640 2641 bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) { 2642 // Keep track of the original value type and whether this value was 2643 // truncated. If we see a truncation from pointer type to VT that truncates 2644 // bits that are known to be zero, we can use a narrow reference. 2645 EVT VT = N.getValueType(); 2646 bool WasTruncated = false; 2647 if (N.getOpcode() == ISD::TRUNCATE) { 2648 WasTruncated = true; 2649 N = N.getOperand(0); 2650 } 2651 2652 if (N.getOpcode() != X86ISD::Wrapper) 2653 return false; 2654 2655 // We can only use non-GlobalValues as immediates if they were not truncated, 2656 // as we do not have any range information. If we have a GlobalValue and the 2657 // address was not truncated, we can select it as an operand directly. 2658 unsigned Opc = N.getOperand(0)->getOpcode(); 2659 if (Opc != ISD::TargetGlobalAddress || !WasTruncated) { 2660 Op = N.getOperand(0); 2661 // We can only select the operand directly if we didn't have to look past a 2662 // truncate. 2663 return !WasTruncated; 2664 } 2665 2666 // Check that the global's range fits into VT. 
2667 auto *GA = cast<GlobalAddressSDNode>(N.getOperand(0)); 2668 Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange(); 2669 if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits())) 2670 return false; 2671 2672 // Okay, we can use a narrow reference. 2673 Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT, 2674 GA->getOffset(), GA->getTargetFlags()); 2675 return true; 2676 } 2677 2678 bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N, 2679 SDValue &Base, SDValue &Scale, 2680 SDValue &Index, SDValue &Disp, 2681 SDValue &Segment) { 2682 assert(Root && P && "Unknown root/parent nodes"); 2683 if (!ISD::isNON_EXTLoad(N.getNode()) || 2684 !IsProfitableToFold(N, P, Root) || 2685 !IsLegalToFold(N, P, Root, OptLevel)) 2686 return false; 2687 2688 return selectAddr(N.getNode(), 2689 N.getOperand(1), Base, Scale, Index, Disp, Segment); 2690 } 2691 2692 bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N, 2693 SDValue &Base, SDValue &Scale, 2694 SDValue &Index, SDValue &Disp, 2695 SDValue &Segment) { 2696 assert(Root && P && "Unknown root/parent nodes"); 2697 if (N->getOpcode() != X86ISD::VBROADCAST_LOAD || 2698 !IsProfitableToFold(N, P, Root) || 2699 !IsLegalToFold(N, P, Root, OptLevel)) 2700 return false; 2701 2702 return selectAddr(N.getNode(), 2703 N.getOperand(1), Base, Scale, Index, Disp, Segment); 2704 } 2705 2706 /// Return an SDNode that returns the value of the global base register. 2707 /// Output instructions required to initialize the global base register, 2708 /// if necessary. 
2709 SDNode *X86DAGToDAGISel::getGlobalBaseReg() { 2710 unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF); 2711 auto &DL = MF->getDataLayout(); 2712 return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode(); 2713 } 2714 2715 bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const { 2716 if (N->getOpcode() == ISD::TRUNCATE) 2717 N = N->getOperand(0).getNode(); 2718 if (N->getOpcode() != X86ISD::Wrapper) 2719 return false; 2720 2721 auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0)); 2722 if (!GA) 2723 return false; 2724 2725 Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange(); 2726 return CR && CR->getSignedMin().sge(-1ull << Width) && 2727 CR->getSignedMax().slt(1ull << Width); 2728 } 2729 2730 static X86::CondCode getCondFromNode(SDNode *N) { 2731 assert(N->isMachineOpcode() && "Unexpected node"); 2732 X86::CondCode CC = X86::COND_INVALID; 2733 unsigned Opc = N->getMachineOpcode(); 2734 if (Opc == X86::JCC_1) 2735 CC = static_cast<X86::CondCode>(N->getConstantOperandVal(1)); 2736 else if (Opc == X86::SETCCr) 2737 CC = static_cast<X86::CondCode>(N->getConstantOperandVal(0)); 2738 else if (Opc == X86::SETCCm) 2739 CC = static_cast<X86::CondCode>(N->getConstantOperandVal(5)); 2740 else if (Opc == X86::CMOV16rr || Opc == X86::CMOV32rr || 2741 Opc == X86::CMOV64rr) 2742 CC = static_cast<X86::CondCode>(N->getConstantOperandVal(2)); 2743 else if (Opc == X86::CMOV16rm || Opc == X86::CMOV32rm || 2744 Opc == X86::CMOV64rm) 2745 CC = static_cast<X86::CondCode>(N->getConstantOperandVal(6)); 2746 2747 return CC; 2748 } 2749 2750 /// Test whether the given X86ISD::CMP node has any users that use a flag 2751 /// other than ZF. 2752 bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const { 2753 // Examine each user of the node. 2754 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end(); 2755 UI != UE; ++UI) { 2756 // Only check things that use the flags. 
2757 if (UI.getUse().getResNo() != Flags.getResNo()) 2758 continue; 2759 // Only examine CopyToReg uses that copy to EFLAGS. 2760 if (UI->getOpcode() != ISD::CopyToReg || 2761 cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS) 2762 return false; 2763 // Examine each user of the CopyToReg use. 2764 for (SDNode::use_iterator FlagUI = UI->use_begin(), 2765 FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) { 2766 // Only examine the Flag result. 2767 if (FlagUI.getUse().getResNo() != 1) continue; 2768 // Anything unusual: assume conservatively. 2769 if (!FlagUI->isMachineOpcode()) return false; 2770 // Examine the condition code of the user. 2771 X86::CondCode CC = getCondFromNode(*FlagUI); 2772 2773 switch (CC) { 2774 // Comparisons which only use the zero flag. 2775 case X86::COND_E: case X86::COND_NE: 2776 continue; 2777 // Anything else: assume conservatively. 2778 default: 2779 return false; 2780 } 2781 } 2782 } 2783 return true; 2784 } 2785 2786 /// Test whether the given X86ISD::CMP node has any uses which require the SF 2787 /// flag to be accurate. 2788 bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const { 2789 // Examine each user of the node. 2790 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end(); 2791 UI != UE; ++UI) { 2792 // Only check things that use the flags. 2793 if (UI.getUse().getResNo() != Flags.getResNo()) 2794 continue; 2795 // Only examine CopyToReg uses that copy to EFLAGS. 2796 if (UI->getOpcode() != ISD::CopyToReg || 2797 cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS) 2798 return false; 2799 // Examine each user of the CopyToReg use. 2800 for (SDNode::use_iterator FlagUI = UI->use_begin(), 2801 FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) { 2802 // Only examine the Flag result. 2803 if (FlagUI.getUse().getResNo() != 1) continue; 2804 // Anything unusual: assume conservatively. 
2805 if (!FlagUI->isMachineOpcode()) return false; 2806 // Examine the condition code of the user. 2807 X86::CondCode CC = getCondFromNode(*FlagUI); 2808 2809 switch (CC) { 2810 // Comparisons which don't examine the SF flag. 2811 case X86::COND_A: case X86::COND_AE: 2812 case X86::COND_B: case X86::COND_BE: 2813 case X86::COND_E: case X86::COND_NE: 2814 case X86::COND_O: case X86::COND_NO: 2815 case X86::COND_P: case X86::COND_NP: 2816 continue; 2817 // Anything else: assume conservatively. 2818 default: 2819 return false; 2820 } 2821 } 2822 } 2823 return true; 2824 } 2825 2826 static bool mayUseCarryFlag(X86::CondCode CC) { 2827 switch (CC) { 2828 // Comparisons which don't examine the CF flag. 2829 case X86::COND_O: case X86::COND_NO: 2830 case X86::COND_E: case X86::COND_NE: 2831 case X86::COND_S: case X86::COND_NS: 2832 case X86::COND_P: case X86::COND_NP: 2833 case X86::COND_L: case X86::COND_GE: 2834 case X86::COND_G: case X86::COND_LE: 2835 return false; 2836 // Anything else: assume conservatively. 2837 default: 2838 return true; 2839 } 2840 } 2841 2842 /// Test whether the given node which sets flags has any uses which require the 2843 /// CF flag to be accurate. 2844 bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const { 2845 // Examine each user of the node. 2846 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end(); 2847 UI != UE; ++UI) { 2848 // Only check things that use the flags. 2849 if (UI.getUse().getResNo() != Flags.getResNo()) 2850 continue; 2851 2852 unsigned UIOpc = UI->getOpcode(); 2853 2854 if (UIOpc == ISD::CopyToReg) { 2855 // Only examine CopyToReg uses that copy to EFLAGS. 2856 if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS) 2857 return false; 2858 // Examine each user of the CopyToReg use. 2859 for (SDNode::use_iterator FlagUI = UI->use_begin(), FlagUE = UI->use_end(); 2860 FlagUI != FlagUE; ++FlagUI) { 2861 // Only examine the Flag result. 
2862 if (FlagUI.getUse().getResNo() != 1) 2863 continue; 2864 // Anything unusual: assume conservatively. 2865 if (!FlagUI->isMachineOpcode()) 2866 return false; 2867 // Examine the condition code of the user. 2868 X86::CondCode CC = getCondFromNode(*FlagUI); 2869 2870 if (mayUseCarryFlag(CC)) 2871 return false; 2872 } 2873 2874 // This CopyToReg is ok. Move on to the next user. 2875 continue; 2876 } 2877 2878 // This might be an unselected node. So look for the pre-isel opcodes that 2879 // use flags. 2880 unsigned CCOpNo; 2881 switch (UIOpc) { 2882 default: 2883 // Something unusual. Be conservative. 2884 return false; 2885 case X86ISD::SETCC: CCOpNo = 0; break; 2886 case X86ISD::SETCC_CARRY: CCOpNo = 0; break; 2887 case X86ISD::CMOV: CCOpNo = 2; break; 2888 case X86ISD::BRCOND: CCOpNo = 2; break; 2889 } 2890 2891 X86::CondCode CC = (X86::CondCode)UI->getConstantOperandVal(CCOpNo); 2892 if (mayUseCarryFlag(CC)) 2893 return false; 2894 } 2895 return true; 2896 } 2897 2898 /// Check whether or not the chain ending in StoreNode is suitable for doing 2899 /// the {load; op; store} to modify transformation. 2900 static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode, 2901 SDValue StoredVal, SelectionDAG *CurDAG, 2902 unsigned LoadOpNo, 2903 LoadSDNode *&LoadNode, 2904 SDValue &InputChain) { 2905 // Is the stored value result 0 of the operation? 2906 if (StoredVal.getResNo() != 0) return false; 2907 2908 // Are there other uses of the operation other than the store? 2909 if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false; 2910 2911 // Is the store non-extending and non-indexed? 2912 if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal()) 2913 return false; 2914 2915 SDValue Load = StoredVal->getOperand(LoadOpNo); 2916 // Is the stored value a non-extending and non-indexed load? 2917 if (!ISD::isNormalLoad(Load.getNode())) return false; 2918 2919 // Return LoadNode by reference. 
2920 LoadNode = cast<LoadSDNode>(Load); 2921 2922 // Is store the only read of the loaded value? 2923 if (!Load.hasOneUse()) 2924 return false; 2925 2926 // Is the address of the store the same as the load? 2927 if (LoadNode->getBasePtr() != StoreNode->getBasePtr() || 2928 LoadNode->getOffset() != StoreNode->getOffset()) 2929 return false; 2930 2931 bool FoundLoad = false; 2932 SmallVector<SDValue, 4> ChainOps; 2933 SmallVector<const SDNode *, 4> LoopWorklist; 2934 SmallPtrSet<const SDNode *, 16> Visited; 2935 const unsigned int Max = 1024; 2936 2937 // Visualization of Load-Op-Store fusion: 2938 // ------------------------- 2939 // Legend: 2940 // *-lines = Chain operand dependencies. 2941 // |-lines = Normal operand dependencies. 2942 // Dependencies flow down and right. n-suffix references multiple nodes. 2943 // 2944 // C Xn C 2945 // * * * 2946 // * * * 2947 // Xn A-LD Yn TF Yn 2948 // * * \ | * | 2949 // * * \ | * | 2950 // * * \ | => A--LD_OP_ST 2951 // * * \| \ 2952 // TF OP \ 2953 // * | \ Zn 2954 // * | \ 2955 // A-ST Zn 2956 // 2957 2958 // This merge induced dependences from: #1: Xn -> LD, OP, Zn 2959 // #2: Yn -> LD 2960 // #3: ST -> Zn 2961 2962 // Ensure the transform is safe by checking for the dual 2963 // dependencies to make sure we do not induce a loop. 2964 2965 // As LD is a predecessor to both OP and ST we can do this by checking: 2966 // a). if LD is a predecessor to a member of Xn or Yn. 2967 // b). if a Zn is a predecessor to ST. 2968 2969 // However, (b) can only occur through being a chain predecessor to 2970 // ST, which is the same as Zn being a member or predecessor of Xn, 2971 // which is a subset of LD being a predecessor of Xn. So it's 2972 // subsumed by check (a). 2973 2974 SDValue Chain = StoreNode->getChain(); 2975 2976 // Gather X elements in ChainOps. 
2977 if (Chain == Load.getValue(1)) { 2978 FoundLoad = true; 2979 ChainOps.push_back(Load.getOperand(0)); 2980 } else if (Chain.getOpcode() == ISD::TokenFactor) { 2981 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) { 2982 SDValue Op = Chain.getOperand(i); 2983 if (Op == Load.getValue(1)) { 2984 FoundLoad = true; 2985 // Drop Load, but keep its chain. No cycle check necessary. 2986 ChainOps.push_back(Load.getOperand(0)); 2987 continue; 2988 } 2989 LoopWorklist.push_back(Op.getNode()); 2990 ChainOps.push_back(Op); 2991 } 2992 } 2993 2994 if (!FoundLoad) 2995 return false; 2996 2997 // Worklist is currently Xn. Add Yn to worklist. 2998 for (SDValue Op : StoredVal->ops()) 2999 if (Op.getNode() != LoadNode) 3000 LoopWorklist.push_back(Op.getNode()); 3001 3002 // Check (a) if Load is a predecessor to Xn + Yn 3003 if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max, 3004 true)) 3005 return false; 3006 3007 InputChain = 3008 CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps); 3009 return true; 3010 } 3011 3012 // Change a chain of {load; op; store} of the same value into a simple op 3013 // through memory of that value, if the uses of the modified value and its 3014 // address are suitable. 3015 // 3016 // The tablegen pattern memory operand pattern is currently not able to match 3017 // the case where the EFLAGS on the original operation are used. 3018 // 3019 // To move this to tablegen, we'll need to improve tablegen to allow flags to 3020 // be transferred from a node in the pattern to the result node, probably with 3021 // a new keyword. 
For example, we have this 3022 // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", 3023 // [(store (add (loadi64 addr:$dst), -1), addr:$dst), 3024 // (implicit EFLAGS)]>; 3025 // but maybe need something like this 3026 // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", 3027 // [(store (add (loadi64 addr:$dst), -1), addr:$dst), 3028 // (transferrable EFLAGS)]>; 3029 // 3030 // Until then, we manually fold these and instruction select the operation 3031 // here. 3032 bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) { 3033 StoreSDNode *StoreNode = cast<StoreSDNode>(Node); 3034 SDValue StoredVal = StoreNode->getOperand(1); 3035 unsigned Opc = StoredVal->getOpcode(); 3036 3037 // Before we try to select anything, make sure this is memory operand size 3038 // and opcode we can handle. Note that this must match the code below that 3039 // actually lowers the opcodes. 3040 EVT MemVT = StoreNode->getMemoryVT(); 3041 if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 && 3042 MemVT != MVT::i8) 3043 return false; 3044 3045 bool IsCommutable = false; 3046 bool IsNegate = false; 3047 switch (Opc) { 3048 default: 3049 return false; 3050 case X86ISD::SUB: 3051 IsNegate = isNullConstant(StoredVal.getOperand(0)); 3052 break; 3053 case X86ISD::SBB: 3054 break; 3055 case X86ISD::ADD: 3056 case X86ISD::ADC: 3057 case X86ISD::AND: 3058 case X86ISD::OR: 3059 case X86ISD::XOR: 3060 IsCommutable = true; 3061 break; 3062 } 3063 3064 unsigned LoadOpNo = IsNegate ? 1 : 0; 3065 LoadSDNode *LoadNode = nullptr; 3066 SDValue InputChain; 3067 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo, 3068 LoadNode, InputChain)) { 3069 if (!IsCommutable) 3070 return false; 3071 3072 // This operation is commutable, try the other operand. 
3073 LoadOpNo = 1; 3074 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo, 3075 LoadNode, InputChain)) 3076 return false; 3077 } 3078 3079 SDValue Base, Scale, Index, Disp, Segment; 3080 if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp, 3081 Segment)) 3082 return false; 3083 3084 auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16, 3085 unsigned Opc8) { 3086 switch (MemVT.getSimpleVT().SimpleTy) { 3087 case MVT::i64: 3088 return Opc64; 3089 case MVT::i32: 3090 return Opc32; 3091 case MVT::i16: 3092 return Opc16; 3093 case MVT::i8: 3094 return Opc8; 3095 default: 3096 llvm_unreachable("Invalid size!"); 3097 } 3098 }; 3099 3100 MachineSDNode *Result; 3101 switch (Opc) { 3102 case X86ISD::SUB: 3103 // Handle negate. 3104 if (IsNegate) { 3105 unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m, 3106 X86::NEG8m); 3107 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain}; 3108 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, 3109 MVT::Other, Ops); 3110 break; 3111 } 3112 LLVM_FALLTHROUGH; 3113 case X86ISD::ADD: 3114 // Try to match inc/dec. 3115 if (!Subtarget->slowIncDec() || CurDAG->shouldOptForSize()) { 3116 bool IsOne = isOneConstant(StoredVal.getOperand(1)); 3117 bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1)); 3118 // ADD/SUB with 1/-1 and carry flag isn't used can use inc/dec. 3119 if ((IsOne || IsNegOne) && hasNoCarryFlagUses(StoredVal.getValue(1))) { 3120 unsigned NewOpc = 3121 ((Opc == X86ISD::ADD) == IsOne) 3122 ? 
SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m) 3123 : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m); 3124 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain}; 3125 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, 3126 MVT::Other, Ops); 3127 break; 3128 } 3129 } 3130 LLVM_FALLTHROUGH; 3131 case X86ISD::ADC: 3132 case X86ISD::SBB: 3133 case X86ISD::AND: 3134 case X86ISD::OR: 3135 case X86ISD::XOR: { 3136 auto SelectRegOpcode = [SelectOpcode](unsigned Opc) { 3137 switch (Opc) { 3138 case X86ISD::ADD: 3139 return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr, 3140 X86::ADD8mr); 3141 case X86ISD::ADC: 3142 return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr, 3143 X86::ADC8mr); 3144 case X86ISD::SUB: 3145 return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr, 3146 X86::SUB8mr); 3147 case X86ISD::SBB: 3148 return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr, 3149 X86::SBB8mr); 3150 case X86ISD::AND: 3151 return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr, 3152 X86::AND8mr); 3153 case X86ISD::OR: 3154 return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr); 3155 case X86ISD::XOR: 3156 return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr, 3157 X86::XOR8mr); 3158 default: 3159 llvm_unreachable("Invalid opcode!"); 3160 } 3161 }; 3162 auto SelectImm8Opcode = [SelectOpcode](unsigned Opc) { 3163 switch (Opc) { 3164 case X86ISD::ADD: 3165 return SelectOpcode(X86::ADD64mi8, X86::ADD32mi8, X86::ADD16mi8, 0); 3166 case X86ISD::ADC: 3167 return SelectOpcode(X86::ADC64mi8, X86::ADC32mi8, X86::ADC16mi8, 0); 3168 case X86ISD::SUB: 3169 return SelectOpcode(X86::SUB64mi8, X86::SUB32mi8, X86::SUB16mi8, 0); 3170 case X86ISD::SBB: 3171 return SelectOpcode(X86::SBB64mi8, X86::SBB32mi8, X86::SBB16mi8, 0); 3172 case X86ISD::AND: 3173 return SelectOpcode(X86::AND64mi8, X86::AND32mi8, X86::AND16mi8, 0); 3174 case X86ISD::OR: 3175 return SelectOpcode(X86::OR64mi8, 
X86::OR32mi8, X86::OR16mi8, 0); 3176 case X86ISD::XOR: 3177 return SelectOpcode(X86::XOR64mi8, X86::XOR32mi8, X86::XOR16mi8, 0); 3178 default: 3179 llvm_unreachable("Invalid opcode!"); 3180 } 3181 }; 3182 auto SelectImmOpcode = [SelectOpcode](unsigned Opc) { 3183 switch (Opc) { 3184 case X86ISD::ADD: 3185 return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi, 3186 X86::ADD8mi); 3187 case X86ISD::ADC: 3188 return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi, 3189 X86::ADC8mi); 3190 case X86ISD::SUB: 3191 return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi, 3192 X86::SUB8mi); 3193 case X86ISD::SBB: 3194 return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi, 3195 X86::SBB8mi); 3196 case X86ISD::AND: 3197 return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi, 3198 X86::AND8mi); 3199 case X86ISD::OR: 3200 return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi, 3201 X86::OR8mi); 3202 case X86ISD::XOR: 3203 return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi, 3204 X86::XOR8mi); 3205 default: 3206 llvm_unreachable("Invalid opcode!"); 3207 } 3208 }; 3209 3210 unsigned NewOpc = SelectRegOpcode(Opc); 3211 SDValue Operand = StoredVal->getOperand(1-LoadOpNo); 3212 3213 // See if the operand is a constant that we can fold into an immediate 3214 // operand. 3215 if (auto *OperandC = dyn_cast<ConstantSDNode>(Operand)) { 3216 int64_t OperandV = OperandC->getSExtValue(); 3217 3218 // Check if we can shrink the operand enough to fit in an immediate (or 3219 // fit into a smaller immediate) by negating it and switching the 3220 // operation. 3221 if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) && 3222 ((MemVT != MVT::i8 && !isInt<8>(OperandV) && isInt<8>(-OperandV)) || 3223 (MemVT == MVT::i64 && !isInt<32>(OperandV) && 3224 isInt<32>(-OperandV))) && 3225 hasNoCarryFlagUses(StoredVal.getValue(1))) { 3226 OperandV = -OperandV; 3227 Opc = Opc == X86ISD::ADD ? 
X86ISD::SUB : X86ISD::ADD; 3228 } 3229 3230 // First try to fit this into an Imm8 operand. If it doesn't fit, then try 3231 // the larger immediate operand. 3232 if (MemVT != MVT::i8 && isInt<8>(OperandV)) { 3233 Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT); 3234 NewOpc = SelectImm8Opcode(Opc); 3235 } else if (MemVT != MVT::i64 || isInt<32>(OperandV)) { 3236 Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT); 3237 NewOpc = SelectImmOpcode(Opc); 3238 } 3239 } 3240 3241 if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) { 3242 SDValue CopyTo = 3243 CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS, 3244 StoredVal.getOperand(2), SDValue()); 3245 3246 const SDValue Ops[] = {Base, Scale, Index, Disp, 3247 Segment, Operand, CopyTo, CopyTo.getValue(1)}; 3248 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other, 3249 Ops); 3250 } else { 3251 const SDValue Ops[] = {Base, Scale, Index, Disp, 3252 Segment, Operand, InputChain}; 3253 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other, 3254 Ops); 3255 } 3256 break; 3257 } 3258 default: 3259 llvm_unreachable("Invalid opcode!"); 3260 } 3261 3262 MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(), 3263 LoadNode->getMemOperand()}; 3264 CurDAG->setNodeMemRefs(Result, MemOps); 3265 3266 // Update Load Chain uses as well. 3267 ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1)); 3268 ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1)); 3269 ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0)); 3270 CurDAG->RemoveDeadNode(Node); 3271 return true; 3272 } 3273 3274 // See if this is an X & Mask that we can match to BEXTR/BZHI. 
// Where Mask is one of the following patterns:
//   a) x & (1 << nbits) - 1
//   b) x & ~(-1 << nbits)
//   c) x & (-1 >> (32 - y))
//   d) x << (32 - y) >> (32 - y)
//
// \p Node must be an ISD::AND or ISD::SRL of type i32 or i64. On a match the
// node is replaced by an X86ISD::BZHI (when BMI2 is available) or otherwise
// an X86ISD::BEXTR node, which is then selected, and true is returned.
// Returns false if no pattern matched or neither BMI nor BMI2 is available.
bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
  assert(
      (Node->getOpcode() == ISD::AND || Node->getOpcode() == ISD::SRL) &&
      "Should be either an and-mask, or right-shift after clearing high bits.");

  // BEXTR is BMI instruction, BZHI is BMI2 instruction. We need at least one.
  if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())
    return false;

  MVT NVT = Node->getSimpleValueType(0);

  // Only supported for 32 and 64 bits.
  if (NVT != MVT::i32 && NVT != MVT::i64)
    return false;

  // The bit count of the mask; filled in by whichever pattern matches.
  SDValue NBits;

  // If we have BMI2's BZHI, we are ok with multi-use patterns.
  // Else, if we only have BMI1's BEXTR, we require one-use.
  const bool CanHaveExtraUses = Subtarget->hasBMI2();
  auto checkUses = [CanHaveExtraUses](SDValue Op, unsigned NUses) {
    return CanHaveExtraUses ||
           Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo());
  };
  auto checkOneUse = [checkUses](SDValue Op) { return checkUses(Op, 1); };
  auto checkTwoUse = [checkUses](SDValue Op) { return checkUses(Op, 2); };

  // Step over a TRUNCATE that passes the use check; otherwise return V as is.
  auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) {
    if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) {
      assert(V.getSimpleValueType() == MVT::i32 &&
             V.getOperand(0).getSimpleValueType() == MVT::i64 &&
             "Expected i64 -> i32 truncation");
      V = V.getOperand(0);
    }
    return V;
  };

  // a) x & ((1 << nbits) + (-1))
  auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation,
                        &NBits](SDValue Mask) -> bool {
    // Match `add`. Must only have one use!
    if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask))
      return false;
    // We should be adding all-ones constant (i.e. subtracting one.)
    if (!isAllOnesConstant(Mask->getOperand(1)))
      return false;
    // Match `1 << nbits`. Might be truncated. Must only have one use!
    SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
    if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
      return false;
    if (!isOneConstant(M0->getOperand(0)))
      return false;
    NBits = M0->getOperand(1);
    return true;
  };

  // True if the low NVT-width bits of V (seen through a one-use truncation)
  // are known to be all ones.
  auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) {
    V = peekThroughOneUseTruncation(V);
    return CurDAG->MaskedValueIsAllOnes(
        V, APInt::getLowBitsSet(V.getSimpleValueType().getSizeInBits(),
                                NVT.getSizeInBits()));
  };

  // b) x & ~(-1 << nbits)
  auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation,
                        &NBits](SDValue Mask) -> bool {
    // Match `~()`. Must only have one use!
    if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask))
      return false;
    // The -1 only has to be all-ones for the final Node's NVT.
    if (!isAllOnes(Mask->getOperand(1)))
      return false;
    // Match `-1 << nbits`. Might be truncated. Must only have one use!
    SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
    if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
      return false;
    // The -1 only has to be all-ones for the final Node's NVT.
    if (!isAllOnes(M0->getOperand(0)))
      return false;
    NBits = M0->getOperand(1);
    return true;
  };

  // Match potentially-truncated (bitwidth - y)
  auto matchShiftAmt = [checkOneUse, &NBits](SDValue ShiftAmt,
                                             unsigned Bitwidth) {
    // Skip over a truncate of the shift amount.
    if (ShiftAmt.getOpcode() == ISD::TRUNCATE) {
      ShiftAmt = ShiftAmt.getOperand(0);
      // The trunc should have been the only user of the real shift amount.
      if (!checkOneUse(ShiftAmt))
        return false;
    }
    // Match the shift amount as: (bitwidth - y). It should go away, too.
    if (ShiftAmt.getOpcode() != ISD::SUB)
      return false;
    auto V0 = dyn_cast<ConstantSDNode>(ShiftAmt.getOperand(0));
    if (!V0 || V0->getZExtValue() != Bitwidth)
      return false;
    NBits = ShiftAmt.getOperand(1);
    return true;
  };

  // c) x & (-1 >> (32 - y))
  auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation,
                        matchShiftAmt](SDValue Mask) -> bool {
    // The mask itself may be truncated.
    Mask = peekThroughOneUseTruncation(Mask);
    unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits();
    // Match `l>>`. Must only have one use!
    if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask))
      return false;
    // We should be shifting truly all-ones constant.
    if (!isAllOnesConstant(Mask.getOperand(0)))
      return false;
    SDValue M1 = Mask.getOperand(1);
    // The shift amount should not be used externally.
    if (!checkOneUse(M1))
      return false;
    return matchShiftAmt(M1, Bitwidth);
  };

  // The value the low bits are extracted from.
  SDValue X;

  // d) x << (32 - y) >> (32 - y)
  auto matchPatternD = [checkOneUse, checkTwoUse, matchShiftAmt,
                        &X](SDNode *Node) -> bool {
    if (Node->getOpcode() != ISD::SRL)
      return false;
    SDValue N0 = Node->getOperand(0);
    if (N0->getOpcode() != ISD::SHL || !checkOneUse(N0))
      return false;
    unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits();
    SDValue N1 = Node->getOperand(1);
    SDValue N01 = N0->getOperand(1);
    // Both of the shifts must be by the exact same value.
    // There should not be any uses of the shift amount outside of the pattern.
    if (N1 != N01 || !checkTwoUse(N1))
      return false;
    if (!matchShiftAmt(N1, Bitwidth))
      return false;
    X = N0->getOperand(0);
    return true;
  };

  auto matchLowBitMask = [matchPatternA, matchPatternB,
                          matchPatternC](SDValue Mask) -> bool {
    return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask);
  };

  if (Node->getOpcode() == ISD::AND) {
    X = Node->getOperand(0);
    SDValue Mask = Node->getOperand(1);

    if (matchLowBitMask(Mask)) {
      // Great.
    } else {
      // The mask may be the other operand of the 'and'; retry swapped.
      std::swap(X, Mask);
      if (!matchLowBitMask(Mask))
        return false;
    }
  } else if (!matchPatternD(Node))
    return false;

  SDLoc DL(Node);

  // Truncate the shift amount.
  NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
  insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);

  // Insert 8-bit NBits into lowest 8 bits of 32-bit register.
  // All the other bits are undefined, we do not care about them.
  SDValue ImplDef = SDValue(
      CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0);
  insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef);

  SDValue SRIdxVal = CurDAG->getTargetConstant(X86::sub_8bit, DL, MVT::i32);
  insertDAGNode(*CurDAG, SDValue(Node, 0), SRIdxVal);
  NBits = SDValue(
      CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::i32, ImplDef,
                             NBits, SRIdxVal), 0);
  insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);

  if (Subtarget->hasBMI2()) {
    // Great, just emit the BZHI.
    if (NVT != MVT::i32) {
      // But have to place the bit count into the wide-enough register first.
      NBits = CurDAG->getNode(ISD::ANY_EXTEND, DL, NVT, NBits);
      insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
    }

    SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits);
    ReplaceNode(Node, Extract.getNode());
    SelectCode(Extract.getNode());
    return true;
  }

  // Else, if we do *NOT* have BMI2, let's find out if the 'X' is
  // *logically* shifted (potentially with one-use trunc in between),
  // and the truncation was the only use of the shift,
  // and if so look past one-use truncation.
  {
    SDValue RealX = peekThroughOneUseTruncation(X);
    // FIXME: only if the shift is one-use?
    if (RealX != X && RealX.getOpcode() == ISD::SRL)
      X = RealX;
  }

  MVT XVT = X.getSimpleValueType();

  // Else, emitting BEXTR requires one more step.
  // The 'control' of BEXTR has the pattern of:
  // [15...8 bit][ 7...0 bit] location
  // [ bit count][     shift] name
  // I.e. 0b000000011'00000001 means  (x >> 0b1) & 0b11

  // Shift NBits left by 8 bits, thus producing 'control'.
  // This makes the low 8 bits to be zero.
  // NOTE(review): unlike the other nodes created here, C8 is not passed
  // through insertDAGNode - confirm that this is intentional.
  SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);
  SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8);
  insertDAGNode(*CurDAG, SDValue(Node, 0), Control);

  // If the 'X' is *logically* shifted, we can fold that shift into 'control'.
  // FIXME: only if the shift is one-use?
  if (X.getOpcode() == ISD::SRL) {
    SDValue ShiftAmt = X.getOperand(1);
    X = X.getOperand(0);

    assert(ShiftAmt.getValueType() == MVT::i8 &&
           "Expected shift amount to be i8");

    // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
    // We could zext to i16 in some form, but we intentionally don't do that.
    SDValue OrigShiftAmt = ShiftAmt;
    ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt);
    insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt);

    // And now 'or' these low 8 bits of shift amount into the 'control'.
    Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt);
    insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
  }

  // But have to place the 'control' into the wide-enough register first.
  if (XVT != MVT::i32) {
    Control = CurDAG->getNode(ISD::ANY_EXTEND, DL, XVT, Control);
    insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
  }

  // And finally, form the BEXTR itself.
  SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, XVT, X, Control);

  // The 'X' was originally truncated. Do that now.
  if (XVT != NVT) {
    insertDAGNode(*CurDAG, SDValue(Node, 0), Extract);
    Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract);
  }

  ReplaceNode(Node, Extract.getNode());
  SelectCode(Extract.getNode());

  return true;
}

// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
  MVT NVT = Node->getSimpleValueType(0);
  SDLoc dl(Node);

  SDValue N0 = Node->getOperand(0);
  SDValue N1 = Node->getOperand(1);

  // If we have TBM we can use an immediate for the control. If we have BMI
  // we should only do this if the BEXTR instruction is implemented well.
  // Otherwise moving the control into a register makes this more costly.
  // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
  // hoisting the move immediate would make it worthwhile with a less optimal
  // BEXTR?
3557 bool PreferBEXTR = 3558 Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR()); 3559 if (!PreferBEXTR && !Subtarget->hasBMI2()) 3560 return nullptr; 3561 3562 // Must have a shift right. 3563 if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA) 3564 return nullptr; 3565 3566 // Shift can't have additional users. 3567 if (!N0->hasOneUse()) 3568 return nullptr; 3569 3570 // Only supported for 32 and 64 bits. 3571 if (NVT != MVT::i32 && NVT != MVT::i64) 3572 return nullptr; 3573 3574 // Shift amount and RHS of and must be constant. 3575 ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(N1); 3576 ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1)); 3577 if (!MaskCst || !ShiftCst) 3578 return nullptr; 3579 3580 // And RHS must be a mask. 3581 uint64_t Mask = MaskCst->getZExtValue(); 3582 if (!isMask_64(Mask)) 3583 return nullptr; 3584 3585 uint64_t Shift = ShiftCst->getZExtValue(); 3586 uint64_t MaskSize = countPopulation(Mask); 3587 3588 // Don't interfere with something that can be handled by extracting AH. 3589 // TODO: If we are able to fold a load, BEXTR might still be better than AH. 3590 if (Shift == 8 && MaskSize == 8) 3591 return nullptr; 3592 3593 // Make sure we are only using bits that were in the original value, not 3594 // shifted in. 3595 if (Shift + MaskSize > NVT.getSizeInBits()) 3596 return nullptr; 3597 3598 // BZHI, if available, is always fast, unlike BEXTR. But even if we decide 3599 // that we can't use BEXTR, it is only worthwhile using BZHI if the mask 3600 // does not fit into 32 bits. Load folding is not a sufficient reason. 3601 if (!PreferBEXTR && MaskSize <= 32) 3602 return nullptr; 3603 3604 SDValue Control; 3605 unsigned ROpc, MOpc; 3606 3607 if (!PreferBEXTR) { 3608 assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then."); 3609 // If we can't make use of BEXTR then we can't fuse shift+mask stages. 3610 // Let's perform the mask first, and apply shift later. 
Note that we need to 3611 // widen the mask to account for the fact that we'll apply shift afterwards! 3612 Control = CurDAG->getTargetConstant(Shift + MaskSize, dl, NVT); 3613 ROpc = NVT == MVT::i64 ? X86::BZHI64rr : X86::BZHI32rr; 3614 MOpc = NVT == MVT::i64 ? X86::BZHI64rm : X86::BZHI32rm; 3615 unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri; 3616 Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0); 3617 } else { 3618 // The 'control' of BEXTR has the pattern of: 3619 // [15...8 bit][ 7...0 bit] location 3620 // [ bit count][ shift] name 3621 // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11 3622 Control = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT); 3623 if (Subtarget->hasTBM()) { 3624 ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri; 3625 MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi; 3626 } else { 3627 assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then."); 3628 // BMI requires the immediate to placed in a register. 3629 ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr; 3630 MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm; 3631 unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri; 3632 Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0); 3633 } 3634 } 3635 3636 MachineSDNode *NewNode; 3637 SDValue Input = N0->getOperand(0); 3638 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; 3639 if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { 3640 SDValue Ops[] = { 3641 Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(0)}; 3642 SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other); 3643 NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); 3644 // Update the chain. 
3645 ReplaceUses(Input.getValue(1), SDValue(NewNode, 2)); 3646 // Record the mem-refs 3647 CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()}); 3648 } else { 3649 NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, Control); 3650 } 3651 3652 if (!PreferBEXTR) { 3653 // We still need to apply the shift. 3654 SDValue ShAmt = CurDAG->getTargetConstant(Shift, dl, NVT); 3655 unsigned NewOpc = NVT == MVT::i64 ? X86::SHR64ri : X86::SHR32ri; 3656 NewNode = 3657 CurDAG->getMachineNode(NewOpc, dl, NVT, SDValue(NewNode, 0), ShAmt); 3658 } 3659 3660 return NewNode; 3661 } 3662 3663 // Emit a PCMISTR(I/M) instruction. 3664 MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc, 3665 bool MayFoldLoad, const SDLoc &dl, 3666 MVT VT, SDNode *Node) { 3667 SDValue N0 = Node->getOperand(0); 3668 SDValue N1 = Node->getOperand(1); 3669 SDValue Imm = Node->getOperand(2); 3670 const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue(); 3671 Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType()); 3672 3673 // Try to fold a load. No need to check alignment. 3674 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; 3675 if (MayFoldLoad && tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { 3676 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm, 3677 N1.getOperand(0) }; 3678 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other); 3679 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); 3680 // Update the chain. 3681 ReplaceUses(N1.getValue(1), SDValue(CNode, 2)); 3682 // Record the mem-refs 3683 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()}); 3684 return CNode; 3685 } 3686 3687 SDValue Ops[] = { N0, N1, Imm }; 3688 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32); 3689 MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops); 3690 return CNode; 3691 } 3692 3693 // Emit a PCMESTR(I/M) instruction. 
// Also return the Glue result in case we need
// to emit a second instruction after this one. This is needed since we have two
// copyToReg nodes glued before this and we need to continue that glue through.
//
// Operands 0 and 2 of Node are the two sources and operand 4 is the control
// immediate, which must be a ConstantSDNode. InFlag is both an input (the
// incoming glue, passed as the last operand) and an output (set to the glue
// result of the node that was built).
MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
                                             bool MayFoldLoad, const SDLoc &dl,
                                             MVT VT, SDNode *Node,
                                             SDValue &InFlag) {
  SDValue N0 = Node->getOperand(0);
  SDValue N2 = Node->getOperand(2);
  SDValue Imm = Node->getOperand(4);
  const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
  Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());

  // Try to fold a load. No need to check alignment.
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  if (MayFoldLoad && tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
    SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
                      N2.getOperand(0), InFlag };
    SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
    MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
    // Memory form: result 2 is the chain, result 3 is the glue.
    InFlag = SDValue(CNode, 3);
    // Update the chain.
    ReplaceUses(N2.getValue(1), SDValue(CNode, 2));
    // Record the mem-refs
    CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N2)->getMemOperand()});
    return CNode;
  }

  SDValue Ops[] = { N0, N2, Imm, InFlag };
  SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue);
  MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
  // Register form: there is no chain, so the glue is result 2.
  InFlag = SDValue(CNode, 2);
  return CNode;
}

// Try to simplify the shift amount of the scalar shift N by exploiting the
// fact that only the low bits of the amount matter (modulo Size, where Size
// is 64 for i64 and 32 for everything else):
//   shift X, (A +/- C)  -->  shift X, A      when C % Size == 0
//   shift X, (C - A)    -->  shift X, (0-A)  when C != 0 and C % Size == 0
// A one-level truncate of the shift amount is looked through. Returns true if
// N was updated and handed to the normal selector (or replaced by an existing
// equivalent node), false if nothing was changed.
bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
  EVT VT = N->getValueType(0);

  // Only handle scalar shifts.
  if (VT.isVector())
    return false;

  // Narrower shifts only mask to 5 bits in hardware.
  unsigned Size = VT == MVT::i64 ? 64 : 32;

  SDValue OrigShiftAmt = N->getOperand(1);
  SDValue ShiftAmt = OrigShiftAmt;
  SDLoc DL(N);

  // Skip over a truncate of the shift amount.
  if (ShiftAmt->getOpcode() == ISD::TRUNCATE)
    ShiftAmt = ShiftAmt->getOperand(0);

  // This function is called after X86DAGToDAGISel::matchBitExtract(),
  // so we are not afraid that we might mess up BZHI/BEXTR pattern.

  SDValue NewShiftAmt;
  if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB) {
    SDValue Add0 = ShiftAmt->getOperand(0);
    SDValue Add1 = ShiftAmt->getOperand(1);
    // If we are shifting by X+/-N where N == 0 mod Size, then just shift by X
    // to avoid the ADD/SUB.
    if (isa<ConstantSDNode>(Add1) &&
        cast<ConstantSDNode>(Add1)->getZExtValue() % Size == 0) {
      NewShiftAmt = Add0;
    // If we are shifting by N-X where N == 0 mod Size, then just shift by -X to
    // generate a NEG instead of a SUB of a constant.
    } else if (ShiftAmt->getOpcode() == ISD::SUB &&
               isa<ConstantSDNode>(Add0) &&
               cast<ConstantSDNode>(Add0)->getZExtValue() != 0 &&
               cast<ConstantSDNode>(Add0)->getZExtValue() % Size == 0) {
      // Insert a negate op.
      // TODO: This isn't guaranteed to replace the sub if there is a logic cone
      // that uses it that's not a shift.
      EVT SubVT = ShiftAmt.getValueType();
      SDValue Zero = CurDAG->getConstant(0, DL, SubVT);
      SDValue Neg = CurDAG->getNode(ISD::SUB, DL, SubVT, Zero, Add1);
      NewShiftAmt = Neg;

      // Insert these operands into a valid topological order so they can
      // get selected independently.
      insertDAGNode(*CurDAG, OrigShiftAmt, Zero);
      insertDAGNode(*CurDAG, OrigShiftAmt, Neg);
    } else
      return false;
  } else
    return false;

  if (NewShiftAmt.getValueType() != MVT::i8) {
    // Need to truncate the shift amount.
    NewShiftAmt = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NewShiftAmt);
    // Add to a correct topological ordering.
    insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
  }

  // Insert a new mask to keep the shift amount legal. This should be removed
  // by isel patterns.
  NewShiftAmt = CurDAG->getNode(ISD::AND, DL, MVT::i8, NewShiftAmt,
                                CurDAG->getConstant(Size - 1, DL, MVT::i8));
  // Place in a correct topological ordering.
  insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);

  SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, N->getOperand(0),
                                                   NewShiftAmt);
  if (UpdatedNode != N) {
    // If we found an existing node, we should replace ourselves with that node
    // and wait for it to be selected after its other users.
    ReplaceNode(N, UpdatedNode);
    return true;
  }

  // If the original shift amount is now dead, delete it so that we don't run
  // it through isel.
  if (OrigShiftAmt.getNode()->use_empty())
    CurDAG->RemoveDeadNode(OrigShiftAmt.getNode());

  // Now that we've optimized the shift amount, defer to normal isel to get
  // load folding and legacy vs BMI2 selection without repeating it here.
  SelectCode(N);
  return true;
}

// Try to rewrite the logic op N = ((x << C1) op C2) as ((x op (C2 >> C1)) << C1)
// when the shifted constant has a smaller encoding. Only i32/i64 are handled,
// the shift must be single-use with a constant amount, and a single-use
// any_extend from i32 between the shift and the logic op is looked through
// when the constant fits in 32 unsigned bits. Returns true if N was replaced
// and the replacement selected.
bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
  MVT NVT = N->getSimpleValueType(0);
  unsigned Opcode = N->getOpcode();
  SDLoc dl(N);

  // For operations of the form (x << C1) op C2, check if we can use a smaller
  // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
  SDValue Shift = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
  if (!Cst)
    return false;

  int64_t Val = Cst->getSExtValue();

  // If we have an any_extend feeding the AND, look through it to see if there
  // is a shift behind it. But only if the AND doesn't use the extended bits.
  // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
  bool FoundAnyExtend = false;
  if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
      Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
      isUInt<32>(Val)) {
    FoundAnyExtend = true;
    Shift = Shift.getOperand(0);
  }

  if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
    return false;

  // i8 is unshrinkable, i16 should be promoted to i32.
  if (NVT != MVT::i32 && NVT != MVT::i64)
    return false;

  ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
  if (!ShlCst)
    return false;

  uint64_t ShAmt = ShlCst->getZExtValue();

  // Make sure that we don't change the operation by removing bits.
  // This only matters for OR and XOR, AND is unaffected.
  // NOTE(review): the shifts by ShAmt here and below assume the shift amount
  // is smaller than 64 - confirm out-of-range constant shifts cannot reach
  // this point.
  uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
  if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
    return false;

  // Check the minimum bitwidth for the new constant.
  // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
  // On a 'true' return ShiftedVal holds the constant to use for the new op.
  auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
    if (Opcode == ISD::AND) {
      // AND32ri is the same as AND64ri32 with zext imm.
      // Try this before sign extended immediates below.
      ShiftedVal = (uint64_t)Val >> ShAmt;
      if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
        return true;
      // Also swap order when the AND can become MOVZX.
      if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX)
        return true;
    }
    ShiftedVal = Val >> ShAmt;
    if ((!isInt<8>(Val) && isInt<8>(ShiftedVal)) ||
        (!isInt<32>(Val) && isInt<32>(ShiftedVal)))
      return true;
    if (Opcode != ISD::AND) {
      // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
      ShiftedVal = (uint64_t)Val >> ShAmt;
      if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
        return true;
    }
    return false;
  };

  int64_t ShiftedVal;
  if (!CanShrinkImmediate(ShiftedVal))
    return false;

  // Ok, we can reorder to get a smaller immediate.

  // But, it's possible the original immediate allowed an AND to become MOVZX.
  // This check is done late so that the MaskedValueIsZero call is made only
  // when everything else has already succeeded.
  if (Opcode == ISD::AND) {
    // Find the smallest zext this could possibly be.
    unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
    ZExtWidth = PowerOf2Ceil(std::max(ZExtWidth, 8U));

    // Figure out which bits need to be zero to achieve that mask.
    APInt NeededMask = APInt::getLowBitsSet(NVT.getSizeInBits(),
                                            ZExtWidth);
    NeededMask &= ~Cst->getAPIntValue();

    if (CurDAG->MaskedValueIsZero(N->getOperand(0), NeededMask))
      return false;
  }

  SDValue X = Shift.getOperand(0);
  if (FoundAnyExtend) {
    // Re-create the any_extend directly on the un-shifted value.
    SDValue NewX = CurDAG->getNode(ISD::ANY_EXTEND, dl, NVT, X);
    insertDAGNode(*CurDAG, SDValue(N, 0), NewX);
    X = NewX;
  }

  SDValue NewCst = CurDAG->getConstant(ShiftedVal, dl, NVT);
  insertDAGNode(*CurDAG, SDValue(N, 0), NewCst);
  SDValue NewBinOp = CurDAG->getNode(Opcode, dl, NVT, X, NewCst);
  insertDAGNode(*CurDAG, SDValue(N, 0), NewBinOp);
  SDValue NewSHL = CurDAG->getNode(ISD::SHL, dl, NVT, NewBinOp,
                                   Shift.getOperand(1));
  ReplaceNode(N, NewSHL.getNode());
  SelectCode(NewSHL.getNode());
  return true;
}

// Try to match two logic ops to a VPTERNLOG.
// FIXME: Handle inverted inputs?
// FIXME: Handle more complex patterns that use an operand more than once?
//
// N is the outer logic op (Opc1); one of its operands must be a single-use
// logic op (Opc2), with operand 1 of N tried first. The node built is
// VPTERNLOG(A, B, C, Imm) where A is the other operand of N and B, C are the
// operands of the inner op. Returns true if N was replaced.
bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
  MVT NVT = N->getSimpleValueType(0);

  // Make sure we support VPTERNLOG.
  if (!NVT.isVector() || !Subtarget->hasAVX512() ||
      NVT.getVectorElementType() == MVT::i1)
    return false;

  // We need VLX for 128/256-bit.
  if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
    return false;

  unsigned Opc1 = N->getOpcode();
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  auto isLogicOp = [](unsigned Opc) {
    return Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR ||
           Opc == X86ISD::ANDNP;
  };

  SDValue A, B, C;
  unsigned Opc2;
  if (isLogicOp(N1.getOpcode()) && N1.hasOneUse()) {
    Opc2 = N1.getOpcode();
    A = N0;
    B = N1.getOperand(0);
    C = N1.getOperand(1);
  } else if (isLogicOp(N0.getOpcode()) && N0.hasOneUse()) {
    Opc2 = N0.getOpcode();
    A = N1;
    B = N0.getOperand(0);
    C = N0.getOperand(1);
  } else
    return false;

  // Each immediate below equals Opc1(A, Opc2(B, C)) evaluated bitwise with
  // A = 0xf0, B = 0xcc, C = 0xaa, treating ANDNP(B, C) as (~B & C).
  uint64_t Imm;
  switch (Opc1) {
  default: llvm_unreachable("Unexpected opcode!");
  case ISD::AND:
    switch (Opc2) {
    default: llvm_unreachable("Unexpected opcode!");
    case ISD::AND:      Imm = 0x80; break;
    case ISD::OR:       Imm = 0xe0; break;
    case ISD::XOR:      Imm = 0x60; break;
    case X86ISD::ANDNP: Imm = 0x20; break;
    }
    break;
  case ISD::OR:
    switch (Opc2) {
    default: llvm_unreachable("Unexpected opcode!");
    case ISD::AND:      Imm = 0xf8; break;
    case ISD::OR:       Imm = 0xfe; break;
    case ISD::XOR:      Imm = 0xf6; break;
    case X86ISD::ANDNP: Imm = 0xf2; break;
    }
    break;
  case ISD::XOR:
    switch (Opc2) {
    default: llvm_unreachable("Unexpected opcode!");
    case ISD::AND:      Imm = 0x78; break;
    case ISD::OR:       Imm = 0x1e; break;
    case ISD::XOR:      Imm = 0x96; break;
    case X86ISD::ANDNP: Imm = 0xd2; break;
    }
    break;
  }

  SDLoc DL(N);
  SDValue New = CurDAG->getNode(X86ISD::VPTERNLOG, DL, NVT, A, B, C,
                                CurDAG->getTargetConstant(Imm, DL, MVT::i8));
  ReplaceNode(N, New.getNode());
  SelectCode(New.getNode());
  return true;
}

/// If the high bits of an 'and' operand are known zero, try setting the
/// high bits of an
/// 'and' constant operand to produce a smaller encoding by
/// creating a small, sign-extended negative immediate rather than a large
/// positive one. This reverses a transform in SimplifyDemandedBits that
/// shrinks mask constants by clearing bits. There is also a possibility that
/// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
/// case, just replace the 'and'. Return 'true' if the node is replaced.
bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
  // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
  // have immediate operands.
  MVT VT = And->getSimpleValueType(0);
  if (VT != MVT::i32 && VT != MVT::i64)
    return false;

  auto *And1C = dyn_cast<ConstantSDNode>(And->getOperand(1));
  if (!And1C)
    return false;

  // Bail out if the mask constant is already negative. It can't shrink more.
  // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel
  // patterns to use a 32-bit and instead of a 64-bit and by relying on the
  // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits
  // are negative too.
  APInt MaskVal = And1C->getAPIntValue();
  unsigned MaskLZ = MaskVal.countLeadingZeros();
  if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
    return false;

  // Don't extend into the upper 32 bits of a 64 bit mask.
  if (VT == MVT::i64 && MaskLZ >= 32) {
    MaskLZ -= 32;
    MaskVal = MaskVal.trunc(32);
  }

  SDValue And0 = And->getOperand(0);
  // HighZeros marks the leading zero bits of the mask; NegMaskVal is the mask
  // with those bits turned on.
  APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ);
  APInt NegMaskVal = MaskVal | HighZeros;

  // If a negative constant would not allow a smaller encoding, there's no need
  // to continue. Only change the constant when we know it's a win.
  unsigned MinWidth = NegMaskVal.getMinSignedBits();
  if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getMinSignedBits() <= 32))
    return false;

  // Extend masks if we truncated above.
  if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
    NegMaskVal = NegMaskVal.zext(64);
    HighZeros = HighZeros.zext(64);
  }

  // The variable operand must be all zeros in the top bits to allow using the
  // new, negative constant as the mask.
  if (!CurDAG->MaskedValueIsZero(And0, HighZeros))
    return false;

  // Check if the mask is -1. In that case, this is an unnecessary instruction
  // that escaped earlier analysis.
  if (NegMaskVal.isAllOnesValue()) {
    ReplaceNode(And, And0.getNode());
    return true;
  }

  // A negative mask allows a smaller encoding. Create a new 'and' node.
  SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT);
  SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask);
  ReplaceNode(And, NewAnd.getNode());
  SelectCode(NewAnd.getNode());
  return true;
}

// Return the VPTESTM / VPTESTNM opcode for the given vector type.
//  - IsTestN selects the VPTESTNM variant over VPTESTM.
//  - Masked selects the 'k' (write-masked) variant.
//  - FoldedLoad selects the 'rm' form and takes priority over FoldedBCast,
//    which selects the 'rmb' form; with neither set the 'rr' form is used.
// The broadcast form is only available for vectors of 32/64-bit elements; any
// other type reaches llvm_unreachable.
static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
                              bool FoldedBCast, bool Masked) {
#define VPTESTM_CASE(VT, SUFFIX) \
case MVT::VT: \
  if (Masked) \
    return IsTestN ? X86::VPTESTNM##SUFFIX##k: X86::VPTESTM##SUFFIX##k; \
  return IsTestN ? X86::VPTESTNM##SUFFIX : X86::VPTESTM##SUFFIX;


#define VPTESTM_BROADCAST_CASES(SUFFIX) \
default: llvm_unreachable("Unexpected VT!"); \
VPTESTM_CASE(v4i32, DZ128##SUFFIX) \
VPTESTM_CASE(v2i64, QZ128##SUFFIX) \
VPTESTM_CASE(v8i32, DZ256##SUFFIX) \
VPTESTM_CASE(v4i64, QZ256##SUFFIX) \
VPTESTM_CASE(v16i32, DZ##SUFFIX) \
VPTESTM_CASE(v8i64, QZ##SUFFIX)

#define VPTESTM_FULL_CASES(SUFFIX) \
VPTESTM_BROADCAST_CASES(SUFFIX) \
VPTESTM_CASE(v16i8, BZ128##SUFFIX) \
VPTESTM_CASE(v8i16, WZ128##SUFFIX) \
VPTESTM_CASE(v32i8, BZ256##SUFFIX) \
VPTESTM_CASE(v16i16, WZ256##SUFFIX) \
VPTESTM_CASE(v64i8, BZ##SUFFIX) \
VPTESTM_CASE(v32i16, WZ##SUFFIX)

  // Every case in the switches below returns, and the default case is
  // unreachable, so control never falls from one switch into the next.
  if (FoldedLoad) {
    switch (TestVT.SimpleTy) {
    VPTESTM_FULL_CASES(rm)
    }
  }

  if (FoldedBCast) {
    switch (TestVT.SimpleTy) {
    VPTESTM_BROADCAST_CASES(rmb)
    }
  }

  switch (TestVT.SimpleTy) {
  VPTESTM_FULL_CASES(rr)
  }

#undef VPTESTM_FULL_CASES
#undef VPTESTM_BROADCAST_CASES
#undef VPTESTM_CASE
}

// Try to create VPTESTM instruction. If InMask is not null, it will be used
// to form a masked operation.
//
// Root is the node whose result is replaced (and which is removed) on success.
// Setcc must be an i1-vector compare; only SETEQ/SETNE against an all-zeros
// build vector is handled. Returns true if Root was replaced.
bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
                                 SDValue InMask) {
  assert(Subtarget->hasAVX512() && "Expected AVX512!");
  assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&
         "Unexpected VT!");

  // Look for equal and not equal compares.
  ISD::CondCode CC = cast<CondCodeSDNode>(Setcc.getOperand(2))->get();
  if (CC != ISD::SETEQ && CC != ISD::SETNE)
    return false;

  SDValue SetccOp0 = Setcc.getOperand(0);
  SDValue SetccOp1 = Setcc.getOperand(1);

  // Canonicalize the all zero vector to the RHS.
  if (ISD::isBuildVectorAllZeros(SetccOp0.getNode()))
    std::swap(SetccOp0, SetccOp1);

  // See if we're comparing against zero.
  if (!ISD::isBuildVectorAllZeros(SetccOp1.getNode()))
    return false;

  SDValue N0 = SetccOp0;

  MVT CmpVT = N0.getSimpleValueType();
  MVT CmpSVT = CmpVT.getVectorElementType();

  // Start with both operands the same. We'll try to refine this.
  SDValue Src0 = N0;
  SDValue Src1 = N0;

  {
    // Look through single use bitcasts.
    SDValue N0Temp = N0;
    if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse())
      N0Temp = N0.getOperand(0);

    // Look for single use AND.
    if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) {
      Src0 = N0Temp.getOperand(0);
      Src1 = N0Temp.getOperand(1);
    }
  }

  // Without VLX we need to widen the load.
  bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector();

  // We can only fold loads if the sources are unique.
  bool CanFoldLoads = Src0 != Src1;

  // Try to fold loads unless we need to widen.
  bool FoldedLoad = false;
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Load;
  if (!Widen && CanFoldLoads) {
    Load = Src1;
    FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2, Tmp3,
                             Tmp4);
    if (!FoldedLoad) {
      // And is commutative.
      Load = Src0;
      FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2,
                               Tmp3, Tmp4);
      if (FoldedLoad)
        std::swap(Src0, Src1);
    }
  }

  // Returns the single-use VBROADCAST_LOAD feeding Src (possibly through a
  // single-use bitcast, in which case Parent is updated to that bitcast) when
  // its memory type has the same width as the compare element type; otherwise
  // returns an empty SDValue.
  auto findBroadcastedOp = [](SDValue Src, MVT CmpSVT, SDNode *&Parent) {
    // Look through single use bitcasts.
    if (Src.getOpcode() == ISD::BITCAST && Src.hasOneUse()) {
      Parent = Src.getNode();
      Src = Src.getOperand(0);
    }

    if (Src.getOpcode() == X86ISD::VBROADCAST_LOAD && Src.hasOneUse()) {
      auto *MemIntr = cast<MemIntrinsicSDNode>(Src);
      if (MemIntr->getMemoryVT().getSizeInBits() == CmpSVT.getSizeInBits())
        return Src;
    }

    return SDValue();
  };

  // If we didn't fold a load, try to match broadcast. No widening limitation
  // for this. But only 32 and 64 bit types are supported.
  bool FoldedBCast = false;
  if (!FoldedLoad && CanFoldLoads &&
      (CmpSVT == MVT::i32 || CmpSVT == MVT::i64)) {
    SDNode *ParentNode = N0.getNode();
    if ((Load = findBroadcastedOp(Src1, CmpSVT, ParentNode))) {
      FoldedBCast = tryFoldBroadcast(Root, ParentNode, Load, Tmp0,
                                     Tmp1, Tmp2, Tmp3, Tmp4);
    }

    // Try the other operand.
    if (!FoldedBCast) {
      SDNode *ParentNode = N0.getNode();
      if ((Load = findBroadcastedOp(Src0, CmpSVT, ParentNode))) {
        FoldedBCast = tryFoldBroadcast(Root, ParentNode, Load, Tmp0,
                                       Tmp1, Tmp2, Tmp3, Tmp4);
        if (FoldedBCast)
          std::swap(Src0, Src1);
      }
    }
  }

  // Map an i1 vector type to the ID of the mask register class of that width.
  auto getMaskRC = [](MVT MaskVT) {
    switch (MaskVT.SimpleTy) {
    default: llvm_unreachable("Unexpected VT!");
    case MVT::v2i1:  return X86::VK2RegClassID;
    case MVT::v4i1:  return X86::VK4RegClassID;
    case MVT::v8i1:  return X86::VK8RegClassID;
    case MVT::v16i1: return X86::VK16RegClassID;
    case MVT::v32i1: return X86::VK32RegClassID;
    case MVT::v64i1: return X86::VK64RegClassID;
    }
  };

  bool IsMasked = InMask.getNode() != nullptr;

  SDLoc dl(Root);

  MVT ResVT = Setcc.getSimpleValueType();
  MVT MaskVT = ResVT;
  if (Widen) {
    // Widen the inputs using insert_subreg or copy_to_regclass.
    // 128-bit inputs are scaled by 4 and 256-bit inputs by 2, i.e. both end
    // up as 512-bit vectors.
    unsigned Scale = CmpVT.is128BitVector() ? 4 : 2;
    unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm;
    unsigned NumElts = CmpVT.getVectorNumElements() * Scale;
    CmpVT = MVT::getVectorVT(CmpSVT, NumElts);
    MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
    SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl,
                                                     CmpVT), 0);
    Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0);

    assert(!FoldedLoad && "Shouldn't have folded the load");
    if (!FoldedBCast)
      Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1);

    if (IsMasked) {
      // Widen the mask.
      unsigned RegClass = getMaskRC(MaskVT);
      SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
      InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
                                              dl, MaskVT, InMask, RC), 0);
    }
  }

  // An EQ-with-zero compare maps to the TESTN form, NE-with-zero to TESTM.
  bool IsTestN = CC == ISD::SETEQ;
  unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast,
                               IsMasked);

  MachineSDNode *CNode;
  if (FoldedLoad || FoldedBCast) {
    SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other);

    if (IsMasked) {
      SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
                        Load.getOperand(0) };
      CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
    } else {
      SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
                        Load.getOperand(0) };
      CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
    }

    // Update the chain.
    ReplaceUses(Load.getValue(1), SDValue(CNode, 1));
    // Record the mem-refs
    CurDAG->setNodeMemRefs(CNode, {cast<MemSDNode>(Load)->getMemOperand()});
  } else {
    if (IsMasked)
      CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1);
    else
      CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, Src0, Src1);
  }

  // If we widened, we need to shrink the mask VT.
  if (Widen) {
    unsigned RegClass = getMaskRC(ResVT);
    SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
    CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
                                   dl, ResVT, SDValue(CNode, 0), RC);
  }

  ReplaceUses(SDValue(Root, 0), SDValue(CNode, 0));
  CurDAG->RemoveDeadNode(Root);
  return true;
}

// Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it
// into vpternlog.
//
// N must be an ISD::OR (asserted). Both the AND and the ANDNP operand must be
// single-use. Returns true if N was replaced by a VPTERNLOG node.
bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) {
  assert(N->getOpcode() == ISD::OR && "Unexpected opcode!");

  MVT NVT = N->getSimpleValueType(0);

  // Make sure we support VPTERNLOG.
  if (!NVT.isVector() || !Subtarget->hasAVX512())
    return false;

  // We need VLX for 128/256-bit.
  if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
    return false;

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // Canonicalize AND to LHS.
  if (N1.getOpcode() == ISD::AND)
    std::swap(N0, N1);

  if (N0.getOpcode() != ISD::AND ||
      N1.getOpcode() != X86ISD::ANDNP ||
      !N0.hasOneUse() || !N1.hasOneUse())
    return false;

  // ANDN is not commutable, use it to pick down A and C.
  SDValue A = N1.getOperand(0);
  SDValue C = N1.getOperand(1);

  // AND is commutable, if one operand matches A, the other operand is B.
  // Otherwise this isn't a match.
  SDValue B;
  if (N0.getOperand(0) == A)
    B = N0.getOperand(1);
  else if (N0.getOperand(1) == A)
    B = N0.getOperand(0);
  else
    return false;

  SDLoc dl(N);
  // 0xCA is (A & B) | (~A & C) evaluated with A = 0xf0, B = 0xcc, C = 0xaa.
  SDValue Imm = CurDAG->getTargetConstant(0xCA, dl, MVT::i8);
  SDValue Ternlog = CurDAG->getNode(X86ISD::VPTERNLOG, dl, NVT, A, B, C, Imm);
  ReplaceNode(N, Ternlog.getNode());
  SelectCode(Ternlog.getNode());
  return true;
}

void X86DAGToDAGISel::Select(SDNode *Node) {
  MVT NVT = Node->getSimpleValueType(0);
  unsigned Opcode = Node->getOpcode();
  SDLoc dl(Node);

  if (Node->isMachineOpcode()) {
    LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
    Node->setNodeId(-1);
    return;   // Already selected.
  }

  switch (Opcode) {
  default: break;
  case ISD::INTRINSIC_VOID: {
    unsigned IntNo = Node->getConstantOperandVal(1);
    switch (IntNo) {
    default: break;
    case Intrinsic::x86_sse3_monitor:
    case Intrinsic::x86_monitorx:
    case Intrinsic::x86_clzero: {
      bool Use64BitPtr = Node->getOperand(2).getValueType() == MVT::i64;

      unsigned Opc = 0;
      switch (IntNo) {
      default: llvm_unreachable("Unexpected intrinsic!");
      case Intrinsic::x86_sse3_monitor:
        if (!Subtarget->hasSSE3())
          break;
        Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr;
        break;
      case Intrinsic::x86_monitorx:
        if (!Subtarget->hasMWAITX())
          break;
        Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr;
        break;
      case Intrinsic::x86_clzero:
        if (!Subtarget->hasCLZERO())
          break;
        Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r;
        break;
      }

      if (Opc) {
        unsigned PtrReg = Use64BitPtr ?
X86::RAX : X86::EAX; 4410 SDValue Chain = CurDAG->getCopyToReg(Node->getOperand(0), dl, PtrReg, 4411 Node->getOperand(2), SDValue()); 4412 SDValue InFlag = Chain.getValue(1); 4413 4414 if (IntNo == Intrinsic::x86_sse3_monitor || 4415 IntNo == Intrinsic::x86_monitorx) { 4416 // Copy the other two operands to ECX and EDX. 4417 Chain = CurDAG->getCopyToReg(Chain, dl, X86::ECX, Node->getOperand(3), 4418 InFlag); 4419 InFlag = Chain.getValue(1); 4420 Chain = CurDAG->getCopyToReg(Chain, dl, X86::EDX, Node->getOperand(4), 4421 InFlag); 4422 InFlag = Chain.getValue(1); 4423 } 4424 4425 MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, 4426 { Chain, InFlag}); 4427 ReplaceNode(Node, CNode); 4428 return; 4429 } 4430 4431 break; 4432 } 4433 case Intrinsic::x86_tileloadd64: 4434 case Intrinsic::x86_tileloaddt164: 4435 case Intrinsic::x86_tilestored64: { 4436 if (!Subtarget->hasAMXTILE()) 4437 break; 4438 unsigned Opc; 4439 switch (IntNo) { 4440 default: llvm_unreachable("Unexpected intrinsic!"); 4441 case Intrinsic::x86_tileloadd64: Opc = X86::PTILELOADD; break; 4442 case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break; 4443 case Intrinsic::x86_tilestored64: Opc = X86::PTILESTORED; break; 4444 } 4445 // FIXME: Match displacement and scale. 
4446 unsigned TIndex = Node->getConstantOperandVal(2); 4447 SDValue TReg = getI8Imm(TIndex, dl); 4448 SDValue Base = Node->getOperand(3); 4449 SDValue Scale = getI8Imm(1, dl); 4450 SDValue Index = Node->getOperand(4); 4451 SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32); 4452 SDValue Segment = CurDAG->getRegister(0, MVT::i16); 4453 SDValue Chain = Node->getOperand(0); 4454 MachineSDNode *CNode; 4455 if (Opc == X86::PTILESTORED) { 4456 SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain }; 4457 CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); 4458 } else { 4459 SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain }; 4460 CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); 4461 } 4462 ReplaceNode(Node, CNode); 4463 return; 4464 } 4465 } 4466 break; 4467 } 4468 case ISD::BRIND: { 4469 if (Subtarget->isTargetNaCl()) 4470 // NaCl has its own pass where jmp %r32 are converted to jmp %r64. We 4471 // leave the instruction alone. 4472 break; 4473 if (Subtarget->isTarget64BitILP32()) { 4474 // Converts a 32-bit register to a 64-bit, zero-extended version of 4475 // it. This is needed because x86-64 can do many things, but jmp %r32 4476 // ain't one of them. 4477 SDValue Target = Node->getOperand(1); 4478 assert(Target.getValueType() == MVT::i32 && "Unexpected VT!"); 4479 SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, MVT::i64); 4480 SDValue Brind = CurDAG->getNode(ISD::BRIND, dl, MVT::Other, 4481 Node->getOperand(0), ZextTarget); 4482 ReplaceNode(Node, Brind.getNode()); 4483 SelectCode(ZextTarget.getNode()); 4484 SelectCode(Brind.getNode()); 4485 return; 4486 } 4487 break; 4488 } 4489 case X86ISD::GlobalBaseReg: 4490 ReplaceNode(Node, getGlobalBaseReg()); 4491 return; 4492 4493 case ISD::BITCAST: 4494 // Just drop all 128/256/512-bit bitcasts. 
4495 if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() || 4496 NVT == MVT::f128) { 4497 ReplaceUses(SDValue(Node, 0), Node->getOperand(0)); 4498 CurDAG->RemoveDeadNode(Node); 4499 return; 4500 } 4501 break; 4502 4503 case ISD::SRL: 4504 if (matchBitExtract(Node)) 4505 return; 4506 LLVM_FALLTHROUGH; 4507 case ISD::SRA: 4508 case ISD::SHL: 4509 if (tryShiftAmountMod(Node)) 4510 return; 4511 break; 4512 4513 case ISD::AND: 4514 if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) { 4515 // Try to form a masked VPTESTM. Operands can be in either order. 4516 SDValue N0 = Node->getOperand(0); 4517 SDValue N1 = Node->getOperand(1); 4518 if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() && 4519 tryVPTESTM(Node, N0, N1)) 4520 return; 4521 if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() && 4522 tryVPTESTM(Node, N1, N0)) 4523 return; 4524 } 4525 4526 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) { 4527 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0)); 4528 CurDAG->RemoveDeadNode(Node); 4529 return; 4530 } 4531 if (matchBitExtract(Node)) 4532 return; 4533 if (AndImmShrink && shrinkAndImmediate(Node)) 4534 return; 4535 4536 LLVM_FALLTHROUGH; 4537 case ISD::OR: 4538 case ISD::XOR: 4539 if (tryShrinkShlLogicImm(Node)) 4540 return; 4541 if (Opcode == ISD::OR && tryMatchBitSelect(Node)) 4542 return; 4543 if (tryVPTERNLOG(Node)) 4544 return; 4545 4546 LLVM_FALLTHROUGH; 4547 case ISD::ADD: 4548 case ISD::SUB: { 4549 // Try to avoid folding immediates with multiple uses for optsize. 4550 // This code tries to select to register form directly to avoid going 4551 // through the isel table which might fold the immediate. We can't change 4552 // the patterns on the add/sub/and/or/xor with immediate paterns in the 4553 // tablegen files to check immediate use count without making the patterns 4554 // unavailable to the fast-isel table. 4555 if (!CurDAG->shouldOptForSize()) 4556 break; 4557 4558 // Only handle i8/i16/i32/i64. 
4559 if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64) 4560 break; 4561 4562 SDValue N0 = Node->getOperand(0); 4563 SDValue N1 = Node->getOperand(1); 4564 4565 ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1); 4566 if (!Cst) 4567 break; 4568 4569 int64_t Val = Cst->getSExtValue(); 4570 4571 // Make sure its an immediate that is considered foldable. 4572 // FIXME: Handle unsigned 32 bit immediates for 64-bit AND. 4573 if (!isInt<8>(Val) && !isInt<32>(Val)) 4574 break; 4575 4576 // If this can match to INC/DEC, let it go. 4577 if (Opcode == ISD::ADD && (Val == 1 || Val == -1)) 4578 break; 4579 4580 // Check if we should avoid folding this immediate. 4581 if (!shouldAvoidImmediateInstFormsForSize(N1.getNode())) 4582 break; 4583 4584 // We should not fold the immediate. So we need a register form instead. 4585 unsigned ROpc, MOpc; 4586 switch (NVT.SimpleTy) { 4587 default: llvm_unreachable("Unexpected VT!"); 4588 case MVT::i8: 4589 switch (Opcode) { 4590 default: llvm_unreachable("Unexpected opcode!"); 4591 case ISD::ADD: ROpc = X86::ADD8rr; MOpc = X86::ADD8rm; break; 4592 case ISD::SUB: ROpc = X86::SUB8rr; MOpc = X86::SUB8rm; break; 4593 case ISD::AND: ROpc = X86::AND8rr; MOpc = X86::AND8rm; break; 4594 case ISD::OR: ROpc = X86::OR8rr; MOpc = X86::OR8rm; break; 4595 case ISD::XOR: ROpc = X86::XOR8rr; MOpc = X86::XOR8rm; break; 4596 } 4597 break; 4598 case MVT::i16: 4599 switch (Opcode) { 4600 default: llvm_unreachable("Unexpected opcode!"); 4601 case ISD::ADD: ROpc = X86::ADD16rr; MOpc = X86::ADD16rm; break; 4602 case ISD::SUB: ROpc = X86::SUB16rr; MOpc = X86::SUB16rm; break; 4603 case ISD::AND: ROpc = X86::AND16rr; MOpc = X86::AND16rm; break; 4604 case ISD::OR: ROpc = X86::OR16rr; MOpc = X86::OR16rm; break; 4605 case ISD::XOR: ROpc = X86::XOR16rr; MOpc = X86::XOR16rm; break; 4606 } 4607 break; 4608 case MVT::i32: 4609 switch (Opcode) { 4610 default: llvm_unreachable("Unexpected opcode!"); 4611 case ISD::ADD: ROpc = X86::ADD32rr; MOpc = 
X86::ADD32rm; break; 4612 case ISD::SUB: ROpc = X86::SUB32rr; MOpc = X86::SUB32rm; break; 4613 case ISD::AND: ROpc = X86::AND32rr; MOpc = X86::AND32rm; break; 4614 case ISD::OR: ROpc = X86::OR32rr; MOpc = X86::OR32rm; break; 4615 case ISD::XOR: ROpc = X86::XOR32rr; MOpc = X86::XOR32rm; break; 4616 } 4617 break; 4618 case MVT::i64: 4619 switch (Opcode) { 4620 default: llvm_unreachable("Unexpected opcode!"); 4621 case ISD::ADD: ROpc = X86::ADD64rr; MOpc = X86::ADD64rm; break; 4622 case ISD::SUB: ROpc = X86::SUB64rr; MOpc = X86::SUB64rm; break; 4623 case ISD::AND: ROpc = X86::AND64rr; MOpc = X86::AND64rm; break; 4624 case ISD::OR: ROpc = X86::OR64rr; MOpc = X86::OR64rm; break; 4625 case ISD::XOR: ROpc = X86::XOR64rr; MOpc = X86::XOR64rm; break; 4626 } 4627 break; 4628 } 4629 4630 // Ok this is a AND/OR/XOR/ADD/SUB with constant. 4631 4632 // If this is a not a subtract, we can still try to fold a load. 4633 if (Opcode != ISD::SUB) { 4634 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; 4635 if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { 4636 SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) }; 4637 SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other); 4638 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); 4639 // Update the chain. 4640 ReplaceUses(N0.getValue(1), SDValue(CNode, 2)); 4641 // Record the mem-refs 4642 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N0)->getMemOperand()}); 4643 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0)); 4644 CurDAG->RemoveDeadNode(Node); 4645 return; 4646 } 4647 } 4648 4649 CurDAG->SelectNodeTo(Node, ROpc, NVT, MVT::i32, N0, N1); 4650 return; 4651 } 4652 4653 case X86ISD::SMUL: 4654 // i16/i32/i64 are handled with isel patterns. 
4655 if (NVT != MVT::i8) 4656 break; 4657 LLVM_FALLTHROUGH; 4658 case X86ISD::UMUL: { 4659 SDValue N0 = Node->getOperand(0); 4660 SDValue N1 = Node->getOperand(1); 4661 4662 unsigned LoReg, ROpc, MOpc; 4663 switch (NVT.SimpleTy) { 4664 default: llvm_unreachable("Unsupported VT!"); 4665 case MVT::i8: 4666 LoReg = X86::AL; 4667 ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r; 4668 MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m; 4669 break; 4670 case MVT::i16: 4671 LoReg = X86::AX; 4672 ROpc = X86::MUL16r; 4673 MOpc = X86::MUL16m; 4674 break; 4675 case MVT::i32: 4676 LoReg = X86::EAX; 4677 ROpc = X86::MUL32r; 4678 MOpc = X86::MUL32m; 4679 break; 4680 case MVT::i64: 4681 LoReg = X86::RAX; 4682 ROpc = X86::MUL64r; 4683 MOpc = X86::MUL64m; 4684 break; 4685 } 4686 4687 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; 4688 bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); 4689 // Multiply is commutative. 4690 if (!FoldedLoad) { 4691 FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); 4692 if (FoldedLoad) 4693 std::swap(N0, N1); 4694 } 4695 4696 SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg, 4697 N0, SDValue()).getValue(1); 4698 4699 MachineSDNode *CNode; 4700 if (FoldedLoad) { 4701 // i16/i32/i64 use an instruction that produces a low and high result even 4702 // though only the low result is used. 4703 SDVTList VTs; 4704 if (NVT == MVT::i8) 4705 VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other); 4706 else 4707 VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other); 4708 4709 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0), 4710 InFlag }; 4711 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); 4712 4713 // Update the chain. 4714 ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 
2 : 3)); 4715 // Record the mem-refs 4716 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()}); 4717 } else { 4718 // i16/i32/i64 use an instruction that produces a low and high result even 4719 // though only the low result is used. 4720 SDVTList VTs; 4721 if (NVT == MVT::i8) 4722 VTs = CurDAG->getVTList(NVT, MVT::i32); 4723 else 4724 VTs = CurDAG->getVTList(NVT, NVT, MVT::i32); 4725 4726 CNode = CurDAG->getMachineNode(ROpc, dl, VTs, {N1, InFlag}); 4727 } 4728 4729 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0)); 4730 ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 1 : 2)); 4731 CurDAG->RemoveDeadNode(Node); 4732 return; 4733 } 4734 4735 case ISD::SMUL_LOHI: 4736 case ISD::UMUL_LOHI: { 4737 SDValue N0 = Node->getOperand(0); 4738 SDValue N1 = Node->getOperand(1); 4739 4740 unsigned Opc, MOpc; 4741 unsigned LoReg, HiReg; 4742 bool IsSigned = Opcode == ISD::SMUL_LOHI; 4743 bool UseMULX = !IsSigned && Subtarget->hasBMI2(); 4744 bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty(); 4745 switch (NVT.SimpleTy) { 4746 default: llvm_unreachable("Unsupported VT!"); 4747 case MVT::i32: 4748 Opc = UseMULXHi ? X86::MULX32Hrr : 4749 UseMULX ? X86::MULX32rr : 4750 IsSigned ? X86::IMUL32r : X86::MUL32r; 4751 MOpc = UseMULXHi ? X86::MULX32Hrm : 4752 UseMULX ? X86::MULX32rm : 4753 IsSigned ? X86::IMUL32m : X86::MUL32m; 4754 LoReg = UseMULX ? X86::EDX : X86::EAX; 4755 HiReg = X86::EDX; 4756 break; 4757 case MVT::i64: 4758 Opc = UseMULXHi ? X86::MULX64Hrr : 4759 UseMULX ? X86::MULX64rr : 4760 IsSigned ? X86::IMUL64r : X86::MUL64r; 4761 MOpc = UseMULXHi ? X86::MULX64Hrm : 4762 UseMULX ? X86::MULX64rm : 4763 IsSigned ? X86::IMUL64m : X86::MUL64m; 4764 LoReg = UseMULX ? X86::RDX : X86::RAX; 4765 HiReg = X86::RDX; 4766 break; 4767 } 4768 4769 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; 4770 bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); 4771 // Multiply is commmutative. 
4772 if (!foldedLoad) { 4773 foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); 4774 if (foldedLoad) 4775 std::swap(N0, N1); 4776 } 4777 4778 SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg, 4779 N0, SDValue()).getValue(1); 4780 SDValue ResHi, ResLo; 4781 if (foldedLoad) { 4782 SDValue Chain; 4783 MachineSDNode *CNode = nullptr; 4784 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0), 4785 InFlag }; 4786 if (UseMULXHi) { 4787 SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other); 4788 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); 4789 ResHi = SDValue(CNode, 0); 4790 Chain = SDValue(CNode, 1); 4791 } else if (UseMULX) { 4792 SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other); 4793 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); 4794 ResHi = SDValue(CNode, 0); 4795 ResLo = SDValue(CNode, 1); 4796 Chain = SDValue(CNode, 2); 4797 } else { 4798 SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue); 4799 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); 4800 Chain = SDValue(CNode, 0); 4801 InFlag = SDValue(CNode, 1); 4802 } 4803 4804 // Update the chain. 4805 ReplaceUses(N1.getValue(1), Chain); 4806 // Record the mem-refs 4807 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()}); 4808 } else { 4809 SDValue Ops[] = { N1, InFlag }; 4810 if (UseMULXHi) { 4811 SDVTList VTs = CurDAG->getVTList(NVT); 4812 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); 4813 ResHi = SDValue(CNode, 0); 4814 } else if (UseMULX) { 4815 SDVTList VTs = CurDAG->getVTList(NVT, NVT); 4816 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); 4817 ResHi = SDValue(CNode, 0); 4818 ResLo = SDValue(CNode, 1); 4819 } else { 4820 SDVTList VTs = CurDAG->getVTList(MVT::Glue); 4821 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); 4822 InFlag = SDValue(CNode, 0); 4823 } 4824 } 4825 4826 // Copy the low half of the result, if it is needed. 
4827 if (!SDValue(Node, 0).use_empty()) { 4828 if (!ResLo) { 4829 assert(LoReg && "Register for low half is not defined!"); 4830 ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg, 4831 NVT, InFlag); 4832 InFlag = ResLo.getValue(2); 4833 } 4834 ReplaceUses(SDValue(Node, 0), ResLo); 4835 LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG); 4836 dbgs() << '\n'); 4837 } 4838 // Copy the high half of the result, if it is needed. 4839 if (!SDValue(Node, 1).use_empty()) { 4840 if (!ResHi) { 4841 assert(HiReg && "Register for high half is not defined!"); 4842 ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg, 4843 NVT, InFlag); 4844 InFlag = ResHi.getValue(2); 4845 } 4846 ReplaceUses(SDValue(Node, 1), ResHi); 4847 LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG); 4848 dbgs() << '\n'); 4849 } 4850 4851 CurDAG->RemoveDeadNode(Node); 4852 return; 4853 } 4854 4855 case ISD::SDIVREM: 4856 case ISD::UDIVREM: { 4857 SDValue N0 = Node->getOperand(0); 4858 SDValue N1 = Node->getOperand(1); 4859 4860 unsigned ROpc, MOpc; 4861 bool isSigned = Opcode == ISD::SDIVREM; 4862 if (!isSigned) { 4863 switch (NVT.SimpleTy) { 4864 default: llvm_unreachable("Unsupported VT!"); 4865 case MVT::i8: ROpc = X86::DIV8r; MOpc = X86::DIV8m; break; 4866 case MVT::i16: ROpc = X86::DIV16r; MOpc = X86::DIV16m; break; 4867 case MVT::i32: ROpc = X86::DIV32r; MOpc = X86::DIV32m; break; 4868 case MVT::i64: ROpc = X86::DIV64r; MOpc = X86::DIV64m; break; 4869 } 4870 } else { 4871 switch (NVT.SimpleTy) { 4872 default: llvm_unreachable("Unsupported VT!"); 4873 case MVT::i8: ROpc = X86::IDIV8r; MOpc = X86::IDIV8m; break; 4874 case MVT::i16: ROpc = X86::IDIV16r; MOpc = X86::IDIV16m; break; 4875 case MVT::i32: ROpc = X86::IDIV32r; MOpc = X86::IDIV32m; break; 4876 case MVT::i64: ROpc = X86::IDIV64r; MOpc = X86::IDIV64m; break; 4877 } 4878 } 4879 4880 unsigned LoReg, HiReg, ClrReg; 4881 unsigned SExtOpcode; 4882 switch (NVT.SimpleTy) { 4883 default: 
llvm_unreachable("Unsupported VT!"); 4884 case MVT::i8: 4885 LoReg = X86::AL; ClrReg = HiReg = X86::AH; 4886 SExtOpcode = 0; // Not used. 4887 break; 4888 case MVT::i16: 4889 LoReg = X86::AX; HiReg = X86::DX; 4890 ClrReg = X86::DX; 4891 SExtOpcode = X86::CWD; 4892 break; 4893 case MVT::i32: 4894 LoReg = X86::EAX; ClrReg = HiReg = X86::EDX; 4895 SExtOpcode = X86::CDQ; 4896 break; 4897 case MVT::i64: 4898 LoReg = X86::RAX; ClrReg = HiReg = X86::RDX; 4899 SExtOpcode = X86::CQO; 4900 break; 4901 } 4902 4903 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; 4904 bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); 4905 bool signBitIsZero = CurDAG->SignBitIsZero(N0); 4906 4907 SDValue InFlag; 4908 if (NVT == MVT::i8) { 4909 // Special case for div8, just use a move with zero extension to AX to 4910 // clear the upper 8 bits (AH). 4911 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain; 4912 MachineSDNode *Move; 4913 if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { 4914 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) }; 4915 unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8 4916 : X86::MOVZX16rm8; 4917 Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, MVT::Other, Ops); 4918 Chain = SDValue(Move, 1); 4919 ReplaceUses(N0.getValue(1), Chain); 4920 // Record the mem-refs 4921 CurDAG->setNodeMemRefs(Move, {cast<LoadSDNode>(N0)->getMemOperand()}); 4922 } else { 4923 unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8 4924 : X86::MOVZX16rr8; 4925 Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, N0); 4926 Chain = CurDAG->getEntryNode(); 4927 } 4928 Chain = CurDAG->getCopyToReg(Chain, dl, X86::AX, SDValue(Move, 0), 4929 SDValue()); 4930 InFlag = Chain.getValue(1); 4931 } else { 4932 InFlag = 4933 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, 4934 LoReg, N0, SDValue()).getValue(1); 4935 if (isSigned && !signBitIsZero) { 4936 // Sign extend the low part into the high part. 
4937 InFlag = 4938 SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0); 4939 } else { 4940 // Zero out the high part, effectively zero extending the input. 4941 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32); 4942 SDValue ClrNode = 4943 SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, None), 0); 4944 switch (NVT.SimpleTy) { 4945 case MVT::i16: 4946 ClrNode = 4947 SDValue(CurDAG->getMachineNode( 4948 TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode, 4949 CurDAG->getTargetConstant(X86::sub_16bit, dl, 4950 MVT::i32)), 4951 0); 4952 break; 4953 case MVT::i32: 4954 break; 4955 case MVT::i64: 4956 ClrNode = 4957 SDValue(CurDAG->getMachineNode( 4958 TargetOpcode::SUBREG_TO_REG, dl, MVT::i64, 4959 CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode, 4960 CurDAG->getTargetConstant(X86::sub_32bit, dl, 4961 MVT::i32)), 4962 0); 4963 break; 4964 default: 4965 llvm_unreachable("Unexpected division source"); 4966 } 4967 4968 InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg, 4969 ClrNode, InFlag).getValue(1); 4970 } 4971 } 4972 4973 if (foldedLoad) { 4974 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0), 4975 InFlag }; 4976 MachineSDNode *CNode = 4977 CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops); 4978 InFlag = SDValue(CNode, 1); 4979 // Update the chain. 4980 ReplaceUses(N1.getValue(1), SDValue(CNode, 0)); 4981 // Record the mem-refs 4982 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()}); 4983 } else { 4984 InFlag = 4985 SDValue(CurDAG->getMachineNode(ROpc, dl, MVT::Glue, N1, InFlag), 0); 4986 } 4987 4988 // Prevent use of AH in a REX instruction by explicitly copying it to 4989 // an ABCD_L register. 4990 // 4991 // The current assumption of the register allocator is that isel 4992 // won't generate explicit references to the GR8_ABCD_H registers. If 4993 // the allocator and/or the backend get enhanced to be more robust in 4994 // that regard, this can be, and should be, removed. 
4995 if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) { 4996 SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8); 4997 unsigned AHExtOpcode = 4998 isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX; 4999 5000 SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32, 5001 MVT::Glue, AHCopy, InFlag); 5002 SDValue Result(RNode, 0); 5003 InFlag = SDValue(RNode, 1); 5004 5005 Result = 5006 CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result); 5007 5008 ReplaceUses(SDValue(Node, 1), Result); 5009 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); 5010 dbgs() << '\n'); 5011 } 5012 // Copy the division (low) result, if it is needed. 5013 if (!SDValue(Node, 0).use_empty()) { 5014 SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, 5015 LoReg, NVT, InFlag); 5016 InFlag = Result.getValue(2); 5017 ReplaceUses(SDValue(Node, 0), Result); 5018 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); 5019 dbgs() << '\n'); 5020 } 5021 // Copy the remainder (high) result, if it is needed. 5022 if (!SDValue(Node, 1).use_empty()) { 5023 SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, 5024 HiReg, NVT, InFlag); 5025 InFlag = Result.getValue(2); 5026 ReplaceUses(SDValue(Node, 1), Result); 5027 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); 5028 dbgs() << '\n'); 5029 } 5030 CurDAG->RemoveDeadNode(Node); 5031 return; 5032 } 5033 5034 case X86ISD::FCMP: 5035 case X86ISD::STRICT_FCMP: 5036 case X86ISD::STRICT_FCMPS: { 5037 bool IsStrictCmp = Node->getOpcode() == X86ISD::STRICT_FCMP || 5038 Node->getOpcode() == X86ISD::STRICT_FCMPS; 5039 SDValue N0 = Node->getOperand(IsStrictCmp ? 1 : 0); 5040 SDValue N1 = Node->getOperand(IsStrictCmp ? 2 : 1); 5041 5042 // Save the original VT of the compare. 5043 MVT CmpVT = N0.getSimpleValueType(); 5044 5045 // Floating point needs special handling if we don't have FCOMI. 
5046 if (Subtarget->hasCMov()) 5047 break; 5048 5049 bool IsSignaling = Node->getOpcode() == X86ISD::STRICT_FCMPS; 5050 5051 unsigned Opc; 5052 switch (CmpVT.SimpleTy) { 5053 default: llvm_unreachable("Unexpected type!"); 5054 case MVT::f32: 5055 Opc = IsSignaling ? X86::COM_Fpr32 : X86::UCOM_Fpr32; 5056 break; 5057 case MVT::f64: 5058 Opc = IsSignaling ? X86::COM_Fpr64 : X86::UCOM_Fpr64; 5059 break; 5060 case MVT::f80: 5061 Opc = IsSignaling ? X86::COM_Fpr80 : X86::UCOM_Fpr80; 5062 break; 5063 } 5064 5065 SDValue Cmp; 5066 SDValue Chain = 5067 IsStrictCmp ? Node->getOperand(0) : CurDAG->getEntryNode(); 5068 if (IsStrictCmp) { 5069 SDVTList VTs = CurDAG->getVTList(MVT::i16, MVT::Other); 5070 Cmp = SDValue(CurDAG->getMachineNode(Opc, dl, VTs, {N0, N1, Chain}), 0); 5071 Chain = Cmp.getValue(1); 5072 } else { 5073 Cmp = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::i16, N0, N1), 0); 5074 } 5075 5076 // Move FPSW to AX. 5077 SDValue FPSW = CurDAG->getCopyToReg(Chain, dl, X86::FPSW, Cmp, SDValue()); 5078 Chain = FPSW; 5079 SDValue FNSTSW = 5080 SDValue(CurDAG->getMachineNode(X86::FNSTSW16r, dl, MVT::i16, FPSW, 5081 FPSW.getValue(1)), 5082 0); 5083 5084 // Extract upper 8-bits of AX. 5085 SDValue Extract = 5086 CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl, MVT::i8, FNSTSW); 5087 5088 // Move AH into flags. 5089 // Some 64-bit targets lack SAHF support, but they do support FCOMI. 
5090 assert(Subtarget->hasLAHFSAHF() && 5091 "Target doesn't support SAHF or FCOMI?"); 5092 SDValue AH = CurDAG->getCopyToReg(Chain, dl, X86::AH, Extract, SDValue()); 5093 Chain = AH; 5094 SDValue SAHF = SDValue( 5095 CurDAG->getMachineNode(X86::SAHF, dl, MVT::i32, AH.getValue(1)), 0); 5096 5097 if (IsStrictCmp) 5098 ReplaceUses(SDValue(Node, 1), Chain); 5099 5100 ReplaceUses(SDValue(Node, 0), SAHF); 5101 CurDAG->RemoveDeadNode(Node); 5102 return; 5103 } 5104 5105 case X86ISD::CMP: { 5106 SDValue N0 = Node->getOperand(0); 5107 SDValue N1 = Node->getOperand(1); 5108 5109 // Optimizations for TEST compares. 5110 if (!isNullConstant(N1)) 5111 break; 5112 5113 // Save the original VT of the compare. 5114 MVT CmpVT = N0.getSimpleValueType(); 5115 5116 // If we are comparing (and (shr X, C, Mask) with 0, emit a BEXTR followed 5117 // by a test instruction. The test should be removed later by 5118 // analyzeCompare if we are using only the zero flag. 5119 // TODO: Should we check the users and use the BEXTR flags directly? 5120 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) { 5121 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(N0.getNode())) { 5122 unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr 5123 : X86::TEST32rr; 5124 SDValue BEXTR = SDValue(NewNode, 0); 5125 NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR); 5126 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0)); 5127 CurDAG->RemoveDeadNode(Node); 5128 return; 5129 } 5130 } 5131 5132 // We can peek through truncates, but we need to be careful below. 5133 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse()) 5134 N0 = N0.getOperand(0); 5135 5136 // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to 5137 // use a smaller encoding. 5138 // Look past the truncate if CMP is the only use of it. 
5139 if (N0.getOpcode() == ISD::AND && 5140 N0.getNode()->hasOneUse() && 5141 N0.getValueType() != MVT::i8) { 5142 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 5143 if (!C) break; 5144 uint64_t Mask = C->getZExtValue(); 5145 5146 // Check if we can replace AND+IMM64 with a shift. This is possible for 5147 // masks/ like 0xFF000000 or 0x00FFFFFF and if we care only about the zero 5148 // flag. 5149 if (CmpVT == MVT::i64 && !isInt<32>(Mask) && 5150 onlyUsesZeroFlag(SDValue(Node, 0))) { 5151 if (isMask_64(~Mask)) { 5152 unsigned TrailingZeros = countTrailingZeros(Mask); 5153 SDValue Imm = CurDAG->getTargetConstant(TrailingZeros, dl, MVT::i64); 5154 SDValue Shift = 5155 SDValue(CurDAG->getMachineNode(X86::SHR64ri, dl, MVT::i64, MVT::i32, 5156 N0.getOperand(0), Imm), 0); 5157 MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl, 5158 MVT::i32, Shift, Shift); 5159 ReplaceNode(Node, Test); 5160 return; 5161 } 5162 if (isMask_64(Mask)) { 5163 unsigned LeadingZeros = countLeadingZeros(Mask); 5164 SDValue Imm = CurDAG->getTargetConstant(LeadingZeros, dl, MVT::i64); 5165 SDValue Shift = 5166 SDValue(CurDAG->getMachineNode(X86::SHL64ri, dl, MVT::i64, MVT::i32, 5167 N0.getOperand(0), Imm), 0); 5168 MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl, 5169 MVT::i32, Shift, Shift); 5170 ReplaceNode(Node, Test); 5171 return; 5172 } 5173 } 5174 5175 MVT VT; 5176 int SubRegOp; 5177 unsigned ROpc, MOpc; 5178 5179 // For each of these checks we need to be careful if the sign flag is 5180 // being used. It is only safe to use the sign flag in two conditions, 5181 // either the sign bit in the shrunken mask is zero or the final test 5182 // size is equal to the original compare size. 
5183 5184 if (isUInt<8>(Mask) && 5185 (!(Mask & 0x80) || CmpVT == MVT::i8 || 5186 hasNoSignFlagUses(SDValue(Node, 0)))) { 5187 // For example, convert "testl %eax, $8" to "testb %al, $8" 5188 VT = MVT::i8; 5189 SubRegOp = X86::sub_8bit; 5190 ROpc = X86::TEST8ri; 5191 MOpc = X86::TEST8mi; 5192 } else if (OptForMinSize && isUInt<16>(Mask) && 5193 (!(Mask & 0x8000) || CmpVT == MVT::i16 || 5194 hasNoSignFlagUses(SDValue(Node, 0)))) { 5195 // For example, "testl %eax, $32776" to "testw %ax, $32776". 5196 // NOTE: We only want to form TESTW instructions if optimizing for 5197 // min size. Otherwise we only save one byte and possibly get a length 5198 // changing prefix penalty in the decoders. 5199 VT = MVT::i16; 5200 SubRegOp = X86::sub_16bit; 5201 ROpc = X86::TEST16ri; 5202 MOpc = X86::TEST16mi; 5203 } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 && 5204 ((!(Mask & 0x80000000) && 5205 // Without minsize 16-bit Cmps can get here so we need to 5206 // be sure we calculate the correct sign flag if needed. 5207 (CmpVT != MVT::i16 || !(Mask & 0x8000))) || 5208 CmpVT == MVT::i32 || 5209 hasNoSignFlagUses(SDValue(Node, 0)))) { 5210 // For example, "testq %rax, $268468232" to "testl %eax, $268468232". 5211 // NOTE: We only want to run that transform if N0 is 32 or 64 bits. 5212 // Otherwize, we find ourselves in a position where we have to do 5213 // promotion. If previous passes did not promote the and, we assume 5214 // they had a good reason not to and do not promote here. 5215 VT = MVT::i32; 5216 SubRegOp = X86::sub_32bit; 5217 ROpc = X86::TEST32ri; 5218 MOpc = X86::TEST32mi; 5219 } else { 5220 // No eligible transformation was found. 5221 break; 5222 } 5223 5224 SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT); 5225 SDValue Reg = N0.getOperand(0); 5226 5227 // Emit a testl or testw. 
5228 MachineSDNode *NewNode; 5229 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; 5230 if (tryFoldLoad(Node, N0.getNode(), Reg, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { 5231 if (auto *LoadN = dyn_cast<LoadSDNode>(N0.getOperand(0).getNode())) { 5232 if (!LoadN->isSimple()) { 5233 unsigned NumVolBits = LoadN->getValueType(0).getSizeInBits(); 5234 if (MOpc == X86::TEST8mi && NumVolBits != 8) 5235 break; 5236 else if (MOpc == X86::TEST16mi && NumVolBits != 16) 5237 break; 5238 else if (MOpc == X86::TEST32mi && NumVolBits != 32) 5239 break; 5240 } 5241 } 5242 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm, 5243 Reg.getOperand(0) }; 5244 NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops); 5245 // Update the chain. 5246 ReplaceUses(Reg.getValue(1), SDValue(NewNode, 1)); 5247 // Record the mem-refs 5248 CurDAG->setNodeMemRefs(NewNode, 5249 {cast<LoadSDNode>(Reg)->getMemOperand()}); 5250 } else { 5251 // Extract the subregister if necessary. 5252 if (N0.getValueType() != VT) 5253 Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg); 5254 5255 NewNode = CurDAG->getMachineNode(ROpc, dl, MVT::i32, Reg, Imm); 5256 } 5257 // Replace CMP with TEST. 5258 ReplaceNode(Node, NewNode); 5259 return; 5260 } 5261 break; 5262 } 5263 case X86ISD::PCMPISTR: { 5264 if (!Subtarget->hasSSE42()) 5265 break; 5266 5267 bool NeedIndex = !SDValue(Node, 0).use_empty(); 5268 bool NeedMask = !SDValue(Node, 1).use_empty(); 5269 // We can't fold a load if we are going to make two instructions. 5270 bool MayFoldLoad = !NeedIndex || !NeedMask; 5271 5272 MachineSDNode *CNode; 5273 if (NeedMask) { 5274 unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrr : X86::PCMPISTRMrr; 5275 unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrm : X86::PCMPISTRMrm; 5276 CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node); 5277 ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0)); 5278 } 5279 if (NeedIndex || !NeedMask) { 5280 unsigned ROpc = Subtarget->hasAVX() ? 
X86::VPCMPISTRIrr : X86::PCMPISTRIrr; 5281 unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrm : X86::PCMPISTRIrm; 5282 CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node); 5283 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0)); 5284 } 5285 5286 // Connect the flag usage to the last instruction created. 5287 ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1)); 5288 CurDAG->RemoveDeadNode(Node); 5289 return; 5290 } 5291 case X86ISD::PCMPESTR: { 5292 if (!Subtarget->hasSSE42()) 5293 break; 5294 5295 // Copy the two implicit register inputs. 5296 SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX, 5297 Node->getOperand(1), 5298 SDValue()).getValue(1); 5299 InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX, 5300 Node->getOperand(3), InFlag).getValue(1); 5301 5302 bool NeedIndex = !SDValue(Node, 0).use_empty(); 5303 bool NeedMask = !SDValue(Node, 1).use_empty(); 5304 // We can't fold a load if we are going to make two instructions. 5305 bool MayFoldLoad = !NeedIndex || !NeedMask; 5306 5307 MachineSDNode *CNode; 5308 if (NeedMask) { 5309 unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrr : X86::PCMPESTRMrr; 5310 unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrm : X86::PCMPESTRMrm; 5311 CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node, 5312 InFlag); 5313 ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0)); 5314 } 5315 if (NeedIndex || !NeedMask) { 5316 unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrr : X86::PCMPESTRIrr; 5317 unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrm : X86::PCMPESTRIrm; 5318 CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InFlag); 5319 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0)); 5320 } 5321 // Connect the flag usage to the last instruction created. 
5322 ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1)); 5323 CurDAG->RemoveDeadNode(Node); 5324 return; 5325 } 5326 5327 case ISD::SETCC: { 5328 if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue())) 5329 return; 5330 5331 break; 5332 } 5333 5334 case ISD::STORE: 5335 if (foldLoadStoreIntoMemOperand(Node)) 5336 return; 5337 break; 5338 5339 case X86ISD::SETCC_CARRY: { 5340 // We have to do this manually because tblgen will put the eflags copy in 5341 // the wrong place if we use an extract_subreg in the pattern. 5342 MVT VT = Node->getSimpleValueType(0); 5343 5344 // Copy flags to the EFLAGS register and glue it to next node. 5345 SDValue EFLAGS = 5346 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS, 5347 Node->getOperand(1), SDValue()); 5348 5349 // Create a 64-bit instruction if the result is 64-bits otherwise use the 5350 // 32-bit version. 5351 unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r; 5352 MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32; 5353 SDValue Result = SDValue( 5354 CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)), 0); 5355 5356 // For less than 32-bits we need to extract from the 32-bit node. 5357 if (VT == MVT::i8 || VT == MVT::i16) { 5358 int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit; 5359 Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result); 5360 } 5361 5362 ReplaceUses(SDValue(Node, 0), Result); 5363 CurDAG->RemoveDeadNode(Node); 5364 return; 5365 } 5366 case X86ISD::SBB: { 5367 if (isNullConstant(Node->getOperand(0)) && 5368 isNullConstant(Node->getOperand(1))) { 5369 MVT VT = Node->getSimpleValueType(0); 5370 5371 // Create zero. 
5372 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32); 5373 SDValue Zero = 5374 SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, None), 0); 5375 if (VT == MVT::i64) { 5376 Zero = SDValue( 5377 CurDAG->getMachineNode( 5378 TargetOpcode::SUBREG_TO_REG, dl, MVT::i64, 5379 CurDAG->getTargetConstant(0, dl, MVT::i64), Zero, 5380 CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)), 5381 0); 5382 } 5383 5384 // Copy flags to the EFLAGS register and glue it to next node. 5385 SDValue EFLAGS = 5386 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS, 5387 Node->getOperand(2), SDValue()); 5388 5389 // Create a 64-bit instruction if the result is 64-bits otherwise use the 5390 // 32-bit version. 5391 unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr; 5392 MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32; 5393 VTs = CurDAG->getVTList(SBBVT, MVT::i32); 5394 SDValue Result = 5395 SDValue(CurDAG->getMachineNode(Opc, dl, VTs, {Zero, Zero, EFLAGS, 5396 EFLAGS.getValue(1)}), 5397 0); 5398 5399 // Replace the flag use. 5400 ReplaceUses(SDValue(Node, 1), Result.getValue(1)); 5401 5402 // Replace the result use. 5403 if (!SDValue(Node, 0).use_empty()) { 5404 // For less than 32-bits we need to extract from the 32-bit node. 5405 if (VT == MVT::i8 || VT == MVT::i16) { 5406 int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit; 5407 Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result); 5408 } 5409 ReplaceUses(SDValue(Node, 0), Result); 5410 } 5411 5412 CurDAG->RemoveDeadNode(Node); 5413 return; 5414 } 5415 break; 5416 } 5417 case X86ISD::MGATHER: { 5418 auto *Mgt = cast<X86MaskedGatherSDNode>(Node); 5419 SDValue IndexOp = Mgt->getIndex(); 5420 SDValue Mask = Mgt->getMask(); 5421 MVT IndexVT = IndexOp.getSimpleValueType(); 5422 MVT ValueVT = Node->getSimpleValueType(0); 5423 MVT MaskVT = Mask.getSimpleValueType(); 5424 5425 // This is just to prevent crashes if the nodes are malformed somehow. 
We're 5426 // otherwise only doing loose type checking in here based on type what 5427 // a type constraint would say just like table based isel. 5428 if (!ValueVT.isVector() || !MaskVT.isVector()) 5429 break; 5430 5431 unsigned NumElts = ValueVT.getVectorNumElements(); 5432 MVT ValueSVT = ValueVT.getVectorElementType(); 5433 5434 bool IsFP = ValueSVT.isFloatingPoint(); 5435 unsigned EltSize = ValueSVT.getSizeInBits(); 5436 5437 unsigned Opc = 0; 5438 bool AVX512Gather = MaskVT.getVectorElementType() == MVT::i1; 5439 if (AVX512Gather) { 5440 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32) 5441 Opc = IsFP ? X86::VGATHERDPSZ128rm : X86::VPGATHERDDZ128rm; 5442 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32) 5443 Opc = IsFP ? X86::VGATHERDPSZ256rm : X86::VPGATHERDDZ256rm; 5444 else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32) 5445 Opc = IsFP ? X86::VGATHERDPSZrm : X86::VPGATHERDDZrm; 5446 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64) 5447 Opc = IsFP ? X86::VGATHERDPDZ128rm : X86::VPGATHERDQZ128rm; 5448 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64) 5449 Opc = IsFP ? X86::VGATHERDPDZ256rm : X86::VPGATHERDQZ256rm; 5450 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64) 5451 Opc = IsFP ? X86::VGATHERDPDZrm : X86::VPGATHERDQZrm; 5452 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32) 5453 Opc = IsFP ? X86::VGATHERQPSZ128rm : X86::VPGATHERQDZ128rm; 5454 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32) 5455 Opc = IsFP ? X86::VGATHERQPSZ256rm : X86::VPGATHERQDZ256rm; 5456 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32) 5457 Opc = IsFP ? X86::VGATHERQPSZrm : X86::VPGATHERQDZrm; 5458 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64) 5459 Opc = IsFP ? X86::VGATHERQPDZ128rm : X86::VPGATHERQQZ128rm; 5460 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64) 5461 Opc = IsFP ? 
X86::VGATHERQPDZ256rm : X86::VPGATHERQQZ256rm; 5462 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64) 5463 Opc = IsFP ? X86::VGATHERQPDZrm : X86::VPGATHERQQZrm; 5464 } else { 5465 assert(EVT(MaskVT) == EVT(ValueVT).changeVectorElementTypeToInteger() && 5466 "Unexpected mask VT!"); 5467 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32) 5468 Opc = IsFP ? X86::VGATHERDPSrm : X86::VPGATHERDDrm; 5469 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32) 5470 Opc = IsFP ? X86::VGATHERDPSYrm : X86::VPGATHERDDYrm; 5471 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64) 5472 Opc = IsFP ? X86::VGATHERDPDrm : X86::VPGATHERDQrm; 5473 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64) 5474 Opc = IsFP ? X86::VGATHERDPDYrm : X86::VPGATHERDQYrm; 5475 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32) 5476 Opc = IsFP ? X86::VGATHERQPSrm : X86::VPGATHERQDrm; 5477 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32) 5478 Opc = IsFP ? X86::VGATHERQPSYrm : X86::VPGATHERQDYrm; 5479 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64) 5480 Opc = IsFP ? X86::VGATHERQPDrm : X86::VPGATHERQQrm; 5481 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64) 5482 Opc = IsFP ? X86::VGATHERQPDYrm : X86::VPGATHERQQYrm; 5483 } 5484 5485 if (!Opc) 5486 break; 5487 5488 SDValue Base, Scale, Index, Disp, Segment; 5489 if (!selectVectorAddr(Mgt, Mgt->getBasePtr(), IndexOp, Mgt->getScale(), 5490 Base, Scale, Index, Disp, Segment)) 5491 break; 5492 5493 SDValue PassThru = Mgt->getPassThru(); 5494 SDValue Chain = Mgt->getChain(); 5495 // Gather instructions have a mask output not in the ISD node. 
5496 SDVTList VTs = CurDAG->getVTList(ValueVT, MaskVT, MVT::Other); 5497 5498 MachineSDNode *NewNode; 5499 if (AVX512Gather) { 5500 SDValue Ops[] = {PassThru, Mask, Base, Scale, 5501 Index, Disp, Segment, Chain}; 5502 NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops); 5503 } else { 5504 SDValue Ops[] = {PassThru, Base, Scale, Index, 5505 Disp, Segment, Mask, Chain}; 5506 NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops); 5507 } 5508 CurDAG->setNodeMemRefs(NewNode, {Mgt->getMemOperand()}); 5509 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0)); 5510 ReplaceUses(SDValue(Node, 1), SDValue(NewNode, 2)); 5511 CurDAG->RemoveDeadNode(Node); 5512 return; 5513 } 5514 case X86ISD::MSCATTER: { 5515 auto *Sc = cast<X86MaskedScatterSDNode>(Node); 5516 SDValue Value = Sc->getValue(); 5517 SDValue IndexOp = Sc->getIndex(); 5518 MVT IndexVT = IndexOp.getSimpleValueType(); 5519 MVT ValueVT = Value.getSimpleValueType(); 5520 5521 // This is just to prevent crashes if the nodes are malformed somehow. We're 5522 // otherwise only doing loose type checking in here based on type what 5523 // a type constraint would say just like table based isel. 5524 if (!ValueVT.isVector()) 5525 break; 5526 5527 unsigned NumElts = ValueVT.getVectorNumElements(); 5528 MVT ValueSVT = ValueVT.getVectorElementType(); 5529 5530 bool IsFP = ValueSVT.isFloatingPoint(); 5531 unsigned EltSize = ValueSVT.getSizeInBits(); 5532 5533 unsigned Opc; 5534 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32) 5535 Opc = IsFP ? X86::VSCATTERDPSZ128mr : X86::VPSCATTERDDZ128mr; 5536 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32) 5537 Opc = IsFP ? X86::VSCATTERDPSZ256mr : X86::VPSCATTERDDZ256mr; 5538 else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32) 5539 Opc = IsFP ? X86::VSCATTERDPSZmr : X86::VPSCATTERDDZmr; 5540 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64) 5541 Opc = IsFP ? 
X86::VSCATTERDPDZ128mr : X86::VPSCATTERDQZ128mr; 5542 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64) 5543 Opc = IsFP ? X86::VSCATTERDPDZ256mr : X86::VPSCATTERDQZ256mr; 5544 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64) 5545 Opc = IsFP ? X86::VSCATTERDPDZmr : X86::VPSCATTERDQZmr; 5546 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32) 5547 Opc = IsFP ? X86::VSCATTERQPSZ128mr : X86::VPSCATTERQDZ128mr; 5548 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32) 5549 Opc = IsFP ? X86::VSCATTERQPSZ256mr : X86::VPSCATTERQDZ256mr; 5550 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32) 5551 Opc = IsFP ? X86::VSCATTERQPSZmr : X86::VPSCATTERQDZmr; 5552 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64) 5553 Opc = IsFP ? X86::VSCATTERQPDZ128mr : X86::VPSCATTERQQZ128mr; 5554 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64) 5555 Opc = IsFP ? X86::VSCATTERQPDZ256mr : X86::VPSCATTERQQZ256mr; 5556 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64) 5557 Opc = IsFP ? X86::VSCATTERQPDZmr : X86::VPSCATTERQQZmr; 5558 else 5559 break; 5560 5561 SDValue Base, Scale, Index, Disp, Segment; 5562 if (!selectVectorAddr(Sc, Sc->getBasePtr(), IndexOp, Sc->getScale(), 5563 Base, Scale, Index, Disp, Segment)) 5564 break; 5565 5566 SDValue Mask = Sc->getMask(); 5567 SDValue Chain = Sc->getChain(); 5568 // Scatter instructions have a mask output not in the ISD node. 
5569 SDVTList VTs = CurDAG->getVTList(Mask.getValueType(), MVT::Other); 5570 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Value, Chain}; 5571 5572 MachineSDNode *NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops); 5573 CurDAG->setNodeMemRefs(NewNode, {Sc->getMemOperand()}); 5574 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 1)); 5575 CurDAG->RemoveDeadNode(Node); 5576 return; 5577 } 5578 case ISD::PREALLOCATED_SETUP: { 5579 auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); 5580 auto CallId = MFI->getPreallocatedIdForCallSite( 5581 cast<SrcValueSDNode>(Node->getOperand(1))->getValue()); 5582 SDValue Chain = Node->getOperand(0); 5583 SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32); 5584 MachineSDNode *New = CurDAG->getMachineNode( 5585 TargetOpcode::PREALLOCATED_SETUP, dl, MVT::Other, CallIdValue, Chain); 5586 ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Chain 5587 CurDAG->RemoveDeadNode(Node); 5588 return; 5589 } 5590 case ISD::PREALLOCATED_ARG: { 5591 auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); 5592 auto CallId = MFI->getPreallocatedIdForCallSite( 5593 cast<SrcValueSDNode>(Node->getOperand(1))->getValue()); 5594 SDValue Chain = Node->getOperand(0); 5595 SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32); 5596 SDValue ArgIndex = Node->getOperand(2); 5597 SDValue Ops[3]; 5598 Ops[0] = CallIdValue; 5599 Ops[1] = ArgIndex; 5600 Ops[2] = Chain; 5601 MachineSDNode *New = CurDAG->getMachineNode( 5602 TargetOpcode::PREALLOCATED_ARG, dl, 5603 CurDAG->getVTList(TLI->getPointerTy(CurDAG->getDataLayout()), 5604 MVT::Other), 5605 Ops); 5606 ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Arg pointer 5607 ReplaceUses(SDValue(Node, 1), SDValue(New, 1)); // Chain 5608 CurDAG->RemoveDeadNode(Node); 5609 return; 5610 } 5611 } 5612 5613 SelectCode(Node); 5614 } 5615 5616 bool X86DAGToDAGISel:: 5617 SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned 
ConstraintID, 5618 std::vector<SDValue> &OutOps) { 5619 SDValue Op0, Op1, Op2, Op3, Op4; 5620 switch (ConstraintID) { 5621 default: 5622 llvm_unreachable("Unexpected asm memory constraint"); 5623 case InlineAsm::Constraint_o: // offsetable ?? 5624 case InlineAsm::Constraint_v: // not offsetable ?? 5625 case InlineAsm::Constraint_m: // memory 5626 case InlineAsm::Constraint_X: 5627 if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4)) 5628 return true; 5629 break; 5630 } 5631 5632 OutOps.push_back(Op0); 5633 OutOps.push_back(Op1); 5634 OutOps.push_back(Op2); 5635 OutOps.push_back(Op3); 5636 OutOps.push_back(Op4); 5637 return false; 5638 } 5639 5640 /// This pass converts a legalized DAG into a X86-specific DAG, 5641 /// ready for instruction scheduling. 5642 FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM, 5643 CodeGenOpt::Level OptLevel) { 5644 return new X86DAGToDAGISel(TM, OptLevel); 5645 } 5646