//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines a DAG pattern matching instruction selector for X86,
// converting from a legalized dag to an X86 dag.
//
//===----------------------------------------------------------------------===//

#include "X86.h"
#include "X86MachineFunctionInfo.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <stdint.h>
using namespace llvm;

#define DEBUG_TYPE "x86-isel"

STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");

static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(true),
    cl::desc("Enable setting constant bits to reduce size of mask immediates"),
    cl::Hidden);

//===----------------------------------------------------------------------===//
//                      Pattern Matcher Implementation
//===----------------------------------------------------------------------===//

namespace {
  /// This corresponds to X86AddressMode, but uses SDValue's instead of register
  /// numbers for the leaves of the matched tree.
  struct X86ISelAddressMode {
    enum {
      RegBase,
      FrameIndexBase
    } BaseType;

    // This is really a union, discriminated by BaseType!
    SDValue Base_Reg;
    int Base_FrameIndex;

    unsigned Scale;
    SDValue IndexReg;
    int32_t Disp;
    SDValue Segment;
    const GlobalValue *GV;
    const Constant *CP;
    const BlockAddress *BlockAddr;
    const char *ES;
    MCSymbol *MCSym;
    int JT;
    unsigned Align;             // CP alignment.
    unsigned char SymbolFlags;  // X86II::MO_*
    bool NegateIndex = false;

    X86ISelAddressMode()
        : BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0),
          Segment(), GV(nullptr), CP(nullptr), BlockAddr(nullptr), ES(nullptr),
          MCSym(nullptr), JT(-1), Align(0), SymbolFlags(X86II::MO_NO_FLAG) {}

    bool hasSymbolicDisplacement() const {
      return GV != nullptr || CP != nullptr || ES != nullptr ||
             MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
    }

    bool hasBaseOrIndexReg() const {
      return BaseType == FrameIndexBase ||
             IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
    }

    /// Return true if this addressing mode is already RIP-relative.
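    /// For reference: RIP-relative addressing encodes the operand as a signed
    /// 32-bit displacement from the end of the current instruction, e.g.
    /// "movq foo(%rip), %rax", and cannot be combined with another base or
    /// index register.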
    bool isRIPRelative() const {
      if (BaseType != RegBase) return false;
      if (RegisterSDNode *RegNode =
            dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
        return RegNode->getReg() == X86::RIP;
      return false;
    }

    void setBaseReg(SDValue Reg) {
      BaseType = RegBase;
      Base_Reg = Reg;
    }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    void dump(SelectionDAG *DAG = nullptr) {
      dbgs() << "X86ISelAddressMode " << this << '\n';
      dbgs() << "Base_Reg ";
      if (Base_Reg.getNode())
        Base_Reg.getNode()->dump(DAG);
      else
        dbgs() << "nul\n";
      if (BaseType == FrameIndexBase)
        dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
      dbgs() << " Scale " << Scale << '\n'
             << "IndexReg ";
      if (NegateIndex)
        dbgs() << "negate ";
      if (IndexReg.getNode())
        IndexReg.getNode()->dump(DAG);
      else
        dbgs() << "nul\n";
      dbgs() << " Disp " << Disp << '\n'
             << "GV ";
      if (GV)
        GV->dump();
      else
        dbgs() << "nul";
      dbgs() << " CP ";
      if (CP)
        CP->dump();
      else
        dbgs() << "nul";
      dbgs() << '\n'
             << "ES ";
      if (ES)
        dbgs() << ES;
      else
        dbgs() << "nul";
      dbgs() << " MCSym ";
      if (MCSym)
        dbgs() << MCSym;
      else
        dbgs() << "nul";
      dbgs() << " JT" << JT << " Align" << Align << '\n';
    }
#endif
  };
}

namespace {
  //===--------------------------------------------------------------------===//
  /// ISel - X86-specific code to select X86 machine instructions for
  /// SelectionDAG operations.
  ///
  class X86DAGToDAGISel final : public SelectionDAGISel {
    /// Keep a pointer to the X86Subtarget around so that we can
    /// make the right decision when generating code for different targets.
    const X86Subtarget *Subtarget;

    /// If true, selector should try to optimize for code size instead of
    /// performance.
    bool OptForSize;

    /// If true, selector should try to optimize for minimum code size.
    bool OptForMinSize;

    /// Disable direct TLS access through segment registers.
    bool IndirectTlsSegRefs;

  public:
    explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
        : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr), OptForSize(false),
          OptForMinSize(false), IndirectTlsSegRefs(false) {}

    StringRef getPassName() const override {
      return "X86 DAG->DAG Instruction Selection";
    }

    bool runOnMachineFunction(MachineFunction &MF) override {
      // Reset the subtarget each time through.
      Subtarget = &MF.getSubtarget<X86Subtarget>();
      IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
                             "indirect-tls-seg-refs");

      // OptFor[Min]Size are used in pattern predicates that isel is matching.
      OptForSize = MF.getFunction().hasOptSize();
      OptForMinSize = MF.getFunction().hasMinSize();
      assert((!OptForMinSize || OptForSize) &&
             "OptForMinSize implies OptForSize");

      SelectionDAGISel::runOnMachineFunction(MF);
      return true;
    }

    void EmitFunctionEntryCode() override;

    bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;

    void PreprocessISelDAG() override;
    void PostprocessISelDAG() override;

    // Include the pieces autogenerated from the target description.
#include "X86GenDAGISel.inc"

  private:
    void Select(SDNode *N) override;

    bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
    bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM);
    bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
    bool matchAddress(SDValue N, X86ISelAddressMode &AM);
    bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
    bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth);
    bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
                                 unsigned Depth);
    bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
    bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
                    SDValue &Scale, SDValue &Index, SDValue &Disp,
                    SDValue &Segment);
    bool selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
                          SDValue &Scale, SDValue &Index, SDValue &Disp,
                          SDValue &Segment);
    bool selectMOV64Imm32(SDValue N, SDValue &Imm);
    bool selectLEAAddr(SDValue N, SDValue &Base,
                       SDValue &Scale, SDValue &Index, SDValue &Disp,
                       SDValue &Segment);
    bool selectLEA64_32Addr(SDValue N, SDValue &Base,
                            SDValue &Scale, SDValue &Index, SDValue &Disp,
                            SDValue &Segment);
    bool selectTLSADDRAddr(SDValue N, SDValue &Base,
                           SDValue &Scale, SDValue &Index, SDValue &Disp,
                           SDValue &Segment);
    bool selectScalarSSELoad(SDNode *Root, SDNode *Parent, SDValue N,
                             SDValue &Base, SDValue &Scale,
                             SDValue &Index, SDValue &Disp,
                             SDValue &Segment,
                             SDValue &NodeWithChain);
    bool selectRelocImm(SDValue N, SDValue &Op);

    bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
                     SDValue &Base, SDValue &Scale,
                     SDValue &Index, SDValue &Disp,
                     SDValue &Segment);

    // Convenience method where P is also root.
    bool tryFoldLoad(SDNode *P, SDValue N,
                     SDValue &Base, SDValue &Scale,
                     SDValue &Index, SDValue &Disp,
                     SDValue &Segment) {
      return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
    }

    bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
                          SDValue &Base, SDValue &Scale,
                          SDValue &Index, SDValue &Disp,
                          SDValue &Segment);

    /// Implement addressing mode selection for inline asm expressions.
    bool SelectInlineAsmMemoryOperand(const SDValue &Op,
                                      unsigned ConstraintID,
                                      std::vector<SDValue> &OutOps) override;

    void emitSpecialCodeForMain();

    inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
                                   MVT VT, SDValue &Base, SDValue &Scale,
                                   SDValue &Index, SDValue &Disp,
                                   SDValue &Segment) {
      if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
        Base = CurDAG->getTargetFrameIndex(
            AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout()));
      else if (AM.Base_Reg.getNode())
        Base = AM.Base_Reg;
      else
        Base = CurDAG->getRegister(0, VT);

      Scale = getI8Imm(AM.Scale, DL);

      // Negate the index if needed.
      if (AM.NegateIndex) {
        unsigned NegOpc = VT == MVT::i64 ? X86::NEG64r : X86::NEG32r;
        SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32,
                                                     AM.IndexReg), 0);
        AM.IndexReg = Neg;
      }

      if (AM.IndexReg.getNode())
        Index = AM.IndexReg;
      else
        Index = CurDAG->getRegister(0, VT);

      // These are 32-bit even in 64-bit mode since RIP-relative offset
      // is 32-bit.
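      // For illustration: a matched address like 8(%rbx,%rcx,4) comes out of
      // here as Base = %rbx, Scale = 4, Index = %rcx, Disp = 8, with Segment
      // left as the default (register 0).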
      if (AM.GV)
        Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
                                              MVT::i32, AM.Disp,
                                              AM.SymbolFlags);
      else if (AM.CP)
        Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32,
                                             AM.Align, AM.Disp, AM.SymbolFlags);
      else if (AM.ES) {
        assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
        Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
      } else if (AM.MCSym) {
        assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
        assert(AM.SymbolFlags == 0 && "oo");
        Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32);
      } else if (AM.JT != -1) {
        assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
        Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
      } else if (AM.BlockAddr)
        Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
                                             AM.SymbolFlags);
      else
        Disp = CurDAG->getTargetConstant(AM.Disp, DL, MVT::i32);

      if (AM.Segment.getNode())
        Segment = AM.Segment;
      else
        Segment = CurDAG->getRegister(0, MVT::i16);
    }

    // Utility function to determine whether we should avoid selecting
    // immediate forms of instructions for better code size or not.
    // At a high level, we'd like to avoid such instructions when
    // we have similar constants used within the same basic block
    // that can be kept in a register.
    //
    bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
      uint32_t UseCount = 0;

      // Do not want to hoist if we're not optimizing for size.
      // TODO: We'd like to remove this restriction.
      // See the comment in X86InstrInfo.td for more info.
      if (!OptForSize)
        return false;

      // Walk all the users of the immediate.
      for (SDNode::use_iterator UI = N->use_begin(),
           UE = N->use_end(); (UI != UE) && (UseCount < 2); ++UI) {

        SDNode *User = *UI;

        // This user is already selected. Count it as a legitimate use and
        // move on.
        if (User->isMachineOpcode()) {
          UseCount++;
          continue;
        }

        // We want to count stores of immediates as real uses.
        if (User->getOpcode() == ISD::STORE &&
            User->getOperand(1).getNode() == N) {
          UseCount++;
          continue;
        }

        // We don't currently match users that have > 2 operands (except
        // for stores, which are handled above). Those instructions won't
        // match in ISEL, for now, and would be counted incorrectly.
        // This may change in the future as we add additional instruction
        // types.
        if (User->getNumOperands() != 2)
          continue;

        // If this can match to INC/DEC, don't count it as a use.
        if (User->getOpcode() == ISD::ADD &&
            (isOneConstant(SDValue(N, 0)) || isAllOnesConstant(SDValue(N, 0))))
          continue;

        // Immediates that are used for offsets as part of stack
        // manipulation should be left alone. These are typically
        // used to indicate SP offsets for argument passing and
        // will get pulled into stores/pushes (implicitly).
        if (User->getOpcode() == X86ISD::ADD ||
            User->getOpcode() == ISD::ADD    ||
            User->getOpcode() == X86ISD::SUB ||
            User->getOpcode() == ISD::SUB) {

          // Find the other operand of the add/sub.
          SDValue OtherOp = User->getOperand(0);
          if (OtherOp.getNode() == N)
            OtherOp = User->getOperand(1);

          // Don't count if the other operand is SP.
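          // (The stack pointer appears here as a CopyFromReg of ESP/RSP, which
          // is exactly what the check below looks for.)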
          RegisterSDNode *RegNode;
          if (OtherOp->getOpcode() == ISD::CopyFromReg &&
              (RegNode = dyn_cast_or_null<RegisterSDNode>(
                 OtherOp->getOperand(1).getNode())))
            if ((RegNode->getReg() == X86::ESP) ||
                (RegNode->getReg() == X86::RSP))
              continue;
        }

        // ... otherwise, count this and move on.
        UseCount++;
      }

      // If we have more than 1 use, then recommend for hoisting.
      return (UseCount > 1);
    }

    /// Return a target constant with the specified value of type i8.
    inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
      return CurDAG->getTargetConstant(Imm, DL, MVT::i8);
    }

    /// Return a target constant with the specified value, of type i32.
    inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
      return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
    }

    /// Return a target constant with the specified value, of type i64.
    inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
      return CurDAG->getTargetConstant(Imm, DL, MVT::i64);
    }

    SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
                                        const SDLoc &DL) {
      assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
      uint64_t Index = N->getConstantOperandVal(1);
      MVT VecVT = N->getOperand(0).getSimpleValueType();
      return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
    }

    SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,
                                      const SDLoc &DL) {
      assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
      uint64_t Index = N->getConstantOperandVal(2);
      MVT VecVT = N->getSimpleValueType(0);
      return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
    }

    // Helper to detect unneeded AND instructions on shift amounts. Called
    // from PatFrags in tablegen.
    bool isUnneededShiftMask(SDNode *N, unsigned Width) const {
      assert(N->getOpcode() == ISD::AND && "Unexpected opcode");
      const APInt &Val = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue();

      if (Val.countTrailingOnes() >= Width)
        return true;

      APInt Mask = Val | CurDAG->computeKnownBits(N->getOperand(0)).Zero;
      return Mask.countTrailingOnes() >= Width;
    }

    /// Return an SDNode that returns the value of the global base register.
    /// Output instructions required to initialize the global base register,
    /// if necessary.
    SDNode *getGlobalBaseReg();

    /// Return a reference to the TargetMachine, casted to the target-specific
    /// type.
    const X86TargetMachine &getTargetMachine() const {
      return static_cast<const X86TargetMachine &>(TM);
    }

    /// Return a reference to the TargetInstrInfo, casted to the target-specific
    /// type.
    const X86InstrInfo *getInstrInfo() const {
      return Subtarget->getInstrInfo();
    }

    /// Address-mode matching performs shift-of-and to and-of-shift
    /// reassociation in order to expose more scaled addressing
    /// opportunities.
    bool ComplexPatternFuncMutatesDAG() const override {
      return true;
    }

    bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;

    /// Returns whether this is a relocatable immediate that fits in a signed
    /// Width-bit integer, i.e. in the range [-2^(Width-1) .. 2^(Width-1)-1].
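    /// A plain ConstantSDNode is checked directly; a symbolic operand is
    /// accepted when isSExtAbsoluteSymbolRef can prove its value is in range.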
    template <unsigned Width> bool isSExtRelocImm(SDNode *N) const {
      if (auto *CN = dyn_cast<ConstantSDNode>(N))
        return isInt<Width>(CN->getSExtValue());
      return isSExtAbsoluteSymbolRef(Width, N);
    }

    // Indicates we should prefer to use a non-temporal load for this load.
    bool useNonTemporalLoad(LoadSDNode *N) const {
      if (!N->isNonTemporal())
        return false;

      unsigned StoreSize = N->getMemoryVT().getStoreSize();

      if (N->getAlignment() < StoreSize)
        return false;

      switch (StoreSize) {
      default: llvm_unreachable("Unsupported store size");
      case 4:
      case 8:
        return false;
      case 16:
        return Subtarget->hasSSE41();
      case 32:
        return Subtarget->hasAVX2();
      case 64:
        return Subtarget->hasAVX512();
      }
    }

    bool foldLoadStoreIntoMemOperand(SDNode *Node);
    MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);
    bool matchBitExtract(SDNode *Node);
    bool shrinkAndImmediate(SDNode *N);
    bool isMaskZeroExtended(SDNode *N) const;
    bool tryShiftAmountMod(SDNode *N);
    bool combineIncDecVector(SDNode *Node);
    bool tryShrinkShlLogicImm(SDNode *N);
    bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
    bool tryMatchBitSelect(SDNode *N);

    MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
                                const SDLoc &dl, MVT VT, SDNode *Node);
    MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
                                const SDLoc &dl, MVT VT, SDNode *Node,
                                SDValue &InFlag);

    bool tryOptimizeRem8Extend(SDNode *N);

    bool onlyUsesZeroFlag(SDValue Flags) const;
    bool hasNoSignFlagUses(SDValue Flags) const;
    bool hasNoCarryFlagUses(SDValue Flags) const;
  };
}


// Returns true if this masked compare can be implemented legally with this
// type.
static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
  unsigned Opcode = N->getOpcode();
  if (Opcode == X86ISD::CMPM || Opcode == ISD::SETCC ||
      Opcode == X86ISD::CMPM_SAE || Opcode == X86ISD::VFPCLASS) {
    // We can get 256-bit 8 element types here without VLX being enabled. When
    // this happens we will use 512-bit operations and the mask will not be
    // zero extended.
    EVT OpVT = N->getOperand(0).getValueType();
    if (OpVT.is256BitVector() || OpVT.is128BitVector())
      return Subtarget->hasVLX();

    return true;
  }
  // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
  if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
      Opcode == X86ISD::FSETCCM_SAE)
    return true;

  return false;
}

// Returns true if we can assume the writer of the mask has zero extended it
// for us.
bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {
  // If this is an AND, check if we have a compare on either side. As long as
  // one side guarantees the mask is zero extended, the AND will preserve those
  // zeros.
  if (N->getOpcode() == ISD::AND)
    return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) ||
           isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget);

  return isLegalMaskCompare(N, Subtarget);
}

bool
X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
  if (OptLevel == CodeGenOpt::None) return false;

  if (!N.hasOneUse())
    return false;

  if (N.getOpcode() != ISD::LOAD)
    return true;

  // Don't fold non-temporal loads if we have an instruction for them.
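  // (useNonTemporalLoad returns true only when an aligned MOVNTDQA-style load
  // is actually available: SSE4.1 for 16 bytes, AVX2 for 32, AVX-512 for 64.)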
  if (useNonTemporalLoad(cast<LoadSDNode>(N)))
    return false;

  // If N is a load, do additional profitability checks.
  if (U == Root) {
    switch (U->getOpcode()) {
    default: break;
    case X86ISD::ADD:
    case X86ISD::ADC:
    case X86ISD::SUB:
    case X86ISD::SBB:
    case X86ISD::AND:
    case X86ISD::XOR:
    case X86ISD::OR:
    case ISD::ADD:
    case ISD::ADDCARRY:
    case ISD::AND:
    case ISD::OR:
    case ISD::XOR: {
      SDValue Op1 = U->getOperand(1);

      // If the other operand is an 8-bit immediate we should fold the
      // immediate instead. This reduces code size.
      // e.g.
      // movl 4(%esp), %eax
      // addl $4, %eax
      // vs.
      // movl $4, %eax
      // addl 4(%esp), %eax
      // The former is 2 bytes shorter. In the case where the increment is 1,
      // the saving can be 4 bytes (by using incl %eax).
      if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Op1)) {
        if (Imm->getAPIntValue().isSignedIntN(8))
          return false;

        // If this is a 64-bit AND with an immediate that fits in 32-bits,
        // prefer using the smaller and over folding the load. This is needed to
        // make sure immediates created by shrinkAndImmediate are always folded.
        // Ideally we would narrow the load during DAG combine and get the
        // best of both worlds.
        if (U->getOpcode() == ISD::AND &&
            Imm->getAPIntValue().getBitWidth() == 64 &&
            Imm->getAPIntValue().isIntN(32))
          return false;

        // If this really is a zext_inreg that can be represented with a movzx
        // instruction, prefer that.
        // TODO: We could shrink the load and fold if it is non-volatile.
        if (U->getOpcode() == ISD::AND &&
            (Imm->getAPIntValue() == UINT8_MAX ||
             Imm->getAPIntValue() == UINT16_MAX ||
             Imm->getAPIntValue() == UINT32_MAX))
          return false;

        // ADD/SUB can negate the immediate and use the opposite operation
        // to fit 128 into a sign extended 8 bit immediate.
        if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) &&
            (-Imm->getAPIntValue()).isSignedIntN(8))
          return false;
      }

      // If the other operand is a TLS address, we should fold it instead.
      // This produces
      // movl %gs:0, %eax
      // leal i@NTPOFF(%eax), %eax
      // instead of
      // movl $i@NTPOFF, %eax
      // addl %gs:0, %eax
      // if the block also has an access to a second TLS address this will save
      // a load.
      // FIXME: This is probably also true for non-TLS addresses.
      if (Op1.getOpcode() == X86ISD::Wrapper) {
        SDValue Val = Op1.getOperand(0);
        if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
          return false;
      }

      // Don't fold load if this matches the BTS/BTR/BTC patterns.
      // BTS: (or X, (shl 1, n))
      // BTR: (and X, (rotl -2, n))
      // BTC: (xor X, (shl 1, n))
      if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) {
        if (U->getOperand(0).getOpcode() == ISD::SHL &&
            isOneConstant(U->getOperand(0).getOperand(0)))
          return false;

        if (U->getOperand(1).getOpcode() == ISD::SHL &&
            isOneConstant(U->getOperand(1).getOperand(0)))
          return false;
      }
      if (U->getOpcode() == ISD::AND) {
        SDValue U0 = U->getOperand(0);
        SDValue U1 = U->getOperand(1);
        if (U0.getOpcode() == ISD::ROTL) {
          auto *C = dyn_cast<ConstantSDNode>(U0.getOperand(0));
          if (C && C->getSExtValue() == -2)
            return false;
        }

        if (U1.getOpcode() == ISD::ROTL) {
          auto *C = dyn_cast<ConstantSDNode>(U1.getOperand(0));
          if (C && C->getSExtValue() == -2)
            return false;
        }
      }

      break;
    }
    case ISD::SHL:
    case ISD::SRA:
    case ISD::SRL:
      // Don't fold a load into a shift by immediate. The BMI2 instructions
      // support folding a load, but not an immediate. The legacy instructions
      // support folding an immediate, but can't fold a load. Folding an
      // immediate is preferable to folding a load.
      if (isa<ConstantSDNode>(U->getOperand(1)))
        return false;

      break;
    }
  }

  // Prevent folding a load if this can be implemented with an insert_subreg or
  // a move that implicitly zeroes.
  if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
      isNullConstant(Root->getOperand(2)) &&
      (Root->getOperand(0).isUndef() ||
       ISD::isBuildVectorAllZeros(Root->getOperand(0).getNode())))
    return false;

  return true;
}

/// Replace the original chain operand of the call with
/// load's chain operand and move load below the call's chain operand.
static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
                               SDValue Call, SDValue OrigChain) {
  SmallVector<SDValue, 8> Ops;
  SDValue Chain = OrigChain.getOperand(0);
  if (Chain.getNode() == Load.getNode())
    Ops.push_back(Load.getOperand(0));
  else {
    assert(Chain.getOpcode() == ISD::TokenFactor &&
           "Unexpected chain operand");
    for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
      if (Chain.getOperand(i).getNode() == Load.getNode())
        Ops.push_back(Load.getOperand(0));
      else
        Ops.push_back(Chain.getOperand(i));
    SDValue NewChain =
      CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
    Ops.clear();
    Ops.push_back(NewChain);
  }
  Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end());
  CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);
  CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
                             Load.getOperand(1), Load.getOperand(2));

  Ops.clear();
  Ops.push_back(SDValue(Load.getNode(), 1));
  Ops.append(Call->op_begin() + 1, Call->op_end());
  CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
}

/// Return true if call address is a load and it can be
/// moved below CALLSEQ_START and the chains leading up to the call.
/// Return the CALLSEQ_START by reference as a second output.
/// In the case of a tail call, there isn't a callseq node between the call
/// chain and the load.
static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
  // The transformation is somewhat dangerous if the call's chain was glued to
  // the call. After MoveBelowOrigChain the load is moved between the call and
  // the chain, this can create a cycle if the load is not folded. So it is
  // *really* important that we are sure the load will be folded.
  if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
    return false;
  LoadSDNode *LD = dyn_cast<LoadSDNode>(Callee.getNode());
  if (!LD ||
      !LD->isSimple() ||
      LD->getAddressingMode() != ISD::UNINDEXED ||
      LD->getExtensionType() != ISD::NON_EXTLOAD)
    return false;

  // Now let's find the callseq_start.
  while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
    if (!Chain.hasOneUse())
      return false;
    Chain = Chain.getOperand(0);
  }

  if (!Chain.getNumOperands())
    return false;
  // Since we are not checking for AA here, conservatively abort if the chain
  // writes to memory. It's not safe to move the callee (a load) across a store.
  if (isa<MemSDNode>(Chain.getNode()) &&
      cast<MemSDNode>(Chain.getNode())->writeMem())
    return false;
  if (Chain.getOperand(0).getNode() == Callee.getNode())
    return true;
  if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
      Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
      Callee.getValue(1).hasOneUse())
    return true;
  return false;
}

void X86DAGToDAGISel::PreprocessISelDAG() {
  for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
       E = CurDAG->allnodes_end(); I != E; ) {
    SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.

    // If this is a target specific AND node with no flag usages, turn it back
    // into ISD::AND to enable test instruction matching.
    if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) {
      SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0),
                                    N->getOperand(0), N->getOperand(1));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
      ++I;
      CurDAG->DeleteNode(N);
      continue;
    }

    switch (N->getOpcode()) {
    case ISD::FP_TO_SINT:
    case ISD::FP_TO_UINT: {
      // Replace vector fp_to_s/uint with their X86 specific equivalent so we
      // don't need 2 sets of patterns.
      if (!N->getSimpleValueType(0).isVector())
        break;

      unsigned NewOpc;
      switch (N->getOpcode()) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break;
      case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break;
      }
      SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
                                    N->getOperand(0));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
      ++I;
      CurDAG->DeleteNode(N);
      continue;
    }
    case ISD::SHL:
    case ISD::SRA:
    case ISD::SRL: {
      // Replace vector shifts with their X86 specific equivalent so we don't
      // need 2 sets of patterns.
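      // (X86ISD::VSHLV/VSRAV/VSRLV below are the per-element variable shift
      // nodes, i.e. roughly the VPSLLV*/VPSRAV*/VPSRLV* instruction family;
      // scalar shifts fall through unchanged.)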
      if (!N->getValueType(0).isVector())
        break;

      unsigned NewOpc;
      switch (N->getOpcode()) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::SHL: NewOpc = X86ISD::VSHLV; break;
      case ISD::SRA: NewOpc = X86ISD::VSRAV; break;
      case ISD::SRL: NewOpc = X86ISD::VSRLV; break;
      }
      SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
                                    N->getOperand(0), N->getOperand(1));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
      ++I;
      CurDAG->DeleteNode(N);
      continue;
    }
    case ISD::ANY_EXTEND:
    case ISD::ANY_EXTEND_VECTOR_INREG: {
      // Replace vector any extend with the zero extend equivalents so we don't
      // need 2 sets of patterns. Ignore vXi1 extensions.
      if (!N->getValueType(0).isVector() ||
          N->getOperand(0).getScalarValueSizeInBits() == 1)
        break;

      unsigned NewOpc = N->getOpcode() == ISD::ANY_EXTEND
                            ? ISD::ZERO_EXTEND
                            : ISD::ZERO_EXTEND_VECTOR_INREG;

      SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
                                    N->getOperand(0));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
      ++I;
      CurDAG->DeleteNode(N);
      continue;
    }
    case ISD::FCEIL:
    case ISD::FFLOOR:
    case ISD::FTRUNC:
    case ISD::FNEARBYINT:
    case ISD::FRINT: {
      // Replace fp rounding with their X86 specific equivalent so we don't
      // need 2 sets of patterns.
      unsigned Imm;
      switch (N->getOpcode()) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::FCEIL:      Imm = 0xA; break;
      case ISD::FFLOOR:     Imm = 0x9; break;
      case ISD::FTRUNC:     Imm = 0xB; break;
      case ISD::FNEARBYINT: Imm = 0xC; break;
      case ISD::FRINT:      Imm = 0x4; break;
      }
      SDLoc dl(N);
      SDValue Res = CurDAG->getNode(
          X86ISD::VRNDSCALE, dl, N->getValueType(0), N->getOperand(0),
          CurDAG->getTargetConstant(Imm, dl, MVT::i8));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
      ++I;
      CurDAG->DeleteNode(N);
      continue;
    }
    case X86ISD::FANDN:
    case X86ISD::FAND:
    case X86ISD::FOR:
    case X86ISD::FXOR: {
      // Widen scalar fp logic ops to vector to reduce isel patterns.
      // FIXME: Can we do this during lowering/combine.
      MVT VT = N->getSimpleValueType(0);
      if (VT.isVector() || VT == MVT::f128)
        break;

      MVT VecVT = VT == MVT::f64 ? MVT::v2f64 : MVT::v4f32;
      SDLoc dl(N);
      SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
                                    N->getOperand(0));
      SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
                                    N->getOperand(1));

      SDValue Res;
      if (Subtarget->hasSSE2()) {
        EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger();
        Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0);
        Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1);
        unsigned Opc;
        switch (N->getOpcode()) {
        default: llvm_unreachable("Unexpected opcode!");
        case X86ISD::FANDN: Opc = X86ISD::ANDNP; break;
        case X86ISD::FAND:  Opc = ISD::AND;      break;
        case X86ISD::FOR:   Opc = ISD::OR;       break;
        case X86ISD::FXOR:  Opc = ISD::XOR;      break;
        }
        Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1);
        Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res);
      } else {
        Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1);
      }
      Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res,
                            CurDAG->getIntPtrConstant(0, dl));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
      ++I;
      CurDAG->DeleteNode(N);
      continue;
    }
    }

    if (OptLevel != CodeGenOpt::None &&
        // Only do this when the target can fold the load into the call or
        // jmp.
        !Subtarget->useRetpolineIndirectCalls() &&
        ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||
         (N->getOpcode() == X86ISD::TC_RETURN &&
          (Subtarget->is64Bit() ||
           !getTargetMachine().isPositionIndependent())))) {
      /// Also try moving call address load from outside callseq_start to just
      /// before the call to allow it to be folded.
      ///
      ///     [Load chain]
      ///         ^
      ///         |
      ///       [Load]
      ///       ^    ^
      ///       |    |
      ///      /      \--
      ///     /          |
      ///[CALLSEQ_START] |
      ///     ^          |
      ///     |          |
      /// [LOAD/C2Reg]   |
      ///     |          |
      ///      \        /
      ///       \      /
      ///       [CALL]
      bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
      SDValue Chain = N->getOperand(0);
      SDValue Load  = N->getOperand(1);
      if (!isCalleeLoad(Load, Chain, HasCallSeq))
        continue;
      moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
      ++NumLoadMoved;
      continue;
    }

    // Lower fpround and fpextend nodes that target the FP stack to be store and
    // load to the stack. This is a gross hack. We would like to simply mark
    // these as being illegal, but when we do that, legalize produces these when
    // it expands calls, then expands these in the same legalize pass. We would
    // like dag combine to be able to hack on these between the call expansion
    // and the node legalization. As such this pass basically does "really
    // late" legalization of these inline with the X86 isel pass.
    // FIXME: This should only happen when not compiled with -O0.
    switch (N->getOpcode()) {
    default: continue;
    case ISD::FP_ROUND:
    case ISD::FP_EXTEND:
    {
      MVT SrcVT = N->getOperand(0).getSimpleValueType();
      MVT DstVT = N->getSimpleValueType(0);

      // If any of the sources are vectors, no fp stack involved.
      if (SrcVT.isVector() || DstVT.isVector())
        continue;

      // If the source and destination are SSE registers, then this is a legal
      // conversion that should not be lowered.
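      // Roughly speaking: an x87 <-> SSE conversion (or an x87 truncation) is
      // implemented below as a truncating store to a stack temporary followed
      // by an extending load, letting the store/load pair do the conversion.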
      const X86TargetLowering *X86Lowering =
          static_cast<const X86TargetLowering *>(TLI);
      bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
      bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
      if (SrcIsSSE && DstIsSSE)
        continue;

      if (!SrcIsSSE && !DstIsSSE) {
        // If this is an FPStack extension, it is a noop.
        if (N->getOpcode() == ISD::FP_EXTEND)
          continue;
        // If this is a value-preserving FPStack truncation, it is a noop.
        if (N->getConstantOperandVal(1))
          continue;
      }

      // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
      // FPStack has extload and truncstore. SSE can fold direct loads into other
      // operations. Based on this, decide what we want to do.
      MVT MemVT;
      if (N->getOpcode() == ISD::FP_ROUND)
        MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'.
      else
        MemVT = SrcIsSSE ? SrcVT : DstVT;

      SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
      SDLoc dl(N);

      // FIXME: optimize the case where the src/dest is a load or store?

      SDValue Store = CurDAG->getTruncStore(CurDAG->getEntryNode(), dl, N->getOperand(0),
                                            MemTmp, MachinePointerInfo(), MemVT);
      SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
                                          MachinePointerInfo(), MemVT);

      // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
      // extload we created. This will cause general havoc on the dag because
      // anything below the conversion could be folded into other existing nodes.
      // To avoid invalidating 'I', back it up to the convert node.
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
      break;
    }

    // The sequence of events for lowering STRICT_FP versions of these nodes
    // requires dealing with the chain differently, as there is already a
    // preexisting chain.
    case ISD::STRICT_FP_ROUND:
    case ISD::STRICT_FP_EXTEND:
    {
      MVT SrcVT = N->getOperand(1).getSimpleValueType();
      MVT DstVT = N->getSimpleValueType(0);

      // If any of the sources are vectors, no fp stack involved.
      if (SrcVT.isVector() || DstVT.isVector())
        continue;

      // If the source and destination are SSE registers, then this is a legal
      // conversion that should not be lowered.
      const X86TargetLowering *X86Lowering =
          static_cast<const X86TargetLowering *>(TLI);
      bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
      bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
      if (SrcIsSSE && DstIsSSE)
        continue;

      if (!SrcIsSSE && !DstIsSSE) {
        // If this is an FPStack extension, it is a noop.
        if (N->getOpcode() == ISD::STRICT_FP_EXTEND)
          continue;
        // If this is a value-preserving FPStack truncation, it is a noop.
        if (N->getConstantOperandVal(2))
          continue;
      }

      // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
      // FPStack has extload and truncstore. SSE can fold direct loads into other
      // operations. Based on this, decide what we want to do.
      MVT MemVT;
      if (N->getOpcode() == ISD::STRICT_FP_ROUND)
        MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'.
      else
        MemVT = SrcIsSSE ? SrcVT : DstVT;

      SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
      SDLoc dl(N);

      // FIXME: optimize the case where the src/dest is a load or store?
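      // Unlike the non-strict case above, which starts the store chain at the
      // entry node, the code below threads the node's own incoming chain
      // through the store/load pair to keep the ordering required by strict FP.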
      // Since the operation is StrictFP, use the preexisting chain.
      SDValue Store = CurDAG->getTruncStore(N->getOperand(0), dl, N->getOperand(1),
                                            MemTmp, MachinePointerInfo(), MemVT);
      SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
                                          MachinePointerInfo(), MemVT);

      // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
      // extload we created. This will cause general havoc on the dag because
      // anything below the conversion could be folded into other existing nodes.
      // To avoid invalidating 'I', back it up to the convert node.
      --I;
      CurDAG->ReplaceAllUsesWith(N, Result.getNode());
      break;
    }
    }


    // Now that we did that, the node is dead. Increment the iterator to the
    // next node to process, then delete N.
    ++I;
    CurDAG->DeleteNode(N);
  }

  // The load+call transform above can leave some dead nodes in the graph. Make
  // sure we remove them. It's possible some of the other transforms do too, so
  // just remove dead nodes unconditionally.
  CurDAG->RemoveDeadNodes();
}

// Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {
  unsigned Opc = N->getMachineOpcode();
  if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 &&
      Opc != X86::MOVSX64rr8)
    return false;

  SDValue N0 = N->getOperand(0);

  // We need to be extracting the lower bit of an extend.
  if (!N0.isMachineOpcode() ||
      N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG ||
      N0.getConstantOperandVal(1) != X86::sub_8bit)
    return false;

  // We're looking for either a movsx or movzx to match the original opcode.
  unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX
                                                : X86::MOVSX32rr8_NOREX;
  SDValue N00 = N0.getOperand(0);
  if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc)
    return false;

  if (Opc == X86::MOVSX64rr8) {
    // If we had a sign extend from 8 to 64 bits, we still need to go from 32
    // to 64.
    MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N),
                                                   MVT::i64, N00);
    ReplaceUses(N, Extend);
  } else {
    // Ok we can drop this extend and just use the original extend.
    ReplaceUses(N, N00.getNode());
  }

  return true;
}

void X86DAGToDAGISel::PostprocessISelDAG() {
  // Skip peepholes at -O0.
  if (TM.getOptLevel() == CodeGenOpt::None)
    return;

  SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();

  bool MadeChange = false;
  while (Position != CurDAG->allnodes_begin()) {
    SDNode *N = &*--Position;
    // Skip dead nodes and any non-machine opcodes.
    if (N->use_empty() || !N->isMachineOpcode())
      continue;

    if (tryOptimizeRem8Extend(N)) {
      MadeChange = true;
      continue;
    }

    // Look for a TESTrr+ANDrr pattern where both operands of the test are
    // the same. Rewrite to remove the AND.
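    // i.e. (TEST32rr (AND32rr %a, %b), (AND32rr %a, %b)) can be rewritten as
    // (TEST32rr %a, %b), since TEST sets EFLAGS the same way AND does without
    // needing the AND result in a register.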
    unsigned Opc = N->getMachineOpcode();
    if ((Opc == X86::TEST8rr || Opc == X86::TEST16rr ||
         Opc == X86::TEST32rr || Opc == X86::TEST64rr) &&
        N->getOperand(0) == N->getOperand(1) &&
        N->isOnlyUserOf(N->getOperand(0).getNode()) &&
        N->getOperand(0).isMachineOpcode()) {
      SDValue And = N->getOperand(0);
      unsigned N0Opc = And.getMachineOpcode();
      if (N0Opc == X86::AND8rr || N0Opc == X86::AND16rr ||
          N0Opc == X86::AND32rr || N0Opc == X86::AND64rr) {
        MachineSDNode *Test = CurDAG->getMachineNode(Opc, SDLoc(N),
                                                     MVT::i32,
                                                     And.getOperand(0),
                                                     And.getOperand(1));
        ReplaceUses(N, Test);
        MadeChange = true;
        continue;
      }
      if (N0Opc == X86::AND8rm || N0Opc == X86::AND16rm ||
          N0Opc == X86::AND32rm || N0Opc == X86::AND64rm) {
        unsigned NewOpc;
        switch (N0Opc) {
        case X86::AND8rm:  NewOpc = X86::TEST8mr;  break;
        case X86::AND16rm: NewOpc = X86::TEST16mr; break;
        case X86::AND32rm: NewOpc = X86::TEST32mr; break;
        case X86::AND64rm: NewOpc = X86::TEST64mr; break;
        }

        // Need to swap the memory and register operand.
        SDValue Ops[] = { And.getOperand(1),
                          And.getOperand(2),
                          And.getOperand(3),
                          And.getOperand(4),
                          And.getOperand(5),
                          And.getOperand(0),
                          And.getOperand(6)  /* Chain */ };
        MachineSDNode *Test = CurDAG->getMachineNode(NewOpc, SDLoc(N),
                                                     MVT::i32, MVT::Other, Ops);
        ReplaceUses(N, Test);
        MadeChange = true;
        continue;
      }
    }

    // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is
    // used. We're doing this late so we can prefer to fold the AND into masked
    // comparisons. Doing that can be better for the live range of the mask
    // register.
    if ((Opc == X86::KORTESTBrr || Opc == X86::KORTESTWrr ||
         Opc == X86::KORTESTDrr || Opc == X86::KORTESTQrr) &&
        N->getOperand(0) == N->getOperand(1) &&
        N->isOnlyUserOf(N->getOperand(0).getNode()) &&
        N->getOperand(0).isMachineOpcode() &&
        onlyUsesZeroFlag(SDValue(N, 0))) {
      SDValue And = N->getOperand(0);
      unsigned N0Opc = And.getMachineOpcode();
      // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other
      // KAND instructions and KTEST use the same ISA feature.
      if (N0Opc == X86::KANDBrr ||
          (N0Opc == X86::KANDWrr && Subtarget->hasDQI()) ||
          N0Opc == X86::KANDDrr || N0Opc == X86::KANDQrr) {
        unsigned NewOpc;
        switch (Opc) {
        default: llvm_unreachable("Unexpected opcode!");
        case X86::KORTESTBrr: NewOpc = X86::KTESTBrr; break;
        case X86::KORTESTWrr: NewOpc = X86::KTESTWrr; break;
        case X86::KORTESTDrr: NewOpc = X86::KTESTDrr; break;
        case X86::KORTESTQrr: NewOpc = X86::KTESTQrr; break;
        }
        MachineSDNode *KTest = CurDAG->getMachineNode(NewOpc, SDLoc(N),
                                                      MVT::i32,
                                                      And.getOperand(0),
                                                      And.getOperand(1));
        ReplaceUses(N, KTest);
        MadeChange = true;
        continue;
      }
    }

    // Attempt to remove vector moves that were inserted to zero upper bits.
    if (Opc != TargetOpcode::SUBREG_TO_REG)
      continue;

    unsigned SubRegIdx = N->getConstantOperandVal(2);
    if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)
      continue;

    SDValue Move = N->getOperand(1);
    if (!Move.isMachineOpcode())
      continue;

    // Make sure it's one of the move opcodes we recognize.
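    // The opcodes listed below are plain full-width register moves. If the
    // value they copy was itself produced by a VEX/EVEX/XOP-encoded vector
    // instruction (checked further down), the upper bits are already zero and
    // the move is redundant.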
    switch (Move.getMachineOpcode()) {
    default:
      continue;
    case X86::VMOVAPDrr:       case X86::VMOVUPDrr:
    case X86::VMOVAPSrr:       case X86::VMOVUPSrr:
    case X86::VMOVDQArr:       case X86::VMOVDQUrr:
    case X86::VMOVAPDYrr:      case X86::VMOVUPDYrr:
    case X86::VMOVAPSYrr:      case X86::VMOVUPSYrr:
    case X86::VMOVDQAYrr:      case X86::VMOVDQUYrr:
    case X86::VMOVAPDZ128rr:   case X86::VMOVUPDZ128rr:
    case X86::VMOVAPSZ128rr:   case X86::VMOVUPSZ128rr:
    case X86::VMOVDQA32Z128rr: case X86::VMOVDQU32Z128rr:
    case X86::VMOVDQA64Z128rr: case X86::VMOVDQU64Z128rr:
    case X86::VMOVAPDZ256rr:   case X86::VMOVUPDZ256rr:
    case X86::VMOVAPSZ256rr:   case X86::VMOVUPSZ256rr:
    case X86::VMOVDQA32Z256rr: case X86::VMOVDQU32Z256rr:
    case X86::VMOVDQA64Z256rr: case X86::VMOVDQU64Z256rr:
      break;
    }

    SDValue In = Move.getOperand(0);
    if (!In.isMachineOpcode() ||
        In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)
      continue;

    // Make sure the instruction has a VEX, XOP, or EVEX prefix. This covers
    // the SHA instructions which use a legacy encoding.
    uint64_t TSFlags = getInstrInfo()->get(In.getMachineOpcode()).TSFlags;
    if ((TSFlags & X86II::EncodingMask) != X86II::VEX &&
        (TSFlags & X86II::EncodingMask) != X86II::EVEX &&
        (TSFlags & X86II::EncodingMask) != X86II::XOP)
      continue;

    // Producing instruction is another vector instruction. We can drop the
    // move.
    CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2));
    MadeChange = true;
  }

  if (MadeChange)
    CurDAG->RemoveDeadNodes();
}


/// Emit any code that needs to be executed only in the main function.
void X86DAGToDAGISel::emitSpecialCodeForMain() {
  if (Subtarget->isTargetCygMing()) {
    TargetLowering::ArgListTy Args;
    auto &DL = CurDAG->getDataLayout();

    TargetLowering::CallLoweringInfo CLI(*CurDAG);
    CLI.setChain(CurDAG->getRoot())
        .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()),
                   CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)),
                   std::move(Args));
    const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
    std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
    CurDAG->setRoot(Result.second);
  }
}

void X86DAGToDAGISel::EmitFunctionEntryCode() {
  // If this is main, emit special code for main.
  const Function &F = MF->getFunction();
  if (F.hasExternalLinkage() && F.getName() == "main")
    emitSpecialCodeForMain();
}

static bool isDispSafeForFrameIndex(int64_t Val) {
  // On 64-bit platforms, we can run into an issue where a frame index
  // includes a displacement that, when added to the explicit displacement,
  // will overflow the displacement field. Assuming that the frame index
  // displacement fits into a 31-bit integer (which is only slightly more
  // aggressive than the current fundamental assumption that it fits into
  // a 32-bit integer), a 31-bit disp should always be safe.
  return isInt<31>(Val);
}

bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
                                            X86ISelAddressMode &AM) {
  // If there's no offset to fold, we don't need to do any work.
  if (Offset == 0)
    return false;

  // Cannot combine ExternalSymbol displacements with integer offsets.
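  // (As with the other matchers in this file, returning true here means the
  // fold is rejected; returning false means it succeeded.)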
  if (AM.ES || AM.MCSym)
    return true;

  int64_t Val = AM.Disp + Offset;
  CodeModel::Model M = TM.getCodeModel();
  if (Subtarget->is64Bit()) {
    if (!X86::isOffsetSuitableForCodeModel(Val, M,
                                           AM.hasSymbolicDisplacement()))
      return true;
    // In addition to the checks required for a register base, check that
    // we do not try to use an unsafe Disp with a frame index.
    if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
        !isDispSafeForFrameIndex(Val))
      return true;
  }
  AM.Disp = Val;
  return false;
}

bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM) {
  SDValue Address = N->getOperand(1);

  // load gs:0 -> GS segment register.
  // load fs:0 -> FS segment register.
  //
  // This optimization is valid because the GNU TLS model defines that
  // gs:0 (or fs:0 on X86-64) contains its own address.
  // For more information see http://people.redhat.com/drepper/tls.pdf
  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address))
    if (C->getSExtValue() == 0 && AM.Segment.getNode() == nullptr &&
        !IndirectTlsSegRefs &&
        (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
         Subtarget->isTargetFuchsia()))
      switch (N->getPointerInfo().getAddrSpace()) {
      case 256:
        AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
        return false;
      case 257:
        AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
        return false;
      // Address space 258 is not handled here, because it is not used to
      // address TLS areas.
      }

  return true;
}

/// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
/// mode. These wrap things that will resolve down into a symbol reference.
/// If no match is possible, this returns true, otherwise it returns false.
bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
  // If the addressing mode already has a symbol as the displacement, we can
  // never match another symbol.
  if (AM.hasSymbolicDisplacement())
    return true;

  bool IsRIPRelTLS = false;
  bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
  if (IsRIPRel) {
    SDValue Val = N.getOperand(0);
    if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
      IsRIPRelTLS = true;
  }

  // We can't use an addressing mode in the 64-bit large code model.
  // Global TLS addressing is an exception. In the medium code model,
  // we can use a mode when RIP wrappers are present.
  // That signifies access to globals that are known to be "near",
  // such as the GOT itself.
  CodeModel::Model M = TM.getCodeModel();
  if (Subtarget->is64Bit() &&
      ((M == CodeModel::Large && !IsRIPRelTLS) ||
       (M == CodeModel::Medium && !IsRIPRel)))
    return true;

  // Base and index reg must be 0 in order to use %rip as base.
  if (IsRIPRel && AM.hasBaseOrIndexReg())
    return true;

  // Make a local copy in case we can't do this fold.
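  // If foldOffsetIntoAddress below rejects the offset, AM is restored from
  // this backup so the symbol fields assigned in between don't leak into the
  // caller's state.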
  X86ISelAddressMode Backup = AM;

  int64_t Offset = 0;
  SDValue N0 = N.getOperand(0);
  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
    AM.GV = G->getGlobal();
    AM.SymbolFlags = G->getTargetFlags();
    Offset = G->getOffset();
  } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
    AM.CP = CP->getConstVal();
    AM.Align = CP->getAlignment();
    AM.SymbolFlags = CP->getTargetFlags();
    Offset = CP->getOffset();
  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
    AM.ES = S->getSymbol();
    AM.SymbolFlags = S->getTargetFlags();
  } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
    AM.MCSym = S->getMCSymbol();
  } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
    AM.JT = J->getIndex();
    AM.SymbolFlags = J->getTargetFlags();
  } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) {
    AM.BlockAddr = BA->getBlockAddress();
    AM.SymbolFlags = BA->getTargetFlags();
    Offset = BA->getOffset();
  } else
    llvm_unreachable("Unhandled symbol reference node.");

  if (foldOffsetIntoAddress(Offset, AM)) {
    AM = Backup;
    return true;
  }

  if (IsRIPRel)
    AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));

  // Commit the changes now that we know this fold is safe.
  return false;
}

/// Add the specified node to the specified addressing mode, returning true if
/// it cannot be done. This just pattern matches for the addressing mode.
bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
  if (matchAddressRecursively(N, AM, 0))
    return true;

  // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
  // a smaller encoding and avoids a scaled-index.
  if (AM.Scale == 2 &&
      AM.BaseType == X86ISelAddressMode::RegBase &&
      AM.Base_Reg.getNode() == nullptr) {
    AM.Base_Reg = AM.IndexReg;
    AM.Scale = 1;
  }

  // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
  // because it has a smaller encoding.
  // TODO: Which other code models can use this?
  switch (TM.getCodeModel()) {
  default: break;
  case CodeModel::Small:
  case CodeModel::Kernel:
    if (Subtarget->is64Bit() &&
        AM.Scale == 1 &&
        AM.BaseType == X86ISelAddressMode::RegBase &&
        AM.Base_Reg.getNode() == nullptr &&
        AM.IndexReg.getNode() == nullptr &&
        AM.SymbolFlags == X86II::MO_NO_FLAG &&
        AM.hasSymbolicDisplacement())
      AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
    break;
  }

  return false;
}

bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
                               unsigned Depth) {
  // Add an artificial use to this node so that we can keep track of
  // it if it gets CSE'd with a different node.
  HandleSDNode Handle(N);

  X86ISelAddressMode Backup = AM;
  if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
      !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1))
    return false;
  AM = Backup;

  // Try again after commuting the operands.
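  // Matching mutates AM as it goes, so the result can depend on which operand
  // is folded first; e.g. only one operand can end up in the scaled-index slot.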
  if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1) &&
      !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth+1))
    return false;
  AM = Backup;

  // If we couldn't fold both operands into the address at the same time,
  // see if we can just put each operand into a register and fold at least
  // the add.
  if (AM.BaseType == X86ISelAddressMode::RegBase &&
      !AM.Base_Reg.getNode() &&
      !AM.IndexReg.getNode()) {
    N = Handle.getValue();
    AM.Base_Reg = N.getOperand(0);
    AM.IndexReg = N.getOperand(1);
    AM.Scale = 1;
    return false;
  }
  N = Handle.getValue();
  return true;
}

// Insert a node into the DAG at least before the Pos node's position. This
// will reposition the node as needed, and will assign it a node ID that is <=
// the Pos node's ID. Note that this does *not* preserve the uniqueness of node
// IDs! The selection DAG must no longer depend on their uniqueness when this
// is used.
static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
  if (N->getNodeId() == -1 ||
      (SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) >
       SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode()))) {
    DAG.RepositionNode(Pos->getIterator(), N.getNode());
    // Mark Node as invalid for pruning as after this it may be a successor to a
    // selected node but otherwise be in the same position as Pos.
    // Conservatively mark it with the same -abs(Id) to assure node id
    // invariant is preserved.
    N->setNodeId(Pos->getNodeId());
    SelectionDAGISel::InvalidateNodeId(N.getNode());
  }
}

// Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
// safe. This allows us to convert the shift and AND into an h-register
// extract and a scaled index. Returns false if the simplification is
// performed.
static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
                                      uint64_t Mask,
                                      SDValue Shift, SDValue X,
                                      X86ISelAddressMode &AM) {
  if (Shift.getOpcode() != ISD::SRL ||
      !isa<ConstantSDNode>(Shift.getOperand(1)) ||
      !Shift.hasOneUse())
    return true;

  int ScaleLog = 8 - Shift.getConstantOperandVal(1);
  if (ScaleLog <= 0 || ScaleLog >= 4 ||
      Mask != (0xffu << ScaleLog))
    return true;

  MVT VT = N.getSimpleValueType();
  SDLoc DL(N);
  SDValue Eight = DAG.getConstant(8, DL, MVT::i8);
  SDValue NewMask = DAG.getConstant(0xff, DL, VT);
  SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, X, Eight);
  SDValue And = DAG.getNode(ISD::AND, DL, VT, Srl, NewMask);
  SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8);
  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And, ShlCount);

  // Insert the new nodes into the topological ordering. We must do this in
  // a valid topological ordering as nothing is going to go back and re-sort
  // these nodes. We continually insert before 'N' in sequence as this is
  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
  // hierarchy left to express.
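  // e.g. with C1 == 3: ((x >> 5) & 0x7f8) becomes (((x >> 8) & 0xff) << 3); the
  // (x >> 8) & 0xff piece can become an h-register extract and the << 3 is
  // absorbed into the addressing mode as a scale of 8.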
1594 insertDAGNode(DAG, N, Eight); 1595 insertDAGNode(DAG, N, Srl); 1596 insertDAGNode(DAG, N, NewMask); 1597 insertDAGNode(DAG, N, And); 1598 insertDAGNode(DAG, N, ShlCount); 1599 insertDAGNode(DAG, N, Shl); 1600 DAG.ReplaceAllUsesWith(N, Shl); 1601 DAG.RemoveDeadNode(N.getNode()); 1602 AM.IndexReg = And; 1603 AM.Scale = (1 << ScaleLog); 1604 return false; 1605 } 1606 1607 // Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this 1608 // allows us to fold the shift into this addressing mode. Returns false if the 1609 // transform succeeded. 1610 static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N, 1611 X86ISelAddressMode &AM) { 1612 SDValue Shift = N.getOperand(0); 1613 1614 // Use a signed mask so that shifting right will insert sign bits. These 1615 // bits will be removed when we shift the result left so it doesn't matter 1616 // what we use. This might allow a smaller immediate encoding. 1617 int64_t Mask = cast<ConstantSDNode>(N->getOperand(1))->getSExtValue(); 1618 1619 // If we have an any_extend feeding the AND, look through it to see if there 1620 // is a shift behind it. But only if the AND doesn't use the extended bits. 1621 // FIXME: Generalize this to other ANY_EXTEND than i32 to i64? 1622 bool FoundAnyExtend = false; 1623 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() && 1624 Shift.getOperand(0).getSimpleValueType() == MVT::i32 && 1625 isUInt<32>(Mask)) { 1626 FoundAnyExtend = true; 1627 Shift = Shift.getOperand(0); 1628 } 1629 1630 if (Shift.getOpcode() != ISD::SHL || 1631 !isa<ConstantSDNode>(Shift.getOperand(1))) 1632 return true; 1633 1634 SDValue X = Shift.getOperand(0); 1635 1636 // Not likely to be profitable if either the AND or SHIFT node has more 1637 // than one use (unless all uses are for address computation). Besides, 1638 // isel mechanism requires their node ids to be reused. 1639 if (!N.hasOneUse() || !Shift.hasOneUse()) 1640 return true; 1641 1642 // Verify that the shift amount is something we can fold. 1643 unsigned ShiftAmt = Shift.getConstantOperandVal(1); 1644 if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3) 1645 return true; 1646 1647 MVT VT = N.getSimpleValueType(); 1648 SDLoc DL(N); 1649 if (FoundAnyExtend) { 1650 SDValue NewX = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X); 1651 insertDAGNode(DAG, N, NewX); 1652 X = NewX; 1653 } 1654 1655 SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, DL, VT); 1656 SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask); 1657 SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1)); 1658 1659 // Insert the new nodes into the topological ordering. We must do this in 1660 // a valid topological ordering as nothing is going to go back and re-sort 1661 // these nodes. We continually insert before 'N' in sequence as this is 1662 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no 1663 // hierarchy left to express. 1664 insertDAGNode(DAG, N, NewMask); 1665 insertDAGNode(DAG, N, NewAnd); 1666 insertDAGNode(DAG, N, NewShift); 1667 DAG.ReplaceAllUsesWith(N, NewShift); 1668 DAG.RemoveDeadNode(N.getNode()); 1669 1670 AM.Scale = 1 << ShiftAmt; 1671 AM.IndexReg = NewAnd; 1672 return false; 1673 } 1674 1675 // Implement some heroics to detect shifts of masked values where the mask can 1676 // be replaced by extending the shift and undoing that in the addressing mode 1677 // scale. 
Patterns such as (shl (srl x, c1), c2) are canonicalized into (and 1678 // (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in 1679 // the addressing mode. This results in code such as: 1680 // 1681 // int f(short *y, int *lookup_table) { 1682 // ... 1683 // return *y + lookup_table[*y >> 11]; 1684 // } 1685 // 1686 // Turning into: 1687 // movzwl (%rdi), %eax 1688 // movl %eax, %ecx 1689 // shrl $11, %ecx 1690 // addl (%rsi,%rcx,4), %eax 1691 // 1692 // Instead of: 1693 // movzwl (%rdi), %eax 1694 // movl %eax, %ecx 1695 // shrl $9, %ecx 1696 // andl $124, %rcx 1697 // addl (%rsi,%rcx), %eax 1698 // 1699 // Note that this function assumes the mask is provided as a mask *after* the 1700 // value is shifted. The input chain may or may not match that, but computing 1701 // such a mask is trivial. 1702 static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, 1703 uint64_t Mask, 1704 SDValue Shift, SDValue X, 1705 X86ISelAddressMode &AM) { 1706 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() || 1707 !isa<ConstantSDNode>(Shift.getOperand(1))) 1708 return true; 1709 1710 unsigned ShiftAmt = Shift.getConstantOperandVal(1); 1711 unsigned MaskLZ = countLeadingZeros(Mask); 1712 unsigned MaskTZ = countTrailingZeros(Mask); 1713 1714 // The amount of shift we're trying to fit into the addressing mode is taken 1715 // from the trailing zeros of the mask. 1716 unsigned AMShiftAmt = MaskTZ; 1717 1718 // There is nothing we can do here unless the mask is removing some bits. 1719 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits. 1720 if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true; 1721 1722 // We also need to ensure that mask is a continuous run of bits. 1723 if (countTrailingOnes(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true; 1724 1725 // Scale the leading zero count down based on the actual size of the value. 1726 // Also scale it down based on the size of the shift. 1727 unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt; 1728 if (MaskLZ < ScaleDown) 1729 return true; 1730 MaskLZ -= ScaleDown; 1731 1732 // The final check is to ensure that any masked out high bits of X are 1733 // already known to be zero. Otherwise, the mask has a semantic impact 1734 // other than masking out a couple of low bits. Unfortunately, because of 1735 // the mask, zero extensions will be removed from operands in some cases. 1736 // This code works extra hard to look through extensions because we can 1737 // replace them with zero extensions cheaply if necessary. 1738 bool ReplacingAnyExtend = false; 1739 if (X.getOpcode() == ISD::ANY_EXTEND) { 1740 unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() - 1741 X.getOperand(0).getSimpleValueType().getSizeInBits(); 1742 // Assume that we'll replace the any-extend with a zero-extend, and 1743 // narrow the search to the extended value. 1744 X = X.getOperand(0); 1745 MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits; 1746 ReplacingAnyExtend = true; 1747 } 1748 APInt MaskedHighBits = 1749 APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ); 1750 KnownBits Known = DAG.computeKnownBits(X); 1751 if (MaskedHighBits != Known.Zero) return true; 1752 1753 // We've identified a pattern that can be transformed into a single shift 1754 // and an addressing mode. Make it so. 1755 MVT VT = N.getSimpleValueType(); 1756 if (ReplacingAnyExtend) { 1757 assert(X.getValueType() != VT); 1758 // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND. 
1759 SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X); 1760 insertDAGNode(DAG, N, NewX); 1761 X = NewX; 1762 } 1763 SDLoc DL(N); 1764 SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8); 1765 SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt); 1766 SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8); 1767 SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewSRL, NewSHLAmt); 1768 1769 // Insert the new nodes into the topological ordering. We must do this in 1770 // a valid topological ordering as nothing is going to go back and re-sort 1771 // these nodes. We continually insert before 'N' in sequence as this is 1772 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no 1773 // hierarchy left to express. 1774 insertDAGNode(DAG, N, NewSRLAmt); 1775 insertDAGNode(DAG, N, NewSRL); 1776 insertDAGNode(DAG, N, NewSHLAmt); 1777 insertDAGNode(DAG, N, NewSHL); 1778 DAG.ReplaceAllUsesWith(N, NewSHL); 1779 DAG.RemoveDeadNode(N.getNode()); 1780 1781 AM.Scale = 1 << AMShiftAmt; 1782 AM.IndexReg = NewSRL; 1783 return false; 1784 } 1785 1786 // Transform "(X >> SHIFT) & (MASK << C1)" to 1787 // "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be 1788 // matched to a BEXTR later. Returns false if the simplification is performed. 1789 static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N, 1790 uint64_t Mask, 1791 SDValue Shift, SDValue X, 1792 X86ISelAddressMode &AM, 1793 const X86Subtarget &Subtarget) { 1794 if (Shift.getOpcode() != ISD::SRL || 1795 !isa<ConstantSDNode>(Shift.getOperand(1)) || 1796 !Shift.hasOneUse() || !N.hasOneUse()) 1797 return true; 1798 1799 // Only do this if BEXTR will be matched by matchBEXTRFromAndImm. 1800 if (!Subtarget.hasTBM() && 1801 !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR())) 1802 return true; 1803 1804 // We need to ensure that mask is a continuous run of bits. 1805 if (!isShiftedMask_64(Mask)) return true; 1806 1807 unsigned ShiftAmt = Shift.getConstantOperandVal(1); 1808 1809 // The amount of shift we're trying to fit into the addressing mode is taken 1810 // from the trailing zeros of the mask. 1811 unsigned AMShiftAmt = countTrailingZeros(Mask); 1812 1813 // There is nothing we can do here unless the mask is removing some bits. 1814 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits. 1815 if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true; 1816 1817 MVT VT = N.getSimpleValueType(); 1818 SDLoc DL(N); 1819 SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8); 1820 SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt); 1821 SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, VT); 1822 SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, NewSRL, NewMask); 1823 SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8); 1824 SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewAnd, NewSHLAmt); 1825 1826 // Insert the new nodes into the topological ordering. We must do this in 1827 // a valid topological ordering as nothing is going to go back and re-sort 1828 // these nodes. We continually insert before 'N' in sequence as this is 1829 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no 1830 // hierarchy left to express. 
1831 insertDAGNode(DAG, N, NewSRLAmt); 1832 insertDAGNode(DAG, N, NewSRL); 1833 insertDAGNode(DAG, N, NewMask); 1834 insertDAGNode(DAG, N, NewAnd); 1835 insertDAGNode(DAG, N, NewSHLAmt); 1836 insertDAGNode(DAG, N, NewSHL); 1837 DAG.ReplaceAllUsesWith(N, NewSHL); 1838 DAG.RemoveDeadNode(N.getNode()); 1839 1840 AM.Scale = 1 << AMShiftAmt; 1841 AM.IndexReg = NewAnd; 1842 return false; 1843 } 1844 1845 bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, 1846 unsigned Depth) { 1847 SDLoc dl(N); 1848 LLVM_DEBUG({ 1849 dbgs() << "MatchAddress: "; 1850 AM.dump(CurDAG); 1851 }); 1852 // Limit recursion. 1853 if (Depth > 5) 1854 return matchAddressBase(N, AM); 1855 1856 // If this is already a %rip relative address, we can only merge immediates 1857 // into it. Instead of handling this in every case, we handle it here. 1858 // RIP relative addressing: %rip + 32-bit displacement! 1859 if (AM.isRIPRelative()) { 1860 // FIXME: JumpTable and ExternalSymbol address currently don't like 1861 // displacements. It isn't very important, but this should be fixed for 1862 // consistency. 1863 if (!(AM.ES || AM.MCSym) && AM.JT != -1) 1864 return true; 1865 1866 if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N)) 1867 if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM)) 1868 return false; 1869 return true; 1870 } 1871 1872 switch (N.getOpcode()) { 1873 default: break; 1874 case ISD::LOCAL_RECOVER: { 1875 if (!AM.hasSymbolicDisplacement() && AM.Disp == 0) 1876 if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) { 1877 // Use the symbol and don't prefix it. 1878 AM.MCSym = ESNode->getMCSymbol(); 1879 return false; 1880 } 1881 break; 1882 } 1883 case ISD::Constant: { 1884 uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue(); 1885 if (!foldOffsetIntoAddress(Val, AM)) 1886 return false; 1887 break; 1888 } 1889 1890 case X86ISD::Wrapper: 1891 case X86ISD::WrapperRIP: 1892 if (!matchWrapper(N, AM)) 1893 return false; 1894 break; 1895 1896 case ISD::LOAD: 1897 if (!matchLoadInAddress(cast<LoadSDNode>(N), AM)) 1898 return false; 1899 break; 1900 1901 case ISD::FrameIndex: 1902 if (AM.BaseType == X86ISelAddressMode::RegBase && 1903 AM.Base_Reg.getNode() == nullptr && 1904 (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) { 1905 AM.BaseType = X86ISelAddressMode::FrameIndexBase; 1906 AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex(); 1907 return false; 1908 } 1909 break; 1910 1911 case ISD::SHL: 1912 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) 1913 break; 1914 1915 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) { 1916 unsigned Val = CN->getZExtValue(); 1917 // Note that we handle x<<1 as (,x,2) rather than (x,x) here so 1918 // that the base operand remains free for further matching. If 1919 // the base doesn't end up getting used, a post-processing step 1920 // in MatchAddress turns (,x,2) into (x,x), which is cheaper. 1921 if (Val == 1 || Val == 2 || Val == 3) { 1922 AM.Scale = 1 << Val; 1923 SDValue ShVal = N.getOperand(0); 1924 1925 // Okay, we know that we have a scale by now. However, if the scaled 1926 // value is an add of something and a constant, we can fold the 1927 // constant into the disp field here. 
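        // E.g. (shl (add %x, 12), 2) can become index = %x, scale = 4,
        // disp = 48.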
        if (CurDAG->isBaseWithConstantOffset(ShVal)) {
          AM.IndexReg = ShVal.getOperand(0);
          ConstantSDNode *AddVal = cast<ConstantSDNode>(ShVal.getOperand(1));
          uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val;
          if (!foldOffsetIntoAddress(Disp, AM))
            return false;
        }

        AM.IndexReg = ShVal;
        return false;
      }
    }
    break;

  case ISD::SRL: {
    // Scale must not be used already.
    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;

    // We only handle up to 64-bit values here as those are what matter for
    // addressing mode optimizations.
    assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
           "Unexpected value size!");

    SDValue And = N.getOperand(0);
    if (And.getOpcode() != ISD::AND) break;
    SDValue X = And.getOperand(0);

    // The mask used for the transform is expected to be post-shift, but we
    // found the shift first so just apply the shift to the mask before passing
    // it down.
    if (!isa<ConstantSDNode>(N.getOperand(1)) ||
        !isa<ConstantSDNode>(And.getOperand(1)))
      break;
    uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);

    // Try to fold the mask and shift into the scale, and return false if we
    // succeed.
    if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
      return false;
    break;
  }

  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI:
    // A mul_lohi where we need the low part can be folded as a plain multiply.
    if (N.getResNo() != 0) break;
    LLVM_FALLTHROUGH;
  case ISD::MUL:
  case X86ISD::MUL_IMM:
    // X*[3,5,9] -> X+X*[2,4,8]
    if (AM.BaseType == X86ISelAddressMode::RegBase &&
        AM.Base_Reg.getNode() == nullptr &&
        AM.IndexReg.getNode() == nullptr) {
      if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1)))
        if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
            CN->getZExtValue() == 9) {
          AM.Scale = unsigned(CN->getZExtValue())-1;

          SDValue MulVal = N.getOperand(0);
          SDValue Reg;

          // Okay, we know that we have a scale by now. However, if the scaled
          // value is an add of something and a constant, we can fold the
          // constant into the disp field here.
          if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
              isa<ConstantSDNode>(MulVal.getOperand(1))) {
            Reg = MulVal.getOperand(0);
            ConstantSDNode *AddVal =
                cast<ConstantSDNode>(MulVal.getOperand(1));
            uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
            if (foldOffsetIntoAddress(Disp, AM))
              Reg = N.getOperand(0);
          } else {
            Reg = N.getOperand(0);
          }

          AM.IndexReg = AM.Base_Reg = Reg;
          return false;
        }
    }
    break;

  case ISD::SUB: {
    // Given A-B, if A can be completely folded into the address (leaving the
    // index field unused), use -B as the index. This is a win if A has
    // multiple parts that can be folded into the address. It also saves a mov
    // if the base register has other uses, since it avoids a two-address sub
    // instruction; however, it costs an additional mov if the index register
    // has other uses.

    // Add an artificial use to this node so that we can keep track of
    // it if it gets CSE'd with a different node.
    HandleSDNode Handle(N);

    // Test if the LHS of the sub can be folded.
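    // If it does fold, -RHS becomes the index (AM.NegateIndex); the actual
    // negation is emitted later.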
2023 X86ISelAddressMode Backup = AM; 2024 if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) { 2025 N = Handle.getValue(); 2026 AM = Backup; 2027 break; 2028 } 2029 N = Handle.getValue(); 2030 // Test if the index field is free for use. 2031 if (AM.IndexReg.getNode() || AM.isRIPRelative()) { 2032 AM = Backup; 2033 break; 2034 } 2035 2036 int Cost = 0; 2037 SDValue RHS = N.getOperand(1); 2038 // If the RHS involves a register with multiple uses, this 2039 // transformation incurs an extra mov, due to the neg instruction 2040 // clobbering its operand. 2041 if (!RHS.getNode()->hasOneUse() || 2042 RHS.getNode()->getOpcode() == ISD::CopyFromReg || 2043 RHS.getNode()->getOpcode() == ISD::TRUNCATE || 2044 RHS.getNode()->getOpcode() == ISD::ANY_EXTEND || 2045 (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND && 2046 RHS.getOperand(0).getValueType() == MVT::i32)) 2047 ++Cost; 2048 // If the base is a register with multiple uses, this 2049 // transformation may save a mov. 2050 if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() && 2051 !AM.Base_Reg.getNode()->hasOneUse()) || 2052 AM.BaseType == X86ISelAddressMode::FrameIndexBase) 2053 --Cost; 2054 // If the folded LHS was interesting, this transformation saves 2055 // address arithmetic. 2056 if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) + 2057 ((AM.Disp != 0) && (Backup.Disp == 0)) + 2058 (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2) 2059 --Cost; 2060 // If it doesn't look like it may be an overall win, don't do it. 2061 if (Cost >= 0) { 2062 AM = Backup; 2063 break; 2064 } 2065 2066 // Ok, the transformation is legal and appears profitable. Go for it. 2067 // Negation will be emitted later to avoid creating dangling nodes if this 2068 // was an unprofitable LEA. 2069 AM.IndexReg = RHS; 2070 AM.NegateIndex = true; 2071 AM.Scale = 1; 2072 return false; 2073 } 2074 2075 case ISD::ADD: 2076 if (!matchAdd(N, AM, Depth)) 2077 return false; 2078 break; 2079 2080 case ISD::OR: 2081 // We want to look through a transform in InstCombine and DAGCombiner that 2082 // turns 'add' into 'or', so we can treat this 'or' exactly like an 'add'. 2083 // Example: (or (and x, 1), (shl y, 3)) --> (add (and x, 1), (shl y, 3)) 2084 // An 'lea' can then be used to match the shift (multiply) and add: 2085 // and $1, %esi 2086 // lea (%rsi, %rdi, 8), %rax 2087 if (CurDAG->haveNoCommonBitsSet(N.getOperand(0), N.getOperand(1)) && 2088 !matchAdd(N, AM, Depth)) 2089 return false; 2090 break; 2091 2092 case ISD::AND: { 2093 // Perform some heroic transforms on an and of a constant-count shift 2094 // with a constant to enable use of the scaled offset field. 2095 2096 // Scale must not be used already. 2097 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break; 2098 2099 // We only handle up to 64-bit values here as those are what matter for 2100 // addressing mode optimizations. 2101 assert(N.getSimpleValueType().getSizeInBits() <= 64 && 2102 "Unexpected value size!"); 2103 2104 if (!isa<ConstantSDNode>(N.getOperand(1))) 2105 break; 2106 2107 if (N.getOperand(0).getOpcode() == ISD::SRL) { 2108 SDValue Shift = N.getOperand(0); 2109 SDValue X = Shift.getOperand(0); 2110 2111 uint64_t Mask = N.getConstantOperandVal(1); 2112 2113 // Try to fold the mask and shift into an extract and scale. 2114 if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM)) 2115 return false; 2116 2117 // Try to fold the mask and shift directly into the scale. 
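      // E.g. (and (srl x, 5), 0x18) can become index = (srl x, 8) with scale
      // 8, provided the high bits the mask would clear are already known to
      // be zero.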
2118 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM)) 2119 return false; 2120 2121 // Try to fold the mask and shift into BEXTR and scale. 2122 if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget)) 2123 return false; 2124 } 2125 2126 // Try to swap the mask and shift to place shifts which can be done as 2127 // a scale on the outside of the mask. 2128 if (!foldMaskedShiftToScaledMask(*CurDAG, N, AM)) 2129 return false; 2130 2131 break; 2132 } 2133 case ISD::ZERO_EXTEND: { 2134 // Try to widen a zexted shift left to the same size as its use, so we can 2135 // match the shift as a scale factor. 2136 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) 2137 break; 2138 if (N.getOperand(0).getOpcode() != ISD::SHL || !N.getOperand(0).hasOneUse()) 2139 break; 2140 2141 // Give up if the shift is not a valid scale factor [1,2,3]. 2142 SDValue Shl = N.getOperand(0); 2143 auto *ShAmtC = dyn_cast<ConstantSDNode>(Shl.getOperand(1)); 2144 if (!ShAmtC || ShAmtC->getZExtValue() > 3) 2145 break; 2146 2147 // The narrow shift must only shift out zero bits (it must be 'nuw'). 2148 // That makes it safe to widen to the destination type. 2149 APInt HighZeros = APInt::getHighBitsSet(Shl.getValueSizeInBits(), 2150 ShAmtC->getZExtValue()); 2151 if (!CurDAG->MaskedValueIsZero(Shl.getOperand(0), HighZeros)) 2152 break; 2153 2154 // zext (shl nuw i8 %x, C) to i32 --> shl (zext i8 %x to i32), (zext C) 2155 MVT VT = N.getSimpleValueType(); 2156 SDLoc DL(N); 2157 SDValue Zext = CurDAG->getNode(ISD::ZERO_EXTEND, DL, VT, Shl.getOperand(0)); 2158 SDValue NewShl = CurDAG->getNode(ISD::SHL, DL, VT, Zext, Shl.getOperand(1)); 2159 2160 // Convert the shift to scale factor. 2161 AM.Scale = 1 << ShAmtC->getZExtValue(); 2162 AM.IndexReg = Zext; 2163 2164 insertDAGNode(*CurDAG, N, Zext); 2165 insertDAGNode(*CurDAG, N, NewShl); 2166 CurDAG->ReplaceAllUsesWith(N, NewShl); 2167 CurDAG->RemoveDeadNode(N.getNode()); 2168 return false; 2169 } 2170 } 2171 2172 return matchAddressBase(N, AM); 2173 } 2174 2175 /// Helper for MatchAddress. Add the specified node to the 2176 /// specified addressing mode without any further recursion. 2177 bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) { 2178 // Is the base register already occupied? 2179 if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) { 2180 // If so, check to see if the scale index register is set. 2181 if (!AM.IndexReg.getNode()) { 2182 AM.IndexReg = N; 2183 AM.Scale = 1; 2184 return false; 2185 } 2186 2187 // Otherwise, we cannot select it. 2188 return true; 2189 } 2190 2191 // Default, generate it as a register. 2192 AM.BaseType = X86ISelAddressMode::RegBase; 2193 AM.Base_Reg = N; 2194 return false; 2195 } 2196 2197 /// Helper for selectVectorAddr. Handles things that can be folded into a 2198 /// gather scatter address. The index register and scale should have already 2199 /// been handled. 2200 bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) { 2201 // TODO: Support other operations. 
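  // For now only constant offsets and Wrapper-wrapped symbolic addresses are
  // handled here; everything else falls through to matchAddressBase below.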
2202 switch (N.getOpcode()) { 2203 case ISD::Constant: { 2204 uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue(); 2205 if (!foldOffsetIntoAddress(Val, AM)) 2206 return false; 2207 break; 2208 } 2209 case X86ISD::Wrapper: 2210 if (!matchWrapper(N, AM)) 2211 return false; 2212 break; 2213 } 2214 2215 return matchAddressBase(N, AM); 2216 } 2217 2218 bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, 2219 SDValue &Scale, SDValue &Index, 2220 SDValue &Disp, SDValue &Segment) { 2221 X86ISelAddressMode AM; 2222 auto *Mgs = cast<X86MaskedGatherScatterSDNode>(Parent); 2223 AM.IndexReg = Mgs->getIndex(); 2224 AM.Scale = cast<ConstantSDNode>(Mgs->getScale())->getZExtValue(); 2225 2226 unsigned AddrSpace = cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace(); 2227 // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS. 2228 if (AddrSpace == 256) 2229 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16); 2230 if (AddrSpace == 257) 2231 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16); 2232 if (AddrSpace == 258) 2233 AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16); 2234 2235 SDLoc DL(N); 2236 MVT VT = N.getSimpleValueType(); 2237 2238 // Try to match into the base and displacement fields. 2239 if (matchVectorAddress(N, AM)) 2240 return false; 2241 2242 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment); 2243 return true; 2244 } 2245 2246 /// Returns true if it is able to pattern match an addressing mode. 2247 /// It returns the operands which make up the maximal addressing mode it can 2248 /// match by reference. 2249 /// 2250 /// Parent is the parent node of the addr operand that is being matched. It 2251 /// is always a load, store, atomic node, or null. It is only null when 2252 /// checking memory operands for inline asm nodes. 2253 bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base, 2254 SDValue &Scale, SDValue &Index, 2255 SDValue &Disp, SDValue &Segment) { 2256 X86ISelAddressMode AM; 2257 2258 if (Parent && 2259 // This list of opcodes are all the nodes that have an "addr:$ptr" operand 2260 // that are not a MemSDNode, and thus don't have proper addrspace info. 2261 Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme 2262 Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores 2263 Parent->getOpcode() != X86ISD::TLSCALL && // Fixme 2264 Parent->getOpcode() != X86ISD::ENQCMD && // Fixme 2265 Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme 2266 Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp 2267 Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp 2268 unsigned AddrSpace = 2269 cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace(); 2270 // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS. 2271 if (AddrSpace == 256) 2272 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16); 2273 if (AddrSpace == 257) 2274 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16); 2275 if (AddrSpace == 258) 2276 AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16); 2277 } 2278 2279 // Save the DL and VT before calling matchAddress, it can invalidate N. 2280 SDLoc DL(N); 2281 MVT VT = N.getSimpleValueType(); 2282 2283 if (matchAddress(N, AM)) 2284 return false; 2285 2286 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment); 2287 return true; 2288 } 2289 2290 // We can only fold a load if all nodes between it and the root node have a 2291 // single use. If there are additional uses, we could end up duplicating the 2292 // load. 
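// The check below simply walks the single-use chain upward from the candidate
// user until it reaches the root.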
2293 static bool hasSingleUsesFromRoot(SDNode *Root, SDNode *User) { 2294 while (User != Root) { 2295 if (!User->hasOneUse()) 2296 return false; 2297 User = *User->use_begin(); 2298 } 2299 2300 return true; 2301 } 2302 2303 /// Match a scalar SSE load. In particular, we want to match a load whose top 2304 /// elements are either undef or zeros. The load flavor is derived from the 2305 /// type of N, which is either v4f32 or v2f64. 2306 /// 2307 /// We also return: 2308 /// PatternChainNode: this is the matched node that has a chain input and 2309 /// output. 2310 bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, SDNode *Parent, 2311 SDValue N, SDValue &Base, 2312 SDValue &Scale, SDValue &Index, 2313 SDValue &Disp, SDValue &Segment, 2314 SDValue &PatternNodeWithChain) { 2315 if (!hasSingleUsesFromRoot(Root, Parent)) 2316 return false; 2317 2318 // We can allow a full vector load here since narrowing a load is ok unless 2319 // it's volatile or atomic. 2320 if (ISD::isNON_EXTLoad(N.getNode())) { 2321 LoadSDNode *LD = cast<LoadSDNode>(N); 2322 if (LD->isSimple() && 2323 IsProfitableToFold(N, LD, Root) && 2324 IsLegalToFold(N, Parent, Root, OptLevel)) { 2325 PatternNodeWithChain = N; 2326 return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, 2327 Segment); 2328 } 2329 } 2330 2331 // We can also match the special zero extended load opcode. 2332 if (N.getOpcode() == X86ISD::VZEXT_LOAD) { 2333 PatternNodeWithChain = N; 2334 if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) && 2335 IsLegalToFold(PatternNodeWithChain, Parent, Root, OptLevel)) { 2336 auto *MI = cast<MemIntrinsicSDNode>(PatternNodeWithChain); 2337 return selectAddr(MI, MI->getBasePtr(), Base, Scale, Index, Disp, 2338 Segment); 2339 } 2340 } 2341 2342 // Need to make sure that the SCALAR_TO_VECTOR and load are both only used 2343 // once. Otherwise the load might get duplicated and the chain output of the 2344 // duplicate load will not be observed by all dependencies. 2345 if (N.getOpcode() == ISD::SCALAR_TO_VECTOR && N.getNode()->hasOneUse()) { 2346 PatternNodeWithChain = N.getOperand(0); 2347 if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) && 2348 IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) && 2349 IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) { 2350 LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain); 2351 return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, 2352 Segment); 2353 } 2354 } 2355 2356 return false; 2357 } 2358 2359 2360 bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) { 2361 if (const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) { 2362 uint64_t ImmVal = CN->getZExtValue(); 2363 if (!isUInt<32>(ImmVal)) 2364 return false; 2365 2366 Imm = CurDAG->getTargetConstant(ImmVal, SDLoc(N), MVT::i64); 2367 return true; 2368 } 2369 2370 // In static codegen with small code model, we can get the address of a label 2371 // into a register with 'movl' 2372 if (N->getOpcode() != X86ISD::Wrapper) 2373 return false; 2374 2375 N = N.getOperand(0); 2376 2377 // At least GNU as does not accept 'movl' for TPOFF relocations. 2378 // FIXME: We could use 'movl' when we know we are targeting MC. 
2379 if (N->getOpcode() == ISD::TargetGlobalTLSAddress) 2380 return false; 2381 2382 Imm = N; 2383 if (N->getOpcode() != ISD::TargetGlobalAddress) 2384 return TM.getCodeModel() == CodeModel::Small; 2385 2386 Optional<ConstantRange> CR = 2387 cast<GlobalAddressSDNode>(N)->getGlobal()->getAbsoluteSymbolRange(); 2388 if (!CR) 2389 return TM.getCodeModel() == CodeModel::Small; 2390 2391 return CR->getUnsignedMax().ult(1ull << 32); 2392 } 2393 2394 bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base, 2395 SDValue &Scale, SDValue &Index, 2396 SDValue &Disp, SDValue &Segment) { 2397 // Save the debug loc before calling selectLEAAddr, in case it invalidates N. 2398 SDLoc DL(N); 2399 2400 if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment)) 2401 return false; 2402 2403 RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Base); 2404 if (RN && RN->getReg() == 0) 2405 Base = CurDAG->getRegister(0, MVT::i64); 2406 else if (Base.getValueType() == MVT::i32 && !isa<FrameIndexSDNode>(Base)) { 2407 // Base could already be %rip, particularly in the x32 ABI. 2408 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL, 2409 MVT::i64), 0); 2410 Base = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef, 2411 Base); 2412 } 2413 2414 RN = dyn_cast<RegisterSDNode>(Index); 2415 if (RN && RN->getReg() == 0) 2416 Index = CurDAG->getRegister(0, MVT::i64); 2417 else { 2418 assert(Index.getValueType() == MVT::i32 && 2419 "Expect to be extending 32-bit registers for use in LEA"); 2420 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL, 2421 MVT::i64), 0); 2422 Index = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef, 2423 Index); 2424 } 2425 2426 return true; 2427 } 2428 2429 /// Calls SelectAddr and determines if the maximal addressing 2430 /// mode it matches can be cost effectively emitted as an LEA instruction. 2431 bool X86DAGToDAGISel::selectLEAAddr(SDValue N, 2432 SDValue &Base, SDValue &Scale, 2433 SDValue &Index, SDValue &Disp, 2434 SDValue &Segment) { 2435 X86ISelAddressMode AM; 2436 2437 // Save the DL and VT before calling matchAddress, it can invalidate N. 2438 SDLoc DL(N); 2439 MVT VT = N.getSimpleValueType(); 2440 2441 // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support 2442 // segments. 2443 SDValue Copy = AM.Segment; 2444 SDValue T = CurDAG->getRegister(0, MVT::i32); 2445 AM.Segment = T; 2446 if (matchAddress(N, AM)) 2447 return false; 2448 assert (T == AM.Segment); 2449 AM.Segment = Copy; 2450 2451 unsigned Complexity = 0; 2452 if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode()) 2453 Complexity = 1; 2454 else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase) 2455 Complexity = 4; 2456 2457 if (AM.IndexReg.getNode()) 2458 Complexity++; 2459 2460 // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with 2461 // a simple shift. 2462 if (AM.Scale > 1) 2463 Complexity++; 2464 2465 // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA 2466 // to a LEA. This is determined with some experimentation but is by no means 2467 // optimal (especially for code size consideration). LEA is nice because of 2468 // its three-address nature. Tweak the cost function again when we can run 2469 // convertToThreeAddress() at register allocation time. 2470 if (AM.hasSymbolicDisplacement()) { 2471 // For X86-64, always use LEA to materialize RIP-relative addresses. 
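    // (A complexity of 4 is enough to clear the 'Complexity <= 2' rejection
    // below.)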
2472 if (Subtarget->is64Bit()) 2473 Complexity = 4; 2474 else 2475 Complexity += 2; 2476 } 2477 2478 // Heuristic: try harder to form an LEA from ADD if the operands set flags. 2479 // Unlike ADD, LEA does not affect flags, so we will be less likely to require 2480 // duplicating flag-producing instructions later in the pipeline. 2481 if (N.getOpcode() == ISD::ADD) { 2482 auto isMathWithFlags = [](SDValue V) { 2483 switch (V.getOpcode()) { 2484 case X86ISD::ADD: 2485 case X86ISD::SUB: 2486 case X86ISD::ADC: 2487 case X86ISD::SBB: 2488 /* TODO: These opcodes can be added safely, but we may want to justify 2489 their inclusion for different reasons (better for reg-alloc). 2490 case X86ISD::SMUL: 2491 case X86ISD::UMUL: 2492 case X86ISD::OR: 2493 case X86ISD::XOR: 2494 case X86ISD::AND: 2495 */ 2496 // Value 1 is the flag output of the node - verify it's not dead. 2497 return !SDValue(V.getNode(), 1).use_empty(); 2498 default: 2499 return false; 2500 } 2501 }; 2502 // TODO: This could be an 'or' rather than 'and' to make the transform more 2503 // likely to happen. We might want to factor in whether there's a 2504 // load folding opportunity for the math op that disappears with LEA. 2505 if (isMathWithFlags(N.getOperand(0)) && isMathWithFlags(N.getOperand(1))) 2506 Complexity++; 2507 } 2508 2509 if (AM.Disp) 2510 Complexity++; 2511 2512 // If it isn't worth using an LEA, reject it. 2513 if (Complexity <= 2) 2514 return false; 2515 2516 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment); 2517 return true; 2518 } 2519 2520 /// This is only run on TargetGlobalTLSAddress nodes. 2521 bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base, 2522 SDValue &Scale, SDValue &Index, 2523 SDValue &Disp, SDValue &Segment) { 2524 assert(N.getOpcode() == ISD::TargetGlobalTLSAddress); 2525 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N); 2526 2527 X86ISelAddressMode AM; 2528 AM.GV = GA->getGlobal(); 2529 AM.Disp += GA->getOffset(); 2530 AM.SymbolFlags = GA->getTargetFlags(); 2531 2532 MVT VT = N.getSimpleValueType(); 2533 if (VT == MVT::i32) { 2534 AM.Scale = 1; 2535 AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32); 2536 } 2537 2538 getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment); 2539 return true; 2540 } 2541 2542 bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) { 2543 if (auto *CN = dyn_cast<ConstantSDNode>(N)) { 2544 Op = CurDAG->getTargetConstant(CN->getAPIntValue(), SDLoc(CN), 2545 N.getValueType()); 2546 return true; 2547 } 2548 2549 // Keep track of the original value type and whether this value was 2550 // truncated. If we see a truncation from pointer type to VT that truncates 2551 // bits that are known to be zero, we can use a narrow reference. 2552 EVT VT = N.getValueType(); 2553 bool WasTruncated = false; 2554 if (N.getOpcode() == ISD::TRUNCATE) { 2555 WasTruncated = true; 2556 N = N.getOperand(0); 2557 } 2558 2559 if (N.getOpcode() != X86ISD::Wrapper) 2560 return false; 2561 2562 // We can only use non-GlobalValues as immediates if they were not truncated, 2563 // as we do not have any range information. If we have a GlobalValue and the 2564 // address was not truncated, we can select it as an operand directly. 2565 unsigned Opc = N.getOperand(0)->getOpcode(); 2566 if (Opc != ISD::TargetGlobalAddress || !WasTruncated) { 2567 Op = N.getOperand(0); 2568 // We can only select the operand directly if we didn't have to look past a 2569 // truncate. 
2570 return !WasTruncated; 2571 } 2572 2573 // Check that the global's range fits into VT. 2574 auto *GA = cast<GlobalAddressSDNode>(N.getOperand(0)); 2575 Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange(); 2576 if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits())) 2577 return false; 2578 2579 // Okay, we can use a narrow reference. 2580 Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT, 2581 GA->getOffset(), GA->getTargetFlags()); 2582 return true; 2583 } 2584 2585 bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N, 2586 SDValue &Base, SDValue &Scale, 2587 SDValue &Index, SDValue &Disp, 2588 SDValue &Segment) { 2589 assert(Root && P && "Unknown root/parent nodes"); 2590 if (!ISD::isNON_EXTLoad(N.getNode()) || 2591 !IsProfitableToFold(N, P, Root) || 2592 !IsLegalToFold(N, P, Root, OptLevel)) 2593 return false; 2594 2595 return selectAddr(N.getNode(), 2596 N.getOperand(1), Base, Scale, Index, Disp, Segment); 2597 } 2598 2599 bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N, 2600 SDValue &Base, SDValue &Scale, 2601 SDValue &Index, SDValue &Disp, 2602 SDValue &Segment) { 2603 assert(Root && P && "Unknown root/parent nodes"); 2604 if (N->getOpcode() != X86ISD::VBROADCAST_LOAD || 2605 !IsProfitableToFold(N, P, Root) || 2606 !IsLegalToFold(N, P, Root, OptLevel)) 2607 return false; 2608 2609 return selectAddr(N.getNode(), 2610 N.getOperand(1), Base, Scale, Index, Disp, Segment); 2611 } 2612 2613 /// Return an SDNode that returns the value of the global base register. 2614 /// Output instructions required to initialize the global base register, 2615 /// if necessary. 2616 SDNode *X86DAGToDAGISel::getGlobalBaseReg() { 2617 unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF); 2618 auto &DL = MF->getDataLayout(); 2619 return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode(); 2620 } 2621 2622 bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const { 2623 if (N->getOpcode() == ISD::TRUNCATE) 2624 N = N->getOperand(0).getNode(); 2625 if (N->getOpcode() != X86ISD::Wrapper) 2626 return false; 2627 2628 auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0)); 2629 if (!GA) 2630 return false; 2631 2632 Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange(); 2633 return CR && CR->getSignedMin().sge(-1ull << Width) && 2634 CR->getSignedMax().slt(1ull << Width); 2635 } 2636 2637 static X86::CondCode getCondFromNode(SDNode *N) { 2638 assert(N->isMachineOpcode() && "Unexpected node"); 2639 X86::CondCode CC = X86::COND_INVALID; 2640 unsigned Opc = N->getMachineOpcode(); 2641 if (Opc == X86::JCC_1) 2642 CC = static_cast<X86::CondCode>(N->getConstantOperandVal(1)); 2643 else if (Opc == X86::SETCCr) 2644 CC = static_cast<X86::CondCode>(N->getConstantOperandVal(0)); 2645 else if (Opc == X86::SETCCm) 2646 CC = static_cast<X86::CondCode>(N->getConstantOperandVal(5)); 2647 else if (Opc == X86::CMOV16rr || Opc == X86::CMOV32rr || 2648 Opc == X86::CMOV64rr) 2649 CC = static_cast<X86::CondCode>(N->getConstantOperandVal(2)); 2650 else if (Opc == X86::CMOV16rm || Opc == X86::CMOV32rm || 2651 Opc == X86::CMOV64rm) 2652 CC = static_cast<X86::CondCode>(N->getConstantOperandVal(6)); 2653 2654 return CC; 2655 } 2656 2657 /// Test whether the given X86ISD::CMP node has any users that use a flag 2658 /// other than ZF. 2659 bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const { 2660 // Examine each user of the node. 
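  // Every flag use must be a CopyToReg into EFLAGS, and every consumer of
  // that copy may only test ZF (COND_E / COND_NE); anything else is
  // conservatively rejected.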
2661 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end(); 2662 UI != UE; ++UI) { 2663 // Only check things that use the flags. 2664 if (UI.getUse().getResNo() != Flags.getResNo()) 2665 continue; 2666 // Only examine CopyToReg uses that copy to EFLAGS. 2667 if (UI->getOpcode() != ISD::CopyToReg || 2668 cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS) 2669 return false; 2670 // Examine each user of the CopyToReg use. 2671 for (SDNode::use_iterator FlagUI = UI->use_begin(), 2672 FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) { 2673 // Only examine the Flag result. 2674 if (FlagUI.getUse().getResNo() != 1) continue; 2675 // Anything unusual: assume conservatively. 2676 if (!FlagUI->isMachineOpcode()) return false; 2677 // Examine the condition code of the user. 2678 X86::CondCode CC = getCondFromNode(*FlagUI); 2679 2680 switch (CC) { 2681 // Comparisons which only use the zero flag. 2682 case X86::COND_E: case X86::COND_NE: 2683 continue; 2684 // Anything else: assume conservatively. 2685 default: 2686 return false; 2687 } 2688 } 2689 } 2690 return true; 2691 } 2692 2693 /// Test whether the given X86ISD::CMP node has any uses which require the SF 2694 /// flag to be accurate. 2695 bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const { 2696 // Examine each user of the node. 2697 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end(); 2698 UI != UE; ++UI) { 2699 // Only check things that use the flags. 2700 if (UI.getUse().getResNo() != Flags.getResNo()) 2701 continue; 2702 // Only examine CopyToReg uses that copy to EFLAGS. 2703 if (UI->getOpcode() != ISD::CopyToReg || 2704 cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS) 2705 return false; 2706 // Examine each user of the CopyToReg use. 2707 for (SDNode::use_iterator FlagUI = UI->use_begin(), 2708 FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) { 2709 // Only examine the Flag result. 2710 if (FlagUI.getUse().getResNo() != 1) continue; 2711 // Anything unusual: assume conservatively. 2712 if (!FlagUI->isMachineOpcode()) return false; 2713 // Examine the condition code of the user. 2714 X86::CondCode CC = getCondFromNode(*FlagUI); 2715 2716 switch (CC) { 2717 // Comparisons which don't examine the SF flag. 2718 case X86::COND_A: case X86::COND_AE: 2719 case X86::COND_B: case X86::COND_BE: 2720 case X86::COND_E: case X86::COND_NE: 2721 case X86::COND_O: case X86::COND_NO: 2722 case X86::COND_P: case X86::COND_NP: 2723 continue; 2724 // Anything else: assume conservatively. 2725 default: 2726 return false; 2727 } 2728 } 2729 } 2730 return true; 2731 } 2732 2733 static bool mayUseCarryFlag(X86::CondCode CC) { 2734 switch (CC) { 2735 // Comparisons which don't examine the CF flag. 2736 case X86::COND_O: case X86::COND_NO: 2737 case X86::COND_E: case X86::COND_NE: 2738 case X86::COND_S: case X86::COND_NS: 2739 case X86::COND_P: case X86::COND_NP: 2740 case X86::COND_L: case X86::COND_GE: 2741 case X86::COND_G: case X86::COND_LE: 2742 return false; 2743 // Anything else: assume conservatively. 2744 default: 2745 return true; 2746 } 2747 } 2748 2749 /// Test whether the given node which sets flags has any uses which require the 2750 /// CF flag to be accurate. 2751 bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const { 2752 // Examine each user of the node. 2753 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end(); 2754 UI != UE; ++UI) { 2755 // Only check things that use the flags. 
2756 if (UI.getUse().getResNo() != Flags.getResNo()) 2757 continue; 2758 2759 unsigned UIOpc = UI->getOpcode(); 2760 2761 if (UIOpc == ISD::CopyToReg) { 2762 // Only examine CopyToReg uses that copy to EFLAGS. 2763 if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS) 2764 return false; 2765 // Examine each user of the CopyToReg use. 2766 for (SDNode::use_iterator FlagUI = UI->use_begin(), FlagUE = UI->use_end(); 2767 FlagUI != FlagUE; ++FlagUI) { 2768 // Only examine the Flag result. 2769 if (FlagUI.getUse().getResNo() != 1) 2770 continue; 2771 // Anything unusual: assume conservatively. 2772 if (!FlagUI->isMachineOpcode()) 2773 return false; 2774 // Examine the condition code of the user. 2775 X86::CondCode CC = getCondFromNode(*FlagUI); 2776 2777 if (mayUseCarryFlag(CC)) 2778 return false; 2779 } 2780 2781 // This CopyToReg is ok. Move on to the next user. 2782 continue; 2783 } 2784 2785 // This might be an unselected node. So look for the pre-isel opcodes that 2786 // use flags. 2787 unsigned CCOpNo; 2788 switch (UIOpc) { 2789 default: 2790 // Something unusual. Be conservative. 2791 return false; 2792 case X86ISD::SETCC: CCOpNo = 0; break; 2793 case X86ISD::SETCC_CARRY: CCOpNo = 0; break; 2794 case X86ISD::CMOV: CCOpNo = 2; break; 2795 case X86ISD::BRCOND: CCOpNo = 2; break; 2796 } 2797 2798 X86::CondCode CC = (X86::CondCode)UI->getConstantOperandVal(CCOpNo); 2799 if (mayUseCarryFlag(CC)) 2800 return false; 2801 } 2802 return true; 2803 } 2804 2805 /// Check whether or not the chain ending in StoreNode is suitable for doing 2806 /// the {load; op; store} to modify transformation. 2807 static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode, 2808 SDValue StoredVal, SelectionDAG *CurDAG, 2809 unsigned LoadOpNo, 2810 LoadSDNode *&LoadNode, 2811 SDValue &InputChain) { 2812 // Is the stored value result 0 of the operation? 2813 if (StoredVal.getResNo() != 0) return false; 2814 2815 // Are there other uses of the operation other than the store? 2816 if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false; 2817 2818 // Is the store non-extending and non-indexed? 2819 if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal()) 2820 return false; 2821 2822 SDValue Load = StoredVal->getOperand(LoadOpNo); 2823 // Is the stored value a non-extending and non-indexed load? 2824 if (!ISD::isNormalLoad(Load.getNode())) return false; 2825 2826 // Return LoadNode by reference. 2827 LoadNode = cast<LoadSDNode>(Load); 2828 2829 // Is store the only read of the loaded value? 2830 if (!Load.hasOneUse()) 2831 return false; 2832 2833 // Is the address of the store the same as the load? 2834 if (LoadNode->getBasePtr() != StoreNode->getBasePtr() || 2835 LoadNode->getOffset() != StoreNode->getOffset()) 2836 return false; 2837 2838 bool FoundLoad = false; 2839 SmallVector<SDValue, 4> ChainOps; 2840 SmallVector<const SDNode *, 4> LoopWorklist; 2841 SmallPtrSet<const SDNode *, 16> Visited; 2842 const unsigned int Max = 1024; 2843 2844 // Visualization of Load-Op-Store fusion: 2845 // ------------------------- 2846 // Legend: 2847 // *-lines = Chain operand dependencies. 2848 // |-lines = Normal operand dependencies. 2849 // Dependencies flow down and right. n-suffix references multiple nodes. 
2850 // 2851 // C Xn C 2852 // * * * 2853 // * * * 2854 // Xn A-LD Yn TF Yn 2855 // * * \ | * | 2856 // * * \ | * | 2857 // * * \ | => A--LD_OP_ST 2858 // * * \| \ 2859 // TF OP \ 2860 // * | \ Zn 2861 // * | \ 2862 // A-ST Zn 2863 // 2864 2865 // This merge induced dependences from: #1: Xn -> LD, OP, Zn 2866 // #2: Yn -> LD 2867 // #3: ST -> Zn 2868 2869 // Ensure the transform is safe by checking for the dual 2870 // dependencies to make sure we do not induce a loop. 2871 2872 // As LD is a predecessor to both OP and ST we can do this by checking: 2873 // a). if LD is a predecessor to a member of Xn or Yn. 2874 // b). if a Zn is a predecessor to ST. 2875 2876 // However, (b) can only occur through being a chain predecessor to 2877 // ST, which is the same as Zn being a member or predecessor of Xn, 2878 // which is a subset of LD being a predecessor of Xn. So it's 2879 // subsumed by check (a). 2880 2881 SDValue Chain = StoreNode->getChain(); 2882 2883 // Gather X elements in ChainOps. 2884 if (Chain == Load.getValue(1)) { 2885 FoundLoad = true; 2886 ChainOps.push_back(Load.getOperand(0)); 2887 } else if (Chain.getOpcode() == ISD::TokenFactor) { 2888 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) { 2889 SDValue Op = Chain.getOperand(i); 2890 if (Op == Load.getValue(1)) { 2891 FoundLoad = true; 2892 // Drop Load, but keep its chain. No cycle check necessary. 2893 ChainOps.push_back(Load.getOperand(0)); 2894 continue; 2895 } 2896 LoopWorklist.push_back(Op.getNode()); 2897 ChainOps.push_back(Op); 2898 } 2899 } 2900 2901 if (!FoundLoad) 2902 return false; 2903 2904 // Worklist is currently Xn. Add Yn to worklist. 2905 for (SDValue Op : StoredVal->ops()) 2906 if (Op.getNode() != LoadNode) 2907 LoopWorklist.push_back(Op.getNode()); 2908 2909 // Check (a) if Load is a predecessor to Xn + Yn 2910 if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max, 2911 true)) 2912 return false; 2913 2914 InputChain = 2915 CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps); 2916 return true; 2917 } 2918 2919 // Change a chain of {load; op; store} of the same value into a simple op 2920 // through memory of that value, if the uses of the modified value and its 2921 // address are suitable. 2922 // 2923 // The tablegen pattern memory operand pattern is currently not able to match 2924 // the case where the EFLAGS on the original operation are used. 2925 // 2926 // To move this to tablegen, we'll need to improve tablegen to allow flags to 2927 // be transferred from a node in the pattern to the result node, probably with 2928 // a new keyword. For example, we have this 2929 // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", 2930 // [(store (add (loadi64 addr:$dst), -1), addr:$dst), 2931 // (implicit EFLAGS)]>; 2932 // but maybe need something like this 2933 // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", 2934 // [(store (add (loadi64 addr:$dst), -1), addr:$dst), 2935 // (transferrable EFLAGS)]>; 2936 // 2937 // Until then, we manually fold these and instruction select the operation 2938 // here. 2939 bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) { 2940 StoreSDNode *StoreNode = cast<StoreSDNode>(Node); 2941 SDValue StoredVal = StoreNode->getOperand(1); 2942 unsigned Opc = StoredVal->getOpcode(); 2943 2944 // Before we try to select anything, make sure this is memory operand size 2945 // and opcode we can handle. 
Note that this must match the code below that 2946 // actually lowers the opcodes. 2947 EVT MemVT = StoreNode->getMemoryVT(); 2948 if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 && 2949 MemVT != MVT::i8) 2950 return false; 2951 2952 bool IsCommutable = false; 2953 bool IsNegate = false; 2954 switch (Opc) { 2955 default: 2956 return false; 2957 case X86ISD::SUB: 2958 IsNegate = isNullConstant(StoredVal.getOperand(0)); 2959 break; 2960 case X86ISD::SBB: 2961 break; 2962 case X86ISD::ADD: 2963 case X86ISD::ADC: 2964 case X86ISD::AND: 2965 case X86ISD::OR: 2966 case X86ISD::XOR: 2967 IsCommutable = true; 2968 break; 2969 } 2970 2971 unsigned LoadOpNo = IsNegate ? 1 : 0; 2972 LoadSDNode *LoadNode = nullptr; 2973 SDValue InputChain; 2974 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo, 2975 LoadNode, InputChain)) { 2976 if (!IsCommutable) 2977 return false; 2978 2979 // This operation is commutable, try the other operand. 2980 LoadOpNo = 1; 2981 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo, 2982 LoadNode, InputChain)) 2983 return false; 2984 } 2985 2986 SDValue Base, Scale, Index, Disp, Segment; 2987 if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp, 2988 Segment)) 2989 return false; 2990 2991 auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16, 2992 unsigned Opc8) { 2993 switch (MemVT.getSimpleVT().SimpleTy) { 2994 case MVT::i64: 2995 return Opc64; 2996 case MVT::i32: 2997 return Opc32; 2998 case MVT::i16: 2999 return Opc16; 3000 case MVT::i8: 3001 return Opc8; 3002 default: 3003 llvm_unreachable("Invalid size!"); 3004 } 3005 }; 3006 3007 MachineSDNode *Result; 3008 switch (Opc) { 3009 case X86ISD::SUB: 3010 // Handle negate. 3011 if (IsNegate) { 3012 unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m, 3013 X86::NEG8m); 3014 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain}; 3015 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, 3016 MVT::Other, Ops); 3017 break; 3018 } 3019 LLVM_FALLTHROUGH; 3020 case X86ISD::ADD: 3021 // Try to match inc/dec. 3022 if (!Subtarget->slowIncDec() || OptForSize) { 3023 bool IsOne = isOneConstant(StoredVal.getOperand(1)); 3024 bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1)); 3025 // ADD/SUB with 1/-1 and carry flag isn't used can use inc/dec. 3026 if ((IsOne || IsNegOne) && hasNoCarryFlagUses(StoredVal.getValue(1))) { 3027 unsigned NewOpc = 3028 ((Opc == X86ISD::ADD) == IsOne) 3029 ? 
SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m) 3030 : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m); 3031 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain}; 3032 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, 3033 MVT::Other, Ops); 3034 break; 3035 } 3036 } 3037 LLVM_FALLTHROUGH; 3038 case X86ISD::ADC: 3039 case X86ISD::SBB: 3040 case X86ISD::AND: 3041 case X86ISD::OR: 3042 case X86ISD::XOR: { 3043 auto SelectRegOpcode = [SelectOpcode](unsigned Opc) { 3044 switch (Opc) { 3045 case X86ISD::ADD: 3046 return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr, 3047 X86::ADD8mr); 3048 case X86ISD::ADC: 3049 return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr, 3050 X86::ADC8mr); 3051 case X86ISD::SUB: 3052 return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr, 3053 X86::SUB8mr); 3054 case X86ISD::SBB: 3055 return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr, 3056 X86::SBB8mr); 3057 case X86ISD::AND: 3058 return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr, 3059 X86::AND8mr); 3060 case X86ISD::OR: 3061 return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr); 3062 case X86ISD::XOR: 3063 return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr, 3064 X86::XOR8mr); 3065 default: 3066 llvm_unreachable("Invalid opcode!"); 3067 } 3068 }; 3069 auto SelectImm8Opcode = [SelectOpcode](unsigned Opc) { 3070 switch (Opc) { 3071 case X86ISD::ADD: 3072 return SelectOpcode(X86::ADD64mi8, X86::ADD32mi8, X86::ADD16mi8, 0); 3073 case X86ISD::ADC: 3074 return SelectOpcode(X86::ADC64mi8, X86::ADC32mi8, X86::ADC16mi8, 0); 3075 case X86ISD::SUB: 3076 return SelectOpcode(X86::SUB64mi8, X86::SUB32mi8, X86::SUB16mi8, 0); 3077 case X86ISD::SBB: 3078 return SelectOpcode(X86::SBB64mi8, X86::SBB32mi8, X86::SBB16mi8, 0); 3079 case X86ISD::AND: 3080 return SelectOpcode(X86::AND64mi8, X86::AND32mi8, X86::AND16mi8, 0); 3081 case X86ISD::OR: 3082 return SelectOpcode(X86::OR64mi8, X86::OR32mi8, X86::OR16mi8, 0); 3083 case X86ISD::XOR: 3084 return SelectOpcode(X86::XOR64mi8, X86::XOR32mi8, X86::XOR16mi8, 0); 3085 default: 3086 llvm_unreachable("Invalid opcode!"); 3087 } 3088 }; 3089 auto SelectImmOpcode = [SelectOpcode](unsigned Opc) { 3090 switch (Opc) { 3091 case X86ISD::ADD: 3092 return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi, 3093 X86::ADD8mi); 3094 case X86ISD::ADC: 3095 return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi, 3096 X86::ADC8mi); 3097 case X86ISD::SUB: 3098 return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi, 3099 X86::SUB8mi); 3100 case X86ISD::SBB: 3101 return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi, 3102 X86::SBB8mi); 3103 case X86ISD::AND: 3104 return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi, 3105 X86::AND8mi); 3106 case X86ISD::OR: 3107 return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi, 3108 X86::OR8mi); 3109 case X86ISD::XOR: 3110 return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi, 3111 X86::XOR8mi); 3112 default: 3113 llvm_unreachable("Invalid opcode!"); 3114 } 3115 }; 3116 3117 unsigned NewOpc = SelectRegOpcode(Opc); 3118 SDValue Operand = StoredVal->getOperand(1-LoadOpNo); 3119 3120 // See if the operand is a constant that we can fold into an immediate 3121 // operand. 
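    // E.g. an add of 128 does not fit in an imm8, but when the carry flag is
    // unused it can be rewritten as a sub of -128, which does.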
    if (auto *OperandC = dyn_cast<ConstantSDNode>(Operand)) {
      int64_t OperandV = OperandC->getSExtValue();

      // Check if we can shrink the operand enough to fit in an immediate (or
      // fit into a smaller immediate) by negating it and switching the
      // operation.
      if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) &&
          ((MemVT != MVT::i8 && !isInt<8>(OperandV) && isInt<8>(-OperandV)) ||
           (MemVT == MVT::i64 && !isInt<32>(OperandV) &&
            isInt<32>(-OperandV))) &&
          hasNoCarryFlagUses(StoredVal.getValue(1))) {
        OperandV = -OperandV;
        Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
      }

      // First try to fit this into an Imm8 operand. If it doesn't fit, then
      // try the larger immediate operand.
      if (MemVT != MVT::i8 && isInt<8>(OperandV)) {
        Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
        NewOpc = SelectImm8Opcode(Opc);
      } else if (MemVT != MVT::i64 || isInt<32>(OperandV)) {
        Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
        NewOpc = SelectImmOpcode(Opc);
      }
    }

    if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) {
      SDValue CopyTo =
          CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS,
                               StoredVal.getOperand(2), SDValue());

      const SDValue Ops[] = {Base, Scale, Index, Disp,
                             Segment, Operand, CopyTo, CopyTo.getValue(1)};
      Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
                                      Ops);
    } else {
      const SDValue Ops[] = {Base, Scale, Index, Disp,
                             Segment, Operand, InputChain};
      Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
                                      Ops);
    }
    break;
  }
  default:
    llvm_unreachable("Invalid opcode!");
  }

  MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(),
                                 LoadNode->getMemOperand()};
  CurDAG->setNodeMemRefs(Result, MemOps);

  // Update Load Chain uses as well.
  ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1));
  ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
  ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
  CurDAG->RemoveDeadNode(Node);
  return true;
}

// See if this is an X & Mask that we can match to BEXTR/BZHI, where Mask is
// one of the following patterns:
// a) x & (1 << nbits) - 1
// b) x & ~(-1 << nbits)
// c) x & (-1 >> (32 - y))
// d) x << (32 - y) >> (32 - y)
bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
  assert(
      (Node->getOpcode() == ISD::AND || Node->getOpcode() == ISD::SRL) &&
      "Should be either an and-mask, or right-shift after clearing high bits.");

  // BEXTR is a BMI instruction, BZHI is a BMI2 instruction. We need at least
  // one of them.
  if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())
    return false;

  MVT NVT = Node->getSimpleValueType(0);

  // Only supported for 32 and 64 bits.
  if (NVT != MVT::i32 && NVT != MVT::i64)
    return false;

  SDValue NBits;

  // If we have BMI2's BZHI, we are ok with multi-use patterns.
  // Else, if we only have BMI1's BEXTR, we require one-use.
3206 const bool CanHaveExtraUses = Subtarget->hasBMI2(); 3207 auto checkUses = [CanHaveExtraUses](SDValue Op, unsigned NUses) { 3208 return CanHaveExtraUses || 3209 Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo()); 3210 }; 3211 auto checkOneUse = [checkUses](SDValue Op) { return checkUses(Op, 1); }; 3212 auto checkTwoUse = [checkUses](SDValue Op) { return checkUses(Op, 2); }; 3213 3214 auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) { 3215 if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) { 3216 assert(V.getSimpleValueType() == MVT::i32 && 3217 V.getOperand(0).getSimpleValueType() == MVT::i64 && 3218 "Expected i64 -> i32 truncation"); 3219 V = V.getOperand(0); 3220 } 3221 return V; 3222 }; 3223 3224 // a) x & ((1 << nbits) + (-1)) 3225 auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation, 3226 &NBits](SDValue Mask) -> bool { 3227 // Match `add`. Must only have one use! 3228 if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask)) 3229 return false; 3230 // We should be adding all-ones constant (i.e. subtracting one.) 3231 if (!isAllOnesConstant(Mask->getOperand(1))) 3232 return false; 3233 // Match `1 << nbits`. Might be truncated. Must only have one use! 3234 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0)); 3235 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0)) 3236 return false; 3237 if (!isOneConstant(M0->getOperand(0))) 3238 return false; 3239 NBits = M0->getOperand(1); 3240 return true; 3241 }; 3242 3243 auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) { 3244 V = peekThroughOneUseTruncation(V); 3245 return CurDAG->MaskedValueIsAllOnes( 3246 V, APInt::getLowBitsSet(V.getSimpleValueType().getSizeInBits(), 3247 NVT.getSizeInBits())); 3248 }; 3249 3250 // b) x & ~(-1 << nbits) 3251 auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation, 3252 &NBits](SDValue Mask) -> bool { 3253 // Match `~()`. Must only have one use! 3254 if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask)) 3255 return false; 3256 // The -1 only has to be all-ones for the final Node's NVT. 3257 if (!isAllOnes(Mask->getOperand(1))) 3258 return false; 3259 // Match `-1 << nbits`. Might be truncated. Must only have one use! 3260 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0)); 3261 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0)) 3262 return false; 3263 // The -1 only has to be all-ones for the final Node's NVT. 3264 if (!isAllOnes(M0->getOperand(0))) 3265 return false; 3266 NBits = M0->getOperand(1); 3267 return true; 3268 }; 3269 3270 // Match potentially-truncated (bitwidth - y) 3271 auto matchShiftAmt = [checkOneUse, &NBits](SDValue ShiftAmt, 3272 unsigned Bitwidth) { 3273 // Skip over a truncate of the shift amount. 3274 if (ShiftAmt.getOpcode() == ISD::TRUNCATE) { 3275 ShiftAmt = ShiftAmt.getOperand(0); 3276 // The trunc should have been the only user of the real shift amount. 3277 if (!checkOneUse(ShiftAmt)) 3278 return false; 3279 } 3280 // Match the shift amount as: (bitwidth - y). It should go away, too. 3281 if (ShiftAmt.getOpcode() != ISD::SUB) 3282 return false; 3283 auto V0 = dyn_cast<ConstantSDNode>(ShiftAmt.getOperand(0)); 3284 if (!V0 || V0->getZExtValue() != Bitwidth) 3285 return false; 3286 NBits = ShiftAmt.getOperand(1); 3287 return true; 3288 }; 3289 3290 // c) x & (-1 >> (32 - y)) 3291 auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation, 3292 matchShiftAmt](SDValue Mask) -> bool { 3293 // The mask itself may be truncated. 
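    // e.g. the mask may be (i32 (trunc (srl (i64 -1), (sub 64, y)))); in that
    // case the shift is matched on the wider i64 type and Bitwidth below is
    // taken from that pre-truncation type.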
    Mask = peekThroughOneUseTruncation(Mask);
    unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits();
    // Match `l>>`. Must only have one use!
    if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask))
      return false;
    // We should be shifting a truly all-ones constant.
    if (!isAllOnesConstant(Mask.getOperand(0)))
      return false;
    SDValue M1 = Mask.getOperand(1);
    // The shift amount should not be used externally.
    if (!checkOneUse(M1))
      return false;
    return matchShiftAmt(M1, Bitwidth);
  };

  SDValue X;

  // d) x << (32 - y) >> (32 - y)
  auto matchPatternD = [checkOneUse, checkTwoUse, matchShiftAmt,
                        &X](SDNode *Node) -> bool {
    if (Node->getOpcode() != ISD::SRL)
      return false;
    SDValue N0 = Node->getOperand(0);
    if (N0->getOpcode() != ISD::SHL || !checkOneUse(N0))
      return false;
    unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits();
    SDValue N1 = Node->getOperand(1);
    SDValue N01 = N0->getOperand(1);
    // Both of the shifts must be by the exact same value.
    // There should not be any uses of the shift amount outside of the pattern.
    if (N1 != N01 || !checkTwoUse(N1))
      return false;
    if (!matchShiftAmt(N1, Bitwidth))
      return false;
    X = N0->getOperand(0);
    return true;
  };

  auto matchLowBitMask = [matchPatternA, matchPatternB,
                          matchPatternC](SDValue Mask) -> bool {
    return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask);
  };

  if (Node->getOpcode() == ISD::AND) {
    X = Node->getOperand(0);
    SDValue Mask = Node->getOperand(1);

    if (matchLowBitMask(Mask)) {
      // Great.
    } else {
      std::swap(X, Mask);
      if (!matchLowBitMask(Mask))
        return false;
    }
  } else if (!matchPatternD(Node))
    return false;

  SDLoc DL(Node);

  // Truncate the shift amount.
  NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
  insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);

  // Insert the 8-bit NBits into the lowest 8 bits of a 32-bit register.
  // All the other bits are undefined; we do not care about them.
  SDValue ImplDef = SDValue(
      CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0);
  insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef);

  SDValue SRIdxVal = CurDAG->getTargetConstant(X86::sub_8bit, DL, MVT::i32);
  insertDAGNode(*CurDAG, SDValue(Node, 0), SRIdxVal);
  NBits = SDValue(
      CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::i32, ImplDef,
                             NBits, SRIdxVal), 0);
  insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);

  if (Subtarget->hasBMI2()) {
    // Great, just emit the BZHI.
    if (NVT != MVT::i32) {
      // But have to place the bit count into the wide-enough register first.
      NBits = CurDAG->getNode(ISD::ANY_EXTEND, DL, NVT, NBits);
      insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
    }

    SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits);
    ReplaceNode(Node, Extract.getNode());
    SelectCode(Extract.getNode());
    return true;
  }

  // Else, if we do *NOT* have BMI2, let's find out if the 'X' is *logically*
  // shifted (potentially with a one-use trunc in between), and whether the
  // truncation was the only use of the shift; if so, look past the one-use
  // truncation.
  {
    SDValue RealX = peekThroughOneUseTruncation(X);
    // FIXME: only if the shift is one-use?
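    // Looking past the truncation lets the logical shift below be folded into
    // BEXTR's control byte instead of being emitted as a separate shift.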
3391 if (RealX != X && RealX.getOpcode() == ISD::SRL) 3392 X = RealX; 3393 } 3394 3395 MVT XVT = X.getSimpleValueType(); 3396 3397 // Else, emitting BEXTR requires one more step. 3398 // The 'control' of BEXTR has the pattern of: 3399 // [15...8 bit][ 7...0 bit] location 3400 // [ bit count][ shift] name 3401 // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11 3402 3403 // Shift NBits left by 8 bits, thus producing 'control'. 3404 // This makes the low 8 bits to be zero. 3405 SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8); 3406 SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8); 3407 insertDAGNode(*CurDAG, SDValue(Node, 0), Control); 3408 3409 // If the 'X' is *logically* shifted, we can fold that shift into 'control'. 3410 // FIXME: only if the shift is one-use? 3411 if (X.getOpcode() == ISD::SRL) { 3412 SDValue ShiftAmt = X.getOperand(1); 3413 X = X.getOperand(0); 3414 3415 assert(ShiftAmt.getValueType() == MVT::i8 && 3416 "Expected shift amount to be i8"); 3417 3418 // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero! 3419 // We could zext to i16 in some form, but we intentionally don't do that. 3420 SDValue OrigShiftAmt = ShiftAmt; 3421 ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt); 3422 insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt); 3423 3424 // And now 'or' these low 8 bits of shift amount into the 'control'. 3425 Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt); 3426 insertDAGNode(*CurDAG, SDValue(Node, 0), Control); 3427 } 3428 3429 // But have to place the 'control' into the wide-enough register first. 3430 if (XVT != MVT::i32) { 3431 Control = CurDAG->getNode(ISD::ANY_EXTEND, DL, XVT, Control); 3432 insertDAGNode(*CurDAG, SDValue(Node, 0), Control); 3433 } 3434 3435 // And finally, form the BEXTR itself. 3436 SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, XVT, X, Control); 3437 3438 // The 'X' was originally truncated. Do that now. 3439 if (XVT != NVT) { 3440 insertDAGNode(*CurDAG, SDValue(Node, 0), Extract); 3441 Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract); 3442 } 3443 3444 ReplaceNode(Node, Extract.getNode()); 3445 SelectCode(Extract.getNode()); 3446 3447 return true; 3448 } 3449 3450 // See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI. 3451 MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) { 3452 MVT NVT = Node->getSimpleValueType(0); 3453 SDLoc dl(Node); 3454 3455 SDValue N0 = Node->getOperand(0); 3456 SDValue N1 = Node->getOperand(1); 3457 3458 // If we have TBM we can use an immediate for the control. If we have BMI 3459 // we should only do this if the BEXTR instruction is implemented well. 3460 // Otherwise moving the control into a register makes this more costly. 3461 // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM 3462 // hoisting the move immediate would make it worthwhile with a less optimal 3463 // BEXTR? 3464 bool PreferBEXTR = 3465 Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR()); 3466 if (!PreferBEXTR && !Subtarget->hasBMI2()) 3467 return nullptr; 3468 3469 // Must have a shift right. 3470 if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA) 3471 return nullptr; 3472 3473 // Shift can't have additional users. 3474 if (!N0->hasOneUse()) 3475 return nullptr; 3476 3477 // Only supported for 32 and 64 bits. 3478 if (NVT != MVT::i32 && NVT != MVT::i64) 3479 return nullptr; 3480 3481 // Shift amount and RHS of and must be constant. 
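  // I.e. the node must look like (and (srl/sra X, C1), C2) with C1 and C2
  // both immediates; variable-width extracts are handled by matchBitExtract.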
  ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(N1);
  ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
  if (!MaskCst || !ShiftCst)
    return nullptr;

  // And the RHS of the 'and' must be a mask.
  uint64_t Mask = MaskCst->getZExtValue();
  if (!isMask_64(Mask))
    return nullptr;

  uint64_t Shift = ShiftCst->getZExtValue();
  uint64_t MaskSize = countPopulation(Mask);

  // Don't interfere with something that can be handled by extracting AH.
  // TODO: If we are able to fold a load, BEXTR might still be better than AH.
  if (Shift == 8 && MaskSize == 8)
    return nullptr;

  // Make sure we are only using bits that were in the original value, not
  // shifted in.
  if (Shift + MaskSize > NVT.getSizeInBits())
    return nullptr;

  // BZHI, if available, is always fast, unlike BEXTR. But even if we decide
  // that we can't use BEXTR, it is only worthwhile using BZHI if the mask
  // does not fit into 32 bits. Load folding is not a sufficient reason.
  if (!PreferBEXTR && MaskSize <= 32)
    return nullptr;

  SDValue Control;
  unsigned ROpc, MOpc;

  if (!PreferBEXTR) {
    assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then.");
    // If we can't make use of BEXTR then we can't fuse the shift+mask stages.
    // Let's perform the mask first and apply the shift later. Note that we
    // need to widen the mask to account for the fact that we'll apply the
    // shift afterwards!
    Control = CurDAG->getTargetConstant(Shift + MaskSize, dl, NVT);
    ROpc = NVT == MVT::i64 ? X86::BZHI64rr : X86::BZHI32rr;
    MOpc = NVT == MVT::i64 ? X86::BZHI64rm : X86::BZHI32rm;
    unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
    Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
  } else {
    // The 'control' of BEXTR has the pattern of:
    // [15...8 bit][ 7...0 bit] location
    // [ bit count][ shift] name
    // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11
    Control = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
    if (Subtarget->hasTBM()) {
      ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
      MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
    } else {
      assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then.");
      // BMI requires the immediate to be placed in a register.
      ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr;
      MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm;
      unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
      Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
    }
  }

  MachineSDNode *NewNode;
  SDValue Input = N0->getOperand(0);
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
    SDValue Ops[] = {
        Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(0)};
    SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
    NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
    // Update the chain.
    ReplaceUses(Input.getValue(1), SDValue(NewNode, 2));
    // Record the mem-refs
    CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
  } else {
    NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, Control);
  }

  if (!PreferBEXTR) {
    // We still need to apply the shift.
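    // The BZHI above only cleared the bits from Shift + MaskSize upward;
    // shifting right by Shift now drops the low bits and completes the
    // extract.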
3561 SDValue ShAmt = CurDAG->getTargetConstant(Shift, dl, NVT); 3562 unsigned NewOpc = NVT == MVT::i64 ? X86::SHR64ri : X86::SHR32ri; 3563 NewNode = 3564 CurDAG->getMachineNode(NewOpc, dl, NVT, SDValue(NewNode, 0), ShAmt); 3565 } 3566 3567 return NewNode; 3568 } 3569 3570 // Emit a PCMISTR(I/M) instruction. 3571 MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc, 3572 bool MayFoldLoad, const SDLoc &dl, 3573 MVT VT, SDNode *Node) { 3574 SDValue N0 = Node->getOperand(0); 3575 SDValue N1 = Node->getOperand(1); 3576 SDValue Imm = Node->getOperand(2); 3577 const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue(); 3578 Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType()); 3579 3580 // Try to fold a load. No need to check alignment. 3581 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; 3582 if (MayFoldLoad && tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { 3583 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm, 3584 N1.getOperand(0) }; 3585 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other); 3586 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); 3587 // Update the chain. 3588 ReplaceUses(N1.getValue(1), SDValue(CNode, 2)); 3589 // Record the mem-refs 3590 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()}); 3591 return CNode; 3592 } 3593 3594 SDValue Ops[] = { N0, N1, Imm }; 3595 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32); 3596 MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops); 3597 return CNode; 3598 } 3599 3600 // Emit a PCMESTR(I/M) instruction. Also return the Glue result in case we need 3601 // to emit a second instruction after this one. This is needed since we have two 3602 // copyToReg nodes glued before this and we need to continue that glue through. 3603 MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc, 3604 bool MayFoldLoad, const SDLoc &dl, 3605 MVT VT, SDNode *Node, 3606 SDValue &InFlag) { 3607 SDValue N0 = Node->getOperand(0); 3608 SDValue N2 = Node->getOperand(2); 3609 SDValue Imm = Node->getOperand(4); 3610 const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue(); 3611 Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType()); 3612 3613 // Try to fold a load. No need to check alignment. 3614 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; 3615 if (MayFoldLoad && tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { 3616 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm, 3617 N2.getOperand(0), InFlag }; 3618 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue); 3619 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); 3620 InFlag = SDValue(CNode, 3); 3621 // Update the chain. 3622 ReplaceUses(N2.getValue(1), SDValue(CNode, 2)); 3623 // Record the mem-refs 3624 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N2)->getMemOperand()}); 3625 return CNode; 3626 } 3627 3628 SDValue Ops[] = { N0, N2, Imm, InFlag }; 3629 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue); 3630 MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops); 3631 InFlag = SDValue(CNode, 2); 3632 return CNode; 3633 } 3634 3635 bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) { 3636 EVT VT = N->getValueType(0); 3637 3638 // Only handle scalar shifts. 3639 if (VT.isVector()) 3640 return false; 3641 3642 // Narrower shifts only mask to 5 bits in hardware. 3643 unsigned Size = VT == MVT::i64 ? 
64 : 32; 3644 3645 SDValue OrigShiftAmt = N->getOperand(1); 3646 SDValue ShiftAmt = OrigShiftAmt; 3647 SDLoc DL(N); 3648 3649 // Skip over a truncate of the shift amount. 3650 if (ShiftAmt->getOpcode() == ISD::TRUNCATE) 3651 ShiftAmt = ShiftAmt->getOperand(0); 3652 3653 // This function is called after X86DAGToDAGISel::matchBitExtract(), 3654 // so we are not afraid that we might mess up BZHI/BEXTR pattern. 3655 3656 SDValue NewShiftAmt; 3657 if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB) { 3658 SDValue Add0 = ShiftAmt->getOperand(0); 3659 SDValue Add1 = ShiftAmt->getOperand(1); 3660 // If we are shifting by X+/-N where N == 0 mod Size, then just shift by X 3661 // to avoid the ADD/SUB. 3662 if (isa<ConstantSDNode>(Add1) && 3663 cast<ConstantSDNode>(Add1)->getZExtValue() % Size == 0) { 3664 NewShiftAmt = Add0; 3665 // If we are shifting by N-X where N == 0 mod Size, then just shift by -X to 3666 // generate a NEG instead of a SUB of a constant. 3667 } else if (ShiftAmt->getOpcode() == ISD::SUB && 3668 isa<ConstantSDNode>(Add0) && 3669 cast<ConstantSDNode>(Add0)->getZExtValue() != 0 && 3670 cast<ConstantSDNode>(Add0)->getZExtValue() % Size == 0) { 3671 // Insert a negate op. 3672 // TODO: This isn't guaranteed to replace the sub if there is a logic cone 3673 // that uses it that's not a shift. 3674 EVT SubVT = ShiftAmt.getValueType(); 3675 SDValue Zero = CurDAG->getConstant(0, DL, SubVT); 3676 SDValue Neg = CurDAG->getNode(ISD::SUB, DL, SubVT, Zero, Add1); 3677 NewShiftAmt = Neg; 3678 3679 // Insert these operands into a valid topological order so they can 3680 // get selected independently. 3681 insertDAGNode(*CurDAG, OrigShiftAmt, Zero); 3682 insertDAGNode(*CurDAG, OrigShiftAmt, Neg); 3683 } else 3684 return false; 3685 } else 3686 return false; 3687 3688 if (NewShiftAmt.getValueType() != MVT::i8) { 3689 // Need to truncate the shift amount. 3690 NewShiftAmt = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NewShiftAmt); 3691 // Add to a correct topological ordering. 3692 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt); 3693 } 3694 3695 // Insert a new mask to keep the shift amount legal. This should be removed 3696 // by isel patterns. 3697 NewShiftAmt = CurDAG->getNode(ISD::AND, DL, MVT::i8, NewShiftAmt, 3698 CurDAG->getConstant(Size - 1, DL, MVT::i8)); 3699 // Place in a correct topological ordering. 3700 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt); 3701 3702 SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, N->getOperand(0), 3703 NewShiftAmt); 3704 if (UpdatedNode != N) { 3705 // If we found an existing node, we should replace ourselves with that node 3706 // and wait for it to be selected after its other users. 3707 ReplaceNode(N, UpdatedNode); 3708 return true; 3709 } 3710 3711 // If the original shift amount is now dead, delete it so that we don't run 3712 // it through isel. 3713 if (OrigShiftAmt.getNode()->use_empty()) 3714 CurDAG->RemoveDeadNode(OrigShiftAmt.getNode()); 3715 3716 // Now that we've optimized the shift amount, defer to normal isel to get 3717 // load folding and legacy vs BMI2 selection without repeating it here. 3718 SelectCode(N); 3719 return true; 3720 } 3721 3722 bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) { 3723 MVT NVT = N->getSimpleValueType(0); 3724 unsigned Opcode = N->getOpcode(); 3725 SDLoc dl(N); 3726 3727 // For operations of the form (x << C1) op C2, check if we can use a smaller 3728 // encoding for C2 by transforming it into (x op (C2>>C1)) << C1. 
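  // A purely illustrative example:
  //   (or (shl X, 8), 0x1200)  -->  (shl (or X, 0x12), 8)
  // which turns a 32-bit immediate encoding into an 8-bit one.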
  SDValue Shift = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
  if (!Cst)
    return false;

  int64_t Val = Cst->getSExtValue();

  // If we have an any_extend feeding the AND, look through it to see if there
  // is a shift behind it. But only if the AND doesn't use the extended bits.
  // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
  bool FoundAnyExtend = false;
  if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
      Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
      isUInt<32>(Val)) {
    FoundAnyExtend = true;
    Shift = Shift.getOperand(0);
  }

  if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
    return false;

  // i8 is unshrinkable, i16 should be promoted to i32.
  if (NVT != MVT::i32 && NVT != MVT::i64)
    return false;

  ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
  if (!ShlCst)
    return false;

  uint64_t ShAmt = ShlCst->getZExtValue();

  // Make sure that we don't change the operation by removing bits.
  // This only matters for OR and XOR; AND is unaffected.
  uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
  if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
    return false;

  // Check the minimum bitwidth for the new constant.
  // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
  auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
    if (Opcode == ISD::AND) {
      // AND32ri is the same as AND64ri32 with zext imm.
      // Try this before sign-extended immediates below.
      ShiftedVal = (uint64_t)Val >> ShAmt;
      if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
        return true;
      // Also swap order when the AND can become MOVZX.
      if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX)
        return true;
    }
    ShiftedVal = Val >> ShAmt;
    if ((!isInt<8>(Val) && isInt<8>(ShiftedVal)) ||
        (!isInt<32>(Val) && isInt<32>(ShiftedVal)))
      return true;
    if (Opcode != ISD::AND) {
      // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
      ShiftedVal = (uint64_t)Val >> ShAmt;
      if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
        return true;
    }
    return false;
  };

  int64_t ShiftedVal;
  if (!CanShrinkImmediate(ShiftedVal))
    return false;

  // Ok, we can reorder to get a smaller immediate.

  // But it's possible the original immediate allowed an AND to become MOVZX.
  // Do this check late so that the MaskedValueIsZero call is deferred as long
  // as possible.
  if (Opcode == ISD::AND) {
    // Find the smallest zext this could possibly be.
    unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
    ZExtWidth = PowerOf2Ceil(std::max(ZExtWidth, 8U));

    // Figure out which bits need to be zero to achieve that mask.
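    // If those bits are already known to be zero in the other operand, the
    // original AND already behaves like a zero-extension and can be selected
    // as a MOVZX, so bail out rather than reordering.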
3809 APInt NeededMask = APInt::getLowBitsSet(NVT.getSizeInBits(), 3810 ZExtWidth); 3811 NeededMask &= ~Cst->getAPIntValue(); 3812 3813 if (CurDAG->MaskedValueIsZero(N->getOperand(0), NeededMask)) 3814 return false; 3815 } 3816 3817 SDValue X = Shift.getOperand(0); 3818 if (FoundAnyExtend) { 3819 SDValue NewX = CurDAG->getNode(ISD::ANY_EXTEND, dl, NVT, X); 3820 insertDAGNode(*CurDAG, SDValue(N, 0), NewX); 3821 X = NewX; 3822 } 3823 3824 SDValue NewCst = CurDAG->getConstant(ShiftedVal, dl, NVT); 3825 insertDAGNode(*CurDAG, SDValue(N, 0), NewCst); 3826 SDValue NewBinOp = CurDAG->getNode(Opcode, dl, NVT, X, NewCst); 3827 insertDAGNode(*CurDAG, SDValue(N, 0), NewBinOp); 3828 SDValue NewSHL = CurDAG->getNode(ISD::SHL, dl, NVT, NewBinOp, 3829 Shift.getOperand(1)); 3830 ReplaceNode(N, NewSHL.getNode()); 3831 SelectCode(NewSHL.getNode()); 3832 return true; 3833 } 3834 3835 /// Convert vector increment or decrement to sub/add with an all-ones constant: 3836 /// add X, <1, 1...> --> sub X, <-1, -1...> 3837 /// sub X, <1, 1...> --> add X, <-1, -1...> 3838 /// The all-ones vector constant can be materialized using a pcmpeq instruction 3839 /// that is commonly recognized as an idiom (has no register dependency), so 3840 /// that's better/smaller than loading a splat 1 constant. 3841 bool X86DAGToDAGISel::combineIncDecVector(SDNode *Node) { 3842 assert((Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::SUB) && 3843 "Unexpected opcode for increment/decrement transform"); 3844 3845 EVT VT = Node->getValueType(0); 3846 assert(VT.isVector() && "Should only be called for vectors."); 3847 3848 SDValue X = Node->getOperand(0); 3849 SDValue OneVec = Node->getOperand(1); 3850 3851 APInt SplatVal; 3852 if (!X86::isConstantSplat(OneVec, SplatVal) || !SplatVal.isOneValue()) 3853 return false; 3854 3855 SDLoc DL(Node); 3856 SDValue OneConstant, AllOnesVec; 3857 3858 APInt Ones = APInt::getAllOnesValue(32); 3859 assert(VT.getSizeInBits() % 32 == 0 && 3860 "Expected bit count to be a multiple of 32"); 3861 OneConstant = CurDAG->getConstant(Ones, DL, MVT::i32); 3862 insertDAGNode(*CurDAG, X, OneConstant); 3863 3864 unsigned NumElts = VT.getSizeInBits() / 32; 3865 assert(NumElts > 0 && "Expected to get non-empty vector."); 3866 AllOnesVec = CurDAG->getSplatBuildVector(MVT::getVectorVT(MVT::i32, NumElts), 3867 DL, OneConstant); 3868 insertDAGNode(*CurDAG, X, AllOnesVec); 3869 3870 AllOnesVec = CurDAG->getBitcast(VT, AllOnesVec); 3871 insertDAGNode(*CurDAG, X, AllOnesVec); 3872 3873 unsigned NewOpcode = Node->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD; 3874 SDValue NewNode = CurDAG->getNode(NewOpcode, DL, VT, X, AllOnesVec); 3875 3876 ReplaceNode(Node, NewNode.getNode()); 3877 SelectCode(NewNode.getNode()); 3878 return true; 3879 } 3880 3881 /// If the high bits of an 'and' operand are known zero, try setting the 3882 /// high bits of an 'and' constant operand to produce a smaller encoding by 3883 /// creating a small, sign-extended negative immediate rather than a large 3884 /// positive one. This reverses a transform in SimplifyDemandedBits that 3885 /// shrinks mask constants by clearing bits. There is also a possibility that 3886 /// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that 3887 /// case, just replace the 'and'. Return 'true' if the node is replaced. 3888 bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) { 3889 // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't 3890 // have immediate operands. 
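  // Illustrative i32 case: (and X, 0x0ffffff0), with the top four bits of X
  // known zero, can instead use the mask 0xfffffff0, i.e. the sign-extended
  // 8-bit immediate -16.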
3891 MVT VT = And->getSimpleValueType(0); 3892 if (VT != MVT::i32 && VT != MVT::i64) 3893 return false; 3894 3895 auto *And1C = dyn_cast<ConstantSDNode>(And->getOperand(1)); 3896 if (!And1C) 3897 return false; 3898 3899 // Bail out if the mask constant is already negative. It's can't shrink more. 3900 // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel 3901 // patterns to use a 32-bit and instead of a 64-bit and by relying on the 3902 // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits 3903 // are negative too. 3904 APInt MaskVal = And1C->getAPIntValue(); 3905 unsigned MaskLZ = MaskVal.countLeadingZeros(); 3906 if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32)) 3907 return false; 3908 3909 // Don't extend into the upper 32 bits of a 64 bit mask. 3910 if (VT == MVT::i64 && MaskLZ >= 32) { 3911 MaskLZ -= 32; 3912 MaskVal = MaskVal.trunc(32); 3913 } 3914 3915 SDValue And0 = And->getOperand(0); 3916 APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ); 3917 APInt NegMaskVal = MaskVal | HighZeros; 3918 3919 // If a negative constant would not allow a smaller encoding, there's no need 3920 // to continue. Only change the constant when we know it's a win. 3921 unsigned MinWidth = NegMaskVal.getMinSignedBits(); 3922 if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getMinSignedBits() <= 32)) 3923 return false; 3924 3925 // Extend masks if we truncated above. 3926 if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) { 3927 NegMaskVal = NegMaskVal.zext(64); 3928 HighZeros = HighZeros.zext(64); 3929 } 3930 3931 // The variable operand must be all zeros in the top bits to allow using the 3932 // new, negative constant as the mask. 3933 if (!CurDAG->MaskedValueIsZero(And0, HighZeros)) 3934 return false; 3935 3936 // Check if the mask is -1. In that case, this is an unnecessary instruction 3937 // that escaped earlier analysis. 3938 if (NegMaskVal.isAllOnesValue()) { 3939 ReplaceNode(And, And0.getNode()); 3940 return true; 3941 } 3942 3943 // A negative mask allows a smaller encoding. Create a new 'and' node. 3944 SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT); 3945 SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask); 3946 ReplaceNode(And, NewAnd.getNode()); 3947 SelectCode(NewAnd.getNode()); 3948 return true; 3949 } 3950 3951 static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad, 3952 bool FoldedBCast, bool Masked) { 3953 if (Masked) { 3954 if (FoldedLoad) { 3955 switch (TestVT.SimpleTy) { 3956 default: llvm_unreachable("Unexpected VT!"); 3957 case MVT::v16i8: 3958 return IsTestN ? X86::VPTESTNMBZ128rmk : X86::VPTESTMBZ128rmk; 3959 case MVT::v8i16: 3960 return IsTestN ? X86::VPTESTNMWZ128rmk : X86::VPTESTMWZ128rmk; 3961 case MVT::v4i32: 3962 return IsTestN ? X86::VPTESTNMDZ128rmk : X86::VPTESTMDZ128rmk; 3963 case MVT::v2i64: 3964 return IsTestN ? X86::VPTESTNMQZ128rmk : X86::VPTESTMQZ128rmk; 3965 case MVT::v32i8: 3966 return IsTestN ? X86::VPTESTNMBZ256rmk : X86::VPTESTMBZ256rmk; 3967 case MVT::v16i16: 3968 return IsTestN ? X86::VPTESTNMWZ256rmk : X86::VPTESTMWZ256rmk; 3969 case MVT::v8i32: 3970 return IsTestN ? X86::VPTESTNMDZ256rmk : X86::VPTESTMDZ256rmk; 3971 case MVT::v4i64: 3972 return IsTestN ? X86::VPTESTNMQZ256rmk : X86::VPTESTMQZ256rmk; 3973 case MVT::v64i8: 3974 return IsTestN ? X86::VPTESTNMBZrmk : X86::VPTESTMBZrmk; 3975 case MVT::v32i16: 3976 return IsTestN ? X86::VPTESTNMWZrmk : X86::VPTESTMWZrmk; 3977 case MVT::v16i32: 3978 return IsTestN ? 
X86::VPTESTNMDZrmk : X86::VPTESTMDZrmk; 3979 case MVT::v8i64: 3980 return IsTestN ? X86::VPTESTNMQZrmk : X86::VPTESTMQZrmk; 3981 } 3982 } 3983 3984 if (FoldedBCast) { 3985 switch (TestVT.SimpleTy) { 3986 default: llvm_unreachable("Unexpected VT!"); 3987 case MVT::v4i32: 3988 return IsTestN ? X86::VPTESTNMDZ128rmbk : X86::VPTESTMDZ128rmbk; 3989 case MVT::v2i64: 3990 return IsTestN ? X86::VPTESTNMQZ128rmbk : X86::VPTESTMQZ128rmbk; 3991 case MVT::v8i32: 3992 return IsTestN ? X86::VPTESTNMDZ256rmbk : X86::VPTESTMDZ256rmbk; 3993 case MVT::v4i64: 3994 return IsTestN ? X86::VPTESTNMQZ256rmbk : X86::VPTESTMQZ256rmbk; 3995 case MVT::v16i32: 3996 return IsTestN ? X86::VPTESTNMDZrmbk : X86::VPTESTMDZrmbk; 3997 case MVT::v8i64: 3998 return IsTestN ? X86::VPTESTNMQZrmbk : X86::VPTESTMQZrmbk; 3999 } 4000 } 4001 4002 switch (TestVT.SimpleTy) { 4003 default: llvm_unreachable("Unexpected VT!"); 4004 case MVT::v16i8: 4005 return IsTestN ? X86::VPTESTNMBZ128rrk : X86::VPTESTMBZ128rrk; 4006 case MVT::v8i16: 4007 return IsTestN ? X86::VPTESTNMWZ128rrk : X86::VPTESTMWZ128rrk; 4008 case MVT::v4i32: 4009 return IsTestN ? X86::VPTESTNMDZ128rrk : X86::VPTESTMDZ128rrk; 4010 case MVT::v2i64: 4011 return IsTestN ? X86::VPTESTNMQZ128rrk : X86::VPTESTMQZ128rrk; 4012 case MVT::v32i8: 4013 return IsTestN ? X86::VPTESTNMBZ256rrk : X86::VPTESTMBZ256rrk; 4014 case MVT::v16i16: 4015 return IsTestN ? X86::VPTESTNMWZ256rrk : X86::VPTESTMWZ256rrk; 4016 case MVT::v8i32: 4017 return IsTestN ? X86::VPTESTNMDZ256rrk : X86::VPTESTMDZ256rrk; 4018 case MVT::v4i64: 4019 return IsTestN ? X86::VPTESTNMQZ256rrk : X86::VPTESTMQZ256rrk; 4020 case MVT::v64i8: 4021 return IsTestN ? X86::VPTESTNMBZrrk : X86::VPTESTMBZrrk; 4022 case MVT::v32i16: 4023 return IsTestN ? X86::VPTESTNMWZrrk : X86::VPTESTMWZrrk; 4024 case MVT::v16i32: 4025 return IsTestN ? X86::VPTESTNMDZrrk : X86::VPTESTMDZrrk; 4026 case MVT::v8i64: 4027 return IsTestN ? X86::VPTESTNMQZrrk : X86::VPTESTMQZrrk; 4028 } 4029 } 4030 4031 if (FoldedLoad) { 4032 switch (TestVT.SimpleTy) { 4033 default: llvm_unreachable("Unexpected VT!"); 4034 case MVT::v16i8: 4035 return IsTestN ? X86::VPTESTNMBZ128rm : X86::VPTESTMBZ128rm; 4036 case MVT::v8i16: 4037 return IsTestN ? X86::VPTESTNMWZ128rm : X86::VPTESTMWZ128rm; 4038 case MVT::v4i32: 4039 return IsTestN ? X86::VPTESTNMDZ128rm : X86::VPTESTMDZ128rm; 4040 case MVT::v2i64: 4041 return IsTestN ? X86::VPTESTNMQZ128rm : X86::VPTESTMQZ128rm; 4042 case MVT::v32i8: 4043 return IsTestN ? X86::VPTESTNMBZ256rm : X86::VPTESTMBZ256rm; 4044 case MVT::v16i16: 4045 return IsTestN ? X86::VPTESTNMWZ256rm : X86::VPTESTMWZ256rm; 4046 case MVT::v8i32: 4047 return IsTestN ? X86::VPTESTNMDZ256rm : X86::VPTESTMDZ256rm; 4048 case MVT::v4i64: 4049 return IsTestN ? X86::VPTESTNMQZ256rm : X86::VPTESTMQZ256rm; 4050 case MVT::v64i8: 4051 return IsTestN ? X86::VPTESTNMBZrm : X86::VPTESTMBZrm; 4052 case MVT::v32i16: 4053 return IsTestN ? X86::VPTESTNMWZrm : X86::VPTESTMWZrm; 4054 case MVT::v16i32: 4055 return IsTestN ? X86::VPTESTNMDZrm : X86::VPTESTMDZrm; 4056 case MVT::v8i64: 4057 return IsTestN ? X86::VPTESTNMQZrm : X86::VPTESTMQZrm; 4058 } 4059 } 4060 4061 if (FoldedBCast) { 4062 switch (TestVT.SimpleTy) { 4063 default: llvm_unreachable("Unexpected VT!"); 4064 case MVT::v4i32: 4065 return IsTestN ? X86::VPTESTNMDZ128rmb : X86::VPTESTMDZ128rmb; 4066 case MVT::v2i64: 4067 return IsTestN ? X86::VPTESTNMQZ128rmb : X86::VPTESTMQZ128rmb; 4068 case MVT::v8i32: 4069 return IsTestN ? X86::VPTESTNMDZ256rmb : X86::VPTESTMDZ256rmb; 4070 case MVT::v4i64: 4071 return IsTestN ? 
X86::VPTESTNMQZ256rmb : X86::VPTESTMQZ256rmb; 4072 case MVT::v16i32: 4073 return IsTestN ? X86::VPTESTNMDZrmb : X86::VPTESTMDZrmb; 4074 case MVT::v8i64: 4075 return IsTestN ? X86::VPTESTNMQZrmb : X86::VPTESTMQZrmb; 4076 } 4077 } 4078 4079 switch (TestVT.SimpleTy) { 4080 default: llvm_unreachable("Unexpected VT!"); 4081 case MVT::v16i8: 4082 return IsTestN ? X86::VPTESTNMBZ128rr : X86::VPTESTMBZ128rr; 4083 case MVT::v8i16: 4084 return IsTestN ? X86::VPTESTNMWZ128rr : X86::VPTESTMWZ128rr; 4085 case MVT::v4i32: 4086 return IsTestN ? X86::VPTESTNMDZ128rr : X86::VPTESTMDZ128rr; 4087 case MVT::v2i64: 4088 return IsTestN ? X86::VPTESTNMQZ128rr : X86::VPTESTMQZ128rr; 4089 case MVT::v32i8: 4090 return IsTestN ? X86::VPTESTNMBZ256rr : X86::VPTESTMBZ256rr; 4091 case MVT::v16i16: 4092 return IsTestN ? X86::VPTESTNMWZ256rr : X86::VPTESTMWZ256rr; 4093 case MVT::v8i32: 4094 return IsTestN ? X86::VPTESTNMDZ256rr : X86::VPTESTMDZ256rr; 4095 case MVT::v4i64: 4096 return IsTestN ? X86::VPTESTNMQZ256rr : X86::VPTESTMQZ256rr; 4097 case MVT::v64i8: 4098 return IsTestN ? X86::VPTESTNMBZrr : X86::VPTESTMBZrr; 4099 case MVT::v32i16: 4100 return IsTestN ? X86::VPTESTNMWZrr : X86::VPTESTMWZrr; 4101 case MVT::v16i32: 4102 return IsTestN ? X86::VPTESTNMDZrr : X86::VPTESTMDZrr; 4103 case MVT::v8i64: 4104 return IsTestN ? X86::VPTESTNMQZrr : X86::VPTESTMQZrr; 4105 } 4106 } 4107 4108 // Try to create VPTESTM instruction. If InMask is not null, it will be used 4109 // to form a masked operation. 4110 bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc, 4111 SDValue InMask) { 4112 assert(Subtarget->hasAVX512() && "Expected AVX512!"); 4113 assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 && 4114 "Unexpected VT!"); 4115 4116 // Look for equal and not equal compares. 4117 ISD::CondCode CC = cast<CondCodeSDNode>(Setcc.getOperand(2))->get(); 4118 if (CC != ISD::SETEQ && CC != ISD::SETNE) 4119 return false; 4120 4121 SDValue SetccOp0 = Setcc.getOperand(0); 4122 SDValue SetccOp1 = Setcc.getOperand(1); 4123 4124 // Canonicalize the all zero vector to the RHS. 4125 if (ISD::isBuildVectorAllZeros(SetccOp0.getNode())) 4126 std::swap(SetccOp0, SetccOp1); 4127 4128 // See if we're comparing against zero. 4129 if (!ISD::isBuildVectorAllZeros(SetccOp1.getNode())) 4130 return false; 4131 4132 SDValue N0 = SetccOp0; 4133 4134 MVT CmpVT = N0.getSimpleValueType(); 4135 MVT CmpSVT = CmpVT.getVectorElementType(); 4136 4137 // Start with both operands the same. We'll try to refine this. 4138 SDValue Src0 = N0; 4139 SDValue Src1 = N0; 4140 4141 { 4142 // Look through single use bitcasts. 4143 SDValue N0Temp = N0; 4144 if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse()) 4145 N0Temp = N0.getOperand(0); 4146 4147 // Look for single use AND. 4148 if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) { 4149 Src0 = N0Temp.getOperand(0); 4150 Src1 = N0Temp.getOperand(1); 4151 } 4152 } 4153 4154 // Without VLX we need to widen the load. 4155 bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector(); 4156 4157 // We can only fold loads if the sources are unique. 4158 bool CanFoldLoads = Src0 != Src1; 4159 4160 // Try to fold loads unless we need to widen. 4161 bool FoldedLoad = false; 4162 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Load; 4163 if (!Widen && CanFoldLoads) { 4164 Load = Src1; 4165 FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2, Tmp3, 4166 Tmp4); 4167 if (!FoldedLoad) { 4168 // And is computative. 
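      // (That is, AND is commutative: if the load did not fold on one
      // operand, retry with the operands swapped.)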
4169 Load = Src0; 4170 FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2, 4171 Tmp3, Tmp4); 4172 if (FoldedLoad) 4173 std::swap(Src0, Src1); 4174 } 4175 } 4176 4177 auto findBroadcastedOp = [](SDValue Src, MVT CmpSVT, SDNode *&Parent) { 4178 // Look through single use bitcasts. 4179 if (Src.getOpcode() == ISD::BITCAST && Src.hasOneUse()) { 4180 Parent = Src.getNode(); 4181 Src = Src.getOperand(0); 4182 } 4183 4184 if (Src.getOpcode() == X86ISD::VBROADCAST_LOAD && Src.hasOneUse()) { 4185 auto *MemIntr = cast<MemIntrinsicSDNode>(Src); 4186 if (MemIntr->getMemoryVT().getSizeInBits() == CmpSVT.getSizeInBits()) 4187 return Src; 4188 } 4189 4190 return SDValue(); 4191 }; 4192 4193 // If we didn't fold a load, try to match broadcast. No widening limitation 4194 // for this. But only 32 and 64 bit types are supported. 4195 bool FoldedBCast = false; 4196 if (!FoldedLoad && CanFoldLoads && 4197 (CmpSVT == MVT::i32 || CmpSVT == MVT::i64)) { 4198 SDNode *ParentNode = N0.getNode(); 4199 if ((Load = findBroadcastedOp(Src1, CmpSVT, ParentNode))) { 4200 FoldedBCast = tryFoldBroadcast(Root, ParentNode, Load, Tmp0, 4201 Tmp1, Tmp2, Tmp3, Tmp4); 4202 } 4203 4204 // Try the other operand. 4205 if (!FoldedBCast) { 4206 SDNode *ParentNode = N0.getNode(); 4207 if ((Load = findBroadcastedOp(Src0, CmpSVT, ParentNode))) { 4208 FoldedBCast = tryFoldBroadcast(Root, ParentNode, Load, Tmp0, 4209 Tmp1, Tmp2, Tmp3, Tmp4); 4210 if (FoldedBCast) 4211 std::swap(Src0, Src1); 4212 } 4213 } 4214 } 4215 4216 auto getMaskRC = [](MVT MaskVT) { 4217 switch (MaskVT.SimpleTy) { 4218 default: llvm_unreachable("Unexpected VT!"); 4219 case MVT::v2i1: return X86::VK2RegClassID; 4220 case MVT::v4i1: return X86::VK4RegClassID; 4221 case MVT::v8i1: return X86::VK8RegClassID; 4222 case MVT::v16i1: return X86::VK16RegClassID; 4223 case MVT::v32i1: return X86::VK32RegClassID; 4224 case MVT::v64i1: return X86::VK64RegClassID; 4225 } 4226 }; 4227 4228 bool IsMasked = InMask.getNode() != nullptr; 4229 4230 SDLoc dl(Root); 4231 4232 MVT ResVT = Setcc.getSimpleValueType(); 4233 MVT MaskVT = ResVT; 4234 if (Widen) { 4235 // Widen the inputs using insert_subreg or copy_to_regclass. 4236 unsigned Scale = CmpVT.is128BitVector() ? 4 : 2; 4237 unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm; 4238 unsigned NumElts = CmpVT.getVectorNumElements() * Scale; 4239 CmpVT = MVT::getVectorVT(CmpSVT, NumElts); 4240 MaskVT = MVT::getVectorVT(MVT::i1, NumElts); 4241 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl, 4242 CmpVT), 0); 4243 Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0); 4244 4245 assert(!FoldedLoad && "Shouldn't have folded the load"); 4246 if (!FoldedBCast) 4247 Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1); 4248 4249 if (IsMasked) { 4250 // Widen the mask. 
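      // Roughly speaking, the narrow vXi1 mask is just re-registered in the
      // wider k-register class to match the widened compare; only the
      // original low lanes are meaningful, and the result is copied back to
      // the narrow mask class after the VPTESTM below.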
4251 unsigned RegClass = getMaskRC(MaskVT); 4252 SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32); 4253 InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, 4254 dl, MaskVT, InMask, RC), 0); 4255 } 4256 } 4257 4258 bool IsTestN = CC == ISD::SETEQ; 4259 unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast, 4260 IsMasked); 4261 4262 MachineSDNode *CNode; 4263 if (FoldedLoad || FoldedBCast) { 4264 SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other); 4265 4266 if (IsMasked) { 4267 SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, 4268 Load.getOperand(0) }; 4269 CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); 4270 } else { 4271 SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, 4272 Load.getOperand(0) }; 4273 CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); 4274 } 4275 4276 // Update the chain. 4277 ReplaceUses(Load.getValue(1), SDValue(CNode, 1)); 4278 // Record the mem-refs 4279 CurDAG->setNodeMemRefs(CNode, {cast<MemSDNode>(Load)->getMemOperand()}); 4280 } else { 4281 if (IsMasked) 4282 CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1); 4283 else 4284 CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, Src0, Src1); 4285 } 4286 4287 // If we widened, we need to shrink the mask VT. 4288 if (Widen) { 4289 unsigned RegClass = getMaskRC(ResVT); 4290 SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32); 4291 CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, 4292 dl, ResVT, SDValue(CNode, 0), RC); 4293 } 4294 4295 ReplaceUses(SDValue(Root, 0), SDValue(CNode, 0)); 4296 CurDAG->RemoveDeadNode(Root); 4297 return true; 4298 } 4299 4300 // Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it 4301 // into vpternlog. 4302 bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) { 4303 assert(N->getOpcode() == ISD::OR && "Unexpected opcode!"); 4304 4305 MVT NVT = N->getSimpleValueType(0); 4306 4307 // Make sure we support VPTERNLOG. 4308 if (!NVT.isVector() || !Subtarget->hasAVX512()) 4309 return false; 4310 4311 // We need VLX for 128/256-bit. 4312 if (!(Subtarget->hasVLX() || NVT.is512BitVector())) 4313 return false; 4314 4315 SDValue N0 = N->getOperand(0); 4316 SDValue N1 = N->getOperand(1); 4317 4318 // Canonicalize AND to LHS. 4319 if (N1.getOpcode() == ISD::AND) 4320 std::swap(N0, N1); 4321 4322 if (N0.getOpcode() != ISD::AND || 4323 N1.getOpcode() != X86ISD::ANDNP || 4324 !N0.hasOneUse() || !N1.hasOneUse()) 4325 return false; 4326 4327 // ANDN is not commutable, use it to pick down A and C. 4328 SDValue A = N1.getOperand(0); 4329 SDValue C = N1.getOperand(1); 4330 4331 // AND is commutable, if one operand matches A, the other operand is B. 4332 // Otherwise this isn't a match. 4333 SDValue B; 4334 if (N0.getOperand(0) == A) 4335 B = N0.getOperand(1); 4336 else if (N0.getOperand(1) == A) 4337 B = N0.getOperand(0); 4338 else 4339 return false; 4340 4341 SDLoc dl(N); 4342 SDValue Imm = CurDAG->getTargetConstant(0xCA, dl, MVT::i8); 4343 SDValue Ternlog = CurDAG->getNode(X86ISD::VPTERNLOG, dl, NVT, A, B, C, Imm); 4344 ReplaceNode(N, Ternlog.getNode()); 4345 SelectCode(Ternlog.getNode()); 4346 return true; 4347 } 4348 4349 void X86DAGToDAGISel::Select(SDNode *Node) { 4350 MVT NVT = Node->getSimpleValueType(0); 4351 unsigned Opcode = Node->getOpcode(); 4352 SDLoc dl(Node); 4353 4354 if (Node->isMachineOpcode()) { 4355 LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n'); 4356 Node->setNodeId(-1); 4357 return; // Already selected. 
4358 } 4359 4360 switch (Opcode) { 4361 default: break; 4362 case ISD::INTRINSIC_VOID: { 4363 unsigned IntNo = Node->getConstantOperandVal(1); 4364 switch (IntNo) { 4365 default: break; 4366 case Intrinsic::x86_sse3_monitor: 4367 case Intrinsic::x86_monitorx: 4368 case Intrinsic::x86_clzero: { 4369 bool Use64BitPtr = Node->getOperand(2).getValueType() == MVT::i64; 4370 4371 unsigned Opc = 0; 4372 switch (IntNo) { 4373 default: llvm_unreachable("Unexpected intrinsic!"); 4374 case Intrinsic::x86_sse3_monitor: 4375 if (!Subtarget->hasSSE3()) 4376 break; 4377 Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr; 4378 break; 4379 case Intrinsic::x86_monitorx: 4380 if (!Subtarget->hasMWAITX()) 4381 break; 4382 Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr; 4383 break; 4384 case Intrinsic::x86_clzero: 4385 if (!Subtarget->hasCLZERO()) 4386 break; 4387 Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r; 4388 break; 4389 } 4390 4391 if (Opc) { 4392 unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX; 4393 SDValue Chain = CurDAG->getCopyToReg(Node->getOperand(0), dl, PtrReg, 4394 Node->getOperand(2), SDValue()); 4395 SDValue InFlag = Chain.getValue(1); 4396 4397 if (IntNo == Intrinsic::x86_sse3_monitor || 4398 IntNo == Intrinsic::x86_monitorx) { 4399 // Copy the other two operands to ECX and EDX. 4400 Chain = CurDAG->getCopyToReg(Chain, dl, X86::ECX, Node->getOperand(3), 4401 InFlag); 4402 InFlag = Chain.getValue(1); 4403 Chain = CurDAG->getCopyToReg(Chain, dl, X86::EDX, Node->getOperand(4), 4404 InFlag); 4405 InFlag = Chain.getValue(1); 4406 } 4407 4408 MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, 4409 { Chain, InFlag}); 4410 ReplaceNode(Node, CNode); 4411 return; 4412 } 4413 } 4414 } 4415 4416 break; 4417 } 4418 case ISD::BRIND: { 4419 if (Subtarget->isTargetNaCl()) 4420 // NaCl has its own pass where jmp %r32 are converted to jmp %r64. We 4421 // leave the instruction alone. 4422 break; 4423 if (Subtarget->isTarget64BitILP32()) { 4424 // Converts a 32-bit register to a 64-bit, zero-extended version of 4425 // it. This is needed because x86-64 can do many things, but jmp %r32 4426 // ain't one of them. 4427 const SDValue &Target = Node->getOperand(1); 4428 assert(Target.getSimpleValueType() == llvm::MVT::i32); 4429 SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, EVT(MVT::i64)); 4430 SDValue Brind = CurDAG->getNode(ISD::BRIND, dl, MVT::Other, 4431 Node->getOperand(0), ZextTarget); 4432 ReplaceNode(Node, Brind.getNode()); 4433 SelectCode(ZextTarget.getNode()); 4434 SelectCode(Brind.getNode()); 4435 return; 4436 } 4437 break; 4438 } 4439 case X86ISD::GlobalBaseReg: 4440 ReplaceNode(Node, getGlobalBaseReg()); 4441 return; 4442 4443 case ISD::BITCAST: 4444 // Just drop all 128/256/512-bit bitcasts. 4445 if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() || 4446 NVT == MVT::f128) { 4447 ReplaceUses(SDValue(Node, 0), Node->getOperand(0)); 4448 CurDAG->RemoveDeadNode(Node); 4449 return; 4450 } 4451 break; 4452 4453 case ISD::VSELECT: { 4454 // Replace VSELECT with non-mask conditions with with BLENDV. 
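    // (Selects whose condition is already a vXi1 mask are left to the
    // ordinary isel patterns.)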
    if (Node->getOperand(0).getValueType().getVectorElementType() == MVT::i1)
      break;

    assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
    SDValue Blendv = CurDAG->getNode(
        X86ISD::BLENDV, SDLoc(Node), Node->getValueType(0), Node->getOperand(0),
        Node->getOperand(1), Node->getOperand(2));
    ReplaceNode(Node, Blendv.getNode());
    SelectCode(Blendv.getNode());
    // We already called ReplaceUses.
    return;
  }

  case ISD::SRL:
    if (matchBitExtract(Node))
      return;
    LLVM_FALLTHROUGH;
  case ISD::SRA:
  case ISD::SHL:
    if (tryShiftAmountMod(Node))
      return;
    break;

  case ISD::AND:
    if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
      // Try to form a masked VPTESTM. Operands can be in either order.
      SDValue N0 = Node->getOperand(0);
      SDValue N1 = Node->getOperand(1);
      if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
          tryVPTESTM(Node, N0, N1))
        return;
      if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
          tryVPTESTM(Node, N1, N0))
        return;
    }

    if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
      ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
      CurDAG->RemoveDeadNode(Node);
      return;
    }
    if (matchBitExtract(Node))
      return;
    if (AndImmShrink && shrinkAndImmediate(Node))
      return;

    LLVM_FALLTHROUGH;
  case ISD::OR:
  case ISD::XOR:
    if (tryShrinkShlLogicImm(Node))
      return;

    if (Opcode == ISD::OR && tryMatchBitSelect(Node))
      return;

    LLVM_FALLTHROUGH;
  case ISD::ADD:
  case ISD::SUB: {
    if ((Opcode == ISD::ADD || Opcode == ISD::SUB) && NVT.isVector() &&
        combineIncDecVector(Node))
      return;

    // Try to avoid folding immediates with multiple uses for optsize.
    // This code tries to select the register form directly to avoid going
    // through the isel table, which might fold the immediate. We can't change
    // the add/sub/and/or/xor-with-immediate patterns in the tablegen files to
    // check the immediate use count without making the patterns unavailable to
    // the fast-isel table.
    if (!OptForSize)
      break;

    // Only handle i8/i16/i32/i64.
    if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64)
      break;

    SDValue N0 = Node->getOperand(0);
    SDValue N1 = Node->getOperand(1);

    ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
    if (!Cst)
      break;

    int64_t Val = Cst->getSExtValue();

    // Make sure it's an immediate that is considered foldable.
    // FIXME: Handle unsigned 32 bit immediates for 64-bit AND.
    if (!isInt<8>(Val) && !isInt<32>(Val))
      break;

    // If this can match to INC/DEC, let it go.
    if (Opcode == ISD::ADD && (Val == 1 || Val == -1))
      break;

    // Check if we should avoid folding this immediate.
    if (!shouldAvoidImmediateInstFormsForSize(N1.getNode()))
      break;

    // We should not fold the immediate. So we need a register form instead.
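    // Illustrative example: if the constant feeds several such instructions,
    // folding it as an imm32 into each of them costs four bytes apiece,
    // whereas the register form lets the constant be materialized once and
    // reused.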
4553 unsigned ROpc, MOpc; 4554 switch (NVT.SimpleTy) { 4555 default: llvm_unreachable("Unexpected VT!"); 4556 case MVT::i8: 4557 switch (Opcode) { 4558 default: llvm_unreachable("Unexpected opcode!"); 4559 case ISD::ADD: ROpc = X86::ADD8rr; MOpc = X86::ADD8rm; break; 4560 case ISD::SUB: ROpc = X86::SUB8rr; MOpc = X86::SUB8rm; break; 4561 case ISD::AND: ROpc = X86::AND8rr; MOpc = X86::AND8rm; break; 4562 case ISD::OR: ROpc = X86::OR8rr; MOpc = X86::OR8rm; break; 4563 case ISD::XOR: ROpc = X86::XOR8rr; MOpc = X86::XOR8rm; break; 4564 } 4565 break; 4566 case MVT::i16: 4567 switch (Opcode) { 4568 default: llvm_unreachable("Unexpected opcode!"); 4569 case ISD::ADD: ROpc = X86::ADD16rr; MOpc = X86::ADD16rm; break; 4570 case ISD::SUB: ROpc = X86::SUB16rr; MOpc = X86::SUB16rm; break; 4571 case ISD::AND: ROpc = X86::AND16rr; MOpc = X86::AND16rm; break; 4572 case ISD::OR: ROpc = X86::OR16rr; MOpc = X86::OR16rm; break; 4573 case ISD::XOR: ROpc = X86::XOR16rr; MOpc = X86::XOR16rm; break; 4574 } 4575 break; 4576 case MVT::i32: 4577 switch (Opcode) { 4578 default: llvm_unreachable("Unexpected opcode!"); 4579 case ISD::ADD: ROpc = X86::ADD32rr; MOpc = X86::ADD32rm; break; 4580 case ISD::SUB: ROpc = X86::SUB32rr; MOpc = X86::SUB32rm; break; 4581 case ISD::AND: ROpc = X86::AND32rr; MOpc = X86::AND32rm; break; 4582 case ISD::OR: ROpc = X86::OR32rr; MOpc = X86::OR32rm; break; 4583 case ISD::XOR: ROpc = X86::XOR32rr; MOpc = X86::XOR32rm; break; 4584 } 4585 break; 4586 case MVT::i64: 4587 switch (Opcode) { 4588 default: llvm_unreachable("Unexpected opcode!"); 4589 case ISD::ADD: ROpc = X86::ADD64rr; MOpc = X86::ADD64rm; break; 4590 case ISD::SUB: ROpc = X86::SUB64rr; MOpc = X86::SUB64rm; break; 4591 case ISD::AND: ROpc = X86::AND64rr; MOpc = X86::AND64rm; break; 4592 case ISD::OR: ROpc = X86::OR64rr; MOpc = X86::OR64rm; break; 4593 case ISD::XOR: ROpc = X86::XOR64rr; MOpc = X86::XOR64rm; break; 4594 } 4595 break; 4596 } 4597 4598 // Ok this is a AND/OR/XOR/ADD/SUB with constant. 4599 4600 // If this is a not a subtract, we can still try to fold a load. 4601 if (Opcode != ISD::SUB) { 4602 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; 4603 if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { 4604 SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) }; 4605 SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other); 4606 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); 4607 // Update the chain. 4608 ReplaceUses(N0.getValue(1), SDValue(CNode, 2)); 4609 // Record the mem-refs 4610 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N0)->getMemOperand()}); 4611 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0)); 4612 CurDAG->RemoveDeadNode(Node); 4613 return; 4614 } 4615 } 4616 4617 CurDAG->SelectNodeTo(Node, ROpc, NVT, MVT::i32, N0, N1); 4618 return; 4619 } 4620 4621 case X86ISD::SMUL: 4622 // i16/i32/i64 are handled with isel patterns. 4623 if (NVT != MVT::i8) 4624 break; 4625 LLVM_FALLTHROUGH; 4626 case X86ISD::UMUL: { 4627 SDValue N0 = Node->getOperand(0); 4628 SDValue N1 = Node->getOperand(1); 4629 4630 unsigned LoReg, ROpc, MOpc; 4631 switch (NVT.SimpleTy) { 4632 default: llvm_unreachable("Unsupported VT!"); 4633 case MVT::i8: 4634 LoReg = X86::AL; 4635 ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r; 4636 MOpc = Opcode == X86ISD::SMUL ? 
                                      X86::IMUL8m : X86::MUL8m;
      break;
    case MVT::i16:
      LoReg = X86::AX;
      ROpc = X86::MUL16r;
      MOpc = X86::MUL16m;
      break;
    case MVT::i32:
      LoReg = X86::EAX;
      ROpc = X86::MUL32r;
      MOpc = X86::MUL32m;
      break;
    case MVT::i64:
      LoReg = X86::RAX;
      ROpc = X86::MUL64r;
      MOpc = X86::MUL64m;
      break;
    }

    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
    bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
    // Multiply is commutative.
    if (!FoldedLoad) {
      FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
      if (FoldedLoad)
        std::swap(N0, N1);
    }

    SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
                                          N0, SDValue()).getValue(1);

    MachineSDNode *CNode;
    if (FoldedLoad) {
      // i16/i32/i64 use an instruction that produces a low and high result
      // even though only the low result is used.
      SDVTList VTs;
      if (NVT == MVT::i8)
        VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
      else
        VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other);

      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
                        InFlag };
      CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);

      // Update the chain.
      ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 2 : 3));
      // Record the mem-refs
      CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
    } else {
      // i16/i32/i64 use an instruction that produces a low and high result
      // even though only the low result is used.
      SDVTList VTs;
      if (NVT == MVT::i8)
        VTs = CurDAG->getVTList(NVT, MVT::i32);
      else
        VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);

      CNode = CurDAG->getMachineNode(ROpc, dl, VTs, {N1, InFlag});
    }

    ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
    ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 1 : 2));
    CurDAG->RemoveDeadNode(Node);
    return;
  }

  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI: {
    SDValue N0 = Node->getOperand(0);
    SDValue N1 = Node->getOperand(1);

    unsigned Opc, MOpc;
    bool isSigned = Opcode == ISD::SMUL_LOHI;
    if (!isSigned) {
      switch (NVT.SimpleTy) {
      default: llvm_unreachable("Unsupported VT!");
      case MVT::i32: Opc = X86::MUL32r; MOpc = X86::MUL32m; break;
      case MVT::i64: Opc = X86::MUL64r; MOpc = X86::MUL64m; break;
      }
    } else {
      switch (NVT.SimpleTy) {
      default: llvm_unreachable("Unsupported VT!");
      case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break;
      case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break;
      }
    }

    unsigned SrcReg, LoReg, HiReg;
    switch (Opc) {
    default: llvm_unreachable("Unknown MUL opcode!");
    case X86::IMUL32r:
    case X86::MUL32r:
      SrcReg = LoReg = X86::EAX; HiReg = X86::EDX;
      break;
    case X86::IMUL64r:
    case X86::MUL64r:
      SrcReg = LoReg = X86::RAX; HiReg = X86::RDX;
      break;
    }

    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
    bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
    // Multiply is commutative.
    if (!foldedLoad) {
      foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
      if (foldedLoad)
        std::swap(N0, N1);
    }

    SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, SrcReg,
                                          N0, SDValue()).getValue(1);
    if (foldedLoad) {
      SDValue Chain;
      MachineSDNode *CNode = nullptr;
      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
                        InFlag };
      SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
      CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
      Chain = SDValue(CNode, 0);
      InFlag = SDValue(CNode, 1);

      // Update the chain.
      ReplaceUses(N1.getValue(1), Chain);
      // Record the mem-refs
      CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
    } else {
      SDValue Ops[] = { N1, InFlag };
      SDVTList VTs = CurDAG->getVTList(MVT::Glue);
      SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
      InFlag = SDValue(CNode, 0);
    }

    // Copy the low half of the result, if it is needed.
    if (!SDValue(Node, 0).use_empty()) {
      assert(LoReg && "Register for low half is not defined!");
      SDValue ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
                                             NVT, InFlag);
      InFlag = ResLo.getValue(2);
      ReplaceUses(SDValue(Node, 0), ResLo);
      LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }
    // Copy the high half of the result, if it is needed.
    if (!SDValue(Node, 1).use_empty()) {
      assert(HiReg && "Register for high half is not defined!");
      SDValue ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
                                             NVT, InFlag);
      InFlag = ResHi.getValue(2);
      ReplaceUses(SDValue(Node, 1), ResHi);
      LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }

    CurDAG->RemoveDeadNode(Node);
    return;
  }

  case ISD::SDIVREM:
  case ISD::UDIVREM: {
    SDValue N0 = Node->getOperand(0);
    SDValue N1 = Node->getOperand(1);

    unsigned Opc, MOpc;
    bool isSigned = Opcode == ISD::SDIVREM;
    if (!isSigned) {
      switch (NVT.SimpleTy) {
      default: llvm_unreachable("Unsupported VT!");
      case MVT::i8:  Opc = X86::DIV8r;  MOpc = X86::DIV8m;  break;
      case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break;
      case MVT::i32: Opc = X86::DIV32r; MOpc = X86::DIV32m; break;
      case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break;
      }
    } else {
      switch (NVT.SimpleTy) {
      default: llvm_unreachable("Unsupported VT!");
      case MVT::i8:  Opc = X86::IDIV8r;  MOpc = X86::IDIV8m;  break;
      case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
      case MVT::i32: Opc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
      case MVT::i64: Opc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
      }
    }

    unsigned LoReg, HiReg, ClrReg;
    unsigned SExtOpcode;
    switch (NVT.SimpleTy) {
    default: llvm_unreachable("Unsupported VT!");
    case MVT::i8:
      LoReg = X86::AL; ClrReg = HiReg = X86::AH;
      SExtOpcode = 0; // Not used.
      break;
    case MVT::i16:
      LoReg = X86::AX; HiReg = X86::DX;
      ClrReg = X86::DX;
      SExtOpcode = X86::CWD;
      break;
    case MVT::i32:
      LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
      SExtOpcode = X86::CDQ;
      break;
    case MVT::i64:
      LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
      SExtOpcode = X86::CQO;
      break;
    }

    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
    bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
    bool signBitIsZero = CurDAG->SignBitIsZero(N0);

    SDValue InFlag;
    if (NVT == MVT::i8) {
      // Special case for div8, just use a move with zero extension to AX to
      // clear the upper 8 bits (AH).
      SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain;
      MachineSDNode *Move;
      if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
        SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
        unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8
                                                    : X86::MOVZX16rm8;
        Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, MVT::Other, Ops);
        Chain = SDValue(Move, 1);
        ReplaceUses(N0.getValue(1), Chain);
        // Record the mem-refs
        CurDAG->setNodeMemRefs(Move, {cast<LoadSDNode>(N0)->getMemOperand()});
      } else {
        unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8
                                                    : X86::MOVZX16rr8;
        Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, N0);
        Chain = CurDAG->getEntryNode();
      }
      Chain = CurDAG->getCopyToReg(Chain, dl, X86::AX, SDValue(Move, 0),
                                   SDValue());
      InFlag = Chain.getValue(1);
    } else {
      InFlag =
        CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
                             LoReg, N0, SDValue()).getValue(1);
      if (isSigned && !signBitIsZero) {
        // Sign extend the low part into the high part.
        InFlag =
          SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag), 0);
      } else {
        // Zero out the high part, effectively zero extending the input.
        SDValue ClrNode =
            SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, NVT), 0);
        switch (NVT.SimpleTy) {
        case MVT::i16:
          ClrNode =
              SDValue(CurDAG->getMachineNode(
                          TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
                          CurDAG->getTargetConstant(X86::sub_16bit, dl,
                                                    MVT::i32)),
                      0);
          break;
        case MVT::i32:
          break;
        case MVT::i64:
          ClrNode =
              SDValue(CurDAG->getMachineNode(
                          TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
                          CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
                          CurDAG->getTargetConstant(X86::sub_32bit, dl,
                                                    MVT::i32)),
                      0);
          break;
        default:
          llvm_unreachable("Unexpected division source");
        }

        InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
                                      ClrNode, InFlag).getValue(1);
      }
    }

    if (foldedLoad) {
      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
                        InFlag };
      MachineSDNode *CNode =
        CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
      InFlag = SDValue(CNode, 1);
      // Update the chain.
      ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
      // Record the mem-refs
      CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
    } else {
      InFlag =
        SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag), 0);
    }

    // Prevent use of AH in a REX instruction by explicitly copying it to
    // an ABCD_L register.
    //
    // The current assumption of the register allocator is that isel
    // won't generate explicit references to the GR8_ABCD_H registers. If
    // the allocator and/or the backend get enhanced to be more robust in
    // that regard, this can be, and should be, removed.
    if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
      SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
      unsigned AHExtOpcode =
          isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;

      SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
                                             MVT::Glue, AHCopy, InFlag);
      SDValue Result(RNode, 0);
      InFlag = SDValue(RNode, 1);

      Result =
          CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);

      ReplaceUses(SDValue(Node, 1), Result);
      LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }
    // Copy the division (low) result, if it is needed.
    if (!SDValue(Node, 0).use_empty()) {
      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
                                              LoReg, NVT, InFlag);
      InFlag = Result.getValue(2);
      ReplaceUses(SDValue(Node, 0), Result);
      LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }
    // Copy the remainder (high) result, if it is needed.
    if (!SDValue(Node, 1).use_empty()) {
      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
                                              HiReg, NVT, InFlag);
      InFlag = Result.getValue(2);
      ReplaceUses(SDValue(Node, 1), Result);
      LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }
    CurDAG->RemoveDeadNode(Node);
    return;
  }

  case X86ISD::CMP: {
    SDValue N0 = Node->getOperand(0);
    SDValue N1 = Node->getOperand(1);

    // Optimizations for TEST compares.
    if (!isNullConstant(N1))
      break;

    // Save the original VT of the compare.
    MVT CmpVT = N0.getSimpleValueType();

    // If we are comparing (and (shr X, C), Mask) with 0, emit a BEXTR followed
    // by a test instruction. The test should be removed later by
    // analyzeCompare if we are using only the zero flag.
    // TODO: Should we check the users and use the BEXTR flags directly?
    if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
      if (MachineSDNode *NewNode = matchBEXTRFromAndImm(N0.getNode())) {
        unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
                                             : X86::TEST32rr;
        SDValue BEXTR = SDValue(NewNode, 0);
        NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR);
        ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
        CurDAG->RemoveDeadNode(Node);
        return;
      }
    }

    // We can peek through truncates, but we need to be careful below.
    if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
      N0 = N0.getOperand(0);

    // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
    // use a smaller encoding.
    // Look past the truncate if CMP is the only use of it.
    if (N0.getOpcode() == ISD::AND &&
        N0.getNode()->hasOneUse() &&
        N0.getValueType() != MVT::i8) {
      ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
      if (!C) break;
      uint64_t Mask = C->getZExtValue();

      // Check if we can replace AND+IMM64 with a shift. This is possible for
      // masks like 0xFF000000 or 0x00FFFFFF if we only care about the zero
      // flag.
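      // Illustrative example: when only ZF is consumed, a high mask such as
      // 0xFFFFFFFF00000000 (32 trailing zeros) becomes SHR64ri by 32 followed
      // by TEST64rr, and a low mask such as 0x00000000FFFFFFFF (32 leading
      // zeros) becomes SHL64ri by 32 followed by TEST64rr, which avoids
      // materializing a 64-bit AND immediate.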
      if (CmpVT == MVT::i64 && !isInt<32>(Mask) &&
          onlyUsesZeroFlag(SDValue(Node, 0))) {
        if (isMask_64(~Mask)) {
          unsigned TrailingZeros = countTrailingZeros(Mask);
          SDValue Imm = CurDAG->getTargetConstant(TrailingZeros, dl, MVT::i64);
          SDValue Shift =
            SDValue(CurDAG->getMachineNode(X86::SHR64ri, dl, MVT::i64, MVT::i32,
                                           N0.getOperand(0), Imm), 0);
          MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
                                                       MVT::i32, Shift, Shift);
          ReplaceNode(Node, Test);
          return;
        }
        if (isMask_64(Mask)) {
          unsigned LeadingZeros = countLeadingZeros(Mask);
          SDValue Imm = CurDAG->getTargetConstant(LeadingZeros, dl, MVT::i64);
          SDValue Shift =
            SDValue(CurDAG->getMachineNode(X86::SHL64ri, dl, MVT::i64, MVT::i32,
                                           N0.getOperand(0), Imm), 0);
          MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
                                                       MVT::i32, Shift, Shift);
          ReplaceNode(Node, Test);
          return;
        }
      }

      MVT VT;
      int SubRegOp;
      unsigned ROpc, MOpc;

      // For each of these checks we need to be careful if the sign flag is
      // being used. It is only safe to use the sign flag in two conditions,
      // either the sign bit in the shrunken mask is zero or the final test
      // size is equal to the original compare size.

      if (isUInt<8>(Mask) &&
          (!(Mask & 0x80) || CmpVT == MVT::i8 ||
           hasNoSignFlagUses(SDValue(Node, 0)))) {
        // For example, convert "testl %eax, $8" to "testb %al, $8".
        VT = MVT::i8;
        SubRegOp = X86::sub_8bit;
        ROpc = X86::TEST8ri;
        MOpc = X86::TEST8mi;
      } else if (OptForMinSize && isUInt<16>(Mask) &&
                 (!(Mask & 0x8000) || CmpVT == MVT::i16 ||
                  hasNoSignFlagUses(SDValue(Node, 0)))) {
        // For example, convert "testl %eax, $32776" to "testw %ax, $32776".
        // NOTE: We only want to form TESTW instructions if optimizing for
        // min size. Otherwise we only save one byte and possibly get a length
        // changing prefix penalty in the decoders.
        VT = MVT::i16;
        SubRegOp = X86::sub_16bit;
        ROpc = X86::TEST16ri;
        MOpc = X86::TEST16mi;
      } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 &&
                 ((!(Mask & 0x80000000) &&
                   // Without minsize 16-bit Cmps can get here so we need to
                   // be sure we calculate the correct sign flag if needed.
                   (CmpVT != MVT::i16 || !(Mask & 0x8000))) ||
                  CmpVT == MVT::i32 ||
                  hasNoSignFlagUses(SDValue(Node, 0)))) {
        // For example, convert "testq %rax, $268468232" to
        // "testl %eax, $268468232".
        // NOTE: We only want to run that transform if N0 is 32 or 64 bits.
        // Otherwise, we find ourselves in a position where we have to do
        // promotion. If previous passes did not promote the and, we assume
        // they had a good reason not to and do not promote here.
        VT = MVT::i32;
        SubRegOp = X86::sub_32bit;
        ROpc = X86::TEST32ri;
        MOpc = X86::TEST32mi;
      } else {
        // No eligible transformation was found.
        break;
      }

      SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
      SDValue Reg = N0.getOperand(0);

      // Emit a testl or testw.
      MachineSDNode *NewNode;
      SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
      if (tryFoldLoad(Node, N0.getNode(), Reg, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
        SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
                          Reg.getOperand(0) };
        NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops);
        // Update the chain.
        ReplaceUses(Reg.getValue(1), SDValue(NewNode, 1));
        // Record the mem-refs
        CurDAG->setNodeMemRefs(NewNode,
                               {cast<LoadSDNode>(Reg)->getMemOperand()});
      } else {
        // Extract the subregister if necessary.
        if (N0.getValueType() != VT)
          Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);

        NewNode = CurDAG->getMachineNode(ROpc, dl, MVT::i32, Reg, Imm);
      }
      // Replace CMP with TEST.
      ReplaceNode(Node, NewNode);
      return;
    }
    break;
  }
  case X86ISD::PCMPISTR: {
    if (!Subtarget->hasSSE42())
      break;

    bool NeedIndex = !SDValue(Node, 0).use_empty();
    bool NeedMask = !SDValue(Node, 1).use_empty();
    // We can't fold a load if we are going to make two instructions.
    bool MayFoldLoad = !NeedIndex || !NeedMask;

    MachineSDNode *CNode;
    if (NeedMask) {
      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrr : X86::PCMPISTRMrr;
      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrm : X86::PCMPISTRMrm;
      CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
      ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
    }
    if (NeedIndex || !NeedMask) {
      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrr : X86::PCMPISTRIrr;
      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrm : X86::PCMPISTRIrm;
      CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
      ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
    }

    // Connect the flag usage to the last instruction created.
    ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case X86ISD::PCMPESTR: {
    if (!Subtarget->hasSSE42())
      break;

    // Copy the two implicit register inputs.
    SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX,
                                          Node->getOperand(1),
                                          SDValue()).getValue(1);
    InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
                                  Node->getOperand(3), InFlag).getValue(1);

    bool NeedIndex = !SDValue(Node, 0).use_empty();
    bool NeedMask = !SDValue(Node, 1).use_empty();
    // We can't fold a load if we are going to make two instructions.
    bool MayFoldLoad = !NeedIndex || !NeedMask;

    MachineSDNode *CNode;
    if (NeedMask) {
      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrr : X86::PCMPESTRMrr;
      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrm : X86::PCMPESTRMrm;
      CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node,
                           InFlag);
      ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
    }
    if (NeedIndex || !NeedMask) {
      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrr : X86::PCMPESTRIrr;
      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrm : X86::PCMPESTRIrm;
      CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InFlag);
      ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
    }
    // Connect the flag usage to the last instruction created.
    ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
    CurDAG->RemoveDeadNode(Node);
    return;
  }

  case ISD::SETCC: {
    if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue()))
      return;

    break;
  }

  case ISD::STORE:
    if (foldLoadStoreIntoMemOperand(Node))
      return;
    break;
  case ISD::FCEIL:
  case ISD::FFLOOR:
  case ISD::FTRUNC:
  case ISD::FNEARBYINT:
  case ISD::FRINT: {
    // Replace fp rounding nodes with their X86-specific equivalents so we
    // don't need two sets of patterns.
    // FIXME: This can only happen when the nodes started as STRICT_* and have
    // been mutated into their non-STRICT equivalents. Eventually this
    // mutation will be removed and we should switch the STRICT_ nodes to a
    // strict version of RNDSCALE in PreProcessISelDAG.
    unsigned Imm;
    switch (Node->getOpcode()) {
    default: llvm_unreachable("Unexpected opcode!");
    case ISD::FCEIL:      Imm = 0xA; break;
    case ISD::FFLOOR:     Imm = 0x9; break;
    case ISD::FTRUNC:     Imm = 0xB; break;
    case ISD::FNEARBYINT: Imm = 0xC; break;
    case ISD::FRINT:      Imm = 0x4; break;
    }
    SDLoc dl(Node);
    SDValue Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, Node->getValueType(0),
                                  Node->getOperand(0),
                                  CurDAG->getTargetConstant(Imm, dl, MVT::i8));
    ReplaceNode(Node, Res.getNode());
    SelectCode(Res.getNode());
    return;
  }
  }

  SelectCode(Node);
}

bool X86DAGToDAGISel::
SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
                             std::vector<SDValue> &OutOps) {
  SDValue Op0, Op1, Op2, Op3, Op4;
  switch (ConstraintID) {
  default:
    llvm_unreachable("Unexpected asm memory constraint");
  case InlineAsm::Constraint_i:
    // FIXME: It seems strange that 'i' is needed here since it's supposed to
    // be an immediate and not a memory constraint.
    LLVM_FALLTHROUGH;
  case InlineAsm::Constraint_o: // offsetable ??
  case InlineAsm::Constraint_v: // not offsetable ??
  case InlineAsm::Constraint_m: // memory
  case InlineAsm::Constraint_X:
    if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
      return true;
    break;
  }

  OutOps.push_back(Op0);
  OutOps.push_back(Op1);
  OutOps.push_back(Op2);
  OutOps.push_back(Op3);
  OutOps.push_back(Op4);
  return false;
}

/// This pass converts a legalized DAG into an X86-specific DAG,
/// ready for instruction scheduling.
FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
                                     CodeGenOpt::Level OptLevel) {
  return new X86DAGToDAGISel(TM, OptLevel);
}