1 //===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file defines a DAG pattern matching instruction selector for X86, 10 // converting from a legalized dag to a X86 dag. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "X86.h" 15 #include "X86MachineFunctionInfo.h" 16 #include "X86RegisterInfo.h" 17 #include "X86Subtarget.h" 18 #include "X86TargetMachine.h" 19 #include "llvm/ADT/Statistic.h" 20 #include "llvm/CodeGen/MachineFrameInfo.h" 21 #include "llvm/CodeGen/MachineFunction.h" 22 #include "llvm/CodeGen/SelectionDAGISel.h" 23 #include "llvm/Config/llvm-config.h" 24 #include "llvm/IR/ConstantRange.h" 25 #include "llvm/IR/Function.h" 26 #include "llvm/IR/Instructions.h" 27 #include "llvm/IR/Intrinsics.h" 28 #include "llvm/IR/Type.h" 29 #include "llvm/Support/Debug.h" 30 #include "llvm/Support/ErrorHandling.h" 31 #include "llvm/Support/KnownBits.h" 32 #include "llvm/Support/MathExtras.h" 33 #include "llvm/Support/raw_ostream.h" 34 #include "llvm/Target/TargetMachine.h" 35 #include "llvm/Target/TargetOptions.h" 36 #include <stdint.h> 37 using namespace llvm; 38 39 #define DEBUG_TYPE "x86-isel" 40 41 STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor"); 42 43 static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(true), 44 cl::desc("Enable setting constant bits to reduce size of mask immediates"), 45 cl::Hidden); 46 47 //===----------------------------------------------------------------------===// 48 // Pattern Matcher Implementation 49 //===----------------------------------------------------------------------===// 50 51 namespace { 52 /// This corresponds to X86AddressMode, but uses SDValue's instead of register 53 /// numbers for the leaves of the matched tree. 54 struct X86ISelAddressMode { 55 enum { 56 RegBase, 57 FrameIndexBase 58 } BaseType; 59 60 // This is really a union, discriminated by BaseType! 61 SDValue Base_Reg; 62 int Base_FrameIndex; 63 64 unsigned Scale; 65 SDValue IndexReg; 66 int32_t Disp; 67 SDValue Segment; 68 const GlobalValue *GV; 69 const Constant *CP; 70 const BlockAddress *BlockAddr; 71 const char *ES; 72 MCSymbol *MCSym; 73 int JT; 74 unsigned Align; // CP alignment. 75 unsigned char SymbolFlags; // X86II::MO_* 76 bool NegateIndex = false; 77 78 X86ISelAddressMode() 79 : BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0), 80 Segment(), GV(nullptr), CP(nullptr), BlockAddr(nullptr), ES(nullptr), 81 MCSym(nullptr), JT(-1), Align(0), SymbolFlags(X86II::MO_NO_FLAG) {} 82 83 bool hasSymbolicDisplacement() const { 84 return GV != nullptr || CP != nullptr || ES != nullptr || 85 MCSym != nullptr || JT != -1 || BlockAddr != nullptr; 86 } 87 88 bool hasBaseOrIndexReg() const { 89 return BaseType == FrameIndexBase || 90 IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr; 91 } 92 93 /// Return true if this addressing mode is already RIP-relative. 
94 bool isRIPRelative() const { 95 if (BaseType != RegBase) return false; 96 if (RegisterSDNode *RegNode = 97 dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode())) 98 return RegNode->getReg() == X86::RIP; 99 return false; 100 } 101 102 void setBaseReg(SDValue Reg) { 103 BaseType = RegBase; 104 Base_Reg = Reg; 105 } 106 107 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 108 void dump(SelectionDAG *DAG = nullptr) { 109 dbgs() << "X86ISelAddressMode " << this << '\n'; 110 dbgs() << "Base_Reg "; 111 if (Base_Reg.getNode()) 112 Base_Reg.getNode()->dump(DAG); 113 else 114 dbgs() << "nul\n"; 115 if (BaseType == FrameIndexBase) 116 dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n'; 117 dbgs() << " Scale " << Scale << '\n' 118 << "IndexReg "; 119 if (NegateIndex) 120 dbgs() << "negate "; 121 if (IndexReg.getNode()) 122 IndexReg.getNode()->dump(DAG); 123 else 124 dbgs() << "nul\n"; 125 dbgs() << " Disp " << Disp << '\n' 126 << "GV "; 127 if (GV) 128 GV->dump(); 129 else 130 dbgs() << "nul"; 131 dbgs() << " CP "; 132 if (CP) 133 CP->dump(); 134 else 135 dbgs() << "nul"; 136 dbgs() << '\n' 137 << "ES "; 138 if (ES) 139 dbgs() << ES; 140 else 141 dbgs() << "nul"; 142 dbgs() << " MCSym "; 143 if (MCSym) 144 dbgs() << MCSym; 145 else 146 dbgs() << "nul"; 147 dbgs() << " JT" << JT << " Align" << Align << '\n'; 148 } 149 #endif 150 }; 151 } 152 153 namespace { 154 //===--------------------------------------------------------------------===// 155 /// ISel - X86-specific code to select X86 machine instructions for 156 /// SelectionDAG operations. 157 /// 158 class X86DAGToDAGISel final : public SelectionDAGISel { 159 /// Keep a pointer to the X86Subtarget around so that we can 160 /// make the right decision when generating code for different targets. 161 const X86Subtarget *Subtarget; 162 163 /// If true, selector should try to optimize for code size instead of 164 /// performance. 165 bool OptForSize; 166 167 /// If true, selector should try to optimize for minimum code size. 168 bool OptForMinSize; 169 170 /// Disable direct TLS access through segment registers. 171 bool IndirectTlsSegRefs; 172 173 public: 174 explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel) 175 : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr), OptForSize(false), 176 OptForMinSize(false), IndirectTlsSegRefs(false) {} 177 178 StringRef getPassName() const override { 179 return "X86 DAG->DAG Instruction Selection"; 180 } 181 182 bool runOnMachineFunction(MachineFunction &MF) override { 183 // Reset the subtarget each time through. 184 Subtarget = &MF.getSubtarget<X86Subtarget>(); 185 IndirectTlsSegRefs = MF.getFunction().hasFnAttribute( 186 "indirect-tls-seg-refs"); 187 188 // OptFor[Min]Size are used in pattern predicates that isel is matching. 189 OptForSize = MF.getFunction().hasOptSize(); 190 OptForMinSize = MF.getFunction().hasMinSize(); 191 assert((!OptForMinSize || OptForSize) && 192 "OptForMinSize implies OptForSize"); 193 194 SelectionDAGISel::runOnMachineFunction(MF); 195 return true; 196 } 197 198 void EmitFunctionEntryCode() override; 199 200 bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override; 201 202 void PreprocessISelDAG() override; 203 void PostprocessISelDAG() override; 204 205 // Include the pieces autogenerated from the target description. 
206 #include "X86GenDAGISel.inc" 207 208 private: 209 void Select(SDNode *N) override; 210 211 bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM); 212 bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM); 213 bool matchWrapper(SDValue N, X86ISelAddressMode &AM); 214 bool matchAddress(SDValue N, X86ISelAddressMode &AM); 215 bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM); 216 bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth); 217 bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, 218 unsigned Depth); 219 bool matchAddressBase(SDValue N, X86ISelAddressMode &AM); 220 bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base, 221 SDValue &Scale, SDValue &Index, SDValue &Disp, 222 SDValue &Segment); 223 bool selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, 224 SDValue &Scale, SDValue &Index, SDValue &Disp, 225 SDValue &Segment); 226 bool selectMOV64Imm32(SDValue N, SDValue &Imm); 227 bool selectLEAAddr(SDValue N, SDValue &Base, 228 SDValue &Scale, SDValue &Index, SDValue &Disp, 229 SDValue &Segment); 230 bool selectLEA64_32Addr(SDValue N, SDValue &Base, 231 SDValue &Scale, SDValue &Index, SDValue &Disp, 232 SDValue &Segment); 233 bool selectTLSADDRAddr(SDValue N, SDValue &Base, 234 SDValue &Scale, SDValue &Index, SDValue &Disp, 235 SDValue &Segment); 236 bool selectScalarSSELoad(SDNode *Root, SDNode *Parent, SDValue N, 237 SDValue &Base, SDValue &Scale, 238 SDValue &Index, SDValue &Disp, 239 SDValue &Segment, 240 SDValue &NodeWithChain); 241 bool selectRelocImm(SDValue N, SDValue &Op); 242 243 bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N, 244 SDValue &Base, SDValue &Scale, 245 SDValue &Index, SDValue &Disp, 246 SDValue &Segment); 247 248 // Convenience method where P is also root. 249 bool tryFoldLoad(SDNode *P, SDValue N, 250 SDValue &Base, SDValue &Scale, 251 SDValue &Index, SDValue &Disp, 252 SDValue &Segment) { 253 return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment); 254 } 255 256 /// Implement addressing mode selection for inline asm expressions. 257 bool SelectInlineAsmMemoryOperand(const SDValue &Op, 258 unsigned ConstraintID, 259 std::vector<SDValue> &OutOps) override; 260 261 void emitSpecialCodeForMain(); 262 263 inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL, 264 MVT VT, SDValue &Base, SDValue &Scale, 265 SDValue &Index, SDValue &Disp, 266 SDValue &Segment) { 267 if (AM.BaseType == X86ISelAddressMode::FrameIndexBase) 268 Base = CurDAG->getTargetFrameIndex( 269 AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout())); 270 else if (AM.Base_Reg.getNode()) 271 Base = AM.Base_Reg; 272 else 273 Base = CurDAG->getRegister(0, VT); 274 275 Scale = getI8Imm(AM.Scale, DL); 276 277 // Negate the index if needed. 278 if (AM.NegateIndex) { 279 unsigned NegOpc = VT == MVT::i64 ? X86::NEG64r : X86::NEG32r; 280 SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32, 281 AM.IndexReg), 0); 282 AM.IndexReg = Neg; 283 } 284 285 if (AM.IndexReg.getNode()) 286 Index = AM.IndexReg; 287 else 288 Index = CurDAG->getRegister(0, VT); 289 290 // These are 32-bit even in 64-bit mode since RIP-relative offset 291 // is 32-bit. 
292 if (AM.GV) 293 Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(), 294 MVT::i32, AM.Disp, 295 AM.SymbolFlags); 296 else if (AM.CP) 297 Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, 298 AM.Align, AM.Disp, AM.SymbolFlags); 299 else if (AM.ES) { 300 assert(!AM.Disp && "Non-zero displacement is ignored with ES."); 301 Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags); 302 } else if (AM.MCSym) { 303 assert(!AM.Disp && "Non-zero displacement is ignored with MCSym."); 304 assert(AM.SymbolFlags == 0 && "oo"); 305 Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32); 306 } else if (AM.JT != -1) { 307 assert(!AM.Disp && "Non-zero displacement is ignored with JT."); 308 Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags); 309 } else if (AM.BlockAddr) 310 Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp, 311 AM.SymbolFlags); 312 else 313 Disp = CurDAG->getTargetConstant(AM.Disp, DL, MVT::i32); 314 315 if (AM.Segment.getNode()) 316 Segment = AM.Segment; 317 else 318 Segment = CurDAG->getRegister(0, MVT::i16); 319 } 320 321 // Utility function to determine whether we should avoid selecting 322 // immediate forms of instructions for better code size or not. 323 // At a high level, we'd like to avoid such instructions when 324 // we have similar constants used within the same basic block 325 // that can be kept in a register. 326 // 327 bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const { 328 uint32_t UseCount = 0; 329 330 // Do not want to hoist if we're not optimizing for size. 331 // TODO: We'd like to remove this restriction. 332 // See the comment in X86InstrInfo.td for more info. 333 if (!OptForSize) 334 return false; 335 336 // Walk all the users of the immediate. 337 for (SDNode::use_iterator UI = N->use_begin(), 338 UE = N->use_end(); (UI != UE) && (UseCount < 2); ++UI) { 339 340 SDNode *User = *UI; 341 342 // This user is already selected. Count it as a legitimate use and 343 // move on. 344 if (User->isMachineOpcode()) { 345 UseCount++; 346 continue; 347 } 348 349 // We want to count stores of immediates as real uses. 350 if (User->getOpcode() == ISD::STORE && 351 User->getOperand(1).getNode() == N) { 352 UseCount++; 353 continue; 354 } 355 356 // We don't currently match users that have > 2 operands (except 357 // for stores, which are handled above) 358 // Those instruction won't match in ISEL, for now, and would 359 // be counted incorrectly. 360 // This may change in the future as we add additional instruction 361 // types. 362 if (User->getNumOperands() != 2) 363 continue; 364 365 // Immediates that are used for offsets as part of stack 366 // manipulation should be left alone. These are typically 367 // used to indicate SP offsets for argument passing and 368 // will get pulled into stores/pushes (implicitly). 369 if (User->getOpcode() == X86ISD::ADD || 370 User->getOpcode() == ISD::ADD || 371 User->getOpcode() == X86ISD::SUB || 372 User->getOpcode() == ISD::SUB) { 373 374 // Find the other operand of the add/sub. 375 SDValue OtherOp = User->getOperand(0); 376 if (OtherOp.getNode() == N) 377 OtherOp = User->getOperand(1); 378 379 // Don't count if the other operand is SP. 380 RegisterSDNode *RegNode; 381 if (OtherOp->getOpcode() == ISD::CopyFromReg && 382 (RegNode = dyn_cast_or_null<RegisterSDNode>( 383 OtherOp->getOperand(1).getNode()))) 384 if ((RegNode->getReg() == X86::ESP) || 385 (RegNode->getReg() == X86::RSP)) 386 continue; 387 } 388 389 // ... otherwise, count this and move on. 
390 UseCount++; 391 } 392 393 // If we have more than 1 use, then recommend for hoisting. 394 return (UseCount > 1); 395 } 396 397 /// Return a target constant with the specified value of type i8. 398 inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) { 399 return CurDAG->getTargetConstant(Imm, DL, MVT::i8); 400 } 401 402 /// Return a target constant with the specified value, of type i32. 403 inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) { 404 return CurDAG->getTargetConstant(Imm, DL, MVT::i32); 405 } 406 407 /// Return a target constant with the specified value, of type i64. 408 inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) { 409 return CurDAG->getTargetConstant(Imm, DL, MVT::i64); 410 } 411 412 SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth, 413 const SDLoc &DL) { 414 assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width"); 415 uint64_t Index = N->getConstantOperandVal(1); 416 MVT VecVT = N->getOperand(0).getSimpleValueType(); 417 return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL); 418 } 419 420 SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth, 421 const SDLoc &DL) { 422 assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width"); 423 uint64_t Index = N->getConstantOperandVal(2); 424 MVT VecVT = N->getSimpleValueType(0); 425 return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL); 426 } 427 428 // Helper to detect unneeded and instructions on shift amounts. Called 429 // from PatFrags in tablegen. 430 bool isUnneededShiftMask(SDNode *N, unsigned Width) const { 431 assert(N->getOpcode() == ISD::AND && "Unexpected opcode"); 432 const APInt &Val = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue(); 433 434 if (Val.countTrailingOnes() >= Width) 435 return true; 436 437 APInt Mask = Val | CurDAG->computeKnownBits(N->getOperand(0)).Zero; 438 return Mask.countTrailingOnes() >= Width; 439 } 440 441 /// Return an SDNode that returns the value of the global base register. 442 /// Output instructions required to initialize the global base register, 443 /// if necessary. 444 SDNode *getGlobalBaseReg(); 445 446 /// Return a reference to the TargetMachine, casted to the target-specific 447 /// type. 448 const X86TargetMachine &getTargetMachine() const { 449 return static_cast<const X86TargetMachine &>(TM); 450 } 451 452 /// Return a reference to the TargetInstrInfo, casted to the target-specific 453 /// type. 454 const X86InstrInfo *getInstrInfo() const { 455 return Subtarget->getInstrInfo(); 456 } 457 458 /// Address-mode matching performs shift-of-and to and-of-shift 459 /// reassociation in order to expose more scaled addressing 460 /// opportunities. 461 bool ComplexPatternFuncMutatesDAG() const override { 462 return true; 463 } 464 465 bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const; 466 467 /// Returns whether this is a relocatable immediate in the range 468 /// [-2^Width .. 2^Width-1]. 469 template <unsigned Width> bool isSExtRelocImm(SDNode *N) const { 470 if (auto *CN = dyn_cast<ConstantSDNode>(N)) 471 return isInt<Width>(CN->getSExtValue()); 472 return isSExtAbsoluteSymbolRef(Width, N); 473 } 474 475 // Indicates we should prefer to use a non-temporal load for this load. 
476 bool useNonTemporalLoad(LoadSDNode *N) const { 477 if (!N->isNonTemporal()) 478 return false; 479 480 unsigned StoreSize = N->getMemoryVT().getStoreSize(); 481 482 if (N->getAlignment() < StoreSize) 483 return false; 484 485 switch (StoreSize) { 486 default: llvm_unreachable("Unsupported store size"); 487 case 4: 488 case 8: 489 return false; 490 case 16: 491 return Subtarget->hasSSE41(); 492 case 32: 493 return Subtarget->hasAVX2(); 494 case 64: 495 return Subtarget->hasAVX512(); 496 } 497 } 498 499 bool foldLoadStoreIntoMemOperand(SDNode *Node); 500 MachineSDNode *matchBEXTRFromAndImm(SDNode *Node); 501 bool matchBitExtract(SDNode *Node); 502 bool shrinkAndImmediate(SDNode *N); 503 bool isMaskZeroExtended(SDNode *N) const; 504 bool tryShiftAmountMod(SDNode *N); 505 bool tryShrinkShlLogicImm(SDNode *N); 506 bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask); 507 508 MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad, 509 const SDLoc &dl, MVT VT, SDNode *Node); 510 MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad, 511 const SDLoc &dl, MVT VT, SDNode *Node, 512 SDValue &InFlag); 513 514 bool tryOptimizeRem8Extend(SDNode *N); 515 516 bool onlyUsesZeroFlag(SDValue Flags) const; 517 bool hasNoSignFlagUses(SDValue Flags) const; 518 bool hasNoCarryFlagUses(SDValue Flags) const; 519 }; 520 } 521 522 523 // Returns true if this masked compare can be implemented legally with this 524 // type. 525 static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) { 526 unsigned Opcode = N->getOpcode(); 527 if (Opcode == X86ISD::CMPM || Opcode == ISD::SETCC || 528 Opcode == X86ISD::CMPM_SAE || Opcode == X86ISD::VFPCLASS) { 529 // We can get 256-bit 8 element types here without VLX being enabled. When 530 // this happens we will use 512-bit operations and the mask will not be 531 // zero extended. 532 EVT OpVT = N->getOperand(0).getValueType(); 533 if (OpVT.is256BitVector() || OpVT.is128BitVector()) 534 return Subtarget->hasVLX(); 535 536 return true; 537 } 538 // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check. 539 if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM || 540 Opcode == X86ISD::FSETCCM_SAE) 541 return true; 542 543 return false; 544 } 545 546 // Returns true if we can assume the writer of the mask has zero extended it 547 // for us. 548 bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const { 549 // If this is an AND, check if we have a compare on either side. As long as 550 // one side guarantees the mask is zero extended, the AND will preserve those 551 // zeros. 552 if (N->getOpcode() == ISD::AND) 553 return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) || 554 isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget); 555 556 return isLegalMaskCompare(N, Subtarget); 557 } 558 559 bool 560 X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const { 561 if (OptLevel == CodeGenOpt::None) return false; 562 563 if (!N.hasOneUse()) 564 return false; 565 566 if (N.getOpcode() != ISD::LOAD) 567 return true; 568 569 // Don't fold non-temporal loads if we have an instruction for them. 570 if (useNonTemporalLoad(cast<LoadSDNode>(N))) 571 return false; 572 573 // If N is a load, do additional profitability checks. 
574   if (U == Root) {
575     switch (U->getOpcode()) {
576     default: break;
577     case X86ISD::ADD:
578     case X86ISD::ADC:
579     case X86ISD::SUB:
580     case X86ISD::SBB:
581     case X86ISD::AND:
582     case X86ISD::XOR:
583     case X86ISD::OR:
584     case ISD::ADD:
585     case ISD::ADDCARRY:
586     case ISD::AND:
587     case ISD::OR:
588     case ISD::XOR: {
589       SDValue Op1 = U->getOperand(1);
590
591       // If the other operand is an 8-bit immediate we should fold the immediate
592       // instead. This reduces code size.
593       // e.g.
594       // movl 4(%esp), %eax
595       // addl $4, %eax
596       // vs.
597       // movl $4, %eax
598       // addl 4(%esp), %eax
599       // The former is 2 bytes shorter. In the case where the increment is 1,
600       // the saving can be 4 bytes (by using incl %eax).
601       if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Op1)) {
602         if (Imm->getAPIntValue().isSignedIntN(8))
603           return false;
604
605         // If this is a 64-bit AND with an immediate that fits in 32 bits,
606         // prefer using the smaller AND over folding the load. This is needed to
607         // make sure immediates created by shrinkAndImmediate are always folded.
608         // Ideally we would narrow the load during DAG combine and get the
609         // best of both worlds.
610         if (U->getOpcode() == ISD::AND &&
611             Imm->getAPIntValue().getBitWidth() == 64 &&
612             Imm->getAPIntValue().isIntN(32))
613           return false;
614
615         // If this is really a zext_inreg that can be represented with a movzx
616         // instruction, prefer that.
617         // TODO: We could shrink the load and fold if it is non-volatile.
618         if (U->getOpcode() == ISD::AND &&
619             (Imm->getAPIntValue() == UINT8_MAX ||
620              Imm->getAPIntValue() == UINT16_MAX ||
621              Imm->getAPIntValue() == UINT32_MAX))
622           return false;
623
624         // ADD/SUB can negate the immediate and use the opposite operation
625         // to fit 128 into a sign-extended 8-bit immediate.
626         if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) &&
627             (-Imm->getAPIntValue()).isSignedIntN(8))
628           return false;
629       }
630
631       // If the other operand is a TLS address, we should fold it instead.
632       // This produces
633       // movl %gs:0, %eax
634       // leal i@NTPOFF(%eax), %eax
635       // instead of
636       // movl $i@NTPOFF, %eax
637       // addl %gs:0, %eax
638       // If the block also has an access to a second TLS address, this will save
639       // a load.
640       // FIXME: This is probably also true for non-TLS addresses.
641       if (Op1.getOpcode() == X86ISD::Wrapper) {
642         SDValue Val = Op1.getOperand(0);
643         if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
644           return false;
645       }
646
647       // Don't fold a load if this matches the BTS/BTR/BTC patterns.
648       // BTS: (or X, (shl 1, n))
649       // BTR: (and X, (rotl -2, n))
650       // BTC: (xor X, (shl 1, n))
651       if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) {
652         if (U->getOperand(0).getOpcode() == ISD::SHL &&
653             isOneConstant(U->getOperand(0).getOperand(0)))
654           return false;
655
656         if (U->getOperand(1).getOpcode() == ISD::SHL &&
657             isOneConstant(U->getOperand(1).getOperand(0)))
658           return false;
659       }
660       if (U->getOpcode() == ISD::AND) {
661         SDValue U0 = U->getOperand(0);
662         SDValue U1 = U->getOperand(1);
663         if (U0.getOpcode() == ISD::ROTL) {
664           auto *C = dyn_cast<ConstantSDNode>(U0.getOperand(0));
665           if (C && C->getSExtValue() == -2)
666             return false;
667         }
668
669         if (U1.getOpcode() == ISD::ROTL) {
670           auto *C = dyn_cast<ConstantSDNode>(U1.getOperand(0));
671           if (C && C->getSExtValue() == -2)
672             return false;
673         }
674       }
675
676       break;
677     }
678     case ISD::SHL:
679     case ISD::SRA:
680     case ISD::SRL:
681       // Don't fold a load into a shift by immediate. The BMI2 instructions
682       // support folding a load, but not an immediate. The legacy instructions
683       // support folding an immediate, but can't fold a load. Folding an
684       // immediate is preferable to folding a load.
685       if (isa<ConstantSDNode>(U->getOperand(1)))
686         return false;
687
688       break;
689     }
690   }
691
692   // Prevent folding a load if this can be implemented with an insert_subreg or
693   // a move that implicitly zeroes.
694   if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
695       isNullConstant(Root->getOperand(2)) &&
696       (Root->getOperand(0).isUndef() ||
697        ISD::isBuildVectorAllZeros(Root->getOperand(0).getNode())))
698     return false;
699
700   return true;
701 }
702
703 /// Replace the original chain operand of the call with
704 /// the load's chain operand and move the load below the call's chain operand.
705 static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
706                                SDValue Call, SDValue OrigChain) {
707   SmallVector<SDValue, 8> Ops;
708   SDValue Chain = OrigChain.getOperand(0);
709   if (Chain.getNode() == Load.getNode())
710     Ops.push_back(Load.getOperand(0));
711   else {
712     assert(Chain.getOpcode() == ISD::TokenFactor &&
713            "Unexpected chain operand");
714     for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
715       if (Chain.getOperand(i).getNode() == Load.getNode())
716         Ops.push_back(Load.getOperand(0));
717       else
718         Ops.push_back(Chain.getOperand(i));
719     SDValue NewChain =
720       CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
721     Ops.clear();
722     Ops.push_back(NewChain);
723   }
724   Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end());
725   CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);
726   CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
727                              Load.getOperand(1), Load.getOperand(2));
728
729   Ops.clear();
730   Ops.push_back(SDValue(Load.getNode(), 1));
731   Ops.append(Call->op_begin() + 1, Call->op_end());
732   CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
733 }
734
735 /// Return true if the call address is a load and it can be
736 /// moved below CALLSEQ_START and the chains leading up to the call.
737 /// Return the CALLSEQ_START by reference as a second output.
738 /// In the case of a tail call, there isn't a callseq node between the call
739 /// chain and the load.
740 static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
741   // The transformation is somewhat dangerous if the call's chain was glued to
742   // the call.
After MoveBelowOrigChain the load is moved between the call and 743 // the chain, this can create a cycle if the load is not folded. So it is 744 // *really* important that we are sure the load will be folded. 745 if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse()) 746 return false; 747 LoadSDNode *LD = dyn_cast<LoadSDNode>(Callee.getNode()); 748 if (!LD || 749 LD->isVolatile() || 750 LD->getAddressingMode() != ISD::UNINDEXED || 751 LD->getExtensionType() != ISD::NON_EXTLOAD) 752 return false; 753 754 // Now let's find the callseq_start. 755 while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) { 756 if (!Chain.hasOneUse()) 757 return false; 758 Chain = Chain.getOperand(0); 759 } 760 761 if (!Chain.getNumOperands()) 762 return false; 763 // Since we are not checking for AA here, conservatively abort if the chain 764 // writes to memory. It's not safe to move the callee (a load) across a store. 765 if (isa<MemSDNode>(Chain.getNode()) && 766 cast<MemSDNode>(Chain.getNode())->writeMem()) 767 return false; 768 if (Chain.getOperand(0).getNode() == Callee.getNode()) 769 return true; 770 if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor && 771 Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) && 772 Callee.getValue(1).hasOneUse()) 773 return true; 774 return false; 775 } 776 777 void X86DAGToDAGISel::PreprocessISelDAG() { 778 for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), 779 E = CurDAG->allnodes_end(); I != E; ) { 780 SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues. 781 782 // If this is a target specific AND node with no flag usages, turn it back 783 // into ISD::AND to enable test instruction matching. 784 if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) { 785 SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0), 786 N->getOperand(0), N->getOperand(1)); 787 --I; 788 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); 789 ++I; 790 CurDAG->DeleteNode(N); 791 continue; 792 } 793 794 switch (N->getOpcode()) { 795 case ISD::FP_TO_SINT: 796 case ISD::FP_TO_UINT: { 797 // Replace vector fp_to_s/uint with their X86 specific equivalent so we 798 // don't need 2 sets of patterns. 799 if (!N->getSimpleValueType(0).isVector()) 800 break; 801 802 unsigned NewOpc; 803 switch (N->getOpcode()) { 804 default: llvm_unreachable("Unexpected opcode!"); 805 case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break; 806 case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break; 807 } 808 SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0), 809 N->getOperand(0)); 810 --I; 811 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); 812 ++I; 813 CurDAG->DeleteNode(N); 814 continue; 815 } 816 case ISD::SHL: 817 case ISD::SRA: 818 case ISD::SRL: { 819 // Replace vector shifts with their X86 specific equivalent so we don't 820 // need 2 sets of patterns. 
821 if (!N->getValueType(0).isVector()) 822 break; 823 824 unsigned NewOpc; 825 switch (N->getOpcode()) { 826 default: llvm_unreachable("Unexpected opcode!"); 827 case ISD::SHL: NewOpc = X86ISD::VSHLV; break; 828 case ISD::SRA: NewOpc = X86ISD::VSRAV; break; 829 case ISD::SRL: NewOpc = X86ISD::VSRLV; break; 830 } 831 SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0), 832 N->getOperand(0), N->getOperand(1)); 833 --I; 834 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); 835 ++I; 836 CurDAG->DeleteNode(N); 837 continue; 838 } 839 case ISD::ANY_EXTEND: 840 case ISD::ANY_EXTEND_VECTOR_INREG: { 841 // Replace vector any extend with the zero extend equivalents so we don't 842 // need 2 sets of patterns. Ignore vXi1 extensions. 843 if (!N->getValueType(0).isVector() || 844 N->getOperand(0).getScalarValueSizeInBits() == 1) 845 break; 846 847 unsigned NewOpc = N->getOpcode() == ISD::ANY_EXTEND 848 ? ISD::ZERO_EXTEND 849 : ISD::ZERO_EXTEND_VECTOR_INREG; 850 851 SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0), 852 N->getOperand(0)); 853 --I; 854 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); 855 ++I; 856 CurDAG->DeleteNode(N); 857 continue; 858 } 859 case ISD::FCEIL: 860 case ISD::FFLOOR: 861 case ISD::FTRUNC: 862 case ISD::FNEARBYINT: 863 case ISD::FRINT: { 864 // Replace fp rounding with their X86 specific equivalent so we don't 865 // need 2 sets of patterns. 866 unsigned Imm; 867 switch (N->getOpcode()) { 868 default: llvm_unreachable("Unexpected opcode!"); 869 case ISD::FCEIL: Imm = 0xA; break; 870 case ISD::FFLOOR: Imm = 0x9; break; 871 case ISD::FTRUNC: Imm = 0xB; break; 872 case ISD::FNEARBYINT: Imm = 0xC; break; 873 case ISD::FRINT: Imm = 0x4; break; 874 } 875 SDLoc dl(N); 876 SDValue Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, 877 N->getValueType(0), 878 N->getOperand(0), 879 CurDAG->getConstant(Imm, dl, MVT::i8)); 880 --I; 881 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); 882 ++I; 883 CurDAG->DeleteNode(N); 884 continue; 885 } 886 case X86ISD::FANDN: 887 case X86ISD::FAND: 888 case X86ISD::FOR: 889 case X86ISD::FXOR: { 890 // Widen scalar fp logic ops to vector to reduce isel patterns. 891 // FIXME: Can we do this during lowering/combine. 892 MVT VT = N->getSimpleValueType(0); 893 if (VT.isVector() || VT == MVT::f128) 894 break; 895 896 MVT VecVT = VT == MVT::f64 ? 
MVT::v2f64 : MVT::v4f32; 897 SDLoc dl(N); 898 SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, 899 N->getOperand(0)); 900 SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, 901 N->getOperand(1)); 902 903 SDValue Res; 904 if (Subtarget->hasSSE2()) { 905 EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger(); 906 Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0); 907 Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1); 908 unsigned Opc; 909 switch (N->getOpcode()) { 910 default: llvm_unreachable("Unexpected opcode!"); 911 case X86ISD::FANDN: Opc = X86ISD::ANDNP; break; 912 case X86ISD::FAND: Opc = ISD::AND; break; 913 case X86ISD::FOR: Opc = ISD::OR; break; 914 case X86ISD::FXOR: Opc = ISD::XOR; break; 915 } 916 Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1); 917 Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res); 918 } else { 919 Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1); 920 } 921 Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, 922 CurDAG->getIntPtrConstant(0, dl)); 923 --I; 924 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); 925 ++I; 926 CurDAG->DeleteNode(N); 927 continue; 928 } 929 } 930 931 if (OptLevel != CodeGenOpt::None && 932 // Only do this when the target can fold the load into the call or 933 // jmp. 934 !Subtarget->useRetpolineIndirectCalls() && 935 ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) || 936 (N->getOpcode() == X86ISD::TC_RETURN && 937 (Subtarget->is64Bit() || 938 !getTargetMachine().isPositionIndependent())))) { 939 /// Also try moving call address load from outside callseq_start to just 940 /// before the call to allow it to be folded. 941 /// 942 /// [Load chain] 943 /// ^ 944 /// | 945 /// [Load] 946 /// ^ ^ 947 /// | | 948 /// / \-- 949 /// / | 950 ///[CALLSEQ_START] | 951 /// ^ | 952 /// | | 953 /// [LOAD/C2Reg] | 954 /// | | 955 /// \ / 956 /// \ / 957 /// [CALL] 958 bool HasCallSeq = N->getOpcode() == X86ISD::CALL; 959 SDValue Chain = N->getOperand(0); 960 SDValue Load = N->getOperand(1); 961 if (!isCalleeLoad(Load, Chain, HasCallSeq)) 962 continue; 963 moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain); 964 ++NumLoadMoved; 965 continue; 966 } 967 968 // Lower fpround and fpextend nodes that target the FP stack to be store and 969 // load to the stack. This is a gross hack. We would like to simply mark 970 // these as being illegal, but when we do that, legalize produces these when 971 // it expands calls, then expands these in the same legalize pass. We would 972 // like dag combine to be able to hack on these between the call expansion 973 // and the node legalization. As such this pass basically does "really 974 // late" legalization of these inline with the X86 isel pass. 975 // FIXME: This should only happen when not compiled with -O0. 976 switch (N->getOpcode()) { 977 default: continue; 978 case ISD::FP_ROUND: 979 case ISD::FP_EXTEND: 980 { 981 MVT SrcVT = N->getOperand(0).getSimpleValueType(); 982 MVT DstVT = N->getSimpleValueType(0); 983 984 // If any of the sources are vectors, no fp stack involved. 985 if (SrcVT.isVector() || DstVT.isVector()) 986 continue; 987 988 // If the source and destination are SSE registers, then this is a legal 989 // conversion that should not be lowered. 
990       const X86TargetLowering *X86Lowering =
991           static_cast<const X86TargetLowering *>(TLI);
992       bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
993       bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
994       if (SrcIsSSE && DstIsSSE)
995         continue;
996
997       if (!SrcIsSSE && !DstIsSSE) {
998         // If this is an FPStack extension, it is a noop.
999         if (N->getOpcode() == ISD::FP_EXTEND)
1000           continue;
1001         // If this is a value-preserving FPStack truncation, it is a noop.
1002         if (N->getConstantOperandVal(1))
1003           continue;
1004       }
1005
1006       // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1007       // FPStack has extload and truncstore. SSE can fold direct loads into other
1008       // operations. Based on this, decide what we want to do.
1009       MVT MemVT;
1010       if (N->getOpcode() == ISD::FP_ROUND)
1011         MemVT = DstVT; // FP_ROUND must use DstVT; we can't do a 'trunc load'.
1012       else
1013         MemVT = SrcIsSSE ? SrcVT : DstVT;
1014
1015       SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
1016       SDLoc dl(N);
1017
1018       // FIXME: optimize the case where the src/dest is a load or store?
1019
1020       SDValue Store = CurDAG->getTruncStore(CurDAG->getEntryNode(), dl, N->getOperand(0),
1021                                             MemTmp, MachinePointerInfo(), MemVT);
1022       SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
1023                                           MachinePointerInfo(), MemVT);
1024
1025       // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1026       // extload we created. This will cause general havoc on the DAG because
1027       // anything below the conversion could be folded into other existing nodes.
1028       // To avoid invalidating 'I', back it up to the convert node.
1029       --I;
1030       CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
1031       break;
1032     }
1033
1034     // The sequence of events for lowering STRICT_FP versions of these nodes requires
1035     // dealing with the chain differently, as there is already a preexisting chain.
1036     case ISD::STRICT_FP_ROUND:
1037     case ISD::STRICT_FP_EXTEND:
1038     {
1039       MVT SrcVT = N->getOperand(1).getSimpleValueType();
1040       MVT DstVT = N->getSimpleValueType(0);
1041
1042       // If any of the sources are vectors, no fp stack involved.
1043       if (SrcVT.isVector() || DstVT.isVector())
1044         continue;
1045
1046       // If the source and destination are SSE registers, then this is a legal
1047       // conversion that should not be lowered.
1048       const X86TargetLowering *X86Lowering =
1049           static_cast<const X86TargetLowering *>(TLI);
1050       bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
1051       bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
1052       if (SrcIsSSE && DstIsSSE)
1053         continue;
1054
1055       if (!SrcIsSSE && !DstIsSSE) {
1056         // If this is an FPStack extension, it is a noop.
1057         if (N->getOpcode() == ISD::STRICT_FP_EXTEND)
1058           continue;
1059         // If this is a value-preserving FPStack truncation, it is a noop.
1060         if (N->getConstantOperandVal(2))
1061           continue;
1062       }
1063
1064       // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1065       // FPStack has extload and truncstore. SSE can fold direct loads into other
1066       // operations. Based on this, decide what we want to do.
1067       MVT MemVT;
1068       if (N->getOpcode() == ISD::STRICT_FP_ROUND)
1069         MemVT = DstVT; // FP_ROUND must use DstVT; we can't do a 'trunc load'.
1070       else
1071         MemVT = SrcIsSSE ? SrcVT : DstVT;
1072
1073       SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
1074       SDLoc dl(N);
1075
1076       // FIXME: optimize the case where the src/dest is a load or store?
1077
1078       // Since the operation is StrictFP, use the preexisting chain.
1079       SDValue Store = CurDAG->getTruncStore(N->getOperand(0), dl, N->getOperand(1),
1080                                             MemTmp, MachinePointerInfo(), MemVT);
1081       SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
1082                                           MachinePointerInfo(), MemVT);
1083
1084       // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1085       // extload we created. This will cause general havoc on the DAG because
1086       // anything below the conversion could be folded into other existing nodes.
1087       // To avoid invalidating 'I', back it up to the convert node.
1088       --I;
1089       CurDAG->ReplaceAllUsesWith(N, Result.getNode());
1090       break;
1091     }
1092     }
1093
1094
1095     // Now that we did that, the node is dead. Increment the iterator to the
1096     // next node to process, then delete N.
1097     ++I;
1098     CurDAG->DeleteNode(N);
1099   }
1100
1101   // The load+call transform above can leave some dead nodes in the graph. Make
1102   // sure we remove them. It's possible some of the other transforms do too, so
1103   // just remove dead nodes unconditionally.
1104   CurDAG->RemoveDeadNodes();
1105 }
1106
1107 // Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
1108 bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {
1109   unsigned Opc = N->getMachineOpcode();
1110   if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 &&
1111       Opc != X86::MOVSX64rr8)
1112     return false;
1113
1114   SDValue N0 = N->getOperand(0);
1115
1116   // We need to be extracting the lower bit of an extend.
1117   if (!N0.isMachineOpcode() ||
1118       N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG ||
1119       N0.getConstantOperandVal(1) != X86::sub_8bit)
1120     return false;
1121
1122   // We're looking for either a movsx or movzx to match the original opcode.
1123   unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX
1124                                                 : X86::MOVSX32rr8_NOREX;
1125   SDValue N00 = N0.getOperand(0);
1126   if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc)
1127     return false;
1128
1129   if (Opc == X86::MOVSX64rr8) {
1130     // If we had a sign extend from 8 to 64 bits, we still need to go from 32
1131     // to 64.
1132     MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N),
1133                                                    MVT::i64, N00);
1134     ReplaceUses(N, Extend);
1135   } else {
1136     // OK, we can drop this extend and just use the original extend.
1137     ReplaceUses(N, N00.getNode());
1138   }
1139
1140   return true;
1141 }
1142
1143 void X86DAGToDAGISel::PostprocessISelDAG() {
1144   // Skip peepholes at -O0.
1145   if (TM.getOptLevel() == CodeGenOpt::None)
1146     return;
1147
1148   SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
1149
1150   bool MadeChange = false;
1151   while (Position != CurDAG->allnodes_begin()) {
1152     SDNode *N = &*--Position;
1153     // Skip dead nodes and any non-machine opcodes.
1154     if (N->use_empty() || !N->isMachineOpcode())
1155       continue;
1156
1157     if (tryOptimizeRem8Extend(N)) {
1158       MadeChange = true;
1159       continue;
1160     }
1161
1162     // Look for a TESTrr+ANDrr pattern where both operands of the test are
1163     // the same. Rewrite to remove the AND.
1164 unsigned Opc = N->getMachineOpcode(); 1165 if ((Opc == X86::TEST8rr || Opc == X86::TEST16rr || 1166 Opc == X86::TEST32rr || Opc == X86::TEST64rr) && 1167 N->getOperand(0) == N->getOperand(1) && 1168 N->isOnlyUserOf(N->getOperand(0).getNode()) && 1169 N->getOperand(0).isMachineOpcode()) { 1170 SDValue And = N->getOperand(0); 1171 unsigned N0Opc = And.getMachineOpcode(); 1172 if (N0Opc == X86::AND8rr || N0Opc == X86::AND16rr || 1173 N0Opc == X86::AND32rr || N0Opc == X86::AND64rr) { 1174 MachineSDNode *Test = CurDAG->getMachineNode(Opc, SDLoc(N), 1175 MVT::i32, 1176 And.getOperand(0), 1177 And.getOperand(1)); 1178 ReplaceUses(N, Test); 1179 MadeChange = true; 1180 continue; 1181 } 1182 if (N0Opc == X86::AND8rm || N0Opc == X86::AND16rm || 1183 N0Opc == X86::AND32rm || N0Opc == X86::AND64rm) { 1184 unsigned NewOpc; 1185 switch (N0Opc) { 1186 case X86::AND8rm: NewOpc = X86::TEST8mr; break; 1187 case X86::AND16rm: NewOpc = X86::TEST16mr; break; 1188 case X86::AND32rm: NewOpc = X86::TEST32mr; break; 1189 case X86::AND64rm: NewOpc = X86::TEST64mr; break; 1190 } 1191 1192 // Need to swap the memory and register operand. 1193 SDValue Ops[] = { And.getOperand(1), 1194 And.getOperand(2), 1195 And.getOperand(3), 1196 And.getOperand(4), 1197 And.getOperand(5), 1198 And.getOperand(0), 1199 And.getOperand(6) /* Chain */ }; 1200 MachineSDNode *Test = CurDAG->getMachineNode(NewOpc, SDLoc(N), 1201 MVT::i32, MVT::Other, Ops); 1202 ReplaceUses(N, Test); 1203 MadeChange = true; 1204 continue; 1205 } 1206 } 1207 1208 // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is 1209 // used. We're doing this late so we can prefer to fold the AND into masked 1210 // comparisons. Doing that can be better for the live range of the mask 1211 // register. 1212 if ((Opc == X86::KORTESTBrr || Opc == X86::KORTESTWrr || 1213 Opc == X86::KORTESTDrr || Opc == X86::KORTESTQrr) && 1214 N->getOperand(0) == N->getOperand(1) && 1215 N->isOnlyUserOf(N->getOperand(0).getNode()) && 1216 N->getOperand(0).isMachineOpcode() && 1217 onlyUsesZeroFlag(SDValue(N, 0))) { 1218 SDValue And = N->getOperand(0); 1219 unsigned N0Opc = And.getMachineOpcode(); 1220 // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other 1221 // KAND instructions and KTEST use the same ISA feature. 1222 if (N0Opc == X86::KANDBrr || 1223 (N0Opc == X86::KANDWrr && Subtarget->hasDQI()) || 1224 N0Opc == X86::KANDDrr || N0Opc == X86::KANDQrr) { 1225 unsigned NewOpc; 1226 switch (Opc) { 1227 default: llvm_unreachable("Unexpected opcode!"); 1228 case X86::KORTESTBrr: NewOpc = X86::KTESTBrr; break; 1229 case X86::KORTESTWrr: NewOpc = X86::KTESTWrr; break; 1230 case X86::KORTESTDrr: NewOpc = X86::KTESTDrr; break; 1231 case X86::KORTESTQrr: NewOpc = X86::KTESTQrr; break; 1232 } 1233 MachineSDNode *KTest = CurDAG->getMachineNode(NewOpc, SDLoc(N), 1234 MVT::i32, 1235 And.getOperand(0), 1236 And.getOperand(1)); 1237 ReplaceUses(N, KTest); 1238 MadeChange = true; 1239 continue; 1240 } 1241 } 1242 1243 // Attempt to remove vectors moves that were inserted to zero upper bits. 1244 if (Opc != TargetOpcode::SUBREG_TO_REG) 1245 continue; 1246 1247 unsigned SubRegIdx = N->getConstantOperandVal(2); 1248 if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm) 1249 continue; 1250 1251 SDValue Move = N->getOperand(1); 1252 if (!Move.isMachineOpcode()) 1253 continue; 1254 1255 // Make sure its one of the move opcodes we recognize. 
1256 switch (Move.getMachineOpcode()) { 1257 default: 1258 continue; 1259 case X86::VMOVAPDrr: case X86::VMOVUPDrr: 1260 case X86::VMOVAPSrr: case X86::VMOVUPSrr: 1261 case X86::VMOVDQArr: case X86::VMOVDQUrr: 1262 case X86::VMOVAPDYrr: case X86::VMOVUPDYrr: 1263 case X86::VMOVAPSYrr: case X86::VMOVUPSYrr: 1264 case X86::VMOVDQAYrr: case X86::VMOVDQUYrr: 1265 case X86::VMOVAPDZ128rr: case X86::VMOVUPDZ128rr: 1266 case X86::VMOVAPSZ128rr: case X86::VMOVUPSZ128rr: 1267 case X86::VMOVDQA32Z128rr: case X86::VMOVDQU32Z128rr: 1268 case X86::VMOVDQA64Z128rr: case X86::VMOVDQU64Z128rr: 1269 case X86::VMOVAPDZ256rr: case X86::VMOVUPDZ256rr: 1270 case X86::VMOVAPSZ256rr: case X86::VMOVUPSZ256rr: 1271 case X86::VMOVDQA32Z256rr: case X86::VMOVDQU32Z256rr: 1272 case X86::VMOVDQA64Z256rr: case X86::VMOVDQU64Z256rr: 1273 break; 1274 } 1275 1276 SDValue In = Move.getOperand(0); 1277 if (!In.isMachineOpcode() || 1278 In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END) 1279 continue; 1280 1281 // Make sure the instruction has a VEX, XOP, or EVEX prefix. This covers 1282 // the SHA instructions which use a legacy encoding. 1283 uint64_t TSFlags = getInstrInfo()->get(In.getMachineOpcode()).TSFlags; 1284 if ((TSFlags & X86II::EncodingMask) != X86II::VEX && 1285 (TSFlags & X86II::EncodingMask) != X86II::EVEX && 1286 (TSFlags & X86II::EncodingMask) != X86II::XOP) 1287 continue; 1288 1289 // Producing instruction is another vector instruction. We can drop the 1290 // move. 1291 CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2)); 1292 MadeChange = true; 1293 } 1294 1295 if (MadeChange) 1296 CurDAG->RemoveDeadNodes(); 1297 } 1298 1299 1300 /// Emit any code that needs to be executed only in the main function. 1301 void X86DAGToDAGISel::emitSpecialCodeForMain() { 1302 if (Subtarget->isTargetCygMing()) { 1303 TargetLowering::ArgListTy Args; 1304 auto &DL = CurDAG->getDataLayout(); 1305 1306 TargetLowering::CallLoweringInfo CLI(*CurDAG); 1307 CLI.setChain(CurDAG->getRoot()) 1308 .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()), 1309 CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)), 1310 std::move(Args)); 1311 const TargetLowering &TLI = CurDAG->getTargetLoweringInfo(); 1312 std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI); 1313 CurDAG->setRoot(Result.second); 1314 } 1315 } 1316 1317 void X86DAGToDAGISel::EmitFunctionEntryCode() { 1318 // If this is main, emit special code for main. 1319 const Function &F = MF->getFunction(); 1320 if (F.hasExternalLinkage() && F.getName() == "main") 1321 emitSpecialCodeForMain(); 1322 } 1323 1324 static bool isDispSafeForFrameIndex(int64_t Val) { 1325 // On 64-bit platforms, we can run into an issue where a frame index 1326 // includes a displacement that, when added to the explicit displacement, 1327 // will overflow the displacement field. Assuming that the frame index 1328 // displacement fits into a 31-bit integer (which is only slightly more 1329 // aggressive than the current fundamental assumption that it fits into 1330 // a 32-bit integer), a 31-bit disp should always be safe. 1331 return isInt<31>(Val); 1332 } 1333 1334 bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset, 1335 X86ISelAddressMode &AM) { 1336 // If there's no offset to fold, we don't need to do any work. 1337 if (Offset == 0) 1338 return false; 1339 1340 // Cannot combine ExternalSymbol displacements with integer offsets. 
1341   if (AM.ES || AM.MCSym)
1342     return true;
1343
1344   int64_t Val = AM.Disp + Offset;
1345   CodeModel::Model M = TM.getCodeModel();
1346   if (Subtarget->is64Bit()) {
1347     if (!X86::isOffsetSuitableForCodeModel(Val, M,
1348                                            AM.hasSymbolicDisplacement()))
1349       return true;
1350     // In addition to the checks required for a register base, check that
1351     // we do not try to use an unsafe Disp with a frame index.
1352     if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
1353         !isDispSafeForFrameIndex(Val))
1354       return true;
1355   }
1356   AM.Disp = Val;
1357   return false;
1358
1359 }
1360
1361 bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM) {
1362   SDValue Address = N->getOperand(1);
1363
1364   // load gs:0 -> GS segment register.
1365   // load fs:0 -> FS segment register.
1366   //
1367   // This optimization is valid because the GNU TLS model defines that
1368   // gs:0 (or fs:0 on X86-64) contains its own address.
1369   // For more information see http://people.redhat.com/drepper/tls.pdf
1370   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address))
1371     if (C->getSExtValue() == 0 && AM.Segment.getNode() == nullptr &&
1372         !IndirectTlsSegRefs &&
1373         (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
1374          Subtarget->isTargetFuchsia()))
1375       switch (N->getPointerInfo().getAddrSpace()) {
1376       case 256:
1377         AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
1378         return false;
1379       case 257:
1380         AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
1381         return false;
1382       // Address space 258 is not handled here, because it is not used to
1383       // address TLS areas.
1384       }
1385
1386   return true;
1387 }
1388
1389 /// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
1390 /// mode. These wrap things that will resolve down into a symbol reference.
1391 /// If no match is possible, this returns true, otherwise it returns false.
1392 bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
1393   // If the addressing mode already has a symbol as the displacement, we can
1394   // never match another symbol.
1395   if (AM.hasSymbolicDisplacement())
1396     return true;
1397
1398   bool IsRIPRelTLS = false;
1399   bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
1400   if (IsRIPRel) {
1401     SDValue Val = N.getOperand(0);
1402     if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
1403       IsRIPRelTLS = true;
1404   }
1405
1406   // We can't use an addressing mode in the 64-bit large code model.
1407   // Global TLS addressing is an exception. In the medium code model,
1408   // we can use a mode when RIP wrappers are present.
1409   // That signifies access to globals that are known to be "near",
1410   // such as the GOT itself.
1411   CodeModel::Model M = TM.getCodeModel();
1412   if (Subtarget->is64Bit() &&
1413       ((M == CodeModel::Large && !IsRIPRelTLS) ||
1414        (M == CodeModel::Medium && !IsRIPRel)))
1415     return true;
1416
1417   // Base and index reg must be 0 in order to use %rip as base.
1418   if (IsRIPRel && AM.hasBaseOrIndexReg())
1419     return true;
1420
1421   // Make a local copy in case we can't do this fold.
1422 X86ISelAddressMode Backup = AM; 1423 1424 int64_t Offset = 0; 1425 SDValue N0 = N.getOperand(0); 1426 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) { 1427 AM.GV = G->getGlobal(); 1428 AM.SymbolFlags = G->getTargetFlags(); 1429 Offset = G->getOffset(); 1430 } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) { 1431 AM.CP = CP->getConstVal(); 1432 AM.Align = CP->getAlignment(); 1433 AM.SymbolFlags = CP->getTargetFlags(); 1434 Offset = CP->getOffset(); 1435 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) { 1436 AM.ES = S->getSymbol(); 1437 AM.SymbolFlags = S->getTargetFlags(); 1438 } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) { 1439 AM.MCSym = S->getMCSymbol(); 1440 } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) { 1441 AM.JT = J->getIndex(); 1442 AM.SymbolFlags = J->getTargetFlags(); 1443 } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) { 1444 AM.BlockAddr = BA->getBlockAddress(); 1445 AM.SymbolFlags = BA->getTargetFlags(); 1446 Offset = BA->getOffset(); 1447 } else 1448 llvm_unreachable("Unhandled symbol reference node."); 1449 1450 if (foldOffsetIntoAddress(Offset, AM)) { 1451 AM = Backup; 1452 return true; 1453 } 1454 1455 if (IsRIPRel) 1456 AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64)); 1457 1458 // Commit the changes now that we know this fold is safe. 1459 return false; 1460 } 1461 1462 /// Add the specified node to the specified addressing mode, returning true if 1463 /// it cannot be done. This just pattern matches for the addressing mode. 1464 bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) { 1465 if (matchAddressRecursively(N, AM, 0)) 1466 return true; 1467 1468 // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has 1469 // a smaller encoding and avoids a scaled-index. 1470 if (AM.Scale == 2 && 1471 AM.BaseType == X86ISelAddressMode::RegBase && 1472 AM.Base_Reg.getNode() == nullptr) { 1473 AM.Base_Reg = AM.IndexReg; 1474 AM.Scale = 1; 1475 } 1476 1477 // Post-processing: Convert foo to foo(%rip), even in non-PIC mode, 1478 // because it has a smaller encoding. 1479 // TODO: Which other code models can use this? 1480 switch (TM.getCodeModel()) { 1481 default: break; 1482 case CodeModel::Small: 1483 case CodeModel::Kernel: 1484 if (Subtarget->is64Bit() && 1485 AM.Scale == 1 && 1486 AM.BaseType == X86ISelAddressMode::RegBase && 1487 AM.Base_Reg.getNode() == nullptr && 1488 AM.IndexReg.getNode() == nullptr && 1489 AM.SymbolFlags == X86II::MO_NO_FLAG && 1490 AM.hasSymbolicDisplacement()) 1491 AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64); 1492 break; 1493 } 1494 1495 return false; 1496 } 1497 1498 bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM, 1499 unsigned Depth) { 1500 // Add an artificial use to this node so that we can keep track of 1501 // it if it gets CSE'd with a different node. 1502 HandleSDNode Handle(N); 1503 1504 X86ISelAddressMode Backup = AM; 1505 if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) && 1506 !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1)) 1507 return false; 1508 AM = Backup; 1509 1510 // Try again after commuting the operands. 
1511 if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1) && 1512 !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth+1)) 1513 return false; 1514 AM = Backup; 1515 1516 // If we couldn't fold both operands into the address at the same time, 1517 // see if we can just put each operand into a register and fold at least 1518 // the add. 1519 if (AM.BaseType == X86ISelAddressMode::RegBase && 1520 !AM.Base_Reg.getNode() && 1521 !AM.IndexReg.getNode()) { 1522 N = Handle.getValue(); 1523 AM.Base_Reg = N.getOperand(0); 1524 AM.IndexReg = N.getOperand(1); 1525 AM.Scale = 1; 1526 return false; 1527 } 1528 N = Handle.getValue(); 1529 return true; 1530 } 1531 1532 // Insert a node into the DAG at least before the Pos node's position. This 1533 // will reposition the node as needed, and will assign it a node ID that is <= 1534 // the Pos node's ID. Note that this does *not* preserve the uniqueness of node 1535 // IDs! The selection DAG must no longer depend on their uniqueness when this 1536 // is used. 1537 static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) { 1538 if (N->getNodeId() == -1 || 1539 (SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) > 1540 SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode()))) { 1541 DAG.RepositionNode(Pos->getIterator(), N.getNode()); 1542 // Mark Node as invalid for pruning as after this it may be a successor to a 1543 // selected node but otherwise be in the same position of Pos. 1544 // Conservatively mark it with the same -abs(Id) to assure node id 1545 // invariant is preserved. 1546 N->setNodeId(Pos->getNodeId()); 1547 SelectionDAGISel::InvalidateNodeId(N.getNode()); 1548 } 1549 } 1550 1551 // Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if 1552 // safe. This allows us to convert the shift and and into an h-register 1553 // extract and a scaled index. Returns false if the simplification is 1554 // performed. 1555 static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N, 1556 uint64_t Mask, 1557 SDValue Shift, SDValue X, 1558 X86ISelAddressMode &AM) { 1559 if (Shift.getOpcode() != ISD::SRL || 1560 !isa<ConstantSDNode>(Shift.getOperand(1)) || 1561 !Shift.hasOneUse()) 1562 return true; 1563 1564 int ScaleLog = 8 - Shift.getConstantOperandVal(1); 1565 if (ScaleLog <= 0 || ScaleLog >= 4 || 1566 Mask != (0xffu << ScaleLog)) 1567 return true; 1568 1569 MVT VT = N.getSimpleValueType(); 1570 SDLoc DL(N); 1571 SDValue Eight = DAG.getConstant(8, DL, MVT::i8); 1572 SDValue NewMask = DAG.getConstant(0xff, DL, VT); 1573 SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, X, Eight); 1574 SDValue And = DAG.getNode(ISD::AND, DL, VT, Srl, NewMask); 1575 SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8); 1576 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And, ShlCount); 1577 1578 // Insert the new nodes into the topological ordering. We must do this in 1579 // a valid topological ordering as nothing is going to go back and re-sort 1580 // these nodes. We continually insert before 'N' in sequence as this is 1581 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no 1582 // hierarchy left to express. 
1583 insertDAGNode(DAG, N, Eight); 1584 insertDAGNode(DAG, N, Srl); 1585 insertDAGNode(DAG, N, NewMask); 1586 insertDAGNode(DAG, N, And); 1587 insertDAGNode(DAG, N, ShlCount); 1588 insertDAGNode(DAG, N, Shl); 1589 DAG.ReplaceAllUsesWith(N, Shl); 1590 DAG.RemoveDeadNode(N.getNode()); 1591 AM.IndexReg = And; 1592 AM.Scale = (1 << ScaleLog); 1593 return false; 1594 } 1595 1596 // Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this 1597 // allows us to fold the shift into this addressing mode. Returns false if the 1598 // transform succeeded. 1599 static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N, 1600 X86ISelAddressMode &AM) { 1601 SDValue Shift = N.getOperand(0); 1602 1603 // Use a signed mask so that shifting right will insert sign bits. These 1604 // bits will be removed when we shift the result left so it doesn't matter 1605 // what we use. This might allow a smaller immediate encoding. 1606 int64_t Mask = cast<ConstantSDNode>(N->getOperand(1))->getSExtValue(); 1607 1608 // If we have an any_extend feeding the AND, look through it to see if there 1609 // is a shift behind it. But only if the AND doesn't use the extended bits. 1610 // FIXME: Generalize this to other ANY_EXTEND than i32 to i64? 1611 bool FoundAnyExtend = false; 1612 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() && 1613 Shift.getOperand(0).getSimpleValueType() == MVT::i32 && 1614 isUInt<32>(Mask)) { 1615 FoundAnyExtend = true; 1616 Shift = Shift.getOperand(0); 1617 } 1618 1619 if (Shift.getOpcode() != ISD::SHL || 1620 !isa<ConstantSDNode>(Shift.getOperand(1))) 1621 return true; 1622 1623 SDValue X = Shift.getOperand(0); 1624 1625 // Not likely to be profitable if either the AND or SHIFT node has more 1626 // than one use (unless all uses are for address computation). Besides, 1627 // isel mechanism requires their node ids to be reused. 1628 if (!N.hasOneUse() || !Shift.hasOneUse()) 1629 return true; 1630 1631 // Verify that the shift amount is something we can fold. 1632 unsigned ShiftAmt = Shift.getConstantOperandVal(1); 1633 if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3) 1634 return true; 1635 1636 MVT VT = N.getSimpleValueType(); 1637 SDLoc DL(N); 1638 if (FoundAnyExtend) { 1639 SDValue NewX = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X); 1640 insertDAGNode(DAG, N, NewX); 1641 X = NewX; 1642 } 1643 1644 SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, DL, VT); 1645 SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask); 1646 SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1)); 1647 1648 // Insert the new nodes into the topological ordering. We must do this in 1649 // a valid topological ordering as nothing is going to go back and re-sort 1650 // these nodes. We continually insert before 'N' in sequence as this is 1651 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no 1652 // hierarchy left to express. 1653 insertDAGNode(DAG, N, NewMask); 1654 insertDAGNode(DAG, N, NewAnd); 1655 insertDAGNode(DAG, N, NewShift); 1656 DAG.ReplaceAllUsesWith(N, NewShift); 1657 DAG.RemoveDeadNode(N.getNode()); 1658 1659 AM.Scale = 1 << ShiftAmt; 1660 AM.IndexReg = NewAnd; 1661 return false; 1662 } 1663 1664 // Implement some heroics to detect shifts of masked values where the mask can 1665 // be replaced by extending the shift and undoing that in the addressing mode 1666 // scale. 
Patterns such as (shl (srl x, c1), c2) are canonicalized into (and 1667 // (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in 1668 // the addressing mode. This results in code such as: 1669 // 1670 // int f(short *y, int *lookup_table) { 1671 // ... 1672 // return *y + lookup_table[*y >> 11]; 1673 // } 1674 // 1675 // Turning into: 1676 // movzwl (%rdi), %eax 1677 // movl %eax, %ecx 1678 // shrl $11, %ecx 1679 // addl (%rsi,%rcx,4), %eax 1680 // 1681 // Instead of: 1682 // movzwl (%rdi), %eax 1683 // movl %eax, %ecx 1684 // shrl $9, %ecx 1685 // andl $124, %rcx 1686 // addl (%rsi,%rcx), %eax 1687 // 1688 // Note that this function assumes the mask is provided as a mask *after* the 1689 // value is shifted. The input chain may or may not match that, but computing 1690 // such a mask is trivial. 1691 static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, 1692 uint64_t Mask, 1693 SDValue Shift, SDValue X, 1694 X86ISelAddressMode &AM) { 1695 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() || 1696 !isa<ConstantSDNode>(Shift.getOperand(1))) 1697 return true; 1698 1699 unsigned ShiftAmt = Shift.getConstantOperandVal(1); 1700 unsigned MaskLZ = countLeadingZeros(Mask); 1701 unsigned MaskTZ = countTrailingZeros(Mask); 1702 1703 // The amount of shift we're trying to fit into the addressing mode is taken 1704 // from the trailing zeros of the mask. 1705 unsigned AMShiftAmt = MaskTZ; 1706 1707 // There is nothing we can do here unless the mask is removing some bits. 1708 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits. 1709 if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true; 1710 1711 // We also need to ensure that mask is a continuous run of bits. 1712 if (countTrailingOnes(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true; 1713 1714 // Scale the leading zero count down based on the actual size of the value. 1715 // Also scale it down based on the size of the shift. 1716 unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt; 1717 if (MaskLZ < ScaleDown) 1718 return true; 1719 MaskLZ -= ScaleDown; 1720 1721 // The final check is to ensure that any masked out high bits of X are 1722 // already known to be zero. Otherwise, the mask has a semantic impact 1723 // other than masking out a couple of low bits. Unfortunately, because of 1724 // the mask, zero extensions will be removed from operands in some cases. 1725 // This code works extra hard to look through extensions because we can 1726 // replace them with zero extensions cheaply if necessary. 1727 bool ReplacingAnyExtend = false; 1728 if (X.getOpcode() == ISD::ANY_EXTEND) { 1729 unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() - 1730 X.getOperand(0).getSimpleValueType().getSizeInBits(); 1731 // Assume that we'll replace the any-extend with a zero-extend, and 1732 // narrow the search to the extended value. 1733 X = X.getOperand(0); 1734 MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits; 1735 ReplacingAnyExtend = true; 1736 } 1737 APInt MaskedHighBits = 1738 APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ); 1739 KnownBits Known = DAG.computeKnownBits(X); 1740 if (MaskedHighBits != Known.Zero) return true; 1741 1742 // We've identified a pattern that can be transformed into a single shift 1743 // and an addressing mode. Make it so. 1744 MVT VT = N.getSimpleValueType(); 1745 if (ReplacingAnyExtend) { 1746 assert(X.getValueType() != VT); 1747 // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND. 
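    // Refining the any_extend to a zero_extend only assigns values to bits
    // that were previously undefined, so it is always safe; the known-bits
    // check above already ensured the dropped mask has no other effect.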
1748 SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X); 1749 insertDAGNode(DAG, N, NewX); 1750 X = NewX; 1751 } 1752 SDLoc DL(N); 1753 SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8); 1754 SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt); 1755 SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8); 1756 SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewSRL, NewSHLAmt); 1757 1758 // Insert the new nodes into the topological ordering. We must do this in 1759 // a valid topological ordering as nothing is going to go back and re-sort 1760 // these nodes. We continually insert before 'N' in sequence as this is 1761 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no 1762 // hierarchy left to express. 1763 insertDAGNode(DAG, N, NewSRLAmt); 1764 insertDAGNode(DAG, N, NewSRL); 1765 insertDAGNode(DAG, N, NewSHLAmt); 1766 insertDAGNode(DAG, N, NewSHL); 1767 DAG.ReplaceAllUsesWith(N, NewSHL); 1768 DAG.RemoveDeadNode(N.getNode()); 1769 1770 AM.Scale = 1 << AMShiftAmt; 1771 AM.IndexReg = NewSRL; 1772 return false; 1773 } 1774 1775 // Transform "(X >> SHIFT) & (MASK << C1)" to 1776 // "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be 1777 // matched to a BEXTR later. Returns false if the simplification is performed. 1778 static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N, 1779 uint64_t Mask, 1780 SDValue Shift, SDValue X, 1781 X86ISelAddressMode &AM, 1782 const X86Subtarget &Subtarget) { 1783 if (Shift.getOpcode() != ISD::SRL || 1784 !isa<ConstantSDNode>(Shift.getOperand(1)) || 1785 !Shift.hasOneUse() || !N.hasOneUse()) 1786 return true; 1787 1788 // Only do this if BEXTR will be matched by matchBEXTRFromAndImm. 1789 if (!Subtarget.hasTBM() && 1790 !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR())) 1791 return true; 1792 1793 // We need to ensure that mask is a continuous run of bits. 1794 if (!isShiftedMask_64(Mask)) return true; 1795 1796 unsigned ShiftAmt = Shift.getConstantOperandVal(1); 1797 1798 // The amount of shift we're trying to fit into the addressing mode is taken 1799 // from the trailing zeros of the mask. 1800 unsigned AMShiftAmt = countTrailingZeros(Mask); 1801 1802 // There is nothing we can do here unless the mask is removing some bits. 1803 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits. 1804 if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true; 1805 1806 MVT VT = N.getSimpleValueType(); 1807 SDLoc DL(N); 1808 SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8); 1809 SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt); 1810 SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, VT); 1811 SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, NewSRL, NewMask); 1812 SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8); 1813 SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewAnd, NewSHLAmt); 1814 1815 // Insert the new nodes into the topological ordering. We must do this in 1816 // a valid topological ordering as nothing is going to go back and re-sort 1817 // these nodes. We continually insert before 'N' in sequence as this is 1818 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no 1819 // hierarchy left to express. 
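  // E.g. (and (srl %x, 5), 0xFC) becomes (shl (and (srl %x, 7), 0x3F), 2); the
  // new AND is the BEXTR candidate and also serves as the index with scale 4.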
1820 insertDAGNode(DAG, N, NewSRLAmt); 1821 insertDAGNode(DAG, N, NewSRL); 1822 insertDAGNode(DAG, N, NewMask); 1823 insertDAGNode(DAG, N, NewAnd); 1824 insertDAGNode(DAG, N, NewSHLAmt); 1825 insertDAGNode(DAG, N, NewSHL); 1826 DAG.ReplaceAllUsesWith(N, NewSHL); 1827 DAG.RemoveDeadNode(N.getNode()); 1828 1829 AM.Scale = 1 << AMShiftAmt; 1830 AM.IndexReg = NewAnd; 1831 return false; 1832 } 1833 1834 bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, 1835 unsigned Depth) { 1836 SDLoc dl(N); 1837 LLVM_DEBUG({ 1838 dbgs() << "MatchAddress: "; 1839 AM.dump(CurDAG); 1840 }); 1841 // Limit recursion. 1842 if (Depth > 5) 1843 return matchAddressBase(N, AM); 1844 1845 // If this is already a %rip relative address, we can only merge immediates 1846 // into it. Instead of handling this in every case, we handle it here. 1847 // RIP relative addressing: %rip + 32-bit displacement! 1848 if (AM.isRIPRelative()) { 1849 // FIXME: JumpTable and ExternalSymbol address currently don't like 1850 // displacements. It isn't very important, but this should be fixed for 1851 // consistency. 1852 if (!(AM.ES || AM.MCSym) && AM.JT != -1) 1853 return true; 1854 1855 if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N)) 1856 if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM)) 1857 return false; 1858 return true; 1859 } 1860 1861 switch (N.getOpcode()) { 1862 default: break; 1863 case ISD::LOCAL_RECOVER: { 1864 if (!AM.hasSymbolicDisplacement() && AM.Disp == 0) 1865 if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) { 1866 // Use the symbol and don't prefix it. 1867 AM.MCSym = ESNode->getMCSymbol(); 1868 return false; 1869 } 1870 break; 1871 } 1872 case ISD::Constant: { 1873 uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue(); 1874 if (!foldOffsetIntoAddress(Val, AM)) 1875 return false; 1876 break; 1877 } 1878 1879 case X86ISD::Wrapper: 1880 case X86ISD::WrapperRIP: 1881 if (!matchWrapper(N, AM)) 1882 return false; 1883 break; 1884 1885 case ISD::LOAD: 1886 if (!matchLoadInAddress(cast<LoadSDNode>(N), AM)) 1887 return false; 1888 break; 1889 1890 case ISD::FrameIndex: 1891 if (AM.BaseType == X86ISelAddressMode::RegBase && 1892 AM.Base_Reg.getNode() == nullptr && 1893 (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) { 1894 AM.BaseType = X86ISelAddressMode::FrameIndexBase; 1895 AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex(); 1896 return false; 1897 } 1898 break; 1899 1900 case ISD::SHL: 1901 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) 1902 break; 1903 1904 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) { 1905 unsigned Val = CN->getZExtValue(); 1906 // Note that we handle x<<1 as (,x,2) rather than (x,x) here so 1907 // that the base operand remains free for further matching. If 1908 // the base doesn't end up getting used, a post-processing step 1909 // in MatchAddress turns (,x,2) into (x,x), which is cheaper. 1910 if (Val == 1 || Val == 2 || Val == 3) { 1911 AM.Scale = 1 << Val; 1912 SDValue ShVal = N.getOperand(0); 1913 1914 // Okay, we know that we have a scale by now. However, if the scaled 1915 // value is an add of something and a constant, we can fold the 1916 // constant into the disp field here. 
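        // E.g. (shl (add %x, 12), 2) folds as index = %x, scale = 4, disp = 48.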
1917 if (CurDAG->isBaseWithConstantOffset(ShVal)) { 1918 AM.IndexReg = ShVal.getOperand(0); 1919 ConstantSDNode *AddVal = cast<ConstantSDNode>(ShVal.getOperand(1)); 1920 uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val; 1921 if (!foldOffsetIntoAddress(Disp, AM)) 1922 return false; 1923 } 1924 1925 AM.IndexReg = ShVal; 1926 return false; 1927 } 1928 } 1929 break; 1930 1931 case ISD::SRL: { 1932 // Scale must not be used already. 1933 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break; 1934 1935 // We only handle up to 64-bit values here as those are what matter for 1936 // addressing mode optimizations. 1937 assert(N.getSimpleValueType().getSizeInBits() <= 64 && 1938 "Unexpected value size!"); 1939 1940 SDValue And = N.getOperand(0); 1941 if (And.getOpcode() != ISD::AND) break; 1942 SDValue X = And.getOperand(0); 1943 1944 // The mask used for the transform is expected to be post-shift, but we 1945 // found the shift first so just apply the shift to the mask before passing 1946 // it down. 1947 if (!isa<ConstantSDNode>(N.getOperand(1)) || 1948 !isa<ConstantSDNode>(And.getOperand(1))) 1949 break; 1950 uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1); 1951 1952 // Try to fold the mask and shift into the scale, and return false if we 1953 // succeed. 1954 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM)) 1955 return false; 1956 break; 1957 } 1958 1959 case ISD::SMUL_LOHI: 1960 case ISD::UMUL_LOHI: 1961 // A mul_lohi where we need the low part can be folded as a plain multiply. 1962 if (N.getResNo() != 0) break; 1963 LLVM_FALLTHROUGH; 1964 case ISD::MUL: 1965 case X86ISD::MUL_IMM: 1966 // X*[3,5,9] -> X+X*[2,4,8] 1967 if (AM.BaseType == X86ISelAddressMode::RegBase && 1968 AM.Base_Reg.getNode() == nullptr && 1969 AM.IndexReg.getNode() == nullptr) { 1970 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) 1971 if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 || 1972 CN->getZExtValue() == 9) { 1973 AM.Scale = unsigned(CN->getZExtValue())-1; 1974 1975 SDValue MulVal = N.getOperand(0); 1976 SDValue Reg; 1977 1978 // Okay, we know that we have a scale by now. However, if the scaled 1979 // value is an add of something and a constant, we can fold the 1980 // constant into the disp field here. 1981 if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() && 1982 isa<ConstantSDNode>(MulVal.getOperand(1))) { 1983 Reg = MulVal.getOperand(0); 1984 ConstantSDNode *AddVal = 1985 cast<ConstantSDNode>(MulVal.getOperand(1)); 1986 uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue(); 1987 if (foldOffsetIntoAddress(Disp, AM)) 1988 Reg = N.getOperand(0); 1989 } else { 1990 Reg = N.getOperand(0); 1991 } 1992 1993 AM.IndexReg = AM.Base_Reg = Reg; 1994 return false; 1995 } 1996 } 1997 break; 1998 1999 case ISD::SUB: { 2000 // Given A-B, if A can be completely folded into the address and 2001 // the index field with the index field unused, use -B as the index. 2002 // This is a win if a has multiple parts that can be folded into 2003 // the address. Also, this saves a mov if the base register has 2004 // other uses, since it avoids a two-address sub instruction, however 2005 // it costs an additional mov if the index register has other uses. 2006 2007 // Add an artificial use to this node so that we can keep track of 2008 // it if it gets CSE'd with a different node. 2009 HandleSDNode Handle(N); 2010 2011 // Test if the LHS of the sub can be folded. 
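    // E.g. if A is (add (Wrapper GV), %reg), the global and %reg can fill the
    // displacement and base, leaving the index field free to hold -B.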
2012 X86ISelAddressMode Backup = AM; 2013 if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) { 2014 N = Handle.getValue(); 2015 AM = Backup; 2016 break; 2017 } 2018 N = Handle.getValue(); 2019 // Test if the index field is free for use. 2020 if (AM.IndexReg.getNode() || AM.isRIPRelative()) { 2021 AM = Backup; 2022 break; 2023 } 2024 2025 int Cost = 0; 2026 SDValue RHS = N.getOperand(1); 2027 // If the RHS involves a register with multiple uses, this 2028 // transformation incurs an extra mov, due to the neg instruction 2029 // clobbering its operand. 2030 if (!RHS.getNode()->hasOneUse() || 2031 RHS.getNode()->getOpcode() == ISD::CopyFromReg || 2032 RHS.getNode()->getOpcode() == ISD::TRUNCATE || 2033 RHS.getNode()->getOpcode() == ISD::ANY_EXTEND || 2034 (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND && 2035 RHS.getOperand(0).getValueType() == MVT::i32)) 2036 ++Cost; 2037 // If the base is a register with multiple uses, this 2038 // transformation may save a mov. 2039 if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() && 2040 !AM.Base_Reg.getNode()->hasOneUse()) || 2041 AM.BaseType == X86ISelAddressMode::FrameIndexBase) 2042 --Cost; 2043 // If the folded LHS was interesting, this transformation saves 2044 // address arithmetic. 2045 if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) + 2046 ((AM.Disp != 0) && (Backup.Disp == 0)) + 2047 (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2) 2048 --Cost; 2049 // If it doesn't look like it may be an overall win, don't do it. 2050 if (Cost >= 0) { 2051 AM = Backup; 2052 break; 2053 } 2054 2055 // Ok, the transformation is legal and appears profitable. Go for it. 2056 // Negation will be emitted later to avoid creating dangling nodes if this 2057 // was an unprofitable LEA. 2058 AM.IndexReg = RHS; 2059 AM.NegateIndex = true; 2060 AM.Scale = 1; 2061 return false; 2062 } 2063 2064 case ISD::ADD: 2065 if (!matchAdd(N, AM, Depth)) 2066 return false; 2067 break; 2068 2069 case ISD::OR: 2070 // We want to look through a transform in InstCombine and DAGCombiner that 2071 // turns 'add' into 'or', so we can treat this 'or' exactly like an 'add'. 2072 // Example: (or (and x, 1), (shl y, 3)) --> (add (and x, 1), (shl y, 3)) 2073 // An 'lea' can then be used to match the shift (multiply) and add: 2074 // and $1, %esi 2075 // lea (%rsi, %rdi, 8), %rax 2076 if (CurDAG->haveNoCommonBitsSet(N.getOperand(0), N.getOperand(1)) && 2077 !matchAdd(N, AM, Depth)) 2078 return false; 2079 break; 2080 2081 case ISD::AND: { 2082 // Perform some heroic transforms on an and of a constant-count shift 2083 // with a constant to enable use of the scaled offset field. 2084 2085 // Scale must not be used already. 2086 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break; 2087 2088 // We only handle up to 64-bit values here as those are what matter for 2089 // addressing mode optimizations. 2090 assert(N.getSimpleValueType().getSizeInBits() <= 64 && 2091 "Unexpected value size!"); 2092 2093 if (!isa<ConstantSDNode>(N.getOperand(1))) 2094 break; 2095 2096 if (N.getOperand(0).getOpcode() == ISD::SRL) { 2097 SDValue Shift = N.getOperand(0); 2098 SDValue X = Shift.getOperand(0); 2099 2100 uint64_t Mask = N.getConstantOperandVal(1); 2101 2102 // Try to fold the mask and shift into an extract and scale. 2103 if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM)) 2104 return false; 2105 2106 // Try to fold the mask and shift directly into the scale. 
2107 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM)) 2108 return false; 2109 2110 // Try to fold the mask and shift into BEXTR and scale. 2111 if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget)) 2112 return false; 2113 } 2114 2115 // Try to swap the mask and shift to place shifts which can be done as 2116 // a scale on the outside of the mask. 2117 if (!foldMaskedShiftToScaledMask(*CurDAG, N, AM)) 2118 return false; 2119 2120 break; 2121 } 2122 case ISD::ZERO_EXTEND: { 2123 // Try to widen a zexted shift left to the same size as its use, so we can 2124 // match the shift as a scale factor. 2125 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) 2126 break; 2127 if (N.getOperand(0).getOpcode() != ISD::SHL || !N.getOperand(0).hasOneUse()) 2128 break; 2129 2130 // Give up if the shift is not a valid scale factor [1,2,3]. 2131 SDValue Shl = N.getOperand(0); 2132 auto *ShAmtC = dyn_cast<ConstantSDNode>(Shl.getOperand(1)); 2133 if (!ShAmtC || ShAmtC->getZExtValue() > 3) 2134 break; 2135 2136 // The narrow shift must only shift out zero bits (it must be 'nuw'). 2137 // That makes it safe to widen to the destination type. 2138 APInt HighZeros = APInt::getHighBitsSet(Shl.getValueSizeInBits(), 2139 ShAmtC->getZExtValue()); 2140 if (!CurDAG->MaskedValueIsZero(Shl.getOperand(0), HighZeros)) 2141 break; 2142 2143 // zext (shl nuw i8 %x, C) to i32 --> shl (zext i8 %x to i32), (zext C) 2144 MVT VT = N.getSimpleValueType(); 2145 SDLoc DL(N); 2146 SDValue Zext = CurDAG->getNode(ISD::ZERO_EXTEND, DL, VT, Shl.getOperand(0)); 2147 SDValue NewShl = CurDAG->getNode(ISD::SHL, DL, VT, Zext, Shl.getOperand(1)); 2148 2149 // Convert the shift to scale factor. 2150 AM.Scale = 1 << ShAmtC->getZExtValue(); 2151 AM.IndexReg = Zext; 2152 2153 insertDAGNode(*CurDAG, N, Zext); 2154 insertDAGNode(*CurDAG, N, NewShl); 2155 CurDAG->ReplaceAllUsesWith(N, NewShl); 2156 CurDAG->RemoveDeadNode(N.getNode()); 2157 return false; 2158 } 2159 } 2160 2161 return matchAddressBase(N, AM); 2162 } 2163 2164 /// Helper for MatchAddress. Add the specified node to the 2165 /// specified addressing mode without any further recursion. 2166 bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) { 2167 // Is the base register already occupied? 2168 if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) { 2169 // If so, check to see if the scale index register is set. 2170 if (!AM.IndexReg.getNode()) { 2171 AM.IndexReg = N; 2172 AM.Scale = 1; 2173 return false; 2174 } 2175 2176 // Otherwise, we cannot select it. 2177 return true; 2178 } 2179 2180 // Default, generate it as a register. 2181 AM.BaseType = X86ISelAddressMode::RegBase; 2182 AM.Base_Reg = N; 2183 return false; 2184 } 2185 2186 /// Helper for selectVectorAddr. Handles things that can be folded into a 2187 /// gather scatter address. The index register and scale should have already 2188 /// been handled. 2189 bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) { 2190 // TODO: Support other operations. 
2191 switch (N.getOpcode()) { 2192 case ISD::Constant: { 2193 uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue(); 2194 if (!foldOffsetIntoAddress(Val, AM)) 2195 return false; 2196 break; 2197 } 2198 case X86ISD::Wrapper: 2199 if (!matchWrapper(N, AM)) 2200 return false; 2201 break; 2202 } 2203 2204 return matchAddressBase(N, AM); 2205 } 2206 2207 bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, 2208 SDValue &Scale, SDValue &Index, 2209 SDValue &Disp, SDValue &Segment) { 2210 X86ISelAddressMode AM; 2211 auto *Mgs = cast<X86MaskedGatherScatterSDNode>(Parent); 2212 AM.IndexReg = Mgs->getIndex(); 2213 AM.Scale = cast<ConstantSDNode>(Mgs->getScale())->getZExtValue(); 2214 2215 unsigned AddrSpace = cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace(); 2216 // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS. 2217 if (AddrSpace == 256) 2218 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16); 2219 if (AddrSpace == 257) 2220 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16); 2221 if (AddrSpace == 258) 2222 AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16); 2223 2224 SDLoc DL(N); 2225 MVT VT = N.getSimpleValueType(); 2226 2227 // Try to match into the base and displacement fields. 2228 if (matchVectorAddress(N, AM)) 2229 return false; 2230 2231 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment); 2232 return true; 2233 } 2234 2235 /// Returns true if it is able to pattern match an addressing mode. 2236 /// It returns the operands which make up the maximal addressing mode it can 2237 /// match by reference. 2238 /// 2239 /// Parent is the parent node of the addr operand that is being matched. It 2240 /// is always a load, store, atomic node, or null. It is only null when 2241 /// checking memory operands for inline asm nodes. 2242 bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base, 2243 SDValue &Scale, SDValue &Index, 2244 SDValue &Disp, SDValue &Segment) { 2245 X86ISelAddressMode AM; 2246 2247 if (Parent && 2248 // This list of opcodes are all the nodes that have an "addr:$ptr" operand 2249 // that are not a MemSDNode, and thus don't have proper addrspace info. 2250 Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme 2251 Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores 2252 Parent->getOpcode() != X86ISD::TLSCALL && // Fixme 2253 Parent->getOpcode() != X86ISD::ENQCMD && // Fixme 2254 Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme 2255 Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp 2256 Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp 2257 unsigned AddrSpace = 2258 cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace(); 2259 // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS. 2260 if (AddrSpace == 256) 2261 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16); 2262 if (AddrSpace == 257) 2263 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16); 2264 if (AddrSpace == 258) 2265 AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16); 2266 } 2267 2268 // Save the DL and VT before calling matchAddress, it can invalidate N. 2269 SDLoc DL(N); 2270 MVT VT = N.getSimpleValueType(); 2271 2272 if (matchAddress(N, AM)) 2273 return false; 2274 2275 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment); 2276 return true; 2277 } 2278 2279 // We can only fold a load if all nodes between it and the root node have a 2280 // single use. If there are additional uses, we could end up duplicating the 2281 // load. 
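// For example, if an intermediate node between the load and the root is also
// consumed outside the matched pattern, folding the load would require
// duplicating that intermediate computation, and with it the load itself.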
2282 static bool hasSingleUsesFromRoot(SDNode *Root, SDNode *User) { 2283 while (User != Root) { 2284 if (!User->hasOneUse()) 2285 return false; 2286 User = *User->use_begin(); 2287 } 2288 2289 return true; 2290 } 2291 2292 /// Match a scalar SSE load. In particular, we want to match a load whose top 2293 /// elements are either undef or zeros. The load flavor is derived from the 2294 /// type of N, which is either v4f32 or v2f64. 2295 /// 2296 /// We also return: 2297 /// PatternChainNode: this is the matched node that has a chain input and 2298 /// output. 2299 bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, SDNode *Parent, 2300 SDValue N, SDValue &Base, 2301 SDValue &Scale, SDValue &Index, 2302 SDValue &Disp, SDValue &Segment, 2303 SDValue &PatternNodeWithChain) { 2304 if (!hasSingleUsesFromRoot(Root, Parent)) 2305 return false; 2306 2307 // We can allow a full vector load here since narrowing a load is ok unless 2308 // it's volatile. 2309 if (ISD::isNON_EXTLoad(N.getNode())) { 2310 LoadSDNode *LD = cast<LoadSDNode>(N); 2311 if (!LD->isVolatile() && 2312 IsProfitableToFold(N, LD, Root) && 2313 IsLegalToFold(N, Parent, Root, OptLevel)) { 2314 PatternNodeWithChain = N; 2315 return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, 2316 Segment); 2317 } 2318 } 2319 2320 // We can also match the special zero extended load opcode. 2321 if (N.getOpcode() == X86ISD::VZEXT_LOAD) { 2322 PatternNodeWithChain = N; 2323 if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) && 2324 IsLegalToFold(PatternNodeWithChain, Parent, Root, OptLevel)) { 2325 auto *MI = cast<MemIntrinsicSDNode>(PatternNodeWithChain); 2326 return selectAddr(MI, MI->getBasePtr(), Base, Scale, Index, Disp, 2327 Segment); 2328 } 2329 } 2330 2331 // Need to make sure that the SCALAR_TO_VECTOR and load are both only used 2332 // once. Otherwise the load might get duplicated and the chain output of the 2333 // duplicate load will not be observed by all dependencies. 2334 if (N.getOpcode() == ISD::SCALAR_TO_VECTOR && N.getNode()->hasOneUse()) { 2335 PatternNodeWithChain = N.getOperand(0); 2336 if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) && 2337 IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) && 2338 IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) { 2339 LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain); 2340 return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, 2341 Segment); 2342 } 2343 } 2344 2345 return false; 2346 } 2347 2348 2349 bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) { 2350 if (const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) { 2351 uint64_t ImmVal = CN->getZExtValue(); 2352 if (!isUInt<32>(ImmVal)) 2353 return false; 2354 2355 Imm = CurDAG->getTargetConstant(ImmVal, SDLoc(N), MVT::i64); 2356 return true; 2357 } 2358 2359 // In static codegen with small code model, we can get the address of a label 2360 // into a register with 'movl' 2361 if (N->getOpcode() != X86ISD::Wrapper) 2362 return false; 2363 2364 N = N.getOperand(0); 2365 2366 // At least GNU as does not accept 'movl' for TPOFF relocations. 2367 // FIXME: We could use 'movl' when we know we are targeting MC. 
2368 if (N->getOpcode() == ISD::TargetGlobalTLSAddress) 2369 return false; 2370 2371 Imm = N; 2372 if (N->getOpcode() != ISD::TargetGlobalAddress) 2373 return TM.getCodeModel() == CodeModel::Small; 2374 2375 Optional<ConstantRange> CR = 2376 cast<GlobalAddressSDNode>(N)->getGlobal()->getAbsoluteSymbolRange(); 2377 if (!CR) 2378 return TM.getCodeModel() == CodeModel::Small; 2379 2380 return CR->getUnsignedMax().ult(1ull << 32); 2381 } 2382 2383 bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base, 2384 SDValue &Scale, SDValue &Index, 2385 SDValue &Disp, SDValue &Segment) { 2386 // Save the debug loc before calling selectLEAAddr, in case it invalidates N. 2387 SDLoc DL(N); 2388 2389 if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment)) 2390 return false; 2391 2392 RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Base); 2393 if (RN && RN->getReg() == 0) 2394 Base = CurDAG->getRegister(0, MVT::i64); 2395 else if (Base.getValueType() == MVT::i32 && !isa<FrameIndexSDNode>(Base)) { 2396 // Base could already be %rip, particularly in the x32 ABI. 2397 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL, 2398 MVT::i64), 0); 2399 Base = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef, 2400 Base); 2401 } 2402 2403 RN = dyn_cast<RegisterSDNode>(Index); 2404 if (RN && RN->getReg() == 0) 2405 Index = CurDAG->getRegister(0, MVT::i64); 2406 else { 2407 assert(Index.getValueType() == MVT::i32 && 2408 "Expect to be extending 32-bit registers for use in LEA"); 2409 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL, 2410 MVT::i64), 0); 2411 Index = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef, 2412 Index); 2413 } 2414 2415 return true; 2416 } 2417 2418 /// Calls SelectAddr and determines if the maximal addressing 2419 /// mode it matches can be cost effectively emitted as an LEA instruction. 2420 bool X86DAGToDAGISel::selectLEAAddr(SDValue N, 2421 SDValue &Base, SDValue &Scale, 2422 SDValue &Index, SDValue &Disp, 2423 SDValue &Segment) { 2424 X86ISelAddressMode AM; 2425 2426 // Save the DL and VT before calling matchAddress, it can invalidate N. 2427 SDLoc DL(N); 2428 MVT VT = N.getSimpleValueType(); 2429 2430 // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support 2431 // segments. 2432 SDValue Copy = AM.Segment; 2433 SDValue T = CurDAG->getRegister(0, MVT::i32); 2434 AM.Segment = T; 2435 if (matchAddress(N, AM)) 2436 return false; 2437 assert (T == AM.Segment); 2438 AM.Segment = Copy; 2439 2440 unsigned Complexity = 0; 2441 if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode()) 2442 Complexity = 1; 2443 else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase) 2444 Complexity = 4; 2445 2446 if (AM.IndexReg.getNode()) 2447 Complexity++; 2448 2449 // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with 2450 // a simple shift. 2451 if (AM.Scale > 1) 2452 Complexity++; 2453 2454 // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA 2455 // to a LEA. This is determined with some experimentation but is by no means 2456 // optimal (especially for code size consideration). LEA is nice because of 2457 // its three-address nature. Tweak the cost function again when we can run 2458 // convertToThreeAddress() at register allocation time. 2459 if (AM.hasSymbolicDisplacement()) { 2460 // For X86-64, always use LEA to materialize RIP-relative addresses. 
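    // Materializing a RIP-relative address needs RIP-based addressing, which
    // LEA can encode but an ADD-with-immediate cannot.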
2461 if (Subtarget->is64Bit()) 2462 Complexity = 4; 2463 else 2464 Complexity += 2; 2465 } 2466 2467 // Heuristic: try harder to form an LEA from ADD if the operands set flags. 2468 // Unlike ADD, LEA does not affect flags, so we will be less likely to require 2469 // duplicating flag-producing instructions later in the pipeline. 2470 if (N.getOpcode() == ISD::ADD) { 2471 auto isMathWithFlags = [](SDValue V) { 2472 switch (V.getOpcode()) { 2473 case X86ISD::ADD: 2474 case X86ISD::SUB: 2475 case X86ISD::ADC: 2476 case X86ISD::SBB: 2477 /* TODO: These opcodes can be added safely, but we may want to justify 2478 their inclusion for different reasons (better for reg-alloc). 2479 case X86ISD::SMUL: 2480 case X86ISD::UMUL: 2481 case X86ISD::OR: 2482 case X86ISD::XOR: 2483 case X86ISD::AND: 2484 */ 2485 // Value 1 is the flag output of the node - verify it's not dead. 2486 return !SDValue(V.getNode(), 1).use_empty(); 2487 default: 2488 return false; 2489 } 2490 }; 2491 // TODO: This could be an 'or' rather than 'and' to make the transform more 2492 // likely to happen. We might want to factor in whether there's a 2493 // load folding opportunity for the math op that disappears with LEA. 2494 if (isMathWithFlags(N.getOperand(0)) && isMathWithFlags(N.getOperand(1))) 2495 Complexity++; 2496 } 2497 2498 if (AM.Disp) 2499 Complexity++; 2500 2501 // If it isn't worth using an LEA, reject it. 2502 if (Complexity <= 2) 2503 return false; 2504 2505 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment); 2506 return true; 2507 } 2508 2509 /// This is only run on TargetGlobalTLSAddress nodes. 2510 bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base, 2511 SDValue &Scale, SDValue &Index, 2512 SDValue &Disp, SDValue &Segment) { 2513 assert(N.getOpcode() == ISD::TargetGlobalTLSAddress); 2514 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N); 2515 2516 X86ISelAddressMode AM; 2517 AM.GV = GA->getGlobal(); 2518 AM.Disp += GA->getOffset(); 2519 AM.SymbolFlags = GA->getTargetFlags(); 2520 2521 MVT VT = N.getSimpleValueType(); 2522 if (VT == MVT::i32) { 2523 AM.Scale = 1; 2524 AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32); 2525 } 2526 2527 getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment); 2528 return true; 2529 } 2530 2531 bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) { 2532 if (auto *CN = dyn_cast<ConstantSDNode>(N)) { 2533 Op = CurDAG->getTargetConstant(CN->getAPIntValue(), SDLoc(CN), 2534 N.getValueType()); 2535 return true; 2536 } 2537 2538 // Keep track of the original value type and whether this value was 2539 // truncated. If we see a truncation from pointer type to VT that truncates 2540 // bits that are known to be zero, we can use a narrow reference. 2541 EVT VT = N.getValueType(); 2542 bool WasTruncated = false; 2543 if (N.getOpcode() == ISD::TRUNCATE) { 2544 WasTruncated = true; 2545 N = N.getOperand(0); 2546 } 2547 2548 if (N.getOpcode() != X86ISD::Wrapper) 2549 return false; 2550 2551 // We can only use non-GlobalValues as immediates if they were not truncated, 2552 // as we do not have any range information. If we have a GlobalValue and the 2553 // address was not truncated, we can select it as an operand directly. 2554 unsigned Opc = N.getOperand(0)->getOpcode(); 2555 if (Opc != ISD::TargetGlobalAddress || !WasTruncated) { 2556 Op = N.getOperand(0); 2557 // We can only select the operand directly if we didn't have to look past a 2558 // truncate. 
2559 return !WasTruncated; 2560 } 2561 2562 // Check that the global's range fits into VT. 2563 auto *GA = cast<GlobalAddressSDNode>(N.getOperand(0)); 2564 Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange(); 2565 if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits())) 2566 return false; 2567 2568 // Okay, we can use a narrow reference. 2569 Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT, 2570 GA->getOffset(), GA->getTargetFlags()); 2571 return true; 2572 } 2573 2574 bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N, 2575 SDValue &Base, SDValue &Scale, 2576 SDValue &Index, SDValue &Disp, 2577 SDValue &Segment) { 2578 if (!ISD::isNON_EXTLoad(N.getNode()) || 2579 !IsProfitableToFold(N, P, Root) || 2580 !IsLegalToFold(N, P, Root, OptLevel)) 2581 return false; 2582 2583 return selectAddr(N.getNode(), 2584 N.getOperand(1), Base, Scale, Index, Disp, Segment); 2585 } 2586 2587 /// Return an SDNode that returns the value of the global base register. 2588 /// Output instructions required to initialize the global base register, 2589 /// if necessary. 2590 SDNode *X86DAGToDAGISel::getGlobalBaseReg() { 2591 unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF); 2592 auto &DL = MF->getDataLayout(); 2593 return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode(); 2594 } 2595 2596 bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const { 2597 if (N->getOpcode() == ISD::TRUNCATE) 2598 N = N->getOperand(0).getNode(); 2599 if (N->getOpcode() != X86ISD::Wrapper) 2600 return false; 2601 2602 auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0)); 2603 if (!GA) 2604 return false; 2605 2606 Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange(); 2607 return CR && CR->getSignedMin().sge(-1ull << Width) && 2608 CR->getSignedMax().slt(1ull << Width); 2609 } 2610 2611 static X86::CondCode getCondFromNode(SDNode *N) { 2612 assert(N->isMachineOpcode() && "Unexpected node"); 2613 X86::CondCode CC = X86::COND_INVALID; 2614 unsigned Opc = N->getMachineOpcode(); 2615 if (Opc == X86::JCC_1) 2616 CC = static_cast<X86::CondCode>(N->getConstantOperandVal(1)); 2617 else if (Opc == X86::SETCCr) 2618 CC = static_cast<X86::CondCode>(N->getConstantOperandVal(0)); 2619 else if (Opc == X86::SETCCm) 2620 CC = static_cast<X86::CondCode>(N->getConstantOperandVal(5)); 2621 else if (Opc == X86::CMOV16rr || Opc == X86::CMOV32rr || 2622 Opc == X86::CMOV64rr) 2623 CC = static_cast<X86::CondCode>(N->getConstantOperandVal(2)); 2624 else if (Opc == X86::CMOV16rm || Opc == X86::CMOV32rm || 2625 Opc == X86::CMOV64rm) 2626 CC = static_cast<X86::CondCode>(N->getConstantOperandVal(6)); 2627 2628 return CC; 2629 } 2630 2631 /// Test whether the given X86ISD::CMP node has any users that use a flag 2632 /// other than ZF. 2633 bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const { 2634 // Examine each user of the node. 2635 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end(); 2636 UI != UE; ++UI) { 2637 // Only check things that use the flags. 2638 if (UI.getUse().getResNo() != Flags.getResNo()) 2639 continue; 2640 // Only examine CopyToReg uses that copy to EFLAGS. 2641 if (UI->getOpcode() != ISD::CopyToReg || 2642 cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS) 2643 return false; 2644 // Examine each user of the CopyToReg use. 
2645 for (SDNode::use_iterator FlagUI = UI->use_begin(), 2646 FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) { 2647 // Only examine the Flag result. 2648 if (FlagUI.getUse().getResNo() != 1) continue; 2649 // Anything unusual: assume conservatively. 2650 if (!FlagUI->isMachineOpcode()) return false; 2651 // Examine the condition code of the user. 2652 X86::CondCode CC = getCondFromNode(*FlagUI); 2653 2654 switch (CC) { 2655 // Comparisons which only use the zero flag. 2656 case X86::COND_E: case X86::COND_NE: 2657 continue; 2658 // Anything else: assume conservatively. 2659 default: 2660 return false; 2661 } 2662 } 2663 } 2664 return true; 2665 } 2666 2667 /// Test whether the given X86ISD::CMP node has any uses which require the SF 2668 /// flag to be accurate. 2669 bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const { 2670 // Examine each user of the node. 2671 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end(); 2672 UI != UE; ++UI) { 2673 // Only check things that use the flags. 2674 if (UI.getUse().getResNo() != Flags.getResNo()) 2675 continue; 2676 // Only examine CopyToReg uses that copy to EFLAGS. 2677 if (UI->getOpcode() != ISD::CopyToReg || 2678 cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS) 2679 return false; 2680 // Examine each user of the CopyToReg use. 2681 for (SDNode::use_iterator FlagUI = UI->use_begin(), 2682 FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) { 2683 // Only examine the Flag result. 2684 if (FlagUI.getUse().getResNo() != 1) continue; 2685 // Anything unusual: assume conservatively. 2686 if (!FlagUI->isMachineOpcode()) return false; 2687 // Examine the condition code of the user. 2688 X86::CondCode CC = getCondFromNode(*FlagUI); 2689 2690 switch (CC) { 2691 // Comparisons which don't examine the SF flag. 2692 case X86::COND_A: case X86::COND_AE: 2693 case X86::COND_B: case X86::COND_BE: 2694 case X86::COND_E: case X86::COND_NE: 2695 case X86::COND_O: case X86::COND_NO: 2696 case X86::COND_P: case X86::COND_NP: 2697 continue; 2698 // Anything else: assume conservatively. 2699 default: 2700 return false; 2701 } 2702 } 2703 } 2704 return true; 2705 } 2706 2707 static bool mayUseCarryFlag(X86::CondCode CC) { 2708 switch (CC) { 2709 // Comparisons which don't examine the CF flag. 2710 case X86::COND_O: case X86::COND_NO: 2711 case X86::COND_E: case X86::COND_NE: 2712 case X86::COND_S: case X86::COND_NS: 2713 case X86::COND_P: case X86::COND_NP: 2714 case X86::COND_L: case X86::COND_GE: 2715 case X86::COND_G: case X86::COND_LE: 2716 return false; 2717 // Anything else: assume conservatively. 2718 default: 2719 return true; 2720 } 2721 } 2722 2723 /// Test whether the given node which sets flags has any uses which require the 2724 /// CF flag to be accurate. 2725 bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const { 2726 // Examine each user of the node. 2727 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end(); 2728 UI != UE; ++UI) { 2729 // Only check things that use the flags. 2730 if (UI.getUse().getResNo() != Flags.getResNo()) 2731 continue; 2732 2733 unsigned UIOpc = UI->getOpcode(); 2734 2735 if (UIOpc == ISD::CopyToReg) { 2736 // Only examine CopyToReg uses that copy to EFLAGS. 2737 if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS) 2738 return false; 2739 // Examine each user of the CopyToReg use. 2740 for (SDNode::use_iterator FlagUI = UI->use_begin(), FlagUE = UI->use_end(); 2741 FlagUI != FlagUE; ++FlagUI) { 2742 // Only examine the Flag result. 
2743 if (FlagUI.getUse().getResNo() != 1) 2744 continue; 2745 // Anything unusual: assume conservatively. 2746 if (!FlagUI->isMachineOpcode()) 2747 return false; 2748 // Examine the condition code of the user. 2749 X86::CondCode CC = getCondFromNode(*FlagUI); 2750 2751 if (mayUseCarryFlag(CC)) 2752 return false; 2753 } 2754 2755 // This CopyToReg is ok. Move on to the next user. 2756 continue; 2757 } 2758 2759 // This might be an unselected node. So look for the pre-isel opcodes that 2760 // use flags. 2761 unsigned CCOpNo; 2762 switch (UIOpc) { 2763 default: 2764 // Something unusual. Be conservative. 2765 return false; 2766 case X86ISD::SETCC: CCOpNo = 0; break; 2767 case X86ISD::SETCC_CARRY: CCOpNo = 0; break; 2768 case X86ISD::CMOV: CCOpNo = 2; break; 2769 case X86ISD::BRCOND: CCOpNo = 2; break; 2770 } 2771 2772 X86::CondCode CC = (X86::CondCode)UI->getConstantOperandVal(CCOpNo); 2773 if (mayUseCarryFlag(CC)) 2774 return false; 2775 } 2776 return true; 2777 } 2778 2779 /// Check whether or not the chain ending in StoreNode is suitable for doing 2780 /// the {load; op; store} to modify transformation. 2781 static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode, 2782 SDValue StoredVal, SelectionDAG *CurDAG, 2783 unsigned LoadOpNo, 2784 LoadSDNode *&LoadNode, 2785 SDValue &InputChain) { 2786 // Is the stored value result 0 of the operation? 2787 if (StoredVal.getResNo() != 0) return false; 2788 2789 // Are there other uses of the operation other than the store? 2790 if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false; 2791 2792 // Is the store non-extending and non-indexed? 2793 if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal()) 2794 return false; 2795 2796 SDValue Load = StoredVal->getOperand(LoadOpNo); 2797 // Is the stored value a non-extending and non-indexed load? 2798 if (!ISD::isNormalLoad(Load.getNode())) return false; 2799 2800 // Return LoadNode by reference. 2801 LoadNode = cast<LoadSDNode>(Load); 2802 2803 // Is store the only read of the loaded value? 2804 if (!Load.hasOneUse()) 2805 return false; 2806 2807 // Is the address of the store the same as the load? 2808 if (LoadNode->getBasePtr() != StoreNode->getBasePtr() || 2809 LoadNode->getOffset() != StoreNode->getOffset()) 2810 return false; 2811 2812 bool FoundLoad = false; 2813 SmallVector<SDValue, 4> ChainOps; 2814 SmallVector<const SDNode *, 4> LoopWorklist; 2815 SmallPtrSet<const SDNode *, 16> Visited; 2816 const unsigned int Max = 1024; 2817 2818 // Visualization of Load-Op-Store fusion: 2819 // ------------------------- 2820 // Legend: 2821 // *-lines = Chain operand dependencies. 2822 // |-lines = Normal operand dependencies. 2823 // Dependencies flow down and right. n-suffix references multiple nodes. 2824 // 2825 // C Xn C 2826 // * * * 2827 // * * * 2828 // Xn A-LD Yn TF Yn 2829 // * * \ | * | 2830 // * * \ | * | 2831 // * * \ | => A--LD_OP_ST 2832 // * * \| \ 2833 // TF OP \ 2834 // * | \ Zn 2835 // * | \ 2836 // A-ST Zn 2837 // 2838 2839 // This merge induced dependences from: #1: Xn -> LD, OP, Zn 2840 // #2: Yn -> LD 2841 // #3: ST -> Zn 2842 2843 // Ensure the transform is safe by checking for the dual 2844 // dependencies to make sure we do not induce a loop. 2845 2846 // As LD is a predecessor to both OP and ST we can do this by checking: 2847 // a). if LD is a predecessor to a member of Xn or Yn. 2848 // b). if a Zn is a predecessor to ST. 
2849 2850 // However, (b) can only occur through being a chain predecessor to 2851 // ST, which is the same as Zn being a member or predecessor of Xn, 2852 // which is a subset of LD being a predecessor of Xn. So it's 2853 // subsumed by check (a). 2854 2855 SDValue Chain = StoreNode->getChain(); 2856 2857 // Gather X elements in ChainOps. 2858 if (Chain == Load.getValue(1)) { 2859 FoundLoad = true; 2860 ChainOps.push_back(Load.getOperand(0)); 2861 } else if (Chain.getOpcode() == ISD::TokenFactor) { 2862 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) { 2863 SDValue Op = Chain.getOperand(i); 2864 if (Op == Load.getValue(1)) { 2865 FoundLoad = true; 2866 // Drop Load, but keep its chain. No cycle check necessary. 2867 ChainOps.push_back(Load.getOperand(0)); 2868 continue; 2869 } 2870 LoopWorklist.push_back(Op.getNode()); 2871 ChainOps.push_back(Op); 2872 } 2873 } 2874 2875 if (!FoundLoad) 2876 return false; 2877 2878 // Worklist is currently Xn. Add Yn to worklist. 2879 for (SDValue Op : StoredVal->ops()) 2880 if (Op.getNode() != LoadNode) 2881 LoopWorklist.push_back(Op.getNode()); 2882 2883 // Check (a) if Load is a predecessor to Xn + Yn 2884 if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max, 2885 true)) 2886 return false; 2887 2888 InputChain = 2889 CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps); 2890 return true; 2891 } 2892 2893 // Change a chain of {load; op; store} of the same value into a simple op 2894 // through memory of that value, if the uses of the modified value and its 2895 // address are suitable. 2896 // 2897 // The tablegen pattern memory operand pattern is currently not able to match 2898 // the case where the EFLAGS on the original operation are used. 2899 // 2900 // To move this to tablegen, we'll need to improve tablegen to allow flags to 2901 // be transferred from a node in the pattern to the result node, probably with 2902 // a new keyword. For example, we have this 2903 // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", 2904 // [(store (add (loadi64 addr:$dst), -1), addr:$dst), 2905 // (implicit EFLAGS)]>; 2906 // but maybe need something like this 2907 // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", 2908 // [(store (add (loadi64 addr:$dst), -1), addr:$dst), 2909 // (transferrable EFLAGS)]>; 2910 // 2911 // Until then, we manually fold these and instruction select the operation 2912 // here. 2913 bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) { 2914 StoreSDNode *StoreNode = cast<StoreSDNode>(Node); 2915 SDValue StoredVal = StoreNode->getOperand(1); 2916 unsigned Opc = StoredVal->getOpcode(); 2917 2918 // Before we try to select anything, make sure this is memory operand size 2919 // and opcode we can handle. Note that this must match the code below that 2920 // actually lowers the opcodes. 2921 EVT MemVT = StoreNode->getMemoryVT(); 2922 if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 && 2923 MemVT != MVT::i8) 2924 return false; 2925 2926 bool IsCommutable = false; 2927 bool IsNegate = false; 2928 switch (Opc) { 2929 default: 2930 return false; 2931 case X86ISD::SUB: 2932 IsNegate = isNullConstant(StoredVal.getOperand(0)); 2933 break; 2934 case X86ISD::SBB: 2935 break; 2936 case X86ISD::ADD: 2937 case X86ISD::ADC: 2938 case X86ISD::AND: 2939 case X86ISD::OR: 2940 case X86ISD::XOR: 2941 IsCommutable = true; 2942 break; 2943 } 2944 2945 unsigned LoadOpNo = IsNegate ? 
1 : 0; 2946 LoadSDNode *LoadNode = nullptr; 2947 SDValue InputChain; 2948 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo, 2949 LoadNode, InputChain)) { 2950 if (!IsCommutable) 2951 return false; 2952 2953 // This operation is commutable, try the other operand. 2954 LoadOpNo = 1; 2955 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo, 2956 LoadNode, InputChain)) 2957 return false; 2958 } 2959 2960 SDValue Base, Scale, Index, Disp, Segment; 2961 if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp, 2962 Segment)) 2963 return false; 2964 2965 auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16, 2966 unsigned Opc8) { 2967 switch (MemVT.getSimpleVT().SimpleTy) { 2968 case MVT::i64: 2969 return Opc64; 2970 case MVT::i32: 2971 return Opc32; 2972 case MVT::i16: 2973 return Opc16; 2974 case MVT::i8: 2975 return Opc8; 2976 default: 2977 llvm_unreachable("Invalid size!"); 2978 } 2979 }; 2980 2981 MachineSDNode *Result; 2982 switch (Opc) { 2983 case X86ISD::SUB: 2984 // Handle negate. 2985 if (IsNegate) { 2986 unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m, 2987 X86::NEG8m); 2988 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain}; 2989 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, 2990 MVT::Other, Ops); 2991 break; 2992 } 2993 LLVM_FALLTHROUGH; 2994 case X86ISD::ADD: 2995 // Try to match inc/dec. 2996 if (!Subtarget->slowIncDec() || OptForSize) { 2997 bool IsOne = isOneConstant(StoredVal.getOperand(1)); 2998 bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1)); 2999 // ADD/SUB with 1/-1 and carry flag isn't used can use inc/dec. 3000 if ((IsOne || IsNegOne) && hasNoCarryFlagUses(StoredVal.getValue(1))) { 3001 unsigned NewOpc = 3002 ((Opc == X86ISD::ADD) == IsOne) 3003 ? 
SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m) 3004 : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m); 3005 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain}; 3006 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, 3007 MVT::Other, Ops); 3008 break; 3009 } 3010 } 3011 LLVM_FALLTHROUGH; 3012 case X86ISD::ADC: 3013 case X86ISD::SBB: 3014 case X86ISD::AND: 3015 case X86ISD::OR: 3016 case X86ISD::XOR: { 3017 auto SelectRegOpcode = [SelectOpcode](unsigned Opc) { 3018 switch (Opc) { 3019 case X86ISD::ADD: 3020 return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr, 3021 X86::ADD8mr); 3022 case X86ISD::ADC: 3023 return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr, 3024 X86::ADC8mr); 3025 case X86ISD::SUB: 3026 return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr, 3027 X86::SUB8mr); 3028 case X86ISD::SBB: 3029 return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr, 3030 X86::SBB8mr); 3031 case X86ISD::AND: 3032 return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr, 3033 X86::AND8mr); 3034 case X86ISD::OR: 3035 return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr); 3036 case X86ISD::XOR: 3037 return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr, 3038 X86::XOR8mr); 3039 default: 3040 llvm_unreachable("Invalid opcode!"); 3041 } 3042 }; 3043 auto SelectImm8Opcode = [SelectOpcode](unsigned Opc) { 3044 switch (Opc) { 3045 case X86ISD::ADD: 3046 return SelectOpcode(X86::ADD64mi8, X86::ADD32mi8, X86::ADD16mi8, 0); 3047 case X86ISD::ADC: 3048 return SelectOpcode(X86::ADC64mi8, X86::ADC32mi8, X86::ADC16mi8, 0); 3049 case X86ISD::SUB: 3050 return SelectOpcode(X86::SUB64mi8, X86::SUB32mi8, X86::SUB16mi8, 0); 3051 case X86ISD::SBB: 3052 return SelectOpcode(X86::SBB64mi8, X86::SBB32mi8, X86::SBB16mi8, 0); 3053 case X86ISD::AND: 3054 return SelectOpcode(X86::AND64mi8, X86::AND32mi8, X86::AND16mi8, 0); 3055 case X86ISD::OR: 3056 return SelectOpcode(X86::OR64mi8, X86::OR32mi8, X86::OR16mi8, 0); 3057 case X86ISD::XOR: 3058 return SelectOpcode(X86::XOR64mi8, X86::XOR32mi8, X86::XOR16mi8, 0); 3059 default: 3060 llvm_unreachable("Invalid opcode!"); 3061 } 3062 }; 3063 auto SelectImmOpcode = [SelectOpcode](unsigned Opc) { 3064 switch (Opc) { 3065 case X86ISD::ADD: 3066 return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi, 3067 X86::ADD8mi); 3068 case X86ISD::ADC: 3069 return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi, 3070 X86::ADC8mi); 3071 case X86ISD::SUB: 3072 return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi, 3073 X86::SUB8mi); 3074 case X86ISD::SBB: 3075 return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi, 3076 X86::SBB8mi); 3077 case X86ISD::AND: 3078 return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi, 3079 X86::AND8mi); 3080 case X86ISD::OR: 3081 return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi, 3082 X86::OR8mi); 3083 case X86ISD::XOR: 3084 return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi, 3085 X86::XOR8mi); 3086 default: 3087 llvm_unreachable("Invalid opcode!"); 3088 } 3089 }; 3090 3091 unsigned NewOpc = SelectRegOpcode(Opc); 3092 SDValue Operand = StoredVal->getOperand(1-LoadOpNo); 3093 3094 // See if the operand is a constant that we can fold into an immediate 3095 // operand. 
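    // E.g. a 32-bit OR with 7 can use the mi8 form directly, while an ADD of
    // 128 does not fit in an imm8 but (when the carry flag is dead) can be
    // rewritten as a SUB of -128 so the short encoding still applies.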
3096 if (auto *OperandC = dyn_cast<ConstantSDNode>(Operand)) { 3097 int64_t OperandV = OperandC->getSExtValue(); 3098 3099 // Check if we can shrink the operand enough to fit in an immediate (or 3100 // fit into a smaller immediate) by negating it and switching the 3101 // operation. 3102 if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) && 3103 ((MemVT != MVT::i8 && !isInt<8>(OperandV) && isInt<8>(-OperandV)) || 3104 (MemVT == MVT::i64 && !isInt<32>(OperandV) && 3105 isInt<32>(-OperandV))) && 3106 hasNoCarryFlagUses(StoredVal.getValue(1))) { 3107 OperandV = -OperandV; 3108 Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD; 3109 } 3110 3111 // First try to fit this into an Imm8 operand. If it doesn't fit, then try 3112 // the larger immediate operand. 3113 if (MemVT != MVT::i8 && isInt<8>(OperandV)) { 3114 Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT); 3115 NewOpc = SelectImm8Opcode(Opc); 3116 } else if (MemVT != MVT::i64 || isInt<32>(OperandV)) { 3117 Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT); 3118 NewOpc = SelectImmOpcode(Opc); 3119 } 3120 } 3121 3122 if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) { 3123 SDValue CopyTo = 3124 CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS, 3125 StoredVal.getOperand(2), SDValue()); 3126 3127 const SDValue Ops[] = {Base, Scale, Index, Disp, 3128 Segment, Operand, CopyTo, CopyTo.getValue(1)}; 3129 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other, 3130 Ops); 3131 } else { 3132 const SDValue Ops[] = {Base, Scale, Index, Disp, 3133 Segment, Operand, InputChain}; 3134 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other, 3135 Ops); 3136 } 3137 break; 3138 } 3139 default: 3140 llvm_unreachable("Invalid opcode!"); 3141 } 3142 3143 MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(), 3144 LoadNode->getMemOperand()}; 3145 CurDAG->setNodeMemRefs(Result, MemOps); 3146 3147 // Update Load Chain uses as well. 3148 ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1)); 3149 ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1)); 3150 ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0)); 3151 CurDAG->RemoveDeadNode(Node); 3152 return true; 3153 } 3154 3155 // See if this is an X & Mask that we can match to BEXTR/BZHI. 3156 // Where Mask is one of the following patterns: 3157 // a) x & (1 << nbits) - 1 3158 // b) x & ~(-1 << nbits) 3159 // c) x & (-1 >> (32 - y)) 3160 // d) x << (32 - y) >> (32 - y) 3161 bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) { 3162 assert( 3163 (Node->getOpcode() == ISD::AND || Node->getOpcode() == ISD::SRL) && 3164 "Should be either an and-mask, or right-shift after clearing high bits."); 3165 3166 // BEXTR is BMI instruction, BZHI is BMI2 instruction. We need at least one. 3167 if (!Subtarget->hasBMI() && !Subtarget->hasBMI2()) 3168 return false; 3169 3170 MVT NVT = Node->getSimpleValueType(0); 3171 3172 // Only supported for 32 and 64 bits. 3173 if (NVT != MVT::i32 && NVT != MVT::i64) 3174 return false; 3175 3176 SDValue NBits; 3177 3178 // If we have BMI2's BZHI, we are ok with muti-use patterns. 3179 // Else, if we only have BMI1's BEXTR, we require one-use. 
3180 const bool CanHaveExtraUses = Subtarget->hasBMI2(); 3181 auto checkUses = [CanHaveExtraUses](SDValue Op, unsigned NUses) { 3182 return CanHaveExtraUses || 3183 Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo()); 3184 }; 3185 auto checkOneUse = [checkUses](SDValue Op) { return checkUses(Op, 1); }; 3186 auto checkTwoUse = [checkUses](SDValue Op) { return checkUses(Op, 2); }; 3187 3188 auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) { 3189 if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) { 3190 assert(V.getSimpleValueType() == MVT::i32 && 3191 V.getOperand(0).getSimpleValueType() == MVT::i64 && 3192 "Expected i64 -> i32 truncation"); 3193 V = V.getOperand(0); 3194 } 3195 return V; 3196 }; 3197 3198 // a) x & ((1 << nbits) + (-1)) 3199 auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation, 3200 &NBits](SDValue Mask) -> bool { 3201 // Match `add`. Must only have one use! 3202 if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask)) 3203 return false; 3204 // We should be adding all-ones constant (i.e. subtracting one.) 3205 if (!isAllOnesConstant(Mask->getOperand(1))) 3206 return false; 3207 // Match `1 << nbits`. Might be truncated. Must only have one use! 3208 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0)); 3209 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0)) 3210 return false; 3211 if (!isOneConstant(M0->getOperand(0))) 3212 return false; 3213 NBits = M0->getOperand(1); 3214 return true; 3215 }; 3216 3217 auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) { 3218 V = peekThroughOneUseTruncation(V); 3219 return CurDAG->MaskedValueIsAllOnes( 3220 V, APInt::getLowBitsSet(V.getSimpleValueType().getSizeInBits(), 3221 NVT.getSizeInBits())); 3222 }; 3223 3224 // b) x & ~(-1 << nbits) 3225 auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation, 3226 &NBits](SDValue Mask) -> bool { 3227 // Match `~()`. Must only have one use! 3228 if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask)) 3229 return false; 3230 // The -1 only has to be all-ones for the final Node's NVT. 3231 if (!isAllOnes(Mask->getOperand(1))) 3232 return false; 3233 // Match `-1 << nbits`. Might be truncated. Must only have one use! 3234 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0)); 3235 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0)) 3236 return false; 3237 // The -1 only has to be all-ones for the final Node's NVT. 3238 if (!isAllOnes(M0->getOperand(0))) 3239 return false; 3240 NBits = M0->getOperand(1); 3241 return true; 3242 }; 3243 3244 // Match potentially-truncated (bitwidth - y) 3245 auto matchShiftAmt = [checkOneUse, &NBits](SDValue ShiftAmt, 3246 unsigned Bitwidth) { 3247 // Skip over a truncate of the shift amount. 3248 if (ShiftAmt.getOpcode() == ISD::TRUNCATE) { 3249 ShiftAmt = ShiftAmt.getOperand(0); 3250 // The trunc should have been the only user of the real shift amount. 3251 if (!checkOneUse(ShiftAmt)) 3252 return false; 3253 } 3254 // Match the shift amount as: (bitwidth - y). It should go away, too. 3255 if (ShiftAmt.getOpcode() != ISD::SUB) 3256 return false; 3257 auto V0 = dyn_cast<ConstantSDNode>(ShiftAmt.getOperand(0)); 3258 if (!V0 || V0->getZExtValue() != Bitwidth) 3259 return false; 3260 NBits = ShiftAmt.getOperand(1); 3261 return true; 3262 }; 3263 3264 // c) x & (-1 >> (32 - y)) 3265 auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation, 3266 matchShiftAmt](SDValue Mask) -> bool { 3267 // The mask itself may be truncated. 
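  // E.g. an i64 mask (-1 >> (64 - n)) that is truncated to i32 before the
  // 'and' still selects the low n bits, so we can look through that truncate
  // as long as it has no other users.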
3268 Mask = peekThroughOneUseTruncation(Mask); 3269 unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits(); 3270 // Match `l>>`. Must only have one use! 3271 if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask)) 3272 return false; 3273 // We should be shifting a truly all-ones constant. 3274 if (!isAllOnesConstant(Mask.getOperand(0))) 3275 return false; 3276 SDValue M1 = Mask.getOperand(1); 3277 // The shift amount should not be used externally. 3278 if (!checkOneUse(M1)) 3279 return false; 3280 return matchShiftAmt(M1, Bitwidth); 3281 }; 3282 3283 SDValue X; 3284 3285 // d) x << (32 - y) >> (32 - y) 3286 auto matchPatternD = [checkOneUse, checkTwoUse, matchShiftAmt, 3287 &X](SDNode *Node) -> bool { 3288 if (Node->getOpcode() != ISD::SRL) 3289 return false; 3290 SDValue N0 = Node->getOperand(0); 3291 if (N0->getOpcode() != ISD::SHL || !checkOneUse(N0)) 3292 return false; 3293 unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits(); 3294 SDValue N1 = Node->getOperand(1); 3295 SDValue N01 = N0->getOperand(1); 3296 // Both of the shifts must be by the exact same value. 3297 // There should not be any uses of the shift amount outside of the pattern. 3298 if (N1 != N01 || !checkTwoUse(N1)) 3299 return false; 3300 if (!matchShiftAmt(N1, Bitwidth)) 3301 return false; 3302 X = N0->getOperand(0); 3303 return true; 3304 }; 3305 3306 auto matchLowBitMask = [matchPatternA, matchPatternB, 3307 matchPatternC](SDValue Mask) -> bool { 3308 return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask); 3309 }; 3310 3311 if (Node->getOpcode() == ISD::AND) { 3312 X = Node->getOperand(0); 3313 SDValue Mask = Node->getOperand(1); 3314 3315 if (matchLowBitMask(Mask)) { 3316 // Great. 3317 } else { 3318 std::swap(X, Mask); 3319 if (!matchLowBitMask(Mask)) 3320 return false; 3321 } 3322 } else if (!matchPatternD(Node)) 3323 return false; 3324 3325 SDLoc DL(Node); 3326 3327 // Truncate the shift amount. 3328 NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits); 3329 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits); 3330 3331 // Insert the 8-bit NBits into the lowest 8 bits of a 32-bit register. 3332 // All the other bits are undefined; we do not care about them. 3333 SDValue ImplDef = SDValue( 3334 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0); 3335 insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef); 3336 3337 SDValue SRIdxVal = CurDAG->getTargetConstant(X86::sub_8bit, DL, MVT::i32); 3338 insertDAGNode(*CurDAG, SDValue(Node, 0), SRIdxVal); 3339 NBits = SDValue( 3340 CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::i32, ImplDef, 3341 NBits, SRIdxVal), 0); 3342 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits); 3343 3344 if (Subtarget->hasBMI2()) { 3345 // Great, just emit the BZHI. 3346 if (NVT != MVT::i32) { 3347 // But we have to place the bit count into the wide-enough register first. 3348 NBits = CurDAG->getNode(ISD::ANY_EXTEND, DL, NVT, NBits); 3349 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits); 3350 } 3351 3352 SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits); 3353 ReplaceNode(Node, Extract.getNode()); 3354 SelectCode(Extract.getNode()); 3355 return true; 3356 } 3357 3358 // Else, if we do *NOT* have BMI2, let's find out if the 'X' is 3359 // *logically* shifted (potentially with a one-use trunc in between), 3360 // and the truncation was the only use of the shift, 3361 // and if so look past one-use truncation. 3362 { 3363 SDValue RealX = peekThroughOneUseTruncation(X); 3364 // FIXME: only if the shift is one-use?
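  // (If 'X' is really a truncated logical shift, working on the wider shift
  // lets its shift amount be folded into the low 8 bits of the BEXTR control
  // that is built below.)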
3365 if (RealX != X && RealX.getOpcode() == ISD::SRL) 3366 X = RealX; 3367 } 3368 3369 MVT XVT = X.getSimpleValueType(); 3370 3371 // Else, emitting BEXTR requires one more step. 3372 // The 'control' of BEXTR has the pattern of: 3373 // [15...8 bit][ 7...0 bit] location 3374 // [ bit count][ shift] name 3375 // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11 3376 3377 // Shift NBits left by 8 bits, thus producing 'control'. 3378 // This makes the low 8 bits to be zero. 3379 SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8); 3380 SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8); 3381 insertDAGNode(*CurDAG, SDValue(Node, 0), Control); 3382 3383 // If the 'X' is *logically* shifted, we can fold that shift into 'control'. 3384 // FIXME: only if the shift is one-use? 3385 if (X.getOpcode() == ISD::SRL) { 3386 SDValue ShiftAmt = X.getOperand(1); 3387 X = X.getOperand(0); 3388 3389 assert(ShiftAmt.getValueType() == MVT::i8 && 3390 "Expected shift amount to be i8"); 3391 3392 // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero! 3393 // We could zext to i16 in some form, but we intentionally don't do that. 3394 SDValue OrigShiftAmt = ShiftAmt; 3395 ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt); 3396 insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt); 3397 3398 // And now 'or' these low 8 bits of shift amount into the 'control'. 3399 Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt); 3400 insertDAGNode(*CurDAG, SDValue(Node, 0), Control); 3401 } 3402 3403 // But have to place the 'control' into the wide-enough register first. 3404 if (XVT != MVT::i32) { 3405 Control = CurDAG->getNode(ISD::ANY_EXTEND, DL, XVT, Control); 3406 insertDAGNode(*CurDAG, SDValue(Node, 0), Control); 3407 } 3408 3409 // And finally, form the BEXTR itself. 3410 SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, XVT, X, Control); 3411 3412 // The 'X' was originally truncated. Do that now. 3413 if (XVT != NVT) { 3414 insertDAGNode(*CurDAG, SDValue(Node, 0), Extract); 3415 Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract); 3416 } 3417 3418 ReplaceNode(Node, Extract.getNode()); 3419 SelectCode(Extract.getNode()); 3420 3421 return true; 3422 } 3423 3424 // See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI. 3425 MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) { 3426 MVT NVT = Node->getSimpleValueType(0); 3427 SDLoc dl(Node); 3428 3429 SDValue N0 = Node->getOperand(0); 3430 SDValue N1 = Node->getOperand(1); 3431 3432 // If we have TBM we can use an immediate for the control. If we have BMI 3433 // we should only do this if the BEXTR instruction is implemented well. 3434 // Otherwise moving the control into a register makes this more costly. 3435 // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM 3436 // hoisting the move immediate would make it worthwhile with a less optimal 3437 // BEXTR? 3438 if (!Subtarget->hasTBM() && 3439 !(Subtarget->hasBMI() && Subtarget->hasFastBEXTR())) 3440 return nullptr; 3441 3442 // Must have a shift right. 3443 if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA) 3444 return nullptr; 3445 3446 // Shift can't have additional users. 3447 if (!N0->hasOneUse()) 3448 return nullptr; 3449 3450 // Only supported for 32 and 64 bits. 3451 if (NVT != MVT::i32 && NVT != MVT::i64) 3452 return nullptr; 3453 3454 // Shift amount and RHS of and must be constant. 
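  // For example, for (x >> 4) & 0xfff the shift is 4 and the mask has 12 set
  // bits, so the control immediate built below is 4 | (12 << 8) == 0xc04.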
3455 ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(N1); 3456 ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1)); 3457 if (!MaskCst || !ShiftCst) 3458 return nullptr; 3459 3460 // And RHS must be a mask. 3461 uint64_t Mask = MaskCst->getZExtValue(); 3462 if (!isMask_64(Mask)) 3463 return nullptr; 3464 3465 uint64_t Shift = ShiftCst->getZExtValue(); 3466 uint64_t MaskSize = countPopulation(Mask); 3467 3468 // Don't interfere with something that can be handled by extracting AH. 3469 // TODO: If we are able to fold a load, BEXTR might still be better than AH. 3470 if (Shift == 8 && MaskSize == 8) 3471 return nullptr; 3472 3473 // Make sure we are only using bits that were in the original value, not 3474 // shifted in. 3475 if (Shift + MaskSize > NVT.getSizeInBits()) 3476 return nullptr; 3477 3478 SDValue New = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT); 3479 unsigned ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri; 3480 unsigned MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi; 3481 3482 // BMI requires the immediate to placed in a register. 3483 if (!Subtarget->hasTBM()) { 3484 ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr; 3485 MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm; 3486 unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri; 3487 New = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, New), 0); 3488 } 3489 3490 MachineSDNode *NewNode; 3491 SDValue Input = N0->getOperand(0); 3492 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; 3493 if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { 3494 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, New, Input.getOperand(0) }; 3495 SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other); 3496 NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); 3497 // Update the chain. 3498 ReplaceUses(Input.getValue(1), SDValue(NewNode, 2)); 3499 // Record the mem-refs 3500 CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()}); 3501 } else { 3502 NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, New); 3503 } 3504 3505 return NewNode; 3506 } 3507 3508 // Emit a PCMISTR(I/M) instruction. 3509 MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc, 3510 bool MayFoldLoad, const SDLoc &dl, 3511 MVT VT, SDNode *Node) { 3512 SDValue N0 = Node->getOperand(0); 3513 SDValue N1 = Node->getOperand(1); 3514 SDValue Imm = Node->getOperand(2); 3515 const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue(); 3516 Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType()); 3517 3518 // Try to fold a load. No need to check alignment. 3519 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; 3520 if (MayFoldLoad && tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { 3521 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm, 3522 N1.getOperand(0) }; 3523 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other); 3524 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); 3525 // Update the chain. 3526 ReplaceUses(N1.getValue(1), SDValue(CNode, 2)); 3527 // Record the mem-refs 3528 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()}); 3529 return CNode; 3530 } 3531 3532 SDValue Ops[] = { N0, N1, Imm }; 3533 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32); 3534 MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops); 3535 return CNode; 3536 } 3537 3538 // Emit a PCMESTR(I/M) instruction. 
Also return the Glue result in case we need 3539 // to emit a second instruction after this one. This is needed since we have two 3540 // copyToReg nodes glued before this and we need to continue that glue through. 3541 MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc, 3542 bool MayFoldLoad, const SDLoc &dl, 3543 MVT VT, SDNode *Node, 3544 SDValue &InFlag) { 3545 SDValue N0 = Node->getOperand(0); 3546 SDValue N2 = Node->getOperand(2); 3547 SDValue Imm = Node->getOperand(4); 3548 const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue(); 3549 Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType()); 3550 3551 // Try to fold a load. No need to check alignment. 3552 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; 3553 if (MayFoldLoad && tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { 3554 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm, 3555 N2.getOperand(0), InFlag }; 3556 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue); 3557 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); 3558 InFlag = SDValue(CNode, 3); 3559 // Update the chain. 3560 ReplaceUses(N2.getValue(1), SDValue(CNode, 2)); 3561 // Record the mem-refs 3562 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N2)->getMemOperand()}); 3563 return CNode; 3564 } 3565 3566 SDValue Ops[] = { N0, N2, Imm, InFlag }; 3567 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue); 3568 MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops); 3569 InFlag = SDValue(CNode, 2); 3570 return CNode; 3571 } 3572 3573 bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) { 3574 EVT VT = N->getValueType(0); 3575 3576 // Only handle scalar shifts. 3577 if (VT.isVector()) 3578 return false; 3579 3580 // Narrower shifts only mask to 5 bits in hardware. 3581 unsigned Size = VT == MVT::i64 ? 64 : 32; 3582 3583 SDValue OrigShiftAmt = N->getOperand(1); 3584 SDValue ShiftAmt = OrigShiftAmt; 3585 SDLoc DL(N); 3586 3587 // Skip over a truncate of the shift amount. 3588 if (ShiftAmt->getOpcode() == ISD::TRUNCATE) 3589 ShiftAmt = ShiftAmt->getOperand(0); 3590 3591 // This function is called after X86DAGToDAGISel::matchBitExtract(), 3592 // so we are not afraid that we might mess up BZHI/BEXTR pattern. 3593 3594 SDValue NewShiftAmt; 3595 if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB) { 3596 SDValue Add0 = ShiftAmt->getOperand(0); 3597 SDValue Add1 = ShiftAmt->getOperand(1); 3598 // If we are shifting by X+/-N where N == 0 mod Size, then just shift by X 3599 // to avoid the ADD/SUB. 3600 if (isa<ConstantSDNode>(Add1) && 3601 cast<ConstantSDNode>(Add1)->getZExtValue() % Size == 0) { 3602 NewShiftAmt = Add0; 3603 // If we are shifting by N-X where N == 0 mod Size, then just shift by -X to 3604 // generate a NEG instead of a SUB of a constant. 3605 } else if (ShiftAmt->getOpcode() == ISD::SUB && 3606 isa<ConstantSDNode>(Add0) && 3607 cast<ConstantSDNode>(Add0)->getZExtValue() != 0 && 3608 cast<ConstantSDNode>(Add0)->getZExtValue() % Size == 0) { 3609 // Insert a negate op. 3610 // TODO: This isn't guaranteed to replace the sub if there is a logic cone 3611 // that uses it that's not a shift. 3612 EVT SubVT = ShiftAmt.getValueType(); 3613 SDValue Zero = CurDAG->getConstant(0, DL, SubVT); 3614 SDValue Neg = CurDAG->getNode(ISD::SUB, DL, SubVT, Zero, Add1); 3615 NewShiftAmt = Neg; 3616 3617 // Insert these operands into a valid topological order so they can 3618 // get selected independently. 
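  // (The negation is safe because the hardware masks the shift amount: for a
  // 32-bit shift, (32 - X) and (0 - X) are congruent modulo 32.)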
3619 insertDAGNode(*CurDAG, OrigShiftAmt, Zero); 3620 insertDAGNode(*CurDAG, OrigShiftAmt, Neg); 3621 } else 3622 return false; 3623 } else 3624 return false; 3625 3626 if (NewShiftAmt.getValueType() != MVT::i8) { 3627 // Need to truncate the shift amount. 3628 NewShiftAmt = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NewShiftAmt); 3629 // Add to a correct topological ordering. 3630 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt); 3631 } 3632 3633 // Insert a new mask to keep the shift amount legal. This should be removed 3634 // by isel patterns. 3635 NewShiftAmt = CurDAG->getNode(ISD::AND, DL, MVT::i8, NewShiftAmt, 3636 CurDAG->getConstant(Size - 1, DL, MVT::i8)); 3637 // Place in a correct topological ordering. 3638 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt); 3639 3640 SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, N->getOperand(0), 3641 NewShiftAmt); 3642 if (UpdatedNode != N) { 3643 // If we found an existing node, we should replace ourselves with that node 3644 // and wait for it to be selected after its other users. 3645 ReplaceNode(N, UpdatedNode); 3646 return true; 3647 } 3648 3649 // If the original shift amount is now dead, delete it so that we don't run 3650 // it through isel. 3651 if (OrigShiftAmt.getNode()->use_empty()) 3652 CurDAG->RemoveDeadNode(OrigShiftAmt.getNode()); 3653 3654 // Now that we've optimized the shift amount, defer to normal isel to get 3655 // load folding and legacy vs BMI2 selection without repeating it here. 3656 SelectCode(N); 3657 return true; 3658 } 3659 3660 bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) { 3661 MVT NVT = N->getSimpleValueType(0); 3662 unsigned Opcode = N->getOpcode(); 3663 SDLoc dl(N); 3664 3665 // For operations of the form (x << C1) op C2, check if we can use a smaller 3666 // encoding for C2 by transforming it into (x op (C2>>C1)) << C1. 3667 SDValue Shift = N->getOperand(0); 3668 SDValue N1 = N->getOperand(1); 3669 3670 ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1); 3671 if (!Cst) 3672 return false; 3673 3674 int64_t Val = Cst->getSExtValue(); 3675 3676 // If we have an any_extend feeding the AND, look through it to see if there 3677 // is a shift behind it. But only if the AND doesn't use the extended bits. 3678 // FIXME: Generalize this to other ANY_EXTEND than i32 to i64? 3679 bool FoundAnyExtend = false; 3680 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() && 3681 Shift.getOperand(0).getSimpleValueType() == MVT::i32 && 3682 isUInt<32>(Val)) { 3683 FoundAnyExtend = true; 3684 Shift = Shift.getOperand(0); 3685 } 3686 3687 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse()) 3688 return false; 3689 3690 // i8 is unshrinkable, i16 should be promoted to i32. 3691 if (NVT != MVT::i32 && NVT != MVT::i64) 3692 return false; 3693 3694 ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(Shift.getOperand(1)); 3695 if (!ShlCst) 3696 return false; 3697 3698 uint64_t ShAmt = ShlCst->getZExtValue(); 3699 3700 // Make sure that we don't change the operation by removing bits. 3701 // This only matters for OR and XOR, AND is unaffected. 3702 uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1; 3703 if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0) 3704 return false; 3705 3706 // Check the minimum bitwidth for the new constant. 3707 // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32. 3708 auto CanShrinkImmediate = [&](int64_t &ShiftedVal) { 3709 if (Opcode == ISD::AND) { 3710 // AND32ri is the same as AND64ri32 with zext imm. 
3711 // Try this before sign extended immediates below. 3712 ShiftedVal = (uint64_t)Val >> ShAmt; 3713 if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal)) 3714 return true; 3715 // Also swap order when the AND can become MOVZX. 3716 if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX) 3717 return true; 3718 } 3719 ShiftedVal = Val >> ShAmt; 3720 if ((!isInt<8>(Val) && isInt<8>(ShiftedVal)) || 3721 (!isInt<32>(Val) && isInt<32>(ShiftedVal))) 3722 return true; 3723 if (Opcode != ISD::AND) { 3724 // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr 3725 ShiftedVal = (uint64_t)Val >> ShAmt; 3726 if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal)) 3727 return true; 3728 } 3729 return false; 3730 }; 3731 3732 int64_t ShiftedVal; 3733 if (!CanShrinkImmediate(ShiftedVal)) 3734 return false; 3735 3736 // Ok, we can reorder to get a smaller immediate. 3737 3738 // But it's possible the original immediate allowed an AND to become MOVZX. 3739 // We do this check late in order to delay the MaskedValueIsZero call as 3740 // long as possible. 3741 if (Opcode == ISD::AND) { 3742 // Find the smallest zext this could possibly be. 3743 unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits(); 3744 ZExtWidth = PowerOf2Ceil(std::max(ZExtWidth, 8U)); 3745 3746 // Figure out which bits need to be zero to achieve that mask. 3747 APInt NeededMask = APInt::getLowBitsSet(NVT.getSizeInBits(), 3748 ZExtWidth); 3749 NeededMask &= ~Cst->getAPIntValue(); 3750 3751 if (CurDAG->MaskedValueIsZero(N->getOperand(0), NeededMask)) 3752 return false; 3753 } 3754 3755 SDValue X = Shift.getOperand(0); 3756 if (FoundAnyExtend) { 3757 SDValue NewX = CurDAG->getNode(ISD::ANY_EXTEND, dl, NVT, X); 3758 insertDAGNode(*CurDAG, SDValue(N, 0), NewX); 3759 X = NewX; 3760 } 3761 3762 SDValue NewCst = CurDAG->getConstant(ShiftedVal, dl, NVT); 3763 insertDAGNode(*CurDAG, SDValue(N, 0), NewCst); 3764 SDValue NewBinOp = CurDAG->getNode(Opcode, dl, NVT, X, NewCst); 3765 insertDAGNode(*CurDAG, SDValue(N, 0), NewBinOp); 3766 SDValue NewSHL = CurDAG->getNode(ISD::SHL, dl, NVT, NewBinOp, 3767 Shift.getOperand(1)); 3768 ReplaceNode(N, NewSHL.getNode()); 3769 SelectCode(NewSHL.getNode()); 3770 return true; 3771 } 3772 3773 /// If the high bits of an 'and' operand are known zero, try setting the 3774 /// high bits of an 'and' constant operand to produce a smaller encoding by 3775 /// creating a small, sign-extended negative immediate rather than a large 3776 /// positive one. This reverses a transform in SimplifyDemandedBits that 3777 /// shrinks mask constants by clearing bits. There is also a possibility that 3778 /// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that 3779 /// case, just replace the 'and'. Return 'true' if the node is replaced. 3780 bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) { 3781 // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't 3782 // have immediate operands. 3783 MVT VT = And->getSimpleValueType(0); 3784 if (VT != MVT::i32 && VT != MVT::i64) 3785 return false; 3786 3787 auto *And1C = dyn_cast<ConstantSDNode>(And->getOperand(1)); 3788 if (!And1C) 3789 return false; 3790 3791 // Bail out if the mask constant is already negative. It can't shrink more. 3792 // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel 3793 // patterns to use a 32-bit and instead of a 64-bit and by relying on the 3794 // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits 3795 // are negative too.
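  // For example, if the upper 24 bits of the other operand are known zero,
  // 'and $0xf0' can become 'and $-16': the result is unchanged, but -16 fits
  // in a sign-extended 8-bit immediate and encodes smaller.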
3796 APInt MaskVal = And1C->getAPIntValue(); 3797 unsigned MaskLZ = MaskVal.countLeadingZeros(); 3798 if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32)) 3799 return false; 3800 3801 // Don't extend into the upper 32 bits of a 64 bit mask. 3802 if (VT == MVT::i64 && MaskLZ >= 32) { 3803 MaskLZ -= 32; 3804 MaskVal = MaskVal.trunc(32); 3805 } 3806 3807 SDValue And0 = And->getOperand(0); 3808 APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ); 3809 APInt NegMaskVal = MaskVal | HighZeros; 3810 3811 // If a negative constant would not allow a smaller encoding, there's no need 3812 // to continue. Only change the constant when we know it's a win. 3813 unsigned MinWidth = NegMaskVal.getMinSignedBits(); 3814 if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getMinSignedBits() <= 32)) 3815 return false; 3816 3817 // Extend masks if we truncated above. 3818 if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) { 3819 NegMaskVal = NegMaskVal.zext(64); 3820 HighZeros = HighZeros.zext(64); 3821 } 3822 3823 // The variable operand must be all zeros in the top bits to allow using the 3824 // new, negative constant as the mask. 3825 if (!CurDAG->MaskedValueIsZero(And0, HighZeros)) 3826 return false; 3827 3828 // Check if the mask is -1. In that case, this is an unnecessary instruction 3829 // that escaped earlier analysis. 3830 if (NegMaskVal.isAllOnesValue()) { 3831 ReplaceNode(And, And0.getNode()); 3832 return true; 3833 } 3834 3835 // A negative mask allows a smaller encoding. Create a new 'and' node. 3836 SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT); 3837 SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask); 3838 ReplaceNode(And, NewAnd.getNode()); 3839 SelectCode(NewAnd.getNode()); 3840 return true; 3841 } 3842 3843 static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad, 3844 bool FoldedBCast, bool Masked) { 3845 if (Masked) { 3846 if (FoldedLoad) { 3847 switch (TestVT.SimpleTy) { 3848 default: llvm_unreachable("Unexpected VT!"); 3849 case MVT::v16i8: 3850 return IsTestN ? X86::VPTESTNMBZ128rmk : X86::VPTESTMBZ128rmk; 3851 case MVT::v8i16: 3852 return IsTestN ? X86::VPTESTNMWZ128rmk : X86::VPTESTMWZ128rmk; 3853 case MVT::v4i32: 3854 return IsTestN ? X86::VPTESTNMDZ128rmk : X86::VPTESTMDZ128rmk; 3855 case MVT::v2i64: 3856 return IsTestN ? X86::VPTESTNMQZ128rmk : X86::VPTESTMQZ128rmk; 3857 case MVT::v32i8: 3858 return IsTestN ? X86::VPTESTNMBZ256rmk : X86::VPTESTMBZ256rmk; 3859 case MVT::v16i16: 3860 return IsTestN ? X86::VPTESTNMWZ256rmk : X86::VPTESTMWZ256rmk; 3861 case MVT::v8i32: 3862 return IsTestN ? X86::VPTESTNMDZ256rmk : X86::VPTESTMDZ256rmk; 3863 case MVT::v4i64: 3864 return IsTestN ? X86::VPTESTNMQZ256rmk : X86::VPTESTMQZ256rmk; 3865 case MVT::v64i8: 3866 return IsTestN ? X86::VPTESTNMBZrmk : X86::VPTESTMBZrmk; 3867 case MVT::v32i16: 3868 return IsTestN ? X86::VPTESTNMWZrmk : X86::VPTESTMWZrmk; 3869 case MVT::v16i32: 3870 return IsTestN ? X86::VPTESTNMDZrmk : X86::VPTESTMDZrmk; 3871 case MVT::v8i64: 3872 return IsTestN ? X86::VPTESTNMQZrmk : X86::VPTESTMQZrmk; 3873 } 3874 } 3875 3876 if (FoldedBCast) { 3877 switch (TestVT.SimpleTy) { 3878 default: llvm_unreachable("Unexpected VT!"); 3879 case MVT::v4i32: 3880 return IsTestN ? X86::VPTESTNMDZ128rmbk : X86::VPTESTMDZ128rmbk; 3881 case MVT::v2i64: 3882 return IsTestN ? X86::VPTESTNMQZ128rmbk : X86::VPTESTMQZ128rmbk; 3883 case MVT::v8i32: 3884 return IsTestN ? X86::VPTESTNMDZ256rmbk : X86::VPTESTMDZ256rmbk; 3885 case MVT::v4i64: 3886 return IsTestN ? 
X86::VPTESTNMQZ256rmbk : X86::VPTESTMQZ256rmbk; 3887 case MVT::v16i32: 3888 return IsTestN ? X86::VPTESTNMDZrmbk : X86::VPTESTMDZrmbk; 3889 case MVT::v8i64: 3890 return IsTestN ? X86::VPTESTNMQZrmbk : X86::VPTESTMQZrmbk; 3891 } 3892 } 3893 3894 switch (TestVT.SimpleTy) { 3895 default: llvm_unreachable("Unexpected VT!"); 3896 case MVT::v16i8: 3897 return IsTestN ? X86::VPTESTNMBZ128rrk : X86::VPTESTMBZ128rrk; 3898 case MVT::v8i16: 3899 return IsTestN ? X86::VPTESTNMWZ128rrk : X86::VPTESTMWZ128rrk; 3900 case MVT::v4i32: 3901 return IsTestN ? X86::VPTESTNMDZ128rrk : X86::VPTESTMDZ128rrk; 3902 case MVT::v2i64: 3903 return IsTestN ? X86::VPTESTNMQZ128rrk : X86::VPTESTMQZ128rrk; 3904 case MVT::v32i8: 3905 return IsTestN ? X86::VPTESTNMBZ256rrk : X86::VPTESTMBZ256rrk; 3906 case MVT::v16i16: 3907 return IsTestN ? X86::VPTESTNMWZ256rrk : X86::VPTESTMWZ256rrk; 3908 case MVT::v8i32: 3909 return IsTestN ? X86::VPTESTNMDZ256rrk : X86::VPTESTMDZ256rrk; 3910 case MVT::v4i64: 3911 return IsTestN ? X86::VPTESTNMQZ256rrk : X86::VPTESTMQZ256rrk; 3912 case MVT::v64i8: 3913 return IsTestN ? X86::VPTESTNMBZrrk : X86::VPTESTMBZrrk; 3914 case MVT::v32i16: 3915 return IsTestN ? X86::VPTESTNMWZrrk : X86::VPTESTMWZrrk; 3916 case MVT::v16i32: 3917 return IsTestN ? X86::VPTESTNMDZrrk : X86::VPTESTMDZrrk; 3918 case MVT::v8i64: 3919 return IsTestN ? X86::VPTESTNMQZrrk : X86::VPTESTMQZrrk; 3920 } 3921 } 3922 3923 if (FoldedLoad) { 3924 switch (TestVT.SimpleTy) { 3925 default: llvm_unreachable("Unexpected VT!"); 3926 case MVT::v16i8: 3927 return IsTestN ? X86::VPTESTNMBZ128rm : X86::VPTESTMBZ128rm; 3928 case MVT::v8i16: 3929 return IsTestN ? X86::VPTESTNMWZ128rm : X86::VPTESTMWZ128rm; 3930 case MVT::v4i32: 3931 return IsTestN ? X86::VPTESTNMDZ128rm : X86::VPTESTMDZ128rm; 3932 case MVT::v2i64: 3933 return IsTestN ? X86::VPTESTNMQZ128rm : X86::VPTESTMQZ128rm; 3934 case MVT::v32i8: 3935 return IsTestN ? X86::VPTESTNMBZ256rm : X86::VPTESTMBZ256rm; 3936 case MVT::v16i16: 3937 return IsTestN ? X86::VPTESTNMWZ256rm : X86::VPTESTMWZ256rm; 3938 case MVT::v8i32: 3939 return IsTestN ? X86::VPTESTNMDZ256rm : X86::VPTESTMDZ256rm; 3940 case MVT::v4i64: 3941 return IsTestN ? X86::VPTESTNMQZ256rm : X86::VPTESTMQZ256rm; 3942 case MVT::v64i8: 3943 return IsTestN ? X86::VPTESTNMBZrm : X86::VPTESTMBZrm; 3944 case MVT::v32i16: 3945 return IsTestN ? X86::VPTESTNMWZrm : X86::VPTESTMWZrm; 3946 case MVT::v16i32: 3947 return IsTestN ? X86::VPTESTNMDZrm : X86::VPTESTMDZrm; 3948 case MVT::v8i64: 3949 return IsTestN ? X86::VPTESTNMQZrm : X86::VPTESTMQZrm; 3950 } 3951 } 3952 3953 if (FoldedBCast) { 3954 switch (TestVT.SimpleTy) { 3955 default: llvm_unreachable("Unexpected VT!"); 3956 case MVT::v4i32: 3957 return IsTestN ? X86::VPTESTNMDZ128rmb : X86::VPTESTMDZ128rmb; 3958 case MVT::v2i64: 3959 return IsTestN ? X86::VPTESTNMQZ128rmb : X86::VPTESTMQZ128rmb; 3960 case MVT::v8i32: 3961 return IsTestN ? X86::VPTESTNMDZ256rmb : X86::VPTESTMDZ256rmb; 3962 case MVT::v4i64: 3963 return IsTestN ? X86::VPTESTNMQZ256rmb : X86::VPTESTMQZ256rmb; 3964 case MVT::v16i32: 3965 return IsTestN ? X86::VPTESTNMDZrmb : X86::VPTESTMDZrmb; 3966 case MVT::v8i64: 3967 return IsTestN ? X86::VPTESTNMQZrmb : X86::VPTESTMQZrmb; 3968 } 3969 } 3970 3971 switch (TestVT.SimpleTy) { 3972 default: llvm_unreachable("Unexpected VT!"); 3973 case MVT::v16i8: 3974 return IsTestN ? X86::VPTESTNMBZ128rr : X86::VPTESTMBZ128rr; 3975 case MVT::v8i16: 3976 return IsTestN ? X86::VPTESTNMWZ128rr : X86::VPTESTMWZ128rr; 3977 case MVT::v4i32: 3978 return IsTestN ? 
X86::VPTESTNMDZ128rr : X86::VPTESTMDZ128rr; 3979 case MVT::v2i64: 3980 return IsTestN ? X86::VPTESTNMQZ128rr : X86::VPTESTMQZ128rr; 3981 case MVT::v32i8: 3982 return IsTestN ? X86::VPTESTNMBZ256rr : X86::VPTESTMBZ256rr; 3983 case MVT::v16i16: 3984 return IsTestN ? X86::VPTESTNMWZ256rr : X86::VPTESTMWZ256rr; 3985 case MVT::v8i32: 3986 return IsTestN ? X86::VPTESTNMDZ256rr : X86::VPTESTMDZ256rr; 3987 case MVT::v4i64: 3988 return IsTestN ? X86::VPTESTNMQZ256rr : X86::VPTESTMQZ256rr; 3989 case MVT::v64i8: 3990 return IsTestN ? X86::VPTESTNMBZrr : X86::VPTESTMBZrr; 3991 case MVT::v32i16: 3992 return IsTestN ? X86::VPTESTNMWZrr : X86::VPTESTMWZrr; 3993 case MVT::v16i32: 3994 return IsTestN ? X86::VPTESTNMDZrr : X86::VPTESTMDZrr; 3995 case MVT::v8i64: 3996 return IsTestN ? X86::VPTESTNMQZrr : X86::VPTESTMQZrr; 3997 } 3998 } 3999 4000 // Try to create VPTESTM instruction. If InMask is not null, it will be used 4001 // to form a masked operation. 4002 bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc, 4003 SDValue InMask) { 4004 assert(Subtarget->hasAVX512() && "Expected AVX512!"); 4005 assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 && 4006 "Unexpected VT!"); 4007 4008 // Look for equal and not equal compares. 4009 ISD::CondCode CC = cast<CondCodeSDNode>(Setcc.getOperand(2))->get(); 4010 if (CC != ISD::SETEQ && CC != ISD::SETNE) 4011 return false; 4012 4013 // See if we're comparing against zero. This should have been canonicalized 4014 // to RHS during lowering. 4015 if (!ISD::isBuildVectorAllZeros(Setcc.getOperand(1).getNode())) 4016 return false; 4017 4018 SDValue N0 = Setcc.getOperand(0); 4019 4020 MVT CmpVT = N0.getSimpleValueType(); 4021 MVT CmpSVT = CmpVT.getVectorElementType(); 4022 4023 // Start with both operands the same. We'll try to refine this. 4024 SDValue Src0 = N0; 4025 SDValue Src1 = N0; 4026 4027 { 4028 // Look through single use bitcasts. 4029 SDValue N0Temp = N0; 4030 if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse()) 4031 N0Temp = N0.getOperand(0); 4032 4033 // Look for single use AND. 4034 if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) { 4035 Src0 = N0Temp.getOperand(0); 4036 Src1 = N0Temp.getOperand(1); 4037 } 4038 } 4039 4040 // Without VLX we need to widen the load. 4041 bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector(); 4042 4043 // We can only fold loads if the sources are unique. 4044 bool CanFoldLoads = Src0 != Src1; 4045 4046 // Try to fold loads unless we need to widen. 4047 bool FoldedLoad = false; 4048 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Load; 4049 if (!Widen && CanFoldLoads) { 4050 Load = Src1; 4051 FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2, Tmp3, 4052 Tmp4); 4053 if (!FoldedLoad) { 4054 // And is computative. 4055 Load = Src0; 4056 FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2, 4057 Tmp3, Tmp4); 4058 if (FoldedLoad) 4059 std::swap(Src0, Src1); 4060 } 4061 } 4062 4063 auto findBroadcastedOp = [](SDValue Src, MVT CmpSVT, SDNode *&Parent) { 4064 // Look through single use bitcasts. 4065 if (Src.getOpcode() == ISD::BITCAST && Src.hasOneUse()) 4066 Src = Src.getOperand(0); 4067 4068 if (Src.getOpcode() == X86ISD::VBROADCAST && Src.hasOneUse()) { 4069 Parent = Src.getNode(); 4070 Src = Src.getOperand(0); 4071 if (Src.getSimpleValueType() == CmpSVT) 4072 return Src; 4073 } 4074 4075 return SDValue(); 4076 }; 4077 4078 // If we didn't fold a load, try to match broadcast. No widening limitation 4079 // for this. But only 32 and 64 bit types are supported. 
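  // E.g. comparing (and X, (X86ISD::VBROADCAST (load p))) against zero can use
  // one of the embedded-broadcast 'rmb' VPTESTM forms chosen by getVPTESTMOpc,
  // loading the scalar once instead of materializing a full vector.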
4080 bool FoldedBCast = false; 4081 if (!FoldedLoad && CanFoldLoads && 4082 (CmpSVT == MVT::i32 || CmpSVT == MVT::i64)) { 4083 SDNode *ParentNode = nullptr; 4084 if ((Load = findBroadcastedOp(Src1, CmpSVT, ParentNode))) { 4085 FoldedBCast = tryFoldLoad(Root, ParentNode, Load, Tmp0, 4086 Tmp1, Tmp2, Tmp3, Tmp4); 4087 } 4088 4089 // Try the other operand. 4090 if (!FoldedBCast) { 4091 if ((Load = findBroadcastedOp(Src0, CmpSVT, ParentNode))) { 4092 FoldedBCast = tryFoldLoad(Root, ParentNode, Load, Tmp0, 4093 Tmp1, Tmp2, Tmp3, Tmp4); 4094 if (FoldedBCast) 4095 std::swap(Src0, Src1); 4096 } 4097 } 4098 } 4099 4100 auto getMaskRC = [](MVT MaskVT) { 4101 switch (MaskVT.SimpleTy) { 4102 default: llvm_unreachable("Unexpected VT!"); 4103 case MVT::v2i1: return X86::VK2RegClassID; 4104 case MVT::v4i1: return X86::VK4RegClassID; 4105 case MVT::v8i1: return X86::VK8RegClassID; 4106 case MVT::v16i1: return X86::VK16RegClassID; 4107 case MVT::v32i1: return X86::VK32RegClassID; 4108 case MVT::v64i1: return X86::VK64RegClassID; 4109 } 4110 }; 4111 4112 bool IsMasked = InMask.getNode() != nullptr; 4113 4114 SDLoc dl(Root); 4115 4116 MVT ResVT = Setcc.getSimpleValueType(); 4117 MVT MaskVT = ResVT; 4118 if (Widen) { 4119 // Widen the inputs using insert_subreg or copy_to_regclass. 4120 unsigned Scale = CmpVT.is128BitVector() ? 4 : 2; 4121 unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm; 4122 unsigned NumElts = CmpVT.getVectorNumElements() * Scale; 4123 CmpVT = MVT::getVectorVT(CmpSVT, NumElts); 4124 MaskVT = MVT::getVectorVT(MVT::i1, NumElts); 4125 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl, 4126 CmpVT), 0); 4127 Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0); 4128 4129 assert(!FoldedLoad && "Shouldn't have folded the load"); 4130 if (!FoldedBCast) 4131 Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1); 4132 4133 if (IsMasked) { 4134 // Widen the mask. 4135 unsigned RegClass = getMaskRC(MaskVT); 4136 SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32); 4137 InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, 4138 dl, MaskVT, InMask, RC), 0); 4139 } 4140 } 4141 4142 bool IsTestN = CC == ISD::SETEQ; 4143 unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast, 4144 IsMasked); 4145 4146 MachineSDNode *CNode; 4147 if (FoldedLoad || FoldedBCast) { 4148 SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other); 4149 4150 if (IsMasked) { 4151 SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, 4152 Load.getOperand(0) }; 4153 CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); 4154 } else { 4155 SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, 4156 Load.getOperand(0) }; 4157 CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); 4158 } 4159 4160 // Update the chain. 4161 ReplaceUses(Load.getValue(1), SDValue(CNode, 1)); 4162 // Record the mem-refs 4163 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(Load)->getMemOperand()}); 4164 } else { 4165 if (IsMasked) 4166 CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1); 4167 else 4168 CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, Src0, Src1); 4169 } 4170 4171 // If we widened, we need to shrink the mask VT. 
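  // E.g. a v4i32 compare without VLX was widened to v16i32 above and therefore
  // produced a v16i1 mask; copying it to the VK4 register class recovers the
  // original v4i1 result type.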
4172 if (Widen) { 4173 unsigned RegClass = getMaskRC(ResVT); 4174 SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32); 4175 CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, 4176 dl, ResVT, SDValue(CNode, 0), RC); 4177 } 4178 4179 ReplaceUses(SDValue(Root, 0), SDValue(CNode, 0)); 4180 CurDAG->RemoveDeadNode(Root); 4181 return true; 4182 } 4183 4184 void X86DAGToDAGISel::Select(SDNode *Node) { 4185 MVT NVT = Node->getSimpleValueType(0); 4186 unsigned Opcode = Node->getOpcode(); 4187 SDLoc dl(Node); 4188 4189 if (Node->isMachineOpcode()) { 4190 LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n'); 4191 Node->setNodeId(-1); 4192 return; // Already selected. 4193 } 4194 4195 switch (Opcode) { 4196 default: break; 4197 case ISD::INTRINSIC_VOID: { 4198 unsigned IntNo = Node->getConstantOperandVal(1); 4199 switch (IntNo) { 4200 default: break; 4201 case Intrinsic::x86_sse3_monitor: 4202 case Intrinsic::x86_monitorx: 4203 case Intrinsic::x86_clzero: { 4204 bool Use64BitPtr = Node->getOperand(2).getValueType() == MVT::i64; 4205 4206 unsigned Opc = 0; 4207 switch (IntNo) { 4208 case Intrinsic::x86_sse3_monitor: 4209 if (!Subtarget->hasSSE3()) 4210 break; 4211 Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr; 4212 break; 4213 case Intrinsic::x86_monitorx: 4214 if (!Subtarget->hasMWAITX()) 4215 break; 4216 Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr; 4217 break; 4218 case Intrinsic::x86_clzero: 4219 if (!Subtarget->hasCLZERO()) 4220 break; 4221 Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r; 4222 break; 4223 } 4224 4225 if (Opc) { 4226 unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX; 4227 SDValue Chain = CurDAG->getCopyToReg(Node->getOperand(0), dl, PtrReg, 4228 Node->getOperand(2), SDValue()); 4229 SDValue InFlag = Chain.getValue(1); 4230 4231 if (IntNo == Intrinsic::x86_sse3_monitor || 4232 IntNo == Intrinsic::x86_monitorx) { 4233 // Copy the other two operands to ECX and EDX. 4234 Chain = CurDAG->getCopyToReg(Chain, dl, X86::ECX, Node->getOperand(3), 4235 InFlag); 4236 InFlag = Chain.getValue(1); 4237 Chain = CurDAG->getCopyToReg(Chain, dl, X86::EDX, Node->getOperand(4), 4238 InFlag); 4239 InFlag = Chain.getValue(1); 4240 } 4241 4242 MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, 4243 { Chain, InFlag}); 4244 ReplaceNode(Node, CNode); 4245 return; 4246 } 4247 } 4248 } 4249 4250 break; 4251 } 4252 case ISD::BRIND: { 4253 if (Subtarget->isTargetNaCl()) 4254 // NaCl has its own pass where jmp %r32 are converted to jmp %r64. We 4255 // leave the instruction alone. 4256 break; 4257 if (Subtarget->isTarget64BitILP32()) { 4258 // Converts a 32-bit register to a 64-bit, zero-extended version of 4259 // it. This is needed because x86-64 can do many things, but jmp %r32 4260 // ain't one of them. 4261 const SDValue &Target = Node->getOperand(1); 4262 assert(Target.getSimpleValueType() == llvm::MVT::i32); 4263 SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, EVT(MVT::i64)); 4264 SDValue Brind = CurDAG->getNode(ISD::BRIND, dl, MVT::Other, 4265 Node->getOperand(0), ZextTarget); 4266 ReplaceNode(Node, Brind.getNode()); 4267 SelectCode(ZextTarget.getNode()); 4268 SelectCode(Brind.getNode()); 4269 return; 4270 } 4271 break; 4272 } 4273 case X86ISD::GlobalBaseReg: 4274 ReplaceNode(Node, getGlobalBaseReg()); 4275 return; 4276 4277 case ISD::BITCAST: 4278 // Just drop all 128/256/512-bit bitcasts. 
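    // A bitcast between vector types of the same width (e.g. v4i32 <-> v2i64)
    // is a no-op at the register level, since both live in the same
    // XMM/YMM/ZMM register, so no instruction needs to be emitted for it.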
4279 if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() || 4280 NVT == MVT::f128) { 4281 ReplaceUses(SDValue(Node, 0), Node->getOperand(0)); 4282 CurDAG->RemoveDeadNode(Node); 4283 return; 4284 } 4285 break; 4286 4287 case ISD::VSELECT: { 4288 // Replace VSELECT with non-mask conditions with BLENDV. 4289 if (Node->getOperand(0).getValueType().getVectorElementType() == MVT::i1) 4290 break; 4291 4292 assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!"); 4293 SDValue Blendv = CurDAG->getNode( 4294 X86ISD::BLENDV, SDLoc(Node), Node->getValueType(0), Node->getOperand(0), 4295 Node->getOperand(1), Node->getOperand(2)); 4296 ReplaceNode(Node, Blendv.getNode()); 4297 SelectCode(Blendv.getNode()); 4298 // We already called ReplaceUses. 4299 return; 4300 } 4301 4302 case ISD::SRL: 4303 if (matchBitExtract(Node)) 4304 return; 4305 LLVM_FALLTHROUGH; 4306 case ISD::SRA: 4307 case ISD::SHL: 4308 if (tryShiftAmountMod(Node)) 4309 return; 4310 break; 4311 4312 case ISD::AND: 4313 if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) { 4314 // Try to form a masked VPTESTM. Operands can be in either order. 4315 SDValue N0 = Node->getOperand(0); 4316 SDValue N1 = Node->getOperand(1); 4317 if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() && 4318 tryVPTESTM(Node, N0, N1)) 4319 return; 4320 if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() && 4321 tryVPTESTM(Node, N1, N0)) 4322 return; 4323 } 4324 4325 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) { 4326 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0)); 4327 CurDAG->RemoveDeadNode(Node); 4328 return; 4329 } 4330 if (matchBitExtract(Node)) 4331 return; 4332 if (AndImmShrink && shrinkAndImmediate(Node)) 4333 return; 4334 4335 LLVM_FALLTHROUGH; 4336 case ISD::OR: 4337 case ISD::XOR: 4338 if (tryShrinkShlLogicImm(Node)) 4339 return; 4340 4341 LLVM_FALLTHROUGH; 4342 case ISD::ADD: 4343 case ISD::SUB: { 4344 // Try to avoid folding immediates with multiple uses for optsize. 4345 // This code tries to select the register form directly to avoid going 4346 // through the isel table, which might fold the immediate. We can't change 4347 // the add/sub/and/or/xor-with-immediate patterns in the 4348 // tablegen files to check the immediate use count without making the patterns 4349 // unavailable to the fast-isel table. 4350 if (!OptForSize) 4351 break; 4352 4353 // Only handle i8/i16/i32/i64. 4354 if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64) 4355 break; 4356 4357 SDValue N0 = Node->getOperand(0); 4358 SDValue N1 = Node->getOperand(1); 4359 4360 ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1); 4361 if (!Cst) 4362 break; 4363 4364 int64_t Val = Cst->getSExtValue(); 4365 4366 // Make sure it's an immediate that is considered foldable. 4367 // FIXME: Handle unsigned 32 bit immediates for 64-bit AND. 4368 if (!isInt<8>(Val) && !isInt<32>(Val)) 4369 break; 4370 4371 // Check if we should avoid folding this immediate. 4372 if (!shouldAvoidImmediateInstFormsForSize(N1.getNode())) 4373 break; 4374 4375 // We should not fold the immediate, so we need the register form instead.
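    // The idea: if the same constant feeds several of these operations, it is
    // smaller under optsize to materialize it once in a register and use the
    // rr form than to repeat a multi-byte immediate in every instruction.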
4376 unsigned ROpc, MOpc; 4377 switch (NVT.SimpleTy) { 4378 default: llvm_unreachable("Unexpected VT!"); 4379 case MVT::i8: 4380 switch (Opcode) { 4381 default: llvm_unreachable("Unexpected opcode!"); 4382 case ISD::ADD: ROpc = X86::ADD8rr; MOpc = X86::ADD8rm; break; 4383 case ISD::SUB: ROpc = X86::SUB8rr; MOpc = X86::SUB8rm; break; 4384 case ISD::AND: ROpc = X86::AND8rr; MOpc = X86::AND8rm; break; 4385 case ISD::OR: ROpc = X86::OR8rr; MOpc = X86::OR8rm; break; 4386 case ISD::XOR: ROpc = X86::XOR8rr; MOpc = X86::XOR8rm; break; 4387 } 4388 break; 4389 case MVT::i16: 4390 switch (Opcode) { 4391 default: llvm_unreachable("Unexpected opcode!"); 4392 case ISD::ADD: ROpc = X86::ADD16rr; MOpc = X86::ADD16rm; break; 4393 case ISD::SUB: ROpc = X86::SUB16rr; MOpc = X86::SUB16rm; break; 4394 case ISD::AND: ROpc = X86::AND16rr; MOpc = X86::AND16rm; break; 4395 case ISD::OR: ROpc = X86::OR16rr; MOpc = X86::OR16rm; break; 4396 case ISD::XOR: ROpc = X86::XOR16rr; MOpc = X86::XOR16rm; break; 4397 } 4398 break; 4399 case MVT::i32: 4400 switch (Opcode) { 4401 default: llvm_unreachable("Unexpected opcode!"); 4402 case ISD::ADD: ROpc = X86::ADD32rr; MOpc = X86::ADD32rm; break; 4403 case ISD::SUB: ROpc = X86::SUB32rr; MOpc = X86::SUB32rm; break; 4404 case ISD::AND: ROpc = X86::AND32rr; MOpc = X86::AND32rm; break; 4405 case ISD::OR: ROpc = X86::OR32rr; MOpc = X86::OR32rm; break; 4406 case ISD::XOR: ROpc = X86::XOR32rr; MOpc = X86::XOR32rm; break; 4407 } 4408 break; 4409 case MVT::i64: 4410 switch (Opcode) { 4411 default: llvm_unreachable("Unexpected opcode!"); 4412 case ISD::ADD: ROpc = X86::ADD64rr; MOpc = X86::ADD64rm; break; 4413 case ISD::SUB: ROpc = X86::SUB64rr; MOpc = X86::SUB64rm; break; 4414 case ISD::AND: ROpc = X86::AND64rr; MOpc = X86::AND64rm; break; 4415 case ISD::OR: ROpc = X86::OR64rr; MOpc = X86::OR64rm; break; 4416 case ISD::XOR: ROpc = X86::XOR64rr; MOpc = X86::XOR64rm; break; 4417 } 4418 break; 4419 } 4420 4421 // Ok this is a AND/OR/XOR/ADD/SUB with constant. 4422 4423 // If this is a not a subtract, we can still try to fold a load. 4424 if (Opcode != ISD::SUB) { 4425 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; 4426 if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { 4427 SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) }; 4428 SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other); 4429 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); 4430 // Update the chain. 4431 ReplaceUses(N0.getValue(1), SDValue(CNode, 2)); 4432 // Record the mem-refs 4433 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N0)->getMemOperand()}); 4434 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0)); 4435 CurDAG->RemoveDeadNode(Node); 4436 return; 4437 } 4438 } 4439 4440 CurDAG->SelectNodeTo(Node, ROpc, NVT, MVT::i32, N0, N1); 4441 return; 4442 } 4443 4444 case X86ISD::SMUL: 4445 // i16/i32/i64 are handled with isel patterns. 4446 if (NVT != MVT::i8) 4447 break; 4448 LLVM_FALLTHROUGH; 4449 case X86ISD::UMUL: { 4450 SDValue N0 = Node->getOperand(0); 4451 SDValue N1 = Node->getOperand(1); 4452 4453 unsigned LoReg, ROpc, MOpc; 4454 switch (NVT.SimpleTy) { 4455 default: llvm_unreachable("Unsupported VT!"); 4456 case MVT::i8: 4457 LoReg = X86::AL; 4458 ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r; 4459 MOpc = Opcode == X86ISD::SMUL ? 
X86::IMUL8m : X86::MUL8m; 4460 break; 4461 case MVT::i16: 4462 LoReg = X86::AX; 4463 ROpc = X86::MUL16r; 4464 MOpc = X86::MUL16m; 4465 break; 4466 case MVT::i32: 4467 LoReg = X86::EAX; 4468 ROpc = X86::MUL32r; 4469 MOpc = X86::MUL32m; 4470 break; 4471 case MVT::i64: 4472 LoReg = X86::RAX; 4473 ROpc = X86::MUL64r; 4474 MOpc = X86::MUL64m; 4475 break; 4476 } 4477 4478 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; 4479 bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); 4480 // Multiply is commmutative. 4481 if (!FoldedLoad) { 4482 FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); 4483 if (FoldedLoad) 4484 std::swap(N0, N1); 4485 } 4486 4487 SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg, 4488 N0, SDValue()).getValue(1); 4489 4490 MachineSDNode *CNode; 4491 if (FoldedLoad) { 4492 // i16/i32/i64 use an instruction that produces a low and high result even 4493 // though only the low result is used. 4494 SDVTList VTs; 4495 if (NVT == MVT::i8) 4496 VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other); 4497 else 4498 VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other); 4499 4500 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0), 4501 InFlag }; 4502 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); 4503 4504 // Update the chain. 4505 ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 2 : 3)); 4506 // Record the mem-refs 4507 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()}); 4508 } else { 4509 // i16/i32/i64 use an instruction that produces a low and high result even 4510 // though only the low result is used. 4511 SDVTList VTs; 4512 if (NVT == MVT::i8) 4513 VTs = CurDAG->getVTList(NVT, MVT::i32); 4514 else 4515 VTs = CurDAG->getVTList(NVT, NVT, MVT::i32); 4516 4517 CNode = CurDAG->getMachineNode(ROpc, dl, VTs, {N1, InFlag}); 4518 } 4519 4520 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0)); 4521 ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 1 : 2)); 4522 CurDAG->RemoveDeadNode(Node); 4523 return; 4524 } 4525 4526 case ISD::SMUL_LOHI: 4527 case ISD::UMUL_LOHI: { 4528 SDValue N0 = Node->getOperand(0); 4529 SDValue N1 = Node->getOperand(1); 4530 4531 unsigned Opc, MOpc; 4532 bool isSigned = Opcode == ISD::SMUL_LOHI; 4533 if (!isSigned) { 4534 switch (NVT.SimpleTy) { 4535 default: llvm_unreachable("Unsupported VT!"); 4536 case MVT::i32: Opc = X86::MUL32r; MOpc = X86::MUL32m; break; 4537 case MVT::i64: Opc = X86::MUL64r; MOpc = X86::MUL64m; break; 4538 } 4539 } else { 4540 switch (NVT.SimpleTy) { 4541 default: llvm_unreachable("Unsupported VT!"); 4542 case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break; 4543 case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break; 4544 } 4545 } 4546 4547 unsigned SrcReg, LoReg, HiReg; 4548 switch (Opc) { 4549 default: llvm_unreachable("Unknown MUL opcode!"); 4550 case X86::IMUL32r: 4551 case X86::MUL32r: 4552 SrcReg = LoReg = X86::EAX; HiReg = X86::EDX; 4553 break; 4554 case X86::IMUL64r: 4555 case X86::MUL64r: 4556 SrcReg = LoReg = X86::RAX; HiReg = X86::RDX; 4557 break; 4558 } 4559 4560 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; 4561 bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); 4562 // Multiply is commmutative. 
4563 if (!foldedLoad) { 4564 foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); 4565 if (foldedLoad) 4566 std::swap(N0, N1); 4567 } 4568 4569 SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, SrcReg, 4570 N0, SDValue()).getValue(1); 4571 if (foldedLoad) { 4572 SDValue Chain; 4573 MachineSDNode *CNode = nullptr; 4574 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0), 4575 InFlag }; 4576 SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue); 4577 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); 4578 Chain = SDValue(CNode, 0); 4579 InFlag = SDValue(CNode, 1); 4580 4581 // Update the chain. 4582 ReplaceUses(N1.getValue(1), Chain); 4583 // Record the mem-refs 4584 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()}); 4585 } else { 4586 SDValue Ops[] = { N1, InFlag }; 4587 SDVTList VTs = CurDAG->getVTList(MVT::Glue); 4588 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); 4589 InFlag = SDValue(CNode, 0); 4590 } 4591 4592 // Copy the low half of the result, if it is needed. 4593 if (!SDValue(Node, 0).use_empty()) { 4594 assert(LoReg && "Register for low half is not defined!"); 4595 SDValue ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg, 4596 NVT, InFlag); 4597 InFlag = ResLo.getValue(2); 4598 ReplaceUses(SDValue(Node, 0), ResLo); 4599 LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG); 4600 dbgs() << '\n'); 4601 } 4602 // Copy the high half of the result, if it is needed. 4603 if (!SDValue(Node, 1).use_empty()) { 4604 assert(HiReg && "Register for high half is not defined!"); 4605 SDValue ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg, 4606 NVT, InFlag); 4607 InFlag = ResHi.getValue(2); 4608 ReplaceUses(SDValue(Node, 1), ResHi); 4609 LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG); 4610 dbgs() << '\n'); 4611 } 4612 4613 CurDAG->RemoveDeadNode(Node); 4614 return; 4615 } 4616 4617 case ISD::SDIVREM: 4618 case ISD::UDIVREM: { 4619 SDValue N0 = Node->getOperand(0); 4620 SDValue N1 = Node->getOperand(1); 4621 4622 unsigned Opc, MOpc; 4623 bool isSigned = Opcode == ISD::SDIVREM; 4624 if (!isSigned) { 4625 switch (NVT.SimpleTy) { 4626 default: llvm_unreachable("Unsupported VT!"); 4627 case MVT::i8: Opc = X86::DIV8r; MOpc = X86::DIV8m; break; 4628 case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break; 4629 case MVT::i32: Opc = X86::DIV32r; MOpc = X86::DIV32m; break; 4630 case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break; 4631 } 4632 } else { 4633 switch (NVT.SimpleTy) { 4634 default: llvm_unreachable("Unsupported VT!"); 4635 case MVT::i8: Opc = X86::IDIV8r; MOpc = X86::IDIV8m; break; 4636 case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break; 4637 case MVT::i32: Opc = X86::IDIV32r; MOpc = X86::IDIV32m; break; 4638 case MVT::i64: Opc = X86::IDIV64r; MOpc = X86::IDIV64m; break; 4639 } 4640 } 4641 4642 unsigned LoReg, HiReg, ClrReg; 4643 unsigned SExtOpcode; 4644 switch (NVT.SimpleTy) { 4645 default: llvm_unreachable("Unsupported VT!"); 4646 case MVT::i8: 4647 LoReg = X86::AL; ClrReg = HiReg = X86::AH; 4648 SExtOpcode = X86::CBW; 4649 break; 4650 case MVT::i16: 4651 LoReg = X86::AX; HiReg = X86::DX; 4652 ClrReg = X86::DX; 4653 SExtOpcode = X86::CWD; 4654 break; 4655 case MVT::i32: 4656 LoReg = X86::EAX; ClrReg = HiReg = X86::EDX; 4657 SExtOpcode = X86::CDQ; 4658 break; 4659 case MVT::i64: 4660 LoReg = X86::RAX; ClrReg = HiReg = X86::RDX; 4661 SExtOpcode = X86::CQO; 4662 break; 4663 } 4664 4665 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; 4666 bool foldedLoad = 
tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); 4667 bool signBitIsZero = CurDAG->SignBitIsZero(N0); 4668 4669 SDValue InFlag; 4670 if (NVT == MVT::i8 && (!isSigned || signBitIsZero)) { 4671 // Special case for div8, just use a move with zero extension to AX to 4672 // clear the upper 8 bits (AH). 4673 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain; 4674 MachineSDNode *Move; 4675 if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { 4676 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) }; 4677 Move = CurDAG->getMachineNode(X86::MOVZX32rm8, dl, MVT::i32, 4678 MVT::Other, Ops); 4679 Chain = SDValue(Move, 1); 4680 ReplaceUses(N0.getValue(1), Chain); 4681 // Record the mem-refs 4682 CurDAG->setNodeMemRefs(Move, {cast<LoadSDNode>(N0)->getMemOperand()}); 4683 } else { 4684 Move = CurDAG->getMachineNode(X86::MOVZX32rr8, dl, MVT::i32, N0); 4685 Chain = CurDAG->getEntryNode(); 4686 } 4687 Chain = CurDAG->getCopyToReg(Chain, dl, X86::EAX, SDValue(Move, 0), 4688 SDValue()); 4689 InFlag = Chain.getValue(1); 4690 } else { 4691 InFlag = 4692 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, 4693 LoReg, N0, SDValue()).getValue(1); 4694 if (isSigned && !signBitIsZero) { 4695 // Sign extend the low part into the high part. 4696 InFlag = 4697 SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0); 4698 } else { 4699 // Zero out the high part, effectively zero extending the input. 4700 SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, NVT), 0); 4701 switch (NVT.SimpleTy) { 4702 case MVT::i16: 4703 ClrNode = 4704 SDValue(CurDAG->getMachineNode( 4705 TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode, 4706 CurDAG->getTargetConstant(X86::sub_16bit, dl, 4707 MVT::i32)), 4708 0); 4709 break; 4710 case MVT::i32: 4711 break; 4712 case MVT::i64: 4713 ClrNode = 4714 SDValue(CurDAG->getMachineNode( 4715 TargetOpcode::SUBREG_TO_REG, dl, MVT::i64, 4716 CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode, 4717 CurDAG->getTargetConstant(X86::sub_32bit, dl, 4718 MVT::i32)), 4719 0); 4720 break; 4721 default: 4722 llvm_unreachable("Unexpected division source"); 4723 } 4724 4725 InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg, 4726 ClrNode, InFlag).getValue(1); 4727 } 4728 } 4729 4730 if (foldedLoad) { 4731 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0), 4732 InFlag }; 4733 MachineSDNode *CNode = 4734 CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops); 4735 InFlag = SDValue(CNode, 1); 4736 // Update the chain. 4737 ReplaceUses(N1.getValue(1), SDValue(CNode, 0)); 4738 // Record the mem-refs 4739 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()}); 4740 } else { 4741 InFlag = 4742 SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag), 0); 4743 } 4744 4745 // Prevent use of AH in a REX instruction by explicitly copying it to 4746 // an ABCD_L register. 4747 // 4748 // The current assumption of the register allocator is that isel 4749 // won't generate explicit references to the GR8_ABCD_H registers. If 4750 // the allocator and/or the backend get enhanced to be more robust in 4751 // that regard, this can be, and should be, removed. 4752 if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) { 4753 SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8); 4754 unsigned AHExtOpcode = 4755 isSigned ? 

      SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
                                             MVT::Glue, AHCopy, InFlag);
      SDValue Result(RNode, 0);
      InFlag = SDValue(RNode, 1);

      Result =
          CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);

      ReplaceUses(SDValue(Node, 1), Result);
      LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }
    // Copy the division (low) result, if it is needed.
    if (!SDValue(Node, 0).use_empty()) {
      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
                                              LoReg, NVT, InFlag);
      InFlag = Result.getValue(2);
      ReplaceUses(SDValue(Node, 0), Result);
      LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }
    // Copy the remainder (high) result, if it is needed.
    if (!SDValue(Node, 1).use_empty()) {
      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
                                              HiReg, NVT, InFlag);
      InFlag = Result.getValue(2);
      ReplaceUses(SDValue(Node, 1), Result);
      LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }
    CurDAG->RemoveDeadNode(Node);
    return;
  }

  case X86ISD::CMP: {
    SDValue N0 = Node->getOperand(0);
    SDValue N1 = Node->getOperand(1);

    // Optimizations for TEST compares.
    if (!isNullConstant(N1))
      break;

    // Save the original VT of the compare.
    MVT CmpVT = N0.getSimpleValueType();

    // If we are comparing (and (shr X, C), Mask) with 0, emit a BEXTR followed
    // by a test instruction. The test should be removed later by
    // analyzeCompare if we are using only the zero flag.
    // TODO: Should we check the users and use the BEXTR flags directly?
    if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
      if (MachineSDNode *NewNode = matchBEXTRFromAndImm(N0.getNode())) {
        unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
                                             : X86::TEST32rr;
        SDValue BEXTR = SDValue(NewNode, 0);
        NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR);
        ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
        CurDAG->RemoveDeadNode(Node);
        return;
      }
    }

    // We can peek through truncates, but we need to be careful below.
    if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
      N0 = N0.getOperand(0);

    // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
    // use a smaller encoding.
    // Look past the truncate if CMP is the only use of it.
    if (N0.getOpcode() == ISD::AND &&
        N0.getNode()->hasOneUse() &&
        N0.getValueType() != MVT::i8) {
      ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
      if (!C) break;
      uint64_t Mask = C->getZExtValue();

      // Check if we can replace AND+IMM64 with a shift. This is possible for
      // masks like 0xFF000000 or 0x00FFFFFF and if we care only about the
      // zero flag.
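      // Illustrative example: for a mask of 0xFFFFFFFF00000000 the AND is
      // dropped and we emit "shrq $32, %reg; testq %reg, %reg", which sets ZF
      // exactly when the original and+cmp would, without a 64-bit immediate.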
      if (CmpVT == MVT::i64 && !isInt<32>(Mask) &&
          onlyUsesZeroFlag(SDValue(Node, 0))) {
        if (isMask_64(~Mask)) {
          unsigned TrailingZeros = countTrailingZeros(Mask);
          SDValue Imm = CurDAG->getTargetConstant(TrailingZeros, dl, MVT::i64);
          SDValue Shift =
              SDValue(CurDAG->getMachineNode(X86::SHR64ri, dl, MVT::i64,
                                             MVT::i32, N0.getOperand(0), Imm),
                      0);
          MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
                                                       MVT::i32, Shift, Shift);
          ReplaceNode(Node, Test);
          return;
        }
        if (isMask_64(Mask)) {
          unsigned LeadingZeros = countLeadingZeros(Mask);
          SDValue Imm = CurDAG->getTargetConstant(LeadingZeros, dl, MVT::i64);
          SDValue Shift =
              SDValue(CurDAG->getMachineNode(X86::SHL64ri, dl, MVT::i64,
                                             MVT::i32, N0.getOperand(0), Imm),
                      0);
          MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
                                                       MVT::i32, Shift, Shift);
          ReplaceNode(Node, Test);
          return;
        }
      }

      MVT VT;
      int SubRegOp;
      unsigned ROpc, MOpc;

      // For each of these checks we need to be careful if the sign flag is
      // being used. It is only safe to use the sign flag in two conditions,
      // either the sign bit in the shrunken mask is zero or the final test
      // size is equal to the original compare size.

      if (isUInt<8>(Mask) &&
          (!(Mask & 0x80) || CmpVT == MVT::i8 ||
           hasNoSignFlagUses(SDValue(Node, 0)))) {
        // For example, convert "testl %eax, $8" to "testb %al, $8"
        VT = MVT::i8;
        SubRegOp = X86::sub_8bit;
        ROpc = X86::TEST8ri;
        MOpc = X86::TEST8mi;
      } else if (OptForMinSize && isUInt<16>(Mask) &&
                 (!(Mask & 0x8000) || CmpVT == MVT::i16 ||
                  hasNoSignFlagUses(SDValue(Node, 0)))) {
        // For example, convert "testl %eax, $32776" to "testw %ax, $32776".
        // NOTE: We only want to form TESTW instructions if optimizing for
        // min size. Otherwise we only save one byte and possibly get a length
        // changing prefix penalty in the decoders.
        VT = MVT::i16;
        SubRegOp = X86::sub_16bit;
        ROpc = X86::TEST16ri;
        MOpc = X86::TEST16mi;
      } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 &&
                 ((!(Mask & 0x80000000) &&
                   // Without minsize 16-bit Cmps can get here so we need to
                   // be sure we calculate the correct sign flag if needed.
                   (CmpVT != MVT::i16 || !(Mask & 0x8000))) ||
                  CmpVT == MVT::i32 ||
                  hasNoSignFlagUses(SDValue(Node, 0)))) {
        // For example, convert "testq %rax, $268468232" to
        // "testl %eax, $268468232".
        // NOTE: We only want to run this transform if N0 is 32 or 64 bits.
        // Otherwise, we find ourselves in a position where we have to do
        // promotion. If previous passes did not promote the and, we assume
        // they had a good reason not to and do not promote here.
        VT = MVT::i32;
        SubRegOp = X86::sub_32bit;
        ROpc = X86::TEST32ri;
        MOpc = X86::TEST32mi;
      } else {
        // No eligible transformation was found.
        break;
      }
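      // Illustrative sketch of the narrowing performed below (register form):
      //   (X86cmp (and X:i32, 8), 0) --> TEST8ri (EXTRACT_SUBREG X, sub_8bit), 8
      // i.e. the AND node is left to die and only the low sub-register is
      // tested against the shrunken immediate.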
      SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
      SDValue Reg = N0.getOperand(0);

      // Emit a testl or testw.
      MachineSDNode *NewNode;
      SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
      if (tryFoldLoad(Node, N0.getNode(), Reg, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
        SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
                          Reg.getOperand(0) };
        NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops);
        // Update the chain.
        ReplaceUses(Reg.getValue(1), SDValue(NewNode, 1));
        // Record the mem-refs
        CurDAG->setNodeMemRefs(NewNode,
                               {cast<LoadSDNode>(Reg)->getMemOperand()});
      } else {
        // Extract the subregister if necessary.
        if (N0.getValueType() != VT)
          Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);

        NewNode = CurDAG->getMachineNode(ROpc, dl, MVT::i32, Reg, Imm);
      }
      // Replace CMP with TEST.
      ReplaceNode(Node, NewNode);
      return;
    }
    break;
  }
  case X86ISD::PCMPISTR: {
    if (!Subtarget->hasSSE42())
      break;

    bool NeedIndex = !SDValue(Node, 0).use_empty();
    bool NeedMask = !SDValue(Node, 1).use_empty();
    // We can't fold a load if we are going to make two instructions.
    bool MayFoldLoad = !NeedIndex || !NeedMask;

    MachineSDNode *CNode;
    if (NeedMask) {
      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrr : X86::PCMPISTRMrr;
      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrm : X86::PCMPISTRMrm;
      CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
      ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
    }
    if (NeedIndex || !NeedMask) {
      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrr : X86::PCMPISTRIrr;
      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrm : X86::PCMPISTRIrm;
      CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
      ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
    }

    // Connect the flag usage to the last instruction created.
    ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case X86ISD::PCMPESTR: {
    if (!Subtarget->hasSSE42())
      break;

    // Copy the two implicit register inputs.
    SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX,
                                          Node->getOperand(1),
                                          SDValue()).getValue(1);
    InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
                                  Node->getOperand(3), InFlag).getValue(1);

    bool NeedIndex = !SDValue(Node, 0).use_empty();
    bool NeedMask = !SDValue(Node, 1).use_empty();
    // We can't fold a load if we are going to make two instructions.
    bool MayFoldLoad = !NeedIndex || !NeedMask;

    MachineSDNode *CNode;
    if (NeedMask) {
      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrr : X86::PCMPESTRMrr;
      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrm : X86::PCMPESTRMrm;
      CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node,
                           InFlag);
      ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
    }
    if (NeedIndex || !NeedMask) {
      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrr : X86::PCMPESTRIrr;
      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrm : X86::PCMPESTRIrm;
      CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InFlag);
      ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
    }
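    // Note: when both the index and the mask are needed, two machine nodes are
    // created above and CNode refers to whichever was created last; PCMPESTRI
    // and PCMPESTRM compute identical EFLAGS from the same inputs, so taking
    // the flags result from the last one is sufficient.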
    // Connect the flag usage to the last instruction created.
    ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
    CurDAG->RemoveDeadNode(Node);
    return;
  }

  case ISD::SETCC: {
    if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue()))
      return;

    break;
  }

  case ISD::STORE:
    if (foldLoadStoreIntoMemOperand(Node))
      return;
    break;
  case ISD::FCEIL:
  case ISD::FFLOOR:
  case ISD::FTRUNC:
  case ISD::FNEARBYINT:
  case ISD::FRINT: {
    // Replace fp rounding nodes with their X86-specific equivalent so we
    // don't need 2 sets of patterns.
    // FIXME: This can only happen when the nodes started as STRICT_* and have
    // been mutated into their non-STRICT equivalents. Eventually this
    // mutation will be removed and we should switch the STRICT_ nodes to a
    // strict version of RNDSCALE in PreProcessISelDAG.
    unsigned Imm;
    switch (Node->getOpcode()) {
    default: llvm_unreachable("Unexpected opcode!");
    case ISD::FCEIL:      Imm = 0xA; break;
    case ISD::FFLOOR:     Imm = 0x9; break;
    case ISD::FTRUNC:     Imm = 0xB; break;
    case ISD::FNEARBYINT: Imm = 0xC; break;
    case ISD::FRINT:      Imm = 0x4; break;
    }
    SDLoc dl(Node);
    SDValue Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl,
                                  Node->getValueType(0),
                                  Node->getOperand(0),
                                  CurDAG->getConstant(Imm, dl, MVT::i8));
    ReplaceNode(Node, Res.getNode());
    SelectCode(Res.getNode());
    return;
  }
  }

  SelectCode(Node);
}

bool X86DAGToDAGISel::
SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
                             std::vector<SDValue> &OutOps) {
  SDValue Op0, Op1, Op2, Op3, Op4;
  switch (ConstraintID) {
  default:
    llvm_unreachable("Unexpected asm memory constraint");
  case InlineAsm::Constraint_i:
    // FIXME: It seems strange that 'i' is needed here since it's supposed to
    // be an immediate and not a memory constraint.
    LLVM_FALLTHROUGH;
  case InlineAsm::Constraint_o: // offsetable        ??
  case InlineAsm::Constraint_v: // not offsetable    ??
  case InlineAsm::Constraint_m: // memory
  case InlineAsm::Constraint_X:
    if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
      return true;
    break;
  }

  OutOps.push_back(Op0);
  OutOps.push_back(Op1);
  OutOps.push_back(Op2);
  OutOps.push_back(Op3);
  OutOps.push_back(Op4);
  return false;
}

/// This pass converts a legalized DAG into a X86-specific DAG,
/// ready for instruction scheduling.
FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
                                     CodeGenOpt::Level OptLevel) {
  return new X86DAGToDAGISel(TM, OptLevel);
}