xref: /freebsd/contrib/llvm-project/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp (revision 700637cbb5e582861067a11aaca4d053546871d2)
1 //===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines a DAG pattern matching instruction selector for X86,
10 // converting from a legalized dag to a X86 dag.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "X86ISelDAGToDAG.h"
15 #include "X86.h"
16 #include "X86MachineFunctionInfo.h"
17 #include "X86Subtarget.h"
18 #include "X86TargetMachine.h"
19 #include "llvm/ADT/Statistic.h"
20 #include "llvm/CodeGen/MachineModuleInfo.h"
21 #include "llvm/CodeGen/SelectionDAGISel.h"
22 #include "llvm/Config/llvm-config.h"
23 #include "llvm/IR/ConstantRange.h"
24 #include "llvm/IR/Function.h"
25 #include "llvm/IR/Instructions.h"
26 #include "llvm/IR/Intrinsics.h"
27 #include "llvm/IR/IntrinsicsX86.h"
28 #include "llvm/IR/Module.h"
29 #include "llvm/IR/Type.h"
30 #include "llvm/Support/Debug.h"
31 #include "llvm/Support/ErrorHandling.h"
32 #include "llvm/Support/KnownBits.h"
33 #include "llvm/Support/MathExtras.h"
34 #include <cstdint>
35 
36 using namespace llvm;
37 
38 #define DEBUG_TYPE "x86-isel"
39 #define PASS_NAME "X86 DAG->DAG Instruction Selection"
40 
STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");

// Command-line knob: allow rewriting AND mask immediates by setting bits that
// are known to be irrelevant, which can shrink the immediate's encoding.
static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(true),
    cl::desc("Enable setting constant bits to reduce size of mask immediates"),
    cl::Hidden);

// Command-line knob: allow promoting an aligned any-extending load to a
// wider plain load.
static cl::opt<bool> EnablePromoteAnyextLoad(
    "x86-promote-anyext-load", cl::init(true),
    cl::desc("Enable promoting aligned anyext load to wider load"), cl::Hidden);

// Declared here, defined elsewhere in the X86 backend; controls indirect
// branch tracking (endbr) related behavior.
extern cl::opt<bool> IndirectBranchTracking;
52 
53 //===----------------------------------------------------------------------===//
54 //                      Pattern Matcher Implementation
55 //===----------------------------------------------------------------------===//
56 
namespace {
  /// This corresponds to X86AddressMode, but uses SDValue's instead of register
  /// numbers for the leaves of the matched tree.
  struct X86ISelAddressMode {
    enum {
      RegBase,
      FrameIndexBase
    } BaseType = RegBase;

    // This is really a union, discriminated by BaseType!
    SDValue Base_Reg;       // Valid when BaseType == RegBase.
    int Base_FrameIndex = 0; // Valid when BaseType == FrameIndexBase.

    unsigned Scale = 1;     // Index scale factor.
    SDValue IndexReg;       // Scaled index register, if any.
    int32_t Disp = 0;       // Constant displacement.
    SDValue Segment;        // Segment register override, if any.
    // Symbolic displacement leaves; see hasSymbolicDisplacement().
    const GlobalValue *GV = nullptr;
    const Constant *CP = nullptr;
    const BlockAddress *BlockAddr = nullptr;
    const char *ES = nullptr;
    MCSymbol *MCSym = nullptr;
    int JT = -1;
    Align Alignment;            // CP alignment.
    unsigned char SymbolFlags = X86II::MO_NO_FLAG;  // X86II::MO_*
    // If set, IndexReg must be negated when the final address is formed
    // (see getAddressOperands()).
    bool NegateIndex = false;

    X86ISelAddressMode() = default;

    /// True if any symbolic displacement (global, constant pool, external
    /// symbol, MC symbol, jump table, or block address) has been matched.
    bool hasSymbolicDisplacement() const {
      return GV != nullptr || CP != nullptr || ES != nullptr ||
             MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
    }

    /// True if a base (register or frame index) or an index register has
    /// already been selected.
    bool hasBaseOrIndexReg() const {
      return BaseType == FrameIndexBase ||
             IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
    }

    /// Return true if this addressing mode is already RIP-relative.
    bool isRIPRelative() const {
      if (BaseType != RegBase) return false;
      if (RegisterSDNode *RegNode =
            dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
        return RegNode->getReg() == X86::RIP;
      return false;
    }

    /// Switch to register-base mode with the given base register.
    void setBaseReg(SDValue Reg) {
      BaseType = RegBase;
      Base_Reg = Reg;
    }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    /// Dump the in-progress addressing mode for debugging (debug builds only).
    void dump(SelectionDAG *DAG = nullptr) {
      dbgs() << "X86ISelAddressMode " << this << '\n';
      dbgs() << "Base_Reg ";
      if (Base_Reg.getNode())
        Base_Reg.getNode()->dump(DAG);
      else
        dbgs() << "nul\n";
      if (BaseType == FrameIndexBase)
        dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
      dbgs() << " Scale " << Scale << '\n'
             << "IndexReg ";
      if (NegateIndex)
        dbgs() << "negate ";
      if (IndexReg.getNode())
        IndexReg.getNode()->dump(DAG);
      else
        dbgs() << "nul\n";
      dbgs() << " Disp " << Disp << '\n'
             << "GV ";
      if (GV)
        GV->dump();
      else
        dbgs() << "nul";
      dbgs() << " CP ";
      if (CP)
        CP->dump();
      else
        dbgs() << "nul";
      dbgs() << '\n'
             << "ES ";
      if (ES)
        dbgs() << ES;
      else
        dbgs() << "nul";
      dbgs() << " MCSym ";
      if (MCSym)
        dbgs() << MCSym;
      else
        dbgs() << "nul";
      dbgs() << " JT" << JT << " Align" << Alignment.value() << '\n';
    }
#endif
  };
}
155 
namespace {
  //===--------------------------------------------------------------------===//
  /// ISel - X86-specific code to select X86 machine instructions for
  /// SelectionDAG operations.
  ///
  class X86DAGToDAGISel final : public SelectionDAGISel {
    /// Keep a pointer to the X86Subtarget around so that we can
    /// make the right decision when generating code for different targets.
    const X86Subtarget *Subtarget;

    /// If true, selector should try to optimize for minimum code size.
    bool OptForMinSize;

    /// Disable direct TLS access through segment registers.
    bool IndirectTlsSegRefs;

  public:
    X86DAGToDAGISel() = delete;

    explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOptLevel OptLevel)
        : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr),
          OptForMinSize(false), IndirectTlsSegRefs(false) {}

    bool runOnMachineFunction(MachineFunction &MF) override {
      // Reset the subtarget each time through.
      Subtarget = &MF.getSubtarget<X86Subtarget>();
      IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
                             "indirect-tls-seg-refs");

      // OptFor[Min]Size are used in pattern predicates that isel is matching.
      OptForMinSize = MF.getFunction().hasMinSize();
      return SelectionDAGISel::runOnMachineFunction(MF);
    }

    void emitFunctionEntryCode() override;

    bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;

    void PreprocessISelDAG() override;
    void PostprocessISelDAG() override;

// Include the pieces autogenerated from the target description.
#include "X86GenDAGISel.inc"

  private:
    void Select(SDNode *N) override;

    // Addressing-mode matching helpers. The match* family incrementally fills
    // in an X86ISelAddressMode; the select* family produces the final operand
    // SDValues consumed by the matched instruction patterns.
    bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
    bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
                            bool AllowSegmentRegForX32 = false);
    bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
    bool matchAddress(SDValue N, X86ISelAddressMode &AM);
    bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
    bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth);
    SDValue matchIndexRecursively(SDValue N, X86ISelAddressMode &AM,
                                  unsigned Depth);
    bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
                                 unsigned Depth);
    bool matchVectorAddressRecursively(SDValue N, X86ISelAddressMode &AM,
                                       unsigned Depth);
    bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
    bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
                    SDValue &Scale, SDValue &Index, SDValue &Disp,
                    SDValue &Segment);
    bool selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, SDValue IndexOp,
                          SDValue ScaleOp, SDValue &Base, SDValue &Scale,
                          SDValue &Index, SDValue &Disp, SDValue &Segment);
    bool selectMOV64Imm32(SDValue N, SDValue &Imm);
    bool selectLEAAddr(SDValue N, SDValue &Base,
                       SDValue &Scale, SDValue &Index, SDValue &Disp,
                       SDValue &Segment);
    bool selectLEA64_Addr(SDValue N, SDValue &Base, SDValue &Scale,
                          SDValue &Index, SDValue &Disp, SDValue &Segment);
    bool selectTLSADDRAddr(SDValue N, SDValue &Base,
                           SDValue &Scale, SDValue &Index, SDValue &Disp,
                           SDValue &Segment);
    bool selectRelocImm(SDValue N, SDValue &Op);

    bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
                     SDValue &Base, SDValue &Scale,
                     SDValue &Index, SDValue &Disp,
                     SDValue &Segment);

    // Convenience method where P is also root.
    bool tryFoldLoad(SDNode *P, SDValue N,
                     SDValue &Base, SDValue &Scale,
                     SDValue &Index, SDValue &Disp,
                     SDValue &Segment) {
      return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
    }

    bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
                          SDValue &Base, SDValue &Scale,
                          SDValue &Index, SDValue &Disp,
                          SDValue &Segment);

    bool isProfitableToFormMaskedOp(SDNode *N) const;

    /// Implement addressing mode selection for inline asm expressions.
    bool SelectInlineAsmMemoryOperand(const SDValue &Op,
                                      InlineAsm::ConstraintCode ConstraintID,
                                      std::vector<SDValue> &OutOps) override;

    void emitSpecialCodeForMain();

    /// Materialize a completed X86ISelAddressMode as the five memory operands
    /// (Base, Scale, Index, Disp, Segment) used by X86 memory instructions.
    /// Absent components are filled with a zero register / zero constant, and
    /// a requested index negation is emitted as a NEG machine node here.
    inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
                                   MVT VT, SDValue &Base, SDValue &Scale,
                                   SDValue &Index, SDValue &Disp,
                                   SDValue &Segment) {
      if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
        Base = CurDAG->getTargetFrameIndex(
            AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout()));
      else if (AM.Base_Reg.getNode())
        Base = AM.Base_Reg;
      else
        Base = CurDAG->getRegister(0, VT);

      Scale = getI8Imm(AM.Scale, DL);

#define GET_ND_IF_ENABLED(OPC) (Subtarget->hasNDD() ? OPC##_ND : OPC)
      // Negate the index if needed.
      if (AM.NegateIndex) {
        // Pick the NEG opcode matching the pointer width; use the NDD
        // (new data destination) form when the subtarget supports it.
        unsigned NegOpc;
        switch (VT.SimpleTy) {
        default:
          llvm_unreachable("Unsupported VT!");
        case MVT::i64:
          NegOpc = GET_ND_IF_ENABLED(X86::NEG64r);
          break;
        case MVT::i32:
          NegOpc = GET_ND_IF_ENABLED(X86::NEG32r);
          break;
        case MVT::i16:
          NegOpc = GET_ND_IF_ENABLED(X86::NEG16r);
          break;
        case MVT::i8:
          NegOpc = GET_ND_IF_ENABLED(X86::NEG8r);
          break;
        }
        SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32,
                                                     AM.IndexReg), 0);
        AM.IndexReg = Neg;
      }

      if (AM.IndexReg.getNode())
        Index = AM.IndexReg;
      else
        Index = CurDAG->getRegister(0, VT);

      // These are 32-bit even in 64-bit mode since RIP-relative offset
      // is 32-bit.
      if (AM.GV)
        Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
                                              MVT::i32, AM.Disp,
                                              AM.SymbolFlags);
      else if (AM.CP)
        Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, AM.Alignment,
                                             AM.Disp, AM.SymbolFlags);
      else if (AM.ES) {
        assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
        Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
      } else if (AM.MCSym) {
        assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
        assert(AM.SymbolFlags == 0 && "oo");
        Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32);
      } else if (AM.JT != -1) {
        assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
        Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
      } else if (AM.BlockAddr)
        Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
                                             AM.SymbolFlags);
      else
        Disp = CurDAG->getSignedTargetConstant(AM.Disp, DL, MVT::i32);

      if (AM.Segment.getNode())
        Segment = AM.Segment;
      else
        Segment = CurDAG->getRegister(0, MVT::i16);
    }

    // Utility function to determine whether it is AMX SDNode right after
    // lowering but before ISEL.
    bool isAMXSDNode(SDNode *N) const {
      // Check if N is AMX SDNode:
      // 1. check specific opcode since these carry MVT::Untyped instead of
      // x86amx_type;
      // 2. check result type;
      // 3. check operand type;
      switch (N->getOpcode()) {
      default:
        break;
      case X86::PT2RPNTLVWZ0V:
      case X86::PT2RPNTLVWZ0T1V:
      case X86::PT2RPNTLVWZ1V:
      case X86::PT2RPNTLVWZ1T1V:
      case X86::PT2RPNTLVWZ0RSV:
      case X86::PT2RPNTLVWZ0RST1V:
      case X86::PT2RPNTLVWZ1RSV:
      case X86::PT2RPNTLVWZ1RST1V:
        return true;
      }
      for (unsigned Idx = 0, E = N->getNumValues(); Idx != E; ++Idx) {
        if (N->getValueType(Idx) == MVT::x86amx)
          return true;
      }
      for (unsigned Idx = 0, E = N->getNumOperands(); Idx != E; ++Idx) {
        SDValue Op = N->getOperand(Idx);
        if (Op.getValueType() == MVT::x86amx)
          return true;
      }
      return false;
    }

    // Utility function to determine whether we should avoid selecting
    // immediate forms of instructions for better code size or not.
    // At a high level, we'd like to avoid such instructions when
    // we have similar constants used within the same basic block
    // that can be kept in a register.
    //
    bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
      uint32_t UseCount = 0;

      // Do not want to hoist if we're not optimizing for size.
      // TODO: We'd like to remove this restriction.
      // See the comment in X86InstrInfo.td for more info.
      if (!CurDAG->shouldOptForSize())
        return false;

      // Walk all the users of the immediate.
      for (const SDNode *User : N->users()) {
        // Two qualifying uses are enough to decide; stop counting early.
        if (UseCount >= 2)
          break;

        // This user is already selected. Count it as a legitimate use and
        // move on.
        if (User->isMachineOpcode()) {
          UseCount++;
          continue;
        }

        // We want to count stores of immediates as real uses.
        if (User->getOpcode() == ISD::STORE &&
            User->getOperand(1).getNode() == N) {
          UseCount++;
          continue;
        }

        // We don't currently match users that have > 2 operands (except
        // for stores, which are handled above)
        // Those instruction won't match in ISEL, for now, and would
        // be counted incorrectly.
        // This may change in the future as we add additional instruction
        // types.
        if (User->getNumOperands() != 2)
          continue;

        // If this is a sign-extended 8-bit integer immediate used in an ALU
        // instruction, there is probably an opcode encoding to save space.
        auto *C = dyn_cast<ConstantSDNode>(N);
        if (C && isInt<8>(C->getSExtValue()))
          continue;

        // Immediates that are used for offsets as part of stack
        // manipulation should be left alone. These are typically
        // used to indicate SP offsets for argument passing and
        // will get pulled into stores/pushes (implicitly).
        if (User->getOpcode() == X86ISD::ADD ||
            User->getOpcode() == ISD::ADD    ||
            User->getOpcode() == X86ISD::SUB ||
            User->getOpcode() == ISD::SUB) {

          // Find the other operand of the add/sub.
          SDValue OtherOp = User->getOperand(0);
          if (OtherOp.getNode() == N)
            OtherOp = User->getOperand(1);

          // Don't count if the other operand is SP.
          RegisterSDNode *RegNode;
          if (OtherOp->getOpcode() == ISD::CopyFromReg &&
              (RegNode = dyn_cast_or_null<RegisterSDNode>(
                 OtherOp->getOperand(1).getNode())))
            if ((RegNode->getReg() == X86::ESP) ||
                (RegNode->getReg() == X86::RSP))
              continue;
        }

        // ... otherwise, count this and move on.
        UseCount++;
      }

      // If we have more than 1 use, then recommend for hoisting.
      return (UseCount > 1);
    }

    /// Return a target constant with the specified value of type i8.
    inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
      return CurDAG->getTargetConstant(Imm, DL, MVT::i8);
    }

    /// Return a target constant with the specified value, of type i32.
    inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
      return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
    }

    /// Return a target constant with the specified value, of type i64.
    inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
      return CurDAG->getTargetConstant(Imm, DL, MVT::i64);
    }

    /// Convert an EXTRACT_SUBVECTOR element index into the subvector index
    /// immediate expected by VEXTRACT, scaled by element size over VecWidth.
    SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
                                        const SDLoc &DL) {
      assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
      uint64_t Index = N->getConstantOperandVal(1);
      MVT VecVT = N->getOperand(0).getSimpleValueType();
      return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
    }

    /// Convert an INSERT_SUBVECTOR element index into the subvector index
    /// immediate expected by VINSERT, scaled by element size over VecWidth.
    SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,
                                      const SDLoc &DL) {
      assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
      uint64_t Index = N->getConstantOperandVal(2);
      MVT VecVT = N->getSimpleValueType(0);
      return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
    }

    /// Form the VPERM2x128 immediate that implements an insertf128-style
    /// operation with the source operands commuted.
    SDValue getPermuteVINSERTCommutedImmediate(SDNode *N, unsigned VecWidth,
                                               const SDLoc &DL) {
      assert(VecWidth == 128 && "Unexpected vector width");
      uint64_t Index = N->getConstantOperandVal(2);
      MVT VecVT = N->getSimpleValueType(0);
      uint64_t InsertIdx = (Index * VecVT.getScalarSizeInBits()) / VecWidth;
      assert((InsertIdx == 0 || InsertIdx == 1) && "Bad insertf128 index");
      // vinsert(0,sub,vec) -> [sub0][vec1] -> vperm2x128(0x30,vec,sub)
      // vinsert(1,sub,vec) -> [vec0][sub0] -> vperm2x128(0x02,vec,sub)
      return getI8Imm(InsertIdx ? 0x02 : 0x30, DL);
    }

    /// Materialize N's carry flag into a register via SBB of zero against
    /// zero (result is 0 - 0 - CF). N must be an SBB or SETCC_CARRY node;
    /// its flag operand is copied into EFLAGS and glued to the SBB.
    SDValue getSBBZero(SDNode *N) {
      SDLoc dl(N);
      MVT VT = N->getSimpleValueType(0);

      // Create zero.
      SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
      SDValue Zero =
          SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, {}), 0);
      if (VT == MVT::i64) {
        // Widen the 32-bit zero to 64 bits via SUBREG_TO_REG.
        Zero = SDValue(
            CurDAG->getMachineNode(
                TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
                CurDAG->getTargetConstant(0, dl, MVT::i64), Zero,
                CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)),
            0);
      }

      // Copy flags to the EFLAGS register and glue it to next node.
      unsigned Opcode = N->getOpcode();
      assert((Opcode == X86ISD::SBB || Opcode == X86ISD::SETCC_CARRY) &&
             "Unexpected opcode for SBB materialization");
      unsigned FlagOpIndex = Opcode == X86ISD::SBB ? 2 : 1;
      SDValue EFLAGS =
          CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
                               N->getOperand(FlagOpIndex), SDValue());

      // Create a 64-bit instruction if the result is 64-bits otherwise use the
      // 32-bit version.
      unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr;
      MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
      VTs = CurDAG->getVTList(SBBVT, MVT::i32);
      return SDValue(
          CurDAG->getMachineNode(Opc, dl, VTs,
                                 {Zero, Zero, EFLAGS, EFLAGS.getValue(1)}),
          0);
    }

    // Helper to detect unneeded and instructions on shift amounts. Called
    // from PatFrags in tablegen.
    bool isUnneededShiftMask(SDNode *N, unsigned Width) const {
      assert(N->getOpcode() == ISD::AND && "Unexpected opcode");
      const APInt &Val = N->getConstantOperandAPInt(1);

      // Mask already covers all Width low bits: the AND changes nothing.
      if (Val.countr_one() >= Width)
        return true;

      // Otherwise combine with known-zero bits of the shifted value.
      APInt Mask = Val | CurDAG->computeKnownBits(N->getOperand(0)).Zero;
      return Mask.countr_one() >= Width;
    }

    /// Return an SDNode that returns the value of the global base register.
    /// Output instructions required to initialize the global base register,
    /// if necessary.
    SDNode *getGlobalBaseReg();

    /// Return a reference to the TargetMachine, casted to the target-specific
    /// type.
    const X86TargetMachine &getTargetMachine() const {
      return static_cast<const X86TargetMachine &>(TM);
    }

    /// Return a reference to the TargetInstrInfo, casted to the target-specific
    /// type.
    const X86InstrInfo *getInstrInfo() const {
      return Subtarget->getInstrInfo();
    }

    /// Return a condition code of the given SDNode
    X86::CondCode getCondFromNode(SDNode *N) const;

    /// Address-mode matching performs shift-of-and to and-of-shift
    /// reassociation in order to expose more scaled addressing
    /// opportunities.
    bool ComplexPatternFuncMutatesDAG() const override {
      return true;
    }

    bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;

    // Indicates we should prefer to use a non-temporal load for this load.
    bool useNonTemporalLoad(LoadSDNode *N) const {
      if (!N->isNonTemporal())
        return false;

      unsigned StoreSize = N->getMemoryVT().getStoreSize();

      // Non-temporal loads require natural alignment.
      if (N->getAlign().value() < StoreSize)
        return false;

      switch (StoreSize) {
      default: llvm_unreachable("Unsupported store size");
      case 4:
      case 8:
        return false;
      case 16:
        return Subtarget->hasSSE41();
      case 32:
        return Subtarget->hasAVX2();
      case 64:
        return Subtarget->hasAVX512();
      }
    }

    // Peephole/combine entry points implemented later in this file.
    bool foldLoadStoreIntoMemOperand(SDNode *Node);
    MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);
    bool matchBitExtract(SDNode *Node);
    bool shrinkAndImmediate(SDNode *N);
    bool isMaskZeroExtended(SDNode *N) const;
    bool tryShiftAmountMod(SDNode *N);
    bool tryShrinkShlLogicImm(SDNode *N);
    bool tryVPTERNLOG(SDNode *N);
    bool matchVPTERNLOG(SDNode *Root, SDNode *ParentA, SDNode *ParentB,
                        SDNode *ParentC, SDValue A, SDValue B, SDValue C,
                        uint8_t Imm);
    bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
    bool tryMatchBitSelect(SDNode *N);

    MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
                                const SDLoc &dl, MVT VT, SDNode *Node);
    MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
                                const SDLoc &dl, MVT VT, SDNode *Node,
                                SDValue &InGlue);

    bool tryOptimizeRem8Extend(SDNode *N);

    // EFLAGS use analysis helpers for the value produced in Flags.
    bool onlyUsesZeroFlag(SDValue Flags) const;
    bool hasNoSignFlagUses(SDValue Flags) const;
    bool hasNoCarryFlagUses(SDValue Flags) const;
  };

  /// Legacy pass-manager wrapper around X86DAGToDAGISel.
  class X86DAGToDAGISelLegacy : public SelectionDAGISelLegacy {
  public:
    static char ID;
    explicit X86DAGToDAGISelLegacy(X86TargetMachine &tm,
                                   CodeGenOptLevel OptLevel)
        : SelectionDAGISelLegacy(
              ID, std::make_unique<X86DAGToDAGISel>(tm, OptLevel)) {}
  };
}
632 
char X86DAGToDAGISelLegacy::ID = 0;

// Register the legacy wrapper pass with the pass registry, using the
// DEBUG_TYPE ("x86-isel") and PASS_NAME defined at the top of this file.
INITIALIZE_PASS(X86DAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false)
636 
637 // Returns true if this masked compare can be implemented legally with this
638 // type.
639 static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
640   unsigned Opcode = N->getOpcode();
641   if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMM ||
642       Opcode == X86ISD::STRICT_CMPM || Opcode == ISD::SETCC ||
643       Opcode == X86ISD::CMPMM_SAE || Opcode == X86ISD::VFPCLASS) {
644     // We can get 256-bit 8 element types here without VLX being enabled. When
645     // this happens we will use 512-bit operations and the mask will not be
646     // zero extended.
647     EVT OpVT = N->getOperand(0).getValueType();
648     // The first operand of X86ISD::STRICT_CMPM is chain, so we need to get the
649     // second operand.
650     if (Opcode == X86ISD::STRICT_CMPM)
651       OpVT = N->getOperand(1).getValueType();
652     if (OpVT.is256BitVector() || OpVT.is128BitVector())
653       return Subtarget->hasVLX();
654 
655     return true;
656   }
657   // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
658   if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
659       Opcode == X86ISD::FSETCCM_SAE)
660     return true;
661 
662   return false;
663 }
664 
665 // Returns true if we can assume the writer of the mask has zero extended it
666 // for us.
isMaskZeroExtended(SDNode * N) const667 bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {
668   // If this is an AND, check if we have a compare on either side. As long as
669   // one side guarantees the mask is zero extended, the AND will preserve those
670   // zeros.
671   if (N->getOpcode() == ISD::AND)
672     return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) ||
673            isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget);
674 
675   return isLegalMaskCompare(N, Subtarget);
676 }
677 
678 bool
IsProfitableToFold(SDValue N,SDNode * U,SDNode * Root) const679 X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
680   if (OptLevel == CodeGenOptLevel::None)
681     return false;
682 
683   if (!N.hasOneUse())
684     return false;
685 
686   if (N.getOpcode() != ISD::LOAD)
687     return true;
688 
689   // Don't fold non-temporal loads if we have an instruction for them.
690   if (useNonTemporalLoad(cast<LoadSDNode>(N)))
691     return false;
692 
693   // If N is a load, do additional profitability checks.
694   if (U == Root) {
695     switch (U->getOpcode()) {
696     default: break;
697     case X86ISD::ADD:
698     case X86ISD::ADC:
699     case X86ISD::SUB:
700     case X86ISD::SBB:
701     case X86ISD::AND:
702     case X86ISD::XOR:
703     case X86ISD::OR:
704     case ISD::ADD:
705     case ISD::UADDO_CARRY:
706     case ISD::AND:
707     case ISD::OR:
708     case ISD::XOR: {
709       SDValue Op1 = U->getOperand(1);
710 
711       // If the other operand is a 8-bit immediate we should fold the immediate
712       // instead. This reduces code size.
713       // e.g.
714       // movl 4(%esp), %eax
715       // addl $4, %eax
716       // vs.
717       // movl $4, %eax
718       // addl 4(%esp), %eax
719       // The former is 2 bytes shorter. In case where the increment is 1, then
720       // the saving can be 4 bytes (by using incl %eax).
721       if (auto *Imm = dyn_cast<ConstantSDNode>(Op1)) {
722         if (Imm->getAPIntValue().isSignedIntN(8))
723           return false;
724 
725         // If this is a 64-bit AND with an immediate that fits in 32-bits,
726         // prefer using the smaller and over folding the load. This is needed to
727         // make sure immediates created by shrinkAndImmediate are always folded.
728         // Ideally we would narrow the load during DAG combine and get the
729         // best of both worlds.
730         if (U->getOpcode() == ISD::AND &&
731             Imm->getAPIntValue().getBitWidth() == 64 &&
732             Imm->getAPIntValue().isIntN(32))
733           return false;
734 
735         // If this really a zext_inreg that can be represented with a movzx
736         // instruction, prefer that.
737         // TODO: We could shrink the load and fold if it is non-volatile.
738         if (U->getOpcode() == ISD::AND &&
739             (Imm->getAPIntValue() == UINT8_MAX ||
740              Imm->getAPIntValue() == UINT16_MAX ||
741              Imm->getAPIntValue() == UINT32_MAX))
742           return false;
743 
744         // ADD/SUB with can negate the immediate and use the opposite operation
745         // to fit 128 into a sign extended 8 bit immediate.
746         if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) &&
747             (-Imm->getAPIntValue()).isSignedIntN(8))
748           return false;
749 
750         if ((U->getOpcode() == X86ISD::ADD || U->getOpcode() == X86ISD::SUB) &&
751             (-Imm->getAPIntValue()).isSignedIntN(8) &&
752             hasNoCarryFlagUses(SDValue(U, 1)))
753           return false;
754       }
755 
756       // If the other operand is a TLS address, we should fold it instead.
757       // This produces
758       // movl    %gs:0, %eax
759       // leal    i@NTPOFF(%eax), %eax
760       // instead of
761       // movl    $i@NTPOFF, %eax
762       // addl    %gs:0, %eax
763       // if the block also has an access to a second TLS address this will save
764       // a load.
765       // FIXME: This is probably also true for non-TLS addresses.
766       if (Op1.getOpcode() == X86ISD::Wrapper) {
767         SDValue Val = Op1.getOperand(0);
768         if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
769           return false;
770       }
771 
772       // Don't fold load if this matches the BTS/BTR/BTC patterns.
773       // BTS: (or X, (shl 1, n))
774       // BTR: (and X, (rotl -2, n))
775       // BTC: (xor X, (shl 1, n))
776       if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) {
777         if (U->getOperand(0).getOpcode() == ISD::SHL &&
778             isOneConstant(U->getOperand(0).getOperand(0)))
779           return false;
780 
781         if (U->getOperand(1).getOpcode() == ISD::SHL &&
782             isOneConstant(U->getOperand(1).getOperand(0)))
783           return false;
784       }
785       if (U->getOpcode() == ISD::AND) {
786         SDValue U0 = U->getOperand(0);
787         SDValue U1 = U->getOperand(1);
788         if (U0.getOpcode() == ISD::ROTL) {
789           auto *C = dyn_cast<ConstantSDNode>(U0.getOperand(0));
790           if (C && C->getSExtValue() == -2)
791             return false;
792         }
793 
794         if (U1.getOpcode() == ISD::ROTL) {
795           auto *C = dyn_cast<ConstantSDNode>(U1.getOperand(0));
796           if (C && C->getSExtValue() == -2)
797             return false;
798         }
799       }
800 
801       break;
802     }
803     case ISD::SHL:
804     case ISD::SRA:
805     case ISD::SRL:
806       // Don't fold a load into a shift by immediate. The BMI2 instructions
807       // support folding a load, but not an immediate. The legacy instructions
808       // support folding an immediate, but can't fold a load. Folding an
809       // immediate is preferable to folding a load.
810       if (isa<ConstantSDNode>(U->getOperand(1)))
811         return false;
812 
813       break;
814     }
815   }
816 
817   // Prevent folding a load if this can implemented with an insert_subreg or
818   // a move that implicitly zeroes.
819   if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
820       isNullConstant(Root->getOperand(2)) &&
821       (Root->getOperand(0).isUndef() ||
822        ISD::isBuildVectorAllZeros(Root->getOperand(0).getNode())))
823     return false;
824 
825   return true;
826 }
827 
828 // Indicates it is profitable to form an AVX512 masked operation. Returning
829 // false will favor a masked register-register masked move or vblendm and the
830 // operation will be selected separately.
isProfitableToFormMaskedOp(SDNode * N) const831 bool X86DAGToDAGISel::isProfitableToFormMaskedOp(SDNode *N) const {
832   assert(
833       (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::SELECTS) &&
834       "Unexpected opcode!");
835 
836   // If the operation has additional users, the operation will be duplicated.
837   // Check the use count to prevent that.
838   // FIXME: Are there cheap opcodes we might want to duplicate?
839   return N->getOperand(1).hasOneUse();
840 }
841 
/// Replace the original chain operand of the call with
/// load's chain operand and move load below the call's chain operand.
/// After this runs, Load's chain comes from the call's old chain input and
/// the call's chain operand is Load's output chain, so the load sits
/// directly between the call and the rest of the chain.
static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
                               SDValue Call, SDValue OrigChain) {
  SmallVector<SDValue, 8> Ops;
  // The chain feeding OrigChain; it either is the load itself or a
  // TokenFactor that has the load as one of its operands.
  SDValue Chain = OrigChain.getOperand(0);
  if (Chain.getNode() == Load.getNode())
    // Bypass the load: OrigChain will be fed by the load's own chain input.
    Ops.push_back(Load.getOperand(0));
  else {
    assert(Chain.getOpcode() == ISD::TokenFactor &&
           "Unexpected chain operand");
    // Rebuild the TokenFactor with the load operand replaced by the load's
    // chain input, keeping every other operand as-is.
    for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
      if (Chain.getOperand(i).getNode() == Load.getNode())
        Ops.push_back(Load.getOperand(0));
      else
        Ops.push_back(Chain.getOperand(i));
    SDValue NewChain =
      CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
    Ops.clear();
    Ops.push_back(NewChain);
  }
  // Re-attach OrigChain's remaining (non-chain) operands and commit.
  Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end());
  CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);
  // Rewire the load's chain input to the call's old chain operand; the
  // pointer and offset operands are preserved.
  CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
                             Load.getOperand(1), Load.getOperand(2));

  Ops.clear();
  // Make the call's chain operand the load's output chain (value #1),
  // keeping the rest of the call's operands unchanged.
  Ops.push_back(SDValue(Load.getNode(), 1));
  Ops.append(Call->op_begin() + 1, Call->op_end());
  CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
}
873 
874 /// Return true if call address is a load and it can be
875 /// moved below CALLSEQ_START and the chains leading up to the call.
876 /// Return the CALLSEQ_START by reference as a second output.
877 /// In the case of a tail call, there isn't a callseq node between the call
878 /// chain and the load.
isCalleeLoad(SDValue Callee,SDValue & Chain,bool HasCallSeq)879 static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
880   // The transformation is somewhat dangerous if the call's chain was glued to
881   // the call. After MoveBelowOrigChain the load is moved between the call and
882   // the chain, this can create a cycle if the load is not folded. So it is
883   // *really* important that we are sure the load will be folded.
884   if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
885     return false;
886   auto *LD = dyn_cast<LoadSDNode>(Callee.getNode());
887   if (!LD ||
888       !LD->isSimple() ||
889       LD->getAddressingMode() != ISD::UNINDEXED ||
890       LD->getExtensionType() != ISD::NON_EXTLOAD)
891     return false;
892 
893   // Now let's find the callseq_start.
894   while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
895     if (!Chain.hasOneUse())
896       return false;
897     Chain = Chain.getOperand(0);
898   }
899 
900   if (!Chain.getNumOperands())
901     return false;
902   // Since we are not checking for AA here, conservatively abort if the chain
903   // writes to memory. It's not safe to move the callee (a load) across a store.
904   if (isa<MemSDNode>(Chain.getNode()) &&
905       cast<MemSDNode>(Chain.getNode())->writeMem())
906     return false;
907   if (Chain.getOperand(0).getNode() == Callee.getNode())
908     return true;
909   if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
910       Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
911       Callee.getValue(1).hasOneUse())
912     return true;
913   return false;
914 }
915 
// Return true if the 64-bit immediate could be decoded as an ENDBR64
// instruction: the low three bytes must be 0x0F1EFA, and scanning upward
// byte-by-byte we must reach a mandatory 0xF3 prefix, possibly preceded
// (in the scan) by legacy/optional prefix bytes.
// e.g. 0xF30F1EFA, 0xF3660F1EFA, 0xF3670F1EFA
static bool isEndbrImm64(uint64_t Imm) {
  if ((Imm & 0x00FFFFFF) != 0x0F1EFA)
    return false;

  // The 24 low bits already matched; scan the remaining bytes.
  for (int Shift = 24; Shift < 64; Shift += 8) {
    const uint8_t Byte = (Imm >> Shift) & 0xFF;
    if (Byte == 0xF3)
      return true; // Mandatory F3 prefix found; this matches ENDBR64.
    switch (Byte) {
    // Segment-override, operand/address-size, and lock/repne prefixes may
    // legitimately appear before the F3.
    case 0x26: case 0x2e: case 0x36: case 0x3e: case 0x64:
    case 0x65: case 0x66: case 0x67: case 0xf0: case 0xf2:
      break; // Optional prefix byte; keep scanning.
    default:
      return false; // Any other byte breaks the pattern.
    }
  }

  // Ran out of bytes without finding the mandatory F3 prefix.
  return false;
}
936 
needBWI(MVT VT)937 static bool needBWI(MVT VT) {
938   return (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v64i8);
939 }
940 
PreprocessISelDAG()941 void X86DAGToDAGISel::PreprocessISelDAG() {
942   bool MadeChange = false;
943   for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
944        E = CurDAG->allnodes_end(); I != E; ) {
945     SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
946 
947     // This is for CET enhancement.
948     //
949     // ENDBR32 and ENDBR64 have specific opcodes:
950     // ENDBR32: F3 0F 1E FB
951     // ENDBR64: F3 0F 1E FA
952     // And we want that attackers won’t find unintended ENDBR32/64
953     // opcode matches in the binary
954     // Here’s an example:
955     // If the compiler had to generate asm for the following code:
956     // a = 0xF30F1EFA
957     // it could, for example, generate:
958     // mov 0xF30F1EFA, dword ptr[a]
959     // In such a case, the binary would include a gadget that starts
960     // with a fake ENDBR64 opcode. Therefore, we split such generation
961     // into multiple operations, let it not shows in the binary
962     if (N->getOpcode() == ISD::Constant) {
963       MVT VT = N->getSimpleValueType(0);
964       int64_t Imm = cast<ConstantSDNode>(N)->getSExtValue();
965       int32_t EndbrImm = Subtarget->is64Bit() ? 0xF30F1EFA : 0xF30F1EFB;
966       if (Imm == EndbrImm || isEndbrImm64(Imm)) {
967         // Check that the cf-protection-branch is enabled.
968         Metadata *CFProtectionBranch =
969             MF->getFunction().getParent()->getModuleFlag(
970                 "cf-protection-branch");
971         if (CFProtectionBranch || IndirectBranchTracking) {
972           SDLoc dl(N);
973           SDValue Complement = CurDAG->getConstant(~Imm, dl, VT, false, true);
974           Complement = CurDAG->getNOT(dl, Complement, VT);
975           --I;
976           CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Complement);
977           ++I;
978           MadeChange = true;
979           continue;
980         }
981       }
982     }
983 
984     // If this is a target specific AND node with no flag usages, turn it back
985     // into ISD::AND to enable test instruction matching.
986     if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) {
987       SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0),
988                                     N->getOperand(0), N->getOperand(1));
989       --I;
990       CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
991       ++I;
992       MadeChange = true;
993       continue;
994     }
995 
996     // Convert vector increment or decrement to sub/add with an all-ones
997     // constant:
998     // add X, <1, 1...> --> sub X, <-1, -1...>
999     // sub X, <1, 1...> --> add X, <-1, -1...>
1000     // The all-ones vector constant can be materialized using a pcmpeq
1001     // instruction that is commonly recognized as an idiom (has no register
1002     // dependency), so that's better/smaller than loading a splat 1 constant.
1003     //
1004     // But don't do this if it would inhibit a potentially profitable load
1005     // folding opportunity for the other operand. That only occurs with the
1006     // intersection of:
1007     // (1) The other operand (op0) is load foldable.
1008     // (2) The op is an add (otherwise, we are *creating* an add and can still
1009     //     load fold the other op).
1010     // (3) The target has AVX (otherwise, we have a destructive add and can't
1011     //     load fold the other op without killing the constant op).
1012     // (4) The constant 1 vector has multiple uses (so it is profitable to load
1013     //     into a register anyway).
1014     auto mayPreventLoadFold = [&]() {
1015       return X86::mayFoldLoad(N->getOperand(0), *Subtarget) &&
1016              N->getOpcode() == ISD::ADD && Subtarget->hasAVX() &&
1017              !N->getOperand(1).hasOneUse();
1018     };
1019     if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
1020         N->getSimpleValueType(0).isVector() && !mayPreventLoadFold()) {
1021       APInt SplatVal;
1022       if (X86::isConstantSplat(N->getOperand(1), SplatVal) &&
1023           SplatVal.isOne()) {
1024         SDLoc DL(N);
1025 
1026         MVT VT = N->getSimpleValueType(0);
1027         unsigned NumElts = VT.getSizeInBits() / 32;
1028         SDValue AllOnes =
1029             CurDAG->getAllOnesConstant(DL, MVT::getVectorVT(MVT::i32, NumElts));
1030         AllOnes = CurDAG->getBitcast(VT, AllOnes);
1031 
1032         unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
1033         SDValue Res =
1034             CurDAG->getNode(NewOpcode, DL, VT, N->getOperand(0), AllOnes);
1035         --I;
1036         CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1037         ++I;
1038         MadeChange = true;
1039         continue;
1040       }
1041     }
1042 
1043     switch (N->getOpcode()) {
1044     case X86ISD::VBROADCAST: {
1045       MVT VT = N->getSimpleValueType(0);
1046       // Emulate v32i16/v64i8 broadcast without BWI.
1047       if (!Subtarget->hasBWI() && needBWI(VT)) {
1048         MVT NarrowVT = VT.getHalfNumVectorElementsVT();
1049         SDLoc dl(N);
1050         SDValue NarrowBCast =
1051             CurDAG->getNode(X86ISD::VBROADCAST, dl, NarrowVT, N->getOperand(0));
1052         SDValue Res =
1053             CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
1054                             NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
1055         unsigned Index = NarrowVT.getVectorMinNumElements();
1056         Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
1057                               CurDAG->getIntPtrConstant(Index, dl));
1058 
1059         --I;
1060         CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1061         ++I;
1062         MadeChange = true;
1063         continue;
1064       }
1065 
1066       break;
1067     }
1068     case X86ISD::VBROADCAST_LOAD: {
1069       MVT VT = N->getSimpleValueType(0);
1070       // Emulate v32i16/v64i8 broadcast without BWI.
1071       if (!Subtarget->hasBWI() && needBWI(VT)) {
1072         MVT NarrowVT = VT.getHalfNumVectorElementsVT();
1073         auto *MemNode = cast<MemSDNode>(N);
1074         SDLoc dl(N);
1075         SDVTList VTs = CurDAG->getVTList(NarrowVT, MVT::Other);
1076         SDValue Ops[] = {MemNode->getChain(), MemNode->getBasePtr()};
1077         SDValue NarrowBCast = CurDAG->getMemIntrinsicNode(
1078             X86ISD::VBROADCAST_LOAD, dl, VTs, Ops, MemNode->getMemoryVT(),
1079             MemNode->getMemOperand());
1080         SDValue Res =
1081             CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
1082                             NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
1083         unsigned Index = NarrowVT.getVectorMinNumElements();
1084         Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
1085                               CurDAG->getIntPtrConstant(Index, dl));
1086 
1087         --I;
1088         SDValue To[] = {Res, NarrowBCast.getValue(1)};
1089         CurDAG->ReplaceAllUsesWith(N, To);
1090         ++I;
1091         MadeChange = true;
1092         continue;
1093       }
1094 
1095       break;
1096     }
1097     case ISD::LOAD: {
1098       // If this is a XMM/YMM load of the same lower bits as another YMM/ZMM
1099       // load, then just extract the lower subvector and avoid the second load.
1100       auto *Ld = cast<LoadSDNode>(N);
1101       MVT VT = N->getSimpleValueType(0);
1102       if (!ISD::isNormalLoad(Ld) || !Ld->isSimple() ||
1103           !(VT.is128BitVector() || VT.is256BitVector()))
1104         break;
1105 
1106       MVT MaxVT = VT;
1107       SDNode *MaxLd = nullptr;
1108       SDValue Ptr = Ld->getBasePtr();
1109       SDValue Chain = Ld->getChain();
1110       for (SDNode *User : Ptr->users()) {
1111         auto *UserLd = dyn_cast<LoadSDNode>(User);
1112         MVT UserVT = User->getSimpleValueType(0);
1113         if (User != N && UserLd && ISD::isNormalLoad(User) &&
1114             UserLd->getBasePtr() == Ptr && UserLd->getChain() == Chain &&
1115             !User->hasAnyUseOfValue(1) &&
1116             (UserVT.is256BitVector() || UserVT.is512BitVector()) &&
1117             UserVT.getSizeInBits() > VT.getSizeInBits() &&
1118             (!MaxLd || UserVT.getSizeInBits() > MaxVT.getSizeInBits())) {
1119           MaxLd = User;
1120           MaxVT = UserVT;
1121         }
1122       }
1123       if (MaxLd) {
1124         SDLoc dl(N);
1125         unsigned NumSubElts = VT.getSizeInBits() / MaxVT.getScalarSizeInBits();
1126         MVT SubVT = MVT::getVectorVT(MaxVT.getScalarType(), NumSubElts);
1127         SDValue Extract = CurDAG->getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT,
1128                                           SDValue(MaxLd, 0),
1129                                           CurDAG->getIntPtrConstant(0, dl));
1130         SDValue Res = CurDAG->getBitcast(VT, Extract);
1131 
1132         --I;
1133         SDValue To[] = {Res, SDValue(MaxLd, 1)};
1134         CurDAG->ReplaceAllUsesWith(N, To);
1135         ++I;
1136         MadeChange = true;
1137         continue;
1138       }
1139       break;
1140     }
1141     case ISD::VSELECT: {
1142       // Replace VSELECT with non-mask conditions with with BLENDV/VPTERNLOG.
1143       EVT EleVT = N->getOperand(0).getValueType().getVectorElementType();
1144       if (EleVT == MVT::i1)
1145         break;
1146 
1147       assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
1148       assert(N->getValueType(0).getVectorElementType() != MVT::i16 &&
1149              "We can't replace VSELECT with BLENDV in vXi16!");
1150       SDValue R;
1151       if (Subtarget->hasVLX() && CurDAG->ComputeNumSignBits(N->getOperand(0)) ==
1152                                      EleVT.getSizeInBits()) {
1153         R = CurDAG->getNode(X86ISD::VPTERNLOG, SDLoc(N), N->getValueType(0),
1154                             N->getOperand(0), N->getOperand(1), N->getOperand(2),
1155                             CurDAG->getTargetConstant(0xCA, SDLoc(N), MVT::i8));
1156       } else {
1157         R = CurDAG->getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0),
1158                             N->getOperand(0), N->getOperand(1),
1159                             N->getOperand(2));
1160       }
1161       --I;
1162       CurDAG->ReplaceAllUsesWith(N, R.getNode());
1163       ++I;
1164       MadeChange = true;
1165       continue;
1166     }
1167     case ISD::FP_ROUND:
1168     case ISD::STRICT_FP_ROUND:
1169     case ISD::FP_TO_SINT:
1170     case ISD::FP_TO_UINT:
1171     case ISD::STRICT_FP_TO_SINT:
1172     case ISD::STRICT_FP_TO_UINT: {
1173       // Replace vector fp_to_s/uint with their X86 specific equivalent so we
1174       // don't need 2 sets of patterns.
1175       if (!N->getSimpleValueType(0).isVector())
1176         break;
1177 
1178       unsigned NewOpc;
1179       switch (N->getOpcode()) {
1180       default: llvm_unreachable("Unexpected opcode!");
1181       case ISD::FP_ROUND:          NewOpc = X86ISD::VFPROUND;        break;
1182       case ISD::STRICT_FP_ROUND:   NewOpc = X86ISD::STRICT_VFPROUND; break;
1183       case ISD::STRICT_FP_TO_SINT: NewOpc = X86ISD::STRICT_CVTTP2SI; break;
1184       case ISD::FP_TO_SINT:        NewOpc = X86ISD::CVTTP2SI;        break;
1185       case ISD::STRICT_FP_TO_UINT: NewOpc = X86ISD::STRICT_CVTTP2UI; break;
1186       case ISD::FP_TO_UINT:        NewOpc = X86ISD::CVTTP2UI;        break;
1187       }
1188       SDValue Res;
1189       if (N->isStrictFPOpcode())
1190         Res =
1191             CurDAG->getNode(NewOpc, SDLoc(N), {N->getValueType(0), MVT::Other},
1192                             {N->getOperand(0), N->getOperand(1)});
1193       else
1194         Res =
1195             CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1196                             N->getOperand(0));
1197       --I;
1198       CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1199       ++I;
1200       MadeChange = true;
1201       continue;
1202     }
1203     case ISD::SHL:
1204     case ISD::SRA:
1205     case ISD::SRL: {
1206       // Replace vector shifts with their X86 specific equivalent so we don't
1207       // need 2 sets of patterns.
1208       if (!N->getValueType(0).isVector())
1209         break;
1210 
1211       unsigned NewOpc;
1212       switch (N->getOpcode()) {
1213       default: llvm_unreachable("Unexpected opcode!");
1214       case ISD::SHL: NewOpc = X86ISD::VSHLV; break;
1215       case ISD::SRA: NewOpc = X86ISD::VSRAV; break;
1216       case ISD::SRL: NewOpc = X86ISD::VSRLV; break;
1217       }
1218       SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1219                                     N->getOperand(0), N->getOperand(1));
1220       --I;
1221       CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1222       ++I;
1223       MadeChange = true;
1224       continue;
1225     }
1226     case ISD::ANY_EXTEND:
1227     case ISD::ANY_EXTEND_VECTOR_INREG: {
1228       // Replace vector any extend with the zero extend equivalents so we don't
1229       // need 2 sets of patterns. Ignore vXi1 extensions.
1230       if (!N->getValueType(0).isVector())
1231         break;
1232 
1233       unsigned NewOpc;
1234       if (N->getOperand(0).getScalarValueSizeInBits() == 1) {
1235         assert(N->getOpcode() == ISD::ANY_EXTEND &&
1236                "Unexpected opcode for mask vector!");
1237         NewOpc = ISD::SIGN_EXTEND;
1238       } else {
1239         NewOpc = N->getOpcode() == ISD::ANY_EXTEND
1240                               ? ISD::ZERO_EXTEND
1241                               : ISD::ZERO_EXTEND_VECTOR_INREG;
1242       }
1243 
1244       SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1245                                     N->getOperand(0));
1246       --I;
1247       CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1248       ++I;
1249       MadeChange = true;
1250       continue;
1251     }
1252     case ISD::FCEIL:
1253     case ISD::STRICT_FCEIL:
1254     case ISD::FFLOOR:
1255     case ISD::STRICT_FFLOOR:
1256     case ISD::FTRUNC:
1257     case ISD::STRICT_FTRUNC:
1258     case ISD::FROUNDEVEN:
1259     case ISD::STRICT_FROUNDEVEN:
1260     case ISD::FNEARBYINT:
1261     case ISD::STRICT_FNEARBYINT:
1262     case ISD::FRINT:
1263     case ISD::STRICT_FRINT: {
1264       // Replace fp rounding with their X86 specific equivalent so we don't
1265       // need 2 sets of patterns.
1266       unsigned Imm;
1267       switch (N->getOpcode()) {
1268       default: llvm_unreachable("Unexpected opcode!");
1269       case ISD::STRICT_FCEIL:
1270       case ISD::FCEIL:      Imm = 0xA; break;
1271       case ISD::STRICT_FFLOOR:
1272       case ISD::FFLOOR:     Imm = 0x9; break;
1273       case ISD::STRICT_FTRUNC:
1274       case ISD::FTRUNC:     Imm = 0xB; break;
1275       case ISD::STRICT_FROUNDEVEN:
1276       case ISD::FROUNDEVEN: Imm = 0x8; break;
1277       case ISD::STRICT_FNEARBYINT:
1278       case ISD::FNEARBYINT: Imm = 0xC; break;
1279       case ISD::STRICT_FRINT:
1280       case ISD::FRINT:      Imm = 0x4; break;
1281       }
1282       SDLoc dl(N);
1283       bool IsStrict = N->isStrictFPOpcode();
1284       SDValue Res;
1285       if (IsStrict)
1286         Res = CurDAG->getNode(X86ISD::STRICT_VRNDSCALE, dl,
1287                               {N->getValueType(0), MVT::Other},
1288                               {N->getOperand(0), N->getOperand(1),
1289                                CurDAG->getTargetConstant(Imm, dl, MVT::i32)});
1290       else
1291         Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, N->getValueType(0),
1292                               N->getOperand(0),
1293                               CurDAG->getTargetConstant(Imm, dl, MVT::i32));
1294       --I;
1295       CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1296       ++I;
1297       MadeChange = true;
1298       continue;
1299     }
1300     case X86ISD::FANDN:
1301     case X86ISD::FAND:
1302     case X86ISD::FOR:
1303     case X86ISD::FXOR: {
1304       // Widen scalar fp logic ops to vector to reduce isel patterns.
1305       // FIXME: Can we do this during lowering/combine.
1306       MVT VT = N->getSimpleValueType(0);
1307       if (VT.isVector() || VT == MVT::f128)
1308         break;
1309 
1310       MVT VecVT = VT == MVT::f64   ? MVT::v2f64
1311                   : VT == MVT::f32 ? MVT::v4f32
1312                                    : MVT::v8f16;
1313 
1314       SDLoc dl(N);
1315       SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
1316                                     N->getOperand(0));
1317       SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
1318                                     N->getOperand(1));
1319 
1320       SDValue Res;
1321       if (Subtarget->hasSSE2()) {
1322         EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger();
1323         Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0);
1324         Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1);
1325         unsigned Opc;
1326         switch (N->getOpcode()) {
1327         default: llvm_unreachable("Unexpected opcode!");
1328         case X86ISD::FANDN: Opc = X86ISD::ANDNP; break;
1329         case X86ISD::FAND:  Opc = ISD::AND;      break;
1330         case X86ISD::FOR:   Opc = ISD::OR;       break;
1331         case X86ISD::FXOR:  Opc = ISD::XOR;      break;
1332         }
1333         Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1);
1334         Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res);
1335       } else {
1336         Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1);
1337       }
1338       Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res,
1339                             CurDAG->getIntPtrConstant(0, dl));
1340       --I;
1341       CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1342       ++I;
1343       MadeChange = true;
1344       continue;
1345     }
1346     }
1347 
1348     if (OptLevel != CodeGenOptLevel::None &&
1349         // Only do this when the target can fold the load into the call or
1350         // jmp.
1351         !Subtarget->useIndirectThunkCalls() &&
1352         ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||
1353          (N->getOpcode() == X86ISD::TC_RETURN &&
1354           (Subtarget->is64Bit() ||
1355            !getTargetMachine().isPositionIndependent())))) {
1356       /// Also try moving call address load from outside callseq_start to just
1357       /// before the call to allow it to be folded.
1358       ///
1359       ///     [Load chain]
1360       ///         ^
1361       ///         |
1362       ///       [Load]
1363       ///       ^    ^
1364       ///       |    |
1365       ///      /      \--
1366       ///     /          |
1367       ///[CALLSEQ_START] |
1368       ///     ^          |
1369       ///     |          |
1370       /// [LOAD/C2Reg]   |
1371       ///     |          |
1372       ///      \        /
1373       ///       \      /
1374       ///       [CALL]
1375       bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
1376       SDValue Chain = N->getOperand(0);
1377       SDValue Load  = N->getOperand(1);
1378       if (!isCalleeLoad(Load, Chain, HasCallSeq))
1379         continue;
1380       moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
1381       ++NumLoadMoved;
1382       MadeChange = true;
1383       continue;
1384     }
1385 
1386     // Lower fpround and fpextend nodes that target the FP stack to be store and
1387     // load to the stack.  This is a gross hack.  We would like to simply mark
1388     // these as being illegal, but when we do that, legalize produces these when
1389     // it expands calls, then expands these in the same legalize pass.  We would
1390     // like dag combine to be able to hack on these between the call expansion
1391     // and the node legalization.  As such this pass basically does "really
1392     // late" legalization of these inline with the X86 isel pass.
1393     // FIXME: This should only happen when not compiled with -O0.
1394     switch (N->getOpcode()) {
1395     default: continue;
1396     case ISD::FP_ROUND:
1397     case ISD::FP_EXTEND:
1398     {
1399       MVT SrcVT = N->getOperand(0).getSimpleValueType();
1400       MVT DstVT = N->getSimpleValueType(0);
1401 
1402       // If any of the sources are vectors, no fp stack involved.
1403       if (SrcVT.isVector() || DstVT.isVector())
1404         continue;
1405 
1406       // If the source and destination are SSE registers, then this is a legal
1407       // conversion that should not be lowered.
1408       const X86TargetLowering *X86Lowering =
1409           static_cast<const X86TargetLowering *>(TLI);
1410       bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
1411       bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
1412       if (SrcIsSSE && DstIsSSE)
1413         continue;
1414 
1415       if (!SrcIsSSE && !DstIsSSE) {
1416         // If this is an FPStack extension, it is a noop.
1417         if (N->getOpcode() == ISD::FP_EXTEND)
1418           continue;
1419         // If this is a value-preserving FPStack truncation, it is a noop.
1420         if (N->getConstantOperandVal(1))
1421           continue;
1422       }
1423 
1424       // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1425       // FPStack has extload and truncstore.  SSE can fold direct loads into other
1426       // operations.  Based on this, decide what we want to do.
1427       MVT MemVT = (N->getOpcode() == ISD::FP_ROUND) ? DstVT : SrcVT;
1428       SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
1429       int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
1430       MachinePointerInfo MPI =
1431           MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
1432       SDLoc dl(N);
1433 
1434       // FIXME: optimize the case where the src/dest is a load or store?
1435 
1436       SDValue Store = CurDAG->getTruncStore(
1437           CurDAG->getEntryNode(), dl, N->getOperand(0), MemTmp, MPI, MemVT);
1438       SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store,
1439                                           MemTmp, MPI, MemVT);
1440 
1441       // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1442       // extload we created.  This will cause general havok on the dag because
1443       // anything below the conversion could be folded into other existing nodes.
1444       // To avoid invalidating 'I', back it up to the convert node.
1445       --I;
1446       CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
1447       break;
1448     }
1449 
1450     //The sequence of events for lowering STRICT_FP versions of these nodes requires
1451     //dealing with the chain differently, as there is already a preexisting chain.
1452     case ISD::STRICT_FP_ROUND:
1453     case ISD::STRICT_FP_EXTEND:
1454     {
1455       MVT SrcVT = N->getOperand(1).getSimpleValueType();
1456       MVT DstVT = N->getSimpleValueType(0);
1457 
1458       // If any of the sources are vectors, no fp stack involved.
1459       if (SrcVT.isVector() || DstVT.isVector())
1460         continue;
1461 
1462       // If the source and destination are SSE registers, then this is a legal
1463       // conversion that should not be lowered.
1464       const X86TargetLowering *X86Lowering =
1465           static_cast<const X86TargetLowering *>(TLI);
1466       bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
1467       bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
1468       if (SrcIsSSE && DstIsSSE)
1469         continue;
1470 
1471       if (!SrcIsSSE && !DstIsSSE) {
1472         // If this is an FPStack extension, it is a noop.
1473         if (N->getOpcode() == ISD::STRICT_FP_EXTEND)
1474           continue;
1475         // If this is a value-preserving FPStack truncation, it is a noop.
1476         if (N->getConstantOperandVal(2))
1477           continue;
1478       }
1479 
1480       // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1481       // FPStack has extload and truncstore.  SSE can fold direct loads into other
1482       // operations.  Based on this, decide what we want to do.
1483       MVT MemVT = (N->getOpcode() == ISD::STRICT_FP_ROUND) ? DstVT : SrcVT;
1484       SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
1485       int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
1486       MachinePointerInfo MPI =
1487           MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
1488       SDLoc dl(N);
1489 
1490       // FIXME: optimize the case where the src/dest is a load or store?
1491 
1492       //Since the operation is StrictFP, use the preexisting chain.
1493       SDValue Store, Result;
1494       if (!SrcIsSSE) {
1495         SDVTList VTs = CurDAG->getVTList(MVT::Other);
1496         SDValue Ops[] = {N->getOperand(0), N->getOperand(1), MemTmp};
1497         Store = CurDAG->getMemIntrinsicNode(X86ISD::FST, dl, VTs, Ops, MemVT,
1498                                             MPI, /*Align*/ std::nullopt,
1499                                             MachineMemOperand::MOStore);
1500         if (N->getFlags().hasNoFPExcept()) {
1501           SDNodeFlags Flags = Store->getFlags();
1502           Flags.setNoFPExcept(true);
1503           Store->setFlags(Flags);
1504         }
1505       } else {
1506         assert(SrcVT == MemVT && "Unexpected VT!");
1507         Store = CurDAG->getStore(N->getOperand(0), dl, N->getOperand(1), MemTmp,
1508                                  MPI);
1509       }
1510 
1511       if (!DstIsSSE) {
1512         SDVTList VTs = CurDAG->getVTList(DstVT, MVT::Other);
1513         SDValue Ops[] = {Store, MemTmp};
1514         Result = CurDAG->getMemIntrinsicNode(
1515             X86ISD::FLD, dl, VTs, Ops, MemVT, MPI,
1516             /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
1517         if (N->getFlags().hasNoFPExcept()) {
1518           SDNodeFlags Flags = Result->getFlags();
1519           Flags.setNoFPExcept(true);
1520           Result->setFlags(Flags);
1521         }
1522       } else {
1523         assert(DstVT == MemVT && "Unexpected VT!");
1524         Result = CurDAG->getLoad(DstVT, dl, Store, MemTmp, MPI);
1525       }
1526 
1527       // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1528       // extload we created.  This will cause general havok on the dag because
1529       // anything below the conversion could be folded into other existing nodes.
1530       // To avoid invalidating 'I', back it up to the convert node.
1531       --I;
1532       CurDAG->ReplaceAllUsesWith(N, Result.getNode());
1533       break;
1534     }
1535     }
1536 
1537 
1538     // Now that we did that, the node is dead.  Increment the iterator to the
1539     // next node to process, then delete N.
1540     ++I;
1541     MadeChange = true;
1542   }
1543 
1544   // Remove any dead nodes that may have been left behind.
1545   if (MadeChange)
1546     CurDAG->RemoveDeadNodes();
1547 }
1548 
1549 // Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
tryOptimizeRem8Extend(SDNode * N)1550 bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {
1551   unsigned Opc = N->getMachineOpcode();
1552   if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 &&
1553       Opc != X86::MOVSX64rr8)
1554     return false;
1555 
1556   SDValue N0 = N->getOperand(0);
1557 
1558   // We need to be extracting the lower bit of an extend.
1559   if (!N0.isMachineOpcode() ||
1560       N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG ||
1561       N0.getConstantOperandVal(1) != X86::sub_8bit)
1562     return false;
1563 
1564   // We're looking for either a movsx or movzx to match the original opcode.
1565   unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX
1566                                                 : X86::MOVSX32rr8_NOREX;
1567   SDValue N00 = N0.getOperand(0);
1568   if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc)
1569     return false;
1570 
1571   if (Opc == X86::MOVSX64rr8) {
1572     // If we had a sign extend from 8 to 64 bits. We still need to go from 32
1573     // to 64.
1574     MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N),
1575                                                    MVT::i64, N00);
1576     ReplaceUses(N, Extend);
1577   } else {
1578     // Ok we can drop this extend and just use the original extend.
1579     ReplaceUses(N, N00.getNode());
1580   }
1581 
1582   return true;
1583 }
1584 
// Post-isel peephole pass over the scheduled machine-opcode DAG. Walks all
// nodes from last to first so replacements don't disturb the iteration, and
// performs a handful of local simplifications on already-selected nodes.
void X86DAGToDAGISel::PostprocessISelDAG() {
  // Skip peepholes at -O0.
  if (TM.getOptLevel() == CodeGenOptLevel::None)
    return;

  SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();

  bool MadeChange = false;
  while (Position != CurDAG->allnodes_begin()) {
    SDNode *N = &*--Position;
    // Skip dead nodes and any non-machine opcodes.
    if (N->use_empty() || !N->isMachineOpcode())
      continue;

    // Peephole 1: drop a redundant movzx/movsx after an 8-bit divrem.
    if (tryOptimizeRem8Extend(N)) {
      MadeChange = true;
      continue;
    }

    unsigned Opc = N->getMachineOpcode();
    switch (Opc) {
    default:
      continue;
    // ANDrr/rm + TESTrr -> TESTrr/TESTmr
    case X86::TEST8rr:
    case X86::TEST16rr:
    case X86::TEST32rr:
    case X86::TEST64rr:
    // ANDrr/rm + CTESTrr -> CTESTrr/CTESTmr
    case X86::CTEST8rr:
    case X86::CTEST16rr:
    case X86::CTEST32rr:
    case X86::CTEST64rr: {
      // Only handle TEST x, x where x is an AND whose sole uses are the two
      // TEST operands (so the AND itself becomes dead after the fold).
      auto &Op0 = N->getOperand(0);
      if (Op0 != N->getOperand(1) || !Op0->hasNUsesOfValue(2, Op0.getResNo()) ||
          !Op0.isMachineOpcode())
        continue;
      SDValue And = N->getOperand(0);
// Expands one case label into the plain and the NDD (new data destination)
// encoding of the same instruction.
#define CASE_ND(OP)                                                            \
  case X86::OP:                                                                \
  case X86::OP##_ND:
      switch (And.getMachineOpcode()) {
      default:
        continue;
        CASE_ND(AND8rr)
        CASE_ND(AND16rr)
        CASE_ND(AND32rr)
        CASE_ND(AND64rr) {
          // Register-register AND: retarget the TEST at the AND's operands.
          // Bail if the AND's EFLAGS result (value 1) is used elsewhere.
          if (And->hasAnyUseOfValue(1))
            continue;
          SmallVector<SDValue> Ops(N->op_values());
          Ops[0] = And.getOperand(0);
          Ops[1] = And.getOperand(1);
          MachineSDNode *Test =
              CurDAG->getMachineNode(Opc, SDLoc(N), MVT::i32, Ops);
          ReplaceUses(N, Test);
          MadeChange = true;
          continue;
        }
        CASE_ND(AND8rm)
        CASE_ND(AND16rm)
        CASE_ND(AND32rm)
        CASE_ND(AND64rm) {
          // Register-memory AND: fold into a memory-register TEST form.
          if (And->hasAnyUseOfValue(1))
            continue;
          unsigned NewOpc;
          bool IsCTESTCC = X86::isCTESTCC(Opc);
#define FROM_TO(A, B)                                                          \
  CASE_ND(A) NewOpc = IsCTESTCC ? X86::C##B : X86::B;                          \
  break;
          switch (And.getMachineOpcode()) {
            FROM_TO(AND8rm, TEST8mr);
            FROM_TO(AND16rm, TEST16mr);
            FROM_TO(AND32rm, TEST32mr);
            FROM_TO(AND64rm, TEST64mr);
          }
#undef FROM_TO
#undef CASE_ND
          // Need to swap the memory and register operand.
          SmallVector<SDValue> Ops = {And.getOperand(1), And.getOperand(2),
                                      And.getOperand(3), And.getOperand(4),
                                      And.getOperand(5), And.getOperand(0)};
          // CC, Cflags.
          if (IsCTESTCC) {
            Ops.push_back(N->getOperand(2));
            Ops.push_back(N->getOperand(3));
          }
          // Chain of memory load
          Ops.push_back(And.getOperand(6));
          // Glue
          if (IsCTESTCC)
            Ops.push_back(N->getOperand(4));

          MachineSDNode *Test = CurDAG->getMachineNode(
              NewOpc, SDLoc(N), MVT::i32, MVT::Other, Ops);
          // Preserve the original load's memory operands, then rewire both
          // the AND's chain result and the TEST's flags result.
          CurDAG->setNodeMemRefs(
              Test, cast<MachineSDNode>(And.getNode())->memoperands());
          ReplaceUses(And.getValue(2), SDValue(Test, 1));
          ReplaceUses(SDValue(N, 0), SDValue(Test, 0));
          MadeChange = true;
          continue;
        }
      }
    }
    // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is
    // used. We're doing this late so we can prefer to fold the AND into masked
    // comparisons. Doing that can be better for the live range of the mask
    // register.
    case X86::KORTESTBkk:
    case X86::KORTESTWkk:
    case X86::KORTESTDkk:
    case X86::KORTESTQkk: {
      // KORTEST x, x where x is a KAND with no other users, and only ZF of
      // the KORTEST is consumed.
      SDValue Op0 = N->getOperand(0);
      if (Op0 != N->getOperand(1) || !N->isOnlyUserOf(Op0.getNode()) ||
          !Op0.isMachineOpcode() || !onlyUsesZeroFlag(SDValue(N, 0)))
        continue;
#define CASE(A)                                                                \
  case X86::A:                                                                 \
    break;
      switch (Op0.getMachineOpcode()) {
      default:
        continue;
        CASE(KANDBkk)
        CASE(KANDWkk)
        CASE(KANDDkk)
        CASE(KANDQkk)
      }
      unsigned NewOpc;
#define FROM_TO(A, B)                                                          \
  case X86::A:                                                                 \
    NewOpc = X86::B;                                                           \
    break;
      switch (Opc) {
        FROM_TO(KORTESTBkk, KTESTBkk)
        FROM_TO(KORTESTWkk, KTESTWkk)
        FROM_TO(KORTESTDkk, KTESTDkk)
        FROM_TO(KORTESTQkk, KTESTQkk)
      }
      // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other
      // KAND instructions and KTEST use the same ISA feature.
      if (NewOpc == X86::KTESTWkk && !Subtarget->hasDQI())
        continue;
#undef FROM_TO
      MachineSDNode *KTest = CurDAG->getMachineNode(
          NewOpc, SDLoc(N), MVT::i32, Op0.getOperand(0), Op0.getOperand(1));
      ReplaceUses(N, KTest);
      MadeChange = true;
      continue;
    }
    // Attempt to remove vectors moves that were inserted to zero upper bits.
    case TargetOpcode::SUBREG_TO_REG: {
      unsigned SubRegIdx = N->getConstantOperandVal(2);
      if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)
        continue;

      SDValue Move = N->getOperand(1);
      if (!Move.isMachineOpcode())
        continue;

      // Make sure its one of the move opcodes we recognize.
      switch (Move.getMachineOpcode()) {
      default:
        continue;
        CASE(VMOVAPDrr)       CASE(VMOVUPDrr)
        CASE(VMOVAPSrr)       CASE(VMOVUPSrr)
        CASE(VMOVDQArr)       CASE(VMOVDQUrr)
        CASE(VMOVAPDYrr)      CASE(VMOVUPDYrr)
        CASE(VMOVAPSYrr)      CASE(VMOVUPSYrr)
        CASE(VMOVDQAYrr)      CASE(VMOVDQUYrr)
        CASE(VMOVAPDZ128rr)   CASE(VMOVUPDZ128rr)
        CASE(VMOVAPSZ128rr)   CASE(VMOVUPSZ128rr)
        CASE(VMOVDQA32Z128rr) CASE(VMOVDQU32Z128rr)
        CASE(VMOVDQA64Z128rr) CASE(VMOVDQU64Z128rr)
        CASE(VMOVAPDZ256rr)   CASE(VMOVUPDZ256rr)
        CASE(VMOVAPSZ256rr)   CASE(VMOVUPSZ256rr)
        CASE(VMOVDQA32Z256rr) CASE(VMOVDQU32Z256rr)
        CASE(VMOVDQA64Z256rr) CASE(VMOVDQU64Z256rr)
      }
#undef CASE

    // Only elide the move when its input is produced by a non-generic
    // (already selected, target) instruction.
    SDValue In = Move.getOperand(0);
    if (!In.isMachineOpcode() ||
        In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)
      continue;

    // Make sure the instruction has a VEX, XOP, or EVEX prefix. This covers
    // the SHA instructions which use a legacy encoding.
    uint64_t TSFlags = getInstrInfo()->get(In.getMachineOpcode()).TSFlags;
    if ((TSFlags & X86II::EncodingMask) != X86II::VEX &&
        (TSFlags & X86II::EncodingMask) != X86II::EVEX &&
        (TSFlags & X86II::EncodingMask) != X86II::XOP)
      continue;

    // Producing instruction is another vector instruction. We can drop the
    // move.
    CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2));
    MadeChange = true;
    }
    }
  }

  // Any replacement above may have orphaned nodes; sweep them now.
  if (MadeChange)
    CurDAG->RemoveDeadNodes();
}
1789 
1790 
1791 /// Emit any code that needs to be executed only in the main function.
emitSpecialCodeForMain()1792 void X86DAGToDAGISel::emitSpecialCodeForMain() {
1793   if (Subtarget->isTargetCygMing()) {
1794     TargetLowering::ArgListTy Args;
1795     auto &DL = CurDAG->getDataLayout();
1796 
1797     TargetLowering::CallLoweringInfo CLI(*CurDAG);
1798     CLI.setChain(CurDAG->getRoot())
1799         .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()),
1800                    CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)),
1801                    std::move(Args));
1802     const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
1803     std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
1804     CurDAG->setRoot(Result.second);
1805   }
1806 }
1807 
emitFunctionEntryCode()1808 void X86DAGToDAGISel::emitFunctionEntryCode() {
1809   // If this is main, emit special code for main.
1810   const Function &F = MF->getFunction();
1811   if (F.hasExternalLinkage() && F.getName() == "main")
1812     emitSpecialCodeForMain();
1813 }
1814 
isDispSafeForFrameIndexOrRegBase(int64_t Val)1815 static bool isDispSafeForFrameIndexOrRegBase(int64_t Val) {
1816   // We can run into an issue where a frame index or a register base
1817   // includes a displacement that, when added to the explicit displacement,
1818   // will overflow the displacement field. Assuming that the
1819   // displacement fits into a 31-bit integer  (which is only slightly more
1820   // aggressive than the current fundamental assumption that it fits into
1821   // a 32-bit integer), a 31-bit disp should always be safe.
1822   return isInt<31>(Val);
1823 }
1824 
// Fold the constant byte offset \p Offset into \p AM's displacement field,
// rejecting any combination that could not be encoded for the current code
// model/target. Returns true on failure (nothing changed), false on success
// (AM.Disp updated) — the inverted convention shared by the match* helpers.
bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
                                            X86ISelAddressMode &AM) {
  // We may have already matched a displacement and the caller just added the
  // symbolic displacement. So we still need to do the checks even if Offset
  // is zero.

  // Offset is deliberately unsigned so the addition wraps with defined
  // behavior; the result is then reinterpreted as a signed displacement.
  int64_t Val = AM.Disp + Offset;

  // Cannot combine ExternalSymbol displacements with integer offsets.
  if (Val != 0 && (AM.ES || AM.MCSym))
    return true;

  CodeModel::Model M = TM.getCodeModel();
  if (Subtarget->is64Bit()) {
    if (Val != 0 &&
        !X86::isOffsetSuitableForCodeModel(Val, M,
                                           AM.hasSymbolicDisplacement()))
      return true;
    // In addition to the checks required for a register base, check that
    // we do not try to use an unsafe Disp with a frame index.
    if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
        !isDispSafeForFrameIndexOrRegBase(Val))
      return true;
    // In ILP32 (x32) mode, pointers are 32 bits and need to be zero-extended to
    // 64 bits. Instructions with 32-bit register addresses perform this zero
    // extension for us and we can safely ignore the high bits of Offset.
    // Instructions with only a 32-bit immediate address do not, though: they
    // sign extend instead. This means only address the low 2GB of address space
    // is directly addressable, we need indirect addressing for the high 2GB of
    // address space.
    // TODO: Some of the earlier checks may be relaxed for ILP32 mode as the
    // implicit zero extension of instructions would cover up any problem.
    // However, we have asserts elsewhere that get triggered if we do, so keep
    // the checks for now.
    // TODO: We would actually be able to accept these, as well as the same
    // addresses in LP64 mode, by adding the EIZ pseudo-register as an operand
    // to get an address size override to be emitted. However, this
    // pseudo-register is not part of any register class and therefore causes
    // MIR verification to fail.
    if (Subtarget->isTarget64BitILP32() &&
        !isDispSafeForFrameIndexOrRegBase((uint32_t)Val) &&
        !AM.hasBaseOrIndexReg())
      return true;
  } else if (AM.hasBaseOrIndexReg() && !isDispSafeForFrameIndexOrRegBase(Val))
    // For 32-bit X86, make sure the displacement still isn't close to the
    // expressible limit.
    return true;
  // All checks passed; commit the combined displacement.
  AM.Disp = Val;
  return false;
}
1875 
// Try to fold a load from address 0 in the FS/GS address space into \p AM's
// segment register, exploiting the GNU TLS layout where %fs:0/%gs:0 holds the
// TLS block's own address. Returns true if no fold was possible, false on
// success (AM.Segment set).
bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
                                         bool AllowSegmentRegForX32) {
  SDValue Address = N->getOperand(1);

  // load gs:0 -> GS segment register.
  // load fs:0 -> FS segment register.
  //
  // This optimization is generally valid because the GNU TLS model defines that
  // gs:0 (or fs:0 on X86-64) contains its own address. However, for X86-64 mode
  // with 32-bit registers, as we get in ILP32 mode, those registers are first
  // zero-extended to 64 bits and then added it to the base address, which gives
  // unwanted results when the register holds a negative value.
  // For more information see http://people.redhat.com/drepper/tls.pdf
  if (isNullConstant(Address) && AM.Segment.getNode() == nullptr &&
      !IndirectTlsSegRefs &&
      (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
       Subtarget->isTargetFuchsia())) {
    // Per the ILP32 hazard above, the x32 caller must opt in explicitly.
    if (Subtarget->isTarget64BitILP32() && !AllowSegmentRegForX32)
      return true;
    switch (N->getPointerInfo().getAddrSpace()) {
    case X86AS::GS:
      AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
      return false;
    case X86AS::FS:
      AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
      return false;
      // Address space X86AS::SS is not handled here, because it is not used to
      // address TLS areas.
    }
  }

  return true;
}
1909 
1910 /// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
1911 /// mode. These wrap things that will resolve down into a symbol reference.
1912 /// If no match is possible, this returns true, otherwise it returns false.
bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
  // If the addressing mode already has a symbol as the displacement, we can
  // never match another symbol.
  if (AM.hasSymbolicDisplacement())
    return true;

  bool IsRIPRelTLS = false;
  bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
  if (IsRIPRel) {
    SDValue Val = N.getOperand(0);
    if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
      IsRIPRelTLS = true;
  }

  // We can't use an addressing mode in the 64-bit large code model.
  // Global TLS addressing is an exception. In the medium code model,
  // we can use a mode when RIP wrappers are present.
  // That signifies access to globals that are known to be "near",
  // such as the GOT itself.
  CodeModel::Model M = TM.getCodeModel();
  if (Subtarget->is64Bit() && M == CodeModel::Large && !IsRIPRelTLS)
    return true;

  // Base and index reg must be 0 in order to use %rip as base.
  if (IsRIPRel && AM.hasBaseOrIndexReg())
    return true;

  // Make a local copy in case we can't do this fold.
  X86ISelAddressMode Backup = AM;

  // Record the wrapped symbol (global, constant pool, external symbol,
  // MC symbol, jump table, or block address) plus any constant offset.
  int64_t Offset = 0;
  SDValue N0 = N.getOperand(0);
  if (auto *G = dyn_cast<GlobalAddressSDNode>(N0)) {
    AM.GV = G->getGlobal();
    AM.SymbolFlags = G->getTargetFlags();
    Offset = G->getOffset();
  } else if (auto *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
    AM.CP = CP->getConstVal();
    AM.Alignment = CP->getAlign();
    AM.SymbolFlags = CP->getTargetFlags();
    Offset = CP->getOffset();
  } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
    AM.ES = S->getSymbol();
    AM.SymbolFlags = S->getTargetFlags();
  } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
    AM.MCSym = S->getMCSymbol();
  } else if (auto *J = dyn_cast<JumpTableSDNode>(N0)) {
    AM.JT = J->getIndex();
    AM.SymbolFlags = J->getTargetFlags();
  } else if (auto *BA = dyn_cast<BlockAddressSDNode>(N0)) {
    AM.BlockAddr = BA->getBlockAddress();
    AM.SymbolFlags = BA->getTargetFlags();
    Offset = BA->getOffset();
  } else
    llvm_unreachable("Unhandled symbol reference node.");

  // Can't use an addressing mode with large globals.
  if (Subtarget->is64Bit() && !IsRIPRel && AM.GV &&
      TM.isLargeGlobalValue(AM.GV)) {
    AM = Backup;
    return true;
  }

  // Roll back entirely if the symbol's offset can't be merged in.
  if (foldOffsetIntoAddress(Offset, AM)) {
    AM = Backup;
    return true;
  }

  if (IsRIPRel)
    AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));

  // Commit the changes now that we know this fold is safe.
  return false;
}
1987 
1988 /// Add the specified node to the specified addressing mode, returning true if
1989 /// it cannot be done. This just pattern matches for the addressing mode.
bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
  // Main recursive match; on failure nothing below applies.
  if (matchAddressRecursively(N, AM, 0))
    return true;

  // Post-processing: Make a second attempt to fold a load, if we now know
  // that there will not be any other register. This is only performed for
  // 64-bit ILP32 mode since 32-bit mode and 64-bit LP64 mode will have folded
  // any foldable load the first time.
  if (Subtarget->isTarget64BitILP32() &&
      AM.BaseType == X86ISelAddressMode::RegBase &&
      AM.Base_Reg.getNode() != nullptr && AM.IndexReg.getNode() == nullptr) {
    SDValue Save_Base_Reg = AM.Base_Reg;
    if (auto *LoadN = dyn_cast<LoadSDNode>(Save_Base_Reg)) {
      // Temporarily clear the base; restore it if the segment fold fails.
      AM.Base_Reg = SDValue();
      if (matchLoadInAddress(LoadN, AM, /*AllowSegmentRegForX32=*/true))
        AM.Base_Reg = Save_Base_Reg;
    }
  }

  // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
  // a smaller encoding and avoids a scaled-index.
  if (AM.Scale == 2 &&
      AM.BaseType == X86ISelAddressMode::RegBase &&
      AM.Base_Reg.getNode() == nullptr) {
    AM.Base_Reg = AM.IndexReg;
    AM.Scale = 1;
  }

  // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
  // because it has a smaller encoding.
  if (TM.getCodeModel() != CodeModel::Large &&
      (!AM.GV || !TM.isLargeGlobalValue(AM.GV)) && Subtarget->is64Bit() &&
      AM.Scale == 1 && AM.BaseType == X86ISelAddressMode::RegBase &&
      AM.Base_Reg.getNode() == nullptr && AM.IndexReg.getNode() == nullptr &&
      AM.SymbolFlags == X86II::MO_NO_FLAG && AM.hasSymbolicDisplacement()) {
    // However, when GV is a local function symbol and in the same section as
    // the current instruction, and AM.Disp is negative and near INT32_MIN,
    // referencing GV+Disp generates a relocation referencing the section symbol
    // with an even smaller offset, which might underflow. We should bail out if
    // the negative offset is too close to INT32_MIN. Actually, we are more
    // conservative here, using a smaller magic number also used by
    // isOffsetSuitableForCodeModel.
    if (isa_and_nonnull<Function>(AM.GV) && AM.Disp < -16 * 1024 * 1024)
      return true;

    AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
  }

  return false;
}
2040 
// Try to fold both operands of an ADD into the addressing mode; failing that,
// fold the operands in the opposite order; failing that, put the two operands
// directly into base+index if both slots are free. Returns true when nothing
// could be folded. \p N is passed by reference because the recursive matching
// may replace the node; it is updated to the surviving value.
bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
                               unsigned Depth) {
  // Add an artificial use to this node so that we can keep track of
  // it if it gets CSE'd with a different node.
  HandleSDNode Handle(N);

  // First attempt: operand 0 then operand 1. Note operand 1 is re-fetched
  // through the handle because the first recursive call may have mutated the
  // DAG underneath us.
  X86ISelAddressMode Backup = AM;
  if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
      !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1))
    return false;
  AM = Backup;

  // Try again after commutating the operands.
  if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM,
                               Depth + 1) &&
      !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth + 1))
    return false;
  AM = Backup;

  // If we couldn't fold both operands into the address at the same time,
  // see if we can just put each operand into a register and fold at least
  // the add.
  if (AM.BaseType == X86ISelAddressMode::RegBase &&
      !AM.Base_Reg.getNode() &&
      !AM.IndexReg.getNode()) {
    N = Handle.getValue();
    AM.Base_Reg = N.getOperand(0);
    AM.IndexReg = N.getOperand(1);
    AM.Scale = 1;
    return false;
  }
  N = Handle.getValue();
  return true;
}
2075 
2076 // Insert a node into the DAG at least before the Pos node's position. This
2077 // will reposition the node as needed, and will assign it a node ID that is <=
2078 // the Pos node's ID. Note that this does *not* preserve the uniqueness of node
2079 // IDs! The selection DAG must no longer depend on their uniqueness when this
2080 // is used.
static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
  // Only move N if it is unnumbered (-1) or currently ordered after Pos;
  // otherwise it already precedes Pos and nothing needs to change.
  if (N->getNodeId() == -1 ||
      (SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) >
       SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode()))) {
    DAG.RepositionNode(Pos->getIterator(), N.getNode());
    // Mark Node as invalid for pruning as after this it may be a successor to a
    // selected node but otherwise be in the same position of Pos.
    // Conservatively mark it with the same -abs(Id) to assure node id
    // invariant is preserved.
    N->setNodeId(Pos->getNodeId());
    SelectionDAGISel::InvalidateNodeId(N.getNode());
  }
}
2094 
2095 // Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
2096 // safe. This allows us to convert the shift and and into an h-register
2097 // extract and a scaled index. Returns false if the simplification is
2098 // performed.
static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
                                      uint64_t Mask,
                                      SDValue Shift, SDValue X,
                                      X86ISelAddressMode &AM) {
  // Only a single-use constant right shift can be rewritten this way.
  if (Shift.getOpcode() != ISD::SRL ||
      !isa<ConstantSDNode>(Shift.getOperand(1)) ||
      !Shift.hasOneUse())
    return true;

  // The pattern requires the shift amount to be 8-C1 with C1 in [1,3] and the
  // mask to be exactly 0xff << C1, so the result is byte 1 of X scaled by
  // 2/4/8 — expressible as an h-register extract plus an index scale.
  int ScaleLog = 8 - Shift.getConstantOperandVal(1);
  if (ScaleLog <= 0 || ScaleLog >= 4 ||
      Mask != (0xffu << ScaleLog))
    return true;

  MVT XVT = X.getSimpleValueType();
  MVT VT = N.getSimpleValueType();
  SDLoc DL(N);
  // Build ((X >> 8) & 0xff) << ScaleLog.
  SDValue Eight = DAG.getConstant(8, DL, MVT::i8);
  SDValue NewMask = DAG.getConstant(0xff, DL, XVT);
  SDValue Srl = DAG.getNode(ISD::SRL, DL, XVT, X, Eight);
  SDValue And = DAG.getNode(ISD::AND, DL, XVT, Srl, NewMask);
  SDValue Ext = DAG.getZExtOrTrunc(And, DL, VT);
  SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8);
  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, Ext, ShlCount);

  // Insert the new nodes into the topological ordering. We must do this in
  // a valid topological ordering as nothing is going to go back and re-sort
  // these nodes. We continually insert before 'N' in sequence as this is
  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
  // hierarchy left to express.
  insertDAGNode(DAG, N, Eight);
  insertDAGNode(DAG, N, NewMask);
  insertDAGNode(DAG, N, Srl);
  insertDAGNode(DAG, N, And);
  insertDAGNode(DAG, N, Ext);
  insertDAGNode(DAG, N, ShlCount);
  insertDAGNode(DAG, N, Shl);
  DAG.ReplaceAllUsesWith(N, Shl);
  DAG.RemoveDeadNode(N.getNode());
  // The masked byte becomes the index register; the shift becomes the scale.
  AM.IndexReg = Ext;
  AM.Scale = (1 << ScaleLog);
  return false;
}
2142 
2143 // Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
2144 // allows us to fold the shift into this addressing mode. Returns false if the
2145 // transform succeeded.
static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
                                        X86ISelAddressMode &AM) {
  SDValue Shift = N.getOperand(0);

  // Use a signed mask so that shifting right will insert sign bits. These
  // bits will be removed when we shift the result left so it doesn't matter
  // what we use. This might allow a smaller immediate encoding.
  int64_t Mask = cast<ConstantSDNode>(N->getOperand(1))->getSExtValue();

  // If we have an any_extend feeding the AND, look through it to see if there
  // is a shift behind it. But only if the AND doesn't use the extended bits.
  // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
  bool FoundAnyExtend = false;
  if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
      Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
      isUInt<32>(Mask)) {
    FoundAnyExtend = true;
    Shift = Shift.getOperand(0);
  }

  // Need a left shift by a constant to have anything to fold.
  if (Shift.getOpcode() != ISD::SHL ||
      !isa<ConstantSDNode>(Shift.getOperand(1)))
    return true;

  SDValue X = Shift.getOperand(0);

  // Not likely to be profitable if either the AND or SHIFT node has more
  // than one use (unless all uses are for address computation). Besides,
  // isel mechanism requires their node ids to be reused.
  if (!N.hasOneUse() || !Shift.hasOneUse())
    return true;

  // Verify that the shift amount is something we can fold.
  unsigned ShiftAmt = Shift.getConstantOperandVal(1);
  if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
    return true;

  MVT VT = N.getSimpleValueType();
  SDLoc DL(N);
  if (FoundAnyExtend) {
    // Re-apply the extend we looked through, now directly on X.
    SDValue NewX = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X);
    insertDAGNode(DAG, N, NewX);
    X = NewX;
  }

  // Build (X & (C2 >> C1)) << C1, equivalent to the original (X << C1) & C2.
  SDValue NewMask = DAG.getSignedConstant(Mask >> ShiftAmt, DL, VT);
  SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
  SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));

  // Insert the new nodes into the topological ordering. We must do this in
  // a valid topological ordering as nothing is going to go back and re-sort
  // these nodes. We continually insert before 'N' in sequence as this is
  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
  // hierarchy left to express.
  insertDAGNode(DAG, N, NewMask);
  insertDAGNode(DAG, N, NewAnd);
  insertDAGNode(DAG, N, NewShift);
  DAG.ReplaceAllUsesWith(N, NewShift);
  DAG.RemoveDeadNode(N.getNode());

  // The shift is absorbed into the addressing-mode scale; the AND stays as
  // the index register computation.
  AM.Scale = 1 << ShiftAmt;
  AM.IndexReg = NewAnd;
  return false;
}
2210 
2211 // Implement some heroics to detect shifts of masked values where the mask can
2212 // be replaced by extending the shift and undoing that in the addressing mode
2213 // scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
2214 // (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
2215 // the addressing mode. This results in code such as:
2216 //
2217 //   int f(short *y, int *lookup_table) {
2218 //     ...
2219 //     return *y + lookup_table[*y >> 11];
2220 //   }
2221 //
2222 // Turning into:
2223 //   movzwl (%rdi), %eax
2224 //   movl %eax, %ecx
2225 //   shrl $11, %ecx
2226 //   addl (%rsi,%rcx,4), %eax
2227 //
2228 // Instead of:
2229 //   movzwl (%rdi), %eax
2230 //   movl %eax, %ecx
2231 //   shrl $9, %ecx
2232 //   andl $124, %rcx
2233 //   addl (%rsi,%rcx), %eax
2234 //
2235 // Note that this function assumes the mask is provided as a mask *after* the
2236 // value is shifted. The input chain may or may not match that, but computing
2237 // such a mask is trivial.
static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
                                    uint64_t Mask,
                                    SDValue Shift, SDValue X,
                                    X86ISelAddressMode &AM) {
  // Only handle a single-use SRL by a constant feeding the mask.
  if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
      !isa<ConstantSDNode>(Shift.getOperand(1)))
    return true;

  // We need to ensure that mask is a continuous run of bits.
  unsigned MaskIdx, MaskLen;
  if (!isShiftedMask_64(Mask, MaskIdx, MaskLen))
    return true;
  // Number of leading zeros of the 64-bit mask.
  unsigned MaskLZ = 64 - (MaskIdx + MaskLen);

  unsigned ShiftAmt = Shift.getConstantOperandVal(1);

  // The amount of shift we're trying to fit into the addressing mode is taken
  // from the shifted mask index (number of trailing zeros of the mask).
  unsigned AMShiftAmt = MaskIdx;

  // There is nothing we can do here unless the mask is removing some bits.
  // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
  if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;

  // Scale the leading zero count down based on the actual size of the value.
  // Also scale it down based on the size of the shift.
  unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
  if (MaskLZ < ScaleDown)
    return true;
  MaskLZ -= ScaleDown;

  // The final check is to ensure that any masked out high bits of X are
  // already known to be zero. Otherwise, the mask has a semantic impact
  // other than masking out a couple of low bits. Unfortunately, because of
  // the mask, zero extensions will be removed from operands in some cases.
  // This code works extra hard to look through extensions because we can
  // replace them with zero extensions cheaply if necessary.
  bool ReplacingAnyExtend = false;
  if (X.getOpcode() == ISD::ANY_EXTEND) {
    unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
                          X.getOperand(0).getSimpleValueType().getSizeInBits();
    // Assume that we'll replace the any-extend with a zero-extend, and
    // narrow the search to the extended value.
    X = X.getOperand(0);
    MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
    ReplacingAnyExtend = true;
  }
  APInt MaskedHighBits =
    APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
  if (!DAG.MaskedValueIsZero(X, MaskedHighBits))
    return true;

  // We've identified a pattern that can be transformed into a single shift
  // and an addressing mode. Make it so.
  MVT VT = N.getSimpleValueType();
  if (ReplacingAnyExtend) {
    assert(X.getValueType() != VT);
    // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
    SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);
    insertDAGNode(DAG, N, NewX);
    X = NewX;
  }

  // Rebuild as ((X >> (SHIFT + AMShiftAmt)) << AMShiftAmt); the trailing SHL
  // is then absorbed into the addressing-mode scale below.
  MVT XVT = X.getSimpleValueType();
  SDLoc DL(N);
  SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
  SDValue NewSRL = DAG.getNode(ISD::SRL, DL, XVT, X, NewSRLAmt);
  SDValue NewExt = DAG.getZExtOrTrunc(NewSRL, DL, VT);
  SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
  SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewExt, NewSHLAmt);

  // Insert the new nodes into the topological ordering. We must do this in
  // a valid topological ordering as nothing is going to go back and re-sort
  // these nodes. We continually insert before 'N' in sequence as this is
  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
  // hierarchy left to express.
  insertDAGNode(DAG, N, NewSRLAmt);
  insertDAGNode(DAG, N, NewSRL);
  insertDAGNode(DAG, N, NewExt);
  insertDAGNode(DAG, N, NewSHLAmt);
  insertDAGNode(DAG, N, NewSHL);
  DAG.ReplaceAllUsesWith(N, NewSHL);
  DAG.RemoveDeadNode(N.getNode());

  // The SHL becomes the scale; the extended SRL result is the index.
  AM.Scale = 1 << AMShiftAmt;
  AM.IndexReg = NewExt;
  return false;
}
2326 
2327 // Transform "(X >> SHIFT) & (MASK << C1)" to
2328 // "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be
2329 // matched to a BEXTR later. Returns false if the simplification is performed.
static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
                                   uint64_t Mask,
                                   SDValue Shift, SDValue X,
                                   X86ISelAddressMode &AM,
                                   const X86Subtarget &Subtarget) {
  // Require a single-use SRL by a constant, and a single-use AND (N).
  if (Shift.getOpcode() != ISD::SRL ||
      !isa<ConstantSDNode>(Shift.getOperand(1)) ||
      !Shift.hasOneUse() || !N.hasOneUse())
    return true;

  // Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
  if (!Subtarget.hasTBM() &&
      !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
    return true;

  // We need to ensure that mask is a continuous run of bits.
  unsigned MaskIdx, MaskLen;
  if (!isShiftedMask_64(Mask, MaskIdx, MaskLen))
    return true;

  unsigned ShiftAmt = Shift.getConstantOperandVal(1);

  // The amount of shift we're trying to fit into the addressing mode is taken
  // from the shifted mask index (number of trailing zeros of the mask).
  unsigned AMShiftAmt = MaskIdx;

  // There is nothing we can do here unless the mask is removing some bits.
  // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
  if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;

  // Build ((X >> (SHIFT + C1)) & MASK) << C1. The SRL+AND part is what the
  // later BEXTR matching picks up; the SHL is absorbed into the scale.
  MVT XVT = X.getSimpleValueType();
  MVT VT = N.getSimpleValueType();
  SDLoc DL(N);
  SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
  SDValue NewSRL = DAG.getNode(ISD::SRL, DL, XVT, X, NewSRLAmt);
  SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, XVT);
  SDValue NewAnd = DAG.getNode(ISD::AND, DL, XVT, NewSRL, NewMask);
  SDValue NewExt = DAG.getZExtOrTrunc(NewAnd, DL, VT);
  SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
  SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewExt, NewSHLAmt);

  // Insert the new nodes into the topological ordering. We must do this in
  // a valid topological ordering as nothing is going to go back and re-sort
  // these nodes. We continually insert before 'N' in sequence as this is
  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
  // hierarchy left to express.
  insertDAGNode(DAG, N, NewSRLAmt);
  insertDAGNode(DAG, N, NewSRL);
  insertDAGNode(DAG, N, NewMask);
  insertDAGNode(DAG, N, NewAnd);
  insertDAGNode(DAG, N, NewExt);
  insertDAGNode(DAG, N, NewSHLAmt);
  insertDAGNode(DAG, N, NewSHL);
  DAG.ReplaceAllUsesWith(N, NewSHL);
  DAG.RemoveDeadNode(N.getNode());

  // The SHL becomes the scale; the extended AND result is the index.
  AM.Scale = 1 << AMShiftAmt;
  AM.IndexReg = NewExt;
  return false;
}
2390 
2391 // Attempt to peek further into a scaled index register, collecting additional
// extensions / offsets / etc. Returns \p N if we can't peek any further.
SDValue X86DAGToDAGISel::matchIndexRecursively(SDValue N,
                                               X86ISelAddressMode &AM,
                                               unsigned Depth) {
  assert(AM.IndexReg.getNode() == nullptr && "IndexReg already matched");
  assert((AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8) &&
         "Illegal index scale");

  // Limit recursion.
  if (Depth >= SelectionDAG::MaxRecursionDepth)
    return N;

  EVT VT = N.getValueType();
  unsigned Opc = N.getOpcode();

  // index: add(x,c) -> index: x, disp + c
  if (CurDAG->isBaseWithConstantOffset(N)) {
    auto *AddVal = cast<ConstantSDNode>(N.getOperand(1));
    // The constant contributes Scale * c to the displacement since the index
    // is multiplied by the scale.
    uint64_t Offset = (uint64_t)AddVal->getSExtValue() * AM.Scale;
    if (!foldOffsetIntoAddress(Offset, AM))
      return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
  }

  // index: add(x,x) -> index: x, scale * 2
  if (Opc == ISD::ADD && N.getOperand(0) == N.getOperand(1)) {
    if (AM.Scale <= 4) {
      AM.Scale *= 2;
      return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
    }
  }

  // index: shl(x,i) -> index: x, scale * (1 << i)
  if (Opc == X86ISD::VSHLI) {
    uint64_t ShiftAmt = N.getConstantOperandVal(1);
    uint64_t ScaleAmt = 1ULL << ShiftAmt;
    // SIB scale is limited to 1/2/4/8.
    if ((AM.Scale * ScaleAmt) <= 8) {
      AM.Scale *= ScaleAmt;
      return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
    }
  }

  // index: sext(add_nsw(x,c)) -> index: sext(x), disp + sext(c)
  // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?
  if (Opc == ISD::SIGN_EXTEND && !VT.isVector() && N.hasOneUse()) {
    SDValue Src = N.getOperand(0);
    // nsw is needed so that sign-extending the sum equals summing the
    // sign-extended operands.
    if (Src.getOpcode() == ISD::ADD && Src->getFlags().hasNoSignedWrap() &&
        Src.hasOneUse()) {
      if (CurDAG->isBaseWithConstantOffset(Src)) {
        SDValue AddSrc = Src.getOperand(0);
        auto *AddVal = cast<ConstantSDNode>(Src.getOperand(1));
        int64_t Offset = AddVal->getSExtValue();
        if (!foldOffsetIntoAddress((uint64_t)Offset * AM.Scale, AM)) {
          SDLoc DL(N);
          SDValue ExtSrc = CurDAG->getNode(Opc, DL, VT, AddSrc);
          SDValue ExtVal = CurDAG->getSignedConstant(Offset, DL, VT);
          SDValue ExtAdd = CurDAG->getNode(ISD::ADD, DL, VT, ExtSrc, ExtVal);
          // Insert in topological order before N, then replace N.
          insertDAGNode(*CurDAG, N, ExtSrc);
          insertDAGNode(*CurDAG, N, ExtVal);
          insertDAGNode(*CurDAG, N, ExtAdd);
          CurDAG->ReplaceAllUsesWith(N, ExtAdd);
          CurDAG->RemoveDeadNode(N.getNode());
          return ExtSrc;
        }
      }
    }
  }

  // index: zext(add_nuw(x,c)) -> index: zext(x), disp + zext(c)
  // index: zext(addlike(x,c)) -> index: zext(x), disp + zext(c)
  // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?
  if (Opc == ISD::ZERO_EXTEND && !VT.isVector() && N.hasOneUse()) {
    SDValue Src = N.getOperand(0);
    unsigned SrcOpc = Src.getOpcode();
    if (((SrcOpc == ISD::ADD && Src->getFlags().hasNoUnsignedWrap()) ||
         CurDAG->isADDLike(Src, /*NoWrap=*/true)) &&
        Src.hasOneUse()) {
      if (CurDAG->isBaseWithConstantOffset(Src)) {
        SDValue AddSrc = Src.getOperand(0);
        uint64_t Offset = Src.getConstantOperandVal(1);
        if (!foldOffsetIntoAddress(Offset * AM.Scale, AM)) {
          SDLoc DL(N);
          SDValue Res;
          // If we're also scaling, see if we can use that as well.
          if (AddSrc.getOpcode() == ISD::SHL &&
              isa<ConstantSDNode>(AddSrc.getOperand(1))) {
            SDValue ShVal = AddSrc.getOperand(0);
            uint64_t ShAmt = AddSrc.getConstantOperandVal(1);
            APInt HiBits =
                APInt::getHighBitsSet(AddSrc.getScalarValueSizeInBits(), ShAmt);
            uint64_t ScaleAmt = 1ULL << ShAmt;
            // Folding the shift is only safe if it cannot shift out non-zero
            // bits (nuw or known-zero high bits) and the combined scale still
            // fits the SIB encoding (<= 8).
            if ((AM.Scale * ScaleAmt) <= 8 &&
                (AddSrc->getFlags().hasNoUnsignedWrap() ||
                 CurDAG->MaskedValueIsZero(ShVal, HiBits))) {
              AM.Scale *= ScaleAmt;
              SDValue ExtShVal = CurDAG->getNode(Opc, DL, VT, ShVal);
              SDValue ExtShift = CurDAG->getNode(ISD::SHL, DL, VT, ExtShVal,
                                                 AddSrc.getOperand(1));
              insertDAGNode(*CurDAG, N, ExtShVal);
              insertDAGNode(*CurDAG, N, ExtShift);
              AddSrc = ExtShift;
              Res = ExtShVal;
            }
          }
          SDValue ExtSrc = CurDAG->getNode(Opc, DL, VT, AddSrc);
          SDValue ExtVal = CurDAG->getConstant(Offset, DL, VT);
          SDValue ExtAdd = CurDAG->getNode(SrcOpc, DL, VT, ExtSrc, ExtVal);
          insertDAGNode(*CurDAG, N, ExtSrc);
          insertDAGNode(*CurDAG, N, ExtVal);
          insertDAGNode(*CurDAG, N, ExtAdd);
          CurDAG->ReplaceAllUsesWith(N, ExtAdd);
          CurDAG->RemoveDeadNode(N.getNode());
          // If a scaled shift was folded too, the index is the extended shift
          // source; otherwise it is the extended add source.
          return Res ? Res : ExtSrc;
        }
      }
    }
  }

  // TODO: Handle extensions, shifted masks etc.
  return N;
}
2512 
// Recursively decompose N into the components of an x86 addressing mode
// (base, index, scale, displacement, segment), accumulating into AM.
// Returns false on success (AM updated); true means N could not be folded
// beyond what matchAddressBase can do.
bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
                                              unsigned Depth) {
  LLVM_DEBUG({
    dbgs() << "MatchAddress: ";
    AM.dump(CurDAG);
  });
  // Limit recursion.
  if (Depth >= SelectionDAG::MaxRecursionDepth)
    return matchAddressBase(N, AM);

  // If this is already a %rip relative address, we can only merge immediates
  // into it.  Instead of handling this in every case, we handle it here.
  // RIP relative addressing: %rip + 32-bit displacement!
  if (AM.isRIPRelative()) {
    // FIXME: JumpTable and ExternalSymbol address currently don't like
    // displacements.  It isn't very important, but this should be fixed for
    // consistency.
    if (!(AM.ES || AM.MCSym) && AM.JT != -1)
      return true;

    if (auto *Cst = dyn_cast<ConstantSDNode>(N))
      if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM))
        return false;
    return true;
  }

  switch (N.getOpcode()) {
  default: break;
  case ISD::LOCAL_RECOVER: {
    if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
      if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) {
        // Use the symbol and don't prefix it.
        AM.MCSym = ESNode->getMCSymbol();
        return false;
      }
    break;
  }
  case ISD::Constant: {
    // Fold a constant directly into the displacement.
    uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
    if (!foldOffsetIntoAddress(Val, AM))
      return false;
    break;
  }

  case X86ISD::Wrapper:
  case X86ISD::WrapperRIP:
    if (!matchWrapper(N, AM))
      return false;
    break;

  case ISD::LOAD:
    if (!matchLoadInAddress(cast<LoadSDNode>(N), AM))
      return false;
    break;

  case ISD::FrameIndex:
    // A frame index can serve as the base when the base-register slot is
    // still free and the displacement stays encodable.
    if (AM.BaseType == X86ISelAddressMode::RegBase &&
        AM.Base_Reg.getNode() == nullptr &&
        (!Subtarget->is64Bit() || isDispSafeForFrameIndexOrRegBase(AM.Disp))) {
      AM.BaseType = X86ISelAddressMode::FrameIndexBase;
      AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
      return false;
    }
    break;

  case ISD::SHL:
    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
      break;

    if (auto *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
      unsigned Val = CN->getZExtValue();
      // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
      // that the base operand remains free for further matching. If
      // the base doesn't end up getting used, a post-processing step
      // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
      if (Val == 1 || Val == 2 || Val == 3) {
        SDValue ShVal = N.getOperand(0);
        AM.Scale = 1 << Val;
        AM.IndexReg = matchIndexRecursively(ShVal, AM, Depth + 1);
        return false;
      }
    }
    break;

  case ISD::SRL: {
    // Scale must not be used already.
    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;

    // We only handle up to 64-bit values here as those are what matter for
    // addressing mode optimizations.
    assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
           "Unexpected value size!");

    SDValue And = N.getOperand(0);
    if (And.getOpcode() != ISD::AND) break;
    SDValue X = And.getOperand(0);

    // The mask used for the transform is expected to be post-shift, but we
    // found the shift first so just apply the shift to the mask before passing
    // it down.
    if (!isa<ConstantSDNode>(N.getOperand(1)) ||
        !isa<ConstantSDNode>(And.getOperand(1)))
      break;
    uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);

    // Try to fold the mask and shift into the scale, and return false if we
    // succeed.
    if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
      return false;
    break;
  }

  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI:
    // A mul_lohi where we need the low part can be folded as a plain multiply.
    if (N.getResNo() != 0) break;
    [[fallthrough]];
  case ISD::MUL:
  case X86ISD::MUL_IMM:
    // X*[3,5,9] -> X+X*[2,4,8]
    if (AM.BaseType == X86ISelAddressMode::RegBase &&
        AM.Base_Reg.getNode() == nullptr &&
        AM.IndexReg.getNode() == nullptr) {
      if (auto *CN = dyn_cast<ConstantSDNode>(N.getOperand(1)))
        if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
            CN->getZExtValue() == 9) {
          AM.Scale = unsigned(CN->getZExtValue())-1;

          SDValue MulVal = N.getOperand(0);
          SDValue Reg;

          // Okay, we know that we have a scale by now.  However, if the scaled
          // value is an add of something and a constant, we can fold the
          // constant into the disp field here.
          if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
              isa<ConstantSDNode>(MulVal.getOperand(1))) {
            Reg = MulVal.getOperand(0);
            auto *AddVal = cast<ConstantSDNode>(MulVal.getOperand(1));
            uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
            if (foldOffsetIntoAddress(Disp, AM))
              Reg = N.getOperand(0);
          } else {
            Reg = N.getOperand(0);
          }

          // Same register goes in both base and index: X + X*(Scale).
          AM.IndexReg = AM.Base_Reg = Reg;
          return false;
        }
    }
    break;

  case ISD::SUB: {
    // Given A-B, if A can be completely folded into the address and
    // the index field with the index field unused, use -B as the index.
    // This is a win if a has multiple parts that can be folded into
    // the address. Also, this saves a mov if the base register has
    // other uses, since it avoids a two-address sub instruction, however
    // it costs an additional mov if the index register has other uses.

    // Add an artificial use to this node so that we can keep track of
    // it if it gets CSE'd with a different node.
    HandleSDNode Handle(N);

    // Test if the LHS of the sub can be folded.
    X86ISelAddressMode Backup = AM;
    if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) {
      N = Handle.getValue();
      AM = Backup;
      break;
    }
    N = Handle.getValue();
    // Test if the index field is free for use.
    if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
      AM = Backup;
      break;
    }

    // Heuristic cost model: negative cost means the transform wins.
    int Cost = 0;
    SDValue RHS = N.getOperand(1);
    // If the RHS involves a register with multiple uses, this
    // transformation incurs an extra mov, due to the neg instruction
    // clobbering its operand.
    if (!RHS.getNode()->hasOneUse() ||
        RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
        RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
        RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
        (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
         RHS.getOperand(0).getValueType() == MVT::i32))
      ++Cost;
    // If the base is a register with multiple uses, this
    // transformation may save a mov.
    if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() &&
         !AM.Base_Reg.getNode()->hasOneUse()) ||
        AM.BaseType == X86ISelAddressMode::FrameIndexBase)
      --Cost;
    // If the folded LHS was interesting, this transformation saves
    // address arithmetic.
    if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
        ((AM.Disp != 0) && (Backup.Disp == 0)) +
        (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
      --Cost;
    // If it doesn't look like it may be an overall win, don't do it.
    if (Cost >= 0) {
      AM = Backup;
      break;
    }

    // Ok, the transformation is legal and appears profitable. Go for it.
    // Negation will be emitted later to avoid creating dangling nodes if this
    // was an unprofitable LEA.
    AM.IndexReg = RHS;
    AM.NegateIndex = true;
    AM.Scale = 1;
    return false;
  }

  case ISD::OR:
  case ISD::XOR:
    // See if we can treat the OR/XOR node as an ADD node.
    if (!CurDAG->isADDLike(N))
      break;
    [[fallthrough]];
  case ISD::ADD:
    if (!matchAdd(N, AM, Depth))
      return false;
    break;

  case ISD::AND: {
    // Perform some heroic transforms on an and of a constant-count shift
    // with a constant to enable use of the scaled offset field.

    // Scale must not be used already.
    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;

    // We only handle up to 64-bit values here as those are what matter for
    // addressing mode optimizations.
    assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
           "Unexpected value size!");

    if (!isa<ConstantSDNode>(N.getOperand(1)))
      break;

    if (N.getOperand(0).getOpcode() == ISD::SRL) {
      SDValue Shift = N.getOperand(0);
      SDValue X = Shift.getOperand(0);

      uint64_t Mask = N.getConstantOperandVal(1);

      // Try to fold the mask and shift into an extract and scale.
      if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
        return false;

      // Try to fold the mask and shift directly into the scale.
      if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
        return false;

      // Try to fold the mask and shift into BEXTR and scale.
      if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget))
        return false;
    }

    // Try to swap the mask and shift to place shifts which can be done as
    // a scale on the outside of the mask.
    if (!foldMaskedShiftToScaledMask(*CurDAG, N, AM))
      return false;

    break;
  }
  case ISD::ZERO_EXTEND: {
    // Try to widen a zexted shift left to the same size as its use, so we can
    // match the shift as a scale factor.
    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
      break;

    SDValue Src = N.getOperand(0);

    // See if we can match a zext(addlike(x,c)).
    // TODO: Move more ZERO_EXTEND patterns into matchIndexRecursively.
    if (Src.getOpcode() == ISD::ADD || Src.getOpcode() == ISD::OR)
      if (SDValue Index = matchIndexRecursively(N, AM, Depth + 1))
        if (Index != N) {
          AM.IndexReg = Index;
          return false;
        }

    // Peek through mask: zext(and(shl(x,c1),c2))
    APInt Mask = APInt::getAllOnes(Src.getScalarValueSizeInBits());
    if (Src.getOpcode() == ISD::AND && Src.hasOneUse())
      if (auto *MaskC = dyn_cast<ConstantSDNode>(Src.getOperand(1))) {
        Mask = MaskC->getAPIntValue();
        Src = Src.getOperand(0);
      }

    if (Src.getOpcode() == ISD::SHL && Src.hasOneUse() && N->hasOneUse()) {
      // Give up if the shift is not a valid scale factor [1,2,3].
      SDValue ShlSrc = Src.getOperand(0);
      SDValue ShlAmt = Src.getOperand(1);
      auto *ShAmtC = dyn_cast<ConstantSDNode>(ShlAmt);
      if (!ShAmtC)
        break;
      unsigned ShAmtV = ShAmtC->getZExtValue();
      if (ShAmtV > 3)
        break;

      // The narrow shift must only shift out zero bits (it must be 'nuw').
      // That makes it safe to widen to the destination type.
      APInt HighZeros =
          APInt::getHighBitsSet(ShlSrc.getValueSizeInBits(), ShAmtV);
      if (!Src->getFlags().hasNoUnsignedWrap() &&
          !CurDAG->MaskedValueIsZero(ShlSrc, HighZeros & Mask))
        break;

      // zext (shl nuw i8 %x, C1) to i32
      // --> shl (zext i8 %x to i32), (zext C1)
      // zext (and (shl nuw i8 %x, C1), C2) to i32
      // --> shl (zext i8 (and %x, C2 >> C1) to i32), (zext C1)
      MVT SrcVT = ShlSrc.getSimpleValueType();
      MVT VT = N.getSimpleValueType();
      SDLoc DL(N);

      SDValue Res = ShlSrc;
      if (!Mask.isAllOnes()) {
        Res = CurDAG->getConstant(Mask.lshr(ShAmtV), DL, SrcVT);
        insertDAGNode(*CurDAG, N, Res);
        Res = CurDAG->getNode(ISD::AND, DL, SrcVT, ShlSrc, Res);
        insertDAGNode(*CurDAG, N, Res);
      }
      SDValue Zext = CurDAG->getNode(ISD::ZERO_EXTEND, DL, VT, Res);
      insertDAGNode(*CurDAG, N, Zext);
      SDValue NewShl = CurDAG->getNode(ISD::SHL, DL, VT, Zext, ShlAmt);
      insertDAGNode(*CurDAG, N, NewShl);
      CurDAG->ReplaceAllUsesWith(N, NewShl);
      CurDAG->RemoveDeadNode(N.getNode());

      // Convert the shift to scale factor.
      AM.Scale = 1 << ShAmtV;
      // If matchIndexRecursively is not called here,
      // Zext may be replaced by other nodes but later used to call a builder
      // method
      AM.IndexReg = matchIndexRecursively(Zext, AM, Depth + 1);
      return false;
    }

    if (Src.getOpcode() == ISD::SRL && !Mask.isAllOnes()) {
      // Try to fold the mask and shift into an extract and scale.
      if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask.getZExtValue(), Src,
                                     Src.getOperand(0), AM))
        return false;

      // Try to fold the mask and shift directly into the scale.
      if (!foldMaskAndShiftToScale(*CurDAG, N, Mask.getZExtValue(), Src,
                                   Src.getOperand(0), AM))
        return false;

      // Try to fold the mask and shift into BEXTR and scale.
      if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask.getZExtValue(), Src,
                                  Src.getOperand(0), AM, *Subtarget))
        return false;
    }

    break;
  }
  }

  // Nothing matched above: absorb N as a plain base/index register.
  return matchAddressBase(N, AM);
}
2879 
2880 /// Helper for MatchAddress. Add the specified node to the
2881 /// specified addressing mode without any further recursion.
matchAddressBase(SDValue N,X86ISelAddressMode & AM)2882 bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
2883   // Is the base register already occupied?
2884   if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
2885     // If so, check to see if the scale index register is set.
2886     if (!AM.IndexReg.getNode()) {
2887       AM.IndexReg = N;
2888       AM.Scale = 1;
2889       return false;
2890     }
2891 
2892     // Otherwise, we cannot select it.
2893     return true;
2894   }
2895 
2896   // Default, generate it as a register.
2897   AM.BaseType = X86ISelAddressMode::RegBase;
2898   AM.Base_Reg = N;
2899   return false;
2900 }
2901 
bool X86DAGToDAGISel::matchVectorAddressRecursively(SDValue N,
                                                    X86ISelAddressMode &AM,
                                                    unsigned Depth) {
  LLVM_DEBUG({
    dbgs() << "MatchVectorAddress: ";
    AM.dump(CurDAG);
  });
  // Limit recursion.
  if (Depth >= SelectionDAG::MaxRecursionDepth)
    return matchAddressBase(N, AM);

  // TODO: Support other operations.
  switch (N.getOpcode()) {
  case ISD::Constant: {
    // Fold a constant into the displacement if it fits.
    uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
    if (!foldOffsetIntoAddress(Val, AM))
      return false;
    break;
  }
  case X86ISD::Wrapper:
    if (!matchWrapper(N, AM))
      return false;
    break;
  case ISD::ADD: {
    // Add an artificial use to this node so that we can keep track of
    // it if it gets CSE'd with a different node.
    HandleSDNode Handle(N);

    // Try to fold both operands, LHS first; re-read operands through the
    // handle in case recursive matching replaced/CSE'd N.
    X86ISelAddressMode Backup = AM;
    if (!matchVectorAddressRecursively(N.getOperand(0), AM, Depth + 1) &&
        !matchVectorAddressRecursively(Handle.getValue().getOperand(1), AM,
                                       Depth + 1))
      return false;
    AM = Backup;

    // Try again after commuting the operands.
    if (!matchVectorAddressRecursively(Handle.getValue().getOperand(1), AM,
                                       Depth + 1) &&
        !matchVectorAddressRecursively(Handle.getValue().getOperand(0), AM,
                                       Depth + 1))
      return false;
    AM = Backup;

    N = Handle.getValue();
    break;
  }
  }

  // Fall back to absorbing N as a plain base/index register.
  return matchAddressBase(N, AM);
}
2952 
2953 /// Helper for selectVectorAddr. Handles things that can be folded into a
2954 /// gather/scatter address. The index register and scale should have already
2955 /// been handled.
matchVectorAddress(SDValue N,X86ISelAddressMode & AM)2956 bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
2957   return matchVectorAddressRecursively(N, AM, 0);
2958 }
2959 
/// Select the five address operands (Base, Scale, Index, Disp, Segment) for a
/// gather/scatter memory node. Returns true if an addressing mode was matched.
bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr,
                                       SDValue IndexOp, SDValue ScaleOp,
                                       SDValue &Base, SDValue &Scale,
                                       SDValue &Index, SDValue &Disp,
                                       SDValue &Segment) {
  X86ISelAddressMode AM;
  AM.Scale = ScaleOp->getAsZExtVal();

  // Attempt to match index patterns, as long as we're not relying on implicit
  // sign-extension, which is performed BEFORE scale.
  if (IndexOp.getScalarValueSizeInBits() == BasePtr.getScalarValueSizeInBits())
    AM.IndexReg = matchIndexRecursively(IndexOp, AM, 0);
  else
    AM.IndexReg = IndexOp;

  // Propagate a segment override from the memory operand's address space.
  unsigned AddrSpace = Parent->getPointerInfo().getAddrSpace();
  if (AddrSpace == X86AS::GS)
    AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
  if (AddrSpace == X86AS::FS)
    AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
  if (AddrSpace == X86AS::SS)
    AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);

  // Save DL/VT before matching; matching may invalidate BasePtr.
  SDLoc DL(BasePtr);
  MVT VT = BasePtr.getSimpleValueType();

  // Try to match into the base and displacement fields.
  if (matchVectorAddress(BasePtr, AM))
    return false;

  getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
  return true;
}
2993 
2994 /// Returns true if it is able to pattern match an addressing mode.
2995 /// It returns the operands which make up the maximal addressing mode it can
2996 /// match by reference.
2997 ///
2998 /// Parent is the parent node of the addr operand that is being matched.  It
2999 /// is always a load, store, atomic node, or null.  It is only null when
3000 /// checking memory operands for inline asm nodes.
selectAddr(SDNode * Parent,SDValue N,SDValue & Base,SDValue & Scale,SDValue & Index,SDValue & Disp,SDValue & Segment)3001 bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
3002                                  SDValue &Scale, SDValue &Index,
3003                                  SDValue &Disp, SDValue &Segment) {
3004   X86ISelAddressMode AM;
3005 
3006   if (Parent &&
3007       // This list of opcodes are all the nodes that have an "addr:$ptr" operand
3008       // that are not a MemSDNode, and thus don't have proper addrspace info.
3009       Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
3010       Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
3011       Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
3012       Parent->getOpcode() != X86ISD::ENQCMD && // Fixme
3013       Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme
3014       Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
3015       Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
3016     unsigned AddrSpace =
3017       cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
3018     if (AddrSpace == X86AS::GS)
3019       AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
3020     if (AddrSpace == X86AS::FS)
3021       AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
3022     if (AddrSpace == X86AS::SS)
3023       AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
3024   }
3025 
3026   // Save the DL and VT before calling matchAddress, it can invalidate N.
3027   SDLoc DL(N);
3028   MVT VT = N.getSimpleValueType();
3029 
3030   if (matchAddress(N, AM))
3031     return false;
3032 
3033   getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
3034   return true;
3035 }
3036 
selectMOV64Imm32(SDValue N,SDValue & Imm)3037 bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
3038   // Cannot use 32 bit constants to reference objects in kernel/large code
3039   // model.
3040   if (TM.getCodeModel() == CodeModel::Kernel ||
3041       TM.getCodeModel() == CodeModel::Large)
3042     return false;
3043 
3044   // In static codegen with small code model, we can get the address of a label
3045   // into a register with 'movl'
3046   if (N->getOpcode() != X86ISD::Wrapper)
3047     return false;
3048 
3049   N = N.getOperand(0);
3050 
3051   // At least GNU as does not accept 'movl' for TPOFF relocations.
3052   // FIXME: We could use 'movl' when we know we are targeting MC.
3053   if (N->getOpcode() == ISD::TargetGlobalTLSAddress)
3054     return false;
3055 
3056   Imm = N;
3057   // Small/medium code model can reference non-TargetGlobalAddress objects with
3058   // 32 bit constants.
3059   if (N->getOpcode() != ISD::TargetGlobalAddress) {
3060     return TM.getCodeModel() == CodeModel::Small ||
3061            TM.getCodeModel() == CodeModel::Medium;
3062   }
3063 
3064   const GlobalValue *GV = cast<GlobalAddressSDNode>(N)->getGlobal();
3065   if (std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange())
3066     return CR->getUnsignedMax().ult(1ull << 32);
3067 
3068   return !TM.isLargeGlobalValue(GV);
3069 }
3070 
/// Match an address for a 64-bit LEA whose matched base/index operands may be
/// narrower (8/16/32-bit) values that must be widened to i64 first.
bool X86DAGToDAGISel::selectLEA64_Addr(SDValue N, SDValue &Base, SDValue &Scale,
                                       SDValue &Index, SDValue &Disp,
                                       SDValue &Segment) {
  // Save the debug loc before calling selectLEAAddr, in case it invalidates N.
  SDLoc DL(N);

  if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
    return false;

  // Pick the subregister index matching the width of the matched base, for
  // inserting the narrow value into a 64-bit register below.
  EVT BaseType = Base.getValueType();
  unsigned SubReg;
  if (BaseType == MVT::i8)
    SubReg = X86::sub_8bit;
  else if (BaseType == MVT::i16)
    SubReg = X86::sub_16bit;
  else
    SubReg = X86::sub_32bit;

  // A "no register" (register 0) base is rewritten as a 64-bit no-register;
  // otherwise widen narrow bases via INSERT_SUBREG into an IMPLICIT_DEF.
  auto *RN = dyn_cast<RegisterSDNode>(Base);
  if (RN && RN->getReg() == 0)
    Base = CurDAG->getRegister(0, MVT::i64);
  else if ((BaseType == MVT::i8 || BaseType == MVT::i16 ||
            BaseType == MVT::i32) &&
           !isa<FrameIndexSDNode>(Base)) {
    // Base could already be %rip, particularly in the x32 ABI.
    SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
                                                     MVT::i64), 0);
    Base = CurDAG->getTargetInsertSubreg(SubReg, DL, MVT::i64, ImplDef, Base);
  }

  // Same treatment for the index operand; the assert checks it has the same
  // width as the base so SubReg is valid for it too.
  [[maybe_unused]] EVT IndexType = Index.getValueType();
  RN = dyn_cast<RegisterSDNode>(Index);
  if (RN && RN->getReg() == 0)
    Index = CurDAG->getRegister(0, MVT::i64);
  else {
    assert((IndexType == BaseType) &&
           "Expect to be extending 8/16/32-bit registers for use in LEA");
    SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
                                                     MVT::i64), 0);
    Index = CurDAG->getTargetInsertSubreg(SubReg, DL, MVT::i64, ImplDef, Index);
  }

  return true;
}
3115 
3116 /// Calls SelectAddr and determines if the maximal addressing
3117 /// mode it matches can be cost effectively emitted as an LEA instruction.
bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
                                    SDValue &Base, SDValue &Scale,
                                    SDValue &Index, SDValue &Disp,
                                    SDValue &Segment) {
  X86ISelAddressMode AM;

  // Save the DL and VT before calling matchAddress, it can invalidate N.
  SDLoc DL(N);
  MVT VT = N.getSimpleValueType();

  // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
  // segments.
  SDValue Copy = AM.Segment;
  SDValue T = CurDAG->getRegister(0, MVT::i32);
  AM.Segment = T;
  if (matchAddress(N, AM))
    return false;
  assert (T == AM.Segment);
  AM.Segment = Copy;

  // Accumulate a rough "is an LEA worth it" score; LEA is emitted only when
  // the final Complexity exceeds 2 (see the check at the bottom).
  unsigned Complexity = 0;
  if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode())
    Complexity = 1;
  else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
    Complexity = 4;

  if (AM.IndexReg.getNode())
    Complexity++;

  // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with
  // a simple shift.
  if (AM.Scale > 1)
    Complexity++;

  // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
  // to a LEA. This is determined with some experimentation but is by no means
  // optimal (especially for code size consideration). LEA is nice because of
  // its three-address nature. Tweak the cost function again when we can run
  // convertToThreeAddress() at register allocation time.
  if (AM.hasSymbolicDisplacement()) {
    // For X86-64, always use LEA to materialize RIP-relative addresses.
    if (Subtarget->is64Bit())
      Complexity = 4;
    else
      Complexity += 2;
  }

  // Heuristic: try harder to form an LEA from ADD if the operands set flags.
  // Unlike ADD, LEA does not affect flags, so we will be less likely to require
  // duplicating flag-producing instructions later in the pipeline.
  if (N.getOpcode() == ISD::ADD) {
    auto isMathWithFlags = [](SDValue V) {
      switch (V.getOpcode()) {
      case X86ISD::ADD:
      case X86ISD::SUB:
      case X86ISD::ADC:
      case X86ISD::SBB:
      case X86ISD::SMUL:
      case X86ISD::UMUL:
      /* TODO: These opcodes can be added safely, but we may want to justify
               their inclusion for different reasons (better for reg-alloc).
      case X86ISD::OR:
      case X86ISD::XOR:
      case X86ISD::AND:
      */
        // Value 1 is the flag output of the node - verify it's not dead.
        return !SDValue(V.getNode(), 1).use_empty();
      default:
        return false;
      }
    };
    // TODO: We might want to factor in whether there's a load folding
    // opportunity for the math op that disappears with LEA.
    if (isMathWithFlags(N.getOperand(0)) || isMathWithFlags(N.getOperand(1)))
      Complexity++;
  }

  if (AM.Disp)
    Complexity++;

  // If it isn't worth using an LEA, reject it.
  if (Complexity <= 2)
    return false;

  getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
  return true;
}
3205 
3206 /// This is only run on TargetGlobalTLSAddress nodes.
selectTLSADDRAddr(SDValue N,SDValue & Base,SDValue & Scale,SDValue & Index,SDValue & Disp,SDValue & Segment)3207 bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
3208                                         SDValue &Scale, SDValue &Index,
3209                                         SDValue &Disp, SDValue &Segment) {
3210   assert(N.getOpcode() == ISD::TargetGlobalTLSAddress ||
3211          N.getOpcode() == ISD::TargetExternalSymbol);
3212 
3213   X86ISelAddressMode AM;
3214   if (auto *GA = dyn_cast<GlobalAddressSDNode>(N)) {
3215     AM.GV = GA->getGlobal();
3216     AM.Disp += GA->getOffset();
3217     AM.SymbolFlags = GA->getTargetFlags();
3218   } else {
3219     auto *SA = cast<ExternalSymbolSDNode>(N);
3220     AM.ES = SA->getSymbol();
3221     AM.SymbolFlags = SA->getTargetFlags();
3222   }
3223 
3224   if (Subtarget->is32Bit()) {
3225     AM.Scale = 1;
3226     AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
3227   }
3228 
3229   MVT VT = N.getSimpleValueType();
3230   getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment);
3231   return true;
3232 }
3233 
/// Match a relocatable immediate operand, possibly narrowing a truncated
/// global address when its absolute-symbol range fits in the truncated type.
bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
  // Keep track of the original value type and whether this value was
  // truncated. If we see a truncation from pointer type to VT that truncates
  // bits that are known to be zero, we can use a narrow reference.
  EVT VT = N.getValueType();
  bool WasTruncated = false;
  if (N.getOpcode() == ISD::TRUNCATE) {
    WasTruncated = true;
    N = N.getOperand(0);
  }

  if (N.getOpcode() != X86ISD::Wrapper)
    return false;

  // We can only use non-GlobalValues as immediates if they were not truncated,
  // as we do not have any range information. If we have a GlobalValue and the
  // address was not truncated, we can select it as an operand directly.
  unsigned Opc = N.getOperand(0)->getOpcode();
  if (Opc != ISD::TargetGlobalAddress || !WasTruncated) {
    Op = N.getOperand(0);
    // We can only select the operand directly if we didn't have to look past a
    // truncate.
    return !WasTruncated;
  }

  // Check that the global's range fits into VT.
  auto *GA = cast<GlobalAddressSDNode>(N.getOperand(0));
  std::optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
  if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits()))
    return false;

  // Okay, we can use a narrow reference.
  Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT,
                                      GA->getOffset(), GA->getTargetFlags());
  return true;
}
3270 
tryFoldLoad(SDNode * Root,SDNode * P,SDValue N,SDValue & Base,SDValue & Scale,SDValue & Index,SDValue & Disp,SDValue & Segment)3271 bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
3272                                   SDValue &Base, SDValue &Scale,
3273                                   SDValue &Index, SDValue &Disp,
3274                                   SDValue &Segment) {
3275   assert(Root && P && "Unknown root/parent nodes");
3276   if (!ISD::isNON_EXTLoad(N.getNode()) ||
3277       !IsProfitableToFold(N, P, Root) ||
3278       !IsLegalToFold(N, P, Root, OptLevel))
3279     return false;
3280 
3281   return selectAddr(N.getNode(),
3282                     N.getOperand(1), Base, Scale, Index, Disp, Segment);
3283 }
3284 
tryFoldBroadcast(SDNode * Root,SDNode * P,SDValue N,SDValue & Base,SDValue & Scale,SDValue & Index,SDValue & Disp,SDValue & Segment)3285 bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
3286                                        SDValue &Base, SDValue &Scale,
3287                                        SDValue &Index, SDValue &Disp,
3288                                        SDValue &Segment) {
3289   assert(Root && P && "Unknown root/parent nodes");
3290   if (N->getOpcode() != X86ISD::VBROADCAST_LOAD ||
3291       !IsProfitableToFold(N, P, Root) ||
3292       !IsLegalToFold(N, P, Root, OptLevel))
3293     return false;
3294 
3295   return selectAddr(N.getNode(),
3296                     N.getOperand(1), Base, Scale, Index, Disp, Segment);
3297 }
3298 
3299 /// Return an SDNode that returns the value of the global base register.
3300 /// Output instructions required to initialize the global base register,
3301 /// if necessary.
getGlobalBaseReg()3302 SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
3303   Register GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
3304   auto &DL = MF->getDataLayout();
3305   return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode();
3306 }
3307 
/// Return true if a (possibly truncated, wrapped) symbolic reference can be
/// encoded as a Width-bit sign-extended immediate.
bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
  // Look through a truncate to the underlying wrapped symbol.
  if (N->getOpcode() == ISD::TRUNCATE)
    N = N->getOperand(0).getNode();
  if (N->getOpcode() != X86ISD::Wrapper)
    return false;

  auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0));
  if (!GA)
    return false;

  auto *GV = GA->getGlobal();
  std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange();
  if (CR)
    // With a known absolute range, require every value to be representable as
    // a Width-bit sign-extended immediate.
    return CR->getSignedMin().sge(-1ull << Width) &&
           CR->getSignedMax().slt(1ull << Width);
  // In the kernel code model, globals are in the negative 2GB of the address
  // space, so globals can be a sign extended 32-bit immediate.
  // In other code models, small globals are in the low 2GB of the address
  // space, so sign extending them is equivalent to zero extending them.
  return Width == 32 && !TM.isLargeGlobalValue(GV);
}
3329 
getCondFromNode(SDNode * N) const3330 X86::CondCode X86DAGToDAGISel::getCondFromNode(SDNode *N) const {
3331   assert(N->isMachineOpcode() && "Unexpected node");
3332   unsigned Opc = N->getMachineOpcode();
3333   const MCInstrDesc &MCID = getInstrInfo()->get(Opc);
3334   int CondNo = X86::getCondSrcNoFromDesc(MCID);
3335   if (CondNo < 0)
3336     return X86::COND_INVALID;
3337 
3338   return static_cast<X86::CondCode>(N->getConstantOperandVal(CondNo));
3339 }
3340 
3341 /// Test whether the given X86ISD::CMP node has any users that use a flag
3342 /// other than ZF.
bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const {
  // Examine each user of the node.
  for (SDUse &Use : Flags->uses()) {
    // Only check things that use the flags.
    if (Use.getResNo() != Flags.getResNo())
      continue;
    SDNode *User = Use.getUser();
    // Only examine CopyToReg uses that copy to EFLAGS.
    if (User->getOpcode() != ISD::CopyToReg ||
        cast<RegisterSDNode>(User->getOperand(1))->getReg() != X86::EFLAGS)
      return false;
    // Examine each user of the CopyToReg use.
    for (SDUse &FlagUse : User->uses()) {
      // Only examine the Flag result.
      if (FlagUse.getResNo() != 1)
        continue;
      // Anything unusual: assume conservatively.
      if (!FlagUse.getUser()->isMachineOpcode())
        return false;
      // Examine the condition code of the user.
      X86::CondCode CC = getCondFromNode(FlagUse.getUser());

      switch (CC) {
      // Comparisons which only use the zero flag.
      case X86::COND_E: case X86::COND_NE:
        continue;
      // Anything else: assume conservatively.
      default:
        return false;
      }
    }
  }
  // Every flag use was an E/NE comparison, i.e. ZF only.
  return true;
}
3377 
3378 /// Test whether the given X86ISD::CMP node has any uses which require the SF
3379 /// flag to be accurate.
bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const {
  // Examine each user of the node.
  for (SDUse &Use : Flags->uses()) {
    // Only check things that use the flags.
    if (Use.getResNo() != Flags.getResNo())
      continue;
    SDNode *User = Use.getUser();
    // Only examine CopyToReg uses that copy to EFLAGS.
    if (User->getOpcode() != ISD::CopyToReg ||
        cast<RegisterSDNode>(User->getOperand(1))->getReg() != X86::EFLAGS)
      return false;
    // Examine each user of the CopyToReg use.
    for (SDUse &FlagUse : User->uses()) {
      // Only examine the Flag result.
      if (FlagUse.getResNo() != 1)
        continue;
      // Anything unusual: assume conservatively.
      if (!FlagUse.getUser()->isMachineOpcode())
        return false;
      // Examine the condition code of the user.
      X86::CondCode CC = getCondFromNode(FlagUse.getUser());

      switch (CC) {
      // Comparisons which don't examine the SF flag.
      case X86::COND_A: case X86::COND_AE:
      case X86::COND_B: case X86::COND_BE:
      case X86::COND_E: case X86::COND_NE:
      case X86::COND_O: case X86::COND_NO:
      case X86::COND_P: case X86::COND_NP:
        continue;
      // Anything else: assume conservatively.
      default:
        return false;
      }
    }
  }
  // No user of the flags reads SF.
  return true;
}
3418 
mayUseCarryFlag(X86::CondCode CC)3419 static bool mayUseCarryFlag(X86::CondCode CC) {
3420   switch (CC) {
3421   // Comparisons which don't examine the CF flag.
3422   case X86::COND_O: case X86::COND_NO:
3423   case X86::COND_E: case X86::COND_NE:
3424   case X86::COND_S: case X86::COND_NS:
3425   case X86::COND_P: case X86::COND_NP:
3426   case X86::COND_L: case X86::COND_GE:
3427   case X86::COND_G: case X86::COND_LE:
3428     return false;
3429   // Anything else: assume conservatively.
3430   default:
3431     return true;
3432   }
3433 }
3434 
3435 /// Test whether the given node which sets flags has any uses which require the
3436 /// CF flag to be accurate.
bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const {
  // Examine each user of the node.
  for (SDUse &Use : Flags->uses()) {
    // Only check things that use the flags.
    if (Use.getResNo() != Flags.getResNo())
      continue;

    SDNode *User = Use.getUser();
    unsigned UserOpc = User->getOpcode();

    if (UserOpc == ISD::CopyToReg) {
      // Only examine CopyToReg uses that copy to EFLAGS.
      if (cast<RegisterSDNode>(User->getOperand(1))->getReg() != X86::EFLAGS)
        return false;
      // Examine each user of the CopyToReg use.
      for (SDUse &FlagUse : User->uses()) {
        // Only examine the Flag result.
        if (FlagUse.getResNo() != 1)
          continue;
        // Anything unusual: assume conservatively.
        if (!FlagUse.getUser()->isMachineOpcode())
          return false;
        // Examine the condition code of the user.
        X86::CondCode CC = getCondFromNode(FlagUse.getUser());

        if (mayUseCarryFlag(CC))
          return false;
      }

      // This CopyToReg is ok. Move on to the next user.
      continue;
    }

    // This might be an unselected node. So look for the pre-isel opcodes that
    // use flags.
    unsigned CCOpNo;
    switch (UserOpc) {
    default:
      // Something unusual. Be conservative.
      return false;
    case X86ISD::SETCC:       CCOpNo = 0; break;
    case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
    case X86ISD::CMOV:        CCOpNo = 2; break;
    case X86ISD::BRCOND:      CCOpNo = 2; break;
    }

    // Check the condition code operand of the pre-isel user directly.
    X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
    if (mayUseCarryFlag(CC))
      return false;
  }
  // No user of the flags reads CF.
  return true;
}
3489 
3490 /// Check whether or not the chain ending in StoreNode is suitable for doing
3491 /// the {load; op; store} to modify transformation.
static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
                                        SDValue StoredVal, SelectionDAG *CurDAG,
                                        unsigned LoadOpNo,
                                        LoadSDNode *&LoadNode,
                                        SDValue &InputChain) {
  // On success, LoadNode and InputChain are set for the caller. LoadOpNo
  // selects which operand of StoredVal is expected to be the fused load.

  // Is the stored value result 0 of the operation?
  if (StoredVal.getResNo() != 0) return false;

  // Are there other uses of the operation other than the store?
  if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false;

  // Is the store non-extending and non-indexed?
  if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
    return false;

  SDValue Load = StoredVal->getOperand(LoadOpNo);
  // Is the stored value a non-extending and non-indexed load?
  if (!ISD::isNormalLoad(Load.getNode())) return false;

  // Return LoadNode by reference.
  LoadNode = cast<LoadSDNode>(Load);

  // Is store the only read of the loaded value?
  if (!Load.hasOneUse())
    return false;

  // Is the address of the store the same as the load?
  if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
      LoadNode->getOffset() != StoreNode->getOffset())
    return false;

  bool FoundLoad = false;
  SmallVector<SDValue, 4> ChainOps;
  SmallVector<const SDNode *, 4> LoopWorklist;
  SmallPtrSet<const SDNode *, 16> Visited;
  // Cap on nodes visited during the predecessor (cycle) search below.
  const unsigned int Max = 1024;

  //  Visualization of Load-Op-Store fusion:
  // -------------------------
  // Legend:
  //    *-lines = Chain operand dependencies.
  //    |-lines = Normal operand dependencies.
  //    Dependencies flow down and right. n-suffix references multiple nodes.
  //
  //        C                        Xn  C
  //        *                         *  *
  //        *                          * *
  //  Xn  A-LD    Yn                    TF         Yn
  //   *    * \   |                       *        |
  //    *   *  \  |                        *       |
  //     *  *   \ |             =>       A--LD_OP_ST
  //      * *    \|                                 \
  //       TF    OP                                  \
  //         *   | \                                  Zn
  //          *  |  \
  //         A-ST    Zn
  //

  // This merge induced dependences from: #1: Xn -> LD, OP, Zn
  //                                      #2: Yn -> LD
  //                                      #3: ST -> Zn

  // Ensure the transform is safe by checking for the dual
  // dependencies to make sure we do not induce a loop.

  // As LD is a predecessor to both OP and ST we can do this by checking:
  //  a). if LD is a predecessor to a member of Xn or Yn.
  //  b). if a Zn is a predecessor to ST.

  // However, (b) can only occur through being a chain predecessor to
  // ST, which is the same as Zn being a member or predecessor of Xn,
  // which is a subset of LD being a predecessor of Xn. So it's
  // subsumed by check (a).

  SDValue Chain = StoreNode->getChain();

  // Gather X elements in ChainOps.
  if (Chain == Load.getValue(1)) {
    FoundLoad = true;
    ChainOps.push_back(Load.getOperand(0));
  } else if (Chain.getOpcode() == ISD::TokenFactor) {
    for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
      SDValue Op = Chain.getOperand(i);
      if (Op == Load.getValue(1)) {
        FoundLoad = true;
        // Drop Load, but keep its chain. No cycle check necessary.
        ChainOps.push_back(Load.getOperand(0));
        continue;
      }
      LoopWorklist.push_back(Op.getNode());
      ChainOps.push_back(Op);
    }
  }

  if (!FoundLoad)
    return false;

  // Worklist is currently Xn. Add Yn to worklist.
  for (SDValue Op : StoredVal->ops())
    if (Op.getNode() != LoadNode)
      LoopWorklist.push_back(Op.getNode());

  // Check (a) if Load is a predecessor to Xn + Yn
  if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
                                   true))
    return false;

  // Merge the load's incoming chain with the store's other chain operands.
  InputChain =
      CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
  return true;
}
3603 
3604 // Change a chain of {load; op; store} of the same value into a simple op
3605 // through memory of that value, if the uses of the modified value and its
3606 // address are suitable.
3607 //
// The tablegen memory operand pattern is currently not able to match
3609 // the case where the EFLAGS on the original operation are used.
3610 //
3611 // To move this to tablegen, we'll need to improve tablegen to allow flags to
3612 // be transferred from a node in the pattern to the result node, probably with
3613 // a new keyword. For example, we have this
3614 // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
3615 //  [(store (add (loadi64 addr:$dst), -1), addr:$dst)]>;
3616 // but maybe need something like this
3617 // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
3618 //  [(store (X86add_flag (loadi64 addr:$dst), -1), addr:$dst),
3619 //   (transferrable EFLAGS)]>;
3620 //
3621 // Until then, we manually fold these and instruction select the operation
3622 // here.
// Fold a (load; op; store) read-modify-write sequence over the same address
// into a single memory-destination x86 instruction (e.g. `add [mem], reg`),
// selecting the machine node directly. \p Node is the StoreSDNode; returns
// true if the store (and the fused load/op) were replaced.
bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
  auto *StoreNode = cast<StoreSDNode>(Node);
  SDValue StoredVal = StoreNode->getOperand(1);
  unsigned Opc = StoredVal->getOpcode();

  // Before we try to select anything, make sure this is memory operand size
  // and opcode we can handle. Note that this must match the code below that
  // actually lowers the opcodes.
  EVT MemVT = StoreNode->getMemoryVT();
  if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 &&
      MemVT != MVT::i8)
    return false;

  bool IsCommutable = false;
  bool IsNegate = false;
  switch (Opc) {
  default:
    return false;
  case X86ISD::SUB:
    // SUB with a zero LHS is a negate; it is matched to NEGm below rather
    // than to SUBmr/SUBmi.
    IsNegate = isNullConstant(StoredVal.getOperand(0));
    break;
  case X86ISD::SBB:
    break;
  case X86ISD::ADD:
  case X86ISD::ADC:
  case X86ISD::AND:
  case X86ISD::OR:
  case X86ISD::XOR:
    IsCommutable = true;
    break;
  }

  // For a negate the load must be operand 1 (operand 0 is the zero constant).
  unsigned LoadOpNo = IsNegate ? 1 : 0;
  LoadSDNode *LoadNode = nullptr;
  SDValue InputChain;
  if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
                                   LoadNode, InputChain)) {
    if (!IsCommutable)
      return false;

    // This operation is commutable, try the other operand.
    LoadOpNo = 1;
    if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
                                     LoadNode, InputChain))
      return false;
  }

  // Decompose the address into the standard 5-operand x86 memory form.
  SDValue Base, Scale, Index, Disp, Segment;
  if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp,
                  Segment))
    return false;

  // Pick the width-specific machine opcode matching MemVT (i64/i32/i16/i8).
  auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
                          unsigned Opc8) {
    switch (MemVT.getSimpleVT().SimpleTy) {
    case MVT::i64:
      return Opc64;
    case MVT::i32:
      return Opc32;
    case MVT::i16:
      return Opc16;
    case MVT::i8:
      return Opc8;
    default:
      llvm_unreachable("Invalid size!");
    }
  };

  MachineSDNode *Result;
  switch (Opc) {
  case X86ISD::SUB:
    // Handle negate.
    if (IsNegate) {
      unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m,
                                     X86::NEG8m);
      const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
      Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
                                      MVT::Other, Ops);
      break;
    }
   [[fallthrough]];
  case X86ISD::ADD:
    // Try to match inc/dec.
    if (!Subtarget->slowIncDec() || CurDAG->shouldOptForSize()) {
      bool IsOne = isOneConstant(StoredVal.getOperand(1));
      bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1));
      // ADD/SUB with 1/-1 and carry flag isn't used can use inc/dec.
      if ((IsOne || IsNegOne) && hasNoCarryFlagUses(StoredVal.getValue(1))) {
        // ADD +1 / SUB -1 select INC; ADD -1 / SUB +1 select DEC.
        unsigned NewOpc =
          ((Opc == X86ISD::ADD) == IsOne)
              ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)
              : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
        const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
        Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
                                        MVT::Other, Ops);
        break;
      }
    }
    [[fallthrough]];
  case X86ISD::ADC:
  case X86ISD::SBB:
  case X86ISD::AND:
  case X86ISD::OR:
  case X86ISD::XOR: {
    // Map the X86ISD opcode to the register-to-memory instruction of the
    // right width.
    auto SelectRegOpcode = [SelectOpcode](unsigned Opc) {
      switch (Opc) {
      case X86ISD::ADD:
        return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
                            X86::ADD8mr);
      case X86ISD::ADC:
        return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr,
                            X86::ADC8mr);
      case X86ISD::SUB:
        return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
                            X86::SUB8mr);
      case X86ISD::SBB:
        return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr,
                            X86::SBB8mr);
      case X86ISD::AND:
        return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
                            X86::AND8mr);
      case X86ISD::OR:
        return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr);
      case X86ISD::XOR:
        return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr,
                            X86::XOR8mr);
      default:
        llvm_unreachable("Invalid opcode!");
      }
    };
    // Map the X86ISD opcode to the immediate-to-memory instruction of the
    // right width (64-bit forms take a sign-extended 32-bit immediate).
    auto SelectImmOpcode = [SelectOpcode](unsigned Opc) {
      switch (Opc) {
      case X86ISD::ADD:
        return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
                            X86::ADD8mi);
      case X86ISD::ADC:
        return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi,
                            X86::ADC8mi);
      case X86ISD::SUB:
        return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
                            X86::SUB8mi);
      case X86ISD::SBB:
        return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi,
                            X86::SBB8mi);
      case X86ISD::AND:
        return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
                            X86::AND8mi);
      case X86ISD::OR:
        return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi,
                            X86::OR8mi);
      case X86ISD::XOR:
        return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi,
                            X86::XOR8mi);
      default:
        llvm_unreachable("Invalid opcode!");
      }
    };

    unsigned NewOpc = SelectRegOpcode(Opc);
    // The non-load operand of the binop.
    SDValue Operand = StoredVal->getOperand(1-LoadOpNo);

    // See if the operand is a constant that we can fold into an immediate
    // operand.
    if (auto *OperandC = dyn_cast<ConstantSDNode>(Operand)) {
      int64_t OperandV = OperandC->getSExtValue();

      // Check if we can shrink the operand enough to fit in an immediate (or
      // fit into a smaller immediate) by negating it and switching the
      // operation. Only legal when the carry flag (changed by the swap) has
      // no users.
      if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) &&
          ((MemVT != MVT::i8 && !isInt<8>(OperandV) && isInt<8>(-OperandV)) ||
           (MemVT == MVT::i64 && !isInt<32>(OperandV) &&
            isInt<32>(-OperandV))) &&
          hasNoCarryFlagUses(StoredVal.getValue(1))) {
        OperandV = -OperandV;
        Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
      }

      // Use the immediate form unless the constant needs more than 32 bits
      // for a 64-bit operation (no 64-bit immediate encodings exist).
      if (MemVT != MVT::i64 || isInt<32>(OperandV)) {
        Operand = CurDAG->getSignedTargetConstant(OperandV, SDLoc(Node), MemVT);
        NewOpc = SelectImmOpcode(Opc);
      }
    }

    if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) {
      // ADC/SBB consume the carry: copy the incoming carry value into EFLAGS
      // and glue it to the new node.
      SDValue CopyTo =
          CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS,
                               StoredVal.getOperand(2), SDValue());

      const SDValue Ops[] = {Base,    Scale,   Index,  Disp,
                             Segment, Operand, CopyTo, CopyTo.getValue(1)};
      Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
                                      Ops);
    } else {
      const SDValue Ops[] = {Base,    Scale,   Index,     Disp,
                             Segment, Operand, InputChain};
      Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
                                      Ops);
    }
    break;
  }
  default:
    llvm_unreachable("Invalid opcode!");
  }

  // The fused node accesses both the load's and the store's memory.
  MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(),
                                 LoadNode->getMemOperand()};
  CurDAG->setNodeMemRefs(Result, MemOps);

  // Rewire users: Result value 0 is EFLAGS (i32), value 1 is the chain.
  // Update Load Chain uses as well.
  ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1));
  ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
  ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
  CurDAG->RemoveDeadNode(Node);
  return true;
}
3839 
3840 // See if this is an  X & Mask  that we can match to BEXTR/BZHI.
3841 // Where Mask is one of the following patterns:
3842 //   a) x &  (1 << nbits) - 1
3843 //   b) x & ~(-1 << nbits)
3844 //   c) x &  (-1 >> (32 - y))
3845 //   d) x << (32 - y) >> (32 - y)
3846 //   e) (1 << nbits) - 1
// Try to select \p Node (an AND-mask, an SRL of an SHL, or a bare low-bit
// mask) as a BMI BEXTR or BMI2 BZHI bit-field extract. Returns true and
// replaces the node on success.
bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
  assert(
      (Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::AND ||
       Node->getOpcode() == ISD::SRL) &&
      "Should be either an and-mask, or right-shift after clearing high bits.");

  // BEXTR is BMI instruction, BZHI is BMI2 instruction. We need at least one.
  if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())
    return false;

  MVT NVT = Node->getSimpleValueType(0);

  // Only supported for 32 and 64 bits.
  if (NVT != MVT::i32 && NVT != MVT::i64)
    return false;

  // The number of low bits to keep; set by the matchers below.
  SDValue NBits;
  // True when NBits actually holds the count of *high* bits cleared, so the
  // emission code must compute (bitwidth - NBits) first.
  bool NegateNBits;

  // If we have BMI2's BZHI, we are ok with multi-use patterns.
  // Else, if we only have BMI1's BEXTR, we require one-use.
  const bool AllowExtraUsesByDefault = Subtarget->hasBMI2();
  auto checkUses = [AllowExtraUsesByDefault](
                       SDValue Op, unsigned NUses,
                       std::optional<bool> AllowExtraUses) {
    return AllowExtraUses.value_or(AllowExtraUsesByDefault) ||
           Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo());
  };
  auto checkOneUse = [checkUses](SDValue Op,
                                 std::optional<bool> AllowExtraUses =
                                     std::nullopt) {
    return checkUses(Op, 1, AllowExtraUses);
  };
  auto checkTwoUse = [checkUses](SDValue Op,
                                 std::optional<bool> AllowExtraUses =
                                     std::nullopt) {
    return checkUses(Op, 2, AllowExtraUses);
  };

  // Look through a one-use i64 -> i32 truncate, if present.
  auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) {
    if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) {
      assert(V.getSimpleValueType() == MVT::i32 &&
             V.getOperand(0).getSimpleValueType() == MVT::i64 &&
             "Expected i64 -> i32 truncation");
      V = V.getOperand(0);
    }
    return V;
  };

  // a) x & ((1 << nbits) + (-1))
  auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation, &NBits,
                        &NegateNBits](SDValue Mask) -> bool {
    // Match `add`. Must only have one use!
    if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask))
      return false;
    // We should be adding all-ones constant (i.e. subtracting one.)
    if (!isAllOnesConstant(Mask->getOperand(1)))
      return false;
    // Match `1 << nbits`. Might be truncated. Must only have one use!
    SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
    if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
      return false;
    if (!isOneConstant(M0->getOperand(0)))
      return false;
    NBits = M0->getOperand(1);
    NegateNBits = false;
    return true;
  };

  // Check that V is all-ones in (at least) the low NVT-width bits.
  auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) {
    V = peekThroughOneUseTruncation(V);
    return CurDAG->MaskedValueIsAllOnes(
        V, APInt::getLowBitsSet(V.getSimpleValueType().getSizeInBits(),
                                NVT.getSizeInBits()));
  };

  // b) x & ~(-1 << nbits)
  auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation,
                        &NBits, &NegateNBits](SDValue Mask) -> bool {
    // Match `~()`. Must only have one use!
    if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask))
      return false;
    // The -1 only has to be all-ones for the final Node's NVT.
    if (!isAllOnes(Mask->getOperand(1)))
      return false;
    // Match `-1 << nbits`. Might be truncated. Must only have one use!
    SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
    if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
      return false;
    // The -1 only has to be all-ones for the final Node's NVT.
    if (!isAllOnes(M0->getOperand(0)))
      return false;
    NBits = M0->getOperand(1);
    NegateNBits = false;
    return true;
  };

  // Try to match potentially-truncated shift amount as `(bitwidth - y)`,
  // or leave the shift amount as-is, but then we'll have to negate it.
  auto canonicalizeShiftAmt = [&NBits, &NegateNBits](SDValue ShiftAmt,
                                                     unsigned Bitwidth) {
    NBits = ShiftAmt;
    NegateNBits = true;
    // Skip over a truncate of the shift amount, if any.
    if (NBits.getOpcode() == ISD::TRUNCATE)
      NBits = NBits.getOperand(0);
    // Try to match the shift amount as (bitwidth - y). It should go away, too.
    // If it doesn't match, that's fine, we'll just negate it ourselves.
    if (NBits.getOpcode() != ISD::SUB)
      return;
    auto *V0 = dyn_cast<ConstantSDNode>(NBits.getOperand(0));
    if (!V0 || V0->getZExtValue() != Bitwidth)
      return;
    NBits = NBits.getOperand(1);
    NegateNBits = false;
  };

  // c) x &  (-1 >> z)  but then we'll have to subtract z from bitwidth
  //   or
  // c) x &  (-1 >> (32 - y))
  auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation, &NegateNBits,
                        canonicalizeShiftAmt](SDValue Mask) -> bool {
    // The mask itself may be truncated.
    Mask = peekThroughOneUseTruncation(Mask);
    unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits();
    // Match `l>>`. Must only have one use!
    if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask))
      return false;
    // We should be shifting truly all-ones constant.
    if (!isAllOnesConstant(Mask.getOperand(0)))
      return false;
    SDValue M1 = Mask.getOperand(1);
    // The shift amount should not be used externally.
    if (!checkOneUse(M1))
      return false;
    canonicalizeShiftAmt(M1, Bitwidth);
    // Pattern c. is non-canonical, and is expanded into pattern d. iff there
    // is no extra use of the mask. Clearly, there was one since we are here.
    // But at the same time, if we need to negate the shift amount,
    // then we don't want the mask to stick around, else it's unprofitable.
    return !NegateNBits;
  };

  // The value whose low bits are being extracted.
  SDValue X;

  // d) x << z >> z  but then we'll have to subtract z from bitwidth
  //   or
  // d) x << (32 - y) >> (32 - y)
  auto matchPatternD = [checkOneUse, checkTwoUse, canonicalizeShiftAmt,
                        AllowExtraUsesByDefault, &NegateNBits,
                        &X](SDNode *Node) -> bool {
    if (Node->getOpcode() != ISD::SRL)
      return false;
    SDValue N0 = Node->getOperand(0);
    if (N0->getOpcode() != ISD::SHL)
      return false;
    unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits();
    SDValue N1 = Node->getOperand(1);
    SDValue N01 = N0->getOperand(1);
    // Both of the shifts must be by the exact same value.
    if (N1 != N01)
      return false;
    canonicalizeShiftAmt(N1, Bitwidth);
    // There should not be any external uses of the inner shift / shift amount.
    // Note that while we are generally okay with external uses given BMI2,
    // iff we need to negate the shift amount, we are not okay with extra uses.
    const bool AllowExtraUses = AllowExtraUsesByDefault && !NegateNBits;
    if (!checkOneUse(N0, AllowExtraUses) || !checkTwoUse(N1, AllowExtraUses))
      return false;
    X = N0->getOperand(0);
    return true;
  };

  auto matchLowBitMask = [matchPatternA, matchPatternB,
                          matchPatternC](SDValue Mask) -> bool {
    return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask);
  };

  if (Node->getOpcode() == ISD::AND) {
    X = Node->getOperand(0);
    SDValue Mask = Node->getOperand(1);

    if (matchLowBitMask(Mask)) {
      // Great.
    } else {
      // AND is commutative; the mask may be on either side.
      std::swap(X, Mask);
      if (!matchLowBitMask(Mask))
        return false;
    }
  } else if (matchLowBitMask(SDValue(Node, 0))) {
    // Pattern e: the node itself is the mask; extract from all-ones.
    X = CurDAG->getAllOnesConstant(SDLoc(Node), NVT);
  } else if (!matchPatternD(Node))
    return false;

  // If we need to negate the shift amount, require BMI2 BZHI support.
  // It's just too unprofitable for BMI1 BEXTR.
  if (NegateNBits && !Subtarget->hasBMI2())
    return false;

  SDLoc DL(Node);

  // Truncate the shift amount.
  NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
  insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);

  // Insert 8-bit NBits into lowest 8 bits of 32-bit register.
  // All the other bits are undefined, we do not care about them.
  SDValue ImplDef = SDValue(
      CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0);
  insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef);

  SDValue SRIdxVal = CurDAG->getTargetConstant(X86::sub_8bit, DL, MVT::i32);
  insertDAGNode(*CurDAG, SDValue(Node, 0), SRIdxVal);
  NBits = SDValue(CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
                                         MVT::i32, ImplDef, NBits, SRIdxVal),
                  0);
  insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);

  // We might have matched the amount of high bits to be cleared,
  // but we want the amount of low bits to be kept, so negate it then.
  if (NegateNBits) {
    SDValue BitWidthC = CurDAG->getConstant(NVT.getSizeInBits(), DL, MVT::i32);
    insertDAGNode(*CurDAG, SDValue(Node, 0), BitWidthC);

    NBits = CurDAG->getNode(ISD::SUB, DL, MVT::i32, BitWidthC, NBits);
    insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
  }

  if (Subtarget->hasBMI2()) {
    // Great, just emit the BZHI..
    if (NVT != MVT::i32) {
      // But have to place the bit count into the wide-enough register first.
      NBits = CurDAG->getNode(ISD::ANY_EXTEND, DL, NVT, NBits);
      insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
    }

    SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits);
    ReplaceNode(Node, Extract.getNode());
    SelectCode(Extract.getNode());
    return true;
  }

  // Else, if we do *NOT* have BMI2, let's find out if the 'X' is
  // *logically* shifted (potentially with one-use trunc inbetween),
  // and the truncation was the only use of the shift,
  // and if so look past one-use truncation.
  {
    SDValue RealX = peekThroughOneUseTruncation(X);
    // FIXME: only if the shift is one-use?
    if (RealX != X && RealX.getOpcode() == ISD::SRL)
      X = RealX;
  }

  MVT XVT = X.getSimpleValueType();

  // Else, emitting BEXTR requires one more step.
  // The 'control' of BEXTR has the pattern of:
  // [15...8 bit][ 7...0 bit] location
  // [ bit count][     shift] name
  // I.e. 0b000000011'00000001 means  (x >> 0b1) & 0b11

  // Shift NBits left by 8 bits, thus producing 'control'.
  // This makes the low 8 bits to be zero.
  SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);
  insertDAGNode(*CurDAG, SDValue(Node, 0), C8);
  SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8);
  insertDAGNode(*CurDAG, SDValue(Node, 0), Control);

  // If the 'X' is *logically* shifted, we can fold that shift into 'control'.
  // FIXME: only if the shift is one-use?
  if (X.getOpcode() == ISD::SRL) {
    SDValue ShiftAmt = X.getOperand(1);
    X = X.getOperand(0);

    assert(ShiftAmt.getValueType() == MVT::i8 &&
           "Expected shift amount to be i8");

    // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
    // We could zext to i16 in some form, but we intentionally don't do that.
    SDValue OrigShiftAmt = ShiftAmt;
    ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt);
    insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt);

    // And now 'or' these low 8 bits of shift amount into the 'control'.
    Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt);
    insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
  }

  // But have to place the 'control' into the wide-enough register first.
  if (XVT != MVT::i32) {
    Control = CurDAG->getNode(ISD::ANY_EXTEND, DL, XVT, Control);
    insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
  }

  // And finally, form the BEXTR itself.
  SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, XVT, X, Control);

  // The 'X' was originally truncated. Do that now.
  if (XVT != NVT) {
    insertDAGNode(*CurDAG, SDValue(Node, 0), Extract);
    Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract);
  }

  ReplaceNode(Node, Extract.getNode());
  SelectCode(Extract.getNode());

  return true;
}
4155 
// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
// Returns the selected machine node, or nullptr if the pattern does not
// match or is not profitable on this subtarget.
MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
  MVT NVT = Node->getSimpleValueType(0);
  SDLoc dl(Node);

  SDValue N0 = Node->getOperand(0);
  SDValue N1 = Node->getOperand(1);

  // If we have TBM we can use an immediate for the control. If we have BMI
  // we should only do this if the BEXTR instruction is implemented well.
  // Otherwise moving the control into a register makes this more costly.
  // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
  // hoisting the move immediate would make it worthwhile with a less optimal
  // BEXTR?
  bool PreferBEXTR =
      Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR());
  if (!PreferBEXTR && !Subtarget->hasBMI2())
    return nullptr;

  // Must have a shift right.
  if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
    return nullptr;

  // Shift can't have additional users.
  if (!N0->hasOneUse())
    return nullptr;

  // Only supported for 32 and 64 bits.
  if (NVT != MVT::i32 && NVT != MVT::i64)
    return nullptr;

  // Shift amount and RHS of and must be constant.
  auto *MaskCst = dyn_cast<ConstantSDNode>(N1);
  auto *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
  if (!MaskCst || !ShiftCst)
    return nullptr;

  // And RHS must be a mask.
  uint64_t Mask = MaskCst->getZExtValue();
  if (!isMask_64(Mask))
    return nullptr;

  uint64_t Shift = ShiftCst->getZExtValue();
  uint64_t MaskSize = llvm::popcount(Mask);

  // Don't interfere with something that can be handled by extracting AH.
  // TODO: If we are able to fold a load, BEXTR might still be better than AH.
  if (Shift == 8 && MaskSize == 8)
    return nullptr;

  // Make sure we are only using bits that were in the original value, not
  // shifted in.
  if (Shift + MaskSize > NVT.getSizeInBits())
    return nullptr;

  // BZHI, if available, is always fast, unlike BEXTR. But even if we decide
  // that we can't use BEXTR, it is only worthwhile using BZHI if the mask
  // does not fit into 32 bits. Load folding is not a sufficient reason.
  if (!PreferBEXTR && MaskSize <= 32)
    return nullptr;

  SDValue Control;
  unsigned ROpc, MOpc;

// Pick the EVEX-encoded (EGPR-capable) variant when extended GPRs are
// available.
#define GET_EGPR_IF_ENABLED(OPC) (Subtarget->hasEGPR() ? OPC##_EVEX : OPC)
  if (!PreferBEXTR) {
    assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then.");
    // If we can't make use of BEXTR then we can't fuse shift+mask stages.
    // Let's perform the mask first, and apply shift later. Note that we need to
    // widen the mask to account for the fact that we'll apply shift afterwards!
    Control = CurDAG->getTargetConstant(Shift + MaskSize, dl, NVT);
    ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rr)
                           : GET_EGPR_IF_ENABLED(X86::BZHI32rr);
    MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rm)
                           : GET_EGPR_IF_ENABLED(X86::BZHI32rm);
    unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
    Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
  } else {
    // The 'control' of BEXTR has the pattern of:
    // [15...8 bit][ 7...0 bit] location
    // [ bit count][     shift] name
    // I.e. 0b000000011'00000001 means  (x >> 0b1) & 0b11
    Control = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
    if (Subtarget->hasTBM()) {
      // TBM's BEXTRI takes the control as an immediate directly.
      ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
      MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
    } else {
      assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then.");
      // BMI requires the immediate to placed in a register.
      ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rr)
                             : GET_EGPR_IF_ENABLED(X86::BEXTR32rr);
      MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rm)
                             : GET_EGPR_IF_ENABLED(X86::BEXTR32rm);
      unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
      Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
    }
  }

  MachineSDNode *NewNode;
  SDValue Input = N0->getOperand(0);
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
    SDValue Ops[] = {
        Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(0)};
    SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
    NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
    // Update the chain.
    ReplaceUses(Input.getValue(1), SDValue(NewNode, 2));
    // Record the mem-refs
    CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
  } else {
    NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, Control);
  }

  if (!PreferBEXTR) {
    // We still need to apply the shift.
    SDValue ShAmt = CurDAG->getTargetConstant(Shift, dl, NVT);
    unsigned NewOpc = NVT == MVT::i64 ? GET_ND_IF_ENABLED(X86::SHR64ri)
                                      : GET_ND_IF_ENABLED(X86::SHR32ri);
    NewNode =
        CurDAG->getMachineNode(NewOpc, dl, NVT, SDValue(NewNode, 0), ShAmt);
  }

  return NewNode;
}
4281 
4282 // Emit a PCMISTR(I/M) instruction.
emitPCMPISTR(unsigned ROpc,unsigned MOpc,bool MayFoldLoad,const SDLoc & dl,MVT VT,SDNode * Node)4283 MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
4284                                              bool MayFoldLoad, const SDLoc &dl,
4285                                              MVT VT, SDNode *Node) {
4286   SDValue N0 = Node->getOperand(0);
4287   SDValue N1 = Node->getOperand(1);
4288   SDValue Imm = Node->getOperand(2);
4289   auto *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
4290   Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
4291 
4292   // Try to fold a load. No need to check alignment.
4293   SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4294   if (MayFoldLoad && tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4295     SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
4296                       N1.getOperand(0) };
4297     SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
4298     MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4299     // Update the chain.
4300     ReplaceUses(N1.getValue(1), SDValue(CNode, 2));
4301     // Record the mem-refs
4302     CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
4303     return CNode;
4304   }
4305 
4306   SDValue Ops[] = { N0, N1, Imm };
4307   SDVTList VTs = CurDAG->getVTList(VT, MVT::i32);
4308   MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
4309   return CNode;
4310 }
4311 
4312 // Emit a PCMESTR(I/M) instruction. Also return the Glue result in case we need
4313 // to emit a second instruction after this one. This is needed since we have two
4314 // copyToReg nodes glued before this and we need to continue that glue through.
emitPCMPESTR(unsigned ROpc,unsigned MOpc,bool MayFoldLoad,const SDLoc & dl,MVT VT,SDNode * Node,SDValue & InGlue)4315 MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
4316                                              bool MayFoldLoad, const SDLoc &dl,
4317                                              MVT VT, SDNode *Node,
4318                                              SDValue &InGlue) {
4319   SDValue N0 = Node->getOperand(0);
4320   SDValue N2 = Node->getOperand(2);
4321   SDValue Imm = Node->getOperand(4);
4322   auto *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
4323   Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
4324 
4325   // Try to fold a load. No need to check alignment.
4326   SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4327   if (MayFoldLoad && tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4328     SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
4329                       N2.getOperand(0), InGlue };
4330     SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
4331     MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4332     InGlue = SDValue(CNode, 3);
4333     // Update the chain.
4334     ReplaceUses(N2.getValue(1), SDValue(CNode, 2));
4335     // Record the mem-refs
4336     CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N2)->getMemOperand()});
4337     return CNode;
4338   }
4339 
4340   SDValue Ops[] = { N0, N2, Imm, InGlue };
4341   SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue);
4342   MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
4343   InGlue = SDValue(CNode, 2);
4344   return CNode;
4345 }
4346 
// Try to simplify the shift amount of a scalar shift N by exploiting the
// hardware's implicit masking of the amount (mod 32 for 8/16/32-bit shifts,
// mod 64 for 64-bit shifts). ADD/SUB/XOR with multiples of the mask width
// can be dropped or turned into cheaper NOT/NEG forms. Returns true if N
// was rewritten and (re)selected, false if no change was made.
bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
  EVT VT = N->getValueType(0);

  // Only handle scalar shifts.
  if (VT.isVector())
    return false;

  // Narrower shifts only mask to 5 bits in hardware.
  unsigned Size = VT == MVT::i64 ? 64 : 32;

  SDValue OrigShiftAmt = N->getOperand(1);
  SDValue ShiftAmt = OrigShiftAmt;
  SDLoc DL(N);

  // Skip over a truncate of the shift amount; the hardware mask makes the
  // truncated high bits irrelevant.
  if (ShiftAmt->getOpcode() == ISD::TRUNCATE)
    ShiftAmt = ShiftAmt->getOperand(0);

  // This function is called after X86DAGToDAGISel::matchBitExtract(),
  // so we are not afraid that we might mess up BZHI/BEXTR pattern.

  SDValue NewShiftAmt;
  if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB ||
      ShiftAmt->getOpcode() == ISD::XOR) {
    SDValue Add0 = ShiftAmt->getOperand(0);
    SDValue Add1 = ShiftAmt->getOperand(1);
    auto *Add0C = dyn_cast<ConstantSDNode>(Add0);
    auto *Add1C = dyn_cast<ConstantSDNode>(Add1);
    // If we are shifting by X+/-/^N where N == 0 mod Size, then just shift by X
    // to avoid the ADD/SUB/XOR.
    if (Add1C && Add1C->getAPIntValue().urem(Size) == 0) {
      NewShiftAmt = Add0;

    } else if (ShiftAmt->getOpcode() != ISD::ADD && ShiftAmt.hasOneUse() &&
               ((Add0C && Add0C->getAPIntValue().urem(Size) == Size - 1) ||
                (Add1C && Add1C->getAPIntValue().urem(Size) == Size - 1))) {
      // If we are doing a NOT on just the lower bits with (Size*N-1) -/^ X
      // we can replace it with a NOT. In the XOR case it may save some code
      // size, in the SUB case it also may save a move.
      assert(Add0C == nullptr || Add1C == nullptr);

      // We can only do N-X, not X-N
      if (ShiftAmt->getOpcode() == ISD::SUB && Add0C == nullptr)
        return false;

      EVT OpVT = ShiftAmt.getValueType();

      // Build NOT as XOR with all-ones.
      SDValue AllOnes = CurDAG->getAllOnesConstant(DL, OpVT);
      NewShiftAmt = CurDAG->getNode(ISD::XOR, DL, OpVT,
                                    Add0C == nullptr ? Add0 : Add1, AllOnes);
      insertDAGNode(*CurDAG, OrigShiftAmt, AllOnes);
      insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
      // If we are shifting by N-X where N == 0 mod Size, then just shift by
      // -X to generate a NEG instead of a SUB of a constant.
    } else if (ShiftAmt->getOpcode() == ISD::SUB && Add0C &&
               Add0C->getZExtValue() != 0) {
      EVT SubVT = ShiftAmt.getValueType();
      SDValue X;
      if (Add0C->getZExtValue() % Size == 0)
        X = Add1;
      else if (ShiftAmt.hasOneUse() && Size == 64 &&
               Add0C->getZExtValue() % 32 == 0) {
        // We have a 64-bit shift by (n*32-x), turn it into -(x+n*32).
        // This is mainly beneficial if we already compute (x+n*32).
        if (Add1.getOpcode() == ISD::TRUNCATE) {
          Add1 = Add1.getOperand(0);
          SubVT = Add1.getValueType();
        }
        if (Add0.getValueType() != SubVT) {
          Add0 = CurDAG->getZExtOrTrunc(Add0, DL, SubVT);
          insertDAGNode(*CurDAG, OrigShiftAmt, Add0);
        }

        X = CurDAG->getNode(ISD::ADD, DL, SubVT, Add1, Add0);
        insertDAGNode(*CurDAG, OrigShiftAmt, X);
      } else
        return false;
      // Insert a negate op.
      // TODO: This isn't guaranteed to replace the sub if there is a logic cone
      // that uses it that's not a shift.
      SDValue Zero = CurDAG->getConstant(0, DL, SubVT);
      SDValue Neg = CurDAG->getNode(ISD::SUB, DL, SubVT, Zero, X);
      NewShiftAmt = Neg;

      // Insert these operands into a valid topological order so they can
      // get selected independently.
      insertDAGNode(*CurDAG, OrigShiftAmt, Zero);
      insertDAGNode(*CurDAG, OrigShiftAmt, Neg);
    } else
      return false;
  } else
    return false;

  if (NewShiftAmt.getValueType() != MVT::i8) {
    // Need to truncate the shift amount.
    NewShiftAmt = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NewShiftAmt);
    // Add to a correct topological ordering.
    insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
  }

  // Insert a new mask to keep the shift amount legal. This should be removed
  // by isel patterns.
  NewShiftAmt = CurDAG->getNode(ISD::AND, DL, MVT::i8, NewShiftAmt,
                                CurDAG->getConstant(Size - 1, DL, MVT::i8));
  // Place in a correct topological ordering.
  insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);

  SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, N->getOperand(0),
                                                   NewShiftAmt);
  if (UpdatedNode != N) {
    // If we found an existing node, we should replace ourselves with that node
    // and wait for it to be selected after its other users.
    ReplaceNode(N, UpdatedNode);
    return true;
  }

  // If the original shift amount is now dead, delete it so that we don't run
  // it through isel.
  if (OrigShiftAmt.getNode()->use_empty())
    CurDAG->RemoveDeadNode(OrigShiftAmt.getNode());

  // Now that we've optimized the shift amount, defer to normal isel to get
  // load folding and legacy vs BMI2 selection without repeating it here.
  SelectCode(N);
  return true;
}
4473 
// For (x << C1) op C2 where op is AND/OR/XOR, try to rewrite it as
// ((x op (C2 >> C1)) << C1) when the shifted constant has a smaller
// immediate encoding. Returns true if N was replaced and reselected.
bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
  MVT NVT = N->getSimpleValueType(0);
  unsigned Opcode = N->getOpcode();
  SDLoc dl(N);

  // For operations of the form (x << C1) op C2, check if we can use a smaller
  // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
  SDValue Shift = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  auto *Cst = dyn_cast<ConstantSDNode>(N1);
  if (!Cst)
    return false;

  int64_t Val = Cst->getSExtValue();

  // If we have an any_extend feeding the AND, look through it to see if there
  // is a shift behind it. But only if the AND doesn't use the extended bits.
  // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
  bool FoundAnyExtend = false;
  if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
      Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
      isUInt<32>(Val)) {
    FoundAnyExtend = true;
    Shift = Shift.getOperand(0);
  }

  if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
    return false;

  // i8 is unshrinkable, i16 should be promoted to i32.
  if (NVT != MVT::i32 && NVT != MVT::i64)
    return false;

  auto *ShlCst = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
  if (!ShlCst)
    return false;

  uint64_t ShAmt = ShlCst->getZExtValue();

  // Make sure that we don't change the operation by removing bits.
  // This only matters for OR and XOR, AND is unaffected.
  uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
  if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
    return false;

  // Check the minimum bitwidth for the new constant.
  // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
  auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
    if (Opcode == ISD::AND) {
      // AND32ri is the same as AND64ri32 with zext imm.
      // Try this before sign extended immediates below.
      ShiftedVal = (uint64_t)Val >> ShAmt;
      if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
        return true;
      // Also swap order when the AND can become MOVZX.
      if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX)
        return true;
    }
    // Sign-extended shift: check if we cross an imm8 or imm32 boundary.
    ShiftedVal = Val >> ShAmt;
    if ((!isInt<8>(Val) && isInt<8>(ShiftedVal)) ||
        (!isInt<32>(Val) && isInt<32>(ShiftedVal)))
      return true;
    if (Opcode != ISD::AND) {
      // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
      ShiftedVal = (uint64_t)Val >> ShAmt;
      if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
        return true;
    }
    return false;
  };

  int64_t ShiftedVal;
  if (!CanShrinkImmediate(ShiftedVal))
    return false;

  // Ok, we can reorder to get a smaller immediate.

  // But, it's possible the original immediate allowed an AND to become MOVZX.
  // Doing this check late to delay the (potentially expensive)
  // MaskedValueIsZero call as long as possible.
  if (Opcode == ISD::AND) {
    // Find the smallest zext this could possibly be.
    unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
    ZExtWidth = llvm::bit_ceil(std::max(ZExtWidth, 8U));

    // Figure out which bits need to be zero to achieve that mask.
    APInt NeededMask = APInt::getLowBitsSet(NVT.getSizeInBits(),
                                            ZExtWidth);
    NeededMask &= ~Cst->getAPIntValue();

    // If the AND could be implemented as a MOVZX, keep the original form.
    if (CurDAG->MaskedValueIsZero(N->getOperand(0), NeededMask))
      return false;
  }

  SDValue X = Shift.getOperand(0);
  if (FoundAnyExtend) {
    // Re-apply the any_extend we looked through, now below the logic op.
    SDValue NewX = CurDAG->getNode(ISD::ANY_EXTEND, dl, NVT, X);
    insertDAGNode(*CurDAG, SDValue(N, 0), NewX);
    X = NewX;
  }

  // Build (x op (C2 >> C1)) << C1 and hand it back to isel.
  SDValue NewCst = CurDAG->getSignedConstant(ShiftedVal, dl, NVT);
  insertDAGNode(*CurDAG, SDValue(N, 0), NewCst);
  SDValue NewBinOp = CurDAG->getNode(Opcode, dl, NVT, X, NewCst);
  insertDAGNode(*CurDAG, SDValue(N, 0), NewBinOp);
  SDValue NewSHL = CurDAG->getNode(ISD::SHL, dl, NVT, NewBinOp,
                                   Shift.getOperand(1));
  ReplaceNode(N, NewSHL.getNode());
  SelectCode(NewSHL.getNode());
  return true;
}
4586 
// Emit a VPTERNLOG machine node computing the 3-input boolean function
// described by Imm over operands A, B and C, replacing Root. Tries to fold a
// load or a 32/64-bit broadcast-load into the C operand; if the load is
// behind A or B instead, the operands are swapped into the C slot and the
// truth-table immediate is permuted to match. Always returns true.
bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA,
                                     SDNode *ParentB, SDNode *ParentC,
                                     SDValue A, SDValue B, SDValue C,
                                     uint8_t Imm) {
  assert(A.isOperandOf(ParentA) && B.isOperandOf(ParentB) &&
         C.isOperandOf(ParentC) && "Incorrect parent node");

  // Match either a plain foldable load or a VBROADCAST_LOAD (possibly behind
  // a single-use bitcast), filling in the address operands on success.
  auto tryFoldLoadOrBCast =
      [this](SDNode *Root, SDNode *P, SDValue &L, SDValue &Base, SDValue &Scale,
             SDValue &Index, SDValue &Disp, SDValue &Segment) {
        if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment))
          return true;

        // Not a load, check for broadcast which may be behind a bitcast.
        if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
          P = L.getNode();
          L = L.getOperand(0);
        }

        if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
          return false;

        // Only 32 and 64 bit broadcasts are supported.
        auto *MemIntr = cast<MemIntrinsicSDNode>(L);
        unsigned Size = MemIntr->getMemoryVT().getSizeInBits();
        if (Size != 32 && Size != 64)
          return false;

        return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment);
      };

  bool FoldedLoad = false;
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  if (tryFoldLoadOrBCast(Root, ParentC, C, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
    FoldedLoad = true;
  } else if (tryFoldLoadOrBCast(Root, ParentA, A, Tmp0, Tmp1, Tmp2, Tmp3,
                                Tmp4)) {
    FoldedLoad = true;
    std::swap(A, C);
    // Swapping A and C exchanges the imm bits where A and C inputs differ:
    // swap bits 1/4 and 3/6.
    uint8_t OldImm = Imm;
    Imm = OldImm & 0xa5;
    if (OldImm & 0x02) Imm |= 0x10;
    if (OldImm & 0x10) Imm |= 0x02;
    if (OldImm & 0x08) Imm |= 0x40;
    if (OldImm & 0x40) Imm |= 0x08;
  } else if (tryFoldLoadOrBCast(Root, ParentB, B, Tmp0, Tmp1, Tmp2, Tmp3,
                                Tmp4)) {
    FoldedLoad = true;
    std::swap(B, C);
    // Swapping B and C exchanges the imm bits where B and C inputs differ:
    // swap bits 1/2 and 5/6.
    uint8_t OldImm = Imm;
    Imm = OldImm & 0x99;
    if (OldImm & 0x02) Imm |= 0x04;
    if (OldImm & 0x04) Imm |= 0x02;
    if (OldImm & 0x20) Imm |= 0x40;
    if (OldImm & 0x40) Imm |= 0x20;
  }

  SDLoc DL(Root);

  SDValue TImm = CurDAG->getTargetConstant(Imm, DL, MVT::i8);

  MVT NVT = Root->getSimpleValueType(0);

  MachineSDNode *MNode;
  if (FoldedLoad) {
    SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);

    unsigned Opc;
    if (C.getOpcode() == X86ISD::VBROADCAST_LOAD) {
      // Broadcast forms: element size picks D vs Q, vector width picks Z128/
      // Z256/Z.
      auto *MemIntr = cast<MemIntrinsicSDNode>(C);
      unsigned EltSize = MemIntr->getMemoryVT().getSizeInBits();
      assert((EltSize == 32 || EltSize == 64) && "Unexpected broadcast size!");

      bool UseD = EltSize == 32;
      if (NVT.is128BitVector())
        Opc = UseD ? X86::VPTERNLOGDZ128rmbi : X86::VPTERNLOGQZ128rmbi;
      else if (NVT.is256BitVector())
        Opc = UseD ? X86::VPTERNLOGDZ256rmbi : X86::VPTERNLOGQZ256rmbi;
      else if (NVT.is512BitVector())
        Opc = UseD ? X86::VPTERNLOGDZrmbi : X86::VPTERNLOGQZrmbi;
      else
        llvm_unreachable("Unexpected vector size!");
    } else {
      // Regular memory forms.
      bool UseD = NVT.getVectorElementType() == MVT::i32;
      if (NVT.is128BitVector())
        Opc = UseD ? X86::VPTERNLOGDZ128rmi : X86::VPTERNLOGQZ128rmi;
      else if (NVT.is256BitVector())
        Opc = UseD ? X86::VPTERNLOGDZ256rmi : X86::VPTERNLOGQZ256rmi;
      else if (NVT.is512BitVector())
        Opc = UseD ? X86::VPTERNLOGDZrmi : X86::VPTERNLOGQZrmi;
      else
        llvm_unreachable("Unexpected vector size!");
    }

    SDValue Ops[] = {A, B, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, TImm, C.getOperand(0)};
    MNode = CurDAG->getMachineNode(Opc, DL, VTs, Ops);

    // Update the chain.
    ReplaceUses(C.getValue(1), SDValue(MNode, 1));
    // Record the mem-refs
    CurDAG->setNodeMemRefs(MNode, {cast<MemSDNode>(C)->getMemOperand()});
  } else {
    // Register-only form.
    bool UseD = NVT.getVectorElementType() == MVT::i32;
    unsigned Opc;
    if (NVT.is128BitVector())
      Opc = UseD ? X86::VPTERNLOGDZ128rri : X86::VPTERNLOGQZ128rri;
    else if (NVT.is256BitVector())
      Opc = UseD ? X86::VPTERNLOGDZ256rri : X86::VPTERNLOGQZ256rri;
    else if (NVT.is512BitVector())
      Opc = UseD ? X86::VPTERNLOGDZrri : X86::VPTERNLOGQZrri;
    else
      llvm_unreachable("Unexpected vector size!");

    MNode = CurDAG->getMachineNode(Opc, DL, NVT, {A, B, C, TImm});
  }

  ReplaceUses(SDValue(Root, 0), SDValue(MNode, 0));
  CurDAG->RemoveDeadNode(Root);
  return true;
}
4709 
// Try to match two logic ops to a VPTERNLOG.
// FIXME: Handle more complex patterns that use an operand more than once?
bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
  MVT NVT = N->getSimpleValueType(0);

  // Make sure we support VPTERNLOG.
  if (!NVT.isVector() || !Subtarget->hasAVX512() ||
      NVT.getVectorElementType() == MVT::i1)
    return false;

  // We need VLX for 128/256-bit.
  if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
    return false;

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // Return Op's inner logic op (AND/OR/XOR/ANDNP) if it is single-use,
  // looking through one single-use bitcast; otherwise an empty SDValue.
  auto getFoldableLogicOp = [](SDValue Op) {
    // Peek through single use bitcast.
    if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse())
      Op = Op.getOperand(0);

    if (!Op.hasOneUse())
      return SDValue();

    unsigned Opc = Op.getOpcode();
    if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR ||
        Opc == X86ISD::ANDNP)
      return Op;

    return SDValue();
  };

  // One operand of N must itself be a logic op; its two operands become B
  // and C, the remaining operand of N becomes A.
  SDValue A, FoldableOp;
  if ((FoldableOp = getFoldableLogicOp(N1))) {
    A = N0;
  } else if ((FoldableOp = getFoldableLogicOp(N0))) {
    A = N1;
  } else
    return false;

  SDValue B = FoldableOp.getOperand(0);
  SDValue C = FoldableOp.getOperand(1);
  SDNode *ParentA = N;
  SDNode *ParentB = FoldableOp.getNode();
  SDNode *ParentC = FoldableOp.getNode();

  // We can build the appropriate control immediate by performing the logic
  // operation we're matching using these constants for A, B, and C.
  uint8_t TernlogMagicA = 0xf0;
  uint8_t TernlogMagicB = 0xcc;
  uint8_t TernlogMagicC = 0xaa;

  // Some of the inputs may be inverted, peek through them and invert the
  // magic values accordingly.
  // TODO: There may be a bitcast before the xor that we should peek through.
  auto PeekThroughNot = [](SDValue &Op, SDNode *&Parent, uint8_t &Magic) {
    if (Op.getOpcode() == ISD::XOR && Op.hasOneUse() &&
        ISD::isBuildVectorAllOnes(Op.getOperand(1).getNode())) {
      Magic = ~Magic;
      Parent = Op.getNode();
      Op = Op.getOperand(0);
    }
  };

  PeekThroughNot(A, ParentA, TernlogMagicA);
  PeekThroughNot(B, ParentB, TernlogMagicB);
  PeekThroughNot(C, ParentC, TernlogMagicC);

  // Evaluate the inner logic op on the magic constants to build the imm.
  uint8_t Imm;
  switch (FoldableOp.getOpcode()) {
  default: llvm_unreachable("Unexpected opcode!");
  case ISD::AND:      Imm = TernlogMagicB & TernlogMagicC; break;
  case ISD::OR:       Imm = TernlogMagicB | TernlogMagicC; break;
  case ISD::XOR:      Imm = TernlogMagicB ^ TernlogMagicC; break;
  case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break;
  }

  // Then apply the outer logic op with A's magic constant.
  switch (N->getOpcode()) {
  default: llvm_unreachable("Unexpected opcode!");
  case X86ISD::ANDNP:
    // ANDNP is not commutative; which side A sits on changes the function.
    if (A == N0)
      Imm &= ~TernlogMagicA;
    else
      Imm = ~(Imm) & TernlogMagicA;
    break;
  case ISD::AND: Imm &= TernlogMagicA; break;
  case ISD::OR:  Imm |= TernlogMagicA; break;
  case ISD::XOR: Imm ^= TernlogMagicA; break;
  }

  return matchVPTERNLOG(N, ParentA, ParentB, ParentC, A, B, C, Imm);
}
4803 
/// If the high bits of an 'and' operand are known zero, try setting the
/// high bits of an 'and' constant operand to produce a smaller encoding by
/// creating a small, sign-extended negative immediate rather than a large
/// positive one. This reverses a transform in SimplifyDemandedBits that
/// shrinks mask constants by clearing bits. There is also a possibility that
/// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
/// case, just replace the 'and'. Return 'true' if the node is replaced.
bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
  // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
  // have immediate operands.
  MVT VT = And->getSimpleValueType(0);
  if (VT != MVT::i32 && VT != MVT::i64)
    return false;

  auto *And1C = dyn_cast<ConstantSDNode>(And->getOperand(1));
  if (!And1C)
    return false;

  // Bail out if the mask constant is already negative. It can't shrink more.
  // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel
  // patterns to use a 32-bit and instead of a 64-bit and by relying on the
  // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits
  // are negative too.
  APInt MaskVal = And1C->getAPIntValue();
  unsigned MaskLZ = MaskVal.countl_zero();
  if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
    return false;

  // Don't extend into the upper 32 bits of a 64 bit mask.
  if (VT == MVT::i64 && MaskLZ >= 32) {
    MaskLZ -= 32;
    MaskVal = MaskVal.trunc(32);
  }

  SDValue And0 = And->getOperand(0);
  APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ);
  APInt NegMaskVal = MaskVal | HighZeros;

  // If a negative constant would not allow a smaller encoding, there's no need
  // to continue. Only change the constant when we know it's a win.
  unsigned MinWidth = NegMaskVal.getSignificantBits();
  if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getSignificantBits() <= 32))
    return false;

  // Extend masks if we truncated above.
  if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
    NegMaskVal = NegMaskVal.zext(64);
    HighZeros = HighZeros.zext(64);
  }

  // The variable operand must be all zeros in the top bits to allow using the
  // new, negative constant as the mask.
  // TODO: Handle constant folding?
  KnownBits Known0 = CurDAG->computeKnownBits(And0);
  if (Known0.isConstant() || !HighZeros.isSubsetOf(Known0.Zero))
    return false;

  // Check if the mask is -1. In that case, this is an unnecessary instruction
  // that escaped earlier analysis.
  if (NegMaskVal.isAllOnes()) {
    ReplaceNode(And, And0.getNode());
    return true;
  }

  // A negative mask allows a smaller encoding. Create a new 'and' node.
  SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT);
  insertDAGNode(*CurDAG, SDValue(And, 0), NewMask);
  SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask);
  ReplaceNode(And, NewAnd.getNode());
  SelectCode(NewAnd.getNode());
  return true;
}
4876 
// Pick the VPTESTM/VPTESTNM machine opcode for the given test vector type,
// selecting between the register (rr), memory (rm) and broadcast-memory (rmb)
// forms and the masked (k) variants. Broadcast forms only exist for 32/64-bit
// element types.
static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
                              bool FoldedBCast, bool Masked) {
// Expand one MVT case, choosing masked/unmasked and TESTN/TEST.
#define VPTESTM_CASE(VT, SUFFIX) \
case MVT::VT: \
  if (Masked) \
    return IsTestN ? X86::VPTESTNM##SUFFIX##k: X86::VPTESTM##SUFFIX##k; \
  return IsTestN ? X86::VPTESTNM##SUFFIX : X86::VPTESTM##SUFFIX;


// The element types that support broadcast (32/64-bit only).
#define VPTESTM_BROADCAST_CASES(SUFFIX) \
default: llvm_unreachable("Unexpected VT!"); \
VPTESTM_CASE(v4i32, DZ128##SUFFIX) \
VPTESTM_CASE(v2i64, QZ128##SUFFIX) \
VPTESTM_CASE(v8i32, DZ256##SUFFIX) \
VPTESTM_CASE(v4i64, QZ256##SUFFIX) \
VPTESTM_CASE(v16i32, DZ##SUFFIX) \
VPTESTM_CASE(v8i64, QZ##SUFFIX)

// All supported element types, including the 8/16-bit ones.
#define VPTESTM_FULL_CASES(SUFFIX) \
VPTESTM_BROADCAST_CASES(SUFFIX) \
VPTESTM_CASE(v16i8, BZ128##SUFFIX) \
VPTESTM_CASE(v8i16, WZ128##SUFFIX) \
VPTESTM_CASE(v32i8, BZ256##SUFFIX) \
VPTESTM_CASE(v16i16, WZ256##SUFFIX) \
VPTESTM_CASE(v64i8, BZ##SUFFIX) \
VPTESTM_CASE(v32i16, WZ##SUFFIX)

  if (FoldedBCast) {
    switch (TestVT.SimpleTy) {
    VPTESTM_BROADCAST_CASES(rmb)
    }
  }

  if (FoldedLoad) {
    switch (TestVT.SimpleTy) {
    VPTESTM_FULL_CASES(rm)
    }
  }

  switch (TestVT.SimpleTy) {
  VPTESTM_FULL_CASES(rr)
  }

#undef VPTESTM_FULL_CASES
#undef VPTESTM_BROADCAST_CASES
#undef VPTESTM_CASE
}
4924 
// Try to create VPTESTM instruction. If InMask is not null, it will be used
// to form a masked operation.
bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
                                 SDValue InMask) {
  assert(Subtarget->hasAVX512() && "Expected AVX512!");
  assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&
         "Unexpected VT!");

  // Look for equal and not equal compares.
  ISD::CondCode CC = cast<CondCodeSDNode>(Setcc.getOperand(2))->get();
  if (CC != ISD::SETEQ && CC != ISD::SETNE)
    return false;

  SDValue SetccOp0 = Setcc.getOperand(0);
  SDValue SetccOp1 = Setcc.getOperand(1);

  // Canonicalize the all zero vector to the RHS.
  if (ISD::isBuildVectorAllZeros(SetccOp0.getNode()))
    std::swap(SetccOp0, SetccOp1);

  // See if we're comparing against zero.
  if (!ISD::isBuildVectorAllZeros(SetccOp1.getNode()))
    return false;

  SDValue N0 = SetccOp0;

  MVT CmpVT = N0.getSimpleValueType();
  MVT CmpSVT = CmpVT.getVectorElementType();

  // Start with both operands the same. We'll try to refine this.
  SDValue Src0 = N0;
  SDValue Src1 = N0;

  {
    // Look through single use bitcasts.
    SDValue N0Temp = N0;
    if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse())
      N0Temp = N0.getOperand(0);

    // Look for single use AND so the test reads (and X, Y) == 0.
    if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) {
      Src0 = N0Temp.getOperand(0);
      Src1 = N0Temp.getOperand(1);
    }
  }

  // Without VLX we need to widen the operation.
  bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector();

  // Match a foldable load (only when not widening) or a same-element-size
  // broadcast load behind L, filling in the addressing operands.
  auto tryFoldLoadOrBCast = [&](SDNode *Root, SDNode *P, SDValue &L,
                                SDValue &Base, SDValue &Scale, SDValue &Index,
                                SDValue &Disp, SDValue &Segment) {
    // If we need to widen, we can't fold the load.
    if (!Widen)
      if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment))
        return true;

    // If we didn't fold a load, try to match broadcast. No widening limitation
    // for this. But only 32 and 64 bit types are supported.
    if (CmpSVT != MVT::i32 && CmpSVT != MVT::i64)
      return false;

    // Look through single use bitcasts.
    if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
      P = L.getNode();
      L = L.getOperand(0);
    }

    if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
      return false;

    auto *MemIntr = cast<MemIntrinsicSDNode>(L);
    if (MemIntr->getMemoryVT().getSizeInBits() != CmpSVT.getSizeInBits())
      return false;

    return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment);
  };

  // We can only fold loads if the sources are unique.
  bool CanFoldLoads = Src0 != Src1;

  bool FoldedLoad = false;
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  if (CanFoldLoads) {
    FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src1, Tmp0, Tmp1, Tmp2,
                                    Tmp3, Tmp4);
    if (!FoldedLoad) {
      // And is commutative.
      FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src0, Tmp0, Tmp1,
                                      Tmp2, Tmp3, Tmp4);
      if (FoldedLoad)
        std::swap(Src0, Src1);
    }
  }

  bool FoldedBCast = FoldedLoad && Src1.getOpcode() == X86ISD::VBROADCAST_LOAD;

  bool IsMasked = InMask.getNode() != nullptr;

  SDLoc dl(Root);

  MVT ResVT = Setcc.getSimpleValueType();
  MVT MaskVT = ResVT;
  if (Widen) {
    // Widen the inputs using insert_subreg or copy_to_regclass.
    unsigned Scale = CmpVT.is128BitVector() ? 4 : 2;
    unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm;
    unsigned NumElts = CmpVT.getVectorNumElements() * Scale;
    CmpVT = MVT::getVectorVT(CmpSVT, NumElts);
    MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
    SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl,
                                                     CmpVT), 0);
    Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0);

    // A folded broadcast already produces the wide type; anything else must
    // be widened too.
    if (!FoldedBCast)
      Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1);

    if (IsMasked) {
      // Widen the mask.
      unsigned RegClass = TLI->getRegClassFor(MaskVT)->getID();
      SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
      InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
                                              dl, MaskVT, InMask, RC), 0);
    }
  }

  // SETEQ against zero is the "no bits set" form: VPTESTNM.
  bool IsTestN = CC == ISD::SETEQ;
  unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast,
                               IsMasked);

  MachineSDNode *CNode;
  if (FoldedLoad) {
    SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other);

    if (IsMasked) {
      SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
                        Src1.getOperand(0) };
      CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
    } else {
      SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
                        Src1.getOperand(0) };
      CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
    }

    // Update the chain.
    ReplaceUses(Src1.getValue(1), SDValue(CNode, 1));
    // Record the mem-refs
    CurDAG->setNodeMemRefs(CNode, {cast<MemSDNode>(Src1)->getMemOperand()});
  } else {
    if (IsMasked)
      CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1);
    else
      CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, Src0, Src1);
  }

  // If we widened, we need to shrink the mask VT.
  if (Widen) {
    unsigned RegClass = TLI->getRegClassFor(ResVT)->getID();
    SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
    CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
                                   dl, ResVT, SDValue(CNode, 0), RC);
  }

  ReplaceUses(SDValue(Root, 0), SDValue(CNode, 0));
  CurDAG->RemoveDeadNode(Root);
  return true;
}
5092 
5093 // Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it
5094 // into vpternlog.
tryMatchBitSelect(SDNode * N)5095 bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) {
5096   assert(N->getOpcode() == ISD::OR && "Unexpected opcode!");
5097 
5098   MVT NVT = N->getSimpleValueType(0);
5099 
5100   // Make sure we support VPTERNLOG.
5101   if (!NVT.isVector() || !Subtarget->hasAVX512())
5102     return false;
5103 
5104   // We need VLX for 128/256-bit.
5105   if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
5106     return false;
5107 
5108   SDValue N0 = N->getOperand(0);
5109   SDValue N1 = N->getOperand(1);
5110 
5111   // Canonicalize AND to LHS.
5112   if (N1.getOpcode() == ISD::AND)
5113     std::swap(N0, N1);
5114 
5115   if (N0.getOpcode() != ISD::AND ||
5116       N1.getOpcode() != X86ISD::ANDNP ||
5117       !N0.hasOneUse() || !N1.hasOneUse())
5118     return false;
5119 
5120   // ANDN is not commutable, use it to pick down A and C.
5121   SDValue A = N1.getOperand(0);
5122   SDValue C = N1.getOperand(1);
5123 
5124   // AND is commutable, if one operand matches A, the other operand is B.
5125   // Otherwise this isn't a match.
5126   SDValue B;
5127   if (N0.getOperand(0) == A)
5128     B = N0.getOperand(1);
5129   else if (N0.getOperand(1) == A)
5130     B = N0.getOperand(0);
5131   else
5132     return false;
5133 
5134   SDLoc dl(N);
5135   SDValue Imm = CurDAG->getTargetConstant(0xCA, dl, MVT::i8);
5136   SDValue Ternlog = CurDAG->getNode(X86ISD::VPTERNLOG, dl, NVT, A, B, C, Imm);
5137   ReplaceNode(N, Ternlog.getNode());
5138 
5139   return matchVPTERNLOG(Ternlog.getNode(), Ternlog.getNode(), Ternlog.getNode(),
5140                         Ternlog.getNode(), A, B, C, 0xCA);
5141 }
5142 
Select(SDNode * Node)5143 void X86DAGToDAGISel::Select(SDNode *Node) {
5144   MVT NVT = Node->getSimpleValueType(0);
5145   unsigned Opcode = Node->getOpcode();
5146   SDLoc dl(Node);
5147 
5148   if (Node->isMachineOpcode()) {
5149     LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
5150     Node->setNodeId(-1);
5151     return;   // Already selected.
5152   }
5153 
5154   switch (Opcode) {
5155   default: break;
5156   case ISD::INTRINSIC_W_CHAIN: {
5157     unsigned IntNo = Node->getConstantOperandVal(1);
5158     switch (IntNo) {
5159     default: break;
5160     case Intrinsic::x86_encodekey128:
5161     case Intrinsic::x86_encodekey256: {
5162       if (!Subtarget->hasKL())
5163         break;
5164 
5165       unsigned Opcode;
5166       switch (IntNo) {
5167       default: llvm_unreachable("Impossible intrinsic");
5168       case Intrinsic::x86_encodekey128:
5169         Opcode = X86::ENCODEKEY128;
5170         break;
5171       case Intrinsic::x86_encodekey256:
5172         Opcode = X86::ENCODEKEY256;
5173         break;
5174       }
5175 
5176       SDValue Chain = Node->getOperand(0);
5177       Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(3),
5178                                    SDValue());
5179       if (Opcode == X86::ENCODEKEY256)
5180         Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(4),
5181                                      Chain.getValue(1));
5182 
5183       MachineSDNode *Res = CurDAG->getMachineNode(
5184           Opcode, dl, Node->getVTList(),
5185           {Node->getOperand(2), Chain, Chain.getValue(1)});
5186       ReplaceNode(Node, Res);
5187       return;
5188     }
5189     case Intrinsic::x86_tileloaddrs64_internal:
5190     case Intrinsic::x86_tileloaddrst164_internal:
5191       if (!Subtarget->hasAMXMOVRS())
5192         break;
5193       [[fallthrough]];
5194     case Intrinsic::x86_tileloadd64_internal:
5195     case Intrinsic::x86_tileloaddt164_internal: {
5196       if (!Subtarget->hasAMXTILE())
5197         break;
5198       auto *MFI =
5199           CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5200       MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
5201       unsigned Opc;
5202       switch (IntNo) {
5203       default:
5204         llvm_unreachable("Unexpected intrinsic!");
5205       case Intrinsic::x86_tileloaddrs64_internal:
5206         Opc = X86::PTILELOADDRSV;
5207         break;
5208       case Intrinsic::x86_tileloaddrst164_internal:
5209         Opc = X86::PTILELOADDRST1V;
5210         break;
5211       case Intrinsic::x86_tileloadd64_internal:
5212         Opc = X86::PTILELOADDV;
5213         break;
5214       case Intrinsic::x86_tileloaddt164_internal:
5215         Opc = X86::PTILELOADDT1V;
5216         break;
5217       }
5218       // _tile_loadd_internal(row, col, buf, STRIDE)
5219       SDValue Base = Node->getOperand(4);
5220       SDValue Scale = getI8Imm(1, dl);
5221       SDValue Index = Node->getOperand(5);
5222       SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5223       SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5224       SDValue Chain = Node->getOperand(0);
5225       MachineSDNode *CNode;
5226       SDValue Ops[] = {Node->getOperand(2),
5227                        Node->getOperand(3),
5228                        Base,
5229                        Scale,
5230                        Index,
5231                        Disp,
5232                        Segment,
5233                        Chain};
5234       CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
5235       ReplaceNode(Node, CNode);
5236       return;
5237     }
5238     }
5239     break;
5240   }
5241   case ISD::INTRINSIC_VOID: {
5242     unsigned IntNo = Node->getConstantOperandVal(1);
5243     switch (IntNo) {
5244     default: break;
5245     case Intrinsic::x86_sse3_monitor:
5246     case Intrinsic::x86_monitorx:
5247     case Intrinsic::x86_clzero: {
5248       bool Use64BitPtr = Node->getOperand(2).getValueType() == MVT::i64;
5249 
5250       unsigned Opc = 0;
5251       switch (IntNo) {
5252       default: llvm_unreachable("Unexpected intrinsic!");
5253       case Intrinsic::x86_sse3_monitor:
5254         if (!Subtarget->hasSSE3())
5255           break;
5256         Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr;
5257         break;
5258       case Intrinsic::x86_monitorx:
5259         if (!Subtarget->hasMWAITX())
5260           break;
5261         Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr;
5262         break;
5263       case Intrinsic::x86_clzero:
5264         if (!Subtarget->hasCLZERO())
5265           break;
5266         Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r;
5267         break;
5268       }
5269 
5270       if (Opc) {
5271         unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX;
5272         SDValue Chain = CurDAG->getCopyToReg(Node->getOperand(0), dl, PtrReg,
5273                                              Node->getOperand(2), SDValue());
5274         SDValue InGlue = Chain.getValue(1);
5275 
5276         if (IntNo == Intrinsic::x86_sse3_monitor ||
5277             IntNo == Intrinsic::x86_monitorx) {
5278           // Copy the other two operands to ECX and EDX.
5279           Chain = CurDAG->getCopyToReg(Chain, dl, X86::ECX, Node->getOperand(3),
5280                                        InGlue);
5281           InGlue = Chain.getValue(1);
5282           Chain = CurDAG->getCopyToReg(Chain, dl, X86::EDX, Node->getOperand(4),
5283                                        InGlue);
5284           InGlue = Chain.getValue(1);
5285         }
5286 
5287         MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other,
5288                                                       { Chain, InGlue});
5289         ReplaceNode(Node, CNode);
5290         return;
5291       }
5292 
5293       break;
5294     }
5295     case Intrinsic::x86_tilestored64_internal: {
5296       auto *MFI =
5297           CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5298       MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
5299       unsigned Opc = X86::PTILESTOREDV;
5300       // _tile_stored_internal(row, col, buf, STRIDE, c)
5301       SDValue Base = Node->getOperand(4);
5302       SDValue Scale = getI8Imm(1, dl);
5303       SDValue Index = Node->getOperand(5);
5304       SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5305       SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5306       SDValue Chain = Node->getOperand(0);
5307       MachineSDNode *CNode;
5308       SDValue Ops[] = {Node->getOperand(2),
5309                        Node->getOperand(3),
5310                        Base,
5311                        Scale,
5312                        Index,
5313                        Disp,
5314                        Segment,
5315                        Node->getOperand(6),
5316                        Chain};
5317       CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5318       ReplaceNode(Node, CNode);
5319       return;
5320     }
5321     case Intrinsic::x86_tileloaddrs64:
5322     case Intrinsic::x86_tileloaddrst164:
5323       if (!Subtarget->hasAMXMOVRS())
5324         break;
5325       [[fallthrough]];
5326     case Intrinsic::x86_tileloadd64:
5327     case Intrinsic::x86_tileloaddt164:
5328     case Intrinsic::x86_tilestored64: {
5329       if (!Subtarget->hasAMXTILE())
5330         break;
5331       auto *MFI =
5332           CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5333       MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
5334       unsigned Opc;
5335       switch (IntNo) {
5336       default: llvm_unreachable("Unexpected intrinsic!");
5337       case Intrinsic::x86_tileloadd64:   Opc = X86::PTILELOADD; break;
5338       case Intrinsic::x86_tileloaddrs64:
5339         Opc = X86::PTILELOADDRS;
5340         break;
5341       case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break;
5342       case Intrinsic::x86_tileloaddrst164:
5343         Opc = X86::PTILELOADDRST1;
5344         break;
5345       case Intrinsic::x86_tilestored64:  Opc = X86::PTILESTORED; break;
5346       }
5347       // FIXME: Match displacement and scale.
5348       unsigned TIndex = Node->getConstantOperandVal(2);
5349       SDValue TReg = getI8Imm(TIndex, dl);
5350       SDValue Base = Node->getOperand(3);
5351       SDValue Scale = getI8Imm(1, dl);
5352       SDValue Index = Node->getOperand(4);
5353       SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5354       SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5355       SDValue Chain = Node->getOperand(0);
5356       MachineSDNode *CNode;
5357       if (Opc == X86::PTILESTORED) {
5358         SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain };
5359         CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5360       } else {
5361         SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain };
5362         CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5363       }
5364       ReplaceNode(Node, CNode);
5365       return;
5366     }
5367     case Intrinsic::x86_t2rpntlvwz0rs:
5368     case Intrinsic::x86_t2rpntlvwz0rst1:
5369     case Intrinsic::x86_t2rpntlvwz1rs:
5370     case Intrinsic::x86_t2rpntlvwz1rst1:
5371       if (!Subtarget->hasAMXMOVRS())
5372         break;
5373       [[fallthrough]];
5374     case Intrinsic::x86_t2rpntlvwz0:
5375     case Intrinsic::x86_t2rpntlvwz0t1:
5376     case Intrinsic::x86_t2rpntlvwz1:
5377     case Intrinsic::x86_t2rpntlvwz1t1: {
5378       if (!Subtarget->hasAMXTRANSPOSE())
5379         break;
5380       auto *MFI =
5381           CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5382       MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
5383       unsigned Opc;
5384       switch (IntNo) {
5385       default:
5386         llvm_unreachable("Unexpected intrinsic!");
5387       case Intrinsic::x86_t2rpntlvwz0:
5388         Opc = X86::PT2RPNTLVWZ0;
5389         break;
5390       case Intrinsic::x86_t2rpntlvwz0t1:
5391         Opc = X86::PT2RPNTLVWZ0T1;
5392         break;
5393       case Intrinsic::x86_t2rpntlvwz1:
5394         Opc = X86::PT2RPNTLVWZ1;
5395         break;
5396       case Intrinsic::x86_t2rpntlvwz1t1:
5397         Opc = X86::PT2RPNTLVWZ1T1;
5398         break;
5399       case Intrinsic::x86_t2rpntlvwz0rs:
5400         Opc = X86::PT2RPNTLVWZ0RS;
5401         break;
5402       case Intrinsic::x86_t2rpntlvwz0rst1:
5403         Opc = X86::PT2RPNTLVWZ0RST1;
5404         break;
5405       case Intrinsic::x86_t2rpntlvwz1rs:
5406         Opc = X86::PT2RPNTLVWZ1RS;
5407         break;
5408       case Intrinsic::x86_t2rpntlvwz1rst1:
5409         Opc = X86::PT2RPNTLVWZ1RST1;
5410         break;
5411       }
5412       // FIXME: Match displacement and scale.
5413       unsigned TIndex = Node->getConstantOperandVal(2);
5414       SDValue TReg = getI8Imm(TIndex, dl);
5415       SDValue Base = Node->getOperand(3);
5416       SDValue Scale = getI8Imm(1, dl);
5417       SDValue Index = Node->getOperand(4);
5418       SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5419       SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5420       SDValue Chain = Node->getOperand(0);
5421       SDValue Ops[] = {TReg, Base, Scale, Index, Disp, Segment, Chain};
5422       MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5423       ReplaceNode(Node, CNode);
5424       return;
5425     }
5426     }
5427     break;
5428   }
5429   case ISD::BRIND:
5430   case X86ISD::NT_BRIND: {
5431     if (Subtarget->isTargetNaCl())
5432       // NaCl has its own pass where jmp %r32 are converted to jmp %r64. We
5433       // leave the instruction alone.
5434       break;
5435     if (Subtarget->isTarget64BitILP32()) {
5436       // Converts a 32-bit register to a 64-bit, zero-extended version of
5437       // it. This is needed because x86-64 can do many things, but jmp %r32
5438       // ain't one of them.
5439       SDValue Target = Node->getOperand(1);
5440       assert(Target.getValueType() == MVT::i32 && "Unexpected VT!");
5441       SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, MVT::i64);
5442       SDValue Brind = CurDAG->getNode(Opcode, dl, MVT::Other,
5443                                       Node->getOperand(0), ZextTarget);
5444       ReplaceNode(Node, Brind.getNode());
5445       SelectCode(ZextTarget.getNode());
5446       SelectCode(Brind.getNode());
5447       return;
5448     }
5449     break;
5450   }
5451   case X86ISD::GlobalBaseReg:
5452     ReplaceNode(Node, getGlobalBaseReg());
5453     return;
5454 
5455   case ISD::BITCAST:
5456     // Just drop all 128/256/512-bit bitcasts.
5457     if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() ||
5458         NVT == MVT::f128) {
5459       ReplaceUses(SDValue(Node, 0), Node->getOperand(0));
5460       CurDAG->RemoveDeadNode(Node);
5461       return;
5462     }
5463     break;
5464 
5465   case ISD::SRL:
5466     if (matchBitExtract(Node))
5467       return;
5468     [[fallthrough]];
5469   case ISD::SRA:
5470   case ISD::SHL:
5471     if (tryShiftAmountMod(Node))
5472       return;
5473     break;
5474 
5475   case X86ISD::VPTERNLOG: {
5476     uint8_t Imm = Node->getConstantOperandVal(3);
5477     if (matchVPTERNLOG(Node, Node, Node, Node, Node->getOperand(0),
5478                        Node->getOperand(1), Node->getOperand(2), Imm))
5479       return;
5480     break;
5481   }
5482 
5483   case X86ISD::ANDNP:
5484     if (tryVPTERNLOG(Node))
5485       return;
5486     break;
5487 
5488   case ISD::AND:
5489     if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
5490       // Try to form a masked VPTESTM. Operands can be in either order.
5491       SDValue N0 = Node->getOperand(0);
5492       SDValue N1 = Node->getOperand(1);
5493       if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
5494           tryVPTESTM(Node, N0, N1))
5495         return;
5496       if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
5497           tryVPTESTM(Node, N1, N0))
5498         return;
5499     }
5500 
5501     if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
5502       ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
5503       CurDAG->RemoveDeadNode(Node);
5504       return;
5505     }
5506     if (matchBitExtract(Node))
5507       return;
5508     if (AndImmShrink && shrinkAndImmediate(Node))
5509       return;
5510 
5511     [[fallthrough]];
5512   case ISD::OR:
5513   case ISD::XOR:
5514     if (tryShrinkShlLogicImm(Node))
5515       return;
5516     if (Opcode == ISD::OR && tryMatchBitSelect(Node))
5517       return;
5518     if (tryVPTERNLOG(Node))
5519       return;
5520 
5521     [[fallthrough]];
5522   case ISD::ADD:
5523     if (Opcode == ISD::ADD && matchBitExtract(Node))
5524       return;
5525     [[fallthrough]];
5526   case ISD::SUB: {
5527     // Try to avoid folding immediates with multiple uses for optsize.
5528     // This code tries to select to register form directly to avoid going
5529     // through the isel table which might fold the immediate. We can't change
5530     // the patterns on the add/sub/and/or/xor with immediate paterns in the
5531     // tablegen files to check immediate use count without making the patterns
5532     // unavailable to the fast-isel table.
5533     if (!CurDAG->shouldOptForSize())
5534       break;
5535 
5536     // Only handle i8/i16/i32/i64.
5537     if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64)
5538       break;
5539 
5540     SDValue N0 = Node->getOperand(0);
5541     SDValue N1 = Node->getOperand(1);
5542 
5543     auto *Cst = dyn_cast<ConstantSDNode>(N1);
5544     if (!Cst)
5545       break;
5546 
5547     int64_t Val = Cst->getSExtValue();
5548 
5549     // Make sure its an immediate that is considered foldable.
5550     // FIXME: Handle unsigned 32 bit immediates for 64-bit AND.
5551     if (!isInt<8>(Val) && !isInt<32>(Val))
5552       break;
5553 
5554     // If this can match to INC/DEC, let it go.
5555     if (Opcode == ISD::ADD && (Val == 1 || Val == -1))
5556       break;
5557 
5558     // Check if we should avoid folding this immediate.
5559     if (!shouldAvoidImmediateInstFormsForSize(N1.getNode()))
5560       break;
5561 
5562     // We should not fold the immediate. So we need a register form instead.
5563     unsigned ROpc, MOpc;
5564     switch (NVT.SimpleTy) {
5565     default: llvm_unreachable("Unexpected VT!");
5566     case MVT::i8:
5567       switch (Opcode) {
5568       default: llvm_unreachable("Unexpected opcode!");
5569       case ISD::ADD:
5570         ROpc = GET_ND_IF_ENABLED(X86::ADD8rr);
5571         MOpc = GET_ND_IF_ENABLED(X86::ADD8rm);
5572         break;
5573       case ISD::SUB:
5574         ROpc = GET_ND_IF_ENABLED(X86::SUB8rr);
5575         MOpc = GET_ND_IF_ENABLED(X86::SUB8rm);
5576         break;
5577       case ISD::AND:
5578         ROpc = GET_ND_IF_ENABLED(X86::AND8rr);
5579         MOpc = GET_ND_IF_ENABLED(X86::AND8rm);
5580         break;
5581       case ISD::OR:
5582         ROpc = GET_ND_IF_ENABLED(X86::OR8rr);
5583         MOpc = GET_ND_IF_ENABLED(X86::OR8rm);
5584         break;
5585       case ISD::XOR:
5586         ROpc = GET_ND_IF_ENABLED(X86::XOR8rr);
5587         MOpc = GET_ND_IF_ENABLED(X86::XOR8rm);
5588         break;
5589       }
5590       break;
5591     case MVT::i16:
5592       switch (Opcode) {
5593       default: llvm_unreachable("Unexpected opcode!");
5594       case ISD::ADD:
5595         ROpc = GET_ND_IF_ENABLED(X86::ADD16rr);
5596         MOpc = GET_ND_IF_ENABLED(X86::ADD16rm);
5597         break;
5598       case ISD::SUB:
5599         ROpc = GET_ND_IF_ENABLED(X86::SUB16rr);
5600         MOpc = GET_ND_IF_ENABLED(X86::SUB16rm);
5601         break;
5602       case ISD::AND:
5603         ROpc = GET_ND_IF_ENABLED(X86::AND16rr);
5604         MOpc = GET_ND_IF_ENABLED(X86::AND16rm);
5605         break;
5606       case ISD::OR:
5607         ROpc = GET_ND_IF_ENABLED(X86::OR16rr);
5608         MOpc = GET_ND_IF_ENABLED(X86::OR16rm);
5609         break;
5610       case ISD::XOR:
5611         ROpc = GET_ND_IF_ENABLED(X86::XOR16rr);
5612         MOpc = GET_ND_IF_ENABLED(X86::XOR16rm);
5613         break;
5614       }
5615       break;
5616     case MVT::i32:
5617       switch (Opcode) {
5618       default: llvm_unreachable("Unexpected opcode!");
5619       case ISD::ADD:
5620         ROpc = GET_ND_IF_ENABLED(X86::ADD32rr);
5621         MOpc = GET_ND_IF_ENABLED(X86::ADD32rm);
5622         break;
5623       case ISD::SUB:
5624         ROpc = GET_ND_IF_ENABLED(X86::SUB32rr);
5625         MOpc = GET_ND_IF_ENABLED(X86::SUB32rm);
5626         break;
5627       case ISD::AND:
5628         ROpc = GET_ND_IF_ENABLED(X86::AND32rr);
5629         MOpc = GET_ND_IF_ENABLED(X86::AND32rm);
5630         break;
5631       case ISD::OR:
5632         ROpc = GET_ND_IF_ENABLED(X86::OR32rr);
5633         MOpc = GET_ND_IF_ENABLED(X86::OR32rm);
5634         break;
5635       case ISD::XOR:
5636         ROpc = GET_ND_IF_ENABLED(X86::XOR32rr);
5637         MOpc = GET_ND_IF_ENABLED(X86::XOR32rm);
5638         break;
5639       }
5640       break;
5641     case MVT::i64:
5642       switch (Opcode) {
5643       default: llvm_unreachable("Unexpected opcode!");
5644       case ISD::ADD:
5645         ROpc = GET_ND_IF_ENABLED(X86::ADD64rr);
5646         MOpc = GET_ND_IF_ENABLED(X86::ADD64rm);
5647         break;
5648       case ISD::SUB:
5649         ROpc = GET_ND_IF_ENABLED(X86::SUB64rr);
5650         MOpc = GET_ND_IF_ENABLED(X86::SUB64rm);
5651         break;
5652       case ISD::AND:
5653         ROpc = GET_ND_IF_ENABLED(X86::AND64rr);
5654         MOpc = GET_ND_IF_ENABLED(X86::AND64rm);
5655         break;
5656       case ISD::OR:
5657         ROpc = GET_ND_IF_ENABLED(X86::OR64rr);
5658         MOpc = GET_ND_IF_ENABLED(X86::OR64rm);
5659         break;
5660       case ISD::XOR:
5661         ROpc = GET_ND_IF_ENABLED(X86::XOR64rr);
5662         MOpc = GET_ND_IF_ENABLED(X86::XOR64rm);
5663         break;
5664       }
5665       break;
5666     }
5667 
5668     // Ok this is a AND/OR/XOR/ADD/SUB with constant.
5669 
5670     // If this is a not a subtract, we can still try to fold a load.
5671     if (Opcode != ISD::SUB) {
5672       SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5673       if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
5674         SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
5675         SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
5676         MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5677         // Update the chain.
5678         ReplaceUses(N0.getValue(1), SDValue(CNode, 2));
5679         // Record the mem-refs
5680         CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N0)->getMemOperand()});
5681         ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
5682         CurDAG->RemoveDeadNode(Node);
5683         return;
5684       }
5685     }
5686 
5687     CurDAG->SelectNodeTo(Node, ROpc, NVT, MVT::i32, N0, N1);
5688     return;
5689   }
5690 
5691   case X86ISD::SMUL:
5692     // i16/i32/i64 are handled with isel patterns.
5693     if (NVT != MVT::i8)
5694       break;
5695     [[fallthrough]];
5696   case X86ISD::UMUL: {
5697     SDValue N0 = Node->getOperand(0);
5698     SDValue N1 = Node->getOperand(1);
5699 
5700     unsigned LoReg, ROpc, MOpc;
5701     switch (NVT.SimpleTy) {
5702     default: llvm_unreachable("Unsupported VT!");
5703     case MVT::i8:
5704       LoReg = X86::AL;
5705       ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r;
5706       MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m;
5707       break;
5708     case MVT::i16:
5709       LoReg = X86::AX;
5710       ROpc = X86::MUL16r;
5711       MOpc = X86::MUL16m;
5712       break;
5713     case MVT::i32:
5714       LoReg = X86::EAX;
5715       ROpc = X86::MUL32r;
5716       MOpc = X86::MUL32m;
5717       break;
5718     case MVT::i64:
5719       LoReg = X86::RAX;
5720       ROpc = X86::MUL64r;
5721       MOpc = X86::MUL64m;
5722       break;
5723     }
5724 
5725     SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5726     bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5727     // Multiply is commutative.
5728     if (!FoldedLoad) {
5729       FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5730       if (FoldedLoad)
5731         std::swap(N0, N1);
5732     }
5733 
5734     SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
5735                                           N0, SDValue()).getValue(1);
5736 
5737     MachineSDNode *CNode;
5738     if (FoldedLoad) {
5739       // i16/i32/i64 use an instruction that produces a low and high result even
5740       // though only the low result is used.
5741       SDVTList VTs;
5742       if (NVT == MVT::i8)
5743         VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
5744       else
5745         VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other);
5746 
5747       SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
5748                         InGlue };
5749       CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5750 
5751       // Update the chain.
5752       ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 2 : 3));
5753       // Record the mem-refs
5754       CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
5755     } else {
5756       // i16/i32/i64 use an instruction that produces a low and high result even
5757       // though only the low result is used.
5758       SDVTList VTs;
5759       if (NVT == MVT::i8)
5760         VTs = CurDAG->getVTList(NVT, MVT::i32);
5761       else
5762         VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);
5763 
5764       CNode = CurDAG->getMachineNode(ROpc, dl, VTs, {N1, InGlue});
5765     }
5766 
5767     ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
5768     ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 1 : 2));
5769     CurDAG->RemoveDeadNode(Node);
5770     return;
5771   }
5772 
5773   case ISD::SMUL_LOHI:
5774   case ISD::UMUL_LOHI: {
5775     SDValue N0 = Node->getOperand(0);
5776     SDValue N1 = Node->getOperand(1);
5777 
5778     unsigned Opc, MOpc;
5779     unsigned LoReg, HiReg;
5780     bool IsSigned = Opcode == ISD::SMUL_LOHI;
5781     bool UseMULX = !IsSigned && Subtarget->hasBMI2();
5782     bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty();
5783     switch (NVT.SimpleTy) {
5784     default: llvm_unreachable("Unsupported VT!");
5785     case MVT::i32:
5786       Opc = UseMULXHi  ? X86::MULX32Hrr
5787             : UseMULX  ? GET_EGPR_IF_ENABLED(X86::MULX32rr)
5788             : IsSigned ? X86::IMUL32r
5789                        : X86::MUL32r;
5790       MOpc = UseMULXHi  ? X86::MULX32Hrm
5791              : UseMULX  ? GET_EGPR_IF_ENABLED(X86::MULX32rm)
5792              : IsSigned ? X86::IMUL32m
5793                         : X86::MUL32m;
5794       LoReg = UseMULX ? X86::EDX : X86::EAX;
5795       HiReg = X86::EDX;
5796       break;
5797     case MVT::i64:
5798       Opc = UseMULXHi  ? X86::MULX64Hrr
5799             : UseMULX  ? GET_EGPR_IF_ENABLED(X86::MULX64rr)
5800             : IsSigned ? X86::IMUL64r
5801                        : X86::MUL64r;
5802       MOpc = UseMULXHi  ? X86::MULX64Hrm
5803              : UseMULX  ? GET_EGPR_IF_ENABLED(X86::MULX64rm)
5804              : IsSigned ? X86::IMUL64m
5805                         : X86::MUL64m;
5806       LoReg = UseMULX ? X86::RDX : X86::RAX;
5807       HiReg = X86::RDX;
5808       break;
5809     }
5810 
5811     SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5812     bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5813     // Multiply is commutative.
5814     if (!foldedLoad) {
5815       foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5816       if (foldedLoad)
5817         std::swap(N0, N1);
5818     }
5819 
5820     SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
5821                                           N0, SDValue()).getValue(1);
5822     SDValue ResHi, ResLo;
5823     if (foldedLoad) {
5824       SDValue Chain;
5825       MachineSDNode *CNode = nullptr;
5826       SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
5827                         InGlue };
5828       if (UseMULXHi) {
5829         SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
5830         CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5831         ResHi = SDValue(CNode, 0);
5832         Chain = SDValue(CNode, 1);
5833       } else if (UseMULX) {
5834         SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other);
5835         CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5836         ResHi = SDValue(CNode, 0);
5837         ResLo = SDValue(CNode, 1);
5838         Chain = SDValue(CNode, 2);
5839       } else {
5840         SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
5841         CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5842         Chain = SDValue(CNode, 0);
5843         InGlue = SDValue(CNode, 1);
5844       }
5845 
5846       // Update the chain.
5847       ReplaceUses(N1.getValue(1), Chain);
5848       // Record the mem-refs
5849       CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
5850     } else {
5851       SDValue Ops[] = { N1, InGlue };
5852       if (UseMULXHi) {
5853         SDVTList VTs = CurDAG->getVTList(NVT);
5854         SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5855         ResHi = SDValue(CNode, 0);
5856       } else if (UseMULX) {
5857         SDVTList VTs = CurDAG->getVTList(NVT, NVT);
5858         SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5859         ResHi = SDValue(CNode, 0);
5860         ResLo = SDValue(CNode, 1);
5861       } else {
5862         SDVTList VTs = CurDAG->getVTList(MVT::Glue);
5863         SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5864         InGlue = SDValue(CNode, 0);
5865       }
5866     }
5867 
5868     // Copy the low half of the result, if it is needed.
5869     if (!SDValue(Node, 0).use_empty()) {
5870       if (!ResLo) {
5871         assert(LoReg && "Register for low half is not defined!");
5872         ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
5873                                        NVT, InGlue);
5874         InGlue = ResLo.getValue(2);
5875       }
5876       ReplaceUses(SDValue(Node, 0), ResLo);
5877       LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
5878                  dbgs() << '\n');
5879     }
5880     // Copy the high half of the result, if it is needed.
5881     if (!SDValue(Node, 1).use_empty()) {
5882       if (!ResHi) {
5883         assert(HiReg && "Register for high half is not defined!");
5884         ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
5885                                        NVT, InGlue);
5886         InGlue = ResHi.getValue(2);
5887       }
5888       ReplaceUses(SDValue(Node, 1), ResHi);
5889       LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
5890                  dbgs() << '\n');
5891     }
5892 
5893     CurDAG->RemoveDeadNode(Node);
5894     return;
5895   }
5896 
5897   case ISD::SDIVREM:
5898   case ISD::UDIVREM: {
5899     SDValue N0 = Node->getOperand(0);
5900     SDValue N1 = Node->getOperand(1);
5901 
5902     unsigned ROpc, MOpc;
5903     bool isSigned = Opcode == ISD::SDIVREM;
5904     if (!isSigned) {
5905       switch (NVT.SimpleTy) {
5906       default: llvm_unreachable("Unsupported VT!");
5907       case MVT::i8:  ROpc = X86::DIV8r;  MOpc = X86::DIV8m;  break;
5908       case MVT::i16: ROpc = X86::DIV16r; MOpc = X86::DIV16m; break;
5909       case MVT::i32: ROpc = X86::DIV32r; MOpc = X86::DIV32m; break;
5910       case MVT::i64: ROpc = X86::DIV64r; MOpc = X86::DIV64m; break;
5911       }
5912     } else {
5913       switch (NVT.SimpleTy) {
5914       default: llvm_unreachable("Unsupported VT!");
5915       case MVT::i8:  ROpc = X86::IDIV8r;  MOpc = X86::IDIV8m;  break;
5916       case MVT::i16: ROpc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
5917       case MVT::i32: ROpc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
5918       case MVT::i64: ROpc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
5919       }
5920     }
5921 
5922     unsigned LoReg, HiReg, ClrReg;
5923     unsigned SExtOpcode;
5924     switch (NVT.SimpleTy) {
5925     default: llvm_unreachable("Unsupported VT!");
5926     case MVT::i8:
5927       LoReg = X86::AL;  ClrReg = HiReg = X86::AH;
5928       SExtOpcode = 0; // Not used.
5929       break;
5930     case MVT::i16:
5931       LoReg = X86::AX;  HiReg = X86::DX;
5932       ClrReg = X86::DX;
5933       SExtOpcode = X86::CWD;
5934       break;
5935     case MVT::i32:
5936       LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
5937       SExtOpcode = X86::CDQ;
5938       break;
5939     case MVT::i64:
5940       LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
5941       SExtOpcode = X86::CQO;
5942       break;
5943     }
5944 
5945     SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5946     bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5947     bool signBitIsZero = CurDAG->SignBitIsZero(N0);
5948 
5949     SDValue InGlue;
5950     if (NVT == MVT::i8) {
5951       // Special case for div8, just use a move with zero extension to AX to
5952       // clear the upper 8 bits (AH).
5953       SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain;
5954       MachineSDNode *Move;
5955       if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
5956         SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
5957         unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8
5958                                                     : X86::MOVZX16rm8;
5959         Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, MVT::Other, Ops);
5960         Chain = SDValue(Move, 1);
5961         ReplaceUses(N0.getValue(1), Chain);
5962         // Record the mem-refs
5963         CurDAG->setNodeMemRefs(Move, {cast<LoadSDNode>(N0)->getMemOperand()});
5964       } else {
5965         unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8
5966                                                     : X86::MOVZX16rr8;
5967         Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, N0);
5968         Chain = CurDAG->getEntryNode();
5969       }
5970       Chain  = CurDAG->getCopyToReg(Chain, dl, X86::AX, SDValue(Move, 0),
5971                                     SDValue());
5972       InGlue = Chain.getValue(1);
5973     } else {
5974       InGlue =
5975         CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
5976                              LoReg, N0, SDValue()).getValue(1);
5977       if (isSigned && !signBitIsZero) {
5978         // Sign extend the low part into the high part.
5979         InGlue =
5980           SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InGlue),0);
5981       } else {
5982         // Zero out the high part, effectively zero extending the input.
5983         SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
5984         SDValue ClrNode =
5985             SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, {}), 0);
5986         switch (NVT.SimpleTy) {
5987         case MVT::i16:
5988           ClrNode =
5989               SDValue(CurDAG->getMachineNode(
5990                           TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
5991                           CurDAG->getTargetConstant(X86::sub_16bit, dl,
5992                                                     MVT::i32)),
5993                       0);
5994           break;
5995         case MVT::i32:
5996           break;
5997         case MVT::i64:
5998           ClrNode =
5999               SDValue(CurDAG->getMachineNode(
6000                           TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
6001                           CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
6002                           CurDAG->getTargetConstant(X86::sub_32bit, dl,
6003                                                     MVT::i32)),
6004                       0);
6005           break;
6006         default:
6007           llvm_unreachable("Unexpected division source");
6008         }
6009 
6010         InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
6011                                       ClrNode, InGlue).getValue(1);
6012       }
6013     }
6014 
6015     if (foldedLoad) {
6016       SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
6017                         InGlue };
6018       MachineSDNode *CNode =
6019         CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
6020       InGlue = SDValue(CNode, 1);
6021       // Update the chain.
6022       ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
6023       // Record the mem-refs
6024       CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
6025     } else {
6026       InGlue =
6027         SDValue(CurDAG->getMachineNode(ROpc, dl, MVT::Glue, N1, InGlue), 0);
6028     }
6029 
6030     // Prevent use of AH in a REX instruction by explicitly copying it to
6031     // an ABCD_L register.
6032     //
6033     // The current assumption of the register allocator is that isel
6034     // won't generate explicit references to the GR8_ABCD_H registers. If
6035     // the allocator and/or the backend get enhanced to be more robust in
6036     // that regard, this can be, and should be, removed.
6037     if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
6038       SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
6039       unsigned AHExtOpcode =
6040           isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;
6041 
6042       SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
6043                                              MVT::Glue, AHCopy, InGlue);
6044       SDValue Result(RNode, 0);
6045       InGlue = SDValue(RNode, 1);
6046 
6047       Result =
6048           CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);
6049 
6050       ReplaceUses(SDValue(Node, 1), Result);
6051       LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
6052                  dbgs() << '\n');
6053     }
6054     // Copy the division (low) result, if it is needed.
6055     if (!SDValue(Node, 0).use_empty()) {
6056       SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
6057                                                 LoReg, NVT, InGlue);
6058       InGlue = Result.getValue(2);
6059       ReplaceUses(SDValue(Node, 0), Result);
6060       LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
6061                  dbgs() << '\n');
6062     }
6063     // Copy the remainder (high) result, if it is needed.
6064     if (!SDValue(Node, 1).use_empty()) {
6065       SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
6066                                               HiReg, NVT, InGlue);
6067       InGlue = Result.getValue(2);
6068       ReplaceUses(SDValue(Node, 1), Result);
6069       LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
6070                  dbgs() << '\n');
6071     }
6072     CurDAG->RemoveDeadNode(Node);
6073     return;
6074   }
6075 
  case X86ISD::FCMP:
  case X86ISD::STRICT_FCMP:
  case X86ISD::STRICT_FCMPS: {
    // The strict variants carry a chain as operand 0, so the two compare
    // operands are shifted up by one position.
    bool IsStrictCmp = Node->getOpcode() == X86ISD::STRICT_FCMP ||
                       Node->getOpcode() == X86ISD::STRICT_FCMPS;
    SDValue N0 = Node->getOperand(IsStrictCmp ? 1 : 0);
    SDValue N1 = Node->getOperand(IsStrictCmp ? 2 : 1);

    // Save the original VT of the compare.
    MVT CmpVT = N0.getSimpleValueType();

    // Floating point needs special handling if we don't have FCOMI.
    if (Subtarget->canUseCMOV())
      break;

    // STRICT_FCMPS selects the signaling (COM_Fp*) compare form; the others
    // use the quiet (UCOM_Fp*) form.
    bool IsSignaling = Node->getOpcode() == X86ISD::STRICT_FCMPS;

    // Pick the x87 compare instruction matching the operand width.
    unsigned Opc;
    switch (CmpVT.SimpleTy) {
    default: llvm_unreachable("Unexpected type!");
    case MVT::f32:
      Opc = IsSignaling ? X86::COM_Fpr32 : X86::UCOM_Fpr32;
      break;
    case MVT::f64:
      Opc = IsSignaling ? X86::COM_Fpr64 : X86::UCOM_Fpr64;
      break;
    case MVT::f80:
      Opc = IsSignaling ? X86::COM_Fpr80 : X86::UCOM_Fpr80;
      break;
    }

    // Strict compares must stay threaded on the incoming chain; non-strict
    // compares hang off the entry node.
    SDValue Chain =
        IsStrictCmp ? Node->getOperand(0) : CurDAG->getEntryNode();
    SDValue Glue;
    if (IsStrictCmp) {
      SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
      Chain = SDValue(CurDAG->getMachineNode(Opc, dl, VTs, {N0, N1, Chain}), 0);
      Glue = Chain.getValue(1);
    } else {
      Glue = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N0, N1), 0);
    }

    // Move FPSW to AX.
    SDValue FNSTSW =
        SDValue(CurDAG->getMachineNode(X86::FNSTSW16r, dl, MVT::i16, Glue), 0);

    // Extract upper 8-bits of AX.
    SDValue Extract =
        CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl, MVT::i8, FNSTSW);

    // Move AH into flags.
    // Some 64-bit targets lack SAHF support, but they do support FCOMI.
    assert(Subtarget->canUseLAHFSAHF() &&
           "Target doesn't support SAHF or FCOMI?");
    SDValue AH = CurDAG->getCopyToReg(Chain, dl, X86::AH, Extract, SDValue());
    Chain = AH;
    SDValue SAHF = SDValue(
        CurDAG->getMachineNode(X86::SAHF, dl, MVT::i32, AH.getValue(1)), 0);

    // Strict compares also produce a chain result that must be rewired.
    if (IsStrictCmp)
      ReplaceUses(SDValue(Node, 1), Chain);

    ReplaceUses(SDValue(Node, 0), SAHF);
    CurDAG->RemoveDeadNode(Node);
    return;
  }
6142 
  case X86ISD::CMP: {
    // Shrink (X86cmp x, 0) into the most compact TEST form available.
    SDValue N0 = Node->getOperand(0);
    SDValue N1 = Node->getOperand(1);

    // Optimizations for TEST compares.
    if (!isNullConstant(N1))
      break;

    // Save the original VT of the compare.
    MVT CmpVT = N0.getSimpleValueType();

    // If we are comparing (and (shr X, C, Mask) with 0, emit a BEXTR followed
    // by a test instruction. The test should be removed later by
    // analyzeCompare if we are using only the zero flag.
    // TODO: Should we check the users and use the BEXTR flags directly?
    if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
      if (MachineSDNode *NewNode = matchBEXTRFromAndImm(N0.getNode())) {
        unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
                                             : X86::TEST32rr;
        SDValue BEXTR = SDValue(NewNode, 0);
        NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR);
        ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
        CurDAG->RemoveDeadNode(Node);
        return;
      }
    }

    // We can peek through truncates, but we need to be careful below.
    if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
      N0 = N0.getOperand(0);

    // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
    // use a smaller encoding.
    // Look past the truncate if CMP is the only use of it.
    if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
        N0.getValueType() != MVT::i8) {
      auto *MaskC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
      if (!MaskC)
        break;

      // We may have looked through a truncate so mask off any bits that
      // shouldn't be part of the compare.
      uint64_t Mask = MaskC->getZExtValue();
      Mask &= maskTrailingOnes<uint64_t>(CmpVT.getScalarSizeInBits());

      // Check if we can replace AND+IMM{32,64} with a shift. This is possible
      // for masks like 0xFF000000 or 0x00FFFFFF and if we care only about the
      // zero flag.
      if (CmpVT == MVT::i64 && !isInt<8>(Mask) && isShiftedMask_64(Mask) &&
          onlyUsesZeroFlag(SDValue(Node, 0))) {
        // ISD::DELETED_NODE doubles as a "no transform chosen" sentinel here.
        unsigned ShiftOpcode = ISD::DELETED_NODE;
        unsigned ShiftAmt;
        unsigned SubRegIdx;
        MVT SubRegVT;
        unsigned TestOpcode;
        unsigned LeadingZeros = llvm::countl_zero(Mask);
        unsigned TrailingZeros = llvm::countr_zero(Mask);

        // With leading/trailing zeros, the transform is profitable if we can
        // eliminate a movabsq or shrink a 32-bit immediate to 8-bit without
        // incurring any extra register moves.
        bool SavesBytes = !isInt<32>(Mask) || N0.getOperand(0).hasOneUse();
        if (LeadingZeros == 0 && SavesBytes) {
          // If the mask covers the most significant bit, then we can replace
          // TEST+AND with a SHR and check eflags.
          // This emits a redundant TEST which is subsequently eliminated.
          ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
          ShiftAmt = TrailingZeros;
          SubRegIdx = 0;
          TestOpcode = X86::TEST64rr;
        } else if (TrailingZeros == 0 && SavesBytes) {
          // If the mask covers the least significant bit, then we can replace
          // TEST+AND with a SHL and check eflags.
          // This emits a redundant TEST which is subsequently eliminated.
          ShiftOpcode = GET_ND_IF_ENABLED(X86::SHL64ri);
          ShiftAmt = LeadingZeros;
          SubRegIdx = 0;
          TestOpcode = X86::TEST64rr;
        } else if (MaskC->hasOneUse() && !isInt<32>(Mask)) {
          // If the shifted mask extends into the high half and is 8/16/32 bits
          // wide, then replace it with a SHR and a TEST8rr/TEST16rr/TEST32rr.
          unsigned PopCount = 64 - LeadingZeros - TrailingZeros;
          if (PopCount == 8) {
            ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
            ShiftAmt = TrailingZeros;
            SubRegIdx = X86::sub_8bit;
            SubRegVT = MVT::i8;
            TestOpcode = X86::TEST8rr;
          } else if (PopCount == 16) {
            ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
            ShiftAmt = TrailingZeros;
            SubRegIdx = X86::sub_16bit;
            SubRegVT = MVT::i16;
            TestOpcode = X86::TEST16rr;
          } else if (PopCount == 32) {
            ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
            ShiftAmt = TrailingZeros;
            SubRegIdx = X86::sub_32bit;
            SubRegVT = MVT::i32;
            TestOpcode = X86::TEST32rr;
          }
        }
        if (ShiftOpcode != ISD::DELETED_NODE) {
          // Emit SHIFT + (optional subreg extract) + TEST reg,reg.
          SDValue ShiftC = CurDAG->getTargetConstant(ShiftAmt, dl, MVT::i64);
          SDValue Shift = SDValue(
              CurDAG->getMachineNode(ShiftOpcode, dl, MVT::i64, MVT::i32,
                                     N0.getOperand(0), ShiftC),
              0);
          if (SubRegIdx != 0) {
            Shift =
                CurDAG->getTargetExtractSubreg(SubRegIdx, dl, SubRegVT, Shift);
          }
          MachineSDNode *Test =
              CurDAG->getMachineNode(TestOpcode, dl, MVT::i32, Shift, Shift);
          ReplaceNode(Node, Test);
          return;
        }
      }

      MVT VT;
      int SubRegOp;
      unsigned ROpc, MOpc;

      // For each of these checks we need to be careful if the sign flag is
      // being used. It is only safe to use the sign flag in two conditions,
      // either the sign bit in the shrunken mask is zero or the final test
      // size is equal to the original compare size.

      if (isUInt<8>(Mask) &&
          (!(Mask & 0x80) || CmpVT == MVT::i8 ||
           hasNoSignFlagUses(SDValue(Node, 0)))) {
        // For example, convert "testl %eax, $8" to "testb %al, $8"
        VT = MVT::i8;
        SubRegOp = X86::sub_8bit;
        ROpc = X86::TEST8ri;
        MOpc = X86::TEST8mi;
      } else if (OptForMinSize && isUInt<16>(Mask) &&
                 (!(Mask & 0x8000) || CmpVT == MVT::i16 ||
                  hasNoSignFlagUses(SDValue(Node, 0)))) {
        // For example, "testl %eax, $32776" to "testw %ax, $32776".
        // NOTE: We only want to form TESTW instructions if optimizing for
        // min size. Otherwise we only save one byte and possibly get a length
        // changing prefix penalty in the decoders.
        VT = MVT::i16;
        SubRegOp = X86::sub_16bit;
        ROpc = X86::TEST16ri;
        MOpc = X86::TEST16mi;
      } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 &&
                 ((!(Mask & 0x80000000) &&
                   // Without minsize 16-bit Cmps can get here so we need to
                   // be sure we calculate the correct sign flag if needed.
                   (CmpVT != MVT::i16 || !(Mask & 0x8000))) ||
                  CmpVT == MVT::i32 ||
                  hasNoSignFlagUses(SDValue(Node, 0)))) {
        // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
        // NOTE: We only want to run that transform if N0 is 32 or 64 bits.
        // Otherwize, we find ourselves in a position where we have to do
        // promotion. If previous passes did not promote the and, we assume
        // they had a good reason not to and do not promote here.
        VT = MVT::i32;
        SubRegOp = X86::sub_32bit;
        ROpc = X86::TEST32ri;
        MOpc = X86::TEST32mi;
      } else {
        // No eligible transformation was found.
        break;
      }

      SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
      SDValue Reg = N0.getOperand(0);

      // Emit a testl or testw.
      MachineSDNode *NewNode;
      SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
      if (tryFoldLoad(Node, N0.getNode(), Reg, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
        // Don't fold a non-simple (e.g. volatile/atomic) load whose access
        // width would change under the narrower TEST opcode.
        if (auto *LoadN = dyn_cast<LoadSDNode>(N0.getOperand(0).getNode())) {
          if (!LoadN->isSimple()) {
            unsigned NumVolBits = LoadN->getValueType(0).getSizeInBits();
            if ((MOpc == X86::TEST8mi && NumVolBits != 8) ||
                (MOpc == X86::TEST16mi && NumVolBits != 16) ||
                (MOpc == X86::TEST32mi && NumVolBits != 32))
              break;
          }
        }
        SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
                          Reg.getOperand(0) };
        NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops);
        // Update the chain.
        ReplaceUses(Reg.getValue(1), SDValue(NewNode, 1));
        // Record the mem-refs
        CurDAG->setNodeMemRefs(NewNode,
                               {cast<LoadSDNode>(Reg)->getMemOperand()});
      } else {
        // Extract the subregister if necessary.
        if (N0.getValueType() != VT)
          Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);

        NewNode = CurDAG->getMachineNode(ROpc, dl, MVT::i32, Reg, Imm);
      }
      // Replace CMP with TEST.
      ReplaceNode(Node, NewNode);
      return;
    }
    break;
  }
  case X86ISD::PCMPISTR: {
    if (!Subtarget->hasSSE42())
      break;

    // Result 0 is the i32 index, result 1 is the v16i8 mask; the hardware
    // provides them via two separate instructions (PCMPISTRI / PCMPISTRM),
    // so emit only the one(s) whose result is actually used.
    bool NeedIndex = !SDValue(Node, 0).use_empty();
    bool NeedMask = !SDValue(Node, 1).use_empty();
    // We can't fold a load if we are going to make two instructions.
    bool MayFoldLoad = !NeedIndex || !NeedMask;

    MachineSDNode *CNode;
    if (NeedMask) {
      unsigned ROpc =
          Subtarget->hasAVX() ? X86::VPCMPISTRMrri : X86::PCMPISTRMrri;
      unsigned MOpc =
          Subtarget->hasAVX() ? X86::VPCMPISTRMrmi : X86::PCMPISTRMrmi;
      CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
      ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
    }
    // If neither result is used we still emit the index form so the flag
    // result below has a producer.
    if (NeedIndex || !NeedMask) {
      unsigned ROpc =
          Subtarget->hasAVX() ? X86::VPCMPISTRIrri : X86::PCMPISTRIrri;
      unsigned MOpc =
          Subtarget->hasAVX() ? X86::VPCMPISTRIrmi : X86::PCMPISTRIrmi;
      CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
      ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
    }

    // Connect the flag usage to the last instruction created.
    ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case X86ISD::PCMPESTR: {
    if (!Subtarget->hasSSE42())
      break;

    // Copy the two implicit register inputs.
    // PCMPESTR takes explicit string lengths in EAX and EDX, glued into the
    // compare instruction.
    SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX,
                                          Node->getOperand(1),
                                          SDValue()).getValue(1);
    InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
                                  Node->getOperand(3), InGlue).getValue(1);

    // Result 0 is the i32 index, result 1 is the v16i8 mask; emit only the
    // instruction(s) whose result is actually used.
    bool NeedIndex = !SDValue(Node, 0).use_empty();
    bool NeedMask = !SDValue(Node, 1).use_empty();
    // We can't fold a load if we are going to make two instructions.
    bool MayFoldLoad = !NeedIndex || !NeedMask;

    MachineSDNode *CNode;
    if (NeedMask) {
      unsigned ROpc =
          Subtarget->hasAVX() ? X86::VPCMPESTRMrri : X86::PCMPESTRMrri;
      unsigned MOpc =
          Subtarget->hasAVX() ? X86::VPCMPESTRMrmi : X86::PCMPESTRMrmi;
      CNode =
          emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node, InGlue);
      ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
    }
    // If neither result is used we still emit the index form so the flag
    // result below has a producer.
    if (NeedIndex || !NeedMask) {
      unsigned ROpc =
          Subtarget->hasAVX() ? X86::VPCMPESTRIrri : X86::PCMPESTRIrri;
      unsigned MOpc =
          Subtarget->hasAVX() ? X86::VPCMPESTRIrmi : X86::PCMPESTRIrmi;
      CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InGlue);
      ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
    }
    // Connect the flag usage to the last instruction created.
    ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
    CurDAG->RemoveDeadNode(Node);
    return;
  }
6419 
  case ISD::SETCC: {
    // Try to match a vector compare to an AVX-512 VPTESTM-style instruction;
    // otherwise fall through to table-generated isel.
    if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue()))
      return;

    break;
  }
6426 
  case ISD::STORE:
    // Try to fold a matching load/op into this store as a single
    // memory-operand instruction; otherwise use normal store selection.
    if (foldLoadStoreIntoMemOperand(Node))
      return;
    break;
6431 
  case X86ISD::SETCC_CARRY: {
    // Lower SETCC_CARRY either via SETB_C{32,64}r (when the target breaks the
    // SBB false dependency) or via a zeroed SBB from getSBBZero.
    MVT VT = Node->getSimpleValueType(0);
    SDValue Result;
    if (Subtarget->hasSBBDepBreaking()) {
      // We have to do this manually because tblgen will put the eflags copy in
      // the wrong place if we use an extract_subreg in the pattern.
      // Copy flags to the EFLAGS register and glue it to next node.
      SDValue EFLAGS =
          CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
                               Node->getOperand(1), SDValue());

      // Create a 64-bit instruction if the result is 64-bits otherwise use the
      // 32-bit version.
      unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r;
      MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
      Result = SDValue(
          CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)),
          0);
    } else {
      // The target does not recognize sbb with the same reg operand as a
      // no-source idiom, so we explicitly zero the input values.
      Result = getSBBZero(Node);
    }

    // For less than 32-bits we need to extract from the 32-bit node.
    if (VT == MVT::i8 || VT == MVT::i16) {
      int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
      Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
    }

    ReplaceUses(SDValue(Node, 0), Result);
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case X86ISD::SBB: {
    // Special-case SBB with both value operands zero: reuse getSBBZero, which
    // also supplies the flags result.
    if (isNullConstant(Node->getOperand(0)) &&
        isNullConstant(Node->getOperand(1))) {
      SDValue Result = getSBBZero(Node);

      // Replace the flag use.
      ReplaceUses(SDValue(Node, 1), Result.getValue(1));

      // Replace the result use.
      if (!SDValue(Node, 0).use_empty()) {
        // For less than 32-bits we need to extract from the 32-bit node.
        MVT VT = Node->getSimpleValueType(0);
        if (VT == MVT::i8 || VT == MVT::i16) {
          int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
          Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
        }
        ReplaceUses(SDValue(Node, 0), Result);
      }

      CurDAG->RemoveDeadNode(Node);
      return;
    }
    break;
  }
  case X86ISD::MGATHER: {
    // Select a masked gather by matching (index VT, element count, element
    // size) against the fixed table of gather instructions below.
    auto *Mgt = cast<X86MaskedGatherSDNode>(Node);
    SDValue IndexOp = Mgt->getIndex();
    SDValue Mask = Mgt->getMask();
    MVT IndexVT = IndexOp.getSimpleValueType();
    MVT ValueVT = Node->getSimpleValueType(0);
    MVT MaskVT = Mask.getSimpleValueType();

    // This is just to prevent crashes if the nodes are malformed somehow. We're
    // otherwise only doing loose type checking in here based on type what
    // a type constraint would say just like table based isel.
    if (!ValueVT.isVector() || !MaskVT.isVector())
      break;

    unsigned NumElts = ValueVT.getVectorNumElements();
    MVT ValueSVT = ValueVT.getVectorElementType();

    bool IsFP = ValueSVT.isFloatingPoint();
    unsigned EltSize = ValueSVT.getSizeInBits();

    // Opc stays 0 when no row of the table matches.
    unsigned Opc = 0;
    // An i1 mask element type indicates the AVX-512 (k-register masked) forms.
    bool AVX512Gather = MaskVT.getVectorElementType() == MVT::i1;
    if (AVX512Gather) {
      if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSZ128rm : X86::VPGATHERDDZ128rm;
      else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSZ256rm : X86::VPGATHERDDZ256rm;
      else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSZrm : X86::VPGATHERDDZrm;
      else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDZ128rm : X86::VPGATHERDQZ128rm;
      else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDZ256rm : X86::VPGATHERDQZ256rm;
      else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDZrm : X86::VPGATHERDQZrm;
      else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSZ128rm : X86::VPGATHERQDZ128rm;
      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSZ256rm : X86::VPGATHERQDZ256rm;
      else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSZrm : X86::VPGATHERQDZrm;
      else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDZ128rm : X86::VPGATHERQQZ128rm;
      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDZ256rm : X86::VPGATHERQQZ256rm;
      else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDZrm : X86::VPGATHERQQZrm;
    } else {
      assert(EVT(MaskVT) == EVT(ValueVT).changeVectorElementTypeToInteger() &&
             "Unexpected mask VT!");
      if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSrm : X86::VPGATHERDDrm;
      else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSYrm : X86::VPGATHERDDYrm;
      else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDrm : X86::VPGATHERDQrm;
      else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDYrm : X86::VPGATHERDQYrm;
      else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSrm : X86::VPGATHERQDrm;
      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSYrm : X86::VPGATHERQDYrm;
      else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDrm : X86::VPGATHERQQrm;
      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDYrm : X86::VPGATHERQQYrm;
    }

    if (!Opc)
      break;

    SDValue Base, Scale, Index, Disp, Segment;
    if (!selectVectorAddr(Mgt, Mgt->getBasePtr(), IndexOp, Mgt->getScale(),
                          Base, Scale, Index, Disp, Segment))
      break;

    SDValue PassThru = Mgt->getPassThru();
    SDValue Chain = Mgt->getChain();
    // Gather instructions have a mask output not in the ISD node.
    SDVTList VTs = CurDAG->getVTList(ValueVT, MaskVT, MVT::Other);

    // The AVX-512 and AVX2 forms take the mask in different operand
    // positions.
    MachineSDNode *NewNode;
    if (AVX512Gather) {
      SDValue Ops[] = {PassThru, Mask, Base,    Scale,
                       Index,    Disp, Segment, Chain};
      NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
    } else {
      SDValue Ops[] = {PassThru, Base,    Scale, Index,
                       Disp,     Segment, Mask,  Chain};
      NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
    }
    CurDAG->setNodeMemRefs(NewNode, {Mgt->getMemOperand()});
    // Skip the machine node's mask output (result 1); the ISD node only has
    // value and chain results.
    ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
    ReplaceUses(SDValue(Node, 1), SDValue(NewNode, 2));
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case X86ISD::MSCATTER: {
    // Select a masked scatter by matching (index VT, element count, element
    // size) against the fixed table of AVX-512 scatter instructions below.
    auto *Sc = cast<X86MaskedScatterSDNode>(Node);
    SDValue Value = Sc->getValue();
    SDValue IndexOp = Sc->getIndex();
    MVT IndexVT = IndexOp.getSimpleValueType();
    MVT ValueVT = Value.getSimpleValueType();

    // This is just to prevent crashes if the nodes are malformed somehow. We're
    // otherwise only doing loose type checking in here based on type what
    // a type constraint would say just like table based isel.
    if (!ValueVT.isVector())
      break;

    unsigned NumElts = ValueVT.getVectorNumElements();
    MVT ValueSVT = ValueVT.getVectorElementType();

    bool IsFP = ValueSVT.isFloatingPoint();
    unsigned EltSize = ValueSVT.getSizeInBits();

    unsigned Opc;
    if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERDPSZ128mr : X86::VPSCATTERDDZ128mr;
    else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERDPSZ256mr : X86::VPSCATTERDDZ256mr;
    else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERDPSZmr : X86::VPSCATTERDDZmr;
    else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERDPDZ128mr : X86::VPSCATTERDQZ128mr;
    else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERDPDZ256mr : X86::VPSCATTERDQZ256mr;
    else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERDPDZmr : X86::VPSCATTERDQZmr;
    else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERQPSZ128mr : X86::VPSCATTERQDZ128mr;
    else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERQPSZ256mr : X86::VPSCATTERQDZ256mr;
    else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERQPSZmr : X86::VPSCATTERQDZmr;
    else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERQPDZ128mr : X86::VPSCATTERQQZ128mr;
    else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERQPDZ256mr : X86::VPSCATTERQQZ256mr;
    else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERQPDZmr : X86::VPSCATTERQQZmr;
    else
      break;

    SDValue Base, Scale, Index, Disp, Segment;
    if (!selectVectorAddr(Sc, Sc->getBasePtr(), IndexOp, Sc->getScale(),
                          Base, Scale, Index, Disp, Segment))
      break;

    SDValue Mask = Sc->getMask();
    SDValue Chain = Sc->getChain();
    // Scatter instructions have a mask output not in the ISD node.
    SDVTList VTs = CurDAG->getVTList(Mask.getValueType(), MVT::Other);
    SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Value, Chain};

    MachineSDNode *NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
    CurDAG->setNodeMemRefs(NewNode, {Sc->getMemOperand()});
    // The ISD node's only result is the chain; skip the machine node's mask
    // output (result 0).
    ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 1));
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case ISD::PREALLOCATED_SETUP: {
    // Look up the preallocated-call id recorded for this call site in the
    // function info, then emit the target-independent SETUP pseudo with the
    // id as an i32 immediate.
    auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
    auto CallId = MFI->getPreallocatedIdForCallSite(
        cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
    SDValue Chain = Node->getOperand(0);
    SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
    MachineSDNode *New = CurDAG->getMachineNode(
        TargetOpcode::PREALLOCATED_SETUP, dl, MVT::Other, CallIdValue, Chain);
    ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Chain
    CurDAG->RemoveDeadNode(Node);
    return;
  }
6663   case ISD::PREALLOCATED_ARG: {
6664     auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
6665     auto CallId = MFI->getPreallocatedIdForCallSite(
6666         cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
6667     SDValue Chain = Node->getOperand(0);
6668     SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
6669     SDValue ArgIndex = Node->getOperand(2);
6670     SDValue Ops[3];
6671     Ops[0] = CallIdValue;
6672     Ops[1] = ArgIndex;
6673     Ops[2] = Chain;
6674     MachineSDNode *New = CurDAG->getMachineNode(
6675         TargetOpcode::PREALLOCATED_ARG, dl,
6676         CurDAG->getVTList(TLI->getPointerTy(CurDAG->getDataLayout()),
6677                           MVT::Other),
6678         Ops);
6679     ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Arg pointer
6680     ReplaceUses(SDValue(Node, 1), SDValue(New, 1)); // Chain
6681     CurDAG->RemoveDeadNode(Node);
6682     return;
6683   }
6684   case X86ISD::AESENCWIDE128KL:
6685   case X86ISD::AESDECWIDE128KL:
6686   case X86ISD::AESENCWIDE256KL:
6687   case X86ISD::AESDECWIDE256KL: {
6688     if (!Subtarget->hasWIDEKL())
6689       break;
6690 
6691     unsigned Opcode;
6692     switch (Node->getOpcode()) {
6693     default:
6694       llvm_unreachable("Unexpected opcode!");
6695     case X86ISD::AESENCWIDE128KL:
6696       Opcode = X86::AESENCWIDE128KL;
6697       break;
6698     case X86ISD::AESDECWIDE128KL:
6699       Opcode = X86::AESDECWIDE128KL;
6700       break;
6701     case X86ISD::AESENCWIDE256KL:
6702       Opcode = X86::AESENCWIDE256KL;
6703       break;
6704     case X86ISD::AESDECWIDE256KL:
6705       Opcode = X86::AESDECWIDE256KL;
6706       break;
6707     }
6708 
6709     SDValue Chain = Node->getOperand(0);
6710     SDValue Addr = Node->getOperand(1);
6711 
6712     SDValue Base, Scale, Index, Disp, Segment;
6713     if (!selectAddr(Node, Addr, Base, Scale, Index, Disp, Segment))
6714       break;
6715 
6716     Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(2),
6717                                  SDValue());
6718     Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(3),
6719                                  Chain.getValue(1));
6720     Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM2, Node->getOperand(4),
6721                                  Chain.getValue(1));
6722     Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM3, Node->getOperand(5),
6723                                  Chain.getValue(1));
6724     Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM4, Node->getOperand(6),
6725                                  Chain.getValue(1));
6726     Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM5, Node->getOperand(7),
6727                                  Chain.getValue(1));
6728     Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM6, Node->getOperand(8),
6729                                  Chain.getValue(1));
6730     Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM7, Node->getOperand(9),
6731                                  Chain.getValue(1));
6732 
6733     MachineSDNode *Res = CurDAG->getMachineNode(
6734         Opcode, dl, Node->getVTList(),
6735         {Base, Scale, Index, Disp, Segment, Chain, Chain.getValue(1)});
6736     CurDAG->setNodeMemRefs(Res, cast<MemSDNode>(Node)->getMemOperand());
6737     ReplaceNode(Node, Res);
6738     return;
6739   }
6740   case X86ISD::POP_FROM_X87_REG: {
6741     SDValue Chain = Node->getOperand(0);
6742     Register Reg = cast<RegisterSDNode>(Node->getOperand(1))->getReg();
6743     SDValue Glue;
6744     if (Node->getNumValues() == 3)
6745       Glue = Node->getOperand(2);
6746     SDValue Copy =
6747         CurDAG->getCopyFromReg(Chain, dl, Reg, Node->getValueType(0), Glue);
6748     ReplaceNode(Node, Copy.getNode());
6749     return;
6750   }
6751   }
6752 
6753   SelectCode(Node);
6754 }
6755 
SelectInlineAsmMemoryOperand(const SDValue & Op,InlineAsm::ConstraintCode ConstraintID,std::vector<SDValue> & OutOps)6756 bool X86DAGToDAGISel::SelectInlineAsmMemoryOperand(
6757     const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
6758     std::vector<SDValue> &OutOps) {
6759   SDValue Op0, Op1, Op2, Op3, Op4;
6760   switch (ConstraintID) {
6761   default:
6762     llvm_unreachable("Unexpected asm memory constraint");
6763   case InlineAsm::ConstraintCode::o: // offsetable        ??
6764   case InlineAsm::ConstraintCode::v: // not offsetable    ??
6765   case InlineAsm::ConstraintCode::m: // memory
6766   case InlineAsm::ConstraintCode::X:
6767   case InlineAsm::ConstraintCode::p: // address
6768     if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
6769       return true;
6770     break;
6771   }
6772 
6773   OutOps.push_back(Op0);
6774   OutOps.push_back(Op1);
6775   OutOps.push_back(Op2);
6776   OutOps.push_back(Op3);
6777   OutOps.push_back(Op4);
6778   return false;
6779 }
6780 
// New-pass-manager entry point: constructs the X86 DAG-to-DAG instruction
// selector at the target machine's configured optimization level and hands
// ownership of it to the generic SelectionDAGISelPass wrapper.
X86ISelDAGToDAGPass::X86ISelDAGToDAGPass(X86TargetMachine &TM)
    : SelectionDAGISelPass(
          std::make_unique<X86DAGToDAGISel>(TM, TM.getOptLevel())) {}
6784 
/// This pass converts a legalized DAG into a X86-specific DAG,
/// ready for instruction scheduling.
///
/// \param TM       the X86 target machine the selector is built for.
/// \param OptLevel codegen optimization level to select at.
/// \returns a newly allocated legacy FunctionPass; the caller (normally the
///          legacy pass manager) takes ownership.
FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
                                     CodeGenOptLevel OptLevel) {
  return new X86DAGToDAGISelLegacy(TM, OptLevel);
}
6791