1 //===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines a DAG pattern matching instruction selector for X86,
10 // converting from a legalized dag to a X86 dag.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "X86ISelDAGToDAG.h"
15 #include "X86.h"
16 #include "X86MachineFunctionInfo.h"
17 #include "X86RegisterInfo.h"
18 #include "X86Subtarget.h"
19 #include "X86TargetMachine.h"
20 #include "llvm/ADT/Statistic.h"
21 #include "llvm/CodeGen/MachineModuleInfo.h"
22 #include "llvm/CodeGen/SelectionDAGISel.h"
23 #include "llvm/Config/llvm-config.h"
24 #include "llvm/IR/ConstantRange.h"
25 #include "llvm/IR/Function.h"
26 #include "llvm/IR/Instructions.h"
27 #include "llvm/IR/Intrinsics.h"
28 #include "llvm/IR/IntrinsicsX86.h"
29 #include "llvm/IR/Module.h"
30 #include "llvm/IR/Type.h"
31 #include "llvm/Support/Debug.h"
32 #include "llvm/Support/ErrorHandling.h"
33 #include "llvm/Support/KnownBits.h"
34 #include "llvm/Support/MathExtras.h"
35 #include <cstdint>
36 
37 using namespace llvm;
38 
39 #define DEBUG_TYPE "x86-isel"
40 #define PASS_NAME "X86 DAG->DAG Instruction Selection"
41 
42 STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
43 
44 static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(true),
45     cl::desc("Enable setting constant bits to reduce size of mask immediates"),
46     cl::Hidden);
47 
48 static cl::opt<bool> EnablePromoteAnyextLoad(
49     "x86-promote-anyext-load", cl::init(true),
50     cl::desc("Enable promoting aligned anyext load to wider load"), cl::Hidden);
51 
52 extern cl::opt<bool> IndirectBranchTracking;
53 
54 //===----------------------------------------------------------------------===//
55 //                      Pattern Matcher Implementation
56 //===----------------------------------------------------------------------===//
57 
58 namespace {
59   /// This corresponds to X86AddressMode, but uses SDValues instead of register
60   /// numbers for the leaves of the matched tree.
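  /// The matched address has the canonical x86 memory-operand form
  /// Segment:[Base + Scale * Index + Disp], where Scale is 1, 2, 4 or 8.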
61   struct X86ISelAddressMode {
62     enum {
63       RegBase,
64       FrameIndexBase
65     } BaseType = RegBase;
66 
67     // This is really a union, discriminated by BaseType!
68     SDValue Base_Reg;
69     int Base_FrameIndex = 0;
70 
71     unsigned Scale = 1;
72     SDValue IndexReg;
73     int32_t Disp = 0;
74     SDValue Segment;
75     const GlobalValue *GV = nullptr;
76     const Constant *CP = nullptr;
77     const BlockAddress *BlockAddr = nullptr;
78     const char *ES = nullptr;
79     MCSymbol *MCSym = nullptr;
80     int JT = -1;
81     Align Alignment;            // CP alignment.
82     unsigned char SymbolFlags = X86II::MO_NO_FLAG;  // X86II::MO_*
83     bool NegateIndex = false;
84 
85     X86ISelAddressMode() = default;
86 
87     bool hasSymbolicDisplacement() const {
88       return GV != nullptr || CP != nullptr || ES != nullptr ||
89              MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
90     }
91 
92     bool hasBaseOrIndexReg() const {
93       return BaseType == FrameIndexBase ||
94              IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
95     }
96 
97     /// Return true if this addressing mode is already RIP-relative.
98     bool isRIPRelative() const {
99       if (BaseType != RegBase) return false;
100       if (RegisterSDNode *RegNode =
101             dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
102         return RegNode->getReg() == X86::RIP;
103       return false;
104     }
105 
106     void setBaseReg(SDValue Reg) {
107       BaseType = RegBase;
108       Base_Reg = Reg;
109     }
110 
111 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
112     void dump(SelectionDAG *DAG = nullptr) {
113       dbgs() << "X86ISelAddressMode " << this << '\n';
114       dbgs() << "Base_Reg ";
115       if (Base_Reg.getNode())
116         Base_Reg.getNode()->dump(DAG);
117       else
118         dbgs() << "nul\n";
119       if (BaseType == FrameIndexBase)
120         dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
121       dbgs() << " Scale " << Scale << '\n'
122              << "IndexReg ";
123       if (NegateIndex)
124         dbgs() << "negate ";
125       if (IndexReg.getNode())
126         IndexReg.getNode()->dump(DAG);
127       else
128         dbgs() << "nul\n";
129       dbgs() << " Disp " << Disp << '\n'
130              << "GV ";
131       if (GV)
132         GV->dump();
133       else
134         dbgs() << "nul";
135       dbgs() << " CP ";
136       if (CP)
137         CP->dump();
138       else
139         dbgs() << "nul";
140       dbgs() << '\n'
141              << "ES ";
142       if (ES)
143         dbgs() << ES;
144       else
145         dbgs() << "nul";
146       dbgs() << " MCSym ";
147       if (MCSym)
148         dbgs() << MCSym;
149       else
150         dbgs() << "nul";
151       dbgs() << " JT" << JT << " Align" << Alignment.value() << '\n';
152     }
153 #endif
154   };
155 }
156 
157 namespace {
158   //===--------------------------------------------------------------------===//
159   /// ISel - X86-specific code to select X86 machine instructions for
160   /// SelectionDAG operations.
161   ///
162   class X86DAGToDAGISel final : public SelectionDAGISel {
163     /// Keep a pointer to the X86Subtarget around so that we can
164     /// make the right decision when generating code for different targets.
165     const X86Subtarget *Subtarget;
166 
167     /// If true, selector should try to optimize for minimum code size.
168     bool OptForMinSize;
169 
170     /// Disable direct TLS access through segment registers.
171     bool IndirectTlsSegRefs;
172 
173   public:
174     X86DAGToDAGISel() = delete;
175 
176     explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOptLevel OptLevel)
177         : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr),
178           OptForMinSize(false), IndirectTlsSegRefs(false) {}
179 
180     bool runOnMachineFunction(MachineFunction &MF) override {
181       // Reset the subtarget each time through.
182       Subtarget = &MF.getSubtarget<X86Subtarget>();
183       IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
184                              "indirect-tls-seg-refs");
185 
186       // OptFor[Min]Size are used in pattern predicates that isel is matching.
187       OptForMinSize = MF.getFunction().hasMinSize();
188       assert((!OptForMinSize || MF.getFunction().hasOptSize()) &&
189              "OptForMinSize implies OptForSize");
190       return SelectionDAGISel::runOnMachineFunction(MF);
191     }
192 
193     void emitFunctionEntryCode() override;
194 
195     bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;
196 
197     void PreprocessISelDAG() override;
198     void PostprocessISelDAG() override;
199 
200 // Include the pieces autogenerated from the target description.
201 #include "X86GenDAGISel.inc"
202 
203   private:
204     void Select(SDNode *N) override;
205 
206     bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
207     bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
208                             bool AllowSegmentRegForX32 = false);
209     bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
210     bool matchAddress(SDValue N, X86ISelAddressMode &AM);
211     bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
212     bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth);
213     SDValue matchIndexRecursively(SDValue N, X86ISelAddressMode &AM,
214                                   unsigned Depth);
215     bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
216                                  unsigned Depth);
217     bool matchVectorAddressRecursively(SDValue N, X86ISelAddressMode &AM,
218                                        unsigned Depth);
219     bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
220     bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
221                     SDValue &Scale, SDValue &Index, SDValue &Disp,
222                     SDValue &Segment);
223     bool selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, SDValue IndexOp,
224                           SDValue ScaleOp, SDValue &Base, SDValue &Scale,
225                           SDValue &Index, SDValue &Disp, SDValue &Segment);
226     bool selectMOV64Imm32(SDValue N, SDValue &Imm);
227     bool selectLEAAddr(SDValue N, SDValue &Base,
228                        SDValue &Scale, SDValue &Index, SDValue &Disp,
229                        SDValue &Segment);
230     bool selectLEA64_32Addr(SDValue N, SDValue &Base,
231                             SDValue &Scale, SDValue &Index, SDValue &Disp,
232                             SDValue &Segment);
233     bool selectTLSADDRAddr(SDValue N, SDValue &Base,
234                            SDValue &Scale, SDValue &Index, SDValue &Disp,
235                            SDValue &Segment);
236     bool selectRelocImm(SDValue N, SDValue &Op);
237 
238     bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
239                      SDValue &Base, SDValue &Scale,
240                      SDValue &Index, SDValue &Disp,
241                      SDValue &Segment);
242 
243     // Convenience method where P is also root.
244     bool tryFoldLoad(SDNode *P, SDValue N,
245                      SDValue &Base, SDValue &Scale,
246                      SDValue &Index, SDValue &Disp,
247                      SDValue &Segment) {
248       return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
249     }
250 
251     bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
252                           SDValue &Base, SDValue &Scale,
253                           SDValue &Index, SDValue &Disp,
254                           SDValue &Segment);
255 
256     bool isProfitableToFormMaskedOp(SDNode *N) const;
257 
258     /// Implement addressing mode selection for inline asm expressions.
259     bool SelectInlineAsmMemoryOperand(const SDValue &Op,
260                                       InlineAsm::ConstraintCode ConstraintID,
261                                       std::vector<SDValue> &OutOps) override;
262 
263     void emitSpecialCodeForMain();
264 
265     inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
266                                    MVT VT, SDValue &Base, SDValue &Scale,
267                                    SDValue &Index, SDValue &Disp,
268                                    SDValue &Segment) {
269       if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
270         Base = CurDAG->getTargetFrameIndex(
271             AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout()));
272       else if (AM.Base_Reg.getNode())
273         Base = AM.Base_Reg;
274       else
275         Base = CurDAG->getRegister(0, VT);
276 
277       Scale = getI8Imm(AM.Scale, DL);
278 
279 #define GET_ND_IF_ENABLED(OPC) (Subtarget->hasNDD() ? OPC##_ND : OPC)
280       // Negate the index if needed.
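      // The x86 addressing mode can only add the scaled index, so an index
      // that was matched with a negation is first negated into a temporary
      // register here.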
281       if (AM.NegateIndex) {
282         unsigned NegOpc = VT == MVT::i64 ? GET_ND_IF_ENABLED(X86::NEG64r)
283                                          : GET_ND_IF_ENABLED(X86::NEG32r);
284         SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32,
285                                                      AM.IndexReg), 0);
286         AM.IndexReg = Neg;
287       }
288 
289       if (AM.IndexReg.getNode())
290         Index = AM.IndexReg;
291       else
292         Index = CurDAG->getRegister(0, VT);
293 
294       // These are 32-bit even in 64-bit mode since RIP-relative offset
295       // is 32-bit.
296       if (AM.GV)
297         Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
298                                               MVT::i32, AM.Disp,
299                                               AM.SymbolFlags);
300       else if (AM.CP)
301         Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, AM.Alignment,
302                                              AM.Disp, AM.SymbolFlags);
303       else if (AM.ES) {
304         assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
305         Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
306       } else if (AM.MCSym) {
307         assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
308         assert(AM.SymbolFlags == 0 && "Unexpected symbol flags with MCSym.");
309         Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32);
310       } else if (AM.JT != -1) {
311         assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
312         Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
313       } else if (AM.BlockAddr)
314         Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
315                                              AM.SymbolFlags);
316       else
317         Disp = CurDAG->getTargetConstant(AM.Disp, DL, MVT::i32);
318 
319       if (AM.Segment.getNode())
320         Segment = AM.Segment;
321       else
322         Segment = CurDAG->getRegister(0, MVT::i16);
323     }
324 
325     // Utility function to determine whether we should avoid selecting
326     // immediate forms of instructions for better code size.
327     // At a high level, we'd like to avoid such instructions when
328     // we have similar constants used within the same basic block
329     // that can be kept in a register.
330     //
331     bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
332       uint32_t UseCount = 0;
333 
334       // Do not want to hoist if we're not optimizing for size.
335       // TODO: We'd like to remove this restriction.
336       // See the comment in X86InstrInfo.td for more info.
337       if (!CurDAG->shouldOptForSize())
338         return false;
339 
340       // Walk all the users of the immediate.
341       for (const SDNode *User : N->uses()) {
342         if (UseCount >= 2)
343           break;
344 
345         // This user is already selected. Count it as a legitimate use and
346         // move on.
347         if (User->isMachineOpcode()) {
348           UseCount++;
349           continue;
350         }
351 
352         // We want to count stores of immediates as real uses.
353         if (User->getOpcode() == ISD::STORE &&
354             User->getOperand(1).getNode() == N) {
355           UseCount++;
356           continue;
357         }
358 
359         // We don't currently match users that have > 2 operands (except
360         // for stores, which are handled above).
361         // Those instructions won't match in ISEL, for now, and would
362         // be counted incorrectly.
363         // This may change in the future as we add additional instruction
364         // types.
365         if (User->getNumOperands() != 2)
366           continue;
367 
368         // If this is a sign-extended 8-bit integer immediate used in an ALU
369         // instruction, there is probably an opcode encoding to save space.
370         auto *C = dyn_cast<ConstantSDNode>(N);
371         if (C && isInt<8>(C->getSExtValue()))
372           continue;
373 
374         // Immediates that are used for offsets as part of stack
375         // manipulation should be left alone. These are typically
376         // used to indicate SP offsets for argument passing and
377         // will get pulled into stores/pushes (implicitly).
378         if (User->getOpcode() == X86ISD::ADD ||
379             User->getOpcode() == ISD::ADD    ||
380             User->getOpcode() == X86ISD::SUB ||
381             User->getOpcode() == ISD::SUB) {
382 
383           // Find the other operand of the add/sub.
384           SDValue OtherOp = User->getOperand(0);
385           if (OtherOp.getNode() == N)
386             OtherOp = User->getOperand(1);
387 
388           // Don't count if the other operand is SP.
389           RegisterSDNode *RegNode;
390           if (OtherOp->getOpcode() == ISD::CopyFromReg &&
391               (RegNode = dyn_cast_or_null<RegisterSDNode>(
392                  OtherOp->getOperand(1).getNode())))
393             if ((RegNode->getReg() == X86::ESP) ||
394                 (RegNode->getReg() == X86::RSP))
395               continue;
396         }
397 
398         // ... otherwise, count this and move on.
399         UseCount++;
400       }
401 
402       // If we have more than 1 use, then recommend for hoisting.
403       return (UseCount > 1);
404     }
405 
406     /// Return a target constant with the specified value of type i8.
407     inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
408       return CurDAG->getTargetConstant(Imm, DL, MVT::i8);
409     }
410 
411     /// Return a target constant with the specified value, of type i32.
412     inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
413       return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
414     }
415 
416     /// Return a target constant with the specified value, of type i64.
417     inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
418       return CurDAG->getTargetConstant(Imm, DL, MVT::i64);
419     }
420 
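    /// Convert the element index of an EXTRACT_SUBVECTOR into the lane
    /// immediate used by the VEXTRACT instructions, i.e. which VecWidth-bit
    /// chunk of the source vector is extracted. For example, extracting
    /// elements [4..7] of a v8i32 as a 128-bit subvector gives
    /// (4 * 32) / 128 = 1, the upper 128-bit lane.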
421     SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
422                                         const SDLoc &DL) {
423       assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
424       uint64_t Index = N->getConstantOperandVal(1);
425       MVT VecVT = N->getOperand(0).getSimpleValueType();
426       return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
427     }
428 
429     SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,
430                                       const SDLoc &DL) {
431       assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
432       uint64_t Index = N->getConstantOperandVal(2);
433       MVT VecVT = N->getSimpleValueType(0);
434       return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
435     }
436 
437     SDValue getPermuteVINSERTCommutedImmediate(SDNode *N, unsigned VecWidth,
438                                                const SDLoc &DL) {
439       assert(VecWidth == 128 && "Unexpected vector width");
440       uint64_t Index = N->getConstantOperandVal(2);
441       MVT VecVT = N->getSimpleValueType(0);
442       uint64_t InsertIdx = (Index * VecVT.getScalarSizeInBits()) / VecWidth;
443       assert((InsertIdx == 0 || InsertIdx == 1) && "Bad insertf128 index");
444       // vinsert(0,sub,vec) -> [sub0][vec1] -> vperm2x128(0x30,vec,sub)
445       // vinsert(1,sub,vec) -> [vec0][sub0] -> vperm2x128(0x02,vec,sub)
446       return getI8Imm(InsertIdx ? 0x02 : 0x30, DL);
447     }
448 
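    /// Materialize the carry flag as an all-ones or all-zeros value by
    /// computing SBB of a zeroed register with itself: the result is
    /// 0 - 0 - CF, i.e. all ones when the carry flag is set and zero otherwise.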
449     SDValue getSBBZero(SDNode *N) {
450       SDLoc dl(N);
451       MVT VT = N->getSimpleValueType(0);
452 
453       // Create zero.
454       SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
455       SDValue Zero = SDValue(
456           CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, std::nullopt), 0);
457       if (VT == MVT::i64) {
458         Zero = SDValue(
459             CurDAG->getMachineNode(
460                 TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
461                 CurDAG->getTargetConstant(0, dl, MVT::i64), Zero,
462                 CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)),
463             0);
464       }
465 
466       // Copy flags to the EFLAGS register and glue it to next node.
467       unsigned Opcode = N->getOpcode();
468       assert((Opcode == X86ISD::SBB || Opcode == X86ISD::SETCC_CARRY) &&
469              "Unexpected opcode for SBB materialization");
470       unsigned FlagOpIndex = Opcode == X86ISD::SBB ? 2 : 1;
471       SDValue EFLAGS =
472           CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
473                                N->getOperand(FlagOpIndex), SDValue());
474 
475       // Create a 64-bit instruction if the result is 64 bits; otherwise use the
476       // 32-bit version.
477       unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr;
478       MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
479       VTs = CurDAG->getVTList(SBBVT, MVT::i32);
480       return SDValue(
481           CurDAG->getMachineNode(Opc, dl, VTs,
482                                  {Zero, Zero, EFLAGS, EFLAGS.getValue(1)}),
483           0);
484     }
485 
486     // Helper to detect unneeded 'and' instructions on shift amounts. Called
487     // from PatFrags in tablegen.
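    // For example, for a 32-bit shift the hardware only uses the low 5 bits
    // of the amount, so (shl X, (and Y, 31)) does not need the 'and'. More
    // generally, the 'and' is unneeded when its trailing ones (combined with
    // the known-zero bits of the shift amount) cover at least Width bits.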
488     bool isUnneededShiftMask(SDNode *N, unsigned Width) const {
489       assert(N->getOpcode() == ISD::AND && "Unexpected opcode");
490       const APInt &Val = N->getConstantOperandAPInt(1);
491 
492       if (Val.countr_one() >= Width)
493         return true;
494 
495       APInt Mask = Val | CurDAG->computeKnownBits(N->getOperand(0)).Zero;
496       return Mask.countr_one() >= Width;
497     }
498 
499     /// Return an SDNode that returns the value of the global base register.
500     /// Output instructions required to initialize the global base register,
501     /// if necessary.
502     SDNode *getGlobalBaseReg();
503 
504     /// Return a reference to the TargetMachine, casted to the target-specific
505     /// type.
506     const X86TargetMachine &getTargetMachine() const {
507       return static_cast<const X86TargetMachine &>(TM);
508     }
509 
510     /// Return a reference to the TargetInstrInfo, casted to the target-specific
511     /// type.
512     const X86InstrInfo *getInstrInfo() const {
513       return Subtarget->getInstrInfo();
514     }
515 
516     /// Return the condition code of the given SDNode.
517     X86::CondCode getCondFromNode(SDNode *N) const;
518 
519     /// Address-mode matching performs shift-of-and to and-of-shift
520     /// reassociation in order to expose more scaled addressing
521     /// opportunities.
522     bool ComplexPatternFuncMutatesDAG() const override {
523       return true;
524     }
525 
526     bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;
527 
528     // Indicates we should prefer to use a non-temporal load for this load.
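    // Non-temporal vector loads lower to (V)MOVNTDQA, which requires a
    // naturally aligned 128-bit (SSE4.1), 256-bit (AVX2) or 512-bit (AVX512)
    // access; smaller or under-aligned loads use a regular load instead.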
529     bool useNonTemporalLoad(LoadSDNode *N) const {
530       if (!N->isNonTemporal())
531         return false;
532 
533       unsigned StoreSize = N->getMemoryVT().getStoreSize();
534 
535       if (N->getAlign().value() < StoreSize)
536         return false;
537 
538       switch (StoreSize) {
539       default: llvm_unreachable("Unsupported store size");
540       case 4:
541       case 8:
542         return false;
543       case 16:
544         return Subtarget->hasSSE41();
545       case 32:
546         return Subtarget->hasAVX2();
547       case 64:
548         return Subtarget->hasAVX512();
549       }
550     }
551 
552     bool foldLoadStoreIntoMemOperand(SDNode *Node);
553     MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);
554     bool matchBitExtract(SDNode *Node);
555     bool shrinkAndImmediate(SDNode *N);
556     bool isMaskZeroExtended(SDNode *N) const;
557     bool tryShiftAmountMod(SDNode *N);
558     bool tryShrinkShlLogicImm(SDNode *N);
559     bool tryVPTERNLOG(SDNode *N);
560     bool matchVPTERNLOG(SDNode *Root, SDNode *ParentA, SDNode *ParentB,
561                         SDNode *ParentC, SDValue A, SDValue B, SDValue C,
562                         uint8_t Imm);
563     bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
564     bool tryMatchBitSelect(SDNode *N);
565 
566     MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
567                                 const SDLoc &dl, MVT VT, SDNode *Node);
568     MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
569                                 const SDLoc &dl, MVT VT, SDNode *Node,
570                                 SDValue &InGlue);
571 
572     bool tryOptimizeRem8Extend(SDNode *N);
573 
574     bool onlyUsesZeroFlag(SDValue Flags) const;
575     bool hasNoSignFlagUses(SDValue Flags) const;
576     bool hasNoCarryFlagUses(SDValue Flags) const;
577   };
578 
579   class X86DAGToDAGISelLegacy : public SelectionDAGISelLegacy {
580   public:
581     static char ID;
582     explicit X86DAGToDAGISelLegacy(X86TargetMachine &tm,
583                                    CodeGenOptLevel OptLevel)
584         : SelectionDAGISelLegacy(
585               ID, std::make_unique<X86DAGToDAGISel>(tm, OptLevel)) {}
586   };
587 }
588 
589 char X86DAGToDAGISelLegacy::ID = 0;
590 
591 INITIALIZE_PASS(X86DAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false)
592 
593 // Returns true if this masked compare can be implemented legally with this
594 // type.
595 static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
596   unsigned Opcode = N->getOpcode();
597   if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMM ||
598       Opcode == X86ISD::STRICT_CMPM || Opcode == ISD::SETCC ||
599       Opcode == X86ISD::CMPMM_SAE || Opcode == X86ISD::VFPCLASS) {
600     // We can get 256-bit 8 element types here without VLX being enabled. When
601     // this happens we will use 512-bit operations and the mask will not be
602     // zero extended.
603     EVT OpVT = N->getOperand(0).getValueType();
604     // The first operand of X86ISD::STRICT_CMPM is chain, so we need to get the
605     // second operand.
606     if (Opcode == X86ISD::STRICT_CMPM)
607       OpVT = N->getOperand(1).getValueType();
608     if (OpVT.is256BitVector() || OpVT.is128BitVector())
609       return Subtarget->hasVLX();
610 
611     return true;
612   }
613   // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
614   if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
615       Opcode == X86ISD::FSETCCM_SAE)
616     return true;
617 
618   return false;
619 }
620 
621 // Returns true if we can assume the writer of the mask has zero extended it
622 // for us.
623 bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {
624   // If this is an AND, check if we have a compare on either side. As long as
625   // one side guarantees the mask is zero extended, the AND will preserve those
626   // zeros.
627   if (N->getOpcode() == ISD::AND)
628     return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) ||
629            isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget);
630 
631   return isLegalMaskCompare(N, Subtarget);
632 }
633 
634 bool
635 X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
636   if (OptLevel == CodeGenOptLevel::None)
637     return false;
638 
639   if (!N.hasOneUse())
640     return false;
641 
642   if (N.getOpcode() != ISD::LOAD)
643     return true;
644 
645   // Don't fold non-temporal loads if we have an instruction for them.
646   if (useNonTemporalLoad(cast<LoadSDNode>(N)))
647     return false;
648 
649   // If N is a load, do additional profitability checks.
650   if (U == Root) {
651     switch (U->getOpcode()) {
652     default: break;
653     case X86ISD::ADD:
654     case X86ISD::ADC:
655     case X86ISD::SUB:
656     case X86ISD::SBB:
657     case X86ISD::AND:
658     case X86ISD::XOR:
659     case X86ISD::OR:
660     case ISD::ADD:
661     case ISD::UADDO_CARRY:
662     case ISD::AND:
663     case ISD::OR:
664     case ISD::XOR: {
665       SDValue Op1 = U->getOperand(1);
666 
667       // If the other operand is an 8-bit immediate we should fold the immediate
668       // instead. This reduces code size.
669       // e.g.
670       // movl 4(%esp), %eax
671       // addl $4, %eax
672       // vs.
673       // movl $4, %eax
674       // addl 4(%esp), %eax
675       // The former is 2 bytes shorter. In the case where the increment is 1,
676       // the saving can be 4 bytes (by using incl %eax).
677       if (auto *Imm = dyn_cast<ConstantSDNode>(Op1)) {
678         if (Imm->getAPIntValue().isSignedIntN(8))
679           return false;
680 
681         // If this is a 64-bit AND with an immediate that fits in 32-bits,
682         // prefer using the smaller and over folding the load. This is needed to
683         // make sure immediates created by shrinkAndImmediate are always folded.
684         // Ideally we would narrow the load during DAG combine and get the
685         // best of both worlds.
686         if (U->getOpcode() == ISD::AND &&
687             Imm->getAPIntValue().getBitWidth() == 64 &&
688             Imm->getAPIntValue().isIntN(32))
689           return false;
690 
691         // If this is really a zext_inreg that can be represented with a movzx
692         // instruction, prefer that.
693         // TODO: We could shrink the load and fold if it is non-volatile.
694         if (U->getOpcode() == ISD::AND &&
695             (Imm->getAPIntValue() == UINT8_MAX ||
696              Imm->getAPIntValue() == UINT16_MAX ||
697              Imm->getAPIntValue() == UINT32_MAX))
698           return false;
699 
700         // ADD/SUB can negate the immediate and use the opposite operation
701         // to fit 128 into a sign-extended 8-bit immediate.
702         if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) &&
703             (-Imm->getAPIntValue()).isSignedIntN(8))
704           return false;
705 
706         if ((U->getOpcode() == X86ISD::ADD || U->getOpcode() == X86ISD::SUB) &&
707             (-Imm->getAPIntValue()).isSignedIntN(8) &&
708             hasNoCarryFlagUses(SDValue(U, 1)))
709           return false;
710       }
711 
712       // If the other operand is a TLS address, we should fold it instead.
713       // This produces
714       // movl    %gs:0, %eax
715       // leal    i@NTPOFF(%eax), %eax
716       // instead of
717       // movl    $i@NTPOFF, %eax
718       // addl    %gs:0, %eax
719       // if the block also has an access to a second TLS address this will save
720       // a load.
721       // FIXME: This is probably also true for non-TLS addresses.
722       if (Op1.getOpcode() == X86ISD::Wrapper) {
723         SDValue Val = Op1.getOperand(0);
724         if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
725           return false;
726       }
727 
728       // Don't fold load if this matches the BTS/BTR/BTC patterns.
729       // BTS: (or X, (shl 1, n))
730       // BTR: (and X, (rotl -2, n))
731       // BTC: (xor X, (shl 1, n))
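      // Note that (rotl -2, n) equals ~(1 << n), the single-bit clear mask
      // used by BTR.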
732       if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) {
733         if (U->getOperand(0).getOpcode() == ISD::SHL &&
734             isOneConstant(U->getOperand(0).getOperand(0)))
735           return false;
736 
737         if (U->getOperand(1).getOpcode() == ISD::SHL &&
738             isOneConstant(U->getOperand(1).getOperand(0)))
739           return false;
740       }
741       if (U->getOpcode() == ISD::AND) {
742         SDValue U0 = U->getOperand(0);
743         SDValue U1 = U->getOperand(1);
744         if (U0.getOpcode() == ISD::ROTL) {
745           auto *C = dyn_cast<ConstantSDNode>(U0.getOperand(0));
746           if (C && C->getSExtValue() == -2)
747             return false;
748         }
749 
750         if (U1.getOpcode() == ISD::ROTL) {
751           auto *C = dyn_cast<ConstantSDNode>(U1.getOperand(0));
752           if (C && C->getSExtValue() == -2)
753             return false;
754         }
755       }
756 
757       break;
758     }
759     case ISD::SHL:
760     case ISD::SRA:
761     case ISD::SRL:
762       // Don't fold a load into a shift by immediate. The BMI2 instructions
763       // support folding a load, but not an immediate. The legacy instructions
764       // support folding an immediate, but can't fold a load. Folding an
765       // immediate is preferable to folding a load.
766       if (isa<ConstantSDNode>(U->getOperand(1)))
767         return false;
768 
769       break;
770     }
771   }
772 
773   // Prevent folding a load if this can be implemented with an insert_subreg or
774   // a move that implicitly zeroes.
775   if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
776       isNullConstant(Root->getOperand(2)) &&
777       (Root->getOperand(0).isUndef() ||
778        ISD::isBuildVectorAllZeros(Root->getOperand(0).getNode())))
779     return false;
780 
781   return true;
782 }
783 
784 // Indicates it is profitable to form an AVX512 masked operation. Returning
785 // false will favor a register-register masked move or vblendm, and the
786 // operation will be selected separately.
787 bool X86DAGToDAGISel::isProfitableToFormMaskedOp(SDNode *N) const {
788   assert(
789       (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::SELECTS) &&
790       "Unexpected opcode!");
791 
792   // If the operation has additional users, the operation will be duplicated.
793   // Check the use count to prevent that.
794   // FIXME: Are there cheap opcodes we might want to duplicate?
795   return N->getOperand(1).hasOneUse();
796 }
797 
798 /// Replace the original chain operand of the call with the
799 /// load's chain operand and move the load below the call's chain operand.
800 static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
801                                SDValue Call, SDValue OrigChain) {
802   SmallVector<SDValue, 8> Ops;
803   SDValue Chain = OrigChain.getOperand(0);
804   if (Chain.getNode() == Load.getNode())
805     Ops.push_back(Load.getOperand(0));
806   else {
807     assert(Chain.getOpcode() == ISD::TokenFactor &&
808            "Unexpected chain operand");
809     for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
810       if (Chain.getOperand(i).getNode() == Load.getNode())
811         Ops.push_back(Load.getOperand(0));
812       else
813         Ops.push_back(Chain.getOperand(i));
814     SDValue NewChain =
815       CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
816     Ops.clear();
817     Ops.push_back(NewChain);
818   }
819   Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end());
820   CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);
821   CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
822                              Load.getOperand(1), Load.getOperand(2));
823 
824   Ops.clear();
825   Ops.push_back(SDValue(Load.getNode(), 1));
826   Ops.append(Call->op_begin() + 1, Call->op_end());
827   CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
828 }
829 
830 /// Return true if call address is a load and it can be
831 /// moved below CALLSEQ_START and the chains leading up to the call.
832 /// Return the CALLSEQ_START by reference as a second output.
833 /// In the case of a tail call, there isn't a callseq node between the call
834 /// chain and the load.
835 static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
836   // The transformation is somewhat dangerous if the call's chain was glued to
837   // the call. After MoveBelowOrigChain the load is moved between the call and
838   // the chain, this can create a cycle if the load is not folded. So it is
839 // the chain; this can create a cycle if the load is not folded. So it is
840   if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
841     return false;
842   auto *LD = dyn_cast<LoadSDNode>(Callee.getNode());
843   if (!LD ||
844       !LD->isSimple() ||
845       LD->getAddressingMode() != ISD::UNINDEXED ||
846       LD->getExtensionType() != ISD::NON_EXTLOAD)
847     return false;
848 
849   // Now let's find the callseq_start.
850   while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
851     if (!Chain.hasOneUse())
852       return false;
853     Chain = Chain.getOperand(0);
854   }
855 
856   if (!Chain.getNumOperands())
857     return false;
858   // Since we are not checking for AA here, conservatively abort if the chain
859   // writes to memory. It's not safe to move the callee (a load) across a store.
860   if (isa<MemSDNode>(Chain.getNode()) &&
861       cast<MemSDNode>(Chain.getNode())->writeMem())
862     return false;
863   if (Chain.getOperand(0).getNode() == Callee.getNode())
864     return true;
865   if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
866       Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
867       Callee.getValue(1).hasOneUse())
868     return true;
869   return false;
870 }
871 
872 static bool isEndbrImm64(uint64_t Imm) {
873 // There may be some other prefix bytes between 0xF3 and 0x0F1EFA.
874 // e.g. 0xF3660F1EFA, 0xF3670F1EFA
875   if ((Imm & 0x00FFFFFF) != 0x0F1EFA)
876     return false;
877 
878   uint8_t OptionalPrefixBytes [] = {0x26, 0x2e, 0x36, 0x3e, 0x64,
879                                     0x65, 0x66, 0x67, 0xf0, 0xf2};
880   int i = 24; // the low 24 bits (0x0F1EFA) have already been matched
881   while (i < 64) {
882     uint8_t Byte = (Imm >> i) & 0xFF;
883     if (Byte == 0xF3)
884       return true;
885     if (!llvm::is_contained(OptionalPrefixBytes, Byte))
886       return false;
887     i += 8;
888   }
889 
890   return false;
891 }
892 
893 static bool needBWI(MVT VT) {
894   return (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v64i8);
895 }
896 
897 void X86DAGToDAGISel::PreprocessISelDAG() {
898   bool MadeChange = false;
899   for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
900        E = CurDAG->allnodes_end(); I != E; ) {
901     SDNode *N = &*I++; // Advance the iterator early to avoid invalidation issues.
902 
903     // This is for CET enhancement.
904     //
905     // ENDBR32 and ENDBR64 have specific opcodes:
906     // ENDBR32: F3 0F 1E FB
907     // ENDBR64: F3 0F 1E FA
908     // We want to ensure that attackers cannot find unintended ENDBR32/64
909     // opcode matches in the binary.
910     // Here's an example:
911     // If the compiler had to generate asm for the following code:
912     // a = 0xF30F1EFA
913     // it could, for example, generate:
914     // mov dword ptr [a], 0xF30F1EFA
915     // In such a case, the binary would include a gadget that starts
916     // with a fake ENDBR64 opcode. Therefore, we split such generation
917     // into multiple operations so that the pattern does not appear in the binary.
918     if (N->getOpcode() == ISD::Constant) {
919       MVT VT = N->getSimpleValueType(0);
920       int64_t Imm = cast<ConstantSDNode>(N)->getSExtValue();
921       int32_t EndbrImm = Subtarget->is64Bit() ? 0xF30F1EFA : 0xF30F1EFB;
922       if (Imm == EndbrImm || isEndbrImm64(Imm)) {
923         // Check that the cf-protection-branch is enabled.
924         Metadata *CFProtectionBranch =
925             MF->getFunction().getParent()->getModuleFlag(
926                 "cf-protection-branch");
927         if (CFProtectionBranch || IndirectBranchTracking) {
928           SDLoc dl(N);
929           SDValue Complement = CurDAG->getConstant(~Imm, dl, VT, false, true);
930           Complement = CurDAG->getNOT(dl, Complement, VT);
931           --I;
932           CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Complement);
933           ++I;
934           MadeChange = true;
935           continue;
936         }
937       }
938     }
939 
940     // If this is a target specific AND node with no flag usages, turn it back
941     // into ISD::AND to enable test instruction matching.
942     if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) {
943       SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0),
944                                     N->getOperand(0), N->getOperand(1));
945       --I;
946       CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
947       ++I;
948       MadeChange = true;
949       continue;
950     }
951 
952     // Convert vector increment or decrement to sub/add with an all-ones
953     // constant:
954     // add X, <1, 1...> --> sub X, <-1, -1...>
955     // sub X, <1, 1...> --> add X, <-1, -1...>
956     // The all-ones vector constant can be materialized using a pcmpeq
957     // instruction that is commonly recognized as an idiom (has no register
958     // dependency), so that's better/smaller than loading a splat 1 constant.
959     //
960     // But don't do this if it would inhibit a potentially profitable load
961     // folding opportunity for the other operand. That only occurs with the
962     // intersection of:
963     // (1) The other operand (op0) is load foldable.
964     // (2) The op is an add (otherwise, we are *creating* an add and can still
965     //     load fold the other op).
966     // (3) The target has AVX (otherwise, we have a destructive add and can't
967     //     load fold the other op without killing the constant op).
968     // (4) The constant 1 vector has multiple uses (so it is profitable to load
969     //     into a register anyway).
970     auto mayPreventLoadFold = [&]() {
971       return X86::mayFoldLoad(N->getOperand(0), *Subtarget) &&
972              N->getOpcode() == ISD::ADD && Subtarget->hasAVX() &&
973              !N->getOperand(1).hasOneUse();
974     };
975     if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
976         N->getSimpleValueType(0).isVector() && !mayPreventLoadFold()) {
977       APInt SplatVal;
978       if (X86::isConstantSplat(N->getOperand(1), SplatVal) &&
979           SplatVal.isOne()) {
980         SDLoc DL(N);
981 
982         MVT VT = N->getSimpleValueType(0);
983         unsigned NumElts = VT.getSizeInBits() / 32;
984         SDValue AllOnes =
985             CurDAG->getAllOnesConstant(DL, MVT::getVectorVT(MVT::i32, NumElts));
986         AllOnes = CurDAG->getBitcast(VT, AllOnes);
987 
988         unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
989         SDValue Res =
990             CurDAG->getNode(NewOpcode, DL, VT, N->getOperand(0), AllOnes);
991         --I;
992         CurDAG->ReplaceAllUsesWith(N, Res.getNode());
993         ++I;
994         MadeChange = true;
995         continue;
996       }
997     }
998 
999     switch (N->getOpcode()) {
1000     case X86ISD::VBROADCAST: {
1001       MVT VT = N->getSimpleValueType(0);
1002       // Emulate v32i16/v64i8 broadcast without BWI.
1003       if (!Subtarget->hasBWI() && needBWI(VT)) {
1004         MVT NarrowVT = VT.getHalfNumVectorElementsVT();
1005         SDLoc dl(N);
1006         SDValue NarrowBCast =
1007             CurDAG->getNode(X86ISD::VBROADCAST, dl, NarrowVT, N->getOperand(0));
1008         SDValue Res =
1009             CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
1010                             NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
1011         unsigned Index = NarrowVT.getVectorMinNumElements();
1012         Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
1013                               CurDAG->getIntPtrConstant(Index, dl));
1014 
1015         --I;
1016         CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1017         ++I;
1018         MadeChange = true;
1019         continue;
1020       }
1021 
1022       break;
1023     }
1024     case X86ISD::VBROADCAST_LOAD: {
1025       MVT VT = N->getSimpleValueType(0);
1026       // Emulate v32i16/v64i8 broadcast without BWI.
1027       if (!Subtarget->hasBWI() && needBWI(VT)) {
1028         MVT NarrowVT = VT.getHalfNumVectorElementsVT();
1029         auto *MemNode = cast<MemSDNode>(N);
1030         SDLoc dl(N);
1031         SDVTList VTs = CurDAG->getVTList(NarrowVT, MVT::Other);
1032         SDValue Ops[] = {MemNode->getChain(), MemNode->getBasePtr()};
1033         SDValue NarrowBCast = CurDAG->getMemIntrinsicNode(
1034             X86ISD::VBROADCAST_LOAD, dl, VTs, Ops, MemNode->getMemoryVT(),
1035             MemNode->getMemOperand());
1036         SDValue Res =
1037             CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
1038                             NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
1039         unsigned Index = NarrowVT.getVectorMinNumElements();
1040         Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
1041                               CurDAG->getIntPtrConstant(Index, dl));
1042 
1043         --I;
1044         SDValue To[] = {Res, NarrowBCast.getValue(1)};
1045         CurDAG->ReplaceAllUsesWith(N, To);
1046         ++I;
1047         MadeChange = true;
1048         continue;
1049       }
1050 
1051       break;
1052     }
1053     case ISD::LOAD: {
1054       // If this is a XMM/YMM load of the same lower bits as another YMM/ZMM
1055       // If this is an XMM/YMM load of the same lower bits as another YMM/ZMM
1056       auto *Ld = cast<LoadSDNode>(N);
1057       MVT VT = N->getSimpleValueType(0);
1058       if (!ISD::isNormalLoad(Ld) || !Ld->isSimple() ||
1059           !(VT.is128BitVector() || VT.is256BitVector()))
1060         break;
1061 
1062       MVT MaxVT = VT;
1063       SDNode *MaxLd = nullptr;
1064       SDValue Ptr = Ld->getBasePtr();
1065       SDValue Chain = Ld->getChain();
1066       for (SDNode *User : Ptr->uses()) {
1067         auto *UserLd = dyn_cast<LoadSDNode>(User);
1068         MVT UserVT = User->getSimpleValueType(0);
1069         if (User != N && UserLd && ISD::isNormalLoad(User) &&
1070             UserLd->getBasePtr() == Ptr && UserLd->getChain() == Chain &&
1071             !User->hasAnyUseOfValue(1) &&
1072             (UserVT.is256BitVector() || UserVT.is512BitVector()) &&
1073             UserVT.getSizeInBits() > VT.getSizeInBits() &&
1074             (!MaxLd || UserVT.getSizeInBits() > MaxVT.getSizeInBits())) {
1075           MaxLd = User;
1076           MaxVT = UserVT;
1077         }
1078       }
1079       if (MaxLd) {
1080         SDLoc dl(N);
1081         unsigned NumSubElts = VT.getSizeInBits() / MaxVT.getScalarSizeInBits();
1082         MVT SubVT = MVT::getVectorVT(MaxVT.getScalarType(), NumSubElts);
1083         SDValue Extract = CurDAG->getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT,
1084                                           SDValue(MaxLd, 0),
1085                                           CurDAG->getIntPtrConstant(0, dl));
1086         SDValue Res = CurDAG->getBitcast(VT, Extract);
1087 
1088         --I;
1089         SDValue To[] = {Res, SDValue(MaxLd, 1)};
1090         CurDAG->ReplaceAllUsesWith(N, To);
1091         ++I;
1092         MadeChange = true;
1093         continue;
1094       }
1095       break;
1096     }
1097     case ISD::VSELECT: {
1098       // Replace VSELECT with non-mask conditions with BLENDV/VPTERNLOG.
1099       EVT EleVT = N->getOperand(0).getValueType().getVectorElementType();
1100       if (EleVT == MVT::i1)
1101         break;
1102 
1103       assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
1104       assert(N->getValueType(0).getVectorElementType() != MVT::i16 &&
1105              "We can't replace VSELECT with BLENDV in vXi16!");
1106       SDValue R;
1107       if (Subtarget->hasVLX() && CurDAG->ComputeNumSignBits(N->getOperand(0)) ==
1108                                      EleVT.getSizeInBits()) {
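        // The condition elements are known to be all-ones or all-zeros, so
        // VPTERNLOG with immediate 0xCA (the truth table for A ? B : C, i.e.
        // (A & B) | (~A & C)) implements the select directly.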
1109         R = CurDAG->getNode(X86ISD::VPTERNLOG, SDLoc(N), N->getValueType(0),
1110                             N->getOperand(0), N->getOperand(1), N->getOperand(2),
1111                             CurDAG->getTargetConstant(0xCA, SDLoc(N), MVT::i8));
1112       } else {
1113         R = CurDAG->getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0),
1114                             N->getOperand(0), N->getOperand(1),
1115                             N->getOperand(2));
1116       }
1117       --I;
1118       CurDAG->ReplaceAllUsesWith(N, R.getNode());
1119       ++I;
1120       MadeChange = true;
1121       continue;
1122     }
1123     case ISD::FP_ROUND:
1124     case ISD::STRICT_FP_ROUND:
1125     case ISD::FP_TO_SINT:
1126     case ISD::FP_TO_UINT:
1127     case ISD::STRICT_FP_TO_SINT:
1128     case ISD::STRICT_FP_TO_UINT: {
1129       // Replace vector fp_to_s/uint with their X86 specific equivalent so we
1130       // don't need 2 sets of patterns.
1131       if (!N->getSimpleValueType(0).isVector())
1132         break;
1133 
1134       unsigned NewOpc;
1135       switch (N->getOpcode()) {
1136       default: llvm_unreachable("Unexpected opcode!");
1137       case ISD::FP_ROUND:          NewOpc = X86ISD::VFPROUND;        break;
1138       case ISD::STRICT_FP_ROUND:   NewOpc = X86ISD::STRICT_VFPROUND; break;
1139       case ISD::STRICT_FP_TO_SINT: NewOpc = X86ISD::STRICT_CVTTP2SI; break;
1140       case ISD::FP_TO_SINT:        NewOpc = X86ISD::CVTTP2SI;        break;
1141       case ISD::STRICT_FP_TO_UINT: NewOpc = X86ISD::STRICT_CVTTP2UI; break;
1142       case ISD::FP_TO_UINT:        NewOpc = X86ISD::CVTTP2UI;        break;
1143       }
1144       SDValue Res;
1145       if (N->isStrictFPOpcode())
1146         Res =
1147             CurDAG->getNode(NewOpc, SDLoc(N), {N->getValueType(0), MVT::Other},
1148                             {N->getOperand(0), N->getOperand(1)});
1149       else
1150         Res =
1151             CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1152                             N->getOperand(0));
1153       --I;
1154       CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1155       ++I;
1156       MadeChange = true;
1157       continue;
1158     }
1159     case ISD::SHL:
1160     case ISD::SRA:
1161     case ISD::SRL: {
1162       // Replace vector shifts with their X86 specific equivalent so we don't
1163       // need 2 sets of patterns.
1164       if (!N->getValueType(0).isVector())
1165         break;
1166 
1167       unsigned NewOpc;
1168       switch (N->getOpcode()) {
1169       default: llvm_unreachable("Unexpected opcode!");
1170       case ISD::SHL: NewOpc = X86ISD::VSHLV; break;
1171       case ISD::SRA: NewOpc = X86ISD::VSRAV; break;
1172       case ISD::SRL: NewOpc = X86ISD::VSRLV; break;
1173       }
1174       SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1175                                     N->getOperand(0), N->getOperand(1));
1176       --I;
1177       CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1178       ++I;
1179       MadeChange = true;
1180       continue;
1181     }
1182     case ISD::ANY_EXTEND:
1183     case ISD::ANY_EXTEND_VECTOR_INREG: {
1184       // Replace vector any extend with the zero extend equivalents so we don't
1185       // need 2 sets of patterns. Ignore vXi1 extensions.
1186       if (!N->getValueType(0).isVector())
1187         break;
1188 
1189       unsigned NewOpc;
1190       if (N->getOperand(0).getScalarValueSizeInBits() == 1) {
1191         assert(N->getOpcode() == ISD::ANY_EXTEND &&
1192                "Unexpected opcode for mask vector!");
1193         NewOpc = ISD::SIGN_EXTEND;
1194       } else {
1195         NewOpc = N->getOpcode() == ISD::ANY_EXTEND
1196                               ? ISD::ZERO_EXTEND
1197                               : ISD::ZERO_EXTEND_VECTOR_INREG;
1198       }
1199 
1200       SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1201                                     N->getOperand(0));
1202       --I;
1203       CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1204       ++I;
1205       MadeChange = true;
1206       continue;
1207     }
1208     case ISD::FCEIL:
1209     case ISD::STRICT_FCEIL:
1210     case ISD::FFLOOR:
1211     case ISD::STRICT_FFLOOR:
1212     case ISD::FTRUNC:
1213     case ISD::STRICT_FTRUNC:
1214     case ISD::FROUNDEVEN:
1215     case ISD::STRICT_FROUNDEVEN:
1216     case ISD::FNEARBYINT:
1217     case ISD::STRICT_FNEARBYINT:
1218     case ISD::FRINT:
1219     case ISD::STRICT_FRINT: {
1220       // Replace fp rounding with their X86 specific equivalent so we don't
1221       // need 2 sets of patterns.
1222       unsigned Imm;
1223       switch (N->getOpcode()) {
1224       default: llvm_unreachable("Unexpected opcode!");
1225       case ISD::STRICT_FCEIL:
1226       case ISD::FCEIL:      Imm = 0xA; break;
1227       case ISD::STRICT_FFLOOR:
1228       case ISD::FFLOOR:     Imm = 0x9; break;
1229       case ISD::STRICT_FTRUNC:
1230       case ISD::FTRUNC:     Imm = 0xB; break;
1231       case ISD::STRICT_FROUNDEVEN:
1232       case ISD::FROUNDEVEN: Imm = 0x8; break;
1233       case ISD::STRICT_FNEARBYINT:
1234       case ISD::FNEARBYINT: Imm = 0xC; break;
1235       case ISD::STRICT_FRINT:
1236       case ISD::FRINT:      Imm = 0x4; break;
1237       }
1238       SDLoc dl(N);
1239       bool IsStrict = N->isStrictFPOpcode();
1240       SDValue Res;
1241       if (IsStrict)
1242         Res = CurDAG->getNode(X86ISD::STRICT_VRNDSCALE, dl,
1243                               {N->getValueType(0), MVT::Other},
1244                               {N->getOperand(0), N->getOperand(1),
1245                                CurDAG->getTargetConstant(Imm, dl, MVT::i32)});
1246       else
1247         Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, N->getValueType(0),
1248                               N->getOperand(0),
1249                               CurDAG->getTargetConstant(Imm, dl, MVT::i32));
1250       --I;
1251       CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1252       ++I;
1253       MadeChange = true;
1254       continue;
1255     }
1256     case X86ISD::FANDN:
1257     case X86ISD::FAND:
1258     case X86ISD::FOR:
1259     case X86ISD::FXOR: {
1260       // Widen scalar fp logic ops to vector to reduce isel patterns.
1261       // FIXME: Can we do this during lowering/combine.
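      // For example, an f32 FXOR becomes a v4f32 (or, with SSE2, v4i32)
      // operation on the full register; only lane 0 is extracted afterwards
      // and the upper lanes are don't-care.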
1262       MVT VT = N->getSimpleValueType(0);
1263       if (VT.isVector() || VT == MVT::f128)
1264         break;
1265 
1266       MVT VecVT = VT == MVT::f64   ? MVT::v2f64
1267                   : VT == MVT::f32 ? MVT::v4f32
1268                                    : MVT::v8f16;
1269 
1270       SDLoc dl(N);
1271       SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
1272                                     N->getOperand(0));
1273       SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
1274                                     N->getOperand(1));
1275 
1276       SDValue Res;
1277       if (Subtarget->hasSSE2()) {
1278         EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger();
1279         Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0);
1280         Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1);
1281         unsigned Opc;
1282         switch (N->getOpcode()) {
1283         default: llvm_unreachable("Unexpected opcode!");
1284         case X86ISD::FANDN: Opc = X86ISD::ANDNP; break;
1285         case X86ISD::FAND:  Opc = ISD::AND;      break;
1286         case X86ISD::FOR:   Opc = ISD::OR;       break;
1287         case X86ISD::FXOR:  Opc = ISD::XOR;      break;
1288         }
1289         Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1);
1290         Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res);
1291       } else {
1292         Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1);
1293       }
1294       Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res,
1295                             CurDAG->getIntPtrConstant(0, dl));
1296       --I;
1297       CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1298       ++I;
1299       MadeChange = true;
1300       continue;
1301     }
1302     }
1303 
1304     if (OptLevel != CodeGenOptLevel::None &&
1305         // Only do this when the target can fold the load into the call or
1306         // jmp.
1307         !Subtarget->useIndirectThunkCalls() &&
1308         ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||
1309          (N->getOpcode() == X86ISD::TC_RETURN &&
1310           (Subtarget->is64Bit() ||
1311            !getTargetMachine().isPositionIndependent())))) {
1312       /// Also try moving call address load from outside callseq_start to just
1313       /// before the call to allow it to be folded.
1314       ///
1315       ///     [Load chain]
1316       ///         ^
1317       ///         |
1318       ///       [Load]
1319       ///       ^    ^
1320       ///       |    |
1321       ///      /      \--
1322       ///     /          |
1323       ///[CALLSEQ_START] |
1324       ///     ^          |
1325       ///     |          |
1326       /// [LOAD/C2Reg]   |
1327       ///     |          |
1328       ///      \        /
1329       ///       \      /
1330       ///       [CALL]
1331       bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
1332       SDValue Chain = N->getOperand(0);
1333       SDValue Load  = N->getOperand(1);
1334       if (!isCalleeLoad(Load, Chain, HasCallSeq))
1335         continue;
1336       moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
1337       ++NumLoadMoved;
1338       MadeChange = true;
1339       continue;
1340     }
1341 
1342     // Lower fpround and fpextend nodes that target the FP stack to be a store and
1343     // load to the stack.  This is a gross hack.  We would like to simply mark
1344     // these as being illegal, but when we do that, legalize produces these when
1345     // it expands calls, then expands these in the same legalize pass.  We would
1346     // like dag combine to be able to hack on these between the call expansion
1347     // and the node legalization.  As such this pass basically does "really
1348     // late" legalization of these inline with the X86 isel pass.
1349     // FIXME: This should only happen when not compiled with -O0.
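         // For example, rounding an x87 f80 value to an SSE f32 becomes a
         // truncating f32 store to a stack temporary followed by an f32 load into
         // an SSE register.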
1350     switch (N->getOpcode()) {
1351     default: continue;
1352     case ISD::FP_ROUND:
1353     case ISD::FP_EXTEND:
1354     {
1355       MVT SrcVT = N->getOperand(0).getSimpleValueType();
1356       MVT DstVT = N->getSimpleValueType(0);
1357 
1358       // If any of the sources are vectors, no fp stack involved.
1359       if (SrcVT.isVector() || DstVT.isVector())
1360         continue;
1361 
1362       // If the source and destination are SSE registers, then this is a legal
1363       // conversion that should not be lowered.
1364       const X86TargetLowering *X86Lowering =
1365           static_cast<const X86TargetLowering *>(TLI);
1366       bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
1367       bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
1368       if (SrcIsSSE && DstIsSSE)
1369         continue;
1370 
1371       if (!SrcIsSSE && !DstIsSSE) {
1372         // If this is an FPStack extension, it is a noop.
1373         if (N->getOpcode() == ISD::FP_EXTEND)
1374           continue;
1375         // If this is a value-preserving FPStack truncation, it is a noop.
1376         if (N->getConstantOperandVal(1))
1377           continue;
1378       }
1379 
1380       // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1381       // FPStack has extload and truncstore.  SSE can fold direct loads into other
1382       // operations.  Based on this, decide what we want to do.
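           // In other words, the value always round-trips through memory in the
           // narrower of the two types: the destination type for FP_ROUND and the
           // source type for FP_EXTEND.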
1383       MVT MemVT = (N->getOpcode() == ISD::FP_ROUND) ? DstVT : SrcVT;
1384       SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
1385       int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
1386       MachinePointerInfo MPI =
1387           MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
1388       SDLoc dl(N);
1389 
1390       // FIXME: optimize the case where the src/dest is a load or store?
1391 
1392       SDValue Store = CurDAG->getTruncStore(
1393           CurDAG->getEntryNode(), dl, N->getOperand(0), MemTmp, MPI, MemVT);
1394       SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store,
1395                                           MemTmp, MPI, MemVT);
1396 
1397       // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1398       // extload we created.  This will cause general havoc on the dag because
1399       // anything below the conversion could be folded into other existing nodes.
1400       // To avoid invalidating 'I', back it up to the convert node.
1401       --I;
1402       CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
1403       break;
1404     }
1405 
1406     // The sequence of events for lowering STRICT_FP versions of these nodes requires
1407     // dealing with the chain differently, as there is already a preexisting chain.
1408     case ISD::STRICT_FP_ROUND:
1409     case ISD::STRICT_FP_EXTEND:
1410     {
1411       MVT SrcVT = N->getOperand(1).getSimpleValueType();
1412       MVT DstVT = N->getSimpleValueType(0);
1413 
1414       // If any of the sources are vectors, no fp stack involved.
1415       if (SrcVT.isVector() || DstVT.isVector())
1416         continue;
1417 
1418       // If the source and destination are SSE registers, then this is a legal
1419       // conversion that should not be lowered.
1420       const X86TargetLowering *X86Lowering =
1421           static_cast<const X86TargetLowering *>(TLI);
1422       bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
1423       bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
1424       if (SrcIsSSE && DstIsSSE)
1425         continue;
1426 
1427       if (!SrcIsSSE && !DstIsSSE) {
1428         // If this is an FPStack extension, it is a noop.
1429         if (N->getOpcode() == ISD::STRICT_FP_EXTEND)
1430           continue;
1431         // If this is a value-preserving FPStack truncation, it is a noop.
1432         if (N->getConstantOperandVal(2))
1433           continue;
1434       }
1435 
1436       // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1437       // FPStack has extload and truncstore.  SSE can fold direct loads into other
1438       // operations.  Based on this, decide what we want to do.
1439       MVT MemVT = (N->getOpcode() == ISD::STRICT_FP_ROUND) ? DstVT : SrcVT;
1440       SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
1441       int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
1442       MachinePointerInfo MPI =
1443           MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
1444       SDLoc dl(N);
1445 
1446       // FIXME: optimize the case where the src/dest is a load or store?
1447 
1448       // Since the operation is StrictFP, use the preexisting chain.
1449       SDValue Store, Result;
1450       if (!SrcIsSSE) {
1451         SDVTList VTs = CurDAG->getVTList(MVT::Other);
1452         SDValue Ops[] = {N->getOperand(0), N->getOperand(1), MemTmp};
1453         Store = CurDAG->getMemIntrinsicNode(X86ISD::FST, dl, VTs, Ops, MemVT,
1454                                             MPI, /*Align*/ std::nullopt,
1455                                             MachineMemOperand::MOStore);
1456         if (N->getFlags().hasNoFPExcept()) {
1457           SDNodeFlags Flags = Store->getFlags();
1458           Flags.setNoFPExcept(true);
1459           Store->setFlags(Flags);
1460         }
1461       } else {
1462         assert(SrcVT == MemVT && "Unexpected VT!");
1463         Store = CurDAG->getStore(N->getOperand(0), dl, N->getOperand(1), MemTmp,
1464                                  MPI);
1465       }
1466 
1467       if (!DstIsSSE) {
1468         SDVTList VTs = CurDAG->getVTList(DstVT, MVT::Other);
1469         SDValue Ops[] = {Store, MemTmp};
1470         Result = CurDAG->getMemIntrinsicNode(
1471             X86ISD::FLD, dl, VTs, Ops, MemVT, MPI,
1472             /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
1473         if (N->getFlags().hasNoFPExcept()) {
1474           SDNodeFlags Flags = Result->getFlags();
1475           Flags.setNoFPExcept(true);
1476           Result->setFlags(Flags);
1477         }
1478       } else {
1479         assert(DstVT == MemVT && "Unexpected VT!");
1480         Result = CurDAG->getLoad(DstVT, dl, Store, MemTmp, MPI);
1481       }
1482 
1483       // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1484       // extload we created.  This will cause general havoc on the dag because
1485       // anything below the conversion could be folded into other existing nodes.
1486       // To avoid invalidating 'I', back it up to the convert node.
1487       --I;
1488       CurDAG->ReplaceAllUsesWith(N, Result.getNode());
1489       break;
1490     }
1491     }
1492 
1493 
1494     // Now that we did that, the node is dead.  Increment the iterator to the
1495     // next node to process, then delete N.
1496     ++I;
1497     MadeChange = true;
1498   }
1499 
1500   // Remove any dead nodes that may have been left behind.
1501   if (MadeChange)
1502     CurDAG->RemoveDeadNodes();
1503 }
1504 
1505 // Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
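     // For example, (MOVZX32rr8 (EXTRACT_SUBREG (MOVZX32rr8_NOREX ...), sub_8bit))
     // can simply reuse the inner MOVZX32rr8_NOREX result; the 8->64 sign-extend
     // form instead becomes a MOVSX64rr32 of the existing 8->32 extend.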
1506 bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {
1507   unsigned Opc = N->getMachineOpcode();
1508   if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 &&
1509       Opc != X86::MOVSX64rr8)
1510     return false;
1511 
1512   SDValue N0 = N->getOperand(0);
1513 
1514   // We need to be extracting the lower bit of an extend.
1515   if (!N0.isMachineOpcode() ||
1516       N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG ||
1517       N0.getConstantOperandVal(1) != X86::sub_8bit)
1518     return false;
1519 
1520   // We're looking for either a movsx or movzx to match the original opcode.
1521   unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX
1522                                                 : X86::MOVSX32rr8_NOREX;
1523   SDValue N00 = N0.getOperand(0);
1524   if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc)
1525     return false;
1526 
1527   if (Opc == X86::MOVSX64rr8) {
1528       // If we had a sign extend from 8 to 64 bits, we still need to go from 32
1529     // to 64.
1530     MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N),
1531                                                    MVT::i64, N00);
1532     ReplaceUses(N, Extend);
1533   } else {
1534     // Ok we can drop this extend and just use the original extend.
1535     ReplaceUses(N, N00.getNode());
1536   }
1537 
1538   return true;
1539 }
1540 
1541 void X86DAGToDAGISel::PostprocessISelDAG() {
1542   // Skip peepholes at -O0.
1543   if (TM.getOptLevel() == CodeGenOptLevel::None)
1544     return;
1545 
1546   SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
1547 
1548   bool MadeChange = false;
1549   while (Position != CurDAG->allnodes_begin()) {
1550     SDNode *N = &*--Position;
1551     // Skip dead nodes and any non-machine opcodes.
1552     if (N->use_empty() || !N->isMachineOpcode())
1553       continue;
1554 
1555     if (tryOptimizeRem8Extend(N)) {
1556       MadeChange = true;
1557       continue;
1558     }
1559 
1560     unsigned Opc = N->getMachineOpcode();
1561     switch (Opc) {
1562     default:
1563       continue;
1564     // ANDrr/rm + TESTrr -> TESTrr/TESTmr
1565     case X86::TEST8rr:
1566     case X86::TEST16rr:
1567     case X86::TEST32rr:
1568     case X86::TEST64rr:
1569     // ANDrr/rm + CTESTrr -> CTESTrr/CTESTmr
1570     case X86::CTEST8rr:
1571     case X86::CTEST16rr:
1572     case X86::CTEST32rr:
1573     case X86::CTEST64rr: {
1574       auto &Op0 = N->getOperand(0);
1575       if (Op0 != N->getOperand(1) || !Op0->hasNUsesOfValue(2, Op0.getResNo()) ||
1576           !Op0.isMachineOpcode())
1577         continue;
1578       SDValue And = N->getOperand(0);
1579 #define CASE_ND(OP)                                                            \
1580   case X86::OP:                                                                \
1581   case X86::OP##_ND:
1582       switch (And.getMachineOpcode()) {
1583       default:
1584         continue;
1585         CASE_ND(AND8rr)
1586         CASE_ND(AND16rr)
1587         CASE_ND(AND32rr)
1588         CASE_ND(AND64rr) {
1589           if (And->hasAnyUseOfValue(1))
1590             continue;
1591           SmallVector<SDValue> Ops(N->op_values());
1592           Ops[0] = And.getOperand(0);
1593           Ops[1] = And.getOperand(1);
1594           MachineSDNode *Test =
1595               CurDAG->getMachineNode(Opc, SDLoc(N), MVT::i32, Ops);
1596           ReplaceUses(N, Test);
1597           MadeChange = true;
1598           continue;
1599         }
1600         CASE_ND(AND8rm)
1601         CASE_ND(AND16rm)
1602         CASE_ND(AND32rm)
1603         CASE_ND(AND64rm) {
1604           if (And->hasAnyUseOfValue(1))
1605             continue;
1606           unsigned NewOpc;
1607           bool IsCTESTCC = X86::isCTESTCC(Opc);
1608 #define FROM_TO(A, B)                                                          \
1609   CASE_ND(A) NewOpc = IsCTESTCC ? X86::C##B : X86::B;                          \
1610   break;
1611           switch (And.getMachineOpcode()) {
1612             FROM_TO(AND8rm, TEST8mr);
1613             FROM_TO(AND16rm, TEST16mr);
1614             FROM_TO(AND32rm, TEST32mr);
1615             FROM_TO(AND64rm, TEST64mr);
1616           }
1617 #undef FROM_TO
1618 #undef CASE_ND
1619           // Need to swap the memory and register operands.
1620           SmallVector<SDValue> Ops = {And.getOperand(1), And.getOperand(2),
1621                                       And.getOperand(3), And.getOperand(4),
1622                                       And.getOperand(5), And.getOperand(0)};
1623           // CC, Cflags.
1624           if (IsCTESTCC) {
1625             Ops.push_back(N->getOperand(2));
1626             Ops.push_back(N->getOperand(3));
1627           }
1628           // Chain of memory load
1629           Ops.push_back(And.getOperand(6));
1630           // Glue
1631           if (IsCTESTCC)
1632             Ops.push_back(N->getOperand(4));
1633 
1634           MachineSDNode *Test = CurDAG->getMachineNode(
1635               NewOpc, SDLoc(N), MVT::i32, MVT::Other, Ops);
1636           CurDAG->setNodeMemRefs(
1637               Test, cast<MachineSDNode>(And.getNode())->memoperands());
1638           ReplaceUses(And.getValue(2), SDValue(Test, 1));
1639           ReplaceUses(SDValue(N, 0), SDValue(Test, 0));
1640           MadeChange = true;
1641           continue;
1642         }
1643       }
1644     }
1645     // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is
1646     // used. We're doing this late so we can prefer to fold the AND into masked
1647     // comparisons. Doing that can be better for the live range of the mask
1648     // register.
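         // For example, a KORTESTW of (KANDW a, b) with itself, where only ZF is
         // consumed, becomes KTESTW a, b.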
1649     case X86::KORTESTBrr:
1650     case X86::KORTESTWrr:
1651     case X86::KORTESTDrr:
1652     case X86::KORTESTQrr: {
1653       SDValue Op0 = N->getOperand(0);
1654       if (Op0 != N->getOperand(1) || !N->isOnlyUserOf(Op0.getNode()) ||
1655           !Op0.isMachineOpcode() || !onlyUsesZeroFlag(SDValue(N, 0)))
1656         continue;
1657 #define CASE(A)                                                                \
1658   case X86::A:                                                                 \
1659     break;
1660       switch (Op0.getMachineOpcode()) {
1661       default:
1662         continue;
1663         CASE(KANDBrr)
1664         CASE(KANDWrr)
1665         CASE(KANDDrr)
1666         CASE(KANDQrr)
1667       }
1668       unsigned NewOpc;
1669 #define FROM_TO(A, B)                                                          \
1670   case X86::A:                                                                 \
1671     NewOpc = X86::B;                                                           \
1672     break;
1673       switch (Opc) {
1674         FROM_TO(KORTESTBrr, KTESTBrr)
1675         FROM_TO(KORTESTWrr, KTESTWrr)
1676         FROM_TO(KORTESTDrr, KTESTDrr)
1677         FROM_TO(KORTESTQrr, KTESTQrr)
1678       }
1679       // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other
1680       // KAND instructions and KTEST use the same ISA feature.
1681       if (NewOpc == X86::KTESTWrr && !Subtarget->hasDQI())
1682         continue;
1683 #undef FROM_TO
1684       MachineSDNode *KTest = CurDAG->getMachineNode(
1685           NewOpc, SDLoc(N), MVT::i32, Op0.getOperand(0), Op0.getOperand(1));
1686       ReplaceUses(N, KTest);
1687       MadeChange = true;
1688       continue;
1689     }
1690     // Attempt to remove vector moves that were inserted to zero upper bits.
1691     case TargetOpcode::SUBREG_TO_REG: {
1692       unsigned SubRegIdx = N->getConstantOperandVal(2);
1693       if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)
1694         continue;
1695 
1696       SDValue Move = N->getOperand(1);
1697       if (!Move.isMachineOpcode())
1698         continue;
1699 
1700       // Make sure it's one of the move opcodes we recognize.
1701       switch (Move.getMachineOpcode()) {
1702       default:
1703         continue;
1704         CASE(VMOVAPDrr)       CASE(VMOVUPDrr)
1705         CASE(VMOVAPSrr)       CASE(VMOVUPSrr)
1706         CASE(VMOVDQArr)       CASE(VMOVDQUrr)
1707         CASE(VMOVAPDYrr)      CASE(VMOVUPDYrr)
1708         CASE(VMOVAPSYrr)      CASE(VMOVUPSYrr)
1709         CASE(VMOVDQAYrr)      CASE(VMOVDQUYrr)
1710         CASE(VMOVAPDZ128rr)   CASE(VMOVUPDZ128rr)
1711         CASE(VMOVAPSZ128rr)   CASE(VMOVUPSZ128rr)
1712         CASE(VMOVDQA32Z128rr) CASE(VMOVDQU32Z128rr)
1713         CASE(VMOVDQA64Z128rr) CASE(VMOVDQU64Z128rr)
1714         CASE(VMOVAPDZ256rr)   CASE(VMOVUPDZ256rr)
1715         CASE(VMOVAPSZ256rr)   CASE(VMOVUPSZ256rr)
1716         CASE(VMOVDQA32Z256rr) CASE(VMOVDQU32Z256rr)
1717         CASE(VMOVDQA64Z256rr) CASE(VMOVDQU64Z256rr)
1718       }
1719 #undef CASE
1720 
1721       SDValue In = Move.getOperand(0);
1722       if (!In.isMachineOpcode() ||
1723           In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)
1724         continue;
1725 
1726       // Make sure the instruction has a VEX, XOP, or EVEX prefix. This excludes
1727       // the SHA instructions, which use a legacy encoding.
1728       uint64_t TSFlags = getInstrInfo()->get(In.getMachineOpcode()).TSFlags;
1729       if ((TSFlags & X86II::EncodingMask) != X86II::VEX &&
1730           (TSFlags & X86II::EncodingMask) != X86II::EVEX &&
1731           (TSFlags & X86II::EncodingMask) != X86II::XOP)
1732         continue;
1733 
1734       // Producing instruction is another vector instruction. We can drop the
1735       // move.
1736       CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2));
1737       MadeChange = true;
1738     }
1739     }
1740   }
1741 
1742   if (MadeChange)
1743     CurDAG->RemoveDeadNodes();
1744 }
1745 
1746 
1747 /// Emit any code that needs to be executed only in the main function.
1748 void X86DAGToDAGISel::emitSpecialCodeForMain() {
1749   if (Subtarget->isTargetCygMing()) {
1750     TargetLowering::ArgListTy Args;
1751     auto &DL = CurDAG->getDataLayout();
1752 
1753     TargetLowering::CallLoweringInfo CLI(*CurDAG);
1754     CLI.setChain(CurDAG->getRoot())
1755         .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()),
1756                    CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)),
1757                    std::move(Args));
1758     const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
1759     std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
1760     CurDAG->setRoot(Result.second);
1761   }
1762 }
1763 
1764 void X86DAGToDAGISel::emitFunctionEntryCode() {
1765   // If this is main, emit special code for main.
1766   const Function &F = MF->getFunction();
1767   if (F.hasExternalLinkage() && F.getName() == "main")
1768     emitSpecialCodeForMain();
1769 }
1770 
1771 static bool isDispSafeForFrameIndex(int64_t Val) {
1772   // On 64-bit platforms, we can run into an issue where a frame index
1773   // includes a displacement that, when added to the explicit displacement,
1774   // will overflow the displacement field. Assuming that the frame index
1775     // displacement fits into a 31-bit integer (which is only slightly more
1776   // aggressive than the current fundamental assumption that it fits into
1777   // a 32-bit integer), a 31-bit disp should always be safe.
1778   return isInt<31>(Val);
1779 }
1780 
1781 bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
1782                                             X86ISelAddressMode &AM) {
1783   // We may have already matched a displacement and the caller just added the
1784   // symbolic displacement. So we still need to do the checks even if Offset
1785   // is zero.
1786 
1787   int64_t Val = AM.Disp + Offset;
1788 
1789   // Cannot combine ExternalSymbol displacements with integer offsets.
1790   if (Val != 0 && (AM.ES || AM.MCSym))
1791     return true;
1792 
1793   CodeModel::Model M = TM.getCodeModel();
1794   if (Subtarget->is64Bit()) {
1795     if (Val != 0 &&
1796         !X86::isOffsetSuitableForCodeModel(Val, M,
1797                                            AM.hasSymbolicDisplacement()))
1798       return true;
1799     // In addition to the checks required for a register base, check that
1800     // we do not try to use an unsafe Disp with a frame index.
1801     if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
1802         !isDispSafeForFrameIndex(Val))
1803       return true;
1804     // In ILP32 (x32) mode, pointers are 32 bits and need to be zero-extended to
1805     // 64 bits. Instructions with 32-bit register addresses perform this zero
1806     // extension for us and we can safely ignore the high bits of Offset.
1807     // Instructions with only a 32-bit immediate address do not, though: they
1808     // sign extend instead. This means only the low 2GB of the address space is
1809     // directly addressable; we need indirect addressing for the high 2GB of
1810     // address space.
1811     // TODO: Some of the earlier checks may be relaxed for ILP32 mode as the
1812     // implicit zero extension of instructions would cover up any problem.
1813     // However, we have asserts elsewhere that get triggered if we do, so keep
1814     // the checks for now.
1815     // TODO: We would actually be able to accept these, as well as the same
1816     // addresses in LP64 mode, by adding the EIZ pseudo-register as an operand
1817     // to get an address size override to be emitted. However, this
1818     // pseudo-register is not part of any register class and therefore causes
1819     // MIR verification to fail.
1820     if (Subtarget->isTarget64BitILP32() && !isUInt<31>(Val) &&
1821         !AM.hasBaseOrIndexReg())
1822       return true;
1823   }
1824   AM.Disp = Val;
1825   return false;
1826 }
1827 
1828 bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
1829                                          bool AllowSegmentRegForX32) {
1830   SDValue Address = N->getOperand(1);
1831 
1832   // load gs:0 -> GS segment register.
1833   // load fs:0 -> FS segment register.
1834   //
1835   // This optimization is generally valid because the GNU TLS model defines that
1836   // gs:0 (or fs:0 on X86-64) contains its own address. However, for X86-64 mode
1837   // with 32-bit registers, as we get in ILP32 mode, those registers are first
1838     // zero-extended to 64 bits and then added to the base address, which gives
1839   // unwanted results when the register holds a negative value.
1840   // For more information see http://people.redhat.com/drepper/tls.pdf
1841   if (isNullConstant(Address) && AM.Segment.getNode() == nullptr &&
1842       !IndirectTlsSegRefs &&
1843       (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
1844        Subtarget->isTargetFuchsia())) {
1845     if (Subtarget->isTarget64BitILP32() && !AllowSegmentRegForX32)
1846       return true;
1847     switch (N->getPointerInfo().getAddrSpace()) {
1848     case X86AS::GS:
1849       AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
1850       return false;
1851     case X86AS::FS:
1852       AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
1853       return false;
1854       // Address space X86AS::SS is not handled here, because it is not used to
1855       // address TLS areas.
1856     }
1857   }
1858 
1859   return true;
1860 }
1861 
1862 /// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
1863 /// mode. These wrap things that will resolve down into a symbol reference.
1864 /// If no match is possible, this returns true, otherwise it returns false.
1865 bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
1866   // If the addressing mode already has a symbol as the displacement, we can
1867   // never match another symbol.
1868   if (AM.hasSymbolicDisplacement())
1869     return true;
1870 
1871   bool IsRIPRelTLS = false;
1872   bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
1873   if (IsRIPRel) {
1874     SDValue Val = N.getOperand(0);
1875     if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
1876       IsRIPRelTLS = true;
1877   }
1878 
1879   // We can't use an addressing mode in the 64-bit large code model.
1880   // Global TLS addressing is an exception. In the medium code model,
1881   // we can use such a mode when RIP wrappers are present.
1882   // That signifies access to globals that are known to be "near",
1883   // such as the GOT itself.
1884   CodeModel::Model M = TM.getCodeModel();
1885   if (Subtarget->is64Bit() && M == CodeModel::Large && !IsRIPRelTLS)
1886     return true;
1887 
1888   // Base and index reg must be 0 in order to use %rip as base.
1889   if (IsRIPRel && AM.hasBaseOrIndexReg())
1890     return true;
1891 
1892   // Make a local copy in case we can't do this fold.
1893   X86ISelAddressMode Backup = AM;
1894 
1895   int64_t Offset = 0;
1896   SDValue N0 = N.getOperand(0);
1897   if (auto *G = dyn_cast<GlobalAddressSDNode>(N0)) {
1898     AM.GV = G->getGlobal();
1899     AM.SymbolFlags = G->getTargetFlags();
1900     Offset = G->getOffset();
1901   } else if (auto *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
1902     AM.CP = CP->getConstVal();
1903     AM.Alignment = CP->getAlign();
1904     AM.SymbolFlags = CP->getTargetFlags();
1905     Offset = CP->getOffset();
1906   } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
1907     AM.ES = S->getSymbol();
1908     AM.SymbolFlags = S->getTargetFlags();
1909   } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
1910     AM.MCSym = S->getMCSymbol();
1911   } else if (auto *J = dyn_cast<JumpTableSDNode>(N0)) {
1912     AM.JT = J->getIndex();
1913     AM.SymbolFlags = J->getTargetFlags();
1914   } else if (auto *BA = dyn_cast<BlockAddressSDNode>(N0)) {
1915     AM.BlockAddr = BA->getBlockAddress();
1916     AM.SymbolFlags = BA->getTargetFlags();
1917     Offset = BA->getOffset();
1918   } else
1919     llvm_unreachable("Unhandled symbol reference node.");
1920 
1921   // Can't use an addressing mode with large globals.
1922   if (Subtarget->is64Bit() && !IsRIPRel && AM.GV &&
1923       TM.isLargeGlobalValue(AM.GV)) {
1924     AM = Backup;
1925     return true;
1926   }
1927 
1928   if (foldOffsetIntoAddress(Offset, AM)) {
1929     AM = Backup;
1930     return true;
1931   }
1932 
1933   if (IsRIPRel)
1934     AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
1935 
1936   // Commit the changes now that we know this fold is safe.
1937   return false;
1938 }
1939 
1940 /// Add the specified node to the specified addressing mode, returning true if
1941 /// it cannot be done. This just pattern matches for the addressing mode.
1942 bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
1943   if (matchAddressRecursively(N, AM, 0))
1944     return true;
1945 
1946   // Post-processing: Make a second attempt to fold a load, if we now know
1947   // that there will not be any other register. This is only performed for
1948   // 64-bit ILP32 mode since 32-bit mode and 64-bit LP64 mode will have folded
1949   // any foldable load the first time.
1950   if (Subtarget->isTarget64BitILP32() &&
1951       AM.BaseType == X86ISelAddressMode::RegBase &&
1952       AM.Base_Reg.getNode() != nullptr && AM.IndexReg.getNode() == nullptr) {
1953     SDValue Save_Base_Reg = AM.Base_Reg;
1954     if (auto *LoadN = dyn_cast<LoadSDNode>(Save_Base_Reg)) {
1955       AM.Base_Reg = SDValue();
1956       if (matchLoadInAddress(LoadN, AM, /*AllowSegmentRegForX32=*/true))
1957         AM.Base_Reg = Save_Base_Reg;
1958     }
1959   }
1960 
1961   // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
1962   // a smaller encoding and avoids a scaled-index.
1963   if (AM.Scale == 2 &&
1964       AM.BaseType == X86ISelAddressMode::RegBase &&
1965       AM.Base_Reg.getNode() == nullptr) {
1966     AM.Base_Reg = AM.IndexReg;
1967     AM.Scale = 1;
1968   }
1969 
1970   // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
1971   // because it has a smaller encoding.
1972   if (TM.getCodeModel() != CodeModel::Large &&
1973       (!AM.GV || !TM.isLargeGlobalValue(AM.GV)) && Subtarget->is64Bit() &&
1974       AM.Scale == 1 && AM.BaseType == X86ISelAddressMode::RegBase &&
1975       AM.Base_Reg.getNode() == nullptr && AM.IndexReg.getNode() == nullptr &&
1976       AM.SymbolFlags == X86II::MO_NO_FLAG && AM.hasSymbolicDisplacement()) {
1977     AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
1978   }
1979 
1980   return false;
1981 }
1982 
1983 bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
1984                                unsigned Depth) {
1985   // Add an artificial use to this node so that we can keep track of
1986   // it if it gets CSE'd with a different node.
1987   HandleSDNode Handle(N);
1988 
1989   X86ISelAddressMode Backup = AM;
1990   if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
1991       !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1))
1992     return false;
1993   AM = Backup;
1994 
1995   // Try again after commutating the operands.
1996   if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM,
1997                                Depth + 1) &&
1998       !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth + 1))
1999     return false;
2000   AM = Backup;
2001 
2002   // If we couldn't fold both operands into the address at the same time,
2003   // see if we can just put each operand into a register and fold at least
2004   // the add.
2005   if (AM.BaseType == X86ISelAddressMode::RegBase &&
2006       !AM.Base_Reg.getNode() &&
2007       !AM.IndexReg.getNode()) {
2008     N = Handle.getValue();
2009     AM.Base_Reg = N.getOperand(0);
2010     AM.IndexReg = N.getOperand(1);
2011     AM.Scale = 1;
2012     return false;
2013   }
2014   N = Handle.getValue();
2015   return true;
2016 }
2017 
2018 // Insert a node into the DAG at least before the Pos node's position. This
2019 // will reposition the node as needed, and will assign it a node ID that is <=
2020 // the Pos node's ID. Note that this does *not* preserve the uniqueness of node
2021 // IDs! The selection DAG must no longer depend on their uniqueness when this
2022 // is used.
2023 static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
2024   if (N->getNodeId() == -1 ||
2025       (SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) >
2026        SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode()))) {
2027     DAG.RepositionNode(Pos->getIterator(), N.getNode());
2028     // Mark Node as invalid for pruning, as after this it may be a successor to a
2029     // selected node but otherwise be in the same position as Pos.
2030     // Conservatively mark it with the same -abs(Id) to ensure the node id
2031     // invariant is preserved.
2032     N->setNodeId(Pos->getNodeId());
2033     SelectionDAGISel::InvalidateNodeId(N.getNode());
2034   }
2035 }
2036 
2037 // Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
2038 // safe. This allows us to convert the shift and and into an h-register
2039 // extract and a scaled index. Returns false if the simplification is
2040 // performed.
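     // For example, with C1 == 2: (X >> 6) & 0x3fc becomes ((X >> 8) & 0xff) << 2,
     // i.e. an extract of bits 8..15 of X used as an index with scale 4.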
2041 static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
2042                                       uint64_t Mask,
2043                                       SDValue Shift, SDValue X,
2044                                       X86ISelAddressMode &AM) {
2045   if (Shift.getOpcode() != ISD::SRL ||
2046       !isa<ConstantSDNode>(Shift.getOperand(1)) ||
2047       !Shift.hasOneUse())
2048     return true;
2049 
2050   int ScaleLog = 8 - Shift.getConstantOperandVal(1);
2051   if (ScaleLog <= 0 || ScaleLog >= 4 ||
2052       Mask != (0xffu << ScaleLog))
2053     return true;
2054 
2055   MVT XVT = X.getSimpleValueType();
2056   MVT VT = N.getSimpleValueType();
2057   SDLoc DL(N);
2058   SDValue Eight = DAG.getConstant(8, DL, MVT::i8);
2059   SDValue NewMask = DAG.getConstant(0xff, DL, XVT);
2060   SDValue Srl = DAG.getNode(ISD::SRL, DL, XVT, X, Eight);
2061   SDValue And = DAG.getNode(ISD::AND, DL, XVT, Srl, NewMask);
2062   SDValue Ext = DAG.getZExtOrTrunc(And, DL, VT);
2063   SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8);
2064   SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, Ext, ShlCount);
2065 
2066   // Insert the new nodes into the topological ordering. We must do this in
2067   // a valid topological ordering as nothing is going to go back and re-sort
2068   // these nodes. We continually insert before 'N' in sequence as this is
2069   // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2070   // hierarchy left to express.
2071   insertDAGNode(DAG, N, Eight);
2072   insertDAGNode(DAG, N, NewMask);
2073   insertDAGNode(DAG, N, Srl);
2074   insertDAGNode(DAG, N, And);
2075   insertDAGNode(DAG, N, Ext);
2076   insertDAGNode(DAG, N, ShlCount);
2077   insertDAGNode(DAG, N, Shl);
2078   DAG.ReplaceAllUsesWith(N, Shl);
2079   DAG.RemoveDeadNode(N.getNode());
2080   AM.IndexReg = Ext;
2081   AM.Scale = (1 << ScaleLog);
2082   return false;
2083 }
2084 
2085 // Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
2086 // allows us to fold the shift into this addressing mode. Returns false if the
2087 // transform succeeded.
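     // For example, (X << 2) & 0x3fc becomes (X & 0xff) << 2, so the shift can be
     // matched as an index scale of 4.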
2088 static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
2089                                         X86ISelAddressMode &AM) {
2090   SDValue Shift = N.getOperand(0);
2091 
2092   // Use a signed mask so that shifting right will insert sign bits. These
2093   // bits will be removed when we shift the result left so it doesn't matter
2094   // what we use. This might allow a smaller immediate encoding.
2095   int64_t Mask = cast<ConstantSDNode>(N->getOperand(1))->getSExtValue();
2096 
2097   // If we have an any_extend feeding the AND, look through it to see if there
2098   // is a shift behind it. But only if the AND doesn't use the extended bits.
2099   // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
2100   bool FoundAnyExtend = false;
2101   if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
2102       Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
2103       isUInt<32>(Mask)) {
2104     FoundAnyExtend = true;
2105     Shift = Shift.getOperand(0);
2106   }
2107 
2108   if (Shift.getOpcode() != ISD::SHL ||
2109       !isa<ConstantSDNode>(Shift.getOperand(1)))
2110     return true;
2111 
2112   SDValue X = Shift.getOperand(0);
2113 
2114   // Not likely to be profitable if either the AND or SHIFT node has more
2115   // than one use (unless all uses are for address computation). Besides,
2116   // the isel mechanism requires their node ids to be reused.
2117   if (!N.hasOneUse() || !Shift.hasOneUse())
2118     return true;
2119 
2120   // Verify that the shift amount is something we can fold.
2121   unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2122   if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
2123     return true;
2124 
2125   MVT VT = N.getSimpleValueType();
2126   SDLoc DL(N);
2127   if (FoundAnyExtend) {
2128     SDValue NewX = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X);
2129     insertDAGNode(DAG, N, NewX);
2130     X = NewX;
2131   }
2132 
2133   SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, DL, VT);
2134   SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
2135   SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));
2136 
2137   // Insert the new nodes into the topological ordering. We must do this in
2138   // a valid topological ordering as nothing is going to go back and re-sort
2139   // these nodes. We continually insert before 'N' in sequence as this is
2140   // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2141   // hierarchy left to express.
2142   insertDAGNode(DAG, N, NewMask);
2143   insertDAGNode(DAG, N, NewAnd);
2144   insertDAGNode(DAG, N, NewShift);
2145   DAG.ReplaceAllUsesWith(N, NewShift);
2146   DAG.RemoveDeadNode(N.getNode());
2147 
2148   AM.Scale = 1 << ShiftAmt;
2149   AM.IndexReg = NewAnd;
2150   return false;
2151 }
2152 
2153 // Implement some heroics to detect shifts of masked values where the mask can
2154 // be replaced by extending the shift and undoing that in the addressing mode
2155 // scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
2156 // (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
2157 // the addressing mode. This results in code such as:
2158 //
2159 //   int f(short *y, int *lookup_table) {
2160 //     ...
2161 //     return *y + lookup_table[*y >> 11];
2162 //   }
2163 //
2164 // Turning into:
2165 //   movzwl (%rdi), %eax
2166 //   movl %eax, %ecx
2167 //   shrl $11, %ecx
2168 //   addl (%rsi,%rcx,4), %eax
2169 //
2170 // Instead of:
2171 //   movzwl (%rdi), %eax
2172 //   movl %eax, %ecx
2173 //   shrl $9, %ecx
2174 //   andl $124, %rcx
2175 //   addl (%rsi,%rcx), %eax
2176 //
2177 // Note that this function assumes the mask is provided as a mask *after* the
2178 // value is shifted. The input chain may or may not match that, but computing
2179 // such a mask is trivial.
2180 static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
2181                                     uint64_t Mask,
2182                                     SDValue Shift, SDValue X,
2183                                     X86ISelAddressMode &AM) {
2184   if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
2185       !isa<ConstantSDNode>(Shift.getOperand(1)))
2186     return true;
2187 
2188   // We need to ensure that the mask is a contiguous run of bits.
2189   unsigned MaskIdx, MaskLen;
2190   if (!isShiftedMask_64(Mask, MaskIdx, MaskLen))
2191     return true;
2192   unsigned MaskLZ = 64 - (MaskIdx + MaskLen);
2193 
2194   unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2195 
2196   // The amount of shift we're trying to fit into the addressing mode is taken
2197   // from the shifted mask index (number of trailing zeros of the mask).
2198   unsigned AMShiftAmt = MaskIdx;
2199 
2200   // There is nothing we can do here unless the mask is removing some bits.
2201   // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
2202   if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
2203 
2204   // Scale the leading zero count down based on the actual size of the value.
2205   // Also scale it down based on the size of the shift.
2206   unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
2207   if (MaskLZ < ScaleDown)
2208     return true;
2209   MaskLZ -= ScaleDown;
2210 
2211   // The final check is to ensure that any masked out high bits of X are
2212   // already known to be zero. Otherwise, the mask has a semantic impact
2213   // other than masking out a couple of low bits. Unfortunately, because of
2214   // the mask, zero extensions will be removed from operands in some cases.
2215   // This code works extra hard to look through extensions because we can
2216   // replace them with zero extensions cheaply if necessary.
2217   bool ReplacingAnyExtend = false;
2218   if (X.getOpcode() == ISD::ANY_EXTEND) {
2219     unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
2220                           X.getOperand(0).getSimpleValueType().getSizeInBits();
2221     // Assume that we'll replace the any-extend with a zero-extend, and
2222     // narrow the search to the extended value.
2223     X = X.getOperand(0);
2224     MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
2225     ReplacingAnyExtend = true;
2226   }
2227   APInt MaskedHighBits =
2228     APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
2229   if (!DAG.MaskedValueIsZero(X, MaskedHighBits))
2230     return true;
2231 
2232   // We've identified a pattern that can be transformed into a single shift
2233   // and an addressing mode. Make it so.
2234   MVT VT = N.getSimpleValueType();
2235   if (ReplacingAnyExtend) {
2236     assert(X.getValueType() != VT);
2237     // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
2238     SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);
2239     insertDAGNode(DAG, N, NewX);
2240     X = NewX;
2241   }
2242 
2243   MVT XVT = X.getSimpleValueType();
2244   SDLoc DL(N);
2245   SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
2246   SDValue NewSRL = DAG.getNode(ISD::SRL, DL, XVT, X, NewSRLAmt);
2247   SDValue NewExt = DAG.getZExtOrTrunc(NewSRL, DL, VT);
2248   SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
2249   SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewExt, NewSHLAmt);
2250 
2251   // Insert the new nodes into the topological ordering. We must do this in
2252   // a valid topological ordering as nothing is going to go back and re-sort
2253   // these nodes. We continually insert before 'N' in sequence as this is
2254   // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2255   // hierarchy left to express.
2256   insertDAGNode(DAG, N, NewSRLAmt);
2257   insertDAGNode(DAG, N, NewSRL);
2258   insertDAGNode(DAG, N, NewExt);
2259   insertDAGNode(DAG, N, NewSHLAmt);
2260   insertDAGNode(DAG, N, NewSHL);
2261   DAG.ReplaceAllUsesWith(N, NewSHL);
2262   DAG.RemoveDeadNode(N.getNode());
2263 
2264   AM.Scale = 1 << AMShiftAmt;
2265   AM.IndexReg = NewExt;
2266   return false;
2267 }
2268 
2269 // Transform "(X >> SHIFT) & (MASK << C1)" to
2270 // "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be
2271 // matched to a BEXTR later. Returns false if the simplification is performed.
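     // For example, (X >> 5) & (0xff << 2) becomes ((X >> 7) & 0xff) << 2; the
     // srl+and is a BEXTR candidate and the shl folds into an index scale of 4.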
2272 static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
2273                                    uint64_t Mask,
2274                                    SDValue Shift, SDValue X,
2275                                    X86ISelAddressMode &AM,
2276                                    const X86Subtarget &Subtarget) {
2277   if (Shift.getOpcode() != ISD::SRL ||
2278       !isa<ConstantSDNode>(Shift.getOperand(1)) ||
2279       !Shift.hasOneUse() || !N.hasOneUse())
2280     return true;
2281 
2282   // Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
2283   if (!Subtarget.hasTBM() &&
2284       !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
2285     return true;
2286 
2287   // We need to ensure that the mask is a contiguous run of bits.
2288   unsigned MaskIdx, MaskLen;
2289   if (!isShiftedMask_64(Mask, MaskIdx, MaskLen))
2290     return true;
2291 
2292   unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2293 
2294   // The amount of shift we're trying to fit into the addressing mode is taken
2295   // from the shifted mask index (number of trailing zeros of the mask).
2296   unsigned AMShiftAmt = MaskIdx;
2297 
2298   // There is nothing we can do here unless the mask is removing some bits.
2299   // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
2300   if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
2301 
2302   MVT XVT = X.getSimpleValueType();
2303   MVT VT = N.getSimpleValueType();
2304   SDLoc DL(N);
2305   SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
2306   SDValue NewSRL = DAG.getNode(ISD::SRL, DL, XVT, X, NewSRLAmt);
2307   SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, XVT);
2308   SDValue NewAnd = DAG.getNode(ISD::AND, DL, XVT, NewSRL, NewMask);
2309   SDValue NewExt = DAG.getZExtOrTrunc(NewAnd, DL, VT);
2310   SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
2311   SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewExt, NewSHLAmt);
2312 
2313   // Insert the new nodes into the topological ordering. We must do this in
2314   // a valid topological ordering as nothing is going to go back and re-sort
2315   // these nodes. We continually insert before 'N' in sequence as this is
2316   // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2317   // hierarchy left to express.
2318   insertDAGNode(DAG, N, NewSRLAmt);
2319   insertDAGNode(DAG, N, NewSRL);
2320   insertDAGNode(DAG, N, NewMask);
2321   insertDAGNode(DAG, N, NewAnd);
2322   insertDAGNode(DAG, N, NewExt);
2323   insertDAGNode(DAG, N, NewSHLAmt);
2324   insertDAGNode(DAG, N, NewSHL);
2325   DAG.ReplaceAllUsesWith(N, NewSHL);
2326   DAG.RemoveDeadNode(N.getNode());
2327 
2328   AM.Scale = 1 << AMShiftAmt;
2329   AM.IndexReg = NewExt;
2330   return false;
2331 }
2332 
2333 // Attempt to peek further into a scaled index register, collecting additional
2334 // extensions / offsets / etc. Returns \p N if we can't peek any further.
2335 SDValue X86DAGToDAGISel::matchIndexRecursively(SDValue N,
2336                                                X86ISelAddressMode &AM,
2337                                                unsigned Depth) {
2338   assert(AM.IndexReg.getNode() == nullptr && "IndexReg already matched");
2339   assert((AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8) &&
2340          "Illegal index scale");
2341 
2342   // Limit recursion.
2343   if (Depth >= SelectionDAG::MaxRecursionDepth)
2344     return N;
2345 
2346   EVT VT = N.getValueType();
2347   unsigned Opc = N.getOpcode();
2348 
2349   // index: add(x,c) -> index: x, disp + c
2350   if (CurDAG->isBaseWithConstantOffset(N)) {
2351     auto *AddVal = cast<ConstantSDNode>(N.getOperand(1));
2352     uint64_t Offset = (uint64_t)AddVal->getSExtValue() * AM.Scale;
2353     if (!foldOffsetIntoAddress(Offset, AM))
2354       return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2355   }
2356 
2357   // index: add(x,x) -> index: x, scale * 2
2358   if (Opc == ISD::ADD && N.getOperand(0) == N.getOperand(1)) {
2359     if (AM.Scale <= 4) {
2360       AM.Scale *= 2;
2361       return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2362     }
2363   }
2364 
2365   // index: shl(x,i) -> index: x, scale * (1 << i)
2366   if (Opc == X86ISD::VSHLI) {
2367     uint64_t ShiftAmt = N.getConstantOperandVal(1);
2368     uint64_t ScaleAmt = 1ULL << ShiftAmt;
2369     if ((AM.Scale * ScaleAmt) <= 8) {
2370       AM.Scale *= ScaleAmt;
2371       return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2372     }
2373   }
2374 
2375   // index: sext(add_nsw(x,c)) -> index: sext(x), disp + sext(c)
2376   // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?
2377   if (Opc == ISD::SIGN_EXTEND && !VT.isVector() && N.hasOneUse()) {
2378     SDValue Src = N.getOperand(0);
2379     if (Src.getOpcode() == ISD::ADD && Src->getFlags().hasNoSignedWrap() &&
2380         Src.hasOneUse()) {
2381       if (CurDAG->isBaseWithConstantOffset(Src)) {
2382         SDValue AddSrc = Src.getOperand(0);
2383         auto *AddVal = cast<ConstantSDNode>(Src.getOperand(1));
2384         uint64_t Offset = (uint64_t)AddVal->getSExtValue();
2385         if (!foldOffsetIntoAddress(Offset * AM.Scale, AM)) {
2386           SDLoc DL(N);
2387           SDValue ExtSrc = CurDAG->getNode(Opc, DL, VT, AddSrc);
2388           SDValue ExtVal = CurDAG->getConstant(Offset, DL, VT);
2389           SDValue ExtAdd = CurDAG->getNode(ISD::ADD, DL, VT, ExtSrc, ExtVal);
2390           insertDAGNode(*CurDAG, N, ExtSrc);
2391           insertDAGNode(*CurDAG, N, ExtVal);
2392           insertDAGNode(*CurDAG, N, ExtAdd);
2393           CurDAG->ReplaceAllUsesWith(N, ExtAdd);
2394           CurDAG->RemoveDeadNode(N.getNode());
2395           return ExtSrc;
2396         }
2397       }
2398     }
2399   }
2400 
2401   // index: zext(add_nuw(x,c)) -> index: zext(x), disp + zext(c)
2402   // index: zext(addlike(x,c)) -> index: zext(x), disp + zext(c)
2403 // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt zext?
2404   if (Opc == ISD::ZERO_EXTEND && !VT.isVector() && N.hasOneUse()) {
2405     SDValue Src = N.getOperand(0);
2406     unsigned SrcOpc = Src.getOpcode();
2407     if (((SrcOpc == ISD::ADD && Src->getFlags().hasNoUnsignedWrap()) ||
2408          CurDAG->isADDLike(Src, /*NoWrap=*/true)) &&
2409         Src.hasOneUse()) {
2410       if (CurDAG->isBaseWithConstantOffset(Src)) {
2411         SDValue AddSrc = Src.getOperand(0);
2412         uint64_t Offset = Src.getConstantOperandVal(1);
2413         if (!foldOffsetIntoAddress(Offset * AM.Scale, AM)) {
2414           SDLoc DL(N);
2415           SDValue Res;
2416           // If we're also scaling, see if we can use that as well.
2417           if (AddSrc.getOpcode() == ISD::SHL &&
2418               isa<ConstantSDNode>(AddSrc.getOperand(1))) {
2419             SDValue ShVal = AddSrc.getOperand(0);
2420             uint64_t ShAmt = AddSrc.getConstantOperandVal(1);
2421             APInt HiBits =
2422                 APInt::getHighBitsSet(AddSrc.getScalarValueSizeInBits(), ShAmt);
2423             uint64_t ScaleAmt = 1ULL << ShAmt;
2424             if ((AM.Scale * ScaleAmt) <= 8 &&
2425                 (AddSrc->getFlags().hasNoUnsignedWrap() ||
2426                  CurDAG->MaskedValueIsZero(ShVal, HiBits))) {
2427               AM.Scale *= ScaleAmt;
2428               SDValue ExtShVal = CurDAG->getNode(Opc, DL, VT, ShVal);
2429               SDValue ExtShift = CurDAG->getNode(ISD::SHL, DL, VT, ExtShVal,
2430                                                  AddSrc.getOperand(1));
2431               insertDAGNode(*CurDAG, N, ExtShVal);
2432               insertDAGNode(*CurDAG, N, ExtShift);
2433               AddSrc = ExtShift;
2434               Res = ExtShVal;
2435             }
2436           }
2437           SDValue ExtSrc = CurDAG->getNode(Opc, DL, VT, AddSrc);
2438           SDValue ExtVal = CurDAG->getConstant(Offset, DL, VT);
2439           SDValue ExtAdd = CurDAG->getNode(SrcOpc, DL, VT, ExtSrc, ExtVal);
2440           insertDAGNode(*CurDAG, N, ExtSrc);
2441           insertDAGNode(*CurDAG, N, ExtVal);
2442           insertDAGNode(*CurDAG, N, ExtAdd);
2443           CurDAG->ReplaceAllUsesWith(N, ExtAdd);
2444           CurDAG->RemoveDeadNode(N.getNode());
2445           return Res ? Res : ExtSrc;
2446         }
2447       }
2448     }
2449   }
2450 
2451   // TODO: Handle extensions, shifted masks etc.
2452   return N;
2453 }
2454 
2455 bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
2456                                               unsigned Depth) {
2457   SDLoc dl(N);
2458   LLVM_DEBUG({
2459     dbgs() << "MatchAddress: ";
2460     AM.dump(CurDAG);
2461   });
2462   // Limit recursion.
2463   if (Depth >= SelectionDAG::MaxRecursionDepth)
2464     return matchAddressBase(N, AM);
2465 
2466   // If this is already a %rip relative address, we can only merge immediates
2467   // into it.  Instead of handling this in every case, we handle it here.
2468   // RIP relative addressing: %rip + 32-bit displacement!
2469   if (AM.isRIPRelative()) {
2470     // FIXME: JumpTable and ExternalSymbol address currently don't like
2471     // displacements.  It isn't very important, but this should be fixed for
2472     // consistency.
2473     if (!(AM.ES || AM.MCSym) && AM.JT != -1)
2474       return true;
2475 
2476     if (auto *Cst = dyn_cast<ConstantSDNode>(N))
2477       if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM))
2478         return false;
2479     return true;
2480   }
2481 
2482   switch (N.getOpcode()) {
2483   default: break;
2484   case ISD::LOCAL_RECOVER: {
2485     if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
2486       if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) {
2487         // Use the symbol and don't prefix it.
2488         AM.MCSym = ESNode->getMCSymbol();
2489         return false;
2490       }
2491     break;
2492   }
2493   case ISD::Constant: {
2494     uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
2495     if (!foldOffsetIntoAddress(Val, AM))
2496       return false;
2497     break;
2498   }
2499 
2500   case X86ISD::Wrapper:
2501   case X86ISD::WrapperRIP:
2502     if (!matchWrapper(N, AM))
2503       return false;
2504     break;
2505 
2506   case ISD::LOAD:
2507     if (!matchLoadInAddress(cast<LoadSDNode>(N), AM))
2508       return false;
2509     break;
2510 
2511   case ISD::FrameIndex:
2512     if (AM.BaseType == X86ISelAddressMode::RegBase &&
2513         AM.Base_Reg.getNode() == nullptr &&
2514         (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) {
2515       AM.BaseType = X86ISelAddressMode::FrameIndexBase;
2516       AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
2517       return false;
2518     }
2519     break;
2520 
2521   case ISD::SHL:
2522     if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2523       break;
2524 
2525     if (auto *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
2526       unsigned Val = CN->getZExtValue();
2527       // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
2528       // that the base operand remains free for further matching. If
2529       // the base doesn't end up getting used, a post-processing step
2530       // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
2531       if (Val == 1 || Val == 2 || Val == 3) {
2532         SDValue ShVal = N.getOperand(0);
2533         AM.Scale = 1 << Val;
2534         AM.IndexReg = matchIndexRecursively(ShVal, AM, Depth + 1);
2535         return false;
2536       }
2537     }
2538     break;
2539 
2540   case ISD::SRL: {
2541     // Scale must not be used already.
2542     if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2543 
2544     // We only handle up to 64-bit values here as those are what matter for
2545     // addressing mode optimizations.
2546     assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2547            "Unexpected value size!");
2548 
2549     SDValue And = N.getOperand(0);
2550     if (And.getOpcode() != ISD::AND) break;
2551     SDValue X = And.getOperand(0);
2552 
2553     // The mask used for the transform is expected to be post-shift, but we
2554     // found the shift first so just apply the shift to the mask before passing
2555     // it down.
2556     if (!isa<ConstantSDNode>(N.getOperand(1)) ||
2557         !isa<ConstantSDNode>(And.getOperand(1)))
2558       break;
2559     uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);
2560 
2561     // Try to fold the mask and shift into the scale, and return false if we
2562     // succeed.
2563     if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
2564       return false;
2565     break;
2566   }
2567 
2568   case ISD::SMUL_LOHI:
2569   case ISD::UMUL_LOHI:
2570     // A mul_lohi where we need the low part can be folded as a plain multiply.
2571     if (N.getResNo() != 0) break;
2572     [[fallthrough]];
2573   case ISD::MUL:
2574   case X86ISD::MUL_IMM:
2575     // X*[3,5,9] -> X+X*[2,4,8]
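         // For example, X*9 is matched as base = X, index = X, scale = 8, which a
         // single LEA can encode (lea (%reg,%reg,8)).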
2576     if (AM.BaseType == X86ISelAddressMode::RegBase &&
2577         AM.Base_Reg.getNode() == nullptr &&
2578         AM.IndexReg.getNode() == nullptr) {
2579       if (auto *CN = dyn_cast<ConstantSDNode>(N.getOperand(1)))
2580         if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
2581             CN->getZExtValue() == 9) {
2582           AM.Scale = unsigned(CN->getZExtValue())-1;
2583 
2584           SDValue MulVal = N.getOperand(0);
2585           SDValue Reg;
2586 
2587           // Okay, we know that we have a scale by now.  However, if the scaled
2588           // value is an add of something and a constant, we can fold the
2589           // constant into the disp field here.
2590           if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
2591               isa<ConstantSDNode>(MulVal.getOperand(1))) {
2592             Reg = MulVal.getOperand(0);
2593             auto *AddVal = cast<ConstantSDNode>(MulVal.getOperand(1));
2594             uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
2595             if (foldOffsetIntoAddress(Disp, AM))
2596               Reg = N.getOperand(0);
2597           } else {
2598             Reg = N.getOperand(0);
2599           }
2600 
2601           AM.IndexReg = AM.Base_Reg = Reg;
2602           return false;
2603         }
2604     }
2605     break;
2606 
2607   case ISD::SUB: {
2608     // Given A-B, if A can be completely folded into the address, leaving the
2609     // index field unused, use -B as the index.
2610     // This is a win if A has multiple parts that can be folded into
2611     // the address. Also, this saves a mov if the base register has
2612     // other uses, since it avoids a two-address sub instruction; however,
2613     // it costs an additional mov if the index register has other uses.
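         // For example, for (GV + C) - Y the symbol and constant can fold into the
         // displacement and Y becomes the (negated) index register; the negation
         // itself is emitted later, once the LEA is known to be profitable.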
2614 
2615     // Add an artificial use to this node so that we can keep track of
2616     // it if it gets CSE'd with a different node.
2617     HandleSDNode Handle(N);
2618 
2619     // Test if the LHS of the sub can be folded.
2620     X86ISelAddressMode Backup = AM;
2621     if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) {
2622       N = Handle.getValue();
2623       AM = Backup;
2624       break;
2625     }
2626     N = Handle.getValue();
2627     // Test if the index field is free for use.
2628     if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
2629       AM = Backup;
2630       break;
2631     }
2632 
2633     int Cost = 0;
2634     SDValue RHS = N.getOperand(1);
2635     // If the RHS involves a register with multiple uses, this
2636     // transformation incurs an extra mov, due to the neg instruction
2637     // clobbering its operand.
2638     if (!RHS.getNode()->hasOneUse() ||
2639         RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
2640         RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
2641         RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
2642         (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
2643          RHS.getOperand(0).getValueType() == MVT::i32))
2644       ++Cost;
2645     // If the base is a register with multiple uses, this
2646     // transformation may save a mov.
2647     if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() &&
2648          !AM.Base_Reg.getNode()->hasOneUse()) ||
2649         AM.BaseType == X86ISelAddressMode::FrameIndexBase)
2650       --Cost;
2651     // If the folded LHS was interesting, this transformation saves
2652     // address arithmetic.
2653     if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
2654         ((AM.Disp != 0) && (Backup.Disp == 0)) +
2655         (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
2656       --Cost;
2657     // If it doesn't look like it may be an overall win, don't do it.
2658     if (Cost >= 0) {
2659       AM = Backup;
2660       break;
2661     }
2662 
2663     // Ok, the transformation is legal and appears profitable. Go for it.
2664     // Negation will be emitted later to avoid creating dangling nodes if this
2665     // was an unprofitable LEA.
2666     AM.IndexReg = RHS;
2667     AM.NegateIndex = true;
2668     AM.Scale = 1;
2669     return false;
2670   }
2671 
2672   case ISD::OR:
2673   case ISD::XOR:
2674     // See if we can treat the OR/XOR node as an ADD node.
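         // e.g. (illustrative) (or x, 1) where the low bit of x is known to be
         // zero behaves exactly like (add x, 1), so it can be folded the same way.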
2675     if (!CurDAG->isADDLike(N))
2676       break;
2677     [[fallthrough]];
2678   case ISD::ADD:
2679     if (!matchAdd(N, AM, Depth))
2680       return false;
2681     break;
2682 
2683   case ISD::AND: {
2684     // Perform some heroic transforms on an and of a constant-count shift
2685     // with a constant to enable use of the scaled offset field.
2686 
2687     // Scale must not be used already.
2688     if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2689 
2690     // We only handle up to 64-bit values here as those are what matter for
2691     // addressing mode optimizations.
2692     assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2693            "Unexpected value size!");
2694 
2695     if (!isa<ConstantSDNode>(N.getOperand(1)))
2696       break;
2697 
2698     if (N.getOperand(0).getOpcode() == ISD::SRL) {
2699       SDValue Shift = N.getOperand(0);
2700       SDValue X = Shift.getOperand(0);
2701 
2702       uint64_t Mask = N.getConstantOperandVal(1);
2703 
2704       // Try to fold the mask and shift into an extract and scale.
2705       if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
2706         return false;
2707 
2708       // Try to fold the mask and shift directly into the scale.
2709       if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
2710         return false;
2711 
2712       // Try to fold the mask and shift into BEXTR and scale.
2713       if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget))
2714         return false;
2715     }
2716 
2717     // Try to swap the mask and shift to place shifts which can be done as
2718     // a scale on the outside of the mask.
2719     if (!foldMaskedShiftToScaledMask(*CurDAG, N, AM))
2720       return false;
2721 
2722     break;
2723   }
2724   case ISD::ZERO_EXTEND: {
2725     // Try to widen a zexted shift left to the same size as its use, so we can
2726     // match the shift as a scale factor.
2727     if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2728       break;
2729 
2730     SDValue Src = N.getOperand(0);
2731 
2732     // See if we can match a zext(addlike(x,c)).
2733     // TODO: Move more ZERO_EXTEND patterns into matchIndexRecursively.
2734     if (Src.getOpcode() == ISD::ADD || Src.getOpcode() == ISD::OR)
2735       if (SDValue Index = matchIndexRecursively(N, AM, Depth + 1))
2736         if (Index != N) {
2737           AM.IndexReg = Index;
2738           return false;
2739         }
2740 
2741     // Peek through mask: zext(and(shl(x,c1),c2))
2742     APInt Mask = APInt::getAllOnes(Src.getScalarValueSizeInBits());
2743     if (Src.getOpcode() == ISD::AND && Src.hasOneUse())
2744       if (auto *MaskC = dyn_cast<ConstantSDNode>(Src.getOperand(1))) {
2745         Mask = MaskC->getAPIntValue();
2746         Src = Src.getOperand(0);
2747       }
2748 
2749     if (Src.getOpcode() == ISD::SHL && Src.hasOneUse() && N->hasOneUse()) {
2750       // Give up if the shift amount is not a valid scale shift (1, 2 or 3,
           // i.e. scale 2, 4 or 8).
2751       SDValue ShlSrc = Src.getOperand(0);
2752       SDValue ShlAmt = Src.getOperand(1);
2753       auto *ShAmtC = dyn_cast<ConstantSDNode>(ShlAmt);
2754       if (!ShAmtC)
2755         break;
2756       unsigned ShAmtV = ShAmtC->getZExtValue();
2757       if (ShAmtV > 3)
2758         break;
2759 
2760       // The narrow shift must only shift out zero bits (it must be 'nuw').
2761       // That makes it safe to widen to the destination type.
2762       APInt HighZeros =
2763           APInt::getHighBitsSet(ShlSrc.getValueSizeInBits(), ShAmtV);
2764       if (!Src->getFlags().hasNoUnsignedWrap() &&
2765           !CurDAG->MaskedValueIsZero(ShlSrc, HighZeros & Mask))
2766         break;
2767 
2768       // zext (shl nuw i8 %x, C1) to i32
2769       // --> shl (zext i8 %x to i32), (zext C1)
2770       // zext (and (shl nuw i8 %x, C1), C2) to i32
2771       // --> shl (zext i8 (and %x, C2 >> C1) to i32), (zext C1)
2772       MVT SrcVT = ShlSrc.getSimpleValueType();
2773       MVT VT = N.getSimpleValueType();
2774       SDLoc DL(N);
2775 
2776       SDValue Res = ShlSrc;
2777       if (!Mask.isAllOnes()) {
2778         Res = CurDAG->getConstant(Mask.lshr(ShAmtV), DL, SrcVT);
2779         insertDAGNode(*CurDAG, N, Res);
2780         Res = CurDAG->getNode(ISD::AND, DL, SrcVT, ShlSrc, Res);
2781         insertDAGNode(*CurDAG, N, Res);
2782       }
2783       SDValue Zext = CurDAG->getNode(ISD::ZERO_EXTEND, DL, VT, Res);
2784       insertDAGNode(*CurDAG, N, Zext);
2785       SDValue NewShl = CurDAG->getNode(ISD::SHL, DL, VT, Zext, ShlAmt);
2786       insertDAGNode(*CurDAG, N, NewShl);
2787       CurDAG->ReplaceAllUsesWith(N, NewShl);
2788       CurDAG->RemoveDeadNode(N.getNode());
2789 
2790       // Convert the shift to scale factor.
2791       AM.Scale = 1 << ShAmtV;
2792       // matchIndexRecursively must be called here: otherwise Zext may be
2793       // replaced by other nodes during matching but still be referenced later
2794       // when a builder method is called.
2795       AM.IndexReg = matchIndexRecursively(Zext, AM, Depth + 1);
2796       return false;
2797     }
2798 
2799     if (Src.getOpcode() == ISD::SRL && !Mask.isAllOnes()) {
2800       // Try to fold the mask and shift into an extract and scale.
2801       if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask.getZExtValue(), Src,
2802                                      Src.getOperand(0), AM))
2803         return false;
2804 
2805       // Try to fold the mask and shift directly into the scale.
2806       if (!foldMaskAndShiftToScale(*CurDAG, N, Mask.getZExtValue(), Src,
2807                                    Src.getOperand(0), AM))
2808         return false;
2809 
2810       // Try to fold the mask and shift into BEXTR and scale.
2811       if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask.getZExtValue(), Src,
2812                                   Src.getOperand(0), AM, *Subtarget))
2813         return false;
2814     }
2815 
2816     break;
2817   }
2818   }
2819 
2820   return matchAddressBase(N, AM);
2821 }
2822 
2823 /// Helper for MatchAddress. Add the specified node to the
2824 /// specified addressing mode without any further recursion.
2825 bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
2826   // Is the base register already occupied?
2827   if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
2828     // If so, check to see if the scale index register is set.
2829     if (!AM.IndexReg.getNode()) {
2830       AM.IndexReg = N;
2831       AM.Scale = 1;
2832       return false;
2833     }
2834 
2835     // Otherwise, we cannot select it.
2836     return true;
2837   }
2838 
2839   // Default, generate it as a register.
2840   AM.BaseType = X86ISelAddressMode::RegBase;
2841   AM.Base_Reg = N;
2842   return false;
2843 }
2844 
2845 bool X86DAGToDAGISel::matchVectorAddressRecursively(SDValue N,
2846                                                     X86ISelAddressMode &AM,
2847                                                     unsigned Depth) {
2848   SDLoc dl(N);
2849   LLVM_DEBUG({
2850     dbgs() << "MatchVectorAddress: ";
2851     AM.dump(CurDAG);
2852   });
2853   // Limit recursion.
2854   if (Depth >= SelectionDAG::MaxRecursionDepth)
2855     return matchAddressBase(N, AM);
2856 
2857   // TODO: Support other operations.
2858   switch (N.getOpcode()) {
2859   case ISD::Constant: {
2860     uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
2861     if (!foldOffsetIntoAddress(Val, AM))
2862       return false;
2863     break;
2864   }
2865   case X86ISD::Wrapper:
2866     if (!matchWrapper(N, AM))
2867       return false;
2868     break;
2869   case ISD::ADD: {
2870     // Add an artificial use to this node so that we can keep track of
2871     // it if it gets CSE'd with a different node.
2872     HandleSDNode Handle(N);
2873 
2874     X86ISelAddressMode Backup = AM;
2875     if (!matchVectorAddressRecursively(N.getOperand(0), AM, Depth + 1) &&
2876         !matchVectorAddressRecursively(Handle.getValue().getOperand(1), AM,
2877                                        Depth + 1))
2878       return false;
2879     AM = Backup;
2880 
2881     // Try again after commuting the operands.
2882     if (!matchVectorAddressRecursively(Handle.getValue().getOperand(1), AM,
2883                                        Depth + 1) &&
2884         !matchVectorAddressRecursively(Handle.getValue().getOperand(0), AM,
2885                                        Depth + 1))
2886       return false;
2887     AM = Backup;
2888 
2889     N = Handle.getValue();
2890     break;
2891   }
2892   }
2893 
2894   return matchAddressBase(N, AM);
2895 }
2896 
2897 /// Helper for selectVectorAddr. Handles things that can be folded into a
2898 /// gather/scatter address. The index register and scale should have already
2899 /// been handled.
2900 bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
2901   return matchVectorAddressRecursively(N, AM, 0);
2902 }
2903 
2904 bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr,
2905                                        SDValue IndexOp, SDValue ScaleOp,
2906                                        SDValue &Base, SDValue &Scale,
2907                                        SDValue &Index, SDValue &Disp,
2908                                        SDValue &Segment) {
2909   X86ISelAddressMode AM;
2910   AM.Scale = ScaleOp->getAsZExtVal();
2911 
2912   // Attempt to match index patterns, as long as we're not relying on implicit
2913   // sign-extension, which is performed BEFORE scale.
2914   if (IndexOp.getScalarValueSizeInBits() == BasePtr.getScalarValueSizeInBits())
2915     AM.IndexReg = matchIndexRecursively(IndexOp, AM, 0);
2916   else
2917     AM.IndexReg = IndexOp;
2918 
2919   unsigned AddrSpace = Parent->getPointerInfo().getAddrSpace();
2920   if (AddrSpace == X86AS::GS)
2921     AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
2922   if (AddrSpace == X86AS::FS)
2923     AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
2924   if (AddrSpace == X86AS::SS)
2925     AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
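       // (For reference: LLVM address spaces 256, 257 and 258 map to the GS, FS
       // and SS segments on x86.)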
2926 
2927   SDLoc DL(BasePtr);
2928   MVT VT = BasePtr.getSimpleValueType();
2929 
2930   // Try to match into the base and displacement fields.
2931   if (matchVectorAddress(BasePtr, AM))
2932     return false;
2933 
2934   getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2935   return true;
2936 }
2937 
2938 /// Returns true if it is able to pattern match an addressing mode.
2939 /// It returns the operands which make up the maximal addressing mode it can
2940 /// match by reference.
2941 ///
2942 /// Parent is the parent node of the addr operand that is being matched.  It
2943 /// is always a load, store, atomic node, or null.  It is only null when
2944 /// checking memory operands for inline asm nodes.
2945 bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
2946                                  SDValue &Scale, SDValue &Index,
2947                                  SDValue &Disp, SDValue &Segment) {
2948   X86ISelAddressMode AM;
2949 
2950   if (Parent &&
2951       // These opcodes all have an "addr:$ptr" operand but are not MemSDNodes,
2952       // and thus don't have proper addrspace info.
2953       Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
2954       Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
2955       Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
2956       Parent->getOpcode() != X86ISD::ENQCMD && // Fixme
2957       Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme
2958       Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
2959       Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
2960     unsigned AddrSpace =
2961       cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
2962     if (AddrSpace == X86AS::GS)
2963       AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
2964     if (AddrSpace == X86AS::FS)
2965       AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
2966     if (AddrSpace == X86AS::SS)
2967       AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
2968   }
2969 
2970   // Save the DL and VT before calling matchAddress, it can invalidate N.
2971   SDLoc DL(N);
2972   MVT VT = N.getSimpleValueType();
2973 
2974   if (matchAddress(N, AM))
2975     return false;
2976 
2977   getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2978   return true;
2979 }
2980 
2981 bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
2982   // Cannot use 32 bit constants to reference objects in kernel/large code
2983   // model.
2984   if (TM.getCodeModel() == CodeModel::Kernel ||
2985       TM.getCodeModel() == CodeModel::Large)
2986     return false;
2987 
2988   // In static codegen with small code model, we can get the address of a label
2989   // into a register with 'movl'
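       // e.g. (illustrative) movl $sym, %eax (a 5-byte instruction) instead of
       // movabsq $sym, %rax (10 bytes) when the address is known to fit in 32 bits.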
2990   if (N->getOpcode() != X86ISD::Wrapper)
2991     return false;
2992 
2993   N = N.getOperand(0);
2994 
2995   // At least GNU as does not accept 'movl' for TPOFF relocations.
2996   // FIXME: We could use 'movl' when we know we are targeting MC.
2997   if (N->getOpcode() == ISD::TargetGlobalTLSAddress)
2998     return false;
2999 
3000   Imm = N;
3001   // Small/medium code model can reference non-TargetGlobalAddress objects with
3002   // 32 bit constants.
3003   if (N->getOpcode() != ISD::TargetGlobalAddress) {
3004     return TM.getCodeModel() == CodeModel::Small ||
3005            TM.getCodeModel() == CodeModel::Medium;
3006   }
3007 
3008   const GlobalValue *GV = cast<GlobalAddressSDNode>(N)->getGlobal();
3009   if (std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange())
3010     return CR->getUnsignedMax().ult(1ull << 32);
3011 
3012   return !TM.isLargeGlobalValue(GV);
3013 }
3014 
3015 bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base,
3016                                          SDValue &Scale, SDValue &Index,
3017                                          SDValue &Disp, SDValue &Segment) {
3018   // Save the debug loc before calling selectLEAAddr, in case it invalidates N.
3019   SDLoc DL(N);
3020 
3021   if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
3022     return false;
3023 
3024   auto *RN = dyn_cast<RegisterSDNode>(Base);
3025   if (RN && RN->getReg() == 0)
3026     Base = CurDAG->getRegister(0, MVT::i64);
3027   else if (Base.getValueType() == MVT::i32 && !isa<FrameIndexSDNode>(Base)) {
3028     // Base could already be %rip, particularly in the x32 ABI.
3029     SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
3030                                                      MVT::i64), 0);
3031     Base = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
3032                                          Base);
3033   }
3034 
3035   RN = dyn_cast<RegisterSDNode>(Index);
3036   if (RN && RN->getReg() == 0)
3037     Index = CurDAG->getRegister(0, MVT::i64);
3038   else {
3039     assert(Index.getValueType() == MVT::i32 &&
3040            "Expect to be extending 32-bit registers for use in LEA");
3041     SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
3042                                                      MVT::i64), 0);
3043     Index = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
3044                                           Index);
3045   }
3046 
3047   return true;
3048 }
3049 
3050 /// Calls SelectAddr and determines if the maximal addressing
3051 /// mode it matches can be cost effectively emitted as an LEA instruction.
3052 bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
3053                                     SDValue &Base, SDValue &Scale,
3054                                     SDValue &Index, SDValue &Disp,
3055                                     SDValue &Segment) {
3056   X86ISelAddressMode AM;
3057 
3058   // Save the DL and VT before calling matchAddress, it can invalidate N.
3059   SDLoc DL(N);
3060   MVT VT = N.getSimpleValueType();
3061 
3062   // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
3063   // segments.
3064   SDValue Copy = AM.Segment;
3065   SDValue T = CurDAG->getRegister(0, MVT::i32);
3066   AM.Segment = T;
3067   if (matchAddress(N, AM))
3068     return false;
3069   assert(T == AM.Segment);
3070   AM.Segment = Copy;
3071 
3072   unsigned Complexity = 0;
3073   if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode())
3074     Complexity = 1;
3075   else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
3076     Complexity = 4;
3077 
3078   if (AM.IndexReg.getNode())
3079     Complexity++;
3080 
3081   // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg or a
3082   // simple shift.
3083   if (AM.Scale > 1)
3084     Complexity++;
3085 
3086   // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
3087   // to a LEA. This is determined with some experimentation but is by no means
3088   // optimal (especially for code size consideration). LEA is nice because of
3089   // its three-address nature. Tweak the cost function again when we can run
3090   // convertToThreeAddress() at register allocation time.
3091   if (AM.hasSymbolicDisplacement()) {
3092     // For X86-64, always use LEA to materialize RIP-relative addresses.
3093     if (Subtarget->is64Bit())
3094       Complexity = 4;
3095     else
3096       Complexity += 2;
3097   }
3098 
3099   // Heuristic: try harder to form an LEA from ADD if the operands set flags.
3100   // Unlike ADD, LEA does not affect flags, so we will be less likely to require
3101   // duplicating flag-producing instructions later in the pipeline.
3102   if (N.getOpcode() == ISD::ADD) {
3103     auto isMathWithFlags = [](SDValue V) {
3104       switch (V.getOpcode()) {
3105       case X86ISD::ADD:
3106       case X86ISD::SUB:
3107       case X86ISD::ADC:
3108       case X86ISD::SBB:
3109       case X86ISD::SMUL:
3110       case X86ISD::UMUL:
3111       /* TODO: These opcodes can be added safely, but we may want to justify
3112                their inclusion for different reasons (better for reg-alloc).
3113       case X86ISD::OR:
3114       case X86ISD::XOR:
3115       case X86ISD::AND:
3116       */
3117         // Value 1 is the flag output of the node - verify it's not dead.
3118         return !SDValue(V.getNode(), 1).use_empty();
3119       default:
3120         return false;
3121       }
3122     };
3123     // TODO: We might want to factor in whether there's a load folding
3124     // opportunity for the math op that disappears with LEA.
3125     if (isMathWithFlags(N.getOperand(0)) || isMathWithFlags(N.getOperand(1)))
3126       Complexity++;
3127   }
3128 
3129   if (AM.Disp)
3130     Complexity++;
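
       // Illustrative example (not from the original comments):
       // leal 4(%rax,%rcx,2), %edx scores base(1) + index(1) + scale>1(1) +
       // disp(1) = 4, which passes the profitability check below.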
3131 
3132   // If it isn't worth using an LEA, reject it.
3133   if (Complexity <= 2)
3134     return false;
3135 
3136   getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
3137   return true;
3138 }
3139 
3140 /// This is only run on TargetGlobalTLSAddress or TargetExternalSymbol nodes.
3141 bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
3142                                         SDValue &Scale, SDValue &Index,
3143                                         SDValue &Disp, SDValue &Segment) {
3144   assert(N.getOpcode() == ISD::TargetGlobalTLSAddress ||
3145          N.getOpcode() == ISD::TargetExternalSymbol);
3146 
3147   X86ISelAddressMode AM;
3148   if (auto *GA = dyn_cast<GlobalAddressSDNode>(N)) {
3149     AM.GV = GA->getGlobal();
3150     AM.Disp += GA->getOffset();
3151     AM.SymbolFlags = GA->getTargetFlags();
3152   } else {
3153     auto *SA = cast<ExternalSymbolSDNode>(N);
3154     AM.ES = SA->getSymbol();
3155     AM.SymbolFlags = SA->getTargetFlags();
3156   }
3157 
3158   if (Subtarget->is32Bit()) {
3159     AM.Scale = 1;
3160     AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
3161   }
3162 
3163   MVT VT = N.getSimpleValueType();
3164   getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment);
3165   return true;
3166 }
3167 
3168 bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
3169   // Keep track of the original value type and whether this value was
3170   // truncated. If we see a truncation from pointer type to VT that truncates
3171   // bits that are known to be zero, we can use a narrow reference.
3172   EVT VT = N.getValueType();
3173   bool WasTruncated = false;
3174   if (N.getOpcode() == ISD::TRUNCATE) {
3175     WasTruncated = true;
3176     N = N.getOperand(0);
3177   }
3178 
3179   if (N.getOpcode() != X86ISD::Wrapper)
3180     return false;
3181 
3182   // We can only use non-GlobalValues as immediates if they were not truncated,
3183   // as we do not have any range information. If we have a GlobalValue and the
3184   // address was not truncated, we can select it as an operand directly.
3185   unsigned Opc = N.getOperand(0)->getOpcode();
3186   if (Opc != ISD::TargetGlobalAddress || !WasTruncated) {
3187     Op = N.getOperand(0);
3188     // We can only select the operand directly if we didn't have to look past a
3189     // truncate.
3190     return !WasTruncated;
3191   }
3192 
3193   // Check that the global's range fits into VT.
3194   auto *GA = cast<GlobalAddressSDNode>(N.getOperand(0));
3195   std::optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
3196   if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits()))
3197     return false;
3198 
3199   // Okay, we can use a narrow reference.
3200   Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT,
3201                                       GA->getOffset(), GA->getTargetFlags());
3202   return true;
3203 }
3204 
3205 bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
3206                                   SDValue &Base, SDValue &Scale,
3207                                   SDValue &Index, SDValue &Disp,
3208                                   SDValue &Segment) {
3209   assert(Root && P && "Unknown root/parent nodes");
3210   if (!ISD::isNON_EXTLoad(N.getNode()) ||
3211       !IsProfitableToFold(N, P, Root) ||
3212       !IsLegalToFold(N, P, Root, OptLevel))
3213     return false;
3214 
3215   return selectAddr(N.getNode(),
3216                     N.getOperand(1), Base, Scale, Index, Disp, Segment);
3217 }
3218 
3219 bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
3220                                        SDValue &Base, SDValue &Scale,
3221                                        SDValue &Index, SDValue &Disp,
3222                                        SDValue &Segment) {
3223   assert(Root && P && "Unknown root/parent nodes");
3224   if (N->getOpcode() != X86ISD::VBROADCAST_LOAD ||
3225       !IsProfitableToFold(N, P, Root) ||
3226       !IsLegalToFold(N, P, Root, OptLevel))
3227     return false;
3228 
3229   return selectAddr(N.getNode(),
3230                     N.getOperand(1), Base, Scale, Index, Disp, Segment);
3231 }
3232 
3233 /// Return an SDNode that returns the value of the global base register.
3234 /// Output instructions required to initialize the global base register,
3235 /// if necessary.
3236 SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
3237   unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
3238   auto &DL = MF->getDataLayout();
3239   return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode();
3240 }
3241 
3242 bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
3243   if (N->getOpcode() == ISD::TRUNCATE)
3244     N = N->getOperand(0).getNode();
3245   if (N->getOpcode() != X86ISD::Wrapper)
3246     return false;
3247 
3248   auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0));
3249   if (!GA)
3250     return false;
3251 
3252   auto *GV = GA->getGlobal();
3253   std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange();
3254   if (CR)
3255     return CR->getSignedMin().sge(-1ull << Width) &&
3256            CR->getSignedMax().slt(1ull << Width);
3257   // In the kernel code model, globals are in the negative 2GB of the address
3258   // space, so globals can be a sign extended 32-bit immediate.
3259   // In other code models, small globals are in the low 2GB of the address
3260   // space, so sign extending them is equivalent to zero extending them.
3261   return Width == 32 && !TM.isLargeGlobalValue(GV);
3262 }
3263 
3264 X86::CondCode X86DAGToDAGISel::getCondFromNode(SDNode *N) const {
3265   assert(N->isMachineOpcode() && "Unexpected node");
3266   unsigned Opc = N->getMachineOpcode();
3267   const MCInstrDesc &MCID = getInstrInfo()->get(Opc);
3268   int CondNo = X86::getCondSrcNoFromDesc(MCID);
3269   if (CondNo < 0)
3270     return X86::COND_INVALID;
3271 
3272   return static_cast<X86::CondCode>(N->getConstantOperandVal(CondNo));
3273 }
3274 
3275 /// Return true if every user of the given X86ISD::CMP node's flag result only
3276 /// tests ZF; return false if any user needs another flag.
3277 bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const {
3278   // Examine each user of the node.
3279   for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
3280          UI != UE; ++UI) {
3281     // Only check things that use the flags.
3282     if (UI.getUse().getResNo() != Flags.getResNo())
3283       continue;
3284     // Only examine CopyToReg uses that copy to EFLAGS.
3285     if (UI->getOpcode() != ISD::CopyToReg ||
3286         cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
3287       return false;
3288     // Examine each user of the CopyToReg use.
3289     for (SDNode::use_iterator FlagUI = UI->use_begin(),
3290            FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
3291       // Only examine the Flag result.
3292       if (FlagUI.getUse().getResNo() != 1) continue;
3293       // Anything unusual: assume conservatively.
3294       if (!FlagUI->isMachineOpcode()) return false;
3295       // Examine the condition code of the user.
3296       X86::CondCode CC = getCondFromNode(*FlagUI);
3297 
3298       switch (CC) {
3299       // Comparisons which only use the zero flag.
3300       case X86::COND_E: case X86::COND_NE:
3301         continue;
3302       // Anything else: assume conservatively.
3303       default:
3304         return false;
3305       }
3306     }
3307   }
3308   return true;
3309 }
3310 
3311 /// Return true if no user of the given X86ISD::CMP node's flag result requires
3312 /// the SF flag to be accurate.
3313 bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const {
3314   // Examine each user of the node.
3315   for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
3316          UI != UE; ++UI) {
3317     // Only check things that use the flags.
3318     if (UI.getUse().getResNo() != Flags.getResNo())
3319       continue;
3320     // Only examine CopyToReg uses that copy to EFLAGS.
3321     if (UI->getOpcode() != ISD::CopyToReg ||
3322         cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
3323       return false;
3324     // Examine each user of the CopyToReg use.
3325     for (SDNode::use_iterator FlagUI = UI->use_begin(),
3326            FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
3327       // Only examine the Flag result.
3328       if (FlagUI.getUse().getResNo() != 1) continue;
3329       // Anything unusual: assume conservatively.
3330       if (!FlagUI->isMachineOpcode()) return false;
3331       // Examine the condition code of the user.
3332       X86::CondCode CC = getCondFromNode(*FlagUI);
3333 
3334       switch (CC) {
3335       // Comparisons which don't examine the SF flag.
3336       case X86::COND_A: case X86::COND_AE:
3337       case X86::COND_B: case X86::COND_BE:
3338       case X86::COND_E: case X86::COND_NE:
3339       case X86::COND_O: case X86::COND_NO:
3340       case X86::COND_P: case X86::COND_NP:
3341         continue;
3342       // Anything else: assume conservatively.
3343       default:
3344         return false;
3345       }
3346     }
3347   }
3348   return true;
3349 }
3350 
3351 static bool mayUseCarryFlag(X86::CondCode CC) {
3352   switch (CC) {
3353   // Comparisons which don't examine the CF flag.
3354   case X86::COND_O: case X86::COND_NO:
3355   case X86::COND_E: case X86::COND_NE:
3356   case X86::COND_S: case X86::COND_NS:
3357   case X86::COND_P: case X86::COND_NP:
3358   case X86::COND_L: case X86::COND_GE:
3359   case X86::COND_G: case X86::COND_LE:
3360     return false;
3361   // Anything else: assume conservatively.
3362   default:
3363     return true;
3364   }
3365 }
3366 
3367 /// Return true if no user of the given flag-setting node requires the CF flag
3368 /// to be accurate.
3369 bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const {
3370   // Examine each user of the node.
3371   for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
3372          UI != UE; ++UI) {
3373     // Only check things that use the flags.
3374     if (UI.getUse().getResNo() != Flags.getResNo())
3375       continue;
3376 
3377     unsigned UIOpc = UI->getOpcode();
3378 
3379     if (UIOpc == ISD::CopyToReg) {
3380       // Only examine CopyToReg uses that copy to EFLAGS.
3381       if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
3382         return false;
3383       // Examine each user of the CopyToReg use.
3384       for (SDNode::use_iterator FlagUI = UI->use_begin(), FlagUE = UI->use_end();
3385            FlagUI != FlagUE; ++FlagUI) {
3386         // Only examine the Flag result.
3387         if (FlagUI.getUse().getResNo() != 1)
3388           continue;
3389         // Anything unusual: assume conservatively.
3390         if (!FlagUI->isMachineOpcode())
3391           return false;
3392         // Examine the condition code of the user.
3393         X86::CondCode CC = getCondFromNode(*FlagUI);
3394 
3395         if (mayUseCarryFlag(CC))
3396           return false;
3397       }
3398 
3399       // This CopyToReg is ok. Move on to the next user.
3400       continue;
3401     }
3402 
3403     // This might be an unselected node. So look for the pre-isel opcodes that
3404     // use flags.
3405     unsigned CCOpNo;
3406     switch (UIOpc) {
3407     default:
3408       // Something unusual. Be conservative.
3409       return false;
3410     case X86ISD::SETCC:       CCOpNo = 0; break;
3411     case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
3412     case X86ISD::CMOV:        CCOpNo = 2; break;
3413     case X86ISD::BRCOND:      CCOpNo = 2; break;
3414     }
3415 
3416     X86::CondCode CC = (X86::CondCode)UI->getConstantOperandVal(CCOpNo);
3417     if (mayUseCarryFlag(CC))
3418       return false;
3419   }
3420   return true;
3421 }
3422 
3423 /// Check whether or not the chain ending in StoreNode is suitable for doing
3424 /// the {load; op; store} to modify transformation.
3425 static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
3426                                         SDValue StoredVal, SelectionDAG *CurDAG,
3427                                         unsigned LoadOpNo,
3428                                         LoadSDNode *&LoadNode,
3429                                         SDValue &InputChain) {
3430   // Is the stored value result 0 of the operation?
3431   if (StoredVal.getResNo() != 0) return false;
3432 
3433   // Are there other uses of the operation other than the store?
3434   if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false;
3435 
3436   // Is the store non-extending and non-indexed?
3437   if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
3438     return false;
3439 
3440   SDValue Load = StoredVal->getOperand(LoadOpNo);
3441   // Is the stored value a non-extending and non-indexed load?
3442   if (!ISD::isNormalLoad(Load.getNode())) return false;
3443 
3444   // Return LoadNode by reference.
3445   LoadNode = cast<LoadSDNode>(Load);
3446 
3447   // Is store the only read of the loaded value?
3448   if (!Load.hasOneUse())
3449     return false;
3450 
3451   // Is the address of the store the same as the load?
3452   if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
3453       LoadNode->getOffset() != StoreNode->getOffset())
3454     return false;
3455 
3456   bool FoundLoad = false;
3457   SmallVector<SDValue, 4> ChainOps;
3458   SmallVector<const SDNode *, 4> LoopWorklist;
3459   SmallPtrSet<const SDNode *, 16> Visited;
3460   const unsigned int Max = 1024;
3461 
3462   //  Visualization of Load-Op-Store fusion:
3463   // -------------------------
3464   // Legend:
3465   //    *-lines = Chain operand dependencies.
3466   //    |-lines = Normal operand dependencies.
3467   //    Dependencies flow down and right. n-suffix references multiple nodes.
3468   //
3469   //        C                        Xn  C
3470   //        *                         *  *
3471   //        *                          * *
3472   //  Xn  A-LD    Yn                    TF         Yn
3473   //   *    * \   |                       *        |
3474   //    *   *  \  |                        *       |
3475   //     *  *   \ |             =>       A--LD_OP_ST
3476   //      * *    \|                                 \
3477   //       TF    OP                                  \
3478   //         *   | \                                  Zn
3479   //          *  |  \
3480   //         A-ST    Zn
3481   //
3482 
3483   // This merge induced dependences from: #1: Xn -> LD, OP, Zn
3484   //                                      #2: Yn -> LD
3485   //                                      #3: ST -> Zn
3486 
3487   // Ensure the transform is safe by checking for the dual
3488   // dependencies to make sure we do not induce a loop.
3489 
3490   // As LD is a predecessor to both OP and ST we can do this by checking:
3491   //  a). if LD is a predecessor to a member of Xn or Yn.
3492   //  b). if a Zn is a predecessor to ST.
3493 
3494   // However, (b) can only occur through being a chain predecessor to
3495   // ST, which is the same as Zn being a member or predecessor of Xn,
3496   // which is a subset of LD being a predecessor of Xn. So it's
3497   // subsumed by check (a).
3498 
3499   SDValue Chain = StoreNode->getChain();
3500 
3501   // Gather X elements in ChainOps.
3502   if (Chain == Load.getValue(1)) {
3503     FoundLoad = true;
3504     ChainOps.push_back(Load.getOperand(0));
3505   } else if (Chain.getOpcode() == ISD::TokenFactor) {
3506     for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
3507       SDValue Op = Chain.getOperand(i);
3508       if (Op == Load.getValue(1)) {
3509         FoundLoad = true;
3510         // Drop Load, but keep its chain. No cycle check necessary.
3511         ChainOps.push_back(Load.getOperand(0));
3512         continue;
3513       }
3514       LoopWorklist.push_back(Op.getNode());
3515       ChainOps.push_back(Op);
3516     }
3517   }
3518 
3519   if (!FoundLoad)
3520     return false;
3521 
3522   // Worklist is currently Xn. Add Yn to worklist.
3523   for (SDValue Op : StoredVal->ops())
3524     if (Op.getNode() != LoadNode)
3525       LoopWorklist.push_back(Op.getNode());
3526 
3527   // Check (a) if Load is a predecessor to Xn + Yn
3528   if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
3529                                    true))
3530     return false;
3531 
3532   InputChain =
3533       CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
3534   return true;
3535 }
3536 
3537 // Change a chain of {load; op; store} of the same value into a simple op
3538 // through memory of that value, if the uses of the modified value and its
3539 // address are suitable.
3540 //
3541 // The tablegen memory-operand pattern is currently not able to match the case
3542 // where the EFLAGS of the original operation are used.
3543 //
3544 // To move this to tablegen, we'll need to improve tablegen to allow flags to
3545 // be transferred from a node in the pattern to the result node, probably with
3546 // a new keyword. For example, we have this
3547 // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
3548 //  [(store (add (loadi64 addr:$dst), -1), addr:$dst),
3549 //   (implicit EFLAGS)]>;
3550 // but maybe need something like this
3551 // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
3552 //  [(store (add (loadi64 addr:$dst), -1), addr:$dst),
3553 //   (transferrable EFLAGS)]>;
3554 //
3555 // Until then, we manually fold these and instruction select the operation
3556 // here.
3557 bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
3558   auto *StoreNode = cast<StoreSDNode>(Node);
3559   SDValue StoredVal = StoreNode->getOperand(1);
3560   unsigned Opc = StoredVal->getOpcode();
3561 
3562   // Before we try to select anything, make sure this is a memory operand size
3563   // and opcode we can handle. Note that this must match the code below that
3564   // actually lowers the opcodes.
3565   EVT MemVT = StoreNode->getMemoryVT();
3566   if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 &&
3567       MemVT != MVT::i8)
3568     return false;
3569 
3570   bool IsCommutable = false;
3571   bool IsNegate = false;
3572   switch (Opc) {
3573   default:
3574     return false;
3575   case X86ISD::SUB:
3576     IsNegate = isNullConstant(StoredVal.getOperand(0));
3577     break;
3578   case X86ISD::SBB:
3579     break;
3580   case X86ISD::ADD:
3581   case X86ISD::ADC:
3582   case X86ISD::AND:
3583   case X86ISD::OR:
3584   case X86ISD::XOR:
3585     IsCommutable = true;
3586     break;
3587   }
3588 
3589   unsigned LoadOpNo = IsNegate ? 1 : 0;
3590   LoadSDNode *LoadNode = nullptr;
3591   SDValue InputChain;
3592   if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
3593                                    LoadNode, InputChain)) {
3594     if (!IsCommutable)
3595       return false;
3596 
3597     // This operation is commutable, try the other operand.
3598     LoadOpNo = 1;
3599     if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
3600                                      LoadNode, InputChain))
3601       return false;
3602   }
3603 
3604   SDValue Base, Scale, Index, Disp, Segment;
3605   if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp,
3606                   Segment))
3607     return false;
3608 
3609   auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
3610                           unsigned Opc8) {
3611     switch (MemVT.getSimpleVT().SimpleTy) {
3612     case MVT::i64:
3613       return Opc64;
3614     case MVT::i32:
3615       return Opc32;
3616     case MVT::i16:
3617       return Opc16;
3618     case MVT::i8:
3619       return Opc8;
3620     default:
3621       llvm_unreachable("Invalid size!");
3622     }
3623   };
3624 
3625   MachineSDNode *Result;
3626   switch (Opc) {
3627   case X86ISD::SUB:
3628     // Handle negate.
3629     if (IsNegate) {
3630       unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m,
3631                                      X86::NEG8m);
3632       const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3633       Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3634                                       MVT::Other, Ops);
3635       break;
3636     }
3637    [[fallthrough]];
3638   case X86ISD::ADD:
3639     // Try to match inc/dec.
3640     if (!Subtarget->slowIncDec() || CurDAG->shouldOptForSize()) {
3641       bool IsOne = isOneConstant(StoredVal.getOperand(1));
3642       bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1));
3643       // ADD/SUB with 1/-1 can use inc/dec when the carry flag isn't used.
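             // e.g. (illustrative) addl $1, (%rdi) is selected as incl (%rdi),
             // and addl $-1, (%rdi) as decl (%rdi).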
3644       if ((IsOne || IsNegOne) && hasNoCarryFlagUses(StoredVal.getValue(1))) {
3645         unsigned NewOpc =
3646           ((Opc == X86ISD::ADD) == IsOne)
3647               ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)
3648               : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
3649         const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3650         Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3651                                         MVT::Other, Ops);
3652         break;
3653       }
3654     }
3655     [[fallthrough]];
3656   case X86ISD::ADC:
3657   case X86ISD::SBB:
3658   case X86ISD::AND:
3659   case X86ISD::OR:
3660   case X86ISD::XOR: {
3661     auto SelectRegOpcode = [SelectOpcode](unsigned Opc) {
3662       switch (Opc) {
3663       case X86ISD::ADD:
3664         return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
3665                             X86::ADD8mr);
3666       case X86ISD::ADC:
3667         return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr,
3668                             X86::ADC8mr);
3669       case X86ISD::SUB:
3670         return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
3671                             X86::SUB8mr);
3672       case X86ISD::SBB:
3673         return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr,
3674                             X86::SBB8mr);
3675       case X86ISD::AND:
3676         return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
3677                             X86::AND8mr);
3678       case X86ISD::OR:
3679         return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr);
3680       case X86ISD::XOR:
3681         return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr,
3682                             X86::XOR8mr);
3683       default:
3684         llvm_unreachable("Invalid opcode!");
3685       }
3686     };
3687     auto SelectImmOpcode = [SelectOpcode](unsigned Opc) {
3688       switch (Opc) {
3689       case X86ISD::ADD:
3690         return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
3691                             X86::ADD8mi);
3692       case X86ISD::ADC:
3693         return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi,
3694                             X86::ADC8mi);
3695       case X86ISD::SUB:
3696         return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
3697                             X86::SUB8mi);
3698       case X86ISD::SBB:
3699         return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi,
3700                             X86::SBB8mi);
3701       case X86ISD::AND:
3702         return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
3703                             X86::AND8mi);
3704       case X86ISD::OR:
3705         return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi,
3706                             X86::OR8mi);
3707       case X86ISD::XOR:
3708         return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi,
3709                             X86::XOR8mi);
3710       default:
3711         llvm_unreachable("Invalid opcode!");
3712       }
3713     };
3714 
3715     unsigned NewOpc = SelectRegOpcode(Opc);
3716     SDValue Operand = StoredVal->getOperand(1-LoadOpNo);
3717 
3718     // See if the operand is a constant that we can fold into an immediate
3719     // operand.
3720     if (auto *OperandC = dyn_cast<ConstantSDNode>(Operand)) {
3721       int64_t OperandV = OperandC->getSExtValue();
3722 
3723       // Check if we can shrink the operand enough to fit in an immediate (or
3724       // fit into a smaller immediate) by negating it and switching the
3725       // operation.
3726       if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) &&
3727           ((MemVT != MVT::i8 && !isInt<8>(OperandV) && isInt<8>(-OperandV)) ||
3728            (MemVT == MVT::i64 && !isInt<32>(OperandV) &&
3729             isInt<32>(-OperandV))) &&
3730           hasNoCarryFlagUses(StoredVal.getValue(1))) {
3731         OperandV = -OperandV;
3732         Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
3733       }
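             // e.g. (illustrative) an add of 128 does not fit in an imm8, but the
             // equivalent sub of -128 does, so flipping ADD<->SUB with a negated
             // constant gives a smaller encoding when the carry flag is unused.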
3734 
3735       if (MemVT != MVT::i64 || isInt<32>(OperandV)) {
3736         Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
3737         NewOpc = SelectImmOpcode(Opc);
3738       }
3739     }
3740 
3741     if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) {
3742       SDValue CopyTo =
3743           CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS,
3744                                StoredVal.getOperand(2), SDValue());
3745 
3746       const SDValue Ops[] = {Base,    Scale,   Index,  Disp,
3747                              Segment, Operand, CopyTo, CopyTo.getValue(1)};
3748       Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3749                                       Ops);
3750     } else {
3751       const SDValue Ops[] = {Base,    Scale,   Index,     Disp,
3752                              Segment, Operand, InputChain};
3753       Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3754                                       Ops);
3755     }
3756     break;
3757   }
3758   default:
3759     llvm_unreachable("Invalid opcode!");
3760   }
3761 
3762   MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(),
3763                                  LoadNode->getMemOperand()};
3764   CurDAG->setNodeMemRefs(Result, MemOps);
3765 
3766   // Update Load Chain uses as well.
3767   ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1));
3768   ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
3769   ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
3770   CurDAG->RemoveDeadNode(Node);
3771   return true;
3772 }
3773 
3774 // See if this is an  X & Mask  that we can match to BEXTR/BZHI.
3775 // Where Mask is one of the following patterns:
3776 //   a) x &  (1 << nbits) - 1
3777 //   b) x & ~(-1 << nbits)
3778 //   c) x &  (-1 >> (32 - y))
3779 //   d) x << (32 - y) >> (32 - y)
3780 //   e) (1 << nbits) - 1
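     // For reference (summary, not part of the original comment): BZHI computes
     // src & ((1 << nbits) - 1), clearing all bits at positions >= nbits, while
     // BEXTR computes (src >> ctrl[7:0]) & ((1 << ctrl[15:8]) - 1).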
3781 bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
3782   assert(
3783       (Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::AND ||
3784        Node->getOpcode() == ISD::SRL) &&
3785       "Should be either an and-mask, or right-shift after clearing high bits.");
3786 
3787   // BEXTR is a BMI instruction, BZHI is a BMI2 instruction. We need at least one.
3788   if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())
3789     return false;
3790 
3791   MVT NVT = Node->getSimpleValueType(0);
3792 
3793   // Only supported for 32 and 64 bits.
3794   if (NVT != MVT::i32 && NVT != MVT::i64)
3795     return false;
3796 
3797   SDValue NBits;
3798   bool NegateNBits;
3799 
3800   // If we have BMI2's BZHI, we are ok with multi-use patterns.
3801   // Else, if we only have BMI1's BEXTR, we require one-use.
3802   const bool AllowExtraUsesByDefault = Subtarget->hasBMI2();
3803   auto checkUses = [AllowExtraUsesByDefault](
3804                        SDValue Op, unsigned NUses,
3805                        std::optional<bool> AllowExtraUses) {
3806     return AllowExtraUses.value_or(AllowExtraUsesByDefault) ||
3807            Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo());
3808   };
3809   auto checkOneUse = [checkUses](SDValue Op,
3810                                  std::optional<bool> AllowExtraUses =
3811                                      std::nullopt) {
3812     return checkUses(Op, 1, AllowExtraUses);
3813   };
3814   auto checkTwoUse = [checkUses](SDValue Op,
3815                                  std::optional<bool> AllowExtraUses =
3816                                      std::nullopt) {
3817     return checkUses(Op, 2, AllowExtraUses);
3818   };
3819 
3820   auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) {
3821     if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) {
3822       assert(V.getSimpleValueType() == MVT::i32 &&
3823              V.getOperand(0).getSimpleValueType() == MVT::i64 &&
3824              "Expected i64 -> i32 truncation");
3825       V = V.getOperand(0);
3826     }
3827     return V;
3828   };
3829 
3830   // a) x & ((1 << nbits) + (-1))
3831   auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation, &NBits,
3832                         &NegateNBits](SDValue Mask) -> bool {
3833     // Match `add`. Must only have one use!
3834     if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask))
3835       return false;
3836     // We should be adding an all-ones constant (i.e. subtracting one).
3837     if (!isAllOnesConstant(Mask->getOperand(1)))
3838       return false;
3839     // Match `1 << nbits`. Might be truncated. Must only have one use!
3840     SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
3841     if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3842       return false;
3843     if (!isOneConstant(M0->getOperand(0)))
3844       return false;
3845     NBits = M0->getOperand(1);
3846     NegateNBits = false;
3847     return true;
3848   };
3849 
3850   auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) {
3851     V = peekThroughOneUseTruncation(V);
3852     return CurDAG->MaskedValueIsAllOnes(
3853         V, APInt::getLowBitsSet(V.getSimpleValueType().getSizeInBits(),
3854                                 NVT.getSizeInBits()));
3855   };
3856 
3857   // b) x & ~(-1 << nbits)
3858   auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation,
3859                         &NBits, &NegateNBits](SDValue Mask) -> bool {
3860     // Match `~()`. Must only have one use!
3861     if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask))
3862       return false;
3863     // The -1 only has to be all-ones for the final Node's NVT.
3864     if (!isAllOnes(Mask->getOperand(1)))
3865       return false;
3866     // Match `-1 << nbits`. Might be truncated. Must only have one use!
3867     SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
3868     if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3869       return false;
3870     // The -1 only has to be all-ones for the final Node's NVT.
3871     if (!isAllOnes(M0->getOperand(0)))
3872       return false;
3873     NBits = M0->getOperand(1);
3874     NegateNBits = false;
3875     return true;
3876   };
3877 
3878   // Try to match potentially-truncated shift amount as `(bitwidth - y)`,
3879   // or leave the shift amount as-is, but then we'll have to negate it.
3880   auto canonicalizeShiftAmt = [&NBits, &NegateNBits](SDValue ShiftAmt,
3881                                                      unsigned Bitwidth) {
3882     NBits = ShiftAmt;
3883     NegateNBits = true;
3884     // Skip over a truncate of the shift amount, if any.
3885     if (NBits.getOpcode() == ISD::TRUNCATE)
3886       NBits = NBits.getOperand(0);
3887     // Try to match the shift amount as (bitwidth - y). It should go away, too.
3888     // If it doesn't match, that's fine, we'll just negate it ourselves.
3889     if (NBits.getOpcode() != ISD::SUB)
3890       return;
3891     auto *V0 = dyn_cast<ConstantSDNode>(NBits.getOperand(0));
3892     if (!V0 || V0->getZExtValue() != Bitwidth)
3893       return;
3894     NBits = NBits.getOperand(1);
3895     NegateNBits = false;
3896   };
3897 
3898   // c) x &  (-1 >> z)  but then we'll have to subtract z from bitwidth
3899   //   or
3900   // c) x &  (-1 >> (32 - y))
3901   auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation, &NegateNBits,
3902                         canonicalizeShiftAmt](SDValue Mask) -> bool {
3903     // The mask itself may be truncated.
3904     Mask = peekThroughOneUseTruncation(Mask);
3905     unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits();
3906     // Match `l>>`. Must only have one use!
3907     if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask))
3908       return false;
3909     // We should be shifting truly all-ones constant.
3910     if (!isAllOnesConstant(Mask.getOperand(0)))
3911       return false;
3912     SDValue M1 = Mask.getOperand(1);
3913     // The shift amount should not be used externally.
3914     if (!checkOneUse(M1))
3915       return false;
3916     canonicalizeShiftAmt(M1, Bitwidth);
3917     // Pattern c. is non-canonical, and is expanded into pattern d. iff there
3918     // is no extra use of the mask. Clearly, there was one since we are here.
3919     // But at the same time, if we need to negate the shift amount,
3920     // then we don't want the mask to stick around, else it's unprofitable.
3921     return !NegateNBits;
3922   };
3923 
3924   SDValue X;
3925 
3926   // d) x << z >> z  but then we'll have to subtract z from bitwidth
3927   //   or
3928   // d) x << (32 - y) >> (32 - y)
3929   auto matchPatternD = [checkOneUse, checkTwoUse, canonicalizeShiftAmt,
3930                         AllowExtraUsesByDefault, &NegateNBits,
3931                         &X](SDNode *Node) -> bool {
3932     if (Node->getOpcode() != ISD::SRL)
3933       return false;
3934     SDValue N0 = Node->getOperand(0);
3935     if (N0->getOpcode() != ISD::SHL)
3936       return false;
3937     unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits();
3938     SDValue N1 = Node->getOperand(1);
3939     SDValue N01 = N0->getOperand(1);
3940     // Both of the shifts must be by the exact same value.
3941     if (N1 != N01)
3942       return false;
3943     canonicalizeShiftAmt(N1, Bitwidth);
3944     // There should not be any external uses of the inner shift / shift amount.
3945     // Note that while we are generally okay with external uses given BMI2,
3946     // if we need to negate the shift amount, we are not okay with extra uses.
3947     const bool AllowExtraUses = AllowExtraUsesByDefault && !NegateNBits;
3948     if (!checkOneUse(N0, AllowExtraUses) || !checkTwoUse(N1, AllowExtraUses))
3949       return false;
3950     X = N0->getOperand(0);
3951     return true;
3952   };
3953 
3954   auto matchLowBitMask = [matchPatternA, matchPatternB,
3955                           matchPatternC](SDValue Mask) -> bool {
3956     return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask);
3957   };
3958 
3959   if (Node->getOpcode() == ISD::AND) {
3960     X = Node->getOperand(0);
3961     SDValue Mask = Node->getOperand(1);
3962 
3963     if (matchLowBitMask(Mask)) {
3964       // Great.
3965     } else {
3966       std::swap(X, Mask);
3967       if (!matchLowBitMask(Mask))
3968         return false;
3969     }
3970   } else if (matchLowBitMask(SDValue(Node, 0))) {
3971     X = CurDAG->getAllOnesConstant(SDLoc(Node), NVT);
3972   } else if (!matchPatternD(Node))
3973     return false;
3974 
3975   // If we need to negate the shift amount, require BMI2 BZHI support.
3976   // It's just too unprofitable for BMI1 BEXTR.
3977   if (NegateNBits && !Subtarget->hasBMI2())
3978     return false;
3979 
3980   SDLoc DL(Node);
3981 
3982   // Truncate the shift amount.
3983   NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
3984   insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3985 
3986   // Insert 8-bit NBits into lowest 8 bits of 32-bit register.
3987   // All the other bits are undefined, we do not care about them.
3988   SDValue ImplDef = SDValue(
3989       CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0);
3990   insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef);
3991 
3992   SDValue SRIdxVal = CurDAG->getTargetConstant(X86::sub_8bit, DL, MVT::i32);
3993   insertDAGNode(*CurDAG, SDValue(Node, 0), SRIdxVal);
3994   NBits = SDValue(CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
3995                                          MVT::i32, ImplDef, NBits, SRIdxVal),
3996                   0);
3997   insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3998 
3999   // We might have matched the amount of high bits to be cleared,
4000   // but we want the amount of low bits to be kept, so negate it then.
4001   if (NegateNBits) {
4002     SDValue BitWidthC = CurDAG->getConstant(NVT.getSizeInBits(), DL, MVT::i32);
4003     insertDAGNode(*CurDAG, SDValue(Node, 0), BitWidthC);
4004 
4005     NBits = CurDAG->getNode(ISD::SUB, DL, MVT::i32, BitWidthC, NBits);
4006     insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
4007   }
4008 
4009   if (Subtarget->hasBMI2()) {
4010     // Great, just emit the BZHI.
4011     if (NVT != MVT::i32) {
4012       // But have to place the bit count into the wide-enough register first.
4013       NBits = CurDAG->getNode(ISD::ANY_EXTEND, DL, NVT, NBits);
4014       insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
4015     }
4016 
4017     SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits);
4018     ReplaceNode(Node, Extract.getNode());
4019     SelectCode(Extract.getNode());
4020     return true;
4021   }
4022 
4023   // Else, if we do *NOT* have BMI2, let's find out if 'X' is
4024   // *logically* shifted (potentially with a one-use trunc in between),
4025   // and if the truncation was the only use of the shift,
4026   // and if so, look past the one-use truncation.
4027   {
4028     SDValue RealX = peekThroughOneUseTruncation(X);
4029     // FIXME: only if the shift is one-use?
4030     if (RealX != X && RealX.getOpcode() == ISD::SRL)
4031       X = RealX;
4032   }
4033 
4034   MVT XVT = X.getSimpleValueType();
4035 
4036   // Else, emitting BEXTR requires one more step.
4037   // The 'control' of BEXTR has the pattern of:
4038   // [15...8 bit][ 7...0 bit] location
4039   // [ bit count][     shift] name
4040   // I.e. 0b00000011'00000001 means (x >> 0b1) & 0b111
4041 
4042   // Shift NBits left by 8 bits, thus producing 'control'.
4043   // This makes the low 8 bits zero.
4044   SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);
4045   insertDAGNode(*CurDAG, SDValue(Node, 0), C8);
4046   SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8);
4047   insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
4048 
4049   // If the 'X' is *logically* shifted, we can fold that shift into 'control'.
4050   // FIXME: only if the shift is one-use?
4051   if (X.getOpcode() == ISD::SRL) {
4052     SDValue ShiftAmt = X.getOperand(1);
4053     X = X.getOperand(0);
4054 
4055     assert(ShiftAmt.getValueType() == MVT::i8 &&
4056            "Expected shift amount to be i8");
4057 
4058     // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
4059     // We could zext to i16 in some form, but we intentionally don't do that.
4060     SDValue OrigShiftAmt = ShiftAmt;
4061     ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt);
4062     insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt);
4063 
4064     // And now 'or' these low 8 bits of shift amount into the 'control'.
4065     Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt);
4066     insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
4067   }
4068 
4069   // But have to place the 'control' into the wide-enough register first.
4070   if (XVT != MVT::i32) {
4071     Control = CurDAG->getNode(ISD::ANY_EXTEND, DL, XVT, Control);
4072     insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
4073   }
4074 
4075   // And finally, form the BEXTR itself.
4076   SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, XVT, X, Control);
4077 
4078   // The 'X' was originally truncated. Do that now.
4079   if (XVT != NVT) {
4080     insertDAGNode(*CurDAG, SDValue(Node, 0), Extract);
4081     Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract);
4082   }
4083 
4084   ReplaceNode(Node, Extract.getNode());
4085   SelectCode(Extract.getNode());
4086 
4087   return true;
4088 }
4089 
4090 // See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
4091 MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
4092   MVT NVT = Node->getSimpleValueType(0);
4093   SDLoc dl(Node);
4094 
4095   SDValue N0 = Node->getOperand(0);
4096   SDValue N1 = Node->getOperand(1);
4097 
4098   // If we have TBM we can use an immediate for the control. If we have BMI
4099   // we should only do this if the BEXTR instruction is implemented well.
4100   // Otherwise moving the control into a register makes this more costly.
4101   // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
4102   // hoisting the move immediate would make it worthwhile with a less optimal
4103   // BEXTR?
4104   bool PreferBEXTR =
4105       Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR());
4106   if (!PreferBEXTR && !Subtarget->hasBMI2())
4107     return nullptr;
4108 
4109   // Must have a shift right.
4110   if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
4111     return nullptr;
4112 
4113   // Shift can't have additional users.
4114   if (!N0->hasOneUse())
4115     return nullptr;
4116 
4117   // Only supported for 32 and 64 bits.
4118   if (NVT != MVT::i32 && NVT != MVT::i64)
4119     return nullptr;
4120 
4121   // Shift amount and RHS of and must be constant.
4122   auto *MaskCst = dyn_cast<ConstantSDNode>(N1);
4123   auto *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
4124   if (!MaskCst || !ShiftCst)
4125     return nullptr;
4126 
4127   // And RHS must be a mask.
4128   uint64_t Mask = MaskCst->getZExtValue();
4129   if (!isMask_64(Mask))
4130     return nullptr;
4131 
4132   uint64_t Shift = ShiftCst->getZExtValue();
4133   uint64_t MaskSize = llvm::popcount(Mask);
4134 
4135   // Don't interfere with something that can be handled by extracting AH.
4136   // TODO: If we are able to fold a load, BEXTR might still be better than AH.
4137   if (Shift == 8 && MaskSize == 8)
4138     return nullptr;
4139 
4140   // Make sure we are only using bits that were in the original value, not
4141   // shifted in.
4142   if (Shift + MaskSize > NVT.getSizeInBits())
4143     return nullptr;
4144 
4145   // BZHI, if available, is always fast, unlike BEXTR. But even if we decide
4146   // that we can't use BEXTR, it is only worthwhile using BZHI if the mask
4147   // does not fit into 32 bits. Load folding is not a sufficient reason.
4148   if (!PreferBEXTR && MaskSize <= 32)
4149     return nullptr;
4150 
4151   SDValue Control;
4152   unsigned ROpc, MOpc;
4153 
4154 #define GET_EGPR_IF_ENABLED(OPC) (Subtarget->hasEGPR() ? OPC##_EVEX : OPC)
4155   if (!PreferBEXTR) {
4156     assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then.");
4157     // If we can't make use of BEXTR then we can't fuse shift+mask stages.
4158     // Let's perform the mask first, and apply shift later. Note that we need to
4159     // widen the mask to account for the fact that we'll apply shift afterwards!
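         // E.g. (x >> 8) & 0xff'ffff'ffff (Shift = 8, MaskSize = 40) becomes a
         // BZHI keeping the low 48 bits, followed by the SHR by 8 emitted below.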
4160     Control = CurDAG->getTargetConstant(Shift + MaskSize, dl, NVT);
4161     ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rr)
4162                            : GET_EGPR_IF_ENABLED(X86::BZHI32rr);
4163     MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rm)
4164                            : GET_EGPR_IF_ENABLED(X86::BZHI32rm);
4165     unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
4166     Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
4167   } else {
4168     // The 'control' of BEXTR has the pattern of:
4169     // [15...8 bit][ 7...0 bit] location
4170     // [ bit count][     shift] name
4171     // I.e. 0b00000011'00000001 means (x >> 0b1) & 0b111
4172     Control = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
4173     if (Subtarget->hasTBM()) {
4174       ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
4175       MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
4176     } else {
4177       assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then.");
4178       // BMI requires the immediate to be placed in a register.
4179       ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rr)
4180                              : GET_EGPR_IF_ENABLED(X86::BEXTR32rr);
4181       MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rm)
4182                              : GET_EGPR_IF_ENABLED(X86::BEXTR32rm);
4183       unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
4184       Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
4185     }
4186   }
4187 
4188   MachineSDNode *NewNode;
4189   SDValue Input = N0->getOperand(0);
4190   SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4191   if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4192     SDValue Ops[] = {
4193         Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(0)};
4194     SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
4195     NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4196     // Update the chain.
4197     ReplaceUses(Input.getValue(1), SDValue(NewNode, 2));
4198     // Record the mem-refs
4199     CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
4200   } else {
4201     NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, Control);
4202   }
4203 
4204   if (!PreferBEXTR) {
4205     // We still need to apply the shift.
4206     SDValue ShAmt = CurDAG->getTargetConstant(Shift, dl, NVT);
4207     unsigned NewOpc = NVT == MVT::i64 ? GET_ND_IF_ENABLED(X86::SHR64ri)
4208                                       : GET_ND_IF_ENABLED(X86::SHR32ri);
4209     NewNode =
4210         CurDAG->getMachineNode(NewOpc, dl, NVT, SDValue(NewNode, 0), ShAmt);
4211   }
4212 
4213   return NewNode;
4214 }
4215 
4216 // Emit a PCMPISTR(I/M) instruction.
4217 MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
4218                                              bool MayFoldLoad, const SDLoc &dl,
4219                                              MVT VT, SDNode *Node) {
4220   SDValue N0 = Node->getOperand(0);
4221   SDValue N1 = Node->getOperand(1);
4222   SDValue Imm = Node->getOperand(2);
4223   auto *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
4224   Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
4225 
4226   // Try to fold a load. No need to check alignment.
4227   SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4228   if (MayFoldLoad && tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4229     SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
4230                       N1.getOperand(0) };
4231     SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
4232     MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4233     // Update the chain.
4234     ReplaceUses(N1.getValue(1), SDValue(CNode, 2));
4235     // Record the mem-refs
4236     CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
4237     return CNode;
4238   }
4239 
4240   SDValue Ops[] = { N0, N1, Imm };
4241   SDVTList VTs = CurDAG->getVTList(VT, MVT::i32);
4242   MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
4243   return CNode;
4244 }
4245 
4246 // Emit a PCMPESTR(I/M) instruction. Also return the Glue result in case we need
4247 // to emit a second instruction after this one. This is needed since we have two
4248 // copyToReg nodes glued before this and we need to continue that glue through.
4249 MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
4250                                              bool MayFoldLoad, const SDLoc &dl,
4251                                              MVT VT, SDNode *Node,
4252                                              SDValue &InGlue) {
4253   SDValue N0 = Node->getOperand(0);
4254   SDValue N2 = Node->getOperand(2);
4255   SDValue Imm = Node->getOperand(4);
4256   auto *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
4257   Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
4258 
4259   // Try to fold a load. No need to check alignment.
4260   SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4261   if (MayFoldLoad && tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4262     SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
4263                       N2.getOperand(0), InGlue };
4264     SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
4265     MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4266     InGlue = SDValue(CNode, 3);
4267     // Update the chain.
4268     ReplaceUses(N2.getValue(1), SDValue(CNode, 2));
4269     // Record the mem-refs
4270     CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N2)->getMemOperand()});
4271     return CNode;
4272   }
4273 
4274   SDValue Ops[] = { N0, N2, Imm, InGlue };
4275   SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue);
4276   MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
4277   InGlue = SDValue(CNode, 2);
4278   return CNode;
4279 }
4280 
4281 bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
4282   EVT VT = N->getValueType(0);
4283 
4284   // Only handle scalar shifts.
4285   if (VT.isVector())
4286     return false;
4287 
4288   // Narrower shifts only mask to 5 bits in hardware.
4289   unsigned Size = VT == MVT::i64 ? 64 : 32;
4290 
4291   SDValue OrigShiftAmt = N->getOperand(1);
4292   SDValue ShiftAmt = OrigShiftAmt;
4293   SDLoc DL(N);
4294 
4295   // Skip over a truncate of the shift amount.
4296   if (ShiftAmt->getOpcode() == ISD::TRUNCATE)
4297     ShiftAmt = ShiftAmt->getOperand(0);
4298 
4299   // This function is called after X86DAGToDAGISel::matchBitExtract(),
4300   // so we are not afraid that we might mess up BZHI/BEXTR pattern.
4301 
4302   SDValue NewShiftAmt;
4303   if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB ||
4304       ShiftAmt->getOpcode() == ISD::XOR) {
4305     SDValue Add0 = ShiftAmt->getOperand(0);
4306     SDValue Add1 = ShiftAmt->getOperand(1);
4307     auto *Add0C = dyn_cast<ConstantSDNode>(Add0);
4308     auto *Add1C = dyn_cast<ConstantSDNode>(Add1);
4309     // If we are shifting by X+/-/^N where N == 0 mod Size, then just shift by X
4310     // to avoid the ADD/SUB/XOR.
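         // E.g. a 32-bit shift by (x + 32) can simply shift by x, since the
         // hardware masks the shift amount to its low 5 bits anyway.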
4311     if (Add1C && Add1C->getAPIntValue().urem(Size) == 0) {
4312       NewShiftAmt = Add0;
4313 
4314     } else if (ShiftAmt->getOpcode() != ISD::ADD && ShiftAmt.hasOneUse() &&
4315                ((Add0C && Add0C->getAPIntValue().urem(Size) == Size - 1) ||
4316                 (Add1C && Add1C->getAPIntValue().urem(Size) == Size - 1))) {
4317       // If we are doing a NOT on just the lower bits with (Size*N-1) -/^ X
4318       // we can replace it with a NOT. In the XOR case it may save some code
4319       // size, in the SUB case it also may save a move.
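           // E.g. a 32-bit shift by (31 - x) can shift by ~x instead, because
           // (31 - x), (x ^ 31) and ~x all agree in the low 5 bits.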
4320       assert(Add0C == nullptr || Add1C == nullptr);
4321 
4322       // We can only do N-X, not X-N
4323       if (ShiftAmt->getOpcode() == ISD::SUB && Add0C == nullptr)
4324         return false;
4325 
4326       EVT OpVT = ShiftAmt.getValueType();
4327 
4328       SDValue AllOnes = CurDAG->getAllOnesConstant(DL, OpVT);
4329       NewShiftAmt = CurDAG->getNode(ISD::XOR, DL, OpVT,
4330                                     Add0C == nullptr ? Add0 : Add1, AllOnes);
4331       insertDAGNode(*CurDAG, OrigShiftAmt, AllOnes);
4332       insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4333       // If we are shifting by N-X where N == 0 mod Size, then just shift by
4334       // -X to generate a NEG instead of a SUB of a constant.
4335     } else if (ShiftAmt->getOpcode() == ISD::SUB && Add0C &&
4336                Add0C->getZExtValue() != 0) {
4337       EVT SubVT = ShiftAmt.getValueType();
4338       SDValue X;
4339       if (Add0C->getZExtValue() % Size == 0)
4340         X = Add1;
4341       else if (ShiftAmt.hasOneUse() && Size == 64 &&
4342                Add0C->getZExtValue() % 32 == 0) {
4343         // We have a 64-bit shift by (n*32-x), turn it into -(x+n*32).
4344         // This is mainly beneficial if we already compute (x+n*32).
4345         if (Add1.getOpcode() == ISD::TRUNCATE) {
4346           Add1 = Add1.getOperand(0);
4347           SubVT = Add1.getValueType();
4348         }
4349         if (Add0.getValueType() != SubVT) {
4350           Add0 = CurDAG->getZExtOrTrunc(Add0, DL, SubVT);
4351           insertDAGNode(*CurDAG, OrigShiftAmt, Add0);
4352         }
4353 
4354         X = CurDAG->getNode(ISD::ADD, DL, SubVT, Add1, Add0);
4355         insertDAGNode(*CurDAG, OrigShiftAmt, X);
4356       } else
4357         return false;
4358       // Insert a negate op.
4359       // TODO: This isn't guaranteed to replace the sub if there is a logic cone
4360       // that uses it that's not a shift.
4361       SDValue Zero = CurDAG->getConstant(0, DL, SubVT);
4362       SDValue Neg = CurDAG->getNode(ISD::SUB, DL, SubVT, Zero, X);
4363       NewShiftAmt = Neg;
4364 
4365       // Insert these operands into a valid topological order so they can
4366       // get selected independently.
4367       insertDAGNode(*CurDAG, OrigShiftAmt, Zero);
4368       insertDAGNode(*CurDAG, OrigShiftAmt, Neg);
4369     } else
4370       return false;
4371   } else
4372     return false;
4373 
4374   if (NewShiftAmt.getValueType() != MVT::i8) {
4375     // Need to truncate the shift amount.
4376     NewShiftAmt = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NewShiftAmt);
4377     // Add to a correct topological ordering.
4378     insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4379   }
4380 
4381   // Insert a new mask to keep the shift amount legal. This should be removed
4382   // by isel patterns.
4383   NewShiftAmt = CurDAG->getNode(ISD::AND, DL, MVT::i8, NewShiftAmt,
4384                                 CurDAG->getConstant(Size - 1, DL, MVT::i8));
4385   // Place in a correct topological ordering.
4386   insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4387 
4388   SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, N->getOperand(0),
4389                                                    NewShiftAmt);
4390   if (UpdatedNode != N) {
4391     // If we found an existing node, we should replace ourselves with that node
4392     // and wait for it to be selected after its other users.
4393     ReplaceNode(N, UpdatedNode);
4394     return true;
4395   }
4396 
4397   // If the original shift amount is now dead, delete it so that we don't run
4398   // it through isel.
4399   if (OrigShiftAmt.getNode()->use_empty())
4400     CurDAG->RemoveDeadNode(OrigShiftAmt.getNode());
4401 
4402   // Now that we've optimized the shift amount, defer to normal isel to get
4403   // load folding and legacy vs BMI2 selection without repeating it here.
4404   SelectCode(N);
4405   return true;
4406 }
4407 
4408 bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
4409   MVT NVT = N->getSimpleValueType(0);
4410   unsigned Opcode = N->getOpcode();
4411   SDLoc dl(N);
4412 
4413   // For operations of the form (x << C1) op C2, check if we can use a smaller
4414   // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
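       // E.g. for i64, (x << 8) & 0xff00000000 needs a 64-bit mask immediate,
       // while (x & 0xff000000) << 8 gets by with a 32-bit one.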
4415   SDValue Shift = N->getOperand(0);
4416   SDValue N1 = N->getOperand(1);
4417 
4418   auto *Cst = dyn_cast<ConstantSDNode>(N1);
4419   if (!Cst)
4420     return false;
4421 
4422   int64_t Val = Cst->getSExtValue();
4423 
4424   // If we have an any_extend feeding the AND, look through it to see if there
4425   // is a shift behind it. But only if the AND doesn't use the extended bits.
4426   // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
4427   bool FoundAnyExtend = false;
4428   if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
4429       Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
4430       isUInt<32>(Val)) {
4431     FoundAnyExtend = true;
4432     Shift = Shift.getOperand(0);
4433   }
4434 
4435   if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
4436     return false;
4437 
4438   // i8 is unshrinkable, i16 should be promoted to i32.
4439   if (NVT != MVT::i32 && NVT != MVT::i64)
4440     return false;
4441 
4442   auto *ShlCst = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
4443   if (!ShlCst)
4444     return false;
4445 
4446   uint64_t ShAmt = ShlCst->getZExtValue();
4447 
4448   // Make sure that we don't change the operation by removing bits.
4449   // This only matters for OR and XOR; AND is unaffected.
4450   uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
4451   if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
4452     return false;
4453 
4454   // Check the minimum bitwidth for the new constant.
4455   // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
4456   auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
4457     if (Opcode == ISD::AND) {
4458       // AND32ri is the same as AND64ri32 with zext imm.
4459       // Try this before sign extended immediates below.
4460       ShiftedVal = (uint64_t)Val >> ShAmt;
4461       if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
4462         return true;
4463       // Also swap order when the AND can become MOVZX.
4464       if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX)
4465         return true;
4466     }
4467     ShiftedVal = Val >> ShAmt;
4468     if ((!isInt<8>(Val) && isInt<8>(ShiftedVal)) ||
4469         (!isInt<32>(Val) && isInt<32>(ShiftedVal)))
4470       return true;
4471     if (Opcode != ISD::AND) {
4472       // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
4473       ShiftedVal = (uint64_t)Val >> ShAmt;
4474       if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
4475         return true;
4476     }
4477     return false;
4478   };
4479 
4480   int64_t ShiftedVal;
4481   if (!CanShrinkImmediate(ShiftedVal))
4482     return false;
4483 
4484   // Ok, we can reorder to get a smaller immediate.
4485 
4486   // But, its possible the original immediate allowed an AND to become MOVZX.
4487   // Doing this late due to avoid the MakedValueIsZero call as late as
4488   // possible.
4489   if (Opcode == ISD::AND) {
4490     // Find the smallest zext this could possibly be.
4491     unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
4492     ZExtWidth = llvm::bit_ceil(std::max(ZExtWidth, 8U));
4493 
4494     // Figure out which bits need to be zero to achieve that mask.
4495     APInt NeededMask = APInt::getLowBitsSet(NVT.getSizeInBits(),
4496                                             ZExtWidth);
4497     NeededMask &= ~Cst->getAPIntValue();
4498 
4499     if (CurDAG->MaskedValueIsZero(N->getOperand(0), NeededMask))
4500       return false;
4501   }
4502 
4503   SDValue X = Shift.getOperand(0);
4504   if (FoundAnyExtend) {
4505     SDValue NewX = CurDAG->getNode(ISD::ANY_EXTEND, dl, NVT, X);
4506     insertDAGNode(*CurDAG, SDValue(N, 0), NewX);
4507     X = NewX;
4508   }
4509 
4510   SDValue NewCst = CurDAG->getConstant(ShiftedVal, dl, NVT);
4511   insertDAGNode(*CurDAG, SDValue(N, 0), NewCst);
4512   SDValue NewBinOp = CurDAG->getNode(Opcode, dl, NVT, X, NewCst);
4513   insertDAGNode(*CurDAG, SDValue(N, 0), NewBinOp);
4514   SDValue NewSHL = CurDAG->getNode(ISD::SHL, dl, NVT, NewBinOp,
4515                                    Shift.getOperand(1));
4516   ReplaceNode(N, NewSHL.getNode());
4517   SelectCode(NewSHL.getNode());
4518   return true;
4519 }
4520 
4521 bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA,
4522                                      SDNode *ParentB, SDNode *ParentC,
4523                                      SDValue A, SDValue B, SDValue C,
4524                                      uint8_t Imm) {
4525   assert(A.isOperandOf(ParentA) && B.isOperandOf(ParentB) &&
4526          C.isOperandOf(ParentC) && "Incorrect parent node");
4527 
4528   auto tryFoldLoadOrBCast =
4529       [this](SDNode *Root, SDNode *P, SDValue &L, SDValue &Base, SDValue &Scale,
4530              SDValue &Index, SDValue &Disp, SDValue &Segment) {
4531         if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment))
4532           return true;
4533 
4534         // Not a load, check for broadcast which may be behind a bitcast.
4535         if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
4536           P = L.getNode();
4537           L = L.getOperand(0);
4538         }
4539 
4540         if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
4541           return false;
4542 
4543         // Only 32 and 64 bit broadcasts are supported.
4544         auto *MemIntr = cast<MemIntrinsicSDNode>(L);
4545         unsigned Size = MemIntr->getMemoryVT().getSizeInBits();
4546         if (Size != 32 && Size != 64)
4547           return false;
4548 
4549         return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment);
4550       };
4551 
4552   bool FoldedLoad = false;
4553   SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4554   if (tryFoldLoadOrBCast(Root, ParentC, C, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4555     FoldedLoad = true;
4556   } else if (tryFoldLoadOrBCast(Root, ParentA, A, Tmp0, Tmp1, Tmp2, Tmp3,
4557                                 Tmp4)) {
4558     FoldedLoad = true;
4559     std::swap(A, C);
4560     // Swap bits 1/4 and 3/6.
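         // With the usual truth-table constants (A=0xf0, B=0xcc, C=0xaa; see
         // tryVPTERNLOG below), e.g. Imm = 0x88 (B & C) is remapped to 0xc0
         // (A & B), since the old C value now sits in the A operand slot.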
4561     uint8_t OldImm = Imm;
4562     Imm = OldImm & 0xa5;
4563     if (OldImm & 0x02) Imm |= 0x10;
4564     if (OldImm & 0x10) Imm |= 0x02;
4565     if (OldImm & 0x08) Imm |= 0x40;
4566     if (OldImm & 0x40) Imm |= 0x08;
4567   } else if (tryFoldLoadOrBCast(Root, ParentB, B, Tmp0, Tmp1, Tmp2, Tmp3,
4568                                 Tmp4)) {
4569     FoldedLoad = true;
4570     std::swap(B, C);
4571     // Swap bits 1/2 and 5/6.
4572     uint8_t OldImm = Imm;
4573     Imm = OldImm & 0x99;
4574     if (OldImm & 0x02) Imm |= 0x04;
4575     if (OldImm & 0x04) Imm |= 0x02;
4576     if (OldImm & 0x20) Imm |= 0x40;
4577     if (OldImm & 0x40) Imm |= 0x20;
4578   }
4579 
4580   SDLoc DL(Root);
4581 
4582   SDValue TImm = CurDAG->getTargetConstant(Imm, DL, MVT::i8);
4583 
4584   MVT NVT = Root->getSimpleValueType(0);
4585 
4586   MachineSDNode *MNode;
4587   if (FoldedLoad) {
4588     SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
4589 
4590     unsigned Opc;
4591     if (C.getOpcode() == X86ISD::VBROADCAST_LOAD) {
4592       auto *MemIntr = cast<MemIntrinsicSDNode>(C);
4593       unsigned EltSize = MemIntr->getMemoryVT().getSizeInBits();
4594       assert((EltSize == 32 || EltSize == 64) && "Unexpected broadcast size!");
4595 
4596       bool UseD = EltSize == 32;
4597       if (NVT.is128BitVector())
4598         Opc = UseD ? X86::VPTERNLOGDZ128rmbi : X86::VPTERNLOGQZ128rmbi;
4599       else if (NVT.is256BitVector())
4600         Opc = UseD ? X86::VPTERNLOGDZ256rmbi : X86::VPTERNLOGQZ256rmbi;
4601       else if (NVT.is512BitVector())
4602         Opc = UseD ? X86::VPTERNLOGDZrmbi : X86::VPTERNLOGQZrmbi;
4603       else
4604         llvm_unreachable("Unexpected vector size!");
4605     } else {
4606       bool UseD = NVT.getVectorElementType() == MVT::i32;
4607       if (NVT.is128BitVector())
4608         Opc = UseD ? X86::VPTERNLOGDZ128rmi : X86::VPTERNLOGQZ128rmi;
4609       else if (NVT.is256BitVector())
4610         Opc = UseD ? X86::VPTERNLOGDZ256rmi : X86::VPTERNLOGQZ256rmi;
4611       else if (NVT.is512BitVector())
4612         Opc = UseD ? X86::VPTERNLOGDZrmi : X86::VPTERNLOGQZrmi;
4613       else
4614         llvm_unreachable("Unexpected vector size!");
4615     }
4616 
4617     SDValue Ops[] = {A, B, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, TImm, C.getOperand(0)};
4618     MNode = CurDAG->getMachineNode(Opc, DL, VTs, Ops);
4619 
4620     // Update the chain.
4621     ReplaceUses(C.getValue(1), SDValue(MNode, 1));
4622     // Record the mem-refs
4623     CurDAG->setNodeMemRefs(MNode, {cast<MemSDNode>(C)->getMemOperand()});
4624   } else {
4625     bool UseD = NVT.getVectorElementType() == MVT::i32;
4626     unsigned Opc;
4627     if (NVT.is128BitVector())
4628       Opc = UseD ? X86::VPTERNLOGDZ128rri : X86::VPTERNLOGQZ128rri;
4629     else if (NVT.is256BitVector())
4630       Opc = UseD ? X86::VPTERNLOGDZ256rri : X86::VPTERNLOGQZ256rri;
4631     else if (NVT.is512BitVector())
4632       Opc = UseD ? X86::VPTERNLOGDZrri : X86::VPTERNLOGQZrri;
4633     else
4634       llvm_unreachable("Unexpected vector size!");
4635 
4636     MNode = CurDAG->getMachineNode(Opc, DL, NVT, {A, B, C, TImm});
4637   }
4638 
4639   ReplaceUses(SDValue(Root, 0), SDValue(MNode, 0));
4640   CurDAG->RemoveDeadNode(Root);
4641   return true;
4642 }
4643 
4644 // Try to match two logic ops to a VPTERNLOG.
4645 // FIXME: Handle more complex patterns that use an operand more than once?
4646 bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
4647   MVT NVT = N->getSimpleValueType(0);
4648 
4649   // Make sure we support VPTERNLOG.
4650   if (!NVT.isVector() || !Subtarget->hasAVX512() ||
4651       NVT.getVectorElementType() == MVT::i1)
4652     return false;
4653 
4654   // We need VLX for 128/256-bit.
4655   if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
4656     return false;
4657 
4658   SDValue N0 = N->getOperand(0);
4659   SDValue N1 = N->getOperand(1);
4660 
4661   auto getFoldableLogicOp = [](SDValue Op) {
4662     // Peek through single use bitcast.
4663     if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse())
4664       Op = Op.getOperand(0);
4665 
4666     if (!Op.hasOneUse())
4667       return SDValue();
4668 
4669     unsigned Opc = Op.getOpcode();
4670     if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR ||
4671         Opc == X86ISD::ANDNP)
4672       return Op;
4673 
4674     return SDValue();
4675   };
4676 
4677   SDValue A, FoldableOp;
4678   if ((FoldableOp = getFoldableLogicOp(N1))) {
4679     A = N0;
4680   } else if ((FoldableOp = getFoldableLogicOp(N0))) {
4681     A = N1;
4682   } else
4683     return false;
4684 
4685   SDValue B = FoldableOp.getOperand(0);
4686   SDValue C = FoldableOp.getOperand(1);
4687   SDNode *ParentA = N;
4688   SDNode *ParentB = FoldableOp.getNode();
4689   SDNode *ParentC = FoldableOp.getNode();
4690 
4691   // We can build the appropriate control immediate by performing the logic
4692   // operation we're matching using these constants for A, B, and C.
4693   uint8_t TernlogMagicA = 0xf0;
4694   uint8_t TernlogMagicB = 0xcc;
4695   uint8_t TernlogMagicC = 0xaa;
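       // E.g. for (or A, (and B, C)): the inner AND gives 0xcc & 0xaa = 0x88,
       // and OR'ing in A's constant yields Imm = 0x88 | 0xf0 = 0xf8.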
4696 
4697   // Some of the inputs may be inverted, peek through them and invert the
4698   // magic values accordingly.
4699   // TODO: There may be a bitcast before the xor that we should peek through.
4700   auto PeekThroughNot = [](SDValue &Op, SDNode *&Parent, uint8_t &Magic) {
4701     if (Op.getOpcode() == ISD::XOR && Op.hasOneUse() &&
4702         ISD::isBuildVectorAllOnes(Op.getOperand(1).getNode())) {
4703       Magic = ~Magic;
4704       Parent = Op.getNode();
4705       Op = Op.getOperand(0);
4706     }
4707   };
4708 
4709   PeekThroughNot(A, ParentA, TernlogMagicA);
4710   PeekThroughNot(B, ParentB, TernlogMagicB);
4711   PeekThroughNot(C, ParentC, TernlogMagicC);
4712 
4713   uint8_t Imm;
4714   switch (FoldableOp.getOpcode()) {
4715   default: llvm_unreachable("Unexpected opcode!");
4716   case ISD::AND:      Imm = TernlogMagicB & TernlogMagicC; break;
4717   case ISD::OR:       Imm = TernlogMagicB | TernlogMagicC; break;
4718   case ISD::XOR:      Imm = TernlogMagicB ^ TernlogMagicC; break;
4719   case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break;
4720   }
4721 
4722   switch (N->getOpcode()) {
4723   default: llvm_unreachable("Unexpected opcode!");
4724   case X86ISD::ANDNP:
4725     if (A == N0)
4726       Imm &= ~TernlogMagicA;
4727     else
4728       Imm = ~(Imm) & TernlogMagicA;
4729     break;
4730   case ISD::AND: Imm &= TernlogMagicA; break;
4731   case ISD::OR:  Imm |= TernlogMagicA; break;
4732   case ISD::XOR: Imm ^= TernlogMagicA; break;
4733   }
4734 
4735   return matchVPTERNLOG(N, ParentA, ParentB, ParentC, A, B, C, Imm);
4736 }
4737 
4738 /// If the high bits of an 'and' operand are known zero, try setting the
4739 /// high bits of an 'and' constant operand to produce a smaller encoding by
4740 /// creating a small, sign-extended negative immediate rather than a large
4741 /// positive one. This reverses a transform in SimplifyDemandedBits that
4742 /// shrinks mask constants by clearing bits. There is also a possibility that
4743 /// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
4744 /// case, just replace the 'and'. Return 'true' if the node is replaced.
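     /// For example, if the upper 24 bits of the other operand are known zero,
     /// (x & 0x000000f0) can instead use the mask 0xfffffff0 (-16), which fits
     /// in a sign-extended 8-bit immediate.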
4745 bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
4746   // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
4747   // have immediate operands.
4748   MVT VT = And->getSimpleValueType(0);
4749   if (VT != MVT::i32 && VT != MVT::i64)
4750     return false;
4751 
4752   auto *And1C = dyn_cast<ConstantSDNode>(And->getOperand(1));
4753   if (!And1C)
4754     return false;
4755 
4756   // Bail out if the mask constant is already negative. It can't shrink further.
4757   // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel
4758   // patterns to use a 32-bit and instead of a 64-bit and by relying on the
4759   // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits
4760   // are negative too.
4761   APInt MaskVal = And1C->getAPIntValue();
4762   unsigned MaskLZ = MaskVal.countl_zero();
4763   if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
4764     return false;
4765 
4766   // Don't extend into the upper 32 bits of a 64 bit mask.
4767   if (VT == MVT::i64 && MaskLZ >= 32) {
4768     MaskLZ -= 32;
4769     MaskVal = MaskVal.trunc(32);
4770   }
4771 
4772   SDValue And0 = And->getOperand(0);
4773   APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ);
4774   APInt NegMaskVal = MaskVal | HighZeros;
4775 
4776   // If a negative constant would not allow a smaller encoding, there's no need
4777   // to continue. Only change the constant when we know it's a win.
4778   unsigned MinWidth = NegMaskVal.getSignificantBits();
4779   if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getSignificantBits() <= 32))
4780     return false;
4781 
4782   // Extend masks if we truncated above.
4783   if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
4784     NegMaskVal = NegMaskVal.zext(64);
4785     HighZeros = HighZeros.zext(64);
4786   }
4787 
4788   // The variable operand must be all zeros in the top bits to allow using the
4789   // new, negative constant as the mask.
4790   if (!CurDAG->MaskedValueIsZero(And0, HighZeros))
4791     return false;
4792 
4793   // Check if the mask is -1. In that case, this is an unnecessary instruction
4794   // that escaped earlier analysis.
4795   if (NegMaskVal.isAllOnes()) {
4796     ReplaceNode(And, And0.getNode());
4797     return true;
4798   }
4799 
4800   // A negative mask allows a smaller encoding. Create a new 'and' node.
4801   SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT);
4802   insertDAGNode(*CurDAG, SDValue(And, 0), NewMask);
4803   SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask);
4804   ReplaceNode(And, NewAnd.getNode());
4805   SelectCode(NewAnd.getNode());
4806   return true;
4807 }
4808 
4809 static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
4810                               bool FoldedBCast, bool Masked) {
4811 #define VPTESTM_CASE(VT, SUFFIX) \
4812 case MVT::VT: \
4813   if (Masked) \
4814     return IsTestN ? X86::VPTESTNM##SUFFIX##k: X86::VPTESTM##SUFFIX##k; \
4815   return IsTestN ? X86::VPTESTNM##SUFFIX : X86::VPTESTM##SUFFIX;
4816 
4817 
4818 #define VPTESTM_BROADCAST_CASES(SUFFIX) \
4819 default: llvm_unreachable("Unexpected VT!"); \
4820 VPTESTM_CASE(v4i32, DZ128##SUFFIX) \
4821 VPTESTM_CASE(v2i64, QZ128##SUFFIX) \
4822 VPTESTM_CASE(v8i32, DZ256##SUFFIX) \
4823 VPTESTM_CASE(v4i64, QZ256##SUFFIX) \
4824 VPTESTM_CASE(v16i32, DZ##SUFFIX) \
4825 VPTESTM_CASE(v8i64, QZ##SUFFIX)
4826 
4827 #define VPTESTM_FULL_CASES(SUFFIX) \
4828 VPTESTM_BROADCAST_CASES(SUFFIX) \
4829 VPTESTM_CASE(v16i8, BZ128##SUFFIX) \
4830 VPTESTM_CASE(v8i16, WZ128##SUFFIX) \
4831 VPTESTM_CASE(v32i8, BZ256##SUFFIX) \
4832 VPTESTM_CASE(v16i16, WZ256##SUFFIX) \
4833 VPTESTM_CASE(v64i8, BZ##SUFFIX) \
4834 VPTESTM_CASE(v32i16, WZ##SUFFIX)
4835 
4836   if (FoldedBCast) {
4837     switch (TestVT.SimpleTy) {
4838     VPTESTM_BROADCAST_CASES(rmb)
4839     }
4840   }
4841 
4842   if (FoldedLoad) {
4843     switch (TestVT.SimpleTy) {
4844     VPTESTM_FULL_CASES(rm)
4845     }
4846   }
4847 
4848   switch (TestVT.SimpleTy) {
4849   VPTESTM_FULL_CASES(rr)
4850   }
4851 
4852 #undef VPTESTM_FULL_CASES
4853 #undef VPTESTM_BROADCAST_CASES
4854 #undef VPTESTM_CASE
4855 }
4856 
4857 // Try to create VPTESTM instruction. If InMask is not null, it will be used
4858 // to form a masked operation.
4859 bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
4860                                  SDValue InMask) {
4861   assert(Subtarget->hasAVX512() && "Expected AVX512!");
4862   assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&
4863          "Unexpected VT!");
4864 
4865   // Look for equal and not equal compares.
4866   ISD::CondCode CC = cast<CondCodeSDNode>(Setcc.getOperand(2))->get();
4867   if (CC != ISD::SETEQ && CC != ISD::SETNE)
4868     return false;
4869 
4870   SDValue SetccOp0 = Setcc.getOperand(0);
4871   SDValue SetccOp1 = Setcc.getOperand(1);
4872 
4873   // Canonicalize the all zero vector to the RHS.
4874   if (ISD::isBuildVectorAllZeros(SetccOp0.getNode()))
4875     std::swap(SetccOp0, SetccOp1);
4876 
4877   // See if we're comparing against zero.
4878   if (!ISD::isBuildVectorAllZeros(SetccOp1.getNode()))
4879     return false;
4880 
4881   SDValue N0 = SetccOp0;
4882 
4883   MVT CmpVT = N0.getSimpleValueType();
4884   MVT CmpSVT = CmpVT.getVectorElementType();
4885 
4886   // Start with both operands the same. We'll try to refine this.
4887   SDValue Src0 = N0;
4888   SDValue Src1 = N0;
4889 
4890   {
4891     // Look through single use bitcasts.
4892     SDValue N0Temp = N0;
4893     if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse())
4894       N0Temp = N0.getOperand(0);
4895 
4896     // Look for single use AND.
4897     if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) {
4898       Src0 = N0Temp.getOperand(0);
4899       Src1 = N0Temp.getOperand(1);
4900     }
4901   }
4902 
4903   // Without VLX we need to widen the operation.
4904   bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector();
4905 
4906   auto tryFoldLoadOrBCast = [&](SDNode *Root, SDNode *P, SDValue &L,
4907                                 SDValue &Base, SDValue &Scale, SDValue &Index,
4908                                 SDValue &Disp, SDValue &Segment) {
4909     // If we need to widen, we can't fold the load.
4910     if (!Widen)
4911       if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment))
4912         return true;
4913 
4914     // If we didn't fold a load, try to match broadcast. No widening limitation
4915     // for this. But only 32 and 64 bit types are supported.
4916     if (CmpSVT != MVT::i32 && CmpSVT != MVT::i64)
4917       return false;
4918 
4919     // Look through single use bitcasts.
4920     if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
4921       P = L.getNode();
4922       L = L.getOperand(0);
4923     }
4924 
4925     if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
4926       return false;
4927 
4928     auto *MemIntr = cast<MemIntrinsicSDNode>(L);
4929     if (MemIntr->getMemoryVT().getSizeInBits() != CmpSVT.getSizeInBits())
4930       return false;
4931 
4932     return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment);
4933   };
4934 
4935   // We can only fold loads if the sources are unique.
4936   bool CanFoldLoads = Src0 != Src1;
4937 
4938   bool FoldedLoad = false;
4939   SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4940   if (CanFoldLoads) {
4941     FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src1, Tmp0, Tmp1, Tmp2,
4942                                     Tmp3, Tmp4);
4943     if (!FoldedLoad) {
4944       // AND is commutative.
4945       FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src0, Tmp0, Tmp1,
4946                                       Tmp2, Tmp3, Tmp4);
4947       if (FoldedLoad)
4948         std::swap(Src0, Src1);
4949     }
4950   }
4951 
4952   bool FoldedBCast = FoldedLoad && Src1.getOpcode() == X86ISD::VBROADCAST_LOAD;
4953 
4954   bool IsMasked = InMask.getNode() != nullptr;
4955 
4956   SDLoc dl(Root);
4957 
4958   MVT ResVT = Setcc.getSimpleValueType();
4959   MVT MaskVT = ResVT;
4960   if (Widen) {
4961     // Widen the inputs using insert_subreg or copy_to_regclass.
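         // E.g. without VLX a v4i32 compare is widened to v16i32 here, and the
         // resulting v16i1 mask is shrunk back to v4i1 further below.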
4962     unsigned Scale = CmpVT.is128BitVector() ? 4 : 2;
4963     unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm;
4964     unsigned NumElts = CmpVT.getVectorNumElements() * Scale;
4965     CmpVT = MVT::getVectorVT(CmpSVT, NumElts);
4966     MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
4967     SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl,
4968                                                      CmpVT), 0);
4969     Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0);
4970 
4971     if (!FoldedBCast)
4972       Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1);
4973 
4974     if (IsMasked) {
4975       // Widen the mask.
4976       unsigned RegClass = TLI->getRegClassFor(MaskVT)->getID();
4977       SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
4978       InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
4979                                               dl, MaskVT, InMask, RC), 0);
4980     }
4981   }
4982 
4983   bool IsTestN = CC == ISD::SETEQ;
4984   unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast,
4985                                IsMasked);
4986 
4987   MachineSDNode *CNode;
4988   if (FoldedLoad) {
4989     SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other);
4990 
4991     if (IsMasked) {
4992       SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
4993                         Src1.getOperand(0) };
4994       CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
4995     } else {
4996       SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
4997                         Src1.getOperand(0) };
4998       CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
4999     }
5000 
5001     // Update the chain.
5002     ReplaceUses(Src1.getValue(1), SDValue(CNode, 1));
5003     // Record the mem-refs
5004     CurDAG->setNodeMemRefs(CNode, {cast<MemSDNode>(Src1)->getMemOperand()});
5005   } else {
5006     if (IsMasked)
5007       CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1);
5008     else
5009       CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, Src0, Src1);
5010   }
5011 
5012   // If we widened, we need to shrink the mask VT.
5013   if (Widen) {
5014     unsigned RegClass = TLI->getRegClassFor(ResVT)->getID();
5015     SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
5016     CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
5017                                    dl, ResVT, SDValue(CNode, 0), RC);
5018   }
5019 
5020   ReplaceUses(SDValue(Root, 0), SDValue(CNode, 0));
5021   CurDAG->RemoveDeadNode(Root);
5022   return true;
5023 }
5024 
5025 // Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it
5026 // into vpternlog.
5027 bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) {
5028   assert(N->getOpcode() == ISD::OR && "Unexpected opcode!");
5029 
5030   MVT NVT = N->getSimpleValueType(0);
5031 
5032   // Make sure we support VPTERNLOG.
5033   if (!NVT.isVector() || !Subtarget->hasAVX512())
5034     return false;
5035 
5036   // We need VLX for 128/256-bit.
5037   if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
5038     return false;
5039 
5040   SDValue N0 = N->getOperand(0);
5041   SDValue N1 = N->getOperand(1);
5042 
5043   // Canonicalize AND to LHS.
5044   if (N1.getOpcode() == ISD::AND)
5045     std::swap(N0, N1);
5046 
5047   if (N0.getOpcode() != ISD::AND ||
5048       N1.getOpcode() != X86ISD::ANDNP ||
5049       !N0.hasOneUse() || !N1.hasOneUse())
5050     return false;
5051 
5052   // ANDN is not commutable, use it to pin down A and C.
5053   SDValue A = N1.getOperand(0);
5054   SDValue C = N1.getOperand(1);
5055 
5056   // AND is commutable; if one operand matches A, the other operand is B.
5057   // Otherwise this isn't a match.
5058   SDValue B;
5059   if (N0.getOperand(0) == A)
5060     B = N0.getOperand(1);
5061   else if (N0.getOperand(1) == A)
5062     B = N0.getOperand(0);
5063   else
5064     return false;
5065 
5066   SDLoc dl(N);
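       // 0xCA is the bitselect truth table: (0xf0 & 0xcc) | (~0xf0 & 0xaa) =
       // 0xc0 | 0x0a = 0xCA, i.e. take B where A is set and C where A is clear.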
5067   SDValue Imm = CurDAG->getTargetConstant(0xCA, dl, MVT::i8);
5068   SDValue Ternlog = CurDAG->getNode(X86ISD::VPTERNLOG, dl, NVT, A, B, C, Imm);
5069   ReplaceNode(N, Ternlog.getNode());
5070 
5071   return matchVPTERNLOG(Ternlog.getNode(), Ternlog.getNode(), Ternlog.getNode(),
5072                         Ternlog.getNode(), A, B, C, 0xCA);
5073 }
5074 
5075 void X86DAGToDAGISel::Select(SDNode *Node) {
5076   MVT NVT = Node->getSimpleValueType(0);
5077   unsigned Opcode = Node->getOpcode();
5078   SDLoc dl(Node);
5079 
5080   if (Node->isMachineOpcode()) {
5081     LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
5082     Node->setNodeId(-1);
5083     return;   // Already selected.
5084   }
5085 
5086   switch (Opcode) {
5087   default: break;
5088   case ISD::INTRINSIC_W_CHAIN: {
5089     unsigned IntNo = Node->getConstantOperandVal(1);
5090     switch (IntNo) {
5091     default: break;
5092     case Intrinsic::x86_encodekey128:
5093     case Intrinsic::x86_encodekey256: {
5094       if (!Subtarget->hasKL())
5095         break;
5096 
5097       unsigned Opcode;
5098       switch (IntNo) {
5099       default: llvm_unreachable("Impossible intrinsic");
5100       case Intrinsic::x86_encodekey128:
5101         Opcode = X86::ENCODEKEY128;
5102         break;
5103       case Intrinsic::x86_encodekey256:
5104         Opcode = X86::ENCODEKEY256;
5105         break;
5106       }
5107 
5108       SDValue Chain = Node->getOperand(0);
5109       Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(3),
5110                                    SDValue());
5111       if (Opcode == X86::ENCODEKEY256)
5112         Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(4),
5113                                      Chain.getValue(1));
5114 
5115       MachineSDNode *Res = CurDAG->getMachineNode(
5116           Opcode, dl, Node->getVTList(),
5117           {Node->getOperand(2), Chain, Chain.getValue(1)});
5118       ReplaceNode(Node, Res);
5119       return;
5120     }
5121     case Intrinsic::x86_tileloadd64_internal:
5122     case Intrinsic::x86_tileloaddt164_internal: {
5123       if (!Subtarget->hasAMXTILE())
5124         break;
5125       auto *MFI =
5126           CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5127       MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
5128       unsigned Opc = IntNo == Intrinsic::x86_tileloadd64_internal
5129                          ? X86::PTILELOADDV
5130                          : X86::PTILELOADDT1V;
5131       // _tile_loadd_internal(row, col, buf, STRIDE)
5132       SDValue Base = Node->getOperand(4);
5133       SDValue Scale = getI8Imm(1, dl);
5134       SDValue Index = Node->getOperand(5);
5135       SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5136       SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5137       SDValue Chain = Node->getOperand(0);
5138       MachineSDNode *CNode;
5139       SDValue Ops[] = {Node->getOperand(2),
5140                        Node->getOperand(3),
5141                        Base,
5142                        Scale,
5143                        Index,
5144                        Disp,
5145                        Segment,
5146                        Chain};
5147       CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
5148       ReplaceNode(Node, CNode);
5149       return;
5150     }
5151     }
5152     break;
5153   }
5154   case ISD::INTRINSIC_VOID: {
5155     unsigned IntNo = Node->getConstantOperandVal(1);
5156     switch (IntNo) {
5157     default: break;
5158     case Intrinsic::x86_sse3_monitor:
5159     case Intrinsic::x86_monitorx:
5160     case Intrinsic::x86_clzero: {
5161       bool Use64BitPtr = Node->getOperand(2).getValueType() == MVT::i64;
5162 
5163       unsigned Opc = 0;
5164       switch (IntNo) {
5165       default: llvm_unreachable("Unexpected intrinsic!");
5166       case Intrinsic::x86_sse3_monitor:
5167         if (!Subtarget->hasSSE3())
5168           break;
5169         Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr;
5170         break;
5171       case Intrinsic::x86_monitorx:
5172         if (!Subtarget->hasMWAITX())
5173           break;
5174         Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr;
5175         break;
5176       case Intrinsic::x86_clzero:
5177         if (!Subtarget->hasCLZERO())
5178           break;
5179         Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r;
5180         break;
5181       }
5182 
5183       if (Opc) {
5184         unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX;
5185         SDValue Chain = CurDAG->getCopyToReg(Node->getOperand(0), dl, PtrReg,
5186                                              Node->getOperand(2), SDValue());
5187         SDValue InGlue = Chain.getValue(1);
5188 
5189         if (IntNo == Intrinsic::x86_sse3_monitor ||
5190             IntNo == Intrinsic::x86_monitorx) {
5191           // Copy the other two operands to ECX and EDX.
5192           Chain = CurDAG->getCopyToReg(Chain, dl, X86::ECX, Node->getOperand(3),
5193                                        InGlue);
5194           InGlue = Chain.getValue(1);
5195           Chain = CurDAG->getCopyToReg(Chain, dl, X86::EDX, Node->getOperand(4),
5196                                        InGlue);
5197           InGlue = Chain.getValue(1);
5198         }
5199 
5200         MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other,
5201                                                       { Chain, InGlue});
5202         ReplaceNode(Node, CNode);
5203         return;
5204       }
5205 
5206       break;
5207     }
5208     case Intrinsic::x86_tilestored64_internal: {
5209       auto *MFI =
5210           CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5211       MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
5212       unsigned Opc = X86::PTILESTOREDV;
5213       // _tile_stored_internal(row, col, buf, STRIDE, c)
5214       SDValue Base = Node->getOperand(4);
5215       SDValue Scale = getI8Imm(1, dl);
5216       SDValue Index = Node->getOperand(5);
5217       SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5218       SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5219       SDValue Chain = Node->getOperand(0);
5220       MachineSDNode *CNode;
5221       SDValue Ops[] = {Node->getOperand(2),
5222                        Node->getOperand(3),
5223                        Base,
5224                        Scale,
5225                        Index,
5226                        Disp,
5227                        Segment,
5228                        Node->getOperand(6),
5229                        Chain};
5230       CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5231       ReplaceNode(Node, CNode);
5232       return;
5233     }
5234     case Intrinsic::x86_tileloadd64:
5235     case Intrinsic::x86_tileloaddt164:
5236     case Intrinsic::x86_tilestored64: {
5237       if (!Subtarget->hasAMXTILE())
5238         break;
5239       auto *MFI =
5240           CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5241       MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
5242       unsigned Opc;
5243       switch (IntNo) {
5244       default: llvm_unreachable("Unexpected intrinsic!");
5245       case Intrinsic::x86_tileloadd64:   Opc = X86::PTILELOADD; break;
5246       case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break;
5247       case Intrinsic::x86_tilestored64:  Opc = X86::PTILESTORED; break;
5248       }
5249       // FIXME: Match displacement and scale.
5250       unsigned TIndex = Node->getConstantOperandVal(2);
5251       SDValue TReg = getI8Imm(TIndex, dl);
5252       SDValue Base = Node->getOperand(3);
5253       SDValue Scale = getI8Imm(1, dl);
5254       SDValue Index = Node->getOperand(4);
5255       SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5256       SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5257       SDValue Chain = Node->getOperand(0);
5258       MachineSDNode *CNode;
5259       if (Opc == X86::PTILESTORED) {
5260         SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain };
5261         CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5262       } else {
5263         SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain };
5264         CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5265       }
5266       ReplaceNode(Node, CNode);
5267       return;
5268     }
5269     }
5270     break;
5271   }
5272   case ISD::BRIND:
5273   case X86ISD::NT_BRIND: {
5274     if (Subtarget->isTargetNaCl())
5275       // NaCl has its own pass where jmp %r32 is converted to jmp %r64. We
5276       // leave the instruction alone.
5277       break;
5278     if (Subtarget->isTarget64BitILP32()) {
5279       // Converts a 32-bit register to a 64-bit, zero-extended version of
5280       // it. This is needed because x86-64 can do many things, but jmp %r32
5281       // ain't one of them.
5282       SDValue Target = Node->getOperand(1);
5283       assert(Target.getValueType() == MVT::i32 && "Unexpected VT!");
5284       SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, MVT::i64);
5285       SDValue Brind = CurDAG->getNode(Opcode, dl, MVT::Other,
5286                                       Node->getOperand(0), ZextTarget);
5287       ReplaceNode(Node, Brind.getNode());
5288       SelectCode(ZextTarget.getNode());
5289       SelectCode(Brind.getNode());
5290       return;
5291     }
5292     break;
5293   }
5294   case X86ISD::GlobalBaseReg:
5295     ReplaceNode(Node, getGlobalBaseReg());
5296     return;
5297 
5298   case ISD::BITCAST:
5299     // Just drop all 128/256/512-bit bitcasts.
5300     if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() ||
5301         NVT == MVT::f128) {
5302       ReplaceUses(SDValue(Node, 0), Node->getOperand(0));
5303       CurDAG->RemoveDeadNode(Node);
5304       return;
5305     }
5306     break;
5307 
5308   case ISD::SRL:
5309     if (matchBitExtract(Node))
5310       return;
5311     [[fallthrough]];
5312   case ISD::SRA:
5313   case ISD::SHL:
5314     if (tryShiftAmountMod(Node))
5315       return;
5316     break;
5317 
5318   case X86ISD::VPTERNLOG: {
5319     uint8_t Imm = Node->getConstantOperandVal(3);
5320     if (matchVPTERNLOG(Node, Node, Node, Node, Node->getOperand(0),
5321                        Node->getOperand(1), Node->getOperand(2), Imm))
5322       return;
5323     break;
5324   }
5325 
5326   case X86ISD::ANDNP:
5327     if (tryVPTERNLOG(Node))
5328       return;
5329     break;
5330 
5331   case ISD::AND:
5332     if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
5333       // Try to form a masked VPTESTM. Operands can be in either order.
5334       SDValue N0 = Node->getOperand(0);
5335       SDValue N1 = Node->getOperand(1);
5336       if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
5337           tryVPTESTM(Node, N0, N1))
5338         return;
5339       if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
5340           tryVPTESTM(Node, N1, N0))
5341         return;
5342     }
5343 
5344     if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
5345       ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
5346       CurDAG->RemoveDeadNode(Node);
5347       return;
5348     }
5349     if (matchBitExtract(Node))
5350       return;
5351     if (AndImmShrink && shrinkAndImmediate(Node))
5352       return;
5353 
5354     [[fallthrough]];
5355   case ISD::OR:
5356   case ISD::XOR:
5357     if (tryShrinkShlLogicImm(Node))
5358       return;
5359     if (Opcode == ISD::OR && tryMatchBitSelect(Node))
5360       return;
5361     if (tryVPTERNLOG(Node))
5362       return;
5363 
5364     [[fallthrough]];
5365   case ISD::ADD:
5366     if (Opcode == ISD::ADD && matchBitExtract(Node))
5367       return;
5368     [[fallthrough]];
5369   case ISD::SUB: {
5370     // Try to avoid folding immediates with multiple uses for optsize.
5371     // This code tries to select to register form directly to avoid going
5372     // through the isel table which might fold the immediate. We can't change
5373     // the add/sub/and/or/xor-with-immediate patterns in the tablegen
5374     // files to check the immediate use count without making the patterns
5375     // unavailable to the fast-isel table.
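         // For example, if the same 32-bit immediate feeds several instructions
         // under optsize, keeping it in one register and selecting the rr form
         // here can be smaller than repeating a 4-byte immediate at every use.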
5376     if (!CurDAG->shouldOptForSize())
5377       break;
5378 
5379     // Only handle i8/i16/i32/i64.
5380     if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64)
5381       break;
5382 
5383     SDValue N0 = Node->getOperand(0);
5384     SDValue N1 = Node->getOperand(1);
5385 
5386     auto *Cst = dyn_cast<ConstantSDNode>(N1);
5387     if (!Cst)
5388       break;
5389 
5390     int64_t Val = Cst->getSExtValue();
5391 
5392     // Make sure it's an immediate that is considered foldable.
5393     // FIXME: Handle unsigned 32 bit immediates for 64-bit AND.
5394     if (!isInt<8>(Val) && !isInt<32>(Val))
5395       break;
5396 
5397     // If this can match to INC/DEC, let it go.
5398     if (Opcode == ISD::ADD && (Val == 1 || Val == -1))
5399       break;
5400 
5401     // Check if we should avoid folding this immediate.
5402     if (!shouldAvoidImmediateInstFormsForSize(N1.getNode()))
5403       break;
5404 
5405     // We should not fold the immediate. So we need a register form instead.
5406     unsigned ROpc, MOpc;
5407     switch (NVT.SimpleTy) {
5408     default: llvm_unreachable("Unexpected VT!");
5409     case MVT::i8:
5410       switch (Opcode) {
5411       default: llvm_unreachable("Unexpected opcode!");
5412       case ISD::ADD:
5413         ROpc = GET_ND_IF_ENABLED(X86::ADD8rr);
5414         MOpc = GET_ND_IF_ENABLED(X86::ADD8rm);
5415         break;
5416       case ISD::SUB:
5417         ROpc = GET_ND_IF_ENABLED(X86::SUB8rr);
5418         MOpc = GET_ND_IF_ENABLED(X86::SUB8rm);
5419         break;
5420       case ISD::AND:
5421         ROpc = GET_ND_IF_ENABLED(X86::AND8rr);
5422         MOpc = GET_ND_IF_ENABLED(X86::AND8rm);
5423         break;
5424       case ISD::OR:
5425         ROpc = GET_ND_IF_ENABLED(X86::OR8rr);
5426         MOpc = GET_ND_IF_ENABLED(X86::OR8rm);
5427         break;
5428       case ISD::XOR:
5429         ROpc = GET_ND_IF_ENABLED(X86::XOR8rr);
5430         MOpc = GET_ND_IF_ENABLED(X86::XOR8rm);
5431         break;
5432       }
5433       break;
5434     case MVT::i16:
5435       switch (Opcode) {
5436       default: llvm_unreachable("Unexpected opcode!");
5437       case ISD::ADD:
5438         ROpc = GET_ND_IF_ENABLED(X86::ADD16rr);
5439         MOpc = GET_ND_IF_ENABLED(X86::ADD16rm);
5440         break;
5441       case ISD::SUB:
5442         ROpc = GET_ND_IF_ENABLED(X86::SUB16rr);
5443         MOpc = GET_ND_IF_ENABLED(X86::SUB16rm);
5444         break;
5445       case ISD::AND:
5446         ROpc = GET_ND_IF_ENABLED(X86::AND16rr);
5447         MOpc = GET_ND_IF_ENABLED(X86::AND16rm);
5448         break;
5449       case ISD::OR:
5450         ROpc = GET_ND_IF_ENABLED(X86::OR16rr);
5451         MOpc = GET_ND_IF_ENABLED(X86::OR16rm);
5452         break;
5453       case ISD::XOR:
5454         ROpc = GET_ND_IF_ENABLED(X86::XOR16rr);
5455         MOpc = GET_ND_IF_ENABLED(X86::XOR16rm);
5456         break;
5457       }
5458       break;
5459     case MVT::i32:
5460       switch (Opcode) {
5461       default: llvm_unreachable("Unexpected opcode!");
5462       case ISD::ADD:
5463         ROpc = GET_ND_IF_ENABLED(X86::ADD32rr);
5464         MOpc = GET_ND_IF_ENABLED(X86::ADD32rm);
5465         break;
5466       case ISD::SUB:
5467         ROpc = GET_ND_IF_ENABLED(X86::SUB32rr);
5468         MOpc = GET_ND_IF_ENABLED(X86::SUB32rm);
5469         break;
5470       case ISD::AND:
5471         ROpc = GET_ND_IF_ENABLED(X86::AND32rr);
5472         MOpc = GET_ND_IF_ENABLED(X86::AND32rm);
5473         break;
5474       case ISD::OR:
5475         ROpc = GET_ND_IF_ENABLED(X86::OR32rr);
5476         MOpc = GET_ND_IF_ENABLED(X86::OR32rm);
5477         break;
5478       case ISD::XOR:
5479         ROpc = GET_ND_IF_ENABLED(X86::XOR32rr);
5480         MOpc = GET_ND_IF_ENABLED(X86::XOR32rm);
5481         break;
5482       }
5483       break;
5484     case MVT::i64:
5485       switch (Opcode) {
5486       default: llvm_unreachable("Unexpected opcode!");
5487       case ISD::ADD:
5488         ROpc = GET_ND_IF_ENABLED(X86::ADD64rr);
5489         MOpc = GET_ND_IF_ENABLED(X86::ADD64rm);
5490         break;
5491       case ISD::SUB:
5492         ROpc = GET_ND_IF_ENABLED(X86::SUB64rr);
5493         MOpc = GET_ND_IF_ENABLED(X86::SUB64rm);
5494         break;
5495       case ISD::AND:
5496         ROpc = GET_ND_IF_ENABLED(X86::AND64rr);
5497         MOpc = GET_ND_IF_ENABLED(X86::AND64rm);
5498         break;
5499       case ISD::OR:
5500         ROpc = GET_ND_IF_ENABLED(X86::OR64rr);
5501         MOpc = GET_ND_IF_ENABLED(X86::OR64rm);
5502         break;
5503       case ISD::XOR:
5504         ROpc = GET_ND_IF_ENABLED(X86::XOR64rr);
5505         MOpc = GET_ND_IF_ENABLED(X86::XOR64rm);
5506         break;
5507       }
5508       break;
5509     }
5510 
5511     // OK, this is an AND/OR/XOR/ADD/SUB with a constant.
5512 
5513     // If this is not a subtract, we can still try to fold a load.
5514     if (Opcode != ISD::SUB) {
5515       SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5516       if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
5517         SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
5518         SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
5519         MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5520         // Update the chain.
5521         ReplaceUses(N0.getValue(1), SDValue(CNode, 2));
5522         // Record the mem-refs
5523         CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N0)->getMemOperand()});
5524         ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
5525         CurDAG->RemoveDeadNode(Node);
5526         return;
5527       }
5528     }
5529 
5530     CurDAG->SelectNodeTo(Node, ROpc, NVT, MVT::i32, N0, N1);
5531     return;
5532   }
5533 
5534   case X86ISD::SMUL:
5535     // i16/i32/i64 are handled with isel patterns.
5536     if (NVT != MVT::i8)
5537       break;
5538     [[fallthrough]];
5539   case X86ISD::UMUL: {
5540     SDValue N0 = Node->getOperand(0);
5541     SDValue N1 = Node->getOperand(1);
5542 
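         // The one-operand MUL/IMUL forms read their other factor implicitly
         // from AL/AX/EAX/RAX and write the product to AX or (E|R)DX:(E|R)AX.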
5543     unsigned LoReg, ROpc, MOpc;
5544     switch (NVT.SimpleTy) {
5545     default: llvm_unreachable("Unsupported VT!");
5546     case MVT::i8:
5547       LoReg = X86::AL;
5548       ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r;
5549       MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m;
5550       break;
5551     case MVT::i16:
5552       LoReg = X86::AX;
5553       ROpc = X86::MUL16r;
5554       MOpc = X86::MUL16m;
5555       break;
5556     case MVT::i32:
5557       LoReg = X86::EAX;
5558       ROpc = X86::MUL32r;
5559       MOpc = X86::MUL32m;
5560       break;
5561     case MVT::i64:
5562       LoReg = X86::RAX;
5563       ROpc = X86::MUL64r;
5564       MOpc = X86::MUL64m;
5565       break;
5566     }
5567 
5568     SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5569     bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5570     // Multiply is commutative.
5571     if (!FoldedLoad) {
5572       FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5573       if (FoldedLoad)
5574         std::swap(N0, N1);
5575     }
5576 
5577     SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
5578                                           N0, SDValue()).getValue(1);
5579 
5580     MachineSDNode *CNode;
5581     if (FoldedLoad) {
5582       // i16/i32/i64 use an instruction that produces a low and high result even
5583       // though only the low result is used.
5584       SDVTList VTs;
5585       if (NVT == MVT::i8)
5586         VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
5587       else
5588         VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other);
5589 
5590       SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
5591                         InGlue };
5592       CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5593 
5594       // Update the chain.
5595       ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 2 : 3));
5596       // Record the mem-refs
5597       CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
5598     } else {
5599       // i16/i32/i64 use an instruction that produces a low and high result even
5600       // though only the low result is used.
5601       SDVTList VTs;
5602       if (NVT == MVT::i8)
5603         VTs = CurDAG->getVTList(NVT, MVT::i32);
5604       else
5605         VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);
5606 
5607       CNode = CurDAG->getMachineNode(ROpc, dl, VTs, {N1, InGlue});
5608     }
5609 
5610     ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
5611     ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 1 : 2));
5612     CurDAG->RemoveDeadNode(Node);
5613     return;
5614   }
5615 
5616   case ISD::SMUL_LOHI:
5617   case ISD::UMUL_LOHI: {
5618     SDValue N0 = Node->getOperand(0);
5619     SDValue N1 = Node->getOperand(1);
5620 
5621     unsigned Opc, MOpc;
5622     unsigned LoReg, HiReg;
5623     bool IsSigned = Opcode == ISD::SMUL_LOHI;
5624     bool UseMULX = !IsSigned && Subtarget->hasBMI2();
5625     bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty();
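         // MULX (BMI2) reads its implicit source from (E|R)DX rather than
         // (E|R)AX, writes both result halves to explicit destinations, and does
         // not clobber EFLAGS; LoReg below is chosen accordingly.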
5626     switch (NVT.SimpleTy) {
5627     default: llvm_unreachable("Unsupported VT!");
5628     case MVT::i32:
5629       Opc = UseMULXHi  ? X86::MULX32Hrr
5630             : UseMULX  ? GET_EGPR_IF_ENABLED(X86::MULX32rr)
5631             : IsSigned ? X86::IMUL32r
5632                        : X86::MUL32r;
5633       MOpc = UseMULXHi  ? X86::MULX32Hrm
5634              : UseMULX  ? GET_EGPR_IF_ENABLED(X86::MULX32rm)
5635              : IsSigned ? X86::IMUL32m
5636                         : X86::MUL32m;
5637       LoReg = UseMULX ? X86::EDX : X86::EAX;
5638       HiReg = X86::EDX;
5639       break;
5640     case MVT::i64:
5641       Opc = UseMULXHi  ? X86::MULX64Hrr
5642             : UseMULX  ? GET_EGPR_IF_ENABLED(X86::MULX64rr)
5643             : IsSigned ? X86::IMUL64r
5644                        : X86::MUL64r;
5645       MOpc = UseMULXHi  ? X86::MULX64Hrm
5646              : UseMULX  ? GET_EGPR_IF_ENABLED(X86::MULX64rm)
5647              : IsSigned ? X86::IMUL64m
5648                         : X86::MUL64m;
5649       LoReg = UseMULX ? X86::RDX : X86::RAX;
5650       HiReg = X86::RDX;
5651       break;
5652     }
5653 
5654     SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5655     bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5656     // Multiply is commutative.
5657     if (!foldedLoad) {
5658       foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5659       if (foldedLoad)
5660         std::swap(N0, N1);
5661     }
5662 
5663     SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
5664                                           N0, SDValue()).getValue(1);
5665     SDValue ResHi, ResLo;
5666     if (foldedLoad) {
5667       SDValue Chain;
5668       MachineSDNode *CNode = nullptr;
5669       SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
5670                         InGlue };
5671       if (UseMULXHi) {
5672         SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
5673         CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5674         ResHi = SDValue(CNode, 0);
5675         Chain = SDValue(CNode, 1);
5676       } else if (UseMULX) {
5677         SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other);
5678         CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5679         ResHi = SDValue(CNode, 0);
5680         ResLo = SDValue(CNode, 1);
5681         Chain = SDValue(CNode, 2);
5682       } else {
5683         SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
5684         CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5685         Chain = SDValue(CNode, 0);
5686         InGlue = SDValue(CNode, 1);
5687       }
5688 
5689       // Update the chain.
5690       ReplaceUses(N1.getValue(1), Chain);
5691       // Record the mem-refs
5692       CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
5693     } else {
5694       SDValue Ops[] = { N1, InGlue };
5695       if (UseMULXHi) {
5696         SDVTList VTs = CurDAG->getVTList(NVT);
5697         SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5698         ResHi = SDValue(CNode, 0);
5699       } else if (UseMULX) {
5700         SDVTList VTs = CurDAG->getVTList(NVT, NVT);
5701         SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5702         ResHi = SDValue(CNode, 0);
5703         ResLo = SDValue(CNode, 1);
5704       } else {
5705         SDVTList VTs = CurDAG->getVTList(MVT::Glue);
5706         SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5707         InGlue = SDValue(CNode, 0);
5708       }
5709     }
5710 
5711     // Copy the low half of the result, if it is needed.
5712     if (!SDValue(Node, 0).use_empty()) {
5713       if (!ResLo) {
5714         assert(LoReg && "Register for low half is not defined!");
5715         ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
5716                                        NVT, InGlue);
5717         InGlue = ResLo.getValue(2);
5718       }
5719       ReplaceUses(SDValue(Node, 0), ResLo);
5720       LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
5721                  dbgs() << '\n');
5722     }
5723     // Copy the high half of the result, if it is needed.
5724     if (!SDValue(Node, 1).use_empty()) {
5725       if (!ResHi) {
5726         assert(HiReg && "Register for high half is not defined!");
5727         ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
5728                                        NVT, InGlue);
5729         InGlue = ResHi.getValue(2);
5730       }
5731       ReplaceUses(SDValue(Node, 1), ResHi);
5732       LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
5733                  dbgs() << '\n');
5734     }
5735 
5736     CurDAG->RemoveDeadNode(Node);
5737     return;
5738   }
5739 
5740   case ISD::SDIVREM:
5741   case ISD::UDIVREM: {
5742     SDValue N0 = Node->getOperand(0);
5743     SDValue N1 = Node->getOperand(1);
5744 
5745     unsigned ROpc, MOpc;
5746     bool isSigned = Opcode == ISD::SDIVREM;
5747     if (!isSigned) {
5748       switch (NVT.SimpleTy) {
5749       default: llvm_unreachable("Unsupported VT!");
5750       case MVT::i8:  ROpc = X86::DIV8r;  MOpc = X86::DIV8m;  break;
5751       case MVT::i16: ROpc = X86::DIV16r; MOpc = X86::DIV16m; break;
5752       case MVT::i32: ROpc = X86::DIV32r; MOpc = X86::DIV32m; break;
5753       case MVT::i64: ROpc = X86::DIV64r; MOpc = X86::DIV64m; break;
5754       }
5755     } else {
5756       switch (NVT.SimpleTy) {
5757       default: llvm_unreachable("Unsupported VT!");
5758       case MVT::i8:  ROpc = X86::IDIV8r;  MOpc = X86::IDIV8m;  break;
5759       case MVT::i16: ROpc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
5760       case MVT::i32: ROpc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
5761       case MVT::i64: ROpc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
5762       }
5763     }
5764 
5765     unsigned LoReg, HiReg, ClrReg;
5766     unsigned SExtOpcode;
5767     switch (NVT.SimpleTy) {
5768     default: llvm_unreachable("Unsupported VT!");
5769     case MVT::i8:
5770       LoReg = X86::AL;  ClrReg = HiReg = X86::AH;
5771       SExtOpcode = 0; // Not used.
5772       break;
5773     case MVT::i16:
5774       LoReg = X86::AX;  HiReg = X86::DX;
5775       ClrReg = X86::DX;
5776       SExtOpcode = X86::CWD;
5777       break;
5778     case MVT::i32:
5779       LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
5780       SExtOpcode = X86::CDQ;
5781       break;
5782     case MVT::i64:
5783       LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
5784       SExtOpcode = X86::CQO;
5785       break;
5786     }
5787 
5788     SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5789     bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5790     bool signBitIsZero = CurDAG->SignBitIsZero(N0);
5791 
5792     SDValue InGlue;
5793     if (NVT == MVT::i8) {
5794       // Special case for div8: widen the dividend into AX with a sign- or
5795       // zero-extending move so the upper 8 bits (AH) are set up correctly.
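           // The 8-bit divide reads its 16-bit dividend from AX (AL = low byte,
           // AH = high byte), so widening the operand into AX replaces the
           // CWD/CDQ/CQO or zeroing setup used for the wider types.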
5796       SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain;
5797       MachineSDNode *Move;
5798       if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
5799         SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
5800         unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8
5801                                                     : X86::MOVZX16rm8;
5802         Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, MVT::Other, Ops);
5803         Chain = SDValue(Move, 1);
5804         ReplaceUses(N0.getValue(1), Chain);
5805         // Record the mem-refs
5806         CurDAG->setNodeMemRefs(Move, {cast<LoadSDNode>(N0)->getMemOperand()});
5807       } else {
5808         unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8
5809                                                     : X86::MOVZX16rr8;
5810         Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, N0);
5811         Chain = CurDAG->getEntryNode();
5812       }
5813       Chain  = CurDAG->getCopyToReg(Chain, dl, X86::AX, SDValue(Move, 0),
5814                                     SDValue());
5815       InGlue = Chain.getValue(1);
5816     } else {
5817       InGlue =
5818         CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
5819                              LoReg, N0, SDValue()).getValue(1);
5820       if (isSigned && !signBitIsZero) {
5821         // Sign extend the low part into the high part.
5822         InGlue =
5823           SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InGlue),0);
5824       } else {
5825         // Zero out the high part, effectively zero extending the input.
5826         SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
5827         SDValue ClrNode = SDValue(
5828             CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, std::nullopt), 0);
5829         switch (NVT.SimpleTy) {
5830         case MVT::i16:
5831           ClrNode =
5832               SDValue(CurDAG->getMachineNode(
5833                           TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
5834                           CurDAG->getTargetConstant(X86::sub_16bit, dl,
5835                                                     MVT::i32)),
5836                       0);
5837           break;
5838         case MVT::i32:
5839           break;
5840         case MVT::i64:
5841           ClrNode =
5842               SDValue(CurDAG->getMachineNode(
5843                           TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
5844                           CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
5845                           CurDAG->getTargetConstant(X86::sub_32bit, dl,
5846                                                     MVT::i32)),
5847                       0);
5848           break;
5849         default:
5850           llvm_unreachable("Unexpected division source");
5851         }
5852 
5853         InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
5854                                       ClrNode, InGlue).getValue(1);
5855       }
5856     }
5857 
5858     if (foldedLoad) {
5859       SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
5860                         InGlue };
5861       MachineSDNode *CNode =
5862         CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
5863       InGlue = SDValue(CNode, 1);
5864       // Update the chain.
5865       ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
5866       // Record the mem-refs
5867       CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
5868     } else {
5869       InGlue =
5870         SDValue(CurDAG->getMachineNode(ROpc, dl, MVT::Glue, N1, InGlue), 0);
5871     }
5872 
5873     // Prevent use of AH in a REX instruction by explicitly copying it to
5874     // an ABCD_L register.
5875     //
5876     // The current assumption of the register allocator is that isel
5877     // won't generate explicit references to the GR8_ABCD_H registers. If
5878     // the allocator and/or the backend get enhanced to be more robust in
5879     // that regard, this can be, and should be, removed.
5880     if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
5881       SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
5882       unsigned AHExtOpcode =
5883           isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;
5884 
5885       SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
5886                                              MVT::Glue, AHCopy, InGlue);
5887       SDValue Result(RNode, 0);
5888       InGlue = SDValue(RNode, 1);
5889 
5890       Result =
5891           CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);
5892 
5893       ReplaceUses(SDValue(Node, 1), Result);
5894       LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
5895                  dbgs() << '\n');
5896     }
5897     // Copy the division (low) result, if it is needed.
5898     if (!SDValue(Node, 0).use_empty()) {
5899       SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
5900                                                 LoReg, NVT, InGlue);
5901       InGlue = Result.getValue(2);
5902       ReplaceUses(SDValue(Node, 0), Result);
5903       LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
5904                  dbgs() << '\n');
5905     }
5906     // Copy the remainder (high) result, if it is needed.
5907     if (!SDValue(Node, 1).use_empty()) {
5908       SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
5909                                               HiReg, NVT, InGlue);
5910       InGlue = Result.getValue(2);
5911       ReplaceUses(SDValue(Node, 1), Result);
5912       LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
5913                  dbgs() << '\n');
5914     }
5915     CurDAG->RemoveDeadNode(Node);
5916     return;
5917   }
5918 
5919   case X86ISD::FCMP:
5920   case X86ISD::STRICT_FCMP:
5921   case X86ISD::STRICT_FCMPS: {
5922     bool IsStrictCmp = Node->getOpcode() == X86ISD::STRICT_FCMP ||
5923                        Node->getOpcode() == X86ISD::STRICT_FCMPS;
5924     SDValue N0 = Node->getOperand(IsStrictCmp ? 1 : 0);
5925     SDValue N1 = Node->getOperand(IsStrictCmp ? 2 : 1);
5926 
5927     // Save the original VT of the compare.
5928     MVT CmpVT = N0.getSimpleValueType();
5929 
5930     // Floating point needs special handling if we don't have FCOMI.
5931     if (Subtarget->canUseCMOV())
5932       break;
5933 
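         // Without FCOMI we compare on the x87 stack, copy the FPU status word
         // to AX with FNSTSW, and transfer the condition bits into EFLAGS with
         // SAHF below.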
5934     bool IsSignaling = Node->getOpcode() == X86ISD::STRICT_FCMPS;
5935 
5936     unsigned Opc;
5937     switch (CmpVT.SimpleTy) {
5938     default: llvm_unreachable("Unexpected type!");
5939     case MVT::f32:
5940       Opc = IsSignaling ? X86::COM_Fpr32 : X86::UCOM_Fpr32;
5941       break;
5942     case MVT::f64:
5943       Opc = IsSignaling ? X86::COM_Fpr64 : X86::UCOM_Fpr64;
5944       break;
5945     case MVT::f80:
5946       Opc = IsSignaling ? X86::COM_Fpr80 : X86::UCOM_Fpr80;
5947       break;
5948     }
5949 
5950     SDValue Chain =
5951         IsStrictCmp ? Node->getOperand(0) : CurDAG->getEntryNode();
5952     SDValue Glue;
5953     if (IsStrictCmp) {
5954       SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
5955       Chain = SDValue(CurDAG->getMachineNode(Opc, dl, VTs, {N0, N1, Chain}), 0);
5956       Glue = Chain.getValue(1);
5957     } else {
5958       Glue = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N0, N1), 0);
5959     }
5960 
5961     // Move FPSW to AX.
5962     SDValue FNSTSW =
5963         SDValue(CurDAG->getMachineNode(X86::FNSTSW16r, dl, MVT::i16, Glue), 0);
5964 
5965     // Extract upper 8-bits of AX.
5966     SDValue Extract =
5967         CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl, MVT::i8, FNSTSW);
5968 
5969     // Move AH into flags.
5970     // Some 64-bit targets lack SAHF support, but they do support FCOMI.
5971     assert(Subtarget->canUseLAHFSAHF() &&
5972            "Target doesn't support SAHF or FCOMI?");
5973     SDValue AH = CurDAG->getCopyToReg(Chain, dl, X86::AH, Extract, SDValue());
5974     Chain = AH;
5975     SDValue SAHF = SDValue(
5976         CurDAG->getMachineNode(X86::SAHF, dl, MVT::i32, AH.getValue(1)), 0);
5977 
5978     if (IsStrictCmp)
5979       ReplaceUses(SDValue(Node, 1), Chain);
5980 
5981     ReplaceUses(SDValue(Node, 0), SAHF);
5982     CurDAG->RemoveDeadNode(Node);
5983     return;
5984   }
5985 
5986   case X86ISD::CMP: {
5987     SDValue N0 = Node->getOperand(0);
5988     SDValue N1 = Node->getOperand(1);
5989 
5990     // Optimizations for TEST compares.
5991     if (!isNullConstant(N1))
5992       break;
5993 
5994     // Save the original VT of the compare.
5995     MVT CmpVT = N0.getSimpleValueType();
5996 
5997     // If we are comparing (and (shr X, C), Mask) with 0, emit a BEXTR followed
5998     // by a test instruction. The test should be removed later by
5999     // analyzeCompare if we are using only the zero flag.
6000     // TODO: Should we check the users and use the BEXTR flags directly?
6001     if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
6002       if (MachineSDNode *NewNode = matchBEXTRFromAndImm(N0.getNode())) {
6003         unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
6004                                              : X86::TEST32rr;
6005         SDValue BEXTR = SDValue(NewNode, 0);
6006         NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR);
6007         ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
6008         CurDAG->RemoveDeadNode(Node);
6009         return;
6010       }
6011     }
6012 
6013     // We can peek through truncates, but we need to be careful below.
6014     if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
6015       N0 = N0.getOperand(0);
6016 
6017     // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
6018     // use a smaller encoding.
6019     // Look past the truncate if CMP is the only use of it.
6020     if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
6021         N0.getValueType() != MVT::i8) {
6022       auto *MaskC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
6023       if (!MaskC)
6024         break;
6025 
6026       // We may have looked through a truncate so mask off any bits that
6027       // shouldn't be part of the compare.
6028       uint64_t Mask = MaskC->getZExtValue();
6029       Mask &= maskTrailingOnes<uint64_t>(CmpVT.getScalarSizeInBits());
6030 
6031       // Check if we can replace AND+IMM{32,64} with a shift. This is possible
6032       // for masks like 0xFF000000 or 0x00FFFFFF and if we care only about the
6033       // zero flag.
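           // For example, (and X, 0xFF00000000000000) compared against zero can
           // become a shrq by 56 followed by a register TEST, avoiding a movabsq
           // of the mask.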
6034       if (CmpVT == MVT::i64 && !isInt<8>(Mask) && isShiftedMask_64(Mask) &&
6035           onlyUsesZeroFlag(SDValue(Node, 0))) {
6036         unsigned ShiftOpcode = ISD::DELETED_NODE;
6037         unsigned ShiftAmt;
6038         unsigned SubRegIdx;
6039         MVT SubRegVT;
6040         unsigned TestOpcode;
6041         unsigned LeadingZeros = llvm::countl_zero(Mask);
6042         unsigned TrailingZeros = llvm::countr_zero(Mask);
6043 
6044         // With leading/trailing zeros, the transform is profitable if we can
6045         // eliminate a movabsq or shrink a 32-bit immediate to 8-bit without
6046         // incurring any extra register moves.
6047         bool SavesBytes = !isInt<32>(Mask) || N0.getOperand(0).hasOneUse();
6048         if (LeadingZeros == 0 && SavesBytes) {
6049           // If the mask covers the most significant bit, then we can replace
6050           // TEST+AND with a SHR and check eflags.
6051           // This emits a redundant TEST which is subsequently eliminated.
6052           ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6053           ShiftAmt = TrailingZeros;
6054           SubRegIdx = 0;
6055           TestOpcode = X86::TEST64rr;
6056         } else if (TrailingZeros == 0 && SavesBytes) {
6057           // If the mask covers the least significant bit, then we can replace
6058           // TEST+AND with a SHL and check eflags.
6059           // This emits a redundant TEST which is subsequently eliminated.
6060           ShiftOpcode = GET_ND_IF_ENABLED(X86::SHL64ri);
6061           ShiftAmt = LeadingZeros;
6062           SubRegIdx = 0;
6063           TestOpcode = X86::TEST64rr;
6064         } else if (MaskC->hasOneUse() && !isInt<32>(Mask)) {
6065           // If the shifted mask extends into the high half and is 8/16/32 bits
6066           // wide, then replace it with a SHR and a TEST8rr/TEST16rr/TEST32rr.
6067           unsigned PopCount = 64 - LeadingZeros - TrailingZeros;
6068           if (PopCount == 8) {
6069             ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6070             ShiftAmt = TrailingZeros;
6071             SubRegIdx = X86::sub_8bit;
6072             SubRegVT = MVT::i8;
6073             TestOpcode = X86::TEST8rr;
6074           } else if (PopCount == 16) {
6075             ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6076             ShiftAmt = TrailingZeros;
6077             SubRegIdx = X86::sub_16bit;
6078             SubRegVT = MVT::i16;
6079             TestOpcode = X86::TEST16rr;
6080           } else if (PopCount == 32) {
6081             ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6082             ShiftAmt = TrailingZeros;
6083             SubRegIdx = X86::sub_32bit;
6084             SubRegVT = MVT::i32;
6085             TestOpcode = X86::TEST32rr;
6086           }
6087         }
6088         if (ShiftOpcode != ISD::DELETED_NODE) {
6089           SDValue ShiftC = CurDAG->getTargetConstant(ShiftAmt, dl, MVT::i64);
6090           SDValue Shift = SDValue(
6091               CurDAG->getMachineNode(ShiftOpcode, dl, MVT::i64, MVT::i32,
6092                                      N0.getOperand(0), ShiftC),
6093               0);
6094           if (SubRegIdx != 0) {
6095             Shift =
6096                 CurDAG->getTargetExtractSubreg(SubRegIdx, dl, SubRegVT, Shift);
6097           }
6098           MachineSDNode *Test =
6099               CurDAG->getMachineNode(TestOpcode, dl, MVT::i32, Shift, Shift);
6100           ReplaceNode(Node, Test);
6101           return;
6102         }
6103       }
6104 
6105       MVT VT;
6106       int SubRegOp;
6107       unsigned ROpc, MOpc;
6108 
6109       // For each of these checks we need to be careful if the sign flag is
6110       // being used. It is only safe to use the sign flag in two cases: either
6111       // the sign bit in the shrunken mask is zero or the final test
6112       // size is equal to the original compare size.
6113 
6114       if (isUInt<8>(Mask) &&
6115           (!(Mask & 0x80) || CmpVT == MVT::i8 ||
6116            hasNoSignFlagUses(SDValue(Node, 0)))) {
6117         // For example, convert "testl %eax, $8" to "testb %al, $8"
6118         VT = MVT::i8;
6119         SubRegOp = X86::sub_8bit;
6120         ROpc = X86::TEST8ri;
6121         MOpc = X86::TEST8mi;
6122       } else if (OptForMinSize && isUInt<16>(Mask) &&
6123                  (!(Mask & 0x8000) || CmpVT == MVT::i16 ||
6124                   hasNoSignFlagUses(SDValue(Node, 0)))) {
6125         // For example, "testl %eax, $32776" to "testw %ax, $32776".
6126         // NOTE: We only want to form TESTW instructions if optimizing for
6127         // min size. Otherwise we only save one byte and possibly incur a
6128         // length-changing prefix penalty in the decoders.
6129         VT = MVT::i16;
6130         SubRegOp = X86::sub_16bit;
6131         ROpc = X86::TEST16ri;
6132         MOpc = X86::TEST16mi;
6133       } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 &&
6134                  ((!(Mask & 0x80000000) &&
6135                    // Without minsize 16-bit Cmps can get here so we need to
6136                    // be sure we calculate the correct sign flag if needed.
6137                    (CmpVT != MVT::i16 || !(Mask & 0x8000))) ||
6138                   CmpVT == MVT::i32 ||
6139                   hasNoSignFlagUses(SDValue(Node, 0)))) {
6140         // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
6141         // NOTE: We only want to run that transform if N0 is 32 or 64 bits.
6142         // Otherwise, we find ourselves in a position where we have to do
6143         // promotion. If previous passes did not promote the and, we assume
6144         // they had a good reason not to and do not promote here.
6145         VT = MVT::i32;
6146         SubRegOp = X86::sub_32bit;
6147         ROpc = X86::TEST32ri;
6148         MOpc = X86::TEST32mi;
6149       } else {
6150         // No eligible transformation was found.
6151         break;
6152       }
6153 
6154       SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
6155       SDValue Reg = N0.getOperand(0);
6156 
6157       // Emit a testl or testw.
6158       MachineSDNode *NewNode;
6159       SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
6160       if (tryFoldLoad(Node, N0.getNode(), Reg, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
6161         if (auto *LoadN = dyn_cast<LoadSDNode>(N0.getOperand(0).getNode())) {
6162           if (!LoadN->isSimple()) {
6163             unsigned NumVolBits = LoadN->getValueType(0).getSizeInBits();
6164             if ((MOpc == X86::TEST8mi && NumVolBits != 8) ||
6165                 (MOpc == X86::TEST16mi && NumVolBits != 16) ||
6166                 (MOpc == X86::TEST32mi && NumVolBits != 32))
6167               break;
6168           }
6169         }
6170         SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
6171                           Reg.getOperand(0) };
6172         NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops);
6173         // Update the chain.
6174         ReplaceUses(Reg.getValue(1), SDValue(NewNode, 1));
6175         // Record the mem-refs
6176         CurDAG->setNodeMemRefs(NewNode,
6177                                {cast<LoadSDNode>(Reg)->getMemOperand()});
6178       } else {
6179         // Extract the subregister if necessary.
6180         if (N0.getValueType() != VT)
6181           Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);
6182 
6183         NewNode = CurDAG->getMachineNode(ROpc, dl, MVT::i32, Reg, Imm);
6184       }
6185       // Replace CMP with TEST.
6186       ReplaceNode(Node, NewNode);
6187       return;
6188     }
6189     break;
6190   }
6191   case X86ISD::PCMPISTR: {
6192     if (!Subtarget->hasSSE42())
6193       break;
6194 
6195     bool NeedIndex = !SDValue(Node, 0).use_empty();
6196     bool NeedMask = !SDValue(Node, 1).use_empty();
6197     // We can't fold a load if we are going to make two instructions.
6198     bool MayFoldLoad = !NeedIndex || !NeedMask;
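         // PCMPISTRI produces its index in ECX and PCMPISTRM its mask in XMM0,
         // so when both results are live two instructions are emitted below.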
6199 
6200     MachineSDNode *CNode;
6201     if (NeedMask) {
6202       unsigned ROpc =
6203           Subtarget->hasAVX() ? X86::VPCMPISTRMrri : X86::PCMPISTRMrri;
6204       unsigned MOpc =
6205           Subtarget->hasAVX() ? X86::VPCMPISTRMrmi : X86::PCMPISTRMrmi;
6206       CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
6207       ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
6208     }
6209     if (NeedIndex || !NeedMask) {
6210       unsigned ROpc =
6211           Subtarget->hasAVX() ? X86::VPCMPISTRIrri : X86::PCMPISTRIrri;
6212       unsigned MOpc =
6213           Subtarget->hasAVX() ? X86::VPCMPISTRIrmi : X86::PCMPISTRIrmi;
6214       CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
6215       ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
6216     }
6217 
6218     // Connect the flag usage to the last instruction created.
6219     ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
6220     CurDAG->RemoveDeadNode(Node);
6221     return;
6222   }
6223   case X86ISD::PCMPESTR: {
6224     if (!Subtarget->hasSSE42())
6225       break;
6226 
6227     // Copy the two implicit register inputs.
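         // PCMPESTRI/PCMPESTRM take the explicit string lengths implicitly in
         // EAX and EDX.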
6228     SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX,
6229                                           Node->getOperand(1),
6230                                           SDValue()).getValue(1);
6231     InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
6232                                   Node->getOperand(3), InGlue).getValue(1);
6233 
6234     bool NeedIndex = !SDValue(Node, 0).use_empty();
6235     bool NeedMask = !SDValue(Node, 1).use_empty();
6236     // We can't fold a load if we are going to make two instructions.
6237     bool MayFoldLoad = !NeedIndex || !NeedMask;
6238 
6239     MachineSDNode *CNode;
6240     if (NeedMask) {
6241       unsigned ROpc =
6242           Subtarget->hasAVX() ? X86::VPCMPESTRMrri : X86::PCMPESTRMrri;
6243       unsigned MOpc =
6244           Subtarget->hasAVX() ? X86::VPCMPESTRMrmi : X86::PCMPESTRMrmi;
6245       CNode =
6246           emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node, InGlue);
6247       ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
6248     }
6249     if (NeedIndex || !NeedMask) {
6250       unsigned ROpc =
6251           Subtarget->hasAVX() ? X86::VPCMPESTRIrri : X86::PCMPESTRIrri;
6252       unsigned MOpc =
6253           Subtarget->hasAVX() ? X86::VPCMPESTRIrmi : X86::PCMPESTRIrmi;
6254       CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InGlue);
6255       ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
6256     }
6257     // Connect the flag usage to the last instruction created.
6258     ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
6259     CurDAG->RemoveDeadNode(Node);
6260     return;
6261   }
6262 
6263   case ISD::SETCC: {
6264     if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue()))
6265       return;
6266 
6267     break;
6268   }
6269 
6270   case ISD::STORE:
6271     if (foldLoadStoreIntoMemOperand(Node))
6272       return;
6273     break;
6274 
6275   case X86ISD::SETCC_CARRY: {
6276     MVT VT = Node->getSimpleValueType(0);
6277     SDValue Result;
6278     if (Subtarget->hasSBBDepBreaking()) {
6279       // We have to do this manually because tblgen will put the eflags copy in
6280       // the wrong place if we use an extract_subreg in the pattern.
6281       // Copy flags to the EFLAGS register and glue it to next node.
6282       SDValue EFLAGS =
6283           CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
6284                                Node->getOperand(1), SDValue());
6285 
6286       // Create a 64-bit instruction if the result is 64-bits otherwise use the
6287       // 32-bit version.
6288       unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r;
6289       MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
6290       Result = SDValue(
6291           CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)),
6292           0);
6293     } else {
6294       // The target does not recognize sbb with the same reg operand as a
6295       // no-source idiom, so we explicitly zero the input values.
6296       Result = getSBBZero(Node);
6297     }
6298 
6299     // For less than 32-bits we need to extract from the 32-bit node.
6300     if (VT == MVT::i8 || VT == MVT::i16) {
6301       int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
6302       Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
6303     }
6304 
6305     ReplaceUses(SDValue(Node, 0), Result);
6306     CurDAG->RemoveDeadNode(Node);
6307     return;
6308   }
6309   case X86ISD::SBB: {
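         // SBB with both inputs zero computes 0 - 0 - CF, i.e. 0 or -1 depending
         // on the carry flag; getSBBZero emits it with explicitly zeroed inputs.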
6310     if (isNullConstant(Node->getOperand(0)) &&
6311         isNullConstant(Node->getOperand(1))) {
6312       SDValue Result = getSBBZero(Node);
6313 
6314       // Replace the flag use.
6315       ReplaceUses(SDValue(Node, 1), Result.getValue(1));
6316 
6317       // Replace the result use.
6318       if (!SDValue(Node, 0).use_empty()) {
6319         // For less than 32-bits we need to extract from the 32-bit node.
6320         MVT VT = Node->getSimpleValueType(0);
6321         if (VT == MVT::i8 || VT == MVT::i16) {
6322           int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
6323           Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
6324         }
6325         ReplaceUses(SDValue(Node, 0), Result);
6326       }
6327 
6328       CurDAG->RemoveDeadNode(Node);
6329       return;
6330     }
6331     break;
6332   }
6333   case X86ISD::MGATHER: {
6334     auto *Mgt = cast<X86MaskedGatherSDNode>(Node);
6335     SDValue IndexOp = Mgt->getIndex();
6336     SDValue Mask = Mgt->getMask();
6337     MVT IndexVT = IndexOp.getSimpleValueType();
6338     MVT ValueVT = Node->getSimpleValueType(0);
6339     MVT MaskVT = Mask.getSimpleValueType();
6340 
6341     // This is just to prevent crashes if the nodes are malformed somehow. We're
6342     // otherwise only doing loose type checking here, based on what a type
6343     // constraint would say, just like table-based isel.
6344     if (!ValueVT.isVector() || !MaskVT.isVector())
6345       break;
6346 
6347     unsigned NumElts = ValueVT.getVectorNumElements();
6348     MVT ValueSVT = ValueVT.getVectorElementType();
6349 
6350     bool IsFP = ValueSVT.isFloatingPoint();
6351     unsigned EltSize = ValueSVT.getSizeInBits();
6352 
6353     unsigned Opc = 0;
6354     bool AVX512Gather = MaskVT.getVectorElementType() == MVT::i1;
6355     if (AVX512Gather) {
6356       if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6357         Opc = IsFP ? X86::VGATHERDPSZ128rm : X86::VPGATHERDDZ128rm;
6358       else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6359         Opc = IsFP ? X86::VGATHERDPSZ256rm : X86::VPGATHERDDZ256rm;
6360       else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
6361         Opc = IsFP ? X86::VGATHERDPSZrm : X86::VPGATHERDDZrm;
6362       else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6363         Opc = IsFP ? X86::VGATHERDPDZ128rm : X86::VPGATHERDQZ128rm;
6364       else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6365         Opc = IsFP ? X86::VGATHERDPDZ256rm : X86::VPGATHERDQZ256rm;
6366       else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
6367         Opc = IsFP ? X86::VGATHERDPDZrm : X86::VPGATHERDQZrm;
6368       else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6369         Opc = IsFP ? X86::VGATHERQPSZ128rm : X86::VPGATHERQDZ128rm;
6370       else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6371         Opc = IsFP ? X86::VGATHERQPSZ256rm : X86::VPGATHERQDZ256rm;
6372       else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
6373         Opc = IsFP ? X86::VGATHERQPSZrm : X86::VPGATHERQDZrm;
6374       else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6375         Opc = IsFP ? X86::VGATHERQPDZ128rm : X86::VPGATHERQQZ128rm;
6376       else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6377         Opc = IsFP ? X86::VGATHERQPDZ256rm : X86::VPGATHERQQZ256rm;
6378       else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
6379         Opc = IsFP ? X86::VGATHERQPDZrm : X86::VPGATHERQQZrm;
6380     } else {
6381       assert(EVT(MaskVT) == EVT(ValueVT).changeVectorElementTypeToInteger() &&
6382              "Unexpected mask VT!");
6383       if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6384         Opc = IsFP ? X86::VGATHERDPSrm : X86::VPGATHERDDrm;
6385       else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6386         Opc = IsFP ? X86::VGATHERDPSYrm : X86::VPGATHERDDYrm;
6387       else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6388         Opc = IsFP ? X86::VGATHERDPDrm : X86::VPGATHERDQrm;
6389       else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6390         Opc = IsFP ? X86::VGATHERDPDYrm : X86::VPGATHERDQYrm;
6391       else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6392         Opc = IsFP ? X86::VGATHERQPSrm : X86::VPGATHERQDrm;
6393       else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6394         Opc = IsFP ? X86::VGATHERQPSYrm : X86::VPGATHERQDYrm;
6395       else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6396         Opc = IsFP ? X86::VGATHERQPDrm : X86::VPGATHERQQrm;
6397       else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6398         Opc = IsFP ? X86::VGATHERQPDYrm : X86::VPGATHERQQYrm;
6399     }
6400 
6401     if (!Opc)
6402       break;
6403 
6404     SDValue Base, Scale, Index, Disp, Segment;
6405     if (!selectVectorAddr(Mgt, Mgt->getBasePtr(), IndexOp, Mgt->getScale(),
6406                           Base, Scale, Index, Disp, Segment))
6407       break;
6408 
6409     SDValue PassThru = Mgt->getPassThru();
6410     SDValue Chain = Mgt->getChain();
6411     // Gather instructions have a mask output not in the ISD node.
6412     SDVTList VTs = CurDAG->getVTList(ValueVT, MaskVT, MVT::Other);
6413 
6414     MachineSDNode *NewNode;
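         // AVX512 gathers take their k-register mask ahead of the memory
         // operands, while AVX/AVX2 gathers take a vector mask after them, hence
         // the two operand orders below.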
6415     if (AVX512Gather) {
6416       SDValue Ops[] = {PassThru, Mask, Base,    Scale,
6417                        Index,    Disp, Segment, Chain};
6418       NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
6419     } else {
6420       SDValue Ops[] = {PassThru, Base,    Scale, Index,
6421                        Disp,     Segment, Mask,  Chain};
6422       NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
6423     }
6424     CurDAG->setNodeMemRefs(NewNode, {Mgt->getMemOperand()});
6425     ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
6426     ReplaceUses(SDValue(Node, 1), SDValue(NewNode, 2));
6427     CurDAG->RemoveDeadNode(Node);
6428     return;
6429   }
6430   case X86ISD::MSCATTER: {
6431     auto *Sc = cast<X86MaskedScatterSDNode>(Node);
6432     SDValue Value = Sc->getValue();
6433     SDValue IndexOp = Sc->getIndex();
6434     MVT IndexVT = IndexOp.getSimpleValueType();
6435     MVT ValueVT = Value.getSimpleValueType();
6436 
6437     // This is just to prevent crashes if the nodes are malformed somehow. We're
6438     // otherwise only doing loose type checking here, based on what a type
6439     // constraint would say, just like table-based isel.
6440     if (!ValueVT.isVector())
6441       break;
6442 
6443     unsigned NumElts = ValueVT.getVectorNumElements();
6444     MVT ValueSVT = ValueVT.getVectorElementType();
6445 
6446     bool IsFP = ValueSVT.isFloatingPoint();
6447     unsigned EltSize = ValueSVT.getSizeInBits();
6448 
6449     unsigned Opc;
6450     if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6451       Opc = IsFP ? X86::VSCATTERDPSZ128mr : X86::VPSCATTERDDZ128mr;
6452     else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6453       Opc = IsFP ? X86::VSCATTERDPSZ256mr : X86::VPSCATTERDDZ256mr;
6454     else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
6455       Opc = IsFP ? X86::VSCATTERDPSZmr : X86::VPSCATTERDDZmr;
6456     else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6457       Opc = IsFP ? X86::VSCATTERDPDZ128mr : X86::VPSCATTERDQZ128mr;
6458     else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6459       Opc = IsFP ? X86::VSCATTERDPDZ256mr : X86::VPSCATTERDQZ256mr;
6460     else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
6461       Opc = IsFP ? X86::VSCATTERDPDZmr : X86::VPSCATTERDQZmr;
6462     else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6463       Opc = IsFP ? X86::VSCATTERQPSZ128mr : X86::VPSCATTERQDZ128mr;
6464     else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6465       Opc = IsFP ? X86::VSCATTERQPSZ256mr : X86::VPSCATTERQDZ256mr;
6466     else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
6467       Opc = IsFP ? X86::VSCATTERQPSZmr : X86::VPSCATTERQDZmr;
6468     else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6469       Opc = IsFP ? X86::VSCATTERQPDZ128mr : X86::VPSCATTERQQZ128mr;
6470     else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6471       Opc = IsFP ? X86::VSCATTERQPDZ256mr : X86::VPSCATTERQQZ256mr;
6472     else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
6473       Opc = IsFP ? X86::VSCATTERQPDZmr : X86::VPSCATTERQQZmr;
6474     else
6475       break;
6476 
6477     SDValue Base, Scale, Index, Disp, Segment;
6478     if (!selectVectorAddr(Sc, Sc->getBasePtr(), IndexOp, Sc->getScale(),
6479                           Base, Scale, Index, Disp, Segment))
6480       break;
6481 
6482     SDValue Mask = Sc->getMask();
6483     SDValue Chain = Sc->getChain();
6484     // Scatter instructions have a mask output not in the ISD node.
6485     SDVTList VTs = CurDAG->getVTList(Mask.getValueType(), MVT::Other);
6486     SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Value, Chain};
6487 
6488     MachineSDNode *NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
6489     CurDAG->setNodeMemRefs(NewNode, {Sc->getMemOperand()});
6490     ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 1));
6491     CurDAG->RemoveDeadNode(Node);
6492     return;
6493   }
6494   case ISD::PREALLOCATED_SETUP: {
6495     auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
6496     auto CallId = MFI->getPreallocatedIdForCallSite(
6497         cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
6498     SDValue Chain = Node->getOperand(0);
6499     SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
6500     MachineSDNode *New = CurDAG->getMachineNode(
6501         TargetOpcode::PREALLOCATED_SETUP, dl, MVT::Other, CallIdValue, Chain);
6502     ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Chain
6503     CurDAG->RemoveDeadNode(Node);
6504     return;
6505   }
6506   case ISD::PREALLOCATED_ARG: {
6507     auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
6508     auto CallId = MFI->getPreallocatedIdForCallSite(
6509         cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
6510     SDValue Chain = Node->getOperand(0);
6511     SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
6512     SDValue ArgIndex = Node->getOperand(2);
6513     SDValue Ops[3];
6514     Ops[0] = CallIdValue;
6515     Ops[1] = ArgIndex;
6516     Ops[2] = Chain;
6517     MachineSDNode *New = CurDAG->getMachineNode(
6518         TargetOpcode::PREALLOCATED_ARG, dl,
6519         CurDAG->getVTList(TLI->getPointerTy(CurDAG->getDataLayout()),
6520                           MVT::Other),
6521         Ops);
6522     ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Arg pointer
6523     ReplaceUses(SDValue(Node, 1), SDValue(New, 1)); // Chain
6524     CurDAG->RemoveDeadNode(Node);
6525     return;
6526   }
6527   case X86ISD::AESENCWIDE128KL:
6528   case X86ISD::AESDECWIDE128KL:
6529   case X86ISD::AESENCWIDE256KL:
6530   case X86ISD::AESDECWIDE256KL: {
6531     if (!Subtarget->hasWIDEKL())
6532       break;
6533 
6534     unsigned Opcode;
6535     switch (Node->getOpcode()) {
6536     default:
6537       llvm_unreachable("Unexpected opcode!");
6538     case X86ISD::AESENCWIDE128KL:
6539       Opcode = X86::AESENCWIDE128KL;
6540       break;
6541     case X86ISD::AESDECWIDE128KL:
6542       Opcode = X86::AESDECWIDE128KL;
6543       break;
6544     case X86ISD::AESENCWIDE256KL:
6545       Opcode = X86::AESENCWIDE256KL;
6546       break;
6547     case X86ISD::AESDECWIDE256KL:
6548       Opcode = X86::AESDECWIDE256KL;
6549       break;
6550     }
6551 
6552     SDValue Chain = Node->getOperand(0);
6553     SDValue Addr = Node->getOperand(1);
6554 
6555     SDValue Base, Scale, Index, Disp, Segment;
6556     if (!selectAddr(Node, Addr, Base, Scale, Index, Disp, Segment))
6557       break;
6558 
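         // The wide Key Locker instructions process eight blocks passed
         // implicitly in XMM0..XMM7, so copy the eight data operands into those
         // registers.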
6559     Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(2),
6560                                  SDValue());
6561     Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(3),
6562                                  Chain.getValue(1));
6563     Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM2, Node->getOperand(4),
6564                                  Chain.getValue(1));
6565     Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM3, Node->getOperand(5),
6566                                  Chain.getValue(1));
6567     Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM4, Node->getOperand(6),
6568                                  Chain.getValue(1));
6569     Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM5, Node->getOperand(7),
6570                                  Chain.getValue(1));
6571     Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM6, Node->getOperand(8),
6572                                  Chain.getValue(1));
6573     Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM7, Node->getOperand(9),
6574                                  Chain.getValue(1));
6575 
6576     MachineSDNode *Res = CurDAG->getMachineNode(
6577         Opcode, dl, Node->getVTList(),
6578         {Base, Scale, Index, Disp, Segment, Chain, Chain.getValue(1)});
6579     CurDAG->setNodeMemRefs(Res, cast<MemSDNode>(Node)->getMemOperand());
6580     ReplaceNode(Node, Res);
6581     return;
6582   }
6583   }
6584 
6585   SelectCode(Node);
6586 }
6587 
6588 bool X86DAGToDAGISel::SelectInlineAsmMemoryOperand(
6589     const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
6590     std::vector<SDValue> &OutOps) {
6591   SDValue Op0, Op1, Op2, Op3, Op4;
6592   switch (ConstraintID) {
6593   default:
6594     llvm_unreachable("Unexpected asm memory constraint");
6595   case InlineAsm::ConstraintCode::o: // offsetable        ??
6596   case InlineAsm::ConstraintCode::v: // not offsetable    ??
6597   case InlineAsm::ConstraintCode::m: // memory
6598   case InlineAsm::ConstraintCode::X:
6599   case InlineAsm::ConstraintCode::p: // address
6600     if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
6601       return true;
6602     break;
6603   }
6604 
6605   OutOps.push_back(Op0);
6606   OutOps.push_back(Op1);
6607   OutOps.push_back(Op2);
6608   OutOps.push_back(Op3);
6609   OutOps.push_back(Op4);
6610   return false;
6611 }
6612 
6613 X86ISelDAGToDAGPass::X86ISelDAGToDAGPass(X86TargetMachine &TM)
6614     : SelectionDAGISelPass(
6615           std::make_unique<X86DAGToDAGISel>(TM, TM.getOptLevel())) {}
6616 
6617 /// This pass converts a legalized DAG into an X86-specific DAG,
6618 /// ready for instruction scheduling.
6619 FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
6620                                      CodeGenOptLevel OptLevel) {
6621   return new X86DAGToDAGISelLegacy(TM, OptLevel);
6622 }
6623