xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp (revision 924226fba12cc9a228c73b956e1b7fa24c60b055)
1 //===- AArch64InstructionSelector.cpp ----------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the InstructionSelector class for
10 /// AArch64.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AArch64GlobalISelUtils.h"
15 #include "AArch64InstrInfo.h"
16 #include "AArch64MachineFunctionInfo.h"
17 #include "AArch64RegisterBankInfo.h"
18 #include "AArch64RegisterInfo.h"
19 #include "AArch64Subtarget.h"
20 #include "AArch64TargetMachine.h"
21 #include "MCTargetDesc/AArch64AddressingModes.h"
22 #include "MCTargetDesc/AArch64MCTargetDesc.h"
23 #include "llvm/ADT/Optional.h"
24 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
25 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
26 #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
27 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
28 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
29 #include "llvm/CodeGen/MachineBasicBlock.h"
30 #include "llvm/CodeGen/MachineConstantPool.h"
31 #include "llvm/CodeGen/MachineFunction.h"
32 #include "llvm/CodeGen/MachineInstr.h"
33 #include "llvm/CodeGen/MachineInstrBuilder.h"
34 #include "llvm/CodeGen/MachineMemOperand.h"
35 #include "llvm/CodeGen/MachineOperand.h"
36 #include "llvm/CodeGen/MachineRegisterInfo.h"
37 #include "llvm/CodeGen/TargetOpcodes.h"
38 #include "llvm/IR/Constants.h"
39 #include "llvm/IR/DerivedTypes.h"
40 #include "llvm/IR/Instructions.h"
41 #include "llvm/IR/PatternMatch.h"
42 #include "llvm/IR/Type.h"
43 #include "llvm/IR/IntrinsicsAArch64.h"
44 #include "llvm/Pass.h"
45 #include "llvm/Support/Debug.h"
46 #include "llvm/Support/raw_ostream.h"
47 
48 #define DEBUG_TYPE "aarch64-isel"
49 
50 using namespace llvm;
51 using namespace MIPatternMatch;
52 using namespace AArch64GISelUtils;
53 
// Forward declarations: this file only handles these analyses through the
// pointers handed to setupMF().
namespace llvm {
class ProfileSummaryInfo;
class BlockFrequencyInfo;
} // end namespace llvm
58 
59 namespace {
60 
61 #define GET_GLOBALISEL_PREDICATE_BITSET
62 #include "AArch64GenGlobalISel.inc"
63 #undef GET_GLOBALISEL_PREDICATE_BITSET
64 
/// AArch64 implementation of the GlobalISel InstructionSelector interface.
///
/// The public surface is the select() override plus setupMF(). Everything else
/// declared here is private: the tblgen-erated selectImpl(), the hand-written
/// select*/emit* helpers, the complex-pattern renderers, and state that is
/// cached for the function currently being selected.
class AArch64InstructionSelector : public InstructionSelector {
public:
  AArch64InstructionSelector(const AArch64TargetMachine &TM,
                             const AArch64Subtarget &STI,
                             const AArch64RegisterBankInfo &RBI);

  bool select(MachineInstr &I) override;
  static const char *getName() { return DEBUG_TYPE; }

  /// Per-function setup. Forwards to the base class, binds the shared builder
  /// MIB to \p MF, refreshes the cached attribute flag, resets MFReturnAddr and
  /// finally runs processPHIs() on \p MF.
  void setupMF(MachineFunction &MF, GISelKnownBits *KB,
               CodeGenCoverage &CoverageInfo, ProfileSummaryInfo *PSI,
               BlockFrequencyInfo *BFI) override {
    InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
    MIB.setMF(MF);

    // hasFnAttribute() is expensive to call on every BRCOND selection, so
    // cache it here for each run of the selector.
    ProduceNonFlagSettingCondBr =
        !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
    // Drop any value left over from a previously selected function.
    MFReturnAddr = Register();

    processPHIs(MF);
  }

private:
  /// tblgen-erated 'select' implementation, used as the initial selector for
  /// the patterns that don't require complex C++.
  bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;

  // A lowering phase that runs before any selection attempts.
  // Returns true if the instruction was modified.
  bool preISelLower(MachineInstr &I);

  // An early selection function that runs before the selectImpl() call.
  bool earlySelect(MachineInstr &I);

  // Do some preprocessing of G_PHIs before we begin selection.
  void processPHIs(MachineFunction &MF);

  bool earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI);

  /// Eliminate same-sized cross-bank copies into stores before selectImpl().
  bool contractCrossBankCopyIntoStore(MachineInstr &I,
                                      MachineRegisterInfo &MRI);

  bool convertPtrAddToAdd(MachineInstr &I, MachineRegisterInfo &MRI);

  bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF,
                          MachineRegisterInfo &MRI) const;
  bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF,
                           MachineRegisterInfo &MRI) const;

  ///@{
  /// Helper functions for selectCompareBranch.
  bool selectCompareBranchFedByFCmp(MachineInstr &I, MachineInstr &FCmp,
                                    MachineIRBuilder &MIB) const;
  bool selectCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp,
                                    MachineIRBuilder &MIB) const;
  bool tryOptCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp,
                                    MachineIRBuilder &MIB) const;
  bool tryOptAndIntoCompareBranch(MachineInstr &AndInst, bool Invert,
                                  MachineBasicBlock *DstMBB,
                                  MachineIRBuilder &MIB) const;
  ///@}

  bool selectCompareBranch(MachineInstr &I, MachineFunction &MF,
                           MachineRegisterInfo &MRI);

  bool selectVectorAshrLshr(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectVectorSHL(MachineInstr &I, MachineRegisterInfo &MRI);

  // Helper to generate an equivalent of scalar_to_vector into a new register.
  // NOTE(review): earlier wording said the register is "returned via 'Dst'",
  // but this declaration has no such parameter; presumably the new register is
  // the def of the returned instruction -- confirm against the definition.
  MachineInstr *emitScalarToVector(unsigned EltSize,
                                   const TargetRegisterClass *DstRC,
                                   Register Scalar,
                                   MachineIRBuilder &MIRBuilder) const;

  /// Emit a lane insert into \p DstReg, or a new vector register if None is
  /// provided.
  ///
  /// The lane inserted into is defined by \p LaneIdx. The vector source
  /// register is given by \p SrcReg. The register containing the element is
  /// given by \p EltReg.
  MachineInstr *emitLaneInsert(Optional<Register> DstReg, Register SrcReg,
                               Register EltReg, unsigned LaneIdx,
                               const RegisterBank &RB,
                               MachineIRBuilder &MIRBuilder) const;

  /// Emit a sequence of instructions representing a constant \p CV for a
  /// vector register \p Dst. (E.g. a MOV, or a load from a constant pool.)
  ///
  /// \returns the last instruction in the sequence on success, and nullptr
  /// otherwise.
  MachineInstr *emitConstantVector(Register Dst, Constant *CV,
                                   MachineIRBuilder &MIRBuilder,
                                   MachineRegisterInfo &MRI);

  bool selectInsertElt(MachineInstr &I, MachineRegisterInfo &MRI);
  bool tryOptConstantBuildVec(MachineInstr &MI, LLT DstTy,
                              MachineRegisterInfo &MRI);
  /// \returns true if a G_BUILD_VECTOR instruction \p MI can be selected as a
  /// SUBREG_TO_REG.
  bool tryOptBuildVecToSubregToReg(MachineInstr &MI, MachineRegisterInfo &MRI);
  bool selectBuildVector(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI);

  bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectSplitVectorUnmerge(MachineInstr &I, MachineRegisterInfo &MRI);

  /// Helper function to select vector load intrinsics like
  /// @llvm.aarch64.neon.ld2.*, @llvm.aarch64.neon.ld4.*, etc.
  /// \p Opc is the opcode that the selected instruction should use.
  /// \p NumVecs is the number of vector destinations for the instruction.
  /// \p I is the original G_INTRINSIC_W_SIDE_EFFECTS instruction.
  bool selectVectorLoadIntrinsic(unsigned Opc, unsigned NumVecs,
                                 MachineInstr &I);
  bool selectIntrinsicWithSideEffects(MachineInstr &I,
                                      MachineRegisterInfo &MRI);
  bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectVectorICmp(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectIntrinsicTrunc(MachineInstr &I, MachineRegisterInfo &MRI) const;
  bool selectIntrinsicRound(MachineInstr &I, MachineRegisterInfo &MRI) const;
  bool selectJumpTable(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectReduction(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectMOPS(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectUSMovFromExtend(MachineInstr &I, MachineRegisterInfo &MRI);

  unsigned emitConstantPoolEntry(const Constant *CPVal,
                                 MachineFunction &MF) const;
  MachineInstr *emitLoadFromConstantPool(const Constant *CPVal,
                                         MachineIRBuilder &MIRBuilder) const;

  // Emit a vector concat operation.
  MachineInstr *emitVectorConcat(Optional<Register> Dst, Register Op1,
                                 Register Op2,
                                 MachineIRBuilder &MIRBuilder) const;

  // Emit an integer compare between LHS and RHS, which checks for Predicate.
  MachineInstr *emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
                                   MachineOperand &Predicate,
                                   MachineIRBuilder &MIRBuilder) const;

  /// Emit a floating point comparison between \p LHS and \p RHS.
  /// The trailing (unnamed) optional argument, if given, is the intended
  /// predicate to use.
  MachineInstr *emitFPCompare(Register LHS, Register RHS,
                              MachineIRBuilder &MIRBuilder,
                              Optional<CmpInst::Predicate> = None) const;

  MachineInstr *emitInstr(unsigned Opcode,
                          std::initializer_list<llvm::DstOp> DstOps,
                          std::initializer_list<llvm::SrcOp> SrcOps,
                          MachineIRBuilder &MIRBuilder,
                          const ComplexRendererFns &RenderFns = None) const;
  /// Helper function to emit an add or sub instruction.
  ///
  /// \p AddrModeAndSizeToOpcode must contain each of the opcode variants
  /// listed below, in a specific order.
  ///
  /// Below is an example of the expected input to \p AddrModeAndSizeToOpcode.
  ///
  /// \code
  ///   const std::array<std::array<unsigned, 2>, 5> Table {
  ///    {{AArch64::ADDXri, AArch64::ADDWri},
  ///     {AArch64::ADDXrs, AArch64::ADDWrs},
  ///     {AArch64::ADDXrr, AArch64::ADDWrr},
  ///     {AArch64::SUBXri, AArch64::SUBWri},
  ///     {AArch64::ADDXrx, AArch64::ADDWrx}}};
  /// \endcode
  ///
  /// Each row in the table corresponds to a different addressing mode. Each
  /// column corresponds to a different register size.
  ///
  /// \attention Rows must be structured as follows:
  ///   - Row 0: The ri opcode variants
  ///   - Row 1: The rs opcode variants
  ///   - Row 2: The rr opcode variants
  ///   - Row 3: The ri opcode variants for negative immediates
  ///   - Row 4: The rx opcode variants
  ///
  /// \attention Columns must be structured as follows:
  ///   - Column 0: The 64-bit opcode variants
  ///   - Column 1: The 32-bit opcode variants
  ///
  /// \p Dst is the destination register of the binop to emit.
  /// \p LHS is the left-hand operand of the binop to emit.
  /// \p RHS is the right-hand operand of the binop to emit.
  MachineInstr *emitAddSub(
      const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode,
      Register Dst, MachineOperand &LHS, MachineOperand &RHS,
      MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitADD(Register DefReg, MachineOperand &LHS,
                        MachineOperand &RHS,
                        MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitADDS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
                         MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitSUBS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
                         MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitCMN(MachineOperand &LHS, MachineOperand &RHS,
                        MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitTST(MachineOperand &LHS, MachineOperand &RHS,
                        MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitSelect(Register Dst, Register LHS, Register RHS,
                           AArch64CC::CondCode CC,
                           MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitExtractVectorElt(Optional<Register> DstReg,
                                     const RegisterBank &DstRB, LLT ScalarTy,
                                     Register VecReg, unsigned LaneIdx,
                                     MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitCSINC(Register Dst, Register Src1, Register Src2,
                          AArch64CC::CondCode Pred,
                          MachineIRBuilder &MIRBuilder) const;
  /// Emit a CSet for a FP compare.
  ///
  /// \p Dst is expected to be a 32-bit scalar register.
  MachineInstr *emitCSetForFCmp(Register Dst, CmpInst::Predicate Pred,
                                MachineIRBuilder &MIRBuilder) const;

  /// Emit the overflow op for \p Opcode.
  ///
  /// \p Opcode is expected to be an overflow op's opcode, e.g. G_UADDO,
  /// G_USUBO, etc.
  std::pair<MachineInstr *, AArch64CC::CondCode>
  emitOverflowOp(unsigned Opcode, Register Dst, MachineOperand &LHS,
                 MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const;

  /// Emit a TB(N)Z instruction which tests \p Bit in \p TestReg.
  /// \p IsNegative is true if the test should be "not zero".
  /// This will also optimize the test bit instruction when possible.
  MachineInstr *emitTestBit(Register TestReg, uint64_t Bit, bool IsNegative,
                            MachineBasicBlock *DstMBB,
                            MachineIRBuilder &MIB) const;

  /// Emit a CB(N)Z instruction which branches to \p DestMBB.
  MachineInstr *emitCBZ(Register CompareReg, bool IsNegative,
                        MachineBasicBlock *DestMBB,
                        MachineIRBuilder &MIB) const;

  // Equivalent to the i32shift_a and friends from AArch64InstrInfo.td.
  // We use these manually instead of using the importer since it doesn't
  // support SDNodeXForm.
  ComplexRendererFns selectShiftA_32(const MachineOperand &Root) const;
  ComplexRendererFns selectShiftB_32(const MachineOperand &Root) const;
  ComplexRendererFns selectShiftA_64(const MachineOperand &Root) const;
  ComplexRendererFns selectShiftB_64(const MachineOperand &Root) const;

  ComplexRendererFns select12BitValueWithLeftShift(uint64_t Immed) const;
  ComplexRendererFns selectArithImmed(MachineOperand &Root) const;
  ComplexRendererFns selectNegArithImmed(MachineOperand &Root) const;

  ComplexRendererFns selectAddrModeUnscaled(MachineOperand &Root,
                                            unsigned Size) const;

  // Fixed-width wrappers around selectAddrModeUnscaled(); the suffix is the
  // access width in bits and the forwarded Size is that width divided by 8.
  ComplexRendererFns selectAddrModeUnscaled8(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, 1);
  }
  ComplexRendererFns selectAddrModeUnscaled16(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, 2);
  }
  ComplexRendererFns selectAddrModeUnscaled32(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, 4);
  }
  ComplexRendererFns selectAddrModeUnscaled64(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, 8);
  }
  ComplexRendererFns selectAddrModeUnscaled128(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, 16);
  }

  /// Helper to try to fold in a GISEL_ADD_LOW into an immediate, to be used
  /// from complex pattern matchers like selectAddrModeIndexed().
  ComplexRendererFns tryFoldAddLowIntoImm(MachineInstr &RootDef, unsigned Size,
                                          MachineRegisterInfo &MRI) const;

  ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root,
                                           unsigned Size) const;
  // Width is divided by 8 before being forwarded as Size.
  template <int Width>
  ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root) const {
    return selectAddrModeIndexed(Root, Width / 8);
  }

  bool isWorthFoldingIntoExtendedReg(MachineInstr &MI,
                                     const MachineRegisterInfo &MRI) const;
  ComplexRendererFns
  selectAddrModeShiftedExtendXReg(MachineOperand &Root,
                                  unsigned SizeInBytes) const;

  /// Returns a \p ComplexRendererFns which contains a base, offset, and whether
  /// or not a shift + extend should be folded into an addressing mode. Returns
  /// None when this is not profitable or possible.
  ComplexRendererFns
  selectExtendedSHL(MachineOperand &Root, MachineOperand &Base,
                    MachineOperand &Offset, unsigned SizeInBytes,
                    bool WantsExt) const;
  ComplexRendererFns selectAddrModeRegisterOffset(MachineOperand &Root) const;
  ComplexRendererFns selectAddrModeXRO(MachineOperand &Root,
                                       unsigned SizeInBytes) const;
  // Width is divided by 8 before being forwarded as SizeInBytes.
  template <int Width>
  ComplexRendererFns selectAddrModeXRO(MachineOperand &Root) const {
    return selectAddrModeXRO(Root, Width / 8);
  }

  ComplexRendererFns selectAddrModeWRO(MachineOperand &Root,
                                       unsigned SizeInBytes) const;
  // Width is divided by 8 before being forwarded as SizeInBytes.
  template <int Width>
  ComplexRendererFns selectAddrModeWRO(MachineOperand &Root) const {
    return selectAddrModeWRO(Root, Width / 8);
  }

  ComplexRendererFns selectShiftedRegister(MachineOperand &Root,
                                           bool AllowROR = false) const;

  // Arithmetic form: forwards with AllowROR left at its default (false).
  ComplexRendererFns selectArithShiftedRegister(MachineOperand &Root) const {
    return selectShiftedRegister(Root);
  }

  // Logical form: forwards with AllowROR = true.
  ComplexRendererFns selectLogicalShiftedRegister(MachineOperand &Root) const {
    return selectShiftedRegister(Root, true);
  }

  /// Given an extend instruction, determine the correct shift-extend type for
  /// that instruction.
  ///
  /// If the instruction is going to be used in a load or store, pass
  /// \p IsLoadStore = true.
  AArch64_AM::ShiftExtendType
  getExtendTypeForInst(MachineInstr &MI, MachineRegisterInfo &MRI,
                       bool IsLoadStore = false) const;

  /// Move \p Reg to \p RC if \p Reg is not already on \p RC.
  ///
  /// \returns Either \p Reg if no change was necessary, or the new register
  /// created by moving \p Reg.
  ///
  /// Note: This uses emitCopy right now.
  Register moveScalarRegClass(Register Reg, const TargetRegisterClass &RC,
                              MachineIRBuilder &MIB) const;

  ComplexRendererFns selectArithExtendedRegister(MachineOperand &Root) const;

  void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
                      int OpIdx = -1) const;
  void renderLogicalImm32(MachineInstrBuilder &MIB, const MachineInstr &I,
                          int OpIdx = -1) const;
  void renderLogicalImm64(MachineInstrBuilder &MIB, const MachineInstr &I,
                          int OpIdx = -1) const;
  void renderFPImm16(MachineInstrBuilder &MIB, const MachineInstr &MI,
                     int OpIdx = -1) const;
  void renderFPImm32(MachineInstrBuilder &MIB, const MachineInstr &MI,
                     int OpIdx = -1) const;
  void renderFPImm64(MachineInstrBuilder &MIB, const MachineInstr &MI,
                     int OpIdx = -1) const;

  // Materialize a GlobalValue or BlockAddress using a movz+movk sequence.
  void materializeLargeCMVal(MachineInstr &I, const Value *V, unsigned OpFlags);

  // Optimization methods.
  bool tryOptSelect(MachineInstr &MI);
  MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
                                      MachineOperand &Predicate,
                                      MachineIRBuilder &MIRBuilder) const;

  /// Return true if \p MI is a load or store of \p NumBytes bytes.
  bool isLoadStoreOfNumBytes(const MachineInstr &MI, unsigned NumBytes) const;

  /// Returns true if \p MI is guaranteed to have the high-half of a 64-bit
  /// register zeroed out. In other words, the result of MI has been explicitly
  /// zero extended.
  bool isDef32(const MachineInstr &MI) const;

  // Target objects bound at construction time.
  const AArch64TargetMachine &TM;
  const AArch64Subtarget &STI;
  const AArch64InstrInfo &TII;
  const AArch64RegisterInfo &TRI;
  const AArch64RegisterBankInfo &RBI;

  // Recomputed by setupMF(): true unless the current function carries the
  // SpeculativeLoadHardening attribute.
  bool ProduceNonFlagSettingCondBr = false;

  // Some cached values used during selection.
  // We use LR as a live-in register, and we keep track of it here as it can be
  // clobbered by calls.
  Register MFReturnAddr;

  // Builder shared by the selection helpers; setupMF() points it at the
  // function being selected.
  MachineIRBuilder MIB;

#define GET_GLOBALISEL_PREDICATES_DECL
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_DECL

// We declare the temporaries used by selectImpl() in the class to minimize the
// cost of constructing placeholder values.
#define GET_GLOBALISEL_TEMPORARIES_DECL
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_DECL
};
465 
466 } // end anonymous namespace
467 
468 #define GET_GLOBALISEL_IMPL
469 #include "AArch64GenGlobalISel.inc"
470 #undef GET_GLOBALISEL_IMPL
471 
/// Bind the selector to its target objects.
///
/// \p TM, \p STI and \p RBI are stored by reference; the instruction info and
/// register info are fetched from \p STI. The member-initializer list ends in
/// a trailing comma on purpose: the remaining initializers come from the
/// PREDICATES_INIT and TEMPORARIES_INIT sections of AArch64GenGlobalISel.inc,
/// which the preprocessor splices in directly below.
AArch64InstructionSelector::AArch64InstructionSelector(
    const AArch64TargetMachine &TM, const AArch64Subtarget &STI,
    const AArch64RegisterBankInfo &RBI)
    : TM(TM), STI(STI), TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()),
      RBI(RBI),
#define GET_GLOBALISEL_PREDICATES_INIT
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_INIT
{
}
485 
486 // FIXME: This should be target-independent, inferred from the types declared
487 // for each class in the bank.
488 static const TargetRegisterClass *
489 getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB,
490                          const RegisterBankInfo &RBI,
491                          bool GetAllRegSet = false) {
492   if (RB.getID() == AArch64::GPRRegBankID) {
493     if (Ty.getSizeInBits() <= 32)
494       return GetAllRegSet ? &AArch64::GPR32allRegClass
495                           : &AArch64::GPR32RegClass;
496     if (Ty.getSizeInBits() == 64)
497       return GetAllRegSet ? &AArch64::GPR64allRegClass
498                           : &AArch64::GPR64RegClass;
499     if (Ty.getSizeInBits() == 128)
500       return &AArch64::XSeqPairsClassRegClass;
501     return nullptr;
502   }
503 
504   if (RB.getID() == AArch64::FPRRegBankID) {
505     switch (Ty.getSizeInBits()) {
506     case 8:
507       return &AArch64::FPR8RegClass;
508     case 16:
509       return &AArch64::FPR16RegClass;
510     case 32:
511       return &AArch64::FPR32RegClass;
512     case 64:
513       return &AArch64::FPR64RegClass;
514     case 128:
515       return &AArch64::FPR128RegClass;
516     }
517     return nullptr;
518   }
519 
520   return nullptr;
521 }
522 
523 /// Given a register bank, and size in bits, return the smallest register class
524 /// that can represent that combination.
525 static const TargetRegisterClass *
526 getMinClassForRegBank(const RegisterBank &RB, unsigned SizeInBits,
527                       bool GetAllRegSet = false) {
528   unsigned RegBankID = RB.getID();
529 
530   if (RegBankID == AArch64::GPRRegBankID) {
531     if (SizeInBits <= 32)
532       return GetAllRegSet ? &AArch64::GPR32allRegClass
533                           : &AArch64::GPR32RegClass;
534     if (SizeInBits == 64)
535       return GetAllRegSet ? &AArch64::GPR64allRegClass
536                           : &AArch64::GPR64RegClass;
537     if (SizeInBits == 128)
538       return &AArch64::XSeqPairsClassRegClass;
539   }
540 
541   if (RegBankID == AArch64::FPRRegBankID) {
542     switch (SizeInBits) {
543     default:
544       return nullptr;
545     case 8:
546       return &AArch64::FPR8RegClass;
547     case 16:
548       return &AArch64::FPR16RegClass;
549     case 32:
550       return &AArch64::FPR32RegClass;
551     case 64:
552       return &AArch64::FPR64RegClass;
553     case 128:
554       return &AArch64::FPR128RegClass;
555     }
556   }
557 
558   return nullptr;
559 }
560 
561 /// Returns the correct subregister to use for a given register class.
562 static bool getSubRegForClass(const TargetRegisterClass *RC,
563                               const TargetRegisterInfo &TRI, unsigned &SubReg) {
564   switch (TRI.getRegSizeInBits(*RC)) {
565   case 8:
566     SubReg = AArch64::bsub;
567     break;
568   case 16:
569     SubReg = AArch64::hsub;
570     break;
571   case 32:
572     if (RC != &AArch64::FPR32RegClass)
573       SubReg = AArch64::sub_32;
574     else
575       SubReg = AArch64::ssub;
576     break;
577   case 64:
578     SubReg = AArch64::dsub;
579     break;
580   default:
581     LLVM_DEBUG(
582         dbgs() << "Couldn't find appropriate subregister for register class.");
583     return false;
584   }
585 
586   return true;
587 }
588 
589 /// Returns the minimum size the given register bank can hold.
590 static unsigned getMinSizeForRegBank(const RegisterBank &RB) {
591   switch (RB.getID()) {
592   case AArch64::GPRRegBankID:
593     return 32;
594   case AArch64::FPRRegBankID:
595     return 8;
596   default:
597     llvm_unreachable("Tried to get minimum size for unknown register bank.");
598   }
599 }
600 
601 /// Create a REG_SEQUENCE instruction using the registers in \p Regs.
602 /// Helper function for functions like createDTuple and createQTuple.
603 ///
604 /// \p RegClassIDs - The list of register class IDs available for some tuple of
605 /// a scalar class. E.g. QQRegClassID, QQQRegClassID, QQQQRegClassID. This is
606 /// expected to contain between 2 and 4 tuple classes.
607 ///
608 /// \p SubRegs - The list of subregister classes associated with each register
609 /// class ID in \p RegClassIDs. E.g., QQRegClassID should use the qsub0
610 /// subregister class. The index of each subregister class is expected to
611 /// correspond with the index of each register class.
612 ///
613 /// \returns Either the destination register of REG_SEQUENCE instruction that
614 /// was created, or the 0th element of \p Regs if \p Regs contains a single
615 /// element.
616 static Register createTuple(ArrayRef<Register> Regs,
617                             const unsigned RegClassIDs[],
618                             const unsigned SubRegs[], MachineIRBuilder &MIB) {
619   unsigned NumRegs = Regs.size();
620   if (NumRegs == 1)
621     return Regs[0];
622   assert(NumRegs >= 2 && NumRegs <= 4 &&
623          "Only support between two and 4 registers in a tuple!");
624   const TargetRegisterInfo *TRI = MIB.getMF().getSubtarget().getRegisterInfo();
625   auto *DesiredClass = TRI->getRegClass(RegClassIDs[NumRegs - 2]);
626   auto RegSequence =
627       MIB.buildInstr(TargetOpcode::REG_SEQUENCE, {DesiredClass}, {});
628   for (unsigned I = 0, E = Regs.size(); I < E; ++I) {
629     RegSequence.addUse(Regs[I]);
630     RegSequence.addImm(SubRegs[I]);
631   }
632   return RegSequence.getReg(0);
633 }
634 
635 /// Create a tuple of D-registers using the registers in \p Regs.
636 static Register createDTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) {
637   static const unsigned RegClassIDs[] = {
638       AArch64::DDRegClassID, AArch64::DDDRegClassID, AArch64::DDDDRegClassID};
639   static const unsigned SubRegs[] = {AArch64::dsub0, AArch64::dsub1,
640                                      AArch64::dsub2, AArch64::dsub3};
641   return createTuple(Regs, RegClassIDs, SubRegs, MIB);
642 }
643 
644 /// Create a tuple of Q-registers using the registers in \p Regs.
645 static Register createQTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) {
646   static const unsigned RegClassIDs[] = {
647       AArch64::QQRegClassID, AArch64::QQQRegClassID, AArch64::QQQQRegClassID};
648   static const unsigned SubRegs[] = {AArch64::qsub0, AArch64::qsub1,
649                                      AArch64::qsub2, AArch64::qsub3};
650   return createTuple(Regs, RegClassIDs, SubRegs, MIB);
651 }
652 
653 static Optional<uint64_t> getImmedFromMO(const MachineOperand &Root) {
654   auto &MI = *Root.getParent();
655   auto &MBB = *MI.getParent();
656   auto &MF = *MBB.getParent();
657   auto &MRI = MF.getRegInfo();
658   uint64_t Immed;
659   if (Root.isImm())
660     Immed = Root.getImm();
661   else if (Root.isCImm())
662     Immed = Root.getCImm()->getZExtValue();
663   else if (Root.isReg()) {
664     auto ValAndVReg =
665         getIConstantVRegValWithLookThrough(Root.getReg(), MRI, true);
666     if (!ValAndVReg)
667       return None;
668     Immed = ValAndVReg->Value.getSExtValue();
669   } else
670     return None;
671   return Immed;
672 }
673 
674 /// Check whether \p I is a currently unsupported binary operation:
675 /// - it has an unsized type
676 /// - an operand is not a vreg
677 /// - all operands are not in the same bank
678 /// These are checks that should someday live in the verifier, but right now,
679 /// these are mostly limitations of the aarch64 selector.
680 static bool unsupportedBinOp(const MachineInstr &I,
681                              const AArch64RegisterBankInfo &RBI,
682                              const MachineRegisterInfo &MRI,
683                              const AArch64RegisterInfo &TRI) {
684   LLT Ty = MRI.getType(I.getOperand(0).getReg());
685   if (!Ty.isValid()) {
686     LLVM_DEBUG(dbgs() << "Generic binop register should be typed\n");
687     return true;
688   }
689 
690   const RegisterBank *PrevOpBank = nullptr;
691   for (auto &MO : I.operands()) {
692     // FIXME: Support non-register operands.
693     if (!MO.isReg()) {
694       LLVM_DEBUG(dbgs() << "Generic inst non-reg operands are unsupported\n");
695       return true;
696     }
697 
698     // FIXME: Can generic operations have physical registers operands? If
699     // so, this will need to be taught about that, and we'll need to get the
700     // bank out of the minimal class for the register.
701     // Either way, this needs to be documented (and possibly verified).
702     if (!Register::isVirtualRegister(MO.getReg())) {
703       LLVM_DEBUG(dbgs() << "Generic inst has physical register operand\n");
704       return true;
705     }
706 
707     const RegisterBank *OpBank = RBI.getRegBank(MO.getReg(), MRI, TRI);
708     if (!OpBank) {
709       LLVM_DEBUG(dbgs() << "Generic register has no bank or class\n");
710       return true;
711     }
712 
713     if (PrevOpBank && OpBank != PrevOpBank) {
714       LLVM_DEBUG(dbgs() << "Generic inst operands have different banks\n");
715       return true;
716     }
717     PrevOpBank = OpBank;
718   }
719   return false;
720 }
721 
722 /// Select the AArch64 opcode for the basic binary operation \p GenericOpc
723 /// (such as G_OR or G_SDIV), appropriate for the register bank \p RegBankID
724 /// and of size \p OpSize.
725 /// \returns \p GenericOpc if the combination is unsupported.
726 static unsigned selectBinaryOp(unsigned GenericOpc, unsigned RegBankID,
727                                unsigned OpSize) {
728   switch (RegBankID) {
729   case AArch64::GPRRegBankID:
730     if (OpSize == 32) {
731       switch (GenericOpc) {
732       case TargetOpcode::G_SHL:
733         return AArch64::LSLVWr;
734       case TargetOpcode::G_LSHR:
735         return AArch64::LSRVWr;
736       case TargetOpcode::G_ASHR:
737         return AArch64::ASRVWr;
738       default:
739         return GenericOpc;
740       }
741     } else if (OpSize == 64) {
742       switch (GenericOpc) {
743       case TargetOpcode::G_PTR_ADD:
744         return AArch64::ADDXrr;
745       case TargetOpcode::G_SHL:
746         return AArch64::LSLVXr;
747       case TargetOpcode::G_LSHR:
748         return AArch64::LSRVXr;
749       case TargetOpcode::G_ASHR:
750         return AArch64::ASRVXr;
751       default:
752         return GenericOpc;
753       }
754     }
755     break;
756   case AArch64::FPRRegBankID:
757     switch (OpSize) {
758     case 32:
759       switch (GenericOpc) {
760       case TargetOpcode::G_FADD:
761         return AArch64::FADDSrr;
762       case TargetOpcode::G_FSUB:
763         return AArch64::FSUBSrr;
764       case TargetOpcode::G_FMUL:
765         return AArch64::FMULSrr;
766       case TargetOpcode::G_FDIV:
767         return AArch64::FDIVSrr;
768       default:
769         return GenericOpc;
770       }
771     case 64:
772       switch (GenericOpc) {
773       case TargetOpcode::G_FADD:
774         return AArch64::FADDDrr;
775       case TargetOpcode::G_FSUB:
776         return AArch64::FSUBDrr;
777       case TargetOpcode::G_FMUL:
778         return AArch64::FMULDrr;
779       case TargetOpcode::G_FDIV:
780         return AArch64::FDIVDrr;
781       case TargetOpcode::G_OR:
782         return AArch64::ORRv8i8;
783       default:
784         return GenericOpc;
785       }
786     }
787     break;
788   }
789   return GenericOpc;
790 }
791 
792 /// Select the AArch64 opcode for the G_LOAD or G_STORE operation \p GenericOpc,
793 /// appropriate for the (value) register bank \p RegBankID and of memory access
794 /// size \p OpSize.  This returns the variant with the base+unsigned-immediate
795 /// addressing mode (e.g., LDRXui).
796 /// \returns \p GenericOpc if the combination is unsupported.
797 static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID,
798                                     unsigned OpSize) {
799   const bool isStore = GenericOpc == TargetOpcode::G_STORE;
800   switch (RegBankID) {
801   case AArch64::GPRRegBankID:
802     switch (OpSize) {
803     case 8:
804       return isStore ? AArch64::STRBBui : AArch64::LDRBBui;
805     case 16:
806       return isStore ? AArch64::STRHHui : AArch64::LDRHHui;
807     case 32:
808       return isStore ? AArch64::STRWui : AArch64::LDRWui;
809     case 64:
810       return isStore ? AArch64::STRXui : AArch64::LDRXui;
811     }
812     break;
813   case AArch64::FPRRegBankID:
814     switch (OpSize) {
815     case 8:
816       return isStore ? AArch64::STRBui : AArch64::LDRBui;
817     case 16:
818       return isStore ? AArch64::STRHui : AArch64::LDRHui;
819     case 32:
820       return isStore ? AArch64::STRSui : AArch64::LDRSui;
821     case 64:
822       return isStore ? AArch64::STRDui : AArch64::LDRDui;
823     case 128:
824       return isStore ? AArch64::STRQui : AArch64::LDRQui;
825     }
826     break;
827   }
828   return GenericOpc;
829 }
830 
831 #ifndef NDEBUG
832 /// Helper function that verifies that we have a valid copy at the end of
833 /// selectCopy. Verifies that the source and dest have the expected sizes and
834 /// then returns true.
835 static bool isValidCopy(const MachineInstr &I, const RegisterBank &DstBank,
836                         const MachineRegisterInfo &MRI,
837                         const TargetRegisterInfo &TRI,
838                         const RegisterBankInfo &RBI) {
839   const Register DstReg = I.getOperand(0).getReg();
840   const Register SrcReg = I.getOperand(1).getReg();
841   const unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI);
842   const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
843 
844   // Make sure the size of the source and dest line up.
845   assert(
846       (DstSize == SrcSize ||
847        // Copies are a mean to setup initial types, the number of
848        // bits may not exactly match.
849        (Register::isPhysicalRegister(SrcReg) && DstSize <= SrcSize) ||
850        // Copies are a mean to copy bits around, as long as we are
851        // on the same register class, that's fine. Otherwise, that
852        // means we need some SUBREG_TO_REG or AND & co.
853        (((DstSize + 31) / 32 == (SrcSize + 31) / 32) && DstSize > SrcSize)) &&
854       "Copy with different width?!");
855 
856   // Check the size of the destination.
857   assert((DstSize <= 64 || DstBank.getID() == AArch64::FPRRegBankID) &&
858          "GPRs cannot get more than 64-bit width values");
859 
860   return true;
861 }
862 #endif
863 
864 /// Helper function for selectCopy. Inserts a subregister copy from \p SrcReg
865 /// to \p *To.
866 ///
867 /// E.g "To = COPY SrcReg:SubReg"
868 static bool copySubReg(MachineInstr &I, MachineRegisterInfo &MRI,
869                        const RegisterBankInfo &RBI, Register SrcReg,
870                        const TargetRegisterClass *To, unsigned SubReg) {
871   assert(SrcReg.isValid() && "Expected a valid source register?");
872   assert(To && "Destination register class cannot be null");
873   assert(SubReg && "Expected a valid subregister");
874 
875   MachineIRBuilder MIB(I);
876   auto SubRegCopy =
877       MIB.buildInstr(TargetOpcode::COPY, {To}, {}).addReg(SrcReg, 0, SubReg);
878   MachineOperand &RegOp = I.getOperand(1);
879   RegOp.setReg(SubRegCopy.getReg(0));
880 
881   // It's possible that the destination register won't be constrained. Make
882   // sure that happens.
883   if (!Register::isPhysicalRegister(I.getOperand(0).getReg()))
884     RBI.constrainGenericRegister(I.getOperand(0).getReg(), *To, MRI);
885 
886   return true;
887 }
888 
889 /// Helper function to get the source and destination register classes for a
890 /// copy. Returns a std::pair containing the source register class for the
891 /// copy, and the destination register class for the copy. If a register class
892 /// cannot be determined, then it will be nullptr.
893 static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
894 getRegClassesForCopy(MachineInstr &I, const TargetInstrInfo &TII,
895                      MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
896                      const RegisterBankInfo &RBI) {
897   Register DstReg = I.getOperand(0).getReg();
898   Register SrcReg = I.getOperand(1).getReg();
899   const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
900   const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
901   unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI);
902   unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
903 
904   // Special casing for cross-bank copies of s1s. We can technically represent
905   // a 1-bit value with any size of register. The minimum size for a GPR is 32
906   // bits. So, we need to put the FPR on 32 bits as well.
907   //
908   // FIXME: I'm not sure if this case holds true outside of copies. If it does,
909   // then we can pull it into the helpers that get the appropriate class for a
910   // register bank. Or make a new helper that carries along some constraint
911   // information.
912   if (SrcRegBank != DstRegBank && (DstSize == 1 && SrcSize == 1))
913     SrcSize = DstSize = 32;
914 
915   return {getMinClassForRegBank(SrcRegBank, SrcSize, true),
916           getMinClassForRegBank(DstRegBank, DstSize, true)};
917 }
918 
/// Select \p I into a target COPY.
///
/// \p I is either already a COPY or a copy-like generic instruction (G_ZEXT is
/// handled explicitly at the end). The destination register is constrained to
/// the class chosen by getRegClassesForCopy. For real copies whose source and
/// destination classes differ in size, an additional subregister copy or
/// SUBREG_TO_REG is emitted and operand 1 of \p I is rewritten to read its
/// result.
///
/// \returns false if a required register class could not be determined or the
/// destination register could not be constrained, true otherwise.
static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
                       MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
                       const RegisterBankInfo &RBI) {
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
  const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);

  // Find the correct register classes for the source and destination registers.
  const TargetRegisterClass *SrcRC;
  const TargetRegisterClass *DstRC;
  std::tie(SrcRC, DstRC) = getRegClassesForCopy(I, TII, MRI, TRI, RBI);

  // A destination class is always required; the source class is only needed
  // for real copies and is checked further down.
  if (!DstRC) {
    LLVM_DEBUG(dbgs() << "Unexpected dest size "
                      << RBI.getSizeInBits(DstReg, MRI, TRI) << '\n');
    return false;
  }

  // A couple helpers below, for making sure that the copy we produce is valid.

  // Set to true if we insert a SUBREG_TO_REG. If we do this, then we don't want
  // to verify that the src and dst are the same size, since that's handled by
  // the SUBREG_TO_REG.
  bool KnownValid = false;

  // Returns true, or asserts if something we don't expect happens. Instead of
  // returning true, we return isValidCopy() to ensure that we verify the
  // result.
  auto CheckCopy = [&]() {
    // If we have a bitcast or something, we can't have physical registers.
    assert((I.isCopy() ||
            (!Register::isPhysicalRegister(I.getOperand(0).getReg()) &&
             !Register::isPhysicalRegister(I.getOperand(1).getReg()))) &&
           "No phys reg on generic operator!");
    bool ValidCopy = true;
#ifndef NDEBUG
    ValidCopy = KnownValid || isValidCopy(I, DstRegBank, MRI, TRI, RBI);
    assert(ValidCopy && "Invalid copy.");
#endif
    // Keeps KnownValid referenced when the NDEBUG-only code above is compiled
    // out.
    (void)KnownValid;
    return ValidCopy;
  };

  // Is this a copy? If so, then we may need to insert a subregister copy.
  if (I.isCopy()) {
    // Yes. Check if there's anything to fix up.
    if (!SrcRC) {
      LLVM_DEBUG(dbgs() << "Couldn't determine source register class\n");
      return false;
    }

    // From here on the sizes are those of the chosen register classes.
    unsigned SrcSize = TRI.getRegSizeInBits(*SrcRC);
    unsigned DstSize = TRI.getRegSizeInBits(*DstRC);
    // Filled in by getSubRegForClass in whichever branch below is taken.
    unsigned SubReg;

    // If the source bank doesn't support a subregister copy small enough,
    // then we first need to copy to the destination bank.
    if (getMinSizeForRegBank(SrcRegBank) > DstSize) {
      const TargetRegisterClass *DstTempRC =
          getMinClassForRegBank(DstRegBank, SrcSize, /* GetAllRegSet */ true);
      getSubRegForClass(DstRC, TRI, SubReg);

      // Full-width cross-bank copy first, then take the subregister of that.
      MachineIRBuilder MIB(I);
      auto Copy = MIB.buildCopy({DstTempRC}, {SrcReg});
      copySubReg(I, MRI, RBI, Copy.getReg(0), DstRC, SubReg);
    } else if (SrcSize > DstSize) {
      // If the source register is bigger than the destination we need to
      // perform a subregister copy.
      const TargetRegisterClass *SubRegRC =
          getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true);
      getSubRegForClass(SubRegRC, TRI, SubReg);
      copySubReg(I, MRI, RBI, SrcReg, DstRC, SubReg);
    } else if (DstSize > SrcSize) {
      // If the destination register is bigger than the source we need to do
      // a promotion using SUBREG_TO_REG.
      const TargetRegisterClass *PromotionRC =
          getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true);
      getSubRegForClass(SrcRC, TRI, SubReg);

      Register PromoteReg = MRI.createVirtualRegister(PromotionRC);
      BuildMI(*I.getParent(), I, I.getDebugLoc(),
              TII.get(AArch64::SUBREG_TO_REG), PromoteReg)
          .addImm(0)
          .addUse(SrcReg)
          .addImm(SubReg);
      // Make the copy read the promoted register instead of the original
      // source.
      MachineOperand &RegOp = I.getOperand(1);
      RegOp.setReg(PromoteReg);

      // Promise that the copy is implicitly validated by the SUBREG_TO_REG.
      KnownValid = true;
    }

    // If the destination is a physical register, then there's nothing to
    // change, so we're done.
    if (Register::isPhysicalRegister(DstReg))
      return CheckCopy();
  }

  // No need to constrain SrcReg. It will get constrained when we hit another
  // of its use or its defs. Copies do not have constraints.
  if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
    LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
                      << " operand\n");
    return false;
  }

  // If this is a GPR G_ZEXT, just reduce it down into a copy and select that
  // copy by re-entering this function. The sizes will be mismatched with the
  // source < 32b but that's ok.
  if (I.getOpcode() == TargetOpcode::G_ZEXT) {
    I.setDesc(TII.get(AArch64::COPY));
    assert(SrcRegBank.getID() == AArch64::GPRRegBankID);
    return selectCopy(I, TII, MRI, TRI, RBI);
  }

  I.setDesc(TII.get(AArch64::COPY));
  return CheckCopy();
}
1037 
1038 static unsigned selectFPConvOpc(unsigned GenericOpc, LLT DstTy, LLT SrcTy) {
1039   if (!DstTy.isScalar() || !SrcTy.isScalar())
1040     return GenericOpc;
1041 
1042   const unsigned DstSize = DstTy.getSizeInBits();
1043   const unsigned SrcSize = SrcTy.getSizeInBits();
1044 
1045   switch (DstSize) {
1046   case 32:
1047     switch (SrcSize) {
1048     case 32:
1049       switch (GenericOpc) {
1050       case TargetOpcode::G_SITOFP:
1051         return AArch64::SCVTFUWSri;
1052       case TargetOpcode::G_UITOFP:
1053         return AArch64::UCVTFUWSri;
1054       case TargetOpcode::G_FPTOSI:
1055         return AArch64::FCVTZSUWSr;
1056       case TargetOpcode::G_FPTOUI:
1057         return AArch64::FCVTZUUWSr;
1058       default:
1059         return GenericOpc;
1060       }
1061     case 64:
1062       switch (GenericOpc) {
1063       case TargetOpcode::G_SITOFP:
1064         return AArch64::SCVTFUXSri;
1065       case TargetOpcode::G_UITOFP:
1066         return AArch64::UCVTFUXSri;
1067       case TargetOpcode::G_FPTOSI:
1068         return AArch64::FCVTZSUWDr;
1069       case TargetOpcode::G_FPTOUI:
1070         return AArch64::FCVTZUUWDr;
1071       default:
1072         return GenericOpc;
1073       }
1074     default:
1075       return GenericOpc;
1076     }
1077   case 64:
1078     switch (SrcSize) {
1079     case 32:
1080       switch (GenericOpc) {
1081       case TargetOpcode::G_SITOFP:
1082         return AArch64::SCVTFUWDri;
1083       case TargetOpcode::G_UITOFP:
1084         return AArch64::UCVTFUWDri;
1085       case TargetOpcode::G_FPTOSI:
1086         return AArch64::FCVTZSUXSr;
1087       case TargetOpcode::G_FPTOUI:
1088         return AArch64::FCVTZUUXSr;
1089       default:
1090         return GenericOpc;
1091       }
1092     case 64:
1093       switch (GenericOpc) {
1094       case TargetOpcode::G_SITOFP:
1095         return AArch64::SCVTFUXDri;
1096       case TargetOpcode::G_UITOFP:
1097         return AArch64::UCVTFUXDri;
1098       case TargetOpcode::G_FPTOSI:
1099         return AArch64::FCVTZSUXDr;
1100       case TargetOpcode::G_FPTOUI:
1101         return AArch64::FCVTZUUXDr;
1102       default:
1103         return GenericOpc;
1104       }
1105     default:
1106       return GenericOpc;
1107     }
1108   default:
1109     return GenericOpc;
1110   };
1111   return GenericOpc;
1112 }
1113 
/// Emit a conditional-select instruction computing
/// \p Dst = \p CC ? \p True : \p False.
///
/// Scalars that are not on the GPR bank are selected with FCSEL. GPR scalars
/// default to CSEL, but the helpers below first try to fold a negate / not /
/// add-one feeding one of the operands, or constant 0 / 1 / -1 operands, into
/// CSNEG / CSINV / CSINC. Those folds may rewrite the local True / False
/// registers and invert the local condition code.
///
/// \returns the emitted instruction, or nullptr if the type of \p True is a
/// vector. Asserts that both operands share a register bank and that the type
/// is 32 or 64 bits wide.
MachineInstr *
AArch64InstructionSelector::emitSelect(Register Dst, Register True,
                                       Register False, AArch64CC::CondCode CC,
                                       MachineIRBuilder &MIB) const {
  MachineRegisterInfo &MRI = *MIB.getMRI();
  assert(RBI.getRegBank(False, MRI, TRI)->getID() ==
             RBI.getRegBank(True, MRI, TRI)->getID() &&
         "Expected both select operands to have the same regbank?");
  LLT Ty = MRI.getType(True);
  if (Ty.isVector())
    return nullptr;
  const unsigned Size = Ty.getSizeInBits();
  assert((Size == 32 || Size == 64) &&
         "Expected 32 bit or 64 bit select only?");
  const bool Is32Bit = Size == 32;
  // Anything that is not on the GPR bank is selected with FCSEL; none of the
  // folds below apply to it.
  if (RBI.getRegBank(True, MRI, TRI)->getID() != AArch64::GPRRegBankID) {
    unsigned Opc = Is32Bit ? AArch64::FCSELSrrr : AArch64::FCSELDrrr;
    auto FCSel = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC);
    constrainSelectedInstRegOperands(*FCSel, TII, TRI, RBI);
    return &*FCSel;
  }

  // By default, we'll try and emit a CSEL.
  unsigned Opc = Is32Bit ? AArch64::CSELWr : AArch64::CSELXr;
  // Set once any fold has succeeded; every later fold attempt is then skipped.
  bool Optimized = false;
  // Tries to fold the instruction defining Reg into the select. On success Opc
  // is replaced and Reg is rewritten to the folded instruction's input. With
  // Invert set, the condition code is also inverted and Reg / OtherReg are
  // swapped, so the folded value ends up in the second select operand.
  auto TryFoldBinOpIntoSelect = [&Opc, Is32Bit, &CC, &MRI,
                                 &Optimized](Register &Reg, Register &OtherReg,
                                             bool Invert) {
    if (Optimized)
      return false;

    // Attempt to fold:
    //
    // %sub = G_SUB 0, %x
    // %select = G_SELECT cc, %reg, %sub
    //
    // Into:
    // %select = CSNEG %reg, %x, cc
    Register MatchReg;
    if (mi_match(Reg, MRI, m_Neg(m_Reg(MatchReg)))) {
      Opc = Is32Bit ? AArch64::CSNEGWr : AArch64::CSNEGXr;
      Reg = MatchReg;
      if (Invert) {
        CC = AArch64CC::getInvertedCondCode(CC);
        std::swap(Reg, OtherReg);
      }
      return true;
    }

    // Attempt to fold:
    //
    // %xor = G_XOR %x, -1
    // %select = G_SELECT cc, %reg, %xor
    //
    // Into:
    // %select = CSINV %reg, %x, cc
    if (mi_match(Reg, MRI, m_Not(m_Reg(MatchReg)))) {
      Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
      Reg = MatchReg;
      if (Invert) {
        CC = AArch64CC::getInvertedCondCode(CC);
        std::swap(Reg, OtherReg);
      }
      return true;
    }

    // Attempt to fold:
    //
    // %add = G_ADD %x, 1
    // %select = G_SELECT cc, %reg, %add
    //
    // Into:
    // %select = CSINC %reg, %x, cc
    //
    // A G_PTR_ADD of 1 is matched the same way.
    if (mi_match(Reg, MRI,
                 m_any_of(m_GAdd(m_Reg(MatchReg), m_SpecificICst(1)),
                          m_GPtrAdd(m_Reg(MatchReg), m_SpecificICst(1))))) {
      Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
      Reg = MatchReg;
      if (Invert) {
        CC = AArch64CC::getInvertedCondCode(CC);
        std::swap(Reg, OtherReg);
      }
      return true;
    }

    return false;
  };

  // Helper lambda which tries to use CSINC/CSINV for the instruction when its
  // true/false values are constants.
  // FIXME: All of these patterns already exist in tablegen. We should be
  // able to import these.
  auto TryOptSelectCst = [&Opc, &True, &False, &CC, Is32Bit, &MRI,
                          &Optimized]() {
    if (Optimized)
      return false;
    auto TrueCst = getIConstantVRegValWithLookThrough(True, MRI);
    auto FalseCst = getIConstantVRegValWithLookThrough(False, MRI);
    if (!TrueCst && !FalseCst)
      return false;

    Register ZReg = Is32Bit ? AArch64::WZR : AArch64::XZR;
    if (TrueCst && FalseCst) {
      int64_t T = TrueCst->Value.getSExtValue();
      int64_t F = FalseCst->Value.getSExtValue();

      if (T == 0 && F == 1) {
        // G_SELECT cc, 0, 1 -> CSINC zreg, zreg, cc
        Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
        True = ZReg;
        False = ZReg;
        return true;
      }

      if (T == 0 && F == -1) {
        // G_SELECT cc, 0, -1 -> CSINV zreg, zreg, cc
        Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
        True = ZReg;
        False = ZReg;
        return true;
      }
    }

    if (TrueCst) {
      int64_t T = TrueCst->Value.getSExtValue();
      if (T == 1) {
        // G_SELECT cc, 1, f -> CSINC f, zreg, inv_cc
        Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
        True = False;
        False = ZReg;
        CC = AArch64CC::getInvertedCondCode(CC);
        return true;
      }

      if (T == -1) {
        // G_SELECT cc, -1, f -> CSINV f, zreg, inv_cc
        Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
        True = False;
        False = ZReg;
        CC = AArch64CC::getInvertedCondCode(CC);
        return true;
      }
    }

    if (FalseCst) {
      int64_t F = FalseCst->Value.getSExtValue();
      if (F == 1) {
        // G_SELECT cc, t, 1 -> CSINC t, zreg, cc
        Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
        False = ZReg;
        return true;
      }

      if (F == -1) {
        // G_SELECT cc, t, -1 -> CSINV t, zreg, cc
        Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
        False = ZReg;
        return true;
      }
    }
    return false;
  };

  // Try the folds in order: the false operand, then the true operand (which
  // needs the condition inverted), then constant operands. At most one fold is
  // applied because each helper bails out once Optimized is set.
  Optimized |= TryFoldBinOpIntoSelect(False, True, /*Invert = */ false);
  Optimized |= TryFoldBinOpIntoSelect(True, False, /*Invert = */ true);
  Optimized |= TryOptSelectCst();
  auto SelectInst = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC);
  constrainSelectedInstRegOperands(*SelectInst, TII, TRI, RBI);
  return &*SelectInst;
}
1284 
1285 static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) {
1286   switch (P) {
1287   default:
1288     llvm_unreachable("Unknown condition code!");
1289   case CmpInst::ICMP_NE:
1290     return AArch64CC::NE;
1291   case CmpInst::ICMP_EQ:
1292     return AArch64CC::EQ;
1293   case CmpInst::ICMP_SGT:
1294     return AArch64CC::GT;
1295   case CmpInst::ICMP_SGE:
1296     return AArch64CC::GE;
1297   case CmpInst::ICMP_SLT:
1298     return AArch64CC::LT;
1299   case CmpInst::ICMP_SLE:
1300     return AArch64CC::LE;
1301   case CmpInst::ICMP_UGT:
1302     return AArch64CC::HI;
1303   case CmpInst::ICMP_UGE:
1304     return AArch64CC::HS;
1305   case CmpInst::ICMP_ULT:
1306     return AArch64CC::LO;
1307   case CmpInst::ICMP_ULE:
1308     return AArch64CC::LS;
1309   }
1310 }
1311 
/// Return a register which can be used as a bit to test in a TB(N)Z.
///
/// Starting from \p Reg, walks up the chain of defining instructions (ignoring
/// copies) through extensions, truncations, and G_AND / G_XOR / shifts with a
/// constant operand, for as long as testing a bit of the instruction's input
/// is equivalent to testing bit \p Bit of its result.
///
/// \param Reg    Register whose bit \p Bit is to be tested.
/// \param Bit    In/out. Index of the bit to test; adjusted when the walk
///               passes through a shift.
/// \param Invert In/out. Flipped when the walk passes through a G_XOR whose
///               constant has the tested bit set.
/// \param MRI    Register info used to look up definitions, uses and types.
/// \returns the register furthest up the chain that the walk reached, which
/// is \p Reg itself if nothing could be folded.
static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert,
                              MachineRegisterInfo &MRI) {
  assert(Reg.isValid() && "Expected valid register!");
  // Whether the walk has passed through a G_ZEXT; this decides below whether
  // G_AND / G_XOR constants are read zero- or sign-extended.
  bool HasZext = false;
  while (MachineInstr *MI = getDefIgnoringCopies(Reg, MRI)) {
    unsigned Opc = MI->getOpcode();

    // Only walk through instructions whose result has a single non-debug use.
    if (!MI->getOperand(0).isReg() ||
        !MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
      break;

    // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
    //
    // (tbz (trunc x), b) -> (tbz x, b) is always safe, because the bit number
    // on the truncated x is the same as the bit number on x.
    if (Opc == TargetOpcode::G_ANYEXT || Opc == TargetOpcode::G_ZEXT ||
        Opc == TargetOpcode::G_TRUNC) {
      if (Opc == TargetOpcode::G_ZEXT)
        HasZext = true;

      Register NextReg = MI->getOperand(1).getReg();
      // Did we find something worth folding?
      if (!NextReg.isValid() || !MRI.hasOneNonDBGUse(NextReg))
        break;

      // NextReg is worth folding. Keep looking.
      Reg = NextReg;
      continue;
    }

    // Attempt to find a suitable operation with a constant on one side.
    Optional<uint64_t> C;
    Register TestReg;
    switch (Opc) {
    default:
      break;
    case TargetOpcode::G_AND:
    case TargetOpcode::G_XOR: {
      TestReg = MI->getOperand(1).getReg();
      Register ConstantReg = MI->getOperand(2).getReg();
      auto VRegAndVal = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
      if (!VRegAndVal) {
        // AND and XOR commute, check the other side for a constant.
        // FIXME: Can we canonicalize the constant so that it's always on the
        // same side at some point earlier?
        std::swap(ConstantReg, TestReg);
        VRegAndVal = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
      }
      if (VRegAndVal) {
        // Read the constant zero-extended once a G_ZEXT has been walked
        // through, sign-extended otherwise.
        if (HasZext)
          C = VRegAndVal->Value.getZExtValue();
        else
          C = VRegAndVal->Value.getSExtValue();
      }
      break;
    }
    case TargetOpcode::G_ASHR:
    case TargetOpcode::G_LSHR:
    case TargetOpcode::G_SHL: {
      // For shifts only the shift amount (operand 2) may be the constant.
      TestReg = MI->getOperand(1).getReg();
      auto VRegAndVal =
          getIConstantVRegValWithLookThrough(MI->getOperand(2).getReg(), MRI);
      if (VRegAndVal)
        C = VRegAndVal->Value.getSExtValue();
      break;
    }
    }

    // Didn't find a constant or viable register. Bail out of the loop.
    if (!C || !TestReg.isValid())
      break;

    // We found a suitable instruction with a constant. Check to see if we can
    // walk through the instruction.
    Register NextReg;
    unsigned TestRegSize = MRI.getType(TestReg).getSizeInBits();
    switch (Opc) {
    default:
      break;
    case TargetOpcode::G_AND:
      // (tbz (and x, m), b) -> (tbz x, b) when the b-th bit of m is set.
      if ((*C >> Bit) & 1)
        NextReg = TestReg;
      break;
    case TargetOpcode::G_SHL:
      // (tbz (shl x, c), b) -> (tbz x, b-c) when b-c is non-negative and fits
      // in the type of the register.
      if (*C <= Bit && (Bit - *C) < TestRegSize) {
        NextReg = TestReg;
        Bit = Bit - *C;
      }
      break;
    case TargetOpcode::G_ASHR:
      // (tbz (ashr x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is >= # bits
      // in x
      NextReg = TestReg;
      Bit = Bit + *C;
      if (Bit >= TestRegSize)
        Bit = TestRegSize - 1;
      break;
    case TargetOpcode::G_LSHR:
      // (tbz (lshr x, c), b) -> (tbz x, b+c) when b + c is < # bits in x
      if ((Bit + *C) < TestRegSize) {
        NextReg = TestReg;
        Bit = Bit + *C;
      }
      break;
    case TargetOpcode::G_XOR:
      // We can walk through a G_XOR by inverting whether we use tbz/tbnz when
      // appropriate.
      //
      // e.g. If x' = xor x, c, and the b-th bit is set in c then
      //
      // tbz x', b -> tbnz x, b
      //
      // Because x' only has the b-th bit set if x does not.
      if ((*C >> Bit) & 1)
        Invert = !Invert;
      NextReg = TestReg;
      break;
    }

    // Check if we found anything worth folding.
    if (!NextReg.isValid())
      return Reg;
    Reg = NextReg;
  }

  return Reg;
}
1443 
1444 MachineInstr *AArch64InstructionSelector::emitTestBit(
1445     Register TestReg, uint64_t Bit, bool IsNegative, MachineBasicBlock *DstMBB,
1446     MachineIRBuilder &MIB) const {
1447   assert(TestReg.isValid());
1448   assert(ProduceNonFlagSettingCondBr &&
1449          "Cannot emit TB(N)Z with speculation tracking!");
1450   MachineRegisterInfo &MRI = *MIB.getMRI();
1451 
1452   // Attempt to optimize the test bit by walking over instructions.
1453   TestReg = getTestBitReg(TestReg, Bit, IsNegative, MRI);
1454   LLT Ty = MRI.getType(TestReg);
1455   unsigned Size = Ty.getSizeInBits();
1456   assert(!Ty.isVector() && "Expected a scalar!");
1457   assert(Bit < 64 && "Bit is too large!");
1458 
1459   // When the test register is a 64-bit register, we have to narrow to make
1460   // TBNZW work.
1461   bool UseWReg = Bit < 32;
1462   unsigned NecessarySize = UseWReg ? 32 : 64;
1463   if (Size != NecessarySize)
1464     TestReg = moveScalarRegClass(
1465         TestReg, UseWReg ? AArch64::GPR32RegClass : AArch64::GPR64RegClass,
1466         MIB);
1467 
1468   static const unsigned OpcTable[2][2] = {{AArch64::TBZX, AArch64::TBNZX},
1469                                           {AArch64::TBZW, AArch64::TBNZW}};
1470   unsigned Opc = OpcTable[UseWReg][IsNegative];
1471   auto TestBitMI =
1472       MIB.buildInstr(Opc).addReg(TestReg).addImm(Bit).addMBB(DstMBB);
1473   constrainSelectedInstRegOperands(*TestBitMI, TII, TRI, RBI);
1474   return &*TestBitMI;
1475 }
1476 
1477 bool AArch64InstructionSelector::tryOptAndIntoCompareBranch(
1478     MachineInstr &AndInst, bool Invert, MachineBasicBlock *DstMBB,
1479     MachineIRBuilder &MIB) const {
1480   assert(AndInst.getOpcode() == TargetOpcode::G_AND && "Expected G_AND only?");
1481   // Given something like this:
1482   //
1483   //  %x = ...Something...
1484   //  %one = G_CONSTANT i64 1
1485   //  %zero = G_CONSTANT i64 0
1486   //  %and = G_AND %x, %one
1487   //  %cmp = G_ICMP intpred(ne), %and, %zero
1488   //  %cmp_trunc = G_TRUNC %cmp
1489   //  G_BRCOND %cmp_trunc, %bb.3
1490   //
1491   // We want to try and fold the AND into the G_BRCOND and produce either a
1492   // TBNZ (when we have intpred(ne)) or a TBZ (when we have intpred(eq)).
1493   //
1494   // In this case, we'd get
1495   //
1496   // TBNZ %x %bb.3
1497   //
1498 
1499   // Check if the AND has a constant on its RHS which we can use as a mask.
1500   // If it's a power of 2, then it's the same as checking a specific bit.
1501   // (e.g, ANDing with 8 == ANDing with 000...100 == testing if bit 3 is set)
1502   auto MaybeBit = getIConstantVRegValWithLookThrough(
1503       AndInst.getOperand(2).getReg(), *MIB.getMRI());
1504   if (!MaybeBit)
1505     return false;
1506 
1507   int32_t Bit = MaybeBit->Value.exactLogBase2();
1508   if (Bit < 0)
1509     return false;
1510 
1511   Register TestReg = AndInst.getOperand(1).getReg();
1512 
1513   // Emit a TB(N)Z.
1514   emitTestBit(TestReg, Bit, Invert, DstMBB, MIB);
1515   return true;
1516 }
1517 
1518 MachineInstr *AArch64InstructionSelector::emitCBZ(Register CompareReg,
1519                                                   bool IsNegative,
1520                                                   MachineBasicBlock *DestMBB,
1521                                                   MachineIRBuilder &MIB) const {
1522   assert(ProduceNonFlagSettingCondBr && "CBZ does not set flags!");
1523   MachineRegisterInfo &MRI = *MIB.getMRI();
1524   assert(RBI.getRegBank(CompareReg, MRI, TRI)->getID() ==
1525              AArch64::GPRRegBankID &&
1526          "Expected GPRs only?");
1527   auto Ty = MRI.getType(CompareReg);
1528   unsigned Width = Ty.getSizeInBits();
1529   assert(!Ty.isVector() && "Expected scalar only?");
1530   assert(Width <= 64 && "Expected width to be at most 64?");
1531   static const unsigned OpcTable[2][2] = {{AArch64::CBZW, AArch64::CBZX},
1532                                           {AArch64::CBNZW, AArch64::CBNZX}};
1533   unsigned Opc = OpcTable[IsNegative][Width == 64];
1534   auto BranchMI = MIB.buildInstr(Opc, {}, {CompareReg}).addMBB(DestMBB);
1535   constrainSelectedInstRegOperands(*BranchMI, TII, TRI, RBI);
1536   return &*BranchMI;
1537 }
1538 
1539 bool AArch64InstructionSelector::selectCompareBranchFedByFCmp(
1540     MachineInstr &I, MachineInstr &FCmp, MachineIRBuilder &MIB) const {
1541   assert(FCmp.getOpcode() == TargetOpcode::G_FCMP);
1542   assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1543   // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
1544   // totally clean.  Some of them require two branches to implement.
1545   auto Pred = (CmpInst::Predicate)FCmp.getOperand(1).getPredicate();
1546   emitFPCompare(FCmp.getOperand(2).getReg(), FCmp.getOperand(3).getReg(), MIB,
1547                 Pred);
1548   AArch64CC::CondCode CC1, CC2;
1549   changeFCMPPredToAArch64CC(static_cast<CmpInst::Predicate>(Pred), CC1, CC2);
1550   MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
1551   MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC1).addMBB(DestMBB);
1552   if (CC2 != AArch64CC::AL)
1553     MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC2).addMBB(DestMBB);
1554   I.eraseFromParent();
1555   return true;
1556 }
1557 
1558 bool AArch64InstructionSelector::tryOptCompareBranchFedByICmp(
1559     MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const {
1560   assert(ICmp.getOpcode() == TargetOpcode::G_ICMP);
1561   assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1562   // Attempt to optimize the G_BRCOND + G_ICMP into a TB(N)Z/CB(N)Z.
1563   //
1564   // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z
1565   // instructions will not be produced, as they are conditional branch
1566   // instructions that do not set flags.
1567   if (!ProduceNonFlagSettingCondBr)
1568     return false;
1569 
1570   MachineRegisterInfo &MRI = *MIB.getMRI();
1571   MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
1572   auto Pred =
1573       static_cast<CmpInst::Predicate>(ICmp.getOperand(1).getPredicate());
1574   Register LHS = ICmp.getOperand(2).getReg();
1575   Register RHS = ICmp.getOperand(3).getReg();
1576 
1577   // We're allowed to emit a TB(N)Z/CB(N)Z. Try to do that.
1578   auto VRegAndVal = getIConstantVRegValWithLookThrough(RHS, MRI);
1579   MachineInstr *AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI);
1580 
1581   // When we can emit a TB(N)Z, prefer that.
1582   //
1583   // Handle non-commutative condition codes first.
1584   // Note that we don't want to do this when we have a G_AND because it can
1585   // become a tst. The tst will make the test bit in the TB(N)Z redundant.
1586   if (VRegAndVal && !AndInst) {
1587     int64_t C = VRegAndVal->Value.getSExtValue();
1588 
1589     // When we have a greater-than comparison, we can just test if the msb is
1590     // zero.
1591     if (C == -1 && Pred == CmpInst::ICMP_SGT) {
1592       uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1;
1593       emitTestBit(LHS, Bit, /*IsNegative = */ false, DestMBB, MIB);
1594       I.eraseFromParent();
1595       return true;
1596     }
1597 
1598     // When we have a less than comparison, we can just test if the msb is not
1599     // zero.
1600     if (C == 0 && Pred == CmpInst::ICMP_SLT) {
1601       uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1;
1602       emitTestBit(LHS, Bit, /*IsNegative = */ true, DestMBB, MIB);
1603       I.eraseFromParent();
1604       return true;
1605     }
1606   }
1607 
1608   // Attempt to handle commutative condition codes. Right now, that's only
1609   // eq/ne.
1610   if (ICmpInst::isEquality(Pred)) {
1611     if (!VRegAndVal) {
1612       std::swap(RHS, LHS);
1613       VRegAndVal = getIConstantVRegValWithLookThrough(RHS, MRI);
1614       AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI);
1615     }
1616 
1617     if (VRegAndVal && VRegAndVal->Value == 0) {
1618       // If there's a G_AND feeding into this branch, try to fold it away by
1619       // emitting a TB(N)Z instead.
1620       //
1621       // Note: If we have LT, then it *is* possible to fold, but it wouldn't be
1622       // beneficial. When we have an AND and LT, we need a TST/ANDS, so folding
1623       // would be redundant.
1624       if (AndInst &&
1625           tryOptAndIntoCompareBranch(
1626               *AndInst, /*Invert = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB)) {
1627         I.eraseFromParent();
1628         return true;
1629       }
1630 
1631       // Otherwise, try to emit a CB(N)Z instead.
1632       auto LHSTy = MRI.getType(LHS);
1633       if (!LHSTy.isVector() && LHSTy.getSizeInBits() <= 64) {
1634         emitCBZ(LHS, /*IsNegative = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB);
1635         I.eraseFromParent();
1636         return true;
1637       }
1638     }
1639   }
1640 
1641   return false;
1642 }
1643 
1644 bool AArch64InstructionSelector::selectCompareBranchFedByICmp(
1645     MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const {
1646   assert(ICmp.getOpcode() == TargetOpcode::G_ICMP);
1647   assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1648   if (tryOptCompareBranchFedByICmp(I, ICmp, MIB))
1649     return true;
1650 
1651   // Couldn't optimize. Emit a compare + a Bcc.
1652   MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
1653   auto PredOp = ICmp.getOperand(1);
1654   emitIntegerCompare(ICmp.getOperand(2), ICmp.getOperand(3), PredOp, MIB);
1655   const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(
1656       static_cast<CmpInst::Predicate>(PredOp.getPredicate()));
1657   MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC).addMBB(DestMBB);
1658   I.eraseFromParent();
1659   return true;
1660 }
1661 
1662 bool AArch64InstructionSelector::selectCompareBranch(
1663     MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) {
1664   Register CondReg = I.getOperand(0).getReg();
1665   MachineInstr *CCMI = MRI.getVRegDef(CondReg);
1666   if (CCMI->getOpcode() == TargetOpcode::G_TRUNC) {
1667     CondReg = CCMI->getOperand(1).getReg();
1668     CCMI = MRI.getVRegDef(CondReg);
1669   }
1670 
1671   // Try to select the G_BRCOND using whatever is feeding the condition if
1672   // possible.
1673   unsigned CCMIOpc = CCMI->getOpcode();
1674   if (CCMIOpc == TargetOpcode::G_FCMP)
1675     return selectCompareBranchFedByFCmp(I, *CCMI, MIB);
1676   if (CCMIOpc == TargetOpcode::G_ICMP)
1677     return selectCompareBranchFedByICmp(I, *CCMI, MIB);
1678 
1679   // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z
1680   // instructions will not be produced, as they are conditional branch
1681   // instructions that do not set flags.
1682   if (ProduceNonFlagSettingCondBr) {
1683     emitTestBit(CondReg, /*Bit = */ 0, /*IsNegative = */ true,
1684                 I.getOperand(1).getMBB(), MIB);
1685     I.eraseFromParent();
1686     return true;
1687   }
1688 
1689   // Can't emit TB(N)Z/CB(N)Z. Emit a tst + bcc instead.
1690   auto TstMI =
1691       MIB.buildInstr(AArch64::ANDSWri, {LLT::scalar(32)}, {CondReg}).addImm(1);
1692   constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
1693   auto Bcc = MIB.buildInstr(AArch64::Bcc)
1694                  .addImm(AArch64CC::EQ)
1695                  .addMBB(I.getOperand(1).getMBB());
1696   I.eraseFromParent();
1697   return constrainSelectedInstRegOperands(*Bcc, TII, TRI, RBI);
1698 }
1699 
1700 /// Returns the element immediate value of a vector shift operand if found.
1701 /// This needs to detect a splat-like operation, e.g. a G_BUILD_VECTOR.
1702 static Optional<int64_t> getVectorShiftImm(Register Reg,
1703                                            MachineRegisterInfo &MRI) {
1704   assert(MRI.getType(Reg).isVector() && "Expected a *vector* shift operand");
1705   MachineInstr *OpMI = MRI.getVRegDef(Reg);
1706   assert(OpMI && "Expected to find a vreg def for vector shift operand");
1707   return getAArch64VectorSplatScalar(*OpMI, MRI);
1708 }
1709 
1710 /// Matches and returns the shift immediate value for a SHL instruction given
1711 /// a shift operand.
1712 static Optional<int64_t> getVectorSHLImm(LLT SrcTy, Register Reg, MachineRegisterInfo &MRI) {
1713   Optional<int64_t> ShiftImm = getVectorShiftImm(Reg, MRI);
1714   if (!ShiftImm)
1715     return None;
1716   // Check the immediate is in range for a SHL.
1717   int64_t Imm = *ShiftImm;
1718   if (Imm < 0)
1719     return None;
1720   switch (SrcTy.getElementType().getSizeInBits()) {
1721   default:
1722     LLVM_DEBUG(dbgs() << "Unhandled element type for vector shift");
1723     return None;
1724   case 8:
1725     if (Imm > 7)
1726       return None;
1727     break;
1728   case 16:
1729     if (Imm > 15)
1730       return None;
1731     break;
1732   case 32:
1733     if (Imm > 31)
1734       return None;
1735     break;
1736   case 64:
1737     if (Imm > 63)
1738       return None;
1739     break;
1740   }
1741   return Imm;
1742 }
1743 
1744 bool AArch64InstructionSelector::selectVectorSHL(MachineInstr &I,
1745                                                  MachineRegisterInfo &MRI) {
1746   assert(I.getOpcode() == TargetOpcode::G_SHL);
1747   Register DstReg = I.getOperand(0).getReg();
1748   const LLT Ty = MRI.getType(DstReg);
1749   Register Src1Reg = I.getOperand(1).getReg();
1750   Register Src2Reg = I.getOperand(2).getReg();
1751 
1752   if (!Ty.isVector())
1753     return false;
1754 
1755   // Check if we have a vector of constants on RHS that we can select as the
1756   // immediate form.
1757   Optional<int64_t> ImmVal = getVectorSHLImm(Ty, Src2Reg, MRI);
1758 
1759   unsigned Opc = 0;
1760   if (Ty == LLT::fixed_vector(2, 64)) {
1761     Opc = ImmVal ? AArch64::SHLv2i64_shift : AArch64::USHLv2i64;
1762   } else if (Ty == LLT::fixed_vector(4, 32)) {
1763     Opc = ImmVal ? AArch64::SHLv4i32_shift : AArch64::USHLv4i32;
1764   } else if (Ty == LLT::fixed_vector(2, 32)) {
1765     Opc = ImmVal ? AArch64::SHLv2i32_shift : AArch64::USHLv2i32;
1766   } else if (Ty == LLT::fixed_vector(4, 16)) {
1767     Opc = ImmVal ? AArch64::SHLv4i16_shift : AArch64::USHLv4i16;
1768   } else if (Ty == LLT::fixed_vector(8, 16)) {
1769     Opc = ImmVal ? AArch64::SHLv8i16_shift : AArch64::USHLv8i16;
1770   } else if (Ty == LLT::fixed_vector(16, 8)) {
1771     Opc = ImmVal ? AArch64::SHLv16i8_shift : AArch64::USHLv16i8;
1772   } else if (Ty == LLT::fixed_vector(8, 8)) {
1773     Opc = ImmVal ? AArch64::SHLv8i8_shift : AArch64::USHLv8i8;
1774   } else {
1775     LLVM_DEBUG(dbgs() << "Unhandled G_SHL type");
1776     return false;
1777   }
1778 
1779   auto Shl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg});
1780   if (ImmVal)
1781     Shl.addImm(*ImmVal);
1782   else
1783     Shl.addUse(Src2Reg);
1784   constrainSelectedInstRegOperands(*Shl, TII, TRI, RBI);
1785   I.eraseFromParent();
1786   return true;
1787 }
1788 
1789 bool AArch64InstructionSelector::selectVectorAshrLshr(
1790     MachineInstr &I, MachineRegisterInfo &MRI) {
1791   assert(I.getOpcode() == TargetOpcode::G_ASHR ||
1792          I.getOpcode() == TargetOpcode::G_LSHR);
1793   Register DstReg = I.getOperand(0).getReg();
1794   const LLT Ty = MRI.getType(DstReg);
1795   Register Src1Reg = I.getOperand(1).getReg();
1796   Register Src2Reg = I.getOperand(2).getReg();
1797 
1798   if (!Ty.isVector())
1799     return false;
1800 
1801   bool IsASHR = I.getOpcode() == TargetOpcode::G_ASHR;
1802 
1803   // We expect the immediate case to be lowered in the PostLegalCombiner to
1804   // AArch64ISD::VASHR or AArch64ISD::VLSHR equivalents.
1805 
1806   // There is not a shift right register instruction, but the shift left
1807   // register instruction takes a signed value, where negative numbers specify a
1808   // right shift.
1809 
1810   unsigned Opc = 0;
1811   unsigned NegOpc = 0;
1812   const TargetRegisterClass *RC =
1813       getRegClassForTypeOnBank(Ty, RBI.getRegBank(AArch64::FPRRegBankID), RBI);
1814   if (Ty == LLT::fixed_vector(2, 64)) {
1815     Opc = IsASHR ? AArch64::SSHLv2i64 : AArch64::USHLv2i64;
1816     NegOpc = AArch64::NEGv2i64;
1817   } else if (Ty == LLT::fixed_vector(4, 32)) {
1818     Opc = IsASHR ? AArch64::SSHLv4i32 : AArch64::USHLv4i32;
1819     NegOpc = AArch64::NEGv4i32;
1820   } else if (Ty == LLT::fixed_vector(2, 32)) {
1821     Opc = IsASHR ? AArch64::SSHLv2i32 : AArch64::USHLv2i32;
1822     NegOpc = AArch64::NEGv2i32;
1823   } else if (Ty == LLT::fixed_vector(4, 16)) {
1824     Opc = IsASHR ? AArch64::SSHLv4i16 : AArch64::USHLv4i16;
1825     NegOpc = AArch64::NEGv4i16;
1826   } else if (Ty == LLT::fixed_vector(8, 16)) {
1827     Opc = IsASHR ? AArch64::SSHLv8i16 : AArch64::USHLv8i16;
1828     NegOpc = AArch64::NEGv8i16;
1829   } else if (Ty == LLT::fixed_vector(16, 8)) {
1830     Opc = IsASHR ? AArch64::SSHLv16i8 : AArch64::USHLv16i8;
1831     NegOpc = AArch64::NEGv16i8;
1832   } else if (Ty == LLT::fixed_vector(8, 8)) {
1833     Opc = IsASHR ? AArch64::SSHLv8i8 : AArch64::USHLv8i8;
1834     NegOpc = AArch64::NEGv8i8;
1835   } else {
1836     LLVM_DEBUG(dbgs() << "Unhandled G_ASHR type");
1837     return false;
1838   }
1839 
1840   auto Neg = MIB.buildInstr(NegOpc, {RC}, {Src2Reg});
1841   constrainSelectedInstRegOperands(*Neg, TII, TRI, RBI);
1842   auto SShl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg, Neg});
1843   constrainSelectedInstRegOperands(*SShl, TII, TRI, RBI);
1844   I.eraseFromParent();
1845   return true;
1846 }
1847 
1848 bool AArch64InstructionSelector::selectVaStartAAPCS(
1849     MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
1850   return false;
1851 }
1852 
1853 bool AArch64InstructionSelector::selectVaStartDarwin(
1854     MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
1855   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
1856   Register ListReg = I.getOperand(0).getReg();
1857 
1858   Register ArgsAddrReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
1859 
1860   auto MIB =
1861       BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::ADDXri))
1862           .addDef(ArgsAddrReg)
1863           .addFrameIndex(FuncInfo->getVarArgsStackIndex())
1864           .addImm(0)
1865           .addImm(0);
1866 
1867   constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1868 
1869   MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::STRXui))
1870             .addUse(ArgsAddrReg)
1871             .addUse(ListReg)
1872             .addImm(0)
1873             .addMemOperand(*I.memoperands_begin());
1874 
1875   constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1876   I.eraseFromParent();
1877   return true;
1878 }
1879 
1880 void AArch64InstructionSelector::materializeLargeCMVal(
1881     MachineInstr &I, const Value *V, unsigned OpFlags) {
1882   MachineBasicBlock &MBB = *I.getParent();
1883   MachineFunction &MF = *MBB.getParent();
1884   MachineRegisterInfo &MRI = MF.getRegInfo();
1885 
1886   auto MovZ = MIB.buildInstr(AArch64::MOVZXi, {&AArch64::GPR64RegClass}, {});
1887   MovZ->addOperand(MF, I.getOperand(1));
1888   MovZ->getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_G0 |
1889                                      AArch64II::MO_NC);
1890   MovZ->addOperand(MF, MachineOperand::CreateImm(0));
1891   constrainSelectedInstRegOperands(*MovZ, TII, TRI, RBI);
1892 
1893   auto BuildMovK = [&](Register SrcReg, unsigned char Flags, unsigned Offset,
1894                        Register ForceDstReg) {
1895     Register DstReg = ForceDstReg
1896                           ? ForceDstReg
1897                           : MRI.createVirtualRegister(&AArch64::GPR64RegClass);
1898     auto MovI = MIB.buildInstr(AArch64::MOVKXi).addDef(DstReg).addUse(SrcReg);
1899     if (auto *GV = dyn_cast<GlobalValue>(V)) {
1900       MovI->addOperand(MF, MachineOperand::CreateGA(
1901                                GV, MovZ->getOperand(1).getOffset(), Flags));
1902     } else {
1903       MovI->addOperand(
1904           MF, MachineOperand::CreateBA(cast<BlockAddress>(V),
1905                                        MovZ->getOperand(1).getOffset(), Flags));
1906     }
1907     MovI->addOperand(MF, MachineOperand::CreateImm(Offset));
1908     constrainSelectedInstRegOperands(*MovI, TII, TRI, RBI);
1909     return DstReg;
1910   };
1911   Register DstReg = BuildMovK(MovZ.getReg(0),
1912                               AArch64II::MO_G1 | AArch64II::MO_NC, 16, 0);
1913   DstReg = BuildMovK(DstReg, AArch64II::MO_G2 | AArch64II::MO_NC, 32, 0);
1914   BuildMovK(DstReg, AArch64II::MO_G3, 48, I.getOperand(0).getReg());
1915 }
1916 
1917 bool AArch64InstructionSelector::preISelLower(MachineInstr &I) {
1918   MachineBasicBlock &MBB = *I.getParent();
1919   MachineFunction &MF = *MBB.getParent();
1920   MachineRegisterInfo &MRI = MF.getRegInfo();
1921 
1922   switch (I.getOpcode()) {
1923   case TargetOpcode::G_STORE: {
1924     bool Changed = contractCrossBankCopyIntoStore(I, MRI);
1925     MachineOperand &SrcOp = I.getOperand(0);
1926     if (MRI.getType(SrcOp.getReg()).isPointer()) {
1927       // Allow matching with imported patterns for stores of pointers. Unlike
1928       // G_LOAD/G_PTR_ADD, we may not have selected all users. So, emit a copy
1929       // and constrain.
1930       auto Copy = MIB.buildCopy(LLT::scalar(64), SrcOp);
1931       Register NewSrc = Copy.getReg(0);
1932       SrcOp.setReg(NewSrc);
1933       RBI.constrainGenericRegister(NewSrc, AArch64::GPR64RegClass, MRI);
1934       Changed = true;
1935     }
1936     return Changed;
1937   }
1938   case TargetOpcode::G_PTR_ADD:
1939     return convertPtrAddToAdd(I, MRI);
1940   case TargetOpcode::G_LOAD: {
1941     // For scalar loads of pointers, we try to convert the dest type from p0
1942     // to s64 so that our imported patterns can match. Like with the G_PTR_ADD
1943     // conversion, this should be ok because all users should have been
1944     // selected already, so the type doesn't matter for them.
1945     Register DstReg = I.getOperand(0).getReg();
1946     const LLT DstTy = MRI.getType(DstReg);
1947     if (!DstTy.isPointer())
1948       return false;
1949     MRI.setType(DstReg, LLT::scalar(64));
1950     return true;
1951   }
1952   case AArch64::G_DUP: {
1953     // Convert the type from p0 to s64 to help selection.
1954     LLT DstTy = MRI.getType(I.getOperand(0).getReg());
1955     if (!DstTy.getElementType().isPointer())
1956       return false;
1957     auto NewSrc = MIB.buildCopy(LLT::scalar(64), I.getOperand(1).getReg());
1958     MRI.setType(I.getOperand(0).getReg(),
1959                 DstTy.changeElementType(LLT::scalar(64)));
1960     MRI.setRegClass(NewSrc.getReg(0), &AArch64::GPR64RegClass);
1961     I.getOperand(1).setReg(NewSrc.getReg(0));
1962     return true;
1963   }
1964   case TargetOpcode::G_UITOFP:
1965   case TargetOpcode::G_SITOFP: {
1966     // If both source and destination regbanks are FPR, then convert the opcode
1967     // to G_SITOF so that the importer can select it to an fpr variant.
1968     // Otherwise, it ends up matching an fpr/gpr variant and adding a cross-bank
1969     // copy.
1970     Register SrcReg = I.getOperand(1).getReg();
1971     LLT SrcTy = MRI.getType(SrcReg);
1972     LLT DstTy = MRI.getType(I.getOperand(0).getReg());
1973     if (SrcTy.isVector() || SrcTy.getSizeInBits() != DstTy.getSizeInBits())
1974       return false;
1975 
1976     if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::FPRRegBankID) {
1977       if (I.getOpcode() == TargetOpcode::G_SITOFP)
1978         I.setDesc(TII.get(AArch64::G_SITOF));
1979       else
1980         I.setDesc(TII.get(AArch64::G_UITOF));
1981       return true;
1982     }
1983     return false;
1984   }
1985   default:
1986     return false;
1987   }
1988 }
1989 
1990 /// This lowering tries to look for G_PTR_ADD instructions and then converts
1991 /// them to a standard G_ADD with a COPY on the source.
1992 ///
1993 /// The motivation behind this is to expose the add semantics to the imported
1994 /// tablegen patterns. We shouldn't need to check for uses being loads/stores,
1995 /// because the selector works bottom up, uses before defs. By the time we
1996 /// end up trying to select a G_PTR_ADD, we should have already attempted to
1997 /// fold this into addressing modes and were therefore unsuccessful.
1998 bool AArch64InstructionSelector::convertPtrAddToAdd(
1999     MachineInstr &I, MachineRegisterInfo &MRI) {
2000   assert(I.getOpcode() == TargetOpcode::G_PTR_ADD && "Expected G_PTR_ADD");
2001   Register DstReg = I.getOperand(0).getReg();
2002   Register AddOp1Reg = I.getOperand(1).getReg();
2003   const LLT PtrTy = MRI.getType(DstReg);
2004   if (PtrTy.getAddressSpace() != 0)
2005     return false;
2006 
2007   const LLT CastPtrTy =
2008       PtrTy.isVector() ? LLT::fixed_vector(2, 64) : LLT::scalar(64);
2009   auto PtrToInt = MIB.buildPtrToInt(CastPtrTy, AddOp1Reg);
2010   // Set regbanks on the registers.
2011   if (PtrTy.isVector())
2012     MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::FPRRegBankID));
2013   else
2014     MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID));
2015 
2016   // Now turn the %dst(p0) = G_PTR_ADD %base, off into:
2017   // %dst(intty) = G_ADD %intbase, off
2018   I.setDesc(TII.get(TargetOpcode::G_ADD));
2019   MRI.setType(DstReg, CastPtrTy);
2020   I.getOperand(1).setReg(PtrToInt.getReg(0));
2021   if (!select(*PtrToInt)) {
2022     LLVM_DEBUG(dbgs() << "Failed to select G_PTRTOINT in convertPtrAddToAdd");
2023     return false;
2024   }
2025 
2026   // Also take the opportunity here to try to do some optimization.
2027   // Try to convert this into a G_SUB if the offset is a 0-x negate idiom.
2028   Register NegatedReg;
2029   if (!mi_match(I.getOperand(2).getReg(), MRI, m_Neg(m_Reg(NegatedReg))))
2030     return true;
2031   I.getOperand(2).setReg(NegatedReg);
2032   I.setDesc(TII.get(TargetOpcode::G_SUB));
2033   return true;
2034 }
2035 
2036 bool AArch64InstructionSelector::earlySelectSHL(MachineInstr &I,
2037                                                 MachineRegisterInfo &MRI) {
2038   // We try to match the immediate variant of LSL, which is actually an alias
2039   // for a special case of UBFM. Otherwise, we fall back to the imported
2040   // selector which will match the register variant.
2041   assert(I.getOpcode() == TargetOpcode::G_SHL && "unexpected op");
2042   const auto &MO = I.getOperand(2);
2043   auto VRegAndVal = getIConstantVRegVal(MO.getReg(), MRI);
2044   if (!VRegAndVal)
2045     return false;
2046 
2047   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2048   if (DstTy.isVector())
2049     return false;
2050   bool Is64Bit = DstTy.getSizeInBits() == 64;
2051   auto Imm1Fn = Is64Bit ? selectShiftA_64(MO) : selectShiftA_32(MO);
2052   auto Imm2Fn = Is64Bit ? selectShiftB_64(MO) : selectShiftB_32(MO);
2053 
2054   if (!Imm1Fn || !Imm2Fn)
2055     return false;
2056 
2057   auto NewI =
2058       MIB.buildInstr(Is64Bit ? AArch64::UBFMXri : AArch64::UBFMWri,
2059                      {I.getOperand(0).getReg()}, {I.getOperand(1).getReg()});
2060 
2061   for (auto &RenderFn : *Imm1Fn)
2062     RenderFn(NewI);
2063   for (auto &RenderFn : *Imm2Fn)
2064     RenderFn(NewI);
2065 
2066   I.eraseFromParent();
2067   return constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI);
2068 }
2069 
2070 bool AArch64InstructionSelector::contractCrossBankCopyIntoStore(
2071     MachineInstr &I, MachineRegisterInfo &MRI) {
2072   assert(I.getOpcode() == TargetOpcode::G_STORE && "Expected G_STORE");
2073   // If we're storing a scalar, it doesn't matter what register bank that
2074   // scalar is on. All that matters is the size.
2075   //
2076   // So, if we see something like this (with a 32-bit scalar as an example):
2077   //
2078   // %x:gpr(s32) = ... something ...
2079   // %y:fpr(s32) = COPY %x:gpr(s32)
2080   // G_STORE %y:fpr(s32)
2081   //
2082   // We can fix this up into something like this:
2083   //
2084   // G_STORE %x:gpr(s32)
2085   //
2086   // And then continue the selection process normally.
2087   Register DefDstReg = getSrcRegIgnoringCopies(I.getOperand(0).getReg(), MRI);
2088   if (!DefDstReg.isValid())
2089     return false;
2090   LLT DefDstTy = MRI.getType(DefDstReg);
2091   Register StoreSrcReg = I.getOperand(0).getReg();
2092   LLT StoreSrcTy = MRI.getType(StoreSrcReg);
2093 
2094   // If we get something strange like a physical register, then we shouldn't
2095   // go any further.
2096   if (!DefDstTy.isValid())
2097     return false;
2098 
2099   // Are the source and dst types the same size?
2100   if (DefDstTy.getSizeInBits() != StoreSrcTy.getSizeInBits())
2101     return false;
2102 
2103   if (RBI.getRegBank(StoreSrcReg, MRI, TRI) ==
2104       RBI.getRegBank(DefDstReg, MRI, TRI))
2105     return false;
2106 
2107   // We have a cross-bank copy, which is entering a store. Let's fold it.
2108   I.getOperand(0).setReg(DefDstReg);
2109   return true;
2110 }
2111 
2112 bool AArch64InstructionSelector::earlySelect(MachineInstr &I) {
2113   assert(I.getParent() && "Instruction should be in a basic block!");
2114   assert(I.getParent()->getParent() && "Instruction should be in a function!");
2115 
2116   MachineBasicBlock &MBB = *I.getParent();
2117   MachineFunction &MF = *MBB.getParent();
2118   MachineRegisterInfo &MRI = MF.getRegInfo();
2119 
2120   switch (I.getOpcode()) {
2121   case AArch64::G_DUP: {
2122     // Before selecting a DUP instruction, check if it is better selected as a
2123     // MOV or load from a constant pool.
2124     Register Src = I.getOperand(1).getReg();
2125     auto ValAndVReg = getIConstantVRegValWithLookThrough(Src, MRI);
2126     if (!ValAndVReg)
2127       return false;
2128     LLVMContext &Ctx = MF.getFunction().getContext();
2129     Register Dst = I.getOperand(0).getReg();
2130     auto *CV = ConstantDataVector::getSplat(
2131         MRI.getType(Dst).getNumElements(),
2132         ConstantInt::get(Type::getIntNTy(Ctx, MRI.getType(Src).getSizeInBits()),
2133                          ValAndVReg->Value));
2134     if (!emitConstantVector(Dst, CV, MIB, MRI))
2135       return false;
2136     I.eraseFromParent();
2137     return true;
2138   }
2139   case TargetOpcode::G_SEXT:
2140     // Check for i64 sext(i32 vector_extract) prior to tablegen to select SMOV
2141     // over a normal extend.
2142     if (selectUSMovFromExtend(I, MRI))
2143       return true;
2144     return false;
2145   case TargetOpcode::G_BR:
2146     return false;
2147   case TargetOpcode::G_SHL:
2148     return earlySelectSHL(I, MRI);
2149   case TargetOpcode::G_CONSTANT: {
2150     bool IsZero = false;
2151     if (I.getOperand(1).isCImm())
2152       IsZero = I.getOperand(1).getCImm()->getZExtValue() == 0;
2153     else if (I.getOperand(1).isImm())
2154       IsZero = I.getOperand(1).getImm() == 0;
2155 
2156     if (!IsZero)
2157       return false;
2158 
2159     Register DefReg = I.getOperand(0).getReg();
2160     LLT Ty = MRI.getType(DefReg);
2161     if (Ty.getSizeInBits() == 64) {
2162       I.getOperand(1).ChangeToRegister(AArch64::XZR, false);
2163       RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI);
2164     } else if (Ty.getSizeInBits() == 32) {
2165       I.getOperand(1).ChangeToRegister(AArch64::WZR, false);
2166       RBI.constrainGenericRegister(DefReg, AArch64::GPR32RegClass, MRI);
2167     } else
2168       return false;
2169 
2170     I.setDesc(TII.get(TargetOpcode::COPY));
2171     return true;
2172   }
2173 
2174   case TargetOpcode::G_ADD: {
2175     // Check if this is being fed by a G_ICMP on either side.
2176     //
2177     // (cmp pred, x, y) + z
2178     //
2179     // In the above case, when the cmp is true, we increment z by 1. So, we can
2180     // fold the add into the cset for the cmp by using cinc.
2181     //
2182     // FIXME: This would probably be a lot nicer in PostLegalizerLowering.
2183     Register AddDst = I.getOperand(0).getReg();
2184     Register AddLHS = I.getOperand(1).getReg();
2185     Register AddRHS = I.getOperand(2).getReg();
2186     // Only handle scalars.
2187     LLT Ty = MRI.getType(AddLHS);
2188     if (Ty.isVector())
2189       return false;
2190     // Since G_ICMP is modeled as ADDS/SUBS/ANDS, we can handle 32 bits or 64
2191     // bits.
2192     unsigned Size = Ty.getSizeInBits();
2193     if (Size != 32 && Size != 64)
2194       return false;
2195     auto MatchCmp = [&](Register Reg) -> MachineInstr * {
2196       if (!MRI.hasOneNonDBGUse(Reg))
2197         return nullptr;
2198       // If the LHS of the add is 32 bits, then we want to fold a 32-bit
2199       // compare.
2200       if (Size == 32)
2201         return getOpcodeDef(TargetOpcode::G_ICMP, Reg, MRI);
2202       // We model scalar compares using 32-bit destinations right now.
2203       // If it's a 64-bit compare, it'll have 64-bit sources.
2204       Register ZExt;
2205       if (!mi_match(Reg, MRI,
2206                     m_OneNonDBGUse(m_GZExt(m_OneNonDBGUse(m_Reg(ZExt))))))
2207         return nullptr;
2208       auto *Cmp = getOpcodeDef(TargetOpcode::G_ICMP, ZExt, MRI);
2209       if (!Cmp ||
2210           MRI.getType(Cmp->getOperand(2).getReg()).getSizeInBits() != 64)
2211         return nullptr;
2212       return Cmp;
2213     };
2214     // Try to match
2215     // z + (cmp pred, x, y)
2216     MachineInstr *Cmp = MatchCmp(AddRHS);
2217     if (!Cmp) {
2218       // (cmp pred, x, y) + z
2219       std::swap(AddLHS, AddRHS);
2220       Cmp = MatchCmp(AddRHS);
2221       if (!Cmp)
2222         return false;
2223     }
2224     auto &PredOp = Cmp->getOperand(1);
2225     auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate());
2226     const AArch64CC::CondCode InvCC =
2227         changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred));
2228     MIB.setInstrAndDebugLoc(I);
2229     emitIntegerCompare(/*LHS=*/Cmp->getOperand(2),
2230                        /*RHS=*/Cmp->getOperand(3), PredOp, MIB);
2231     emitCSINC(/*Dst=*/AddDst, /*Src =*/AddLHS, /*Src2=*/AddLHS, InvCC, MIB);
2232     I.eraseFromParent();
2233     return true;
2234   }
2235   case TargetOpcode::G_OR: {
2236     // Look for operations that take the lower `Width=Size-ShiftImm` bits of
2237     // `ShiftSrc` and insert them into the upper `Width` bits of `MaskSrc` via
2238     // shifting and masking that we can replace with a BFI (encoded as a BFM).
2239     Register Dst = I.getOperand(0).getReg();
2240     LLT Ty = MRI.getType(Dst);
2241 
2242     if (!Ty.isScalar())
2243       return false;
2244 
2245     unsigned Size = Ty.getSizeInBits();
2246     if (Size != 32 && Size != 64)
2247       return false;
2248 
2249     Register ShiftSrc;
2250     int64_t ShiftImm;
2251     Register MaskSrc;
2252     int64_t MaskImm;
2253     if (!mi_match(
2254             Dst, MRI,
2255             m_GOr(m_OneNonDBGUse(m_GShl(m_Reg(ShiftSrc), m_ICst(ShiftImm))),
2256                   m_OneNonDBGUse(m_GAnd(m_Reg(MaskSrc), m_ICst(MaskImm))))))
2257       return false;
2258 
2259     if (ShiftImm > Size || ((1ULL << ShiftImm) - 1ULL) != uint64_t(MaskImm))
2260       return false;
2261 
2262     int64_t Immr = Size - ShiftImm;
2263     int64_t Imms = Size - ShiftImm - 1;
2264     unsigned Opc = Size == 32 ? AArch64::BFMWri : AArch64::BFMXri;
2265     emitInstr(Opc, {Dst}, {MaskSrc, ShiftSrc, Immr, Imms}, MIB);
2266     I.eraseFromParent();
2267     return true;
2268   }
2269   default:
2270     return false;
2271   }
2272 }
2273 
2274 bool AArch64InstructionSelector::select(MachineInstr &I) {
2275   assert(I.getParent() && "Instruction should be in a basic block!");
2276   assert(I.getParent()->getParent() && "Instruction should be in a function!");
2277 
2278   MachineBasicBlock &MBB = *I.getParent();
2279   MachineFunction &MF = *MBB.getParent();
2280   MachineRegisterInfo &MRI = MF.getRegInfo();
2281 
2282   const AArch64Subtarget *Subtarget =
2283       &static_cast<const AArch64Subtarget &>(MF.getSubtarget());
2284   if (Subtarget->requiresStrictAlign()) {
2285     // We don't support this feature yet.
2286     LLVM_DEBUG(dbgs() << "AArch64 GISel does not support strict-align yet\n");
2287     return false;
2288   }
2289 
2290   MIB.setInstrAndDebugLoc(I);
2291 
2292   unsigned Opcode = I.getOpcode();
2293   // G_PHI requires same handling as PHI
2294   if (!I.isPreISelOpcode() || Opcode == TargetOpcode::G_PHI) {
2295     // Certain non-generic instructions also need some special handling.
2296 
2297     if (Opcode ==  TargetOpcode::LOAD_STACK_GUARD)
2298       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2299 
2300     if (Opcode == TargetOpcode::PHI || Opcode == TargetOpcode::G_PHI) {
2301       const Register DefReg = I.getOperand(0).getReg();
2302       const LLT DefTy = MRI.getType(DefReg);
2303 
2304       const RegClassOrRegBank &RegClassOrBank =
2305         MRI.getRegClassOrRegBank(DefReg);
2306 
2307       const TargetRegisterClass *DefRC
2308         = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
2309       if (!DefRC) {
2310         if (!DefTy.isValid()) {
2311           LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
2312           return false;
2313         }
2314         const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
2315         DefRC = getRegClassForTypeOnBank(DefTy, RB, RBI);
2316         if (!DefRC) {
2317           LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
2318           return false;
2319         }
2320       }
2321 
2322       I.setDesc(TII.get(TargetOpcode::PHI));
2323 
2324       return RBI.constrainGenericRegister(DefReg, *DefRC, MRI);
2325     }
2326 
2327     if (I.isCopy())
2328       return selectCopy(I, TII, MRI, TRI, RBI);
2329 
2330     return true;
2331   }
2332 
2333 
2334   if (I.getNumOperands() != I.getNumExplicitOperands()) {
2335     LLVM_DEBUG(
2336         dbgs() << "Generic instruction has unexpected implicit operands\n");
2337     return false;
2338   }
2339 
2340   // Try to do some lowering before we start instruction selecting. These
2341   // lowerings are purely transformations on the input G_MIR and so selection
2342   // must continue after any modification of the instruction.
2343   if (preISelLower(I)) {
2344     Opcode = I.getOpcode(); // The opcode may have been modified, refresh it.
2345   }
2346 
2347   // There may be patterns where the importer can't deal with them optimally,
2348   // but does select it to a suboptimal sequence so our custom C++ selection
2349   // code later never has a chance to work on it. Therefore, we have an early
2350   // selection attempt here to give priority to certain selection routines
2351   // over the imported ones.
2352   if (earlySelect(I))
2353     return true;
2354 
2355   if (selectImpl(I, *CoverageInfo))
2356     return true;
2357 
2358   LLT Ty =
2359       I.getOperand(0).isReg() ? MRI.getType(I.getOperand(0).getReg()) : LLT{};
2360 
2361   switch (Opcode) {
2362   case TargetOpcode::G_SBFX:
2363   case TargetOpcode::G_UBFX: {
2364     static const unsigned OpcTable[2][2] = {
2365         {AArch64::UBFMWri, AArch64::UBFMXri},
2366         {AArch64::SBFMWri, AArch64::SBFMXri}};
2367     bool IsSigned = Opcode == TargetOpcode::G_SBFX;
2368     unsigned Size = Ty.getSizeInBits();
2369     unsigned Opc = OpcTable[IsSigned][Size == 64];
2370     auto Cst1 =
2371         getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), MRI);
2372     assert(Cst1 && "Should have gotten a constant for src 1?");
2373     auto Cst2 =
2374         getIConstantVRegValWithLookThrough(I.getOperand(3).getReg(), MRI);
2375     assert(Cst2 && "Should have gotten a constant for src 2?");
2376     auto LSB = Cst1->Value.getZExtValue();
2377     auto Width = Cst2->Value.getZExtValue();
2378     auto BitfieldInst =
2379         MIB.buildInstr(Opc, {I.getOperand(0)}, {I.getOperand(1)})
2380             .addImm(LSB)
2381             .addImm(LSB + Width - 1);
2382     I.eraseFromParent();
2383     return constrainSelectedInstRegOperands(*BitfieldInst, TII, TRI, RBI);
2384   }
2385   case TargetOpcode::G_BRCOND:
2386     return selectCompareBranch(I, MF, MRI);
2387 
2388   case TargetOpcode::G_BRINDIRECT: {
2389     I.setDesc(TII.get(AArch64::BR));
2390     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2391   }
2392 
2393   case TargetOpcode::G_BRJT:
2394     return selectBrJT(I, MRI);
2395 
2396   case AArch64::G_ADD_LOW: {
2397     // This op may have been separated from its ADRP companion by the localizer
2398     // or some other code motion pass. Given that many CPUs will try to
2399     // macro fuse these operations anyway, select this into a MOVaddr pseudo
2400     // which will later be expanded into an ADRP+ADD pair after scheduling.
2401     MachineInstr *BaseMI = MRI.getVRegDef(I.getOperand(1).getReg());
2402     if (BaseMI->getOpcode() != AArch64::ADRP) {
2403       I.setDesc(TII.get(AArch64::ADDXri));
2404       I.addOperand(MachineOperand::CreateImm(0));
2405       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2406     }
2407     assert(TM.getCodeModel() == CodeModel::Small &&
2408            "Expected small code model");
2409     auto Op1 = BaseMI->getOperand(1);
2410     auto Op2 = I.getOperand(2);
2411     auto MovAddr = MIB.buildInstr(AArch64::MOVaddr, {I.getOperand(0)}, {})
2412                        .addGlobalAddress(Op1.getGlobal(), Op1.getOffset(),
2413                                          Op1.getTargetFlags())
2414                        .addGlobalAddress(Op2.getGlobal(), Op2.getOffset(),
2415                                          Op2.getTargetFlags());
2416     I.eraseFromParent();
2417     return constrainSelectedInstRegOperands(*MovAddr, TII, TRI, RBI);
2418   }
2419 
2420   case TargetOpcode::G_BSWAP: {
2421     // Handle vector types for G_BSWAP directly.
2422     Register DstReg = I.getOperand(0).getReg();
2423     LLT DstTy = MRI.getType(DstReg);
2424 
2425     // We should only get vector types here; everything else is handled by the
2426     // importer right now.
2427     if (!DstTy.isVector() || DstTy.getSizeInBits() > 128) {
2428       LLVM_DEBUG(dbgs() << "Dst type for G_BSWAP currently unsupported.\n");
2429       return false;
2430     }
2431 
2432     // Only handle 4 and 2 element vectors for now.
2433     // TODO: 16-bit elements.
2434     unsigned NumElts = DstTy.getNumElements();
2435     if (NumElts != 4 && NumElts != 2) {
2436       LLVM_DEBUG(dbgs() << "Unsupported number of elements for G_BSWAP.\n");
2437       return false;
2438     }
2439 
2440     // Choose the correct opcode for the supported types. Right now, that's
2441     // v2s32, v4s32, and v2s64.
2442     unsigned Opc = 0;
2443     unsigned EltSize = DstTy.getElementType().getSizeInBits();
2444     if (EltSize == 32)
2445       Opc = (DstTy.getNumElements() == 2) ? AArch64::REV32v8i8
2446                                           : AArch64::REV32v16i8;
2447     else if (EltSize == 64)
2448       Opc = AArch64::REV64v16i8;
2449 
2450     // We should always get something by the time we get here...
2451     assert(Opc != 0 && "Didn't get an opcode for G_BSWAP?");
2452 
2453     I.setDesc(TII.get(Opc));
2454     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2455   }
2456 
2457   case TargetOpcode::G_FCONSTANT:
2458   case TargetOpcode::G_CONSTANT: {
2459     const bool isFP = Opcode == TargetOpcode::G_FCONSTANT;
2460 
2461     const LLT s8 = LLT::scalar(8);
2462     const LLT s16 = LLT::scalar(16);
2463     const LLT s32 = LLT::scalar(32);
2464     const LLT s64 = LLT::scalar(64);
2465     const LLT s128 = LLT::scalar(128);
2466     const LLT p0 = LLT::pointer(0, 64);
2467 
2468     const Register DefReg = I.getOperand(0).getReg();
2469     const LLT DefTy = MRI.getType(DefReg);
2470     const unsigned DefSize = DefTy.getSizeInBits();
2471     const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
2472 
2473     // FIXME: Redundant check, but even less readable when factored out.
2474     if (isFP) {
2475       if (Ty != s16 && Ty != s32 && Ty != s64 && Ty != s128) {
2476         LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty
2477                           << " constant, expected: " << s16 << " or " << s32
2478                           << " or " << s64 << " or " << s128 << '\n');
2479         return false;
2480       }
2481 
2482       if (RB.getID() != AArch64::FPRRegBankID) {
2483         LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty
2484                           << " constant on bank: " << RB
2485                           << ", expected: FPR\n");
2486         return false;
2487       }
2488 
2489       // The case when we have 0.0 is covered by tablegen. Reject it here so we
2490       // can be sure tablegen works correctly and isn't rescued by this code.
2491       // 0.0 is not covered by tablegen for FP128. So we will handle this
2492       // scenario in the code here.
2493       if (DefSize != 128 && I.getOperand(1).getFPImm()->isExactlyValue(0.0))
2494         return false;
2495     } else {
2496       // s32 and s64 are covered by tablegen.
2497       if (Ty != p0 && Ty != s8 && Ty != s16) {
2498         LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty
2499                           << " constant, expected: " << s32 << ", " << s64
2500                           << ", or " << p0 << '\n');
2501         return false;
2502       }
2503 
2504       if (RB.getID() != AArch64::GPRRegBankID) {
2505         LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty
2506                           << " constant on bank: " << RB
2507                           << ", expected: GPR\n");
2508         return false;
2509       }
2510     }
2511 
2512     if (isFP) {
2513       const TargetRegisterClass &FPRRC = *getMinClassForRegBank(RB, DefSize);
2514       // For 16, 64, and 128b values, emit a constant pool load.
2515       switch (DefSize) {
2516       default:
2517         llvm_unreachable("Unexpected destination size for G_FCONSTANT?");
2518       case 32:
2519         // For s32, use a cp load if we have optsize/minsize.
2520         if (!shouldOptForSize(&MF))
2521           break;
2522         LLVM_FALLTHROUGH;
2523       case 16:
2524       case 64:
2525       case 128: {
2526         auto *FPImm = I.getOperand(1).getFPImm();
2527         auto *LoadMI = emitLoadFromConstantPool(FPImm, MIB);
2528         if (!LoadMI) {
2529           LLVM_DEBUG(dbgs() << "Failed to load double constant pool entry\n");
2530           return false;
2531         }
2532         MIB.buildCopy({DefReg}, {LoadMI->getOperand(0).getReg()});
2533         I.eraseFromParent();
2534         return RBI.constrainGenericRegister(DefReg, FPRRC, MRI);
2535       }
2536       }
2537 
2538       // Either emit a FMOV, or emit a copy to emit a normal mov.
2539       assert(DefSize == 32 &&
2540              "Expected constant pool loads for all sizes other than 32!");
2541       const Register DefGPRReg =
2542           MRI.createVirtualRegister(&AArch64::GPR32RegClass);
2543       MachineOperand &RegOp = I.getOperand(0);
2544       RegOp.setReg(DefGPRReg);
2545       MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator()));
2546       MIB.buildCopy({DefReg}, {DefGPRReg});
2547 
2548       if (!RBI.constrainGenericRegister(DefReg, FPRRC, MRI)) {
2549         LLVM_DEBUG(dbgs() << "Failed to constrain G_FCONSTANT def operand\n");
2550         return false;
2551       }
2552 
2553       MachineOperand &ImmOp = I.getOperand(1);
2554       // FIXME: Is going through int64_t always correct?
2555       ImmOp.ChangeToImmediate(
2556           ImmOp.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
2557     } else if (I.getOperand(1).isCImm()) {
2558       uint64_t Val = I.getOperand(1).getCImm()->getZExtValue();
2559       I.getOperand(1).ChangeToImmediate(Val);
2560     } else if (I.getOperand(1).isImm()) {
2561       uint64_t Val = I.getOperand(1).getImm();
2562       I.getOperand(1).ChangeToImmediate(Val);
2563     }
2564 
2565     const unsigned MovOpc =
2566         DefSize == 64 ? AArch64::MOVi64imm : AArch64::MOVi32imm;
2567     I.setDesc(TII.get(MovOpc));
2568     constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2569     return true;
2570   }
2571   case TargetOpcode::G_EXTRACT: {
2572     Register DstReg = I.getOperand(0).getReg();
2573     Register SrcReg = I.getOperand(1).getReg();
2574     LLT SrcTy = MRI.getType(SrcReg);
2575     LLT DstTy = MRI.getType(DstReg);
2576     (void)DstTy;
2577     unsigned SrcSize = SrcTy.getSizeInBits();
2578 
2579     if (SrcTy.getSizeInBits() > 64) {
2580       // This should be an extract of an s128, which is like a vector extract.
2581       if (SrcTy.getSizeInBits() != 128)
2582         return false;
2583       // Only support extracting 64 bits from an s128 at the moment.
2584       if (DstTy.getSizeInBits() != 64)
2585         return false;
2586 
2587       unsigned Offset = I.getOperand(2).getImm();
2588       if (Offset % 64 != 0)
2589         return false;
2590 
2591       // Check we have the right regbank always.
2592       const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
2593       const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
2594       assert(SrcRB.getID() == DstRB.getID() && "Wrong extract regbank!");
2595 
2596       if (SrcRB.getID() == AArch64::GPRRegBankID) {
2597         MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
2598             .addUse(SrcReg, 0, Offset == 0 ? AArch64::sube64 : AArch64::subo64);
2599         I.eraseFromParent();
2600         return true;
2601       }
2602 
2603       // Emit the same code as a vector extract.
2604       // Offset must be a multiple of 64.
2605       unsigned LaneIdx = Offset / 64;
2606       MachineInstr *Extract = emitExtractVectorElt(
2607           DstReg, DstRB, LLT::scalar(64), SrcReg, LaneIdx, MIB);
2608       if (!Extract)
2609         return false;
2610       I.eraseFromParent();
2611       return true;
2612     }
2613 
2614     I.setDesc(TII.get(SrcSize == 64 ? AArch64::UBFMXri : AArch64::UBFMWri));
2615     MachineInstrBuilder(MF, I).addImm(I.getOperand(2).getImm() +
2616                                       Ty.getSizeInBits() - 1);
2617 
2618     if (SrcSize < 64) {
2619       assert(SrcSize == 32 && DstTy.getSizeInBits() == 16 &&
2620              "unexpected G_EXTRACT types");
2621       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2622     }
2623 
2624     DstReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
2625     MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator()));
2626     MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {})
2627         .addReg(DstReg, 0, AArch64::sub_32);
2628     RBI.constrainGenericRegister(I.getOperand(0).getReg(),
2629                                  AArch64::GPR32RegClass, MRI);
2630     I.getOperand(0).setReg(DstReg);
2631 
2632     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2633   }
2634 
2635   case TargetOpcode::G_INSERT: {
2636     LLT SrcTy = MRI.getType(I.getOperand(2).getReg());
2637     LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2638     unsigned DstSize = DstTy.getSizeInBits();
2639     // Larger inserts are vectors, same-size ones should be something else by
2640     // now (split up or turned into COPYs).
2641     if (Ty.getSizeInBits() > 64 || SrcTy.getSizeInBits() > 32)
2642       return false;
2643 
2644     I.setDesc(TII.get(DstSize == 64 ? AArch64::BFMXri : AArch64::BFMWri));
2645     unsigned LSB = I.getOperand(3).getImm();
2646     unsigned Width = MRI.getType(I.getOperand(2).getReg()).getSizeInBits();
2647     I.getOperand(3).setImm((DstSize - LSB) % DstSize);
2648     MachineInstrBuilder(MF, I).addImm(Width - 1);
2649 
2650     if (DstSize < 64) {
2651       assert(DstSize == 32 && SrcTy.getSizeInBits() == 16 &&
2652              "unexpected G_INSERT types");
2653       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2654     }
2655 
2656     Register SrcReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
2657     BuildMI(MBB, I.getIterator(), I.getDebugLoc(),
2658             TII.get(AArch64::SUBREG_TO_REG))
2659         .addDef(SrcReg)
2660         .addImm(0)
2661         .addUse(I.getOperand(2).getReg())
2662         .addImm(AArch64::sub_32);
2663     RBI.constrainGenericRegister(I.getOperand(2).getReg(),
2664                                  AArch64::GPR32RegClass, MRI);
2665     I.getOperand(2).setReg(SrcReg);
2666 
2667     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2668   }
2669   case TargetOpcode::G_FRAME_INDEX: {
2670     // allocas and G_FRAME_INDEX are only supported in addrspace(0).
2671     if (Ty != LLT::pointer(0, 64)) {
2672       LLVM_DEBUG(dbgs() << "G_FRAME_INDEX pointer has type: " << Ty
2673                         << ", expected: " << LLT::pointer(0, 64) << '\n');
2674       return false;
2675     }
2676     I.setDesc(TII.get(AArch64::ADDXri));
2677 
2678     // MOs for a #0 shifted immediate.
2679     I.addOperand(MachineOperand::CreateImm(0));
2680     I.addOperand(MachineOperand::CreateImm(0));
2681 
2682     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2683   }
2684 
2685   case TargetOpcode::G_GLOBAL_VALUE: {
2686     auto GV = I.getOperand(1).getGlobal();
2687     if (GV->isThreadLocal())
2688       return selectTLSGlobalValue(I, MRI);
2689 
2690     unsigned OpFlags = STI.ClassifyGlobalReference(GV, TM);
2691     if (OpFlags & AArch64II::MO_GOT) {
2692       I.setDesc(TII.get(AArch64::LOADgot));
2693       I.getOperand(1).setTargetFlags(OpFlags);
2694     } else if (TM.getCodeModel() == CodeModel::Large) {
2695       // Materialize the global using movz/movk instructions.
2696       materializeLargeCMVal(I, GV, OpFlags);
2697       I.eraseFromParent();
2698       return true;
2699     } else if (TM.getCodeModel() == CodeModel::Tiny) {
2700       I.setDesc(TII.get(AArch64::ADR));
2701       I.getOperand(1).setTargetFlags(OpFlags);
2702     } else {
2703       I.setDesc(TII.get(AArch64::MOVaddr));
2704       I.getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_PAGE);
2705       MachineInstrBuilder MIB(MF, I);
2706       MIB.addGlobalAddress(GV, I.getOperand(1).getOffset(),
2707                            OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
2708     }
2709     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2710   }
2711 
2712   case TargetOpcode::G_ZEXTLOAD:
2713   case TargetOpcode::G_LOAD:
2714   case TargetOpcode::G_STORE: {
2715     GLoadStore &LdSt = cast<GLoadStore>(I);
2716     bool IsZExtLoad = I.getOpcode() == TargetOpcode::G_ZEXTLOAD;
2717     LLT PtrTy = MRI.getType(LdSt.getPointerReg());
2718 
2719     if (PtrTy != LLT::pointer(0, 64)) {
2720       LLVM_DEBUG(dbgs() << "Load/Store pointer has type: " << PtrTy
2721                         << ", expected: " << LLT::pointer(0, 64) << '\n');
2722       return false;
2723     }
2724 
2725     uint64_t MemSizeInBytes = LdSt.getMemSize();
2726     unsigned MemSizeInBits = LdSt.getMemSizeInBits();
2727     AtomicOrdering Order = LdSt.getMMO().getSuccessOrdering();
2728 
2729     // Need special instructions for atomics that affect ordering.
2730     if (Order != AtomicOrdering::NotAtomic &&
2731         Order != AtomicOrdering::Unordered &&
2732         Order != AtomicOrdering::Monotonic) {
2733       assert(!isa<GZExtLoad>(LdSt));
2734       if (MemSizeInBytes > 64)
2735         return false;
2736 
2737       if (isa<GLoad>(LdSt)) {
2738         static unsigned Opcodes[] = {AArch64::LDARB, AArch64::LDARH,
2739                                      AArch64::LDARW, AArch64::LDARX};
2740         I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)]));
2741       } else {
2742         static unsigned Opcodes[] = {AArch64::STLRB, AArch64::STLRH,
2743                                      AArch64::STLRW, AArch64::STLRX};
2744         Register ValReg = LdSt.getReg(0);
2745         if (MRI.getType(ValReg).getSizeInBits() == 64 && MemSizeInBits != 64) {
2746           // Emit a subreg copy of 32 bits.
2747           Register NewVal = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
2748           MIB.buildInstr(TargetOpcode::COPY, {NewVal}, {})
2749               .addReg(I.getOperand(0).getReg(), 0, AArch64::sub_32);
2750           I.getOperand(0).setReg(NewVal);
2751         }
2752         I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)]));
2753       }
2754       constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2755       return true;
2756     }
2757 
2758 #ifndef NDEBUG
2759     const Register PtrReg = LdSt.getPointerReg();
2760     const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI);
2761     // Check that the pointer register is valid.
2762     assert(PtrRB.getID() == AArch64::GPRRegBankID &&
2763            "Load/Store pointer operand isn't a GPR");
2764     assert(MRI.getType(PtrReg).isPointer() &&
2765            "Load/Store pointer operand isn't a pointer");
2766 #endif
2767 
2768     const Register ValReg = LdSt.getReg(0);
2769     const LLT ValTy = MRI.getType(ValReg);
2770     const RegisterBank &RB = *RBI.getRegBank(ValReg, MRI, TRI);
2771 
2772     // The code below doesn't support truncating stores, so we need to split it
2773     // again.
2774     if (isa<GStore>(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) {
2775       unsigned SubReg;
2776       LLT MemTy = LdSt.getMMO().getMemoryType();
2777       auto *RC = getRegClassForTypeOnBank(MemTy, RB, RBI);
2778       if (!getSubRegForClass(RC, TRI, SubReg))
2779         return false;
2780 
2781       // Generate a subreg copy.
2782       auto Copy = MIB.buildInstr(TargetOpcode::COPY, {MemTy}, {})
2783                       .addReg(ValReg, 0, SubReg)
2784                       .getReg(0);
2785       RBI.constrainGenericRegister(Copy, *RC, MRI);
2786       LdSt.getOperand(0).setReg(Copy);
2787     } else if (isa<GLoad>(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) {
2788       // If this is an any-extending load from the FPR bank, split it into a regular
2789       // load + extend.
2790       if (RB.getID() == AArch64::FPRRegBankID) {
2791         unsigned SubReg;
2792         LLT MemTy = LdSt.getMMO().getMemoryType();
2793         auto *RC = getRegClassForTypeOnBank(MemTy, RB, RBI);
2794         if (!getSubRegForClass(RC, TRI, SubReg))
2795           return false;
2796         Register OldDst = LdSt.getReg(0);
2797         Register NewDst =
2798             MRI.createGenericVirtualRegister(LdSt.getMMO().getMemoryType());
2799         LdSt.getOperand(0).setReg(NewDst);
2800         MRI.setRegBank(NewDst, RB);
2801         // Generate a SUBREG_TO_REG to extend it.
2802         MIB.setInsertPt(MIB.getMBB(), std::next(LdSt.getIterator()));
2803         MIB.buildInstr(AArch64::SUBREG_TO_REG, {OldDst}, {})
2804             .addImm(0)
2805             .addUse(NewDst)
2806             .addImm(SubReg);
2807         auto SubRegRC = getRegClassForTypeOnBank(MRI.getType(OldDst), RB, RBI);
2808         RBI.constrainGenericRegister(OldDst, *SubRegRC, MRI);
2809         MIB.setInstr(LdSt);
2810       }
2811     }
2812 
2813     // Helper lambda for partially selecting I. Either returns the original
2814     // instruction with an updated opcode, or a new instruction.
2815     auto SelectLoadStoreAddressingMode = [&]() -> MachineInstr * {
2816       bool IsStore = isa<GStore>(I);
2817       const unsigned NewOpc =
2818           selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemSizeInBits);
2819       if (NewOpc == I.getOpcode())
2820         return nullptr;
2821       // Check if we can fold anything into the addressing mode.
2822       auto AddrModeFns =
2823           selectAddrModeIndexed(I.getOperand(1), MemSizeInBytes);
2824       if (!AddrModeFns) {
2825         // Can't fold anything. Use the original instruction.
2826         I.setDesc(TII.get(NewOpc));
2827         I.addOperand(MachineOperand::CreateImm(0));
2828         return &I;
2829       }
2830 
2831       // Folded something. Create a new instruction and return it.
2832       auto NewInst = MIB.buildInstr(NewOpc, {}, {}, I.getFlags());
2833       Register CurValReg = I.getOperand(0).getReg();
2834       IsStore ? NewInst.addUse(CurValReg) : NewInst.addDef(CurValReg);
2835       NewInst.cloneMemRefs(I);
2836       for (auto &Fn : *AddrModeFns)
2837         Fn(NewInst);
2838       I.eraseFromParent();
2839       return &*NewInst;
2840     };
2841 
2842     MachineInstr *LoadStore = SelectLoadStoreAddressingMode();
2843     if (!LoadStore)
2844       return false;
2845 
2846     // If we're storing a 0, use WZR/XZR.
2847     if (Opcode == TargetOpcode::G_STORE) {
2848       auto CVal = getIConstantVRegValWithLookThrough(
2849           LoadStore->getOperand(0).getReg(), MRI);
2850       if (CVal && CVal->Value == 0) {
2851         switch (LoadStore->getOpcode()) {
2852         case AArch64::STRWui:
2853         case AArch64::STRHHui:
2854         case AArch64::STRBBui:
2855           LoadStore->getOperand(0).setReg(AArch64::WZR);
2856           break;
2857         case AArch64::STRXui:
2858           LoadStore->getOperand(0).setReg(AArch64::XZR);
2859           break;
2860         }
2861       }
2862     }
2863 
2864     if (IsZExtLoad) {
2865       // The zextload from a smaller type to i32 should be handled by the
2866       // importer.
2867       if (MRI.getType(LoadStore->getOperand(0).getReg()).getSizeInBits() != 64)
2868         return false;
2869       // If we have a ZEXTLOAD then change the load's type to be a narrower reg
2870       // and zero_extend with SUBREG_TO_REG.
2871       Register LdReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
2872       Register DstReg = LoadStore->getOperand(0).getReg();
2873       LoadStore->getOperand(0).setReg(LdReg);
2874 
2875       MIB.setInsertPt(MIB.getMBB(), std::next(LoadStore->getIterator()));
2876       MIB.buildInstr(AArch64::SUBREG_TO_REG, {DstReg}, {})
2877           .addImm(0)
2878           .addUse(LdReg)
2879           .addImm(AArch64::sub_32);
2880       constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI);
2881       return RBI.constrainGenericRegister(DstReg, AArch64::GPR64allRegClass,
2882                                           MRI);
2883     }
2884     return constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI);
2885   }
2886 
2887   case TargetOpcode::G_SMULH:
2888   case TargetOpcode::G_UMULH: {
2889     // Reject the various things we don't support yet.
2890     if (unsupportedBinOp(I, RBI, MRI, TRI))
2891       return false;
2892 
2893     const Register DefReg = I.getOperand(0).getReg();
2894     const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
2895 
2896     if (RB.getID() != AArch64::GPRRegBankID) {
2897       LLVM_DEBUG(dbgs() << "G_[SU]MULH on bank: " << RB << ", expected: GPR\n");
2898       return false;
2899     }
2900 
2901     if (Ty != LLT::scalar(64)) {
2902       LLVM_DEBUG(dbgs() << "G_[SU]MULH has type: " << Ty
2903                         << ", expected: " << LLT::scalar(64) << '\n');
2904       return false;
2905     }
2906 
2907     unsigned NewOpc = I.getOpcode() == TargetOpcode::G_SMULH ? AArch64::SMULHrr
2908                                                              : AArch64::UMULHrr;
2909     I.setDesc(TII.get(NewOpc));
2910 
2911     // Now that we selected an opcode, we need to constrain the register
2912     // operands to use appropriate classes.
2913     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2914   }
2915   case TargetOpcode::G_LSHR:
2916   case TargetOpcode::G_ASHR:
2917     if (MRI.getType(I.getOperand(0).getReg()).isVector())
2918       return selectVectorAshrLshr(I, MRI);
2919     LLVM_FALLTHROUGH;
2920   case TargetOpcode::G_SHL:
2921     if (Opcode == TargetOpcode::G_SHL &&
2922         MRI.getType(I.getOperand(0).getReg()).isVector())
2923       return selectVectorSHL(I, MRI);
2924 
2925     // These shifts were legalized to have 64 bit shift amounts because we
2926     // want to take advantage of the selection patterns that assume the
2927     // immediates are s64s, however, selectBinaryOp will assume both operands
2928     // will have the same bit size.
2929     {
2930       Register SrcReg = I.getOperand(1).getReg();
2931       Register ShiftReg = I.getOperand(2).getReg();
2932       const LLT ShiftTy = MRI.getType(ShiftReg);
2933       const LLT SrcTy = MRI.getType(SrcReg);
2934       if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 32 &&
2935           ShiftTy.getSizeInBits() == 64) {
2936         assert(!ShiftTy.isVector() && "unexpected vector shift ty");
2937         assert(MRI.getVRegDef(ShiftReg) &&
2938                "could not find a vreg definition for shift amount");
2939         // Insert a subregister copy to implement a 64->32 trunc
2940         auto Trunc = MIB.buildInstr(TargetOpcode::COPY, {SrcTy}, {})
2941                          .addReg(ShiftReg, 0, AArch64::sub_32);
2942         MRI.setRegBank(Trunc.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID));
2943         I.getOperand(2).setReg(Trunc.getReg(0));
2944       }
2945     }
2946     LLVM_FALLTHROUGH;
2947   case TargetOpcode::G_FADD:
2948   case TargetOpcode::G_FSUB:
2949   case TargetOpcode::G_FMUL:
2950   case TargetOpcode::G_FDIV:
2951   case TargetOpcode::G_OR: {
2952     // Reject the various things we don't support yet.
2953     if (unsupportedBinOp(I, RBI, MRI, TRI))
2954       return false;
2955 
2956     const unsigned OpSize = Ty.getSizeInBits();
2957 
2958     const Register DefReg = I.getOperand(0).getReg();
2959     const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
2960 
2961     const unsigned NewOpc = selectBinaryOp(I.getOpcode(), RB.getID(), OpSize);
2962     if (NewOpc == I.getOpcode())
2963       return false;
2964 
2965     I.setDesc(TII.get(NewOpc));
2966     // FIXME: Should the type be always reset in setDesc?
2967 
2968     // Now that we selected an opcode, we need to constrain the register
2969     // operands to use appropriate classes.
2970     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2971   }
2972 
2973   case TargetOpcode::G_PTR_ADD: {
2974     emitADD(I.getOperand(0).getReg(), I.getOperand(1), I.getOperand(2), MIB);
2975     I.eraseFromParent();
2976     return true;
2977   }
2978   case TargetOpcode::G_SADDO:
2979   case TargetOpcode::G_UADDO:
2980   case TargetOpcode::G_SSUBO:
2981   case TargetOpcode::G_USUBO: {
2982     // Emit the operation and get the correct condition code.
2983     auto OpAndCC = emitOverflowOp(Opcode, I.getOperand(0).getReg(),
2984                                   I.getOperand(2), I.getOperand(3), MIB);
2985 
2986     // Now, put the overflow result in the register given by the first operand
2987     // to the overflow op. CSINC increments the result when the predicate is
2988     // false, so to get the increment when it's true, we need to use the
2989     // inverse. In this case, we want to increment when carry is set.
2990     Register ZReg = AArch64::WZR;
2991     emitCSINC(/*Dst=*/I.getOperand(1).getReg(), /*Src1=*/ZReg, /*Src2=*/ZReg,
2992               getInvertedCondCode(OpAndCC.second), MIB);
2993     I.eraseFromParent();
2994     return true;
2995   }
2996 
2997   case TargetOpcode::G_PTRMASK: {
2998     Register MaskReg = I.getOperand(2).getReg();
2999     Optional<int64_t> MaskVal = getIConstantVRegSExtVal(MaskReg, MRI);
3000     // TODO: Implement arbitrary cases
3001     if (!MaskVal || !isShiftedMask_64(*MaskVal))
3002       return false;
3003 
3004     uint64_t Mask = *MaskVal;
3005     I.setDesc(TII.get(AArch64::ANDXri));
3006     I.getOperand(2).ChangeToImmediate(
3007         AArch64_AM::encodeLogicalImmediate(Mask, 64));
3008 
3009     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3010   }
3011   case TargetOpcode::G_PTRTOINT:
3012   case TargetOpcode::G_TRUNC: {
3013     const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
3014     const LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
3015 
3016     const Register DstReg = I.getOperand(0).getReg();
3017     const Register SrcReg = I.getOperand(1).getReg();
3018 
3019     const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
3020     const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
3021 
3022     if (DstRB.getID() != SrcRB.getID()) {
3023       LLVM_DEBUG(
3024           dbgs() << "G_TRUNC/G_PTRTOINT input/output on different banks\n");
3025       return false;
3026     }
3027 
3028     if (DstRB.getID() == AArch64::GPRRegBankID) {
3029       const TargetRegisterClass *DstRC =
3030           getRegClassForTypeOnBank(DstTy, DstRB, RBI);
3031       if (!DstRC)
3032         return false;
3033 
3034       const TargetRegisterClass *SrcRC =
3035           getRegClassForTypeOnBank(SrcTy, SrcRB, RBI);
3036       if (!SrcRC)
3037         return false;
3038 
3039       if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
3040           !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
3041         LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC/G_PTRTOINT\n");
3042         return false;
3043       }
3044 
3045       if (DstRC == SrcRC) {
3046         // Nothing to be done
3047       } else if (Opcode == TargetOpcode::G_TRUNC && DstTy == LLT::scalar(32) &&
3048                  SrcTy == LLT::scalar(64)) {
3049         llvm_unreachable("TableGen can import this case");
3050         return false;
3051       } else if (DstRC == &AArch64::GPR32RegClass &&
3052                  SrcRC == &AArch64::GPR64RegClass) {
3053         I.getOperand(1).setSubReg(AArch64::sub_32);
3054       } else {
3055         LLVM_DEBUG(
3056             dbgs() << "Unhandled mismatched classes in G_TRUNC/G_PTRTOINT\n");
3057         return false;
3058       }
3059 
3060       I.setDesc(TII.get(TargetOpcode::COPY));
3061       return true;
3062     } else if (DstRB.getID() == AArch64::FPRRegBankID) {
3063       if (DstTy == LLT::fixed_vector(4, 16) &&
3064           SrcTy == LLT::fixed_vector(4, 32)) {
3065         I.setDesc(TII.get(AArch64::XTNv4i16));
3066         constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3067         return true;
3068       }
3069 
3070       if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128) {
3071         MachineInstr *Extract = emitExtractVectorElt(
3072             DstReg, DstRB, LLT::scalar(DstTy.getSizeInBits()), SrcReg, 0, MIB);
3073         if (!Extract)
3074           return false;
3075         I.eraseFromParent();
3076         return true;
3077       }
3078 
3079       // We might have a vector G_PTRTOINT, in which case just emit a COPY.
3080       if (Opcode == TargetOpcode::G_PTRTOINT) {
3081         assert(DstTy.isVector() && "Expected an FPR ptrtoint to be a vector");
3082         I.setDesc(TII.get(TargetOpcode::COPY));
3083         return selectCopy(I, TII, MRI, TRI, RBI);
3084       }
3085     }
3086 
3087     return false;
3088   }
3089 
3090   case TargetOpcode::G_ANYEXT: {
3091     if (selectUSMovFromExtend(I, MRI))
3092       return true;
3093 
3094     const Register DstReg = I.getOperand(0).getReg();
3095     const Register SrcReg = I.getOperand(1).getReg();
3096 
3097     const RegisterBank &RBDst = *RBI.getRegBank(DstReg, MRI, TRI);
3098     if (RBDst.getID() != AArch64::GPRRegBankID) {
3099       LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBDst
3100                         << ", expected: GPR\n");
3101       return false;
3102     }
3103 
3104     const RegisterBank &RBSrc = *RBI.getRegBank(SrcReg, MRI, TRI);
3105     if (RBSrc.getID() != AArch64::GPRRegBankID) {
3106       LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBSrc
3107                         << ", expected: GPR\n");
3108       return false;
3109     }
3110 
3111     const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
3112 
3113     if (DstSize == 0) {
3114       LLVM_DEBUG(dbgs() << "G_ANYEXT operand has no size, not a gvreg?\n");
3115       return false;
3116     }
3117 
3118     if (DstSize != 64 && DstSize > 32) {
3119       LLVM_DEBUG(dbgs() << "G_ANYEXT to size: " << DstSize
3120                         << ", expected: 32 or 64\n");
3121       return false;
3122     }
3123     // At this point G_ANYEXT is just like a plain COPY, but we need
3124     // to explicitly form the 64-bit value if any.
3125     if (DstSize > 32) {
3126       Register ExtSrc = MRI.createVirtualRegister(&AArch64::GPR64allRegClass);
3127       BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG))
3128           .addDef(ExtSrc)
3129           .addImm(0)
3130           .addUse(SrcReg)
3131           .addImm(AArch64::sub_32);
3132       I.getOperand(1).setReg(ExtSrc);
3133     }
3134     return selectCopy(I, TII, MRI, TRI, RBI);
3135   }
3136 
3137   case TargetOpcode::G_ZEXT:
3138   case TargetOpcode::G_SEXT_INREG:
3139   case TargetOpcode::G_SEXT: {
3140     if (selectUSMovFromExtend(I, MRI))
3141       return true;
3142 
3143     unsigned Opcode = I.getOpcode();
3144     const bool IsSigned = Opcode != TargetOpcode::G_ZEXT;
3145     const Register DefReg = I.getOperand(0).getReg();
3146     Register SrcReg = I.getOperand(1).getReg();
3147     const LLT DstTy = MRI.getType(DefReg);
3148     const LLT SrcTy = MRI.getType(SrcReg);
3149     unsigned DstSize = DstTy.getSizeInBits();
3150     unsigned SrcSize = SrcTy.getSizeInBits();
3151 
3152     // SEXT_INREG has the same src reg size as dst, the size of the value to be
3153     // extended is encoded in the imm.
3154     if (Opcode == TargetOpcode::G_SEXT_INREG)
3155       SrcSize = I.getOperand(2).getImm();
3156 
3157     if (DstTy.isVector())
3158       return false; // Should be handled by imported patterns.
3159 
3160     assert((*RBI.getRegBank(DefReg, MRI, TRI)).getID() ==
3161                AArch64::GPRRegBankID &&
3162            "Unexpected ext regbank");
3163 
3164     MachineInstr *ExtI;
3165 
3166     // First check if we're extending the result of a load which has a dest type
3167     // smaller than 32 bits, then this zext is redundant. GPR32 is the smallest
3168     // GPR register on AArch64 and all loads which are smaller automatically
3169     // zero-extend the upper bits. E.g.
3170     // %v(s8) = G_LOAD %p, :: (load 1)
3171     // %v2(s32) = G_ZEXT %v(s8)
3172     if (!IsSigned) {
3173       auto *LoadMI = getOpcodeDef(TargetOpcode::G_LOAD, SrcReg, MRI);
3174       bool IsGPR =
3175           RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::GPRRegBankID;
3176       if (LoadMI && IsGPR) {
3177         const MachineMemOperand *MemOp = *LoadMI->memoperands_begin();
3178         unsigned BytesLoaded = MemOp->getSize();
3179         if (BytesLoaded < 4 && SrcTy.getSizeInBytes() == BytesLoaded)
3180           return selectCopy(I, TII, MRI, TRI, RBI);
3181       }
3182 
3183       // For the 32-bit -> 64-bit case, we can emit a mov (ORRWrs)
3184       // + SUBREG_TO_REG.
3185       //
3186       // If we are zero extending from 32 bits to 64 bits, it's possible that
3187       // the instruction implicitly does the zero extend for us. In that case,
3188       // we only need the SUBREG_TO_REG.
3189       if (IsGPR && SrcSize == 32 && DstSize == 64) {
3190         // Unlike with the G_LOAD case, we don't want to look through copies
3191         // here. (See isDef32.)
3192         MachineInstr *Def = MRI.getVRegDef(SrcReg);
3193         Register SubregToRegSrc = SrcReg;
3194 
3195         // Does the instruction implicitly zero extend?
3196         if (!Def || !isDef32(*Def)) {
3197           // No. Zero out using an OR.
3198           Register OrDst = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3199           const Register ZReg = AArch64::WZR;
3200           MIB.buildInstr(AArch64::ORRWrs, {OrDst}, {ZReg, SrcReg}).addImm(0);
3201           SubregToRegSrc = OrDst;
3202         }
3203 
3204         MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {})
3205             .addImm(0)
3206             .addUse(SubregToRegSrc)
3207             .addImm(AArch64::sub_32);
3208 
3209         if (!RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass,
3210                                           MRI)) {
3211           LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT destination\n");
3212           return false;
3213         }
3214 
3215         if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass,
3216                                           MRI)) {
3217           LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT source\n");
3218           return false;
3219         }
3220 
3221         I.eraseFromParent();
3222         return true;
3223       }
3224     }
3225 
3226     if (DstSize == 64) {
3227       if (Opcode != TargetOpcode::G_SEXT_INREG) {
3228         // FIXME: Can we avoid manually doing this?
3229         if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass,
3230                                           MRI)) {
3231           LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode)
3232                             << " operand\n");
3233           return false;
3234         }
3235         SrcReg = MIB.buildInstr(AArch64::SUBREG_TO_REG,
3236                                 {&AArch64::GPR64RegClass}, {})
3237                      .addImm(0)
3238                      .addUse(SrcReg)
3239                      .addImm(AArch64::sub_32)
3240                      .getReg(0);
3241       }
3242 
3243       ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMXri : AArch64::UBFMXri,
3244                              {DefReg}, {SrcReg})
3245                   .addImm(0)
3246                   .addImm(SrcSize - 1);
3247     } else if (DstSize <= 32) {
3248       ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMWri : AArch64::UBFMWri,
3249                              {DefReg}, {SrcReg})
3250                   .addImm(0)
3251                   .addImm(SrcSize - 1);
3252     } else {
3253       return false;
3254     }
3255 
3256     constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
3257     I.eraseFromParent();
3258     return true;
3259   }
3260 
3261   case TargetOpcode::G_SITOFP:
3262   case TargetOpcode::G_UITOFP:
3263   case TargetOpcode::G_FPTOSI:
3264   case TargetOpcode::G_FPTOUI: {
3265     const LLT DstTy = MRI.getType(I.getOperand(0).getReg()),
3266               SrcTy = MRI.getType(I.getOperand(1).getReg());
3267     const unsigned NewOpc = selectFPConvOpc(Opcode, DstTy, SrcTy);
3268     if (NewOpc == Opcode)
3269       return false;
3270 
3271     I.setDesc(TII.get(NewOpc));
3272     constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3273 
3274     return true;
3275   }
3276 
3277   case TargetOpcode::G_FREEZE:
3278     return selectCopy(I, TII, MRI, TRI, RBI);
3279 
3280   case TargetOpcode::G_INTTOPTR:
3281     // The importer is currently unable to import pointer types since they
3282     // didn't exist in SelectionDAG.
3283     return selectCopy(I, TII, MRI, TRI, RBI);
3284 
3285   case TargetOpcode::G_BITCAST:
3286     // Imported SelectionDAG rules can handle every bitcast except those that
3287     // bitcast from a type to the same type. Ideally, these shouldn't occur
3288     // but we might not run an optimizer that deletes them. The other exception
3289     // is bitcasts involving pointer types, as SelectionDAG has no knowledge
3290     // of them.
3291     return selectCopy(I, TII, MRI, TRI, RBI);
3292 
3293   case TargetOpcode::G_SELECT: {
3294     if (MRI.getType(I.getOperand(1).getReg()) != LLT::scalar(1)) {
3295       LLVM_DEBUG(dbgs() << "G_SELECT cond has type: " << Ty
3296                         << ", expected: " << LLT::scalar(1) << '\n');
3297       return false;
3298     }
3299 
3300     const Register CondReg = I.getOperand(1).getReg();
3301     const Register TReg = I.getOperand(2).getReg();
3302     const Register FReg = I.getOperand(3).getReg();
3303 
3304     if (tryOptSelect(I))
3305       return true;
3306 
3307     // Make sure to use an unused vreg instead of wzr, so that the peephole
3308     // optimizations will be able to optimize these.
3309     Register DeadVReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3310     auto TstMI = MIB.buildInstr(AArch64::ANDSWri, {DeadVReg}, {CondReg})
3311                      .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
3312     constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
3313     if (!emitSelect(I.getOperand(0).getReg(), TReg, FReg, AArch64CC::NE, MIB))
3314       return false;
3315     I.eraseFromParent();
3316     return true;
3317   }
3318   case TargetOpcode::G_ICMP: {
3319     if (Ty.isVector())
3320       return selectVectorICmp(I, MRI);
3321 
3322     if (Ty != LLT::scalar(32)) {
3323       LLVM_DEBUG(dbgs() << "G_ICMP result has type: " << Ty
3324                         << ", expected: " << LLT::scalar(32) << '\n');
3325       return false;
3326     }
3327 
3328     auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate());
3329     const AArch64CC::CondCode InvCC =
3330         changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred));
3331     emitIntegerCompare(I.getOperand(2), I.getOperand(3), I.getOperand(1), MIB);
3332     emitCSINC(/*Dst=*/I.getOperand(0).getReg(), /*Src1=*/AArch64::WZR,
3333               /*Src2=*/AArch64::WZR, InvCC, MIB);
3334     I.eraseFromParent();
3335     return true;
3336   }
3337 
3338   case TargetOpcode::G_FCMP: {
3339     CmpInst::Predicate Pred =
3340         static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate());
3341     if (!emitFPCompare(I.getOperand(2).getReg(), I.getOperand(3).getReg(), MIB,
3342                        Pred) ||
3343         !emitCSetForFCmp(I.getOperand(0).getReg(), Pred, MIB))
3344       return false;
3345     I.eraseFromParent();
3346     return true;
3347   }
3348   case TargetOpcode::G_VASTART:
3349     return STI.isTargetDarwin() ? selectVaStartDarwin(I, MF, MRI)
3350                                 : selectVaStartAAPCS(I, MF, MRI);
3351   case TargetOpcode::G_INTRINSIC:
3352     return selectIntrinsic(I, MRI);
3353   case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3354     return selectIntrinsicWithSideEffects(I, MRI);
3355   case TargetOpcode::G_IMPLICIT_DEF: {
3356     I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
3357     const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
3358     const Register DstReg = I.getOperand(0).getReg();
3359     const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
3360     const TargetRegisterClass *DstRC =
3361         getRegClassForTypeOnBank(DstTy, DstRB, RBI);
3362     RBI.constrainGenericRegister(DstReg, *DstRC, MRI);
3363     return true;
3364   }
3365   case TargetOpcode::G_BLOCK_ADDR: {
3366     if (TM.getCodeModel() == CodeModel::Large) {
3367       materializeLargeCMVal(I, I.getOperand(1).getBlockAddress(), 0);
3368       I.eraseFromParent();
3369       return true;
3370     } else {
3371       I.setDesc(TII.get(AArch64::MOVaddrBA));
3372       auto MovMI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::MOVaddrBA),
3373                            I.getOperand(0).getReg())
3374                        .addBlockAddress(I.getOperand(1).getBlockAddress(),
3375                                         /* Offset */ 0, AArch64II::MO_PAGE)
3376                        .addBlockAddress(
3377                            I.getOperand(1).getBlockAddress(), /* Offset */ 0,
3378                            AArch64II::MO_NC | AArch64II::MO_PAGEOFF);
3379       I.eraseFromParent();
3380       return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI);
3381     }
3382   }
3383   case AArch64::G_DUP: {
3384     // When the scalar of G_DUP is an s8/s16 gpr, they can't be selected by
3385     // imported patterns. Do it manually here. Avoiding generating s16 gpr is
3386     // difficult because at RBS we may end up pessimizing the fpr case if we
3387     // decided to add an anyextend to fix this. Manual selection is the most
3388     // robust solution for now.
3389     if (RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() !=
3390         AArch64::GPRRegBankID)
3391       return false; // We expect the fpr regbank case to be imported.
3392     LLT VecTy = MRI.getType(I.getOperand(0).getReg());
3393     if (VecTy == LLT::fixed_vector(8, 8))
3394       I.setDesc(TII.get(AArch64::DUPv8i8gpr));
3395     else if (VecTy == LLT::fixed_vector(16, 8))
3396       I.setDesc(TII.get(AArch64::DUPv16i8gpr));
3397     else if (VecTy == LLT::fixed_vector(4, 16))
3398       I.setDesc(TII.get(AArch64::DUPv4i16gpr));
3399     else if (VecTy == LLT::fixed_vector(8, 16))
3400       I.setDesc(TII.get(AArch64::DUPv8i16gpr));
3401     else
3402       return false;
3403     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3404   }
3405   case TargetOpcode::G_INTRINSIC_TRUNC:
3406     return selectIntrinsicTrunc(I, MRI);
3407   case TargetOpcode::G_INTRINSIC_ROUND:
3408     return selectIntrinsicRound(I, MRI);
3409   case TargetOpcode::G_BUILD_VECTOR:
3410     return selectBuildVector(I, MRI);
3411   case TargetOpcode::G_MERGE_VALUES:
3412     return selectMergeValues(I, MRI);
3413   case TargetOpcode::G_UNMERGE_VALUES:
3414     return selectUnmergeValues(I, MRI);
3415   case TargetOpcode::G_SHUFFLE_VECTOR:
3416     return selectShuffleVector(I, MRI);
3417   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3418     return selectExtractElt(I, MRI);
3419   case TargetOpcode::G_INSERT_VECTOR_ELT:
3420     return selectInsertElt(I, MRI);
3421   case TargetOpcode::G_CONCAT_VECTORS:
3422     return selectConcatVectors(I, MRI);
3423   case TargetOpcode::G_JUMP_TABLE:
3424     return selectJumpTable(I, MRI);
3425   case TargetOpcode::G_VECREDUCE_FADD:
3426   case TargetOpcode::G_VECREDUCE_ADD:
3427     return selectReduction(I, MRI);
3428   case TargetOpcode::G_MEMCPY:
3429   case TargetOpcode::G_MEMCPY_INLINE:
3430   case TargetOpcode::G_MEMMOVE:
3431   case TargetOpcode::G_MEMSET:
3432     assert(STI.hasMOPS() && "Shouldn't get here without +mops feature");
3433     return selectMOPS(I, MRI);
3434   }
3435 
3436   return false;
3437 }
3438 
3439 bool AArch64InstructionSelector::selectReduction(MachineInstr &I,
3440                                                  MachineRegisterInfo &MRI) {
3441   Register VecReg = I.getOperand(1).getReg();
3442   LLT VecTy = MRI.getType(VecReg);
3443   if (I.getOpcode() == TargetOpcode::G_VECREDUCE_ADD) {
3444     // For <2 x i32> ADDPv2i32 generates an FPR64 value, so we need to emit
3445     // a subregister copy afterwards.
3446     if (VecTy == LLT::fixed_vector(2, 32)) {
3447       Register DstReg = I.getOperand(0).getReg();
3448       auto AddP = MIB.buildInstr(AArch64::ADDPv2i32, {&AArch64::FPR64RegClass},
3449                                  {VecReg, VecReg});
3450       auto Copy = MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
3451                       .addReg(AddP.getReg(0), 0, AArch64::ssub)
3452                       .getReg(0);
3453       RBI.constrainGenericRegister(Copy, AArch64::FPR32RegClass, MRI);
3454       I.eraseFromParent();
3455       return constrainSelectedInstRegOperands(*AddP, TII, TRI, RBI);
3456     }
3457 
3458     unsigned Opc = 0;
3459     if (VecTy == LLT::fixed_vector(16, 8))
3460       Opc = AArch64::ADDVv16i8v;
3461     else if (VecTy == LLT::fixed_vector(8, 16))
3462       Opc = AArch64::ADDVv8i16v;
3463     else if (VecTy == LLT::fixed_vector(4, 32))
3464       Opc = AArch64::ADDVv4i32v;
3465     else if (VecTy == LLT::fixed_vector(2, 64))
3466       Opc = AArch64::ADDPv2i64p;
3467     else {
3468       LLVM_DEBUG(dbgs() << "Unhandled type for add reduction");
3469       return false;
3470     }
3471     I.setDesc(TII.get(Opc));
3472     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3473   }
3474 
3475   if (I.getOpcode() == TargetOpcode::G_VECREDUCE_FADD) {
3476     unsigned Opc = 0;
3477     if (VecTy == LLT::fixed_vector(2, 32))
3478       Opc = AArch64::FADDPv2i32p;
3479     else if (VecTy == LLT::fixed_vector(2, 64))
3480       Opc = AArch64::FADDPv2i64p;
3481     else {
3482       LLVM_DEBUG(dbgs() << "Unhandled type for fadd reduction");
3483       return false;
3484     }
3485     I.setDesc(TII.get(Opc));
3486     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3487   }
3488   return false;
3489 }
3490 
3491 bool AArch64InstructionSelector::selectMOPS(MachineInstr &GI,
3492                                             MachineRegisterInfo &MRI) {
3493   unsigned Mopcode;
3494   switch (GI.getOpcode()) {
3495   case TargetOpcode::G_MEMCPY:
3496   case TargetOpcode::G_MEMCPY_INLINE:
3497     Mopcode = AArch64::MOPSMemoryCopyPseudo;
3498     break;
3499   case TargetOpcode::G_MEMMOVE:
3500     Mopcode = AArch64::MOPSMemoryMovePseudo;
3501     break;
3502   case TargetOpcode::G_MEMSET:
3503     // For tagged memset see llvm.aarch64.mops.memset.tag
3504     Mopcode = AArch64::MOPSMemorySetPseudo;
3505     break;
3506   }
3507 
3508   auto &DstPtr = GI.getOperand(0);
3509   auto &SrcOrVal = GI.getOperand(1);
3510   auto &Size = GI.getOperand(2);
3511 
3512   // Create copies of the registers that can be clobbered.
3513   const Register DstPtrCopy = MRI.cloneVirtualRegister(DstPtr.getReg());
3514   const Register SrcValCopy = MRI.cloneVirtualRegister(SrcOrVal.getReg());
3515   const Register SizeCopy = MRI.cloneVirtualRegister(Size.getReg());
3516 
3517   const bool IsSet = Mopcode == AArch64::MOPSMemorySetPseudo;
3518   const auto &SrcValRegClass =
3519       IsSet ? AArch64::GPR64RegClass : AArch64::GPR64commonRegClass;
3520 
3521   // Constrain to specific registers
3522   RBI.constrainGenericRegister(DstPtrCopy, AArch64::GPR64commonRegClass, MRI);
3523   RBI.constrainGenericRegister(SrcValCopy, SrcValRegClass, MRI);
3524   RBI.constrainGenericRegister(SizeCopy, AArch64::GPR64RegClass, MRI);
3525 
3526   MIB.buildCopy(DstPtrCopy, DstPtr);
3527   MIB.buildCopy(SrcValCopy, SrcOrVal);
3528   MIB.buildCopy(SizeCopy, Size);
3529 
3530   // New instruction uses the copied registers because it must update them.
3531   // The defs are not used since they don't exist in G_MEM*. They are still
3532   // tied.
3533   // Note: order of operands is different from G_MEMSET, G_MEMCPY, G_MEMMOVE
3534   Register DefDstPtr = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass);
3535   Register DefSize = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3536   if (IsSet) {
3537     MIB.buildInstr(Mopcode, {DefDstPtr, DefSize},
3538                    {DstPtrCopy, SizeCopy, SrcValCopy});
3539   } else {
3540     Register DefSrcPtr = MRI.createVirtualRegister(&SrcValRegClass);
3541     MIB.buildInstr(Mopcode, {DefDstPtr, DefSrcPtr, DefSize},
3542                    {DstPtrCopy, SrcValCopy, SizeCopy});
3543   }
3544 
3545   GI.eraseFromParent();
3546   return true;
3547 }
3548 
3549 bool AArch64InstructionSelector::selectBrJT(MachineInstr &I,
3550                                             MachineRegisterInfo &MRI) {
3551   assert(I.getOpcode() == TargetOpcode::G_BRJT && "Expected G_BRJT");
3552   Register JTAddr = I.getOperand(0).getReg();
3553   unsigned JTI = I.getOperand(1).getIndex();
3554   Register Index = I.getOperand(2).getReg();
3555 
3556   Register TargetReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3557   Register ScratchReg = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
3558 
3559   MF->getInfo<AArch64FunctionInfo>()->setJumpTableEntryInfo(JTI, 4, nullptr);
3560   auto JumpTableInst = MIB.buildInstr(AArch64::JumpTableDest32,
3561                                       {TargetReg, ScratchReg}, {JTAddr, Index})
3562                            .addJumpTableIndex(JTI);
3563   // Build the indirect branch.
3564   MIB.buildInstr(AArch64::BR, {}, {TargetReg});
3565   I.eraseFromParent();
3566   return constrainSelectedInstRegOperands(*JumpTableInst, TII, TRI, RBI);
3567 }
3568 
3569 bool AArch64InstructionSelector::selectJumpTable(MachineInstr &I,
3570                                                  MachineRegisterInfo &MRI) {
3571   assert(I.getOpcode() == TargetOpcode::G_JUMP_TABLE && "Expected jump table");
3572   assert(I.getOperand(1).isJTI() && "Jump table op should have a JTI!");
3573 
3574   Register DstReg = I.getOperand(0).getReg();
3575   unsigned JTI = I.getOperand(1).getIndex();
3576   // We generate a MOVaddrJT which will get expanded to an ADRP + ADD later.
3577   auto MovMI =
3578     MIB.buildInstr(AArch64::MOVaddrJT, {DstReg}, {})
3579           .addJumpTableIndex(JTI, AArch64II::MO_PAGE)
3580           .addJumpTableIndex(JTI, AArch64II::MO_NC | AArch64II::MO_PAGEOFF);
3581   I.eraseFromParent();
3582   return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI);
3583 }
3584 
3585 bool AArch64InstructionSelector::selectTLSGlobalValue(
3586     MachineInstr &I, MachineRegisterInfo &MRI) {
3587   if (!STI.isTargetMachO())
3588     return false;
3589   MachineFunction &MF = *I.getParent()->getParent();
3590   MF.getFrameInfo().setAdjustsStack(true);
3591 
3592   const auto &GlobalOp = I.getOperand(1);
3593   assert(GlobalOp.getOffset() == 0 &&
3594          "Shouldn't have an offset on TLS globals!");
3595   const GlobalValue &GV = *GlobalOp.getGlobal();
3596 
3597   auto LoadGOT =
3598       MIB.buildInstr(AArch64::LOADgot, {&AArch64::GPR64commonRegClass}, {})
3599           .addGlobalAddress(&GV, 0, AArch64II::MO_TLS);
3600 
3601   auto Load = MIB.buildInstr(AArch64::LDRXui, {&AArch64::GPR64commonRegClass},
3602                              {LoadGOT.getReg(0)})
3603                   .addImm(0);
3604 
3605   MIB.buildCopy(Register(AArch64::X0), LoadGOT.getReg(0));
3606   // TLS calls preserve all registers except those that absolutely must be
3607   // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
3608   // silly).
3609   MIB.buildInstr(getBLRCallOpcode(MF), {}, {Load})
3610       .addUse(AArch64::X0, RegState::Implicit)
3611       .addDef(AArch64::X0, RegState::Implicit)
3612       .addRegMask(TRI.getTLSCallPreservedMask());
3613 
3614   MIB.buildCopy(I.getOperand(0).getReg(), Register(AArch64::X0));
3615   RBI.constrainGenericRegister(I.getOperand(0).getReg(), AArch64::GPR64RegClass,
3616                                MRI);
3617   I.eraseFromParent();
3618   return true;
3619 }
3620 
3621 bool AArch64InstructionSelector::selectIntrinsicTrunc(
3622     MachineInstr &I, MachineRegisterInfo &MRI) const {
3623   const LLT SrcTy = MRI.getType(I.getOperand(0).getReg());
3624 
3625   // Select the correct opcode.
3626   unsigned Opc = 0;
3627   if (!SrcTy.isVector()) {
3628     switch (SrcTy.getSizeInBits()) {
3629     default:
3630     case 16:
3631       Opc = AArch64::FRINTZHr;
3632       break;
3633     case 32:
3634       Opc = AArch64::FRINTZSr;
3635       break;
3636     case 64:
3637       Opc = AArch64::FRINTZDr;
3638       break;
3639     }
3640   } else {
3641     unsigned NumElts = SrcTy.getNumElements();
3642     switch (SrcTy.getElementType().getSizeInBits()) {
3643     default:
3644       break;
3645     case 16:
3646       if (NumElts == 4)
3647         Opc = AArch64::FRINTZv4f16;
3648       else if (NumElts == 8)
3649         Opc = AArch64::FRINTZv8f16;
3650       break;
3651     case 32:
3652       if (NumElts == 2)
3653         Opc = AArch64::FRINTZv2f32;
3654       else if (NumElts == 4)
3655         Opc = AArch64::FRINTZv4f32;
3656       break;
3657     case 64:
3658       if (NumElts == 2)
3659         Opc = AArch64::FRINTZv2f64;
3660       break;
3661     }
3662   }
3663 
3664   if (!Opc) {
3665     // Didn't get an opcode above, bail.
3666     LLVM_DEBUG(dbgs() << "Unsupported type for G_INTRINSIC_TRUNC!\n");
3667     return false;
3668   }
3669 
3670   // Legalization would have set us up perfectly for this; we just need to
3671   // set the opcode and move on.
3672   I.setDesc(TII.get(Opc));
3673   return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3674 }
3675 
3676 bool AArch64InstructionSelector::selectIntrinsicRound(
3677     MachineInstr &I, MachineRegisterInfo &MRI) const {
3678   const LLT SrcTy = MRI.getType(I.getOperand(0).getReg());
3679 
3680   // Select the correct opcode.
3681   unsigned Opc = 0;
3682   if (!SrcTy.isVector()) {
3683     switch (SrcTy.getSizeInBits()) {
3684     default:
3685     case 16:
3686       Opc = AArch64::FRINTAHr;
3687       break;
3688     case 32:
3689       Opc = AArch64::FRINTASr;
3690       break;
3691     case 64:
3692       Opc = AArch64::FRINTADr;
3693       break;
3694     }
3695   } else {
3696     unsigned NumElts = SrcTy.getNumElements();
3697     switch (SrcTy.getElementType().getSizeInBits()) {
3698     default:
3699       break;
3700     case 16:
3701       if (NumElts == 4)
3702         Opc = AArch64::FRINTAv4f16;
3703       else if (NumElts == 8)
3704         Opc = AArch64::FRINTAv8f16;
3705       break;
3706     case 32:
3707       if (NumElts == 2)
3708         Opc = AArch64::FRINTAv2f32;
3709       else if (NumElts == 4)
3710         Opc = AArch64::FRINTAv4f32;
3711       break;
3712     case 64:
3713       if (NumElts == 2)
3714         Opc = AArch64::FRINTAv2f64;
3715       break;
3716     }
3717   }
3718 
3719   if (!Opc) {
3720     // Didn't get an opcode above, bail.
3721     LLVM_DEBUG(dbgs() << "Unsupported type for G_INTRINSIC_ROUND!\n");
3722     return false;
3723   }
3724 
3725   // Legalization would have set us up perfectly for this; we just need to
3726   // set the opcode and move on.
3727   I.setDesc(TII.get(Opc));
3728   return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3729 }
3730 
3731 bool AArch64InstructionSelector::selectVectorICmp(
3732     MachineInstr &I, MachineRegisterInfo &MRI) {
3733   Register DstReg = I.getOperand(0).getReg();
3734   LLT DstTy = MRI.getType(DstReg);
3735   Register SrcReg = I.getOperand(2).getReg();
3736   Register Src2Reg = I.getOperand(3).getReg();
3737   LLT SrcTy = MRI.getType(SrcReg);
3738 
3739   unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits();
3740   unsigned NumElts = DstTy.getNumElements();
3741 
3742   // First index is element size, 0 == 8b, 1 == 16b, 2 == 32b, 3 == 64b
3743   // Second index is num elts, 0 == v2, 1 == v4, 2 == v8, 3 == v16
3744   // Third index is cc opcode:
3745   // 0 == eq
3746   // 1 == ugt
3747   // 2 == uge
3748   // 3 == ult
3749   // 4 == ule
3750   // 5 == sgt
3751   // 6 == sge
3752   // 7 == slt
3753   // 8 == sle
3754   // ne is done by negating 'eq' result.
3755 
3756   // This table below assumes that for some comparisons the operands will be
3757   // commuted.
3758   // ult op == commute + ugt op
3759   // ule op == commute + uge op
3760   // slt op == commute + sgt op
3761   // sle op == commute + sge op
3762   unsigned PredIdx = 0;
3763   bool SwapOperands = false;
3764   CmpInst::Predicate Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
3765   switch (Pred) {
3766   case CmpInst::ICMP_NE:
3767   case CmpInst::ICMP_EQ:
3768     PredIdx = 0;
3769     break;
3770   case CmpInst::ICMP_UGT:
3771     PredIdx = 1;
3772     break;
3773   case CmpInst::ICMP_UGE:
3774     PredIdx = 2;
3775     break;
3776   case CmpInst::ICMP_ULT:
3777     PredIdx = 3;
3778     SwapOperands = true;
3779     break;
3780   case CmpInst::ICMP_ULE:
3781     PredIdx = 4;
3782     SwapOperands = true;
3783     break;
3784   case CmpInst::ICMP_SGT:
3785     PredIdx = 5;
3786     break;
3787   case CmpInst::ICMP_SGE:
3788     PredIdx = 6;
3789     break;
3790   case CmpInst::ICMP_SLT:
3791     PredIdx = 7;
3792     SwapOperands = true;
3793     break;
3794   case CmpInst::ICMP_SLE:
3795     PredIdx = 8;
3796     SwapOperands = true;
3797     break;
3798   default:
3799     llvm_unreachable("Unhandled icmp predicate");
3800     return false;
3801   }
3802 
3803   // This table obviously should be tablegen'd when we have our GISel native
3804   // tablegen selector.
3805 
3806   static const unsigned OpcTable[4][4][9] = {
3807       {
3808           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3809            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3810            0 /* invalid */},
3811           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3812            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3813            0 /* invalid */},
3814           {AArch64::CMEQv8i8, AArch64::CMHIv8i8, AArch64::CMHSv8i8,
3815            AArch64::CMHIv8i8, AArch64::CMHSv8i8, AArch64::CMGTv8i8,
3816            AArch64::CMGEv8i8, AArch64::CMGTv8i8, AArch64::CMGEv8i8},
3817           {AArch64::CMEQv16i8, AArch64::CMHIv16i8, AArch64::CMHSv16i8,
3818            AArch64::CMHIv16i8, AArch64::CMHSv16i8, AArch64::CMGTv16i8,
3819            AArch64::CMGEv16i8, AArch64::CMGTv16i8, AArch64::CMGEv16i8}
3820       },
3821       {
3822           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3823            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3824            0 /* invalid */},
3825           {AArch64::CMEQv4i16, AArch64::CMHIv4i16, AArch64::CMHSv4i16,
3826            AArch64::CMHIv4i16, AArch64::CMHSv4i16, AArch64::CMGTv4i16,
3827            AArch64::CMGEv4i16, AArch64::CMGTv4i16, AArch64::CMGEv4i16},
3828           {AArch64::CMEQv8i16, AArch64::CMHIv8i16, AArch64::CMHSv8i16,
3829            AArch64::CMHIv8i16, AArch64::CMHSv8i16, AArch64::CMGTv8i16,
3830            AArch64::CMGEv8i16, AArch64::CMGTv8i16, AArch64::CMGEv8i16},
3831           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3832            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3833            0 /* invalid */}
3834       },
3835       {
3836           {AArch64::CMEQv2i32, AArch64::CMHIv2i32, AArch64::CMHSv2i32,
3837            AArch64::CMHIv2i32, AArch64::CMHSv2i32, AArch64::CMGTv2i32,
3838            AArch64::CMGEv2i32, AArch64::CMGTv2i32, AArch64::CMGEv2i32},
3839           {AArch64::CMEQv4i32, AArch64::CMHIv4i32, AArch64::CMHSv4i32,
3840            AArch64::CMHIv4i32, AArch64::CMHSv4i32, AArch64::CMGTv4i32,
3841            AArch64::CMGEv4i32, AArch64::CMGTv4i32, AArch64::CMGEv4i32},
3842           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3843            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3844            0 /* invalid */},
3845           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3846            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3847            0 /* invalid */}
3848       },
3849       {
3850           {AArch64::CMEQv2i64, AArch64::CMHIv2i64, AArch64::CMHSv2i64,
3851            AArch64::CMHIv2i64, AArch64::CMHSv2i64, AArch64::CMGTv2i64,
3852            AArch64::CMGEv2i64, AArch64::CMGTv2i64, AArch64::CMGEv2i64},
3853           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3854            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3855            0 /* invalid */},
3856           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3857            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3858            0 /* invalid */},
3859           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3860            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3861            0 /* invalid */}
3862       },
3863   };
3864   unsigned EltIdx = Log2_32(SrcEltSize / 8);
3865   unsigned NumEltsIdx = Log2_32(NumElts / 2);
3866   unsigned Opc = OpcTable[EltIdx][NumEltsIdx][PredIdx];
3867   if (!Opc) {
3868     LLVM_DEBUG(dbgs() << "Could not map G_ICMP to cmp opcode");
3869     return false;
3870   }
3871 
3872   const RegisterBank &VecRB = *RBI.getRegBank(SrcReg, MRI, TRI);
3873   const TargetRegisterClass *SrcRC =
3874       getRegClassForTypeOnBank(SrcTy, VecRB, RBI, true);
3875   if (!SrcRC) {
3876     LLVM_DEBUG(dbgs() << "Could not determine source register class.\n");
3877     return false;
3878   }
3879 
3880   unsigned NotOpc = Pred == ICmpInst::ICMP_NE ? AArch64::NOTv8i8 : 0;
3881   if (SrcTy.getSizeInBits() == 128)
3882     NotOpc = NotOpc ? AArch64::NOTv16i8 : 0;
3883 
3884   if (SwapOperands)
3885     std::swap(SrcReg, Src2Reg);
3886 
3887   auto Cmp = MIB.buildInstr(Opc, {SrcRC}, {SrcReg, Src2Reg});
3888   constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);
3889 
3890   // Invert if we had a 'ne' cc.
3891   if (NotOpc) {
3892     Cmp = MIB.buildInstr(NotOpc, {DstReg}, {Cmp});
3893     constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);
3894   } else {
3895     MIB.buildCopy(DstReg, Cmp.getReg(0));
3896   }
3897   RBI.constrainGenericRegister(DstReg, *SrcRC, MRI);
3898   I.eraseFromParent();
3899   return true;
3900 }
3901 
3902 MachineInstr *AArch64InstructionSelector::emitScalarToVector(
3903     unsigned EltSize, const TargetRegisterClass *DstRC, Register Scalar,
3904     MachineIRBuilder &MIRBuilder) const {
3905   auto Undef = MIRBuilder.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstRC}, {});
3906 
3907   auto BuildFn = [&](unsigned SubregIndex) {
3908     auto Ins =
3909         MIRBuilder
3910             .buildInstr(TargetOpcode::INSERT_SUBREG, {DstRC}, {Undef, Scalar})
3911             .addImm(SubregIndex);
3912     constrainSelectedInstRegOperands(*Undef, TII, TRI, RBI);
3913     constrainSelectedInstRegOperands(*Ins, TII, TRI, RBI);
3914     return &*Ins;
3915   };
3916 
3917   switch (EltSize) {
3918   case 16:
3919     return BuildFn(AArch64::hsub);
3920   case 32:
3921     return BuildFn(AArch64::ssub);
3922   case 64:
3923     return BuildFn(AArch64::dsub);
3924   default:
3925     return nullptr;
3926   }
3927 }
3928 
3929 bool AArch64InstructionSelector::selectMergeValues(
3930     MachineInstr &I, MachineRegisterInfo &MRI) {
3931   assert(I.getOpcode() == TargetOpcode::G_MERGE_VALUES && "unexpected opcode");
3932   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
3933   const LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
3934   assert(!DstTy.isVector() && !SrcTy.isVector() && "invalid merge operation");
3935   const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
3936 
3937   if (I.getNumOperands() != 3)
3938     return false;
3939 
3940   // Merging 2 s64s into an s128.
3941   if (DstTy == LLT::scalar(128)) {
3942     if (SrcTy.getSizeInBits() != 64)
3943       return false;
3944     Register DstReg = I.getOperand(0).getReg();
3945     Register Src1Reg = I.getOperand(1).getReg();
3946     Register Src2Reg = I.getOperand(2).getReg();
3947     auto Tmp = MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstTy}, {});
3948     MachineInstr *InsMI =
3949         emitLaneInsert(None, Tmp.getReg(0), Src1Reg, /* LaneIdx */ 0, RB, MIB);
3950     if (!InsMI)
3951       return false;
3952     MachineInstr *Ins2MI = emitLaneInsert(DstReg, InsMI->getOperand(0).getReg(),
3953                                           Src2Reg, /* LaneIdx */ 1, RB, MIB);
3954     if (!Ins2MI)
3955       return false;
3956     constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI);
3957     constrainSelectedInstRegOperands(*Ins2MI, TII, TRI, RBI);
3958     I.eraseFromParent();
3959     return true;
3960   }
3961 
3962   if (RB.getID() != AArch64::GPRRegBankID)
3963     return false;
3964 
3965   if (DstTy.getSizeInBits() != 64 || SrcTy.getSizeInBits() != 32)
3966     return false;
3967 
3968   auto *DstRC = &AArch64::GPR64RegClass;
3969   Register SubToRegDef = MRI.createVirtualRegister(DstRC);
3970   MachineInstr &SubRegMI = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
3971                                     TII.get(TargetOpcode::SUBREG_TO_REG))
3972                                 .addDef(SubToRegDef)
3973                                 .addImm(0)
3974                                 .addUse(I.getOperand(1).getReg())
3975                                 .addImm(AArch64::sub_32);
3976   Register SubToRegDef2 = MRI.createVirtualRegister(DstRC);
3977   // Need to anyext the second scalar before we can use bfm
3978   MachineInstr &SubRegMI2 = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
3979                                     TII.get(TargetOpcode::SUBREG_TO_REG))
3980                                 .addDef(SubToRegDef2)
3981                                 .addImm(0)
3982                                 .addUse(I.getOperand(2).getReg())
3983                                 .addImm(AArch64::sub_32);
3984   MachineInstr &BFM =
3985       *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::BFMXri))
3986            .addDef(I.getOperand(0).getReg())
3987            .addUse(SubToRegDef)
3988            .addUse(SubToRegDef2)
3989            .addImm(32)
3990            .addImm(31);
3991   constrainSelectedInstRegOperands(SubRegMI, TII, TRI, RBI);
3992   constrainSelectedInstRegOperands(SubRegMI2, TII, TRI, RBI);
3993   constrainSelectedInstRegOperands(BFM, TII, TRI, RBI);
3994   I.eraseFromParent();
3995   return true;
3996 }
3997 
3998 static bool getLaneCopyOpcode(unsigned &CopyOpc, unsigned &ExtractSubReg,
3999                               const unsigned EltSize) {
4000   // Choose a lane copy opcode and subregister based off of the size of the
4001   // vector's elements.
4002   switch (EltSize) {
4003   case 8:
4004     CopyOpc = AArch64::DUPi8;
4005     ExtractSubReg = AArch64::bsub;
4006     break;
4007   case 16:
4008     CopyOpc = AArch64::DUPi16;
4009     ExtractSubReg = AArch64::hsub;
4010     break;
4011   case 32:
4012     CopyOpc = AArch64::DUPi32;
4013     ExtractSubReg = AArch64::ssub;
4014     break;
4015   case 64:
4016     CopyOpc = AArch64::DUPi64;
4017     ExtractSubReg = AArch64::dsub;
4018     break;
4019   default:
4020     // Unknown size, bail out.
4021     LLVM_DEBUG(dbgs() << "Elt size '" << EltSize << "' unsupported.\n");
4022     return false;
4023   }
4024   return true;
4025 }
4026 
4027 MachineInstr *AArch64InstructionSelector::emitExtractVectorElt(
4028     Optional<Register> DstReg, const RegisterBank &DstRB, LLT ScalarTy,
4029     Register VecReg, unsigned LaneIdx, MachineIRBuilder &MIRBuilder) const {
4030   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4031   unsigned CopyOpc = 0;
4032   unsigned ExtractSubReg = 0;
4033   if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, ScalarTy.getSizeInBits())) {
4034     LLVM_DEBUG(
4035         dbgs() << "Couldn't determine lane copy opcode for instruction.\n");
4036     return nullptr;
4037   }
4038 
4039   const TargetRegisterClass *DstRC =
4040       getRegClassForTypeOnBank(ScalarTy, DstRB, RBI, true);
4041   if (!DstRC) {
4042     LLVM_DEBUG(dbgs() << "Could not determine destination register class.\n");
4043     return nullptr;
4044   }
4045 
4046   const RegisterBank &VecRB = *RBI.getRegBank(VecReg, MRI, TRI);
4047   const LLT &VecTy = MRI.getType(VecReg);
4048   const TargetRegisterClass *VecRC =
4049       getRegClassForTypeOnBank(VecTy, VecRB, RBI, true);
4050   if (!VecRC) {
4051     LLVM_DEBUG(dbgs() << "Could not determine source register class.\n");
4052     return nullptr;
4053   }
4054 
4055   // The register that we're going to copy into.
4056   Register InsertReg = VecReg;
4057   if (!DstReg)
4058     DstReg = MRI.createVirtualRegister(DstRC);
4059   // If the lane index is 0, we just use a subregister COPY.
4060   if (LaneIdx == 0) {
4061     auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {*DstReg}, {})
4062                     .addReg(VecReg, 0, ExtractSubReg);
4063     RBI.constrainGenericRegister(*DstReg, *DstRC, MRI);
4064     return &*Copy;
4065   }
4066 
4067   // Lane copies require 128-bit wide registers. If we're dealing with an
4068   // unpacked vector, then we need to move up to that width. Insert an implicit
4069   // def and a subregister insert to get us there.
4070   if (VecTy.getSizeInBits() != 128) {
4071     MachineInstr *ScalarToVector = emitScalarToVector(
4072         VecTy.getSizeInBits(), &AArch64::FPR128RegClass, VecReg, MIRBuilder);
4073     if (!ScalarToVector)
4074       return nullptr;
4075     InsertReg = ScalarToVector->getOperand(0).getReg();
4076   }
4077 
4078   MachineInstr *LaneCopyMI =
4079       MIRBuilder.buildInstr(CopyOpc, {*DstReg}, {InsertReg}).addImm(LaneIdx);
4080   constrainSelectedInstRegOperands(*LaneCopyMI, TII, TRI, RBI);
4081 
4082   // Make sure that we actually constrain the initial copy.
4083   RBI.constrainGenericRegister(*DstReg, *DstRC, MRI);
4084   return LaneCopyMI;
4085 }
4086 
4087 bool AArch64InstructionSelector::selectExtractElt(
4088     MachineInstr &I, MachineRegisterInfo &MRI) {
4089   assert(I.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT &&
4090          "unexpected opcode!");
4091   Register DstReg = I.getOperand(0).getReg();
4092   const LLT NarrowTy = MRI.getType(DstReg);
4093   const Register SrcReg = I.getOperand(1).getReg();
4094   const LLT WideTy = MRI.getType(SrcReg);
4095   (void)WideTy;
4096   assert(WideTy.getSizeInBits() >= NarrowTy.getSizeInBits() &&
4097          "source register size too small!");
4098   assert(!NarrowTy.isVector() && "cannot extract vector into vector!");
4099 
4100   // Need the lane index to determine the correct copy opcode.
4101   MachineOperand &LaneIdxOp = I.getOperand(2);
4102   assert(LaneIdxOp.isReg() && "Lane index operand was not a register?");
4103 
4104   if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) {
4105     LLVM_DEBUG(dbgs() << "Cannot extract into GPR.\n");
4106     return false;
4107   }
4108 
4109   // Find the index to extract from.
4110   auto VRegAndVal = getIConstantVRegValWithLookThrough(LaneIdxOp.getReg(), MRI);
4111   if (!VRegAndVal)
4112     return false;
4113   unsigned LaneIdx = VRegAndVal->Value.getSExtValue();
4114 
4115 
4116   const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
4117   MachineInstr *Extract = emitExtractVectorElt(DstReg, DstRB, NarrowTy, SrcReg,
4118                                                LaneIdx, MIB);
4119   if (!Extract)
4120     return false;
4121 
4122   I.eraseFromParent();
4123   return true;
4124 }
4125 
4126 bool AArch64InstructionSelector::selectSplitVectorUnmerge(
4127     MachineInstr &I, MachineRegisterInfo &MRI) {
4128   unsigned NumElts = I.getNumOperands() - 1;
4129   Register SrcReg = I.getOperand(NumElts).getReg();
4130   const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg());
4131   const LLT SrcTy = MRI.getType(SrcReg);
4132 
4133   assert(NarrowTy.isVector() && "Expected an unmerge into vectors");
4134   if (SrcTy.getSizeInBits() > 128) {
4135     LLVM_DEBUG(dbgs() << "Unexpected vector type for vec split unmerge");
4136     return false;
4137   }
4138 
4139   // We implement a split vector operation by treating the sub-vectors as
4140   // scalars and extracting them.
4141   const RegisterBank &DstRB =
4142       *RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI);
4143   for (unsigned OpIdx = 0; OpIdx < NumElts; ++OpIdx) {
4144     Register Dst = I.getOperand(OpIdx).getReg();
4145     MachineInstr *Extract =
4146         emitExtractVectorElt(Dst, DstRB, NarrowTy, SrcReg, OpIdx, MIB);
4147     if (!Extract)
4148       return false;
4149   }
4150   I.eraseFromParent();
4151   return true;
4152 }
4153 
4154 bool AArch64InstructionSelector::selectUnmergeValues(MachineInstr &I,
4155                                                      MachineRegisterInfo &MRI) {
4156   assert(I.getOpcode() == TargetOpcode::G_UNMERGE_VALUES &&
4157          "unexpected opcode");
4158 
4159   // TODO: Handle unmerging into GPRs and from scalars to scalars.
4160   if (RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI)->getID() !=
4161           AArch64::FPRRegBankID ||
4162       RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() !=
4163           AArch64::FPRRegBankID) {
4164     LLVM_DEBUG(dbgs() << "Unmerging vector-to-gpr and scalar-to-scalar "
4165                          "currently unsupported.\n");
4166     return false;
4167   }
4168 
4169   // The last operand is the vector source register, and every other operand is
4170   // a register to unpack into.
4171   unsigned NumElts = I.getNumOperands() - 1;
4172   Register SrcReg = I.getOperand(NumElts).getReg();
4173   const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg());
4174   const LLT WideTy = MRI.getType(SrcReg);
4175   (void)WideTy;
4176   assert((WideTy.isVector() || WideTy.getSizeInBits() == 128) &&
4177          "can only unmerge from vector or s128 types!");
4178   assert(WideTy.getSizeInBits() > NarrowTy.getSizeInBits() &&
4179          "source register size too small!");
4180 
4181   if (!NarrowTy.isScalar())
4182     return selectSplitVectorUnmerge(I, MRI);
4183 
4184   // Choose a lane copy opcode and subregister based off of the size of the
4185   // vector's elements.
4186   unsigned CopyOpc = 0;
4187   unsigned ExtractSubReg = 0;
4188   if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, NarrowTy.getSizeInBits()))
4189     return false;
4190 
4191   // Set up for the lane copies.
4192   MachineBasicBlock &MBB = *I.getParent();
4193 
4194   // Stores the registers we'll be copying from.
4195   SmallVector<Register, 4> InsertRegs;
4196 
4197   // We'll use the first register twice, so we only need NumElts-1 registers.
4198   unsigned NumInsertRegs = NumElts - 1;
4199 
4200   // If our elements fit into exactly 128 bits, then we can copy from the source
4201   // directly. Otherwise, we need to do a bit of setup with some subregister
4202   // inserts.
4203   if (NarrowTy.getSizeInBits() * NumElts == 128) {
4204     InsertRegs = SmallVector<Register, 4>(NumInsertRegs, SrcReg);
4205   } else {
4206     // No. We have to perform subregister inserts. For each insert, create an
4207     // implicit def and a subregister insert, and save the register we create.
4208     const TargetRegisterClass *RC =
4209         getMinClassForRegBank(*RBI.getRegBank(SrcReg, MRI, TRI),
4210                               WideTy.getScalarSizeInBits() * NumElts);
4211     unsigned SubReg = 0;
4212     bool Found = getSubRegForClass(RC, TRI, SubReg);
4213     (void)Found;
4214     assert(Found && "expected to find last operand's subeg idx");
4215     for (unsigned Idx = 0; Idx < NumInsertRegs; ++Idx) {
4216       Register ImpDefReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
4217       MachineInstr &ImpDefMI =
4218           *BuildMI(MBB, I, I.getDebugLoc(), TII.get(TargetOpcode::IMPLICIT_DEF),
4219                    ImpDefReg);
4220 
4221       // Now, create the subregister insert from SrcReg.
4222       Register InsertReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
4223       MachineInstr &InsMI =
4224           *BuildMI(MBB, I, I.getDebugLoc(),
4225                    TII.get(TargetOpcode::INSERT_SUBREG), InsertReg)
4226                .addUse(ImpDefReg)
4227                .addUse(SrcReg)
4228                .addImm(SubReg);
4229 
4230       constrainSelectedInstRegOperands(ImpDefMI, TII, TRI, RBI);
4231       constrainSelectedInstRegOperands(InsMI, TII, TRI, RBI);
4232 
4233       // Save the register so that we can copy from it after.
4234       InsertRegs.push_back(InsertReg);
4235     }
4236   }
4237 
4238   // Now that we've created any necessary subregister inserts, we can
4239   // create the copies.
4240   //
4241   // Perform the first copy separately as a subregister copy.
4242   Register CopyTo = I.getOperand(0).getReg();
4243   auto FirstCopy = MIB.buildInstr(TargetOpcode::COPY, {CopyTo}, {})
4244                        .addReg(InsertRegs[0], 0, ExtractSubReg);
4245   constrainSelectedInstRegOperands(*FirstCopy, TII, TRI, RBI);
4246 
4247   // Now, perform the remaining copies as vector lane copies.
4248   unsigned LaneIdx = 1;
4249   for (Register InsReg : InsertRegs) {
4250     Register CopyTo = I.getOperand(LaneIdx).getReg();
4251     MachineInstr &CopyInst =
4252         *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CopyOpc), CopyTo)
4253              .addUse(InsReg)
4254              .addImm(LaneIdx);
4255     constrainSelectedInstRegOperands(CopyInst, TII, TRI, RBI);
4256     ++LaneIdx;
4257   }
4258 
4259   // Separately constrain the first copy's destination. Because of the
4260   // limitation in constrainOperandRegClass, we can't guarantee that this will
4261   // actually be constrained. So, do it ourselves using the second operand.
4262   const TargetRegisterClass *RC =
4263       MRI.getRegClassOrNull(I.getOperand(1).getReg());
4264   if (!RC) {
4265     LLVM_DEBUG(dbgs() << "Couldn't constrain copy destination.\n");
4266     return false;
4267   }
4268 
4269   RBI.constrainGenericRegister(CopyTo, *RC, MRI);
4270   I.eraseFromParent();
4271   return true;
4272 }
4273 
4274 bool AArch64InstructionSelector::selectConcatVectors(
4275     MachineInstr &I, MachineRegisterInfo &MRI)  {
4276   assert(I.getOpcode() == TargetOpcode::G_CONCAT_VECTORS &&
4277          "Unexpected opcode");
4278   Register Dst = I.getOperand(0).getReg();
4279   Register Op1 = I.getOperand(1).getReg();
4280   Register Op2 = I.getOperand(2).getReg();
4281   MachineInstr *ConcatMI = emitVectorConcat(Dst, Op1, Op2, MIB);
4282   if (!ConcatMI)
4283     return false;
4284   I.eraseFromParent();
4285   return true;
4286 }
4287 
4288 unsigned
4289 AArch64InstructionSelector::emitConstantPoolEntry(const Constant *CPVal,
4290                                                   MachineFunction &MF) const {
4291   Type *CPTy = CPVal->getType();
4292   Align Alignment = MF.getDataLayout().getPrefTypeAlign(CPTy);
4293 
4294   MachineConstantPool *MCP = MF.getConstantPool();
4295   return MCP->getConstantPoolIndex(CPVal, Alignment);
4296 }
4297 
4298 MachineInstr *AArch64InstructionSelector::emitLoadFromConstantPool(
4299     const Constant *CPVal, MachineIRBuilder &MIRBuilder) const {
4300   auto &MF = MIRBuilder.getMF();
4301   unsigned CPIdx = emitConstantPoolEntry(CPVal, MF);
4302 
4303   auto Adrp =
4304       MIRBuilder.buildInstr(AArch64::ADRP, {&AArch64::GPR64RegClass}, {})
4305           .addConstantPoolIndex(CPIdx, 0, AArch64II::MO_PAGE);
4306 
4307   MachineInstr *LoadMI = nullptr;
4308   MachinePointerInfo PtrInfo = MachinePointerInfo::getConstantPool(MF);
4309   unsigned Size = MIRBuilder.getDataLayout().getTypeStoreSize(CPVal->getType());
4310   switch (Size) {
4311   case 16:
4312     LoadMI =
4313         &*MIRBuilder
4314               .buildInstr(AArch64::LDRQui, {&AArch64::FPR128RegClass}, {Adrp})
4315               .addConstantPoolIndex(CPIdx, 0,
4316                                     AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
4317     break;
4318   case 8:
4319     LoadMI =
4320         &*MIRBuilder
4321               .buildInstr(AArch64::LDRDui, {&AArch64::FPR64RegClass}, {Adrp})
4322               .addConstantPoolIndex(CPIdx, 0,
4323                                     AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
4324     break;
4325   case 4:
4326     LoadMI =
4327         &*MIRBuilder
4328               .buildInstr(AArch64::LDRSui, {&AArch64::FPR32RegClass}, {Adrp})
4329               .addConstantPoolIndex(CPIdx, 0,
4330                                     AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
4331     break;
4332   case 2:
4333     LoadMI =
4334         &*MIRBuilder
4335               .buildInstr(AArch64::LDRHui, {&AArch64::FPR16RegClass}, {Adrp})
4336               .addConstantPoolIndex(CPIdx, 0,
4337                                     AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
4338     break;
4339   default:
4340     LLVM_DEBUG(dbgs() << "Could not load from constant pool of type "
4341                       << *CPVal->getType());
4342     return nullptr;
4343   }
4344   LoadMI->addMemOperand(MF, MF.getMachineMemOperand(PtrInfo,
4345                                                     MachineMemOperand::MOLoad,
4346                                                     Size, Align(Size)));
4347   constrainSelectedInstRegOperands(*Adrp, TII, TRI, RBI);
4348   constrainSelectedInstRegOperands(*LoadMI, TII, TRI, RBI);
4349   return LoadMI;
4350 }
4351 
4352 /// Return an <Opcode, SubregIndex> pair to do an vector elt insert of a given
4353 /// size and RB.
4354 static std::pair<unsigned, unsigned>
4355 getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) {
4356   unsigned Opc, SubregIdx;
4357   if (RB.getID() == AArch64::GPRRegBankID) {
4358     if (EltSize == 16) {
4359       Opc = AArch64::INSvi16gpr;
4360       SubregIdx = AArch64::ssub;
4361     } else if (EltSize == 32) {
4362       Opc = AArch64::INSvi32gpr;
4363       SubregIdx = AArch64::ssub;
4364     } else if (EltSize == 64) {
4365       Opc = AArch64::INSvi64gpr;
4366       SubregIdx = AArch64::dsub;
4367     } else {
4368       llvm_unreachable("invalid elt size!");
4369     }
4370   } else {
4371     if (EltSize == 8) {
4372       Opc = AArch64::INSvi8lane;
4373       SubregIdx = AArch64::bsub;
4374     } else if (EltSize == 16) {
4375       Opc = AArch64::INSvi16lane;
4376       SubregIdx = AArch64::hsub;
4377     } else if (EltSize == 32) {
4378       Opc = AArch64::INSvi32lane;
4379       SubregIdx = AArch64::ssub;
4380     } else if (EltSize == 64) {
4381       Opc = AArch64::INSvi64lane;
4382       SubregIdx = AArch64::dsub;
4383     } else {
4384       llvm_unreachable("invalid elt size!");
4385     }
4386   }
4387   return std::make_pair(Opc, SubregIdx);
4388 }
4389 
4390 MachineInstr *AArch64InstructionSelector::emitInstr(
4391     unsigned Opcode, std::initializer_list<llvm::DstOp> DstOps,
4392     std::initializer_list<llvm::SrcOp> SrcOps, MachineIRBuilder &MIRBuilder,
4393     const ComplexRendererFns &RenderFns) const {
4394   assert(Opcode && "Expected an opcode?");
4395   assert(!isPreISelGenericOpcode(Opcode) &&
4396          "Function should only be used to produce selected instructions!");
4397   auto MI = MIRBuilder.buildInstr(Opcode, DstOps, SrcOps);
4398   if (RenderFns)
4399     for (auto &Fn : *RenderFns)
4400       Fn(MI);
4401   constrainSelectedInstRegOperands(*MI, TII, TRI, RBI);
4402   return &*MI;
4403 }
4404 
4405 MachineInstr *AArch64InstructionSelector::emitAddSub(
4406     const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode,
4407     Register Dst, MachineOperand &LHS, MachineOperand &RHS,
4408     MachineIRBuilder &MIRBuilder) const {
4409   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4410   assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4411   auto Ty = MRI.getType(LHS.getReg());
4412   assert(!Ty.isVector() && "Expected a scalar or pointer?");
4413   unsigned Size = Ty.getSizeInBits();
4414   assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit type only");
4415   bool Is32Bit = Size == 32;
4416 
4417   // INSTRri form with positive arithmetic immediate.
4418   if (auto Fns = selectArithImmed(RHS))
4419     return emitInstr(AddrModeAndSizeToOpcode[0][Is32Bit], {Dst}, {LHS},
4420                      MIRBuilder, Fns);
4421 
4422   // INSTRri form with negative arithmetic immediate.
4423   if (auto Fns = selectNegArithImmed(RHS))
4424     return emitInstr(AddrModeAndSizeToOpcode[3][Is32Bit], {Dst}, {LHS},
4425                      MIRBuilder, Fns);
4426 
4427   // INSTRrx form.
4428   if (auto Fns = selectArithExtendedRegister(RHS))
4429     return emitInstr(AddrModeAndSizeToOpcode[4][Is32Bit], {Dst}, {LHS},
4430                      MIRBuilder, Fns);
4431 
4432   // INSTRrs form.
4433   if (auto Fns = selectShiftedRegister(RHS))
4434     return emitInstr(AddrModeAndSizeToOpcode[1][Is32Bit], {Dst}, {LHS},
4435                      MIRBuilder, Fns);
4436   return emitInstr(AddrModeAndSizeToOpcode[2][Is32Bit], {Dst}, {LHS, RHS},
4437                    MIRBuilder);
4438 }
4439 
4440 MachineInstr *
4441 AArch64InstructionSelector::emitADD(Register DefReg, MachineOperand &LHS,
4442                                     MachineOperand &RHS,
4443                                     MachineIRBuilder &MIRBuilder) const {
4444   const std::array<std::array<unsigned, 2>, 5> OpcTable{
4445       {{AArch64::ADDXri, AArch64::ADDWri},
4446        {AArch64::ADDXrs, AArch64::ADDWrs},
4447        {AArch64::ADDXrr, AArch64::ADDWrr},
4448        {AArch64::SUBXri, AArch64::SUBWri},
4449        {AArch64::ADDXrx, AArch64::ADDWrx}}};
4450   return emitAddSub(OpcTable, DefReg, LHS, RHS, MIRBuilder);
4451 }
4452 
4453 MachineInstr *
4454 AArch64InstructionSelector::emitADDS(Register Dst, MachineOperand &LHS,
4455                                      MachineOperand &RHS,
4456                                      MachineIRBuilder &MIRBuilder) const {
4457   const std::array<std::array<unsigned, 2>, 5> OpcTable{
4458       {{AArch64::ADDSXri, AArch64::ADDSWri},
4459        {AArch64::ADDSXrs, AArch64::ADDSWrs},
4460        {AArch64::ADDSXrr, AArch64::ADDSWrr},
4461        {AArch64::SUBSXri, AArch64::SUBSWri},
4462        {AArch64::ADDSXrx, AArch64::ADDSWrx}}};
4463   return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder);
4464 }
4465 
4466 MachineInstr *
4467 AArch64InstructionSelector::emitSUBS(Register Dst, MachineOperand &LHS,
4468                                      MachineOperand &RHS,
4469                                      MachineIRBuilder &MIRBuilder) const {
4470   const std::array<std::array<unsigned, 2>, 5> OpcTable{
4471       {{AArch64::SUBSXri, AArch64::SUBSWri},
4472        {AArch64::SUBSXrs, AArch64::SUBSWrs},
4473        {AArch64::SUBSXrr, AArch64::SUBSWrr},
4474        {AArch64::ADDSXri, AArch64::ADDSWri},
4475        {AArch64::SUBSXrx, AArch64::SUBSWrx}}};
4476   return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder);
4477 }
4478 
4479 MachineInstr *
4480 AArch64InstructionSelector::emitCMN(MachineOperand &LHS, MachineOperand &RHS,
4481                                     MachineIRBuilder &MIRBuilder) const {
4482   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4483   bool Is32Bit = (MRI.getType(LHS.getReg()).getSizeInBits() == 32);
4484   auto RC = Is32Bit ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass;
4485   return emitADDS(MRI.createVirtualRegister(RC), LHS, RHS, MIRBuilder);
4486 }
4487 
4488 MachineInstr *
4489 AArch64InstructionSelector::emitTST(MachineOperand &LHS, MachineOperand &RHS,
4490                                     MachineIRBuilder &MIRBuilder) const {
4491   assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4492   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4493   LLT Ty = MRI.getType(LHS.getReg());
4494   unsigned RegSize = Ty.getSizeInBits();
4495   bool Is32Bit = (RegSize == 32);
4496   const unsigned OpcTable[3][2] = {{AArch64::ANDSXri, AArch64::ANDSWri},
4497                                    {AArch64::ANDSXrs, AArch64::ANDSWrs},
4498                                    {AArch64::ANDSXrr, AArch64::ANDSWrr}};
4499   // ANDS needs a logical immediate for its immediate form. Check if we can
4500   // fold one in.
4501   if (auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS.getReg(), MRI)) {
4502     int64_t Imm = ValAndVReg->Value.getSExtValue();
4503 
4504     if (AArch64_AM::isLogicalImmediate(Imm, RegSize)) {
4505       auto TstMI = MIRBuilder.buildInstr(OpcTable[0][Is32Bit], {Ty}, {LHS});
4506       TstMI.addImm(AArch64_AM::encodeLogicalImmediate(Imm, RegSize));
4507       constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
4508       return &*TstMI;
4509     }
4510   }
4511 
4512   if (auto Fns = selectLogicalShiftedRegister(RHS))
4513     return emitInstr(OpcTable[1][Is32Bit], {Ty}, {LHS}, MIRBuilder, Fns);
4514   return emitInstr(OpcTable[2][Is32Bit], {Ty}, {LHS, RHS}, MIRBuilder);
4515 }
4516 
4517 MachineInstr *AArch64InstructionSelector::emitIntegerCompare(
4518     MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
4519     MachineIRBuilder &MIRBuilder) const {
4520   assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!");
4521   assert(Predicate.isPredicate() && "Expected predicate?");
4522   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4523   LLT CmpTy = MRI.getType(LHS.getReg());
4524   assert(!CmpTy.isVector() && "Expected scalar or pointer");
4525   unsigned Size = CmpTy.getSizeInBits();
4526   (void)Size;
4527   assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit LHS/RHS?");
4528   // Fold the compare into a cmn or tst if possible.
4529   if (auto FoldCmp = tryFoldIntegerCompare(LHS, RHS, Predicate, MIRBuilder))
4530     return FoldCmp;
4531   auto Dst = MRI.cloneVirtualRegister(LHS.getReg());
4532   return emitSUBS(Dst, LHS, RHS, MIRBuilder);
4533 }
4534 
4535 MachineInstr *AArch64InstructionSelector::emitCSetForFCmp(
4536     Register Dst, CmpInst::Predicate Pred, MachineIRBuilder &MIRBuilder) const {
4537   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4538 #ifndef NDEBUG
4539   LLT Ty = MRI.getType(Dst);
4540   assert(!Ty.isVector() && Ty.getSizeInBits() == 32 &&
4541          "Expected a 32-bit scalar register?");
4542 #endif
4543   const Register ZReg = AArch64::WZR;
4544   AArch64CC::CondCode CC1, CC2;
4545   changeFCMPPredToAArch64CC(Pred, CC1, CC2);
4546   auto InvCC1 = AArch64CC::getInvertedCondCode(CC1);
4547   if (CC2 == AArch64CC::AL)
4548     return emitCSINC(/*Dst=*/Dst, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC1,
4549                      MIRBuilder);
4550   const TargetRegisterClass *RC = &AArch64::GPR32RegClass;
4551   Register Def1Reg = MRI.createVirtualRegister(RC);
4552   Register Def2Reg = MRI.createVirtualRegister(RC);
4553   auto InvCC2 = AArch64CC::getInvertedCondCode(CC2);
4554   emitCSINC(/*Dst=*/Def1Reg, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC1, MIRBuilder);
4555   emitCSINC(/*Dst=*/Def2Reg, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC2, MIRBuilder);
4556   auto OrMI = MIRBuilder.buildInstr(AArch64::ORRWrr, {Dst}, {Def1Reg, Def2Reg});
4557   constrainSelectedInstRegOperands(*OrMI, TII, TRI, RBI);
4558   return &*OrMI;
4559 }
4560 
4561 MachineInstr *
4562 AArch64InstructionSelector::emitFPCompare(Register LHS, Register RHS,
4563                                           MachineIRBuilder &MIRBuilder,
4564                                           Optional<CmpInst::Predicate> Pred) const {
4565   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4566   LLT Ty = MRI.getType(LHS);
4567   if (Ty.isVector())
4568     return nullptr;
4569   unsigned OpSize = Ty.getSizeInBits();
4570   if (OpSize != 32 && OpSize != 64)
4571     return nullptr;
4572 
4573   // If this is a compare against +0.0, then we don't have
4574   // to explicitly materialize a constant.
4575   const ConstantFP *FPImm = getConstantFPVRegVal(RHS, MRI);
4576   bool ShouldUseImm = FPImm && (FPImm->isZero() && !FPImm->isNegative());
4577 
4578   auto IsEqualityPred = [](CmpInst::Predicate P) {
4579     return P == CmpInst::FCMP_OEQ || P == CmpInst::FCMP_ONE ||
4580            P == CmpInst::FCMP_UEQ || P == CmpInst::FCMP_UNE;
4581   };
4582   if (!ShouldUseImm && Pred && IsEqualityPred(*Pred)) {
4583     // Try commutating the operands.
4584     const ConstantFP *LHSImm = getConstantFPVRegVal(LHS, MRI);
4585     if (LHSImm && (LHSImm->isZero() && !LHSImm->isNegative())) {
4586       ShouldUseImm = true;
4587       std::swap(LHS, RHS);
4588     }
4589   }
4590   unsigned CmpOpcTbl[2][2] = {{AArch64::FCMPSrr, AArch64::FCMPDrr},
4591                               {AArch64::FCMPSri, AArch64::FCMPDri}};
4592   unsigned CmpOpc = CmpOpcTbl[ShouldUseImm][OpSize == 64];
4593 
4594   // Partially build the compare. Decide if we need to add a use for the
4595   // third operand based off whether or not we're comparing against 0.0.
4596   auto CmpMI = MIRBuilder.buildInstr(CmpOpc).addUse(LHS);
4597   if (!ShouldUseImm)
4598     CmpMI.addUse(RHS);
4599   constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI);
4600   return &*CmpMI;
4601 }
4602 
4603 MachineInstr *AArch64InstructionSelector::emitVectorConcat(
4604     Optional<Register> Dst, Register Op1, Register Op2,
4605     MachineIRBuilder &MIRBuilder) const {
4606   // We implement a vector concat by:
4607   // 1. Use scalar_to_vector to insert the lower vector into the larger dest
4608   // 2. Insert the upper vector into the destination's upper element
4609   // TODO: some of this code is common with G_BUILD_VECTOR handling.
4610   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4611 
4612   const LLT Op1Ty = MRI.getType(Op1);
4613   const LLT Op2Ty = MRI.getType(Op2);
4614 
4615   if (Op1Ty != Op2Ty) {
4616     LLVM_DEBUG(dbgs() << "Could not do vector concat of differing vector tys");
4617     return nullptr;
4618   }
4619   assert(Op1Ty.isVector() && "Expected a vector for vector concat");
4620 
4621   if (Op1Ty.getSizeInBits() >= 128) {
4622     LLVM_DEBUG(dbgs() << "Vector concat not supported for full size vectors");
4623     return nullptr;
4624   }
4625 
4626   // At the moment we just support 64 bit vector concats.
4627   if (Op1Ty.getSizeInBits() != 64) {
4628     LLVM_DEBUG(dbgs() << "Vector concat supported for 64b vectors");
4629     return nullptr;
4630   }
4631 
4632   const LLT ScalarTy = LLT::scalar(Op1Ty.getSizeInBits());
4633   const RegisterBank &FPRBank = *RBI.getRegBank(Op1, MRI, TRI);
4634   const TargetRegisterClass *DstRC =
4635       getMinClassForRegBank(FPRBank, Op1Ty.getSizeInBits() * 2);
4636 
4637   MachineInstr *WidenedOp1 =
4638       emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op1, MIRBuilder);
4639   MachineInstr *WidenedOp2 =
4640       emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op2, MIRBuilder);
4641   if (!WidenedOp1 || !WidenedOp2) {
4642     LLVM_DEBUG(dbgs() << "Could not emit a vector from scalar value");
4643     return nullptr;
4644   }
4645 
4646   // Now do the insert of the upper element.
4647   unsigned InsertOpc, InsSubRegIdx;
4648   std::tie(InsertOpc, InsSubRegIdx) =
4649       getInsertVecEltOpInfo(FPRBank, ScalarTy.getSizeInBits());
4650 
4651   if (!Dst)
4652     Dst = MRI.createVirtualRegister(DstRC);
4653   auto InsElt =
4654       MIRBuilder
4655           .buildInstr(InsertOpc, {*Dst}, {WidenedOp1->getOperand(0).getReg()})
4656           .addImm(1) /* Lane index */
4657           .addUse(WidenedOp2->getOperand(0).getReg())
4658           .addImm(0);
4659   constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI);
4660   return &*InsElt;
4661 }
4662 
4663 MachineInstr *
4664 AArch64InstructionSelector::emitCSINC(Register Dst, Register Src1,
4665                                       Register Src2, AArch64CC::CondCode Pred,
4666                                       MachineIRBuilder &MIRBuilder) const {
4667   auto &MRI = *MIRBuilder.getMRI();
4668   const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Dst);
4669   // If we used a register class, then this won't necessarily have an LLT.
4670   // Compute the size based off whether or not we have a class or bank.
4671   unsigned Size;
4672   if (const auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
4673     Size = TRI.getRegSizeInBits(*RC);
4674   else
4675     Size = MRI.getType(Dst).getSizeInBits();
4676   // Some opcodes use s1.
4677   assert(Size <= 64 && "Expected 64 bits or less only!");
4678   static const unsigned OpcTable[2] = {AArch64::CSINCWr, AArch64::CSINCXr};
4679   unsigned Opc = OpcTable[Size == 64];
4680   auto CSINC = MIRBuilder.buildInstr(Opc, {Dst}, {Src1, Src2}).addImm(Pred);
4681   constrainSelectedInstRegOperands(*CSINC, TII, TRI, RBI);
4682   return &*CSINC;
4683 }
4684 
4685 std::pair<MachineInstr *, AArch64CC::CondCode>
4686 AArch64InstructionSelector::emitOverflowOp(unsigned Opcode, Register Dst,
4687                                            MachineOperand &LHS,
4688                                            MachineOperand &RHS,
4689                                            MachineIRBuilder &MIRBuilder) const {
4690   switch (Opcode) {
4691   default:
4692     llvm_unreachable("Unexpected opcode!");
4693   case TargetOpcode::G_SADDO:
4694     return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
4695   case TargetOpcode::G_UADDO:
4696     return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::HS);
4697   case TargetOpcode::G_SSUBO:
4698     return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
4699   case TargetOpcode::G_USUBO:
4700     return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::LO);
4701   }
4702 }
4703 
4704 bool AArch64InstructionSelector::tryOptSelect(MachineInstr &I) {
4705   MachineRegisterInfo &MRI = *MIB.getMRI();
4706   // We want to recognize this pattern:
4707   //
4708   // $z = G_FCMP pred, $x, $y
4709   // ...
4710   // $w = G_SELECT $z, $a, $b
4711   //
4712   // Where the value of $z is *only* ever used by the G_SELECT (possibly with
4713   // some copies/truncs in between.)
4714   //
4715   // If we see this, then we can emit something like this:
4716   //
4717   // fcmp $x, $y
4718   // fcsel $w, $a, $b, pred
4719   //
4720   // Rather than emitting both of the rather long sequences in the standard
4721   // G_FCMP/G_SELECT select methods.
4722 
4723   // First, check if the condition is defined by a compare.
4724   MachineInstr *CondDef = MRI.getVRegDef(I.getOperand(1).getReg());
4725   while (CondDef) {
4726     // We can only fold if all of the defs have one use.
4727     Register CondDefReg = CondDef->getOperand(0).getReg();
4728     if (!MRI.hasOneNonDBGUse(CondDefReg)) {
4729       // Unless it's another select.
4730       for (const MachineInstr &UI : MRI.use_nodbg_instructions(CondDefReg)) {
4731         if (CondDef == &UI)
4732           continue;
4733         if (UI.getOpcode() != TargetOpcode::G_SELECT)
4734           return false;
4735       }
4736     }
4737 
4738     // We can skip over G_TRUNC since the condition is 1-bit.
4739     // Truncating/extending can have no impact on the value.
4740     unsigned Opc = CondDef->getOpcode();
4741     if (Opc != TargetOpcode::COPY && Opc != TargetOpcode::G_TRUNC)
4742       break;
4743 
4744     // Can't see past copies from physregs.
4745     if (Opc == TargetOpcode::COPY &&
4746         Register::isPhysicalRegister(CondDef->getOperand(1).getReg()))
4747       return false;
4748 
4749     CondDef = MRI.getVRegDef(CondDef->getOperand(1).getReg());
4750   }
4751 
4752   // Is the condition defined by a compare?
4753   if (!CondDef)
4754     return false;
4755 
4756   unsigned CondOpc = CondDef->getOpcode();
4757   if (CondOpc != TargetOpcode::G_ICMP && CondOpc != TargetOpcode::G_FCMP)
4758     return false;
4759 
4760   AArch64CC::CondCode CondCode;
4761   if (CondOpc == TargetOpcode::G_ICMP) {
4762     auto Pred =
4763         static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate());
4764     CondCode = changeICMPPredToAArch64CC(Pred);
4765     emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3),
4766                        CondDef->getOperand(1), MIB);
4767   } else {
4768     // Get the condition code for the select.
4769     auto Pred =
4770         static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate());
4771     AArch64CC::CondCode CondCode2;
4772     changeFCMPPredToAArch64CC(Pred, CondCode, CondCode2);
4773 
4774     // changeFCMPPredToAArch64CC sets CondCode2 to AL when we require two
4775     // instructions to emit the comparison.
4776     // TODO: Handle FCMP_UEQ and FCMP_ONE. After that, this check will be
4777     // unnecessary.
4778     if (CondCode2 != AArch64CC::AL)
4779       return false;
4780 
4781     if (!emitFPCompare(CondDef->getOperand(2).getReg(),
4782                        CondDef->getOperand(3).getReg(), MIB)) {
4783       LLVM_DEBUG(dbgs() << "Couldn't emit compare for select!\n");
4784       return false;
4785     }
4786   }
4787 
4788   // Emit the select.
4789   emitSelect(I.getOperand(0).getReg(), I.getOperand(2).getReg(),
4790              I.getOperand(3).getReg(), CondCode, MIB);
4791   I.eraseFromParent();
4792   return true;
4793 }
4794 
4795 MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare(
4796     MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
4797     MachineIRBuilder &MIRBuilder) const {
4798   assert(LHS.isReg() && RHS.isReg() && Predicate.isPredicate() &&
4799          "Unexpected MachineOperand");
4800   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4801   // We want to find this sort of thing:
4802   // x = G_SUB 0, y
4803   // G_ICMP z, x
4804   //
4805   // In this case, we can fold the G_SUB into the G_ICMP using a CMN instead.
4806   // e.g:
4807   //
4808   // cmn z, y
4809 
4810   // Check if the RHS or LHS of the G_ICMP is defined by a SUB
4811   MachineInstr *LHSDef = getDefIgnoringCopies(LHS.getReg(), MRI);
4812   MachineInstr *RHSDef = getDefIgnoringCopies(RHS.getReg(), MRI);
4813   auto P = static_cast<CmpInst::Predicate>(Predicate.getPredicate());
4814   // Given this:
4815   //
4816   // x = G_SUB 0, y
4817   // G_ICMP x, z
4818   //
4819   // Produce this:
4820   //
4821   // cmn y, z
4822   if (isCMN(LHSDef, P, MRI))
4823     return emitCMN(LHSDef->getOperand(2), RHS, MIRBuilder);
4824 
4825   // Same idea here, but with the RHS of the compare instead:
4826   //
4827   // Given this:
4828   //
4829   // x = G_SUB 0, y
4830   // G_ICMP z, x
4831   //
4832   // Produce this:
4833   //
4834   // cmn z, y
4835   if (isCMN(RHSDef, P, MRI))
4836     return emitCMN(LHS, RHSDef->getOperand(2), MIRBuilder);
4837 
4838   // Given this:
4839   //
4840   // z = G_AND x, y
4841   // G_ICMP z, 0
4842   //
4843   // Produce this if the compare is signed:
4844   //
4845   // tst x, y
4846   if (!CmpInst::isUnsigned(P) && LHSDef &&
4847       LHSDef->getOpcode() == TargetOpcode::G_AND) {
4848     // Make sure that the RHS is 0.
4849     auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS.getReg(), MRI);
4850     if (!ValAndVReg || ValAndVReg->Value != 0)
4851       return nullptr;
4852 
4853     return emitTST(LHSDef->getOperand(1),
4854                    LHSDef->getOperand(2), MIRBuilder);
4855   }
4856 
4857   return nullptr;
4858 }
4859 
4860 bool AArch64InstructionSelector::selectShuffleVector(
4861     MachineInstr &I, MachineRegisterInfo &MRI) {
4862   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
4863   Register Src1Reg = I.getOperand(1).getReg();
4864   const LLT Src1Ty = MRI.getType(Src1Reg);
4865   Register Src2Reg = I.getOperand(2).getReg();
4866   const LLT Src2Ty = MRI.getType(Src2Reg);
4867   ArrayRef<int> Mask = I.getOperand(3).getShuffleMask();
4868 
4869   MachineBasicBlock &MBB = *I.getParent();
4870   MachineFunction &MF = *MBB.getParent();
4871   LLVMContext &Ctx = MF.getFunction().getContext();
4872 
4873   // G_SHUFFLE_VECTOR is weird in that the source operands can be scalars, if
4874   // it's originated from a <1 x T> type. Those should have been lowered into
4875   // G_BUILD_VECTOR earlier.
4876   if (!Src1Ty.isVector() || !Src2Ty.isVector()) {
4877     LLVM_DEBUG(dbgs() << "Could not select a \"scalar\" G_SHUFFLE_VECTOR\n");
4878     return false;
4879   }
4880 
4881   unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8;
4882 
4883   SmallVector<Constant *, 64> CstIdxs;
4884   for (int Val : Mask) {
4885     // For now, any undef indexes we'll just assume to be 0. This should be
4886     // optimized in future, e.g. to select DUP etc.
4887     Val = Val < 0 ? 0 : Val;
4888     for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
4889       unsigned Offset = Byte + Val * BytesPerElt;
4890       CstIdxs.emplace_back(ConstantInt::get(Type::getInt8Ty(Ctx), Offset));
4891     }
4892   }
4893 
4894   // Use a constant pool to load the index vector for TBL.
4895   Constant *CPVal = ConstantVector::get(CstIdxs);
4896   MachineInstr *IndexLoad = emitLoadFromConstantPool(CPVal, MIB);
4897   if (!IndexLoad) {
4898     LLVM_DEBUG(dbgs() << "Could not load from a constant pool");
4899     return false;
4900   }
4901 
4902   if (DstTy.getSizeInBits() != 128) {
4903     assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty");
4904     // This case can be done with TBL1.
4905     MachineInstr *Concat = emitVectorConcat(None, Src1Reg, Src2Reg, MIB);
4906     if (!Concat) {
4907       LLVM_DEBUG(dbgs() << "Could not do vector concat for tbl1");
4908       return false;
4909     }
4910 
4911     // The constant pool load will be 64 bits, so need to convert to FPR128 reg.
4912     IndexLoad = emitScalarToVector(64, &AArch64::FPR128RegClass,
4913                                    IndexLoad->getOperand(0).getReg(), MIB);
4914 
4915     auto TBL1 = MIB.buildInstr(
4916         AArch64::TBLv16i8One, {&AArch64::FPR128RegClass},
4917         {Concat->getOperand(0).getReg(), IndexLoad->getOperand(0).getReg()});
4918     constrainSelectedInstRegOperands(*TBL1, TII, TRI, RBI);
4919 
4920     auto Copy =
4921         MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {})
4922             .addReg(TBL1.getReg(0), 0, AArch64::dsub);
4923     RBI.constrainGenericRegister(Copy.getReg(0), AArch64::FPR64RegClass, MRI);
4924     I.eraseFromParent();
4925     return true;
4926   }
4927 
4928   // For TBL2 we need to emit a REG_SEQUENCE to tie together two consecutive
4929   // Q registers for regalloc.
4930   SmallVector<Register, 2> Regs = {Src1Reg, Src2Reg};
4931   auto RegSeq = createQTuple(Regs, MIB);
4932   auto TBL2 = MIB.buildInstr(AArch64::TBLv16i8Two, {I.getOperand(0)},
4933                              {RegSeq, IndexLoad->getOperand(0)});
4934   constrainSelectedInstRegOperands(*TBL2, TII, TRI, RBI);
4935   I.eraseFromParent();
4936   return true;
4937 }
4938 
4939 MachineInstr *AArch64InstructionSelector::emitLaneInsert(
4940     Optional<Register> DstReg, Register SrcReg, Register EltReg,
4941     unsigned LaneIdx, const RegisterBank &RB,
4942     MachineIRBuilder &MIRBuilder) const {
4943   MachineInstr *InsElt = nullptr;
4944   const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
4945   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4946 
4947   // Create a register to define with the insert if one wasn't passed in.
4948   if (!DstReg)
4949     DstReg = MRI.createVirtualRegister(DstRC);
4950 
4951   unsigned EltSize = MRI.getType(EltReg).getSizeInBits();
4952   unsigned Opc = getInsertVecEltOpInfo(RB, EltSize).first;
4953 
4954   if (RB.getID() == AArch64::FPRRegBankID) {
4955     auto InsSub = emitScalarToVector(EltSize, DstRC, EltReg, MIRBuilder);
4956     InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg})
4957                  .addImm(LaneIdx)
4958                  .addUse(InsSub->getOperand(0).getReg())
4959                  .addImm(0);
4960   } else {
4961     InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg})
4962                  .addImm(LaneIdx)
4963                  .addUse(EltReg);
4964   }
4965 
4966   constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI);
4967   return InsElt;
4968 }
4969 
4970 bool AArch64InstructionSelector::selectUSMovFromExtend(
4971     MachineInstr &MI, MachineRegisterInfo &MRI) {
4972   if (MI.getOpcode() != TargetOpcode::G_SEXT &&
4973       MI.getOpcode() != TargetOpcode::G_ZEXT &&
4974       MI.getOpcode() != TargetOpcode::G_ANYEXT)
4975     return false;
4976   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SEXT;
4977   const Register DefReg = MI.getOperand(0).getReg();
4978   const LLT DstTy = MRI.getType(DefReg);
4979   unsigned DstSize = DstTy.getSizeInBits();
4980 
4981   if (DstSize != 32 && DstSize != 64)
4982     return false;
4983 
4984   MachineInstr *Extract = getOpcodeDef(TargetOpcode::G_EXTRACT_VECTOR_ELT,
4985                                        MI.getOperand(1).getReg(), MRI);
4986   int64_t Lane;
4987   if (!Extract || !mi_match(Extract->getOperand(2).getReg(), MRI, m_ICst(Lane)))
4988     return false;
4989   Register Src0 = Extract->getOperand(1).getReg();
4990 
4991   const LLT &VecTy = MRI.getType(Src0);
4992 
4993   if (VecTy.getSizeInBits() != 128) {
4994     const MachineInstr *ScalarToVector = emitScalarToVector(
4995         VecTy.getSizeInBits(), &AArch64::FPR128RegClass, Src0, MIB);
4996     assert(ScalarToVector && "Didn't expect emitScalarToVector to fail!");
4997     Src0 = ScalarToVector->getOperand(0).getReg();
4998   }
4999 
5000   unsigned Opcode;
5001   if (DstSize == 64 && VecTy.getScalarSizeInBits() == 32)
5002     Opcode = IsSigned ? AArch64::SMOVvi32to64 : AArch64::UMOVvi32;
5003   else if (DstSize == 64 && VecTy.getScalarSizeInBits() == 16)
5004     Opcode = IsSigned ? AArch64::SMOVvi16to64 : AArch64::UMOVvi16;
5005   else if (DstSize == 64 && VecTy.getScalarSizeInBits() == 8)
5006     Opcode = IsSigned ? AArch64::SMOVvi8to64 : AArch64::UMOVvi8;
5007   else if (DstSize == 32 && VecTy.getScalarSizeInBits() == 16)
5008     Opcode = IsSigned ? AArch64::SMOVvi16to32 : AArch64::UMOVvi16;
5009   else if (DstSize == 32 && VecTy.getScalarSizeInBits() == 8)
5010     Opcode = IsSigned ? AArch64::SMOVvi8to32 : AArch64::UMOVvi8;
5011   else
5012     llvm_unreachable("Unexpected type combo for S/UMov!");
5013 
5014   // We may need to generate one of these, depending on the type and sign of the
5015   // input:
5016   //  DstReg = SMOV Src0, Lane;
5017   //  NewReg = UMOV Src0, Lane; DstReg = SUBREG_TO_REG NewReg, sub_32;
5018   MachineInstr *ExtI = nullptr;
5019   if (DstSize == 64 && !IsSigned) {
5020     Register NewReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
5021     MIB.buildInstr(Opcode, {NewReg}, {Src0}).addImm(Lane);
5022     ExtI = MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {})
5023                .addImm(0)
5024                .addUse(NewReg)
5025                .addImm(AArch64::sub_32);
5026     RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI);
5027   } else
5028     ExtI = MIB.buildInstr(Opcode, {DefReg}, {Src0}).addImm(Lane);
5029 
5030   constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
5031   MI.eraseFromParent();
5032   return true;
5033 }
5034 
5035 bool AArch64InstructionSelector::selectInsertElt(MachineInstr &I,
5036                                                  MachineRegisterInfo &MRI) {
5037   assert(I.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT);
5038 
5039   // Get information on the destination.
5040   Register DstReg = I.getOperand(0).getReg();
5041   const LLT DstTy = MRI.getType(DstReg);
5042   unsigned VecSize = DstTy.getSizeInBits();
5043 
5044   // Get information on the element we want to insert into the destination.
5045   Register EltReg = I.getOperand(2).getReg();
5046   const LLT EltTy = MRI.getType(EltReg);
5047   unsigned EltSize = EltTy.getSizeInBits();
5048   if (EltSize < 16 || EltSize > 64)
5049     return false; // Don't support all element types yet.
5050 
5051   // Find the definition of the index. Bail out if it's not defined by a
5052   // G_CONSTANT.
5053   Register IdxReg = I.getOperand(3).getReg();
5054   auto VRegAndVal = getIConstantVRegValWithLookThrough(IdxReg, MRI);
5055   if (!VRegAndVal)
5056     return false;
5057   unsigned LaneIdx = VRegAndVal->Value.getSExtValue();
5058 
5059   // Perform the lane insert.
5060   Register SrcReg = I.getOperand(1).getReg();
5061   const RegisterBank &EltRB = *RBI.getRegBank(EltReg, MRI, TRI);
5062 
5063   if (VecSize < 128) {
5064     // If the vector we're inserting into is smaller than 128 bits, widen it
5065     // to 128 to do the insert.
5066     MachineInstr *ScalarToVec =
5067         emitScalarToVector(VecSize, &AArch64::FPR128RegClass, SrcReg, MIB);
5068     if (!ScalarToVec)
5069       return false;
5070     SrcReg = ScalarToVec->getOperand(0).getReg();
5071   }
5072 
5073   // Create an insert into a new FPR128 register.
5074   // Note that if our vector is already 128 bits, we end up emitting an extra
5075   // register.
5076   MachineInstr *InsMI =
5077       emitLaneInsert(None, SrcReg, EltReg, LaneIdx, EltRB, MIB);
5078 
5079   if (VecSize < 128) {
5080     // If we had to widen to perform the insert, then we have to demote back to
5081     // the original size to get the result we want.
5082     Register DemoteVec = InsMI->getOperand(0).getReg();
5083     const TargetRegisterClass *RC =
5084         getMinClassForRegBank(*RBI.getRegBank(DemoteVec, MRI, TRI), VecSize);
5085     if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
5086       LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
5087       return false;
5088     }
5089     unsigned SubReg = 0;
5090     if (!getSubRegForClass(RC, TRI, SubReg))
5091       return false;
5092     if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
5093       LLVM_DEBUG(dbgs() << "Unsupported destination size! (" << VecSize
5094                         << "\n");
5095       return false;
5096     }
5097     MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
5098         .addReg(DemoteVec, 0, SubReg);
5099     RBI.constrainGenericRegister(DstReg, *RC, MRI);
5100   } else {
5101     // No widening needed.
5102     InsMI->getOperand(0).setReg(DstReg);
5103     constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI);
5104   }
5105 
5106   I.eraseFromParent();
5107   return true;
5108 }
5109 
5110 MachineInstr *
5111 AArch64InstructionSelector::emitConstantVector(Register Dst, Constant *CV,
5112                                                MachineIRBuilder &MIRBuilder,
5113                                                MachineRegisterInfo &MRI) {
5114   LLT DstTy = MRI.getType(Dst);
5115   unsigned DstSize = DstTy.getSizeInBits();
5116   if (CV->isNullValue()) {
5117     if (DstSize == 128) {
5118       auto Mov =
5119           MIRBuilder.buildInstr(AArch64::MOVIv2d_ns, {Dst}, {}).addImm(0);
5120       constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5121       return &*Mov;
5122     }
5123 
5124     if (DstSize == 64) {
5125       auto Mov =
5126           MIRBuilder
5127               .buildInstr(AArch64::MOVIv2d_ns, {&AArch64::FPR128RegClass}, {})
5128               .addImm(0);
5129       auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {Dst}, {})
5130                       .addReg(Mov.getReg(0), 0, AArch64::dsub);
5131       RBI.constrainGenericRegister(Dst, AArch64::FPR64RegClass, MRI);
5132       return &*Copy;
5133     }
5134   }
5135 
5136   auto *CPLoad = emitLoadFromConstantPool(CV, MIRBuilder);
5137   if (!CPLoad) {
5138     LLVM_DEBUG(dbgs() << "Could not generate cp load for constant vector!");
5139     return nullptr;
5140   }
5141 
5142   auto Copy = MIRBuilder.buildCopy(Dst, CPLoad->getOperand(0));
5143   RBI.constrainGenericRegister(
5144       Dst, *MRI.getRegClass(CPLoad->getOperand(0).getReg()), MRI);
5145   return &*Copy;
5146 }
5147 
5148 bool AArch64InstructionSelector::tryOptConstantBuildVec(
5149     MachineInstr &I, LLT DstTy, MachineRegisterInfo &MRI) {
5150   assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
5151   unsigned DstSize = DstTy.getSizeInBits();
5152   assert(DstSize <= 128 && "Unexpected build_vec type!");
5153   if (DstSize < 32)
5154     return false;
5155   // Check if we're building a constant vector, in which case we want to
5156   // generate a constant pool load instead of a vector insert sequence.
5157   SmallVector<Constant *, 16> Csts;
5158   for (unsigned Idx = 1; Idx < I.getNumOperands(); ++Idx) {
5159     // Try to find G_CONSTANT or G_FCONSTANT
5160     auto *OpMI =
5161         getOpcodeDef(TargetOpcode::G_CONSTANT, I.getOperand(Idx).getReg(), MRI);
5162     if (OpMI)
5163       Csts.emplace_back(
5164           const_cast<ConstantInt *>(OpMI->getOperand(1).getCImm()));
5165     else if ((OpMI = getOpcodeDef(TargetOpcode::G_FCONSTANT,
5166                                   I.getOperand(Idx).getReg(), MRI)))
5167       Csts.emplace_back(
5168           const_cast<ConstantFP *>(OpMI->getOperand(1).getFPImm()));
5169     else
5170       return false;
5171   }
5172   Constant *CV = ConstantVector::get(Csts);
5173   if (!emitConstantVector(I.getOperand(0).getReg(), CV, MIB, MRI))
5174     return false;
5175   I.eraseFromParent();
5176   return true;
5177 }
5178 
5179 bool AArch64InstructionSelector::tryOptBuildVecToSubregToReg(
5180     MachineInstr &I, MachineRegisterInfo &MRI) {
5181   // Given:
5182   //  %vec = G_BUILD_VECTOR %elt, %undef, %undef, ... %undef
5183   //
5184   // Select the G_BUILD_VECTOR as a SUBREG_TO_REG from %elt.
5185   Register Dst = I.getOperand(0).getReg();
5186   Register EltReg = I.getOperand(1).getReg();
5187   LLT EltTy = MRI.getType(EltReg);
5188   // If the index isn't on the same bank as its elements, then this can't be a
5189   // SUBREG_TO_REG.
5190   const RegisterBank &EltRB = *RBI.getRegBank(EltReg, MRI, TRI);
5191   const RegisterBank &DstRB = *RBI.getRegBank(Dst, MRI, TRI);
5192   if (EltRB != DstRB)
5193     return false;
5194   if (any_of(make_range(I.operands_begin() + 2, I.operands_end()),
5195              [&MRI](const MachineOperand &Op) {
5196                return !getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, Op.getReg(),
5197                                     MRI);
5198              }))
5199     return false;
5200   unsigned SubReg;
5201   const TargetRegisterClass *EltRC =
5202       getMinClassForRegBank(EltRB, EltTy.getSizeInBits());
5203   if (!EltRC)
5204     return false;
5205   const TargetRegisterClass *DstRC =
5206       getMinClassForRegBank(DstRB, MRI.getType(Dst).getSizeInBits());
5207   if (!DstRC)
5208     return false;
5209   if (!getSubRegForClass(EltRC, TRI, SubReg))
5210     return false;
5211   auto SubregToReg = MIB.buildInstr(AArch64::SUBREG_TO_REG, {Dst}, {})
5212                          .addImm(0)
5213                          .addUse(EltReg)
5214                          .addImm(SubReg);
5215   I.eraseFromParent();
5216   constrainSelectedInstRegOperands(*SubregToReg, TII, TRI, RBI);
5217   return RBI.constrainGenericRegister(Dst, *DstRC, MRI);
5218 }
5219 
5220 bool AArch64InstructionSelector::selectBuildVector(MachineInstr &I,
5221                                                    MachineRegisterInfo &MRI) {
5222   assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
5223   // Until we port more of the optimized selections, for now just use a vector
5224   // insert sequence.
5225   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
5226   const LLT EltTy = MRI.getType(I.getOperand(1).getReg());
5227   unsigned EltSize = EltTy.getSizeInBits();
5228 
5229   if (tryOptConstantBuildVec(I, DstTy, MRI))
5230     return true;
5231   if (tryOptBuildVecToSubregToReg(I, MRI))
5232     return true;
5233 
5234   if (EltSize < 16 || EltSize > 64)
5235     return false; // Don't support all element types yet.
5236   const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
5237 
5238   const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
5239   MachineInstr *ScalarToVec =
5240       emitScalarToVector(DstTy.getElementType().getSizeInBits(), DstRC,
5241                          I.getOperand(1).getReg(), MIB);
5242   if (!ScalarToVec)
5243     return false;
5244 
5245   Register DstVec = ScalarToVec->getOperand(0).getReg();
5246   unsigned DstSize = DstTy.getSizeInBits();
5247 
5248   // Keep track of the last MI we inserted. Later on, we might be able to save
5249   // a copy using it.
5250   MachineInstr *PrevMI = nullptr;
5251   for (unsigned i = 2, e = DstSize / EltSize + 1; i < e; ++i) {
5252     // Note that if we don't do a subregister copy, we can end up making an
5253     // extra register.
5254     PrevMI = &*emitLaneInsert(None, DstVec, I.getOperand(i).getReg(), i - 1, RB,
5255                               MIB);
5256     DstVec = PrevMI->getOperand(0).getReg();
5257   }
5258 
5259   // If DstTy's size in bits is less than 128, then emit a subregister copy
5260   // from DstVec to the last register we've defined.
5261   if (DstSize < 128) {
5262     // Force this to be FPR using the destination vector.
5263     const TargetRegisterClass *RC =
5264         getMinClassForRegBank(*RBI.getRegBank(DstVec, MRI, TRI), DstSize);
5265     if (!RC)
5266       return false;
5267     if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
5268       LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
5269       return false;
5270     }
5271 
5272     unsigned SubReg = 0;
5273     if (!getSubRegForClass(RC, TRI, SubReg))
5274       return false;
5275     if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
5276       LLVM_DEBUG(dbgs() << "Unsupported destination size! (" << DstSize
5277                         << "\n");
5278       return false;
5279     }
5280 
5281     Register Reg = MRI.createVirtualRegister(RC);
5282     Register DstReg = I.getOperand(0).getReg();
5283 
5284     MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {}).addReg(DstVec, 0, SubReg);
5285     MachineOperand &RegOp = I.getOperand(1);
5286     RegOp.setReg(Reg);
5287     RBI.constrainGenericRegister(DstReg, *RC, MRI);
5288   } else {
5289     // We don't need a subregister copy. Save a copy by re-using the
5290     // destination register on the final insert.
5291     assert(PrevMI && "PrevMI was null?");
5292     PrevMI->getOperand(0).setReg(I.getOperand(0).getReg());
5293     constrainSelectedInstRegOperands(*PrevMI, TII, TRI, RBI);
5294   }
5295 
5296   I.eraseFromParent();
5297   return true;
5298 }
5299 
5300 bool AArch64InstructionSelector::selectVectorLoadIntrinsic(unsigned Opc,
5301                                                            unsigned NumVecs,
5302                                                            MachineInstr &I) {
5303   assert(I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
5304   assert(Opc && "Expected an opcode?");
5305   assert(NumVecs > 1 && NumVecs < 5 && "Only support 2, 3, or 4 vectors");
5306   auto &MRI = *MIB.getMRI();
5307   LLT Ty = MRI.getType(I.getOperand(0).getReg());
5308   unsigned Size = Ty.getSizeInBits();
5309   assert((Size == 64 || Size == 128) &&
5310          "Destination must be 64 bits or 128 bits?");
5311   unsigned SubReg = Size == 64 ? AArch64::dsub0 : AArch64::qsub0;
5312   auto Ptr = I.getOperand(I.getNumOperands() - 1).getReg();
5313   assert(MRI.getType(Ptr).isPointer() && "Expected a pointer type?");
5314   auto Load = MIB.buildInstr(Opc, {Ty}, {Ptr});
5315   Load.cloneMemRefs(I);
5316   constrainSelectedInstRegOperands(*Load, TII, TRI, RBI);
5317   Register SelectedLoadDst = Load->getOperand(0).getReg();
5318   for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
5319     auto Vec = MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(Idx)}, {})
5320                    .addReg(SelectedLoadDst, 0, SubReg + Idx);
5321     // Emit the subreg copies and immediately select them.
5322     // FIXME: We should refactor our copy code into an emitCopy helper and
5323     // clean up uses of this pattern elsewhere in the selector.
5324     selectCopy(*Vec, TII, MRI, TRI, RBI);
5325   }
5326   return true;
5327 }
5328 
5329 bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
5330     MachineInstr &I, MachineRegisterInfo &MRI) {
5331   // Find the intrinsic ID.
5332   unsigned IntrinID = I.getIntrinsicID();
5333 
5334   const LLT S8 = LLT::scalar(8);
5335   const LLT S16 = LLT::scalar(16);
5336   const LLT S32 = LLT::scalar(32);
5337   const LLT S64 = LLT::scalar(64);
5338   const LLT P0 = LLT::pointer(0, 64);
5339   // Select the instruction.
5340   switch (IntrinID) {
5341   default:
5342     return false;
5343   case Intrinsic::aarch64_ldxp:
5344   case Intrinsic::aarch64_ldaxp: {
5345     auto NewI = MIB.buildInstr(
5346         IntrinID == Intrinsic::aarch64_ldxp ? AArch64::LDXPX : AArch64::LDAXPX,
5347         {I.getOperand(0).getReg(), I.getOperand(1).getReg()},
5348         {I.getOperand(3)});
5349     NewI.cloneMemRefs(I);
5350     constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI);
5351     break;
5352   }
5353   case Intrinsic::trap:
5354     MIB.buildInstr(AArch64::BRK, {}, {}).addImm(1);
5355     break;
5356   case Intrinsic::debugtrap:
5357     MIB.buildInstr(AArch64::BRK, {}, {}).addImm(0xF000);
5358     break;
5359   case Intrinsic::ubsantrap:
5360     MIB.buildInstr(AArch64::BRK, {}, {})
5361         .addImm(I.getOperand(1).getImm() | ('U' << 8));
5362     break;
5363   case Intrinsic::aarch64_neon_ld2: {
5364     LLT Ty = MRI.getType(I.getOperand(0).getReg());
5365     unsigned Opc = 0;
5366     if (Ty == LLT::fixed_vector(8, S8))
5367       Opc = AArch64::LD2Twov8b;
5368     else if (Ty == LLT::fixed_vector(16, S8))
5369       Opc = AArch64::LD2Twov16b;
5370     else if (Ty == LLT::fixed_vector(4, S16))
5371       Opc = AArch64::LD2Twov4h;
5372     else if (Ty == LLT::fixed_vector(8, S16))
5373       Opc = AArch64::LD2Twov8h;
5374     else if (Ty == LLT::fixed_vector(2, S32))
5375       Opc = AArch64::LD2Twov2s;
5376     else if (Ty == LLT::fixed_vector(4, S32))
5377       Opc = AArch64::LD2Twov4s;
5378     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
5379       Opc = AArch64::LD2Twov2d;
5380     else if (Ty == S64 || Ty == P0)
5381       Opc = AArch64::LD1Twov1d;
5382     else
5383       llvm_unreachable("Unexpected type for ld2!");
5384     selectVectorLoadIntrinsic(Opc, 2, I);
5385     break;
5386   }
5387   case Intrinsic::aarch64_neon_ld4: {
5388     LLT Ty = MRI.getType(I.getOperand(0).getReg());
5389     unsigned Opc = 0;
5390     if (Ty == LLT::fixed_vector(8, S8))
5391       Opc = AArch64::LD4Fourv8b;
5392     else if (Ty == LLT::fixed_vector(16, S8))
5393       Opc = AArch64::LD4Fourv16b;
5394     else if (Ty == LLT::fixed_vector(4, S16))
5395       Opc = AArch64::LD4Fourv4h;
5396     else if (Ty == LLT::fixed_vector(8, S16))
5397       Opc = AArch64::LD4Fourv8h;
5398     else if (Ty == LLT::fixed_vector(2, S32))
5399       Opc = AArch64::LD4Fourv2s;
5400     else if (Ty == LLT::fixed_vector(4, S32))
5401       Opc = AArch64::LD4Fourv4s;
5402     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
5403       Opc = AArch64::LD4Fourv2d;
5404     else if (Ty == S64 || Ty == P0)
5405       Opc = AArch64::LD1Fourv1d;
5406     else
5407       llvm_unreachable("Unexpected type for ld4!");
5408     selectVectorLoadIntrinsic(Opc, 4, I);
5409     break;
5410   }
5411   case Intrinsic::aarch64_neon_st2: {
5412     Register Src1 = I.getOperand(1).getReg();
5413     Register Src2 = I.getOperand(2).getReg();
5414     Register Ptr = I.getOperand(3).getReg();
5415     LLT Ty = MRI.getType(Src1);
5416     unsigned Opc;
5417     if (Ty == LLT::fixed_vector(8, S8))
5418       Opc = AArch64::ST2Twov8b;
5419     else if (Ty == LLT::fixed_vector(16, S8))
5420       Opc = AArch64::ST2Twov16b;
5421     else if (Ty == LLT::fixed_vector(4, S16))
5422       Opc = AArch64::ST2Twov4h;
5423     else if (Ty == LLT::fixed_vector(8, S16))
5424       Opc = AArch64::ST2Twov8h;
5425     else if (Ty == LLT::fixed_vector(2, S32))
5426       Opc = AArch64::ST2Twov2s;
5427     else if (Ty == LLT::fixed_vector(4, S32))
5428       Opc = AArch64::ST2Twov4s;
5429     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
5430       Opc = AArch64::ST2Twov2d;
5431     else if (Ty == S64 || Ty == P0)
5432       Opc = AArch64::ST1Twov1d;
5433     else
5434       llvm_unreachable("Unexpected type for st2!");
5435     SmallVector<Register, 2> Regs = {Src1, Src2};
5436     Register Tuple = Ty.getSizeInBits() == 128 ? createQTuple(Regs, MIB)
5437                                                : createDTuple(Regs, MIB);
5438     auto Store = MIB.buildInstr(Opc, {}, {Tuple, Ptr});
5439     Store.cloneMemRefs(I);
5440     constrainSelectedInstRegOperands(*Store, TII, TRI, RBI);
5441     break;
5442   }
5443   case Intrinsic::aarch64_mops_memset_tag: {
5444     // Transform
5445     //    %dst:gpr(p0) = \
5446     //      G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aarch64.mops.memset.tag),
5447     //      \ %dst:gpr(p0), %val:gpr(s64), %n:gpr(s64)
5448     // where %dst is updated, into
5449     //    %Rd:GPR64common, %Rn:GPR64) = \
5450     //      MOPSMemorySetTaggingPseudo \
5451     //      %Rd:GPR64common, %Rn:GPR64, %Rm:GPR64
5452     // where Rd and Rn are tied.
5453     // It is expected that %val has been extended to s64 in legalization.
5454     // Note that the order of the size/value operands are swapped.
5455 
5456     Register DstDef = I.getOperand(0).getReg();
5457     // I.getOperand(1) is the intrinsic function
5458     Register DstUse = I.getOperand(2).getReg();
5459     Register ValUse = I.getOperand(3).getReg();
5460     Register SizeUse = I.getOperand(4).getReg();
5461 
5462     // MOPSMemorySetTaggingPseudo has two defs; the intrinsic call has only one.
5463     // Therefore an additional virtual register is requried for the updated size
5464     // operand. This value is not accessible via the semantics of the intrinsic.
5465     Register SizeDef = MRI.createGenericVirtualRegister(LLT::scalar(64));
5466 
5467     auto Memset = MIB.buildInstr(AArch64::MOPSMemorySetTaggingPseudo,
5468                                  {DstDef, SizeDef}, {DstUse, SizeUse, ValUse});
5469     Memset.cloneMemRefs(I);
5470     constrainSelectedInstRegOperands(*Memset, TII, TRI, RBI);
5471     break;
5472   }
5473   }
5474 
5475   I.eraseFromParent();
5476   return true;
5477 }
5478 
5479 bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I,
5480                                                  MachineRegisterInfo &MRI) {
5481   unsigned IntrinID = I.getIntrinsicID();
5482 
5483   switch (IntrinID) {
5484   default:
5485     break;
5486   case Intrinsic::aarch64_crypto_sha1h: {
5487     Register DstReg = I.getOperand(0).getReg();
5488     Register SrcReg = I.getOperand(2).getReg();
5489 
5490     // FIXME: Should this be an assert?
5491     if (MRI.getType(DstReg).getSizeInBits() != 32 ||
5492         MRI.getType(SrcReg).getSizeInBits() != 32)
5493       return false;
5494 
5495     // The operation has to happen on FPRs. Set up some new FPR registers for
5496     // the source and destination if they are on GPRs.
5497     if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) {
5498       SrcReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
5499       MIB.buildCopy({SrcReg}, {I.getOperand(2)});
5500 
5501       // Make sure the copy ends up getting constrained properly.
5502       RBI.constrainGenericRegister(I.getOperand(2).getReg(),
5503                                    AArch64::GPR32RegClass, MRI);
5504     }
5505 
5506     if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID)
5507       DstReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
5508 
5509     // Actually insert the instruction.
5510     auto SHA1Inst = MIB.buildInstr(AArch64::SHA1Hrr, {DstReg}, {SrcReg});
5511     constrainSelectedInstRegOperands(*SHA1Inst, TII, TRI, RBI);
5512 
5513     // Did we create a new register for the destination?
5514     if (DstReg != I.getOperand(0).getReg()) {
5515       // Yep. Copy the result of the instruction back into the original
5516       // destination.
5517       MIB.buildCopy({I.getOperand(0)}, {DstReg});
5518       RBI.constrainGenericRegister(I.getOperand(0).getReg(),
5519                                    AArch64::GPR32RegClass, MRI);
5520     }
5521 
5522     I.eraseFromParent();
5523     return true;
5524   }
5525   case Intrinsic::ptrauth_sign: {
5526     Register DstReg = I.getOperand(0).getReg();
5527     Register ValReg = I.getOperand(2).getReg();
5528     uint64_t Key = I.getOperand(3).getImm();
5529     Register DiscReg = I.getOperand(4).getReg();
5530     auto DiscVal = getIConstantVRegVal(DiscReg, MRI);
5531     bool IsDiscZero = DiscVal.hasValue() && DiscVal->isNullValue();
5532 
5533     if (Key > 3)
5534       return false;
5535 
5536     unsigned Opcodes[][4] = {
5537         {AArch64::PACIA, AArch64::PACIB, AArch64::PACDA, AArch64::PACDB},
5538         {AArch64::PACIZA, AArch64::PACIZB, AArch64::PACDZA, AArch64::PACDZB}};
5539     unsigned Opcode = Opcodes[IsDiscZero][Key];
5540 
5541     auto PAC = MIB.buildInstr(Opcode, {DstReg}, {ValReg});
5542 
5543     if (!IsDiscZero) {
5544       PAC.addUse(DiscReg);
5545       RBI.constrainGenericRegister(DiscReg, AArch64::GPR64spRegClass, MRI);
5546     }
5547 
5548     RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI);
5549     I.eraseFromParent();
5550     return true;
5551   }
5552   case Intrinsic::frameaddress:
5553   case Intrinsic::returnaddress: {
5554     MachineFunction &MF = *I.getParent()->getParent();
5555     MachineFrameInfo &MFI = MF.getFrameInfo();
5556 
5557     unsigned Depth = I.getOperand(2).getImm();
5558     Register DstReg = I.getOperand(0).getReg();
5559     RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI);
5560 
5561     if (Depth == 0 && IntrinID == Intrinsic::returnaddress) {
5562       if (!MFReturnAddr) {
5563         // Insert the copy from LR/X30 into the entry block, before it can be
5564         // clobbered by anything.
5565         MFI.setReturnAddressIsTaken(true);
5566         MFReturnAddr = getFunctionLiveInPhysReg(
5567             MF, TII, AArch64::LR, AArch64::GPR64RegClass, I.getDebugLoc());
5568       }
5569 
5570       if (STI.hasPAuth()) {
5571         MIB.buildInstr(AArch64::XPACI, {DstReg}, {MFReturnAddr});
5572       } else {
5573         MIB.buildCopy({Register(AArch64::LR)}, {MFReturnAddr});
5574         MIB.buildInstr(AArch64::XPACLRI);
5575         MIB.buildCopy({DstReg}, {Register(AArch64::LR)});
5576       }
5577 
5578       I.eraseFromParent();
5579       return true;
5580     }
5581 
5582     MFI.setFrameAddressIsTaken(true);
5583     Register FrameAddr(AArch64::FP);
5584     while (Depth--) {
5585       Register NextFrame = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
5586       auto Ldr =
5587           MIB.buildInstr(AArch64::LDRXui, {NextFrame}, {FrameAddr}).addImm(0);
5588       constrainSelectedInstRegOperands(*Ldr, TII, TRI, RBI);
5589       FrameAddr = NextFrame;
5590     }
5591 
5592     if (IntrinID == Intrinsic::frameaddress)
5593       MIB.buildCopy({DstReg}, {FrameAddr});
5594     else {
5595       MFI.setReturnAddressIsTaken(true);
5596 
5597       if (STI.hasPAuth()) {
5598         Register TmpReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
5599         MIB.buildInstr(AArch64::LDRXui, {TmpReg}, {FrameAddr}).addImm(1);
5600         MIB.buildInstr(AArch64::XPACI, {DstReg}, {TmpReg});
5601       } else {
5602         MIB.buildInstr(AArch64::LDRXui, {Register(AArch64::LR)}, {FrameAddr})
5603             .addImm(1);
5604         MIB.buildInstr(AArch64::XPACLRI);
5605         MIB.buildCopy({DstReg}, {Register(AArch64::LR)});
5606       }
5607     }
5608 
5609     I.eraseFromParent();
5610     return true;
5611   }
5612   case Intrinsic::swift_async_context_addr:
5613     auto Sub = MIB.buildInstr(AArch64::SUBXri, {I.getOperand(0).getReg()},
5614                               {Register(AArch64::FP)})
5615                    .addImm(8)
5616                    .addImm(0);
5617     constrainSelectedInstRegOperands(*Sub, TII, TRI, RBI);
5618 
5619     MF->getFrameInfo().setFrameAddressIsTaken(true);
5620     MF->getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
5621     I.eraseFromParent();
5622     return true;
5623   }
5624   return false;
5625 }
5626 
5627 InstructionSelector::ComplexRendererFns
5628 AArch64InstructionSelector::selectShiftA_32(const MachineOperand &Root) const {
5629   auto MaybeImmed = getImmedFromMO(Root);
5630   if (MaybeImmed == None || *MaybeImmed > 31)
5631     return None;
5632   uint64_t Enc = (32 - *MaybeImmed) & 0x1f;
5633   return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
5634 }
5635 
5636 InstructionSelector::ComplexRendererFns
5637 AArch64InstructionSelector::selectShiftB_32(const MachineOperand &Root) const {
5638   auto MaybeImmed = getImmedFromMO(Root);
5639   if (MaybeImmed == None || *MaybeImmed > 31)
5640     return None;
5641   uint64_t Enc = 31 - *MaybeImmed;
5642   return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
5643 }
5644 
5645 InstructionSelector::ComplexRendererFns
5646 AArch64InstructionSelector::selectShiftA_64(const MachineOperand &Root) const {
5647   auto MaybeImmed = getImmedFromMO(Root);
5648   if (MaybeImmed == None || *MaybeImmed > 63)
5649     return None;
5650   uint64_t Enc = (64 - *MaybeImmed) & 0x3f;
5651   return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
5652 }
5653 
5654 InstructionSelector::ComplexRendererFns
5655 AArch64InstructionSelector::selectShiftB_64(const MachineOperand &Root) const {
5656   auto MaybeImmed = getImmedFromMO(Root);
5657   if (MaybeImmed == None || *MaybeImmed > 63)
5658     return None;
5659   uint64_t Enc = 63 - *MaybeImmed;
5660   return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
5661 }
5662 
5663 /// Helper to select an immediate value that can be represented as a 12-bit
5664 /// value shifted left by either 0 or 12. If it is possible to do so, return
5665 /// the immediate and shift value. If not, return None.
5666 ///
5667 /// Used by selectArithImmed and selectNegArithImmed.
5668 InstructionSelector::ComplexRendererFns
5669 AArch64InstructionSelector::select12BitValueWithLeftShift(
5670     uint64_t Immed) const {
5671   unsigned ShiftAmt;
5672   if (Immed >> 12 == 0) {
5673     ShiftAmt = 0;
5674   } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) {
5675     ShiftAmt = 12;
5676     Immed = Immed >> 12;
5677   } else
5678     return None;
5679 
5680   unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt);
5681   return {{
5682       [=](MachineInstrBuilder &MIB) { MIB.addImm(Immed); },
5683       [=](MachineInstrBuilder &MIB) { MIB.addImm(ShVal); },
5684   }};
5685 }
5686 
5687 /// SelectArithImmed - Select an immediate value that can be represented as
5688 /// a 12-bit value shifted left by either 0 or 12.  If so, return true with
5689 /// Val set to the 12-bit value and Shift set to the shifter operand.
5690 InstructionSelector::ComplexRendererFns
5691 AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const {
5692   // This function is called from the addsub_shifted_imm ComplexPattern,
5693   // which lists [imm] as the list of opcode it's interested in, however
5694   // we still need to check whether the operand is actually an immediate
5695   // here because the ComplexPattern opcode list is only used in
5696   // root-level opcode matching.
5697   auto MaybeImmed = getImmedFromMO(Root);
5698   if (MaybeImmed == None)
5699     return None;
5700   return select12BitValueWithLeftShift(*MaybeImmed);
5701 }
5702 
5703 /// SelectNegArithImmed - As above, but negates the value before trying to
5704 /// select it.
5705 InstructionSelector::ComplexRendererFns
5706 AArch64InstructionSelector::selectNegArithImmed(MachineOperand &Root) const {
5707   // We need a register here, because we need to know if we have a 64 or 32
5708   // bit immediate.
5709   if (!Root.isReg())
5710     return None;
5711   auto MaybeImmed = getImmedFromMO(Root);
5712   if (MaybeImmed == None)
5713     return None;
5714   uint64_t Immed = *MaybeImmed;
5715 
5716   // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0"
5717   // have the opposite effect on the C flag, so this pattern mustn't match under
5718   // those circumstances.
5719   if (Immed == 0)
5720     return None;
5721 
5722   // Check if we're dealing with a 32-bit type on the root or a 64-bit type on
5723   // the root.
5724   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5725   if (MRI.getType(Root.getReg()).getSizeInBits() == 32)
5726     Immed = ~((uint32_t)Immed) + 1;
5727   else
5728     Immed = ~Immed + 1ULL;
5729 
5730   if (Immed & 0xFFFFFFFFFF000000ULL)
5731     return None;
5732 
5733   Immed &= 0xFFFFFFULL;
5734   return select12BitValueWithLeftShift(Immed);
5735 }
5736 
5737 /// Return true if it is worth folding MI into an extended register. That is,
5738 /// if it's safe to pull it into the addressing mode of a load or store as a
5739 /// shift.
5740 bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg(
5741     MachineInstr &MI, const MachineRegisterInfo &MRI) const {
5742   // Always fold if there is one use, or if we're optimizing for size.
5743   Register DefReg = MI.getOperand(0).getReg();
5744   if (MRI.hasOneNonDBGUse(DefReg) ||
5745       MI.getParent()->getParent()->getFunction().hasOptSize())
5746     return true;
5747 
5748   // It's better to avoid folding and recomputing shifts when we don't have a
5749   // fastpath.
5750   if (!STI.hasLSLFast())
5751     return false;
5752 
5753   // We have a fastpath, so folding a shift in and potentially computing it
5754   // many times may be beneficial. Check if this is only used in memory ops.
5755   // If it is, then we should fold.
5756   return all_of(MRI.use_nodbg_instructions(DefReg),
5757                 [](MachineInstr &Use) { return Use.mayLoadOrStore(); });
5758 }
5759 
5760 static bool isSignExtendShiftType(AArch64_AM::ShiftExtendType Type) {
5761   switch (Type) {
5762   case AArch64_AM::SXTB:
5763   case AArch64_AM::SXTH:
5764   case AArch64_AM::SXTW:
5765     return true;
5766   default:
5767     return false;
5768   }
5769 }
5770 
5771 InstructionSelector::ComplexRendererFns
5772 AArch64InstructionSelector::selectExtendedSHL(
5773     MachineOperand &Root, MachineOperand &Base, MachineOperand &Offset,
5774     unsigned SizeInBytes, bool WantsExt) const {
5775   assert(Base.isReg() && "Expected base to be a register operand");
5776   assert(Offset.isReg() && "Expected offset to be a register operand");
5777 
5778   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5779   MachineInstr *OffsetInst = MRI.getVRegDef(Offset.getReg());
5780   if (!OffsetInst)
5781     return None;
5782 
5783   unsigned OffsetOpc = OffsetInst->getOpcode();
5784   bool LookedThroughZExt = false;
5785   if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL) {
5786     // Try to look through a ZEXT.
5787     if (OffsetOpc != TargetOpcode::G_ZEXT || !WantsExt)
5788       return None;
5789 
5790     OffsetInst = MRI.getVRegDef(OffsetInst->getOperand(1).getReg());
5791     OffsetOpc = OffsetInst->getOpcode();
5792     LookedThroughZExt = true;
5793 
5794     if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL)
5795       return None;
5796   }
5797   // Make sure that the memory op is a valid size.
5798   int64_t LegalShiftVal = Log2_32(SizeInBytes);
5799   if (LegalShiftVal == 0)
5800     return None;
5801   if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI))
5802     return None;
5803 
5804   // Now, try to find the specific G_CONSTANT. Start by assuming that the
5805   // register we will offset is the LHS, and the register containing the
5806   // constant is the RHS.
5807   Register OffsetReg = OffsetInst->getOperand(1).getReg();
5808   Register ConstantReg = OffsetInst->getOperand(2).getReg();
5809   auto ValAndVReg = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
5810   if (!ValAndVReg) {
5811     // We didn't get a constant on the RHS. If the opcode is a shift, then
5812     // we're done.
5813     if (OffsetOpc == TargetOpcode::G_SHL)
5814       return None;
5815 
5816     // If we have a G_MUL, we can use either register. Try looking at the RHS.
5817     std::swap(OffsetReg, ConstantReg);
5818     ValAndVReg = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
5819     if (!ValAndVReg)
5820       return None;
5821   }
5822 
5823   // The value must fit into 3 bits, and must be positive. Make sure that is
5824   // true.
5825   int64_t ImmVal = ValAndVReg->Value.getSExtValue();
5826 
5827   // Since we're going to pull this into a shift, the constant value must be
5828   // a power of 2. If we got a multiply, then we need to check this.
5829   if (OffsetOpc == TargetOpcode::G_MUL) {
5830     if (!isPowerOf2_32(ImmVal))
5831       return None;
5832 
5833     // Got a power of 2. So, the amount we'll shift is the log base-2 of that.
5834     ImmVal = Log2_32(ImmVal);
5835   }
5836 
5837   if ((ImmVal & 0x7) != ImmVal)
5838     return None;
5839 
5840   // We are only allowed to shift by LegalShiftVal. This shift value is built
5841   // into the instruction, so we can't just use whatever we want.
5842   if (ImmVal != LegalShiftVal)
5843     return None;
5844 
5845   unsigned SignExtend = 0;
5846   if (WantsExt) {
5847     // Check if the offset is defined by an extend, unless we looked through a
5848     // G_ZEXT earlier.
5849     if (!LookedThroughZExt) {
5850       MachineInstr *ExtInst = getDefIgnoringCopies(OffsetReg, MRI);
5851       auto Ext = getExtendTypeForInst(*ExtInst, MRI, true);
5852       if (Ext == AArch64_AM::InvalidShiftExtend)
5853         return None;
5854 
5855       SignExtend = isSignExtendShiftType(Ext) ? 1 : 0;
5856       // We only support SXTW for signed extension here.
5857       if (SignExtend && Ext != AArch64_AM::SXTW)
5858         return None;
5859       OffsetReg = ExtInst->getOperand(1).getReg();
5860     }
5861 
5862     // Need a 32-bit wide register here.
5863     MachineIRBuilder MIB(*MRI.getVRegDef(Root.getReg()));
5864     OffsetReg = moveScalarRegClass(OffsetReg, AArch64::GPR32RegClass, MIB);
5865   }
5866 
5867   // We can use the LHS of the GEP as the base, and the LHS of the shift as an
5868   // offset. Signify that we are shifting by setting the shift flag to 1.
5869   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(Base.getReg()); },
5870            [=](MachineInstrBuilder &MIB) { MIB.addUse(OffsetReg); },
5871            [=](MachineInstrBuilder &MIB) {
5872              // Need to add both immediates here to make sure that they are both
5873              // added to the instruction.
5874              MIB.addImm(SignExtend);
5875              MIB.addImm(1);
5876            }}};
5877 }
5878 
5879 /// This is used for computing addresses like this:
5880 ///
5881 /// ldr x1, [x2, x3, lsl #3]
5882 ///
5883 /// Where x2 is the base register, and x3 is an offset register. The shift-left
5884 /// is a constant value specific to this load instruction. That is, we'll never
5885 /// see anything other than a 3 here (which corresponds to the size of the
5886 /// element being loaded.)
5887 InstructionSelector::ComplexRendererFns
5888 AArch64InstructionSelector::selectAddrModeShiftedExtendXReg(
5889     MachineOperand &Root, unsigned SizeInBytes) const {
5890   if (!Root.isReg())
5891     return None;
5892   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5893 
5894   // We want to find something like this:
5895   //
5896   // val = G_CONSTANT LegalShiftVal
5897   // shift = G_SHL off_reg val
5898   // ptr = G_PTR_ADD base_reg shift
5899   // x = G_LOAD ptr
5900   //
5901   // And fold it into this addressing mode:
5902   //
5903   // ldr x, [base_reg, off_reg, lsl #LegalShiftVal]
5904 
5905   // Check if we can find the G_PTR_ADD.
5906   MachineInstr *PtrAdd =
5907       getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
5908   if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI))
5909     return None;
5910 
5911   // Now, try to match an opcode which will match our specific offset.
5912   // We want a G_SHL or a G_MUL.
5913   MachineInstr *OffsetInst =
5914       getDefIgnoringCopies(PtrAdd->getOperand(2).getReg(), MRI);
5915   return selectExtendedSHL(Root, PtrAdd->getOperand(1),
5916                            OffsetInst->getOperand(0), SizeInBytes,
5917                            /*WantsExt=*/false);
5918 }
5919 
5920 /// This is used for computing addresses like this:
5921 ///
5922 /// ldr x1, [x2, x3]
5923 ///
5924 /// Where x2 is the base register, and x3 is an offset register.
5925 ///
5926 /// When possible (or profitable) to fold a G_PTR_ADD into the address calculation,
5927 /// this will do so. Otherwise, it will return None.
5928 InstructionSelector::ComplexRendererFns
5929 AArch64InstructionSelector::selectAddrModeRegisterOffset(
5930     MachineOperand &Root) const {
5931   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5932 
5933   // We need a GEP.
5934   MachineInstr *Gep = MRI.getVRegDef(Root.getReg());
5935   if (!Gep || Gep->getOpcode() != TargetOpcode::G_PTR_ADD)
5936     return None;
5937 
5938   // If this is used more than once, let's not bother folding.
5939   // TODO: Check if they are memory ops. If they are, then we can still fold
5940   // without having to recompute anything.
5941   if (!MRI.hasOneNonDBGUse(Gep->getOperand(0).getReg()))
5942     return None;
5943 
5944   // Base is the GEP's LHS, offset is its RHS.
5945   return {{[=](MachineInstrBuilder &MIB) {
5946              MIB.addUse(Gep->getOperand(1).getReg());
5947            },
5948            [=](MachineInstrBuilder &MIB) {
5949              MIB.addUse(Gep->getOperand(2).getReg());
5950            },
5951            [=](MachineInstrBuilder &MIB) {
5952              // Need to add both immediates here to make sure that they are both
5953              // added to the instruction.
5954              MIB.addImm(0);
5955              MIB.addImm(0);
5956            }}};
5957 }
5958 
5959 /// This is intended to be equivalent to selectAddrModeXRO in
5960 /// AArch64ISelDAGtoDAG. It's used for selecting X register offset loads.
5961 InstructionSelector::ComplexRendererFns
5962 AArch64InstructionSelector::selectAddrModeXRO(MachineOperand &Root,
5963                                               unsigned SizeInBytes) const {
5964   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5965   if (!Root.isReg())
5966     return None;
5967   MachineInstr *PtrAdd =
5968       getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
5969   if (!PtrAdd)
5970     return None;
5971 
5972   // Check for an immediates which cannot be encoded in the [base + imm]
5973   // addressing mode, and can't be encoded in an add/sub. If this happens, we'll
5974   // end up with code like:
5975   //
5976   // mov x0, wide
5977   // add x1 base, x0
5978   // ldr x2, [x1, x0]
5979   //
5980   // In this situation, we can use the [base, xreg] addressing mode to save an
5981   // add/sub:
5982   //
5983   // mov x0, wide
5984   // ldr x2, [base, x0]
5985   auto ValAndVReg =
5986       getIConstantVRegValWithLookThrough(PtrAdd->getOperand(2).getReg(), MRI);
5987   if (ValAndVReg) {
5988     unsigned Scale = Log2_32(SizeInBytes);
5989     int64_t ImmOff = ValAndVReg->Value.getSExtValue();
5990 
5991     // Skip immediates that can be selected in the load/store addresing
5992     // mode.
5993     if (ImmOff % SizeInBytes == 0 && ImmOff >= 0 &&
5994         ImmOff < (0x1000 << Scale))
5995       return None;
5996 
5997     // Helper lambda to decide whether or not it is preferable to emit an add.
5998     auto isPreferredADD = [](int64_t ImmOff) {
5999       // Constants in [0x0, 0xfff] can be encoded in an add.
6000       if ((ImmOff & 0xfffffffffffff000LL) == 0x0LL)
6001         return true;
6002 
6003       // Can it be encoded in an add lsl #12?
6004       if ((ImmOff & 0xffffffffff000fffLL) != 0x0LL)
6005         return false;
6006 
6007       // It can be encoded in an add lsl #12, but we may not want to. If it is
6008       // possible to select this as a single movz, then prefer that. A single
6009       // movz is faster than an add with a shift.
6010       return (ImmOff & 0xffffffffff00ffffLL) != 0x0LL &&
6011              (ImmOff & 0xffffffffffff0fffLL) != 0x0LL;
6012     };
6013 
6014     // If the immediate can be encoded in a single add/sub, then bail out.
6015     if (isPreferredADD(ImmOff) || isPreferredADD(-ImmOff))
6016       return None;
6017   }
6018 
6019   // Try to fold shifts into the addressing mode.
6020   auto AddrModeFns = selectAddrModeShiftedExtendXReg(Root, SizeInBytes);
6021   if (AddrModeFns)
6022     return AddrModeFns;
6023 
6024   // If that doesn't work, see if it's possible to fold in registers from
6025   // a GEP.
6026   return selectAddrModeRegisterOffset(Root);
6027 }
6028 
6029 /// This is used for computing addresses like this:
6030 ///
6031 /// ldr x0, [xBase, wOffset, sxtw #LegalShiftVal]
6032 ///
6033 /// Where we have a 64-bit base register, a 32-bit offset register, and an
6034 /// extend (which may or may not be signed).
6035 InstructionSelector::ComplexRendererFns
6036 AArch64InstructionSelector::selectAddrModeWRO(MachineOperand &Root,
6037                                               unsigned SizeInBytes) const {
6038   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
6039 
6040   MachineInstr *PtrAdd =
6041       getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
6042   if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI))
6043     return None;
6044 
6045   MachineOperand &LHS = PtrAdd->getOperand(1);
6046   MachineOperand &RHS = PtrAdd->getOperand(2);
6047   MachineInstr *OffsetInst = getDefIgnoringCopies(RHS.getReg(), MRI);
6048 
6049   // The first case is the same as selectAddrModeXRO, except we need an extend.
6050   // In this case, we try to find a shift and extend, and fold them into the
6051   // addressing mode.
6052   //
6053   // E.g.
6054   //
6055   // off_reg = G_Z/S/ANYEXT ext_reg
6056   // val = G_CONSTANT LegalShiftVal
6057   // shift = G_SHL off_reg val
6058   // ptr = G_PTR_ADD base_reg shift
6059   // x = G_LOAD ptr
6060   //
6061   // In this case we can get a load like this:
6062   //
6063   // ldr x0, [base_reg, ext_reg, sxtw #LegalShiftVal]
6064   auto ExtendedShl = selectExtendedSHL(Root, LHS, OffsetInst->getOperand(0),
6065                                        SizeInBytes, /*WantsExt=*/true);
6066   if (ExtendedShl)
6067     return ExtendedShl;
6068 
6069   // There was no shift. We can try and fold a G_Z/S/ANYEXT in alone though.
6070   //
6071   // e.g.
6072   // ldr something, [base_reg, ext_reg, sxtw]
6073   if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI))
6074     return None;
6075 
6076   // Check if this is an extend. We'll get an extend type if it is.
6077   AArch64_AM::ShiftExtendType Ext =
6078       getExtendTypeForInst(*OffsetInst, MRI, /*IsLoadStore=*/true);
6079   if (Ext == AArch64_AM::InvalidShiftExtend)
6080     return None;
6081 
6082   // Need a 32-bit wide register.
6083   MachineIRBuilder MIB(*PtrAdd);
6084   Register ExtReg = moveScalarRegClass(OffsetInst->getOperand(1).getReg(),
6085                                        AArch64::GPR32RegClass, MIB);
6086   unsigned SignExtend = Ext == AArch64_AM::SXTW;
6087 
6088   // Base is LHS, offset is ExtReg.
6089   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(LHS.getReg()); },
6090            [=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); },
6091            [=](MachineInstrBuilder &MIB) {
6092              MIB.addImm(SignExtend);
6093              MIB.addImm(0);
6094            }}};
6095 }
6096 
6097 /// Select a "register plus unscaled signed 9-bit immediate" address.  This
6098 /// should only match when there is an offset that is not valid for a scaled
6099 /// immediate addressing mode.  The "Size" argument is the size in bytes of the
6100 /// memory reference, which is needed here to know what is valid for a scaled
6101 /// immediate.
6102 InstructionSelector::ComplexRendererFns
6103 AArch64InstructionSelector::selectAddrModeUnscaled(MachineOperand &Root,
6104                                                    unsigned Size) const {
6105   MachineRegisterInfo &MRI =
6106       Root.getParent()->getParent()->getParent()->getRegInfo();
6107 
6108   if (!Root.isReg())
6109     return None;
6110 
6111   if (!isBaseWithConstantOffset(Root, MRI))
6112     return None;
6113 
6114   MachineInstr *RootDef = MRI.getVRegDef(Root.getReg());
6115   if (!RootDef)
6116     return None;
6117 
6118   MachineOperand &OffImm = RootDef->getOperand(2);
6119   if (!OffImm.isReg())
6120     return None;
6121   MachineInstr *RHS = MRI.getVRegDef(OffImm.getReg());
6122   if (!RHS || RHS->getOpcode() != TargetOpcode::G_CONSTANT)
6123     return None;
6124   int64_t RHSC;
6125   MachineOperand &RHSOp1 = RHS->getOperand(1);
6126   if (!RHSOp1.isCImm() || RHSOp1.getCImm()->getBitWidth() > 64)
6127     return None;
6128   RHSC = RHSOp1.getCImm()->getSExtValue();
6129 
6130   // If the offset is valid as a scaled immediate, don't match here.
6131   if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Log2_32(Size)))
6132     return None;
6133   if (RHSC >= -256 && RHSC < 256) {
6134     MachineOperand &Base = RootDef->getOperand(1);
6135     return {{
6136         [=](MachineInstrBuilder &MIB) { MIB.add(Base); },
6137         [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC); },
6138     }};
6139   }
6140   return None;
6141 }
6142 
6143 InstructionSelector::ComplexRendererFns
6144 AArch64InstructionSelector::tryFoldAddLowIntoImm(MachineInstr &RootDef,
6145                                                  unsigned Size,
6146                                                  MachineRegisterInfo &MRI) const {
6147   if (RootDef.getOpcode() != AArch64::G_ADD_LOW)
6148     return None;
6149   MachineInstr &Adrp = *MRI.getVRegDef(RootDef.getOperand(1).getReg());
6150   if (Adrp.getOpcode() != AArch64::ADRP)
6151     return None;
6152 
6153   // TODO: add heuristics like isWorthFoldingADDlow() from SelectionDAG.
6154   auto Offset = Adrp.getOperand(1).getOffset();
6155   if (Offset % Size != 0)
6156     return None;
6157 
6158   auto GV = Adrp.getOperand(1).getGlobal();
6159   if (GV->isThreadLocal())
6160     return None;
6161 
6162   auto &MF = *RootDef.getParent()->getParent();
6163   if (GV->getPointerAlignment(MF.getDataLayout()) < Size)
6164     return None;
6165 
6166   unsigned OpFlags = STI.ClassifyGlobalReference(GV, MF.getTarget());
6167   MachineIRBuilder MIRBuilder(RootDef);
6168   Register AdrpReg = Adrp.getOperand(0).getReg();
6169   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(AdrpReg); },
6170            [=](MachineInstrBuilder &MIB) {
6171              MIB.addGlobalAddress(GV, Offset,
6172                                   OpFlags | AArch64II::MO_PAGEOFF |
6173                                       AArch64II::MO_NC);
6174            }}};
6175 }
6176 
6177 /// Select a "register plus scaled unsigned 12-bit immediate" address.  The
6178 /// "Size" argument is the size in bytes of the memory reference, which
6179 /// determines the scale.
6180 InstructionSelector::ComplexRendererFns
6181 AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root,
6182                                                   unsigned Size) const {
6183   MachineFunction &MF = *Root.getParent()->getParent()->getParent();
6184   MachineRegisterInfo &MRI = MF.getRegInfo();
6185 
6186   if (!Root.isReg())
6187     return None;
6188 
6189   MachineInstr *RootDef = MRI.getVRegDef(Root.getReg());
6190   if (!RootDef)
6191     return None;
6192 
6193   if (RootDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) {
6194     return {{
6195         [=](MachineInstrBuilder &MIB) { MIB.add(RootDef->getOperand(1)); },
6196         [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
6197     }};
6198   }
6199 
6200   CodeModel::Model CM = MF.getTarget().getCodeModel();
6201   // Check if we can fold in the ADD of small code model ADRP + ADD address.
6202   if (CM == CodeModel::Small) {
6203     auto OpFns = tryFoldAddLowIntoImm(*RootDef, Size, MRI);
6204     if (OpFns)
6205       return OpFns;
6206   }
6207 
6208   if (isBaseWithConstantOffset(Root, MRI)) {
6209     MachineOperand &LHS = RootDef->getOperand(1);
6210     MachineOperand &RHS = RootDef->getOperand(2);
6211     MachineInstr *LHSDef = MRI.getVRegDef(LHS.getReg());
6212     MachineInstr *RHSDef = MRI.getVRegDef(RHS.getReg());
6213     if (LHSDef && RHSDef) {
6214       int64_t RHSC = (int64_t)RHSDef->getOperand(1).getCImm()->getZExtValue();
6215       unsigned Scale = Log2_32(Size);
6216       if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) {
6217         if (LHSDef->getOpcode() == TargetOpcode::G_FRAME_INDEX)
6218           return {{
6219               [=](MachineInstrBuilder &MIB) { MIB.add(LHSDef->getOperand(1)); },
6220               [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); },
6221           }};
6222 
6223         return {{
6224             [=](MachineInstrBuilder &MIB) { MIB.add(LHS); },
6225             [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); },
6226         }};
6227       }
6228     }
6229   }
6230 
6231   // Before falling back to our general case, check if the unscaled
6232   // instructions can handle this. If so, that's preferable.
6233   if (selectAddrModeUnscaled(Root, Size).hasValue())
6234     return None;
6235 
6236   return {{
6237       [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
6238       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
6239   }};
6240 }
6241 
6242 /// Given a shift instruction, return the correct shift type for that
6243 /// instruction.
6244 static AArch64_AM::ShiftExtendType getShiftTypeForInst(MachineInstr &MI) {
6245   switch (MI.getOpcode()) {
6246   default:
6247     return AArch64_AM::InvalidShiftExtend;
6248   case TargetOpcode::G_SHL:
6249     return AArch64_AM::LSL;
6250   case TargetOpcode::G_LSHR:
6251     return AArch64_AM::LSR;
6252   case TargetOpcode::G_ASHR:
6253     return AArch64_AM::ASR;
6254   case TargetOpcode::G_ROTR:
6255     return AArch64_AM::ROR;
6256   }
6257 }
6258 
6259 /// Select a "shifted register" operand. If the value is not shifted, set the
6260 /// shift operand to a default value of "lsl 0".
6261 InstructionSelector::ComplexRendererFns
6262 AArch64InstructionSelector::selectShiftedRegister(MachineOperand &Root,
6263                                                   bool AllowROR) const {
6264   if (!Root.isReg())
6265     return None;
6266   MachineRegisterInfo &MRI =
6267       Root.getParent()->getParent()->getParent()->getRegInfo();
6268 
6269   // Check if the operand is defined by an instruction which corresponds to
6270   // a ShiftExtendType. E.g. a G_SHL, G_LSHR, etc.
6271   MachineInstr *ShiftInst = MRI.getVRegDef(Root.getReg());
6272   if (!ShiftInst)
6273     return None;
6274   AArch64_AM::ShiftExtendType ShType = getShiftTypeForInst(*ShiftInst);
6275   if (ShType == AArch64_AM::InvalidShiftExtend)
6276     return None;
6277   if (ShType == AArch64_AM::ROR && !AllowROR)
6278     return None;
6279   if (!isWorthFoldingIntoExtendedReg(*ShiftInst, MRI))
6280     return None;
6281 
6282   // Need an immediate on the RHS.
6283   MachineOperand &ShiftRHS = ShiftInst->getOperand(2);
6284   auto Immed = getImmedFromMO(ShiftRHS);
6285   if (!Immed)
6286     return None;
6287 
6288   // We have something that we can fold. Fold in the shift's LHS and RHS into
6289   // the instruction.
6290   MachineOperand &ShiftLHS = ShiftInst->getOperand(1);
6291   Register ShiftReg = ShiftLHS.getReg();
6292 
6293   unsigned NumBits = MRI.getType(ShiftReg).getSizeInBits();
6294   unsigned Val = *Immed & (NumBits - 1);
6295   unsigned ShiftVal = AArch64_AM::getShifterImm(ShType, Val);
6296 
6297   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ShiftReg); },
6298            [=](MachineInstrBuilder &MIB) { MIB.addImm(ShiftVal); }}};
6299 }
6300 
6301 AArch64_AM::ShiftExtendType AArch64InstructionSelector::getExtendTypeForInst(
6302     MachineInstr &MI, MachineRegisterInfo &MRI, bool IsLoadStore) const {
6303   unsigned Opc = MI.getOpcode();
6304 
6305   // Handle explicit extend instructions first.
6306   if (Opc == TargetOpcode::G_SEXT || Opc == TargetOpcode::G_SEXT_INREG) {
6307     unsigned Size;
6308     if (Opc == TargetOpcode::G_SEXT)
6309       Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
6310     else
6311       Size = MI.getOperand(2).getImm();
6312     assert(Size != 64 && "Extend from 64 bits?");
6313     switch (Size) {
6314     case 8:
6315       return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTB;
6316     case 16:
6317       return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTH;
6318     case 32:
6319       return AArch64_AM::SXTW;
6320     default:
6321       return AArch64_AM::InvalidShiftExtend;
6322     }
6323   }
6324 
6325   if (Opc == TargetOpcode::G_ZEXT || Opc == TargetOpcode::G_ANYEXT) {
6326     unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
6327     assert(Size != 64 && "Extend from 64 bits?");
6328     switch (Size) {
6329     case 8:
6330       return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTB;
6331     case 16:
6332       return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTH;
6333     case 32:
6334       return AArch64_AM::UXTW;
6335     default:
6336       return AArch64_AM::InvalidShiftExtend;
6337     }
6338   }
6339 
6340   // Don't have an explicit extend. Try to handle a G_AND with a constant mask
6341   // on the RHS.
6342   if (Opc != TargetOpcode::G_AND)
6343     return AArch64_AM::InvalidShiftExtend;
6344 
6345   Optional<uint64_t> MaybeAndMask = getImmedFromMO(MI.getOperand(2));
6346   if (!MaybeAndMask)
6347     return AArch64_AM::InvalidShiftExtend;
6348   uint64_t AndMask = *MaybeAndMask;
6349   switch (AndMask) {
6350   default:
6351     return AArch64_AM::InvalidShiftExtend;
6352   case 0xFF:
6353     return !IsLoadStore ? AArch64_AM::UXTB : AArch64_AM::InvalidShiftExtend;
6354   case 0xFFFF:
6355     return !IsLoadStore ? AArch64_AM::UXTH : AArch64_AM::InvalidShiftExtend;
6356   case 0xFFFFFFFF:
6357     return AArch64_AM::UXTW;
6358   }
6359 }
6360 
6361 Register AArch64InstructionSelector::moveScalarRegClass(
6362     Register Reg, const TargetRegisterClass &RC, MachineIRBuilder &MIB) const {
6363   MachineRegisterInfo &MRI = *MIB.getMRI();
6364   auto Ty = MRI.getType(Reg);
6365   assert(!Ty.isVector() && "Expected scalars only!");
6366   if (Ty.getSizeInBits() == TRI.getRegSizeInBits(RC))
6367     return Reg;
6368 
6369   // Create a copy and immediately select it.
6370   // FIXME: We should have an emitCopy function?
6371   auto Copy = MIB.buildCopy({&RC}, {Reg});
6372   selectCopy(*Copy, TII, MRI, TRI, RBI);
6373   return Copy.getReg(0);
6374 }
6375 
6376 /// Select an "extended register" operand. This operand folds in an extend
6377 /// followed by an optional left shift.
6378 InstructionSelector::ComplexRendererFns
6379 AArch64InstructionSelector::selectArithExtendedRegister(
6380     MachineOperand &Root) const {
6381   if (!Root.isReg())
6382     return None;
6383   MachineRegisterInfo &MRI =
6384       Root.getParent()->getParent()->getParent()->getRegInfo();
6385 
6386   uint64_t ShiftVal = 0;
6387   Register ExtReg;
6388   AArch64_AM::ShiftExtendType Ext;
6389   MachineInstr *RootDef = getDefIgnoringCopies(Root.getReg(), MRI);
6390   if (!RootDef)
6391     return None;
6392 
6393   if (!isWorthFoldingIntoExtendedReg(*RootDef, MRI))
6394     return None;
6395 
6396   // Check if we can fold a shift and an extend.
6397   if (RootDef->getOpcode() == TargetOpcode::G_SHL) {
6398     // Look for a constant on the RHS of the shift.
6399     MachineOperand &RHS = RootDef->getOperand(2);
6400     Optional<uint64_t> MaybeShiftVal = getImmedFromMO(RHS);
6401     if (!MaybeShiftVal)
6402       return None;
6403     ShiftVal = *MaybeShiftVal;
6404     if (ShiftVal > 4)
6405       return None;
6406     // Look for a valid extend instruction on the LHS of the shift.
6407     MachineOperand &LHS = RootDef->getOperand(1);
6408     MachineInstr *ExtDef = getDefIgnoringCopies(LHS.getReg(), MRI);
6409     if (!ExtDef)
6410       return None;
6411     Ext = getExtendTypeForInst(*ExtDef, MRI);
6412     if (Ext == AArch64_AM::InvalidShiftExtend)
6413       return None;
6414     ExtReg = ExtDef->getOperand(1).getReg();
6415   } else {
6416     // Didn't get a shift. Try just folding an extend.
6417     Ext = getExtendTypeForInst(*RootDef, MRI);
6418     if (Ext == AArch64_AM::InvalidShiftExtend)
6419       return None;
6420     ExtReg = RootDef->getOperand(1).getReg();
6421 
6422     // If we have a 32 bit instruction which zeroes out the high half of a
6423     // register, we get an implicit zero extend for free. Check if we have one.
6424     // FIXME: We actually emit the extend right now even though we don't have
6425     // to.
6426     if (Ext == AArch64_AM::UXTW && MRI.getType(ExtReg).getSizeInBits() == 32) {
6427       MachineInstr *ExtInst = MRI.getVRegDef(ExtReg);
6428       if (ExtInst && isDef32(*ExtInst))
6429         return None;
6430     }
6431   }
6432 
6433   // We require a GPR32 here. Narrow the ExtReg if needed using a subregister
6434   // copy.
6435   MachineIRBuilder MIB(*RootDef);
6436   ExtReg = moveScalarRegClass(ExtReg, AArch64::GPR32RegClass, MIB);
6437 
6438   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); },
6439            [=](MachineInstrBuilder &MIB) {
6440              MIB.addImm(getArithExtendImm(Ext, ShiftVal));
6441            }}};
6442 }
6443 
6444 void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB,
6445                                                 const MachineInstr &MI,
6446                                                 int OpIdx) const {
6447   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
6448   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6449          "Expected G_CONSTANT");
6450   Optional<int64_t> CstVal =
6451       getIConstantVRegSExtVal(MI.getOperand(0).getReg(), MRI);
6452   assert(CstVal && "Expected constant value");
6453   MIB.addImm(CstVal.getValue());
6454 }
6455 
6456 void AArch64InstructionSelector::renderLogicalImm32(
6457   MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
6458   assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6459          "Expected G_CONSTANT");
6460   uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
6461   uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 32);
6462   MIB.addImm(Enc);
6463 }
6464 
6465 void AArch64InstructionSelector::renderLogicalImm64(
6466   MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
6467   assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6468          "Expected G_CONSTANT");
6469   uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
6470   uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 64);
6471   MIB.addImm(Enc);
6472 }
6473 
6474 void AArch64InstructionSelector::renderFPImm16(MachineInstrBuilder &MIB,
6475                                                const MachineInstr &MI,
6476                                                int OpIdx) const {
6477   assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
6478          "Expected G_FCONSTANT");
6479   MIB.addImm(
6480       AArch64_AM::getFP16Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
6481 }
6482 
6483 void AArch64InstructionSelector::renderFPImm32(MachineInstrBuilder &MIB,
6484                                                const MachineInstr &MI,
6485                                                int OpIdx) const {
6486   assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
6487          "Expected G_FCONSTANT");
6488   MIB.addImm(
6489       AArch64_AM::getFP32Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
6490 }
6491 
6492 void AArch64InstructionSelector::renderFPImm64(MachineInstrBuilder &MIB,
6493                                                const MachineInstr &MI,
6494                                                int OpIdx) const {
6495   assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
6496          "Expected G_FCONSTANT");
6497   MIB.addImm(
6498       AArch64_AM::getFP64Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
6499 }
6500 
6501 bool AArch64InstructionSelector::isLoadStoreOfNumBytes(
6502     const MachineInstr &MI, unsigned NumBytes) const {
6503   if (!MI.mayLoadOrStore())
6504     return false;
6505   assert(MI.hasOneMemOperand() &&
6506          "Expected load/store to have only one mem op!");
6507   return (*MI.memoperands_begin())->getSize() == NumBytes;
6508 }
6509 
6510 bool AArch64InstructionSelector::isDef32(const MachineInstr &MI) const {
6511   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
6512   if (MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() != 32)
6513     return false;
6514 
6515   // Only return true if we know the operation will zero-out the high half of
6516   // the 64-bit register. Truncates can be subregister copies, which don't
6517   // zero out the high bits. Copies and other copy-like instructions can be
6518   // fed by truncates, or could be lowered as subregister copies.
6519   switch (MI.getOpcode()) {
6520   default:
6521     return true;
6522   case TargetOpcode::COPY:
6523   case TargetOpcode::G_BITCAST:
6524   case TargetOpcode::G_TRUNC:
6525   case TargetOpcode::G_PHI:
6526     return false;
6527   }
6528 }
6529 
6530 
6531 // Perform fixups on the given PHI instruction's operands to force them all
6532 // to be the same as the destination regbank.
6533 static void fixupPHIOpBanks(MachineInstr &MI, MachineRegisterInfo &MRI,
6534                             const AArch64RegisterBankInfo &RBI) {
6535   assert(MI.getOpcode() == TargetOpcode::G_PHI && "Expected a G_PHI");
6536   Register DstReg = MI.getOperand(0).getReg();
6537   const RegisterBank *DstRB = MRI.getRegBankOrNull(DstReg);
6538   assert(DstRB && "Expected PHI dst to have regbank assigned");
6539   MachineIRBuilder MIB(MI);
6540 
6541   // Go through each operand and ensure it has the same regbank.
6542   for (MachineOperand &MO : llvm::drop_begin(MI.operands())) {
6543     if (!MO.isReg())
6544       continue;
6545     Register OpReg = MO.getReg();
6546     const RegisterBank *RB = MRI.getRegBankOrNull(OpReg);
6547     if (RB != DstRB) {
6548       // Insert a cross-bank copy.
6549       auto *OpDef = MRI.getVRegDef(OpReg);
6550       const LLT &Ty = MRI.getType(OpReg);
6551       MachineBasicBlock &OpDefBB = *OpDef->getParent();
6552 
6553       // Any instruction we insert must appear after all PHIs in the block
6554       // for the block to be valid MIR.
6555       MachineBasicBlock::iterator InsertPt = std::next(OpDef->getIterator());
6556       if (InsertPt != OpDefBB.end() && InsertPt->isPHI())
6557         InsertPt = OpDefBB.getFirstNonPHI();
6558       MIB.setInsertPt(*OpDef->getParent(), InsertPt);
6559       auto Copy = MIB.buildCopy(Ty, OpReg);
6560       MRI.setRegBank(Copy.getReg(0), *DstRB);
6561       MO.setReg(Copy.getReg(0));
6562     }
6563   }
6564 }
6565 
6566 void AArch64InstructionSelector::processPHIs(MachineFunction &MF) {
6567   // We're looking for PHIs, build a list so we don't invalidate iterators.
6568   MachineRegisterInfo &MRI = MF.getRegInfo();
6569   SmallVector<MachineInstr *, 32> Phis;
6570   for (auto &BB : MF) {
6571     for (auto &MI : BB) {
6572       if (MI.getOpcode() == TargetOpcode::G_PHI)
6573         Phis.emplace_back(&MI);
6574     }
6575   }
6576 
6577   for (auto *MI : Phis) {
6578     // We need to do some work here if the operand types are < 16 bit and they
6579     // are split across fpr/gpr banks. Since all types <32b on gpr
6580     // end up being assigned gpr32 regclasses, we can end up with PHIs here
6581     // which try to select between a gpr32 and an fpr16. Ideally RBS shouldn't
6582     // be selecting heterogenous regbanks for operands if possible, but we
6583     // still need to be able to deal with it here.
6584     //
6585     // To fix this, if we have a gpr-bank operand < 32b in size and at least
6586     // one other operand is on the fpr bank, then we add cross-bank copies
6587     // to homogenize the operand banks. For simplicity the bank that we choose
6588     // to settle on is whatever bank the def operand has. For example:
6589     //
6590     // %endbb:
6591     //   %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2:fpr(s16), %bb2
6592     //  =>
6593     // %bb2:
6594     //   ...
6595     //   %in2_copy:gpr(s16) = COPY %in2:fpr(s16)
6596     //   ...
6597     // %endbb:
6598     //   %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2_copy:gpr(s16), %bb2
6599     bool HasGPROp = false, HasFPROp = false;
6600     for (const MachineOperand &MO : llvm::drop_begin(MI->operands())) {
6601       if (!MO.isReg())
6602         continue;
6603       const LLT &Ty = MRI.getType(MO.getReg());
6604       if (!Ty.isValid() || !Ty.isScalar())
6605         break;
6606       if (Ty.getSizeInBits() >= 32)
6607         break;
6608       const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg());
6609       // If for some reason we don't have a regbank yet. Don't try anything.
6610       if (!RB)
6611         break;
6612 
6613       if (RB->getID() == AArch64::GPRRegBankID)
6614         HasGPROp = true;
6615       else
6616         HasFPROp = true;
6617     }
6618     // We have heterogenous regbanks, need to fixup.
6619     if (HasGPROp && HasFPROp)
6620       fixupPHIOpBanks(*MI, MRI, RBI);
6621   }
6622 }
6623 
6624 namespace llvm {
6625 InstructionSelector *
6626 createAArch64InstructionSelector(const AArch64TargetMachine &TM,
6627                                  AArch64Subtarget &Subtarget,
6628                                  AArch64RegisterBankInfo &RBI) {
6629   return new AArch64InstructionSelector(TM, Subtarget, RBI);
6630 }
6631 }
6632