1 //===- AArch64InstructionSelector.cpp ----------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the InstructionSelector class for
10 /// AArch64.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AArch64GlobalISelUtils.h"
15 #include "AArch64InstrInfo.h"
16 #include "AArch64MachineFunctionInfo.h"
17 #include "AArch64RegisterBankInfo.h"
18 #include "AArch64RegisterInfo.h"
19 #include "AArch64Subtarget.h"
20 #include "AArch64TargetMachine.h"
21 #include "MCTargetDesc/AArch64AddressingModes.h"
22 #include "MCTargetDesc/AArch64MCTargetDesc.h"
23 #include "llvm/BinaryFormat/Dwarf.h"
24 #include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
25 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
26 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
27 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
28 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
29 #include "llvm/CodeGen/GlobalISel/Utils.h"
30 #include "llvm/CodeGen/MachineBasicBlock.h"
31 #include "llvm/CodeGen/MachineConstantPool.h"
32 #include "llvm/CodeGen/MachineFrameInfo.h"
33 #include "llvm/CodeGen/MachineFunction.h"
34 #include "llvm/CodeGen/MachineInstr.h"
35 #include "llvm/CodeGen/MachineInstrBuilder.h"
36 #include "llvm/CodeGen/MachineMemOperand.h"
37 #include "llvm/CodeGen/MachineOperand.h"
38 #include "llvm/CodeGen/MachineRegisterInfo.h"
39 #include "llvm/CodeGen/TargetOpcodes.h"
40 #include "llvm/CodeGen/TargetRegisterInfo.h"
41 #include "llvm/IR/Constants.h"
42 #include "llvm/IR/DerivedTypes.h"
43 #include "llvm/IR/Instructions.h"
44 #include "llvm/IR/IntrinsicsAArch64.h"
45 #include "llvm/IR/PatternMatch.h"
46 #include "llvm/IR/Type.h"
47 #include "llvm/Pass.h"
48 #include "llvm/Support/Debug.h"
49 #include "llvm/Support/raw_ostream.h"
50 #include <optional>
51 
52 #define DEBUG_TYPE "aarch64-isel"
53 
54 using namespace llvm;
55 using namespace MIPatternMatch;
56 using namespace AArch64GISelUtils;
57 
58 namespace llvm {
59 class BlockFrequencyInfo;
60 class ProfileSummaryInfo;
61 }
62 
63 namespace {
64 
65 #define GET_GLOBALISEL_PREDICATE_BITSET
66 #include "AArch64GenGlobalISel.inc"
67 #undef GET_GLOBALISEL_PREDICATE_BITSET
68 
69 
70 class AArch64InstructionSelector : public InstructionSelector {
71 public:
72   AArch64InstructionSelector(const AArch64TargetMachine &TM,
73                              const AArch64Subtarget &STI,
74                              const AArch64RegisterBankInfo &RBI);
75 
76   bool select(MachineInstr &I) override;
77   static const char *getName() { return DEBUG_TYPE; }
78 
79   void setupMF(MachineFunction &MF, GISelKnownBits *KB,
80                CodeGenCoverage *CoverageInfo, ProfileSummaryInfo *PSI,
81                BlockFrequencyInfo *BFI) override {
82     InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
83     MIB.setMF(MF);
84 
85     // hasFnAttribute() is expensive to call on every BRCOND selection, so
86     // cache it here for each run of the selector.
87     ProduceNonFlagSettingCondBr =
88         !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
89     MFReturnAddr = Register();
90 
91     processPHIs(MF);
92   }
93 
94 private:
95   /// tblgen-erated 'select' implementation, used as the initial selector for
96   /// the patterns that don't require complex C++.
97   bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;
98 
99   // A lowering phase that runs before any selection attempts.
100   // Returns true if the instruction was modified.
101   bool preISelLower(MachineInstr &I);
102 
103   // An early selection function that runs before the selectImpl() call.
104   bool earlySelect(MachineInstr &I);
105 
106   /// Save state that is shared between select calls, call select on \p I and
107   /// then restore the saved state. This can be used to recursively call select
108   /// within a select call.
109   bool selectAndRestoreState(MachineInstr &I);
110 
111   // Do some preprocessing of G_PHIs before we begin selection.
112   void processPHIs(MachineFunction &MF);
113 
114   bool earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI);
115 
116   /// Eliminate same-sized cross-bank copies into stores before selectImpl().
117   bool contractCrossBankCopyIntoStore(MachineInstr &I,
118                                       MachineRegisterInfo &MRI);
119 
120   bool convertPtrAddToAdd(MachineInstr &I, MachineRegisterInfo &MRI);
121 
122   bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF,
123                           MachineRegisterInfo &MRI) const;
124   bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF,
125                            MachineRegisterInfo &MRI) const;
126 
127   ///@{
128   /// Helper functions for selectCompareBranch.
129   bool selectCompareBranchFedByFCmp(MachineInstr &I, MachineInstr &FCmp,
130                                     MachineIRBuilder &MIB) const;
131   bool selectCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp,
132                                     MachineIRBuilder &MIB) const;
133   bool tryOptCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp,
134                                     MachineIRBuilder &MIB) const;
135   bool tryOptAndIntoCompareBranch(MachineInstr &AndInst, bool Invert,
136                                   MachineBasicBlock *DstMBB,
137                                   MachineIRBuilder &MIB) const;
138   ///@}
139 
140   bool selectCompareBranch(MachineInstr &I, MachineFunction &MF,
141                            MachineRegisterInfo &MRI);
142 
143   bool selectVectorAshrLshr(MachineInstr &I, MachineRegisterInfo &MRI);
144   bool selectVectorSHL(MachineInstr &I, MachineRegisterInfo &MRI);
145 
146   // Helper to generate an equivalent of scalar_to_vector into a new register
147   // of class DstRC; returns the instruction that defines that register.
148   MachineInstr *emitScalarToVector(unsigned EltSize,
149                                    const TargetRegisterClass *DstRC,
150                                    Register Scalar,
151                                    MachineIRBuilder &MIRBuilder) const;
152   /// Helper to narrow a vector that was widened by emitScalarToVector.
153   /// Copies the lowest part of a 128-bit or 64-bit vector to a 64-bit or
154   /// 32-bit vector, respectively.
155   MachineInstr *emitNarrowVector(Register DstReg, Register SrcReg,
156                                  MachineIRBuilder &MIRBuilder,
157                                  MachineRegisterInfo &MRI) const;
158 
159   /// Emit a lane insert into \p DstReg, or a new vector register if
160   /// std::nullopt is provided.
161   ///
162   /// The lane inserted into is defined by \p LaneIdx. The vector source
163   /// register is given by \p SrcReg. The register containing the element is
164   /// given by \p EltReg.
165   MachineInstr *emitLaneInsert(std::optional<Register> DstReg, Register SrcReg,
166                                Register EltReg, unsigned LaneIdx,
167                                const RegisterBank &RB,
168                                MachineIRBuilder &MIRBuilder) const;
169 
170   /// Emit a sequence of instructions representing a constant \p CV for a
171   /// vector register \p Dst. (E.g. a MOV, or a load from a constant pool.)
172   ///
173   /// \returns the last instruction in the sequence on success, and nullptr
174   /// otherwise.
175   MachineInstr *emitConstantVector(Register Dst, Constant *CV,
176                                    MachineIRBuilder &MIRBuilder,
177                                    MachineRegisterInfo &MRI);
178 
179   MachineInstr *tryAdvSIMDModImm8(Register Dst, unsigned DstSize, APInt Bits,
180                                   MachineIRBuilder &MIRBuilder);
181 
182   MachineInstr *tryAdvSIMDModImm16(Register Dst, unsigned DstSize, APInt Bits,
183                                    MachineIRBuilder &MIRBuilder, bool Inv);
184 
185   MachineInstr *tryAdvSIMDModImm32(Register Dst, unsigned DstSize, APInt Bits,
186                                    MachineIRBuilder &MIRBuilder, bool Inv);
187   MachineInstr *tryAdvSIMDModImm64(Register Dst, unsigned DstSize, APInt Bits,
188                                    MachineIRBuilder &MIRBuilder);
189   MachineInstr *tryAdvSIMDModImm321s(Register Dst, unsigned DstSize, APInt Bits,
190                                      MachineIRBuilder &MIRBuilder, bool Inv);
191   MachineInstr *tryAdvSIMDModImmFP(Register Dst, unsigned DstSize, APInt Bits,
192                                    MachineIRBuilder &MIRBuilder);
193 
194   bool selectInsertElt(MachineInstr &I, MachineRegisterInfo &MRI);
195   bool tryOptConstantBuildVec(MachineInstr &MI, LLT DstTy,
196                               MachineRegisterInfo &MRI);
197   /// \returns true if a G_BUILD_VECTOR instruction \p MI can be selected as a
198   /// SUBREG_TO_REG.
199   bool tryOptBuildVecToSubregToReg(MachineInstr &MI, MachineRegisterInfo &MRI);
200   bool selectBuildVector(MachineInstr &I, MachineRegisterInfo &MRI);
201   bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI);
202   bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI);
203 
204   bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI);
205   bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI);
206   bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI);
207   bool selectSplitVectorUnmerge(MachineInstr &I, MachineRegisterInfo &MRI);
208 
209   /// Helper function to select vector load intrinsics like
210   /// @llvm.aarch64.neon.ld2.*, @llvm.aarch64.neon.ld4.*, etc.
211   /// \p Opc is the opcode that the selected instruction should use.
212   /// \p NumVecs is the number of vector destinations for the instruction.
213   /// \p I is the original G_INTRINSIC_W_SIDE_EFFECTS instruction.
214   bool selectVectorLoadIntrinsic(unsigned Opc, unsigned NumVecs,
215                                  MachineInstr &I);
216   bool selectVectorLoadLaneIntrinsic(unsigned Opc, unsigned NumVecs,
217                                      MachineInstr &I);
218   void selectVectorStoreIntrinsic(MachineInstr &I, unsigned NumVecs,
219                                   unsigned Opc);
220   bool selectVectorStoreLaneIntrinsic(MachineInstr &I, unsigned NumVecs,
221                                       unsigned Opc);
222   bool selectIntrinsicWithSideEffects(MachineInstr &I,
223                                       MachineRegisterInfo &MRI);
224   bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI);
225   bool selectVectorICmp(MachineInstr &I, MachineRegisterInfo &MRI);
226   bool selectJumpTable(MachineInstr &I, MachineRegisterInfo &MRI);
227   bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI);
228   bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI);
229   bool selectReduction(MachineInstr &I, MachineRegisterInfo &MRI);
230   bool selectMOPS(MachineInstr &I, MachineRegisterInfo &MRI);
231   bool selectUSMovFromExtend(MachineInstr &I, MachineRegisterInfo &MRI);
232 
233   bool selectIndexedExtLoad(MachineInstr &I, MachineRegisterInfo &MRI);
234   bool selectIndexedLoad(MachineInstr &I, MachineRegisterInfo &MRI);
235   bool selectIndexedStore(GIndexedStore &I, MachineRegisterInfo &MRI);
236 
237   unsigned emitConstantPoolEntry(const Constant *CPVal,
238                                  MachineFunction &MF) const;
239   MachineInstr *emitLoadFromConstantPool(const Constant *CPVal,
240                                          MachineIRBuilder &MIRBuilder) const;
241 
242   // Emit a vector concat operation.
243   MachineInstr *emitVectorConcat(std::optional<Register> Dst, Register Op1,
244                                  Register Op2,
245                                  MachineIRBuilder &MIRBuilder) const;
246 
247   // Emit an integer compare between LHS and RHS, which checks for Predicate.
248   MachineInstr *emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
249                                    MachineOperand &Predicate,
250                                    MachineIRBuilder &MIRBuilder) const;
251 
252   /// Emit a floating point comparison between \p LHS and \p RHS.
253   /// \p Pred if given is the intended predicate to use.
254   MachineInstr *
255   emitFPCompare(Register LHS, Register RHS, MachineIRBuilder &MIRBuilder,
256                 std::optional<CmpInst::Predicate> = std::nullopt) const;
257 
258   MachineInstr *
259   emitInstr(unsigned Opcode, std::initializer_list<llvm::DstOp> DstOps,
260             std::initializer_list<llvm::SrcOp> SrcOps,
261             MachineIRBuilder &MIRBuilder,
262             const ComplexRendererFns &RenderFns = std::nullopt) const;
263   /// Helper function to emit an add or sub instruction.
264   ///
265   /// \p AddrModeAndSizeToOpcode must contain each of the opcode variants above
266   /// in a specific order.
267   ///
268   /// Below is an example of the expected input to \p AddrModeAndSizeToOpcode.
269   ///
270   /// \code
271   ///   const std::array<std::array<unsigned, 2>, 5> Table {
272   ///    {{AArch64::ADDXri, AArch64::ADDWri},
273   ///     {AArch64::ADDXrs, AArch64::ADDWrs},
274   ///     {AArch64::ADDXrr, AArch64::ADDWrr},
275   ///     {AArch64::SUBXri, AArch64::SUBWri},
276   ///     {AArch64::ADDXrx, AArch64::ADDWrx}}};
277   /// \endcode
278   ///
279   /// Each row in the table corresponds to a different addressing mode. Each
280   /// column corresponds to a different register size.
281   ///
282   /// \attention Rows must be structured as follows:
283   ///   - Row 0: The ri opcode variants
284   ///   - Row 1: The rs opcode variants
285   ///   - Row 2: The rr opcode variants
286   ///   - Row 3: The ri opcode variants for negative immediates
287   ///   - Row 4: The rx opcode variants
288   ///
289   /// \attention Columns must be structured as follows:
290   ///   - Column 0: The 64-bit opcode variants
291   ///   - Column 1: The 32-bit opcode variants
292   ///
293   /// \p Dst is the destination register of the binop to emit.
294   /// \p LHS is the left-hand operand of the binop to emit.
295   /// \p RHS is the right-hand operand of the binop to emit.
296   MachineInstr *emitAddSub(
297       const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode,
298       Register Dst, MachineOperand &LHS, MachineOperand &RHS,
299       MachineIRBuilder &MIRBuilder) const;
300   MachineInstr *emitADD(Register DefReg, MachineOperand &LHS,
301                         MachineOperand &RHS,
302                         MachineIRBuilder &MIRBuilder) const;
303   MachineInstr *emitADDS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
304                          MachineIRBuilder &MIRBuilder) const;
305   MachineInstr *emitSUBS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
306                          MachineIRBuilder &MIRBuilder) const;
307   MachineInstr *emitADCS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
308                          MachineIRBuilder &MIRBuilder) const;
309   MachineInstr *emitSBCS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
310                          MachineIRBuilder &MIRBuilder) const;
311   MachineInstr *emitCMN(MachineOperand &LHS, MachineOperand &RHS,
312                         MachineIRBuilder &MIRBuilder) const;
313   MachineInstr *emitTST(MachineOperand &LHS, MachineOperand &RHS,
314                         MachineIRBuilder &MIRBuilder) const;
315   MachineInstr *emitSelect(Register Dst, Register LHS, Register RHS,
316                            AArch64CC::CondCode CC,
317                            MachineIRBuilder &MIRBuilder) const;
318   MachineInstr *emitExtractVectorElt(std::optional<Register> DstReg,
319                                      const RegisterBank &DstRB, LLT ScalarTy,
320                                      Register VecReg, unsigned LaneIdx,
321                                      MachineIRBuilder &MIRBuilder) const;
322   MachineInstr *emitCSINC(Register Dst, Register Src1, Register Src2,
323                           AArch64CC::CondCode Pred,
324                           MachineIRBuilder &MIRBuilder) const;
325   /// Emit a CSet for a FP compare.
326   ///
327   /// \p Dst is expected to be a 32-bit scalar register.
328   MachineInstr *emitCSetForFCmp(Register Dst, CmpInst::Predicate Pred,
329                                 MachineIRBuilder &MIRBuilder) const;
330 
331   /// Emit an instruction that sets NZCV to the carry-in expected by \p I.
332   /// Might elide the instruction if the previous instruction already sets NZCV
333   /// correctly.
334   MachineInstr *emitCarryIn(MachineInstr &I, Register CarryReg);
335 
336   /// Emit the overflow op for \p Opcode.
337   ///
338   /// \p Opcode is expected to be an overflow op's opcode, e.g. G_UADDO,
339   /// G_USUBO, etc.
340   std::pair<MachineInstr *, AArch64CC::CondCode>
341   emitOverflowOp(unsigned Opcode, Register Dst, MachineOperand &LHS,
342                  MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const;
343 
344   bool selectOverflowOp(MachineInstr &I, MachineRegisterInfo &MRI);
345 
346   /// Emit an expression as a conjunction (a series of CCMP/CFCMP ops).
347   /// In some cases this is even possible with OR operations in the expression.
348   MachineInstr *emitConjunction(Register Val, AArch64CC::CondCode &OutCC,
349                                 MachineIRBuilder &MIB) const;
350   MachineInstr *emitConditionalComparison(Register LHS, Register RHS,
351                                           CmpInst::Predicate CC,
352                                           AArch64CC::CondCode Predicate,
353                                           AArch64CC::CondCode OutCC,
354                                           MachineIRBuilder &MIB) const;
355   MachineInstr *emitConjunctionRec(Register Val, AArch64CC::CondCode &OutCC,
356                                    bool Negate, Register CCOp,
357                                    AArch64CC::CondCode Predicate,
358                                    MachineIRBuilder &MIB) const;
359 
360   /// Emit a TB(N)Z instruction which tests \p Bit in \p TestReg.
361   /// \p IsNegative is true if the test should be "not zero".
362   /// This will also optimize the test bit instruction when possible.
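  /// For illustration, a sketch of the intended use and output, using the
  /// parameter names from the declaration below (values are assumed):
  /// \code
  ///   // Branch to DstMBB if bit 3 of TestReg is non-zero:
  ///   emitTestBit(TestReg, /*Bit=*/3, /*IsNegative=*/true, DstMBB, MIB);
  ///   // ... roughly: TBNZ TestReg, #3, DstMBB
  /// \endcode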
363   MachineInstr *emitTestBit(Register TestReg, uint64_t Bit, bool IsNegative,
364                             MachineBasicBlock *DstMBB,
365                             MachineIRBuilder &MIB) const;
366 
367   /// Emit a CB(N)Z instruction which branches to \p DestMBB.
368   MachineInstr *emitCBZ(Register CompareReg, bool IsNegative,
369                         MachineBasicBlock *DestMBB,
370                         MachineIRBuilder &MIB) const;
371 
372   // Equivalent to the i32shift_a and friends from AArch64InstrInfo.td.
373   // We use these manually instead of using the importer since it doesn't
374   // support SDNodeXForm.
375   ComplexRendererFns selectShiftA_32(const MachineOperand &Root) const;
376   ComplexRendererFns selectShiftB_32(const MachineOperand &Root) const;
377   ComplexRendererFns selectShiftA_64(const MachineOperand &Root) const;
378   ComplexRendererFns selectShiftB_64(const MachineOperand &Root) const;
379 
380   ComplexRendererFns select12BitValueWithLeftShift(uint64_t Immed) const;
381   ComplexRendererFns selectArithImmed(MachineOperand &Root) const;
382   ComplexRendererFns selectNegArithImmed(MachineOperand &Root) const;
383 
384   ComplexRendererFns selectAddrModeUnscaled(MachineOperand &Root,
385                                             unsigned Size) const;
386 
387   ComplexRendererFns selectAddrModeUnscaled8(MachineOperand &Root) const {
388     return selectAddrModeUnscaled(Root, 1);
389   }
390   ComplexRendererFns selectAddrModeUnscaled16(MachineOperand &Root) const {
391     return selectAddrModeUnscaled(Root, 2);
392   }
393   ComplexRendererFns selectAddrModeUnscaled32(MachineOperand &Root) const {
394     return selectAddrModeUnscaled(Root, 4);
395   }
396   ComplexRendererFns selectAddrModeUnscaled64(MachineOperand &Root) const {
397     return selectAddrModeUnscaled(Root, 8);
398   }
399   ComplexRendererFns selectAddrModeUnscaled128(MachineOperand &Root) const {
400     return selectAddrModeUnscaled(Root, 16);
401   }
402 
403   /// Helper to try to fold a GISEL_ADD_LOW into an immediate, to be used
404   /// from complex pattern matchers like selectAddrModeIndexed().
405   ComplexRendererFns tryFoldAddLowIntoImm(MachineInstr &RootDef, unsigned Size,
406                                           MachineRegisterInfo &MRI) const;
407 
408   ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root,
409                                            unsigned Size) const;
410   template <int Width>
411   ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root) const {
412     return selectAddrModeIndexed(Root, Width / 8);
413   }
414 
415   bool isWorthFoldingIntoExtendedReg(MachineInstr &MI,
416                                      const MachineRegisterInfo &MRI) const;
417   ComplexRendererFns
418   selectAddrModeShiftedExtendXReg(MachineOperand &Root,
419                                   unsigned SizeInBytes) const;
420 
421   /// Returns a \p ComplexRendererFns which contains a base, offset, and whether
422   /// or not a shift + extend should be folded into an addressing mode. Returns
423   /// std::nullopt when this is not profitable or possible.
424   ComplexRendererFns
425   selectExtendedSHL(MachineOperand &Root, MachineOperand &Base,
426                     MachineOperand &Offset, unsigned SizeInBytes,
427                     bool WantsExt) const;
428   ComplexRendererFns selectAddrModeRegisterOffset(MachineOperand &Root) const;
429   ComplexRendererFns selectAddrModeXRO(MachineOperand &Root,
430                                        unsigned SizeInBytes) const;
431   template <int Width>
432   ComplexRendererFns selectAddrModeXRO(MachineOperand &Root) const {
433     return selectAddrModeXRO(Root, Width / 8);
434   }
435 
436   ComplexRendererFns selectAddrModeWRO(MachineOperand &Root,
437                                        unsigned SizeInBytes) const;
438   template <int Width>
439   ComplexRendererFns selectAddrModeWRO(MachineOperand &Root) const {
440     return selectAddrModeWRO(Root, Width / 8);
441   }
442 
443   ComplexRendererFns selectShiftedRegister(MachineOperand &Root,
444                                            bool AllowROR = false) const;
445 
446   ComplexRendererFns selectArithShiftedRegister(MachineOperand &Root) const {
447     return selectShiftedRegister(Root);
448   }
449 
450   ComplexRendererFns selectLogicalShiftedRegister(MachineOperand &Root) const {
451     return selectShiftedRegister(Root, true);
452   }
453 
454   /// Given an extend instruction, determine the correct shift-extend type for
455   /// that instruction.
456   ///
457   /// If the instruction is going to be used in a load or store, pass
458   /// \p IsLoadStore = true.
459   AArch64_AM::ShiftExtendType
460   getExtendTypeForInst(MachineInstr &MI, MachineRegisterInfo &MRI,
461                        bool IsLoadStore = false) const;
462 
463   /// Move \p Reg to \p RC if \p Reg is not already in \p RC.
464   ///
465   /// \returns Either \p Reg if no change was necessary, or the new register
466   /// created by moving \p Reg.
467   ///
468   /// Note: This uses emitCopy right now.
469   Register moveScalarRegClass(Register Reg, const TargetRegisterClass &RC,
470                               MachineIRBuilder &MIB) const;
471 
472   ComplexRendererFns selectArithExtendedRegister(MachineOperand &Root) const;
473 
474   ComplexRendererFns selectExtractHigh(MachineOperand &Root) const;
475 
476   void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
477                       int OpIdx = -1) const;
478   void renderLogicalImm32(MachineInstrBuilder &MIB, const MachineInstr &I,
479                           int OpIdx = -1) const;
480   void renderLogicalImm64(MachineInstrBuilder &MIB, const MachineInstr &I,
481                           int OpIdx = -1) const;
482   void renderFPImm16(MachineInstrBuilder &MIB, const MachineInstr &MI,
483                      int OpIdx = -1) const;
484   void renderFPImm32(MachineInstrBuilder &MIB, const MachineInstr &MI,
485                      int OpIdx = -1) const;
486   void renderFPImm64(MachineInstrBuilder &MIB, const MachineInstr &MI,
487                      int OpIdx = -1) const;
488   void renderFPImm32SIMDModImmType4(MachineInstrBuilder &MIB,
489                                     const MachineInstr &MI,
490                                     int OpIdx = -1) const;
491 
492   // Materialize a GlobalValue or BlockAddress using a movz+movk sequence.
493   void materializeLargeCMVal(MachineInstr &I, const Value *V, unsigned OpFlags);
494 
495   // Optimization methods.
496   bool tryOptSelect(GSelect &Sel);
497   bool tryOptSelectConjunction(GSelect &Sel, MachineInstr &CondMI);
498   MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
499                                       MachineOperand &Predicate,
500                                       MachineIRBuilder &MIRBuilder) const;
501 
502   /// Return true if \p MI is a load or store of \p NumBytes bytes.
503   bool isLoadStoreOfNumBytes(const MachineInstr &MI, unsigned NumBytes) const;
504 
505   /// Returns true if \p MI is guaranteed to have the high-half of a 64-bit
506   /// register zeroed out. In other words, the result of MI has been explicitly
507   /// zero extended.
508   bool isDef32(const MachineInstr &MI) const;
509 
510   const AArch64TargetMachine &TM;
511   const AArch64Subtarget &STI;
512   const AArch64InstrInfo &TII;
513   const AArch64RegisterInfo &TRI;
514   const AArch64RegisterBankInfo &RBI;
515 
516   bool ProduceNonFlagSettingCondBr = false;
517 
518   // Some cached values used during selection.
519   // We use LR as a live-in register, and we keep track of it here as it can be
520   // clobbered by calls.
521   Register MFReturnAddr;
522 
523   MachineIRBuilder MIB;
524 
525 #define GET_GLOBALISEL_PREDICATES_DECL
526 #include "AArch64GenGlobalISel.inc"
527 #undef GET_GLOBALISEL_PREDICATES_DECL
528 
529 // We declare the temporaries used by selectImpl() in the class to minimize the
530 // cost of constructing placeholder values.
531 #define GET_GLOBALISEL_TEMPORARIES_DECL
532 #include "AArch64GenGlobalISel.inc"
533 #undef GET_GLOBALISEL_TEMPORARIES_DECL
534 };
535 
536 } // end anonymous namespace
537 
538 #define GET_GLOBALISEL_IMPL
539 #include "AArch64GenGlobalISel.inc"
540 #undef GET_GLOBALISEL_IMPL
541 
542 AArch64InstructionSelector::AArch64InstructionSelector(
543     const AArch64TargetMachine &TM, const AArch64Subtarget &STI,
544     const AArch64RegisterBankInfo &RBI)
545     : TM(TM), STI(STI), TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()),
546       RBI(RBI),
547 #define GET_GLOBALISEL_PREDICATES_INIT
548 #include "AArch64GenGlobalISel.inc"
549 #undef GET_GLOBALISEL_PREDICATES_INIT
550 #define GET_GLOBALISEL_TEMPORARIES_INIT
551 #include "AArch64GenGlobalISel.inc"
552 #undef GET_GLOBALISEL_TEMPORARIES_INIT
553 {
554 }
555 
556 // FIXME: This should be target-independent, inferred from the types declared
557 // for each class in the bank.
558 //
559 /// Given a register bank, and a type, return the smallest register class that
560 /// can represent that combination.
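/// For illustration, based on the mappings below (GPRBank and FPRBank stand
/// for register banks with the corresponding bank IDs):
/// \code
///   getRegClassForTypeOnBank(LLT::scalar(32), GPRBank);          // &AArch64::GPR32RegClass
///   getRegClassForTypeOnBank(LLT::fixed_vector(2, 64), FPRBank); // &AArch64::FPR128RegClass
/// \endcode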
561 static const TargetRegisterClass *
562 getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB,
563                          bool GetAllRegSet = false) {
564   if (RB.getID() == AArch64::GPRRegBankID) {
565     if (Ty.getSizeInBits() <= 32)
566       return GetAllRegSet ? &AArch64::GPR32allRegClass
567                           : &AArch64::GPR32RegClass;
568     if (Ty.getSizeInBits() == 64)
569       return GetAllRegSet ? &AArch64::GPR64allRegClass
570                           : &AArch64::GPR64RegClass;
571     if (Ty.getSizeInBits() == 128)
572       return &AArch64::XSeqPairsClassRegClass;
573     return nullptr;
574   }
575 
576   if (RB.getID() == AArch64::FPRRegBankID) {
577     switch (Ty.getSizeInBits()) {
578     case 8:
579       return &AArch64::FPR8RegClass;
580     case 16:
581       return &AArch64::FPR16RegClass;
582     case 32:
583       return &AArch64::FPR32RegClass;
584     case 64:
585       return &AArch64::FPR64RegClass;
586     case 128:
587       return &AArch64::FPR128RegClass;
588     }
589     return nullptr;
590   }
591 
592   return nullptr;
593 }
594 
595 /// Given a register bank, and size in bits, return the smallest register class
596 /// that can represent that combination.
597 static const TargetRegisterClass *
598 getMinClassForRegBank(const RegisterBank &RB, unsigned SizeInBits,
599                       bool GetAllRegSet = false) {
600   unsigned RegBankID = RB.getID();
601 
602   if (RegBankID == AArch64::GPRRegBankID) {
603     if (SizeInBits <= 32)
604       return GetAllRegSet ? &AArch64::GPR32allRegClass
605                           : &AArch64::GPR32RegClass;
606     if (SizeInBits == 64)
607       return GetAllRegSet ? &AArch64::GPR64allRegClass
608                           : &AArch64::GPR64RegClass;
609     if (SizeInBits == 128)
610       return &AArch64::XSeqPairsClassRegClass;
611   }
612 
613   if (RegBankID == AArch64::FPRRegBankID) {
614     switch (SizeInBits) {
615     default:
616       return nullptr;
617     case 8:
618       return &AArch64::FPR8RegClass;
619     case 16:
620       return &AArch64::FPR16RegClass;
621     case 32:
622       return &AArch64::FPR32RegClass;
623     case 64:
624       return &AArch64::FPR64RegClass;
625     case 128:
626       return &AArch64::FPR128RegClass;
627     }
628   }
629 
630   return nullptr;
631 }
632 
633 /// Returns the correct subregister to use for a given register class.
634 static bool getSubRegForClass(const TargetRegisterClass *RC,
635                               const TargetRegisterInfo &TRI, unsigned &SubReg) {
636   switch (TRI.getRegSizeInBits(*RC)) {
637   case 8:
638     SubReg = AArch64::bsub;
639     break;
640   case 16:
641     SubReg = AArch64::hsub;
642     break;
643   case 32:
644     if (RC != &AArch64::FPR32RegClass)
645       SubReg = AArch64::sub_32;
646     else
647       SubReg = AArch64::ssub;
648     break;
649   case 64:
650     SubReg = AArch64::dsub;
651     break;
652   default:
653     LLVM_DEBUG(
654         dbgs() << "Couldn't find appropriate subregister for register class.");
655     return false;
656   }
657 
658   return true;
659 }
660 
661 /// Returns the minimum size the given register bank can hold.
662 static unsigned getMinSizeForRegBank(const RegisterBank &RB) {
663   switch (RB.getID()) {
664   case AArch64::GPRRegBankID:
665     return 32;
666   case AArch64::FPRRegBankID:
667     return 8;
668   default:
669     llvm_unreachable("Tried to get minimum size for unknown register bank.");
670   }
671 }
672 
673 /// Create a REG_SEQUENCE instruction using the registers in \p Regs.
674 /// Helper function for functions like createDTuple and createQTuple.
675 ///
676 /// \p RegClassIDs - The list of register class IDs available for some tuple of
677 /// a scalar class. E.g. QQRegClassID, QQQRegClassID, QQQQRegClassID. This is
678 /// expected to contain between 2 and 4 tuple classes.
679 ///
680 /// \p SubRegs - The list of subregister classes associated with each register
681 /// class ID in \p RegClassIDs. E.g., QQRegClassID should use the qsub0
682 /// subregister class. The index of each subregister class is expected to
683 /// correspond with the index of each register class.
684 ///
685 /// \returns Either the destination register of REG_SEQUENCE instruction that
686 /// was created, or the 0th element of \p Regs if \p Regs contains a single
687 /// element.
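/// For illustration, a two-register D tuple built through createDTuple (see
/// below) corresponds roughly to the following MIR (register names are
/// placeholders):
/// \code
///   %tuple:dd = REG_SEQUENCE %reg0, %subreg.dsub0, %reg1, %subreg.dsub1
/// \endcode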
688 static Register createTuple(ArrayRef<Register> Regs,
689                             const unsigned RegClassIDs[],
690                             const unsigned SubRegs[], MachineIRBuilder &MIB) {
691   unsigned NumRegs = Regs.size();
692   if (NumRegs == 1)
693     return Regs[0];
694   assert(NumRegs >= 2 && NumRegs <= 4 &&
695          "Only support between 2 and 4 registers in a tuple!");
696   const TargetRegisterInfo *TRI = MIB.getMF().getSubtarget().getRegisterInfo();
697   auto *DesiredClass = TRI->getRegClass(RegClassIDs[NumRegs - 2]);
698   auto RegSequence =
699       MIB.buildInstr(TargetOpcode::REG_SEQUENCE, {DesiredClass}, {});
700   for (unsigned I = 0, E = Regs.size(); I < E; ++I) {
701     RegSequence.addUse(Regs[I]);
702     RegSequence.addImm(SubRegs[I]);
703   }
704   return RegSequence.getReg(0);
705 }
706 
707 /// Create a tuple of D-registers using the registers in \p Regs.
708 static Register createDTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) {
709   static const unsigned RegClassIDs[] = {
710       AArch64::DDRegClassID, AArch64::DDDRegClassID, AArch64::DDDDRegClassID};
711   static const unsigned SubRegs[] = {AArch64::dsub0, AArch64::dsub1,
712                                      AArch64::dsub2, AArch64::dsub3};
713   return createTuple(Regs, RegClassIDs, SubRegs, MIB);
714 }
715 
716 /// Create a tuple of Q-registers using the registers in \p Regs.
717 static Register createQTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) {
718   static const unsigned RegClassIDs[] = {
719       AArch64::QQRegClassID, AArch64::QQQRegClassID, AArch64::QQQQRegClassID};
720   static const unsigned SubRegs[] = {AArch64::qsub0, AArch64::qsub1,
721                                      AArch64::qsub2, AArch64::qsub3};
722   return createTuple(Regs, RegClassIDs, SubRegs, MIB);
723 }
724 
725 static std::optional<uint64_t> getImmedFromMO(const MachineOperand &Root) {
726   auto &MI = *Root.getParent();
727   auto &MBB = *MI.getParent();
728   auto &MF = *MBB.getParent();
729   auto &MRI = MF.getRegInfo();
730   uint64_t Immed;
731   if (Root.isImm())
732     Immed = Root.getImm();
733   else if (Root.isCImm())
734     Immed = Root.getCImm()->getZExtValue();
735   else if (Root.isReg()) {
736     auto ValAndVReg =
737         getIConstantVRegValWithLookThrough(Root.getReg(), MRI, true);
738     if (!ValAndVReg)
739       return std::nullopt;
740     Immed = ValAndVReg->Value.getSExtValue();
741   } else
742     return std::nullopt;
743   return Immed;
744 }
745 
746 /// Check whether \p I is a currently unsupported binary operation:
747 /// - it has an unsized type
748 /// - an operand is not a vreg
749 /// - its operands are not all in the same register bank
750 /// These are checks that should someday live in the verifier, but right now,
751 /// these are mostly limitations of the AArch64 selector.
752 static bool unsupportedBinOp(const MachineInstr &I,
753                              const AArch64RegisterBankInfo &RBI,
754                              const MachineRegisterInfo &MRI,
755                              const AArch64RegisterInfo &TRI) {
756   LLT Ty = MRI.getType(I.getOperand(0).getReg());
757   if (!Ty.isValid()) {
758     LLVM_DEBUG(dbgs() << "Generic binop register should be typed\n");
759     return true;
760   }
761 
762   const RegisterBank *PrevOpBank = nullptr;
763   for (auto &MO : I.operands()) {
764     // FIXME: Support non-register operands.
765     if (!MO.isReg()) {
766       LLVM_DEBUG(dbgs() << "Generic inst non-reg operands are unsupported\n");
767       return true;
768     }
769 
770     // FIXME: Can generic operations have physical registers operands? If
771     // so, this will need to be taught about that, and we'll need to get the
772     // bank out of the minimal class for the register.
773     // Either way, this needs to be documented (and possibly verified).
774     if (!MO.getReg().isVirtual()) {
775       LLVM_DEBUG(dbgs() << "Generic inst has physical register operand\n");
776       return true;
777     }
778 
779     const RegisterBank *OpBank = RBI.getRegBank(MO.getReg(), MRI, TRI);
780     if (!OpBank) {
781       LLVM_DEBUG(dbgs() << "Generic register has no bank or class\n");
782       return true;
783     }
784 
785     if (PrevOpBank && OpBank != PrevOpBank) {
786       LLVM_DEBUG(dbgs() << "Generic inst operands have different banks\n");
787       return true;
788     }
789     PrevOpBank = OpBank;
790   }
791   return false;
792 }
793 
794 /// Select the AArch64 opcode for the basic binary operation \p GenericOpc
795 /// (such as G_OR or G_SDIV), appropriate for the register bank \p RegBankID
796 /// and of size \p OpSize.
797 /// \returns \p GenericOpc if the combination is unsupported.
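/// For example, per the mappings below:
/// \code
///   selectBinaryOp(TargetOpcode::G_SHL, AArch64::GPRRegBankID, 32);  // AArch64::LSLVWr
///   selectBinaryOp(TargetOpcode::G_FADD, AArch64::FPRRegBankID, 64); // AArch64::FADDDrr
/// \endcode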
798 static unsigned selectBinaryOp(unsigned GenericOpc, unsigned RegBankID,
799                                unsigned OpSize) {
800   switch (RegBankID) {
801   case AArch64::GPRRegBankID:
802     if (OpSize == 32) {
803       switch (GenericOpc) {
804       case TargetOpcode::G_SHL:
805         return AArch64::LSLVWr;
806       case TargetOpcode::G_LSHR:
807         return AArch64::LSRVWr;
808       case TargetOpcode::G_ASHR:
809         return AArch64::ASRVWr;
810       default:
811         return GenericOpc;
812       }
813     } else if (OpSize == 64) {
814       switch (GenericOpc) {
815       case TargetOpcode::G_PTR_ADD:
816         return AArch64::ADDXrr;
817       case TargetOpcode::G_SHL:
818         return AArch64::LSLVXr;
819       case TargetOpcode::G_LSHR:
820         return AArch64::LSRVXr;
821       case TargetOpcode::G_ASHR:
822         return AArch64::ASRVXr;
823       default:
824         return GenericOpc;
825       }
826     }
827     break;
828   case AArch64::FPRRegBankID:
829     switch (OpSize) {
830     case 32:
831       switch (GenericOpc) {
832       case TargetOpcode::G_FADD:
833         return AArch64::FADDSrr;
834       case TargetOpcode::G_FSUB:
835         return AArch64::FSUBSrr;
836       case TargetOpcode::G_FMUL:
837         return AArch64::FMULSrr;
838       case TargetOpcode::G_FDIV:
839         return AArch64::FDIVSrr;
840       default:
841         return GenericOpc;
842       }
843     case 64:
844       switch (GenericOpc) {
845       case TargetOpcode::G_FADD:
846         return AArch64::FADDDrr;
847       case TargetOpcode::G_FSUB:
848         return AArch64::FSUBDrr;
849       case TargetOpcode::G_FMUL:
850         return AArch64::FMULDrr;
851       case TargetOpcode::G_FDIV:
852         return AArch64::FDIVDrr;
853       case TargetOpcode::G_OR:
854         return AArch64::ORRv8i8;
855       default:
856         return GenericOpc;
857       }
858     }
859     break;
860   }
861   return GenericOpc;
862 }
863 
864 /// Select the AArch64 opcode for the G_LOAD or G_STORE operation \p GenericOpc,
865 /// appropriate for the (value) register bank \p RegBankID and of memory access
866 /// size \p OpSize.  This returns the variant with the base+unsigned-immediate
867 /// addressing mode (e.g., LDRXui).
868 /// \returns \p GenericOpc if the combination is unsupported.
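/// For example, per the mappings below:
/// \code
///   selectLoadStoreUIOp(TargetOpcode::G_LOAD, AArch64::GPRRegBankID, 32);   // AArch64::LDRWui
///   selectLoadStoreUIOp(TargetOpcode::G_STORE, AArch64::FPRRegBankID, 128); // AArch64::STRQui
/// \endcode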
869 static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID,
870                                     unsigned OpSize) {
871   const bool isStore = GenericOpc == TargetOpcode::G_STORE;
872   switch (RegBankID) {
873   case AArch64::GPRRegBankID:
874     switch (OpSize) {
875     case 8:
876       return isStore ? AArch64::STRBBui : AArch64::LDRBBui;
877     case 16:
878       return isStore ? AArch64::STRHHui : AArch64::LDRHHui;
879     case 32:
880       return isStore ? AArch64::STRWui : AArch64::LDRWui;
881     case 64:
882       return isStore ? AArch64::STRXui : AArch64::LDRXui;
883     }
884     break;
885   case AArch64::FPRRegBankID:
886     switch (OpSize) {
887     case 8:
888       return isStore ? AArch64::STRBui : AArch64::LDRBui;
889     case 16:
890       return isStore ? AArch64::STRHui : AArch64::LDRHui;
891     case 32:
892       return isStore ? AArch64::STRSui : AArch64::LDRSui;
893     case 64:
894       return isStore ? AArch64::STRDui : AArch64::LDRDui;
895     case 128:
896       return isStore ? AArch64::STRQui : AArch64::LDRQui;
897     }
898     break;
899   }
900   return GenericOpc;
901 }
902 
903 /// Helper function for selectCopy. Inserts a subregister copy from \p SrcReg
904 /// to \p *To.
905 ///
906 /// E.g "To = COPY SrcReg:SubReg"
907 static bool copySubReg(MachineInstr &I, MachineRegisterInfo &MRI,
908                        const RegisterBankInfo &RBI, Register SrcReg,
909                        const TargetRegisterClass *To, unsigned SubReg) {
910   assert(SrcReg.isValid() && "Expected a valid source register?");
911   assert(To && "Destination register class cannot be null");
912   assert(SubReg && "Expected a valid subregister");
913 
914   MachineIRBuilder MIB(I);
915   auto SubRegCopy =
916       MIB.buildInstr(TargetOpcode::COPY, {To}, {}).addReg(SrcReg, 0, SubReg);
917   MachineOperand &RegOp = I.getOperand(1);
918   RegOp.setReg(SubRegCopy.getReg(0));
919 
920   // It's possible that the destination register won't be constrained. Make
921   // sure that happens.
922   if (!I.getOperand(0).getReg().isPhysical())
923     RBI.constrainGenericRegister(I.getOperand(0).getReg(), *To, MRI);
924 
925   return true;
926 }
927 
928 /// Helper function to get the source and destination register classes for a
929 /// copy. Returns a std::pair containing the source register class for the
930 /// copy, and the destination register class for the copy. If a register class
931 /// cannot be determined, then it will be nullptr.
932 static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
933 getRegClassesForCopy(MachineInstr &I, const TargetInstrInfo &TII,
934                      MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
935                      const RegisterBankInfo &RBI) {
936   Register DstReg = I.getOperand(0).getReg();
937   Register SrcReg = I.getOperand(1).getReg();
938   const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
939   const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
940   unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI);
941   unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
942 
943   // Special casing for cross-bank copies of s1s. We can technically represent
944   // a 1-bit value with any size of register. The minimum size for a GPR is 32
945   // bits. So, we need to put the FPR on 32 bits as well.
946   //
947   // FIXME: I'm not sure if this case holds true outside of copies. If it does,
948   // then we can pull it into the helpers that get the appropriate class for a
949   // register bank. Or make a new helper that carries along some constraint
950   // information.
951   if (SrcRegBank != DstRegBank && (DstSize == 1 && SrcSize == 1))
952     SrcSize = DstSize = 32;
953 
954   return {getMinClassForRegBank(SrcRegBank, SrcSize, true),
955           getMinClassForRegBank(DstRegBank, DstSize, true)};
956 }
957 
958 // FIXME: We need some sort of API in RBI/TRI to allow generic code to
959 // constrain operands of simple instructions given a TargetRegisterClass
960 // and LLT
961 static bool selectDebugInstr(MachineInstr &I, MachineRegisterInfo &MRI,
962                              const RegisterBankInfo &RBI) {
963   for (MachineOperand &MO : I.operands()) {
964     if (!MO.isReg())
965       continue;
966     Register Reg = MO.getReg();
967     if (!Reg)
968       continue;
969     if (Reg.isPhysical())
970       continue;
971     LLT Ty = MRI.getType(Reg);
972     const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
973     const TargetRegisterClass *RC =
974         RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
975     if (!RC) {
976       const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
977       RC = getRegClassForTypeOnBank(Ty, RB);
978       if (!RC) {
979         LLVM_DEBUG(
980             dbgs() << "Warning: DBG_VALUE operand has unexpected size/bank\n");
981         break;
982       }
983     }
984     RBI.constrainGenericRegister(Reg, *RC, MRI);
985   }
986 
987   return true;
988 }
989 
990 static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
991                        MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
992                        const RegisterBankInfo &RBI) {
993   Register DstReg = I.getOperand(0).getReg();
994   Register SrcReg = I.getOperand(1).getReg();
995   const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
996   const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
997 
998   // Find the correct register classes for the source and destination registers.
999   const TargetRegisterClass *SrcRC;
1000   const TargetRegisterClass *DstRC;
1001   std::tie(SrcRC, DstRC) = getRegClassesForCopy(I, TII, MRI, TRI, RBI);
1002 
1003   if (!DstRC) {
1004     LLVM_DEBUG(dbgs() << "Unexpected dest size "
1005                       << RBI.getSizeInBits(DstReg, MRI, TRI) << '\n');
1006     return false;
1007   }
1008 
1009   // Is this a copy? If so, then we may need to insert a subregister copy.
1010   if (I.isCopy()) {
1011     // Yes. Check if there's anything to fix up.
1012     if (!SrcRC) {
1013       LLVM_DEBUG(dbgs() << "Couldn't determine source register class\n");
1014       return false;
1015     }
1016 
1017     unsigned SrcSize = TRI.getRegSizeInBits(*SrcRC);
1018     unsigned DstSize = TRI.getRegSizeInBits(*DstRC);
1019     unsigned SubReg;
1020 
1021     // If the source bank doesn't support a subregister copy small enough,
1022     // then we first need to copy to the destination bank.
1023     if (getMinSizeForRegBank(SrcRegBank) > DstSize) {
1024       const TargetRegisterClass *DstTempRC =
1025           getMinClassForRegBank(DstRegBank, SrcSize, /* GetAllRegSet */ true);
1026       getSubRegForClass(DstRC, TRI, SubReg);
1027 
1028       MachineIRBuilder MIB(I);
1029       auto Copy = MIB.buildCopy({DstTempRC}, {SrcReg});
1030       copySubReg(I, MRI, RBI, Copy.getReg(0), DstRC, SubReg);
1031     } else if (SrcSize > DstSize) {
1032       // If the source register is bigger than the destination we need to
1033       // perform a subregister copy.
1034       const TargetRegisterClass *SubRegRC =
1035           getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true);
1036       getSubRegForClass(SubRegRC, TRI, SubReg);
1037       copySubReg(I, MRI, RBI, SrcReg, DstRC, SubReg);
1038     } else if (DstSize > SrcSize) {
1039       // If the destination register is bigger than the source we need to do
1040       // a promotion using SUBREG_TO_REG.
1041       const TargetRegisterClass *PromotionRC =
1042           getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true);
1043       getSubRegForClass(SrcRC, TRI, SubReg);
1044 
1045       Register PromoteReg = MRI.createVirtualRegister(PromotionRC);
1046       BuildMI(*I.getParent(), I, I.getDebugLoc(),
1047               TII.get(AArch64::SUBREG_TO_REG), PromoteReg)
1048           .addImm(0)
1049           .addUse(SrcReg)
1050           .addImm(SubReg);
1051       MachineOperand &RegOp = I.getOperand(1);
1052       RegOp.setReg(PromoteReg);
1053     }
1054 
1055     // If the destination is a physical register, then there's nothing to
1056     // change, so we're done.
1057     if (DstReg.isPhysical())
1058       return true;
1059   }
1060 
1061   // No need to constrain SrcReg. It will get constrained when we hit another
1062   // of its uses or defs. Copies do not have constraints.
1063   if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
1064     LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
1065                       << " operand\n");
1066     return false;
1067   }
1068 
1069   // If this is a GPR ZEXT that we want to just reduce down into a copy.
1070   // The sizes will be mismatched with the source < 32b, but that's ok.
1071   if (I.getOpcode() == TargetOpcode::G_ZEXT) {
1072     I.setDesc(TII.get(AArch64::COPY));
1073     assert(SrcRegBank.getID() == AArch64::GPRRegBankID);
1074     return selectCopy(I, TII, MRI, TRI, RBI);
1075   }
1076 
1077   I.setDesc(TII.get(AArch64::COPY));
1078   return true;
1079 }
1080 
1081 static unsigned selectFPConvOpc(unsigned GenericOpc, LLT DstTy, LLT SrcTy) {
1082   if (!DstTy.isScalar() || !SrcTy.isScalar())
1083     return GenericOpc;
1084 
1085   const unsigned DstSize = DstTy.getSizeInBits();
1086   const unsigned SrcSize = SrcTy.getSizeInBits();
1087 
1088   switch (DstSize) {
1089   case 32:
1090     switch (SrcSize) {
1091     case 32:
1092       switch (GenericOpc) {
1093       case TargetOpcode::G_SITOFP:
1094         return AArch64::SCVTFUWSri;
1095       case TargetOpcode::G_UITOFP:
1096         return AArch64::UCVTFUWSri;
1097       case TargetOpcode::G_FPTOSI:
1098         return AArch64::FCVTZSUWSr;
1099       case TargetOpcode::G_FPTOUI:
1100         return AArch64::FCVTZUUWSr;
1101       default:
1102         return GenericOpc;
1103       }
1104     case 64:
1105       switch (GenericOpc) {
1106       case TargetOpcode::G_SITOFP:
1107         return AArch64::SCVTFUXSri;
1108       case TargetOpcode::G_UITOFP:
1109         return AArch64::UCVTFUXSri;
1110       case TargetOpcode::G_FPTOSI:
1111         return AArch64::FCVTZSUWDr;
1112       case TargetOpcode::G_FPTOUI:
1113         return AArch64::FCVTZUUWDr;
1114       default:
1115         return GenericOpc;
1116       }
1117     default:
1118       return GenericOpc;
1119     }
1120   case 64:
1121     switch (SrcSize) {
1122     case 32:
1123       switch (GenericOpc) {
1124       case TargetOpcode::G_SITOFP:
1125         return AArch64::SCVTFUWDri;
1126       case TargetOpcode::G_UITOFP:
1127         return AArch64::UCVTFUWDri;
1128       case TargetOpcode::G_FPTOSI:
1129         return AArch64::FCVTZSUXSr;
1130       case TargetOpcode::G_FPTOUI:
1131         return AArch64::FCVTZUUXSr;
1132       default:
1133         return GenericOpc;
1134       }
1135     case 64:
1136       switch (GenericOpc) {
1137       case TargetOpcode::G_SITOFP:
1138         return AArch64::SCVTFUXDri;
1139       case TargetOpcode::G_UITOFP:
1140         return AArch64::UCVTFUXDri;
1141       case TargetOpcode::G_FPTOSI:
1142         return AArch64::FCVTZSUXDr;
1143       case TargetOpcode::G_FPTOUI:
1144         return AArch64::FCVTZUUXDr;
1145       default:
1146         return GenericOpc;
1147       }
1148     default:
1149       return GenericOpc;
1150     }
1151   default:
1152     return GenericOpc;
1153   }
1154   return GenericOpc;
1155 }
1156 
1157 MachineInstr *
1158 AArch64InstructionSelector::emitSelect(Register Dst, Register True,
1159                                        Register False, AArch64CC::CondCode CC,
1160                                        MachineIRBuilder &MIB) const {
1161   MachineRegisterInfo &MRI = *MIB.getMRI();
1162   assert(RBI.getRegBank(False, MRI, TRI)->getID() ==
1163              RBI.getRegBank(True, MRI, TRI)->getID() &&
1164          "Expected both select operands to have the same regbank?");
1165   LLT Ty = MRI.getType(True);
1166   if (Ty.isVector())
1167     return nullptr;
1168   const unsigned Size = Ty.getSizeInBits();
1169   assert((Size == 32 || Size == 64) &&
1170          "Expected 32 bit or 64 bit select only?");
1171   const bool Is32Bit = Size == 32;
1172   if (RBI.getRegBank(True, MRI, TRI)->getID() != AArch64::GPRRegBankID) {
1173     unsigned Opc = Is32Bit ? AArch64::FCSELSrrr : AArch64::FCSELDrrr;
1174     auto FCSel = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC);
1175     constrainSelectedInstRegOperands(*FCSel, TII, TRI, RBI);
1176     return &*FCSel;
1177   }
1178 
1179   // By default, we'll try and emit a CSEL.
1180   unsigned Opc = Is32Bit ? AArch64::CSELWr : AArch64::CSELXr;
1181   bool Optimized = false;
1182   auto TryFoldBinOpIntoSelect = [&Opc, Is32Bit, &CC, &MRI,
1183                                  &Optimized](Register &Reg, Register &OtherReg,
1184                                              bool Invert) {
1185     if (Optimized)
1186       return false;
1187 
1188     // Attempt to fold:
1189     //
1190     // %sub = G_SUB 0, %x
1191     // %select = G_SELECT cc, %reg, %sub
1192     //
1193     // Into:
1194     // %select = CSNEG %reg, %x, cc
1195     Register MatchReg;
1196     if (mi_match(Reg, MRI, m_Neg(m_Reg(MatchReg)))) {
1197       Opc = Is32Bit ? AArch64::CSNEGWr : AArch64::CSNEGXr;
1198       Reg = MatchReg;
1199       if (Invert) {
1200         CC = AArch64CC::getInvertedCondCode(CC);
1201         std::swap(Reg, OtherReg);
1202       }
1203       return true;
1204     }
1205 
1206     // Attempt to fold:
1207     //
1208     // %xor = G_XOR %x, -1
1209     // %select = G_SELECT cc, %reg, %xor
1210     //
1211     // Into:
1212     // %select = CSINV %reg, %x, cc
1213     if (mi_match(Reg, MRI, m_Not(m_Reg(MatchReg)))) {
1214       Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1215       Reg = MatchReg;
1216       if (Invert) {
1217         CC = AArch64CC::getInvertedCondCode(CC);
1218         std::swap(Reg, OtherReg);
1219       }
1220       return true;
1221     }
1222 
1223     // Attempt to fold:
1224     //
1225     // %add = G_ADD %x, 1
1226     // %select = G_SELECT cc, %reg, %add
1227     //
1228     // Into:
1229     // %select = CSINC %reg, %x, cc
1230     if (mi_match(Reg, MRI,
1231                  m_any_of(m_GAdd(m_Reg(MatchReg), m_SpecificICst(1)),
1232                           m_GPtrAdd(m_Reg(MatchReg), m_SpecificICst(1))))) {
1233       Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1234       Reg = MatchReg;
1235       if (Invert) {
1236         CC = AArch64CC::getInvertedCondCode(CC);
1237         std::swap(Reg, OtherReg);
1238       }
1239       return true;
1240     }
1241 
1242     return false;
1243   };
1244 
1245   // Helper lambda which tries to use CSINC/CSINV for the instruction when its
1246   // true/false values are constants.
1247   // FIXME: All of these patterns already exist in tablegen. We should be
1248   // able to import these.
1249   auto TryOptSelectCst = [&Opc, &True, &False, &CC, Is32Bit, &MRI,
1250                           &Optimized]() {
1251     if (Optimized)
1252       return false;
1253     auto TrueCst = getIConstantVRegValWithLookThrough(True, MRI);
1254     auto FalseCst = getIConstantVRegValWithLookThrough(False, MRI);
1255     if (!TrueCst && !FalseCst)
1256       return false;
1257 
1258     Register ZReg = Is32Bit ? AArch64::WZR : AArch64::XZR;
1259     if (TrueCst && FalseCst) {
1260       int64_t T = TrueCst->Value.getSExtValue();
1261       int64_t F = FalseCst->Value.getSExtValue();
1262 
1263       if (T == 0 && F == 1) {
1264         // G_SELECT cc, 0, 1 -> CSINC zreg, zreg, cc
1265         Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1266         True = ZReg;
1267         False = ZReg;
1268         return true;
1269       }
1270 
1271       if (T == 0 && F == -1) {
1272         // G_SELECT cc, 0, -1 -> CSINV zreg, zreg, cc
1273         Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1274         True = ZReg;
1275         False = ZReg;
1276         return true;
1277       }
1278     }
1279 
1280     if (TrueCst) {
1281       int64_t T = TrueCst->Value.getSExtValue();
1282       if (T == 1) {
1283         // G_SELECT cc, 1, f -> CSINC f, zreg, inv_cc
1284         Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1285         True = False;
1286         False = ZReg;
1287         CC = AArch64CC::getInvertedCondCode(CC);
1288         return true;
1289       }
1290 
1291       if (T == -1) {
1292         // G_SELECT cc, -1, f -> CSINV f, zreg, inv_cc
1293         Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1294         True = False;
1295         False = ZReg;
1296         CC = AArch64CC::getInvertedCondCode(CC);
1297         return true;
1298       }
1299     }
1300 
1301     if (FalseCst) {
1302       int64_t F = FalseCst->Value.getSExtValue();
1303       if (F == 1) {
1304         // G_SELECT cc, t, 1 -> CSINC t, zreg, cc
1305         Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1306         False = ZReg;
1307         return true;
1308       }
1309 
1310       if (F == -1) {
1311         // G_SELECT cc, t, -1 -> CSINV t, zreg, cc
1312         Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1313         False = ZReg;
1314         return true;
1315       }
1316     }
1317     return false;
1318   };
1319 
1320   Optimized |= TryFoldBinOpIntoSelect(False, True, /*Invert = */ false);
1321   Optimized |= TryFoldBinOpIntoSelect(True, False, /*Invert = */ true);
1322   Optimized |= TryOptSelectCst();
1323   auto SelectInst = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC);
1324   constrainSelectedInstRegOperands(*SelectInst, TII, TRI, RBI);
1325   return &*SelectInst;
1326 }
1327 
1328 static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) {
1329   switch (P) {
1330   default:
1331     llvm_unreachable("Unknown condition code!");
1332   case CmpInst::ICMP_NE:
1333     return AArch64CC::NE;
1334   case CmpInst::ICMP_EQ:
1335     return AArch64CC::EQ;
1336   case CmpInst::ICMP_SGT:
1337     return AArch64CC::GT;
1338   case CmpInst::ICMP_SGE:
1339     return AArch64CC::GE;
1340   case CmpInst::ICMP_SLT:
1341     return AArch64CC::LT;
1342   case CmpInst::ICMP_SLE:
1343     return AArch64CC::LE;
1344   case CmpInst::ICMP_UGT:
1345     return AArch64CC::HI;
1346   case CmpInst::ICMP_UGE:
1347     return AArch64CC::HS;
1348   case CmpInst::ICMP_ULT:
1349     return AArch64CC::LO;
1350   case CmpInst::ICMP_ULE:
1351     return AArch64CC::LS;
1352   }
1353 }
1354 
1355 /// changeFPCCToORAArch64CC - Convert an IR fp condition code to an AArch64 CC.
1356 static void changeFPCCToORAArch64CC(CmpInst::Predicate CC,
1357                                     AArch64CC::CondCode &CondCode,
1358                                     AArch64CC::CondCode &CondCode2) {
1359   CondCode2 = AArch64CC::AL;
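       // FCMP sets NZCV to 0110 for equal, 1000 for less-than, 0010 for
       // greater-than and 0011 for unordered, which is why e.g. OLT maps to MI,
       // ORD to VC and UNO to VS below.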
1360   switch (CC) {
1361   default:
1362     llvm_unreachable("Unknown FP condition!");
1363   case CmpInst::FCMP_OEQ:
1364     CondCode = AArch64CC::EQ;
1365     break;
1366   case CmpInst::FCMP_OGT:
1367     CondCode = AArch64CC::GT;
1368     break;
1369   case CmpInst::FCMP_OGE:
1370     CondCode = AArch64CC::GE;
1371     break;
1372   case CmpInst::FCMP_OLT:
1373     CondCode = AArch64CC::MI;
1374     break;
1375   case CmpInst::FCMP_OLE:
1376     CondCode = AArch64CC::LS;
1377     break;
1378   case CmpInst::FCMP_ONE:
1379     CondCode = AArch64CC::MI;
1380     CondCode2 = AArch64CC::GT;
1381     break;
1382   case CmpInst::FCMP_ORD:
1383     CondCode = AArch64CC::VC;
1384     break;
1385   case CmpInst::FCMP_UNO:
1386     CondCode = AArch64CC::VS;
1387     break;
1388   case CmpInst::FCMP_UEQ:
1389     CondCode = AArch64CC::EQ;
1390     CondCode2 = AArch64CC::VS;
1391     break;
1392   case CmpInst::FCMP_UGT:
1393     CondCode = AArch64CC::HI;
1394     break;
1395   case CmpInst::FCMP_UGE:
1396     CondCode = AArch64CC::PL;
1397     break;
1398   case CmpInst::FCMP_ULT:
1399     CondCode = AArch64CC::LT;
1400     break;
1401   case CmpInst::FCMP_ULE:
1402     CondCode = AArch64CC::LE;
1403     break;
1404   case CmpInst::FCMP_UNE:
1405     CondCode = AArch64CC::NE;
1406     break;
1407   }
1408 }
1409 
1410 /// Convert an IR fp condition code to an AArch64 CC.
1411 /// This differs from changeFPCCToORAArch64CC in that it returns cond codes
1412 /// that should be AND'ed together instead of OR'ed.
1413 static void changeFPCCToANDAArch64CC(CmpInst::Predicate CC,
1414                                      AArch64CC::CondCode &CondCode,
1415                                      AArch64CC::CondCode &CondCode2) {
1416   CondCode2 = AArch64CC::AL;
1417   switch (CC) {
1418   default:
1419     changeFPCCToORAArch64CC(CC, CondCode, CondCode2);
1420     assert(CondCode2 == AArch64CC::AL);
1421     break;
1422   case CmpInst::FCMP_ONE:
1423     // (a one b)
1424     // == ((a olt b) || (a ogt b))
1425     // == ((a ord b) && (a une b))
1426     CondCode = AArch64CC::VC;
1427     CondCode2 = AArch64CC::NE;
1428     break;
1429   case CmpInst::FCMP_UEQ:
1430     // (a ueq b)
1431     // == ((a uno b) || (a oeq b))
1432     // == ((a ule b) && (a uge b))
1433     CondCode = AArch64CC::PL;
1434     CondCode2 = AArch64CC::LE;
1435     break;
1436   }
1437 }
1438 
1439 /// Return a register which can be used as a bit to test in a TB(N)Z.
1440 static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert,
1441                               MachineRegisterInfo &MRI) {
1442   assert(Reg.isValid() && "Expected valid register!");
1443   bool HasZext = false;
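       // Walk up the definition chain of Reg, looking through copies, extends,
       // truncates and single-use bitwise/shift ops with a constant operand,
       // updating Bit (and Invert, for XOR) so that the final TB(N)Z tests an
       // equivalent bit of an earlier register.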
1444   while (MachineInstr *MI = getDefIgnoringCopies(Reg, MRI)) {
1445     unsigned Opc = MI->getOpcode();
1446 
1447     if (!MI->getOperand(0).isReg() ||
1448         !MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
1449       break;
1450 
1451     // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
1452     //
1453     // (tbz (trunc x), b) -> (tbz x, b) is always safe, because the bit number
1454     // on the truncated x is the same as the bit number on x.
1455     if (Opc == TargetOpcode::G_ANYEXT || Opc == TargetOpcode::G_ZEXT ||
1456         Opc == TargetOpcode::G_TRUNC) {
1457       if (Opc == TargetOpcode::G_ZEXT)
1458         HasZext = true;
1459 
1460       Register NextReg = MI->getOperand(1).getReg();
1461       // Did we find something worth folding?
1462       if (!NextReg.isValid() || !MRI.hasOneNonDBGUse(NextReg))
1463         break;
1464 
1465       // NextReg is worth folding. Keep looking.
1466       Reg = NextReg;
1467       continue;
1468     }
1469 
1470     // Attempt to find a suitable operation with a constant on one side.
1471     std::optional<uint64_t> C;
1472     Register TestReg;
1473     switch (Opc) {
1474     default:
1475       break;
1476     case TargetOpcode::G_AND:
1477     case TargetOpcode::G_XOR: {
1478       TestReg = MI->getOperand(1).getReg();
1479       Register ConstantReg = MI->getOperand(2).getReg();
1480       auto VRegAndVal = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
1481       if (!VRegAndVal) {
1482         // AND commutes, check the other side for a constant.
1483         // FIXME: Can we canonicalize the constant so that it's always on the
1484         // same side at some point earlier?
1485         std::swap(ConstantReg, TestReg);
1486         VRegAndVal = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
1487       }
1488       if (VRegAndVal) {
1489         if (HasZext)
1490           C = VRegAndVal->Value.getZExtValue();
1491         else
1492           C = VRegAndVal->Value.getSExtValue();
1493       }
1494       break;
1495     }
1496     case TargetOpcode::G_ASHR:
1497     case TargetOpcode::G_LSHR:
1498     case TargetOpcode::G_SHL: {
1499       TestReg = MI->getOperand(1).getReg();
1500       auto VRegAndVal =
1501           getIConstantVRegValWithLookThrough(MI->getOperand(2).getReg(), MRI);
1502       if (VRegAndVal)
1503         C = VRegAndVal->Value.getSExtValue();
1504       break;
1505     }
1506     }
1507 
1508     // Didn't find a constant or viable register. Bail out of the loop.
1509     if (!C || !TestReg.isValid())
1510       break;
1511 
1512     // We found a suitable instruction with a constant. Check to see if we can
1513     // walk through the instruction.
1514     Register NextReg;
1515     unsigned TestRegSize = MRI.getType(TestReg).getSizeInBits();
1516     switch (Opc) {
1517     default:
1518       break;
1519     case TargetOpcode::G_AND:
1520       // (tbz (and x, m), b) -> (tbz x, b) when the b-th bit of m is set.
1521       if ((*C >> Bit) & 1)
1522         NextReg = TestReg;
1523       break;
1524     case TargetOpcode::G_SHL:
1525       // (tbz (shl x, c), b) -> (tbz x, b-c) when b-c is positive and fits in
1526       // the type of the register.
1527       if (*C <= Bit && (Bit - *C) < TestRegSize) {
1528         NextReg = TestReg;
1529         Bit = Bit - *C;
1530       }
1531       break;
1532     case TargetOpcode::G_ASHR:
1533       // (tbz (ashr x, c), b) -> (tbz x, b+c), or (tbz x, msb) if b+c is >= the
1534       // number of bits in x.
1535       NextReg = TestReg;
1536       Bit = Bit + *C;
1537       if (Bit >= TestRegSize)
1538         Bit = TestRegSize - 1;
1539       break;
1540     case TargetOpcode::G_LSHR:
1541       // (tbz (lshr x, c), b) -> (tbz x, b+c) when b + c is < # bits in x
1542       if ((Bit + *C) < TestRegSize) {
1543         NextReg = TestReg;
1544         Bit = Bit + *C;
1545       }
1546       break;
1547     case TargetOpcode::G_XOR:
1548       // We can walk through a G_XOR by inverting whether we use tbz/tbnz when
1549       // appropriate.
1550       //
1551       // e.g. If x' = xor x, c, and the b-th bit is set in c then
1552       //
1553       // tbz x', b -> tbnz x, b
1554       //
1555       // Because x' only has the b-th bit set if x does not.
1556       if ((*C >> Bit) & 1)
1557         Invert = !Invert;
1558       NextReg = TestReg;
1559       break;
1560     }
1561 
1562     // Check if we found anything worth folding.
1563     if (!NextReg.isValid())
1564       return Reg;
1565     Reg = NextReg;
1566   }
1567 
1568   return Reg;
1569 }
1570 
1571 MachineInstr *AArch64InstructionSelector::emitTestBit(
1572     Register TestReg, uint64_t Bit, bool IsNegative, MachineBasicBlock *DstMBB,
1573     MachineIRBuilder &MIB) const {
1574   assert(TestReg.isValid());
1575   assert(ProduceNonFlagSettingCondBr &&
1576          "Cannot emit TB(N)Z with speculation tracking!");
1577   MachineRegisterInfo &MRI = *MIB.getMRI();
1578 
1579   // Attempt to optimize the test bit by walking over instructions.
1580   TestReg = getTestBitReg(TestReg, Bit, IsNegative, MRI);
1581   LLT Ty = MRI.getType(TestReg);
1582   unsigned Size = Ty.getSizeInBits();
1583   assert(!Ty.isVector() && "Expected a scalar!");
1584   assert(Bit < 64 && "Bit is too large!");
1585 
1586   // TB(N)ZW requires a 32-bit test register and TB(N)ZX a 64-bit one, so move
1587   // the register to the matching register class if its size doesn't match.
1588   bool UseWReg = Bit < 32;
1589   unsigned NecessarySize = UseWReg ? 32 : 64;
1590   if (Size != NecessarySize)
1591     TestReg = moveScalarRegClass(
1592         TestReg, UseWReg ? AArch64::GPR32RegClass : AArch64::GPR64RegClass,
1593         MIB);
1594 
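       // OpcTable is indexed as [UseWReg][IsNegative]: the first index picks the
       // X-form vs. W-form opcode, the second picks TBZ (branch if the bit is
       // zero) vs. TBNZ (branch if the bit is not zero).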
1595   static const unsigned OpcTable[2][2] = {{AArch64::TBZX, AArch64::TBNZX},
1596                                           {AArch64::TBZW, AArch64::TBNZW}};
1597   unsigned Opc = OpcTable[UseWReg][IsNegative];
1598   auto TestBitMI =
1599       MIB.buildInstr(Opc).addReg(TestReg).addImm(Bit).addMBB(DstMBB);
1600   constrainSelectedInstRegOperands(*TestBitMI, TII, TRI, RBI);
1601   return &*TestBitMI;
1602 }
1603 
1604 bool AArch64InstructionSelector::tryOptAndIntoCompareBranch(
1605     MachineInstr &AndInst, bool Invert, MachineBasicBlock *DstMBB,
1606     MachineIRBuilder &MIB) const {
1607   assert(AndInst.getOpcode() == TargetOpcode::G_AND && "Expected G_AND only?");
1608   // Given something like this:
1609   //
1610   //  %x = ...Something...
1611   //  %one = G_CONSTANT i64 1
1612   //  %zero = G_CONSTANT i64 0
1613   //  %and = G_AND %x, %one
1614   //  %cmp = G_ICMP intpred(ne), %and, %zero
1615   //  %cmp_trunc = G_TRUNC %cmp
1616   //  G_BRCOND %cmp_trunc, %bb.3
1617   //
1618   // We want to try and fold the AND into the G_BRCOND and produce either a
1619   // TBNZ (when we have intpred(ne)) or a TBZ (when we have intpred(eq)).
1620   //
1621   // In this case, we'd get
1622   //
1623   // TBNZ %x, 0, %bb.3
1624   //
1625 
1626   // Check if the AND has a constant on its RHS which we can use as a mask.
1627   // If it's a power of 2, then it's the same as checking a specific bit.
1628   // (e.g., ANDing with 8 == ANDing with 000...1000 == testing if bit 3 is set)
1629   auto MaybeBit = getIConstantVRegValWithLookThrough(
1630       AndInst.getOperand(2).getReg(), *MIB.getMRI());
1631   if (!MaybeBit)
1632     return false;
1633 
1634   int32_t Bit = MaybeBit->Value.exactLogBase2();
1635   if (Bit < 0)
1636     return false;
1637 
1638   Register TestReg = AndInst.getOperand(1).getReg();
1639 
1640   // Emit a TB(N)Z.
1641   emitTestBit(TestReg, Bit, Invert, DstMBB, MIB);
1642   return true;
1643 }
1644 
1645 MachineInstr *AArch64InstructionSelector::emitCBZ(Register CompareReg,
1646                                                   bool IsNegative,
1647                                                   MachineBasicBlock *DestMBB,
1648                                                   MachineIRBuilder &MIB) const {
1649   assert(ProduceNonFlagSettingCondBr && "CBZ does not set flags!");
1650   MachineRegisterInfo &MRI = *MIB.getMRI();
1651   assert(RBI.getRegBank(CompareReg, MRI, TRI)->getID() ==
1652              AArch64::GPRRegBankID &&
1653          "Expected GPRs only?");
1654   auto Ty = MRI.getType(CompareReg);
1655   unsigned Width = Ty.getSizeInBits();
1656   assert(!Ty.isVector() && "Expected scalar only?");
1657   assert(Width <= 64 && "Expected width to be at most 64?");
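       // OpcTable is indexed as [IsNegative][Is64Bit]: the first index picks CBZ
       // vs. CBNZ, the second picks the W-form vs. X-form opcode.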
1658   static const unsigned OpcTable[2][2] = {{AArch64::CBZW, AArch64::CBZX},
1659                                           {AArch64::CBNZW, AArch64::CBNZX}};
1660   unsigned Opc = OpcTable[IsNegative][Width == 64];
1661   auto BranchMI = MIB.buildInstr(Opc, {}, {CompareReg}).addMBB(DestMBB);
1662   constrainSelectedInstRegOperands(*BranchMI, TII, TRI, RBI);
1663   return &*BranchMI;
1664 }
1665 
1666 bool AArch64InstructionSelector::selectCompareBranchFedByFCmp(
1667     MachineInstr &I, MachineInstr &FCmp, MachineIRBuilder &MIB) const {
1668   assert(FCmp.getOpcode() == TargetOpcode::G_FCMP);
1669   assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1670   // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
1671   // totally clean.  Some of them require two branches to implement.
1672   auto Pred = (CmpInst::Predicate)FCmp.getOperand(1).getPredicate();
1673   emitFPCompare(FCmp.getOperand(2).getReg(), FCmp.getOperand(3).getReg(), MIB,
1674                 Pred);
1675   AArch64CC::CondCode CC1, CC2;
1676   changeFCMPPredToAArch64CC(static_cast<CmpInst::Predicate>(Pred), CC1, CC2);
1677   MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
1678   MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC1).addMBB(DestMBB);
1679   if (CC2 != AArch64CC::AL)
1680     MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC2).addMBB(DestMBB);
1681   I.eraseFromParent();
1682   return true;
1683 }
1684 
1685 bool AArch64InstructionSelector::tryOptCompareBranchFedByICmp(
1686     MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const {
1687   assert(ICmp.getOpcode() == TargetOpcode::G_ICMP);
1688   assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1689   // Attempt to optimize the G_BRCOND + G_ICMP into a TB(N)Z/CB(N)Z.
1690   //
1691   // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z
1692   // instructions will not be produced, as they are conditional branch
1693   // instructions that do not set flags.
1694   if (!ProduceNonFlagSettingCondBr)
1695     return false;
1696 
1697   MachineRegisterInfo &MRI = *MIB.getMRI();
1698   MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
1699   auto Pred =
1700       static_cast<CmpInst::Predicate>(ICmp.getOperand(1).getPredicate());
1701   Register LHS = ICmp.getOperand(2).getReg();
1702   Register RHS = ICmp.getOperand(3).getReg();
1703 
1704   // We're allowed to emit a TB(N)Z/CB(N)Z. Try to do that.
1705   auto VRegAndVal = getIConstantVRegValWithLookThrough(RHS, MRI);
1706   MachineInstr *AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI);
1707 
1708   // When we can emit a TB(N)Z, prefer that.
1709   //
1710   // Handle non-commutative condition codes first.
1711   // Note that we don't want to do this when we have a G_AND because it can
1712   // become a tst. The tst will make the test bit in the TB(N)Z redundant.
1713   if (VRegAndVal && !AndInst) {
1714     int64_t C = VRegAndVal->Value.getSExtValue();
1715 
1716     // For a signed greater-than comparison against -1 (i.e. x > -1, which means
1717     // x >= 0), we can just test whether the msb is zero.
1718     if (C == -1 && Pred == CmpInst::ICMP_SGT) {
1719       uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1;
1720       emitTestBit(LHS, Bit, /*IsNegative = */ false, DestMBB, MIB);
1721       I.eraseFromParent();
1722       return true;
1723     }
1724 
1725     // For a signed less-than comparison against zero (x < 0), we can just test
1726     // whether the msb is not zero.
1727     if (C == 0 && Pred == CmpInst::ICMP_SLT) {
1728       uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1;
1729       emitTestBit(LHS, Bit, /*IsNegative = */ true, DestMBB, MIB);
1730       I.eraseFromParent();
1731       return true;
1732     }
1733 
1734     // Inversely, if we have a signed greater-than-or-equal comparison to zero,
1735     // we can test if the msb is zero.
1736     if (C == 0 && Pred == CmpInst::ICMP_SGE) {
1737       uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1;
1738       emitTestBit(LHS, Bit, /*IsNegative = */ false, DestMBB, MIB);
1739       I.eraseFromParent();
1740       return true;
1741     }
1742   }
1743 
1744   // Attempt to handle commutative condition codes. Right now, that's only
1745   // eq/ne.
1746   if (ICmpInst::isEquality(Pred)) {
1747     if (!VRegAndVal) {
1748       std::swap(RHS, LHS);
1749       VRegAndVal = getIConstantVRegValWithLookThrough(RHS, MRI);
1750       AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI);
1751     }
1752 
1753     if (VRegAndVal && VRegAndVal->Value == 0) {
1754       // If there's a G_AND feeding into this branch, try to fold it away by
1755       // emitting a TB(N)Z instead.
1756       //
1757       // Note: If we have LT, then it *is* possible to fold, but it wouldn't be
1758       // beneficial. When we have an AND and LT, we need a TST/ANDS, so folding
1759       // would be redundant.
1760       if (AndInst &&
1761           tryOptAndIntoCompareBranch(
1762               *AndInst, /*Invert = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB)) {
1763         I.eraseFromParent();
1764         return true;
1765       }
1766 
1767       // Otherwise, try to emit a CB(N)Z instead.
1768       auto LHSTy = MRI.getType(LHS);
1769       if (!LHSTy.isVector() && LHSTy.getSizeInBits() <= 64) {
1770         emitCBZ(LHS, /*IsNegative = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB);
1771         I.eraseFromParent();
1772         return true;
1773       }
1774     }
1775   }
1776 
1777   return false;
1778 }
1779 
1780 bool AArch64InstructionSelector::selectCompareBranchFedByICmp(
1781     MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const {
1782   assert(ICmp.getOpcode() == TargetOpcode::G_ICMP);
1783   assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1784   if (tryOptCompareBranchFedByICmp(I, ICmp, MIB))
1785     return true;
1786 
1787   // Couldn't optimize. Emit a compare + a Bcc.
1788   MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
1789   auto PredOp = ICmp.getOperand(1);
1790   emitIntegerCompare(ICmp.getOperand(2), ICmp.getOperand(3), PredOp, MIB);
1791   const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(
1792       static_cast<CmpInst::Predicate>(PredOp.getPredicate()));
1793   MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC).addMBB(DestMBB);
1794   I.eraseFromParent();
1795   return true;
1796 }
1797 
1798 bool AArch64InstructionSelector::selectCompareBranch(
1799     MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) {
1800   Register CondReg = I.getOperand(0).getReg();
1801   MachineInstr *CCMI = MRI.getVRegDef(CondReg);
1802   // Try to select the G_BRCOND using whatever is feeding the condition if
1803   // possible.
1804   unsigned CCMIOpc = CCMI->getOpcode();
1805   if (CCMIOpc == TargetOpcode::G_FCMP)
1806     return selectCompareBranchFedByFCmp(I, *CCMI, MIB);
1807   if (CCMIOpc == TargetOpcode::G_ICMP)
1808     return selectCompareBranchFedByICmp(I, *CCMI, MIB);
1809 
1810   // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z
1811   // instructions will not be produced, as they are conditional branch
1812   // instructions that do not set flags.
1813   if (ProduceNonFlagSettingCondBr) {
1814     emitTestBit(CondReg, /*Bit = */ 0, /*IsNegative = */ true,
1815                 I.getOperand(1).getMBB(), MIB);
1816     I.eraseFromParent();
1817     return true;
1818   }
1819 
1820   // Can't emit TB(N)Z/CB(N)Z. Emit a tst + bcc instead.
1821   auto TstMI =
1822       MIB.buildInstr(AArch64::ANDSWri, {LLT::scalar(32)}, {CondReg}).addImm(1);
1823   constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
1824   auto Bcc = MIB.buildInstr(AArch64::Bcc)
1825                  .addImm(AArch64CC::NE)
1826                  .addMBB(I.getOperand(1).getMBB());
1827   I.eraseFromParent();
1828   return constrainSelectedInstRegOperands(*Bcc, TII, TRI, RBI);
1829 }
1830 
1831 /// Returns the element immediate value of a vector shift operand if found.
1832 /// This needs to detect a splat-like operation, e.g. a G_BUILD_VECTOR.
1833 static std::optional<int64_t> getVectorShiftImm(Register Reg,
1834                                                 MachineRegisterInfo &MRI) {
1835   assert(MRI.getType(Reg).isVector() && "Expected a *vector* shift operand");
1836   MachineInstr *OpMI = MRI.getVRegDef(Reg);
1837   return getAArch64VectorSplatScalar(*OpMI, MRI);
1838 }
1839 
1840 /// Matches and returns the shift immediate value for a SHL instruction given
1841 /// a shift operand.
1842 static std::optional<int64_t> getVectorSHLImm(LLT SrcTy, Register Reg,
1843                                               MachineRegisterInfo &MRI) {
1844   std::optional<int64_t> ShiftImm = getVectorShiftImm(Reg, MRI);
1845   if (!ShiftImm)
1846     return std::nullopt;
1847   // Check the immediate is in range for a SHL.
1848   int64_t Imm = *ShiftImm;
1849   if (Imm < 0)
1850     return std::nullopt;
1851   switch (SrcTy.getElementType().getSizeInBits()) {
1852   default:
1853     LLVM_DEBUG(dbgs() << "Unhandled element type for vector shift");
1854     return std::nullopt;
1855   case 8:
1856     if (Imm > 7)
1857       return std::nullopt;
1858     break;
1859   case 16:
1860     if (Imm > 15)
1861       return std::nullopt;
1862     break;
1863   case 32:
1864     if (Imm > 31)
1865       return std::nullopt;
1866     break;
1867   case 64:
1868     if (Imm > 63)
1869       return std::nullopt;
1870     break;
1871   }
1872   return Imm;
1873 }
1874 
1875 bool AArch64InstructionSelector::selectVectorSHL(MachineInstr &I,
1876                                                  MachineRegisterInfo &MRI) {
1877   assert(I.getOpcode() == TargetOpcode::G_SHL);
1878   Register DstReg = I.getOperand(0).getReg();
1879   const LLT Ty = MRI.getType(DstReg);
1880   Register Src1Reg = I.getOperand(1).getReg();
1881   Register Src2Reg = I.getOperand(2).getReg();
1882 
1883   if (!Ty.isVector())
1884     return false;
1885 
1886   // Check if we have a vector of constants on RHS that we can select as the
1887   // immediate form.
1888   std::optional<int64_t> ImmVal = getVectorSHLImm(Ty, Src2Reg, MRI);
1889 
1890   unsigned Opc = 0;
1891   if (Ty == LLT::fixed_vector(2, 64)) {
1892     Opc = ImmVal ? AArch64::SHLv2i64_shift : AArch64::USHLv2i64;
1893   } else if (Ty == LLT::fixed_vector(4, 32)) {
1894     Opc = ImmVal ? AArch64::SHLv4i32_shift : AArch64::USHLv4i32;
1895   } else if (Ty == LLT::fixed_vector(2, 32)) {
1896     Opc = ImmVal ? AArch64::SHLv2i32_shift : AArch64::USHLv2i32;
1897   } else if (Ty == LLT::fixed_vector(4, 16)) {
1898     Opc = ImmVal ? AArch64::SHLv4i16_shift : AArch64::USHLv4i16;
1899   } else if (Ty == LLT::fixed_vector(8, 16)) {
1900     Opc = ImmVal ? AArch64::SHLv8i16_shift : AArch64::USHLv8i16;
1901   } else if (Ty == LLT::fixed_vector(16, 8)) {
1902     Opc = ImmVal ? AArch64::SHLv16i8_shift : AArch64::USHLv16i8;
1903   } else if (Ty == LLT::fixed_vector(8, 8)) {
1904     Opc = ImmVal ? AArch64::SHLv8i8_shift : AArch64::USHLv8i8;
1905   } else {
1906     LLVM_DEBUG(dbgs() << "Unhandled G_SHL type");
1907     return false;
1908   }
1909 
1910   auto Shl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg});
1911   if (ImmVal)
1912     Shl.addImm(*ImmVal);
1913   else
1914     Shl.addUse(Src2Reg);
1915   constrainSelectedInstRegOperands(*Shl, TII, TRI, RBI);
1916   I.eraseFromParent();
1917   return true;
1918 }
1919 
1920 bool AArch64InstructionSelector::selectVectorAshrLshr(
1921     MachineInstr &I, MachineRegisterInfo &MRI) {
1922   assert(I.getOpcode() == TargetOpcode::G_ASHR ||
1923          I.getOpcode() == TargetOpcode::G_LSHR);
1924   Register DstReg = I.getOperand(0).getReg();
1925   const LLT Ty = MRI.getType(DstReg);
1926   Register Src1Reg = I.getOperand(1).getReg();
1927   Register Src2Reg = I.getOperand(2).getReg();
1928 
1929   if (!Ty.isVector())
1930     return false;
1931 
1932   bool IsASHR = I.getOpcode() == TargetOpcode::G_ASHR;
1933 
1934   // We expect the immediate case to be lowered in the PostLegalCombiner to
1935   // AArch64ISD::VASHR or AArch64ISD::VLSHR equivalents.
1936 
1937   // There is no shift-right-by-register instruction; instead, the
1938   // shift-left-by-register instruction takes a signed shift amount, where
1939   // negative values specify a right shift.
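       // e.g. a right shift by 3 in every lane is emitted as SSHL/USHL with a
       // shift amount of -3 in every lane, which is what the NEG below produces
       // from Src2Reg.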
1940 
1941   unsigned Opc = 0;
1942   unsigned NegOpc = 0;
1943   const TargetRegisterClass *RC =
1944       getRegClassForTypeOnBank(Ty, RBI.getRegBank(AArch64::FPRRegBankID));
1945   if (Ty == LLT::fixed_vector(2, 64)) {
1946     Opc = IsASHR ? AArch64::SSHLv2i64 : AArch64::USHLv2i64;
1947     NegOpc = AArch64::NEGv2i64;
1948   } else if (Ty == LLT::fixed_vector(4, 32)) {
1949     Opc = IsASHR ? AArch64::SSHLv4i32 : AArch64::USHLv4i32;
1950     NegOpc = AArch64::NEGv4i32;
1951   } else if (Ty == LLT::fixed_vector(2, 32)) {
1952     Opc = IsASHR ? AArch64::SSHLv2i32 : AArch64::USHLv2i32;
1953     NegOpc = AArch64::NEGv2i32;
1954   } else if (Ty == LLT::fixed_vector(4, 16)) {
1955     Opc = IsASHR ? AArch64::SSHLv4i16 : AArch64::USHLv4i16;
1956     NegOpc = AArch64::NEGv4i16;
1957   } else if (Ty == LLT::fixed_vector(8, 16)) {
1958     Opc = IsASHR ? AArch64::SSHLv8i16 : AArch64::USHLv8i16;
1959     NegOpc = AArch64::NEGv8i16;
1960   } else if (Ty == LLT::fixed_vector(16, 8)) {
1961     Opc = IsASHR ? AArch64::SSHLv16i8 : AArch64::USHLv16i8;
1962     NegOpc = AArch64::NEGv16i8;
1963   } else if (Ty == LLT::fixed_vector(8, 8)) {
1964     Opc = IsASHR ? AArch64::SSHLv8i8 : AArch64::USHLv8i8;
1965     NegOpc = AArch64::NEGv8i8;
1966   } else {
1967     LLVM_DEBUG(dbgs() << "Unhandled G_ASHR type");
1968     return false;
1969   }
1970 
1971   auto Neg = MIB.buildInstr(NegOpc, {RC}, {Src2Reg});
1972   constrainSelectedInstRegOperands(*Neg, TII, TRI, RBI);
1973   auto SShl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg, Neg});
1974   constrainSelectedInstRegOperands(*SShl, TII, TRI, RBI);
1975   I.eraseFromParent();
1976   return true;
1977 }
1978 
1979 bool AArch64InstructionSelector::selectVaStartAAPCS(
1980     MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
1981   return false;
1982 }
1983 
1984 bool AArch64InstructionSelector::selectVaStartDarwin(
1985     MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
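       // On Darwin the va_list is a single pointer, so va_start only has to
       // compute the start address of the variadic argument area (an ADDXri on
       // the appropriate frame index) and store it through the va_list operand
       // (STRXui).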
1986   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
1987   Register ListReg = I.getOperand(0).getReg();
1988 
1989   Register ArgsAddrReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
1990 
1991   int FrameIdx = FuncInfo->getVarArgsStackIndex();
1992   if (MF.getSubtarget<AArch64Subtarget>().isCallingConvWin64(
1993           MF.getFunction().getCallingConv())) {
1994     FrameIdx = FuncInfo->getVarArgsGPRSize() > 0
1995                    ? FuncInfo->getVarArgsGPRIndex()
1996                    : FuncInfo->getVarArgsStackIndex();
1997   }
1998 
1999   auto MIB =
2000       BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::ADDXri))
2001           .addDef(ArgsAddrReg)
2002           .addFrameIndex(FrameIdx)
2003           .addImm(0)
2004           .addImm(0);
2005 
2006   constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2007 
2008   MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::STRXui))
2009             .addUse(ArgsAddrReg)
2010             .addUse(ListReg)
2011             .addImm(0)
2012             .addMemOperand(*I.memoperands_begin());
2013 
2014   constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2015   I.eraseFromParent();
2016   return true;
2017 }
2018 
2019 void AArch64InstructionSelector::materializeLargeCMVal(
2020     MachineInstr &I, const Value *V, unsigned OpFlags) {
2021   MachineBasicBlock &MBB = *I.getParent();
2022   MachineFunction &MF = *MBB.getParent();
2023   MachineRegisterInfo &MRI = MF.getRegInfo();
2024 
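       // Build the 64-bit value 16 bits at a time: MOVZ materializes bits
       // [15:0] and zeroes the rest, then three MOVKs insert bits [31:16],
       // [47:32] and [63:48]. The MO_G0..MO_G3 flags select which 16-bit chunk
       // of the symbol's address each instruction receives, and MO_NC suppresses
       // overflow checking on the non-final chunks.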
2025   auto MovZ = MIB.buildInstr(AArch64::MOVZXi, {&AArch64::GPR64RegClass}, {});
2026   MovZ->addOperand(MF, I.getOperand(1));
2027   MovZ->getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_G0 |
2028                                      AArch64II::MO_NC);
2029   MovZ->addOperand(MF, MachineOperand::CreateImm(0));
2030   constrainSelectedInstRegOperands(*MovZ, TII, TRI, RBI);
2031 
2032   auto BuildMovK = [&](Register SrcReg, unsigned char Flags, unsigned Offset,
2033                        Register ForceDstReg) {
2034     Register DstReg = ForceDstReg
2035                           ? ForceDstReg
2036                           : MRI.createVirtualRegister(&AArch64::GPR64RegClass);
2037     auto MovI = MIB.buildInstr(AArch64::MOVKXi).addDef(DstReg).addUse(SrcReg);
2038     if (auto *GV = dyn_cast<GlobalValue>(V)) {
2039       MovI->addOperand(MF, MachineOperand::CreateGA(
2040                                GV, MovZ->getOperand(1).getOffset(), Flags));
2041     } else {
2042       MovI->addOperand(
2043           MF, MachineOperand::CreateBA(cast<BlockAddress>(V),
2044                                        MovZ->getOperand(1).getOffset(), Flags));
2045     }
2046     MovI->addOperand(MF, MachineOperand::CreateImm(Offset));
2047     constrainSelectedInstRegOperands(*MovI, TII, TRI, RBI);
2048     return DstReg;
2049   };
2050   Register DstReg = BuildMovK(MovZ.getReg(0),
2051                               AArch64II::MO_G1 | AArch64II::MO_NC, 16, 0);
2052   DstReg = BuildMovK(DstReg, AArch64II::MO_G2 | AArch64II::MO_NC, 32, 0);
2053   BuildMovK(DstReg, AArch64II::MO_G3, 48, I.getOperand(0).getReg());
2054 }
2055 
2056 bool AArch64InstructionSelector::preISelLower(MachineInstr &I) {
2057   MachineBasicBlock &MBB = *I.getParent();
2058   MachineFunction &MF = *MBB.getParent();
2059   MachineRegisterInfo &MRI = MF.getRegInfo();
2060 
2061   switch (I.getOpcode()) {
2062   case TargetOpcode::G_STORE: {
2063     bool Changed = contractCrossBankCopyIntoStore(I, MRI);
2064     MachineOperand &SrcOp = I.getOperand(0);
2065     if (MRI.getType(SrcOp.getReg()).isPointer()) {
2066       // Allow matching with imported patterns for stores of pointers. Unlike
2067       // G_LOAD/G_PTR_ADD, we may not have selected all users. So, emit a copy
2068       // and constrain.
2069       auto Copy = MIB.buildCopy(LLT::scalar(64), SrcOp);
2070       Register NewSrc = Copy.getReg(0);
2071       SrcOp.setReg(NewSrc);
2072       RBI.constrainGenericRegister(NewSrc, AArch64::GPR64RegClass, MRI);
2073       Changed = true;
2074     }
2075     return Changed;
2076   }
2077   case TargetOpcode::G_PTR_ADD:
2078     return convertPtrAddToAdd(I, MRI);
2079   case TargetOpcode::G_LOAD: {
2080     // For scalar loads of pointers, we try to convert the dest type from p0
2081     // to s64 so that our imported patterns can match. Like with the G_PTR_ADD
2082     // conversion, this should be ok because all users should have been
2083     // selected already, so the type doesn't matter for them.
2084     Register DstReg = I.getOperand(0).getReg();
2085     const LLT DstTy = MRI.getType(DstReg);
2086     if (!DstTy.isPointer())
2087       return false;
2088     MRI.setType(DstReg, LLT::scalar(64));
2089     return true;
2090   }
2091   case AArch64::G_DUP: {
2092     // Convert the type from p0 to s64 to help selection.
2093     LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2094     if (!DstTy.getElementType().isPointer())
2095       return false;
2096     auto NewSrc = MIB.buildCopy(LLT::scalar(64), I.getOperand(1).getReg());
2097     MRI.setType(I.getOperand(0).getReg(),
2098                 DstTy.changeElementType(LLT::scalar(64)));
2099     MRI.setRegClass(NewSrc.getReg(0), &AArch64::GPR64RegClass);
2100     I.getOperand(1).setReg(NewSrc.getReg(0));
2101     return true;
2102   }
2103   case TargetOpcode::G_UITOFP:
2104   case TargetOpcode::G_SITOFP: {
2105     // If both source and destination regbanks are FPR, then convert the opcode
2106     // to G_SITOF/G_UITOF so that the importer can select it to an fpr variant.
2107     // Otherwise, it ends up matching an fpr/gpr variant and adding a cross-bank
2108     // copy.
2109     Register SrcReg = I.getOperand(1).getReg();
2110     LLT SrcTy = MRI.getType(SrcReg);
2111     LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2112     if (SrcTy.isVector() || SrcTy.getSizeInBits() != DstTy.getSizeInBits())
2113       return false;
2114 
2115     if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::FPRRegBankID) {
2116       if (I.getOpcode() == TargetOpcode::G_SITOFP)
2117         I.setDesc(TII.get(AArch64::G_SITOF));
2118       else
2119         I.setDesc(TII.get(AArch64::G_UITOF));
2120       return true;
2121     }
2122     return false;
2123   }
2124   default:
2125     return false;
2126   }
2127 }
2128 
2129 /// This lowering tries to look for G_PTR_ADD instructions and then converts
2130 /// them to a standard G_ADD with a COPY on the source.
2131 ///
2132 /// The motivation behind this is to expose the add semantics to the imported
2133 /// tablegen patterns. We shouldn't need to check for uses being loads/stores,
2134 /// because the selector works bottom up, uses before defs. By the time we
2135 /// end up trying to select a G_PTR_ADD, we should have already attempted to
2136 /// fold this into addressing modes and failed to do so.
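     ///
     /// e.g.   %dst(p0) = G_PTR_ADD %base, off
     /// becomes
     ///        %intbase(s64) = G_PTRTOINT %base
     ///        %dst(s64) = G_ADD %intbase, off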
2137 bool AArch64InstructionSelector::convertPtrAddToAdd(
2138     MachineInstr &I, MachineRegisterInfo &MRI) {
2139   assert(I.getOpcode() == TargetOpcode::G_PTR_ADD && "Expected G_PTR_ADD");
2140   Register DstReg = I.getOperand(0).getReg();
2141   Register AddOp1Reg = I.getOperand(1).getReg();
2142   const LLT PtrTy = MRI.getType(DstReg);
2143   if (PtrTy.getAddressSpace() != 0)
2144     return false;
2145 
2146   const LLT CastPtrTy =
2147       PtrTy.isVector() ? LLT::fixed_vector(2, 64) : LLT::scalar(64);
2148   auto PtrToInt = MIB.buildPtrToInt(CastPtrTy, AddOp1Reg);
2149   // Set regbanks on the registers.
2150   if (PtrTy.isVector())
2151     MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::FPRRegBankID));
2152   else
2153     MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID));
2154 
2155   // Now turn the %dst(p0) = G_PTR_ADD %base, off into:
2156   // %dst(intty) = G_ADD %intbase, off
2157   I.setDesc(TII.get(TargetOpcode::G_ADD));
2158   MRI.setType(DstReg, CastPtrTy);
2159   I.getOperand(1).setReg(PtrToInt.getReg(0));
2160   if (!select(*PtrToInt)) {
2161     LLVM_DEBUG(dbgs() << "Failed to select G_PTRTOINT in convertPtrAddToAdd");
2162     return false;
2163   }
2164 
2165   // Also take the opportunity here to try to do some optimization.
2166   // Try to convert this into a G_SUB if the offset is a 0-x negate idiom.
2167   Register NegatedReg;
2168   if (!mi_match(I.getOperand(2).getReg(), MRI, m_Neg(m_Reg(NegatedReg))))
2169     return true;
2170   I.getOperand(2).setReg(NegatedReg);
2171   I.setDesc(TII.get(TargetOpcode::G_SUB));
2172   return true;
2173 }
2174 
2175 bool AArch64InstructionSelector::earlySelectSHL(MachineInstr &I,
2176                                                 MachineRegisterInfo &MRI) {
2177   // We try to match the immediate variant of LSL, which is actually an alias
2178   // for a special case of UBFM. Otherwise, we fall back to the imported
2179   // selector which will match the register variant.
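       // (LSL Wd, Wn, #sh is an alias of UBFM Wd, Wn, #(-sh MOD 32), #(31 - sh),
       // and similarly for the 64-bit form; the selectShiftA_*/selectShiftB_*
       // renderers below supply those two immediates.)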
2180   assert(I.getOpcode() == TargetOpcode::G_SHL && "unexpected op");
2181   const auto &MO = I.getOperand(2);
2182   auto VRegAndVal = getIConstantVRegVal(MO.getReg(), MRI);
2183   if (!VRegAndVal)
2184     return false;
2185 
2186   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2187   if (DstTy.isVector())
2188     return false;
2189   bool Is64Bit = DstTy.getSizeInBits() == 64;
2190   auto Imm1Fn = Is64Bit ? selectShiftA_64(MO) : selectShiftA_32(MO);
2191   auto Imm2Fn = Is64Bit ? selectShiftB_64(MO) : selectShiftB_32(MO);
2192 
2193   if (!Imm1Fn || !Imm2Fn)
2194     return false;
2195 
2196   auto NewI =
2197       MIB.buildInstr(Is64Bit ? AArch64::UBFMXri : AArch64::UBFMWri,
2198                      {I.getOperand(0).getReg()}, {I.getOperand(1).getReg()});
2199 
2200   for (auto &RenderFn : *Imm1Fn)
2201     RenderFn(NewI);
2202   for (auto &RenderFn : *Imm2Fn)
2203     RenderFn(NewI);
2204 
2205   I.eraseFromParent();
2206   return constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI);
2207 }
2208 
2209 bool AArch64InstructionSelector::contractCrossBankCopyIntoStore(
2210     MachineInstr &I, MachineRegisterInfo &MRI) {
2211   assert(I.getOpcode() == TargetOpcode::G_STORE && "Expected G_STORE");
2212   // If we're storing a scalar, it doesn't matter what register bank that
2213   // scalar is on. All that matters is the size.
2214   //
2215   // So, if we see something like this (with a 32-bit scalar as an example):
2216   //
2217   // %x:gpr(s32) = ... something ...
2218   // %y:fpr(s32) = COPY %x:gpr(s32)
2219   // G_STORE %y:fpr(s32)
2220   //
2221   // We can fix this up into something like this:
2222   //
2223   // G_STORE %x:gpr(s32)
2224   //
2225   // And then continue the selection process normally.
2226   Register DefDstReg = getSrcRegIgnoringCopies(I.getOperand(0).getReg(), MRI);
2227   if (!DefDstReg.isValid())
2228     return false;
2229   LLT DefDstTy = MRI.getType(DefDstReg);
2230   Register StoreSrcReg = I.getOperand(0).getReg();
2231   LLT StoreSrcTy = MRI.getType(StoreSrcReg);
2232 
2233   // If we get something strange like a physical register, then we shouldn't
2234   // go any further.
2235   if (!DefDstTy.isValid())
2236     return false;
2237 
2238   // Are the source and dst types the same size?
2239   if (DefDstTy.getSizeInBits() != StoreSrcTy.getSizeInBits())
2240     return false;
2241 
2242   if (RBI.getRegBank(StoreSrcReg, MRI, TRI) ==
2243       RBI.getRegBank(DefDstReg, MRI, TRI))
2244     return false;
2245 
2246   // We have a cross-bank copy, which is entering a store. Let's fold it.
2247   I.getOperand(0).setReg(DefDstReg);
2248   return true;
2249 }
2250 
2251 bool AArch64InstructionSelector::earlySelect(MachineInstr &I) {
2252   assert(I.getParent() && "Instruction should be in a basic block!");
2253   assert(I.getParent()->getParent() && "Instruction should be in a function!");
2254 
2255   MachineBasicBlock &MBB = *I.getParent();
2256   MachineFunction &MF = *MBB.getParent();
2257   MachineRegisterInfo &MRI = MF.getRegInfo();
2258 
2259   switch (I.getOpcode()) {
2260   case AArch64::G_DUP: {
2261     // Before selecting a DUP instruction, check if it is better selected as a
2262     // MOV or load from a constant pool.
2263     Register Src = I.getOperand(1).getReg();
2264     auto ValAndVReg = getAnyConstantVRegValWithLookThrough(Src, MRI);
2265     if (!ValAndVReg)
2266       return false;
2267     LLVMContext &Ctx = MF.getFunction().getContext();
2268     Register Dst = I.getOperand(0).getReg();
2269     auto *CV = ConstantDataVector::getSplat(
2270         MRI.getType(Dst).getNumElements(),
2271         ConstantInt::get(Type::getIntNTy(Ctx, MRI.getType(Src).getSizeInBits()),
2272                          ValAndVReg->Value));
2273     if (!emitConstantVector(Dst, CV, MIB, MRI))
2274       return false;
2275     I.eraseFromParent();
2276     return true;
2277   }
2278   case TargetOpcode::G_SEXT:
2279     // Check for i64 sext(i32 vector_extract) prior to tablegen to select SMOV
2280     // over a normal extend.
2281     if (selectUSMovFromExtend(I, MRI))
2282       return true;
2283     return false;
2284   case TargetOpcode::G_BR:
2285     return false;
2286   case TargetOpcode::G_SHL:
2287     return earlySelectSHL(I, MRI);
2288   case TargetOpcode::G_CONSTANT: {
2289     bool IsZero = false;
2290     if (I.getOperand(1).isCImm())
2291       IsZero = I.getOperand(1).getCImm()->isZero();
2292     else if (I.getOperand(1).isImm())
2293       IsZero = I.getOperand(1).getImm() == 0;
2294 
2295     if (!IsZero)
2296       return false;
2297 
2298     Register DefReg = I.getOperand(0).getReg();
2299     LLT Ty = MRI.getType(DefReg);
2300     if (Ty.getSizeInBits() == 64) {
2301       I.getOperand(1).ChangeToRegister(AArch64::XZR, false);
2302       RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI);
2303     } else if (Ty.getSizeInBits() == 32) {
2304       I.getOperand(1).ChangeToRegister(AArch64::WZR, false);
2305       RBI.constrainGenericRegister(DefReg, AArch64::GPR32RegClass, MRI);
2306     } else
2307       return false;
2308 
2309     I.setDesc(TII.get(TargetOpcode::COPY));
2310     return true;
2311   }
2312 
2313   case TargetOpcode::G_ADD: {
2314     // Check if this is being fed by a G_ICMP on either side.
2315     //
2316     // (cmp pred, x, y) + z
2317     //
2318     // In the above case, when the cmp is true, we increment z by 1. So, we can
2319     // fold the add into the cset for the cmp by using cinc.
2320     //
2321     // FIXME: This would probably be a lot nicer in PostLegalizerLowering.
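         //
         // Concretely, we emit the integer compare to set NZCV and then
         // CSINC dst, z, z, inv(pred) (the CINC dst, z, pred alias), which
         // yields z + 1 when pred holds and z otherwise.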
2322     Register AddDst = I.getOperand(0).getReg();
2323     Register AddLHS = I.getOperand(1).getReg();
2324     Register AddRHS = I.getOperand(2).getReg();
2325     // Only handle scalars.
2326     LLT Ty = MRI.getType(AddLHS);
2327     if (Ty.isVector())
2328       return false;
2329     // Since G_ICMP is modeled as ADDS/SUBS/ANDS, we can handle 32 bits or 64
2330     // bits.
2331     unsigned Size = Ty.getSizeInBits();
2332     if (Size != 32 && Size != 64)
2333       return false;
2334     auto MatchCmp = [&](Register Reg) -> MachineInstr * {
2335       if (!MRI.hasOneNonDBGUse(Reg))
2336         return nullptr;
2337       // If the LHS of the add is 32 bits, then we want to fold a 32-bit
2338       // compare.
2339       if (Size == 32)
2340         return getOpcodeDef(TargetOpcode::G_ICMP, Reg, MRI);
2341       // We model scalar compares using 32-bit destinations right now.
2342       // If it's a 64-bit compare, it'll have 64-bit sources.
2343       Register ZExt;
2344       if (!mi_match(Reg, MRI,
2345                     m_OneNonDBGUse(m_GZExt(m_OneNonDBGUse(m_Reg(ZExt))))))
2346         return nullptr;
2347       auto *Cmp = getOpcodeDef(TargetOpcode::G_ICMP, ZExt, MRI);
2348       if (!Cmp ||
2349           MRI.getType(Cmp->getOperand(2).getReg()).getSizeInBits() != 64)
2350         return nullptr;
2351       return Cmp;
2352     };
2353     // Try to match
2354     // z + (cmp pred, x, y)
2355     MachineInstr *Cmp = MatchCmp(AddRHS);
2356     if (!Cmp) {
2357       // (cmp pred, x, y) + z
2358       std::swap(AddLHS, AddRHS);
2359       Cmp = MatchCmp(AddRHS);
2360       if (!Cmp)
2361         return false;
2362     }
2363     auto &PredOp = Cmp->getOperand(1);
2364     auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate());
2365     const AArch64CC::CondCode InvCC =
2366         changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred));
2367     MIB.setInstrAndDebugLoc(I);
2368     emitIntegerCompare(/*LHS=*/Cmp->getOperand(2),
2369                        /*RHS=*/Cmp->getOperand(3), PredOp, MIB);
2370     emitCSINC(/*Dst=*/AddDst, /*Src =*/AddLHS, /*Src2=*/AddLHS, InvCC, MIB);
2371     I.eraseFromParent();
2372     return true;
2373   }
2374   case TargetOpcode::G_OR: {
2375     // Look for operations that take the lower `Width=Size-ShiftImm` bits of
2376     // `ShiftSrc` and insert them into the upper `Width` bits of `MaskSrc` via
2377     // shifting and masking that we can replace with a BFI (encoded as a BFM).
2378     Register Dst = I.getOperand(0).getReg();
2379     LLT Ty = MRI.getType(Dst);
2380 
2381     if (!Ty.isScalar())
2382       return false;
2383 
2384     unsigned Size = Ty.getSizeInBits();
2385     if (Size != 32 && Size != 64)
2386       return false;
2387 
2388     Register ShiftSrc;
2389     int64_t ShiftImm;
2390     Register MaskSrc;
2391     int64_t MaskImm;
2392     if (!mi_match(
2393             Dst, MRI,
2394             m_GOr(m_OneNonDBGUse(m_GShl(m_Reg(ShiftSrc), m_ICst(ShiftImm))),
2395                   m_OneNonDBGUse(m_GAnd(m_Reg(MaskSrc), m_ICst(MaskImm))))))
2396       return false;
2397 
2398     if (ShiftImm > Size || ((1ULL << ShiftImm) - 1ULL) != uint64_t(MaskImm))
2399       return false;
2400 
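         // With Immr = Size - ShiftImm (i.e. -ShiftImm modulo Size) and
         // Imms = Width - 1, this BFM is the encoding of
         // BFI Dst, ShiftSrc, #ShiftImm, #Width: it inserts the low Width bits
         // of ShiftSrc at bit position ShiftImm of MaskSrc.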
2401     int64_t Immr = Size - ShiftImm;
2402     int64_t Imms = Size - ShiftImm - 1;
2403     unsigned Opc = Size == 32 ? AArch64::BFMWri : AArch64::BFMXri;
2404     emitInstr(Opc, {Dst}, {MaskSrc, ShiftSrc, Immr, Imms}, MIB);
2405     I.eraseFromParent();
2406     return true;
2407   }
2408   case TargetOpcode::G_FENCE: {
2409     if (I.getOperand(1).getImm() == 0)
2410       BuildMI(MBB, I, MIMetadata(I), TII.get(TargetOpcode::MEMBARRIER));
2411     else
2412       BuildMI(MBB, I, MIMetadata(I), TII.get(AArch64::DMB))
2413           .addImm(I.getOperand(0).getImm() == 4 ? 0x9 : 0xb);
2414     I.eraseFromParent();
2415     return true;
2416   }
2417   default:
2418     return false;
2419   }
2420 }
2421 
2422 bool AArch64InstructionSelector::select(MachineInstr &I) {
2423   assert(I.getParent() && "Instruction should be in a basic block!");
2424   assert(I.getParent()->getParent() && "Instruction should be in a function!");
2425 
2426   MachineBasicBlock &MBB = *I.getParent();
2427   MachineFunction &MF = *MBB.getParent();
2428   MachineRegisterInfo &MRI = MF.getRegInfo();
2429 
2430   const AArch64Subtarget *Subtarget = &MF.getSubtarget<AArch64Subtarget>();
2431   if (Subtarget->requiresStrictAlign()) {
2432     // We don't support this feature yet.
2433     LLVM_DEBUG(dbgs() << "AArch64 GISel does not support strict-align yet\n");
2434     return false;
2435   }
2436 
2437   MIB.setInstrAndDebugLoc(I);
2438 
2439   unsigned Opcode = I.getOpcode();
2440   // G_PHI requires the same handling as PHI.
2441   if (!I.isPreISelOpcode() || Opcode == TargetOpcode::G_PHI) {
2442     // Certain non-generic instructions also need some special handling.
2443 
2444     if (Opcode == TargetOpcode::LOAD_STACK_GUARD)
2445       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2446 
2447     if (Opcode == TargetOpcode::PHI || Opcode == TargetOpcode::G_PHI) {
2448       const Register DefReg = I.getOperand(0).getReg();
2449       const LLT DefTy = MRI.getType(DefReg);
2450 
2451       const RegClassOrRegBank &RegClassOrBank =
2452         MRI.getRegClassOrRegBank(DefReg);
2453 
2454       const TargetRegisterClass *DefRC
2455         = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
2456       if (!DefRC) {
2457         if (!DefTy.isValid()) {
2458           LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
2459           return false;
2460         }
2461         const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
2462         DefRC = getRegClassForTypeOnBank(DefTy, RB);
2463         if (!DefRC) {
2464           LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
2465           return false;
2466         }
2467       }
2468 
2469       I.setDesc(TII.get(TargetOpcode::PHI));
2470 
2471       return RBI.constrainGenericRegister(DefReg, *DefRC, MRI);
2472     }
2473 
2474     if (I.isCopy())
2475       return selectCopy(I, TII, MRI, TRI, RBI);
2476 
2477     if (I.isDebugInstr())
2478       return selectDebugInstr(I, MRI, RBI);
2479 
2480     return true;
2481   }
2482 
2483 
2484   if (I.getNumOperands() != I.getNumExplicitOperands()) {
2485     LLVM_DEBUG(
2486         dbgs() << "Generic instruction has unexpected implicit operands\n");
2487     return false;
2488   }
2489 
2490   // Try to do some lowering before we start instruction selecting. These
2491   // lowerings are purely transformations on the input G_MIR and so selection
2492   // must continue after any modification of the instruction.
2493   if (preISelLower(I)) {
2494     Opcode = I.getOpcode(); // The opcode may have been modified, refresh it.
2495   }
2496 
2497   // There may be patterns that the importer can't deal with optimally, but
2498   // still selects into a suboptimal sequence, so our custom C++ selection
2499   // code later never gets a chance to work on them. Therefore, we attempt an
2500   // early selection here to give priority to certain selection routines
2501   // over the imported ones.
2502   if (earlySelect(I))
2503     return true;
2504 
2505   if (selectImpl(I, *CoverageInfo))
2506     return true;
2507 
2508   LLT Ty =
2509       I.getOperand(0).isReg() ? MRI.getType(I.getOperand(0).getReg()) : LLT{};
2510 
2511   switch (Opcode) {
2512   case TargetOpcode::G_SBFX:
2513   case TargetOpcode::G_UBFX: {
2514     static const unsigned OpcTable[2][2] = {
2515         {AArch64::UBFMWri, AArch64::UBFMXri},
2516         {AArch64::SBFMWri, AArch64::SBFMXri}};
2517     bool IsSigned = Opcode == TargetOpcode::G_SBFX;
2518     unsigned Size = Ty.getSizeInBits();
2519     unsigned Opc = OpcTable[IsSigned][Size == 64];
2520     auto Cst1 =
2521         getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), MRI);
2522     assert(Cst1 && "Should have gotten a constant for src 1?");
2523     auto Cst2 =
2524         getIConstantVRegValWithLookThrough(I.getOperand(3).getReg(), MRI);
2525     assert(Cst2 && "Should have gotten a constant for src 2?");
2526     auto LSB = Cst1->Value.getZExtValue();
2527     auto Width = Cst2->Value.getZExtValue();
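         // G_SBFX/G_UBFX take (lsb, width), while SBFX/UBFX are aliases of
         // SBFM/UBFM with immr = lsb and imms = lsb + width - 1 (the index of
         // the last extracted bit), which is what the two immediates below
         // encode.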
2528     auto BitfieldInst =
2529         MIB.buildInstr(Opc, {I.getOperand(0)}, {I.getOperand(1)})
2530             .addImm(LSB)
2531             .addImm(LSB + Width - 1);
2532     I.eraseFromParent();
2533     return constrainSelectedInstRegOperands(*BitfieldInst, TII, TRI, RBI);
2534   }
2535   case TargetOpcode::G_BRCOND:
2536     return selectCompareBranch(I, MF, MRI);
2537 
2538   case TargetOpcode::G_BRINDIRECT: {
2539     I.setDesc(TII.get(AArch64::BR));
2540     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2541   }
2542 
2543   case TargetOpcode::G_BRJT:
2544     return selectBrJT(I, MRI);
2545 
2546   case AArch64::G_ADD_LOW: {
2547     // This op may have been separated from its ADRP companion by the localizer
2548     // or some other code motion pass. Given that many CPUs will try to
2549     // macro fuse these operations anyway, select this into a MOVaddr pseudo
2550     // which will later be expanded into an ADRP+ADD pair after scheduling.
2551     MachineInstr *BaseMI = MRI.getVRegDef(I.getOperand(1).getReg());
2552     if (BaseMI->getOpcode() != AArch64::ADRP) {
2553       I.setDesc(TII.get(AArch64::ADDXri));
2554       I.addOperand(MachineOperand::CreateImm(0));
2555       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2556     }
2557     assert(TM.getCodeModel() == CodeModel::Small &&
2558            "Expected small code model");
2559     auto Op1 = BaseMI->getOperand(1);
2560     auto Op2 = I.getOperand(2);
2561     auto MovAddr = MIB.buildInstr(AArch64::MOVaddr, {I.getOperand(0)}, {})
2562                        .addGlobalAddress(Op1.getGlobal(), Op1.getOffset(),
2563                                          Op1.getTargetFlags())
2564                        .addGlobalAddress(Op2.getGlobal(), Op2.getOffset(),
2565                                          Op2.getTargetFlags());
2566     I.eraseFromParent();
2567     return constrainSelectedInstRegOperands(*MovAddr, TII, TRI, RBI);
2568   }
2569 
2570   case TargetOpcode::G_BSWAP: {
2571     // Handle vector types for G_BSWAP directly.
2572     Register DstReg = I.getOperand(0).getReg();
2573     LLT DstTy = MRI.getType(DstReg);
2574 
2575     // We should only get vector types here; everything else is handled by the
2576     // importer right now.
2577     if (!DstTy.isVector() || DstTy.getSizeInBits() > 128) {
2578       LLVM_DEBUG(dbgs() << "Dst type for G_BSWAP currently unsupported.\n");
2579       return false;
2580     }
2581 
2582     // Only handle 4 and 2 element vectors for now.
2583     // TODO: 16-bit elements.
2584     unsigned NumElts = DstTy.getNumElements();
2585     if (NumElts != 4 && NumElts != 2) {
2586       LLVM_DEBUG(dbgs() << "Unsupported number of elements for G_BSWAP.\n");
2587       return false;
2588     }
2589 
2590     // Choose the correct opcode for the supported types. Right now, that's
2591     // v2s32, v4s32, and v2s64.
2592     unsigned Opc = 0;
2593     unsigned EltSize = DstTy.getElementType().getSizeInBits();
2594     if (EltSize == 32)
2595       Opc = (DstTy.getNumElements() == 2) ? AArch64::REV32v8i8
2596                                           : AArch64::REV32v16i8;
2597     else if (EltSize == 64)
2598       Opc = AArch64::REV64v16i8;
2599 
2600     // We should always get something by the time we get here...
2601     assert(Opc != 0 && "Didn't get an opcode for G_BSWAP?");
2602 
2603     I.setDesc(TII.get(Opc));
2604     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2605   }
2606 
2607   case TargetOpcode::G_FCONSTANT:
2608   case TargetOpcode::G_CONSTANT: {
2609     const bool isFP = Opcode == TargetOpcode::G_FCONSTANT;
2610 
2611     const LLT s8 = LLT::scalar(8);
2612     const LLT s16 = LLT::scalar(16);
2613     const LLT s32 = LLT::scalar(32);
2614     const LLT s64 = LLT::scalar(64);
2615     const LLT s128 = LLT::scalar(128);
2616     const LLT p0 = LLT::pointer(0, 64);
2617 
2618     const Register DefReg = I.getOperand(0).getReg();
2619     const LLT DefTy = MRI.getType(DefReg);
2620     const unsigned DefSize = DefTy.getSizeInBits();
2621     const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
2622 
2623     // FIXME: Redundant check, but even less readable when factored out.
2624     if (isFP) {
2625       if (Ty != s16 && Ty != s32 && Ty != s64 && Ty != s128) {
2626         LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty
2627                           << " constant, expected: " << s16 << " or " << s32
2628                           << " or " << s64 << " or " << s128 << '\n');
2629         return false;
2630       }
2631 
2632       if (RB.getID() != AArch64::FPRRegBankID) {
2633         LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty
2634                           << " constant on bank: " << RB
2635                           << ", expected: FPR\n");
2636         return false;
2637       }
2638 
2639       // The case when we have 0.0 is covered by tablegen. Reject it here so we
2640       // can be sure tablegen works correctly and isn't rescued by this code.
2641       // However, 0.0 is not covered by tablegen for FP128, so we handle that
2642       // case in the code here.
2643       if (DefSize != 128 && I.getOperand(1).getFPImm()->isExactlyValue(0.0))
2644         return false;
2645     } else {
2646       // s32 and s64 are covered by tablegen.
2647       if (Ty != p0 && Ty != s8 && Ty != s16) {
2648         LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty
2649                           << " constant, expected: " << s32 << ", " << s64
2650                           << ", or " << p0 << '\n');
2651         return false;
2652       }
2653 
2654       if (RB.getID() != AArch64::GPRRegBankID) {
2655         LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty
2656                           << " constant on bank: " << RB
2657                           << ", expected: GPR\n");
2658         return false;
2659       }
2660     }
2661 
2662     if (isFP) {
2663       const TargetRegisterClass &FPRRC = *getRegClassForTypeOnBank(DefTy, RB);
2664       // For f16/f128 values, and illegal f32/f64 ones, emit a constant pool load.
2665       switch (DefSize) {
2666       default:
2667         llvm_unreachable("Unexpected destination size for G_FCONSTANT?");
2668       case 32:
2669       case 64: {
2670         bool OptForSize = shouldOptForSize(&MF);
2671         const auto &TLI = MF.getSubtarget().getTargetLowering();
2672         // If TLI says that this fpimm is illegal, then we'll expand to a
2673         // constant pool load.
2674         if (TLI->isFPImmLegal(I.getOperand(1).getFPImm()->getValueAPF(),
2675                               EVT::getFloatingPointVT(DefSize), OptForSize))
2676           break;
2677         [[fallthrough]];
2678       }
2679       case 16:
2680       case 128: {
2681         auto *FPImm = I.getOperand(1).getFPImm();
2682         auto *LoadMI = emitLoadFromConstantPool(FPImm, MIB);
2683         if (!LoadMI) {
2684           LLVM_DEBUG(dbgs() << "Failed to load double constant pool entry\n");
2685           return false;
2686         }
2687         MIB.buildCopy({DefReg}, {LoadMI->getOperand(0).getReg()});
2688         I.eraseFromParent();
2689         return RBI.constrainGenericRegister(DefReg, FPRRC, MRI);
2690       }
2691       }
2692 
2693       assert((DefSize == 32 || DefSize == 64) && "Unexpected const def size");
2694       // Either emit a FMOV, or emit a copy to emit a normal mov.
2695       const Register DefGPRReg = MRI.createVirtualRegister(
2696           DefSize == 32 ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass);
2697       MachineOperand &RegOp = I.getOperand(0);
2698       RegOp.setReg(DefGPRReg);
2699       MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator()));
2700       MIB.buildCopy({DefReg}, {DefGPRReg});
2701 
2702       if (!RBI.constrainGenericRegister(DefReg, FPRRC, MRI)) {
2703         LLVM_DEBUG(dbgs() << "Failed to constrain G_FCONSTANT def operand\n");
2704         return false;
2705       }
2706 
2707       MachineOperand &ImmOp = I.getOperand(1);
2708       // FIXME: Is going through int64_t always correct?
2709       ImmOp.ChangeToImmediate(
2710           ImmOp.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
2711     } else if (I.getOperand(1).isCImm()) {
2712       uint64_t Val = I.getOperand(1).getCImm()->getZExtValue();
2713       I.getOperand(1).ChangeToImmediate(Val);
2714     } else if (I.getOperand(1).isImm()) {
2715       uint64_t Val = I.getOperand(1).getImm();
2716       I.getOperand(1).ChangeToImmediate(Val);
2717     }
2718 
2719     const unsigned MovOpc =
2720         DefSize == 64 ? AArch64::MOVi64imm : AArch64::MOVi32imm;
2721     I.setDesc(TII.get(MovOpc));
2722     constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2723     return true;
2724   }
2725   case TargetOpcode::G_EXTRACT: {
2726     Register DstReg = I.getOperand(0).getReg();
2727     Register SrcReg = I.getOperand(1).getReg();
2728     LLT SrcTy = MRI.getType(SrcReg);
2729     LLT DstTy = MRI.getType(DstReg);
2730     (void)DstTy;
2731     unsigned SrcSize = SrcTy.getSizeInBits();
2732 
2733     if (SrcTy.getSizeInBits() > 64) {
2734       // This should be an extract of an s128, which is like a vector extract.
2735       if (SrcTy.getSizeInBits() != 128)
2736         return false;
2737       // Only support extracting 64 bits from an s128 at the moment.
2738       if (DstTy.getSizeInBits() != 64)
2739         return false;
2740 
2741       unsigned Offset = I.getOperand(2).getImm();
2742       if (Offset % 64 != 0)
2743         return false;
2744 
2745       // Check we have the right regbank always.
2746       const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
2747       const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
2748       assert(SrcRB.getID() == DstRB.getID() && "Wrong extract regbank!");
2749 
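      // On the GPR bank an s128 lives in an XSeqPairs register pair, so the
      // low and high 64-bit halves are just the sube64/subo64 subregisters.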
2750       if (SrcRB.getID() == AArch64::GPRRegBankID) {
2751         auto NewI =
2752             MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
2753                 .addUse(SrcReg, 0,
2754                         Offset == 0 ? AArch64::sube64 : AArch64::subo64);
2755         constrainOperandRegClass(MF, TRI, MRI, TII, RBI, *NewI,
2756                                  AArch64::GPR64RegClass, NewI->getOperand(0));
2757         I.eraseFromParent();
2758         return true;
2759       }
2760 
2761       // Emit the same code as a vector extract.
2762       // Offset must be a multiple of 64.
2763       unsigned LaneIdx = Offset / 64;
2764       MachineInstr *Extract = emitExtractVectorElt(
2765           DstReg, DstRB, LLT::scalar(64), SrcReg, LaneIdx, MIB);
2766       if (!Extract)
2767         return false;
2768       I.eraseFromParent();
2769       return true;
2770     }
2771 
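    // Scalar extracts lower to an unsigned bitfield move: the G_EXTRACT bit
    // offset (operand 2) is immr, and imms = offset + width - 1 is appended
    // below, so UBFM leaves the field in the low bits of the destination.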
2772     I.setDesc(TII.get(SrcSize == 64 ? AArch64::UBFMXri : AArch64::UBFMWri));
2773     MachineInstrBuilder(MF, I).addImm(I.getOperand(2).getImm() +
2774                                       Ty.getSizeInBits() - 1);
2775 
2776     if (SrcSize < 64) {
2777       assert(SrcSize == 32 && DstTy.getSizeInBits() == 16 &&
2778              "unexpected G_EXTRACT types");
2779       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2780     }
2781 
2782     DstReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
2783     MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator()));
2784     MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {})
2785         .addReg(DstReg, 0, AArch64::sub_32);
2786     RBI.constrainGenericRegister(I.getOperand(0).getReg(),
2787                                  AArch64::GPR32RegClass, MRI);
2788     I.getOperand(0).setReg(DstReg);
2789 
2790     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2791   }
2792 
2793   case TargetOpcode::G_INSERT: {
2794     LLT SrcTy = MRI.getType(I.getOperand(2).getReg());
2795     LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2796     unsigned DstSize = DstTy.getSizeInBits();
2797     // Larger inserts are vectors, same-size ones should be something else by
2798     // now (split up or turned into COPYs).
2799     if (Ty.getSizeInBits() > 64 || SrcTy.getSizeInBits() > 32)
2800       return false;
2801 
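    // Scalar inserts lower to a bitfield move in its BFI form: immr becomes
    // (DstSize - LSB) % DstSize and imms becomes Width - 1, which inserts the
    // low Width bits of the source at bit LSB of the destination.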
2802     I.setDesc(TII.get(DstSize == 64 ? AArch64::BFMXri : AArch64::BFMWri));
2803     unsigned LSB = I.getOperand(3).getImm();
2804     unsigned Width = MRI.getType(I.getOperand(2).getReg()).getSizeInBits();
2805     I.getOperand(3).setImm((DstSize - LSB) % DstSize);
2806     MachineInstrBuilder(MF, I).addImm(Width - 1);
2807 
2808     if (DstSize < 64) {
2809       assert(DstSize == 32 && SrcTy.getSizeInBits() == 16 &&
2810              "unexpected G_INSERT types");
2811       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2812     }
2813 
2814     Register SrcReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
2815     BuildMI(MBB, I.getIterator(), I.getDebugLoc(),
2816             TII.get(AArch64::SUBREG_TO_REG))
2817         .addDef(SrcReg)
2818         .addImm(0)
2819         .addUse(I.getOperand(2).getReg())
2820         .addImm(AArch64::sub_32);
2821     RBI.constrainGenericRegister(I.getOperand(2).getReg(),
2822                                  AArch64::GPR32RegClass, MRI);
2823     I.getOperand(2).setReg(SrcReg);
2824 
2825     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2826   }
2827   case TargetOpcode::G_FRAME_INDEX: {
2828     // allocas and G_FRAME_INDEX are only supported in addrspace(0).
2829     if (Ty != LLT::pointer(0, 64)) {
2830       LLVM_DEBUG(dbgs() << "G_FRAME_INDEX pointer has type: " << Ty
2831                         << ", expected: " << LLT::pointer(0, 64) << '\n');
2832       return false;
2833     }
2834     I.setDesc(TII.get(AArch64::ADDXri));
2835 
2836     // MOs for a #0 shifted immediate.
2837     I.addOperand(MachineOperand::CreateImm(0));
2838     I.addOperand(MachineOperand::CreateImm(0));
2839 
2840     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2841   }
2842 
2843   case TargetOpcode::G_GLOBAL_VALUE: {
2844     auto GV = I.getOperand(1).getGlobal();
2845     if (GV->isThreadLocal())
2846       return selectTLSGlobalValue(I, MRI);
2847 
2848     unsigned OpFlags = STI.ClassifyGlobalReference(GV, TM);
2849     if (OpFlags & AArch64II::MO_GOT) {
2850       I.setDesc(TII.get(AArch64::LOADgot));
2851       I.getOperand(1).setTargetFlags(OpFlags);
2852     } else if (TM.getCodeModel() == CodeModel::Large &&
2853                !TM.isPositionIndependent()) {
2854       // Materialize the global using movz/movk instructions.
2855       materializeLargeCMVal(I, GV, OpFlags);
2856       I.eraseFromParent();
2857       return true;
2858     } else if (TM.getCodeModel() == CodeModel::Tiny) {
2859       I.setDesc(TII.get(AArch64::ADR));
2860       I.getOperand(1).setTargetFlags(OpFlags);
2861     } else {
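      // Otherwise use the MOVaddr pseudo, which is later expanded into an
      // ADRP + ADD of the page and page-offset parts of the address.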
2862       I.setDesc(TII.get(AArch64::MOVaddr));
2863       I.getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_PAGE);
2864       MachineInstrBuilder MIB(MF, I);
2865       MIB.addGlobalAddress(GV, I.getOperand(1).getOffset(),
2866                            OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
2867     }
2868     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2869   }
2870 
2871   case TargetOpcode::G_ZEXTLOAD:
2872   case TargetOpcode::G_LOAD:
2873   case TargetOpcode::G_STORE: {
2874     GLoadStore &LdSt = cast<GLoadStore>(I);
2875     bool IsZExtLoad = I.getOpcode() == TargetOpcode::G_ZEXTLOAD;
2876     LLT PtrTy = MRI.getType(LdSt.getPointerReg());
2877 
2878     if (PtrTy != LLT::pointer(0, 64)) {
2879       LLVM_DEBUG(dbgs() << "Load/Store pointer has type: " << PtrTy
2880                         << ", expected: " << LLT::pointer(0, 64) << '\n');
2881       return false;
2882     }
2883 
2884     uint64_t MemSizeInBytes = LdSt.getMemSize();
2885     unsigned MemSizeInBits = LdSt.getMemSizeInBits();
2886     AtomicOrdering Order = LdSt.getMMO().getSuccessOrdering();
2887 
2888     // Need special instructions for atomics that affect ordering.
2889     if (Order != AtomicOrdering::NotAtomic &&
2890         Order != AtomicOrdering::Unordered &&
2891         Order != AtomicOrdering::Monotonic) {
2892       assert(!isa<GZExtLoad>(LdSt));
2893       if (MemSizeInBytes > 64)
2894         return false;
2895 
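      // The opcode tables below are indexed by Log2 of the access size, i.e.
      // 1/2/4/8-byte accesses map to the B/H/W/X variants.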
2896       if (isa<GLoad>(LdSt)) {
2897         static constexpr unsigned LDAPROpcodes[] = {
2898             AArch64::LDAPRB, AArch64::LDAPRH, AArch64::LDAPRW, AArch64::LDAPRX};
2899         static constexpr unsigned LDAROpcodes[] = {
2900             AArch64::LDARB, AArch64::LDARH, AArch64::LDARW, AArch64::LDARX};
2901         ArrayRef<unsigned> Opcodes =
2902             STI.hasRCPC() && Order != AtomicOrdering::SequentiallyConsistent
2903                 ? LDAPROpcodes
2904                 : LDAROpcodes;
2905         I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)]));
2906       } else {
2907         static constexpr unsigned Opcodes[] = {AArch64::STLRB, AArch64::STLRH,
2908                                                AArch64::STLRW, AArch64::STLRX};
2909         Register ValReg = LdSt.getReg(0);
2910         if (MRI.getType(ValReg).getSizeInBits() == 64 && MemSizeInBits != 64) {
2911           // Emit a subreg copy of 32 bits.
2912           Register NewVal = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
2913           MIB.buildInstr(TargetOpcode::COPY, {NewVal}, {})
2914               .addReg(I.getOperand(0).getReg(), 0, AArch64::sub_32);
2915           I.getOperand(0).setReg(NewVal);
2916         }
2917         I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)]));
2918       }
2919       constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2920       return true;
2921     }
2922 
2923 #ifndef NDEBUG
2924     const Register PtrReg = LdSt.getPointerReg();
2925     const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI);
2926     // Check that the pointer register is valid.
2927     assert(PtrRB.getID() == AArch64::GPRRegBankID &&
2928            "Load/Store pointer operand isn't a GPR");
2929     assert(MRI.getType(PtrReg).isPointer() &&
2930            "Load/Store pointer operand isn't a pointer");
2931 #endif
2932 
2933     const Register ValReg = LdSt.getReg(0);
2934     const LLT ValTy = MRI.getType(ValReg);
2935     const RegisterBank &RB = *RBI.getRegBank(ValReg, MRI, TRI);
2936 
2937     // The code below doesn't support truncating stores, so we need to split it
2938     // again.
2939     if (isa<GStore>(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) {
2940       unsigned SubReg;
2941       LLT MemTy = LdSt.getMMO().getMemoryType();
2942       auto *RC = getRegClassForTypeOnBank(MemTy, RB);
2943       if (!getSubRegForClass(RC, TRI, SubReg))
2944         return false;
2945 
2946       // Generate a subreg copy.
2947       auto Copy = MIB.buildInstr(TargetOpcode::COPY, {MemTy}, {})
2948                       .addReg(ValReg, 0, SubReg)
2949                       .getReg(0);
2950       RBI.constrainGenericRegister(Copy, *RC, MRI);
2951       LdSt.getOperand(0).setReg(Copy);
2952     } else if (isa<GLoad>(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) {
2953       // If this is an any-extending load from the FPR bank, split it into a
2954       // regular load + extend.
2955       if (RB.getID() == AArch64::FPRRegBankID) {
2956         unsigned SubReg;
2957         LLT MemTy = LdSt.getMMO().getMemoryType();
2958         auto *RC = getRegClassForTypeOnBank(MemTy, RB);
2959         if (!getSubRegForClass(RC, TRI, SubReg))
2960           return false;
2961         Register OldDst = LdSt.getReg(0);
2962         Register NewDst =
2963             MRI.createGenericVirtualRegister(LdSt.getMMO().getMemoryType());
2964         LdSt.getOperand(0).setReg(NewDst);
2965         MRI.setRegBank(NewDst, RB);
2966         // Generate a SUBREG_TO_REG to extend it.
2967         MIB.setInsertPt(MIB.getMBB(), std::next(LdSt.getIterator()));
2968         MIB.buildInstr(AArch64::SUBREG_TO_REG, {OldDst}, {})
2969             .addImm(0)
2970             .addUse(NewDst)
2971             .addImm(SubReg);
2972         auto SubRegRC = getRegClassForTypeOnBank(MRI.getType(OldDst), RB);
2973         RBI.constrainGenericRegister(OldDst, *SubRegRC, MRI);
2974         MIB.setInstr(LdSt);
2975       }
2976     }
2977 
2978     // Helper lambda for partially selecting I. Either returns the original
2979     // instruction with an updated opcode, or a new instruction.
2980     auto SelectLoadStoreAddressingMode = [&]() -> MachineInstr * {
2981       bool IsStore = isa<GStore>(I);
2982       const unsigned NewOpc =
2983           selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemSizeInBits);
2984       if (NewOpc == I.getOpcode())
2985         return nullptr;
2986       // Check if we can fold anything into the addressing mode.
2987       auto AddrModeFns =
2988           selectAddrModeIndexed(I.getOperand(1), MemSizeInBytes);
2989       if (!AddrModeFns) {
2990         // Can't fold anything. Use the original instruction.
2991         I.setDesc(TII.get(NewOpc));
2992         I.addOperand(MachineOperand::CreateImm(0));
2993         return &I;
2994       }
2995 
2996       // Folded something. Create a new instruction and return it.
2997       auto NewInst = MIB.buildInstr(NewOpc, {}, {}, I.getFlags());
2998       Register CurValReg = I.getOperand(0).getReg();
2999       IsStore ? NewInst.addUse(CurValReg) : NewInst.addDef(CurValReg);
3000       NewInst.cloneMemRefs(I);
3001       for (auto &Fn : *AddrModeFns)
3002         Fn(NewInst);
3003       I.eraseFromParent();
3004       return &*NewInst;
3005     };
3006 
3007     MachineInstr *LoadStore = SelectLoadStoreAddressingMode();
3008     if (!LoadStore)
3009       return false;
3010 
3011     // If we're storing a 0, use WZR/XZR.
3012     if (Opcode == TargetOpcode::G_STORE) {
3013       auto CVal = getIConstantVRegValWithLookThrough(
3014           LoadStore->getOperand(0).getReg(), MRI);
3015       if (CVal && CVal->Value == 0) {
3016         switch (LoadStore->getOpcode()) {
3017         case AArch64::STRWui:
3018         case AArch64::STRHHui:
3019         case AArch64::STRBBui:
3020           LoadStore->getOperand(0).setReg(AArch64::WZR);
3021           break;
3022         case AArch64::STRXui:
3023           LoadStore->getOperand(0).setReg(AArch64::XZR);
3024           break;
3025         }
3026       }
3027     }
3028 
3029     if (IsZExtLoad) {
3030       // The zextload from a smaller type to i32 should be handled by the
3031       // importer.
3032       if (MRI.getType(LoadStore->getOperand(0).getReg()).getSizeInBits() != 64)
3033         return false;
3034       // If we have a ZEXTLOAD then change the load's type to be a narrower reg
3035       // and zero_extend with SUBREG_TO_REG.
3036       Register LdReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3037       Register DstReg = LoadStore->getOperand(0).getReg();
3038       LoadStore->getOperand(0).setReg(LdReg);
3039 
3040       MIB.setInsertPt(MIB.getMBB(), std::next(LoadStore->getIterator()));
3041       MIB.buildInstr(AArch64::SUBREG_TO_REG, {DstReg}, {})
3042           .addImm(0)
3043           .addUse(LdReg)
3044           .addImm(AArch64::sub_32);
3045       constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI);
3046       return RBI.constrainGenericRegister(DstReg, AArch64::GPR64allRegClass,
3047                                           MRI);
3048     }
3049     return constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI);
3050   }
3051 
3052   case TargetOpcode::G_INDEXED_ZEXTLOAD:
3053   case TargetOpcode::G_INDEXED_SEXTLOAD:
3054     return selectIndexedExtLoad(I, MRI);
3055   case TargetOpcode::G_INDEXED_LOAD:
3056     return selectIndexedLoad(I, MRI);
3057   case TargetOpcode::G_INDEXED_STORE:
3058     return selectIndexedStore(cast<GIndexedStore>(I), MRI);
3059 
3060   case TargetOpcode::G_SMULH:
3061   case TargetOpcode::G_UMULH: {
3062     // Reject the various things we don't support yet.
3063     if (unsupportedBinOp(I, RBI, MRI, TRI))
3064       return false;
3065 
3066     const Register DefReg = I.getOperand(0).getReg();
3067     const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
3068 
3069     if (RB.getID() != AArch64::GPRRegBankID) {
3070       LLVM_DEBUG(dbgs() << "G_[SU]MULH on bank: " << RB << ", expected: GPR\n");
3071       return false;
3072     }
3073 
3074     if (Ty != LLT::scalar(64)) {
3075       LLVM_DEBUG(dbgs() << "G_[SU]MULH has type: " << Ty
3076                         << ", expected: " << LLT::scalar(64) << '\n');
3077       return false;
3078     }
3079 
3080     unsigned NewOpc = I.getOpcode() == TargetOpcode::G_SMULH ? AArch64::SMULHrr
3081                                                              : AArch64::UMULHrr;
3082     I.setDesc(TII.get(NewOpc));
3083 
3084     // Now that we selected an opcode, we need to constrain the register
3085     // operands to use appropriate classes.
3086     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3087   }
3088   case TargetOpcode::G_LSHR:
3089   case TargetOpcode::G_ASHR:
3090     if (MRI.getType(I.getOperand(0).getReg()).isVector())
3091       return selectVectorAshrLshr(I, MRI);
3092     [[fallthrough]];
3093   case TargetOpcode::G_SHL:
3094     if (Opcode == TargetOpcode::G_SHL &&
3095         MRI.getType(I.getOperand(0).getReg()).isVector())
3096       return selectVectorSHL(I, MRI);
3097 
3098     // These shifts were legalized to have 64-bit shift amounts because we
3099     // want to take advantage of the selection patterns that assume the
3100     // immediates are s64s. However, selectBinaryOp assumes both operands
3101     // have the same bit size.
3102     {
3103       Register SrcReg = I.getOperand(1).getReg();
3104       Register ShiftReg = I.getOperand(2).getReg();
3105       const LLT ShiftTy = MRI.getType(ShiftReg);
3106       const LLT SrcTy = MRI.getType(SrcReg);
3107       if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 32 &&
3108           ShiftTy.getSizeInBits() == 64) {
3109         assert(!ShiftTy.isVector() && "unexpected vector shift ty");
3110         // Insert a subregister copy to implement a 64->32 trunc
3111         auto Trunc = MIB.buildInstr(TargetOpcode::COPY, {SrcTy}, {})
3112                          .addReg(ShiftReg, 0, AArch64::sub_32);
3113         MRI.setRegBank(Trunc.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID));
3114         I.getOperand(2).setReg(Trunc.getReg(0));
3115       }
3116     }
3117     [[fallthrough]];
3118   case TargetOpcode::G_OR: {
3119     // Reject the various things we don't support yet.
3120     if (unsupportedBinOp(I, RBI, MRI, TRI))
3121       return false;
3122 
3123     const unsigned OpSize = Ty.getSizeInBits();
3124 
3125     const Register DefReg = I.getOperand(0).getReg();
3126     const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
3127 
3128     const unsigned NewOpc = selectBinaryOp(I.getOpcode(), RB.getID(), OpSize);
3129     if (NewOpc == I.getOpcode())
3130       return false;
3131 
3132     I.setDesc(TII.get(NewOpc));
3133     // FIXME: Should the type be always reset in setDesc?
3134 
3135     // Now that we selected an opcode, we need to constrain the register
3136     // operands to use appropriate classes.
3137     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3138   }
3139 
3140   case TargetOpcode::G_PTR_ADD: {
3141     emitADD(I.getOperand(0).getReg(), I.getOperand(1), I.getOperand(2), MIB);
3142     I.eraseFromParent();
3143     return true;
3144   }
3145 
3146   case TargetOpcode::G_SADDE:
3147   case TargetOpcode::G_UADDE:
3148   case TargetOpcode::G_SSUBE:
3149   case TargetOpcode::G_USUBE:
3150   case TargetOpcode::G_SADDO:
3151   case TargetOpcode::G_UADDO:
3152   case TargetOpcode::G_SSUBO:
3153   case TargetOpcode::G_USUBO:
3154     return selectOverflowOp(I, MRI);
3155 
3156   case TargetOpcode::G_PTRMASK: {
3157     Register MaskReg = I.getOperand(2).getReg();
3158     std::optional<int64_t> MaskVal = getIConstantVRegSExtVal(MaskReg, MRI);
3159     // TODO: Implement arbitrary cases
3160     if (!MaskVal || !isShiftedMask_64(*MaskVal))
3161       return false;
3162 
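    // A shifted mask is a single contiguous run of set bits, which ANDXri can
    // encode as a logical immediate.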
3163     uint64_t Mask = *MaskVal;
3164     I.setDesc(TII.get(AArch64::ANDXri));
3165     I.getOperand(2).ChangeToImmediate(
3166         AArch64_AM::encodeLogicalImmediate(Mask, 64));
3167 
3168     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3169   }
3170   case TargetOpcode::G_PTRTOINT:
3171   case TargetOpcode::G_TRUNC: {
3172     const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
3173     const LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
3174 
3175     const Register DstReg = I.getOperand(0).getReg();
3176     const Register SrcReg = I.getOperand(1).getReg();
3177 
3178     const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
3179     const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
3180 
3181     if (DstRB.getID() != SrcRB.getID()) {
3182       LLVM_DEBUG(
3183           dbgs() << "G_TRUNC/G_PTRTOINT input/output on different banks\n");
3184       return false;
3185     }
3186 
3187     if (DstRB.getID() == AArch64::GPRRegBankID) {
3188       const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(DstTy, DstRB);
3189       if (!DstRC)
3190         return false;
3191 
3192       const TargetRegisterClass *SrcRC = getRegClassForTypeOnBank(SrcTy, SrcRB);
3193       if (!SrcRC)
3194         return false;
3195 
3196       if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
3197           !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
3198         LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC/G_PTRTOINT\n");
3199         return false;
3200       }
3201 
3202       if (DstRC == SrcRC) {
3203         // Nothing to be done
3204       } else if (Opcode == TargetOpcode::G_TRUNC && DstTy == LLT::scalar(32) &&
3205                  SrcTy == LLT::scalar(64)) {
3206         llvm_unreachable("TableGen can import this case");
3207         return false;
3208       } else if (DstRC == &AArch64::GPR32RegClass &&
3209                  SrcRC == &AArch64::GPR64RegClass) {
3210         I.getOperand(1).setSubReg(AArch64::sub_32);
3211       } else {
3212         LLVM_DEBUG(
3213             dbgs() << "Unhandled mismatched classes in G_TRUNC/G_PTRTOINT\n");
3214         return false;
3215       }
3216 
3217       I.setDesc(TII.get(TargetOpcode::COPY));
3218       return true;
3219     } else if (DstRB.getID() == AArch64::FPRRegBankID) {
3220       if (DstTy == LLT::fixed_vector(4, 16) &&
3221           SrcTy == LLT::fixed_vector(4, 32)) {
3222         I.setDesc(TII.get(AArch64::XTNv4i16));
3223         constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3224         return true;
3225       }
3226 
3227       if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128) {
3228         MachineInstr *Extract = emitExtractVectorElt(
3229             DstReg, DstRB, LLT::scalar(DstTy.getSizeInBits()), SrcReg, 0, MIB);
3230         if (!Extract)
3231           return false;
3232         I.eraseFromParent();
3233         return true;
3234       }
3235 
3236       // We might have a vector G_PTRTOINT, in which case just emit a COPY.
3237       if (Opcode == TargetOpcode::G_PTRTOINT) {
3238         assert(DstTy.isVector() && "Expected an FPR ptrtoint to be a vector");
3239         I.setDesc(TII.get(TargetOpcode::COPY));
3240         return selectCopy(I, TII, MRI, TRI, RBI);
3241       }
3242     }
3243 
3244     return false;
3245   }
3246 
3247   case TargetOpcode::G_ANYEXT: {
3248     if (selectUSMovFromExtend(I, MRI))
3249       return true;
3250 
3251     const Register DstReg = I.getOperand(0).getReg();
3252     const Register SrcReg = I.getOperand(1).getReg();
3253 
3254     const RegisterBank &RBDst = *RBI.getRegBank(DstReg, MRI, TRI);
3255     if (RBDst.getID() != AArch64::GPRRegBankID) {
3256       LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBDst
3257                         << ", expected: GPR\n");
3258       return false;
3259     }
3260 
3261     const RegisterBank &RBSrc = *RBI.getRegBank(SrcReg, MRI, TRI);
3262     if (RBSrc.getID() != AArch64::GPRRegBankID) {
3263       LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBSrc
3264                         << ", expected: GPR\n");
3265       return false;
3266     }
3267 
3268     const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
3269 
3270     if (DstSize == 0) {
3271       LLVM_DEBUG(dbgs() << "G_ANYEXT operand has no size, not a gvreg?\n");
3272       return false;
3273     }
3274 
3275     if (DstSize != 64 && DstSize > 32) {
3276       LLVM_DEBUG(dbgs() << "G_ANYEXT to size: " << DstSize
3277                         << ", expected: 32 or 64\n");
3278       return false;
3279     }
3280     // At this point G_ANYEXT is just like a plain COPY, but we need to
3281     // explicitly form the 64-bit value if the destination is 64 bits wide.
3282     if (DstSize > 32) {
3283       Register ExtSrc = MRI.createVirtualRegister(&AArch64::GPR64allRegClass);
3284       BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG))
3285           .addDef(ExtSrc)
3286           .addImm(0)
3287           .addUse(SrcReg)
3288           .addImm(AArch64::sub_32);
3289       I.getOperand(1).setReg(ExtSrc);
3290     }
3291     return selectCopy(I, TII, MRI, TRI, RBI);
3292   }
3293 
3294   case TargetOpcode::G_ZEXT:
3295   case TargetOpcode::G_SEXT_INREG:
3296   case TargetOpcode::G_SEXT: {
3297     if (selectUSMovFromExtend(I, MRI))
3298       return true;
3299 
3300     unsigned Opcode = I.getOpcode();
3301     const bool IsSigned = Opcode != TargetOpcode::G_ZEXT;
3302     const Register DefReg = I.getOperand(0).getReg();
3303     Register SrcReg = I.getOperand(1).getReg();
3304     const LLT DstTy = MRI.getType(DefReg);
3305     const LLT SrcTy = MRI.getType(SrcReg);
3306     unsigned DstSize = DstTy.getSizeInBits();
3307     unsigned SrcSize = SrcTy.getSizeInBits();
3308 
3309     // SEXT_INREG has the same src reg size as dst; the size of the value to
3310     // be extended is encoded in the imm.
3311     if (Opcode == TargetOpcode::G_SEXT_INREG)
3312       SrcSize = I.getOperand(2).getImm();
3313 
3314     if (DstTy.isVector())
3315       return false; // Should be handled by imported patterns.
3316 
3317     assert((*RBI.getRegBank(DefReg, MRI, TRI)).getID() ==
3318                AArch64::GPRRegBankID &&
3319            "Unexpected ext regbank");
3320 
3321     MachineInstr *ExtI;
3322 
3323     // First, check whether we're extending the result of a load whose dest type
3324     // is smaller than 32 bits; in that case this zext is redundant. GPR32 is the
3325     // smallest GPR register on AArch64, and all narrower loads automatically
3326     // zero-extend the upper bits. E.g.
3327     // %v(s8) = G_LOAD %p, :: (load 1)
3328     // %v2(s32) = G_ZEXT %v(s8)
3329     if (!IsSigned) {
3330       auto *LoadMI = getOpcodeDef(TargetOpcode::G_LOAD, SrcReg, MRI);
3331       bool IsGPR =
3332           RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::GPRRegBankID;
3333       if (LoadMI && IsGPR) {
3334         const MachineMemOperand *MemOp = *LoadMI->memoperands_begin();
3335         unsigned BytesLoaded = MemOp->getSize();
3336         if (BytesLoaded < 4 && SrcTy.getSizeInBytes() == BytesLoaded)
3337           return selectCopy(I, TII, MRI, TRI, RBI);
3338       }
3339 
3340       // For the 32-bit -> 64-bit case, we can emit a mov (ORRWrs)
3341       // + SUBREG_TO_REG.
3342       if (IsGPR && SrcSize == 32 && DstSize == 64) {
3343         Register SubregToRegSrc =
3344             MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3345         const Register ZReg = AArch64::WZR;
3346         MIB.buildInstr(AArch64::ORRWrs, {SubregToRegSrc}, {ZReg, SrcReg})
3347             .addImm(0);
3348 
3349         MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {})
3350             .addImm(0)
3351             .addUse(SubregToRegSrc)
3352             .addImm(AArch64::sub_32);
3353 
3354         if (!RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass,
3355                                           MRI)) {
3356           LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT destination\n");
3357           return false;
3358         }
3359 
3360         if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass,
3361                                           MRI)) {
3362           LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT source\n");
3363           return false;
3364         }
3365 
3366         I.eraseFromParent();
3367         return true;
3368       }
3369     }
3370 
3371     if (DstSize == 64) {
3372       if (Opcode != TargetOpcode::G_SEXT_INREG) {
3373         // FIXME: Can we avoid manually doing this?
3374         if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass,
3375                                           MRI)) {
3376           LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode)
3377                             << " operand\n");
3378           return false;
3379         }
3380         SrcReg = MIB.buildInstr(AArch64::SUBREG_TO_REG,
3381                                 {&AArch64::GPR64RegClass}, {})
3382                      .addImm(0)
3383                      .addUse(SrcReg)
3384                      .addImm(AArch64::sub_32)
3385                      .getReg(0);
3386       }
3387 
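      // SBFM/UBFM with immr = 0 and imms = SrcSize - 1 sign- or zero-extends
      // the low SrcSize bits into the full destination register.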
3388       ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMXri : AArch64::UBFMXri,
3389                              {DefReg}, {SrcReg})
3390                   .addImm(0)
3391                   .addImm(SrcSize - 1);
3392     } else if (DstSize <= 32) {
3393       ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMWri : AArch64::UBFMWri,
3394                              {DefReg}, {SrcReg})
3395                   .addImm(0)
3396                   .addImm(SrcSize - 1);
3397     } else {
3398       return false;
3399     }
3400 
3401     constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
3402     I.eraseFromParent();
3403     return true;
3404   }
3405 
3406   case TargetOpcode::G_SITOFP:
3407   case TargetOpcode::G_UITOFP:
3408   case TargetOpcode::G_FPTOSI:
3409   case TargetOpcode::G_FPTOUI: {
3410     const LLT DstTy = MRI.getType(I.getOperand(0).getReg()),
3411               SrcTy = MRI.getType(I.getOperand(1).getReg());
3412     const unsigned NewOpc = selectFPConvOpc(Opcode, DstTy, SrcTy);
3413     if (NewOpc == Opcode)
3414       return false;
3415 
3416     I.setDesc(TII.get(NewOpc));
3417     constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3418     I.setFlags(MachineInstr::NoFPExcept);
3419 
3420     return true;
3421   }
3422 
3423   case TargetOpcode::G_FREEZE:
3424     return selectCopy(I, TII, MRI, TRI, RBI);
3425 
3426   case TargetOpcode::G_INTTOPTR:
3427     // The importer is currently unable to import pointer types since they
3428     // didn't exist in SelectionDAG.
3429     return selectCopy(I, TII, MRI, TRI, RBI);
3430 
3431   case TargetOpcode::G_BITCAST:
3432     // Imported SelectionDAG rules can handle every bitcast except those that
3433     // bitcast from a type to the same type. Ideally, these shouldn't occur
3434     // but we might not run an optimizer that deletes them. The other exception
3435     // is bitcasts involving pointer types, as SelectionDAG has no knowledge
3436     // of them.
3437     return selectCopy(I, TII, MRI, TRI, RBI);
3438 
3439   case TargetOpcode::G_SELECT: {
3440     auto &Sel = cast<GSelect>(I);
3441     const Register CondReg = Sel.getCondReg();
3442     const Register TReg = Sel.getTrueReg();
3443     const Register FReg = Sel.getFalseReg();
3444 
3445     if (tryOptSelect(Sel))
3446       return true;
3447 
3448     // Make sure to use a fresh, unused vreg instead of wzr, so that the
3449     // peephole optimizations are still able to optimize the test.
3450     Register DeadVReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3451     auto TstMI = MIB.buildInstr(AArch64::ANDSWri, {DeadVReg}, {CondReg})
3452                      .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
3453     constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
3454     if (!emitSelect(Sel.getReg(0), TReg, FReg, AArch64CC::NE, MIB))
3455       return false;
3456     Sel.eraseFromParent();
3457     return true;
3458   }
3459   case TargetOpcode::G_ICMP: {
3460     if (Ty.isVector())
3461       return selectVectorICmp(I, MRI);
3462 
3463     if (Ty != LLT::scalar(32)) {
3464       LLVM_DEBUG(dbgs() << "G_ICMP result has type: " << Ty
3465                         << ", expected: " << LLT::scalar(32) << '\n');
3466       return false;
3467     }
3468 
3469     auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate());
3470     const AArch64CC::CondCode InvCC =
3471         changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred));
3472     emitIntegerCompare(I.getOperand(2), I.getOperand(3), I.getOperand(1), MIB);
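    // Materialize the result with CSINC Wd, WZR, WZR, InvCC, which produces 1
    // exactly when the inverted condition fails, i.e. when Pred holds.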
3473     emitCSINC(/*Dst=*/I.getOperand(0).getReg(), /*Src1=*/AArch64::WZR,
3474               /*Src2=*/AArch64::WZR, InvCC, MIB);
3475     I.eraseFromParent();
3476     return true;
3477   }
3478 
3479   case TargetOpcode::G_FCMP: {
3480     CmpInst::Predicate Pred =
3481         static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate());
3482     if (!emitFPCompare(I.getOperand(2).getReg(), I.getOperand(3).getReg(), MIB,
3483                        Pred) ||
3484         !emitCSetForFCmp(I.getOperand(0).getReg(), Pred, MIB))
3485       return false;
3486     I.eraseFromParent();
3487     return true;
3488   }
3489   case TargetOpcode::G_VASTART:
3490     return STI.isTargetDarwin() ? selectVaStartDarwin(I, MF, MRI)
3491                                 : selectVaStartAAPCS(I, MF, MRI);
3492   case TargetOpcode::G_INTRINSIC:
3493     return selectIntrinsic(I, MRI);
3494   case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3495     return selectIntrinsicWithSideEffects(I, MRI);
3496   case TargetOpcode::G_IMPLICIT_DEF: {
3497     I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
3498     const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
3499     const Register DstReg = I.getOperand(0).getReg();
3500     const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
3501     const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(DstTy, DstRB);
3502     RBI.constrainGenericRegister(DstReg, *DstRC, MRI);
3503     return true;
3504   }
3505   case TargetOpcode::G_BLOCK_ADDR: {
3506     if (TM.getCodeModel() == CodeModel::Large && !TM.isPositionIndependent()) {
3507       materializeLargeCMVal(I, I.getOperand(1).getBlockAddress(), 0);
3508       I.eraseFromParent();
3509       return true;
3510     } else {
3511       I.setDesc(TII.get(AArch64::MOVaddrBA));
3512       auto MovMI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::MOVaddrBA),
3513                            I.getOperand(0).getReg())
3514                        .addBlockAddress(I.getOperand(1).getBlockAddress(),
3515                                         /* Offset */ 0, AArch64II::MO_PAGE)
3516                        .addBlockAddress(
3517                            I.getOperand(1).getBlockAddress(), /* Offset */ 0,
3518                            AArch64II::MO_NC | AArch64II::MO_PAGEOFF);
3519       I.eraseFromParent();
3520       return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI);
3521     }
3522   }
3523   case AArch64::G_DUP: {
3524     // When the scalar of G_DUP is an s8/s16 gpr, it can't be selected by
3525     // imported patterns, so do it manually here. Avoiding generating an s16
3526     // gpr is difficult because at RBS we may end up pessimizing the fpr case
3527     // if we decide to add an anyextend to fix this. Manual selection is the
3528     // most robust solution for now.
3529     if (RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() !=
3530         AArch64::GPRRegBankID)
3531       return false; // We expect the fpr regbank case to be imported.
3532     LLT VecTy = MRI.getType(I.getOperand(0).getReg());
3533     if (VecTy == LLT::fixed_vector(8, 8))
3534       I.setDesc(TII.get(AArch64::DUPv8i8gpr));
3535     else if (VecTy == LLT::fixed_vector(16, 8))
3536       I.setDesc(TII.get(AArch64::DUPv16i8gpr));
3537     else if (VecTy == LLT::fixed_vector(4, 16))
3538       I.setDesc(TII.get(AArch64::DUPv4i16gpr));
3539     else if (VecTy == LLT::fixed_vector(8, 16))
3540       I.setDesc(TII.get(AArch64::DUPv8i16gpr));
3541     else
3542       return false;
3543     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3544   }
3545   case TargetOpcode::G_BUILD_VECTOR:
3546     return selectBuildVector(I, MRI);
3547   case TargetOpcode::G_MERGE_VALUES:
3548     return selectMergeValues(I, MRI);
3549   case TargetOpcode::G_UNMERGE_VALUES:
3550     return selectUnmergeValues(I, MRI);
3551   case TargetOpcode::G_SHUFFLE_VECTOR:
3552     return selectShuffleVector(I, MRI);
3553   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3554     return selectExtractElt(I, MRI);
3555   case TargetOpcode::G_INSERT_VECTOR_ELT:
3556     return selectInsertElt(I, MRI);
3557   case TargetOpcode::G_CONCAT_VECTORS:
3558     return selectConcatVectors(I, MRI);
3559   case TargetOpcode::G_JUMP_TABLE:
3560     return selectJumpTable(I, MRI);
3561   case TargetOpcode::G_MEMCPY:
3562   case TargetOpcode::G_MEMCPY_INLINE:
3563   case TargetOpcode::G_MEMMOVE:
3564   case TargetOpcode::G_MEMSET:
3565     assert(STI.hasMOPS() && "Shouldn't get here without +mops feature");
3566     return selectMOPS(I, MRI);
3567   }
3568 
3569   return false;
3570 }
3571 
3572 bool AArch64InstructionSelector::selectAndRestoreState(MachineInstr &I) {
3573   MachineIRBuilderState OldMIBState = MIB.getState();
3574   bool Success = select(I);
3575   MIB.setState(OldMIBState);
3576   return Success;
3577 }
3578 
3579 bool AArch64InstructionSelector::selectMOPS(MachineInstr &GI,
3580                                             MachineRegisterInfo &MRI) {
3581   unsigned Mopcode;
3582   switch (GI.getOpcode()) {
3583   case TargetOpcode::G_MEMCPY:
3584   case TargetOpcode::G_MEMCPY_INLINE:
3585     Mopcode = AArch64::MOPSMemoryCopyPseudo;
3586     break;
3587   case TargetOpcode::G_MEMMOVE:
3588     Mopcode = AArch64::MOPSMemoryMovePseudo;
3589     break;
3590   case TargetOpcode::G_MEMSET:
3591     // For tagged memset see llvm.aarch64.mops.memset.tag
3592     Mopcode = AArch64::MOPSMemorySetPseudo;
3593     break;
3594   }
3595 
3596   auto &DstPtr = GI.getOperand(0);
3597   auto &SrcOrVal = GI.getOperand(1);
3598   auto &Size = GI.getOperand(2);
3599 
3600   // Create copies of the registers that can be clobbered.
3601   const Register DstPtrCopy = MRI.cloneVirtualRegister(DstPtr.getReg());
3602   const Register SrcValCopy = MRI.cloneVirtualRegister(SrcOrVal.getReg());
3603   const Register SizeCopy = MRI.cloneVirtualRegister(Size.getReg());
3604 
3605   const bool IsSet = Mopcode == AArch64::MOPSMemorySetPseudo;
3606   const auto &SrcValRegClass =
3607       IsSet ? AArch64::GPR64RegClass : AArch64::GPR64commonRegClass;
3608 
3609   // Constrain to specific registers
3610   RBI.constrainGenericRegister(DstPtrCopy, AArch64::GPR64commonRegClass, MRI);
3611   RBI.constrainGenericRegister(SrcValCopy, SrcValRegClass, MRI);
3612   RBI.constrainGenericRegister(SizeCopy, AArch64::GPR64RegClass, MRI);
3613 
3614   MIB.buildCopy(DstPtrCopy, DstPtr);
3615   MIB.buildCopy(SrcValCopy, SrcOrVal);
3616   MIB.buildCopy(SizeCopy, Size);
3617 
3618   // The new instruction uses the copied registers because it must update them.
3619   // Its defs have no users, since G_MEM* has no results, but they are still
3620   // tied to the uses.
3621   // Note: the operand order differs from G_MEMSET, G_MEMCPY and G_MEMMOVE.
3622   Register DefDstPtr = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass);
3623   Register DefSize = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3624   if (IsSet) {
3625     MIB.buildInstr(Mopcode, {DefDstPtr, DefSize},
3626                    {DstPtrCopy, SizeCopy, SrcValCopy});
3627   } else {
3628     Register DefSrcPtr = MRI.createVirtualRegister(&SrcValRegClass);
3629     MIB.buildInstr(Mopcode, {DefDstPtr, DefSrcPtr, DefSize},
3630                    {DstPtrCopy, SrcValCopy, SizeCopy});
3631   }
3632 
3633   GI.eraseFromParent();
3634   return true;
3635 }
3636 
3637 bool AArch64InstructionSelector::selectBrJT(MachineInstr &I,
3638                                             MachineRegisterInfo &MRI) {
3639   assert(I.getOpcode() == TargetOpcode::G_BRJT && "Expected G_BRJT");
3640   Register JTAddr = I.getOperand(0).getReg();
3641   unsigned JTI = I.getOperand(1).getIndex();
3642   Register Index = I.getOperand(2).getReg();
3643 
3644   Register TargetReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3645   Register ScratchReg = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
3646 
3647   MF->getInfo<AArch64FunctionInfo>()->setJumpTableEntryInfo(JTI, 4, nullptr);
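  // JumpTableDest32 is a pseudo that loads the sign-extended 4-byte table
  // entry at Index and adds it to the table address, leaving the branch
  // target in TargetReg; ScratchReg is a temporary for the expansion.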
3648   auto JumpTableInst = MIB.buildInstr(AArch64::JumpTableDest32,
3649                                       {TargetReg, ScratchReg}, {JTAddr, Index})
3650                            .addJumpTableIndex(JTI);
3651   // Save the jump table info.
3652   MIB.buildInstr(TargetOpcode::JUMP_TABLE_DEBUG_INFO, {},
3653                  {static_cast<int64_t>(JTI)});
3654   // Build the indirect branch.
3655   MIB.buildInstr(AArch64::BR, {}, {TargetReg});
3656   I.eraseFromParent();
3657   return constrainSelectedInstRegOperands(*JumpTableInst, TII, TRI, RBI);
3658 }
3659 
3660 bool AArch64InstructionSelector::selectJumpTable(MachineInstr &I,
3661                                                  MachineRegisterInfo &MRI) {
3662   assert(I.getOpcode() == TargetOpcode::G_JUMP_TABLE && "Expected jump table");
3663   assert(I.getOperand(1).isJTI() && "Jump table op should have a JTI!");
3664 
3665   Register DstReg = I.getOperand(0).getReg();
3666   unsigned JTI = I.getOperand(1).getIndex();
3667   // We generate a MOVaddrJT which will get expanded to an ADRP + ADD later.
3668   auto MovMI =
3669     MIB.buildInstr(AArch64::MOVaddrJT, {DstReg}, {})
3670           .addJumpTableIndex(JTI, AArch64II::MO_PAGE)
3671           .addJumpTableIndex(JTI, AArch64II::MO_NC | AArch64II::MO_PAGEOFF);
3672   I.eraseFromParent();
3673   return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI);
3674 }
3675 
3676 bool AArch64InstructionSelector::selectTLSGlobalValue(
3677     MachineInstr &I, MachineRegisterInfo &MRI) {
3678   if (!STI.isTargetMachO())
3679     return false;
3680   MachineFunction &MF = *I.getParent()->getParent();
3681   MF.getFrameInfo().setAdjustsStack(true);
3682 
3683   const auto &GlobalOp = I.getOperand(1);
3684   assert(GlobalOp.getOffset() == 0 &&
3685          "Shouldn't have an offset on TLS globals!");
3686   const GlobalValue &GV = *GlobalOp.getGlobal();
3687 
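  // On MachO the GOT entry points at the variable's TLV descriptor; its first
  // word is a getter function that is called with X0 pointing at the
  // descriptor and returns the variable's address in X0.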
3688   auto LoadGOT =
3689       MIB.buildInstr(AArch64::LOADgot, {&AArch64::GPR64commonRegClass}, {})
3690           .addGlobalAddress(&GV, 0, AArch64II::MO_TLS);
3691 
3692   auto Load = MIB.buildInstr(AArch64::LDRXui, {&AArch64::GPR64commonRegClass},
3693                              {LoadGOT.getReg(0)})
3694                   .addImm(0);
3695 
3696   MIB.buildCopy(Register(AArch64::X0), LoadGOT.getReg(0));
3697   // TLS calls preserve all registers except those that absolutely must be
3698   // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
3699   // silly).
3700   MIB.buildInstr(getBLRCallOpcode(MF), {}, {Load})
3701       .addUse(AArch64::X0, RegState::Implicit)
3702       .addDef(AArch64::X0, RegState::Implicit)
3703       .addRegMask(TRI.getTLSCallPreservedMask());
3704 
3705   MIB.buildCopy(I.getOperand(0).getReg(), Register(AArch64::X0));
3706   RBI.constrainGenericRegister(I.getOperand(0).getReg(), AArch64::GPR64RegClass,
3707                                MRI);
3708   I.eraseFromParent();
3709   return true;
3710 }
3711 
3712 bool AArch64InstructionSelector::selectVectorICmp(
3713     MachineInstr &I, MachineRegisterInfo &MRI) {
3714   Register DstReg = I.getOperand(0).getReg();
3715   LLT DstTy = MRI.getType(DstReg);
3716   Register SrcReg = I.getOperand(2).getReg();
3717   Register Src2Reg = I.getOperand(3).getReg();
3718   LLT SrcTy = MRI.getType(SrcReg);
3719 
3720   unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits();
3721   unsigned NumElts = DstTy.getNumElements();
3722 
3723   // First index is element size, 0 == 8b, 1 == 16b, 2 == 32b, 3 == 64b
3724   // Second index is num elts, 0 == v2, 1 == v4, 2 == v8, 3 == v16
3725   // Third index is cc opcode:
3726   // 0 == eq
3727   // 1 == ugt
3728   // 2 == uge
3729   // 3 == ult
3730   // 4 == ule
3731   // 5 == sgt
3732   // 6 == sge
3733   // 7 == slt
3734   // 8 == sle
3735   // ne is done by negating 'eq' result.
3736 
3737   // This table below assumes that for some comparisons the operands will be
3738   // commuted.
3739   // ult op == commute + ugt op
3740   // ule op == commute + uge op
3741   // slt op == commute + sgt op
3742   // sle op == commute + sge op
3743   unsigned PredIdx = 0;
3744   bool SwapOperands = false;
3745   CmpInst::Predicate Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
3746   switch (Pred) {
3747   case CmpInst::ICMP_NE:
3748   case CmpInst::ICMP_EQ:
3749     PredIdx = 0;
3750     break;
3751   case CmpInst::ICMP_UGT:
3752     PredIdx = 1;
3753     break;
3754   case CmpInst::ICMP_UGE:
3755     PredIdx = 2;
3756     break;
3757   case CmpInst::ICMP_ULT:
3758     PredIdx = 3;
3759     SwapOperands = true;
3760     break;
3761   case CmpInst::ICMP_ULE:
3762     PredIdx = 4;
3763     SwapOperands = true;
3764     break;
3765   case CmpInst::ICMP_SGT:
3766     PredIdx = 5;
3767     break;
3768   case CmpInst::ICMP_SGE:
3769     PredIdx = 6;
3770     break;
3771   case CmpInst::ICMP_SLT:
3772     PredIdx = 7;
3773     SwapOperands = true;
3774     break;
3775   case CmpInst::ICMP_SLE:
3776     PredIdx = 8;
3777     SwapOperands = true;
3778     break;
3779   default:
3780     llvm_unreachable("Unhandled icmp predicate");
3781     return false;
3782   }
3783 
3784   // This table obviously should be tablegen'd when we have our GISel native
3785   // tablegen selector.
3786 
3787   static const unsigned OpcTable[4][4][9] = {
3788       {
3789           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3790            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3791            0 /* invalid */},
3792           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3793            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3794            0 /* invalid */},
3795           {AArch64::CMEQv8i8, AArch64::CMHIv8i8, AArch64::CMHSv8i8,
3796            AArch64::CMHIv8i8, AArch64::CMHSv8i8, AArch64::CMGTv8i8,
3797            AArch64::CMGEv8i8, AArch64::CMGTv8i8, AArch64::CMGEv8i8},
3798           {AArch64::CMEQv16i8, AArch64::CMHIv16i8, AArch64::CMHSv16i8,
3799            AArch64::CMHIv16i8, AArch64::CMHSv16i8, AArch64::CMGTv16i8,
3800            AArch64::CMGEv16i8, AArch64::CMGTv16i8, AArch64::CMGEv16i8}
3801       },
3802       {
3803           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3804            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3805            0 /* invalid */},
3806           {AArch64::CMEQv4i16, AArch64::CMHIv4i16, AArch64::CMHSv4i16,
3807            AArch64::CMHIv4i16, AArch64::CMHSv4i16, AArch64::CMGTv4i16,
3808            AArch64::CMGEv4i16, AArch64::CMGTv4i16, AArch64::CMGEv4i16},
3809           {AArch64::CMEQv8i16, AArch64::CMHIv8i16, AArch64::CMHSv8i16,
3810            AArch64::CMHIv8i16, AArch64::CMHSv8i16, AArch64::CMGTv8i16,
3811            AArch64::CMGEv8i16, AArch64::CMGTv8i16, AArch64::CMGEv8i16},
3812           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3813            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3814            0 /* invalid */}
3815       },
3816       {
3817           {AArch64::CMEQv2i32, AArch64::CMHIv2i32, AArch64::CMHSv2i32,
3818            AArch64::CMHIv2i32, AArch64::CMHSv2i32, AArch64::CMGTv2i32,
3819            AArch64::CMGEv2i32, AArch64::CMGTv2i32, AArch64::CMGEv2i32},
3820           {AArch64::CMEQv4i32, AArch64::CMHIv4i32, AArch64::CMHSv4i32,
3821            AArch64::CMHIv4i32, AArch64::CMHSv4i32, AArch64::CMGTv4i32,
3822            AArch64::CMGEv4i32, AArch64::CMGTv4i32, AArch64::CMGEv4i32},
3823           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3824            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3825            0 /* invalid */},
3826           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3827            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3828            0 /* invalid */}
3829       },
3830       {
3831           {AArch64::CMEQv2i64, AArch64::CMHIv2i64, AArch64::CMHSv2i64,
3832            AArch64::CMHIv2i64, AArch64::CMHSv2i64, AArch64::CMGTv2i64,
3833            AArch64::CMGEv2i64, AArch64::CMGTv2i64, AArch64::CMGEv2i64},
3834           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3835            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3836            0 /* invalid */},
3837           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3838            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3839            0 /* invalid */},
3840           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3841            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3842            0 /* invalid */}
3843       },
3844   };
3845   unsigned EltIdx = Log2_32(SrcEltSize / 8);
3846   unsigned NumEltsIdx = Log2_32(NumElts / 2);
3847   unsigned Opc = OpcTable[EltIdx][NumEltsIdx][PredIdx];
3848   if (!Opc) {
3849     LLVM_DEBUG(dbgs() << "Could not map G_ICMP to cmp opcode\n");
3850     return false;
3851   }
3852 
3853   const RegisterBank &VecRB = *RBI.getRegBank(SrcReg, MRI, TRI);
3854   const TargetRegisterClass *SrcRC =
3855       getRegClassForTypeOnBank(SrcTy, VecRB, true);
3856   if (!SrcRC) {
3857     LLVM_DEBUG(dbgs() << "Could not determine source register class.\n");
3858     return false;
3859   }
3860 
3861   unsigned NotOpc = Pred == ICmpInst::ICMP_NE ? AArch64::NOTv8i8 : 0;
3862   if (SrcTy.getSizeInBits() == 128)
3863     NotOpc = NotOpc ? AArch64::NOTv16i8 : 0;
3864 
3865   if (SwapOperands)
3866     std::swap(SrcReg, Src2Reg);
3867 
3868   auto Cmp = MIB.buildInstr(Opc, {SrcRC}, {SrcReg, Src2Reg});
3869   constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);
3870 
3871   // Invert if we had a 'ne' cc.
3872   if (NotOpc) {
3873     Cmp = MIB.buildInstr(NotOpc, {DstReg}, {Cmp});
3874     constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);
3875   } else {
3876     MIB.buildCopy(DstReg, Cmp.getReg(0));
3877   }
3878   RBI.constrainGenericRegister(DstReg, *SrcRC, MRI);
3879   I.eraseFromParent();
3880   return true;
3881 }
3882 
3883 MachineInstr *AArch64InstructionSelector::emitScalarToVector(
3884     unsigned EltSize, const TargetRegisterClass *DstRC, Register Scalar,
3885     MachineIRBuilder &MIRBuilder) const {
3886   auto Undef = MIRBuilder.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstRC}, {});
3887 
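  // Widen the scalar by inserting it into the low subregister of an otherwise
  // undefined vector register.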
3888   auto BuildFn = [&](unsigned SubregIndex) {
3889     auto Ins =
3890         MIRBuilder
3891             .buildInstr(TargetOpcode::INSERT_SUBREG, {DstRC}, {Undef, Scalar})
3892             .addImm(SubregIndex);
3893     constrainSelectedInstRegOperands(*Undef, TII, TRI, RBI);
3894     constrainSelectedInstRegOperands(*Ins, TII, TRI, RBI);
3895     return &*Ins;
3896   };
3897 
3898   switch (EltSize) {
3899   case 8:
3900     return BuildFn(AArch64::bsub);
3901   case 16:
3902     return BuildFn(AArch64::hsub);
3903   case 32:
3904     return BuildFn(AArch64::ssub);
3905   case 64:
3906     return BuildFn(AArch64::dsub);
3907   default:
3908     return nullptr;
3909   }
3910 }
3911 
3912 MachineInstr *
3913 AArch64InstructionSelector::emitNarrowVector(Register DstReg, Register SrcReg,
3914                                              MachineIRBuilder &MIB,
3915                                              MachineRegisterInfo &MRI) const {
3916   LLT DstTy = MRI.getType(DstReg);
3917   const TargetRegisterClass *RC =
3918       getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(SrcReg, MRI, TRI));
3919   if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
3920     LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
3921     return nullptr;
3922   }
3923   unsigned SubReg = 0;
3924   if (!getSubRegForClass(RC, TRI, SubReg))
3925     return nullptr;
3926   if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
3927     LLVM_DEBUG(dbgs() << "Unsupported destination size! ("
3928                       << DstTy.getSizeInBits() << ")\n");
3929     return nullptr;
3930   }
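  // Narrowing is just a copy of the low ssub/dsub half of the wider source
  // register.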
3931   auto Copy = MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
3932                   .addReg(SrcReg, 0, SubReg);
3933   RBI.constrainGenericRegister(DstReg, *RC, MRI);
3934   return Copy;
3935 }
3936 
3937 bool AArch64InstructionSelector::selectMergeValues(
3938     MachineInstr &I, MachineRegisterInfo &MRI) {
3939   assert(I.getOpcode() == TargetOpcode::G_MERGE_VALUES && "unexpected opcode");
3940   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
3941   const LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
3942   assert(!DstTy.isVector() && !SrcTy.isVector() && "invalid merge operation");
3943   const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
3944 
3945   if (I.getNumOperands() != 3)
3946     return false;
3947 
3948   // Merging 2 s64s into an s128.
3949   if (DstTy == LLT::scalar(128)) {
3950     if (SrcTy.getSizeInBits() != 64)
3951       return false;
3952     Register DstReg = I.getOperand(0).getReg();
3953     Register Src1Reg = I.getOperand(1).getReg();
3954     Register Src2Reg = I.getOperand(2).getReg();
3955     auto Tmp = MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstTy}, {});
3956     MachineInstr *InsMI = emitLaneInsert(std::nullopt, Tmp.getReg(0), Src1Reg,
3957                                          /* LaneIdx */ 0, RB, MIB);
3958     if (!InsMI)
3959       return false;
3960     MachineInstr *Ins2MI = emitLaneInsert(DstReg, InsMI->getOperand(0).getReg(),
3961                                           Src2Reg, /* LaneIdx */ 1, RB, MIB);
3962     if (!Ins2MI)
3963       return false;
3964     constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI);
3965     constrainSelectedInstRegOperands(*Ins2MI, TII, TRI, RBI);
3966     I.eraseFromParent();
3967     return true;
3968   }
3969 
3970   if (RB.getID() != AArch64::GPRRegBankID)
3971     return false;
3972 
3973   if (DstTy.getSizeInBits() != 64 || SrcTy.getSizeInBits() != 32)
3974     return false;
3975 
3976   auto *DstRC = &AArch64::GPR64RegClass;
3977   Register SubToRegDef = MRI.createVirtualRegister(DstRC);
3978   MachineInstr &SubRegMI = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
3979                                     TII.get(TargetOpcode::SUBREG_TO_REG))
3980                                 .addDef(SubToRegDef)
3981                                 .addImm(0)
3982                                 .addUse(I.getOperand(1).getReg())
3983                                 .addImm(AArch64::sub_32);
3984   Register SubToRegDef2 = MRI.createVirtualRegister(DstRC);
3985   // Need to anyext the second scalar before we can use bfm
3986   MachineInstr &SubRegMI2 = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
3987                                     TII.get(TargetOpcode::SUBREG_TO_REG))
3988                                 .addDef(SubToRegDef2)
3989                                 .addImm(0)
3990                                 .addUse(I.getOperand(2).getReg())
3991                                 .addImm(AArch64::sub_32);
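  // BFMXri with immr = 32 and imms = 31 is BFI #32, #32: it keeps the first
  // operand in bits [31:0] and inserts the low 32 bits of the second operand
  // into bits [63:32], producing the merged 64-bit value.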
3992   MachineInstr &BFM =
3993       *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::BFMXri))
3994            .addDef(I.getOperand(0).getReg())
3995            .addUse(SubToRegDef)
3996            .addUse(SubToRegDef2)
3997            .addImm(32)
3998            .addImm(31);
3999   constrainSelectedInstRegOperands(SubRegMI, TII, TRI, RBI);
4000   constrainSelectedInstRegOperands(SubRegMI2, TII, TRI, RBI);
4001   constrainSelectedInstRegOperands(BFM, TII, TRI, RBI);
4002   I.eraseFromParent();
4003   return true;
4004 }
4005 
4006 static bool getLaneCopyOpcode(unsigned &CopyOpc, unsigned &ExtractSubReg,
4007                               const unsigned EltSize) {
4008   // Choose a lane copy opcode and subregister based off of the size of the
4009   // vector's elements.
4010   switch (EltSize) {
4011   case 8:
4012     CopyOpc = AArch64::DUPi8;
4013     ExtractSubReg = AArch64::bsub;
4014     break;
4015   case 16:
4016     CopyOpc = AArch64::DUPi16;
4017     ExtractSubReg = AArch64::hsub;
4018     break;
4019   case 32:
4020     CopyOpc = AArch64::DUPi32;
4021     ExtractSubReg = AArch64::ssub;
4022     break;
4023   case 64:
4024     CopyOpc = AArch64::DUPi64;
4025     ExtractSubReg = AArch64::dsub;
4026     break;
4027   default:
4028     // Unknown size, bail out.
4029     LLVM_DEBUG(dbgs() << "Elt size '" << EltSize << "' unsupported.\n");
4030     return false;
4031   }
4032   return true;
4033 }
4034 
4035 MachineInstr *AArch64InstructionSelector::emitExtractVectorElt(
4036     std::optional<Register> DstReg, const RegisterBank &DstRB, LLT ScalarTy,
4037     Register VecReg, unsigned LaneIdx, MachineIRBuilder &MIRBuilder) const {
4038   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4039   unsigned CopyOpc = 0;
4040   unsigned ExtractSubReg = 0;
4041   if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, ScalarTy.getSizeInBits())) {
4042     LLVM_DEBUG(
4043         dbgs() << "Couldn't determine lane copy opcode for instruction.\n");
4044     return nullptr;
4045   }
4046 
4047   const TargetRegisterClass *DstRC =
4048       getRegClassForTypeOnBank(ScalarTy, DstRB, true);
4049   if (!DstRC) {
4050     LLVM_DEBUG(dbgs() << "Could not determine destination register class.\n");
4051     return nullptr;
4052   }
4053 
4054   const RegisterBank &VecRB = *RBI.getRegBank(VecReg, MRI, TRI);
4055   const LLT &VecTy = MRI.getType(VecReg);
4056   const TargetRegisterClass *VecRC =
4057       getRegClassForTypeOnBank(VecTy, VecRB, true);
4058   if (!VecRC) {
4059     LLVM_DEBUG(dbgs() << "Could not determine source register class.\n");
4060     return nullptr;
4061   }
4062 
4063   // The register that we're going to copy into.
4064   Register InsertReg = VecReg;
4065   if (!DstReg)
4066     DstReg = MRI.createVirtualRegister(DstRC);
4067   // If the lane index is 0, we just use a subregister COPY.
4068   if (LaneIdx == 0) {
4069     auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {*DstReg}, {})
4070                     .addReg(VecReg, 0, ExtractSubReg);
4071     RBI.constrainGenericRegister(*DstReg, *DstRC, MRI);
4072     return &*Copy;
4073   }
4074 
4075   // Lane copies require 128-bit wide registers. If we're dealing with an
4076   // unpacked vector, then we need to move up to that width. Insert an implicit
4077   // def and a subregister insert to get us there.
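  // A minimal sketch for a nonzero lane of an unpacked <2 x s32> source
  // (illustrative, not verbatim MIR):
  //   %undef:fpr128 = IMPLICIT_DEF
  //   %wide:fpr128  = INSERT_SUBREG %undef, %vec, %subreg.dsub
  //   %dst:fpr32    = DUPi32 %wide, 1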
4078   if (VecTy.getSizeInBits() != 128) {
4079     MachineInstr *ScalarToVector = emitScalarToVector(
4080         VecTy.getSizeInBits(), &AArch64::FPR128RegClass, VecReg, MIRBuilder);
4081     if (!ScalarToVector)
4082       return nullptr;
4083     InsertReg = ScalarToVector->getOperand(0).getReg();
4084   }
4085 
4086   MachineInstr *LaneCopyMI =
4087       MIRBuilder.buildInstr(CopyOpc, {*DstReg}, {InsertReg}).addImm(LaneIdx);
4088   constrainSelectedInstRegOperands(*LaneCopyMI, TII, TRI, RBI);
4089 
4090   // Make sure that we actually constrain the initial copy.
4091   RBI.constrainGenericRegister(*DstReg, *DstRC, MRI);
4092   return LaneCopyMI;
4093 }
4094 
4095 bool AArch64InstructionSelector::selectExtractElt(
4096     MachineInstr &I, MachineRegisterInfo &MRI) {
4097   assert(I.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT &&
4098          "unexpected opcode!");
4099   Register DstReg = I.getOperand(0).getReg();
4100   const LLT NarrowTy = MRI.getType(DstReg);
4101   const Register SrcReg = I.getOperand(1).getReg();
4102   const LLT WideTy = MRI.getType(SrcReg);
4103   (void)WideTy;
4104   assert(WideTy.getSizeInBits() >= NarrowTy.getSizeInBits() &&
4105          "source register size too small!");
4106   assert(!NarrowTy.isVector() && "cannot extract vector into vector!");
4107 
4108   // Need the lane index to determine the correct copy opcode.
4109   MachineOperand &LaneIdxOp = I.getOperand(2);
4110   assert(LaneIdxOp.isReg() && "Lane index operand was not a register?");
4111 
4112   if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) {
4113     LLVM_DEBUG(dbgs() << "Cannot extract into GPR.\n");
4114     return false;
4115   }
4116 
4117   // Find the index to extract from.
4118   auto VRegAndVal = getIConstantVRegValWithLookThrough(LaneIdxOp.getReg(), MRI);
4119   if (!VRegAndVal)
4120     return false;
4121   unsigned LaneIdx = VRegAndVal->Value.getSExtValue();
4122 
4123 
4124   const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
4125   MachineInstr *Extract = emitExtractVectorElt(DstReg, DstRB, NarrowTy, SrcReg,
4126                                                LaneIdx, MIB);
4127   if (!Extract)
4128     return false;
4129 
4130   I.eraseFromParent();
4131   return true;
4132 }
4133 
4134 bool AArch64InstructionSelector::selectSplitVectorUnmerge(
4135     MachineInstr &I, MachineRegisterInfo &MRI) {
4136   unsigned NumElts = I.getNumOperands() - 1;
4137   Register SrcReg = I.getOperand(NumElts).getReg();
4138   const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg());
4139   const LLT SrcTy = MRI.getType(SrcReg);
4140 
4141   assert(NarrowTy.isVector() && "Expected an unmerge into vectors");
4142   if (SrcTy.getSizeInBits() > 128) {
4143     LLVM_DEBUG(dbgs() << "Unexpected vector type for vec split unmerge");
4144     return false;
4145   }
4146 
4147   // We implement a split vector operation by treating the sub-vectors as
4148   // scalars and extracting them.
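  // E.g. (sketch): unmerging a <4 x s32> into two <2 x s32> halves treats each
  // half as a 64-bit element of the source (illustrative, not verbatim MIR):
  //   %lo:fpr64 = COPY %src.dsub          ; lane 0 is a plain subregister copy
  //   %hi:fpr64 = DUPi64 %src:fpr128, 1   ; nonzero lanes use the lane-copy op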
4149   const RegisterBank &DstRB =
4150       *RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI);
4151   for (unsigned OpIdx = 0; OpIdx < NumElts; ++OpIdx) {
4152     Register Dst = I.getOperand(OpIdx).getReg();
4153     MachineInstr *Extract =
4154         emitExtractVectorElt(Dst, DstRB, NarrowTy, SrcReg, OpIdx, MIB);
4155     if (!Extract)
4156       return false;
4157   }
4158   I.eraseFromParent();
4159   return true;
4160 }
4161 
4162 bool AArch64InstructionSelector::selectUnmergeValues(MachineInstr &I,
4163                                                      MachineRegisterInfo &MRI) {
4164   assert(I.getOpcode() == TargetOpcode::G_UNMERGE_VALUES &&
4165          "unexpected opcode");
4166 
4167   // TODO: Handle unmerging into GPRs and from scalars to scalars.
4168   if (RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI)->getID() !=
4169           AArch64::FPRRegBankID ||
4170       RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() !=
4171           AArch64::FPRRegBankID) {
4172     LLVM_DEBUG(dbgs() << "Unmerging vector-to-gpr and scalar-to-scalar "
4173                          "currently unsupported.\n");
4174     return false;
4175   }
4176 
4177   // The last operand is the vector source register, and every other operand is
4178   // a register to unpack into.
4179   unsigned NumElts = I.getNumOperands() - 1;
4180   Register SrcReg = I.getOperand(NumElts).getReg();
4181   const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg());
4182   const LLT WideTy = MRI.getType(SrcReg);
4183   (void)WideTy;
4184   assert((WideTy.isVector() || WideTy.getSizeInBits() == 128) &&
4185          "can only unmerge from vector or s128 types!");
4186   assert(WideTy.getSizeInBits() > NarrowTy.getSizeInBits() &&
4187          "source register size too small!");
4188 
4189   if (!NarrowTy.isScalar())
4190     return selectSplitVectorUnmerge(I, MRI);
4191 
4192   // Choose a lane copy opcode and subregister based off of the size of the
4193   // vector's elements.
4194   unsigned CopyOpc = 0;
4195   unsigned ExtractSubReg = 0;
4196   if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, NarrowTy.getSizeInBits()))
4197     return false;
4198 
4199   // Set up for the lane copies.
4200   MachineBasicBlock &MBB = *I.getParent();
4201 
4202   // Stores the registers we'll be copying from.
4203   SmallVector<Register, 4> InsertRegs;
4204 
4205   // We'll use the first register twice, so we only need NumElts-1 registers.
4206   unsigned NumInsertRegs = NumElts - 1;
4207 
4208   // If our elements fit into exactly 128 bits, then we can copy from the source
4209   // directly. Otherwise, we need to do a bit of setup with some subregister
4210   // inserts.
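  // E.g. (sketch): unmerging a <4 x s32> into four s32 values can lane-copy
  // straight out of the 128-bit source, while unmerging a <2 x s32> (a 64-bit
  // register) first widens it, roughly:
  //   %undef:fpr128  = IMPLICIT_DEF
  //   %insert:fpr128 = INSERT_SUBREG %undef, %src, %subreg.dsub
  // and the lane copies then read from %insert.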
4211   if (NarrowTy.getSizeInBits() * NumElts == 128) {
4212     InsertRegs = SmallVector<Register, 4>(NumInsertRegs, SrcReg);
4213   } else {
4214     // Otherwise, perform subregister inserts. For each insert, create an
4215     // implicit def and a subregister insert, and save the register we create.
4216     const TargetRegisterClass *RC = getRegClassForTypeOnBank(
4217         LLT::fixed_vector(NumElts, WideTy.getScalarSizeInBits()),
4218         *RBI.getRegBank(SrcReg, MRI, TRI));
4219     unsigned SubReg = 0;
4220     bool Found = getSubRegForClass(RC, TRI, SubReg);
4221     (void)Found;
4222     assert(Found && "expected to find last operand's subreg idx");
4223     for (unsigned Idx = 0; Idx < NumInsertRegs; ++Idx) {
4224       Register ImpDefReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
4225       MachineInstr &ImpDefMI =
4226           *BuildMI(MBB, I, I.getDebugLoc(), TII.get(TargetOpcode::IMPLICIT_DEF),
4227                    ImpDefReg);
4228 
4229       // Now, create the subregister insert from SrcReg.
4230       Register InsertReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
4231       MachineInstr &InsMI =
4232           *BuildMI(MBB, I, I.getDebugLoc(),
4233                    TII.get(TargetOpcode::INSERT_SUBREG), InsertReg)
4234                .addUse(ImpDefReg)
4235                .addUse(SrcReg)
4236                .addImm(SubReg);
4237 
4238       constrainSelectedInstRegOperands(ImpDefMI, TII, TRI, RBI);
4239       constrainSelectedInstRegOperands(InsMI, TII, TRI, RBI);
4240 
4241       // Save the register so that we can copy from it after.
4242       InsertRegs.push_back(InsertReg);
4243     }
4244   }
4245 
4246   // Now that we've created any necessary subregister inserts, we can
4247   // create the copies.
4248   //
4249   // Perform the first copy separately as a subregister copy.
4250   Register CopyTo = I.getOperand(0).getReg();
4251   auto FirstCopy = MIB.buildInstr(TargetOpcode::COPY, {CopyTo}, {})
4252                        .addReg(InsertRegs[0], 0, ExtractSubReg);
4253   constrainSelectedInstRegOperands(*FirstCopy, TII, TRI, RBI);
4254 
4255   // Now, perform the remaining copies as vector lane copies.
4256   unsigned LaneIdx = 1;
4257   for (Register InsReg : InsertRegs) {
4258     Register CopyTo = I.getOperand(LaneIdx).getReg();
4259     MachineInstr &CopyInst =
4260         *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CopyOpc), CopyTo)
4261              .addUse(InsReg)
4262              .addImm(LaneIdx);
4263     constrainSelectedInstRegOperands(CopyInst, TII, TRI, RBI);
4264     ++LaneIdx;
4265   }
4266 
4267   // Separately constrain the first copy's destination. Because of the
4268   // limitation in constrainOperandRegClass, we can't guarantee that this will
4269   // actually be constrained. So, do it ourselves using the second operand.
4270   const TargetRegisterClass *RC =
4271       MRI.getRegClassOrNull(I.getOperand(1).getReg());
4272   if (!RC) {
4273     LLVM_DEBUG(dbgs() << "Couldn't constrain copy destination.\n");
4274     return false;
4275   }
4276 
4277   RBI.constrainGenericRegister(CopyTo, *RC, MRI);
4278   I.eraseFromParent();
4279   return true;
4280 }
4281 
4282 bool AArch64InstructionSelector::selectConcatVectors(
4283     MachineInstr &I, MachineRegisterInfo &MRI) {
4284   assert(I.getOpcode() == TargetOpcode::G_CONCAT_VECTORS &&
4285          "Unexpected opcode");
4286   Register Dst = I.getOperand(0).getReg();
4287   Register Op1 = I.getOperand(1).getReg();
4288   Register Op2 = I.getOperand(2).getReg();
4289   MachineInstr *ConcatMI = emitVectorConcat(Dst, Op1, Op2, MIB);
4290   if (!ConcatMI)
4291     return false;
4292   I.eraseFromParent();
4293   return true;
4294 }
4295 
4296 unsigned
4297 AArch64InstructionSelector::emitConstantPoolEntry(const Constant *CPVal,
4298                                                   MachineFunction &MF) const {
4299   Type *CPTy = CPVal->getType();
4300   Align Alignment = MF.getDataLayout().getPrefTypeAlign(CPTy);
4301 
4302   MachineConstantPool *MCP = MF.getConstantPool();
4303   return MCP->getConstantPoolIndex(CPVal, Alignment);
4304 }
4305 
4306 MachineInstr *AArch64InstructionSelector::emitLoadFromConstantPool(
4307     const Constant *CPVal, MachineIRBuilder &MIRBuilder) const {
4308   const TargetRegisterClass *RC;
4309   unsigned Opc;
4310   bool IsTiny = TM.getCodeModel() == CodeModel::Tiny;
4311   unsigned Size = MIRBuilder.getDataLayout().getTypeStoreSize(CPVal->getType());
4312   switch (Size) {
4313   case 16:
4314     RC = &AArch64::FPR128RegClass;
4315     Opc = IsTiny ? AArch64::LDRQl : AArch64::LDRQui;
4316     break;
4317   case 8:
4318     RC = &AArch64::FPR64RegClass;
4319     Opc = IsTiny ? AArch64::LDRDl : AArch64::LDRDui;
4320     break;
4321   case 4:
4322     RC = &AArch64::FPR32RegClass;
4323     Opc = IsTiny ? AArch64::LDRSl : AArch64::LDRSui;
4324     break;
4325   case 2:
4326     RC = &AArch64::FPR16RegClass;
4327     Opc = AArch64::LDRHui;
4328     break;
4329   default:
4330     LLVM_DEBUG(dbgs() << "Could not load from constant pool of type "
4331                       << *CPVal->getType());
4332     return nullptr;
4333   }
4334 
4335   MachineInstr *LoadMI = nullptr;
4336   auto &MF = MIRBuilder.getMF();
4337   unsigned CPIdx = emitConstantPoolEntry(CPVal, MF);
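  // E.g. (sketch): under the small code model a 128-bit constant is loaded
  // with the usual ADRP + LDR page/pageoff pair, roughly:
  //   adrp x8, .LCPI0_0
  //   ldr  q0, [x8, :lo12:.LCPI0_0]
  // while the tiny code model uses a single pc-relative LDR (literal) instead.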
4338   if (IsTiny && (Size == 16 || Size == 8 || Size == 4)) {
4339     // Use load(literal) for tiny code model.
4340     LoadMI = &*MIRBuilder.buildInstr(Opc, {RC}, {}).addConstantPoolIndex(CPIdx);
4341   } else {
4342     auto Adrp =
4343         MIRBuilder.buildInstr(AArch64::ADRP, {&AArch64::GPR64RegClass}, {})
4344             .addConstantPoolIndex(CPIdx, 0, AArch64II::MO_PAGE);
4345 
4346     LoadMI = &*MIRBuilder.buildInstr(Opc, {RC}, {Adrp})
4347                    .addConstantPoolIndex(
4348                        CPIdx, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
4349 
4350     constrainSelectedInstRegOperands(*Adrp, TII, TRI, RBI);
4351   }
4352 
4353   MachinePointerInfo PtrInfo = MachinePointerInfo::getConstantPool(MF);
4354   LoadMI->addMemOperand(MF, MF.getMachineMemOperand(PtrInfo,
4355                                                     MachineMemOperand::MOLoad,
4356                                                     Size, Align(Size)));
4357   constrainSelectedInstRegOperands(*LoadMI, TII, TRI, RBI);
4358   return LoadMI;
4359 }
4360 
4361 /// Return an <Opcode, SubregIndex> pair to do a vector elt insert of a given
4362 /// size and RB.
4363 static std::pair<unsigned, unsigned>
4364 getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) {
4365   unsigned Opc, SubregIdx;
4366   if (RB.getID() == AArch64::GPRRegBankID) {
4367     if (EltSize == 8) {
4368       Opc = AArch64::INSvi8gpr;
4369       SubregIdx = AArch64::bsub;
4370     } else if (EltSize == 16) {
4371       Opc = AArch64::INSvi16gpr;
4372       SubregIdx = AArch64::ssub;
4373     } else if (EltSize == 32) {
4374       Opc = AArch64::INSvi32gpr;
4375       SubregIdx = AArch64::ssub;
4376     } else if (EltSize == 64) {
4377       Opc = AArch64::INSvi64gpr;
4378       SubregIdx = AArch64::dsub;
4379     } else {
4380       llvm_unreachable("invalid elt size!");
4381     }
4382   } else {
4383     if (EltSize == 8) {
4384       Opc = AArch64::INSvi8lane;
4385       SubregIdx = AArch64::bsub;
4386     } else if (EltSize == 16) {
4387       Opc = AArch64::INSvi16lane;
4388       SubregIdx = AArch64::hsub;
4389     } else if (EltSize == 32) {
4390       Opc = AArch64::INSvi32lane;
4391       SubregIdx = AArch64::ssub;
4392     } else if (EltSize == 64) {
4393       Opc = AArch64::INSvi64lane;
4394       SubregIdx = AArch64::dsub;
4395     } else {
4396       llvm_unreachable("invalid elt size!");
4397     }
4398   }
4399   return std::make_pair(Opc, SubregIdx);
4400 }
4401 
4402 MachineInstr *AArch64InstructionSelector::emitInstr(
4403     unsigned Opcode, std::initializer_list<llvm::DstOp> DstOps,
4404     std::initializer_list<llvm::SrcOp> SrcOps, MachineIRBuilder &MIRBuilder,
4405     const ComplexRendererFns &RenderFns) const {
4406   assert(Opcode && "Expected an opcode?");
4407   assert(!isPreISelGenericOpcode(Opcode) &&
4408          "Function should only be used to produce selected instructions!");
4409   auto MI = MIRBuilder.buildInstr(Opcode, DstOps, SrcOps);
4410   if (RenderFns)
4411     for (auto &Fn : *RenderFns)
4412       Fn(MI);
4413   constrainSelectedInstRegOperands(*MI, TII, TRI, RBI);
4414   return &*MI;
4415 }
4416 
4417 MachineInstr *AArch64InstructionSelector::emitAddSub(
4418     const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode,
4419     Register Dst, MachineOperand &LHS, MachineOperand &RHS,
4420     MachineIRBuilder &MIRBuilder) const {
4421   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4422   assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4423   auto Ty = MRI.getType(LHS.getReg());
4424   assert(!Ty.isVector() && "Expected a scalar or pointer?");
4425   unsigned Size = Ty.getSizeInBits();
4426   assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit type only");
4427   bool Is32Bit = Size == 32;
4428 
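  // The OpcTable rows are, in order: ri (positive immediate), rs (shifted
  // register), rr (plain register), ri with the negated opcode (negative
  // immediate), and rx (extended register). E.g. (sketch): adding the
  // constant 4095 to a 64-bit value matches selectArithImmed and emits
  // roughly "ADDXri %x, 4095, 0".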
4429   // INSTRri form with positive arithmetic immediate.
4430   if (auto Fns = selectArithImmed(RHS))
4431     return emitInstr(AddrModeAndSizeToOpcode[0][Is32Bit], {Dst}, {LHS},
4432                      MIRBuilder, Fns);
4433 
4434   // INSTRri form with negative arithmetic immediate.
4435   if (auto Fns = selectNegArithImmed(RHS))
4436     return emitInstr(AddrModeAndSizeToOpcode[3][Is32Bit], {Dst}, {LHS},
4437                      MIRBuilder, Fns);
4438 
4439   // INSTRrx form.
4440   if (auto Fns = selectArithExtendedRegister(RHS))
4441     return emitInstr(AddrModeAndSizeToOpcode[4][Is32Bit], {Dst}, {LHS},
4442                      MIRBuilder, Fns);
4443 
4444   // INSTRrs form.
4445   if (auto Fns = selectShiftedRegister(RHS))
4446     return emitInstr(AddrModeAndSizeToOpcode[1][Is32Bit], {Dst}, {LHS},
4447                      MIRBuilder, Fns);
4448   return emitInstr(AddrModeAndSizeToOpcode[2][Is32Bit], {Dst}, {LHS, RHS},
4449                    MIRBuilder);
4450 }
4451 
4452 MachineInstr *
4453 AArch64InstructionSelector::emitADD(Register DefReg, MachineOperand &LHS,
4454                                     MachineOperand &RHS,
4455                                     MachineIRBuilder &MIRBuilder) const {
4456   const std::array<std::array<unsigned, 2>, 5> OpcTable{
4457       {{AArch64::ADDXri, AArch64::ADDWri},
4458        {AArch64::ADDXrs, AArch64::ADDWrs},
4459        {AArch64::ADDXrr, AArch64::ADDWrr},
4460        {AArch64::SUBXri, AArch64::SUBWri},
4461        {AArch64::ADDXrx, AArch64::ADDWrx}}};
4462   return emitAddSub(OpcTable, DefReg, LHS, RHS, MIRBuilder);
4463 }
4464 
4465 MachineInstr *
4466 AArch64InstructionSelector::emitADDS(Register Dst, MachineOperand &LHS,
4467                                      MachineOperand &RHS,
4468                                      MachineIRBuilder &MIRBuilder) const {
4469   const std::array<std::array<unsigned, 2>, 5> OpcTable{
4470       {{AArch64::ADDSXri, AArch64::ADDSWri},
4471        {AArch64::ADDSXrs, AArch64::ADDSWrs},
4472        {AArch64::ADDSXrr, AArch64::ADDSWrr},
4473        {AArch64::SUBSXri, AArch64::SUBSWri},
4474        {AArch64::ADDSXrx, AArch64::ADDSWrx}}};
4475   return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder);
4476 }
4477 
4478 MachineInstr *
4479 AArch64InstructionSelector::emitSUBS(Register Dst, MachineOperand &LHS,
4480                                      MachineOperand &RHS,
4481                                      MachineIRBuilder &MIRBuilder) const {
4482   const std::array<std::array<unsigned, 2>, 5> OpcTable{
4483       {{AArch64::SUBSXri, AArch64::SUBSWri},
4484        {AArch64::SUBSXrs, AArch64::SUBSWrs},
4485        {AArch64::SUBSXrr, AArch64::SUBSWrr},
4486        {AArch64::ADDSXri, AArch64::ADDSWri},
4487        {AArch64::SUBSXrx, AArch64::SUBSWrx}}};
4488   return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder);
4489 }
4490 
4491 MachineInstr *
4492 AArch64InstructionSelector::emitADCS(Register Dst, MachineOperand &LHS,
4493                                      MachineOperand &RHS,
4494                                      MachineIRBuilder &MIRBuilder) const {
4495   assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4496   MachineRegisterInfo *MRI = MIRBuilder.getMRI();
4497   bool Is32Bit = (MRI->getType(LHS.getReg()).getSizeInBits() == 32);
4498   static const unsigned OpcTable[2] = {AArch64::ADCSXr, AArch64::ADCSWr};
4499   return emitInstr(OpcTable[Is32Bit], {Dst}, {LHS, RHS}, MIRBuilder);
4500 }
4501 
4502 MachineInstr *
4503 AArch64InstructionSelector::emitSBCS(Register Dst, MachineOperand &LHS,
4504                                      MachineOperand &RHS,
4505                                      MachineIRBuilder &MIRBuilder) const {
4506   assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4507   MachineRegisterInfo *MRI = MIRBuilder.getMRI();
4508   bool Is32Bit = (MRI->getType(LHS.getReg()).getSizeInBits() == 32);
4509   static const unsigned OpcTable[2] = {AArch64::SBCSXr, AArch64::SBCSWr};
4510   return emitInstr(OpcTable[Is32Bit], {Dst}, {LHS, RHS}, MIRBuilder);
4511 }
4512 
4513 MachineInstr *
4514 AArch64InstructionSelector::emitCMN(MachineOperand &LHS, MachineOperand &RHS,
4515                                     MachineIRBuilder &MIRBuilder) const {
4516   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4517   bool Is32Bit = (MRI.getType(LHS.getReg()).getSizeInBits() == 32);
4518   auto RC = Is32Bit ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass;
4519   return emitADDS(MRI.createVirtualRegister(RC), LHS, RHS, MIRBuilder);
4520 }
4521 
4522 MachineInstr *
4523 AArch64InstructionSelector::emitTST(MachineOperand &LHS, MachineOperand &RHS,
4524                                     MachineIRBuilder &MIRBuilder) const {
4525   assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4526   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4527   LLT Ty = MRI.getType(LHS.getReg());
4528   unsigned RegSize = Ty.getSizeInBits();
4529   bool Is32Bit = (RegSize == 32);
4530   const unsigned OpcTable[3][2] = {{AArch64::ANDSXri, AArch64::ANDSWri},
4531                                    {AArch64::ANDSXrs, AArch64::ANDSWrs},
4532                                    {AArch64::ANDSXrr, AArch64::ANDSWrr}};
4533   // ANDS needs a logical immediate for its immediate form. Check if we can
4534   // fold one in.
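  // E.g. (sketch): for a G_AND of %x with 0xff compared against zero, this
  // emits an ANDSWri whose integer result is unused, which is effectively
  // "tst w0, #0xff" with the immediate in logical-immediate encoding.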
4535   if (auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS.getReg(), MRI)) {
4536     int64_t Imm = ValAndVReg->Value.getSExtValue();
4537 
4538     if (AArch64_AM::isLogicalImmediate(Imm, RegSize)) {
4539       auto TstMI = MIRBuilder.buildInstr(OpcTable[0][Is32Bit], {Ty}, {LHS});
4540       TstMI.addImm(AArch64_AM::encodeLogicalImmediate(Imm, RegSize));
4541       constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
4542       return &*TstMI;
4543     }
4544   }
4545 
4546   if (auto Fns = selectLogicalShiftedRegister(RHS))
4547     return emitInstr(OpcTable[1][Is32Bit], {Ty}, {LHS}, MIRBuilder, Fns);
4548   return emitInstr(OpcTable[2][Is32Bit], {Ty}, {LHS, RHS}, MIRBuilder);
4549 }
4550 
4551 MachineInstr *AArch64InstructionSelector::emitIntegerCompare(
4552     MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
4553     MachineIRBuilder &MIRBuilder) const {
4554   assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!");
4555   assert(Predicate.isPredicate() && "Expected predicate?");
4556   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4557   LLT CmpTy = MRI.getType(LHS.getReg());
4558   assert(!CmpTy.isVector() && "Expected scalar or pointer");
4559   unsigned Size = CmpTy.getSizeInBits();
4560   (void)Size;
4561   assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit LHS/RHS?");
4562   // Fold the compare into a cmn or tst if possible.
4563   if (auto FoldCmp = tryFoldIntegerCompare(LHS, RHS, Predicate, MIRBuilder))
4564     return FoldCmp;
4565   auto Dst = MRI.cloneVirtualRegister(LHS.getReg());
4566   return emitSUBS(Dst, LHS, RHS, MIRBuilder);
4567 }
4568 
4569 MachineInstr *AArch64InstructionSelector::emitCSetForFCmp(
4570     Register Dst, CmpInst::Predicate Pred, MachineIRBuilder &MIRBuilder) const {
4571   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4572 #ifndef NDEBUG
4573   LLT Ty = MRI.getType(Dst);
4574   assert(!Ty.isVector() && Ty.getSizeInBits() == 32 &&
4575          "Expected a 32-bit scalar register?");
4576 #endif
4577   const Register ZReg = AArch64::WZR;
4578   AArch64CC::CondCode CC1, CC2;
4579   changeFCMPPredToAArch64CC(Pred, CC1, CC2);
4580   auto InvCC1 = AArch64CC::getInvertedCondCode(CC1);
4581   if (CC2 == AArch64CC::AL)
4582     return emitCSINC(/*Dst=*/Dst, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC1,
4583                      MIRBuilder);
4584   const TargetRegisterClass *RC = &AArch64::GPR32RegClass;
4585   Register Def1Reg = MRI.createVirtualRegister(RC);
4586   Register Def2Reg = MRI.createVirtualRegister(RC);
4587   auto InvCC2 = AArch64CC::getInvertedCondCode(CC2);
4588   emitCSINC(/*Dst=*/Def1Reg, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC1, MIRBuilder);
4589   emitCSINC(/*Dst=*/Def2Reg, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC2, MIRBuilder);
4590   auto OrMI = MIRBuilder.buildInstr(AArch64::ORRWrr, {Dst}, {Def1Reg, Def2Reg});
4591   constrainSelectedInstRegOperands(*OrMI, TII, TRI, RBI);
4592   return &*OrMI;
4593 }
4594 
4595 MachineInstr *AArch64InstructionSelector::emitFPCompare(
4596     Register LHS, Register RHS, MachineIRBuilder &MIRBuilder,
4597     std::optional<CmpInst::Predicate> Pred) const {
4598   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4599   LLT Ty = MRI.getType(LHS);
4600   if (Ty.isVector())
4601     return nullptr;
4602   unsigned OpSize = Ty.getSizeInBits();
4603   if (OpSize != 32 && OpSize != 64)
4604     return nullptr;
4605 
4606   // If this is a compare against +0.0, then we don't have
4607   // to explicitly materialize a constant.
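  // E.g. (sketch): a 32-bit G_FCMP against +0.0 selects to FCMPSri
  // ("fcmp s0, #0.0") rather than materializing the constant for FCMPSrr.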
4608   const ConstantFP *FPImm = getConstantFPVRegVal(RHS, MRI);
4609   bool ShouldUseImm = FPImm && (FPImm->isZero() && !FPImm->isNegative());
4610 
4611   auto IsEqualityPred = [](CmpInst::Predicate P) {
4612     return P == CmpInst::FCMP_OEQ || P == CmpInst::FCMP_ONE ||
4613            P == CmpInst::FCMP_UEQ || P == CmpInst::FCMP_UNE;
4614   };
4615   if (!ShouldUseImm && Pred && IsEqualityPred(*Pred)) {
4616     // Try commutating the operands.
4617     const ConstantFP *LHSImm = getConstantFPVRegVal(LHS, MRI);
4618     if (LHSImm && (LHSImm->isZero() && !LHSImm->isNegative())) {
4619       ShouldUseImm = true;
4620       std::swap(LHS, RHS);
4621     }
4622   }
4623   unsigned CmpOpcTbl[2][2] = {{AArch64::FCMPSrr, AArch64::FCMPDrr},
4624                               {AArch64::FCMPSri, AArch64::FCMPDri}};
4625   unsigned CmpOpc = CmpOpcTbl[ShouldUseImm][OpSize == 64];
4626 
4627   // Partially build the compare. Decide if we need to add a use for the
4628   // third operand based off whether or not we're comparing against 0.0.
4629   auto CmpMI = MIRBuilder.buildInstr(CmpOpc).addUse(LHS);
4630   CmpMI.setMIFlags(MachineInstr::NoFPExcept);
4631   if (!ShouldUseImm)
4632     CmpMI.addUse(RHS);
4633   constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI);
4634   return &*CmpMI;
4635 }
4636 
4637 MachineInstr *AArch64InstructionSelector::emitVectorConcat(
4638     std::optional<Register> Dst, Register Op1, Register Op2,
4639     MachineIRBuilder &MIRBuilder) const {
4640   // We implement a vector concat by:
4641   // 1. Use scalar_to_vector to insert the lower vector into the larger dest
4642   // 2. Insert the upper vector into the destination's upper element
4643   // TODO: some of this code is common with G_BUILD_VECTOR handling.
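  // A rough sketch for concatenating two <2 x s32> values into a <4 x s32>
  // (illustrative, not verbatim MIR):
  //   %u1:fpr128 = IMPLICIT_DEF
  //   %w1:fpr128 = INSERT_SUBREG %u1, %op1, %subreg.dsub
  //   %u2:fpr128 = IMPLICIT_DEF
  //   %w2:fpr128 = INSERT_SUBREG %u2, %op2, %subreg.dsub
  //   %dst:fpr128 = INSvi64lane %w1, 1, %w2, 0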
4644   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4645 
4646   const LLT Op1Ty = MRI.getType(Op1);
4647   const LLT Op2Ty = MRI.getType(Op2);
4648 
4649   if (Op1Ty != Op2Ty) {
4650     LLVM_DEBUG(dbgs() << "Could not do vector concat of differing vector tys");
4651     return nullptr;
4652   }
4653   assert(Op1Ty.isVector() && "Expected a vector for vector concat");
4654 
4655   if (Op1Ty.getSizeInBits() >= 128) {
4656     LLVM_DEBUG(dbgs() << "Vector concat not supported for full size vectors");
4657     return nullptr;
4658   }
4659 
4660   // At the moment we just support 64 bit vector concats.
4661   if (Op1Ty.getSizeInBits() != 64) {
4662     LLVM_DEBUG(dbgs() << "Vector concat only supported for 64b vectors\n");
4663     return nullptr;
4664   }
4665 
4666   const LLT ScalarTy = LLT::scalar(Op1Ty.getSizeInBits());
4667   const RegisterBank &FPRBank = *RBI.getRegBank(Op1, MRI, TRI);
4668   const TargetRegisterClass *DstRC =
4669       getRegClassForTypeOnBank(Op1Ty.multiplyElements(2), FPRBank);
4670 
4671   MachineInstr *WidenedOp1 =
4672       emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op1, MIRBuilder);
4673   MachineInstr *WidenedOp2 =
4674       emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op2, MIRBuilder);
4675   if (!WidenedOp1 || !WidenedOp2) {
4676     LLVM_DEBUG(dbgs() << "Could not emit a vector from scalar value");
4677     return nullptr;
4678   }
4679 
4680   // Now do the insert of the upper element.
4681   unsigned InsertOpc, InsSubRegIdx;
4682   std::tie(InsertOpc, InsSubRegIdx) =
4683       getInsertVecEltOpInfo(FPRBank, ScalarTy.getSizeInBits());
4684 
4685   if (!Dst)
4686     Dst = MRI.createVirtualRegister(DstRC);
4687   auto InsElt =
4688       MIRBuilder
4689           .buildInstr(InsertOpc, {*Dst}, {WidenedOp1->getOperand(0).getReg()})
4690           .addImm(1) /* Lane index */
4691           .addUse(WidenedOp2->getOperand(0).getReg())
4692           .addImm(0);
4693   constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI);
4694   return &*InsElt;
4695 }
4696 
4697 MachineInstr *
4698 AArch64InstructionSelector::emitCSINC(Register Dst, Register Src1,
4699                                       Register Src2, AArch64CC::CondCode Pred,
4700                                       MachineIRBuilder &MIRBuilder) const {
4701   auto &MRI = *MIRBuilder.getMRI();
4702   const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Dst);
4703   // If we used a register class, then this won't necessarily have an LLT.
4704   // Compute the size based off whether or not we have a class or bank.
4705   unsigned Size;
4706   if (const auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
4707     Size = TRI.getRegSizeInBits(*RC);
4708   else
4709     Size = MRI.getType(Dst).getSizeInBits();
4710   // Some opcodes use s1.
4711   assert(Size <= 64 && "Expected 64 bits or less only!");
4712   static const unsigned OpcTable[2] = {AArch64::CSINCWr, AArch64::CSINCXr};
4713   unsigned Opc = OpcTable[Size == 64];
4714   auto CSINC = MIRBuilder.buildInstr(Opc, {Dst}, {Src1, Src2}).addImm(Pred);
4715   constrainSelectedInstRegOperands(*CSINC, TII, TRI, RBI);
4716   return &*CSINC;
4717 }
4718 
4719 MachineInstr *AArch64InstructionSelector::emitCarryIn(MachineInstr &I,
4720                                                       Register CarryReg) {
4721   MachineRegisterInfo *MRI = MIB.getMRI();
4722   unsigned Opcode = I.getOpcode();
4723 
4724   // If the instruction is a SUB, we need to negate the carry,
4725   // because borrowing is indicated by carry-flag == 0.
4726   bool NeedsNegatedCarry =
4727       (Opcode == TargetOpcode::G_USUBE || Opcode == TargetOpcode::G_SSUBE);
4728 
4729   // If the previous instruction will already produce the correct carry, do not
4730   // emit a carry generating instruction. E.g. for G_UADDE/G_USUBE sequences
4731   // generated during legalization of wide add/sub. This optimization depends on
4732   // these sequences not being interrupted by other instructions.
4733   // We have to select the previous instruction before the carry-using
4734   // instruction is deleted by the calling function, otherwise the previous
4735   // instruction might become dead and would get deleted.
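  // E.g. (sketch): for a 128-bit addition legalized as
  //   %lo:_(s64), %c:_(s1) = G_UADDO %a_lo, %b_lo
  //   %hi:_(s64), %o:_(s1) = G_UADDE %a_hi, %b_hi, %c
  // the G_UADDO immediately above already left the carry in NZCV, so no extra
  // carry-setting instruction is needed here.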
4736   MachineInstr *SrcMI = MRI->getVRegDef(CarryReg);
4737   if (SrcMI == I.getPrevNode()) {
4738     if (auto *CarrySrcMI = dyn_cast<GAddSubCarryOut>(SrcMI)) {
4739       bool ProducesNegatedCarry = CarrySrcMI->isSub();
4740       if (NeedsNegatedCarry == ProducesNegatedCarry &&
4741           CarrySrcMI->isUnsigned() &&
4742           CarrySrcMI->getCarryOutReg() == CarryReg &&
4743           selectAndRestoreState(*SrcMI))
4744         return nullptr;
4745     }
4746   }
4747 
4748   Register DeadReg = MRI->createVirtualRegister(&AArch64::GPR32RegClass);
4749 
4750   if (NeedsNegatedCarry) {
4751     // (0 - Carry) sets !C in NZCV when Carry == 1
4752     Register ZReg = AArch64::WZR;
4753     return emitInstr(AArch64::SUBSWrr, {DeadReg}, {ZReg, CarryReg}, MIB);
4754   }
4755 
4756   // (Carry - 1) sets !C in NZCV when Carry == 0
4757   auto Fns = select12BitValueWithLeftShift(1);
4758   return emitInstr(AArch64::SUBSWri, {DeadReg}, {CarryReg}, MIB, Fns);
4759 }
4760 
4761 bool AArch64InstructionSelector::selectOverflowOp(MachineInstr &I,
4762                                                   MachineRegisterInfo &MRI) {
4763   auto &CarryMI = cast<GAddSubCarryOut>(I);
4764 
4765   if (auto *CarryInMI = dyn_cast<GAddSubCarryInOut>(&I)) {
4766     // Set NZCV carry according to carry-in VReg
4767     emitCarryIn(I, CarryInMI->getCarryInReg());
4768   }
4769 
4770   // Emit the operation and get the correct condition code.
4771   auto OpAndCC = emitOverflowOp(I.getOpcode(), CarryMI.getDstReg(),
4772                                 CarryMI.getLHS(), CarryMI.getRHS(), MIB);
4773 
4774   Register CarryOutReg = CarryMI.getCarryOutReg();
4775 
4776   // Don't convert carry-out to VReg if it is never used
4777   if (!MRI.use_nodbg_empty(CarryOutReg)) {
4778     // Now, put the overflow result in the register given by the first operand
4779     // to the overflow op. CSINC increments the result when the predicate is
4780     // false, so to get the increment when it's true, we need to use the
4781     // inverse. In this case, we want to increment when carry is set.
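    // E.g. (sketch): for G_UADDO, overflow corresponds to HS (carry set), so
    // this emits roughly "csinc w0, wzr, wzr, lo", i.e. "cset w0, hs".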
4782     Register ZReg = AArch64::WZR;
4783     emitCSINC(/*Dst=*/CarryOutReg, /*Src1=*/ZReg, /*Src2=*/ZReg,
4784               getInvertedCondCode(OpAndCC.second), MIB);
4785   }
4786 
4787   I.eraseFromParent();
4788   return true;
4789 }
4790 
4791 std::pair<MachineInstr *, AArch64CC::CondCode>
4792 AArch64InstructionSelector::emitOverflowOp(unsigned Opcode, Register Dst,
4793                                            MachineOperand &LHS,
4794                                            MachineOperand &RHS,
4795                                            MachineIRBuilder &MIRBuilder) const {
4796   switch (Opcode) {
4797   default:
4798     llvm_unreachable("Unexpected opcode!");
4799   case TargetOpcode::G_SADDO:
4800     return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
4801   case TargetOpcode::G_UADDO:
4802     return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::HS);
4803   case TargetOpcode::G_SSUBO:
4804     return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
4805   case TargetOpcode::G_USUBO:
4806     return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::LO);
4807   case TargetOpcode::G_SADDE:
4808     return std::make_pair(emitADCS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
4809   case TargetOpcode::G_UADDE:
4810     return std::make_pair(emitADCS(Dst, LHS, RHS, MIRBuilder), AArch64CC::HS);
4811   case TargetOpcode::G_SSUBE:
4812     return std::make_pair(emitSBCS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
4813   case TargetOpcode::G_USUBE:
4814     return std::make_pair(emitSBCS(Dst, LHS, RHS, MIRBuilder), AArch64CC::LO);
4815   }
4816 }
4817 
4818 /// Returns true if @p Val is a tree of AND/OR/CMP operations that can be
4819 /// expressed as a conjunction.
4820 /// \param CanNegate    Set to true if we can negate the whole sub-tree just by
4821 ///                     changing the conditions on the CMP tests.
4822 ///                     (this means we can call emitConjunctionRec() with
4823 ///                      Negate==true on this sub-tree)
4824 /// \param MustBeFirst  Set to true if this subtree needs to be negated and we
4825 ///                     cannot do the negation naturally. We are required to
4826 ///                     emit the subtree first in this case.
4827 /// \param WillNegate   Is true if we are called when the result of this
4828 ///                     subexpression must be negated. This happens when the
4829 ///                     outer expression is an OR. We can use this fact to know
4830 ///                     that we have a double negation (or (or ...) ...) that
4831 ///                     can be implemented for free.
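///
/// E.g. (illustrative): for
///   %c = G_AND (G_ICMP eq, %a, %b), (G_FCMP olt, %x, %y)
/// both leaves are compares, so the tree can be emitted as one compare
/// followed by a conditional compare (CCMP/FCCMP).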
4832 static bool canEmitConjunction(Register Val, bool &CanNegate, bool &MustBeFirst,
4833                                bool WillNegate, MachineRegisterInfo &MRI,
4834                                unsigned Depth = 0) {
4835   if (!MRI.hasOneNonDBGUse(Val))
4836     return false;
4837   MachineInstr *ValDef = MRI.getVRegDef(Val);
4838   unsigned Opcode = ValDef->getOpcode();
4839   if (isa<GAnyCmp>(ValDef)) {
4840     CanNegate = true;
4841     MustBeFirst = false;
4842     return true;
4843   }
4844   // Protect against exponential runtime and stack overflow.
4845   if (Depth > 6)
4846     return false;
4847   if (Opcode == TargetOpcode::G_AND || Opcode == TargetOpcode::G_OR) {
4848     bool IsOR = Opcode == TargetOpcode::G_OR;
4849     Register O0 = ValDef->getOperand(1).getReg();
4850     Register O1 = ValDef->getOperand(2).getReg();
4851     bool CanNegateL;
4852     bool MustBeFirstL;
4853     if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, MRI, Depth + 1))
4854       return false;
4855     bool CanNegateR;
4856     bool MustBeFirstR;
4857     if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, MRI, Depth + 1))
4858       return false;
4859 
4860     if (MustBeFirstL && MustBeFirstR)
4861       return false;
4862 
4863     if (IsOR) {
4864       // For an OR expression we need to be able to naturally negate at least
4865       // one side or we cannot do the transformation at all.
4866       if (!CanNegateL && !CanNegateR)
4867         return false;
4868       // If the result of the OR will be negated and we can naturally negate
4869       // the leaves, then this sub-tree as a whole negates naturally.
4870       CanNegate = WillNegate && CanNegateL && CanNegateR;
4871       // If we cannot naturally negate the whole sub-tree, then this must be
4872       // emitted first.
4873       MustBeFirst = !CanNegate;
4874     } else {
4875       assert(Opcode == TargetOpcode::G_AND && "Must be G_AND");
4876       // We cannot naturally negate an AND operation.
4877       CanNegate = false;
4878       MustBeFirst = MustBeFirstL || MustBeFirstR;
4879     }
4880     return true;
4881   }
4882   return false;
4883 }
4884 
4885 MachineInstr *AArch64InstructionSelector::emitConditionalComparison(
4886     Register LHS, Register RHS, CmpInst::Predicate CC,
4887     AArch64CC::CondCode Predicate, AArch64CC::CondCode OutCC,
4888     MachineIRBuilder &MIB) const {
4889   // TODO: emit CMN as an optimization.
4890   auto &MRI = *MIB.getMRI();
4891   LLT OpTy = MRI.getType(LHS);
4892   assert(OpTy.getSizeInBits() == 32 || OpTy.getSizeInBits() == 64);
4893   unsigned CCmpOpc;
4894   std::optional<ValueAndVReg> C;
4895   if (CmpInst::isIntPredicate(CC)) {
4896     C = getIConstantVRegValWithLookThrough(RHS, MRI);
4897     if (C && C->Value.ult(32))
4898       CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWi : AArch64::CCMPXi;
4899     else
4900       CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWr : AArch64::CCMPXr;
4901   } else {
4902     switch (OpTy.getSizeInBits()) {
4903     case 16:
4904       CCmpOpc = AArch64::FCCMPHrr;
4905       break;
4906     case 32:
4907       CCmpOpc = AArch64::FCCMPSrr;
4908       break;
4909     case 64:
4910       CCmpOpc = AArch64::FCCMPDrr;
4911       break;
4912     default:
4913       return nullptr;
4914     }
4915   }
4916   AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
4917   unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
4918   auto CCmp =
4919       MIB.buildInstr(CCmpOpc, {}, {LHS});
4920   if (CCmpOpc == AArch64::CCMPWi || CCmpOpc == AArch64::CCMPXi)
4921     CCmp.addImm(C->Value.getZExtValue());
4922   else
4923     CCmp.addReg(RHS);
4924   CCmp.addImm(NZCV).addImm(Predicate);
4925   constrainSelectedInstRegOperands(*CCmp, TII, TRI, RBI);
4926   return &*CCmp;
4927 }
4928 
4929 MachineInstr *AArch64InstructionSelector::emitConjunctionRec(
4930     Register Val, AArch64CC::CondCode &OutCC, bool Negate, Register CCOp,
4931     AArch64CC::CondCode Predicate, MachineIRBuilder &MIB) const {
4932   // We're at a tree leaf, produce a conditional comparison operation.
4933   auto &MRI = *MIB.getMRI();
4934   MachineInstr *ValDef = MRI.getVRegDef(Val);
4935   unsigned Opcode = ValDef->getOpcode();
4936   if (auto *Cmp = dyn_cast<GAnyCmp>(ValDef)) {
4937     Register LHS = Cmp->getLHSReg();
4938     Register RHS = Cmp->getRHSReg();
4939     CmpInst::Predicate CC = Cmp->getCond();
4940     if (Negate)
4941       CC = CmpInst::getInversePredicate(CC);
4942     if (isa<GICmp>(Cmp)) {
4943       OutCC = changeICMPPredToAArch64CC(CC);
4944     } else {
4945       // Handle special FP cases.
4946       AArch64CC::CondCode ExtraCC;
4947       changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
4948       // Some floating point conditions can't be tested with a single condition
4949       // code. Construct an additional comparison in this case.
4950       if (ExtraCC != AArch64CC::AL) {
4951         MachineInstr *ExtraCmp;
4952         if (!CCOp)
4953           ExtraCmp = emitFPCompare(LHS, RHS, MIB, CC);
4954         else
4955           ExtraCmp =
4956               emitConditionalComparison(LHS, RHS, CC, Predicate, ExtraCC, MIB);
4957         CCOp = ExtraCmp->getOperand(0).getReg();
4958         Predicate = ExtraCC;
4959       }
4960     }
4961 
4962     // Produce a normal comparison if we are first in the chain
4963     if (!CCOp) {
4964       auto Dst = MRI.cloneVirtualRegister(LHS);
4965       if (isa<GICmp>(Cmp))
4966         return emitSUBS(Dst, Cmp->getOperand(2), Cmp->getOperand(3), MIB);
4967       return emitFPCompare(Cmp->getOperand(2).getReg(),
4968                            Cmp->getOperand(3).getReg(), MIB);
4969     }
4970     // Otherwise produce a ccmp.
4971     return emitConditionalComparison(LHS, RHS, CC, Predicate, OutCC, MIB);
4972   }
4973   assert(MRI.hasOneNonDBGUse(Val) && "Valid conjunction/disjunction tree");
4974 
4975   bool IsOR = Opcode == TargetOpcode::G_OR;
4976 
4977   Register LHS = ValDef->getOperand(1).getReg();
4978   bool CanNegateL;
4979   bool MustBeFirstL;
4980   bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR, MRI);
4981   assert(ValidL && "Valid conjunction/disjunction tree");
4982   (void)ValidL;
4983 
4984   Register RHS = ValDef->getOperand(2).getReg();
4985   bool CanNegateR;
4986   bool MustBeFirstR;
4987   bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR, MRI);
4988   assert(ValidR && "Valid conjunction/disjunction tree");
4989   (void)ValidR;
4990 
4991   // Swap sub-tree that must come first to the right side.
4992   if (MustBeFirstL) {
4993     assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
4994     std::swap(LHS, RHS);
4995     std::swap(CanNegateL, CanNegateR);
4996     std::swap(MustBeFirstL, MustBeFirstR);
4997   }
4998 
4999   bool NegateR;
5000   bool NegateAfterR;
5001   bool NegateL;
5002   bool NegateAfterAll;
5003   if (Opcode == TargetOpcode::G_OR) {
5004     // Swap the sub-tree that we can negate naturally to the left.
5005     if (!CanNegateL) {
5006       assert(CanNegateR && "at least one side must be negatable");
5007       assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
5008       assert(!Negate);
5009       std::swap(LHS, RHS);
5010       NegateR = false;
5011       NegateAfterR = true;
5012     } else {
5013       // Negate the right sub-tree if possible, otherwise negate its result.
5014       NegateR = CanNegateR;
5015       NegateAfterR = !CanNegateR;
5016     }
5017     NegateL = true;
5018     NegateAfterAll = !Negate;
5019   } else {
5020     assert(Opcode == TargetOpcode::G_AND &&
5021            "Valid conjunction/disjunction tree");
5022     assert(!Negate && "Valid conjunction/disjunction tree");
5023 
5024     NegateL = false;
5025     NegateR = false;
5026     NegateAfterR = false;
5027     NegateAfterAll = false;
5028   }
5029 
5030   // Emit sub-trees.
5031   AArch64CC::CondCode RHSCC;
5032   MachineInstr *CmpR =
5033       emitConjunctionRec(RHS, RHSCC, NegateR, CCOp, Predicate, MIB);
5034   if (NegateAfterR)
5035     RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
5036   MachineInstr *CmpL = emitConjunctionRec(
5037       LHS, OutCC, NegateL, CmpR->getOperand(0).getReg(), RHSCC, MIB);
5038   if (NegateAfterAll)
5039     OutCC = AArch64CC::getInvertedCondCode(OutCC);
5040   return CmpL;
5041 }
5042 
5043 MachineInstr *AArch64InstructionSelector::emitConjunction(
5044     Register Val, AArch64CC::CondCode &OutCC, MachineIRBuilder &MIB) const {
5045   bool DummyCanNegate;
5046   bool DummyMustBeFirst;
5047   if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false,
5048                           *MIB.getMRI()))
5049     return nullptr;
5050   return emitConjunctionRec(Val, OutCC, false, Register(), AArch64CC::AL, MIB);
5051 }
5052 
5053 bool AArch64InstructionSelector::tryOptSelectConjunction(GSelect &SelI,
5054                                                          MachineInstr &CondMI) {
5055   AArch64CC::CondCode AArch64CC;
5056   MachineInstr *ConjMI = emitConjunction(SelI.getCondReg(), AArch64CC, MIB);
5057   if (!ConjMI)
5058     return false;
5059 
5060   emitSelect(SelI.getReg(0), SelI.getTrueReg(), SelI.getFalseReg(), AArch64CC, MIB);
5061   SelI.eraseFromParent();
5062   return true;
5063 }
5064 
5065 bool AArch64InstructionSelector::tryOptSelect(GSelect &I) {
5066   MachineRegisterInfo &MRI = *MIB.getMRI();
5067   // We want to recognize this pattern:
5068   //
5069   // $z = G_FCMP pred, $x, $y
5070   // ...
5071   // $w = G_SELECT $z, $a, $b
5072   //
5073   // Where the value of $z is *only* ever used by the G_SELECT (possibly with
5074   // some copies/truncs in between).
5075   //
5076   // If we see this, then we can emit something like this:
5077   //
5078   // fcmp $x, $y
5079   // fcsel $w, $a, $b, pred
5080   //
5081   // Rather than emitting both of the rather long sequences in the standard
5082   // G_FCMP/G_SELECT select methods.
5083 
5084   // First, check if the condition is defined by a compare.
5085   MachineInstr *CondDef = MRI.getVRegDef(I.getOperand(1).getReg());
5086 
5087   // We can only fold if all of the defs have one use.
5088   Register CondDefReg = CondDef->getOperand(0).getReg();
5089   if (!MRI.hasOneNonDBGUse(CondDefReg)) {
5090     // Unless it's another select.
5091     for (const MachineInstr &UI : MRI.use_nodbg_instructions(CondDefReg)) {
5092       if (CondDef == &UI)
5093         continue;
5094       if (UI.getOpcode() != TargetOpcode::G_SELECT)
5095         return false;
5096     }
5097   }
5098 
5099   // Is the condition defined by a compare?
5100   unsigned CondOpc = CondDef->getOpcode();
5101   if (CondOpc != TargetOpcode::G_ICMP && CondOpc != TargetOpcode::G_FCMP) {
5102     if (tryOptSelectConjunction(I, *CondDef))
5103       return true;
5104     return false;
5105   }
5106 
5107   AArch64CC::CondCode CondCode;
5108   if (CondOpc == TargetOpcode::G_ICMP) {
5109     auto Pred =
5110         static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate());
5111     CondCode = changeICMPPredToAArch64CC(Pred);
5112     emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3),
5113                        CondDef->getOperand(1), MIB);
5114   } else {
5115     // Get the condition code for the select.
5116     auto Pred =
5117         static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate());
5118     AArch64CC::CondCode CondCode2;
5119     changeFCMPPredToAArch64CC(Pred, CondCode, CondCode2);
5120 
5121     // changeFCMPPredToAArch64CC sets CondCode2 to something other than AL
5122     // when we require two instructions to emit the comparison.
5123     // TODO: Handle FCMP_UEQ and FCMP_ONE. After that, this check will be
5124     // unnecessary.
5125     if (CondCode2 != AArch64CC::AL)
5126       return false;
5127 
5128     if (!emitFPCompare(CondDef->getOperand(2).getReg(),
5129                        CondDef->getOperand(3).getReg(), MIB)) {
5130       LLVM_DEBUG(dbgs() << "Couldn't emit compare for select!\n");
5131       return false;
5132     }
5133   }
5134 
5135   // Emit the select.
5136   emitSelect(I.getOperand(0).getReg(), I.getOperand(2).getReg(),
5137              I.getOperand(3).getReg(), CondCode, MIB);
5138   I.eraseFromParent();
5139   return true;
5140 }
5141 
5142 MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare(
5143     MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
5144     MachineIRBuilder &MIRBuilder) const {
5145   assert(LHS.isReg() && RHS.isReg() && Predicate.isPredicate() &&
5146          "Unexpected MachineOperand");
5147   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
5148   // We want to find this sort of thing:
5149   // x = G_SUB 0, y
5150   // G_ICMP z, x
5151   //
5152   // In this case, we can fold the G_SUB into the G_ICMP using a CMN instead.
5153   // e.g:
5154   //
5155   // cmn z, y
5156 
5157   // Check if the RHS or LHS of the G_ICMP is defined by a SUB
5158   MachineInstr *LHSDef = getDefIgnoringCopies(LHS.getReg(), MRI);
5159   MachineInstr *RHSDef = getDefIgnoringCopies(RHS.getReg(), MRI);
5160   auto P = static_cast<CmpInst::Predicate>(Predicate.getPredicate());
5161   // Given this:
5162   //
5163   // x = G_SUB 0, y
5164   // G_ICMP x, z
5165   //
5166   // Produce this:
5167   //
5168   // cmn y, z
5169   if (isCMN(LHSDef, P, MRI))
5170     return emitCMN(LHSDef->getOperand(2), RHS, MIRBuilder);
5171 
5172   // Same idea here, but with the RHS of the compare instead:
5173   //
5174   // Given this:
5175   //
5176   // x = G_SUB 0, y
5177   // G_ICMP z, x
5178   //
5179   // Produce this:
5180   //
5181   // cmn z, y
5182   if (isCMN(RHSDef, P, MRI))
5183     return emitCMN(LHS, RHSDef->getOperand(2), MIRBuilder);
5184 
5185   // Given this:
5186   //
5187   // z = G_AND x, y
5188   // G_ICMP z, 0
5189   //
5190   // Produce this if the compare is signed:
5191   //
5192   // tst x, y
5193   if (!CmpInst::isUnsigned(P) && LHSDef &&
5194       LHSDef->getOpcode() == TargetOpcode::G_AND) {
5195     // Make sure that the RHS is 0.
5196     auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS.getReg(), MRI);
5197     if (!ValAndVReg || ValAndVReg->Value != 0)
5198       return nullptr;
5199 
5200     return emitTST(LHSDef->getOperand(1),
5201                    LHSDef->getOperand(2), MIRBuilder);
5202   }
5203 
5204   return nullptr;
5205 }
5206 
5207 bool AArch64InstructionSelector::selectShuffleVector(
5208     MachineInstr &I, MachineRegisterInfo &MRI) {
5209   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
5210   Register Src1Reg = I.getOperand(1).getReg();
5211   const LLT Src1Ty = MRI.getType(Src1Reg);
5212   Register Src2Reg = I.getOperand(2).getReg();
5213   const LLT Src2Ty = MRI.getType(Src2Reg);
5214   ArrayRef<int> Mask = I.getOperand(3).getShuffleMask();
5215 
5216   MachineBasicBlock &MBB = *I.getParent();
5217   MachineFunction &MF = *MBB.getParent();
5218   LLVMContext &Ctx = MF.getFunction().getContext();
5219 
5220   // G_SHUFFLE_VECTOR is weird in that the source operands can be scalars, if
5221   // it originated from a <1 x T> type. Those should have been lowered into
5222   // G_BUILD_VECTOR earlier.
5223   if (!Src1Ty.isVector() || !Src2Ty.isVector()) {
5224     LLVM_DEBUG(dbgs() << "Could not select a \"scalar\" G_SHUFFLE_VECTOR\n");
5225     return false;
5226   }
5227 
5228   unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8;
5229 
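  // E.g. (illustrative): a <2 x s64> shuffle with mask <1, 0> expands to the
  // byte-index vector <8, 9, ..., 15, 0, 1, ..., 7> that TBL consumes.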
5230   SmallVector<Constant *, 64> CstIdxs;
5231   for (int Val : Mask) {
5232     // For now, any undef indexes we'll just assume to be 0. This should be
5233     // optimized in the future, e.g. to select DUP etc.
5234     Val = Val < 0 ? 0 : Val;
5235     for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
5236       unsigned Offset = Byte + Val * BytesPerElt;
5237       CstIdxs.emplace_back(ConstantInt::get(Type::getInt8Ty(Ctx), Offset));
5238     }
5239   }
5240 
5241   // Use a constant pool to load the index vector for TBL.
5242   Constant *CPVal = ConstantVector::get(CstIdxs);
5243   MachineInstr *IndexLoad = emitLoadFromConstantPool(CPVal, MIB);
5244   if (!IndexLoad) {
5245     LLVM_DEBUG(dbgs() << "Could not load from a constant pool");
5246     return false;
5247   }
5248 
5249   if (DstTy.getSizeInBits() != 128) {
5250     assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty");
5251     // This case can be done with TBL1.
5252     MachineInstr *Concat =
5253         emitVectorConcat(std::nullopt, Src1Reg, Src2Reg, MIB);
5254     if (!Concat) {
5255       LLVM_DEBUG(dbgs() << "Could not do vector concat for tbl1");
5256       return false;
5257     }
5258 
5259     // The constant pool load is 64 bits, so convert it to an FPR128 register.
5260     IndexLoad = emitScalarToVector(64, &AArch64::FPR128RegClass,
5261                                    IndexLoad->getOperand(0).getReg(), MIB);
5262 
5263     auto TBL1 = MIB.buildInstr(
5264         AArch64::TBLv16i8One, {&AArch64::FPR128RegClass},
5265         {Concat->getOperand(0).getReg(), IndexLoad->getOperand(0).getReg()});
5266     constrainSelectedInstRegOperands(*TBL1, TII, TRI, RBI);
5267 
5268     auto Copy =
5269         MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {})
5270             .addReg(TBL1.getReg(0), 0, AArch64::dsub);
5271     RBI.constrainGenericRegister(Copy.getReg(0), AArch64::FPR64RegClass, MRI);
5272     I.eraseFromParent();
5273     return true;
5274   }
5275 
5276   // For TBL2 we need to emit a REG_SEQUENCE to tie together two consecutive
5277   // Q registers for regalloc.
5278   SmallVector<Register, 2> Regs = {Src1Reg, Src2Reg};
5279   auto RegSeq = createQTuple(Regs, MIB);
5280   auto TBL2 = MIB.buildInstr(AArch64::TBLv16i8Two, {I.getOperand(0)},
5281                              {RegSeq, IndexLoad->getOperand(0)});
5282   constrainSelectedInstRegOperands(*TBL2, TII, TRI, RBI);
5283   I.eraseFromParent();
5284   return true;
5285 }
5286 
5287 MachineInstr *AArch64InstructionSelector::emitLaneInsert(
5288     std::optional<Register> DstReg, Register SrcReg, Register EltReg,
5289     unsigned LaneIdx, const RegisterBank &RB,
5290     MachineIRBuilder &MIRBuilder) const {
5291   MachineInstr *InsElt = nullptr;
5292   const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
5293   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
5294 
5295   // Create a register to define with the insert if one wasn't passed in.
5296   if (!DstReg)
5297     DstReg = MRI.createVirtualRegister(DstRC);
5298 
5299   unsigned EltSize = MRI.getType(EltReg).getSizeInBits();
5300   unsigned Opc = getInsertVecEltOpInfo(RB, EltSize).first;
5301 
5302   if (RB.getID() == AArch64::FPRRegBankID) {
5303     auto InsSub = emitScalarToVector(EltSize, DstRC, EltReg, MIRBuilder);
5304     InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg})
5305                  .addImm(LaneIdx)
5306                  .addUse(InsSub->getOperand(0).getReg())
5307                  .addImm(0);
5308   } else {
5309     InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg})
5310                  .addImm(LaneIdx)
5311                  .addUse(EltReg);
5312   }
5313 
5314   constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI);
5315   return InsElt;
5316 }
5317 
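     // Try to select a G_SEXT/G_ZEXT/G_ANYEXT of a G_EXTRACT_VECTOR_ELT with a
     // constant lane index as a single SMOV/UMOV lane extract (plus a
     // SUBREG_TO_REG for the unsigned 64-bit destination case).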
5318 bool AArch64InstructionSelector::selectUSMovFromExtend(
5319     MachineInstr &MI, MachineRegisterInfo &MRI) {
5320   if (MI.getOpcode() != TargetOpcode::G_SEXT &&
5321       MI.getOpcode() != TargetOpcode::G_ZEXT &&
5322       MI.getOpcode() != TargetOpcode::G_ANYEXT)
5323     return false;
5324   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SEXT;
5325   const Register DefReg = MI.getOperand(0).getReg();
5326   const LLT DstTy = MRI.getType(DefReg);
5327   unsigned DstSize = DstTy.getSizeInBits();
5328 
5329   if (DstSize != 32 && DstSize != 64)
5330     return false;
5331 
5332   MachineInstr *Extract = getOpcodeDef(TargetOpcode::G_EXTRACT_VECTOR_ELT,
5333                                        MI.getOperand(1).getReg(), MRI);
5334   int64_t Lane;
5335   if (!Extract || !mi_match(Extract->getOperand(2).getReg(), MRI, m_ICst(Lane)))
5336     return false;
5337   Register Src0 = Extract->getOperand(1).getReg();
5338 
5339   const LLT &VecTy = MRI.getType(Src0);
5340 
5341   if (VecTy.getSizeInBits() != 128) {
5342     const MachineInstr *ScalarToVector = emitScalarToVector(
5343         VecTy.getSizeInBits(), &AArch64::FPR128RegClass, Src0, MIB);
5344     assert(ScalarToVector && "Didn't expect emitScalarToVector to fail!");
5345     Src0 = ScalarToVector->getOperand(0).getReg();
5346   }
5347 
5348   unsigned Opcode;
5349   if (DstSize == 64 && VecTy.getScalarSizeInBits() == 32)
5350     Opcode = IsSigned ? AArch64::SMOVvi32to64 : AArch64::UMOVvi32;
5351   else if (DstSize == 64 && VecTy.getScalarSizeInBits() == 16)
5352     Opcode = IsSigned ? AArch64::SMOVvi16to64 : AArch64::UMOVvi16;
5353   else if (DstSize == 64 && VecTy.getScalarSizeInBits() == 8)
5354     Opcode = IsSigned ? AArch64::SMOVvi8to64 : AArch64::UMOVvi8;
5355   else if (DstSize == 32 && VecTy.getScalarSizeInBits() == 16)
5356     Opcode = IsSigned ? AArch64::SMOVvi16to32 : AArch64::UMOVvi16;
5357   else if (DstSize == 32 && VecTy.getScalarSizeInBits() == 8)
5358     Opcode = IsSigned ? AArch64::SMOVvi8to32 : AArch64::UMOVvi8;
5359   else
5360     llvm_unreachable("Unexpected type combo for S/UMov!");
5361 
5362   // We may need to generate one of these, depending on the type and sign of the
5363   // input:
5364   //  DstReg = SMOV Src0, Lane;
5365   //  NewReg = UMOV Src0, Lane; DstReg = SUBREG_TO_REG NewReg, sub_32;
5366   MachineInstr *ExtI = nullptr;
5367   if (DstSize == 64 && !IsSigned) {
5368     Register NewReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
5369     MIB.buildInstr(Opcode, {NewReg}, {Src0}).addImm(Lane);
5370     ExtI = MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {})
5371                .addImm(0)
5372                .addUse(NewReg)
5373                .addImm(AArch64::sub_32);
5374     RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI);
5375   } else
5376     ExtI = MIB.buildInstr(Opcode, {DefReg}, {Src0}).addImm(Lane);
5377 
5378   constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
5379   MI.eraseFromParent();
5380   return true;
5381 }
5382 
5383 bool AArch64InstructionSelector::selectInsertElt(MachineInstr &I,
5384                                                  MachineRegisterInfo &MRI) {
5385   assert(I.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT);
5386 
5387   // Get information on the destination.
5388   Register DstReg = I.getOperand(0).getReg();
5389   const LLT DstTy = MRI.getType(DstReg);
5390   unsigned VecSize = DstTy.getSizeInBits();
5391 
5392   // Get information on the element we want to insert into the destination.
5393   Register EltReg = I.getOperand(2).getReg();
5394   const LLT EltTy = MRI.getType(EltReg);
5395   unsigned EltSize = EltTy.getSizeInBits();
5396   if (EltSize < 8 || EltSize > 64)
5397     return false;
5398 
5399   // Find the definition of the index. Bail out if it's not defined by a
5400   // G_CONSTANT.
5401   Register IdxReg = I.getOperand(3).getReg();
5402   auto VRegAndVal = getIConstantVRegValWithLookThrough(IdxReg, MRI);
5403   if (!VRegAndVal)
5404     return false;
5405   unsigned LaneIdx = VRegAndVal->Value.getSExtValue();
5406 
5407   // Perform the lane insert.
5408   Register SrcReg = I.getOperand(1).getReg();
5409   const RegisterBank &EltRB = *RBI.getRegBank(EltReg, MRI, TRI);
5410 
5411   if (VecSize < 128) {
5412     // If the vector we're inserting into is smaller than 128 bits, widen it
5413     // to 128 to do the insert.
5414     MachineInstr *ScalarToVec =
5415         emitScalarToVector(VecSize, &AArch64::FPR128RegClass, SrcReg, MIB);
5416     if (!ScalarToVec)
5417       return false;
5418     SrcReg = ScalarToVec->getOperand(0).getReg();
5419   }
5420 
5421   // Create an insert into a new FPR128 register.
5422   // Note that if our vector is already 128 bits, we end up emitting an extra
5423   // register.
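       // The INS instructions only operate on full 128-bit (Q) vector registers,
       // which is why a sub-128-bit source was widened above and the result is
       // narrowed back down below.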
5424   MachineInstr *InsMI =
5425       emitLaneInsert(std::nullopt, SrcReg, EltReg, LaneIdx, EltRB, MIB);
5426 
5427   if (VecSize < 128) {
5428     // If we had to widen to perform the insert, then we have to demote back to
5429     // the original size to get the result we want.
5430     if (!emitNarrowVector(DstReg, InsMI->getOperand(0).getReg(), MIB, MRI))
5431       return false;
5432   } else {
5433     // No widening needed.
5434     InsMI->getOperand(0).setReg(DstReg);
5435     constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI);
5436   }
5437 
5438   I.eraseFromParent();
5439   return true;
5440 }
5441 
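     // The tryAdvSIMDModImm* helpers below try to materialize a splatted
     // constant with a single MOVI/MVNI/FMOV "modified immediate" instruction.
     // Each checks the matching AArch64_AM::isAdvSIMDModImmType* predicate and
     // returns nullptr when the bit pattern is not encodable, letting
     // emitConstantVector fall through to the next form or to a constant pool
     // load.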
5442 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm8(
5443     Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder) {
5444   unsigned int Op;
5445   if (DstSize == 128) {
5446     if (Bits.getHiBits(64) != Bits.getLoBits(64))
5447       return nullptr;
5448     Op = AArch64::MOVIv16b_ns;
5449   } else {
5450     Op = AArch64::MOVIv8b_ns;
5451   }
5452 
5453   uint64_t Val = Bits.zextOrTrunc(64).getZExtValue();
5454 
5455   if (AArch64_AM::isAdvSIMDModImmType9(Val)) {
5456     Val = AArch64_AM::encodeAdvSIMDModImmType9(Val);
5457     auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val);
5458     constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5459     return &*Mov;
5460   }
5461   return nullptr;
5462 }
5463 
5464 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm16(
5465     Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder,
5466     bool Inv) {
5467 
5468   unsigned int Op;
5469   if (DstSize == 128) {
5470     if (Bits.getHiBits(64) != Bits.getLoBits(64))
5471       return nullptr;
5472     Op = Inv ? AArch64::MVNIv8i16 : AArch64::MOVIv8i16;
5473   } else {
5474     Op = Inv ? AArch64::MVNIv4i16 : AArch64::MOVIv4i16;
5475   }
5476 
5477   uint64_t Val = Bits.zextOrTrunc(64).getZExtValue();
5478   uint64_t Shift;
5479 
5480   if (AArch64_AM::isAdvSIMDModImmType5(Val)) {
5481     Val = AArch64_AM::encodeAdvSIMDModImmType5(Val);
5482     Shift = 0;
5483   } else if (AArch64_AM::isAdvSIMDModImmType6(Val)) {
5484     Val = AArch64_AM::encodeAdvSIMDModImmType6(Val);
5485     Shift = 8;
5486   } else
5487     return nullptr;
5488 
5489   auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val).addImm(Shift);
5490   constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5491   return &*Mov;
5492 }
5493 
5494 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm32(
5495     Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder,
5496     bool Inv) {
5497 
5498   unsigned int Op;
5499   if (DstSize == 128) {
5500     if (Bits.getHiBits(64) != Bits.getLoBits(64))
5501       return nullptr;
5502     Op = Inv ? AArch64::MVNIv4i32 : AArch64::MOVIv4i32;
5503   } else {
5504     Op = Inv ? AArch64::MVNIv2i32 : AArch64::MOVIv2i32;
5505   }
5506 
5507   uint64_t Val = Bits.zextOrTrunc(64).getZExtValue();
5508   uint64_t Shift;
5509 
5510   if ((AArch64_AM::isAdvSIMDModImmType1(Val))) {
5511     Val = AArch64_AM::encodeAdvSIMDModImmType1(Val);
5512     Shift = 0;
5513   } else if ((AArch64_AM::isAdvSIMDModImmType2(Val))) {
5514     Val = AArch64_AM::encodeAdvSIMDModImmType2(Val);
5515     Shift = 8;
5516   } else if ((AArch64_AM::isAdvSIMDModImmType3(Val))) {
5517     Val = AArch64_AM::encodeAdvSIMDModImmType3(Val);
5518     Shift = 16;
5519   } else if ((AArch64_AM::isAdvSIMDModImmType4(Val))) {
5520     Val = AArch64_AM::encodeAdvSIMDModImmType4(Val);
5521     Shift = 24;
5522   } else
5523     return nullptr;
5524 
5525   auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val).addImm(Shift);
5526   constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5527   return &*Mov;
5528 }
5529 
5530 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm64(
5531     Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder) {
5532 
5533   unsigned int Op;
5534   if (DstSize == 128) {
5535     if (Bits.getHiBits(64) != Bits.getLoBits(64))
5536       return nullptr;
5537     Op = AArch64::MOVIv2d_ns;
5538   } else {
5539     Op = AArch64::MOVID;
5540   }
5541 
5542   uint64_t Val = Bits.zextOrTrunc(64).getZExtValue();
5543   if (AArch64_AM::isAdvSIMDModImmType10(Val)) {
5544     Val = AArch64_AM::encodeAdvSIMDModImmType10(Val);
5545     auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val);
5546     constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5547     return &*Mov;
5548   }
5549   return nullptr;
5550 }
5551 
5552 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm321s(
5553     Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder,
5554     bool Inv) {
5555 
5556   unsigned int Op;
5557   if (DstSize == 128) {
5558     if (Bits.getHiBits(64) != Bits.getLoBits(64))
5559       return nullptr;
5560     Op = Inv ? AArch64::MVNIv4s_msl : AArch64::MOVIv4s_msl;
5561   } else {
5562     Op = Inv ? AArch64::MVNIv2s_msl : AArch64::MOVIv2s_msl;
5563   }
5564 
5565   uint64_t Val = Bits.zextOrTrunc(64).getZExtValue();
5566   uint64_t Shift;
5567 
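       // These are the MSL ("shifting ones") forms: Type7 corresponds to MSL #8
       // and Type8 to MSL #16; the Shift values below are the operand encodings
       // the *_msl instructions expect for those two shift amounts.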
5568   if (AArch64_AM::isAdvSIMDModImmType7(Val)) {
5569     Val = AArch64_AM::encodeAdvSIMDModImmType7(Val);
5570     Shift = 264;
5571   } else if (AArch64_AM::isAdvSIMDModImmType8(Val)) {
5572     Val = AArch64_AM::encodeAdvSIMDModImmType8(Val);
5573     Shift = 272;
5574   } else
5575     return nullptr;
5576 
5577   auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val).addImm(Shift);
5578   constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5579   return &*Mov;
5580 }
5581 
5582 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImmFP(
5583     Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder) {
5584 
5585   unsigned int Op;
5586   bool IsWide = false;
5587   if (DstSize == 128) {
5588     if (Bits.getHiBits(64) != Bits.getLoBits(64))
5589       return nullptr;
5590     Op = AArch64::FMOVv4f32_ns;
5591     IsWide = true;
5592   } else {
5593     Op = AArch64::FMOVv2f32_ns;
5594   }
5595 
5596   uint64_t Val = Bits.zextOrTrunc(64).getZExtValue();
5597 
5598   if (AArch64_AM::isAdvSIMDModImmType11(Val)) {
5599     Val = AArch64_AM::encodeAdvSIMDModImmType11(Val);
5600   } else if (IsWide && AArch64_AM::isAdvSIMDModImmType12(Val)) {
5601     Val = AArch64_AM::encodeAdvSIMDModImmType12(Val);
5602     Op = AArch64::FMOVv2f64_ns;
5603   } else
5604     return nullptr;
5605 
5606   auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val);
5607   constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5608   return &*Mov;
5609 }
5610 
5611 bool AArch64InstructionSelector::selectIndexedExtLoad(
5612     MachineInstr &MI, MachineRegisterInfo &MRI) {
5613   auto &ExtLd = cast<GIndexedExtLoad>(MI);
5614   Register Dst = ExtLd.getDstReg();
5615   Register WriteBack = ExtLd.getWritebackReg();
5616   Register Base = ExtLd.getBaseReg();
5617   Register Offset = ExtLd.getOffsetReg();
5618   LLT Ty = MRI.getType(Dst);
5619   assert(Ty.getSizeInBits() <= 64); // Only for scalar GPRs.
5620   unsigned MemSizeBits = ExtLd.getMMO().getMemoryType().getSizeInBits();
5621   bool IsPre = ExtLd.isPre();
5622   bool IsSExt = isa<GIndexedSExtLoad>(ExtLd);
5623   bool InsertIntoXReg = false;
5624   bool IsDst64 = Ty.getSizeInBits() == 64;
5625 
5626   unsigned Opc = 0;
5627   LLT NewLdDstTy;
5628   LLT s32 = LLT::scalar(32);
5629   LLT s64 = LLT::scalar(64);
5630 
5631   if (MemSizeBits == 8) {
5632     if (IsSExt) {
5633       if (IsDst64)
5634         Opc = IsPre ? AArch64::LDRSBXpre : AArch64::LDRSBXpost;
5635       else
5636         Opc = IsPre ? AArch64::LDRSBWpre : AArch64::LDRSBWpost;
5637       NewLdDstTy = IsDst64 ? s64 : s32;
5638     } else {
5639       Opc = IsPre ? AArch64::LDRBBpre : AArch64::LDRBBpost;
5640       InsertIntoXReg = IsDst64;
5641       NewLdDstTy = s32;
5642     }
5643   } else if (MemSizeBits == 16) {
5644     if (IsSExt) {
5645       if (IsDst64)
5646         Opc = IsPre ? AArch64::LDRSHXpre : AArch64::LDRSHXpost;
5647       else
5648         Opc = IsPre ? AArch64::LDRSHWpre : AArch64::LDRSHWpost;
5649       NewLdDstTy = IsDst64 ? s64 : s32;
5650     } else {
5651       Opc = IsPre ? AArch64::LDRHHpre : AArch64::LDRHHpost;
5652       InsertIntoXReg = IsDst64;
5653       NewLdDstTy = s32;
5654     }
5655   } else if (MemSizeBits == 32) {
5656     if (IsSExt) {
5657       Opc = IsPre ? AArch64::LDRSWpre : AArch64::LDRSWpost;
5658       NewLdDstTy = s64;
5659     } else {
5660       Opc = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost;
5661       InsertIntoXReg = IsDst64;
5662       NewLdDstTy = s32;
5663     }
5664   } else {
5665     llvm_unreachable("Unexpected size for indexed load");
5666   }
5667 
5668   if (RBI.getRegBank(Dst, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5669     return false; // We should be on gpr.
5670 
5671   auto Cst = getIConstantVRegVal(Offset, MRI);
5672   if (!Cst)
5673     return false; // Shouldn't happen, but just in case.
5674 
5675   auto LdMI = MIB.buildInstr(Opc, {WriteBack, NewLdDstTy}, {Base})
5676                   .addImm(Cst->getSExtValue());
5677   LdMI.cloneMemRefs(ExtLd);
5678   constrainSelectedInstRegOperands(*LdMI, TII, TRI, RBI);
5679   // Make sure to select the load with the MemTy as the dest type, and then
5680   // insert into X reg if needed.
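       // e.g. a post-indexed zero-extending 8-bit load into a 64-bit destination
       // becomes LDRBBpost (which defines a 32-bit result) followed by a
       // SUBREG_TO_REG of that result into the 64-bit destination register.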
5681   if (InsertIntoXReg) {
5682     // Generate a SUBREG_TO_REG.
5683     auto SubToReg = MIB.buildInstr(TargetOpcode::SUBREG_TO_REG, {Dst}, {})
5684                         .addImm(0)
5685                         .addUse(LdMI.getReg(1))
5686                         .addImm(AArch64::sub_32);
5687     RBI.constrainGenericRegister(SubToReg.getReg(0), AArch64::GPR64RegClass,
5688                                  MRI);
5689   } else {
5690     auto Copy = MIB.buildCopy(Dst, LdMI.getReg(1));
5691     selectCopy(*Copy, TII, MRI, TRI, RBI);
5692   }
5693   MI.eraseFromParent();
5694 
5695   return true;
5696 }
5697 
5698 bool AArch64InstructionSelector::selectIndexedLoad(MachineInstr &MI,
5699                                                    MachineRegisterInfo &MRI) {
5700   // TODO: extending loads.
5701   if (isa<GIndexedExtLoad>(MI))
5702     return false;
5703 
5704   auto &Ld = cast<GIndexedLoad>(MI);
5705   Register Dst = Ld.getDstReg();
5706   Register WriteBack = Ld.getWritebackReg();
5707   Register Base = Ld.getBaseReg();
5708   Register Offset = Ld.getOffsetReg();
5709   assert(MRI.getType(Dst).getSizeInBits() <= 128 &&
5710          "Unexpected type for indexed load");
5711   unsigned MemSize = Ld.getMMO().getMemoryType().getSizeInBytes();
5712 
5713   unsigned Opc = 0;
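       // The opcode tables below are indexed by Log2 of the access size in
       // bytes: 1, 2, 4, 8 (and 16 for FPR) byte loads map to the B/H/W/X
       // (or B/H/S/D/Q) pre/post-indexed load instructions.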
5714   if (Ld.isPre()) {
5715     static constexpr unsigned GPROpcodes[] = {
5716         AArch64::LDRBBpre, AArch64::LDRHHpre, AArch64::LDRWpre,
5717         AArch64::LDRXpre};
5718     static constexpr unsigned FPROpcodes[] = {
5719         AArch64::LDRBpre, AArch64::LDRHpre, AArch64::LDRSpre, AArch64::LDRDpre,
5720         AArch64::LDRQpre};
5721     if (RBI.getRegBank(Dst, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5722       Opc = FPROpcodes[Log2_32(MemSize)];
5723     else
5724       Opc = GPROpcodes[Log2_32(MemSize)];
5725   } else {
5726     static constexpr unsigned GPROpcodes[] = {
5727         AArch64::LDRBBpost, AArch64::LDRHHpost, AArch64::LDRWpost,
5728         AArch64::LDRXpost};
5729     static constexpr unsigned FPROpcodes[] = {
5730         AArch64::LDRBpost, AArch64::LDRHpost, AArch64::LDRSpost,
5731         AArch64::LDRDpost, AArch64::LDRQpost};
5732     if (RBI.getRegBank(Dst, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5733       Opc = FPROpcodes[Log2_32(MemSize)];
5734     else
5735       Opc = GPROpcodes[Log2_32(MemSize)];
5736   }
5737   auto Cst = getIConstantVRegVal(Offset, MRI);
5738   if (!Cst)
5739     return false; // Shouldn't happen, but just in case.
5740   auto LdMI =
5741       MIB.buildInstr(Opc, {WriteBack, Dst}, {Base}).addImm(Cst->getSExtValue());
5742   LdMI.cloneMemRefs(Ld);
5743   constrainSelectedInstRegOperands(*LdMI, TII, TRI, RBI);
5744   MI.eraseFromParent();
5745   return true;
5746 }
5747 
5748 bool AArch64InstructionSelector::selectIndexedStore(GIndexedStore &I,
5749                                                     MachineRegisterInfo &MRI) {
5750   Register Dst = I.getWritebackReg();
5751   Register Val = I.getValueReg();
5752   Register Base = I.getBaseReg();
5753   Register Offset = I.getOffsetReg();
5754   LLT ValTy = MRI.getType(Val);
5755   assert(ValTy.getSizeInBits() <= 128 && "Unexpected type for indexed store");
5756 
5757   unsigned Opc = 0;
5758   if (I.isPre()) {
5759     static constexpr unsigned GPROpcodes[] = {
5760         AArch64::STRBBpre, AArch64::STRHHpre, AArch64::STRWpre,
5761         AArch64::STRXpre};
5762     static constexpr unsigned FPROpcodes[] = {
5763         AArch64::STRBpre, AArch64::STRHpre, AArch64::STRSpre, AArch64::STRDpre,
5764         AArch64::STRQpre};
5765 
5766     if (RBI.getRegBank(Val, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5767       Opc = FPROpcodes[Log2_32(ValTy.getSizeInBytes())];
5768     else
5769       Opc = GPROpcodes[Log2_32(ValTy.getSizeInBytes())];
5770   } else {
5771     static constexpr unsigned GPROpcodes[] = {
5772         AArch64::STRBBpost, AArch64::STRHHpost, AArch64::STRWpost,
5773         AArch64::STRXpost};
5774     static constexpr unsigned FPROpcodes[] = {
5775         AArch64::STRBpost, AArch64::STRHpost, AArch64::STRSpost,
5776         AArch64::STRDpost, AArch64::STRQpost};
5777 
5778     if (RBI.getRegBank(Val, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5779       Opc = FPROpcodes[Log2_32(ValTy.getSizeInBytes())];
5780     else
5781       Opc = GPROpcodes[Log2_32(ValTy.getSizeInBytes())];
5782   }
5783 
5784   auto Cst = getIConstantVRegVal(Offset, MRI);
5785   if (!Cst)
5786     return false; // Shouldn't happen, but just in case.
5787   auto Str =
5788       MIB.buildInstr(Opc, {Dst}, {Val, Base}).addImm(Cst->getSExtValue());
5789   Str.cloneMemRefs(I);
5790   constrainSelectedInstRegOperands(*Str, TII, TRI, RBI);
5791   I.eraseFromParent();
5792   return true;
5793 }
5794 
5795 MachineInstr *
5796 AArch64InstructionSelector::emitConstantVector(Register Dst, Constant *CV,
5797                                                MachineIRBuilder &MIRBuilder,
5798                                                MachineRegisterInfo &MRI) {
5799   LLT DstTy = MRI.getType(Dst);
5800   unsigned DstSize = DstTy.getSizeInBits();
5801   if (CV->isNullValue()) {
5802     if (DstSize == 128) {
5803       auto Mov =
5804           MIRBuilder.buildInstr(AArch64::MOVIv2d_ns, {Dst}, {}).addImm(0);
5805       constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5806       return &*Mov;
5807     }
5808 
5809     if (DstSize == 64) {
5810       auto Mov =
5811           MIRBuilder
5812               .buildInstr(AArch64::MOVIv2d_ns, {&AArch64::FPR128RegClass}, {})
5813               .addImm(0);
5814       auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {Dst}, {})
5815                       .addReg(Mov.getReg(0), 0, AArch64::dsub);
5816       RBI.constrainGenericRegister(Dst, AArch64::FPR64RegClass, MRI);
5817       return &*Copy;
5818     }
5819   }
5820 
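       // For splat constants, first try the MOVI/FMOV modified-immediate forms,
       // then the MVNI forms on the inverted bits, before giving up and loading
       // the vector from the constant pool.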
5821   if (CV->getSplatValue()) {
5822     APInt DefBits = APInt::getSplat(DstSize, CV->getUniqueInteger());
5823     MachineInstr *NewOp;
5824     bool Inv = false;
5825     if ((NewOp = tryAdvSIMDModImm64(Dst, DstSize, DefBits, MIRBuilder)) ||
5826         (NewOp = tryAdvSIMDModImm32(Dst, DstSize, DefBits, MIRBuilder, Inv)) ||
5827         (NewOp =
5828              tryAdvSIMDModImm321s(Dst, DstSize, DefBits, MIRBuilder, Inv)) ||
5829         (NewOp = tryAdvSIMDModImm16(Dst, DstSize, DefBits, MIRBuilder, Inv)) ||
5830         (NewOp = tryAdvSIMDModImm8(Dst, DstSize, DefBits, MIRBuilder)) ||
5831         (NewOp = tryAdvSIMDModImmFP(Dst, DstSize, DefBits, MIRBuilder)))
5832       return NewOp;
5833 
5834     DefBits = ~DefBits;
5835     Inv = true;
5836     if ((NewOp = tryAdvSIMDModImm32(Dst, DstSize, DefBits, MIRBuilder, Inv)) ||
5837         (NewOp =
5838              tryAdvSIMDModImm321s(Dst, DstSize, DefBits, MIRBuilder, Inv)) ||
5839         (NewOp = tryAdvSIMDModImm16(Dst, DstSize, DefBits, MIRBuilder, Inv)))
5840       return NewOp;
5841   }
5842 
5843   auto *CPLoad = emitLoadFromConstantPool(CV, MIRBuilder);
5844   if (!CPLoad) {
5845     LLVM_DEBUG(dbgs() << "Could not generate cp load for constant vector!\n");
5846     return nullptr;
5847   }
5848 
5849   auto Copy = MIRBuilder.buildCopy(Dst, CPLoad->getOperand(0));
5850   RBI.constrainGenericRegister(
5851       Dst, *MRI.getRegClass(CPLoad->getOperand(0).getReg()), MRI);
5852   return &*Copy;
5853 }
5854 
5855 bool AArch64InstructionSelector::tryOptConstantBuildVec(
5856     MachineInstr &I, LLT DstTy, MachineRegisterInfo &MRI) {
5857   assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
5858   unsigned DstSize = DstTy.getSizeInBits();
5859   assert(DstSize <= 128 && "Unexpected build_vec type!");
5860   if (DstSize < 32)
5861     return false;
5862   // Check if we're building a constant vector, in which case we want to
5863   // generate a constant pool load instead of a vector insert sequence.
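       // e.g. a G_BUILD_VECTOR whose operands are all G_CONSTANT/G_FCONSTANT can
       // be emitted as a single MOVI-style immediate or one constant pool load
       // via emitConstantVector above.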
5864   SmallVector<Constant *, 16> Csts;
5865   for (unsigned Idx = 1; Idx < I.getNumOperands(); ++Idx) {
5866     // Try to find G_CONSTANT or G_FCONSTANT
5867     auto *OpMI =
5868         getOpcodeDef(TargetOpcode::G_CONSTANT, I.getOperand(Idx).getReg(), MRI);
5869     if (OpMI)
5870       Csts.emplace_back(
5871           const_cast<ConstantInt *>(OpMI->getOperand(1).getCImm()));
5872     else if ((OpMI = getOpcodeDef(TargetOpcode::G_FCONSTANT,
5873                                   I.getOperand(Idx).getReg(), MRI)))
5874       Csts.emplace_back(
5875           const_cast<ConstantFP *>(OpMI->getOperand(1).getFPImm()));
5876     else
5877       return false;
5878   }
5879   Constant *CV = ConstantVector::get(Csts);
5880   if (!emitConstantVector(I.getOperand(0).getReg(), CV, MIB, MRI))
5881     return false;
5882   I.eraseFromParent();
5883   return true;
5884 }
5885 
5886 bool AArch64InstructionSelector::tryOptBuildVecToSubregToReg(
5887     MachineInstr &I, MachineRegisterInfo &MRI) {
5888   // Given:
5889   //  %vec = G_BUILD_VECTOR %elt, %undef, %undef, ... %undef
5890   //
5891   // Select the G_BUILD_VECTOR as a SUBREG_TO_REG from %elt.
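       // Roughly (illustrative):
       //  %vec = SUBREG_TO_REG 0, %elt, %subreg.<sub>
       // where <sub> is the subregister index for %elt's register class, e.g.
       // ssub for a 32-bit FPR element.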
5892   Register Dst = I.getOperand(0).getReg();
5893   Register EltReg = I.getOperand(1).getReg();
5894   LLT EltTy = MRI.getType(EltReg);
5895   // If the destination isn't on the same register bank as the element, then
5896   // this can't be a SUBREG_TO_REG.
5897   const RegisterBank &EltRB = *RBI.getRegBank(EltReg, MRI, TRI);
5898   const RegisterBank &DstRB = *RBI.getRegBank(Dst, MRI, TRI);
5899   if (EltRB != DstRB)
5900     return false;
5901   if (any_of(drop_begin(I.operands(), 2), [&MRI](const MachineOperand &Op) {
5902         return !getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, Op.getReg(), MRI);
5903       }))
5904     return false;
5905   unsigned SubReg;
5906   const TargetRegisterClass *EltRC = getRegClassForTypeOnBank(EltTy, EltRB);
5907   if (!EltRC)
5908     return false;
5909   const TargetRegisterClass *DstRC =
5910       getRegClassForTypeOnBank(MRI.getType(Dst), DstRB);
5911   if (!DstRC)
5912     return false;
5913   if (!getSubRegForClass(EltRC, TRI, SubReg))
5914     return false;
5915   auto SubregToReg = MIB.buildInstr(AArch64::SUBREG_TO_REG, {Dst}, {})
5916                          .addImm(0)
5917                          .addUse(EltReg)
5918                          .addImm(SubReg);
5919   I.eraseFromParent();
5920   constrainSelectedInstRegOperands(*SubregToReg, TII, TRI, RBI);
5921   return RBI.constrainGenericRegister(Dst, *DstRC, MRI);
5922 }
5923 
5924 bool AArch64InstructionSelector::selectBuildVector(MachineInstr &I,
5925                                                    MachineRegisterInfo &MRI) {
5926   assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
5927   // Until we port more of the optimized selections, for now just use a vector
5928   // insert sequence.
5929   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
5930   const LLT EltTy = MRI.getType(I.getOperand(1).getReg());
5931   unsigned EltSize = EltTy.getSizeInBits();
5932 
5933   if (tryOptConstantBuildVec(I, DstTy, MRI))
5934     return true;
5935   if (tryOptBuildVecToSubregToReg(I, MRI))
5936     return true;
5937 
5938   if (EltSize != 8 && EltSize != 16 && EltSize != 32 && EltSize != 64)
5939     return false; // Don't support all element types yet.
5940   const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
5941 
5942   const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
5943   MachineInstr *ScalarToVec =
5944       emitScalarToVector(DstTy.getElementType().getSizeInBits(), DstRC,
5945                          I.getOperand(1).getReg(), MIB);
5946   if (!ScalarToVec)
5947     return false;
5948 
5949   Register DstVec = ScalarToVec->getOperand(0).getReg();
5950   unsigned DstSize = DstTy.getSizeInBits();
5951 
5952   // Keep track of the last MI we inserted. Later on, we might be able to save
5953   // a copy using it.
5954   MachineInstr *PrevMI = nullptr;
5955   for (unsigned i = 2, e = DstSize / EltSize + 1; i < e; ++i) {
5956     // Note that if we don't do a subregister copy, we can end up making an
5957     // extra register.
5958     PrevMI = &*emitLaneInsert(std::nullopt, DstVec, I.getOperand(i).getReg(),
5959                               i - 1, RB, MIB);
5960     DstVec = PrevMI->getOperand(0).getReg();
5961   }
5962 
5963   // If DstTy's size in bits is less than 128, then emit a subregister copy
5964   // from DstVec to the last register we've defined.
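       // e.g. a 64-bit destination is extracted with a dsub copy of the FPR128
       // temporary, and a 32-bit destination with an ssub copy.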
5965   if (DstSize < 128) {
5966     // Force this to be FPR using the destination vector.
5967     const TargetRegisterClass *RC =
5968         getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(DstVec, MRI, TRI));
5969     if (!RC)
5970       return false;
5971     if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
5972       LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
5973       return false;
5974     }
5975 
5976     unsigned SubReg = 0;
5977     if (!getSubRegForClass(RC, TRI, SubReg))
5978       return false;
5979     if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
5980       LLVM_DEBUG(dbgs() << "Unsupported destination size! (" << DstSize
5981                         << ")\n");
5982       return false;
5983     }
5984 
5985     Register Reg = MRI.createVirtualRegister(RC);
5986     Register DstReg = I.getOperand(0).getReg();
5987 
5988     MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {}).addReg(DstVec, 0, SubReg);
5989     MachineOperand &RegOp = I.getOperand(1);
5990     RegOp.setReg(Reg);
5991     RBI.constrainGenericRegister(DstReg, *RC, MRI);
5992   } else {
5993     // We don't need a subregister copy. Save a copy by re-using the
5994     // destination register on the final insert.
5995     assert(PrevMI && "PrevMI was null?");
5996     PrevMI->getOperand(0).setReg(I.getOperand(0).getReg());
5997     constrainSelectedInstRegOperands(*PrevMI, TII, TRI, RBI);
5998   }
5999 
6000   I.eraseFromParent();
6001   return true;
6002 }
6003 
6004 bool AArch64InstructionSelector::selectVectorLoadIntrinsic(unsigned Opc,
6005                                                            unsigned NumVecs,
6006                                                            MachineInstr &I) {
6007   assert(I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
6008   assert(Opc && "Expected an opcode?");
6009   assert(NumVecs > 1 && NumVecs < 5 && "Only support 2, 3, or 4 vectors");
6010   auto &MRI = *MIB.getMRI();
6011   LLT Ty = MRI.getType(I.getOperand(0).getReg());
6012   unsigned Size = Ty.getSizeInBits();
6013   assert((Size == 64 || Size == 128) &&
6014          "Destination must be 64 bits or 128 bits?");
6015   unsigned SubReg = Size == 64 ? AArch64::dsub0 : AArch64::qsub0;
6016   auto Ptr = I.getOperand(I.getNumOperands() - 1).getReg();
6017   assert(MRI.getType(Ptr).isPointer() && "Expected a pointer type?");
6018   auto Load = MIB.buildInstr(Opc, {Ty}, {Ptr});
6019   Load.cloneMemRefs(I);
6020   constrainSelectedInstRegOperands(*Load, TII, TRI, RBI);
6021   Register SelectedLoadDst = Load->getOperand(0).getReg();
6022   for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
6023     auto Vec = MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(Idx)}, {})
6024                    .addReg(SelectedLoadDst, 0, SubReg + Idx);
6025     // Emit the subreg copies and immediately select them.
6026     // FIXME: We should refactor our copy code into an emitCopy helper and
6027     // clean up uses of this pattern elsewhere in the selector.
6028     selectCopy(*Vec, TII, MRI, TRI, RBI);
6029   }
6030   return true;
6031 }
6032 
6033 bool AArch64InstructionSelector::selectVectorLoadLaneIntrinsic(
6034     unsigned Opc, unsigned NumVecs, MachineInstr &I) {
6035   assert(I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
6036   assert(Opc && "Expected an opcode?");
6037   assert(NumVecs > 1 && NumVecs < 5 && "Only support 2, 3, or 4 vectors");
6038   auto &MRI = *MIB.getMRI();
6039   LLT Ty = MRI.getType(I.getOperand(0).getReg());
6040   bool Narrow = Ty.getSizeInBits() == 64;
6041 
6042   auto FirstSrcRegIt = I.operands_begin() + NumVecs + 1;
6043   SmallVector<Register, 4> Regs(NumVecs);
6044   std::transform(FirstSrcRegIt, FirstSrcRegIt + NumVecs, Regs.begin(),
6045                  [](auto MO) { return MO.getReg(); });
6046 
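       // The LD<n> lane instructions only take tuples of 128-bit (Q) registers,
       // so widen 64-bit source vectors to Q registers here; the results are
       // narrowed back down with emitNarrowVector after the load.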
6047   if (Narrow) {
6048     transform(Regs, Regs.begin(), [this](Register Reg) {
6049       return emitScalarToVector(64, &AArch64::FPR128RegClass, Reg, MIB)
6050           ->getOperand(0)
6051           .getReg();
6052     });
6053     Ty = Ty.multiplyElements(2);
6054   }
6055 
6056   Register Tuple = createQTuple(Regs, MIB);
6057   auto LaneNo = getIConstantVRegVal((FirstSrcRegIt + NumVecs)->getReg(), MRI);
6058   if (!LaneNo)
6059     return false;
6060 
6061   Register Ptr = (FirstSrcRegIt + NumVecs + 1)->getReg();
6062   auto Load = MIB.buildInstr(Opc, {Ty}, {})
6063                   .addReg(Tuple)
6064                   .addImm(LaneNo->getZExtValue())
6065                   .addReg(Ptr);
6066   Load.cloneMemRefs(I);
6067   constrainSelectedInstRegOperands(*Load, TII, TRI, RBI);
6068   Register SelectedLoadDst = Load->getOperand(0).getReg();
6069   unsigned SubReg = AArch64::qsub0;
6070   for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
6071     auto Vec = MIB.buildInstr(TargetOpcode::COPY,
6072                               {Narrow ? DstOp(&AArch64::FPR128RegClass)
6073                                       : DstOp(I.getOperand(Idx).getReg())},
6074                               {})
6075                    .addReg(SelectedLoadDst, 0, SubReg + Idx);
6076     Register WideReg = Vec.getReg(0);
6077     // Emit the subreg copies and immediately select them.
6078     selectCopy(*Vec, TII, MRI, TRI, RBI);
6079     if (Narrow &&
6080         !emitNarrowVector(I.getOperand(Idx).getReg(), WideReg, MIB, MRI))
6081       return false;
6082   }
6083   return true;
6084 }
6085 
6086 void AArch64InstructionSelector::selectVectorStoreIntrinsic(MachineInstr &I,
6087                                                             unsigned NumVecs,
6088                                                             unsigned Opc) {
6089   MachineRegisterInfo &MRI = I.getParent()->getParent()->getRegInfo();
6090   LLT Ty = MRI.getType(I.getOperand(1).getReg());
6091   Register Ptr = I.getOperand(1 + NumVecs).getReg();
6092 
6093   SmallVector<Register, 2> Regs(NumVecs);
6094   std::transform(I.operands_begin() + 1, I.operands_begin() + 1 + NumVecs,
6095                  Regs.begin(), [](auto MO) { return MO.getReg(); });
6096 
6097   Register Tuple = Ty.getSizeInBits() == 128 ? createQTuple(Regs, MIB)
6098                                              : createDTuple(Regs, MIB);
6099   auto Store = MIB.buildInstr(Opc, {}, {Tuple, Ptr});
6100   Store.cloneMemRefs(I);
6101   constrainSelectedInstRegOperands(*Store, TII, TRI, RBI);
6102 }
6103 
6104 bool AArch64InstructionSelector::selectVectorStoreLaneIntrinsic(
6105     MachineInstr &I, unsigned NumVecs, unsigned Opc) {
6106   MachineRegisterInfo &MRI = I.getParent()->getParent()->getRegInfo();
6107   LLT Ty = MRI.getType(I.getOperand(1).getReg());
6108   bool Narrow = Ty.getSizeInBits() == 64;
6109 
6110   SmallVector<Register, 2> Regs(NumVecs);
6111   std::transform(I.operands_begin() + 1, I.operands_begin() + 1 + NumVecs,
6112                  Regs.begin(), [](auto MO) { return MO.getReg(); });
6113 
6114   if (Narrow)
6115     transform(Regs, Regs.begin(), [this](Register Reg) {
6116       return emitScalarToVector(64, &AArch64::FPR128RegClass, Reg, MIB)
6117           ->getOperand(0)
6118           .getReg();
6119     });
6120 
6121   Register Tuple = createQTuple(Regs, MIB);
6122 
6123   auto LaneNo = getIConstantVRegVal(I.getOperand(1 + NumVecs).getReg(), MRI);
6124   if (!LaneNo)
6125     return false;
6126   Register Ptr = I.getOperand(1 + NumVecs + 1).getReg();
6127   auto Store = MIB.buildInstr(Opc, {}, {})
6128                    .addReg(Tuple)
6129                    .addImm(LaneNo->getZExtValue())
6130                    .addReg(Ptr);
6131   Store.cloneMemRefs(I);
6132   constrainSelectedInstRegOperands(*Store, TII, TRI, RBI);
6133   return true;
6134 }
6135 
6136 bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
6137     MachineInstr &I, MachineRegisterInfo &MRI) {
6138   // Find the intrinsic ID.
6139   unsigned IntrinID = cast<GIntrinsic>(I).getIntrinsicID();
6140 
6141   const LLT S8 = LLT::scalar(8);
6142   const LLT S16 = LLT::scalar(16);
6143   const LLT S32 = LLT::scalar(32);
6144   const LLT S64 = LLT::scalar(64);
6145   const LLT P0 = LLT::pointer(0, 64);
6146   // Select the instruction.
6147   switch (IntrinID) {
6148   default:
6149     return false;
6150   case Intrinsic::aarch64_ldxp:
6151   case Intrinsic::aarch64_ldaxp: {
6152     auto NewI = MIB.buildInstr(
6153         IntrinID == Intrinsic::aarch64_ldxp ? AArch64::LDXPX : AArch64::LDAXPX,
6154         {I.getOperand(0).getReg(), I.getOperand(1).getReg()},
6155         {I.getOperand(3)});
6156     NewI.cloneMemRefs(I);
6157     constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI);
6158     break;
6159   }
6160   case Intrinsic::trap:
6161     MIB.buildInstr(AArch64::BRK, {}, {}).addImm(1);
6162     break;
6163   case Intrinsic::debugtrap:
6164     MIB.buildInstr(AArch64::BRK, {}, {}).addImm(0xF000);
6165     break;
6166   case Intrinsic::ubsantrap:
6167     MIB.buildInstr(AArch64::BRK, {}, {})
6168         .addImm(I.getOperand(1).getImm() | ('U' << 8));
6169     break;
6170   case Intrinsic::aarch64_neon_ld1x2: {
6171     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6172     unsigned Opc = 0;
6173     if (Ty == LLT::fixed_vector(8, S8))
6174       Opc = AArch64::LD1Twov8b;
6175     else if (Ty == LLT::fixed_vector(16, S8))
6176       Opc = AArch64::LD1Twov16b;
6177     else if (Ty == LLT::fixed_vector(4, S16))
6178       Opc = AArch64::LD1Twov4h;
6179     else if (Ty == LLT::fixed_vector(8, S16))
6180       Opc = AArch64::LD1Twov8h;
6181     else if (Ty == LLT::fixed_vector(2, S32))
6182       Opc = AArch64::LD1Twov2s;
6183     else if (Ty == LLT::fixed_vector(4, S32))
6184       Opc = AArch64::LD1Twov4s;
6185     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6186       Opc = AArch64::LD1Twov2d;
6187     else if (Ty == S64 || Ty == P0)
6188       Opc = AArch64::LD1Twov1d;
6189     else
6190       llvm_unreachable("Unexpected type for ld1x2!");
6191     selectVectorLoadIntrinsic(Opc, 2, I);
6192     break;
6193   }
6194   case Intrinsic::aarch64_neon_ld1x3: {
6195     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6196     unsigned Opc = 0;
6197     if (Ty == LLT::fixed_vector(8, S8))
6198       Opc = AArch64::LD1Threev8b;
6199     else if (Ty == LLT::fixed_vector(16, S8))
6200       Opc = AArch64::LD1Threev16b;
6201     else if (Ty == LLT::fixed_vector(4, S16))
6202       Opc = AArch64::LD1Threev4h;
6203     else if (Ty == LLT::fixed_vector(8, S16))
6204       Opc = AArch64::LD1Threev8h;
6205     else if (Ty == LLT::fixed_vector(2, S32))
6206       Opc = AArch64::LD1Threev2s;
6207     else if (Ty == LLT::fixed_vector(4, S32))
6208       Opc = AArch64::LD1Threev4s;
6209     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6210       Opc = AArch64::LD1Threev2d;
6211     else if (Ty == S64 || Ty == P0)
6212       Opc = AArch64::LD1Threev1d;
6213     else
6214       llvm_unreachable("Unexpected type for ld1x3!");
6215     selectVectorLoadIntrinsic(Opc, 3, I);
6216     break;
6217   }
6218   case Intrinsic::aarch64_neon_ld1x4: {
6219     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6220     unsigned Opc = 0;
6221     if (Ty == LLT::fixed_vector(8, S8))
6222       Opc = AArch64::LD1Fourv8b;
6223     else if (Ty == LLT::fixed_vector(16, S8))
6224       Opc = AArch64::LD1Fourv16b;
6225     else if (Ty == LLT::fixed_vector(4, S16))
6226       Opc = AArch64::LD1Fourv4h;
6227     else if (Ty == LLT::fixed_vector(8, S16))
6228       Opc = AArch64::LD1Fourv8h;
6229     else if (Ty == LLT::fixed_vector(2, S32))
6230       Opc = AArch64::LD1Fourv2s;
6231     else if (Ty == LLT::fixed_vector(4, S32))
6232       Opc = AArch64::LD1Fourv4s;
6233     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6234       Opc = AArch64::LD1Fourv2d;
6235     else if (Ty == S64 || Ty == P0)
6236       Opc = AArch64::LD1Fourv1d;
6237     else
6238       llvm_unreachable("Unexpected type for ld1x4!");
6239     selectVectorLoadIntrinsic(Opc, 4, I);
6240     break;
6241   }
6242   case Intrinsic::aarch64_neon_ld2: {
6243     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6244     unsigned Opc = 0;
6245     if (Ty == LLT::fixed_vector(8, S8))
6246       Opc = AArch64::LD2Twov8b;
6247     else if (Ty == LLT::fixed_vector(16, S8))
6248       Opc = AArch64::LD2Twov16b;
6249     else if (Ty == LLT::fixed_vector(4, S16))
6250       Opc = AArch64::LD2Twov4h;
6251     else if (Ty == LLT::fixed_vector(8, S16))
6252       Opc = AArch64::LD2Twov8h;
6253     else if (Ty == LLT::fixed_vector(2, S32))
6254       Opc = AArch64::LD2Twov2s;
6255     else if (Ty == LLT::fixed_vector(4, S32))
6256       Opc = AArch64::LD2Twov4s;
6257     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6258       Opc = AArch64::LD2Twov2d;
6259     else if (Ty == S64 || Ty == P0)
6260       Opc = AArch64::LD1Twov1d;
6261     else
6262       llvm_unreachable("Unexpected type for ld2!");
6263     selectVectorLoadIntrinsic(Opc, 2, I);
6264     break;
6265   }
6266   case Intrinsic::aarch64_neon_ld2lane: {
6267     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6268     unsigned Opc;
6269     if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
6270       Opc = AArch64::LD2i8;
6271     else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16))
6272       Opc = AArch64::LD2i16;
6273     else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32))
6274       Opc = AArch64::LD2i32;
6275     else if (Ty == LLT::fixed_vector(2, S64) ||
6276              Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
6277       Opc = AArch64::LD2i64;
6278     else
6279       llvm_unreachable("Unexpected type for ld2lane!");
6280     if (!selectVectorLoadLaneIntrinsic(Opc, 2, I))
6281       return false;
6282     break;
6283   }
6284   case Intrinsic::aarch64_neon_ld2r: {
6285     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6286     unsigned Opc = 0;
6287     if (Ty == LLT::fixed_vector(8, S8))
6288       Opc = AArch64::LD2Rv8b;
6289     else if (Ty == LLT::fixed_vector(16, S8))
6290       Opc = AArch64::LD2Rv16b;
6291     else if (Ty == LLT::fixed_vector(4, S16))
6292       Opc = AArch64::LD2Rv4h;
6293     else if (Ty == LLT::fixed_vector(8, S16))
6294       Opc = AArch64::LD2Rv8h;
6295     else if (Ty == LLT::fixed_vector(2, S32))
6296       Opc = AArch64::LD2Rv2s;
6297     else if (Ty == LLT::fixed_vector(4, S32))
6298       Opc = AArch64::LD2Rv4s;
6299     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6300       Opc = AArch64::LD2Rv2d;
6301     else if (Ty == S64 || Ty == P0)
6302       Opc = AArch64::LD2Rv1d;
6303     else
6304       llvm_unreachable("Unexpected type for ld2r!");
6305     selectVectorLoadIntrinsic(Opc, 2, I);
6306     break;
6307   }
6308   case Intrinsic::aarch64_neon_ld3: {
6309     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6310     unsigned Opc = 0;
6311     if (Ty == LLT::fixed_vector(8, S8))
6312       Opc = AArch64::LD3Threev8b;
6313     else if (Ty == LLT::fixed_vector(16, S8))
6314       Opc = AArch64::LD3Threev16b;
6315     else if (Ty == LLT::fixed_vector(4, S16))
6316       Opc = AArch64::LD3Threev4h;
6317     else if (Ty == LLT::fixed_vector(8, S16))
6318       Opc = AArch64::LD3Threev8h;
6319     else if (Ty == LLT::fixed_vector(2, S32))
6320       Opc = AArch64::LD3Threev2s;
6321     else if (Ty == LLT::fixed_vector(4, S32))
6322       Opc = AArch64::LD3Threev4s;
6323     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6324       Opc = AArch64::LD3Threev2d;
6325     else if (Ty == S64 || Ty == P0)
6326       Opc = AArch64::LD1Threev1d;
6327     else
6328       llvm_unreachable("Unexpected type for ld3!");
6329     selectVectorLoadIntrinsic(Opc, 3, I);
6330     break;
6331   }
6332   case Intrinsic::aarch64_neon_ld3lane: {
6333     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6334     unsigned Opc;
6335     if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
6336       Opc = AArch64::LD3i8;
6337     else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16))
6338       Opc = AArch64::LD3i16;
6339     else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32))
6340       Opc = AArch64::LD3i32;
6341     else if (Ty == LLT::fixed_vector(2, S64) ||
6342              Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
6343       Opc = AArch64::LD3i64;
6344     else
6345       llvm_unreachable("Unexpected type for ld3lane!");
6346     if (!selectVectorLoadLaneIntrinsic(Opc, 3, I))
6347       return false;
6348     break;
6349   }
6350   case Intrinsic::aarch64_neon_ld3r: {
6351     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6352     unsigned Opc = 0;
6353     if (Ty == LLT::fixed_vector(8, S8))
6354       Opc = AArch64::LD3Rv8b;
6355     else if (Ty == LLT::fixed_vector(16, S8))
6356       Opc = AArch64::LD3Rv16b;
6357     else if (Ty == LLT::fixed_vector(4, S16))
6358       Opc = AArch64::LD3Rv4h;
6359     else if (Ty == LLT::fixed_vector(8, S16))
6360       Opc = AArch64::LD3Rv8h;
6361     else if (Ty == LLT::fixed_vector(2, S32))
6362       Opc = AArch64::LD3Rv2s;
6363     else if (Ty == LLT::fixed_vector(4, S32))
6364       Opc = AArch64::LD3Rv4s;
6365     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6366       Opc = AArch64::LD3Rv2d;
6367     else if (Ty == S64 || Ty == P0)
6368       Opc = AArch64::LD3Rv1d;
6369     else
6370       llvm_unreachable("Unexpected type for ld3r!");
6371     selectVectorLoadIntrinsic(Opc, 3, I);
6372     break;
6373   }
6374   case Intrinsic::aarch64_neon_ld4: {
6375     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6376     unsigned Opc = 0;
6377     if (Ty == LLT::fixed_vector(8, S8))
6378       Opc = AArch64::LD4Fourv8b;
6379     else if (Ty == LLT::fixed_vector(16, S8))
6380       Opc = AArch64::LD4Fourv16b;
6381     else if (Ty == LLT::fixed_vector(4, S16))
6382       Opc = AArch64::LD4Fourv4h;
6383     else if (Ty == LLT::fixed_vector(8, S16))
6384       Opc = AArch64::LD4Fourv8h;
6385     else if (Ty == LLT::fixed_vector(2, S32))
6386       Opc = AArch64::LD4Fourv2s;
6387     else if (Ty == LLT::fixed_vector(4, S32))
6388       Opc = AArch64::LD4Fourv4s;
6389     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6390       Opc = AArch64::LD4Fourv2d;
6391     else if (Ty == S64 || Ty == P0)
6392       Opc = AArch64::LD1Fourv1d;
6393     else
6394       llvm_unreachable("Unexpected type for ld4!");
6395     selectVectorLoadIntrinsic(Opc, 4, I);
6396     break;
6397   }
6398   case Intrinsic::aarch64_neon_ld4lane: {
6399     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6400     unsigned Opc;
6401     if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
6402       Opc = AArch64::LD4i8;
6403     else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16))
6404       Opc = AArch64::LD4i16;
6405     else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32))
6406       Opc = AArch64::LD4i32;
6407     else if (Ty == LLT::fixed_vector(2, S64) ||
6408              Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
6409       Opc = AArch64::LD4i64;
6410     else
6411       llvm_unreachable("Unexpected type for ld4lane!");
6412     if (!selectVectorLoadLaneIntrinsic(Opc, 4, I))
6413       return false;
6414     break;
6415   }
6416   case Intrinsic::aarch64_neon_ld4r: {
6417     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6418     unsigned Opc = 0;
6419     if (Ty == LLT::fixed_vector(8, S8))
6420       Opc = AArch64::LD4Rv8b;
6421     else if (Ty == LLT::fixed_vector(16, S8))
6422       Opc = AArch64::LD4Rv16b;
6423     else if (Ty == LLT::fixed_vector(4, S16))
6424       Opc = AArch64::LD4Rv4h;
6425     else if (Ty == LLT::fixed_vector(8, S16))
6426       Opc = AArch64::LD4Rv8h;
6427     else if (Ty == LLT::fixed_vector(2, S32))
6428       Opc = AArch64::LD4Rv2s;
6429     else if (Ty == LLT::fixed_vector(4, S32))
6430       Opc = AArch64::LD4Rv4s;
6431     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6432       Opc = AArch64::LD4Rv2d;
6433     else if (Ty == S64 || Ty == P0)
6434       Opc = AArch64::LD4Rv1d;
6435     else
6436       llvm_unreachable("Unexpected type for ld4r!");
6437     selectVectorLoadIntrinsic(Opc, 4, I);
6438     break;
6439   }
6440   case Intrinsic::aarch64_neon_st1x2: {
6441     LLT Ty = MRI.getType(I.getOperand(1).getReg());
6442     unsigned Opc;
6443     if (Ty == LLT::fixed_vector(8, S8))
6444       Opc = AArch64::ST1Twov8b;
6445     else if (Ty == LLT::fixed_vector(16, S8))
6446       Opc = AArch64::ST1Twov16b;
6447     else if (Ty == LLT::fixed_vector(4, S16))
6448       Opc = AArch64::ST1Twov4h;
6449     else if (Ty == LLT::fixed_vector(8, S16))
6450       Opc = AArch64::ST1Twov8h;
6451     else if (Ty == LLT::fixed_vector(2, S32))
6452       Opc = AArch64::ST1Twov2s;
6453     else if (Ty == LLT::fixed_vector(4, S32))
6454       Opc = AArch64::ST1Twov4s;
6455     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6456       Opc = AArch64::ST1Twov2d;
6457     else if (Ty == S64 || Ty == P0)
6458       Opc = AArch64::ST1Twov1d;
6459     else
6460       llvm_unreachable("Unexpected type for st1x2!");
6461     selectVectorStoreIntrinsic(I, 2, Opc);
6462     break;
6463   }
6464   case Intrinsic::aarch64_neon_st1x3: {
6465     LLT Ty = MRI.getType(I.getOperand(1).getReg());
6466     unsigned Opc;
6467     if (Ty == LLT::fixed_vector(8, S8))
6468       Opc = AArch64::ST1Threev8b;
6469     else if (Ty == LLT::fixed_vector(16, S8))
6470       Opc = AArch64::ST1Threev16b;
6471     else if (Ty == LLT::fixed_vector(4, S16))
6472       Opc = AArch64::ST1Threev4h;
6473     else if (Ty == LLT::fixed_vector(8, S16))
6474       Opc = AArch64::ST1Threev8h;
6475     else if (Ty == LLT::fixed_vector(2, S32))
6476       Opc = AArch64::ST1Threev2s;
6477     else if (Ty == LLT::fixed_vector(4, S32))
6478       Opc = AArch64::ST1Threev4s;
6479     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6480       Opc = AArch64::ST1Threev2d;
6481     else if (Ty == S64 || Ty == P0)
6482       Opc = AArch64::ST1Threev1d;
6483     else
6484       llvm_unreachable("Unexpected type for st1x3!");
6485     selectVectorStoreIntrinsic(I, 3, Opc);
6486     break;
6487   }
6488   case Intrinsic::aarch64_neon_st1x4: {
6489     LLT Ty = MRI.getType(I.getOperand(1).getReg());
6490     unsigned Opc;
6491     if (Ty == LLT::fixed_vector(8, S8))
6492       Opc = AArch64::ST1Fourv8b;
6493     else if (Ty == LLT::fixed_vector(16, S8))
6494       Opc = AArch64::ST1Fourv16b;
6495     else if (Ty == LLT::fixed_vector(4, S16))
6496       Opc = AArch64::ST1Fourv4h;
6497     else if (Ty == LLT::fixed_vector(8, S16))
6498       Opc = AArch64::ST1Fourv8h;
6499     else if (Ty == LLT::fixed_vector(2, S32))
6500       Opc = AArch64::ST1Fourv2s;
6501     else if (Ty == LLT::fixed_vector(4, S32))
6502       Opc = AArch64::ST1Fourv4s;
6503     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6504       Opc = AArch64::ST1Fourv2d;
6505     else if (Ty == S64 || Ty == P0)
6506       Opc = AArch64::ST1Fourv1d;
6507     else
6508       llvm_unreachable("Unexpected type for st1x4!");
6509     selectVectorStoreIntrinsic(I, 4, Opc);
6510     break;
6511   }
6512   case Intrinsic::aarch64_neon_st2: {
6513     LLT Ty = MRI.getType(I.getOperand(1).getReg());
6514     unsigned Opc;
6515     if (Ty == LLT::fixed_vector(8, S8))
6516       Opc = AArch64::ST2Twov8b;
6517     else if (Ty == LLT::fixed_vector(16, S8))
6518       Opc = AArch64::ST2Twov16b;
6519     else if (Ty == LLT::fixed_vector(4, S16))
6520       Opc = AArch64::ST2Twov4h;
6521     else if (Ty == LLT::fixed_vector(8, S16))
6522       Opc = AArch64::ST2Twov8h;
6523     else if (Ty == LLT::fixed_vector(2, S32))
6524       Opc = AArch64::ST2Twov2s;
6525     else if (Ty == LLT::fixed_vector(4, S32))
6526       Opc = AArch64::ST2Twov4s;
6527     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6528       Opc = AArch64::ST2Twov2d;
6529     else if (Ty == S64 || Ty == P0)
6530       Opc = AArch64::ST1Twov1d;
6531     else
6532       llvm_unreachable("Unexpected type for st2!");
6533     selectVectorStoreIntrinsic(I, 2, Opc);
6534     break;
6535   }
6536   case Intrinsic::aarch64_neon_st3: {
6537     LLT Ty = MRI.getType(I.getOperand(1).getReg());
6538     unsigned Opc;
6539     if (Ty == LLT::fixed_vector(8, S8))
6540       Opc = AArch64::ST3Threev8b;
6541     else if (Ty == LLT::fixed_vector(16, S8))
6542       Opc = AArch64::ST3Threev16b;
6543     else if (Ty == LLT::fixed_vector(4, S16))
6544       Opc = AArch64::ST3Threev4h;
6545     else if (Ty == LLT::fixed_vector(8, S16))
6546       Opc = AArch64::ST3Threev8h;
6547     else if (Ty == LLT::fixed_vector(2, S32))
6548       Opc = AArch64::ST3Threev2s;
6549     else if (Ty == LLT::fixed_vector(4, S32))
6550       Opc = AArch64::ST3Threev4s;
6551     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6552       Opc = AArch64::ST3Threev2d;
6553     else if (Ty == S64 || Ty == P0)
6554       Opc = AArch64::ST1Threev1d;
6555     else
6556       llvm_unreachable("Unexpected type for st3!");
6557     selectVectorStoreIntrinsic(I, 3, Opc);
6558     break;
6559   }
6560   case Intrinsic::aarch64_neon_st4: {
6561     LLT Ty = MRI.getType(I.getOperand(1).getReg());
6562     unsigned Opc;
6563     if (Ty == LLT::fixed_vector(8, S8))
6564       Opc = AArch64::ST4Fourv8b;
6565     else if (Ty == LLT::fixed_vector(16, S8))
6566       Opc = AArch64::ST4Fourv16b;
6567     else if (Ty == LLT::fixed_vector(4, S16))
6568       Opc = AArch64::ST4Fourv4h;
6569     else if (Ty == LLT::fixed_vector(8, S16))
6570       Opc = AArch64::ST4Fourv8h;
6571     else if (Ty == LLT::fixed_vector(2, S32))
6572       Opc = AArch64::ST4Fourv2s;
6573     else if (Ty == LLT::fixed_vector(4, S32))
6574       Opc = AArch64::ST4Fourv4s;
6575     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6576       Opc = AArch64::ST4Fourv2d;
6577     else if (Ty == S64 || Ty == P0)
6578       Opc = AArch64::ST1Fourv1d;
6579     else
6580       llvm_unreachable("Unexpected type for st4!");
6581     selectVectorStoreIntrinsic(I, 4, Opc);
6582     break;
6583   }
6584   case Intrinsic::aarch64_neon_st2lane: {
6585     LLT Ty = MRI.getType(I.getOperand(1).getReg());
6586     unsigned Opc;
6587     if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
6588       Opc = AArch64::ST2i8;
6589     else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16))
6590       Opc = AArch64::ST2i16;
6591     else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32))
6592       Opc = AArch64::ST2i32;
6593     else if (Ty == LLT::fixed_vector(2, S64) ||
6594              Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
6595       Opc = AArch64::ST2i64;
6596     else
6597       llvm_unreachable("Unexpected type for st2lane!");
6598     if (!selectVectorStoreLaneIntrinsic(I, 2, Opc))
6599       return false;
6600     break;
6601   }
6602   case Intrinsic::aarch64_neon_st3lane: {
6603     LLT Ty = MRI.getType(I.getOperand(1).getReg());
6604     unsigned Opc;
6605     if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
6606       Opc = AArch64::ST3i8;
6607     else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16))
6608       Opc = AArch64::ST3i16;
6609     else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32))
6610       Opc = AArch64::ST3i32;
6611     else if (Ty == LLT::fixed_vector(2, S64) ||
6612              Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
6613       Opc = AArch64::ST3i64;
6614     else
6615       llvm_unreachable("Unexpected type for st3lane!");
6616     if (!selectVectorStoreLaneIntrinsic(I, 3, Opc))
6617       return false;
6618     break;
6619   }
6620   case Intrinsic::aarch64_neon_st4lane: {
6621     LLT Ty = MRI.getType(I.getOperand(1).getReg());
6622     unsigned Opc;
6623     if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
6624       Opc = AArch64::ST4i8;
6625     else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16))
6626       Opc = AArch64::ST4i16;
6627     else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32))
6628       Opc = AArch64::ST4i32;
6629     else if (Ty == LLT::fixed_vector(2, S64) ||
6630              Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
6631       Opc = AArch64::ST4i64;
6632     else
6633       llvm_unreachable("Unexpected type for st4lane!");
6634     if (!selectVectorStoreLaneIntrinsic(I, 4, Opc))
6635       return false;
6636     break;
6637   }
6638   case Intrinsic::aarch64_mops_memset_tag: {
6639     // Transform
6640     //    %dst:gpr(p0) = \
6641     //      G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aarch64.mops.memset.tag), \
6642     //      %dst:gpr(p0), %val:gpr(s64), %n:gpr(s64)
6643     // where %dst is updated, into
6644     //    (%Rd:GPR64common, %Rn:GPR64) = \
6645     //      MOPSMemorySetTaggingPseudo \
6646     //      %Rd:GPR64common, %Rn:GPR64, %Rm:GPR64
6647     // where Rd and Rn are tied.
6648     // It is expected that %val has been extended to s64 in legalization.
6649     // Note that the order of the size/value operands is swapped.
6650 
6651     Register DstDef = I.getOperand(0).getReg();
6652     // I.getOperand(1) is the intrinsic function
6653     Register DstUse = I.getOperand(2).getReg();
6654     Register ValUse = I.getOperand(3).getReg();
6655     Register SizeUse = I.getOperand(4).getReg();
6656 
6657     // MOPSMemorySetTaggingPseudo has two defs; the intrinsic call has only one.
6658     // Therefore an additional virtual register is required for the updated size
6659     // operand. This value is not accessible via the semantics of the intrinsic.
6660     Register SizeDef = MRI.createGenericVirtualRegister(LLT::scalar(64));
6661 
6662     auto Memset = MIB.buildInstr(AArch64::MOPSMemorySetTaggingPseudo,
6663                                  {DstDef, SizeDef}, {DstUse, SizeUse, ValUse});
6664     Memset.cloneMemRefs(I);
6665     constrainSelectedInstRegOperands(*Memset, TII, TRI, RBI);
6666     break;
6667   }
6668   }
6669 
6670   I.eraseFromParent();
6671   return true;
6672 }
6673 
6674 bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I,
6675                                                  MachineRegisterInfo &MRI) {
6676   unsigned IntrinID = cast<GIntrinsic>(I).getIntrinsicID();
6677 
6678   switch (IntrinID) {
6679   default:
6680     break;
6681   case Intrinsic::aarch64_crypto_sha1h: {
6682     Register DstReg = I.getOperand(0).getReg();
6683     Register SrcReg = I.getOperand(2).getReg();
6684 
6685     // FIXME: Should this be an assert?
6686     if (MRI.getType(DstReg).getSizeInBits() != 32 ||
6687         MRI.getType(SrcReg).getSizeInBits() != 32)
6688       return false;
6689 
6690     // The operation has to happen on FPRs. Set up some new FPR registers for
6691     // the source and destination if they are on GPRs.
6692     if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) {
6693       SrcReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
6694       MIB.buildCopy({SrcReg}, {I.getOperand(2)});
6695 
6696       // Make sure the copy ends up getting constrained properly.
6697       RBI.constrainGenericRegister(I.getOperand(2).getReg(),
6698                                    AArch64::GPR32RegClass, MRI);
6699     }
6700 
6701     if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID)
6702       DstReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
6703 
6704     // Actually insert the instruction.
6705     auto SHA1Inst = MIB.buildInstr(AArch64::SHA1Hrr, {DstReg}, {SrcReg});
6706     constrainSelectedInstRegOperands(*SHA1Inst, TII, TRI, RBI);
6707 
6708     // Did we create a new register for the destination?
6709     if (DstReg != I.getOperand(0).getReg()) {
6710       // Yep. Copy the result of the instruction back into the original
6711       // destination.
6712       MIB.buildCopy({I.getOperand(0)}, {DstReg});
6713       RBI.constrainGenericRegister(I.getOperand(0).getReg(),
6714                                    AArch64::GPR32RegClass, MRI);
6715     }
6716 
6717     I.eraseFromParent();
6718     return true;
6719   }
6720   case Intrinsic::frameaddress:
6721   case Intrinsic::returnaddress: {
6722     MachineFunction &MF = *I.getParent()->getParent();
6723     MachineFrameInfo &MFI = MF.getFrameInfo();
6724 
6725     unsigned Depth = I.getOperand(2).getImm();
6726     Register DstReg = I.getOperand(0).getReg();
6727     RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI);
6728 
6729     if (Depth == 0 && IntrinID == Intrinsic::returnaddress) {
6730       if (!MFReturnAddr) {
6731         // Insert the copy from LR/X30 into the entry block, before it can be
6732         // clobbered by anything.
6733         MFI.setReturnAddressIsTaken(true);
6734         MFReturnAddr = getFunctionLiveInPhysReg(
6735             MF, TII, AArch64::LR, AArch64::GPR64RegClass, I.getDebugLoc());
6736       }
6737 
6738       if (STI.hasPAuth()) {
6739         MIB.buildInstr(AArch64::XPACI, {DstReg}, {MFReturnAddr});
6740       } else {
6741         MIB.buildCopy({Register(AArch64::LR)}, {MFReturnAddr});
6742         MIB.buildInstr(AArch64::XPACLRI);
6743         MIB.buildCopy({DstReg}, {Register(AArch64::LR)});
6744       }
6745 
6746       I.eraseFromParent();
6747       return true;
6748     }
6749 
6750     MFI.setFrameAddressIsTaken(true);
6751     Register FrameAddr(AArch64::FP);
6752     while (Depth--) {
6753       Register NextFrame = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
6754       auto Ldr =
6755           MIB.buildInstr(AArch64::LDRXui, {NextFrame}, {FrameAddr}).addImm(0);
6756       constrainSelectedInstRegOperands(*Ldr, TII, TRI, RBI);
6757       FrameAddr = NextFrame;
6758     }
6759 
6760     if (IntrinID == Intrinsic::frameaddress)
6761       MIB.buildCopy({DstReg}, {FrameAddr});
6762     else {
6763       MFI.setReturnAddressIsTaken(true);
6764 
6765       if (STI.hasPAuth()) {
6766         Register TmpReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
6767         MIB.buildInstr(AArch64::LDRXui, {TmpReg}, {FrameAddr}).addImm(1);
6768         MIB.buildInstr(AArch64::XPACI, {DstReg}, {TmpReg});
6769       } else {
6770         MIB.buildInstr(AArch64::LDRXui, {Register(AArch64::LR)}, {FrameAddr})
6771             .addImm(1);
6772         MIB.buildInstr(AArch64::XPACLRI);
6773         MIB.buildCopy({DstReg}, {Register(AArch64::LR)});
6774       }
6775     }
6776 
6777     I.eraseFromParent();
6778     return true;
6779   }
6780   case Intrinsic::swift_async_context_addr:
6781     auto Sub = MIB.buildInstr(AArch64::SUBXri, {I.getOperand(0).getReg()},
6782                               {Register(AArch64::FP)})
6783                    .addImm(8)
6784                    .addImm(0);
6785     constrainSelectedInstRegOperands(*Sub, TII, TRI, RBI);
6786 
6787     MF->getFrameInfo().setFrameAddressIsTaken(true);
6788     MF->getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
6789     I.eraseFromParent();
6790     return true;
6791   }
6792   return false;
6793 }
6794 
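// The selectShiftA_*/selectShiftB_* renderers below encode a constant
// left-shift amount as the two immediates of a bitfield-move instruction,
// roughly mirroring the i32shift_a/i32shift_b helpers on the SelectionDAG
// side: "A" renders (BitWidth - Imm) & (BitWidth - 1) and "B" renders
// (BitWidth - 1) - Imm. E.g. for a 32-bit shift by 3, selectShiftA_32
// renders 29 and selectShiftB_32 renders 28 (the arithmetic here simply
// restates the formulas in the functions below).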
6795 InstructionSelector::ComplexRendererFns
6796 AArch64InstructionSelector::selectShiftA_32(const MachineOperand &Root) const {
6797   auto MaybeImmed = getImmedFromMO(Root);
6798   if (MaybeImmed == std::nullopt || *MaybeImmed > 31)
6799     return std::nullopt;
6800   uint64_t Enc = (32 - *MaybeImmed) & 0x1f;
6801   return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
6802 }
6803 
6804 InstructionSelector::ComplexRendererFns
6805 AArch64InstructionSelector::selectShiftB_32(const MachineOperand &Root) const {
6806   auto MaybeImmed = getImmedFromMO(Root);
6807   if (MaybeImmed == std::nullopt || *MaybeImmed > 31)
6808     return std::nullopt;
6809   uint64_t Enc = 31 - *MaybeImmed;
6810   return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
6811 }
6812 
6813 InstructionSelector::ComplexRendererFns
6814 AArch64InstructionSelector::selectShiftA_64(const MachineOperand &Root) const {
6815   auto MaybeImmed = getImmedFromMO(Root);
6816   if (MaybeImmed == std::nullopt || *MaybeImmed > 63)
6817     return std::nullopt;
6818   uint64_t Enc = (64 - *MaybeImmed) & 0x3f;
6819   return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
6820 }
6821 
6822 InstructionSelector::ComplexRendererFns
6823 AArch64InstructionSelector::selectShiftB_64(const MachineOperand &Root) const {
6824   auto MaybeImmed = getImmedFromMO(Root);
6825   if (MaybeImmed == std::nullopt || *MaybeImmed > 63)
6826     return std::nullopt;
6827   uint64_t Enc = 63 - *MaybeImmed;
6828   return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
6829 }
6830 
6831 /// Helper to select an immediate value that can be represented as a 12-bit
6832 /// value shifted left by either 0 or 12. If it is possible to do so, return
6833 /// the immediate and shift value. If not, return std::nullopt.
6834 ///
6835 /// Used by selectArithImmed and selectNegArithImmed.
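///
/// For example (a worked sketch of the logic below): 0xabc renders as
/// {imm = 0xabc, shift = LSL #0}, 0xabc000 renders as {imm = 0xabc,
/// shift = LSL #12}, and 0xabc001 is rejected because it needs bits in both
/// halves.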
6836 InstructionSelector::ComplexRendererFns
6837 AArch64InstructionSelector::select12BitValueWithLeftShift(
6838     uint64_t Immed) const {
6839   unsigned ShiftAmt;
6840   if (Immed >> 12 == 0) {
6841     ShiftAmt = 0;
6842   } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) {
6843     ShiftAmt = 12;
6844     Immed = Immed >> 12;
6845   } else
6846     return std::nullopt;
6847 
6848   unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt);
6849   return {{
6850       [=](MachineInstrBuilder &MIB) { MIB.addImm(Immed); },
6851       [=](MachineInstrBuilder &MIB) { MIB.addImm(ShVal); },
6852   }};
6853 }
6854 
6855 /// SelectArithImmed - Select an immediate value that can be represented as
6856 /// a 12-bit value shifted left by either 0 or 12.  If so, return true with
6857 /// Val set to the 12-bit value and Shift set to the shifter operand.
6858 InstructionSelector::ComplexRendererFns
6859 AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const {
6860   // This function is called from the addsub_shifted_imm ComplexPattern,
6861   // which lists [imm] as the list of opcode it's interested in, however
6862   // which lists [imm] as the list of opcodes it's interested in. However,
6863   // here because the ComplexPattern opcode list is only used in
6864   // root-level opcode matching.
6865   auto MaybeImmed = getImmedFromMO(Root);
6866   if (MaybeImmed == std::nullopt)
6867     return std::nullopt;
6868   return select12BitValueWithLeftShift(*MaybeImmed);
6869 }
6870 
6871 /// SelectNegArithImmed - As above, but negates the value before trying to
6872 /// select it.
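///
/// For example (sketch): a 32-bit constant of -4 (0xfffffffc) negates to 4
/// and renders as {imm = 4, shift = LSL #0}, which lets e.g. a compare
/// against -4 be selected as a CMN with #4.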
6873 InstructionSelector::ComplexRendererFns
6874 AArch64InstructionSelector::selectNegArithImmed(MachineOperand &Root) const {
6875   // We need a register here, because we need to know if we have a 64 or 32
6876   // bit immediate.
6877   if (!Root.isReg())
6878     return std::nullopt;
6879   auto MaybeImmed = getImmedFromMO(Root);
6880   if (MaybeImmed == std::nullopt)
6881     return std::nullopt;
6882   uint64_t Immed = *MaybeImmed;
6883 
6884   // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0"
6885   // have the opposite effect on the C flag, so this pattern mustn't match under
6886   // those circumstances.
6887   if (Immed == 0)
6888     return std::nullopt;
6889 
6890   // Check whether the root has a 32-bit or a 64-bit type, so that the
6891   // negation wraps at the correct width.
6892   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
6893   if (MRI.getType(Root.getReg()).getSizeInBits() == 32)
6894     Immed = ~((uint32_t)Immed) + 1;
6895   else
6896     Immed = ~Immed + 1ULL;
6897 
6898   if (Immed & 0xFFFFFFFFFF000000ULL)
6899     return std::nullopt;
6900 
6901   Immed &= 0xFFFFFFULL;
6902   return select12BitValueWithLeftShift(Immed);
6903 }
6904 
6905 /// Return true if it is worth folding MI into an extended register. That is,
6906 /// if it's safe to pull it into the addressing mode of a load or store as a
6907 /// shift.
6908 bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg(
6909     MachineInstr &MI, const MachineRegisterInfo &MRI) const {
6910   // Always fold if there is one use, or if we're optimizing for size.
6911   Register DefReg = MI.getOperand(0).getReg();
6912   if (MRI.hasOneNonDBGUse(DefReg) ||
6913       MI.getParent()->getParent()->getFunction().hasOptSize())
6914     return true;
6915 
6916   // It's better to avoid folding and recomputing shifts when we don't have a
6917   // fastpath.
6918   if (!STI.hasAddrLSLFast())
6919     return false;
6920 
6921   // We have a fastpath, so folding a shift in and potentially computing it
6922   // many times may be beneficial. Check if this is only used in memory ops.
6923   // If it is, then we should fold.
6924   return all_of(MRI.use_nodbg_instructions(DefReg),
6925                 [](MachineInstr &Use) { return Use.mayLoadOrStore(); });
6926 }
6927 
6928 static bool isSignExtendShiftType(AArch64_AM::ShiftExtendType Type) {
6929   switch (Type) {
6930   case AArch64_AM::SXTB:
6931   case AArch64_AM::SXTH:
6932   case AArch64_AM::SXTW:
6933     return true;
6934   default:
6935     return false;
6936   }
6937 }
6938 
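/// Match an offset computed as (G_SHL %off, log2(SizeInBytes)) or
/// (G_MUL %off, SizeInBytes), optionally looking through a G_ZEXT when
/// WantsExt is true, and render it as a register offset with the shift flag
/// set. E.g. for a 4-byte access (SizeInBytes = 4, LegalShiftVal = 2),
/// "G_MUL %off, 4" folds the same way as "G_SHL %off, 2".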
6939 InstructionSelector::ComplexRendererFns
6940 AArch64InstructionSelector::selectExtendedSHL(
6941     MachineOperand &Root, MachineOperand &Base, MachineOperand &Offset,
6942     unsigned SizeInBytes, bool WantsExt) const {
6943   assert(Base.isReg() && "Expected base to be a register operand");
6944   assert(Offset.isReg() && "Expected offset to be a register operand");
6945 
6946   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
6947   MachineInstr *OffsetInst = MRI.getVRegDef(Offset.getReg());
6948 
6949   unsigned OffsetOpc = OffsetInst->getOpcode();
6950   bool LookedThroughZExt = false;
6951   if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL) {
6952     // Try to look through a ZEXT.
6953     if (OffsetOpc != TargetOpcode::G_ZEXT || !WantsExt)
6954       return std::nullopt;
6955 
6956     OffsetInst = MRI.getVRegDef(OffsetInst->getOperand(1).getReg());
6957     OffsetOpc = OffsetInst->getOpcode();
6958     LookedThroughZExt = true;
6959 
6960     if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL)
6961       return std::nullopt;
6962   }
6963   // Make sure that the memory op is a valid size.
6964   int64_t LegalShiftVal = Log2_32(SizeInBytes);
6965   if (LegalShiftVal == 0)
6966     return std::nullopt;
6967   if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI))
6968     return std::nullopt;
6969 
6970   // Now, try to find the specific G_CONSTANT. Start by assuming that the
6971   // register we will offset is the LHS, and the register containing the
6972   // constant is the RHS.
6973   Register OffsetReg = OffsetInst->getOperand(1).getReg();
6974   Register ConstantReg = OffsetInst->getOperand(2).getReg();
6975   auto ValAndVReg = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
6976   if (!ValAndVReg) {
6977     // We didn't get a constant on the RHS. If the opcode is a shift, then
6978     // we're done.
6979     if (OffsetOpc == TargetOpcode::G_SHL)
6980       return std::nullopt;
6981 
6982     // If we have a G_MUL, we can use either register. Try looking at the RHS.
6983     std::swap(OffsetReg, ConstantReg);
6984     ValAndVReg = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
6985     if (!ValAndVReg)
6986       return std::nullopt;
6987   }
6988 
6989   // The value must fit into 3 bits, and must be positive. Make sure that is
6990   // true.
6991   int64_t ImmVal = ValAndVReg->Value.getSExtValue();
6992 
6993   // Since we're going to pull this into a shift, the constant value must be
6994   // a power of 2. If we got a multiply, then we need to check this.
6995   if (OffsetOpc == TargetOpcode::G_MUL) {
6996     if (!llvm::has_single_bit<uint32_t>(ImmVal))
6997       return std::nullopt;
6998 
6999     // Got a power of 2. So, the amount we'll shift is the log base-2 of that.
7000     ImmVal = Log2_32(ImmVal);
7001   }
7002 
7003   if ((ImmVal & 0x7) != ImmVal)
7004     return std::nullopt;
7005 
7006   // We are only allowed to shift by LegalShiftVal. This shift value is built
7007   // into the instruction, so we can't just use whatever we want.
7008   if (ImmVal != LegalShiftVal)
7009     return std::nullopt;
7010 
7011   unsigned SignExtend = 0;
7012   if (WantsExt) {
7013     // Check if the offset is defined by an extend, unless we looked through a
7014     // G_ZEXT earlier.
7015     if (!LookedThroughZExt) {
7016       MachineInstr *ExtInst = getDefIgnoringCopies(OffsetReg, MRI);
7017       auto Ext = getExtendTypeForInst(*ExtInst, MRI, true);
7018       if (Ext == AArch64_AM::InvalidShiftExtend)
7019         return std::nullopt;
7020 
7021       SignExtend = isSignExtendShiftType(Ext) ? 1 : 0;
7022       // We only support SXTW for signed extension here.
7023       if (SignExtend && Ext != AArch64_AM::SXTW)
7024         return std::nullopt;
7025       OffsetReg = ExtInst->getOperand(1).getReg();
7026     }
7027 
7028     // Need a 32-bit wide register here.
7029     MachineIRBuilder MIB(*MRI.getVRegDef(Root.getReg()));
7030     OffsetReg = moveScalarRegClass(OffsetReg, AArch64::GPR32RegClass, MIB);
7031   }
7032 
7033   // We can use the LHS of the GEP as the base, and the LHS of the shift as an
7034   // offset. Signify that we are shifting by setting the shift flag to 1.
7035   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(Base.getReg()); },
7036            [=](MachineInstrBuilder &MIB) { MIB.addUse(OffsetReg); },
7037            [=](MachineInstrBuilder &MIB) {
7038              // Need to add both immediates here to make sure that they are both
7039              // added to the instruction.
7040              MIB.addImm(SignExtend);
7041              MIB.addImm(1);
7042            }}};
7043 }
7044 
7045 /// This is used for computing addresses like this:
7046 ///
7047 /// ldr x1, [x2, x3, lsl #3]
7048 ///
7049 /// Where x2 is the base register, and x3 is an offset register. The shift-left
7050 /// is a constant value specific to this load instruction. That is, we'll never
7051 /// see anything other than a 3 here (which corresponds to the size of the
7052 /// element being loaded).
7053 InstructionSelector::ComplexRendererFns
7054 AArch64InstructionSelector::selectAddrModeShiftedExtendXReg(
7055     MachineOperand &Root, unsigned SizeInBytes) const {
7056   if (!Root.isReg())
7057     return std::nullopt;
7058   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7059 
7060   // We want to find something like this:
7061   //
7062   // val = G_CONSTANT LegalShiftVal
7063   // shift = G_SHL off_reg val
7064   // ptr = G_PTR_ADD base_reg shift
7065   // x = G_LOAD ptr
7066   //
7067   // And fold it into this addressing mode:
7068   //
7069   // ldr x, [base_reg, off_reg, lsl #LegalShiftVal]
7070 
7071   // Check if we can find the G_PTR_ADD.
7072   MachineInstr *PtrAdd =
7073       getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
7074   if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI))
7075     return std::nullopt;
7076 
7077   // Now, try to match an opcode which will match our specific offset.
7078   // We want a G_SHL or a G_MUL.
7079   MachineInstr *OffsetInst =
7080       getDefIgnoringCopies(PtrAdd->getOperand(2).getReg(), MRI);
7081   return selectExtendedSHL(Root, PtrAdd->getOperand(1),
7082                            OffsetInst->getOperand(0), SizeInBytes,
7083                            /*WantsExt=*/false);
7084 }
7085 
7086 /// This is used for computing addresses like this:
7087 ///
7088 /// ldr x1, [x2, x3]
7089 ///
7090 /// Where x2 is the base register, and x3 is an offset register.
7091 ///
7092 /// When possible (or profitable) to fold a G_PTR_ADD into the address
7093 /// calculation, this will do so. Otherwise, it will return std::nullopt.
7094 InstructionSelector::ComplexRendererFns
7095 AArch64InstructionSelector::selectAddrModeRegisterOffset(
7096     MachineOperand &Root) const {
7097   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7098 
7099   // We need a GEP.
7100   MachineInstr *Gep = MRI.getVRegDef(Root.getReg());
7101   if (Gep->getOpcode() != TargetOpcode::G_PTR_ADD)
7102     return std::nullopt;
7103 
7104   // If this is used more than once, let's not bother folding.
7105   // TODO: Check if they are memory ops. If they are, then we can still fold
7106   // without having to recompute anything.
7107   if (!MRI.hasOneNonDBGUse(Gep->getOperand(0).getReg()))
7108     return std::nullopt;
7109 
7110   // Base is the GEP's LHS, offset is its RHS.
7111   return {{[=](MachineInstrBuilder &MIB) {
7112              MIB.addUse(Gep->getOperand(1).getReg());
7113            },
7114            [=](MachineInstrBuilder &MIB) {
7115              MIB.addUse(Gep->getOperand(2).getReg());
7116            },
7117            [=](MachineInstrBuilder &MIB) {
7118              // Need to add both immediates here to make sure that they are both
7119              // added to the instruction.
7120              MIB.addImm(0);
7121              MIB.addImm(0);
7122            }}};
7123 }
7124 
7125 /// This is intended to be equivalent to selectAddrModeXRO in
7126 /// AArch64ISelDAGtoDAG. It's used for selecting X register offset loads.
7127 InstructionSelector::ComplexRendererFns
7128 AArch64InstructionSelector::selectAddrModeXRO(MachineOperand &Root,
7129                                               unsigned SizeInBytes) const {
7130   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7131   if (!Root.isReg())
7132     return std::nullopt;
7133   MachineInstr *PtrAdd =
7134       getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
7135   if (!PtrAdd)
7136     return std::nullopt;
7137 
7138   // Check for immediates which cannot be encoded in the [base + imm]
7139   // addressing mode, and can't be encoded in an add/sub. If this happens, we'll
7140   // end up with code like:
7141   //
7142   // mov x0, wide
7143   // add x1 base, x0
7144   // ldr x2, [x1, x0]
7145   //
7146   // In this situation, we can use the [base, xreg] addressing mode to save an
7147   // add/sub:
7148   //
7149   // mov x0, wide
7150   // ldr x2, [base, x0]
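  // (Sketch of the numbers for an 8-byte access: an aligned offset below
  // 0x8000 stays in the scaled [base + imm] form and we bail out; 0xfff or
  // 0xfff000 can be handled by a single add/sub, so we also bail out;
  // something like 0x123456 can't, and falls through to the register-offset
  // forms below.)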
7151   auto ValAndVReg =
7152       getIConstantVRegValWithLookThrough(PtrAdd->getOperand(2).getReg(), MRI);
7153   if (ValAndVReg) {
7154     unsigned Scale = Log2_32(SizeInBytes);
7155     int64_t ImmOff = ValAndVReg->Value.getSExtValue();
7156 
7157     // Skip immediates that can be selected in the load/store addressing
7158     // mode.
7159     if (ImmOff % SizeInBytes == 0 && ImmOff >= 0 &&
7160         ImmOff < (0x1000 << Scale))
7161       return std::nullopt;
7162 
7163     // Helper lambda to decide whether or not it is preferable to emit an add.
7164     auto isPreferredADD = [](int64_t ImmOff) {
7165       // Constants in [0x0, 0xfff] can be encoded in an add.
7166       if ((ImmOff & 0xfffffffffffff000LL) == 0x0LL)
7167         return true;
7168 
7169       // Can it be encoded in an add lsl #12?
7170       if ((ImmOff & 0xffffffffff000fffLL) != 0x0LL)
7171         return false;
7172 
7173       // It can be encoded in an add lsl #12, but we may not want to. If it is
7174       // possible to select this as a single movz, then prefer that. A single
7175       // movz is faster than an add with a shift.
7176       return (ImmOff & 0xffffffffff00ffffLL) != 0x0LL &&
7177              (ImmOff & 0xffffffffffff0fffLL) != 0x0LL;
7178     };
7179 
7180     // If the immediate can be encoded in a single add/sub, then bail out.
7181     if (isPreferredADD(ImmOff) || isPreferredADD(-ImmOff))
7182       return std::nullopt;
7183   }
7184 
7185   // Try to fold shifts into the addressing mode.
7186   auto AddrModeFns = selectAddrModeShiftedExtendXReg(Root, SizeInBytes);
7187   if (AddrModeFns)
7188     return AddrModeFns;
7189 
7190   // If that doesn't work, see if it's possible to fold in registers from
7191   // a GEP.
7192   return selectAddrModeRegisterOffset(Root);
7193 }
7194 
7195 /// This is used for computing addresses like this:
7196 ///
7197 /// ldr x0, [xBase, wOffset, sxtw #LegalShiftVal]
7198 ///
7199 /// Where we have a 64-bit base register, a 32-bit offset register, and an
7200 /// extend (which may or may not be signed).
7201 InstructionSelector::ComplexRendererFns
7202 AArch64InstructionSelector::selectAddrModeWRO(MachineOperand &Root,
7203                                               unsigned SizeInBytes) const {
7204   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7205 
7206   MachineInstr *PtrAdd =
7207       getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
7208   if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI))
7209     return std::nullopt;
7210 
7211   MachineOperand &LHS = PtrAdd->getOperand(1);
7212   MachineOperand &RHS = PtrAdd->getOperand(2);
7213   MachineInstr *OffsetInst = getDefIgnoringCopies(RHS.getReg(), MRI);
7214 
7215   // The first case is the same as selectAddrModeXRO, except we need an extend.
7216   // In this case, we try to find a shift and extend, and fold them into the
7217   // addressing mode.
7218   //
7219   // E.g.
7220   //
7221   // off_reg = G_Z/S/ANYEXT ext_reg
7222   // val = G_CONSTANT LegalShiftVal
7223   // shift = G_SHL off_reg val
7224   // ptr = G_PTR_ADD base_reg shift
7225   // x = G_LOAD ptr
7226   //
7227   // In this case we can get a load like this:
7228   //
7229   // ldr x0, [base_reg, ext_reg, sxtw #LegalShiftVal]
7230   auto ExtendedShl = selectExtendedSHL(Root, LHS, OffsetInst->getOperand(0),
7231                                        SizeInBytes, /*WantsExt=*/true);
7232   if (ExtendedShl)
7233     return ExtendedShl;
7234 
7235   // There was no shift. We can try and fold a G_Z/S/ANYEXT in alone though.
7236   //
7237   // e.g.
7238   // ldr something, [base_reg, ext_reg, sxtw]
7239   if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI))
7240     return std::nullopt;
7241 
7242   // Check if this is an extend. We'll get an extend type if it is.
7243   AArch64_AM::ShiftExtendType Ext =
7244       getExtendTypeForInst(*OffsetInst, MRI, /*IsLoadStore=*/true);
7245   if (Ext == AArch64_AM::InvalidShiftExtend)
7246     return std::nullopt;
7247 
7248   // Need a 32-bit wide register.
7249   MachineIRBuilder MIB(*PtrAdd);
7250   Register ExtReg = moveScalarRegClass(OffsetInst->getOperand(1).getReg(),
7251                                        AArch64::GPR32RegClass, MIB);
7252   unsigned SignExtend = Ext == AArch64_AM::SXTW;
7253 
7254   // Base is LHS, offset is ExtReg.
7255   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(LHS.getReg()); },
7256            [=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); },
7257            [=](MachineInstrBuilder &MIB) {
7258              MIB.addImm(SignExtend);
7259              MIB.addImm(0);
7260            }}};
7261 }
7262 
7263 /// Select a "register plus unscaled signed 9-bit immediate" address.  This
7264 /// should only match when there is an offset that is not valid for a scaled
7265 /// immediate addressing mode.  The "Size" argument is the size in bytes of the
7266 /// memory reference, which is needed here to know what is valid for a scaled
7267 /// immediate.
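///
/// E.g. for a 4-byte access, a byte offset of 3 can't be used by the scaled
/// form (it isn't a multiple of 4), but it lies in [-256, 256) and so is
/// rendered here as an unscaled (ldur/stur-style) offset.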
7268 InstructionSelector::ComplexRendererFns
7269 AArch64InstructionSelector::selectAddrModeUnscaled(MachineOperand &Root,
7270                                                    unsigned Size) const {
7271   MachineRegisterInfo &MRI =
7272       Root.getParent()->getParent()->getParent()->getRegInfo();
7273 
7274   if (!Root.isReg())
7275     return std::nullopt;
7276 
7277   if (!isBaseWithConstantOffset(Root, MRI))
7278     return std::nullopt;
7279 
7280   MachineInstr *RootDef = MRI.getVRegDef(Root.getReg());
7281 
7282   MachineOperand &OffImm = RootDef->getOperand(2);
7283   if (!OffImm.isReg())
7284     return std::nullopt;
7285   MachineInstr *RHS = MRI.getVRegDef(OffImm.getReg());
7286   if (RHS->getOpcode() != TargetOpcode::G_CONSTANT)
7287     return std::nullopt;
7288   int64_t RHSC;
7289   MachineOperand &RHSOp1 = RHS->getOperand(1);
7290   if (!RHSOp1.isCImm() || RHSOp1.getCImm()->getBitWidth() > 64)
7291     return std::nullopt;
7292   RHSC = RHSOp1.getCImm()->getSExtValue();
7293 
7294   if (RHSC >= -256 && RHSC < 256) {
7295     MachineOperand &Base = RootDef->getOperand(1);
7296     return {{
7297         [=](MachineInstrBuilder &MIB) { MIB.add(Base); },
7298         [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC); },
7299     }};
7300   }
7301   return std::nullopt;
7302 }
7303 
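/// Try to fold the low part of an ADRP + G_ADD_LOW global address into the
/// scaled-immediate operand of a load/store, roughly:
///
///   %page = ADRP @g
///   %addr = G_ADD_LOW %page, @g
///   ... load/store via %addr ...
///     -->
///   ldr x0, [%page, :lo12:@g]
///
/// This is only done when the global's alignment and offset are compatible
/// with the access size.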
7304 InstructionSelector::ComplexRendererFns
7305 AArch64InstructionSelector::tryFoldAddLowIntoImm(MachineInstr &RootDef,
7306                                                  unsigned Size,
7307                                                  MachineRegisterInfo &MRI) const {
7308   if (RootDef.getOpcode() != AArch64::G_ADD_LOW)
7309     return std::nullopt;
7310   MachineInstr &Adrp = *MRI.getVRegDef(RootDef.getOperand(1).getReg());
7311   if (Adrp.getOpcode() != AArch64::ADRP)
7312     return std::nullopt;
7313 
7314   // TODO: add heuristics like isWorthFoldingADDlow() from SelectionDAG.
7315   auto Offset = Adrp.getOperand(1).getOffset();
7316   if (Offset % Size != 0)
7317     return std::nullopt;
7318 
7319   auto GV = Adrp.getOperand(1).getGlobal();
7320   if (GV->isThreadLocal())
7321     return std::nullopt;
7322 
7323   auto &MF = *RootDef.getParent()->getParent();
7324   if (GV->getPointerAlignment(MF.getDataLayout()) < Size)
7325     return std::nullopt;
7326 
7327   unsigned OpFlags = STI.ClassifyGlobalReference(GV, MF.getTarget());
7328   MachineIRBuilder MIRBuilder(RootDef);
7329   Register AdrpReg = Adrp.getOperand(0).getReg();
7330   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(AdrpReg); },
7331            [=](MachineInstrBuilder &MIB) {
7332              MIB.addGlobalAddress(GV, Offset,
7333                                   OpFlags | AArch64II::MO_PAGEOFF |
7334                                       AArch64II::MO_NC);
7335            }}};
7336 }
7337 
7338 /// Select a "register plus scaled unsigned 12-bit immediate" address.  The
7339 /// "Size" argument is the size in bytes of the memory reference, which
7340 /// determines the scale.
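///
/// E.g. for an 8-byte access (Size = 8), a base plus a constant byte offset
/// of 48 renders as the base register and the scaled immediate 6 (48 >> 3).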
7341 InstructionSelector::ComplexRendererFns
7342 AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root,
7343                                                   unsigned Size) const {
7344   MachineFunction &MF = *Root.getParent()->getParent()->getParent();
7345   MachineRegisterInfo &MRI = MF.getRegInfo();
7346 
7347   if (!Root.isReg())
7348     return std::nullopt;
7349 
7350   MachineInstr *RootDef = MRI.getVRegDef(Root.getReg());
7351   if (RootDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) {
7352     return {{
7353         [=](MachineInstrBuilder &MIB) { MIB.add(RootDef->getOperand(1)); },
7354         [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
7355     }};
7356   }
7357 
7358   CodeModel::Model CM = MF.getTarget().getCodeModel();
7359   // Check if we can fold in the ADD of small code model ADRP + ADD address.
7360   if (CM == CodeModel::Small) {
7361     auto OpFns = tryFoldAddLowIntoImm(*RootDef, Size, MRI);
7362     if (OpFns)
7363       return OpFns;
7364   }
7365 
7366   if (isBaseWithConstantOffset(Root, MRI)) {
7367     MachineOperand &LHS = RootDef->getOperand(1);
7368     MachineOperand &RHS = RootDef->getOperand(2);
7369     MachineInstr *LHSDef = MRI.getVRegDef(LHS.getReg());
7370     MachineInstr *RHSDef = MRI.getVRegDef(RHS.getReg());
7371 
7372     int64_t RHSC = (int64_t)RHSDef->getOperand(1).getCImm()->getZExtValue();
7373     unsigned Scale = Log2_32(Size);
7374     if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) {
7375       if (LHSDef->getOpcode() == TargetOpcode::G_FRAME_INDEX)
7376         return {{
7377             [=](MachineInstrBuilder &MIB) { MIB.add(LHSDef->getOperand(1)); },
7378             [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); },
7379         }};
7380 
7381       return {{
7382           [=](MachineInstrBuilder &MIB) { MIB.add(LHS); },
7383           [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); },
7384       }};
7385     }
7386   }
7387 
7388   // Before falling back to our general case, check if the unscaled
7389   // instructions can handle this. If so, that's preferable.
7390   if (selectAddrModeUnscaled(Root, Size))
7391     return std::nullopt;
7392 
7393   return {{
7394       [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
7395       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
7396   }};
7397 }
7398 
7399 /// Given a shift instruction, return the correct shift type for that
7400 /// instruction.
7401 static AArch64_AM::ShiftExtendType getShiftTypeForInst(MachineInstr &MI) {
7402   switch (MI.getOpcode()) {
7403   default:
7404     return AArch64_AM::InvalidShiftExtend;
7405   case TargetOpcode::G_SHL:
7406     return AArch64_AM::LSL;
7407   case TargetOpcode::G_LSHR:
7408     return AArch64_AM::LSR;
7409   case TargetOpcode::G_ASHR:
7410     return AArch64_AM::ASR;
7411   case TargetOpcode::G_ROTR:
7412     return AArch64_AM::ROR;
7413   }
7414 }
7415 
7416 /// Select a "shifted register" operand. If the value is not shifted, set the
7417 /// shift operand to a default value of "lsl 0".
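///
/// E.g. when the operand is defined by (G_SHL %reg, 3) on a 64-bit type,
/// this renders %reg together with an "lsl #3" shifter immediate, so the
/// user can be selected as something like "add x0, x1, x2, lsl #3".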
7418 InstructionSelector::ComplexRendererFns
7419 AArch64InstructionSelector::selectShiftedRegister(MachineOperand &Root,
7420                                                   bool AllowROR) const {
7421   if (!Root.isReg())
7422     return std::nullopt;
7423   MachineRegisterInfo &MRI =
7424       Root.getParent()->getParent()->getParent()->getRegInfo();
7425 
7426   // Check if the operand is defined by an instruction which corresponds to
7427   // a ShiftExtendType. E.g. a G_SHL, G_LSHR, etc.
7428   MachineInstr *ShiftInst = MRI.getVRegDef(Root.getReg());
7429   AArch64_AM::ShiftExtendType ShType = getShiftTypeForInst(*ShiftInst);
7430   if (ShType == AArch64_AM::InvalidShiftExtend)
7431     return std::nullopt;
7432   if (ShType == AArch64_AM::ROR && !AllowROR)
7433     return std::nullopt;
7434   if (!isWorthFoldingIntoExtendedReg(*ShiftInst, MRI))
7435     return std::nullopt;
7436 
7437   // Need an immediate on the RHS.
7438   MachineOperand &ShiftRHS = ShiftInst->getOperand(2);
7439   auto Immed = getImmedFromMO(ShiftRHS);
7440   if (!Immed)
7441     return std::nullopt;
7442 
7443   // We have something that we can fold. Fold in the shift's LHS and RHS into
7444   // the instruction.
7445   MachineOperand &ShiftLHS = ShiftInst->getOperand(1);
7446   Register ShiftReg = ShiftLHS.getReg();
7447 
7448   unsigned NumBits = MRI.getType(ShiftReg).getSizeInBits();
7449   unsigned Val = *Immed & (NumBits - 1);
7450   unsigned ShiftVal = AArch64_AM::getShifterImm(ShType, Val);
7451 
7452   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ShiftReg); },
7453            [=](MachineInstrBuilder &MIB) { MIB.addImm(ShiftVal); }}};
7454 }
7455 
7456 AArch64_AM::ShiftExtendType AArch64InstructionSelector::getExtendTypeForInst(
7457     MachineInstr &MI, MachineRegisterInfo &MRI, bool IsLoadStore) const {
7458   unsigned Opc = MI.getOpcode();
7459 
7460   // Handle explicit extend instructions first.
7461   if (Opc == TargetOpcode::G_SEXT || Opc == TargetOpcode::G_SEXT_INREG) {
7462     unsigned Size;
7463     if (Opc == TargetOpcode::G_SEXT)
7464       Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
7465     else
7466       Size = MI.getOperand(2).getImm();
7467     assert(Size != 64 && "Extend from 64 bits?");
7468     switch (Size) {
7469     case 8:
7470       return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTB;
7471     case 16:
7472       return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTH;
7473     case 32:
7474       return AArch64_AM::SXTW;
7475     default:
7476       return AArch64_AM::InvalidShiftExtend;
7477     }
7478   }
7479 
7480   if (Opc == TargetOpcode::G_ZEXT || Opc == TargetOpcode::G_ANYEXT) {
7481     unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
7482     assert(Size != 64 && "Extend from 64 bits?");
7483     switch (Size) {
7484     case 8:
7485       return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTB;
7486     case 16:
7487       return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTH;
7488     case 32:
7489       return AArch64_AM::UXTW;
7490     default:
7491       return AArch64_AM::InvalidShiftExtend;
7492     }
7493   }
7494 
7495   // Don't have an explicit extend. Try to handle a G_AND with a constant mask
7496   // on the RHS.
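  // E.g. (G_AND %x, 0xff) acts as a zero-extend from 8 bits (UXTB), 0xffff
  // as UXTH, and 0xffffffff as UXTW.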
7497   if (Opc != TargetOpcode::G_AND)
7498     return AArch64_AM::InvalidShiftExtend;
7499 
7500   std::optional<uint64_t> MaybeAndMask = getImmedFromMO(MI.getOperand(2));
7501   if (!MaybeAndMask)
7502     return AArch64_AM::InvalidShiftExtend;
7503   uint64_t AndMask = *MaybeAndMask;
7504   switch (AndMask) {
7505   default:
7506     return AArch64_AM::InvalidShiftExtend;
7507   case 0xFF:
7508     return !IsLoadStore ? AArch64_AM::UXTB : AArch64_AM::InvalidShiftExtend;
7509   case 0xFFFF:
7510     return !IsLoadStore ? AArch64_AM::UXTH : AArch64_AM::InvalidShiftExtend;
7511   case 0xFFFFFFFF:
7512     return AArch64_AM::UXTW;
7513   }
7514 }
7515 
7516 Register AArch64InstructionSelector::moveScalarRegClass(
7517     Register Reg, const TargetRegisterClass &RC, MachineIRBuilder &MIB) const {
7518   MachineRegisterInfo &MRI = *MIB.getMRI();
7519   auto Ty = MRI.getType(Reg);
7520   assert(!Ty.isVector() && "Expected scalars only!");
7521   if (Ty.getSizeInBits() == TRI.getRegSizeInBits(RC))
7522     return Reg;
7523 
7524   // Create a copy and immediately select it.
7525   // FIXME: We should have an emitCopy function?
7526   auto Copy = MIB.buildCopy({&RC}, {Reg});
7527   selectCopy(*Copy, TII, MRI, TRI, RBI);
7528   return Copy.getReg(0);
7529 }
7530 
7531 /// Select an "extended register" operand. This operand folds in an extend
7532 /// followed by an optional left shift.
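///
/// E.g. a root defined by (G_SHL (G_SEXT %w:s32), 2) renders as the operand
/// pair {%w, sxtw #2}, so the user can become something like
/// "add x0, x1, w2, sxtw #2".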
7533 InstructionSelector::ComplexRendererFns
7534 AArch64InstructionSelector::selectArithExtendedRegister(
7535     MachineOperand &Root) const {
7536   if (!Root.isReg())
7537     return std::nullopt;
7538   MachineRegisterInfo &MRI =
7539       Root.getParent()->getParent()->getParent()->getRegInfo();
7540 
7541   uint64_t ShiftVal = 0;
7542   Register ExtReg;
7543   AArch64_AM::ShiftExtendType Ext;
7544   MachineInstr *RootDef = getDefIgnoringCopies(Root.getReg(), MRI);
7545   if (!RootDef)
7546     return std::nullopt;
7547 
7548   if (!isWorthFoldingIntoExtendedReg(*RootDef, MRI))
7549     return std::nullopt;
7550 
7551   // Check if we can fold a shift and an extend.
7552   if (RootDef->getOpcode() == TargetOpcode::G_SHL) {
7553     // Look for a constant on the RHS of the shift.
7554     MachineOperand &RHS = RootDef->getOperand(2);
7555     std::optional<uint64_t> MaybeShiftVal = getImmedFromMO(RHS);
7556     if (!MaybeShiftVal)
7557       return std::nullopt;
7558     ShiftVal = *MaybeShiftVal;
7559     if (ShiftVal > 4)
7560       return std::nullopt;
7561     // Look for a valid extend instruction on the LHS of the shift.
7562     MachineOperand &LHS = RootDef->getOperand(1);
7563     MachineInstr *ExtDef = getDefIgnoringCopies(LHS.getReg(), MRI);
7564     if (!ExtDef)
7565       return std::nullopt;
7566     Ext = getExtendTypeForInst(*ExtDef, MRI);
7567     if (Ext == AArch64_AM::InvalidShiftExtend)
7568       return std::nullopt;
7569     ExtReg = ExtDef->getOperand(1).getReg();
7570   } else {
7571     // Didn't get a shift. Try just folding an extend.
7572     Ext = getExtendTypeForInst(*RootDef, MRI);
7573     if (Ext == AArch64_AM::InvalidShiftExtend)
7574       return std::nullopt;
7575     ExtReg = RootDef->getOperand(1).getReg();
7576 
7577     // If we have a 32 bit instruction which zeroes out the high half of a
7578     // register, we get an implicit zero extend for free. Check if we have one.
7579     // FIXME: We actually emit the extend right now even though we don't have
7580     // to.
7581     if (Ext == AArch64_AM::UXTW && MRI.getType(ExtReg).getSizeInBits() == 32) {
7582       MachineInstr *ExtInst = MRI.getVRegDef(ExtReg);
7583       if (isDef32(*ExtInst))
7584         return std::nullopt;
7585     }
7586   }
7587 
7588   // We require a GPR32 here. Narrow the ExtReg if needed using a subregister
7589   // copy.
7590   MachineIRBuilder MIB(*RootDef);
7591   ExtReg = moveScalarRegClass(ExtReg, AArch64::GPR32RegClass, MIB);
7592 
7593   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); },
7594            [=](MachineInstrBuilder &MIB) {
7595              MIB.addImm(getArithExtendImm(Ext, ShiftVal));
7596            }}};
7597 }
7598 
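/// Match a value that is the high 64-bit half of a 128-bit vector source,
/// looking through little-endian bitcasts, and render the full 128-bit
/// source register (the high-half instruction forms read it directly).
/// This matches either the second result of a G_UNMERGE_VALUES or a
/// G_EXTRACT_VECTOR_ELT of lane 1 from a <2 x s64>.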
7599 InstructionSelector::ComplexRendererFns
7600 AArch64InstructionSelector::selectExtractHigh(MachineOperand &Root) const {
7601   if (!Root.isReg())
7602     return std::nullopt;
7603   MachineRegisterInfo &MRI =
7604       Root.getParent()->getParent()->getParent()->getRegInfo();
7605 
7606   auto Extract = getDefSrcRegIgnoringCopies(Root.getReg(), MRI);
7607   while (Extract && Extract->MI->getOpcode() == TargetOpcode::G_BITCAST &&
7608          STI.isLittleEndian())
7609     Extract =
7610         getDefSrcRegIgnoringCopies(Extract->MI->getOperand(1).getReg(), MRI);
7611   if (!Extract)
7612     return std::nullopt;
7613 
7614   if (Extract->MI->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) {
7615     if (Extract->Reg == Extract->MI->getOperand(1).getReg()) {
7616       Register ExtReg = Extract->MI->getOperand(2).getReg();
7617       return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); }}};
7618     }
7619   }
7620   if (Extract->MI->getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT) {
7621     LLT SrcTy = MRI.getType(Extract->MI->getOperand(1).getReg());
7622     auto LaneIdx = getIConstantVRegValWithLookThrough(
7623         Extract->MI->getOperand(2).getReg(), MRI);
7624     if (LaneIdx && SrcTy == LLT::fixed_vector(2, 64) &&
7625         LaneIdx->Value.getSExtValue() == 1) {
7626       Register ExtReg = Extract->MI->getOperand(1).getReg();
7627       return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); }}};
7628     }
7629   }
7630 
7631   return std::nullopt;
7632 }
7633 
7634 void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB,
7635                                                 const MachineInstr &MI,
7636                                                 int OpIdx) const {
7637   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
7638   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7639          "Expected G_CONSTANT");
7640   std::optional<int64_t> CstVal =
7641       getIConstantVRegSExtVal(MI.getOperand(0).getReg(), MRI);
7642   assert(CstVal && "Expected constant value");
7643   MIB.addImm(*CstVal);
7644 }
7645 
7646 void AArch64InstructionSelector::renderLogicalImm32(
7647   MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
7648   assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7649          "Expected G_CONSTANT");
7650   uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
7651   uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 32);
7652   MIB.addImm(Enc);
7653 }
7654 
7655 void AArch64InstructionSelector::renderLogicalImm64(
7656   MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
7657   assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7658          "Expected G_CONSTANT");
7659   uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
7660   uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 64);
7661   MIB.addImm(Enc);
7662 }
7663 
7664 void AArch64InstructionSelector::renderFPImm16(MachineInstrBuilder &MIB,
7665                                                const MachineInstr &MI,
7666                                                int OpIdx) const {
7667   assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
7668          "Expected G_FCONSTANT");
7669   MIB.addImm(
7670       AArch64_AM::getFP16Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
7671 }
7672 
7673 void AArch64InstructionSelector::renderFPImm32(MachineInstrBuilder &MIB,
7674                                                const MachineInstr &MI,
7675                                                int OpIdx) const {
7676   assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
7677          "Expected G_FCONSTANT");
7678   MIB.addImm(
7679       AArch64_AM::getFP32Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
7680 }
7681 
7682 void AArch64InstructionSelector::renderFPImm64(MachineInstrBuilder &MIB,
7683                                                const MachineInstr &MI,
7684                                                int OpIdx) const {
7685   assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
7686          "Expected G_FCONSTANT");
7687   MIB.addImm(
7688       AArch64_AM::getFP64Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
7689 }
7690 
7691 void AArch64InstructionSelector::renderFPImm32SIMDModImmType4(
7692     MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7693   assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
7694          "Expected G_FCONSTANT");
7695   MIB.addImm(AArch64_AM::encodeAdvSIMDModImmType4(MI.getOperand(1)
7696                                                       .getFPImm()
7697                                                       ->getValueAPF()
7698                                                       .bitcastToAPInt()
7699                                                       .getZExtValue()));
7700 }
7701 
7702 bool AArch64InstructionSelector::isLoadStoreOfNumBytes(
7703     const MachineInstr &MI, unsigned NumBytes) const {
7704   if (!MI.mayLoadOrStore())
7705     return false;
7706   assert(MI.hasOneMemOperand() &&
7707          "Expected load/store to have only one mem op!");
7708   return (*MI.memoperands_begin())->getSize() == NumBytes;
7709 }
7710 
7711 bool AArch64InstructionSelector::isDef32(const MachineInstr &MI) const {
7712   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
7713   if (MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() != 32)
7714     return false;
7715 
7716   // Only return true if we know the operation will zero-out the high half of
7717   // the 64-bit register. Truncates can be subregister copies, which don't
7718   // zero out the high bits. Copies and other copy-like instructions can be
7719   // fed by truncates, or could be lowered as subregister copies.
7720   switch (MI.getOpcode()) {
7721   default:
7722     return true;
7723   case TargetOpcode::COPY:
7724   case TargetOpcode::G_BITCAST:
7725   case TargetOpcode::G_TRUNC:
7726   case TargetOpcode::G_PHI:
7727     return false;
7728   }
7729 }
7730 
7731 
7732 // Perform fixups on the given PHI instruction's operands to force them all
7733 // to be the same as the destination regbank.
7734 static void fixupPHIOpBanks(MachineInstr &MI, MachineRegisterInfo &MRI,
7735                             const AArch64RegisterBankInfo &RBI) {
7736   assert(MI.getOpcode() == TargetOpcode::G_PHI && "Expected a G_PHI");
7737   Register DstReg = MI.getOperand(0).getReg();
7738   const RegisterBank *DstRB = MRI.getRegBankOrNull(DstReg);
7739   assert(DstRB && "Expected PHI dst to have regbank assigned");
7740   MachineIRBuilder MIB(MI);
7741 
7742   // Go through each operand and ensure it has the same regbank.
7743   for (MachineOperand &MO : llvm::drop_begin(MI.operands())) {
7744     if (!MO.isReg())
7745       continue;
7746     Register OpReg = MO.getReg();
7747     const RegisterBank *RB = MRI.getRegBankOrNull(OpReg);
7748     if (RB != DstRB) {
7749       // Insert a cross-bank copy.
7750       auto *OpDef = MRI.getVRegDef(OpReg);
7751       const LLT &Ty = MRI.getType(OpReg);
7752       MachineBasicBlock &OpDefBB = *OpDef->getParent();
7753 
7754       // Any instruction we insert must appear after all PHIs in the block
7755       // for the block to be valid MIR.
7756       MachineBasicBlock::iterator InsertPt = std::next(OpDef->getIterator());
7757       if (InsertPt != OpDefBB.end() && InsertPt->isPHI())
7758         InsertPt = OpDefBB.getFirstNonPHI();
7759       MIB.setInsertPt(*OpDef->getParent(), InsertPt);
7760       auto Copy = MIB.buildCopy(Ty, OpReg);
7761       MRI.setRegBank(Copy.getReg(0), *DstRB);
7762       MO.setReg(Copy.getReg(0));
7763     }
7764   }
7765 }
7766 
7767 void AArch64InstructionSelector::processPHIs(MachineFunction &MF) {
7768   // We're looking for PHIs, build a list so we don't invalidate iterators.
7769   MachineRegisterInfo &MRI = MF.getRegInfo();
7770   SmallVector<MachineInstr *, 32> Phis;
7771   for (auto &BB : MF) {
7772     for (auto &MI : BB) {
7773       if (MI.getOpcode() == TargetOpcode::G_PHI)
7774         Phis.emplace_back(&MI);
7775     }
7776   }
7777 
7778   for (auto *MI : Phis) {
7779     // We need to do some work here if the operand types are < 16 bit and they
7780     // are split across fpr/gpr banks. Since all types <32b on gpr
7781     // end up being assigned gpr32 regclasses, we can end up with PHIs here
7782     // which try to select between a gpr32 and an fpr16. Ideally RBS shouldn't
7783     // be selecting heterogeneous regbanks for operands if possible, but we
7784     // still need to be able to deal with it here.
7785     //
7786     // To fix this, if we have a gpr-bank operand < 32b in size and at least
7787     // one other operand is on the fpr bank, then we add cross-bank copies
7788     // to homogenize the operand banks. For simplicity the bank that we choose
7789     // to settle on is whatever bank the def operand has. For example:
7790     //
7791     // %endbb:
7792     //   %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2:fpr(s16), %bb2
7793     //  =>
7794     // %bb2:
7795     //   ...
7796     //   %in2_copy:gpr(s16) = COPY %in2:fpr(s16)
7797     //   ...
7798     // %endbb:
7799     //   %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2_copy:gpr(s16), %bb2
7800     bool HasGPROp = false, HasFPROp = false;
7801     for (const MachineOperand &MO : llvm::drop_begin(MI->operands())) {
7802       if (!MO.isReg())
7803         continue;
7804       const LLT &Ty = MRI.getType(MO.getReg());
7805       if (!Ty.isValid() || !Ty.isScalar())
7806         break;
7807       if (Ty.getSizeInBits() >= 32)
7808         break;
7809       const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg());
7810       // If for some reason we don't have a regbank yet, don't try anything.
7811       if (!RB)
7812         break;
7813 
7814       if (RB->getID() == AArch64::GPRRegBankID)
7815         HasGPROp = true;
7816       else
7817         HasFPROp = true;
7818     }
7819     // We have heterogenous regbanks, need to fixup.
7820     // We have heterogeneous regbanks; we need to fix them up.
7821       fixupPHIOpBanks(*MI, MRI, RBI);
7822   }
7823 }
7824 
7825 namespace llvm {
7826 InstructionSelector *
7827 createAArch64InstructionSelector(const AArch64TargetMachine &TM,
7828                                  AArch64Subtarget &Subtarget,
7829                                  AArch64RegisterBankInfo &RBI) {
7830   return new AArch64InstructionSelector(TM, Subtarget, RBI);
7831 }
7832 }
7833