xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp (revision b64c5a0ace59af62eff52bfe110a521dc73c937b)
1 //===- AArch64InstructionSelector.cpp ----------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the InstructionSelector class for
10 /// AArch64.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AArch64GlobalISelUtils.h"
15 #include "AArch64InstrInfo.h"
16 #include "AArch64MachineFunctionInfo.h"
17 #include "AArch64RegisterBankInfo.h"
18 #include "AArch64RegisterInfo.h"
19 #include "AArch64Subtarget.h"
20 #include "AArch64TargetMachine.h"
21 #include "MCTargetDesc/AArch64AddressingModes.h"
22 #include "MCTargetDesc/AArch64MCTargetDesc.h"
23 #include "llvm/BinaryFormat/Dwarf.h"
24 #include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
25 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
26 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
27 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
28 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
29 #include "llvm/CodeGen/GlobalISel/Utils.h"
30 #include "llvm/CodeGen/MachineBasicBlock.h"
31 #include "llvm/CodeGen/MachineConstantPool.h"
32 #include "llvm/CodeGen/MachineFrameInfo.h"
33 #include "llvm/CodeGen/MachineFunction.h"
34 #include "llvm/CodeGen/MachineInstr.h"
35 #include "llvm/CodeGen/MachineInstrBuilder.h"
36 #include "llvm/CodeGen/MachineMemOperand.h"
37 #include "llvm/CodeGen/MachineOperand.h"
38 #include "llvm/CodeGen/MachineRegisterInfo.h"
39 #include "llvm/CodeGen/TargetOpcodes.h"
40 #include "llvm/CodeGen/TargetRegisterInfo.h"
41 #include "llvm/IR/Constants.h"
42 #include "llvm/IR/DerivedTypes.h"
43 #include "llvm/IR/Instructions.h"
44 #include "llvm/IR/IntrinsicsAArch64.h"
45 #include "llvm/IR/PatternMatch.h"
46 #include "llvm/IR/Type.h"
47 #include "llvm/Pass.h"
48 #include "llvm/Support/Debug.h"
49 #include "llvm/Support/raw_ostream.h"
50 #include <optional>
51 
52 #define DEBUG_TYPE "aarch64-isel"
53 
54 using namespace llvm;
55 using namespace MIPatternMatch;
56 using namespace AArch64GISelUtils;
57 
58 namespace llvm {
59 class BlockFrequencyInfo;
60 class ProfileSummaryInfo;
61 }
62 
63 namespace {
64 
65 #define GET_GLOBALISEL_PREDICATE_BITSET
66 #include "AArch64GenGlobalISel.inc"
67 #undef GET_GLOBALISEL_PREDICATE_BITSET
68 
69 
/// AArch64 implementation of the GlobalISel InstructionSelector interface.
///
/// Combines the TableGen-generated matcher (selectImpl) with the hand-written
/// C++ selection, lowering and emit helpers declared below.
70 class AArch64InstructionSelector : public InstructionSelector {
71 public:
72   AArch64InstructionSelector(const AArch64TargetMachine &TM,
73                              const AArch64Subtarget &STI,
74                              const AArch64RegisterBankInfo &RBI);
75 
     /// Entry point of the selector for a single instruction \p I.
76   bool select(MachineInstr &I) override;
77   static const char *getName() { return DEBUG_TYPE; }
78 
     /// Per-function setup: forwards to the base class, binds the shared
     /// builder to \p MF, refreshes the per-function cached state and
     /// preprocesses the G_PHIs of \p MF.
79   void setupMF(MachineFunction &MF, GISelKnownBits *KB,
80                CodeGenCoverage *CoverageInfo, ProfileSummaryInfo *PSI,
81                BlockFrequencyInfo *BFI) override {
82     InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
83     MIB.setMF(MF);
84 
85     // hasFnAttribute() is expensive to call on every BRCOND selection, so
86     // cache it here for each run of the selector.
87     ProduceNonFlagSettingCondBr =
88         !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
       // Reset the cached return-address register to a default-constructed
       // Register so nothing carries over from a previously set-up function.
89     MFReturnAddr = Register();
90 
91     processPHIs(MF);
92   }
93 
94 private:
95   /// tblgen-erated 'select' implementation, used as the initial selector for
96   /// the patterns that don't require complex C++.
97   bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;
98 
99   // A lowering phase that runs before any selection attempts.
100   // Returns true if the instruction was modified.
101   bool preISelLower(MachineInstr &I);
102 
103   // An early selection function that runs before the selectImpl() call.
104   bool earlySelect(MachineInstr &I);
105 
106   /// Save state that is shared between select calls, call select on \p I and
107   /// then restore the saved state. This can be used to recursively call select
108   /// within a select call.
109   bool selectAndRestoreState(MachineInstr &I);
110 
111   // Do some preprocessing of G_PHIs before we begin selection.
112   void processPHIs(MachineFunction &MF);
113 
114   bool earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI);
115 
116   /// Eliminate same-sized cross-bank copies into stores before selectImpl().
117   bool contractCrossBankCopyIntoStore(MachineInstr &I,
118                                       MachineRegisterInfo &MRI);
119 
120   bool convertPtrAddToAdd(MachineInstr &I, MachineRegisterInfo &MRI);
121 
122   bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF,
123                           MachineRegisterInfo &MRI) const;
124   bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF,
125                            MachineRegisterInfo &MRI) const;
126 
127   ///@{
128   /// Helper functions for selectCompareBranch.
129   bool selectCompareBranchFedByFCmp(MachineInstr &I, MachineInstr &FCmp,
130                                     MachineIRBuilder &MIB) const;
131   bool selectCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp,
132                                     MachineIRBuilder &MIB) const;
133   bool tryOptCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp,
134                                     MachineIRBuilder &MIB) const;
135   bool tryOptAndIntoCompareBranch(MachineInstr &AndInst, bool Invert,
136                                   MachineBasicBlock *DstMBB,
137                                   MachineIRBuilder &MIB) const;
138   ///@}
139 
140   bool selectCompareBranch(MachineInstr &I, MachineFunction &MF,
141                            MachineRegisterInfo &MRI);
142 
143   bool selectVectorAshrLshr(MachineInstr &I, MachineRegisterInfo &MRI);
144   bool selectVectorSHL(MachineInstr &I, MachineRegisterInfo &MRI);
145 
146   // Helper to generate an equivalent of scalar_to_vector into a new register,
147   // returned via 'Dst'.
      // NOTE(review): no 'Dst' parameter is declared here; presumably the new
      // register is the def of the returned instruction - confirm against the
      // definition.
148   MachineInstr *emitScalarToVector(unsigned EltSize,
149                                    const TargetRegisterClass *DstRC,
150                                    Register Scalar,
151                                    MachineIRBuilder &MIRBuilder) const;
152   /// Helper to narrow vector that was widened by emitScalarToVector.
153   /// Copy lowest part of 128-bit or 64-bit vector to 64-bit or 32-bit
154   /// vector, correspondingly.
155   MachineInstr *emitNarrowVector(Register DstReg, Register SrcReg,
156                                  MachineIRBuilder &MIRBuilder,
157                                  MachineRegisterInfo &MRI) const;
158 
159   /// Emit a lane insert into \p DstReg, or a new vector register if
160   /// std::nullopt is provided.
161   ///
162   /// The lane inserted into is defined by \p LaneIdx. The vector source
163   /// register is given by \p SrcReg. The register containing the element is
164   /// given by \p EltReg.
165   MachineInstr *emitLaneInsert(std::optional<Register> DstReg, Register SrcReg,
166                                Register EltReg, unsigned LaneIdx,
167                                const RegisterBank &RB,
168                                MachineIRBuilder &MIRBuilder) const;
169 
170   /// Emit a sequence of instructions representing a constant \p CV for a
171   /// vector register \p Dst. (E.g. a MOV, or a load from a constant pool.)
172   ///
173   /// \returns the last instruction in the sequence on success, and nullptr
174   /// otherwise.
175   MachineInstr *emitConstantVector(Register Dst, Constant *CV,
176                                    MachineIRBuilder &MIRBuilder,
177                                    MachineRegisterInfo &MRI);
178 
179   MachineInstr *tryAdvSIMDModImm8(Register Dst, unsigned DstSize, APInt Bits,
180                                   MachineIRBuilder &MIRBuilder);
181 
182   MachineInstr *tryAdvSIMDModImm16(Register Dst, unsigned DstSize, APInt Bits,
183                                    MachineIRBuilder &MIRBuilder, bool Inv);
184 
185   MachineInstr *tryAdvSIMDModImm32(Register Dst, unsigned DstSize, APInt Bits,
186                                    MachineIRBuilder &MIRBuilder, bool Inv);
187   MachineInstr *tryAdvSIMDModImm64(Register Dst, unsigned DstSize, APInt Bits,
188                                    MachineIRBuilder &MIRBuilder);
189   MachineInstr *tryAdvSIMDModImm321s(Register Dst, unsigned DstSize, APInt Bits,
190                                      MachineIRBuilder &MIRBuilder, bool Inv);
191   MachineInstr *tryAdvSIMDModImmFP(Register Dst, unsigned DstSize, APInt Bits,
192                                    MachineIRBuilder &MIRBuilder);
193 
194   bool tryOptConstantBuildVec(MachineInstr &MI, LLT DstTy,
195                               MachineRegisterInfo &MRI);
196   /// \returns true if a G_BUILD_VECTOR instruction \p MI can be selected as a
197   /// SUBREG_TO_REG.
198   bool tryOptBuildVecToSubregToReg(MachineInstr &MI, MachineRegisterInfo &MRI);
199   bool selectBuildVector(MachineInstr &I, MachineRegisterInfo &MRI);
200   bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI);
201   bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI);
202 
203   bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI);
204   bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI);
205   bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI);
206   bool selectSplitVectorUnmerge(MachineInstr &I, MachineRegisterInfo &MRI);
207 
208   /// Helper function to select vector load intrinsics like
209   /// @llvm.aarch64.neon.ld2.*, @llvm.aarch64.neon.ld4.*, etc.
210   /// \p Opc is the opcode that the selected instruction should use.
211   /// \p NumVecs is the number of vector destinations for the instruction.
212   /// \p I is the original G_INTRINSIC_W_SIDE_EFFECTS instruction.
213   bool selectVectorLoadIntrinsic(unsigned Opc, unsigned NumVecs,
214                                  MachineInstr &I);
215   bool selectVectorLoadLaneIntrinsic(unsigned Opc, unsigned NumVecs,
216                                      MachineInstr &I);
      // Note: unlike the load helpers above, this one returns void and takes
      // its parameters in (I, NumVecs, Opc) order.
217   void selectVectorStoreIntrinsic(MachineInstr &I, unsigned NumVecs,
218                                   unsigned Opc);
219   bool selectVectorStoreLaneIntrinsic(MachineInstr &I, unsigned NumVecs,
220                                       unsigned Opc);
221   bool selectIntrinsicWithSideEffects(MachineInstr &I,
222                                       MachineRegisterInfo &MRI);
223   bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI);
224   bool selectJumpTable(MachineInstr &I, MachineRegisterInfo &MRI);
225   bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI);
226   bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI);
227   bool selectPtrAuthGlobalValue(MachineInstr &I,
228                                 MachineRegisterInfo &MRI) const;
229   bool selectReduction(MachineInstr &I, MachineRegisterInfo &MRI);
230   bool selectMOPS(MachineInstr &I, MachineRegisterInfo &MRI);
231   bool selectUSMovFromExtend(MachineInstr &I, MachineRegisterInfo &MRI);
232   void SelectTable(MachineInstr &I, MachineRegisterInfo &MRI, unsigned NumVecs,
233                    unsigned Opc1, unsigned Opc2, bool isExt);
234 
235   bool selectIndexedExtLoad(MachineInstr &I, MachineRegisterInfo &MRI);
236   bool selectIndexedLoad(MachineInstr &I, MachineRegisterInfo &MRI);
237   bool selectIndexedStore(GIndexedStore &I, MachineRegisterInfo &MRI);
238 
239   unsigned emitConstantPoolEntry(const Constant *CPVal,
240                                  MachineFunction &MF) const;
241   MachineInstr *emitLoadFromConstantPool(const Constant *CPVal,
242                                          MachineIRBuilder &MIRBuilder) const;
243 
244   // Emit a vector concat operation.
245   MachineInstr *emitVectorConcat(std::optional<Register> Dst, Register Op1,
246                                  Register Op2,
247                                  MachineIRBuilder &MIRBuilder) const;
248 
249   // Emit an integer compare between LHS and RHS, which checks for Predicate.
250   MachineInstr *emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
251                                    MachineOperand &Predicate,
252                                    MachineIRBuilder &MIRBuilder) const;
253 
254   /// Emit a floating point comparison between \p LHS and \p RHS.
255   /// \p Pred if given is the intended predicate to use.
256   MachineInstr *
257   emitFPCompare(Register LHS, Register RHS, MachineIRBuilder &MIRBuilder,
258                 std::optional<CmpInst::Predicate> = std::nullopt) const;
259 
260   MachineInstr *
261   emitInstr(unsigned Opcode, std::initializer_list<llvm::DstOp> DstOps,
262             std::initializer_list<llvm::SrcOp> SrcOps,
263             MachineIRBuilder &MIRBuilder,
264             const ComplexRendererFns &RenderFns = std::nullopt) const;
265   /// Helper function to emit an add or sub instruction.
266   ///
267   /// \p AddrModeAndSizeToOpcode must contain each of the opcode variants listed
268   /// below, in a specific order.
269   ///
270   /// Below is an example of the expected input to \p AddrModeAndSizeToOpcode.
271   ///
272   /// \code
273   ///   const std::array<std::array<unsigned, 2>, 5> Table {
274   ///    {{AArch64::ADDXri, AArch64::ADDWri},
275   ///     {AArch64::ADDXrs, AArch64::ADDWrs},
276   ///     {AArch64::ADDXrr, AArch64::ADDWrr},
277   ///     {AArch64::SUBXri, AArch64::SUBWri},
278   ///     {AArch64::ADDXrx, AArch64::ADDWrx}}};
279   /// \endcode
280   ///
281   /// Each row in the table corresponds to a different addressing mode. Each
282   /// column corresponds to a different register size.
283   ///
284   /// \attention Rows must be structured as follows:
285   ///   - Row 0: The ri opcode variants
286   ///   - Row 1: The rs opcode variants
287   ///   - Row 2: The rr opcode variants
288   ///   - Row 3: The ri opcode variants for negative immediates
289   ///   - Row 4: The rx opcode variants
290   ///
291   /// \attention Columns must be structured as follows:
292   ///   - Column 0: The 64-bit opcode variants
293   ///   - Column 1: The 32-bit opcode variants
294   ///
295   /// \p Dst is the destination register of the binop to emit.
296   /// \p LHS is the left-hand operand of the binop to emit.
297   /// \p RHS is the right-hand operand of the binop to emit.
298   MachineInstr *emitAddSub(
299       const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode,
300       Register Dst, MachineOperand &LHS, MachineOperand &RHS,
301       MachineIRBuilder &MIRBuilder) const;
302   MachineInstr *emitADD(Register DefReg, MachineOperand &LHS,
303                         MachineOperand &RHS,
304                         MachineIRBuilder &MIRBuilder) const;
305   MachineInstr *emitADDS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
306                          MachineIRBuilder &MIRBuilder) const;
307   MachineInstr *emitSUBS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
308                          MachineIRBuilder &MIRBuilder) const;
309   MachineInstr *emitADCS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
310                          MachineIRBuilder &MIRBuilder) const;
311   MachineInstr *emitSBCS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
312                          MachineIRBuilder &MIRBuilder) const;
313   MachineInstr *emitCMN(MachineOperand &LHS, MachineOperand &RHS,
314                         MachineIRBuilder &MIRBuilder) const;
315   MachineInstr *emitTST(MachineOperand &LHS, MachineOperand &RHS,
316                         MachineIRBuilder &MIRBuilder) const;
317   MachineInstr *emitSelect(Register Dst, Register LHS, Register RHS,
318                            AArch64CC::CondCode CC,
319                            MachineIRBuilder &MIRBuilder) const;
320   MachineInstr *emitExtractVectorElt(std::optional<Register> DstReg,
321                                      const RegisterBank &DstRB, LLT ScalarTy,
322                                      Register VecReg, unsigned LaneIdx,
323                                      MachineIRBuilder &MIRBuilder) const;
324   MachineInstr *emitCSINC(Register Dst, Register Src1, Register Src2,
325                           AArch64CC::CondCode Pred,
326                           MachineIRBuilder &MIRBuilder) const;
327   /// Emit a CSet for a FP compare.
328   ///
329   /// \p Dst is expected to be a 32-bit scalar register.
330   MachineInstr *emitCSetForFCmp(Register Dst, CmpInst::Predicate Pred,
331                                 MachineIRBuilder &MIRBuilder) const;
332 
333   /// Emit an instruction that sets NZCV to the carry-in expected by \p I.
334   /// Might elide the instruction if the previous instruction already sets NZCV
335   /// correctly.
336   MachineInstr *emitCarryIn(MachineInstr &I, Register CarryReg);
337 
338   /// Emit the overflow op for \p Opcode.
339   ///
340   /// \p Opcode is expected to be an overflow op's opcode, e.g. G_UADDO,
341   /// G_USUBO, etc.
342   std::pair<MachineInstr *, AArch64CC::CondCode>
343   emitOverflowOp(unsigned Opcode, Register Dst, MachineOperand &LHS,
344                  MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const;
345 
346   bool selectOverflowOp(MachineInstr &I, MachineRegisterInfo &MRI);
347 
348   /// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
349   /// In some cases this is even possible with OR operations in the expression.
350   MachineInstr *emitConjunction(Register Val, AArch64CC::CondCode &OutCC,
351                                 MachineIRBuilder &MIB) const;
352   MachineInstr *emitConditionalComparison(Register LHS, Register RHS,
353                                           CmpInst::Predicate CC,
354                                           AArch64CC::CondCode Predicate,
355                                           AArch64CC::CondCode OutCC,
356                                           MachineIRBuilder &MIB) const;
357   MachineInstr *emitConjunctionRec(Register Val, AArch64CC::CondCode &OutCC,
358                                    bool Negate, Register CCOp,
359                                    AArch64CC::CondCode Predicate,
360                                    MachineIRBuilder &MIB) const;
361 
362   /// Emit a TB(N)Z instruction which tests \p Bit in \p TestReg.
363   /// \p IsNegative is true if the test should be "not zero".
364   /// This will also optimize the test bit instruction when possible.
365   MachineInstr *emitTestBit(Register TestReg, uint64_t Bit, bool IsNegative,
366                             MachineBasicBlock *DstMBB,
367                             MachineIRBuilder &MIB) const;
368 
369   /// Emit a CB(N)Z instruction which branches to \p DestMBB.
370   MachineInstr *emitCBZ(Register CompareReg, bool IsNegative,
371                         MachineBasicBlock *DestMBB,
372                         MachineIRBuilder &MIB) const;
373 
374   // Equivalent to the i32shift_a and friends from AArch64InstrInfo.td.
375   // We use these manually instead of using the importer since it doesn't
376   // support SDNodeXForm.
377   ComplexRendererFns selectShiftA_32(const MachineOperand &Root) const;
378   ComplexRendererFns selectShiftB_32(const MachineOperand &Root) const;
379   ComplexRendererFns selectShiftA_64(const MachineOperand &Root) const;
380   ComplexRendererFns selectShiftB_64(const MachineOperand &Root) const;
381 
382   ComplexRendererFns select12BitValueWithLeftShift(uint64_t Immed) const;
383   ComplexRendererFns selectArithImmed(MachineOperand &Root) const;
384   ComplexRendererFns selectNegArithImmed(MachineOperand &Root) const;
385 
386   ComplexRendererFns selectAddrModeUnscaled(MachineOperand &Root,
387                                             unsigned Size) const;
388 
      // Fixed-width wrappers around selectAddrModeUnscaled(): the numeric suffix
      // is a width in bits, and the value forwarded as Size is that width in
      // bytes (8 -> 1, 16 -> 2, 32 -> 4, 64 -> 8, 128 -> 16).
389   ComplexRendererFns selectAddrModeUnscaled8(MachineOperand &Root) const {
390     return selectAddrModeUnscaled(Root, 1);
391   }
392   ComplexRendererFns selectAddrModeUnscaled16(MachineOperand &Root) const {
393     return selectAddrModeUnscaled(Root, 2);
394   }
395   ComplexRendererFns selectAddrModeUnscaled32(MachineOperand &Root) const {
396     return selectAddrModeUnscaled(Root, 4);
397   }
398   ComplexRendererFns selectAddrModeUnscaled64(MachineOperand &Root) const {
399     return selectAddrModeUnscaled(Root, 8);
400   }
401   ComplexRendererFns selectAddrModeUnscaled128(MachineOperand &Root) const {
402     return selectAddrModeUnscaled(Root, 16);
403   }
404 
405   /// Helper to try to fold in a GISEL_ADD_LOW into an immediate, to be used
406   /// from complex pattern matchers like selectAddrModeIndexed().
407   ComplexRendererFns tryFoldAddLowIntoImm(MachineInstr &RootDef, unsigned Size,
408                                           MachineRegisterInfo &MRI) const;
409 
410   ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root,
411                                            unsigned Size) const;
      // Width is given in bits; it is forwarded to the overload above as
      // Width / 8.
412   template <int Width>
413   ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root) const {
414     return selectAddrModeIndexed(Root, Width / 8);
415   }
416 
417   std::optional<bool>
418   isWorthFoldingIntoAddrMode(MachineInstr &MI,
419                              const MachineRegisterInfo &MRI) const;
420 
421   bool isWorthFoldingIntoExtendedReg(MachineInstr &MI,
422                                      const MachineRegisterInfo &MRI,
423                                      bool IsAddrOperand) const;
424   ComplexRendererFns
425   selectAddrModeShiftedExtendXReg(MachineOperand &Root,
426                                   unsigned SizeInBytes) const;
427 
428   /// Returns a \p ComplexRendererFns which contains a base, offset, and whether
429   /// or not a shift + extend should be folded into an addressing mode. Returns
430   /// std::nullopt when this is not profitable or possible.
431   ComplexRendererFns
432   selectExtendedSHL(MachineOperand &Root, MachineOperand &Base,
433                     MachineOperand &Offset, unsigned SizeInBytes,
434                     bool WantsExt) const;
435   ComplexRendererFns selectAddrModeRegisterOffset(MachineOperand &Root) const;
436   ComplexRendererFns selectAddrModeXRO(MachineOperand &Root,
437                                        unsigned SizeInBytes) const;
      // Width is given in bits; forwarded as SizeInBytes = Width / 8.
438   template <int Width>
439   ComplexRendererFns selectAddrModeXRO(MachineOperand &Root) const {
440     return selectAddrModeXRO(Root, Width / 8);
441   }
442 
443   ComplexRendererFns selectAddrModeWRO(MachineOperand &Root,
444                                        unsigned SizeInBytes) const;
      // Width is given in bits; forwarded as SizeInBytes = Width / 8.
445   template <int Width>
446   ComplexRendererFns selectAddrModeWRO(MachineOperand &Root) const {
447     return selectAddrModeWRO(Root, Width / 8);
448   }
449 
450   ComplexRendererFns selectShiftedRegister(MachineOperand &Root,
451                                            bool AllowROR = false) const;
452 
      // Arithmetic variant: uses the default AllowROR = false.
453   ComplexRendererFns selectArithShiftedRegister(MachineOperand &Root) const {
454     return selectShiftedRegister(Root);
455   }
456 
      // Logical variant: passes AllowROR = true.
457   ComplexRendererFns selectLogicalShiftedRegister(MachineOperand &Root) const {
458     return selectShiftedRegister(Root, true);
459   }
460 
461   /// Given an extend instruction, determine the correct shift-extend type for
462   /// that instruction.
463   ///
464   /// If the instruction is going to be used in a load or store, pass
465   /// \p IsLoadStore = true.
466   AArch64_AM::ShiftExtendType
467   getExtendTypeForInst(MachineInstr &MI, MachineRegisterInfo &MRI,
468                        bool IsLoadStore = false) const;
469 
470   /// Move \p Reg to \p RC if \p Reg is not already on \p RC.
471   ///
472   /// \returns Either \p Reg if no change was necessary, or the new register
473   /// created by moving \p Reg.
474   ///
475   /// Note: This uses emitCopy right now.
476   Register moveScalarRegClass(Register Reg, const TargetRegisterClass &RC,
477                               MachineIRBuilder &MIB) const;
478 
479   ComplexRendererFns selectArithExtendedRegister(MachineOperand &Root) const;
480 
481   ComplexRendererFns selectExtractHigh(MachineOperand &Root) const;
482 
483   void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
484                       int OpIdx = -1) const;
485   void renderLogicalImm32(MachineInstrBuilder &MIB, const MachineInstr &I,
486                           int OpIdx = -1) const;
487   void renderLogicalImm64(MachineInstrBuilder &MIB, const MachineInstr &I,
488                           int OpIdx = -1) const;
489   void renderUbsanTrap(MachineInstrBuilder &MIB, const MachineInstr &MI,
490                        int OpIdx) const;
491   void renderFPImm16(MachineInstrBuilder &MIB, const MachineInstr &MI,
492                      int OpIdx = -1) const;
493   void renderFPImm32(MachineInstrBuilder &MIB, const MachineInstr &MI,
494                      int OpIdx = -1) const;
495   void renderFPImm64(MachineInstrBuilder &MIB, const MachineInstr &MI,
496                      int OpIdx = -1) const;
497   void renderFPImm32SIMDModImmType4(MachineInstrBuilder &MIB,
498                                     const MachineInstr &MI,
499                                     int OpIdx = -1) const;
500 
501   // Materialize a GlobalValue or BlockAddress using a movz+movk sequence.
502   void materializeLargeCMVal(MachineInstr &I, const Value *V, unsigned OpFlags);
503 
504   // Optimization methods.
505   bool tryOptSelect(GSelect &Sel);
506   bool tryOptSelectConjunction(GSelect &Sel, MachineInstr &CondMI);
507   MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
508                                       MachineOperand &Predicate,
509                                       MachineIRBuilder &MIRBuilder) const;
510 
511   /// Return true if \p MI is a load or store of \p NumBytes bytes.
512   bool isLoadStoreOfNumBytes(const MachineInstr &MI, unsigned NumBytes) const;
513 
514   /// Returns true if \p MI is guaranteed to have the high-half of a 64-bit
515   /// register zeroed out. In other words, the result of MI has been explicitly
516   /// zero extended.
517   bool isDef32(const MachineInstr &MI) const;
518 
519   const AArch64TargetMachine &TM;
520   const AArch64Subtarget &STI;
521   const AArch64InstrInfo &TII;
522   const AArch64RegisterInfo &TRI;
523   const AArch64RegisterBankInfo &RBI;
524 
      // Recomputed in setupMF(): true unless the current function carries the
      // SpeculativeLoadHardening attribute.
525   bool ProduceNonFlagSettingCondBr = false;
526 
527   // Some cached values used during selection.
528   // We use LR as a live-in register, and we keep track of it here as it can be
529   // clobbered by calls.
530   Register MFReturnAddr;
531 
      // Builder owned by the selector; setupMF() points it at the function being
      // selected.
532   MachineIRBuilder MIB;
533 
534 #define GET_GLOBALISEL_PREDICATES_DECL
535 #include "AArch64GenGlobalISel.inc"
536 #undef GET_GLOBALISEL_PREDICATES_DECL
537 
538 // We declare the temporaries used by selectImpl() in the class to minimize the
539 // cost of constructing placeholder values.
540 #define GET_GLOBALISEL_TEMPORARIES_DECL
541 #include "AArch64GenGlobalISel.inc"
542 #undef GET_GLOBALISEL_TEMPORARIES_DECL
543 };
544 
545 } // end anonymous namespace
546 
547 #define GET_GLOBALISEL_IMPL
548 #include "AArch64GenGlobalISel.inc"
549 #undef GET_GLOBALISEL_IMPL
550 
/// Construct the selector for one target machine / subtarget pair.
///
/// Stores the references it is given and derives the instruction info and
/// register info from \p STI. The initializer list deliberately ends with a
/// trailing comma after RBI: the rest of it is supplied by the
/// GET_GLOBALISEL_PREDICATES_INIT and GET_GLOBALISEL_TEMPORARIES_INIT sections
/// of the generated .inc file, so the order of the directives below must not
/// change.
551 AArch64InstructionSelector::AArch64InstructionSelector(
552     const AArch64TargetMachine &TM, const AArch64Subtarget &STI,
553     const AArch64RegisterBankInfo &RBI)
554     : TM(TM), STI(STI), TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()),
555       RBI(RBI),
556 #define GET_GLOBALISEL_PREDICATES_INIT
557 #include "AArch64GenGlobalISel.inc"
558 #undef GET_GLOBALISEL_PREDICATES_INIT
559 #define GET_GLOBALISEL_TEMPORARIES_INIT
560 #include "AArch64GenGlobalISel.inc"
561 #undef GET_GLOBALISEL_TEMPORARIES_INIT
562 {
563 }
564 
565 // FIXME: This should be target-independent, inferred from the types declared
566 // for each class in the bank.
567 //
568 /// Given a register bank, and a type, return the smallest register class that
569 /// can represent that combination.
570 static const TargetRegisterClass *
571 getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB,
572                          bool GetAllRegSet = false) {
573   if (RB.getID() == AArch64::GPRRegBankID) {
574     if (Ty.getSizeInBits() <= 32)
575       return GetAllRegSet ? &AArch64::GPR32allRegClass
576                           : &AArch64::GPR32RegClass;
577     if (Ty.getSizeInBits() == 64)
578       return GetAllRegSet ? &AArch64::GPR64allRegClass
579                           : &AArch64::GPR64RegClass;
580     if (Ty.getSizeInBits() == 128)
581       return &AArch64::XSeqPairsClassRegClass;
582     return nullptr;
583   }
584 
585   if (RB.getID() == AArch64::FPRRegBankID) {
586     switch (Ty.getSizeInBits()) {
587     case 8:
588       return &AArch64::FPR8RegClass;
589     case 16:
590       return &AArch64::FPR16RegClass;
591     case 32:
592       return &AArch64::FPR32RegClass;
593     case 64:
594       return &AArch64::FPR64RegClass;
595     case 128:
596       return &AArch64::FPR128RegClass;
597     }
598     return nullptr;
599   }
600 
601   return nullptr;
602 }
603 
604 /// Given a register bank, and size in bits, return the smallest register class
605 /// that can represent that combination.
606 static const TargetRegisterClass *
607 getMinClassForRegBank(const RegisterBank &RB, TypeSize SizeInBits,
608                       bool GetAllRegSet = false) {
609   if (SizeInBits.isScalable()) {
610     assert(RB.getID() == AArch64::FPRRegBankID &&
611            "Expected FPR regbank for scalable type size");
612     return &AArch64::ZPRRegClass;
613   }
614 
615   unsigned RegBankID = RB.getID();
616 
617   if (RegBankID == AArch64::GPRRegBankID) {
618     if (SizeInBits <= 32)
619       return GetAllRegSet ? &AArch64::GPR32allRegClass
620                           : &AArch64::GPR32RegClass;
621     if (SizeInBits == 64)
622       return GetAllRegSet ? &AArch64::GPR64allRegClass
623                           : &AArch64::GPR64RegClass;
624     if (SizeInBits == 128)
625       return &AArch64::XSeqPairsClassRegClass;
626   }
627 
628   if (RegBankID == AArch64::FPRRegBankID) {
629     switch (SizeInBits) {
630     default:
631       return nullptr;
632     case 8:
633       return &AArch64::FPR8RegClass;
634     case 16:
635       return &AArch64::FPR16RegClass;
636     case 32:
637       return &AArch64::FPR32RegClass;
638     case 64:
639       return &AArch64::FPR64RegClass;
640     case 128:
641       return &AArch64::FPR128RegClass;
642     }
643   }
644 
645   return nullptr;
646 }
647 
648 /// Returns the correct subregister to use for a given register class.
649 static bool getSubRegForClass(const TargetRegisterClass *RC,
650                               const TargetRegisterInfo &TRI, unsigned &SubReg) {
651   switch (TRI.getRegSizeInBits(*RC)) {
652   case 8:
653     SubReg = AArch64::bsub;
654     break;
655   case 16:
656     SubReg = AArch64::hsub;
657     break;
658   case 32:
659     if (RC != &AArch64::FPR32RegClass)
660       SubReg = AArch64::sub_32;
661     else
662       SubReg = AArch64::ssub;
663     break;
664   case 64:
665     SubReg = AArch64::dsub;
666     break;
667   default:
668     LLVM_DEBUG(
669         dbgs() << "Couldn't find appropriate subregister for register class.");
670     return false;
671   }
672 
673   return true;
674 }
675 
676 /// Returns the minimum size the given register bank can hold.
677 static unsigned getMinSizeForRegBank(const RegisterBank &RB) {
678   switch (RB.getID()) {
679   case AArch64::GPRRegBankID:
680     return 32;
681   case AArch64::FPRRegBankID:
682     return 8;
683   default:
684     llvm_unreachable("Tried to get minimum size for unknown register bank.");
685   }
686 }
687 
688 /// Create a REG_SEQUENCE instruction using the registers in \p Regs.
689 /// Helper function for functions like createDTuple and createQTuple.
690 ///
691 /// \p RegClassIDs - The list of register class IDs available for some tuple of
692 /// a scalar class. E.g. QQRegClassID, QQQRegClassID, QQQQRegClassID. This is
693 /// expected to contain between 2 and 4 tuple classes.
694 ///
695 /// \p SubRegs - The list of subregister classes associated with each register
696 /// class ID in \p RegClassIDs. E.g., QQRegClassID should use the qsub0
697 /// subregister class. The index of each subregister class is expected to
698 /// correspond with the index of each register class.
699 ///
700 /// \returns Either the destination register of REG_SEQUENCE instruction that
701 /// was created, or the 0th element of \p Regs if \p Regs contains a single
702 /// element.
703 static Register createTuple(ArrayRef<Register> Regs,
704                             const unsigned RegClassIDs[],
705                             const unsigned SubRegs[], MachineIRBuilder &MIB) {
706   unsigned NumRegs = Regs.size();
707   if (NumRegs == 1)
708     return Regs[0];
709   assert(NumRegs >= 2 && NumRegs <= 4 &&
710          "Only support between two and 4 registers in a tuple!");
711   const TargetRegisterInfo *TRI = MIB.getMF().getSubtarget().getRegisterInfo();
712   auto *DesiredClass = TRI->getRegClass(RegClassIDs[NumRegs - 2]);
713   auto RegSequence =
714       MIB.buildInstr(TargetOpcode::REG_SEQUENCE, {DesiredClass}, {});
715   for (unsigned I = 0, E = Regs.size(); I < E; ++I) {
716     RegSequence.addUse(Regs[I]);
717     RegSequence.addImm(SubRegs[I]);
718   }
719   return RegSequence.getReg(0);
720 }
721 
722 /// Create a tuple of D-registers using the registers in \p Regs.
723 static Register createDTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) {
724   static const unsigned RegClassIDs[] = {
725       AArch64::DDRegClassID, AArch64::DDDRegClassID, AArch64::DDDDRegClassID};
726   static const unsigned SubRegs[] = {AArch64::dsub0, AArch64::dsub1,
727                                      AArch64::dsub2, AArch64::dsub3};
728   return createTuple(Regs, RegClassIDs, SubRegs, MIB);
729 }
730 
731 /// Create a tuple of Q-registers using the registers in \p Regs.
732 static Register createQTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) {
733   static const unsigned RegClassIDs[] = {
734       AArch64::QQRegClassID, AArch64::QQQRegClassID, AArch64::QQQQRegClassID};
735   static const unsigned SubRegs[] = {AArch64::qsub0, AArch64::qsub1,
736                                      AArch64::qsub2, AArch64::qsub3};
737   return createTuple(Regs, RegClassIDs, SubRegs, MIB);
738 }
739 
740 static std::optional<uint64_t> getImmedFromMO(const MachineOperand &Root) {
741   auto &MI = *Root.getParent();
742   auto &MBB = *MI.getParent();
743   auto &MF = *MBB.getParent();
744   auto &MRI = MF.getRegInfo();
745   uint64_t Immed;
746   if (Root.isImm())
747     Immed = Root.getImm();
748   else if (Root.isCImm())
749     Immed = Root.getCImm()->getZExtValue();
750   else if (Root.isReg()) {
751     auto ValAndVReg =
752         getIConstantVRegValWithLookThrough(Root.getReg(), MRI, true);
753     if (!ValAndVReg)
754       return std::nullopt;
755     Immed = ValAndVReg->Value.getSExtValue();
756   } else
757     return std::nullopt;
758   return Immed;
759 }
760 
761 /// Check whether \p I is a currently unsupported binary operation:
762 /// - it has an unsized type
763 /// - an operand is not a vreg
764 /// - all operands are not in the same bank
765 /// These are checks that should someday live in the verifier, but right now,
766 /// these are mostly limitations of the aarch64 selector.
767 static bool unsupportedBinOp(const MachineInstr &I,
768                              const AArch64RegisterBankInfo &RBI,
769                              const MachineRegisterInfo &MRI,
770                              const AArch64RegisterInfo &TRI) {
771   LLT Ty = MRI.getType(I.getOperand(0).getReg());
772   if (!Ty.isValid()) {
773     LLVM_DEBUG(dbgs() << "Generic binop register should be typed\n");
774     return true;
775   }
776 
777   const RegisterBank *PrevOpBank = nullptr;
778   for (auto &MO : I.operands()) {
779     // FIXME: Support non-register operands.
780     if (!MO.isReg()) {
781       LLVM_DEBUG(dbgs() << "Generic inst non-reg operands are unsupported\n");
782       return true;
783     }
784 
785     // FIXME: Can generic operations have physical registers operands? If
786     // so, this will need to be taught about that, and we'll need to get the
787     // bank out of the minimal class for the register.
788     // Either way, this needs to be documented (and possibly verified).
789     if (!MO.getReg().isVirtual()) {
790       LLVM_DEBUG(dbgs() << "Generic inst has physical register operand\n");
791       return true;
792     }
793 
794     const RegisterBank *OpBank = RBI.getRegBank(MO.getReg(), MRI, TRI);
795     if (!OpBank) {
796       LLVM_DEBUG(dbgs() << "Generic register has no bank or class\n");
797       return true;
798     }
799 
800     if (PrevOpBank && OpBank != PrevOpBank) {
801       LLVM_DEBUG(dbgs() << "Generic inst operands have different banks\n");
802       return true;
803     }
804     PrevOpBank = OpBank;
805   }
806   return false;
807 }
808 
809 /// Select the AArch64 opcode for the basic binary operation \p GenericOpc
810 /// (such as G_OR or G_SDIV), appropriate for the register bank \p RegBankID
811 /// and of size \p OpSize.
812 /// \returns \p GenericOpc if the combination is unsupported.
813 static unsigned selectBinaryOp(unsigned GenericOpc, unsigned RegBankID,
814                                unsigned OpSize) {
815   switch (RegBankID) {
816   case AArch64::GPRRegBankID:
817     if (OpSize == 32) {
818       switch (GenericOpc) {
819       case TargetOpcode::G_SHL:
820         return AArch64::LSLVWr;
821       case TargetOpcode::G_LSHR:
822         return AArch64::LSRVWr;
823       case TargetOpcode::G_ASHR:
824         return AArch64::ASRVWr;
825       default:
826         return GenericOpc;
827       }
828     } else if (OpSize == 64) {
829       switch (GenericOpc) {
830       case TargetOpcode::G_PTR_ADD:
831         return AArch64::ADDXrr;
832       case TargetOpcode::G_SHL:
833         return AArch64::LSLVXr;
834       case TargetOpcode::G_LSHR:
835         return AArch64::LSRVXr;
836       case TargetOpcode::G_ASHR:
837         return AArch64::ASRVXr;
838       default:
839         return GenericOpc;
840       }
841     }
842     break;
843   case AArch64::FPRRegBankID:
844     switch (OpSize) {
845     case 32:
846       switch (GenericOpc) {
847       case TargetOpcode::G_FADD:
848         return AArch64::FADDSrr;
849       case TargetOpcode::G_FSUB:
850         return AArch64::FSUBSrr;
851       case TargetOpcode::G_FMUL:
852         return AArch64::FMULSrr;
853       case TargetOpcode::G_FDIV:
854         return AArch64::FDIVSrr;
855       default:
856         return GenericOpc;
857       }
858     case 64:
859       switch (GenericOpc) {
860       case TargetOpcode::G_FADD:
861         return AArch64::FADDDrr;
862       case TargetOpcode::G_FSUB:
863         return AArch64::FSUBDrr;
864       case TargetOpcode::G_FMUL:
865         return AArch64::FMULDrr;
866       case TargetOpcode::G_FDIV:
867         return AArch64::FDIVDrr;
868       case TargetOpcode::G_OR:
869         return AArch64::ORRv8i8;
870       default:
871         return GenericOpc;
872       }
873     }
874     break;
875   }
876   return GenericOpc;
877 }
878 
879 /// Select the AArch64 opcode for the G_LOAD or G_STORE operation \p GenericOpc,
880 /// appropriate for the (value) register bank \p RegBankID and of memory access
881 /// size \p OpSize.  This returns the variant with the base+unsigned-immediate
882 /// addressing mode (e.g., LDRXui).
883 /// \returns \p GenericOpc if the combination is unsupported.
884 static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID,
885                                     unsigned OpSize) {
886   const bool isStore = GenericOpc == TargetOpcode::G_STORE;
887   switch (RegBankID) {
888   case AArch64::GPRRegBankID:
889     switch (OpSize) {
890     case 8:
891       return isStore ? AArch64::STRBBui : AArch64::LDRBBui;
892     case 16:
893       return isStore ? AArch64::STRHHui : AArch64::LDRHHui;
894     case 32:
895       return isStore ? AArch64::STRWui : AArch64::LDRWui;
896     case 64:
897       return isStore ? AArch64::STRXui : AArch64::LDRXui;
898     }
899     break;
900   case AArch64::FPRRegBankID:
901     switch (OpSize) {
902     case 8:
903       return isStore ? AArch64::STRBui : AArch64::LDRBui;
904     case 16:
905       return isStore ? AArch64::STRHui : AArch64::LDRHui;
906     case 32:
907       return isStore ? AArch64::STRSui : AArch64::LDRSui;
908     case 64:
909       return isStore ? AArch64::STRDui : AArch64::LDRDui;
910     case 128:
911       return isStore ? AArch64::STRQui : AArch64::LDRQui;
912     }
913     break;
914   }
915   return GenericOpc;
916 }
917 
918 /// Helper function for selectCopy. Inserts a subregister copy from \p SrcReg
919 /// to \p *To.
920 ///
921 /// E.g "To = COPY SrcReg:SubReg"
922 static bool copySubReg(MachineInstr &I, MachineRegisterInfo &MRI,
923                        const RegisterBankInfo &RBI, Register SrcReg,
924                        const TargetRegisterClass *To, unsigned SubReg) {
925   assert(SrcReg.isValid() && "Expected a valid source register?");
926   assert(To && "Destination register class cannot be null");
927   assert(SubReg && "Expected a valid subregister");
928 
929   MachineIRBuilder MIB(I);
930   auto SubRegCopy =
931       MIB.buildInstr(TargetOpcode::COPY, {To}, {}).addReg(SrcReg, 0, SubReg);
932   MachineOperand &RegOp = I.getOperand(1);
933   RegOp.setReg(SubRegCopy.getReg(0));
934 
935   // It's possible that the destination register won't be constrained. Make
936   // sure that happens.
937   if (!I.getOperand(0).getReg().isPhysical())
938     RBI.constrainGenericRegister(I.getOperand(0).getReg(), *To, MRI);
939 
940   return true;
941 }
942 
943 /// Helper function to get the source and destination register classes for a
944 /// copy. Returns a std::pair containing the source register class for the
945 /// copy, and the destination register class for the copy. If a register class
946 /// cannot be determined, then it will be nullptr.
947 static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
948 getRegClassesForCopy(MachineInstr &I, const TargetInstrInfo &TII,
949                      MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
950                      const RegisterBankInfo &RBI) {
951   Register DstReg = I.getOperand(0).getReg();
952   Register SrcReg = I.getOperand(1).getReg();
953   const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
954   const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
955 
956   TypeSize DstSize = RBI.getSizeInBits(DstReg, MRI, TRI);
957   TypeSize SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
958 
959   // Special casing for cross-bank copies of s1s. We can technically represent
960   // a 1-bit value with any size of register. The minimum size for a GPR is 32
961   // bits. So, we need to put the FPR on 32 bits as well.
962   //
963   // FIXME: I'm not sure if this case holds true outside of copies. If it does,
964   // then we can pull it into the helpers that get the appropriate class for a
965   // register bank. Or make a new helper that carries along some constraint
966   // information.
967   if (SrcRegBank != DstRegBank && (DstSize == 1 && SrcSize == 1))
968     SrcSize = DstSize = TypeSize::getFixed(32);
969 
970   return {getMinClassForRegBank(SrcRegBank, SrcSize, true),
971           getMinClassForRegBank(DstRegBank, DstSize, true)};
972 }
973 
974 // FIXME: We need some sort of API in RBI/TRI to allow generic code to
975 // constrain operands of simple instructions given a TargetRegisterClass
976 // and LLT
977 static bool selectDebugInstr(MachineInstr &I, MachineRegisterInfo &MRI,
978                              const RegisterBankInfo &RBI) {
979   for (MachineOperand &MO : I.operands()) {
980     if (!MO.isReg())
981       continue;
982     Register Reg = MO.getReg();
983     if (!Reg)
984       continue;
985     if (Reg.isPhysical())
986       continue;
987     LLT Ty = MRI.getType(Reg);
988     const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
989     const TargetRegisterClass *RC =
990         RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
991     if (!RC) {
992       const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
993       RC = getRegClassForTypeOnBank(Ty, RB);
994       if (!RC) {
995         LLVM_DEBUG(
996             dbgs() << "Warning: DBG_VALUE operand has unexpected size/bank\n");
997         break;
998       }
999     }
1000     RBI.constrainGenericRegister(Reg, *RC, MRI);
1001   }
1002 
1003   return true;
1004 }
1005 
1006 static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
1007                        MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
1008                        const RegisterBankInfo &RBI) {
1009   Register DstReg = I.getOperand(0).getReg();
1010   Register SrcReg = I.getOperand(1).getReg();
1011   const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
1012   const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
1013 
1014   // Find the correct register classes for the source and destination registers.
1015   const TargetRegisterClass *SrcRC;
1016   const TargetRegisterClass *DstRC;
1017   std::tie(SrcRC, DstRC) = getRegClassesForCopy(I, TII, MRI, TRI, RBI);
1018 
1019   if (!DstRC) {
1020     LLVM_DEBUG(dbgs() << "Unexpected dest size "
1021                       << RBI.getSizeInBits(DstReg, MRI, TRI) << '\n');
1022     return false;
1023   }
1024 
1025   // Is this a copy? If so, then we may need to insert a subregister copy.
1026   if (I.isCopy()) {
1027     // Yes. Check if there's anything to fix up.
1028     if (!SrcRC) {
1029       LLVM_DEBUG(dbgs() << "Couldn't determine source register class\n");
1030       return false;
1031     }
1032 
1033     const TypeSize SrcSize = TRI.getRegSizeInBits(*SrcRC);
1034     const TypeSize DstSize = TRI.getRegSizeInBits(*DstRC);
1035     unsigned SubReg;
1036 
1037     // If the source bank doesn't support a subregister copy small enough,
1038     // then we first need to copy to the destination bank.
1039     if (getMinSizeForRegBank(SrcRegBank) > DstSize) {
1040       const TargetRegisterClass *DstTempRC =
1041           getMinClassForRegBank(DstRegBank, SrcSize, /* GetAllRegSet */ true);
1042       getSubRegForClass(DstRC, TRI, SubReg);
1043 
1044       MachineIRBuilder MIB(I);
1045       auto Copy = MIB.buildCopy({DstTempRC}, {SrcReg});
1046       copySubReg(I, MRI, RBI, Copy.getReg(0), DstRC, SubReg);
1047     } else if (SrcSize > DstSize) {
1048       // If the source register is bigger than the destination we need to
1049       // perform a subregister copy.
1050       const TargetRegisterClass *SubRegRC =
1051           getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true);
1052       getSubRegForClass(SubRegRC, TRI, SubReg);
1053       copySubReg(I, MRI, RBI, SrcReg, DstRC, SubReg);
1054     } else if (DstSize > SrcSize) {
1055       // If the destination register is bigger than the source we need to do
1056       // a promotion using SUBREG_TO_REG.
1057       const TargetRegisterClass *PromotionRC =
1058           getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true);
1059       getSubRegForClass(SrcRC, TRI, SubReg);
1060 
1061       Register PromoteReg = MRI.createVirtualRegister(PromotionRC);
1062       BuildMI(*I.getParent(), I, I.getDebugLoc(),
1063               TII.get(AArch64::SUBREG_TO_REG), PromoteReg)
1064           .addImm(0)
1065           .addUse(SrcReg)
1066           .addImm(SubReg);
1067       MachineOperand &RegOp = I.getOperand(1);
1068       RegOp.setReg(PromoteReg);
1069     }
1070 
1071     // If the destination is a physical register, then there's nothing to
1072     // change, so we're done.
1073     if (DstReg.isPhysical())
1074       return true;
1075   }
1076 
1077   // No need to constrain SrcReg. It will get constrained when we hit another
1078   // of its use or its defs. Copies do not have constraints.
1079   if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
1080     LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
1081                       << " operand\n");
1082     return false;
1083   }
1084 
1085   // If this a GPR ZEXT that we want to just reduce down into a copy.
1086   // The sizes will be mismatched with the source < 32b but that's ok.
1087   if (I.getOpcode() == TargetOpcode::G_ZEXT) {
1088     I.setDesc(TII.get(AArch64::COPY));
1089     assert(SrcRegBank.getID() == AArch64::GPRRegBankID);
1090     return selectCopy(I, TII, MRI, TRI, RBI);
1091   }
1092 
1093   I.setDesc(TII.get(AArch64::COPY));
1094   return true;
1095 }
1096 
1097 static unsigned selectFPConvOpc(unsigned GenericOpc, LLT DstTy, LLT SrcTy) {
1098   if (!DstTy.isScalar() || !SrcTy.isScalar())
1099     return GenericOpc;
1100 
1101   const unsigned DstSize = DstTy.getSizeInBits();
1102   const unsigned SrcSize = SrcTy.getSizeInBits();
1103 
1104   switch (DstSize) {
1105   case 32:
1106     switch (SrcSize) {
1107     case 32:
1108       switch (GenericOpc) {
1109       case TargetOpcode::G_SITOFP:
1110         return AArch64::SCVTFUWSri;
1111       case TargetOpcode::G_UITOFP:
1112         return AArch64::UCVTFUWSri;
1113       case TargetOpcode::G_FPTOSI:
1114         return AArch64::FCVTZSUWSr;
1115       case TargetOpcode::G_FPTOUI:
1116         return AArch64::FCVTZUUWSr;
1117       default:
1118         return GenericOpc;
1119       }
1120     case 64:
1121       switch (GenericOpc) {
1122       case TargetOpcode::G_SITOFP:
1123         return AArch64::SCVTFUXSri;
1124       case TargetOpcode::G_UITOFP:
1125         return AArch64::UCVTFUXSri;
1126       case TargetOpcode::G_FPTOSI:
1127         return AArch64::FCVTZSUWDr;
1128       case TargetOpcode::G_FPTOUI:
1129         return AArch64::FCVTZUUWDr;
1130       default:
1131         return GenericOpc;
1132       }
1133     default:
1134       return GenericOpc;
1135     }
1136   case 64:
1137     switch (SrcSize) {
1138     case 32:
1139       switch (GenericOpc) {
1140       case TargetOpcode::G_SITOFP:
1141         return AArch64::SCVTFUWDri;
1142       case TargetOpcode::G_UITOFP:
1143         return AArch64::UCVTFUWDri;
1144       case TargetOpcode::G_FPTOSI:
1145         return AArch64::FCVTZSUXSr;
1146       case TargetOpcode::G_FPTOUI:
1147         return AArch64::FCVTZUUXSr;
1148       default:
1149         return GenericOpc;
1150       }
1151     case 64:
1152       switch (GenericOpc) {
1153       case TargetOpcode::G_SITOFP:
1154         return AArch64::SCVTFUXDri;
1155       case TargetOpcode::G_UITOFP:
1156         return AArch64::UCVTFUXDri;
1157       case TargetOpcode::G_FPTOSI:
1158         return AArch64::FCVTZSUXDr;
1159       case TargetOpcode::G_FPTOUI:
1160         return AArch64::FCVTZUUXDr;
1161       default:
1162         return GenericOpc;
1163       }
1164     default:
1165       return GenericOpc;
1166     }
1167   default:
1168     return GenericOpc;
1169   };
1170   return GenericOpc;
1171 }
1172 
1173 MachineInstr *
1174 AArch64InstructionSelector::emitSelect(Register Dst, Register True,
1175                                        Register False, AArch64CC::CondCode CC,
1176                                        MachineIRBuilder &MIB) const {
1177   MachineRegisterInfo &MRI = *MIB.getMRI();
1178   assert(RBI.getRegBank(False, MRI, TRI)->getID() ==
1179              RBI.getRegBank(True, MRI, TRI)->getID() &&
1180          "Expected both select operands to have the same regbank?");
1181   LLT Ty = MRI.getType(True);
1182   if (Ty.isVector())
1183     return nullptr;
1184   const unsigned Size = Ty.getSizeInBits();
1185   assert((Size == 32 || Size == 64) &&
1186          "Expected 32 bit or 64 bit select only?");
1187   const bool Is32Bit = Size == 32;
1188   if (RBI.getRegBank(True, MRI, TRI)->getID() != AArch64::GPRRegBankID) {
1189     unsigned Opc = Is32Bit ? AArch64::FCSELSrrr : AArch64::FCSELDrrr;
1190     auto FCSel = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC);
1191     constrainSelectedInstRegOperands(*FCSel, TII, TRI, RBI);
1192     return &*FCSel;
1193   }
1194 
1195   // By default, we'll try and emit a CSEL.
1196   unsigned Opc = Is32Bit ? AArch64::CSELWr : AArch64::CSELXr;
1197   bool Optimized = false;
1198   auto TryFoldBinOpIntoSelect = [&Opc, Is32Bit, &CC, &MRI,
1199                                  &Optimized](Register &Reg, Register &OtherReg,
1200                                              bool Invert) {
1201     if (Optimized)
1202       return false;
1203 
1204     // Attempt to fold:
1205     //
1206     // %sub = G_SUB 0, %x
1207     // %select = G_SELECT cc, %reg, %sub
1208     //
1209     // Into:
1210     // %select = CSNEG %reg, %x, cc
1211     Register MatchReg;
1212     if (mi_match(Reg, MRI, m_Neg(m_Reg(MatchReg)))) {
1213       Opc = Is32Bit ? AArch64::CSNEGWr : AArch64::CSNEGXr;
1214       Reg = MatchReg;
1215       if (Invert) {
1216         CC = AArch64CC::getInvertedCondCode(CC);
1217         std::swap(Reg, OtherReg);
1218       }
1219       return true;
1220     }
1221 
1222     // Attempt to fold:
1223     //
1224     // %xor = G_XOR %x, -1
1225     // %select = G_SELECT cc, %reg, %xor
1226     //
1227     // Into:
1228     // %select = CSINV %reg, %x, cc
1229     if (mi_match(Reg, MRI, m_Not(m_Reg(MatchReg)))) {
1230       Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1231       Reg = MatchReg;
1232       if (Invert) {
1233         CC = AArch64CC::getInvertedCondCode(CC);
1234         std::swap(Reg, OtherReg);
1235       }
1236       return true;
1237     }
1238 
1239     // Attempt to fold:
1240     //
1241     // %add = G_ADD %x, 1
1242     // %select = G_SELECT cc, %reg, %add
1243     //
1244     // Into:
1245     // %select = CSINC %reg, %x, cc
1246     if (mi_match(Reg, MRI,
1247                  m_any_of(m_GAdd(m_Reg(MatchReg), m_SpecificICst(1)),
1248                           m_GPtrAdd(m_Reg(MatchReg), m_SpecificICst(1))))) {
1249       Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1250       Reg = MatchReg;
1251       if (Invert) {
1252         CC = AArch64CC::getInvertedCondCode(CC);
1253         std::swap(Reg, OtherReg);
1254       }
1255       return true;
1256     }
1257 
1258     return false;
1259   };
1260 
1261   // Helper lambda which tries to use CSINC/CSINV for the instruction when its
1262   // true/false values are constants.
1263   // FIXME: All of these patterns already exist in tablegen. We should be
1264   // able to import these.
1265   auto TryOptSelectCst = [&Opc, &True, &False, &CC, Is32Bit, &MRI,
1266                           &Optimized]() {
1267     if (Optimized)
1268       return false;
1269     auto TrueCst = getIConstantVRegValWithLookThrough(True, MRI);
1270     auto FalseCst = getIConstantVRegValWithLookThrough(False, MRI);
1271     if (!TrueCst && !FalseCst)
1272       return false;
1273 
1274     Register ZReg = Is32Bit ? AArch64::WZR : AArch64::XZR;
1275     if (TrueCst && FalseCst) {
1276       int64_t T = TrueCst->Value.getSExtValue();
1277       int64_t F = FalseCst->Value.getSExtValue();
1278 
1279       if (T == 0 && F == 1) {
1280         // G_SELECT cc, 0, 1 -> CSINC zreg, zreg, cc
1281         Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1282         True = ZReg;
1283         False = ZReg;
1284         return true;
1285       }
1286 
1287       if (T == 0 && F == -1) {
1288         // G_SELECT cc 0, -1 -> CSINV zreg, zreg cc
1289         Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1290         True = ZReg;
1291         False = ZReg;
1292         return true;
1293       }
1294     }
1295 
1296     if (TrueCst) {
1297       int64_t T = TrueCst->Value.getSExtValue();
1298       if (T == 1) {
1299         // G_SELECT cc, 1, f -> CSINC f, zreg, inv_cc
1300         Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1301         True = False;
1302         False = ZReg;
1303         CC = AArch64CC::getInvertedCondCode(CC);
1304         return true;
1305       }
1306 
1307       if (T == -1) {
1308         // G_SELECT cc, -1, f -> CSINV f, zreg, inv_cc
1309         Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1310         True = False;
1311         False = ZReg;
1312         CC = AArch64CC::getInvertedCondCode(CC);
1313         return true;
1314       }
1315     }
1316 
1317     if (FalseCst) {
1318       int64_t F = FalseCst->Value.getSExtValue();
1319       if (F == 1) {
1320         // G_SELECT cc, t, 1 -> CSINC t, zreg, cc
1321         Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1322         False = ZReg;
1323         return true;
1324       }
1325 
1326       if (F == -1) {
1327         // G_SELECT cc, t, -1 -> CSINC t, zreg, cc
1328         Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1329         False = ZReg;
1330         return true;
1331       }
1332     }
1333     return false;
1334   };
1335 
1336   Optimized |= TryFoldBinOpIntoSelect(False, True, /*Invert = */ false);
1337   Optimized |= TryFoldBinOpIntoSelect(True, False, /*Invert = */ true);
1338   Optimized |= TryOptSelectCst();
1339   auto SelectInst = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC);
1340   constrainSelectedInstRegOperands(*SelectInst, TII, TRI, RBI);
1341   return &*SelectInst;
1342 }
1343 
1344 static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) {
1345   switch (P) {
1346   default:
1347     llvm_unreachable("Unknown condition code!");
1348   case CmpInst::ICMP_NE:
1349     return AArch64CC::NE;
1350   case CmpInst::ICMP_EQ:
1351     return AArch64CC::EQ;
1352   case CmpInst::ICMP_SGT:
1353     return AArch64CC::GT;
1354   case CmpInst::ICMP_SGE:
1355     return AArch64CC::GE;
1356   case CmpInst::ICMP_SLT:
1357     return AArch64CC::LT;
1358   case CmpInst::ICMP_SLE:
1359     return AArch64CC::LE;
1360   case CmpInst::ICMP_UGT:
1361     return AArch64CC::HI;
1362   case CmpInst::ICMP_UGE:
1363     return AArch64CC::HS;
1364   case CmpInst::ICMP_ULT:
1365     return AArch64CC::LO;
1366   case CmpInst::ICMP_ULE:
1367     return AArch64CC::LS;
1368   }
1369 }
1370 
1371 /// changeFPCCToORAArch64CC - Convert an IR fp condition code to an AArch64 CC.
1372 static void changeFPCCToORAArch64CC(CmpInst::Predicate CC,
1373                                     AArch64CC::CondCode &CondCode,
1374                                     AArch64CC::CondCode &CondCode2) {
1375   CondCode2 = AArch64CC::AL;
1376   switch (CC) {
1377   default:
1378     llvm_unreachable("Unknown FP condition!");
1379   case CmpInst::FCMP_OEQ:
1380     CondCode = AArch64CC::EQ;
1381     break;
1382   case CmpInst::FCMP_OGT:
1383     CondCode = AArch64CC::GT;
1384     break;
1385   case CmpInst::FCMP_OGE:
1386     CondCode = AArch64CC::GE;
1387     break;
1388   case CmpInst::FCMP_OLT:
1389     CondCode = AArch64CC::MI;
1390     break;
1391   case CmpInst::FCMP_OLE:
1392     CondCode = AArch64CC::LS;
1393     break;
1394   case CmpInst::FCMP_ONE:
1395     CondCode = AArch64CC::MI;
1396     CondCode2 = AArch64CC::GT;
1397     break;
1398   case CmpInst::FCMP_ORD:
1399     CondCode = AArch64CC::VC;
1400     break;
1401   case CmpInst::FCMP_UNO:
1402     CondCode = AArch64CC::VS;
1403     break;
1404   case CmpInst::FCMP_UEQ:
1405     CondCode = AArch64CC::EQ;
1406     CondCode2 = AArch64CC::VS;
1407     break;
1408   case CmpInst::FCMP_UGT:
1409     CondCode = AArch64CC::HI;
1410     break;
1411   case CmpInst::FCMP_UGE:
1412     CondCode = AArch64CC::PL;
1413     break;
1414   case CmpInst::FCMP_ULT:
1415     CondCode = AArch64CC::LT;
1416     break;
1417   case CmpInst::FCMP_ULE:
1418     CondCode = AArch64CC::LE;
1419     break;
1420   case CmpInst::FCMP_UNE:
1421     CondCode = AArch64CC::NE;
1422     break;
1423   }
1424 }
1425 
1426 /// Convert an IR fp condition code to an AArch64 CC.
1427 /// This differs from changeFPCCToAArch64CC in that it returns cond codes that
1428 /// should be AND'ed instead of OR'ed.
1429 static void changeFPCCToANDAArch64CC(CmpInst::Predicate CC,
1430                                      AArch64CC::CondCode &CondCode,
1431                                      AArch64CC::CondCode &CondCode2) {
1432   CondCode2 = AArch64CC::AL;
1433   switch (CC) {
1434   default:
1435     changeFPCCToORAArch64CC(CC, CondCode, CondCode2);
1436     assert(CondCode2 == AArch64CC::AL);
1437     break;
1438   case CmpInst::FCMP_ONE:
1439     // (a one b)
1440     // == ((a olt b) || (a ogt b))
1441     // == ((a ord b) && (a une b))
1442     CondCode = AArch64CC::VC;
1443     CondCode2 = AArch64CC::NE;
1444     break;
1445   case CmpInst::FCMP_UEQ:
1446     // (a ueq b)
1447     // == ((a uno b) || (a oeq b))
1448     // == ((a ule b) && (a uge b))
1449     CondCode = AArch64CC::PL;
1450     CondCode2 = AArch64CC::LE;
1451     break;
1452   }
1453 }
1454 
1455 /// Return a register which can be used as a bit to test in a TB(N)Z.
1456 static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert,
1457                               MachineRegisterInfo &MRI) {
1458   assert(Reg.isValid() && "Expected valid register!");
1459   bool HasZext = false;
1460   while (MachineInstr *MI = getDefIgnoringCopies(Reg, MRI)) {
1461     unsigned Opc = MI->getOpcode();
1462 
1463     if (!MI->getOperand(0).isReg() ||
1464         !MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
1465       break;
1466 
1467     // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
1468     //
1469     // (tbz (trunc x), b) -> (tbz x, b) is always safe, because the bit number
1470     // on the truncated x is the same as the bit number on x.
1471     if (Opc == TargetOpcode::G_ANYEXT || Opc == TargetOpcode::G_ZEXT ||
1472         Opc == TargetOpcode::G_TRUNC) {
1473       if (Opc == TargetOpcode::G_ZEXT)
1474         HasZext = true;
1475 
1476       Register NextReg = MI->getOperand(1).getReg();
1477       // Did we find something worth folding?
1478       if (!NextReg.isValid() || !MRI.hasOneNonDBGUse(NextReg))
1479         break;
1480 
1481       // NextReg is worth folding. Keep looking.
1482       Reg = NextReg;
1483       continue;
1484     }
1485 
1486     // Attempt to find a suitable operation with a constant on one side.
1487     std::optional<uint64_t> C;
1488     Register TestReg;
1489     switch (Opc) {
1490     default:
1491       break;
1492     case TargetOpcode::G_AND:
1493     case TargetOpcode::G_XOR: {
1494       TestReg = MI->getOperand(1).getReg();
1495       Register ConstantReg = MI->getOperand(2).getReg();
1496       auto VRegAndVal = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
1497       if (!VRegAndVal) {
1498         // AND commutes, check the other side for a constant.
1499         // FIXME: Can we canonicalize the constant so that it's always on the
1500         // same side at some point earlier?
1501         std::swap(ConstantReg, TestReg);
1502         VRegAndVal = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
1503       }
1504       if (VRegAndVal) {
1505         if (HasZext)
1506           C = VRegAndVal->Value.getZExtValue();
1507         else
1508           C = VRegAndVal->Value.getSExtValue();
1509       }
1510       break;
1511     }
1512     case TargetOpcode::G_ASHR:
1513     case TargetOpcode::G_LSHR:
1514     case TargetOpcode::G_SHL: {
1515       TestReg = MI->getOperand(1).getReg();
1516       auto VRegAndVal =
1517           getIConstantVRegValWithLookThrough(MI->getOperand(2).getReg(), MRI);
1518       if (VRegAndVal)
1519         C = VRegAndVal->Value.getSExtValue();
1520       break;
1521     }
1522     }
1523 
1524     // Didn't find a constant or viable register. Bail out of the loop.
1525     if (!C || !TestReg.isValid())
1526       break;
1527 
1528     // We found a suitable instruction with a constant. Check to see if we can
1529     // walk through the instruction.
1530     Register NextReg;
1531     unsigned TestRegSize = MRI.getType(TestReg).getSizeInBits();
1532     switch (Opc) {
1533     default:
1534       break;
1535     case TargetOpcode::G_AND:
1536       // (tbz (and x, m), b) -> (tbz x, b) when the b-th bit of m is set.
1537       if ((*C >> Bit) & 1)
1538         NextReg = TestReg;
1539       break;
1540     case TargetOpcode::G_SHL:
1541       // (tbz (shl x, c), b) -> (tbz x, b-c) when b-c is positive and fits in
1542       // the type of the register.
1543       if (*C <= Bit && (Bit - *C) < TestRegSize) {
1544         NextReg = TestReg;
1545         Bit = Bit - *C;
1546       }
1547       break;
1548     case TargetOpcode::G_ASHR:
1549       // (tbz (ashr x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits
1550       // in x
1551       NextReg = TestReg;
1552       Bit = Bit + *C;
1553       if (Bit >= TestRegSize)
1554         Bit = TestRegSize - 1;
1555       break;
1556     case TargetOpcode::G_LSHR:
1557       // (tbz (lshr x, c), b) -> (tbz x, b+c) when b + c is < # bits in x
1558       if ((Bit + *C) < TestRegSize) {
1559         NextReg = TestReg;
1560         Bit = Bit + *C;
1561       }
1562       break;
1563     case TargetOpcode::G_XOR:
1564       // We can walk through a G_XOR by inverting whether we use tbz/tbnz when
1565       // appropriate.
1566       //
1567       // e.g. If x' = xor x, c, and the b-th bit is set in c then
1568       //
1569       // tbz x', b -> tbnz x, b
1570       //
1571       // Because x' only has the b-th bit set if x does not.
1572       if ((*C >> Bit) & 1)
1573         Invert = !Invert;
1574       NextReg = TestReg;
1575       break;
1576     }
1577 
1578     // Check if we found anything worth folding.
1579     if (!NextReg.isValid())
1580       return Reg;
1581     Reg = NextReg;
1582   }
1583 
1584   return Reg;
1585 }
1586 
1587 MachineInstr *AArch64InstructionSelector::emitTestBit(
1588     Register TestReg, uint64_t Bit, bool IsNegative, MachineBasicBlock *DstMBB,
1589     MachineIRBuilder &MIB) const {
1590   assert(TestReg.isValid());
1591   assert(ProduceNonFlagSettingCondBr &&
1592          "Cannot emit TB(N)Z with speculation tracking!");
1593   MachineRegisterInfo &MRI = *MIB.getMRI();
1594 
1595   // Attempt to optimize the test bit by walking over instructions.
1596   TestReg = getTestBitReg(TestReg, Bit, IsNegative, MRI);
1597   LLT Ty = MRI.getType(TestReg);
1598   unsigned Size = Ty.getSizeInBits();
1599   assert(!Ty.isVector() && "Expected a scalar!");
1600   assert(Bit < 64 && "Bit is too large!");
1601 
1602   // When the test register is a 64-bit register, we have to narrow to make
1603   // TBNZW work.
1604   bool UseWReg = Bit < 32;
1605   unsigned NecessarySize = UseWReg ? 32 : 64;
1606   if (Size != NecessarySize)
1607     TestReg = moveScalarRegClass(
1608         TestReg, UseWReg ? AArch64::GPR32RegClass : AArch64::GPR64RegClass,
1609         MIB);
1610 
1611   static const unsigned OpcTable[2][2] = {{AArch64::TBZX, AArch64::TBNZX},
1612                                           {AArch64::TBZW, AArch64::TBNZW}};
1613   unsigned Opc = OpcTable[UseWReg][IsNegative];
1614   auto TestBitMI =
1615       MIB.buildInstr(Opc).addReg(TestReg).addImm(Bit).addMBB(DstMBB);
1616   constrainSelectedInstRegOperands(*TestBitMI, TII, TRI, RBI);
1617   return &*TestBitMI;
1618 }
1619 
1620 bool AArch64InstructionSelector::tryOptAndIntoCompareBranch(
1621     MachineInstr &AndInst, bool Invert, MachineBasicBlock *DstMBB,
1622     MachineIRBuilder &MIB) const {
1623   assert(AndInst.getOpcode() == TargetOpcode::G_AND && "Expected G_AND only?");
1624   // Given something like this:
1625   //
1626   //  %x = ...Something...
1627   //  %one = G_CONSTANT i64 1
1628   //  %zero = G_CONSTANT i64 0
1629   //  %and = G_AND %x, %one
1630   //  %cmp = G_ICMP intpred(ne), %and, %zero
1631   //  %cmp_trunc = G_TRUNC %cmp
1632   //  G_BRCOND %cmp_trunc, %bb.3
1633   //
1634   // We want to try and fold the AND into the G_BRCOND and produce either a
1635   // TBNZ (when we have intpred(ne)) or a TBZ (when we have intpred(eq)).
1636   //
1637   // In this case, we'd get
1638   //
1639   // TBNZ %x %bb.3
1640   //
1641 
1642   // Check if the AND has a constant on its RHS which we can use as a mask.
1643   // If it's a power of 2, then it's the same as checking a specific bit.
1644   // (e.g, ANDing with 8 == ANDing with 000...100 == testing if bit 3 is set)
1645   auto MaybeBit = getIConstantVRegValWithLookThrough(
1646       AndInst.getOperand(2).getReg(), *MIB.getMRI());
1647   if (!MaybeBit)
1648     return false;
1649 
1650   int32_t Bit = MaybeBit->Value.exactLogBase2();
1651   if (Bit < 0)
1652     return false;
1653 
1654   Register TestReg = AndInst.getOperand(1).getReg();
1655 
1656   // Emit a TB(N)Z.
1657   emitTestBit(TestReg, Bit, Invert, DstMBB, MIB);
1658   return true;
1659 }
1660 
1661 MachineInstr *AArch64InstructionSelector::emitCBZ(Register CompareReg,
1662                                                   bool IsNegative,
1663                                                   MachineBasicBlock *DestMBB,
1664                                                   MachineIRBuilder &MIB) const {
1665   assert(ProduceNonFlagSettingCondBr && "CBZ does not set flags!");
1666   MachineRegisterInfo &MRI = *MIB.getMRI();
1667   assert(RBI.getRegBank(CompareReg, MRI, TRI)->getID() ==
1668              AArch64::GPRRegBankID &&
1669          "Expected GPRs only?");
1670   auto Ty = MRI.getType(CompareReg);
1671   unsigned Width = Ty.getSizeInBits();
1672   assert(!Ty.isVector() && "Expected scalar only?");
1673   assert(Width <= 64 && "Expected width to be at most 64?");
1674   static const unsigned OpcTable[2][2] = {{AArch64::CBZW, AArch64::CBZX},
1675                                           {AArch64::CBNZW, AArch64::CBNZX}};
1676   unsigned Opc = OpcTable[IsNegative][Width == 64];
1677   auto BranchMI = MIB.buildInstr(Opc, {}, {CompareReg}).addMBB(DestMBB);
1678   constrainSelectedInstRegOperands(*BranchMI, TII, TRI, RBI);
1679   return &*BranchMI;
1680 }
1681 
1682 bool AArch64InstructionSelector::selectCompareBranchFedByFCmp(
1683     MachineInstr &I, MachineInstr &FCmp, MachineIRBuilder &MIB) const {
1684   assert(FCmp.getOpcode() == TargetOpcode::G_FCMP);
1685   assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1686   // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
1687   // totally clean.  Some of them require two branches to implement.
1688   auto Pred = (CmpInst::Predicate)FCmp.getOperand(1).getPredicate();
1689   emitFPCompare(FCmp.getOperand(2).getReg(), FCmp.getOperand(3).getReg(), MIB,
1690                 Pred);
1691   AArch64CC::CondCode CC1, CC2;
1692   changeFCMPPredToAArch64CC(static_cast<CmpInst::Predicate>(Pred), CC1, CC2);
1693   MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
1694   MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC1).addMBB(DestMBB);
1695   if (CC2 != AArch64CC::AL)
1696     MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC2).addMBB(DestMBB);
1697   I.eraseFromParent();
1698   return true;
1699 }
1700 
1701 bool AArch64InstructionSelector::tryOptCompareBranchFedByICmp(
1702     MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const {
1703   assert(ICmp.getOpcode() == TargetOpcode::G_ICMP);
1704   assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1705   // Attempt to optimize the G_BRCOND + G_ICMP into a TB(N)Z/CB(N)Z.
1706   //
1707   // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z
1708   // instructions will not be produced, as they are conditional branch
1709   // instructions that do not set flags.
1710   if (!ProduceNonFlagSettingCondBr)
1711     return false;
1712 
1713   MachineRegisterInfo &MRI = *MIB.getMRI();
1714   MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
1715   auto Pred =
1716       static_cast<CmpInst::Predicate>(ICmp.getOperand(1).getPredicate());
1717   Register LHS = ICmp.getOperand(2).getReg();
1718   Register RHS = ICmp.getOperand(3).getReg();
1719 
1720   // We're allowed to emit a TB(N)Z/CB(N)Z. Try to do that.
1721   auto VRegAndVal = getIConstantVRegValWithLookThrough(RHS, MRI);
1722   MachineInstr *AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI);
1723 
1724   // When we can emit a TB(N)Z, prefer that.
1725   //
1726   // Handle non-commutative condition codes first.
1727   // Note that we don't want to do this when we have a G_AND because it can
1728   // become a tst. The tst will make the test bit in the TB(N)Z redundant.
1729   if (VRegAndVal && !AndInst) {
1730     int64_t C = VRegAndVal->Value.getSExtValue();
1731 
1732     // When we have a greater-than comparison, we can just test if the msb is
1733     // zero.
1734     if (C == -1 && Pred == CmpInst::ICMP_SGT) {
1735       uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1;
1736       emitTestBit(LHS, Bit, /*IsNegative = */ false, DestMBB, MIB);
1737       I.eraseFromParent();
1738       return true;
1739     }
1740 
1741     // When we have a less than comparison, we can just test if the msb is not
1742     // zero.
1743     if (C == 0 && Pred == CmpInst::ICMP_SLT) {
1744       uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1;
1745       emitTestBit(LHS, Bit, /*IsNegative = */ true, DestMBB, MIB);
1746       I.eraseFromParent();
1747       return true;
1748     }
1749 
1750     // Inversely, if we have a signed greater-than-or-equal comparison to zero,
1751     // we can test if the msb is zero.
1752     if (C == 0 && Pred == CmpInst::ICMP_SGE) {
1753       uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1;
1754       emitTestBit(LHS, Bit, /*IsNegative = */ false, DestMBB, MIB);
1755       I.eraseFromParent();
1756       return true;
1757     }
1758   }
1759 
1760   // Attempt to handle commutative condition codes. Right now, that's only
1761   // eq/ne.
1762   if (ICmpInst::isEquality(Pred)) {
1763     if (!VRegAndVal) {
1764       std::swap(RHS, LHS);
1765       VRegAndVal = getIConstantVRegValWithLookThrough(RHS, MRI);
1766       AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI);
1767     }
1768 
1769     if (VRegAndVal && VRegAndVal->Value == 0) {
1770       // If there's a G_AND feeding into this branch, try to fold it away by
1771       // emitting a TB(N)Z instead.
1772       //
1773       // Note: If we have LT, then it *is* possible to fold, but it wouldn't be
1774       // beneficial. When we have an AND and LT, we need a TST/ANDS, so folding
1775       // would be redundant.
1776       if (AndInst &&
1777           tryOptAndIntoCompareBranch(
1778               *AndInst, /*Invert = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB)) {
1779         I.eraseFromParent();
1780         return true;
1781       }
1782 
1783       // Otherwise, try to emit a CB(N)Z instead.
1784       auto LHSTy = MRI.getType(LHS);
1785       if (!LHSTy.isVector() && LHSTy.getSizeInBits() <= 64) {
1786         emitCBZ(LHS, /*IsNegative = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB);
1787         I.eraseFromParent();
1788         return true;
1789       }
1790     }
1791   }
1792 
1793   return false;
1794 }
1795 
1796 bool AArch64InstructionSelector::selectCompareBranchFedByICmp(
1797     MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const {
1798   assert(ICmp.getOpcode() == TargetOpcode::G_ICMP);
1799   assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1800   if (tryOptCompareBranchFedByICmp(I, ICmp, MIB))
1801     return true;
1802 
1803   // Couldn't optimize. Emit a compare + a Bcc.
1804   MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
1805   auto PredOp = ICmp.getOperand(1);
1806   emitIntegerCompare(ICmp.getOperand(2), ICmp.getOperand(3), PredOp, MIB);
1807   const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(
1808       static_cast<CmpInst::Predicate>(PredOp.getPredicate()));
1809   MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC).addMBB(DestMBB);
1810   I.eraseFromParent();
1811   return true;
1812 }
1813 
1814 bool AArch64InstructionSelector::selectCompareBranch(
1815     MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) {
1816   Register CondReg = I.getOperand(0).getReg();
1817   MachineInstr *CCMI = MRI.getVRegDef(CondReg);
1818   // Try to select the G_BRCOND using whatever is feeding the condition if
1819   // possible.
1820   unsigned CCMIOpc = CCMI->getOpcode();
1821   if (CCMIOpc == TargetOpcode::G_FCMP)
1822     return selectCompareBranchFedByFCmp(I, *CCMI, MIB);
1823   if (CCMIOpc == TargetOpcode::G_ICMP)
1824     return selectCompareBranchFedByICmp(I, *CCMI, MIB);
1825 
1826   // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z
1827   // instructions will not be produced, as they are conditional branch
1828   // instructions that do not set flags.
1829   if (ProduceNonFlagSettingCondBr) {
1830     emitTestBit(CondReg, /*Bit = */ 0, /*IsNegative = */ true,
1831                 I.getOperand(1).getMBB(), MIB);
1832     I.eraseFromParent();
1833     return true;
1834   }
1835 
1836   // Can't emit TB(N)Z/CB(N)Z. Emit a tst + bcc instead.
1837   auto TstMI =
1838       MIB.buildInstr(AArch64::ANDSWri, {LLT::scalar(32)}, {CondReg}).addImm(1);
1839   constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
1840   auto Bcc = MIB.buildInstr(AArch64::Bcc)
1841                  .addImm(AArch64CC::NE)
1842                  .addMBB(I.getOperand(1).getMBB());
1843   I.eraseFromParent();
1844   return constrainSelectedInstRegOperands(*Bcc, TII, TRI, RBI);
1845 }
1846 
1847 /// Returns the element immediate value of a vector shift operand if found.
1848 /// This needs to detect a splat-like operation, e.g. a G_BUILD_VECTOR.
1849 static std::optional<int64_t> getVectorShiftImm(Register Reg,
1850                                                 MachineRegisterInfo &MRI) {
1851   assert(MRI.getType(Reg).isVector() && "Expected a *vector* shift operand");
1852   MachineInstr *OpMI = MRI.getVRegDef(Reg);
1853   return getAArch64VectorSplatScalar(*OpMI, MRI);
1854 }
1855 
1856 /// Matches and returns the shift immediate value for a SHL instruction given
1857 /// a shift operand.
1858 static std::optional<int64_t> getVectorSHLImm(LLT SrcTy, Register Reg,
1859                                               MachineRegisterInfo &MRI) {
1860   std::optional<int64_t> ShiftImm = getVectorShiftImm(Reg, MRI);
1861   if (!ShiftImm)
1862     return std::nullopt;
1863   // Check the immediate is in range for a SHL.
1864   int64_t Imm = *ShiftImm;
1865   if (Imm < 0)
1866     return std::nullopt;
1867   switch (SrcTy.getElementType().getSizeInBits()) {
1868   default:
1869     LLVM_DEBUG(dbgs() << "Unhandled element type for vector shift");
1870     return std::nullopt;
1871   case 8:
1872     if (Imm > 7)
1873       return std::nullopt;
1874     break;
1875   case 16:
1876     if (Imm > 15)
1877       return std::nullopt;
1878     break;
1879   case 32:
1880     if (Imm > 31)
1881       return std::nullopt;
1882     break;
1883   case 64:
1884     if (Imm > 63)
1885       return std::nullopt;
1886     break;
1887   }
1888   return Imm;
1889 }
1890 
1891 bool AArch64InstructionSelector::selectVectorSHL(MachineInstr &I,
1892                                                  MachineRegisterInfo &MRI) {
1893   assert(I.getOpcode() == TargetOpcode::G_SHL);
1894   Register DstReg = I.getOperand(0).getReg();
1895   const LLT Ty = MRI.getType(DstReg);
1896   Register Src1Reg = I.getOperand(1).getReg();
1897   Register Src2Reg = I.getOperand(2).getReg();
1898 
1899   if (!Ty.isVector())
1900     return false;
1901 
1902   // Check if we have a vector of constants on RHS that we can select as the
1903   // immediate form.
1904   std::optional<int64_t> ImmVal = getVectorSHLImm(Ty, Src2Reg, MRI);
1905 
1906   unsigned Opc = 0;
1907   if (Ty == LLT::fixed_vector(2, 64)) {
1908     Opc = ImmVal ? AArch64::SHLv2i64_shift : AArch64::USHLv2i64;
1909   } else if (Ty == LLT::fixed_vector(4, 32)) {
1910     Opc = ImmVal ? AArch64::SHLv4i32_shift : AArch64::USHLv4i32;
1911   } else if (Ty == LLT::fixed_vector(2, 32)) {
1912     Opc = ImmVal ? AArch64::SHLv2i32_shift : AArch64::USHLv2i32;
1913   } else if (Ty == LLT::fixed_vector(4, 16)) {
1914     Opc = ImmVal ? AArch64::SHLv4i16_shift : AArch64::USHLv4i16;
1915   } else if (Ty == LLT::fixed_vector(8, 16)) {
1916     Opc = ImmVal ? AArch64::SHLv8i16_shift : AArch64::USHLv8i16;
1917   } else if (Ty == LLT::fixed_vector(16, 8)) {
1918     Opc = ImmVal ? AArch64::SHLv16i8_shift : AArch64::USHLv16i8;
1919   } else if (Ty == LLT::fixed_vector(8, 8)) {
1920     Opc = ImmVal ? AArch64::SHLv8i8_shift : AArch64::USHLv8i8;
1921   } else {
1922     LLVM_DEBUG(dbgs() << "Unhandled G_SHL type");
1923     return false;
1924   }
1925 
1926   auto Shl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg});
1927   if (ImmVal)
1928     Shl.addImm(*ImmVal);
1929   else
1930     Shl.addUse(Src2Reg);
1931   constrainSelectedInstRegOperands(*Shl, TII, TRI, RBI);
1932   I.eraseFromParent();
1933   return true;
1934 }
1935 
1936 bool AArch64InstructionSelector::selectVectorAshrLshr(
1937     MachineInstr &I, MachineRegisterInfo &MRI) {
1938   assert(I.getOpcode() == TargetOpcode::G_ASHR ||
1939          I.getOpcode() == TargetOpcode::G_LSHR);
1940   Register DstReg = I.getOperand(0).getReg();
1941   const LLT Ty = MRI.getType(DstReg);
1942   Register Src1Reg = I.getOperand(1).getReg();
1943   Register Src2Reg = I.getOperand(2).getReg();
1944 
1945   if (!Ty.isVector())
1946     return false;
1947 
1948   bool IsASHR = I.getOpcode() == TargetOpcode::G_ASHR;
1949 
1950   // We expect the immediate case to be lowered in the PostLegalCombiner to
1951   // AArch64ISD::VASHR or AArch64ISD::VLSHR equivalents.
1952 
1953   // There is not a shift right register instruction, but the shift left
1954   // register instruction takes a signed value, where negative numbers specify a
1955   // right shift.
1956 
1957   unsigned Opc = 0;
1958   unsigned NegOpc = 0;
1959   const TargetRegisterClass *RC =
1960       getRegClassForTypeOnBank(Ty, RBI.getRegBank(AArch64::FPRRegBankID));
1961   if (Ty == LLT::fixed_vector(2, 64)) {
1962     Opc = IsASHR ? AArch64::SSHLv2i64 : AArch64::USHLv2i64;
1963     NegOpc = AArch64::NEGv2i64;
1964   } else if (Ty == LLT::fixed_vector(4, 32)) {
1965     Opc = IsASHR ? AArch64::SSHLv4i32 : AArch64::USHLv4i32;
1966     NegOpc = AArch64::NEGv4i32;
1967   } else if (Ty == LLT::fixed_vector(2, 32)) {
1968     Opc = IsASHR ? AArch64::SSHLv2i32 : AArch64::USHLv2i32;
1969     NegOpc = AArch64::NEGv2i32;
1970   } else if (Ty == LLT::fixed_vector(4, 16)) {
1971     Opc = IsASHR ? AArch64::SSHLv4i16 : AArch64::USHLv4i16;
1972     NegOpc = AArch64::NEGv4i16;
1973   } else if (Ty == LLT::fixed_vector(8, 16)) {
1974     Opc = IsASHR ? AArch64::SSHLv8i16 : AArch64::USHLv8i16;
1975     NegOpc = AArch64::NEGv8i16;
1976   } else if (Ty == LLT::fixed_vector(16, 8)) {
1977     Opc = IsASHR ? AArch64::SSHLv16i8 : AArch64::USHLv16i8;
1978     NegOpc = AArch64::NEGv16i8;
1979   } else if (Ty == LLT::fixed_vector(8, 8)) {
1980     Opc = IsASHR ? AArch64::SSHLv8i8 : AArch64::USHLv8i8;
1981     NegOpc = AArch64::NEGv8i8;
1982   } else {
1983     LLVM_DEBUG(dbgs() << "Unhandled G_ASHR type");
1984     return false;
1985   }
1986 
1987   auto Neg = MIB.buildInstr(NegOpc, {RC}, {Src2Reg});
1988   constrainSelectedInstRegOperands(*Neg, TII, TRI, RBI);
1989   auto SShl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg, Neg});
1990   constrainSelectedInstRegOperands(*SShl, TII, TRI, RBI);
1991   I.eraseFromParent();
1992   return true;
1993 }
1994 
1995 bool AArch64InstructionSelector::selectVaStartAAPCS(
1996     MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
1997   return false;
1998 }
1999 
2000 bool AArch64InstructionSelector::selectVaStartDarwin(
2001     MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
2002   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
2003   Register ListReg = I.getOperand(0).getReg();
2004 
2005   Register ArgsAddrReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
2006 
2007   int FrameIdx = FuncInfo->getVarArgsStackIndex();
2008   if (MF.getSubtarget<AArch64Subtarget>().isCallingConvWin64(
2009           MF.getFunction().getCallingConv(), MF.getFunction().isVarArg())) {
2010     FrameIdx = FuncInfo->getVarArgsGPRSize() > 0
2011                    ? FuncInfo->getVarArgsGPRIndex()
2012                    : FuncInfo->getVarArgsStackIndex();
2013   }
2014 
2015   auto MIB =
2016       BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::ADDXri))
2017           .addDef(ArgsAddrReg)
2018           .addFrameIndex(FrameIdx)
2019           .addImm(0)
2020           .addImm(0);
2021 
2022   constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2023 
2024   MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::STRXui))
2025             .addUse(ArgsAddrReg)
2026             .addUse(ListReg)
2027             .addImm(0)
2028             .addMemOperand(*I.memoperands_begin());
2029 
2030   constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2031   I.eraseFromParent();
2032   return true;
2033 }
2034 
2035 void AArch64InstructionSelector::materializeLargeCMVal(
2036     MachineInstr &I, const Value *V, unsigned OpFlags) {
2037   MachineBasicBlock &MBB = *I.getParent();
2038   MachineFunction &MF = *MBB.getParent();
2039   MachineRegisterInfo &MRI = MF.getRegInfo();
2040 
2041   auto MovZ = MIB.buildInstr(AArch64::MOVZXi, {&AArch64::GPR64RegClass}, {});
2042   MovZ->addOperand(MF, I.getOperand(1));
2043   MovZ->getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_G0 |
2044                                      AArch64II::MO_NC);
2045   MovZ->addOperand(MF, MachineOperand::CreateImm(0));
2046   constrainSelectedInstRegOperands(*MovZ, TII, TRI, RBI);
2047 
2048   auto BuildMovK = [&](Register SrcReg, unsigned char Flags, unsigned Offset,
2049                        Register ForceDstReg) {
2050     Register DstReg = ForceDstReg
2051                           ? ForceDstReg
2052                           : MRI.createVirtualRegister(&AArch64::GPR64RegClass);
2053     auto MovI = MIB.buildInstr(AArch64::MOVKXi).addDef(DstReg).addUse(SrcReg);
2054     if (auto *GV = dyn_cast<GlobalValue>(V)) {
2055       MovI->addOperand(MF, MachineOperand::CreateGA(
2056                                GV, MovZ->getOperand(1).getOffset(), Flags));
2057     } else {
2058       MovI->addOperand(
2059           MF, MachineOperand::CreateBA(cast<BlockAddress>(V),
2060                                        MovZ->getOperand(1).getOffset(), Flags));
2061     }
2062     MovI->addOperand(MF, MachineOperand::CreateImm(Offset));
2063     constrainSelectedInstRegOperands(*MovI, TII, TRI, RBI);
2064     return DstReg;
2065   };
2066   Register DstReg = BuildMovK(MovZ.getReg(0),
2067                               AArch64II::MO_G1 | AArch64II::MO_NC, 16, 0);
2068   DstReg = BuildMovK(DstReg, AArch64II::MO_G2 | AArch64II::MO_NC, 32, 0);
2069   BuildMovK(DstReg, AArch64II::MO_G3, 48, I.getOperand(0).getReg());
2070 }
2071 
2072 bool AArch64InstructionSelector::preISelLower(MachineInstr &I) {
2073   MachineBasicBlock &MBB = *I.getParent();
2074   MachineFunction &MF = *MBB.getParent();
2075   MachineRegisterInfo &MRI = MF.getRegInfo();
2076 
2077   switch (I.getOpcode()) {
2078   case TargetOpcode::G_STORE: {
2079     bool Changed = contractCrossBankCopyIntoStore(I, MRI);
2080     MachineOperand &SrcOp = I.getOperand(0);
2081     if (MRI.getType(SrcOp.getReg()).isPointer()) {
2082       // Allow matching with imported patterns for stores of pointers. Unlike
2083       // G_LOAD/G_PTR_ADD, we may not have selected all users. So, emit a copy
2084       // and constrain.
2085       auto Copy = MIB.buildCopy(LLT::scalar(64), SrcOp);
2086       Register NewSrc = Copy.getReg(0);
2087       SrcOp.setReg(NewSrc);
2088       RBI.constrainGenericRegister(NewSrc, AArch64::GPR64RegClass, MRI);
2089       Changed = true;
2090     }
2091     return Changed;
2092   }
2093   case TargetOpcode::G_PTR_ADD:
2094     return convertPtrAddToAdd(I, MRI);
2095   case TargetOpcode::G_LOAD: {
2096     // For scalar loads of pointers, we try to convert the dest type from p0
2097     // to s64 so that our imported patterns can match. Like with the G_PTR_ADD
2098     // conversion, this should be ok because all users should have been
2099     // selected already, so the type doesn't matter for them.
2100     Register DstReg = I.getOperand(0).getReg();
2101     const LLT DstTy = MRI.getType(DstReg);
2102     if (!DstTy.isPointer())
2103       return false;
2104     MRI.setType(DstReg, LLT::scalar(64));
2105     return true;
2106   }
2107   case AArch64::G_DUP: {
2108     // Convert the type from p0 to s64 to help selection.
2109     LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2110     if (!DstTy.isPointerVector())
2111       return false;
2112     auto NewSrc = MIB.buildCopy(LLT::scalar(64), I.getOperand(1).getReg());
2113     MRI.setType(I.getOperand(0).getReg(),
2114                 DstTy.changeElementType(LLT::scalar(64)));
2115     MRI.setRegClass(NewSrc.getReg(0), &AArch64::GPR64RegClass);
2116     I.getOperand(1).setReg(NewSrc.getReg(0));
2117     return true;
2118   }
2119   case TargetOpcode::G_UITOFP:
2120   case TargetOpcode::G_SITOFP: {
2121     // If both source and destination regbanks are FPR, then convert the opcode
2122     // to G_SITOF so that the importer can select it to an fpr variant.
2123     // Otherwise, it ends up matching an fpr/gpr variant and adding a cross-bank
2124     // copy.
2125     Register SrcReg = I.getOperand(1).getReg();
2126     LLT SrcTy = MRI.getType(SrcReg);
2127     LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2128     if (SrcTy.isVector() || SrcTy.getSizeInBits() != DstTy.getSizeInBits())
2129       return false;
2130 
2131     if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::FPRRegBankID) {
2132       if (I.getOpcode() == TargetOpcode::G_SITOFP)
2133         I.setDesc(TII.get(AArch64::G_SITOF));
2134       else
2135         I.setDesc(TII.get(AArch64::G_UITOF));
2136       return true;
2137     }
2138     return false;
2139   }
2140   default:
2141     return false;
2142   }
2143 }
2144 
2145 /// This lowering tries to look for G_PTR_ADD instructions and then converts
2146 /// them to a standard G_ADD with a COPY on the source.
2147 ///
2148 /// The motivation behind this is to expose the add semantics to the imported
2149 /// tablegen patterns. We shouldn't need to check for uses being loads/stores,
2150 /// because the selector works bottom up, uses before defs. By the time we
2151 /// end up trying to select a G_PTR_ADD, we should have already attempted to
2152 /// fold this into addressing modes and were therefore unsuccessful.
2153 bool AArch64InstructionSelector::convertPtrAddToAdd(
2154     MachineInstr &I, MachineRegisterInfo &MRI) {
2155   assert(I.getOpcode() == TargetOpcode::G_PTR_ADD && "Expected G_PTR_ADD");
2156   Register DstReg = I.getOperand(0).getReg();
2157   Register AddOp1Reg = I.getOperand(1).getReg();
2158   const LLT PtrTy = MRI.getType(DstReg);
2159   if (PtrTy.getAddressSpace() != 0)
2160     return false;
2161 
2162   const LLT CastPtrTy =
2163       PtrTy.isVector() ? LLT::fixed_vector(2, 64) : LLT::scalar(64);
2164   auto PtrToInt = MIB.buildPtrToInt(CastPtrTy, AddOp1Reg);
2165   // Set regbanks on the registers.
2166   if (PtrTy.isVector())
2167     MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::FPRRegBankID));
2168   else
2169     MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID));
2170 
2171   // Now turn the %dst(p0) = G_PTR_ADD %base, off into:
2172   // %dst(intty) = G_ADD %intbase, off
2173   I.setDesc(TII.get(TargetOpcode::G_ADD));
2174   MRI.setType(DstReg, CastPtrTy);
2175   I.getOperand(1).setReg(PtrToInt.getReg(0));
2176   if (!select(*PtrToInt)) {
2177     LLVM_DEBUG(dbgs() << "Failed to select G_PTRTOINT in convertPtrAddToAdd");
2178     return false;
2179   }
2180 
2181   // Also take the opportunity here to try to do some optimization.
2182   // Try to convert this into a G_SUB if the offset is a 0-x negate idiom.
2183   Register NegatedReg;
2184   if (!mi_match(I.getOperand(2).getReg(), MRI, m_Neg(m_Reg(NegatedReg))))
2185     return true;
2186   I.getOperand(2).setReg(NegatedReg);
2187   I.setDesc(TII.get(TargetOpcode::G_SUB));
2188   return true;
2189 }
2190 
2191 bool AArch64InstructionSelector::earlySelectSHL(MachineInstr &I,
2192                                                 MachineRegisterInfo &MRI) {
2193   // We try to match the immediate variant of LSL, which is actually an alias
2194   // for a special case of UBFM. Otherwise, we fall back to the imported
2195   // selector which will match the register variant.
2196   assert(I.getOpcode() == TargetOpcode::G_SHL && "unexpected op");
2197   const auto &MO = I.getOperand(2);
2198   auto VRegAndVal = getIConstantVRegVal(MO.getReg(), MRI);
2199   if (!VRegAndVal)
2200     return false;
2201 
2202   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2203   if (DstTy.isVector())
2204     return false;
2205   bool Is64Bit = DstTy.getSizeInBits() == 64;
2206   auto Imm1Fn = Is64Bit ? selectShiftA_64(MO) : selectShiftA_32(MO);
2207   auto Imm2Fn = Is64Bit ? selectShiftB_64(MO) : selectShiftB_32(MO);
2208 
2209   if (!Imm1Fn || !Imm2Fn)
2210     return false;
2211 
2212   auto NewI =
2213       MIB.buildInstr(Is64Bit ? AArch64::UBFMXri : AArch64::UBFMWri,
2214                      {I.getOperand(0).getReg()}, {I.getOperand(1).getReg()});
2215 
2216   for (auto &RenderFn : *Imm1Fn)
2217     RenderFn(NewI);
2218   for (auto &RenderFn : *Imm2Fn)
2219     RenderFn(NewI);
2220 
2221   I.eraseFromParent();
2222   return constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI);
2223 }
2224 
2225 bool AArch64InstructionSelector::contractCrossBankCopyIntoStore(
2226     MachineInstr &I, MachineRegisterInfo &MRI) {
2227   assert(I.getOpcode() == TargetOpcode::G_STORE && "Expected G_STORE");
2228   // If we're storing a scalar, it doesn't matter what register bank that
2229   // scalar is on. All that matters is the size.
2230   //
2231   // So, if we see something like this (with a 32-bit scalar as an example):
2232   //
2233   // %x:gpr(s32) = ... something ...
2234   // %y:fpr(s32) = COPY %x:gpr(s32)
2235   // G_STORE %y:fpr(s32)
2236   //
2237   // We can fix this up into something like this:
2238   //
2239   // G_STORE %x:gpr(s32)
2240   //
2241   // And then continue the selection process normally.
2242   Register DefDstReg = getSrcRegIgnoringCopies(I.getOperand(0).getReg(), MRI);
2243   if (!DefDstReg.isValid())
2244     return false;
2245   LLT DefDstTy = MRI.getType(DefDstReg);
2246   Register StoreSrcReg = I.getOperand(0).getReg();
2247   LLT StoreSrcTy = MRI.getType(StoreSrcReg);
2248 
2249   // If we get something strange like a physical register, then we shouldn't
2250   // go any further.
2251   if (!DefDstTy.isValid())
2252     return false;
2253 
2254   // Are the source and dst types the same size?
2255   if (DefDstTy.getSizeInBits() != StoreSrcTy.getSizeInBits())
2256     return false;
2257 
2258   if (RBI.getRegBank(StoreSrcReg, MRI, TRI) ==
2259       RBI.getRegBank(DefDstReg, MRI, TRI))
2260     return false;
2261 
2262   // We have a cross-bank copy, which is entering a store. Let's fold it.
2263   I.getOperand(0).setReg(DefDstReg);
2264   return true;
2265 }
2266 
2267 bool AArch64InstructionSelector::earlySelect(MachineInstr &I) {
2268   assert(I.getParent() && "Instruction should be in a basic block!");
2269   assert(I.getParent()->getParent() && "Instruction should be in a function!");
2270 
2271   MachineBasicBlock &MBB = *I.getParent();
2272   MachineFunction &MF = *MBB.getParent();
2273   MachineRegisterInfo &MRI = MF.getRegInfo();
2274 
2275   switch (I.getOpcode()) {
2276   case AArch64::G_DUP: {
2277     // Before selecting a DUP instruction, check if it is better selected as a
2278     // MOV or load from a constant pool.
2279     Register Src = I.getOperand(1).getReg();
2280     auto ValAndVReg = getAnyConstantVRegValWithLookThrough(Src, MRI);
2281     if (!ValAndVReg)
2282       return false;
2283     LLVMContext &Ctx = MF.getFunction().getContext();
2284     Register Dst = I.getOperand(0).getReg();
2285     auto *CV = ConstantDataVector::getSplat(
2286         MRI.getType(Dst).getNumElements(),
2287         ConstantInt::get(Type::getIntNTy(Ctx, MRI.getType(Src).getSizeInBits()),
2288                          ValAndVReg->Value));
2289     if (!emitConstantVector(Dst, CV, MIB, MRI))
2290       return false;
2291     I.eraseFromParent();
2292     return true;
2293   }
2294   case TargetOpcode::G_SEXT:
2295     // Check for i64 sext(i32 vector_extract) prior to tablegen to select SMOV
2296     // over a normal extend.
2297     if (selectUSMovFromExtend(I, MRI))
2298       return true;
2299     return false;
2300   case TargetOpcode::G_BR:
2301     return false;
2302   case TargetOpcode::G_SHL:
2303     return earlySelectSHL(I, MRI);
2304   case TargetOpcode::G_CONSTANT: {
2305     bool IsZero = false;
2306     if (I.getOperand(1).isCImm())
2307       IsZero = I.getOperand(1).getCImm()->isZero();
2308     else if (I.getOperand(1).isImm())
2309       IsZero = I.getOperand(1).getImm() == 0;
2310 
2311     if (!IsZero)
2312       return false;
2313 
2314     Register DefReg = I.getOperand(0).getReg();
2315     LLT Ty = MRI.getType(DefReg);
2316     if (Ty.getSizeInBits() == 64) {
2317       I.getOperand(1).ChangeToRegister(AArch64::XZR, false);
2318       RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI);
2319     } else if (Ty.getSizeInBits() == 32) {
2320       I.getOperand(1).ChangeToRegister(AArch64::WZR, false);
2321       RBI.constrainGenericRegister(DefReg, AArch64::GPR32RegClass, MRI);
2322     } else
2323       return false;
2324 
2325     I.setDesc(TII.get(TargetOpcode::COPY));
2326     return true;
2327   }
2328 
2329   case TargetOpcode::G_ADD: {
2330     // Check if this is being fed by a G_ICMP on either side.
2331     //
2332     // (cmp pred, x, y) + z
2333     //
2334     // In the above case, when the cmp is true, we increment z by 1. So, we can
2335     // fold the add into the cset for the cmp by using cinc.
2336     //
2337     // FIXME: This would probably be a lot nicer in PostLegalizerLowering.
2338     Register AddDst = I.getOperand(0).getReg();
2339     Register AddLHS = I.getOperand(1).getReg();
2340     Register AddRHS = I.getOperand(2).getReg();
2341     // Only handle scalars.
2342     LLT Ty = MRI.getType(AddLHS);
2343     if (Ty.isVector())
2344       return false;
2345     // Since G_ICMP is modeled as ADDS/SUBS/ANDS, we can handle 32 bits or 64
2346     // bits.
2347     unsigned Size = Ty.getSizeInBits();
2348     if (Size != 32 && Size != 64)
2349       return false;
2350     auto MatchCmp = [&](Register Reg) -> MachineInstr * {
2351       if (!MRI.hasOneNonDBGUse(Reg))
2352         return nullptr;
2353       // If the LHS of the add is 32 bits, then we want to fold a 32-bit
2354       // compare.
2355       if (Size == 32)
2356         return getOpcodeDef(TargetOpcode::G_ICMP, Reg, MRI);
2357       // We model scalar compares using 32-bit destinations right now.
2358       // If it's a 64-bit compare, it'll have 64-bit sources.
2359       Register ZExt;
2360       if (!mi_match(Reg, MRI,
2361                     m_OneNonDBGUse(m_GZExt(m_OneNonDBGUse(m_Reg(ZExt))))))
2362         return nullptr;
2363       auto *Cmp = getOpcodeDef(TargetOpcode::G_ICMP, ZExt, MRI);
2364       if (!Cmp ||
2365           MRI.getType(Cmp->getOperand(2).getReg()).getSizeInBits() != 64)
2366         return nullptr;
2367       return Cmp;
2368     };
2369     // Try to match
2370     // z + (cmp pred, x, y)
2371     MachineInstr *Cmp = MatchCmp(AddRHS);
2372     if (!Cmp) {
2373       // (cmp pred, x, y) + z
2374       std::swap(AddLHS, AddRHS);
2375       Cmp = MatchCmp(AddRHS);
2376       if (!Cmp)
2377         return false;
2378     }
2379     auto &PredOp = Cmp->getOperand(1);
2380     auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate());
2381     const AArch64CC::CondCode InvCC =
2382         changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred));
2383     MIB.setInstrAndDebugLoc(I);
2384     emitIntegerCompare(/*LHS=*/Cmp->getOperand(2),
2385                        /*RHS=*/Cmp->getOperand(3), PredOp, MIB);
2386     emitCSINC(/*Dst=*/AddDst, /*Src =*/AddLHS, /*Src2=*/AddLHS, InvCC, MIB);
2387     I.eraseFromParent();
2388     return true;
2389   }
2390   case TargetOpcode::G_OR: {
2391     // Look for operations that take the lower `Width=Size-ShiftImm` bits of
2392     // `ShiftSrc` and insert them into the upper `Width` bits of `MaskSrc` via
2393     // shifting and masking that we can replace with a BFI (encoded as a BFM).
2394     Register Dst = I.getOperand(0).getReg();
2395     LLT Ty = MRI.getType(Dst);
2396 
2397     if (!Ty.isScalar())
2398       return false;
2399 
2400     unsigned Size = Ty.getSizeInBits();
2401     if (Size != 32 && Size != 64)
2402       return false;
2403 
2404     Register ShiftSrc;
2405     int64_t ShiftImm;
2406     Register MaskSrc;
2407     int64_t MaskImm;
2408     if (!mi_match(
2409             Dst, MRI,
2410             m_GOr(m_OneNonDBGUse(m_GShl(m_Reg(ShiftSrc), m_ICst(ShiftImm))),
2411                   m_OneNonDBGUse(m_GAnd(m_Reg(MaskSrc), m_ICst(MaskImm))))))
2412       return false;
2413 
2414     if (ShiftImm > Size || ((1ULL << ShiftImm) - 1ULL) != uint64_t(MaskImm))
2415       return false;
2416 
2417     int64_t Immr = Size - ShiftImm;
2418     int64_t Imms = Size - ShiftImm - 1;
2419     unsigned Opc = Size == 32 ? AArch64::BFMWri : AArch64::BFMXri;
2420     emitInstr(Opc, {Dst}, {MaskSrc, ShiftSrc, Immr, Imms}, MIB);
2421     I.eraseFromParent();
2422     return true;
2423   }
2424   case TargetOpcode::G_FENCE: {
2425     if (I.getOperand(1).getImm() == 0)
2426       BuildMI(MBB, I, MIMetadata(I), TII.get(TargetOpcode::MEMBARRIER));
2427     else
2428       BuildMI(MBB, I, MIMetadata(I), TII.get(AArch64::DMB))
2429           .addImm(I.getOperand(0).getImm() == 4 ? 0x9 : 0xb);
2430     I.eraseFromParent();
2431     return true;
2432   }
2433   default:
2434     return false;
2435   }
2436 }
2437 
2438 bool AArch64InstructionSelector::select(MachineInstr &I) {
2439   assert(I.getParent() && "Instruction should be in a basic block!");
2440   assert(I.getParent()->getParent() && "Instruction should be in a function!");
2441 
2442   MachineBasicBlock &MBB = *I.getParent();
2443   MachineFunction &MF = *MBB.getParent();
2444   MachineRegisterInfo &MRI = MF.getRegInfo();
2445 
2446   const AArch64Subtarget *Subtarget = &MF.getSubtarget<AArch64Subtarget>();
2447   if (Subtarget->requiresStrictAlign()) {
2448     // We don't support this feature yet.
2449     LLVM_DEBUG(dbgs() << "AArch64 GISel does not support strict-align yet\n");
2450     return false;
2451   }
2452 
2453   MIB.setInstrAndDebugLoc(I);
2454 
2455   unsigned Opcode = I.getOpcode();
2456   // G_PHI requires same handling as PHI
2457   if (!I.isPreISelOpcode() || Opcode == TargetOpcode::G_PHI) {
2458     // Certain non-generic instructions also need some special handling.
2459 
2460     if (Opcode ==  TargetOpcode::LOAD_STACK_GUARD)
2461       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2462 
2463     if (Opcode == TargetOpcode::PHI || Opcode == TargetOpcode::G_PHI) {
2464       const Register DefReg = I.getOperand(0).getReg();
2465       const LLT DefTy = MRI.getType(DefReg);
2466 
2467       const RegClassOrRegBank &RegClassOrBank =
2468         MRI.getRegClassOrRegBank(DefReg);
2469 
2470       const TargetRegisterClass *DefRC
2471         = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
2472       if (!DefRC) {
2473         if (!DefTy.isValid()) {
2474           LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
2475           return false;
2476         }
2477         const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
2478         DefRC = getRegClassForTypeOnBank(DefTy, RB);
2479         if (!DefRC) {
2480           LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
2481           return false;
2482         }
2483       }
2484 
2485       I.setDesc(TII.get(TargetOpcode::PHI));
2486 
2487       return RBI.constrainGenericRegister(DefReg, *DefRC, MRI);
2488     }
2489 
2490     if (I.isCopy())
2491       return selectCopy(I, TII, MRI, TRI, RBI);
2492 
2493     if (I.isDebugInstr())
2494       return selectDebugInstr(I, MRI, RBI);
2495 
2496     return true;
2497   }
2498 
2499 
2500   if (I.getNumOperands() != I.getNumExplicitOperands()) {
2501     LLVM_DEBUG(
2502         dbgs() << "Generic instruction has unexpected implicit operands\n");
2503     return false;
2504   }
2505 
2506   // Try to do some lowering before we start instruction selecting. These
2507   // lowerings are purely transformations on the input G_MIR and so selection
2508   // must continue after any modification of the instruction.
2509   if (preISelLower(I)) {
2510     Opcode = I.getOpcode(); // The opcode may have been modified, refresh it.
2511   }
2512 
2513   // There may be patterns where the importer can't deal with them optimally,
2514   // but does select it to a suboptimal sequence so our custom C++ selection
2515   // code later never has a chance to work on it. Therefore, we have an early
2516   // selection attempt here to give priority to certain selection routines
2517   // over the imported ones.
2518   if (earlySelect(I))
2519     return true;
2520 
2521   if (selectImpl(I, *CoverageInfo))
2522     return true;
2523 
2524   LLT Ty =
2525       I.getOperand(0).isReg() ? MRI.getType(I.getOperand(0).getReg()) : LLT{};
2526 
2527   switch (Opcode) {
2528   case TargetOpcode::G_SBFX:
2529   case TargetOpcode::G_UBFX: {
2530     static const unsigned OpcTable[2][2] = {
2531         {AArch64::UBFMWri, AArch64::UBFMXri},
2532         {AArch64::SBFMWri, AArch64::SBFMXri}};
2533     bool IsSigned = Opcode == TargetOpcode::G_SBFX;
2534     unsigned Size = Ty.getSizeInBits();
2535     unsigned Opc = OpcTable[IsSigned][Size == 64];
2536     auto Cst1 =
2537         getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), MRI);
2538     assert(Cst1 && "Should have gotten a constant for src 1?");
2539     auto Cst2 =
2540         getIConstantVRegValWithLookThrough(I.getOperand(3).getReg(), MRI);
2541     assert(Cst2 && "Should have gotten a constant for src 2?");
2542     auto LSB = Cst1->Value.getZExtValue();
2543     auto Width = Cst2->Value.getZExtValue();
2544     auto BitfieldInst =
2545         MIB.buildInstr(Opc, {I.getOperand(0)}, {I.getOperand(1)})
2546             .addImm(LSB)
2547             .addImm(LSB + Width - 1);
2548     I.eraseFromParent();
2549     return constrainSelectedInstRegOperands(*BitfieldInst, TII, TRI, RBI);
2550   }
2551   case TargetOpcode::G_BRCOND:
2552     return selectCompareBranch(I, MF, MRI);
2553 
2554   case TargetOpcode::G_BRINDIRECT: {
2555     const Function &Fn = MF.getFunction();
2556     if (std::optional<uint16_t> BADisc =
2557             STI.getPtrAuthBlockAddressDiscriminatorIfEnabled(Fn)) {
2558       auto MI = MIB.buildInstr(AArch64::BRA, {}, {I.getOperand(0).getReg()});
2559       MI.addImm(AArch64PACKey::IA);
2560       MI.addImm(*BADisc);
2561       MI.addReg(/*AddrDisc=*/AArch64::XZR);
2562       I.eraseFromParent();
2563       return constrainSelectedInstRegOperands(*MI, TII, TRI, RBI);
2564     }
2565     I.setDesc(TII.get(AArch64::BR));
2566     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2567   }
2568 
2569   case TargetOpcode::G_BRJT:
2570     return selectBrJT(I, MRI);
2571 
2572   case AArch64::G_ADD_LOW: {
2573     // This op may have been separated from it's ADRP companion by the localizer
2574     // or some other code motion pass. Given that many CPUs will try to
2575     // macro fuse these operations anyway, select this into a MOVaddr pseudo
2576     // which will later be expanded into an ADRP+ADD pair after scheduling.
2577     MachineInstr *BaseMI = MRI.getVRegDef(I.getOperand(1).getReg());
2578     if (BaseMI->getOpcode() != AArch64::ADRP) {
2579       I.setDesc(TII.get(AArch64::ADDXri));
2580       I.addOperand(MachineOperand::CreateImm(0));
2581       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2582     }
2583     assert(TM.getCodeModel() == CodeModel::Small &&
2584            "Expected small code model");
2585     auto Op1 = BaseMI->getOperand(1);
2586     auto Op2 = I.getOperand(2);
2587     auto MovAddr = MIB.buildInstr(AArch64::MOVaddr, {I.getOperand(0)}, {})
2588                        .addGlobalAddress(Op1.getGlobal(), Op1.getOffset(),
2589                                          Op1.getTargetFlags())
2590                        .addGlobalAddress(Op2.getGlobal(), Op2.getOffset(),
2591                                          Op2.getTargetFlags());
2592     I.eraseFromParent();
2593     return constrainSelectedInstRegOperands(*MovAddr, TII, TRI, RBI);
2594   }
2595 
2596   case TargetOpcode::G_FCONSTANT:
2597   case TargetOpcode::G_CONSTANT: {
2598     const bool isFP = Opcode == TargetOpcode::G_FCONSTANT;
2599 
2600     const LLT s8 = LLT::scalar(8);
2601     const LLT s16 = LLT::scalar(16);
2602     const LLT s32 = LLT::scalar(32);
2603     const LLT s64 = LLT::scalar(64);
2604     const LLT s128 = LLT::scalar(128);
2605     const LLT p0 = LLT::pointer(0, 64);
2606 
2607     const Register DefReg = I.getOperand(0).getReg();
2608     const LLT DefTy = MRI.getType(DefReg);
2609     const unsigned DefSize = DefTy.getSizeInBits();
2610     const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
2611 
2612     // FIXME: Redundant check, but even less readable when factored out.
2613     if (isFP) {
2614       if (Ty != s16 && Ty != s32 && Ty != s64 && Ty != s128) {
2615         LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty
2616                           << " constant, expected: " << s16 << " or " << s32
2617                           << " or " << s64 << " or " << s128 << '\n');
2618         return false;
2619       }
2620 
2621       if (RB.getID() != AArch64::FPRRegBankID) {
2622         LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty
2623                           << " constant on bank: " << RB
2624                           << ", expected: FPR\n");
2625         return false;
2626       }
2627 
2628       // The case when we have 0.0 is covered by tablegen. Reject it here so we
2629       // can be sure tablegen works correctly and isn't rescued by this code.
2630       // 0.0 is not covered by tablegen for FP128. So we will handle this
2631       // scenario in the code here.
2632       if (DefSize != 128 && I.getOperand(1).getFPImm()->isExactlyValue(0.0))
2633         return false;
2634     } else {
2635       // s32 and s64 are covered by tablegen.
2636       if (Ty != p0 && Ty != s8 && Ty != s16) {
2637         LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty
2638                           << " constant, expected: " << s32 << ", " << s64
2639                           << ", or " << p0 << '\n');
2640         return false;
2641       }
2642 
2643       if (RB.getID() != AArch64::GPRRegBankID) {
2644         LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty
2645                           << " constant on bank: " << RB
2646                           << ", expected: GPR\n");
2647         return false;
2648       }
2649     }
2650 
2651     if (isFP) {
2652       const TargetRegisterClass &FPRRC = *getRegClassForTypeOnBank(DefTy, RB);
2653       // For 16, 64, and 128b values, emit a constant pool load.
2654       switch (DefSize) {
2655       default:
2656         llvm_unreachable("Unexpected destination size for G_FCONSTANT?");
2657       case 32:
2658       case 64: {
2659         bool OptForSize = shouldOptForSize(&MF);
2660         const auto &TLI = MF.getSubtarget().getTargetLowering();
2661         // If TLI says that this fpimm is illegal, then we'll expand to a
2662         // constant pool load.
2663         if (TLI->isFPImmLegal(I.getOperand(1).getFPImm()->getValueAPF(),
2664                               EVT::getFloatingPointVT(DefSize), OptForSize))
2665           break;
2666         [[fallthrough]];
2667       }
2668       case 16:
2669       case 128: {
2670         auto *FPImm = I.getOperand(1).getFPImm();
2671         auto *LoadMI = emitLoadFromConstantPool(FPImm, MIB);
2672         if (!LoadMI) {
2673           LLVM_DEBUG(dbgs() << "Failed to load double constant pool entry\n");
2674           return false;
2675         }
2676         MIB.buildCopy({DefReg}, {LoadMI->getOperand(0).getReg()});
2677         I.eraseFromParent();
2678         return RBI.constrainGenericRegister(DefReg, FPRRC, MRI);
2679       }
2680       }
2681 
2682       assert((DefSize == 32 || DefSize == 64) && "Unexpected const def size");
2683       // Either emit a FMOV, or emit a copy to emit a normal mov.
2684       const Register DefGPRReg = MRI.createVirtualRegister(
2685           DefSize == 32 ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass);
2686       MachineOperand &RegOp = I.getOperand(0);
2687       RegOp.setReg(DefGPRReg);
2688       MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator()));
2689       MIB.buildCopy({DefReg}, {DefGPRReg});
2690 
2691       if (!RBI.constrainGenericRegister(DefReg, FPRRC, MRI)) {
2692         LLVM_DEBUG(dbgs() << "Failed to constrain G_FCONSTANT def operand\n");
2693         return false;
2694       }
2695 
2696       MachineOperand &ImmOp = I.getOperand(1);
2697       // FIXME: Is going through int64_t always correct?
2698       ImmOp.ChangeToImmediate(
2699           ImmOp.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
2700     } else if (I.getOperand(1).isCImm()) {
2701       uint64_t Val = I.getOperand(1).getCImm()->getZExtValue();
2702       I.getOperand(1).ChangeToImmediate(Val);
2703     } else if (I.getOperand(1).isImm()) {
2704       uint64_t Val = I.getOperand(1).getImm();
2705       I.getOperand(1).ChangeToImmediate(Val);
2706     }
2707 
2708     const unsigned MovOpc =
2709         DefSize == 64 ? AArch64::MOVi64imm : AArch64::MOVi32imm;
2710     I.setDesc(TII.get(MovOpc));
2711     constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2712     return true;
2713   }
2714   case TargetOpcode::G_EXTRACT: {
2715     Register DstReg = I.getOperand(0).getReg();
2716     Register SrcReg = I.getOperand(1).getReg();
2717     LLT SrcTy = MRI.getType(SrcReg);
2718     LLT DstTy = MRI.getType(DstReg);
2719     (void)DstTy;
2720     unsigned SrcSize = SrcTy.getSizeInBits();
2721 
2722     if (SrcTy.getSizeInBits() > 64) {
2723       // This should be an extract of an s128, which is like a vector extract.
2724       if (SrcTy.getSizeInBits() != 128)
2725         return false;
2726       // Only support extracting 64 bits from an s128 at the moment.
2727       if (DstTy.getSizeInBits() != 64)
2728         return false;
2729 
2730       unsigned Offset = I.getOperand(2).getImm();
2731       if (Offset % 64 != 0)
2732         return false;
2733 
2734       // Check we have the right regbank always.
2735       const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
2736       const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
2737       assert(SrcRB.getID() == DstRB.getID() && "Wrong extract regbank!");
2738 
2739       if (SrcRB.getID() == AArch64::GPRRegBankID) {
2740         auto NewI =
2741             MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
2742                 .addUse(SrcReg, 0,
2743                         Offset == 0 ? AArch64::sube64 : AArch64::subo64);
2744         constrainOperandRegClass(MF, TRI, MRI, TII, RBI, *NewI,
2745                                  AArch64::GPR64RegClass, NewI->getOperand(0));
2746         I.eraseFromParent();
2747         return true;
2748       }
2749 
2750       // Emit the same code as a vector extract.
2751       // Offset must be a multiple of 64.
2752       unsigned LaneIdx = Offset / 64;
2753       MachineInstr *Extract = emitExtractVectorElt(
2754           DstReg, DstRB, LLT::scalar(64), SrcReg, LaneIdx, MIB);
2755       if (!Extract)
2756         return false;
2757       I.eraseFromParent();
2758       return true;
2759     }
2760 
2761     I.setDesc(TII.get(SrcSize == 64 ? AArch64::UBFMXri : AArch64::UBFMWri));
2762     MachineInstrBuilder(MF, I).addImm(I.getOperand(2).getImm() +
2763                                       Ty.getSizeInBits() - 1);
2764 
2765     if (SrcSize < 64) {
2766       assert(SrcSize == 32 && DstTy.getSizeInBits() == 16 &&
2767              "unexpected G_EXTRACT types");
2768       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2769     }
2770 
2771     DstReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
2772     MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator()));
2773     MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {})
2774         .addReg(DstReg, 0, AArch64::sub_32);
2775     RBI.constrainGenericRegister(I.getOperand(0).getReg(),
2776                                  AArch64::GPR32RegClass, MRI);
2777     I.getOperand(0).setReg(DstReg);
2778 
2779     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2780   }
2781 
2782   case TargetOpcode::G_INSERT: {
2783     LLT SrcTy = MRI.getType(I.getOperand(2).getReg());
2784     LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2785     unsigned DstSize = DstTy.getSizeInBits();
2786     // Larger inserts are vectors, same-size ones should be something else by
2787     // now (split up or turned into COPYs).
2788     if (Ty.getSizeInBits() > 64 || SrcTy.getSizeInBits() > 32)
2789       return false;
2790 
2791     I.setDesc(TII.get(DstSize == 64 ? AArch64::BFMXri : AArch64::BFMWri));
2792     unsigned LSB = I.getOperand(3).getImm();
2793     unsigned Width = MRI.getType(I.getOperand(2).getReg()).getSizeInBits();
2794     I.getOperand(3).setImm((DstSize - LSB) % DstSize);
2795     MachineInstrBuilder(MF, I).addImm(Width - 1);
2796 
2797     if (DstSize < 64) {
2798       assert(DstSize == 32 && SrcTy.getSizeInBits() == 16 &&
2799              "unexpected G_INSERT types");
2800       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2801     }
2802 
2803     Register SrcReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
2804     BuildMI(MBB, I.getIterator(), I.getDebugLoc(),
2805             TII.get(AArch64::SUBREG_TO_REG))
2806         .addDef(SrcReg)
2807         .addImm(0)
2808         .addUse(I.getOperand(2).getReg())
2809         .addImm(AArch64::sub_32);
2810     RBI.constrainGenericRegister(I.getOperand(2).getReg(),
2811                                  AArch64::GPR32RegClass, MRI);
2812     I.getOperand(2).setReg(SrcReg);
2813 
2814     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2815   }
2816   case TargetOpcode::G_FRAME_INDEX: {
2817     // allocas and G_FRAME_INDEX are only supported in addrspace(0).
2818     if (Ty != LLT::pointer(0, 64)) {
2819       LLVM_DEBUG(dbgs() << "G_FRAME_INDEX pointer has type: " << Ty
2820                         << ", expected: " << LLT::pointer(0, 64) << '\n');
2821       return false;
2822     }
2823     I.setDesc(TII.get(AArch64::ADDXri));
2824 
2825     // MOs for a #0 shifted immediate.
2826     I.addOperand(MachineOperand::CreateImm(0));
2827     I.addOperand(MachineOperand::CreateImm(0));
2828 
2829     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2830   }
2831 
2832   case TargetOpcode::G_GLOBAL_VALUE: {
2833     const GlobalValue *GV = nullptr;
2834     unsigned OpFlags;
2835     if (I.getOperand(1).isSymbol()) {
2836       OpFlags = I.getOperand(1).getTargetFlags();
2837       // Currently only used by "RtLibUseGOT".
2838       assert(OpFlags == AArch64II::MO_GOT);
2839     } else {
2840       GV = I.getOperand(1).getGlobal();
2841       if (GV->isThreadLocal())
2842         return selectTLSGlobalValue(I, MRI);
2843       OpFlags = STI.ClassifyGlobalReference(GV, TM);
2844     }
2845 
2846     if (OpFlags & AArch64II::MO_GOT) {
2847       I.setDesc(TII.get(AArch64::LOADgot));
2848       I.getOperand(1).setTargetFlags(OpFlags);
2849     } else if (TM.getCodeModel() == CodeModel::Large &&
2850                !TM.isPositionIndependent()) {
2851       // Materialize the global using movz/movk instructions.
2852       materializeLargeCMVal(I, GV, OpFlags);
2853       I.eraseFromParent();
2854       return true;
2855     } else if (TM.getCodeModel() == CodeModel::Tiny) {
2856       I.setDesc(TII.get(AArch64::ADR));
2857       I.getOperand(1).setTargetFlags(OpFlags);
2858     } else {
2859       I.setDesc(TII.get(AArch64::MOVaddr));
2860       I.getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_PAGE);
2861       MachineInstrBuilder MIB(MF, I);
2862       MIB.addGlobalAddress(GV, I.getOperand(1).getOffset(),
2863                            OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
2864     }
2865     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2866   }
2867 
2868   case TargetOpcode::G_PTRAUTH_GLOBAL_VALUE:
2869     return selectPtrAuthGlobalValue(I, MRI);
2870 
2871   case TargetOpcode::G_ZEXTLOAD:
2872   case TargetOpcode::G_LOAD:
2873   case TargetOpcode::G_STORE: {
2874     GLoadStore &LdSt = cast<GLoadStore>(I);
2875     bool IsZExtLoad = I.getOpcode() == TargetOpcode::G_ZEXTLOAD;
2876     LLT PtrTy = MRI.getType(LdSt.getPointerReg());
2877 
2878     if (PtrTy != LLT::pointer(0, 64)) {
2879       LLVM_DEBUG(dbgs() << "Load/Store pointer has type: " << PtrTy
2880                         << ", expected: " << LLT::pointer(0, 64) << '\n');
2881       return false;
2882     }
2883 
2884     uint64_t MemSizeInBytes = LdSt.getMemSize().getValue();
2885     unsigned MemSizeInBits = LdSt.getMemSizeInBits().getValue();
2886     AtomicOrdering Order = LdSt.getMMO().getSuccessOrdering();
2887 
2888     // Need special instructions for atomics that affect ordering.
2889     if (Order != AtomicOrdering::NotAtomic &&
2890         Order != AtomicOrdering::Unordered &&
2891         Order != AtomicOrdering::Monotonic) {
2892       assert(!isa<GZExtLoad>(LdSt));
2893       assert(MemSizeInBytes <= 8 &&
2894              "128-bit atomics should already be custom-legalized");
2895 
2896       if (isa<GLoad>(LdSt)) {
2897         static constexpr unsigned LDAPROpcodes[] = {
2898             AArch64::LDAPRB, AArch64::LDAPRH, AArch64::LDAPRW, AArch64::LDAPRX};
2899         static constexpr unsigned LDAROpcodes[] = {
2900             AArch64::LDARB, AArch64::LDARH, AArch64::LDARW, AArch64::LDARX};
2901         ArrayRef<unsigned> Opcodes =
2902             STI.hasRCPC() && Order != AtomicOrdering::SequentiallyConsistent
2903                 ? LDAPROpcodes
2904                 : LDAROpcodes;
2905         I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)]));
2906       } else {
2907         static constexpr unsigned Opcodes[] = {AArch64::STLRB, AArch64::STLRH,
2908                                                AArch64::STLRW, AArch64::STLRX};
2909         Register ValReg = LdSt.getReg(0);
2910         if (MRI.getType(ValReg).getSizeInBits() == 64 && MemSizeInBits != 64) {
2911           // Emit a subreg copy of 32 bits.
2912           Register NewVal = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
2913           MIB.buildInstr(TargetOpcode::COPY, {NewVal}, {})
2914               .addReg(I.getOperand(0).getReg(), 0, AArch64::sub_32);
2915           I.getOperand(0).setReg(NewVal);
2916         }
2917         I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)]));
2918       }
2919       constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2920       return true;
2921     }
2922 
2923 #ifndef NDEBUG
2924     const Register PtrReg = LdSt.getPointerReg();
2925     const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI);
2926     // Check that the pointer register is valid.
2927     assert(PtrRB.getID() == AArch64::GPRRegBankID &&
2928            "Load/Store pointer operand isn't a GPR");
2929     assert(MRI.getType(PtrReg).isPointer() &&
2930            "Load/Store pointer operand isn't a pointer");
2931 #endif
2932 
2933     const Register ValReg = LdSt.getReg(0);
2934     const LLT ValTy = MRI.getType(ValReg);
2935     const RegisterBank &RB = *RBI.getRegBank(ValReg, MRI, TRI);
2936 
2937     // The code below doesn't support truncating stores, so we need to split it
2938     // again.
2939     if (isa<GStore>(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) {
2940       unsigned SubReg;
2941       LLT MemTy = LdSt.getMMO().getMemoryType();
2942       auto *RC = getRegClassForTypeOnBank(MemTy, RB);
2943       if (!getSubRegForClass(RC, TRI, SubReg))
2944         return false;
2945 
2946       // Generate a subreg copy.
2947       auto Copy = MIB.buildInstr(TargetOpcode::COPY, {MemTy}, {})
2948                       .addReg(ValReg, 0, SubReg)
2949                       .getReg(0);
2950       RBI.constrainGenericRegister(Copy, *RC, MRI);
2951       LdSt.getOperand(0).setReg(Copy);
2952     } else if (isa<GLoad>(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) {
2953       // If this is an any-extending load from the FPR bank, split it into a regular
2954       // load + extend.
2955       if (RB.getID() == AArch64::FPRRegBankID) {
2956         unsigned SubReg;
2957         LLT MemTy = LdSt.getMMO().getMemoryType();
2958         auto *RC = getRegClassForTypeOnBank(MemTy, RB);
2959         if (!getSubRegForClass(RC, TRI, SubReg))
2960           return false;
2961         Register OldDst = LdSt.getReg(0);
2962         Register NewDst =
2963             MRI.createGenericVirtualRegister(LdSt.getMMO().getMemoryType());
2964         LdSt.getOperand(0).setReg(NewDst);
2965         MRI.setRegBank(NewDst, RB);
2966         // Generate a SUBREG_TO_REG to extend it.
2967         MIB.setInsertPt(MIB.getMBB(), std::next(LdSt.getIterator()));
2968         MIB.buildInstr(AArch64::SUBREG_TO_REG, {OldDst}, {})
2969             .addImm(0)
2970             .addUse(NewDst)
2971             .addImm(SubReg);
2972         auto SubRegRC = getRegClassForTypeOnBank(MRI.getType(OldDst), RB);
2973         RBI.constrainGenericRegister(OldDst, *SubRegRC, MRI);
2974         MIB.setInstr(LdSt);
2975       }
2976     }
2977 
2978     // Helper lambda for partially selecting I. Either returns the original
2979     // instruction with an updated opcode, or a new instruction.
2980     auto SelectLoadStoreAddressingMode = [&]() -> MachineInstr * {
2981       bool IsStore = isa<GStore>(I);
2982       const unsigned NewOpc =
2983           selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemSizeInBits);
2984       if (NewOpc == I.getOpcode())
2985         return nullptr;
2986       // Check if we can fold anything into the addressing mode.
2987       auto AddrModeFns =
2988           selectAddrModeIndexed(I.getOperand(1), MemSizeInBytes);
2989       if (!AddrModeFns) {
2990         // Can't fold anything. Use the original instruction.
2991         I.setDesc(TII.get(NewOpc));
2992         I.addOperand(MachineOperand::CreateImm(0));
2993         return &I;
2994       }
2995 
2996       // Folded something. Create a new instruction and return it.
2997       auto NewInst = MIB.buildInstr(NewOpc, {}, {}, I.getFlags());
2998       Register CurValReg = I.getOperand(0).getReg();
2999       IsStore ? NewInst.addUse(CurValReg) : NewInst.addDef(CurValReg);
3000       NewInst.cloneMemRefs(I);
3001       for (auto &Fn : *AddrModeFns)
3002         Fn(NewInst);
3003       I.eraseFromParent();
3004       return &*NewInst;
3005     };
3006 
3007     MachineInstr *LoadStore = SelectLoadStoreAddressingMode();
3008     if (!LoadStore)
3009       return false;
3010 
3011     // If we're storing a 0, use WZR/XZR.
3012     if (Opcode == TargetOpcode::G_STORE) {
3013       auto CVal = getIConstantVRegValWithLookThrough(
3014           LoadStore->getOperand(0).getReg(), MRI);
3015       if (CVal && CVal->Value == 0) {
3016         switch (LoadStore->getOpcode()) {
3017         case AArch64::STRWui:
3018         case AArch64::STRHHui:
3019         case AArch64::STRBBui:
3020           LoadStore->getOperand(0).setReg(AArch64::WZR);
3021           break;
3022         case AArch64::STRXui:
3023           LoadStore->getOperand(0).setReg(AArch64::XZR);
3024           break;
3025         }
3026       }
3027     }
3028 
3029     if (IsZExtLoad || (Opcode == TargetOpcode::G_LOAD &&
3030                        ValTy == LLT::scalar(64) && MemSizeInBits == 32)) {
3031       // The any/zextload from a smaller type to i32 should be handled by the
3032       // importer.
3033       if (MRI.getType(LoadStore->getOperand(0).getReg()).getSizeInBits() != 64)
3034         return false;
3035       // If we have an extending load then change the load's type to be a
3036       // narrower reg and zero_extend with SUBREG_TO_REG.
3037       Register LdReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3038       Register DstReg = LoadStore->getOperand(0).getReg();
3039       LoadStore->getOperand(0).setReg(LdReg);
3040 
3041       MIB.setInsertPt(MIB.getMBB(), std::next(LoadStore->getIterator()));
3042       MIB.buildInstr(AArch64::SUBREG_TO_REG, {DstReg}, {})
3043           .addImm(0)
3044           .addUse(LdReg)
3045           .addImm(AArch64::sub_32);
3046       constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI);
3047       return RBI.constrainGenericRegister(DstReg, AArch64::GPR64allRegClass,
3048                                           MRI);
3049     }
3050     return constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI);
3051   }
3052 
3053   case TargetOpcode::G_INDEXED_ZEXTLOAD:
3054   case TargetOpcode::G_INDEXED_SEXTLOAD:
3055     return selectIndexedExtLoad(I, MRI);
3056   case TargetOpcode::G_INDEXED_LOAD:
3057     return selectIndexedLoad(I, MRI);
3058   case TargetOpcode::G_INDEXED_STORE:
3059     return selectIndexedStore(cast<GIndexedStore>(I), MRI);
3060 
3061   case TargetOpcode::G_LSHR:
3062   case TargetOpcode::G_ASHR:
3063     if (MRI.getType(I.getOperand(0).getReg()).isVector())
3064       return selectVectorAshrLshr(I, MRI);
3065     [[fallthrough]];
3066   case TargetOpcode::G_SHL:
3067     if (Opcode == TargetOpcode::G_SHL &&
3068         MRI.getType(I.getOperand(0).getReg()).isVector())
3069       return selectVectorSHL(I, MRI);
3070 
3071     // These shifts were legalized to have 64 bit shift amounts because we
3072     // want to take advantage of the selection patterns that assume the
3073     // immediates are s64s, however, selectBinaryOp will assume both operands
3074     // will have the same bit size.
3075     {
3076       Register SrcReg = I.getOperand(1).getReg();
3077       Register ShiftReg = I.getOperand(2).getReg();
3078       const LLT ShiftTy = MRI.getType(ShiftReg);
3079       const LLT SrcTy = MRI.getType(SrcReg);
3080       if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 32 &&
3081           ShiftTy.getSizeInBits() == 64) {
3082         assert(!ShiftTy.isVector() && "unexpected vector shift ty");
3083         // Insert a subregister copy to implement a 64->32 trunc
3084         auto Trunc = MIB.buildInstr(TargetOpcode::COPY, {SrcTy}, {})
3085                          .addReg(ShiftReg, 0, AArch64::sub_32);
3086         MRI.setRegBank(Trunc.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID));
3087         I.getOperand(2).setReg(Trunc.getReg(0));
3088       }
3089     }
3090     [[fallthrough]];
3091   case TargetOpcode::G_OR: {
3092     // Reject the various things we don't support yet.
3093     if (unsupportedBinOp(I, RBI, MRI, TRI))
3094       return false;
3095 
3096     const unsigned OpSize = Ty.getSizeInBits();
3097 
3098     const Register DefReg = I.getOperand(0).getReg();
3099     const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
3100 
3101     const unsigned NewOpc = selectBinaryOp(I.getOpcode(), RB.getID(), OpSize);
3102     if (NewOpc == I.getOpcode())
3103       return false;
3104 
3105     I.setDesc(TII.get(NewOpc));
3106     // FIXME: Should the type be always reset in setDesc?
3107 
3108     // Now that we selected an opcode, we need to constrain the register
3109     // operands to use appropriate classes.
3110     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3111   }
3112 
3113   case TargetOpcode::G_PTR_ADD: {
3114     emitADD(I.getOperand(0).getReg(), I.getOperand(1), I.getOperand(2), MIB);
3115     I.eraseFromParent();
3116     return true;
3117   }
3118 
3119   case TargetOpcode::G_SADDE:
3120   case TargetOpcode::G_UADDE:
3121   case TargetOpcode::G_SSUBE:
3122   case TargetOpcode::G_USUBE:
3123   case TargetOpcode::G_SADDO:
3124   case TargetOpcode::G_UADDO:
3125   case TargetOpcode::G_SSUBO:
3126   case TargetOpcode::G_USUBO:
3127     return selectOverflowOp(I, MRI);
3128 
3129   case TargetOpcode::G_PTRMASK: {
3130     Register MaskReg = I.getOperand(2).getReg();
3131     std::optional<int64_t> MaskVal = getIConstantVRegSExtVal(MaskReg, MRI);
3132     // TODO: Implement arbitrary cases
3133     if (!MaskVal || !isShiftedMask_64(*MaskVal))
3134       return false;
3135 
3136     uint64_t Mask = *MaskVal;
3137     I.setDesc(TII.get(AArch64::ANDXri));
3138     I.getOperand(2).ChangeToImmediate(
3139         AArch64_AM::encodeLogicalImmediate(Mask, 64));
3140 
3141     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3142   }
3143   case TargetOpcode::G_PTRTOINT:
3144   case TargetOpcode::G_TRUNC: {
3145     const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
3146     const LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
3147 
3148     const Register DstReg = I.getOperand(0).getReg();
3149     const Register SrcReg = I.getOperand(1).getReg();
3150 
3151     const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
3152     const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
3153 
3154     if (DstRB.getID() != SrcRB.getID()) {
3155       LLVM_DEBUG(
3156           dbgs() << "G_TRUNC/G_PTRTOINT input/output on different banks\n");
3157       return false;
3158     }
3159 
3160     if (DstRB.getID() == AArch64::GPRRegBankID) {
3161       const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(DstTy, DstRB);
3162       if (!DstRC)
3163         return false;
3164 
3165       const TargetRegisterClass *SrcRC = getRegClassForTypeOnBank(SrcTy, SrcRB);
3166       if (!SrcRC)
3167         return false;
3168 
3169       if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
3170           !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
3171         LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC/G_PTRTOINT\n");
3172         return false;
3173       }
3174 
3175       if (DstRC == SrcRC) {
3176         // Nothing to be done
3177       } else if (Opcode == TargetOpcode::G_TRUNC && DstTy == LLT::scalar(32) &&
3178                  SrcTy == LLT::scalar(64)) {
3179         llvm_unreachable("TableGen can import this case");
3180         return false;
3181       } else if (DstRC == &AArch64::GPR32RegClass &&
3182                  SrcRC == &AArch64::GPR64RegClass) {
3183         I.getOperand(1).setSubReg(AArch64::sub_32);
3184       } else {
3185         LLVM_DEBUG(
3186             dbgs() << "Unhandled mismatched classes in G_TRUNC/G_PTRTOINT\n");
3187         return false;
3188       }
3189 
3190       I.setDesc(TII.get(TargetOpcode::COPY));
3191       return true;
3192     } else if (DstRB.getID() == AArch64::FPRRegBankID) {
3193       if (DstTy == LLT::fixed_vector(4, 16) &&
3194           SrcTy == LLT::fixed_vector(4, 32)) {
3195         I.setDesc(TII.get(AArch64::XTNv4i16));
3196         constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3197         return true;
3198       }
3199 
3200       if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128) {
3201         MachineInstr *Extract = emitExtractVectorElt(
3202             DstReg, DstRB, LLT::scalar(DstTy.getSizeInBits()), SrcReg, 0, MIB);
3203         if (!Extract)
3204           return false;
3205         I.eraseFromParent();
3206         return true;
3207       }
3208 
3209       // We might have a vector G_PTRTOINT, in which case just emit a COPY.
3210       if (Opcode == TargetOpcode::G_PTRTOINT) {
3211         assert(DstTy.isVector() && "Expected an FPR ptrtoint to be a vector");
3212         I.setDesc(TII.get(TargetOpcode::COPY));
3213         return selectCopy(I, TII, MRI, TRI, RBI);
3214       }
3215     }
3216 
3217     return false;
3218   }
3219 
3220   case TargetOpcode::G_ANYEXT: {
3221     if (selectUSMovFromExtend(I, MRI))
3222       return true;
3223 
3224     const Register DstReg = I.getOperand(0).getReg();
3225     const Register SrcReg = I.getOperand(1).getReg();
3226 
3227     const RegisterBank &RBDst = *RBI.getRegBank(DstReg, MRI, TRI);
3228     if (RBDst.getID() != AArch64::GPRRegBankID) {
3229       LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBDst
3230                         << ", expected: GPR\n");
3231       return false;
3232     }
3233 
3234     const RegisterBank &RBSrc = *RBI.getRegBank(SrcReg, MRI, TRI);
3235     if (RBSrc.getID() != AArch64::GPRRegBankID) {
3236       LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBSrc
3237                         << ", expected: GPR\n");
3238       return false;
3239     }
3240 
3241     const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
3242 
3243     if (DstSize == 0) {
3244       LLVM_DEBUG(dbgs() << "G_ANYEXT operand has no size, not a gvreg?\n");
3245       return false;
3246     }
3247 
3248     if (DstSize != 64 && DstSize > 32) {
3249       LLVM_DEBUG(dbgs() << "G_ANYEXT to size: " << DstSize
3250                         << ", expected: 32 or 64\n");
3251       return false;
3252     }
3253     // At this point G_ANYEXT is just like a plain COPY, but we need
3254     // to explicitly form the 64-bit value if any.
3255     if (DstSize > 32) {
3256       Register ExtSrc = MRI.createVirtualRegister(&AArch64::GPR64allRegClass);
3257       BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG))
3258           .addDef(ExtSrc)
3259           .addImm(0)
3260           .addUse(SrcReg)
3261           .addImm(AArch64::sub_32);
3262       I.getOperand(1).setReg(ExtSrc);
3263     }
3264     return selectCopy(I, TII, MRI, TRI, RBI);
3265   }
3266 
3267   case TargetOpcode::G_ZEXT:
3268   case TargetOpcode::G_SEXT_INREG:
3269   case TargetOpcode::G_SEXT: {
3270     if (selectUSMovFromExtend(I, MRI))
3271       return true;
3272 
3273     unsigned Opcode = I.getOpcode();
3274     const bool IsSigned = Opcode != TargetOpcode::G_ZEXT;
3275     const Register DefReg = I.getOperand(0).getReg();
3276     Register SrcReg = I.getOperand(1).getReg();
3277     const LLT DstTy = MRI.getType(DefReg);
3278     const LLT SrcTy = MRI.getType(SrcReg);
3279     unsigned DstSize = DstTy.getSizeInBits();
3280     unsigned SrcSize = SrcTy.getSizeInBits();
3281 
3282     // SEXT_INREG has the same src reg size as dst, the size of the value to be
3283     // extended is encoded in the imm.
3284     if (Opcode == TargetOpcode::G_SEXT_INREG)
3285       SrcSize = I.getOperand(2).getImm();
3286 
3287     if (DstTy.isVector())
3288       return false; // Should be handled by imported patterns.
3289 
3290     assert((*RBI.getRegBank(DefReg, MRI, TRI)).getID() ==
3291                AArch64::GPRRegBankID &&
3292            "Unexpected ext regbank");
3293 
3294     MachineInstr *ExtI;
3295 
3296     // First check if we're extending the result of a load which has a dest type
3297     // smaller than 32 bits, then this zext is redundant. GPR32 is the smallest
3298     // GPR register on AArch64 and all loads which are smaller automatically
3299     // zero-extend the upper bits. E.g.
3300     // %v(s8) = G_LOAD %p, :: (load 1)
3301     // %v2(s32) = G_ZEXT %v(s8)
3302     if (!IsSigned) {
3303       auto *LoadMI = getOpcodeDef(TargetOpcode::G_LOAD, SrcReg, MRI);
3304       bool IsGPR =
3305           RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::GPRRegBankID;
3306       if (LoadMI && IsGPR) {
3307         const MachineMemOperand *MemOp = *LoadMI->memoperands_begin();
3308         unsigned BytesLoaded = MemOp->getSize().getValue();
3309         if (BytesLoaded < 4 && SrcTy.getSizeInBytes() == BytesLoaded)
3310           return selectCopy(I, TII, MRI, TRI, RBI);
3311       }
3312 
3313       // For the 32-bit -> 64-bit case, we can emit a mov (ORRWrs)
3314       // + SUBREG_TO_REG.
3315       if (IsGPR && SrcSize == 32 && DstSize == 64) {
3316         Register SubregToRegSrc =
3317             MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3318         const Register ZReg = AArch64::WZR;
3319         MIB.buildInstr(AArch64::ORRWrs, {SubregToRegSrc}, {ZReg, SrcReg})
3320             .addImm(0);
3321 
3322         MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {})
3323             .addImm(0)
3324             .addUse(SubregToRegSrc)
3325             .addImm(AArch64::sub_32);
3326 
3327         if (!RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass,
3328                                           MRI)) {
3329           LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT destination\n");
3330           return false;
3331         }
3332 
3333         if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass,
3334                                           MRI)) {
3335           LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT source\n");
3336           return false;
3337         }
3338 
3339         I.eraseFromParent();
3340         return true;
3341       }
3342     }
3343 
3344     if (DstSize == 64) {
3345       if (Opcode != TargetOpcode::G_SEXT_INREG) {
3346         // FIXME: Can we avoid manually doing this?
3347         if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass,
3348                                           MRI)) {
3349           LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode)
3350                             << " operand\n");
3351           return false;
3352         }
3353         SrcReg = MIB.buildInstr(AArch64::SUBREG_TO_REG,
3354                                 {&AArch64::GPR64RegClass}, {})
3355                      .addImm(0)
3356                      .addUse(SrcReg)
3357                      .addImm(AArch64::sub_32)
3358                      .getReg(0);
3359       }
3360 
3361       ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMXri : AArch64::UBFMXri,
3362                              {DefReg}, {SrcReg})
3363                   .addImm(0)
3364                   .addImm(SrcSize - 1);
3365     } else if (DstSize <= 32) {
3366       ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMWri : AArch64::UBFMWri,
3367                              {DefReg}, {SrcReg})
3368                   .addImm(0)
3369                   .addImm(SrcSize - 1);
3370     } else {
3371       return false;
3372     }
3373 
3374     constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
3375     I.eraseFromParent();
3376     return true;
3377   }
3378 
3379   case TargetOpcode::G_SITOFP:
3380   case TargetOpcode::G_UITOFP:
3381   case TargetOpcode::G_FPTOSI:
3382   case TargetOpcode::G_FPTOUI: {
3383     const LLT DstTy = MRI.getType(I.getOperand(0).getReg()),
3384               SrcTy = MRI.getType(I.getOperand(1).getReg());
3385     const unsigned NewOpc = selectFPConvOpc(Opcode, DstTy, SrcTy);
3386     if (NewOpc == Opcode)
3387       return false;
3388 
3389     I.setDesc(TII.get(NewOpc));
3390     constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3391     I.setFlags(MachineInstr::NoFPExcept);
3392 
3393     return true;
3394   }
3395 
3396   case TargetOpcode::G_FREEZE:
3397     return selectCopy(I, TII, MRI, TRI, RBI);
3398 
3399   case TargetOpcode::G_INTTOPTR:
3400     // The importer is currently unable to import pointer types since they
3401     // didn't exist in SelectionDAG.
3402     return selectCopy(I, TII, MRI, TRI, RBI);
3403 
3404   case TargetOpcode::G_BITCAST:
3405     // Imported SelectionDAG rules can handle every bitcast except those that
3406     // bitcast from a type to the same type. Ideally, these shouldn't occur
3407     // but we might not run an optimizer that deletes them. The other exception
3408     // is bitcasts involving pointer types, as SelectionDAG has no knowledge
3409     // of them.
3410     return selectCopy(I, TII, MRI, TRI, RBI);
3411 
3412   case TargetOpcode::G_SELECT: {
3413     auto &Sel = cast<GSelect>(I);
3414     const Register CondReg = Sel.getCondReg();
3415     const Register TReg = Sel.getTrueReg();
3416     const Register FReg = Sel.getFalseReg();
3417 
3418     if (tryOptSelect(Sel))
3419       return true;
3420 
3421     // Make sure to use an unused vreg instead of wzr, so that the peephole
3422     // optimizations will be able to optimize these.
3423     Register DeadVReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3424     auto TstMI = MIB.buildInstr(AArch64::ANDSWri, {DeadVReg}, {CondReg})
3425                      .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
3426     constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
3427     if (!emitSelect(Sel.getReg(0), TReg, FReg, AArch64CC::NE, MIB))
3428       return false;
3429     Sel.eraseFromParent();
3430     return true;
3431   }
3432   case TargetOpcode::G_ICMP: {
3433     if (Ty.isVector())
3434       return false;
3435 
3436     if (Ty != LLT::scalar(32)) {
3437       LLVM_DEBUG(dbgs() << "G_ICMP result has type: " << Ty
3438                         << ", expected: " << LLT::scalar(32) << '\n');
3439       return false;
3440     }
3441 
3442     auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate());
3443     const AArch64CC::CondCode InvCC =
3444         changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred));
3445     emitIntegerCompare(I.getOperand(2), I.getOperand(3), I.getOperand(1), MIB);
3446     emitCSINC(/*Dst=*/I.getOperand(0).getReg(), /*Src1=*/AArch64::WZR,
3447               /*Src2=*/AArch64::WZR, InvCC, MIB);
3448     I.eraseFromParent();
3449     return true;
3450   }
3451 
3452   case TargetOpcode::G_FCMP: {
3453     CmpInst::Predicate Pred =
3454         static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate());
3455     if (!emitFPCompare(I.getOperand(2).getReg(), I.getOperand(3).getReg(), MIB,
3456                        Pred) ||
3457         !emitCSetForFCmp(I.getOperand(0).getReg(), Pred, MIB))
3458       return false;
3459     I.eraseFromParent();
3460     return true;
3461   }
3462   case TargetOpcode::G_VASTART:
3463     return STI.isTargetDarwin() ? selectVaStartDarwin(I, MF, MRI)
3464                                 : selectVaStartAAPCS(I, MF, MRI);
3465   case TargetOpcode::G_INTRINSIC:
3466     return selectIntrinsic(I, MRI);
3467   case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3468     return selectIntrinsicWithSideEffects(I, MRI);
3469   case TargetOpcode::G_IMPLICIT_DEF: {
3470     I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
3471     const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
3472     const Register DstReg = I.getOperand(0).getReg();
3473     const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
3474     const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(DstTy, DstRB);
3475     RBI.constrainGenericRegister(DstReg, *DstRC, MRI);
3476     return true;
3477   }
3478   case TargetOpcode::G_BLOCK_ADDR: {
3479     Function *BAFn = I.getOperand(1).getBlockAddress()->getFunction();
3480     if (std::optional<uint16_t> BADisc =
3481             STI.getPtrAuthBlockAddressDiscriminatorIfEnabled(*BAFn)) {
3482       MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {AArch64::X16}, {});
3483       MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {AArch64::X17}, {});
3484       MIB.buildInstr(AArch64::MOVaddrPAC)
3485           .addBlockAddress(I.getOperand(1).getBlockAddress())
3486           .addImm(AArch64PACKey::IA)
3487           .addReg(/*AddrDisc=*/AArch64::XZR)
3488           .addImm(*BADisc)
3489           .constrainAllUses(TII, TRI, RBI);
3490       MIB.buildCopy(I.getOperand(0).getReg(), Register(AArch64::X16));
3491       RBI.constrainGenericRegister(I.getOperand(0).getReg(),
3492                                    AArch64::GPR64RegClass, MRI);
3493       I.eraseFromParent();
3494       return true;
3495     }
3496     if (TM.getCodeModel() == CodeModel::Large && !TM.isPositionIndependent()) {
3497       materializeLargeCMVal(I, I.getOperand(1).getBlockAddress(), 0);
3498       I.eraseFromParent();
3499       return true;
3500     } else {
3501       I.setDesc(TII.get(AArch64::MOVaddrBA));
3502       auto MovMI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::MOVaddrBA),
3503                            I.getOperand(0).getReg())
3504                        .addBlockAddress(I.getOperand(1).getBlockAddress(),
3505                                         /* Offset */ 0, AArch64II::MO_PAGE)
3506                        .addBlockAddress(
3507                            I.getOperand(1).getBlockAddress(), /* Offset */ 0,
3508                            AArch64II::MO_NC | AArch64II::MO_PAGEOFF);
3509       I.eraseFromParent();
3510       return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI);
3511     }
3512   }
3513   case AArch64::G_DUP: {
3514     // When the scalar of G_DUP is an s8/s16 gpr, they can't be selected by
3515     // imported patterns. Do it manually here. Avoiding generating s16 gpr is
3516     // difficult because at RBS we may end up pessimizing the fpr case if we
3517     // decided to add an anyextend to fix this. Manual selection is the most
3518     // robust solution for now.
3519     if (RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() !=
3520         AArch64::GPRRegBankID)
3521       return false; // We expect the fpr regbank case to be imported.
3522     LLT VecTy = MRI.getType(I.getOperand(0).getReg());
3523     if (VecTy == LLT::fixed_vector(8, 8))
3524       I.setDesc(TII.get(AArch64::DUPv8i8gpr));
3525     else if (VecTy == LLT::fixed_vector(16, 8))
3526       I.setDesc(TII.get(AArch64::DUPv16i8gpr));
3527     else if (VecTy == LLT::fixed_vector(4, 16))
3528       I.setDesc(TII.get(AArch64::DUPv4i16gpr));
3529     else if (VecTy == LLT::fixed_vector(8, 16))
3530       I.setDesc(TII.get(AArch64::DUPv8i16gpr));
3531     else
3532       return false;
3533     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3534   }
3535   case TargetOpcode::G_BUILD_VECTOR:
3536     return selectBuildVector(I, MRI);
3537   case TargetOpcode::G_MERGE_VALUES:
3538     return selectMergeValues(I, MRI);
3539   case TargetOpcode::G_UNMERGE_VALUES:
3540     return selectUnmergeValues(I, MRI);
3541   case TargetOpcode::G_SHUFFLE_VECTOR:
3542     return selectShuffleVector(I, MRI);
3543   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3544     return selectExtractElt(I, MRI);
3545   case TargetOpcode::G_CONCAT_VECTORS:
3546     return selectConcatVectors(I, MRI);
3547   case TargetOpcode::G_JUMP_TABLE:
3548     return selectJumpTable(I, MRI);
3549   case TargetOpcode::G_MEMCPY:
3550   case TargetOpcode::G_MEMCPY_INLINE:
3551   case TargetOpcode::G_MEMMOVE:
3552   case TargetOpcode::G_MEMSET:
3553     assert(STI.hasMOPS() && "Shouldn't get here without +mops feature");
3554     return selectMOPS(I, MRI);
3555   }
3556 
3557   return false;
3558 }
3559 
3560 bool AArch64InstructionSelector::selectAndRestoreState(MachineInstr &I) {
3561   MachineIRBuilderState OldMIBState = MIB.getState();
3562   bool Success = select(I);
3563   MIB.setState(OldMIBState);
3564   return Success;
3565 }
3566 
3567 bool AArch64InstructionSelector::selectMOPS(MachineInstr &GI,
3568                                             MachineRegisterInfo &MRI) {
3569   unsigned Mopcode;
3570   switch (GI.getOpcode()) {
3571   case TargetOpcode::G_MEMCPY:
3572   case TargetOpcode::G_MEMCPY_INLINE:
3573     Mopcode = AArch64::MOPSMemoryCopyPseudo;
3574     break;
3575   case TargetOpcode::G_MEMMOVE:
3576     Mopcode = AArch64::MOPSMemoryMovePseudo;
3577     break;
3578   case TargetOpcode::G_MEMSET:
3579     // For tagged memset see llvm.aarch64.mops.memset.tag
3580     Mopcode = AArch64::MOPSMemorySetPseudo;
3581     break;
3582   }
3583 
3584   auto &DstPtr = GI.getOperand(0);
3585   auto &SrcOrVal = GI.getOperand(1);
3586   auto &Size = GI.getOperand(2);
3587 
3588   // Create copies of the registers that can be clobbered.
3589   const Register DstPtrCopy = MRI.cloneVirtualRegister(DstPtr.getReg());
3590   const Register SrcValCopy = MRI.cloneVirtualRegister(SrcOrVal.getReg());
3591   const Register SizeCopy = MRI.cloneVirtualRegister(Size.getReg());
3592 
3593   const bool IsSet = Mopcode == AArch64::MOPSMemorySetPseudo;
3594   const auto &SrcValRegClass =
3595       IsSet ? AArch64::GPR64RegClass : AArch64::GPR64commonRegClass;
3596 
3597   // Constrain to specific registers
3598   RBI.constrainGenericRegister(DstPtrCopy, AArch64::GPR64commonRegClass, MRI);
3599   RBI.constrainGenericRegister(SrcValCopy, SrcValRegClass, MRI);
3600   RBI.constrainGenericRegister(SizeCopy, AArch64::GPR64RegClass, MRI);
3601 
3602   MIB.buildCopy(DstPtrCopy, DstPtr);
3603   MIB.buildCopy(SrcValCopy, SrcOrVal);
3604   MIB.buildCopy(SizeCopy, Size);
3605 
3606   // New instruction uses the copied registers because it must update them.
3607   // The defs are not used since they don't exist in G_MEM*. They are still
3608   // tied.
3609   // Note: order of operands is different from G_MEMSET, G_MEMCPY, G_MEMMOVE
3610   Register DefDstPtr = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass);
3611   Register DefSize = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3612   if (IsSet) {
3613     MIB.buildInstr(Mopcode, {DefDstPtr, DefSize},
3614                    {DstPtrCopy, SizeCopy, SrcValCopy});
3615   } else {
3616     Register DefSrcPtr = MRI.createVirtualRegister(&SrcValRegClass);
3617     MIB.buildInstr(Mopcode, {DefDstPtr, DefSrcPtr, DefSize},
3618                    {DstPtrCopy, SrcValCopy, SizeCopy});
3619   }
3620 
3621   GI.eraseFromParent();
3622   return true;
3623 }
3624 
3625 bool AArch64InstructionSelector::selectBrJT(MachineInstr &I,
3626                                             MachineRegisterInfo &MRI) {
3627   assert(I.getOpcode() == TargetOpcode::G_BRJT && "Expected G_BRJT");
3628   Register JTAddr = I.getOperand(0).getReg();
3629   unsigned JTI = I.getOperand(1).getIndex();
3630   Register Index = I.getOperand(2).getReg();
3631 
3632   MF->getInfo<AArch64FunctionInfo>()->setJumpTableEntryInfo(JTI, 4, nullptr);
3633 
3634   // With aarch64-jump-table-hardening, we only expand the jump table dispatch
3635   // sequence later, to guarantee the integrity of the intermediate values.
3636   if (MF->getFunction().hasFnAttribute("aarch64-jump-table-hardening")) {
3637     CodeModel::Model CM = TM.getCodeModel();
3638     if (STI.isTargetMachO()) {
3639       if (CM != CodeModel::Small && CM != CodeModel::Large)
3640         report_fatal_error("Unsupported code-model for hardened jump-table");
3641     } else {
3642       // Note that COFF support would likely also need JUMP_TABLE_DEBUG_INFO.
3643       assert(STI.isTargetELF() &&
3644              "jump table hardening only supported on MachO/ELF");
3645       if (CM != CodeModel::Small)
3646         report_fatal_error("Unsupported code-model for hardened jump-table");
3647     }
3648 
3649     MIB.buildCopy({AArch64::X16}, I.getOperand(2).getReg());
3650     MIB.buildInstr(AArch64::BR_JumpTable)
3651         .addJumpTableIndex(I.getOperand(1).getIndex());
3652     I.eraseFromParent();
3653     return true;
3654   }
3655 
3656   Register TargetReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3657   Register ScratchReg = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
3658 
3659   auto JumpTableInst = MIB.buildInstr(AArch64::JumpTableDest32,
3660                                       {TargetReg, ScratchReg}, {JTAddr, Index})
3661                            .addJumpTableIndex(JTI);
3662   // Save the jump table info.
3663   MIB.buildInstr(TargetOpcode::JUMP_TABLE_DEBUG_INFO, {},
3664                  {static_cast<int64_t>(JTI)});
3665   // Build the indirect branch.
3666   MIB.buildInstr(AArch64::BR, {}, {TargetReg});
3667   I.eraseFromParent();
3668   return constrainSelectedInstRegOperands(*JumpTableInst, TII, TRI, RBI);
3669 }
3670 
3671 bool AArch64InstructionSelector::selectJumpTable(MachineInstr &I,
3672                                                  MachineRegisterInfo &MRI) {
3673   assert(I.getOpcode() == TargetOpcode::G_JUMP_TABLE && "Expected jump table");
3674   assert(I.getOperand(1).isJTI() && "Jump table op should have a JTI!");
3675 
3676   Register DstReg = I.getOperand(0).getReg();
3677   unsigned JTI = I.getOperand(1).getIndex();
3678   // We generate a MOVaddrJT which will get expanded to an ADRP + ADD later.
3679   auto MovMI =
3680     MIB.buildInstr(AArch64::MOVaddrJT, {DstReg}, {})
3681           .addJumpTableIndex(JTI, AArch64II::MO_PAGE)
3682           .addJumpTableIndex(JTI, AArch64II::MO_NC | AArch64II::MO_PAGEOFF);
3683   I.eraseFromParent();
3684   return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI);
3685 }
3686 
3687 bool AArch64InstructionSelector::selectTLSGlobalValue(
3688     MachineInstr &I, MachineRegisterInfo &MRI) {
3689   if (!STI.isTargetMachO())
3690     return false;
3691   MachineFunction &MF = *I.getParent()->getParent();
3692   MF.getFrameInfo().setAdjustsStack(true);
3693 
3694   const auto &GlobalOp = I.getOperand(1);
3695   assert(GlobalOp.getOffset() == 0 &&
3696          "Shouldn't have an offset on TLS globals!");
3697   const GlobalValue &GV = *GlobalOp.getGlobal();
3698 
3699   auto LoadGOT =
3700       MIB.buildInstr(AArch64::LOADgot, {&AArch64::GPR64commonRegClass}, {})
3701           .addGlobalAddress(&GV, 0, AArch64II::MO_TLS);
3702 
3703   auto Load = MIB.buildInstr(AArch64::LDRXui, {&AArch64::GPR64commonRegClass},
3704                              {LoadGOT.getReg(0)})
3705                   .addImm(0);
3706 
3707   MIB.buildCopy(Register(AArch64::X0), LoadGOT.getReg(0));
3708   // TLS calls preserve all registers except those that absolutely must be
3709   // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
3710   // silly).
3711   unsigned Opcode = getBLRCallOpcode(MF);
3712 
3713   // With ptrauth-calls, the tlv access thunk pointer is authenticated (IA, 0).
3714   if (MF.getFunction().hasFnAttribute("ptrauth-calls")) {
3715     assert(Opcode == AArch64::BLR);
3716     Opcode = AArch64::BLRAAZ;
3717   }
3718 
3719   MIB.buildInstr(Opcode, {}, {Load})
3720       .addUse(AArch64::X0, RegState::Implicit)
3721       .addDef(AArch64::X0, RegState::Implicit)
3722       .addRegMask(TRI.getTLSCallPreservedMask());
3723 
3724   MIB.buildCopy(I.getOperand(0).getReg(), Register(AArch64::X0));
3725   RBI.constrainGenericRegister(I.getOperand(0).getReg(), AArch64::GPR64RegClass,
3726                                MRI);
3727   I.eraseFromParent();
3728   return true;
3729 }
3730 
3731 MachineInstr *AArch64InstructionSelector::emitScalarToVector(
3732     unsigned EltSize, const TargetRegisterClass *DstRC, Register Scalar,
3733     MachineIRBuilder &MIRBuilder) const {
3734   auto Undef = MIRBuilder.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstRC}, {});
3735 
3736   auto BuildFn = [&](unsigned SubregIndex) {
3737     auto Ins =
3738         MIRBuilder
3739             .buildInstr(TargetOpcode::INSERT_SUBREG, {DstRC}, {Undef, Scalar})
3740             .addImm(SubregIndex);
3741     constrainSelectedInstRegOperands(*Undef, TII, TRI, RBI);
3742     constrainSelectedInstRegOperands(*Ins, TII, TRI, RBI);
3743     return &*Ins;
3744   };
3745 
3746   switch (EltSize) {
3747   case 8:
3748     return BuildFn(AArch64::bsub);
3749   case 16:
3750     return BuildFn(AArch64::hsub);
3751   case 32:
3752     return BuildFn(AArch64::ssub);
3753   case 64:
3754     return BuildFn(AArch64::dsub);
3755   default:
3756     return nullptr;
3757   }
3758 }
3759 
3760 MachineInstr *
3761 AArch64InstructionSelector::emitNarrowVector(Register DstReg, Register SrcReg,
3762                                              MachineIRBuilder &MIB,
3763                                              MachineRegisterInfo &MRI) const {
3764   LLT DstTy = MRI.getType(DstReg);
3765   const TargetRegisterClass *RC =
3766       getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(SrcReg, MRI, TRI));
3767   if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
3768     LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
3769     return nullptr;
3770   }
3771   unsigned SubReg = 0;
3772   if (!getSubRegForClass(RC, TRI, SubReg))
3773     return nullptr;
3774   if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
3775     LLVM_DEBUG(dbgs() << "Unsupported destination size! ("
3776                       << DstTy.getSizeInBits() << "\n");
3777     return nullptr;
3778   }
3779   auto Copy = MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
3780                   .addReg(SrcReg, 0, SubReg);
3781   RBI.constrainGenericRegister(DstReg, *RC, MRI);
3782   return Copy;
3783 }
3784 
3785 bool AArch64InstructionSelector::selectMergeValues(
3786     MachineInstr &I, MachineRegisterInfo &MRI) {
3787   assert(I.getOpcode() == TargetOpcode::G_MERGE_VALUES && "unexpected opcode");
3788   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
3789   const LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
3790   assert(!DstTy.isVector() && !SrcTy.isVector() && "invalid merge operation");
3791   const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
3792 
3793   if (I.getNumOperands() != 3)
3794     return false;
3795 
3796   // Merging 2 s64s into an s128.
3797   if (DstTy == LLT::scalar(128)) {
3798     if (SrcTy.getSizeInBits() != 64)
3799       return false;
3800     Register DstReg = I.getOperand(0).getReg();
3801     Register Src1Reg = I.getOperand(1).getReg();
3802     Register Src2Reg = I.getOperand(2).getReg();
3803     auto Tmp = MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstTy}, {});
3804     MachineInstr *InsMI = emitLaneInsert(std::nullopt, Tmp.getReg(0), Src1Reg,
3805                                          /* LaneIdx */ 0, RB, MIB);
3806     if (!InsMI)
3807       return false;
3808     MachineInstr *Ins2MI = emitLaneInsert(DstReg, InsMI->getOperand(0).getReg(),
3809                                           Src2Reg, /* LaneIdx */ 1, RB, MIB);
3810     if (!Ins2MI)
3811       return false;
3812     constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI);
3813     constrainSelectedInstRegOperands(*Ins2MI, TII, TRI, RBI);
3814     I.eraseFromParent();
3815     return true;
3816   }
3817 
3818   if (RB.getID() != AArch64::GPRRegBankID)
3819     return false;
3820 
3821   if (DstTy.getSizeInBits() != 64 || SrcTy.getSizeInBits() != 32)
3822     return false;
3823 
3824   auto *DstRC = &AArch64::GPR64RegClass;
3825   Register SubToRegDef = MRI.createVirtualRegister(DstRC);
3826   MachineInstr &SubRegMI = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
3827                                     TII.get(TargetOpcode::SUBREG_TO_REG))
3828                                 .addDef(SubToRegDef)
3829                                 .addImm(0)
3830                                 .addUse(I.getOperand(1).getReg())
3831                                 .addImm(AArch64::sub_32);
3832   Register SubToRegDef2 = MRI.createVirtualRegister(DstRC);
3833   // Need to anyext the second scalar before we can use bfm
3834   MachineInstr &SubRegMI2 = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
3835                                     TII.get(TargetOpcode::SUBREG_TO_REG))
3836                                 .addDef(SubToRegDef2)
3837                                 .addImm(0)
3838                                 .addUse(I.getOperand(2).getReg())
3839                                 .addImm(AArch64::sub_32);
3840   MachineInstr &BFM =
3841       *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::BFMXri))
3842            .addDef(I.getOperand(0).getReg())
3843            .addUse(SubToRegDef)
3844            .addUse(SubToRegDef2)
3845            .addImm(32)
3846            .addImm(31);
3847   constrainSelectedInstRegOperands(SubRegMI, TII, TRI, RBI);
3848   constrainSelectedInstRegOperands(SubRegMI2, TII, TRI, RBI);
3849   constrainSelectedInstRegOperands(BFM, TII, TRI, RBI);
3850   I.eraseFromParent();
3851   return true;
3852 }
3853 
3854 static bool getLaneCopyOpcode(unsigned &CopyOpc, unsigned &ExtractSubReg,
3855                               const unsigned EltSize) {
3856   // Choose a lane copy opcode and subregister based off of the size of the
3857   // vector's elements.
3858   switch (EltSize) {
3859   case 8:
3860     CopyOpc = AArch64::DUPi8;
3861     ExtractSubReg = AArch64::bsub;
3862     break;
3863   case 16:
3864     CopyOpc = AArch64::DUPi16;
3865     ExtractSubReg = AArch64::hsub;
3866     break;
3867   case 32:
3868     CopyOpc = AArch64::DUPi32;
3869     ExtractSubReg = AArch64::ssub;
3870     break;
3871   case 64:
3872     CopyOpc = AArch64::DUPi64;
3873     ExtractSubReg = AArch64::dsub;
3874     break;
3875   default:
3876     // Unknown size, bail out.
3877     LLVM_DEBUG(dbgs() << "Elt size '" << EltSize << "' unsupported.\n");
3878     return false;
3879   }
3880   return true;
3881 }
3882 
3883 MachineInstr *AArch64InstructionSelector::emitExtractVectorElt(
3884     std::optional<Register> DstReg, const RegisterBank &DstRB, LLT ScalarTy,
3885     Register VecReg, unsigned LaneIdx, MachineIRBuilder &MIRBuilder) const {
3886   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
3887   unsigned CopyOpc = 0;
3888   unsigned ExtractSubReg = 0;
3889   if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, ScalarTy.getSizeInBits())) {
3890     LLVM_DEBUG(
3891         dbgs() << "Couldn't determine lane copy opcode for instruction.\n");
3892     return nullptr;
3893   }
3894 
3895   const TargetRegisterClass *DstRC =
3896       getRegClassForTypeOnBank(ScalarTy, DstRB, true);
3897   if (!DstRC) {
3898     LLVM_DEBUG(dbgs() << "Could not determine destination register class.\n");
3899     return nullptr;
3900   }
3901 
3902   const RegisterBank &VecRB = *RBI.getRegBank(VecReg, MRI, TRI);
3903   const LLT &VecTy = MRI.getType(VecReg);
3904   const TargetRegisterClass *VecRC =
3905       getRegClassForTypeOnBank(VecTy, VecRB, true);
3906   if (!VecRC) {
3907     LLVM_DEBUG(dbgs() << "Could not determine source register class.\n");
3908     return nullptr;
3909   }
3910 
3911   // The register that we're going to copy into.
3912   Register InsertReg = VecReg;
3913   if (!DstReg)
3914     DstReg = MRI.createVirtualRegister(DstRC);
3915   // If the lane index is 0, we just use a subregister COPY.
3916   if (LaneIdx == 0) {
3917     auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {*DstReg}, {})
3918                     .addReg(VecReg, 0, ExtractSubReg);
3919     RBI.constrainGenericRegister(*DstReg, *DstRC, MRI);
3920     return &*Copy;
3921   }
3922 
3923   // Lane copies require 128-bit wide registers. If we're dealing with an
3924   // unpacked vector, then we need to move up to that width. Insert an implicit
3925   // def and a subregister insert to get us there.
3926   if (VecTy.getSizeInBits() != 128) {
3927     MachineInstr *ScalarToVector = emitScalarToVector(
3928         VecTy.getSizeInBits(), &AArch64::FPR128RegClass, VecReg, MIRBuilder);
3929     if (!ScalarToVector)
3930       return nullptr;
3931     InsertReg = ScalarToVector->getOperand(0).getReg();
3932   }
3933 
3934   MachineInstr *LaneCopyMI =
3935       MIRBuilder.buildInstr(CopyOpc, {*DstReg}, {InsertReg}).addImm(LaneIdx);
3936   constrainSelectedInstRegOperands(*LaneCopyMI, TII, TRI, RBI);
3937 
3938   // Make sure that we actually constrain the initial copy.
3939   RBI.constrainGenericRegister(*DstReg, *DstRC, MRI);
3940   return LaneCopyMI;
3941 }
3942 
3943 bool AArch64InstructionSelector::selectExtractElt(
3944     MachineInstr &I, MachineRegisterInfo &MRI) {
3945   assert(I.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT &&
3946          "unexpected opcode!");
3947   Register DstReg = I.getOperand(0).getReg();
3948   const LLT NarrowTy = MRI.getType(DstReg);
3949   const Register SrcReg = I.getOperand(1).getReg();
3950   const LLT WideTy = MRI.getType(SrcReg);
3951   (void)WideTy;
3952   assert(WideTy.getSizeInBits() >= NarrowTy.getSizeInBits() &&
3953          "source register size too small!");
3954   assert(!NarrowTy.isVector() && "cannot extract vector into vector!");
3955 
3956   // Need the lane index to determine the correct copy opcode.
3957   MachineOperand &LaneIdxOp = I.getOperand(2);
3958   assert(LaneIdxOp.isReg() && "Lane index operand was not a register?");
3959 
3960   if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) {
3961     LLVM_DEBUG(dbgs() << "Cannot extract into GPR.\n");
3962     return false;
3963   }
3964 
3965   // Find the index to extract from.
3966   auto VRegAndVal = getIConstantVRegValWithLookThrough(LaneIdxOp.getReg(), MRI);
3967   if (!VRegAndVal)
3968     return false;
3969   unsigned LaneIdx = VRegAndVal->Value.getSExtValue();
3970 
3971 
3972   const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
3973   MachineInstr *Extract = emitExtractVectorElt(DstReg, DstRB, NarrowTy, SrcReg,
3974                                                LaneIdx, MIB);
3975   if (!Extract)
3976     return false;
3977 
3978   I.eraseFromParent();
3979   return true;
3980 }
3981 
3982 bool AArch64InstructionSelector::selectSplitVectorUnmerge(
3983     MachineInstr &I, MachineRegisterInfo &MRI) {
3984   unsigned NumElts = I.getNumOperands() - 1;
3985   Register SrcReg = I.getOperand(NumElts).getReg();
3986   const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg());
3987   const LLT SrcTy = MRI.getType(SrcReg);
3988 
3989   assert(NarrowTy.isVector() && "Expected an unmerge into vectors");
3990   if (SrcTy.getSizeInBits() > 128) {
3991     LLVM_DEBUG(dbgs() << "Unexpected vector type for vec split unmerge");
3992     return false;
3993   }
3994 
3995   // We implement a split vector operation by treating the sub-vectors as
3996   // scalars and extracting them.
3997   const RegisterBank &DstRB =
3998       *RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI);
3999   for (unsigned OpIdx = 0; OpIdx < NumElts; ++OpIdx) {
4000     Register Dst = I.getOperand(OpIdx).getReg();
4001     MachineInstr *Extract =
4002         emitExtractVectorElt(Dst, DstRB, NarrowTy, SrcReg, OpIdx, MIB);
4003     if (!Extract)
4004       return false;
4005   }
4006   I.eraseFromParent();
4007   return true;
4008 }
4009 
4010 bool AArch64InstructionSelector::selectUnmergeValues(MachineInstr &I,
4011                                                      MachineRegisterInfo &MRI) {
4012   assert(I.getOpcode() == TargetOpcode::G_UNMERGE_VALUES &&
4013          "unexpected opcode");
4014 
4015   // TODO: Handle unmerging into GPRs and from scalars to scalars.
4016   if (RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI)->getID() !=
4017           AArch64::FPRRegBankID ||
4018       RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() !=
4019           AArch64::FPRRegBankID) {
4020     LLVM_DEBUG(dbgs() << "Unmerging vector-to-gpr and scalar-to-scalar "
4021                          "currently unsupported.\n");
4022     return false;
4023   }
4024 
4025   // The last operand is the vector source register, and every other operand is
4026   // a register to unpack into.
4027   unsigned NumElts = I.getNumOperands() - 1;
4028   Register SrcReg = I.getOperand(NumElts).getReg();
4029   const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg());
4030   const LLT WideTy = MRI.getType(SrcReg);
4031   (void)WideTy;
4032   assert((WideTy.isVector() || WideTy.getSizeInBits() == 128) &&
4033          "can only unmerge from vector or s128 types!");
4034   assert(WideTy.getSizeInBits() > NarrowTy.getSizeInBits() &&
4035          "source register size too small!");
4036 
4037   if (!NarrowTy.isScalar())
4038     return selectSplitVectorUnmerge(I, MRI);
4039 
4040   // Choose a lane copy opcode and subregister based off of the size of the
4041   // vector's elements.
4042   unsigned CopyOpc = 0;
4043   unsigned ExtractSubReg = 0;
4044   if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, NarrowTy.getSizeInBits()))
4045     return false;
4046 
4047   // Set up for the lane copies.
4048   MachineBasicBlock &MBB = *I.getParent();
4049 
4050   // Stores the registers we'll be copying from.
4051   SmallVector<Register, 4> InsertRegs;
4052 
4053   // We'll use the first register twice, so we only need NumElts-1 registers.
4054   unsigned NumInsertRegs = NumElts - 1;
4055 
4056   // If our elements fit into exactly 128 bits, then we can copy from the source
4057   // directly. Otherwise, we need to do a bit of setup with some subregister
4058   // inserts.
4059   if (NarrowTy.getSizeInBits() * NumElts == 128) {
4060     InsertRegs = SmallVector<Register, 4>(NumInsertRegs, SrcReg);
4061   } else {
4062     // No. We have to perform subregister inserts. For each insert, create an
4063     // implicit def and a subregister insert, and save the register we create.
4064     const TargetRegisterClass *RC = getRegClassForTypeOnBank(
4065         LLT::fixed_vector(NumElts, WideTy.getScalarSizeInBits()),
4066         *RBI.getRegBank(SrcReg, MRI, TRI));
4067     unsigned SubReg = 0;
4068     bool Found = getSubRegForClass(RC, TRI, SubReg);
4069     (void)Found;
4070     assert(Found && "expected to find last operand's subeg idx");
4071     for (unsigned Idx = 0; Idx < NumInsertRegs; ++Idx) {
4072       Register ImpDefReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
4073       MachineInstr &ImpDefMI =
4074           *BuildMI(MBB, I, I.getDebugLoc(), TII.get(TargetOpcode::IMPLICIT_DEF),
4075                    ImpDefReg);
4076 
4077       // Now, create the subregister insert from SrcReg.
4078       Register InsertReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
4079       MachineInstr &InsMI =
4080           *BuildMI(MBB, I, I.getDebugLoc(),
4081                    TII.get(TargetOpcode::INSERT_SUBREG), InsertReg)
4082                .addUse(ImpDefReg)
4083                .addUse(SrcReg)
4084                .addImm(SubReg);
4085 
4086       constrainSelectedInstRegOperands(ImpDefMI, TII, TRI, RBI);
4087       constrainSelectedInstRegOperands(InsMI, TII, TRI, RBI);
4088 
4089       // Save the register so that we can copy from it after.
4090       InsertRegs.push_back(InsertReg);
4091     }
4092   }
4093 
4094   // Now that we've created any necessary subregister inserts, we can
4095   // create the copies.
4096   //
4097   // Perform the first copy separately as a subregister copy.
4098   Register CopyTo = I.getOperand(0).getReg();
4099   auto FirstCopy = MIB.buildInstr(TargetOpcode::COPY, {CopyTo}, {})
4100                        .addReg(InsertRegs[0], 0, ExtractSubReg);
4101   constrainSelectedInstRegOperands(*FirstCopy, TII, TRI, RBI);
4102 
4103   // Now, perform the remaining copies as vector lane copies.
4104   unsigned LaneIdx = 1;
4105   for (Register InsReg : InsertRegs) {
4106     Register CopyTo = I.getOperand(LaneIdx).getReg();
4107     MachineInstr &CopyInst =
4108         *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CopyOpc), CopyTo)
4109              .addUse(InsReg)
4110              .addImm(LaneIdx);
4111     constrainSelectedInstRegOperands(CopyInst, TII, TRI, RBI);
4112     ++LaneIdx;
4113   }
4114 
4115   // Separately constrain the first copy's destination. Because of the
4116   // limitation in constrainOperandRegClass, we can't guarantee that this will
4117   // actually be constrained. So, do it ourselves using the second operand.
4118   const TargetRegisterClass *RC =
4119       MRI.getRegClassOrNull(I.getOperand(1).getReg());
4120   if (!RC) {
4121     LLVM_DEBUG(dbgs() << "Couldn't constrain copy destination.\n");
4122     return false;
4123   }
4124 
4125   RBI.constrainGenericRegister(CopyTo, *RC, MRI);
4126   I.eraseFromParent();
4127   return true;
4128 }
4129 
4130 bool AArch64InstructionSelector::selectConcatVectors(
4131     MachineInstr &I, MachineRegisterInfo &MRI)  {
4132   assert(I.getOpcode() == TargetOpcode::G_CONCAT_VECTORS &&
4133          "Unexpected opcode");
4134   Register Dst = I.getOperand(0).getReg();
4135   Register Op1 = I.getOperand(1).getReg();
4136   Register Op2 = I.getOperand(2).getReg();
4137   MachineInstr *ConcatMI = emitVectorConcat(Dst, Op1, Op2, MIB);
4138   if (!ConcatMI)
4139     return false;
4140   I.eraseFromParent();
4141   return true;
4142 }
4143 
4144 unsigned
4145 AArch64InstructionSelector::emitConstantPoolEntry(const Constant *CPVal,
4146                                                   MachineFunction &MF) const {
4147   Type *CPTy = CPVal->getType();
4148   Align Alignment = MF.getDataLayout().getPrefTypeAlign(CPTy);
4149 
4150   MachineConstantPool *MCP = MF.getConstantPool();
4151   return MCP->getConstantPoolIndex(CPVal, Alignment);
4152 }
4153 
4154 MachineInstr *AArch64InstructionSelector::emitLoadFromConstantPool(
4155     const Constant *CPVal, MachineIRBuilder &MIRBuilder) const {
4156   const TargetRegisterClass *RC;
4157   unsigned Opc;
4158   bool IsTiny = TM.getCodeModel() == CodeModel::Tiny;
4159   unsigned Size = MIRBuilder.getDataLayout().getTypeStoreSize(CPVal->getType());
4160   switch (Size) {
4161   case 16:
4162     RC = &AArch64::FPR128RegClass;
4163     Opc = IsTiny ? AArch64::LDRQl : AArch64::LDRQui;
4164     break;
4165   case 8:
4166     RC = &AArch64::FPR64RegClass;
4167     Opc = IsTiny ? AArch64::LDRDl : AArch64::LDRDui;
4168     break;
4169   case 4:
4170     RC = &AArch64::FPR32RegClass;
4171     Opc = IsTiny ? AArch64::LDRSl : AArch64::LDRSui;
4172     break;
4173   case 2:
4174     RC = &AArch64::FPR16RegClass;
4175     Opc = AArch64::LDRHui;
4176     break;
4177   default:
4178     LLVM_DEBUG(dbgs() << "Could not load from constant pool of type "
4179                       << *CPVal->getType());
4180     return nullptr;
4181   }
4182 
4183   MachineInstr *LoadMI = nullptr;
4184   auto &MF = MIRBuilder.getMF();
4185   unsigned CPIdx = emitConstantPoolEntry(CPVal, MF);
4186   if (IsTiny && (Size == 16 || Size == 8 || Size == 4)) {
4187     // Use load(literal) for tiny code model.
4188     LoadMI = &*MIRBuilder.buildInstr(Opc, {RC}, {}).addConstantPoolIndex(CPIdx);
4189   } else {
4190     auto Adrp =
4191         MIRBuilder.buildInstr(AArch64::ADRP, {&AArch64::GPR64RegClass}, {})
4192             .addConstantPoolIndex(CPIdx, 0, AArch64II::MO_PAGE);
4193 
4194     LoadMI = &*MIRBuilder.buildInstr(Opc, {RC}, {Adrp})
4195                    .addConstantPoolIndex(
4196                        CPIdx, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
4197 
4198     constrainSelectedInstRegOperands(*Adrp, TII, TRI, RBI);
4199   }
4200 
4201   MachinePointerInfo PtrInfo = MachinePointerInfo::getConstantPool(MF);
4202   LoadMI->addMemOperand(MF, MF.getMachineMemOperand(PtrInfo,
4203                                                     MachineMemOperand::MOLoad,
4204                                                     Size, Align(Size)));
4205   constrainSelectedInstRegOperands(*LoadMI, TII, TRI, RBI);
4206   return LoadMI;
4207 }
4208 
4209 /// Return an <Opcode, SubregIndex> pair to do an vector elt insert of a given
4210 /// size and RB.
4211 static std::pair<unsigned, unsigned>
4212 getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) {
4213   unsigned Opc, SubregIdx;
4214   if (RB.getID() == AArch64::GPRRegBankID) {
4215     if (EltSize == 8) {
4216       Opc = AArch64::INSvi8gpr;
4217       SubregIdx = AArch64::bsub;
4218     } else if (EltSize == 16) {
4219       Opc = AArch64::INSvi16gpr;
4220       SubregIdx = AArch64::ssub;
4221     } else if (EltSize == 32) {
4222       Opc = AArch64::INSvi32gpr;
4223       SubregIdx = AArch64::ssub;
4224     } else if (EltSize == 64) {
4225       Opc = AArch64::INSvi64gpr;
4226       SubregIdx = AArch64::dsub;
4227     } else {
4228       llvm_unreachable("invalid elt size!");
4229     }
4230   } else {
4231     if (EltSize == 8) {
4232       Opc = AArch64::INSvi8lane;
4233       SubregIdx = AArch64::bsub;
4234     } else if (EltSize == 16) {
4235       Opc = AArch64::INSvi16lane;
4236       SubregIdx = AArch64::hsub;
4237     } else if (EltSize == 32) {
4238       Opc = AArch64::INSvi32lane;
4239       SubregIdx = AArch64::ssub;
4240     } else if (EltSize == 64) {
4241       Opc = AArch64::INSvi64lane;
4242       SubregIdx = AArch64::dsub;
4243     } else {
4244       llvm_unreachable("invalid elt size!");
4245     }
4246   }
4247   return std::make_pair(Opc, SubregIdx);
4248 }
4249 
4250 MachineInstr *AArch64InstructionSelector::emitInstr(
4251     unsigned Opcode, std::initializer_list<llvm::DstOp> DstOps,
4252     std::initializer_list<llvm::SrcOp> SrcOps, MachineIRBuilder &MIRBuilder,
4253     const ComplexRendererFns &RenderFns) const {
4254   assert(Opcode && "Expected an opcode?");
4255   assert(!isPreISelGenericOpcode(Opcode) &&
4256          "Function should only be used to produce selected instructions!");
4257   auto MI = MIRBuilder.buildInstr(Opcode, DstOps, SrcOps);
4258   if (RenderFns)
4259     for (auto &Fn : *RenderFns)
4260       Fn(MI);
4261   constrainSelectedInstRegOperands(*MI, TII, TRI, RBI);
4262   return &*MI;
4263 }
4264 
4265 MachineInstr *AArch64InstructionSelector::emitAddSub(
4266     const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode,
4267     Register Dst, MachineOperand &LHS, MachineOperand &RHS,
4268     MachineIRBuilder &MIRBuilder) const {
4269   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4270   assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4271   auto Ty = MRI.getType(LHS.getReg());
4272   assert(!Ty.isVector() && "Expected a scalar or pointer?");
4273   unsigned Size = Ty.getSizeInBits();
4274   assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit type only");
4275   bool Is32Bit = Size == 32;
4276 
4277   // INSTRri form with positive arithmetic immediate.
4278   if (auto Fns = selectArithImmed(RHS))
4279     return emitInstr(AddrModeAndSizeToOpcode[0][Is32Bit], {Dst}, {LHS},
4280                      MIRBuilder, Fns);
4281 
4282   // INSTRri form with negative arithmetic immediate.
4283   if (auto Fns = selectNegArithImmed(RHS))
4284     return emitInstr(AddrModeAndSizeToOpcode[3][Is32Bit], {Dst}, {LHS},
4285                      MIRBuilder, Fns);
4286 
4287   // INSTRrx form.
4288   if (auto Fns = selectArithExtendedRegister(RHS))
4289     return emitInstr(AddrModeAndSizeToOpcode[4][Is32Bit], {Dst}, {LHS},
4290                      MIRBuilder, Fns);
4291 
4292   // INSTRrs form.
4293   if (auto Fns = selectShiftedRegister(RHS))
4294     return emitInstr(AddrModeAndSizeToOpcode[1][Is32Bit], {Dst}, {LHS},
4295                      MIRBuilder, Fns);
4296   return emitInstr(AddrModeAndSizeToOpcode[2][Is32Bit], {Dst}, {LHS, RHS},
4297                    MIRBuilder);
4298 }
4299 
4300 MachineInstr *
4301 AArch64InstructionSelector::emitADD(Register DefReg, MachineOperand &LHS,
4302                                     MachineOperand &RHS,
4303                                     MachineIRBuilder &MIRBuilder) const {
4304   const std::array<std::array<unsigned, 2>, 5> OpcTable{
4305       {{AArch64::ADDXri, AArch64::ADDWri},
4306        {AArch64::ADDXrs, AArch64::ADDWrs},
4307        {AArch64::ADDXrr, AArch64::ADDWrr},
4308        {AArch64::SUBXri, AArch64::SUBWri},
4309        {AArch64::ADDXrx, AArch64::ADDWrx}}};
4310   return emitAddSub(OpcTable, DefReg, LHS, RHS, MIRBuilder);
4311 }
4312 
4313 MachineInstr *
4314 AArch64InstructionSelector::emitADDS(Register Dst, MachineOperand &LHS,
4315                                      MachineOperand &RHS,
4316                                      MachineIRBuilder &MIRBuilder) const {
4317   const std::array<std::array<unsigned, 2>, 5> OpcTable{
4318       {{AArch64::ADDSXri, AArch64::ADDSWri},
4319        {AArch64::ADDSXrs, AArch64::ADDSWrs},
4320        {AArch64::ADDSXrr, AArch64::ADDSWrr},
4321        {AArch64::SUBSXri, AArch64::SUBSWri},
4322        {AArch64::ADDSXrx, AArch64::ADDSWrx}}};
4323   return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder);
4324 }
4325 
4326 MachineInstr *
4327 AArch64InstructionSelector::emitSUBS(Register Dst, MachineOperand &LHS,
4328                                      MachineOperand &RHS,
4329                                      MachineIRBuilder &MIRBuilder) const {
4330   const std::array<std::array<unsigned, 2>, 5> OpcTable{
4331       {{AArch64::SUBSXri, AArch64::SUBSWri},
4332        {AArch64::SUBSXrs, AArch64::SUBSWrs},
4333        {AArch64::SUBSXrr, AArch64::SUBSWrr},
4334        {AArch64::ADDSXri, AArch64::ADDSWri},
4335        {AArch64::SUBSXrx, AArch64::SUBSWrx}}};
4336   return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder);
4337 }
4338 
4339 MachineInstr *
4340 AArch64InstructionSelector::emitADCS(Register Dst, MachineOperand &LHS,
4341                                      MachineOperand &RHS,
4342                                      MachineIRBuilder &MIRBuilder) const {
4343   assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4344   MachineRegisterInfo *MRI = MIRBuilder.getMRI();
4345   bool Is32Bit = (MRI->getType(LHS.getReg()).getSizeInBits() == 32);
4346   static const unsigned OpcTable[2] = {AArch64::ADCSXr, AArch64::ADCSWr};
4347   return emitInstr(OpcTable[Is32Bit], {Dst}, {LHS, RHS}, MIRBuilder);
4348 }
4349 
4350 MachineInstr *
4351 AArch64InstructionSelector::emitSBCS(Register Dst, MachineOperand &LHS,
4352                                      MachineOperand &RHS,
4353                                      MachineIRBuilder &MIRBuilder) const {
4354   assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4355   MachineRegisterInfo *MRI = MIRBuilder.getMRI();
4356   bool Is32Bit = (MRI->getType(LHS.getReg()).getSizeInBits() == 32);
4357   static const unsigned OpcTable[2] = {AArch64::SBCSXr, AArch64::SBCSWr};
4358   return emitInstr(OpcTable[Is32Bit], {Dst}, {LHS, RHS}, MIRBuilder);
4359 }
4360 
4361 MachineInstr *
4362 AArch64InstructionSelector::emitCMN(MachineOperand &LHS, MachineOperand &RHS,
4363                                     MachineIRBuilder &MIRBuilder) const {
4364   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4365   bool Is32Bit = (MRI.getType(LHS.getReg()).getSizeInBits() == 32);
4366   auto RC = Is32Bit ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass;
4367   return emitADDS(MRI.createVirtualRegister(RC), LHS, RHS, MIRBuilder);
4368 }
4369 
4370 MachineInstr *
4371 AArch64InstructionSelector::emitTST(MachineOperand &LHS, MachineOperand &RHS,
4372                                     MachineIRBuilder &MIRBuilder) const {
4373   assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4374   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4375   LLT Ty = MRI.getType(LHS.getReg());
4376   unsigned RegSize = Ty.getSizeInBits();
4377   bool Is32Bit = (RegSize == 32);
4378   const unsigned OpcTable[3][2] = {{AArch64::ANDSXri, AArch64::ANDSWri},
4379                                    {AArch64::ANDSXrs, AArch64::ANDSWrs},
4380                                    {AArch64::ANDSXrr, AArch64::ANDSWrr}};
4381   // ANDS needs a logical immediate for its immediate form. Check if we can
4382   // fold one in.
4383   if (auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS.getReg(), MRI)) {
4384     int64_t Imm = ValAndVReg->Value.getSExtValue();
4385 
4386     if (AArch64_AM::isLogicalImmediate(Imm, RegSize)) {
4387       auto TstMI = MIRBuilder.buildInstr(OpcTable[0][Is32Bit], {Ty}, {LHS});
4388       TstMI.addImm(AArch64_AM::encodeLogicalImmediate(Imm, RegSize));
4389       constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
4390       return &*TstMI;
4391     }
4392   }
4393 
4394   if (auto Fns = selectLogicalShiftedRegister(RHS))
4395     return emitInstr(OpcTable[1][Is32Bit], {Ty}, {LHS}, MIRBuilder, Fns);
4396   return emitInstr(OpcTable[2][Is32Bit], {Ty}, {LHS, RHS}, MIRBuilder);
4397 }
4398 
4399 MachineInstr *AArch64InstructionSelector::emitIntegerCompare(
4400     MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
4401     MachineIRBuilder &MIRBuilder) const {
4402   assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!");
4403   assert(Predicate.isPredicate() && "Expected predicate?");
4404   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4405   LLT CmpTy = MRI.getType(LHS.getReg());
4406   assert(!CmpTy.isVector() && "Expected scalar or pointer");
4407   unsigned Size = CmpTy.getSizeInBits();
4408   (void)Size;
4409   assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit LHS/RHS?");
4410   // Fold the compare into a cmn or tst if possible.
4411   if (auto FoldCmp = tryFoldIntegerCompare(LHS, RHS, Predicate, MIRBuilder))
4412     return FoldCmp;
4413   auto Dst = MRI.cloneVirtualRegister(LHS.getReg());
4414   return emitSUBS(Dst, LHS, RHS, MIRBuilder);
4415 }
4416 
4417 MachineInstr *AArch64InstructionSelector::emitCSetForFCmp(
4418     Register Dst, CmpInst::Predicate Pred, MachineIRBuilder &MIRBuilder) const {
4419   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4420 #ifndef NDEBUG
4421   LLT Ty = MRI.getType(Dst);
4422   assert(!Ty.isVector() && Ty.getSizeInBits() == 32 &&
4423          "Expected a 32-bit scalar register?");
4424 #endif
4425   const Register ZReg = AArch64::WZR;
4426   AArch64CC::CondCode CC1, CC2;
4427   changeFCMPPredToAArch64CC(Pred, CC1, CC2);
4428   auto InvCC1 = AArch64CC::getInvertedCondCode(CC1);
4429   if (CC2 == AArch64CC::AL)
4430     return emitCSINC(/*Dst=*/Dst, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC1,
4431                      MIRBuilder);
4432   const TargetRegisterClass *RC = &AArch64::GPR32RegClass;
4433   Register Def1Reg = MRI.createVirtualRegister(RC);
4434   Register Def2Reg = MRI.createVirtualRegister(RC);
4435   auto InvCC2 = AArch64CC::getInvertedCondCode(CC2);
4436   emitCSINC(/*Dst=*/Def1Reg, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC1, MIRBuilder);
4437   emitCSINC(/*Dst=*/Def2Reg, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC2, MIRBuilder);
4438   auto OrMI = MIRBuilder.buildInstr(AArch64::ORRWrr, {Dst}, {Def1Reg, Def2Reg});
4439   constrainSelectedInstRegOperands(*OrMI, TII, TRI, RBI);
4440   return &*OrMI;
4441 }
4442 
4443 MachineInstr *AArch64InstructionSelector::emitFPCompare(
4444     Register LHS, Register RHS, MachineIRBuilder &MIRBuilder,
4445     std::optional<CmpInst::Predicate> Pred) const {
4446   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4447   LLT Ty = MRI.getType(LHS);
4448   if (Ty.isVector())
4449     return nullptr;
4450   unsigned OpSize = Ty.getSizeInBits();
4451   assert(OpSize == 16 || OpSize == 32 || OpSize == 64);
4452 
4453   // If this is a compare against +0.0, then we don't have
4454   // to explicitly materialize a constant.
4455   const ConstantFP *FPImm = getConstantFPVRegVal(RHS, MRI);
4456   bool ShouldUseImm = FPImm && (FPImm->isZero() && !FPImm->isNegative());
4457 
4458   auto IsEqualityPred = [](CmpInst::Predicate P) {
4459     return P == CmpInst::FCMP_OEQ || P == CmpInst::FCMP_ONE ||
4460            P == CmpInst::FCMP_UEQ || P == CmpInst::FCMP_UNE;
4461   };
4462   if (!ShouldUseImm && Pred && IsEqualityPred(*Pred)) {
4463     // Try commutating the operands.
4464     const ConstantFP *LHSImm = getConstantFPVRegVal(LHS, MRI);
4465     if (LHSImm && (LHSImm->isZero() && !LHSImm->isNegative())) {
4466       ShouldUseImm = true;
4467       std::swap(LHS, RHS);
4468     }
4469   }
4470   unsigned CmpOpcTbl[2][3] = {
4471       {AArch64::FCMPHrr, AArch64::FCMPSrr, AArch64::FCMPDrr},
4472       {AArch64::FCMPHri, AArch64::FCMPSri, AArch64::FCMPDri}};
4473   unsigned CmpOpc =
4474       CmpOpcTbl[ShouldUseImm][OpSize == 16 ? 0 : (OpSize == 32 ? 1 : 2)];
4475 
4476   // Partially build the compare. Decide if we need to add a use for the
4477   // third operand based off whether or not we're comparing against 0.0.
4478   auto CmpMI = MIRBuilder.buildInstr(CmpOpc).addUse(LHS);
4479   CmpMI.setMIFlags(MachineInstr::NoFPExcept);
4480   if (!ShouldUseImm)
4481     CmpMI.addUse(RHS);
4482   constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI);
4483   return &*CmpMI;
4484 }
4485 
4486 MachineInstr *AArch64InstructionSelector::emitVectorConcat(
4487     std::optional<Register> Dst, Register Op1, Register Op2,
4488     MachineIRBuilder &MIRBuilder) const {
4489   // We implement a vector concat by:
4490   // 1. Use scalar_to_vector to insert the lower vector into the larger dest
4491   // 2. Insert the upper vector into the destination's upper element
4492   // TODO: some of this code is common with G_BUILD_VECTOR handling.
4493   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4494 
4495   const LLT Op1Ty = MRI.getType(Op1);
4496   const LLT Op2Ty = MRI.getType(Op2);
4497 
4498   if (Op1Ty != Op2Ty) {
4499     LLVM_DEBUG(dbgs() << "Could not do vector concat of differing vector tys");
4500     return nullptr;
4501   }
4502   assert(Op1Ty.isVector() && "Expected a vector for vector concat");
4503 
4504   if (Op1Ty.getSizeInBits() >= 128) {
4505     LLVM_DEBUG(dbgs() << "Vector concat not supported for full size vectors");
4506     return nullptr;
4507   }
4508 
4509   // At the moment we just support 64 bit vector concats.
4510   if (Op1Ty.getSizeInBits() != 64) {
4511     LLVM_DEBUG(dbgs() << "Vector concat supported for 64b vectors");
4512     return nullptr;
4513   }
4514 
4515   const LLT ScalarTy = LLT::scalar(Op1Ty.getSizeInBits());
4516   const RegisterBank &FPRBank = *RBI.getRegBank(Op1, MRI, TRI);
4517   const TargetRegisterClass *DstRC =
4518       getRegClassForTypeOnBank(Op1Ty.multiplyElements(2), FPRBank);
4519 
4520   MachineInstr *WidenedOp1 =
4521       emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op1, MIRBuilder);
4522   MachineInstr *WidenedOp2 =
4523       emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op2, MIRBuilder);
4524   if (!WidenedOp1 || !WidenedOp2) {
4525     LLVM_DEBUG(dbgs() << "Could not emit a vector from scalar value");
4526     return nullptr;
4527   }
4528 
4529   // Now do the insert of the upper element.
4530   unsigned InsertOpc, InsSubRegIdx;
4531   std::tie(InsertOpc, InsSubRegIdx) =
4532       getInsertVecEltOpInfo(FPRBank, ScalarTy.getSizeInBits());
4533 
4534   if (!Dst)
4535     Dst = MRI.createVirtualRegister(DstRC);
4536   auto InsElt =
4537       MIRBuilder
4538           .buildInstr(InsertOpc, {*Dst}, {WidenedOp1->getOperand(0).getReg()})
4539           .addImm(1) /* Lane index */
4540           .addUse(WidenedOp2->getOperand(0).getReg())
4541           .addImm(0);
4542   constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI);
4543   return &*InsElt;
4544 }
4545 
4546 MachineInstr *
4547 AArch64InstructionSelector::emitCSINC(Register Dst, Register Src1,
4548                                       Register Src2, AArch64CC::CondCode Pred,
4549                                       MachineIRBuilder &MIRBuilder) const {
4550   auto &MRI = *MIRBuilder.getMRI();
4551   const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Dst);
4552   // If we used a register class, then this won't necessarily have an LLT.
4553   // Compute the size based off whether or not we have a class or bank.
4554   unsigned Size;
4555   if (const auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
4556     Size = TRI.getRegSizeInBits(*RC);
4557   else
4558     Size = MRI.getType(Dst).getSizeInBits();
4559   // Some opcodes use s1.
4560   assert(Size <= 64 && "Expected 64 bits or less only!");
4561   static const unsigned OpcTable[2] = {AArch64::CSINCWr, AArch64::CSINCXr};
4562   unsigned Opc = OpcTable[Size == 64];
4563   auto CSINC = MIRBuilder.buildInstr(Opc, {Dst}, {Src1, Src2}).addImm(Pred);
4564   constrainSelectedInstRegOperands(*CSINC, TII, TRI, RBI);
4565   return &*CSINC;
4566 }
4567 
4568 MachineInstr *AArch64InstructionSelector::emitCarryIn(MachineInstr &I,
4569                                                       Register CarryReg) {
4570   MachineRegisterInfo *MRI = MIB.getMRI();
4571   unsigned Opcode = I.getOpcode();
4572 
4573   // If the instruction is a SUB, we need to negate the carry,
4574   // because borrowing is indicated by carry-flag == 0.
4575   bool NeedsNegatedCarry =
4576       (Opcode == TargetOpcode::G_USUBE || Opcode == TargetOpcode::G_SSUBE);
4577 
4578   // If the previous instruction will already produce the correct carry, do not
4579   // emit a carry generating instruction. E.g. for G_UADDE/G_USUBE sequences
4580   // generated during legalization of wide add/sub. This optimization depends on
4581   // these sequences not being interrupted by other instructions.
4582   // We have to select the previous instruction before the carry-using
4583   // instruction is deleted by the calling function, otherwise the previous
4584   // instruction might become dead and would get deleted.
4585   MachineInstr *SrcMI = MRI->getVRegDef(CarryReg);
4586   if (SrcMI == I.getPrevNode()) {
4587     if (auto *CarrySrcMI = dyn_cast<GAddSubCarryOut>(SrcMI)) {
4588       bool ProducesNegatedCarry = CarrySrcMI->isSub();
4589       if (NeedsNegatedCarry == ProducesNegatedCarry &&
4590           CarrySrcMI->isUnsigned() &&
4591           CarrySrcMI->getCarryOutReg() == CarryReg &&
4592           selectAndRestoreState(*SrcMI))
4593         return nullptr;
4594     }
4595   }
4596 
4597   Register DeadReg = MRI->createVirtualRegister(&AArch64::GPR32RegClass);
4598 
4599   if (NeedsNegatedCarry) {
4600     // (0 - Carry) sets !C in NZCV when Carry == 1
4601     Register ZReg = AArch64::WZR;
4602     return emitInstr(AArch64::SUBSWrr, {DeadReg}, {ZReg, CarryReg}, MIB);
4603   }
4604 
4605   // (Carry - 1) sets !C in NZCV when Carry == 0
4606   auto Fns = select12BitValueWithLeftShift(1);
4607   return emitInstr(AArch64::SUBSWri, {DeadReg}, {CarryReg}, MIB, Fns);
4608 }
4609 
4610 bool AArch64InstructionSelector::selectOverflowOp(MachineInstr &I,
4611                                                   MachineRegisterInfo &MRI) {
4612   auto &CarryMI = cast<GAddSubCarryOut>(I);
4613 
4614   if (auto *CarryInMI = dyn_cast<GAddSubCarryInOut>(&I)) {
4615     // Set NZCV carry according to carry-in VReg
4616     emitCarryIn(I, CarryInMI->getCarryInReg());
4617   }
4618 
4619   // Emit the operation and get the correct condition code.
4620   auto OpAndCC = emitOverflowOp(I.getOpcode(), CarryMI.getDstReg(),
4621                                 CarryMI.getLHS(), CarryMI.getRHS(), MIB);
4622 
4623   Register CarryOutReg = CarryMI.getCarryOutReg();
4624 
4625   // Don't convert carry-out to VReg if it is never used
4626   if (!MRI.use_nodbg_empty(CarryOutReg)) {
4627     // Now, put the overflow result in the register given by the first operand
4628     // to the overflow op. CSINC increments the result when the predicate is
4629     // false, so to get the increment when it's true, we need to use the
4630     // inverse. In this case, we want to increment when carry is set.
4631     Register ZReg = AArch64::WZR;
4632     emitCSINC(/*Dst=*/CarryOutReg, /*Src1=*/ZReg, /*Src2=*/ZReg,
4633               getInvertedCondCode(OpAndCC.second), MIB);
4634   }
4635 
4636   I.eraseFromParent();
4637   return true;
4638 }
4639 
4640 std::pair<MachineInstr *, AArch64CC::CondCode>
4641 AArch64InstructionSelector::emitOverflowOp(unsigned Opcode, Register Dst,
4642                                            MachineOperand &LHS,
4643                                            MachineOperand &RHS,
4644                                            MachineIRBuilder &MIRBuilder) const {
4645   switch (Opcode) {
4646   default:
4647     llvm_unreachable("Unexpected opcode!");
4648   case TargetOpcode::G_SADDO:
4649     return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
4650   case TargetOpcode::G_UADDO:
4651     return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::HS);
4652   case TargetOpcode::G_SSUBO:
4653     return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
4654   case TargetOpcode::G_USUBO:
4655     return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::LO);
4656   case TargetOpcode::G_SADDE:
4657     return std::make_pair(emitADCS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
4658   case TargetOpcode::G_UADDE:
4659     return std::make_pair(emitADCS(Dst, LHS, RHS, MIRBuilder), AArch64CC::HS);
4660   case TargetOpcode::G_SSUBE:
4661     return std::make_pair(emitSBCS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
4662   case TargetOpcode::G_USUBE:
4663     return std::make_pair(emitSBCS(Dst, LHS, RHS, MIRBuilder), AArch64CC::LO);
4664   }
4665 }
4666 
/// Returns true if @p Val is a tree of AND/OR/CMP operations that can be
/// expressed as a conjunction.
///
/// Every value in the tree must have exactly one non-debug use, the leaves
/// must be compares, and the AND/OR nesting is limited by \p Depth.
///
/// \param CanNegate    Set to true if we can negate the whole sub-tree just by
///                     changing the conditions on the CMP tests.
///                     (this means we can call emitConjunctionRec() with
///                      Negate==true on this sub-tree)
/// \param MustBeFirst  Set to true if this subtree needs to be negated and we
///                     cannot do the negation naturally. We are required to
///                     emit the subtree first in this case.
/// \param WillNegate   Is true if we are called when the result of this
///                     subexpression must be negated. This happens when the
///                     outer expression is an OR. We can use this fact to know
///                     that we have a double negation (or (or ...) ...) that
///                     can be implemented for free.
/// \param Depth        Current recursion depth; callers outside this function
///                     leave it at the default of 0.
///
/// \p CanNegate and \p MustBeFirst are only written when the function returns
/// true.
static bool canEmitConjunction(Register Val, bool &CanNegate, bool &MustBeFirst,
                               bool WillNegate, MachineRegisterInfo &MRI,
                               unsigned Depth = 0) {
  // The value is folded into its single user, so it must not be used elsewhere.
  if (!MRI.hasOneNonDBGUse(Val))
    return false;
  MachineInstr *ValDef = MRI.getVRegDef(Val);
  unsigned Opcode = ValDef->getOpcode();
  // Leaf: a compare can always be negated by inverting its predicate.
  if (isa<GAnyCmp>(ValDef)) {
    CanNegate = true;
    MustBeFirst = false;
    return true;
  }
  // Protect against exponential runtime and stack overflow.
  if (Depth > 6)
    return false;
  if (Opcode == TargetOpcode::G_AND || Opcode == TargetOpcode::G_OR) {
    bool IsOR = Opcode == TargetOpcode::G_OR;
    Register O0 = ValDef->getOperand(1).getReg();
    Register O1 = ValDef->getOperand(2).getReg();
    // Operands of an OR will be negated by the emitter, hence WillNegate=IsOR.
    bool CanNegateL;
    bool MustBeFirstL;
    if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, MRI, Depth + 1))
      return false;
    bool CanNegateR;
    bool MustBeFirstR;
    if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, MRI, Depth + 1))
      return false;

    // Only one sub-tree can be emitted first.
    if (MustBeFirstL && MustBeFirstR)
      return false;

    if (IsOR) {
      // For an OR expression we need to be able to naturally negate at least
      // one side or we cannot do the transformation at all.
      if (!CanNegateL && !CanNegateR)
        return false;
      // If the result of the OR will be negated and we can naturally negate
      // the leaves, then this sub-tree as a whole negates naturally.
      CanNegate = WillNegate && CanNegateL && CanNegateR;
      // If we cannot naturally negate the whole sub-tree, then this must be
      // emitted first.
      MustBeFirst = !CanNegate;
    } else {
      assert(Opcode == TargetOpcode::G_AND && "Must be G_AND");
      // We cannot naturally negate an AND operation.
      CanNegate = false;
      MustBeFirst = MustBeFirstL || MustBeFirstR;
    }
    return true;
  }
  // Anything other than a compare, G_AND or G_OR cannot be part of the tree.
  return false;
}
4733 
4734 MachineInstr *AArch64InstructionSelector::emitConditionalComparison(
4735     Register LHS, Register RHS, CmpInst::Predicate CC,
4736     AArch64CC::CondCode Predicate, AArch64CC::CondCode OutCC,
4737     MachineIRBuilder &MIB) const {
4738   auto &MRI = *MIB.getMRI();
4739   LLT OpTy = MRI.getType(LHS);
4740   unsigned CCmpOpc;
4741   std::optional<ValueAndVReg> C;
4742   if (CmpInst::isIntPredicate(CC)) {
4743     assert(OpTy.getSizeInBits() == 32 || OpTy.getSizeInBits() == 64);
4744     C = getIConstantVRegValWithLookThrough(RHS, MRI);
4745     if (!C || C->Value.sgt(31) || C->Value.slt(-31))
4746       CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWr : AArch64::CCMPXr;
4747     else if (C->Value.ule(31))
4748       CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWi : AArch64::CCMPXi;
4749     else
4750       CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMNWi : AArch64::CCMNXi;
4751   } else {
4752     assert(OpTy.getSizeInBits() == 16 || OpTy.getSizeInBits() == 32 ||
4753            OpTy.getSizeInBits() == 64);
4754     switch (OpTy.getSizeInBits()) {
4755     case 16:
4756       assert(STI.hasFullFP16() && "Expected Full FP16 for fp16 comparisons");
4757       CCmpOpc = AArch64::FCCMPHrr;
4758       break;
4759     case 32:
4760       CCmpOpc = AArch64::FCCMPSrr;
4761       break;
4762     case 64:
4763       CCmpOpc = AArch64::FCCMPDrr;
4764       break;
4765     default:
4766       return nullptr;
4767     }
4768   }
4769   AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
4770   unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
4771   auto CCmp =
4772       MIB.buildInstr(CCmpOpc, {}, {LHS});
4773   if (CCmpOpc == AArch64::CCMPWi || CCmpOpc == AArch64::CCMPXi)
4774     CCmp.addImm(C->Value.getZExtValue());
4775   else if (CCmpOpc == AArch64::CCMNWi || CCmpOpc == AArch64::CCMNXi)
4776     CCmp.addImm(C->Value.abs().getZExtValue());
4777   else
4778     CCmp.addReg(RHS);
4779   CCmp.addImm(NZCV).addImm(Predicate);
4780   constrainSelectedInstRegOperands(*CCmp, TII, TRI, RBI);
4781   return &*CCmp;
4782 }
4783 
/// Recursively emit the compare / conditional-compare chain for the
/// AND/OR/CMP tree rooted at \p Val. The tree is expected to have been
/// accepted by canEmitConjunction(); this is only checked with asserts.
///
/// \param OutCC      Set to the condition code that tests the result of the
///                   emitted chain.
/// \param Negate     If true, the leaf compare is emitted with its predicate
///                   inverted. Asserted to be false for G_AND nodes.
/// \param CCOp       Null when this is the first comparison of the chain, in
///                   which case a normal compare is emitted. Otherwise a
///                   conditional compare is emitted. Within this function the
///                   register is only tested for being non-null.
/// \param Predicate  Condition code passed on to the conditional compare when
///                   \p CCOp is non-null.
/// \returns the last instruction emitted for this sub-tree.
MachineInstr *AArch64InstructionSelector::emitConjunctionRec(
    Register Val, AArch64CC::CondCode &OutCC, bool Negate, Register CCOp,
    AArch64CC::CondCode Predicate, MachineIRBuilder &MIB) const {
  // We're at a tree leaf, produce a conditional comparison operation.
  auto &MRI = *MIB.getMRI();
  MachineInstr *ValDef = MRI.getVRegDef(Val);
  unsigned Opcode = ValDef->getOpcode();
  if (auto *Cmp = dyn_cast<GAnyCmp>(ValDef)) {
    Register LHS = Cmp->getLHSReg();
    Register RHS = Cmp->getRHSReg();
    CmpInst::Predicate CC = Cmp->getCond();
    if (Negate)
      CC = CmpInst::getInversePredicate(CC);
    if (isa<GICmp>(Cmp)) {
      OutCC = changeICMPPredToAArch64CC(CC);
    } else {
      // Handle special FP cases.
      AArch64CC::CondCode ExtraCC;
      changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
      // Some floating point conditions can't be tested with a single condition
      // code. Construct an additional comparison in this case.
      if (ExtraCC != AArch64CC::AL) {
        MachineInstr *ExtraCmp;
        if (!CCOp)
          ExtraCmp = emitFPCompare(LHS, RHS, MIB, CC);
        else
          ExtraCmp =
              emitConditionalComparison(LHS, RHS, CC, Predicate, ExtraCC, MIB);
        // The extra compare becomes the previous link of the chain, so the
        // compare emitted below is made conditional on ExtraCC.
        CCOp = ExtraCmp->getOperand(0).getReg();
        Predicate = ExtraCC;
      }
    }

    // Produce a normal comparison if we are first in the chain
    if (!CCOp) {
      auto Dst = MRI.cloneVirtualRegister(LHS);
      if (isa<GICmp>(Cmp))
        return emitSUBS(Dst, Cmp->getOperand(2), Cmp->getOperand(3), MIB);
      return emitFPCompare(Cmp->getOperand(2).getReg(),
                           Cmp->getOperand(3).getReg(), MIB);
    }
    // Otherwise produce a ccmp.
    return emitConditionalComparison(LHS, RHS, CC, Predicate, OutCC, MIB);
  }
  // Not a leaf: Val is expected to be a G_AND or G_OR with a single use.
  assert(MRI.hasOneNonDBGUse(Val) && "Valid conjunction/disjunction tree");

  bool IsOR = Opcode == TargetOpcode::G_OR;

  // Re-query the properties of both sub-trees; they decide the emission order
  // and where negations are applied.
  Register LHS = ValDef->getOperand(1).getReg();
  bool CanNegateL;
  bool MustBeFirstL;
  bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR, MRI);
  assert(ValidL && "Valid conjunction/disjunction tree");
  (void)ValidL;

  Register RHS = ValDef->getOperand(2).getReg();
  bool CanNegateR;
  bool MustBeFirstR;
  bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR, MRI);
  assert(ValidR && "Valid conjunction/disjunction tree");
  (void)ValidR;

  // Swap sub-tree that must come first to the right side.
  if (MustBeFirstL) {
    assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
    std::swap(LHS, RHS);
    std::swap(CanNegateL, CanNegateR);
    std::swap(MustBeFirstL, MustBeFirstR);
  }

  // NegateR / NegateL: emit that sub-tree negated.
  // NegateAfterR:      invert the condition code produced by the right
  //                    sub-tree after it has been emitted.
  // NegateAfterAll:    invert the final condition code of this node.
  bool NegateR;
  bool NegateAfterR;
  bool NegateL;
  bool NegateAfterAll;
  if (Opcode == TargetOpcode::G_OR) {
    // Swap the sub-tree that we can negate naturally to the left.
    if (!CanNegateL) {
      assert(CanNegateR && "at least one side must be negatable");
      assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
      assert(!Negate);
      std::swap(LHS, RHS);
      NegateR = false;
      NegateAfterR = true;
    } else {
      // Negate the right sub-tree if possible, otherwise invert its
      // condition code after it has been emitted.
      NegateR = CanNegateR;
      NegateAfterR = !CanNegateR;
    }
    NegateL = true;
    NegateAfterAll = !Negate;
  } else {
    assert(Opcode == TargetOpcode::G_AND &&
           "Valid conjunction/disjunction tree");
    assert(!Negate && "Valid conjunction/disjunction tree");

    NegateL = false;
    NegateR = false;
    NegateAfterR = false;
    NegateAfterAll = false;
  }

  // Emit sub-trees. The right sub-tree is emitted first; the left one is then
  // chained onto it, using the right sub-tree's condition code as predicate.
  AArch64CC::CondCode RHSCC;
  MachineInstr *CmpR =
      emitConjunctionRec(RHS, RHSCC, NegateR, CCOp, Predicate, MIB);
  if (NegateAfterR)
    RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
  MachineInstr *CmpL = emitConjunctionRec(
      LHS, OutCC, NegateL, CmpR->getOperand(0).getReg(), RHSCC, MIB);
  if (NegateAfterAll)
    OutCC = AArch64CC::getInvertedCondCode(OutCC);
  return CmpL;
}
4897 
4898 MachineInstr *AArch64InstructionSelector::emitConjunction(
4899     Register Val, AArch64CC::CondCode &OutCC, MachineIRBuilder &MIB) const {
4900   bool DummyCanNegate;
4901   bool DummyMustBeFirst;
4902   if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false,
4903                           *MIB.getMRI()))
4904     return nullptr;
4905   return emitConjunctionRec(Val, OutCC, false, Register(), AArch64CC::AL, MIB);
4906 }
4907 
4908 bool AArch64InstructionSelector::tryOptSelectConjunction(GSelect &SelI,
4909                                                          MachineInstr &CondMI) {
4910   AArch64CC::CondCode AArch64CC;
4911   MachineInstr *ConjMI = emitConjunction(SelI.getCondReg(), AArch64CC, MIB);
4912   if (!ConjMI)
4913     return false;
4914 
4915   emitSelect(SelI.getReg(0), SelI.getTrueReg(), SelI.getFalseReg(), AArch64CC, MIB);
4916   SelI.eraseFromParent();
4917   return true;
4918 }
4919 
4920 bool AArch64InstructionSelector::tryOptSelect(GSelect &I) {
4921   MachineRegisterInfo &MRI = *MIB.getMRI();
4922   // We want to recognize this pattern:
4923   //
4924   // $z = G_FCMP pred, $x, $y
4925   // ...
4926   // $w = G_SELECT $z, $a, $b
4927   //
4928   // Where the value of $z is *only* ever used by the G_SELECT (possibly with
4929   // some copies/truncs in between.)
4930   //
4931   // If we see this, then we can emit something like this:
4932   //
4933   // fcmp $x, $y
4934   // fcsel $w, $a, $b, pred
4935   //
4936   // Rather than emitting both of the rather long sequences in the standard
4937   // G_FCMP/G_SELECT select methods.
4938 
4939   // First, check if the condition is defined by a compare.
4940   MachineInstr *CondDef = MRI.getVRegDef(I.getOperand(1).getReg());
4941 
4942   // We can only fold if all of the defs have one use.
4943   Register CondDefReg = CondDef->getOperand(0).getReg();
4944   if (!MRI.hasOneNonDBGUse(CondDefReg)) {
4945     // Unless it's another select.
4946     for (const MachineInstr &UI : MRI.use_nodbg_instructions(CondDefReg)) {
4947       if (CondDef == &UI)
4948         continue;
4949       if (UI.getOpcode() != TargetOpcode::G_SELECT)
4950         return false;
4951     }
4952   }
4953 
4954   // Is the condition defined by a compare?
4955   unsigned CondOpc = CondDef->getOpcode();
4956   if (CondOpc != TargetOpcode::G_ICMP && CondOpc != TargetOpcode::G_FCMP) {
4957     if (tryOptSelectConjunction(I, *CondDef))
4958       return true;
4959     return false;
4960   }
4961 
4962   AArch64CC::CondCode CondCode;
4963   if (CondOpc == TargetOpcode::G_ICMP) {
4964     auto Pred =
4965         static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate());
4966     CondCode = changeICMPPredToAArch64CC(Pred);
4967     emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3),
4968                        CondDef->getOperand(1), MIB);
4969   } else {
4970     // Get the condition code for the select.
4971     auto Pred =
4972         static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate());
4973     AArch64CC::CondCode CondCode2;
4974     changeFCMPPredToAArch64CC(Pred, CondCode, CondCode2);
4975 
4976     // changeFCMPPredToAArch64CC sets CondCode2 to AL when we require two
4977     // instructions to emit the comparison.
4978     // TODO: Handle FCMP_UEQ and FCMP_ONE. After that, this check will be
4979     // unnecessary.
4980     if (CondCode2 != AArch64CC::AL)
4981       return false;
4982 
4983     if (!emitFPCompare(CondDef->getOperand(2).getReg(),
4984                        CondDef->getOperand(3).getReg(), MIB)) {
4985       LLVM_DEBUG(dbgs() << "Couldn't emit compare for select!\n");
4986       return false;
4987     }
4988   }
4989 
4990   // Emit the select.
4991   emitSelect(I.getOperand(0).getReg(), I.getOperand(2).getReg(),
4992              I.getOperand(3).getReg(), CondCode, MIB);
4993   I.eraseFromParent();
4994   return true;
4995 }
4996 
4997 MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare(
4998     MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
4999     MachineIRBuilder &MIRBuilder) const {
5000   assert(LHS.isReg() && RHS.isReg() && Predicate.isPredicate() &&
5001          "Unexpected MachineOperand");
5002   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
5003   // We want to find this sort of thing:
5004   // x = G_SUB 0, y
5005   // G_ICMP z, x
5006   //
5007   // In this case, we can fold the G_SUB into the G_ICMP using a CMN instead.
5008   // e.g:
5009   //
5010   // cmn z, y
5011 
5012   // Check if the RHS or LHS of the G_ICMP is defined by a SUB
5013   MachineInstr *LHSDef = getDefIgnoringCopies(LHS.getReg(), MRI);
5014   MachineInstr *RHSDef = getDefIgnoringCopies(RHS.getReg(), MRI);
5015   auto P = static_cast<CmpInst::Predicate>(Predicate.getPredicate());
5016   // Given this:
5017   //
5018   // x = G_SUB 0, y
5019   // G_ICMP x, z
5020   //
5021   // Produce this:
5022   //
5023   // cmn y, z
5024   if (isCMN(LHSDef, P, MRI))
5025     return emitCMN(LHSDef->getOperand(2), RHS, MIRBuilder);
5026 
5027   // Same idea here, but with the RHS of the compare instead:
5028   //
5029   // Given this:
5030   //
5031   // x = G_SUB 0, y
5032   // G_ICMP z, x
5033   //
5034   // Produce this:
5035   //
5036   // cmn z, y
5037   if (isCMN(RHSDef, P, MRI))
5038     return emitCMN(LHS, RHSDef->getOperand(2), MIRBuilder);
5039 
5040   // Given this:
5041   //
5042   // z = G_AND x, y
5043   // G_ICMP z, 0
5044   //
5045   // Produce this if the compare is signed:
5046   //
5047   // tst x, y
5048   if (!CmpInst::isUnsigned(P) && LHSDef &&
5049       LHSDef->getOpcode() == TargetOpcode::G_AND) {
5050     // Make sure that the RHS is 0.
5051     auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS.getReg(), MRI);
5052     if (!ValAndVReg || ValAndVReg->Value != 0)
5053       return nullptr;
5054 
5055     return emitTST(LHSDef->getOperand(1),
5056                    LHSDef->getOperand(2), MIRBuilder);
5057   }
5058 
5059   return nullptr;
5060 }
5061 
5062 bool AArch64InstructionSelector::selectShuffleVector(
5063     MachineInstr &I, MachineRegisterInfo &MRI) {
5064   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
5065   Register Src1Reg = I.getOperand(1).getReg();
5066   const LLT Src1Ty = MRI.getType(Src1Reg);
5067   Register Src2Reg = I.getOperand(2).getReg();
5068   const LLT Src2Ty = MRI.getType(Src2Reg);
5069   ArrayRef<int> Mask = I.getOperand(3).getShuffleMask();
5070 
5071   MachineBasicBlock &MBB = *I.getParent();
5072   MachineFunction &MF = *MBB.getParent();
5073   LLVMContext &Ctx = MF.getFunction().getContext();
5074 
5075   // G_SHUFFLE_VECTOR is weird in that the source operands can be scalars, if
5076   // it's originated from a <1 x T> type. Those should have been lowered into
5077   // G_BUILD_VECTOR earlier.
5078   if (!Src1Ty.isVector() || !Src2Ty.isVector()) {
5079     LLVM_DEBUG(dbgs() << "Could not select a \"scalar\" G_SHUFFLE_VECTOR\n");
5080     return false;
5081   }
5082 
5083   unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8;
5084 
5085   SmallVector<Constant *, 64> CstIdxs;
5086   for (int Val : Mask) {
5087     // For now, any undef indexes we'll just assume to be 0. This should be
5088     // optimized in future, e.g. to select DUP etc.
5089     Val = Val < 0 ? 0 : Val;
5090     for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
5091       unsigned Offset = Byte + Val * BytesPerElt;
5092       CstIdxs.emplace_back(ConstantInt::get(Type::getInt8Ty(Ctx), Offset));
5093     }
5094   }
5095 
5096   // Use a constant pool to load the index vector for TBL.
5097   Constant *CPVal = ConstantVector::get(CstIdxs);
5098   MachineInstr *IndexLoad = emitLoadFromConstantPool(CPVal, MIB);
5099   if (!IndexLoad) {
5100     LLVM_DEBUG(dbgs() << "Could not load from a constant pool");
5101     return false;
5102   }
5103 
5104   if (DstTy.getSizeInBits() != 128) {
5105     assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty");
5106     // This case can be done with TBL1.
5107     MachineInstr *Concat =
5108         emitVectorConcat(std::nullopt, Src1Reg, Src2Reg, MIB);
5109     if (!Concat) {
5110       LLVM_DEBUG(dbgs() << "Could not do vector concat for tbl1");
5111       return false;
5112     }
5113 
5114     // The constant pool load will be 64 bits, so need to convert to FPR128 reg.
5115     IndexLoad = emitScalarToVector(64, &AArch64::FPR128RegClass,
5116                                    IndexLoad->getOperand(0).getReg(), MIB);
5117 
5118     auto TBL1 = MIB.buildInstr(
5119         AArch64::TBLv16i8One, {&AArch64::FPR128RegClass},
5120         {Concat->getOperand(0).getReg(), IndexLoad->getOperand(0).getReg()});
5121     constrainSelectedInstRegOperands(*TBL1, TII, TRI, RBI);
5122 
5123     auto Copy =
5124         MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {})
5125             .addReg(TBL1.getReg(0), 0, AArch64::dsub);
5126     RBI.constrainGenericRegister(Copy.getReg(0), AArch64::FPR64RegClass, MRI);
5127     I.eraseFromParent();
5128     return true;
5129   }
5130 
5131   // For TBL2 we need to emit a REG_SEQUENCE to tie together two consecutive
5132   // Q registers for regalloc.
5133   SmallVector<Register, 2> Regs = {Src1Reg, Src2Reg};
5134   auto RegSeq = createQTuple(Regs, MIB);
5135   auto TBL2 = MIB.buildInstr(AArch64::TBLv16i8Two, {I.getOperand(0)},
5136                              {RegSeq, IndexLoad->getOperand(0)});
5137   constrainSelectedInstRegOperands(*TBL2, TII, TRI, RBI);
5138   I.eraseFromParent();
5139   return true;
5140 }
5141 
5142 MachineInstr *AArch64InstructionSelector::emitLaneInsert(
5143     std::optional<Register> DstReg, Register SrcReg, Register EltReg,
5144     unsigned LaneIdx, const RegisterBank &RB,
5145     MachineIRBuilder &MIRBuilder) const {
5146   MachineInstr *InsElt = nullptr;
5147   const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
5148   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
5149 
5150   // Create a register to define with the insert if one wasn't passed in.
5151   if (!DstReg)
5152     DstReg = MRI.createVirtualRegister(DstRC);
5153 
5154   unsigned EltSize = MRI.getType(EltReg).getSizeInBits();
5155   unsigned Opc = getInsertVecEltOpInfo(RB, EltSize).first;
5156 
5157   if (RB.getID() == AArch64::FPRRegBankID) {
5158     auto InsSub = emitScalarToVector(EltSize, DstRC, EltReg, MIRBuilder);
5159     InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg})
5160                  .addImm(LaneIdx)
5161                  .addUse(InsSub->getOperand(0).getReg())
5162                  .addImm(0);
5163   } else {
5164     InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg})
5165                  .addImm(LaneIdx)
5166                  .addUse(EltReg);
5167   }
5168 
5169   constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI);
5170   return InsElt;
5171 }
5172 
5173 bool AArch64InstructionSelector::selectUSMovFromExtend(
5174     MachineInstr &MI, MachineRegisterInfo &MRI) {
5175   if (MI.getOpcode() != TargetOpcode::G_SEXT &&
5176       MI.getOpcode() != TargetOpcode::G_ZEXT &&
5177       MI.getOpcode() != TargetOpcode::G_ANYEXT)
5178     return false;
5179   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SEXT;
5180   const Register DefReg = MI.getOperand(0).getReg();
5181   const LLT DstTy = MRI.getType(DefReg);
5182   unsigned DstSize = DstTy.getSizeInBits();
5183 
5184   if (DstSize != 32 && DstSize != 64)
5185     return false;
5186 
5187   MachineInstr *Extract = getOpcodeDef(TargetOpcode::G_EXTRACT_VECTOR_ELT,
5188                                        MI.getOperand(1).getReg(), MRI);
5189   int64_t Lane;
5190   if (!Extract || !mi_match(Extract->getOperand(2).getReg(), MRI, m_ICst(Lane)))
5191     return false;
5192   Register Src0 = Extract->getOperand(1).getReg();
5193 
5194   const LLT &VecTy = MRI.getType(Src0);
5195 
5196   if (VecTy.getSizeInBits() != 128) {
5197     const MachineInstr *ScalarToVector = emitScalarToVector(
5198         VecTy.getSizeInBits(), &AArch64::FPR128RegClass, Src0, MIB);
5199     assert(ScalarToVector && "Didn't expect emitScalarToVector to fail!");
5200     Src0 = ScalarToVector->getOperand(0).getReg();
5201   }
5202 
5203   unsigned Opcode;
5204   if (DstSize == 64 && VecTy.getScalarSizeInBits() == 32)
5205     Opcode = IsSigned ? AArch64::SMOVvi32to64 : AArch64::UMOVvi32;
5206   else if (DstSize == 64 && VecTy.getScalarSizeInBits() == 16)
5207     Opcode = IsSigned ? AArch64::SMOVvi16to64 : AArch64::UMOVvi16;
5208   else if (DstSize == 64 && VecTy.getScalarSizeInBits() == 8)
5209     Opcode = IsSigned ? AArch64::SMOVvi8to64 : AArch64::UMOVvi8;
5210   else if (DstSize == 32 && VecTy.getScalarSizeInBits() == 16)
5211     Opcode = IsSigned ? AArch64::SMOVvi16to32 : AArch64::UMOVvi16;
5212   else if (DstSize == 32 && VecTy.getScalarSizeInBits() == 8)
5213     Opcode = IsSigned ? AArch64::SMOVvi8to32 : AArch64::UMOVvi8;
5214   else
5215     llvm_unreachable("Unexpected type combo for S/UMov!");
5216 
5217   // We may need to generate one of these, depending on the type and sign of the
5218   // input:
5219   //  DstReg = SMOV Src0, Lane;
5220   //  NewReg = UMOV Src0, Lane; DstReg = SUBREG_TO_REG NewReg, sub_32;
5221   MachineInstr *ExtI = nullptr;
5222   if (DstSize == 64 && !IsSigned) {
5223     Register NewReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
5224     MIB.buildInstr(Opcode, {NewReg}, {Src0}).addImm(Lane);
5225     ExtI = MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {})
5226                .addImm(0)
5227                .addUse(NewReg)
5228                .addImm(AArch64::sub_32);
5229     RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI);
5230   } else
5231     ExtI = MIB.buildInstr(Opcode, {DefReg}, {Src0}).addImm(Lane);
5232 
5233   constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
5234   MI.eraseFromParent();
5235   return true;
5236 }
5237 
5238 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm8(
5239     Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder) {
5240   unsigned int Op;
5241   if (DstSize == 128) {
5242     if (Bits.getHiBits(64) != Bits.getLoBits(64))
5243       return nullptr;
5244     Op = AArch64::MOVIv16b_ns;
5245   } else {
5246     Op = AArch64::MOVIv8b_ns;
5247   }
5248 
5249   uint64_t Val = Bits.zextOrTrunc(64).getZExtValue();
5250 
5251   if (AArch64_AM::isAdvSIMDModImmType9(Val)) {
5252     Val = AArch64_AM::encodeAdvSIMDModImmType9(Val);
5253     auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val);
5254     constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5255     return &*Mov;
5256   }
5257   return nullptr;
5258 }
5259 
5260 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm16(
5261     Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder,
5262     bool Inv) {
5263 
5264   unsigned int Op;
5265   if (DstSize == 128) {
5266     if (Bits.getHiBits(64) != Bits.getLoBits(64))
5267       return nullptr;
5268     Op = Inv ? AArch64::MVNIv8i16 : AArch64::MOVIv8i16;
5269   } else {
5270     Op = Inv ? AArch64::MVNIv4i16 : AArch64::MOVIv4i16;
5271   }
5272 
5273   uint64_t Val = Bits.zextOrTrunc(64).getZExtValue();
5274   uint64_t Shift;
5275 
5276   if (AArch64_AM::isAdvSIMDModImmType5(Val)) {
5277     Val = AArch64_AM::encodeAdvSIMDModImmType5(Val);
5278     Shift = 0;
5279   } else if (AArch64_AM::isAdvSIMDModImmType6(Val)) {
5280     Val = AArch64_AM::encodeAdvSIMDModImmType6(Val);
5281     Shift = 8;
5282   } else
5283     return nullptr;
5284 
5285   auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val).addImm(Shift);
5286   constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5287   return &*Mov;
5288 }
5289 
5290 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm32(
5291     Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder,
5292     bool Inv) {
5293 
5294   unsigned int Op;
5295   if (DstSize == 128) {
5296     if (Bits.getHiBits(64) != Bits.getLoBits(64))
5297       return nullptr;
5298     Op = Inv ? AArch64::MVNIv4i32 : AArch64::MOVIv4i32;
5299   } else {
5300     Op = Inv ? AArch64::MVNIv2i32 : AArch64::MOVIv2i32;
5301   }
5302 
5303   uint64_t Val = Bits.zextOrTrunc(64).getZExtValue();
5304   uint64_t Shift;
5305 
5306   if ((AArch64_AM::isAdvSIMDModImmType1(Val))) {
5307     Val = AArch64_AM::encodeAdvSIMDModImmType1(Val);
5308     Shift = 0;
5309   } else if ((AArch64_AM::isAdvSIMDModImmType2(Val))) {
5310     Val = AArch64_AM::encodeAdvSIMDModImmType2(Val);
5311     Shift = 8;
5312   } else if ((AArch64_AM::isAdvSIMDModImmType3(Val))) {
5313     Val = AArch64_AM::encodeAdvSIMDModImmType3(Val);
5314     Shift = 16;
5315   } else if ((AArch64_AM::isAdvSIMDModImmType4(Val))) {
5316     Val = AArch64_AM::encodeAdvSIMDModImmType4(Val);
5317     Shift = 24;
5318   } else
5319     return nullptr;
5320 
5321   auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val).addImm(Shift);
5322   constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5323   return &*Mov;
5324 }
5325 
5326 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm64(
5327     Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder) {
5328 
5329   unsigned int Op;
5330   if (DstSize == 128) {
5331     if (Bits.getHiBits(64) != Bits.getLoBits(64))
5332       return nullptr;
5333     Op = AArch64::MOVIv2d_ns;
5334   } else {
5335     Op = AArch64::MOVID;
5336   }
5337 
5338   uint64_t Val = Bits.zextOrTrunc(64).getZExtValue();
5339   if (AArch64_AM::isAdvSIMDModImmType10(Val)) {
5340     Val = AArch64_AM::encodeAdvSIMDModImmType10(Val);
5341     auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val);
5342     constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5343     return &*Mov;
5344   }
5345   return nullptr;
5346 }
5347 
5348 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm321s(
5349     Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder,
5350     bool Inv) {
5351 
5352   unsigned int Op;
5353   if (DstSize == 128) {
5354     if (Bits.getHiBits(64) != Bits.getLoBits(64))
5355       return nullptr;
5356     Op = Inv ? AArch64::MVNIv4s_msl : AArch64::MOVIv4s_msl;
5357   } else {
5358     Op = Inv ? AArch64::MVNIv2s_msl : AArch64::MOVIv2s_msl;
5359   }
5360 
5361   uint64_t Val = Bits.zextOrTrunc(64).getZExtValue();
5362   uint64_t Shift;
5363 
5364   if (AArch64_AM::isAdvSIMDModImmType7(Val)) {
5365     Val = AArch64_AM::encodeAdvSIMDModImmType7(Val);
5366     Shift = 264;
5367   } else if (AArch64_AM::isAdvSIMDModImmType8(Val)) {
5368     Val = AArch64_AM::encodeAdvSIMDModImmType8(Val);
5369     Shift = 272;
5370   } else
5371     return nullptr;
5372 
5373   auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val).addImm(Shift);
5374   constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5375   return &*Mov;
5376 }
5377 
5378 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImmFP(
5379     Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder) {
5380 
5381   unsigned int Op;
5382   bool IsWide = false;
5383   if (DstSize == 128) {
5384     if (Bits.getHiBits(64) != Bits.getLoBits(64))
5385       return nullptr;
5386     Op = AArch64::FMOVv4f32_ns;
5387     IsWide = true;
5388   } else {
5389     Op = AArch64::FMOVv2f32_ns;
5390   }
5391 
5392   uint64_t Val = Bits.zextOrTrunc(64).getZExtValue();
5393 
5394   if (AArch64_AM::isAdvSIMDModImmType11(Val)) {
5395     Val = AArch64_AM::encodeAdvSIMDModImmType11(Val);
5396   } else if (IsWide && AArch64_AM::isAdvSIMDModImmType12(Val)) {
5397     Val = AArch64_AM::encodeAdvSIMDModImmType12(Val);
5398     Op = AArch64::FMOVv2f64_ns;
5399   } else
5400     return nullptr;
5401 
5402   auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val);
5403   constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5404   return &*Mov;
5405 }
5406 
5407 bool AArch64InstructionSelector::selectIndexedExtLoad(
5408     MachineInstr &MI, MachineRegisterInfo &MRI) {
5409   auto &ExtLd = cast<GIndexedAnyExtLoad>(MI);
5410   Register Dst = ExtLd.getDstReg();
5411   Register WriteBack = ExtLd.getWritebackReg();
5412   Register Base = ExtLd.getBaseReg();
5413   Register Offset = ExtLd.getOffsetReg();
5414   LLT Ty = MRI.getType(Dst);
5415   assert(Ty.getSizeInBits() <= 64); // Only for scalar GPRs.
5416   unsigned MemSizeBits = ExtLd.getMMO().getMemoryType().getSizeInBits();
5417   bool IsPre = ExtLd.isPre();
5418   bool IsSExt = isa<GIndexedSExtLoad>(ExtLd);
5419   bool InsertIntoXReg = false;
5420   bool IsDst64 = Ty.getSizeInBits() == 64;
5421 
5422   unsigned Opc = 0;
5423   LLT NewLdDstTy;
5424   LLT s32 = LLT::scalar(32);
5425   LLT s64 = LLT::scalar(64);
5426 
5427   if (MemSizeBits == 8) {
5428     if (IsSExt) {
5429       if (IsDst64)
5430         Opc = IsPre ? AArch64::LDRSBXpre : AArch64::LDRSBXpost;
5431       else
5432         Opc = IsPre ? AArch64::LDRSBWpre : AArch64::LDRSBWpost;
5433       NewLdDstTy = IsDst64 ? s64 : s32;
5434     } else {
5435       Opc = IsPre ? AArch64::LDRBBpre : AArch64::LDRBBpost;
5436       InsertIntoXReg = IsDst64;
5437       NewLdDstTy = s32;
5438     }
5439   } else if (MemSizeBits == 16) {
5440     if (IsSExt) {
5441       if (IsDst64)
5442         Opc = IsPre ? AArch64::LDRSHXpre : AArch64::LDRSHXpost;
5443       else
5444         Opc = IsPre ? AArch64::LDRSHWpre : AArch64::LDRSHWpost;
5445       NewLdDstTy = IsDst64 ? s64 : s32;
5446     } else {
5447       Opc = IsPre ? AArch64::LDRHHpre : AArch64::LDRHHpost;
5448       InsertIntoXReg = IsDst64;
5449       NewLdDstTy = s32;
5450     }
5451   } else if (MemSizeBits == 32) {
5452     if (IsSExt) {
5453       Opc = IsPre ? AArch64::LDRSWpre : AArch64::LDRSWpost;
5454       NewLdDstTy = s64;
5455     } else {
5456       Opc = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost;
5457       InsertIntoXReg = IsDst64;
5458       NewLdDstTy = s32;
5459     }
5460   } else {
5461     llvm_unreachable("Unexpected size for indexed load");
5462   }
5463 
5464   if (RBI.getRegBank(Dst, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5465     return false; // We should be on gpr.
5466 
5467   auto Cst = getIConstantVRegVal(Offset, MRI);
5468   if (!Cst)
5469     return false; // Shouldn't happen, but just in case.
5470 
5471   auto LdMI = MIB.buildInstr(Opc, {WriteBack, NewLdDstTy}, {Base})
5472                   .addImm(Cst->getSExtValue());
5473   LdMI.cloneMemRefs(ExtLd);
5474   constrainSelectedInstRegOperands(*LdMI, TII, TRI, RBI);
5475   // Make sure to select the load with the MemTy as the dest type, and then
5476   // insert into X reg if needed.
5477   if (InsertIntoXReg) {
5478     // Generate a SUBREG_TO_REG.
5479     auto SubToReg = MIB.buildInstr(TargetOpcode::SUBREG_TO_REG, {Dst}, {})
5480                         .addImm(0)
5481                         .addUse(LdMI.getReg(1))
5482                         .addImm(AArch64::sub_32);
5483     RBI.constrainGenericRegister(SubToReg.getReg(0), AArch64::GPR64RegClass,
5484                                  MRI);
5485   } else {
5486     auto Copy = MIB.buildCopy(Dst, LdMI.getReg(1));
5487     selectCopy(*Copy, TII, MRI, TRI, RBI);
5488   }
5489   MI.eraseFromParent();
5490 
5491   return true;
5492 }
5493 
5494 bool AArch64InstructionSelector::selectIndexedLoad(MachineInstr &MI,
5495                                                    MachineRegisterInfo &MRI) {
5496   auto &Ld = cast<GIndexedLoad>(MI);
5497   Register Dst = Ld.getDstReg();
5498   Register WriteBack = Ld.getWritebackReg();
5499   Register Base = Ld.getBaseReg();
5500   Register Offset = Ld.getOffsetReg();
5501   assert(MRI.getType(Dst).getSizeInBits() <= 128 &&
5502          "Unexpected type for indexed load");
5503   unsigned MemSize = Ld.getMMO().getMemoryType().getSizeInBytes();
5504 
5505   if (MemSize < MRI.getType(Dst).getSizeInBytes())
5506     return selectIndexedExtLoad(MI, MRI);
5507 
5508   unsigned Opc = 0;
5509   if (Ld.isPre()) {
5510     static constexpr unsigned GPROpcodes[] = {
5511         AArch64::LDRBBpre, AArch64::LDRHHpre, AArch64::LDRWpre,
5512         AArch64::LDRXpre};
5513     static constexpr unsigned FPROpcodes[] = {
5514         AArch64::LDRBpre, AArch64::LDRHpre, AArch64::LDRSpre, AArch64::LDRDpre,
5515         AArch64::LDRQpre};
5516     if (RBI.getRegBank(Dst, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5517       Opc = FPROpcodes[Log2_32(MemSize)];
5518     else
5519       Opc = GPROpcodes[Log2_32(MemSize)];
5520   } else {
5521     static constexpr unsigned GPROpcodes[] = {
5522         AArch64::LDRBBpost, AArch64::LDRHHpost, AArch64::LDRWpost,
5523         AArch64::LDRXpost};
5524     static constexpr unsigned FPROpcodes[] = {
5525         AArch64::LDRBpost, AArch64::LDRHpost, AArch64::LDRSpost,
5526         AArch64::LDRDpost, AArch64::LDRQpost};
5527     if (RBI.getRegBank(Dst, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5528       Opc = FPROpcodes[Log2_32(MemSize)];
5529     else
5530       Opc = GPROpcodes[Log2_32(MemSize)];
5531   }
5532   auto Cst = getIConstantVRegVal(Offset, MRI);
5533   if (!Cst)
5534     return false; // Shouldn't happen, but just in case.
5535   auto LdMI =
5536       MIB.buildInstr(Opc, {WriteBack, Dst}, {Base}).addImm(Cst->getSExtValue());
5537   LdMI.cloneMemRefs(Ld);
5538   constrainSelectedInstRegOperands(*LdMI, TII, TRI, RBI);
5539   MI.eraseFromParent();
5540   return true;
5541 }
5542 
5543 bool AArch64InstructionSelector::selectIndexedStore(GIndexedStore &I,
5544                                                     MachineRegisterInfo &MRI) {
5545   Register Dst = I.getWritebackReg();
5546   Register Val = I.getValueReg();
5547   Register Base = I.getBaseReg();
5548   Register Offset = I.getOffsetReg();
5549   LLT ValTy = MRI.getType(Val);
5550   assert(ValTy.getSizeInBits() <= 128 && "Unexpected type for indexed store");
5551 
5552   unsigned Opc = 0;
5553   if (I.isPre()) {
5554     static constexpr unsigned GPROpcodes[] = {
5555         AArch64::STRBBpre, AArch64::STRHHpre, AArch64::STRWpre,
5556         AArch64::STRXpre};
5557     static constexpr unsigned FPROpcodes[] = {
5558         AArch64::STRBpre, AArch64::STRHpre, AArch64::STRSpre, AArch64::STRDpre,
5559         AArch64::STRQpre};
5560 
5561     if (RBI.getRegBank(Val, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5562       Opc = FPROpcodes[Log2_32(ValTy.getSizeInBytes())];
5563     else
5564       Opc = GPROpcodes[Log2_32(ValTy.getSizeInBytes())];
5565   } else {
5566     static constexpr unsigned GPROpcodes[] = {
5567         AArch64::STRBBpost, AArch64::STRHHpost, AArch64::STRWpost,
5568         AArch64::STRXpost};
5569     static constexpr unsigned FPROpcodes[] = {
5570         AArch64::STRBpost, AArch64::STRHpost, AArch64::STRSpost,
5571         AArch64::STRDpost, AArch64::STRQpost};
5572 
5573     if (RBI.getRegBank(Val, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5574       Opc = FPROpcodes[Log2_32(ValTy.getSizeInBytes())];
5575     else
5576       Opc = GPROpcodes[Log2_32(ValTy.getSizeInBytes())];
5577   }
5578 
5579   auto Cst = getIConstantVRegVal(Offset, MRI);
5580   if (!Cst)
5581     return false; // Shouldn't happen, but just in case.
5582   auto Str =
5583       MIB.buildInstr(Opc, {Dst}, {Val, Base}).addImm(Cst->getSExtValue());
5584   Str.cloneMemRefs(I);
5585   constrainSelectedInstRegOperands(*Str, TII, TRI, RBI);
5586   I.eraseFromParent();
5587   return true;
5588 }
5589 
5590 MachineInstr *
5591 AArch64InstructionSelector::emitConstantVector(Register Dst, Constant *CV,
5592                                                MachineIRBuilder &MIRBuilder,
5593                                                MachineRegisterInfo &MRI) {
5594   LLT DstTy = MRI.getType(Dst);
5595   unsigned DstSize = DstTy.getSizeInBits();
5596   if (CV->isNullValue()) {
5597     if (DstSize == 128) {
5598       auto Mov =
5599           MIRBuilder.buildInstr(AArch64::MOVIv2d_ns, {Dst}, {}).addImm(0);
5600       constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5601       return &*Mov;
5602     }
5603 
5604     if (DstSize == 64) {
5605       auto Mov =
5606           MIRBuilder
5607               .buildInstr(AArch64::MOVIv2d_ns, {&AArch64::FPR128RegClass}, {})
5608               .addImm(0);
5609       auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {Dst}, {})
5610                       .addReg(Mov.getReg(0), 0, AArch64::dsub);
5611       RBI.constrainGenericRegister(Dst, AArch64::FPR64RegClass, MRI);
5612       return &*Copy;
5613     }
5614   }
5615 
5616   if (CV->getSplatValue()) {
5617     APInt DefBits = APInt::getSplat(DstSize, CV->getUniqueInteger());
5618     auto TryMOVIWithBits = [&](APInt DefBits) -> MachineInstr * {
5619       MachineInstr *NewOp;
5620       bool Inv = false;
5621       if ((NewOp = tryAdvSIMDModImm64(Dst, DstSize, DefBits, MIRBuilder)) ||
5622           (NewOp =
5623                tryAdvSIMDModImm32(Dst, DstSize, DefBits, MIRBuilder, Inv)) ||
5624           (NewOp =
5625                tryAdvSIMDModImm321s(Dst, DstSize, DefBits, MIRBuilder, Inv)) ||
5626           (NewOp =
5627                tryAdvSIMDModImm16(Dst, DstSize, DefBits, MIRBuilder, Inv)) ||
5628           (NewOp = tryAdvSIMDModImm8(Dst, DstSize, DefBits, MIRBuilder)) ||
5629           (NewOp = tryAdvSIMDModImmFP(Dst, DstSize, DefBits, MIRBuilder)))
5630         return NewOp;
5631 
5632       DefBits = ~DefBits;
5633       Inv = true;
5634       if ((NewOp =
5635                tryAdvSIMDModImm32(Dst, DstSize, DefBits, MIRBuilder, Inv)) ||
5636           (NewOp =
5637                tryAdvSIMDModImm321s(Dst, DstSize, DefBits, MIRBuilder, Inv)) ||
5638           (NewOp = tryAdvSIMDModImm16(Dst, DstSize, DefBits, MIRBuilder, Inv)))
5639         return NewOp;
5640       return nullptr;
5641     };
5642 
5643     if (auto *NewOp = TryMOVIWithBits(DefBits))
5644       return NewOp;
5645 
5646     // See if a fneg of the constant can be materialized with a MOVI, etc
5647     auto TryWithFNeg = [&](APInt DefBits, int NumBits,
5648                            unsigned NegOpc) -> MachineInstr * {
5649       // FNegate each sub-element of the constant
5650       APInt Neg = APInt::getHighBitsSet(NumBits, 1).zext(DstSize);
5651       APInt NegBits(DstSize, 0);
5652       unsigned NumElts = DstSize / NumBits;
5653       for (unsigned i = 0; i < NumElts; i++)
5654         NegBits |= Neg << (NumBits * i);
5655       NegBits = DefBits ^ NegBits;
5656 
5657       // Try to create the new constants with MOVI, and if so generate a fneg
5658       // for it.
5659       if (auto *NewOp = TryMOVIWithBits(NegBits)) {
5660         Register NewDst = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
5661         NewOp->getOperand(0).setReg(NewDst);
5662         return MIRBuilder.buildInstr(NegOpc, {Dst}, {NewDst});
5663       }
5664       return nullptr;
5665     };
5666     MachineInstr *R;
5667     if ((R = TryWithFNeg(DefBits, 32, AArch64::FNEGv4f32)) ||
5668         (R = TryWithFNeg(DefBits, 64, AArch64::FNEGv2f64)) ||
5669         (STI.hasFullFP16() &&
5670          (R = TryWithFNeg(DefBits, 16, AArch64::FNEGv8f16))))
5671       return R;
5672   }
5673 
5674   auto *CPLoad = emitLoadFromConstantPool(CV, MIRBuilder);
5675   if (!CPLoad) {
5676     LLVM_DEBUG(dbgs() << "Could not generate cp load for constant vector!");
5677     return nullptr;
5678   }
5679 
5680   auto Copy = MIRBuilder.buildCopy(Dst, CPLoad->getOperand(0));
5681   RBI.constrainGenericRegister(
5682       Dst, *MRI.getRegClass(CPLoad->getOperand(0).getReg()), MRI);
5683   return &*Copy;
5684 }
5685 
5686 bool AArch64InstructionSelector::tryOptConstantBuildVec(
5687     MachineInstr &I, LLT DstTy, MachineRegisterInfo &MRI) {
5688   assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
5689   unsigned DstSize = DstTy.getSizeInBits();
5690   assert(DstSize <= 128 && "Unexpected build_vec type!");
5691   if (DstSize < 32)
5692     return false;
5693   // Check if we're building a constant vector, in which case we want to
5694   // generate a constant pool load instead of a vector insert sequence.
5695   SmallVector<Constant *, 16> Csts;
5696   for (unsigned Idx = 1; Idx < I.getNumOperands(); ++Idx) {
5697     // Try to find G_CONSTANT or G_FCONSTANT
5698     auto *OpMI =
5699         getOpcodeDef(TargetOpcode::G_CONSTANT, I.getOperand(Idx).getReg(), MRI);
5700     if (OpMI)
5701       Csts.emplace_back(
5702           const_cast<ConstantInt *>(OpMI->getOperand(1).getCImm()));
5703     else if ((OpMI = getOpcodeDef(TargetOpcode::G_FCONSTANT,
5704                                   I.getOperand(Idx).getReg(), MRI)))
5705       Csts.emplace_back(
5706           const_cast<ConstantFP *>(OpMI->getOperand(1).getFPImm()));
5707     else
5708       return false;
5709   }
5710   Constant *CV = ConstantVector::get(Csts);
5711   if (!emitConstantVector(I.getOperand(0).getReg(), CV, MIB, MRI))
5712     return false;
5713   I.eraseFromParent();
5714   return true;
5715 }
5716 
5717 bool AArch64InstructionSelector::tryOptBuildVecToSubregToReg(
5718     MachineInstr &I, MachineRegisterInfo &MRI) {
5719   // Given:
5720   //  %vec = G_BUILD_VECTOR %elt, %undef, %undef, ... %undef
5721   //
5722   // Select the G_BUILD_VECTOR as a SUBREG_TO_REG from %elt.
5723   Register Dst = I.getOperand(0).getReg();
5724   Register EltReg = I.getOperand(1).getReg();
5725   LLT EltTy = MRI.getType(EltReg);
5726   // If the index isn't on the same bank as its elements, then this can't be a
5727   // SUBREG_TO_REG.
5728   const RegisterBank &EltRB = *RBI.getRegBank(EltReg, MRI, TRI);
5729   const RegisterBank &DstRB = *RBI.getRegBank(Dst, MRI, TRI);
5730   if (EltRB != DstRB)
5731     return false;
5732   if (any_of(drop_begin(I.operands(), 2), [&MRI](const MachineOperand &Op) {
5733         return !getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, Op.getReg(), MRI);
5734       }))
5735     return false;
5736   unsigned SubReg;
5737   const TargetRegisterClass *EltRC = getRegClassForTypeOnBank(EltTy, EltRB);
5738   if (!EltRC)
5739     return false;
5740   const TargetRegisterClass *DstRC =
5741       getRegClassForTypeOnBank(MRI.getType(Dst), DstRB);
5742   if (!DstRC)
5743     return false;
5744   if (!getSubRegForClass(EltRC, TRI, SubReg))
5745     return false;
5746   auto SubregToReg = MIB.buildInstr(AArch64::SUBREG_TO_REG, {Dst}, {})
5747                          .addImm(0)
5748                          .addUse(EltReg)
5749                          .addImm(SubReg);
5750   I.eraseFromParent();
5751   constrainSelectedInstRegOperands(*SubregToReg, TII, TRI, RBI);
5752   return RBI.constrainGenericRegister(Dst, *DstRC, MRI);
5753 }
5754 
5755 bool AArch64InstructionSelector::selectBuildVector(MachineInstr &I,
5756                                                    MachineRegisterInfo &MRI) {
5757   assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
5758   // Until we port more of the optimized selections, for now just use a vector
5759   // insert sequence.
5760   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
5761   const LLT EltTy = MRI.getType(I.getOperand(1).getReg());
5762   unsigned EltSize = EltTy.getSizeInBits();
5763 
5764   if (tryOptConstantBuildVec(I, DstTy, MRI))
5765     return true;
5766   if (tryOptBuildVecToSubregToReg(I, MRI))
5767     return true;
5768 
5769   if (EltSize != 8 && EltSize != 16 && EltSize != 32 && EltSize != 64)
5770     return false; // Don't support all element types yet.
5771   const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
5772 
5773   const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
5774   MachineInstr *ScalarToVec =
5775       emitScalarToVector(DstTy.getElementType().getSizeInBits(), DstRC,
5776                          I.getOperand(1).getReg(), MIB);
5777   if (!ScalarToVec)
5778     return false;
5779 
5780   Register DstVec = ScalarToVec->getOperand(0).getReg();
5781   unsigned DstSize = DstTy.getSizeInBits();
5782 
5783   // Keep track of the last MI we inserted. Later on, we might be able to save
5784   // a copy using it.
5785   MachineInstr *PrevMI = ScalarToVec;
5786   for (unsigned i = 2, e = DstSize / EltSize + 1; i < e; ++i) {
5787     // Note that if we don't do a subregister copy, we can end up making an
5788     // extra register.
5789     Register OpReg = I.getOperand(i).getReg();
5790     // Do not emit inserts for undefs
5791     if (!getOpcodeDef<GImplicitDef>(OpReg, MRI)) {
5792       PrevMI = &*emitLaneInsert(std::nullopt, DstVec, OpReg, i - 1, RB, MIB);
5793       DstVec = PrevMI->getOperand(0).getReg();
5794     }
5795   }
5796 
5797   // If DstTy's size in bits is less than 128, then emit a subregister copy
5798   // from DstVec to the last register we've defined.
5799   if (DstSize < 128) {
5800     // Force this to be FPR using the destination vector.
5801     const TargetRegisterClass *RC =
5802         getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(DstVec, MRI, TRI));
5803     if (!RC)
5804       return false;
5805     if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
5806       LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
5807       return false;
5808     }
5809 
5810     unsigned SubReg = 0;
5811     if (!getSubRegForClass(RC, TRI, SubReg))
5812       return false;
5813     if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
5814       LLVM_DEBUG(dbgs() << "Unsupported destination size! (" << DstSize
5815                         << "\n");
5816       return false;
5817     }
5818 
5819     Register Reg = MRI.createVirtualRegister(RC);
5820     Register DstReg = I.getOperand(0).getReg();
5821 
5822     MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {}).addReg(DstVec, 0, SubReg);
5823     MachineOperand &RegOp = I.getOperand(1);
5824     RegOp.setReg(Reg);
5825     RBI.constrainGenericRegister(DstReg, *RC, MRI);
5826   } else {
5827     // We either have a vector with all elements (except the first one) undef or
5828     // at least one non-undef non-first element. In the first case, we need to
5829     // constrain the output register ourselves as we may have generated an
5830     // INSERT_SUBREG operation which is a generic operation for which the
5831     // output regclass cannot be automatically chosen.
5832     //
5833     // In the second case, there is no need to do this as it may generate an
5834     // instruction like INSvi32gpr where the regclass can be automatically
5835     // chosen.
5836     //
5837     // Also, we save a copy by re-using the destination register on the final
5838     // insert.
5839     PrevMI->getOperand(0).setReg(I.getOperand(0).getReg());
5840     constrainSelectedInstRegOperands(*PrevMI, TII, TRI, RBI);
5841 
5842     Register DstReg = PrevMI->getOperand(0).getReg();
5843     if (PrevMI == ScalarToVec && DstReg.isVirtual()) {
5844       const TargetRegisterClass *RC =
5845           getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(DstVec, MRI, TRI));
5846       RBI.constrainGenericRegister(DstReg, *RC, MRI);
5847     }
5848   }
5849 
5850   I.eraseFromParent();
5851   return true;
5852 }
5853 
5854 bool AArch64InstructionSelector::selectVectorLoadIntrinsic(unsigned Opc,
5855                                                            unsigned NumVecs,
5856                                                            MachineInstr &I) {
5857   assert(I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
5858   assert(Opc && "Expected an opcode?");
5859   assert(NumVecs > 1 && NumVecs < 5 && "Only support 2, 3, or 4 vectors");
5860   auto &MRI = *MIB.getMRI();
5861   LLT Ty = MRI.getType(I.getOperand(0).getReg());
5862   unsigned Size = Ty.getSizeInBits();
5863   assert((Size == 64 || Size == 128) &&
5864          "Destination must be 64 bits or 128 bits?");
5865   unsigned SubReg = Size == 64 ? AArch64::dsub0 : AArch64::qsub0;
5866   auto Ptr = I.getOperand(I.getNumOperands() - 1).getReg();
5867   assert(MRI.getType(Ptr).isPointer() && "Expected a pointer type?");
5868   auto Load = MIB.buildInstr(Opc, {Ty}, {Ptr});
5869   Load.cloneMemRefs(I);
5870   constrainSelectedInstRegOperands(*Load, TII, TRI, RBI);
5871   Register SelectedLoadDst = Load->getOperand(0).getReg();
5872   for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
5873     auto Vec = MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(Idx)}, {})
5874                    .addReg(SelectedLoadDst, 0, SubReg + Idx);
5875     // Emit the subreg copies and immediately select them.
5876     // FIXME: We should refactor our copy code into an emitCopy helper and
5877     // clean up uses of this pattern elsewhere in the selector.
5878     selectCopy(*Vec, TII, MRI, TRI, RBI);
5879   }
5880   return true;
5881 }
5882 
5883 bool AArch64InstructionSelector::selectVectorLoadLaneIntrinsic(
5884     unsigned Opc, unsigned NumVecs, MachineInstr &I) {
5885   assert(I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
5886   assert(Opc && "Expected an opcode?");
5887   assert(NumVecs > 1 && NumVecs < 5 && "Only support 2, 3, or 4 vectors");
5888   auto &MRI = *MIB.getMRI();
5889   LLT Ty = MRI.getType(I.getOperand(0).getReg());
5890   bool Narrow = Ty.getSizeInBits() == 64;
5891 
5892   auto FirstSrcRegIt = I.operands_begin() + NumVecs + 1;
5893   SmallVector<Register, 4> Regs(NumVecs);
5894   std::transform(FirstSrcRegIt, FirstSrcRegIt + NumVecs, Regs.begin(),
5895                  [](auto MO) { return MO.getReg(); });
5896 
5897   if (Narrow) {
5898     transform(Regs, Regs.begin(), [this](Register Reg) {
5899       return emitScalarToVector(64, &AArch64::FPR128RegClass, Reg, MIB)
5900           ->getOperand(0)
5901           .getReg();
5902     });
5903     Ty = Ty.multiplyElements(2);
5904   }
5905 
5906   Register Tuple = createQTuple(Regs, MIB);
5907   auto LaneNo = getIConstantVRegVal((FirstSrcRegIt + NumVecs)->getReg(), MRI);
5908   if (!LaneNo)
5909     return false;
5910 
5911   Register Ptr = (FirstSrcRegIt + NumVecs + 1)->getReg();
5912   auto Load = MIB.buildInstr(Opc, {Ty}, {})
5913                   .addReg(Tuple)
5914                   .addImm(LaneNo->getZExtValue())
5915                   .addReg(Ptr);
5916   Load.cloneMemRefs(I);
5917   constrainSelectedInstRegOperands(*Load, TII, TRI, RBI);
5918   Register SelectedLoadDst = Load->getOperand(0).getReg();
5919   unsigned SubReg = AArch64::qsub0;
5920   for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
5921     auto Vec = MIB.buildInstr(TargetOpcode::COPY,
5922                               {Narrow ? DstOp(&AArch64::FPR128RegClass)
5923                                       : DstOp(I.getOperand(Idx).getReg())},
5924                               {})
5925                    .addReg(SelectedLoadDst, 0, SubReg + Idx);
5926     Register WideReg = Vec.getReg(0);
5927     // Emit the subreg copies and immediately select them.
5928     selectCopy(*Vec, TII, MRI, TRI, RBI);
5929     if (Narrow &&
5930         !emitNarrowVector(I.getOperand(Idx).getReg(), WideReg, MIB, MRI))
5931       return false;
5932   }
5933   return true;
5934 }
5935 
5936 void AArch64InstructionSelector::selectVectorStoreIntrinsic(MachineInstr &I,
5937                                                             unsigned NumVecs,
5938                                                             unsigned Opc) {
5939   MachineRegisterInfo &MRI = I.getParent()->getParent()->getRegInfo();
5940   LLT Ty = MRI.getType(I.getOperand(1).getReg());
5941   Register Ptr = I.getOperand(1 + NumVecs).getReg();
5942 
5943   SmallVector<Register, 2> Regs(NumVecs);
5944   std::transform(I.operands_begin() + 1, I.operands_begin() + 1 + NumVecs,
5945                  Regs.begin(), [](auto MO) { return MO.getReg(); });
5946 
5947   Register Tuple = Ty.getSizeInBits() == 128 ? createQTuple(Regs, MIB)
5948                                              : createDTuple(Regs, MIB);
5949   auto Store = MIB.buildInstr(Opc, {}, {Tuple, Ptr});
5950   Store.cloneMemRefs(I);
5951   constrainSelectedInstRegOperands(*Store, TII, TRI, RBI);
5952 }
5953 
5954 bool AArch64InstructionSelector::selectVectorStoreLaneIntrinsic(
5955     MachineInstr &I, unsigned NumVecs, unsigned Opc) {
5956   MachineRegisterInfo &MRI = I.getParent()->getParent()->getRegInfo();
5957   LLT Ty = MRI.getType(I.getOperand(1).getReg());
5958   bool Narrow = Ty.getSizeInBits() == 64;
5959 
5960   SmallVector<Register, 2> Regs(NumVecs);
5961   std::transform(I.operands_begin() + 1, I.operands_begin() + 1 + NumVecs,
5962                  Regs.begin(), [](auto MO) { return MO.getReg(); });
5963 
5964   if (Narrow)
5965     transform(Regs, Regs.begin(), [this](Register Reg) {
5966       return emitScalarToVector(64, &AArch64::FPR128RegClass, Reg, MIB)
5967           ->getOperand(0)
5968           .getReg();
5969     });
5970 
5971   Register Tuple = createQTuple(Regs, MIB);
5972 
5973   auto LaneNo = getIConstantVRegVal(I.getOperand(1 + NumVecs).getReg(), MRI);
5974   if (!LaneNo)
5975     return false;
5976   Register Ptr = I.getOperand(1 + NumVecs + 1).getReg();
5977   auto Store = MIB.buildInstr(Opc, {}, {})
5978                    .addReg(Tuple)
5979                    .addImm(LaneNo->getZExtValue())
5980                    .addReg(Ptr);
5981   Store.cloneMemRefs(I);
5982   constrainSelectedInstRegOperands(*Store, TII, TRI, RBI);
5983   return true;
5984 }
5985 
5986 bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
5987     MachineInstr &I, MachineRegisterInfo &MRI) {
5988   // Find the intrinsic ID.
5989   unsigned IntrinID = cast<GIntrinsic>(I).getIntrinsicID();
5990 
5991   const LLT S8 = LLT::scalar(8);
5992   const LLT S16 = LLT::scalar(16);
5993   const LLT S32 = LLT::scalar(32);
5994   const LLT S64 = LLT::scalar(64);
5995   const LLT P0 = LLT::pointer(0, 64);
5996   // Select the instruction.
5997   switch (IntrinID) {
5998   default:
5999     return false;
6000   case Intrinsic::aarch64_ldxp:
6001   case Intrinsic::aarch64_ldaxp: {
6002     auto NewI = MIB.buildInstr(
6003         IntrinID == Intrinsic::aarch64_ldxp ? AArch64::LDXPX : AArch64::LDAXPX,
6004         {I.getOperand(0).getReg(), I.getOperand(1).getReg()},
6005         {I.getOperand(3)});
6006     NewI.cloneMemRefs(I);
6007     constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI);
6008     break;
6009   }
6010   case Intrinsic::aarch64_neon_ld1x2: {
6011     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6012     unsigned Opc = 0;
6013     if (Ty == LLT::fixed_vector(8, S8))
6014       Opc = AArch64::LD1Twov8b;
6015     else if (Ty == LLT::fixed_vector(16, S8))
6016       Opc = AArch64::LD1Twov16b;
6017     else if (Ty == LLT::fixed_vector(4, S16))
6018       Opc = AArch64::LD1Twov4h;
6019     else if (Ty == LLT::fixed_vector(8, S16))
6020       Opc = AArch64::LD1Twov8h;
6021     else if (Ty == LLT::fixed_vector(2, S32))
6022       Opc = AArch64::LD1Twov2s;
6023     else if (Ty == LLT::fixed_vector(4, S32))
6024       Opc = AArch64::LD1Twov4s;
6025     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6026       Opc = AArch64::LD1Twov2d;
6027     else if (Ty == S64 || Ty == P0)
6028       Opc = AArch64::LD1Twov1d;
6029     else
6030       llvm_unreachable("Unexpected type for ld1x2!");
6031     selectVectorLoadIntrinsic(Opc, 2, I);
6032     break;
6033   }
6034   case Intrinsic::aarch64_neon_ld1x3: {
6035     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6036     unsigned Opc = 0;
6037     if (Ty == LLT::fixed_vector(8, S8))
6038       Opc = AArch64::LD1Threev8b;
6039     else if (Ty == LLT::fixed_vector(16, S8))
6040       Opc = AArch64::LD1Threev16b;
6041     else if (Ty == LLT::fixed_vector(4, S16))
6042       Opc = AArch64::LD1Threev4h;
6043     else if (Ty == LLT::fixed_vector(8, S16))
6044       Opc = AArch64::LD1Threev8h;
6045     else if (Ty == LLT::fixed_vector(2, S32))
6046       Opc = AArch64::LD1Threev2s;
6047     else if (Ty == LLT::fixed_vector(4, S32))
6048       Opc = AArch64::LD1Threev4s;
6049     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6050       Opc = AArch64::LD1Threev2d;
6051     else if (Ty == S64 || Ty == P0)
6052       Opc = AArch64::LD1Threev1d;
6053     else
6054       llvm_unreachable("Unexpected type for ld1x3!");
6055     selectVectorLoadIntrinsic(Opc, 3, I);
6056     break;
6057   }
6058   case Intrinsic::aarch64_neon_ld1x4: {
6059     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6060     unsigned Opc = 0;
6061     if (Ty == LLT::fixed_vector(8, S8))
6062       Opc = AArch64::LD1Fourv8b;
6063     else if (Ty == LLT::fixed_vector(16, S8))
6064       Opc = AArch64::LD1Fourv16b;
6065     else if (Ty == LLT::fixed_vector(4, S16))
6066       Opc = AArch64::LD1Fourv4h;
6067     else if (Ty == LLT::fixed_vector(8, S16))
6068       Opc = AArch64::LD1Fourv8h;
6069     else if (Ty == LLT::fixed_vector(2, S32))
6070       Opc = AArch64::LD1Fourv2s;
6071     else if (Ty == LLT::fixed_vector(4, S32))
6072       Opc = AArch64::LD1Fourv4s;
6073     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6074       Opc = AArch64::LD1Fourv2d;
6075     else if (Ty == S64 || Ty == P0)
6076       Opc = AArch64::LD1Fourv1d;
6077     else
6078       llvm_unreachable("Unexpected type for ld1x4!");
6079     selectVectorLoadIntrinsic(Opc, 4, I);
6080     break;
6081   }
6082   case Intrinsic::aarch64_neon_ld2: {
6083     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6084     unsigned Opc = 0;
6085     if (Ty == LLT::fixed_vector(8, S8))
6086       Opc = AArch64::LD2Twov8b;
6087     else if (Ty == LLT::fixed_vector(16, S8))
6088       Opc = AArch64::LD2Twov16b;
6089     else if (Ty == LLT::fixed_vector(4, S16))
6090       Opc = AArch64::LD2Twov4h;
6091     else if (Ty == LLT::fixed_vector(8, S16))
6092       Opc = AArch64::LD2Twov8h;
6093     else if (Ty == LLT::fixed_vector(2, S32))
6094       Opc = AArch64::LD2Twov2s;
6095     else if (Ty == LLT::fixed_vector(4, S32))
6096       Opc = AArch64::LD2Twov4s;
6097     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6098       Opc = AArch64::LD2Twov2d;
6099     else if (Ty == S64 || Ty == P0)
6100       Opc = AArch64::LD1Twov1d;
6101     else
6102       llvm_unreachable("Unexpected type for ld2!");
6103     selectVectorLoadIntrinsic(Opc, 2, I);
6104     break;
6105   }
6106   case Intrinsic::aarch64_neon_ld2lane: {
6107     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6108     unsigned Opc;
6109     if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
6110       Opc = AArch64::LD2i8;
6111     else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16))
6112       Opc = AArch64::LD2i16;
6113     else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32))
6114       Opc = AArch64::LD2i32;
6115     else if (Ty == LLT::fixed_vector(2, S64) ||
6116              Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
6117       Opc = AArch64::LD2i64;
6118     else
6119       llvm_unreachable("Unexpected type for st2lane!");
6120     if (!selectVectorLoadLaneIntrinsic(Opc, 2, I))
6121       return false;
6122     break;
6123   }
6124   case Intrinsic::aarch64_neon_ld2r: {
6125     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6126     unsigned Opc = 0;
6127     if (Ty == LLT::fixed_vector(8, S8))
6128       Opc = AArch64::LD2Rv8b;
6129     else if (Ty == LLT::fixed_vector(16, S8))
6130       Opc = AArch64::LD2Rv16b;
6131     else if (Ty == LLT::fixed_vector(4, S16))
6132       Opc = AArch64::LD2Rv4h;
6133     else if (Ty == LLT::fixed_vector(8, S16))
6134       Opc = AArch64::LD2Rv8h;
6135     else if (Ty == LLT::fixed_vector(2, S32))
6136       Opc = AArch64::LD2Rv2s;
6137     else if (Ty == LLT::fixed_vector(4, S32))
6138       Opc = AArch64::LD2Rv4s;
6139     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6140       Opc = AArch64::LD2Rv2d;
6141     else if (Ty == S64 || Ty == P0)
6142       Opc = AArch64::LD2Rv1d;
6143     else
6144       llvm_unreachable("Unexpected type for ld2r!");
6145     selectVectorLoadIntrinsic(Opc, 2, I);
6146     break;
6147   }
6148   case Intrinsic::aarch64_neon_ld3: {
6149     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6150     unsigned Opc = 0;
6151     if (Ty == LLT::fixed_vector(8, S8))
6152       Opc = AArch64::LD3Threev8b;
6153     else if (Ty == LLT::fixed_vector(16, S8))
6154       Opc = AArch64::LD3Threev16b;
6155     else if (Ty == LLT::fixed_vector(4, S16))
6156       Opc = AArch64::LD3Threev4h;
6157     else if (Ty == LLT::fixed_vector(8, S16))
6158       Opc = AArch64::LD3Threev8h;
6159     else if (Ty == LLT::fixed_vector(2, S32))
6160       Opc = AArch64::LD3Threev2s;
6161     else if (Ty == LLT::fixed_vector(4, S32))
6162       Opc = AArch64::LD3Threev4s;
6163     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6164       Opc = AArch64::LD3Threev2d;
6165     else if (Ty == S64 || Ty == P0)
6166       Opc = AArch64::LD1Threev1d;
6167     else
6168       llvm_unreachable("Unexpected type for ld3!");
6169     selectVectorLoadIntrinsic(Opc, 3, I);
6170     break;
6171   }
6172   case Intrinsic::aarch64_neon_ld3lane: {
6173     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6174     unsigned Opc;
6175     if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
6176       Opc = AArch64::LD3i8;
6177     else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16))
6178       Opc = AArch64::LD3i16;
6179     else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32))
6180       Opc = AArch64::LD3i32;
6181     else if (Ty == LLT::fixed_vector(2, S64) ||
6182              Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
6183       Opc = AArch64::LD3i64;
6184     else
6185       llvm_unreachable("Unexpected type for st3lane!");
6186     if (!selectVectorLoadLaneIntrinsic(Opc, 3, I))
6187       return false;
6188     break;
6189   }
6190   case Intrinsic::aarch64_neon_ld3r: {
6191     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6192     unsigned Opc = 0;
6193     if (Ty == LLT::fixed_vector(8, S8))
6194       Opc = AArch64::LD3Rv8b;
6195     else if (Ty == LLT::fixed_vector(16, S8))
6196       Opc = AArch64::LD3Rv16b;
6197     else if (Ty == LLT::fixed_vector(4, S16))
6198       Opc = AArch64::LD3Rv4h;
6199     else if (Ty == LLT::fixed_vector(8, S16))
6200       Opc = AArch64::LD3Rv8h;
6201     else if (Ty == LLT::fixed_vector(2, S32))
6202       Opc = AArch64::LD3Rv2s;
6203     else if (Ty == LLT::fixed_vector(4, S32))
6204       Opc = AArch64::LD3Rv4s;
6205     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6206       Opc = AArch64::LD3Rv2d;
6207     else if (Ty == S64 || Ty == P0)
6208       Opc = AArch64::LD3Rv1d;
6209     else
6210       llvm_unreachable("Unexpected type for ld3r!");
6211     selectVectorLoadIntrinsic(Opc, 3, I);
6212     break;
6213   }
6214   case Intrinsic::aarch64_neon_ld4: {
6215     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6216     unsigned Opc = 0;
6217     if (Ty == LLT::fixed_vector(8, S8))
6218       Opc = AArch64::LD4Fourv8b;
6219     else if (Ty == LLT::fixed_vector(16, S8))
6220       Opc = AArch64::LD4Fourv16b;
6221     else if (Ty == LLT::fixed_vector(4, S16))
6222       Opc = AArch64::LD4Fourv4h;
6223     else if (Ty == LLT::fixed_vector(8, S16))
6224       Opc = AArch64::LD4Fourv8h;
6225     else if (Ty == LLT::fixed_vector(2, S32))
6226       Opc = AArch64::LD4Fourv2s;
6227     else if (Ty == LLT::fixed_vector(4, S32))
6228       Opc = AArch64::LD4Fourv4s;
6229     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6230       Opc = AArch64::LD4Fourv2d;
6231     else if (Ty == S64 || Ty == P0)
6232       Opc = AArch64::LD1Fourv1d;
6233     else
6234       llvm_unreachable("Unexpected type for ld4!");
6235     selectVectorLoadIntrinsic(Opc, 4, I);
6236     break;
6237   }
6238   case Intrinsic::aarch64_neon_ld4lane: {
6239     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6240     unsigned Opc;
6241     if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
6242       Opc = AArch64::LD4i8;
6243     else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16))
6244       Opc = AArch64::LD4i16;
6245     else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32))
6246       Opc = AArch64::LD4i32;
6247     else if (Ty == LLT::fixed_vector(2, S64) ||
6248              Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
6249       Opc = AArch64::LD4i64;
6250     else
6251       llvm_unreachable("Unexpected type for st4lane!");
6252     if (!selectVectorLoadLaneIntrinsic(Opc, 4, I))
6253       return false;
6254     break;
6255   }
6256   case Intrinsic::aarch64_neon_ld4r: {
6257     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6258     unsigned Opc = 0;
6259     if (Ty == LLT::fixed_vector(8, S8))
6260       Opc = AArch64::LD4Rv8b;
6261     else if (Ty == LLT::fixed_vector(16, S8))
6262       Opc = AArch64::LD4Rv16b;
6263     else if (Ty == LLT::fixed_vector(4, S16))
6264       Opc = AArch64::LD4Rv4h;
6265     else if (Ty == LLT::fixed_vector(8, S16))
6266       Opc = AArch64::LD4Rv8h;
6267     else if (Ty == LLT::fixed_vector(2, S32))
6268       Opc = AArch64::LD4Rv2s;
6269     else if (Ty == LLT::fixed_vector(4, S32))
6270       Opc = AArch64::LD4Rv4s;
6271     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6272       Opc = AArch64::LD4Rv2d;
6273     else if (Ty == S64 || Ty == P0)
6274       Opc = AArch64::LD4Rv1d;
6275     else
6276       llvm_unreachable("Unexpected type for ld4r!");
6277     selectVectorLoadIntrinsic(Opc, 4, I);
6278     break;
6279   }
6280   case Intrinsic::aarch64_neon_st1x2: {
6281     LLT Ty = MRI.getType(I.getOperand(1).getReg());
6282     unsigned Opc;
6283     if (Ty == LLT::fixed_vector(8, S8))
6284       Opc = AArch64::ST1Twov8b;
6285     else if (Ty == LLT::fixed_vector(16, S8))
6286       Opc = AArch64::ST1Twov16b;
6287     else if (Ty == LLT::fixed_vector(4, S16))
6288       Opc = AArch64::ST1Twov4h;
6289     else if (Ty == LLT::fixed_vector(8, S16))
6290       Opc = AArch64::ST1Twov8h;
6291     else if (Ty == LLT::fixed_vector(2, S32))
6292       Opc = AArch64::ST1Twov2s;
6293     else if (Ty == LLT::fixed_vector(4, S32))
6294       Opc = AArch64::ST1Twov4s;
6295     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6296       Opc = AArch64::ST1Twov2d;
6297     else if (Ty == S64 || Ty == P0)
6298       Opc = AArch64::ST1Twov1d;
6299     else
6300       llvm_unreachable("Unexpected type for st1x2!");
6301     selectVectorStoreIntrinsic(I, 2, Opc);
6302     break;
6303   }
6304   case Intrinsic::aarch64_neon_st1x3: {
6305     LLT Ty = MRI.getType(I.getOperand(1).getReg());
6306     unsigned Opc;
6307     if (Ty == LLT::fixed_vector(8, S8))
6308       Opc = AArch64::ST1Threev8b;
6309     else if (Ty == LLT::fixed_vector(16, S8))
6310       Opc = AArch64::ST1Threev16b;
6311     else if (Ty == LLT::fixed_vector(4, S16))
6312       Opc = AArch64::ST1Threev4h;
6313     else if (Ty == LLT::fixed_vector(8, S16))
6314       Opc = AArch64::ST1Threev8h;
6315     else if (Ty == LLT::fixed_vector(2, S32))
6316       Opc = AArch64::ST1Threev2s;
6317     else if (Ty == LLT::fixed_vector(4, S32))
6318       Opc = AArch64::ST1Threev4s;
6319     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6320       Opc = AArch64::ST1Threev2d;
6321     else if (Ty == S64 || Ty == P0)
6322       Opc = AArch64::ST1Threev1d;
6323     else
6324       llvm_unreachable("Unexpected type for st1x3!");
6325     selectVectorStoreIntrinsic(I, 3, Opc);
6326     break;
6327   }
6328   case Intrinsic::aarch64_neon_st1x4: {
6329     LLT Ty = MRI.getType(I.getOperand(1).getReg());
6330     unsigned Opc;
6331     if (Ty == LLT::fixed_vector(8, S8))
6332       Opc = AArch64::ST1Fourv8b;
6333     else if (Ty == LLT::fixed_vector(16, S8))
6334       Opc = AArch64::ST1Fourv16b;
6335     else if (Ty == LLT::fixed_vector(4, S16))
6336       Opc = AArch64::ST1Fourv4h;
6337     else if (Ty == LLT::fixed_vector(8, S16))
6338       Opc = AArch64::ST1Fourv8h;
6339     else if (Ty == LLT::fixed_vector(2, S32))
6340       Opc = AArch64::ST1Fourv2s;
6341     else if (Ty == LLT::fixed_vector(4, S32))
6342       Opc = AArch64::ST1Fourv4s;
6343     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6344       Opc = AArch64::ST1Fourv2d;
6345     else if (Ty == S64 || Ty == P0)
6346       Opc = AArch64::ST1Fourv1d;
6347     else
6348       llvm_unreachable("Unexpected type for st1x4!");
6349     selectVectorStoreIntrinsic(I, 4, Opc);
6350     break;
6351   }
6352   case Intrinsic::aarch64_neon_st2: {
6353     LLT Ty = MRI.getType(I.getOperand(1).getReg());
6354     unsigned Opc;
6355     if (Ty == LLT::fixed_vector(8, S8))
6356       Opc = AArch64::ST2Twov8b;
6357     else if (Ty == LLT::fixed_vector(16, S8))
6358       Opc = AArch64::ST2Twov16b;
6359     else if (Ty == LLT::fixed_vector(4, S16))
6360       Opc = AArch64::ST2Twov4h;
6361     else if (Ty == LLT::fixed_vector(8, S16))
6362       Opc = AArch64::ST2Twov8h;
6363     else if (Ty == LLT::fixed_vector(2, S32))
6364       Opc = AArch64::ST2Twov2s;
6365     else if (Ty == LLT::fixed_vector(4, S32))
6366       Opc = AArch64::ST2Twov4s;
6367     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6368       Opc = AArch64::ST2Twov2d;
6369     else if (Ty == S64 || Ty == P0)
6370       Opc = AArch64::ST1Twov1d;
6371     else
6372       llvm_unreachable("Unexpected type for st2!");
6373     selectVectorStoreIntrinsic(I, 2, Opc);
6374     break;
6375   }
6376   case Intrinsic::aarch64_neon_st3: {
6377     LLT Ty = MRI.getType(I.getOperand(1).getReg());
6378     unsigned Opc;
6379     if (Ty == LLT::fixed_vector(8, S8))
6380       Opc = AArch64::ST3Threev8b;
6381     else if (Ty == LLT::fixed_vector(16, S8))
6382       Opc = AArch64::ST3Threev16b;
6383     else if (Ty == LLT::fixed_vector(4, S16))
6384       Opc = AArch64::ST3Threev4h;
6385     else if (Ty == LLT::fixed_vector(8, S16))
6386       Opc = AArch64::ST3Threev8h;
6387     else if (Ty == LLT::fixed_vector(2, S32))
6388       Opc = AArch64::ST3Threev2s;
6389     else if (Ty == LLT::fixed_vector(4, S32))
6390       Opc = AArch64::ST3Threev4s;
6391     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6392       Opc = AArch64::ST3Threev2d;
6393     else if (Ty == S64 || Ty == P0)
6394       Opc = AArch64::ST1Threev1d;
6395     else
6396       llvm_unreachable("Unexpected type for st3!");
6397     selectVectorStoreIntrinsic(I, 3, Opc);
6398     break;
6399   }
6400   case Intrinsic::aarch64_neon_st4: {
6401     LLT Ty = MRI.getType(I.getOperand(1).getReg());
6402     unsigned Opc;
6403     if (Ty == LLT::fixed_vector(8, S8))
6404       Opc = AArch64::ST4Fourv8b;
6405     else if (Ty == LLT::fixed_vector(16, S8))
6406       Opc = AArch64::ST4Fourv16b;
6407     else if (Ty == LLT::fixed_vector(4, S16))
6408       Opc = AArch64::ST4Fourv4h;
6409     else if (Ty == LLT::fixed_vector(8, S16))
6410       Opc = AArch64::ST4Fourv8h;
6411     else if (Ty == LLT::fixed_vector(2, S32))
6412       Opc = AArch64::ST4Fourv2s;
6413     else if (Ty == LLT::fixed_vector(4, S32))
6414       Opc = AArch64::ST4Fourv4s;
6415     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6416       Opc = AArch64::ST4Fourv2d;
6417     else if (Ty == S64 || Ty == P0)
6418       Opc = AArch64::ST1Fourv1d;
6419     else
6420       llvm_unreachable("Unexpected type for st4!");
6421     selectVectorStoreIntrinsic(I, 4, Opc);
6422     break;
6423   }
6424   case Intrinsic::aarch64_neon_st2lane: {
6425     LLT Ty = MRI.getType(I.getOperand(1).getReg());
6426     unsigned Opc;
6427     if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
6428       Opc = AArch64::ST2i8;
6429     else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16))
6430       Opc = AArch64::ST2i16;
6431     else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32))
6432       Opc = AArch64::ST2i32;
6433     else if (Ty == LLT::fixed_vector(2, S64) ||
6434              Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
6435       Opc = AArch64::ST2i64;
6436     else
6437       llvm_unreachable("Unexpected type for st2lane!");
6438     if (!selectVectorStoreLaneIntrinsic(I, 2, Opc))
6439       return false;
6440     break;
6441   }
6442   case Intrinsic::aarch64_neon_st3lane: {
6443     LLT Ty = MRI.getType(I.getOperand(1).getReg());
6444     unsigned Opc;
6445     if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
6446       Opc = AArch64::ST3i8;
6447     else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16))
6448       Opc = AArch64::ST3i16;
6449     else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32))
6450       Opc = AArch64::ST3i32;
6451     else if (Ty == LLT::fixed_vector(2, S64) ||
6452              Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
6453       Opc = AArch64::ST3i64;
6454     else
6455       llvm_unreachable("Unexpected type for st3lane!");
6456     if (!selectVectorStoreLaneIntrinsic(I, 3, Opc))
6457       return false;
6458     break;
6459   }
6460   case Intrinsic::aarch64_neon_st4lane: {
6461     LLT Ty = MRI.getType(I.getOperand(1).getReg());
6462     unsigned Opc;
6463     if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
6464       Opc = AArch64::ST4i8;
6465     else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16))
6466       Opc = AArch64::ST4i16;
6467     else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32))
6468       Opc = AArch64::ST4i32;
6469     else if (Ty == LLT::fixed_vector(2, S64) ||
6470              Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
6471       Opc = AArch64::ST4i64;
6472     else
6473       llvm_unreachable("Unexpected type for st4lane!");
6474     if (!selectVectorStoreLaneIntrinsic(I, 4, Opc))
6475       return false;
6476     break;
6477   }
6478   case Intrinsic::aarch64_mops_memset_tag: {
6479     // Transform
6480     //    %dst:gpr(p0) = \
6481     //      G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aarch64.mops.memset.tag),
6482     //      \ %dst:gpr(p0), %val:gpr(s64), %n:gpr(s64)
6483     // where %dst is updated, into
6484     //    %Rd:GPR64common, %Rn:GPR64) = \
6485     //      MOPSMemorySetTaggingPseudo \
6486     //      %Rd:GPR64common, %Rn:GPR64, %Rm:GPR64
6487     // where Rd and Rn are tied.
6488     // It is expected that %val has been extended to s64 in legalization.
6489     // Note that the order of the size/value operands are swapped.
6490 
6491     Register DstDef = I.getOperand(0).getReg();
6492     // I.getOperand(1) is the intrinsic function
6493     Register DstUse = I.getOperand(2).getReg();
6494     Register ValUse = I.getOperand(3).getReg();
6495     Register SizeUse = I.getOperand(4).getReg();
6496 
6497     // MOPSMemorySetTaggingPseudo has two defs; the intrinsic call has only one.
6498     // Therefore an additional virtual register is requried for the updated size
6499     // operand. This value is not accessible via the semantics of the intrinsic.
6500     Register SizeDef = MRI.createGenericVirtualRegister(LLT::scalar(64));
6501 
6502     auto Memset = MIB.buildInstr(AArch64::MOPSMemorySetTaggingPseudo,
6503                                  {DstDef, SizeDef}, {DstUse, SizeUse, ValUse});
6504     Memset.cloneMemRefs(I);
6505     constrainSelectedInstRegOperands(*Memset, TII, TRI, RBI);
6506     break;
6507   }
6508   }
6509 
6510   I.eraseFromParent();
6511   return true;
6512 }
6513 
6514 bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I,
6515                                                  MachineRegisterInfo &MRI) {
6516   unsigned IntrinID = cast<GIntrinsic>(I).getIntrinsicID();
6517 
6518   switch (IntrinID) {
6519   default:
6520     break;
6521   case Intrinsic::aarch64_crypto_sha1h: {
6522     Register DstReg = I.getOperand(0).getReg();
6523     Register SrcReg = I.getOperand(2).getReg();
6524 
6525     // FIXME: Should this be an assert?
6526     if (MRI.getType(DstReg).getSizeInBits() != 32 ||
6527         MRI.getType(SrcReg).getSizeInBits() != 32)
6528       return false;
6529 
6530     // The operation has to happen on FPRs. Set up some new FPR registers for
6531     // the source and destination if they are on GPRs.
6532     if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) {
6533       SrcReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
6534       MIB.buildCopy({SrcReg}, {I.getOperand(2)});
6535 
6536       // Make sure the copy ends up getting constrained properly.
6537       RBI.constrainGenericRegister(I.getOperand(2).getReg(),
6538                                    AArch64::GPR32RegClass, MRI);
6539     }
6540 
6541     if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID)
6542       DstReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
6543 
6544     // Actually insert the instruction.
6545     auto SHA1Inst = MIB.buildInstr(AArch64::SHA1Hrr, {DstReg}, {SrcReg});
6546     constrainSelectedInstRegOperands(*SHA1Inst, TII, TRI, RBI);
6547 
6548     // Did we create a new register for the destination?
6549     if (DstReg != I.getOperand(0).getReg()) {
6550       // Yep. Copy the result of the instruction back into the original
6551       // destination.
6552       MIB.buildCopy({I.getOperand(0)}, {DstReg});
6553       RBI.constrainGenericRegister(I.getOperand(0).getReg(),
6554                                    AArch64::GPR32RegClass, MRI);
6555     }
6556 
6557     I.eraseFromParent();
6558     return true;
6559   }
6560   case Intrinsic::ptrauth_resign: {
6561     Register DstReg = I.getOperand(0).getReg();
6562     Register ValReg = I.getOperand(2).getReg();
6563     uint64_t AUTKey = I.getOperand(3).getImm();
6564     Register AUTDisc = I.getOperand(4).getReg();
6565     uint64_t PACKey = I.getOperand(5).getImm();
6566     Register PACDisc = I.getOperand(6).getReg();
6567 
6568     Register AUTAddrDisc = AUTDisc;
6569     uint16_t AUTConstDiscC = 0;
6570     std::tie(AUTConstDiscC, AUTAddrDisc) =
6571         extractPtrauthBlendDiscriminators(AUTDisc, MRI);
6572 
6573     Register PACAddrDisc = PACDisc;
6574     uint16_t PACConstDiscC = 0;
6575     std::tie(PACConstDiscC, PACAddrDisc) =
6576         extractPtrauthBlendDiscriminators(PACDisc, MRI);
6577 
6578     MIB.buildCopy({AArch64::X16}, {ValReg});
6579     MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {AArch64::X17}, {});
6580     MIB.buildInstr(AArch64::AUTPAC)
6581         .addImm(AUTKey)
6582         .addImm(AUTConstDiscC)
6583         .addUse(AUTAddrDisc)
6584         .addImm(PACKey)
6585         .addImm(PACConstDiscC)
6586         .addUse(PACAddrDisc)
6587         .constrainAllUses(TII, TRI, RBI);
6588     MIB.buildCopy({DstReg}, Register(AArch64::X16));
6589 
6590     RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI);
6591     I.eraseFromParent();
6592     return true;
6593   }
6594   case Intrinsic::ptrauth_auth: {
6595     Register DstReg = I.getOperand(0).getReg();
6596     Register ValReg = I.getOperand(2).getReg();
6597     uint64_t AUTKey = I.getOperand(3).getImm();
6598     Register AUTDisc = I.getOperand(4).getReg();
6599 
6600     Register AUTAddrDisc = AUTDisc;
6601     uint16_t AUTConstDiscC = 0;
6602     std::tie(AUTConstDiscC, AUTAddrDisc) =
6603         extractPtrauthBlendDiscriminators(AUTDisc, MRI);
6604 
6605     MIB.buildCopy({AArch64::X16}, {ValReg});
6606     MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {AArch64::X17}, {});
6607     MIB.buildInstr(AArch64::AUT)
6608         .addImm(AUTKey)
6609         .addImm(AUTConstDiscC)
6610         .addUse(AUTAddrDisc)
6611         .constrainAllUses(TII, TRI, RBI);
6612     MIB.buildCopy({DstReg}, Register(AArch64::X16));
6613 
6614     RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI);
6615     I.eraseFromParent();
6616     return true;
6617   }
6618   case Intrinsic::frameaddress:
6619   case Intrinsic::returnaddress: {
6620     MachineFunction &MF = *I.getParent()->getParent();
6621     MachineFrameInfo &MFI = MF.getFrameInfo();
6622 
6623     unsigned Depth = I.getOperand(2).getImm();
6624     Register DstReg = I.getOperand(0).getReg();
6625     RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI);
6626 
6627     if (Depth == 0 && IntrinID == Intrinsic::returnaddress) {
6628       if (!MFReturnAddr) {
6629         // Insert the copy from LR/X30 into the entry block, before it can be
6630         // clobbered by anything.
6631         MFI.setReturnAddressIsTaken(true);
6632         MFReturnAddr = getFunctionLiveInPhysReg(
6633             MF, TII, AArch64::LR, AArch64::GPR64RegClass, I.getDebugLoc());
6634       }
6635 
6636       if (STI.hasPAuth()) {
6637         MIB.buildInstr(AArch64::XPACI, {DstReg}, {MFReturnAddr});
6638       } else {
6639         MIB.buildCopy({Register(AArch64::LR)}, {MFReturnAddr});
6640         MIB.buildInstr(AArch64::XPACLRI);
6641         MIB.buildCopy({DstReg}, {Register(AArch64::LR)});
6642       }
6643 
6644       I.eraseFromParent();
6645       return true;
6646     }
6647 
6648     MFI.setFrameAddressIsTaken(true);
6649     Register FrameAddr(AArch64::FP);
6650     while (Depth--) {
6651       Register NextFrame = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
6652       auto Ldr =
6653           MIB.buildInstr(AArch64::LDRXui, {NextFrame}, {FrameAddr}).addImm(0);
6654       constrainSelectedInstRegOperands(*Ldr, TII, TRI, RBI);
6655       FrameAddr = NextFrame;
6656     }
6657 
6658     if (IntrinID == Intrinsic::frameaddress)
6659       MIB.buildCopy({DstReg}, {FrameAddr});
6660     else {
6661       MFI.setReturnAddressIsTaken(true);
6662 
6663       if (STI.hasPAuth()) {
6664         Register TmpReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
6665         MIB.buildInstr(AArch64::LDRXui, {TmpReg}, {FrameAddr}).addImm(1);
6666         MIB.buildInstr(AArch64::XPACI, {DstReg}, {TmpReg});
6667       } else {
6668         MIB.buildInstr(AArch64::LDRXui, {Register(AArch64::LR)}, {FrameAddr})
6669             .addImm(1);
6670         MIB.buildInstr(AArch64::XPACLRI);
6671         MIB.buildCopy({DstReg}, {Register(AArch64::LR)});
6672       }
6673     }
6674 
6675     I.eraseFromParent();
6676     return true;
6677   }
6678   case Intrinsic::aarch64_neon_tbl2:
6679     SelectTable(I, MRI, 2, AArch64::TBLv8i8Two, AArch64::TBLv16i8Two, false);
6680     return true;
6681   case Intrinsic::aarch64_neon_tbl3:
6682     SelectTable(I, MRI, 3, AArch64::TBLv8i8Three, AArch64::TBLv16i8Three,
6683                 false);
6684     return true;
6685   case Intrinsic::aarch64_neon_tbl4:
6686     SelectTable(I, MRI, 4, AArch64::TBLv8i8Four, AArch64::TBLv16i8Four, false);
6687     return true;
6688   case Intrinsic::aarch64_neon_tbx2:
6689     SelectTable(I, MRI, 2, AArch64::TBXv8i8Two, AArch64::TBXv16i8Two, true);
6690     return true;
6691   case Intrinsic::aarch64_neon_tbx3:
6692     SelectTable(I, MRI, 3, AArch64::TBXv8i8Three, AArch64::TBXv16i8Three, true);
6693     return true;
6694   case Intrinsic::aarch64_neon_tbx4:
6695     SelectTable(I, MRI, 4, AArch64::TBXv8i8Four, AArch64::TBXv16i8Four, true);
6696     return true;
6697   case Intrinsic::swift_async_context_addr:
6698     auto Sub = MIB.buildInstr(AArch64::SUBXri, {I.getOperand(0).getReg()},
6699                               {Register(AArch64::FP)})
6700                    .addImm(8)
6701                    .addImm(0);
6702     constrainSelectedInstRegOperands(*Sub, TII, TRI, RBI);
6703 
6704     MF->getFrameInfo().setFrameAddressIsTaken(true);
6705     MF->getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
6706     I.eraseFromParent();
6707     return true;
6708   }
6709   return false;
6710 }
6711 
6712 // G_PTRAUTH_GLOBAL_VALUE lowering
6713 //
6714 // We have 3 lowering alternatives to choose from:
6715 // - MOVaddrPAC: similar to MOVaddr, with added PAC.
6716 //   If the GV doesn't need a GOT load (i.e., is locally defined)
6717 //   materialize the pointer using adrp+add+pac. See LowerMOVaddrPAC.
6718 //
6719 // - LOADgotPAC: similar to LOADgot, with added PAC.
6720 //   If the GV needs a GOT load, materialize the pointer using the usual
6721 //   GOT adrp+ldr, +pac. Pointers in GOT are assumed to be not signed, the GOT
6722 //   section is assumed to be read-only (for example, via relro mechanism). See
6723 //   LowerMOVaddrPAC.
6724 //
6725 // - LOADauthptrstatic: similar to LOADgot, but use a
6726 //   special stub slot instead of a GOT slot.
6727 //   Load a signed pointer for symbol 'sym' from a stub slot named
6728 //   'sym$auth_ptr$key$disc' filled by dynamic linker during relocation
6729 //   resolving. This usually lowers to adrp+ldr, but also emits an entry into
6730 //   .data with an
6731 //   @AUTH relocation. See LowerLOADauthptrstatic.
6732 //
6733 // All 3 are pseudos that are expand late to longer sequences: this lets us
6734 // provide integrity guarantees on the to-be-signed intermediate values.
6735 //
6736 // LOADauthptrstatic is undesirable because it requires a large section filled
6737 // with often similarly-signed pointers, making it a good harvesting target.
6738 // Thus, it's only used for ptrauth references to extern_weak to avoid null
6739 // checks.
6740 
6741 bool AArch64InstructionSelector::selectPtrAuthGlobalValue(
6742     MachineInstr &I, MachineRegisterInfo &MRI) const {
6743   Register DefReg = I.getOperand(0).getReg();
6744   Register Addr = I.getOperand(1).getReg();
6745   uint64_t Key = I.getOperand(2).getImm();
6746   Register AddrDisc = I.getOperand(3).getReg();
6747   uint64_t Disc = I.getOperand(4).getImm();
6748   int64_t Offset = 0;
6749 
6750   if (Key > AArch64PACKey::LAST)
6751     report_fatal_error("key in ptrauth global out of range [0, " +
6752                        Twine((int)AArch64PACKey::LAST) + "]");
6753 
6754   // Blend only works if the integer discriminator is 16-bit wide.
6755   if (!isUInt<16>(Disc))
6756     report_fatal_error(
6757         "constant discriminator in ptrauth global out of range [0, 0xffff]");
6758 
6759   // Choosing between 3 lowering alternatives is target-specific.
6760   if (!STI.isTargetELF() && !STI.isTargetMachO())
6761     report_fatal_error("ptrauth global lowering only supported on MachO/ELF");
6762 
6763   if (!MRI.hasOneDef(Addr))
6764     return false;
6765 
6766   // First match any offset we take from the real global.
6767   const MachineInstr *DefMI = &*MRI.def_instr_begin(Addr);
6768   if (DefMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
6769     Register OffsetReg = DefMI->getOperand(2).getReg();
6770     if (!MRI.hasOneDef(OffsetReg))
6771       return false;
6772     const MachineInstr &OffsetMI = *MRI.def_instr_begin(OffsetReg);
6773     if (OffsetMI.getOpcode() != TargetOpcode::G_CONSTANT)
6774       return false;
6775 
6776     Addr = DefMI->getOperand(1).getReg();
6777     if (!MRI.hasOneDef(Addr))
6778       return false;
6779 
6780     DefMI = &*MRI.def_instr_begin(Addr);
6781     Offset = OffsetMI.getOperand(1).getCImm()->getSExtValue();
6782   }
6783 
6784   // We should be left with a genuine unauthenticated GlobalValue.
6785   const GlobalValue *GV;
6786   if (DefMI->getOpcode() == TargetOpcode::G_GLOBAL_VALUE) {
6787     GV = DefMI->getOperand(1).getGlobal();
6788     Offset += DefMI->getOperand(1).getOffset();
6789   } else if (DefMI->getOpcode() == AArch64::G_ADD_LOW) {
6790     GV = DefMI->getOperand(2).getGlobal();
6791     Offset += DefMI->getOperand(2).getOffset();
6792   } else {
6793     return false;
6794   }
6795 
6796   MachineIRBuilder MIB(I);
6797 
6798   // Classify the reference to determine whether it needs a GOT load.
6799   unsigned OpFlags = STI.ClassifyGlobalReference(GV, TM);
6800   const bool NeedsGOTLoad = ((OpFlags & AArch64II::MO_GOT) != 0);
6801   assert(((OpFlags & (~AArch64II::MO_GOT)) == 0) &&
6802          "unsupported non-GOT op flags on ptrauth global reference");
6803   assert((!GV->hasExternalWeakLinkage() || NeedsGOTLoad) &&
6804          "unsupported non-GOT reference to weak ptrauth global");
6805 
6806   std::optional<APInt> AddrDiscVal = getIConstantVRegVal(AddrDisc, MRI);
6807   bool HasAddrDisc = !AddrDiscVal || *AddrDiscVal != 0;
6808 
6809   // Non-extern_weak:
6810   // - No GOT load needed -> MOVaddrPAC
6811   // - GOT load for non-extern_weak -> LOADgotPAC
6812   //   Note that we disallow extern_weak refs to avoid null checks later.
6813   if (!GV->hasExternalWeakLinkage()) {
6814     MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {AArch64::X16}, {});
6815     MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {AArch64::X17}, {});
6816     MIB.buildInstr(NeedsGOTLoad ? AArch64::LOADgotPAC : AArch64::MOVaddrPAC)
6817         .addGlobalAddress(GV, Offset)
6818         .addImm(Key)
6819         .addReg(HasAddrDisc ? AddrDisc : AArch64::XZR)
6820         .addImm(Disc)
6821         .constrainAllUses(TII, TRI, RBI);
6822     MIB.buildCopy(DefReg, Register(AArch64::X16));
6823     RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI);
6824     I.eraseFromParent();
6825     return true;
6826   }
6827 
6828   // extern_weak -> LOADauthptrstatic
6829 
6830   // Offsets and extern_weak don't mix well: ptrauth aside, you'd get the
6831   // offset alone as a pointer if the symbol wasn't available, which would
6832   // probably break null checks in users. Ptrauth complicates things further:
6833   // error out.
6834   if (Offset != 0)
6835     report_fatal_error(
6836         "unsupported non-zero offset in weak ptrauth global reference");
6837 
6838   if (HasAddrDisc)
6839     report_fatal_error("unsupported weak addr-div ptrauth global");
6840 
6841   MIB.buildInstr(AArch64::LOADauthptrstatic, {DefReg}, {})
6842       .addGlobalAddress(GV, Offset)
6843       .addImm(Key)
6844       .addImm(Disc);
6845   RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI);
6846 
6847   I.eraseFromParent();
6848   return true;
6849 }
6850 
6851 void AArch64InstructionSelector::SelectTable(MachineInstr &I,
6852                                              MachineRegisterInfo &MRI,
6853                                              unsigned NumVec, unsigned Opc1,
6854                                              unsigned Opc2, bool isExt) {
6855   Register DstReg = I.getOperand(0).getReg();
6856   unsigned Opc = MRI.getType(DstReg) == LLT::fixed_vector(8, 8) ? Opc1 : Opc2;
6857 
6858   // Create the REG_SEQUENCE
6859   SmallVector<Register, 4> Regs;
6860   for (unsigned i = 0; i < NumVec; i++)
6861     Regs.push_back(I.getOperand(i + 2 + isExt).getReg());
6862   Register RegSeq = createQTuple(Regs, MIB);
6863 
6864   Register IdxReg = I.getOperand(2 + NumVec + isExt).getReg();
6865   MachineInstrBuilder Instr;
6866   if (isExt) {
6867     Register Reg = I.getOperand(2).getReg();
6868     Instr = MIB.buildInstr(Opc, {DstReg}, {Reg, RegSeq, IdxReg});
6869   } else
6870     Instr = MIB.buildInstr(Opc, {DstReg}, {RegSeq, IdxReg});
6871   constrainSelectedInstRegOperands(*Instr, TII, TRI, RBI);
6872   I.eraseFromParent();
6873 }
6874 
6875 InstructionSelector::ComplexRendererFns
6876 AArch64InstructionSelector::selectShiftA_32(const MachineOperand &Root) const {
6877   auto MaybeImmed = getImmedFromMO(Root);
6878   if (MaybeImmed == std::nullopt || *MaybeImmed > 31)
6879     return std::nullopt;
6880   uint64_t Enc = (32 - *MaybeImmed) & 0x1f;
6881   return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
6882 }
6883 
6884 InstructionSelector::ComplexRendererFns
6885 AArch64InstructionSelector::selectShiftB_32(const MachineOperand &Root) const {
6886   auto MaybeImmed = getImmedFromMO(Root);
6887   if (MaybeImmed == std::nullopt || *MaybeImmed > 31)
6888     return std::nullopt;
6889   uint64_t Enc = 31 - *MaybeImmed;
6890   return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
6891 }
6892 
6893 InstructionSelector::ComplexRendererFns
6894 AArch64InstructionSelector::selectShiftA_64(const MachineOperand &Root) const {
6895   auto MaybeImmed = getImmedFromMO(Root);
6896   if (MaybeImmed == std::nullopt || *MaybeImmed > 63)
6897     return std::nullopt;
6898   uint64_t Enc = (64 - *MaybeImmed) & 0x3f;
6899   return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
6900 }
6901 
6902 InstructionSelector::ComplexRendererFns
6903 AArch64InstructionSelector::selectShiftB_64(const MachineOperand &Root) const {
6904   auto MaybeImmed = getImmedFromMO(Root);
6905   if (MaybeImmed == std::nullopt || *MaybeImmed > 63)
6906     return std::nullopt;
6907   uint64_t Enc = 63 - *MaybeImmed;
6908   return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
6909 }
6910 
6911 /// Helper to select an immediate value that can be represented as a 12-bit
6912 /// value shifted left by either 0 or 12. If it is possible to do so, return
6913 /// the immediate and shift value. If not, return std::nullopt.
6914 ///
6915 /// Used by selectArithImmed and selectNegArithImmed.
6916 InstructionSelector::ComplexRendererFns
6917 AArch64InstructionSelector::select12BitValueWithLeftShift(
6918     uint64_t Immed) const {
6919   unsigned ShiftAmt;
6920   if (Immed >> 12 == 0) {
6921     ShiftAmt = 0;
6922   } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) {
6923     ShiftAmt = 12;
6924     Immed = Immed >> 12;
6925   } else
6926     return std::nullopt;
6927 
6928   unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt);
6929   return {{
6930       [=](MachineInstrBuilder &MIB) { MIB.addImm(Immed); },
6931       [=](MachineInstrBuilder &MIB) { MIB.addImm(ShVal); },
6932   }};
6933 }
6934 
6935 /// SelectArithImmed - Select an immediate value that can be represented as
6936 /// a 12-bit value shifted left by either 0 or 12.  If so, return true with
6937 /// Val set to the 12-bit value and Shift set to the shifter operand.
6938 InstructionSelector::ComplexRendererFns
6939 AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const {
6940   // This function is called from the addsub_shifted_imm ComplexPattern,
6941   // which lists [imm] as the list of opcode it's interested in, however
6942   // we still need to check whether the operand is actually an immediate
6943   // here because the ComplexPattern opcode list is only used in
6944   // root-level opcode matching.
6945   auto MaybeImmed = getImmedFromMO(Root);
6946   if (MaybeImmed == std::nullopt)
6947     return std::nullopt;
6948   return select12BitValueWithLeftShift(*MaybeImmed);
6949 }
6950 
6951 /// SelectNegArithImmed - As above, but negates the value before trying to
6952 /// select it.
6953 InstructionSelector::ComplexRendererFns
6954 AArch64InstructionSelector::selectNegArithImmed(MachineOperand &Root) const {
6955   // We need a register here, because we need to know if we have a 64 or 32
6956   // bit immediate.
6957   if (!Root.isReg())
6958     return std::nullopt;
6959   auto MaybeImmed = getImmedFromMO(Root);
6960   if (MaybeImmed == std::nullopt)
6961     return std::nullopt;
6962   uint64_t Immed = *MaybeImmed;
6963 
6964   // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0"
6965   // have the opposite effect on the C flag, so this pattern mustn't match under
6966   // those circumstances.
6967   if (Immed == 0)
6968     return std::nullopt;
6969 
6970   // Check if we're dealing with a 32-bit type on the root or a 64-bit type on
6971   // the root.
6972   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
6973   if (MRI.getType(Root.getReg()).getSizeInBits() == 32)
6974     Immed = ~((uint32_t)Immed) + 1;
6975   else
6976     Immed = ~Immed + 1ULL;
6977 
6978   if (Immed & 0xFFFFFFFFFF000000ULL)
6979     return std::nullopt;
6980 
6981   Immed &= 0xFFFFFFULL;
6982   return select12BitValueWithLeftShift(Immed);
6983 }
6984 
6985 /// Checks if we are sure that folding MI into load/store addressing mode is
6986 /// beneficial or not.
6987 ///
6988 /// Returns:
6989 /// - true if folding MI would be beneficial.
6990 /// - false if folding MI would be bad.
6991 /// - std::nullopt if it is not sure whether folding MI is beneficial.
6992 ///
6993 /// \p MI can be the offset operand of G_PTR_ADD, e.g. G_SHL in the example:
6994 ///
6995 /// %13:gpr(s64) = G_CONSTANT i64 1
6996 /// %8:gpr(s64) = G_SHL %6, %13(s64)
6997 /// %9:gpr(p0) = G_PTR_ADD %0, %8(s64)
6998 /// %12:gpr(s32) = G_LOAD %9(p0) :: (load (s16))
6999 std::optional<bool> AArch64InstructionSelector::isWorthFoldingIntoAddrMode(
7000     MachineInstr &MI, const MachineRegisterInfo &MRI) const {
7001   if (MI.getOpcode() == AArch64::G_SHL) {
7002     // Address operands with shifts are free, except for running on subtargets
7003     // with AddrLSLSlow14.
7004     if (const auto ValAndVeg = getIConstantVRegValWithLookThrough(
7005             MI.getOperand(2).getReg(), MRI)) {
7006       const APInt ShiftVal = ValAndVeg->Value;
7007 
7008       // Don't fold if we know this will be slow.
7009       return !(STI.hasAddrLSLSlow14() && (ShiftVal == 1 || ShiftVal == 4));
7010     }
7011   }
7012   return std::nullopt;
7013 }
7014 
7015 /// Return true if it is worth folding MI into an extended register. That is,
7016 /// if it's safe to pull it into the addressing mode of a load or store as a
7017 /// shift.
7018 /// \p IsAddrOperand whether the def of MI is used as an address operand
7019 /// (e.g. feeding into an LDR/STR).
7020 bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg(
7021     MachineInstr &MI, const MachineRegisterInfo &MRI,
7022     bool IsAddrOperand) const {
7023 
7024   // Always fold if there is one use, or if we're optimizing for size.
7025   Register DefReg = MI.getOperand(0).getReg();
7026   if (MRI.hasOneNonDBGUse(DefReg) ||
7027       MI.getParent()->getParent()->getFunction().hasOptSize())
7028     return true;
7029 
7030   if (IsAddrOperand) {
7031     // If we are already sure that folding MI is good or bad, return the result.
7032     if (const auto Worth = isWorthFoldingIntoAddrMode(MI, MRI))
7033       return *Worth;
7034 
7035     // Fold G_PTR_ADD if its offset operand can be folded
7036     if (MI.getOpcode() == AArch64::G_PTR_ADD) {
7037       MachineInstr *OffsetInst =
7038           getDefIgnoringCopies(MI.getOperand(2).getReg(), MRI);
7039 
7040       // Note, we already know G_PTR_ADD is used by at least two instructions.
7041       // If we are also sure about whether folding is beneficial or not,
7042       // return the result.
7043       if (const auto Worth = isWorthFoldingIntoAddrMode(*OffsetInst, MRI))
7044         return *Worth;
7045     }
7046   }
7047 
7048   // FIXME: Consider checking HasALULSLFast as appropriate.
7049 
7050   // We have a fastpath, so folding a shift in and potentially computing it
7051   // many times may be beneficial. Check if this is only used in memory ops.
7052   // If it is, then we should fold.
7053   return all_of(MRI.use_nodbg_instructions(DefReg),
7054                 [](MachineInstr &Use) { return Use.mayLoadOrStore(); });
7055 }
7056 
7057 static bool isSignExtendShiftType(AArch64_AM::ShiftExtendType Type) {
7058   switch (Type) {
7059   case AArch64_AM::SXTB:
7060   case AArch64_AM::SXTH:
7061   case AArch64_AM::SXTW:
7062     return true;
7063   default:
7064     return false;
7065   }
7066 }
7067 
7068 InstructionSelector::ComplexRendererFns
7069 AArch64InstructionSelector::selectExtendedSHL(
7070     MachineOperand &Root, MachineOperand &Base, MachineOperand &Offset,
7071     unsigned SizeInBytes, bool WantsExt) const {
7072   assert(Base.isReg() && "Expected base to be a register operand");
7073   assert(Offset.isReg() && "Expected offset to be a register operand");
7074 
7075   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7076   MachineInstr *OffsetInst = MRI.getVRegDef(Offset.getReg());
7077 
7078   unsigned OffsetOpc = OffsetInst->getOpcode();
7079   bool LookedThroughZExt = false;
7080   if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL) {
7081     // Try to look through a ZEXT.
7082     if (OffsetOpc != TargetOpcode::G_ZEXT || !WantsExt)
7083       return std::nullopt;
7084 
7085     OffsetInst = MRI.getVRegDef(OffsetInst->getOperand(1).getReg());
7086     OffsetOpc = OffsetInst->getOpcode();
7087     LookedThroughZExt = true;
7088 
7089     if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL)
7090       return std::nullopt;
7091   }
7092   // Make sure that the memory op is a valid size.
7093   int64_t LegalShiftVal = Log2_32(SizeInBytes);
7094   if (LegalShiftVal == 0)
7095     return std::nullopt;
7096   if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI, true))
7097     return std::nullopt;
7098 
7099   // Now, try to find the specific G_CONSTANT. Start by assuming that the
7100   // register we will offset is the LHS, and the register containing the
7101   // constant is the RHS.
7102   Register OffsetReg = OffsetInst->getOperand(1).getReg();
7103   Register ConstantReg = OffsetInst->getOperand(2).getReg();
7104   auto ValAndVReg = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
7105   if (!ValAndVReg) {
7106     // We didn't get a constant on the RHS. If the opcode is a shift, then
7107     // we're done.
7108     if (OffsetOpc == TargetOpcode::G_SHL)
7109       return std::nullopt;
7110 
7111     // If we have a G_MUL, we can use either register. Try looking at the RHS.
7112     std::swap(OffsetReg, ConstantReg);
7113     ValAndVReg = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
7114     if (!ValAndVReg)
7115       return std::nullopt;
7116   }
7117 
7118   // The value must fit into 3 bits, and must be positive. Make sure that is
7119   // true.
7120   int64_t ImmVal = ValAndVReg->Value.getSExtValue();
7121 
7122   // Since we're going to pull this into a shift, the constant value must be
7123   // a power of 2. If we got a multiply, then we need to check this.
7124   if (OffsetOpc == TargetOpcode::G_MUL) {
7125     if (!llvm::has_single_bit<uint32_t>(ImmVal))
7126       return std::nullopt;
7127 
7128     // Got a power of 2. So, the amount we'll shift is the log base-2 of that.
7129     ImmVal = Log2_32(ImmVal);
7130   }
7131 
7132   if ((ImmVal & 0x7) != ImmVal)
7133     return std::nullopt;
7134 
7135   // We are only allowed to shift by LegalShiftVal. This shift value is built
7136   // into the instruction, so we can't just use whatever we want.
7137   if (ImmVal != LegalShiftVal)
7138     return std::nullopt;
7139 
7140   unsigned SignExtend = 0;
7141   if (WantsExt) {
7142     // Check if the offset is defined by an extend, unless we looked through a
7143     // G_ZEXT earlier.
7144     if (!LookedThroughZExt) {
7145       MachineInstr *ExtInst = getDefIgnoringCopies(OffsetReg, MRI);
7146       auto Ext = getExtendTypeForInst(*ExtInst, MRI, true);
7147       if (Ext == AArch64_AM::InvalidShiftExtend)
7148         return std::nullopt;
7149 
7150       SignExtend = isSignExtendShiftType(Ext) ? 1 : 0;
7151       // We only support SXTW for signed extension here.
7152       if (SignExtend && Ext != AArch64_AM::SXTW)
7153         return std::nullopt;
7154       OffsetReg = ExtInst->getOperand(1).getReg();
7155     }
7156 
7157     // Need a 32-bit wide register here.
7158     MachineIRBuilder MIB(*MRI.getVRegDef(Root.getReg()));
7159     OffsetReg = moveScalarRegClass(OffsetReg, AArch64::GPR32RegClass, MIB);
7160   }
7161 
7162   // We can use the LHS of the GEP as the base, and the LHS of the shift as an
7163   // offset. Signify that we are shifting by setting the shift flag to 1.
7164   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(Base.getReg()); },
7165            [=](MachineInstrBuilder &MIB) { MIB.addUse(OffsetReg); },
7166            [=](MachineInstrBuilder &MIB) {
7167              // Need to add both immediates here to make sure that they are both
7168              // added to the instruction.
7169              MIB.addImm(SignExtend);
7170              MIB.addImm(1);
7171            }}};
7172 }
7173 
7174 /// This is used for computing addresses like this:
7175 ///
7176 /// ldr x1, [x2, x3, lsl #3]
7177 ///
7178 /// Where x2 is the base register, and x3 is an offset register. The shift-left
7179 /// is a constant value specific to this load instruction. That is, we'll never
7180 /// see anything other than a 3 here (which corresponds to the size of the
7181 /// element being loaded.)
7182 InstructionSelector::ComplexRendererFns
7183 AArch64InstructionSelector::selectAddrModeShiftedExtendXReg(
7184     MachineOperand &Root, unsigned SizeInBytes) const {
7185   if (!Root.isReg())
7186     return std::nullopt;
7187   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7188 
7189   // We want to find something like this:
7190   //
7191   // val = G_CONSTANT LegalShiftVal
7192   // shift = G_SHL off_reg val
7193   // ptr = G_PTR_ADD base_reg shift
7194   // x = G_LOAD ptr
7195   //
7196   // And fold it into this addressing mode:
7197   //
7198   // ldr x, [base_reg, off_reg, lsl #LegalShiftVal]
7199 
7200   // Check if we can find the G_PTR_ADD.
7201   MachineInstr *PtrAdd =
7202       getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
7203   if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI, true))
7204     return std::nullopt;
7205 
7206   // Now, try to match an opcode which will match our specific offset.
7207   // We want a G_SHL or a G_MUL.
7208   MachineInstr *OffsetInst =
7209       getDefIgnoringCopies(PtrAdd->getOperand(2).getReg(), MRI);
7210   return selectExtendedSHL(Root, PtrAdd->getOperand(1),
7211                            OffsetInst->getOperand(0), SizeInBytes,
7212                            /*WantsExt=*/false);
7213 }
7214 
7215 /// This is used for computing addresses like this:
7216 ///
7217 /// ldr x1, [x2, x3]
7218 ///
7219 /// Where x2 is the base register, and x3 is an offset register.
7220 ///
7221 /// When possible (or profitable) to fold a G_PTR_ADD into the address
7222 /// calculation, this will do so. Otherwise, it will return std::nullopt.
7223 InstructionSelector::ComplexRendererFns
7224 AArch64InstructionSelector::selectAddrModeRegisterOffset(
7225     MachineOperand &Root) const {
7226   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7227 
7228   // We need a GEP.
7229   MachineInstr *Gep = MRI.getVRegDef(Root.getReg());
7230   if (Gep->getOpcode() != TargetOpcode::G_PTR_ADD)
7231     return std::nullopt;
7232 
7233   // If this is used more than once, let's not bother folding.
7234   // TODO: Check if they are memory ops. If they are, then we can still fold
7235   // without having to recompute anything.
7236   if (!MRI.hasOneNonDBGUse(Gep->getOperand(0).getReg()))
7237     return std::nullopt;
7238 
7239   // Base is the GEP's LHS, offset is its RHS.
7240   return {{[=](MachineInstrBuilder &MIB) {
7241              MIB.addUse(Gep->getOperand(1).getReg());
7242            },
7243            [=](MachineInstrBuilder &MIB) {
7244              MIB.addUse(Gep->getOperand(2).getReg());
7245            },
7246            [=](MachineInstrBuilder &MIB) {
7247              // Need to add both immediates here to make sure that they are both
7248              // added to the instruction.
7249              MIB.addImm(0);
7250              MIB.addImm(0);
7251            }}};
7252 }
7253 
7254 /// This is intended to be equivalent to selectAddrModeXRO in
7255 /// AArch64ISelDAGtoDAG. It's used for selecting X register offset loads.
7256 InstructionSelector::ComplexRendererFns
7257 AArch64InstructionSelector::selectAddrModeXRO(MachineOperand &Root,
7258                                               unsigned SizeInBytes) const {
7259   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7260   if (!Root.isReg())
7261     return std::nullopt;
7262   MachineInstr *PtrAdd =
7263       getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
7264   if (!PtrAdd)
7265     return std::nullopt;
7266 
7267   // Check for an immediates which cannot be encoded in the [base + imm]
7268   // addressing mode, and can't be encoded in an add/sub. If this happens, we'll
7269   // end up with code like:
7270   //
7271   // mov x0, wide
7272   // add x1 base, x0
7273   // ldr x2, [x1, x0]
7274   //
7275   // In this situation, we can use the [base, xreg] addressing mode to save an
7276   // add/sub:
7277   //
7278   // mov x0, wide
7279   // ldr x2, [base, x0]
7280   auto ValAndVReg =
7281       getIConstantVRegValWithLookThrough(PtrAdd->getOperand(2).getReg(), MRI);
7282   if (ValAndVReg) {
7283     unsigned Scale = Log2_32(SizeInBytes);
7284     int64_t ImmOff = ValAndVReg->Value.getSExtValue();
7285 
7286     // Skip immediates that can be selected in the load/store addresing
7287     // mode.
7288     if (ImmOff % SizeInBytes == 0 && ImmOff >= 0 &&
7289         ImmOff < (0x1000 << Scale))
7290       return std::nullopt;
7291 
7292     // Helper lambda to decide whether or not it is preferable to emit an add.
7293     auto isPreferredADD = [](int64_t ImmOff) {
7294       // Constants in [0x0, 0xfff] can be encoded in an add.
7295       if ((ImmOff & 0xfffffffffffff000LL) == 0x0LL)
7296         return true;
7297 
7298       // Can it be encoded in an add lsl #12?
7299       if ((ImmOff & 0xffffffffff000fffLL) != 0x0LL)
7300         return false;
7301 
7302       // It can be encoded in an add lsl #12, but we may not want to. If it is
7303       // possible to select this as a single movz, then prefer that. A single
7304       // movz is faster than an add with a shift.
7305       return (ImmOff & 0xffffffffff00ffffLL) != 0x0LL &&
7306              (ImmOff & 0xffffffffffff0fffLL) != 0x0LL;
7307     };
7308 
7309     // If the immediate can be encoded in a single add/sub, then bail out.
7310     if (isPreferredADD(ImmOff) || isPreferredADD(-ImmOff))
7311       return std::nullopt;
7312   }
7313 
7314   // Try to fold shifts into the addressing mode.
7315   auto AddrModeFns = selectAddrModeShiftedExtendXReg(Root, SizeInBytes);
7316   if (AddrModeFns)
7317     return AddrModeFns;
7318 
7319   // If that doesn't work, see if it's possible to fold in registers from
7320   // a GEP.
7321   return selectAddrModeRegisterOffset(Root);
7322 }
7323 
7324 /// This is used for computing addresses like this:
7325 ///
7326 /// ldr x0, [xBase, wOffset, sxtw #LegalShiftVal]
7327 ///
7328 /// Where we have a 64-bit base register, a 32-bit offset register, and an
7329 /// extend (which may or may not be signed).
7330 InstructionSelector::ComplexRendererFns
7331 AArch64InstructionSelector::selectAddrModeWRO(MachineOperand &Root,
7332                                               unsigned SizeInBytes) const {
7333   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7334 
7335   MachineInstr *PtrAdd =
7336       getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
7337   if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI, true))
7338     return std::nullopt;
7339 
7340   MachineOperand &LHS = PtrAdd->getOperand(1);
7341   MachineOperand &RHS = PtrAdd->getOperand(2);
7342   MachineInstr *OffsetInst = getDefIgnoringCopies(RHS.getReg(), MRI);
7343 
7344   // The first case is the same as selectAddrModeXRO, except we need an extend.
7345   // In this case, we try to find a shift and extend, and fold them into the
7346   // addressing mode.
7347   //
7348   // E.g.
7349   //
7350   // off_reg = G_Z/S/ANYEXT ext_reg
7351   // val = G_CONSTANT LegalShiftVal
7352   // shift = G_SHL off_reg val
7353   // ptr = G_PTR_ADD base_reg shift
7354   // x = G_LOAD ptr
7355   //
7356   // In this case we can get a load like this:
7357   //
7358   // ldr x0, [base_reg, ext_reg, sxtw #LegalShiftVal]
7359   auto ExtendedShl = selectExtendedSHL(Root, LHS, OffsetInst->getOperand(0),
7360                                        SizeInBytes, /*WantsExt=*/true);
7361   if (ExtendedShl)
7362     return ExtendedShl;
7363 
7364   // There was no shift. We can try and fold a G_Z/S/ANYEXT in alone though.
7365   //
7366   // e.g.
7367   // ldr something, [base_reg, ext_reg, sxtw]
7368   if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI, true))
7369     return std::nullopt;
7370 
7371   // Check if this is an extend. We'll get an extend type if it is.
7372   AArch64_AM::ShiftExtendType Ext =
7373       getExtendTypeForInst(*OffsetInst, MRI, /*IsLoadStore=*/true);
7374   if (Ext == AArch64_AM::InvalidShiftExtend)
7375     return std::nullopt;
7376 
7377   // Need a 32-bit wide register.
7378   MachineIRBuilder MIB(*PtrAdd);
7379   Register ExtReg = moveScalarRegClass(OffsetInst->getOperand(1).getReg(),
7380                                        AArch64::GPR32RegClass, MIB);
7381   unsigned SignExtend = Ext == AArch64_AM::SXTW;
7382 
7383   // Base is LHS, offset is ExtReg.
7384   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(LHS.getReg()); },
7385            [=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); },
7386            [=](MachineInstrBuilder &MIB) {
7387              MIB.addImm(SignExtend);
7388              MIB.addImm(0);
7389            }}};
7390 }
7391 
7392 /// Select a "register plus unscaled signed 9-bit immediate" address.  This
7393 /// should only match when there is an offset that is not valid for a scaled
7394 /// immediate addressing mode.  The "Size" argument is the size in bytes of the
7395 /// memory reference, which is needed here to know what is valid for a scaled
7396 /// immediate.
7397 InstructionSelector::ComplexRendererFns
7398 AArch64InstructionSelector::selectAddrModeUnscaled(MachineOperand &Root,
7399                                                    unsigned Size) const {
7400   MachineRegisterInfo &MRI =
7401       Root.getParent()->getParent()->getParent()->getRegInfo();
7402 
7403   if (!Root.isReg())
7404     return std::nullopt;
7405 
7406   if (!isBaseWithConstantOffset(Root, MRI))
7407     return std::nullopt;
7408 
7409   MachineInstr *RootDef = MRI.getVRegDef(Root.getReg());
7410 
7411   MachineOperand &OffImm = RootDef->getOperand(2);
7412   if (!OffImm.isReg())
7413     return std::nullopt;
7414   MachineInstr *RHS = MRI.getVRegDef(OffImm.getReg());
7415   if (RHS->getOpcode() != TargetOpcode::G_CONSTANT)
7416     return std::nullopt;
7417   int64_t RHSC;
7418   MachineOperand &RHSOp1 = RHS->getOperand(1);
7419   if (!RHSOp1.isCImm() || RHSOp1.getCImm()->getBitWidth() > 64)
7420     return std::nullopt;
7421   RHSC = RHSOp1.getCImm()->getSExtValue();
7422 
7423   if (RHSC >= -256 && RHSC < 256) {
7424     MachineOperand &Base = RootDef->getOperand(1);
7425     return {{
7426         [=](MachineInstrBuilder &MIB) { MIB.add(Base); },
7427         [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC); },
7428     }};
7429   }
7430   return std::nullopt;
7431 }
7432 
7433 InstructionSelector::ComplexRendererFns
7434 AArch64InstructionSelector::tryFoldAddLowIntoImm(MachineInstr &RootDef,
7435                                                  unsigned Size,
7436                                                  MachineRegisterInfo &MRI) const {
7437   if (RootDef.getOpcode() != AArch64::G_ADD_LOW)
7438     return std::nullopt;
7439   MachineInstr &Adrp = *MRI.getVRegDef(RootDef.getOperand(1).getReg());
7440   if (Adrp.getOpcode() != AArch64::ADRP)
7441     return std::nullopt;
7442 
7443   // TODO: add heuristics like isWorthFoldingADDlow() from SelectionDAG.
7444   auto Offset = Adrp.getOperand(1).getOffset();
7445   if (Offset % Size != 0)
7446     return std::nullopt;
7447 
7448   auto GV = Adrp.getOperand(1).getGlobal();
7449   if (GV->isThreadLocal())
7450     return std::nullopt;
7451 
7452   auto &MF = *RootDef.getParent()->getParent();
7453   if (GV->getPointerAlignment(MF.getDataLayout()) < Size)
7454     return std::nullopt;
7455 
7456   unsigned OpFlags = STI.ClassifyGlobalReference(GV, MF.getTarget());
7457   MachineIRBuilder MIRBuilder(RootDef);
7458   Register AdrpReg = Adrp.getOperand(0).getReg();
7459   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(AdrpReg); },
7460            [=](MachineInstrBuilder &MIB) {
7461              MIB.addGlobalAddress(GV, Offset,
7462                                   OpFlags | AArch64II::MO_PAGEOFF |
7463                                       AArch64II::MO_NC);
7464            }}};
7465 }
7466 
7467 /// Select a "register plus scaled unsigned 12-bit immediate" address.  The
7468 /// "Size" argument is the size in bytes of the memory reference, which
7469 /// determines the scale.
7470 InstructionSelector::ComplexRendererFns
7471 AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root,
7472                                                   unsigned Size) const {
7473   MachineFunction &MF = *Root.getParent()->getParent()->getParent();
7474   MachineRegisterInfo &MRI = MF.getRegInfo();
7475 
7476   if (!Root.isReg())
7477     return std::nullopt;
7478 
7479   MachineInstr *RootDef = MRI.getVRegDef(Root.getReg());
7480   if (RootDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) {
7481     return {{
7482         [=](MachineInstrBuilder &MIB) { MIB.add(RootDef->getOperand(1)); },
7483         [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
7484     }};
7485   }
7486 
7487   CodeModel::Model CM = MF.getTarget().getCodeModel();
7488   // Check if we can fold in the ADD of small code model ADRP + ADD address.
7489   if (CM == CodeModel::Small) {
7490     auto OpFns = tryFoldAddLowIntoImm(*RootDef, Size, MRI);
7491     if (OpFns)
7492       return OpFns;
7493   }
7494 
7495   if (isBaseWithConstantOffset(Root, MRI)) {
7496     MachineOperand &LHS = RootDef->getOperand(1);
7497     MachineOperand &RHS = RootDef->getOperand(2);
7498     MachineInstr *LHSDef = MRI.getVRegDef(LHS.getReg());
7499     MachineInstr *RHSDef = MRI.getVRegDef(RHS.getReg());
7500 
7501     int64_t RHSC = (int64_t)RHSDef->getOperand(1).getCImm()->getZExtValue();
7502     unsigned Scale = Log2_32(Size);
7503     if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) {
7504       if (LHSDef->getOpcode() == TargetOpcode::G_FRAME_INDEX)
7505         return {{
7506             [=](MachineInstrBuilder &MIB) { MIB.add(LHSDef->getOperand(1)); },
7507             [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); },
7508         }};
7509 
7510       return {{
7511           [=](MachineInstrBuilder &MIB) { MIB.add(LHS); },
7512           [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); },
7513       }};
7514     }
7515   }
7516 
7517   // Before falling back to our general case, check if the unscaled
7518   // instructions can handle this. If so, that's preferable.
7519   if (selectAddrModeUnscaled(Root, Size))
7520     return std::nullopt;
7521 
7522   return {{
7523       [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
7524       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
7525   }};
7526 }
7527 
7528 /// Given a shift instruction, return the correct shift type for that
7529 /// instruction.
7530 static AArch64_AM::ShiftExtendType getShiftTypeForInst(MachineInstr &MI) {
7531   switch (MI.getOpcode()) {
7532   default:
7533     return AArch64_AM::InvalidShiftExtend;
7534   case TargetOpcode::G_SHL:
7535     return AArch64_AM::LSL;
7536   case TargetOpcode::G_LSHR:
7537     return AArch64_AM::LSR;
7538   case TargetOpcode::G_ASHR:
7539     return AArch64_AM::ASR;
7540   case TargetOpcode::G_ROTR:
7541     return AArch64_AM::ROR;
7542   }
7543 }
7544 
7545 /// Select a "shifted register" operand. If the value is not shifted, set the
7546 /// shift operand to a default value of "lsl 0".
7547 InstructionSelector::ComplexRendererFns
7548 AArch64InstructionSelector::selectShiftedRegister(MachineOperand &Root,
7549                                                   bool AllowROR) const {
7550   if (!Root.isReg())
7551     return std::nullopt;
7552   MachineRegisterInfo &MRI =
7553       Root.getParent()->getParent()->getParent()->getRegInfo();
7554 
7555   // Check if the operand is defined by an instruction which corresponds to
7556   // a ShiftExtendType. E.g. a G_SHL, G_LSHR, etc.
7557   MachineInstr *ShiftInst = MRI.getVRegDef(Root.getReg());
7558   AArch64_AM::ShiftExtendType ShType = getShiftTypeForInst(*ShiftInst);
7559   if (ShType == AArch64_AM::InvalidShiftExtend)
7560     return std::nullopt;
7561   if (ShType == AArch64_AM::ROR && !AllowROR)
7562     return std::nullopt;
7563   if (!isWorthFoldingIntoExtendedReg(*ShiftInst, MRI, false))
7564     return std::nullopt;
7565 
7566   // Need an immediate on the RHS.
7567   MachineOperand &ShiftRHS = ShiftInst->getOperand(2);
7568   auto Immed = getImmedFromMO(ShiftRHS);
7569   if (!Immed)
7570     return std::nullopt;
7571 
7572   // We have something that we can fold. Fold in the shift's LHS and RHS into
7573   // the instruction.
7574   MachineOperand &ShiftLHS = ShiftInst->getOperand(1);
7575   Register ShiftReg = ShiftLHS.getReg();
7576 
7577   unsigned NumBits = MRI.getType(ShiftReg).getSizeInBits();
7578   unsigned Val = *Immed & (NumBits - 1);
7579   unsigned ShiftVal = AArch64_AM::getShifterImm(ShType, Val);
7580 
7581   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ShiftReg); },
7582            [=](MachineInstrBuilder &MIB) { MIB.addImm(ShiftVal); }}};
7583 }
7584 
7585 AArch64_AM::ShiftExtendType AArch64InstructionSelector::getExtendTypeForInst(
7586     MachineInstr &MI, MachineRegisterInfo &MRI, bool IsLoadStore) const {
7587   unsigned Opc = MI.getOpcode();
7588 
7589   // Handle explicit extend instructions first.
7590   if (Opc == TargetOpcode::G_SEXT || Opc == TargetOpcode::G_SEXT_INREG) {
7591     unsigned Size;
7592     if (Opc == TargetOpcode::G_SEXT)
7593       Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
7594     else
7595       Size = MI.getOperand(2).getImm();
7596     assert(Size != 64 && "Extend from 64 bits?");
7597     switch (Size) {
7598     case 8:
7599       return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTB;
7600     case 16:
7601       return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTH;
7602     case 32:
7603       return AArch64_AM::SXTW;
7604     default:
7605       return AArch64_AM::InvalidShiftExtend;
7606     }
7607   }
7608 
7609   if (Opc == TargetOpcode::G_ZEXT || Opc == TargetOpcode::G_ANYEXT) {
7610     unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
7611     assert(Size != 64 && "Extend from 64 bits?");
7612     switch (Size) {
7613     case 8:
7614       return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTB;
7615     case 16:
7616       return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTH;
7617     case 32:
7618       return AArch64_AM::UXTW;
7619     default:
7620       return AArch64_AM::InvalidShiftExtend;
7621     }
7622   }
7623 
7624   // Don't have an explicit extend. Try to handle a G_AND with a constant mask
7625   // on the RHS.
7626   if (Opc != TargetOpcode::G_AND)
7627     return AArch64_AM::InvalidShiftExtend;
7628 
7629   std::optional<uint64_t> MaybeAndMask = getImmedFromMO(MI.getOperand(2));
7630   if (!MaybeAndMask)
7631     return AArch64_AM::InvalidShiftExtend;
7632   uint64_t AndMask = *MaybeAndMask;
7633   switch (AndMask) {
7634   default:
7635     return AArch64_AM::InvalidShiftExtend;
7636   case 0xFF:
7637     return !IsLoadStore ? AArch64_AM::UXTB : AArch64_AM::InvalidShiftExtend;
7638   case 0xFFFF:
7639     return !IsLoadStore ? AArch64_AM::UXTH : AArch64_AM::InvalidShiftExtend;
7640   case 0xFFFFFFFF:
7641     return AArch64_AM::UXTW;
7642   }
7643 }
7644 
7645 Register AArch64InstructionSelector::moveScalarRegClass(
7646     Register Reg, const TargetRegisterClass &RC, MachineIRBuilder &MIB) const {
7647   MachineRegisterInfo &MRI = *MIB.getMRI();
7648   auto Ty = MRI.getType(Reg);
7649   assert(!Ty.isVector() && "Expected scalars only!");
7650   if (Ty.getSizeInBits() == TRI.getRegSizeInBits(RC))
7651     return Reg;
7652 
7653   // Create a copy and immediately select it.
7654   // FIXME: We should have an emitCopy function?
7655   auto Copy = MIB.buildCopy({&RC}, {Reg});
7656   selectCopy(*Copy, TII, MRI, TRI, RBI);
7657   return Copy.getReg(0);
7658 }
7659 
7660 /// Select an "extended register" operand. This operand folds in an extend
7661 /// followed by an optional left shift.
7662 InstructionSelector::ComplexRendererFns
7663 AArch64InstructionSelector::selectArithExtendedRegister(
7664     MachineOperand &Root) const {
7665   if (!Root.isReg())
7666     return std::nullopt;
7667   MachineRegisterInfo &MRI =
7668       Root.getParent()->getParent()->getParent()->getRegInfo();
7669 
7670   uint64_t ShiftVal = 0;
7671   Register ExtReg;
7672   AArch64_AM::ShiftExtendType Ext;
7673   MachineInstr *RootDef = getDefIgnoringCopies(Root.getReg(), MRI);
7674   if (!RootDef)
7675     return std::nullopt;
7676 
7677   if (!isWorthFoldingIntoExtendedReg(*RootDef, MRI, false))
7678     return std::nullopt;
7679 
7680   // Check if we can fold a shift and an extend.
7681   if (RootDef->getOpcode() == TargetOpcode::G_SHL) {
7682     // Look for a constant on the RHS of the shift.
7683     MachineOperand &RHS = RootDef->getOperand(2);
7684     std::optional<uint64_t> MaybeShiftVal = getImmedFromMO(RHS);
7685     if (!MaybeShiftVal)
7686       return std::nullopt;
7687     ShiftVal = *MaybeShiftVal;
7688     if (ShiftVal > 4)
7689       return std::nullopt;
7690     // Look for a valid extend instruction on the LHS of the shift.
7691     MachineOperand &LHS = RootDef->getOperand(1);
7692     MachineInstr *ExtDef = getDefIgnoringCopies(LHS.getReg(), MRI);
7693     if (!ExtDef)
7694       return std::nullopt;
7695     Ext = getExtendTypeForInst(*ExtDef, MRI);
7696     if (Ext == AArch64_AM::InvalidShiftExtend)
7697       return std::nullopt;
7698     ExtReg = ExtDef->getOperand(1).getReg();
7699   } else {
7700     // Didn't get a shift. Try just folding an extend.
7701     Ext = getExtendTypeForInst(*RootDef, MRI);
7702     if (Ext == AArch64_AM::InvalidShiftExtend)
7703       return std::nullopt;
7704     ExtReg = RootDef->getOperand(1).getReg();
7705 
7706     // If we have a 32 bit instruction which zeroes out the high half of a
7707     // register, we get an implicit zero extend for free. Check if we have one.
7708     // FIXME: We actually emit the extend right now even though we don't have
7709     // to.
7710     if (Ext == AArch64_AM::UXTW && MRI.getType(ExtReg).getSizeInBits() == 32) {
7711       MachineInstr *ExtInst = MRI.getVRegDef(ExtReg);
7712       if (isDef32(*ExtInst))
7713         return std::nullopt;
7714     }
7715   }
7716 
7717   // We require a GPR32 here. Narrow the ExtReg if needed using a subregister
7718   // copy.
7719   MachineIRBuilder MIB(*RootDef);
7720   ExtReg = moveScalarRegClass(ExtReg, AArch64::GPR32RegClass, MIB);
7721 
7722   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); },
7723            [=](MachineInstrBuilder &MIB) {
7724              MIB.addImm(getArithExtendImm(Ext, ShiftVal));
7725            }}};
7726 }
7727 
7728 InstructionSelector::ComplexRendererFns
7729 AArch64InstructionSelector::selectExtractHigh(MachineOperand &Root) const {
7730   if (!Root.isReg())
7731     return std::nullopt;
7732   MachineRegisterInfo &MRI =
7733       Root.getParent()->getParent()->getParent()->getRegInfo();
7734 
7735   auto Extract = getDefSrcRegIgnoringCopies(Root.getReg(), MRI);
7736   while (Extract && Extract->MI->getOpcode() == TargetOpcode::G_BITCAST &&
7737          STI.isLittleEndian())
7738     Extract =
7739         getDefSrcRegIgnoringCopies(Extract->MI->getOperand(1).getReg(), MRI);
7740   if (!Extract)
7741     return std::nullopt;
7742 
7743   if (Extract->MI->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) {
7744     if (Extract->Reg == Extract->MI->getOperand(1).getReg()) {
7745       Register ExtReg = Extract->MI->getOperand(2).getReg();
7746       return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); }}};
7747     }
7748   }
7749   if (Extract->MI->getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT) {
7750     LLT SrcTy = MRI.getType(Extract->MI->getOperand(1).getReg());
7751     auto LaneIdx = getIConstantVRegValWithLookThrough(
7752         Extract->MI->getOperand(2).getReg(), MRI);
7753     if (LaneIdx && SrcTy == LLT::fixed_vector(2, 64) &&
7754         LaneIdx->Value.getSExtValue() == 1) {
7755       Register ExtReg = Extract->MI->getOperand(1).getReg();
7756       return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); }}};
7757     }
7758   }
7759 
7760   return std::nullopt;
7761 }
7762 
7763 void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB,
7764                                                 const MachineInstr &MI,
7765                                                 int OpIdx) const {
7766   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
7767   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7768          "Expected G_CONSTANT");
7769   std::optional<int64_t> CstVal =
7770       getIConstantVRegSExtVal(MI.getOperand(0).getReg(), MRI);
7771   assert(CstVal && "Expected constant value");
7772   MIB.addImm(*CstVal);
7773 }
7774 
7775 void AArch64InstructionSelector::renderLogicalImm32(
7776   MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
7777   assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7778          "Expected G_CONSTANT");
7779   uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
7780   uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 32);
7781   MIB.addImm(Enc);
7782 }
7783 
7784 void AArch64InstructionSelector::renderLogicalImm64(
7785   MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
7786   assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7787          "Expected G_CONSTANT");
7788   uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
7789   uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 64);
7790   MIB.addImm(Enc);
7791 }
7792 
7793 void AArch64InstructionSelector::renderUbsanTrap(MachineInstrBuilder &MIB,
7794                                                  const MachineInstr &MI,
7795                                                  int OpIdx) const {
7796   assert(MI.getOpcode() == TargetOpcode::G_UBSANTRAP && OpIdx == 0 &&
7797          "Expected G_UBSANTRAP");
7798   MIB.addImm(MI.getOperand(0).getImm() | ('U' << 8));
7799 }
7800 
7801 void AArch64InstructionSelector::renderFPImm16(MachineInstrBuilder &MIB,
7802                                                const MachineInstr &MI,
7803                                                int OpIdx) const {
7804   assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
7805          "Expected G_FCONSTANT");
7806   MIB.addImm(
7807       AArch64_AM::getFP16Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
7808 }
7809 
7810 void AArch64InstructionSelector::renderFPImm32(MachineInstrBuilder &MIB,
7811                                                const MachineInstr &MI,
7812                                                int OpIdx) const {
7813   assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
7814          "Expected G_FCONSTANT");
7815   MIB.addImm(
7816       AArch64_AM::getFP32Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
7817 }
7818 
7819 void AArch64InstructionSelector::renderFPImm64(MachineInstrBuilder &MIB,
7820                                                const MachineInstr &MI,
7821                                                int OpIdx) const {
7822   assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
7823          "Expected G_FCONSTANT");
7824   MIB.addImm(
7825       AArch64_AM::getFP64Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
7826 }
7827 
7828 void AArch64InstructionSelector::renderFPImm32SIMDModImmType4(
7829     MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7830   assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
7831          "Expected G_FCONSTANT");
7832   MIB.addImm(AArch64_AM::encodeAdvSIMDModImmType4(MI.getOperand(1)
7833                                                       .getFPImm()
7834                                                       ->getValueAPF()
7835                                                       .bitcastToAPInt()
7836                                                       .getZExtValue()));
7837 }
7838 
7839 bool AArch64InstructionSelector::isLoadStoreOfNumBytes(
7840     const MachineInstr &MI, unsigned NumBytes) const {
7841   if (!MI.mayLoadOrStore())
7842     return false;
7843   assert(MI.hasOneMemOperand() &&
7844          "Expected load/store to have only one mem op!");
7845   return (*MI.memoperands_begin())->getSize() == NumBytes;
7846 }
7847 
7848 bool AArch64InstructionSelector::isDef32(const MachineInstr &MI) const {
7849   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
7850   if (MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() != 32)
7851     return false;
7852 
7853   // Only return true if we know the operation will zero-out the high half of
7854   // the 64-bit register. Truncates can be subregister copies, which don't
7855   // zero out the high bits. Copies and other copy-like instructions can be
7856   // fed by truncates, or could be lowered as subregister copies.
7857   switch (MI.getOpcode()) {
7858   default:
7859     return true;
7860   case TargetOpcode::COPY:
7861   case TargetOpcode::G_BITCAST:
7862   case TargetOpcode::G_TRUNC:
7863   case TargetOpcode::G_PHI:
7864     return false;
7865   }
7866 }
7867 
7868 
7869 // Perform fixups on the given PHI instruction's operands to force them all
7870 // to be the same as the destination regbank.
7871 static void fixupPHIOpBanks(MachineInstr &MI, MachineRegisterInfo &MRI,
7872                             const AArch64RegisterBankInfo &RBI) {
7873   assert(MI.getOpcode() == TargetOpcode::G_PHI && "Expected a G_PHI");
7874   Register DstReg = MI.getOperand(0).getReg();
7875   const RegisterBank *DstRB = MRI.getRegBankOrNull(DstReg);
7876   assert(DstRB && "Expected PHI dst to have regbank assigned");
7877   MachineIRBuilder MIB(MI);
7878 
7879   // Go through each operand and ensure it has the same regbank.
7880   for (MachineOperand &MO : llvm::drop_begin(MI.operands())) {
7881     if (!MO.isReg())
7882       continue;
7883     Register OpReg = MO.getReg();
7884     const RegisterBank *RB = MRI.getRegBankOrNull(OpReg);
7885     if (RB != DstRB) {
7886       // Insert a cross-bank copy.
7887       auto *OpDef = MRI.getVRegDef(OpReg);
7888       const LLT &Ty = MRI.getType(OpReg);
7889       MachineBasicBlock &OpDefBB = *OpDef->getParent();
7890 
7891       // Any instruction we insert must appear after all PHIs in the block
7892       // for the block to be valid MIR.
7893       MachineBasicBlock::iterator InsertPt = std::next(OpDef->getIterator());
7894       if (InsertPt != OpDefBB.end() && InsertPt->isPHI())
7895         InsertPt = OpDefBB.getFirstNonPHI();
7896       MIB.setInsertPt(*OpDef->getParent(), InsertPt);
7897       auto Copy = MIB.buildCopy(Ty, OpReg);
7898       MRI.setRegBank(Copy.getReg(0), *DstRB);
7899       MO.setReg(Copy.getReg(0));
7900     }
7901   }
7902 }
7903 
7904 void AArch64InstructionSelector::processPHIs(MachineFunction &MF) {
7905   // We're looking for PHIs, build a list so we don't invalidate iterators.
7906   MachineRegisterInfo &MRI = MF.getRegInfo();
7907   SmallVector<MachineInstr *, 32> Phis;
7908   for (auto &BB : MF) {
7909     for (auto &MI : BB) {
7910       if (MI.getOpcode() == TargetOpcode::G_PHI)
7911         Phis.emplace_back(&MI);
7912     }
7913   }
7914 
7915   for (auto *MI : Phis) {
7916     // We need to do some work here if the operand types are < 16 bit and they
7917     // are split across fpr/gpr banks. Since all types <32b on gpr
7918     // end up being assigned gpr32 regclasses, we can end up with PHIs here
7919     // which try to select between a gpr32 and an fpr16. Ideally RBS shouldn't
7920     // be selecting heterogenous regbanks for operands if possible, but we
7921     // still need to be able to deal with it here.
7922     //
7923     // To fix this, if we have a gpr-bank operand < 32b in size and at least
7924     // one other operand is on the fpr bank, then we add cross-bank copies
7925     // to homogenize the operand banks. For simplicity the bank that we choose
7926     // to settle on is whatever bank the def operand has. For example:
7927     //
7928     // %endbb:
7929     //   %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2:fpr(s16), %bb2
7930     //  =>
7931     // %bb2:
7932     //   ...
7933     //   %in2_copy:gpr(s16) = COPY %in2:fpr(s16)
7934     //   ...
7935     // %endbb:
7936     //   %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2_copy:gpr(s16), %bb2
7937     bool HasGPROp = false, HasFPROp = false;
7938     for (const MachineOperand &MO : llvm::drop_begin(MI->operands())) {
7939       if (!MO.isReg())
7940         continue;
7941       const LLT &Ty = MRI.getType(MO.getReg());
7942       if (!Ty.isValid() || !Ty.isScalar())
7943         break;
7944       if (Ty.getSizeInBits() >= 32)
7945         break;
7946       const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg());
7947       // If for some reason we don't have a regbank yet. Don't try anything.
7948       if (!RB)
7949         break;
7950 
7951       if (RB->getID() == AArch64::GPRRegBankID)
7952         HasGPROp = true;
7953       else
7954         HasFPROp = true;
7955     }
7956     // We have heterogenous regbanks, need to fixup.
7957     if (HasGPROp && HasFPROp)
7958       fixupPHIOpBanks(*MI, MRI, RBI);
7959   }
7960 }
7961 
7962 namespace llvm {
7963 InstructionSelector *
7964 createAArch64InstructionSelector(const AArch64TargetMachine &TM,
7965                                  const AArch64Subtarget &Subtarget,
7966                                  const AArch64RegisterBankInfo &RBI) {
7967   return new AArch64InstructionSelector(TM, Subtarget, RBI);
7968 }
7969 }
7970