xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp (revision 2f9966ff63d65bd474478888c9088eeae3f9c669)
1 //===- AArch64InstructionSelector.cpp ----------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the InstructionSelector class for
10 /// AArch64.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AArch64GlobalISelUtils.h"
15 #include "AArch64InstrInfo.h"
16 #include "AArch64MachineFunctionInfo.h"
17 #include "AArch64RegisterBankInfo.h"
18 #include "AArch64RegisterInfo.h"
19 #include "AArch64Subtarget.h"
20 #include "AArch64TargetMachine.h"
21 #include "MCTargetDesc/AArch64AddressingModes.h"
22 #include "MCTargetDesc/AArch64MCTargetDesc.h"
23 #include "llvm/BinaryFormat/Dwarf.h"
24 #include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
25 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
26 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
27 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
28 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
29 #include "llvm/CodeGen/GlobalISel/Utils.h"
30 #include "llvm/CodeGen/MachineBasicBlock.h"
31 #include "llvm/CodeGen/MachineConstantPool.h"
32 #include "llvm/CodeGen/MachineFrameInfo.h"
33 #include "llvm/CodeGen/MachineFunction.h"
34 #include "llvm/CodeGen/MachineInstr.h"
35 #include "llvm/CodeGen/MachineInstrBuilder.h"
36 #include "llvm/CodeGen/MachineMemOperand.h"
37 #include "llvm/CodeGen/MachineOperand.h"
38 #include "llvm/CodeGen/MachineRegisterInfo.h"
39 #include "llvm/CodeGen/TargetOpcodes.h"
40 #include "llvm/CodeGen/TargetRegisterInfo.h"
41 #include "llvm/IR/Constants.h"
42 #include "llvm/IR/DerivedTypes.h"
43 #include "llvm/IR/Instructions.h"
44 #include "llvm/IR/IntrinsicsAArch64.h"
45 #include "llvm/IR/PatternMatch.h"
46 #include "llvm/IR/Type.h"
47 #include "llvm/Pass.h"
48 #include "llvm/Support/Debug.h"
49 #include "llvm/Support/raw_ostream.h"
50 #include <optional>
51 
52 #define DEBUG_TYPE "aarch64-isel"
53 
54 using namespace llvm;
55 using namespace MIPatternMatch;
56 using namespace AArch64GISelUtils;
57 
// Forward declarations: these types are only used through pointers in the
// setupMF() signature below, so their full definitions are not needed here.
58 namespace llvm {
59 class BlockFrequencyInfo;
60 class ProfileSummaryInfo;
61 } // end namespace llvm
62 
63 namespace {
64 
65 #define GET_GLOBALISEL_PREDICATE_BITSET
66 #include "AArch64GenGlobalISel.inc"
67 #undef GET_GLOBALISEL_PREDICATE_BITSET
68 
69 
/// AArch64 implementation of the GlobalISel InstructionSelector.
///
/// Selection is split between the tblgen-erated matcher (selectImpl()) and the
/// hand-written C++ select*/emit*/tryOpt* helpers and complex-pattern
/// renderers declared below.
70 class AArch64InstructionSelector : public InstructionSelector {
71 public:
72   AArch64InstructionSelector(const AArch64TargetMachine &TM,
73                              const AArch64Subtarget &STI,
74                              const AArch64RegisterBankInfo &RBI);
75 
76   bool select(MachineInstr &I) override;
77   static const char *getName() { return DEBUG_TYPE; }
78 
     /// Per-function setup: forwards to the base class, points the shared
     /// builder (MIB) at \p MF, refreshes the cached per-function state and
     /// runs the G_PHI preprocessing step.
79   void setupMF(MachineFunction &MF, GISelKnownBits *KB,
80                CodeGenCoverage *CoverageInfo, ProfileSummaryInfo *PSI,
81                BlockFrequencyInfo *BFI) override {
82     InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
83     MIB.setMF(MF);
84 
85     // hasFnAttribute() is expensive to call on every BRCOND selection, so
86     // cache it here for each run of the selector.
87     ProduceNonFlagSettingCondBr =
88         !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
       // Reset the cached return-address register for the new function.
89     MFReturnAddr = Register();
90 
91     processPHIs(MF);
92   }
93 
94 private:
95   /// tblgen-erated 'select' implementation, used as the initial selector for
96   /// the patterns that don't require complex C++.
97   bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;
98 
99   // A lowering phase that runs before any selection attempts.
100   // Returns true if the instruction was modified.
101   bool preISelLower(MachineInstr &I);
102 
103   // An early selection function that runs before the selectImpl() call.
104   bool earlySelect(MachineInstr &I);
105 
106   /// Save state that is shared between select calls, call select on \p I and
107   /// then restore the saved state. This can be used to recursively call select
108   /// within a select call.
109   bool selectAndRestoreState(MachineInstr &I);
110 
111   // Do some preprocessing of G_PHIs before we begin selection.
112   void processPHIs(MachineFunction &MF);
113 
114   bool earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI);
115 
116   /// Eliminate same-sized cross-bank copies into stores before selectImpl().
117   bool contractCrossBankCopyIntoStore(MachineInstr &I,
118                                       MachineRegisterInfo &MRI);
119 
120   bool convertPtrAddToAdd(MachineInstr &I, MachineRegisterInfo &MRI);
121 
122   bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF,
123                           MachineRegisterInfo &MRI) const;
124   bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF,
125                            MachineRegisterInfo &MRI) const;
126 
127   ///@{
128   /// Helper functions for selectCompareBranch.
129   bool selectCompareBranchFedByFCmp(MachineInstr &I, MachineInstr &FCmp,
130                                     MachineIRBuilder &MIB) const;
131   bool selectCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp,
132                                     MachineIRBuilder &MIB) const;
133   bool tryOptCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp,
134                                     MachineIRBuilder &MIB) const;
135   bool tryOptAndIntoCompareBranch(MachineInstr &AndInst, bool Invert,
136                                   MachineBasicBlock *DstMBB,
137                                   MachineIRBuilder &MIB) const;
138   ///@}
139 
140   bool selectCompareBranch(MachineInstr &I, MachineFunction &MF,
141                            MachineRegisterInfo &MRI);
142 
143   bool selectVectorAshrLshr(MachineInstr &I, MachineRegisterInfo &MRI);
144   bool selectVectorSHL(MachineInstr &I, MachineRegisterInfo &MRI);
145 
146   // Helper to generate an equivalent of scalar_to_vector into a new register,
147   // returned via 'Dst'.
      // NOTE(review): this declaration has no 'Dst' parameter; presumably the
      // new register is the def of the returned instruction - confirm against
      // the definition.
148   MachineInstr *emitScalarToVector(unsigned EltSize,
149                                    const TargetRegisterClass *DstRC,
150                                    Register Scalar,
151                                    MachineIRBuilder &MIRBuilder) const;
152   /// Helper to narrow vector that was widened by emitScalarToVector.
153   /// Copy lowest part of 128-bit or 64-bit vector to 64-bit or 32-bit
154   /// vector, correspondingly.
155   MachineInstr *emitNarrowVector(Register DstReg, Register SrcReg,
156                                  MachineIRBuilder &MIRBuilder,
157                                  MachineRegisterInfo &MRI) const;
158 
159   /// Emit a lane insert into \p DstReg, or a new vector register if
160   /// std::nullopt is provided.
161   ///
162   /// The lane inserted into is defined by \p LaneIdx. The vector source
163   /// register is given by \p SrcReg. The register containing the element is
164   /// given by \p EltReg.
165   MachineInstr *emitLaneInsert(std::optional<Register> DstReg, Register SrcReg,
166                                Register EltReg, unsigned LaneIdx,
167                                const RegisterBank &RB,
168                                MachineIRBuilder &MIRBuilder) const;
169 
170   /// Emit a sequence of instructions representing a constant \p CV for a
171   /// vector register \p Dst. (E.g. a MOV, or a load from a constant pool.)
172   ///
173   /// \returns the last instruction in the sequence on success, and nullptr
174   /// otherwise.
175   MachineInstr *emitConstantVector(Register Dst, Constant *CV,
176                                    MachineIRBuilder &MIRBuilder,
177                                    MachineRegisterInfo &MRI);
178 
179   MachineInstr *tryAdvSIMDModImm8(Register Dst, unsigned DstSize, APInt Bits,
180                                   MachineIRBuilder &MIRBuilder);
181 
182   MachineInstr *tryAdvSIMDModImm16(Register Dst, unsigned DstSize, APInt Bits,
183                                    MachineIRBuilder &MIRBuilder, bool Inv);
184 
185   MachineInstr *tryAdvSIMDModImm32(Register Dst, unsigned DstSize, APInt Bits,
186                                    MachineIRBuilder &MIRBuilder, bool Inv);
187   MachineInstr *tryAdvSIMDModImm64(Register Dst, unsigned DstSize, APInt Bits,
188                                    MachineIRBuilder &MIRBuilder);
189   MachineInstr *tryAdvSIMDModImm321s(Register Dst, unsigned DstSize, APInt Bits,
190                                      MachineIRBuilder &MIRBuilder, bool Inv);
191   MachineInstr *tryAdvSIMDModImmFP(Register Dst, unsigned DstSize, APInt Bits,
192                                    MachineIRBuilder &MIRBuilder);
193 
194   bool selectInsertElt(MachineInstr &I, MachineRegisterInfo &MRI);
195   bool tryOptConstantBuildVec(MachineInstr &MI, LLT DstTy,
196                               MachineRegisterInfo &MRI);
197   /// \returns true if a G_BUILD_VECTOR instruction \p MI can be selected as a
198   /// SUBREG_TO_REG.
199   bool tryOptBuildVecToSubregToReg(MachineInstr &MI, MachineRegisterInfo &MRI);
200   bool selectBuildVector(MachineInstr &I, MachineRegisterInfo &MRI);
201   bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI);
202   bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI);
203 
204   bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI);
205   bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI);
206   bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI);
207   bool selectSplitVectorUnmerge(MachineInstr &I, MachineRegisterInfo &MRI);
208 
209   /// Helper function to select vector load intrinsics like
210   /// @llvm.aarch64.neon.ld2.*, @llvm.aarch64.neon.ld4.*, etc.
211   /// \p Opc is the opcode that the selected instruction should use.
212   /// \p NumVecs is the number of vector destinations for the instruction.
213   /// \p I is the original G_INTRINSIC_W_SIDE_EFFECTS instruction.
214   bool selectVectorLoadIntrinsic(unsigned Opc, unsigned NumVecs,
215                                  MachineInstr &I);
216   bool selectVectorLoadLaneIntrinsic(unsigned Opc, unsigned NumVecs,
217                                      MachineInstr &I);
218   void selectVectorStoreIntrinsic(MachineInstr &I, unsigned NumVecs,
219                                   unsigned Opc);
220   bool selectVectorStoreLaneIntrinsic(MachineInstr &I, unsigned NumVecs,
221                                       unsigned Opc);
222   bool selectIntrinsicWithSideEffects(MachineInstr &I,
223                                       MachineRegisterInfo &MRI);
224   bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI);
225   bool selectVectorICmp(MachineInstr &I, MachineRegisterInfo &MRI);
226   bool selectJumpTable(MachineInstr &I, MachineRegisterInfo &MRI);
227   bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI);
228   bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI);
229   bool selectReduction(MachineInstr &I, MachineRegisterInfo &MRI);
230   bool selectMOPS(MachineInstr &I, MachineRegisterInfo &MRI);
231   bool selectUSMovFromExtend(MachineInstr &I, MachineRegisterInfo &MRI);
232 
233   bool selectIndexedExtLoad(MachineInstr &I, MachineRegisterInfo &MRI);
234   bool selectIndexedLoad(MachineInstr &I, MachineRegisterInfo &MRI);
235   bool selectIndexedStore(GIndexedStore &I, MachineRegisterInfo &MRI);
236 
237   unsigned emitConstantPoolEntry(const Constant *CPVal,
238                                  MachineFunction &MF) const;
239   MachineInstr *emitLoadFromConstantPool(const Constant *CPVal,
240                                          MachineIRBuilder &MIRBuilder) const;
241 
242   // Emit a vector concat operation.
243   MachineInstr *emitVectorConcat(std::optional<Register> Dst, Register Op1,
244                                  Register Op2,
245                                  MachineIRBuilder &MIRBuilder) const;
246 
247   // Emit an integer compare between LHS and RHS, which checks for Predicate.
248   MachineInstr *emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
249                                    MachineOperand &Predicate,
250                                    MachineIRBuilder &MIRBuilder) const;
251 
252   /// Emit a floating point comparison between \p LHS and \p RHS.
253   /// \p Pred if given is the intended predicate to use.
254   MachineInstr *
255   emitFPCompare(Register LHS, Register RHS, MachineIRBuilder &MIRBuilder,
256                 std::optional<CmpInst::Predicate> = std::nullopt) const;
257 
258   MachineInstr *
259   emitInstr(unsigned Opcode, std::initializer_list<llvm::DstOp> DstOps,
260             std::initializer_list<llvm::SrcOp> SrcOps,
261             MachineIRBuilder &MIRBuilder,
262             const ComplexRendererFns &RenderFns = std::nullopt) const;
263   /// Helper function to emit an add or sub instruction.
264   ///
265   /// \p AddrModeAndSizeToOpcode must contain each of the opcode variants
266   /// listed below, in a specific order.
267   ///
268   /// Below is an example of the expected input to \p AddrModeAndSizeToOpcode.
269   ///
270   /// \code
271   ///   const std::array<std::array<unsigned, 2>, 5> Table {
272   ///    {{AArch64::ADDXri, AArch64::ADDWri},
273   ///     {AArch64::ADDXrs, AArch64::ADDWrs},
274   ///     {AArch64::ADDXrr, AArch64::ADDWrr},
275   ///     {AArch64::SUBXri, AArch64::SUBWri},
276   ///     {AArch64::ADDXrx, AArch64::ADDWrx}}};
277   /// \endcode
278   ///
279   /// Each row in the table corresponds to a different addressing mode. Each
280   /// column corresponds to a different register size.
281   ///
282   /// \attention Rows must be structured as follows:
283   ///   - Row 0: The ri opcode variants
284   ///   - Row 1: The rs opcode variants
285   ///   - Row 2: The rr opcode variants
286   ///   - Row 3: The ri opcode variants for negative immediates
287   ///   - Row 4: The rx opcode variants
288   ///
289   /// \attention Columns must be structured as follows:
290   ///   - Column 0: The 64-bit opcode variants
291   ///   - Column 1: The 32-bit opcode variants
292   ///
293   /// \p Dst is the destination register of the binop to emit.
294   /// \p LHS is the left-hand operand of the binop to emit.
295   /// \p RHS is the right-hand operand of the binop to emit.
296   MachineInstr *emitAddSub(
297       const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode,
298       Register Dst, MachineOperand &LHS, MachineOperand &RHS,
299       MachineIRBuilder &MIRBuilder) const;
300   MachineInstr *emitADD(Register DefReg, MachineOperand &LHS,
301                         MachineOperand &RHS,
302                         MachineIRBuilder &MIRBuilder) const;
303   MachineInstr *emitADDS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
304                          MachineIRBuilder &MIRBuilder) const;
305   MachineInstr *emitSUBS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
306                          MachineIRBuilder &MIRBuilder) const;
307   MachineInstr *emitADCS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
308                          MachineIRBuilder &MIRBuilder) const;
309   MachineInstr *emitSBCS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
310                          MachineIRBuilder &MIRBuilder) const;
311   MachineInstr *emitCMN(MachineOperand &LHS, MachineOperand &RHS,
312                         MachineIRBuilder &MIRBuilder) const;
313   MachineInstr *emitTST(MachineOperand &LHS, MachineOperand &RHS,
314                         MachineIRBuilder &MIRBuilder) const;
315   MachineInstr *emitSelect(Register Dst, Register LHS, Register RHS,
316                            AArch64CC::CondCode CC,
317                            MachineIRBuilder &MIRBuilder) const;
318   MachineInstr *emitExtractVectorElt(std::optional<Register> DstReg,
319                                      const RegisterBank &DstRB, LLT ScalarTy,
320                                      Register VecReg, unsigned LaneIdx,
321                                      MachineIRBuilder &MIRBuilder) const;
322   MachineInstr *emitCSINC(Register Dst, Register Src1, Register Src2,
323                           AArch64CC::CondCode Pred,
324                           MachineIRBuilder &MIRBuilder) const;
325   /// Emit a CSet for a FP compare.
326   ///
327   /// \p Dst is expected to be a 32-bit scalar register.
328   MachineInstr *emitCSetForFCmp(Register Dst, CmpInst::Predicate Pred,
329                                 MachineIRBuilder &MIRBuilder) const;
330 
331   /// Emit an instruction that sets NZCV to the carry-in expected by \p I.
332   /// Might elide the instruction if the previous instruction already sets NZCV
333   /// correctly.
334   MachineInstr *emitCarryIn(MachineInstr &I, Register CarryReg);
335 
336   /// Emit the overflow op for \p Opcode.
337   ///
338   /// \p Opcode is expected to be an overflow op's opcode, e.g. G_UADDO,
339   /// G_USUBO, etc.
340   std::pair<MachineInstr *, AArch64CC::CondCode>
341   emitOverflowOp(unsigned Opcode, Register Dst, MachineOperand &LHS,
342                  MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const;
343 
344   bool selectOverflowOp(MachineInstr &I, MachineRegisterInfo &MRI);
345 
346   /// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
347   /// In some cases this is even possible with OR operations in the expression.
348   MachineInstr *emitConjunction(Register Val, AArch64CC::CondCode &OutCC,
349                                 MachineIRBuilder &MIB) const;
350   MachineInstr *emitConditionalComparison(Register LHS, Register RHS,
351                                           CmpInst::Predicate CC,
352                                           AArch64CC::CondCode Predicate,
353                                           AArch64CC::CondCode OutCC,
354                                           MachineIRBuilder &MIB) const;
355   MachineInstr *emitConjunctionRec(Register Val, AArch64CC::CondCode &OutCC,
356                                    bool Negate, Register CCOp,
357                                    AArch64CC::CondCode Predicate,
358                                    MachineIRBuilder &MIB) const;
359 
360   /// Emit a TB(N)Z instruction which tests \p Bit in \p TestReg.
361   /// \p IsNegative is true if the test should be "not zero".
362   /// This will also optimize the test bit instruction when possible.
363   MachineInstr *emitTestBit(Register TestReg, uint64_t Bit, bool IsNegative,
364                             MachineBasicBlock *DstMBB,
365                             MachineIRBuilder &MIB) const;
366 
367   /// Emit a CB(N)Z instruction which branches to \p DestMBB.
368   MachineInstr *emitCBZ(Register CompareReg, bool IsNegative,
369                         MachineBasicBlock *DestMBB,
370                         MachineIRBuilder &MIB) const;
371 
372   // Equivalent to the i32shift_a and friends from AArch64InstrInfo.td.
373   // We use these manually instead of using the importer since it doesn't
374   // support SDNodeXForm.
375   ComplexRendererFns selectShiftA_32(const MachineOperand &Root) const;
376   ComplexRendererFns selectShiftB_32(const MachineOperand &Root) const;
377   ComplexRendererFns selectShiftA_64(const MachineOperand &Root) const;
378   ComplexRendererFns selectShiftB_64(const MachineOperand &Root) const;
379 
380   ComplexRendererFns select12BitValueWithLeftShift(uint64_t Immed) const;
381   ComplexRendererFns selectArithImmed(MachineOperand &Root) const;
382   ComplexRendererFns selectNegArithImmed(MachineOperand &Root) const;
383 
384   ComplexRendererFns selectAddrModeUnscaled(MachineOperand &Root,
385                                             unsigned Size) const;
386 
      // Fixed-size wrappers around selectAddrModeUnscaled(); the numeric suffix
      // is the access width in bits and the forwarded Size is that width in
      // bytes.
387   ComplexRendererFns selectAddrModeUnscaled8(MachineOperand &Root) const {
388     return selectAddrModeUnscaled(Root, 1);
389   }
390   ComplexRendererFns selectAddrModeUnscaled16(MachineOperand &Root) const {
391     return selectAddrModeUnscaled(Root, 2);
392   }
393   ComplexRendererFns selectAddrModeUnscaled32(MachineOperand &Root) const {
394     return selectAddrModeUnscaled(Root, 4);
395   }
396   ComplexRendererFns selectAddrModeUnscaled64(MachineOperand &Root) const {
397     return selectAddrModeUnscaled(Root, 8);
398   }
399   ComplexRendererFns selectAddrModeUnscaled128(MachineOperand &Root) const {
400     return selectAddrModeUnscaled(Root, 16);
401   }
402 
403   /// Helper to try to fold in a GISEL_ADD_LOW into an immediate, to be used
404   /// from complex pattern matchers like selectAddrModeIndexed().
405   ComplexRendererFns tryFoldAddLowIntoImm(MachineInstr &RootDef, unsigned Size,
406                                           MachineRegisterInfo &MRI) const;
407 
408   ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root,
409                                            unsigned Size) const;
      // Width is given in bits and converted to bytes for the non-template
      // overload.
410   template <int Width>
411   ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root) const {
412     return selectAddrModeIndexed(Root, Width / 8);
413   }
414 
415   bool isWorthFoldingIntoExtendedReg(MachineInstr &MI,
416                                      const MachineRegisterInfo &MRI) const;
417   ComplexRendererFns
418   selectAddrModeShiftedExtendXReg(MachineOperand &Root,
419                                   unsigned SizeInBytes) const;
420 
421   /// Returns a \p ComplexRendererFns which contains a base, offset, and whether
422   /// or not a shift + extend should be folded into an addressing mode. Returns
423   /// None when this is not profitable or possible.
424   ComplexRendererFns
425   selectExtendedSHL(MachineOperand &Root, MachineOperand &Base,
426                     MachineOperand &Offset, unsigned SizeInBytes,
427                     bool WantsExt) const;
428   ComplexRendererFns selectAddrModeRegisterOffset(MachineOperand &Root) const;
429   ComplexRendererFns selectAddrModeXRO(MachineOperand &Root,
430                                        unsigned SizeInBytes) const;
431   template <int Width>
432   ComplexRendererFns selectAddrModeXRO(MachineOperand &Root) const {
433     return selectAddrModeXRO(Root, Width / 8);
434   }
435 
436   ComplexRendererFns selectAddrModeWRO(MachineOperand &Root,
437                                        unsigned SizeInBytes) const;
438   template <int Width>
439   ComplexRendererFns selectAddrModeWRO(MachineOperand &Root) const {
440     return selectAddrModeWRO(Root, Width / 8);
441   }
442 
443   ComplexRendererFns selectShiftedRegister(MachineOperand &Root,
444                                            bool AllowROR = false) const;
445 
      // Arithmetic variant: uses the default AllowROR = false.
446   ComplexRendererFns selectArithShiftedRegister(MachineOperand &Root) const {
447     return selectShiftedRegister(Root);
448   }
449 
      // Logical variant: passes AllowROR = true.
450   ComplexRendererFns selectLogicalShiftedRegister(MachineOperand &Root) const {
451     return selectShiftedRegister(Root, true);
452   }
453 
454   /// Given an extend instruction, determine the correct shift-extend type for
455   /// that instruction.
456   ///
457   /// If the instruction is going to be used in a load or store, pass
458   /// \p IsLoadStore = true.
459   AArch64_AM::ShiftExtendType
460   getExtendTypeForInst(MachineInstr &MI, MachineRegisterInfo &MRI,
461                        bool IsLoadStore = false) const;
462 
463   /// Move \p Reg to \p RC if \p Reg is not already on \p RC.
464   ///
465   /// \returns Either \p Reg if no change was necessary, or the new register
466   /// created by moving \p Reg.
467   ///
468   /// Note: This uses emitCopy right now.
469   Register moveScalarRegClass(Register Reg, const TargetRegisterClass &RC,
470                               MachineIRBuilder &MIB) const;
471 
472   ComplexRendererFns selectArithExtendedRegister(MachineOperand &Root) const;
473 
474   ComplexRendererFns selectExtractHigh(MachineOperand &Root) const;
475 
476   void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
477                       int OpIdx = -1) const;
478   void renderLogicalImm32(MachineInstrBuilder &MIB, const MachineInstr &I,
479                           int OpIdx = -1) const;
480   void renderLogicalImm64(MachineInstrBuilder &MIB, const MachineInstr &I,
481                           int OpIdx = -1) const;
482   void renderFPImm16(MachineInstrBuilder &MIB, const MachineInstr &MI,
483                      int OpIdx = -1) const;
484   void renderFPImm32(MachineInstrBuilder &MIB, const MachineInstr &MI,
485                      int OpIdx = -1) const;
486   void renderFPImm64(MachineInstrBuilder &MIB, const MachineInstr &MI,
487                      int OpIdx = -1) const;
488   void renderFPImm32SIMDModImmType4(MachineInstrBuilder &MIB,
489                                     const MachineInstr &MI,
490                                     int OpIdx = -1) const;
491 
492   // Materialize a GlobalValue or BlockAddress using a movz+movk sequence.
493   void materializeLargeCMVal(MachineInstr &I, const Value *V, unsigned OpFlags);
494 
495   // Optimization methods.
496   bool tryOptSelect(GSelect &Sel);
497   bool tryOptSelectConjunction(GSelect &Sel, MachineInstr &CondMI);
498   MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
499                                       MachineOperand &Predicate,
500                                       MachineIRBuilder &MIRBuilder) const;
501 
502   /// Return true if \p MI is a load or store of \p NumBytes bytes.
503   bool isLoadStoreOfNumBytes(const MachineInstr &MI, unsigned NumBytes) const;
504 
505   /// Returns true if \p MI is guaranteed to have the high-half of a 64-bit
506   /// register zeroed out. In other words, the result of MI has been explicitly
507   /// zero extended.
508   bool isDef32(const MachineInstr &MI) const;
509 
510   const AArch64TargetMachine &TM;
511   const AArch64Subtarget &STI;
512   const AArch64InstrInfo &TII;
513   const AArch64RegisterInfo &TRI;
514   const AArch64RegisterBankInfo &RBI;
515 
      // Cached in setupMF(): true unless the current function carries the
      // SpeculativeLoadHardening attribute.
516   bool ProduceNonFlagSettingCondBr = false;
517 
518   // Some cached values used during selection.
519   // We use LR as a live-in register, and we keep track of it here as it can be
520   // clobbered by calls.
521   Register MFReturnAddr;
522 
      // Builder shared by the selection helpers; re-pointed at the current
      // function in setupMF().
523   MachineIRBuilder MIB;
524 
525 #define GET_GLOBALISEL_PREDICATES_DECL
526 #include "AArch64GenGlobalISel.inc"
527 #undef GET_GLOBALISEL_PREDICATES_DECL
528 
529 // We declare the temporaries used by selectImpl() in the class to minimize the
530 // cost of constructing placeholder values.
531 #define GET_GLOBALISEL_TEMPORARIES_DECL
532 #include "AArch64GenGlobalISel.inc"
533 #undef GET_GLOBALISEL_TEMPORARIES_DECL
534 };
535 
536 } // end anonymous namespace
537 
538 #define GET_GLOBALISEL_IMPL
539 #include "AArch64GenGlobalISel.inc"
540 #undef GET_GLOBALISEL_IMPL
541 
/// Binds the selector to its target machine, subtarget and register-bank info,
/// and caches the subtarget's instruction info and register info. The trailing
/// member initializers (tblgen-erated predicates and temporaries) are pulled in
/// from AArch64GenGlobalISel.inc via the GET_GLOBALISEL_*_INIT macros.
542 AArch64InstructionSelector::AArch64InstructionSelector(
543     const AArch64TargetMachine &TM, const AArch64Subtarget &STI,
544     const AArch64RegisterBankInfo &RBI)
545     : TM(TM), STI(STI), TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()),
546       RBI(RBI),
547 #define GET_GLOBALISEL_PREDICATES_INIT
548 #include "AArch64GenGlobalISel.inc"
549 #undef GET_GLOBALISEL_PREDICATES_INIT
550 #define GET_GLOBALISEL_TEMPORARIES_INIT
551 #include "AArch64GenGlobalISel.inc"
552 #undef GET_GLOBALISEL_TEMPORARIES_INIT
553 {
554 }
555 
556 // FIXME: This should be target-independent, inferred from the types declared
557 // for each class in the bank.
558 //
559 /// Given a register bank, and a type, return the smallest register class that
560 /// can represent that combination.
561 static const TargetRegisterClass *
562 getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB,
563                          bool GetAllRegSet = false) {
564   if (RB.getID() == AArch64::GPRRegBankID) {
565     if (Ty.getSizeInBits() <= 32)
566       return GetAllRegSet ? &AArch64::GPR32allRegClass
567                           : &AArch64::GPR32RegClass;
568     if (Ty.getSizeInBits() == 64)
569       return GetAllRegSet ? &AArch64::GPR64allRegClass
570                           : &AArch64::GPR64RegClass;
571     if (Ty.getSizeInBits() == 128)
572       return &AArch64::XSeqPairsClassRegClass;
573     return nullptr;
574   }
575 
576   if (RB.getID() == AArch64::FPRRegBankID) {
577     switch (Ty.getSizeInBits()) {
578     case 8:
579       return &AArch64::FPR8RegClass;
580     case 16:
581       return &AArch64::FPR16RegClass;
582     case 32:
583       return &AArch64::FPR32RegClass;
584     case 64:
585       return &AArch64::FPR64RegClass;
586     case 128:
587       return &AArch64::FPR128RegClass;
588     }
589     return nullptr;
590   }
591 
592   return nullptr;
593 }
594 
595 /// Given a register bank, and size in bits, return the smallest register class
596 /// that can represent that combination.
597 static const TargetRegisterClass *
598 getMinClassForRegBank(const RegisterBank &RB, unsigned SizeInBits,
599                       bool GetAllRegSet = false) {
600   unsigned RegBankID = RB.getID();
601 
602   if (RegBankID == AArch64::GPRRegBankID) {
603     if (SizeInBits <= 32)
604       return GetAllRegSet ? &AArch64::GPR32allRegClass
605                           : &AArch64::GPR32RegClass;
606     if (SizeInBits == 64)
607       return GetAllRegSet ? &AArch64::GPR64allRegClass
608                           : &AArch64::GPR64RegClass;
609     if (SizeInBits == 128)
610       return &AArch64::XSeqPairsClassRegClass;
611   }
612 
613   if (RegBankID == AArch64::FPRRegBankID) {
614     switch (SizeInBits) {
615     default:
616       return nullptr;
617     case 8:
618       return &AArch64::FPR8RegClass;
619     case 16:
620       return &AArch64::FPR16RegClass;
621     case 32:
622       return &AArch64::FPR32RegClass;
623     case 64:
624       return &AArch64::FPR64RegClass;
625     case 128:
626       return &AArch64::FPR128RegClass;
627     }
628   }
629 
630   return nullptr;
631 }
632 
633 /// Returns the correct subregister to use for a given register class.
634 static bool getSubRegForClass(const TargetRegisterClass *RC,
635                               const TargetRegisterInfo &TRI, unsigned &SubReg) {
636   switch (TRI.getRegSizeInBits(*RC)) {
637   case 8:
638     SubReg = AArch64::bsub;
639     break;
640   case 16:
641     SubReg = AArch64::hsub;
642     break;
643   case 32:
644     if (RC != &AArch64::FPR32RegClass)
645       SubReg = AArch64::sub_32;
646     else
647       SubReg = AArch64::ssub;
648     break;
649   case 64:
650     SubReg = AArch64::dsub;
651     break;
652   default:
653     LLVM_DEBUG(
654         dbgs() << "Couldn't find appropriate subregister for register class.");
655     return false;
656   }
657 
658   return true;
659 }
660 
661 /// Returns the minimum size the given register bank can hold.
662 static unsigned getMinSizeForRegBank(const RegisterBank &RB) {
663   switch (RB.getID()) {
664   case AArch64::GPRRegBankID:
665     return 32;
666   case AArch64::FPRRegBankID:
667     return 8;
668   default:
669     llvm_unreachable("Tried to get minimum size for unknown register bank.");
670   }
671 }
672 
673 /// Create a REG_SEQUENCE instruction using the registers in \p Regs.
674 /// Helper function for functions like createDTuple and createQTuple.
675 ///
676 /// \p RegClassIDs - The list of register class IDs available for some tuple of
677 /// a scalar class. E.g. QQRegClassID, QQQRegClassID, QQQQRegClassID. This is
678 /// expected to contain between 2 and 4 tuple classes.
679 ///
680 /// \p SubRegs - The list of subregister classes associated with each register
681 /// class ID in \p RegClassIDs. E.g., QQRegClassID should use the qsub0
682 /// subregister class. The index of each subregister class is expected to
683 /// correspond with the index of each register class.
684 ///
685 /// \returns Either the destination register of REG_SEQUENCE instruction that
686 /// was created, or the 0th element of \p Regs if \p Regs contains a single
687 /// element.
688 static Register createTuple(ArrayRef<Register> Regs,
689                             const unsigned RegClassIDs[],
690                             const unsigned SubRegs[], MachineIRBuilder &MIB) {
691   unsigned NumRegs = Regs.size();
692   if (NumRegs == 1)
693     return Regs[0];
694   assert(NumRegs >= 2 && NumRegs <= 4 &&
695          "Only support between two and 4 registers in a tuple!");
696   const TargetRegisterInfo *TRI = MIB.getMF().getSubtarget().getRegisterInfo();
697   auto *DesiredClass = TRI->getRegClass(RegClassIDs[NumRegs - 2]);
698   auto RegSequence =
699       MIB.buildInstr(TargetOpcode::REG_SEQUENCE, {DesiredClass}, {});
700   for (unsigned I = 0, E = Regs.size(); I < E; ++I) {
701     RegSequence.addUse(Regs[I]);
702     RegSequence.addImm(SubRegs[I]);
703   }
704   return RegSequence.getReg(0);
705 }
706 
707 /// Create a tuple of D-registers using the registers in \p Regs.
708 static Register createDTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) {
709   static const unsigned RegClassIDs[] = {
710       AArch64::DDRegClassID, AArch64::DDDRegClassID, AArch64::DDDDRegClassID};
711   static const unsigned SubRegs[] = {AArch64::dsub0, AArch64::dsub1,
712                                      AArch64::dsub2, AArch64::dsub3};
713   return createTuple(Regs, RegClassIDs, SubRegs, MIB);
714 }
715 
716 /// Create a tuple of Q-registers using the registers in \p Regs.
717 static Register createQTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) {
718   static const unsigned RegClassIDs[] = {
719       AArch64::QQRegClassID, AArch64::QQQRegClassID, AArch64::QQQQRegClassID};
720   static const unsigned SubRegs[] = {AArch64::qsub0, AArch64::qsub1,
721                                      AArch64::qsub2, AArch64::qsub3};
722   return createTuple(Regs, RegClassIDs, SubRegs, MIB);
723 }
724 
725 static std::optional<uint64_t> getImmedFromMO(const MachineOperand &Root) {
726   auto &MI = *Root.getParent();
727   auto &MBB = *MI.getParent();
728   auto &MF = *MBB.getParent();
729   auto &MRI = MF.getRegInfo();
730   uint64_t Immed;
731   if (Root.isImm())
732     Immed = Root.getImm();
733   else if (Root.isCImm())
734     Immed = Root.getCImm()->getZExtValue();
735   else if (Root.isReg()) {
736     auto ValAndVReg =
737         getIConstantVRegValWithLookThrough(Root.getReg(), MRI, true);
738     if (!ValAndVReg)
739       return std::nullopt;
740     Immed = ValAndVReg->Value.getSExtValue();
741   } else
742     return std::nullopt;
743   return Immed;
744 }
745 
746 /// Check whether \p I is a currently unsupported binary operation:
747 /// - it has an unsized type
748 /// - an operand is not a vreg
749 /// - all operands are not in the same bank
750 /// These are checks that should someday live in the verifier, but right now,
751 /// these are mostly limitations of the aarch64 selector.
752 static bool unsupportedBinOp(const MachineInstr &I,
753                              const AArch64RegisterBankInfo &RBI,
754                              const MachineRegisterInfo &MRI,
755                              const AArch64RegisterInfo &TRI) {
756   LLT Ty = MRI.getType(I.getOperand(0).getReg());
757   if (!Ty.isValid()) {
758     LLVM_DEBUG(dbgs() << "Generic binop register should be typed\n");
759     return true;
760   }
761 
762   const RegisterBank *PrevOpBank = nullptr;
763   for (auto &MO : I.operands()) {
764     // FIXME: Support non-register operands.
765     if (!MO.isReg()) {
766       LLVM_DEBUG(dbgs() << "Generic inst non-reg operands are unsupported\n");
767       return true;
768     }
769 
770     // FIXME: Can generic operations have physical registers operands? If
771     // so, this will need to be taught about that, and we'll need to get the
772     // bank out of the minimal class for the register.
773     // Either way, this needs to be documented (and possibly verified).
774     if (!MO.getReg().isVirtual()) {
775       LLVM_DEBUG(dbgs() << "Generic inst has physical register operand\n");
776       return true;
777     }
778 
779     const RegisterBank *OpBank = RBI.getRegBank(MO.getReg(), MRI, TRI);
780     if (!OpBank) {
781       LLVM_DEBUG(dbgs() << "Generic register has no bank or class\n");
782       return true;
783     }
784 
785     if (PrevOpBank && OpBank != PrevOpBank) {
786       LLVM_DEBUG(dbgs() << "Generic inst operands have different banks\n");
787       return true;
788     }
789     PrevOpBank = OpBank;
790   }
791   return false;
792 }
793 
794 /// Select the AArch64 opcode for the basic binary operation \p GenericOpc
795 /// (such as G_OR or G_SDIV), appropriate for the register bank \p RegBankID
796 /// and of size \p OpSize.
797 /// \returns \p GenericOpc if the combination is unsupported.
798 static unsigned selectBinaryOp(unsigned GenericOpc, unsigned RegBankID,
799                                unsigned OpSize) {
800   switch (RegBankID) {
801   case AArch64::GPRRegBankID:
802     if (OpSize == 32) {
803       switch (GenericOpc) {
804       case TargetOpcode::G_SHL:
805         return AArch64::LSLVWr;
806       case TargetOpcode::G_LSHR:
807         return AArch64::LSRVWr;
808       case TargetOpcode::G_ASHR:
809         return AArch64::ASRVWr;
810       default:
811         return GenericOpc;
812       }
813     } else if (OpSize == 64) {
814       switch (GenericOpc) {
815       case TargetOpcode::G_PTR_ADD:
816         return AArch64::ADDXrr;
817       case TargetOpcode::G_SHL:
818         return AArch64::LSLVXr;
819       case TargetOpcode::G_LSHR:
820         return AArch64::LSRVXr;
821       case TargetOpcode::G_ASHR:
822         return AArch64::ASRVXr;
823       default:
824         return GenericOpc;
825       }
826     }
827     break;
828   case AArch64::FPRRegBankID:
829     switch (OpSize) {
830     case 32:
831       switch (GenericOpc) {
832       case TargetOpcode::G_FADD:
833         return AArch64::FADDSrr;
834       case TargetOpcode::G_FSUB:
835         return AArch64::FSUBSrr;
836       case TargetOpcode::G_FMUL:
837         return AArch64::FMULSrr;
838       case TargetOpcode::G_FDIV:
839         return AArch64::FDIVSrr;
840       default:
841         return GenericOpc;
842       }
843     case 64:
844       switch (GenericOpc) {
845       case TargetOpcode::G_FADD:
846         return AArch64::FADDDrr;
847       case TargetOpcode::G_FSUB:
848         return AArch64::FSUBDrr;
849       case TargetOpcode::G_FMUL:
850         return AArch64::FMULDrr;
851       case TargetOpcode::G_FDIV:
852         return AArch64::FDIVDrr;
853       case TargetOpcode::G_OR:
854         return AArch64::ORRv8i8;
855       default:
856         return GenericOpc;
857       }
858     }
859     break;
860   }
861   return GenericOpc;
862 }
863 
864 /// Select the AArch64 opcode for the G_LOAD or G_STORE operation \p GenericOpc,
865 /// appropriate for the (value) register bank \p RegBankID and of memory access
866 /// size \p OpSize.  This returns the variant with the base+unsigned-immediate
867 /// addressing mode (e.g., LDRXui).
868 /// \returns \p GenericOpc if the combination is unsupported.
869 static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID,
870                                     unsigned OpSize) {
871   const bool isStore = GenericOpc == TargetOpcode::G_STORE;
872   switch (RegBankID) {
873   case AArch64::GPRRegBankID:
874     switch (OpSize) {
875     case 8:
876       return isStore ? AArch64::STRBBui : AArch64::LDRBBui;
877     case 16:
878       return isStore ? AArch64::STRHHui : AArch64::LDRHHui;
879     case 32:
880       return isStore ? AArch64::STRWui : AArch64::LDRWui;
881     case 64:
882       return isStore ? AArch64::STRXui : AArch64::LDRXui;
883     }
884     break;
885   case AArch64::FPRRegBankID:
886     switch (OpSize) {
887     case 8:
888       return isStore ? AArch64::STRBui : AArch64::LDRBui;
889     case 16:
890       return isStore ? AArch64::STRHui : AArch64::LDRHui;
891     case 32:
892       return isStore ? AArch64::STRSui : AArch64::LDRSui;
893     case 64:
894       return isStore ? AArch64::STRDui : AArch64::LDRDui;
895     case 128:
896       return isStore ? AArch64::STRQui : AArch64::LDRQui;
897     }
898     break;
899   }
900   return GenericOpc;
901 }
902 
903 /// Helper function for selectCopy. Inserts a subregister copy from \p SrcReg
904 /// to \p *To.
905 ///
906 /// E.g "To = COPY SrcReg:SubReg"
907 static bool copySubReg(MachineInstr &I, MachineRegisterInfo &MRI,
908                        const RegisterBankInfo &RBI, Register SrcReg,
909                        const TargetRegisterClass *To, unsigned SubReg) {
910   assert(SrcReg.isValid() && "Expected a valid source register?");
911   assert(To && "Destination register class cannot be null");
912   assert(SubReg && "Expected a valid subregister");
913 
914   MachineIRBuilder MIB(I);
915   auto SubRegCopy =
916       MIB.buildInstr(TargetOpcode::COPY, {To}, {}).addReg(SrcReg, 0, SubReg);
917   MachineOperand &RegOp = I.getOperand(1);
918   RegOp.setReg(SubRegCopy.getReg(0));
919 
920   // It's possible that the destination register won't be constrained. Make
921   // sure that happens.
922   if (!I.getOperand(0).getReg().isPhysical())
923     RBI.constrainGenericRegister(I.getOperand(0).getReg(), *To, MRI);
924 
925   return true;
926 }
927 
928 /// Helper function to get the source and destination register classes for a
929 /// copy. Returns a std::pair containing the source register class for the
930 /// copy, and the destination register class for the copy. If a register class
931 /// cannot be determined, then it will be nullptr.
932 static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
933 getRegClassesForCopy(MachineInstr &I, const TargetInstrInfo &TII,
934                      MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
935                      const RegisterBankInfo &RBI) {
936   Register DstReg = I.getOperand(0).getReg();
937   Register SrcReg = I.getOperand(1).getReg();
938   const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
939   const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
940   unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI);
941   unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
942 
943   // Special casing for cross-bank copies of s1s. We can technically represent
944   // a 1-bit value with any size of register. The minimum size for a GPR is 32
945   // bits. So, we need to put the FPR on 32 bits as well.
946   //
947   // FIXME: I'm not sure if this case holds true outside of copies. If it does,
948   // then we can pull it into the helpers that get the appropriate class for a
949   // register bank. Or make a new helper that carries along some constraint
950   // information.
951   if (SrcRegBank != DstRegBank && (DstSize == 1 && SrcSize == 1))
952     SrcSize = DstSize = 32;
953 
954   return {getMinClassForRegBank(SrcRegBank, SrcSize, true),
955           getMinClassForRegBank(DstRegBank, DstSize, true)};
956 }
957 
958 // FIXME: We need some sort of API in RBI/TRI to allow generic code to
959 // constrain operands of simple instructions given a TargetRegisterClass
960 // and LLT
961 static bool selectDebugInstr(MachineInstr &I, MachineRegisterInfo &MRI,
962                              const RegisterBankInfo &RBI) {
963   for (MachineOperand &MO : I.operands()) {
964     if (!MO.isReg())
965       continue;
966     Register Reg = MO.getReg();
967     if (!Reg)
968       continue;
969     if (Reg.isPhysical())
970       continue;
971     LLT Ty = MRI.getType(Reg);
972     const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
973     const TargetRegisterClass *RC =
974         RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
975     if (!RC) {
976       const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
977       RC = getRegClassForTypeOnBank(Ty, RB);
978       if (!RC) {
979         LLVM_DEBUG(
980             dbgs() << "Warning: DBG_VALUE operand has unexpected size/bank\n");
981         break;
982       }
983     }
984     RBI.constrainGenericRegister(Reg, *RC, MRI);
985   }
986 
987   return true;
988 }
989 
990 static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
991                        MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
992                        const RegisterBankInfo &RBI) {
993   Register DstReg = I.getOperand(0).getReg();
994   Register SrcReg = I.getOperand(1).getReg();
995   const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
996   const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
997 
998   // Find the correct register classes for the source and destination registers.
999   const TargetRegisterClass *SrcRC;
1000   const TargetRegisterClass *DstRC;
1001   std::tie(SrcRC, DstRC) = getRegClassesForCopy(I, TII, MRI, TRI, RBI);
1002 
1003   if (!DstRC) {
1004     LLVM_DEBUG(dbgs() << "Unexpected dest size "
1005                       << RBI.getSizeInBits(DstReg, MRI, TRI) << '\n');
1006     return false;
1007   }
1008 
1009   // Is this a copy? If so, then we may need to insert a subregister copy.
1010   if (I.isCopy()) {
1011     // Yes. Check if there's anything to fix up.
1012     if (!SrcRC) {
1013       LLVM_DEBUG(dbgs() << "Couldn't determine source register class\n");
1014       return false;
1015     }
1016 
1017     unsigned SrcSize = TRI.getRegSizeInBits(*SrcRC);
1018     unsigned DstSize = TRI.getRegSizeInBits(*DstRC);
1019     unsigned SubReg;
1020 
1021     // If the source bank doesn't support a subregister copy small enough,
1022     // then we first need to copy to the destination bank.
1023     if (getMinSizeForRegBank(SrcRegBank) > DstSize) {
1024       const TargetRegisterClass *DstTempRC =
1025           getMinClassForRegBank(DstRegBank, SrcSize, /* GetAllRegSet */ true);
1026       getSubRegForClass(DstRC, TRI, SubReg);
1027 
1028       MachineIRBuilder MIB(I);
1029       auto Copy = MIB.buildCopy({DstTempRC}, {SrcReg});
1030       copySubReg(I, MRI, RBI, Copy.getReg(0), DstRC, SubReg);
1031     } else if (SrcSize > DstSize) {
1032       // If the source register is bigger than the destination we need to
1033       // perform a subregister copy.
1034       const TargetRegisterClass *SubRegRC =
1035           getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true);
1036       getSubRegForClass(SubRegRC, TRI, SubReg);
1037       copySubReg(I, MRI, RBI, SrcReg, DstRC, SubReg);
1038     } else if (DstSize > SrcSize) {
1039       // If the destination register is bigger than the source we need to do
1040       // a promotion using SUBREG_TO_REG.
1041       const TargetRegisterClass *PromotionRC =
1042           getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true);
1043       getSubRegForClass(SrcRC, TRI, SubReg);
1044 
1045       Register PromoteReg = MRI.createVirtualRegister(PromotionRC);
1046       BuildMI(*I.getParent(), I, I.getDebugLoc(),
1047               TII.get(AArch64::SUBREG_TO_REG), PromoteReg)
1048           .addImm(0)
1049           .addUse(SrcReg)
1050           .addImm(SubReg);
1051       MachineOperand &RegOp = I.getOperand(1);
1052       RegOp.setReg(PromoteReg);
1053     }
1054 
1055     // If the destination is a physical register, then there's nothing to
1056     // change, so we're done.
1057     if (DstReg.isPhysical())
1058       return true;
1059   }
1060 
1061   // No need to constrain SrcReg. It will get constrained when we hit another
1062   // of its use or its defs. Copies do not have constraints.
1063   if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
1064     LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
1065                       << " operand\n");
1066     return false;
1067   }
1068 
1069   // If this a GPR ZEXT that we want to just reduce down into a copy.
1070   // The sizes will be mismatched with the source < 32b but that's ok.
1071   if (I.getOpcode() == TargetOpcode::G_ZEXT) {
1072     I.setDesc(TII.get(AArch64::COPY));
1073     assert(SrcRegBank.getID() == AArch64::GPRRegBankID);
1074     return selectCopy(I, TII, MRI, TRI, RBI);
1075   }
1076 
1077   I.setDesc(TII.get(AArch64::COPY));
1078   return true;
1079 }
1080 
1081 static unsigned selectFPConvOpc(unsigned GenericOpc, LLT DstTy, LLT SrcTy) {
1082   if (!DstTy.isScalar() || !SrcTy.isScalar())
1083     return GenericOpc;
1084 
1085   const unsigned DstSize = DstTy.getSizeInBits();
1086   const unsigned SrcSize = SrcTy.getSizeInBits();
1087 
1088   switch (DstSize) {
1089   case 32:
1090     switch (SrcSize) {
1091     case 32:
1092       switch (GenericOpc) {
1093       case TargetOpcode::G_SITOFP:
1094         return AArch64::SCVTFUWSri;
1095       case TargetOpcode::G_UITOFP:
1096         return AArch64::UCVTFUWSri;
1097       case TargetOpcode::G_FPTOSI:
1098         return AArch64::FCVTZSUWSr;
1099       case TargetOpcode::G_FPTOUI:
1100         return AArch64::FCVTZUUWSr;
1101       default:
1102         return GenericOpc;
1103       }
1104     case 64:
1105       switch (GenericOpc) {
1106       case TargetOpcode::G_SITOFP:
1107         return AArch64::SCVTFUXSri;
1108       case TargetOpcode::G_UITOFP:
1109         return AArch64::UCVTFUXSri;
1110       case TargetOpcode::G_FPTOSI:
1111         return AArch64::FCVTZSUWDr;
1112       case TargetOpcode::G_FPTOUI:
1113         return AArch64::FCVTZUUWDr;
1114       default:
1115         return GenericOpc;
1116       }
1117     default:
1118       return GenericOpc;
1119     }
1120   case 64:
1121     switch (SrcSize) {
1122     case 32:
1123       switch (GenericOpc) {
1124       case TargetOpcode::G_SITOFP:
1125         return AArch64::SCVTFUWDri;
1126       case TargetOpcode::G_UITOFP:
1127         return AArch64::UCVTFUWDri;
1128       case TargetOpcode::G_FPTOSI:
1129         return AArch64::FCVTZSUXSr;
1130       case TargetOpcode::G_FPTOUI:
1131         return AArch64::FCVTZUUXSr;
1132       default:
1133         return GenericOpc;
1134       }
1135     case 64:
1136       switch (GenericOpc) {
1137       case TargetOpcode::G_SITOFP:
1138         return AArch64::SCVTFUXDri;
1139       case TargetOpcode::G_UITOFP:
1140         return AArch64::UCVTFUXDri;
1141       case TargetOpcode::G_FPTOSI:
1142         return AArch64::FCVTZSUXDr;
1143       case TargetOpcode::G_FPTOUI:
1144         return AArch64::FCVTZUUXDr;
1145       default:
1146         return GenericOpc;
1147       }
1148     default:
1149       return GenericOpc;
1150     }
1151   default:
1152     return GenericOpc;
1153   };
1154   return GenericOpc;
1155 }
1156 
1157 MachineInstr *
1158 AArch64InstructionSelector::emitSelect(Register Dst, Register True,
1159                                        Register False, AArch64CC::CondCode CC,
1160                                        MachineIRBuilder &MIB) const {
1161   MachineRegisterInfo &MRI = *MIB.getMRI();
1162   assert(RBI.getRegBank(False, MRI, TRI)->getID() ==
1163              RBI.getRegBank(True, MRI, TRI)->getID() &&
1164          "Expected both select operands to have the same regbank?");
1165   LLT Ty = MRI.getType(True);
1166   if (Ty.isVector())
1167     return nullptr;
1168   const unsigned Size = Ty.getSizeInBits();
1169   assert((Size == 32 || Size == 64) &&
1170          "Expected 32 bit or 64 bit select only?");
1171   const bool Is32Bit = Size == 32;
1172   if (RBI.getRegBank(True, MRI, TRI)->getID() != AArch64::GPRRegBankID) {
1173     unsigned Opc = Is32Bit ? AArch64::FCSELSrrr : AArch64::FCSELDrrr;
1174     auto FCSel = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC);
1175     constrainSelectedInstRegOperands(*FCSel, TII, TRI, RBI);
1176     return &*FCSel;
1177   }
1178 
1179   // By default, we'll try and emit a CSEL.
1180   unsigned Opc = Is32Bit ? AArch64::CSELWr : AArch64::CSELXr;
1181   bool Optimized = false;
1182   auto TryFoldBinOpIntoSelect = [&Opc, Is32Bit, &CC, &MRI,
1183                                  &Optimized](Register &Reg, Register &OtherReg,
1184                                              bool Invert) {
1185     if (Optimized)
1186       return false;
1187 
1188     // Attempt to fold:
1189     //
1190     // %sub = G_SUB 0, %x
1191     // %select = G_SELECT cc, %reg, %sub
1192     //
1193     // Into:
1194     // %select = CSNEG %reg, %x, cc
1195     Register MatchReg;
1196     if (mi_match(Reg, MRI, m_Neg(m_Reg(MatchReg)))) {
1197       Opc = Is32Bit ? AArch64::CSNEGWr : AArch64::CSNEGXr;
1198       Reg = MatchReg;
1199       if (Invert) {
1200         CC = AArch64CC::getInvertedCondCode(CC);
1201         std::swap(Reg, OtherReg);
1202       }
1203       return true;
1204     }
1205 
1206     // Attempt to fold:
1207     //
1208     // %xor = G_XOR %x, -1
1209     // %select = G_SELECT cc, %reg, %xor
1210     //
1211     // Into:
1212     // %select = CSINV %reg, %x, cc
1213     if (mi_match(Reg, MRI, m_Not(m_Reg(MatchReg)))) {
1214       Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1215       Reg = MatchReg;
1216       if (Invert) {
1217         CC = AArch64CC::getInvertedCondCode(CC);
1218         std::swap(Reg, OtherReg);
1219       }
1220       return true;
1221     }
1222 
1223     // Attempt to fold:
1224     //
1225     // %add = G_ADD %x, 1
1226     // %select = G_SELECT cc, %reg, %add
1227     //
1228     // Into:
1229     // %select = CSINC %reg, %x, cc
1230     if (mi_match(Reg, MRI,
1231                  m_any_of(m_GAdd(m_Reg(MatchReg), m_SpecificICst(1)),
1232                           m_GPtrAdd(m_Reg(MatchReg), m_SpecificICst(1))))) {
1233       Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1234       Reg = MatchReg;
1235       if (Invert) {
1236         CC = AArch64CC::getInvertedCondCode(CC);
1237         std::swap(Reg, OtherReg);
1238       }
1239       return true;
1240     }
1241 
1242     return false;
1243   };
1244 
1245   // Helper lambda which tries to use CSINC/CSINV for the instruction when its
1246   // true/false values are constants.
1247   // FIXME: All of these patterns already exist in tablegen. We should be
1248   // able to import these.
1249   auto TryOptSelectCst = [&Opc, &True, &False, &CC, Is32Bit, &MRI,
1250                           &Optimized]() {
1251     if (Optimized)
1252       return false;
1253     auto TrueCst = getIConstantVRegValWithLookThrough(True, MRI);
1254     auto FalseCst = getIConstantVRegValWithLookThrough(False, MRI);
1255     if (!TrueCst && !FalseCst)
1256       return false;
1257 
1258     Register ZReg = Is32Bit ? AArch64::WZR : AArch64::XZR;
1259     if (TrueCst && FalseCst) {
1260       int64_t T = TrueCst->Value.getSExtValue();
1261       int64_t F = FalseCst->Value.getSExtValue();
1262 
1263       if (T == 0 && F == 1) {
1264         // G_SELECT cc, 0, 1 -> CSINC zreg, zreg, cc
1265         Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1266         True = ZReg;
1267         False = ZReg;
1268         return true;
1269       }
1270 
1271       if (T == 0 && F == -1) {
1272         // G_SELECT cc 0, -1 -> CSINV zreg, zreg cc
1273         Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1274         True = ZReg;
1275         False = ZReg;
1276         return true;
1277       }
1278     }
1279 
1280     if (TrueCst) {
1281       int64_t T = TrueCst->Value.getSExtValue();
1282       if (T == 1) {
1283         // G_SELECT cc, 1, f -> CSINC f, zreg, inv_cc
1284         Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1285         True = False;
1286         False = ZReg;
1287         CC = AArch64CC::getInvertedCondCode(CC);
1288         return true;
1289       }
1290 
1291       if (T == -1) {
1292         // G_SELECT cc, -1, f -> CSINV f, zreg, inv_cc
1293         Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1294         True = False;
1295         False = ZReg;
1296         CC = AArch64CC::getInvertedCondCode(CC);
1297         return true;
1298       }
1299     }
1300 
1301     if (FalseCst) {
1302       int64_t F = FalseCst->Value.getSExtValue();
1303       if (F == 1) {
1304         // G_SELECT cc, t, 1 -> CSINC t, zreg, cc
1305         Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1306         False = ZReg;
1307         return true;
1308       }
1309 
1310       if (F == -1) {
1311         // G_SELECT cc, t, -1 -> CSINC t, zreg, cc
1312         Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1313         False = ZReg;
1314         return true;
1315       }
1316     }
1317     return false;
1318   };
1319 
1320   Optimized |= TryFoldBinOpIntoSelect(False, True, /*Invert = */ false);
1321   Optimized |= TryFoldBinOpIntoSelect(True, False, /*Invert = */ true);
1322   Optimized |= TryOptSelectCst();
1323   auto SelectInst = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC);
1324   constrainSelectedInstRegOperands(*SelectInst, TII, TRI, RBI);
1325   return &*SelectInst;
1326 }
1327 
1328 static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) {
1329   switch (P) {
1330   default:
1331     llvm_unreachable("Unknown condition code!");
1332   case CmpInst::ICMP_NE:
1333     return AArch64CC::NE;
1334   case CmpInst::ICMP_EQ:
1335     return AArch64CC::EQ;
1336   case CmpInst::ICMP_SGT:
1337     return AArch64CC::GT;
1338   case CmpInst::ICMP_SGE:
1339     return AArch64CC::GE;
1340   case CmpInst::ICMP_SLT:
1341     return AArch64CC::LT;
1342   case CmpInst::ICMP_SLE:
1343     return AArch64CC::LE;
1344   case CmpInst::ICMP_UGT:
1345     return AArch64CC::HI;
1346   case CmpInst::ICMP_UGE:
1347     return AArch64CC::HS;
1348   case CmpInst::ICMP_ULT:
1349     return AArch64CC::LO;
1350   case CmpInst::ICMP_ULE:
1351     return AArch64CC::LS;
1352   }
1353 }
1354 
1355 /// changeFPCCToORAArch64CC - Convert an IR fp condition code to an AArch64 CC.
1356 static void changeFPCCToORAArch64CC(CmpInst::Predicate CC,
1357                                     AArch64CC::CondCode &CondCode,
1358                                     AArch64CC::CondCode &CondCode2) {
1359   CondCode2 = AArch64CC::AL;
1360   switch (CC) {
1361   default:
1362     llvm_unreachable("Unknown FP condition!");
1363   case CmpInst::FCMP_OEQ:
1364     CondCode = AArch64CC::EQ;
1365     break;
1366   case CmpInst::FCMP_OGT:
1367     CondCode = AArch64CC::GT;
1368     break;
1369   case CmpInst::FCMP_OGE:
1370     CondCode = AArch64CC::GE;
1371     break;
1372   case CmpInst::FCMP_OLT:
1373     CondCode = AArch64CC::MI;
1374     break;
1375   case CmpInst::FCMP_OLE:
1376     CondCode = AArch64CC::LS;
1377     break;
1378   case CmpInst::FCMP_ONE:
1379     CondCode = AArch64CC::MI;
1380     CondCode2 = AArch64CC::GT;
1381     break;
1382   case CmpInst::FCMP_ORD:
1383     CondCode = AArch64CC::VC;
1384     break;
1385   case CmpInst::FCMP_UNO:
1386     CondCode = AArch64CC::VS;
1387     break;
1388   case CmpInst::FCMP_UEQ:
1389     CondCode = AArch64CC::EQ;
1390     CondCode2 = AArch64CC::VS;
1391     break;
1392   case CmpInst::FCMP_UGT:
1393     CondCode = AArch64CC::HI;
1394     break;
1395   case CmpInst::FCMP_UGE:
1396     CondCode = AArch64CC::PL;
1397     break;
1398   case CmpInst::FCMP_ULT:
1399     CondCode = AArch64CC::LT;
1400     break;
1401   case CmpInst::FCMP_ULE:
1402     CondCode = AArch64CC::LE;
1403     break;
1404   case CmpInst::FCMP_UNE:
1405     CondCode = AArch64CC::NE;
1406     break;
1407   }
1408 }
1409 
1410 /// Convert an IR fp condition code to an AArch64 CC.
1411 /// This differs from changeFPCCToAArch64CC in that it returns cond codes that
1412 /// should be AND'ed instead of OR'ed.
1413 static void changeFPCCToANDAArch64CC(CmpInst::Predicate CC,
1414                                      AArch64CC::CondCode &CondCode,
1415                                      AArch64CC::CondCode &CondCode2) {
1416   CondCode2 = AArch64CC::AL;
1417   switch (CC) {
1418   default:
1419     changeFPCCToORAArch64CC(CC, CondCode, CondCode2);
1420     assert(CondCode2 == AArch64CC::AL);
1421     break;
1422   case CmpInst::FCMP_ONE:
1423     // (a one b)
1424     // == ((a olt b) || (a ogt b))
1425     // == ((a ord b) && (a une b))
1426     CondCode = AArch64CC::VC;
1427     CondCode2 = AArch64CC::NE;
1428     break;
1429   case CmpInst::FCMP_UEQ:
1430     // (a ueq b)
1431     // == ((a uno b) || (a oeq b))
1432     // == ((a ule b) && (a uge b))
1433     CondCode = AArch64CC::PL;
1434     CondCode2 = AArch64CC::LE;
1435     break;
1436   }
1437 }
1438 
1439 /// Return a register which can be used as a bit to test in a TB(N)Z.
1440 static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert,
1441                               MachineRegisterInfo &MRI) {
1442   assert(Reg.isValid() && "Expected valid register!");
1443   bool HasZext = false;
1444   while (MachineInstr *MI = getDefIgnoringCopies(Reg, MRI)) {
1445     unsigned Opc = MI->getOpcode();
1446 
1447     if (!MI->getOperand(0).isReg() ||
1448         !MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
1449       break;
1450 
1451     // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
1452     //
1453     // (tbz (trunc x), b) -> (tbz x, b) is always safe, because the bit number
1454     // on the truncated x is the same as the bit number on x.
1455     if (Opc == TargetOpcode::G_ANYEXT || Opc == TargetOpcode::G_ZEXT ||
1456         Opc == TargetOpcode::G_TRUNC) {
1457       if (Opc == TargetOpcode::G_ZEXT)
1458         HasZext = true;
1459 
1460       Register NextReg = MI->getOperand(1).getReg();
1461       // Did we find something worth folding?
1462       if (!NextReg.isValid() || !MRI.hasOneNonDBGUse(NextReg))
1463         break;
1464 
1465       // NextReg is worth folding. Keep looking.
1466       Reg = NextReg;
1467       continue;
1468     }
1469 
1470     // Attempt to find a suitable operation with a constant on one side.
1471     std::optional<uint64_t> C;
1472     Register TestReg;
1473     switch (Opc) {
1474     default:
1475       break;
1476     case TargetOpcode::G_AND:
1477     case TargetOpcode::G_XOR: {
1478       TestReg = MI->getOperand(1).getReg();
1479       Register ConstantReg = MI->getOperand(2).getReg();
1480       auto VRegAndVal = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
1481       if (!VRegAndVal) {
1482         // AND commutes, check the other side for a constant.
1483         // FIXME: Can we canonicalize the constant so that it's always on the
1484         // same side at some point earlier?
1485         std::swap(ConstantReg, TestReg);
1486         VRegAndVal = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
1487       }
1488       if (VRegAndVal) {
1489         if (HasZext)
1490           C = VRegAndVal->Value.getZExtValue();
1491         else
1492           C = VRegAndVal->Value.getSExtValue();
1493       }
1494       break;
1495     }
1496     case TargetOpcode::G_ASHR:
1497     case TargetOpcode::G_LSHR:
1498     case TargetOpcode::G_SHL: {
1499       TestReg = MI->getOperand(1).getReg();
1500       auto VRegAndVal =
1501           getIConstantVRegValWithLookThrough(MI->getOperand(2).getReg(), MRI);
1502       if (VRegAndVal)
1503         C = VRegAndVal->Value.getSExtValue();
1504       break;
1505     }
1506     }
1507 
1508     // Didn't find a constant or viable register. Bail out of the loop.
1509     if (!C || !TestReg.isValid())
1510       break;
1511 
1512     // We found a suitable instruction with a constant. Check to see if we can
1513     // walk through the instruction.
1514     Register NextReg;
1515     unsigned TestRegSize = MRI.getType(TestReg).getSizeInBits();
1516     switch (Opc) {
1517     default:
1518       break;
1519     case TargetOpcode::G_AND:
1520       // (tbz (and x, m), b) -> (tbz x, b) when the b-th bit of m is set.
1521       if ((*C >> Bit) & 1)
1522         NextReg = TestReg;
1523       break;
1524     case TargetOpcode::G_SHL:
1525       // (tbz (shl x, c), b) -> (tbz x, b-c) when b-c is positive and fits in
1526       // the type of the register.
1527       if (*C <= Bit && (Bit - *C) < TestRegSize) {
1528         NextReg = TestReg;
1529         Bit = Bit - *C;
1530       }
1531       break;
1532     case TargetOpcode::G_ASHR:
1533       // (tbz (ashr x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits
1534       // in x
1535       NextReg = TestReg;
1536       Bit = Bit + *C;
1537       if (Bit >= TestRegSize)
1538         Bit = TestRegSize - 1;
1539       break;
1540     case TargetOpcode::G_LSHR:
1541       // (tbz (lshr x, c), b) -> (tbz x, b+c) when b + c is < # bits in x
1542       if ((Bit + *C) < TestRegSize) {
1543         NextReg = TestReg;
1544         Bit = Bit + *C;
1545       }
1546       break;
1547     case TargetOpcode::G_XOR:
1548       // We can walk through a G_XOR by inverting whether we use tbz/tbnz when
1549       // appropriate.
1550       //
1551       // e.g. If x' = xor x, c, and the b-th bit is set in c then
1552       //
1553       // tbz x', b -> tbnz x, b
1554       //
1555       // Because x' only has the b-th bit set if x does not.
1556       if ((*C >> Bit) & 1)
1557         Invert = !Invert;
1558       NextReg = TestReg;
1559       break;
1560     }
1561 
1562     // Check if we found anything worth folding.
1563     if (!NextReg.isValid())
1564       return Reg;
1565     Reg = NextReg;
1566   }
1567 
1568   return Reg;
1569 }
1570 
1571 MachineInstr *AArch64InstructionSelector::emitTestBit(
1572     Register TestReg, uint64_t Bit, bool IsNegative, MachineBasicBlock *DstMBB,
1573     MachineIRBuilder &MIB) const {
1574   assert(TestReg.isValid());
1575   assert(ProduceNonFlagSettingCondBr &&
1576          "Cannot emit TB(N)Z with speculation tracking!");
1577   MachineRegisterInfo &MRI = *MIB.getMRI();
1578 
1579   // Attempt to optimize the test bit by walking over instructions.
1580   TestReg = getTestBitReg(TestReg, Bit, IsNegative, MRI);
1581   LLT Ty = MRI.getType(TestReg);
1582   unsigned Size = Ty.getSizeInBits();
1583   assert(!Ty.isVector() && "Expected a scalar!");
1584   assert(Bit < 64 && "Bit is too large!");
1585 
1586   // When the test register is a 64-bit register, we have to narrow to make
1587   // TBNZW work.
1588   bool UseWReg = Bit < 32;
1589   unsigned NecessarySize = UseWReg ? 32 : 64;
1590   if (Size != NecessarySize)
1591     TestReg = moveScalarRegClass(
1592         TestReg, UseWReg ? AArch64::GPR32RegClass : AArch64::GPR64RegClass,
1593         MIB);
1594 
1595   static const unsigned OpcTable[2][2] = {{AArch64::TBZX, AArch64::TBNZX},
1596                                           {AArch64::TBZW, AArch64::TBNZW}};
1597   unsigned Opc = OpcTable[UseWReg][IsNegative];
1598   auto TestBitMI =
1599       MIB.buildInstr(Opc).addReg(TestReg).addImm(Bit).addMBB(DstMBB);
1600   constrainSelectedInstRegOperands(*TestBitMI, TII, TRI, RBI);
1601   return &*TestBitMI;
1602 }
1603 
1604 bool AArch64InstructionSelector::tryOptAndIntoCompareBranch(
1605     MachineInstr &AndInst, bool Invert, MachineBasicBlock *DstMBB,
1606     MachineIRBuilder &MIB) const {
1607   assert(AndInst.getOpcode() == TargetOpcode::G_AND && "Expected G_AND only?");
1608   // Given something like this:
1609   //
1610   //  %x = ...Something...
1611   //  %one = G_CONSTANT i64 1
1612   //  %zero = G_CONSTANT i64 0
1613   //  %and = G_AND %x, %one
1614   //  %cmp = G_ICMP intpred(ne), %and, %zero
1615   //  %cmp_trunc = G_TRUNC %cmp
1616   //  G_BRCOND %cmp_trunc, %bb.3
1617   //
1618   // We want to try and fold the AND into the G_BRCOND and produce either a
1619   // TBNZ (when we have intpred(ne)) or a TBZ (when we have intpred(eq)).
1620   //
1621   // In this case, we'd get
1622   //
1623   // TBNZ %x %bb.3
1624   //
1625 
1626   // Check if the AND has a constant on its RHS which we can use as a mask.
1627   // If it's a power of 2, then it's the same as checking a specific bit.
1628   // (e.g, ANDing with 8 == ANDing with 000...100 == testing if bit 3 is set)
1629   auto MaybeBit = getIConstantVRegValWithLookThrough(
1630       AndInst.getOperand(2).getReg(), *MIB.getMRI());
1631   if (!MaybeBit)
1632     return false;
1633 
1634   int32_t Bit = MaybeBit->Value.exactLogBase2();
1635   if (Bit < 0)
1636     return false;
1637 
1638   Register TestReg = AndInst.getOperand(1).getReg();
1639 
1640   // Emit a TB(N)Z.
1641   emitTestBit(TestReg, Bit, Invert, DstMBB, MIB);
1642   return true;
1643 }
1644 
1645 MachineInstr *AArch64InstructionSelector::emitCBZ(Register CompareReg,
1646                                                   bool IsNegative,
1647                                                   MachineBasicBlock *DestMBB,
1648                                                   MachineIRBuilder &MIB) const {
1649   assert(ProduceNonFlagSettingCondBr && "CBZ does not set flags!");
1650   MachineRegisterInfo &MRI = *MIB.getMRI();
1651   assert(RBI.getRegBank(CompareReg, MRI, TRI)->getID() ==
1652              AArch64::GPRRegBankID &&
1653          "Expected GPRs only?");
1654   auto Ty = MRI.getType(CompareReg);
1655   unsigned Width = Ty.getSizeInBits();
1656   assert(!Ty.isVector() && "Expected scalar only?");
1657   assert(Width <= 64 && "Expected width to be at most 64?");
1658   static const unsigned OpcTable[2][2] = {{AArch64::CBZW, AArch64::CBZX},
1659                                           {AArch64::CBNZW, AArch64::CBNZX}};
1660   unsigned Opc = OpcTable[IsNegative][Width == 64];
1661   auto BranchMI = MIB.buildInstr(Opc, {}, {CompareReg}).addMBB(DestMBB);
1662   constrainSelectedInstRegOperands(*BranchMI, TII, TRI, RBI);
1663   return &*BranchMI;
1664 }
1665 
1666 bool AArch64InstructionSelector::selectCompareBranchFedByFCmp(
1667     MachineInstr &I, MachineInstr &FCmp, MachineIRBuilder &MIB) const {
1668   assert(FCmp.getOpcode() == TargetOpcode::G_FCMP);
1669   assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1670   // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
1671   // totally clean.  Some of them require two branches to implement.
1672   auto Pred = (CmpInst::Predicate)FCmp.getOperand(1).getPredicate();
1673   emitFPCompare(FCmp.getOperand(2).getReg(), FCmp.getOperand(3).getReg(), MIB,
1674                 Pred);
1675   AArch64CC::CondCode CC1, CC2;
1676   changeFCMPPredToAArch64CC(static_cast<CmpInst::Predicate>(Pred), CC1, CC2);
1677   MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
1678   MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC1).addMBB(DestMBB);
1679   if (CC2 != AArch64CC::AL)
1680     MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC2).addMBB(DestMBB);
1681   I.eraseFromParent();
1682   return true;
1683 }
1684 
1685 bool AArch64InstructionSelector::tryOptCompareBranchFedByICmp(
1686     MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const {
1687   assert(ICmp.getOpcode() == TargetOpcode::G_ICMP);
1688   assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1689   // Attempt to optimize the G_BRCOND + G_ICMP into a TB(N)Z/CB(N)Z.
1690   //
1691   // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z
1692   // instructions will not be produced, as they are conditional branch
1693   // instructions that do not set flags.
1694   if (!ProduceNonFlagSettingCondBr)
1695     return false;
1696 
1697   MachineRegisterInfo &MRI = *MIB.getMRI();
1698   MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
1699   auto Pred =
1700       static_cast<CmpInst::Predicate>(ICmp.getOperand(1).getPredicate());
1701   Register LHS = ICmp.getOperand(2).getReg();
1702   Register RHS = ICmp.getOperand(3).getReg();
1703 
1704   // We're allowed to emit a TB(N)Z/CB(N)Z. Try to do that.
1705   auto VRegAndVal = getIConstantVRegValWithLookThrough(RHS, MRI);
1706   MachineInstr *AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI);
1707 
1708   // When we can emit a TB(N)Z, prefer that.
1709   //
1710   // Handle non-commutative condition codes first.
1711   // Note that we don't want to do this when we have a G_AND because it can
1712   // become a tst. The tst will make the test bit in the TB(N)Z redundant.
1713   if (VRegAndVal && !AndInst) {
1714     int64_t C = VRegAndVal->Value.getSExtValue();
1715 
1716     // When we have a greater-than comparison, we can just test if the msb is
1717     // zero.
1718     if (C == -1 && Pred == CmpInst::ICMP_SGT) {
1719       uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1;
1720       emitTestBit(LHS, Bit, /*IsNegative = */ false, DestMBB, MIB);
1721       I.eraseFromParent();
1722       return true;
1723     }
1724 
1725     // When we have a less than comparison, we can just test if the msb is not
1726     // zero.
1727     if (C == 0 && Pred == CmpInst::ICMP_SLT) {
1728       uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1;
1729       emitTestBit(LHS, Bit, /*IsNegative = */ true, DestMBB, MIB);
1730       I.eraseFromParent();
1731       return true;
1732     }
1733 
1734     // Inversely, if we have a signed greater-than-or-equal comparison to zero,
1735     // we can test if the msb is zero.
1736     if (C == 0 && Pred == CmpInst::ICMP_SGE) {
1737       uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1;
1738       emitTestBit(LHS, Bit, /*IsNegative = */ false, DestMBB, MIB);
1739       I.eraseFromParent();
1740       return true;
1741     }
1742   }
1743 
1744   // Attempt to handle commutative condition codes. Right now, that's only
1745   // eq/ne.
1746   if (ICmpInst::isEquality(Pred)) {
1747     if (!VRegAndVal) {
1748       std::swap(RHS, LHS);
1749       VRegAndVal = getIConstantVRegValWithLookThrough(RHS, MRI);
1750       AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI);
1751     }
1752 
1753     if (VRegAndVal && VRegAndVal->Value == 0) {
1754       // If there's a G_AND feeding into this branch, try to fold it away by
1755       // emitting a TB(N)Z instead.
1756       //
1757       // Note: If we have LT, then it *is* possible to fold, but it wouldn't be
1758       // beneficial. When we have an AND and LT, we need a TST/ANDS, so folding
1759       // would be redundant.
1760       if (AndInst &&
1761           tryOptAndIntoCompareBranch(
1762               *AndInst, /*Invert = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB)) {
1763         I.eraseFromParent();
1764         return true;
1765       }
1766 
1767       // Otherwise, try to emit a CB(N)Z instead.
1768       auto LHSTy = MRI.getType(LHS);
1769       if (!LHSTy.isVector() && LHSTy.getSizeInBits() <= 64) {
1770         emitCBZ(LHS, /*IsNegative = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB);
1771         I.eraseFromParent();
1772         return true;
1773       }
1774     }
1775   }
1776 
1777   return false;
1778 }
1779 
1780 bool AArch64InstructionSelector::selectCompareBranchFedByICmp(
1781     MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const {
1782   assert(ICmp.getOpcode() == TargetOpcode::G_ICMP);
1783   assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1784   if (tryOptCompareBranchFedByICmp(I, ICmp, MIB))
1785     return true;
1786 
1787   // Couldn't optimize. Emit a compare + a Bcc.
1788   MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
1789   auto PredOp = ICmp.getOperand(1);
1790   emitIntegerCompare(ICmp.getOperand(2), ICmp.getOperand(3), PredOp, MIB);
1791   const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(
1792       static_cast<CmpInst::Predicate>(PredOp.getPredicate()));
1793   MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC).addMBB(DestMBB);
1794   I.eraseFromParent();
1795   return true;
1796 }
1797 
1798 bool AArch64InstructionSelector::selectCompareBranch(
1799     MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) {
1800   Register CondReg = I.getOperand(0).getReg();
1801   MachineInstr *CCMI = MRI.getVRegDef(CondReg);
1802   // Try to select the G_BRCOND using whatever is feeding the condition if
1803   // possible.
1804   unsigned CCMIOpc = CCMI->getOpcode();
1805   if (CCMIOpc == TargetOpcode::G_FCMP)
1806     return selectCompareBranchFedByFCmp(I, *CCMI, MIB);
1807   if (CCMIOpc == TargetOpcode::G_ICMP)
1808     return selectCompareBranchFedByICmp(I, *CCMI, MIB);
1809 
1810   // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z
1811   // instructions will not be produced, as they are conditional branch
1812   // instructions that do not set flags.
1813   if (ProduceNonFlagSettingCondBr) {
1814     emitTestBit(CondReg, /*Bit = */ 0, /*IsNegative = */ true,
1815                 I.getOperand(1).getMBB(), MIB);
1816     I.eraseFromParent();
1817     return true;
1818   }
1819 
1820   // Can't emit TB(N)Z/CB(N)Z. Emit a tst + bcc instead.
1821   auto TstMI =
1822       MIB.buildInstr(AArch64::ANDSWri, {LLT::scalar(32)}, {CondReg}).addImm(1);
1823   constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
1824   auto Bcc = MIB.buildInstr(AArch64::Bcc)
1825                  .addImm(AArch64CC::NE)
1826                  .addMBB(I.getOperand(1).getMBB());
1827   I.eraseFromParent();
1828   return constrainSelectedInstRegOperands(*Bcc, TII, TRI, RBI);
1829 }
1830 
1831 /// Returns the element immediate value of a vector shift operand if found.
1832 /// This needs to detect a splat-like operation, e.g. a G_BUILD_VECTOR.
1833 static std::optional<int64_t> getVectorShiftImm(Register Reg,
1834                                                 MachineRegisterInfo &MRI) {
1835   assert(MRI.getType(Reg).isVector() && "Expected a *vector* shift operand");
1836   MachineInstr *OpMI = MRI.getVRegDef(Reg);
1837   return getAArch64VectorSplatScalar(*OpMI, MRI);
1838 }
1839 
1840 /// Matches and returns the shift immediate value for a SHL instruction given
1841 /// a shift operand.
1842 static std::optional<int64_t> getVectorSHLImm(LLT SrcTy, Register Reg,
1843                                               MachineRegisterInfo &MRI) {
1844   std::optional<int64_t> ShiftImm = getVectorShiftImm(Reg, MRI);
1845   if (!ShiftImm)
1846     return std::nullopt;
1847   // Check the immediate is in range for a SHL.
1848   int64_t Imm = *ShiftImm;
1849   if (Imm < 0)
1850     return std::nullopt;
1851   switch (SrcTy.getElementType().getSizeInBits()) {
1852   default:
1853     LLVM_DEBUG(dbgs() << "Unhandled element type for vector shift");
1854     return std::nullopt;
1855   case 8:
1856     if (Imm > 7)
1857       return std::nullopt;
1858     break;
1859   case 16:
1860     if (Imm > 15)
1861       return std::nullopt;
1862     break;
1863   case 32:
1864     if (Imm > 31)
1865       return std::nullopt;
1866     break;
1867   case 64:
1868     if (Imm > 63)
1869       return std::nullopt;
1870     break;
1871   }
1872   return Imm;
1873 }
1874 
1875 bool AArch64InstructionSelector::selectVectorSHL(MachineInstr &I,
1876                                                  MachineRegisterInfo &MRI) {
1877   assert(I.getOpcode() == TargetOpcode::G_SHL);
1878   Register DstReg = I.getOperand(0).getReg();
1879   const LLT Ty = MRI.getType(DstReg);
1880   Register Src1Reg = I.getOperand(1).getReg();
1881   Register Src2Reg = I.getOperand(2).getReg();
1882 
1883   if (!Ty.isVector())
1884     return false;
1885 
1886   // Check if we have a vector of constants on RHS that we can select as the
1887   // immediate form.
1888   std::optional<int64_t> ImmVal = getVectorSHLImm(Ty, Src2Reg, MRI);
1889 
1890   unsigned Opc = 0;
1891   if (Ty == LLT::fixed_vector(2, 64)) {
1892     Opc = ImmVal ? AArch64::SHLv2i64_shift : AArch64::USHLv2i64;
1893   } else if (Ty == LLT::fixed_vector(4, 32)) {
1894     Opc = ImmVal ? AArch64::SHLv4i32_shift : AArch64::USHLv4i32;
1895   } else if (Ty == LLT::fixed_vector(2, 32)) {
1896     Opc = ImmVal ? AArch64::SHLv2i32_shift : AArch64::USHLv2i32;
1897   } else if (Ty == LLT::fixed_vector(4, 16)) {
1898     Opc = ImmVal ? AArch64::SHLv4i16_shift : AArch64::USHLv4i16;
1899   } else if (Ty == LLT::fixed_vector(8, 16)) {
1900     Opc = ImmVal ? AArch64::SHLv8i16_shift : AArch64::USHLv8i16;
1901   } else if (Ty == LLT::fixed_vector(16, 8)) {
1902     Opc = ImmVal ? AArch64::SHLv16i8_shift : AArch64::USHLv16i8;
1903   } else if (Ty == LLT::fixed_vector(8, 8)) {
1904     Opc = ImmVal ? AArch64::SHLv8i8_shift : AArch64::USHLv8i8;
1905   } else {
1906     LLVM_DEBUG(dbgs() << "Unhandled G_SHL type");
1907     return false;
1908   }
1909 
1910   auto Shl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg});
1911   if (ImmVal)
1912     Shl.addImm(*ImmVal);
1913   else
1914     Shl.addUse(Src2Reg);
1915   constrainSelectedInstRegOperands(*Shl, TII, TRI, RBI);
1916   I.eraseFromParent();
1917   return true;
1918 }
1919 
1920 bool AArch64InstructionSelector::selectVectorAshrLshr(
1921     MachineInstr &I, MachineRegisterInfo &MRI) {
1922   assert(I.getOpcode() == TargetOpcode::G_ASHR ||
1923          I.getOpcode() == TargetOpcode::G_LSHR);
1924   Register DstReg = I.getOperand(0).getReg();
1925   const LLT Ty = MRI.getType(DstReg);
1926   Register Src1Reg = I.getOperand(1).getReg();
1927   Register Src2Reg = I.getOperand(2).getReg();
1928 
1929   if (!Ty.isVector())
1930     return false;
1931 
1932   bool IsASHR = I.getOpcode() == TargetOpcode::G_ASHR;
1933 
1934   // We expect the immediate case to be lowered in the PostLegalCombiner to
1935   // AArch64ISD::VASHR or AArch64ISD::VLSHR equivalents.
1936 
1937   // There is not a shift right register instruction, but the shift left
1938   // register instruction takes a signed value, where negative numbers specify a
1939   // right shift.
1940 
1941   unsigned Opc = 0;
1942   unsigned NegOpc = 0;
1943   const TargetRegisterClass *RC =
1944       getRegClassForTypeOnBank(Ty, RBI.getRegBank(AArch64::FPRRegBankID));
1945   if (Ty == LLT::fixed_vector(2, 64)) {
1946     Opc = IsASHR ? AArch64::SSHLv2i64 : AArch64::USHLv2i64;
1947     NegOpc = AArch64::NEGv2i64;
1948   } else if (Ty == LLT::fixed_vector(4, 32)) {
1949     Opc = IsASHR ? AArch64::SSHLv4i32 : AArch64::USHLv4i32;
1950     NegOpc = AArch64::NEGv4i32;
1951   } else if (Ty == LLT::fixed_vector(2, 32)) {
1952     Opc = IsASHR ? AArch64::SSHLv2i32 : AArch64::USHLv2i32;
1953     NegOpc = AArch64::NEGv2i32;
1954   } else if (Ty == LLT::fixed_vector(4, 16)) {
1955     Opc = IsASHR ? AArch64::SSHLv4i16 : AArch64::USHLv4i16;
1956     NegOpc = AArch64::NEGv4i16;
1957   } else if (Ty == LLT::fixed_vector(8, 16)) {
1958     Opc = IsASHR ? AArch64::SSHLv8i16 : AArch64::USHLv8i16;
1959     NegOpc = AArch64::NEGv8i16;
1960   } else if (Ty == LLT::fixed_vector(16, 8)) {
1961     Opc = IsASHR ? AArch64::SSHLv16i8 : AArch64::USHLv16i8;
1962     NegOpc = AArch64::NEGv16i8;
1963   } else if (Ty == LLT::fixed_vector(8, 8)) {
1964     Opc = IsASHR ? AArch64::SSHLv8i8 : AArch64::USHLv8i8;
1965     NegOpc = AArch64::NEGv8i8;
1966   } else {
1967     LLVM_DEBUG(dbgs() << "Unhandled G_ASHR type");
1968     return false;
1969   }
1970 
1971   auto Neg = MIB.buildInstr(NegOpc, {RC}, {Src2Reg});
1972   constrainSelectedInstRegOperands(*Neg, TII, TRI, RBI);
1973   auto SShl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg, Neg});
1974   constrainSelectedInstRegOperands(*SShl, TII, TRI, RBI);
1975   I.eraseFromParent();
1976   return true;
1977 }
1978 
1979 bool AArch64InstructionSelector::selectVaStartAAPCS(
1980     MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
1981   return false;
1982 }
1983 
1984 bool AArch64InstructionSelector::selectVaStartDarwin(
1985     MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
1986   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
1987   Register ListReg = I.getOperand(0).getReg();
1988 
1989   Register ArgsAddrReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
1990 
1991   int FrameIdx = FuncInfo->getVarArgsStackIndex();
1992   if (MF.getSubtarget<AArch64Subtarget>().isCallingConvWin64(
1993           MF.getFunction().getCallingConv())) {
1994     FrameIdx = FuncInfo->getVarArgsGPRSize() > 0
1995                    ? FuncInfo->getVarArgsGPRIndex()
1996                    : FuncInfo->getVarArgsStackIndex();
1997   }
1998 
1999   auto MIB =
2000       BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::ADDXri))
2001           .addDef(ArgsAddrReg)
2002           .addFrameIndex(FrameIdx)
2003           .addImm(0)
2004           .addImm(0);
2005 
2006   constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2007 
2008   MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::STRXui))
2009             .addUse(ArgsAddrReg)
2010             .addUse(ListReg)
2011             .addImm(0)
2012             .addMemOperand(*I.memoperands_begin());
2013 
2014   constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2015   I.eraseFromParent();
2016   return true;
2017 }
2018 
2019 void AArch64InstructionSelector::materializeLargeCMVal(
2020     MachineInstr &I, const Value *V, unsigned OpFlags) {
2021   MachineBasicBlock &MBB = *I.getParent();
2022   MachineFunction &MF = *MBB.getParent();
2023   MachineRegisterInfo &MRI = MF.getRegInfo();
2024 
2025   auto MovZ = MIB.buildInstr(AArch64::MOVZXi, {&AArch64::GPR64RegClass}, {});
2026   MovZ->addOperand(MF, I.getOperand(1));
2027   MovZ->getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_G0 |
2028                                      AArch64II::MO_NC);
2029   MovZ->addOperand(MF, MachineOperand::CreateImm(0));
2030   constrainSelectedInstRegOperands(*MovZ, TII, TRI, RBI);
2031 
2032   auto BuildMovK = [&](Register SrcReg, unsigned char Flags, unsigned Offset,
2033                        Register ForceDstReg) {
2034     Register DstReg = ForceDstReg
2035                           ? ForceDstReg
2036                           : MRI.createVirtualRegister(&AArch64::GPR64RegClass);
2037     auto MovI = MIB.buildInstr(AArch64::MOVKXi).addDef(DstReg).addUse(SrcReg);
2038     if (auto *GV = dyn_cast<GlobalValue>(V)) {
2039       MovI->addOperand(MF, MachineOperand::CreateGA(
2040                                GV, MovZ->getOperand(1).getOffset(), Flags));
2041     } else {
2042       MovI->addOperand(
2043           MF, MachineOperand::CreateBA(cast<BlockAddress>(V),
2044                                        MovZ->getOperand(1).getOffset(), Flags));
2045     }
2046     MovI->addOperand(MF, MachineOperand::CreateImm(Offset));
2047     constrainSelectedInstRegOperands(*MovI, TII, TRI, RBI);
2048     return DstReg;
2049   };
2050   Register DstReg = BuildMovK(MovZ.getReg(0),
2051                               AArch64II::MO_G1 | AArch64II::MO_NC, 16, 0);
2052   DstReg = BuildMovK(DstReg, AArch64II::MO_G2 | AArch64II::MO_NC, 32, 0);
2053   BuildMovK(DstReg, AArch64II::MO_G3, 48, I.getOperand(0).getReg());
2054 }
2055 
2056 bool AArch64InstructionSelector::preISelLower(MachineInstr &I) {
2057   MachineBasicBlock &MBB = *I.getParent();
2058   MachineFunction &MF = *MBB.getParent();
2059   MachineRegisterInfo &MRI = MF.getRegInfo();
2060 
2061   switch (I.getOpcode()) {
2062   case TargetOpcode::G_STORE: {
2063     bool Changed = contractCrossBankCopyIntoStore(I, MRI);
2064     MachineOperand &SrcOp = I.getOperand(0);
2065     if (MRI.getType(SrcOp.getReg()).isPointer()) {
2066       // Allow matching with imported patterns for stores of pointers. Unlike
2067       // G_LOAD/G_PTR_ADD, we may not have selected all users. So, emit a copy
2068       // and constrain.
2069       auto Copy = MIB.buildCopy(LLT::scalar(64), SrcOp);
2070       Register NewSrc = Copy.getReg(0);
2071       SrcOp.setReg(NewSrc);
2072       RBI.constrainGenericRegister(NewSrc, AArch64::GPR64RegClass, MRI);
2073       Changed = true;
2074     }
2075     return Changed;
2076   }
2077   case TargetOpcode::G_PTR_ADD:
2078     return convertPtrAddToAdd(I, MRI);
2079   case TargetOpcode::G_LOAD: {
2080     // For scalar loads of pointers, we try to convert the dest type from p0
2081     // to s64 so that our imported patterns can match. Like with the G_PTR_ADD
2082     // conversion, this should be ok because all users should have been
2083     // selected already, so the type doesn't matter for them.
2084     Register DstReg = I.getOperand(0).getReg();
2085     const LLT DstTy = MRI.getType(DstReg);
2086     if (!DstTy.isPointer())
2087       return false;
2088     MRI.setType(DstReg, LLT::scalar(64));
2089     return true;
2090   }
2091   case AArch64::G_DUP: {
2092     // Convert the type from p0 to s64 to help selection.
2093     LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2094     if (!DstTy.getElementType().isPointer())
2095       return false;
2096     auto NewSrc = MIB.buildCopy(LLT::scalar(64), I.getOperand(1).getReg());
2097     MRI.setType(I.getOperand(0).getReg(),
2098                 DstTy.changeElementType(LLT::scalar(64)));
2099     MRI.setRegClass(NewSrc.getReg(0), &AArch64::GPR64RegClass);
2100     I.getOperand(1).setReg(NewSrc.getReg(0));
2101     return true;
2102   }
2103   case TargetOpcode::G_UITOFP:
2104   case TargetOpcode::G_SITOFP: {
2105     // If both source and destination regbanks are FPR, then convert the opcode
2106     // to G_SITOF so that the importer can select it to an fpr variant.
2107     // Otherwise, it ends up matching an fpr/gpr variant and adding a cross-bank
2108     // copy.
2109     Register SrcReg = I.getOperand(1).getReg();
2110     LLT SrcTy = MRI.getType(SrcReg);
2111     LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2112     if (SrcTy.isVector() || SrcTy.getSizeInBits() != DstTy.getSizeInBits())
2113       return false;
2114 
2115     if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::FPRRegBankID) {
2116       if (I.getOpcode() == TargetOpcode::G_SITOFP)
2117         I.setDesc(TII.get(AArch64::G_SITOF));
2118       else
2119         I.setDesc(TII.get(AArch64::G_UITOF));
2120       return true;
2121     }
2122     return false;
2123   }
2124   default:
2125     return false;
2126   }
2127 }
2128 
2129 /// This lowering tries to look for G_PTR_ADD instructions and then converts
2130 /// them to a standard G_ADD with a COPY on the source.
2131 ///
2132 /// The motivation behind this is to expose the add semantics to the imported
2133 /// tablegen patterns. We shouldn't need to check for uses being loads/stores,
2134 /// because the selector works bottom up, uses before defs. By the time we
2135 /// end up trying to select a G_PTR_ADD, we should have already attempted to
2136 /// fold this into addressing modes and were therefore unsuccessful.
2137 bool AArch64InstructionSelector::convertPtrAddToAdd(
2138     MachineInstr &I, MachineRegisterInfo &MRI) {
2139   assert(I.getOpcode() == TargetOpcode::G_PTR_ADD && "Expected G_PTR_ADD");
2140   Register DstReg = I.getOperand(0).getReg();
2141   Register AddOp1Reg = I.getOperand(1).getReg();
2142   const LLT PtrTy = MRI.getType(DstReg);
2143   if (PtrTy.getAddressSpace() != 0)
2144     return false;
2145 
2146   const LLT CastPtrTy =
2147       PtrTy.isVector() ? LLT::fixed_vector(2, 64) : LLT::scalar(64);
2148   auto PtrToInt = MIB.buildPtrToInt(CastPtrTy, AddOp1Reg);
2149   // Set regbanks on the registers.
2150   if (PtrTy.isVector())
2151     MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::FPRRegBankID));
2152   else
2153     MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID));
2154 
2155   // Now turn the %dst(p0) = G_PTR_ADD %base, off into:
2156   // %dst(intty) = G_ADD %intbase, off
2157   I.setDesc(TII.get(TargetOpcode::G_ADD));
2158   MRI.setType(DstReg, CastPtrTy);
2159   I.getOperand(1).setReg(PtrToInt.getReg(0));
2160   if (!select(*PtrToInt)) {
2161     LLVM_DEBUG(dbgs() << "Failed to select G_PTRTOINT in convertPtrAddToAdd");
2162     return false;
2163   }
2164 
2165   // Also take the opportunity here to try to do some optimization.
2166   // Try to convert this into a G_SUB if the offset is a 0-x negate idiom.
2167   Register NegatedReg;
2168   if (!mi_match(I.getOperand(2).getReg(), MRI, m_Neg(m_Reg(NegatedReg))))
2169     return true;
2170   I.getOperand(2).setReg(NegatedReg);
2171   I.setDesc(TII.get(TargetOpcode::G_SUB));
2172   return true;
2173 }
2174 
2175 bool AArch64InstructionSelector::earlySelectSHL(MachineInstr &I,
2176                                                 MachineRegisterInfo &MRI) {
2177   // We try to match the immediate variant of LSL, which is actually an alias
2178   // for a special case of UBFM. Otherwise, we fall back to the imported
2179   // selector which will match the register variant.
2180   assert(I.getOpcode() == TargetOpcode::G_SHL && "unexpected op");
2181   const auto &MO = I.getOperand(2);
2182   auto VRegAndVal = getIConstantVRegVal(MO.getReg(), MRI);
2183   if (!VRegAndVal)
2184     return false;
2185 
2186   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2187   if (DstTy.isVector())
2188     return false;
2189   bool Is64Bit = DstTy.getSizeInBits() == 64;
2190   auto Imm1Fn = Is64Bit ? selectShiftA_64(MO) : selectShiftA_32(MO);
2191   auto Imm2Fn = Is64Bit ? selectShiftB_64(MO) : selectShiftB_32(MO);
2192 
2193   if (!Imm1Fn || !Imm2Fn)
2194     return false;
2195 
2196   auto NewI =
2197       MIB.buildInstr(Is64Bit ? AArch64::UBFMXri : AArch64::UBFMWri,
2198                      {I.getOperand(0).getReg()}, {I.getOperand(1).getReg()});
2199 
2200   for (auto &RenderFn : *Imm1Fn)
2201     RenderFn(NewI);
2202   for (auto &RenderFn : *Imm2Fn)
2203     RenderFn(NewI);
2204 
2205   I.eraseFromParent();
2206   return constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI);
2207 }
2208 
2209 bool AArch64InstructionSelector::contractCrossBankCopyIntoStore(
2210     MachineInstr &I, MachineRegisterInfo &MRI) {
2211   assert(I.getOpcode() == TargetOpcode::G_STORE && "Expected G_STORE");
2212   // If we're storing a scalar, it doesn't matter what register bank that
2213   // scalar is on. All that matters is the size.
2214   //
2215   // So, if we see something like this (with a 32-bit scalar as an example):
2216   //
2217   // %x:gpr(s32) = ... something ...
2218   // %y:fpr(s32) = COPY %x:gpr(s32)
2219   // G_STORE %y:fpr(s32)
2220   //
2221   // We can fix this up into something like this:
2222   //
2223   // G_STORE %x:gpr(s32)
2224   //
2225   // And then continue the selection process normally.
2226   Register DefDstReg = getSrcRegIgnoringCopies(I.getOperand(0).getReg(), MRI);
2227   if (!DefDstReg.isValid())
2228     return false;
2229   LLT DefDstTy = MRI.getType(DefDstReg);
2230   Register StoreSrcReg = I.getOperand(0).getReg();
2231   LLT StoreSrcTy = MRI.getType(StoreSrcReg);
2232 
2233   // If we get something strange like a physical register, then we shouldn't
2234   // go any further.
2235   if (!DefDstTy.isValid())
2236     return false;
2237 
2238   // Are the source and dst types the same size?
2239   if (DefDstTy.getSizeInBits() != StoreSrcTy.getSizeInBits())
2240     return false;
2241 
2242   if (RBI.getRegBank(StoreSrcReg, MRI, TRI) ==
2243       RBI.getRegBank(DefDstReg, MRI, TRI))
2244     return false;
2245 
2246   // We have a cross-bank copy, which is entering a store. Let's fold it.
2247   I.getOperand(0).setReg(DefDstReg);
2248   return true;
2249 }
2250 
2251 bool AArch64InstructionSelector::earlySelect(MachineInstr &I) {
2252   assert(I.getParent() && "Instruction should be in a basic block!");
2253   assert(I.getParent()->getParent() && "Instruction should be in a function!");
2254 
2255   MachineBasicBlock &MBB = *I.getParent();
2256   MachineFunction &MF = *MBB.getParent();
2257   MachineRegisterInfo &MRI = MF.getRegInfo();
2258 
2259   switch (I.getOpcode()) {
2260   case AArch64::G_DUP: {
2261     // Before selecting a DUP instruction, check if it is better selected as a
2262     // MOV or load from a constant pool.
2263     Register Src = I.getOperand(1).getReg();
2264     auto ValAndVReg = getAnyConstantVRegValWithLookThrough(Src, MRI);
2265     if (!ValAndVReg)
2266       return false;
2267     LLVMContext &Ctx = MF.getFunction().getContext();
2268     Register Dst = I.getOperand(0).getReg();
2269     auto *CV = ConstantDataVector::getSplat(
2270         MRI.getType(Dst).getNumElements(),
2271         ConstantInt::get(Type::getIntNTy(Ctx, MRI.getType(Src).getSizeInBits()),
2272                          ValAndVReg->Value));
2273     if (!emitConstantVector(Dst, CV, MIB, MRI))
2274       return false;
2275     I.eraseFromParent();
2276     return true;
2277   }
2278   case TargetOpcode::G_SEXT:
2279     // Check for i64 sext(i32 vector_extract) prior to tablegen to select SMOV
2280     // over a normal extend.
2281     if (selectUSMovFromExtend(I, MRI))
2282       return true;
2283     return false;
2284   case TargetOpcode::G_BR:
2285     return false;
2286   case TargetOpcode::G_SHL:
2287     return earlySelectSHL(I, MRI);
2288   case TargetOpcode::G_CONSTANT: {
2289     bool IsZero = false;
2290     if (I.getOperand(1).isCImm())
2291       IsZero = I.getOperand(1).getCImm()->isZero();
2292     else if (I.getOperand(1).isImm())
2293       IsZero = I.getOperand(1).getImm() == 0;
2294 
2295     if (!IsZero)
2296       return false;
2297 
2298     Register DefReg = I.getOperand(0).getReg();
2299     LLT Ty = MRI.getType(DefReg);
2300     if (Ty.getSizeInBits() == 64) {
2301       I.getOperand(1).ChangeToRegister(AArch64::XZR, false);
2302       RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI);
2303     } else if (Ty.getSizeInBits() == 32) {
2304       I.getOperand(1).ChangeToRegister(AArch64::WZR, false);
2305       RBI.constrainGenericRegister(DefReg, AArch64::GPR32RegClass, MRI);
2306     } else
2307       return false;
2308 
2309     I.setDesc(TII.get(TargetOpcode::COPY));
2310     return true;
2311   }
2312 
2313   case TargetOpcode::G_ADD: {
2314     // Check if this is being fed by a G_ICMP on either side.
2315     //
2316     // (cmp pred, x, y) + z
2317     //
2318     // In the above case, when the cmp is true, we increment z by 1. So, we can
2319     // fold the add into the cset for the cmp by using cinc.
2320     //
2321     // FIXME: This would probably be a lot nicer in PostLegalizerLowering.
2322     Register AddDst = I.getOperand(0).getReg();
2323     Register AddLHS = I.getOperand(1).getReg();
2324     Register AddRHS = I.getOperand(2).getReg();
2325     // Only handle scalars.
2326     LLT Ty = MRI.getType(AddLHS);
2327     if (Ty.isVector())
2328       return false;
2329     // Since G_ICMP is modeled as ADDS/SUBS/ANDS, we can handle 32 bits or 64
2330     // bits.
2331     unsigned Size = Ty.getSizeInBits();
2332     if (Size != 32 && Size != 64)
2333       return false;
2334     auto MatchCmp = [&](Register Reg) -> MachineInstr * {
2335       if (!MRI.hasOneNonDBGUse(Reg))
2336         return nullptr;
2337       // If the LHS of the add is 32 bits, then we want to fold a 32-bit
2338       // compare.
2339       if (Size == 32)
2340         return getOpcodeDef(TargetOpcode::G_ICMP, Reg, MRI);
2341       // We model scalar compares using 32-bit destinations right now.
2342       // If it's a 64-bit compare, it'll have 64-bit sources.
2343       Register ZExt;
2344       if (!mi_match(Reg, MRI,
2345                     m_OneNonDBGUse(m_GZExt(m_OneNonDBGUse(m_Reg(ZExt))))))
2346         return nullptr;
2347       auto *Cmp = getOpcodeDef(TargetOpcode::G_ICMP, ZExt, MRI);
2348       if (!Cmp ||
2349           MRI.getType(Cmp->getOperand(2).getReg()).getSizeInBits() != 64)
2350         return nullptr;
2351       return Cmp;
2352     };
2353     // Try to match
2354     // z + (cmp pred, x, y)
2355     MachineInstr *Cmp = MatchCmp(AddRHS);
2356     if (!Cmp) {
2357       // (cmp pred, x, y) + z
2358       std::swap(AddLHS, AddRHS);
2359       Cmp = MatchCmp(AddRHS);
2360       if (!Cmp)
2361         return false;
2362     }
2363     auto &PredOp = Cmp->getOperand(1);
2364     auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate());
2365     const AArch64CC::CondCode InvCC =
2366         changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred));
2367     MIB.setInstrAndDebugLoc(I);
2368     emitIntegerCompare(/*LHS=*/Cmp->getOperand(2),
2369                        /*RHS=*/Cmp->getOperand(3), PredOp, MIB);
2370     emitCSINC(/*Dst=*/AddDst, /*Src =*/AddLHS, /*Src2=*/AddLHS, InvCC, MIB);
2371     I.eraseFromParent();
2372     return true;
2373   }
2374   case TargetOpcode::G_OR: {
2375     // Look for operations that take the lower `Width=Size-ShiftImm` bits of
2376     // `ShiftSrc` and insert them into the upper `Width` bits of `MaskSrc` via
2377     // shifting and masking that we can replace with a BFI (encoded as a BFM).
2378     Register Dst = I.getOperand(0).getReg();
2379     LLT Ty = MRI.getType(Dst);
2380 
2381     if (!Ty.isScalar())
2382       return false;
2383 
2384     unsigned Size = Ty.getSizeInBits();
2385     if (Size != 32 && Size != 64)
2386       return false;
2387 
2388     Register ShiftSrc;
2389     int64_t ShiftImm;
2390     Register MaskSrc;
2391     int64_t MaskImm;
2392     if (!mi_match(
2393             Dst, MRI,
2394             m_GOr(m_OneNonDBGUse(m_GShl(m_Reg(ShiftSrc), m_ICst(ShiftImm))),
2395                   m_OneNonDBGUse(m_GAnd(m_Reg(MaskSrc), m_ICst(MaskImm))))))
2396       return false;
2397 
2398     if (ShiftImm > Size || ((1ULL << ShiftImm) - 1ULL) != uint64_t(MaskImm))
2399       return false;
2400 
2401     int64_t Immr = Size - ShiftImm;
2402     int64_t Imms = Size - ShiftImm - 1;
2403     unsigned Opc = Size == 32 ? AArch64::BFMWri : AArch64::BFMXri;
2404     emitInstr(Opc, {Dst}, {MaskSrc, ShiftSrc, Immr, Imms}, MIB);
2405     I.eraseFromParent();
2406     return true;
2407   }
2408   case TargetOpcode::G_FENCE: {
2409     if (I.getOperand(1).getImm() == 0)
2410       BuildMI(MBB, I, MIMetadata(I), TII.get(TargetOpcode::MEMBARRIER));
2411     else
2412       BuildMI(MBB, I, MIMetadata(I), TII.get(AArch64::DMB))
2413           .addImm(I.getOperand(0).getImm() == 4 ? 0x9 : 0xb);
2414     I.eraseFromParent();
2415     return true;
2416   }
2417   default:
2418     return false;
2419   }
2420 }
2421 
2422 bool AArch64InstructionSelector::select(MachineInstr &I) {
2423   assert(I.getParent() && "Instruction should be in a basic block!");
2424   assert(I.getParent()->getParent() && "Instruction should be in a function!");
2425 
2426   MachineBasicBlock &MBB = *I.getParent();
2427   MachineFunction &MF = *MBB.getParent();
2428   MachineRegisterInfo &MRI = MF.getRegInfo();
2429 
2430   const AArch64Subtarget *Subtarget = &MF.getSubtarget<AArch64Subtarget>();
2431   if (Subtarget->requiresStrictAlign()) {
2432     // We don't support this feature yet.
2433     LLVM_DEBUG(dbgs() << "AArch64 GISel does not support strict-align yet\n");
2434     return false;
2435   }
2436 
2437   MIB.setInstrAndDebugLoc(I);
2438 
2439   unsigned Opcode = I.getOpcode();
2440   // G_PHI requires same handling as PHI
2441   if (!I.isPreISelOpcode() || Opcode == TargetOpcode::G_PHI) {
2442     // Certain non-generic instructions also need some special handling.
2443 
2444     if (Opcode ==  TargetOpcode::LOAD_STACK_GUARD)
2445       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2446 
2447     if (Opcode == TargetOpcode::PHI || Opcode == TargetOpcode::G_PHI) {
2448       const Register DefReg = I.getOperand(0).getReg();
2449       const LLT DefTy = MRI.getType(DefReg);
2450 
2451       const RegClassOrRegBank &RegClassOrBank =
2452         MRI.getRegClassOrRegBank(DefReg);
2453 
2454       const TargetRegisterClass *DefRC
2455         = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
2456       if (!DefRC) {
2457         if (!DefTy.isValid()) {
2458           LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
2459           return false;
2460         }
2461         const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
2462         DefRC = getRegClassForTypeOnBank(DefTy, RB);
2463         if (!DefRC) {
2464           LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
2465           return false;
2466         }
2467       }
2468 
2469       I.setDesc(TII.get(TargetOpcode::PHI));
2470 
2471       return RBI.constrainGenericRegister(DefReg, *DefRC, MRI);
2472     }
2473 
2474     if (I.isCopy())
2475       return selectCopy(I, TII, MRI, TRI, RBI);
2476 
2477     if (I.isDebugInstr())
2478       return selectDebugInstr(I, MRI, RBI);
2479 
2480     return true;
2481   }
2482 
2483 
2484   if (I.getNumOperands() != I.getNumExplicitOperands()) {
2485     LLVM_DEBUG(
2486         dbgs() << "Generic instruction has unexpected implicit operands\n");
2487     return false;
2488   }
2489 
2490   // Try to do some lowering before we start instruction selecting. These
2491   // lowerings are purely transformations on the input G_MIR and so selection
2492   // must continue after any modification of the instruction.
2493   if (preISelLower(I)) {
2494     Opcode = I.getOpcode(); // The opcode may have been modified, refresh it.
2495   }
2496 
2497   // There may be patterns where the importer can't deal with them optimally,
2498   // but does select it to a suboptimal sequence so our custom C++ selection
2499   // code later never has a chance to work on it. Therefore, we have an early
2500   // selection attempt here to give priority to certain selection routines
2501   // over the imported ones.
2502   if (earlySelect(I))
2503     return true;
2504 
2505   if (selectImpl(I, *CoverageInfo))
2506     return true;
2507 
2508   LLT Ty =
2509       I.getOperand(0).isReg() ? MRI.getType(I.getOperand(0).getReg()) : LLT{};
2510 
2511   switch (Opcode) {
2512   case TargetOpcode::G_SBFX:
2513   case TargetOpcode::G_UBFX: {
2514     static const unsigned OpcTable[2][2] = {
2515         {AArch64::UBFMWri, AArch64::UBFMXri},
2516         {AArch64::SBFMWri, AArch64::SBFMXri}};
2517     bool IsSigned = Opcode == TargetOpcode::G_SBFX;
2518     unsigned Size = Ty.getSizeInBits();
2519     unsigned Opc = OpcTable[IsSigned][Size == 64];
2520     auto Cst1 =
2521         getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), MRI);
2522     assert(Cst1 && "Should have gotten a constant for src 1?");
2523     auto Cst2 =
2524         getIConstantVRegValWithLookThrough(I.getOperand(3).getReg(), MRI);
2525     assert(Cst2 && "Should have gotten a constant for src 2?");
2526     auto LSB = Cst1->Value.getZExtValue();
2527     auto Width = Cst2->Value.getZExtValue();
2528     auto BitfieldInst =
2529         MIB.buildInstr(Opc, {I.getOperand(0)}, {I.getOperand(1)})
2530             .addImm(LSB)
2531             .addImm(LSB + Width - 1);
2532     I.eraseFromParent();
2533     return constrainSelectedInstRegOperands(*BitfieldInst, TII, TRI, RBI);
2534   }
2535   case TargetOpcode::G_BRCOND:
2536     return selectCompareBranch(I, MF, MRI);
2537 
2538   case TargetOpcode::G_BRINDIRECT: {
2539     I.setDesc(TII.get(AArch64::BR));
2540     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2541   }
2542 
2543   case TargetOpcode::G_BRJT:
2544     return selectBrJT(I, MRI);
2545 
2546   case AArch64::G_ADD_LOW: {
2547     // This op may have been separated from its ADRP companion by the localizer
2548     // or some other code motion pass. Given that many CPUs will try to
2549     // macro fuse these operations anyway, select this into a MOVaddr pseudo
2550     // which will later be expanded into an ADRP+ADD pair after scheduling.
2551     MachineInstr *BaseMI = MRI.getVRegDef(I.getOperand(1).getReg());
2552     if (BaseMI->getOpcode() != AArch64::ADRP) {
2553       I.setDesc(TII.get(AArch64::ADDXri));
2554       I.addOperand(MachineOperand::CreateImm(0));
2555       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2556     }
2557     assert(TM.getCodeModel() == CodeModel::Small &&
2558            "Expected small code model");
2559     auto Op1 = BaseMI->getOperand(1);
2560     auto Op2 = I.getOperand(2);
2561     auto MovAddr = MIB.buildInstr(AArch64::MOVaddr, {I.getOperand(0)}, {})
2562                        .addGlobalAddress(Op1.getGlobal(), Op1.getOffset(),
2563                                          Op1.getTargetFlags())
2564                        .addGlobalAddress(Op2.getGlobal(), Op2.getOffset(),
2565                                          Op2.getTargetFlags());
2566     I.eraseFromParent();
2567     return constrainSelectedInstRegOperands(*MovAddr, TII, TRI, RBI);
2568   }
2569 
2570   case TargetOpcode::G_BSWAP: {
2571     // Handle vector types for G_BSWAP directly.
2572     Register DstReg = I.getOperand(0).getReg();
2573     LLT DstTy = MRI.getType(DstReg);
2574 
2575     // We should only get vector types here; everything else is handled by the
2576     // importer right now.
2577     if (!DstTy.isVector() || DstTy.getSizeInBits() > 128) {
2578       LLVM_DEBUG(dbgs() << "Dst type for G_BSWAP currently unsupported.\n");
2579       return false;
2580     }
2581 
2582     // Only handle 4 and 2 element vectors for now.
2583     // TODO: 16-bit elements.
2584     unsigned NumElts = DstTy.getNumElements();
2585     if (NumElts != 4 && NumElts != 2) {
2586       LLVM_DEBUG(dbgs() << "Unsupported number of elements for G_BSWAP.\n");
2587       return false;
2588     }
2589 
2590     // Choose the correct opcode for the supported types. Right now, that's
2591     // v2s32, v4s32, and v2s64.
2592     unsigned Opc = 0;
2593     unsigned EltSize = DstTy.getElementType().getSizeInBits();
2594     if (EltSize == 32)
2595       Opc = (DstTy.getNumElements() == 2) ? AArch64::REV32v8i8
2596                                           : AArch64::REV32v16i8;
2597     else if (EltSize == 64)
2598       Opc = AArch64::REV64v16i8;
2599 
2600     // We should always get something by the time we get here...
2601     assert(Opc != 0 && "Didn't get an opcode for G_BSWAP?");
2602 
2603     I.setDesc(TII.get(Opc));
2604     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2605   }
2606 
2607   case TargetOpcode::G_FCONSTANT:
2608   case TargetOpcode::G_CONSTANT: {
2609     const bool isFP = Opcode == TargetOpcode::G_FCONSTANT;
2610 
2611     const LLT s8 = LLT::scalar(8);
2612     const LLT s16 = LLT::scalar(16);
2613     const LLT s32 = LLT::scalar(32);
2614     const LLT s64 = LLT::scalar(64);
2615     const LLT s128 = LLT::scalar(128);
2616     const LLT p0 = LLT::pointer(0, 64);
2617 
2618     const Register DefReg = I.getOperand(0).getReg();
2619     const LLT DefTy = MRI.getType(DefReg);
2620     const unsigned DefSize = DefTy.getSizeInBits();
2621     const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
2622 
2623     // FIXME: Redundant check, but even less readable when factored out.
2624     if (isFP) {
2625       if (Ty != s16 && Ty != s32 && Ty != s64 && Ty != s128) {
2626         LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty
2627                           << " constant, expected: " << s16 << " or " << s32
2628                           << " or " << s64 << " or " << s128 << '\n');
2629         return false;
2630       }
2631 
2632       if (RB.getID() != AArch64::FPRRegBankID) {
2633         LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty
2634                           << " constant on bank: " << RB
2635                           << ", expected: FPR\n");
2636         return false;
2637       }
2638 
2639       // The case when we have 0.0 is covered by tablegen. Reject it here so we
2640       // can be sure tablegen works correctly and isn't rescued by this code.
2641       // 0.0 is not covered by tablegen for FP128. So we will handle this
2642       // scenario in the code here.
2643       if (DefSize != 128 && I.getOperand(1).getFPImm()->isExactlyValue(0.0))
2644         return false;
2645     } else {
2646       // s32 and s64 are covered by tablegen.
2647       if (Ty != p0 && Ty != s8 && Ty != s16) {
2648         LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty
2649                           << " constant, expected: " << s32 << ", " << s64
2650                           << ", or " << p0 << '\n');
2651         return false;
2652       }
2653 
2654       if (RB.getID() != AArch64::GPRRegBankID) {
2655         LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty
2656                           << " constant on bank: " << RB
2657                           << ", expected: GPR\n");
2658         return false;
2659       }
2660     }
2661 
2662     if (isFP) {
2663       const TargetRegisterClass &FPRRC = *getRegClassForTypeOnBank(DefTy, RB);
2664       // For 16, 64, and 128b values, emit a constant pool load.
2665       switch (DefSize) {
2666       default:
2667         llvm_unreachable("Unexpected destination size for G_FCONSTANT?");
2668       case 32:
2669       case 64: {
2670         bool OptForSize = shouldOptForSize(&MF);
2671         const auto &TLI = MF.getSubtarget().getTargetLowering();
2672         // If TLI says that this fpimm is illegal, then we'll expand to a
2673         // constant pool load.
2674         if (TLI->isFPImmLegal(I.getOperand(1).getFPImm()->getValueAPF(),
2675                               EVT::getFloatingPointVT(DefSize), OptForSize))
2676           break;
2677         [[fallthrough]];
2678       }
2679       case 16:
2680       case 128: {
2681         auto *FPImm = I.getOperand(1).getFPImm();
2682         auto *LoadMI = emitLoadFromConstantPool(FPImm, MIB);
2683         if (!LoadMI) {
2684           LLVM_DEBUG(dbgs() << "Failed to load double constant pool entry\n");
2685           return false;
2686         }
2687         MIB.buildCopy({DefReg}, {LoadMI->getOperand(0).getReg()});
2688         I.eraseFromParent();
2689         return RBI.constrainGenericRegister(DefReg, FPRRC, MRI);
2690       }
2691       }
2692 
2693       assert((DefSize == 32 || DefSize == 64) && "Unexpected const def size");
2694       // Either emit a FMOV, or emit a copy to emit a normal mov.
2695       const Register DefGPRReg = MRI.createVirtualRegister(
2696           DefSize == 32 ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass);
2697       MachineOperand &RegOp = I.getOperand(0);
2698       RegOp.setReg(DefGPRReg);
2699       MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator()));
2700       MIB.buildCopy({DefReg}, {DefGPRReg});
2701 
2702       if (!RBI.constrainGenericRegister(DefReg, FPRRC, MRI)) {
2703         LLVM_DEBUG(dbgs() << "Failed to constrain G_FCONSTANT def operand\n");
2704         return false;
2705       }
2706 
2707       MachineOperand &ImmOp = I.getOperand(1);
2708       // FIXME: Is going through int64_t always correct?
2709       ImmOp.ChangeToImmediate(
2710           ImmOp.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
2711     } else if (I.getOperand(1).isCImm()) {
2712       uint64_t Val = I.getOperand(1).getCImm()->getZExtValue();
2713       I.getOperand(1).ChangeToImmediate(Val);
2714     } else if (I.getOperand(1).isImm()) {
2715       uint64_t Val = I.getOperand(1).getImm();
2716       I.getOperand(1).ChangeToImmediate(Val);
2717     }
2718 
2719     const unsigned MovOpc =
2720         DefSize == 64 ? AArch64::MOVi64imm : AArch64::MOVi32imm;
2721     I.setDesc(TII.get(MovOpc));
2722     constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2723     return true;
2724   }
2725   case TargetOpcode::G_EXTRACT: {
2726     Register DstReg = I.getOperand(0).getReg();
2727     Register SrcReg = I.getOperand(1).getReg();
2728     LLT SrcTy = MRI.getType(SrcReg);
2729     LLT DstTy = MRI.getType(DstReg);
2730     (void)DstTy;
2731     unsigned SrcSize = SrcTy.getSizeInBits();
2732 
2733     if (SrcTy.getSizeInBits() > 64) {
2734       // This should be an extract of an s128, which is like a vector extract.
2735       if (SrcTy.getSizeInBits() != 128)
2736         return false;
2737       // Only support extracting 64 bits from an s128 at the moment.
2738       if (DstTy.getSizeInBits() != 64)
2739         return false;
2740 
2741       unsigned Offset = I.getOperand(2).getImm();
2742       if (Offset % 64 != 0)
2743         return false;
2744 
2745       // Check we have the right regbank always.
2746       const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
2747       const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
2748       assert(SrcRB.getID() == DstRB.getID() && "Wrong extract regbank!");
2749 
2750       if (SrcRB.getID() == AArch64::GPRRegBankID) {
2751         auto NewI =
2752             MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
2753                 .addUse(SrcReg, 0,
2754                         Offset == 0 ? AArch64::sube64 : AArch64::subo64);
2755         constrainOperandRegClass(MF, TRI, MRI, TII, RBI, *NewI,
2756                                  AArch64::GPR64RegClass, NewI->getOperand(0));
2757         I.eraseFromParent();
2758         return true;
2759       }
2760 
2761       // Emit the same code as a vector extract.
2762       // Offset must be a multiple of 64.
2763       unsigned LaneIdx = Offset / 64;
2764       MachineInstr *Extract = emitExtractVectorElt(
2765           DstReg, DstRB, LLT::scalar(64), SrcReg, LaneIdx, MIB);
2766       if (!Extract)
2767         return false;
2768       I.eraseFromParent();
2769       return true;
2770     }
2771 
2772     I.setDesc(TII.get(SrcSize == 64 ? AArch64::UBFMXri : AArch64::UBFMWri));
2773     MachineInstrBuilder(MF, I).addImm(I.getOperand(2).getImm() +
2774                                       Ty.getSizeInBits() - 1);
2775 
2776     if (SrcSize < 64) {
2777       assert(SrcSize == 32 && DstTy.getSizeInBits() == 16 &&
2778              "unexpected G_EXTRACT types");
2779       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2780     }
2781 
2782     DstReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
2783     MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator()));
2784     MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {})
2785         .addReg(DstReg, 0, AArch64::sub_32);
2786     RBI.constrainGenericRegister(I.getOperand(0).getReg(),
2787                                  AArch64::GPR32RegClass, MRI);
2788     I.getOperand(0).setReg(DstReg);
2789 
2790     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2791   }
2792 
2793   case TargetOpcode::G_INSERT: {
2794     LLT SrcTy = MRI.getType(I.getOperand(2).getReg());
2795     LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2796     unsigned DstSize = DstTy.getSizeInBits();
2797     // Larger inserts are vectors, same-size ones should be something else by
2798     // now (split up or turned into COPYs).
2799     if (Ty.getSizeInBits() > 64 || SrcTy.getSizeInBits() > 32)
2800       return false;
2801 
2802     I.setDesc(TII.get(DstSize == 64 ? AArch64::BFMXri : AArch64::BFMWri));
2803     unsigned LSB = I.getOperand(3).getImm();
2804     unsigned Width = MRI.getType(I.getOperand(2).getReg()).getSizeInBits();
2805     I.getOperand(3).setImm((DstSize - LSB) % DstSize);
2806     MachineInstrBuilder(MF, I).addImm(Width - 1);
2807 
2808     if (DstSize < 64) {
2809       assert(DstSize == 32 && SrcTy.getSizeInBits() == 16 &&
2810              "unexpected G_INSERT types");
2811       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2812     }
2813 
2814     Register SrcReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
2815     BuildMI(MBB, I.getIterator(), I.getDebugLoc(),
2816             TII.get(AArch64::SUBREG_TO_REG))
2817         .addDef(SrcReg)
2818         .addImm(0)
2819         .addUse(I.getOperand(2).getReg())
2820         .addImm(AArch64::sub_32);
2821     RBI.constrainGenericRegister(I.getOperand(2).getReg(),
2822                                  AArch64::GPR32RegClass, MRI);
2823     I.getOperand(2).setReg(SrcReg);
2824 
2825     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2826   }
2827   case TargetOpcode::G_FRAME_INDEX: {
2828     // allocas and G_FRAME_INDEX are only supported in addrspace(0).
2829     if (Ty != LLT::pointer(0, 64)) {
2830       LLVM_DEBUG(dbgs() << "G_FRAME_INDEX pointer has type: " << Ty
2831                         << ", expected: " << LLT::pointer(0, 64) << '\n');
2832       return false;
2833     }
2834     I.setDesc(TII.get(AArch64::ADDXri));
2835 
2836     // MOs for a #0 shifted immediate.
2837     I.addOperand(MachineOperand::CreateImm(0));
2838     I.addOperand(MachineOperand::CreateImm(0));
2839 
2840     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2841   }
2842 
2843   case TargetOpcode::G_GLOBAL_VALUE: {
2844     auto GV = I.getOperand(1).getGlobal();
2845     if (GV->isThreadLocal())
2846       return selectTLSGlobalValue(I, MRI);
2847 
2848     unsigned OpFlags = STI.ClassifyGlobalReference(GV, TM);
2849     if (OpFlags & AArch64II::MO_GOT) {
2850       I.setDesc(TII.get(AArch64::LOADgot));
2851       I.getOperand(1).setTargetFlags(OpFlags);
2852     } else if (TM.getCodeModel() == CodeModel::Large &&
2853                !TM.isPositionIndependent()) {
2854       // Materialize the global using movz/movk instructions.
2855       materializeLargeCMVal(I, GV, OpFlags);
2856       I.eraseFromParent();
2857       return true;
2858     } else if (TM.getCodeModel() == CodeModel::Tiny) {
2859       I.setDesc(TII.get(AArch64::ADR));
2860       I.getOperand(1).setTargetFlags(OpFlags);
2861     } else {
2862       I.setDesc(TII.get(AArch64::MOVaddr));
2863       I.getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_PAGE);
2864       MachineInstrBuilder MIB(MF, I);
2865       MIB.addGlobalAddress(GV, I.getOperand(1).getOffset(),
2866                            OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
2867     }
2868     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2869   }
2870 
2871   case TargetOpcode::G_ZEXTLOAD:
2872   case TargetOpcode::G_LOAD:
2873   case TargetOpcode::G_STORE: {
2874     GLoadStore &LdSt = cast<GLoadStore>(I);
2875     bool IsZExtLoad = I.getOpcode() == TargetOpcode::G_ZEXTLOAD;
2876     LLT PtrTy = MRI.getType(LdSt.getPointerReg());
2877 
2878     if (PtrTy != LLT::pointer(0, 64)) {
2879       LLVM_DEBUG(dbgs() << "Load/Store pointer has type: " << PtrTy
2880                         << ", expected: " << LLT::pointer(0, 64) << '\n');
2881       return false;
2882     }
2883 
2884     uint64_t MemSizeInBytes = LdSt.getMemSize();
2885     unsigned MemSizeInBits = LdSt.getMemSizeInBits();
2886     AtomicOrdering Order = LdSt.getMMO().getSuccessOrdering();
2887 
2888     // Need special instructions for atomics that affect ordering.
2889     if (Order != AtomicOrdering::NotAtomic &&
2890         Order != AtomicOrdering::Unordered &&
2891         Order != AtomicOrdering::Monotonic) {
2892       assert(!isa<GZExtLoad>(LdSt));
2893       if (MemSizeInBytes > 64)
2894         return false;
2895 
2896       if (isa<GLoad>(LdSt)) {
2897         static constexpr unsigned LDAPROpcodes[] = {
2898             AArch64::LDAPRB, AArch64::LDAPRH, AArch64::LDAPRW, AArch64::LDAPRX};
2899         static constexpr unsigned LDAROpcodes[] = {
2900             AArch64::LDARB, AArch64::LDARH, AArch64::LDARW, AArch64::LDARX};
2901         ArrayRef<unsigned> Opcodes =
2902             STI.hasRCPC() && Order != AtomicOrdering::SequentiallyConsistent
2903                 ? LDAPROpcodes
2904                 : LDAROpcodes;
2905         I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)]));
2906       } else {
2907         static constexpr unsigned Opcodes[] = {AArch64::STLRB, AArch64::STLRH,
2908                                                AArch64::STLRW, AArch64::STLRX};
2909         Register ValReg = LdSt.getReg(0);
2910         if (MRI.getType(ValReg).getSizeInBits() == 64 && MemSizeInBits != 64) {
2911           // Emit a subreg copy of 32 bits.
2912           Register NewVal = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
2913           MIB.buildInstr(TargetOpcode::COPY, {NewVal}, {})
2914               .addReg(I.getOperand(0).getReg(), 0, AArch64::sub_32);
2915           I.getOperand(0).setReg(NewVal);
2916         }
2917         I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)]));
2918       }
2919       constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2920       return true;
2921     }
2922 
2923 #ifndef NDEBUG
2924     const Register PtrReg = LdSt.getPointerReg();
2925     const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI);
2926     // Check that the pointer register is valid.
2927     assert(PtrRB.getID() == AArch64::GPRRegBankID &&
2928            "Load/Store pointer operand isn't a GPR");
2929     assert(MRI.getType(PtrReg).isPointer() &&
2930            "Load/Store pointer operand isn't a pointer");
2931 #endif
2932 
2933     const Register ValReg = LdSt.getReg(0);
2934     const LLT ValTy = MRI.getType(ValReg);
2935     const RegisterBank &RB = *RBI.getRegBank(ValReg, MRI, TRI);
2936 
2937     // The code below doesn't support truncating stores, so we need to split it
2938     // again.
2939     if (isa<GStore>(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) {
2940       unsigned SubReg;
2941       LLT MemTy = LdSt.getMMO().getMemoryType();
2942       auto *RC = getRegClassForTypeOnBank(MemTy, RB);
2943       if (!getSubRegForClass(RC, TRI, SubReg))
2944         return false;
2945 
2946       // Generate a subreg copy.
2947       auto Copy = MIB.buildInstr(TargetOpcode::COPY, {MemTy}, {})
2948                       .addReg(ValReg, 0, SubReg)
2949                       .getReg(0);
2950       RBI.constrainGenericRegister(Copy, *RC, MRI);
2951       LdSt.getOperand(0).setReg(Copy);
2952     } else if (isa<GLoad>(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) {
2953       // If this is an any-extending load from the FPR bank, split it into a regular
2954       // load + extend.
2955       if (RB.getID() == AArch64::FPRRegBankID) {
2956         unsigned SubReg;
2957         LLT MemTy = LdSt.getMMO().getMemoryType();
2958         auto *RC = getRegClassForTypeOnBank(MemTy, RB);
2959         if (!getSubRegForClass(RC, TRI, SubReg))
2960           return false;
2961         Register OldDst = LdSt.getReg(0);
2962         Register NewDst =
2963             MRI.createGenericVirtualRegister(LdSt.getMMO().getMemoryType());
2964         LdSt.getOperand(0).setReg(NewDst);
2965         MRI.setRegBank(NewDst, RB);
2966         // Generate a SUBREG_TO_REG to extend it.
2967         MIB.setInsertPt(MIB.getMBB(), std::next(LdSt.getIterator()));
2968         MIB.buildInstr(AArch64::SUBREG_TO_REG, {OldDst}, {})
2969             .addImm(0)
2970             .addUse(NewDst)
2971             .addImm(SubReg);
2972         auto SubRegRC = getRegClassForTypeOnBank(MRI.getType(OldDst), RB);
2973         RBI.constrainGenericRegister(OldDst, *SubRegRC, MRI);
2974         MIB.setInstr(LdSt);
2975       }
2976     }
2977 
2978     // Helper lambda for partially selecting I. Either returns the original
2979     // instruction with an updated opcode, or a new instruction.
2980     auto SelectLoadStoreAddressingMode = [&]() -> MachineInstr * {
2981       bool IsStore = isa<GStore>(I);
2982       const unsigned NewOpc =
2983           selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemSizeInBits);
2984       if (NewOpc == I.getOpcode())
2985         return nullptr;
2986       // Check if we can fold anything into the addressing mode.
2987       auto AddrModeFns =
2988           selectAddrModeIndexed(I.getOperand(1), MemSizeInBytes);
2989       if (!AddrModeFns) {
2990         // Can't fold anything. Use the original instruction.
2991         I.setDesc(TII.get(NewOpc));
2992         I.addOperand(MachineOperand::CreateImm(0));
2993         return &I;
2994       }
2995 
2996       // Folded something. Create a new instruction and return it.
2997       auto NewInst = MIB.buildInstr(NewOpc, {}, {}, I.getFlags());
2998       Register CurValReg = I.getOperand(0).getReg();
2999       IsStore ? NewInst.addUse(CurValReg) : NewInst.addDef(CurValReg);
3000       NewInst.cloneMemRefs(I);
3001       for (auto &Fn : *AddrModeFns)
3002         Fn(NewInst);
3003       I.eraseFromParent();
3004       return &*NewInst;
3005     };
3006 
3007     MachineInstr *LoadStore = SelectLoadStoreAddressingMode();
3008     if (!LoadStore)
3009       return false;
3010 
3011     // If we're storing a 0, use WZR/XZR.
3012     if (Opcode == TargetOpcode::G_STORE) {
3013       auto CVal = getIConstantVRegValWithLookThrough(
3014           LoadStore->getOperand(0).getReg(), MRI);
3015       if (CVal && CVal->Value == 0) {
3016         switch (LoadStore->getOpcode()) {
3017         case AArch64::STRWui:
3018         case AArch64::STRHHui:
3019         case AArch64::STRBBui:
3020           LoadStore->getOperand(0).setReg(AArch64::WZR);
3021           break;
3022         case AArch64::STRXui:
3023           LoadStore->getOperand(0).setReg(AArch64::XZR);
3024           break;
3025         }
3026       }
3027     }
3028 
3029     if (IsZExtLoad) {
3030       // The zextload from a smaller type to i32 should be handled by the
3031       // importer.
3032       if (MRI.getType(LoadStore->getOperand(0).getReg()).getSizeInBits() != 64)
3033         return false;
3034       // If we have a ZEXTLOAD then change the load's type to be a narrower reg
3035       // and zero_extend with SUBREG_TO_REG.
3036       Register LdReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3037       Register DstReg = LoadStore->getOperand(0).getReg();
3038       LoadStore->getOperand(0).setReg(LdReg);
3039 
3040       MIB.setInsertPt(MIB.getMBB(), std::next(LoadStore->getIterator()));
3041       MIB.buildInstr(AArch64::SUBREG_TO_REG, {DstReg}, {})
3042           .addImm(0)
3043           .addUse(LdReg)
3044           .addImm(AArch64::sub_32);
3045       constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI);
3046       return RBI.constrainGenericRegister(DstReg, AArch64::GPR64allRegClass,
3047                                           MRI);
3048     }
3049     return constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI);
3050   }
3051 
3052   case TargetOpcode::G_INDEXED_ZEXTLOAD:
3053   case TargetOpcode::G_INDEXED_SEXTLOAD:
3054     return selectIndexedExtLoad(I, MRI);
3055   case TargetOpcode::G_INDEXED_LOAD:
3056     return selectIndexedLoad(I, MRI);
3057   case TargetOpcode::G_INDEXED_STORE:
3058     return selectIndexedStore(cast<GIndexedStore>(I), MRI);
3059 
3060   case TargetOpcode::G_SMULH:
3061   case TargetOpcode::G_UMULH: {
3062     // Reject the various things we don't support yet.
3063     if (unsupportedBinOp(I, RBI, MRI, TRI))
3064       return false;
3065 
3066     const Register DefReg = I.getOperand(0).getReg();
3067     const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
3068 
3069     if (RB.getID() != AArch64::GPRRegBankID) {
3070       LLVM_DEBUG(dbgs() << "G_[SU]MULH on bank: " << RB << ", expected: GPR\n");
3071       return false;
3072     }
3073 
3074     if (Ty != LLT::scalar(64)) {
3075       LLVM_DEBUG(dbgs() << "G_[SU]MULH has type: " << Ty
3076                         << ", expected: " << LLT::scalar(64) << '\n');
3077       return false;
3078     }
3079 
3080     unsigned NewOpc = I.getOpcode() == TargetOpcode::G_SMULH ? AArch64::SMULHrr
3081                                                              : AArch64::UMULHrr;
3082     I.setDesc(TII.get(NewOpc));
3083 
3084     // Now that we selected an opcode, we need to constrain the register
3085     // operands to use appropriate classes.
3086     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3087   }
3088   case TargetOpcode::G_LSHR:
3089   case TargetOpcode::G_ASHR:
3090     if (MRI.getType(I.getOperand(0).getReg()).isVector())
3091       return selectVectorAshrLshr(I, MRI);
3092     [[fallthrough]];
3093   case TargetOpcode::G_SHL:
3094     if (Opcode == TargetOpcode::G_SHL &&
3095         MRI.getType(I.getOperand(0).getReg()).isVector())
3096       return selectVectorSHL(I, MRI);
3097 
3098     // These shifts were legalized to have 64 bit shift amounts because we
3099     // want to take advantage of the selection patterns that assume the
3100     // immediates are s64s, however, selectBinaryOp will assume both operands
3101     // will have the same bit size.
3102     {
3103       Register SrcReg = I.getOperand(1).getReg();
3104       Register ShiftReg = I.getOperand(2).getReg();
3105       const LLT ShiftTy = MRI.getType(ShiftReg);
3106       const LLT SrcTy = MRI.getType(SrcReg);
3107       if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 32 &&
3108           ShiftTy.getSizeInBits() == 64) {
3109         assert(!ShiftTy.isVector() && "unexpected vector shift ty");
3110         // Insert a subregister copy to implement a 64->32 trunc
3111         auto Trunc = MIB.buildInstr(TargetOpcode::COPY, {SrcTy}, {})
3112                          .addReg(ShiftReg, 0, AArch64::sub_32);
3113         MRI.setRegBank(Trunc.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID));
3114         I.getOperand(2).setReg(Trunc.getReg(0));
3115       }
3116     }
3117     [[fallthrough]];
3118   case TargetOpcode::G_OR: {
3119     // Reject the various things we don't support yet.
3120     if (unsupportedBinOp(I, RBI, MRI, TRI))
3121       return false;
3122 
3123     const unsigned OpSize = Ty.getSizeInBits();
3124 
3125     const Register DefReg = I.getOperand(0).getReg();
3126     const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
3127 
3128     const unsigned NewOpc = selectBinaryOp(I.getOpcode(), RB.getID(), OpSize);
3129     if (NewOpc == I.getOpcode())
3130       return false;
3131 
3132     I.setDesc(TII.get(NewOpc));
3133     // FIXME: Should the type be always reset in setDesc?
3134 
3135     // Now that we selected an opcode, we need to constrain the register
3136     // operands to use appropriate classes.
3137     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3138   }
3139 
3140   case TargetOpcode::G_PTR_ADD: {
3141     emitADD(I.getOperand(0).getReg(), I.getOperand(1), I.getOperand(2), MIB);
3142     I.eraseFromParent();
3143     return true;
3144   }
3145 
3146   case TargetOpcode::G_SADDE:
3147   case TargetOpcode::G_UADDE:
3148   case TargetOpcode::G_SSUBE:
3149   case TargetOpcode::G_USUBE:
3150   case TargetOpcode::G_SADDO:
3151   case TargetOpcode::G_UADDO:
3152   case TargetOpcode::G_SSUBO:
3153   case TargetOpcode::G_USUBO:
3154     return selectOverflowOp(I, MRI);
3155 
3156   case TargetOpcode::G_PTRMASK: {
3157     Register MaskReg = I.getOperand(2).getReg();
3158     std::optional<int64_t> MaskVal = getIConstantVRegSExtVal(MaskReg, MRI);
3159     // TODO: Implement arbitrary cases
3160     if (!MaskVal || !isShiftedMask_64(*MaskVal))
3161       return false;
3162 
3163     uint64_t Mask = *MaskVal;
3164     I.setDesc(TII.get(AArch64::ANDXri));
3165     I.getOperand(2).ChangeToImmediate(
3166         AArch64_AM::encodeLogicalImmediate(Mask, 64));
3167 
3168     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3169   }
3170   case TargetOpcode::G_PTRTOINT:
3171   case TargetOpcode::G_TRUNC: {
3172     const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
3173     const LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
3174 
3175     const Register DstReg = I.getOperand(0).getReg();
3176     const Register SrcReg = I.getOperand(1).getReg();
3177 
3178     const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
3179     const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
3180 
3181     if (DstRB.getID() != SrcRB.getID()) {
3182       LLVM_DEBUG(
3183           dbgs() << "G_TRUNC/G_PTRTOINT input/output on different banks\n");
3184       return false;
3185     }
3186 
3187     if (DstRB.getID() == AArch64::GPRRegBankID) {
3188       const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(DstTy, DstRB);
3189       if (!DstRC)
3190         return false;
3191 
3192       const TargetRegisterClass *SrcRC = getRegClassForTypeOnBank(SrcTy, SrcRB);
3193       if (!SrcRC)
3194         return false;
3195 
3196       if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
3197           !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
3198         LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC/G_PTRTOINT\n");
3199         return false;
3200       }
3201 
3202       if (DstRC == SrcRC) {
3203         // Nothing to be done
3204       } else if (Opcode == TargetOpcode::G_TRUNC && DstTy == LLT::scalar(32) &&
3205                  SrcTy == LLT::scalar(64)) {
3206         llvm_unreachable("TableGen can import this case");
3207         return false;
3208       } else if (DstRC == &AArch64::GPR32RegClass &&
3209                  SrcRC == &AArch64::GPR64RegClass) {
3210         I.getOperand(1).setSubReg(AArch64::sub_32);
3211       } else {
3212         LLVM_DEBUG(
3213             dbgs() << "Unhandled mismatched classes in G_TRUNC/G_PTRTOINT\n");
3214         return false;
3215       }
3216 
3217       I.setDesc(TII.get(TargetOpcode::COPY));
3218       return true;
3219     } else if (DstRB.getID() == AArch64::FPRRegBankID) {
3220       if (DstTy == LLT::fixed_vector(4, 16) &&
3221           SrcTy == LLT::fixed_vector(4, 32)) {
3222         I.setDesc(TII.get(AArch64::XTNv4i16));
3223         constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3224         return true;
3225       }
3226 
3227       if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128) {
3228         MachineInstr *Extract = emitExtractVectorElt(
3229             DstReg, DstRB, LLT::scalar(DstTy.getSizeInBits()), SrcReg, 0, MIB);
3230         if (!Extract)
3231           return false;
3232         I.eraseFromParent();
3233         return true;
3234       }
3235 
3236       // We might have a vector G_PTRTOINT, in which case just emit a COPY.
3237       if (Opcode == TargetOpcode::G_PTRTOINT) {
3238         assert(DstTy.isVector() && "Expected an FPR ptrtoint to be a vector");
3239         I.setDesc(TII.get(TargetOpcode::COPY));
3240         return selectCopy(I, TII, MRI, TRI, RBI);
3241       }
3242     }
3243 
3244     return false;
3245   }
3246 
3247   case TargetOpcode::G_ANYEXT: {
3248     if (selectUSMovFromExtend(I, MRI))
3249       return true;
3250 
3251     const Register DstReg = I.getOperand(0).getReg();
3252     const Register SrcReg = I.getOperand(1).getReg();
3253 
3254     const RegisterBank &RBDst = *RBI.getRegBank(DstReg, MRI, TRI);
3255     if (RBDst.getID() != AArch64::GPRRegBankID) {
3256       LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBDst
3257                         << ", expected: GPR\n");
3258       return false;
3259     }
3260 
3261     const RegisterBank &RBSrc = *RBI.getRegBank(SrcReg, MRI, TRI);
3262     if (RBSrc.getID() != AArch64::GPRRegBankID) {
3263       LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBSrc
3264                         << ", expected: GPR\n");
3265       return false;
3266     }
3267 
3268     const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
3269 
3270     if (DstSize == 0) {
3271       LLVM_DEBUG(dbgs() << "G_ANYEXT operand has no size, not a gvreg?\n");
3272       return false;
3273     }
3274 
3275     if (DstSize != 64 && DstSize > 32) {
3276       LLVM_DEBUG(dbgs() << "G_ANYEXT to size: " << DstSize
3277                         << ", expected: 32 or 64\n");
3278       return false;
3279     }
3280     // At this point G_ANYEXT is just like a plain COPY, but we need
3281     // to explicitly form the 64-bit value if any.
3282     if (DstSize > 32) {
3283       Register ExtSrc = MRI.createVirtualRegister(&AArch64::GPR64allRegClass);
3284       BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG))
3285           .addDef(ExtSrc)
3286           .addImm(0)
3287           .addUse(SrcReg)
3288           .addImm(AArch64::sub_32);
3289       I.getOperand(1).setReg(ExtSrc);
3290     }
3291     return selectCopy(I, TII, MRI, TRI, RBI);
3292   }
3293 
3294   case TargetOpcode::G_ZEXT:
3295   case TargetOpcode::G_SEXT_INREG:
3296   case TargetOpcode::G_SEXT: {
3297     if (selectUSMovFromExtend(I, MRI))
3298       return true;
3299 
3300     unsigned Opcode = I.getOpcode();
3301     const bool IsSigned = Opcode != TargetOpcode::G_ZEXT;
3302     const Register DefReg = I.getOperand(0).getReg();
3303     Register SrcReg = I.getOperand(1).getReg();
3304     const LLT DstTy = MRI.getType(DefReg);
3305     const LLT SrcTy = MRI.getType(SrcReg);
3306     unsigned DstSize = DstTy.getSizeInBits();
3307     unsigned SrcSize = SrcTy.getSizeInBits();
3308 
3309     // SEXT_INREG has the same src reg size as dst, the size of the value to be
3310     // extended is encoded in the imm.
3311     if (Opcode == TargetOpcode::G_SEXT_INREG)
3312       SrcSize = I.getOperand(2).getImm();
3313 
3314     if (DstTy.isVector())
3315       return false; // Should be handled by imported patterns.
3316 
3317     assert((*RBI.getRegBank(DefReg, MRI, TRI)).getID() ==
3318                AArch64::GPRRegBankID &&
3319            "Unexpected ext regbank");
3320 
3321     MachineInstr *ExtI;
3322 
3323     // First check if we're extending the result of a load which has a dest type
3324     // smaller than 32 bits, then this zext is redundant. GPR32 is the smallest
3325     // GPR register on AArch64 and all loads which are smaller automatically
3326     // zero-extend the upper bits. E.g.
3327     // %v(s8) = G_LOAD %p, :: (load 1)
3328     // %v2(s32) = G_ZEXT %v(s8)
3329     if (!IsSigned) {
3330       auto *LoadMI = getOpcodeDef(TargetOpcode::G_LOAD, SrcReg, MRI);
3331       bool IsGPR =
3332           RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::GPRRegBankID;
3333       if (LoadMI && IsGPR) {
3334         const MachineMemOperand *MemOp = *LoadMI->memoperands_begin();
3335         unsigned BytesLoaded = MemOp->getSize();
3336         if (BytesLoaded < 4 && SrcTy.getSizeInBytes() == BytesLoaded)
3337           return selectCopy(I, TII, MRI, TRI, RBI);
3338       }
3339 
3340       // For the 32-bit -> 64-bit case, we can emit a mov (ORRWrs)
3341       // + SUBREG_TO_REG.
3342       if (IsGPR && SrcSize == 32 && DstSize == 64) {
3343         Register SubregToRegSrc =
3344             MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3345         const Register ZReg = AArch64::WZR;
3346         MIB.buildInstr(AArch64::ORRWrs, {SubregToRegSrc}, {ZReg, SrcReg})
3347             .addImm(0);
3348 
3349         MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {})
3350             .addImm(0)
3351             .addUse(SubregToRegSrc)
3352             .addImm(AArch64::sub_32);
3353 
3354         if (!RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass,
3355                                           MRI)) {
3356           LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT destination\n");
3357           return false;
3358         }
3359 
3360         if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass,
3361                                           MRI)) {
3362           LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT source\n");
3363           return false;
3364         }
3365 
3366         I.eraseFromParent();
3367         return true;
3368       }
3369     }
3370 
3371     if (DstSize == 64) {
3372       if (Opcode != TargetOpcode::G_SEXT_INREG) {
3373         // FIXME: Can we avoid manually doing this?
3374         if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass,
3375                                           MRI)) {
3376           LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode)
3377                             << " operand\n");
3378           return false;
3379         }
3380         SrcReg = MIB.buildInstr(AArch64::SUBREG_TO_REG,
3381                                 {&AArch64::GPR64RegClass}, {})
3382                      .addImm(0)
3383                      .addUse(SrcReg)
3384                      .addImm(AArch64::sub_32)
3385                      .getReg(0);
3386       }
3387 
3388       ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMXri : AArch64::UBFMXri,
3389                              {DefReg}, {SrcReg})
3390                   .addImm(0)
3391                   .addImm(SrcSize - 1);
3392     } else if (DstSize <= 32) {
3393       ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMWri : AArch64::UBFMWri,
3394                              {DefReg}, {SrcReg})
3395                   .addImm(0)
3396                   .addImm(SrcSize - 1);
3397     } else {
3398       return false;
3399     }
3400 
3401     constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
3402     I.eraseFromParent();
3403     return true;
3404   }
3405 
3406   case TargetOpcode::G_SITOFP:
3407   case TargetOpcode::G_UITOFP:
3408   case TargetOpcode::G_FPTOSI:
3409   case TargetOpcode::G_FPTOUI: {
3410     const LLT DstTy = MRI.getType(I.getOperand(0).getReg()),
3411               SrcTy = MRI.getType(I.getOperand(1).getReg());
3412     const unsigned NewOpc = selectFPConvOpc(Opcode, DstTy, SrcTy);
3413     if (NewOpc == Opcode)
3414       return false;
3415 
3416     I.setDesc(TII.get(NewOpc));
3417     constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3418     I.setFlags(MachineInstr::NoFPExcept);
3419 
3420     return true;
3421   }
3422 
3423   case TargetOpcode::G_FREEZE:
3424     return selectCopy(I, TII, MRI, TRI, RBI);
3425 
3426   case TargetOpcode::G_INTTOPTR:
3427     // The importer is currently unable to import pointer types since they
3428     // didn't exist in SelectionDAG.
3429     return selectCopy(I, TII, MRI, TRI, RBI);
3430 
3431   case TargetOpcode::G_BITCAST:
3432     // Imported SelectionDAG rules can handle every bitcast except those that
3433     // bitcast from a type to the same type. Ideally, these shouldn't occur
3434     // but we might not run an optimizer that deletes them. The other exception
3435     // is bitcasts involving pointer types, as SelectionDAG has no knowledge
3436     // of them.
3437     return selectCopy(I, TII, MRI, TRI, RBI);
3438 
3439   case TargetOpcode::G_SELECT: {
3440     auto &Sel = cast<GSelect>(I);
3441     const Register CondReg = Sel.getCondReg();
3442     const Register TReg = Sel.getTrueReg();
3443     const Register FReg = Sel.getFalseReg();
3444 
3445     if (tryOptSelect(Sel))
3446       return true;
3447 
3448     // Make sure to use an unused vreg instead of wzr, so that the peephole
3449     // optimizations will be able to optimize these.
3450     Register DeadVReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3451     auto TstMI = MIB.buildInstr(AArch64::ANDSWri, {DeadVReg}, {CondReg})
3452                      .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
3453     constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
3454     if (!emitSelect(Sel.getReg(0), TReg, FReg, AArch64CC::NE, MIB))
3455       return false;
3456     Sel.eraseFromParent();
3457     return true;
3458   }
3459   case TargetOpcode::G_ICMP: {
3460     if (Ty.isVector())
3461       return selectVectorICmp(I, MRI);
3462 
3463     if (Ty != LLT::scalar(32)) {
3464       LLVM_DEBUG(dbgs() << "G_ICMP result has type: " << Ty
3465                         << ", expected: " << LLT::scalar(32) << '\n');
3466       return false;
3467     }
3468 
3469     auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate());
3470     const AArch64CC::CondCode InvCC =
3471         changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred));
3472     emitIntegerCompare(I.getOperand(2), I.getOperand(3), I.getOperand(1), MIB);
3473     emitCSINC(/*Dst=*/I.getOperand(0).getReg(), /*Src1=*/AArch64::WZR,
3474               /*Src2=*/AArch64::WZR, InvCC, MIB);
3475     I.eraseFromParent();
3476     return true;
3477   }
3478 
3479   case TargetOpcode::G_FCMP: {
3480     CmpInst::Predicate Pred =
3481         static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate());
3482     if (!emitFPCompare(I.getOperand(2).getReg(), I.getOperand(3).getReg(), MIB,
3483                        Pred) ||
3484         !emitCSetForFCmp(I.getOperand(0).getReg(), Pred, MIB))
3485       return false;
3486     I.eraseFromParent();
3487     return true;
3488   }
3489   case TargetOpcode::G_VASTART:
3490     return STI.isTargetDarwin() ? selectVaStartDarwin(I, MF, MRI)
3491                                 : selectVaStartAAPCS(I, MF, MRI);
3492   case TargetOpcode::G_INTRINSIC:
3493     return selectIntrinsic(I, MRI);
3494   case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3495     return selectIntrinsicWithSideEffects(I, MRI);
3496   case TargetOpcode::G_IMPLICIT_DEF: {
3497     I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
3498     const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
3499     const Register DstReg = I.getOperand(0).getReg();
3500     const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
3501     const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(DstTy, DstRB);
3502     RBI.constrainGenericRegister(DstReg, *DstRC, MRI);
3503     return true;
3504   }
3505   case TargetOpcode::G_BLOCK_ADDR: {
3506     if (TM.getCodeModel() == CodeModel::Large && !TM.isPositionIndependent()) {
3507       materializeLargeCMVal(I, I.getOperand(1).getBlockAddress(), 0);
3508       I.eraseFromParent();
3509       return true;
3510     } else {
3511       I.setDesc(TII.get(AArch64::MOVaddrBA));
3512       auto MovMI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::MOVaddrBA),
3513                            I.getOperand(0).getReg())
3514                        .addBlockAddress(I.getOperand(1).getBlockAddress(),
3515                                         /* Offset */ 0, AArch64II::MO_PAGE)
3516                        .addBlockAddress(
3517                            I.getOperand(1).getBlockAddress(), /* Offset */ 0,
3518                            AArch64II::MO_NC | AArch64II::MO_PAGEOFF);
3519       I.eraseFromParent();
3520       return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI);
3521     }
3522   }
3523   case AArch64::G_DUP: {
3524     // When the scalar of G_DUP is an s8/s16 gpr, they can't be selected by
3525     // imported patterns. Do it manually here. Avoiding generating s16 gpr is
3526     // difficult because at RBS we may end up pessimizing the fpr case if we
3527     // decided to add an anyextend to fix this. Manual selection is the most
3528     // robust solution for now.
3529     if (RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() !=
3530         AArch64::GPRRegBankID)
3531       return false; // We expect the fpr regbank case to be imported.
3532     LLT VecTy = MRI.getType(I.getOperand(0).getReg());
3533     if (VecTy == LLT::fixed_vector(8, 8))
3534       I.setDesc(TII.get(AArch64::DUPv8i8gpr));
3535     else if (VecTy == LLT::fixed_vector(16, 8))
3536       I.setDesc(TII.get(AArch64::DUPv16i8gpr));
3537     else if (VecTy == LLT::fixed_vector(4, 16))
3538       I.setDesc(TII.get(AArch64::DUPv4i16gpr));
3539     else if (VecTy == LLT::fixed_vector(8, 16))
3540       I.setDesc(TII.get(AArch64::DUPv8i16gpr));
3541     else
3542       return false;
3543     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3544   }
3545   case TargetOpcode::G_BUILD_VECTOR:
3546     return selectBuildVector(I, MRI);
3547   case TargetOpcode::G_MERGE_VALUES:
3548     return selectMergeValues(I, MRI);
3549   case TargetOpcode::G_UNMERGE_VALUES:
3550     return selectUnmergeValues(I, MRI);
3551   case TargetOpcode::G_SHUFFLE_VECTOR:
3552     return selectShuffleVector(I, MRI);
3553   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3554     return selectExtractElt(I, MRI);
3555   case TargetOpcode::G_INSERT_VECTOR_ELT:
3556     return selectInsertElt(I, MRI);
3557   case TargetOpcode::G_CONCAT_VECTORS:
3558     return selectConcatVectors(I, MRI);
3559   case TargetOpcode::G_JUMP_TABLE:
3560     return selectJumpTable(I, MRI);
3561   case TargetOpcode::G_MEMCPY:
3562   case TargetOpcode::G_MEMCPY_INLINE:
3563   case TargetOpcode::G_MEMMOVE:
3564   case TargetOpcode::G_MEMSET:
3565     assert(STI.hasMOPS() && "Shouldn't get here without +mops feature");
3566     return selectMOPS(I, MRI);
3567   }
3568 
3569   return false;
3570 }
3571 
3572 bool AArch64InstructionSelector::selectAndRestoreState(MachineInstr &I) {
3573   MachineIRBuilderState OldMIBState = MIB.getState();
3574   bool Success = select(I);
3575   MIB.setState(OldMIBState);
3576   return Success;
3577 }
3578 
3579 bool AArch64InstructionSelector::selectMOPS(MachineInstr &GI,
3580                                             MachineRegisterInfo &MRI) {
3581   unsigned Mopcode;
3582   switch (GI.getOpcode()) {
3583   case TargetOpcode::G_MEMCPY:
3584   case TargetOpcode::G_MEMCPY_INLINE:
3585     Mopcode = AArch64::MOPSMemoryCopyPseudo;
3586     break;
3587   case TargetOpcode::G_MEMMOVE:
3588     Mopcode = AArch64::MOPSMemoryMovePseudo;
3589     break;
3590   case TargetOpcode::G_MEMSET:
3591     // For tagged memset see llvm.aarch64.mops.memset.tag
3592     Mopcode = AArch64::MOPSMemorySetPseudo;
3593     break;
3594   }
3595 
3596   auto &DstPtr = GI.getOperand(0);
3597   auto &SrcOrVal = GI.getOperand(1);
3598   auto &Size = GI.getOperand(2);
3599 
3600   // Create copies of the registers that can be clobbered.
3601   const Register DstPtrCopy = MRI.cloneVirtualRegister(DstPtr.getReg());
3602   const Register SrcValCopy = MRI.cloneVirtualRegister(SrcOrVal.getReg());
3603   const Register SizeCopy = MRI.cloneVirtualRegister(Size.getReg());
3604 
3605   const bool IsSet = Mopcode == AArch64::MOPSMemorySetPseudo;
3606   const auto &SrcValRegClass =
3607       IsSet ? AArch64::GPR64RegClass : AArch64::GPR64commonRegClass;
3608 
3609   // Constrain to specific registers
3610   RBI.constrainGenericRegister(DstPtrCopy, AArch64::GPR64commonRegClass, MRI);
3611   RBI.constrainGenericRegister(SrcValCopy, SrcValRegClass, MRI);
3612   RBI.constrainGenericRegister(SizeCopy, AArch64::GPR64RegClass, MRI);
3613 
3614   MIB.buildCopy(DstPtrCopy, DstPtr);
3615   MIB.buildCopy(SrcValCopy, SrcOrVal);
3616   MIB.buildCopy(SizeCopy, Size);
3617 
3618   // New instruction uses the copied registers because it must update them.
3619   // The defs are not used since they don't exist in G_MEM*. They are still
3620   // tied.
3621   // Note: order of operands is different from G_MEMSET, G_MEMCPY, G_MEMMOVE
3622   Register DefDstPtr = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass);
3623   Register DefSize = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3624   if (IsSet) {
3625     MIB.buildInstr(Mopcode, {DefDstPtr, DefSize},
3626                    {DstPtrCopy, SizeCopy, SrcValCopy});
3627   } else {
3628     Register DefSrcPtr = MRI.createVirtualRegister(&SrcValRegClass);
3629     MIB.buildInstr(Mopcode, {DefDstPtr, DefSrcPtr, DefSize},
3630                    {DstPtrCopy, SrcValCopy, SizeCopy});
3631   }
3632 
3633   GI.eraseFromParent();
3634   return true;
3635 }
3636 
3637 bool AArch64InstructionSelector::selectBrJT(MachineInstr &I,
3638                                             MachineRegisterInfo &MRI) {
3639   assert(I.getOpcode() == TargetOpcode::G_BRJT && "Expected G_BRJT");
3640   Register JTAddr = I.getOperand(0).getReg();
3641   unsigned JTI = I.getOperand(1).getIndex();
3642   Register Index = I.getOperand(2).getReg();
3643 
3644   Register TargetReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3645   Register ScratchReg = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
3646 
3647   MF->getInfo<AArch64FunctionInfo>()->setJumpTableEntryInfo(JTI, 4, nullptr);
3648   auto JumpTableInst = MIB.buildInstr(AArch64::JumpTableDest32,
3649                                       {TargetReg, ScratchReg}, {JTAddr, Index})
3650                            .addJumpTableIndex(JTI);
3651   // Save the jump table info.
3652   MIB.buildInstr(TargetOpcode::JUMP_TABLE_DEBUG_INFO, {},
3653                  {static_cast<int64_t>(JTI)});
3654   // Build the indirect branch.
3655   MIB.buildInstr(AArch64::BR, {}, {TargetReg});
3656   I.eraseFromParent();
3657   return constrainSelectedInstRegOperands(*JumpTableInst, TII, TRI, RBI);
3658 }
3659 
3660 bool AArch64InstructionSelector::selectJumpTable(MachineInstr &I,
3661                                                  MachineRegisterInfo &MRI) {
3662   assert(I.getOpcode() == TargetOpcode::G_JUMP_TABLE && "Expected jump table");
3663   assert(I.getOperand(1).isJTI() && "Jump table op should have a JTI!");
3664 
3665   Register DstReg = I.getOperand(0).getReg();
3666   unsigned JTI = I.getOperand(1).getIndex();
3667   // We generate a MOVaddrJT which will get expanded to an ADRP + ADD later.
3668   auto MovMI =
3669     MIB.buildInstr(AArch64::MOVaddrJT, {DstReg}, {})
3670           .addJumpTableIndex(JTI, AArch64II::MO_PAGE)
3671           .addJumpTableIndex(JTI, AArch64II::MO_NC | AArch64II::MO_PAGEOFF);
3672   I.eraseFromParent();
3673   return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI);
3674 }
3675 
3676 bool AArch64InstructionSelector::selectTLSGlobalValue(
3677     MachineInstr &I, MachineRegisterInfo &MRI) {
3678   if (!STI.isTargetMachO())
3679     return false;
3680   MachineFunction &MF = *I.getParent()->getParent();
3681   MF.getFrameInfo().setAdjustsStack(true);
3682 
3683   const auto &GlobalOp = I.getOperand(1);
3684   assert(GlobalOp.getOffset() == 0 &&
3685          "Shouldn't have an offset on TLS globals!");
3686   const GlobalValue &GV = *GlobalOp.getGlobal();
3687 
3688   auto LoadGOT =
3689       MIB.buildInstr(AArch64::LOADgot, {&AArch64::GPR64commonRegClass}, {})
3690           .addGlobalAddress(&GV, 0, AArch64II::MO_TLS);
3691 
3692   auto Load = MIB.buildInstr(AArch64::LDRXui, {&AArch64::GPR64commonRegClass},
3693                              {LoadGOT.getReg(0)})
3694                   .addImm(0);
3695 
3696   MIB.buildCopy(Register(AArch64::X0), LoadGOT.getReg(0));
3697   // TLS calls preserve all registers except those that absolutely must be
3698   // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
3699   // silly).
3700   MIB.buildInstr(getBLRCallOpcode(MF), {}, {Load})
3701       .addUse(AArch64::X0, RegState::Implicit)
3702       .addDef(AArch64::X0, RegState::Implicit)
3703       .addRegMask(TRI.getTLSCallPreservedMask());
3704 
3705   MIB.buildCopy(I.getOperand(0).getReg(), Register(AArch64::X0));
3706   RBI.constrainGenericRegister(I.getOperand(0).getReg(), AArch64::GPR64RegClass,
3707                                MRI);
3708   I.eraseFromParent();
3709   return true;
3710 }
3711 
3712 bool AArch64InstructionSelector::selectVectorICmp(
3713     MachineInstr &I, MachineRegisterInfo &MRI) {
3714   Register DstReg = I.getOperand(0).getReg();
3715   LLT DstTy = MRI.getType(DstReg);
3716   Register SrcReg = I.getOperand(2).getReg();
3717   Register Src2Reg = I.getOperand(3).getReg();
3718   LLT SrcTy = MRI.getType(SrcReg);
3719 
3720   unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits();
3721   unsigned NumElts = DstTy.getNumElements();
3722 
3723   // First index is element size, 0 == 8b, 1 == 16b, 2 == 32b, 3 == 64b
3724   // Second index is num elts, 0 == v2, 1 == v4, 2 == v8, 3 == v16
3725   // Third index is cc opcode:
3726   // 0 == eq
3727   // 1 == ugt
3728   // 2 == uge
3729   // 3 == ult
3730   // 4 == ule
3731   // 5 == sgt
3732   // 6 == sge
3733   // 7 == slt
3734   // 8 == sle
3735   // ne is done by negating 'eq' result.
3736 
3737   // This table below assumes that for some comparisons the operands will be
3738   // commuted.
3739   // ult op == commute + ugt op
3740   // ule op == commute + uge op
3741   // slt op == commute + sgt op
3742   // sle op == commute + sge op
3743   unsigned PredIdx = 0;
3744   bool SwapOperands = false;
3745   CmpInst::Predicate Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
3746   switch (Pred) {
3747   case CmpInst::ICMP_NE:
3748   case CmpInst::ICMP_EQ:
3749     PredIdx = 0;
3750     break;
3751   case CmpInst::ICMP_UGT:
3752     PredIdx = 1;
3753     break;
3754   case CmpInst::ICMP_UGE:
3755     PredIdx = 2;
3756     break;
3757   case CmpInst::ICMP_ULT:
3758     PredIdx = 3;
3759     SwapOperands = true;
3760     break;
3761   case CmpInst::ICMP_ULE:
3762     PredIdx = 4;
3763     SwapOperands = true;
3764     break;
3765   case CmpInst::ICMP_SGT:
3766     PredIdx = 5;
3767     break;
3768   case CmpInst::ICMP_SGE:
3769     PredIdx = 6;
3770     break;
3771   case CmpInst::ICMP_SLT:
3772     PredIdx = 7;
3773     SwapOperands = true;
3774     break;
3775   case CmpInst::ICMP_SLE:
3776     PredIdx = 8;
3777     SwapOperands = true;
3778     break;
3779   default:
3780     llvm_unreachable("Unhandled icmp predicate");
3781     return false;
3782   }
3783 
3784   // This table obviously should be tablegen'd when we have our GISel native
3785   // tablegen selector.
3786 
3787   static const unsigned OpcTable[4][4][9] = {
3788       {
3789           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3790            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3791            0 /* invalid */},
3792           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3793            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3794            0 /* invalid */},
3795           {AArch64::CMEQv8i8, AArch64::CMHIv8i8, AArch64::CMHSv8i8,
3796            AArch64::CMHIv8i8, AArch64::CMHSv8i8, AArch64::CMGTv8i8,
3797            AArch64::CMGEv8i8, AArch64::CMGTv8i8, AArch64::CMGEv8i8},
3798           {AArch64::CMEQv16i8, AArch64::CMHIv16i8, AArch64::CMHSv16i8,
3799            AArch64::CMHIv16i8, AArch64::CMHSv16i8, AArch64::CMGTv16i8,
3800            AArch64::CMGEv16i8, AArch64::CMGTv16i8, AArch64::CMGEv16i8}
3801       },
3802       {
3803           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3804            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3805            0 /* invalid */},
3806           {AArch64::CMEQv4i16, AArch64::CMHIv4i16, AArch64::CMHSv4i16,
3807            AArch64::CMHIv4i16, AArch64::CMHSv4i16, AArch64::CMGTv4i16,
3808            AArch64::CMGEv4i16, AArch64::CMGTv4i16, AArch64::CMGEv4i16},
3809           {AArch64::CMEQv8i16, AArch64::CMHIv8i16, AArch64::CMHSv8i16,
3810            AArch64::CMHIv8i16, AArch64::CMHSv8i16, AArch64::CMGTv8i16,
3811            AArch64::CMGEv8i16, AArch64::CMGTv8i16, AArch64::CMGEv8i16},
3812           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3813            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3814            0 /* invalid */}
3815       },
3816       {
3817           {AArch64::CMEQv2i32, AArch64::CMHIv2i32, AArch64::CMHSv2i32,
3818            AArch64::CMHIv2i32, AArch64::CMHSv2i32, AArch64::CMGTv2i32,
3819            AArch64::CMGEv2i32, AArch64::CMGTv2i32, AArch64::CMGEv2i32},
3820           {AArch64::CMEQv4i32, AArch64::CMHIv4i32, AArch64::CMHSv4i32,
3821            AArch64::CMHIv4i32, AArch64::CMHSv4i32, AArch64::CMGTv4i32,
3822            AArch64::CMGEv4i32, AArch64::CMGTv4i32, AArch64::CMGEv4i32},
3823           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3824            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3825            0 /* invalid */},
3826           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3827            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3828            0 /* invalid */}
3829       },
3830       {
3831           {AArch64::CMEQv2i64, AArch64::CMHIv2i64, AArch64::CMHSv2i64,
3832            AArch64::CMHIv2i64, AArch64::CMHSv2i64, AArch64::CMGTv2i64,
3833            AArch64::CMGEv2i64, AArch64::CMGTv2i64, AArch64::CMGEv2i64},
3834           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3835            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3836            0 /* invalid */},
3837           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3838            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3839            0 /* invalid */},
3840           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3841            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3842            0 /* invalid */}
3843       },
3844   };
3845   unsigned EltIdx = Log2_32(SrcEltSize / 8);
3846   unsigned NumEltsIdx = Log2_32(NumElts / 2);
3847   unsigned Opc = OpcTable[EltIdx][NumEltsIdx][PredIdx];
3848   if (!Opc) {
3849     LLVM_DEBUG(dbgs() << "Could not map G_ICMP to cmp opcode");
3850     return false;
3851   }
3852 
3853   const RegisterBank &VecRB = *RBI.getRegBank(SrcReg, MRI, TRI);
3854   const TargetRegisterClass *SrcRC =
3855       getRegClassForTypeOnBank(SrcTy, VecRB, true);
3856   if (!SrcRC) {
3857     LLVM_DEBUG(dbgs() << "Could not determine source register class.\n");
3858     return false;
3859   }
3860 
3861   unsigned NotOpc = Pred == ICmpInst::ICMP_NE ? AArch64::NOTv8i8 : 0;
3862   if (SrcTy.getSizeInBits() == 128)
3863     NotOpc = NotOpc ? AArch64::NOTv16i8 : 0;
3864 
3865   if (SwapOperands)
3866     std::swap(SrcReg, Src2Reg);
3867 
3868   auto Cmp = MIB.buildInstr(Opc, {SrcRC}, {SrcReg, Src2Reg});
3869   constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);
3870 
3871   // Invert if we had a 'ne' cc.
3872   if (NotOpc) {
3873     Cmp = MIB.buildInstr(NotOpc, {DstReg}, {Cmp});
3874     constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);
3875   } else {
3876     MIB.buildCopy(DstReg, Cmp.getReg(0));
3877   }
3878   RBI.constrainGenericRegister(DstReg, *SrcRC, MRI);
3879   I.eraseFromParent();
3880   return true;
3881 }
3882 
3883 MachineInstr *AArch64InstructionSelector::emitScalarToVector(
3884     unsigned EltSize, const TargetRegisterClass *DstRC, Register Scalar,
3885     MachineIRBuilder &MIRBuilder) const {
3886   auto Undef = MIRBuilder.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstRC}, {});
3887 
3888   auto BuildFn = [&](unsigned SubregIndex) {
3889     auto Ins =
3890         MIRBuilder
3891             .buildInstr(TargetOpcode::INSERT_SUBREG, {DstRC}, {Undef, Scalar})
3892             .addImm(SubregIndex);
3893     constrainSelectedInstRegOperands(*Undef, TII, TRI, RBI);
3894     constrainSelectedInstRegOperands(*Ins, TII, TRI, RBI);
3895     return &*Ins;
3896   };
3897 
3898   switch (EltSize) {
3899   case 8:
3900     return BuildFn(AArch64::bsub);
3901   case 16:
3902     return BuildFn(AArch64::hsub);
3903   case 32:
3904     return BuildFn(AArch64::ssub);
3905   case 64:
3906     return BuildFn(AArch64::dsub);
3907   default:
3908     return nullptr;
3909   }
3910 }
3911 
3912 MachineInstr *
3913 AArch64InstructionSelector::emitNarrowVector(Register DstReg, Register SrcReg,
3914                                              MachineIRBuilder &MIB,
3915                                              MachineRegisterInfo &MRI) const {
3916   LLT DstTy = MRI.getType(DstReg);
3917   const TargetRegisterClass *RC =
3918       getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(SrcReg, MRI, TRI));
3919   if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
3920     LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
3921     return nullptr;
3922   }
3923   unsigned SubReg = 0;
3924   if (!getSubRegForClass(RC, TRI, SubReg))
3925     return nullptr;
3926   if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
3927     LLVM_DEBUG(dbgs() << "Unsupported destination size! ("
3928                       << DstTy.getSizeInBits() << "\n");
3929     return nullptr;
3930   }
3931   auto Copy = MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
3932                   .addReg(SrcReg, 0, SubReg);
3933   RBI.constrainGenericRegister(DstReg, *RC, MRI);
3934   return Copy;
3935 }
3936 
3937 bool AArch64InstructionSelector::selectMergeValues(
3938     MachineInstr &I, MachineRegisterInfo &MRI) {
3939   assert(I.getOpcode() == TargetOpcode::G_MERGE_VALUES && "unexpected opcode");
3940   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
3941   const LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
3942   assert(!DstTy.isVector() && !SrcTy.isVector() && "invalid merge operation");
3943   const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
3944 
3945   if (I.getNumOperands() != 3)
3946     return false;
3947 
3948   // Merging 2 s64s into an s128.
3949   if (DstTy == LLT::scalar(128)) {
3950     if (SrcTy.getSizeInBits() != 64)
3951       return false;
3952     Register DstReg = I.getOperand(0).getReg();
3953     Register Src1Reg = I.getOperand(1).getReg();
3954     Register Src2Reg = I.getOperand(2).getReg();
3955     auto Tmp = MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstTy}, {});
3956     MachineInstr *InsMI = emitLaneInsert(std::nullopt, Tmp.getReg(0), Src1Reg,
3957                                          /* LaneIdx */ 0, RB, MIB);
3958     if (!InsMI)
3959       return false;
3960     MachineInstr *Ins2MI = emitLaneInsert(DstReg, InsMI->getOperand(0).getReg(),
3961                                           Src2Reg, /* LaneIdx */ 1, RB, MIB);
3962     if (!Ins2MI)
3963       return false;
3964     constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI);
3965     constrainSelectedInstRegOperands(*Ins2MI, TII, TRI, RBI);
3966     I.eraseFromParent();
3967     return true;
3968   }
3969 
3970   if (RB.getID() != AArch64::GPRRegBankID)
3971     return false;
3972 
3973   if (DstTy.getSizeInBits() != 64 || SrcTy.getSizeInBits() != 32)
3974     return false;
3975 
3976   auto *DstRC = &AArch64::GPR64RegClass;
3977   Register SubToRegDef = MRI.createVirtualRegister(DstRC);
3978   MachineInstr &SubRegMI = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
3979                                     TII.get(TargetOpcode::SUBREG_TO_REG))
3980                                 .addDef(SubToRegDef)
3981                                 .addImm(0)
3982                                 .addUse(I.getOperand(1).getReg())
3983                                 .addImm(AArch64::sub_32);
3984   Register SubToRegDef2 = MRI.createVirtualRegister(DstRC);
3985   // Need to anyext the second scalar before we can use bfm
3986   MachineInstr &SubRegMI2 = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
3987                                     TII.get(TargetOpcode::SUBREG_TO_REG))
3988                                 .addDef(SubToRegDef2)
3989                                 .addImm(0)
3990                                 .addUse(I.getOperand(2).getReg())
3991                                 .addImm(AArch64::sub_32);
3992   MachineInstr &BFM =
3993       *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::BFMXri))
3994            .addDef(I.getOperand(0).getReg())
3995            .addUse(SubToRegDef)
3996            .addUse(SubToRegDef2)
3997            .addImm(32)
3998            .addImm(31);
3999   constrainSelectedInstRegOperands(SubRegMI, TII, TRI, RBI);
4000   constrainSelectedInstRegOperands(SubRegMI2, TII, TRI, RBI);
4001   constrainSelectedInstRegOperands(BFM, TII, TRI, RBI);
4002   I.eraseFromParent();
4003   return true;
4004 }
4005 
4006 static bool getLaneCopyOpcode(unsigned &CopyOpc, unsigned &ExtractSubReg,
4007                               const unsigned EltSize) {
4008   // Choose a lane copy opcode and subregister based off of the size of the
4009   // vector's elements.
4010   switch (EltSize) {
4011   case 8:
4012     CopyOpc = AArch64::DUPi8;
4013     ExtractSubReg = AArch64::bsub;
4014     break;
4015   case 16:
4016     CopyOpc = AArch64::DUPi16;
4017     ExtractSubReg = AArch64::hsub;
4018     break;
4019   case 32:
4020     CopyOpc = AArch64::DUPi32;
4021     ExtractSubReg = AArch64::ssub;
4022     break;
4023   case 64:
4024     CopyOpc = AArch64::DUPi64;
4025     ExtractSubReg = AArch64::dsub;
4026     break;
4027   default:
4028     // Unknown size, bail out.
4029     LLVM_DEBUG(dbgs() << "Elt size '" << EltSize << "' unsupported.\n");
4030     return false;
4031   }
4032   return true;
4033 }
4034 
4035 MachineInstr *AArch64InstructionSelector::emitExtractVectorElt(
4036     std::optional<Register> DstReg, const RegisterBank &DstRB, LLT ScalarTy,
4037     Register VecReg, unsigned LaneIdx, MachineIRBuilder &MIRBuilder) const {
4038   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4039   unsigned CopyOpc = 0;
4040   unsigned ExtractSubReg = 0;
4041   if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, ScalarTy.getSizeInBits())) {
4042     LLVM_DEBUG(
4043         dbgs() << "Couldn't determine lane copy opcode for instruction.\n");
4044     return nullptr;
4045   }
4046 
4047   const TargetRegisterClass *DstRC =
4048       getRegClassForTypeOnBank(ScalarTy, DstRB, true);
4049   if (!DstRC) {
4050     LLVM_DEBUG(dbgs() << "Could not determine destination register class.\n");
4051     return nullptr;
4052   }
4053 
4054   const RegisterBank &VecRB = *RBI.getRegBank(VecReg, MRI, TRI);
4055   const LLT &VecTy = MRI.getType(VecReg);
4056   const TargetRegisterClass *VecRC =
4057       getRegClassForTypeOnBank(VecTy, VecRB, true);
4058   if (!VecRC) {
4059     LLVM_DEBUG(dbgs() << "Could not determine source register class.\n");
4060     return nullptr;
4061   }
4062 
4063   // The register that we're going to copy into.
4064   Register InsertReg = VecReg;
4065   if (!DstReg)
4066     DstReg = MRI.createVirtualRegister(DstRC);
4067   // If the lane index is 0, we just use a subregister COPY.
4068   if (LaneIdx == 0) {
4069     auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {*DstReg}, {})
4070                     .addReg(VecReg, 0, ExtractSubReg);
4071     RBI.constrainGenericRegister(*DstReg, *DstRC, MRI);
4072     return &*Copy;
4073   }
4074 
4075   // Lane copies require 128-bit wide registers. If we're dealing with an
4076   // unpacked vector, then we need to move up to that width. Insert an implicit
4077   // def and a subregister insert to get us there.
4078   if (VecTy.getSizeInBits() != 128) {
4079     MachineInstr *ScalarToVector = emitScalarToVector(
4080         VecTy.getSizeInBits(), &AArch64::FPR128RegClass, VecReg, MIRBuilder);
4081     if (!ScalarToVector)
4082       return nullptr;
4083     InsertReg = ScalarToVector->getOperand(0).getReg();
4084   }
4085 
4086   MachineInstr *LaneCopyMI =
4087       MIRBuilder.buildInstr(CopyOpc, {*DstReg}, {InsertReg}).addImm(LaneIdx);
4088   constrainSelectedInstRegOperands(*LaneCopyMI, TII, TRI, RBI);
4089 
4090   // Make sure that we actually constrain the initial copy.
4091   RBI.constrainGenericRegister(*DstReg, *DstRC, MRI);
4092   return LaneCopyMI;
4093 }
4094 
4095 bool AArch64InstructionSelector::selectExtractElt(
4096     MachineInstr &I, MachineRegisterInfo &MRI) {
4097   assert(I.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT &&
4098          "unexpected opcode!");
4099   Register DstReg = I.getOperand(0).getReg();
4100   const LLT NarrowTy = MRI.getType(DstReg);
4101   const Register SrcReg = I.getOperand(1).getReg();
4102   const LLT WideTy = MRI.getType(SrcReg);
4103   (void)WideTy;
4104   assert(WideTy.getSizeInBits() >= NarrowTy.getSizeInBits() &&
4105          "source register size too small!");
4106   assert(!NarrowTy.isVector() && "cannot extract vector into vector!");
4107 
4108   // Need the lane index to determine the correct copy opcode.
4109   MachineOperand &LaneIdxOp = I.getOperand(2);
4110   assert(LaneIdxOp.isReg() && "Lane index operand was not a register?");
4111 
4112   if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) {
4113     LLVM_DEBUG(dbgs() << "Cannot extract into GPR.\n");
4114     return false;
4115   }
4116 
4117   // Find the index to extract from.
4118   auto VRegAndVal = getIConstantVRegValWithLookThrough(LaneIdxOp.getReg(), MRI);
4119   if (!VRegAndVal)
4120     return false;
4121   unsigned LaneIdx = VRegAndVal->Value.getSExtValue();
4122 
4123 
4124   const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
4125   MachineInstr *Extract = emitExtractVectorElt(DstReg, DstRB, NarrowTy, SrcReg,
4126                                                LaneIdx, MIB);
4127   if (!Extract)
4128     return false;
4129 
4130   I.eraseFromParent();
4131   return true;
4132 }
4133 
4134 bool AArch64InstructionSelector::selectSplitVectorUnmerge(
4135     MachineInstr &I, MachineRegisterInfo &MRI) {
4136   unsigned NumElts = I.getNumOperands() - 1;
4137   Register SrcReg = I.getOperand(NumElts).getReg();
4138   const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg());
4139   const LLT SrcTy = MRI.getType(SrcReg);
4140 
4141   assert(NarrowTy.isVector() && "Expected an unmerge into vectors");
4142   if (SrcTy.getSizeInBits() > 128) {
4143     LLVM_DEBUG(dbgs() << "Unexpected vector type for vec split unmerge");
4144     return false;
4145   }
4146 
4147   // We implement a split vector operation by treating the sub-vectors as
4148   // scalars and extracting them.
4149   const RegisterBank &DstRB =
4150       *RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI);
4151   for (unsigned OpIdx = 0; OpIdx < NumElts; ++OpIdx) {
4152     Register Dst = I.getOperand(OpIdx).getReg();
4153     MachineInstr *Extract =
4154         emitExtractVectorElt(Dst, DstRB, NarrowTy, SrcReg, OpIdx, MIB);
4155     if (!Extract)
4156       return false;
4157   }
4158   I.eraseFromParent();
4159   return true;
4160 }
4161 
4162 bool AArch64InstructionSelector::selectUnmergeValues(MachineInstr &I,
4163                                                      MachineRegisterInfo &MRI) {
4164   assert(I.getOpcode() == TargetOpcode::G_UNMERGE_VALUES &&
4165          "unexpected opcode");
4166 
4167   // TODO: Handle unmerging into GPRs and from scalars to scalars.
4168   if (RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI)->getID() !=
4169           AArch64::FPRRegBankID ||
4170       RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() !=
4171           AArch64::FPRRegBankID) {
4172     LLVM_DEBUG(dbgs() << "Unmerging vector-to-gpr and scalar-to-scalar "
4173                          "currently unsupported.\n");
4174     return false;
4175   }
4176 
4177   // The last operand is the vector source register, and every other operand is
4178   // a register to unpack into.
4179   unsigned NumElts = I.getNumOperands() - 1;
4180   Register SrcReg = I.getOperand(NumElts).getReg();
4181   const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg());
4182   const LLT WideTy = MRI.getType(SrcReg);
4183   (void)WideTy;
4184   assert((WideTy.isVector() || WideTy.getSizeInBits() == 128) &&
4185          "can only unmerge from vector or s128 types!");
4186   assert(WideTy.getSizeInBits() > NarrowTy.getSizeInBits() &&
4187          "source register size too small!");
4188 
4189   if (!NarrowTy.isScalar())
4190     return selectSplitVectorUnmerge(I, MRI);
4191 
4192   // Choose a lane copy opcode and subregister based off of the size of the
4193   // vector's elements.
4194   unsigned CopyOpc = 0;
4195   unsigned ExtractSubReg = 0;
4196   if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, NarrowTy.getSizeInBits()))
4197     return false;
4198 
4199   // Set up for the lane copies.
4200   MachineBasicBlock &MBB = *I.getParent();
4201 
4202   // Stores the registers we'll be copying from.
4203   SmallVector<Register, 4> InsertRegs;
4204 
4205   // We'll use the first register twice, so we only need NumElts-1 registers.
4206   unsigned NumInsertRegs = NumElts - 1;
4207 
4208   // If our elements fit into exactly 128 bits, then we can copy from the source
4209   // directly. Otherwise, we need to do a bit of setup with some subregister
4210   // inserts.
4211   if (NarrowTy.getSizeInBits() * NumElts == 128) {
4212     InsertRegs = SmallVector<Register, 4>(NumInsertRegs, SrcReg);
4213   } else {
4214     // No. We have to perform subregister inserts. For each insert, create an
4215     // implicit def and a subregister insert, and save the register we create.
4216     const TargetRegisterClass *RC = getRegClassForTypeOnBank(
4217         LLT::fixed_vector(NumElts, WideTy.getScalarSizeInBits()),
4218         *RBI.getRegBank(SrcReg, MRI, TRI));
4219     unsigned SubReg = 0;
4220     bool Found = getSubRegForClass(RC, TRI, SubReg);
4221     (void)Found;
4222     assert(Found && "expected to find last operand's subeg idx");
4223     for (unsigned Idx = 0; Idx < NumInsertRegs; ++Idx) {
4224       Register ImpDefReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
4225       MachineInstr &ImpDefMI =
4226           *BuildMI(MBB, I, I.getDebugLoc(), TII.get(TargetOpcode::IMPLICIT_DEF),
4227                    ImpDefReg);
4228 
4229       // Now, create the subregister insert from SrcReg.
4230       Register InsertReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
4231       MachineInstr &InsMI =
4232           *BuildMI(MBB, I, I.getDebugLoc(),
4233                    TII.get(TargetOpcode::INSERT_SUBREG), InsertReg)
4234                .addUse(ImpDefReg)
4235                .addUse(SrcReg)
4236                .addImm(SubReg);
4237 
4238       constrainSelectedInstRegOperands(ImpDefMI, TII, TRI, RBI);
4239       constrainSelectedInstRegOperands(InsMI, TII, TRI, RBI);
4240 
4241       // Save the register so that we can copy from it after.
4242       InsertRegs.push_back(InsertReg);
4243     }
4244   }
4245 
4246   // Now that we've created any necessary subregister inserts, we can
4247   // create the copies.
4248   //
4249   // Perform the first copy separately as a subregister copy.
4250   Register CopyTo = I.getOperand(0).getReg();
4251   auto FirstCopy = MIB.buildInstr(TargetOpcode::COPY, {CopyTo}, {})
4252                        .addReg(InsertRegs[0], 0, ExtractSubReg);
4253   constrainSelectedInstRegOperands(*FirstCopy, TII, TRI, RBI);
4254 
4255   // Now, perform the remaining copies as vector lane copies.
4256   unsigned LaneIdx = 1;
4257   for (Register InsReg : InsertRegs) {
4258     Register CopyTo = I.getOperand(LaneIdx).getReg();
4259     MachineInstr &CopyInst =
4260         *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CopyOpc), CopyTo)
4261              .addUse(InsReg)
4262              .addImm(LaneIdx);
4263     constrainSelectedInstRegOperands(CopyInst, TII, TRI, RBI);
4264     ++LaneIdx;
4265   }
4266 
4267   // Separately constrain the first copy's destination. Because of the
4268   // limitation in constrainOperandRegClass, we can't guarantee that this will
4269   // actually be constrained. So, do it ourselves using the second operand.
4270   const TargetRegisterClass *RC =
4271       MRI.getRegClassOrNull(I.getOperand(1).getReg());
4272   if (!RC) {
4273     LLVM_DEBUG(dbgs() << "Couldn't constrain copy destination.\n");
4274     return false;
4275   }
4276 
4277   RBI.constrainGenericRegister(CopyTo, *RC, MRI);
4278   I.eraseFromParent();
4279   return true;
4280 }
4281 
4282 bool AArch64InstructionSelector::selectConcatVectors(
4283     MachineInstr &I, MachineRegisterInfo &MRI)  {
4284   assert(I.getOpcode() == TargetOpcode::G_CONCAT_VECTORS &&
4285          "Unexpected opcode");
4286   Register Dst = I.getOperand(0).getReg();
4287   Register Op1 = I.getOperand(1).getReg();
4288   Register Op2 = I.getOperand(2).getReg();
4289   MachineInstr *ConcatMI = emitVectorConcat(Dst, Op1, Op2, MIB);
4290   if (!ConcatMI)
4291     return false;
4292   I.eraseFromParent();
4293   return true;
4294 }
4295 
4296 unsigned
4297 AArch64InstructionSelector::emitConstantPoolEntry(const Constant *CPVal,
4298                                                   MachineFunction &MF) const {
4299   Type *CPTy = CPVal->getType();
4300   Align Alignment = MF.getDataLayout().getPrefTypeAlign(CPTy);
4301 
4302   MachineConstantPool *MCP = MF.getConstantPool();
4303   return MCP->getConstantPoolIndex(CPVal, Alignment);
4304 }
4305 
4306 MachineInstr *AArch64InstructionSelector::emitLoadFromConstantPool(
4307     const Constant *CPVal, MachineIRBuilder &MIRBuilder) const {
4308   const TargetRegisterClass *RC;
4309   unsigned Opc;
4310   bool IsTiny = TM.getCodeModel() == CodeModel::Tiny;
4311   unsigned Size = MIRBuilder.getDataLayout().getTypeStoreSize(CPVal->getType());
4312   switch (Size) {
4313   case 16:
4314     RC = &AArch64::FPR128RegClass;
4315     Opc = IsTiny ? AArch64::LDRQl : AArch64::LDRQui;
4316     break;
4317   case 8:
4318     RC = &AArch64::FPR64RegClass;
4319     Opc = IsTiny ? AArch64::LDRDl : AArch64::LDRDui;
4320     break;
4321   case 4:
4322     RC = &AArch64::FPR32RegClass;
4323     Opc = IsTiny ? AArch64::LDRSl : AArch64::LDRSui;
4324     break;
4325   case 2:
4326     RC = &AArch64::FPR16RegClass;
4327     Opc = AArch64::LDRHui;
4328     break;
4329   default:
4330     LLVM_DEBUG(dbgs() << "Could not load from constant pool of type "
4331                       << *CPVal->getType());
4332     return nullptr;
4333   }
4334 
4335   MachineInstr *LoadMI = nullptr;
4336   auto &MF = MIRBuilder.getMF();
4337   unsigned CPIdx = emitConstantPoolEntry(CPVal, MF);
4338   if (IsTiny && (Size == 16 || Size == 8 || Size == 4)) {
4339     // Use load(literal) for tiny code model.
4340     LoadMI = &*MIRBuilder.buildInstr(Opc, {RC}, {}).addConstantPoolIndex(CPIdx);
4341   } else {
4342     auto Adrp =
4343         MIRBuilder.buildInstr(AArch64::ADRP, {&AArch64::GPR64RegClass}, {})
4344             .addConstantPoolIndex(CPIdx, 0, AArch64II::MO_PAGE);
4345 
4346     LoadMI = &*MIRBuilder.buildInstr(Opc, {RC}, {Adrp})
4347                    .addConstantPoolIndex(
4348                        CPIdx, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
4349 
4350     constrainSelectedInstRegOperands(*Adrp, TII, TRI, RBI);
4351   }
4352 
4353   MachinePointerInfo PtrInfo = MachinePointerInfo::getConstantPool(MF);
4354   LoadMI->addMemOperand(MF, MF.getMachineMemOperand(PtrInfo,
4355                                                     MachineMemOperand::MOLoad,
4356                                                     Size, Align(Size)));
4357   constrainSelectedInstRegOperands(*LoadMI, TII, TRI, RBI);
4358   return LoadMI;
4359 }
4360 
4361 /// Return an <Opcode, SubregIndex> pair to do an vector elt insert of a given
4362 /// size and RB.
4363 static std::pair<unsigned, unsigned>
4364 getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) {
4365   unsigned Opc, SubregIdx;
4366   if (RB.getID() == AArch64::GPRRegBankID) {
4367     if (EltSize == 8) {
4368       Opc = AArch64::INSvi8gpr;
4369       SubregIdx = AArch64::bsub;
4370     } else if (EltSize == 16) {
4371       Opc = AArch64::INSvi16gpr;
4372       SubregIdx = AArch64::ssub;
4373     } else if (EltSize == 32) {
4374       Opc = AArch64::INSvi32gpr;
4375       SubregIdx = AArch64::ssub;
4376     } else if (EltSize == 64) {
4377       Opc = AArch64::INSvi64gpr;
4378       SubregIdx = AArch64::dsub;
4379     } else {
4380       llvm_unreachable("invalid elt size!");
4381     }
4382   } else {
4383     if (EltSize == 8) {
4384       Opc = AArch64::INSvi8lane;
4385       SubregIdx = AArch64::bsub;
4386     } else if (EltSize == 16) {
4387       Opc = AArch64::INSvi16lane;
4388       SubregIdx = AArch64::hsub;
4389     } else if (EltSize == 32) {
4390       Opc = AArch64::INSvi32lane;
4391       SubregIdx = AArch64::ssub;
4392     } else if (EltSize == 64) {
4393       Opc = AArch64::INSvi64lane;
4394       SubregIdx = AArch64::dsub;
4395     } else {
4396       llvm_unreachable("invalid elt size!");
4397     }
4398   }
4399   return std::make_pair(Opc, SubregIdx);
4400 }
4401 
4402 MachineInstr *AArch64InstructionSelector::emitInstr(
4403     unsigned Opcode, std::initializer_list<llvm::DstOp> DstOps,
4404     std::initializer_list<llvm::SrcOp> SrcOps, MachineIRBuilder &MIRBuilder,
4405     const ComplexRendererFns &RenderFns) const {
4406   assert(Opcode && "Expected an opcode?");
4407   assert(!isPreISelGenericOpcode(Opcode) &&
4408          "Function should only be used to produce selected instructions!");
4409   auto MI = MIRBuilder.buildInstr(Opcode, DstOps, SrcOps);
4410   if (RenderFns)
4411     for (auto &Fn : *RenderFns)
4412       Fn(MI);
4413   constrainSelectedInstRegOperands(*MI, TII, TRI, RBI);
4414   return &*MI;
4415 }
4416 
4417 MachineInstr *AArch64InstructionSelector::emitAddSub(
4418     const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode,
4419     Register Dst, MachineOperand &LHS, MachineOperand &RHS,
4420     MachineIRBuilder &MIRBuilder) const {
4421   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4422   assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4423   auto Ty = MRI.getType(LHS.getReg());
4424   assert(!Ty.isVector() && "Expected a scalar or pointer?");
4425   unsigned Size = Ty.getSizeInBits();
4426   assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit type only");
4427   bool Is32Bit = Size == 32;
4428 
4429   // INSTRri form with positive arithmetic immediate.
4430   if (auto Fns = selectArithImmed(RHS))
4431     return emitInstr(AddrModeAndSizeToOpcode[0][Is32Bit], {Dst}, {LHS},
4432                      MIRBuilder, Fns);
4433 
4434   // INSTRri form with negative arithmetic immediate.
4435   if (auto Fns = selectNegArithImmed(RHS))
4436     return emitInstr(AddrModeAndSizeToOpcode[3][Is32Bit], {Dst}, {LHS},
4437                      MIRBuilder, Fns);
4438 
4439   // INSTRrx form.
4440   if (auto Fns = selectArithExtendedRegister(RHS))
4441     return emitInstr(AddrModeAndSizeToOpcode[4][Is32Bit], {Dst}, {LHS},
4442                      MIRBuilder, Fns);
4443 
4444   // INSTRrs form.
4445   if (auto Fns = selectShiftedRegister(RHS))
4446     return emitInstr(AddrModeAndSizeToOpcode[1][Is32Bit], {Dst}, {LHS},
4447                      MIRBuilder, Fns);
4448   return emitInstr(AddrModeAndSizeToOpcode[2][Is32Bit], {Dst}, {LHS, RHS},
4449                    MIRBuilder);
4450 }
4451 
4452 MachineInstr *
4453 AArch64InstructionSelector::emitADD(Register DefReg, MachineOperand &LHS,
4454                                     MachineOperand &RHS,
4455                                     MachineIRBuilder &MIRBuilder) const {
4456   const std::array<std::array<unsigned, 2>, 5> OpcTable{
4457       {{AArch64::ADDXri, AArch64::ADDWri},
4458        {AArch64::ADDXrs, AArch64::ADDWrs},
4459        {AArch64::ADDXrr, AArch64::ADDWrr},
4460        {AArch64::SUBXri, AArch64::SUBWri},
4461        {AArch64::ADDXrx, AArch64::ADDWrx}}};
4462   return emitAddSub(OpcTable, DefReg, LHS, RHS, MIRBuilder);
4463 }
4464 
4465 MachineInstr *
4466 AArch64InstructionSelector::emitADDS(Register Dst, MachineOperand &LHS,
4467                                      MachineOperand &RHS,
4468                                      MachineIRBuilder &MIRBuilder) const {
4469   const std::array<std::array<unsigned, 2>, 5> OpcTable{
4470       {{AArch64::ADDSXri, AArch64::ADDSWri},
4471        {AArch64::ADDSXrs, AArch64::ADDSWrs},
4472        {AArch64::ADDSXrr, AArch64::ADDSWrr},
4473        {AArch64::SUBSXri, AArch64::SUBSWri},
4474        {AArch64::ADDSXrx, AArch64::ADDSWrx}}};
4475   return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder);
4476 }
4477 
4478 MachineInstr *
4479 AArch64InstructionSelector::emitSUBS(Register Dst, MachineOperand &LHS,
4480                                      MachineOperand &RHS,
4481                                      MachineIRBuilder &MIRBuilder) const {
4482   const std::array<std::array<unsigned, 2>, 5> OpcTable{
4483       {{AArch64::SUBSXri, AArch64::SUBSWri},
4484        {AArch64::SUBSXrs, AArch64::SUBSWrs},
4485        {AArch64::SUBSXrr, AArch64::SUBSWrr},
4486        {AArch64::ADDSXri, AArch64::ADDSWri},
4487        {AArch64::SUBSXrx, AArch64::SUBSWrx}}};
4488   return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder);
4489 }
4490 
4491 MachineInstr *
4492 AArch64InstructionSelector::emitADCS(Register Dst, MachineOperand &LHS,
4493                                      MachineOperand &RHS,
4494                                      MachineIRBuilder &MIRBuilder) const {
4495   assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4496   MachineRegisterInfo *MRI = MIRBuilder.getMRI();
4497   bool Is32Bit = (MRI->getType(LHS.getReg()).getSizeInBits() == 32);
4498   static const unsigned OpcTable[2] = {AArch64::ADCSXr, AArch64::ADCSWr};
4499   return emitInstr(OpcTable[Is32Bit], {Dst}, {LHS, RHS}, MIRBuilder);
4500 }
4501 
4502 MachineInstr *
4503 AArch64InstructionSelector::emitSBCS(Register Dst, MachineOperand &LHS,
4504                                      MachineOperand &RHS,
4505                                      MachineIRBuilder &MIRBuilder) const {
4506   assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4507   MachineRegisterInfo *MRI = MIRBuilder.getMRI();
4508   bool Is32Bit = (MRI->getType(LHS.getReg()).getSizeInBits() == 32);
4509   static const unsigned OpcTable[2] = {AArch64::SBCSXr, AArch64::SBCSWr};
4510   return emitInstr(OpcTable[Is32Bit], {Dst}, {LHS, RHS}, MIRBuilder);
4511 }
4512 
4513 MachineInstr *
4514 AArch64InstructionSelector::emitCMN(MachineOperand &LHS, MachineOperand &RHS,
4515                                     MachineIRBuilder &MIRBuilder) const {
4516   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4517   bool Is32Bit = (MRI.getType(LHS.getReg()).getSizeInBits() == 32);
4518   auto RC = Is32Bit ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass;
4519   return emitADDS(MRI.createVirtualRegister(RC), LHS, RHS, MIRBuilder);
4520 }
4521 
4522 MachineInstr *
4523 AArch64InstructionSelector::emitTST(MachineOperand &LHS, MachineOperand &RHS,
4524                                     MachineIRBuilder &MIRBuilder) const {
4525   assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4526   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4527   LLT Ty = MRI.getType(LHS.getReg());
4528   unsigned RegSize = Ty.getSizeInBits();
4529   bool Is32Bit = (RegSize == 32);
4530   const unsigned OpcTable[3][2] = {{AArch64::ANDSXri, AArch64::ANDSWri},
4531                                    {AArch64::ANDSXrs, AArch64::ANDSWrs},
4532                                    {AArch64::ANDSXrr, AArch64::ANDSWrr}};
4533   // ANDS needs a logical immediate for its immediate form. Check if we can
4534   // fold one in.
4535   if (auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS.getReg(), MRI)) {
4536     int64_t Imm = ValAndVReg->Value.getSExtValue();
4537 
4538     if (AArch64_AM::isLogicalImmediate(Imm, RegSize)) {
4539       auto TstMI = MIRBuilder.buildInstr(OpcTable[0][Is32Bit], {Ty}, {LHS});
4540       TstMI.addImm(AArch64_AM::encodeLogicalImmediate(Imm, RegSize));
4541       constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
4542       return &*TstMI;
4543     }
4544   }
4545 
4546   if (auto Fns = selectLogicalShiftedRegister(RHS))
4547     return emitInstr(OpcTable[1][Is32Bit], {Ty}, {LHS}, MIRBuilder, Fns);
4548   return emitInstr(OpcTable[2][Is32Bit], {Ty}, {LHS, RHS}, MIRBuilder);
4549 }
4550 
4551 MachineInstr *AArch64InstructionSelector::emitIntegerCompare(
4552     MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
4553     MachineIRBuilder &MIRBuilder) const {
4554   assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!");
4555   assert(Predicate.isPredicate() && "Expected predicate?");
4556   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4557   LLT CmpTy = MRI.getType(LHS.getReg());
4558   assert(!CmpTy.isVector() && "Expected scalar or pointer");
4559   unsigned Size = CmpTy.getSizeInBits();
4560   (void)Size;
4561   assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit LHS/RHS?");
4562   // Fold the compare into a cmn or tst if possible.
4563   if (auto FoldCmp = tryFoldIntegerCompare(LHS, RHS, Predicate, MIRBuilder))
4564     return FoldCmp;
4565   auto Dst = MRI.cloneVirtualRegister(LHS.getReg());
4566   return emitSUBS(Dst, LHS, RHS, MIRBuilder);
4567 }
4568 
4569 MachineInstr *AArch64InstructionSelector::emitCSetForFCmp(
4570     Register Dst, CmpInst::Predicate Pred, MachineIRBuilder &MIRBuilder) const {
4571   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4572 #ifndef NDEBUG
4573   LLT Ty = MRI.getType(Dst);
4574   assert(!Ty.isVector() && Ty.getSizeInBits() == 32 &&
4575          "Expected a 32-bit scalar register?");
4576 #endif
4577   const Register ZReg = AArch64::WZR;
4578   AArch64CC::CondCode CC1, CC2;
4579   changeFCMPPredToAArch64CC(Pred, CC1, CC2);
4580   auto InvCC1 = AArch64CC::getInvertedCondCode(CC1);
4581   if (CC2 == AArch64CC::AL)
4582     return emitCSINC(/*Dst=*/Dst, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC1,
4583                      MIRBuilder);
4584   const TargetRegisterClass *RC = &AArch64::GPR32RegClass;
4585   Register Def1Reg = MRI.createVirtualRegister(RC);
4586   Register Def2Reg = MRI.createVirtualRegister(RC);
4587   auto InvCC2 = AArch64CC::getInvertedCondCode(CC2);
4588   emitCSINC(/*Dst=*/Def1Reg, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC1, MIRBuilder);
4589   emitCSINC(/*Dst=*/Def2Reg, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC2, MIRBuilder);
4590   auto OrMI = MIRBuilder.buildInstr(AArch64::ORRWrr, {Dst}, {Def1Reg, Def2Reg});
4591   constrainSelectedInstRegOperands(*OrMI, TII, TRI, RBI);
4592   return &*OrMI;
4593 }
4594 
4595 MachineInstr *AArch64InstructionSelector::emitFPCompare(
4596     Register LHS, Register RHS, MachineIRBuilder &MIRBuilder,
4597     std::optional<CmpInst::Predicate> Pred) const {
4598   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4599   LLT Ty = MRI.getType(LHS);
4600   if (Ty.isVector())
4601     return nullptr;
4602   unsigned OpSize = Ty.getSizeInBits();
4603   if (OpSize != 32 && OpSize != 64)
4604     return nullptr;
4605 
4606   // If this is a compare against +0.0, then we don't have
4607   // to explicitly materialize a constant.
4608   const ConstantFP *FPImm = getConstantFPVRegVal(RHS, MRI);
4609   bool ShouldUseImm = FPImm && (FPImm->isZero() && !FPImm->isNegative());
4610 
4611   auto IsEqualityPred = [](CmpInst::Predicate P) {
4612     return P == CmpInst::FCMP_OEQ || P == CmpInst::FCMP_ONE ||
4613            P == CmpInst::FCMP_UEQ || P == CmpInst::FCMP_UNE;
4614   };
4615   if (!ShouldUseImm && Pred && IsEqualityPred(*Pred)) {
4616     // Try commutating the operands.
4617     const ConstantFP *LHSImm = getConstantFPVRegVal(LHS, MRI);
4618     if (LHSImm && (LHSImm->isZero() && !LHSImm->isNegative())) {
4619       ShouldUseImm = true;
4620       std::swap(LHS, RHS);
4621     }
4622   }
4623   unsigned CmpOpcTbl[2][2] = {{AArch64::FCMPSrr, AArch64::FCMPDrr},
4624                               {AArch64::FCMPSri, AArch64::FCMPDri}};
4625   unsigned CmpOpc = CmpOpcTbl[ShouldUseImm][OpSize == 64];
4626 
4627   // Partially build the compare. Decide if we need to add a use for the
4628   // third operand based off whether or not we're comparing against 0.0.
4629   auto CmpMI = MIRBuilder.buildInstr(CmpOpc).addUse(LHS);
4630   CmpMI.setMIFlags(MachineInstr::NoFPExcept);
4631   if (!ShouldUseImm)
4632     CmpMI.addUse(RHS);
4633   constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI);
4634   return &*CmpMI;
4635 }
4636 
4637 MachineInstr *AArch64InstructionSelector::emitVectorConcat(
4638     std::optional<Register> Dst, Register Op1, Register Op2,
4639     MachineIRBuilder &MIRBuilder) const {
4640   // We implement a vector concat by:
4641   // 1. Use scalar_to_vector to insert the lower vector into the larger dest
4642   // 2. Insert the upper vector into the destination's upper element
4643   // TODO: some of this code is common with G_BUILD_VECTOR handling.
4644   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4645 
4646   const LLT Op1Ty = MRI.getType(Op1);
4647   const LLT Op2Ty = MRI.getType(Op2);
4648 
4649   if (Op1Ty != Op2Ty) {
4650     LLVM_DEBUG(dbgs() << "Could not do vector concat of differing vector tys");
4651     return nullptr;
4652   }
4653   assert(Op1Ty.isVector() && "Expected a vector for vector concat");
4654 
4655   if (Op1Ty.getSizeInBits() >= 128) {
4656     LLVM_DEBUG(dbgs() << "Vector concat not supported for full size vectors");
4657     return nullptr;
4658   }
4659 
4660   // At the moment we just support 64 bit vector concats.
4661   if (Op1Ty.getSizeInBits() != 64) {
4662     LLVM_DEBUG(dbgs() << "Vector concat supported for 64b vectors");
4663     return nullptr;
4664   }
4665 
4666   const LLT ScalarTy = LLT::scalar(Op1Ty.getSizeInBits());
4667   const RegisterBank &FPRBank = *RBI.getRegBank(Op1, MRI, TRI);
4668   const TargetRegisterClass *DstRC =
4669       getRegClassForTypeOnBank(Op1Ty.multiplyElements(2), FPRBank);
4670 
4671   MachineInstr *WidenedOp1 =
4672       emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op1, MIRBuilder);
4673   MachineInstr *WidenedOp2 =
4674       emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op2, MIRBuilder);
4675   if (!WidenedOp1 || !WidenedOp2) {
4676     LLVM_DEBUG(dbgs() << "Could not emit a vector from scalar value");
4677     return nullptr;
4678   }
4679 
4680   // Now do the insert of the upper element.
4681   unsigned InsertOpc, InsSubRegIdx;
4682   std::tie(InsertOpc, InsSubRegIdx) =
4683       getInsertVecEltOpInfo(FPRBank, ScalarTy.getSizeInBits());
4684 
4685   if (!Dst)
4686     Dst = MRI.createVirtualRegister(DstRC);
4687   auto InsElt =
4688       MIRBuilder
4689           .buildInstr(InsertOpc, {*Dst}, {WidenedOp1->getOperand(0).getReg()})
4690           .addImm(1) /* Lane index */
4691           .addUse(WidenedOp2->getOperand(0).getReg())
4692           .addImm(0);
4693   constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI);
4694   return &*InsElt;
4695 }
4696 
4697 MachineInstr *
4698 AArch64InstructionSelector::emitCSINC(Register Dst, Register Src1,
4699                                       Register Src2, AArch64CC::CondCode Pred,
4700                                       MachineIRBuilder &MIRBuilder) const {
4701   auto &MRI = *MIRBuilder.getMRI();
4702   const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Dst);
4703   // If we used a register class, then this won't necessarily have an LLT.
4704   // Compute the size based off whether or not we have a class or bank.
4705   unsigned Size;
4706   if (const auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
4707     Size = TRI.getRegSizeInBits(*RC);
4708   else
4709     Size = MRI.getType(Dst).getSizeInBits();
4710   // Some opcodes use s1.
4711   assert(Size <= 64 && "Expected 64 bits or less only!");
4712   static const unsigned OpcTable[2] = {AArch64::CSINCWr, AArch64::CSINCXr};
4713   unsigned Opc = OpcTable[Size == 64];
4714   auto CSINC = MIRBuilder.buildInstr(Opc, {Dst}, {Src1, Src2}).addImm(Pred);
4715   constrainSelectedInstRegOperands(*CSINC, TII, TRI, RBI);
4716   return &*CSINC;
4717 }
4718 
4719 MachineInstr *AArch64InstructionSelector::emitCarryIn(MachineInstr &I,
4720                                                       Register CarryReg) {
4721   MachineRegisterInfo *MRI = MIB.getMRI();
4722   unsigned Opcode = I.getOpcode();
4723 
4724   // If the instruction is a SUB, we need to negate the carry,
4725   // because borrowing is indicated by carry-flag == 0.
4726   bool NeedsNegatedCarry =
4727       (Opcode == TargetOpcode::G_USUBE || Opcode == TargetOpcode::G_SSUBE);
4728 
4729   // If the previous instruction will already produce the correct carry, do not
4730   // emit a carry generating instruction. E.g. for G_UADDE/G_USUBE sequences
4731   // generated during legalization of wide add/sub. This optimization depends on
4732   // these sequences not being interrupted by other instructions.
4733   // We have to select the previous instruction before the carry-using
4734   // instruction is deleted by the calling function, otherwise the previous
4735   // instruction might become dead and would get deleted.
4736   MachineInstr *SrcMI = MRI->getVRegDef(CarryReg);
4737   if (SrcMI == I.getPrevNode()) {
4738     if (auto *CarrySrcMI = dyn_cast<GAddSubCarryOut>(SrcMI)) {
4739       bool ProducesNegatedCarry = CarrySrcMI->isSub();
4740       if (NeedsNegatedCarry == ProducesNegatedCarry &&
4741           CarrySrcMI->isUnsigned() &&
4742           CarrySrcMI->getCarryOutReg() == CarryReg &&
4743           selectAndRestoreState(*SrcMI))
4744         return nullptr;
4745     }
4746   }
4747 
4748   Register DeadReg = MRI->createVirtualRegister(&AArch64::GPR32RegClass);
4749 
4750   if (NeedsNegatedCarry) {
4751     // (0 - Carry) sets !C in NZCV when Carry == 1
4752     Register ZReg = AArch64::WZR;
4753     return emitInstr(AArch64::SUBSWrr, {DeadReg}, {ZReg, CarryReg}, MIB);
4754   }
4755 
4756   // (Carry - 1) sets !C in NZCV when Carry == 0
4757   auto Fns = select12BitValueWithLeftShift(1);
4758   return emitInstr(AArch64::SUBSWri, {DeadReg}, {CarryReg}, MIB, Fns);
4759 }
4760 
4761 bool AArch64InstructionSelector::selectOverflowOp(MachineInstr &I,
4762                                                   MachineRegisterInfo &MRI) {
4763   auto &CarryMI = cast<GAddSubCarryOut>(I);
4764 
4765   if (auto *CarryInMI = dyn_cast<GAddSubCarryInOut>(&I)) {
4766     // Set NZCV carry according to carry-in VReg
4767     emitCarryIn(I, CarryInMI->getCarryInReg());
4768   }
4769 
4770   // Emit the operation and get the correct condition code.
4771   auto OpAndCC = emitOverflowOp(I.getOpcode(), CarryMI.getDstReg(),
4772                                 CarryMI.getLHS(), CarryMI.getRHS(), MIB);
4773 
4774   Register CarryOutReg = CarryMI.getCarryOutReg();
4775 
4776   // Don't convert carry-out to VReg if it is never used
4777   if (!MRI.use_nodbg_empty(CarryOutReg)) {
4778     // Now, put the overflow result in the register given by the first operand
4779     // to the overflow op. CSINC increments the result when the predicate is
4780     // false, so to get the increment when it's true, we need to use the
4781     // inverse. In this case, we want to increment when carry is set.
4782     Register ZReg = AArch64::WZR;
4783     emitCSINC(/*Dst=*/CarryOutReg, /*Src1=*/ZReg, /*Src2=*/ZReg,
4784               getInvertedCondCode(OpAndCC.second), MIB);
4785   }
4786 
4787   I.eraseFromParent();
4788   return true;
4789 }
4790 
4791 std::pair<MachineInstr *, AArch64CC::CondCode>
4792 AArch64InstructionSelector::emitOverflowOp(unsigned Opcode, Register Dst,
4793                                            MachineOperand &LHS,
4794                                            MachineOperand &RHS,
4795                                            MachineIRBuilder &MIRBuilder) const {
4796   switch (Opcode) {
4797   default:
4798     llvm_unreachable("Unexpected opcode!");
4799   case TargetOpcode::G_SADDO:
4800     return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
4801   case TargetOpcode::G_UADDO:
4802     return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::HS);
4803   case TargetOpcode::G_SSUBO:
4804     return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
4805   case TargetOpcode::G_USUBO:
4806     return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::LO);
4807   case TargetOpcode::G_SADDE:
4808     return std::make_pair(emitADCS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
4809   case TargetOpcode::G_UADDE:
4810     return std::make_pair(emitADCS(Dst, LHS, RHS, MIRBuilder), AArch64CC::HS);
4811   case TargetOpcode::G_SSUBE:
4812     return std::make_pair(emitSBCS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
4813   case TargetOpcode::G_USUBE:
4814     return std::make_pair(emitSBCS(Dst, LHS, RHS, MIRBuilder), AArch64CC::LO);
4815   }
4816 }
4817 
4818 /// Returns true if @p Val is a tree of AND/OR/CMP operations that can be
4819 /// expressed as a conjunction.
4820 /// \param CanNegate    Set to true if we can negate the whole sub-tree just by
4821 ///                     changing the conditions on the CMP tests.
4822 ///                     (this means we can call emitConjunctionRec() with
4823 ///                      Negate==true on this sub-tree)
4824 /// \param MustBeFirst  Set to true if this subtree needs to be negated and we
4825 ///                     cannot do the negation naturally. We are required to
4826 ///                     emit the subtree first in this case.
4827 /// \param WillNegate   Is true if are called when the result of this
4828 ///                     subexpression must be negated. This happens when the
4829 ///                     outer expression is an OR. We can use this fact to know
4830 ///                     that we have a double negation (or (or ...) ...) that
4831 ///                     can be implemented for free.
4832 static bool canEmitConjunction(Register Val, bool &CanNegate, bool &MustBeFirst,
4833                                bool WillNegate, MachineRegisterInfo &MRI,
4834                                unsigned Depth = 0) {
4835   if (!MRI.hasOneNonDBGUse(Val))
4836     return false;
4837   MachineInstr *ValDef = MRI.getVRegDef(Val);
4838   unsigned Opcode = ValDef->getOpcode();
4839   if (isa<GAnyCmp>(ValDef)) {
4840     CanNegate = true;
4841     MustBeFirst = false;
4842     return true;
4843   }
4844   // Protect against exponential runtime and stack overflow.
4845   if (Depth > 6)
4846     return false;
4847   if (Opcode == TargetOpcode::G_AND || Opcode == TargetOpcode::G_OR) {
4848     bool IsOR = Opcode == TargetOpcode::G_OR;
4849     Register O0 = ValDef->getOperand(1).getReg();
4850     Register O1 = ValDef->getOperand(2).getReg();
4851     bool CanNegateL;
4852     bool MustBeFirstL;
4853     if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, MRI, Depth + 1))
4854       return false;
4855     bool CanNegateR;
4856     bool MustBeFirstR;
4857     if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, MRI, Depth + 1))
4858       return false;
4859 
4860     if (MustBeFirstL && MustBeFirstR)
4861       return false;
4862 
4863     if (IsOR) {
4864       // For an OR expression we need to be able to naturally negate at least
4865       // one side or we cannot do the transformation at all.
4866       if (!CanNegateL && !CanNegateR)
4867         return false;
4868       // If we the result of the OR will be negated and we can naturally negate
4869       // the leaves, then this sub-tree as a whole negates naturally.
4870       CanNegate = WillNegate && CanNegateL && CanNegateR;
4871       // If we cannot naturally negate the whole sub-tree, then this must be
4872       // emitted first.
4873       MustBeFirst = !CanNegate;
4874     } else {
4875       assert(Opcode == TargetOpcode::G_AND && "Must be G_AND");
4876       // We cannot naturally negate an AND operation.
4877       CanNegate = false;
4878       MustBeFirst = MustBeFirstL || MustBeFirstR;
4879     }
4880     return true;
4881   }
4882   return false;
4883 }
4884 
4885 MachineInstr *AArch64InstructionSelector::emitConditionalComparison(
4886     Register LHS, Register RHS, CmpInst::Predicate CC,
4887     AArch64CC::CondCode Predicate, AArch64CC::CondCode OutCC,
4888     MachineIRBuilder &MIB) const {
4889   // TODO: emit CMN as an optimization.
4890   auto &MRI = *MIB.getMRI();
4891   LLT OpTy = MRI.getType(LHS);
4892   assert(OpTy.getSizeInBits() == 32 || OpTy.getSizeInBits() == 64);
4893   unsigned CCmpOpc;
4894   std::optional<ValueAndVReg> C;
4895   if (CmpInst::isIntPredicate(CC)) {
4896     C = getIConstantVRegValWithLookThrough(RHS, MRI);
4897     if (C && C->Value.ult(32))
4898       CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWi : AArch64::CCMPXi;
4899     else
4900       CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWr : AArch64::CCMPXr;
4901   } else {
4902     switch (OpTy.getSizeInBits()) {
4903     case 16:
4904       CCmpOpc = AArch64::FCCMPHrr;
4905       break;
4906     case 32:
4907       CCmpOpc = AArch64::FCCMPSrr;
4908       break;
4909     case 64:
4910       CCmpOpc = AArch64::FCCMPDrr;
4911       break;
4912     default:
4913       return nullptr;
4914     }
4915   }
4916   AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
4917   unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
4918   auto CCmp =
4919       MIB.buildInstr(CCmpOpc, {}, {LHS});
4920   if (CCmpOpc == AArch64::CCMPWi || CCmpOpc == AArch64::CCMPXi)
4921     CCmp.addImm(C->Value.getZExtValue());
4922   else
4923     CCmp.addReg(RHS);
4924   CCmp.addImm(NZCV).addImm(Predicate);
4925   constrainSelectedInstRegOperands(*CCmp, TII, TRI, RBI);
4926   return &*CCmp;
4927 }
4928 
4929 MachineInstr *AArch64InstructionSelector::emitConjunctionRec(
4930     Register Val, AArch64CC::CondCode &OutCC, bool Negate, Register CCOp,
4931     AArch64CC::CondCode Predicate, MachineIRBuilder &MIB) const {
4932   // We're at a tree leaf, produce a conditional comparison operation.
4933   auto &MRI = *MIB.getMRI();
4934   MachineInstr *ValDef = MRI.getVRegDef(Val);
4935   unsigned Opcode = ValDef->getOpcode();
4936   if (auto *Cmp = dyn_cast<GAnyCmp>(ValDef)) {
4937     Register LHS = Cmp->getLHSReg();
4938     Register RHS = Cmp->getRHSReg();
4939     CmpInst::Predicate CC = Cmp->getCond();
4940     if (Negate)
4941       CC = CmpInst::getInversePredicate(CC);
4942     if (isa<GICmp>(Cmp)) {
4943       OutCC = changeICMPPredToAArch64CC(CC);
4944     } else {
4945       // Handle special FP cases.
4946       AArch64CC::CondCode ExtraCC;
4947       changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
4948       // Some floating point conditions can't be tested with a single condition
4949       // code. Construct an additional comparison in this case.
4950       if (ExtraCC != AArch64CC::AL) {
4951         MachineInstr *ExtraCmp;
4952         if (!CCOp)
4953           ExtraCmp = emitFPCompare(LHS, RHS, MIB, CC);
4954         else
4955           ExtraCmp =
4956               emitConditionalComparison(LHS, RHS, CC, Predicate, ExtraCC, MIB);
4957         CCOp = ExtraCmp->getOperand(0).getReg();
4958         Predicate = ExtraCC;
4959       }
4960     }
4961 
4962     // Produce a normal comparison if we are first in the chain
4963     if (!CCOp) {
4964       auto Dst = MRI.cloneVirtualRegister(LHS);
4965       if (isa<GICmp>(Cmp))
4966         return emitSUBS(Dst, Cmp->getOperand(2), Cmp->getOperand(3), MIB);
4967       return emitFPCompare(Cmp->getOperand(2).getReg(),
4968                            Cmp->getOperand(3).getReg(), MIB);
4969     }
4970     // Otherwise produce a ccmp.
4971     return emitConditionalComparison(LHS, RHS, CC, Predicate, OutCC, MIB);
4972   }
4973   assert(MRI.hasOneNonDBGUse(Val) && "Valid conjunction/disjunction tree");
4974 
4975   bool IsOR = Opcode == TargetOpcode::G_OR;
4976 
4977   Register LHS = ValDef->getOperand(1).getReg();
4978   bool CanNegateL;
4979   bool MustBeFirstL;
4980   bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR, MRI);
4981   assert(ValidL && "Valid conjunction/disjunction tree");
4982   (void)ValidL;
4983 
4984   Register RHS = ValDef->getOperand(2).getReg();
4985   bool CanNegateR;
4986   bool MustBeFirstR;
4987   bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR, MRI);
4988   assert(ValidR && "Valid conjunction/disjunction tree");
4989   (void)ValidR;
4990 
4991   // Swap sub-tree that must come first to the right side.
4992   if (MustBeFirstL) {
4993     assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
4994     std::swap(LHS, RHS);
4995     std::swap(CanNegateL, CanNegateR);
4996     std::swap(MustBeFirstL, MustBeFirstR);
4997   }
4998 
4999   bool NegateR;
5000   bool NegateAfterR;
5001   bool NegateL;
5002   bool NegateAfterAll;
5003   if (Opcode == TargetOpcode::G_OR) {
5004     // Swap the sub-tree that we can negate naturally to the left.
5005     if (!CanNegateL) {
5006       assert(CanNegateR && "at least one side must be negatable");
5007       assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
5008       assert(!Negate);
5009       std::swap(LHS, RHS);
5010       NegateR = false;
5011       NegateAfterR = true;
5012     } else {
5013       // Negate the left sub-tree if possible, otherwise negate the result.
5014       NegateR = CanNegateR;
5015       NegateAfterR = !CanNegateR;
5016     }
5017     NegateL = true;
5018     NegateAfterAll = !Negate;
5019   } else {
5020     assert(Opcode == TargetOpcode::G_AND &&
5021            "Valid conjunction/disjunction tree");
5022     assert(!Negate && "Valid conjunction/disjunction tree");
5023 
5024     NegateL = false;
5025     NegateR = false;
5026     NegateAfterR = false;
5027     NegateAfterAll = false;
5028   }
5029 
5030   // Emit sub-trees.
5031   AArch64CC::CondCode RHSCC;
5032   MachineInstr *CmpR =
5033       emitConjunctionRec(RHS, RHSCC, NegateR, CCOp, Predicate, MIB);
5034   if (NegateAfterR)
5035     RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
5036   MachineInstr *CmpL = emitConjunctionRec(
5037       LHS, OutCC, NegateL, CmpR->getOperand(0).getReg(), RHSCC, MIB);
5038   if (NegateAfterAll)
5039     OutCC = AArch64CC::getInvertedCondCode(OutCC);
5040   return CmpL;
5041 }
5042 
5043 MachineInstr *AArch64InstructionSelector::emitConjunction(
5044     Register Val, AArch64CC::CondCode &OutCC, MachineIRBuilder &MIB) const {
5045   bool DummyCanNegate;
5046   bool DummyMustBeFirst;
5047   if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false,
5048                           *MIB.getMRI()))
5049     return nullptr;
5050   return emitConjunctionRec(Val, OutCC, false, Register(), AArch64CC::AL, MIB);
5051 }
5052 
5053 bool AArch64InstructionSelector::tryOptSelectConjunction(GSelect &SelI,
5054                                                          MachineInstr &CondMI) {
5055   AArch64CC::CondCode AArch64CC;
5056   MachineInstr *ConjMI = emitConjunction(SelI.getCondReg(), AArch64CC, MIB);
5057   if (!ConjMI)
5058     return false;
5059 
5060   emitSelect(SelI.getReg(0), SelI.getTrueReg(), SelI.getFalseReg(), AArch64CC, MIB);
5061   SelI.eraseFromParent();
5062   return true;
5063 }
5064 
5065 bool AArch64InstructionSelector::tryOptSelect(GSelect &I) {
5066   MachineRegisterInfo &MRI = *MIB.getMRI();
5067   // We want to recognize this pattern:
5068   //
5069   // $z = G_FCMP pred, $x, $y
5070   // ...
5071   // $w = G_SELECT $z, $a, $b
5072   //
5073   // Where the value of $z is *only* ever used by the G_SELECT (possibly with
5074   // some copies/truncs in between.)
5075   //
5076   // If we see this, then we can emit something like this:
5077   //
5078   // fcmp $x, $y
5079   // fcsel $w, $a, $b, pred
5080   //
5081   // Rather than emitting both of the rather long sequences in the standard
5082   // G_FCMP/G_SELECT select methods.
5083 
5084   // First, check if the condition is defined by a compare.
5085   MachineInstr *CondDef = MRI.getVRegDef(I.getOperand(1).getReg());
5086 
5087   // We can only fold if all of the defs have one use.
5088   Register CondDefReg = CondDef->getOperand(0).getReg();
5089   if (!MRI.hasOneNonDBGUse(CondDefReg)) {
5090     // Unless it's another select.
5091     for (const MachineInstr &UI : MRI.use_nodbg_instructions(CondDefReg)) {
5092       if (CondDef == &UI)
5093         continue;
5094       if (UI.getOpcode() != TargetOpcode::G_SELECT)
5095         return false;
5096     }
5097   }
5098 
5099   // Is the condition defined by a compare?
5100   unsigned CondOpc = CondDef->getOpcode();
5101   if (CondOpc != TargetOpcode::G_ICMP && CondOpc != TargetOpcode::G_FCMP) {
5102     if (tryOptSelectConjunction(I, *CondDef))
5103       return true;
5104     return false;
5105   }
5106 
5107   AArch64CC::CondCode CondCode;
5108   if (CondOpc == TargetOpcode::G_ICMP) {
5109     auto Pred =
5110         static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate());
5111     CondCode = changeICMPPredToAArch64CC(Pred);
5112     emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3),
5113                        CondDef->getOperand(1), MIB);
5114   } else {
5115     // Get the condition code for the select.
5116     auto Pred =
5117         static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate());
5118     AArch64CC::CondCode CondCode2;
5119     changeFCMPPredToAArch64CC(Pred, CondCode, CondCode2);
5120 
5121     // changeFCMPPredToAArch64CC sets CondCode2 to AL when we require two
5122     // instructions to emit the comparison.
5123     // TODO: Handle FCMP_UEQ and FCMP_ONE. After that, this check will be
5124     // unnecessary.
5125     if (CondCode2 != AArch64CC::AL)
5126       return false;
5127 
5128     if (!emitFPCompare(CondDef->getOperand(2).getReg(),
5129                        CondDef->getOperand(3).getReg(), MIB)) {
5130       LLVM_DEBUG(dbgs() << "Couldn't emit compare for select!\n");
5131       return false;
5132     }
5133   }
5134 
5135   // Emit the select.
5136   emitSelect(I.getOperand(0).getReg(), I.getOperand(2).getReg(),
5137              I.getOperand(3).getReg(), CondCode, MIB);
5138   I.eraseFromParent();
5139   return true;
5140 }
5141 
5142 MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare(
5143     MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
5144     MachineIRBuilder &MIRBuilder) const {
5145   assert(LHS.isReg() && RHS.isReg() && Predicate.isPredicate() &&
5146          "Unexpected MachineOperand");
5147   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
5148   // We want to find this sort of thing:
5149   // x = G_SUB 0, y
5150   // G_ICMP z, x
5151   //
5152   // In this case, we can fold the G_SUB into the G_ICMP using a CMN instead.
5153   // e.g:
5154   //
5155   // cmn z, y
5156 
5157   // Check if the RHS or LHS of the G_ICMP is defined by a SUB
5158   MachineInstr *LHSDef = getDefIgnoringCopies(LHS.getReg(), MRI);
5159   MachineInstr *RHSDef = getDefIgnoringCopies(RHS.getReg(), MRI);
5160   auto P = static_cast<CmpInst::Predicate>(Predicate.getPredicate());
5161   // Given this:
5162   //
5163   // x = G_SUB 0, y
5164   // G_ICMP x, z
5165   //
5166   // Produce this:
5167   //
5168   // cmn y, z
5169   if (isCMN(LHSDef, P, MRI))
5170     return emitCMN(LHSDef->getOperand(2), RHS, MIRBuilder);
5171 
5172   // Same idea here, but with the RHS of the compare instead:
5173   //
5174   // Given this:
5175   //
5176   // x = G_SUB 0, y
5177   // G_ICMP z, x
5178   //
5179   // Produce this:
5180   //
5181   // cmn z, y
5182   if (isCMN(RHSDef, P, MRI))
5183     return emitCMN(LHS, RHSDef->getOperand(2), MIRBuilder);
5184 
5185   // Given this:
5186   //
5187   // z = G_AND x, y
5188   // G_ICMP z, 0
5189   //
5190   // Produce this if the compare is signed:
5191   //
5192   // tst x, y
5193   if (!CmpInst::isUnsigned(P) && LHSDef &&
5194       LHSDef->getOpcode() == TargetOpcode::G_AND) {
5195     // Make sure that the RHS is 0.
5196     auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS.getReg(), MRI);
5197     if (!ValAndVReg || ValAndVReg->Value != 0)
5198       return nullptr;
5199 
5200     return emitTST(LHSDef->getOperand(1),
5201                    LHSDef->getOperand(2), MIRBuilder);
5202   }
5203 
5204   return nullptr;
5205 }
5206 
5207 bool AArch64InstructionSelector::selectShuffleVector(
5208     MachineInstr &I, MachineRegisterInfo &MRI) {
5209   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
5210   Register Src1Reg = I.getOperand(1).getReg();
5211   const LLT Src1Ty = MRI.getType(Src1Reg);
5212   Register Src2Reg = I.getOperand(2).getReg();
5213   const LLT Src2Ty = MRI.getType(Src2Reg);
5214   ArrayRef<int> Mask = I.getOperand(3).getShuffleMask();
5215 
5216   MachineBasicBlock &MBB = *I.getParent();
5217   MachineFunction &MF = *MBB.getParent();
5218   LLVMContext &Ctx = MF.getFunction().getContext();
5219 
5220   // G_SHUFFLE_VECTOR is weird in that the source operands can be scalars, if
5221   // it's originated from a <1 x T> type. Those should have been lowered into
5222   // G_BUILD_VECTOR earlier.
5223   if (!Src1Ty.isVector() || !Src2Ty.isVector()) {
5224     LLVM_DEBUG(dbgs() << "Could not select a \"scalar\" G_SHUFFLE_VECTOR\n");
5225     return false;
5226   }
5227 
5228   unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8;
5229 
5230   SmallVector<Constant *, 64> CstIdxs;
5231   for (int Val : Mask) {
5232     // For now, any undef indexes we'll just assume to be 0. This should be
5233     // optimized in future, e.g. to select DUP etc.
5234     Val = Val < 0 ? 0 : Val;
5235     for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
5236       unsigned Offset = Byte + Val * BytesPerElt;
5237       CstIdxs.emplace_back(ConstantInt::get(Type::getInt8Ty(Ctx), Offset));
5238     }
5239   }
5240 
5241   // Use a constant pool to load the index vector for TBL.
5242   Constant *CPVal = ConstantVector::get(CstIdxs);
5243   MachineInstr *IndexLoad = emitLoadFromConstantPool(CPVal, MIB);
5244   if (!IndexLoad) {
5245     LLVM_DEBUG(dbgs() << "Could not load from a constant pool");
5246     return false;
5247   }
5248 
5249   if (DstTy.getSizeInBits() != 128) {
5250     assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty");
5251     // This case can be done with TBL1.
5252     MachineInstr *Concat =
5253         emitVectorConcat(std::nullopt, Src1Reg, Src2Reg, MIB);
5254     if (!Concat) {
5255       LLVM_DEBUG(dbgs() << "Could not do vector concat for tbl1");
5256       return false;
5257     }
5258 
5259     // The constant pool load will be 64 bits, so need to convert to FPR128 reg.
5260     IndexLoad = emitScalarToVector(64, &AArch64::FPR128RegClass,
5261                                    IndexLoad->getOperand(0).getReg(), MIB);
5262 
5263     auto TBL1 = MIB.buildInstr(
5264         AArch64::TBLv16i8One, {&AArch64::FPR128RegClass},
5265         {Concat->getOperand(0).getReg(), IndexLoad->getOperand(0).getReg()});
5266     constrainSelectedInstRegOperands(*TBL1, TII, TRI, RBI);
5267 
5268     auto Copy =
5269         MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {})
5270             .addReg(TBL1.getReg(0), 0, AArch64::dsub);
5271     RBI.constrainGenericRegister(Copy.getReg(0), AArch64::FPR64RegClass, MRI);
5272     I.eraseFromParent();
5273     return true;
5274   }
5275 
5276   // For TBL2 we need to emit a REG_SEQUENCE to tie together two consecutive
5277   // Q registers for regalloc.
5278   SmallVector<Register, 2> Regs = {Src1Reg, Src2Reg};
5279   auto RegSeq = createQTuple(Regs, MIB);
5280   auto TBL2 = MIB.buildInstr(AArch64::TBLv16i8Two, {I.getOperand(0)},
5281                              {RegSeq, IndexLoad->getOperand(0)});
5282   constrainSelectedInstRegOperands(*TBL2, TII, TRI, RBI);
5283   I.eraseFromParent();
5284   return true;
5285 }
5286 
5287 MachineInstr *AArch64InstructionSelector::emitLaneInsert(
5288     std::optional<Register> DstReg, Register SrcReg, Register EltReg,
5289     unsigned LaneIdx, const RegisterBank &RB,
5290     MachineIRBuilder &MIRBuilder) const {
5291   MachineInstr *InsElt = nullptr;
5292   const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
5293   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
5294 
5295   // Create a register to define with the insert if one wasn't passed in.
5296   if (!DstReg)
5297     DstReg = MRI.createVirtualRegister(DstRC);
5298 
5299   unsigned EltSize = MRI.getType(EltReg).getSizeInBits();
5300   unsigned Opc = getInsertVecEltOpInfo(RB, EltSize).first;
5301 
5302   if (RB.getID() == AArch64::FPRRegBankID) {
5303     auto InsSub = emitScalarToVector(EltSize, DstRC, EltReg, MIRBuilder);
5304     InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg})
5305                  .addImm(LaneIdx)
5306                  .addUse(InsSub->getOperand(0).getReg())
5307                  .addImm(0);
5308   } else {
5309     InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg})
5310                  .addImm(LaneIdx)
5311                  .addUse(EltReg);
5312   }
5313 
5314   constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI);
5315   return InsElt;
5316 }
5317 
5318 bool AArch64InstructionSelector::selectUSMovFromExtend(
5319     MachineInstr &MI, MachineRegisterInfo &MRI) {
5320   if (MI.getOpcode() != TargetOpcode::G_SEXT &&
5321       MI.getOpcode() != TargetOpcode::G_ZEXT &&
5322       MI.getOpcode() != TargetOpcode::G_ANYEXT)
5323     return false;
5324   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SEXT;
5325   const Register DefReg = MI.getOperand(0).getReg();
5326   const LLT DstTy = MRI.getType(DefReg);
5327   unsigned DstSize = DstTy.getSizeInBits();
5328 
5329   if (DstSize != 32 && DstSize != 64)
5330     return false;
5331 
5332   MachineInstr *Extract = getOpcodeDef(TargetOpcode::G_EXTRACT_VECTOR_ELT,
5333                                        MI.getOperand(1).getReg(), MRI);
5334   int64_t Lane;
5335   if (!Extract || !mi_match(Extract->getOperand(2).getReg(), MRI, m_ICst(Lane)))
5336     return false;
5337   Register Src0 = Extract->getOperand(1).getReg();
5338 
5339   const LLT &VecTy = MRI.getType(Src0);
5340 
5341   if (VecTy.getSizeInBits() != 128) {
5342     const MachineInstr *ScalarToVector = emitScalarToVector(
5343         VecTy.getSizeInBits(), &AArch64::FPR128RegClass, Src0, MIB);
5344     assert(ScalarToVector && "Didn't expect emitScalarToVector to fail!");
5345     Src0 = ScalarToVector->getOperand(0).getReg();
5346   }
5347 
5348   unsigned Opcode;
5349   if (DstSize == 64 && VecTy.getScalarSizeInBits() == 32)
5350     Opcode = IsSigned ? AArch64::SMOVvi32to64 : AArch64::UMOVvi32;
5351   else if (DstSize == 64 && VecTy.getScalarSizeInBits() == 16)
5352     Opcode = IsSigned ? AArch64::SMOVvi16to64 : AArch64::UMOVvi16;
5353   else if (DstSize == 64 && VecTy.getScalarSizeInBits() == 8)
5354     Opcode = IsSigned ? AArch64::SMOVvi8to64 : AArch64::UMOVvi8;
5355   else if (DstSize == 32 && VecTy.getScalarSizeInBits() == 16)
5356     Opcode = IsSigned ? AArch64::SMOVvi16to32 : AArch64::UMOVvi16;
5357   else if (DstSize == 32 && VecTy.getScalarSizeInBits() == 8)
5358     Opcode = IsSigned ? AArch64::SMOVvi8to32 : AArch64::UMOVvi8;
5359   else
5360     llvm_unreachable("Unexpected type combo for S/UMov!");
5361 
5362   // We may need to generate one of these, depending on the type and sign of the
5363   // input:
5364   //  DstReg = SMOV Src0, Lane;
5365   //  NewReg = UMOV Src0, Lane; DstReg = SUBREG_TO_REG NewReg, sub_32;
5366   MachineInstr *ExtI = nullptr;
5367   if (DstSize == 64 && !IsSigned) {
5368     Register NewReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
5369     MIB.buildInstr(Opcode, {NewReg}, {Src0}).addImm(Lane);
5370     ExtI = MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {})
5371                .addImm(0)
5372                .addUse(NewReg)
5373                .addImm(AArch64::sub_32);
5374     RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI);
5375   } else
5376     ExtI = MIB.buildInstr(Opcode, {DefReg}, {Src0}).addImm(Lane);
5377 
5378   constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
5379   MI.eraseFromParent();
5380   return true;
5381 }
5382 
5383 bool AArch64InstructionSelector::selectInsertElt(MachineInstr &I,
5384                                                  MachineRegisterInfo &MRI) {
5385   assert(I.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT);
5386 
5387   // Get information on the destination.
5388   Register DstReg = I.getOperand(0).getReg();
5389   const LLT DstTy = MRI.getType(DstReg);
5390   unsigned VecSize = DstTy.getSizeInBits();
5391 
5392   // Get information on the element we want to insert into the destination.
5393   Register EltReg = I.getOperand(2).getReg();
5394   const LLT EltTy = MRI.getType(EltReg);
5395   unsigned EltSize = EltTy.getSizeInBits();
5396   if (EltSize < 8 || EltSize > 64)
5397     return false;
5398 
5399   // Find the definition of the index. Bail out if it's not defined by a
5400   // G_CONSTANT.
5401   Register IdxReg = I.getOperand(3).getReg();
5402   auto VRegAndVal = getIConstantVRegValWithLookThrough(IdxReg, MRI);
5403   if (!VRegAndVal)
5404     return false;
5405   unsigned LaneIdx = VRegAndVal->Value.getSExtValue();
5406 
5407   // Perform the lane insert.
5408   Register SrcReg = I.getOperand(1).getReg();
5409   const RegisterBank &EltRB = *RBI.getRegBank(EltReg, MRI, TRI);
5410 
5411   if (VecSize < 128) {
5412     // If the vector we're inserting into is smaller than 128 bits, widen it
5413     // to 128 to do the insert.
5414     MachineInstr *ScalarToVec =
5415         emitScalarToVector(VecSize, &AArch64::FPR128RegClass, SrcReg, MIB);
5416     if (!ScalarToVec)
5417       return false;
5418     SrcReg = ScalarToVec->getOperand(0).getReg();
5419   }
5420 
5421   // Create an insert into a new FPR128 register.
5422   // Note that if our vector is already 128 bits, we end up emitting an extra
5423   // register.
5424   MachineInstr *InsMI =
5425       emitLaneInsert(std::nullopt, SrcReg, EltReg, LaneIdx, EltRB, MIB);
5426 
5427   if (VecSize < 128) {
5428     // If we had to widen to perform the insert, then we have to demote back to
5429     // the original size to get the result we want.
5430     if (!emitNarrowVector(DstReg, InsMI->getOperand(0).getReg(), MIB, MRI))
5431       return false;
5432   } else {
5433     // No widening needed.
5434     InsMI->getOperand(0).setReg(DstReg);
5435     constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI);
5436   }
5437 
5438   I.eraseFromParent();
5439   return true;
5440 }
5441 
5442 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm8(
5443     Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder) {
5444   unsigned int Op;
5445   if (DstSize == 128) {
5446     if (Bits.getHiBits(64) != Bits.getLoBits(64))
5447       return nullptr;
5448     Op = AArch64::MOVIv16b_ns;
5449   } else {
5450     Op = AArch64::MOVIv8b_ns;
5451   }
5452 
5453   uint64_t Val = Bits.zextOrTrunc(64).getZExtValue();
5454 
5455   if (AArch64_AM::isAdvSIMDModImmType9(Val)) {
5456     Val = AArch64_AM::encodeAdvSIMDModImmType9(Val);
5457     auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val);
5458     constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5459     return &*Mov;
5460   }
5461   return nullptr;
5462 }
5463 
5464 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm16(
5465     Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder,
5466     bool Inv) {
5467 
5468   unsigned int Op;
5469   if (DstSize == 128) {
5470     if (Bits.getHiBits(64) != Bits.getLoBits(64))
5471       return nullptr;
5472     Op = Inv ? AArch64::MVNIv8i16 : AArch64::MOVIv8i16;
5473   } else {
5474     Op = Inv ? AArch64::MVNIv4i16 : AArch64::MOVIv4i16;
5475   }
5476 
5477   uint64_t Val = Bits.zextOrTrunc(64).getZExtValue();
5478   uint64_t Shift;
5479 
5480   if (AArch64_AM::isAdvSIMDModImmType5(Val)) {
5481     Val = AArch64_AM::encodeAdvSIMDModImmType5(Val);
5482     Shift = 0;
5483   } else if (AArch64_AM::isAdvSIMDModImmType6(Val)) {
5484     Val = AArch64_AM::encodeAdvSIMDModImmType6(Val);
5485     Shift = 8;
5486   } else
5487     return nullptr;
5488 
5489   auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val).addImm(Shift);
5490   constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5491   return &*Mov;
5492 }
5493 
5494 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm32(
5495     Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder,
5496     bool Inv) {
5497 
5498   unsigned int Op;
5499   if (DstSize == 128) {
5500     if (Bits.getHiBits(64) != Bits.getLoBits(64))
5501       return nullptr;
5502     Op = Inv ? AArch64::MVNIv4i32 : AArch64::MOVIv4i32;
5503   } else {
5504     Op = Inv ? AArch64::MVNIv2i32 : AArch64::MOVIv2i32;
5505   }
5506 
5507   uint64_t Val = Bits.zextOrTrunc(64).getZExtValue();
5508   uint64_t Shift;
5509 
5510   if ((AArch64_AM::isAdvSIMDModImmType1(Val))) {
5511     Val = AArch64_AM::encodeAdvSIMDModImmType1(Val);
5512     Shift = 0;
5513   } else if ((AArch64_AM::isAdvSIMDModImmType2(Val))) {
5514     Val = AArch64_AM::encodeAdvSIMDModImmType2(Val);
5515     Shift = 8;
5516   } else if ((AArch64_AM::isAdvSIMDModImmType3(Val))) {
5517     Val = AArch64_AM::encodeAdvSIMDModImmType3(Val);
5518     Shift = 16;
5519   } else if ((AArch64_AM::isAdvSIMDModImmType4(Val))) {
5520     Val = AArch64_AM::encodeAdvSIMDModImmType4(Val);
5521     Shift = 24;
5522   } else
5523     return nullptr;
5524 
5525   auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val).addImm(Shift);
5526   constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5527   return &*Mov;
5528 }
5529 
5530 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm64(
5531     Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder) {
5532 
5533   unsigned int Op;
5534   if (DstSize == 128) {
5535     if (Bits.getHiBits(64) != Bits.getLoBits(64))
5536       return nullptr;
5537     Op = AArch64::MOVIv2d_ns;
5538   } else {
5539     Op = AArch64::MOVID;
5540   }
5541 
5542   uint64_t Val = Bits.zextOrTrunc(64).getZExtValue();
5543   if (AArch64_AM::isAdvSIMDModImmType10(Val)) {
5544     Val = AArch64_AM::encodeAdvSIMDModImmType10(Val);
5545     auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val);
5546     constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5547     return &*Mov;
5548   }
5549   return nullptr;
5550 }
5551 
5552 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm321s(
5553     Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder,
5554     bool Inv) {
5555 
5556   unsigned int Op;
5557   if (DstSize == 128) {
5558     if (Bits.getHiBits(64) != Bits.getLoBits(64))
5559       return nullptr;
5560     Op = Inv ? AArch64::MVNIv4s_msl : AArch64::MOVIv4s_msl;
5561   } else {
5562     Op = Inv ? AArch64::MVNIv2s_msl : AArch64::MOVIv2s_msl;
5563   }
5564 
5565   uint64_t Val = Bits.zextOrTrunc(64).getZExtValue();
5566   uint64_t Shift;
5567 
5568   if (AArch64_AM::isAdvSIMDModImmType7(Val)) {
5569     Val = AArch64_AM::encodeAdvSIMDModImmType7(Val);
5570     Shift = 264;
5571   } else if (AArch64_AM::isAdvSIMDModImmType8(Val)) {
5572     Val = AArch64_AM::encodeAdvSIMDModImmType8(Val);
5573     Shift = 272;
5574   } else
5575     return nullptr;
5576 
5577   auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val).addImm(Shift);
5578   constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5579   return &*Mov;
5580 }
5581 
5582 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImmFP(
5583     Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder) {
5584 
5585   unsigned int Op;
5586   bool IsWide = false;
5587   if (DstSize == 128) {
5588     if (Bits.getHiBits(64) != Bits.getLoBits(64))
5589       return nullptr;
5590     Op = AArch64::FMOVv4f32_ns;
5591     IsWide = true;
5592   } else {
5593     Op = AArch64::FMOVv2f32_ns;
5594   }
5595 
5596   uint64_t Val = Bits.zextOrTrunc(64).getZExtValue();
5597 
5598   if (AArch64_AM::isAdvSIMDModImmType11(Val)) {
5599     Val = AArch64_AM::encodeAdvSIMDModImmType11(Val);
5600   } else if (IsWide && AArch64_AM::isAdvSIMDModImmType12(Val)) {
5601     Val = AArch64_AM::encodeAdvSIMDModImmType12(Val);
5602     Op = AArch64::FMOVv2f64_ns;
5603   } else
5604     return nullptr;
5605 
5606   auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val);
5607   constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5608   return &*Mov;
5609 }
5610 
5611 bool AArch64InstructionSelector::selectIndexedExtLoad(
5612     MachineInstr &MI, MachineRegisterInfo &MRI) {
5613   auto &ExtLd = cast<GIndexedAnyExtLoad>(MI);
5614   Register Dst = ExtLd.getDstReg();
5615   Register WriteBack = ExtLd.getWritebackReg();
5616   Register Base = ExtLd.getBaseReg();
5617   Register Offset = ExtLd.getOffsetReg();
5618   LLT Ty = MRI.getType(Dst);
5619   assert(Ty.getSizeInBits() <= 64); // Only for scalar GPRs.
5620   unsigned MemSizeBits = ExtLd.getMMO().getMemoryType().getSizeInBits();
5621   bool IsPre = ExtLd.isPre();
5622   bool IsSExt = isa<GIndexedSExtLoad>(ExtLd);
5623   bool InsertIntoXReg = false;
5624   bool IsDst64 = Ty.getSizeInBits() == 64;
5625 
5626   unsigned Opc = 0;
5627   LLT NewLdDstTy;
5628   LLT s32 = LLT::scalar(32);
5629   LLT s64 = LLT::scalar(64);
5630 
5631   if (MemSizeBits == 8) {
5632     if (IsSExt) {
5633       if (IsDst64)
5634         Opc = IsPre ? AArch64::LDRSBXpre : AArch64::LDRSBXpost;
5635       else
5636         Opc = IsPre ? AArch64::LDRSBWpre : AArch64::LDRSBWpost;
5637       NewLdDstTy = IsDst64 ? s64 : s32;
5638     } else {
5639       Opc = IsPre ? AArch64::LDRBBpre : AArch64::LDRBBpost;
5640       InsertIntoXReg = IsDst64;
5641       NewLdDstTy = s32;
5642     }
5643   } else if (MemSizeBits == 16) {
5644     if (IsSExt) {
5645       if (IsDst64)
5646         Opc = IsPre ? AArch64::LDRSHXpre : AArch64::LDRSHXpost;
5647       else
5648         Opc = IsPre ? AArch64::LDRSHWpre : AArch64::LDRSHWpost;
5649       NewLdDstTy = IsDst64 ? s64 : s32;
5650     } else {
5651       Opc = IsPre ? AArch64::LDRHHpre : AArch64::LDRHHpost;
5652       InsertIntoXReg = IsDst64;
5653       NewLdDstTy = s32;
5654     }
5655   } else if (MemSizeBits == 32) {
5656     if (IsSExt) {
5657       Opc = IsPre ? AArch64::LDRSWpre : AArch64::LDRSWpost;
5658       NewLdDstTy = s64;
5659     } else {
5660       Opc = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost;
5661       InsertIntoXReg = IsDst64;
5662       NewLdDstTy = s32;
5663     }
5664   } else {
5665     llvm_unreachable("Unexpected size for indexed load");
5666   }
5667 
5668   if (RBI.getRegBank(Dst, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5669     return false; // We should be on gpr.
5670 
5671   auto Cst = getIConstantVRegVal(Offset, MRI);
5672   if (!Cst)
5673     return false; // Shouldn't happen, but just in case.
5674 
5675   auto LdMI = MIB.buildInstr(Opc, {WriteBack, NewLdDstTy}, {Base})
5676                   .addImm(Cst->getSExtValue());
5677   LdMI.cloneMemRefs(ExtLd);
5678   constrainSelectedInstRegOperands(*LdMI, TII, TRI, RBI);
5679   // Make sure to select the load with the MemTy as the dest type, and then
5680   // insert into X reg if needed.
5681   if (InsertIntoXReg) {
5682     // Generate a SUBREG_TO_REG.
5683     auto SubToReg = MIB.buildInstr(TargetOpcode::SUBREG_TO_REG, {Dst}, {})
5684                         .addImm(0)
5685                         .addUse(LdMI.getReg(1))
5686                         .addImm(AArch64::sub_32);
5687     RBI.constrainGenericRegister(SubToReg.getReg(0), AArch64::GPR64RegClass,
5688                                  MRI);
5689   } else {
5690     auto Copy = MIB.buildCopy(Dst, LdMI.getReg(1));
5691     selectCopy(*Copy, TII, MRI, TRI, RBI);
5692   }
5693   MI.eraseFromParent();
5694 
5695   return true;
5696 }
5697 
5698 bool AArch64InstructionSelector::selectIndexedLoad(MachineInstr &MI,
5699                                                    MachineRegisterInfo &MRI) {
5700   auto &Ld = cast<GIndexedLoad>(MI);
5701   Register Dst = Ld.getDstReg();
5702   Register WriteBack = Ld.getWritebackReg();
5703   Register Base = Ld.getBaseReg();
5704   Register Offset = Ld.getOffsetReg();
5705   assert(MRI.getType(Dst).getSizeInBits() <= 128 &&
5706          "Unexpected type for indexed load");
5707   unsigned MemSize = Ld.getMMO().getMemoryType().getSizeInBytes();
5708 
5709   if (MemSize < MRI.getType(Dst).getSizeInBytes())
5710     return selectIndexedExtLoad(MI, MRI);
5711 
5712   unsigned Opc = 0;
5713   if (Ld.isPre()) {
5714     static constexpr unsigned GPROpcodes[] = {
5715         AArch64::LDRBBpre, AArch64::LDRHHpre, AArch64::LDRWpre,
5716         AArch64::LDRXpre};
5717     static constexpr unsigned FPROpcodes[] = {
5718         AArch64::LDRBpre, AArch64::LDRHpre, AArch64::LDRSpre, AArch64::LDRDpre,
5719         AArch64::LDRQpre};
5720     if (RBI.getRegBank(Dst, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5721       Opc = FPROpcodes[Log2_32(MemSize)];
5722     else
5723       Opc = GPROpcodes[Log2_32(MemSize)];
5724   } else {
5725     static constexpr unsigned GPROpcodes[] = {
5726         AArch64::LDRBBpost, AArch64::LDRHHpost, AArch64::LDRWpost,
5727         AArch64::LDRXpost};
5728     static constexpr unsigned FPROpcodes[] = {
5729         AArch64::LDRBpost, AArch64::LDRHpost, AArch64::LDRSpost,
5730         AArch64::LDRDpost, AArch64::LDRQpost};
5731     if (RBI.getRegBank(Dst, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5732       Opc = FPROpcodes[Log2_32(MemSize)];
5733     else
5734       Opc = GPROpcodes[Log2_32(MemSize)];
5735   }
5736   auto Cst = getIConstantVRegVal(Offset, MRI);
5737   if (!Cst)
5738     return false; // Shouldn't happen, but just in case.
5739   auto LdMI =
5740       MIB.buildInstr(Opc, {WriteBack, Dst}, {Base}).addImm(Cst->getSExtValue());
5741   LdMI.cloneMemRefs(Ld);
5742   constrainSelectedInstRegOperands(*LdMI, TII, TRI, RBI);
5743   MI.eraseFromParent();
5744   return true;
5745 }
5746 
5747 bool AArch64InstructionSelector::selectIndexedStore(GIndexedStore &I,
5748                                                     MachineRegisterInfo &MRI) {
5749   Register Dst = I.getWritebackReg();
5750   Register Val = I.getValueReg();
5751   Register Base = I.getBaseReg();
5752   Register Offset = I.getOffsetReg();
5753   LLT ValTy = MRI.getType(Val);
5754   assert(ValTy.getSizeInBits() <= 128 && "Unexpected type for indexed store");
5755 
5756   unsigned Opc = 0;
5757   if (I.isPre()) {
5758     static constexpr unsigned GPROpcodes[] = {
5759         AArch64::STRBBpre, AArch64::STRHHpre, AArch64::STRWpre,
5760         AArch64::STRXpre};
5761     static constexpr unsigned FPROpcodes[] = {
5762         AArch64::STRBpre, AArch64::STRHpre, AArch64::STRSpre, AArch64::STRDpre,
5763         AArch64::STRQpre};
5764 
5765     if (RBI.getRegBank(Val, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5766       Opc = FPROpcodes[Log2_32(ValTy.getSizeInBytes())];
5767     else
5768       Opc = GPROpcodes[Log2_32(ValTy.getSizeInBytes())];
5769   } else {
5770     static constexpr unsigned GPROpcodes[] = {
5771         AArch64::STRBBpost, AArch64::STRHHpost, AArch64::STRWpost,
5772         AArch64::STRXpost};
5773     static constexpr unsigned FPROpcodes[] = {
5774         AArch64::STRBpost, AArch64::STRHpost, AArch64::STRSpost,
5775         AArch64::STRDpost, AArch64::STRQpost};
5776 
5777     if (RBI.getRegBank(Val, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5778       Opc = FPROpcodes[Log2_32(ValTy.getSizeInBytes())];
5779     else
5780       Opc = GPROpcodes[Log2_32(ValTy.getSizeInBytes())];
5781   }
5782 
5783   auto Cst = getIConstantVRegVal(Offset, MRI);
5784   if (!Cst)
5785     return false; // Shouldn't happen, but just in case.
5786   auto Str =
5787       MIB.buildInstr(Opc, {Dst}, {Val, Base}).addImm(Cst->getSExtValue());
5788   Str.cloneMemRefs(I);
5789   constrainSelectedInstRegOperands(*Str, TII, TRI, RBI);
5790   I.eraseFromParent();
5791   return true;
5792 }
5793 
5794 MachineInstr *
5795 AArch64InstructionSelector::emitConstantVector(Register Dst, Constant *CV,
5796                                                MachineIRBuilder &MIRBuilder,
5797                                                MachineRegisterInfo &MRI) {
5798   LLT DstTy = MRI.getType(Dst);
5799   unsigned DstSize = DstTy.getSizeInBits();
5800   if (CV->isNullValue()) {
5801     if (DstSize == 128) {
5802       auto Mov =
5803           MIRBuilder.buildInstr(AArch64::MOVIv2d_ns, {Dst}, {}).addImm(0);
5804       constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5805       return &*Mov;
5806     }
5807 
5808     if (DstSize == 64) {
5809       auto Mov =
5810           MIRBuilder
5811               .buildInstr(AArch64::MOVIv2d_ns, {&AArch64::FPR128RegClass}, {})
5812               .addImm(0);
5813       auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {Dst}, {})
5814                       .addReg(Mov.getReg(0), 0, AArch64::dsub);
5815       RBI.constrainGenericRegister(Dst, AArch64::FPR64RegClass, MRI);
5816       return &*Copy;
5817     }
5818   }
5819 
5820   if (CV->getSplatValue()) {
5821     APInt DefBits = APInt::getSplat(DstSize, CV->getUniqueInteger());
5822     MachineInstr *NewOp;
5823     bool Inv = false;
5824     if ((NewOp = tryAdvSIMDModImm64(Dst, DstSize, DefBits, MIRBuilder)) ||
5825         (NewOp = tryAdvSIMDModImm32(Dst, DstSize, DefBits, MIRBuilder, Inv)) ||
5826         (NewOp =
5827              tryAdvSIMDModImm321s(Dst, DstSize, DefBits, MIRBuilder, Inv)) ||
5828         (NewOp = tryAdvSIMDModImm16(Dst, DstSize, DefBits, MIRBuilder, Inv)) ||
5829         (NewOp = tryAdvSIMDModImm8(Dst, DstSize, DefBits, MIRBuilder)) ||
5830         (NewOp = tryAdvSIMDModImmFP(Dst, DstSize, DefBits, MIRBuilder)))
5831       return NewOp;
5832 
5833     DefBits = ~DefBits;
5834     Inv = true;
5835     if ((NewOp = tryAdvSIMDModImm32(Dst, DstSize, DefBits, MIRBuilder, Inv)) ||
5836         (NewOp =
5837              tryAdvSIMDModImm321s(Dst, DstSize, DefBits, MIRBuilder, Inv)) ||
5838         (NewOp = tryAdvSIMDModImm16(Dst, DstSize, DefBits, MIRBuilder, Inv)))
5839       return NewOp;
5840   }
5841 
5842   auto *CPLoad = emitLoadFromConstantPool(CV, MIRBuilder);
5843   if (!CPLoad) {
5844     LLVM_DEBUG(dbgs() << "Could not generate cp load for constant vector!");
5845     return nullptr;
5846   }
5847 
5848   auto Copy = MIRBuilder.buildCopy(Dst, CPLoad->getOperand(0));
5849   RBI.constrainGenericRegister(
5850       Dst, *MRI.getRegClass(CPLoad->getOperand(0).getReg()), MRI);
5851   return &*Copy;
5852 }
5853 
5854 bool AArch64InstructionSelector::tryOptConstantBuildVec(
5855     MachineInstr &I, LLT DstTy, MachineRegisterInfo &MRI) {
5856   assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
5857   unsigned DstSize = DstTy.getSizeInBits();
5858   assert(DstSize <= 128 && "Unexpected build_vec type!");
5859   if (DstSize < 32)
5860     return false;
5861   // Check if we're building a constant vector, in which case we want to
5862   // generate a constant pool load instead of a vector insert sequence.
5863   SmallVector<Constant *, 16> Csts;
5864   for (unsigned Idx = 1; Idx < I.getNumOperands(); ++Idx) {
5865     // Try to find G_CONSTANT or G_FCONSTANT
5866     auto *OpMI =
5867         getOpcodeDef(TargetOpcode::G_CONSTANT, I.getOperand(Idx).getReg(), MRI);
5868     if (OpMI)
5869       Csts.emplace_back(
5870           const_cast<ConstantInt *>(OpMI->getOperand(1).getCImm()));
5871     else if ((OpMI = getOpcodeDef(TargetOpcode::G_FCONSTANT,
5872                                   I.getOperand(Idx).getReg(), MRI)))
5873       Csts.emplace_back(
5874           const_cast<ConstantFP *>(OpMI->getOperand(1).getFPImm()));
5875     else
5876       return false;
5877   }
5878   Constant *CV = ConstantVector::get(Csts);
5879   if (!emitConstantVector(I.getOperand(0).getReg(), CV, MIB, MRI))
5880     return false;
5881   I.eraseFromParent();
5882   return true;
5883 }
5884 
5885 bool AArch64InstructionSelector::tryOptBuildVecToSubregToReg(
5886     MachineInstr &I, MachineRegisterInfo &MRI) {
5887   // Given:
5888   //  %vec = G_BUILD_VECTOR %elt, %undef, %undef, ... %undef
5889   //
5890   // Select the G_BUILD_VECTOR as a SUBREG_TO_REG from %elt.
5891   Register Dst = I.getOperand(0).getReg();
5892   Register EltReg = I.getOperand(1).getReg();
5893   LLT EltTy = MRI.getType(EltReg);
5894   // If the index isn't on the same bank as its elements, then this can't be a
5895   // SUBREG_TO_REG.
5896   const RegisterBank &EltRB = *RBI.getRegBank(EltReg, MRI, TRI);
5897   const RegisterBank &DstRB = *RBI.getRegBank(Dst, MRI, TRI);
5898   if (EltRB != DstRB)
5899     return false;
5900   if (any_of(drop_begin(I.operands(), 2), [&MRI](const MachineOperand &Op) {
5901         return !getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, Op.getReg(), MRI);
5902       }))
5903     return false;
5904   unsigned SubReg;
5905   const TargetRegisterClass *EltRC = getRegClassForTypeOnBank(EltTy, EltRB);
5906   if (!EltRC)
5907     return false;
5908   const TargetRegisterClass *DstRC =
5909       getRegClassForTypeOnBank(MRI.getType(Dst), DstRB);
5910   if (!DstRC)
5911     return false;
5912   if (!getSubRegForClass(EltRC, TRI, SubReg))
5913     return false;
5914   auto SubregToReg = MIB.buildInstr(AArch64::SUBREG_TO_REG, {Dst}, {})
5915                          .addImm(0)
5916                          .addUse(EltReg)
5917                          .addImm(SubReg);
5918   I.eraseFromParent();
5919   constrainSelectedInstRegOperands(*SubregToReg, TII, TRI, RBI);
5920   return RBI.constrainGenericRegister(Dst, *DstRC, MRI);
5921 }
5922 
5923 bool AArch64InstructionSelector::selectBuildVector(MachineInstr &I,
5924                                                    MachineRegisterInfo &MRI) {
5925   assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
5926   // Until we port more of the optimized selections, for now just use a vector
5927   // insert sequence.
5928   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
5929   const LLT EltTy = MRI.getType(I.getOperand(1).getReg());
5930   unsigned EltSize = EltTy.getSizeInBits();
5931 
5932   if (tryOptConstantBuildVec(I, DstTy, MRI))
5933     return true;
5934   if (tryOptBuildVecToSubregToReg(I, MRI))
5935     return true;
5936 
5937   if (EltSize != 8 && EltSize != 16 && EltSize != 32 && EltSize != 64)
5938     return false; // Don't support all element types yet.
5939   const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
5940 
5941   const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
5942   MachineInstr *ScalarToVec =
5943       emitScalarToVector(DstTy.getElementType().getSizeInBits(), DstRC,
5944                          I.getOperand(1).getReg(), MIB);
5945   if (!ScalarToVec)
5946     return false;
5947 
5948   Register DstVec = ScalarToVec->getOperand(0).getReg();
5949   unsigned DstSize = DstTy.getSizeInBits();
5950 
5951   // Keep track of the last MI we inserted. Later on, we might be able to save
5952   // a copy using it.
5953   MachineInstr *PrevMI = nullptr;
5954   for (unsigned i = 2, e = DstSize / EltSize + 1; i < e; ++i) {
5955     // Note that if we don't do a subregister copy, we can end up making an
5956     // extra register.
5957     PrevMI = &*emitLaneInsert(std::nullopt, DstVec, I.getOperand(i).getReg(),
5958                               i - 1, RB, MIB);
5959     DstVec = PrevMI->getOperand(0).getReg();
5960   }
5961 
5962   // If DstTy's size in bits is less than 128, then emit a subregister copy
5963   // from DstVec to the last register we've defined.
5964   if (DstSize < 128) {
5965     // Force this to be FPR using the destination vector.
5966     const TargetRegisterClass *RC =
5967         getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(DstVec, MRI, TRI));
5968     if (!RC)
5969       return false;
5970     if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
5971       LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
5972       return false;
5973     }
5974 
5975     unsigned SubReg = 0;
5976     if (!getSubRegForClass(RC, TRI, SubReg))
5977       return false;
5978     if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
5979       LLVM_DEBUG(dbgs() << "Unsupported destination size! (" << DstSize
5980                         << "\n");
5981       return false;
5982     }
5983 
5984     Register Reg = MRI.createVirtualRegister(RC);
5985     Register DstReg = I.getOperand(0).getReg();
5986 
5987     MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {}).addReg(DstVec, 0, SubReg);
5988     MachineOperand &RegOp = I.getOperand(1);
5989     RegOp.setReg(Reg);
5990     RBI.constrainGenericRegister(DstReg, *RC, MRI);
5991   } else {
5992     // We don't need a subregister copy. Save a copy by re-using the
5993     // destination register on the final insert.
5994     assert(PrevMI && "PrevMI was null?");
5995     PrevMI->getOperand(0).setReg(I.getOperand(0).getReg());
5996     constrainSelectedInstRegOperands(*PrevMI, TII, TRI, RBI);
5997   }
5998 
5999   I.eraseFromParent();
6000   return true;
6001 }
6002 
6003 bool AArch64InstructionSelector::selectVectorLoadIntrinsic(unsigned Opc,
6004                                                            unsigned NumVecs,
6005                                                            MachineInstr &I) {
6006   assert(I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
6007   assert(Opc && "Expected an opcode?");
6008   assert(NumVecs > 1 && NumVecs < 5 && "Only support 2, 3, or 4 vectors");
6009   auto &MRI = *MIB.getMRI();
6010   LLT Ty = MRI.getType(I.getOperand(0).getReg());
6011   unsigned Size = Ty.getSizeInBits();
6012   assert((Size == 64 || Size == 128) &&
6013          "Destination must be 64 bits or 128 bits?");
6014   unsigned SubReg = Size == 64 ? AArch64::dsub0 : AArch64::qsub0;
6015   auto Ptr = I.getOperand(I.getNumOperands() - 1).getReg();
6016   assert(MRI.getType(Ptr).isPointer() && "Expected a pointer type?");
6017   auto Load = MIB.buildInstr(Opc, {Ty}, {Ptr});
6018   Load.cloneMemRefs(I);
6019   constrainSelectedInstRegOperands(*Load, TII, TRI, RBI);
6020   Register SelectedLoadDst = Load->getOperand(0).getReg();
6021   for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
6022     auto Vec = MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(Idx)}, {})
6023                    .addReg(SelectedLoadDst, 0, SubReg + Idx);
6024     // Emit the subreg copies and immediately select them.
6025     // FIXME: We should refactor our copy code into an emitCopy helper and
6026     // clean up uses of this pattern elsewhere in the selector.
6027     selectCopy(*Vec, TII, MRI, TRI, RBI);
6028   }
6029   return true;
6030 }
6031 
6032 bool AArch64InstructionSelector::selectVectorLoadLaneIntrinsic(
6033     unsigned Opc, unsigned NumVecs, MachineInstr &I) {
6034   assert(I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
6035   assert(Opc && "Expected an opcode?");
6036   assert(NumVecs > 1 && NumVecs < 5 && "Only support 2, 3, or 4 vectors");
6037   auto &MRI = *MIB.getMRI();
6038   LLT Ty = MRI.getType(I.getOperand(0).getReg());
6039   bool Narrow = Ty.getSizeInBits() == 64;
6040 
6041   auto FirstSrcRegIt = I.operands_begin() + NumVecs + 1;
6042   SmallVector<Register, 4> Regs(NumVecs);
6043   std::transform(FirstSrcRegIt, FirstSrcRegIt + NumVecs, Regs.begin(),
6044                  [](auto MO) { return MO.getReg(); });
6045 
6046   if (Narrow) {
6047     transform(Regs, Regs.begin(), [this](Register Reg) {
6048       return emitScalarToVector(64, &AArch64::FPR128RegClass, Reg, MIB)
6049           ->getOperand(0)
6050           .getReg();
6051     });
6052     Ty = Ty.multiplyElements(2);
6053   }
6054 
6055   Register Tuple = createQTuple(Regs, MIB);
6056   auto LaneNo = getIConstantVRegVal((FirstSrcRegIt + NumVecs)->getReg(), MRI);
6057   if (!LaneNo)
6058     return false;
6059 
6060   Register Ptr = (FirstSrcRegIt + NumVecs + 1)->getReg();
6061   auto Load = MIB.buildInstr(Opc, {Ty}, {})
6062                   .addReg(Tuple)
6063                   .addImm(LaneNo->getZExtValue())
6064                   .addReg(Ptr);
6065   Load.cloneMemRefs(I);
6066   constrainSelectedInstRegOperands(*Load, TII, TRI, RBI);
6067   Register SelectedLoadDst = Load->getOperand(0).getReg();
6068   unsigned SubReg = AArch64::qsub0;
6069   for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
6070     auto Vec = MIB.buildInstr(TargetOpcode::COPY,
6071                               {Narrow ? DstOp(&AArch64::FPR128RegClass)
6072                                       : DstOp(I.getOperand(Idx).getReg())},
6073                               {})
6074                    .addReg(SelectedLoadDst, 0, SubReg + Idx);
6075     Register WideReg = Vec.getReg(0);
6076     // Emit the subreg copies and immediately select them.
6077     selectCopy(*Vec, TII, MRI, TRI, RBI);
6078     if (Narrow &&
6079         !emitNarrowVector(I.getOperand(Idx).getReg(), WideReg, MIB, MRI))
6080       return false;
6081   }
6082   return true;
6083 }
6084 
6085 void AArch64InstructionSelector::selectVectorStoreIntrinsic(MachineInstr &I,
6086                                                             unsigned NumVecs,
6087                                                             unsigned Opc) {
6088   MachineRegisterInfo &MRI = I.getParent()->getParent()->getRegInfo();
6089   LLT Ty = MRI.getType(I.getOperand(1).getReg());
6090   Register Ptr = I.getOperand(1 + NumVecs).getReg();
6091 
6092   SmallVector<Register, 2> Regs(NumVecs);
6093   std::transform(I.operands_begin() + 1, I.operands_begin() + 1 + NumVecs,
6094                  Regs.begin(), [](auto MO) { return MO.getReg(); });
6095 
6096   Register Tuple = Ty.getSizeInBits() == 128 ? createQTuple(Regs, MIB)
6097                                              : createDTuple(Regs, MIB);
6098   auto Store = MIB.buildInstr(Opc, {}, {Tuple, Ptr});
6099   Store.cloneMemRefs(I);
6100   constrainSelectedInstRegOperands(*Store, TII, TRI, RBI);
6101 }
6102 
6103 bool AArch64InstructionSelector::selectVectorStoreLaneIntrinsic(
6104     MachineInstr &I, unsigned NumVecs, unsigned Opc) {
6105   MachineRegisterInfo &MRI = I.getParent()->getParent()->getRegInfo();
6106   LLT Ty = MRI.getType(I.getOperand(1).getReg());
6107   bool Narrow = Ty.getSizeInBits() == 64;
6108 
6109   SmallVector<Register, 2> Regs(NumVecs);
6110   std::transform(I.operands_begin() + 1, I.operands_begin() + 1 + NumVecs,
6111                  Regs.begin(), [](auto MO) { return MO.getReg(); });
6112 
6113   if (Narrow)
6114     transform(Regs, Regs.begin(), [this](Register Reg) {
6115       return emitScalarToVector(64, &AArch64::FPR128RegClass, Reg, MIB)
6116           ->getOperand(0)
6117           .getReg();
6118     });
6119 
6120   Register Tuple = createQTuple(Regs, MIB);
6121 
6122   auto LaneNo = getIConstantVRegVal(I.getOperand(1 + NumVecs).getReg(), MRI);
6123   if (!LaneNo)
6124     return false;
6125   Register Ptr = I.getOperand(1 + NumVecs + 1).getReg();
6126   auto Store = MIB.buildInstr(Opc, {}, {})
6127                    .addReg(Tuple)
6128                    .addImm(LaneNo->getZExtValue())
6129                    .addReg(Ptr);
6130   Store.cloneMemRefs(I);
6131   constrainSelectedInstRegOperands(*Store, TII, TRI, RBI);
6132   return true;
6133 }
6134 
6135 bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
6136     MachineInstr &I, MachineRegisterInfo &MRI) {
6137   // Find the intrinsic ID.
6138   unsigned IntrinID = cast<GIntrinsic>(I).getIntrinsicID();
6139 
6140   const LLT S8 = LLT::scalar(8);
6141   const LLT S16 = LLT::scalar(16);
6142   const LLT S32 = LLT::scalar(32);
6143   const LLT S64 = LLT::scalar(64);
6144   const LLT P0 = LLT::pointer(0, 64);
6145   // Select the instruction.
6146   switch (IntrinID) {
6147   default:
6148     return false;
6149   case Intrinsic::aarch64_ldxp:
6150   case Intrinsic::aarch64_ldaxp: {
6151     auto NewI = MIB.buildInstr(
6152         IntrinID == Intrinsic::aarch64_ldxp ? AArch64::LDXPX : AArch64::LDAXPX,
6153         {I.getOperand(0).getReg(), I.getOperand(1).getReg()},
6154         {I.getOperand(3)});
6155     NewI.cloneMemRefs(I);
6156     constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI);
6157     break;
6158   }
6159   case Intrinsic::trap:
6160     MIB.buildInstr(AArch64::BRK, {}, {}).addImm(1);
6161     break;
6162   case Intrinsic::debugtrap:
6163     MIB.buildInstr(AArch64::BRK, {}, {}).addImm(0xF000);
6164     break;
6165   case Intrinsic::ubsantrap:
6166     MIB.buildInstr(AArch64::BRK, {}, {})
6167         .addImm(I.getOperand(1).getImm() | ('U' << 8));
6168     break;
6169   case Intrinsic::aarch64_neon_ld1x2: {
6170     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6171     unsigned Opc = 0;
6172     if (Ty == LLT::fixed_vector(8, S8))
6173       Opc = AArch64::LD1Twov8b;
6174     else if (Ty == LLT::fixed_vector(16, S8))
6175       Opc = AArch64::LD1Twov16b;
6176     else if (Ty == LLT::fixed_vector(4, S16))
6177       Opc = AArch64::LD1Twov4h;
6178     else if (Ty == LLT::fixed_vector(8, S16))
6179       Opc = AArch64::LD1Twov8h;
6180     else if (Ty == LLT::fixed_vector(2, S32))
6181       Opc = AArch64::LD1Twov2s;
6182     else if (Ty == LLT::fixed_vector(4, S32))
6183       Opc = AArch64::LD1Twov4s;
6184     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6185       Opc = AArch64::LD1Twov2d;
6186     else if (Ty == S64 || Ty == P0)
6187       Opc = AArch64::LD1Twov1d;
6188     else
6189       llvm_unreachable("Unexpected type for ld1x2!");
6190     selectVectorLoadIntrinsic(Opc, 2, I);
6191     break;
6192   }
6193   case Intrinsic::aarch64_neon_ld1x3: {
6194     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6195     unsigned Opc = 0;
6196     if (Ty == LLT::fixed_vector(8, S8))
6197       Opc = AArch64::LD1Threev8b;
6198     else if (Ty == LLT::fixed_vector(16, S8))
6199       Opc = AArch64::LD1Threev16b;
6200     else if (Ty == LLT::fixed_vector(4, S16))
6201       Opc = AArch64::LD1Threev4h;
6202     else if (Ty == LLT::fixed_vector(8, S16))
6203       Opc = AArch64::LD1Threev8h;
6204     else if (Ty == LLT::fixed_vector(2, S32))
6205       Opc = AArch64::LD1Threev2s;
6206     else if (Ty == LLT::fixed_vector(4, S32))
6207       Opc = AArch64::LD1Threev4s;
6208     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6209       Opc = AArch64::LD1Threev2d;
6210     else if (Ty == S64 || Ty == P0)
6211       Opc = AArch64::LD1Threev1d;
6212     else
6213       llvm_unreachable("Unexpected type for ld1x3!");
6214     selectVectorLoadIntrinsic(Opc, 3, I);
6215     break;
6216   }
6217   case Intrinsic::aarch64_neon_ld1x4: {
6218     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6219     unsigned Opc = 0;
6220     if (Ty == LLT::fixed_vector(8, S8))
6221       Opc = AArch64::LD1Fourv8b;
6222     else if (Ty == LLT::fixed_vector(16, S8))
6223       Opc = AArch64::LD1Fourv16b;
6224     else if (Ty == LLT::fixed_vector(4, S16))
6225       Opc = AArch64::LD1Fourv4h;
6226     else if (Ty == LLT::fixed_vector(8, S16))
6227       Opc = AArch64::LD1Fourv8h;
6228     else if (Ty == LLT::fixed_vector(2, S32))
6229       Opc = AArch64::LD1Fourv2s;
6230     else if (Ty == LLT::fixed_vector(4, S32))
6231       Opc = AArch64::LD1Fourv4s;
6232     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6233       Opc = AArch64::LD1Fourv2d;
6234     else if (Ty == S64 || Ty == P0)
6235       Opc = AArch64::LD1Fourv1d;
6236     else
6237       llvm_unreachable("Unexpected type for ld1x4!");
6238     selectVectorLoadIntrinsic(Opc, 4, I);
6239     break;
6240   }
6241   case Intrinsic::aarch64_neon_ld2: {
6242     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6243     unsigned Opc = 0;
6244     if (Ty == LLT::fixed_vector(8, S8))
6245       Opc = AArch64::LD2Twov8b;
6246     else if (Ty == LLT::fixed_vector(16, S8))
6247       Opc = AArch64::LD2Twov16b;
6248     else if (Ty == LLT::fixed_vector(4, S16))
6249       Opc = AArch64::LD2Twov4h;
6250     else if (Ty == LLT::fixed_vector(8, S16))
6251       Opc = AArch64::LD2Twov8h;
6252     else if (Ty == LLT::fixed_vector(2, S32))
6253       Opc = AArch64::LD2Twov2s;
6254     else if (Ty == LLT::fixed_vector(4, S32))
6255       Opc = AArch64::LD2Twov4s;
6256     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6257       Opc = AArch64::LD2Twov2d;
6258     else if (Ty == S64 || Ty == P0)
6259       Opc = AArch64::LD1Twov1d;
6260     else
6261       llvm_unreachable("Unexpected type for ld2!");
6262     selectVectorLoadIntrinsic(Opc, 2, I);
6263     break;
6264   }
6265   case Intrinsic::aarch64_neon_ld2lane: {
6266     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6267     unsigned Opc;
6268     if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
6269       Opc = AArch64::LD2i8;
6270     else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16))
6271       Opc = AArch64::LD2i16;
6272     else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32))
6273       Opc = AArch64::LD2i32;
6274     else if (Ty == LLT::fixed_vector(2, S64) ||
6275              Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
6276       Opc = AArch64::LD2i64;
6277     else
6278       llvm_unreachable("Unexpected type for st2lane!");
6279     if (!selectVectorLoadLaneIntrinsic(Opc, 2, I))
6280       return false;
6281     break;
6282   }
6283   case Intrinsic::aarch64_neon_ld2r: {
6284     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6285     unsigned Opc = 0;
6286     if (Ty == LLT::fixed_vector(8, S8))
6287       Opc = AArch64::LD2Rv8b;
6288     else if (Ty == LLT::fixed_vector(16, S8))
6289       Opc = AArch64::LD2Rv16b;
6290     else if (Ty == LLT::fixed_vector(4, S16))
6291       Opc = AArch64::LD2Rv4h;
6292     else if (Ty == LLT::fixed_vector(8, S16))
6293       Opc = AArch64::LD2Rv8h;
6294     else if (Ty == LLT::fixed_vector(2, S32))
6295       Opc = AArch64::LD2Rv2s;
6296     else if (Ty == LLT::fixed_vector(4, S32))
6297       Opc = AArch64::LD2Rv4s;
6298     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6299       Opc = AArch64::LD2Rv2d;
6300     else if (Ty == S64 || Ty == P0)
6301       Opc = AArch64::LD2Rv1d;
6302     else
6303       llvm_unreachable("Unexpected type for ld2r!");
6304     selectVectorLoadIntrinsic(Opc, 2, I);
6305     break;
6306   }
6307   case Intrinsic::aarch64_neon_ld3: {
6308     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6309     unsigned Opc = 0;
6310     if (Ty == LLT::fixed_vector(8, S8))
6311       Opc = AArch64::LD3Threev8b;
6312     else if (Ty == LLT::fixed_vector(16, S8))
6313       Opc = AArch64::LD3Threev16b;
6314     else if (Ty == LLT::fixed_vector(4, S16))
6315       Opc = AArch64::LD3Threev4h;
6316     else if (Ty == LLT::fixed_vector(8, S16))
6317       Opc = AArch64::LD3Threev8h;
6318     else if (Ty == LLT::fixed_vector(2, S32))
6319       Opc = AArch64::LD3Threev2s;
6320     else if (Ty == LLT::fixed_vector(4, S32))
6321       Opc = AArch64::LD3Threev4s;
6322     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6323       Opc = AArch64::LD3Threev2d;
6324     else if (Ty == S64 || Ty == P0)
6325       Opc = AArch64::LD1Threev1d;
6326     else
6327       llvm_unreachable("Unexpected type for ld3!");
6328     selectVectorLoadIntrinsic(Opc, 3, I);
6329     break;
6330   }
6331   case Intrinsic::aarch64_neon_ld3lane: {
6332     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6333     unsigned Opc;
6334     if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
6335       Opc = AArch64::LD3i8;
6336     else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16))
6337       Opc = AArch64::LD3i16;
6338     else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32))
6339       Opc = AArch64::LD3i32;
6340     else if (Ty == LLT::fixed_vector(2, S64) ||
6341              Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
6342       Opc = AArch64::LD3i64;
6343     else
6344       llvm_unreachable("Unexpected type for st3lane!");
6345     if (!selectVectorLoadLaneIntrinsic(Opc, 3, I))
6346       return false;
6347     break;
6348   }
6349   case Intrinsic::aarch64_neon_ld3r: {
6350     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6351     unsigned Opc = 0;
6352     if (Ty == LLT::fixed_vector(8, S8))
6353       Opc = AArch64::LD3Rv8b;
6354     else if (Ty == LLT::fixed_vector(16, S8))
6355       Opc = AArch64::LD3Rv16b;
6356     else if (Ty == LLT::fixed_vector(4, S16))
6357       Opc = AArch64::LD3Rv4h;
6358     else if (Ty == LLT::fixed_vector(8, S16))
6359       Opc = AArch64::LD3Rv8h;
6360     else if (Ty == LLT::fixed_vector(2, S32))
6361       Opc = AArch64::LD3Rv2s;
6362     else if (Ty == LLT::fixed_vector(4, S32))
6363       Opc = AArch64::LD3Rv4s;
6364     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6365       Opc = AArch64::LD3Rv2d;
6366     else if (Ty == S64 || Ty == P0)
6367       Opc = AArch64::LD3Rv1d;
6368     else
6369       llvm_unreachable("Unexpected type for ld3r!");
6370     selectVectorLoadIntrinsic(Opc, 3, I);
6371     break;
6372   }
6373   case Intrinsic::aarch64_neon_ld4: {
6374     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6375     unsigned Opc = 0;
6376     if (Ty == LLT::fixed_vector(8, S8))
6377       Opc = AArch64::LD4Fourv8b;
6378     else if (Ty == LLT::fixed_vector(16, S8))
6379       Opc = AArch64::LD4Fourv16b;
6380     else if (Ty == LLT::fixed_vector(4, S16))
6381       Opc = AArch64::LD4Fourv4h;
6382     else if (Ty == LLT::fixed_vector(8, S16))
6383       Opc = AArch64::LD4Fourv8h;
6384     else if (Ty == LLT::fixed_vector(2, S32))
6385       Opc = AArch64::LD4Fourv2s;
6386     else if (Ty == LLT::fixed_vector(4, S32))
6387       Opc = AArch64::LD4Fourv4s;
6388     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6389       Opc = AArch64::LD4Fourv2d;
6390     else if (Ty == S64 || Ty == P0)
6391       Opc = AArch64::LD1Fourv1d;
6392     else
6393       llvm_unreachable("Unexpected type for ld4!");
6394     selectVectorLoadIntrinsic(Opc, 4, I);
6395     break;
6396   }
6397   case Intrinsic::aarch64_neon_ld4lane: {
6398     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6399     unsigned Opc;
6400     if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
6401       Opc = AArch64::LD4i8;
6402     else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16))
6403       Opc = AArch64::LD4i16;
6404     else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32))
6405       Opc = AArch64::LD4i32;
6406     else if (Ty == LLT::fixed_vector(2, S64) ||
6407              Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
6408       Opc = AArch64::LD4i64;
6409     else
6410       llvm_unreachable("Unexpected type for st4lane!");
6411     if (!selectVectorLoadLaneIntrinsic(Opc, 4, I))
6412       return false;
6413     break;
6414   }
6415   case Intrinsic::aarch64_neon_ld4r: {
6416     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6417     unsigned Opc = 0;
6418     if (Ty == LLT::fixed_vector(8, S8))
6419       Opc = AArch64::LD4Rv8b;
6420     else if (Ty == LLT::fixed_vector(16, S8))
6421       Opc = AArch64::LD4Rv16b;
6422     else if (Ty == LLT::fixed_vector(4, S16))
6423       Opc = AArch64::LD4Rv4h;
6424     else if (Ty == LLT::fixed_vector(8, S16))
6425       Opc = AArch64::LD4Rv8h;
6426     else if (Ty == LLT::fixed_vector(2, S32))
6427       Opc = AArch64::LD4Rv2s;
6428     else if (Ty == LLT::fixed_vector(4, S32))
6429       Opc = AArch64::LD4Rv4s;
6430     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6431       Opc = AArch64::LD4Rv2d;
6432     else if (Ty == S64 || Ty == P0)
6433       Opc = AArch64::LD4Rv1d;
6434     else
6435       llvm_unreachable("Unexpected type for ld4r!");
6436     selectVectorLoadIntrinsic(Opc, 4, I);
6437     break;
6438   }
6439   case Intrinsic::aarch64_neon_st1x2: {
6440     LLT Ty = MRI.getType(I.getOperand(1).getReg());
6441     unsigned Opc;
6442     if (Ty == LLT::fixed_vector(8, S8))
6443       Opc = AArch64::ST1Twov8b;
6444     else if (Ty == LLT::fixed_vector(16, S8))
6445       Opc = AArch64::ST1Twov16b;
6446     else if (Ty == LLT::fixed_vector(4, S16))
6447       Opc = AArch64::ST1Twov4h;
6448     else if (Ty == LLT::fixed_vector(8, S16))
6449       Opc = AArch64::ST1Twov8h;
6450     else if (Ty == LLT::fixed_vector(2, S32))
6451       Opc = AArch64::ST1Twov2s;
6452     else if (Ty == LLT::fixed_vector(4, S32))
6453       Opc = AArch64::ST1Twov4s;
6454     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6455       Opc = AArch64::ST1Twov2d;
6456     else if (Ty == S64 || Ty == P0)
6457       Opc = AArch64::ST1Twov1d;
6458     else
6459       llvm_unreachable("Unexpected type for st1x2!");
6460     selectVectorStoreIntrinsic(I, 2, Opc);
6461     break;
6462   }
6463   case Intrinsic::aarch64_neon_st1x3: {
6464     LLT Ty = MRI.getType(I.getOperand(1).getReg());
6465     unsigned Opc;
6466     if (Ty == LLT::fixed_vector(8, S8))
6467       Opc = AArch64::ST1Threev8b;
6468     else if (Ty == LLT::fixed_vector(16, S8))
6469       Opc = AArch64::ST1Threev16b;
6470     else if (Ty == LLT::fixed_vector(4, S16))
6471       Opc = AArch64::ST1Threev4h;
6472     else if (Ty == LLT::fixed_vector(8, S16))
6473       Opc = AArch64::ST1Threev8h;
6474     else if (Ty == LLT::fixed_vector(2, S32))
6475       Opc = AArch64::ST1Threev2s;
6476     else if (Ty == LLT::fixed_vector(4, S32))
6477       Opc = AArch64::ST1Threev4s;
6478     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6479       Opc = AArch64::ST1Threev2d;
6480     else if (Ty == S64 || Ty == P0)
6481       Opc = AArch64::ST1Threev1d;
6482     else
6483       llvm_unreachable("Unexpected type for st1x3!");
6484     selectVectorStoreIntrinsic(I, 3, Opc);
6485     break;
6486   }
6487   case Intrinsic::aarch64_neon_st1x4: {
6488     LLT Ty = MRI.getType(I.getOperand(1).getReg());
6489     unsigned Opc;
6490     if (Ty == LLT::fixed_vector(8, S8))
6491       Opc = AArch64::ST1Fourv8b;
6492     else if (Ty == LLT::fixed_vector(16, S8))
6493       Opc = AArch64::ST1Fourv16b;
6494     else if (Ty == LLT::fixed_vector(4, S16))
6495       Opc = AArch64::ST1Fourv4h;
6496     else if (Ty == LLT::fixed_vector(8, S16))
6497       Opc = AArch64::ST1Fourv8h;
6498     else if (Ty == LLT::fixed_vector(2, S32))
6499       Opc = AArch64::ST1Fourv2s;
6500     else if (Ty == LLT::fixed_vector(4, S32))
6501       Opc = AArch64::ST1Fourv4s;
6502     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6503       Opc = AArch64::ST1Fourv2d;
6504     else if (Ty == S64 || Ty == P0)
6505       Opc = AArch64::ST1Fourv1d;
6506     else
6507       llvm_unreachable("Unexpected type for st1x4!");
6508     selectVectorStoreIntrinsic(I, 4, Opc);
6509     break;
6510   }
6511   case Intrinsic::aarch64_neon_st2: {
6512     LLT Ty = MRI.getType(I.getOperand(1).getReg());
6513     unsigned Opc;
6514     if (Ty == LLT::fixed_vector(8, S8))
6515       Opc = AArch64::ST2Twov8b;
6516     else if (Ty == LLT::fixed_vector(16, S8))
6517       Opc = AArch64::ST2Twov16b;
6518     else if (Ty == LLT::fixed_vector(4, S16))
6519       Opc = AArch64::ST2Twov4h;
6520     else if (Ty == LLT::fixed_vector(8, S16))
6521       Opc = AArch64::ST2Twov8h;
6522     else if (Ty == LLT::fixed_vector(2, S32))
6523       Opc = AArch64::ST2Twov2s;
6524     else if (Ty == LLT::fixed_vector(4, S32))
6525       Opc = AArch64::ST2Twov4s;
6526     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6527       Opc = AArch64::ST2Twov2d;
6528     else if (Ty == S64 || Ty == P0)
6529       Opc = AArch64::ST1Twov1d;
6530     else
6531       llvm_unreachable("Unexpected type for st2!");
6532     selectVectorStoreIntrinsic(I, 2, Opc);
6533     break;
6534   }
6535   case Intrinsic::aarch64_neon_st3: {
6536     LLT Ty = MRI.getType(I.getOperand(1).getReg());
6537     unsigned Opc;
6538     if (Ty == LLT::fixed_vector(8, S8))
6539       Opc = AArch64::ST3Threev8b;
6540     else if (Ty == LLT::fixed_vector(16, S8))
6541       Opc = AArch64::ST3Threev16b;
6542     else if (Ty == LLT::fixed_vector(4, S16))
6543       Opc = AArch64::ST3Threev4h;
6544     else if (Ty == LLT::fixed_vector(8, S16))
6545       Opc = AArch64::ST3Threev8h;
6546     else if (Ty == LLT::fixed_vector(2, S32))
6547       Opc = AArch64::ST3Threev2s;
6548     else if (Ty == LLT::fixed_vector(4, S32))
6549       Opc = AArch64::ST3Threev4s;
6550     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6551       Opc = AArch64::ST3Threev2d;
6552     else if (Ty == S64 || Ty == P0)
6553       Opc = AArch64::ST1Threev1d;
6554     else
6555       llvm_unreachable("Unexpected type for st3!");
6556     selectVectorStoreIntrinsic(I, 3, Opc);
6557     break;
6558   }
6559   case Intrinsic::aarch64_neon_st4: {
6560     LLT Ty = MRI.getType(I.getOperand(1).getReg());
6561     unsigned Opc;
6562     if (Ty == LLT::fixed_vector(8, S8))
6563       Opc = AArch64::ST4Fourv8b;
6564     else if (Ty == LLT::fixed_vector(16, S8))
6565       Opc = AArch64::ST4Fourv16b;
6566     else if (Ty == LLT::fixed_vector(4, S16))
6567       Opc = AArch64::ST4Fourv4h;
6568     else if (Ty == LLT::fixed_vector(8, S16))
6569       Opc = AArch64::ST4Fourv8h;
6570     else if (Ty == LLT::fixed_vector(2, S32))
6571       Opc = AArch64::ST4Fourv2s;
6572     else if (Ty == LLT::fixed_vector(4, S32))
6573       Opc = AArch64::ST4Fourv4s;
6574     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6575       Opc = AArch64::ST4Fourv2d;
6576     else if (Ty == S64 || Ty == P0)
6577       Opc = AArch64::ST1Fourv1d;
6578     else
6579       llvm_unreachable("Unexpected type for st4!");
6580     selectVectorStoreIntrinsic(I, 4, Opc);
6581     break;
6582   }
6583   case Intrinsic::aarch64_neon_st2lane: {
6584     LLT Ty = MRI.getType(I.getOperand(1).getReg());
6585     unsigned Opc;
6586     if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
6587       Opc = AArch64::ST2i8;
6588     else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16))
6589       Opc = AArch64::ST2i16;
6590     else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32))
6591       Opc = AArch64::ST2i32;
6592     else if (Ty == LLT::fixed_vector(2, S64) ||
6593              Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
6594       Opc = AArch64::ST2i64;
6595     else
6596       llvm_unreachable("Unexpected type for st2lane!");
6597     if (!selectVectorStoreLaneIntrinsic(I, 2, Opc))
6598       return false;
6599     break;
6600   }
6601   case Intrinsic::aarch64_neon_st3lane: {
6602     LLT Ty = MRI.getType(I.getOperand(1).getReg());
6603     unsigned Opc;
6604     if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
6605       Opc = AArch64::ST3i8;
6606     else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16))
6607       Opc = AArch64::ST3i16;
6608     else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32))
6609       Opc = AArch64::ST3i32;
6610     else if (Ty == LLT::fixed_vector(2, S64) ||
6611              Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
6612       Opc = AArch64::ST3i64;
6613     else
6614       llvm_unreachable("Unexpected type for st3lane!");
6615     if (!selectVectorStoreLaneIntrinsic(I, 3, Opc))
6616       return false;
6617     break;
6618   }
6619   case Intrinsic::aarch64_neon_st4lane: {
6620     LLT Ty = MRI.getType(I.getOperand(1).getReg());
6621     unsigned Opc;
6622     if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
6623       Opc = AArch64::ST4i8;
6624     else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16))
6625       Opc = AArch64::ST4i16;
6626     else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32))
6627       Opc = AArch64::ST4i32;
6628     else if (Ty == LLT::fixed_vector(2, S64) ||
6629              Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
6630       Opc = AArch64::ST4i64;
6631     else
6632       llvm_unreachable("Unexpected type for st4lane!");
6633     if (!selectVectorStoreLaneIntrinsic(I, 4, Opc))
6634       return false;
6635     break;
6636   }
6637   case Intrinsic::aarch64_mops_memset_tag: {
6638     // Transform
6639     //    %dst:gpr(p0) = \
6640     //      G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aarch64.mops.memset.tag),
6641     //      \ %dst:gpr(p0), %val:gpr(s64), %n:gpr(s64)
6642     // where %dst is updated, into
6643     //    %Rd:GPR64common, %Rn:GPR64) = \
6644     //      MOPSMemorySetTaggingPseudo \
6645     //      %Rd:GPR64common, %Rn:GPR64, %Rm:GPR64
6646     // where Rd and Rn are tied.
6647     // It is expected that %val has been extended to s64 in legalization.
6648     // Note that the order of the size/value operands are swapped.
6649 
6650     Register DstDef = I.getOperand(0).getReg();
6651     // I.getOperand(1) is the intrinsic function
6652     Register DstUse = I.getOperand(2).getReg();
6653     Register ValUse = I.getOperand(3).getReg();
6654     Register SizeUse = I.getOperand(4).getReg();
6655 
6656     // MOPSMemorySetTaggingPseudo has two defs; the intrinsic call has only one.
6657     // Therefore an additional virtual register is requried for the updated size
6658     // operand. This value is not accessible via the semantics of the intrinsic.
6659     Register SizeDef = MRI.createGenericVirtualRegister(LLT::scalar(64));
6660 
6661     auto Memset = MIB.buildInstr(AArch64::MOPSMemorySetTaggingPseudo,
6662                                  {DstDef, SizeDef}, {DstUse, SizeUse, ValUse});
6663     Memset.cloneMemRefs(I);
6664     constrainSelectedInstRegOperands(*Memset, TII, TRI, RBI);
6665     break;
6666   }
6667   }
6668 
6669   I.eraseFromParent();
6670   return true;
6671 }
6672 
6673 bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I,
6674                                                  MachineRegisterInfo &MRI) {
6675   unsigned IntrinID = cast<GIntrinsic>(I).getIntrinsicID();
6676 
6677   switch (IntrinID) {
6678   default:
6679     break;
6680   case Intrinsic::aarch64_crypto_sha1h: {
6681     Register DstReg = I.getOperand(0).getReg();
6682     Register SrcReg = I.getOperand(2).getReg();
6683 
6684     // FIXME: Should this be an assert?
6685     if (MRI.getType(DstReg).getSizeInBits() != 32 ||
6686         MRI.getType(SrcReg).getSizeInBits() != 32)
6687       return false;
6688 
6689     // The operation has to happen on FPRs. Set up some new FPR registers for
6690     // the source and destination if they are on GPRs.
6691     if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) {
6692       SrcReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
6693       MIB.buildCopy({SrcReg}, {I.getOperand(2)});
6694 
6695       // Make sure the copy ends up getting constrained properly.
6696       RBI.constrainGenericRegister(I.getOperand(2).getReg(),
6697                                    AArch64::GPR32RegClass, MRI);
6698     }
6699 
6700     if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID)
6701       DstReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
6702 
6703     // Actually insert the instruction.
6704     auto SHA1Inst = MIB.buildInstr(AArch64::SHA1Hrr, {DstReg}, {SrcReg});
6705     constrainSelectedInstRegOperands(*SHA1Inst, TII, TRI, RBI);
6706 
6707     // Did we create a new register for the destination?
6708     if (DstReg != I.getOperand(0).getReg()) {
6709       // Yep. Copy the result of the instruction back into the original
6710       // destination.
6711       MIB.buildCopy({I.getOperand(0)}, {DstReg});
6712       RBI.constrainGenericRegister(I.getOperand(0).getReg(),
6713                                    AArch64::GPR32RegClass, MRI);
6714     }
6715 
6716     I.eraseFromParent();
6717     return true;
6718   }
6719   case Intrinsic::frameaddress:
6720   case Intrinsic::returnaddress: {
6721     MachineFunction &MF = *I.getParent()->getParent();
6722     MachineFrameInfo &MFI = MF.getFrameInfo();
6723 
6724     unsigned Depth = I.getOperand(2).getImm();
6725     Register DstReg = I.getOperand(0).getReg();
6726     RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI);
6727 
6728     if (Depth == 0 && IntrinID == Intrinsic::returnaddress) {
6729       if (!MFReturnAddr) {
6730         // Insert the copy from LR/X30 into the entry block, before it can be
6731         // clobbered by anything.
6732         MFI.setReturnAddressIsTaken(true);
6733         MFReturnAddr = getFunctionLiveInPhysReg(
6734             MF, TII, AArch64::LR, AArch64::GPR64RegClass, I.getDebugLoc());
6735       }
6736 
6737       if (STI.hasPAuth()) {
6738         MIB.buildInstr(AArch64::XPACI, {DstReg}, {MFReturnAddr});
6739       } else {
6740         MIB.buildCopy({Register(AArch64::LR)}, {MFReturnAddr});
6741         MIB.buildInstr(AArch64::XPACLRI);
6742         MIB.buildCopy({DstReg}, {Register(AArch64::LR)});
6743       }
6744 
6745       I.eraseFromParent();
6746       return true;
6747     }
6748 
6749     MFI.setFrameAddressIsTaken(true);
6750     Register FrameAddr(AArch64::FP);
6751     while (Depth--) {
6752       Register NextFrame = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
6753       auto Ldr =
6754           MIB.buildInstr(AArch64::LDRXui, {NextFrame}, {FrameAddr}).addImm(0);
6755       constrainSelectedInstRegOperands(*Ldr, TII, TRI, RBI);
6756       FrameAddr = NextFrame;
6757     }
6758 
6759     if (IntrinID == Intrinsic::frameaddress)
6760       MIB.buildCopy({DstReg}, {FrameAddr});
6761     else {
6762       MFI.setReturnAddressIsTaken(true);
6763 
6764       if (STI.hasPAuth()) {
6765         Register TmpReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
6766         MIB.buildInstr(AArch64::LDRXui, {TmpReg}, {FrameAddr}).addImm(1);
6767         MIB.buildInstr(AArch64::XPACI, {DstReg}, {TmpReg});
6768       } else {
6769         MIB.buildInstr(AArch64::LDRXui, {Register(AArch64::LR)}, {FrameAddr})
6770             .addImm(1);
6771         MIB.buildInstr(AArch64::XPACLRI);
6772         MIB.buildCopy({DstReg}, {Register(AArch64::LR)});
6773       }
6774     }
6775 
6776     I.eraseFromParent();
6777     return true;
6778   }
6779   case Intrinsic::swift_async_context_addr:
6780     auto Sub = MIB.buildInstr(AArch64::SUBXri, {I.getOperand(0).getReg()},
6781                               {Register(AArch64::FP)})
6782                    .addImm(8)
6783                    .addImm(0);
6784     constrainSelectedInstRegOperands(*Sub, TII, TRI, RBI);
6785 
6786     MF->getFrameInfo().setFrameAddressIsTaken(true);
6787     MF->getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
6788     I.eraseFromParent();
6789     return true;
6790   }
6791   return false;
6792 }
6793 
6794 InstructionSelector::ComplexRendererFns
6795 AArch64InstructionSelector::selectShiftA_32(const MachineOperand &Root) const {
6796   auto MaybeImmed = getImmedFromMO(Root);
6797   if (MaybeImmed == std::nullopt || *MaybeImmed > 31)
6798     return std::nullopt;
6799   uint64_t Enc = (32 - *MaybeImmed) & 0x1f;
6800   return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
6801 }
6802 
6803 InstructionSelector::ComplexRendererFns
6804 AArch64InstructionSelector::selectShiftB_32(const MachineOperand &Root) const {
6805   auto MaybeImmed = getImmedFromMO(Root);
6806   if (MaybeImmed == std::nullopt || *MaybeImmed > 31)
6807     return std::nullopt;
6808   uint64_t Enc = 31 - *MaybeImmed;
6809   return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
6810 }
6811 
6812 InstructionSelector::ComplexRendererFns
6813 AArch64InstructionSelector::selectShiftA_64(const MachineOperand &Root) const {
6814   auto MaybeImmed = getImmedFromMO(Root);
6815   if (MaybeImmed == std::nullopt || *MaybeImmed > 63)
6816     return std::nullopt;
6817   uint64_t Enc = (64 - *MaybeImmed) & 0x3f;
6818   return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
6819 }
6820 
6821 InstructionSelector::ComplexRendererFns
6822 AArch64InstructionSelector::selectShiftB_64(const MachineOperand &Root) const {
6823   auto MaybeImmed = getImmedFromMO(Root);
6824   if (MaybeImmed == std::nullopt || *MaybeImmed > 63)
6825     return std::nullopt;
6826   uint64_t Enc = 63 - *MaybeImmed;
6827   return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
6828 }
6829 
6830 /// Helper to select an immediate value that can be represented as a 12-bit
6831 /// value shifted left by either 0 or 12. If it is possible to do so, return
6832 /// the immediate and shift value. If not, return std::nullopt.
6833 ///
6834 /// Used by selectArithImmed and selectNegArithImmed.
6835 InstructionSelector::ComplexRendererFns
6836 AArch64InstructionSelector::select12BitValueWithLeftShift(
6837     uint64_t Immed) const {
6838   unsigned ShiftAmt;
6839   if (Immed >> 12 == 0) {
6840     ShiftAmt = 0;
6841   } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) {
6842     ShiftAmt = 12;
6843     Immed = Immed >> 12;
6844   } else
6845     return std::nullopt;
6846 
6847   unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt);
6848   return {{
6849       [=](MachineInstrBuilder &MIB) { MIB.addImm(Immed); },
6850       [=](MachineInstrBuilder &MIB) { MIB.addImm(ShVal); },
6851   }};
6852 }
6853 
6854 /// SelectArithImmed - Select an immediate value that can be represented as
6855 /// a 12-bit value shifted left by either 0 or 12.  If so, return true with
6856 /// Val set to the 12-bit value and Shift set to the shifter operand.
6857 InstructionSelector::ComplexRendererFns
6858 AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const {
6859   // This function is called from the addsub_shifted_imm ComplexPattern,
6860   // which lists [imm] as the list of opcode it's interested in, however
6861   // we still need to check whether the operand is actually an immediate
6862   // here because the ComplexPattern opcode list is only used in
6863   // root-level opcode matching.
6864   auto MaybeImmed = getImmedFromMO(Root);
6865   if (MaybeImmed == std::nullopt)
6866     return std::nullopt;
6867   return select12BitValueWithLeftShift(*MaybeImmed);
6868 }
6869 
6870 /// SelectNegArithImmed - As above, but negates the value before trying to
6871 /// select it.
6872 InstructionSelector::ComplexRendererFns
6873 AArch64InstructionSelector::selectNegArithImmed(MachineOperand &Root) const {
6874   // We need a register here, because we need to know if we have a 64 or 32
6875   // bit immediate.
6876   if (!Root.isReg())
6877     return std::nullopt;
6878   auto MaybeImmed = getImmedFromMO(Root);
6879   if (MaybeImmed == std::nullopt)
6880     return std::nullopt;
6881   uint64_t Immed = *MaybeImmed;
6882 
6883   // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0"
6884   // have the opposite effect on the C flag, so this pattern mustn't match under
6885   // those circumstances.
6886   if (Immed == 0)
6887     return std::nullopt;
6888 
6889   // Check if we're dealing with a 32-bit type on the root or a 64-bit type on
6890   // the root.
6891   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
6892   if (MRI.getType(Root.getReg()).getSizeInBits() == 32)
6893     Immed = ~((uint32_t)Immed) + 1;
6894   else
6895     Immed = ~Immed + 1ULL;
6896 
6897   if (Immed & 0xFFFFFFFFFF000000ULL)
6898     return std::nullopt;
6899 
6900   Immed &= 0xFFFFFFULL;
6901   return select12BitValueWithLeftShift(Immed);
6902 }
6903 
6904 /// Return true if it is worth folding MI into an extended register. That is,
6905 /// if it's safe to pull it into the addressing mode of a load or store as a
6906 /// shift.
6907 bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg(
6908     MachineInstr &MI, const MachineRegisterInfo &MRI) const {
6909   // Always fold if there is one use, or if we're optimizing for size.
6910   Register DefReg = MI.getOperand(0).getReg();
6911   if (MRI.hasOneNonDBGUse(DefReg) ||
6912       MI.getParent()->getParent()->getFunction().hasOptSize())
6913     return true;
6914 
6915   // It's better to avoid folding and recomputing shifts when we don't have a
6916   // fastpath.
6917   if (!STI.hasAddrLSLFast())
6918     return false;
6919 
6920   // We have a fastpath, so folding a shift in and potentially computing it
6921   // many times may be beneficial. Check if this is only used in memory ops.
6922   // If it is, then we should fold.
6923   return all_of(MRI.use_nodbg_instructions(DefReg),
6924                 [](MachineInstr &Use) { return Use.mayLoadOrStore(); });
6925 }
6926 
6927 static bool isSignExtendShiftType(AArch64_AM::ShiftExtendType Type) {
6928   switch (Type) {
6929   case AArch64_AM::SXTB:
6930   case AArch64_AM::SXTH:
6931   case AArch64_AM::SXTW:
6932     return true;
6933   default:
6934     return false;
6935   }
6936 }
6937 
6938 InstructionSelector::ComplexRendererFns
6939 AArch64InstructionSelector::selectExtendedSHL(
6940     MachineOperand &Root, MachineOperand &Base, MachineOperand &Offset,
6941     unsigned SizeInBytes, bool WantsExt) const {
6942   assert(Base.isReg() && "Expected base to be a register operand");
6943   assert(Offset.isReg() && "Expected offset to be a register operand");
6944 
6945   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
6946   MachineInstr *OffsetInst = MRI.getVRegDef(Offset.getReg());
6947 
6948   unsigned OffsetOpc = OffsetInst->getOpcode();
6949   bool LookedThroughZExt = false;
6950   if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL) {
6951     // Try to look through a ZEXT.
6952     if (OffsetOpc != TargetOpcode::G_ZEXT || !WantsExt)
6953       return std::nullopt;
6954 
6955     OffsetInst = MRI.getVRegDef(OffsetInst->getOperand(1).getReg());
6956     OffsetOpc = OffsetInst->getOpcode();
6957     LookedThroughZExt = true;
6958 
6959     if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL)
6960       return std::nullopt;
6961   }
6962   // Make sure that the memory op is a valid size.
6963   int64_t LegalShiftVal = Log2_32(SizeInBytes);
6964   if (LegalShiftVal == 0)
6965     return std::nullopt;
6966   if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI))
6967     return std::nullopt;
6968 
6969   // Now, try to find the specific G_CONSTANT. Start by assuming that the
6970   // register we will offset is the LHS, and the register containing the
6971   // constant is the RHS.
6972   Register OffsetReg = OffsetInst->getOperand(1).getReg();
6973   Register ConstantReg = OffsetInst->getOperand(2).getReg();
6974   auto ValAndVReg = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
6975   if (!ValAndVReg) {
6976     // We didn't get a constant on the RHS. If the opcode is a shift, then
6977     // we're done.
6978     if (OffsetOpc == TargetOpcode::G_SHL)
6979       return std::nullopt;
6980 
6981     // If we have a G_MUL, we can use either register. Try looking at the RHS.
6982     std::swap(OffsetReg, ConstantReg);
6983     ValAndVReg = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
6984     if (!ValAndVReg)
6985       return std::nullopt;
6986   }
6987 
6988   // The value must fit into 3 bits, and must be positive. Make sure that is
6989   // true.
6990   int64_t ImmVal = ValAndVReg->Value.getSExtValue();
6991 
6992   // Since we're going to pull this into a shift, the constant value must be
6993   // a power of 2. If we got a multiply, then we need to check this.
6994   if (OffsetOpc == TargetOpcode::G_MUL) {
6995     if (!llvm::has_single_bit<uint32_t>(ImmVal))
6996       return std::nullopt;
6997 
6998     // Got a power of 2. So, the amount we'll shift is the log base-2 of that.
6999     ImmVal = Log2_32(ImmVal);
7000   }
7001 
7002   if ((ImmVal & 0x7) != ImmVal)
7003     return std::nullopt;
7004 
7005   // We are only allowed to shift by LegalShiftVal. This shift value is built
7006   // into the instruction, so we can't just use whatever we want.
7007   if (ImmVal != LegalShiftVal)
7008     return std::nullopt;
7009 
7010   unsigned SignExtend = 0;
7011   if (WantsExt) {
7012     // Check if the offset is defined by an extend, unless we looked through a
7013     // G_ZEXT earlier.
7014     if (!LookedThroughZExt) {
7015       MachineInstr *ExtInst = getDefIgnoringCopies(OffsetReg, MRI);
7016       auto Ext = getExtendTypeForInst(*ExtInst, MRI, true);
7017       if (Ext == AArch64_AM::InvalidShiftExtend)
7018         return std::nullopt;
7019 
7020       SignExtend = isSignExtendShiftType(Ext) ? 1 : 0;
7021       // We only support SXTW for signed extension here.
7022       if (SignExtend && Ext != AArch64_AM::SXTW)
7023         return std::nullopt;
7024       OffsetReg = ExtInst->getOperand(1).getReg();
7025     }
7026 
7027     // Need a 32-bit wide register here.
7028     MachineIRBuilder MIB(*MRI.getVRegDef(Root.getReg()));
7029     OffsetReg = moveScalarRegClass(OffsetReg, AArch64::GPR32RegClass, MIB);
7030   }
7031 
7032   // We can use the LHS of the GEP as the base, and the LHS of the shift as an
7033   // offset. Signify that we are shifting by setting the shift flag to 1.
7034   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(Base.getReg()); },
7035            [=](MachineInstrBuilder &MIB) { MIB.addUse(OffsetReg); },
7036            [=](MachineInstrBuilder &MIB) {
7037              // Need to add both immediates here to make sure that they are both
7038              // added to the instruction.
7039              MIB.addImm(SignExtend);
7040              MIB.addImm(1);
7041            }}};
7042 }
7043 
7044 /// This is used for computing addresses like this:
7045 ///
7046 /// ldr x1, [x2, x3, lsl #3]
7047 ///
7048 /// Where x2 is the base register, and x3 is an offset register. The shift-left
7049 /// is a constant value specific to this load instruction. That is, we'll never
7050 /// see anything other than a 3 here (which corresponds to the size of the
7051 /// element being loaded.)
7052 InstructionSelector::ComplexRendererFns
7053 AArch64InstructionSelector::selectAddrModeShiftedExtendXReg(
7054     MachineOperand &Root, unsigned SizeInBytes) const {
7055   if (!Root.isReg())
7056     return std::nullopt;
7057   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7058 
7059   // We want to find something like this:
7060   //
7061   // val = G_CONSTANT LegalShiftVal
7062   // shift = G_SHL off_reg val
7063   // ptr = G_PTR_ADD base_reg shift
7064   // x = G_LOAD ptr
7065   //
7066   // And fold it into this addressing mode:
7067   //
7068   // ldr x, [base_reg, off_reg, lsl #LegalShiftVal]
7069 
7070   // Check if we can find the G_PTR_ADD.
7071   MachineInstr *PtrAdd =
7072       getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
7073   if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI))
7074     return std::nullopt;
7075 
7076   // Now, try to match an opcode which will match our specific offset.
7077   // We want a G_SHL or a G_MUL.
7078   MachineInstr *OffsetInst =
7079       getDefIgnoringCopies(PtrAdd->getOperand(2).getReg(), MRI);
7080   return selectExtendedSHL(Root, PtrAdd->getOperand(1),
7081                            OffsetInst->getOperand(0), SizeInBytes,
7082                            /*WantsExt=*/false);
7083 }
7084 
7085 /// This is used for computing addresses like this:
7086 ///
7087 /// ldr x1, [x2, x3]
7088 ///
7089 /// Where x2 is the base register, and x3 is an offset register.
7090 ///
7091 /// When possible (or profitable) to fold a G_PTR_ADD into the address
7092 /// calculation, this will do so. Otherwise, it will return std::nullopt.
7093 InstructionSelector::ComplexRendererFns
7094 AArch64InstructionSelector::selectAddrModeRegisterOffset(
7095     MachineOperand &Root) const {
7096   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7097 
7098   // We need a GEP.
7099   MachineInstr *Gep = MRI.getVRegDef(Root.getReg());
7100   if (Gep->getOpcode() != TargetOpcode::G_PTR_ADD)
7101     return std::nullopt;
7102 
7103   // If this is used more than once, let's not bother folding.
7104   // TODO: Check if they are memory ops. If they are, then we can still fold
7105   // without having to recompute anything.
7106   if (!MRI.hasOneNonDBGUse(Gep->getOperand(0).getReg()))
7107     return std::nullopt;
7108 
7109   // Base is the GEP's LHS, offset is its RHS.
7110   return {{[=](MachineInstrBuilder &MIB) {
7111              MIB.addUse(Gep->getOperand(1).getReg());
7112            },
7113            [=](MachineInstrBuilder &MIB) {
7114              MIB.addUse(Gep->getOperand(2).getReg());
7115            },
7116            [=](MachineInstrBuilder &MIB) {
7117              // Need to add both immediates here to make sure that they are both
7118              // added to the instruction.
7119              MIB.addImm(0);
7120              MIB.addImm(0);
7121            }}};
7122 }
7123 
7124 /// This is intended to be equivalent to selectAddrModeXRO in
7125 /// AArch64ISelDAGtoDAG. It's used for selecting X register offset loads.
7126 InstructionSelector::ComplexRendererFns
7127 AArch64InstructionSelector::selectAddrModeXRO(MachineOperand &Root,
7128                                               unsigned SizeInBytes) const {
7129   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7130   if (!Root.isReg())
7131     return std::nullopt;
7132   MachineInstr *PtrAdd =
7133       getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
7134   if (!PtrAdd)
7135     return std::nullopt;
7136 
7137   // Check for an immediates which cannot be encoded in the [base + imm]
7138   // addressing mode, and can't be encoded in an add/sub. If this happens, we'll
7139   // end up with code like:
7140   //
7141   // mov x0, wide
7142   // add x1 base, x0
7143   // ldr x2, [x1, x0]
7144   //
7145   // In this situation, we can use the [base, xreg] addressing mode to save an
7146   // add/sub:
7147   //
7148   // mov x0, wide
7149   // ldr x2, [base, x0]
7150   auto ValAndVReg =
7151       getIConstantVRegValWithLookThrough(PtrAdd->getOperand(2).getReg(), MRI);
7152   if (ValAndVReg) {
7153     unsigned Scale = Log2_32(SizeInBytes);
7154     int64_t ImmOff = ValAndVReg->Value.getSExtValue();
7155 
7156     // Skip immediates that can be selected in the load/store addresing
7157     // mode.
7158     if (ImmOff % SizeInBytes == 0 && ImmOff >= 0 &&
7159         ImmOff < (0x1000 << Scale))
7160       return std::nullopt;
7161 
7162     // Helper lambda to decide whether or not it is preferable to emit an add.
7163     auto isPreferredADD = [](int64_t ImmOff) {
7164       // Constants in [0x0, 0xfff] can be encoded in an add.
7165       if ((ImmOff & 0xfffffffffffff000LL) == 0x0LL)
7166         return true;
7167 
7168       // Can it be encoded in an add lsl #12?
7169       if ((ImmOff & 0xffffffffff000fffLL) != 0x0LL)
7170         return false;
7171 
7172       // It can be encoded in an add lsl #12, but we may not want to. If it is
7173       // possible to select this as a single movz, then prefer that. A single
7174       // movz is faster than an add with a shift.
7175       return (ImmOff & 0xffffffffff00ffffLL) != 0x0LL &&
7176              (ImmOff & 0xffffffffffff0fffLL) != 0x0LL;
7177     };
7178 
7179     // If the immediate can be encoded in a single add/sub, then bail out.
7180     if (isPreferredADD(ImmOff) || isPreferredADD(-ImmOff))
7181       return std::nullopt;
7182   }
7183 
7184   // Try to fold shifts into the addressing mode.
7185   auto AddrModeFns = selectAddrModeShiftedExtendXReg(Root, SizeInBytes);
7186   if (AddrModeFns)
7187     return AddrModeFns;
7188 
7189   // If that doesn't work, see if it's possible to fold in registers from
7190   // a GEP.
7191   return selectAddrModeRegisterOffset(Root);
7192 }
7193 
7194 /// This is used for computing addresses like this:
7195 ///
7196 /// ldr x0, [xBase, wOffset, sxtw #LegalShiftVal]
7197 ///
7198 /// Where we have a 64-bit base register, a 32-bit offset register, and an
7199 /// extend (which may or may not be signed).
7200 InstructionSelector::ComplexRendererFns
7201 AArch64InstructionSelector::selectAddrModeWRO(MachineOperand &Root,
7202                                               unsigned SizeInBytes) const {
7203   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7204 
7205   MachineInstr *PtrAdd =
7206       getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
7207   if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI))
7208     return std::nullopt;
7209 
7210   MachineOperand &LHS = PtrAdd->getOperand(1);
7211   MachineOperand &RHS = PtrAdd->getOperand(2);
7212   MachineInstr *OffsetInst = getDefIgnoringCopies(RHS.getReg(), MRI);
7213 
7214   // The first case is the same as selectAddrModeXRO, except we need an extend.
7215   // In this case, we try to find a shift and extend, and fold them into the
7216   // addressing mode.
7217   //
7218   // E.g.
7219   //
7220   // off_reg = G_Z/S/ANYEXT ext_reg
7221   // val = G_CONSTANT LegalShiftVal
7222   // shift = G_SHL off_reg val
7223   // ptr = G_PTR_ADD base_reg shift
7224   // x = G_LOAD ptr
7225   //
7226   // In this case we can get a load like this:
7227   //
7228   // ldr x0, [base_reg, ext_reg, sxtw #LegalShiftVal]
7229   auto ExtendedShl = selectExtendedSHL(Root, LHS, OffsetInst->getOperand(0),
7230                                        SizeInBytes, /*WantsExt=*/true);
7231   if (ExtendedShl)
7232     return ExtendedShl;
7233 
7234   // There was no shift. We can try and fold a G_Z/S/ANYEXT in alone though.
7235   //
7236   // e.g.
7237   // ldr something, [base_reg, ext_reg, sxtw]
7238   if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI))
7239     return std::nullopt;
7240 
7241   // Check if this is an extend. We'll get an extend type if it is.
7242   AArch64_AM::ShiftExtendType Ext =
7243       getExtendTypeForInst(*OffsetInst, MRI, /*IsLoadStore=*/true);
7244   if (Ext == AArch64_AM::InvalidShiftExtend)
7245     return std::nullopt;
7246 
7247   // Need a 32-bit wide register.
7248   MachineIRBuilder MIB(*PtrAdd);
7249   Register ExtReg = moveScalarRegClass(OffsetInst->getOperand(1).getReg(),
7250                                        AArch64::GPR32RegClass, MIB);
7251   unsigned SignExtend = Ext == AArch64_AM::SXTW;
7252 
7253   // Base is LHS, offset is ExtReg.
7254   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(LHS.getReg()); },
7255            [=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); },
7256            [=](MachineInstrBuilder &MIB) {
7257              MIB.addImm(SignExtend);
7258              MIB.addImm(0);
7259            }}};
7260 }
7261 
7262 /// Select a "register plus unscaled signed 9-bit immediate" address.  This
7263 /// should only match when there is an offset that is not valid for a scaled
7264 /// immediate addressing mode.  The "Size" argument is the size in bytes of the
7265 /// memory reference, which is needed here to know what is valid for a scaled
7266 /// immediate.
7267 InstructionSelector::ComplexRendererFns
7268 AArch64InstructionSelector::selectAddrModeUnscaled(MachineOperand &Root,
7269                                                    unsigned Size) const {
7270   MachineRegisterInfo &MRI =
7271       Root.getParent()->getParent()->getParent()->getRegInfo();
7272 
7273   if (!Root.isReg())
7274     return std::nullopt;
7275 
7276   if (!isBaseWithConstantOffset(Root, MRI))
7277     return std::nullopt;
7278 
7279   MachineInstr *RootDef = MRI.getVRegDef(Root.getReg());
7280 
7281   MachineOperand &OffImm = RootDef->getOperand(2);
7282   if (!OffImm.isReg())
7283     return std::nullopt;
7284   MachineInstr *RHS = MRI.getVRegDef(OffImm.getReg());
7285   if (RHS->getOpcode() != TargetOpcode::G_CONSTANT)
7286     return std::nullopt;
7287   int64_t RHSC;
7288   MachineOperand &RHSOp1 = RHS->getOperand(1);
7289   if (!RHSOp1.isCImm() || RHSOp1.getCImm()->getBitWidth() > 64)
7290     return std::nullopt;
7291   RHSC = RHSOp1.getCImm()->getSExtValue();
7292 
7293   if (RHSC >= -256 && RHSC < 256) {
7294     MachineOperand &Base = RootDef->getOperand(1);
7295     return {{
7296         [=](MachineInstrBuilder &MIB) { MIB.add(Base); },
7297         [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC); },
7298     }};
7299   }
7300   return std::nullopt;
7301 }
7302 
7303 InstructionSelector::ComplexRendererFns
7304 AArch64InstructionSelector::tryFoldAddLowIntoImm(MachineInstr &RootDef,
7305                                                  unsigned Size,
7306                                                  MachineRegisterInfo &MRI) const {
7307   if (RootDef.getOpcode() != AArch64::G_ADD_LOW)
7308     return std::nullopt;
7309   MachineInstr &Adrp = *MRI.getVRegDef(RootDef.getOperand(1).getReg());
7310   if (Adrp.getOpcode() != AArch64::ADRP)
7311     return std::nullopt;
7312 
7313   // TODO: add heuristics like isWorthFoldingADDlow() from SelectionDAG.
7314   auto Offset = Adrp.getOperand(1).getOffset();
7315   if (Offset % Size != 0)
7316     return std::nullopt;
7317 
7318   auto GV = Adrp.getOperand(1).getGlobal();
7319   if (GV->isThreadLocal())
7320     return std::nullopt;
7321 
7322   auto &MF = *RootDef.getParent()->getParent();
7323   if (GV->getPointerAlignment(MF.getDataLayout()) < Size)
7324     return std::nullopt;
7325 
7326   unsigned OpFlags = STI.ClassifyGlobalReference(GV, MF.getTarget());
7327   MachineIRBuilder MIRBuilder(RootDef);
7328   Register AdrpReg = Adrp.getOperand(0).getReg();
7329   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(AdrpReg); },
7330            [=](MachineInstrBuilder &MIB) {
7331              MIB.addGlobalAddress(GV, Offset,
7332                                   OpFlags | AArch64II::MO_PAGEOFF |
7333                                       AArch64II::MO_NC);
7334            }}};
7335 }
7336 
7337 /// Select a "register plus scaled unsigned 12-bit immediate" address.  The
7338 /// "Size" argument is the size in bytes of the memory reference, which
7339 /// determines the scale.
7340 InstructionSelector::ComplexRendererFns
7341 AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root,
7342                                                   unsigned Size) const {
7343   MachineFunction &MF = *Root.getParent()->getParent()->getParent();
7344   MachineRegisterInfo &MRI = MF.getRegInfo();
7345 
7346   if (!Root.isReg())
7347     return std::nullopt;
7348 
7349   MachineInstr *RootDef = MRI.getVRegDef(Root.getReg());
7350   if (RootDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) {
7351     return {{
7352         [=](MachineInstrBuilder &MIB) { MIB.add(RootDef->getOperand(1)); },
7353         [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
7354     }};
7355   }
7356 
7357   CodeModel::Model CM = MF.getTarget().getCodeModel();
7358   // Check if we can fold in the ADD of small code model ADRP + ADD address.
7359   if (CM == CodeModel::Small) {
7360     auto OpFns = tryFoldAddLowIntoImm(*RootDef, Size, MRI);
7361     if (OpFns)
7362       return OpFns;
7363   }
7364 
7365   if (isBaseWithConstantOffset(Root, MRI)) {
7366     MachineOperand &LHS = RootDef->getOperand(1);
7367     MachineOperand &RHS = RootDef->getOperand(2);
7368     MachineInstr *LHSDef = MRI.getVRegDef(LHS.getReg());
7369     MachineInstr *RHSDef = MRI.getVRegDef(RHS.getReg());
7370 
7371     int64_t RHSC = (int64_t)RHSDef->getOperand(1).getCImm()->getZExtValue();
7372     unsigned Scale = Log2_32(Size);
7373     if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) {
7374       if (LHSDef->getOpcode() == TargetOpcode::G_FRAME_INDEX)
7375         return {{
7376             [=](MachineInstrBuilder &MIB) { MIB.add(LHSDef->getOperand(1)); },
7377             [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); },
7378         }};
7379 
7380       return {{
7381           [=](MachineInstrBuilder &MIB) { MIB.add(LHS); },
7382           [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); },
7383       }};
7384     }
7385   }
7386 
7387   // Before falling back to our general case, check if the unscaled
7388   // instructions can handle this. If so, that's preferable.
7389   if (selectAddrModeUnscaled(Root, Size))
7390     return std::nullopt;
7391 
7392   return {{
7393       [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
7394       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
7395   }};
7396 }
7397 
7398 /// Given a shift instruction, return the correct shift type for that
7399 /// instruction.
7400 static AArch64_AM::ShiftExtendType getShiftTypeForInst(MachineInstr &MI) {
7401   switch (MI.getOpcode()) {
7402   default:
7403     return AArch64_AM::InvalidShiftExtend;
7404   case TargetOpcode::G_SHL:
7405     return AArch64_AM::LSL;
7406   case TargetOpcode::G_LSHR:
7407     return AArch64_AM::LSR;
7408   case TargetOpcode::G_ASHR:
7409     return AArch64_AM::ASR;
7410   case TargetOpcode::G_ROTR:
7411     return AArch64_AM::ROR;
7412   }
7413 }
7414 
7415 /// Select a "shifted register" operand. If the value is not shifted, set the
7416 /// shift operand to a default value of "lsl 0".
7417 InstructionSelector::ComplexRendererFns
7418 AArch64InstructionSelector::selectShiftedRegister(MachineOperand &Root,
7419                                                   bool AllowROR) const {
7420   if (!Root.isReg())
7421     return std::nullopt;
7422   MachineRegisterInfo &MRI =
7423       Root.getParent()->getParent()->getParent()->getRegInfo();
7424 
7425   // Check if the operand is defined by an instruction which corresponds to
7426   // a ShiftExtendType. E.g. a G_SHL, G_LSHR, etc.
7427   MachineInstr *ShiftInst = MRI.getVRegDef(Root.getReg());
7428   AArch64_AM::ShiftExtendType ShType = getShiftTypeForInst(*ShiftInst);
7429   if (ShType == AArch64_AM::InvalidShiftExtend)
7430     return std::nullopt;
7431   if (ShType == AArch64_AM::ROR && !AllowROR)
7432     return std::nullopt;
7433   if (!isWorthFoldingIntoExtendedReg(*ShiftInst, MRI))
7434     return std::nullopt;
7435 
7436   // Need an immediate on the RHS.
7437   MachineOperand &ShiftRHS = ShiftInst->getOperand(2);
7438   auto Immed = getImmedFromMO(ShiftRHS);
7439   if (!Immed)
7440     return std::nullopt;
7441 
7442   // We have something that we can fold. Fold in the shift's LHS and RHS into
7443   // the instruction.
7444   MachineOperand &ShiftLHS = ShiftInst->getOperand(1);
7445   Register ShiftReg = ShiftLHS.getReg();
7446 
7447   unsigned NumBits = MRI.getType(ShiftReg).getSizeInBits();
7448   unsigned Val = *Immed & (NumBits - 1);
7449   unsigned ShiftVal = AArch64_AM::getShifterImm(ShType, Val);
7450 
7451   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ShiftReg); },
7452            [=](MachineInstrBuilder &MIB) { MIB.addImm(ShiftVal); }}};
7453 }
7454 
7455 AArch64_AM::ShiftExtendType AArch64InstructionSelector::getExtendTypeForInst(
7456     MachineInstr &MI, MachineRegisterInfo &MRI, bool IsLoadStore) const {
7457   unsigned Opc = MI.getOpcode();
7458 
7459   // Handle explicit extend instructions first.
7460   if (Opc == TargetOpcode::G_SEXT || Opc == TargetOpcode::G_SEXT_INREG) {
7461     unsigned Size;
7462     if (Opc == TargetOpcode::G_SEXT)
7463       Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
7464     else
7465       Size = MI.getOperand(2).getImm();
7466     assert(Size != 64 && "Extend from 64 bits?");
7467     switch (Size) {
7468     case 8:
7469       return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTB;
7470     case 16:
7471       return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTH;
7472     case 32:
7473       return AArch64_AM::SXTW;
7474     default:
7475       return AArch64_AM::InvalidShiftExtend;
7476     }
7477   }
7478 
7479   if (Opc == TargetOpcode::G_ZEXT || Opc == TargetOpcode::G_ANYEXT) {
7480     unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
7481     assert(Size != 64 && "Extend from 64 bits?");
7482     switch (Size) {
7483     case 8:
7484       return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTB;
7485     case 16:
7486       return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTH;
7487     case 32:
7488       return AArch64_AM::UXTW;
7489     default:
7490       return AArch64_AM::InvalidShiftExtend;
7491     }
7492   }
7493 
7494   // Don't have an explicit extend. Try to handle a G_AND with a constant mask
7495   // on the RHS.
7496   if (Opc != TargetOpcode::G_AND)
7497     return AArch64_AM::InvalidShiftExtend;
7498 
7499   std::optional<uint64_t> MaybeAndMask = getImmedFromMO(MI.getOperand(2));
7500   if (!MaybeAndMask)
7501     return AArch64_AM::InvalidShiftExtend;
7502   uint64_t AndMask = *MaybeAndMask;
7503   switch (AndMask) {
7504   default:
7505     return AArch64_AM::InvalidShiftExtend;
7506   case 0xFF:
7507     return !IsLoadStore ? AArch64_AM::UXTB : AArch64_AM::InvalidShiftExtend;
7508   case 0xFFFF:
7509     return !IsLoadStore ? AArch64_AM::UXTH : AArch64_AM::InvalidShiftExtend;
7510   case 0xFFFFFFFF:
7511     return AArch64_AM::UXTW;
7512   }
7513 }
7514 
7515 Register AArch64InstructionSelector::moveScalarRegClass(
7516     Register Reg, const TargetRegisterClass &RC, MachineIRBuilder &MIB) const {
7517   MachineRegisterInfo &MRI = *MIB.getMRI();
7518   auto Ty = MRI.getType(Reg);
7519   assert(!Ty.isVector() && "Expected scalars only!");
7520   if (Ty.getSizeInBits() == TRI.getRegSizeInBits(RC))
7521     return Reg;
7522 
7523   // Create a copy and immediately select it.
7524   // FIXME: We should have an emitCopy function?
7525   auto Copy = MIB.buildCopy({&RC}, {Reg});
7526   selectCopy(*Copy, TII, MRI, TRI, RBI);
7527   return Copy.getReg(0);
7528 }
7529 
7530 /// Select an "extended register" operand. This operand folds in an extend
7531 /// followed by an optional left shift.
7532 InstructionSelector::ComplexRendererFns
7533 AArch64InstructionSelector::selectArithExtendedRegister(
7534     MachineOperand &Root) const {
7535   if (!Root.isReg())
7536     return std::nullopt;
7537   MachineRegisterInfo &MRI =
7538       Root.getParent()->getParent()->getParent()->getRegInfo();
7539 
7540   uint64_t ShiftVal = 0;
7541   Register ExtReg;
7542   AArch64_AM::ShiftExtendType Ext;
7543   MachineInstr *RootDef = getDefIgnoringCopies(Root.getReg(), MRI);
7544   if (!RootDef)
7545     return std::nullopt;
7546 
7547   if (!isWorthFoldingIntoExtendedReg(*RootDef, MRI))
7548     return std::nullopt;
7549 
7550   // Check if we can fold a shift and an extend.
7551   if (RootDef->getOpcode() == TargetOpcode::G_SHL) {
7552     // Look for a constant on the RHS of the shift.
7553     MachineOperand &RHS = RootDef->getOperand(2);
7554     std::optional<uint64_t> MaybeShiftVal = getImmedFromMO(RHS);
7555     if (!MaybeShiftVal)
7556       return std::nullopt;
7557     ShiftVal = *MaybeShiftVal;
7558     if (ShiftVal > 4)
7559       return std::nullopt;
7560     // Look for a valid extend instruction on the LHS of the shift.
7561     MachineOperand &LHS = RootDef->getOperand(1);
7562     MachineInstr *ExtDef = getDefIgnoringCopies(LHS.getReg(), MRI);
7563     if (!ExtDef)
7564       return std::nullopt;
7565     Ext = getExtendTypeForInst(*ExtDef, MRI);
7566     if (Ext == AArch64_AM::InvalidShiftExtend)
7567       return std::nullopt;
7568     ExtReg = ExtDef->getOperand(1).getReg();
7569   } else {
7570     // Didn't get a shift. Try just folding an extend.
7571     Ext = getExtendTypeForInst(*RootDef, MRI);
7572     if (Ext == AArch64_AM::InvalidShiftExtend)
7573       return std::nullopt;
7574     ExtReg = RootDef->getOperand(1).getReg();
7575 
7576     // If we have a 32 bit instruction which zeroes out the high half of a
7577     // register, we get an implicit zero extend for free. Check if we have one.
7578     // FIXME: We actually emit the extend right now even though we don't have
7579     // to.
7580     if (Ext == AArch64_AM::UXTW && MRI.getType(ExtReg).getSizeInBits() == 32) {
7581       MachineInstr *ExtInst = MRI.getVRegDef(ExtReg);
7582       if (isDef32(*ExtInst))
7583         return std::nullopt;
7584     }
7585   }
7586 
7587   // We require a GPR32 here. Narrow the ExtReg if needed using a subregister
7588   // copy.
7589   MachineIRBuilder MIB(*RootDef);
7590   ExtReg = moveScalarRegClass(ExtReg, AArch64::GPR32RegClass, MIB);
7591 
7592   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); },
7593            [=](MachineInstrBuilder &MIB) {
7594              MIB.addImm(getArithExtendImm(Ext, ShiftVal));
7595            }}};
7596 }
7597 
7598 InstructionSelector::ComplexRendererFns
7599 AArch64InstructionSelector::selectExtractHigh(MachineOperand &Root) const {
7600   if (!Root.isReg())
7601     return std::nullopt;
7602   MachineRegisterInfo &MRI =
7603       Root.getParent()->getParent()->getParent()->getRegInfo();
7604 
7605   auto Extract = getDefSrcRegIgnoringCopies(Root.getReg(), MRI);
7606   while (Extract && Extract->MI->getOpcode() == TargetOpcode::G_BITCAST &&
7607          STI.isLittleEndian())
7608     Extract =
7609         getDefSrcRegIgnoringCopies(Extract->MI->getOperand(1).getReg(), MRI);
7610   if (!Extract)
7611     return std::nullopt;
7612 
7613   if (Extract->MI->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) {
7614     if (Extract->Reg == Extract->MI->getOperand(1).getReg()) {
7615       Register ExtReg = Extract->MI->getOperand(2).getReg();
7616       return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); }}};
7617     }
7618   }
7619   if (Extract->MI->getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT) {
7620     LLT SrcTy = MRI.getType(Extract->MI->getOperand(1).getReg());
7621     auto LaneIdx = getIConstantVRegValWithLookThrough(
7622         Extract->MI->getOperand(2).getReg(), MRI);
7623     if (LaneIdx && SrcTy == LLT::fixed_vector(2, 64) &&
7624         LaneIdx->Value.getSExtValue() == 1) {
7625       Register ExtReg = Extract->MI->getOperand(1).getReg();
7626       return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); }}};
7627     }
7628   }
7629 
7630   return std::nullopt;
7631 }
7632 
7633 void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB,
7634                                                 const MachineInstr &MI,
7635                                                 int OpIdx) const {
7636   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
7637   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7638          "Expected G_CONSTANT");
7639   std::optional<int64_t> CstVal =
7640       getIConstantVRegSExtVal(MI.getOperand(0).getReg(), MRI);
7641   assert(CstVal && "Expected constant value");
7642   MIB.addImm(*CstVal);
7643 }
7644 
7645 void AArch64InstructionSelector::renderLogicalImm32(
7646   MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
7647   assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7648          "Expected G_CONSTANT");
7649   uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
7650   uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 32);
7651   MIB.addImm(Enc);
7652 }
7653 
7654 void AArch64InstructionSelector::renderLogicalImm64(
7655   MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
7656   assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7657          "Expected G_CONSTANT");
7658   uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
7659   uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 64);
7660   MIB.addImm(Enc);
7661 }
7662 
7663 void AArch64InstructionSelector::renderFPImm16(MachineInstrBuilder &MIB,
7664                                                const MachineInstr &MI,
7665                                                int OpIdx) const {
7666   assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
7667          "Expected G_FCONSTANT");
7668   MIB.addImm(
7669       AArch64_AM::getFP16Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
7670 }
7671 
7672 void AArch64InstructionSelector::renderFPImm32(MachineInstrBuilder &MIB,
7673                                                const MachineInstr &MI,
7674                                                int OpIdx) const {
7675   assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
7676          "Expected G_FCONSTANT");
7677   MIB.addImm(
7678       AArch64_AM::getFP32Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
7679 }
7680 
7681 void AArch64InstructionSelector::renderFPImm64(MachineInstrBuilder &MIB,
7682                                                const MachineInstr &MI,
7683                                                int OpIdx) const {
7684   assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
7685          "Expected G_FCONSTANT");
7686   MIB.addImm(
7687       AArch64_AM::getFP64Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
7688 }
7689 
7690 void AArch64InstructionSelector::renderFPImm32SIMDModImmType4(
7691     MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7692   assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
7693          "Expected G_FCONSTANT");
7694   MIB.addImm(AArch64_AM::encodeAdvSIMDModImmType4(MI.getOperand(1)
7695                                                       .getFPImm()
7696                                                       ->getValueAPF()
7697                                                       .bitcastToAPInt()
7698                                                       .getZExtValue()));
7699 }
7700 
7701 bool AArch64InstructionSelector::isLoadStoreOfNumBytes(
7702     const MachineInstr &MI, unsigned NumBytes) const {
7703   if (!MI.mayLoadOrStore())
7704     return false;
7705   assert(MI.hasOneMemOperand() &&
7706          "Expected load/store to have only one mem op!");
7707   return (*MI.memoperands_begin())->getSize() == NumBytes;
7708 }
7709 
7710 bool AArch64InstructionSelector::isDef32(const MachineInstr &MI) const {
7711   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
7712   if (MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() != 32)
7713     return false;
7714 
7715   // Only return true if we know the operation will zero-out the high half of
7716   // the 64-bit register. Truncates can be subregister copies, which don't
7717   // zero out the high bits. Copies and other copy-like instructions can be
7718   // fed by truncates, or could be lowered as subregister copies.
7719   switch (MI.getOpcode()) {
7720   default:
7721     return true;
7722   case TargetOpcode::COPY:
7723   case TargetOpcode::G_BITCAST:
7724   case TargetOpcode::G_TRUNC:
7725   case TargetOpcode::G_PHI:
7726     return false;
7727   }
7728 }
7729 
7730 
7731 // Perform fixups on the given PHI instruction's operands to force them all
7732 // to be the same as the destination regbank.
7733 static void fixupPHIOpBanks(MachineInstr &MI, MachineRegisterInfo &MRI,
7734                             const AArch64RegisterBankInfo &RBI) {
7735   assert(MI.getOpcode() == TargetOpcode::G_PHI && "Expected a G_PHI");
7736   Register DstReg = MI.getOperand(0).getReg();
7737   const RegisterBank *DstRB = MRI.getRegBankOrNull(DstReg);
7738   assert(DstRB && "Expected PHI dst to have regbank assigned");
7739   MachineIRBuilder MIB(MI);
7740 
7741   // Go through each operand and ensure it has the same regbank.
7742   for (MachineOperand &MO : llvm::drop_begin(MI.operands())) {
7743     if (!MO.isReg())
7744       continue;
7745     Register OpReg = MO.getReg();
7746     const RegisterBank *RB = MRI.getRegBankOrNull(OpReg);
7747     if (RB != DstRB) {
7748       // Insert a cross-bank copy.
7749       auto *OpDef = MRI.getVRegDef(OpReg);
7750       const LLT &Ty = MRI.getType(OpReg);
7751       MachineBasicBlock &OpDefBB = *OpDef->getParent();
7752 
7753       // Any instruction we insert must appear after all PHIs in the block
7754       // for the block to be valid MIR.
7755       MachineBasicBlock::iterator InsertPt = std::next(OpDef->getIterator());
7756       if (InsertPt != OpDefBB.end() && InsertPt->isPHI())
7757         InsertPt = OpDefBB.getFirstNonPHI();
7758       MIB.setInsertPt(*OpDef->getParent(), InsertPt);
7759       auto Copy = MIB.buildCopy(Ty, OpReg);
7760       MRI.setRegBank(Copy.getReg(0), *DstRB);
7761       MO.setReg(Copy.getReg(0));
7762     }
7763   }
7764 }
7765 
7766 void AArch64InstructionSelector::processPHIs(MachineFunction &MF) {
7767   // We're looking for PHIs, build a list so we don't invalidate iterators.
7768   MachineRegisterInfo &MRI = MF.getRegInfo();
7769   SmallVector<MachineInstr *, 32> Phis;
7770   for (auto &BB : MF) {
7771     for (auto &MI : BB) {
7772       if (MI.getOpcode() == TargetOpcode::G_PHI)
7773         Phis.emplace_back(&MI);
7774     }
7775   }
7776 
7777   for (auto *MI : Phis) {
7778     // We need to do some work here if the operand types are < 16 bit and they
7779     // are split across fpr/gpr banks. Since all types <32b on gpr
7780     // end up being assigned gpr32 regclasses, we can end up with PHIs here
7781     // which try to select between a gpr32 and an fpr16. Ideally RBS shouldn't
7782     // be selecting heterogenous regbanks for operands if possible, but we
7783     // still need to be able to deal with it here.
7784     //
7785     // To fix this, if we have a gpr-bank operand < 32b in size and at least
7786     // one other operand is on the fpr bank, then we add cross-bank copies
7787     // to homogenize the operand banks. For simplicity the bank that we choose
7788     // to settle on is whatever bank the def operand has. For example:
7789     //
7790     // %endbb:
7791     //   %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2:fpr(s16), %bb2
7792     //  =>
7793     // %bb2:
7794     //   ...
7795     //   %in2_copy:gpr(s16) = COPY %in2:fpr(s16)
7796     //   ...
7797     // %endbb:
7798     //   %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2_copy:gpr(s16), %bb2
7799     bool HasGPROp = false, HasFPROp = false;
7800     for (const MachineOperand &MO : llvm::drop_begin(MI->operands())) {
7801       if (!MO.isReg())
7802         continue;
7803       const LLT &Ty = MRI.getType(MO.getReg());
7804       if (!Ty.isValid() || !Ty.isScalar())
7805         break;
7806       if (Ty.getSizeInBits() >= 32)
7807         break;
7808       const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg());
7809       // If for some reason we don't have a regbank yet. Don't try anything.
7810       if (!RB)
7811         break;
7812 
7813       if (RB->getID() == AArch64::GPRRegBankID)
7814         HasGPROp = true;
7815       else
7816         HasFPROp = true;
7817     }
7818     // We have heterogenous regbanks, need to fixup.
7819     if (HasGPROp && HasFPROp)
7820       fixupPHIOpBanks(*MI, MRI, RBI);
7821   }
7822 }
7823 
7824 namespace llvm {
7825 InstructionSelector *
7826 createAArch64InstructionSelector(const AArch64TargetMachine &TM,
7827                                  AArch64Subtarget &Subtarget,
7828                                  AArch64RegisterBankInfo &RBI) {
7829   return new AArch64InstructionSelector(TM, Subtarget, RBI);
7830 }
7831 }
7832