xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp (revision 2e3507c25e42292b45a5482e116d278f5515d04d)
1 //===- AArch64InstructionSelector.cpp ----------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the InstructionSelector class for
10 /// AArch64.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AArch64GlobalISelUtils.h"
15 #include "AArch64InstrInfo.h"
16 #include "AArch64MachineFunctionInfo.h"
17 #include "AArch64RegisterBankInfo.h"
18 #include "AArch64RegisterInfo.h"
19 #include "AArch64Subtarget.h"
20 #include "AArch64TargetMachine.h"
21 #include "MCTargetDesc/AArch64AddressingModes.h"
22 #include "MCTargetDesc/AArch64MCTargetDesc.h"
23 #include "llvm/BinaryFormat/Dwarf.h"
24 #include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
25 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
26 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
27 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
28 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
29 #include "llvm/CodeGen/GlobalISel/Utils.h"
30 #include "llvm/CodeGen/MachineBasicBlock.h"
31 #include "llvm/CodeGen/MachineConstantPool.h"
32 #include "llvm/CodeGen/MachineFrameInfo.h"
33 #include "llvm/CodeGen/MachineFunction.h"
34 #include "llvm/CodeGen/MachineInstr.h"
35 #include "llvm/CodeGen/MachineInstrBuilder.h"
36 #include "llvm/CodeGen/MachineMemOperand.h"
37 #include "llvm/CodeGen/MachineOperand.h"
38 #include "llvm/CodeGen/MachineRegisterInfo.h"
39 #include "llvm/CodeGen/TargetOpcodes.h"
40 #include "llvm/IR/Constants.h"
41 #include "llvm/IR/DerivedTypes.h"
42 #include "llvm/IR/Instructions.h"
43 #include "llvm/IR/IntrinsicsAArch64.h"
44 #include "llvm/IR/PatternMatch.h"
45 #include "llvm/IR/Type.h"
46 #include "llvm/Pass.h"
47 #include "llvm/Support/Debug.h"
48 #include "llvm/Support/raw_ostream.h"
49 #include <optional>
50 
51 #define DEBUG_TYPE "aarch64-isel"
52 
53 using namespace llvm;
54 using namespace MIPatternMatch;
55 using namespace AArch64GISelUtils;
56 
// Forward declarations only: in the part of the file shown here these types
// appear solely as pointer parameters of
// AArch64InstructionSelector::setupMF().
57 namespace llvm {
58 class BlockFrequencyInfo;
59 class ProfileSummaryInfo;
60 } // end namespace llvm
61 
62 namespace {
63 
64 #define GET_GLOBALISEL_PREDICATE_BITSET
65 #include "AArch64GenGlobalISel.inc"
66 #undef GET_GLOBALISEL_PREDICATE_BITSET
67 
68 
/// AArch64 implementation of the GlobalISel InstructionSelector interface.
///
/// select() is the public entry point. The class combines the tblgen-erated
/// selectImpl() with the hand-written select*/emit*/tryOpt* helpers and the
/// complex-operand renderers declared below.
69 class AArch64InstructionSelector : public InstructionSelector {
70 public:
71   AArch64InstructionSelector(const AArch64TargetMachine &TM,
72                              const AArch64Subtarget &STI,
73                              const AArch64RegisterBankInfo &RBI);
74 
75   bool select(MachineInstr &I) override;
76   static const char *getName() { return DEBUG_TYPE; }
77 
  /// Per-function setup: forwards to the base class, points the member
  /// MachineIRBuilder at \p MF, refreshes the per-function cached state and
  /// finally runs processPHIs() over \p MF.
78   void setupMF(MachineFunction &MF, GISelKnownBits *KB,
79                CodeGenCoverage *CoverageInfo, ProfileSummaryInfo *PSI,
80                BlockFrequencyInfo *BFI) override {
81     InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
82     MIB.setMF(MF);
83 
84     // hasFnAttribute() is expensive to call on every BRCOND selection, so
85     // cache it here for each run of the selector.
86     ProduceNonFlagSettingCondBr =
87         !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
88     MFReturnAddr = Register(); // Drop the register cached for a previous MF.
89 
90     processPHIs(MF);
91   }
92 
93 private:
94   /// tblgen-erated 'select' implementation, used as the initial selector for
95   /// the patterns that don't require complex C++.
96   bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;
97 
98   // A lowering phase that runs before any selection attempts.
99   // Returns true if the instruction was modified.
100   bool preISelLower(MachineInstr &I);
101 
102   // An early selection function that runs before the selectImpl() call.
103   bool earlySelect(MachineInstr &I);
104 
105   // Do some preprocessing of G_PHIs before we begin selection.
106   void processPHIs(MachineFunction &MF);
107 
108   bool earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI);
109 
110   /// Eliminate same-sized cross-bank copies into stores before selectImpl().
111   bool contractCrossBankCopyIntoStore(MachineInstr &I,
112                                       MachineRegisterInfo &MRI);
113 
114   bool convertPtrAddToAdd(MachineInstr &I, MachineRegisterInfo &MRI);
115 
116   bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF,
117                           MachineRegisterInfo &MRI) const;
118   bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF,
119                            MachineRegisterInfo &MRI) const;
120 
121   ///@{
122   /// Helper functions for selectCompareBranch.
123   bool selectCompareBranchFedByFCmp(MachineInstr &I, MachineInstr &FCmp,
124                                     MachineIRBuilder &MIB) const;
125   bool selectCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp,
126                                     MachineIRBuilder &MIB) const;
127   bool tryOptCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp,
128                                     MachineIRBuilder &MIB) const;
129   bool tryOptAndIntoCompareBranch(MachineInstr &AndInst, bool Invert,
130                                   MachineBasicBlock *DstMBB,
131                                   MachineIRBuilder &MIB) const;
132   ///@}
133 
134   bool selectCompareBranch(MachineInstr &I, MachineFunction &MF,
135                            MachineRegisterInfo &MRI);
136 
137   bool selectVectorAshrLshr(MachineInstr &I, MachineRegisterInfo &MRI);
138   bool selectVectorSHL(MachineInstr &I, MachineRegisterInfo &MRI);
139 
140   // Helper to generate an equivalent of scalar_to_vector into a new register,
141   // returned via 'Dst'.
142   MachineInstr *emitScalarToVector(unsigned EltSize,
143                                    const TargetRegisterClass *DstRC,
144                                    Register Scalar,
145                                    MachineIRBuilder &MIRBuilder) const;
146 
147   /// Emit a lane insert into \p DstReg, or a new vector register if
148   /// std::nullopt is provided.
149   ///
150   /// The lane inserted into is defined by \p LaneIdx. The vector source
151   /// register is given by \p SrcReg. The register containing the element is
152   /// given by \p EltReg.
153   MachineInstr *emitLaneInsert(std::optional<Register> DstReg, Register SrcReg,
154                                Register EltReg, unsigned LaneIdx,
155                                const RegisterBank &RB,
156                                MachineIRBuilder &MIRBuilder) const;
157 
158   /// Emit a sequence of instructions representing a constant \p CV for a
159   /// vector register \p Dst. (E.g. a MOV, or a load from a constant pool.)
160   ///
161   /// \returns the last instruction in the sequence on success, and nullptr
162   /// otherwise.
163   MachineInstr *emitConstantVector(Register Dst, Constant *CV,
164                                    MachineIRBuilder &MIRBuilder,
165                                    MachineRegisterInfo &MRI);
166 
167   bool selectInsertElt(MachineInstr &I, MachineRegisterInfo &MRI);
168   bool tryOptConstantBuildVec(MachineInstr &MI, LLT DstTy,
169                               MachineRegisterInfo &MRI);
170   /// \returns true if a G_BUILD_VECTOR instruction \p MI can be selected as a
171   /// SUBREG_TO_REG.
172   bool tryOptBuildVecToSubregToReg(MachineInstr &MI, MachineRegisterInfo &MRI);
173   bool selectBuildVector(MachineInstr &I, MachineRegisterInfo &MRI);
174   bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI);
175   bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI);
176 
177   bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI);
178   bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI);
179   bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI);
180   bool selectSplitVectorUnmerge(MachineInstr &I, MachineRegisterInfo &MRI);
181 
182   /// Helper function to select vector load intrinsics like
183   /// @llvm.aarch64.neon.ld2.*, @llvm.aarch64.neon.ld4.*, etc.
184   /// \p Opc is the opcode that the selected instruction should use.
185   /// \p NumVecs is the number of vector destinations for the instruction.
186   /// \p I is the original G_INTRINSIC_W_SIDE_EFFECTS instruction.
187   bool selectVectorLoadIntrinsic(unsigned Opc, unsigned NumVecs,
188                                  MachineInstr &I);
189   bool selectIntrinsicWithSideEffects(MachineInstr &I,
190                                       MachineRegisterInfo &MRI);
191   bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI);
192   bool selectVectorICmp(MachineInstr &I, MachineRegisterInfo &MRI);
193   bool selectIntrinsicTrunc(MachineInstr &I, MachineRegisterInfo &MRI) const;
194   bool selectIntrinsicRound(MachineInstr &I, MachineRegisterInfo &MRI) const;
195   bool selectJumpTable(MachineInstr &I, MachineRegisterInfo &MRI);
196   bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI);
197   bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI);
198   bool selectReduction(MachineInstr &I, MachineRegisterInfo &MRI);
199   bool selectMOPS(MachineInstr &I, MachineRegisterInfo &MRI);
200   bool selectUSMovFromExtend(MachineInstr &I, MachineRegisterInfo &MRI);
201 
202   unsigned emitConstantPoolEntry(const Constant *CPVal,
203                                  MachineFunction &MF) const;
204   MachineInstr *emitLoadFromConstantPool(const Constant *CPVal,
205                                          MachineIRBuilder &MIRBuilder) const;
206 
207   // Emit a vector concat operation.
208   MachineInstr *emitVectorConcat(std::optional<Register> Dst, Register Op1,
209                                  Register Op2,
210                                  MachineIRBuilder &MIRBuilder) const;
211 
212   // Emit an integer compare between LHS and RHS, which checks for Predicate.
213   MachineInstr *emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
214                                    MachineOperand &Predicate,
215                                    MachineIRBuilder &MIRBuilder) const;
216 
217   /// Emit a floating point comparison between \p LHS and \p RHS.
218   /// \p Pred if given is the intended predicate to use.
219   MachineInstr *
220   emitFPCompare(Register LHS, Register RHS, MachineIRBuilder &MIRBuilder,
221                 std::optional<CmpInst::Predicate> = std::nullopt) const;
222 
223   MachineInstr *
224   emitInstr(unsigned Opcode, std::initializer_list<llvm::DstOp> DstOps,
225             std::initializer_list<llvm::SrcOp> SrcOps,
226             MachineIRBuilder &MIRBuilder,
227             const ComplexRendererFns &RenderFns = std::nullopt) const;
228   /// Helper function to emit an add or sub instruction.
229   ///
230   /// \p AddrModeAndSizeToOpcode must contain each of the opcode variants listed
231   /// below, in a specific order.
232   ///
233   /// Below is an example of the expected input to \p AddrModeAndSizeToOpcode.
234   ///
235   /// \code
236   ///   const std::array<std::array<unsigned, 2>, 5> Table {
237   ///    {{AArch64::ADDXri, AArch64::ADDWri},
238   ///     {AArch64::ADDXrs, AArch64::ADDWrs},
239   ///     {AArch64::ADDXrr, AArch64::ADDWrr},
240   ///     {AArch64::SUBXri, AArch64::SUBWri},
241   ///     {AArch64::ADDXrx, AArch64::ADDWrx}}};
242   /// \endcode
243   ///
244   /// Each row in the table corresponds to a different addressing mode. Each
245   /// column corresponds to a different register size.
246   ///
247   /// \attention Rows must be structured as follows:
248   ///   - Row 0: The ri opcode variants
249   ///   - Row 1: The rs opcode variants
250   ///   - Row 2: The rr opcode variants
251   ///   - Row 3: The ri opcode variants for negative immediates
252   ///   - Row 4: The rx opcode variants
253   ///
254   /// \attention Columns must be structured as follows:
255   ///   - Column 0: The 64-bit opcode variants
256   ///   - Column 1: The 32-bit opcode variants
257   ///
258   /// \p Dst is the destination register of the binop to emit.
259   /// \p LHS is the left-hand operand of the binop to emit.
260   /// \p RHS is the right-hand operand of the binop to emit.
261   MachineInstr *emitAddSub(
262       const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode,
263       Register Dst, MachineOperand &LHS, MachineOperand &RHS,
264       MachineIRBuilder &MIRBuilder) const;
265   MachineInstr *emitADD(Register DefReg, MachineOperand &LHS,
266                         MachineOperand &RHS,
267                         MachineIRBuilder &MIRBuilder) const;
268   MachineInstr *emitADDS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
269                          MachineIRBuilder &MIRBuilder) const;
270   MachineInstr *emitSUBS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
271                          MachineIRBuilder &MIRBuilder) const;
272   MachineInstr *emitADCS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
273                          MachineIRBuilder &MIRBuilder) const;
274   MachineInstr *emitSBCS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
275                          MachineIRBuilder &MIRBuilder) const;
276   MachineInstr *emitCMN(MachineOperand &LHS, MachineOperand &RHS,
277                         MachineIRBuilder &MIRBuilder) const;
278   MachineInstr *emitTST(MachineOperand &LHS, MachineOperand &RHS,
279                         MachineIRBuilder &MIRBuilder) const;
280   MachineInstr *emitSelect(Register Dst, Register LHS, Register RHS,
281                            AArch64CC::CondCode CC,
282                            MachineIRBuilder &MIRBuilder) const;
283   MachineInstr *emitExtractVectorElt(std::optional<Register> DstReg,
284                                      const RegisterBank &DstRB, LLT ScalarTy,
285                                      Register VecReg, unsigned LaneIdx,
286                                      MachineIRBuilder &MIRBuilder) const;
287   MachineInstr *emitCSINC(Register Dst, Register Src1, Register Src2,
288                           AArch64CC::CondCode Pred,
289                           MachineIRBuilder &MIRBuilder) const;
290   /// Emit a CSet for a FP compare.
291   ///
292   /// \p Dst is expected to be a 32-bit scalar register.
293   MachineInstr *emitCSetForFCmp(Register Dst, CmpInst::Predicate Pred,
294                                 MachineIRBuilder &MIRBuilder) const;
295 
296   /// Emit an instruction that sets NZCV to the carry-in expected by \p I.
297   /// Might elide the instruction if the previous instruction already sets NZCV
298   /// correctly.
299   MachineInstr *emitCarryIn(MachineInstr &I, Register CarryReg);
300 
301   /// Emit the overflow op for \p Opcode.
302   ///
303   /// \p Opcode is expected to be an overflow op's opcode, e.g. G_UADDO,
304   /// G_USUBO, etc.
305   std::pair<MachineInstr *, AArch64CC::CondCode>
306   emitOverflowOp(unsigned Opcode, Register Dst, MachineOperand &LHS,
307                  MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const;
308 
309   bool selectOverflowOp(MachineInstr &I, MachineRegisterInfo &MRI);
310 
311   /// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
312   /// In some cases this is even possible with OR operations in the expression.
313   MachineInstr *emitConjunction(Register Val, AArch64CC::CondCode &OutCC,
314                                 MachineIRBuilder &MIB) const;
315   MachineInstr *emitConditionalComparison(Register LHS, Register RHS,
316                                           CmpInst::Predicate CC,
317                                           AArch64CC::CondCode Predicate,
318                                           AArch64CC::CondCode OutCC,
319                                           MachineIRBuilder &MIB) const;
320   MachineInstr *emitConjunctionRec(Register Val, AArch64CC::CondCode &OutCC,
321                                    bool Negate, Register CCOp,
322                                    AArch64CC::CondCode Predicate,
323                                    MachineIRBuilder &MIB) const;
324 
325   /// Emit a TB(N)Z instruction which tests \p Bit in \p TestReg.
326   /// \p IsNegative is true if the test should be "not zero".
327   /// This will also optimize the test bit instruction when possible.
328   MachineInstr *emitTestBit(Register TestReg, uint64_t Bit, bool IsNegative,
329                             MachineBasicBlock *DstMBB,
330                             MachineIRBuilder &MIB) const;
331 
332   /// Emit a CB(N)Z instruction which branches to \p DestMBB.
333   MachineInstr *emitCBZ(Register CompareReg, bool IsNegative,
334                         MachineBasicBlock *DestMBB,
335                         MachineIRBuilder &MIB) const;
336 
337   // Equivalent to the i32shift_a and friends from AArch64InstrInfo.td.
338   // We use these manually instead of using the importer since it doesn't
339   // support SDNodeXForm.
340   ComplexRendererFns selectShiftA_32(const MachineOperand &Root) const;
341   ComplexRendererFns selectShiftB_32(const MachineOperand &Root) const;
342   ComplexRendererFns selectShiftA_64(const MachineOperand &Root) const;
343   ComplexRendererFns selectShiftB_64(const MachineOperand &Root) const;
344 
345   ComplexRendererFns select12BitValueWithLeftShift(uint64_t Immed) const;
346   ComplexRendererFns selectArithImmed(MachineOperand &Root) const;
347   ComplexRendererFns selectNegArithImmed(MachineOperand &Root) const;
348 
349   ComplexRendererFns selectAddrModeUnscaled(MachineOperand &Root,
350                                             unsigned Size) const;
351 
  // Fixed-width wrappers around selectAddrModeUnscaled(); the number in the
  // name is a width in bits, and the forwarded Size is that width divided by 8.
352   ComplexRendererFns selectAddrModeUnscaled8(MachineOperand &Root) const {
353     return selectAddrModeUnscaled(Root, 1);
354   }
355   ComplexRendererFns selectAddrModeUnscaled16(MachineOperand &Root) const {
356     return selectAddrModeUnscaled(Root, 2);
357   }
358   ComplexRendererFns selectAddrModeUnscaled32(MachineOperand &Root) const {
359     return selectAddrModeUnscaled(Root, 4);
360   }
361   ComplexRendererFns selectAddrModeUnscaled64(MachineOperand &Root) const {
362     return selectAddrModeUnscaled(Root, 8);
363   }
364   ComplexRendererFns selectAddrModeUnscaled128(MachineOperand &Root) const {
365     return selectAddrModeUnscaled(Root, 16);
366   }
367 
368   /// Helper to try to fold in a GISEL_ADD_LOW into an immediate, to be used
369   /// from complex pattern matchers like selectAddrModeIndexed().
370   ComplexRendererFns tryFoldAddLowIntoImm(MachineInstr &RootDef, unsigned Size,
371                                           MachineRegisterInfo &MRI) const;
372 
373   ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root,
374                                            unsigned Size) const;
  // \p Width is in bits; the two-argument overload receives Width / 8.
375   template <int Width>
376   ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root) const {
377     return selectAddrModeIndexed(Root, Width / 8);
378   }
379 
380   bool isWorthFoldingIntoExtendedReg(MachineInstr &MI,
381                                      const MachineRegisterInfo &MRI) const;
382   ComplexRendererFns
383   selectAddrModeShiftedExtendXReg(MachineOperand &Root,
384                                   unsigned SizeInBytes) const;
385 
386   /// Returns a \p ComplexRendererFns which contains a base, offset, and whether
387   /// or not a shift + extend should be folded into an addressing mode. Returns
388   /// std::nullopt when this is not profitable or possible.
389   ComplexRendererFns
390   selectExtendedSHL(MachineOperand &Root, MachineOperand &Base,
391                     MachineOperand &Offset, unsigned SizeInBytes,
392                     bool WantsExt) const;
393   ComplexRendererFns selectAddrModeRegisterOffset(MachineOperand &Root) const;
394   ComplexRendererFns selectAddrModeXRO(MachineOperand &Root,
395                                        unsigned SizeInBytes) const;
  // \p Width is in bits; the two-argument overload receives Width / 8 bytes.
396   template <int Width>
397   ComplexRendererFns selectAddrModeXRO(MachineOperand &Root) const {
398     return selectAddrModeXRO(Root, Width / 8);
399   }
400 
401   ComplexRendererFns selectAddrModeWRO(MachineOperand &Root,
402                                        unsigned SizeInBytes) const;
  // \p Width is in bits; the two-argument overload receives Width / 8 bytes.
403   template <int Width>
404   ComplexRendererFns selectAddrModeWRO(MachineOperand &Root) const {
405     return selectAddrModeWRO(Root, Width / 8);
406   }
407 
408   ComplexRendererFns selectShiftedRegister(MachineOperand &Root,
409                                            bool AllowROR = false) const;
410 
  // Arithmetic flavour: selectShiftedRegister() with AllowROR left at false.
411   ComplexRendererFns selectArithShiftedRegister(MachineOperand &Root) const {
412     return selectShiftedRegister(Root);
413   }
414 
  // Logical flavour: selectShiftedRegister() with AllowROR set to true.
415   ComplexRendererFns selectLogicalShiftedRegister(MachineOperand &Root) const {
416     return selectShiftedRegister(Root, true);
417   }
418 
419   /// Given an extend instruction, determine the correct shift-extend type for
420   /// that instruction.
421   ///
422   /// If the instruction is going to be used in a load or store, pass
423   /// \p IsLoadStore = true.
424   AArch64_AM::ShiftExtendType
425   getExtendTypeForInst(MachineInstr &MI, MachineRegisterInfo &MRI,
426                        bool IsLoadStore = false) const;
427 
428   /// Move \p Reg to \p RC if \p Reg is not already on \p RC.
429   ///
430   /// \returns Either \p Reg if no change was necessary, or the new register
431   /// created by moving \p Reg.
432   ///
433   /// Note: This uses emitCopy right now.
434   Register moveScalarRegClass(Register Reg, const TargetRegisterClass &RC,
435                               MachineIRBuilder &MIB) const;
436 
437   ComplexRendererFns selectArithExtendedRegister(MachineOperand &Root) const;
438 
439   ComplexRendererFns selectExtractHigh(MachineOperand &Root) const;
440 
441   void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
442                       int OpIdx = -1) const;
443   void renderLogicalImm32(MachineInstrBuilder &MIB, const MachineInstr &I,
444                           int OpIdx = -1) const;
445   void renderLogicalImm64(MachineInstrBuilder &MIB, const MachineInstr &I,
446                           int OpIdx = -1) const;
447   void renderFPImm16(MachineInstrBuilder &MIB, const MachineInstr &MI,
448                      int OpIdx = -1) const;
449   void renderFPImm32(MachineInstrBuilder &MIB, const MachineInstr &MI,
450                      int OpIdx = -1) const;
451   void renderFPImm64(MachineInstrBuilder &MIB, const MachineInstr &MI,
452                      int OpIdx = -1) const;
453   void renderFPImm32SIMDModImmType4(MachineInstrBuilder &MIB,
454                                     const MachineInstr &MI,
455                                     int OpIdx = -1) const;
456 
457   // Materialize a GlobalValue or BlockAddress using a movz+movk sequence.
458   void materializeLargeCMVal(MachineInstr &I, const Value *V, unsigned OpFlags);
459 
460   // Optimization methods.
461   bool tryOptSelect(GSelect &Sel);
462   bool tryOptSelectConjunction(GSelect &Sel, MachineInstr &CondMI);
463   MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
464                                       MachineOperand &Predicate,
465                                       MachineIRBuilder &MIRBuilder) const;
466 
467   /// Return true if \p MI is a load or store of \p NumBytes bytes.
468   bool isLoadStoreOfNumBytes(const MachineInstr &MI, unsigned NumBytes) const;
469 
470   /// Returns true if \p MI is guaranteed to have the high-half of a 64-bit
471   /// register zeroed out. In other words, the result of MI has been explicitly
472   /// zero extended.
473   bool isDef32(const MachineInstr &MI) const;
474 
  // References captured at construction time. TII and TRI are obtained from
  // STI in the constructor's initializer list.
475   const AArch64TargetMachine &TM;
476   const AArch64Subtarget &STI;
477   const AArch64InstrInfo &TII;
478   const AArch64RegisterInfo &TRI;
479   const AArch64RegisterBankInfo &RBI;
480 
  // Recomputed by setupMF(): true unless the current function carries the
  // SpeculativeLoadHardening attribute.
481   bool ProduceNonFlagSettingCondBr = false;
482 
483   // Some cached values used during selection.
484   // We use LR as a live-in register, and we keep track of it here as it can be
485   // clobbered by calls.
486   Register MFReturnAddr;
487 
  // Builder shared by the selection helpers; setupMF() binds it to the current
  // MachineFunction.
488   MachineIRBuilder MIB;
489 
490 #define GET_GLOBALISEL_PREDICATES_DECL
491 #include "AArch64GenGlobalISel.inc"
492 #undef GET_GLOBALISEL_PREDICATES_DECL
493 
494 // We declare the temporaries used by selectImpl() in the class to minimize the
495 // cost of constructing placeholder values.
496 #define GET_GLOBALISEL_TEMPORARIES_DECL
497 #include "AArch64GenGlobalISel.inc"
498 #undef GET_GLOBALISEL_TEMPORARIES_DECL
499 };
500 
501 } // end anonymous namespace
502 
503 #define GET_GLOBALISEL_IMPL
504 #include "AArch64GenGlobalISel.inc"
505 #undef GET_GLOBALISEL_IMPL
506 
/// Build a selector bound to \p TM, \p STI and \p RBI.
///
/// The instruction info and register info references are taken from \p STI.
/// The rest of the initializer list is supplied by the tblgen-erated
/// PREDICATES_INIT and TEMPORARIES_INIT fragments of AArch64GenGlobalISel.inc,
/// which is why the list ends in a trailing comma before the includes.
507 AArch64InstructionSelector::AArch64InstructionSelector(
508     const AArch64TargetMachine &TM, const AArch64Subtarget &STI,
509     const AArch64RegisterBankInfo &RBI)
510     : TM(TM), STI(STI), TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()),
511       RBI(RBI),
512 #define GET_GLOBALISEL_PREDICATES_INIT
513 #include "AArch64GenGlobalISel.inc"
514 #undef GET_GLOBALISEL_PREDICATES_INIT
515 #define GET_GLOBALISEL_TEMPORARIES_INIT
516 #include "AArch64GenGlobalISel.inc"
517 #undef GET_GLOBALISEL_TEMPORARIES_INIT
518 {
  // Intentionally empty: all state is set up in the initializer list.
519 }
520 
521 // FIXME: This should be target-independent, inferred from the types declared
522 // for each class in the bank.
523 //
524 /// Given a register bank, and a type, return the smallest register class that
525 /// can represent that combination.
526 static const TargetRegisterClass *
527 getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB,
528                          bool GetAllRegSet = false) {
529   if (RB.getID() == AArch64::GPRRegBankID) {
530     if (Ty.getSizeInBits() <= 32)
531       return GetAllRegSet ? &AArch64::GPR32allRegClass
532                           : &AArch64::GPR32RegClass;
533     if (Ty.getSizeInBits() == 64)
534       return GetAllRegSet ? &AArch64::GPR64allRegClass
535                           : &AArch64::GPR64RegClass;
536     if (Ty.getSizeInBits() == 128)
537       return &AArch64::XSeqPairsClassRegClass;
538     return nullptr;
539   }
540 
541   if (RB.getID() == AArch64::FPRRegBankID) {
542     switch (Ty.getSizeInBits()) {
543     case 8:
544       return &AArch64::FPR8RegClass;
545     case 16:
546       return &AArch64::FPR16RegClass;
547     case 32:
548       return &AArch64::FPR32RegClass;
549     case 64:
550       return &AArch64::FPR64RegClass;
551     case 128:
552       return &AArch64::FPR128RegClass;
553     }
554     return nullptr;
555   }
556 
557   return nullptr;
558 }
559 
560 /// Given a register bank, and size in bits, return the smallest register class
561 /// that can represent that combination.
562 static const TargetRegisterClass *
563 getMinClassForRegBank(const RegisterBank &RB, unsigned SizeInBits,
564                       bool GetAllRegSet = false) {
565   unsigned RegBankID = RB.getID();
566 
567   if (RegBankID == AArch64::GPRRegBankID) {
568     if (SizeInBits <= 32)
569       return GetAllRegSet ? &AArch64::GPR32allRegClass
570                           : &AArch64::GPR32RegClass;
571     if (SizeInBits == 64)
572       return GetAllRegSet ? &AArch64::GPR64allRegClass
573                           : &AArch64::GPR64RegClass;
574     if (SizeInBits == 128)
575       return &AArch64::XSeqPairsClassRegClass;
576   }
577 
578   if (RegBankID == AArch64::FPRRegBankID) {
579     switch (SizeInBits) {
580     default:
581       return nullptr;
582     case 8:
583       return &AArch64::FPR8RegClass;
584     case 16:
585       return &AArch64::FPR16RegClass;
586     case 32:
587       return &AArch64::FPR32RegClass;
588     case 64:
589       return &AArch64::FPR64RegClass;
590     case 128:
591       return &AArch64::FPR128RegClass;
592     }
593   }
594 
595   return nullptr;
596 }
597 
598 /// Returns the correct subregister to use for a given register class.
599 static bool getSubRegForClass(const TargetRegisterClass *RC,
600                               const TargetRegisterInfo &TRI, unsigned &SubReg) {
601   switch (TRI.getRegSizeInBits(*RC)) {
602   case 8:
603     SubReg = AArch64::bsub;
604     break;
605   case 16:
606     SubReg = AArch64::hsub;
607     break;
608   case 32:
609     if (RC != &AArch64::FPR32RegClass)
610       SubReg = AArch64::sub_32;
611     else
612       SubReg = AArch64::ssub;
613     break;
614   case 64:
615     SubReg = AArch64::dsub;
616     break;
617   default:
618     LLVM_DEBUG(
619         dbgs() << "Couldn't find appropriate subregister for register class.");
620     return false;
621   }
622 
623   return true;
624 }
625 
626 /// Returns the minimum size the given register bank can hold.
627 static unsigned getMinSizeForRegBank(const RegisterBank &RB) {
628   switch (RB.getID()) {
629   case AArch64::GPRRegBankID:
630     return 32;
631   case AArch64::FPRRegBankID:
632     return 8;
633   default:
634     llvm_unreachable("Tried to get minimum size for unknown register bank.");
635   }
636 }
637 
638 /// Create a REG_SEQUENCE instruction using the registers in \p Regs.
639 /// Helper function for functions like createDTuple and createQTuple.
640 ///
641 /// \p RegClassIDs - The list of register class IDs available for some tuple of
642 /// a scalar class. E.g. QQRegClassID, QQQRegClassID, QQQQRegClassID. This is
643 /// expected to contain between 2 and 4 tuple classes.
644 ///
645 /// \p SubRegs - The list of subregister classes associated with each register
646 /// class ID in \p RegClassIDs. E.g., QQRegClassID should use the qsub0
647 /// subregister class. The index of each subregister class is expected to
648 /// correspond with the index of each register class.
649 ///
650 /// \returns Either the destination register of REG_SEQUENCE instruction that
651 /// was created, or the 0th element of \p Regs if \p Regs contains a single
652 /// element.
653 static Register createTuple(ArrayRef<Register> Regs,
654                             const unsigned RegClassIDs[],
655                             const unsigned SubRegs[], MachineIRBuilder &MIB) {
656   unsigned NumRegs = Regs.size();
657   if (NumRegs == 1)
658     return Regs[0];
659   assert(NumRegs >= 2 && NumRegs <= 4 &&
660          "Only support between two and 4 registers in a tuple!");
661   const TargetRegisterInfo *TRI = MIB.getMF().getSubtarget().getRegisterInfo();
662   auto *DesiredClass = TRI->getRegClass(RegClassIDs[NumRegs - 2]);
663   auto RegSequence =
664       MIB.buildInstr(TargetOpcode::REG_SEQUENCE, {DesiredClass}, {});
665   for (unsigned I = 0, E = Regs.size(); I < E; ++I) {
666     RegSequence.addUse(Regs[I]);
667     RegSequence.addImm(SubRegs[I]);
668   }
669   return RegSequence.getReg(0);
670 }
671 
672 /// Create a tuple of D-registers using the registers in \p Regs.
673 static Register createDTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) {
674   static const unsigned RegClassIDs[] = {
675       AArch64::DDRegClassID, AArch64::DDDRegClassID, AArch64::DDDDRegClassID};
676   static const unsigned SubRegs[] = {AArch64::dsub0, AArch64::dsub1,
677                                      AArch64::dsub2, AArch64::dsub3};
678   return createTuple(Regs, RegClassIDs, SubRegs, MIB);
679 }
680 
681 /// Create a tuple of Q-registers using the registers in \p Regs.
682 static Register createQTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) {
683   static const unsigned RegClassIDs[] = {
684       AArch64::QQRegClassID, AArch64::QQQRegClassID, AArch64::QQQQRegClassID};
685   static const unsigned SubRegs[] = {AArch64::qsub0, AArch64::qsub1,
686                                      AArch64::qsub2, AArch64::qsub3};
687   return createTuple(Regs, RegClassIDs, SubRegs, MIB);
688 }
689 
690 static std::optional<uint64_t> getImmedFromMO(const MachineOperand &Root) {
691   auto &MI = *Root.getParent();
692   auto &MBB = *MI.getParent();
693   auto &MF = *MBB.getParent();
694   auto &MRI = MF.getRegInfo();
695   uint64_t Immed;
696   if (Root.isImm())
697     Immed = Root.getImm();
698   else if (Root.isCImm())
699     Immed = Root.getCImm()->getZExtValue();
700   else if (Root.isReg()) {
701     auto ValAndVReg =
702         getIConstantVRegValWithLookThrough(Root.getReg(), MRI, true);
703     if (!ValAndVReg)
704       return std::nullopt;
705     Immed = ValAndVReg->Value.getSExtValue();
706   } else
707     return std::nullopt;
708   return Immed;
709 }
710 
711 /// Check whether \p I is a currently unsupported binary operation:
712 /// - it has an unsized type
713 /// - an operand is not a vreg
714 /// - all operands are not in the same bank
715 /// These are checks that should someday live in the verifier, but right now,
716 /// these are mostly limitations of the aarch64 selector.
717 static bool unsupportedBinOp(const MachineInstr &I,
718                              const AArch64RegisterBankInfo &RBI,
719                              const MachineRegisterInfo &MRI,
720                              const AArch64RegisterInfo &TRI) {
721   LLT Ty = MRI.getType(I.getOperand(0).getReg());
722   if (!Ty.isValid()) {
723     LLVM_DEBUG(dbgs() << "Generic binop register should be typed\n");
724     return true;
725   }
726 
727   const RegisterBank *PrevOpBank = nullptr;
728   for (auto &MO : I.operands()) {
729     // FIXME: Support non-register operands.
730     if (!MO.isReg()) {
731       LLVM_DEBUG(dbgs() << "Generic inst non-reg operands are unsupported\n");
732       return true;
733     }
734 
735     // FIXME: Can generic operations have physical registers operands? If
736     // so, this will need to be taught about that, and we'll need to get the
737     // bank out of the minimal class for the register.
738     // Either way, this needs to be documented (and possibly verified).
739     if (!MO.getReg().isVirtual()) {
740       LLVM_DEBUG(dbgs() << "Generic inst has physical register operand\n");
741       return true;
742     }
743 
744     const RegisterBank *OpBank = RBI.getRegBank(MO.getReg(), MRI, TRI);
745     if (!OpBank) {
746       LLVM_DEBUG(dbgs() << "Generic register has no bank or class\n");
747       return true;
748     }
749 
750     if (PrevOpBank && OpBank != PrevOpBank) {
751       LLVM_DEBUG(dbgs() << "Generic inst operands have different banks\n");
752       return true;
753     }
754     PrevOpBank = OpBank;
755   }
756   return false;
757 }
758 
759 /// Select the AArch64 opcode for the basic binary operation \p GenericOpc
760 /// (such as G_OR or G_SDIV), appropriate for the register bank \p RegBankID
761 /// and of size \p OpSize.
762 /// \returns \p GenericOpc if the combination is unsupported.
763 static unsigned selectBinaryOp(unsigned GenericOpc, unsigned RegBankID,
764                                unsigned OpSize) {
765   switch (RegBankID) {
766   case AArch64::GPRRegBankID:
767     if (OpSize == 32) {
768       switch (GenericOpc) {
769       case TargetOpcode::G_SHL:
770         return AArch64::LSLVWr;
771       case TargetOpcode::G_LSHR:
772         return AArch64::LSRVWr;
773       case TargetOpcode::G_ASHR:
774         return AArch64::ASRVWr;
775       default:
776         return GenericOpc;
777       }
778     } else if (OpSize == 64) {
779       switch (GenericOpc) {
780       case TargetOpcode::G_PTR_ADD:
781         return AArch64::ADDXrr;
782       case TargetOpcode::G_SHL:
783         return AArch64::LSLVXr;
784       case TargetOpcode::G_LSHR:
785         return AArch64::LSRVXr;
786       case TargetOpcode::G_ASHR:
787         return AArch64::ASRVXr;
788       default:
789         return GenericOpc;
790       }
791     }
792     break;
793   case AArch64::FPRRegBankID:
794     switch (OpSize) {
795     case 32:
796       switch (GenericOpc) {
797       case TargetOpcode::G_FADD:
798         return AArch64::FADDSrr;
799       case TargetOpcode::G_FSUB:
800         return AArch64::FSUBSrr;
801       case TargetOpcode::G_FMUL:
802         return AArch64::FMULSrr;
803       case TargetOpcode::G_FDIV:
804         return AArch64::FDIVSrr;
805       default:
806         return GenericOpc;
807       }
808     case 64:
809       switch (GenericOpc) {
810       case TargetOpcode::G_FADD:
811         return AArch64::FADDDrr;
812       case TargetOpcode::G_FSUB:
813         return AArch64::FSUBDrr;
814       case TargetOpcode::G_FMUL:
815         return AArch64::FMULDrr;
816       case TargetOpcode::G_FDIV:
817         return AArch64::FDIVDrr;
818       case TargetOpcode::G_OR:
819         return AArch64::ORRv8i8;
820       default:
821         return GenericOpc;
822       }
823     }
824     break;
825   }
826   return GenericOpc;
827 }
828 
829 /// Select the AArch64 opcode for the G_LOAD or G_STORE operation \p GenericOpc,
830 /// appropriate for the (value) register bank \p RegBankID and of memory access
831 /// size \p OpSize.  This returns the variant with the base+unsigned-immediate
832 /// addressing mode (e.g., LDRXui).
833 /// \returns \p GenericOpc if the combination is unsupported.
834 static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID,
835                                     unsigned OpSize) {
836   const bool isStore = GenericOpc == TargetOpcode::G_STORE;
837   switch (RegBankID) {
838   case AArch64::GPRRegBankID:
839     switch (OpSize) {
840     case 8:
841       return isStore ? AArch64::STRBBui : AArch64::LDRBBui;
842     case 16:
843       return isStore ? AArch64::STRHHui : AArch64::LDRHHui;
844     case 32:
845       return isStore ? AArch64::STRWui : AArch64::LDRWui;
846     case 64:
847       return isStore ? AArch64::STRXui : AArch64::LDRXui;
848     }
849     break;
850   case AArch64::FPRRegBankID:
851     switch (OpSize) {
852     case 8:
853       return isStore ? AArch64::STRBui : AArch64::LDRBui;
854     case 16:
855       return isStore ? AArch64::STRHui : AArch64::LDRHui;
856     case 32:
857       return isStore ? AArch64::STRSui : AArch64::LDRSui;
858     case 64:
859       return isStore ? AArch64::STRDui : AArch64::LDRDui;
860     case 128:
861       return isStore ? AArch64::STRQui : AArch64::LDRQui;
862     }
863     break;
864   }
865   return GenericOpc;
866 }
867 
868 /// Helper function for selectCopy. Inserts a subregister copy from \p SrcReg
869 /// to \p *To.
870 ///
871 /// E.g "To = COPY SrcReg:SubReg"
872 static bool copySubReg(MachineInstr &I, MachineRegisterInfo &MRI,
873                        const RegisterBankInfo &RBI, Register SrcReg,
874                        const TargetRegisterClass *To, unsigned SubReg) {
875   assert(SrcReg.isValid() && "Expected a valid source register?");
876   assert(To && "Destination register class cannot be null");
877   assert(SubReg && "Expected a valid subregister");
878 
879   MachineIRBuilder MIB(I);
880   auto SubRegCopy =
881       MIB.buildInstr(TargetOpcode::COPY, {To}, {}).addReg(SrcReg, 0, SubReg);
882   MachineOperand &RegOp = I.getOperand(1);
883   RegOp.setReg(SubRegCopy.getReg(0));
884 
885   // It's possible that the destination register won't be constrained. Make
886   // sure that happens.
887   if (!I.getOperand(0).getReg().isPhysical())
888     RBI.constrainGenericRegister(I.getOperand(0).getReg(), *To, MRI);
889 
890   return true;
891 }
892 
893 /// Helper function to get the source and destination register classes for a
894 /// copy. Returns a std::pair containing the source register class for the
895 /// copy, and the destination register class for the copy. If a register class
896 /// cannot be determined, then it will be nullptr.
897 static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
898 getRegClassesForCopy(MachineInstr &I, const TargetInstrInfo &TII,
899                      MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
900                      const RegisterBankInfo &RBI) {
901   Register DstReg = I.getOperand(0).getReg();
902   Register SrcReg = I.getOperand(1).getReg();
903   const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
904   const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
905   unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI);
906   unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
907 
908   // Special casing for cross-bank copies of s1s. We can technically represent
909   // a 1-bit value with any size of register. The minimum size for a GPR is 32
910   // bits. So, we need to put the FPR on 32 bits as well.
911   //
912   // FIXME: I'm not sure if this case holds true outside of copies. If it does,
913   // then we can pull it into the helpers that get the appropriate class for a
914   // register bank. Or make a new helper that carries along some constraint
915   // information.
916   if (SrcRegBank != DstRegBank && (DstSize == 1 && SrcSize == 1))
917     SrcSize = DstSize = 32;
918 
919   return {getMinClassForRegBank(SrcRegBank, SrcSize, true),
920           getMinClassForRegBank(DstRegBank, DstSize, true)};
921 }
922 
923 // FIXME: We need some sort of API in RBI/TRI to allow generic code to
924 // constrain operands of simple instructions given a TargetRegisterClass
925 // and LLT
926 static bool selectDebugInstr(MachineInstr &I, MachineRegisterInfo &MRI,
927                              const RegisterBankInfo &RBI) {
928   for (MachineOperand &MO : I.operands()) {
929     if (!MO.isReg())
930       continue;
931     Register Reg = MO.getReg();
932     if (!Reg)
933       continue;
934     if (Reg.isPhysical())
935       continue;
936     LLT Ty = MRI.getType(Reg);
937     const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
938     const TargetRegisterClass *RC =
939         RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
940     if (!RC) {
941       const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
942       RC = getRegClassForTypeOnBank(Ty, RB);
943       if (!RC) {
944         LLVM_DEBUG(
945             dbgs() << "Warning: DBG_VALUE operand has unexpected size/bank\n");
946         break;
947       }
948     }
949     RBI.constrainGenericRegister(Reg, *RC, MRI);
950   }
951 
952   return true;
953 }
954 
955 static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
956                        MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
957                        const RegisterBankInfo &RBI) {
958   Register DstReg = I.getOperand(0).getReg();
959   Register SrcReg = I.getOperand(1).getReg();
960   const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
961   const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
962 
963   // Find the correct register classes for the source and destination registers.
964   const TargetRegisterClass *SrcRC;
965   const TargetRegisterClass *DstRC;
966   std::tie(SrcRC, DstRC) = getRegClassesForCopy(I, TII, MRI, TRI, RBI);
967 
968   if (!DstRC) {
969     LLVM_DEBUG(dbgs() << "Unexpected dest size "
970                       << RBI.getSizeInBits(DstReg, MRI, TRI) << '\n');
971     return false;
972   }
973 
974   // Is this a copy? If so, then we may need to insert a subregister copy.
975   if (I.isCopy()) {
976     // Yes. Check if there's anything to fix up.
977     if (!SrcRC) {
978       LLVM_DEBUG(dbgs() << "Couldn't determine source register class\n");
979       return false;
980     }
981 
982     unsigned SrcSize = TRI.getRegSizeInBits(*SrcRC);
983     unsigned DstSize = TRI.getRegSizeInBits(*DstRC);
984     unsigned SubReg;
985 
986     // If the source bank doesn't support a subregister copy small enough,
987     // then we first need to copy to the destination bank.
988     if (getMinSizeForRegBank(SrcRegBank) > DstSize) {
989       const TargetRegisterClass *DstTempRC =
990           getMinClassForRegBank(DstRegBank, SrcSize, /* GetAllRegSet */ true);
991       getSubRegForClass(DstRC, TRI, SubReg);
992 
993       MachineIRBuilder MIB(I);
994       auto Copy = MIB.buildCopy({DstTempRC}, {SrcReg});
995       copySubReg(I, MRI, RBI, Copy.getReg(0), DstRC, SubReg);
996     } else if (SrcSize > DstSize) {
997       // If the source register is bigger than the destination we need to
998       // perform a subregister copy.
999       const TargetRegisterClass *SubRegRC =
1000           getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true);
1001       getSubRegForClass(SubRegRC, TRI, SubReg);
1002       copySubReg(I, MRI, RBI, SrcReg, DstRC, SubReg);
1003     } else if (DstSize > SrcSize) {
1004       // If the destination register is bigger than the source we need to do
1005       // a promotion using SUBREG_TO_REG.
1006       const TargetRegisterClass *PromotionRC =
1007           getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true);
1008       getSubRegForClass(SrcRC, TRI, SubReg);
1009 
1010       Register PromoteReg = MRI.createVirtualRegister(PromotionRC);
1011       BuildMI(*I.getParent(), I, I.getDebugLoc(),
1012               TII.get(AArch64::SUBREG_TO_REG), PromoteReg)
1013           .addImm(0)
1014           .addUse(SrcReg)
1015           .addImm(SubReg);
1016       MachineOperand &RegOp = I.getOperand(1);
1017       RegOp.setReg(PromoteReg);
1018     }
1019 
1020     // If the destination is a physical register, then there's nothing to
1021     // change, so we're done.
1022     if (DstReg.isPhysical())
1023       return true;
1024   }
1025 
1026   // No need to constrain SrcReg. It will get constrained when we hit another
1027   // of its use or its defs. Copies do not have constraints.
1028   if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
1029     LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
1030                       << " operand\n");
1031     return false;
1032   }
1033 
1034   // If this a GPR ZEXT that we want to just reduce down into a copy.
1035   // The sizes will be mismatched with the source < 32b but that's ok.
1036   if (I.getOpcode() == TargetOpcode::G_ZEXT) {
1037     I.setDesc(TII.get(AArch64::COPY));
1038     assert(SrcRegBank.getID() == AArch64::GPRRegBankID);
1039     return selectCopy(I, TII, MRI, TRI, RBI);
1040   }
1041 
1042   I.setDesc(TII.get(AArch64::COPY));
1043   return true;
1044 }
1045 
1046 static unsigned selectFPConvOpc(unsigned GenericOpc, LLT DstTy, LLT SrcTy) {
1047   if (!DstTy.isScalar() || !SrcTy.isScalar())
1048     return GenericOpc;
1049 
1050   const unsigned DstSize = DstTy.getSizeInBits();
1051   const unsigned SrcSize = SrcTy.getSizeInBits();
1052 
1053   switch (DstSize) {
1054   case 32:
1055     switch (SrcSize) {
1056     case 32:
1057       switch (GenericOpc) {
1058       case TargetOpcode::G_SITOFP:
1059         return AArch64::SCVTFUWSri;
1060       case TargetOpcode::G_UITOFP:
1061         return AArch64::UCVTFUWSri;
1062       case TargetOpcode::G_FPTOSI:
1063         return AArch64::FCVTZSUWSr;
1064       case TargetOpcode::G_FPTOUI:
1065         return AArch64::FCVTZUUWSr;
1066       default:
1067         return GenericOpc;
1068       }
1069     case 64:
1070       switch (GenericOpc) {
1071       case TargetOpcode::G_SITOFP:
1072         return AArch64::SCVTFUXSri;
1073       case TargetOpcode::G_UITOFP:
1074         return AArch64::UCVTFUXSri;
1075       case TargetOpcode::G_FPTOSI:
1076         return AArch64::FCVTZSUWDr;
1077       case TargetOpcode::G_FPTOUI:
1078         return AArch64::FCVTZUUWDr;
1079       default:
1080         return GenericOpc;
1081       }
1082     default:
1083       return GenericOpc;
1084     }
1085   case 64:
1086     switch (SrcSize) {
1087     case 32:
1088       switch (GenericOpc) {
1089       case TargetOpcode::G_SITOFP:
1090         return AArch64::SCVTFUWDri;
1091       case TargetOpcode::G_UITOFP:
1092         return AArch64::UCVTFUWDri;
1093       case TargetOpcode::G_FPTOSI:
1094         return AArch64::FCVTZSUXSr;
1095       case TargetOpcode::G_FPTOUI:
1096         return AArch64::FCVTZUUXSr;
1097       default:
1098         return GenericOpc;
1099       }
1100     case 64:
1101       switch (GenericOpc) {
1102       case TargetOpcode::G_SITOFP:
1103         return AArch64::SCVTFUXDri;
1104       case TargetOpcode::G_UITOFP:
1105         return AArch64::UCVTFUXDri;
1106       case TargetOpcode::G_FPTOSI:
1107         return AArch64::FCVTZSUXDr;
1108       case TargetOpcode::G_FPTOUI:
1109         return AArch64::FCVTZUUXDr;
1110       default:
1111         return GenericOpc;
1112       }
1113     default:
1114       return GenericOpc;
1115     }
1116   default:
1117     return GenericOpc;
1118   };
1119   return GenericOpc;
1120 }
1121 
1122 MachineInstr *
1123 AArch64InstructionSelector::emitSelect(Register Dst, Register True,
1124                                        Register False, AArch64CC::CondCode CC,
1125                                        MachineIRBuilder &MIB) const {
1126   MachineRegisterInfo &MRI = *MIB.getMRI();
1127   assert(RBI.getRegBank(False, MRI, TRI)->getID() ==
1128              RBI.getRegBank(True, MRI, TRI)->getID() &&
1129          "Expected both select operands to have the same regbank?");
1130   LLT Ty = MRI.getType(True);
1131   if (Ty.isVector())
1132     return nullptr;
1133   const unsigned Size = Ty.getSizeInBits();
1134   assert((Size == 32 || Size == 64) &&
1135          "Expected 32 bit or 64 bit select only?");
1136   const bool Is32Bit = Size == 32;
1137   if (RBI.getRegBank(True, MRI, TRI)->getID() != AArch64::GPRRegBankID) {
1138     unsigned Opc = Is32Bit ? AArch64::FCSELSrrr : AArch64::FCSELDrrr;
1139     auto FCSel = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC);
1140     constrainSelectedInstRegOperands(*FCSel, TII, TRI, RBI);
1141     return &*FCSel;
1142   }
1143 
1144   // By default, we'll try and emit a CSEL.
1145   unsigned Opc = Is32Bit ? AArch64::CSELWr : AArch64::CSELXr;
1146   bool Optimized = false;
1147   auto TryFoldBinOpIntoSelect = [&Opc, Is32Bit, &CC, &MRI,
1148                                  &Optimized](Register &Reg, Register &OtherReg,
1149                                              bool Invert) {
1150     if (Optimized)
1151       return false;
1152 
1153     // Attempt to fold:
1154     //
1155     // %sub = G_SUB 0, %x
1156     // %select = G_SELECT cc, %reg, %sub
1157     //
1158     // Into:
1159     // %select = CSNEG %reg, %x, cc
1160     Register MatchReg;
1161     if (mi_match(Reg, MRI, m_Neg(m_Reg(MatchReg)))) {
1162       Opc = Is32Bit ? AArch64::CSNEGWr : AArch64::CSNEGXr;
1163       Reg = MatchReg;
1164       if (Invert) {
1165         CC = AArch64CC::getInvertedCondCode(CC);
1166         std::swap(Reg, OtherReg);
1167       }
1168       return true;
1169     }
1170 
1171     // Attempt to fold:
1172     //
1173     // %xor = G_XOR %x, -1
1174     // %select = G_SELECT cc, %reg, %xor
1175     //
1176     // Into:
1177     // %select = CSINV %reg, %x, cc
1178     if (mi_match(Reg, MRI, m_Not(m_Reg(MatchReg)))) {
1179       Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1180       Reg = MatchReg;
1181       if (Invert) {
1182         CC = AArch64CC::getInvertedCondCode(CC);
1183         std::swap(Reg, OtherReg);
1184       }
1185       return true;
1186     }
1187 
1188     // Attempt to fold:
1189     //
1190     // %add = G_ADD %x, 1
1191     // %select = G_SELECT cc, %reg, %add
1192     //
1193     // Into:
1194     // %select = CSINC %reg, %x, cc
1195     if (mi_match(Reg, MRI,
1196                  m_any_of(m_GAdd(m_Reg(MatchReg), m_SpecificICst(1)),
1197                           m_GPtrAdd(m_Reg(MatchReg), m_SpecificICst(1))))) {
1198       Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1199       Reg = MatchReg;
1200       if (Invert) {
1201         CC = AArch64CC::getInvertedCondCode(CC);
1202         std::swap(Reg, OtherReg);
1203       }
1204       return true;
1205     }
1206 
1207     return false;
1208   };
1209 
1210   // Helper lambda which tries to use CSINC/CSINV for the instruction when its
1211   // true/false values are constants.
1212   // FIXME: All of these patterns already exist in tablegen. We should be
1213   // able to import these.
1214   auto TryOptSelectCst = [&Opc, &True, &False, &CC, Is32Bit, &MRI,
1215                           &Optimized]() {
1216     if (Optimized)
1217       return false;
1218     auto TrueCst = getIConstantVRegValWithLookThrough(True, MRI);
1219     auto FalseCst = getIConstantVRegValWithLookThrough(False, MRI);
1220     if (!TrueCst && !FalseCst)
1221       return false;
1222 
1223     Register ZReg = Is32Bit ? AArch64::WZR : AArch64::XZR;
1224     if (TrueCst && FalseCst) {
1225       int64_t T = TrueCst->Value.getSExtValue();
1226       int64_t F = FalseCst->Value.getSExtValue();
1227 
1228       if (T == 0 && F == 1) {
1229         // G_SELECT cc, 0, 1 -> CSINC zreg, zreg, cc
1230         Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1231         True = ZReg;
1232         False = ZReg;
1233         return true;
1234       }
1235 
1236       if (T == 0 && F == -1) {
1237         // G_SELECT cc 0, -1 -> CSINV zreg, zreg cc
1238         Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1239         True = ZReg;
1240         False = ZReg;
1241         return true;
1242       }
1243     }
1244 
1245     if (TrueCst) {
1246       int64_t T = TrueCst->Value.getSExtValue();
1247       if (T == 1) {
1248         // G_SELECT cc, 1, f -> CSINC f, zreg, inv_cc
1249         Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1250         True = False;
1251         False = ZReg;
1252         CC = AArch64CC::getInvertedCondCode(CC);
1253         return true;
1254       }
1255 
1256       if (T == -1) {
1257         // G_SELECT cc, -1, f -> CSINV f, zreg, inv_cc
1258         Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1259         True = False;
1260         False = ZReg;
1261         CC = AArch64CC::getInvertedCondCode(CC);
1262         return true;
1263       }
1264     }
1265 
1266     if (FalseCst) {
1267       int64_t F = FalseCst->Value.getSExtValue();
1268       if (F == 1) {
1269         // G_SELECT cc, t, 1 -> CSINC t, zreg, cc
1270         Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1271         False = ZReg;
1272         return true;
1273       }
1274 
1275       if (F == -1) {
1276         // G_SELECT cc, t, -1 -> CSINC t, zreg, cc
1277         Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1278         False = ZReg;
1279         return true;
1280       }
1281     }
1282     return false;
1283   };
1284 
1285   Optimized |= TryFoldBinOpIntoSelect(False, True, /*Invert = */ false);
1286   Optimized |= TryFoldBinOpIntoSelect(True, False, /*Invert = */ true);
1287   Optimized |= TryOptSelectCst();
1288   auto SelectInst = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC);
1289   constrainSelectedInstRegOperands(*SelectInst, TII, TRI, RBI);
1290   return &*SelectInst;
1291 }
1292 
1293 static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) {
1294   switch (P) {
1295   default:
1296     llvm_unreachable("Unknown condition code!");
1297   case CmpInst::ICMP_NE:
1298     return AArch64CC::NE;
1299   case CmpInst::ICMP_EQ:
1300     return AArch64CC::EQ;
1301   case CmpInst::ICMP_SGT:
1302     return AArch64CC::GT;
1303   case CmpInst::ICMP_SGE:
1304     return AArch64CC::GE;
1305   case CmpInst::ICMP_SLT:
1306     return AArch64CC::LT;
1307   case CmpInst::ICMP_SLE:
1308     return AArch64CC::LE;
1309   case CmpInst::ICMP_UGT:
1310     return AArch64CC::HI;
1311   case CmpInst::ICMP_UGE:
1312     return AArch64CC::HS;
1313   case CmpInst::ICMP_ULT:
1314     return AArch64CC::LO;
1315   case CmpInst::ICMP_ULE:
1316     return AArch64CC::LS;
1317   }
1318 }
1319 
1320 /// changeFPCCToORAArch64CC - Convert an IR fp condition code to an AArch64 CC.
1321 static void changeFPCCToORAArch64CC(CmpInst::Predicate CC,
1322                                     AArch64CC::CondCode &CondCode,
1323                                     AArch64CC::CondCode &CondCode2) {
1324   CondCode2 = AArch64CC::AL;
1325   switch (CC) {
1326   default:
1327     llvm_unreachable("Unknown FP condition!");
1328   case CmpInst::FCMP_OEQ:
1329     CondCode = AArch64CC::EQ;
1330     break;
1331   case CmpInst::FCMP_OGT:
1332     CondCode = AArch64CC::GT;
1333     break;
1334   case CmpInst::FCMP_OGE:
1335     CondCode = AArch64CC::GE;
1336     break;
1337   case CmpInst::FCMP_OLT:
1338     CondCode = AArch64CC::MI;
1339     break;
1340   case CmpInst::FCMP_OLE:
1341     CondCode = AArch64CC::LS;
1342     break;
1343   case CmpInst::FCMP_ONE:
1344     CondCode = AArch64CC::MI;
1345     CondCode2 = AArch64CC::GT;
1346     break;
1347   case CmpInst::FCMP_ORD:
1348     CondCode = AArch64CC::VC;
1349     break;
1350   case CmpInst::FCMP_UNO:
1351     CondCode = AArch64CC::VS;
1352     break;
1353   case CmpInst::FCMP_UEQ:
1354     CondCode = AArch64CC::EQ;
1355     CondCode2 = AArch64CC::VS;
1356     break;
1357   case CmpInst::FCMP_UGT:
1358     CondCode = AArch64CC::HI;
1359     break;
1360   case CmpInst::FCMP_UGE:
1361     CondCode = AArch64CC::PL;
1362     break;
1363   case CmpInst::FCMP_ULT:
1364     CondCode = AArch64CC::LT;
1365     break;
1366   case CmpInst::FCMP_ULE:
1367     CondCode = AArch64CC::LE;
1368     break;
1369   case CmpInst::FCMP_UNE:
1370     CondCode = AArch64CC::NE;
1371     break;
1372   }
1373 }
1374 
1375 /// Convert an IR fp condition code to an AArch64 CC.
1376 /// This differs from changeFPCCToAArch64CC in that it returns cond codes that
1377 /// should be AND'ed instead of OR'ed.
1378 static void changeFPCCToANDAArch64CC(CmpInst::Predicate CC,
1379                                      AArch64CC::CondCode &CondCode,
1380                                      AArch64CC::CondCode &CondCode2) {
1381   CondCode2 = AArch64CC::AL;
1382   switch (CC) {
1383   default:
1384     changeFPCCToORAArch64CC(CC, CondCode, CondCode2);
1385     assert(CondCode2 == AArch64CC::AL);
1386     break;
1387   case CmpInst::FCMP_ONE:
1388     // (a one b)
1389     // == ((a olt b) || (a ogt b))
1390     // == ((a ord b) && (a une b))
1391     CondCode = AArch64CC::VC;
1392     CondCode2 = AArch64CC::NE;
1393     break;
1394   case CmpInst::FCMP_UEQ:
1395     // (a ueq b)
1396     // == ((a uno b) || (a oeq b))
1397     // == ((a ule b) && (a uge b))
1398     CondCode = AArch64CC::PL;
1399     CondCode2 = AArch64CC::LE;
1400     break;
1401   }
1402 }
1403 
1404 /// Return a register which can be used as a bit to test in a TB(N)Z.
1405 static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert,
1406                               MachineRegisterInfo &MRI) {
1407   assert(Reg.isValid() && "Expected valid register!");
1408   bool HasZext = false;
1409   while (MachineInstr *MI = getDefIgnoringCopies(Reg, MRI)) {
1410     unsigned Opc = MI->getOpcode();
1411 
1412     if (!MI->getOperand(0).isReg() ||
1413         !MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
1414       break;
1415 
1416     // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
1417     //
1418     // (tbz (trunc x), b) -> (tbz x, b) is always safe, because the bit number
1419     // on the truncated x is the same as the bit number on x.
1420     if (Opc == TargetOpcode::G_ANYEXT || Opc == TargetOpcode::G_ZEXT ||
1421         Opc == TargetOpcode::G_TRUNC) {
1422       if (Opc == TargetOpcode::G_ZEXT)
1423         HasZext = true;
1424 
1425       Register NextReg = MI->getOperand(1).getReg();
1426       // Did we find something worth folding?
1427       if (!NextReg.isValid() || !MRI.hasOneNonDBGUse(NextReg))
1428         break;
1429 
1430       // NextReg is worth folding. Keep looking.
1431       Reg = NextReg;
1432       continue;
1433     }
1434 
1435     // Attempt to find a suitable operation with a constant on one side.
1436     std::optional<uint64_t> C;
1437     Register TestReg;
1438     switch (Opc) {
1439     default:
1440       break;
1441     case TargetOpcode::G_AND:
1442     case TargetOpcode::G_XOR: {
1443       TestReg = MI->getOperand(1).getReg();
1444       Register ConstantReg = MI->getOperand(2).getReg();
1445       auto VRegAndVal = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
1446       if (!VRegAndVal) {
1447         // AND commutes, check the other side for a constant.
1448         // FIXME: Can we canonicalize the constant so that it's always on the
1449         // same side at some point earlier?
1450         std::swap(ConstantReg, TestReg);
1451         VRegAndVal = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
1452       }
1453       if (VRegAndVal) {
1454         if (HasZext)
1455           C = VRegAndVal->Value.getZExtValue();
1456         else
1457           C = VRegAndVal->Value.getSExtValue();
1458       }
1459       break;
1460     }
1461     case TargetOpcode::G_ASHR:
1462     case TargetOpcode::G_LSHR:
1463     case TargetOpcode::G_SHL: {
1464       TestReg = MI->getOperand(1).getReg();
1465       auto VRegAndVal =
1466           getIConstantVRegValWithLookThrough(MI->getOperand(2).getReg(), MRI);
1467       if (VRegAndVal)
1468         C = VRegAndVal->Value.getSExtValue();
1469       break;
1470     }
1471     }
1472 
1473     // Didn't find a constant or viable register. Bail out of the loop.
1474     if (!C || !TestReg.isValid())
1475       break;
1476 
1477     // We found a suitable instruction with a constant. Check to see if we can
1478     // walk through the instruction.
1479     Register NextReg;
1480     unsigned TestRegSize = MRI.getType(TestReg).getSizeInBits();
1481     switch (Opc) {
1482     default:
1483       break;
1484     case TargetOpcode::G_AND:
1485       // (tbz (and x, m), b) -> (tbz x, b) when the b-th bit of m is set.
1486       if ((*C >> Bit) & 1)
1487         NextReg = TestReg;
1488       break;
1489     case TargetOpcode::G_SHL:
1490       // (tbz (shl x, c), b) -> (tbz x, b-c) when b-c is positive and fits in
1491       // the type of the register.
1492       if (*C <= Bit && (Bit - *C) < TestRegSize) {
1493         NextReg = TestReg;
1494         Bit = Bit - *C;
1495       }
1496       break;
1497     case TargetOpcode::G_ASHR:
1498       // (tbz (ashr x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits
1499       // in x
1500       NextReg = TestReg;
1501       Bit = Bit + *C;
1502       if (Bit >= TestRegSize)
1503         Bit = TestRegSize - 1;
1504       break;
1505     case TargetOpcode::G_LSHR:
1506       // (tbz (lshr x, c), b) -> (tbz x, b+c) when b + c is < # bits in x
1507       if ((Bit + *C) < TestRegSize) {
1508         NextReg = TestReg;
1509         Bit = Bit + *C;
1510       }
1511       break;
1512     case TargetOpcode::G_XOR:
1513       // We can walk through a G_XOR by inverting whether we use tbz/tbnz when
1514       // appropriate.
1515       //
1516       // e.g. If x' = xor x, c, and the b-th bit is set in c then
1517       //
1518       // tbz x', b -> tbnz x, b
1519       //
1520       // Because x' only has the b-th bit set if x does not.
1521       if ((*C >> Bit) & 1)
1522         Invert = !Invert;
1523       NextReg = TestReg;
1524       break;
1525     }
1526 
1527     // Check if we found anything worth folding.
1528     if (!NextReg.isValid())
1529       return Reg;
1530     Reg = NextReg;
1531   }
1532 
1533   return Reg;
1534 }
1535 
1536 MachineInstr *AArch64InstructionSelector::emitTestBit(
1537     Register TestReg, uint64_t Bit, bool IsNegative, MachineBasicBlock *DstMBB,
1538     MachineIRBuilder &MIB) const {
1539   assert(TestReg.isValid());
1540   assert(ProduceNonFlagSettingCondBr &&
1541          "Cannot emit TB(N)Z with speculation tracking!");
1542   MachineRegisterInfo &MRI = *MIB.getMRI();
1543 
1544   // Attempt to optimize the test bit by walking over instructions.
1545   TestReg = getTestBitReg(TestReg, Bit, IsNegative, MRI);
1546   LLT Ty = MRI.getType(TestReg);
1547   unsigned Size = Ty.getSizeInBits();
1548   assert(!Ty.isVector() && "Expected a scalar!");
1549   assert(Bit < 64 && "Bit is too large!");
1550 
1551   // When the test register is a 64-bit register, we have to narrow to make
1552   // TBNZW work.
1553   bool UseWReg = Bit < 32;
1554   unsigned NecessarySize = UseWReg ? 32 : 64;
1555   if (Size != NecessarySize)
1556     TestReg = moveScalarRegClass(
1557         TestReg, UseWReg ? AArch64::GPR32RegClass : AArch64::GPR64RegClass,
1558         MIB);
1559 
1560   static const unsigned OpcTable[2][2] = {{AArch64::TBZX, AArch64::TBNZX},
1561                                           {AArch64::TBZW, AArch64::TBNZW}};
1562   unsigned Opc = OpcTable[UseWReg][IsNegative];
1563   auto TestBitMI =
1564       MIB.buildInstr(Opc).addReg(TestReg).addImm(Bit).addMBB(DstMBB);
1565   constrainSelectedInstRegOperands(*TestBitMI, TII, TRI, RBI);
1566   return &*TestBitMI;
1567 }
1568 
1569 bool AArch64InstructionSelector::tryOptAndIntoCompareBranch(
1570     MachineInstr &AndInst, bool Invert, MachineBasicBlock *DstMBB,
1571     MachineIRBuilder &MIB) const {
1572   assert(AndInst.getOpcode() == TargetOpcode::G_AND && "Expected G_AND only?");
1573   // Given something like this:
1574   //
1575   //  %x = ...Something...
1576   //  %one = G_CONSTANT i64 1
1577   //  %zero = G_CONSTANT i64 0
1578   //  %and = G_AND %x, %one
1579   //  %cmp = G_ICMP intpred(ne), %and, %zero
1580   //  %cmp_trunc = G_TRUNC %cmp
1581   //  G_BRCOND %cmp_trunc, %bb.3
1582   //
1583   // We want to try and fold the AND into the G_BRCOND and produce either a
1584   // TBNZ (when we have intpred(ne)) or a TBZ (when we have intpred(eq)).
1585   //
1586   // In this case, we'd get
1587   //
1588   // TBNZ %x %bb.3
1589   //
1590 
1591   // Check if the AND has a constant on its RHS which we can use as a mask.
1592   // If it's a power of 2, then it's the same as checking a specific bit.
1593   // (e.g, ANDing with 8 == ANDing with 000...100 == testing if bit 3 is set)
1594   auto MaybeBit = getIConstantVRegValWithLookThrough(
1595       AndInst.getOperand(2).getReg(), *MIB.getMRI());
1596   if (!MaybeBit)
1597     return false;
1598 
1599   int32_t Bit = MaybeBit->Value.exactLogBase2();
1600   if (Bit < 0)
1601     return false;
1602 
1603   Register TestReg = AndInst.getOperand(1).getReg();
1604 
1605   // Emit a TB(N)Z.
1606   emitTestBit(TestReg, Bit, Invert, DstMBB, MIB);
1607   return true;
1608 }
1609 
1610 MachineInstr *AArch64InstructionSelector::emitCBZ(Register CompareReg,
1611                                                   bool IsNegative,
1612                                                   MachineBasicBlock *DestMBB,
1613                                                   MachineIRBuilder &MIB) const {
1614   assert(ProduceNonFlagSettingCondBr && "CBZ does not set flags!");
1615   MachineRegisterInfo &MRI = *MIB.getMRI();
1616   assert(RBI.getRegBank(CompareReg, MRI, TRI)->getID() ==
1617              AArch64::GPRRegBankID &&
1618          "Expected GPRs only?");
1619   auto Ty = MRI.getType(CompareReg);
1620   unsigned Width = Ty.getSizeInBits();
1621   assert(!Ty.isVector() && "Expected scalar only?");
1622   assert(Width <= 64 && "Expected width to be at most 64?");
1623   static const unsigned OpcTable[2][2] = {{AArch64::CBZW, AArch64::CBZX},
1624                                           {AArch64::CBNZW, AArch64::CBNZX}};
1625   unsigned Opc = OpcTable[IsNegative][Width == 64];
1626   auto BranchMI = MIB.buildInstr(Opc, {}, {CompareReg}).addMBB(DestMBB);
1627   constrainSelectedInstRegOperands(*BranchMI, TII, TRI, RBI);
1628   return &*BranchMI;
1629 }
1630 
1631 bool AArch64InstructionSelector::selectCompareBranchFedByFCmp(
1632     MachineInstr &I, MachineInstr &FCmp, MachineIRBuilder &MIB) const {
1633   assert(FCmp.getOpcode() == TargetOpcode::G_FCMP);
1634   assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1635   // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
1636   // totally clean.  Some of them require two branches to implement.
1637   auto Pred = (CmpInst::Predicate)FCmp.getOperand(1).getPredicate();
1638   emitFPCompare(FCmp.getOperand(2).getReg(), FCmp.getOperand(3).getReg(), MIB,
1639                 Pred);
1640   AArch64CC::CondCode CC1, CC2;
1641   changeFCMPPredToAArch64CC(static_cast<CmpInst::Predicate>(Pred), CC1, CC2);
1642   MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
1643   MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC1).addMBB(DestMBB);
1644   if (CC2 != AArch64CC::AL)
1645     MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC2).addMBB(DestMBB);
1646   I.eraseFromParent();
1647   return true;
1648 }
1649 
1650 bool AArch64InstructionSelector::tryOptCompareBranchFedByICmp(
1651     MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const {
1652   assert(ICmp.getOpcode() == TargetOpcode::G_ICMP);
1653   assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1654   // Attempt to optimize the G_BRCOND + G_ICMP into a TB(N)Z/CB(N)Z.
1655   //
1656   // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z
1657   // instructions will not be produced, as they are conditional branch
1658   // instructions that do not set flags.
1659   if (!ProduceNonFlagSettingCondBr)
1660     return false;
1661 
1662   MachineRegisterInfo &MRI = *MIB.getMRI();
1663   MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
1664   auto Pred =
1665       static_cast<CmpInst::Predicate>(ICmp.getOperand(1).getPredicate());
1666   Register LHS = ICmp.getOperand(2).getReg();
1667   Register RHS = ICmp.getOperand(3).getReg();
1668 
1669   // We're allowed to emit a TB(N)Z/CB(N)Z. Try to do that.
1670   auto VRegAndVal = getIConstantVRegValWithLookThrough(RHS, MRI);
1671   MachineInstr *AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI);
1672 
1673   // When we can emit a TB(N)Z, prefer that.
1674   //
1675   // Handle non-commutative condition codes first.
1676   // Note that we don't want to do this when we have a G_AND because it can
1677   // become a tst. The tst will make the test bit in the TB(N)Z redundant.
1678   if (VRegAndVal && !AndInst) {
1679     int64_t C = VRegAndVal->Value.getSExtValue();
1680 
1681     // When we have a greater-than comparison, we can just test if the msb is
1682     // zero.
1683     if (C == -1 && Pred == CmpInst::ICMP_SGT) {
1684       uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1;
1685       emitTestBit(LHS, Bit, /*IsNegative = */ false, DestMBB, MIB);
1686       I.eraseFromParent();
1687       return true;
1688     }
1689 
1690     // When we have a less than comparison, we can just test if the msb is not
1691     // zero.
1692     if (C == 0 && Pred == CmpInst::ICMP_SLT) {
1693       uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1;
1694       emitTestBit(LHS, Bit, /*IsNegative = */ true, DestMBB, MIB);
1695       I.eraseFromParent();
1696       return true;
1697     }
1698 
1699     // Inversely, if we have a signed greater-than-or-equal comparison to zero,
1700     // we can test if the msb is zero.
1701     if (C == 0 && Pred == CmpInst::ICMP_SGE) {
1702       uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1;
1703       emitTestBit(LHS, Bit, /*IsNegative = */ false, DestMBB, MIB);
1704       I.eraseFromParent();
1705       return true;
1706     }
1707   }
1708 
1709   // Attempt to handle commutative condition codes. Right now, that's only
1710   // eq/ne.
1711   if (ICmpInst::isEquality(Pred)) {
1712     if (!VRegAndVal) {
1713       std::swap(RHS, LHS);
1714       VRegAndVal = getIConstantVRegValWithLookThrough(RHS, MRI);
1715       AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI);
1716     }
1717 
1718     if (VRegAndVal && VRegAndVal->Value == 0) {
1719       // If there's a G_AND feeding into this branch, try to fold it away by
1720       // emitting a TB(N)Z instead.
1721       //
1722       // Note: If we have LT, then it *is* possible to fold, but it wouldn't be
1723       // beneficial. When we have an AND and LT, we need a TST/ANDS, so folding
1724       // would be redundant.
1725       if (AndInst &&
1726           tryOptAndIntoCompareBranch(
1727               *AndInst, /*Invert = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB)) {
1728         I.eraseFromParent();
1729         return true;
1730       }
1731 
1732       // Otherwise, try to emit a CB(N)Z instead.
1733       auto LHSTy = MRI.getType(LHS);
1734       if (!LHSTy.isVector() && LHSTy.getSizeInBits() <= 64) {
1735         emitCBZ(LHS, /*IsNegative = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB);
1736         I.eraseFromParent();
1737         return true;
1738       }
1739     }
1740   }
1741 
1742   return false;
1743 }
1744 
1745 bool AArch64InstructionSelector::selectCompareBranchFedByICmp(
1746     MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const {
1747   assert(ICmp.getOpcode() == TargetOpcode::G_ICMP);
1748   assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1749   if (tryOptCompareBranchFedByICmp(I, ICmp, MIB))
1750     return true;
1751 
1752   // Couldn't optimize. Emit a compare + a Bcc.
1753   MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
1754   auto PredOp = ICmp.getOperand(1);
1755   emitIntegerCompare(ICmp.getOperand(2), ICmp.getOperand(3), PredOp, MIB);
1756   const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(
1757       static_cast<CmpInst::Predicate>(PredOp.getPredicate()));
1758   MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC).addMBB(DestMBB);
1759   I.eraseFromParent();
1760   return true;
1761 }
1762 
1763 bool AArch64InstructionSelector::selectCompareBranch(
1764     MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) {
1765   Register CondReg = I.getOperand(0).getReg();
1766   MachineInstr *CCMI = MRI.getVRegDef(CondReg);
1767   // Try to select the G_BRCOND using whatever is feeding the condition if
1768   // possible.
1769   unsigned CCMIOpc = CCMI->getOpcode();
1770   if (CCMIOpc == TargetOpcode::G_FCMP)
1771     return selectCompareBranchFedByFCmp(I, *CCMI, MIB);
1772   if (CCMIOpc == TargetOpcode::G_ICMP)
1773     return selectCompareBranchFedByICmp(I, *CCMI, MIB);
1774 
1775   // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z
1776   // instructions will not be produced, as they are conditional branch
1777   // instructions that do not set flags.
1778   if (ProduceNonFlagSettingCondBr) {
1779     emitTestBit(CondReg, /*Bit = */ 0, /*IsNegative = */ true,
1780                 I.getOperand(1).getMBB(), MIB);
1781     I.eraseFromParent();
1782     return true;
1783   }
1784 
1785   // Can't emit TB(N)Z/CB(N)Z. Emit a tst + bcc instead.
1786   auto TstMI =
1787       MIB.buildInstr(AArch64::ANDSWri, {LLT::scalar(32)}, {CondReg}).addImm(1);
1788   constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
1789   auto Bcc = MIB.buildInstr(AArch64::Bcc)
1790                  .addImm(AArch64CC::NE)
1791                  .addMBB(I.getOperand(1).getMBB());
1792   I.eraseFromParent();
1793   return constrainSelectedInstRegOperands(*Bcc, TII, TRI, RBI);
1794 }
1795 
1796 /// Returns the element immediate value of a vector shift operand if found.
1797 /// This needs to detect a splat-like operation, e.g. a G_BUILD_VECTOR.
1798 static std::optional<int64_t> getVectorShiftImm(Register Reg,
1799                                                 MachineRegisterInfo &MRI) {
1800   assert(MRI.getType(Reg).isVector() && "Expected a *vector* shift operand");
1801   MachineInstr *OpMI = MRI.getVRegDef(Reg);
1802   return getAArch64VectorSplatScalar(*OpMI, MRI);
1803 }
1804 
1805 /// Matches and returns the shift immediate value for a SHL instruction given
1806 /// a shift operand.
1807 static std::optional<int64_t> getVectorSHLImm(LLT SrcTy, Register Reg,
1808                                               MachineRegisterInfo &MRI) {
1809   std::optional<int64_t> ShiftImm = getVectorShiftImm(Reg, MRI);
1810   if (!ShiftImm)
1811     return std::nullopt;
1812   // Check the immediate is in range for a SHL.
1813   int64_t Imm = *ShiftImm;
1814   if (Imm < 0)
1815     return std::nullopt;
1816   switch (SrcTy.getElementType().getSizeInBits()) {
1817   default:
1818     LLVM_DEBUG(dbgs() << "Unhandled element type for vector shift");
1819     return std::nullopt;
1820   case 8:
1821     if (Imm > 7)
1822       return std::nullopt;
1823     break;
1824   case 16:
1825     if (Imm > 15)
1826       return std::nullopt;
1827     break;
1828   case 32:
1829     if (Imm > 31)
1830       return std::nullopt;
1831     break;
1832   case 64:
1833     if (Imm > 63)
1834       return std::nullopt;
1835     break;
1836   }
1837   return Imm;
1838 }
1839 
1840 bool AArch64InstructionSelector::selectVectorSHL(MachineInstr &I,
1841                                                  MachineRegisterInfo &MRI) {
1842   assert(I.getOpcode() == TargetOpcode::G_SHL);
1843   Register DstReg = I.getOperand(0).getReg();
1844   const LLT Ty = MRI.getType(DstReg);
1845   Register Src1Reg = I.getOperand(1).getReg();
1846   Register Src2Reg = I.getOperand(2).getReg();
1847 
1848   if (!Ty.isVector())
1849     return false;
1850 
1851   // Check if we have a vector of constants on RHS that we can select as the
1852   // immediate form.
1853   std::optional<int64_t> ImmVal = getVectorSHLImm(Ty, Src2Reg, MRI);
1854 
1855   unsigned Opc = 0;
1856   if (Ty == LLT::fixed_vector(2, 64)) {
1857     Opc = ImmVal ? AArch64::SHLv2i64_shift : AArch64::USHLv2i64;
1858   } else if (Ty == LLT::fixed_vector(4, 32)) {
1859     Opc = ImmVal ? AArch64::SHLv4i32_shift : AArch64::USHLv4i32;
1860   } else if (Ty == LLT::fixed_vector(2, 32)) {
1861     Opc = ImmVal ? AArch64::SHLv2i32_shift : AArch64::USHLv2i32;
1862   } else if (Ty == LLT::fixed_vector(4, 16)) {
1863     Opc = ImmVal ? AArch64::SHLv4i16_shift : AArch64::USHLv4i16;
1864   } else if (Ty == LLT::fixed_vector(8, 16)) {
1865     Opc = ImmVal ? AArch64::SHLv8i16_shift : AArch64::USHLv8i16;
1866   } else if (Ty == LLT::fixed_vector(16, 8)) {
1867     Opc = ImmVal ? AArch64::SHLv16i8_shift : AArch64::USHLv16i8;
1868   } else if (Ty == LLT::fixed_vector(8, 8)) {
1869     Opc = ImmVal ? AArch64::SHLv8i8_shift : AArch64::USHLv8i8;
1870   } else {
1871     LLVM_DEBUG(dbgs() << "Unhandled G_SHL type");
1872     return false;
1873   }
1874 
1875   auto Shl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg});
1876   if (ImmVal)
1877     Shl.addImm(*ImmVal);
1878   else
1879     Shl.addUse(Src2Reg);
1880   constrainSelectedInstRegOperands(*Shl, TII, TRI, RBI);
1881   I.eraseFromParent();
1882   return true;
1883 }
1884 
1885 bool AArch64InstructionSelector::selectVectorAshrLshr(
1886     MachineInstr &I, MachineRegisterInfo &MRI) {
1887   assert(I.getOpcode() == TargetOpcode::G_ASHR ||
1888          I.getOpcode() == TargetOpcode::G_LSHR);
1889   Register DstReg = I.getOperand(0).getReg();
1890   const LLT Ty = MRI.getType(DstReg);
1891   Register Src1Reg = I.getOperand(1).getReg();
1892   Register Src2Reg = I.getOperand(2).getReg();
1893 
1894   if (!Ty.isVector())
1895     return false;
1896 
1897   bool IsASHR = I.getOpcode() == TargetOpcode::G_ASHR;
1898 
1899   // We expect the immediate case to be lowered in the PostLegalCombiner to
1900   // AArch64ISD::VASHR or AArch64ISD::VLSHR equivalents.
1901 
1902   // There is not a shift right register instruction, but the shift left
1903   // register instruction takes a signed value, where negative numbers specify a
1904   // right shift.
1905 
1906   unsigned Opc = 0;
1907   unsigned NegOpc = 0;
1908   const TargetRegisterClass *RC =
1909       getRegClassForTypeOnBank(Ty, RBI.getRegBank(AArch64::FPRRegBankID));
1910   if (Ty == LLT::fixed_vector(2, 64)) {
1911     Opc = IsASHR ? AArch64::SSHLv2i64 : AArch64::USHLv2i64;
1912     NegOpc = AArch64::NEGv2i64;
1913   } else if (Ty == LLT::fixed_vector(4, 32)) {
1914     Opc = IsASHR ? AArch64::SSHLv4i32 : AArch64::USHLv4i32;
1915     NegOpc = AArch64::NEGv4i32;
1916   } else if (Ty == LLT::fixed_vector(2, 32)) {
1917     Opc = IsASHR ? AArch64::SSHLv2i32 : AArch64::USHLv2i32;
1918     NegOpc = AArch64::NEGv2i32;
1919   } else if (Ty == LLT::fixed_vector(4, 16)) {
1920     Opc = IsASHR ? AArch64::SSHLv4i16 : AArch64::USHLv4i16;
1921     NegOpc = AArch64::NEGv4i16;
1922   } else if (Ty == LLT::fixed_vector(8, 16)) {
1923     Opc = IsASHR ? AArch64::SSHLv8i16 : AArch64::USHLv8i16;
1924     NegOpc = AArch64::NEGv8i16;
1925   } else if (Ty == LLT::fixed_vector(16, 8)) {
1926     Opc = IsASHR ? AArch64::SSHLv16i8 : AArch64::USHLv16i8;
1927     NegOpc = AArch64::NEGv16i8;
1928   } else if (Ty == LLT::fixed_vector(8, 8)) {
1929     Opc = IsASHR ? AArch64::SSHLv8i8 : AArch64::USHLv8i8;
1930     NegOpc = AArch64::NEGv8i8;
1931   } else {
1932     LLVM_DEBUG(dbgs() << "Unhandled G_ASHR type");
1933     return false;
1934   }
1935 
1936   auto Neg = MIB.buildInstr(NegOpc, {RC}, {Src2Reg});
1937   constrainSelectedInstRegOperands(*Neg, TII, TRI, RBI);
1938   auto SShl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg, Neg});
1939   constrainSelectedInstRegOperands(*SShl, TII, TRI, RBI);
1940   I.eraseFromParent();
1941   return true;
1942 }
1943 
1944 bool AArch64InstructionSelector::selectVaStartAAPCS(
1945     MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
1946   return false;
1947 }
1948 
1949 bool AArch64InstructionSelector::selectVaStartDarwin(
1950     MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
1951   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
1952   Register ListReg = I.getOperand(0).getReg();
1953 
1954   Register ArgsAddrReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
1955 
1956   int FrameIdx = FuncInfo->getVarArgsStackIndex();
1957   if (MF.getSubtarget<AArch64Subtarget>().isCallingConvWin64(
1958           MF.getFunction().getCallingConv())) {
1959     FrameIdx = FuncInfo->getVarArgsGPRSize() > 0
1960                    ? FuncInfo->getVarArgsGPRIndex()
1961                    : FuncInfo->getVarArgsStackIndex();
1962   }
1963 
1964   auto MIB =
1965       BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::ADDXri))
1966           .addDef(ArgsAddrReg)
1967           .addFrameIndex(FrameIdx)
1968           .addImm(0)
1969           .addImm(0);
1970 
1971   constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1972 
1973   MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::STRXui))
1974             .addUse(ArgsAddrReg)
1975             .addUse(ListReg)
1976             .addImm(0)
1977             .addMemOperand(*I.memoperands_begin());
1978 
1979   constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1980   I.eraseFromParent();
1981   return true;
1982 }
1983 
1984 void AArch64InstructionSelector::materializeLargeCMVal(
1985     MachineInstr &I, const Value *V, unsigned OpFlags) {
1986   MachineBasicBlock &MBB = *I.getParent();
1987   MachineFunction &MF = *MBB.getParent();
1988   MachineRegisterInfo &MRI = MF.getRegInfo();
1989 
1990   auto MovZ = MIB.buildInstr(AArch64::MOVZXi, {&AArch64::GPR64RegClass}, {});
1991   MovZ->addOperand(MF, I.getOperand(1));
1992   MovZ->getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_G0 |
1993                                      AArch64II::MO_NC);
1994   MovZ->addOperand(MF, MachineOperand::CreateImm(0));
1995   constrainSelectedInstRegOperands(*MovZ, TII, TRI, RBI);
1996 
1997   auto BuildMovK = [&](Register SrcReg, unsigned char Flags, unsigned Offset,
1998                        Register ForceDstReg) {
1999     Register DstReg = ForceDstReg
2000                           ? ForceDstReg
2001                           : MRI.createVirtualRegister(&AArch64::GPR64RegClass);
2002     auto MovI = MIB.buildInstr(AArch64::MOVKXi).addDef(DstReg).addUse(SrcReg);
2003     if (auto *GV = dyn_cast<GlobalValue>(V)) {
2004       MovI->addOperand(MF, MachineOperand::CreateGA(
2005                                GV, MovZ->getOperand(1).getOffset(), Flags));
2006     } else {
2007       MovI->addOperand(
2008           MF, MachineOperand::CreateBA(cast<BlockAddress>(V),
2009                                        MovZ->getOperand(1).getOffset(), Flags));
2010     }
2011     MovI->addOperand(MF, MachineOperand::CreateImm(Offset));
2012     constrainSelectedInstRegOperands(*MovI, TII, TRI, RBI);
2013     return DstReg;
2014   };
2015   Register DstReg = BuildMovK(MovZ.getReg(0),
2016                               AArch64II::MO_G1 | AArch64II::MO_NC, 16, 0);
2017   DstReg = BuildMovK(DstReg, AArch64II::MO_G2 | AArch64II::MO_NC, 32, 0);
2018   BuildMovK(DstReg, AArch64II::MO_G3, 48, I.getOperand(0).getReg());
2019 }
2020 
2021 bool AArch64InstructionSelector::preISelLower(MachineInstr &I) {
2022   MachineBasicBlock &MBB = *I.getParent();
2023   MachineFunction &MF = *MBB.getParent();
2024   MachineRegisterInfo &MRI = MF.getRegInfo();
2025 
2026   switch (I.getOpcode()) {
2027   case TargetOpcode::G_STORE: {
2028     bool Changed = contractCrossBankCopyIntoStore(I, MRI);
2029     MachineOperand &SrcOp = I.getOperand(0);
2030     if (MRI.getType(SrcOp.getReg()).isPointer()) {
2031       // Allow matching with imported patterns for stores of pointers. Unlike
2032       // G_LOAD/G_PTR_ADD, we may not have selected all users. So, emit a copy
2033       // and constrain.
2034       auto Copy = MIB.buildCopy(LLT::scalar(64), SrcOp);
2035       Register NewSrc = Copy.getReg(0);
2036       SrcOp.setReg(NewSrc);
2037       RBI.constrainGenericRegister(NewSrc, AArch64::GPR64RegClass, MRI);
2038       Changed = true;
2039     }
2040     return Changed;
2041   }
2042   case TargetOpcode::G_PTR_ADD:
2043     return convertPtrAddToAdd(I, MRI);
2044   case TargetOpcode::G_LOAD: {
2045     // For scalar loads of pointers, we try to convert the dest type from p0
2046     // to s64 so that our imported patterns can match. Like with the G_PTR_ADD
2047     // conversion, this should be ok because all users should have been
2048     // selected already, so the type doesn't matter for them.
2049     Register DstReg = I.getOperand(0).getReg();
2050     const LLT DstTy = MRI.getType(DstReg);
2051     if (!DstTy.isPointer())
2052       return false;
2053     MRI.setType(DstReg, LLT::scalar(64));
2054     return true;
2055   }
2056   case AArch64::G_DUP: {
2057     // Convert the type from p0 to s64 to help selection.
2058     LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2059     if (!DstTy.getElementType().isPointer())
2060       return false;
2061     auto NewSrc = MIB.buildCopy(LLT::scalar(64), I.getOperand(1).getReg());
2062     MRI.setType(I.getOperand(0).getReg(),
2063                 DstTy.changeElementType(LLT::scalar(64)));
2064     MRI.setRegClass(NewSrc.getReg(0), &AArch64::GPR64RegClass);
2065     I.getOperand(1).setReg(NewSrc.getReg(0));
2066     return true;
2067   }
2068   case TargetOpcode::G_UITOFP:
2069   case TargetOpcode::G_SITOFP: {
2070     // If both source and destination regbanks are FPR, then convert the opcode
2071     // to G_SITOF so that the importer can select it to an fpr variant.
2072     // Otherwise, it ends up matching an fpr/gpr variant and adding a cross-bank
2073     // copy.
2074     Register SrcReg = I.getOperand(1).getReg();
2075     LLT SrcTy = MRI.getType(SrcReg);
2076     LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2077     if (SrcTy.isVector() || SrcTy.getSizeInBits() != DstTy.getSizeInBits())
2078       return false;
2079 
2080     if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::FPRRegBankID) {
2081       if (I.getOpcode() == TargetOpcode::G_SITOFP)
2082         I.setDesc(TII.get(AArch64::G_SITOF));
2083       else
2084         I.setDesc(TII.get(AArch64::G_UITOF));
2085       return true;
2086     }
2087     return false;
2088   }
2089   default:
2090     return false;
2091   }
2092 }
2093 
2094 /// This lowering tries to look for G_PTR_ADD instructions and then converts
2095 /// them to a standard G_ADD with a COPY on the source.
2096 ///
2097 /// The motivation behind this is to expose the add semantics to the imported
2098 /// tablegen patterns. We shouldn't need to check for uses being loads/stores,
2099 /// because the selector works bottom up, uses before defs. By the time we
2100 /// end up trying to select a G_PTR_ADD, we should have already attempted to
2101 /// fold this into addressing modes and were therefore unsuccessful.
2102 bool AArch64InstructionSelector::convertPtrAddToAdd(
2103     MachineInstr &I, MachineRegisterInfo &MRI) {
2104   assert(I.getOpcode() == TargetOpcode::G_PTR_ADD && "Expected G_PTR_ADD");
2105   Register DstReg = I.getOperand(0).getReg();
2106   Register AddOp1Reg = I.getOperand(1).getReg();
2107   const LLT PtrTy = MRI.getType(DstReg);
2108   if (PtrTy.getAddressSpace() != 0)
2109     return false;
2110 
2111   const LLT CastPtrTy =
2112       PtrTy.isVector() ? LLT::fixed_vector(2, 64) : LLT::scalar(64);
2113   auto PtrToInt = MIB.buildPtrToInt(CastPtrTy, AddOp1Reg);
2114   // Set regbanks on the registers.
2115   if (PtrTy.isVector())
2116     MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::FPRRegBankID));
2117   else
2118     MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID));
2119 
2120   // Now turn the %dst(p0) = G_PTR_ADD %base, off into:
2121   // %dst(intty) = G_ADD %intbase, off
2122   I.setDesc(TII.get(TargetOpcode::G_ADD));
2123   MRI.setType(DstReg, CastPtrTy);
2124   I.getOperand(1).setReg(PtrToInt.getReg(0));
2125   if (!select(*PtrToInt)) {
2126     LLVM_DEBUG(dbgs() << "Failed to select G_PTRTOINT in convertPtrAddToAdd");
2127     return false;
2128   }
2129 
2130   // Also take the opportunity here to try to do some optimization.
2131   // Try to convert this into a G_SUB if the offset is a 0-x negate idiom.
2132   Register NegatedReg;
2133   if (!mi_match(I.getOperand(2).getReg(), MRI, m_Neg(m_Reg(NegatedReg))))
2134     return true;
2135   I.getOperand(2).setReg(NegatedReg);
2136   I.setDesc(TII.get(TargetOpcode::G_SUB));
2137   return true;
2138 }
2139 
2140 bool AArch64InstructionSelector::earlySelectSHL(MachineInstr &I,
2141                                                 MachineRegisterInfo &MRI) {
2142   // We try to match the immediate variant of LSL, which is actually an alias
2143   // for a special case of UBFM. Otherwise, we fall back to the imported
2144   // selector which will match the register variant.
2145   assert(I.getOpcode() == TargetOpcode::G_SHL && "unexpected op");
2146   const auto &MO = I.getOperand(2);
2147   auto VRegAndVal = getIConstantVRegVal(MO.getReg(), MRI);
2148   if (!VRegAndVal)
2149     return false;
2150 
2151   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2152   if (DstTy.isVector())
2153     return false;
2154   bool Is64Bit = DstTy.getSizeInBits() == 64;
2155   auto Imm1Fn = Is64Bit ? selectShiftA_64(MO) : selectShiftA_32(MO);
2156   auto Imm2Fn = Is64Bit ? selectShiftB_64(MO) : selectShiftB_32(MO);
2157 
2158   if (!Imm1Fn || !Imm2Fn)
2159     return false;
2160 
2161   auto NewI =
2162       MIB.buildInstr(Is64Bit ? AArch64::UBFMXri : AArch64::UBFMWri,
2163                      {I.getOperand(0).getReg()}, {I.getOperand(1).getReg()});
2164 
2165   for (auto &RenderFn : *Imm1Fn)
2166     RenderFn(NewI);
2167   for (auto &RenderFn : *Imm2Fn)
2168     RenderFn(NewI);
2169 
2170   I.eraseFromParent();
2171   return constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI);
2172 }
2173 
2174 bool AArch64InstructionSelector::contractCrossBankCopyIntoStore(
2175     MachineInstr &I, MachineRegisterInfo &MRI) {
2176   assert(I.getOpcode() == TargetOpcode::G_STORE && "Expected G_STORE");
2177   // If we're storing a scalar, it doesn't matter what register bank that
2178   // scalar is on. All that matters is the size.
2179   //
2180   // So, if we see something like this (with a 32-bit scalar as an example):
2181   //
2182   // %x:gpr(s32) = ... something ...
2183   // %y:fpr(s32) = COPY %x:gpr(s32)
2184   // G_STORE %y:fpr(s32)
2185   //
2186   // We can fix this up into something like this:
2187   //
2188   // G_STORE %x:gpr(s32)
2189   //
2190   // And then continue the selection process normally.
2191   Register DefDstReg = getSrcRegIgnoringCopies(I.getOperand(0).getReg(), MRI);
2192   if (!DefDstReg.isValid())
2193     return false;
2194   LLT DefDstTy = MRI.getType(DefDstReg);
2195   Register StoreSrcReg = I.getOperand(0).getReg();
2196   LLT StoreSrcTy = MRI.getType(StoreSrcReg);
2197 
2198   // If we get something strange like a physical register, then we shouldn't
2199   // go any further.
2200   if (!DefDstTy.isValid())
2201     return false;
2202 
2203   // Are the source and dst types the same size?
2204   if (DefDstTy.getSizeInBits() != StoreSrcTy.getSizeInBits())
2205     return false;
2206 
2207   if (RBI.getRegBank(StoreSrcReg, MRI, TRI) ==
2208       RBI.getRegBank(DefDstReg, MRI, TRI))
2209     return false;
2210 
2211   // We have a cross-bank copy, which is entering a store. Let's fold it.
2212   I.getOperand(0).setReg(DefDstReg);
2213   return true;
2214 }
2215 
2216 bool AArch64InstructionSelector::earlySelect(MachineInstr &I) {
2217   assert(I.getParent() && "Instruction should be in a basic block!");
2218   assert(I.getParent()->getParent() && "Instruction should be in a function!");
2219 
2220   MachineBasicBlock &MBB = *I.getParent();
2221   MachineFunction &MF = *MBB.getParent();
2222   MachineRegisterInfo &MRI = MF.getRegInfo();
2223 
2224   switch (I.getOpcode()) {
2225   case AArch64::G_DUP: {
2226     // Before selecting a DUP instruction, check if it is better selected as a
2227     // MOV or load from a constant pool.
2228     Register Src = I.getOperand(1).getReg();
2229     auto ValAndVReg = getIConstantVRegValWithLookThrough(Src, MRI);
2230     if (!ValAndVReg)
2231       return false;
2232     LLVMContext &Ctx = MF.getFunction().getContext();
2233     Register Dst = I.getOperand(0).getReg();
2234     auto *CV = ConstantDataVector::getSplat(
2235         MRI.getType(Dst).getNumElements(),
2236         ConstantInt::get(Type::getIntNTy(Ctx, MRI.getType(Src).getSizeInBits()),
2237                          ValAndVReg->Value));
2238     if (!emitConstantVector(Dst, CV, MIB, MRI))
2239       return false;
2240     I.eraseFromParent();
2241     return true;
2242   }
2243   case TargetOpcode::G_SEXT:
2244     // Check for i64 sext(i32 vector_extract) prior to tablegen to select SMOV
2245     // over a normal extend.
2246     if (selectUSMovFromExtend(I, MRI))
2247       return true;
2248     return false;
2249   case TargetOpcode::G_BR:
2250     return false;
2251   case TargetOpcode::G_SHL:
2252     return earlySelectSHL(I, MRI);
2253   case TargetOpcode::G_CONSTANT: {
2254     bool IsZero = false;
2255     if (I.getOperand(1).isCImm())
2256       IsZero = I.getOperand(1).getCImm()->isZero();
2257     else if (I.getOperand(1).isImm())
2258       IsZero = I.getOperand(1).getImm() == 0;
2259 
2260     if (!IsZero)
2261       return false;
2262 
2263     Register DefReg = I.getOperand(0).getReg();
2264     LLT Ty = MRI.getType(DefReg);
2265     if (Ty.getSizeInBits() == 64) {
2266       I.getOperand(1).ChangeToRegister(AArch64::XZR, false);
2267       RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI);
2268     } else if (Ty.getSizeInBits() == 32) {
2269       I.getOperand(1).ChangeToRegister(AArch64::WZR, false);
2270       RBI.constrainGenericRegister(DefReg, AArch64::GPR32RegClass, MRI);
2271     } else
2272       return false;
2273 
2274     I.setDesc(TII.get(TargetOpcode::COPY));
2275     return true;
2276   }
2277 
2278   case TargetOpcode::G_ADD: {
2279     // Check if this is being fed by a G_ICMP on either side.
2280     //
2281     // (cmp pred, x, y) + z
2282     //
2283     // In the above case, when the cmp is true, we increment z by 1. So, we can
2284     // fold the add into the cset for the cmp by using cinc.
2285     //
2286     // FIXME: This would probably be a lot nicer in PostLegalizerLowering.
2287     Register AddDst = I.getOperand(0).getReg();
2288     Register AddLHS = I.getOperand(1).getReg();
2289     Register AddRHS = I.getOperand(2).getReg();
2290     // Only handle scalars.
2291     LLT Ty = MRI.getType(AddLHS);
2292     if (Ty.isVector())
2293       return false;
2294     // Since G_ICMP is modeled as ADDS/SUBS/ANDS, we can handle 32 bits or 64
2295     // bits.
2296     unsigned Size = Ty.getSizeInBits();
2297     if (Size != 32 && Size != 64)
2298       return false;
2299     auto MatchCmp = [&](Register Reg) -> MachineInstr * {
2300       if (!MRI.hasOneNonDBGUse(Reg))
2301         return nullptr;
2302       // If the LHS of the add is 32 bits, then we want to fold a 32-bit
2303       // compare.
2304       if (Size == 32)
2305         return getOpcodeDef(TargetOpcode::G_ICMP, Reg, MRI);
2306       // We model scalar compares using 32-bit destinations right now.
2307       // If it's a 64-bit compare, it'll have 64-bit sources.
2308       Register ZExt;
2309       if (!mi_match(Reg, MRI,
2310                     m_OneNonDBGUse(m_GZExt(m_OneNonDBGUse(m_Reg(ZExt))))))
2311         return nullptr;
2312       auto *Cmp = getOpcodeDef(TargetOpcode::G_ICMP, ZExt, MRI);
2313       if (!Cmp ||
2314           MRI.getType(Cmp->getOperand(2).getReg()).getSizeInBits() != 64)
2315         return nullptr;
2316       return Cmp;
2317     };
2318     // Try to match
2319     // z + (cmp pred, x, y)
2320     MachineInstr *Cmp = MatchCmp(AddRHS);
2321     if (!Cmp) {
2322       // (cmp pred, x, y) + z
2323       std::swap(AddLHS, AddRHS);
2324       Cmp = MatchCmp(AddRHS);
2325       if (!Cmp)
2326         return false;
2327     }
2328     auto &PredOp = Cmp->getOperand(1);
2329     auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate());
2330     const AArch64CC::CondCode InvCC =
2331         changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred));
2332     MIB.setInstrAndDebugLoc(I);
2333     emitIntegerCompare(/*LHS=*/Cmp->getOperand(2),
2334                        /*RHS=*/Cmp->getOperand(3), PredOp, MIB);
2335     emitCSINC(/*Dst=*/AddDst, /*Src =*/AddLHS, /*Src2=*/AddLHS, InvCC, MIB);
2336     I.eraseFromParent();
2337     return true;
2338   }
2339   case TargetOpcode::G_OR: {
2340     // Look for operations that take the lower `Width=Size-ShiftImm` bits of
2341     // `ShiftSrc` and insert them into the upper `Width` bits of `MaskSrc` via
2342     // shifting and masking that we can replace with a BFI (encoded as a BFM).
2343     Register Dst = I.getOperand(0).getReg();
2344     LLT Ty = MRI.getType(Dst);
2345 
2346     if (!Ty.isScalar())
2347       return false;
2348 
2349     unsigned Size = Ty.getSizeInBits();
2350     if (Size != 32 && Size != 64)
2351       return false;
2352 
2353     Register ShiftSrc;
2354     int64_t ShiftImm;
2355     Register MaskSrc;
2356     int64_t MaskImm;
2357     if (!mi_match(
2358             Dst, MRI,
2359             m_GOr(m_OneNonDBGUse(m_GShl(m_Reg(ShiftSrc), m_ICst(ShiftImm))),
2360                   m_OneNonDBGUse(m_GAnd(m_Reg(MaskSrc), m_ICst(MaskImm))))))
2361       return false;
2362 
2363     if (ShiftImm > Size || ((1ULL << ShiftImm) - 1ULL) != uint64_t(MaskImm))
2364       return false;
2365 
2366     int64_t Immr = Size - ShiftImm;
2367     int64_t Imms = Size - ShiftImm - 1;
2368     unsigned Opc = Size == 32 ? AArch64::BFMWri : AArch64::BFMXri;
2369     emitInstr(Opc, {Dst}, {MaskSrc, ShiftSrc, Immr, Imms}, MIB);
2370     I.eraseFromParent();
2371     return true;
2372   }
2373   case TargetOpcode::G_FENCE: {
2374     if (I.getOperand(1).getImm() == 0)
2375       BuildMI(MBB, I, MIMetadata(I), TII.get(TargetOpcode::MEMBARRIER));
2376     else
2377       BuildMI(MBB, I, MIMetadata(I), TII.get(AArch64::DMB))
2378           .addImm(I.getOperand(0).getImm() == 4 ? 0x9 : 0xb);
2379     I.eraseFromParent();
2380     return true;
2381   }
2382   default:
2383     return false;
2384   }
2385 }
2386 
2387 bool AArch64InstructionSelector::select(MachineInstr &I) {
2388   assert(I.getParent() && "Instruction should be in a basic block!");
2389   assert(I.getParent()->getParent() && "Instruction should be in a function!");
2390 
2391   MachineBasicBlock &MBB = *I.getParent();
2392   MachineFunction &MF = *MBB.getParent();
2393   MachineRegisterInfo &MRI = MF.getRegInfo();
2394 
2395   const AArch64Subtarget *Subtarget = &MF.getSubtarget<AArch64Subtarget>();
2396   if (Subtarget->requiresStrictAlign()) {
2397     // We don't support this feature yet.
2398     LLVM_DEBUG(dbgs() << "AArch64 GISel does not support strict-align yet\n");
2399     return false;
2400   }
2401 
2402   MIB.setInstrAndDebugLoc(I);
2403 
2404   unsigned Opcode = I.getOpcode();
2405   // G_PHI requires same handling as PHI
2406   if (!I.isPreISelOpcode() || Opcode == TargetOpcode::G_PHI) {
2407     // Certain non-generic instructions also need some special handling.
2408 
2409     if (Opcode ==  TargetOpcode::LOAD_STACK_GUARD)
2410       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2411 
2412     if (Opcode == TargetOpcode::PHI || Opcode == TargetOpcode::G_PHI) {
2413       const Register DefReg = I.getOperand(0).getReg();
2414       const LLT DefTy = MRI.getType(DefReg);
2415 
2416       const RegClassOrRegBank &RegClassOrBank =
2417         MRI.getRegClassOrRegBank(DefReg);
2418 
2419       const TargetRegisterClass *DefRC
2420         = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
2421       if (!DefRC) {
2422         if (!DefTy.isValid()) {
2423           LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
2424           return false;
2425         }
2426         const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
2427         DefRC = getRegClassForTypeOnBank(DefTy, RB);
2428         if (!DefRC) {
2429           LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
2430           return false;
2431         }
2432       }
2433 
2434       I.setDesc(TII.get(TargetOpcode::PHI));
2435 
2436       return RBI.constrainGenericRegister(DefReg, *DefRC, MRI);
2437     }
2438 
2439     if (I.isCopy())
2440       return selectCopy(I, TII, MRI, TRI, RBI);
2441 
2442     if (I.isDebugInstr())
2443       return selectDebugInstr(I, MRI, RBI);
2444 
2445     return true;
2446   }
2447 
2448 
2449   if (I.getNumOperands() != I.getNumExplicitOperands()) {
2450     LLVM_DEBUG(
2451         dbgs() << "Generic instruction has unexpected implicit operands\n");
2452     return false;
2453   }
2454 
2455   // Try to do some lowering before we start instruction selecting. These
2456   // lowerings are purely transformations on the input G_MIR and so selection
2457   // must continue after any modification of the instruction.
2458   if (preISelLower(I)) {
2459     Opcode = I.getOpcode(); // The opcode may have been modified, refresh it.
2460   }
2461 
2462   // There may be patterns where the importer can't deal with them optimally,
2463   // but does select it to a suboptimal sequence so our custom C++ selection
2464   // code later never has a chance to work on it. Therefore, we have an early
2465   // selection attempt here to give priority to certain selection routines
2466   // over the imported ones.
2467   if (earlySelect(I))
2468     return true;
2469 
2470   if (selectImpl(I, *CoverageInfo))
2471     return true;
2472 
2473   LLT Ty =
2474       I.getOperand(0).isReg() ? MRI.getType(I.getOperand(0).getReg()) : LLT{};
2475 
2476   switch (Opcode) {
2477   case TargetOpcode::G_SBFX:
2478   case TargetOpcode::G_UBFX: {
2479     static const unsigned OpcTable[2][2] = {
2480         {AArch64::UBFMWri, AArch64::UBFMXri},
2481         {AArch64::SBFMWri, AArch64::SBFMXri}};
2482     bool IsSigned = Opcode == TargetOpcode::G_SBFX;
2483     unsigned Size = Ty.getSizeInBits();
2484     unsigned Opc = OpcTable[IsSigned][Size == 64];
2485     auto Cst1 =
2486         getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), MRI);
2487     assert(Cst1 && "Should have gotten a constant for src 1?");
2488     auto Cst2 =
2489         getIConstantVRegValWithLookThrough(I.getOperand(3).getReg(), MRI);
2490     assert(Cst2 && "Should have gotten a constant for src 2?");
2491     auto LSB = Cst1->Value.getZExtValue();
2492     auto Width = Cst2->Value.getZExtValue();
2493     auto BitfieldInst =
2494         MIB.buildInstr(Opc, {I.getOperand(0)}, {I.getOperand(1)})
2495             .addImm(LSB)
2496             .addImm(LSB + Width - 1);
2497     I.eraseFromParent();
2498     return constrainSelectedInstRegOperands(*BitfieldInst, TII, TRI, RBI);
2499   }
2500   case TargetOpcode::G_BRCOND:
2501     return selectCompareBranch(I, MF, MRI);
2502 
2503   case TargetOpcode::G_BRINDIRECT: {
2504     I.setDesc(TII.get(AArch64::BR));
2505     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2506   }
2507 
2508   case TargetOpcode::G_BRJT:
2509     return selectBrJT(I, MRI);
2510 
2511   case AArch64::G_ADD_LOW: {
2512     // This op may have been separated from its ADRP companion by the localizer
2513     // or some other code motion pass. Given that many CPUs will try to
2514     // macro fuse these operations anyway, select this into a MOVaddr pseudo
2515     // which will later be expanded into an ADRP+ADD pair after scheduling.
2516     MachineInstr *BaseMI = MRI.getVRegDef(I.getOperand(1).getReg());
2517     if (BaseMI->getOpcode() != AArch64::ADRP) {
2518       I.setDesc(TII.get(AArch64::ADDXri));
2519       I.addOperand(MachineOperand::CreateImm(0));
2520       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2521     }
2522     assert(TM.getCodeModel() == CodeModel::Small &&
2523            "Expected small code model");
2524     auto Op1 = BaseMI->getOperand(1);
2525     auto Op2 = I.getOperand(2);
2526     auto MovAddr = MIB.buildInstr(AArch64::MOVaddr, {I.getOperand(0)}, {})
2527                        .addGlobalAddress(Op1.getGlobal(), Op1.getOffset(),
2528                                          Op1.getTargetFlags())
2529                        .addGlobalAddress(Op2.getGlobal(), Op2.getOffset(),
2530                                          Op2.getTargetFlags());
2531     I.eraseFromParent();
2532     return constrainSelectedInstRegOperands(*MovAddr, TII, TRI, RBI);
2533   }
2534 
2535   case TargetOpcode::G_BSWAP: {
2536     // Handle vector types for G_BSWAP directly.
2537     Register DstReg = I.getOperand(0).getReg();
2538     LLT DstTy = MRI.getType(DstReg);
2539 
2540     // We should only get vector types here; everything else is handled by the
2541     // importer right now.
2542     if (!DstTy.isVector() || DstTy.getSizeInBits() > 128) {
2543       LLVM_DEBUG(dbgs() << "Dst type for G_BSWAP currently unsupported.\n");
2544       return false;
2545     }
2546 
2547     // Only handle 4 and 2 element vectors for now.
2548     // TODO: 16-bit elements.
2549     unsigned NumElts = DstTy.getNumElements();
2550     if (NumElts != 4 && NumElts != 2) {
2551       LLVM_DEBUG(dbgs() << "Unsupported number of elements for G_BSWAP.\n");
2552       return false;
2553     }
2554 
2555     // Choose the correct opcode for the supported types. Right now, that's
2556     // v2s32, v4s32, and v2s64.
2557     unsigned Opc = 0;
2558     unsigned EltSize = DstTy.getElementType().getSizeInBits();
2559     if (EltSize == 32)
2560       Opc = (DstTy.getNumElements() == 2) ? AArch64::REV32v8i8
2561                                           : AArch64::REV32v16i8;
2562     else if (EltSize == 64)
2563       Opc = AArch64::REV64v16i8;
2564 
2565     // We should always get something by the time we get here...
2566     assert(Opc != 0 && "Didn't get an opcode for G_BSWAP?");
2567 
2568     I.setDesc(TII.get(Opc));
2569     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2570   }
2571 
2572   case TargetOpcode::G_FCONSTANT:
2573   case TargetOpcode::G_CONSTANT: {
2574     const bool isFP = Opcode == TargetOpcode::G_FCONSTANT;
2575 
2576     const LLT s8 = LLT::scalar(8);
2577     const LLT s16 = LLT::scalar(16);
2578     const LLT s32 = LLT::scalar(32);
2579     const LLT s64 = LLT::scalar(64);
2580     const LLT s128 = LLT::scalar(128);
2581     const LLT p0 = LLT::pointer(0, 64);
2582 
2583     const Register DefReg = I.getOperand(0).getReg();
2584     const LLT DefTy = MRI.getType(DefReg);
2585     const unsigned DefSize = DefTy.getSizeInBits();
2586     const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
2587 
2588     // FIXME: Redundant check, but even less readable when factored out.
2589     if (isFP) {
2590       if (Ty != s16 && Ty != s32 && Ty != s64 && Ty != s128) {
2591         LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty
2592                           << " constant, expected: " << s16 << " or " << s32
2593                           << " or " << s64 << " or " << s128 << '\n');
2594         return false;
2595       }
2596 
2597       if (RB.getID() != AArch64::FPRRegBankID) {
2598         LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty
2599                           << " constant on bank: " << RB
2600                           << ", expected: FPR\n");
2601         return false;
2602       }
2603 
2604       // The case when we have 0.0 is covered by tablegen. Reject it here so we
2605       // can be sure tablegen works correctly and isn't rescued by this code.
2606       // 0.0 is not covered by tablegen for FP128. So we will handle this
2607       // scenario in the code here.
2608       if (DefSize != 128 && I.getOperand(1).getFPImm()->isExactlyValue(0.0))
2609         return false;
2610     } else {
2611       // s32 and s64 are covered by tablegen.
2612       if (Ty != p0 && Ty != s8 && Ty != s16) {
2613         LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty
2614                           << " constant, expected: " << s32 << ", " << s64
2615                           << ", or " << p0 << '\n');
2616         return false;
2617       }
2618 
2619       if (RB.getID() != AArch64::GPRRegBankID) {
2620         LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty
2621                           << " constant on bank: " << RB
2622                           << ", expected: GPR\n");
2623         return false;
2624       }
2625     }
2626 
2627     if (isFP) {
2628       const TargetRegisterClass &FPRRC = *getRegClassForTypeOnBank(DefTy, RB);
2629       // For 16, 64, and 128b values, emit a constant pool load.
2630       switch (DefSize) {
2631       default:
2632         llvm_unreachable("Unexpected destination size for G_FCONSTANT?");
2633       case 32:
2634         // For s32, use a cp load if we have optsize/minsize.
2635         if (!shouldOptForSize(&MF))
2636           break;
2637         [[fallthrough]];
2638       case 16:
2639       case 64:
2640       case 128: {
2641         auto *FPImm = I.getOperand(1).getFPImm();
2642         auto *LoadMI = emitLoadFromConstantPool(FPImm, MIB);
2643         if (!LoadMI) {
2644           LLVM_DEBUG(dbgs() << "Failed to load double constant pool entry\n");
2645           return false;
2646         }
2647         MIB.buildCopy({DefReg}, {LoadMI->getOperand(0).getReg()});
2648         I.eraseFromParent();
2649         return RBI.constrainGenericRegister(DefReg, FPRRC, MRI);
2650       }
2651       }
2652 
2653       // Either emit a FMOV, or emit a copy to emit a normal mov.
2654       assert(DefSize == 32 &&
2655              "Expected constant pool loads for all sizes other than 32!");
2656       const Register DefGPRReg =
2657           MRI.createVirtualRegister(&AArch64::GPR32RegClass);
2658       MachineOperand &RegOp = I.getOperand(0);
2659       RegOp.setReg(DefGPRReg);
2660       MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator()));
2661       MIB.buildCopy({DefReg}, {DefGPRReg});
2662 
2663       if (!RBI.constrainGenericRegister(DefReg, FPRRC, MRI)) {
2664         LLVM_DEBUG(dbgs() << "Failed to constrain G_FCONSTANT def operand\n");
2665         return false;
2666       }
2667 
2668       MachineOperand &ImmOp = I.getOperand(1);
2669       // FIXME: Is going through int64_t always correct?
2670       ImmOp.ChangeToImmediate(
2671           ImmOp.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
2672     } else if (I.getOperand(1).isCImm()) {
2673       uint64_t Val = I.getOperand(1).getCImm()->getZExtValue();
2674       I.getOperand(1).ChangeToImmediate(Val);
2675     } else if (I.getOperand(1).isImm()) {
2676       uint64_t Val = I.getOperand(1).getImm();
2677       I.getOperand(1).ChangeToImmediate(Val);
2678     }
2679 
2680     const unsigned MovOpc =
2681         DefSize == 64 ? AArch64::MOVi64imm : AArch64::MOVi32imm;
2682     I.setDesc(TII.get(MovOpc));
2683     constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2684     return true;
2685   }
2686   case TargetOpcode::G_EXTRACT: {
2687     Register DstReg = I.getOperand(0).getReg();
2688     Register SrcReg = I.getOperand(1).getReg();
2689     LLT SrcTy = MRI.getType(SrcReg);
2690     LLT DstTy = MRI.getType(DstReg);
2691     (void)DstTy;
2692     unsigned SrcSize = SrcTy.getSizeInBits();
2693 
2694     if (SrcTy.getSizeInBits() > 64) {
2695       // This should be an extract of an s128, which is like a vector extract.
2696       if (SrcTy.getSizeInBits() != 128)
2697         return false;
2698       // Only support extracting 64 bits from an s128 at the moment.
2699       if (DstTy.getSizeInBits() != 64)
2700         return false;
2701 
2702       unsigned Offset = I.getOperand(2).getImm();
2703       if (Offset % 64 != 0)
2704         return false;
2705 
2706       // Check we have the right regbank always.
2707       const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
2708       const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
2709       assert(SrcRB.getID() == DstRB.getID() && "Wrong extract regbank!");
2710 
2711       if (SrcRB.getID() == AArch64::GPRRegBankID) {
2712         auto NewI =
2713             MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
2714                 .addUse(SrcReg, 0,
2715                         Offset == 0 ? AArch64::sube64 : AArch64::subo64);
2716         constrainOperandRegClass(MF, TRI, MRI, TII, RBI, *NewI,
2717                                  AArch64::GPR64RegClass, NewI->getOperand(0));
2718         I.eraseFromParent();
2719         return true;
2720       }
2721 
2722       // Emit the same code as a vector extract.
2723       // Offset must be a multiple of 64.
2724       unsigned LaneIdx = Offset / 64;
2725       MachineInstr *Extract = emitExtractVectorElt(
2726           DstReg, DstRB, LLT::scalar(64), SrcReg, LaneIdx, MIB);
2727       if (!Extract)
2728         return false;
2729       I.eraseFromParent();
2730       return true;
2731     }
2732 
2733     I.setDesc(TII.get(SrcSize == 64 ? AArch64::UBFMXri : AArch64::UBFMWri));
2734     MachineInstrBuilder(MF, I).addImm(I.getOperand(2).getImm() +
2735                                       Ty.getSizeInBits() - 1);
2736 
2737     if (SrcSize < 64) {
2738       assert(SrcSize == 32 && DstTy.getSizeInBits() == 16 &&
2739              "unexpected G_EXTRACT types");
2740       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2741     }
2742 
2743     DstReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
2744     MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator()));
2745     MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {})
2746         .addReg(DstReg, 0, AArch64::sub_32);
2747     RBI.constrainGenericRegister(I.getOperand(0).getReg(),
2748                                  AArch64::GPR32RegClass, MRI);
2749     I.getOperand(0).setReg(DstReg);
2750 
2751     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2752   }
2753 
2754   case TargetOpcode::G_INSERT: {
2755     LLT SrcTy = MRI.getType(I.getOperand(2).getReg());
2756     LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2757     unsigned DstSize = DstTy.getSizeInBits();
2758     // Larger inserts are vectors, same-size ones should be something else by
2759     // now (split up or turned into COPYs).
2760     if (Ty.getSizeInBits() > 64 || SrcTy.getSizeInBits() > 32)
2761       return false;
2762 
2763     I.setDesc(TII.get(DstSize == 64 ? AArch64::BFMXri : AArch64::BFMWri));
2764     unsigned LSB = I.getOperand(3).getImm();
2765     unsigned Width = MRI.getType(I.getOperand(2).getReg()).getSizeInBits();
2766     I.getOperand(3).setImm((DstSize - LSB) % DstSize);
2767     MachineInstrBuilder(MF, I).addImm(Width - 1);
2768 
2769     if (DstSize < 64) {
2770       assert(DstSize == 32 && SrcTy.getSizeInBits() == 16 &&
2771              "unexpected G_INSERT types");
2772       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2773     }
2774 
2775     Register SrcReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
2776     BuildMI(MBB, I.getIterator(), I.getDebugLoc(),
2777             TII.get(AArch64::SUBREG_TO_REG))
2778         .addDef(SrcReg)
2779         .addImm(0)
2780         .addUse(I.getOperand(2).getReg())
2781         .addImm(AArch64::sub_32);
2782     RBI.constrainGenericRegister(I.getOperand(2).getReg(),
2783                                  AArch64::GPR32RegClass, MRI);
2784     I.getOperand(2).setReg(SrcReg);
2785 
2786     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2787   }
2788   case TargetOpcode::G_FRAME_INDEX: {
2789     // allocas and G_FRAME_INDEX are only supported in addrspace(0).
2790     if (Ty != LLT::pointer(0, 64)) {
2791       LLVM_DEBUG(dbgs() << "G_FRAME_INDEX pointer has type: " << Ty
2792                         << ", expected: " << LLT::pointer(0, 64) << '\n');
2793       return false;
2794     }
2795     I.setDesc(TII.get(AArch64::ADDXri));
2796 
2797     // MOs for a #0 shifted immediate.
2798     I.addOperand(MachineOperand::CreateImm(0));
2799     I.addOperand(MachineOperand::CreateImm(0));
2800 
2801     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2802   }
2803 
2804   case TargetOpcode::G_GLOBAL_VALUE: {
2805     auto GV = I.getOperand(1).getGlobal();
2806     if (GV->isThreadLocal())
2807       return selectTLSGlobalValue(I, MRI);
2808 
2809     unsigned OpFlags = STI.ClassifyGlobalReference(GV, TM);
2810     if (OpFlags & AArch64II::MO_GOT) {
2811       I.setDesc(TII.get(AArch64::LOADgot));
2812       I.getOperand(1).setTargetFlags(OpFlags);
2813     } else if (TM.getCodeModel() == CodeModel::Large) {
2814       // Materialize the global using movz/movk instructions.
2815       materializeLargeCMVal(I, GV, OpFlags);
2816       I.eraseFromParent();
2817       return true;
2818     } else if (TM.getCodeModel() == CodeModel::Tiny) {
2819       I.setDesc(TII.get(AArch64::ADR));
2820       I.getOperand(1).setTargetFlags(OpFlags);
2821     } else {
2822       I.setDesc(TII.get(AArch64::MOVaddr));
2823       I.getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_PAGE);
2824       MachineInstrBuilder MIB(MF, I);
2825       MIB.addGlobalAddress(GV, I.getOperand(1).getOffset(),
2826                            OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
2827     }
2828     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2829   }
2830 
2831   case TargetOpcode::G_ZEXTLOAD:
2832   case TargetOpcode::G_LOAD:
2833   case TargetOpcode::G_STORE: {
2834     GLoadStore &LdSt = cast<GLoadStore>(I);
2835     bool IsZExtLoad = I.getOpcode() == TargetOpcode::G_ZEXTLOAD;
2836     LLT PtrTy = MRI.getType(LdSt.getPointerReg());
2837 
2838     if (PtrTy != LLT::pointer(0, 64)) {
2839       LLVM_DEBUG(dbgs() << "Load/Store pointer has type: " << PtrTy
2840                         << ", expected: " << LLT::pointer(0, 64) << '\n');
2841       return false;
2842     }
2843 
2844     uint64_t MemSizeInBytes = LdSt.getMemSize();
2845     unsigned MemSizeInBits = LdSt.getMemSizeInBits();
2846     AtomicOrdering Order = LdSt.getMMO().getSuccessOrdering();
2847 
2848     // Need special instructions for atomics that affect ordering.
2849     if (Order != AtomicOrdering::NotAtomic &&
2850         Order != AtomicOrdering::Unordered &&
2851         Order != AtomicOrdering::Monotonic) {
2852       assert(!isa<GZExtLoad>(LdSt));
2853       if (MemSizeInBytes > 64)
2854         return false;
2855 
2856       if (isa<GLoad>(LdSt)) {
2857         static constexpr unsigned LDAPROpcodes[] = {
2858             AArch64::LDAPRB, AArch64::LDAPRH, AArch64::LDAPRW, AArch64::LDAPRX};
2859         static constexpr unsigned LDAROpcodes[] = {
2860             AArch64::LDARB, AArch64::LDARH, AArch64::LDARW, AArch64::LDARX};
2861         ArrayRef<unsigned> Opcodes =
2862             STI.hasRCPC() && Order != AtomicOrdering::SequentiallyConsistent
2863                 ? LDAPROpcodes
2864                 : LDAROpcodes;
2865         I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)]));
2866       } else {
2867         static constexpr unsigned Opcodes[] = {AArch64::STLRB, AArch64::STLRH,
2868                                                AArch64::STLRW, AArch64::STLRX};
2869         Register ValReg = LdSt.getReg(0);
2870         if (MRI.getType(ValReg).getSizeInBits() == 64 && MemSizeInBits != 64) {
2871           // Emit a subreg copy of 32 bits.
2872           Register NewVal = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
2873           MIB.buildInstr(TargetOpcode::COPY, {NewVal}, {})
2874               .addReg(I.getOperand(0).getReg(), 0, AArch64::sub_32);
2875           I.getOperand(0).setReg(NewVal);
2876         }
2877         I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)]));
2878       }
2879       constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2880       return true;
2881     }
2882 
2883 #ifndef NDEBUG
2884     const Register PtrReg = LdSt.getPointerReg();
2885     const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI);
2886     // Check that the pointer register is valid.
2887     assert(PtrRB.getID() == AArch64::GPRRegBankID &&
2888            "Load/Store pointer operand isn't a GPR");
2889     assert(MRI.getType(PtrReg).isPointer() &&
2890            "Load/Store pointer operand isn't a pointer");
2891 #endif
2892 
2893     const Register ValReg = LdSt.getReg(0);
2894     const LLT ValTy = MRI.getType(ValReg);
2895     const RegisterBank &RB = *RBI.getRegBank(ValReg, MRI, TRI);
2896 
2897     // The code below doesn't support truncating stores, so we need to split it
2898     // again.
2899     if (isa<GStore>(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) {
2900       unsigned SubReg;
2901       LLT MemTy = LdSt.getMMO().getMemoryType();
2902       auto *RC = getRegClassForTypeOnBank(MemTy, RB);
2903       if (!getSubRegForClass(RC, TRI, SubReg))
2904         return false;
2905 
2906       // Generate a subreg copy.
2907       auto Copy = MIB.buildInstr(TargetOpcode::COPY, {MemTy}, {})
2908                       .addReg(ValReg, 0, SubReg)
2909                       .getReg(0);
2910       RBI.constrainGenericRegister(Copy, *RC, MRI);
2911       LdSt.getOperand(0).setReg(Copy);
2912     } else if (isa<GLoad>(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) {
2913       // If this is an any-extending load from the FPR bank, split it into a regular
2914       // load + extend.
2915       if (RB.getID() == AArch64::FPRRegBankID) {
2916         unsigned SubReg;
2917         LLT MemTy = LdSt.getMMO().getMemoryType();
2918         auto *RC = getRegClassForTypeOnBank(MemTy, RB);
2919         if (!getSubRegForClass(RC, TRI, SubReg))
2920           return false;
2921         Register OldDst = LdSt.getReg(0);
2922         Register NewDst =
2923             MRI.createGenericVirtualRegister(LdSt.getMMO().getMemoryType());
2924         LdSt.getOperand(0).setReg(NewDst);
2925         MRI.setRegBank(NewDst, RB);
2926         // Generate a SUBREG_TO_REG to extend it.
2927         MIB.setInsertPt(MIB.getMBB(), std::next(LdSt.getIterator()));
2928         MIB.buildInstr(AArch64::SUBREG_TO_REG, {OldDst}, {})
2929             .addImm(0)
2930             .addUse(NewDst)
2931             .addImm(SubReg);
2932         auto SubRegRC = getRegClassForTypeOnBank(MRI.getType(OldDst), RB);
2933         RBI.constrainGenericRegister(OldDst, *SubRegRC, MRI);
2934         MIB.setInstr(LdSt);
2935       }
2936     }
2937 
2938     // Helper lambda for partially selecting I. Either returns the original
2939     // instruction with an updated opcode, or a new instruction.
2940     auto SelectLoadStoreAddressingMode = [&]() -> MachineInstr * {
2941       bool IsStore = isa<GStore>(I);
2942       const unsigned NewOpc =
2943           selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemSizeInBits);
2944       if (NewOpc == I.getOpcode())
2945         return nullptr;
2946       // Check if we can fold anything into the addressing mode.
2947       auto AddrModeFns =
2948           selectAddrModeIndexed(I.getOperand(1), MemSizeInBytes);
2949       if (!AddrModeFns) {
2950         // Can't fold anything. Use the original instruction.
2951         I.setDesc(TII.get(NewOpc));
2952         I.addOperand(MachineOperand::CreateImm(0));
2953         return &I;
2954       }
2955 
2956       // Folded something. Create a new instruction and return it.
2957       auto NewInst = MIB.buildInstr(NewOpc, {}, {}, I.getFlags());
2958       Register CurValReg = I.getOperand(0).getReg();
2959       IsStore ? NewInst.addUse(CurValReg) : NewInst.addDef(CurValReg);
2960       NewInst.cloneMemRefs(I);
2961       for (auto &Fn : *AddrModeFns)
2962         Fn(NewInst);
2963       I.eraseFromParent();
2964       return &*NewInst;
2965     };
2966 
2967     MachineInstr *LoadStore = SelectLoadStoreAddressingMode();
2968     if (!LoadStore)
2969       return false;
2970 
2971     // If we're storing a 0, use WZR/XZR.
2972     if (Opcode == TargetOpcode::G_STORE) {
2973       auto CVal = getIConstantVRegValWithLookThrough(
2974           LoadStore->getOperand(0).getReg(), MRI);
2975       if (CVal && CVal->Value == 0) {
2976         switch (LoadStore->getOpcode()) {
2977         case AArch64::STRWui:
2978         case AArch64::STRHHui:
2979         case AArch64::STRBBui:
2980           LoadStore->getOperand(0).setReg(AArch64::WZR);
2981           break;
2982         case AArch64::STRXui:
2983           LoadStore->getOperand(0).setReg(AArch64::XZR);
2984           break;
2985         }
2986       }
2987     }
2988 
2989     if (IsZExtLoad) {
2990       // The zextload from a smaller type to i32 should be handled by the
2991       // importer.
2992       if (MRI.getType(LoadStore->getOperand(0).getReg()).getSizeInBits() != 64)
2993         return false;
2994       // If we have a ZEXTLOAD then change the load's type to be a narrower reg
2995       // and zero_extend with SUBREG_TO_REG.
2996       Register LdReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
2997       Register DstReg = LoadStore->getOperand(0).getReg();
2998       LoadStore->getOperand(0).setReg(LdReg);
2999 
3000       MIB.setInsertPt(MIB.getMBB(), std::next(LoadStore->getIterator()));
3001       MIB.buildInstr(AArch64::SUBREG_TO_REG, {DstReg}, {})
3002           .addImm(0)
3003           .addUse(LdReg)
3004           .addImm(AArch64::sub_32);
3005       constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI);
3006       return RBI.constrainGenericRegister(DstReg, AArch64::GPR64allRegClass,
3007                                           MRI);
3008     }
3009     return constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI);
3010   }
3011 
3012   case TargetOpcode::G_SMULH:
3013   case TargetOpcode::G_UMULH: {
3014     // Reject the various things we don't support yet.
3015     if (unsupportedBinOp(I, RBI, MRI, TRI))
3016       return false;
3017 
3018     const Register DefReg = I.getOperand(0).getReg();
3019     const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
3020 
3021     if (RB.getID() != AArch64::GPRRegBankID) {
3022       LLVM_DEBUG(dbgs() << "G_[SU]MULH on bank: " << RB << ", expected: GPR\n");
3023       return false;
3024     }
3025 
3026     if (Ty != LLT::scalar(64)) {
3027       LLVM_DEBUG(dbgs() << "G_[SU]MULH has type: " << Ty
3028                         << ", expected: " << LLT::scalar(64) << '\n');
3029       return false;
3030     }
3031 
3032     unsigned NewOpc = I.getOpcode() == TargetOpcode::G_SMULH ? AArch64::SMULHrr
3033                                                              : AArch64::UMULHrr;
3034     I.setDesc(TII.get(NewOpc));
3035 
3036     // Now that we selected an opcode, we need to constrain the register
3037     // operands to use appropriate classes.
3038     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3039   }
3040   case TargetOpcode::G_LSHR:
3041   case TargetOpcode::G_ASHR:
3042     if (MRI.getType(I.getOperand(0).getReg()).isVector())
3043       return selectVectorAshrLshr(I, MRI);
3044     [[fallthrough]];
3045   case TargetOpcode::G_SHL:
3046     if (Opcode == TargetOpcode::G_SHL &&
3047         MRI.getType(I.getOperand(0).getReg()).isVector())
3048       return selectVectorSHL(I, MRI);
3049 
3050     // These shifts were legalized to have 64 bit shift amounts because we
3051     // want to take advantage of the selection patterns that assume the
3052     // immediates are s64s, however, selectBinaryOp will assume both operands
3053     // will have the same bit size.
3054     {
3055       Register SrcReg = I.getOperand(1).getReg();
3056       Register ShiftReg = I.getOperand(2).getReg();
3057       const LLT ShiftTy = MRI.getType(ShiftReg);
3058       const LLT SrcTy = MRI.getType(SrcReg);
3059       if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 32 &&
3060           ShiftTy.getSizeInBits() == 64) {
3061         assert(!ShiftTy.isVector() && "unexpected vector shift ty");
3062         // Insert a subregister copy to implement a 64->32 trunc
3063         auto Trunc = MIB.buildInstr(TargetOpcode::COPY, {SrcTy}, {})
3064                          .addReg(ShiftReg, 0, AArch64::sub_32);
3065         MRI.setRegBank(Trunc.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID));
3066         I.getOperand(2).setReg(Trunc.getReg(0));
3067       }
3068     }
3069     [[fallthrough]];
3070   case TargetOpcode::G_OR: {
3071     // Reject the various things we don't support yet.
3072     if (unsupportedBinOp(I, RBI, MRI, TRI))
3073       return false;
3074 
3075     const unsigned OpSize = Ty.getSizeInBits();
3076 
3077     const Register DefReg = I.getOperand(0).getReg();
3078     const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
3079 
3080     const unsigned NewOpc = selectBinaryOp(I.getOpcode(), RB.getID(), OpSize);
3081     if (NewOpc == I.getOpcode())
3082       return false;
3083 
3084     I.setDesc(TII.get(NewOpc));
3085     // FIXME: Should the type be always reset in setDesc?
3086 
3087     // Now that we selected an opcode, we need to constrain the register
3088     // operands to use appropriate classes.
3089     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3090   }
3091 
3092   case TargetOpcode::G_PTR_ADD: {
3093     emitADD(I.getOperand(0).getReg(), I.getOperand(1), I.getOperand(2), MIB);
3094     I.eraseFromParent();
3095     return true;
3096   }
3097 
3098   case TargetOpcode::G_SADDE:
3099   case TargetOpcode::G_UADDE:
3100   case TargetOpcode::G_SSUBE:
3101   case TargetOpcode::G_USUBE:
3102   case TargetOpcode::G_SADDO:
3103   case TargetOpcode::G_UADDO:
3104   case TargetOpcode::G_SSUBO:
3105   case TargetOpcode::G_USUBO:
3106     return selectOverflowOp(I, MRI);
3107 
3108   case TargetOpcode::G_PTRMASK: {
3109     Register MaskReg = I.getOperand(2).getReg();
3110     std::optional<int64_t> MaskVal = getIConstantVRegSExtVal(MaskReg, MRI);
3111     // TODO: Implement arbitrary cases
3112     if (!MaskVal || !isShiftedMask_64(*MaskVal))
3113       return false;
3114 
3115     uint64_t Mask = *MaskVal;
3116     I.setDesc(TII.get(AArch64::ANDXri));
3117     I.getOperand(2).ChangeToImmediate(
3118         AArch64_AM::encodeLogicalImmediate(Mask, 64));
3119 
3120     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3121   }
3122   case TargetOpcode::G_PTRTOINT:
3123   case TargetOpcode::G_TRUNC: {
3124     const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
3125     const LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
3126 
3127     const Register DstReg = I.getOperand(0).getReg();
3128     const Register SrcReg = I.getOperand(1).getReg();
3129 
3130     const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
3131     const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
3132 
3133     if (DstRB.getID() != SrcRB.getID()) {
3134       LLVM_DEBUG(
3135           dbgs() << "G_TRUNC/G_PTRTOINT input/output on different banks\n");
3136       return false;
3137     }
3138 
3139     if (DstRB.getID() == AArch64::GPRRegBankID) {
3140       const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(DstTy, DstRB);
3141       if (!DstRC)
3142         return false;
3143 
3144       const TargetRegisterClass *SrcRC = getRegClassForTypeOnBank(SrcTy, SrcRB);
3145       if (!SrcRC)
3146         return false;
3147 
3148       if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
3149           !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
3150         LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC/G_PTRTOINT\n");
3151         return false;
3152       }
3153 
3154       if (DstRC == SrcRC) {
3155         // Nothing to be done
3156       } else if (Opcode == TargetOpcode::G_TRUNC && DstTy == LLT::scalar(32) &&
3157                  SrcTy == LLT::scalar(64)) {
3158         llvm_unreachable("TableGen can import this case");
3159         return false;
3160       } else if (DstRC == &AArch64::GPR32RegClass &&
3161                  SrcRC == &AArch64::GPR64RegClass) {
3162         I.getOperand(1).setSubReg(AArch64::sub_32);
3163       } else {
3164         LLVM_DEBUG(
3165             dbgs() << "Unhandled mismatched classes in G_TRUNC/G_PTRTOINT\n");
3166         return false;
3167       }
3168 
3169       I.setDesc(TII.get(TargetOpcode::COPY));
3170       return true;
3171     } else if (DstRB.getID() == AArch64::FPRRegBankID) {
3172       if (DstTy == LLT::fixed_vector(4, 16) &&
3173           SrcTy == LLT::fixed_vector(4, 32)) {
3174         I.setDesc(TII.get(AArch64::XTNv4i16));
3175         constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3176         return true;
3177       }
3178 
3179       if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128) {
3180         MachineInstr *Extract = emitExtractVectorElt(
3181             DstReg, DstRB, LLT::scalar(DstTy.getSizeInBits()), SrcReg, 0, MIB);
3182         if (!Extract)
3183           return false;
3184         I.eraseFromParent();
3185         return true;
3186       }
3187 
3188       // We might have a vector G_PTRTOINT, in which case just emit a COPY.
3189       if (Opcode == TargetOpcode::G_PTRTOINT) {
3190         assert(DstTy.isVector() && "Expected an FPR ptrtoint to be a vector");
3191         I.setDesc(TII.get(TargetOpcode::COPY));
3192         return selectCopy(I, TII, MRI, TRI, RBI);
3193       }
3194     }
3195 
3196     return false;
3197   }
3198 
3199   case TargetOpcode::G_ANYEXT: {
3200     if (selectUSMovFromExtend(I, MRI))
3201       return true;
3202 
3203     const Register DstReg = I.getOperand(0).getReg();
3204     const Register SrcReg = I.getOperand(1).getReg();
3205 
3206     const RegisterBank &RBDst = *RBI.getRegBank(DstReg, MRI, TRI);
3207     if (RBDst.getID() != AArch64::GPRRegBankID) {
3208       LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBDst
3209                         << ", expected: GPR\n");
3210       return false;
3211     }
3212 
3213     const RegisterBank &RBSrc = *RBI.getRegBank(SrcReg, MRI, TRI);
3214     if (RBSrc.getID() != AArch64::GPRRegBankID) {
3215       LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBSrc
3216                         << ", expected: GPR\n");
3217       return false;
3218     }
3219 
3220     const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
3221 
3222     if (DstSize == 0) {
3223       LLVM_DEBUG(dbgs() << "G_ANYEXT operand has no size, not a gvreg?\n");
3224       return false;
3225     }
3226 
3227     if (DstSize != 64 && DstSize > 32) {
3228       LLVM_DEBUG(dbgs() << "G_ANYEXT to size: " << DstSize
3229                         << ", expected: 32 or 64\n");
3230       return false;
3231     }
3232     // At this point G_ANYEXT is just like a plain COPY, but we need
3233     // to explicitly form the 64-bit value if any.
3234     if (DstSize > 32) {
3235       Register ExtSrc = MRI.createVirtualRegister(&AArch64::GPR64allRegClass);
3236       BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG))
3237           .addDef(ExtSrc)
3238           .addImm(0)
3239           .addUse(SrcReg)
3240           .addImm(AArch64::sub_32);
3241       I.getOperand(1).setReg(ExtSrc);
3242     }
3243     return selectCopy(I, TII, MRI, TRI, RBI);
3244   }
3245 
3246   case TargetOpcode::G_ZEXT:
3247   case TargetOpcode::G_SEXT_INREG:
3248   case TargetOpcode::G_SEXT: {
3249     if (selectUSMovFromExtend(I, MRI))
3250       return true;
3251 
3252     unsigned Opcode = I.getOpcode();
3253     const bool IsSigned = Opcode != TargetOpcode::G_ZEXT;
3254     const Register DefReg = I.getOperand(0).getReg();
3255     Register SrcReg = I.getOperand(1).getReg();
3256     const LLT DstTy = MRI.getType(DefReg);
3257     const LLT SrcTy = MRI.getType(SrcReg);
3258     unsigned DstSize = DstTy.getSizeInBits();
3259     unsigned SrcSize = SrcTy.getSizeInBits();
3260 
3261     // SEXT_INREG has the same src reg size as dst, the size of the value to be
3262     // extended is encoded in the imm.
3263     if (Opcode == TargetOpcode::G_SEXT_INREG)
3264       SrcSize = I.getOperand(2).getImm();
3265 
3266     if (DstTy.isVector())
3267       return false; // Should be handled by imported patterns.
3268 
3269     assert((*RBI.getRegBank(DefReg, MRI, TRI)).getID() ==
3270                AArch64::GPRRegBankID &&
3271            "Unexpected ext regbank");
3272 
3273     MachineInstr *ExtI;
3274 
3275     // First check if we're extending the result of a load which has a dest type
3276     // smaller than 32 bits, then this zext is redundant. GPR32 is the smallest
3277     // GPR register on AArch64 and all loads which are smaller automatically
3278     // zero-extend the upper bits. E.g.
3279     // %v(s8) = G_LOAD %p, :: (load 1)
3280     // %v2(s32) = G_ZEXT %v(s8)
3281     if (!IsSigned) {
3282       auto *LoadMI = getOpcodeDef(TargetOpcode::G_LOAD, SrcReg, MRI);
3283       bool IsGPR =
3284           RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::GPRRegBankID;
3285       if (LoadMI && IsGPR) {
3286         const MachineMemOperand *MemOp = *LoadMI->memoperands_begin();
3287         unsigned BytesLoaded = MemOp->getSize();
3288         if (BytesLoaded < 4 && SrcTy.getSizeInBytes() == BytesLoaded)
3289           return selectCopy(I, TII, MRI, TRI, RBI);
3290       }
3291 
3292       // For the 32-bit -> 64-bit case, we can emit a mov (ORRWrs)
3293       // + SUBREG_TO_REG.
3294       if (IsGPR && SrcSize == 32 && DstSize == 64) {
3295         Register SubregToRegSrc =
3296             MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3297         const Register ZReg = AArch64::WZR;
3298         MIB.buildInstr(AArch64::ORRWrs, {SubregToRegSrc}, {ZReg, SrcReg})
3299             .addImm(0);
3300 
3301         MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {})
3302             .addImm(0)
3303             .addUse(SubregToRegSrc)
3304             .addImm(AArch64::sub_32);
3305 
3306         if (!RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass,
3307                                           MRI)) {
3308           LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT destination\n");
3309           return false;
3310         }
3311 
3312         if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass,
3313                                           MRI)) {
3314           LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT source\n");
3315           return false;
3316         }
3317 
3318         I.eraseFromParent();
3319         return true;
3320       }
3321     }
3322 
3323     if (DstSize == 64) {
3324       if (Opcode != TargetOpcode::G_SEXT_INREG) {
3325         // FIXME: Can we avoid manually doing this?
3326         if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass,
3327                                           MRI)) {
3328           LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode)
3329                             << " operand\n");
3330           return false;
3331         }
3332         SrcReg = MIB.buildInstr(AArch64::SUBREG_TO_REG,
3333                                 {&AArch64::GPR64RegClass}, {})
3334                      .addImm(0)
3335                      .addUse(SrcReg)
3336                      .addImm(AArch64::sub_32)
3337                      .getReg(0);
3338       }
3339 
3340       ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMXri : AArch64::UBFMXri,
3341                              {DefReg}, {SrcReg})
3342                   .addImm(0)
3343                   .addImm(SrcSize - 1);
3344     } else if (DstSize <= 32) {
3345       ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMWri : AArch64::UBFMWri,
3346                              {DefReg}, {SrcReg})
3347                   .addImm(0)
3348                   .addImm(SrcSize - 1);
3349     } else {
3350       return false;
3351     }
3352 
3353     constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
3354     I.eraseFromParent();
3355     return true;
3356   }
3357 
3358   case TargetOpcode::G_SITOFP:
3359   case TargetOpcode::G_UITOFP:
3360   case TargetOpcode::G_FPTOSI:
3361   case TargetOpcode::G_FPTOUI: {
3362     const LLT DstTy = MRI.getType(I.getOperand(0).getReg()),
3363               SrcTy = MRI.getType(I.getOperand(1).getReg());
3364     const unsigned NewOpc = selectFPConvOpc(Opcode, DstTy, SrcTy);
3365     if (NewOpc == Opcode)
3366       return false;
3367 
3368     I.setDesc(TII.get(NewOpc));
3369     constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3370     I.setFlags(MachineInstr::NoFPExcept);
3371 
3372     return true;
3373   }
3374 
3375   case TargetOpcode::G_FREEZE:
3376     return selectCopy(I, TII, MRI, TRI, RBI);
3377 
3378   case TargetOpcode::G_INTTOPTR:
3379     // The importer is currently unable to import pointer types since they
3380     // didn't exist in SelectionDAG.
3381     return selectCopy(I, TII, MRI, TRI, RBI);
3382 
3383   case TargetOpcode::G_BITCAST:
3384     // Imported SelectionDAG rules can handle every bitcast except those that
3385     // bitcast from a type to the same type. Ideally, these shouldn't occur
3386     // but we might not run an optimizer that deletes them. The other exception
3387     // is bitcasts involving pointer types, as SelectionDAG has no knowledge
3388     // of them.
3389     return selectCopy(I, TII, MRI, TRI, RBI);
3390 
3391   case TargetOpcode::G_SELECT: {
3392     auto &Sel = cast<GSelect>(I);
3393     const Register CondReg = Sel.getCondReg();
3394     const Register TReg = Sel.getTrueReg();
3395     const Register FReg = Sel.getFalseReg();
3396 
3397     if (tryOptSelect(Sel))
3398       return true;
3399 
3400     // Make sure to use an unused vreg instead of wzr, so that the peephole
3401     // optimizations will be able to optimize these.
3402     Register DeadVReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3403     auto TstMI = MIB.buildInstr(AArch64::ANDSWri, {DeadVReg}, {CondReg})
3404                      .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
3405     constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
3406     if (!emitSelect(Sel.getReg(0), TReg, FReg, AArch64CC::NE, MIB))
3407       return false;
3408     Sel.eraseFromParent();
3409     return true;
3410   }
3411   case TargetOpcode::G_ICMP: {
3412     if (Ty.isVector())
3413       return selectVectorICmp(I, MRI);
3414 
3415     if (Ty != LLT::scalar(32)) {
3416       LLVM_DEBUG(dbgs() << "G_ICMP result has type: " << Ty
3417                         << ", expected: " << LLT::scalar(32) << '\n');
3418       return false;
3419     }
3420 
3421     auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate());
3422     const AArch64CC::CondCode InvCC =
3423         changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred));
3424     emitIntegerCompare(I.getOperand(2), I.getOperand(3), I.getOperand(1), MIB);
3425     emitCSINC(/*Dst=*/I.getOperand(0).getReg(), /*Src1=*/AArch64::WZR,
3426               /*Src2=*/AArch64::WZR, InvCC, MIB);
3427     I.eraseFromParent();
3428     return true;
3429   }
3430 
3431   case TargetOpcode::G_FCMP: {
3432     CmpInst::Predicate Pred =
3433         static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate());
3434     if (!emitFPCompare(I.getOperand(2).getReg(), I.getOperand(3).getReg(), MIB,
3435                        Pred) ||
3436         !emitCSetForFCmp(I.getOperand(0).getReg(), Pred, MIB))
3437       return false;
3438     I.eraseFromParent();
3439     return true;
3440   }
3441   case TargetOpcode::G_VASTART:
3442     return STI.isTargetDarwin() ? selectVaStartDarwin(I, MF, MRI)
3443                                 : selectVaStartAAPCS(I, MF, MRI);
3444   case TargetOpcode::G_INTRINSIC:
3445     return selectIntrinsic(I, MRI);
3446   case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3447     return selectIntrinsicWithSideEffects(I, MRI);
3448   case TargetOpcode::G_IMPLICIT_DEF: {
3449     I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
3450     const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
3451     const Register DstReg = I.getOperand(0).getReg();
3452     const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
3453     const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(DstTy, DstRB);
3454     RBI.constrainGenericRegister(DstReg, *DstRC, MRI);
3455     return true;
3456   }
3457   case TargetOpcode::G_BLOCK_ADDR: {
3458     if (TM.getCodeModel() == CodeModel::Large) {
3459       materializeLargeCMVal(I, I.getOperand(1).getBlockAddress(), 0);
3460       I.eraseFromParent();
3461       return true;
3462     } else {
3463       I.setDesc(TII.get(AArch64::MOVaddrBA));
3464       auto MovMI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::MOVaddrBA),
3465                            I.getOperand(0).getReg())
3466                        .addBlockAddress(I.getOperand(1).getBlockAddress(),
3467                                         /* Offset */ 0, AArch64II::MO_PAGE)
3468                        .addBlockAddress(
3469                            I.getOperand(1).getBlockAddress(), /* Offset */ 0,
3470                            AArch64II::MO_NC | AArch64II::MO_PAGEOFF);
3471       I.eraseFromParent();
3472       return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI);
3473     }
3474   }
3475   case AArch64::G_DUP: {
3476     // When the scalar of G_DUP is an s8/s16 gpr, they can't be selected by
3477     // imported patterns. Do it manually here. Avoiding generating s16 gpr is
3478     // difficult because at RBS we may end up pessimizing the fpr case if we
3479     // decided to add an anyextend to fix this. Manual selection is the most
3480     // robust solution for now.
3481     if (RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() !=
3482         AArch64::GPRRegBankID)
3483       return false; // We expect the fpr regbank case to be imported.
3484     LLT VecTy = MRI.getType(I.getOperand(0).getReg());
3485     if (VecTy == LLT::fixed_vector(8, 8))
3486       I.setDesc(TII.get(AArch64::DUPv8i8gpr));
3487     else if (VecTy == LLT::fixed_vector(16, 8))
3488       I.setDesc(TII.get(AArch64::DUPv16i8gpr));
3489     else if (VecTy == LLT::fixed_vector(4, 16))
3490       I.setDesc(TII.get(AArch64::DUPv4i16gpr));
3491     else if (VecTy == LLT::fixed_vector(8, 16))
3492       I.setDesc(TII.get(AArch64::DUPv8i16gpr));
3493     else
3494       return false;
3495     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3496   }
3497   case TargetOpcode::G_INTRINSIC_TRUNC:
3498     return selectIntrinsicTrunc(I, MRI);
3499   case TargetOpcode::G_INTRINSIC_ROUND:
3500     return selectIntrinsicRound(I, MRI);
3501   case TargetOpcode::G_BUILD_VECTOR:
3502     return selectBuildVector(I, MRI);
3503   case TargetOpcode::G_MERGE_VALUES:
3504     return selectMergeValues(I, MRI);
3505   case TargetOpcode::G_UNMERGE_VALUES:
3506     return selectUnmergeValues(I, MRI);
3507   case TargetOpcode::G_SHUFFLE_VECTOR:
3508     return selectShuffleVector(I, MRI);
3509   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3510     return selectExtractElt(I, MRI);
3511   case TargetOpcode::G_INSERT_VECTOR_ELT:
3512     return selectInsertElt(I, MRI);
3513   case TargetOpcode::G_CONCAT_VECTORS:
3514     return selectConcatVectors(I, MRI);
3515   case TargetOpcode::G_JUMP_TABLE:
3516     return selectJumpTable(I, MRI);
3517   case TargetOpcode::G_VECREDUCE_ADD:
3518     return selectReduction(I, MRI);
3519   case TargetOpcode::G_MEMCPY:
3520   case TargetOpcode::G_MEMCPY_INLINE:
3521   case TargetOpcode::G_MEMMOVE:
3522   case TargetOpcode::G_MEMSET:
3523     assert(STI.hasMOPS() && "Shouldn't get here without +mops feature");
3524     return selectMOPS(I, MRI);
3525   }
3526 
3527   return false;
3528 }
3529 
3530 bool AArch64InstructionSelector::selectReduction(MachineInstr &I,
3531                                                  MachineRegisterInfo &MRI) {
3532   Register VecReg = I.getOperand(1).getReg();
3533   LLT VecTy = MRI.getType(VecReg);
3534   if (I.getOpcode() == TargetOpcode::G_VECREDUCE_ADD) {
3535     // For <2 x i32> ADDPv2i32 generates an FPR64 value, so we need to emit
3536     // a subregister copy afterwards.
3537     if (VecTy == LLT::fixed_vector(2, 32)) {
3538       Register DstReg = I.getOperand(0).getReg();
3539       auto AddP = MIB.buildInstr(AArch64::ADDPv2i32, {&AArch64::FPR64RegClass},
3540                                  {VecReg, VecReg});
3541       auto Copy = MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
3542                       .addReg(AddP.getReg(0), 0, AArch64::ssub)
3543                       .getReg(0);
3544       RBI.constrainGenericRegister(Copy, AArch64::FPR32RegClass, MRI);
3545       I.eraseFromParent();
3546       return constrainSelectedInstRegOperands(*AddP, TII, TRI, RBI);
3547     }
3548 
3549     unsigned Opc = 0;
3550     if (VecTy == LLT::fixed_vector(16, 8))
3551       Opc = AArch64::ADDVv16i8v;
3552     else if (VecTy == LLT::fixed_vector(8, 16))
3553       Opc = AArch64::ADDVv8i16v;
3554     else if (VecTy == LLT::fixed_vector(4, 32))
3555       Opc = AArch64::ADDVv4i32v;
3556     else if (VecTy == LLT::fixed_vector(2, 64))
3557       Opc = AArch64::ADDPv2i64p;
3558     else {
3559       LLVM_DEBUG(dbgs() << "Unhandled type for add reduction");
3560       return false;
3561     }
3562     I.setDesc(TII.get(Opc));
3563     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3564   }
3565 
3566   return false;
3567 }
3568 
3569 bool AArch64InstructionSelector::selectMOPS(MachineInstr &GI,
3570                                             MachineRegisterInfo &MRI) {
3571   unsigned Mopcode;
3572   switch (GI.getOpcode()) {
3573   case TargetOpcode::G_MEMCPY:
3574   case TargetOpcode::G_MEMCPY_INLINE:
3575     Mopcode = AArch64::MOPSMemoryCopyPseudo;
3576     break;
3577   case TargetOpcode::G_MEMMOVE:
3578     Mopcode = AArch64::MOPSMemoryMovePseudo;
3579     break;
3580   case TargetOpcode::G_MEMSET:
3581     // For tagged memset see llvm.aarch64.mops.memset.tag
3582     Mopcode = AArch64::MOPSMemorySetPseudo;
3583     break;
3584   }
3585 
3586   auto &DstPtr = GI.getOperand(0);
3587   auto &SrcOrVal = GI.getOperand(1);
3588   auto &Size = GI.getOperand(2);
3589 
3590   // Create copies of the registers that can be clobbered.
3591   const Register DstPtrCopy = MRI.cloneVirtualRegister(DstPtr.getReg());
3592   const Register SrcValCopy = MRI.cloneVirtualRegister(SrcOrVal.getReg());
3593   const Register SizeCopy = MRI.cloneVirtualRegister(Size.getReg());
3594 
3595   const bool IsSet = Mopcode == AArch64::MOPSMemorySetPseudo;
3596   const auto &SrcValRegClass =
3597       IsSet ? AArch64::GPR64RegClass : AArch64::GPR64commonRegClass;
3598 
3599   // Constrain to specific registers
3600   RBI.constrainGenericRegister(DstPtrCopy, AArch64::GPR64commonRegClass, MRI);
3601   RBI.constrainGenericRegister(SrcValCopy, SrcValRegClass, MRI);
3602   RBI.constrainGenericRegister(SizeCopy, AArch64::GPR64RegClass, MRI);
3603 
3604   MIB.buildCopy(DstPtrCopy, DstPtr);
3605   MIB.buildCopy(SrcValCopy, SrcOrVal);
3606   MIB.buildCopy(SizeCopy, Size);
3607 
3608   // New instruction uses the copied registers because it must update them.
3609   // The defs are not used since they don't exist in G_MEM*. They are still
3610   // tied.
3611   // Note: order of operands is different from G_MEMSET, G_MEMCPY, G_MEMMOVE
3612   Register DefDstPtr = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass);
3613   Register DefSize = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3614   if (IsSet) {
3615     MIB.buildInstr(Mopcode, {DefDstPtr, DefSize},
3616                    {DstPtrCopy, SizeCopy, SrcValCopy});
3617   } else {
3618     Register DefSrcPtr = MRI.createVirtualRegister(&SrcValRegClass);
3619     MIB.buildInstr(Mopcode, {DefDstPtr, DefSrcPtr, DefSize},
3620                    {DstPtrCopy, SrcValCopy, SizeCopy});
3621   }
3622 
3623   GI.eraseFromParent();
3624   return true;
3625 }
3626 
3627 bool AArch64InstructionSelector::selectBrJT(MachineInstr &I,
3628                                             MachineRegisterInfo &MRI) {
3629   assert(I.getOpcode() == TargetOpcode::G_BRJT && "Expected G_BRJT");
3630   Register JTAddr = I.getOperand(0).getReg();
3631   unsigned JTI = I.getOperand(1).getIndex();
3632   Register Index = I.getOperand(2).getReg();
3633 
3634   Register TargetReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3635   Register ScratchReg = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
3636 
3637   MF->getInfo<AArch64FunctionInfo>()->setJumpTableEntryInfo(JTI, 4, nullptr);
3638   auto JumpTableInst = MIB.buildInstr(AArch64::JumpTableDest32,
3639                                       {TargetReg, ScratchReg}, {JTAddr, Index})
3640                            .addJumpTableIndex(JTI);
3641   // Build the indirect branch.
3642   MIB.buildInstr(AArch64::BR, {}, {TargetReg});
3643   I.eraseFromParent();
3644   return constrainSelectedInstRegOperands(*JumpTableInst, TII, TRI, RBI);
3645 }
3646 
3647 bool AArch64InstructionSelector::selectJumpTable(MachineInstr &I,
3648                                                  MachineRegisterInfo &MRI) {
3649   assert(I.getOpcode() == TargetOpcode::G_JUMP_TABLE && "Expected jump table");
3650   assert(I.getOperand(1).isJTI() && "Jump table op should have a JTI!");
3651 
3652   Register DstReg = I.getOperand(0).getReg();
3653   unsigned JTI = I.getOperand(1).getIndex();
3654   // We generate a MOVaddrJT which will get expanded to an ADRP + ADD later.
3655   auto MovMI =
3656     MIB.buildInstr(AArch64::MOVaddrJT, {DstReg}, {})
3657           .addJumpTableIndex(JTI, AArch64II::MO_PAGE)
3658           .addJumpTableIndex(JTI, AArch64II::MO_NC | AArch64II::MO_PAGEOFF);
3659   I.eraseFromParent();
3660   return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI);
3661 }
3662 
3663 bool AArch64InstructionSelector::selectTLSGlobalValue(
3664     MachineInstr &I, MachineRegisterInfo &MRI) {
3665   if (!STI.isTargetMachO())
3666     return false;
3667   MachineFunction &MF = *I.getParent()->getParent();
3668   MF.getFrameInfo().setAdjustsStack(true);
3669 
3670   const auto &GlobalOp = I.getOperand(1);
3671   assert(GlobalOp.getOffset() == 0 &&
3672          "Shouldn't have an offset on TLS globals!");
3673   const GlobalValue &GV = *GlobalOp.getGlobal();
3674 
3675   auto LoadGOT =
3676       MIB.buildInstr(AArch64::LOADgot, {&AArch64::GPR64commonRegClass}, {})
3677           .addGlobalAddress(&GV, 0, AArch64II::MO_TLS);
3678 
3679   auto Load = MIB.buildInstr(AArch64::LDRXui, {&AArch64::GPR64commonRegClass},
3680                              {LoadGOT.getReg(0)})
3681                   .addImm(0);
3682 
3683   MIB.buildCopy(Register(AArch64::X0), LoadGOT.getReg(0));
3684   // TLS calls preserve all registers except those that absolutely must be
3685   // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
3686   // silly).
3687   MIB.buildInstr(getBLRCallOpcode(MF), {}, {Load})
3688       .addUse(AArch64::X0, RegState::Implicit)
3689       .addDef(AArch64::X0, RegState::Implicit)
3690       .addRegMask(TRI.getTLSCallPreservedMask());
3691 
3692   MIB.buildCopy(I.getOperand(0).getReg(), Register(AArch64::X0));
3693   RBI.constrainGenericRegister(I.getOperand(0).getReg(), AArch64::GPR64RegClass,
3694                                MRI);
3695   I.eraseFromParent();
3696   return true;
3697 }
3698 
3699 bool AArch64InstructionSelector::selectIntrinsicTrunc(
3700     MachineInstr &I, MachineRegisterInfo &MRI) const {
3701   const LLT SrcTy = MRI.getType(I.getOperand(0).getReg());
3702 
3703   // Select the correct opcode.
3704   unsigned Opc = 0;
3705   if (!SrcTy.isVector()) {
3706     switch (SrcTy.getSizeInBits()) {
3707     default:
3708     case 16:
3709       Opc = AArch64::FRINTZHr;
3710       break;
3711     case 32:
3712       Opc = AArch64::FRINTZSr;
3713       break;
3714     case 64:
3715       Opc = AArch64::FRINTZDr;
3716       break;
3717     }
3718   } else {
3719     unsigned NumElts = SrcTy.getNumElements();
3720     switch (SrcTy.getElementType().getSizeInBits()) {
3721     default:
3722       break;
3723     case 16:
3724       if (NumElts == 4)
3725         Opc = AArch64::FRINTZv4f16;
3726       else if (NumElts == 8)
3727         Opc = AArch64::FRINTZv8f16;
3728       break;
3729     case 32:
3730       if (NumElts == 2)
3731         Opc = AArch64::FRINTZv2f32;
3732       else if (NumElts == 4)
3733         Opc = AArch64::FRINTZv4f32;
3734       break;
3735     case 64:
3736       if (NumElts == 2)
3737         Opc = AArch64::FRINTZv2f64;
3738       break;
3739     }
3740   }
3741 
3742   if (!Opc) {
3743     // Didn't get an opcode above, bail.
3744     LLVM_DEBUG(dbgs() << "Unsupported type for G_INTRINSIC_TRUNC!\n");
3745     return false;
3746   }
3747 
3748   // Legalization would have set us up perfectly for this; we just need to
3749   // set the opcode and move on.
3750   I.setDesc(TII.get(Opc));
3751   return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3752 }
3753 
3754 bool AArch64InstructionSelector::selectIntrinsicRound(
3755     MachineInstr &I, MachineRegisterInfo &MRI) const {
3756   const LLT SrcTy = MRI.getType(I.getOperand(0).getReg());
3757 
3758   // Select the correct opcode.
3759   unsigned Opc = 0;
3760   if (!SrcTy.isVector()) {
3761     switch (SrcTy.getSizeInBits()) {
3762     default:
3763     case 16:
3764       Opc = AArch64::FRINTAHr;
3765       break;
3766     case 32:
3767       Opc = AArch64::FRINTASr;
3768       break;
3769     case 64:
3770       Opc = AArch64::FRINTADr;
3771       break;
3772     }
3773   } else {
3774     unsigned NumElts = SrcTy.getNumElements();
3775     switch (SrcTy.getElementType().getSizeInBits()) {
3776     default:
3777       break;
3778     case 16:
3779       if (NumElts == 4)
3780         Opc = AArch64::FRINTAv4f16;
3781       else if (NumElts == 8)
3782         Opc = AArch64::FRINTAv8f16;
3783       break;
3784     case 32:
3785       if (NumElts == 2)
3786         Opc = AArch64::FRINTAv2f32;
3787       else if (NumElts == 4)
3788         Opc = AArch64::FRINTAv4f32;
3789       break;
3790     case 64:
3791       if (NumElts == 2)
3792         Opc = AArch64::FRINTAv2f64;
3793       break;
3794     }
3795   }
3796 
3797   if (!Opc) {
3798     // Didn't get an opcode above, bail.
3799     LLVM_DEBUG(dbgs() << "Unsupported type for G_INTRINSIC_ROUND!\n");
3800     return false;
3801   }
3802 
3803   // Legalization would have set us up perfectly for this; we just need to
3804   // set the opcode and move on.
3805   I.setDesc(TII.get(Opc));
3806   return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3807 }
3808 
3809 bool AArch64InstructionSelector::selectVectorICmp(
3810     MachineInstr &I, MachineRegisterInfo &MRI) {
3811   Register DstReg = I.getOperand(0).getReg();
3812   LLT DstTy = MRI.getType(DstReg);
3813   Register SrcReg = I.getOperand(2).getReg();
3814   Register Src2Reg = I.getOperand(3).getReg();
3815   LLT SrcTy = MRI.getType(SrcReg);
3816 
3817   unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits();
3818   unsigned NumElts = DstTy.getNumElements();
3819 
3820   // First index is element size, 0 == 8b, 1 == 16b, 2 == 32b, 3 == 64b
3821   // Second index is num elts, 0 == v2, 1 == v4, 2 == v8, 3 == v16
3822   // Third index is cc opcode:
3823   // 0 == eq
3824   // 1 == ugt
3825   // 2 == uge
3826   // 3 == ult
3827   // 4 == ule
3828   // 5 == sgt
3829   // 6 == sge
3830   // 7 == slt
3831   // 8 == sle
3832   // ne is done by negating 'eq' result.
3833 
3834   // This table below assumes that for some comparisons the operands will be
3835   // commuted.
3836   // ult op == commute + ugt op
3837   // ule op == commute + uge op
3838   // slt op == commute + sgt op
3839   // sle op == commute + sge op
3840   unsigned PredIdx = 0;
3841   bool SwapOperands = false;
3842   CmpInst::Predicate Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
3843   switch (Pred) {
3844   case CmpInst::ICMP_NE:
3845   case CmpInst::ICMP_EQ:
3846     PredIdx = 0;
3847     break;
3848   case CmpInst::ICMP_UGT:
3849     PredIdx = 1;
3850     break;
3851   case CmpInst::ICMP_UGE:
3852     PredIdx = 2;
3853     break;
3854   case CmpInst::ICMP_ULT:
3855     PredIdx = 3;
3856     SwapOperands = true;
3857     break;
3858   case CmpInst::ICMP_ULE:
3859     PredIdx = 4;
3860     SwapOperands = true;
3861     break;
3862   case CmpInst::ICMP_SGT:
3863     PredIdx = 5;
3864     break;
3865   case CmpInst::ICMP_SGE:
3866     PredIdx = 6;
3867     break;
3868   case CmpInst::ICMP_SLT:
3869     PredIdx = 7;
3870     SwapOperands = true;
3871     break;
3872   case CmpInst::ICMP_SLE:
3873     PredIdx = 8;
3874     SwapOperands = true;
3875     break;
3876   default:
3877     llvm_unreachable("Unhandled icmp predicate");
3878     return false;
3879   }
3880 
3881   // This table obviously should be tablegen'd when we have our GISel native
3882   // tablegen selector.
3883 
3884   static const unsigned OpcTable[4][4][9] = {
3885       {
3886           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3887            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3888            0 /* invalid */},
3889           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3890            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3891            0 /* invalid */},
3892           {AArch64::CMEQv8i8, AArch64::CMHIv8i8, AArch64::CMHSv8i8,
3893            AArch64::CMHIv8i8, AArch64::CMHSv8i8, AArch64::CMGTv8i8,
3894            AArch64::CMGEv8i8, AArch64::CMGTv8i8, AArch64::CMGEv8i8},
3895           {AArch64::CMEQv16i8, AArch64::CMHIv16i8, AArch64::CMHSv16i8,
3896            AArch64::CMHIv16i8, AArch64::CMHSv16i8, AArch64::CMGTv16i8,
3897            AArch64::CMGEv16i8, AArch64::CMGTv16i8, AArch64::CMGEv16i8}
3898       },
3899       {
3900           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3901            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3902            0 /* invalid */},
3903           {AArch64::CMEQv4i16, AArch64::CMHIv4i16, AArch64::CMHSv4i16,
3904            AArch64::CMHIv4i16, AArch64::CMHSv4i16, AArch64::CMGTv4i16,
3905            AArch64::CMGEv4i16, AArch64::CMGTv4i16, AArch64::CMGEv4i16},
3906           {AArch64::CMEQv8i16, AArch64::CMHIv8i16, AArch64::CMHSv8i16,
3907            AArch64::CMHIv8i16, AArch64::CMHSv8i16, AArch64::CMGTv8i16,
3908            AArch64::CMGEv8i16, AArch64::CMGTv8i16, AArch64::CMGEv8i16},
3909           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3910            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3911            0 /* invalid */}
3912       },
3913       {
3914           {AArch64::CMEQv2i32, AArch64::CMHIv2i32, AArch64::CMHSv2i32,
3915            AArch64::CMHIv2i32, AArch64::CMHSv2i32, AArch64::CMGTv2i32,
3916            AArch64::CMGEv2i32, AArch64::CMGTv2i32, AArch64::CMGEv2i32},
3917           {AArch64::CMEQv4i32, AArch64::CMHIv4i32, AArch64::CMHSv4i32,
3918            AArch64::CMHIv4i32, AArch64::CMHSv4i32, AArch64::CMGTv4i32,
3919            AArch64::CMGEv4i32, AArch64::CMGTv4i32, AArch64::CMGEv4i32},
3920           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3921            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3922            0 /* invalid */},
3923           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3924            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3925            0 /* invalid */}
3926       },
3927       {
3928           {AArch64::CMEQv2i64, AArch64::CMHIv2i64, AArch64::CMHSv2i64,
3929            AArch64::CMHIv2i64, AArch64::CMHSv2i64, AArch64::CMGTv2i64,
3930            AArch64::CMGEv2i64, AArch64::CMGTv2i64, AArch64::CMGEv2i64},
3931           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3932            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3933            0 /* invalid */},
3934           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3935            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3936            0 /* invalid */},
3937           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3938            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3939            0 /* invalid */}
3940       },
3941   };
3942   unsigned EltIdx = Log2_32(SrcEltSize / 8);
3943   unsigned NumEltsIdx = Log2_32(NumElts / 2);
3944   unsigned Opc = OpcTable[EltIdx][NumEltsIdx][PredIdx];
3945   if (!Opc) {
3946     LLVM_DEBUG(dbgs() << "Could not map G_ICMP to cmp opcode");
3947     return false;
3948   }
3949 
3950   const RegisterBank &VecRB = *RBI.getRegBank(SrcReg, MRI, TRI);
3951   const TargetRegisterClass *SrcRC =
3952       getRegClassForTypeOnBank(SrcTy, VecRB, true);
3953   if (!SrcRC) {
3954     LLVM_DEBUG(dbgs() << "Could not determine source register class.\n");
3955     return false;
3956   }
3957 
3958   unsigned NotOpc = Pred == ICmpInst::ICMP_NE ? AArch64::NOTv8i8 : 0;
3959   if (SrcTy.getSizeInBits() == 128)
3960     NotOpc = NotOpc ? AArch64::NOTv16i8 : 0;
3961 
3962   if (SwapOperands)
3963     std::swap(SrcReg, Src2Reg);
3964 
3965   auto Cmp = MIB.buildInstr(Opc, {SrcRC}, {SrcReg, Src2Reg});
3966   constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);
3967 
3968   // Invert if we had a 'ne' cc.
3969   if (NotOpc) {
3970     Cmp = MIB.buildInstr(NotOpc, {DstReg}, {Cmp});
3971     constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);
3972   } else {
3973     MIB.buildCopy(DstReg, Cmp.getReg(0));
3974   }
3975   RBI.constrainGenericRegister(DstReg, *SrcRC, MRI);
3976   I.eraseFromParent();
3977   return true;
3978 }
3979 
3980 MachineInstr *AArch64InstructionSelector::emitScalarToVector(
3981     unsigned EltSize, const TargetRegisterClass *DstRC, Register Scalar,
3982     MachineIRBuilder &MIRBuilder) const {
3983   auto Undef = MIRBuilder.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstRC}, {});
3984 
3985   auto BuildFn = [&](unsigned SubregIndex) {
3986     auto Ins =
3987         MIRBuilder
3988             .buildInstr(TargetOpcode::INSERT_SUBREG, {DstRC}, {Undef, Scalar})
3989             .addImm(SubregIndex);
3990     constrainSelectedInstRegOperands(*Undef, TII, TRI, RBI);
3991     constrainSelectedInstRegOperands(*Ins, TII, TRI, RBI);
3992     return &*Ins;
3993   };
3994 
3995   switch (EltSize) {
3996   case 8:
3997     return BuildFn(AArch64::bsub);
3998   case 16:
3999     return BuildFn(AArch64::hsub);
4000   case 32:
4001     return BuildFn(AArch64::ssub);
4002   case 64:
4003     return BuildFn(AArch64::dsub);
4004   default:
4005     return nullptr;
4006   }
4007 }
4008 
4009 bool AArch64InstructionSelector::selectMergeValues(
4010     MachineInstr &I, MachineRegisterInfo &MRI) {
4011   assert(I.getOpcode() == TargetOpcode::G_MERGE_VALUES && "unexpected opcode");
4012   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
4013   const LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
4014   assert(!DstTy.isVector() && !SrcTy.isVector() && "invalid merge operation");
4015   const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
4016 
4017   if (I.getNumOperands() != 3)
4018     return false;
4019 
4020   // Merging 2 s64s into an s128.
4021   if (DstTy == LLT::scalar(128)) {
4022     if (SrcTy.getSizeInBits() != 64)
4023       return false;
4024     Register DstReg = I.getOperand(0).getReg();
4025     Register Src1Reg = I.getOperand(1).getReg();
4026     Register Src2Reg = I.getOperand(2).getReg();
4027     auto Tmp = MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstTy}, {});
4028     MachineInstr *InsMI = emitLaneInsert(std::nullopt, Tmp.getReg(0), Src1Reg,
4029                                          /* LaneIdx */ 0, RB, MIB);
4030     if (!InsMI)
4031       return false;
4032     MachineInstr *Ins2MI = emitLaneInsert(DstReg, InsMI->getOperand(0).getReg(),
4033                                           Src2Reg, /* LaneIdx */ 1, RB, MIB);
4034     if (!Ins2MI)
4035       return false;
4036     constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI);
4037     constrainSelectedInstRegOperands(*Ins2MI, TII, TRI, RBI);
4038     I.eraseFromParent();
4039     return true;
4040   }
4041 
4042   if (RB.getID() != AArch64::GPRRegBankID)
4043     return false;
4044 
4045   if (DstTy.getSizeInBits() != 64 || SrcTy.getSizeInBits() != 32)
4046     return false;
4047 
4048   auto *DstRC = &AArch64::GPR64RegClass;
4049   Register SubToRegDef = MRI.createVirtualRegister(DstRC);
4050   MachineInstr &SubRegMI = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
4051                                     TII.get(TargetOpcode::SUBREG_TO_REG))
4052                                 .addDef(SubToRegDef)
4053                                 .addImm(0)
4054                                 .addUse(I.getOperand(1).getReg())
4055                                 .addImm(AArch64::sub_32);
4056   Register SubToRegDef2 = MRI.createVirtualRegister(DstRC);
4057   // Need to anyext the second scalar before we can use bfm
4058   MachineInstr &SubRegMI2 = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
4059                                     TII.get(TargetOpcode::SUBREG_TO_REG))
4060                                 .addDef(SubToRegDef2)
4061                                 .addImm(0)
4062                                 .addUse(I.getOperand(2).getReg())
4063                                 .addImm(AArch64::sub_32);
4064   MachineInstr &BFM =
4065       *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::BFMXri))
4066            .addDef(I.getOperand(0).getReg())
4067            .addUse(SubToRegDef)
4068            .addUse(SubToRegDef2)
4069            .addImm(32)
4070            .addImm(31);
4071   constrainSelectedInstRegOperands(SubRegMI, TII, TRI, RBI);
4072   constrainSelectedInstRegOperands(SubRegMI2, TII, TRI, RBI);
4073   constrainSelectedInstRegOperands(BFM, TII, TRI, RBI);
4074   I.eraseFromParent();
4075   return true;
4076 }
4077 
4078 static bool getLaneCopyOpcode(unsigned &CopyOpc, unsigned &ExtractSubReg,
4079                               const unsigned EltSize) {
4080   // Choose a lane copy opcode and subregister based off of the size of the
4081   // vector's elements.
4082   switch (EltSize) {
4083   case 8:
4084     CopyOpc = AArch64::DUPi8;
4085     ExtractSubReg = AArch64::bsub;
4086     break;
4087   case 16:
4088     CopyOpc = AArch64::DUPi16;
4089     ExtractSubReg = AArch64::hsub;
4090     break;
4091   case 32:
4092     CopyOpc = AArch64::DUPi32;
4093     ExtractSubReg = AArch64::ssub;
4094     break;
4095   case 64:
4096     CopyOpc = AArch64::DUPi64;
4097     ExtractSubReg = AArch64::dsub;
4098     break;
4099   default:
4100     // Unknown size, bail out.
4101     LLVM_DEBUG(dbgs() << "Elt size '" << EltSize << "' unsupported.\n");
4102     return false;
4103   }
4104   return true;
4105 }
4106 
4107 MachineInstr *AArch64InstructionSelector::emitExtractVectorElt(
4108     std::optional<Register> DstReg, const RegisterBank &DstRB, LLT ScalarTy,
4109     Register VecReg, unsigned LaneIdx, MachineIRBuilder &MIRBuilder) const {
4110   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4111   unsigned CopyOpc = 0;
4112   unsigned ExtractSubReg = 0;
4113   if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, ScalarTy.getSizeInBits())) {
4114     LLVM_DEBUG(
4115         dbgs() << "Couldn't determine lane copy opcode for instruction.\n");
4116     return nullptr;
4117   }
4118 
4119   const TargetRegisterClass *DstRC =
4120       getRegClassForTypeOnBank(ScalarTy, DstRB, true);
4121   if (!DstRC) {
4122     LLVM_DEBUG(dbgs() << "Could not determine destination register class.\n");
4123     return nullptr;
4124   }
4125 
4126   const RegisterBank &VecRB = *RBI.getRegBank(VecReg, MRI, TRI);
4127   const LLT &VecTy = MRI.getType(VecReg);
4128   const TargetRegisterClass *VecRC =
4129       getRegClassForTypeOnBank(VecTy, VecRB, true);
4130   if (!VecRC) {
4131     LLVM_DEBUG(dbgs() << "Could not determine source register class.\n");
4132     return nullptr;
4133   }
4134 
4135   // The register that we're going to copy into.
4136   Register InsertReg = VecReg;
4137   if (!DstReg)
4138     DstReg = MRI.createVirtualRegister(DstRC);
4139   // If the lane index is 0, we just use a subregister COPY.
4140   if (LaneIdx == 0) {
4141     auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {*DstReg}, {})
4142                     .addReg(VecReg, 0, ExtractSubReg);
4143     RBI.constrainGenericRegister(*DstReg, *DstRC, MRI);
4144     return &*Copy;
4145   }
4146 
4147   // Lane copies require 128-bit wide registers. If we're dealing with an
4148   // unpacked vector, then we need to move up to that width. Insert an implicit
4149   // def and a subregister insert to get us there.
4150   if (VecTy.getSizeInBits() != 128) {
4151     MachineInstr *ScalarToVector = emitScalarToVector(
4152         VecTy.getSizeInBits(), &AArch64::FPR128RegClass, VecReg, MIRBuilder);
4153     if (!ScalarToVector)
4154       return nullptr;
4155     InsertReg = ScalarToVector->getOperand(0).getReg();
4156   }
4157 
4158   MachineInstr *LaneCopyMI =
4159       MIRBuilder.buildInstr(CopyOpc, {*DstReg}, {InsertReg}).addImm(LaneIdx);
4160   constrainSelectedInstRegOperands(*LaneCopyMI, TII, TRI, RBI);
4161 
4162   // Make sure that we actually constrain the initial copy.
4163   RBI.constrainGenericRegister(*DstReg, *DstRC, MRI);
4164   return LaneCopyMI;
4165 }
4166 
4167 bool AArch64InstructionSelector::selectExtractElt(
4168     MachineInstr &I, MachineRegisterInfo &MRI) {
4169   assert(I.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT &&
4170          "unexpected opcode!");
4171   Register DstReg = I.getOperand(0).getReg();
4172   const LLT NarrowTy = MRI.getType(DstReg);
4173   const Register SrcReg = I.getOperand(1).getReg();
4174   const LLT WideTy = MRI.getType(SrcReg);
4175   (void)WideTy;
4176   assert(WideTy.getSizeInBits() >= NarrowTy.getSizeInBits() &&
4177          "source register size too small!");
4178   assert(!NarrowTy.isVector() && "cannot extract vector into vector!");
4179 
4180   // Need the lane index to determine the correct copy opcode.
4181   MachineOperand &LaneIdxOp = I.getOperand(2);
4182   assert(LaneIdxOp.isReg() && "Lane index operand was not a register?");
4183 
4184   if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) {
4185     LLVM_DEBUG(dbgs() << "Cannot extract into GPR.\n");
4186     return false;
4187   }
4188 
4189   // Find the index to extract from.
4190   auto VRegAndVal = getIConstantVRegValWithLookThrough(LaneIdxOp.getReg(), MRI);
4191   if (!VRegAndVal)
4192     return false;
4193   unsigned LaneIdx = VRegAndVal->Value.getSExtValue();
4194 
4195 
4196   const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
4197   MachineInstr *Extract = emitExtractVectorElt(DstReg, DstRB, NarrowTy, SrcReg,
4198                                                LaneIdx, MIB);
4199   if (!Extract)
4200     return false;
4201 
4202   I.eraseFromParent();
4203   return true;
4204 }
4205 
4206 bool AArch64InstructionSelector::selectSplitVectorUnmerge(
4207     MachineInstr &I, MachineRegisterInfo &MRI) {
4208   unsigned NumElts = I.getNumOperands() - 1;
4209   Register SrcReg = I.getOperand(NumElts).getReg();
4210   const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg());
4211   const LLT SrcTy = MRI.getType(SrcReg);
4212 
4213   assert(NarrowTy.isVector() && "Expected an unmerge into vectors");
4214   if (SrcTy.getSizeInBits() > 128) {
4215     LLVM_DEBUG(dbgs() << "Unexpected vector type for vec split unmerge");
4216     return false;
4217   }
4218 
4219   // We implement a split vector operation by treating the sub-vectors as
4220   // scalars and extracting them.
4221   const RegisterBank &DstRB =
4222       *RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI);
4223   for (unsigned OpIdx = 0; OpIdx < NumElts; ++OpIdx) {
4224     Register Dst = I.getOperand(OpIdx).getReg();
4225     MachineInstr *Extract =
4226         emitExtractVectorElt(Dst, DstRB, NarrowTy, SrcReg, OpIdx, MIB);
4227     if (!Extract)
4228       return false;
4229   }
4230   I.eraseFromParent();
4231   return true;
4232 }
4233 
4234 bool AArch64InstructionSelector::selectUnmergeValues(MachineInstr &I,
4235                                                      MachineRegisterInfo &MRI) {
4236   assert(I.getOpcode() == TargetOpcode::G_UNMERGE_VALUES &&
4237          "unexpected opcode");
4238 
4239   // TODO: Handle unmerging into GPRs and from scalars to scalars.
4240   if (RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI)->getID() !=
4241           AArch64::FPRRegBankID ||
4242       RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() !=
4243           AArch64::FPRRegBankID) {
4244     LLVM_DEBUG(dbgs() << "Unmerging vector-to-gpr and scalar-to-scalar "
4245                          "currently unsupported.\n");
4246     return false;
4247   }
4248 
4249   // The last operand is the vector source register, and every other operand is
4250   // a register to unpack into.
4251   unsigned NumElts = I.getNumOperands() - 1;
4252   Register SrcReg = I.getOperand(NumElts).getReg();
4253   const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg());
4254   const LLT WideTy = MRI.getType(SrcReg);
4255   (void)WideTy;
4256   assert((WideTy.isVector() || WideTy.getSizeInBits() == 128) &&
4257          "can only unmerge from vector or s128 types!");
4258   assert(WideTy.getSizeInBits() > NarrowTy.getSizeInBits() &&
4259          "source register size too small!");
4260 
4261   if (!NarrowTy.isScalar())
4262     return selectSplitVectorUnmerge(I, MRI);
4263 
4264   // Choose a lane copy opcode and subregister based off of the size of the
4265   // vector's elements.
4266   unsigned CopyOpc = 0;
4267   unsigned ExtractSubReg = 0;
4268   if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, NarrowTy.getSizeInBits()))
4269     return false;
4270 
4271   // Set up for the lane copies.
4272   MachineBasicBlock &MBB = *I.getParent();
4273 
4274   // Stores the registers we'll be copying from.
4275   SmallVector<Register, 4> InsertRegs;
4276 
4277   // We'll use the first register twice, so we only need NumElts-1 registers.
4278   unsigned NumInsertRegs = NumElts - 1;
4279 
4280   // If our elements fit into exactly 128 bits, then we can copy from the source
4281   // directly. Otherwise, we need to do a bit of setup with some subregister
4282   // inserts.
4283   if (NarrowTy.getSizeInBits() * NumElts == 128) {
4284     InsertRegs = SmallVector<Register, 4>(NumInsertRegs, SrcReg);
4285   } else {
4286     // No. We have to perform subregister inserts. For each insert, create an
4287     // implicit def and a subregister insert, and save the register we create.
4288     const TargetRegisterClass *RC = getRegClassForTypeOnBank(
4289         LLT::fixed_vector(NumElts, WideTy.getScalarSizeInBits()),
4290         *RBI.getRegBank(SrcReg, MRI, TRI));
4291     unsigned SubReg = 0;
4292     bool Found = getSubRegForClass(RC, TRI, SubReg);
4293     (void)Found;
4294     assert(Found && "expected to find last operand's subeg idx");
4295     for (unsigned Idx = 0; Idx < NumInsertRegs; ++Idx) {
4296       Register ImpDefReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
4297       MachineInstr &ImpDefMI =
4298           *BuildMI(MBB, I, I.getDebugLoc(), TII.get(TargetOpcode::IMPLICIT_DEF),
4299                    ImpDefReg);
4300 
4301       // Now, create the subregister insert from SrcReg.
4302       Register InsertReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
4303       MachineInstr &InsMI =
4304           *BuildMI(MBB, I, I.getDebugLoc(),
4305                    TII.get(TargetOpcode::INSERT_SUBREG), InsertReg)
4306                .addUse(ImpDefReg)
4307                .addUse(SrcReg)
4308                .addImm(SubReg);
4309 
4310       constrainSelectedInstRegOperands(ImpDefMI, TII, TRI, RBI);
4311       constrainSelectedInstRegOperands(InsMI, TII, TRI, RBI);
4312 
4313       // Save the register so that we can copy from it after.
4314       InsertRegs.push_back(InsertReg);
4315     }
4316   }
4317 
4318   // Now that we've created any necessary subregister inserts, we can
4319   // create the copies.
4320   //
4321   // Perform the first copy separately as a subregister copy.
4322   Register CopyTo = I.getOperand(0).getReg();
4323   auto FirstCopy = MIB.buildInstr(TargetOpcode::COPY, {CopyTo}, {})
4324                        .addReg(InsertRegs[0], 0, ExtractSubReg);
4325   constrainSelectedInstRegOperands(*FirstCopy, TII, TRI, RBI);
4326 
4327   // Now, perform the remaining copies as vector lane copies.
4328   unsigned LaneIdx = 1;
4329   for (Register InsReg : InsertRegs) {
4330     Register CopyTo = I.getOperand(LaneIdx).getReg();
4331     MachineInstr &CopyInst =
4332         *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CopyOpc), CopyTo)
4333              .addUse(InsReg)
4334              .addImm(LaneIdx);
4335     constrainSelectedInstRegOperands(CopyInst, TII, TRI, RBI);
4336     ++LaneIdx;
4337   }
4338 
4339   // Separately constrain the first copy's destination. Because of the
4340   // limitation in constrainOperandRegClass, we can't guarantee that this will
4341   // actually be constrained. So, do it ourselves using the second operand.
4342   const TargetRegisterClass *RC =
4343       MRI.getRegClassOrNull(I.getOperand(1).getReg());
4344   if (!RC) {
4345     LLVM_DEBUG(dbgs() << "Couldn't constrain copy destination.\n");
4346     return false;
4347   }
4348 
4349   RBI.constrainGenericRegister(CopyTo, *RC, MRI);
4350   I.eraseFromParent();
4351   return true;
4352 }
4353 
4354 bool AArch64InstructionSelector::selectConcatVectors(
4355     MachineInstr &I, MachineRegisterInfo &MRI)  {
4356   assert(I.getOpcode() == TargetOpcode::G_CONCAT_VECTORS &&
4357          "Unexpected opcode");
4358   Register Dst = I.getOperand(0).getReg();
4359   Register Op1 = I.getOperand(1).getReg();
4360   Register Op2 = I.getOperand(2).getReg();
4361   MachineInstr *ConcatMI = emitVectorConcat(Dst, Op1, Op2, MIB);
4362   if (!ConcatMI)
4363     return false;
4364   I.eraseFromParent();
4365   return true;
4366 }
4367 
4368 unsigned
4369 AArch64InstructionSelector::emitConstantPoolEntry(const Constant *CPVal,
4370                                                   MachineFunction &MF) const {
4371   Type *CPTy = CPVal->getType();
4372   Align Alignment = MF.getDataLayout().getPrefTypeAlign(CPTy);
4373 
4374   MachineConstantPool *MCP = MF.getConstantPool();
4375   return MCP->getConstantPoolIndex(CPVal, Alignment);
4376 }
4377 
4378 MachineInstr *AArch64InstructionSelector::emitLoadFromConstantPool(
4379     const Constant *CPVal, MachineIRBuilder &MIRBuilder) const {
4380   const TargetRegisterClass *RC;
4381   unsigned Opc;
4382   bool IsTiny = TM.getCodeModel() == CodeModel::Tiny;
4383   unsigned Size = MIRBuilder.getDataLayout().getTypeStoreSize(CPVal->getType());
4384   switch (Size) {
4385   case 16:
4386     RC = &AArch64::FPR128RegClass;
4387     Opc = IsTiny ? AArch64::LDRQl : AArch64::LDRQui;
4388     break;
4389   case 8:
4390     RC = &AArch64::FPR64RegClass;
4391     Opc = IsTiny ? AArch64::LDRDl : AArch64::LDRDui;
4392     break;
4393   case 4:
4394     RC = &AArch64::FPR32RegClass;
4395     Opc = IsTiny ? AArch64::LDRSl : AArch64::LDRSui;
4396     break;
4397   case 2:
4398     RC = &AArch64::FPR16RegClass;
4399     Opc = AArch64::LDRHui;
4400     break;
4401   default:
4402     LLVM_DEBUG(dbgs() << "Could not load from constant pool of type "
4403                       << *CPVal->getType());
4404     return nullptr;
4405   }
4406 
4407   MachineInstr *LoadMI = nullptr;
4408   auto &MF = MIRBuilder.getMF();
4409   unsigned CPIdx = emitConstantPoolEntry(CPVal, MF);
4410   if (IsTiny && (Size == 16 || Size == 8 || Size == 4)) {
4411     // Use load(literal) for tiny code model.
4412     LoadMI = &*MIRBuilder.buildInstr(Opc, {RC}, {}).addConstantPoolIndex(CPIdx);
4413   } else {
4414     auto Adrp =
4415         MIRBuilder.buildInstr(AArch64::ADRP, {&AArch64::GPR64RegClass}, {})
4416             .addConstantPoolIndex(CPIdx, 0, AArch64II::MO_PAGE);
4417 
4418     LoadMI = &*MIRBuilder.buildInstr(Opc, {RC}, {Adrp})
4419                    .addConstantPoolIndex(
4420                        CPIdx, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
4421 
4422     constrainSelectedInstRegOperands(*Adrp, TII, TRI, RBI);
4423   }
4424 
4425   MachinePointerInfo PtrInfo = MachinePointerInfo::getConstantPool(MF);
4426   LoadMI->addMemOperand(MF, MF.getMachineMemOperand(PtrInfo,
4427                                                     MachineMemOperand::MOLoad,
4428                                                     Size, Align(Size)));
4429   constrainSelectedInstRegOperands(*LoadMI, TII, TRI, RBI);
4430   return LoadMI;
4431 }
4432 
4433 /// Return an <Opcode, SubregIndex> pair to do an vector elt insert of a given
4434 /// size and RB.
4435 static std::pair<unsigned, unsigned>
4436 getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) {
4437   unsigned Opc, SubregIdx;
4438   if (RB.getID() == AArch64::GPRRegBankID) {
4439     if (EltSize == 8) {
4440       Opc = AArch64::INSvi8gpr;
4441       SubregIdx = AArch64::bsub;
4442     } else if (EltSize == 16) {
4443       Opc = AArch64::INSvi16gpr;
4444       SubregIdx = AArch64::ssub;
4445     } else if (EltSize == 32) {
4446       Opc = AArch64::INSvi32gpr;
4447       SubregIdx = AArch64::ssub;
4448     } else if (EltSize == 64) {
4449       Opc = AArch64::INSvi64gpr;
4450       SubregIdx = AArch64::dsub;
4451     } else {
4452       llvm_unreachable("invalid elt size!");
4453     }
4454   } else {
4455     if (EltSize == 8) {
4456       Opc = AArch64::INSvi8lane;
4457       SubregIdx = AArch64::bsub;
4458     } else if (EltSize == 16) {
4459       Opc = AArch64::INSvi16lane;
4460       SubregIdx = AArch64::hsub;
4461     } else if (EltSize == 32) {
4462       Opc = AArch64::INSvi32lane;
4463       SubregIdx = AArch64::ssub;
4464     } else if (EltSize == 64) {
4465       Opc = AArch64::INSvi64lane;
4466       SubregIdx = AArch64::dsub;
4467     } else {
4468       llvm_unreachable("invalid elt size!");
4469     }
4470   }
4471   return std::make_pair(Opc, SubregIdx);
4472 }
4473 
4474 MachineInstr *AArch64InstructionSelector::emitInstr(
4475     unsigned Opcode, std::initializer_list<llvm::DstOp> DstOps,
4476     std::initializer_list<llvm::SrcOp> SrcOps, MachineIRBuilder &MIRBuilder,
4477     const ComplexRendererFns &RenderFns) const {
4478   assert(Opcode && "Expected an opcode?");
4479   assert(!isPreISelGenericOpcode(Opcode) &&
4480          "Function should only be used to produce selected instructions!");
4481   auto MI = MIRBuilder.buildInstr(Opcode, DstOps, SrcOps);
4482   if (RenderFns)
4483     for (auto &Fn : *RenderFns)
4484       Fn(MI);
4485   constrainSelectedInstRegOperands(*MI, TII, TRI, RBI);
4486   return &*MI;
4487 }
4488 
4489 MachineInstr *AArch64InstructionSelector::emitAddSub(
4490     const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode,
4491     Register Dst, MachineOperand &LHS, MachineOperand &RHS,
4492     MachineIRBuilder &MIRBuilder) const {
4493   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4494   assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4495   auto Ty = MRI.getType(LHS.getReg());
4496   assert(!Ty.isVector() && "Expected a scalar or pointer?");
4497   unsigned Size = Ty.getSizeInBits();
4498   assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit type only");
4499   bool Is32Bit = Size == 32;
4500 
4501   // INSTRri form with positive arithmetic immediate.
4502   if (auto Fns = selectArithImmed(RHS))
4503     return emitInstr(AddrModeAndSizeToOpcode[0][Is32Bit], {Dst}, {LHS},
4504                      MIRBuilder, Fns);
4505 
4506   // INSTRri form with negative arithmetic immediate.
4507   if (auto Fns = selectNegArithImmed(RHS))
4508     return emitInstr(AddrModeAndSizeToOpcode[3][Is32Bit], {Dst}, {LHS},
4509                      MIRBuilder, Fns);
4510 
4511   // INSTRrx form.
4512   if (auto Fns = selectArithExtendedRegister(RHS))
4513     return emitInstr(AddrModeAndSizeToOpcode[4][Is32Bit], {Dst}, {LHS},
4514                      MIRBuilder, Fns);
4515 
4516   // INSTRrs form.
4517   if (auto Fns = selectShiftedRegister(RHS))
4518     return emitInstr(AddrModeAndSizeToOpcode[1][Is32Bit], {Dst}, {LHS},
4519                      MIRBuilder, Fns);
4520   return emitInstr(AddrModeAndSizeToOpcode[2][Is32Bit], {Dst}, {LHS, RHS},
4521                    MIRBuilder);
4522 }
4523 
4524 MachineInstr *
4525 AArch64InstructionSelector::emitADD(Register DefReg, MachineOperand &LHS,
4526                                     MachineOperand &RHS,
4527                                     MachineIRBuilder &MIRBuilder) const {
4528   const std::array<std::array<unsigned, 2>, 5> OpcTable{
4529       {{AArch64::ADDXri, AArch64::ADDWri},
4530        {AArch64::ADDXrs, AArch64::ADDWrs},
4531        {AArch64::ADDXrr, AArch64::ADDWrr},
4532        {AArch64::SUBXri, AArch64::SUBWri},
4533        {AArch64::ADDXrx, AArch64::ADDWrx}}};
4534   return emitAddSub(OpcTable, DefReg, LHS, RHS, MIRBuilder);
4535 }
4536 
4537 MachineInstr *
4538 AArch64InstructionSelector::emitADDS(Register Dst, MachineOperand &LHS,
4539                                      MachineOperand &RHS,
4540                                      MachineIRBuilder &MIRBuilder) const {
4541   const std::array<std::array<unsigned, 2>, 5> OpcTable{
4542       {{AArch64::ADDSXri, AArch64::ADDSWri},
4543        {AArch64::ADDSXrs, AArch64::ADDSWrs},
4544        {AArch64::ADDSXrr, AArch64::ADDSWrr},
4545        {AArch64::SUBSXri, AArch64::SUBSWri},
4546        {AArch64::ADDSXrx, AArch64::ADDSWrx}}};
4547   return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder);
4548 }
4549 
4550 MachineInstr *
4551 AArch64InstructionSelector::emitSUBS(Register Dst, MachineOperand &LHS,
4552                                      MachineOperand &RHS,
4553                                      MachineIRBuilder &MIRBuilder) const {
4554   const std::array<std::array<unsigned, 2>, 5> OpcTable{
4555       {{AArch64::SUBSXri, AArch64::SUBSWri},
4556        {AArch64::SUBSXrs, AArch64::SUBSWrs},
4557        {AArch64::SUBSXrr, AArch64::SUBSWrr},
4558        {AArch64::ADDSXri, AArch64::ADDSWri},
4559        {AArch64::SUBSXrx, AArch64::SUBSWrx}}};
4560   return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder);
4561 }
4562 
4563 MachineInstr *
4564 AArch64InstructionSelector::emitADCS(Register Dst, MachineOperand &LHS,
4565                                      MachineOperand &RHS,
4566                                      MachineIRBuilder &MIRBuilder) const {
4567   assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4568   MachineRegisterInfo *MRI = MIRBuilder.getMRI();
4569   bool Is32Bit = (MRI->getType(LHS.getReg()).getSizeInBits() == 32);
4570   static const unsigned OpcTable[2] = {AArch64::ADCSXr, AArch64::ADCSWr};
4571   return emitInstr(OpcTable[Is32Bit], {Dst}, {LHS, RHS}, MIRBuilder);
4572 }
4573 
4574 MachineInstr *
4575 AArch64InstructionSelector::emitSBCS(Register Dst, MachineOperand &LHS,
4576                                      MachineOperand &RHS,
4577                                      MachineIRBuilder &MIRBuilder) const {
4578   assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4579   MachineRegisterInfo *MRI = MIRBuilder.getMRI();
4580   bool Is32Bit = (MRI->getType(LHS.getReg()).getSizeInBits() == 32);
4581   static const unsigned OpcTable[2] = {AArch64::SBCSXr, AArch64::SBCSWr};
4582   return emitInstr(OpcTable[Is32Bit], {Dst}, {LHS, RHS}, MIRBuilder);
4583 }
4584 
4585 MachineInstr *
4586 AArch64InstructionSelector::emitCMN(MachineOperand &LHS, MachineOperand &RHS,
4587                                     MachineIRBuilder &MIRBuilder) const {
4588   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4589   bool Is32Bit = (MRI.getType(LHS.getReg()).getSizeInBits() == 32);
4590   auto RC = Is32Bit ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass;
4591   return emitADDS(MRI.createVirtualRegister(RC), LHS, RHS, MIRBuilder);
4592 }
4593 
4594 MachineInstr *
4595 AArch64InstructionSelector::emitTST(MachineOperand &LHS, MachineOperand &RHS,
4596                                     MachineIRBuilder &MIRBuilder) const {
4597   assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4598   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4599   LLT Ty = MRI.getType(LHS.getReg());
4600   unsigned RegSize = Ty.getSizeInBits();
4601   bool Is32Bit = (RegSize == 32);
4602   const unsigned OpcTable[3][2] = {{AArch64::ANDSXri, AArch64::ANDSWri},
4603                                    {AArch64::ANDSXrs, AArch64::ANDSWrs},
4604                                    {AArch64::ANDSXrr, AArch64::ANDSWrr}};
4605   // ANDS needs a logical immediate for its immediate form. Check if we can
4606   // fold one in.
4607   if (auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS.getReg(), MRI)) {
4608     int64_t Imm = ValAndVReg->Value.getSExtValue();
4609 
4610     if (AArch64_AM::isLogicalImmediate(Imm, RegSize)) {
4611       auto TstMI = MIRBuilder.buildInstr(OpcTable[0][Is32Bit], {Ty}, {LHS});
4612       TstMI.addImm(AArch64_AM::encodeLogicalImmediate(Imm, RegSize));
4613       constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
4614       return &*TstMI;
4615     }
4616   }
4617 
4618   if (auto Fns = selectLogicalShiftedRegister(RHS))
4619     return emitInstr(OpcTable[1][Is32Bit], {Ty}, {LHS}, MIRBuilder, Fns);
4620   return emitInstr(OpcTable[2][Is32Bit], {Ty}, {LHS, RHS}, MIRBuilder);
4621 }
4622 
4623 MachineInstr *AArch64InstructionSelector::emitIntegerCompare(
4624     MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
4625     MachineIRBuilder &MIRBuilder) const {
4626   assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!");
4627   assert(Predicate.isPredicate() && "Expected predicate?");
4628   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4629   LLT CmpTy = MRI.getType(LHS.getReg());
4630   assert(!CmpTy.isVector() && "Expected scalar or pointer");
4631   unsigned Size = CmpTy.getSizeInBits();
4632   (void)Size;
4633   assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit LHS/RHS?");
4634   // Fold the compare into a cmn or tst if possible.
4635   if (auto FoldCmp = tryFoldIntegerCompare(LHS, RHS, Predicate, MIRBuilder))
4636     return FoldCmp;
4637   auto Dst = MRI.cloneVirtualRegister(LHS.getReg());
4638   return emitSUBS(Dst, LHS, RHS, MIRBuilder);
4639 }
4640 
4641 MachineInstr *AArch64InstructionSelector::emitCSetForFCmp(
4642     Register Dst, CmpInst::Predicate Pred, MachineIRBuilder &MIRBuilder) const {
4643   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4644 #ifndef NDEBUG
4645   LLT Ty = MRI.getType(Dst);
4646   assert(!Ty.isVector() && Ty.getSizeInBits() == 32 &&
4647          "Expected a 32-bit scalar register?");
4648 #endif
4649   const Register ZReg = AArch64::WZR;
4650   AArch64CC::CondCode CC1, CC2;
4651   changeFCMPPredToAArch64CC(Pred, CC1, CC2);
4652   auto InvCC1 = AArch64CC::getInvertedCondCode(CC1);
4653   if (CC2 == AArch64CC::AL)
4654     return emitCSINC(/*Dst=*/Dst, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC1,
4655                      MIRBuilder);
4656   const TargetRegisterClass *RC = &AArch64::GPR32RegClass;
4657   Register Def1Reg = MRI.createVirtualRegister(RC);
4658   Register Def2Reg = MRI.createVirtualRegister(RC);
4659   auto InvCC2 = AArch64CC::getInvertedCondCode(CC2);
4660   emitCSINC(/*Dst=*/Def1Reg, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC1, MIRBuilder);
4661   emitCSINC(/*Dst=*/Def2Reg, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC2, MIRBuilder);
4662   auto OrMI = MIRBuilder.buildInstr(AArch64::ORRWrr, {Dst}, {Def1Reg, Def2Reg});
4663   constrainSelectedInstRegOperands(*OrMI, TII, TRI, RBI);
4664   return &*OrMI;
4665 }
4666 
4667 MachineInstr *AArch64InstructionSelector::emitFPCompare(
4668     Register LHS, Register RHS, MachineIRBuilder &MIRBuilder,
4669     std::optional<CmpInst::Predicate> Pred) const {
4670   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4671   LLT Ty = MRI.getType(LHS);
4672   if (Ty.isVector())
4673     return nullptr;
4674   unsigned OpSize = Ty.getSizeInBits();
4675   if (OpSize != 32 && OpSize != 64)
4676     return nullptr;
4677 
4678   // If this is a compare against +0.0, then we don't have
4679   // to explicitly materialize a constant.
4680   const ConstantFP *FPImm = getConstantFPVRegVal(RHS, MRI);
4681   bool ShouldUseImm = FPImm && (FPImm->isZero() && !FPImm->isNegative());
4682 
4683   auto IsEqualityPred = [](CmpInst::Predicate P) {
4684     return P == CmpInst::FCMP_OEQ || P == CmpInst::FCMP_ONE ||
4685            P == CmpInst::FCMP_UEQ || P == CmpInst::FCMP_UNE;
4686   };
4687   if (!ShouldUseImm && Pred && IsEqualityPred(*Pred)) {
4688     // Try commutating the operands.
4689     const ConstantFP *LHSImm = getConstantFPVRegVal(LHS, MRI);
4690     if (LHSImm && (LHSImm->isZero() && !LHSImm->isNegative())) {
4691       ShouldUseImm = true;
4692       std::swap(LHS, RHS);
4693     }
4694   }
4695   unsigned CmpOpcTbl[2][2] = {{AArch64::FCMPSrr, AArch64::FCMPDrr},
4696                               {AArch64::FCMPSri, AArch64::FCMPDri}};
4697   unsigned CmpOpc = CmpOpcTbl[ShouldUseImm][OpSize == 64];
4698 
4699   // Partially build the compare. Decide if we need to add a use for the
4700   // third operand based off whether or not we're comparing against 0.0.
4701   auto CmpMI = MIRBuilder.buildInstr(CmpOpc).addUse(LHS);
4702   CmpMI.setMIFlags(MachineInstr::NoFPExcept);
4703   if (!ShouldUseImm)
4704     CmpMI.addUse(RHS);
4705   constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI);
4706   return &*CmpMI;
4707 }
4708 
4709 MachineInstr *AArch64InstructionSelector::emitVectorConcat(
4710     std::optional<Register> Dst, Register Op1, Register Op2,
4711     MachineIRBuilder &MIRBuilder) const {
4712   // We implement a vector concat by:
4713   // 1. Use scalar_to_vector to insert the lower vector into the larger dest
4714   // 2. Insert the upper vector into the destination's upper element
4715   // TODO: some of this code is common with G_BUILD_VECTOR handling.
4716   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4717 
4718   const LLT Op1Ty = MRI.getType(Op1);
4719   const LLT Op2Ty = MRI.getType(Op2);
4720 
4721   if (Op1Ty != Op2Ty) {
4722     LLVM_DEBUG(dbgs() << "Could not do vector concat of differing vector tys");
4723     return nullptr;
4724   }
4725   assert(Op1Ty.isVector() && "Expected a vector for vector concat");
4726 
4727   if (Op1Ty.getSizeInBits() >= 128) {
4728     LLVM_DEBUG(dbgs() << "Vector concat not supported for full size vectors");
4729     return nullptr;
4730   }
4731 
4732   // At the moment we just support 64 bit vector concats.
4733   if (Op1Ty.getSizeInBits() != 64) {
4734     LLVM_DEBUG(dbgs() << "Vector concat supported for 64b vectors");
4735     return nullptr;
4736   }
4737 
4738   const LLT ScalarTy = LLT::scalar(Op1Ty.getSizeInBits());
4739   const RegisterBank &FPRBank = *RBI.getRegBank(Op1, MRI, TRI);
4740   const TargetRegisterClass *DstRC =
4741       getRegClassForTypeOnBank(Op1Ty.multiplyElements(2), FPRBank);
4742 
4743   MachineInstr *WidenedOp1 =
4744       emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op1, MIRBuilder);
4745   MachineInstr *WidenedOp2 =
4746       emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op2, MIRBuilder);
4747   if (!WidenedOp1 || !WidenedOp2) {
4748     LLVM_DEBUG(dbgs() << "Could not emit a vector from scalar value");
4749     return nullptr;
4750   }
4751 
4752   // Now do the insert of the upper element.
4753   unsigned InsertOpc, InsSubRegIdx;
4754   std::tie(InsertOpc, InsSubRegIdx) =
4755       getInsertVecEltOpInfo(FPRBank, ScalarTy.getSizeInBits());
4756 
4757   if (!Dst)
4758     Dst = MRI.createVirtualRegister(DstRC);
4759   auto InsElt =
4760       MIRBuilder
4761           .buildInstr(InsertOpc, {*Dst}, {WidenedOp1->getOperand(0).getReg()})
4762           .addImm(1) /* Lane index */
4763           .addUse(WidenedOp2->getOperand(0).getReg())
4764           .addImm(0);
4765   constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI);
4766   return &*InsElt;
4767 }
4768 
4769 MachineInstr *
4770 AArch64InstructionSelector::emitCSINC(Register Dst, Register Src1,
4771                                       Register Src2, AArch64CC::CondCode Pred,
4772                                       MachineIRBuilder &MIRBuilder) const {
4773   auto &MRI = *MIRBuilder.getMRI();
4774   const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Dst);
4775   // If we used a register class, then this won't necessarily have an LLT.
4776   // Compute the size based off whether or not we have a class or bank.
4777   unsigned Size;
4778   if (const auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
4779     Size = TRI.getRegSizeInBits(*RC);
4780   else
4781     Size = MRI.getType(Dst).getSizeInBits();
4782   // Some opcodes use s1.
4783   assert(Size <= 64 && "Expected 64 bits or less only!");
4784   static const unsigned OpcTable[2] = {AArch64::CSINCWr, AArch64::CSINCXr};
4785   unsigned Opc = OpcTable[Size == 64];
4786   auto CSINC = MIRBuilder.buildInstr(Opc, {Dst}, {Src1, Src2}).addImm(Pred);
4787   constrainSelectedInstRegOperands(*CSINC, TII, TRI, RBI);
4788   return &*CSINC;
4789 }
4790 
4791 MachineInstr *AArch64InstructionSelector::emitCarryIn(MachineInstr &I,
4792                                                       Register CarryReg) {
4793   MachineRegisterInfo *MRI = MIB.getMRI();
4794   unsigned Opcode = I.getOpcode();
4795 
4796   // If the instruction is a SUB, we need to negate the carry,
4797   // because borrowing is indicated by carry-flag == 0.
4798   bool NeedsNegatedCarry =
4799       (Opcode == TargetOpcode::G_USUBE || Opcode == TargetOpcode::G_SSUBE);
4800 
4801   // If the previous instruction will already produce the correct carry, do not
4802   // emit a carry generating instruction. E.g. for G_UADDE/G_USUBE sequences
4803   // generated during legalization of wide add/sub. This optimization depends on
4804   // these sequences not being interrupted by other instructions.
4805   MachineInstr *SrcMI = MRI->getVRegDef(CarryReg);
4806   if (SrcMI == I.getPrevNode()) {
4807     if (auto *CarrySrcMI = dyn_cast<GAddSubCarryOut>(SrcMI)) {
4808       bool ProducesNegatedCarry = CarrySrcMI->isSub();
4809       if (NeedsNegatedCarry == ProducesNegatedCarry && CarrySrcMI->isUnsigned())
4810         return nullptr;
4811     }
4812   }
4813 
4814   Register DeadReg = MRI->createVirtualRegister(&AArch64::GPR32RegClass);
4815 
4816   if (NeedsNegatedCarry) {
4817     // (0 - Carry) sets !C in NZCV when Carry == 1
4818     Register ZReg = AArch64::WZR;
4819     return emitInstr(AArch64::SUBSWrr, {DeadReg}, {ZReg, CarryReg}, MIB);
4820   }
4821 
4822   // (Carry - 1) sets !C in NZCV when Carry == 0
4823   auto Fns = select12BitValueWithLeftShift(1);
4824   return emitInstr(AArch64::SUBSWri, {DeadReg}, {CarryReg}, MIB, Fns);
4825 }
4826 
4827 bool AArch64InstructionSelector::selectOverflowOp(MachineInstr &I,
4828                                                   MachineRegisterInfo &MRI) {
4829   auto &CarryMI = cast<GAddSubCarryOut>(I);
4830 
4831   if (auto *CarryInMI = dyn_cast<GAddSubCarryInOut>(&I)) {
4832     // Set NZCV carry according to carry-in VReg
4833     emitCarryIn(I, CarryInMI->getCarryInReg());
4834   }
4835 
4836   // Emit the operation and get the correct condition code.
4837   auto OpAndCC = emitOverflowOp(I.getOpcode(), CarryMI.getDstReg(),
4838                                 CarryMI.getLHS(), CarryMI.getRHS(), MIB);
4839 
4840   Register CarryOutReg = CarryMI.getCarryOutReg();
4841 
4842   // Don't convert carry-out to VReg if it is never used
4843   if (!MRI.use_nodbg_empty(CarryOutReg)) {
4844     // Now, put the overflow result in the register given by the first operand
4845     // to the overflow op. CSINC increments the result when the predicate is
4846     // false, so to get the increment when it's true, we need to use the
4847     // inverse. In this case, we want to increment when carry is set.
4848     Register ZReg = AArch64::WZR;
4849     emitCSINC(/*Dst=*/CarryOutReg, /*Src1=*/ZReg, /*Src2=*/ZReg,
4850               getInvertedCondCode(OpAndCC.second), MIB);
4851   }
4852 
4853   I.eraseFromParent();
4854   return true;
4855 }
4856 
4857 std::pair<MachineInstr *, AArch64CC::CondCode>
4858 AArch64InstructionSelector::emitOverflowOp(unsigned Opcode, Register Dst,
4859                                            MachineOperand &LHS,
4860                                            MachineOperand &RHS,
4861                                            MachineIRBuilder &MIRBuilder) const {
4862   switch (Opcode) {
4863   default:
4864     llvm_unreachable("Unexpected opcode!");
4865   case TargetOpcode::G_SADDO:
4866     return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
4867   case TargetOpcode::G_UADDO:
4868     return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::HS);
4869   case TargetOpcode::G_SSUBO:
4870     return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
4871   case TargetOpcode::G_USUBO:
4872     return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::LO);
4873   case TargetOpcode::G_SADDE:
4874     return std::make_pair(emitADCS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
4875   case TargetOpcode::G_UADDE:
4876     return std::make_pair(emitADCS(Dst, LHS, RHS, MIRBuilder), AArch64CC::HS);
4877   case TargetOpcode::G_SSUBE:
4878     return std::make_pair(emitSBCS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
4879   case TargetOpcode::G_USUBE:
4880     return std::make_pair(emitSBCS(Dst, LHS, RHS, MIRBuilder), AArch64CC::LO);
4881   }
4882 }
4883 
4884 /// Returns true if @p Val is a tree of AND/OR/CMP operations that can be
4885 /// expressed as a conjunction.
4886 /// \param CanNegate    Set to true if we can negate the whole sub-tree just by
4887 ///                     changing the conditions on the CMP tests.
4888 ///                     (this means we can call emitConjunctionRec() with
4889 ///                      Negate==true on this sub-tree)
4890 /// \param MustBeFirst  Set to true if this subtree needs to be negated and we
4891 ///                     cannot do the negation naturally. We are required to
4892 ///                     emit the subtree first in this case.
4893 /// \param WillNegate   Is true if are called when the result of this
4894 ///                     subexpression must be negated. This happens when the
4895 ///                     outer expression is an OR. We can use this fact to know
4896 ///                     that we have a double negation (or (or ...) ...) that
4897 ///                     can be implemented for free.
4898 static bool canEmitConjunction(Register Val, bool &CanNegate, bool &MustBeFirst,
4899                                bool WillNegate, MachineRegisterInfo &MRI,
4900                                unsigned Depth = 0) {
4901   if (!MRI.hasOneNonDBGUse(Val))
4902     return false;
4903   MachineInstr *ValDef = MRI.getVRegDef(Val);
4904   unsigned Opcode = ValDef->getOpcode();
4905   if (isa<GAnyCmp>(ValDef)) {
4906     CanNegate = true;
4907     MustBeFirst = false;
4908     return true;
4909   }
4910   // Protect against exponential runtime and stack overflow.
4911   if (Depth > 6)
4912     return false;
4913   if (Opcode == TargetOpcode::G_AND || Opcode == TargetOpcode::G_OR) {
4914     bool IsOR = Opcode == TargetOpcode::G_OR;
4915     Register O0 = ValDef->getOperand(1).getReg();
4916     Register O1 = ValDef->getOperand(2).getReg();
4917     bool CanNegateL;
4918     bool MustBeFirstL;
4919     if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, MRI, Depth + 1))
4920       return false;
4921     bool CanNegateR;
4922     bool MustBeFirstR;
4923     if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, MRI, Depth + 1))
4924       return false;
4925 
4926     if (MustBeFirstL && MustBeFirstR)
4927       return false;
4928 
4929     if (IsOR) {
4930       // For an OR expression we need to be able to naturally negate at least
4931       // one side or we cannot do the transformation at all.
4932       if (!CanNegateL && !CanNegateR)
4933         return false;
4934       // If we the result of the OR will be negated and we can naturally negate
4935       // the leaves, then this sub-tree as a whole negates naturally.
4936       CanNegate = WillNegate && CanNegateL && CanNegateR;
4937       // If we cannot naturally negate the whole sub-tree, then this must be
4938       // emitted first.
4939       MustBeFirst = !CanNegate;
4940     } else {
4941       assert(Opcode == TargetOpcode::G_AND && "Must be G_AND");
4942       // We cannot naturally negate an AND operation.
4943       CanNegate = false;
4944       MustBeFirst = MustBeFirstL || MustBeFirstR;
4945     }
4946     return true;
4947   }
4948   return false;
4949 }
4950 
4951 MachineInstr *AArch64InstructionSelector::emitConditionalComparison(
4952     Register LHS, Register RHS, CmpInst::Predicate CC,
4953     AArch64CC::CondCode Predicate, AArch64CC::CondCode OutCC,
4954     MachineIRBuilder &MIB) const {
4955   // TODO: emit CMN as an optimization.
4956   auto &MRI = *MIB.getMRI();
4957   LLT OpTy = MRI.getType(LHS);
4958   assert(OpTy.getSizeInBits() == 32 || OpTy.getSizeInBits() == 64);
4959   unsigned CCmpOpc;
4960   std::optional<ValueAndVReg> C;
4961   if (CmpInst::isIntPredicate(CC)) {
4962     C = getIConstantVRegValWithLookThrough(RHS, MRI);
4963     if (C && C->Value.ult(32))
4964       CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWi : AArch64::CCMPXi;
4965     else
4966       CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWr : AArch64::CCMPXr;
4967   } else {
4968     switch (OpTy.getSizeInBits()) {
4969     case 16:
4970       CCmpOpc = AArch64::FCCMPHrr;
4971       break;
4972     case 32:
4973       CCmpOpc = AArch64::FCCMPSrr;
4974       break;
4975     case 64:
4976       CCmpOpc = AArch64::FCCMPDrr;
4977       break;
4978     default:
4979       return nullptr;
4980     }
4981   }
4982   AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
4983   unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
4984   auto CCmp =
4985       MIB.buildInstr(CCmpOpc, {}, {LHS});
4986   if (CCmpOpc == AArch64::CCMPWi || CCmpOpc == AArch64::CCMPXi)
4987     CCmp.addImm(C->Value.getZExtValue());
4988   else
4989     CCmp.addReg(RHS);
4990   CCmp.addImm(NZCV).addImm(Predicate);
4991   constrainSelectedInstRegOperands(*CCmp, TII, TRI, RBI);
4992   return &*CCmp;
4993 }
4994 
4995 MachineInstr *AArch64InstructionSelector::emitConjunctionRec(
4996     Register Val, AArch64CC::CondCode &OutCC, bool Negate, Register CCOp,
4997     AArch64CC::CondCode Predicate, MachineIRBuilder &MIB) const {
4998   // We're at a tree leaf, produce a conditional comparison operation.
4999   auto &MRI = *MIB.getMRI();
5000   MachineInstr *ValDef = MRI.getVRegDef(Val);
5001   unsigned Opcode = ValDef->getOpcode();
5002   if (auto *Cmp = dyn_cast<GAnyCmp>(ValDef)) {
5003     Register LHS = Cmp->getLHSReg();
5004     Register RHS = Cmp->getRHSReg();
5005     CmpInst::Predicate CC = Cmp->getCond();
5006     if (Negate)
5007       CC = CmpInst::getInversePredicate(CC);
5008     if (isa<GICmp>(Cmp)) {
5009       OutCC = changeICMPPredToAArch64CC(CC);
5010     } else {
5011       // Handle special FP cases.
5012       AArch64CC::CondCode ExtraCC;
5013       changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
5014       // Some floating point conditions can't be tested with a single condition
5015       // code. Construct an additional comparison in this case.
5016       if (ExtraCC != AArch64CC::AL) {
5017         MachineInstr *ExtraCmp;
5018         if (!CCOp)
5019           ExtraCmp = emitFPCompare(LHS, RHS, MIB, CC);
5020         else
5021           ExtraCmp =
5022               emitConditionalComparison(LHS, RHS, CC, Predicate, ExtraCC, MIB);
5023         CCOp = ExtraCmp->getOperand(0).getReg();
5024         Predicate = ExtraCC;
5025       }
5026     }
5027 
5028     // Produce a normal comparison if we are first in the chain
5029     if (!CCOp) {
5030       auto Dst = MRI.cloneVirtualRegister(LHS);
5031       if (isa<GICmp>(Cmp))
5032         return emitSUBS(Dst, Cmp->getOperand(2), Cmp->getOperand(3), MIB);
5033       return emitFPCompare(Cmp->getOperand(2).getReg(),
5034                            Cmp->getOperand(3).getReg(), MIB);
5035     }
5036     // Otherwise produce a ccmp.
5037     return emitConditionalComparison(LHS, RHS, CC, Predicate, OutCC, MIB);
5038   }
5039   assert(MRI.hasOneNonDBGUse(Val) && "Valid conjunction/disjunction tree");
5040 
5041   bool IsOR = Opcode == TargetOpcode::G_OR;
5042 
5043   Register LHS = ValDef->getOperand(1).getReg();
5044   bool CanNegateL;
5045   bool MustBeFirstL;
5046   bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR, MRI);
5047   assert(ValidL && "Valid conjunction/disjunction tree");
5048   (void)ValidL;
5049 
5050   Register RHS = ValDef->getOperand(2).getReg();
5051   bool CanNegateR;
5052   bool MustBeFirstR;
5053   bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR, MRI);
5054   assert(ValidR && "Valid conjunction/disjunction tree");
5055   (void)ValidR;
5056 
5057   // Swap sub-tree that must come first to the right side.
5058   if (MustBeFirstL) {
5059     assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
5060     std::swap(LHS, RHS);
5061     std::swap(CanNegateL, CanNegateR);
5062     std::swap(MustBeFirstL, MustBeFirstR);
5063   }
5064 
5065   bool NegateR;
5066   bool NegateAfterR;
5067   bool NegateL;
5068   bool NegateAfterAll;
5069   if (Opcode == TargetOpcode::G_OR) {
5070     // Swap the sub-tree that we can negate naturally to the left.
5071     if (!CanNegateL) {
5072       assert(CanNegateR && "at least one side must be negatable");
5073       assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
5074       assert(!Negate);
5075       std::swap(LHS, RHS);
5076       NegateR = false;
5077       NegateAfterR = true;
5078     } else {
5079       // Negate the left sub-tree if possible, otherwise negate the result.
5080       NegateR = CanNegateR;
5081       NegateAfterR = !CanNegateR;
5082     }
5083     NegateL = true;
5084     NegateAfterAll = !Negate;
5085   } else {
5086     assert(Opcode == TargetOpcode::G_AND &&
5087            "Valid conjunction/disjunction tree");
5088     assert(!Negate && "Valid conjunction/disjunction tree");
5089 
5090     NegateL = false;
5091     NegateR = false;
5092     NegateAfterR = false;
5093     NegateAfterAll = false;
5094   }
5095 
5096   // Emit sub-trees.
5097   AArch64CC::CondCode RHSCC;
5098   MachineInstr *CmpR =
5099       emitConjunctionRec(RHS, RHSCC, NegateR, CCOp, Predicate, MIB);
5100   if (NegateAfterR)
5101     RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
5102   MachineInstr *CmpL = emitConjunctionRec(
5103       LHS, OutCC, NegateL, CmpR->getOperand(0).getReg(), RHSCC, MIB);
5104   if (NegateAfterAll)
5105     OutCC = AArch64CC::getInvertedCondCode(OutCC);
5106   return CmpL;
5107 }
5108 
5109 MachineInstr *AArch64InstructionSelector::emitConjunction(
5110     Register Val, AArch64CC::CondCode &OutCC, MachineIRBuilder &MIB) const {
5111   bool DummyCanNegate;
5112   bool DummyMustBeFirst;
5113   if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false,
5114                           *MIB.getMRI()))
5115     return nullptr;
5116   return emitConjunctionRec(Val, OutCC, false, Register(), AArch64CC::AL, MIB);
5117 }
5118 
5119 bool AArch64InstructionSelector::tryOptSelectConjunction(GSelect &SelI,
5120                                                          MachineInstr &CondMI) {
5121   AArch64CC::CondCode AArch64CC;
5122   MachineInstr *ConjMI = emitConjunction(SelI.getCondReg(), AArch64CC, MIB);
5123   if (!ConjMI)
5124     return false;
5125 
5126   emitSelect(SelI.getReg(0), SelI.getTrueReg(), SelI.getFalseReg(), AArch64CC, MIB);
5127   SelI.eraseFromParent();
5128   return true;
5129 }
5130 
5131 bool AArch64InstructionSelector::tryOptSelect(GSelect &I) {
5132   MachineRegisterInfo &MRI = *MIB.getMRI();
5133   // We want to recognize this pattern:
5134   //
5135   // $z = G_FCMP pred, $x, $y
5136   // ...
5137   // $w = G_SELECT $z, $a, $b
5138   //
5139   // Where the value of $z is *only* ever used by the G_SELECT (possibly with
5140   // some copies/truncs in between.)
5141   //
5142   // If we see this, then we can emit something like this:
5143   //
5144   // fcmp $x, $y
5145   // fcsel $w, $a, $b, pred
5146   //
5147   // Rather than emitting both of the rather long sequences in the standard
5148   // G_FCMP/G_SELECT select methods.
5149 
5150   // First, check if the condition is defined by a compare.
5151   MachineInstr *CondDef = MRI.getVRegDef(I.getOperand(1).getReg());
5152 
5153   // We can only fold if all of the defs have one use.
5154   Register CondDefReg = CondDef->getOperand(0).getReg();
5155   if (!MRI.hasOneNonDBGUse(CondDefReg)) {
5156     // Unless it's another select.
5157     for (const MachineInstr &UI : MRI.use_nodbg_instructions(CondDefReg)) {
5158       if (CondDef == &UI)
5159         continue;
5160       if (UI.getOpcode() != TargetOpcode::G_SELECT)
5161         return false;
5162     }
5163   }
5164 
5165   // Is the condition defined by a compare?
5166   unsigned CondOpc = CondDef->getOpcode();
5167   if (CondOpc != TargetOpcode::G_ICMP && CondOpc != TargetOpcode::G_FCMP) {
5168     if (tryOptSelectConjunction(I, *CondDef))
5169       return true;
5170     return false;
5171   }
5172 
5173   AArch64CC::CondCode CondCode;
5174   if (CondOpc == TargetOpcode::G_ICMP) {
5175     auto Pred =
5176         static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate());
5177     CondCode = changeICMPPredToAArch64CC(Pred);
5178     emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3),
5179                        CondDef->getOperand(1), MIB);
5180   } else {
5181     // Get the condition code for the select.
5182     auto Pred =
5183         static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate());
5184     AArch64CC::CondCode CondCode2;
5185     changeFCMPPredToAArch64CC(Pred, CondCode, CondCode2);
5186 
5187     // changeFCMPPredToAArch64CC sets CondCode2 to AL when we require two
5188     // instructions to emit the comparison.
5189     // TODO: Handle FCMP_UEQ and FCMP_ONE. After that, this check will be
5190     // unnecessary.
5191     if (CondCode2 != AArch64CC::AL)
5192       return false;
5193 
5194     if (!emitFPCompare(CondDef->getOperand(2).getReg(),
5195                        CondDef->getOperand(3).getReg(), MIB)) {
5196       LLVM_DEBUG(dbgs() << "Couldn't emit compare for select!\n");
5197       return false;
5198     }
5199   }
5200 
5201   // Emit the select.
5202   emitSelect(I.getOperand(0).getReg(), I.getOperand(2).getReg(),
5203              I.getOperand(3).getReg(), CondCode, MIB);
5204   I.eraseFromParent();
5205   return true;
5206 }
5207 
5208 MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare(
5209     MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
5210     MachineIRBuilder &MIRBuilder) const {
5211   assert(LHS.isReg() && RHS.isReg() && Predicate.isPredicate() &&
5212          "Unexpected MachineOperand");
5213   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
5214   // We want to find this sort of thing:
5215   // x = G_SUB 0, y
5216   // G_ICMP z, x
5217   //
5218   // In this case, we can fold the G_SUB into the G_ICMP using a CMN instead.
5219   // e.g:
5220   //
5221   // cmn z, y
5222 
5223   // Check if the RHS or LHS of the G_ICMP is defined by a SUB
5224   MachineInstr *LHSDef = getDefIgnoringCopies(LHS.getReg(), MRI);
5225   MachineInstr *RHSDef = getDefIgnoringCopies(RHS.getReg(), MRI);
5226   auto P = static_cast<CmpInst::Predicate>(Predicate.getPredicate());
5227   // Given this:
5228   //
5229   // x = G_SUB 0, y
5230   // G_ICMP x, z
5231   //
5232   // Produce this:
5233   //
5234   // cmn y, z
5235   if (isCMN(LHSDef, P, MRI))
5236     return emitCMN(LHSDef->getOperand(2), RHS, MIRBuilder);
5237 
5238   // Same idea here, but with the RHS of the compare instead:
5239   //
5240   // Given this:
5241   //
5242   // x = G_SUB 0, y
5243   // G_ICMP z, x
5244   //
5245   // Produce this:
5246   //
5247   // cmn z, y
5248   if (isCMN(RHSDef, P, MRI))
5249     return emitCMN(LHS, RHSDef->getOperand(2), MIRBuilder);
5250 
5251   // Given this:
5252   //
5253   // z = G_AND x, y
5254   // G_ICMP z, 0
5255   //
5256   // Produce this if the compare is signed:
5257   //
5258   // tst x, y
5259   if (!CmpInst::isUnsigned(P) && LHSDef &&
5260       LHSDef->getOpcode() == TargetOpcode::G_AND) {
5261     // Make sure that the RHS is 0.
5262     auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS.getReg(), MRI);
5263     if (!ValAndVReg || ValAndVReg->Value != 0)
5264       return nullptr;
5265 
5266     return emitTST(LHSDef->getOperand(1),
5267                    LHSDef->getOperand(2), MIRBuilder);
5268   }
5269 
5270   return nullptr;
5271 }
5272 
5273 bool AArch64InstructionSelector::selectShuffleVector(
5274     MachineInstr &I, MachineRegisterInfo &MRI) {
5275   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
5276   Register Src1Reg = I.getOperand(1).getReg();
5277   const LLT Src1Ty = MRI.getType(Src1Reg);
5278   Register Src2Reg = I.getOperand(2).getReg();
5279   const LLT Src2Ty = MRI.getType(Src2Reg);
5280   ArrayRef<int> Mask = I.getOperand(3).getShuffleMask();
5281 
5282   MachineBasicBlock &MBB = *I.getParent();
5283   MachineFunction &MF = *MBB.getParent();
5284   LLVMContext &Ctx = MF.getFunction().getContext();
5285 
5286   // G_SHUFFLE_VECTOR is weird in that the source operands can be scalars, if
5287   // it's originated from a <1 x T> type. Those should have been lowered into
5288   // G_BUILD_VECTOR earlier.
5289   if (!Src1Ty.isVector() || !Src2Ty.isVector()) {
5290     LLVM_DEBUG(dbgs() << "Could not select a \"scalar\" G_SHUFFLE_VECTOR\n");
5291     return false;
5292   }
5293 
5294   unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8;
5295 
5296   SmallVector<Constant *, 64> CstIdxs;
5297   for (int Val : Mask) {
5298     // For now, any undef indexes we'll just assume to be 0. This should be
5299     // optimized in future, e.g. to select DUP etc.
5300     Val = Val < 0 ? 0 : Val;
5301     for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
5302       unsigned Offset = Byte + Val * BytesPerElt;
5303       CstIdxs.emplace_back(ConstantInt::get(Type::getInt8Ty(Ctx), Offset));
5304     }
5305   }
5306 
5307   // Use a constant pool to load the index vector for TBL.
5308   Constant *CPVal = ConstantVector::get(CstIdxs);
5309   MachineInstr *IndexLoad = emitLoadFromConstantPool(CPVal, MIB);
5310   if (!IndexLoad) {
5311     LLVM_DEBUG(dbgs() << "Could not load from a constant pool");
5312     return false;
5313   }
5314 
5315   if (DstTy.getSizeInBits() != 128) {
5316     assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty");
5317     // This case can be done with TBL1.
5318     MachineInstr *Concat =
5319         emitVectorConcat(std::nullopt, Src1Reg, Src2Reg, MIB);
5320     if (!Concat) {
5321       LLVM_DEBUG(dbgs() << "Could not do vector concat for tbl1");
5322       return false;
5323     }
5324 
5325     // The constant pool load will be 64 bits, so need to convert to FPR128 reg.
5326     IndexLoad = emitScalarToVector(64, &AArch64::FPR128RegClass,
5327                                    IndexLoad->getOperand(0).getReg(), MIB);
5328 
5329     auto TBL1 = MIB.buildInstr(
5330         AArch64::TBLv16i8One, {&AArch64::FPR128RegClass},
5331         {Concat->getOperand(0).getReg(), IndexLoad->getOperand(0).getReg()});
5332     constrainSelectedInstRegOperands(*TBL1, TII, TRI, RBI);
5333 
5334     auto Copy =
5335         MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {})
5336             .addReg(TBL1.getReg(0), 0, AArch64::dsub);
5337     RBI.constrainGenericRegister(Copy.getReg(0), AArch64::FPR64RegClass, MRI);
5338     I.eraseFromParent();
5339     return true;
5340   }
5341 
5342   // For TBL2 we need to emit a REG_SEQUENCE to tie together two consecutive
5343   // Q registers for regalloc.
5344   SmallVector<Register, 2> Regs = {Src1Reg, Src2Reg};
5345   auto RegSeq = createQTuple(Regs, MIB);
5346   auto TBL2 = MIB.buildInstr(AArch64::TBLv16i8Two, {I.getOperand(0)},
5347                              {RegSeq, IndexLoad->getOperand(0)});
5348   constrainSelectedInstRegOperands(*TBL2, TII, TRI, RBI);
5349   I.eraseFromParent();
5350   return true;
5351 }
5352 
5353 MachineInstr *AArch64InstructionSelector::emitLaneInsert(
5354     std::optional<Register> DstReg, Register SrcReg, Register EltReg,
5355     unsigned LaneIdx, const RegisterBank &RB,
5356     MachineIRBuilder &MIRBuilder) const {
5357   MachineInstr *InsElt = nullptr;
5358   const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
5359   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
5360 
5361   // Create a register to define with the insert if one wasn't passed in.
5362   if (!DstReg)
5363     DstReg = MRI.createVirtualRegister(DstRC);
5364 
5365   unsigned EltSize = MRI.getType(EltReg).getSizeInBits();
5366   unsigned Opc = getInsertVecEltOpInfo(RB, EltSize).first;
5367 
5368   if (RB.getID() == AArch64::FPRRegBankID) {
5369     auto InsSub = emitScalarToVector(EltSize, DstRC, EltReg, MIRBuilder);
5370     InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg})
5371                  .addImm(LaneIdx)
5372                  .addUse(InsSub->getOperand(0).getReg())
5373                  .addImm(0);
5374   } else {
5375     InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg})
5376                  .addImm(LaneIdx)
5377                  .addUse(EltReg);
5378   }
5379 
5380   constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI);
5381   return InsElt;
5382 }
5383 
5384 bool AArch64InstructionSelector::selectUSMovFromExtend(
5385     MachineInstr &MI, MachineRegisterInfo &MRI) {
5386   if (MI.getOpcode() != TargetOpcode::G_SEXT &&
5387       MI.getOpcode() != TargetOpcode::G_ZEXT &&
5388       MI.getOpcode() != TargetOpcode::G_ANYEXT)
5389     return false;
5390   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SEXT;
5391   const Register DefReg = MI.getOperand(0).getReg();
5392   const LLT DstTy = MRI.getType(DefReg);
5393   unsigned DstSize = DstTy.getSizeInBits();
5394 
5395   if (DstSize != 32 && DstSize != 64)
5396     return false;
5397 
5398   MachineInstr *Extract = getOpcodeDef(TargetOpcode::G_EXTRACT_VECTOR_ELT,
5399                                        MI.getOperand(1).getReg(), MRI);
5400   int64_t Lane;
5401   if (!Extract || !mi_match(Extract->getOperand(2).getReg(), MRI, m_ICst(Lane)))
5402     return false;
5403   Register Src0 = Extract->getOperand(1).getReg();
5404 
5405   const LLT &VecTy = MRI.getType(Src0);
5406 
5407   if (VecTy.getSizeInBits() != 128) {
5408     const MachineInstr *ScalarToVector = emitScalarToVector(
5409         VecTy.getSizeInBits(), &AArch64::FPR128RegClass, Src0, MIB);
5410     assert(ScalarToVector && "Didn't expect emitScalarToVector to fail!");
5411     Src0 = ScalarToVector->getOperand(0).getReg();
5412   }
5413 
5414   unsigned Opcode;
5415   if (DstSize == 64 && VecTy.getScalarSizeInBits() == 32)
5416     Opcode = IsSigned ? AArch64::SMOVvi32to64 : AArch64::UMOVvi32;
5417   else if (DstSize == 64 && VecTy.getScalarSizeInBits() == 16)
5418     Opcode = IsSigned ? AArch64::SMOVvi16to64 : AArch64::UMOVvi16;
5419   else if (DstSize == 64 && VecTy.getScalarSizeInBits() == 8)
5420     Opcode = IsSigned ? AArch64::SMOVvi8to64 : AArch64::UMOVvi8;
5421   else if (DstSize == 32 && VecTy.getScalarSizeInBits() == 16)
5422     Opcode = IsSigned ? AArch64::SMOVvi16to32 : AArch64::UMOVvi16;
5423   else if (DstSize == 32 && VecTy.getScalarSizeInBits() == 8)
5424     Opcode = IsSigned ? AArch64::SMOVvi8to32 : AArch64::UMOVvi8;
5425   else
5426     llvm_unreachable("Unexpected type combo for S/UMov!");
5427 
5428   // We may need to generate one of these, depending on the type and sign of the
5429   // input:
5430   //  DstReg = SMOV Src0, Lane;
5431   //  NewReg = UMOV Src0, Lane; DstReg = SUBREG_TO_REG NewReg, sub_32;
5432   MachineInstr *ExtI = nullptr;
5433   if (DstSize == 64 && !IsSigned) {
5434     Register NewReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
5435     MIB.buildInstr(Opcode, {NewReg}, {Src0}).addImm(Lane);
5436     ExtI = MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {})
5437                .addImm(0)
5438                .addUse(NewReg)
5439                .addImm(AArch64::sub_32);
5440     RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI);
5441   } else
5442     ExtI = MIB.buildInstr(Opcode, {DefReg}, {Src0}).addImm(Lane);
5443 
5444   constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
5445   MI.eraseFromParent();
5446   return true;
5447 }
5448 
5449 bool AArch64InstructionSelector::selectInsertElt(MachineInstr &I,
5450                                                  MachineRegisterInfo &MRI) {
5451   assert(I.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT);
5452 
5453   // Get information on the destination.
5454   Register DstReg = I.getOperand(0).getReg();
5455   const LLT DstTy = MRI.getType(DstReg);
5456   unsigned VecSize = DstTy.getSizeInBits();
5457 
5458   // Get information on the element we want to insert into the destination.
5459   Register EltReg = I.getOperand(2).getReg();
5460   const LLT EltTy = MRI.getType(EltReg);
5461   unsigned EltSize = EltTy.getSizeInBits();
5462   if (EltSize < 8 || EltSize > 64)
5463     return false;
5464 
5465   // Find the definition of the index. Bail out if it's not defined by a
5466   // G_CONSTANT.
5467   Register IdxReg = I.getOperand(3).getReg();
5468   auto VRegAndVal = getIConstantVRegValWithLookThrough(IdxReg, MRI);
5469   if (!VRegAndVal)
5470     return false;
5471   unsigned LaneIdx = VRegAndVal->Value.getSExtValue();
5472 
5473   // Perform the lane insert.
5474   Register SrcReg = I.getOperand(1).getReg();
5475   const RegisterBank &EltRB = *RBI.getRegBank(EltReg, MRI, TRI);
5476 
5477   if (VecSize < 128) {
5478     // If the vector we're inserting into is smaller than 128 bits, widen it
5479     // to 128 to do the insert.
5480     MachineInstr *ScalarToVec =
5481         emitScalarToVector(VecSize, &AArch64::FPR128RegClass, SrcReg, MIB);
5482     if (!ScalarToVec)
5483       return false;
5484     SrcReg = ScalarToVec->getOperand(0).getReg();
5485   }
5486 
5487   // Create an insert into a new FPR128 register.
5488   // Note that if our vector is already 128 bits, we end up emitting an extra
5489   // register.
5490   MachineInstr *InsMI =
5491       emitLaneInsert(std::nullopt, SrcReg, EltReg, LaneIdx, EltRB, MIB);
5492 
5493   if (VecSize < 128) {
5494     // If we had to widen to perform the insert, then we have to demote back to
5495     // the original size to get the result we want.
5496     Register DemoteVec = InsMI->getOperand(0).getReg();
5497     const TargetRegisterClass *RC =
5498         getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(DemoteVec, MRI, TRI));
5499     if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
5500       LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
5501       return false;
5502     }
5503     unsigned SubReg = 0;
5504     if (!getSubRegForClass(RC, TRI, SubReg))
5505       return false;
5506     if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
5507       LLVM_DEBUG(dbgs() << "Unsupported destination size! (" << VecSize
5508                         << "\n");
5509       return false;
5510     }
5511     MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
5512         .addReg(DemoteVec, 0, SubReg);
5513     RBI.constrainGenericRegister(DstReg, *RC, MRI);
5514   } else {
5515     // No widening needed.
5516     InsMI->getOperand(0).setReg(DstReg);
5517     constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI);
5518   }
5519 
5520   I.eraseFromParent();
5521   return true;
5522 }
5523 
5524 MachineInstr *
5525 AArch64InstructionSelector::emitConstantVector(Register Dst, Constant *CV,
5526                                                MachineIRBuilder &MIRBuilder,
5527                                                MachineRegisterInfo &MRI) {
5528   LLT DstTy = MRI.getType(Dst);
5529   unsigned DstSize = DstTy.getSizeInBits();
5530   if (CV->isNullValue()) {
5531     if (DstSize == 128) {
5532       auto Mov =
5533           MIRBuilder.buildInstr(AArch64::MOVIv2d_ns, {Dst}, {}).addImm(0);
5534       constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5535       return &*Mov;
5536     }
5537 
5538     if (DstSize == 64) {
5539       auto Mov =
5540           MIRBuilder
5541               .buildInstr(AArch64::MOVIv2d_ns, {&AArch64::FPR128RegClass}, {})
5542               .addImm(0);
5543       auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {Dst}, {})
5544                       .addReg(Mov.getReg(0), 0, AArch64::dsub);
5545       RBI.constrainGenericRegister(Dst, AArch64::FPR64RegClass, MRI);
5546       return &*Copy;
5547     }
5548   }
5549 
5550   auto *CPLoad = emitLoadFromConstantPool(CV, MIRBuilder);
5551   if (!CPLoad) {
5552     LLVM_DEBUG(dbgs() << "Could not generate cp load for constant vector!");
5553     return nullptr;
5554   }
5555 
5556   auto Copy = MIRBuilder.buildCopy(Dst, CPLoad->getOperand(0));
5557   RBI.constrainGenericRegister(
5558       Dst, *MRI.getRegClass(CPLoad->getOperand(0).getReg()), MRI);
5559   return &*Copy;
5560 }
5561 
5562 bool AArch64InstructionSelector::tryOptConstantBuildVec(
5563     MachineInstr &I, LLT DstTy, MachineRegisterInfo &MRI) {
5564   assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
5565   unsigned DstSize = DstTy.getSizeInBits();
5566   assert(DstSize <= 128 && "Unexpected build_vec type!");
5567   if (DstSize < 32)
5568     return false;
5569   // Check if we're building a constant vector, in which case we want to
5570   // generate a constant pool load instead of a vector insert sequence.
5571   SmallVector<Constant *, 16> Csts;
5572   for (unsigned Idx = 1; Idx < I.getNumOperands(); ++Idx) {
5573     // Try to find G_CONSTANT or G_FCONSTANT
5574     auto *OpMI =
5575         getOpcodeDef(TargetOpcode::G_CONSTANT, I.getOperand(Idx).getReg(), MRI);
5576     if (OpMI)
5577       Csts.emplace_back(
5578           const_cast<ConstantInt *>(OpMI->getOperand(1).getCImm()));
5579     else if ((OpMI = getOpcodeDef(TargetOpcode::G_FCONSTANT,
5580                                   I.getOperand(Idx).getReg(), MRI)))
5581       Csts.emplace_back(
5582           const_cast<ConstantFP *>(OpMI->getOperand(1).getFPImm()));
5583     else
5584       return false;
5585   }
5586   Constant *CV = ConstantVector::get(Csts);
5587   if (!emitConstantVector(I.getOperand(0).getReg(), CV, MIB, MRI))
5588     return false;
5589   I.eraseFromParent();
5590   return true;
5591 }
5592 
5593 bool AArch64InstructionSelector::tryOptBuildVecToSubregToReg(
5594     MachineInstr &I, MachineRegisterInfo &MRI) {
5595   // Given:
5596   //  %vec = G_BUILD_VECTOR %elt, %undef, %undef, ... %undef
5597   //
5598   // Select the G_BUILD_VECTOR as a SUBREG_TO_REG from %elt.
5599   Register Dst = I.getOperand(0).getReg();
5600   Register EltReg = I.getOperand(1).getReg();
5601   LLT EltTy = MRI.getType(EltReg);
5602   // If the index isn't on the same bank as its elements, then this can't be a
5603   // SUBREG_TO_REG.
5604   const RegisterBank &EltRB = *RBI.getRegBank(EltReg, MRI, TRI);
5605   const RegisterBank &DstRB = *RBI.getRegBank(Dst, MRI, TRI);
5606   if (EltRB != DstRB)
5607     return false;
5608   if (any_of(make_range(I.operands_begin() + 2, I.operands_end()),
5609              [&MRI](const MachineOperand &Op) {
5610                return !getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, Op.getReg(),
5611                                     MRI);
5612              }))
5613     return false;
5614   unsigned SubReg;
5615   const TargetRegisterClass *EltRC = getRegClassForTypeOnBank(EltTy, EltRB);
5616   if (!EltRC)
5617     return false;
5618   const TargetRegisterClass *DstRC =
5619       getRegClassForTypeOnBank(MRI.getType(Dst), DstRB);
5620   if (!DstRC)
5621     return false;
5622   if (!getSubRegForClass(EltRC, TRI, SubReg))
5623     return false;
5624   auto SubregToReg = MIB.buildInstr(AArch64::SUBREG_TO_REG, {Dst}, {})
5625                          .addImm(0)
5626                          .addUse(EltReg)
5627                          .addImm(SubReg);
5628   I.eraseFromParent();
5629   constrainSelectedInstRegOperands(*SubregToReg, TII, TRI, RBI);
5630   return RBI.constrainGenericRegister(Dst, *DstRC, MRI);
5631 }
5632 
5633 bool AArch64InstructionSelector::selectBuildVector(MachineInstr &I,
5634                                                    MachineRegisterInfo &MRI) {
5635   assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
5636   // Until we port more of the optimized selections, for now just use a vector
5637   // insert sequence.
5638   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
5639   const LLT EltTy = MRI.getType(I.getOperand(1).getReg());
5640   unsigned EltSize = EltTy.getSizeInBits();
5641 
5642   if (tryOptConstantBuildVec(I, DstTy, MRI))
5643     return true;
5644   if (tryOptBuildVecToSubregToReg(I, MRI))
5645     return true;
5646 
5647   if (EltSize != 8 && EltSize != 16 && EltSize != 32 && EltSize != 64)
5648     return false; // Don't support all element types yet.
5649   const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
5650 
5651   const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
5652   MachineInstr *ScalarToVec =
5653       emitScalarToVector(DstTy.getElementType().getSizeInBits(), DstRC,
5654                          I.getOperand(1).getReg(), MIB);
5655   if (!ScalarToVec)
5656     return false;
5657 
5658   Register DstVec = ScalarToVec->getOperand(0).getReg();
5659   unsigned DstSize = DstTy.getSizeInBits();
5660 
5661   // Keep track of the last MI we inserted. Later on, we might be able to save
5662   // a copy using it.
5663   MachineInstr *PrevMI = nullptr;
5664   for (unsigned i = 2, e = DstSize / EltSize + 1; i < e; ++i) {
5665     // Note that if we don't do a subregister copy, we can end up making an
5666     // extra register.
5667     PrevMI = &*emitLaneInsert(std::nullopt, DstVec, I.getOperand(i).getReg(),
5668                               i - 1, RB, MIB);
5669     DstVec = PrevMI->getOperand(0).getReg();
5670   }
5671 
5672   // If DstTy's size in bits is less than 128, then emit a subregister copy
5673   // from DstVec to the last register we've defined.
5674   if (DstSize < 128) {
5675     // Force this to be FPR using the destination vector.
5676     const TargetRegisterClass *RC =
5677         getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(DstVec, MRI, TRI));
5678     if (!RC)
5679       return false;
5680     if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
5681       LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
5682       return false;
5683     }
5684 
5685     unsigned SubReg = 0;
5686     if (!getSubRegForClass(RC, TRI, SubReg))
5687       return false;
5688     if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
5689       LLVM_DEBUG(dbgs() << "Unsupported destination size! (" << DstSize
5690                         << "\n");
5691       return false;
5692     }
5693 
5694     Register Reg = MRI.createVirtualRegister(RC);
5695     Register DstReg = I.getOperand(0).getReg();
5696 
5697     MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {}).addReg(DstVec, 0, SubReg);
5698     MachineOperand &RegOp = I.getOperand(1);
5699     RegOp.setReg(Reg);
5700     RBI.constrainGenericRegister(DstReg, *RC, MRI);
5701   } else {
5702     // We don't need a subregister copy. Save a copy by re-using the
5703     // destination register on the final insert.
5704     assert(PrevMI && "PrevMI was null?");
5705     PrevMI->getOperand(0).setReg(I.getOperand(0).getReg());
5706     constrainSelectedInstRegOperands(*PrevMI, TII, TRI, RBI);
5707   }
5708 
5709   I.eraseFromParent();
5710   return true;
5711 }
5712 
5713 bool AArch64InstructionSelector::selectVectorLoadIntrinsic(unsigned Opc,
5714                                                            unsigned NumVecs,
5715                                                            MachineInstr &I) {
5716   assert(I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
5717   assert(Opc && "Expected an opcode?");
5718   assert(NumVecs > 1 && NumVecs < 5 && "Only support 2, 3, or 4 vectors");
5719   auto &MRI = *MIB.getMRI();
5720   LLT Ty = MRI.getType(I.getOperand(0).getReg());
5721   unsigned Size = Ty.getSizeInBits();
5722   assert((Size == 64 || Size == 128) &&
5723          "Destination must be 64 bits or 128 bits?");
5724   unsigned SubReg = Size == 64 ? AArch64::dsub0 : AArch64::qsub0;
5725   auto Ptr = I.getOperand(I.getNumOperands() - 1).getReg();
5726   assert(MRI.getType(Ptr).isPointer() && "Expected a pointer type?");
5727   auto Load = MIB.buildInstr(Opc, {Ty}, {Ptr});
5728   Load.cloneMemRefs(I);
5729   constrainSelectedInstRegOperands(*Load, TII, TRI, RBI);
5730   Register SelectedLoadDst = Load->getOperand(0).getReg();
5731   for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
5732     auto Vec = MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(Idx)}, {})
5733                    .addReg(SelectedLoadDst, 0, SubReg + Idx);
5734     // Emit the subreg copies and immediately select them.
5735     // FIXME: We should refactor our copy code into an emitCopy helper and
5736     // clean up uses of this pattern elsewhere in the selector.
5737     selectCopy(*Vec, TII, MRI, TRI, RBI);
5738   }
5739   return true;
5740 }
5741 
5742 bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
5743     MachineInstr &I, MachineRegisterInfo &MRI) {
5744   // Find the intrinsic ID.
5745   unsigned IntrinID = I.getIntrinsicID();
5746 
5747   const LLT S8 = LLT::scalar(8);
5748   const LLT S16 = LLT::scalar(16);
5749   const LLT S32 = LLT::scalar(32);
5750   const LLT S64 = LLT::scalar(64);
5751   const LLT P0 = LLT::pointer(0, 64);
5752   // Select the instruction.
5753   switch (IntrinID) {
5754   default:
5755     return false;
5756   case Intrinsic::aarch64_ldxp:
5757   case Intrinsic::aarch64_ldaxp: {
5758     auto NewI = MIB.buildInstr(
5759         IntrinID == Intrinsic::aarch64_ldxp ? AArch64::LDXPX : AArch64::LDAXPX,
5760         {I.getOperand(0).getReg(), I.getOperand(1).getReg()},
5761         {I.getOperand(3)});
5762     NewI.cloneMemRefs(I);
5763     constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI);
5764     break;
5765   }
5766   case Intrinsic::trap:
5767     MIB.buildInstr(AArch64::BRK, {}, {}).addImm(1);
5768     break;
5769   case Intrinsic::debugtrap:
5770     MIB.buildInstr(AArch64::BRK, {}, {}).addImm(0xF000);
5771     break;
5772   case Intrinsic::ubsantrap:
5773     MIB.buildInstr(AArch64::BRK, {}, {})
5774         .addImm(I.getOperand(1).getImm() | ('U' << 8));
5775     break;
5776   case Intrinsic::aarch64_neon_ld2: {
5777     LLT Ty = MRI.getType(I.getOperand(0).getReg());
5778     unsigned Opc = 0;
5779     if (Ty == LLT::fixed_vector(8, S8))
5780       Opc = AArch64::LD2Twov8b;
5781     else if (Ty == LLT::fixed_vector(16, S8))
5782       Opc = AArch64::LD2Twov16b;
5783     else if (Ty == LLT::fixed_vector(4, S16))
5784       Opc = AArch64::LD2Twov4h;
5785     else if (Ty == LLT::fixed_vector(8, S16))
5786       Opc = AArch64::LD2Twov8h;
5787     else if (Ty == LLT::fixed_vector(2, S32))
5788       Opc = AArch64::LD2Twov2s;
5789     else if (Ty == LLT::fixed_vector(4, S32))
5790       Opc = AArch64::LD2Twov4s;
5791     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
5792       Opc = AArch64::LD2Twov2d;
5793     else if (Ty == S64 || Ty == P0)
5794       Opc = AArch64::LD1Twov1d;
5795     else
5796       llvm_unreachable("Unexpected type for ld2!");
5797     selectVectorLoadIntrinsic(Opc, 2, I);
5798     break;
5799   }
5800   case Intrinsic::aarch64_neon_ld4: {
5801     LLT Ty = MRI.getType(I.getOperand(0).getReg());
5802     unsigned Opc = 0;
5803     if (Ty == LLT::fixed_vector(8, S8))
5804       Opc = AArch64::LD4Fourv8b;
5805     else if (Ty == LLT::fixed_vector(16, S8))
5806       Opc = AArch64::LD4Fourv16b;
5807     else if (Ty == LLT::fixed_vector(4, S16))
5808       Opc = AArch64::LD4Fourv4h;
5809     else if (Ty == LLT::fixed_vector(8, S16))
5810       Opc = AArch64::LD4Fourv8h;
5811     else if (Ty == LLT::fixed_vector(2, S32))
5812       Opc = AArch64::LD4Fourv2s;
5813     else if (Ty == LLT::fixed_vector(4, S32))
5814       Opc = AArch64::LD4Fourv4s;
5815     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
5816       Opc = AArch64::LD4Fourv2d;
5817     else if (Ty == S64 || Ty == P0)
5818       Opc = AArch64::LD1Fourv1d;
5819     else
5820       llvm_unreachable("Unexpected type for ld4!");
5821     selectVectorLoadIntrinsic(Opc, 4, I);
5822     break;
5823   }
5824   case Intrinsic::aarch64_neon_st2: {
5825     Register Src1 = I.getOperand(1).getReg();
5826     Register Src2 = I.getOperand(2).getReg();
5827     Register Ptr = I.getOperand(3).getReg();
5828     LLT Ty = MRI.getType(Src1);
5829     unsigned Opc;
5830     if (Ty == LLT::fixed_vector(8, S8))
5831       Opc = AArch64::ST2Twov8b;
5832     else if (Ty == LLT::fixed_vector(16, S8))
5833       Opc = AArch64::ST2Twov16b;
5834     else if (Ty == LLT::fixed_vector(4, S16))
5835       Opc = AArch64::ST2Twov4h;
5836     else if (Ty == LLT::fixed_vector(8, S16))
5837       Opc = AArch64::ST2Twov8h;
5838     else if (Ty == LLT::fixed_vector(2, S32))
5839       Opc = AArch64::ST2Twov2s;
5840     else if (Ty == LLT::fixed_vector(4, S32))
5841       Opc = AArch64::ST2Twov4s;
5842     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
5843       Opc = AArch64::ST2Twov2d;
5844     else if (Ty == S64 || Ty == P0)
5845       Opc = AArch64::ST1Twov1d;
5846     else
5847       llvm_unreachable("Unexpected type for st2!");
5848     SmallVector<Register, 2> Regs = {Src1, Src2};
5849     Register Tuple = Ty.getSizeInBits() == 128 ? createQTuple(Regs, MIB)
5850                                                : createDTuple(Regs, MIB);
5851     auto Store = MIB.buildInstr(Opc, {}, {Tuple, Ptr});
5852     Store.cloneMemRefs(I);
5853     constrainSelectedInstRegOperands(*Store, TII, TRI, RBI);
5854     break;
5855   }
5856   case Intrinsic::aarch64_mops_memset_tag: {
5857     // Transform
5858     //    %dst:gpr(p0) = \
5859     //      G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aarch64.mops.memset.tag),
5860     //      \ %dst:gpr(p0), %val:gpr(s64), %n:gpr(s64)
5861     // where %dst is updated, into
5862     //    %Rd:GPR64common, %Rn:GPR64) = \
5863     //      MOPSMemorySetTaggingPseudo \
5864     //      %Rd:GPR64common, %Rn:GPR64, %Rm:GPR64
5865     // where Rd and Rn are tied.
5866     // It is expected that %val has been extended to s64 in legalization.
5867     // Note that the order of the size/value operands are swapped.
5868 
5869     Register DstDef = I.getOperand(0).getReg();
5870     // I.getOperand(1) is the intrinsic function
5871     Register DstUse = I.getOperand(2).getReg();
5872     Register ValUse = I.getOperand(3).getReg();
5873     Register SizeUse = I.getOperand(4).getReg();
5874 
5875     // MOPSMemorySetTaggingPseudo has two defs; the intrinsic call has only one.
5876     // Therefore an additional virtual register is requried for the updated size
5877     // operand. This value is not accessible via the semantics of the intrinsic.
5878     Register SizeDef = MRI.createGenericVirtualRegister(LLT::scalar(64));
5879 
5880     auto Memset = MIB.buildInstr(AArch64::MOPSMemorySetTaggingPseudo,
5881                                  {DstDef, SizeDef}, {DstUse, SizeUse, ValUse});
5882     Memset.cloneMemRefs(I);
5883     constrainSelectedInstRegOperands(*Memset, TII, TRI, RBI);
5884     break;
5885   }
5886   }
5887 
5888   I.eraseFromParent();
5889   return true;
5890 }
5891 
5892 bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I,
5893                                                  MachineRegisterInfo &MRI) {
5894   unsigned IntrinID = I.getIntrinsicID();
5895 
5896   switch (IntrinID) {
5897   default:
5898     break;
5899   case Intrinsic::aarch64_crypto_sha1h: {
5900     Register DstReg = I.getOperand(0).getReg();
5901     Register SrcReg = I.getOperand(2).getReg();
5902 
5903     // FIXME: Should this be an assert?
5904     if (MRI.getType(DstReg).getSizeInBits() != 32 ||
5905         MRI.getType(SrcReg).getSizeInBits() != 32)
5906       return false;
5907 
5908     // The operation has to happen on FPRs. Set up some new FPR registers for
5909     // the source and destination if they are on GPRs.
5910     if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) {
5911       SrcReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
5912       MIB.buildCopy({SrcReg}, {I.getOperand(2)});
5913 
5914       // Make sure the copy ends up getting constrained properly.
5915       RBI.constrainGenericRegister(I.getOperand(2).getReg(),
5916                                    AArch64::GPR32RegClass, MRI);
5917     }
5918 
5919     if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID)
5920       DstReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
5921 
5922     // Actually insert the instruction.
5923     auto SHA1Inst = MIB.buildInstr(AArch64::SHA1Hrr, {DstReg}, {SrcReg});
5924     constrainSelectedInstRegOperands(*SHA1Inst, TII, TRI, RBI);
5925 
5926     // Did we create a new register for the destination?
5927     if (DstReg != I.getOperand(0).getReg()) {
5928       // Yep. Copy the result of the instruction back into the original
5929       // destination.
5930       MIB.buildCopy({I.getOperand(0)}, {DstReg});
5931       RBI.constrainGenericRegister(I.getOperand(0).getReg(),
5932                                    AArch64::GPR32RegClass, MRI);
5933     }
5934 
5935     I.eraseFromParent();
5936     return true;
5937   }
5938   case Intrinsic::ptrauth_sign: {
5939     Register DstReg = I.getOperand(0).getReg();
5940     Register ValReg = I.getOperand(2).getReg();
5941     uint64_t Key = I.getOperand(3).getImm();
5942     Register DiscReg = I.getOperand(4).getReg();
5943     auto DiscVal = getIConstantVRegVal(DiscReg, MRI);
5944     bool IsDiscZero = DiscVal && DiscVal->isZero();
5945 
5946     if (Key > AArch64PACKey::LAST)
5947       return false;
5948 
5949     unsigned Opcodes[][4] = {
5950         {AArch64::PACIA, AArch64::PACIB, AArch64::PACDA, AArch64::PACDB},
5951         {AArch64::PACIZA, AArch64::PACIZB, AArch64::PACDZA, AArch64::PACDZB}};
5952     unsigned Opcode = Opcodes[IsDiscZero][Key];
5953 
5954     auto PAC = MIB.buildInstr(Opcode, {DstReg}, {ValReg});
5955 
5956     if (!IsDiscZero) {
5957       PAC.addUse(DiscReg);
5958       RBI.constrainGenericRegister(DiscReg, AArch64::GPR64spRegClass, MRI);
5959     }
5960 
5961     RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI);
5962     I.eraseFromParent();
5963     return true;
5964   }
5965   case Intrinsic::ptrauth_strip: {
5966     Register DstReg = I.getOperand(0).getReg();
5967     Register ValReg = I.getOperand(2).getReg();
5968     uint64_t Key = I.getOperand(3).getImm();
5969 
5970     if (Key > AArch64PACKey::LAST)
5971       return false;
5972     unsigned Opcode = getXPACOpcodeForKey((AArch64PACKey::ID)Key);
5973 
5974     MIB.buildInstr(Opcode, {DstReg}, {ValReg});
5975 
5976     RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI);
5977     RBI.constrainGenericRegister(ValReg, AArch64::GPR64RegClass, MRI);
5978     I.eraseFromParent();
5979     return true;
5980   }
5981   case Intrinsic::ptrauth_blend: {
5982     MachineFunction &MF = *I.getParent()->getParent();
5983     auto RHS = getIConstantVRegVal(I.getOperand(3).getReg(), MRI);
5984     if (RHS && (RHS->getZExtValue() <= 0xffff)) {
5985       I.setDesc(TII.get(AArch64::MOVKXi));
5986       I.removeOperand(3);
5987       I.removeOperand(1);
5988       MachineInstrBuilder(MF, I)
5989           .addImm(RHS->getZExtValue() & 0xffff)
5990           .addImm(48)
5991           .constrainAllUses(TII, TRI, RBI);
5992     } else {
5993       I.setDesc(TII.get(AArch64::BFMXri));
5994       I.removeOperand(1);
5995       MachineInstrBuilder(MF, I).addImm(16).addImm(15).constrainAllUses(
5996           TII, TRI, RBI);
5997     }
5998     return true;
5999   }
6000   case Intrinsic::frameaddress:
6001   case Intrinsic::returnaddress: {
6002     MachineFunction &MF = *I.getParent()->getParent();
6003     MachineFrameInfo &MFI = MF.getFrameInfo();
6004 
6005     unsigned Depth = I.getOperand(2).getImm();
6006     Register DstReg = I.getOperand(0).getReg();
6007     RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI);
6008 
6009     if (Depth == 0 && IntrinID == Intrinsic::returnaddress) {
6010       if (!MFReturnAddr) {
6011         // Insert the copy from LR/X30 into the entry block, before it can be
6012         // clobbered by anything.
6013         MFI.setReturnAddressIsTaken(true);
6014         MFReturnAddr = getFunctionLiveInPhysReg(
6015             MF, TII, AArch64::LR, AArch64::GPR64RegClass, I.getDebugLoc());
6016       }
6017 
6018       if (STI.hasPAuth()) {
6019         MIB.buildInstr(AArch64::XPACI, {DstReg}, {MFReturnAddr});
6020       } else {
6021         MIB.buildCopy({Register(AArch64::LR)}, {MFReturnAddr});
6022         MIB.buildInstr(AArch64::XPACLRI);
6023         MIB.buildCopy({DstReg}, {Register(AArch64::LR)});
6024       }
6025 
6026       I.eraseFromParent();
6027       return true;
6028     }
6029 
6030     MFI.setFrameAddressIsTaken(true);
6031     Register FrameAddr(AArch64::FP);
6032     while (Depth--) {
6033       Register NextFrame = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
6034       auto Ldr =
6035           MIB.buildInstr(AArch64::LDRXui, {NextFrame}, {FrameAddr}).addImm(0);
6036       constrainSelectedInstRegOperands(*Ldr, TII, TRI, RBI);
6037       FrameAddr = NextFrame;
6038     }
6039 
6040     if (IntrinID == Intrinsic::frameaddress)
6041       MIB.buildCopy({DstReg}, {FrameAddr});
6042     else {
6043       MFI.setReturnAddressIsTaken(true);
6044 
6045       if (STI.hasPAuth()) {
6046         Register TmpReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
6047         MIB.buildInstr(AArch64::LDRXui, {TmpReg}, {FrameAddr}).addImm(1);
6048         MIB.buildInstr(AArch64::XPACI, {DstReg}, {TmpReg});
6049       } else {
6050         MIB.buildInstr(AArch64::LDRXui, {Register(AArch64::LR)}, {FrameAddr})
6051             .addImm(1);
6052         MIB.buildInstr(AArch64::XPACLRI);
6053         MIB.buildCopy({DstReg}, {Register(AArch64::LR)});
6054       }
6055     }
6056 
6057     I.eraseFromParent();
6058     return true;
6059   }
6060   case Intrinsic::swift_async_context_addr:
6061     auto Sub = MIB.buildInstr(AArch64::SUBXri, {I.getOperand(0).getReg()},
6062                               {Register(AArch64::FP)})
6063                    .addImm(8)
6064                    .addImm(0);
6065     constrainSelectedInstRegOperands(*Sub, TII, TRI, RBI);
6066 
6067     MF->getFrameInfo().setFrameAddressIsTaken(true);
6068     MF->getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
6069     I.eraseFromParent();
6070     return true;
6071   }
6072   return false;
6073 }
6074 
6075 InstructionSelector::ComplexRendererFns
6076 AArch64InstructionSelector::selectShiftA_32(const MachineOperand &Root) const {
6077   auto MaybeImmed = getImmedFromMO(Root);
6078   if (MaybeImmed == std::nullopt || *MaybeImmed > 31)
6079     return std::nullopt;
6080   uint64_t Enc = (32 - *MaybeImmed) & 0x1f;
6081   return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
6082 }
6083 
6084 InstructionSelector::ComplexRendererFns
6085 AArch64InstructionSelector::selectShiftB_32(const MachineOperand &Root) const {
6086   auto MaybeImmed = getImmedFromMO(Root);
6087   if (MaybeImmed == std::nullopt || *MaybeImmed > 31)
6088     return std::nullopt;
6089   uint64_t Enc = 31 - *MaybeImmed;
6090   return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
6091 }
6092 
6093 InstructionSelector::ComplexRendererFns
6094 AArch64InstructionSelector::selectShiftA_64(const MachineOperand &Root) const {
6095   auto MaybeImmed = getImmedFromMO(Root);
6096   if (MaybeImmed == std::nullopt || *MaybeImmed > 63)
6097     return std::nullopt;
6098   uint64_t Enc = (64 - *MaybeImmed) & 0x3f;
6099   return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
6100 }
6101 
6102 InstructionSelector::ComplexRendererFns
6103 AArch64InstructionSelector::selectShiftB_64(const MachineOperand &Root) const {
6104   auto MaybeImmed = getImmedFromMO(Root);
6105   if (MaybeImmed == std::nullopt || *MaybeImmed > 63)
6106     return std::nullopt;
6107   uint64_t Enc = 63 - *MaybeImmed;
6108   return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
6109 }
6110 
6111 /// Helper to select an immediate value that can be represented as a 12-bit
6112 /// value shifted left by either 0 or 12. If it is possible to do so, return
6113 /// the immediate and shift value. If not, return std::nullopt.
6114 ///
6115 /// Used by selectArithImmed and selectNegArithImmed.
6116 InstructionSelector::ComplexRendererFns
6117 AArch64InstructionSelector::select12BitValueWithLeftShift(
6118     uint64_t Immed) const {
6119   unsigned ShiftAmt;
6120   if (Immed >> 12 == 0) {
6121     ShiftAmt = 0;
6122   } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) {
6123     ShiftAmt = 12;
6124     Immed = Immed >> 12;
6125   } else
6126     return std::nullopt;
6127 
6128   unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt);
6129   return {{
6130       [=](MachineInstrBuilder &MIB) { MIB.addImm(Immed); },
6131       [=](MachineInstrBuilder &MIB) { MIB.addImm(ShVal); },
6132   }};
6133 }
6134 
6135 /// SelectArithImmed - Select an immediate value that can be represented as
6136 /// a 12-bit value shifted left by either 0 or 12.  If so, return true with
6137 /// Val set to the 12-bit value and Shift set to the shifter operand.
6138 InstructionSelector::ComplexRendererFns
6139 AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const {
6140   // This function is called from the addsub_shifted_imm ComplexPattern,
6141   // which lists [imm] as the list of opcode it's interested in, however
6142   // we still need to check whether the operand is actually an immediate
6143   // here because the ComplexPattern opcode list is only used in
6144   // root-level opcode matching.
6145   auto MaybeImmed = getImmedFromMO(Root);
6146   if (MaybeImmed == std::nullopt)
6147     return std::nullopt;
6148   return select12BitValueWithLeftShift(*MaybeImmed);
6149 }
6150 
6151 /// SelectNegArithImmed - As above, but negates the value before trying to
6152 /// select it.
6153 InstructionSelector::ComplexRendererFns
6154 AArch64InstructionSelector::selectNegArithImmed(MachineOperand &Root) const {
6155   // We need a register here, because we need to know if we have a 64 or 32
6156   // bit immediate.
6157   if (!Root.isReg())
6158     return std::nullopt;
6159   auto MaybeImmed = getImmedFromMO(Root);
6160   if (MaybeImmed == std::nullopt)
6161     return std::nullopt;
6162   uint64_t Immed = *MaybeImmed;
6163 
6164   // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0"
6165   // have the opposite effect on the C flag, so this pattern mustn't match under
6166   // those circumstances.
6167   if (Immed == 0)
6168     return std::nullopt;
6169 
6170   // Check if we're dealing with a 32-bit type on the root or a 64-bit type on
6171   // the root.
6172   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
6173   if (MRI.getType(Root.getReg()).getSizeInBits() == 32)
6174     Immed = ~((uint32_t)Immed) + 1;
6175   else
6176     Immed = ~Immed + 1ULL;
6177 
6178   if (Immed & 0xFFFFFFFFFF000000ULL)
6179     return std::nullopt;
6180 
6181   Immed &= 0xFFFFFFULL;
6182   return select12BitValueWithLeftShift(Immed);
6183 }
6184 
6185 /// Return true if it is worth folding MI into an extended register. That is,
6186 /// if it's safe to pull it into the addressing mode of a load or store as a
6187 /// shift.
6188 bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg(
6189     MachineInstr &MI, const MachineRegisterInfo &MRI) const {
6190   // Always fold if there is one use, or if we're optimizing for size.
6191   Register DefReg = MI.getOperand(0).getReg();
6192   if (MRI.hasOneNonDBGUse(DefReg) ||
6193       MI.getParent()->getParent()->getFunction().hasOptSize())
6194     return true;
6195 
6196   // It's better to avoid folding and recomputing shifts when we don't have a
6197   // fastpath.
6198   if (!STI.hasLSLFast())
6199     return false;
6200 
6201   // We have a fastpath, so folding a shift in and potentially computing it
6202   // many times may be beneficial. Check if this is only used in memory ops.
6203   // If it is, then we should fold.
6204   return all_of(MRI.use_nodbg_instructions(DefReg),
6205                 [](MachineInstr &Use) { return Use.mayLoadOrStore(); });
6206 }
6207 
6208 static bool isSignExtendShiftType(AArch64_AM::ShiftExtendType Type) {
6209   switch (Type) {
6210   case AArch64_AM::SXTB:
6211   case AArch64_AM::SXTH:
6212   case AArch64_AM::SXTW:
6213     return true;
6214   default:
6215     return false;
6216   }
6217 }
6218 
6219 InstructionSelector::ComplexRendererFns
6220 AArch64InstructionSelector::selectExtendedSHL(
6221     MachineOperand &Root, MachineOperand &Base, MachineOperand &Offset,
6222     unsigned SizeInBytes, bool WantsExt) const {
6223   assert(Base.isReg() && "Expected base to be a register operand");
6224   assert(Offset.isReg() && "Expected offset to be a register operand");
6225 
6226   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
6227   MachineInstr *OffsetInst = MRI.getVRegDef(Offset.getReg());
6228 
6229   unsigned OffsetOpc = OffsetInst->getOpcode();
6230   bool LookedThroughZExt = false;
6231   if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL) {
6232     // Try to look through a ZEXT.
6233     if (OffsetOpc != TargetOpcode::G_ZEXT || !WantsExt)
6234       return std::nullopt;
6235 
6236     OffsetInst = MRI.getVRegDef(OffsetInst->getOperand(1).getReg());
6237     OffsetOpc = OffsetInst->getOpcode();
6238     LookedThroughZExt = true;
6239 
6240     if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL)
6241       return std::nullopt;
6242   }
6243   // Make sure that the memory op is a valid size.
6244   int64_t LegalShiftVal = Log2_32(SizeInBytes);
6245   if (LegalShiftVal == 0)
6246     return std::nullopt;
6247   if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI))
6248     return std::nullopt;
6249 
6250   // Now, try to find the specific G_CONSTANT. Start by assuming that the
6251   // register we will offset is the LHS, and the register containing the
6252   // constant is the RHS.
6253   Register OffsetReg = OffsetInst->getOperand(1).getReg();
6254   Register ConstantReg = OffsetInst->getOperand(2).getReg();
6255   auto ValAndVReg = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
6256   if (!ValAndVReg) {
6257     // We didn't get a constant on the RHS. If the opcode is a shift, then
6258     // we're done.
6259     if (OffsetOpc == TargetOpcode::G_SHL)
6260       return std::nullopt;
6261 
6262     // If we have a G_MUL, we can use either register. Try looking at the RHS.
6263     std::swap(OffsetReg, ConstantReg);
6264     ValAndVReg = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
6265     if (!ValAndVReg)
6266       return std::nullopt;
6267   }
6268 
6269   // The value must fit into 3 bits, and must be positive. Make sure that is
6270   // true.
6271   int64_t ImmVal = ValAndVReg->Value.getSExtValue();
6272 
6273   // Since we're going to pull this into a shift, the constant value must be
6274   // a power of 2. If we got a multiply, then we need to check this.
6275   if (OffsetOpc == TargetOpcode::G_MUL) {
6276     if (!llvm::has_single_bit<uint32_t>(ImmVal))
6277       return std::nullopt;
6278 
6279     // Got a power of 2. So, the amount we'll shift is the log base-2 of that.
6280     ImmVal = Log2_32(ImmVal);
6281   }
6282 
6283   if ((ImmVal & 0x7) != ImmVal)
6284     return std::nullopt;
6285 
6286   // We are only allowed to shift by LegalShiftVal. This shift value is built
6287   // into the instruction, so we can't just use whatever we want.
6288   if (ImmVal != LegalShiftVal)
6289     return std::nullopt;
6290 
6291   unsigned SignExtend = 0;
6292   if (WantsExt) {
6293     // Check if the offset is defined by an extend, unless we looked through a
6294     // G_ZEXT earlier.
6295     if (!LookedThroughZExt) {
6296       MachineInstr *ExtInst = getDefIgnoringCopies(OffsetReg, MRI);
6297       auto Ext = getExtendTypeForInst(*ExtInst, MRI, true);
6298       if (Ext == AArch64_AM::InvalidShiftExtend)
6299         return std::nullopt;
6300 
6301       SignExtend = isSignExtendShiftType(Ext) ? 1 : 0;
6302       // We only support SXTW for signed extension here.
6303       if (SignExtend && Ext != AArch64_AM::SXTW)
6304         return std::nullopt;
6305       OffsetReg = ExtInst->getOperand(1).getReg();
6306     }
6307 
6308     // Need a 32-bit wide register here.
6309     MachineIRBuilder MIB(*MRI.getVRegDef(Root.getReg()));
6310     OffsetReg = moveScalarRegClass(OffsetReg, AArch64::GPR32RegClass, MIB);
6311   }
6312 
6313   // We can use the LHS of the GEP as the base, and the LHS of the shift as an
6314   // offset. Signify that we are shifting by setting the shift flag to 1.
6315   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(Base.getReg()); },
6316            [=](MachineInstrBuilder &MIB) { MIB.addUse(OffsetReg); },
6317            [=](MachineInstrBuilder &MIB) {
6318              // Need to add both immediates here to make sure that they are both
6319              // added to the instruction.
6320              MIB.addImm(SignExtend);
6321              MIB.addImm(1);
6322            }}};
6323 }
6324 
6325 /// This is used for computing addresses like this:
6326 ///
6327 /// ldr x1, [x2, x3, lsl #3]
6328 ///
6329 /// Where x2 is the base register, and x3 is an offset register. The shift-left
6330 /// is a constant value specific to this load instruction. That is, we'll never
6331 /// see anything other than a 3 here (which corresponds to the size of the
6332 /// element being loaded.)
6333 InstructionSelector::ComplexRendererFns
6334 AArch64InstructionSelector::selectAddrModeShiftedExtendXReg(
6335     MachineOperand &Root, unsigned SizeInBytes) const {
6336   if (!Root.isReg())
6337     return std::nullopt;
6338   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
6339 
6340   // We want to find something like this:
6341   //
6342   // val = G_CONSTANT LegalShiftVal
6343   // shift = G_SHL off_reg val
6344   // ptr = G_PTR_ADD base_reg shift
6345   // x = G_LOAD ptr
6346   //
6347   // And fold it into this addressing mode:
6348   //
6349   // ldr x, [base_reg, off_reg, lsl #LegalShiftVal]
6350 
6351   // Check if we can find the G_PTR_ADD.
6352   MachineInstr *PtrAdd =
6353       getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
6354   if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI))
6355     return std::nullopt;
6356 
6357   // Now, try to match an opcode which will match our specific offset.
6358   // We want a G_SHL or a G_MUL.
6359   MachineInstr *OffsetInst =
6360       getDefIgnoringCopies(PtrAdd->getOperand(2).getReg(), MRI);
6361   return selectExtendedSHL(Root, PtrAdd->getOperand(1),
6362                            OffsetInst->getOperand(0), SizeInBytes,
6363                            /*WantsExt=*/false);
6364 }
6365 
6366 /// This is used for computing addresses like this:
6367 ///
6368 /// ldr x1, [x2, x3]
6369 ///
6370 /// Where x2 is the base register, and x3 is an offset register.
6371 ///
6372 /// When possible (or profitable) to fold a G_PTR_ADD into the address
6373 /// calculation, this will do so. Otherwise, it will return std::nullopt.
6374 InstructionSelector::ComplexRendererFns
6375 AArch64InstructionSelector::selectAddrModeRegisterOffset(
6376     MachineOperand &Root) const {
6377   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
6378 
6379   // We need a GEP.
6380   MachineInstr *Gep = MRI.getVRegDef(Root.getReg());
6381   if (Gep->getOpcode() != TargetOpcode::G_PTR_ADD)
6382     return std::nullopt;
6383 
6384   // If this is used more than once, let's not bother folding.
6385   // TODO: Check if they are memory ops. If they are, then we can still fold
6386   // without having to recompute anything.
6387   if (!MRI.hasOneNonDBGUse(Gep->getOperand(0).getReg()))
6388     return std::nullopt;
6389 
6390   // Base is the GEP's LHS, offset is its RHS.
6391   return {{[=](MachineInstrBuilder &MIB) {
6392              MIB.addUse(Gep->getOperand(1).getReg());
6393            },
6394            [=](MachineInstrBuilder &MIB) {
6395              MIB.addUse(Gep->getOperand(2).getReg());
6396            },
6397            [=](MachineInstrBuilder &MIB) {
6398              // Need to add both immediates here to make sure that they are both
6399              // added to the instruction.
6400              MIB.addImm(0);
6401              MIB.addImm(0);
6402            }}};
6403 }
6404 
6405 /// This is intended to be equivalent to selectAddrModeXRO in
6406 /// AArch64ISelDAGtoDAG. It's used for selecting X register offset loads.
6407 InstructionSelector::ComplexRendererFns
6408 AArch64InstructionSelector::selectAddrModeXRO(MachineOperand &Root,
6409                                               unsigned SizeInBytes) const {
6410   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
6411   if (!Root.isReg())
6412     return std::nullopt;
6413   MachineInstr *PtrAdd =
6414       getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
6415   if (!PtrAdd)
6416     return std::nullopt;
6417 
6418   // Check for an immediates which cannot be encoded in the [base + imm]
6419   // addressing mode, and can't be encoded in an add/sub. If this happens, we'll
6420   // end up with code like:
6421   //
6422   // mov x0, wide
6423   // add x1 base, x0
6424   // ldr x2, [x1, x0]
6425   //
6426   // In this situation, we can use the [base, xreg] addressing mode to save an
6427   // add/sub:
6428   //
6429   // mov x0, wide
6430   // ldr x2, [base, x0]
6431   auto ValAndVReg =
6432       getIConstantVRegValWithLookThrough(PtrAdd->getOperand(2).getReg(), MRI);
6433   if (ValAndVReg) {
6434     unsigned Scale = Log2_32(SizeInBytes);
6435     int64_t ImmOff = ValAndVReg->Value.getSExtValue();
6436 
6437     // Skip immediates that can be selected in the load/store addresing
6438     // mode.
6439     if (ImmOff % SizeInBytes == 0 && ImmOff >= 0 &&
6440         ImmOff < (0x1000 << Scale))
6441       return std::nullopt;
6442 
6443     // Helper lambda to decide whether or not it is preferable to emit an add.
6444     auto isPreferredADD = [](int64_t ImmOff) {
6445       // Constants in [0x0, 0xfff] can be encoded in an add.
6446       if ((ImmOff & 0xfffffffffffff000LL) == 0x0LL)
6447         return true;
6448 
6449       // Can it be encoded in an add lsl #12?
6450       if ((ImmOff & 0xffffffffff000fffLL) != 0x0LL)
6451         return false;
6452 
6453       // It can be encoded in an add lsl #12, but we may not want to. If it is
6454       // possible to select this as a single movz, then prefer that. A single
6455       // movz is faster than an add with a shift.
6456       return (ImmOff & 0xffffffffff00ffffLL) != 0x0LL &&
6457              (ImmOff & 0xffffffffffff0fffLL) != 0x0LL;
6458     };
6459 
6460     // If the immediate can be encoded in a single add/sub, then bail out.
6461     if (isPreferredADD(ImmOff) || isPreferredADD(-ImmOff))
6462       return std::nullopt;
6463   }
6464 
6465   // Try to fold shifts into the addressing mode.
6466   auto AddrModeFns = selectAddrModeShiftedExtendXReg(Root, SizeInBytes);
6467   if (AddrModeFns)
6468     return AddrModeFns;
6469 
6470   // If that doesn't work, see if it's possible to fold in registers from
6471   // a GEP.
6472   return selectAddrModeRegisterOffset(Root);
6473 }
6474 
6475 /// This is used for computing addresses like this:
6476 ///
6477 /// ldr x0, [xBase, wOffset, sxtw #LegalShiftVal]
6478 ///
6479 /// Where we have a 64-bit base register, a 32-bit offset register, and an
6480 /// extend (which may or may not be signed).
6481 InstructionSelector::ComplexRendererFns
6482 AArch64InstructionSelector::selectAddrModeWRO(MachineOperand &Root,
6483                                               unsigned SizeInBytes) const {
6484   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
6485 
6486   MachineInstr *PtrAdd =
6487       getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
6488   if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI))
6489     return std::nullopt;
6490 
6491   MachineOperand &LHS = PtrAdd->getOperand(1);
6492   MachineOperand &RHS = PtrAdd->getOperand(2);
6493   MachineInstr *OffsetInst = getDefIgnoringCopies(RHS.getReg(), MRI);
6494 
6495   // The first case is the same as selectAddrModeXRO, except we need an extend.
6496   // In this case, we try to find a shift and extend, and fold them into the
6497   // addressing mode.
6498   //
6499   // E.g.
6500   //
6501   // off_reg = G_Z/S/ANYEXT ext_reg
6502   // val = G_CONSTANT LegalShiftVal
6503   // shift = G_SHL off_reg val
6504   // ptr = G_PTR_ADD base_reg shift
6505   // x = G_LOAD ptr
6506   //
6507   // In this case we can get a load like this:
6508   //
6509   // ldr x0, [base_reg, ext_reg, sxtw #LegalShiftVal]
6510   auto ExtendedShl = selectExtendedSHL(Root, LHS, OffsetInst->getOperand(0),
6511                                        SizeInBytes, /*WantsExt=*/true);
6512   if (ExtendedShl)
6513     return ExtendedShl;
6514 
6515   // There was no shift. We can try and fold a G_Z/S/ANYEXT in alone though.
6516   //
6517   // e.g.
6518   // ldr something, [base_reg, ext_reg, sxtw]
6519   if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI))
6520     return std::nullopt;
6521 
6522   // Check if this is an extend. We'll get an extend type if it is.
6523   AArch64_AM::ShiftExtendType Ext =
6524       getExtendTypeForInst(*OffsetInst, MRI, /*IsLoadStore=*/true);
6525   if (Ext == AArch64_AM::InvalidShiftExtend)
6526     return std::nullopt;
6527 
6528   // Need a 32-bit wide register.
6529   MachineIRBuilder MIB(*PtrAdd);
6530   Register ExtReg = moveScalarRegClass(OffsetInst->getOperand(1).getReg(),
6531                                        AArch64::GPR32RegClass, MIB);
6532   unsigned SignExtend = Ext == AArch64_AM::SXTW;
6533 
6534   // Base is LHS, offset is ExtReg.
6535   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(LHS.getReg()); },
6536            [=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); },
6537            [=](MachineInstrBuilder &MIB) {
6538              MIB.addImm(SignExtend);
6539              MIB.addImm(0);
6540            }}};
6541 }
6542 
6543 /// Select a "register plus unscaled signed 9-bit immediate" address.  This
6544 /// should only match when there is an offset that is not valid for a scaled
6545 /// immediate addressing mode.  The "Size" argument is the size in bytes of the
6546 /// memory reference, which is needed here to know what is valid for a scaled
6547 /// immediate.
6548 InstructionSelector::ComplexRendererFns
6549 AArch64InstructionSelector::selectAddrModeUnscaled(MachineOperand &Root,
6550                                                    unsigned Size) const {
6551   MachineRegisterInfo &MRI =
6552       Root.getParent()->getParent()->getParent()->getRegInfo();
6553 
6554   if (!Root.isReg())
6555     return std::nullopt;
6556 
6557   if (!isBaseWithConstantOffset(Root, MRI))
6558     return std::nullopt;
6559 
6560   MachineInstr *RootDef = MRI.getVRegDef(Root.getReg());
6561 
6562   MachineOperand &OffImm = RootDef->getOperand(2);
6563   if (!OffImm.isReg())
6564     return std::nullopt;
6565   MachineInstr *RHS = MRI.getVRegDef(OffImm.getReg());
6566   if (RHS->getOpcode() != TargetOpcode::G_CONSTANT)
6567     return std::nullopt;
6568   int64_t RHSC;
6569   MachineOperand &RHSOp1 = RHS->getOperand(1);
6570   if (!RHSOp1.isCImm() || RHSOp1.getCImm()->getBitWidth() > 64)
6571     return std::nullopt;
6572   RHSC = RHSOp1.getCImm()->getSExtValue();
6573 
6574   // If the offset is valid as a scaled immediate, don't match here.
6575   if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Log2_32(Size)))
6576     return std::nullopt;
6577   if (RHSC >= -256 && RHSC < 256) {
6578     MachineOperand &Base = RootDef->getOperand(1);
6579     return {{
6580         [=](MachineInstrBuilder &MIB) { MIB.add(Base); },
6581         [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC); },
6582     }};
6583   }
6584   return std::nullopt;
6585 }
6586 
6587 InstructionSelector::ComplexRendererFns
6588 AArch64InstructionSelector::tryFoldAddLowIntoImm(MachineInstr &RootDef,
6589                                                  unsigned Size,
6590                                                  MachineRegisterInfo &MRI) const {
6591   if (RootDef.getOpcode() != AArch64::G_ADD_LOW)
6592     return std::nullopt;
6593   MachineInstr &Adrp = *MRI.getVRegDef(RootDef.getOperand(1).getReg());
6594   if (Adrp.getOpcode() != AArch64::ADRP)
6595     return std::nullopt;
6596 
6597   // TODO: add heuristics like isWorthFoldingADDlow() from SelectionDAG.
6598   auto Offset = Adrp.getOperand(1).getOffset();
6599   if (Offset % Size != 0)
6600     return std::nullopt;
6601 
6602   auto GV = Adrp.getOperand(1).getGlobal();
6603   if (GV->isThreadLocal())
6604     return std::nullopt;
6605 
6606   auto &MF = *RootDef.getParent()->getParent();
6607   if (GV->getPointerAlignment(MF.getDataLayout()) < Size)
6608     return std::nullopt;
6609 
6610   unsigned OpFlags = STI.ClassifyGlobalReference(GV, MF.getTarget());
6611   MachineIRBuilder MIRBuilder(RootDef);
6612   Register AdrpReg = Adrp.getOperand(0).getReg();
6613   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(AdrpReg); },
6614            [=](MachineInstrBuilder &MIB) {
6615              MIB.addGlobalAddress(GV, Offset,
6616                                   OpFlags | AArch64II::MO_PAGEOFF |
6617                                       AArch64II::MO_NC);
6618            }}};
6619 }
6620 
6621 /// Select a "register plus scaled unsigned 12-bit immediate" address.  The
6622 /// "Size" argument is the size in bytes of the memory reference, which
6623 /// determines the scale.
6624 InstructionSelector::ComplexRendererFns
6625 AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root,
6626                                                   unsigned Size) const {
6627   MachineFunction &MF = *Root.getParent()->getParent()->getParent();
6628   MachineRegisterInfo &MRI = MF.getRegInfo();
6629 
6630   if (!Root.isReg())
6631     return std::nullopt;
6632 
6633   MachineInstr *RootDef = MRI.getVRegDef(Root.getReg());
6634   if (RootDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) {
6635     return {{
6636         [=](MachineInstrBuilder &MIB) { MIB.add(RootDef->getOperand(1)); },
6637         [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
6638     }};
6639   }
6640 
6641   CodeModel::Model CM = MF.getTarget().getCodeModel();
6642   // Check if we can fold in the ADD of small code model ADRP + ADD address.
6643   if (CM == CodeModel::Small) {
6644     auto OpFns = tryFoldAddLowIntoImm(*RootDef, Size, MRI);
6645     if (OpFns)
6646       return OpFns;
6647   }
6648 
6649   if (isBaseWithConstantOffset(Root, MRI)) {
6650     MachineOperand &LHS = RootDef->getOperand(1);
6651     MachineOperand &RHS = RootDef->getOperand(2);
6652     MachineInstr *LHSDef = MRI.getVRegDef(LHS.getReg());
6653     MachineInstr *RHSDef = MRI.getVRegDef(RHS.getReg());
6654 
6655     int64_t RHSC = (int64_t)RHSDef->getOperand(1).getCImm()->getZExtValue();
6656     unsigned Scale = Log2_32(Size);
6657     if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) {
6658       if (LHSDef->getOpcode() == TargetOpcode::G_FRAME_INDEX)
6659         return {{
6660             [=](MachineInstrBuilder &MIB) { MIB.add(LHSDef->getOperand(1)); },
6661             [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); },
6662         }};
6663 
6664       return {{
6665           [=](MachineInstrBuilder &MIB) { MIB.add(LHS); },
6666           [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); },
6667       }};
6668     }
6669   }
6670 
6671   // Before falling back to our general case, check if the unscaled
6672   // instructions can handle this. If so, that's preferable.
6673   if (selectAddrModeUnscaled(Root, Size))
6674     return std::nullopt;
6675 
6676   return {{
6677       [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
6678       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
6679   }};
6680 }
6681 
6682 /// Given a shift instruction, return the correct shift type for that
6683 /// instruction.
6684 static AArch64_AM::ShiftExtendType getShiftTypeForInst(MachineInstr &MI) {
6685   switch (MI.getOpcode()) {
6686   default:
6687     return AArch64_AM::InvalidShiftExtend;
6688   case TargetOpcode::G_SHL:
6689     return AArch64_AM::LSL;
6690   case TargetOpcode::G_LSHR:
6691     return AArch64_AM::LSR;
6692   case TargetOpcode::G_ASHR:
6693     return AArch64_AM::ASR;
6694   case TargetOpcode::G_ROTR:
6695     return AArch64_AM::ROR;
6696   }
6697 }
6698 
6699 /// Select a "shifted register" operand. If the value is not shifted, set the
6700 /// shift operand to a default value of "lsl 0".
6701 InstructionSelector::ComplexRendererFns
6702 AArch64InstructionSelector::selectShiftedRegister(MachineOperand &Root,
6703                                                   bool AllowROR) const {
6704   if (!Root.isReg())
6705     return std::nullopt;
6706   MachineRegisterInfo &MRI =
6707       Root.getParent()->getParent()->getParent()->getRegInfo();
6708 
6709   // Check if the operand is defined by an instruction which corresponds to
6710   // a ShiftExtendType. E.g. a G_SHL, G_LSHR, etc.
6711   MachineInstr *ShiftInst = MRI.getVRegDef(Root.getReg());
6712   AArch64_AM::ShiftExtendType ShType = getShiftTypeForInst(*ShiftInst);
6713   if (ShType == AArch64_AM::InvalidShiftExtend)
6714     return std::nullopt;
6715   if (ShType == AArch64_AM::ROR && !AllowROR)
6716     return std::nullopt;
6717   if (!isWorthFoldingIntoExtendedReg(*ShiftInst, MRI))
6718     return std::nullopt;
6719 
6720   // Need an immediate on the RHS.
6721   MachineOperand &ShiftRHS = ShiftInst->getOperand(2);
6722   auto Immed = getImmedFromMO(ShiftRHS);
6723   if (!Immed)
6724     return std::nullopt;
6725 
6726   // We have something that we can fold. Fold in the shift's LHS and RHS into
6727   // the instruction.
6728   MachineOperand &ShiftLHS = ShiftInst->getOperand(1);
6729   Register ShiftReg = ShiftLHS.getReg();
6730 
6731   unsigned NumBits = MRI.getType(ShiftReg).getSizeInBits();
6732   unsigned Val = *Immed & (NumBits - 1);
6733   unsigned ShiftVal = AArch64_AM::getShifterImm(ShType, Val);
6734 
6735   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ShiftReg); },
6736            [=](MachineInstrBuilder &MIB) { MIB.addImm(ShiftVal); }}};
6737 }
6738 
6739 AArch64_AM::ShiftExtendType AArch64InstructionSelector::getExtendTypeForInst(
6740     MachineInstr &MI, MachineRegisterInfo &MRI, bool IsLoadStore) const {
6741   unsigned Opc = MI.getOpcode();
6742 
6743   // Handle explicit extend instructions first.
6744   if (Opc == TargetOpcode::G_SEXT || Opc == TargetOpcode::G_SEXT_INREG) {
6745     unsigned Size;
6746     if (Opc == TargetOpcode::G_SEXT)
6747       Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
6748     else
6749       Size = MI.getOperand(2).getImm();
6750     assert(Size != 64 && "Extend from 64 bits?");
6751     switch (Size) {
6752     case 8:
6753       return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTB;
6754     case 16:
6755       return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTH;
6756     case 32:
6757       return AArch64_AM::SXTW;
6758     default:
6759       return AArch64_AM::InvalidShiftExtend;
6760     }
6761   }
6762 
6763   if (Opc == TargetOpcode::G_ZEXT || Opc == TargetOpcode::G_ANYEXT) {
6764     unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
6765     assert(Size != 64 && "Extend from 64 bits?");
6766     switch (Size) {
6767     case 8:
6768       return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTB;
6769     case 16:
6770       return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTH;
6771     case 32:
6772       return AArch64_AM::UXTW;
6773     default:
6774       return AArch64_AM::InvalidShiftExtend;
6775     }
6776   }
6777 
6778   // Don't have an explicit extend. Try to handle a G_AND with a constant mask
6779   // on the RHS.
6780   if (Opc != TargetOpcode::G_AND)
6781     return AArch64_AM::InvalidShiftExtend;
6782 
6783   std::optional<uint64_t> MaybeAndMask = getImmedFromMO(MI.getOperand(2));
6784   if (!MaybeAndMask)
6785     return AArch64_AM::InvalidShiftExtend;
6786   uint64_t AndMask = *MaybeAndMask;
6787   switch (AndMask) {
6788   default:
6789     return AArch64_AM::InvalidShiftExtend;
6790   case 0xFF:
6791     return !IsLoadStore ? AArch64_AM::UXTB : AArch64_AM::InvalidShiftExtend;
6792   case 0xFFFF:
6793     return !IsLoadStore ? AArch64_AM::UXTH : AArch64_AM::InvalidShiftExtend;
6794   case 0xFFFFFFFF:
6795     return AArch64_AM::UXTW;
6796   }
6797 }
6798 
6799 Register AArch64InstructionSelector::moveScalarRegClass(
6800     Register Reg, const TargetRegisterClass &RC, MachineIRBuilder &MIB) const {
6801   MachineRegisterInfo &MRI = *MIB.getMRI();
6802   auto Ty = MRI.getType(Reg);
6803   assert(!Ty.isVector() && "Expected scalars only!");
6804   if (Ty.getSizeInBits() == TRI.getRegSizeInBits(RC))
6805     return Reg;
6806 
6807   // Create a copy and immediately select it.
6808   // FIXME: We should have an emitCopy function?
6809   auto Copy = MIB.buildCopy({&RC}, {Reg});
6810   selectCopy(*Copy, TII, MRI, TRI, RBI);
6811   return Copy.getReg(0);
6812 }
6813 
6814 /// Select an "extended register" operand. This operand folds in an extend
6815 /// followed by an optional left shift.
6816 InstructionSelector::ComplexRendererFns
6817 AArch64InstructionSelector::selectArithExtendedRegister(
6818     MachineOperand &Root) const {
6819   if (!Root.isReg())
6820     return std::nullopt;
6821   MachineRegisterInfo &MRI =
6822       Root.getParent()->getParent()->getParent()->getRegInfo();
6823 
6824   uint64_t ShiftVal = 0;
6825   Register ExtReg;
6826   AArch64_AM::ShiftExtendType Ext;
6827   MachineInstr *RootDef = getDefIgnoringCopies(Root.getReg(), MRI);
6828   if (!RootDef)
6829     return std::nullopt;
6830 
6831   if (!isWorthFoldingIntoExtendedReg(*RootDef, MRI))
6832     return std::nullopt;
6833 
6834   // Check if we can fold a shift and an extend.
6835   if (RootDef->getOpcode() == TargetOpcode::G_SHL) {
6836     // Look for a constant on the RHS of the shift.
6837     MachineOperand &RHS = RootDef->getOperand(2);
6838     std::optional<uint64_t> MaybeShiftVal = getImmedFromMO(RHS);
6839     if (!MaybeShiftVal)
6840       return std::nullopt;
6841     ShiftVal = *MaybeShiftVal;
6842     if (ShiftVal > 4)
6843       return std::nullopt;
6844     // Look for a valid extend instruction on the LHS of the shift.
6845     MachineOperand &LHS = RootDef->getOperand(1);
6846     MachineInstr *ExtDef = getDefIgnoringCopies(LHS.getReg(), MRI);
6847     if (!ExtDef)
6848       return std::nullopt;
6849     Ext = getExtendTypeForInst(*ExtDef, MRI);
6850     if (Ext == AArch64_AM::InvalidShiftExtend)
6851       return std::nullopt;
6852     ExtReg = ExtDef->getOperand(1).getReg();
6853   } else {
6854     // Didn't get a shift. Try just folding an extend.
6855     Ext = getExtendTypeForInst(*RootDef, MRI);
6856     if (Ext == AArch64_AM::InvalidShiftExtend)
6857       return std::nullopt;
6858     ExtReg = RootDef->getOperand(1).getReg();
6859 
6860     // If we have a 32 bit instruction which zeroes out the high half of a
6861     // register, we get an implicit zero extend for free. Check if we have one.
6862     // FIXME: We actually emit the extend right now even though we don't have
6863     // to.
6864     if (Ext == AArch64_AM::UXTW && MRI.getType(ExtReg).getSizeInBits() == 32) {
6865       MachineInstr *ExtInst = MRI.getVRegDef(ExtReg);
6866       if (isDef32(*ExtInst))
6867         return std::nullopt;
6868     }
6869   }
6870 
6871   // We require a GPR32 here. Narrow the ExtReg if needed using a subregister
6872   // copy.
6873   MachineIRBuilder MIB(*RootDef);
6874   ExtReg = moveScalarRegClass(ExtReg, AArch64::GPR32RegClass, MIB);
6875 
6876   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); },
6877            [=](MachineInstrBuilder &MIB) {
6878              MIB.addImm(getArithExtendImm(Ext, ShiftVal));
6879            }}};
6880 }
6881 
6882 InstructionSelector::ComplexRendererFns
6883 AArch64InstructionSelector::selectExtractHigh(MachineOperand &Root) const {
6884   if (!Root.isReg())
6885     return std::nullopt;
6886   MachineRegisterInfo &MRI =
6887       Root.getParent()->getParent()->getParent()->getRegInfo();
6888 
6889   MachineInstr *Extract = getDefIgnoringCopies(Root.getReg(), MRI);
6890   if (Extract && Extract->getOpcode() == TargetOpcode::G_UNMERGE_VALUES &&
6891       Root.getReg() == Extract->getOperand(1).getReg()) {
6892     Register ExtReg = Extract->getOperand(2).getReg();
6893     return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); }}};
6894   }
6895 
6896   return std::nullopt;
6897 }
6898 
6899 void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB,
6900                                                 const MachineInstr &MI,
6901                                                 int OpIdx) const {
6902   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
6903   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6904          "Expected G_CONSTANT");
6905   std::optional<int64_t> CstVal =
6906       getIConstantVRegSExtVal(MI.getOperand(0).getReg(), MRI);
6907   assert(CstVal && "Expected constant value");
6908   MIB.addImm(*CstVal);
6909 }
6910 
6911 void AArch64InstructionSelector::renderLogicalImm32(
6912   MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
6913   assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6914          "Expected G_CONSTANT");
6915   uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
6916   uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 32);
6917   MIB.addImm(Enc);
6918 }
6919 
6920 void AArch64InstructionSelector::renderLogicalImm64(
6921   MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
6922   assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6923          "Expected G_CONSTANT");
6924   uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
6925   uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 64);
6926   MIB.addImm(Enc);
6927 }
6928 
6929 void AArch64InstructionSelector::renderFPImm16(MachineInstrBuilder &MIB,
6930                                                const MachineInstr &MI,
6931                                                int OpIdx) const {
6932   assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
6933          "Expected G_FCONSTANT");
6934   MIB.addImm(
6935       AArch64_AM::getFP16Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
6936 }
6937 
6938 void AArch64InstructionSelector::renderFPImm32(MachineInstrBuilder &MIB,
6939                                                const MachineInstr &MI,
6940                                                int OpIdx) const {
6941   assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
6942          "Expected G_FCONSTANT");
6943   MIB.addImm(
6944       AArch64_AM::getFP32Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
6945 }
6946 
6947 void AArch64InstructionSelector::renderFPImm64(MachineInstrBuilder &MIB,
6948                                                const MachineInstr &MI,
6949                                                int OpIdx) const {
6950   assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
6951          "Expected G_FCONSTANT");
6952   MIB.addImm(
6953       AArch64_AM::getFP64Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
6954 }
6955 
6956 void AArch64InstructionSelector::renderFPImm32SIMDModImmType4(
6957     MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6958   assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
6959          "Expected G_FCONSTANT");
6960   MIB.addImm(AArch64_AM::encodeAdvSIMDModImmType4(MI.getOperand(1)
6961                                                       .getFPImm()
6962                                                       ->getValueAPF()
6963                                                       .bitcastToAPInt()
6964                                                       .getZExtValue()));
6965 }
6966 
6967 bool AArch64InstructionSelector::isLoadStoreOfNumBytes(
6968     const MachineInstr &MI, unsigned NumBytes) const {
6969   if (!MI.mayLoadOrStore())
6970     return false;
6971   assert(MI.hasOneMemOperand() &&
6972          "Expected load/store to have only one mem op!");
6973   return (*MI.memoperands_begin())->getSize() == NumBytes;
6974 }
6975 
6976 bool AArch64InstructionSelector::isDef32(const MachineInstr &MI) const {
6977   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
6978   if (MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() != 32)
6979     return false;
6980 
6981   // Only return true if we know the operation will zero-out the high half of
6982   // the 64-bit register. Truncates can be subregister copies, which don't
6983   // zero out the high bits. Copies and other copy-like instructions can be
6984   // fed by truncates, or could be lowered as subregister copies.
6985   switch (MI.getOpcode()) {
6986   default:
6987     return true;
6988   case TargetOpcode::COPY:
6989   case TargetOpcode::G_BITCAST:
6990   case TargetOpcode::G_TRUNC:
6991   case TargetOpcode::G_PHI:
6992     return false;
6993   }
6994 }
6995 
6996 
6997 // Perform fixups on the given PHI instruction's operands to force them all
6998 // to be the same as the destination regbank.
6999 static void fixupPHIOpBanks(MachineInstr &MI, MachineRegisterInfo &MRI,
7000                             const AArch64RegisterBankInfo &RBI) {
7001   assert(MI.getOpcode() == TargetOpcode::G_PHI && "Expected a G_PHI");
7002   Register DstReg = MI.getOperand(0).getReg();
7003   const RegisterBank *DstRB = MRI.getRegBankOrNull(DstReg);
7004   assert(DstRB && "Expected PHI dst to have regbank assigned");
7005   MachineIRBuilder MIB(MI);
7006 
7007   // Go through each operand and ensure it has the same regbank.
7008   for (MachineOperand &MO : llvm::drop_begin(MI.operands())) {
7009     if (!MO.isReg())
7010       continue;
7011     Register OpReg = MO.getReg();
7012     const RegisterBank *RB = MRI.getRegBankOrNull(OpReg);
7013     if (RB != DstRB) {
7014       // Insert a cross-bank copy.
7015       auto *OpDef = MRI.getVRegDef(OpReg);
7016       const LLT &Ty = MRI.getType(OpReg);
7017       MachineBasicBlock &OpDefBB = *OpDef->getParent();
7018 
7019       // Any instruction we insert must appear after all PHIs in the block
7020       // for the block to be valid MIR.
7021       MachineBasicBlock::iterator InsertPt = std::next(OpDef->getIterator());
7022       if (InsertPt != OpDefBB.end() && InsertPt->isPHI())
7023         InsertPt = OpDefBB.getFirstNonPHI();
7024       MIB.setInsertPt(*OpDef->getParent(), InsertPt);
7025       auto Copy = MIB.buildCopy(Ty, OpReg);
7026       MRI.setRegBank(Copy.getReg(0), *DstRB);
7027       MO.setReg(Copy.getReg(0));
7028     }
7029   }
7030 }
7031 
7032 void AArch64InstructionSelector::processPHIs(MachineFunction &MF) {
7033   // We're looking for PHIs, build a list so we don't invalidate iterators.
7034   MachineRegisterInfo &MRI = MF.getRegInfo();
7035   SmallVector<MachineInstr *, 32> Phis;
7036   for (auto &BB : MF) {
7037     for (auto &MI : BB) {
7038       if (MI.getOpcode() == TargetOpcode::G_PHI)
7039         Phis.emplace_back(&MI);
7040     }
7041   }
7042 
7043   for (auto *MI : Phis) {
7044     // We need to do some work here if the operand types are < 16 bit and they
7045     // are split across fpr/gpr banks. Since all types <32b on gpr
7046     // end up being assigned gpr32 regclasses, we can end up with PHIs here
7047     // which try to select between a gpr32 and an fpr16. Ideally RBS shouldn't
7048     // be selecting heterogenous regbanks for operands if possible, but we
7049     // still need to be able to deal with it here.
7050     //
7051     // To fix this, if we have a gpr-bank operand < 32b in size and at least
7052     // one other operand is on the fpr bank, then we add cross-bank copies
7053     // to homogenize the operand banks. For simplicity the bank that we choose
7054     // to settle on is whatever bank the def operand has. For example:
7055     //
7056     // %endbb:
7057     //   %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2:fpr(s16), %bb2
7058     //  =>
7059     // %bb2:
7060     //   ...
7061     //   %in2_copy:gpr(s16) = COPY %in2:fpr(s16)
7062     //   ...
7063     // %endbb:
7064     //   %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2_copy:gpr(s16), %bb2
7065     bool HasGPROp = false, HasFPROp = false;
7066     for (const MachineOperand &MO : llvm::drop_begin(MI->operands())) {
7067       if (!MO.isReg())
7068         continue;
7069       const LLT &Ty = MRI.getType(MO.getReg());
7070       if (!Ty.isValid() || !Ty.isScalar())
7071         break;
7072       if (Ty.getSizeInBits() >= 32)
7073         break;
7074       const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg());
7075       // If for some reason we don't have a regbank yet. Don't try anything.
7076       if (!RB)
7077         break;
7078 
7079       if (RB->getID() == AArch64::GPRRegBankID)
7080         HasGPROp = true;
7081       else
7082         HasFPROp = true;
7083     }
7084     // We have heterogenous regbanks, need to fixup.
7085     if (HasGPROp && HasFPROp)
7086       fixupPHIOpBanks(*MI, MRI, RBI);
7087   }
7088 }
7089 
7090 namespace llvm {
7091 InstructionSelector *
7092 createAArch64InstructionSelector(const AArch64TargetMachine &TM,
7093                                  AArch64Subtarget &Subtarget,
7094                                  AArch64RegisterBankInfo &RBI) {
7095   return new AArch64InstructionSelector(TM, Subtarget, RBI);
7096 }
7097 }
7098