xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp (revision 700637cbb5e582861067a11aaca4d053546871d2)
1 //===- AArch64InstructionSelector.cpp ----------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the InstructionSelector class for
10 /// AArch64.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AArch64GlobalISelUtils.h"
15 #include "AArch64InstrInfo.h"
16 #include "AArch64MachineFunctionInfo.h"
17 #include "AArch64RegisterBankInfo.h"
18 #include "AArch64RegisterInfo.h"
19 #include "AArch64Subtarget.h"
20 #include "AArch64TargetMachine.h"
21 #include "MCTargetDesc/AArch64AddressingModes.h"
22 #include "MCTargetDesc/AArch64MCTargetDesc.h"
23 #include "llvm/BinaryFormat/Dwarf.h"
24 #include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
25 #include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
26 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
27 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
28 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
29 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
30 #include "llvm/CodeGen/GlobalISel/Utils.h"
31 #include "llvm/CodeGen/MachineBasicBlock.h"
32 #include "llvm/CodeGen/MachineConstantPool.h"
33 #include "llvm/CodeGen/MachineFrameInfo.h"
34 #include "llvm/CodeGen/MachineFunction.h"
35 #include "llvm/CodeGen/MachineInstr.h"
36 #include "llvm/CodeGen/MachineInstrBuilder.h"
37 #include "llvm/CodeGen/MachineMemOperand.h"
38 #include "llvm/CodeGen/MachineOperand.h"
39 #include "llvm/CodeGen/MachineRegisterInfo.h"
40 #include "llvm/CodeGen/TargetOpcodes.h"
41 #include "llvm/CodeGen/TargetRegisterInfo.h"
42 #include "llvm/IR/Constants.h"
43 #include "llvm/IR/DerivedTypes.h"
44 #include "llvm/IR/Instructions.h"
45 #include "llvm/IR/IntrinsicsAArch64.h"
46 #include "llvm/IR/Type.h"
47 #include "llvm/Pass.h"
48 #include "llvm/Support/Debug.h"
49 #include "llvm/Support/raw_ostream.h"
50 #include <optional>
51 
52 #define DEBUG_TYPE "aarch64-isel"
53 
54 using namespace llvm;
55 using namespace MIPatternMatch;
56 using namespace AArch64GISelUtils;
57 
// Forward declarations. In the code visible here these types are only passed
// around by pointer (see setupMF), so their full definitions are not required.
namespace llvm {
class BlockFrequencyInfo;
class ProfileSummaryInfo;
}
62 
63 namespace {
64 
65 #define GET_GLOBALISEL_PREDICATE_BITSET
66 #include "AArch64GenGlobalISel.inc"
67 #undef GET_GLOBALISEL_PREDICATE_BITSET
68 
69 
/// AArch64 implementation of the GlobalISel InstructionSelector.
///
/// The class combines the tblgen-erated selectImpl() with the hand-written
/// C++ selection, emission and complex-pattern helpers declared below.
class AArch64InstructionSelector : public InstructionSelector {
public:
  AArch64InstructionSelector(const AArch64TargetMachine &TM,
                             const AArch64Subtarget &STI,
                             const AArch64RegisterBankInfo &RBI);

  bool select(MachineInstr &I) override;
  static const char *getName() { return DEBUG_TYPE; }

  /// Per-function setup: forwards to the base class, binds the shared builder
  /// to \p MF, refreshes the cached per-function state and preprocesses PHIs.
  void setupMF(MachineFunction &MF, GISelValueTracking *VT,
               CodeGenCoverage *CoverageInfo, ProfileSummaryInfo *PSI,
               BlockFrequencyInfo *BFI) override {
    InstructionSelector::setupMF(MF, VT, CoverageInfo, PSI, BFI);
    MIB.setMF(MF);

    // hasFnAttribute() is expensive to call on every BRCOND selection, so
    // cache it here for each run of the selector.
    ProduceNonFlagSettingCondBr =
        !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
    // Reset the cached return-address register for the new function.
    MFReturnAddr = Register();

    processPHIs(MF);
  }

private:
  /// tblgen-erated 'select' implementation, used as the initial selector for
  /// the patterns that don't require complex C++.
  bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;

  // A lowering phase that runs before any selection attempts.
  // Returns true if the instruction was modified.
  bool preISelLower(MachineInstr &I);

  // An early selection function that runs before the selectImpl() call.
  bool earlySelect(MachineInstr &I);

  /// Save state that is shared between select calls, call select on \p I and
  /// then restore the saved state. This can be used to recursively call select
  /// within a select call.
  bool selectAndRestoreState(MachineInstr &I);

  // Do some preprocessing of G_PHIs before we begin selection.
  void processPHIs(MachineFunction &MF);

  bool earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI);

  /// Eliminate same-sized cross-bank copies into stores before selectImpl().
  bool contractCrossBankCopyIntoStore(MachineInstr &I,
                                      MachineRegisterInfo &MRI);

  bool convertPtrAddToAdd(MachineInstr &I, MachineRegisterInfo &MRI);

  bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF,
                          MachineRegisterInfo &MRI) const;
  bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF,
                           MachineRegisterInfo &MRI) const;

  ///@{
  /// Helper functions for selectCompareBranch.
  bool selectCompareBranchFedByFCmp(MachineInstr &I, MachineInstr &FCmp,
                                    MachineIRBuilder &MIB) const;
  bool selectCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp,
                                    MachineIRBuilder &MIB) const;
  bool tryOptCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp,
                                    MachineIRBuilder &MIB) const;
  bool tryOptAndIntoCompareBranch(MachineInstr &AndInst, bool Invert,
                                  MachineBasicBlock *DstMBB,
                                  MachineIRBuilder &MIB) const;
  ///@}

  bool selectCompareBranch(MachineInstr &I, MachineFunction &MF,
                           MachineRegisterInfo &MRI);

  bool selectVectorAshrLshr(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectVectorSHL(MachineInstr &I, MachineRegisterInfo &MRI);

  // Helper to generate an equivalent of scalar_to_vector into a new register,
  // returned via 'Dst'.
  MachineInstr *emitScalarToVector(unsigned EltSize,
                                   const TargetRegisterClass *DstRC,
                                   Register Scalar,
                                   MachineIRBuilder &MIRBuilder) const;
  /// Helper to narrow vector that was widened by emitScalarToVector.
  /// Copy lowest part of 128-bit or 64-bit vector to 64-bit or 32-bit
  /// vector, correspondingly.
  MachineInstr *emitNarrowVector(Register DstReg, Register SrcReg,
                                 MachineIRBuilder &MIRBuilder,
                                 MachineRegisterInfo &MRI) const;

  /// Emit a lane insert into \p DstReg, or a new vector register if
  /// std::nullopt is provided.
  ///
  /// The lane inserted into is defined by \p LaneIdx. The vector source
  /// register is given by \p SrcReg. The register containing the element is
  /// given by \p EltReg.
  MachineInstr *emitLaneInsert(std::optional<Register> DstReg, Register SrcReg,
                               Register EltReg, unsigned LaneIdx,
                               const RegisterBank &RB,
                               MachineIRBuilder &MIRBuilder) const;

  /// Emit a sequence of instructions representing a constant \p CV for a
  /// vector register \p Dst. (E.g. a MOV, or a load from a constant pool.)
  ///
  /// \returns the last instruction in the sequence on success, and nullptr
  /// otherwise.
  MachineInstr *emitConstantVector(Register Dst, Constant *CV,
                                   MachineIRBuilder &MIRBuilder,
                                   MachineRegisterInfo &MRI);

  MachineInstr *tryAdvSIMDModImm8(Register Dst, unsigned DstSize, APInt Bits,
                                  MachineIRBuilder &MIRBuilder);

  MachineInstr *tryAdvSIMDModImm16(Register Dst, unsigned DstSize, APInt Bits,
                                   MachineIRBuilder &MIRBuilder, bool Inv);

  MachineInstr *tryAdvSIMDModImm32(Register Dst, unsigned DstSize, APInt Bits,
                                   MachineIRBuilder &MIRBuilder, bool Inv);
  MachineInstr *tryAdvSIMDModImm64(Register Dst, unsigned DstSize, APInt Bits,
                                   MachineIRBuilder &MIRBuilder);
  MachineInstr *tryAdvSIMDModImm321s(Register Dst, unsigned DstSize, APInt Bits,
                                     MachineIRBuilder &MIRBuilder, bool Inv);
  MachineInstr *tryAdvSIMDModImmFP(Register Dst, unsigned DstSize, APInt Bits,
                                   MachineIRBuilder &MIRBuilder);

  bool tryOptConstantBuildVec(MachineInstr &MI, LLT DstTy,
                              MachineRegisterInfo &MRI);
  /// \returns true if a G_BUILD_VECTOR instruction \p MI can be selected as a
  /// SUBREG_TO_REG.
  bool tryOptBuildVecToSubregToReg(MachineInstr &MI, MachineRegisterInfo &MRI);
  bool selectBuildVector(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI);

  bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectSplitVectorUnmerge(MachineInstr &I, MachineRegisterInfo &MRI);

  /// Helper function to select vector load intrinsics like
  /// @llvm.aarch64.neon.ld2.*, @llvm.aarch64.neon.ld4.*, etc.
  /// \p Opc is the opcode that the selected instruction should use.
  /// \p NumVecs is the number of vector destinations for the instruction.
  /// \p I is the original G_INTRINSIC_W_SIDE_EFFECTS instruction.
  bool selectVectorLoadIntrinsic(unsigned Opc, unsigned NumVecs,
                                 MachineInstr &I);
  bool selectVectorLoadLaneIntrinsic(unsigned Opc, unsigned NumVecs,
                                     MachineInstr &I);
  void selectVectorStoreIntrinsic(MachineInstr &I, unsigned NumVecs,
                                  unsigned Opc);
  bool selectVectorStoreLaneIntrinsic(MachineInstr &I, unsigned NumVecs,
                                      unsigned Opc);
  bool selectIntrinsicWithSideEffects(MachineInstr &I,
                                      MachineRegisterInfo &MRI);
  bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectJumpTable(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectPtrAuthGlobalValue(MachineInstr &I,
                                MachineRegisterInfo &MRI) const;
  bool selectReduction(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectMOPS(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectUSMovFromExtend(MachineInstr &I, MachineRegisterInfo &MRI);
  void SelectTable(MachineInstr &I, MachineRegisterInfo &MRI, unsigned NumVecs,
                   unsigned Opc1, unsigned Opc2, bool isExt);

  bool selectIndexedExtLoad(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectIndexedLoad(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectIndexedStore(GIndexedStore &I, MachineRegisterInfo &MRI);

  unsigned emitConstantPoolEntry(const Constant *CPVal,
                                 MachineFunction &MF) const;
  MachineInstr *emitLoadFromConstantPool(const Constant *CPVal,
                                         MachineIRBuilder &MIRBuilder) const;

  // Emit a vector concat operation.
  MachineInstr *emitVectorConcat(std::optional<Register> Dst, Register Op1,
                                 Register Op2,
                                 MachineIRBuilder &MIRBuilder) const;

  // Emit an integer compare between LHS and RHS, which checks for Predicate.
  MachineInstr *emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
                                   MachineOperand &Predicate,
                                   MachineIRBuilder &MIRBuilder) const;

  /// Emit a floating point comparison between \p LHS and \p RHS.
  /// \p Pred if given is the intended predicate to use.
  MachineInstr *
  emitFPCompare(Register LHS, Register RHS, MachineIRBuilder &MIRBuilder,
                std::optional<CmpInst::Predicate> = std::nullopt) const;

  MachineInstr *
  emitInstr(unsigned Opcode, std::initializer_list<llvm::DstOp> DstOps,
            std::initializer_list<llvm::SrcOp> SrcOps,
            MachineIRBuilder &MIRBuilder,
            const ComplexRendererFns &RenderFns = std::nullopt) const;
  /// Helper function to emit an add or sub instruction.
  ///
  /// \p AddrModeAndSizeToOpcode must contain each of the opcode variants
  /// listed below in a specific order.
  ///
  /// Below is an example of the expected input to \p AddrModeAndSizeToOpcode.
  /// Note that the outer array has five rows, matching the parameter type.
  ///
  /// \code
  ///   const std::array<std::array<unsigned, 2>, 5> Table {
  ///    {{AArch64::ADDXri, AArch64::ADDWri},
  ///     {AArch64::ADDXrs, AArch64::ADDWrs},
  ///     {AArch64::ADDXrr, AArch64::ADDWrr},
  ///     {AArch64::SUBXri, AArch64::SUBWri},
  ///     {AArch64::ADDXrx, AArch64::ADDWrx}}};
  /// \endcode
  ///
  /// Each row in the table corresponds to a different addressing mode. Each
  /// column corresponds to a different register size.
  ///
  /// \attention Rows must be structured as follows:
  ///   - Row 0: The ri opcode variants
  ///   - Row 1: The rs opcode variants
  ///   - Row 2: The rr opcode variants
  ///   - Row 3: The ri opcode variants for negative immediates
  ///   - Row 4: The rx opcode variants
  ///
  /// \attention Columns must be structured as follows:
  ///   - Column 0: The 64-bit opcode variants
  ///   - Column 1: The 32-bit opcode variants
  ///
  /// \p Dst is the destination register of the binop to emit.
  /// \p LHS is the left-hand operand of the binop to emit.
  /// \p RHS is the right-hand operand of the binop to emit.
  MachineInstr *emitAddSub(
      const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode,
      Register Dst, MachineOperand &LHS, MachineOperand &RHS,
      MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitADD(Register DefReg, MachineOperand &LHS,
                        MachineOperand &RHS,
                        MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitADDS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
                         MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitSUBS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
                         MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitADCS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
                         MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitSBCS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
                         MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitCMN(MachineOperand &LHS, MachineOperand &RHS,
                        MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitTST(MachineOperand &LHS, MachineOperand &RHS,
                        MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitSelect(Register Dst, Register LHS, Register RHS,
                           AArch64CC::CondCode CC,
                           MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitExtractVectorElt(std::optional<Register> DstReg,
                                     const RegisterBank &DstRB, LLT ScalarTy,
                                     Register VecReg, unsigned LaneIdx,
                                     MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitCSINC(Register Dst, Register Src1, Register Src2,
                          AArch64CC::CondCode Pred,
                          MachineIRBuilder &MIRBuilder) const;
  /// Emit a CSet for a FP compare.
  ///
  /// \p Dst is expected to be a 32-bit scalar register.
  MachineInstr *emitCSetForFCmp(Register Dst, CmpInst::Predicate Pred,
                                MachineIRBuilder &MIRBuilder) const;

  /// Emit an instruction that sets NZCV to the carry-in expected by \p I.
  /// Might elide the instruction if the previous instruction already sets NZCV
  /// correctly.
  MachineInstr *emitCarryIn(MachineInstr &I, Register CarryReg);

  /// Emit the overflow op for \p Opcode.
  ///
  /// \p Opcode is expected to be an overflow op's opcode, e.g. G_UADDO,
  /// G_USUBO, etc.
  std::pair<MachineInstr *, AArch64CC::CondCode>
  emitOverflowOp(unsigned Opcode, Register Dst, MachineOperand &LHS,
                 MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const;

  bool selectOverflowOp(MachineInstr &I, MachineRegisterInfo &MRI);

  /// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
  /// In some cases this is even possible with OR operations in the expression.
  MachineInstr *emitConjunction(Register Val, AArch64CC::CondCode &OutCC,
                                MachineIRBuilder &MIB) const;
  MachineInstr *emitConditionalComparison(Register LHS, Register RHS,
                                          CmpInst::Predicate CC,
                                          AArch64CC::CondCode Predicate,
                                          AArch64CC::CondCode OutCC,
                                          MachineIRBuilder &MIB) const;
  MachineInstr *emitConjunctionRec(Register Val, AArch64CC::CondCode &OutCC,
                                   bool Negate, Register CCOp,
                                   AArch64CC::CondCode Predicate,
                                   MachineIRBuilder &MIB) const;

  /// Emit a TB(N)Z instruction which tests \p Bit in \p TestReg.
  /// \p IsNegative is true if the test should be "not zero".
  /// This will also optimize the test bit instruction when possible.
  MachineInstr *emitTestBit(Register TestReg, uint64_t Bit, bool IsNegative,
                            MachineBasicBlock *DstMBB,
                            MachineIRBuilder &MIB) const;

  /// Emit a CB(N)Z instruction which branches to \p DestMBB.
  MachineInstr *emitCBZ(Register CompareReg, bool IsNegative,
                        MachineBasicBlock *DestMBB,
                        MachineIRBuilder &MIB) const;

  // Equivalent to the i32shift_a and friends from AArch64InstrInfo.td.
  // We use these manually instead of using the importer since it doesn't
  // support SDNodeXForm.
  ComplexRendererFns selectShiftA_32(const MachineOperand &Root) const;
  ComplexRendererFns selectShiftB_32(const MachineOperand &Root) const;
  ComplexRendererFns selectShiftA_64(const MachineOperand &Root) const;
  ComplexRendererFns selectShiftB_64(const MachineOperand &Root) const;

  ComplexRendererFns select12BitValueWithLeftShift(uint64_t Immed) const;
  ComplexRendererFns selectArithImmed(MachineOperand &Root) const;
  ComplexRendererFns selectNegArithImmed(MachineOperand &Root) const;

  ComplexRendererFns selectAddrModeUnscaled(MachineOperand &Root,
                                            unsigned Size) const;

  // Fixed-width wrappers around selectAddrModeUnscaled(). The second argument
  // is the access width in bits divided by 8 (e.g. 128 -> 16).
  ComplexRendererFns selectAddrModeUnscaled8(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, 1);
  }
  ComplexRendererFns selectAddrModeUnscaled16(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, 2);
  }
  ComplexRendererFns selectAddrModeUnscaled32(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, 4);
  }
  ComplexRendererFns selectAddrModeUnscaled64(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, 8);
  }
  ComplexRendererFns selectAddrModeUnscaled128(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, 16);
  }

  /// Helper to try to fold in a GISEL_ADD_LOW into an immediate, to be used
  /// from complex pattern matchers like selectAddrModeIndexed().
  ComplexRendererFns tryFoldAddLowIntoImm(MachineInstr &RootDef, unsigned Size,
                                          MachineRegisterInfo &MRI) const;

  ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root,
                                           unsigned Size) const;
  // \p Width is given in bits; the two-argument overload receives Width / 8.
  template <int Width>
  ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root) const {
    return selectAddrModeIndexed(Root, Width / 8);
  }

  std::optional<bool>
  isWorthFoldingIntoAddrMode(MachineInstr &MI,
                             const MachineRegisterInfo &MRI) const;

  bool isWorthFoldingIntoExtendedReg(MachineInstr &MI,
                                     const MachineRegisterInfo &MRI,
                                     bool IsAddrOperand) const;
  ComplexRendererFns
  selectAddrModeShiftedExtendXReg(MachineOperand &Root,
                                  unsigned SizeInBytes) const;

  /// Returns a \p ComplexRendererFns which contains a base, offset, and whether
  /// or not a shift + extend should be folded into an addressing mode. Returns
  /// std::nullopt when this is not profitable or possible.
  ComplexRendererFns
  selectExtendedSHL(MachineOperand &Root, MachineOperand &Base,
                    MachineOperand &Offset, unsigned SizeInBytes,
                    bool WantsExt) const;
  ComplexRendererFns selectAddrModeRegisterOffset(MachineOperand &Root) const;
  ComplexRendererFns selectAddrModeXRO(MachineOperand &Root,
                                       unsigned SizeInBytes) const;
  // \p Width is given in bits; the two-argument overload takes bytes.
  template <int Width>
  ComplexRendererFns selectAddrModeXRO(MachineOperand &Root) const {
    return selectAddrModeXRO(Root, Width / 8);
  }

  ComplexRendererFns selectAddrModeWRO(MachineOperand &Root,
                                       unsigned SizeInBytes) const;
  // \p Width is given in bits; the two-argument overload takes bytes.
  template <int Width>
  ComplexRendererFns selectAddrModeWRO(MachineOperand &Root) const {
    return selectAddrModeWRO(Root, Width / 8);
  }

  ComplexRendererFns selectShiftedRegister(MachineOperand &Root,
                                           bool AllowROR = false) const;

  // Arithmetic variant: calls selectShiftedRegister() with AllowROR left at
  // its default of false.
  ComplexRendererFns selectArithShiftedRegister(MachineOperand &Root) const {
    return selectShiftedRegister(Root);
  }

  // Logical variant: calls selectShiftedRegister() with AllowROR = true.
  ComplexRendererFns selectLogicalShiftedRegister(MachineOperand &Root) const {
    return selectShiftedRegister(Root, true);
  }

  /// Given an extend instruction, determine the correct shift-extend type for
  /// that instruction.
  ///
  /// If the instruction is going to be used in a load or store, pass
  /// \p IsLoadStore = true.
  AArch64_AM::ShiftExtendType
  getExtendTypeForInst(MachineInstr &MI, MachineRegisterInfo &MRI,
                       bool IsLoadStore = false) const;

  /// Move \p Reg to \p RC if \p Reg is not already on \p RC.
  ///
  /// \returns Either \p Reg if no change was necessary, or the new register
  /// created by moving \p Reg.
  ///
  /// Note: This uses emitCopy right now.
  Register moveScalarRegClass(Register Reg, const TargetRegisterClass &RC,
                              MachineIRBuilder &MIB) const;

  ComplexRendererFns selectArithExtendedRegister(MachineOperand &Root) const;

  ComplexRendererFns selectExtractHigh(MachineOperand &Root) const;

  void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
                      int OpIdx = -1) const;
  void renderLogicalImm32(MachineInstrBuilder &MIB, const MachineInstr &I,
                          int OpIdx = -1) const;
  void renderLogicalImm64(MachineInstrBuilder &MIB, const MachineInstr &I,
                          int OpIdx = -1) const;
  void renderUbsanTrap(MachineInstrBuilder &MIB, const MachineInstr &MI,
                       int OpIdx) const;
  void renderFPImm16(MachineInstrBuilder &MIB, const MachineInstr &MI,
                     int OpIdx = -1) const;
  void renderFPImm32(MachineInstrBuilder &MIB, const MachineInstr &MI,
                     int OpIdx = -1) const;
  void renderFPImm64(MachineInstrBuilder &MIB, const MachineInstr &MI,
                     int OpIdx = -1) const;
  void renderFPImm32SIMDModImmType4(MachineInstrBuilder &MIB,
                                    const MachineInstr &MI,
                                    int OpIdx = -1) const;

  // Materialize a GlobalValue or BlockAddress using a movz+movk sequence.
  void materializeLargeCMVal(MachineInstr &I, const Value *V, unsigned OpFlags);

  // Optimization methods.
  bool tryOptSelect(GSelect &Sel);
  bool tryOptSelectConjunction(GSelect &Sel, MachineInstr &CondMI);
  MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
                                      MachineOperand &Predicate,
                                      MachineIRBuilder &MIRBuilder) const;

  /// Return true if \p MI is a load or store of \p NumBytes bytes.
  bool isLoadStoreOfNumBytes(const MachineInstr &MI, unsigned NumBytes) const;

  /// Returns true if \p MI is guaranteed to have the high-half of a 64-bit
  /// register zeroed out. In other words, the result of MI has been explicitly
  /// zero extended.
  bool isDef32(const MachineInstr &MI) const;

  const AArch64TargetMachine &TM;
  const AArch64Subtarget &STI;
  const AArch64InstrInfo &TII;
  const AArch64RegisterInfo &TRI;
  const AArch64RegisterBankInfo &RBI;

  // Cached in setupMF(): true unless the current function carries the
  // SpeculativeLoadHardening attribute.
  bool ProduceNonFlagSettingCondBr = false;

  // Some cached values used during selection.
  // We use LR as a live-in register, and we keep track of it here as it can be
  // clobbered by calls.
  Register MFReturnAddr;

  // Builder member; setupMF() points it at the current function.
  MachineIRBuilder MIB;

#define GET_GLOBALISEL_PREDICATES_DECL
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_DECL

// We declare the temporaries used by selectImpl() in the class to minimize the
// cost of constructing placeholder values.
#define GET_GLOBALISEL_TEMPORARIES_DECL
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_DECL
};
544 
545 } // end anonymous namespace
546 
547 #define GET_GLOBALISEL_IMPL
548 #include "AArch64GenGlobalISel.inc"
549 #undef GET_GLOBALISEL_IMPL
550 
/// Construct the selector, caching the target machine, the subtarget, its
/// instruction and register info, and the register bank info.
///
/// The remainder of the member-initializer list (predicate and temporary
/// state) is supplied by the generated AArch64GenGlobalISel.inc fragments
/// included below, which is why the hand-written list ends with a comma.
AArch64InstructionSelector::AArch64InstructionSelector(
    const AArch64TargetMachine &TM, const AArch64Subtarget &STI,
    const AArch64RegisterBankInfo &RBI)
    : TM(TM), STI(STI), TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()),
      RBI(RBI),
#define GET_GLOBALISEL_PREDICATES_INIT
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_INIT
{
}
564 
565 // FIXME: This should be target-independent, inferred from the types declared
566 // for each class in the bank.
567 //
568 /// Given a register bank, and a type, return the smallest register class that
569 /// can represent that combination.
570 static const TargetRegisterClass *
getRegClassForTypeOnBank(LLT Ty,const RegisterBank & RB,bool GetAllRegSet=false)571 getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB,
572                          bool GetAllRegSet = false) {
573   if (RB.getID() == AArch64::GPRRegBankID) {
574     if (Ty.getSizeInBits() <= 32)
575       return GetAllRegSet ? &AArch64::GPR32allRegClass
576                           : &AArch64::GPR32RegClass;
577     if (Ty.getSizeInBits() == 64)
578       return GetAllRegSet ? &AArch64::GPR64allRegClass
579                           : &AArch64::GPR64RegClass;
580     if (Ty.getSizeInBits() == 128)
581       return &AArch64::XSeqPairsClassRegClass;
582     return nullptr;
583   }
584 
585   if (RB.getID() == AArch64::FPRRegBankID) {
586     switch (Ty.getSizeInBits()) {
587     case 8:
588       return &AArch64::FPR8RegClass;
589     case 16:
590       return &AArch64::FPR16RegClass;
591     case 32:
592       return &AArch64::FPR32RegClass;
593     case 64:
594       return &AArch64::FPR64RegClass;
595     case 128:
596       return &AArch64::FPR128RegClass;
597     }
598     return nullptr;
599   }
600 
601   return nullptr;
602 }
603 
604 /// Given a register bank, and size in bits, return the smallest register class
605 /// that can represent that combination.
606 static const TargetRegisterClass *
getMinClassForRegBank(const RegisterBank & RB,TypeSize SizeInBits,bool GetAllRegSet=false)607 getMinClassForRegBank(const RegisterBank &RB, TypeSize SizeInBits,
608                       bool GetAllRegSet = false) {
609   if (SizeInBits.isScalable()) {
610     assert(RB.getID() == AArch64::FPRRegBankID &&
611            "Expected FPR regbank for scalable type size");
612     return &AArch64::ZPRRegClass;
613   }
614 
615   unsigned RegBankID = RB.getID();
616 
617   if (RegBankID == AArch64::GPRRegBankID) {
618     assert(!SizeInBits.isScalable() && "Unexpected scalable register size");
619     if (SizeInBits <= 32)
620       return GetAllRegSet ? &AArch64::GPR32allRegClass
621                           : &AArch64::GPR32RegClass;
622     if (SizeInBits == 64)
623       return GetAllRegSet ? &AArch64::GPR64allRegClass
624                           : &AArch64::GPR64RegClass;
625     if (SizeInBits == 128)
626       return &AArch64::XSeqPairsClassRegClass;
627   }
628 
629   if (RegBankID == AArch64::FPRRegBankID) {
630     if (SizeInBits.isScalable()) {
631       assert(SizeInBits == TypeSize::getScalable(128) &&
632              "Unexpected scalable register size");
633       return &AArch64::ZPRRegClass;
634     }
635 
636     switch (SizeInBits) {
637     default:
638       return nullptr;
639     case 8:
640       return &AArch64::FPR8RegClass;
641     case 16:
642       return &AArch64::FPR16RegClass;
643     case 32:
644       return &AArch64::FPR32RegClass;
645     case 64:
646       return &AArch64::FPR64RegClass;
647     case 128:
648       return &AArch64::FPR128RegClass;
649     }
650   }
651 
652   return nullptr;
653 }
654 
655 /// Returns the correct subregister to use for a given register class.
getSubRegForClass(const TargetRegisterClass * RC,const TargetRegisterInfo & TRI,unsigned & SubReg)656 static bool getSubRegForClass(const TargetRegisterClass *RC,
657                               const TargetRegisterInfo &TRI, unsigned &SubReg) {
658   switch (TRI.getRegSizeInBits(*RC)) {
659   case 8:
660     SubReg = AArch64::bsub;
661     break;
662   case 16:
663     SubReg = AArch64::hsub;
664     break;
665   case 32:
666     if (RC != &AArch64::FPR32RegClass)
667       SubReg = AArch64::sub_32;
668     else
669       SubReg = AArch64::ssub;
670     break;
671   case 64:
672     SubReg = AArch64::dsub;
673     break;
674   default:
675     LLVM_DEBUG(
676         dbgs() << "Couldn't find appropriate subregister for register class.");
677     return false;
678   }
679 
680   return true;
681 }
682 
683 /// Returns the minimum size the given register bank can hold.
getMinSizeForRegBank(const RegisterBank & RB)684 static unsigned getMinSizeForRegBank(const RegisterBank &RB) {
685   switch (RB.getID()) {
686   case AArch64::GPRRegBankID:
687     return 32;
688   case AArch64::FPRRegBankID:
689     return 8;
690   default:
691     llvm_unreachable("Tried to get minimum size for unknown register bank.");
692   }
693 }
694 
695 /// Create a REG_SEQUENCE instruction using the registers in \p Regs.
696 /// Helper function for functions like createDTuple and createQTuple.
697 ///
698 /// \p RegClassIDs - The list of register class IDs available for some tuple of
699 /// a scalar class. E.g. QQRegClassID, QQQRegClassID, QQQQRegClassID. This is
700 /// expected to contain between 2 and 4 tuple classes.
701 ///
702 /// \p SubRegs - The list of subregister classes associated with each register
703 /// class ID in \p RegClassIDs. E.g., QQRegClassID should use the qsub0
704 /// subregister class. The index of each subregister class is expected to
705 /// correspond with the index of each register class.
706 ///
707 /// \returns Either the destination register of REG_SEQUENCE instruction that
708 /// was created, or the 0th element of \p Regs if \p Regs contains a single
709 /// element.
createTuple(ArrayRef<Register> Regs,const unsigned RegClassIDs[],const unsigned SubRegs[],MachineIRBuilder & MIB)710 static Register createTuple(ArrayRef<Register> Regs,
711                             const unsigned RegClassIDs[],
712                             const unsigned SubRegs[], MachineIRBuilder &MIB) {
713   unsigned NumRegs = Regs.size();
714   if (NumRegs == 1)
715     return Regs[0];
716   assert(NumRegs >= 2 && NumRegs <= 4 &&
717          "Only support between two and 4 registers in a tuple!");
718   const TargetRegisterInfo *TRI = MIB.getMF().getSubtarget().getRegisterInfo();
719   auto *DesiredClass = TRI->getRegClass(RegClassIDs[NumRegs - 2]);
720   auto RegSequence =
721       MIB.buildInstr(TargetOpcode::REG_SEQUENCE, {DesiredClass}, {});
722   for (unsigned I = 0, E = Regs.size(); I < E; ++I) {
723     RegSequence.addUse(Regs[I]);
724     RegSequence.addImm(SubRegs[I]);
725   }
726   return RegSequence.getReg(0);
727 }
728 
729 /// Create a tuple of D-registers using the registers in \p Regs.
createDTuple(ArrayRef<Register> Regs,MachineIRBuilder & MIB)730 static Register createDTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) {
731   static const unsigned RegClassIDs[] = {
732       AArch64::DDRegClassID, AArch64::DDDRegClassID, AArch64::DDDDRegClassID};
733   static const unsigned SubRegs[] = {AArch64::dsub0, AArch64::dsub1,
734                                      AArch64::dsub2, AArch64::dsub3};
735   return createTuple(Regs, RegClassIDs, SubRegs, MIB);
736 }
737 
738 /// Create a tuple of Q-registers using the registers in \p Regs.
createQTuple(ArrayRef<Register> Regs,MachineIRBuilder & MIB)739 static Register createQTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) {
740   static const unsigned RegClassIDs[] = {
741       AArch64::QQRegClassID, AArch64::QQQRegClassID, AArch64::QQQQRegClassID};
742   static const unsigned SubRegs[] = {AArch64::qsub0, AArch64::qsub1,
743                                      AArch64::qsub2, AArch64::qsub3};
744   return createTuple(Regs, RegClassIDs, SubRegs, MIB);
745 }
746 
getImmedFromMO(const MachineOperand & Root)747 static std::optional<uint64_t> getImmedFromMO(const MachineOperand &Root) {
748   auto &MI = *Root.getParent();
749   auto &MBB = *MI.getParent();
750   auto &MF = *MBB.getParent();
751   auto &MRI = MF.getRegInfo();
752   uint64_t Immed;
753   if (Root.isImm())
754     Immed = Root.getImm();
755   else if (Root.isCImm())
756     Immed = Root.getCImm()->getZExtValue();
757   else if (Root.isReg()) {
758     auto ValAndVReg =
759         getIConstantVRegValWithLookThrough(Root.getReg(), MRI, true);
760     if (!ValAndVReg)
761       return std::nullopt;
762     Immed = ValAndVReg->Value.getSExtValue();
763   } else
764     return std::nullopt;
765   return Immed;
766 }
767 
768 /// Check whether \p I is a currently unsupported binary operation:
769 /// - it has an unsized type
770 /// - an operand is not a vreg
771 /// - all operands are not in the same bank
772 /// These are checks that should someday live in the verifier, but right now,
773 /// these are mostly limitations of the aarch64 selector.
unsupportedBinOp(const MachineInstr & I,const AArch64RegisterBankInfo & RBI,const MachineRegisterInfo & MRI,const AArch64RegisterInfo & TRI)774 static bool unsupportedBinOp(const MachineInstr &I,
775                              const AArch64RegisterBankInfo &RBI,
776                              const MachineRegisterInfo &MRI,
777                              const AArch64RegisterInfo &TRI) {
778   LLT Ty = MRI.getType(I.getOperand(0).getReg());
779   if (!Ty.isValid()) {
780     LLVM_DEBUG(dbgs() << "Generic binop register should be typed\n");
781     return true;
782   }
783 
784   const RegisterBank *PrevOpBank = nullptr;
785   for (auto &MO : I.operands()) {
786     // FIXME: Support non-register operands.
787     if (!MO.isReg()) {
788       LLVM_DEBUG(dbgs() << "Generic inst non-reg operands are unsupported\n");
789       return true;
790     }
791 
792     // FIXME: Can generic operations have physical registers operands? If
793     // so, this will need to be taught about that, and we'll need to get the
794     // bank out of the minimal class for the register.
795     // Either way, this needs to be documented (and possibly verified).
796     if (!MO.getReg().isVirtual()) {
797       LLVM_DEBUG(dbgs() << "Generic inst has physical register operand\n");
798       return true;
799     }
800 
801     const RegisterBank *OpBank = RBI.getRegBank(MO.getReg(), MRI, TRI);
802     if (!OpBank) {
803       LLVM_DEBUG(dbgs() << "Generic register has no bank or class\n");
804       return true;
805     }
806 
807     if (PrevOpBank && OpBank != PrevOpBank) {
808       LLVM_DEBUG(dbgs() << "Generic inst operands have different banks\n");
809       return true;
810     }
811     PrevOpBank = OpBank;
812   }
813   return false;
814 }
815 
816 /// Select the AArch64 opcode for the basic binary operation \p GenericOpc
817 /// (such as G_OR or G_SDIV), appropriate for the register bank \p RegBankID
818 /// and of size \p OpSize.
819 /// \returns \p GenericOpc if the combination is unsupported.
selectBinaryOp(unsigned GenericOpc,unsigned RegBankID,unsigned OpSize)820 static unsigned selectBinaryOp(unsigned GenericOpc, unsigned RegBankID,
821                                unsigned OpSize) {
822   switch (RegBankID) {
823   case AArch64::GPRRegBankID:
824     if (OpSize == 32) {
825       switch (GenericOpc) {
826       case TargetOpcode::G_SHL:
827         return AArch64::LSLVWr;
828       case TargetOpcode::G_LSHR:
829         return AArch64::LSRVWr;
830       case TargetOpcode::G_ASHR:
831         return AArch64::ASRVWr;
832       default:
833         return GenericOpc;
834       }
835     } else if (OpSize == 64) {
836       switch (GenericOpc) {
837       case TargetOpcode::G_PTR_ADD:
838         return AArch64::ADDXrr;
839       case TargetOpcode::G_SHL:
840         return AArch64::LSLVXr;
841       case TargetOpcode::G_LSHR:
842         return AArch64::LSRVXr;
843       case TargetOpcode::G_ASHR:
844         return AArch64::ASRVXr;
845       default:
846         return GenericOpc;
847       }
848     }
849     break;
850   case AArch64::FPRRegBankID:
851     switch (OpSize) {
852     case 32:
853       switch (GenericOpc) {
854       case TargetOpcode::G_FADD:
855         return AArch64::FADDSrr;
856       case TargetOpcode::G_FSUB:
857         return AArch64::FSUBSrr;
858       case TargetOpcode::G_FMUL:
859         return AArch64::FMULSrr;
860       case TargetOpcode::G_FDIV:
861         return AArch64::FDIVSrr;
862       default:
863         return GenericOpc;
864       }
865     case 64:
866       switch (GenericOpc) {
867       case TargetOpcode::G_FADD:
868         return AArch64::FADDDrr;
869       case TargetOpcode::G_FSUB:
870         return AArch64::FSUBDrr;
871       case TargetOpcode::G_FMUL:
872         return AArch64::FMULDrr;
873       case TargetOpcode::G_FDIV:
874         return AArch64::FDIVDrr;
875       case TargetOpcode::G_OR:
876         return AArch64::ORRv8i8;
877       default:
878         return GenericOpc;
879       }
880     }
881     break;
882   }
883   return GenericOpc;
884 }
885 
886 /// Select the AArch64 opcode for the G_LOAD or G_STORE operation \p GenericOpc,
887 /// appropriate for the (value) register bank \p RegBankID and of memory access
888 /// size \p OpSize.  This returns the variant with the base+unsigned-immediate
889 /// addressing mode (e.g., LDRXui).
890 /// \returns \p GenericOpc if the combination is unsupported.
selectLoadStoreUIOp(unsigned GenericOpc,unsigned RegBankID,unsigned OpSize)891 static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID,
892                                     unsigned OpSize) {
893   const bool isStore = GenericOpc == TargetOpcode::G_STORE;
894   switch (RegBankID) {
895   case AArch64::GPRRegBankID:
896     switch (OpSize) {
897     case 8:
898       return isStore ? AArch64::STRBBui : AArch64::LDRBBui;
899     case 16:
900       return isStore ? AArch64::STRHHui : AArch64::LDRHHui;
901     case 32:
902       return isStore ? AArch64::STRWui : AArch64::LDRWui;
903     case 64:
904       return isStore ? AArch64::STRXui : AArch64::LDRXui;
905     }
906     break;
907   case AArch64::FPRRegBankID:
908     switch (OpSize) {
909     case 8:
910       return isStore ? AArch64::STRBui : AArch64::LDRBui;
911     case 16:
912       return isStore ? AArch64::STRHui : AArch64::LDRHui;
913     case 32:
914       return isStore ? AArch64::STRSui : AArch64::LDRSui;
915     case 64:
916       return isStore ? AArch64::STRDui : AArch64::LDRDui;
917     case 128:
918       return isStore ? AArch64::STRQui : AArch64::LDRQui;
919     }
920     break;
921   }
922   return GenericOpc;
923 }
924 
925 /// Helper function for selectCopy. Inserts a subregister copy from \p SrcReg
926 /// to \p *To.
927 ///
928 /// E.g "To = COPY SrcReg:SubReg"
copySubReg(MachineInstr & I,MachineRegisterInfo & MRI,const RegisterBankInfo & RBI,Register SrcReg,const TargetRegisterClass * To,unsigned SubReg)929 static bool copySubReg(MachineInstr &I, MachineRegisterInfo &MRI,
930                        const RegisterBankInfo &RBI, Register SrcReg,
931                        const TargetRegisterClass *To, unsigned SubReg) {
932   assert(SrcReg.isValid() && "Expected a valid source register?");
933   assert(To && "Destination register class cannot be null");
934   assert(SubReg && "Expected a valid subregister");
935 
936   MachineIRBuilder MIB(I);
937   auto SubRegCopy =
938       MIB.buildInstr(TargetOpcode::COPY, {To}, {}).addReg(SrcReg, 0, SubReg);
939   MachineOperand &RegOp = I.getOperand(1);
940   RegOp.setReg(SubRegCopy.getReg(0));
941 
942   // It's possible that the destination register won't be constrained. Make
943   // sure that happens.
944   if (!I.getOperand(0).getReg().isPhysical())
945     RBI.constrainGenericRegister(I.getOperand(0).getReg(), *To, MRI);
946 
947   return true;
948 }
949 
950 /// Helper function to get the source and destination register classes for a
951 /// copy. Returns a std::pair containing the source register class for the
952 /// copy, and the destination register class for the copy. If a register class
953 /// cannot be determined, then it will be nullptr.
954 static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
getRegClassesForCopy(MachineInstr & I,const TargetInstrInfo & TII,MachineRegisterInfo & MRI,const TargetRegisterInfo & TRI,const RegisterBankInfo & RBI)955 getRegClassesForCopy(MachineInstr &I, const TargetInstrInfo &TII,
956                      MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
957                      const RegisterBankInfo &RBI) {
958   Register DstReg = I.getOperand(0).getReg();
959   Register SrcReg = I.getOperand(1).getReg();
960   const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
961   const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
962 
963   TypeSize DstSize = RBI.getSizeInBits(DstReg, MRI, TRI);
964   TypeSize SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
965 
966   // Special casing for cross-bank copies of s1s. We can technically represent
967   // a 1-bit value with any size of register. The minimum size for a GPR is 32
968   // bits. So, we need to put the FPR on 32 bits as well.
969   //
970   // FIXME: I'm not sure if this case holds true outside of copies. If it does,
971   // then we can pull it into the helpers that get the appropriate class for a
972   // register bank. Or make a new helper that carries along some constraint
973   // information.
974   if (SrcRegBank != DstRegBank &&
975       (DstSize == TypeSize::getFixed(1) && SrcSize == TypeSize::getFixed(1)))
976     SrcSize = DstSize = TypeSize::getFixed(32);
977 
978   return {getMinClassForRegBank(SrcRegBank, SrcSize, true),
979           getMinClassForRegBank(DstRegBank, DstSize, true)};
980 }
981 
982 // FIXME: We need some sort of API in RBI/TRI to allow generic code to
983 // constrain operands of simple instructions given a TargetRegisterClass
984 // and LLT
selectDebugInstr(MachineInstr & I,MachineRegisterInfo & MRI,const RegisterBankInfo & RBI)985 static bool selectDebugInstr(MachineInstr &I, MachineRegisterInfo &MRI,
986                              const RegisterBankInfo &RBI) {
987   for (MachineOperand &MO : I.operands()) {
988     if (!MO.isReg())
989       continue;
990     Register Reg = MO.getReg();
991     if (!Reg)
992       continue;
993     if (Reg.isPhysical())
994       continue;
995     LLT Ty = MRI.getType(Reg);
996     const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
997     const TargetRegisterClass *RC =
998         dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
999     if (!RC) {
1000       const RegisterBank &RB = *cast<const RegisterBank *>(RegClassOrBank);
1001       RC = getRegClassForTypeOnBank(Ty, RB);
1002       if (!RC) {
1003         LLVM_DEBUG(
1004             dbgs() << "Warning: DBG_VALUE operand has unexpected size/bank\n");
1005         break;
1006       }
1007     }
1008     RBI.constrainGenericRegister(Reg, *RC, MRI);
1009   }
1010 
1011   return true;
1012 }
1013 
selectCopy(MachineInstr & I,const TargetInstrInfo & TII,MachineRegisterInfo & MRI,const TargetRegisterInfo & TRI,const RegisterBankInfo & RBI)1014 static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
1015                        MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
1016                        const RegisterBankInfo &RBI) {
1017   Register DstReg = I.getOperand(0).getReg();
1018   Register SrcReg = I.getOperand(1).getReg();
1019   const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
1020   const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
1021 
1022   // Find the correct register classes for the source and destination registers.
1023   const TargetRegisterClass *SrcRC;
1024   const TargetRegisterClass *DstRC;
1025   std::tie(SrcRC, DstRC) = getRegClassesForCopy(I, TII, MRI, TRI, RBI);
1026 
1027   if (!DstRC) {
1028     LLVM_DEBUG(dbgs() << "Unexpected dest size "
1029                       << RBI.getSizeInBits(DstReg, MRI, TRI) << '\n');
1030     return false;
1031   }
1032 
1033   // Is this a copy? If so, then we may need to insert a subregister copy.
1034   if (I.isCopy()) {
1035     // Yes. Check if there's anything to fix up.
1036     if (!SrcRC) {
1037       LLVM_DEBUG(dbgs() << "Couldn't determine source register class\n");
1038       return false;
1039     }
1040 
1041     const TypeSize SrcSize = TRI.getRegSizeInBits(*SrcRC);
1042     const TypeSize DstSize = TRI.getRegSizeInBits(*DstRC);
1043     unsigned SubReg;
1044 
1045     // If the source bank doesn't support a subregister copy small enough,
1046     // then we first need to copy to the destination bank.
1047     if (getMinSizeForRegBank(SrcRegBank) > DstSize) {
1048       const TargetRegisterClass *DstTempRC =
1049           getMinClassForRegBank(DstRegBank, SrcSize, /* GetAllRegSet */ true);
1050       getSubRegForClass(DstRC, TRI, SubReg);
1051 
1052       MachineIRBuilder MIB(I);
1053       auto Copy = MIB.buildCopy({DstTempRC}, {SrcReg});
1054       copySubReg(I, MRI, RBI, Copy.getReg(0), DstRC, SubReg);
1055     } else if (SrcSize > DstSize) {
1056       // If the source register is bigger than the destination we need to
1057       // perform a subregister copy.
1058       const TargetRegisterClass *SubRegRC =
1059           getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true);
1060       getSubRegForClass(SubRegRC, TRI, SubReg);
1061       copySubReg(I, MRI, RBI, SrcReg, DstRC, SubReg);
1062     } else if (DstSize > SrcSize) {
1063       // If the destination register is bigger than the source we need to do
1064       // a promotion using SUBREG_TO_REG.
1065       const TargetRegisterClass *PromotionRC =
1066           getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true);
1067       getSubRegForClass(SrcRC, TRI, SubReg);
1068 
1069       Register PromoteReg = MRI.createVirtualRegister(PromotionRC);
1070       BuildMI(*I.getParent(), I, I.getDebugLoc(),
1071               TII.get(AArch64::SUBREG_TO_REG), PromoteReg)
1072           .addImm(0)
1073           .addUse(SrcReg)
1074           .addImm(SubReg);
1075       MachineOperand &RegOp = I.getOperand(1);
1076       RegOp.setReg(PromoteReg);
1077     }
1078 
1079     // If the destination is a physical register, then there's nothing to
1080     // change, so we're done.
1081     if (DstReg.isPhysical())
1082       return true;
1083   }
1084 
1085   // No need to constrain SrcReg. It will get constrained when we hit another
1086   // of its use or its defs. Copies do not have constraints.
1087   if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
1088     LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
1089                       << " operand\n");
1090     return false;
1091   }
1092 
1093   // If this a GPR ZEXT that we want to just reduce down into a copy.
1094   // The sizes will be mismatched with the source < 32b but that's ok.
1095   if (I.getOpcode() == TargetOpcode::G_ZEXT) {
1096     I.setDesc(TII.get(AArch64::COPY));
1097     assert(SrcRegBank.getID() == AArch64::GPRRegBankID);
1098     return selectCopy(I, TII, MRI, TRI, RBI);
1099   }
1100 
1101   I.setDesc(TII.get(AArch64::COPY));
1102   return true;
1103 }
1104 
selectFPConvOpc(unsigned GenericOpc,LLT DstTy,LLT SrcTy)1105 static unsigned selectFPConvOpc(unsigned GenericOpc, LLT DstTy, LLT SrcTy) {
1106   if (!DstTy.isScalar() || !SrcTy.isScalar())
1107     return GenericOpc;
1108 
1109   const unsigned DstSize = DstTy.getSizeInBits();
1110   const unsigned SrcSize = SrcTy.getSizeInBits();
1111 
1112   switch (DstSize) {
1113   case 32:
1114     switch (SrcSize) {
1115     case 32:
1116       switch (GenericOpc) {
1117       case TargetOpcode::G_SITOFP:
1118         return AArch64::SCVTFUWSri;
1119       case TargetOpcode::G_UITOFP:
1120         return AArch64::UCVTFUWSri;
1121       case TargetOpcode::G_FPTOSI:
1122         return AArch64::FCVTZSUWSr;
1123       case TargetOpcode::G_FPTOUI:
1124         return AArch64::FCVTZUUWSr;
1125       default:
1126         return GenericOpc;
1127       }
1128     case 64:
1129       switch (GenericOpc) {
1130       case TargetOpcode::G_SITOFP:
1131         return AArch64::SCVTFUXSri;
1132       case TargetOpcode::G_UITOFP:
1133         return AArch64::UCVTFUXSri;
1134       case TargetOpcode::G_FPTOSI:
1135         return AArch64::FCVTZSUWDr;
1136       case TargetOpcode::G_FPTOUI:
1137         return AArch64::FCVTZUUWDr;
1138       default:
1139         return GenericOpc;
1140       }
1141     default:
1142       return GenericOpc;
1143     }
1144   case 64:
1145     switch (SrcSize) {
1146     case 32:
1147       switch (GenericOpc) {
1148       case TargetOpcode::G_SITOFP:
1149         return AArch64::SCVTFUWDri;
1150       case TargetOpcode::G_UITOFP:
1151         return AArch64::UCVTFUWDri;
1152       case TargetOpcode::G_FPTOSI:
1153         return AArch64::FCVTZSUXSr;
1154       case TargetOpcode::G_FPTOUI:
1155         return AArch64::FCVTZUUXSr;
1156       default:
1157         return GenericOpc;
1158       }
1159     case 64:
1160       switch (GenericOpc) {
1161       case TargetOpcode::G_SITOFP:
1162         return AArch64::SCVTFUXDri;
1163       case TargetOpcode::G_UITOFP:
1164         return AArch64::UCVTFUXDri;
1165       case TargetOpcode::G_FPTOSI:
1166         return AArch64::FCVTZSUXDr;
1167       case TargetOpcode::G_FPTOUI:
1168         return AArch64::FCVTZUUXDr;
1169       default:
1170         return GenericOpc;
1171       }
1172     default:
1173       return GenericOpc;
1174     }
1175   default:
1176     return GenericOpc;
1177   };
1178   return GenericOpc;
1179 }
1180 
1181 MachineInstr *
emitSelect(Register Dst,Register True,Register False,AArch64CC::CondCode CC,MachineIRBuilder & MIB) const1182 AArch64InstructionSelector::emitSelect(Register Dst, Register True,
1183                                        Register False, AArch64CC::CondCode CC,
1184                                        MachineIRBuilder &MIB) const {
1185   MachineRegisterInfo &MRI = *MIB.getMRI();
1186   assert(RBI.getRegBank(False, MRI, TRI)->getID() ==
1187              RBI.getRegBank(True, MRI, TRI)->getID() &&
1188          "Expected both select operands to have the same regbank?");
1189   LLT Ty = MRI.getType(True);
1190   if (Ty.isVector())
1191     return nullptr;
1192   const unsigned Size = Ty.getSizeInBits();
1193   assert((Size == 32 || Size == 64) &&
1194          "Expected 32 bit or 64 bit select only?");
1195   const bool Is32Bit = Size == 32;
1196   if (RBI.getRegBank(True, MRI, TRI)->getID() != AArch64::GPRRegBankID) {
1197     unsigned Opc = Is32Bit ? AArch64::FCSELSrrr : AArch64::FCSELDrrr;
1198     auto FCSel = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC);
1199     constrainSelectedInstRegOperands(*FCSel, TII, TRI, RBI);
1200     return &*FCSel;
1201   }
1202 
1203   // By default, we'll try and emit a CSEL.
1204   unsigned Opc = Is32Bit ? AArch64::CSELWr : AArch64::CSELXr;
1205   bool Optimized = false;
1206   auto TryFoldBinOpIntoSelect = [&Opc, Is32Bit, &CC, &MRI,
1207                                  &Optimized](Register &Reg, Register &OtherReg,
1208                                              bool Invert) {
1209     if (Optimized)
1210       return false;
1211 
1212     // Attempt to fold:
1213     //
1214     // %sub = G_SUB 0, %x
1215     // %select = G_SELECT cc, %reg, %sub
1216     //
1217     // Into:
1218     // %select = CSNEG %reg, %x, cc
1219     Register MatchReg;
1220     if (mi_match(Reg, MRI, m_Neg(m_Reg(MatchReg)))) {
1221       Opc = Is32Bit ? AArch64::CSNEGWr : AArch64::CSNEGXr;
1222       Reg = MatchReg;
1223       if (Invert) {
1224         CC = AArch64CC::getInvertedCondCode(CC);
1225         std::swap(Reg, OtherReg);
1226       }
1227       return true;
1228     }
1229 
1230     // Attempt to fold:
1231     //
1232     // %xor = G_XOR %x, -1
1233     // %select = G_SELECT cc, %reg, %xor
1234     //
1235     // Into:
1236     // %select = CSINV %reg, %x, cc
1237     if (mi_match(Reg, MRI, m_Not(m_Reg(MatchReg)))) {
1238       Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1239       Reg = MatchReg;
1240       if (Invert) {
1241         CC = AArch64CC::getInvertedCondCode(CC);
1242         std::swap(Reg, OtherReg);
1243       }
1244       return true;
1245     }
1246 
1247     // Attempt to fold:
1248     //
1249     // %add = G_ADD %x, 1
1250     // %select = G_SELECT cc, %reg, %add
1251     //
1252     // Into:
1253     // %select = CSINC %reg, %x, cc
1254     if (mi_match(Reg, MRI,
1255                  m_any_of(m_GAdd(m_Reg(MatchReg), m_SpecificICst(1)),
1256                           m_GPtrAdd(m_Reg(MatchReg), m_SpecificICst(1))))) {
1257       Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1258       Reg = MatchReg;
1259       if (Invert) {
1260         CC = AArch64CC::getInvertedCondCode(CC);
1261         std::swap(Reg, OtherReg);
1262       }
1263       return true;
1264     }
1265 
1266     return false;
1267   };
1268 
1269   // Helper lambda which tries to use CSINC/CSINV for the instruction when its
1270   // true/false values are constants.
1271   // FIXME: All of these patterns already exist in tablegen. We should be
1272   // able to import these.
1273   auto TryOptSelectCst = [&Opc, &True, &False, &CC, Is32Bit, &MRI,
1274                           &Optimized]() {
1275     if (Optimized)
1276       return false;
1277     auto TrueCst = getIConstantVRegValWithLookThrough(True, MRI);
1278     auto FalseCst = getIConstantVRegValWithLookThrough(False, MRI);
1279     if (!TrueCst && !FalseCst)
1280       return false;
1281 
1282     Register ZReg = Is32Bit ? AArch64::WZR : AArch64::XZR;
1283     if (TrueCst && FalseCst) {
1284       int64_t T = TrueCst->Value.getSExtValue();
1285       int64_t F = FalseCst->Value.getSExtValue();
1286 
1287       if (T == 0 && F == 1) {
1288         // G_SELECT cc, 0, 1 -> CSINC zreg, zreg, cc
1289         Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1290         True = ZReg;
1291         False = ZReg;
1292         return true;
1293       }
1294 
1295       if (T == 0 && F == -1) {
1296         // G_SELECT cc 0, -1 -> CSINV zreg, zreg cc
1297         Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1298         True = ZReg;
1299         False = ZReg;
1300         return true;
1301       }
1302     }
1303 
1304     if (TrueCst) {
1305       int64_t T = TrueCst->Value.getSExtValue();
1306       if (T == 1) {
1307         // G_SELECT cc, 1, f -> CSINC f, zreg, inv_cc
1308         Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1309         True = False;
1310         False = ZReg;
1311         CC = AArch64CC::getInvertedCondCode(CC);
1312         return true;
1313       }
1314 
1315       if (T == -1) {
1316         // G_SELECT cc, -1, f -> CSINV f, zreg, inv_cc
1317         Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1318         True = False;
1319         False = ZReg;
1320         CC = AArch64CC::getInvertedCondCode(CC);
1321         return true;
1322       }
1323     }
1324 
1325     if (FalseCst) {
1326       int64_t F = FalseCst->Value.getSExtValue();
1327       if (F == 1) {
1328         // G_SELECT cc, t, 1 -> CSINC t, zreg, cc
1329         Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1330         False = ZReg;
1331         return true;
1332       }
1333 
1334       if (F == -1) {
1335         // G_SELECT cc, t, -1 -> CSINC t, zreg, cc
1336         Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1337         False = ZReg;
1338         return true;
1339       }
1340     }
1341     return false;
1342   };
1343 
1344   Optimized |= TryFoldBinOpIntoSelect(False, True, /*Invert = */ false);
1345   Optimized |= TryFoldBinOpIntoSelect(True, False, /*Invert = */ true);
1346   Optimized |= TryOptSelectCst();
1347   auto SelectInst = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC);
1348   constrainSelectedInstRegOperands(*SelectInst, TII, TRI, RBI);
1349   return &*SelectInst;
1350 }
1351 
changeICMPPredToAArch64CC(CmpInst::Predicate P)1352 static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) {
1353   switch (P) {
1354   default:
1355     llvm_unreachable("Unknown condition code!");
1356   case CmpInst::ICMP_NE:
1357     return AArch64CC::NE;
1358   case CmpInst::ICMP_EQ:
1359     return AArch64CC::EQ;
1360   case CmpInst::ICMP_SGT:
1361     return AArch64CC::GT;
1362   case CmpInst::ICMP_SGE:
1363     return AArch64CC::GE;
1364   case CmpInst::ICMP_SLT:
1365     return AArch64CC::LT;
1366   case CmpInst::ICMP_SLE:
1367     return AArch64CC::LE;
1368   case CmpInst::ICMP_UGT:
1369     return AArch64CC::HI;
1370   case CmpInst::ICMP_UGE:
1371     return AArch64CC::HS;
1372   case CmpInst::ICMP_ULT:
1373     return AArch64CC::LO;
1374   case CmpInst::ICMP_ULE:
1375     return AArch64CC::LS;
1376   }
1377 }
1378 
1379 /// changeFPCCToORAArch64CC - Convert an IR fp condition code to an AArch64 CC.
changeFPCCToORAArch64CC(CmpInst::Predicate CC,AArch64CC::CondCode & CondCode,AArch64CC::CondCode & CondCode2)1380 static void changeFPCCToORAArch64CC(CmpInst::Predicate CC,
1381                                     AArch64CC::CondCode &CondCode,
1382                                     AArch64CC::CondCode &CondCode2) {
1383   CondCode2 = AArch64CC::AL;
1384   switch (CC) {
1385   default:
1386     llvm_unreachable("Unknown FP condition!");
1387   case CmpInst::FCMP_OEQ:
1388     CondCode = AArch64CC::EQ;
1389     break;
1390   case CmpInst::FCMP_OGT:
1391     CondCode = AArch64CC::GT;
1392     break;
1393   case CmpInst::FCMP_OGE:
1394     CondCode = AArch64CC::GE;
1395     break;
1396   case CmpInst::FCMP_OLT:
1397     CondCode = AArch64CC::MI;
1398     break;
1399   case CmpInst::FCMP_OLE:
1400     CondCode = AArch64CC::LS;
1401     break;
1402   case CmpInst::FCMP_ONE:
1403     CondCode = AArch64CC::MI;
1404     CondCode2 = AArch64CC::GT;
1405     break;
1406   case CmpInst::FCMP_ORD:
1407     CondCode = AArch64CC::VC;
1408     break;
1409   case CmpInst::FCMP_UNO:
1410     CondCode = AArch64CC::VS;
1411     break;
1412   case CmpInst::FCMP_UEQ:
1413     CondCode = AArch64CC::EQ;
1414     CondCode2 = AArch64CC::VS;
1415     break;
1416   case CmpInst::FCMP_UGT:
1417     CondCode = AArch64CC::HI;
1418     break;
1419   case CmpInst::FCMP_UGE:
1420     CondCode = AArch64CC::PL;
1421     break;
1422   case CmpInst::FCMP_ULT:
1423     CondCode = AArch64CC::LT;
1424     break;
1425   case CmpInst::FCMP_ULE:
1426     CondCode = AArch64CC::LE;
1427     break;
1428   case CmpInst::FCMP_UNE:
1429     CondCode = AArch64CC::NE;
1430     break;
1431   }
1432 }
1433 
1434 /// Convert an IR fp condition code to an AArch64 CC.
1435 /// This differs from changeFPCCToAArch64CC in that it returns cond codes that
1436 /// should be AND'ed instead of OR'ed.
changeFPCCToANDAArch64CC(CmpInst::Predicate CC,AArch64CC::CondCode & CondCode,AArch64CC::CondCode & CondCode2)1437 static void changeFPCCToANDAArch64CC(CmpInst::Predicate CC,
1438                                      AArch64CC::CondCode &CondCode,
1439                                      AArch64CC::CondCode &CondCode2) {
1440   CondCode2 = AArch64CC::AL;
1441   switch (CC) {
1442   default:
1443     changeFPCCToORAArch64CC(CC, CondCode, CondCode2);
1444     assert(CondCode2 == AArch64CC::AL);
1445     break;
1446   case CmpInst::FCMP_ONE:
1447     // (a one b)
1448     // == ((a olt b) || (a ogt b))
1449     // == ((a ord b) && (a une b))
1450     CondCode = AArch64CC::VC;
1451     CondCode2 = AArch64CC::NE;
1452     break;
1453   case CmpInst::FCMP_UEQ:
1454     // (a ueq b)
1455     // == ((a uno b) || (a oeq b))
1456     // == ((a ule b) && (a uge b))
1457     CondCode = AArch64CC::PL;
1458     CondCode2 = AArch64CC::LE;
1459     break;
1460   }
1461 }
1462 
1463 /// Return a register which can be used as a bit to test in a TB(N)Z.
getTestBitReg(Register Reg,uint64_t & Bit,bool & Invert,MachineRegisterInfo & MRI)1464 static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert,
1465                               MachineRegisterInfo &MRI) {
1466   assert(Reg.isValid() && "Expected valid register!");
1467   bool HasZext = false;
1468   while (MachineInstr *MI = getDefIgnoringCopies(Reg, MRI)) {
1469     unsigned Opc = MI->getOpcode();
1470 
1471     if (!MI->getOperand(0).isReg() ||
1472         !MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
1473       break;
1474 
1475     // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
1476     //
1477     // (tbz (trunc x), b) -> (tbz x, b) is always safe, because the bit number
1478     // on the truncated x is the same as the bit number on x.
1479     if (Opc == TargetOpcode::G_ANYEXT || Opc == TargetOpcode::G_ZEXT ||
1480         Opc == TargetOpcode::G_TRUNC) {
1481       if (Opc == TargetOpcode::G_ZEXT)
1482         HasZext = true;
1483 
1484       Register NextReg = MI->getOperand(1).getReg();
1485       // Did we find something worth folding?
1486       if (!NextReg.isValid() || !MRI.hasOneNonDBGUse(NextReg))
1487         break;
1488 
1489       // NextReg is worth folding. Keep looking.
1490       Reg = NextReg;
1491       continue;
1492     }
1493 
1494     // Attempt to find a suitable operation with a constant on one side.
1495     std::optional<uint64_t> C;
1496     Register TestReg;
1497     switch (Opc) {
1498     default:
1499       break;
1500     case TargetOpcode::G_AND:
1501     case TargetOpcode::G_XOR: {
1502       TestReg = MI->getOperand(1).getReg();
1503       Register ConstantReg = MI->getOperand(2).getReg();
1504       auto VRegAndVal = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
1505       if (!VRegAndVal) {
1506         // AND commutes, check the other side for a constant.
1507         // FIXME: Can we canonicalize the constant so that it's always on the
1508         // same side at some point earlier?
1509         std::swap(ConstantReg, TestReg);
1510         VRegAndVal = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
1511       }
1512       if (VRegAndVal) {
1513         if (HasZext)
1514           C = VRegAndVal->Value.getZExtValue();
1515         else
1516           C = VRegAndVal->Value.getSExtValue();
1517       }
1518       break;
1519     }
1520     case TargetOpcode::G_ASHR:
1521     case TargetOpcode::G_LSHR:
1522     case TargetOpcode::G_SHL: {
1523       TestReg = MI->getOperand(1).getReg();
1524       auto VRegAndVal =
1525           getIConstantVRegValWithLookThrough(MI->getOperand(2).getReg(), MRI);
1526       if (VRegAndVal)
1527         C = VRegAndVal->Value.getSExtValue();
1528       break;
1529     }
1530     }
1531 
1532     // Didn't find a constant or viable register. Bail out of the loop.
1533     if (!C || !TestReg.isValid())
1534       break;
1535 
1536     // We found a suitable instruction with a constant. Check to see if we can
1537     // walk through the instruction.
1538     Register NextReg;
1539     unsigned TestRegSize = MRI.getType(TestReg).getSizeInBits();
1540     switch (Opc) {
1541     default:
1542       break;
1543     case TargetOpcode::G_AND:
1544       // (tbz (and x, m), b) -> (tbz x, b) when the b-th bit of m is set.
1545       if ((*C >> Bit) & 1)
1546         NextReg = TestReg;
1547       break;
1548     case TargetOpcode::G_SHL:
1549       // (tbz (shl x, c), b) -> (tbz x, b-c) when b-c is positive and fits in
1550       // the type of the register.
1551       if (*C <= Bit && (Bit - *C) < TestRegSize) {
1552         NextReg = TestReg;
1553         Bit = Bit - *C;
1554       }
1555       break;
1556     case TargetOpcode::G_ASHR:
1557       // (tbz (ashr x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits
1558       // in x
1559       NextReg = TestReg;
1560       Bit = Bit + *C;
1561       if (Bit >= TestRegSize)
1562         Bit = TestRegSize - 1;
1563       break;
1564     case TargetOpcode::G_LSHR:
1565       // (tbz (lshr x, c), b) -> (tbz x, b+c) when b + c is < # bits in x
1566       if ((Bit + *C) < TestRegSize) {
1567         NextReg = TestReg;
1568         Bit = Bit + *C;
1569       }
1570       break;
1571     case TargetOpcode::G_XOR:
1572       // We can walk through a G_XOR by inverting whether we use tbz/tbnz when
1573       // appropriate.
1574       //
1575       // e.g. If x' = xor x, c, and the b-th bit is set in c then
1576       //
1577       // tbz x', b -> tbnz x, b
1578       //
1579       // Because x' only has the b-th bit set if x does not.
1580       if ((*C >> Bit) & 1)
1581         Invert = !Invert;
1582       NextReg = TestReg;
1583       break;
1584     }
1585 
1586     // Check if we found anything worth folding.
1587     if (!NextReg.isValid())
1588       return Reg;
1589     Reg = NextReg;
1590   }
1591 
1592   return Reg;
1593 }
1594 
emitTestBit(Register TestReg,uint64_t Bit,bool IsNegative,MachineBasicBlock * DstMBB,MachineIRBuilder & MIB) const1595 MachineInstr *AArch64InstructionSelector::emitTestBit(
1596     Register TestReg, uint64_t Bit, bool IsNegative, MachineBasicBlock *DstMBB,
1597     MachineIRBuilder &MIB) const {
1598   assert(TestReg.isValid());
1599   assert(ProduceNonFlagSettingCondBr &&
1600          "Cannot emit TB(N)Z with speculation tracking!");
1601   MachineRegisterInfo &MRI = *MIB.getMRI();
1602 
1603   // Attempt to optimize the test bit by walking over instructions.
1604   TestReg = getTestBitReg(TestReg, Bit, IsNegative, MRI);
1605   LLT Ty = MRI.getType(TestReg);
1606   unsigned Size = Ty.getSizeInBits();
1607   assert(!Ty.isVector() && "Expected a scalar!");
1608   assert(Bit < 64 && "Bit is too large!");
1609 
1610   // When the test register is a 64-bit register, we have to narrow to make
1611   // TBNZW work.
1612   bool UseWReg = Bit < 32;
1613   unsigned NecessarySize = UseWReg ? 32 : 64;
1614   if (Size != NecessarySize)
1615     TestReg = moveScalarRegClass(
1616         TestReg, UseWReg ? AArch64::GPR32RegClass : AArch64::GPR64RegClass,
1617         MIB);
1618 
1619   static const unsigned OpcTable[2][2] = {{AArch64::TBZX, AArch64::TBNZX},
1620                                           {AArch64::TBZW, AArch64::TBNZW}};
1621   unsigned Opc = OpcTable[UseWReg][IsNegative];
1622   auto TestBitMI =
1623       MIB.buildInstr(Opc).addReg(TestReg).addImm(Bit).addMBB(DstMBB);
1624   constrainSelectedInstRegOperands(*TestBitMI, TII, TRI, RBI);
1625   return &*TestBitMI;
1626 }
1627 
tryOptAndIntoCompareBranch(MachineInstr & AndInst,bool Invert,MachineBasicBlock * DstMBB,MachineIRBuilder & MIB) const1628 bool AArch64InstructionSelector::tryOptAndIntoCompareBranch(
1629     MachineInstr &AndInst, bool Invert, MachineBasicBlock *DstMBB,
1630     MachineIRBuilder &MIB) const {
1631   assert(AndInst.getOpcode() == TargetOpcode::G_AND && "Expected G_AND only?");
1632   // Given something like this:
1633   //
1634   //  %x = ...Something...
1635   //  %one = G_CONSTANT i64 1
1636   //  %zero = G_CONSTANT i64 0
1637   //  %and = G_AND %x, %one
1638   //  %cmp = G_ICMP intpred(ne), %and, %zero
1639   //  %cmp_trunc = G_TRUNC %cmp
1640   //  G_BRCOND %cmp_trunc, %bb.3
1641   //
1642   // We want to try and fold the AND into the G_BRCOND and produce either a
1643   // TBNZ (when we have intpred(ne)) or a TBZ (when we have intpred(eq)).
1644   //
1645   // In this case, we'd get
1646   //
1647   // TBNZ %x %bb.3
1648   //
1649 
1650   // Check if the AND has a constant on its RHS which we can use as a mask.
1651   // If it's a power of 2, then it's the same as checking a specific bit.
1652   // (e.g, ANDing with 8 == ANDing with 000...100 == testing if bit 3 is set)
1653   auto MaybeBit = getIConstantVRegValWithLookThrough(
1654       AndInst.getOperand(2).getReg(), *MIB.getMRI());
1655   if (!MaybeBit)
1656     return false;
1657 
1658   int32_t Bit = MaybeBit->Value.exactLogBase2();
1659   if (Bit < 0)
1660     return false;
1661 
1662   Register TestReg = AndInst.getOperand(1).getReg();
1663 
1664   // Emit a TB(N)Z.
1665   emitTestBit(TestReg, Bit, Invert, DstMBB, MIB);
1666   return true;
1667 }
1668 
emitCBZ(Register CompareReg,bool IsNegative,MachineBasicBlock * DestMBB,MachineIRBuilder & MIB) const1669 MachineInstr *AArch64InstructionSelector::emitCBZ(Register CompareReg,
1670                                                   bool IsNegative,
1671                                                   MachineBasicBlock *DestMBB,
1672                                                   MachineIRBuilder &MIB) const {
1673   assert(ProduceNonFlagSettingCondBr && "CBZ does not set flags!");
1674   MachineRegisterInfo &MRI = *MIB.getMRI();
1675   assert(RBI.getRegBank(CompareReg, MRI, TRI)->getID() ==
1676              AArch64::GPRRegBankID &&
1677          "Expected GPRs only?");
1678   auto Ty = MRI.getType(CompareReg);
1679   unsigned Width = Ty.getSizeInBits();
1680   assert(!Ty.isVector() && "Expected scalar only?");
1681   assert(Width <= 64 && "Expected width to be at most 64?");
1682   static const unsigned OpcTable[2][2] = {{AArch64::CBZW, AArch64::CBZX},
1683                                           {AArch64::CBNZW, AArch64::CBNZX}};
1684   unsigned Opc = OpcTable[IsNegative][Width == 64];
1685   auto BranchMI = MIB.buildInstr(Opc, {}, {CompareReg}).addMBB(DestMBB);
1686   constrainSelectedInstRegOperands(*BranchMI, TII, TRI, RBI);
1687   return &*BranchMI;
1688 }
1689 
selectCompareBranchFedByFCmp(MachineInstr & I,MachineInstr & FCmp,MachineIRBuilder & MIB) const1690 bool AArch64InstructionSelector::selectCompareBranchFedByFCmp(
1691     MachineInstr &I, MachineInstr &FCmp, MachineIRBuilder &MIB) const {
1692   assert(FCmp.getOpcode() == TargetOpcode::G_FCMP);
1693   assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1694   // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
1695   // totally clean.  Some of them require two branches to implement.
1696   auto Pred = (CmpInst::Predicate)FCmp.getOperand(1).getPredicate();
1697   emitFPCompare(FCmp.getOperand(2).getReg(), FCmp.getOperand(3).getReg(), MIB,
1698                 Pred);
1699   AArch64CC::CondCode CC1, CC2;
1700   changeFCMPPredToAArch64CC(static_cast<CmpInst::Predicate>(Pred), CC1, CC2);
1701   MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
1702   MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC1).addMBB(DestMBB);
1703   if (CC2 != AArch64CC::AL)
1704     MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC2).addMBB(DestMBB);
1705   I.eraseFromParent();
1706   return true;
1707 }
1708 
tryOptCompareBranchFedByICmp(MachineInstr & I,MachineInstr & ICmp,MachineIRBuilder & MIB) const1709 bool AArch64InstructionSelector::tryOptCompareBranchFedByICmp(
1710     MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const {
1711   assert(ICmp.getOpcode() == TargetOpcode::G_ICMP);
1712   assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1713   // Attempt to optimize the G_BRCOND + G_ICMP into a TB(N)Z/CB(N)Z.
1714   //
1715   // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z
1716   // instructions will not be produced, as they are conditional branch
1717   // instructions that do not set flags.
1718   if (!ProduceNonFlagSettingCondBr)
1719     return false;
1720 
1721   MachineRegisterInfo &MRI = *MIB.getMRI();
1722   MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
1723   auto Pred =
1724       static_cast<CmpInst::Predicate>(ICmp.getOperand(1).getPredicate());
1725   Register LHS = ICmp.getOperand(2).getReg();
1726   Register RHS = ICmp.getOperand(3).getReg();
1727 
1728   // We're allowed to emit a TB(N)Z/CB(N)Z. Try to do that.
1729   auto VRegAndVal = getIConstantVRegValWithLookThrough(RHS, MRI);
1730   MachineInstr *AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI);
1731 
1732   // When we can emit a TB(N)Z, prefer that.
1733   //
1734   // Handle non-commutative condition codes first.
1735   // Note that we don't want to do this when we have a G_AND because it can
1736   // become a tst. The tst will make the test bit in the TB(N)Z redundant.
1737   if (VRegAndVal && !AndInst) {
1738     int64_t C = VRegAndVal->Value.getSExtValue();
1739 
1740     // When we have a greater-than comparison, we can just test if the msb is
1741     // zero.
1742     if (C == -1 && Pred == CmpInst::ICMP_SGT) {
1743       uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1;
1744       emitTestBit(LHS, Bit, /*IsNegative = */ false, DestMBB, MIB);
1745       I.eraseFromParent();
1746       return true;
1747     }
1748 
1749     // When we have a less than comparison, we can just test if the msb is not
1750     // zero.
1751     if (C == 0 && Pred == CmpInst::ICMP_SLT) {
1752       uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1;
1753       emitTestBit(LHS, Bit, /*IsNegative = */ true, DestMBB, MIB);
1754       I.eraseFromParent();
1755       return true;
1756     }
1757 
1758     // Inversely, if we have a signed greater-than-or-equal comparison to zero,
1759     // we can test if the msb is zero.
1760     if (C == 0 && Pred == CmpInst::ICMP_SGE) {
1761       uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1;
1762       emitTestBit(LHS, Bit, /*IsNegative = */ false, DestMBB, MIB);
1763       I.eraseFromParent();
1764       return true;
1765     }
1766   }
1767 
1768   // Attempt to handle commutative condition codes. Right now, that's only
1769   // eq/ne.
1770   if (ICmpInst::isEquality(Pred)) {
1771     if (!VRegAndVal) {
1772       std::swap(RHS, LHS);
1773       VRegAndVal = getIConstantVRegValWithLookThrough(RHS, MRI);
1774       AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI);
1775     }
1776 
1777     if (VRegAndVal && VRegAndVal->Value == 0) {
1778       // If there's a G_AND feeding into this branch, try to fold it away by
1779       // emitting a TB(N)Z instead.
1780       //
1781       // Note: If we have LT, then it *is* possible to fold, but it wouldn't be
1782       // beneficial. When we have an AND and LT, we need a TST/ANDS, so folding
1783       // would be redundant.
1784       if (AndInst &&
1785           tryOptAndIntoCompareBranch(
1786               *AndInst, /*Invert = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB)) {
1787         I.eraseFromParent();
1788         return true;
1789       }
1790 
1791       // Otherwise, try to emit a CB(N)Z instead.
1792       auto LHSTy = MRI.getType(LHS);
1793       if (!LHSTy.isVector() && LHSTy.getSizeInBits() <= 64) {
1794         emitCBZ(LHS, /*IsNegative = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB);
1795         I.eraseFromParent();
1796         return true;
1797       }
1798     }
1799   }
1800 
1801   return false;
1802 }
1803 
selectCompareBranchFedByICmp(MachineInstr & I,MachineInstr & ICmp,MachineIRBuilder & MIB) const1804 bool AArch64InstructionSelector::selectCompareBranchFedByICmp(
1805     MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const {
1806   assert(ICmp.getOpcode() == TargetOpcode::G_ICMP);
1807   assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1808   if (tryOptCompareBranchFedByICmp(I, ICmp, MIB))
1809     return true;
1810 
1811   // Couldn't optimize. Emit a compare + a Bcc.
1812   MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
1813   auto PredOp = ICmp.getOperand(1);
1814   emitIntegerCompare(ICmp.getOperand(2), ICmp.getOperand(3), PredOp, MIB);
1815   const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(
1816       static_cast<CmpInst::Predicate>(PredOp.getPredicate()));
1817   MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC).addMBB(DestMBB);
1818   I.eraseFromParent();
1819   return true;
1820 }
1821 
selectCompareBranch(MachineInstr & I,MachineFunction & MF,MachineRegisterInfo & MRI)1822 bool AArch64InstructionSelector::selectCompareBranch(
1823     MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) {
1824   Register CondReg = I.getOperand(0).getReg();
1825   MachineInstr *CCMI = MRI.getVRegDef(CondReg);
1826   // Try to select the G_BRCOND using whatever is feeding the condition if
1827   // possible.
1828   unsigned CCMIOpc = CCMI->getOpcode();
1829   if (CCMIOpc == TargetOpcode::G_FCMP)
1830     return selectCompareBranchFedByFCmp(I, *CCMI, MIB);
1831   if (CCMIOpc == TargetOpcode::G_ICMP)
1832     return selectCompareBranchFedByICmp(I, *CCMI, MIB);
1833 
1834   // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z
1835   // instructions will not be produced, as they are conditional branch
1836   // instructions that do not set flags.
1837   if (ProduceNonFlagSettingCondBr) {
1838     emitTestBit(CondReg, /*Bit = */ 0, /*IsNegative = */ true,
1839                 I.getOperand(1).getMBB(), MIB);
1840     I.eraseFromParent();
1841     return true;
1842   }
1843 
1844   // Can't emit TB(N)Z/CB(N)Z. Emit a tst + bcc instead.
1845   auto TstMI =
1846       MIB.buildInstr(AArch64::ANDSWri, {LLT::scalar(32)}, {CondReg}).addImm(1);
1847   constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
1848   auto Bcc = MIB.buildInstr(AArch64::Bcc)
1849                  .addImm(AArch64CC::NE)
1850                  .addMBB(I.getOperand(1).getMBB());
1851   I.eraseFromParent();
1852   return constrainSelectedInstRegOperands(*Bcc, TII, TRI, RBI);
1853 }
1854 
1855 /// Returns the element immediate value of a vector shift operand if found.
1856 /// This needs to detect a splat-like operation, e.g. a G_BUILD_VECTOR.
getVectorShiftImm(Register Reg,MachineRegisterInfo & MRI)1857 static std::optional<int64_t> getVectorShiftImm(Register Reg,
1858                                                 MachineRegisterInfo &MRI) {
1859   assert(MRI.getType(Reg).isVector() && "Expected a *vector* shift operand");
1860   MachineInstr *OpMI = MRI.getVRegDef(Reg);
1861   return getAArch64VectorSplatScalar(*OpMI, MRI);
1862 }
1863 
1864 /// Matches and returns the shift immediate value for a SHL instruction given
1865 /// a shift operand.
getVectorSHLImm(LLT SrcTy,Register Reg,MachineRegisterInfo & MRI)1866 static std::optional<int64_t> getVectorSHLImm(LLT SrcTy, Register Reg,
1867                                               MachineRegisterInfo &MRI) {
1868   std::optional<int64_t> ShiftImm = getVectorShiftImm(Reg, MRI);
1869   if (!ShiftImm)
1870     return std::nullopt;
1871   // Check the immediate is in range for a SHL.
1872   int64_t Imm = *ShiftImm;
1873   if (Imm < 0)
1874     return std::nullopt;
1875   switch (SrcTy.getElementType().getSizeInBits()) {
1876   default:
1877     LLVM_DEBUG(dbgs() << "Unhandled element type for vector shift");
1878     return std::nullopt;
1879   case 8:
1880     if (Imm > 7)
1881       return std::nullopt;
1882     break;
1883   case 16:
1884     if (Imm > 15)
1885       return std::nullopt;
1886     break;
1887   case 32:
1888     if (Imm > 31)
1889       return std::nullopt;
1890     break;
1891   case 64:
1892     if (Imm > 63)
1893       return std::nullopt;
1894     break;
1895   }
1896   return Imm;
1897 }
1898 
selectVectorSHL(MachineInstr & I,MachineRegisterInfo & MRI)1899 bool AArch64InstructionSelector::selectVectorSHL(MachineInstr &I,
1900                                                  MachineRegisterInfo &MRI) {
1901   assert(I.getOpcode() == TargetOpcode::G_SHL);
1902   Register DstReg = I.getOperand(0).getReg();
1903   const LLT Ty = MRI.getType(DstReg);
1904   Register Src1Reg = I.getOperand(1).getReg();
1905   Register Src2Reg = I.getOperand(2).getReg();
1906 
1907   if (!Ty.isVector())
1908     return false;
1909 
1910   // Check if we have a vector of constants on RHS that we can select as the
1911   // immediate form.
1912   std::optional<int64_t> ImmVal = getVectorSHLImm(Ty, Src2Reg, MRI);
1913 
1914   unsigned Opc = 0;
1915   if (Ty == LLT::fixed_vector(2, 64)) {
1916     Opc = ImmVal ? AArch64::SHLv2i64_shift : AArch64::USHLv2i64;
1917   } else if (Ty == LLT::fixed_vector(4, 32)) {
1918     Opc = ImmVal ? AArch64::SHLv4i32_shift : AArch64::USHLv4i32;
1919   } else if (Ty == LLT::fixed_vector(2, 32)) {
1920     Opc = ImmVal ? AArch64::SHLv2i32_shift : AArch64::USHLv2i32;
1921   } else if (Ty == LLT::fixed_vector(4, 16)) {
1922     Opc = ImmVal ? AArch64::SHLv4i16_shift : AArch64::USHLv4i16;
1923   } else if (Ty == LLT::fixed_vector(8, 16)) {
1924     Opc = ImmVal ? AArch64::SHLv8i16_shift : AArch64::USHLv8i16;
1925   } else if (Ty == LLT::fixed_vector(16, 8)) {
1926     Opc = ImmVal ? AArch64::SHLv16i8_shift : AArch64::USHLv16i8;
1927   } else if (Ty == LLT::fixed_vector(8, 8)) {
1928     Opc = ImmVal ? AArch64::SHLv8i8_shift : AArch64::USHLv8i8;
1929   } else {
1930     LLVM_DEBUG(dbgs() << "Unhandled G_SHL type");
1931     return false;
1932   }
1933 
1934   auto Shl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg});
1935   if (ImmVal)
1936     Shl.addImm(*ImmVal);
1937   else
1938     Shl.addUse(Src2Reg);
1939   constrainSelectedInstRegOperands(*Shl, TII, TRI, RBI);
1940   I.eraseFromParent();
1941   return true;
1942 }
1943 
selectVectorAshrLshr(MachineInstr & I,MachineRegisterInfo & MRI)1944 bool AArch64InstructionSelector::selectVectorAshrLshr(
1945     MachineInstr &I, MachineRegisterInfo &MRI) {
1946   assert(I.getOpcode() == TargetOpcode::G_ASHR ||
1947          I.getOpcode() == TargetOpcode::G_LSHR);
1948   Register DstReg = I.getOperand(0).getReg();
1949   const LLT Ty = MRI.getType(DstReg);
1950   Register Src1Reg = I.getOperand(1).getReg();
1951   Register Src2Reg = I.getOperand(2).getReg();
1952 
1953   if (!Ty.isVector())
1954     return false;
1955 
1956   bool IsASHR = I.getOpcode() == TargetOpcode::G_ASHR;
1957 
1958   // We expect the immediate case to be lowered in the PostLegalCombiner to
1959   // AArch64ISD::VASHR or AArch64ISD::VLSHR equivalents.
1960 
1961   // There is not a shift right register instruction, but the shift left
1962   // register instruction takes a signed value, where negative numbers specify a
1963   // right shift.
1964 
1965   unsigned Opc = 0;
1966   unsigned NegOpc = 0;
1967   const TargetRegisterClass *RC =
1968       getRegClassForTypeOnBank(Ty, RBI.getRegBank(AArch64::FPRRegBankID));
1969   if (Ty == LLT::fixed_vector(2, 64)) {
1970     Opc = IsASHR ? AArch64::SSHLv2i64 : AArch64::USHLv2i64;
1971     NegOpc = AArch64::NEGv2i64;
1972   } else if (Ty == LLT::fixed_vector(4, 32)) {
1973     Opc = IsASHR ? AArch64::SSHLv4i32 : AArch64::USHLv4i32;
1974     NegOpc = AArch64::NEGv4i32;
1975   } else if (Ty == LLT::fixed_vector(2, 32)) {
1976     Opc = IsASHR ? AArch64::SSHLv2i32 : AArch64::USHLv2i32;
1977     NegOpc = AArch64::NEGv2i32;
1978   } else if (Ty == LLT::fixed_vector(4, 16)) {
1979     Opc = IsASHR ? AArch64::SSHLv4i16 : AArch64::USHLv4i16;
1980     NegOpc = AArch64::NEGv4i16;
1981   } else if (Ty == LLT::fixed_vector(8, 16)) {
1982     Opc = IsASHR ? AArch64::SSHLv8i16 : AArch64::USHLv8i16;
1983     NegOpc = AArch64::NEGv8i16;
1984   } else if (Ty == LLT::fixed_vector(16, 8)) {
1985     Opc = IsASHR ? AArch64::SSHLv16i8 : AArch64::USHLv16i8;
1986     NegOpc = AArch64::NEGv16i8;
1987   } else if (Ty == LLT::fixed_vector(8, 8)) {
1988     Opc = IsASHR ? AArch64::SSHLv8i8 : AArch64::USHLv8i8;
1989     NegOpc = AArch64::NEGv8i8;
1990   } else {
1991     LLVM_DEBUG(dbgs() << "Unhandled G_ASHR type");
1992     return false;
1993   }
1994 
1995   auto Neg = MIB.buildInstr(NegOpc, {RC}, {Src2Reg});
1996   constrainSelectedInstRegOperands(*Neg, TII, TRI, RBI);
1997   auto SShl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg, Neg});
1998   constrainSelectedInstRegOperands(*SShl, TII, TRI, RBI);
1999   I.eraseFromParent();
2000   return true;
2001 }
2002 
selectVaStartAAPCS(MachineInstr & I,MachineFunction & MF,MachineRegisterInfo & MRI) const2003 bool AArch64InstructionSelector::selectVaStartAAPCS(
2004     MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
2005 
2006   if (STI.isCallingConvWin64(MF.getFunction().getCallingConv(),
2007                              MF.getFunction().isVarArg()))
2008     return false;
2009 
2010   // The layout of the va_list struct is specified in the AArch64 Procedure Call
2011   // Standard, section 10.1.5.
2012 
2013   const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
2014   const unsigned PtrSize = STI.isTargetILP32() ? 4 : 8;
2015   const auto *PtrRegClass =
2016       STI.isTargetILP32() ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass;
2017 
2018   const MCInstrDesc &MCIDAddAddr =
2019       TII.get(STI.isTargetILP32() ? AArch64::ADDWri : AArch64::ADDXri);
2020   const MCInstrDesc &MCIDStoreAddr =
2021       TII.get(STI.isTargetILP32() ? AArch64::STRWui : AArch64::STRXui);
2022 
2023   /*
2024    * typedef struct va_list {
2025    *  void * stack; // next stack param
2026    *  void * gr_top; // end of GP arg reg save area
2027    *  void * vr_top; // end of FP/SIMD arg reg save area
2028    *  int gr_offs; // offset from gr_top to next GP register arg
2029    *  int vr_offs; // offset from vr_top to next FP/SIMD register arg
2030    * } va_list;
2031    */
2032   const auto VAList = I.getOperand(0).getReg();
2033 
2034   // Our current offset in bytes from the va_list struct (VAList).
2035   unsigned OffsetBytes = 0;
2036 
2037   // Helper function to store (FrameIndex + Imm) to VAList at offset OffsetBytes
2038   // and increment OffsetBytes by PtrSize.
2039   const auto PushAddress = [&](const int FrameIndex, const int64_t Imm) {
2040     const Register Top = MRI.createVirtualRegister(PtrRegClass);
2041     auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), MCIDAddAddr)
2042                    .addDef(Top)
2043                    .addFrameIndex(FrameIndex)
2044                    .addImm(Imm)
2045                    .addImm(0);
2046     constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2047 
2048     const auto *MMO = *I.memoperands_begin();
2049     MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), MCIDStoreAddr)
2050               .addUse(Top)
2051               .addUse(VAList)
2052               .addImm(OffsetBytes / PtrSize)
2053               .addMemOperand(MF.getMachineMemOperand(
2054                   MMO->getPointerInfo().getWithOffset(OffsetBytes),
2055                   MachineMemOperand::MOStore, PtrSize, MMO->getBaseAlign()));
2056     constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2057 
2058     OffsetBytes += PtrSize;
2059   };
2060 
2061   // void* stack at offset 0
2062   PushAddress(FuncInfo->getVarArgsStackIndex(), 0);
2063 
2064   // void* gr_top at offset 8 (4 on ILP32)
2065   const unsigned GPRSize = FuncInfo->getVarArgsGPRSize();
2066   PushAddress(FuncInfo->getVarArgsGPRIndex(), GPRSize);
2067 
2068   // void* vr_top at offset 16 (8 on ILP32)
2069   const unsigned FPRSize = FuncInfo->getVarArgsFPRSize();
2070   PushAddress(FuncInfo->getVarArgsFPRIndex(), FPRSize);
2071 
2072   // Helper function to store a 4-byte integer constant to VAList at offset
2073   // OffsetBytes, and increment OffsetBytes by 4.
2074   const auto PushIntConstant = [&](const int32_t Value) {
2075     constexpr int IntSize = 4;
2076     const Register Temp = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
2077     auto MIB =
2078         BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::MOVi32imm))
2079             .addDef(Temp)
2080             .addImm(Value);
2081     constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2082 
2083     const auto *MMO = *I.memoperands_begin();
2084     MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::STRWui))
2085               .addUse(Temp)
2086               .addUse(VAList)
2087               .addImm(OffsetBytes / IntSize)
2088               .addMemOperand(MF.getMachineMemOperand(
2089                   MMO->getPointerInfo().getWithOffset(OffsetBytes),
2090                   MachineMemOperand::MOStore, IntSize, MMO->getBaseAlign()));
2091     constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2092     OffsetBytes += IntSize;
2093   };
2094 
2095   // int gr_offs at offset 24 (12 on ILP32)
2096   PushIntConstant(-static_cast<int32_t>(GPRSize));
2097 
2098   // int vr_offs at offset 28 (16 on ILP32)
2099   PushIntConstant(-static_cast<int32_t>(FPRSize));
2100 
2101   assert(OffsetBytes == (STI.isTargetILP32() ? 20 : 32) && "Unexpected offset");
2102 
2103   I.eraseFromParent();
2104   return true;
2105 }
2106 
selectVaStartDarwin(MachineInstr & I,MachineFunction & MF,MachineRegisterInfo & MRI) const2107 bool AArch64InstructionSelector::selectVaStartDarwin(
2108     MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
2109   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
2110   Register ListReg = I.getOperand(0).getReg();
2111 
2112   Register ArgsAddrReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
2113 
2114   int FrameIdx = FuncInfo->getVarArgsStackIndex();
2115   if (MF.getSubtarget<AArch64Subtarget>().isCallingConvWin64(
2116           MF.getFunction().getCallingConv(), MF.getFunction().isVarArg())) {
2117     FrameIdx = FuncInfo->getVarArgsGPRSize() > 0
2118                    ? FuncInfo->getVarArgsGPRIndex()
2119                    : FuncInfo->getVarArgsStackIndex();
2120   }
2121 
2122   auto MIB =
2123       BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::ADDXri))
2124           .addDef(ArgsAddrReg)
2125           .addFrameIndex(FrameIdx)
2126           .addImm(0)
2127           .addImm(0);
2128 
2129   constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2130 
2131   MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::STRXui))
2132             .addUse(ArgsAddrReg)
2133             .addUse(ListReg)
2134             .addImm(0)
2135             .addMemOperand(*I.memoperands_begin());
2136 
2137   constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2138   I.eraseFromParent();
2139   return true;
2140 }
2141 
materializeLargeCMVal(MachineInstr & I,const Value * V,unsigned OpFlags)2142 void AArch64InstructionSelector::materializeLargeCMVal(
2143     MachineInstr &I, const Value *V, unsigned OpFlags) {
2144   MachineBasicBlock &MBB = *I.getParent();
2145   MachineFunction &MF = *MBB.getParent();
2146   MachineRegisterInfo &MRI = MF.getRegInfo();
2147 
2148   auto MovZ = MIB.buildInstr(AArch64::MOVZXi, {&AArch64::GPR64RegClass}, {});
2149   MovZ->addOperand(MF, I.getOperand(1));
2150   MovZ->getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_G0 |
2151                                      AArch64II::MO_NC);
2152   MovZ->addOperand(MF, MachineOperand::CreateImm(0));
2153   constrainSelectedInstRegOperands(*MovZ, TII, TRI, RBI);
2154 
2155   auto BuildMovK = [&](Register SrcReg, unsigned char Flags, unsigned Offset,
2156                        Register ForceDstReg) {
2157     Register DstReg = ForceDstReg
2158                           ? ForceDstReg
2159                           : MRI.createVirtualRegister(&AArch64::GPR64RegClass);
2160     auto MovI = MIB.buildInstr(AArch64::MOVKXi).addDef(DstReg).addUse(SrcReg);
2161     if (auto *GV = dyn_cast<GlobalValue>(V)) {
2162       MovI->addOperand(MF, MachineOperand::CreateGA(
2163                                GV, MovZ->getOperand(1).getOffset(), Flags));
2164     } else {
2165       MovI->addOperand(
2166           MF, MachineOperand::CreateBA(cast<BlockAddress>(V),
2167                                        MovZ->getOperand(1).getOffset(), Flags));
2168     }
2169     MovI->addOperand(MF, MachineOperand::CreateImm(Offset));
2170     constrainSelectedInstRegOperands(*MovI, TII, TRI, RBI);
2171     return DstReg;
2172   };
2173   Register DstReg = BuildMovK(MovZ.getReg(0),
2174                               AArch64II::MO_G1 | AArch64II::MO_NC, 16, 0);
2175   DstReg = BuildMovK(DstReg, AArch64II::MO_G2 | AArch64II::MO_NC, 32, 0);
2176   BuildMovK(DstReg, AArch64II::MO_G3, 48, I.getOperand(0).getReg());
2177 }
2178 
preISelLower(MachineInstr & I)2179 bool AArch64InstructionSelector::preISelLower(MachineInstr &I) {
2180   MachineBasicBlock &MBB = *I.getParent();
2181   MachineFunction &MF = *MBB.getParent();
2182   MachineRegisterInfo &MRI = MF.getRegInfo();
2183 
2184   switch (I.getOpcode()) {
2185   case TargetOpcode::G_STORE: {
2186     bool Changed = contractCrossBankCopyIntoStore(I, MRI);
2187     MachineOperand &SrcOp = I.getOperand(0);
2188     if (MRI.getType(SrcOp.getReg()).isPointer()) {
2189       // Allow matching with imported patterns for stores of pointers. Unlike
2190       // G_LOAD/G_PTR_ADD, we may not have selected all users. So, emit a copy
2191       // and constrain.
2192       auto Copy = MIB.buildCopy(LLT::scalar(64), SrcOp);
2193       Register NewSrc = Copy.getReg(0);
2194       SrcOp.setReg(NewSrc);
2195       RBI.constrainGenericRegister(NewSrc, AArch64::GPR64RegClass, MRI);
2196       Changed = true;
2197     }
2198     return Changed;
2199   }
2200   case TargetOpcode::G_PTR_ADD: {
2201     // If Checked Pointer Arithmetic (FEAT_CPA) is present, preserve the pointer
2202     // arithmetic semantics instead of falling back to regular arithmetic.
2203     const auto &TL = STI.getTargetLowering();
2204     if (TL->shouldPreservePtrArith(MF.getFunction(), EVT()))
2205       return false;
2206     return convertPtrAddToAdd(I, MRI);
2207   }
2208   case TargetOpcode::G_LOAD: {
2209     // For scalar loads of pointers, we try to convert the dest type from p0
2210     // to s64 so that our imported patterns can match. Like with the G_PTR_ADD
2211     // conversion, this should be ok because all users should have been
2212     // selected already, so the type doesn't matter for them.
2213     Register DstReg = I.getOperand(0).getReg();
2214     const LLT DstTy = MRI.getType(DstReg);
2215     if (!DstTy.isPointer())
2216       return false;
2217     MRI.setType(DstReg, LLT::scalar(64));
2218     return true;
2219   }
2220   case AArch64::G_DUP: {
2221     // Convert the type from p0 to s64 to help selection.
2222     LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2223     if (!DstTy.isPointerVector())
2224       return false;
2225     auto NewSrc = MIB.buildCopy(LLT::scalar(64), I.getOperand(1).getReg());
2226     MRI.setType(I.getOperand(0).getReg(),
2227                 DstTy.changeElementType(LLT::scalar(64)));
2228     MRI.setRegClass(NewSrc.getReg(0), &AArch64::GPR64RegClass);
2229     I.getOperand(1).setReg(NewSrc.getReg(0));
2230     return true;
2231   }
2232   case AArch64::G_INSERT_VECTOR_ELT: {
2233     // Convert the type from p0 to s64 to help selection.
2234     LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2235     LLT SrcVecTy = MRI.getType(I.getOperand(1).getReg());
2236     if (!SrcVecTy.isPointerVector())
2237       return false;
2238     auto NewSrc = MIB.buildCopy(LLT::scalar(64), I.getOperand(2).getReg());
2239     MRI.setType(I.getOperand(1).getReg(),
2240                 DstTy.changeElementType(LLT::scalar(64)));
2241     MRI.setType(I.getOperand(0).getReg(),
2242                 DstTy.changeElementType(LLT::scalar(64)));
2243     MRI.setRegClass(NewSrc.getReg(0), &AArch64::GPR64RegClass);
2244     I.getOperand(2).setReg(NewSrc.getReg(0));
2245     return true;
2246   }
2247   case TargetOpcode::G_UITOFP:
2248   case TargetOpcode::G_SITOFP: {
2249     // If both source and destination regbanks are FPR, then convert the opcode
2250     // to G_SITOF so that the importer can select it to an fpr variant.
2251     // Otherwise, it ends up matching an fpr/gpr variant and adding a cross-bank
2252     // copy.
2253     Register SrcReg = I.getOperand(1).getReg();
2254     LLT SrcTy = MRI.getType(SrcReg);
2255     LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2256     if (SrcTy.isVector() || SrcTy.getSizeInBits() != DstTy.getSizeInBits())
2257       return false;
2258 
2259     if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::FPRRegBankID) {
2260       if (I.getOpcode() == TargetOpcode::G_SITOFP)
2261         I.setDesc(TII.get(AArch64::G_SITOF));
2262       else
2263         I.setDesc(TII.get(AArch64::G_UITOF));
2264       return true;
2265     }
2266     return false;
2267   }
2268   default:
2269     return false;
2270   }
2271 }
2272 
2273 /// This lowering tries to look for G_PTR_ADD instructions and then converts
2274 /// them to a standard G_ADD with a COPY on the source.
2275 ///
2276 /// The motivation behind this is to expose the add semantics to the imported
2277 /// tablegen patterns. We shouldn't need to check for uses being loads/stores,
2278 /// because the selector works bottom up, uses before defs. By the time we
2279 /// end up trying to select a G_PTR_ADD, we should have already attempted to
2280 /// fold this into addressing modes and were therefore unsuccessful.
convertPtrAddToAdd(MachineInstr & I,MachineRegisterInfo & MRI)2281 bool AArch64InstructionSelector::convertPtrAddToAdd(
2282     MachineInstr &I, MachineRegisterInfo &MRI) {
2283   assert(I.getOpcode() == TargetOpcode::G_PTR_ADD && "Expected G_PTR_ADD");
2284   Register DstReg = I.getOperand(0).getReg();
2285   Register AddOp1Reg = I.getOperand(1).getReg();
2286   const LLT PtrTy = MRI.getType(DstReg);
2287   if (PtrTy.getAddressSpace() != 0)
2288     return false;
2289 
2290   const LLT CastPtrTy =
2291       PtrTy.isVector() ? LLT::fixed_vector(2, 64) : LLT::scalar(64);
2292   auto PtrToInt = MIB.buildPtrToInt(CastPtrTy, AddOp1Reg);
2293   // Set regbanks on the registers.
2294   if (PtrTy.isVector())
2295     MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::FPRRegBankID));
2296   else
2297     MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID));
2298 
2299   // Now turn the %dst(p0) = G_PTR_ADD %base, off into:
2300   // %dst(intty) = G_ADD %intbase, off
2301   I.setDesc(TII.get(TargetOpcode::G_ADD));
2302   MRI.setType(DstReg, CastPtrTy);
2303   I.getOperand(1).setReg(PtrToInt.getReg(0));
2304   if (!select(*PtrToInt)) {
2305     LLVM_DEBUG(dbgs() << "Failed to select G_PTRTOINT in convertPtrAddToAdd");
2306     return false;
2307   }
2308 
2309   // Also take the opportunity here to try to do some optimization.
2310   // Try to convert this into a G_SUB if the offset is a 0-x negate idiom.
2311   Register NegatedReg;
2312   if (!mi_match(I.getOperand(2).getReg(), MRI, m_Neg(m_Reg(NegatedReg))))
2313     return true;
2314   I.getOperand(2).setReg(NegatedReg);
2315   I.setDesc(TII.get(TargetOpcode::G_SUB));
2316   return true;
2317 }
2318 
earlySelectSHL(MachineInstr & I,MachineRegisterInfo & MRI)2319 bool AArch64InstructionSelector::earlySelectSHL(MachineInstr &I,
2320                                                 MachineRegisterInfo &MRI) {
2321   // We try to match the immediate variant of LSL, which is actually an alias
2322   // for a special case of UBFM. Otherwise, we fall back to the imported
2323   // selector which will match the register variant.
2324   assert(I.getOpcode() == TargetOpcode::G_SHL && "unexpected op");
2325   const auto &MO = I.getOperand(2);
2326   auto VRegAndVal = getIConstantVRegVal(MO.getReg(), MRI);
2327   if (!VRegAndVal)
2328     return false;
2329 
2330   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2331   if (DstTy.isVector())
2332     return false;
2333   bool Is64Bit = DstTy.getSizeInBits() == 64;
2334   auto Imm1Fn = Is64Bit ? selectShiftA_64(MO) : selectShiftA_32(MO);
2335   auto Imm2Fn = Is64Bit ? selectShiftB_64(MO) : selectShiftB_32(MO);
2336 
2337   if (!Imm1Fn || !Imm2Fn)
2338     return false;
2339 
2340   auto NewI =
2341       MIB.buildInstr(Is64Bit ? AArch64::UBFMXri : AArch64::UBFMWri,
2342                      {I.getOperand(0).getReg()}, {I.getOperand(1).getReg()});
2343 
2344   for (auto &RenderFn : *Imm1Fn)
2345     RenderFn(NewI);
2346   for (auto &RenderFn : *Imm2Fn)
2347     RenderFn(NewI);
2348 
2349   I.eraseFromParent();
2350   return constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI);
2351 }
2352 
contractCrossBankCopyIntoStore(MachineInstr & I,MachineRegisterInfo & MRI)2353 bool AArch64InstructionSelector::contractCrossBankCopyIntoStore(
2354     MachineInstr &I, MachineRegisterInfo &MRI) {
2355   assert(I.getOpcode() == TargetOpcode::G_STORE && "Expected G_STORE");
2356   // If we're storing a scalar, it doesn't matter what register bank that
2357   // scalar is on. All that matters is the size.
2358   //
2359   // So, if we see something like this (with a 32-bit scalar as an example):
2360   //
2361   // %x:gpr(s32) = ... something ...
2362   // %y:fpr(s32) = COPY %x:gpr(s32)
2363   // G_STORE %y:fpr(s32)
2364   //
2365   // We can fix this up into something like this:
2366   //
2367   // G_STORE %x:gpr(s32)
2368   //
2369   // And then continue the selection process normally.
2370   Register DefDstReg = getSrcRegIgnoringCopies(I.getOperand(0).getReg(), MRI);
2371   if (!DefDstReg.isValid())
2372     return false;
2373   LLT DefDstTy = MRI.getType(DefDstReg);
2374   Register StoreSrcReg = I.getOperand(0).getReg();
2375   LLT StoreSrcTy = MRI.getType(StoreSrcReg);
2376 
2377   // If we get something strange like a physical register, then we shouldn't
2378   // go any further.
2379   if (!DefDstTy.isValid())
2380     return false;
2381 
2382   // Are the source and dst types the same size?
2383   if (DefDstTy.getSizeInBits() != StoreSrcTy.getSizeInBits())
2384     return false;
2385 
2386   if (RBI.getRegBank(StoreSrcReg, MRI, TRI) ==
2387       RBI.getRegBank(DefDstReg, MRI, TRI))
2388     return false;
2389 
2390   // We have a cross-bank copy, which is entering a store. Let's fold it.
2391   I.getOperand(0).setReg(DefDstReg);
2392   return true;
2393 }
2394 
earlySelect(MachineInstr & I)2395 bool AArch64InstructionSelector::earlySelect(MachineInstr &I) {
2396   assert(I.getParent() && "Instruction should be in a basic block!");
2397   assert(I.getParent()->getParent() && "Instruction should be in a function!");
2398 
2399   MachineBasicBlock &MBB = *I.getParent();
2400   MachineFunction &MF = *MBB.getParent();
2401   MachineRegisterInfo &MRI = MF.getRegInfo();
2402 
2403   switch (I.getOpcode()) {
2404   case AArch64::G_DUP: {
2405     // Before selecting a DUP instruction, check if it is better selected as a
2406     // MOV or load from a constant pool.
2407     Register Src = I.getOperand(1).getReg();
2408     auto ValAndVReg = getAnyConstantVRegValWithLookThrough(Src, MRI);
2409     if (!ValAndVReg)
2410       return false;
2411     LLVMContext &Ctx = MF.getFunction().getContext();
2412     Register Dst = I.getOperand(0).getReg();
2413     auto *CV = ConstantDataVector::getSplat(
2414         MRI.getType(Dst).getNumElements(),
2415         ConstantInt::get(
2416             Type::getIntNTy(Ctx, MRI.getType(Dst).getScalarSizeInBits()),
2417             ValAndVReg->Value.trunc(MRI.getType(Dst).getScalarSizeInBits())));
2418     if (!emitConstantVector(Dst, CV, MIB, MRI))
2419       return false;
2420     I.eraseFromParent();
2421     return true;
2422   }
2423   case TargetOpcode::G_SEXT:
2424     // Check for i64 sext(i32 vector_extract) prior to tablegen to select SMOV
2425     // over a normal extend.
2426     if (selectUSMovFromExtend(I, MRI))
2427       return true;
2428     return false;
2429   case TargetOpcode::G_BR:
2430     return false;
2431   case TargetOpcode::G_SHL:
2432     return earlySelectSHL(I, MRI);
2433   case TargetOpcode::G_CONSTANT: {
2434     bool IsZero = false;
2435     if (I.getOperand(1).isCImm())
2436       IsZero = I.getOperand(1).getCImm()->isZero();
2437     else if (I.getOperand(1).isImm())
2438       IsZero = I.getOperand(1).getImm() == 0;
2439 
2440     if (!IsZero)
2441       return false;
2442 
2443     Register DefReg = I.getOperand(0).getReg();
2444     LLT Ty = MRI.getType(DefReg);
2445     if (Ty.getSizeInBits() == 64) {
2446       I.getOperand(1).ChangeToRegister(AArch64::XZR, false);
2447       RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI);
2448     } else if (Ty.getSizeInBits() == 32) {
2449       I.getOperand(1).ChangeToRegister(AArch64::WZR, false);
2450       RBI.constrainGenericRegister(DefReg, AArch64::GPR32RegClass, MRI);
2451     } else
2452       return false;
2453 
2454     I.setDesc(TII.get(TargetOpcode::COPY));
2455     return true;
2456   }
2457 
2458   case TargetOpcode::G_ADD: {
2459     // Check if this is being fed by a G_ICMP on either side.
2460     //
2461     // (cmp pred, x, y) + z
2462     //
2463     // In the above case, when the cmp is true, we increment z by 1. So, we can
2464     // fold the add into the cset for the cmp by using cinc.
2465     //
2466     // FIXME: This would probably be a lot nicer in PostLegalizerLowering.
2467     Register AddDst = I.getOperand(0).getReg();
2468     Register AddLHS = I.getOperand(1).getReg();
2469     Register AddRHS = I.getOperand(2).getReg();
2470     // Only handle scalars.
2471     LLT Ty = MRI.getType(AddLHS);
2472     if (Ty.isVector())
2473       return false;
2474     // Since G_ICMP is modeled as ADDS/SUBS/ANDS, we can handle 32 bits or 64
2475     // bits.
2476     unsigned Size = Ty.getSizeInBits();
2477     if (Size != 32 && Size != 64)
2478       return false;
2479     auto MatchCmp = [&](Register Reg) -> MachineInstr * {
2480       if (!MRI.hasOneNonDBGUse(Reg))
2481         return nullptr;
2482       // If the LHS of the add is 32 bits, then we want to fold a 32-bit
2483       // compare.
2484       if (Size == 32)
2485         return getOpcodeDef(TargetOpcode::G_ICMP, Reg, MRI);
2486       // We model scalar compares using 32-bit destinations right now.
2487       // If it's a 64-bit compare, it'll have 64-bit sources.
2488       Register ZExt;
2489       if (!mi_match(Reg, MRI,
2490                     m_OneNonDBGUse(m_GZExt(m_OneNonDBGUse(m_Reg(ZExt))))))
2491         return nullptr;
2492       auto *Cmp = getOpcodeDef(TargetOpcode::G_ICMP, ZExt, MRI);
2493       if (!Cmp ||
2494           MRI.getType(Cmp->getOperand(2).getReg()).getSizeInBits() != 64)
2495         return nullptr;
2496       return Cmp;
2497     };
2498     // Try to match
2499     // z + (cmp pred, x, y)
2500     MachineInstr *Cmp = MatchCmp(AddRHS);
2501     if (!Cmp) {
2502       // (cmp pred, x, y) + z
2503       std::swap(AddLHS, AddRHS);
2504       Cmp = MatchCmp(AddRHS);
2505       if (!Cmp)
2506         return false;
2507     }
2508     auto &PredOp = Cmp->getOperand(1);
2509     auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate());
2510     const AArch64CC::CondCode InvCC =
2511         changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred));
2512     MIB.setInstrAndDebugLoc(I);
2513     emitIntegerCompare(/*LHS=*/Cmp->getOperand(2),
2514                        /*RHS=*/Cmp->getOperand(3), PredOp, MIB);
2515     emitCSINC(/*Dst=*/AddDst, /*Src =*/AddLHS, /*Src2=*/AddLHS, InvCC, MIB);
2516     I.eraseFromParent();
2517     return true;
2518   }
2519   case TargetOpcode::G_OR: {
2520     // Look for operations that take the lower `Width=Size-ShiftImm` bits of
2521     // `ShiftSrc` and insert them into the upper `Width` bits of `MaskSrc` via
2522     // shifting and masking that we can replace with a BFI (encoded as a BFM).
2523     Register Dst = I.getOperand(0).getReg();
2524     LLT Ty = MRI.getType(Dst);
2525 
2526     if (!Ty.isScalar())
2527       return false;
2528 
2529     unsigned Size = Ty.getSizeInBits();
2530     if (Size != 32 && Size != 64)
2531       return false;
2532 
2533     Register ShiftSrc;
2534     int64_t ShiftImm;
2535     Register MaskSrc;
2536     int64_t MaskImm;
2537     if (!mi_match(
2538             Dst, MRI,
2539             m_GOr(m_OneNonDBGUse(m_GShl(m_Reg(ShiftSrc), m_ICst(ShiftImm))),
2540                   m_OneNonDBGUse(m_GAnd(m_Reg(MaskSrc), m_ICst(MaskImm))))))
2541       return false;
2542 
2543     if (ShiftImm > Size || ((1ULL << ShiftImm) - 1ULL) != uint64_t(MaskImm))
2544       return false;
2545 
2546     int64_t Immr = Size - ShiftImm;
2547     int64_t Imms = Size - ShiftImm - 1;
2548     unsigned Opc = Size == 32 ? AArch64::BFMWri : AArch64::BFMXri;
2549     emitInstr(Opc, {Dst}, {MaskSrc, ShiftSrc, Immr, Imms}, MIB);
2550     I.eraseFromParent();
2551     return true;
2552   }
2553   case TargetOpcode::G_FENCE: {
2554     if (I.getOperand(1).getImm() == 0)
2555       BuildMI(MBB, I, MIMetadata(I), TII.get(TargetOpcode::MEMBARRIER));
2556     else
2557       BuildMI(MBB, I, MIMetadata(I), TII.get(AArch64::DMB))
2558           .addImm(I.getOperand(0).getImm() == 4 ? 0x9 : 0xb);
2559     I.eraseFromParent();
2560     return true;
2561   }
2562   default:
2563     return false;
2564   }
2565 }
2566 
select(MachineInstr & I)2567 bool AArch64InstructionSelector::select(MachineInstr &I) {
2568   assert(I.getParent() && "Instruction should be in a basic block!");
2569   assert(I.getParent()->getParent() && "Instruction should be in a function!");
2570 
2571   MachineBasicBlock &MBB = *I.getParent();
2572   MachineFunction &MF = *MBB.getParent();
2573   MachineRegisterInfo &MRI = MF.getRegInfo();
2574 
2575   const AArch64Subtarget *Subtarget = &MF.getSubtarget<AArch64Subtarget>();
2576   if (Subtarget->requiresStrictAlign()) {
2577     // We don't support this feature yet.
2578     LLVM_DEBUG(dbgs() << "AArch64 GISel does not support strict-align yet\n");
2579     return false;
2580   }
2581 
2582   MIB.setInstrAndDebugLoc(I);
2583 
2584   unsigned Opcode = I.getOpcode();
2585   // G_PHI requires same handling as PHI
2586   if (!I.isPreISelOpcode() || Opcode == TargetOpcode::G_PHI) {
2587     // Certain non-generic instructions also need some special handling.
2588 
2589     if (Opcode ==  TargetOpcode::LOAD_STACK_GUARD)
2590       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2591 
2592     if (Opcode == TargetOpcode::PHI || Opcode == TargetOpcode::G_PHI) {
2593       const Register DefReg = I.getOperand(0).getReg();
2594       const LLT DefTy = MRI.getType(DefReg);
2595 
2596       const RegClassOrRegBank &RegClassOrBank =
2597         MRI.getRegClassOrRegBank(DefReg);
2598 
2599       const TargetRegisterClass *DefRC =
2600           dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
2601       if (!DefRC) {
2602         if (!DefTy.isValid()) {
2603           LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
2604           return false;
2605         }
2606         const RegisterBank &RB = *cast<const RegisterBank *>(RegClassOrBank);
2607         DefRC = getRegClassForTypeOnBank(DefTy, RB);
2608         if (!DefRC) {
2609           LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
2610           return false;
2611         }
2612       }
2613 
2614       I.setDesc(TII.get(TargetOpcode::PHI));
2615 
2616       return RBI.constrainGenericRegister(DefReg, *DefRC, MRI);
2617     }
2618 
2619     if (I.isCopy())
2620       return selectCopy(I, TII, MRI, TRI, RBI);
2621 
2622     if (I.isDebugInstr())
2623       return selectDebugInstr(I, MRI, RBI);
2624 
2625     return true;
2626   }
2627 
2628 
2629   if (I.getNumOperands() != I.getNumExplicitOperands()) {
2630     LLVM_DEBUG(
2631         dbgs() << "Generic instruction has unexpected implicit operands\n");
2632     return false;
2633   }
2634 
2635   // Try to do some lowering before we start instruction selecting. These
2636   // lowerings are purely transformations on the input G_MIR and so selection
2637   // must continue after any modification of the instruction.
2638   if (preISelLower(I)) {
2639     Opcode = I.getOpcode(); // The opcode may have been modified, refresh it.
2640   }
2641 
2642   // There may be patterns where the importer can't deal with them optimally,
2643   // but does select it to a suboptimal sequence so our custom C++ selection
2644   // code later never has a chance to work on it. Therefore, we have an early
2645   // selection attempt here to give priority to certain selection routines
2646   // over the imported ones.
2647   if (earlySelect(I))
2648     return true;
2649 
2650   if (selectImpl(I, *CoverageInfo))
2651     return true;
2652 
2653   LLT Ty =
2654       I.getOperand(0).isReg() ? MRI.getType(I.getOperand(0).getReg()) : LLT{};
2655 
2656   switch (Opcode) {
2657   case TargetOpcode::G_SBFX:
2658   case TargetOpcode::G_UBFX: {
2659     static const unsigned OpcTable[2][2] = {
2660         {AArch64::UBFMWri, AArch64::UBFMXri},
2661         {AArch64::SBFMWri, AArch64::SBFMXri}};
2662     bool IsSigned = Opcode == TargetOpcode::G_SBFX;
2663     unsigned Size = Ty.getSizeInBits();
2664     unsigned Opc = OpcTable[IsSigned][Size == 64];
2665     auto Cst1 =
2666         getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), MRI);
2667     assert(Cst1 && "Should have gotten a constant for src 1?");
2668     auto Cst2 =
2669         getIConstantVRegValWithLookThrough(I.getOperand(3).getReg(), MRI);
2670     assert(Cst2 && "Should have gotten a constant for src 2?");
2671     auto LSB = Cst1->Value.getZExtValue();
2672     auto Width = Cst2->Value.getZExtValue();
2673     auto BitfieldInst =
2674         MIB.buildInstr(Opc, {I.getOperand(0)}, {I.getOperand(1)})
2675             .addImm(LSB)
2676             .addImm(LSB + Width - 1);
2677     I.eraseFromParent();
2678     return constrainSelectedInstRegOperands(*BitfieldInst, TII, TRI, RBI);
2679   }
2680   case TargetOpcode::G_BRCOND:
2681     return selectCompareBranch(I, MF, MRI);
2682 
2683   case TargetOpcode::G_BRINDIRECT: {
2684     const Function &Fn = MF.getFunction();
2685     if (std::optional<uint16_t> BADisc =
2686             STI.getPtrAuthBlockAddressDiscriminatorIfEnabled(Fn)) {
2687       auto MI = MIB.buildInstr(AArch64::BRA, {}, {I.getOperand(0).getReg()});
2688       MI.addImm(AArch64PACKey::IA);
2689       MI.addImm(*BADisc);
2690       MI.addReg(/*AddrDisc=*/AArch64::XZR);
2691       I.eraseFromParent();
2692       return constrainSelectedInstRegOperands(*MI, TII, TRI, RBI);
2693     }
2694     I.setDesc(TII.get(AArch64::BR));
2695     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2696   }
2697 
2698   case TargetOpcode::G_BRJT:
2699     return selectBrJT(I, MRI);
2700 
2701   case AArch64::G_ADD_LOW: {
2702     // This op may have been separated from it's ADRP companion by the localizer
2703     // or some other code motion pass. Given that many CPUs will try to
2704     // macro fuse these operations anyway, select this into a MOVaddr pseudo
2705     // which will later be expanded into an ADRP+ADD pair after scheduling.
2706     MachineInstr *BaseMI = MRI.getVRegDef(I.getOperand(1).getReg());
2707     if (BaseMI->getOpcode() != AArch64::ADRP) {
2708       I.setDesc(TII.get(AArch64::ADDXri));
2709       I.addOperand(MachineOperand::CreateImm(0));
2710       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2711     }
2712     assert(TM.getCodeModel() == CodeModel::Small &&
2713            "Expected small code model");
2714     auto Op1 = BaseMI->getOperand(1);
2715     auto Op2 = I.getOperand(2);
2716     auto MovAddr = MIB.buildInstr(AArch64::MOVaddr, {I.getOperand(0)}, {})
2717                        .addGlobalAddress(Op1.getGlobal(), Op1.getOffset(),
2718                                          Op1.getTargetFlags())
2719                        .addGlobalAddress(Op2.getGlobal(), Op2.getOffset(),
2720                                          Op2.getTargetFlags());
2721     I.eraseFromParent();
2722     return constrainSelectedInstRegOperands(*MovAddr, TII, TRI, RBI);
2723   }
2724 
2725   case TargetOpcode::G_FCONSTANT:
2726   case TargetOpcode::G_CONSTANT: {
2727     const bool isFP = Opcode == TargetOpcode::G_FCONSTANT;
2728 
2729     const LLT s8 = LLT::scalar(8);
2730     const LLT s16 = LLT::scalar(16);
2731     const LLT s32 = LLT::scalar(32);
2732     const LLT s64 = LLT::scalar(64);
2733     const LLT s128 = LLT::scalar(128);
2734     const LLT p0 = LLT::pointer(0, 64);
2735 
2736     const Register DefReg = I.getOperand(0).getReg();
2737     const LLT DefTy = MRI.getType(DefReg);
2738     const unsigned DefSize = DefTy.getSizeInBits();
2739     const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
2740 
2741     // FIXME: Redundant check, but even less readable when factored out.
2742     if (isFP) {
2743       if (Ty != s16 && Ty != s32 && Ty != s64 && Ty != s128) {
2744         LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty
2745                           << " constant, expected: " << s16 << " or " << s32
2746                           << " or " << s64 << " or " << s128 << '\n');
2747         return false;
2748       }
2749 
2750       if (RB.getID() != AArch64::FPRRegBankID) {
2751         LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty
2752                           << " constant on bank: " << RB
2753                           << ", expected: FPR\n");
2754         return false;
2755       }
2756 
2757       // The case when we have 0.0 is covered by tablegen. Reject it here so we
2758       // can be sure tablegen works correctly and isn't rescued by this code.
2759       // 0.0 is not covered by tablegen for FP128. So we will handle this
2760       // scenario in the code here.
2761       if (DefSize != 128 && I.getOperand(1).getFPImm()->isExactlyValue(0.0))
2762         return false;
2763     } else {
2764       // s32 and s64 are covered by tablegen.
2765       if (Ty != p0 && Ty != s8 && Ty != s16) {
2766         LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty
2767                           << " constant, expected: " << s32 << ", " << s64
2768                           << ", or " << p0 << '\n');
2769         return false;
2770       }
2771 
2772       if (RB.getID() != AArch64::GPRRegBankID) {
2773         LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty
2774                           << " constant on bank: " << RB
2775                           << ", expected: GPR\n");
2776         return false;
2777       }
2778     }
2779 
2780     if (isFP) {
2781       const TargetRegisterClass &FPRRC = *getRegClassForTypeOnBank(DefTy, RB);
2782       // For 16, 64, and 128b values, emit a constant pool load.
2783       switch (DefSize) {
2784       default:
2785         llvm_unreachable("Unexpected destination size for G_FCONSTANT?");
2786       case 32:
2787       case 64: {
2788         bool OptForSize = shouldOptForSize(&MF);
2789         const auto &TLI = MF.getSubtarget().getTargetLowering();
2790         // If TLI says that this fpimm is illegal, then we'll expand to a
2791         // constant pool load.
2792         if (TLI->isFPImmLegal(I.getOperand(1).getFPImm()->getValueAPF(),
2793                               EVT::getFloatingPointVT(DefSize), OptForSize))
2794           break;
2795         [[fallthrough]];
2796       }
2797       case 16:
2798       case 128: {
2799         auto *FPImm = I.getOperand(1).getFPImm();
2800         auto *LoadMI = emitLoadFromConstantPool(FPImm, MIB);
2801         if (!LoadMI) {
2802           LLVM_DEBUG(dbgs() << "Failed to load double constant pool entry\n");
2803           return false;
2804         }
2805         MIB.buildCopy({DefReg}, {LoadMI->getOperand(0).getReg()});
2806         I.eraseFromParent();
2807         return RBI.constrainGenericRegister(DefReg, FPRRC, MRI);
2808       }
2809       }
2810 
2811       assert((DefSize == 32 || DefSize == 64) && "Unexpected const def size");
2812       // Either emit a FMOV, or emit a copy to emit a normal mov.
2813       const Register DefGPRReg = MRI.createVirtualRegister(
2814           DefSize == 32 ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass);
2815       MachineOperand &RegOp = I.getOperand(0);
2816       RegOp.setReg(DefGPRReg);
2817       MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator()));
2818       MIB.buildCopy({DefReg}, {DefGPRReg});
2819 
2820       if (!RBI.constrainGenericRegister(DefReg, FPRRC, MRI)) {
2821         LLVM_DEBUG(dbgs() << "Failed to constrain G_FCONSTANT def operand\n");
2822         return false;
2823       }
2824 
2825       MachineOperand &ImmOp = I.getOperand(1);
2826       // FIXME: Is going through int64_t always correct?
2827       ImmOp.ChangeToImmediate(
2828           ImmOp.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
2829     } else if (I.getOperand(1).isCImm()) {
2830       uint64_t Val = I.getOperand(1).getCImm()->getZExtValue();
2831       I.getOperand(1).ChangeToImmediate(Val);
2832     } else if (I.getOperand(1).isImm()) {
2833       uint64_t Val = I.getOperand(1).getImm();
2834       I.getOperand(1).ChangeToImmediate(Val);
2835     }
2836 
2837     const unsigned MovOpc =
2838         DefSize == 64 ? AArch64::MOVi64imm : AArch64::MOVi32imm;
2839     I.setDesc(TII.get(MovOpc));
2840     constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2841     return true;
2842   }
2843   case TargetOpcode::G_EXTRACT: {
2844     Register DstReg = I.getOperand(0).getReg();
2845     Register SrcReg = I.getOperand(1).getReg();
2846     LLT SrcTy = MRI.getType(SrcReg);
2847     LLT DstTy = MRI.getType(DstReg);
2848     (void)DstTy;
2849     unsigned SrcSize = SrcTy.getSizeInBits();
2850 
2851     if (SrcTy.getSizeInBits() > 64) {
2852       // This should be an extract of an s128, which is like a vector extract.
2853       if (SrcTy.getSizeInBits() != 128)
2854         return false;
2855       // Only support extracting 64 bits from an s128 at the moment.
2856       if (DstTy.getSizeInBits() != 64)
2857         return false;
2858 
2859       unsigned Offset = I.getOperand(2).getImm();
2860       if (Offset % 64 != 0)
2861         return false;
2862 
2863       // Check we have the right regbank always.
2864       const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
2865       const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
2866       assert(SrcRB.getID() == DstRB.getID() && "Wrong extract regbank!");
2867 
2868       if (SrcRB.getID() == AArch64::GPRRegBankID) {
2869         auto NewI =
2870             MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
2871                 .addUse(SrcReg, 0,
2872                         Offset == 0 ? AArch64::sube64 : AArch64::subo64);
2873         constrainOperandRegClass(MF, TRI, MRI, TII, RBI, *NewI,
2874                                  AArch64::GPR64RegClass, NewI->getOperand(0));
2875         I.eraseFromParent();
2876         return true;
2877       }
2878 
2879       // Emit the same code as a vector extract.
2880       // Offset must be a multiple of 64.
2881       unsigned LaneIdx = Offset / 64;
2882       MachineInstr *Extract = emitExtractVectorElt(
2883           DstReg, DstRB, LLT::scalar(64), SrcReg, LaneIdx, MIB);
2884       if (!Extract)
2885         return false;
2886       I.eraseFromParent();
2887       return true;
2888     }
2889 
2890     I.setDesc(TII.get(SrcSize == 64 ? AArch64::UBFMXri : AArch64::UBFMWri));
2891     MachineInstrBuilder(MF, I).addImm(I.getOperand(2).getImm() +
2892                                       Ty.getSizeInBits() - 1);
2893 
2894     if (SrcSize < 64) {
2895       assert(SrcSize == 32 && DstTy.getSizeInBits() == 16 &&
2896              "unexpected G_EXTRACT types");
2897       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2898     }
2899 
2900     DstReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
2901     MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator()));
2902     MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {})
2903         .addReg(DstReg, 0, AArch64::sub_32);
2904     RBI.constrainGenericRegister(I.getOperand(0).getReg(),
2905                                  AArch64::GPR32RegClass, MRI);
2906     I.getOperand(0).setReg(DstReg);
2907 
2908     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2909   }
2910 
2911   case TargetOpcode::G_INSERT: {
         // Select a scalar G_INSERT by rewriting I in place into a bitfield move:
         // BFMXri when the destination is 64 bits wide, BFMWri otherwise. The
         // offset immediate (operand 3) is replaced by (DstSize - LSB) % DstSize
         // and a second immediate, Width - 1, is appended, where Width is the bit
         // size of the inserted value (operand 2).
         // NOTE(review): `Ty`, `MF` and `MBB` come from the enclosing function,
         // outside this excerpt; `Ty` is presumably the type of operand 0.
2912     LLT SrcTy = MRI.getType(I.getOperand(2).getReg());
2913     LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2914     unsigned DstSize = DstTy.getSizeInBits();
2915     // Larger inserts are vectors, same-size ones should be something else by
2916     // now (split up or turned into COPYs).
2917     if (Ty.getSizeInBits() > 64 || SrcTy.getSizeInBits() > 32)
2918       return false;
2919 
2920     I.setDesc(TII.get(DstSize == 64 ? AArch64::BFMXri : AArch64::BFMWri));
2921     unsigned LSB = I.getOperand(3).getImm();
2922     unsigned Width = MRI.getType(I.getOperand(2).getReg()).getSizeInBits();
2923     I.getOperand(3).setImm((DstSize - LSB) % DstSize);
2924     MachineInstrBuilder(MF, I).addImm(Width - 1);
2925 
         // Destinations narrower than 64 bits need no source widening; only the
         // s16-into-s32 combination is expected here (see the assert).
2926     if (DstSize < 64) {
2927       assert(DstSize == 32 && SrcTy.getSizeInBits() == 16 &&
2928              "unexpected G_INSERT types");
2929       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2930     }
2931 
         // 64-bit destination: wrap the inserted value in a new s64 vreg via a
         // SUBREG_TO_REG (sub_32) emitted before I, constrain the original value
         // register to GPR32, and make I read the widened register instead.
2932     Register SrcReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
2933     BuildMI(MBB, I.getIterator(), I.getDebugLoc(),
2934             TII.get(AArch64::SUBREG_TO_REG))
2935         .addDef(SrcReg)
2936         .addImm(0)
2937         .addUse(I.getOperand(2).getReg())
2938         .addImm(AArch64::sub_32);
2939     RBI.constrainGenericRegister(I.getOperand(2).getReg(),
2940                                  AArch64::GPR32RegClass, MRI);
2941     I.getOperand(2).setReg(SrcReg);
2942 
2943     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2944   }
2945   case TargetOpcode::G_FRAME_INDEX: {
         // Rewrite G_FRAME_INDEX in place as ADDXri with two zero immediates
         // appended (a #0 shifted immediate). Selection fails unless the result
         // type is a 64-bit pointer in address space 0.
2946     // allocas and G_FRAME_INDEX are only supported in addrspace(0).
2947     if (Ty != LLT::pointer(0, 64)) {
2948       LLVM_DEBUG(dbgs() << "G_FRAME_INDEX pointer has type: " << Ty
2949                         << ", expected: " << LLT::pointer(0, 64) << '\n');
2950       return false;
2951     }
2952     I.setDesc(TII.get(AArch64::ADDXri));
2953 
2954     // MOs for a #0 shifted immediate.
2955     I.addOperand(MachineOperand::CreateImm(0));
2956     I.addOperand(MachineOperand::CreateImm(0));
2957 
2958     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2959   }
2960 
2961   case TargetOpcode::G_GLOBAL_VALUE: {
         // Select the address of a global (or external symbol). Operand 1 is
         // either a symbol operand, whose target flags are taken as-is, or a
         // GlobalValue, whose flags come from the subtarget's reference
         // classification. Thread-local globals are handed to
         // selectTLSGlobalValue (and rejected under emulated TLS). The remaining
         // cases choose between a GOT load, a large-code-model materialization,
         // ADR (tiny code model) and MOVaddr.
2962     const GlobalValue *GV = nullptr;
2963     unsigned OpFlags;
2964     if (I.getOperand(1).isSymbol()) {
2965       OpFlags = I.getOperand(1).getTargetFlags();
2966       // Currently only used by "RtLibUseGOT".
2967       assert(OpFlags == AArch64II::MO_GOT);
2968     } else {
2969       GV = I.getOperand(1).getGlobal();
2970       if (GV->isThreadLocal()) {
2971         // We don't support instructions with emulated TLS variables yet
2972         if (TM.useEmulatedTLS())
2973           return false;
2974         return selectTLSGlobalValue(I, MRI);
2975       }
2976       OpFlags = STI.ClassifyGlobalReference(GV, TM);
2977     }
2978 
2979     if (OpFlags & AArch64II::MO_GOT) {
           // GOT access: LOADgotAUTH when the function uses a signed ELF GOT,
           // plain LOADgot otherwise.
2980       I.setDesc(TII.get(MF.getInfo<AArch64FunctionInfo>()->hasELFSignedGOT()
2981                             ? AArch64::LOADgotAUTH
2982                             : AArch64::LOADgot));
2983       I.getOperand(1).setTargetFlags(OpFlags);
2984     } else if (TM.getCodeModel() == CodeModel::Large &&
2985                !TM.isPositionIndependent()) {
2986       // Materialize the global using movz/movk instructions.
2987       materializeLargeCMVal(I, GV, OpFlags);
2988       I.eraseFromParent();
2989       return true;
2990     } else if (TM.getCodeModel() == CodeModel::Tiny) {
2991       I.setDesc(TII.get(AArch64::ADR));
2992       I.getOperand(1).setTargetFlags(OpFlags);
2993     } else {
           // Default: MOVaddr takes the address twice, once flagged MO_PAGE
           // (the existing operand) and once flagged MO_PAGEOFF | MO_NC (the
           // operand appended here, reusing the original offset).
2994       I.setDesc(TII.get(AArch64::MOVaddr));
2995       I.getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_PAGE);
2996       MachineInstrBuilder MIB(MF, I);
2997       MIB.addGlobalAddress(GV, I.getOperand(1).getOffset(),
2998                            OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
2999     }
3000     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3001   }
3002 
3003   case TargetOpcode::G_PTRAUTH_GLOBAL_VALUE:
         // Entirely delegated to the dedicated helper.
3004     return selectPtrAuthGlobalValue(I, MRI);
3005 
3006   case TargetOpcode::G_ZEXTLOAD:
3007   case TargetOpcode::G_LOAD:
3008   case TargetOpcode::G_STORE: {
         // Manual selection of plain, zero-extending and atomic loads/stores.
         // Steps, in order:
         //   1. Reject anything whose pointer is not a 64-bit addrspace(0) pointer.
         //   2. Orderings stronger than monotonic are rewritten in place to
         //      LDAPR*/LDAR*/STLR*, indexed by log2 of the access size in bytes.
         //   3. Truncating stores and FPR any-extending loads are normalized with
         //      a sub-register COPY / SUBREG_TO_REG.
         //   4. The unsigned-immediate opcode is chosen and, if possible, an
         //      address computation is folded into the addressing mode.
         //   5. Stores of constant zero use WZR/XZR; extending loads to s64 load
         //      into a GPR32 and widen with SUBREG_TO_REG.
3009     GLoadStore &LdSt = cast<GLoadStore>(I);
3010     bool IsZExtLoad = I.getOpcode() == TargetOpcode::G_ZEXTLOAD;
3011     LLT PtrTy = MRI.getType(LdSt.getPointerReg());
3012 
3013     // Can only handle AddressSpace 0, 64-bit pointers.
3014     if (PtrTy != LLT::pointer(0, 64)) {
3015       return false;
3016     }
3017 
3018     uint64_t MemSizeInBytes = LdSt.getMemSize().getValue();
3019     unsigned MemSizeInBits = LdSt.getMemSizeInBits().getValue();
3020     AtomicOrdering Order = LdSt.getMMO().getSuccessOrdering();
3021 
3022     // Need special instructions for atomics that affect ordering.
3023     if (Order != AtomicOrdering::NotAtomic &&
3024         Order != AtomicOrdering::Unordered &&
3025         Order != AtomicOrdering::Monotonic) {
3026       assert(!isa<GZExtLoad>(LdSt));
3027       assert(MemSizeInBytes <= 8 &&
3028              "128-bit atomics should already be custom-legalized");
3029 
3030       if (isa<GLoad>(LdSt)) {
3031         static constexpr unsigned LDAPROpcodes[] = {
3032             AArch64::LDAPRB, AArch64::LDAPRH, AArch64::LDAPRW, AArch64::LDAPRX};
3033         static constexpr unsigned LDAROpcodes[] = {
3034             AArch64::LDARB, AArch64::LDARH, AArch64::LDARW, AArch64::LDARX};
             // LDAPR is used only when the subtarget has RCPC and the ordering
             // is weaker than sequentially consistent; otherwise LDAR.
3035         ArrayRef<unsigned> Opcodes =
3036             STI.hasRCPC() && Order != AtomicOrdering::SequentiallyConsistent
3037                 ? LDAPROpcodes
3038                 : LDAROpcodes;
3039         I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)]));
3040       } else {
3041         static constexpr unsigned Opcodes[] = {AArch64::STLRB, AArch64::STLRH,
3042                                                AArch64::STLRW, AArch64::STLRX};
3043         Register ValReg = LdSt.getReg(0);
3044         if (MRI.getType(ValReg).getSizeInBits() == 64 && MemSizeInBits != 64) {
3045           // Emit a subreg copy of 32 bits.
3046           Register NewVal = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3047           MIB.buildInstr(TargetOpcode::COPY, {NewVal}, {})
3048               .addReg(I.getOperand(0).getReg(), 0, AArch64::sub_32);
3049           I.getOperand(0).setReg(NewVal);
3050         }
3051         I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)]));
3052       }
           // NOTE(review): the result of constraining is ignored on this path;
           // the atomic case always reports success.
3053       constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3054       return true;
3055     }
3056 
3057 #ifndef NDEBUG
3058     const Register PtrReg = LdSt.getPointerReg();
3059     const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI);
3060     // Check that the pointer register is valid.
3061     assert(PtrRB.getID() == AArch64::GPRRegBankID &&
3062            "Load/Store pointer operand isn't a GPR");
3063     assert(MRI.getType(PtrReg).isPointer() &&
3064            "Load/Store pointer operand isn't a pointer");
3065 #endif
3066 
3067     const Register ValReg = LdSt.getReg(0);
3068     const RegisterBank &RB = *RBI.getRegBank(ValReg, MRI, TRI);
3069     LLT ValTy = MRI.getType(ValReg);
3070 
3071     // The code below doesn't support truncating stores, so we need to split it
3072     // again.
3073     if (isa<GStore>(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) {
3074       unsigned SubReg;
3075       LLT MemTy = LdSt.getMMO().getMemoryType();
3076       auto *RC = getRegClassForTypeOnBank(MemTy, RB);
3077       if (!getSubRegForClass(RC, TRI, SubReg))
3078         return false;
3079 
3080       // Generate a subreg copy.
3081       auto Copy = MIB.buildInstr(TargetOpcode::COPY, {MemTy}, {})
3082                       .addReg(ValReg, 0, SubReg)
3083                       .getReg(0);
3084       RBI.constrainGenericRegister(Copy, *RC, MRI);
3085       LdSt.getOperand(0).setReg(Copy);
3086     } else if (isa<GLoad>(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) {
3087       // If this is an any-extending load from the FPR bank, split it into a regular
3088       // load + extend.
3089       if (RB.getID() == AArch64::FPRRegBankID) {
3090         unsigned SubReg;
3091         LLT MemTy = LdSt.getMMO().getMemoryType();
3092         auto *RC = getRegClassForTypeOnBank(MemTy, RB);
3093         if (!getSubRegForClass(RC, TRI, SubReg))
3094           return false;
             // The load now defines a fresh register of the memory type on the
             // same bank; the original destination is redefined by the
             // SUBREG_TO_REG placed right after the load.
3095         Register OldDst = LdSt.getReg(0);
3096         Register NewDst =
3097             MRI.createGenericVirtualRegister(LdSt.getMMO().getMemoryType());
3098         LdSt.getOperand(0).setReg(NewDst);
3099         MRI.setRegBank(NewDst, RB);
3100         // Generate a SUBREG_TO_REG to extend it.
3101         MIB.setInsertPt(MIB.getMBB(), std::next(LdSt.getIterator()));
3102         MIB.buildInstr(AArch64::SUBREG_TO_REG, {OldDst}, {})
3103             .addImm(0)
3104             .addUse(NewDst)
3105             .addImm(SubReg);
3106         auto SubRegRC = getRegClassForTypeOnBank(MRI.getType(OldDst), RB);
3107         RBI.constrainGenericRegister(OldDst, *SubRegRC, MRI);
             // Restore the builder's insertion point to the load itself.
3108         MIB.setInstr(LdSt);
3109         ValTy = MemTy; // This is no longer an extending load.
3110       }
3111     }
3112 
3113     // Helper lambda for partially selecting I. Either returns the original
3114     // instruction with an updated opcode, or a new instruction.
         // Returns nullptr when selectLoadStoreUIOp leaves the opcode unchanged,
         // i.e. no target opcode was found. When a new instruction is built, I
         // is erased, so callers must only use the returned pointer afterwards.
3115     auto SelectLoadStoreAddressingMode = [&]() -> MachineInstr * {
3116       bool IsStore = isa<GStore>(I);
3117       const unsigned NewOpc =
3118           selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemSizeInBits);
3119       if (NewOpc == I.getOpcode())
3120         return nullptr;
3121       // Check if we can fold anything into the addressing mode.
3122       auto AddrModeFns =
3123           selectAddrModeIndexed(I.getOperand(1), MemSizeInBytes);
3124       if (!AddrModeFns) {
3125         // Can't fold anything. Use the original instruction.
3126         I.setDesc(TII.get(NewOpc));
3127         I.addOperand(MachineOperand::CreateImm(0));
3128         return &I;
3129       }
3130 
3131       // Folded something. Create a new instruction and return it.
3132       auto NewInst = MIB.buildInstr(NewOpc, {}, {}, I.getFlags());
3133       Register CurValReg = I.getOperand(0).getReg();
3134       IsStore ? NewInst.addUse(CurValReg) : NewInst.addDef(CurValReg);
3135       NewInst.cloneMemRefs(I);
3136       for (auto &Fn : *AddrModeFns)
3137         Fn(NewInst);
3138       I.eraseFromParent();
3139       return &*NewInst;
3140     };
3141 
3142     MachineInstr *LoadStore = SelectLoadStoreAddressingMode();
3143     if (!LoadStore)
3144       return false;
3145 
3146     // If we're storing a 0, use WZR/XZR.
3147     if (Opcode == TargetOpcode::G_STORE) {
3148       auto CVal = getIConstantVRegValWithLookThrough(
3149           LoadStore->getOperand(0).getReg(), MRI);
3150       if (CVal && CVal->Value == 0) {
             // Only the GPR unsigned-immediate store opcodes listed here are
             // rewritten; any other opcode keeps its value register.
3151         switch (LoadStore->getOpcode()) {
3152         case AArch64::STRWui:
3153         case AArch64::STRHHui:
3154         case AArch64::STRBBui:
3155           LoadStore->getOperand(0).setReg(AArch64::WZR);
3156           break;
3157         case AArch64::STRXui:
3158           LoadStore->getOperand(0).setReg(AArch64::XZR);
3159           break;
3160         }
3161       }
3162     }
3163 
3164     if (IsZExtLoad || (Opcode == TargetOpcode::G_LOAD &&
3165                        ValTy == LLT::scalar(64) && MemSizeInBits == 32)) {
3166       // The any/zextload from a smaller type to i32 should be handled by the
3167       // importer.
3168       if (MRI.getType(LoadStore->getOperand(0).getReg()).getSizeInBits() != 64)
3169         return false;
3170       // If we have an extending load then change the load's type to be a
3171       // narrower reg and zero_extend with SUBREG_TO_REG.
3172       Register LdReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3173       Register DstReg = LoadStore->getOperand(0).getReg();
3174       LoadStore->getOperand(0).setReg(LdReg);
3175 
3176       MIB.setInsertPt(MIB.getMBB(), std::next(LoadStore->getIterator()));
3177       MIB.buildInstr(AArch64::SUBREG_TO_REG, {DstReg}, {})
3178           .addImm(0)
3179           .addUse(LdReg)
3180           .addImm(AArch64::sub_32);
3181       constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI);
3182       return RBI.constrainGenericRegister(DstReg, AArch64::GPR64allRegClass,
3183                                           MRI);
3184     }
3185     return constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI);
3186   }
3187 
         // Pre/post-indexed memory operations are each handled by a dedicated
         // helper; the extending variants share selectIndexedExtLoad.
3188   case TargetOpcode::G_INDEXED_ZEXTLOAD:
3189   case TargetOpcode::G_INDEXED_SEXTLOAD:
3190     return selectIndexedExtLoad(I, MRI);
3191   case TargetOpcode::G_INDEXED_LOAD:
3192     return selectIndexedLoad(I, MRI);
3193   case TargetOpcode::G_INDEXED_STORE:
3194     return selectIndexedStore(cast<GIndexedStore>(I), MRI);
3195 
3196   case TargetOpcode::G_LSHR:
3197   case TargetOpcode::G_ASHR:
         // Vector right shifts go to their own helper; scalar ones fall through
         // to the shared shift handling under G_SHL and then to the generic
         // binary-op selection under G_OR.
3198     if (MRI.getType(I.getOperand(0).getReg()).isVector())
3199       return selectVectorAshrLshr(I, MRI);
3200     [[fallthrough]];
3201   case TargetOpcode::G_SHL:
         // The opcode check is needed because G_LSHR/G_ASHR also reach this
         // label via the fallthrough above.
3202     if (Opcode == TargetOpcode::G_SHL &&
3203         MRI.getType(I.getOperand(0).getReg()).isVector())
3204       return selectVectorSHL(I, MRI);
3205 
3206     // These shifts were legalized to have 64 bit shift amounts because we
3207     // want to take advantage of the selection patterns that assume the
3208     // immediates are s64s, however, selectBinaryOp will assume both operands
3209     // will have the same bit size.
3210     {
3211       Register SrcReg = I.getOperand(1).getReg();
3212       Register ShiftReg = I.getOperand(2).getReg();
3213       const LLT ShiftTy = MRI.getType(ShiftReg);
3214       const LLT SrcTy = MRI.getType(SrcReg);
3215       if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 32 &&
3216           ShiftTy.getSizeInBits() == 64) {
3217         assert(!ShiftTy.isVector() && "unexpected vector shift ty");
3218         // Insert a subregister copy to implement a 64->32 trunc
3219         auto Trunc = MIB.buildInstr(TargetOpcode::COPY, {SrcTy}, {})
3220                          .addReg(ShiftReg, 0, AArch64::sub_32);
3221         MRI.setRegBank(Trunc.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID));
3222         I.getOperand(2).setReg(Trunc.getReg(0));
3223       }
3224     }
3225     [[fallthrough]];
3226   case TargetOpcode::G_OR: {
3227     // Reject the various things we don't support yet.
3228     if (unsupportedBinOp(I, RBI, MRI, TRI))
3229       return false;
3230 
3231     const unsigned OpSize = Ty.getSizeInBits();
3232 
3233     const Register DefReg = I.getOperand(0).getReg();
3234     const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
3235 
         // selectBinaryOp hands back the generic opcode unchanged when it has no
         // mapping for this bank/size combination.
3236     const unsigned NewOpc = selectBinaryOp(I.getOpcode(), RB.getID(), OpSize);
3237     if (NewOpc == I.getOpcode())
3238       return false;
3239 
3240     I.setDesc(TII.get(NewOpc));
3241     // FIXME: Should the type be always reset in setDesc?
3242 
3243     // Now that we selected an opcode, we need to constrain the register
3244     // operands to use appropriate classes.
3245     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3246   }
3247 
3248   case TargetOpcode::G_PTR_ADD: {
         // Pointer addition is emitted through the shared emitADD helper; the
         // generic instruction is then removed.
3249     emitADD(I.getOperand(0).getReg(), I.getOperand(1), I.getOperand(2), MIB);
3250     I.eraseFromParent();
3251     return true;
3252   }
3253 
         // All add/sub-with-overflow and add/sub-with-carry variants share one
         // helper.
3254   case TargetOpcode::G_SADDE:
3255   case TargetOpcode::G_UADDE:
3256   case TargetOpcode::G_SSUBE:
3257   case TargetOpcode::G_USUBE:
3258   case TargetOpcode::G_SADDO:
3259   case TargetOpcode::G_UADDO:
3260   case TargetOpcode::G_SSUBO:
3261   case TargetOpcode::G_USUBO:
3262     return selectOverflowOp(I, MRI);
3263 
3264   case TargetOpcode::G_PTRMASK: {
         // Select G_PTRMASK as ANDXri. This only succeeds when the mask operand
         // is a known constant that is a shifted mask; the mask register operand
         // is then replaced by the encoded 64-bit logical immediate.
3265     Register MaskReg = I.getOperand(2).getReg();
3266     std::optional<int64_t> MaskVal = getIConstantVRegSExtVal(MaskReg, MRI);
3267     // TODO: Implement arbitrary cases
3268     if (!MaskVal || !isShiftedMask_64(*MaskVal))
3269       return false;
3270 
3271     uint64_t Mask = *MaskVal;
3272     I.setDesc(TII.get(AArch64::ANDXri));
3273     I.getOperand(2).ChangeToImmediate(
3274         AArch64_AM::encodeLogicalImmediate(Mask, 64));
3275 
3276     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3277   }
3278   case TargetOpcode::G_PTRTOINT:
3279   case TargetOpcode::G_TRUNC: {
         // Select G_TRUNC / G_PTRTOINT. Source and destination must be on the
         // same register bank.
         //   * GPR: the instruction becomes a COPY, reading sub_32 of the source
         //     when going from GPR64 to GPR32.
         //   * FPR: <4 x s32> -> <4 x s16> uses XTNv4i16; a non-vector 128-bit
         //     source is handled as a lane-0 vector element extract; a remaining
         //     G_PTRTOINT becomes a COPY.
         // Anything else is left unselected.
3280     const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
3281     const LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
3282 
3283     const Register DstReg = I.getOperand(0).getReg();
3284     const Register SrcReg = I.getOperand(1).getReg();
3285 
3286     const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
3287     const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
3288 
3289     if (DstRB.getID() != SrcRB.getID()) {
3290       LLVM_DEBUG(
3291           dbgs() << "G_TRUNC/G_PTRTOINT input/output on different banks\n");
3292       return false;
3293     }
3294 
3295     if (DstRB.getID() == AArch64::GPRRegBankID) {
3296       const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(DstTy, DstRB);
3297       if (!DstRC)
3298         return false;
3299 
3300       const TargetRegisterClass *SrcRC = getRegClassForTypeOnBank(SrcTy, SrcRB);
3301       if (!SrcRC)
3302         return false;
3303 
3304       if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
3305           !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
3306         LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC/G_PTRTOINT\n");
3307         return false;
3308       }
3309 
3310       if (DstRC == SrcRC) {
3311         // Nothing to be done
3312       } else if (Opcode == TargetOpcode::G_TRUNC && DstTy == LLT::scalar(32) &&
3313                  SrcTy == LLT::scalar(64)) {
             // The return below is never reached; it follows llvm_unreachable.
3314         llvm_unreachable("TableGen can import this case");
3315         return false;
3316       } else if (DstRC == &AArch64::GPR32RegClass &&
3317                  SrcRC == &AArch64::GPR64RegClass) {
3318         I.getOperand(1).setSubReg(AArch64::sub_32);
3319       } else {
3320         LLVM_DEBUG(
3321             dbgs() << "Unhandled mismatched classes in G_TRUNC/G_PTRTOINT\n");
3322         return false;
3323       }
3324 
3325       I.setDesc(TII.get(TargetOpcode::COPY));
3326       return true;
3327     } else if (DstRB.getID() == AArch64::FPRRegBankID) {
3328       if (DstTy == LLT::fixed_vector(4, 16) &&
3329           SrcTy == LLT::fixed_vector(4, 32)) {
3330         I.setDesc(TII.get(AArch64::XTNv4i16));
3331         constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3332         return true;
3333       }
3334 
           // Truncating a 128-bit scalar held in an FPR: take lane 0 at the
           // destination's width.
3335       if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128) {
3336         MachineInstr *Extract = emitExtractVectorElt(
3337             DstReg, DstRB, LLT::scalar(DstTy.getSizeInBits()), SrcReg, 0, MIB);
3338         if (!Extract)
3339           return false;
3340         I.eraseFromParent();
3341         return true;
3342       }
3343 
3344       // We might have a vector G_PTRTOINT, in which case just emit a COPY.
3345       if (Opcode == TargetOpcode::G_PTRTOINT) {
3346         assert(DstTy.isVector() && "Expected an FPR ptrtoint to be a vector");
3347         I.setDesc(TII.get(TargetOpcode::COPY));
3348         return selectCopy(I, TII, MRI, TRI, RBI);
3349       }
3350     }
3351 
3352     return false;
3353   }
3354 
3355   case TargetOpcode::G_ANYEXT: {
         // Select G_ANYEXT as a copy. selectUSMovFromExtend gets the first try;
         // otherwise both operands must be on the GPR bank and the destination
         // must be at most 32 bits or exactly 64 bits. For a 64-bit destination
         // the source is first wrapped in a SUBREG_TO_REG (sub_32) producing a
         // GPR64all register, which then feeds the copy.
3356     if (selectUSMovFromExtend(I, MRI))
3357       return true;
3358 
3359     const Register DstReg = I.getOperand(0).getReg();
3360     const Register SrcReg = I.getOperand(1).getReg();
3361 
3362     const RegisterBank &RBDst = *RBI.getRegBank(DstReg, MRI, TRI);
3363     if (RBDst.getID() != AArch64::GPRRegBankID) {
3364       LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBDst
3365                         << ", expected: GPR\n");
3366       return false;
3367     }
3368 
3369     const RegisterBank &RBSrc = *RBI.getRegBank(SrcReg, MRI, TRI);
3370     if (RBSrc.getID() != AArch64::GPRRegBankID) {
3371       LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBSrc
3372                         << ", expected: GPR\n");
3373       return false;
3374     }
3375 
3376     const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
3377 
3378     if (DstSize == 0) {
3379       LLVM_DEBUG(dbgs() << "G_ANYEXT operand has no size, not a gvreg?\n");
3380       return false;
3381     }
3382 
         // Sizes strictly between 32 and 64, and anything above 64, are
         // rejected here.
3383     if (DstSize != 64 && DstSize > 32) {
3384       LLVM_DEBUG(dbgs() << "G_ANYEXT to size: " << DstSize
3385                         << ", expected: 32 or 64\n");
3386       return false;
3387     }
3388     // At this point G_ANYEXT is just like a plain COPY, but we need
3389     // to explicitly form the 64-bit value if any.
3390     if (DstSize > 32) {
3391       Register ExtSrc = MRI.createVirtualRegister(&AArch64::GPR64allRegClass);
3392       BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG))
3393           .addDef(ExtSrc)
3394           .addImm(0)
3395           .addUse(SrcReg)
3396           .addImm(AArch64::sub_32);
3397       I.getOperand(1).setReg(ExtSrc);
3398     }
3399     return selectCopy(I, TII, MRI, TRI, RBI);
3400   }
3401 
3402   case TargetOpcode::G_ZEXT:
3403   case TargetOpcode::G_SEXT_INREG:
3404   case TargetOpcode::G_SEXT: {
         // Select scalar zero/sign extensions on the GPR bank.
         //   * selectUSMovFromExtend gets the first try.
         //   * Vector destinations are left to imported patterns.
         //   * G_ZEXT of a sub-32-bit GPR load of the same width becomes a copy.
         //   * G_ZEXT s32 -> s64 on GPR becomes ORRWrs + SUBREG_TO_REG.
         //   * Everything else becomes SBFM/UBFM (X form for 64-bit results, W
         //     form for results of at most 32 bits) with immediates 0 and
         //     SrcSize - 1.
3405     if (selectUSMovFromExtend(I, MRI))
3406       return true;
3407 
         // Note: this local shadows any `Opcode` of the enclosing function for
         // the rest of this case.
3408     unsigned Opcode = I.getOpcode();
3409     const bool IsSigned = Opcode != TargetOpcode::G_ZEXT;
3410     const Register DefReg = I.getOperand(0).getReg();
3411     Register SrcReg = I.getOperand(1).getReg();
3412     const LLT DstTy = MRI.getType(DefReg);
3413     const LLT SrcTy = MRI.getType(SrcReg);
3414     unsigned DstSize = DstTy.getSizeInBits();
3415     unsigned SrcSize = SrcTy.getSizeInBits();
3416 
3417     // SEXT_INREG has the same src reg size as dst, the size of the value to be
3418     // extended is encoded in the imm.
3419     if (Opcode == TargetOpcode::G_SEXT_INREG)
3420       SrcSize = I.getOperand(2).getImm();
3421 
3422     if (DstTy.isVector())
3423       return false; // Should be handled by imported patterns.
3424 
3425     assert((*RBI.getRegBank(DefReg, MRI, TRI)).getID() ==
3426                AArch64::GPRRegBankID &&
3427            "Unexpected ext regbank");
3428 
3429     MachineInstr *ExtI;
3430 
3431     // First check if we're extending the result of a load which has a dest type
3432     // smaller than 32 bits, then this zext is redundant. GPR32 is the smallest
3433     // GPR register on AArch64 and all loads which are smaller automatically
3434     // zero-extend the upper bits. E.g.
3435     // %v(s8) = G_LOAD %p, :: (load 1)
3436     // %v2(s32) = G_ZEXT %v(s8)
3437     if (!IsSigned) {
3438       auto *LoadMI = getOpcodeDef(TargetOpcode::G_LOAD, SrcReg, MRI);
3439       bool IsGPR =
3440           RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::GPRRegBankID;
3441       if (LoadMI && IsGPR) {
3442         const MachineMemOperand *MemOp = *LoadMI->memoperands_begin();
3443         unsigned BytesLoaded = MemOp->getSize().getValue();
3444         if (BytesLoaded < 4 && SrcTy.getSizeInBytes() == BytesLoaded)
3445           return selectCopy(I, TII, MRI, TRI, RBI);
3446       }
3447 
3448       // For the 32-bit -> 64-bit case, we can emit a mov (ORRWrs)
3449       // + SUBREG_TO_REG.
3450       if (IsGPR && SrcSize == 32 && DstSize == 64) {
3451         Register SubregToRegSrc =
3452             MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3453         const Register ZReg = AArch64::WZR;
3454         MIB.buildInstr(AArch64::ORRWrs, {SubregToRegSrc}, {ZReg, SrcReg})
3455             .addImm(0);
3456 
3457         MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {})
3458             .addImm(0)
3459             .addUse(SubregToRegSrc)
3460             .addImm(AArch64::sub_32);
3461 
3462         if (!RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass,
3463                                           MRI)) {
3464           LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT destination\n");
3465           return false;
3466         }
3467 
3468         if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass,
3469                                           MRI)) {
3470           LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT source\n");
3471           return false;
3472         }
3473 
3474         I.eraseFromParent();
3475         return true;
3476       }
3477     }
3478 
3479     if (DstSize == 64) {
           // G_SEXT_INREG already has a 64-bit source here, so only the real
           // extends need the source widened to a GPR64 first.
3480       if (Opcode != TargetOpcode::G_SEXT_INREG) {
3481         // FIXME: Can we avoid manually doing this?
3482         if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass,
3483                                           MRI)) {
3484           LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode)
3485                             << " operand\n");
3486           return false;
3487         }
3488         SrcReg = MIB.buildInstr(AArch64::SUBREG_TO_REG,
3489                                 {&AArch64::GPR64RegClass}, {})
3490                      .addImm(0)
3491                      .addUse(SrcReg)
3492                      .addImm(AArch64::sub_32)
3493                      .getReg(0);
3494       }
3495 
3496       ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMXri : AArch64::UBFMXri,
3497                              {DefReg}, {SrcReg})
3498                   .addImm(0)
3499                   .addImm(SrcSize - 1);
3500     } else if (DstSize <= 32) {
3501       ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMWri : AArch64::UBFMWri,
3502                              {DefReg}, {SrcReg})
3503                   .addImm(0)
3504                   .addImm(SrcSize - 1);
3505     } else {
3506       return false;
3507     }
3508 
3509     constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
3510     I.eraseFromParent();
3511     return true;
3512   }
3513 
3514   case TargetOpcode::G_SITOFP:
3515   case TargetOpcode::G_UITOFP:
3516   case TargetOpcode::G_FPTOSI:
3517   case TargetOpcode::G_FPTOUI: {
         // Integer <-> floating-point conversions: look up the target opcode for
         // the (destination, source) type pair and rewrite I in place.
         // selectFPConvOpc returning the generic opcode means "no mapping".
3518     const LLT DstTy = MRI.getType(I.getOperand(0).getReg()),
3519               SrcTy = MRI.getType(I.getOperand(1).getReg());
3520     const unsigned NewOpc = selectFPConvOpc(Opcode, DstTy, SrcTy);
3521     if (NewOpc == Opcode)
3522       return false;
3523 
3524     I.setDesc(TII.get(NewOpc));
3525     constrainSelectedInstRegOperands(I, TII, TRI, RBI);
         // setFlags replaces the instruction's flag set with NoFPExcept.
3526     I.setFlags(MachineInstr::NoFPExcept);
3527 
3528     return true;
3529   }
3530 
3531   case TargetOpcode::G_FREEZE:
         // G_FREEZE is selected as a plain copy.
3532     return selectCopy(I, TII, MRI, TRI, RBI);
3533 
3534   case TargetOpcode::G_INTTOPTR:
3535     // The importer is currently unable to import pointer types since they
3536     // didn't exist in SelectionDAG.
3537     return selectCopy(I, TII, MRI, TRI, RBI);
3538 
3539   case TargetOpcode::G_BITCAST:
3540     // Imported SelectionDAG rules can handle every bitcast except those that
3541     // bitcast from a type to the same type. Ideally, these shouldn't occur
3542     // but we might not run an optimizer that deletes them. The other exception
3543     // is bitcasts involving pointer types, as SelectionDAG has no knowledge
3544     // of them.
3545     return selectCopy(I, TII, MRI, TRI, RBI);
3546 
3547   case TargetOpcode::G_SELECT: {
         // Select G_SELECT. tryOptSelect gets the first try; otherwise the
         // condition register is tested against bit 0 with ANDSWri and
         // emitSelect is asked to choose between the true/false values on NE.
3548     auto &Sel = cast<GSelect>(I);
3549     const Register CondReg = Sel.getCondReg();
3550     const Register TReg = Sel.getTrueReg();
3551     const Register FReg = Sel.getFalseReg();
3552 
3553     if (tryOptSelect(Sel))
3554       return true;
3555 
3556     // Make sure to use an unused vreg instead of wzr, so that the peephole
3557     // optimizations will be able to optimize these.
3558     Register DeadVReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3559     auto TstMI = MIB.buildInstr(AArch64::ANDSWri, {DeadVReg}, {CondReg})
3560                      .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
3561     constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
3562     if (!emitSelect(Sel.getReg(0), TReg, FReg, AArch64CC::NE, MIB))
3563       return false;
3564     Sel.eraseFromParent();
3565     return true;
3566   }
3567   case TargetOpcode::G_ICMP: {
         // Select a scalar G_ICMP whose result type is s32: emit the integer
         // compare, then a CSINC of WZR, WZR under the inverted condition code
         // to produce the result. Vector compares and other result types are
         // not handled here.
3568     if (Ty.isVector())
3569       return false;
3570 
3571     if (Ty != LLT::scalar(32)) {
3572       LLVM_DEBUG(dbgs() << "G_ICMP result has type: " << Ty
3573                         << ", expected: " << LLT::scalar(32) << '\n');
3574       return false;
3575     }
3576 
3577     auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate());
3578     const AArch64CC::CondCode InvCC =
3579         changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred));
3580     emitIntegerCompare(I.getOperand(2), I.getOperand(3), I.getOperand(1), MIB);
3581     emitCSINC(/*Dst=*/I.getOperand(0).getReg(), /*Src1=*/AArch64::WZR,
3582               /*Src2=*/AArch64::WZR, InvCC, MIB);
3583     I.eraseFromParent();
3584     return true;
3585   }
3586 
3587   case TargetOpcode::G_FCMP: {
         // Select G_FCMP as a floating-point compare followed by a flag-to-value
         // sequence for the predicate. If either helper fails, selection fails
         // and I is left in place.
3588     CmpInst::Predicate Pred =
3589         static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate());
3590     if (!emitFPCompare(I.getOperand(2).getReg(), I.getOperand(3).getReg(), MIB,
3591                        Pred) ||
3592         !emitCSetForFCmp(I.getOperand(0).getReg(), Pred, MIB))
3593       return false;
3594     I.eraseFromParent();
3595     return true;
3596   }
3597   case TargetOpcode::G_VASTART:
         // va_start lowering depends on the ABI: Darwin targets use one helper,
         // all others use the AAPCS one.
3598     return STI.isTargetDarwin() ? selectVaStartDarwin(I, MF, MRI)
3599                                 : selectVaStartAAPCS(I, MF, MRI);
3600   case TargetOpcode::G_INTRINSIC:
3601     return selectIntrinsic(I, MRI);
3602   case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3603     return selectIntrinsicWithSideEffects(I, MRI);
3603     return selectIntrinsicWithSideEffects(I, MRI);
3604   case TargetOpcode::G_IMPLICIT_DEF: {
3605     I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
3606     const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
3607     const Register DstReg = I.getOperand(0).getReg();
3608     const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
3609     const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(DstTy, DstRB);
3610     RBI.constrainGenericRegister(DstReg, *DstRC, MRI);
3611     return true;
3612   }
3613   case TargetOpcode::G_BLOCK_ADDR: {
         // Materialize the address of a basic block. Three strategies, in order:
         //   1. If the subtarget reports a pointer-authentication discriminator
         //      for block addresses of the owning function, emit MOVaddrPAC
         //      (key IA, XZR address discriminator) preceded by IMPLICIT_DEFs of
         //      X16 and X17, then copy X16 into the destination.
         //   2. Large, non-position-independent code model: materialize via
         //      materializeLargeCMVal.
         //   3. Otherwise: MOVaddrBA with MO_PAGE and MO_NC | MO_PAGEOFF operands.
         // In every case the generic instruction I is erased.
3614     Function *BAFn = I.getOperand(1).getBlockAddress()->getFunction();
3615     if (std::optional<uint16_t> BADisc =
3616             STI.getPtrAuthBlockAddressDiscriminatorIfEnabled(*BAFn)) {
3617       MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {AArch64::X16}, {});
3618       MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {AArch64::X17}, {});
3619       MIB.buildInstr(AArch64::MOVaddrPAC)
3620           .addBlockAddress(I.getOperand(1).getBlockAddress())
3621           .addImm(AArch64PACKey::IA)
3622           .addReg(/*AddrDisc=*/AArch64::XZR)
3623           .addImm(*BADisc)
3624           .constrainAllUses(TII, TRI, RBI);
3625       MIB.buildCopy(I.getOperand(0).getReg(), Register(AArch64::X16));
3626       RBI.constrainGenericRegister(I.getOperand(0).getReg(),
3627                                    AArch64::GPR64RegClass, MRI);
3628       I.eraseFromParent();
3629       return true;
3630     }
3631     if (TM.getCodeModel() == CodeModel::Large && !TM.isPositionIndependent()) {
3632       materializeLargeCMVal(I, I.getOperand(1).getBlockAddress(), 0);
3633       I.eraseFromParent();
3634       return true;
3635     } else {
           // A fresh MOVaddrBA is built before I, and I is erased right after,
           // so I's own descriptor is deliberately left untouched.
3637       auto MovMI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::MOVaddrBA),
3638                            I.getOperand(0).getReg())
3639                        .addBlockAddress(I.getOperand(1).getBlockAddress(),
3640                                         /* Offset */ 0, AArch64II::MO_PAGE)
3641                        .addBlockAddress(
3642                            I.getOperand(1).getBlockAddress(), /* Offset */ 0,
3643                            AArch64II::MO_NC | AArch64II::MO_PAGEOFF);
3644       I.eraseFromParent();
3645       return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI);
3646     }
3647   }
3648   case AArch64::G_DUP: {
3649     // When the scalar of G_DUP is an s8/s16 gpr, they can't be selected by
3650     // imported patterns. Do it manually here. Avoiding generating s16 gpr is
3651     // difficult because at RBS we may end up pessimizing the fpr case if we
3652     // decided to add an anyextend to fix this. Manual selection is the most
3653     // robust solution for now.
3654     if (RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() !=
3655         AArch64::GPRRegBankID)
3656       return false; // We expect the fpr regbank case to be imported.
3657     LLT VecTy = MRI.getType(I.getOperand(0).getReg());
3658     if (VecTy == LLT::fixed_vector(8, 8))
3659       I.setDesc(TII.get(AArch64::DUPv8i8gpr));
3660     else if (VecTy == LLT::fixed_vector(16, 8))
3661       I.setDesc(TII.get(AArch64::DUPv16i8gpr));
3662     else if (VecTy == LLT::fixed_vector(4, 16))
3663       I.setDesc(TII.get(AArch64::DUPv4i16gpr));
3664     else if (VecTy == LLT::fixed_vector(8, 16))
3665       I.setDesc(TII.get(AArch64::DUPv8i16gpr));
3666     else
3667       return false;
3668     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3669   }
3670   case TargetOpcode::G_BUILD_VECTOR:
3671     return selectBuildVector(I, MRI);
3672   case TargetOpcode::G_MERGE_VALUES:
3673     return selectMergeValues(I, MRI);
3674   case TargetOpcode::G_UNMERGE_VALUES:
3675     return selectUnmergeValues(I, MRI);
3676   case TargetOpcode::G_SHUFFLE_VECTOR:
3677     return selectShuffleVector(I, MRI);
3678   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3679     return selectExtractElt(I, MRI);
3680   case TargetOpcode::G_CONCAT_VECTORS:
3681     return selectConcatVectors(I, MRI);
3682   case TargetOpcode::G_JUMP_TABLE:
3683     return selectJumpTable(I, MRI);
3684   case TargetOpcode::G_MEMCPY:
3685   case TargetOpcode::G_MEMCPY_INLINE:
3686   case TargetOpcode::G_MEMMOVE:
3687   case TargetOpcode::G_MEMSET:
3688     assert(STI.hasMOPS() && "Shouldn't get here without +mops feature");
3689     return selectMOPS(I, MRI);
3690   }
3691 
3692   return false;
3693 }
3694 
selectAndRestoreState(MachineInstr & I)3695 bool AArch64InstructionSelector::selectAndRestoreState(MachineInstr &I) {
3696   MachineIRBuilderState OldMIBState = MIB.getState();
3697   bool Success = select(I);
3698   MIB.setState(OldMIBState);
3699   return Success;
3700 }
3701 
selectMOPS(MachineInstr & GI,MachineRegisterInfo & MRI)3702 bool AArch64InstructionSelector::selectMOPS(MachineInstr &GI,
3703                                             MachineRegisterInfo &MRI) {
3704   unsigned Mopcode;
3705   switch (GI.getOpcode()) {
3706   case TargetOpcode::G_MEMCPY:
3707   case TargetOpcode::G_MEMCPY_INLINE:
3708     Mopcode = AArch64::MOPSMemoryCopyPseudo;
3709     break;
3710   case TargetOpcode::G_MEMMOVE:
3711     Mopcode = AArch64::MOPSMemoryMovePseudo;
3712     break;
3713   case TargetOpcode::G_MEMSET:
3714     // For tagged memset see llvm.aarch64.mops.memset.tag
3715     Mopcode = AArch64::MOPSMemorySetPseudo;
3716     break;
3717   }
3718 
3719   auto &DstPtr = GI.getOperand(0);
3720   auto &SrcOrVal = GI.getOperand(1);
3721   auto &Size = GI.getOperand(2);
3722 
3723   // Create copies of the registers that can be clobbered.
3724   const Register DstPtrCopy = MRI.cloneVirtualRegister(DstPtr.getReg());
3725   const Register SrcValCopy = MRI.cloneVirtualRegister(SrcOrVal.getReg());
3726   const Register SizeCopy = MRI.cloneVirtualRegister(Size.getReg());
3727 
3728   const bool IsSet = Mopcode == AArch64::MOPSMemorySetPseudo;
3729   const auto &SrcValRegClass =
3730       IsSet ? AArch64::GPR64RegClass : AArch64::GPR64commonRegClass;
3731 
3732   // Constrain to specific registers
3733   RBI.constrainGenericRegister(DstPtrCopy, AArch64::GPR64commonRegClass, MRI);
3734   RBI.constrainGenericRegister(SrcValCopy, SrcValRegClass, MRI);
3735   RBI.constrainGenericRegister(SizeCopy, AArch64::GPR64RegClass, MRI);
3736 
3737   MIB.buildCopy(DstPtrCopy, DstPtr);
3738   MIB.buildCopy(SrcValCopy, SrcOrVal);
3739   MIB.buildCopy(SizeCopy, Size);
3740 
3741   // New instruction uses the copied registers because it must update them.
3742   // The defs are not used since they don't exist in G_MEM*. They are still
3743   // tied.
3744   // Note: order of operands is different from G_MEMSET, G_MEMCPY, G_MEMMOVE
3745   Register DefDstPtr = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass);
3746   Register DefSize = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3747   if (IsSet) {
3748     MIB.buildInstr(Mopcode, {DefDstPtr, DefSize},
3749                    {DstPtrCopy, SizeCopy, SrcValCopy});
3750   } else {
3751     Register DefSrcPtr = MRI.createVirtualRegister(&SrcValRegClass);
3752     MIB.buildInstr(Mopcode, {DefDstPtr, DefSrcPtr, DefSize},
3753                    {DstPtrCopy, SrcValCopy, SizeCopy});
3754   }
3755 
3756   GI.eraseFromParent();
3757   return true;
3758 }
3759 
selectBrJT(MachineInstr & I,MachineRegisterInfo & MRI)3760 bool AArch64InstructionSelector::selectBrJT(MachineInstr &I,
3761                                             MachineRegisterInfo &MRI) {
3762   assert(I.getOpcode() == TargetOpcode::G_BRJT && "Expected G_BRJT");
3763   Register JTAddr = I.getOperand(0).getReg();
3764   unsigned JTI = I.getOperand(1).getIndex();
3765   Register Index = I.getOperand(2).getReg();
3766 
3767   MF->getInfo<AArch64FunctionInfo>()->setJumpTableEntryInfo(JTI, 4, nullptr);
3768 
3769   // With aarch64-jump-table-hardening, we only expand the jump table dispatch
3770   // sequence later, to guarantee the integrity of the intermediate values.
3771   if (MF->getFunction().hasFnAttribute("aarch64-jump-table-hardening")) {
3772     CodeModel::Model CM = TM.getCodeModel();
3773     if (STI.isTargetMachO()) {
3774       if (CM != CodeModel::Small && CM != CodeModel::Large)
3775         report_fatal_error("Unsupported code-model for hardened jump-table");
3776     } else {
3777       // Note that COFF support would likely also need JUMP_TABLE_DEBUG_INFO.
3778       assert(STI.isTargetELF() &&
3779              "jump table hardening only supported on MachO/ELF");
3780       if (CM != CodeModel::Small)
3781         report_fatal_error("Unsupported code-model for hardened jump-table");
3782     }
3783 
3784     MIB.buildCopy({AArch64::X16}, I.getOperand(2).getReg());
3785     MIB.buildInstr(AArch64::BR_JumpTable)
3786         .addJumpTableIndex(I.getOperand(1).getIndex());
3787     I.eraseFromParent();
3788     return true;
3789   }
3790 
3791   Register TargetReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3792   Register ScratchReg = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
3793 
3794   auto JumpTableInst = MIB.buildInstr(AArch64::JumpTableDest32,
3795                                       {TargetReg, ScratchReg}, {JTAddr, Index})
3796                            .addJumpTableIndex(JTI);
3797   // Save the jump table info.
3798   MIB.buildInstr(TargetOpcode::JUMP_TABLE_DEBUG_INFO, {},
3799                  {static_cast<int64_t>(JTI)});
3800   // Build the indirect branch.
3801   MIB.buildInstr(AArch64::BR, {}, {TargetReg});
3802   I.eraseFromParent();
3803   return constrainSelectedInstRegOperands(*JumpTableInst, TII, TRI, RBI);
3804 }
3805 
selectJumpTable(MachineInstr & I,MachineRegisterInfo & MRI)3806 bool AArch64InstructionSelector::selectJumpTable(MachineInstr &I,
3807                                                  MachineRegisterInfo &MRI) {
3808   assert(I.getOpcode() == TargetOpcode::G_JUMP_TABLE && "Expected jump table");
3809   assert(I.getOperand(1).isJTI() && "Jump table op should have a JTI!");
3810 
3811   Register DstReg = I.getOperand(0).getReg();
3812   unsigned JTI = I.getOperand(1).getIndex();
3813   // We generate a MOVaddrJT which will get expanded to an ADRP + ADD later.
3814   auto MovMI =
3815     MIB.buildInstr(AArch64::MOVaddrJT, {DstReg}, {})
3816           .addJumpTableIndex(JTI, AArch64II::MO_PAGE)
3817           .addJumpTableIndex(JTI, AArch64II::MO_NC | AArch64II::MO_PAGEOFF);
3818   I.eraseFromParent();
3819   return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI);
3820 }
3821 
selectTLSGlobalValue(MachineInstr & I,MachineRegisterInfo & MRI)3822 bool AArch64InstructionSelector::selectTLSGlobalValue(
3823     MachineInstr &I, MachineRegisterInfo &MRI) {
3824   if (!STI.isTargetMachO())
3825     return false;
3826   MachineFunction &MF = *I.getParent()->getParent();
3827   MF.getFrameInfo().setAdjustsStack(true);
3828 
3829   const auto &GlobalOp = I.getOperand(1);
3830   assert(GlobalOp.getOffset() == 0 &&
3831          "Shouldn't have an offset on TLS globals!");
3832   const GlobalValue &GV = *GlobalOp.getGlobal();
3833 
3834   auto LoadGOT =
3835       MIB.buildInstr(AArch64::LOADgot, {&AArch64::GPR64commonRegClass}, {})
3836           .addGlobalAddress(&GV, 0, AArch64II::MO_TLS);
3837 
3838   auto Load = MIB.buildInstr(AArch64::LDRXui, {&AArch64::GPR64commonRegClass},
3839                              {LoadGOT.getReg(0)})
3840                   .addImm(0);
3841 
3842   MIB.buildCopy(Register(AArch64::X0), LoadGOT.getReg(0));
3843   // TLS calls preserve all registers except those that absolutely must be
3844   // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
3845   // silly).
3846   unsigned Opcode = getBLRCallOpcode(MF);
3847 
3848   // With ptrauth-calls, the tlv access thunk pointer is authenticated (IA, 0).
3849   if (MF.getFunction().hasFnAttribute("ptrauth-calls")) {
3850     assert(Opcode == AArch64::BLR);
3851     Opcode = AArch64::BLRAAZ;
3852   }
3853 
3854   MIB.buildInstr(Opcode, {}, {Load})
3855       .addUse(AArch64::X0, RegState::Implicit)
3856       .addDef(AArch64::X0, RegState::Implicit)
3857       .addRegMask(TRI.getTLSCallPreservedMask());
3858 
3859   MIB.buildCopy(I.getOperand(0).getReg(), Register(AArch64::X0));
3860   RBI.constrainGenericRegister(I.getOperand(0).getReg(), AArch64::GPR64RegClass,
3861                                MRI);
3862   I.eraseFromParent();
3863   return true;
3864 }
3865 
emitScalarToVector(unsigned EltSize,const TargetRegisterClass * DstRC,Register Scalar,MachineIRBuilder & MIRBuilder) const3866 MachineInstr *AArch64InstructionSelector::emitScalarToVector(
3867     unsigned EltSize, const TargetRegisterClass *DstRC, Register Scalar,
3868     MachineIRBuilder &MIRBuilder) const {
3869   auto Undef = MIRBuilder.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstRC}, {});
3870 
3871   auto BuildFn = [&](unsigned SubregIndex) {
3872     auto Ins =
3873         MIRBuilder
3874             .buildInstr(TargetOpcode::INSERT_SUBREG, {DstRC}, {Undef, Scalar})
3875             .addImm(SubregIndex);
3876     constrainSelectedInstRegOperands(*Undef, TII, TRI, RBI);
3877     constrainSelectedInstRegOperands(*Ins, TII, TRI, RBI);
3878     return &*Ins;
3879   };
3880 
3881   switch (EltSize) {
3882   case 8:
3883     return BuildFn(AArch64::bsub);
3884   case 16:
3885     return BuildFn(AArch64::hsub);
3886   case 32:
3887     return BuildFn(AArch64::ssub);
3888   case 64:
3889     return BuildFn(AArch64::dsub);
3890   default:
3891     return nullptr;
3892   }
3893 }
3894 
3895 MachineInstr *
emitNarrowVector(Register DstReg,Register SrcReg,MachineIRBuilder & MIB,MachineRegisterInfo & MRI) const3896 AArch64InstructionSelector::emitNarrowVector(Register DstReg, Register SrcReg,
3897                                              MachineIRBuilder &MIB,
3898                                              MachineRegisterInfo &MRI) const {
3899   LLT DstTy = MRI.getType(DstReg);
3900   const TargetRegisterClass *RC =
3901       getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(SrcReg, MRI, TRI));
3902   if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
3903     LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
3904     return nullptr;
3905   }
3906   unsigned SubReg = 0;
3907   if (!getSubRegForClass(RC, TRI, SubReg))
3908     return nullptr;
3909   if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
3910     LLVM_DEBUG(dbgs() << "Unsupported destination size! ("
3911                       << DstTy.getSizeInBits() << "\n");
3912     return nullptr;
3913   }
3914   auto Copy = MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
3915                   .addReg(SrcReg, 0, SubReg);
3916   RBI.constrainGenericRegister(DstReg, *RC, MRI);
3917   return Copy;
3918 }
3919 
selectMergeValues(MachineInstr & I,MachineRegisterInfo & MRI)3920 bool AArch64InstructionSelector::selectMergeValues(
3921     MachineInstr &I, MachineRegisterInfo &MRI) {
3922   assert(I.getOpcode() == TargetOpcode::G_MERGE_VALUES && "unexpected opcode");
3923   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
3924   const LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
3925   assert(!DstTy.isVector() && !SrcTy.isVector() && "invalid merge operation");
3926   const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
3927 
3928   if (I.getNumOperands() != 3)
3929     return false;
3930 
3931   // Merging 2 s64s into an s128.
3932   if (DstTy == LLT::scalar(128)) {
3933     if (SrcTy.getSizeInBits() != 64)
3934       return false;
3935     Register DstReg = I.getOperand(0).getReg();
3936     Register Src1Reg = I.getOperand(1).getReg();
3937     Register Src2Reg = I.getOperand(2).getReg();
3938     auto Tmp = MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstTy}, {});
3939     MachineInstr *InsMI = emitLaneInsert(std::nullopt, Tmp.getReg(0), Src1Reg,
3940                                          /* LaneIdx */ 0, RB, MIB);
3941     if (!InsMI)
3942       return false;
3943     MachineInstr *Ins2MI = emitLaneInsert(DstReg, InsMI->getOperand(0).getReg(),
3944                                           Src2Reg, /* LaneIdx */ 1, RB, MIB);
3945     if (!Ins2MI)
3946       return false;
3947     constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI);
3948     constrainSelectedInstRegOperands(*Ins2MI, TII, TRI, RBI);
3949     I.eraseFromParent();
3950     return true;
3951   }
3952 
3953   if (RB.getID() != AArch64::GPRRegBankID)
3954     return false;
3955 
3956   if (DstTy.getSizeInBits() != 64 || SrcTy.getSizeInBits() != 32)
3957     return false;
3958 
3959   auto *DstRC = &AArch64::GPR64RegClass;
3960   Register SubToRegDef = MRI.createVirtualRegister(DstRC);
3961   MachineInstr &SubRegMI = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
3962                                     TII.get(TargetOpcode::SUBREG_TO_REG))
3963                                 .addDef(SubToRegDef)
3964                                 .addImm(0)
3965                                 .addUse(I.getOperand(1).getReg())
3966                                 .addImm(AArch64::sub_32);
3967   Register SubToRegDef2 = MRI.createVirtualRegister(DstRC);
3968   // Need to anyext the second scalar before we can use bfm
3969   MachineInstr &SubRegMI2 = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
3970                                     TII.get(TargetOpcode::SUBREG_TO_REG))
3971                                 .addDef(SubToRegDef2)
3972                                 .addImm(0)
3973                                 .addUse(I.getOperand(2).getReg())
3974                                 .addImm(AArch64::sub_32);
3975   MachineInstr &BFM =
3976       *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::BFMXri))
3977            .addDef(I.getOperand(0).getReg())
3978            .addUse(SubToRegDef)
3979            .addUse(SubToRegDef2)
3980            .addImm(32)
3981            .addImm(31);
3982   constrainSelectedInstRegOperands(SubRegMI, TII, TRI, RBI);
3983   constrainSelectedInstRegOperands(SubRegMI2, TII, TRI, RBI);
3984   constrainSelectedInstRegOperands(BFM, TII, TRI, RBI);
3985   I.eraseFromParent();
3986   return true;
3987 }
3988 
getLaneCopyOpcode(unsigned & CopyOpc,unsigned & ExtractSubReg,const unsigned EltSize)3989 static bool getLaneCopyOpcode(unsigned &CopyOpc, unsigned &ExtractSubReg,
3990                               const unsigned EltSize) {
3991   // Choose a lane copy opcode and subregister based off of the size of the
3992   // vector's elements.
3993   switch (EltSize) {
3994   case 8:
3995     CopyOpc = AArch64::DUPi8;
3996     ExtractSubReg = AArch64::bsub;
3997     break;
3998   case 16:
3999     CopyOpc = AArch64::DUPi16;
4000     ExtractSubReg = AArch64::hsub;
4001     break;
4002   case 32:
4003     CopyOpc = AArch64::DUPi32;
4004     ExtractSubReg = AArch64::ssub;
4005     break;
4006   case 64:
4007     CopyOpc = AArch64::DUPi64;
4008     ExtractSubReg = AArch64::dsub;
4009     break;
4010   default:
4011     // Unknown size, bail out.
4012     LLVM_DEBUG(dbgs() << "Elt size '" << EltSize << "' unsupported.\n");
4013     return false;
4014   }
4015   return true;
4016 }
4017 
emitExtractVectorElt(std::optional<Register> DstReg,const RegisterBank & DstRB,LLT ScalarTy,Register VecReg,unsigned LaneIdx,MachineIRBuilder & MIRBuilder) const4018 MachineInstr *AArch64InstructionSelector::emitExtractVectorElt(
4019     std::optional<Register> DstReg, const RegisterBank &DstRB, LLT ScalarTy,
4020     Register VecReg, unsigned LaneIdx, MachineIRBuilder &MIRBuilder) const {
4021   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4022   unsigned CopyOpc = 0;
4023   unsigned ExtractSubReg = 0;
4024   if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, ScalarTy.getSizeInBits())) {
4025     LLVM_DEBUG(
4026         dbgs() << "Couldn't determine lane copy opcode for instruction.\n");
4027     return nullptr;
4028   }
4029 
4030   const TargetRegisterClass *DstRC =
4031       getRegClassForTypeOnBank(ScalarTy, DstRB, true);
4032   if (!DstRC) {
4033     LLVM_DEBUG(dbgs() << "Could not determine destination register class.\n");
4034     return nullptr;
4035   }
4036 
4037   const RegisterBank &VecRB = *RBI.getRegBank(VecReg, MRI, TRI);
4038   const LLT &VecTy = MRI.getType(VecReg);
4039   const TargetRegisterClass *VecRC =
4040       getRegClassForTypeOnBank(VecTy, VecRB, true);
4041   if (!VecRC) {
4042     LLVM_DEBUG(dbgs() << "Could not determine source register class.\n");
4043     return nullptr;
4044   }
4045 
4046   // The register that we're going to copy into.
4047   Register InsertReg = VecReg;
4048   if (!DstReg)
4049     DstReg = MRI.createVirtualRegister(DstRC);
4050   // If the lane index is 0, we just use a subregister COPY.
4051   if (LaneIdx == 0) {
4052     auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {*DstReg}, {})
4053                     .addReg(VecReg, 0, ExtractSubReg);
4054     RBI.constrainGenericRegister(*DstReg, *DstRC, MRI);
4055     return &*Copy;
4056   }
4057 
4058   // Lane copies require 128-bit wide registers. If we're dealing with an
4059   // unpacked vector, then we need to move up to that width. Insert an implicit
4060   // def and a subregister insert to get us there.
4061   if (VecTy.getSizeInBits() != 128) {
4062     MachineInstr *ScalarToVector = emitScalarToVector(
4063         VecTy.getSizeInBits(), &AArch64::FPR128RegClass, VecReg, MIRBuilder);
4064     if (!ScalarToVector)
4065       return nullptr;
4066     InsertReg = ScalarToVector->getOperand(0).getReg();
4067   }
4068 
4069   MachineInstr *LaneCopyMI =
4070       MIRBuilder.buildInstr(CopyOpc, {*DstReg}, {InsertReg}).addImm(LaneIdx);
4071   constrainSelectedInstRegOperands(*LaneCopyMI, TII, TRI, RBI);
4072 
4073   // Make sure that we actually constrain the initial copy.
4074   RBI.constrainGenericRegister(*DstReg, *DstRC, MRI);
4075   return LaneCopyMI;
4076 }
4077 
selectExtractElt(MachineInstr & I,MachineRegisterInfo & MRI)4078 bool AArch64InstructionSelector::selectExtractElt(
4079     MachineInstr &I, MachineRegisterInfo &MRI) {
4080   assert(I.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT &&
4081          "unexpected opcode!");
4082   Register DstReg = I.getOperand(0).getReg();
4083   const LLT NarrowTy = MRI.getType(DstReg);
4084   const Register SrcReg = I.getOperand(1).getReg();
4085   const LLT WideTy = MRI.getType(SrcReg);
4086   (void)WideTy;
4087   assert(WideTy.getSizeInBits() >= NarrowTy.getSizeInBits() &&
4088          "source register size too small!");
4089   assert(!NarrowTy.isVector() && "cannot extract vector into vector!");
4090 
4091   // Need the lane index to determine the correct copy opcode.
4092   MachineOperand &LaneIdxOp = I.getOperand(2);
4093   assert(LaneIdxOp.isReg() && "Lane index operand was not a register?");
4094 
4095   if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) {
4096     LLVM_DEBUG(dbgs() << "Cannot extract into GPR.\n");
4097     return false;
4098   }
4099 
4100   // Find the index to extract from.
4101   auto VRegAndVal = getIConstantVRegValWithLookThrough(LaneIdxOp.getReg(), MRI);
4102   if (!VRegAndVal)
4103     return false;
4104   unsigned LaneIdx = VRegAndVal->Value.getSExtValue();
4105 
4106 
4107   const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
4108   MachineInstr *Extract = emitExtractVectorElt(DstReg, DstRB, NarrowTy, SrcReg,
4109                                                LaneIdx, MIB);
4110   if (!Extract)
4111     return false;
4112 
4113   I.eraseFromParent();
4114   return true;
4115 }
4116 
selectSplitVectorUnmerge(MachineInstr & I,MachineRegisterInfo & MRI)4117 bool AArch64InstructionSelector::selectSplitVectorUnmerge(
4118     MachineInstr &I, MachineRegisterInfo &MRI) {
4119   unsigned NumElts = I.getNumOperands() - 1;
4120   Register SrcReg = I.getOperand(NumElts).getReg();
4121   const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg());
4122   const LLT SrcTy = MRI.getType(SrcReg);
4123 
4124   assert(NarrowTy.isVector() && "Expected an unmerge into vectors");
4125   if (SrcTy.getSizeInBits() > 128) {
4126     LLVM_DEBUG(dbgs() << "Unexpected vector type for vec split unmerge");
4127     return false;
4128   }
4129 
4130   // We implement a split vector operation by treating the sub-vectors as
4131   // scalars and extracting them.
4132   const RegisterBank &DstRB =
4133       *RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI);
4134   for (unsigned OpIdx = 0; OpIdx < NumElts; ++OpIdx) {
4135     Register Dst = I.getOperand(OpIdx).getReg();
4136     MachineInstr *Extract =
4137         emitExtractVectorElt(Dst, DstRB, NarrowTy, SrcReg, OpIdx, MIB);
4138     if (!Extract)
4139       return false;
4140   }
4141   I.eraseFromParent();
4142   return true;
4143 }
4144 
selectUnmergeValues(MachineInstr & I,MachineRegisterInfo & MRI)4145 bool AArch64InstructionSelector::selectUnmergeValues(MachineInstr &I,
4146                                                      MachineRegisterInfo &MRI) {
4147   assert(I.getOpcode() == TargetOpcode::G_UNMERGE_VALUES &&
4148          "unexpected opcode");
4149 
4150   // TODO: Handle unmerging into GPRs and from scalars to scalars.
4151   if (RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI)->getID() !=
4152           AArch64::FPRRegBankID ||
4153       RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() !=
4154           AArch64::FPRRegBankID) {
4155     LLVM_DEBUG(dbgs() << "Unmerging vector-to-gpr and scalar-to-scalar "
4156                          "currently unsupported.\n");
4157     return false;
4158   }
4159 
4160   // The last operand is the vector source register, and every other operand is
4161   // a register to unpack into.
4162   unsigned NumElts = I.getNumOperands() - 1;
4163   Register SrcReg = I.getOperand(NumElts).getReg();
4164   const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg());
4165   const LLT WideTy = MRI.getType(SrcReg);
4166   (void)WideTy;
4167   assert((WideTy.isVector() || WideTy.getSizeInBits() == 128) &&
4168          "can only unmerge from vector or s128 types!");
4169   assert(WideTy.getSizeInBits() > NarrowTy.getSizeInBits() &&
4170          "source register size too small!");
4171 
4172   if (!NarrowTy.isScalar())
4173     return selectSplitVectorUnmerge(I, MRI);
4174 
4175   // Choose a lane copy opcode and subregister based off of the size of the
4176   // vector's elements.
4177   unsigned CopyOpc = 0;
4178   unsigned ExtractSubReg = 0;
4179   if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, NarrowTy.getSizeInBits()))
4180     return false;
4181 
4182   // Set up for the lane copies.
4183   MachineBasicBlock &MBB = *I.getParent();
4184 
4185   // Stores the registers we'll be copying from.
4186   SmallVector<Register, 4> InsertRegs;
4187 
4188   // We'll use the first register twice, so we only need NumElts-1 registers.
4189   unsigned NumInsertRegs = NumElts - 1;
4190 
4191   // If our elements fit into exactly 128 bits, then we can copy from the source
4192   // directly. Otherwise, we need to do a bit of setup with some subregister
4193   // inserts.
4194   if (NarrowTy.getSizeInBits() * NumElts == 128) {
4195     InsertRegs = SmallVector<Register, 4>(NumInsertRegs, SrcReg);
4196   } else {
4197     // No. We have to perform subregister inserts. For each insert, create an
4198     // implicit def and a subregister insert, and save the register we create.
4199     const TargetRegisterClass *RC = getRegClassForTypeOnBank(
4200         LLT::fixed_vector(NumElts, WideTy.getScalarSizeInBits()),
4201         *RBI.getRegBank(SrcReg, MRI, TRI));
4202     unsigned SubReg = 0;
4203     bool Found = getSubRegForClass(RC, TRI, SubReg);
4204     (void)Found;
4205     assert(Found && "expected to find last operand's subeg idx");
4206     for (unsigned Idx = 0; Idx < NumInsertRegs; ++Idx) {
4207       Register ImpDefReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
4208       MachineInstr &ImpDefMI =
4209           *BuildMI(MBB, I, I.getDebugLoc(), TII.get(TargetOpcode::IMPLICIT_DEF),
4210                    ImpDefReg);
4211 
4212       // Now, create the subregister insert from SrcReg.
4213       Register InsertReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
4214       MachineInstr &InsMI =
4215           *BuildMI(MBB, I, I.getDebugLoc(),
4216                    TII.get(TargetOpcode::INSERT_SUBREG), InsertReg)
4217                .addUse(ImpDefReg)
4218                .addUse(SrcReg)
4219                .addImm(SubReg);
4220 
4221       constrainSelectedInstRegOperands(ImpDefMI, TII, TRI, RBI);
4222       constrainSelectedInstRegOperands(InsMI, TII, TRI, RBI);
4223 
4224       // Save the register so that we can copy from it after.
4225       InsertRegs.push_back(InsertReg);
4226     }
4227   }
4228 
4229   // Now that we've created any necessary subregister inserts, we can
4230   // create the copies.
4231   //
4232   // Perform the first copy separately as a subregister copy.
4233   Register CopyTo = I.getOperand(0).getReg();
4234   auto FirstCopy = MIB.buildInstr(TargetOpcode::COPY, {CopyTo}, {})
4235                        .addReg(InsertRegs[0], 0, ExtractSubReg);
4236   constrainSelectedInstRegOperands(*FirstCopy, TII, TRI, RBI);
4237 
4238   // Now, perform the remaining copies as vector lane copies.
4239   unsigned LaneIdx = 1;
4240   for (Register InsReg : InsertRegs) {
4241     Register CopyTo = I.getOperand(LaneIdx).getReg();
4242     MachineInstr &CopyInst =
4243         *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CopyOpc), CopyTo)
4244              .addUse(InsReg)
4245              .addImm(LaneIdx);
4246     constrainSelectedInstRegOperands(CopyInst, TII, TRI, RBI);
4247     ++LaneIdx;
4248   }
4249 
4250   // Separately constrain the first copy's destination. Because of the
4251   // limitation in constrainOperandRegClass, we can't guarantee that this will
4252   // actually be constrained. So, do it ourselves using the second operand.
4253   const TargetRegisterClass *RC =
4254       MRI.getRegClassOrNull(I.getOperand(1).getReg());
4255   if (!RC) {
4256     LLVM_DEBUG(dbgs() << "Couldn't constrain copy destination.\n");
4257     return false;
4258   }
4259 
4260   RBI.constrainGenericRegister(CopyTo, *RC, MRI);
4261   I.eraseFromParent();
4262   return true;
4263 }
4264 
selectConcatVectors(MachineInstr & I,MachineRegisterInfo & MRI)4265 bool AArch64InstructionSelector::selectConcatVectors(
4266     MachineInstr &I, MachineRegisterInfo &MRI)  {
4267   assert(I.getOpcode() == TargetOpcode::G_CONCAT_VECTORS &&
4268          "Unexpected opcode");
4269   Register Dst = I.getOperand(0).getReg();
4270   Register Op1 = I.getOperand(1).getReg();
4271   Register Op2 = I.getOperand(2).getReg();
4272   MachineInstr *ConcatMI = emitVectorConcat(Dst, Op1, Op2, MIB);
4273   if (!ConcatMI)
4274     return false;
4275   I.eraseFromParent();
4276   return true;
4277 }
4278 
4279 unsigned
emitConstantPoolEntry(const Constant * CPVal,MachineFunction & MF) const4280 AArch64InstructionSelector::emitConstantPoolEntry(const Constant *CPVal,
4281                                                   MachineFunction &MF) const {
4282   Type *CPTy = CPVal->getType();
4283   Align Alignment = MF.getDataLayout().getPrefTypeAlign(CPTy);
4284 
4285   MachineConstantPool *MCP = MF.getConstantPool();
4286   return MCP->getConstantPoolIndex(CPVal, Alignment);
4287 }
4288 
emitLoadFromConstantPool(const Constant * CPVal,MachineIRBuilder & MIRBuilder) const4289 MachineInstr *AArch64InstructionSelector::emitLoadFromConstantPool(
4290     const Constant *CPVal, MachineIRBuilder &MIRBuilder) const {
4291   const TargetRegisterClass *RC;
4292   unsigned Opc;
4293   bool IsTiny = TM.getCodeModel() == CodeModel::Tiny;
4294   unsigned Size = MIRBuilder.getDataLayout().getTypeStoreSize(CPVal->getType());
4295   switch (Size) {
4296   case 16:
4297     RC = &AArch64::FPR128RegClass;
4298     Opc = IsTiny ? AArch64::LDRQl : AArch64::LDRQui;
4299     break;
4300   case 8:
4301     RC = &AArch64::FPR64RegClass;
4302     Opc = IsTiny ? AArch64::LDRDl : AArch64::LDRDui;
4303     break;
4304   case 4:
4305     RC = &AArch64::FPR32RegClass;
4306     Opc = IsTiny ? AArch64::LDRSl : AArch64::LDRSui;
4307     break;
4308   case 2:
4309     RC = &AArch64::FPR16RegClass;
4310     Opc = AArch64::LDRHui;
4311     break;
4312   default:
4313     LLVM_DEBUG(dbgs() << "Could not load from constant pool of type "
4314                       << *CPVal->getType());
4315     return nullptr;
4316   }
4317 
4318   MachineInstr *LoadMI = nullptr;
4319   auto &MF = MIRBuilder.getMF();
4320   unsigned CPIdx = emitConstantPoolEntry(CPVal, MF);
4321   if (IsTiny && (Size == 16 || Size == 8 || Size == 4)) {
4322     // Use load(literal) for tiny code model.
4323     LoadMI = &*MIRBuilder.buildInstr(Opc, {RC}, {}).addConstantPoolIndex(CPIdx);
4324   } else {
4325     auto Adrp =
4326         MIRBuilder.buildInstr(AArch64::ADRP, {&AArch64::GPR64RegClass}, {})
4327             .addConstantPoolIndex(CPIdx, 0, AArch64II::MO_PAGE);
4328 
4329     LoadMI = &*MIRBuilder.buildInstr(Opc, {RC}, {Adrp})
4330                    .addConstantPoolIndex(
4331                        CPIdx, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
4332 
4333     constrainSelectedInstRegOperands(*Adrp, TII, TRI, RBI);
4334   }
4335 
4336   MachinePointerInfo PtrInfo = MachinePointerInfo::getConstantPool(MF);
4337   LoadMI->addMemOperand(MF, MF.getMachineMemOperand(PtrInfo,
4338                                                     MachineMemOperand::MOLoad,
4339                                                     Size, Align(Size)));
4340   constrainSelectedInstRegOperands(*LoadMI, TII, TRI, RBI);
4341   return LoadMI;
4342 }
4343 
4344 /// Return an <Opcode, SubregIndex> pair to do an vector elt insert of a given
4345 /// size and RB.
4346 static std::pair<unsigned, unsigned>
getInsertVecEltOpInfo(const RegisterBank & RB,unsigned EltSize)4347 getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) {
4348   unsigned Opc, SubregIdx;
4349   if (RB.getID() == AArch64::GPRRegBankID) {
4350     if (EltSize == 8) {
4351       Opc = AArch64::INSvi8gpr;
4352       SubregIdx = AArch64::bsub;
4353     } else if (EltSize == 16) {
4354       Opc = AArch64::INSvi16gpr;
4355       SubregIdx = AArch64::ssub;
4356     } else if (EltSize == 32) {
4357       Opc = AArch64::INSvi32gpr;
4358       SubregIdx = AArch64::ssub;
4359     } else if (EltSize == 64) {
4360       Opc = AArch64::INSvi64gpr;
4361       SubregIdx = AArch64::dsub;
4362     } else {
4363       llvm_unreachable("invalid elt size!");
4364     }
4365   } else {
4366     if (EltSize == 8) {
4367       Opc = AArch64::INSvi8lane;
4368       SubregIdx = AArch64::bsub;
4369     } else if (EltSize == 16) {
4370       Opc = AArch64::INSvi16lane;
4371       SubregIdx = AArch64::hsub;
4372     } else if (EltSize == 32) {
4373       Opc = AArch64::INSvi32lane;
4374       SubregIdx = AArch64::ssub;
4375     } else if (EltSize == 64) {
4376       Opc = AArch64::INSvi64lane;
4377       SubregIdx = AArch64::dsub;
4378     } else {
4379       llvm_unreachable("invalid elt size!");
4380     }
4381   }
4382   return std::make_pair(Opc, SubregIdx);
4383 }
4384 
emitInstr(unsigned Opcode,std::initializer_list<llvm::DstOp> DstOps,std::initializer_list<llvm::SrcOp> SrcOps,MachineIRBuilder & MIRBuilder,const ComplexRendererFns & RenderFns) const4385 MachineInstr *AArch64InstructionSelector::emitInstr(
4386     unsigned Opcode, std::initializer_list<llvm::DstOp> DstOps,
4387     std::initializer_list<llvm::SrcOp> SrcOps, MachineIRBuilder &MIRBuilder,
4388     const ComplexRendererFns &RenderFns) const {
4389   assert(Opcode && "Expected an opcode?");
4390   assert(!isPreISelGenericOpcode(Opcode) &&
4391          "Function should only be used to produce selected instructions!");
4392   auto MI = MIRBuilder.buildInstr(Opcode, DstOps, SrcOps);
4393   if (RenderFns)
4394     for (auto &Fn : *RenderFns)
4395       Fn(MI);
4396   constrainSelectedInstRegOperands(*MI, TII, TRI, RBI);
4397   return &*MI;
4398 }
4399 
emitAddSub(const std::array<std::array<unsigned,2>,5> & AddrModeAndSizeToOpcode,Register Dst,MachineOperand & LHS,MachineOperand & RHS,MachineIRBuilder & MIRBuilder) const4400 MachineInstr *AArch64InstructionSelector::emitAddSub(
4401     const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode,
4402     Register Dst, MachineOperand &LHS, MachineOperand &RHS,
4403     MachineIRBuilder &MIRBuilder) const {
4404   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4405   assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4406   auto Ty = MRI.getType(LHS.getReg());
4407   assert(!Ty.isVector() && "Expected a scalar or pointer?");
4408   unsigned Size = Ty.getSizeInBits();
4409   assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit type only");
4410   bool Is32Bit = Size == 32;
4411 
4412   // INSTRri form with positive arithmetic immediate.
4413   if (auto Fns = selectArithImmed(RHS))
4414     return emitInstr(AddrModeAndSizeToOpcode[0][Is32Bit], {Dst}, {LHS},
4415                      MIRBuilder, Fns);
4416 
4417   // INSTRri form with negative arithmetic immediate.
4418   if (auto Fns = selectNegArithImmed(RHS))
4419     return emitInstr(AddrModeAndSizeToOpcode[3][Is32Bit], {Dst}, {LHS},
4420                      MIRBuilder, Fns);
4421 
4422   // INSTRrx form.
4423   if (auto Fns = selectArithExtendedRegister(RHS))
4424     return emitInstr(AddrModeAndSizeToOpcode[4][Is32Bit], {Dst}, {LHS},
4425                      MIRBuilder, Fns);
4426 
4427   // INSTRrs form.
4428   if (auto Fns = selectShiftedRegister(RHS))
4429     return emitInstr(AddrModeAndSizeToOpcode[1][Is32Bit], {Dst}, {LHS},
4430                      MIRBuilder, Fns);
4431   return emitInstr(AddrModeAndSizeToOpcode[2][Is32Bit], {Dst}, {LHS, RHS},
4432                    MIRBuilder);
4433 }
4434 
4435 MachineInstr *
emitADD(Register DefReg,MachineOperand & LHS,MachineOperand & RHS,MachineIRBuilder & MIRBuilder) const4436 AArch64InstructionSelector::emitADD(Register DefReg, MachineOperand &LHS,
4437                                     MachineOperand &RHS,
4438                                     MachineIRBuilder &MIRBuilder) const {
4439   const std::array<std::array<unsigned, 2>, 5> OpcTable{
4440       {{AArch64::ADDXri, AArch64::ADDWri},
4441        {AArch64::ADDXrs, AArch64::ADDWrs},
4442        {AArch64::ADDXrr, AArch64::ADDWrr},
4443        {AArch64::SUBXri, AArch64::SUBWri},
4444        {AArch64::ADDXrx, AArch64::ADDWrx}}};
4445   return emitAddSub(OpcTable, DefReg, LHS, RHS, MIRBuilder);
4446 }
4447 
4448 MachineInstr *
emitADDS(Register Dst,MachineOperand & LHS,MachineOperand & RHS,MachineIRBuilder & MIRBuilder) const4449 AArch64InstructionSelector::emitADDS(Register Dst, MachineOperand &LHS,
4450                                      MachineOperand &RHS,
4451                                      MachineIRBuilder &MIRBuilder) const {
4452   const std::array<std::array<unsigned, 2>, 5> OpcTable{
4453       {{AArch64::ADDSXri, AArch64::ADDSWri},
4454        {AArch64::ADDSXrs, AArch64::ADDSWrs},
4455        {AArch64::ADDSXrr, AArch64::ADDSWrr},
4456        {AArch64::SUBSXri, AArch64::SUBSWri},
4457        {AArch64::ADDSXrx, AArch64::ADDSWrx}}};
4458   return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder);
4459 }
4460 
4461 MachineInstr *
emitSUBS(Register Dst,MachineOperand & LHS,MachineOperand & RHS,MachineIRBuilder & MIRBuilder) const4462 AArch64InstructionSelector::emitSUBS(Register Dst, MachineOperand &LHS,
4463                                      MachineOperand &RHS,
4464                                      MachineIRBuilder &MIRBuilder) const {
4465   const std::array<std::array<unsigned, 2>, 5> OpcTable{
4466       {{AArch64::SUBSXri, AArch64::SUBSWri},
4467        {AArch64::SUBSXrs, AArch64::SUBSWrs},
4468        {AArch64::SUBSXrr, AArch64::SUBSWrr},
4469        {AArch64::ADDSXri, AArch64::ADDSWri},
4470        {AArch64::SUBSXrx, AArch64::SUBSWrx}}};
4471   return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder);
4472 }
4473 
4474 MachineInstr *
emitADCS(Register Dst,MachineOperand & LHS,MachineOperand & RHS,MachineIRBuilder & MIRBuilder) const4475 AArch64InstructionSelector::emitADCS(Register Dst, MachineOperand &LHS,
4476                                      MachineOperand &RHS,
4477                                      MachineIRBuilder &MIRBuilder) const {
4478   assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4479   MachineRegisterInfo *MRI = MIRBuilder.getMRI();
4480   bool Is32Bit = (MRI->getType(LHS.getReg()).getSizeInBits() == 32);
4481   static const unsigned OpcTable[2] = {AArch64::ADCSXr, AArch64::ADCSWr};
4482   return emitInstr(OpcTable[Is32Bit], {Dst}, {LHS, RHS}, MIRBuilder);
4483 }
4484 
4485 MachineInstr *
emitSBCS(Register Dst,MachineOperand & LHS,MachineOperand & RHS,MachineIRBuilder & MIRBuilder) const4486 AArch64InstructionSelector::emitSBCS(Register Dst, MachineOperand &LHS,
4487                                      MachineOperand &RHS,
4488                                      MachineIRBuilder &MIRBuilder) const {
4489   assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4490   MachineRegisterInfo *MRI = MIRBuilder.getMRI();
4491   bool Is32Bit = (MRI->getType(LHS.getReg()).getSizeInBits() == 32);
4492   static const unsigned OpcTable[2] = {AArch64::SBCSXr, AArch64::SBCSWr};
4493   return emitInstr(OpcTable[Is32Bit], {Dst}, {LHS, RHS}, MIRBuilder);
4494 }
4495 
4496 MachineInstr *
emitCMN(MachineOperand & LHS,MachineOperand & RHS,MachineIRBuilder & MIRBuilder) const4497 AArch64InstructionSelector::emitCMN(MachineOperand &LHS, MachineOperand &RHS,
4498                                     MachineIRBuilder &MIRBuilder) const {
4499   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4500   bool Is32Bit = (MRI.getType(LHS.getReg()).getSizeInBits() == 32);
4501   auto RC = Is32Bit ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass;
4502   return emitADDS(MRI.createVirtualRegister(RC), LHS, RHS, MIRBuilder);
4503 }
4504 
4505 MachineInstr *
emitTST(MachineOperand & LHS,MachineOperand & RHS,MachineIRBuilder & MIRBuilder) const4506 AArch64InstructionSelector::emitTST(MachineOperand &LHS, MachineOperand &RHS,
4507                                     MachineIRBuilder &MIRBuilder) const {
4508   assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4509   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4510   LLT Ty = MRI.getType(LHS.getReg());
4511   unsigned RegSize = Ty.getSizeInBits();
4512   bool Is32Bit = (RegSize == 32);
4513   const unsigned OpcTable[3][2] = {{AArch64::ANDSXri, AArch64::ANDSWri},
4514                                    {AArch64::ANDSXrs, AArch64::ANDSWrs},
4515                                    {AArch64::ANDSXrr, AArch64::ANDSWrr}};
4516   // ANDS needs a logical immediate for its immediate form. Check if we can
4517   // fold one in.
4518   if (auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS.getReg(), MRI)) {
4519     int64_t Imm = ValAndVReg->Value.getSExtValue();
4520 
4521     if (AArch64_AM::isLogicalImmediate(Imm, RegSize)) {
4522       auto TstMI = MIRBuilder.buildInstr(OpcTable[0][Is32Bit], {Ty}, {LHS});
4523       TstMI.addImm(AArch64_AM::encodeLogicalImmediate(Imm, RegSize));
4524       constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
4525       return &*TstMI;
4526     }
4527   }
4528 
4529   if (auto Fns = selectLogicalShiftedRegister(RHS))
4530     return emitInstr(OpcTable[1][Is32Bit], {Ty}, {LHS}, MIRBuilder, Fns);
4531   return emitInstr(OpcTable[2][Is32Bit], {Ty}, {LHS, RHS}, MIRBuilder);
4532 }
4533 
emitIntegerCompare(MachineOperand & LHS,MachineOperand & RHS,MachineOperand & Predicate,MachineIRBuilder & MIRBuilder) const4534 MachineInstr *AArch64InstructionSelector::emitIntegerCompare(
4535     MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
4536     MachineIRBuilder &MIRBuilder) const {
4537   assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!");
4538   assert(Predicate.isPredicate() && "Expected predicate?");
4539   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4540   LLT CmpTy = MRI.getType(LHS.getReg());
4541   assert(!CmpTy.isVector() && "Expected scalar or pointer");
4542   unsigned Size = CmpTy.getSizeInBits();
4543   (void)Size;
4544   assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit LHS/RHS?");
4545   // Fold the compare into a cmn or tst if possible.
4546   if (auto FoldCmp = tryFoldIntegerCompare(LHS, RHS, Predicate, MIRBuilder))
4547     return FoldCmp;
4548   auto Dst = MRI.cloneVirtualRegister(LHS.getReg());
4549   return emitSUBS(Dst, LHS, RHS, MIRBuilder);
4550 }
4551 
emitCSetForFCmp(Register Dst,CmpInst::Predicate Pred,MachineIRBuilder & MIRBuilder) const4552 MachineInstr *AArch64InstructionSelector::emitCSetForFCmp(
4553     Register Dst, CmpInst::Predicate Pred, MachineIRBuilder &MIRBuilder) const {
4554   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4555 #ifndef NDEBUG
4556   LLT Ty = MRI.getType(Dst);
4557   assert(!Ty.isVector() && Ty.getSizeInBits() == 32 &&
4558          "Expected a 32-bit scalar register?");
4559 #endif
4560   const Register ZReg = AArch64::WZR;
4561   AArch64CC::CondCode CC1, CC2;
4562   changeFCMPPredToAArch64CC(Pred, CC1, CC2);
4563   auto InvCC1 = AArch64CC::getInvertedCondCode(CC1);
4564   if (CC2 == AArch64CC::AL)
4565     return emitCSINC(/*Dst=*/Dst, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC1,
4566                      MIRBuilder);
4567   const TargetRegisterClass *RC = &AArch64::GPR32RegClass;
4568   Register Def1Reg = MRI.createVirtualRegister(RC);
4569   Register Def2Reg = MRI.createVirtualRegister(RC);
4570   auto InvCC2 = AArch64CC::getInvertedCondCode(CC2);
4571   emitCSINC(/*Dst=*/Def1Reg, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC1, MIRBuilder);
4572   emitCSINC(/*Dst=*/Def2Reg, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC2, MIRBuilder);
4573   auto OrMI = MIRBuilder.buildInstr(AArch64::ORRWrr, {Dst}, {Def1Reg, Def2Reg});
4574   constrainSelectedInstRegOperands(*OrMI, TII, TRI, RBI);
4575   return &*OrMI;
4576 }
4577 
emitFPCompare(Register LHS,Register RHS,MachineIRBuilder & MIRBuilder,std::optional<CmpInst::Predicate> Pred) const4578 MachineInstr *AArch64InstructionSelector::emitFPCompare(
4579     Register LHS, Register RHS, MachineIRBuilder &MIRBuilder,
4580     std::optional<CmpInst::Predicate> Pred) const {
4581   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4582   LLT Ty = MRI.getType(LHS);
4583   if (Ty.isVector())
4584     return nullptr;
4585   unsigned OpSize = Ty.getSizeInBits();
4586   assert(OpSize == 16 || OpSize == 32 || OpSize == 64);
4587 
4588   // If this is a compare against +0.0, then we don't have
4589   // to explicitly materialize a constant.
4590   const ConstantFP *FPImm = getConstantFPVRegVal(RHS, MRI);
4591   bool ShouldUseImm = FPImm && (FPImm->isZero() && !FPImm->isNegative());
4592 
4593   auto IsEqualityPred = [](CmpInst::Predicate P) {
4594     return P == CmpInst::FCMP_OEQ || P == CmpInst::FCMP_ONE ||
4595            P == CmpInst::FCMP_UEQ || P == CmpInst::FCMP_UNE;
4596   };
4597   if (!ShouldUseImm && Pred && IsEqualityPred(*Pred)) {
4598     // Try commutating the operands.
4599     const ConstantFP *LHSImm = getConstantFPVRegVal(LHS, MRI);
4600     if (LHSImm && (LHSImm->isZero() && !LHSImm->isNegative())) {
4601       ShouldUseImm = true;
4602       std::swap(LHS, RHS);
4603     }
4604   }
4605   unsigned CmpOpcTbl[2][3] = {
4606       {AArch64::FCMPHrr, AArch64::FCMPSrr, AArch64::FCMPDrr},
4607       {AArch64::FCMPHri, AArch64::FCMPSri, AArch64::FCMPDri}};
4608   unsigned CmpOpc =
4609       CmpOpcTbl[ShouldUseImm][OpSize == 16 ? 0 : (OpSize == 32 ? 1 : 2)];
4610 
4611   // Partially build the compare. Decide if we need to add a use for the
4612   // third operand based off whether or not we're comparing against 0.0.
4613   auto CmpMI = MIRBuilder.buildInstr(CmpOpc).addUse(LHS);
4614   CmpMI.setMIFlags(MachineInstr::NoFPExcept);
4615   if (!ShouldUseImm)
4616     CmpMI.addUse(RHS);
4617   constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI);
4618   return &*CmpMI;
4619 }
4620 
emitVectorConcat(std::optional<Register> Dst,Register Op1,Register Op2,MachineIRBuilder & MIRBuilder) const4621 MachineInstr *AArch64InstructionSelector::emitVectorConcat(
4622     std::optional<Register> Dst, Register Op1, Register Op2,
4623     MachineIRBuilder &MIRBuilder) const {
4624   // We implement a vector concat by:
4625   // 1. Use scalar_to_vector to insert the lower vector into the larger dest
4626   // 2. Insert the upper vector into the destination's upper element
4627   // TODO: some of this code is common with G_BUILD_VECTOR handling.
4628   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4629 
4630   const LLT Op1Ty = MRI.getType(Op1);
4631   const LLT Op2Ty = MRI.getType(Op2);
4632 
4633   if (Op1Ty != Op2Ty) {
4634     LLVM_DEBUG(dbgs() << "Could not do vector concat of differing vector tys");
4635     return nullptr;
4636   }
4637   assert(Op1Ty.isVector() && "Expected a vector for vector concat");
4638 
4639   if (Op1Ty.getSizeInBits() >= 128) {
4640     LLVM_DEBUG(dbgs() << "Vector concat not supported for full size vectors");
4641     return nullptr;
4642   }
4643 
4644   // At the moment we just support 64 bit vector concats.
4645   if (Op1Ty.getSizeInBits() != 64) {
4646     LLVM_DEBUG(dbgs() << "Vector concat supported for 64b vectors");
4647     return nullptr;
4648   }
4649 
4650   const LLT ScalarTy = LLT::scalar(Op1Ty.getSizeInBits());
4651   const RegisterBank &FPRBank = *RBI.getRegBank(Op1, MRI, TRI);
4652   const TargetRegisterClass *DstRC =
4653       getRegClassForTypeOnBank(Op1Ty.multiplyElements(2), FPRBank);
4654 
4655   MachineInstr *WidenedOp1 =
4656       emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op1, MIRBuilder);
4657   MachineInstr *WidenedOp2 =
4658       emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op2, MIRBuilder);
4659   if (!WidenedOp1 || !WidenedOp2) {
4660     LLVM_DEBUG(dbgs() << "Could not emit a vector from scalar value");
4661     return nullptr;
4662   }
4663 
4664   // Now do the insert of the upper element.
4665   unsigned InsertOpc, InsSubRegIdx;
4666   std::tie(InsertOpc, InsSubRegIdx) =
4667       getInsertVecEltOpInfo(FPRBank, ScalarTy.getSizeInBits());
4668 
4669   if (!Dst)
4670     Dst = MRI.createVirtualRegister(DstRC);
4671   auto InsElt =
4672       MIRBuilder
4673           .buildInstr(InsertOpc, {*Dst}, {WidenedOp1->getOperand(0).getReg()})
4674           .addImm(1) /* Lane index */
4675           .addUse(WidenedOp2->getOperand(0).getReg())
4676           .addImm(0);
4677   constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI);
4678   return &*InsElt;
4679 }
4680 
4681 MachineInstr *
emitCSINC(Register Dst,Register Src1,Register Src2,AArch64CC::CondCode Pred,MachineIRBuilder & MIRBuilder) const4682 AArch64InstructionSelector::emitCSINC(Register Dst, Register Src1,
4683                                       Register Src2, AArch64CC::CondCode Pred,
4684                                       MachineIRBuilder &MIRBuilder) const {
4685   auto &MRI = *MIRBuilder.getMRI();
4686   const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Dst);
4687   // If we used a register class, then this won't necessarily have an LLT.
4688   // Compute the size based off whether or not we have a class or bank.
4689   unsigned Size;
4690   if (const auto *RC = dyn_cast<const TargetRegisterClass *>(RegClassOrBank))
4691     Size = TRI.getRegSizeInBits(*RC);
4692   else
4693     Size = MRI.getType(Dst).getSizeInBits();
4694   // Some opcodes use s1.
4695   assert(Size <= 64 && "Expected 64 bits or less only!");
4696   static const unsigned OpcTable[2] = {AArch64::CSINCWr, AArch64::CSINCXr};
4697   unsigned Opc = OpcTable[Size == 64];
4698   auto CSINC = MIRBuilder.buildInstr(Opc, {Dst}, {Src1, Src2}).addImm(Pred);
4699   constrainSelectedInstRegOperands(*CSINC, TII, TRI, RBI);
4700   return &*CSINC;
4701 }
4702 
emitCarryIn(MachineInstr & I,Register CarryReg)4703 MachineInstr *AArch64InstructionSelector::emitCarryIn(MachineInstr &I,
4704                                                       Register CarryReg) {
4705   MachineRegisterInfo *MRI = MIB.getMRI();
4706   unsigned Opcode = I.getOpcode();
4707 
4708   // If the instruction is a SUB, we need to negate the carry,
4709   // because borrowing is indicated by carry-flag == 0.
4710   bool NeedsNegatedCarry =
4711       (Opcode == TargetOpcode::G_USUBE || Opcode == TargetOpcode::G_SSUBE);
4712 
4713   // If the previous instruction will already produce the correct carry, do not
4714   // emit a carry generating instruction. E.g. for G_UADDE/G_USUBE sequences
4715   // generated during legalization of wide add/sub. This optimization depends on
4716   // these sequences not being interrupted by other instructions.
4717   // We have to select the previous instruction before the carry-using
4718   // instruction is deleted by the calling function, otherwise the previous
4719   // instruction might become dead and would get deleted.
4720   MachineInstr *SrcMI = MRI->getVRegDef(CarryReg);
4721   if (SrcMI == I.getPrevNode()) {
4722     if (auto *CarrySrcMI = dyn_cast<GAddSubCarryOut>(SrcMI)) {
4723       bool ProducesNegatedCarry = CarrySrcMI->isSub();
4724       if (NeedsNegatedCarry == ProducesNegatedCarry &&
4725           CarrySrcMI->isUnsigned() &&
4726           CarrySrcMI->getCarryOutReg() == CarryReg &&
4727           selectAndRestoreState(*SrcMI))
4728         return nullptr;
4729     }
4730   }
4731 
4732   Register DeadReg = MRI->createVirtualRegister(&AArch64::GPR32RegClass);
4733 
4734   if (NeedsNegatedCarry) {
4735     // (0 - Carry) sets !C in NZCV when Carry == 1
4736     Register ZReg = AArch64::WZR;
4737     return emitInstr(AArch64::SUBSWrr, {DeadReg}, {ZReg, CarryReg}, MIB);
4738   }
4739 
4740   // (Carry - 1) sets !C in NZCV when Carry == 0
4741   auto Fns = select12BitValueWithLeftShift(1);
4742   return emitInstr(AArch64::SUBSWri, {DeadReg}, {CarryReg}, MIB, Fns);
4743 }
4744 
selectOverflowOp(MachineInstr & I,MachineRegisterInfo & MRI)4745 bool AArch64InstructionSelector::selectOverflowOp(MachineInstr &I,
4746                                                   MachineRegisterInfo &MRI) {
4747   auto &CarryMI = cast<GAddSubCarryOut>(I);
4748 
4749   if (auto *CarryInMI = dyn_cast<GAddSubCarryInOut>(&I)) {
4750     // Set NZCV carry according to carry-in VReg
4751     emitCarryIn(I, CarryInMI->getCarryInReg());
4752   }
4753 
4754   // Emit the operation and get the correct condition code.
4755   auto OpAndCC = emitOverflowOp(I.getOpcode(), CarryMI.getDstReg(),
4756                                 CarryMI.getLHS(), CarryMI.getRHS(), MIB);
4757 
4758   Register CarryOutReg = CarryMI.getCarryOutReg();
4759 
4760   // Don't convert carry-out to VReg if it is never used
4761   if (!MRI.use_nodbg_empty(CarryOutReg)) {
4762     // Now, put the overflow result in the register given by the first operand
4763     // to the overflow op. CSINC increments the result when the predicate is
4764     // false, so to get the increment when it's true, we need to use the
4765     // inverse. In this case, we want to increment when carry is set.
4766     Register ZReg = AArch64::WZR;
4767     emitCSINC(/*Dst=*/CarryOutReg, /*Src1=*/ZReg, /*Src2=*/ZReg,
4768               getInvertedCondCode(OpAndCC.second), MIB);
4769   }
4770 
4771   I.eraseFromParent();
4772   return true;
4773 }
4774 
4775 std::pair<MachineInstr *, AArch64CC::CondCode>
emitOverflowOp(unsigned Opcode,Register Dst,MachineOperand & LHS,MachineOperand & RHS,MachineIRBuilder & MIRBuilder) const4776 AArch64InstructionSelector::emitOverflowOp(unsigned Opcode, Register Dst,
4777                                            MachineOperand &LHS,
4778                                            MachineOperand &RHS,
4779                                            MachineIRBuilder &MIRBuilder) const {
4780   switch (Opcode) {
4781   default:
4782     llvm_unreachable("Unexpected opcode!");
4783   case TargetOpcode::G_SADDO:
4784     return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
4785   case TargetOpcode::G_UADDO:
4786     return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::HS);
4787   case TargetOpcode::G_SSUBO:
4788     return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
4789   case TargetOpcode::G_USUBO:
4790     return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::LO);
4791   case TargetOpcode::G_SADDE:
4792     return std::make_pair(emitADCS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
4793   case TargetOpcode::G_UADDE:
4794     return std::make_pair(emitADCS(Dst, LHS, RHS, MIRBuilder), AArch64CC::HS);
4795   case TargetOpcode::G_SSUBE:
4796     return std::make_pair(emitSBCS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
4797   case TargetOpcode::G_USUBE:
4798     return std::make_pair(emitSBCS(Dst, LHS, RHS, MIRBuilder), AArch64CC::LO);
4799   }
4800 }
4801 
4802 /// Returns true if @p Val is a tree of AND/OR/CMP operations that can be
4803 /// expressed as a conjunction.
4804 /// \param CanNegate    Set to true if we can negate the whole sub-tree just by
4805 ///                     changing the conditions on the CMP tests.
4806 ///                     (this means we can call emitConjunctionRec() with
4807 ///                      Negate==true on this sub-tree)
4808 /// \param MustBeFirst  Set to true if this subtree needs to be negated and we
4809 ///                     cannot do the negation naturally. We are required to
4810 ///                     emit the subtree first in this case.
4811 /// \param WillNegate   Is true if are called when the result of this
4812 ///                     subexpression must be negated. This happens when the
4813 ///                     outer expression is an OR. We can use this fact to know
4814 ///                     that we have a double negation (or (or ...) ...) that
4815 ///                     can be implemented for free.
canEmitConjunction(Register Val,bool & CanNegate,bool & MustBeFirst,bool WillNegate,MachineRegisterInfo & MRI,unsigned Depth=0)4816 static bool canEmitConjunction(Register Val, bool &CanNegate, bool &MustBeFirst,
4817                                bool WillNegate, MachineRegisterInfo &MRI,
4818                                unsigned Depth = 0) {
4819   if (!MRI.hasOneNonDBGUse(Val))
4820     return false;
4821   MachineInstr *ValDef = MRI.getVRegDef(Val);
4822   unsigned Opcode = ValDef->getOpcode();
4823   if (isa<GAnyCmp>(ValDef)) {
4824     CanNegate = true;
4825     MustBeFirst = false;
4826     return true;
4827   }
4828   // Protect against exponential runtime and stack overflow.
4829   if (Depth > 6)
4830     return false;
4831   if (Opcode == TargetOpcode::G_AND || Opcode == TargetOpcode::G_OR) {
4832     bool IsOR = Opcode == TargetOpcode::G_OR;
4833     Register O0 = ValDef->getOperand(1).getReg();
4834     Register O1 = ValDef->getOperand(2).getReg();
4835     bool CanNegateL;
4836     bool MustBeFirstL;
4837     if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, MRI, Depth + 1))
4838       return false;
4839     bool CanNegateR;
4840     bool MustBeFirstR;
4841     if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, MRI, Depth + 1))
4842       return false;
4843 
4844     if (MustBeFirstL && MustBeFirstR)
4845       return false;
4846 
4847     if (IsOR) {
4848       // For an OR expression we need to be able to naturally negate at least
4849       // one side or we cannot do the transformation at all.
4850       if (!CanNegateL && !CanNegateR)
4851         return false;
4852       // If we the result of the OR will be negated and we can naturally negate
4853       // the leaves, then this sub-tree as a whole negates naturally.
4854       CanNegate = WillNegate && CanNegateL && CanNegateR;
4855       // If we cannot naturally negate the whole sub-tree, then this must be
4856       // emitted first.
4857       MustBeFirst = !CanNegate;
4858     } else {
4859       assert(Opcode == TargetOpcode::G_AND && "Must be G_AND");
4860       // We cannot naturally negate an AND operation.
4861       CanNegate = false;
4862       MustBeFirst = MustBeFirstL || MustBeFirstR;
4863     }
4864     return true;
4865   }
4866   return false;
4867 }
4868 
emitConditionalComparison(Register LHS,Register RHS,CmpInst::Predicate CC,AArch64CC::CondCode Predicate,AArch64CC::CondCode OutCC,MachineIRBuilder & MIB) const4869 MachineInstr *AArch64InstructionSelector::emitConditionalComparison(
4870     Register LHS, Register RHS, CmpInst::Predicate CC,
4871     AArch64CC::CondCode Predicate, AArch64CC::CondCode OutCC,
4872     MachineIRBuilder &MIB) const {
4873   auto &MRI = *MIB.getMRI();
4874   LLT OpTy = MRI.getType(LHS);
4875   unsigned CCmpOpc;
4876   std::optional<ValueAndVReg> C;
4877   if (CmpInst::isIntPredicate(CC)) {
4878     assert(OpTy.getSizeInBits() == 32 || OpTy.getSizeInBits() == 64);
4879     C = getIConstantVRegValWithLookThrough(RHS, MRI);
4880     if (!C || C->Value.sgt(31) || C->Value.slt(-31))
4881       CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWr : AArch64::CCMPXr;
4882     else if (C->Value.ule(31))
4883       CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWi : AArch64::CCMPXi;
4884     else
4885       CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMNWi : AArch64::CCMNXi;
4886   } else {
4887     assert(OpTy.getSizeInBits() == 16 || OpTy.getSizeInBits() == 32 ||
4888            OpTy.getSizeInBits() == 64);
4889     switch (OpTy.getSizeInBits()) {
4890     case 16:
4891       assert(STI.hasFullFP16() && "Expected Full FP16 for fp16 comparisons");
4892       CCmpOpc = AArch64::FCCMPHrr;
4893       break;
4894     case 32:
4895       CCmpOpc = AArch64::FCCMPSrr;
4896       break;
4897     case 64:
4898       CCmpOpc = AArch64::FCCMPDrr;
4899       break;
4900     default:
4901       return nullptr;
4902     }
4903   }
4904   AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
4905   unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
4906   auto CCmp =
4907       MIB.buildInstr(CCmpOpc, {}, {LHS});
4908   if (CCmpOpc == AArch64::CCMPWi || CCmpOpc == AArch64::CCMPXi)
4909     CCmp.addImm(C->Value.getZExtValue());
4910   else if (CCmpOpc == AArch64::CCMNWi || CCmpOpc == AArch64::CCMNXi)
4911     CCmp.addImm(C->Value.abs().getZExtValue());
4912   else
4913     CCmp.addReg(RHS);
4914   CCmp.addImm(NZCV).addImm(Predicate);
4915   constrainSelectedInstRegOperands(*CCmp, TII, TRI, RBI);
4916   return &*CCmp;
4917 }
4918 
emitConjunctionRec(Register Val,AArch64CC::CondCode & OutCC,bool Negate,Register CCOp,AArch64CC::CondCode Predicate,MachineIRBuilder & MIB) const4919 MachineInstr *AArch64InstructionSelector::emitConjunctionRec(
4920     Register Val, AArch64CC::CondCode &OutCC, bool Negate, Register CCOp,
4921     AArch64CC::CondCode Predicate, MachineIRBuilder &MIB) const {
4922   // We're at a tree leaf, produce a conditional comparison operation.
4923   auto &MRI = *MIB.getMRI();
4924   MachineInstr *ValDef = MRI.getVRegDef(Val);
4925   unsigned Opcode = ValDef->getOpcode();
4926   if (auto *Cmp = dyn_cast<GAnyCmp>(ValDef)) {
4927     Register LHS = Cmp->getLHSReg();
4928     Register RHS = Cmp->getRHSReg();
4929     CmpInst::Predicate CC = Cmp->getCond();
4930     if (Negate)
4931       CC = CmpInst::getInversePredicate(CC);
4932     if (isa<GICmp>(Cmp)) {
4933       OutCC = changeICMPPredToAArch64CC(CC);
4934     } else {
4935       // Handle special FP cases.
4936       AArch64CC::CondCode ExtraCC;
4937       changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
4938       // Some floating point conditions can't be tested with a single condition
4939       // code. Construct an additional comparison in this case.
4940       if (ExtraCC != AArch64CC::AL) {
4941         MachineInstr *ExtraCmp;
4942         if (!CCOp)
4943           ExtraCmp = emitFPCompare(LHS, RHS, MIB, CC);
4944         else
4945           ExtraCmp =
4946               emitConditionalComparison(LHS, RHS, CC, Predicate, ExtraCC, MIB);
4947         CCOp = ExtraCmp->getOperand(0).getReg();
4948         Predicate = ExtraCC;
4949       }
4950     }
4951 
4952     // Produce a normal comparison if we are first in the chain
4953     if (!CCOp) {
4954       auto Dst = MRI.cloneVirtualRegister(LHS);
4955       if (isa<GICmp>(Cmp))
4956         return emitSUBS(Dst, Cmp->getOperand(2), Cmp->getOperand(3), MIB);
4957       return emitFPCompare(Cmp->getOperand(2).getReg(),
4958                            Cmp->getOperand(3).getReg(), MIB);
4959     }
4960     // Otherwise produce a ccmp.
4961     return emitConditionalComparison(LHS, RHS, CC, Predicate, OutCC, MIB);
4962   }
4963   assert(MRI.hasOneNonDBGUse(Val) && "Valid conjunction/disjunction tree");
4964 
4965   bool IsOR = Opcode == TargetOpcode::G_OR;
4966 
4967   Register LHS = ValDef->getOperand(1).getReg();
4968   bool CanNegateL;
4969   bool MustBeFirstL;
4970   bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR, MRI);
4971   assert(ValidL && "Valid conjunction/disjunction tree");
4972   (void)ValidL;
4973 
4974   Register RHS = ValDef->getOperand(2).getReg();
4975   bool CanNegateR;
4976   bool MustBeFirstR;
4977   bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR, MRI);
4978   assert(ValidR && "Valid conjunction/disjunction tree");
4979   (void)ValidR;
4980 
4981   // Swap sub-tree that must come first to the right side.
4982   if (MustBeFirstL) {
4983     assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
4984     std::swap(LHS, RHS);
4985     std::swap(CanNegateL, CanNegateR);
4986     std::swap(MustBeFirstL, MustBeFirstR);
4987   }
4988 
4989   bool NegateR;
4990   bool NegateAfterR;
4991   bool NegateL;
4992   bool NegateAfterAll;
4993   if (Opcode == TargetOpcode::G_OR) {
4994     // Swap the sub-tree that we can negate naturally to the left.
4995     if (!CanNegateL) {
4996       assert(CanNegateR && "at least one side must be negatable");
4997       assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
4998       assert(!Negate);
4999       std::swap(LHS, RHS);
5000       NegateR = false;
5001       NegateAfterR = true;
5002     } else {
5003       // Negate the left sub-tree if possible, otherwise negate the result.
5004       NegateR = CanNegateR;
5005       NegateAfterR = !CanNegateR;
5006     }
5007     NegateL = true;
5008     NegateAfterAll = !Negate;
5009   } else {
5010     assert(Opcode == TargetOpcode::G_AND &&
5011            "Valid conjunction/disjunction tree");
5012     assert(!Negate && "Valid conjunction/disjunction tree");
5013 
5014     NegateL = false;
5015     NegateR = false;
5016     NegateAfterR = false;
5017     NegateAfterAll = false;
5018   }
5019 
5020   // Emit sub-trees.
5021   AArch64CC::CondCode RHSCC;
5022   MachineInstr *CmpR =
5023       emitConjunctionRec(RHS, RHSCC, NegateR, CCOp, Predicate, MIB);
5024   if (NegateAfterR)
5025     RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
5026   MachineInstr *CmpL = emitConjunctionRec(
5027       LHS, OutCC, NegateL, CmpR->getOperand(0).getReg(), RHSCC, MIB);
5028   if (NegateAfterAll)
5029     OutCC = AArch64CC::getInvertedCondCode(OutCC);
5030   return CmpL;
5031 }
5032 
emitConjunction(Register Val,AArch64CC::CondCode & OutCC,MachineIRBuilder & MIB) const5033 MachineInstr *AArch64InstructionSelector::emitConjunction(
5034     Register Val, AArch64CC::CondCode &OutCC, MachineIRBuilder &MIB) const {
5035   bool DummyCanNegate;
5036   bool DummyMustBeFirst;
5037   if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false,
5038                           *MIB.getMRI()))
5039     return nullptr;
5040   return emitConjunctionRec(Val, OutCC, false, Register(), AArch64CC::AL, MIB);
5041 }
5042 
tryOptSelectConjunction(GSelect & SelI,MachineInstr & CondMI)5043 bool AArch64InstructionSelector::tryOptSelectConjunction(GSelect &SelI,
5044                                                          MachineInstr &CondMI) {
5045   AArch64CC::CondCode AArch64CC;
5046   MachineInstr *ConjMI = emitConjunction(SelI.getCondReg(), AArch64CC, MIB);
5047   if (!ConjMI)
5048     return false;
5049 
5050   emitSelect(SelI.getReg(0), SelI.getTrueReg(), SelI.getFalseReg(), AArch64CC, MIB);
5051   SelI.eraseFromParent();
5052   return true;
5053 }
5054 
tryOptSelect(GSelect & I)5055 bool AArch64InstructionSelector::tryOptSelect(GSelect &I) {
5056   MachineRegisterInfo &MRI = *MIB.getMRI();
5057   // We want to recognize this pattern:
5058   //
5059   // $z = G_FCMP pred, $x, $y
5060   // ...
5061   // $w = G_SELECT $z, $a, $b
5062   //
5063   // Where the value of $z is *only* ever used by the G_SELECT (possibly with
5064   // some copies/truncs in between.)
5065   //
5066   // If we see this, then we can emit something like this:
5067   //
5068   // fcmp $x, $y
5069   // fcsel $w, $a, $b, pred
5070   //
5071   // Rather than emitting both of the rather long sequences in the standard
5072   // G_FCMP/G_SELECT select methods.
5073 
5074   // First, check if the condition is defined by a compare.
5075   MachineInstr *CondDef = MRI.getVRegDef(I.getOperand(1).getReg());
5076 
5077   // We can only fold if all of the defs have one use.
5078   Register CondDefReg = CondDef->getOperand(0).getReg();
5079   if (!MRI.hasOneNonDBGUse(CondDefReg)) {
5080     // Unless it's another select.
5081     for (const MachineInstr &UI : MRI.use_nodbg_instructions(CondDefReg)) {
5082       if (CondDef == &UI)
5083         continue;
5084       if (UI.getOpcode() != TargetOpcode::G_SELECT)
5085         return false;
5086     }
5087   }
5088 
5089   // Is the condition defined by a compare?
5090   unsigned CondOpc = CondDef->getOpcode();
5091   if (CondOpc != TargetOpcode::G_ICMP && CondOpc != TargetOpcode::G_FCMP) {
5092     if (tryOptSelectConjunction(I, *CondDef))
5093       return true;
5094     return false;
5095   }
5096 
5097   AArch64CC::CondCode CondCode;
5098   if (CondOpc == TargetOpcode::G_ICMP) {
5099     auto Pred =
5100         static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate());
5101     CondCode = changeICMPPredToAArch64CC(Pred);
5102     emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3),
5103                        CondDef->getOperand(1), MIB);
5104   } else {
5105     // Get the condition code for the select.
5106     auto Pred =
5107         static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate());
5108     AArch64CC::CondCode CondCode2;
5109     changeFCMPPredToAArch64CC(Pred, CondCode, CondCode2);
5110 
5111     // changeFCMPPredToAArch64CC sets CondCode2 to AL when we require two
5112     // instructions to emit the comparison.
5113     // TODO: Handle FCMP_UEQ and FCMP_ONE. After that, this check will be
5114     // unnecessary.
5115     if (CondCode2 != AArch64CC::AL)
5116       return false;
5117 
5118     if (!emitFPCompare(CondDef->getOperand(2).getReg(),
5119                        CondDef->getOperand(3).getReg(), MIB)) {
5120       LLVM_DEBUG(dbgs() << "Couldn't emit compare for select!\n");
5121       return false;
5122     }
5123   }
5124 
5125   // Emit the select.
5126   emitSelect(I.getOperand(0).getReg(), I.getOperand(2).getReg(),
5127              I.getOperand(3).getReg(), CondCode, MIB);
5128   I.eraseFromParent();
5129   return true;
5130 }
5131 
tryFoldIntegerCompare(MachineOperand & LHS,MachineOperand & RHS,MachineOperand & Predicate,MachineIRBuilder & MIRBuilder) const5132 MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare(
5133     MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
5134     MachineIRBuilder &MIRBuilder) const {
5135   assert(LHS.isReg() && RHS.isReg() && Predicate.isPredicate() &&
5136          "Unexpected MachineOperand");
5137   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
5138   // We want to find this sort of thing:
5139   // x = G_SUB 0, y
5140   // G_ICMP z, x
5141   //
5142   // In this case, we can fold the G_SUB into the G_ICMP using a CMN instead.
5143   // e.g:
5144   //
5145   // cmn z, y
5146 
5147   // Check if the RHS or LHS of the G_ICMP is defined by a SUB
5148   MachineInstr *LHSDef = getDefIgnoringCopies(LHS.getReg(), MRI);
5149   MachineInstr *RHSDef = getDefIgnoringCopies(RHS.getReg(), MRI);
5150   auto P = static_cast<CmpInst::Predicate>(Predicate.getPredicate());
5151   // Given this:
5152   //
5153   // x = G_SUB 0, y
5154   // G_ICMP x, z
5155   //
5156   // Produce this:
5157   //
5158   // cmn y, z
5159   if (isCMN(LHSDef, P, MRI))
5160     return emitCMN(LHSDef->getOperand(2), RHS, MIRBuilder);
5161 
5162   // Same idea here, but with the RHS of the compare instead:
5163   //
5164   // Given this:
5165   //
5166   // x = G_SUB 0, y
5167   // G_ICMP z, x
5168   //
5169   // Produce this:
5170   //
5171   // cmn z, y
5172   if (isCMN(RHSDef, P, MRI))
5173     return emitCMN(LHS, RHSDef->getOperand(2), MIRBuilder);
5174 
5175   // Given this:
5176   //
5177   // z = G_AND x, y
5178   // G_ICMP z, 0
5179   //
5180   // Produce this if the compare is signed:
5181   //
5182   // tst x, y
5183   if (!CmpInst::isUnsigned(P) && LHSDef &&
5184       LHSDef->getOpcode() == TargetOpcode::G_AND) {
5185     // Make sure that the RHS is 0.
5186     auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS.getReg(), MRI);
5187     if (!ValAndVReg || ValAndVReg->Value != 0)
5188       return nullptr;
5189 
5190     return emitTST(LHSDef->getOperand(1),
5191                    LHSDef->getOperand(2), MIRBuilder);
5192   }
5193 
5194   return nullptr;
5195 }
5196 
selectShuffleVector(MachineInstr & I,MachineRegisterInfo & MRI)5197 bool AArch64InstructionSelector::selectShuffleVector(
5198     MachineInstr &I, MachineRegisterInfo &MRI) {
5199   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
5200   Register Src1Reg = I.getOperand(1).getReg();
5201   const LLT Src1Ty = MRI.getType(Src1Reg);
5202   Register Src2Reg = I.getOperand(2).getReg();
5203   const LLT Src2Ty = MRI.getType(Src2Reg);
5204   ArrayRef<int> Mask = I.getOperand(3).getShuffleMask();
5205 
5206   MachineBasicBlock &MBB = *I.getParent();
5207   MachineFunction &MF = *MBB.getParent();
5208   LLVMContext &Ctx = MF.getFunction().getContext();
5209 
5210   // G_SHUFFLE_VECTOR is weird in that the source operands can be scalars, if
5211   // it's originated from a <1 x T> type. Those should have been lowered into
5212   // G_BUILD_VECTOR earlier.
5213   if (!Src1Ty.isVector() || !Src2Ty.isVector()) {
5214     LLVM_DEBUG(dbgs() << "Could not select a \"scalar\" G_SHUFFLE_VECTOR\n");
5215     return false;
5216   }
5217 
5218   unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8;
5219 
5220   SmallVector<Constant *, 64> CstIdxs;
5221   for (int Val : Mask) {
5222     // For now, any undef indexes we'll just assume to be 0. This should be
5223     // optimized in future, e.g. to select DUP etc.
5224     Val = Val < 0 ? 0 : Val;
5225     for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
5226       unsigned Offset = Byte + Val * BytesPerElt;
5227       CstIdxs.emplace_back(ConstantInt::get(Type::getInt8Ty(Ctx), Offset));
5228     }
5229   }
5230 
5231   // Use a constant pool to load the index vector for TBL.
5232   Constant *CPVal = ConstantVector::get(CstIdxs);
5233   MachineInstr *IndexLoad = emitLoadFromConstantPool(CPVal, MIB);
5234   if (!IndexLoad) {
5235     LLVM_DEBUG(dbgs() << "Could not load from a constant pool");
5236     return false;
5237   }
5238 
5239   if (DstTy.getSizeInBits() != 128) {
5240     assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty");
5241     // This case can be done with TBL1.
5242     MachineInstr *Concat =
5243         emitVectorConcat(std::nullopt, Src1Reg, Src2Reg, MIB);
5244     if (!Concat) {
5245       LLVM_DEBUG(dbgs() << "Could not do vector concat for tbl1");
5246       return false;
5247     }
5248 
5249     // The constant pool load will be 64 bits, so need to convert to FPR128 reg.
5250     IndexLoad = emitScalarToVector(64, &AArch64::FPR128RegClass,
5251                                    IndexLoad->getOperand(0).getReg(), MIB);
5252 
5253     auto TBL1 = MIB.buildInstr(
5254         AArch64::TBLv16i8One, {&AArch64::FPR128RegClass},
5255         {Concat->getOperand(0).getReg(), IndexLoad->getOperand(0).getReg()});
5256     constrainSelectedInstRegOperands(*TBL1, TII, TRI, RBI);
5257 
5258     auto Copy =
5259         MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {})
5260             .addReg(TBL1.getReg(0), 0, AArch64::dsub);
5261     RBI.constrainGenericRegister(Copy.getReg(0), AArch64::FPR64RegClass, MRI);
5262     I.eraseFromParent();
5263     return true;
5264   }
5265 
5266   // For TBL2 we need to emit a REG_SEQUENCE to tie together two consecutive
5267   // Q registers for regalloc.
5268   SmallVector<Register, 2> Regs = {Src1Reg, Src2Reg};
5269   auto RegSeq = createQTuple(Regs, MIB);
5270   auto TBL2 = MIB.buildInstr(AArch64::TBLv16i8Two, {I.getOperand(0)},
5271                              {RegSeq, IndexLoad->getOperand(0)});
5272   constrainSelectedInstRegOperands(*TBL2, TII, TRI, RBI);
5273   I.eraseFromParent();
5274   return true;
5275 }
5276 
emitLaneInsert(std::optional<Register> DstReg,Register SrcReg,Register EltReg,unsigned LaneIdx,const RegisterBank & RB,MachineIRBuilder & MIRBuilder) const5277 MachineInstr *AArch64InstructionSelector::emitLaneInsert(
5278     std::optional<Register> DstReg, Register SrcReg, Register EltReg,
5279     unsigned LaneIdx, const RegisterBank &RB,
5280     MachineIRBuilder &MIRBuilder) const {
5281   MachineInstr *InsElt = nullptr;
5282   const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
5283   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
5284 
5285   // Create a register to define with the insert if one wasn't passed in.
5286   if (!DstReg)
5287     DstReg = MRI.createVirtualRegister(DstRC);
5288 
5289   unsigned EltSize = MRI.getType(EltReg).getSizeInBits();
5290   unsigned Opc = getInsertVecEltOpInfo(RB, EltSize).first;
5291 
5292   if (RB.getID() == AArch64::FPRRegBankID) {
5293     auto InsSub = emitScalarToVector(EltSize, DstRC, EltReg, MIRBuilder);
5294     InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg})
5295                  .addImm(LaneIdx)
5296                  .addUse(InsSub->getOperand(0).getReg())
5297                  .addImm(0);
5298   } else {
5299     InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg})
5300                  .addImm(LaneIdx)
5301                  .addUse(EltReg);
5302   }
5303 
5304   constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI);
5305   return InsElt;
5306 }
5307 
selectUSMovFromExtend(MachineInstr & MI,MachineRegisterInfo & MRI)5308 bool AArch64InstructionSelector::selectUSMovFromExtend(
5309     MachineInstr &MI, MachineRegisterInfo &MRI) {
5310   if (MI.getOpcode() != TargetOpcode::G_SEXT &&
5311       MI.getOpcode() != TargetOpcode::G_ZEXT &&
5312       MI.getOpcode() != TargetOpcode::G_ANYEXT)
5313     return false;
5314   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SEXT;
5315   const Register DefReg = MI.getOperand(0).getReg();
5316   const LLT DstTy = MRI.getType(DefReg);
5317   unsigned DstSize = DstTy.getSizeInBits();
5318 
5319   if (DstSize != 32 && DstSize != 64)
5320     return false;
5321 
5322   MachineInstr *Extract = getOpcodeDef(TargetOpcode::G_EXTRACT_VECTOR_ELT,
5323                                        MI.getOperand(1).getReg(), MRI);
5324   int64_t Lane;
5325   if (!Extract || !mi_match(Extract->getOperand(2).getReg(), MRI, m_ICst(Lane)))
5326     return false;
5327   Register Src0 = Extract->getOperand(1).getReg();
5328 
5329   const LLT VecTy = MRI.getType(Src0);
5330   if (VecTy.isScalableVector())
5331     return false;
5332 
5333   if (VecTy.getSizeInBits() != 128) {
5334     const MachineInstr *ScalarToVector = emitScalarToVector(
5335         VecTy.getSizeInBits(), &AArch64::FPR128RegClass, Src0, MIB);
5336     assert(ScalarToVector && "Didn't expect emitScalarToVector to fail!");
5337     Src0 = ScalarToVector->getOperand(0).getReg();
5338   }
5339 
5340   unsigned Opcode;
5341   if (DstSize == 64 && VecTy.getScalarSizeInBits() == 32)
5342     Opcode = IsSigned ? AArch64::SMOVvi32to64 : AArch64::UMOVvi32;
5343   else if (DstSize == 64 && VecTy.getScalarSizeInBits() == 16)
5344     Opcode = IsSigned ? AArch64::SMOVvi16to64 : AArch64::UMOVvi16;
5345   else if (DstSize == 64 && VecTy.getScalarSizeInBits() == 8)
5346     Opcode = IsSigned ? AArch64::SMOVvi8to64 : AArch64::UMOVvi8;
5347   else if (DstSize == 32 && VecTy.getScalarSizeInBits() == 16)
5348     Opcode = IsSigned ? AArch64::SMOVvi16to32 : AArch64::UMOVvi16;
5349   else if (DstSize == 32 && VecTy.getScalarSizeInBits() == 8)
5350     Opcode = IsSigned ? AArch64::SMOVvi8to32 : AArch64::UMOVvi8;
5351   else
5352     llvm_unreachable("Unexpected type combo for S/UMov!");
5353 
5354   // We may need to generate one of these, depending on the type and sign of the
5355   // input:
5356   //  DstReg = SMOV Src0, Lane;
5357   //  NewReg = UMOV Src0, Lane; DstReg = SUBREG_TO_REG NewReg, sub_32;
5358   MachineInstr *ExtI = nullptr;
5359   if (DstSize == 64 && !IsSigned) {
5360     Register NewReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
5361     MIB.buildInstr(Opcode, {NewReg}, {Src0}).addImm(Lane);
5362     ExtI = MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {})
5363                .addImm(0)
5364                .addUse(NewReg)
5365                .addImm(AArch64::sub_32);
5366     RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI);
5367   } else
5368     ExtI = MIB.buildInstr(Opcode, {DefReg}, {Src0}).addImm(Lane);
5369 
5370   constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
5371   MI.eraseFromParent();
5372   return true;
5373 }
5374 
tryAdvSIMDModImm8(Register Dst,unsigned DstSize,APInt Bits,MachineIRBuilder & Builder)5375 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm8(
5376     Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder) {
5377   unsigned int Op;
5378   if (DstSize == 128) {
5379     if (Bits.getHiBits(64) != Bits.getLoBits(64))
5380       return nullptr;
5381     Op = AArch64::MOVIv16b_ns;
5382   } else {
5383     Op = AArch64::MOVIv8b_ns;
5384   }
5385 
5386   uint64_t Val = Bits.zextOrTrunc(64).getZExtValue();
5387 
5388   if (AArch64_AM::isAdvSIMDModImmType9(Val)) {
5389     Val = AArch64_AM::encodeAdvSIMDModImmType9(Val);
5390     auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val);
5391     constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5392     return &*Mov;
5393   }
5394   return nullptr;
5395 }
5396 
tryAdvSIMDModImm16(Register Dst,unsigned DstSize,APInt Bits,MachineIRBuilder & Builder,bool Inv)5397 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm16(
5398     Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder,
5399     bool Inv) {
5400 
5401   unsigned int Op;
5402   if (DstSize == 128) {
5403     if (Bits.getHiBits(64) != Bits.getLoBits(64))
5404       return nullptr;
5405     Op = Inv ? AArch64::MVNIv8i16 : AArch64::MOVIv8i16;
5406   } else {
5407     Op = Inv ? AArch64::MVNIv4i16 : AArch64::MOVIv4i16;
5408   }
5409 
5410   uint64_t Val = Bits.zextOrTrunc(64).getZExtValue();
5411   uint64_t Shift;
5412 
5413   if (AArch64_AM::isAdvSIMDModImmType5(Val)) {
5414     Val = AArch64_AM::encodeAdvSIMDModImmType5(Val);
5415     Shift = 0;
5416   } else if (AArch64_AM::isAdvSIMDModImmType6(Val)) {
5417     Val = AArch64_AM::encodeAdvSIMDModImmType6(Val);
5418     Shift = 8;
5419   } else
5420     return nullptr;
5421 
5422   auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val).addImm(Shift);
5423   constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5424   return &*Mov;
5425 }
5426 
tryAdvSIMDModImm32(Register Dst,unsigned DstSize,APInt Bits,MachineIRBuilder & Builder,bool Inv)5427 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm32(
5428     Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder,
5429     bool Inv) {
5430 
5431   unsigned int Op;
5432   if (DstSize == 128) {
5433     if (Bits.getHiBits(64) != Bits.getLoBits(64))
5434       return nullptr;
5435     Op = Inv ? AArch64::MVNIv4i32 : AArch64::MOVIv4i32;
5436   } else {
5437     Op = Inv ? AArch64::MVNIv2i32 : AArch64::MOVIv2i32;
5438   }
5439 
5440   uint64_t Val = Bits.zextOrTrunc(64).getZExtValue();
5441   uint64_t Shift;
5442 
5443   if ((AArch64_AM::isAdvSIMDModImmType1(Val))) {
5444     Val = AArch64_AM::encodeAdvSIMDModImmType1(Val);
5445     Shift = 0;
5446   } else if ((AArch64_AM::isAdvSIMDModImmType2(Val))) {
5447     Val = AArch64_AM::encodeAdvSIMDModImmType2(Val);
5448     Shift = 8;
5449   } else if ((AArch64_AM::isAdvSIMDModImmType3(Val))) {
5450     Val = AArch64_AM::encodeAdvSIMDModImmType3(Val);
5451     Shift = 16;
5452   } else if ((AArch64_AM::isAdvSIMDModImmType4(Val))) {
5453     Val = AArch64_AM::encodeAdvSIMDModImmType4(Val);
5454     Shift = 24;
5455   } else
5456     return nullptr;
5457 
5458   auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val).addImm(Shift);
5459   constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5460   return &*Mov;
5461 }
5462 
tryAdvSIMDModImm64(Register Dst,unsigned DstSize,APInt Bits,MachineIRBuilder & Builder)5463 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm64(
5464     Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder) {
5465 
5466   unsigned int Op;
5467   if (DstSize == 128) {
5468     if (Bits.getHiBits(64) != Bits.getLoBits(64))
5469       return nullptr;
5470     Op = AArch64::MOVIv2d_ns;
5471   } else {
5472     Op = AArch64::MOVID;
5473   }
5474 
5475   uint64_t Val = Bits.zextOrTrunc(64).getZExtValue();
5476   if (AArch64_AM::isAdvSIMDModImmType10(Val)) {
5477     Val = AArch64_AM::encodeAdvSIMDModImmType10(Val);
5478     auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val);
5479     constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5480     return &*Mov;
5481   }
5482   return nullptr;
5483 }
5484 
tryAdvSIMDModImm321s(Register Dst,unsigned DstSize,APInt Bits,MachineIRBuilder & Builder,bool Inv)5485 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm321s(
5486     Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder,
5487     bool Inv) {
5488 
5489   unsigned int Op;
5490   if (DstSize == 128) {
5491     if (Bits.getHiBits(64) != Bits.getLoBits(64))
5492       return nullptr;
5493     Op = Inv ? AArch64::MVNIv4s_msl : AArch64::MOVIv4s_msl;
5494   } else {
5495     Op = Inv ? AArch64::MVNIv2s_msl : AArch64::MOVIv2s_msl;
5496   }
5497 
5498   uint64_t Val = Bits.zextOrTrunc(64).getZExtValue();
5499   uint64_t Shift;
5500 
5501   if (AArch64_AM::isAdvSIMDModImmType7(Val)) {
5502     Val = AArch64_AM::encodeAdvSIMDModImmType7(Val);
5503     Shift = 264;
5504   } else if (AArch64_AM::isAdvSIMDModImmType8(Val)) {
5505     Val = AArch64_AM::encodeAdvSIMDModImmType8(Val);
5506     Shift = 272;
5507   } else
5508     return nullptr;
5509 
5510   auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val).addImm(Shift);
5511   constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5512   return &*Mov;
5513 }
5514 
tryAdvSIMDModImmFP(Register Dst,unsigned DstSize,APInt Bits,MachineIRBuilder & Builder)5515 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImmFP(
5516     Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder) {
5517 
5518   unsigned int Op;
5519   bool IsWide = false;
5520   if (DstSize == 128) {
5521     if (Bits.getHiBits(64) != Bits.getLoBits(64))
5522       return nullptr;
5523     Op = AArch64::FMOVv4f32_ns;
5524     IsWide = true;
5525   } else {
5526     Op = AArch64::FMOVv2f32_ns;
5527   }
5528 
5529   uint64_t Val = Bits.zextOrTrunc(64).getZExtValue();
5530 
5531   if (AArch64_AM::isAdvSIMDModImmType11(Val)) {
5532     Val = AArch64_AM::encodeAdvSIMDModImmType11(Val);
5533   } else if (IsWide && AArch64_AM::isAdvSIMDModImmType12(Val)) {
5534     Val = AArch64_AM::encodeAdvSIMDModImmType12(Val);
5535     Op = AArch64::FMOVv2f64_ns;
5536   } else
5537     return nullptr;
5538 
5539   auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val);
5540   constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5541   return &*Mov;
5542 }
5543 
selectIndexedExtLoad(MachineInstr & MI,MachineRegisterInfo & MRI)5544 bool AArch64InstructionSelector::selectIndexedExtLoad(
5545     MachineInstr &MI, MachineRegisterInfo &MRI) {
5546   auto &ExtLd = cast<GIndexedAnyExtLoad>(MI);
5547   Register Dst = ExtLd.getDstReg();
5548   Register WriteBack = ExtLd.getWritebackReg();
5549   Register Base = ExtLd.getBaseReg();
5550   Register Offset = ExtLd.getOffsetReg();
5551   LLT Ty = MRI.getType(Dst);
5552   assert(Ty.getSizeInBits() <= 64); // Only for scalar GPRs.
5553   unsigned MemSizeBits = ExtLd.getMMO().getMemoryType().getSizeInBits();
5554   bool IsPre = ExtLd.isPre();
5555   bool IsSExt = isa<GIndexedSExtLoad>(ExtLd);
5556   unsigned InsertIntoSubReg = 0;
5557   bool IsDst64 = Ty.getSizeInBits() == 64;
5558 
5559   // ZExt/SExt should be on gpr but can handle extload and zextload of fpr, so
5560   // long as they are scalar.
5561   bool IsFPR = RBI.getRegBank(Dst, MRI, TRI)->getID() == AArch64::FPRRegBankID;
5562   if ((IsSExt && IsFPR) || Ty.isVector())
5563     return false;
5564 
5565   unsigned Opc = 0;
5566   LLT NewLdDstTy;
5567   LLT s32 = LLT::scalar(32);
5568   LLT s64 = LLT::scalar(64);
5569 
5570   if (MemSizeBits == 8) {
5571     if (IsSExt) {
5572       if (IsDst64)
5573         Opc = IsPre ? AArch64::LDRSBXpre : AArch64::LDRSBXpost;
5574       else
5575         Opc = IsPre ? AArch64::LDRSBWpre : AArch64::LDRSBWpost;
5576       NewLdDstTy = IsDst64 ? s64 : s32;
5577     } else if (IsFPR) {
5578       Opc = IsPre ? AArch64::LDRBpre : AArch64::LDRBpost;
5579       InsertIntoSubReg = AArch64::bsub;
5580       NewLdDstTy = LLT::scalar(MemSizeBits);
5581     } else {
5582       Opc = IsPre ? AArch64::LDRBBpre : AArch64::LDRBBpost;
5583       InsertIntoSubReg = IsDst64 ? AArch64::sub_32 : 0;
5584       NewLdDstTy = s32;
5585     }
5586   } else if (MemSizeBits == 16) {
5587     if (IsSExt) {
5588       if (IsDst64)
5589         Opc = IsPre ? AArch64::LDRSHXpre : AArch64::LDRSHXpost;
5590       else
5591         Opc = IsPre ? AArch64::LDRSHWpre : AArch64::LDRSHWpost;
5592       NewLdDstTy = IsDst64 ? s64 : s32;
5593     } else if (IsFPR) {
5594       Opc = IsPre ? AArch64::LDRHpre : AArch64::LDRHpost;
5595       InsertIntoSubReg = AArch64::hsub;
5596       NewLdDstTy = LLT::scalar(MemSizeBits);
5597     } else {
5598       Opc = IsPre ? AArch64::LDRHHpre : AArch64::LDRHHpost;
5599       InsertIntoSubReg = IsDst64 ? AArch64::sub_32 : 0;
5600       NewLdDstTy = s32;
5601     }
5602   } else if (MemSizeBits == 32) {
5603     if (IsSExt) {
5604       Opc = IsPre ? AArch64::LDRSWpre : AArch64::LDRSWpost;
5605       NewLdDstTy = s64;
5606     } else if (IsFPR) {
5607       Opc = IsPre ? AArch64::LDRSpre : AArch64::LDRSpost;
5608       InsertIntoSubReg = AArch64::ssub;
5609       NewLdDstTy = LLT::scalar(MemSizeBits);
5610     } else {
5611       Opc = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost;
5612       InsertIntoSubReg = IsDst64 ? AArch64::sub_32 : 0;
5613       NewLdDstTy = s32;
5614     }
5615   } else {
5616     llvm_unreachable("Unexpected size for indexed load");
5617   }
5618 
5619   auto Cst = getIConstantVRegVal(Offset, MRI);
5620   if (!Cst)
5621     return false; // Shouldn't happen, but just in case.
5622 
5623   auto LdMI = MIB.buildInstr(Opc, {WriteBack, NewLdDstTy}, {Base})
5624                   .addImm(Cst->getSExtValue());
5625   LdMI.cloneMemRefs(ExtLd);
5626   constrainSelectedInstRegOperands(*LdMI, TII, TRI, RBI);
5627   // Make sure to select the load with the MemTy as the dest type, and then
5628   // insert into a larger reg if needed.
5629   if (InsertIntoSubReg) {
5630     // Generate a SUBREG_TO_REG.
5631     auto SubToReg = MIB.buildInstr(TargetOpcode::SUBREG_TO_REG, {Dst}, {})
5632                         .addImm(0)
5633                         .addUse(LdMI.getReg(1))
5634                         .addImm(InsertIntoSubReg);
5635     RBI.constrainGenericRegister(
5636         SubToReg.getReg(0),
5637         *getRegClassForTypeOnBank(MRI.getType(Dst),
5638                                   *RBI.getRegBank(Dst, MRI, TRI)),
5639         MRI);
5640   } else {
5641     auto Copy = MIB.buildCopy(Dst, LdMI.getReg(1));
5642     selectCopy(*Copy, TII, MRI, TRI, RBI);
5643   }
5644   MI.eraseFromParent();
5645 
5646   return true;
5647 }
5648 
selectIndexedLoad(MachineInstr & MI,MachineRegisterInfo & MRI)5649 bool AArch64InstructionSelector::selectIndexedLoad(MachineInstr &MI,
5650                                                    MachineRegisterInfo &MRI) {
5651   auto &Ld = cast<GIndexedLoad>(MI);
5652   Register Dst = Ld.getDstReg();
5653   Register WriteBack = Ld.getWritebackReg();
5654   Register Base = Ld.getBaseReg();
5655   Register Offset = Ld.getOffsetReg();
5656   assert(MRI.getType(Dst).getSizeInBits() <= 128 &&
5657          "Unexpected type for indexed load");
5658   unsigned MemSize = Ld.getMMO().getMemoryType().getSizeInBytes();
5659 
5660   if (MemSize < MRI.getType(Dst).getSizeInBytes())
5661     return selectIndexedExtLoad(MI, MRI);
5662 
5663   unsigned Opc = 0;
5664   if (Ld.isPre()) {
5665     static constexpr unsigned GPROpcodes[] = {
5666         AArch64::LDRBBpre, AArch64::LDRHHpre, AArch64::LDRWpre,
5667         AArch64::LDRXpre};
5668     static constexpr unsigned FPROpcodes[] = {
5669         AArch64::LDRBpre, AArch64::LDRHpre, AArch64::LDRSpre, AArch64::LDRDpre,
5670         AArch64::LDRQpre};
5671     if (RBI.getRegBank(Dst, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5672       Opc = FPROpcodes[Log2_32(MemSize)];
5673     else
5674       Opc = GPROpcodes[Log2_32(MemSize)];
5675   } else {
5676     static constexpr unsigned GPROpcodes[] = {
5677         AArch64::LDRBBpost, AArch64::LDRHHpost, AArch64::LDRWpost,
5678         AArch64::LDRXpost};
5679     static constexpr unsigned FPROpcodes[] = {
5680         AArch64::LDRBpost, AArch64::LDRHpost, AArch64::LDRSpost,
5681         AArch64::LDRDpost, AArch64::LDRQpost};
5682     if (RBI.getRegBank(Dst, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5683       Opc = FPROpcodes[Log2_32(MemSize)];
5684     else
5685       Opc = GPROpcodes[Log2_32(MemSize)];
5686   }
5687   auto Cst = getIConstantVRegVal(Offset, MRI);
5688   if (!Cst)
5689     return false; // Shouldn't happen, but just in case.
5690   auto LdMI =
5691       MIB.buildInstr(Opc, {WriteBack, Dst}, {Base}).addImm(Cst->getSExtValue());
5692   LdMI.cloneMemRefs(Ld);
5693   constrainSelectedInstRegOperands(*LdMI, TII, TRI, RBI);
5694   MI.eraseFromParent();
5695   return true;
5696 }
5697 
selectIndexedStore(GIndexedStore & I,MachineRegisterInfo & MRI)5698 bool AArch64InstructionSelector::selectIndexedStore(GIndexedStore &I,
5699                                                     MachineRegisterInfo &MRI) {
5700   Register Dst = I.getWritebackReg();
5701   Register Val = I.getValueReg();
5702   Register Base = I.getBaseReg();
5703   Register Offset = I.getOffsetReg();
5704   LLT ValTy = MRI.getType(Val);
5705   assert(ValTy.getSizeInBits() <= 128 && "Unexpected type for indexed store");
5706 
5707   unsigned Opc = 0;
5708   if (I.isPre()) {
5709     static constexpr unsigned GPROpcodes[] = {
5710         AArch64::STRBBpre, AArch64::STRHHpre, AArch64::STRWpre,
5711         AArch64::STRXpre};
5712     static constexpr unsigned FPROpcodes[] = {
5713         AArch64::STRBpre, AArch64::STRHpre, AArch64::STRSpre, AArch64::STRDpre,
5714         AArch64::STRQpre};
5715 
5716     if (RBI.getRegBank(Val, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5717       Opc = FPROpcodes[Log2_32(ValTy.getSizeInBytes())];
5718     else
5719       Opc = GPROpcodes[Log2_32(ValTy.getSizeInBytes())];
5720   } else {
5721     static constexpr unsigned GPROpcodes[] = {
5722         AArch64::STRBBpost, AArch64::STRHHpost, AArch64::STRWpost,
5723         AArch64::STRXpost};
5724     static constexpr unsigned FPROpcodes[] = {
5725         AArch64::STRBpost, AArch64::STRHpost, AArch64::STRSpost,
5726         AArch64::STRDpost, AArch64::STRQpost};
5727 
5728     if (RBI.getRegBank(Val, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5729       Opc = FPROpcodes[Log2_32(ValTy.getSizeInBytes())];
5730     else
5731       Opc = GPROpcodes[Log2_32(ValTy.getSizeInBytes())];
5732   }
5733 
5734   auto Cst = getIConstantVRegVal(Offset, MRI);
5735   if (!Cst)
5736     return false; // Shouldn't happen, but just in case.
5737   auto Str =
5738       MIB.buildInstr(Opc, {Dst}, {Val, Base}).addImm(Cst->getSExtValue());
5739   Str.cloneMemRefs(I);
5740   constrainSelectedInstRegOperands(*Str, TII, TRI, RBI);
5741   I.eraseFromParent();
5742   return true;
5743 }
5744 
5745 MachineInstr *
emitConstantVector(Register Dst,Constant * CV,MachineIRBuilder & MIRBuilder,MachineRegisterInfo & MRI)5746 AArch64InstructionSelector::emitConstantVector(Register Dst, Constant *CV,
5747                                                MachineIRBuilder &MIRBuilder,
5748                                                MachineRegisterInfo &MRI) {
5749   LLT DstTy = MRI.getType(Dst);
5750   unsigned DstSize = DstTy.getSizeInBits();
5751   if (CV->isNullValue()) {
5752     if (DstSize == 128) {
5753       auto Mov =
5754           MIRBuilder.buildInstr(AArch64::MOVIv2d_ns, {Dst}, {}).addImm(0);
5755       constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5756       return &*Mov;
5757     }
5758 
5759     if (DstSize == 64) {
5760       auto Mov =
5761           MIRBuilder
5762               .buildInstr(AArch64::MOVIv2d_ns, {&AArch64::FPR128RegClass}, {})
5763               .addImm(0);
5764       auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {Dst}, {})
5765                       .addReg(Mov.getReg(0), 0, AArch64::dsub);
5766       RBI.constrainGenericRegister(Dst, AArch64::FPR64RegClass, MRI);
5767       return &*Copy;
5768     }
5769   }
5770 
5771   if (Constant *SplatValue = CV->getSplatValue()) {
5772     APInt SplatValueAsInt =
5773         isa<ConstantFP>(SplatValue)
5774             ? cast<ConstantFP>(SplatValue)->getValueAPF().bitcastToAPInt()
5775             : SplatValue->getUniqueInteger();
5776     APInt DefBits = APInt::getSplat(
5777         DstSize, SplatValueAsInt.trunc(DstTy.getScalarSizeInBits()));
5778     auto TryMOVIWithBits = [&](APInt DefBits) -> MachineInstr * {
5779       MachineInstr *NewOp;
5780       bool Inv = false;
5781       if ((NewOp = tryAdvSIMDModImm64(Dst, DstSize, DefBits, MIRBuilder)) ||
5782           (NewOp =
5783                tryAdvSIMDModImm32(Dst, DstSize, DefBits, MIRBuilder, Inv)) ||
5784           (NewOp =
5785                tryAdvSIMDModImm321s(Dst, DstSize, DefBits, MIRBuilder, Inv)) ||
5786           (NewOp =
5787                tryAdvSIMDModImm16(Dst, DstSize, DefBits, MIRBuilder, Inv)) ||
5788           (NewOp = tryAdvSIMDModImm8(Dst, DstSize, DefBits, MIRBuilder)) ||
5789           (NewOp = tryAdvSIMDModImmFP(Dst, DstSize, DefBits, MIRBuilder)))
5790         return NewOp;
5791 
5792       DefBits = ~DefBits;
5793       Inv = true;
5794       if ((NewOp =
5795                tryAdvSIMDModImm32(Dst, DstSize, DefBits, MIRBuilder, Inv)) ||
5796           (NewOp =
5797                tryAdvSIMDModImm321s(Dst, DstSize, DefBits, MIRBuilder, Inv)) ||
5798           (NewOp = tryAdvSIMDModImm16(Dst, DstSize, DefBits, MIRBuilder, Inv)))
5799         return NewOp;
5800       return nullptr;
5801     };
5802 
5803     if (auto *NewOp = TryMOVIWithBits(DefBits))
5804       return NewOp;
5805 
5806     // See if a fneg of the constant can be materialized with a MOVI, etc
5807     auto TryWithFNeg = [&](APInt DefBits, int NumBits,
5808                            unsigned NegOpc) -> MachineInstr * {
5809       // FNegate each sub-element of the constant
5810       APInt Neg = APInt::getHighBitsSet(NumBits, 1).zext(DstSize);
5811       APInt NegBits(DstSize, 0);
5812       unsigned NumElts = DstSize / NumBits;
5813       for (unsigned i = 0; i < NumElts; i++)
5814         NegBits |= Neg << (NumBits * i);
5815       NegBits = DefBits ^ NegBits;
5816 
5817       // Try to create the new constants with MOVI, and if so generate a fneg
5818       // for it.
5819       if (auto *NewOp = TryMOVIWithBits(NegBits)) {
5820         Register NewDst = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
5821         NewOp->getOperand(0).setReg(NewDst);
5822         return MIRBuilder.buildInstr(NegOpc, {Dst}, {NewDst});
5823       }
5824       return nullptr;
5825     };
5826     MachineInstr *R;
5827     if ((R = TryWithFNeg(DefBits, 32, AArch64::FNEGv4f32)) ||
5828         (R = TryWithFNeg(DefBits, 64, AArch64::FNEGv2f64)) ||
5829         (STI.hasFullFP16() &&
5830          (R = TryWithFNeg(DefBits, 16, AArch64::FNEGv8f16))))
5831       return R;
5832   }
5833 
5834   auto *CPLoad = emitLoadFromConstantPool(CV, MIRBuilder);
5835   if (!CPLoad) {
5836     LLVM_DEBUG(dbgs() << "Could not generate cp load for constant vector!");
5837     return nullptr;
5838   }
5839 
5840   auto Copy = MIRBuilder.buildCopy(Dst, CPLoad->getOperand(0));
5841   RBI.constrainGenericRegister(
5842       Dst, *MRI.getRegClass(CPLoad->getOperand(0).getReg()), MRI);
5843   return &*Copy;
5844 }
5845 
tryOptConstantBuildVec(MachineInstr & I,LLT DstTy,MachineRegisterInfo & MRI)5846 bool AArch64InstructionSelector::tryOptConstantBuildVec(
5847     MachineInstr &I, LLT DstTy, MachineRegisterInfo &MRI) {
5848   assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
5849   unsigned DstSize = DstTy.getSizeInBits();
5850   assert(DstSize <= 128 && "Unexpected build_vec type!");
5851   if (DstSize < 32)
5852     return false;
5853   // Check if we're building a constant vector, in which case we want to
5854   // generate a constant pool load instead of a vector insert sequence.
5855   SmallVector<Constant *, 16> Csts;
5856   for (unsigned Idx = 1; Idx < I.getNumOperands(); ++Idx) {
5857     // Try to find G_CONSTANT or G_FCONSTANT
5858     auto *OpMI =
5859         getOpcodeDef(TargetOpcode::G_CONSTANT, I.getOperand(Idx).getReg(), MRI);
5860     if (OpMI)
5861       Csts.emplace_back(
5862           const_cast<ConstantInt *>(OpMI->getOperand(1).getCImm()));
5863     else if ((OpMI = getOpcodeDef(TargetOpcode::G_FCONSTANT,
5864                                   I.getOperand(Idx).getReg(), MRI)))
5865       Csts.emplace_back(
5866           const_cast<ConstantFP *>(OpMI->getOperand(1).getFPImm()));
5867     else
5868       return false;
5869   }
5870   Constant *CV = ConstantVector::get(Csts);
5871   if (!emitConstantVector(I.getOperand(0).getReg(), CV, MIB, MRI))
5872     return false;
5873   I.eraseFromParent();
5874   return true;
5875 }
5876 
tryOptBuildVecToSubregToReg(MachineInstr & I,MachineRegisterInfo & MRI)5877 bool AArch64InstructionSelector::tryOptBuildVecToSubregToReg(
5878     MachineInstr &I, MachineRegisterInfo &MRI) {
5879   // Given:
5880   //  %vec = G_BUILD_VECTOR %elt, %undef, %undef, ... %undef
5881   //
5882   // Select the G_BUILD_VECTOR as a SUBREG_TO_REG from %elt.
5883   Register Dst = I.getOperand(0).getReg();
5884   Register EltReg = I.getOperand(1).getReg();
5885   LLT EltTy = MRI.getType(EltReg);
5886   // If the index isn't on the same bank as its elements, then this can't be a
5887   // SUBREG_TO_REG.
5888   const RegisterBank &EltRB = *RBI.getRegBank(EltReg, MRI, TRI);
5889   const RegisterBank &DstRB = *RBI.getRegBank(Dst, MRI, TRI);
5890   if (EltRB != DstRB)
5891     return false;
5892   if (any_of(drop_begin(I.operands(), 2), [&MRI](const MachineOperand &Op) {
5893         return !getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, Op.getReg(), MRI);
5894       }))
5895     return false;
5896   unsigned SubReg;
5897   const TargetRegisterClass *EltRC = getRegClassForTypeOnBank(EltTy, EltRB);
5898   if (!EltRC)
5899     return false;
5900   const TargetRegisterClass *DstRC =
5901       getRegClassForTypeOnBank(MRI.getType(Dst), DstRB);
5902   if (!DstRC)
5903     return false;
5904   if (!getSubRegForClass(EltRC, TRI, SubReg))
5905     return false;
5906   auto SubregToReg = MIB.buildInstr(AArch64::SUBREG_TO_REG, {Dst}, {})
5907                          .addImm(0)
5908                          .addUse(EltReg)
5909                          .addImm(SubReg);
5910   I.eraseFromParent();
5911   constrainSelectedInstRegOperands(*SubregToReg, TII, TRI, RBI);
5912   return RBI.constrainGenericRegister(Dst, *DstRC, MRI);
5913 }
5914 
selectBuildVector(MachineInstr & I,MachineRegisterInfo & MRI)5915 bool AArch64InstructionSelector::selectBuildVector(MachineInstr &I,
5916                                                    MachineRegisterInfo &MRI) {
5917   assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
5918   // Until we port more of the optimized selections, for now just use a vector
5919   // insert sequence.
5920   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
5921   const LLT EltTy = MRI.getType(I.getOperand(1).getReg());
5922   unsigned EltSize = EltTy.getSizeInBits();
5923 
5924   if (tryOptConstantBuildVec(I, DstTy, MRI))
5925     return true;
5926   if (tryOptBuildVecToSubregToReg(I, MRI))
5927     return true;
5928 
5929   if (EltSize != 8 && EltSize != 16 && EltSize != 32 && EltSize != 64)
5930     return false; // Don't support all element types yet.
5931   const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
5932 
5933   const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
5934   MachineInstr *ScalarToVec =
5935       emitScalarToVector(DstTy.getElementType().getSizeInBits(), DstRC,
5936                          I.getOperand(1).getReg(), MIB);
5937   if (!ScalarToVec)
5938     return false;
5939 
5940   Register DstVec = ScalarToVec->getOperand(0).getReg();
5941   unsigned DstSize = DstTy.getSizeInBits();
5942 
5943   // Keep track of the last MI we inserted. Later on, we might be able to save
5944   // a copy using it.
5945   MachineInstr *PrevMI = ScalarToVec;
5946   for (unsigned i = 2, e = DstSize / EltSize + 1; i < e; ++i) {
5947     // Note that if we don't do a subregister copy, we can end up making an
5948     // extra register.
5949     Register OpReg = I.getOperand(i).getReg();
5950     // Do not emit inserts for undefs
5951     if (!getOpcodeDef<GImplicitDef>(OpReg, MRI)) {
5952       PrevMI = &*emitLaneInsert(std::nullopt, DstVec, OpReg, i - 1, RB, MIB);
5953       DstVec = PrevMI->getOperand(0).getReg();
5954     }
5955   }
5956 
5957   // If DstTy's size in bits is less than 128, then emit a subregister copy
5958   // from DstVec to the last register we've defined.
5959   if (DstSize < 128) {
5960     // Force this to be FPR using the destination vector.
5961     const TargetRegisterClass *RC =
5962         getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(DstVec, MRI, TRI));
5963     if (!RC)
5964       return false;
5965     if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
5966       LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
5967       return false;
5968     }
5969 
5970     unsigned SubReg = 0;
5971     if (!getSubRegForClass(RC, TRI, SubReg))
5972       return false;
5973     if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
5974       LLVM_DEBUG(dbgs() << "Unsupported destination size! (" << DstSize
5975                         << "\n");
5976       return false;
5977     }
5978 
5979     Register Reg = MRI.createVirtualRegister(RC);
5980     Register DstReg = I.getOperand(0).getReg();
5981 
5982     MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {}).addReg(DstVec, 0, SubReg);
5983     MachineOperand &RegOp = I.getOperand(1);
5984     RegOp.setReg(Reg);
5985     RBI.constrainGenericRegister(DstReg, *RC, MRI);
5986   } else {
5987     // We either have a vector with all elements (except the first one) undef or
5988     // at least one non-undef non-first element. In the first case, we need to
5989     // constrain the output register ourselves as we may have generated an
5990     // INSERT_SUBREG operation which is a generic operation for which the
5991     // output regclass cannot be automatically chosen.
5992     //
5993     // In the second case, there is no need to do this as it may generate an
5994     // instruction like INSvi32gpr where the regclass can be automatically
5995     // chosen.
5996     //
5997     // Also, we save a copy by re-using the destination register on the final
5998     // insert.
5999     PrevMI->getOperand(0).setReg(I.getOperand(0).getReg());
6000     constrainSelectedInstRegOperands(*PrevMI, TII, TRI, RBI);
6001 
6002     Register DstReg = PrevMI->getOperand(0).getReg();
6003     if (PrevMI == ScalarToVec && DstReg.isVirtual()) {
6004       const TargetRegisterClass *RC =
6005           getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(DstVec, MRI, TRI));
6006       RBI.constrainGenericRegister(DstReg, *RC, MRI);
6007     }
6008   }
6009 
6010   I.eraseFromParent();
6011   return true;
6012 }
6013 
selectVectorLoadIntrinsic(unsigned Opc,unsigned NumVecs,MachineInstr & I)6014 bool AArch64InstructionSelector::selectVectorLoadIntrinsic(unsigned Opc,
6015                                                            unsigned NumVecs,
6016                                                            MachineInstr &I) {
6017   assert(I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
6018   assert(Opc && "Expected an opcode?");
6019   assert(NumVecs > 1 && NumVecs < 5 && "Only support 2, 3, or 4 vectors");
6020   auto &MRI = *MIB.getMRI();
6021   LLT Ty = MRI.getType(I.getOperand(0).getReg());
6022   unsigned Size = Ty.getSizeInBits();
6023   assert((Size == 64 || Size == 128) &&
6024          "Destination must be 64 bits or 128 bits?");
6025   unsigned SubReg = Size == 64 ? AArch64::dsub0 : AArch64::qsub0;
6026   auto Ptr = I.getOperand(I.getNumOperands() - 1).getReg();
6027   assert(MRI.getType(Ptr).isPointer() && "Expected a pointer type?");
6028   auto Load = MIB.buildInstr(Opc, {Ty}, {Ptr});
6029   Load.cloneMemRefs(I);
6030   constrainSelectedInstRegOperands(*Load, TII, TRI, RBI);
6031   Register SelectedLoadDst = Load->getOperand(0).getReg();
6032   for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
6033     auto Vec = MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(Idx)}, {})
6034                    .addReg(SelectedLoadDst, 0, SubReg + Idx);
6035     // Emit the subreg copies and immediately select them.
6036     // FIXME: We should refactor our copy code into an emitCopy helper and
6037     // clean up uses of this pattern elsewhere in the selector.
6038     selectCopy(*Vec, TII, MRI, TRI, RBI);
6039   }
6040   return true;
6041 }
6042 
selectVectorLoadLaneIntrinsic(unsigned Opc,unsigned NumVecs,MachineInstr & I)6043 bool AArch64InstructionSelector::selectVectorLoadLaneIntrinsic(
6044     unsigned Opc, unsigned NumVecs, MachineInstr &I) {
6045   assert(I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
6046   assert(Opc && "Expected an opcode?");
6047   assert(NumVecs > 1 && NumVecs < 5 && "Only support 2, 3, or 4 vectors");
6048   auto &MRI = *MIB.getMRI();
6049   LLT Ty = MRI.getType(I.getOperand(0).getReg());
6050   bool Narrow = Ty.getSizeInBits() == 64;
6051 
6052   auto FirstSrcRegIt = I.operands_begin() + NumVecs + 1;
6053   SmallVector<Register, 4> Regs(NumVecs);
6054   std::transform(FirstSrcRegIt, FirstSrcRegIt + NumVecs, Regs.begin(),
6055                  [](auto MO) { return MO.getReg(); });
6056 
6057   if (Narrow) {
6058     transform(Regs, Regs.begin(), [this](Register Reg) {
6059       return emitScalarToVector(64, &AArch64::FPR128RegClass, Reg, MIB)
6060           ->getOperand(0)
6061           .getReg();
6062     });
6063     Ty = Ty.multiplyElements(2);
6064   }
6065 
6066   Register Tuple = createQTuple(Regs, MIB);
6067   auto LaneNo = getIConstantVRegVal((FirstSrcRegIt + NumVecs)->getReg(), MRI);
6068   if (!LaneNo)
6069     return false;
6070 
6071   Register Ptr = (FirstSrcRegIt + NumVecs + 1)->getReg();
6072   auto Load = MIB.buildInstr(Opc, {Ty}, {})
6073                   .addReg(Tuple)
6074                   .addImm(LaneNo->getZExtValue())
6075                   .addReg(Ptr);
6076   Load.cloneMemRefs(I);
6077   constrainSelectedInstRegOperands(*Load, TII, TRI, RBI);
6078   Register SelectedLoadDst = Load->getOperand(0).getReg();
6079   unsigned SubReg = AArch64::qsub0;
6080   for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
6081     auto Vec = MIB.buildInstr(TargetOpcode::COPY,
6082                               {Narrow ? DstOp(&AArch64::FPR128RegClass)
6083                                       : DstOp(I.getOperand(Idx).getReg())},
6084                               {})
6085                    .addReg(SelectedLoadDst, 0, SubReg + Idx);
6086     Register WideReg = Vec.getReg(0);
6087     // Emit the subreg copies and immediately select them.
6088     selectCopy(*Vec, TII, MRI, TRI, RBI);
6089     if (Narrow &&
6090         !emitNarrowVector(I.getOperand(Idx).getReg(), WideReg, MIB, MRI))
6091       return false;
6092   }
6093   return true;
6094 }
6095 
selectVectorStoreIntrinsic(MachineInstr & I,unsigned NumVecs,unsigned Opc)6096 void AArch64InstructionSelector::selectVectorStoreIntrinsic(MachineInstr &I,
6097                                                             unsigned NumVecs,
6098                                                             unsigned Opc) {
6099   MachineRegisterInfo &MRI = I.getParent()->getParent()->getRegInfo();
6100   LLT Ty = MRI.getType(I.getOperand(1).getReg());
6101   Register Ptr = I.getOperand(1 + NumVecs).getReg();
6102 
6103   SmallVector<Register, 2> Regs(NumVecs);
6104   std::transform(I.operands_begin() + 1, I.operands_begin() + 1 + NumVecs,
6105                  Regs.begin(), [](auto MO) { return MO.getReg(); });
6106 
6107   Register Tuple = Ty.getSizeInBits() == 128 ? createQTuple(Regs, MIB)
6108                                              : createDTuple(Regs, MIB);
6109   auto Store = MIB.buildInstr(Opc, {}, {Tuple, Ptr});
6110   Store.cloneMemRefs(I);
6111   constrainSelectedInstRegOperands(*Store, TII, TRI, RBI);
6112 }
6113 
selectVectorStoreLaneIntrinsic(MachineInstr & I,unsigned NumVecs,unsigned Opc)6114 bool AArch64InstructionSelector::selectVectorStoreLaneIntrinsic(
6115     MachineInstr &I, unsigned NumVecs, unsigned Opc) {
6116   MachineRegisterInfo &MRI = I.getParent()->getParent()->getRegInfo();
6117   LLT Ty = MRI.getType(I.getOperand(1).getReg());
6118   bool Narrow = Ty.getSizeInBits() == 64;
6119 
6120   SmallVector<Register, 2> Regs(NumVecs);
6121   std::transform(I.operands_begin() + 1, I.operands_begin() + 1 + NumVecs,
6122                  Regs.begin(), [](auto MO) { return MO.getReg(); });
6123 
6124   if (Narrow)
6125     transform(Regs, Regs.begin(), [this](Register Reg) {
6126       return emitScalarToVector(64, &AArch64::FPR128RegClass, Reg, MIB)
6127           ->getOperand(0)
6128           .getReg();
6129     });
6130 
6131   Register Tuple = createQTuple(Regs, MIB);
6132 
6133   auto LaneNo = getIConstantVRegVal(I.getOperand(1 + NumVecs).getReg(), MRI);
6134   if (!LaneNo)
6135     return false;
6136   Register Ptr = I.getOperand(1 + NumVecs + 1).getReg();
6137   auto Store = MIB.buildInstr(Opc, {}, {})
6138                    .addReg(Tuple)
6139                    .addImm(LaneNo->getZExtValue())
6140                    .addReg(Ptr);
6141   Store.cloneMemRefs(I);
6142   constrainSelectedInstRegOperands(*Store, TII, TRI, RBI);
6143   return true;
6144 }
6145 
selectIntrinsicWithSideEffects(MachineInstr & I,MachineRegisterInfo & MRI)6146 bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
6147     MachineInstr &I, MachineRegisterInfo &MRI) {
6148   // Find the intrinsic ID.
6149   unsigned IntrinID = cast<GIntrinsic>(I).getIntrinsicID();
6150 
6151   const LLT S8 = LLT::scalar(8);
6152   const LLT S16 = LLT::scalar(16);
6153   const LLT S32 = LLT::scalar(32);
6154   const LLT S64 = LLT::scalar(64);
6155   const LLT P0 = LLT::pointer(0, 64);
6156   // Select the instruction.
6157   switch (IntrinID) {
6158   default:
6159     return false;
6160   case Intrinsic::aarch64_ldxp:
6161   case Intrinsic::aarch64_ldaxp: {
6162     auto NewI = MIB.buildInstr(
6163         IntrinID == Intrinsic::aarch64_ldxp ? AArch64::LDXPX : AArch64::LDAXPX,
6164         {I.getOperand(0).getReg(), I.getOperand(1).getReg()},
6165         {I.getOperand(3)});
6166     NewI.cloneMemRefs(I);
6167     constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI);
6168     break;
6169   }
6170   case Intrinsic::aarch64_neon_ld1x2: {
6171     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6172     unsigned Opc = 0;
6173     if (Ty == LLT::fixed_vector(8, S8))
6174       Opc = AArch64::LD1Twov8b;
6175     else if (Ty == LLT::fixed_vector(16, S8))
6176       Opc = AArch64::LD1Twov16b;
6177     else if (Ty == LLT::fixed_vector(4, S16))
6178       Opc = AArch64::LD1Twov4h;
6179     else if (Ty == LLT::fixed_vector(8, S16))
6180       Opc = AArch64::LD1Twov8h;
6181     else if (Ty == LLT::fixed_vector(2, S32))
6182       Opc = AArch64::LD1Twov2s;
6183     else if (Ty == LLT::fixed_vector(4, S32))
6184       Opc = AArch64::LD1Twov4s;
6185     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6186       Opc = AArch64::LD1Twov2d;
6187     else if (Ty == S64 || Ty == P0)
6188       Opc = AArch64::LD1Twov1d;
6189     else
6190       llvm_unreachable("Unexpected type for ld1x2!");
6191     selectVectorLoadIntrinsic(Opc, 2, I);
6192     break;
6193   }
6194   case Intrinsic::aarch64_neon_ld1x3: {
6195     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6196     unsigned Opc = 0;
6197     if (Ty == LLT::fixed_vector(8, S8))
6198       Opc = AArch64::LD1Threev8b;
6199     else if (Ty == LLT::fixed_vector(16, S8))
6200       Opc = AArch64::LD1Threev16b;
6201     else if (Ty == LLT::fixed_vector(4, S16))
6202       Opc = AArch64::LD1Threev4h;
6203     else if (Ty == LLT::fixed_vector(8, S16))
6204       Opc = AArch64::LD1Threev8h;
6205     else if (Ty == LLT::fixed_vector(2, S32))
6206       Opc = AArch64::LD1Threev2s;
6207     else if (Ty == LLT::fixed_vector(4, S32))
6208       Opc = AArch64::LD1Threev4s;
6209     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6210       Opc = AArch64::LD1Threev2d;
6211     else if (Ty == S64 || Ty == P0)
6212       Opc = AArch64::LD1Threev1d;
6213     else
6214       llvm_unreachable("Unexpected type for ld1x3!");
6215     selectVectorLoadIntrinsic(Opc, 3, I);
6216     break;
6217   }
6218   case Intrinsic::aarch64_neon_ld1x4: {
6219     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6220     unsigned Opc = 0;
6221     if (Ty == LLT::fixed_vector(8, S8))
6222       Opc = AArch64::LD1Fourv8b;
6223     else if (Ty == LLT::fixed_vector(16, S8))
6224       Opc = AArch64::LD1Fourv16b;
6225     else if (Ty == LLT::fixed_vector(4, S16))
6226       Opc = AArch64::LD1Fourv4h;
6227     else if (Ty == LLT::fixed_vector(8, S16))
6228       Opc = AArch64::LD1Fourv8h;
6229     else if (Ty == LLT::fixed_vector(2, S32))
6230       Opc = AArch64::LD1Fourv2s;
6231     else if (Ty == LLT::fixed_vector(4, S32))
6232       Opc = AArch64::LD1Fourv4s;
6233     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6234       Opc = AArch64::LD1Fourv2d;
6235     else if (Ty == S64 || Ty == P0)
6236       Opc = AArch64::LD1Fourv1d;
6237     else
6238       llvm_unreachable("Unexpected type for ld1x4!");
6239     selectVectorLoadIntrinsic(Opc, 4, I);
6240     break;
6241   }
6242   case Intrinsic::aarch64_neon_ld2: {
6243     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6244     unsigned Opc = 0;
6245     if (Ty == LLT::fixed_vector(8, S8))
6246       Opc = AArch64::LD2Twov8b;
6247     else if (Ty == LLT::fixed_vector(16, S8))
6248       Opc = AArch64::LD2Twov16b;
6249     else if (Ty == LLT::fixed_vector(4, S16))
6250       Opc = AArch64::LD2Twov4h;
6251     else if (Ty == LLT::fixed_vector(8, S16))
6252       Opc = AArch64::LD2Twov8h;
6253     else if (Ty == LLT::fixed_vector(2, S32))
6254       Opc = AArch64::LD2Twov2s;
6255     else if (Ty == LLT::fixed_vector(4, S32))
6256       Opc = AArch64::LD2Twov4s;
6257     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6258       Opc = AArch64::LD2Twov2d;
6259     else if (Ty == S64 || Ty == P0)
6260       Opc = AArch64::LD1Twov1d;
6261     else
6262       llvm_unreachable("Unexpected type for ld2!");
6263     selectVectorLoadIntrinsic(Opc, 2, I);
6264     break;
6265   }
6266   case Intrinsic::aarch64_neon_ld2lane: {
6267     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6268     unsigned Opc;
6269     if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
6270       Opc = AArch64::LD2i8;
6271     else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16))
6272       Opc = AArch64::LD2i16;
6273     else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32))
6274       Opc = AArch64::LD2i32;
6275     else if (Ty == LLT::fixed_vector(2, S64) ||
6276              Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
6277       Opc = AArch64::LD2i64;
6278     else
6279       llvm_unreachable("Unexpected type for st2lane!");
6280     if (!selectVectorLoadLaneIntrinsic(Opc, 2, I))
6281       return false;
6282     break;
6283   }
6284   case Intrinsic::aarch64_neon_ld2r: {
6285     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6286     unsigned Opc = 0;
6287     if (Ty == LLT::fixed_vector(8, S8))
6288       Opc = AArch64::LD2Rv8b;
6289     else if (Ty == LLT::fixed_vector(16, S8))
6290       Opc = AArch64::LD2Rv16b;
6291     else if (Ty == LLT::fixed_vector(4, S16))
6292       Opc = AArch64::LD2Rv4h;
6293     else if (Ty == LLT::fixed_vector(8, S16))
6294       Opc = AArch64::LD2Rv8h;
6295     else if (Ty == LLT::fixed_vector(2, S32))
6296       Opc = AArch64::LD2Rv2s;
6297     else if (Ty == LLT::fixed_vector(4, S32))
6298       Opc = AArch64::LD2Rv4s;
6299     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6300       Opc = AArch64::LD2Rv2d;
6301     else if (Ty == S64 || Ty == P0)
6302       Opc = AArch64::LD2Rv1d;
6303     else
6304       llvm_unreachable("Unexpected type for ld2r!");
6305     selectVectorLoadIntrinsic(Opc, 2, I);
6306     break;
6307   }
6308   case Intrinsic::aarch64_neon_ld3: {
6309     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6310     unsigned Opc = 0;
6311     if (Ty == LLT::fixed_vector(8, S8))
6312       Opc = AArch64::LD3Threev8b;
6313     else if (Ty == LLT::fixed_vector(16, S8))
6314       Opc = AArch64::LD3Threev16b;
6315     else if (Ty == LLT::fixed_vector(4, S16))
6316       Opc = AArch64::LD3Threev4h;
6317     else if (Ty == LLT::fixed_vector(8, S16))
6318       Opc = AArch64::LD3Threev8h;
6319     else if (Ty == LLT::fixed_vector(2, S32))
6320       Opc = AArch64::LD3Threev2s;
6321     else if (Ty == LLT::fixed_vector(4, S32))
6322       Opc = AArch64::LD3Threev4s;
6323     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6324       Opc = AArch64::LD3Threev2d;
6325     else if (Ty == S64 || Ty == P0)
6326       Opc = AArch64::LD1Threev1d;
6327     else
6328       llvm_unreachable("Unexpected type for ld3!");
6329     selectVectorLoadIntrinsic(Opc, 3, I);
6330     break;
6331   }
6332   case Intrinsic::aarch64_neon_ld3lane: {
6333     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6334     unsigned Opc;
6335     if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
6336       Opc = AArch64::LD3i8;
6337     else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16))
6338       Opc = AArch64::LD3i16;
6339     else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32))
6340       Opc = AArch64::LD3i32;
6341     else if (Ty == LLT::fixed_vector(2, S64) ||
6342              Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
6343       Opc = AArch64::LD3i64;
6344     else
6345       llvm_unreachable("Unexpected type for st3lane!");
6346     if (!selectVectorLoadLaneIntrinsic(Opc, 3, I))
6347       return false;
6348     break;
6349   }
6350   case Intrinsic::aarch64_neon_ld3r: {
6351     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6352     unsigned Opc = 0;
6353     if (Ty == LLT::fixed_vector(8, S8))
6354       Opc = AArch64::LD3Rv8b;
6355     else if (Ty == LLT::fixed_vector(16, S8))
6356       Opc = AArch64::LD3Rv16b;
6357     else if (Ty == LLT::fixed_vector(4, S16))
6358       Opc = AArch64::LD3Rv4h;
6359     else if (Ty == LLT::fixed_vector(8, S16))
6360       Opc = AArch64::LD3Rv8h;
6361     else if (Ty == LLT::fixed_vector(2, S32))
6362       Opc = AArch64::LD3Rv2s;
6363     else if (Ty == LLT::fixed_vector(4, S32))
6364       Opc = AArch64::LD3Rv4s;
6365     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6366       Opc = AArch64::LD3Rv2d;
6367     else if (Ty == S64 || Ty == P0)
6368       Opc = AArch64::LD3Rv1d;
6369     else
6370       llvm_unreachable("Unexpected type for ld3r!");
6371     selectVectorLoadIntrinsic(Opc, 3, I);
6372     break;
6373   }
6374   case Intrinsic::aarch64_neon_ld4: {
6375     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6376     unsigned Opc = 0;
6377     if (Ty == LLT::fixed_vector(8, S8))
6378       Opc = AArch64::LD4Fourv8b;
6379     else if (Ty == LLT::fixed_vector(16, S8))
6380       Opc = AArch64::LD4Fourv16b;
6381     else if (Ty == LLT::fixed_vector(4, S16))
6382       Opc = AArch64::LD4Fourv4h;
6383     else if (Ty == LLT::fixed_vector(8, S16))
6384       Opc = AArch64::LD4Fourv8h;
6385     else if (Ty == LLT::fixed_vector(2, S32))
6386       Opc = AArch64::LD4Fourv2s;
6387     else if (Ty == LLT::fixed_vector(4, S32))
6388       Opc = AArch64::LD4Fourv4s;
6389     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6390       Opc = AArch64::LD4Fourv2d;
6391     else if (Ty == S64 || Ty == P0)
6392       Opc = AArch64::LD1Fourv1d;
6393     else
6394       llvm_unreachable("Unexpected type for ld4!");
6395     selectVectorLoadIntrinsic(Opc, 4, I);
6396     break;
6397   }
6398   case Intrinsic::aarch64_neon_ld4lane: {
6399     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6400     unsigned Opc;
6401     if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
6402       Opc = AArch64::LD4i8;
6403     else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16))
6404       Opc = AArch64::LD4i16;
6405     else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32))
6406       Opc = AArch64::LD4i32;
6407     else if (Ty == LLT::fixed_vector(2, S64) ||
6408              Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
6409       Opc = AArch64::LD4i64;
6410     else
6411       llvm_unreachable("Unexpected type for st4lane!");
6412     if (!selectVectorLoadLaneIntrinsic(Opc, 4, I))
6413       return false;
6414     break;
6415   }
6416   case Intrinsic::aarch64_neon_ld4r: {
6417     LLT Ty = MRI.getType(I.getOperand(0).getReg());
6418     unsigned Opc = 0;
6419     if (Ty == LLT::fixed_vector(8, S8))
6420       Opc = AArch64::LD4Rv8b;
6421     else if (Ty == LLT::fixed_vector(16, S8))
6422       Opc = AArch64::LD4Rv16b;
6423     else if (Ty == LLT::fixed_vector(4, S16))
6424       Opc = AArch64::LD4Rv4h;
6425     else if (Ty == LLT::fixed_vector(8, S16))
6426       Opc = AArch64::LD4Rv8h;
6427     else if (Ty == LLT::fixed_vector(2, S32))
6428       Opc = AArch64::LD4Rv2s;
6429     else if (Ty == LLT::fixed_vector(4, S32))
6430       Opc = AArch64::LD4Rv4s;
6431     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6432       Opc = AArch64::LD4Rv2d;
6433     else if (Ty == S64 || Ty == P0)
6434       Opc = AArch64::LD4Rv1d;
6435     else
6436       llvm_unreachable("Unexpected type for ld4r!");
6437     selectVectorLoadIntrinsic(Opc, 4, I);
6438     break;
6439   }
6440   case Intrinsic::aarch64_neon_st1x2: {
6441     LLT Ty = MRI.getType(I.getOperand(1).getReg());
6442     unsigned Opc;
6443     if (Ty == LLT::fixed_vector(8, S8))
6444       Opc = AArch64::ST1Twov8b;
6445     else if (Ty == LLT::fixed_vector(16, S8))
6446       Opc = AArch64::ST1Twov16b;
6447     else if (Ty == LLT::fixed_vector(4, S16))
6448       Opc = AArch64::ST1Twov4h;
6449     else if (Ty == LLT::fixed_vector(8, S16))
6450       Opc = AArch64::ST1Twov8h;
6451     else if (Ty == LLT::fixed_vector(2, S32))
6452       Opc = AArch64::ST1Twov2s;
6453     else if (Ty == LLT::fixed_vector(4, S32))
6454       Opc = AArch64::ST1Twov4s;
6455     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6456       Opc = AArch64::ST1Twov2d;
6457     else if (Ty == S64 || Ty == P0)
6458       Opc = AArch64::ST1Twov1d;
6459     else
6460       llvm_unreachable("Unexpected type for st1x2!");
6461     selectVectorStoreIntrinsic(I, 2, Opc);
6462     break;
6463   }
6464   case Intrinsic::aarch64_neon_st1x3: {
6465     LLT Ty = MRI.getType(I.getOperand(1).getReg());
6466     unsigned Opc;
6467     if (Ty == LLT::fixed_vector(8, S8))
6468       Opc = AArch64::ST1Threev8b;
6469     else if (Ty == LLT::fixed_vector(16, S8))
6470       Opc = AArch64::ST1Threev16b;
6471     else if (Ty == LLT::fixed_vector(4, S16))
6472       Opc = AArch64::ST1Threev4h;
6473     else if (Ty == LLT::fixed_vector(8, S16))
6474       Opc = AArch64::ST1Threev8h;
6475     else if (Ty == LLT::fixed_vector(2, S32))
6476       Opc = AArch64::ST1Threev2s;
6477     else if (Ty == LLT::fixed_vector(4, S32))
6478       Opc = AArch64::ST1Threev4s;
6479     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6480       Opc = AArch64::ST1Threev2d;
6481     else if (Ty == S64 || Ty == P0)
6482       Opc = AArch64::ST1Threev1d;
6483     else
6484       llvm_unreachable("Unexpected type for st1x3!");
6485     selectVectorStoreIntrinsic(I, 3, Opc);
6486     break;
6487   }
6488   case Intrinsic::aarch64_neon_st1x4: {
6489     LLT Ty = MRI.getType(I.getOperand(1).getReg());
6490     unsigned Opc;
6491     if (Ty == LLT::fixed_vector(8, S8))
6492       Opc = AArch64::ST1Fourv8b;
6493     else if (Ty == LLT::fixed_vector(16, S8))
6494       Opc = AArch64::ST1Fourv16b;
6495     else if (Ty == LLT::fixed_vector(4, S16))
6496       Opc = AArch64::ST1Fourv4h;
6497     else if (Ty == LLT::fixed_vector(8, S16))
6498       Opc = AArch64::ST1Fourv8h;
6499     else if (Ty == LLT::fixed_vector(2, S32))
6500       Opc = AArch64::ST1Fourv2s;
6501     else if (Ty == LLT::fixed_vector(4, S32))
6502       Opc = AArch64::ST1Fourv4s;
6503     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6504       Opc = AArch64::ST1Fourv2d;
6505     else if (Ty == S64 || Ty == P0)
6506       Opc = AArch64::ST1Fourv1d;
6507     else
6508       llvm_unreachable("Unexpected type for st1x4!");
6509     selectVectorStoreIntrinsic(I, 4, Opc);
6510     break;
6511   }
6512   case Intrinsic::aarch64_neon_st2: {
6513     LLT Ty = MRI.getType(I.getOperand(1).getReg());
6514     unsigned Opc;
6515     if (Ty == LLT::fixed_vector(8, S8))
6516       Opc = AArch64::ST2Twov8b;
6517     else if (Ty == LLT::fixed_vector(16, S8))
6518       Opc = AArch64::ST2Twov16b;
6519     else if (Ty == LLT::fixed_vector(4, S16))
6520       Opc = AArch64::ST2Twov4h;
6521     else if (Ty == LLT::fixed_vector(8, S16))
6522       Opc = AArch64::ST2Twov8h;
6523     else if (Ty == LLT::fixed_vector(2, S32))
6524       Opc = AArch64::ST2Twov2s;
6525     else if (Ty == LLT::fixed_vector(4, S32))
6526       Opc = AArch64::ST2Twov4s;
6527     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6528       Opc = AArch64::ST2Twov2d;
6529     else if (Ty == S64 || Ty == P0)
6530       Opc = AArch64::ST1Twov1d;
6531     else
6532       llvm_unreachable("Unexpected type for st2!");
6533     selectVectorStoreIntrinsic(I, 2, Opc);
6534     break;
6535   }
6536   case Intrinsic::aarch64_neon_st3: {
6537     LLT Ty = MRI.getType(I.getOperand(1).getReg());
6538     unsigned Opc;
6539     if (Ty == LLT::fixed_vector(8, S8))
6540       Opc = AArch64::ST3Threev8b;
6541     else if (Ty == LLT::fixed_vector(16, S8))
6542       Opc = AArch64::ST3Threev16b;
6543     else if (Ty == LLT::fixed_vector(4, S16))
6544       Opc = AArch64::ST3Threev4h;
6545     else if (Ty == LLT::fixed_vector(8, S16))
6546       Opc = AArch64::ST3Threev8h;
6547     else if (Ty == LLT::fixed_vector(2, S32))
6548       Opc = AArch64::ST3Threev2s;
6549     else if (Ty == LLT::fixed_vector(4, S32))
6550       Opc = AArch64::ST3Threev4s;
6551     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6552       Opc = AArch64::ST3Threev2d;
6553     else if (Ty == S64 || Ty == P0)
6554       Opc = AArch64::ST1Threev1d;
6555     else
6556       llvm_unreachable("Unexpected type for st3!");
6557     selectVectorStoreIntrinsic(I, 3, Opc);
6558     break;
6559   }
6560   case Intrinsic::aarch64_neon_st4: {
6561     LLT Ty = MRI.getType(I.getOperand(1).getReg());
6562     unsigned Opc;
6563     if (Ty == LLT::fixed_vector(8, S8))
6564       Opc = AArch64::ST4Fourv8b;
6565     else if (Ty == LLT::fixed_vector(16, S8))
6566       Opc = AArch64::ST4Fourv16b;
6567     else if (Ty == LLT::fixed_vector(4, S16))
6568       Opc = AArch64::ST4Fourv4h;
6569     else if (Ty == LLT::fixed_vector(8, S16))
6570       Opc = AArch64::ST4Fourv8h;
6571     else if (Ty == LLT::fixed_vector(2, S32))
6572       Opc = AArch64::ST4Fourv2s;
6573     else if (Ty == LLT::fixed_vector(4, S32))
6574       Opc = AArch64::ST4Fourv4s;
6575     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6576       Opc = AArch64::ST4Fourv2d;
6577     else if (Ty == S64 || Ty == P0)
6578       Opc = AArch64::ST1Fourv1d;
6579     else
6580       llvm_unreachable("Unexpected type for st4!");
6581     selectVectorStoreIntrinsic(I, 4, Opc);
6582     break;
6583   }
6584   case Intrinsic::aarch64_neon_st2lane: {
6585     LLT Ty = MRI.getType(I.getOperand(1).getReg());
6586     unsigned Opc;
6587     if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
6588       Opc = AArch64::ST2i8;
6589     else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16))
6590       Opc = AArch64::ST2i16;
6591     else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32))
6592       Opc = AArch64::ST2i32;
6593     else if (Ty == LLT::fixed_vector(2, S64) ||
6594              Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
6595       Opc = AArch64::ST2i64;
6596     else
6597       llvm_unreachable("Unexpected type for st2lane!");
6598     if (!selectVectorStoreLaneIntrinsic(I, 2, Opc))
6599       return false;
6600     break;
6601   }
6602   case Intrinsic::aarch64_neon_st3lane: {
6603     LLT Ty = MRI.getType(I.getOperand(1).getReg());
6604     unsigned Opc;
6605     if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
6606       Opc = AArch64::ST3i8;
6607     else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16))
6608       Opc = AArch64::ST3i16;
6609     else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32))
6610       Opc = AArch64::ST3i32;
6611     else if (Ty == LLT::fixed_vector(2, S64) ||
6612              Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
6613       Opc = AArch64::ST3i64;
6614     else
6615       llvm_unreachable("Unexpected type for st3lane!");
6616     if (!selectVectorStoreLaneIntrinsic(I, 3, Opc))
6617       return false;
6618     break;
6619   }
6620   case Intrinsic::aarch64_neon_st4lane: {
6621     LLT Ty = MRI.getType(I.getOperand(1).getReg());
6622     unsigned Opc;
6623     if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
6624       Opc = AArch64::ST4i8;
6625     else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16))
6626       Opc = AArch64::ST4i16;
6627     else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32))
6628       Opc = AArch64::ST4i32;
6629     else if (Ty == LLT::fixed_vector(2, S64) ||
6630              Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
6631       Opc = AArch64::ST4i64;
6632     else
6633       llvm_unreachable("Unexpected type for st4lane!");
6634     if (!selectVectorStoreLaneIntrinsic(I, 4, Opc))
6635       return false;
6636     break;
6637   }
6638   case Intrinsic::aarch64_mops_memset_tag: {
6639     // Transform
6640     //    %dst:gpr(p0) = \
6641     //      G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aarch64.mops.memset.tag),
6642     //      \ %dst:gpr(p0), %val:gpr(s64), %n:gpr(s64)
6643     // where %dst is updated, into
6644     //    %Rd:GPR64common, %Rn:GPR64) = \
6645     //      MOPSMemorySetTaggingPseudo \
6646     //      %Rd:GPR64common, %Rn:GPR64, %Rm:GPR64
6647     // where Rd and Rn are tied.
6648     // It is expected that %val has been extended to s64 in legalization.
6649     // Note that the order of the size/value operands are swapped.
6650 
6651     Register DstDef = I.getOperand(0).getReg();
6652     // I.getOperand(1) is the intrinsic function
6653     Register DstUse = I.getOperand(2).getReg();
6654     Register ValUse = I.getOperand(3).getReg();
6655     Register SizeUse = I.getOperand(4).getReg();
6656 
6657     // MOPSMemorySetTaggingPseudo has two defs; the intrinsic call has only one.
6658     // Therefore an additional virtual register is required for the updated size
6659     // operand. This value is not accessible via the semantics of the intrinsic.
6660     Register SizeDef = MRI.createGenericVirtualRegister(LLT::scalar(64));
6661 
6662     auto Memset = MIB.buildInstr(AArch64::MOPSMemorySetTaggingPseudo,
6663                                  {DstDef, SizeDef}, {DstUse, SizeUse, ValUse});
6664     Memset.cloneMemRefs(I);
6665     constrainSelectedInstRegOperands(*Memset, TII, TRI, RBI);
6666     break;
6667   }
6668   }
6669 
6670   I.eraseFromParent();
6671   return true;
6672 }
6673 
selectIntrinsic(MachineInstr & I,MachineRegisterInfo & MRI)6674 bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I,
6675                                                  MachineRegisterInfo &MRI) {
6676   unsigned IntrinID = cast<GIntrinsic>(I).getIntrinsicID();
6677 
6678   switch (IntrinID) {
6679   default:
6680     break;
6681   case Intrinsic::aarch64_crypto_sha1h: {
6682     Register DstReg = I.getOperand(0).getReg();
6683     Register SrcReg = I.getOperand(2).getReg();
6684 
6685     // FIXME: Should this be an assert?
6686     if (MRI.getType(DstReg).getSizeInBits() != 32 ||
6687         MRI.getType(SrcReg).getSizeInBits() != 32)
6688       return false;
6689 
6690     // The operation has to happen on FPRs. Set up some new FPR registers for
6691     // the source and destination if they are on GPRs.
6692     if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) {
6693       SrcReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
6694       MIB.buildCopy({SrcReg}, {I.getOperand(2)});
6695 
6696       // Make sure the copy ends up getting constrained properly.
6697       RBI.constrainGenericRegister(I.getOperand(2).getReg(),
6698                                    AArch64::GPR32RegClass, MRI);
6699     }
6700 
6701     if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID)
6702       DstReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
6703 
6704     // Actually insert the instruction.
6705     auto SHA1Inst = MIB.buildInstr(AArch64::SHA1Hrr, {DstReg}, {SrcReg});
6706     constrainSelectedInstRegOperands(*SHA1Inst, TII, TRI, RBI);
6707 
6708     // Did we create a new register for the destination?
6709     if (DstReg != I.getOperand(0).getReg()) {
6710       // Yep. Copy the result of the instruction back into the original
6711       // destination.
6712       MIB.buildCopy({I.getOperand(0)}, {DstReg});
6713       RBI.constrainGenericRegister(I.getOperand(0).getReg(),
6714                                    AArch64::GPR32RegClass, MRI);
6715     }
6716 
6717     I.eraseFromParent();
6718     return true;
6719   }
6720   case Intrinsic::ptrauth_resign: {
6721     Register DstReg = I.getOperand(0).getReg();
6722     Register ValReg = I.getOperand(2).getReg();
6723     uint64_t AUTKey = I.getOperand(3).getImm();
6724     Register AUTDisc = I.getOperand(4).getReg();
6725     uint64_t PACKey = I.getOperand(5).getImm();
6726     Register PACDisc = I.getOperand(6).getReg();
6727 
6728     Register AUTAddrDisc = AUTDisc;
6729     uint16_t AUTConstDiscC = 0;
6730     std::tie(AUTConstDiscC, AUTAddrDisc) =
6731         extractPtrauthBlendDiscriminators(AUTDisc, MRI);
6732 
6733     Register PACAddrDisc = PACDisc;
6734     uint16_t PACConstDiscC = 0;
6735     std::tie(PACConstDiscC, PACAddrDisc) =
6736         extractPtrauthBlendDiscriminators(PACDisc, MRI);
6737 
6738     MIB.buildCopy({AArch64::X16}, {ValReg});
6739     MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {AArch64::X17}, {});
6740     MIB.buildInstr(AArch64::AUTPAC)
6741         .addImm(AUTKey)
6742         .addImm(AUTConstDiscC)
6743         .addUse(AUTAddrDisc)
6744         .addImm(PACKey)
6745         .addImm(PACConstDiscC)
6746         .addUse(PACAddrDisc)
6747         .constrainAllUses(TII, TRI, RBI);
6748     MIB.buildCopy({DstReg}, Register(AArch64::X16));
6749 
6750     RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI);
6751     I.eraseFromParent();
6752     return true;
6753   }
6754   case Intrinsic::ptrauth_auth: {
6755     Register DstReg = I.getOperand(0).getReg();
6756     Register ValReg = I.getOperand(2).getReg();
6757     uint64_t AUTKey = I.getOperand(3).getImm();
6758     Register AUTDisc = I.getOperand(4).getReg();
6759 
6760     Register AUTAddrDisc = AUTDisc;
6761     uint16_t AUTConstDiscC = 0;
6762     std::tie(AUTConstDiscC, AUTAddrDisc) =
6763         extractPtrauthBlendDiscriminators(AUTDisc, MRI);
6764 
6765     if (STI.isX16X17Safer()) {
6766       MIB.buildCopy({AArch64::X16}, {ValReg});
6767       MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {AArch64::X17}, {});
6768       MIB.buildInstr(AArch64::AUTx16x17)
6769           .addImm(AUTKey)
6770           .addImm(AUTConstDiscC)
6771           .addUse(AUTAddrDisc)
6772           .constrainAllUses(TII, TRI, RBI);
6773       MIB.buildCopy({DstReg}, Register(AArch64::X16));
6774     } else {
6775       Register ScratchReg =
6776           MRI.createVirtualRegister(&AArch64::GPR64commonRegClass);
6777       MIB.buildInstr(AArch64::AUTxMxN)
6778           .addDef(DstReg)
6779           .addDef(ScratchReg)
6780           .addUse(ValReg)
6781           .addImm(AUTKey)
6782           .addImm(AUTConstDiscC)
6783           .addUse(AUTAddrDisc)
6784           .constrainAllUses(TII, TRI, RBI);
6785     }
6786 
6787     RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI);
6788     I.eraseFromParent();
6789     return true;
6790   }
6791   case Intrinsic::frameaddress:
6792   case Intrinsic::returnaddress: {
6793     MachineFunction &MF = *I.getParent()->getParent();
6794     MachineFrameInfo &MFI = MF.getFrameInfo();
6795 
6796     unsigned Depth = I.getOperand(2).getImm();
6797     Register DstReg = I.getOperand(0).getReg();
6798     RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI);
6799 
6800     if (Depth == 0 && IntrinID == Intrinsic::returnaddress) {
6801       if (!MFReturnAddr) {
6802         // Insert the copy from LR/X30 into the entry block, before it can be
6803         // clobbered by anything.
6804         MFI.setReturnAddressIsTaken(true);
6805         MFReturnAddr = getFunctionLiveInPhysReg(
6806             MF, TII, AArch64::LR, AArch64::GPR64RegClass, I.getDebugLoc());
6807       }
6808 
6809       if (STI.hasPAuth()) {
6810         MIB.buildInstr(AArch64::XPACI, {DstReg}, {MFReturnAddr});
6811       } else {
6812         MIB.buildCopy({Register(AArch64::LR)}, {MFReturnAddr});
6813         MIB.buildInstr(AArch64::XPACLRI);
6814         MIB.buildCopy({DstReg}, {Register(AArch64::LR)});
6815       }
6816 
6817       I.eraseFromParent();
6818       return true;
6819     }
6820 
6821     MFI.setFrameAddressIsTaken(true);
6822     Register FrameAddr(AArch64::FP);
6823     while (Depth--) {
6824       Register NextFrame = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
6825       auto Ldr =
6826           MIB.buildInstr(AArch64::LDRXui, {NextFrame}, {FrameAddr}).addImm(0);
6827       constrainSelectedInstRegOperands(*Ldr, TII, TRI, RBI);
6828       FrameAddr = NextFrame;
6829     }
6830 
6831     if (IntrinID == Intrinsic::frameaddress)
6832       MIB.buildCopy({DstReg}, {FrameAddr});
6833     else {
6834       MFI.setReturnAddressIsTaken(true);
6835 
6836       if (STI.hasPAuth()) {
6837         Register TmpReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
6838         MIB.buildInstr(AArch64::LDRXui, {TmpReg}, {FrameAddr}).addImm(1);
6839         MIB.buildInstr(AArch64::XPACI, {DstReg}, {TmpReg});
6840       } else {
6841         MIB.buildInstr(AArch64::LDRXui, {Register(AArch64::LR)}, {FrameAddr})
6842             .addImm(1);
6843         MIB.buildInstr(AArch64::XPACLRI);
6844         MIB.buildCopy({DstReg}, {Register(AArch64::LR)});
6845       }
6846     }
6847 
6848     I.eraseFromParent();
6849     return true;
6850   }
6851   case Intrinsic::aarch64_neon_tbl2:
6852     SelectTable(I, MRI, 2, AArch64::TBLv8i8Two, AArch64::TBLv16i8Two, false);
6853     return true;
6854   case Intrinsic::aarch64_neon_tbl3:
6855     SelectTable(I, MRI, 3, AArch64::TBLv8i8Three, AArch64::TBLv16i8Three,
6856                 false);
6857     return true;
6858   case Intrinsic::aarch64_neon_tbl4:
6859     SelectTable(I, MRI, 4, AArch64::TBLv8i8Four, AArch64::TBLv16i8Four, false);
6860     return true;
6861   case Intrinsic::aarch64_neon_tbx2:
6862     SelectTable(I, MRI, 2, AArch64::TBXv8i8Two, AArch64::TBXv16i8Two, true);
6863     return true;
6864   case Intrinsic::aarch64_neon_tbx3:
6865     SelectTable(I, MRI, 3, AArch64::TBXv8i8Three, AArch64::TBXv16i8Three, true);
6866     return true;
6867   case Intrinsic::aarch64_neon_tbx4:
6868     SelectTable(I, MRI, 4, AArch64::TBXv8i8Four, AArch64::TBXv16i8Four, true);
6869     return true;
6870   case Intrinsic::swift_async_context_addr:
6871     auto Sub = MIB.buildInstr(AArch64::SUBXri, {I.getOperand(0).getReg()},
6872                               {Register(AArch64::FP)})
6873                    .addImm(8)
6874                    .addImm(0);
6875     constrainSelectedInstRegOperands(*Sub, TII, TRI, RBI);
6876 
6877     MF->getFrameInfo().setFrameAddressIsTaken(true);
6878     MF->getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
6879     I.eraseFromParent();
6880     return true;
6881   }
6882   return false;
6883 }
6884 
6885 // G_PTRAUTH_GLOBAL_VALUE lowering
6886 //
6887 // We have 3 lowering alternatives to choose from:
6888 // - MOVaddrPAC: similar to MOVaddr, with added PAC.
6889 //   If the GV doesn't need a GOT load (i.e., is locally defined)
6890 //   materialize the pointer using adrp+add+pac. See LowerMOVaddrPAC.
6891 //
6892 // - LOADgotPAC: similar to LOADgot, with added PAC.
6893 //   If the GV needs a GOT load, materialize the pointer using the usual
6894 //   GOT adrp+ldr, +pac. Pointers in GOT are assumed to be not signed, the GOT
6895 //   section is assumed to be read-only (for example, via relro mechanism). See
6896 //   LowerMOVaddrPAC.
6897 //
6898 // - LOADauthptrstatic: similar to LOADgot, but use a
6899 //   special stub slot instead of a GOT slot.
6900 //   Load a signed pointer for symbol 'sym' from a stub slot named
6901 //   'sym$auth_ptr$key$disc' filled by dynamic linker during relocation
6902 //   resolving. This usually lowers to adrp+ldr, but also emits an entry into
6903 //   .data with an
6904 //   @AUTH relocation. See LowerLOADauthptrstatic.
6905 //
6906 // All 3 are pseudos that are expand late to longer sequences: this lets us
6907 // provide integrity guarantees on the to-be-signed intermediate values.
6908 //
6909 // LOADauthptrstatic is undesirable because it requires a large section filled
6910 // with often similarly-signed pointers, making it a good harvesting target.
6911 // Thus, it's only used for ptrauth references to extern_weak to avoid null
6912 // checks.
6913 
selectPtrAuthGlobalValue(MachineInstr & I,MachineRegisterInfo & MRI) const6914 bool AArch64InstructionSelector::selectPtrAuthGlobalValue(
6915     MachineInstr &I, MachineRegisterInfo &MRI) const {
6916   Register DefReg = I.getOperand(0).getReg();
6917   Register Addr = I.getOperand(1).getReg();
6918   uint64_t Key = I.getOperand(2).getImm();
6919   Register AddrDisc = I.getOperand(3).getReg();
6920   uint64_t Disc = I.getOperand(4).getImm();
6921   int64_t Offset = 0;
6922 
6923   if (Key > AArch64PACKey::LAST)
6924     report_fatal_error("key in ptrauth global out of range [0, " +
6925                        Twine((int)AArch64PACKey::LAST) + "]");
6926 
6927   // Blend only works if the integer discriminator is 16-bit wide.
6928   if (!isUInt<16>(Disc))
6929     report_fatal_error(
6930         "constant discriminator in ptrauth global out of range [0, 0xffff]");
6931 
6932   // Choosing between 3 lowering alternatives is target-specific.
6933   if (!STI.isTargetELF() && !STI.isTargetMachO())
6934     report_fatal_error("ptrauth global lowering only supported on MachO/ELF");
6935 
6936   if (!MRI.hasOneDef(Addr))
6937     return false;
6938 
6939   // First match any offset we take from the real global.
6940   const MachineInstr *DefMI = &*MRI.def_instr_begin(Addr);
6941   if (DefMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
6942     Register OffsetReg = DefMI->getOperand(2).getReg();
6943     if (!MRI.hasOneDef(OffsetReg))
6944       return false;
6945     const MachineInstr &OffsetMI = *MRI.def_instr_begin(OffsetReg);
6946     if (OffsetMI.getOpcode() != TargetOpcode::G_CONSTANT)
6947       return false;
6948 
6949     Addr = DefMI->getOperand(1).getReg();
6950     if (!MRI.hasOneDef(Addr))
6951       return false;
6952 
6953     DefMI = &*MRI.def_instr_begin(Addr);
6954     Offset = OffsetMI.getOperand(1).getCImm()->getSExtValue();
6955   }
6956 
6957   // We should be left with a genuine unauthenticated GlobalValue.
6958   const GlobalValue *GV;
6959   if (DefMI->getOpcode() == TargetOpcode::G_GLOBAL_VALUE) {
6960     GV = DefMI->getOperand(1).getGlobal();
6961     Offset += DefMI->getOperand(1).getOffset();
6962   } else if (DefMI->getOpcode() == AArch64::G_ADD_LOW) {
6963     GV = DefMI->getOperand(2).getGlobal();
6964     Offset += DefMI->getOperand(2).getOffset();
6965   } else {
6966     return false;
6967   }
6968 
6969   MachineIRBuilder MIB(I);
6970 
6971   // Classify the reference to determine whether it needs a GOT load.
6972   unsigned OpFlags = STI.ClassifyGlobalReference(GV, TM);
6973   const bool NeedsGOTLoad = ((OpFlags & AArch64II::MO_GOT) != 0);
6974   assert(((OpFlags & (~AArch64II::MO_GOT)) == 0) &&
6975          "unsupported non-GOT op flags on ptrauth global reference");
6976   assert((!GV->hasExternalWeakLinkage() || NeedsGOTLoad) &&
6977          "unsupported non-GOT reference to weak ptrauth global");
6978 
6979   std::optional<APInt> AddrDiscVal = getIConstantVRegVal(AddrDisc, MRI);
6980   bool HasAddrDisc = !AddrDiscVal || *AddrDiscVal != 0;
6981 
6982   // Non-extern_weak:
6983   // - No GOT load needed -> MOVaddrPAC
6984   // - GOT load for non-extern_weak -> LOADgotPAC
6985   //   Note that we disallow extern_weak refs to avoid null checks later.
6986   if (!GV->hasExternalWeakLinkage()) {
6987     MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {AArch64::X16}, {});
6988     MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {AArch64::X17}, {});
6989     MIB.buildInstr(NeedsGOTLoad ? AArch64::LOADgotPAC : AArch64::MOVaddrPAC)
6990         .addGlobalAddress(GV, Offset)
6991         .addImm(Key)
6992         .addReg(HasAddrDisc ? AddrDisc : AArch64::XZR)
6993         .addImm(Disc)
6994         .constrainAllUses(TII, TRI, RBI);
6995     MIB.buildCopy(DefReg, Register(AArch64::X16));
6996     RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI);
6997     I.eraseFromParent();
6998     return true;
6999   }
7000 
7001   // extern_weak -> LOADauthptrstatic
7002 
7003   // Offsets and extern_weak don't mix well: ptrauth aside, you'd get the
7004   // offset alone as a pointer if the symbol wasn't available, which would
7005   // probably break null checks in users. Ptrauth complicates things further:
7006   // error out.
7007   if (Offset != 0)
7008     report_fatal_error(
7009         "unsupported non-zero offset in weak ptrauth global reference");
7010 
7011   if (HasAddrDisc)
7012     report_fatal_error("unsupported weak addr-div ptrauth global");
7013 
7014   MIB.buildInstr(AArch64::LOADauthptrstatic, {DefReg}, {})
7015       .addGlobalAddress(GV, Offset)
7016       .addImm(Key)
7017       .addImm(Disc);
7018   RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI);
7019 
7020   I.eraseFromParent();
7021   return true;
7022 }
7023 
SelectTable(MachineInstr & I,MachineRegisterInfo & MRI,unsigned NumVec,unsigned Opc1,unsigned Opc2,bool isExt)7024 void AArch64InstructionSelector::SelectTable(MachineInstr &I,
7025                                              MachineRegisterInfo &MRI,
7026                                              unsigned NumVec, unsigned Opc1,
7027                                              unsigned Opc2, bool isExt) {
7028   Register DstReg = I.getOperand(0).getReg();
7029   unsigned Opc = MRI.getType(DstReg) == LLT::fixed_vector(8, 8) ? Opc1 : Opc2;
7030 
7031   // Create the REG_SEQUENCE
7032   SmallVector<Register, 4> Regs;
7033   for (unsigned i = 0; i < NumVec; i++)
7034     Regs.push_back(I.getOperand(i + 2 + isExt).getReg());
7035   Register RegSeq = createQTuple(Regs, MIB);
7036 
7037   Register IdxReg = I.getOperand(2 + NumVec + isExt).getReg();
7038   MachineInstrBuilder Instr;
7039   if (isExt) {
7040     Register Reg = I.getOperand(2).getReg();
7041     Instr = MIB.buildInstr(Opc, {DstReg}, {Reg, RegSeq, IdxReg});
7042   } else
7043     Instr = MIB.buildInstr(Opc, {DstReg}, {RegSeq, IdxReg});
7044   constrainSelectedInstRegOperands(*Instr, TII, TRI, RBI);
7045   I.eraseFromParent();
7046 }
7047 
7048 InstructionSelector::ComplexRendererFns
selectShiftA_32(const MachineOperand & Root) const7049 AArch64InstructionSelector::selectShiftA_32(const MachineOperand &Root) const {
7050   auto MaybeImmed = getImmedFromMO(Root);
7051   if (MaybeImmed == std::nullopt || *MaybeImmed > 31)
7052     return std::nullopt;
7053   uint64_t Enc = (32 - *MaybeImmed) & 0x1f;
7054   return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
7055 }
7056 
7057 InstructionSelector::ComplexRendererFns
selectShiftB_32(const MachineOperand & Root) const7058 AArch64InstructionSelector::selectShiftB_32(const MachineOperand &Root) const {
7059   auto MaybeImmed = getImmedFromMO(Root);
7060   if (MaybeImmed == std::nullopt || *MaybeImmed > 31)
7061     return std::nullopt;
7062   uint64_t Enc = 31 - *MaybeImmed;
7063   return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
7064 }
7065 
7066 InstructionSelector::ComplexRendererFns
selectShiftA_64(const MachineOperand & Root) const7067 AArch64InstructionSelector::selectShiftA_64(const MachineOperand &Root) const {
7068   auto MaybeImmed = getImmedFromMO(Root);
7069   if (MaybeImmed == std::nullopt || *MaybeImmed > 63)
7070     return std::nullopt;
7071   uint64_t Enc = (64 - *MaybeImmed) & 0x3f;
7072   return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
7073 }
7074 
7075 InstructionSelector::ComplexRendererFns
selectShiftB_64(const MachineOperand & Root) const7076 AArch64InstructionSelector::selectShiftB_64(const MachineOperand &Root) const {
7077   auto MaybeImmed = getImmedFromMO(Root);
7078   if (MaybeImmed == std::nullopt || *MaybeImmed > 63)
7079     return std::nullopt;
7080   uint64_t Enc = 63 - *MaybeImmed;
7081   return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
7082 }
7083 
7084 /// Helper to select an immediate value that can be represented as a 12-bit
7085 /// value shifted left by either 0 or 12. If it is possible to do so, return
7086 /// the immediate and shift value. If not, return std::nullopt.
7087 ///
7088 /// Used by selectArithImmed and selectNegArithImmed.
7089 InstructionSelector::ComplexRendererFns
select12BitValueWithLeftShift(uint64_t Immed) const7090 AArch64InstructionSelector::select12BitValueWithLeftShift(
7091     uint64_t Immed) const {
7092   unsigned ShiftAmt;
7093   if (Immed >> 12 == 0) {
7094     ShiftAmt = 0;
7095   } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) {
7096     ShiftAmt = 12;
7097     Immed = Immed >> 12;
7098   } else
7099     return std::nullopt;
7100 
7101   unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt);
7102   return {{
7103       [=](MachineInstrBuilder &MIB) { MIB.addImm(Immed); },
7104       [=](MachineInstrBuilder &MIB) { MIB.addImm(ShVal); },
7105   }};
7106 }
7107 
7108 /// SelectArithImmed - Select an immediate value that can be represented as
7109 /// a 12-bit value shifted left by either 0 or 12.  If so, return true with
7110 /// Val set to the 12-bit value and Shift set to the shifter operand.
7111 InstructionSelector::ComplexRendererFns
selectArithImmed(MachineOperand & Root) const7112 AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const {
7113   // This function is called from the addsub_shifted_imm ComplexPattern,
7114   // which lists [imm] as the list of opcode it's interested in, however
7115   // we still need to check whether the operand is actually an immediate
7116   // here because the ComplexPattern opcode list is only used in
7117   // root-level opcode matching.
7118   auto MaybeImmed = getImmedFromMO(Root);
7119   if (MaybeImmed == std::nullopt)
7120     return std::nullopt;
7121   return select12BitValueWithLeftShift(*MaybeImmed);
7122 }
7123 
7124 /// SelectNegArithImmed - As above, but negates the value before trying to
7125 /// select it.
7126 InstructionSelector::ComplexRendererFns
selectNegArithImmed(MachineOperand & Root) const7127 AArch64InstructionSelector::selectNegArithImmed(MachineOperand &Root) const {
7128   // We need a register here, because we need to know if we have a 64 or 32
7129   // bit immediate.
7130   if (!Root.isReg())
7131     return std::nullopt;
7132   auto MaybeImmed = getImmedFromMO(Root);
7133   if (MaybeImmed == std::nullopt)
7134     return std::nullopt;
7135   uint64_t Immed = *MaybeImmed;
7136 
7137   // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0"
7138   // have the opposite effect on the C flag, so this pattern mustn't match under
7139   // those circumstances.
7140   if (Immed == 0)
7141     return std::nullopt;
7142 
7143   // Check if we're dealing with a 32-bit type on the root or a 64-bit type on
7144   // the root.
7145   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7146   if (MRI.getType(Root.getReg()).getSizeInBits() == 32)
7147     Immed = ~((uint32_t)Immed) + 1;
7148   else
7149     Immed = ~Immed + 1ULL;
7150 
7151   if (Immed & 0xFFFFFFFFFF000000ULL)
7152     return std::nullopt;
7153 
7154   Immed &= 0xFFFFFFULL;
7155   return select12BitValueWithLeftShift(Immed);
7156 }
7157 
7158 /// Checks if we are sure that folding MI into load/store addressing mode is
7159 /// beneficial or not.
7160 ///
7161 /// Returns:
7162 /// - true if folding MI would be beneficial.
7163 /// - false if folding MI would be bad.
7164 /// - std::nullopt if it is not sure whether folding MI is beneficial.
7165 ///
7166 /// \p MI can be the offset operand of G_PTR_ADD, e.g. G_SHL in the example:
7167 ///
7168 /// %13:gpr(s64) = G_CONSTANT i64 1
7169 /// %8:gpr(s64) = G_SHL %6, %13(s64)
7170 /// %9:gpr(p0) = G_PTR_ADD %0, %8(s64)
7171 /// %12:gpr(s32) = G_LOAD %9(p0) :: (load (s16))
isWorthFoldingIntoAddrMode(MachineInstr & MI,const MachineRegisterInfo & MRI) const7172 std::optional<bool> AArch64InstructionSelector::isWorthFoldingIntoAddrMode(
7173     MachineInstr &MI, const MachineRegisterInfo &MRI) const {
7174   if (MI.getOpcode() == AArch64::G_SHL) {
7175     // Address operands with shifts are free, except for running on subtargets
7176     // with AddrLSLSlow14.
7177     if (const auto ValAndVeg = getIConstantVRegValWithLookThrough(
7178             MI.getOperand(2).getReg(), MRI)) {
7179       const APInt ShiftVal = ValAndVeg->Value;
7180 
7181       // Don't fold if we know this will be slow.
7182       return !(STI.hasAddrLSLSlow14() && (ShiftVal == 1 || ShiftVal == 4));
7183     }
7184   }
7185   return std::nullopt;
7186 }
7187 
7188 /// Return true if it is worth folding MI into an extended register. That is,
7189 /// if it's safe to pull it into the addressing mode of a load or store as a
7190 /// shift.
7191 /// \p IsAddrOperand whether the def of MI is used as an address operand
7192 /// (e.g. feeding into an LDR/STR).
isWorthFoldingIntoExtendedReg(MachineInstr & MI,const MachineRegisterInfo & MRI,bool IsAddrOperand) const7193 bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg(
7194     MachineInstr &MI, const MachineRegisterInfo &MRI,
7195     bool IsAddrOperand) const {
7196 
7197   // Always fold if there is one use, or if we're optimizing for size.
7198   Register DefReg = MI.getOperand(0).getReg();
7199   if (MRI.hasOneNonDBGUse(DefReg) ||
7200       MI.getParent()->getParent()->getFunction().hasOptSize())
7201     return true;
7202 
7203   if (IsAddrOperand) {
7204     // If we are already sure that folding MI is good or bad, return the result.
7205     if (const auto Worth = isWorthFoldingIntoAddrMode(MI, MRI))
7206       return *Worth;
7207 
7208     // Fold G_PTR_ADD if its offset operand can be folded
7209     if (MI.getOpcode() == AArch64::G_PTR_ADD) {
7210       MachineInstr *OffsetInst =
7211           getDefIgnoringCopies(MI.getOperand(2).getReg(), MRI);
7212 
7213       // Note, we already know G_PTR_ADD is used by at least two instructions.
7214       // If we are also sure about whether folding is beneficial or not,
7215       // return the result.
7216       if (const auto Worth = isWorthFoldingIntoAddrMode(*OffsetInst, MRI))
7217         return *Worth;
7218     }
7219   }
7220 
7221   // FIXME: Consider checking HasALULSLFast as appropriate.
7222 
7223   // We have a fastpath, so folding a shift in and potentially computing it
7224   // many times may be beneficial. Check if this is only used in memory ops.
7225   // If it is, then we should fold.
7226   return all_of(MRI.use_nodbg_instructions(DefReg),
7227                 [](MachineInstr &Use) { return Use.mayLoadOrStore(); });
7228 }
7229 
isSignExtendShiftType(AArch64_AM::ShiftExtendType Type)7230 static bool isSignExtendShiftType(AArch64_AM::ShiftExtendType Type) {
7231   switch (Type) {
7232   case AArch64_AM::SXTB:
7233   case AArch64_AM::SXTH:
7234   case AArch64_AM::SXTW:
7235     return true;
7236   default:
7237     return false;
7238   }
7239 }
7240 
7241 InstructionSelector::ComplexRendererFns
selectExtendedSHL(MachineOperand & Root,MachineOperand & Base,MachineOperand & Offset,unsigned SizeInBytes,bool WantsExt) const7242 AArch64InstructionSelector::selectExtendedSHL(
7243     MachineOperand &Root, MachineOperand &Base, MachineOperand &Offset,
7244     unsigned SizeInBytes, bool WantsExt) const {
7245   assert(Base.isReg() && "Expected base to be a register operand");
7246   assert(Offset.isReg() && "Expected offset to be a register operand");
7247 
7248   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7249   MachineInstr *OffsetInst = MRI.getVRegDef(Offset.getReg());
7250 
7251   unsigned OffsetOpc = OffsetInst->getOpcode();
7252   bool LookedThroughZExt = false;
7253   if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL) {
7254     // Try to look through a ZEXT.
7255     if (OffsetOpc != TargetOpcode::G_ZEXT || !WantsExt)
7256       return std::nullopt;
7257 
7258     OffsetInst = MRI.getVRegDef(OffsetInst->getOperand(1).getReg());
7259     OffsetOpc = OffsetInst->getOpcode();
7260     LookedThroughZExt = true;
7261 
7262     if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL)
7263       return std::nullopt;
7264   }
7265   // Make sure that the memory op is a valid size.
7266   int64_t LegalShiftVal = Log2_32(SizeInBytes);
7267   if (LegalShiftVal == 0)
7268     return std::nullopt;
7269   if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI, true))
7270     return std::nullopt;
7271 
7272   // Now, try to find the specific G_CONSTANT. Start by assuming that the
7273   // register we will offset is the LHS, and the register containing the
7274   // constant is the RHS.
7275   Register OffsetReg = OffsetInst->getOperand(1).getReg();
7276   Register ConstantReg = OffsetInst->getOperand(2).getReg();
7277   auto ValAndVReg = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
7278   if (!ValAndVReg) {
7279     // We didn't get a constant on the RHS. If the opcode is a shift, then
7280     // we're done.
7281     if (OffsetOpc == TargetOpcode::G_SHL)
7282       return std::nullopt;
7283 
7284     // If we have a G_MUL, we can use either register. Try looking at the RHS.
7285     std::swap(OffsetReg, ConstantReg);
7286     ValAndVReg = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
7287     if (!ValAndVReg)
7288       return std::nullopt;
7289   }
7290 
7291   // The value must fit into 3 bits, and must be positive. Make sure that is
7292   // true.
7293   int64_t ImmVal = ValAndVReg->Value.getSExtValue();
7294 
7295   // Since we're going to pull this into a shift, the constant value must be
7296   // a power of 2. If we got a multiply, then we need to check this.
7297   if (OffsetOpc == TargetOpcode::G_MUL) {
7298     if (!llvm::has_single_bit<uint32_t>(ImmVal))
7299       return std::nullopt;
7300 
7301     // Got a power of 2. So, the amount we'll shift is the log base-2 of that.
7302     ImmVal = Log2_32(ImmVal);
7303   }
7304 
7305   if ((ImmVal & 0x7) != ImmVal)
7306     return std::nullopt;
7307 
7308   // We are only allowed to shift by LegalShiftVal. This shift value is built
7309   // into the instruction, so we can't just use whatever we want.
7310   if (ImmVal != LegalShiftVal)
7311     return std::nullopt;
7312 
7313   unsigned SignExtend = 0;
7314   if (WantsExt) {
7315     // Check if the offset is defined by an extend, unless we looked through a
7316     // G_ZEXT earlier.
7317     if (!LookedThroughZExt) {
7318       MachineInstr *ExtInst = getDefIgnoringCopies(OffsetReg, MRI);
7319       auto Ext = getExtendTypeForInst(*ExtInst, MRI, true);
7320       if (Ext == AArch64_AM::InvalidShiftExtend)
7321         return std::nullopt;
7322 
7323       SignExtend = isSignExtendShiftType(Ext) ? 1 : 0;
7324       // We only support SXTW for signed extension here.
7325       if (SignExtend && Ext != AArch64_AM::SXTW)
7326         return std::nullopt;
7327       OffsetReg = ExtInst->getOperand(1).getReg();
7328     }
7329 
7330     // Need a 32-bit wide register here.
7331     MachineIRBuilder MIB(*MRI.getVRegDef(Root.getReg()));
7332     OffsetReg = moveScalarRegClass(OffsetReg, AArch64::GPR32RegClass, MIB);
7333   }
7334 
7335   // We can use the LHS of the GEP as the base, and the LHS of the shift as an
7336   // offset. Signify that we are shifting by setting the shift flag to 1.
7337   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(Base.getReg()); },
7338            [=](MachineInstrBuilder &MIB) { MIB.addUse(OffsetReg); },
7339            [=](MachineInstrBuilder &MIB) {
7340              // Need to add both immediates here to make sure that they are both
7341              // added to the instruction.
7342              MIB.addImm(SignExtend);
7343              MIB.addImm(1);
7344            }}};
7345 }
7346 
7347 /// This is used for computing addresses like this:
7348 ///
7349 /// ldr x1, [x2, x3, lsl #3]
7350 ///
7351 /// Where x2 is the base register, and x3 is an offset register. The shift-left
7352 /// is a constant value specific to this load instruction. That is, we'll never
7353 /// see anything other than a 3 here (which corresponds to the size of the
7354 /// element being loaded.)
7355 InstructionSelector::ComplexRendererFns
selectAddrModeShiftedExtendXReg(MachineOperand & Root,unsigned SizeInBytes) const7356 AArch64InstructionSelector::selectAddrModeShiftedExtendXReg(
7357     MachineOperand &Root, unsigned SizeInBytes) const {
7358   if (!Root.isReg())
7359     return std::nullopt;
7360   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7361 
7362   // We want to find something like this:
7363   //
7364   // val = G_CONSTANT LegalShiftVal
7365   // shift = G_SHL off_reg val
7366   // ptr = G_PTR_ADD base_reg shift
7367   // x = G_LOAD ptr
7368   //
7369   // And fold it into this addressing mode:
7370   //
7371   // ldr x, [base_reg, off_reg, lsl #LegalShiftVal]
7372 
7373   // Check if we can find the G_PTR_ADD.
7374   MachineInstr *PtrAdd =
7375       getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
7376   if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI, true))
7377     return std::nullopt;
7378 
7379   // Now, try to match an opcode which will match our specific offset.
7380   // We want a G_SHL or a G_MUL.
7381   MachineInstr *OffsetInst =
7382       getDefIgnoringCopies(PtrAdd->getOperand(2).getReg(), MRI);
7383   return selectExtendedSHL(Root, PtrAdd->getOperand(1),
7384                            OffsetInst->getOperand(0), SizeInBytes,
7385                            /*WantsExt=*/false);
7386 }
7387 
7388 /// This is used for computing addresses like this:
7389 ///
7390 /// ldr x1, [x2, x3]
7391 ///
7392 /// Where x2 is the base register, and x3 is an offset register.
7393 ///
7394 /// When possible (or profitable) to fold a G_PTR_ADD into the address
7395 /// calculation, this will do so. Otherwise, it will return std::nullopt.
7396 InstructionSelector::ComplexRendererFns
selectAddrModeRegisterOffset(MachineOperand & Root) const7397 AArch64InstructionSelector::selectAddrModeRegisterOffset(
7398     MachineOperand &Root) const {
7399   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7400 
7401   // We need a GEP.
7402   MachineInstr *Gep = MRI.getVRegDef(Root.getReg());
7403   if (Gep->getOpcode() != TargetOpcode::G_PTR_ADD)
7404     return std::nullopt;
7405 
7406   // If this is used more than once, let's not bother folding.
7407   // TODO: Check if they are memory ops. If they are, then we can still fold
7408   // without having to recompute anything.
7409   if (!MRI.hasOneNonDBGUse(Gep->getOperand(0).getReg()))
7410     return std::nullopt;
7411 
7412   // Base is the GEP's LHS, offset is its RHS.
7413   return {{[=](MachineInstrBuilder &MIB) {
7414              MIB.addUse(Gep->getOperand(1).getReg());
7415            },
7416            [=](MachineInstrBuilder &MIB) {
7417              MIB.addUse(Gep->getOperand(2).getReg());
7418            },
7419            [=](MachineInstrBuilder &MIB) {
7420              // Need to add both immediates here to make sure that they are both
7421              // added to the instruction.
7422              MIB.addImm(0);
7423              MIB.addImm(0);
7424            }}};
7425 }
7426 
7427 /// This is intended to be equivalent to selectAddrModeXRO in
7428 /// AArch64ISelDAGtoDAG. It's used for selecting X register offset loads.
7429 InstructionSelector::ComplexRendererFns
selectAddrModeXRO(MachineOperand & Root,unsigned SizeInBytes) const7430 AArch64InstructionSelector::selectAddrModeXRO(MachineOperand &Root,
7431                                               unsigned SizeInBytes) const {
7432   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7433   if (!Root.isReg())
7434     return std::nullopt;
7435   MachineInstr *PtrAdd =
7436       getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
7437   if (!PtrAdd)
7438     return std::nullopt;
7439 
7440   // Check for an immediates which cannot be encoded in the [base + imm]
7441   // addressing mode, and can't be encoded in an add/sub. If this happens, we'll
7442   // end up with code like:
7443   //
7444   // mov x0, wide
7445   // add x1 base, x0
7446   // ldr x2, [x1, x0]
7447   //
7448   // In this situation, we can use the [base, xreg] addressing mode to save an
7449   // add/sub:
7450   //
7451   // mov x0, wide
7452   // ldr x2, [base, x0]
7453   auto ValAndVReg =
7454       getIConstantVRegValWithLookThrough(PtrAdd->getOperand(2).getReg(), MRI);
7455   if (ValAndVReg) {
7456     unsigned Scale = Log2_32(SizeInBytes);
7457     int64_t ImmOff = ValAndVReg->Value.getSExtValue();
7458 
7459     // Skip immediates that can be selected in the load/store addressing
7460     // mode.
7461     if (ImmOff % SizeInBytes == 0 && ImmOff >= 0 &&
7462         ImmOff < (0x1000 << Scale))
7463       return std::nullopt;
7464 
7465     // Helper lambda to decide whether or not it is preferable to emit an add.
7466     auto isPreferredADD = [](int64_t ImmOff) {
7467       // Constants in [0x0, 0xfff] can be encoded in an add.
7468       if ((ImmOff & 0xfffffffffffff000LL) == 0x0LL)
7469         return true;
7470 
7471       // Can it be encoded in an add lsl #12?
7472       if ((ImmOff & 0xffffffffff000fffLL) != 0x0LL)
7473         return false;
7474 
7475       // It can be encoded in an add lsl #12, but we may not want to. If it is
7476       // possible to select this as a single movz, then prefer that. A single
7477       // movz is faster than an add with a shift.
7478       return (ImmOff & 0xffffffffff00ffffLL) != 0x0LL &&
7479              (ImmOff & 0xffffffffffff0fffLL) != 0x0LL;
7480     };
7481 
7482     // If the immediate can be encoded in a single add/sub, then bail out.
7483     if (isPreferredADD(ImmOff) || isPreferredADD(-ImmOff))
7484       return std::nullopt;
7485   }
7486 
7487   // Try to fold shifts into the addressing mode.
7488   auto AddrModeFns = selectAddrModeShiftedExtendXReg(Root, SizeInBytes);
7489   if (AddrModeFns)
7490     return AddrModeFns;
7491 
7492   // If that doesn't work, see if it's possible to fold in registers from
7493   // a GEP.
7494   return selectAddrModeRegisterOffset(Root);
7495 }
7496 
7497 /// This is used for computing addresses like this:
7498 ///
7499 /// ldr x0, [xBase, wOffset, sxtw #LegalShiftVal]
7500 ///
7501 /// Where we have a 64-bit base register, a 32-bit offset register, and an
7502 /// extend (which may or may not be signed).
7503 InstructionSelector::ComplexRendererFns
selectAddrModeWRO(MachineOperand & Root,unsigned SizeInBytes) const7504 AArch64InstructionSelector::selectAddrModeWRO(MachineOperand &Root,
7505                                               unsigned SizeInBytes) const {
7506   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7507 
7508   MachineInstr *PtrAdd =
7509       getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
7510   if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI, true))
7511     return std::nullopt;
7512 
7513   MachineOperand &LHS = PtrAdd->getOperand(1);
7514   MachineOperand &RHS = PtrAdd->getOperand(2);
7515   MachineInstr *OffsetInst = getDefIgnoringCopies(RHS.getReg(), MRI);
7516 
7517   // The first case is the same as selectAddrModeXRO, except we need an extend.
7518   // In this case, we try to find a shift and extend, and fold them into the
7519   // addressing mode.
7520   //
7521   // E.g.
7522   //
7523   // off_reg = G_Z/S/ANYEXT ext_reg
7524   // val = G_CONSTANT LegalShiftVal
7525   // shift = G_SHL off_reg val
7526   // ptr = G_PTR_ADD base_reg shift
7527   // x = G_LOAD ptr
7528   //
7529   // In this case we can get a load like this:
7530   //
7531   // ldr x0, [base_reg, ext_reg, sxtw #LegalShiftVal]
7532   auto ExtendedShl = selectExtendedSHL(Root, LHS, OffsetInst->getOperand(0),
7533                                        SizeInBytes, /*WantsExt=*/true);
7534   if (ExtendedShl)
7535     return ExtendedShl;
7536 
7537   // There was no shift. We can try and fold a G_Z/S/ANYEXT in alone though.
7538   //
7539   // e.g.
7540   // ldr something, [base_reg, ext_reg, sxtw]
7541   if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI, true))
7542     return std::nullopt;
7543 
7544   // Check if this is an extend. We'll get an extend type if it is.
7545   AArch64_AM::ShiftExtendType Ext =
7546       getExtendTypeForInst(*OffsetInst, MRI, /*IsLoadStore=*/true);
7547   if (Ext == AArch64_AM::InvalidShiftExtend)
7548     return std::nullopt;
7549 
7550   // Need a 32-bit wide register.
7551   MachineIRBuilder MIB(*PtrAdd);
7552   Register ExtReg = moveScalarRegClass(OffsetInst->getOperand(1).getReg(),
7553                                        AArch64::GPR32RegClass, MIB);
7554   unsigned SignExtend = Ext == AArch64_AM::SXTW;
7555 
7556   // Base is LHS, offset is ExtReg.
7557   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(LHS.getReg()); },
7558            [=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); },
7559            [=](MachineInstrBuilder &MIB) {
7560              MIB.addImm(SignExtend);
7561              MIB.addImm(0);
7562            }}};
7563 }
7564 
7565 /// Select a "register plus unscaled signed 9-bit immediate" address.  This
7566 /// should only match when there is an offset that is not valid for a scaled
7567 /// immediate addressing mode.  The "Size" argument is the size in bytes of the
7568 /// memory reference, which is needed here to know what is valid for a scaled
7569 /// immediate.
7570 InstructionSelector::ComplexRendererFns
selectAddrModeUnscaled(MachineOperand & Root,unsigned Size) const7571 AArch64InstructionSelector::selectAddrModeUnscaled(MachineOperand &Root,
7572                                                    unsigned Size) const {
7573   MachineRegisterInfo &MRI =
7574       Root.getParent()->getParent()->getParent()->getRegInfo();
7575 
7576   if (!Root.isReg())
7577     return std::nullopt;
7578 
7579   if (!isBaseWithConstantOffset(Root, MRI))
7580     return std::nullopt;
7581 
7582   MachineInstr *RootDef = MRI.getVRegDef(Root.getReg());
7583 
7584   MachineOperand &OffImm = RootDef->getOperand(2);
7585   if (!OffImm.isReg())
7586     return std::nullopt;
7587   MachineInstr *RHS = MRI.getVRegDef(OffImm.getReg());
7588   if (RHS->getOpcode() != TargetOpcode::G_CONSTANT)
7589     return std::nullopt;
7590   int64_t RHSC;
7591   MachineOperand &RHSOp1 = RHS->getOperand(1);
7592   if (!RHSOp1.isCImm() || RHSOp1.getCImm()->getBitWidth() > 64)
7593     return std::nullopt;
7594   RHSC = RHSOp1.getCImm()->getSExtValue();
7595 
7596   if (RHSC >= -256 && RHSC < 256) {
7597     MachineOperand &Base = RootDef->getOperand(1);
7598     return {{
7599         [=](MachineInstrBuilder &MIB) { MIB.add(Base); },
7600         [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC); },
7601     }};
7602   }
7603   return std::nullopt;
7604 }
7605 
7606 InstructionSelector::ComplexRendererFns
tryFoldAddLowIntoImm(MachineInstr & RootDef,unsigned Size,MachineRegisterInfo & MRI) const7607 AArch64InstructionSelector::tryFoldAddLowIntoImm(MachineInstr &RootDef,
7608                                                  unsigned Size,
7609                                                  MachineRegisterInfo &MRI) const {
7610   if (RootDef.getOpcode() != AArch64::G_ADD_LOW)
7611     return std::nullopt;
7612   MachineInstr &Adrp = *MRI.getVRegDef(RootDef.getOperand(1).getReg());
7613   if (Adrp.getOpcode() != AArch64::ADRP)
7614     return std::nullopt;
7615 
7616   // TODO: add heuristics like isWorthFoldingADDlow() from SelectionDAG.
7617   auto Offset = Adrp.getOperand(1).getOffset();
7618   if (Offset % Size != 0)
7619     return std::nullopt;
7620 
7621   auto GV = Adrp.getOperand(1).getGlobal();
7622   if (GV->isThreadLocal())
7623     return std::nullopt;
7624 
7625   auto &MF = *RootDef.getParent()->getParent();
7626   if (GV->getPointerAlignment(MF.getDataLayout()) < Size)
7627     return std::nullopt;
7628 
7629   unsigned OpFlags = STI.ClassifyGlobalReference(GV, MF.getTarget());
7630   MachineIRBuilder MIRBuilder(RootDef);
7631   Register AdrpReg = Adrp.getOperand(0).getReg();
7632   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(AdrpReg); },
7633            [=](MachineInstrBuilder &MIB) {
7634              MIB.addGlobalAddress(GV, Offset,
7635                                   OpFlags | AArch64II::MO_PAGEOFF |
7636                                       AArch64II::MO_NC);
7637            }}};
7638 }
7639 
7640 /// Select a "register plus scaled unsigned 12-bit immediate" address.  The
7641 /// "Size" argument is the size in bytes of the memory reference, which
7642 /// determines the scale.
7643 InstructionSelector::ComplexRendererFns
selectAddrModeIndexed(MachineOperand & Root,unsigned Size) const7644 AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root,
7645                                                   unsigned Size) const {
7646   MachineFunction &MF = *Root.getParent()->getParent()->getParent();
7647   MachineRegisterInfo &MRI = MF.getRegInfo();
7648 
7649   if (!Root.isReg())
7650     return std::nullopt;
7651 
7652   MachineInstr *RootDef = MRI.getVRegDef(Root.getReg());
7653   if (RootDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) {
7654     return {{
7655         [=](MachineInstrBuilder &MIB) { MIB.add(RootDef->getOperand(1)); },
7656         [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
7657     }};
7658   }
7659 
7660   CodeModel::Model CM = MF.getTarget().getCodeModel();
7661   // Check if we can fold in the ADD of small code model ADRP + ADD address.
7662   // HACK: ld64 on Darwin doesn't support relocations on PRFM, so we can't fold
7663   // globals into the offset.
7664   MachineInstr *RootParent = Root.getParent();
7665   if (CM == CodeModel::Small &&
7666       !(RootParent->getOpcode() == AArch64::G_AARCH64_PREFETCH &&
7667         STI.isTargetDarwin())) {
7668     auto OpFns = tryFoldAddLowIntoImm(*RootDef, Size, MRI);
7669     if (OpFns)
7670       return OpFns;
7671   }
7672 
7673   if (isBaseWithConstantOffset(Root, MRI)) {
7674     MachineOperand &LHS = RootDef->getOperand(1);
7675     MachineOperand &RHS = RootDef->getOperand(2);
7676     MachineInstr *LHSDef = MRI.getVRegDef(LHS.getReg());
7677     MachineInstr *RHSDef = MRI.getVRegDef(RHS.getReg());
7678 
7679     int64_t RHSC = (int64_t)RHSDef->getOperand(1).getCImm()->getZExtValue();
7680     unsigned Scale = Log2_32(Size);
7681     if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) {
7682       if (LHSDef->getOpcode() == TargetOpcode::G_FRAME_INDEX)
7683         return {{
7684             [=](MachineInstrBuilder &MIB) { MIB.add(LHSDef->getOperand(1)); },
7685             [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); },
7686         }};
7687 
7688       return {{
7689           [=](MachineInstrBuilder &MIB) { MIB.add(LHS); },
7690           [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); },
7691       }};
7692     }
7693   }
7694 
7695   // Before falling back to our general case, check if the unscaled
7696   // instructions can handle this. If so, that's preferable.
7697   if (selectAddrModeUnscaled(Root, Size))
7698     return std::nullopt;
7699 
7700   return {{
7701       [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
7702       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
7703   }};
7704 }
7705 
7706 /// Given a shift instruction, return the correct shift type for that
7707 /// instruction.
getShiftTypeForInst(MachineInstr & MI)7708 static AArch64_AM::ShiftExtendType getShiftTypeForInst(MachineInstr &MI) {
7709   switch (MI.getOpcode()) {
7710   default:
7711     return AArch64_AM::InvalidShiftExtend;
7712   case TargetOpcode::G_SHL:
7713     return AArch64_AM::LSL;
7714   case TargetOpcode::G_LSHR:
7715     return AArch64_AM::LSR;
7716   case TargetOpcode::G_ASHR:
7717     return AArch64_AM::ASR;
7718   case TargetOpcode::G_ROTR:
7719     return AArch64_AM::ROR;
7720   }
7721 }
7722 
7723 /// Select a "shifted register" operand. If the value is not shifted, set the
7724 /// shift operand to a default value of "lsl 0".
7725 InstructionSelector::ComplexRendererFns
selectShiftedRegister(MachineOperand & Root,bool AllowROR) const7726 AArch64InstructionSelector::selectShiftedRegister(MachineOperand &Root,
7727                                                   bool AllowROR) const {
7728   if (!Root.isReg())
7729     return std::nullopt;
7730   MachineRegisterInfo &MRI =
7731       Root.getParent()->getParent()->getParent()->getRegInfo();
7732 
7733   // Check if the operand is defined by an instruction which corresponds to
7734   // a ShiftExtendType. E.g. a G_SHL, G_LSHR, etc.
7735   MachineInstr *ShiftInst = MRI.getVRegDef(Root.getReg());
7736   AArch64_AM::ShiftExtendType ShType = getShiftTypeForInst(*ShiftInst);
7737   if (ShType == AArch64_AM::InvalidShiftExtend)
7738     return std::nullopt;
7739   if (ShType == AArch64_AM::ROR && !AllowROR)
7740     return std::nullopt;
7741   if (!isWorthFoldingIntoExtendedReg(*ShiftInst, MRI, false))
7742     return std::nullopt;
7743 
7744   // Need an immediate on the RHS.
7745   MachineOperand &ShiftRHS = ShiftInst->getOperand(2);
7746   auto Immed = getImmedFromMO(ShiftRHS);
7747   if (!Immed)
7748     return std::nullopt;
7749 
7750   // We have something that we can fold. Fold in the shift's LHS and RHS into
7751   // the instruction.
7752   MachineOperand &ShiftLHS = ShiftInst->getOperand(1);
7753   Register ShiftReg = ShiftLHS.getReg();
7754 
7755   unsigned NumBits = MRI.getType(ShiftReg).getSizeInBits();
7756   unsigned Val = *Immed & (NumBits - 1);
7757   unsigned ShiftVal = AArch64_AM::getShifterImm(ShType, Val);
7758 
7759   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ShiftReg); },
7760            [=](MachineInstrBuilder &MIB) { MIB.addImm(ShiftVal); }}};
7761 }
7762 
getExtendTypeForInst(MachineInstr & MI,MachineRegisterInfo & MRI,bool IsLoadStore) const7763 AArch64_AM::ShiftExtendType AArch64InstructionSelector::getExtendTypeForInst(
7764     MachineInstr &MI, MachineRegisterInfo &MRI, bool IsLoadStore) const {
7765   unsigned Opc = MI.getOpcode();
7766 
7767   // Handle explicit extend instructions first.
7768   if (Opc == TargetOpcode::G_SEXT || Opc == TargetOpcode::G_SEXT_INREG) {
7769     unsigned Size;
7770     if (Opc == TargetOpcode::G_SEXT)
7771       Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
7772     else
7773       Size = MI.getOperand(2).getImm();
7774     assert(Size != 64 && "Extend from 64 bits?");
7775     switch (Size) {
7776     case 8:
7777       return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTB;
7778     case 16:
7779       return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTH;
7780     case 32:
7781       return AArch64_AM::SXTW;
7782     default:
7783       return AArch64_AM::InvalidShiftExtend;
7784     }
7785   }
7786 
7787   if (Opc == TargetOpcode::G_ZEXT || Opc == TargetOpcode::G_ANYEXT) {
7788     unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
7789     assert(Size != 64 && "Extend from 64 bits?");
7790     switch (Size) {
7791     case 8:
7792       return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTB;
7793     case 16:
7794       return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTH;
7795     case 32:
7796       return AArch64_AM::UXTW;
7797     default:
7798       return AArch64_AM::InvalidShiftExtend;
7799     }
7800   }
7801 
7802   // Don't have an explicit extend. Try to handle a G_AND with a constant mask
7803   // on the RHS.
7804   if (Opc != TargetOpcode::G_AND)
7805     return AArch64_AM::InvalidShiftExtend;
7806 
7807   std::optional<uint64_t> MaybeAndMask = getImmedFromMO(MI.getOperand(2));
7808   if (!MaybeAndMask)
7809     return AArch64_AM::InvalidShiftExtend;
7810   uint64_t AndMask = *MaybeAndMask;
7811   switch (AndMask) {
7812   default:
7813     return AArch64_AM::InvalidShiftExtend;
7814   case 0xFF:
7815     return !IsLoadStore ? AArch64_AM::UXTB : AArch64_AM::InvalidShiftExtend;
7816   case 0xFFFF:
7817     return !IsLoadStore ? AArch64_AM::UXTH : AArch64_AM::InvalidShiftExtend;
7818   case 0xFFFFFFFF:
7819     return AArch64_AM::UXTW;
7820   }
7821 }
7822 
moveScalarRegClass(Register Reg,const TargetRegisterClass & RC,MachineIRBuilder & MIB) const7823 Register AArch64InstructionSelector::moveScalarRegClass(
7824     Register Reg, const TargetRegisterClass &RC, MachineIRBuilder &MIB) const {
7825   MachineRegisterInfo &MRI = *MIB.getMRI();
7826   auto Ty = MRI.getType(Reg);
7827   assert(!Ty.isVector() && "Expected scalars only!");
7828   if (Ty.getSizeInBits() == TRI.getRegSizeInBits(RC))
7829     return Reg;
7830 
7831   // Create a copy and immediately select it.
7832   // FIXME: We should have an emitCopy function?
7833   auto Copy = MIB.buildCopy({&RC}, {Reg});
7834   selectCopy(*Copy, TII, MRI, TRI, RBI);
7835   return Copy.getReg(0);
7836 }
7837 
7838 /// Select an "extended register" operand. This operand folds in an extend
7839 /// followed by an optional left shift.
7840 InstructionSelector::ComplexRendererFns
selectArithExtendedRegister(MachineOperand & Root) const7841 AArch64InstructionSelector::selectArithExtendedRegister(
7842     MachineOperand &Root) const {
7843   if (!Root.isReg())
7844     return std::nullopt;
7845   MachineRegisterInfo &MRI =
7846       Root.getParent()->getParent()->getParent()->getRegInfo();
7847 
7848   uint64_t ShiftVal = 0;
7849   Register ExtReg;
7850   AArch64_AM::ShiftExtendType Ext;
7851   MachineInstr *RootDef = getDefIgnoringCopies(Root.getReg(), MRI);
7852   if (!RootDef)
7853     return std::nullopt;
7854 
7855   if (!isWorthFoldingIntoExtendedReg(*RootDef, MRI, false))
7856     return std::nullopt;
7857 
7858   // Check if we can fold a shift and an extend.
7859   if (RootDef->getOpcode() == TargetOpcode::G_SHL) {
7860     // Look for a constant on the RHS of the shift.
7861     MachineOperand &RHS = RootDef->getOperand(2);
7862     std::optional<uint64_t> MaybeShiftVal = getImmedFromMO(RHS);
7863     if (!MaybeShiftVal)
7864       return std::nullopt;
7865     ShiftVal = *MaybeShiftVal;
7866     if (ShiftVal > 4)
7867       return std::nullopt;
7868     // Look for a valid extend instruction on the LHS of the shift.
7869     MachineOperand &LHS = RootDef->getOperand(1);
7870     MachineInstr *ExtDef = getDefIgnoringCopies(LHS.getReg(), MRI);
7871     if (!ExtDef)
7872       return std::nullopt;
7873     Ext = getExtendTypeForInst(*ExtDef, MRI);
7874     if (Ext == AArch64_AM::InvalidShiftExtend)
7875       return std::nullopt;
7876     ExtReg = ExtDef->getOperand(1).getReg();
7877   } else {
7878     // Didn't get a shift. Try just folding an extend.
7879     Ext = getExtendTypeForInst(*RootDef, MRI);
7880     if (Ext == AArch64_AM::InvalidShiftExtend)
7881       return std::nullopt;
7882     ExtReg = RootDef->getOperand(1).getReg();
7883 
7884     // If we have a 32 bit instruction which zeroes out the high half of a
7885     // register, we get an implicit zero extend for free. Check if we have one.
7886     // FIXME: We actually emit the extend right now even though we don't have
7887     // to.
7888     if (Ext == AArch64_AM::UXTW && MRI.getType(ExtReg).getSizeInBits() == 32) {
7889       MachineInstr *ExtInst = MRI.getVRegDef(ExtReg);
7890       if (isDef32(*ExtInst))
7891         return std::nullopt;
7892     }
7893   }
7894 
7895   // We require a GPR32 here. Narrow the ExtReg if needed using a subregister
7896   // copy.
7897   MachineIRBuilder MIB(*RootDef);
7898   ExtReg = moveScalarRegClass(ExtReg, AArch64::GPR32RegClass, MIB);
7899 
7900   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); },
7901            [=](MachineInstrBuilder &MIB) {
7902              MIB.addImm(getArithExtendImm(Ext, ShiftVal));
7903            }}};
7904 }
7905 
7906 InstructionSelector::ComplexRendererFns
selectExtractHigh(MachineOperand & Root) const7907 AArch64InstructionSelector::selectExtractHigh(MachineOperand &Root) const {
7908   if (!Root.isReg())
7909     return std::nullopt;
7910   MachineRegisterInfo &MRI =
7911       Root.getParent()->getParent()->getParent()->getRegInfo();
7912 
7913   auto Extract = getDefSrcRegIgnoringCopies(Root.getReg(), MRI);
7914   while (Extract && Extract->MI->getOpcode() == TargetOpcode::G_BITCAST &&
7915          STI.isLittleEndian())
7916     Extract =
7917         getDefSrcRegIgnoringCopies(Extract->MI->getOperand(1).getReg(), MRI);
7918   if (!Extract)
7919     return std::nullopt;
7920 
7921   if (Extract->MI->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) {
7922     if (Extract->Reg == Extract->MI->getOperand(1).getReg()) {
7923       Register ExtReg = Extract->MI->getOperand(2).getReg();
7924       return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); }}};
7925     }
7926   }
7927   if (Extract->MI->getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT) {
7928     LLT SrcTy = MRI.getType(Extract->MI->getOperand(1).getReg());
7929     auto LaneIdx = getIConstantVRegValWithLookThrough(
7930         Extract->MI->getOperand(2).getReg(), MRI);
7931     if (LaneIdx && SrcTy == LLT::fixed_vector(2, 64) &&
7932         LaneIdx->Value.getSExtValue() == 1) {
7933       Register ExtReg = Extract->MI->getOperand(1).getReg();
7934       return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); }}};
7935     }
7936   }
7937 
7938   return std::nullopt;
7939 }
7940 
renderTruncImm(MachineInstrBuilder & MIB,const MachineInstr & MI,int OpIdx) const7941 void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB,
7942                                                 const MachineInstr &MI,
7943                                                 int OpIdx) const {
7944   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
7945   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7946          "Expected G_CONSTANT");
7947   std::optional<int64_t> CstVal =
7948       getIConstantVRegSExtVal(MI.getOperand(0).getReg(), MRI);
7949   assert(CstVal && "Expected constant value");
7950   MIB.addImm(*CstVal);
7951 }
7952 
renderLogicalImm32(MachineInstrBuilder & MIB,const MachineInstr & I,int OpIdx) const7953 void AArch64InstructionSelector::renderLogicalImm32(
7954   MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
7955   assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7956          "Expected G_CONSTANT");
7957   uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
7958   uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 32);
7959   MIB.addImm(Enc);
7960 }
7961 
renderLogicalImm64(MachineInstrBuilder & MIB,const MachineInstr & I,int OpIdx) const7962 void AArch64InstructionSelector::renderLogicalImm64(
7963   MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
7964   assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7965          "Expected G_CONSTANT");
7966   uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
7967   uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 64);
7968   MIB.addImm(Enc);
7969 }
7970 
renderUbsanTrap(MachineInstrBuilder & MIB,const MachineInstr & MI,int OpIdx) const7971 void AArch64InstructionSelector::renderUbsanTrap(MachineInstrBuilder &MIB,
7972                                                  const MachineInstr &MI,
7973                                                  int OpIdx) const {
7974   assert(MI.getOpcode() == TargetOpcode::G_UBSANTRAP && OpIdx == 0 &&
7975          "Expected G_UBSANTRAP");
7976   MIB.addImm(MI.getOperand(0).getImm() | ('U' << 8));
7977 }
7978 
renderFPImm16(MachineInstrBuilder & MIB,const MachineInstr & MI,int OpIdx) const7979 void AArch64InstructionSelector::renderFPImm16(MachineInstrBuilder &MIB,
7980                                                const MachineInstr &MI,
7981                                                int OpIdx) const {
7982   assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
7983          "Expected G_FCONSTANT");
7984   MIB.addImm(
7985       AArch64_AM::getFP16Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
7986 }
7987 
renderFPImm32(MachineInstrBuilder & MIB,const MachineInstr & MI,int OpIdx) const7988 void AArch64InstructionSelector::renderFPImm32(MachineInstrBuilder &MIB,
7989                                                const MachineInstr &MI,
7990                                                int OpIdx) const {
7991   assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
7992          "Expected G_FCONSTANT");
7993   MIB.addImm(
7994       AArch64_AM::getFP32Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
7995 }
7996 
renderFPImm64(MachineInstrBuilder & MIB,const MachineInstr & MI,int OpIdx) const7997 void AArch64InstructionSelector::renderFPImm64(MachineInstrBuilder &MIB,
7998                                                const MachineInstr &MI,
7999                                                int OpIdx) const {
8000   assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
8001          "Expected G_FCONSTANT");
8002   MIB.addImm(
8003       AArch64_AM::getFP64Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
8004 }
8005 
renderFPImm32SIMDModImmType4(MachineInstrBuilder & MIB,const MachineInstr & MI,int OpIdx) const8006 void AArch64InstructionSelector::renderFPImm32SIMDModImmType4(
8007     MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
8008   assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
8009          "Expected G_FCONSTANT");
8010   MIB.addImm(AArch64_AM::encodeAdvSIMDModImmType4(MI.getOperand(1)
8011                                                       .getFPImm()
8012                                                       ->getValueAPF()
8013                                                       .bitcastToAPInt()
8014                                                       .getZExtValue()));
8015 }
8016 
isLoadStoreOfNumBytes(const MachineInstr & MI,unsigned NumBytes) const8017 bool AArch64InstructionSelector::isLoadStoreOfNumBytes(
8018     const MachineInstr &MI, unsigned NumBytes) const {
8019   if (!MI.mayLoadOrStore())
8020     return false;
8021   assert(MI.hasOneMemOperand() &&
8022          "Expected load/store to have only one mem op!");
8023   return (*MI.memoperands_begin())->getSize() == NumBytes;
8024 }
8025 
isDef32(const MachineInstr & MI) const8026 bool AArch64InstructionSelector::isDef32(const MachineInstr &MI) const {
8027   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8028   if (MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() != 32)
8029     return false;
8030 
8031   // Only return true if we know the operation will zero-out the high half of
8032   // the 64-bit register. Truncates can be subregister copies, which don't
8033   // zero out the high bits. Copies and other copy-like instructions can be
8034   // fed by truncates, or could be lowered as subregister copies.
8035   switch (MI.getOpcode()) {
8036   default:
8037     return true;
8038   case TargetOpcode::COPY:
8039   case TargetOpcode::G_BITCAST:
8040   case TargetOpcode::G_TRUNC:
8041   case TargetOpcode::G_PHI:
8042     return false;
8043   }
8044 }
8045 
8046 
8047 // Perform fixups on the given PHI instruction's operands to force them all
8048 // to be the same as the destination regbank.
fixupPHIOpBanks(MachineInstr & MI,MachineRegisterInfo & MRI,const AArch64RegisterBankInfo & RBI)8049 static void fixupPHIOpBanks(MachineInstr &MI, MachineRegisterInfo &MRI,
8050                             const AArch64RegisterBankInfo &RBI) {
8051   assert(MI.getOpcode() == TargetOpcode::G_PHI && "Expected a G_PHI");
8052   Register DstReg = MI.getOperand(0).getReg();
8053   const RegisterBank *DstRB = MRI.getRegBankOrNull(DstReg);
8054   assert(DstRB && "Expected PHI dst to have regbank assigned");
8055   MachineIRBuilder MIB(MI);
8056 
8057   // Go through each operand and ensure it has the same regbank.
8058   for (MachineOperand &MO : llvm::drop_begin(MI.operands())) {
8059     if (!MO.isReg())
8060       continue;
8061     Register OpReg = MO.getReg();
8062     const RegisterBank *RB = MRI.getRegBankOrNull(OpReg);
8063     if (RB != DstRB) {
8064       // Insert a cross-bank copy.
8065       auto *OpDef = MRI.getVRegDef(OpReg);
8066       const LLT &Ty = MRI.getType(OpReg);
8067       MachineBasicBlock &OpDefBB = *OpDef->getParent();
8068 
8069       // Any instruction we insert must appear after all PHIs in the block
8070       // for the block to be valid MIR.
8071       MachineBasicBlock::iterator InsertPt = std::next(OpDef->getIterator());
8072       if (InsertPt != OpDefBB.end() && InsertPt->isPHI())
8073         InsertPt = OpDefBB.getFirstNonPHI();
8074       MIB.setInsertPt(*OpDef->getParent(), InsertPt);
8075       auto Copy = MIB.buildCopy(Ty, OpReg);
8076       MRI.setRegBank(Copy.getReg(0), *DstRB);
8077       MO.setReg(Copy.getReg(0));
8078     }
8079   }
8080 }
8081 
processPHIs(MachineFunction & MF)8082 void AArch64InstructionSelector::processPHIs(MachineFunction &MF) {
8083   // We're looking for PHIs, build a list so we don't invalidate iterators.
8084   MachineRegisterInfo &MRI = MF.getRegInfo();
8085   SmallVector<MachineInstr *, 32> Phis;
8086   for (auto &BB : MF) {
8087     for (auto &MI : BB) {
8088       if (MI.getOpcode() == TargetOpcode::G_PHI)
8089         Phis.emplace_back(&MI);
8090     }
8091   }
8092 
8093   for (auto *MI : Phis) {
8094     // We need to do some work here if the operand types are < 16 bit and they
8095     // are split across fpr/gpr banks. Since all types <32b on gpr
8096     // end up being assigned gpr32 regclasses, we can end up with PHIs here
8097     // which try to select between a gpr32 and an fpr16. Ideally RBS shouldn't
8098     // be selecting heterogenous regbanks for operands if possible, but we
8099     // still need to be able to deal with it here.
8100     //
8101     // To fix this, if we have a gpr-bank operand < 32b in size and at least
8102     // one other operand is on the fpr bank, then we add cross-bank copies
8103     // to homogenize the operand banks. For simplicity the bank that we choose
8104     // to settle on is whatever bank the def operand has. For example:
8105     //
8106     // %endbb:
8107     //   %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2:fpr(s16), %bb2
8108     //  =>
8109     // %bb2:
8110     //   ...
8111     //   %in2_copy:gpr(s16) = COPY %in2:fpr(s16)
8112     //   ...
8113     // %endbb:
8114     //   %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2_copy:gpr(s16), %bb2
8115     bool HasGPROp = false, HasFPROp = false;
8116     for (const MachineOperand &MO : llvm::drop_begin(MI->operands())) {
8117       if (!MO.isReg())
8118         continue;
8119       const LLT &Ty = MRI.getType(MO.getReg());
8120       if (!Ty.isValid() || !Ty.isScalar())
8121         break;
8122       if (Ty.getSizeInBits() >= 32)
8123         break;
8124       const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg());
8125       // If for some reason we don't have a regbank yet. Don't try anything.
8126       if (!RB)
8127         break;
8128 
8129       if (RB->getID() == AArch64::GPRRegBankID)
8130         HasGPROp = true;
8131       else
8132         HasFPROp = true;
8133     }
8134     // We have heterogenous regbanks, need to fixup.
8135     if (HasGPROp && HasFPROp)
8136       fixupPHIOpBanks(*MI, MRI, RBI);
8137   }
8138 }
8139 
8140 namespace llvm {
8141 InstructionSelector *
createAArch64InstructionSelector(const AArch64TargetMachine & TM,const AArch64Subtarget & Subtarget,const AArch64RegisterBankInfo & RBI)8142 createAArch64InstructionSelector(const AArch64TargetMachine &TM,
8143                                  const AArch64Subtarget &Subtarget,
8144                                  const AArch64RegisterBankInfo &RBI) {
8145   return new AArch64InstructionSelector(TM, Subtarget, RBI);
8146 }
8147 }
8148