1 //===-- AArch64ISelDAGToDAG.cpp - A dag to dag inst selector for AArch64 --===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines an instruction selector for the AArch64 target.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "AArch64TargetMachine.h"
14 #include "MCTargetDesc/AArch64AddressingModes.h"
15 #include "llvm/ADT/APSInt.h"
16 #include "llvm/CodeGen/SelectionDAGISel.h"
17 #include "llvm/IR/Function.h" // To access function attributes.
18 #include "llvm/IR/GlobalValue.h"
19 #include "llvm/IR/Intrinsics.h"
20 #include "llvm/IR/IntrinsicsAArch64.h"
21 #include "llvm/Support/Debug.h"
22 #include "llvm/Support/ErrorHandling.h"
23 #include "llvm/Support/KnownBits.h"
24 #include "llvm/Support/MathExtras.h"
25 #include "llvm/Support/raw_ostream.h"
26 
27 using namespace llvm;
28 
29 #define DEBUG_TYPE "aarch64-isel"
30 
31 //===--------------------------------------------------------------------===//
32 /// AArch64DAGToDAGISel - AArch64 specific code to select AArch64 machine
33 /// instructions for SelectionDAG operations.
34 ///
35 namespace {
36 
37 class AArch64DAGToDAGISel : public SelectionDAGISel {
38 
39   /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can
40   /// make the right decision when generating code for different targets.
41   const AArch64Subtarget *Subtarget;
42 
43 public:
44   explicit AArch64DAGToDAGISel(AArch64TargetMachine &tm,
45                                CodeGenOpt::Level OptLevel)
46       : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr) {}
47 
48   StringRef getPassName() const override {
49     return "AArch64 Instruction Selection";
50   }
51 
52   bool runOnMachineFunction(MachineFunction &MF) override {
53     Subtarget = &MF.getSubtarget<AArch64Subtarget>();
54     return SelectionDAGISel::runOnMachineFunction(MF);
55   }
56 
57   void Select(SDNode *Node) override;
58 
59   /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
60   /// inline asm expressions.
61   bool SelectInlineAsmMemoryOperand(const SDValue &Op,
62                                     unsigned ConstraintID,
63                                     std::vector<SDValue> &OutOps) override;
64 
65   template <signed Low, signed High, signed Scale>
66   bool SelectRDVLImm(SDValue N, SDValue &Imm);
67 
68   bool tryMLAV64LaneV128(SDNode *N);
69   bool tryMULLV64LaneV128(unsigned IntNo, SDNode *N);
70   bool SelectArithExtendedRegister(SDValue N, SDValue &Reg, SDValue &Shift);
71   bool SelectArithImmed(SDValue N, SDValue &Val, SDValue &Shift);
72   bool SelectNegArithImmed(SDValue N, SDValue &Val, SDValue &Shift);
73   bool SelectArithShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) {
74     return SelectShiftedRegister(N, false, Reg, Shift);
75   }
76   bool SelectLogicalShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) {
77     return SelectShiftedRegister(N, true, Reg, Shift);
78   }
79   bool SelectAddrModeIndexed7S8(SDValue N, SDValue &Base, SDValue &OffImm) {
80     return SelectAddrModeIndexed7S(N, 1, Base, OffImm);
81   }
82   bool SelectAddrModeIndexed7S16(SDValue N, SDValue &Base, SDValue &OffImm) {
83     return SelectAddrModeIndexed7S(N, 2, Base, OffImm);
84   }
85   bool SelectAddrModeIndexed7S32(SDValue N, SDValue &Base, SDValue &OffImm) {
86     return SelectAddrModeIndexed7S(N, 4, Base, OffImm);
87   }
88   bool SelectAddrModeIndexed7S64(SDValue N, SDValue &Base, SDValue &OffImm) {
89     return SelectAddrModeIndexed7S(N, 8, Base, OffImm);
90   }
91   bool SelectAddrModeIndexed7S128(SDValue N, SDValue &Base, SDValue &OffImm) {
92     return SelectAddrModeIndexed7S(N, 16, Base, OffImm);
93   }
94   bool SelectAddrModeIndexedS9S128(SDValue N, SDValue &Base, SDValue &OffImm) {
95     return SelectAddrModeIndexedBitWidth(N, true, 9, 16, Base, OffImm);
96   }
97   bool SelectAddrModeIndexedU6S128(SDValue N, SDValue &Base, SDValue &OffImm) {
98     return SelectAddrModeIndexedBitWidth(N, false, 6, 16, Base, OffImm);
99   }
100   bool SelectAddrModeIndexed8(SDValue N, SDValue &Base, SDValue &OffImm) {
101     return SelectAddrModeIndexed(N, 1, Base, OffImm);
102   }
103   bool SelectAddrModeIndexed16(SDValue N, SDValue &Base, SDValue &OffImm) {
104     return SelectAddrModeIndexed(N, 2, Base, OffImm);
105   }
106   bool SelectAddrModeIndexed32(SDValue N, SDValue &Base, SDValue &OffImm) {
107     return SelectAddrModeIndexed(N, 4, Base, OffImm);
108   }
109   bool SelectAddrModeIndexed64(SDValue N, SDValue &Base, SDValue &OffImm) {
110     return SelectAddrModeIndexed(N, 8, Base, OffImm);
111   }
112   bool SelectAddrModeIndexed128(SDValue N, SDValue &Base, SDValue &OffImm) {
113     return SelectAddrModeIndexed(N, 16, Base, OffImm);
114   }
115   bool SelectAddrModeUnscaled8(SDValue N, SDValue &Base, SDValue &OffImm) {
116     return SelectAddrModeUnscaled(N, 1, Base, OffImm);
117   }
118   bool SelectAddrModeUnscaled16(SDValue N, SDValue &Base, SDValue &OffImm) {
119     return SelectAddrModeUnscaled(N, 2, Base, OffImm);
120   }
121   bool SelectAddrModeUnscaled32(SDValue N, SDValue &Base, SDValue &OffImm) {
122     return SelectAddrModeUnscaled(N, 4, Base, OffImm);
123   }
124   bool SelectAddrModeUnscaled64(SDValue N, SDValue &Base, SDValue &OffImm) {
125     return SelectAddrModeUnscaled(N, 8, Base, OffImm);
126   }
127   bool SelectAddrModeUnscaled128(SDValue N, SDValue &Base, SDValue &OffImm) {
128     return SelectAddrModeUnscaled(N, 16, Base, OffImm);
129   }
130 
131   template<int Width>
132   bool SelectAddrModeWRO(SDValue N, SDValue &Base, SDValue &Offset,
133                          SDValue &SignExtend, SDValue &DoShift) {
134     return SelectAddrModeWRO(N, Width / 8, Base, Offset, SignExtend, DoShift);
135   }
136 
137   template<int Width>
138   bool SelectAddrModeXRO(SDValue N, SDValue &Base, SDValue &Offset,
139                          SDValue &SignExtend, SDValue &DoShift) {
140     return SelectAddrModeXRO(N, Width / 8, Base, Offset, SignExtend, DoShift);
141   }
142 
143   bool SelectDupZeroOrUndef(SDValue N) {
144     switch(N->getOpcode()) {
145     case ISD::UNDEF:
146       return true;
147     case AArch64ISD::DUP:
148     case ISD::SPLAT_VECTOR: {
149       auto Opnd0 = N->getOperand(0);
150       if (auto CN = dyn_cast<ConstantSDNode>(Opnd0))
151         if (CN->isNullValue())
152           return true;
153       if (auto CN = dyn_cast<ConstantFPSDNode>(Opnd0))
154         if (CN->isZero())
155           return true;
156       break;
157     }
158     default:
159       break;
160     }
161 
162     return false;
163   }
164 
165   bool SelectDupZero(SDValue N) {
166     switch(N->getOpcode()) {
167     case AArch64ISD::DUP:
168     case ISD::SPLAT_VECTOR: {
169       auto Opnd0 = N->getOperand(0);
170       if (auto CN = dyn_cast<ConstantSDNode>(Opnd0))
171         if (CN->isNullValue())
172           return true;
173       if (auto CN = dyn_cast<ConstantFPSDNode>(Opnd0))
174         if (CN->isZero())
175           return true;
176       break;
177     }
178     }
179 
180     return false;
181   }
182 
183   template<MVT::SimpleValueType VT>
184   bool SelectSVEAddSubImm(SDValue N, SDValue &Imm, SDValue &Shift) {
185     return SelectSVEAddSubImm(N, VT, Imm, Shift);
186   }
187 
188   template<MVT::SimpleValueType VT>
189   bool SelectSVELogicalImm(SDValue N, SDValue &Imm) {
190     return SelectSVELogicalImm(N, VT, Imm);
191   }
192 
193   template <unsigned Low, unsigned High>
194   bool SelectSVEShiftImm64(SDValue N, SDValue &Imm) {
195     return SelectSVEShiftImm64(N, Low, High, Imm);
196   }
197 
198   // Returns a suitable CNT/INC/DEC/RDVL multiplier to calculate VSCALE*N.
199   template<signed Min, signed Max, signed Scale, bool Shift>
200   bool SelectCntImm(SDValue N, SDValue &Imm) {
201     if (!isa<ConstantSDNode>(N))
202       return false;
203 
204     int64_t MulImm = cast<ConstantSDNode>(N)->getSExtValue();
205     if (Shift)
206       MulImm = 1LL << MulImm;
207 
208     if ((MulImm % std::abs(Scale)) != 0)
209       return false;
210 
211     MulImm /= Scale;
212     if ((MulImm >= Min) && (MulImm <= Max)) {
213       Imm = CurDAG->getTargetConstant(MulImm, SDLoc(N), MVT::i32);
214       return true;
215     }
216 
217     return false;
218   }
219 
220   /// Form sequences of consecutive 64/128-bit registers for use in NEON
221   /// instructions making use of a vector-list (e.g. ldN, tbl). Vecs must have
222   /// between 1 and 4 elements. If it contains a single element, that element is
223   /// returned unchanged; otherwise a REG_SEQUENCE value is returned.
224   SDValue createDTuple(ArrayRef<SDValue> Vecs);
225   SDValue createQTuple(ArrayRef<SDValue> Vecs);
226   // Form a sequence of SVE registers for instructions using a list of vectors,
227   // e.g. structured loads and stores (ldN, stN).
228   SDValue createZTuple(ArrayRef<SDValue> Vecs);
229 
230   /// Generic helper for the createDTuple/createQTuple
231   /// functions. Those should almost always be called instead.
232   SDValue createTuple(ArrayRef<SDValue> Vecs, const unsigned RegClassIDs[],
233                       const unsigned SubRegs[]);
234 
235   void SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc, bool isExt);
236 
237   bool tryIndexedLoad(SDNode *N);
238 
239   bool trySelectStackSlotTagP(SDNode *N);
240   void SelectTagP(SDNode *N);
241 
242   void SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
243                      unsigned SubRegIdx);
244   void SelectPostLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
245                          unsigned SubRegIdx);
246   void SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
247   void SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
248   void SelectPredicatedLoad(SDNode *N, unsigned NumVecs, unsigned Scale,
249                             unsigned Opc_rr, unsigned Opc_ri);
250 
251   bool SelectAddrModeFrameIndexSVE(SDValue N, SDValue &Base, SDValue &OffImm);
252   /// SVE Reg+Imm addressing mode.
253   template <int64_t Min, int64_t Max>
254   bool SelectAddrModeIndexedSVE(SDNode *Root, SDValue N, SDValue &Base,
255                                 SDValue &OffImm);
256   /// SVE Reg+Reg address mode.
257   template <unsigned Scale>
258   bool SelectSVERegRegAddrMode(SDValue N, SDValue &Base, SDValue &Offset) {
259     return SelectSVERegRegAddrMode(N, Scale, Base, Offset);
260   }
261 
262   void SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc);
263   void SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc);
264   void SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);
265   void SelectPostStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);
266   void SelectPredicatedStore(SDNode *N, unsigned NumVecs, unsigned Scale,
267                              unsigned Opc_rr, unsigned Opc_ri);
268   std::tuple<unsigned, SDValue, SDValue>
269   findAddrModeSVELoadStore(SDNode *N, unsigned Opc_rr, unsigned Opc_ri,
270                            const SDValue &OldBase, const SDValue &OldOffset,
271                            unsigned Scale);
272 
273   bool tryBitfieldExtractOp(SDNode *N);
274   bool tryBitfieldExtractOpFromSExt(SDNode *N);
275   bool tryBitfieldInsertOp(SDNode *N);
276   bool tryBitfieldInsertInZeroOp(SDNode *N);
277   bool tryShiftAmountMod(SDNode *N);
278   bool tryHighFPExt(SDNode *N);
279 
280   bool tryReadRegister(SDNode *N);
281   bool tryWriteRegister(SDNode *N);
282 
283 // Include the pieces autogenerated from the target description.
284 #include "AArch64GenDAGISel.inc"
285 
286 private:
287   bool SelectShiftedRegister(SDValue N, bool AllowROR, SDValue &Reg,
288                              SDValue &Shift);
289   bool SelectAddrModeIndexed7S(SDValue N, unsigned Size, SDValue &Base,
290                                SDValue &OffImm) {
291     return SelectAddrModeIndexedBitWidth(N, true, 7, Size, Base, OffImm);
292   }
293   bool SelectAddrModeIndexedBitWidth(SDValue N, bool IsSignedImm, unsigned BW,
294                                      unsigned Size, SDValue &Base,
295                                      SDValue &OffImm);
296   bool SelectAddrModeIndexed(SDValue N, unsigned Size, SDValue &Base,
297                              SDValue &OffImm);
298   bool SelectAddrModeUnscaled(SDValue N, unsigned Size, SDValue &Base,
299                               SDValue &OffImm);
300   bool SelectAddrModeWRO(SDValue N, unsigned Size, SDValue &Base,
301                          SDValue &Offset, SDValue &SignExtend,
302                          SDValue &DoShift);
303   bool SelectAddrModeXRO(SDValue N, unsigned Size, SDValue &Base,
304                          SDValue &Offset, SDValue &SignExtend,
305                          SDValue &DoShift);
306   bool isWorthFolding(SDValue V) const;
307   bool SelectExtendedSHL(SDValue N, unsigned Size, bool WantExtend,
308                          SDValue &Offset, SDValue &SignExtend);
309 
310   template<unsigned RegWidth>
311   bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos) {
312     return SelectCVTFixedPosOperand(N, FixedPos, RegWidth);
313   }
314 
315   bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, unsigned Width);
316 
317   bool SelectCMP_SWAP(SDNode *N);
318 
319   bool SelectSVE8BitLslImm(SDValue N, SDValue &Imm, SDValue &Shift);
320 
321   bool SelectSVEAddSubImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift);
322 
323   bool SelectSVELogicalImm(SDValue N, MVT VT, SDValue &Imm);
324 
325   bool SelectSVESignedArithImm(SDValue N, SDValue &Imm);
326   bool SelectSVEShiftImm64(SDValue N, uint64_t Low, uint64_t High,
327                            SDValue &Imm);
328 
329   bool SelectSVEArithImm(SDValue N, SDValue &Imm);
330   bool SelectSVERegRegAddrMode(SDValue N, unsigned Scale, SDValue &Base,
331                                SDValue &Offset);
332 };
333 } // end anonymous namespace
334 
335 /// isIntImmediate - This method tests to see if the node is a constant
336 /// operand. If so, Imm will receive the value.
337 static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
338   if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) {
339     Imm = C->getZExtValue();
340     return true;
341   }
342   return false;
343 }
344 
345 // isIntImmediate - This method tests to see if the operand is a constant.
346 // If so, Imm will receive the value.
347 static bool isIntImmediate(SDValue N, uint64_t &Imm) {
348   return isIntImmediate(N.getNode(), Imm);
349 }
350 
351 // isOpcWithIntImmediate - This method tests to see if the node has the given
352 // opcode and an immediate integer right operand.
353 // If so, Imm will receive the immediate value.
354 static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
355                                   uint64_t &Imm) {
356   return N->getOpcode() == Opc &&
357          isIntImmediate(N->getOperand(1).getNode(), Imm);
358 }
359 
360 bool AArch64DAGToDAGISel::SelectInlineAsmMemoryOperand(
361     const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) {
362   switch(ConstraintID) {
363   default:
364     llvm_unreachable("Unexpected asm memory constraint");
365   case InlineAsm::Constraint_m:
366   case InlineAsm::Constraint_Q:
367     // We need to make sure that this one operand does not end up in XZR, thus
368     // require the address to be in a PointerRegClass register.
369     const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
370     const TargetRegisterClass *TRC = TRI->getPointerRegClass(*MF);
371     SDLoc dl(Op);
372     SDValue RC = CurDAG->getTargetConstant(TRC->getID(), dl, MVT::i64);
373     SDValue NewOp =
374         SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
375                                        dl, Op.getValueType(),
376                                        Op, RC), 0);
377     OutOps.push_back(NewOp);
378     return false;
379   }
380   return true;
381 }
382 
383 /// SelectArithImmed - Select an immediate value that can be represented as
384 /// a 12-bit value shifted left by either 0 or 12.  If so, return true with
385 /// Val set to the 12-bit value and Shift set to the shifter operand.
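/// For example, the constant 0x456000 can be encoded as the 12-bit value 0x456
/// with an LSL #12 shifter operand, so an add of 0x456000 becomes a single
/// "add x0, x0, #0x456, lsl #12".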
386 bool AArch64DAGToDAGISel::SelectArithImmed(SDValue N, SDValue &Val,
387                                            SDValue &Shift) {
388   // This function is called from the addsub_shifted_imm ComplexPattern,
389   // which lists [imm] as the set of opcodes it is interested in; however, we
390   // still need to check whether the operand is actually an immediate here,
391   // because the ComplexPattern opcode list is only used in root-level opcode
392   // matching.
393   if (!isa<ConstantSDNode>(N.getNode()))
394     return false;
395 
396   uint64_t Immed = cast<ConstantSDNode>(N.getNode())->getZExtValue();
397   unsigned ShiftAmt;
398 
399   if (Immed >> 12 == 0) {
400     ShiftAmt = 0;
401   } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) {
402     ShiftAmt = 12;
403     Immed = Immed >> 12;
404   } else
405     return false;
406 
407   unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt);
408   SDLoc dl(N);
409   Val = CurDAG->getTargetConstant(Immed, dl, MVT::i32);
410   Shift = CurDAG->getTargetConstant(ShVal, dl, MVT::i32);
411   return true;
412 }
413 
414 /// SelectNegArithImmed - As above, but negates the value before trying to
415 /// select it.
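/// For example, an add of -16 can typically be selected as "sub x0, x0, #16",
/// and a comparison against a negative constant can become a CMN, since 16 is
/// representable as an arithmetic immediate while -16 is not.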
416 bool AArch64DAGToDAGISel::SelectNegArithImmed(SDValue N, SDValue &Val,
417                                               SDValue &Shift) {
418   // This function is called from the addsub_shifted_imm ComplexPattern,
419   // which lists [imm] as the set of opcodes it is interested in; however, we
420   // still need to check whether the operand is actually an immediate here,
421   // because the ComplexPattern opcode list is only used in root-level opcode
422   // matching.
423   if (!isa<ConstantSDNode>(N.getNode()))
424     return false;
425 
426   // The immediate operand must be a 24-bit zero-extended immediate.
427   uint64_t Immed = cast<ConstantSDNode>(N.getNode())->getZExtValue();
428 
429   // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0"
430   // have the opposite effect on the C flag, so this pattern mustn't match under
431   // those circumstances.
432   if (Immed == 0)
433     return false;
434 
435   if (N.getValueType() == MVT::i32)
436     Immed = ~((uint32_t)Immed) + 1;
437   else
438     Immed = ~Immed + 1ULL;
439   if (Immed & 0xFFFFFFFFFF000000ULL)
440     return false;
441 
442   Immed &= 0xFFFFFFULL;
443   return SelectArithImmed(CurDAG->getConstant(Immed, SDLoc(N), MVT::i32), Val,
444                           Shift);
445 }
446 
447 /// getShiftTypeForNode - Translate a shift node to the corresponding
448 /// ShiftType value.
449 static AArch64_AM::ShiftExtendType getShiftTypeForNode(SDValue N) {
450   switch (N.getOpcode()) {
451   default:
452     return AArch64_AM::InvalidShiftExtend;
453   case ISD::SHL:
454     return AArch64_AM::LSL;
455   case ISD::SRL:
456     return AArch64_AM::LSR;
457   case ISD::SRA:
458     return AArch64_AM::ASR;
459   case ISD::ROTR:
460     return AArch64_AM::ROR;
461   }
462 }
463 
464 /// Determine whether it is worth it to fold SHL into the addressing
465 /// mode.
466 static bool isWorthFoldingSHL(SDValue V) {
467   assert(V.getOpcode() == ISD::SHL && "invalid opcode");
468   // It is worth folding logical shift of up to three places.
469   auto *CSD = dyn_cast<ConstantSDNode>(V.getOperand(1));
470   if (!CSD)
471     return false;
472   unsigned ShiftVal = CSD->getZExtValue();
473   if (ShiftVal > 3)
474     return false;
475 
476   // Check if this particular node is reused in any non-memory related
477   // operation.  If yes, do not try to fold this node into the address
478   // computation, since the computation will be kept.
479   const SDNode *Node = V.getNode();
480   for (SDNode *UI : Node->uses())
481     if (!isa<MemSDNode>(*UI))
482       for (SDNode *UII : UI->uses())
483         if (!isa<MemSDNode>(*UII))
484           return false;
485   return true;
486 }
487 
488 /// Determine whether it is worthwhile to fold V into an extended register.
489 bool AArch64DAGToDAGISel::isWorthFolding(SDValue V) const {
490   // Trivial if we are optimizing for code size or if there is only
491   // one use of the value.
492   if (CurDAG->shouldOptForSize() || V.hasOneUse())
493     return true;
494   // If a subtarget has a fastpath LSL we can fold a logical shift into
495   // the addressing mode and save a cycle.
496   if (Subtarget->hasLSLFast() && V.getOpcode() == ISD::SHL &&
497       isWorthFoldingSHL(V))
498     return true;
499   if (Subtarget->hasLSLFast() && V.getOpcode() == ISD::ADD) {
500     const SDValue LHS = V.getOperand(0);
501     const SDValue RHS = V.getOperand(1);
502     if (LHS.getOpcode() == ISD::SHL && isWorthFoldingSHL(LHS))
503       return true;
504     if (RHS.getOpcode() == ISD::SHL && isWorthFoldingSHL(RHS))
505       return true;
506   }
507 
508   // It hurts otherwise, since the value will be reused.
509   return false;
510 }
511 
512 /// SelectShiftedRegister - Select a "shifted register" operand.  If the value
513 /// is not shifted, set the Shift operand to the default of "LSL 0".  The logical
514 /// instructions allow the shifted register to be rotated, but the arithmetic
515 /// instructions do not.  The AllowROR parameter specifies whether ROR is
516 /// supported.
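/// For example, (add x0, (shl x1, 3)) can be selected as a single
/// "add x0, x0, x1, lsl #3"; logical instructions may additionally fold a
/// rotate, e.g. "orr x0, x1, x2, ror #8".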
517 bool AArch64DAGToDAGISel::SelectShiftedRegister(SDValue N, bool AllowROR,
518                                                 SDValue &Reg, SDValue &Shift) {
519   AArch64_AM::ShiftExtendType ShType = getShiftTypeForNode(N);
520   if (ShType == AArch64_AM::InvalidShiftExtend)
521     return false;
522   if (!AllowROR && ShType == AArch64_AM::ROR)
523     return false;
524 
525   if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
526     unsigned BitSize = N.getValueSizeInBits();
527     unsigned Val = RHS->getZExtValue() & (BitSize - 1);
528     unsigned ShVal = AArch64_AM::getShifterImm(ShType, Val);
529 
530     Reg = N.getOperand(0);
531     Shift = CurDAG->getTargetConstant(ShVal, SDLoc(N), MVT::i32);
532     return isWorthFolding(N);
533   }
534 
535   return false;
536 }
537 
538 /// getExtendTypeForNode - Translate an extend node to the corresponding
539 /// ExtendType value.
540 static AArch64_AM::ShiftExtendType
541 getExtendTypeForNode(SDValue N, bool IsLoadStore = false) {
542   if (N.getOpcode() == ISD::SIGN_EXTEND ||
543       N.getOpcode() == ISD::SIGN_EXTEND_INREG) {
544     EVT SrcVT;
545     if (N.getOpcode() == ISD::SIGN_EXTEND_INREG)
546       SrcVT = cast<VTSDNode>(N.getOperand(1))->getVT();
547     else
548       SrcVT = N.getOperand(0).getValueType();
549 
550     if (!IsLoadStore && SrcVT == MVT::i8)
551       return AArch64_AM::SXTB;
552     else if (!IsLoadStore && SrcVT == MVT::i16)
553       return AArch64_AM::SXTH;
554     else if (SrcVT == MVT::i32)
555       return AArch64_AM::SXTW;
556     assert(SrcVT != MVT::i64 && "extend from 64-bits?");
557 
558     return AArch64_AM::InvalidShiftExtend;
559   } else if (N.getOpcode() == ISD::ZERO_EXTEND ||
560              N.getOpcode() == ISD::ANY_EXTEND) {
561     EVT SrcVT = N.getOperand(0).getValueType();
562     if (!IsLoadStore && SrcVT == MVT::i8)
563       return AArch64_AM::UXTB;
564     else if (!IsLoadStore && SrcVT == MVT::i16)
565       return AArch64_AM::UXTH;
566     else if (SrcVT == MVT::i32)
567       return AArch64_AM::UXTW;
568     assert(SrcVT != MVT::i64 && "extend from 64-bits?");
569 
570     return AArch64_AM::InvalidShiftExtend;
571   } else if (N.getOpcode() == ISD::AND) {
572     ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
573     if (!CSD)
574       return AArch64_AM::InvalidShiftExtend;
575     uint64_t AndMask = CSD->getZExtValue();
576 
577     switch (AndMask) {
578     default:
579       return AArch64_AM::InvalidShiftExtend;
580     case 0xFF:
581       return !IsLoadStore ? AArch64_AM::UXTB : AArch64_AM::InvalidShiftExtend;
582     case 0xFFFF:
583       return !IsLoadStore ? AArch64_AM::UXTH : AArch64_AM::InvalidShiftExtend;
584     case 0xFFFFFFFF:
585       return AArch64_AM::UXTW;
586     }
587   }
588 
589   return AArch64_AM::InvalidShiftExtend;
590 }
591 
592 // Helper for SelectMLAV64LaneV128 - Recognize high lane extracts.
593 static bool checkHighLaneIndex(SDNode *DL, SDValue &LaneOp, int &LaneIdx) {
594   if (DL->getOpcode() != AArch64ISD::DUPLANE16 &&
595       DL->getOpcode() != AArch64ISD::DUPLANE32)
596     return false;
597 
598   SDValue SV = DL->getOperand(0);
599   if (SV.getOpcode() != ISD::INSERT_SUBVECTOR)
600     return false;
601 
602   SDValue EV = SV.getOperand(1);
603   if (EV.getOpcode() != ISD::EXTRACT_SUBVECTOR)
604     return false;
605 
606   ConstantSDNode *DLidx = cast<ConstantSDNode>(DL->getOperand(1).getNode());
607   ConstantSDNode *EVidx = cast<ConstantSDNode>(EV.getOperand(1).getNode());
608   LaneIdx = DLidx->getSExtValue() + EVidx->getSExtValue();
609   LaneOp = EV.getOperand(0);
610 
611   return true;
612 }
613 
614 // Helper for SelectOpcV64LaneV128 - Recognize operations where one operand is a
615 // high lane extract.
616 static bool checkV64LaneV128(SDValue Op0, SDValue Op1, SDValue &StdOp,
617                              SDValue &LaneOp, int &LaneIdx) {
618 
619   if (!checkHighLaneIndex(Op0.getNode(), LaneOp, LaneIdx)) {
620     std::swap(Op0, Op1);
621     if (!checkHighLaneIndex(Op0.getNode(), LaneOp, LaneIdx))
622       return false;
623   }
624   StdOp = Op1;
625   return true;
626 }
627 
628 /// SelectMLAV64LaneV128 - AArch64 supports vector MLAs where one multiplicand
629 /// is a lane in the upper half of a 128-bit vector.  Recognize and select this
630 /// so that we don't emit unnecessary lane extracts.
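/// For example, an MLA on v2i32 whose multiplicand is lane 3 of a 128-bit
/// vector can be selected as "mla v0.2s, v1.2s, v2.s[3]" rather than first
/// extracting the high half of v2.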
631 bool AArch64DAGToDAGISel::tryMLAV64LaneV128(SDNode *N) {
632   SDLoc dl(N);
633   SDValue Op0 = N->getOperand(0);
634   SDValue Op1 = N->getOperand(1);
635   SDValue MLAOp1;   // Will hold ordinary multiplicand for MLA.
636   SDValue MLAOp2;   // Will hold lane-accessed multiplicand for MLA.
637   int LaneIdx = -1; // Will hold the lane index.
638 
639   if (Op1.getOpcode() != ISD::MUL ||
640       !checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2,
641                         LaneIdx)) {
642     std::swap(Op0, Op1);
643     if (Op1.getOpcode() != ISD::MUL ||
644         !checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2,
645                           LaneIdx))
646       return false;
647   }
648 
649   SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, dl, MVT::i64);
650 
651   SDValue Ops[] = { Op0, MLAOp1, MLAOp2, LaneIdxVal };
652 
653   unsigned MLAOpc = ~0U;
654 
655   switch (N->getSimpleValueType(0).SimpleTy) {
656   default:
657     llvm_unreachable("Unrecognized MLA.");
658   case MVT::v4i16:
659     MLAOpc = AArch64::MLAv4i16_indexed;
660     break;
661   case MVT::v8i16:
662     MLAOpc = AArch64::MLAv8i16_indexed;
663     break;
664   case MVT::v2i32:
665     MLAOpc = AArch64::MLAv2i32_indexed;
666     break;
667   case MVT::v4i32:
668     MLAOpc = AArch64::MLAv4i32_indexed;
669     break;
670   }
671 
672   ReplaceNode(N, CurDAG->getMachineNode(MLAOpc, dl, N->getValueType(0), Ops));
673   return true;
674 }
675 
676 bool AArch64DAGToDAGISel::tryMULLV64LaneV128(unsigned IntNo, SDNode *N) {
677   SDLoc dl(N);
678   SDValue SMULLOp0;
679   SDValue SMULLOp1;
680   int LaneIdx;
681 
682   if (!checkV64LaneV128(N->getOperand(1), N->getOperand(2), SMULLOp0, SMULLOp1,
683                         LaneIdx))
684     return false;
685 
686   SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, dl, MVT::i64);
687 
688   SDValue Ops[] = { SMULLOp0, SMULLOp1, LaneIdxVal };
689 
690   unsigned SMULLOpc = ~0U;
691 
692   if (IntNo == Intrinsic::aarch64_neon_smull) {
693     switch (N->getSimpleValueType(0).SimpleTy) {
694     default:
695       llvm_unreachable("Unrecognized SMULL.");
696     case MVT::v4i32:
697       SMULLOpc = AArch64::SMULLv4i16_indexed;
698       break;
699     case MVT::v2i64:
700       SMULLOpc = AArch64::SMULLv2i32_indexed;
701       break;
702     }
703   } else if (IntNo == Intrinsic::aarch64_neon_umull) {
704     switch (N->getSimpleValueType(0).SimpleTy) {
705     default:
706       llvm_unreachable("Unrecognized UMULL.");
707     case MVT::v4i32:
708       SMULLOpc = AArch64::UMULLv4i16_indexed;
709       break;
710     case MVT::v2i64:
711       SMULLOpc = AArch64::UMULLv2i32_indexed;
712       break;
713     }
714   } else
715     llvm_unreachable("Unrecognized intrinsic.");
716 
717   ReplaceNode(N, CurDAG->getMachineNode(SMULLOpc, dl, N->getValueType(0), Ops));
718   return true;
719 }
720 
721 /// Instructions that accept extend modifiers like UXTW expect the register
722 /// being extended to be a GPR32, but the incoming DAG might be acting on a
723 /// GPR64 (either via SEXT_INREG or AND). Extract the appropriate low bits if
724 /// this is the case.
725 static SDValue narrowIfNeeded(SelectionDAG *CurDAG, SDValue N) {
726   if (N.getValueType() == MVT::i32)
727     return N;
728 
729   SDLoc dl(N);
730   SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32);
731   MachineSDNode *Node = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
732                                                dl, MVT::i32, N, SubReg);
733   return SDValue(Node, 0);
734 }
735 
736 // Returns a suitable CNT/INC/DEC/RDVL multiplier to calculate VSCALE*N.
737 template<signed Low, signed High, signed Scale>
738 bool AArch64DAGToDAGISel::SelectRDVLImm(SDValue N, SDValue &Imm) {
739   if (!isa<ConstantSDNode>(N))
740     return false;
741 
742   int64_t MulImm = cast<ConstantSDNode>(N)->getSExtValue();
743   if ((MulImm % std::abs(Scale)) == 0) {
744     int64_t RDVLImm = MulImm / Scale;
745     if ((RDVLImm >= Low) && (RDVLImm <= High)) {
746       Imm = CurDAG->getTargetConstant(RDVLImm, SDLoc(N), MVT::i32);
747       return true;
748     }
749   }
750 
751   return false;
752 }
753 
754 /// SelectArithExtendedRegister - Select an "extended register" operand.  This
755 /// operand folds in an extend followed by an optional left shift.
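/// For example, (add x0, (shl (sext_inreg x1, i16), 2)) can be selected as
/// "add x0, x0, w1, sxth #2". The left-shift amount is limited to the range
/// [0, 4].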
756 bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg,
757                                                       SDValue &Shift) {
758   unsigned ShiftVal = 0;
759   AArch64_AM::ShiftExtendType Ext;
760 
761   if (N.getOpcode() == ISD::SHL) {
762     ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
763     if (!CSD)
764       return false;
765     ShiftVal = CSD->getZExtValue();
766     if (ShiftVal > 4)
767       return false;
768 
769     Ext = getExtendTypeForNode(N.getOperand(0));
770     if (Ext == AArch64_AM::InvalidShiftExtend)
771       return false;
772 
773     Reg = N.getOperand(0).getOperand(0);
774   } else {
775     Ext = getExtendTypeForNode(N);
776     if (Ext == AArch64_AM::InvalidShiftExtend)
777       return false;
778 
779     Reg = N.getOperand(0);
780 
781     // Don't match if a free 32-bit -> 64-bit zext can be used instead.
782     if (Ext == AArch64_AM::UXTW &&
783         Reg->getValueType(0).getSizeInBits() == 32 && isDef32(*Reg.getNode()))
784       return false;
785   }
786 
787   // AArch64 mandates that the RHS of the operation must use the smallest
788   // register class that could contain the size being extended from.  Thus,
789   // if we're folding a (sext i8), we need the RHS to be a GPR32, even though
790   // there might not be an actual 32-bit value in the program.  We can
791   // (harmlessly) synthesize one by injecting an EXTRACT_SUBREG here.
792   assert(Ext != AArch64_AM::UXTX && Ext != AArch64_AM::SXTX);
793   Reg = narrowIfNeeded(CurDAG, Reg);
794   Shift = CurDAG->getTargetConstant(getArithExtendImm(Ext, ShiftVal), SDLoc(N),
795                                     MVT::i32);
796   return isWorthFolding(N);
797 }
798 
799 /// If there's a use of this ADDlow that's not itself a load/store then we'll
800 /// need to create a real ADD instruction from it anyway and there's no point in
801 /// folding it into the mem op. Theoretically, it shouldn't matter, but there's
802 /// a single pseudo-instruction for an ADRP/ADD pair so over-aggressive folding
803 /// leads to duplicated ADRP instructions.
804 static bool isWorthFoldingADDlow(SDValue N) {
805   for (auto Use : N->uses()) {
806     if (Use->getOpcode() != ISD::LOAD && Use->getOpcode() != ISD::STORE &&
807         Use->getOpcode() != ISD::ATOMIC_LOAD &&
808         Use->getOpcode() != ISD::ATOMIC_STORE)
809       return false;
810 
811     // ldar and stlr have much more restrictive addressing modes (just a
812     // register).
813     if (isStrongerThanMonotonic(cast<MemSDNode>(Use)->getOrdering()))
814       return false;
815   }
816 
817   return true;
818 }
819 
820 /// SelectAddrModeIndexedBitWidth - Select a "register plus scaled (un)signed BW-bit
821 /// immediate" address.  The "Size" argument is the size in bytes of the memory
822 /// reference, which determines the scale.
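/// For example, with BW=7 and Size=8 (e.g. the imm7 form used by LDP/STP of X
/// registers) the representable byte offsets are -512..504 in multiples of 8.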
823 bool AArch64DAGToDAGISel::SelectAddrModeIndexedBitWidth(SDValue N, bool IsSignedImm,
824                                                         unsigned BW, unsigned Size,
825                                                         SDValue &Base,
826                                                         SDValue &OffImm) {
827   SDLoc dl(N);
828   const DataLayout &DL = CurDAG->getDataLayout();
829   const TargetLowering *TLI = getTargetLowering();
830   if (N.getOpcode() == ISD::FrameIndex) {
831     int FI = cast<FrameIndexSDNode>(N)->getIndex();
832     Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
833     OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
834     return true;
835   }
836 
837   // As opposed to the (12-bit) indexed addressing mode below, the 7/9-bit signed
838   // mode selected here doesn't support labels/immediates, only base+offset.
839   if (CurDAG->isBaseWithConstantOffset(N)) {
840     if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
841       if (IsSignedImm) {
842         int64_t RHSC = RHS->getSExtValue();
843         unsigned Scale = Log2_32(Size);
844         int64_t Range = 0x1LL << (BW - 1);
845 
846         if ((RHSC & (Size - 1)) == 0 && RHSC >= -(Range << Scale) &&
847             RHSC < (Range << Scale)) {
848           Base = N.getOperand(0);
849           if (Base.getOpcode() == ISD::FrameIndex) {
850             int FI = cast<FrameIndexSDNode>(Base)->getIndex();
851             Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
852           }
853           OffImm = CurDAG->getTargetConstant(RHSC >> Scale, dl, MVT::i64);
854           return true;
855         }
856       } else {
857         // unsigned Immediate
858         uint64_t RHSC = RHS->getZExtValue();
859         unsigned Scale = Log2_32(Size);
860         uint64_t Range = 0x1ULL << BW;
861 
862         if ((RHSC & (Size - 1)) == 0 && RHSC < (Range << Scale)) {
863           Base = N.getOperand(0);
864           if (Base.getOpcode() == ISD::FrameIndex) {
865             int FI = cast<FrameIndexSDNode>(Base)->getIndex();
866             Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
867           }
868           OffImm = CurDAG->getTargetConstant(RHSC >> Scale, dl, MVT::i64);
869           return true;
870         }
871       }
872     }
873   }
874   // Base only. The address will be materialized into a register before
875   // the memory is accessed.
876   //    add x0, Xbase, #offset
877   //    stp x1, x2, [x0]
878   Base = N;
879   OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
880   return true;
881 }
882 
883 /// SelectAddrModeIndexed - Select a "register plus scaled unsigned 12-bit
884 /// immediate" address.  The "Size" argument is the size in bytes of the memory
885 /// reference, which determines the scale.
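/// For example, with Size=8 an "ldr x0, [x1, #32]" is selected with OffImm=4,
/// and byte offsets from 0 to 32760 in multiples of 8 are representable.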
886 bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size,
887                                               SDValue &Base, SDValue &OffImm) {
888   SDLoc dl(N);
889   const DataLayout &DL = CurDAG->getDataLayout();
890   const TargetLowering *TLI = getTargetLowering();
891   if (N.getOpcode() == ISD::FrameIndex) {
892     int FI = cast<FrameIndexSDNode>(N)->getIndex();
893     Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
894     OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
895     return true;
896   }
897 
898   if (N.getOpcode() == AArch64ISD::ADDlow && isWorthFoldingADDlow(N)) {
899     GlobalAddressSDNode *GAN =
900         dyn_cast<GlobalAddressSDNode>(N.getOperand(1).getNode());
901     Base = N.getOperand(0);
902     OffImm = N.getOperand(1);
903     if (!GAN)
904       return true;
905 
906     if (GAN->getOffset() % Size == 0 &&
907         GAN->getGlobal()->getPointerAlignment(DL) >= Size)
908       return true;
909   }
910 
911   if (CurDAG->isBaseWithConstantOffset(N)) {
912     if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
913       int64_t RHSC = (int64_t)RHS->getZExtValue();
914       unsigned Scale = Log2_32(Size);
915       if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) {
916         Base = N.getOperand(0);
917         if (Base.getOpcode() == ISD::FrameIndex) {
918           int FI = cast<FrameIndexSDNode>(Base)->getIndex();
919           Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
920         }
921         OffImm = CurDAG->getTargetConstant(RHSC >> Scale, dl, MVT::i64);
922         return true;
923       }
924     }
925   }
926 
927   // Before falling back to our general case, check if the unscaled
928   // instructions can handle this. If so, that's preferable.
929   if (SelectAddrModeUnscaled(N, Size, Base, OffImm))
930     return false;
931 
932   // Base only. The address will be materialized into a register before
933   // the memory is accessed.
934   //    add x0, Xbase, #offset
935   //    ldr x0, [x0]
936   Base = N;
937   OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
938   return true;
939 }
940 
941 /// SelectAddrModeUnscaled - Select a "register plus unscaled signed 9-bit
942 /// immediate" address.  This should only match when there is an offset that
943 /// is not valid for a scaled immediate addressing mode.  The "Size" argument
944 /// is the size in bytes of the memory reference, which is needed here to know
945 /// what is valid for a scaled immediate.
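/// For example, "ldur x0, [x1, #-8]": the unscaled form accepts any byte offset
/// in [-256, 255], including offsets the scaled form cannot encode.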
946 bool AArch64DAGToDAGISel::SelectAddrModeUnscaled(SDValue N, unsigned Size,
947                                                  SDValue &Base,
948                                                  SDValue &OffImm) {
949   if (!CurDAG->isBaseWithConstantOffset(N))
950     return false;
951   if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
952     int64_t RHSC = RHS->getSExtValue();
953     // If the offset is valid as a scaled immediate, don't match here.
954     if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 &&
955         RHSC < (0x1000 << Log2_32(Size)))
956       return false;
957     if (RHSC >= -256 && RHSC < 256) {
958       Base = N.getOperand(0);
959       if (Base.getOpcode() == ISD::FrameIndex) {
960         int FI = cast<FrameIndexSDNode>(Base)->getIndex();
961         const TargetLowering *TLI = getTargetLowering();
962         Base = CurDAG->getTargetFrameIndex(
963             FI, TLI->getPointerTy(CurDAG->getDataLayout()));
964       }
965       OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i64);
966       return true;
967     }
968   }
969   return false;
970 }
971 
972 static SDValue Widen(SelectionDAG *CurDAG, SDValue N) {
973   SDLoc dl(N);
974   SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32);
975   SDValue ImpDef = SDValue(
976       CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, MVT::i64), 0);
977   MachineSDNode *Node = CurDAG->getMachineNode(
978       TargetOpcode::INSERT_SUBREG, dl, MVT::i64, ImpDef, N, SubReg);
979   return SDValue(Node, 0);
980 }
981 /// Check if the given SHL node (\p N) can be used to form an
982 /// Check if the given SHL node (\p N), can be used to form an
983 /// extended register for an addressing mode.
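/// For example, with Size=8 a (shl (sext_inreg x, i32), 3) operand can supply
/// the "w2, sxtw #3" part of an "ldr x0, [x1, w2, sxtw #3]" address.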
984 bool AArch64DAGToDAGISel::SelectExtendedSHL(SDValue N, unsigned Size,
985                                             bool WantExtend, SDValue &Offset,
986                                             SDValue &SignExtend) {
987   assert(N.getOpcode() == ISD::SHL && "Invalid opcode.");
988   ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
989   if (!CSD || (CSD->getZExtValue() & 0x7) != CSD->getZExtValue())
990     return false;
991 
992   SDLoc dl(N);
993   if (WantExtend) {
994     AArch64_AM::ShiftExtendType Ext =
995         getExtendTypeForNode(N.getOperand(0), true);
996     if (Ext == AArch64_AM::InvalidShiftExtend)
997       return false;
998 
999     Offset = narrowIfNeeded(CurDAG, N.getOperand(0).getOperand(0));
1000     SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl,
1001                                            MVT::i32);
1002   } else {
1003     Offset = N.getOperand(0);
1004     SignExtend = CurDAG->getTargetConstant(0, dl, MVT::i32);
1005   }
1006 
1007   unsigned LegalShiftVal = Log2_32(Size);
1008   unsigned ShiftVal = CSD->getZExtValue();
1009 
1010   if (ShiftVal != 0 && ShiftVal != LegalShiftVal)
1011     return false;
1012 
1013   return isWorthFolding(N);
1014 }
1015 
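/// SelectAddrModeWRO - Select a base register plus an extended (and optionally
/// shifted) W-register offset, e.g. "ldr x0, [x1, w2, sxtw #3]" for an 8-byte
/// access.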
1016 bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
1017                                             SDValue &Base, SDValue &Offset,
1018                                             SDValue &SignExtend,
1019                                             SDValue &DoShift) {
1020   if (N.getOpcode() != ISD::ADD)
1021     return false;
1022   SDValue LHS = N.getOperand(0);
1023   SDValue RHS = N.getOperand(1);
1024   SDLoc dl(N);
1025 
1026   // We don't want to match immediate adds here, because they are better lowered
1027   // to the register-immediate addressing modes.
1028   if (isa<ConstantSDNode>(LHS) || isa<ConstantSDNode>(RHS))
1029     return false;
1030 
1031   // Check if this particular node is reused in any non-memory related
1032   // operation.  If yes, do not try to fold this node into the address
1033   // computation, since the computation will be kept.
1034   const SDNode *Node = N.getNode();
1035   for (SDNode *UI : Node->uses()) {
1036     if (!isa<MemSDNode>(*UI))
1037       return false;
1038   }
1039 
1040   // Remember if it is worth folding N when it produces extended register.
1041   bool IsExtendedRegisterWorthFolding = isWorthFolding(N);
1042 
1043   // Try to match a shifted extend on the RHS.
1044   if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL &&
1045       SelectExtendedSHL(RHS, Size, true, Offset, SignExtend)) {
1046     Base = LHS;
1047     DoShift = CurDAG->getTargetConstant(true, dl, MVT::i32);
1048     return true;
1049   }
1050 
1051   // Try to match a shifted extend on the LHS.
1052   if (IsExtendedRegisterWorthFolding && LHS.getOpcode() == ISD::SHL &&
1053       SelectExtendedSHL(LHS, Size, true, Offset, SignExtend)) {
1054     Base = RHS;
1055     DoShift = CurDAG->getTargetConstant(true, dl, MVT::i32);
1056     return true;
1057   }
1058 
1059   // There was no shift, whatever else we find.
1060   DoShift = CurDAG->getTargetConstant(false, dl, MVT::i32);
1061 
1062   AArch64_AM::ShiftExtendType Ext = AArch64_AM::InvalidShiftExtend;
1063   // Try to match an unshifted extend on the LHS.
1064   if (IsExtendedRegisterWorthFolding &&
1065       (Ext = getExtendTypeForNode(LHS, true)) !=
1066           AArch64_AM::InvalidShiftExtend) {
1067     Base = RHS;
1068     Offset = narrowIfNeeded(CurDAG, LHS.getOperand(0));
1069     SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl,
1070                                            MVT::i32);
1071     if (isWorthFolding(LHS))
1072       return true;
1073   }
1074 
1075   // Try to match an unshifted extend on the RHS.
1076   if (IsExtendedRegisterWorthFolding &&
1077       (Ext = getExtendTypeForNode(RHS, true)) !=
1078           AArch64_AM::InvalidShiftExtend) {
1079     Base = LHS;
1080     Offset = narrowIfNeeded(CurDAG, RHS.getOperand(0));
1081     SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl,
1082                                            MVT::i32);
1083     if (isWorthFolding(RHS))
1084       return true;
1085   }
1086 
1087   return false;
1088 }
1089 
1090 // Check if the given immediate is preferred by ADD. Return true if the
1091 // immediate can be encoded in an ADD, or if it can be encoded in an
1092 // "ADD LSL #12" and cannot be encoded by a single MOVZ.
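// For example, 0x123000 is preferred: it can be encoded as "ADD ..., #0x123,
// LSL #12" but cannot be materialized by a single MOVZ.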
1093 static bool isPreferredADD(int64_t ImmOff) {
1094   // Constant in [0x0, 0xfff] can be encoded in ADD.
1095   if ((ImmOff & 0xfffffffffffff000LL) == 0x0LL)
1096     return true;
1097   // Check if it can be encoded in an "ADD LSL #12".
1098   if ((ImmOff & 0xffffffffff000fffLL) == 0x0LL)
1099     // As a single MOVZ is faster than an "ADD LSL #12", ignore such constants.
1100     return (ImmOff & 0xffffffffff00ffffLL) != 0x0LL &&
1101            (ImmOff & 0xffffffffffff0fffLL) != 0x0LL;
1102   return false;
1103 }
1104 
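/// SelectAddrModeXRO - Select a base register plus an X-register offset with an
/// optional left shift, e.g. "ldr x0, [x1, x2, lsl #3]" for an 8-byte access.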
1105 bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size,
1106                                             SDValue &Base, SDValue &Offset,
1107                                             SDValue &SignExtend,
1108                                             SDValue &DoShift) {
1109   if (N.getOpcode() != ISD::ADD)
1110     return false;
1111   SDValue LHS = N.getOperand(0);
1112   SDValue RHS = N.getOperand(1);
1113   SDLoc DL(N);
1114 
1115   // Check if this particular node is reused in any non-memory related
1116   // operation.  If yes, do not try to fold this node into the address
1117   // computation, since the computation will be kept.
1118   const SDNode *Node = N.getNode();
1119   for (SDNode *UI : Node->uses()) {
1120     if (!isa<MemSDNode>(*UI))
1121       return false;
1122   }
1123   // Watch out if RHS is a wide immediate: it cannot be selected into the
1124   // [BaseReg+Imm] addressing mode, and it may not be encodable in an ADD/SUB
1125   // either. In that case the [BaseReg + 0] addressing mode is used and
1126   // instructions like the following are generated:
1127   // instructions like:
1128   //     MOV  X0, WideImmediate
1129   //     ADD  X1, BaseReg, X0
1130   //     LDR  X2, [X1, 0]
1131   // For such situations, using the [BaseReg, XReg] addressing mode can save one
1132   // ADD/SUB:
1133   //     MOV  X0, WideImmediate
1134   //     LDR  X2, [BaseReg, X0]
1135   if (isa<ConstantSDNode>(RHS)) {
1136     int64_t ImmOff = (int64_t)cast<ConstantSDNode>(RHS)->getZExtValue();
1137     unsigned Scale = Log2_32(Size);
1138     // Skip immediates that can be selected in the load/store addressing
1139     // mode, and immediates that can be encoded by a single ADD (SUB is also
1140     // checked by using -ImmOff).
1141     if ((ImmOff % Size == 0 && ImmOff >= 0 && ImmOff < (0x1000 << Scale)) ||
1142         isPreferredADD(ImmOff) || isPreferredADD(-ImmOff))
1143       return false;
1144 
1145     SDValue Ops[] = { RHS };
1146     SDNode *MOVI =
1147         CurDAG->getMachineNode(AArch64::MOVi64imm, DL, MVT::i64, Ops);
1148     SDValue MOVIV = SDValue(MOVI, 0);
1149     // This ADD of two X registers will be selected into [Reg+Reg] mode.
1150     N = CurDAG->getNode(ISD::ADD, DL, MVT::i64, LHS, MOVIV);
1151   }
1152 
1153   // Remember if it is worth folding N when it produces extended register.
1154   bool IsExtendedRegisterWorthFolding = isWorthFolding(N);
1155 
1156   // Try to match a shifted extend on the RHS.
1157   if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL &&
1158       SelectExtendedSHL(RHS, Size, false, Offset, SignExtend)) {
1159     Base = LHS;
1160     DoShift = CurDAG->getTargetConstant(true, DL, MVT::i32);
1161     return true;
1162   }
1163 
1164   // Try to match a shifted extend on the LHS.
1165   if (IsExtendedRegisterWorthFolding && LHS.getOpcode() == ISD::SHL &&
1166       SelectExtendedSHL(LHS, Size, false, Offset, SignExtend)) {
1167     Base = RHS;
1168     DoShift = CurDAG->getTargetConstant(true, DL, MVT::i32);
1169     return true;
1170   }
1171 
1172   // Match any non-shifted, non-extend, non-immediate add expression.
1173   Base = LHS;
1174   Offset = RHS;
1175   SignExtend = CurDAG->getTargetConstant(false, DL, MVT::i32);
1176   DoShift = CurDAG->getTargetConstant(false, DL, MVT::i32);
1177   // Reg1 + Reg2 is free: no check needed.
1178   return true;
1179 }
1180 
1181 SDValue AArch64DAGToDAGISel::createDTuple(ArrayRef<SDValue> Regs) {
1182   static const unsigned RegClassIDs[] = {
1183       AArch64::DDRegClassID, AArch64::DDDRegClassID, AArch64::DDDDRegClassID};
1184   static const unsigned SubRegs[] = {AArch64::dsub0, AArch64::dsub1,
1185                                      AArch64::dsub2, AArch64::dsub3};
1186 
1187   return createTuple(Regs, RegClassIDs, SubRegs);
1188 }
1189 
1190 SDValue AArch64DAGToDAGISel::createQTuple(ArrayRef<SDValue> Regs) {
1191   static const unsigned RegClassIDs[] = {
1192       AArch64::QQRegClassID, AArch64::QQQRegClassID, AArch64::QQQQRegClassID};
1193   static const unsigned SubRegs[] = {AArch64::qsub0, AArch64::qsub1,
1194                                      AArch64::qsub2, AArch64::qsub3};
1195 
1196   return createTuple(Regs, RegClassIDs, SubRegs);
1197 }
1198 
1199 SDValue AArch64DAGToDAGISel::createZTuple(ArrayRef<SDValue> Regs) {
1200   static const unsigned RegClassIDs[] = {AArch64::ZPR2RegClassID,
1201                                          AArch64::ZPR3RegClassID,
1202                                          AArch64::ZPR4RegClassID};
1203   static const unsigned SubRegs[] = {AArch64::zsub0, AArch64::zsub1,
1204                                      AArch64::zsub2, AArch64::zsub3};
1205 
1206   return createTuple(Regs, RegClassIDs, SubRegs);
1207 }
1208 
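// For example, createQTuple({V0, V1}) produces
//   REG_SEQUENCE QQRegClassID, V0, qsub0, V1, qsub1
// i.e. a single untyped value covering the consecutive register pair.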
1209 SDValue AArch64DAGToDAGISel::createTuple(ArrayRef<SDValue> Regs,
1210                                          const unsigned RegClassIDs[],
1211                                          const unsigned SubRegs[]) {
1212   // There's no special register-class for a vector-list of 1 element: it's just
1213   // a vector.
1214   if (Regs.size() == 1)
1215     return Regs[0];
1216 
1217   assert(Regs.size() >= 2 && Regs.size() <= 4);
1218 
1219   SDLoc DL(Regs[0]);
1220 
1221   SmallVector<SDValue, 4> Ops;
1222 
1223   // First operand of REG_SEQUENCE is the desired RegClass.
1224   Ops.push_back(
1225       CurDAG->getTargetConstant(RegClassIDs[Regs.size() - 2], DL, MVT::i32));
1226 
1227   // Then we get pairs of source & subregister-position for the components.
1228   for (unsigned i = 0; i < Regs.size(); ++i) {
1229     Ops.push_back(Regs[i]);
1230     Ops.push_back(CurDAG->getTargetConstant(SubRegs[i], DL, MVT::i32));
1231   }
1232 
1233   SDNode *N =
1234       CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::Untyped, Ops);
1235   return SDValue(N, 0);
1236 }
1237 
1238 void AArch64DAGToDAGISel::SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc,
1239                                       bool isExt) {
1240   SDLoc dl(N);
1241   EVT VT = N->getValueType(0);
1242 
1243   unsigned ExtOff = isExt;
1244 
1245   // Form a REG_SEQUENCE to force register allocation.
1246   unsigned Vec0Off = ExtOff + 1;
1247   SmallVector<SDValue, 4> Regs(N->op_begin() + Vec0Off,
1248                                N->op_begin() + Vec0Off + NumVecs);
1249   SDValue RegSeq = createQTuple(Regs);
1250 
1251   SmallVector<SDValue, 6> Ops;
1252   if (isExt)
1253     Ops.push_back(N->getOperand(1));
1254   Ops.push_back(RegSeq);
1255   Ops.push_back(N->getOperand(NumVecs + ExtOff + 1));
1256   ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, Ops));
1257 }
1258 
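// tryIndexedLoad - Select pre/post-indexed loads, e.g. "ldr x0, [x1, #8]!"
// (pre-index) or "ldr x0, [x1], #8" (post-index), which produce both the loaded
// value and the updated base register.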
1259 bool AArch64DAGToDAGISel::tryIndexedLoad(SDNode *N) {
1260   LoadSDNode *LD = cast<LoadSDNode>(N);
1261   if (LD->isUnindexed())
1262     return false;
1263   EVT VT = LD->getMemoryVT();
1264   EVT DstVT = N->getValueType(0);
1265   ISD::MemIndexedMode AM = LD->getAddressingMode();
1266   bool IsPre = AM == ISD::PRE_INC || AM == ISD::PRE_DEC;
1267 
1268   // We're not doing validity checking here. That was done when checking
1269   // if we should mark the load as indexed or not. We're just selecting
1270   // the right instruction.
1271   unsigned Opcode = 0;
1272 
1273   ISD::LoadExtType ExtType = LD->getExtensionType();
1274   bool InsertTo64 = false;
1275   if (VT == MVT::i64)
1276     Opcode = IsPre ? AArch64::LDRXpre : AArch64::LDRXpost;
1277   else if (VT == MVT::i32) {
1278     if (ExtType == ISD::NON_EXTLOAD)
1279       Opcode = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost;
1280     else if (ExtType == ISD::SEXTLOAD)
1281       Opcode = IsPre ? AArch64::LDRSWpre : AArch64::LDRSWpost;
1282     else {
1283       Opcode = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost;
1284       InsertTo64 = true;
1285       // The result of the load is only i32. It's the subreg_to_reg that makes
1286       // it into an i64.
1287       DstVT = MVT::i32;
1288     }
1289   } else if (VT == MVT::i16) {
1290     if (ExtType == ISD::SEXTLOAD) {
1291       if (DstVT == MVT::i64)
1292         Opcode = IsPre ? AArch64::LDRSHXpre : AArch64::LDRSHXpost;
1293       else
1294         Opcode = IsPre ? AArch64::LDRSHWpre : AArch64::LDRSHWpost;
1295     } else {
1296       Opcode = IsPre ? AArch64::LDRHHpre : AArch64::LDRHHpost;
1297       InsertTo64 = DstVT == MVT::i64;
1298       // The result of the load is only i32. It's the subreg_to_reg that makes
1299       // it into an i64.
1300       DstVT = MVT::i32;
1301     }
1302   } else if (VT == MVT::i8) {
1303     if (ExtType == ISD::SEXTLOAD) {
1304       if (DstVT == MVT::i64)
1305         Opcode = IsPre ? AArch64::LDRSBXpre : AArch64::LDRSBXpost;
1306       else
1307         Opcode = IsPre ? AArch64::LDRSBWpre : AArch64::LDRSBWpost;
1308     } else {
1309       Opcode = IsPre ? AArch64::LDRBBpre : AArch64::LDRBBpost;
1310       InsertTo64 = DstVT == MVT::i64;
1311       // The result of the load is only i32. It's the subreg_to_reg that makes
1312       // it into an i64.
1313       DstVT = MVT::i32;
1314     }
1315   } else if (VT == MVT::f16) {
1316     Opcode = IsPre ? AArch64::LDRHpre : AArch64::LDRHpost;
1317   } else if (VT == MVT::bf16) {
1318     Opcode = IsPre ? AArch64::LDRHpre : AArch64::LDRHpost;
1319   } else if (VT == MVT::f32) {
1320     Opcode = IsPre ? AArch64::LDRSpre : AArch64::LDRSpost;
1321   } else if (VT == MVT::f64 || VT.is64BitVector()) {
1322     Opcode = IsPre ? AArch64::LDRDpre : AArch64::LDRDpost;
1323   } else if (VT.is128BitVector()) {
1324     Opcode = IsPre ? AArch64::LDRQpre : AArch64::LDRQpost;
1325   } else
1326     return false;
1327   SDValue Chain = LD->getChain();
1328   SDValue Base = LD->getBasePtr();
1329   ConstantSDNode *OffsetOp = cast<ConstantSDNode>(LD->getOffset());
1330   int OffsetVal = (int)OffsetOp->getZExtValue();
1331   SDLoc dl(N);
1332   SDValue Offset = CurDAG->getTargetConstant(OffsetVal, dl, MVT::i64);
1333   SDValue Ops[] = { Base, Offset, Chain };
1334   SDNode *Res = CurDAG->getMachineNode(Opcode, dl, MVT::i64, DstVT,
1335                                        MVT::Other, Ops);
1336   // Either way, we're replacing the node, so tell the caller that.
1337   SDValue LoadedVal = SDValue(Res, 1);
1338   if (InsertTo64) {
1339     SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32);
1340     LoadedVal =
1341         SDValue(CurDAG->getMachineNode(
1342                     AArch64::SUBREG_TO_REG, dl, MVT::i64,
1343                     CurDAG->getTargetConstant(0, dl, MVT::i64), LoadedVal,
1344                     SubReg),
1345                 0);
1346   }
1347 
1348   ReplaceUses(SDValue(N, 0), LoadedVal);
1349   ReplaceUses(SDValue(N, 1), SDValue(Res, 0));
1350   ReplaceUses(SDValue(N, 2), SDValue(Res, 2));
1351   CurDAG->RemoveDeadNode(N);
1352   return true;
1353 }
1354 
1355 void AArch64DAGToDAGISel::SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
1356                                      unsigned SubRegIdx) {
1357   SDLoc dl(N);
1358   EVT VT = N->getValueType(0);
1359   SDValue Chain = N->getOperand(0);
1360 
1361   SDValue Ops[] = {N->getOperand(2), // Mem operand;
1362                    Chain};
1363 
1364   const EVT ResTys[] = {MVT::Untyped, MVT::Other};
1365 
1366   SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
1367   SDValue SuperReg = SDValue(Ld, 0);
1368   for (unsigned i = 0; i < NumVecs; ++i)
1369     ReplaceUses(SDValue(N, i),
1370         CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg));
1371 
1372   ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1));
1373 
1374   // Transfer memoperands.
1375   MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand();
1376   CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ld), {MemOp});
1377 
1378   CurDAG->RemoveDeadNode(N);
1379 }
1380 
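     /// SelectPostLoad - Post-increment variant of SelectLoad: the selected
     /// machine node additionally produces the updated base register as its
     /// first (i64) result, ahead of the vector tuple and the chain.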
1381 void AArch64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs,
1382                                          unsigned Opc, unsigned SubRegIdx) {
1383   SDLoc dl(N);
1384   EVT VT = N->getValueType(0);
1385   SDValue Chain = N->getOperand(0);
1386 
1387   SDValue Ops[] = {N->getOperand(1), // Mem operand
1388                    N->getOperand(2), // Incremental
1389                    Chain};
1390 
1391   const EVT ResTys[] = {MVT::i64, // Type of the write back register
1392                         MVT::Untyped, MVT::Other};
1393 
1394   SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
1395 
1396   // Update uses of write back register
1397   ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 0));
1398 
1399   // Update uses of vector list
1400   SDValue SuperReg = SDValue(Ld, 1);
1401   if (NumVecs == 1)
1402     ReplaceUses(SDValue(N, 0), SuperReg);
1403   else
1404     for (unsigned i = 0; i < NumVecs; ++i)
1405       ReplaceUses(SDValue(N, i),
1406           CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg));
1407 
1408   // Update the chain
1409   ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2));
1410   CurDAG->RemoveDeadNode(N);
1411 }
1412 
1413 /// Optimize \param OldBase and \param OldOffset selecting the best addressing
1414 /// mode. Returns a tuple consisting of an Opcode, an SDValue representing the
1415 /// new Base and an SDValue representing the new offset.
1416 std::tuple<unsigned, SDValue, SDValue>
1417 AArch64DAGToDAGISel::findAddrModeSVELoadStore(SDNode *N, unsigned Opc_rr,
1418                                               unsigned Opc_ri,
1419                                               const SDValue &OldBase,
1420                                               const SDValue &OldOffset,
1421                                               unsigned Scale) {
1422   SDValue NewBase = OldBase;
1423   SDValue NewOffset = OldOffset;
1424   // Detect a possible Reg+Imm addressing mode.
1425   const bool IsRegImm = SelectAddrModeIndexedSVE</*Min=*/-8, /*Max=*/7>(
1426       N, OldBase, NewBase, NewOffset);
1427 
1428   // Detect a possible reg+reg addressing mode, but only if we haven't already
1429   // detected a Reg+Imm one.
1430   const bool IsRegReg =
1431       !IsRegImm && SelectSVERegRegAddrMode(OldBase, Scale, NewBase, NewOffset);
1432 
1433   // Select the instruction.
1434   return std::make_tuple(IsRegReg ? Opc_rr : Opc_ri, NewBase, NewOffset);
1435 }
1436 
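     /// SelectPredicatedLoad - Select an SVE predicated structure load. The
     /// addressing mode is optimized via findAddrModeSVELoadStore, picking the
     /// reg+imm form (Opc_ri) or the reg+reg form (Opc_rr), and the NumVecs
     /// results are extracted from the Untyped tuple with zsub sub-registers.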
1437 void AArch64DAGToDAGISel::SelectPredicatedLoad(SDNode *N, unsigned NumVecs,
1438                                                unsigned Scale, unsigned Opc_ri,
1439                                                unsigned Opc_rr) {
1440   assert(Scale < 4 && "Invalid scaling value.");
1441   SDLoc DL(N);
1442   EVT VT = N->getValueType(0);
1443   SDValue Chain = N->getOperand(0);
1444 
1445   // Optimize addressing mode.
1446   SDValue Base, Offset;
1447   unsigned Opc;
1448   std::tie(Opc, Base, Offset) = findAddrModeSVELoadStore(
1449       N, Opc_rr, Opc_ri, N->getOperand(2),
1450       CurDAG->getTargetConstant(0, DL, MVT::i64), Scale);
1451 
1452   SDValue Ops[] = {N->getOperand(1), // Predicate
1453                    Base,             // Memory operand
1454                    Offset, Chain};
1455 
1456   const EVT ResTys[] = {MVT::Untyped, MVT::Other};
1457 
1458   SDNode *Load = CurDAG->getMachineNode(Opc, DL, ResTys, Ops);
1459   SDValue SuperReg = SDValue(Load, 0);
1460   for (unsigned i = 0; i < NumVecs; ++i)
1461     ReplaceUses(SDValue(N, i), CurDAG->getTargetExtractSubreg(
1462                                    AArch64::zsub0 + i, DL, VT, SuperReg));
1463 
1464   // Copy chain
1465   unsigned ChainIdx = NumVecs;
1466   ReplaceUses(SDValue(N, ChainIdx), SDValue(Load, 1));
1467   CurDAG->RemoveDeadNode(N);
1468 }
1469 
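     /// SelectStore - Select a vector-structure store: the NumVecs source
     /// vectors are first glued into a Q- or D-register tuple with a
     /// REG_SEQUENCE so they end up in consecutive registers, then a single
     /// store machine node Opc consumes the tuple.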
1470 void AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs,
1471                                       unsigned Opc) {
1472   SDLoc dl(N);
1473   EVT VT = N->getOperand(2)->getValueType(0);
1474 
1475   // Form a REG_SEQUENCE to force register allocation.
1476   bool Is128Bit = VT.getSizeInBits() == 128;
1477   SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
1478   SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs);
1479 
1480   SDValue Ops[] = {RegSeq, N->getOperand(NumVecs + 2), N->getOperand(0)};
1481   SDNode *St = CurDAG->getMachineNode(Opc, dl, N->getValueType(0), Ops);
1482 
1483   // Transfer memoperands.
1484   MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand();
1485   CurDAG->setNodeMemRefs(cast<MachineSDNode>(St), {MemOp});
1486 
1487   ReplaceNode(N, St);
1488 }
1489 
1490 void AArch64DAGToDAGISel::SelectPredicatedStore(SDNode *N, unsigned NumVecs,
1491                                                 unsigned Scale, unsigned Opc_rr,
1492                                                 unsigned Opc_ri) {
1493   SDLoc dl(N);
1494 
1495   // Form a REG_SEQUENCE to force register allocation.
1496   SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
1497   SDValue RegSeq = createZTuple(Regs);
1498 
1499   // Optimize addressing mode.
1500   unsigned Opc;
1501   SDValue Offset, Base;
1502   std::tie(Opc, Base, Offset) = findAddrModeSVELoadStore(
1503       N, Opc_rr, Opc_ri, N->getOperand(NumVecs + 3),
1504       CurDAG->getTargetConstant(0, dl, MVT::i64), Scale);
1505 
1506   SDValue Ops[] = {RegSeq, N->getOperand(NumVecs + 2), // predicate
1507                    Base,                               // address
1508                    Offset,                             // offset
1509                    N->getOperand(0)};                  // chain
1510   SDNode *St = CurDAG->getMachineNode(Opc, dl, N->getValueType(0), Ops);
1511 
1512   ReplaceNode(N, St);
1513 }
1514 
1515 bool AArch64DAGToDAGISel::SelectAddrModeFrameIndexSVE(SDValue N, SDValue &Base,
1516                                                       SDValue &OffImm) {
1517   SDLoc dl(N);
1518   const DataLayout &DL = CurDAG->getDataLayout();
1519   const TargetLowering *TLI = getTargetLowering();
1520 
1521   // Try to match it for the frame address
1522   if (auto FINode = dyn_cast<FrameIndexSDNode>(N)) {
1523     int FI = FINode->getIndex();
1524     Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
1525     OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
1526     return true;
1527   }
1528 
1529   return false;
1530 }
1531 
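     /// SelectPostStore - Post-increment variant of SelectStore; the machine
     /// node also returns the updated base register as an i64 result.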
1532 void AArch64DAGToDAGISel::SelectPostStore(SDNode *N, unsigned NumVecs,
1533                                           unsigned Opc) {
1534   SDLoc dl(N);
1535   EVT VT = N->getOperand(2)->getValueType(0);
1536   const EVT ResTys[] = {MVT::i64,    // Type of the write back register
1537                         MVT::Other}; // Type for the Chain
1538 
1539   // Form a REG_SEQUENCE to force register allocation.
1540   bool Is128Bit = VT.getSizeInBits() == 128;
1541   SmallVector<SDValue, 4> Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs);
1542   SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs);
1543 
1544   SDValue Ops[] = {RegSeq,
1545                    N->getOperand(NumVecs + 1), // base register
1546                    N->getOperand(NumVecs + 2), // Incremental
1547                    N->getOperand(0)};          // Chain
1548   SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
1549 
1550   ReplaceNode(N, St);
1551 }
1552 
1553 namespace {
1554 /// WidenVector - Given a value in the V64 register class, produce the
1555 /// equivalent value in the V128 register class.
1556 class WidenVector {
1557   SelectionDAG &DAG;
1558 
1559 public:
1560   WidenVector(SelectionDAG &DAG) : DAG(DAG) {}
1561 
1562   SDValue operator()(SDValue V64Reg) {
1563     EVT VT = V64Reg.getValueType();
1564     unsigned NarrowSize = VT.getVectorNumElements();
1565     MVT EltTy = VT.getVectorElementType().getSimpleVT();
1566     MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
1567     SDLoc DL(V64Reg);
1568 
1569     SDValue Undef =
1570         SDValue(DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, WideTy), 0);
1571     return DAG.getTargetInsertSubreg(AArch64::dsub, DL, WideTy, Undef, V64Reg);
1572   }
1573 };
1574 } // namespace
1575 
1576 /// NarrowVector - Given a value in the V128 register class, produce the
1577 /// equivalent value in the V64 register class.
1578 static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
1579   EVT VT = V128Reg.getValueType();
1580   unsigned WideSize = VT.getVectorNumElements();
1581   MVT EltTy = VT.getVectorElementType().getSimpleVT();
1582   MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2);
1583 
1584   return DAG.getTargetExtractSubreg(AArch64::dsub, SDLoc(V128Reg), NarrowTy,
1585                                     V128Reg);
1586 }
1587 
1588 void AArch64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs,
1589                                          unsigned Opc) {
1590   SDLoc dl(N);
1591   EVT VT = N->getValueType(0);
1592   bool Narrow = VT.getSizeInBits() == 64;
1593 
1594   // Form a REG_SEQUENCE to force register allocation.
1595   SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
1596 
1597   if (Narrow)
1598     transform(Regs, Regs.begin(),
1599                    WidenVector(*CurDAG));
1600 
1601   SDValue RegSeq = createQTuple(Regs);
1602 
1603   const EVT ResTys[] = {MVT::Untyped, MVT::Other};
1604 
1605   unsigned LaneNo =
1606       cast<ConstantSDNode>(N->getOperand(NumVecs + 2))->getZExtValue();
1607 
1608   SDValue Ops[] = {RegSeq, CurDAG->getTargetConstant(LaneNo, dl, MVT::i64),
1609                    N->getOperand(NumVecs + 3), N->getOperand(0)};
1610   SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
1611   SDValue SuperReg = SDValue(Ld, 0);
1612 
1613   EVT WideVT = RegSeq.getOperand(1)->getValueType(0);
1614   static const unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1,
1615                                     AArch64::qsub2, AArch64::qsub3 };
1616   for (unsigned i = 0; i < NumVecs; ++i) {
1617     SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT, SuperReg);
1618     if (Narrow)
1619       NV = NarrowVector(NV, *CurDAG);
1620     ReplaceUses(SDValue(N, i), NV);
1621   }
1622 
1623   ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1));
1624   CurDAG->RemoveDeadNode(N);
1625 }
1626 
1627 void AArch64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs,
1628                                              unsigned Opc) {
1629   SDLoc dl(N);
1630   EVT VT = N->getValueType(0);
1631   bool Narrow = VT.getSizeInBits() == 64;
1632 
1633   // Form a REG_SEQUENCE to force register allocation.
1634   SmallVector<SDValue, 4> Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs);
1635 
1636   if (Narrow)
1637     transform(Regs, Regs.begin(),
1638                    WidenVector(*CurDAG));
1639 
1640   SDValue RegSeq = createQTuple(Regs);
1641 
1642   const EVT ResTys[] = {MVT::i64, // Type of the write back register
1643                         RegSeq->getValueType(0), MVT::Other};
1644 
1645   unsigned LaneNo =
1646       cast<ConstantSDNode>(N->getOperand(NumVecs + 1))->getZExtValue();
1647 
1648   SDValue Ops[] = {RegSeq,
1649                    CurDAG->getTargetConstant(LaneNo, dl,
1650                                              MVT::i64),         // Lane Number
1651                    N->getOperand(NumVecs + 2),                  // Base register
1652                    N->getOperand(NumVecs + 3),                  // Incremental
1653                    N->getOperand(0)};
1654   SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
1655 
1656   // Update uses of the write back register
1657   ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 0));
1658 
1659   // Update uses of the vector list
1660   SDValue SuperReg = SDValue(Ld, 1);
1661   if (NumVecs == 1) {
1662     ReplaceUses(SDValue(N, 0),
1663                 Narrow ? NarrowVector(SuperReg, *CurDAG) : SuperReg);
1664   } else {
1665     EVT WideVT = RegSeq.getOperand(1)->getValueType(0);
1666     static const unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1,
1667                                       AArch64::qsub2, AArch64::qsub3 };
1668     for (unsigned i = 0; i < NumVecs; ++i) {
1669       SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT,
1670                                                   SuperReg);
1671       if (Narrow)
1672         NV = NarrowVector(NV, *CurDAG);
1673       ReplaceUses(SDValue(N, i), NV);
1674     }
1675   }
1676 
1677   // Update the Chain
1678   ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2));
1679   CurDAG->RemoveDeadNode(N);
1680 }
1681 
1682 void AArch64DAGToDAGISel::SelectStoreLane(SDNode *N, unsigned NumVecs,
1683                                           unsigned Opc) {
1684   SDLoc dl(N);
1685   EVT VT = N->getOperand(2)->getValueType(0);
1686   bool Narrow = VT.getSizeInBits() == 64;
1687 
1688   // Form a REG_SEQUENCE to force register allocation.
1689   SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
1690 
1691   if (Narrow)
1692     transform(Regs, Regs.begin(),
1693                    WidenVector(*CurDAG));
1694 
1695   SDValue RegSeq = createQTuple(Regs);
1696 
1697   unsigned LaneNo =
1698       cast<ConstantSDNode>(N->getOperand(NumVecs + 2))->getZExtValue();
1699 
1700   SDValue Ops[] = {RegSeq, CurDAG->getTargetConstant(LaneNo, dl, MVT::i64),
1701                    N->getOperand(NumVecs + 3), N->getOperand(0)};
1702   SDNode *St = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
1703 
1704   // Transfer memoperands.
1705   MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand();
1706   CurDAG->setNodeMemRefs(cast<MachineSDNode>(St), {MemOp});
1707 
1708   ReplaceNode(N, St);
1709 }
1710 
1711 void AArch64DAGToDAGISel::SelectPostStoreLane(SDNode *N, unsigned NumVecs,
1712                                               unsigned Opc) {
1713   SDLoc dl(N);
1714   EVT VT = N->getOperand(2)->getValueType(0);
1715   bool Narrow = VT.getSizeInBits() == 64;
1716 
1717   // Form a REG_SEQUENCE to force register allocation.
1718   SmallVector<SDValue, 4> Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs);
1719 
1720   if (Narrow)
1721     transform(Regs, Regs.begin(),
1722                    WidenVector(*CurDAG));
1723 
1724   SDValue RegSeq = createQTuple(Regs);
1725 
1726   const EVT ResTys[] = {MVT::i64, // Type of the write back register
1727                         MVT::Other};
1728 
1729   unsigned LaneNo =
1730       cast<ConstantSDNode>(N->getOperand(NumVecs + 1))->getZExtValue();
1731 
1732   SDValue Ops[] = {RegSeq, CurDAG->getTargetConstant(LaneNo, dl, MVT::i64),
1733                    N->getOperand(NumVecs + 2), // Base Register
1734                    N->getOperand(NumVecs + 3), // Incremental
1735                    N->getOperand(0)};
1736   SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
1737 
1738   // Transfer memoperands.
1739   MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand();
1740   CurDAG->setNodeMemRefs(cast<MachineSDNode>(St), {MemOp});
1741 
1742   ReplaceNode(N, St);
1743 }
1744 
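     // Editor's note (illustrative sketch, not from the original source): for
     // an i32 node (and (srl x, 3), 0xff), the checks below see AndImm == 0xff
     // and SrlImm == 3, producing LSB == 3, MSB == 10 and
     // Opc == AArch64::UBFMWri, i.e. the equivalent of "ubfx w, w, #3, #8".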
1745 static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N,
1746                                        unsigned &Opc, SDValue &Opd0,
1747                                        unsigned &LSB, unsigned &MSB,
1748                                        unsigned NumberOfIgnoredLowBits,
1749                                        bool BiggerPattern) {
1750   assert(N->getOpcode() == ISD::AND &&
1751          "N must be a AND operation to call this function");
1752 
1753   EVT VT = N->getValueType(0);
1754 
1755   // We could test the type of VT here and return false when it does not
1756   // match, but since that check is done prior to this call in the current
1757   // context, we turn it into an assert to avoid redundant code.
1758   assert((VT == MVT::i32 || VT == MVT::i64) &&
1759          "Type checking must have been done before calling this function");
1760 
1761   // FIXME: simplify-demanded-bits in DAGCombine will probably have
1762   // changed the AND node to a 32-bit mask operation. We'll have to
1763   // undo that as part of the transform here if we want to catch all
1764   // the opportunities.
1765   // Currently the NumberOfIgnoredLowBits argument helps to recover
1766   // from these situations when matching the bigger pattern (bitfield insert).
1767 
1768   // For unsigned extracts, check for a shift right and mask
1769   uint64_t AndImm = 0;
1770   if (!isOpcWithIntImmediate(N, ISD::AND, AndImm))
1771     return false;
1772 
1773   const SDNode *Op0 = N->getOperand(0).getNode();
1774 
1775   // Because of simplify-demanded-bits in DAGCombine, the mask may have been
1776   // simplified. Try to undo that
1777   AndImm |= maskTrailingOnes<uint64_t>(NumberOfIgnoredLowBits);
1778 
1779   // The immediate is a mask of the low bits iff imm & (imm+1) == 0
1780   if (AndImm & (AndImm + 1))
1781     return false;
1782 
1783   bool ClampMSB = false;
1784   uint64_t SrlImm = 0;
1785   // Handle the SRL + ANY_EXTEND case.
1786   if (VT == MVT::i64 && Op0->getOpcode() == ISD::ANY_EXTEND &&
1787       isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL, SrlImm)) {
1788     // Extend the incoming operand of the SRL to 64-bit.
1789     Opd0 = Widen(CurDAG, Op0->getOperand(0).getOperand(0));
1790     // Make sure to clamp the MSB so that we preserve the semantics of the
1791     // original operations.
1792     ClampMSB = true;
1793   } else if (VT == MVT::i32 && Op0->getOpcode() == ISD::TRUNCATE &&
1794              isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL,
1795                                    SrlImm)) {
1796     // If the shift result was truncated, we can still combine them.
1797     Opd0 = Op0->getOperand(0).getOperand(0);
1798 
1799     // Use the type of SRL node.
1800     VT = Opd0->getValueType(0);
1801   } else if (isOpcWithIntImmediate(Op0, ISD::SRL, SrlImm)) {
1802     Opd0 = Op0->getOperand(0);
1803   } else if (BiggerPattern) {
1804     // Let's pretend a 0 shift right has been performed.
1805     // The resulting code will be at least as good as the original one
1806     // plus it may expose more opportunities for bitfield insert pattern.
1807     // FIXME: Currently we limit this to the bigger pattern, because
1808     // some optimizations expect AND and not UBFM.
1809     Opd0 = N->getOperand(0);
1810   } else
1811     return false;
1812 
1813   // Bail out on large immediates. This happens when no proper
1814   // combining/constant folding was performed.
1815   if (!BiggerPattern && (SrlImm <= 0 || SrlImm >= VT.getSizeInBits())) {
1816     LLVM_DEBUG(
1817         (dbgs() << N
1818                 << ": Found large shift immediate, this should not happen\n"));
1819     return false;
1820   }
1821 
1822   LSB = SrlImm;
1823   MSB = SrlImm + (VT == MVT::i32 ? countTrailingOnes<uint32_t>(AndImm)
1824                                  : countTrailingOnes<uint64_t>(AndImm)) -
1825         1;
1826   if (ClampMSB)
1827     // Since we're moving the extend before the right shift operation, we need
1828     // to clamp the MSB to make sure we don't shift in undefined bits instead of
1829     // the zeros which would get shifted in with the original right shift
1830     // operation.
1831     MSB = MSB > 31 ? 31 : MSB;
1832 
1833   Opc = VT == MVT::i32 ? AArch64::UBFMWri : AArch64::UBFMXri;
1834   return true;
1835 }
1836 
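     // Editor's note (illustrative sketch): for an i32 node
     // (sign_extend_inreg (srl x, 5), i8), ShiftImm == 5 and Width == 8 give
     // Immr == 5, Imms == 12 and Opc == AArch64::SBFMWri, i.e. the equivalent
     // of "sbfx w, w, #5, #8".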
1837 static bool isBitfieldExtractOpFromSExtInReg(SDNode *N, unsigned &Opc,
1838                                              SDValue &Opd0, unsigned &Immr,
1839                                              unsigned &Imms) {
1840   assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
1841 
1842   EVT VT = N->getValueType(0);
1843   unsigned BitWidth = VT.getSizeInBits();
1844   assert((VT == MVT::i32 || VT == MVT::i64) &&
1845          "Type checking must have been done before calling this function");
1846 
1847   SDValue Op = N->getOperand(0);
1848   if (Op->getOpcode() == ISD::TRUNCATE) {
1849     Op = Op->getOperand(0);
1850     VT = Op->getValueType(0);
1851     BitWidth = VT.getSizeInBits();
1852   }
1853 
1854   uint64_t ShiftImm;
1855   if (!isOpcWithIntImmediate(Op.getNode(), ISD::SRL, ShiftImm) &&
1856       !isOpcWithIntImmediate(Op.getNode(), ISD::SRA, ShiftImm))
1857     return false;
1858 
1859   unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
1860   if (ShiftImm + Width > BitWidth)
1861     return false;
1862 
1863   Opc = (VT == MVT::i32) ? AArch64::SBFMWri : AArch64::SBFMXri;
1864   Opd0 = Op.getOperand(0);
1865   Immr = ShiftImm;
1866   Imms = ShiftImm + Width - 1;
1867   return true;
1868 }
1869 
1870 static bool isSeveralBitsExtractOpFromShr(SDNode *N, unsigned &Opc,
1871                                           SDValue &Opd0, unsigned &LSB,
1872                                           unsigned &MSB) {
1873   // We are looking for the following pattern which basically extracts several
1874   // contiguous bits from the source value and places them at the LSB of the
1875   // destination value; all other bits of the destination value are set to zero:
1876   //
1877   // Value2 = AND Value, MaskImm
1878   // SRL Value2, ShiftImm
1879   //
1880   // with MaskImm >> ShiftImm used to determine the bit width.
1881   //
1882   // This gets selected into a single UBFM:
1883   //
1884   // UBFM Value, ShiftImm, BitWide + SrlImm - 1
1885   //
1886 
1887   if (N->getOpcode() != ISD::SRL)
1888     return false;
1889 
1890   uint64_t AndMask = 0;
1891   if (!isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, AndMask))
1892     return false;
1893 
1894   Opd0 = N->getOperand(0).getOperand(0);
1895 
1896   uint64_t SrlImm = 0;
1897   if (!isIntImmediate(N->getOperand(1), SrlImm))
1898     return false;
1899 
1900   // Check whether we really have several bits extract here.
1901   unsigned BitWide = 64 - countLeadingOnes(~(AndMask >> SrlImm));
1902   if (BitWide && isMask_64(AndMask >> SrlImm)) {
1903     if (N->getValueType(0) == MVT::i32)
1904       Opc = AArch64::UBFMWri;
1905     else
1906       Opc = AArch64::UBFMXri;
1907 
1908     LSB = SrlImm;
1909     MSB = BitWide + SrlImm - 1;
1910     return true;
1911   }
1912 
1913   return false;
1914 }
1915 
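     // Editor's note (illustrative sketch): for an i32 node
     // (sra (shl x, 24), 24), ShlImm == 24 and SrlImm == 24 give Immr == 0,
     // Imms == 7 and Opc == AArch64::SBFMWri, i.e. a plain "sxtb".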
1916 static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
1917                                        unsigned &Immr, unsigned &Imms,
1918                                        bool BiggerPattern) {
1919   assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
1920          "N must be a SHR/SRA operation to call this function");
1921 
1922   EVT VT = N->getValueType(0);
1923 
1924   // We could test the type of VT here and return false when it does not
1925   // match, but since that check is done prior to this call in the current
1926   // context, we turn it into an assert to avoid redundant code.
1927   assert((VT == MVT::i32 || VT == MVT::i64) &&
1928          "Type checking must have been done before calling this function");
1929 
1930   // Check for AND + SRL doing several bits extract.
1931   if (isSeveralBitsExtractOpFromShr(N, Opc, Opd0, Immr, Imms))
1932     return true;
1933 
1934   // We're looking for a shift of a shift.
1935   uint64_t ShlImm = 0;
1936   uint64_t TruncBits = 0;
1937   if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL, ShlImm)) {
1938     Opd0 = N->getOperand(0).getOperand(0);
1939   } else if (VT == MVT::i32 && N->getOpcode() == ISD::SRL &&
1940              N->getOperand(0).getNode()->getOpcode() == ISD::TRUNCATE) {
1941     // We are looking for a shift of truncate. Truncate from i64 to i32 could
1942     // be considered as setting high 32 bits as zero. Our strategy here is to
1943     // always generate 64bit UBFM. This consistency will help the CSE pass
1944     // later find more redundancy.
1945     Opd0 = N->getOperand(0).getOperand(0);
1946     TruncBits = Opd0->getValueType(0).getSizeInBits() - VT.getSizeInBits();
1947     VT = Opd0.getValueType();
1948     assert(VT == MVT::i64 && "the promoted type should be i64");
1949   } else if (BiggerPattern) {
1950     // Let's pretend a 0 shift left has been performed.
1951     // FIXME: Currently we limit this to the bigger pattern case,
1952     // because some optimizations expect AND and not UBFM
1953     Opd0 = N->getOperand(0);
1954   } else
1955     return false;
1956 
1957   // Missing combines/constant folding may have left us with strange
1958   // constants.
1959   if (ShlImm >= VT.getSizeInBits()) {
1960     LLVM_DEBUG(
1961         (dbgs() << N
1962                 << ": Found large shift immediate, this should not happen\n"));
1963     return false;
1964   }
1965 
1966   uint64_t SrlImm = 0;
1967   if (!isIntImmediate(N->getOperand(1), SrlImm))
1968     return false;
1969 
1970   assert(SrlImm > 0 && SrlImm < VT.getSizeInBits() &&
1971          "bad amount in shift node!");
1972   int immr = SrlImm - ShlImm;
1973   Immr = immr < 0 ? immr + VT.getSizeInBits() : immr;
1974   Imms = VT.getSizeInBits() - ShlImm - TruncBits - 1;
1975   // SRA requires a signed extraction
1976   if (VT == MVT::i32)
1977     Opc = N->getOpcode() == ISD::SRA ? AArch64::SBFMWri : AArch64::UBFMWri;
1978   else
1979     Opc = N->getOpcode() == ISD::SRA ? AArch64::SBFMXri : AArch64::UBFMXri;
1980   return true;
1981 }
1982 
1983 bool AArch64DAGToDAGISel::tryBitfieldExtractOpFromSExt(SDNode *N) {
1984   assert(N->getOpcode() == ISD::SIGN_EXTEND);
1985 
1986   EVT VT = N->getValueType(0);
1987   EVT NarrowVT = N->getOperand(0)->getValueType(0);
1988   if (VT != MVT::i64 || NarrowVT != MVT::i32)
1989     return false;
1990 
1991   uint64_t ShiftImm;
1992   SDValue Op = N->getOperand(0);
1993   if (!isOpcWithIntImmediate(Op.getNode(), ISD::SRA, ShiftImm))
1994     return false;
1995 
1996   SDLoc dl(N);
1997   // Extend the incoming operand of the shift to 64-bits.
1998   SDValue Opd0 = Widen(CurDAG, Op.getOperand(0));
1999   unsigned Immr = ShiftImm;
2000   unsigned Imms = NarrowVT.getSizeInBits() - 1;
2001   SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(Immr, dl, VT),
2002                    CurDAG->getTargetConstant(Imms, dl, VT)};
2003   CurDAG->SelectNodeTo(N, AArch64::SBFMXri, VT, Ops);
2004   return true;
2005 }
2006 
2007 /// Try to form fcvtl2 instructions from a floating-point extend of a high-half
2008 /// extract of a subvector.
2009 bool AArch64DAGToDAGISel::tryHighFPExt(SDNode *N) {
2010   assert(N->getOpcode() == ISD::FP_EXTEND);
2011 
2012   // There are 2 forms of fcvtl2 - extend to double or extend to float.
2013   SDValue Extract = N->getOperand(0);
2014   EVT VT = N->getValueType(0);
2015   EVT NarrowVT = Extract.getValueType();
2016   if ((VT != MVT::v2f64 || NarrowVT != MVT::v2f32) &&
2017       (VT != MVT::v4f32 || NarrowVT != MVT::v4f16))
2018     return false;
2019 
2020   // Optionally look past a bitcast.
2021   Extract = peekThroughBitcasts(Extract);
2022   if (Extract.getOpcode() != ISD::EXTRACT_SUBVECTOR)
2023     return false;
2024 
2025   // Match extract from start of high half index.
2026   // Example: v8i16 -> v4i16 means the extract must begin at index 4.
2027   unsigned ExtractIndex = Extract.getConstantOperandVal(1);
2028   if (ExtractIndex != Extract.getValueType().getVectorNumElements())
2029     return false;
2030 
2031   auto Opcode = VT == MVT::v2f64 ? AArch64::FCVTLv4i32 : AArch64::FCVTLv8i16;
2032   CurDAG->SelectNodeTo(N, Opcode, VT, Extract.getOperand(0));
2033   return true;
2034 }
2035 
2036 static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc,
2037                                 SDValue &Opd0, unsigned &Immr, unsigned &Imms,
2038                                 unsigned NumberOfIgnoredLowBits = 0,
2039                                 bool BiggerPattern = false) {
2040   if (N->getValueType(0) != MVT::i32 && N->getValueType(0) != MVT::i64)
2041     return false;
2042 
2043   switch (N->getOpcode()) {
2044   default:
2045     if (!N->isMachineOpcode())
2046       return false;
2047     break;
2048   case ISD::AND:
2049     return isBitfieldExtractOpFromAnd(CurDAG, N, Opc, Opd0, Immr, Imms,
2050                                       NumberOfIgnoredLowBits, BiggerPattern);
2051   case ISD::SRL:
2052   case ISD::SRA:
2053     return isBitfieldExtractOpFromShr(N, Opc, Opd0, Immr, Imms, BiggerPattern);
2054 
2055   case ISD::SIGN_EXTEND_INREG:
2056     return isBitfieldExtractOpFromSExtInReg(N, Opc, Opd0, Immr, Imms);
2057   }
2058 
2059   unsigned NOpc = N->getMachineOpcode();
2060   switch (NOpc) {
2061   default:
2062     return false;
2063   case AArch64::SBFMWri:
2064   case AArch64::UBFMWri:
2065   case AArch64::SBFMXri:
2066   case AArch64::UBFMXri:
2067     Opc = NOpc;
2068     Opd0 = N->getOperand(0);
2069     Immr = cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
2070     Imms = cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
2071     return true;
2072   }
2073   // Unreachable
2074   return false;
2075 }
2076 
2077 bool AArch64DAGToDAGISel::tryBitfieldExtractOp(SDNode *N) {
2078   unsigned Opc, Immr, Imms;
2079   SDValue Opd0;
2080   if (!isBitfieldExtractOp(CurDAG, N, Opc, Opd0, Immr, Imms))
2081     return false;
2082 
2083   EVT VT = N->getValueType(0);
2084   SDLoc dl(N);
2085 
2086   // If the bit extract operation is 64bit but the original type is 32bit, we
2087   // need to add one EXTRACT_SUBREG.
2088   if ((Opc == AArch64::SBFMXri || Opc == AArch64::UBFMXri) && VT == MVT::i32) {
2089     SDValue Ops64[] = {Opd0, CurDAG->getTargetConstant(Immr, dl, MVT::i64),
2090                        CurDAG->getTargetConstant(Imms, dl, MVT::i64)};
2091 
2092     SDNode *BFM = CurDAG->getMachineNode(Opc, dl, MVT::i64, Ops64);
2093     SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32);
2094     ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl,
2095                                           MVT::i32, SDValue(BFM, 0), SubReg));
2096     return true;
2097   }
2098 
2099   SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(Immr, dl, VT),
2100                    CurDAG->getTargetConstant(Imms, dl, VT)};
2101   CurDAG->SelectNodeTo(N, Opc, VT, Ops);
2102   return true;
2103 }
2104 
2105 /// Does DstMask form a complementary pair with the mask provided by
2106 /// BitsToBeInserted, suitable for use in a BFI instruction? Roughly speaking,
2107 /// this asks whether DstMask zeroes precisely those bits that will be set by
2108 /// the other half.
2109 static bool isBitfieldDstMask(uint64_t DstMask, const APInt &BitsToBeInserted,
2110                               unsigned NumberOfIgnoredHighBits, EVT VT) {
2111   assert((VT == MVT::i32 || VT == MVT::i64) &&
2112          "i32 or i64 mask type expected!");
2113   unsigned BitWidth = VT.getSizeInBits() - NumberOfIgnoredHighBits;
2114 
2115   APInt SignificantDstMask = APInt(BitWidth, DstMask);
2116   APInt SignificantBitsToBeInserted = BitsToBeInserted.zextOrTrunc(BitWidth);
2117 
2118   return (SignificantDstMask & SignificantBitsToBeInserted) == 0 &&
2119          (SignificantDstMask | SignificantBitsToBeInserted).isAllOnesValue();
2120 }
2121 
2122 // Look for bits that will be useful for later uses.
2123 // A bit is considered useless as soon as it is dropped and is never used
2124 // before it has been dropped.
2125 // E.g., looking for the useful bits of x:
2126 // 1. y = x & 0x7
2127 // 2. z = y >> 2
2128 // After #1, the useful bits of x are 0x7; these useful bits live through
2129 // y.
2130 // After #2, the useful bits of x are 0x4.
2131 // However, if x is used by an unpredictable instruction, then all its bits
2132 // are useful.
2133 // E.g.
2134 // 1. y = x & 0x7
2135 // 2. z = y >> 2
2136 // 3. str x, [@x]
2137 static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth = 0);
2138 
2139 static void getUsefulBitsFromAndWithImmediate(SDValue Op, APInt &UsefulBits,
2140                                               unsigned Depth) {
2141   uint64_t Imm =
2142       cast<const ConstantSDNode>(Op.getOperand(1).getNode())->getZExtValue();
2143   Imm = AArch64_AM::decodeLogicalImmediate(Imm, UsefulBits.getBitWidth());
2144   UsefulBits &= APInt(UsefulBits.getBitWidth(), Imm);
2145   getUsefulBits(Op, UsefulBits, Depth + 1);
2146 }
2147 
2148 static void getUsefulBitsFromBitfieldMoveOpd(SDValue Op, APInt &UsefulBits,
2149                                              uint64_t Imm, uint64_t MSB,
2150                                              unsigned Depth) {
2151   // inherit the bitwidth value
2152   APInt OpUsefulBits(UsefulBits);
2153   OpUsefulBits = 1;
2154 
2155   if (MSB >= Imm) {
2156     OpUsefulBits <<= MSB - Imm + 1;
2157     --OpUsefulBits;
2158     // The interesting part will be in the lower part of the result
2159     getUsefulBits(Op, OpUsefulBits, Depth + 1);
2160     // The interesting part was starting at Imm in the argument
2161     OpUsefulBits <<= Imm;
2162   } else {
2163     OpUsefulBits <<= MSB + 1;
2164     --OpUsefulBits;
2165     // The interesting part will be shifted in the result
2166     OpUsefulBits <<= OpUsefulBits.getBitWidth() - Imm;
2167     getUsefulBits(Op, OpUsefulBits, Depth + 1);
2168     // The interesting part was at zero in the argument
2169     OpUsefulBits.lshrInPlace(OpUsefulBits.getBitWidth() - Imm);
2170   }
2171 
2172   UsefulBits &= OpUsefulBits;
2173 }
2174 
2175 static void getUsefulBitsFromUBFM(SDValue Op, APInt &UsefulBits,
2176                                   unsigned Depth) {
2177   uint64_t Imm =
2178       cast<const ConstantSDNode>(Op.getOperand(1).getNode())->getZExtValue();
2179   uint64_t MSB =
2180       cast<const ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue();
2181 
2182   getUsefulBitsFromBitfieldMoveOpd(Op, UsefulBits, Imm, MSB, Depth);
2183 }
2184 
2185 static void getUsefulBitsFromOrWithShiftedReg(SDValue Op, APInt &UsefulBits,
2186                                               unsigned Depth) {
2187   uint64_t ShiftTypeAndValue =
2188       cast<const ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue();
2189   APInt Mask(UsefulBits);
2190   Mask.clearAllBits();
2191   Mask.flipAllBits();
2192 
2193   if (AArch64_AM::getShiftType(ShiftTypeAndValue) == AArch64_AM::LSL) {
2194     // Shift Left
2195     uint64_t ShiftAmt = AArch64_AM::getShiftValue(ShiftTypeAndValue);
2196     Mask <<= ShiftAmt;
2197     getUsefulBits(Op, Mask, Depth + 1);
2198     Mask.lshrInPlace(ShiftAmt);
2199   } else if (AArch64_AM::getShiftType(ShiftTypeAndValue) == AArch64_AM::LSR) {
2200     // Shift Right
2201     // We do not handle AArch64_AM::ASR, because the sign will change the
2202     // number of useful bits
2203     uint64_t ShiftAmt = AArch64_AM::getShiftValue(ShiftTypeAndValue);
2204     Mask.lshrInPlace(ShiftAmt);
2205     getUsefulBits(Op, Mask, Depth + 1);
2206     Mask <<= ShiftAmt;
2207   } else
2208     return;
2209 
2210   UsefulBits &= Mask;
2211 }
2212 
2213 static void getUsefulBitsFromBFM(SDValue Op, SDValue Orig, APInt &UsefulBits,
2214                                  unsigned Depth) {
2215   uint64_t Imm =
2216       cast<const ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue();
2217   uint64_t MSB =
2218       cast<const ConstantSDNode>(Op.getOperand(3).getNode())->getZExtValue();
2219 
2220   APInt OpUsefulBits(UsefulBits);
2221   OpUsefulBits = 1;
2222 
2223   APInt ResultUsefulBits(UsefulBits.getBitWidth(), 0);
2224   ResultUsefulBits.flipAllBits();
2225   APInt Mask(UsefulBits.getBitWidth(), 0);
2226 
2227   getUsefulBits(Op, ResultUsefulBits, Depth + 1);
2228 
2229   if (MSB >= Imm) {
2230     // The instruction is a BFXIL.
2231     uint64_t Width = MSB - Imm + 1;
2232     uint64_t LSB = Imm;
2233 
2234     OpUsefulBits <<= Width;
2235     --OpUsefulBits;
2236 
2237     if (Op.getOperand(1) == Orig) {
2238       // Copy the low bits from the result to bits starting from LSB.
2239       Mask = ResultUsefulBits & OpUsefulBits;
2240       Mask <<= LSB;
2241     }
2242 
2243     if (Op.getOperand(0) == Orig)
2244       // Bits starting from LSB in the input contribute to the result.
2245       Mask |= (ResultUsefulBits & ~OpUsefulBits);
2246   } else {
2247     // The instruction is a BFI.
2248     uint64_t Width = MSB + 1;
2249     uint64_t LSB = UsefulBits.getBitWidth() - Imm;
2250 
2251     OpUsefulBits <<= Width;
2252     --OpUsefulBits;
2253     OpUsefulBits <<= LSB;
2254 
2255     if (Op.getOperand(1) == Orig) {
2256       // Copy the bits from the result to the zero bits.
2257       Mask = ResultUsefulBits & OpUsefulBits;
2258       Mask.lshrInPlace(LSB);
2259     }
2260 
2261     if (Op.getOperand(0) == Orig)
2262       Mask |= (ResultUsefulBits & ~OpUsefulBits);
2263   }
2264 
2265   UsefulBits &= Mask;
2266 }
2267 
2268 static void getUsefulBitsForUse(SDNode *UserNode, APInt &UsefulBits,
2269                                 SDValue Orig, unsigned Depth) {
2270 
2271   // Users of this node should have already been instruction selected
2272   // FIXME: Can we turn that into an assert?
2273   if (!UserNode->isMachineOpcode())
2274     return;
2275 
2276   switch (UserNode->getMachineOpcode()) {
2277   default:
2278     return;
2279   case AArch64::ANDSWri:
2280   case AArch64::ANDSXri:
2281   case AArch64::ANDWri:
2282   case AArch64::ANDXri:
2283     // We increment Depth only when we call getUsefulBits
2284     return getUsefulBitsFromAndWithImmediate(SDValue(UserNode, 0), UsefulBits,
2285                                              Depth);
2286   case AArch64::UBFMWri:
2287   case AArch64::UBFMXri:
2288     return getUsefulBitsFromUBFM(SDValue(UserNode, 0), UsefulBits, Depth);
2289 
2290   case AArch64::ORRWrs:
2291   case AArch64::ORRXrs:
2292     if (UserNode->getOperand(1) != Orig)
2293       return;
2294     return getUsefulBitsFromOrWithShiftedReg(SDValue(UserNode, 0), UsefulBits,
2295                                              Depth);
2296   case AArch64::BFMWri:
2297   case AArch64::BFMXri:
2298     return getUsefulBitsFromBFM(SDValue(UserNode, 0), Orig, UsefulBits, Depth);
2299 
2300   case AArch64::STRBBui:
2301   case AArch64::STURBBi:
2302     if (UserNode->getOperand(0) != Orig)
2303       return;
2304     UsefulBits &= APInt(UsefulBits.getBitWidth(), 0xff);
2305     return;
2306 
2307   case AArch64::STRHHui:
2308   case AArch64::STURHHi:
2309     if (UserNode->getOperand(0) != Orig)
2310       return;
2311     UsefulBits &= APInt(UsefulBits.getBitWidth(), 0xffff);
2312     return;
2313   }
2314 }
2315 
2316 static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth) {
2317   if (Depth >= SelectionDAG::MaxRecursionDepth)
2318     return;
2319   // Initialize UsefulBits
2320   if (!Depth) {
2321     unsigned Bitwidth = Op.getScalarValueSizeInBits();
2322     // At the beginning, assume every produced bits is useful
2323     UsefulBits = APInt(Bitwidth, 0);
2324     UsefulBits.flipAllBits();
2325   }
2326   APInt UsersUsefulBits(UsefulBits.getBitWidth(), 0);
2327 
2328   for (SDNode *Node : Op.getNode()->uses()) {
2329     // A use cannot produce useful bits
2330     APInt UsefulBitsForUse = APInt(UsefulBits);
2331     getUsefulBitsForUse(Node, UsefulBitsForUse, Op, Depth);
2332     UsersUsefulBits |= UsefulBitsForUse;
2333   }
2334   // UsefulBits contains the produced bits that are meaningful for the
2335   // current definition, thus a user cannot make a bit meaningful at
2336   // this point
2337   UsefulBits &= UsersUsefulBits;
2338 }
2339 
2340 /// Create a machine node performing a notional SHL of Op by ShlAmount. If
2341 /// ShlAmount is negative, do a (logical) right-shift instead. If ShlAmount is
2342 /// 0, return Op unchanged.
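     /// (Editor's illustration: on i32, getLeftShift(CurDAG, Op, 5) emits
     /// UBFMWri Op, #27, #26, the alias of "lsl #5", while a ShlAmount of -5
     /// emits UBFMWri Op, #5, #31, the alias of "lsr #5".)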
2343 static SDValue getLeftShift(SelectionDAG *CurDAG, SDValue Op, int ShlAmount) {
2344   if (ShlAmount == 0)
2345     return Op;
2346 
2347   EVT VT = Op.getValueType();
2348   SDLoc dl(Op);
2349   unsigned BitWidth = VT.getSizeInBits();
2350   unsigned UBFMOpc = BitWidth == 32 ? AArch64::UBFMWri : AArch64::UBFMXri;
2351 
2352   SDNode *ShiftNode;
2353   if (ShlAmount > 0) {
2354     // LSL wD, wN, #Amt == UBFM wD, wN, #32-Amt, #31-Amt
2355     ShiftNode = CurDAG->getMachineNode(
2356         UBFMOpc, dl, VT, Op,
2357         CurDAG->getTargetConstant(BitWidth - ShlAmount, dl, VT),
2358         CurDAG->getTargetConstant(BitWidth - 1 - ShlAmount, dl, VT));
2359   } else {
2360     // LSR wD, wN, #Amt == UBFM wD, wN, #Amt, #32-1
2361     assert(ShlAmount < 0 && "expected right shift");
2362     int ShrAmount = -ShlAmount;
2363     ShiftNode = CurDAG->getMachineNode(
2364         UBFMOpc, dl, VT, Op, CurDAG->getTargetConstant(ShrAmount, dl, VT),
2365         CurDAG->getTargetConstant(BitWidth - 1, dl, VT));
2366   }
2367 
2368   return SDValue(ShiftNode, 0);
2369 }
2370 
2371 /// Does this tree qualify as an attempt to move a bitfield into position,
2372 /// essentially "(and (shl VAL, N), Mask)".
2373 static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op,
2374                                     bool BiggerPattern,
2375                                     SDValue &Src, int &ShiftAmount,
2376                                     int &MaskWidth) {
2377   EVT VT = Op.getValueType();
2378   unsigned BitWidth = VT.getSizeInBits();
2379   (void)BitWidth;
2380   assert(BitWidth == 32 || BitWidth == 64);
2381 
2382   KnownBits Known = CurDAG->computeKnownBits(Op);
2383 
2384   // Non-zero in the sense that they're not provably zero, which is the key
2385   // point if we want to use this value
2386   uint64_t NonZeroBits = (~Known.Zero).getZExtValue();
2387 
2388   // Discard a constant AND mask if present. It's safe because the node will
2389   // already have been factored into the computeKnownBits calculation above.
2390   uint64_t AndImm;
2391   if (isOpcWithIntImmediate(Op.getNode(), ISD::AND, AndImm)) {
2392     assert((~APInt(BitWidth, AndImm) & ~Known.Zero) == 0);
2393     Op = Op.getOperand(0);
2394   }
2395 
2396   // Don't match if the SHL has more than one use, since then we'll end up
2397   // generating SHL+UBFIZ instead of just keeping SHL+AND.
2398   if (!BiggerPattern && !Op.hasOneUse())
2399     return false;
2400 
2401   uint64_t ShlImm;
2402   if (!isOpcWithIntImmediate(Op.getNode(), ISD::SHL, ShlImm))
2403     return false;
2404   Op = Op.getOperand(0);
2405 
2406   if (!isShiftedMask_64(NonZeroBits))
2407     return false;
2408 
2409   ShiftAmount = countTrailingZeros(NonZeroBits);
2410   MaskWidth = countTrailingOnes(NonZeroBits >> ShiftAmount);
2411 
2412   // BFI encompasses sufficiently many nodes that it's worth inserting an extra
2413   // LSL/LSR if the mask in NonZeroBits doesn't quite match up with the ISD::SHL
2414   // amount.  BiggerPattern is true when this pattern is being matched for BFI,
2415   // BiggerPattern is false when this pattern is being matched for UBFIZ, in
2416   // which case it is not profitable to insert an extra shift.
2417   if (ShlImm - ShiftAmount != 0 && !BiggerPattern)
2418     return false;
2419   Src = getLeftShift(CurDAG, Op, ShlImm - ShiftAmount);
2420 
2421   return true;
2422 }
2423 
2424 static bool isShiftedMask(uint64_t Mask, EVT VT) {
2425   assert(VT == MVT::i32 || VT == MVT::i64);
2426   if (VT == MVT::i32)
2427     return isShiftedMask_32(Mask);
2428   return isShiftedMask_64(Mask);
2429 }
2430 
2431 // Generate a BFI/BFXIL from 'or (and X, MaskImm), OrImm' iff the value being
2432 // inserted only sets known zero bits.
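     // Editor's note (illustrative sketch, i32): for
     //   (or (and x, 0xffffff00), 0x5a)
     // the AND proves the low eight bits are zero, so the node is selected as
     //   mov   w8, #0x5a
     //   bfxil wX, w8, #0, #8     ; BFMWri with ImmR == 0, ImmS == 7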
2433 static bool tryBitfieldInsertOpFromOrAndImm(SDNode *N, SelectionDAG *CurDAG) {
2434   assert(N->getOpcode() == ISD::OR && "Expect a OR operation");
2435 
2436   EVT VT = N->getValueType(0);
2437   if (VT != MVT::i32 && VT != MVT::i64)
2438     return false;
2439 
2440   unsigned BitWidth = VT.getSizeInBits();
2441 
2442   uint64_t OrImm;
2443   if (!isOpcWithIntImmediate(N, ISD::OR, OrImm))
2444     return false;
2445 
2446   // Skip this transformation if the OR immediate can be encoded directly in an
2447   // ORR: applying it would merely trade an AND+ORR for an ORR+BFI/BFXIL, which
2448   // is most likely performance neutral.
2449   if (AArch64_AM::isLogicalImmediate(OrImm, BitWidth))
2450     return false;
2451 
2452   uint64_t MaskImm;
2453   SDValue And = N->getOperand(0);
2454   // Must be a single use AND with an immediate operand.
2455   if (!And.hasOneUse() ||
2456       !isOpcWithIntImmediate(And.getNode(), ISD::AND, MaskImm))
2457     return false;
2458 
2459   // Compute the Known Zero for the AND as this allows us to catch more general
2460   // cases than just looking for AND with imm.
2461   KnownBits Known = CurDAG->computeKnownBits(And);
2462 
2463   // Non-zero in the sense that they're not provably zero, which is the key
2464   // point if we want to use this value.
2465   uint64_t NotKnownZero = (~Known.Zero).getZExtValue();
2466 
2467   // The KnownZero mask must be a shifted mask (e.g., 1110..011, 11100..00).
2468   if (!isShiftedMask(Known.Zero.getZExtValue(), VT))
2469     return false;
2470 
2471   // The bits being inserted must only set those bits that are known to be zero.
2472   if ((OrImm & NotKnownZero) != 0) {
2473     // FIXME:  It's okay if the OrImm sets NotKnownZero bits to 1, but we don't
2474     // currently handle this case.
2475     return false;
2476   }
2477 
2478   // BFI/BFXIL dst, src, #lsb, #width.
2479   int LSB = countTrailingOnes(NotKnownZero);
2480   int Width = BitWidth - APInt(BitWidth, NotKnownZero).countPopulation();
2481 
2482   // BFI/BFXIL is an alias of BFM, so translate to BFM operands.
2483   unsigned ImmR = (BitWidth - LSB) % BitWidth;
2484   unsigned ImmS = Width - 1;
2485 
2486   // If we're creating a BFI instruction avoid cases where we need more
2487   // instructions to materialize the BFI constant as compared to the original
2488   // ORR.  A BFXIL will use the same constant as the original ORR, so the code
2489   // should be no worse in this case.
2490   bool IsBFI = LSB != 0;
2491   uint64_t BFIImm = OrImm >> LSB;
2492   if (IsBFI && !AArch64_AM::isLogicalImmediate(BFIImm, BitWidth)) {
2493     // We have a BFI instruction and we know the constant can't be materialized
2494     // with a ORR-immediate with the zero register.
2495     unsigned OrChunks = 0, BFIChunks = 0;
2496     for (unsigned Shift = 0; Shift < BitWidth; Shift += 16) {
2497       if (((OrImm >> Shift) & 0xFFFF) != 0)
2498         ++OrChunks;
2499       if (((BFIImm >> Shift) & 0xFFFF) != 0)
2500         ++BFIChunks;
2501     }
2502     if (BFIChunks > OrChunks)
2503       return false;
2504   }
2505 
2506   // Materialize the constant to be inserted.
2507   SDLoc DL(N);
2508   unsigned MOVIOpc = VT == MVT::i32 ? AArch64::MOVi32imm : AArch64::MOVi64imm;
2509   SDNode *MOVI = CurDAG->getMachineNode(
2510       MOVIOpc, DL, VT, CurDAG->getTargetConstant(BFIImm, DL, VT));
2511 
2512   // Create the BFI/BFXIL instruction.
2513   SDValue Ops[] = {And.getOperand(0), SDValue(MOVI, 0),
2514                    CurDAG->getTargetConstant(ImmR, DL, VT),
2515                    CurDAG->getTargetConstant(ImmS, DL, VT)};
2516   unsigned Opc = (VT == MVT::i32) ? AArch64::BFMWri : AArch64::BFMXri;
2517   CurDAG->SelectNodeTo(N, Opc, VT, Ops);
2518   return true;
2519 }
2520 
2521 static bool tryBitfieldInsertOpFromOr(SDNode *N, const APInt &UsefulBits,
2522                                       SelectionDAG *CurDAG) {
2523   assert(N->getOpcode() == ISD::OR && "Expect a OR operation");
2524 
2525   EVT VT = N->getValueType(0);
2526   if (VT != MVT::i32 && VT != MVT::i64)
2527     return false;
2528 
2529   unsigned BitWidth = VT.getSizeInBits();
2530 
2531   // Because of simplify-demanded-bits in DAGCombine, involved masks may not
2532   // have the expected shape. Try to undo that.
2533 
2534   unsigned NumberOfIgnoredLowBits = UsefulBits.countTrailingZeros();
2535   unsigned NumberOfIgnoredHighBits = UsefulBits.countLeadingZeros();
2536 
2537   // Given an OR operation, check if we have the following pattern:
2538   // ubfm c, b, imm, imm2 (or something that does the same job, see
2539   //                       isBitfieldExtractOp)
2540   // d = e & mask2 ; where mask2 is a binary sequence of 1..10..0 and
2541   //                 countTrailingZeros(mask2) == imm2 - imm + 1
2542   // f = d | c
2543   // if yes, replace the OR instruction with:
2544   // f = BFM Opd0, Opd1, LSB, MSB ; where LSB = imm, and MSB = imm2
2545 
2546   // OR is commutative, check all combinations of operand order and values of
2547   // BiggerPattern, i.e.
2548   //     Opd0, Opd1, BiggerPattern=false
2549   //     Opd1, Opd0, BiggerPattern=false
2550   //     Opd0, Opd1, BiggerPattern=true
2551   //     Opd1, Opd0, BiggerPattern=true
2552   // Several of these combinations may match, so check with BiggerPattern=false
2553   // first since that will produce better results by matching more instructions
2554   // and/or inserting fewer extra instructions.
2555   for (int I = 0; I < 4; ++I) {
2556 
2557     SDValue Dst, Src;
2558     unsigned ImmR, ImmS;
2559     bool BiggerPattern = I / 2;
2560     SDValue OrOpd0Val = N->getOperand(I % 2);
2561     SDNode *OrOpd0 = OrOpd0Val.getNode();
2562     SDValue OrOpd1Val = N->getOperand((I + 1) % 2);
2563     SDNode *OrOpd1 = OrOpd1Val.getNode();
2564 
2565     unsigned BFXOpc;
2566     int DstLSB, Width;
2567     if (isBitfieldExtractOp(CurDAG, OrOpd0, BFXOpc, Src, ImmR, ImmS,
2568                             NumberOfIgnoredLowBits, BiggerPattern)) {
2569       // Check that the returned opcode is compatible with the pattern,
2570       // i.e., same type and zero extended (U and not S)
2571       if ((BFXOpc != AArch64::UBFMXri && VT == MVT::i64) ||
2572           (BFXOpc != AArch64::UBFMWri && VT == MVT::i32))
2573         continue;
2574 
2575       // Compute the width of the bitfield insertion
2576       DstLSB = 0;
2577       Width = ImmS - ImmR + 1;
2578       // FIXME: This constraint is to catch bitfield insertion; we may
2579       // want to widen the pattern if we want to grab the general bitfield
2580       // move case.
2581       if (Width <= 0)
2582         continue;
2583 
2584       // If the mask on the insertee is correct, we have a BFXIL operation. We
2585       // can share the ImmR and ImmS values from the already-computed UBFM.
2586     } else if (isBitfieldPositioningOp(CurDAG, OrOpd0Val,
2587                                        BiggerPattern,
2588                                        Src, DstLSB, Width)) {
2589       ImmR = (BitWidth - DstLSB) % BitWidth;
2590       ImmS = Width - 1;
2591     } else
2592       continue;
2593 
2594     // Check the second part of the pattern
2595     EVT VT = OrOpd1Val.getValueType();
2596     assert((VT == MVT::i32 || VT == MVT::i64) && "unexpected OR operand");
2597 
2598     // Compute the Known Zero for the candidate of the first operand.
2599     // This allows us to catch more general cases than just looking for an
2600     // AND with imm. Indeed, simplify-demanded-bits may have removed
2601     // the AND instruction because it proved it to be useless.
2602     KnownBits Known = CurDAG->computeKnownBits(OrOpd1Val);
2603 
2604     // Check if there is enough room for the second operand to appear
2605     // in the first one
2606     APInt BitsToBeInserted =
2607         APInt::getBitsSet(Known.getBitWidth(), DstLSB, DstLSB + Width);
2608 
2609     if ((BitsToBeInserted & ~Known.Zero) != 0)
2610       continue;
2611 
2612     // Set the first operand
2613     uint64_t Imm;
2614     if (isOpcWithIntImmediate(OrOpd1, ISD::AND, Imm) &&
2615         isBitfieldDstMask(Imm, BitsToBeInserted, NumberOfIgnoredHighBits, VT))
2616       // In that case, we can eliminate the AND
2617       Dst = OrOpd1->getOperand(0);
2618     else
2619       // Maybe the AND has been removed by simplify-demanded-bits
2620       // or is useful because it discards more bits
2621       Dst = OrOpd1Val;
2622 
2623     // both parts match
2624     SDLoc DL(N);
2625     SDValue Ops[] = {Dst, Src, CurDAG->getTargetConstant(ImmR, DL, VT),
2626                      CurDAG->getTargetConstant(ImmS, DL, VT)};
2627     unsigned Opc = (VT == MVT::i32) ? AArch64::BFMWri : AArch64::BFMXri;
2628     CurDAG->SelectNodeTo(N, Opc, VT, Ops);
2629     return true;
2630   }
2631 
2632   // Generate a BFXIL from 'or (and X, Mask0Imm), (and Y, Mask1Imm)' iff
2633   // Mask0Imm and ~Mask1Imm are equivalent and one of the MaskImms is a shifted
2634   // mask (e.g., 0x000ffff0).
2635   uint64_t Mask0Imm, Mask1Imm;
2636   SDValue And0 = N->getOperand(0);
2637   SDValue And1 = N->getOperand(1);
2638   if (And0.hasOneUse() && And1.hasOneUse() &&
2639       isOpcWithIntImmediate(And0.getNode(), ISD::AND, Mask0Imm) &&
2640       isOpcWithIntImmediate(And1.getNode(), ISD::AND, Mask1Imm) &&
2641       APInt(BitWidth, Mask0Imm) == ~APInt(BitWidth, Mask1Imm) &&
2642       (isShiftedMask(Mask0Imm, VT) || isShiftedMask(Mask1Imm, VT))) {
2643 
2644     // ORR is commutative, so canonicalize to the form 'or (and X, Mask0Imm),
2645     // (and Y, Mask1Imm)' where Mask1Imm is the shifted mask masking off the
2646     // bits to be inserted.
2647     if (isShiftedMask(Mask0Imm, VT)) {
2648       std::swap(And0, And1);
2649       std::swap(Mask0Imm, Mask1Imm);
2650     }
2651 
2652     SDValue Src = And1->getOperand(0);
2653     SDValue Dst = And0->getOperand(0);
2654     unsigned LSB = countTrailingZeros(Mask1Imm);
2655     int Width = BitWidth - APInt(BitWidth, Mask0Imm).countPopulation();
2656 
2657     // The BFXIL inserts the low-order bits from a source register, so right
2658     // shift the needed bits into place.
2659     SDLoc DL(N);
2660     unsigned ShiftOpc = (VT == MVT::i32) ? AArch64::UBFMWri : AArch64::UBFMXri;
2661     SDNode *LSR = CurDAG->getMachineNode(
2662         ShiftOpc, DL, VT, Src, CurDAG->getTargetConstant(LSB, DL, VT),
2663         CurDAG->getTargetConstant(BitWidth - 1, DL, VT));
2664 
2665     // BFXIL is an alias of BFM, so translate to BFM operands.
2666     unsigned ImmR = (BitWidth - LSB) % BitWidth;
2667     unsigned ImmS = Width - 1;
2668 
2669     // Create the BFXIL instruction.
2670     SDValue Ops[] = {Dst, SDValue(LSR, 0),
2671                      CurDAG->getTargetConstant(ImmR, DL, VT),
2672                      CurDAG->getTargetConstant(ImmS, DL, VT)};
2673     unsigned Opc = (VT == MVT::i32) ? AArch64::BFMWri : AArch64::BFMXri;
2674     CurDAG->SelectNodeTo(N, Opc, VT, Ops);
2675     return true;
2676   }
2677 
2678   return false;
2679 }
2680 
2681 bool AArch64DAGToDAGISel::tryBitfieldInsertOp(SDNode *N) {
2682   if (N->getOpcode() != ISD::OR)
2683     return false;
2684 
2685   APInt NUsefulBits;
2686   getUsefulBits(SDValue(N, 0), NUsefulBits);
2687 
2688   // If all bits are not useful, just return UNDEF.
2689   if (!NUsefulBits) {
2690     CurDAG->SelectNodeTo(N, TargetOpcode::IMPLICIT_DEF, N->getValueType(0));
2691     return true;
2692   }
2693 
2694   if (tryBitfieldInsertOpFromOr(N, NUsefulBits, CurDAG))
2695     return true;
2696 
2697   return tryBitfieldInsertOpFromOrAndImm(N, CurDAG);
2698 }
2699 
2700 /// SelectBitfieldInsertInZeroOp - Match a UBFIZ instruction that is the
2701 /// equivalent of a left shift by a constant amount followed by an and masking
2702 /// out a contiguous set of bits.
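     /// (Editor's illustration: an i32 node (and (shl x, 3), 0x1f8) is matched
     /// as DstLSB == 3 and Width == 6, selecting UBFMWri x, #29, #5, i.e. the
     /// alias "ubfiz w, w, #3, #6".)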
2703 bool AArch64DAGToDAGISel::tryBitfieldInsertInZeroOp(SDNode *N) {
2704   if (N->getOpcode() != ISD::AND)
2705     return false;
2706 
2707   EVT VT = N->getValueType(0);
2708   if (VT != MVT::i32 && VT != MVT::i64)
2709     return false;
2710 
2711   SDValue Op0;
2712   int DstLSB, Width;
2713   if (!isBitfieldPositioningOp(CurDAG, SDValue(N, 0), /*BiggerPattern=*/false,
2714                                Op0, DstLSB, Width))
2715     return false;
2716 
2717   // ImmR is the rotate right amount.
2718   unsigned ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits();
2719   // ImmS is the most significant bit of the source to be moved.
2720   unsigned ImmS = Width - 1;
2721 
2722   SDLoc DL(N);
2723   SDValue Ops[] = {Op0, CurDAG->getTargetConstant(ImmR, DL, VT),
2724                    CurDAG->getTargetConstant(ImmS, DL, VT)};
2725   unsigned Opc = (VT == MVT::i32) ? AArch64::UBFMWri : AArch64::UBFMXri;
2726   CurDAG->SelectNodeTo(N, Opc, VT, Ops);
2727   return true;
2728 }
2729 
2730 /// tryShiftAmountMod - Take advantage of built-in mod of shift amount in
2731 /// variable shift/rotate instructions.
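/// Editor's note (illustrative examples, assumed rather than from the original
/// comment): for an i64 shift, (shl X, (add Y, 64)) can be selected as just
/// LSLV X, Y because the instruction only reads the low 6 bits of the shift
/// amount, and (shl X, (and Y, 63)) can likewise drop the AND.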
2732 bool AArch64DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
2733   EVT VT = N->getValueType(0);
2734 
2735   unsigned Opc;
2736   switch (N->getOpcode()) {
2737   case ISD::ROTR:
2738     Opc = (VT == MVT::i32) ? AArch64::RORVWr : AArch64::RORVXr;
2739     break;
2740   case ISD::SHL:
2741     Opc = (VT == MVT::i32) ? AArch64::LSLVWr : AArch64::LSLVXr;
2742     break;
2743   case ISD::SRL:
2744     Opc = (VT == MVT::i32) ? AArch64::LSRVWr : AArch64::LSRVXr;
2745     break;
2746   case ISD::SRA:
2747     Opc = (VT == MVT::i32) ? AArch64::ASRVWr : AArch64::ASRVXr;
2748     break;
2749   default:
2750     return false;
2751   }
2752 
2753   uint64_t Size;
2754   uint64_t Bits;
2755   if (VT == MVT::i32) {
2756     Bits = 5;
2757     Size = 32;
2758   } else if (VT == MVT::i64) {
2759     Bits = 6;
2760     Size = 64;
2761   } else
2762     return false;
2763 
2764   SDValue ShiftAmt = N->getOperand(1);
2765   SDLoc DL(N);
2766   SDValue NewShiftAmt;
2767 
2768   // Skip over an extend of the shift amount.
2769   if (ShiftAmt->getOpcode() == ISD::ZERO_EXTEND ||
2770       ShiftAmt->getOpcode() == ISD::ANY_EXTEND)
2771     ShiftAmt = ShiftAmt->getOperand(0);
2772 
2773   if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB) {
2774     SDValue Add0 = ShiftAmt->getOperand(0);
2775     SDValue Add1 = ShiftAmt->getOperand(1);
2776     uint64_t Add0Imm;
2777     uint64_t Add1Imm;
2778     // If we are shifting by X+/-N where N == 0 mod Size, then just shift by X
2779     // to avoid the ADD/SUB.
2780     if (isIntImmediate(Add1, Add1Imm) && (Add1Imm % Size == 0))
2781       NewShiftAmt = Add0;
2782     // If we are shifting by N-X where N == 0 mod Size, then just shift by -X to
2783     // generate a NEG instead of a SUB of a constant.
2784     else if (ShiftAmt->getOpcode() == ISD::SUB &&
2785              isIntImmediate(Add0, Add0Imm) && Add0Imm != 0 &&
2786              (Add0Imm % Size == 0)) {
2787       unsigned NegOpc;
2788       unsigned ZeroReg;
2789       EVT SubVT = ShiftAmt->getValueType(0);
2790       if (SubVT == MVT::i32) {
2791         NegOpc = AArch64::SUBWrr;
2792         ZeroReg = AArch64::WZR;
2793       } else {
2794         assert(SubVT == MVT::i64);
2795         NegOpc = AArch64::SUBXrr;
2796         ZeroReg = AArch64::XZR;
2797       }
2798       SDValue Zero =
2799           CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, ZeroReg, SubVT);
2800       MachineSDNode *Neg =
2801           CurDAG->getMachineNode(NegOpc, DL, SubVT, Zero, Add1);
2802       NewShiftAmt = SDValue(Neg, 0);
2803     } else
2804       return false;
2805   } else {
2806     // If the shift amount is masked with an AND, check that the mask covers the
2807     // bits that are implicitly ANDed off by the above opcodes and if so, skip
2808     // the AND.
2809     uint64_t MaskImm;
2810     if (!isOpcWithIntImmediate(ShiftAmt.getNode(), ISD::AND, MaskImm) &&
2811         !isOpcWithIntImmediate(ShiftAmt.getNode(), AArch64ISD::ANDS, MaskImm))
2812       return false;
2813 
2814     if (countTrailingOnes(MaskImm) < Bits)
2815       return false;
2816 
2817     NewShiftAmt = ShiftAmt->getOperand(0);
2818   }
2819 
2820   // Narrow/widen the shift amount to match the size of the shift operation.
2821   if (VT == MVT::i32)
2822     NewShiftAmt = narrowIfNeeded(CurDAG, NewShiftAmt);
2823   else if (VT == MVT::i64 && NewShiftAmt->getValueType(0) == MVT::i32) {
2824     SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, DL, MVT::i32);
2825     MachineSDNode *Ext = CurDAG->getMachineNode(
2826         AArch64::SUBREG_TO_REG, DL, VT,
2827         CurDAG->getTargetConstant(0, DL, MVT::i64), NewShiftAmt, SubReg);
2828     NewShiftAmt = SDValue(Ext, 0);
2829   }
2830 
2831   SDValue Ops[] = {N->getOperand(0), NewShiftAmt};
2832   CurDAG->SelectNodeTo(N, Opc, VT, Ops);
2833   return true;
2834 }
2835 
2836 bool
2837 AArch64DAGToDAGISel::SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos,
2838                                               unsigned RegWidth) {
2839   APFloat FVal(0.0);
2840   if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N))
2841     FVal = CN->getValueAPF();
2842   else if (LoadSDNode *LN = dyn_cast<LoadSDNode>(N)) {
2843     // Some otherwise illegal constants are allowed in this case.
2844     if (LN->getOperand(1).getOpcode() != AArch64ISD::ADDlow ||
2845         !isa<ConstantPoolSDNode>(LN->getOperand(1)->getOperand(1)))
2846       return false;
2847 
2848     ConstantPoolSDNode *CN =
2849         dyn_cast<ConstantPoolSDNode>(LN->getOperand(1)->getOperand(1));
2850     FVal = cast<ConstantFP>(CN->getConstVal())->getValueAPF();
2851   } else
2852     return false;
2853 
2854   // An FCVT[SU] instruction performs: convertToInt(Val * 2^fbits) where fbits
2855   // is between 1 and 32 for a destination w-register, or 1 and 64 for an
2856   // x-register.
2857   //
2858   // By this stage, we've detected (fp_to_[su]int (fmul Val, THIS_NODE)) so we
2859   // want THIS_NODE to be 2^fbits. This is much easier to deal with using
2860   // integers.
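  // Editor's note (illustrative example): for (fp_to_sint (fmul Val, 16.0))
  // with a w-register destination, IntVal below becomes 16, so FBits == 4 is
  // returned as the fixed-point operand.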
2861   bool IsExact;
2862 
2863   // fbits is between 1 and 64 in the worst case, which means the fmul
2864   // could have 2^64 as an actual operand. Need 65 bits of precision.
2865   APSInt IntVal(65, true);
2866   FVal.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact);
2867 
2868   // N.b. isPowerOf2 also checks for > 0.
2869   if (!IsExact || !IntVal.isPowerOf2()) return false;
2870   unsigned FBits = IntVal.logBase2();
2871 
2872   // Checks above should have guaranteed that we haven't lost information in
2873   // finding FBits, but it must still be in range.
2874   if (FBits == 0 || FBits > RegWidth) return false;
2875 
2876   FixedPos = CurDAG->getTargetConstant(FBits, SDLoc(N), MVT::i32);
2877   return true;
2878 }
2879 
2880 // Inspects a register string of the form o0:op1:CRn:CRm:op2, gets the fields
2881 // of the string, obtains the integer values from them, and combines these
2882 // into a single value to be used in the MRS/MSR instruction.
2883 static int getIntOperandFromRegisterString(StringRef RegString) {
2884   SmallVector<StringRef, 5> Fields;
2885   RegString.split(Fields, ':');
2886 
2887   if (Fields.size() == 1)
2888     return -1;
2889 
2890   assert(Fields.size() == 5
2891             && "Invalid number of fields in read register string");
2892 
2893   SmallVector<int, 5> Ops;
2894   bool AllIntFields = true;
2895 
2896   for (StringRef Field : Fields) {
2897     unsigned IntField;
2898     AllIntFields &= !Field.getAsInteger(10, IntField);
2899     Ops.push_back(IntField);
2900   }
2901 
2902   assert(AllIntFields &&
2903           "Unexpected non-integer value in special register string.");
2904 
2905   // Need to combine the integer fields of the string into a single value
2906   // based on the bit encoding of the MRS/MSR instructions.
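  // Editor's note (worked example, given only for illustration): the string
  // "3:3:13:0:2" (TPIDR_EL0 in this notation) combines to
  // (3 << 14) | (3 << 11) | (13 << 7) | (0 << 3) | 2 == 0xde82.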
2907   return (Ops[0] << 14) | (Ops[1] << 11) | (Ops[2] << 7) |
2908          (Ops[3] << 3) | (Ops[4]);
2909 }
2910 
2911 // Lower the read_register intrinsic to an MRS instruction node if the special
2912 // register string argument is either of the form detailed in the ACLE (the
2913 // form described in getIntOperandFromRegisterString) or is a named register
2914 // known by the MRS SysReg mapper.
2915 bool AArch64DAGToDAGISel::tryReadRegister(SDNode *N) {
2916   const MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(N->getOperand(1));
2917   const MDString *RegString = dyn_cast<MDString>(MD->getMD()->getOperand(0));
2918   SDLoc DL(N);
2919 
2920   int Reg = getIntOperandFromRegisterString(RegString->getString());
2921   if (Reg != -1) {
2922     ReplaceNode(N, CurDAG->getMachineNode(
2923                        AArch64::MRS, DL, N->getSimpleValueType(0), MVT::Other,
2924                        CurDAG->getTargetConstant(Reg, DL, MVT::i32),
2925                        N->getOperand(0)));
2926     return true;
2927   }
2928 
2929   // Use the sysreg mapper to map the remaining possible strings to the
2930   // value for the register to be used for the instruction operand.
2931   auto TheReg = AArch64SysReg::lookupSysRegByName(RegString->getString());
2932   if (TheReg && TheReg->Readable &&
2933       TheReg->haveFeatures(Subtarget->getFeatureBits()))
2934     Reg = TheReg->Encoding;
2935   else
2936     Reg = AArch64SysReg::parseGenericRegister(RegString->getString());
2937 
2938   if (Reg != -1) {
2939     ReplaceNode(N, CurDAG->getMachineNode(
2940                        AArch64::MRS, DL, N->getSimpleValueType(0), MVT::Other,
2941                        CurDAG->getTargetConstant(Reg, DL, MVT::i32),
2942                        N->getOperand(0)));
2943     return true;
2944   }
2945 
2946   if (RegString->getString() == "pc") {
2947     ReplaceNode(N, CurDAG->getMachineNode(
2948                        AArch64::ADR, DL, N->getSimpleValueType(0), MVT::Other,
2949                        CurDAG->getTargetConstant(0, DL, MVT::i32),
2950                        N->getOperand(0)));
2951     return true;
2952   }
2953 
2954   return false;
2955 }
2956 
2957 // Lower the write_register intrinsic to an MSR instruction node if the special
2958 // register string argument is either of the form detailed in the ACLE (the
2959 // form described in getIntOperandFromRegisterString) or is a named register
2960 // known by the MSR SysReg mapper.
2961 bool AArch64DAGToDAGISel::tryWriteRegister(SDNode *N) {
2962   const MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(N->getOperand(1));
2963   const MDString *RegString = dyn_cast<MDString>(MD->getMD()->getOperand(0));
2964   SDLoc DL(N);
2965 
2966   int Reg = getIntOperandFromRegisterString(RegString->getString());
2967   if (Reg != -1) {
2968     ReplaceNode(
2969         N, CurDAG->getMachineNode(AArch64::MSR, DL, MVT::Other,
2970                                   CurDAG->getTargetConstant(Reg, DL, MVT::i32),
2971                                   N->getOperand(2), N->getOperand(0)));
2972     return true;
2973   }
2974 
2975   // Check if the register was one of those allowed as the pstatefield value in
2976   // the MSR (immediate) instruction. To accept the values allowed in the
2977   // pstatefield for the MSR (immediate) instruction, we also require that an
2978   // immediate value has been provided as an argument; we know this is the
2979   // case because it has been ensured by semantic checking.
2980   auto PMapper = AArch64PState::lookupPStateByName(RegString->getString());
2981   if (PMapper) {
2982     assert (isa<ConstantSDNode>(N->getOperand(2))
2983               && "Expected a constant integer expression.");
2984     unsigned Reg = PMapper->Encoding;
2985     uint64_t Immed = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
2986     unsigned State;
2987     if (Reg == AArch64PState::PAN || Reg == AArch64PState::UAO || Reg == AArch64PState::SSBS) {
2988       assert(Immed < 2 && "Bad imm");
2989       State = AArch64::MSRpstateImm1;
2990     } else {
2991       assert(Immed < 16 && "Bad imm");
2992       State = AArch64::MSRpstateImm4;
2993     }
2994     ReplaceNode(N, CurDAG->getMachineNode(
2995                        State, DL, MVT::Other,
2996                        CurDAG->getTargetConstant(Reg, DL, MVT::i32),
2997                        CurDAG->getTargetConstant(Immed, DL, MVT::i16),
2998                        N->getOperand(0)));
2999     return true;
3000   }
3001 
3002   // Use the sysreg mapper to attempt to map the remaining possible strings
3003   // to the value for the register to be used for the MSR (register)
3004   // instruction operand.
3005   auto TheReg = AArch64SysReg::lookupSysRegByName(RegString->getString());
3006   if (TheReg && TheReg->Writeable &&
3007       TheReg->haveFeatures(Subtarget->getFeatureBits()))
3008     Reg = TheReg->Encoding;
3009   else
3010     Reg = AArch64SysReg::parseGenericRegister(RegString->getString());
3011   if (Reg != -1) {
3012     ReplaceNode(N, CurDAG->getMachineNode(
3013                        AArch64::MSR, DL, MVT::Other,
3014                        CurDAG->getTargetConstant(Reg, DL, MVT::i32),
3015                        N->getOperand(2), N->getOperand(0)));
3016     return true;
3017   }
3018 
3019   return false;
3020 }
3021 
3022 /// We've got special pseudo-instructions for these
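/// (Editor's note: for example, an i32 cmpxchg without LSE is expected to
/// select CMP_SWAP_32 here, which is later expanded into an exclusive
/// load/store (LDAXR/STLXR) loop by the pseudo-instruction expansion pass.)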
3023 bool AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
3024   unsigned Opcode;
3025   EVT MemTy = cast<MemSDNode>(N)->getMemoryVT();
3026 
3027   // Leave IR for LSE if subtarget supports it.
3028   if (Subtarget->hasLSE()) return false;
3029 
3030   if (MemTy == MVT::i8)
3031     Opcode = AArch64::CMP_SWAP_8;
3032   else if (MemTy == MVT::i16)
3033     Opcode = AArch64::CMP_SWAP_16;
3034   else if (MemTy == MVT::i32)
3035     Opcode = AArch64::CMP_SWAP_32;
3036   else if (MemTy == MVT::i64)
3037     Opcode = AArch64::CMP_SWAP_64;
3038   else
3039     llvm_unreachable("Unknown AtomicCmpSwap type");
3040 
3041   MVT RegTy = MemTy == MVT::i64 ? MVT::i64 : MVT::i32;
3042   SDValue Ops[] = {N->getOperand(1), N->getOperand(2), N->getOperand(3),
3043                    N->getOperand(0)};
3044   SDNode *CmpSwap = CurDAG->getMachineNode(
3045       Opcode, SDLoc(N),
3046       CurDAG->getVTList(RegTy, MVT::i32, MVT::Other), Ops);
3047 
3048   MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
3049   CurDAG->setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
3050 
3051   ReplaceUses(SDValue(N, 0), SDValue(CmpSwap, 0));
3052   ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 2));
3053   CurDAG->RemoveDeadNode(N);
3054 
3055   return true;
3056 }
3057 
3058 bool AArch64DAGToDAGISel::SelectSVE8BitLslImm(SDValue N, SDValue &Base,
3059                                               SDValue &Offset) {
3060   auto C = dyn_cast<ConstantSDNode>(N);
3061   if (!C)
3062     return false;
3063 
3064   auto Ty = N->getValueType(0);
3065 
3066   int64_t Imm = C->getSExtValue();
3067   SDLoc DL(N);
3068 
3069   if ((Imm >= -128) && (Imm <= 127)) {
3070     Base = CurDAG->getTargetConstant(Imm, DL, Ty);
3071     Offset = CurDAG->getTargetConstant(0, DL, Ty);
3072     return true;
3073   }
3074 
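  // Editor's note (descriptive, based on the code below): multiples of 256 in
  // [-32768, 32512] are accepted as Imm / 256 with a shift operand of 8, i.e.
  // the byte value is assumed to be applied with an LSL #8.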
3075   if (((Imm % 256) == 0) && (Imm >= -32768) && (Imm <= 32512)) {
3076     Base = CurDAG->getTargetConstant(Imm/256, DL, Ty);
3077     Offset = CurDAG->getTargetConstant(8, DL, Ty);
3078     return true;
3079   }
3080 
3081   return false;
3082 }
3083 
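// Editor's note (descriptive summary of the function below): SVE ADD/SUB
// immediates are an unsigned 8-bit value, optionally shifted left by 8 for
// element types wider than i8; e.g. 0x3f00 (illustrative value) is matched as
// Imm == 0x3f with Shift == 8.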
3084 bool AArch64DAGToDAGISel::SelectSVEAddSubImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift) {
3085   if (auto CNode = dyn_cast<ConstantSDNode>(N)) {
3086     const int64_t ImmVal = CNode->getZExtValue();
3087     SDLoc DL(N);
3088 
3089     switch (VT.SimpleTy) {
3090     case MVT::i8:
3091       if ((ImmVal & 0xFF) == ImmVal) {
3092         Shift = CurDAG->getTargetConstant(0, DL, MVT::i32);
3093         Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32);
3094         return true;
3095       }
3096       break;
3097     case MVT::i16:
3098     case MVT::i32:
3099     case MVT::i64:
3100       if ((ImmVal & 0xFF) == ImmVal) {
3101         Shift = CurDAG->getTargetConstant(0, DL, MVT::i32);
3102         Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32);
3103         return true;
3104       } else if ((ImmVal & 0xFF00) == ImmVal) {
3105         Shift = CurDAG->getTargetConstant(8, DL, MVT::i32);
3106         Imm = CurDAG->getTargetConstant(ImmVal >> 8, DL, MVT::i32);
3107         return true;
3108       }
3109       break;
3110     default:
3111       break;
3112     }
3113   }
3114 
3115   return false;
3116 }
3117 
3118 bool AArch64DAGToDAGISel::SelectSVESignedArithImm(SDValue N, SDValue &Imm) {
3119   if (auto CNode = dyn_cast<ConstantSDNode>(N)) {
3120     int64_t ImmVal = CNode->getSExtValue();
3121     SDLoc DL(N);
3122     if (ImmVal >= -128 && ImmVal < 128) {
3123       Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32);
3124       return true;
3125     }
3126   }
3127   return false;
3128 }
3129 
3130 bool AArch64DAGToDAGISel::SelectSVEArithImm(SDValue N, SDValue &Imm) {
3131   if (auto CNode = dyn_cast<ConstantSDNode>(N)) {
3132     uint64_t ImmVal = CNode->getSExtValue();
3133     SDLoc DL(N);
3134     ImmVal = ImmVal & 0xFF;
3135     if (ImmVal < 256) {
3136       Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32);
3137       return true;
3138     }
3139   }
3140   return false;
3141 }
3142 
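// Editor's note (descriptive summary of the function below): the element-sized
// immediate is replicated across 64 bits before being validated and encoded as
// a logical immediate; e.g. for an i16 element type, 0x00f0 (illustrative
// value) becomes 0x00f000f000f000f0.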
3143 bool AArch64DAGToDAGISel::SelectSVELogicalImm(SDValue N, MVT VT, SDValue &Imm) {
3144   if (auto CNode = dyn_cast<ConstantSDNode>(N)) {
3145     uint64_t ImmVal = CNode->getZExtValue();
3146     SDLoc DL(N);
3147 
3148     // Shift mask depending on type size.
3149     switch (VT.SimpleTy) {
3150       case MVT::i8:
3151         ImmVal &= 0xFF;
3152         ImmVal |= ImmVal << 8;
3153         ImmVal |= ImmVal << 16;
3154         ImmVal |= ImmVal << 32;
3155         break;
3156       case MVT::i16:
3157         ImmVal &= 0xFFFF;
3158         ImmVal |= ImmVal << 16;
3159         ImmVal |= ImmVal << 32;
3160         break;
3161       case MVT::i32:
3162         ImmVal &= 0xFFFFFFFF;
3163         ImmVal |= ImmVal << 32;
3164         break;
3165       case MVT::i64:
3166         break;
3167       default:
3168         llvm_unreachable("Unexpected type");
3169     }
3170 
3171     uint64_t encoding;
3172     if (AArch64_AM::processLogicalImmediate(ImmVal, 64, encoding)) {
3173       Imm = CurDAG->getTargetConstant(encoding, DL, MVT::i64);
3174       return true;
3175     }
3176   }
3177   return false;
3178 }
3179 
3180 // This method is only needed to "cast" i64s into i32s when the value
3181 // is a valid shift which has been splatted into a vector with i64 elements.
3182 // Every other type is fine in tablegen.
3183 bool AArch64DAGToDAGISel::SelectSVEShiftImm64(SDValue N, uint64_t Low,
3184                                               uint64_t High, SDValue &Imm) {
3185   if (auto *CN = dyn_cast<ConstantSDNode>(N)) {
3186     uint64_t ImmVal = CN->getZExtValue();
3187     SDLoc DL(N);
3188 
3189     if (ImmVal >= Low && ImmVal <= High) {
3190       Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32);
3191       return true;
3192     }
3193   }
3194 
3195   return false;
3196 }
3197 
3198 bool AArch64DAGToDAGISel::trySelectStackSlotTagP(SDNode *N) {
3199   // tagp(FrameIndex, IRGstack, tag_offset):
3200   // since the offset between FrameIndex and IRGstack is a compile-time
3201   // constant, this can be lowered to a single ADDG instruction.
3202   if (!(isa<FrameIndexSDNode>(N->getOperand(1)))) {
3203     return false;
3204   }
3205 
3206   SDValue IRG_SP = N->getOperand(2);
3207   if (IRG_SP->getOpcode() != ISD::INTRINSIC_W_CHAIN ||
3208       cast<ConstantSDNode>(IRG_SP->getOperand(1))->getZExtValue() !=
3209           Intrinsic::aarch64_irg_sp) {
3210     return false;
3211   }
3212 
3213   const TargetLowering *TLI = getTargetLowering();
3214   SDLoc DL(N);
3215   int FI = cast<FrameIndexSDNode>(N->getOperand(1))->getIndex();
3216   SDValue FiOp = CurDAG->getTargetFrameIndex(
3217       FI, TLI->getPointerTy(CurDAG->getDataLayout()));
3218   int TagOffset = cast<ConstantSDNode>(N->getOperand(3))->getZExtValue();
3219 
3220   SDNode *Out = CurDAG->getMachineNode(
3221       AArch64::TAGPstack, DL, MVT::i64,
3222       {FiOp, CurDAG->getTargetConstant(0, DL, MVT::i64), N->getOperand(2),
3223        CurDAG->getTargetConstant(TagOffset, DL, MVT::i64)});
3224   ReplaceNode(N, Out);
3225   return true;
3226 }
3227 
3228 void AArch64DAGToDAGISel::SelectTagP(SDNode *N) {
3229   assert(isa<ConstantSDNode>(N->getOperand(3)) &&
3230          "llvm.aarch64.tagp third argument must be an immediate");
3231   if (trySelectStackSlotTagP(N))
3232     return;
3233   // FIXME: the above applies whenever the offset between Op1 and Op2 is a
3234   // compile-time constant, not just for stack allocations.
3235 
3236   // General case for unrelated pointers in Op1 and Op2.
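  // Editor's note (descriptive, based on the sequence below): SUBP computes
  // the untagged byte difference Op1 - Op2, ADDXrr adds it back to Op2 so the
  // result carries Op1's address with Op2's tag, and ADDG then adjusts the
  // tag by TagOffset.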
3237   SDLoc DL(N);
3238   int TagOffset = cast<ConstantSDNode>(N->getOperand(3))->getZExtValue();
3239   SDNode *N1 = CurDAG->getMachineNode(AArch64::SUBP, DL, MVT::i64,
3240                                       {N->getOperand(1), N->getOperand(2)});
3241   SDNode *N2 = CurDAG->getMachineNode(AArch64::ADDXrr, DL, MVT::i64,
3242                                       {SDValue(N1, 0), N->getOperand(2)});
3243   SDNode *N3 = CurDAG->getMachineNode(
3244       AArch64::ADDG, DL, MVT::i64,
3245       {SDValue(N2, 0), CurDAG->getTargetConstant(0, DL, MVT::i64),
3246        CurDAG->getTargetConstant(TagOffset, DL, MVT::i64)});
3247   ReplaceNode(N, N3);
3248 }
3249 
3250 // NOTE: We cannot use EXTRACT_SUBREG in all cases because the fixed length
3251 // vector types larger than NEON don't have a matching SubRegIndex.
3252 static SDNode *extractSubReg(SelectionDAG *DAG, EVT VT, SDValue V) {
3253   assert(V.getValueType().isScalableVector() &&
3254          V.getValueType().getSizeInBits().getKnownMinSize() ==
3255              AArch64::SVEBitsPerBlock &&
3256          "Expected to extract from a packed scalable vector!");
3257   assert(VT.isFixedLengthVector() &&
3258          "Expected to extract a fixed length vector!");
3259 
3260   SDLoc DL(V);
3261   switch (VT.getSizeInBits()) {
3262   case 64: {
3263     auto SubReg = DAG->getTargetConstant(AArch64::dsub, DL, MVT::i32);
3264     return DAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, VT, V, SubReg);
3265   }
3266   case 128: {
3267     auto SubReg = DAG->getTargetConstant(AArch64::zsub, DL, MVT::i32);
3268     return DAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, VT, V, SubReg);
3269   }
3270   default: {
3271     auto RC = DAG->getTargetConstant(AArch64::ZPRRegClassID, DL, MVT::i64);
3272     return DAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, DL, VT, V, RC);
3273   }
3274   }
3275 }
3276 
3277 // NOTE: We cannot use INSERT_SUBREG in all cases because the fixed length
3278 // vector types larger than NEON don't have a matching SubRegIndex.
3279 static SDNode *insertSubReg(SelectionDAG *DAG, EVT VT, SDValue V) {
3280   assert(VT.isScalableVector() &&
3281          VT.getSizeInBits().getKnownMinSize() == AArch64::SVEBitsPerBlock &&
3282          "Expected to insert into a packed scalable vector!");
3283   assert(V.getValueType().isFixedLengthVector() &&
3284          "Expected to insert a fixed length vector!");
3285 
3286   SDLoc DL(V);
3287   switch (V.getValueType().getSizeInBits()) {
3288   case 64: {
3289     auto SubReg = DAG->getTargetConstant(AArch64::dsub, DL, MVT::i32);
3290     auto Container = DAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
3291     return DAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL, VT,
3292                                SDValue(Container, 0), V, SubReg);
3293   }
3294   case 128: {
3295     auto SubReg = DAG->getTargetConstant(AArch64::zsub, DL, MVT::i32);
3296     auto Container = DAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
3297     return DAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL, VT,
3298                                SDValue(Container, 0), V, SubReg);
3299   }
3300   default: {
3301     auto RC = DAG->getTargetConstant(AArch64::ZPRRegClassID, DL, MVT::i64);
3302     return DAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, DL, VT, V, RC);
3303   }
3304   }
3305 }
3306 
3307 void AArch64DAGToDAGISel::Select(SDNode *Node) {
3308   // If we have a custom node, we already have selected!
3309   if (Node->isMachineOpcode()) {
3310     LLVM_DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
3311     Node->setNodeId(-1);
3312     return;
3313   }
3314 
3315   // A few custom selection cases.
3316   EVT VT = Node->getValueType(0);
3317 
3318   switch (Node->getOpcode()) {
3319   default:
3320     break;
3321 
3322   case ISD::ATOMIC_CMP_SWAP:
3323     if (SelectCMP_SWAP(Node))
3324       return;
3325     break;
3326 
3327   case ISD::READ_REGISTER:
3328     if (tryReadRegister(Node))
3329       return;
3330     break;
3331 
3332   case ISD::WRITE_REGISTER:
3333     if (tryWriteRegister(Node))
3334       return;
3335     break;
3336 
3337   case ISD::ADD:
3338     if (tryMLAV64LaneV128(Node))
3339       return;
3340     break;
3341 
3342   case ISD::LOAD: {
3343     // Try to select as an indexed load. Fall through to normal processing
3344     // if we can't.
3345     if (tryIndexedLoad(Node))
3346       return;
3347     break;
3348   }
3349 
3350   case ISD::SRL:
3351   case ISD::AND:
3352   case ISD::SRA:
3353   case ISD::SIGN_EXTEND_INREG:
3354     if (tryBitfieldExtractOp(Node))
3355       return;
3356     if (tryBitfieldInsertInZeroOp(Node))
3357       return;
3358     LLVM_FALLTHROUGH;
3359   case ISD::ROTR:
3360   case ISD::SHL:
3361     if (tryShiftAmountMod(Node))
3362       return;
3363     break;
3364 
3365   case ISD::SIGN_EXTEND:
3366     if (tryBitfieldExtractOpFromSExt(Node))
3367       return;
3368     break;
3369 
3370   case ISD::FP_EXTEND:
3371     if (tryHighFPExt(Node))
3372       return;
3373     break;
3374 
3375   case ISD::OR:
3376     if (tryBitfieldInsertOp(Node))
3377       return;
3378     break;
3379 
3380   case ISD::EXTRACT_SUBVECTOR: {
3381     // Bail when not a "cast" like extract_subvector.
3382     if (cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue() != 0)
3383       break;
3384 
3385     // Bail when normal isel can do the job.
3386     EVT InVT = Node->getOperand(0).getValueType();
3387     if (VT.isScalableVector() || InVT.isFixedLengthVector())
3388       break;
3389 
3390     // NOTE: We can only get here when doing fixed length SVE code generation.
3391     // We do manual selection because the types involved are not linked to real
3392     // registers (despite being legal) and must be coerced into SVE registers.
3393     //
3394     // NOTE: If the above changes, be aware that selection will still not work
3395     // because the td definition of extract_vector does not support extracting
3396     // a fixed length vector from a scalable vector.
3397 
3398     ReplaceNode(Node, extractSubReg(CurDAG, VT, Node->getOperand(0)));
3399     return;
3400   }
3401 
3402   case ISD::INSERT_SUBVECTOR: {
3403     // Bail when not a "cast" like insert_subvector.
3404     if (cast<ConstantSDNode>(Node->getOperand(2))->getZExtValue() != 0)
3405       break;
3406     if (!Node->getOperand(0).isUndef())
3407       break;
3408 
3409     // Bail when normal isel should do the job.
3410     EVT InVT = Node->getOperand(1).getValueType();
3411     if (VT.isFixedLengthVector() || InVT.isScalableVector())
3412       break;
3413 
3414     // NOTE: We can only get here when doing fixed length SVE code generation.
3415     // We do manual selection because the types involved are not linked to real
3416     // registers (despite being legal) and must be coerced into SVE registers.
3417     //
3418     // NOTE: If the above changes, be aware that selection will still not work
3419     // because the td definition of insert_vector does not support inserting a
3420     // fixed length vector into a scalable vector.
3421 
3422     ReplaceNode(Node, insertSubReg(CurDAG, VT, Node->getOperand(1)));
3423     return;
3424   }
3425 
3426   case ISD::Constant: {
3427     // Materialize zero constants as copies from WZR/XZR.  This allows
3428     // the coalescer to propagate these into other instructions.
3429     ConstantSDNode *ConstNode = cast<ConstantSDNode>(Node);
3430     if (ConstNode->isNullValue()) {
3431       if (VT == MVT::i32) {
3432         SDValue New = CurDAG->getCopyFromReg(
3433             CurDAG->getEntryNode(), SDLoc(Node), AArch64::WZR, MVT::i32);
3434         ReplaceNode(Node, New.getNode());
3435         return;
3436       } else if (VT == MVT::i64) {
3437         SDValue New = CurDAG->getCopyFromReg(
3438             CurDAG->getEntryNode(), SDLoc(Node), AArch64::XZR, MVT::i64);
3439         ReplaceNode(Node, New.getNode());
3440         return;
3441       }
3442     }
3443     break;
3444   }
3445 
3446   case ISD::FrameIndex: {
3447     // Selects to ADDXri FI, 0 which in turn will become ADDXri SP, imm.
3448     int FI = cast<FrameIndexSDNode>(Node)->getIndex();
3449     unsigned Shifter = AArch64_AM::getShifterImm(AArch64_AM::LSL, 0);
3450     const TargetLowering *TLI = getTargetLowering();
3451     SDValue TFI = CurDAG->getTargetFrameIndex(
3452         FI, TLI->getPointerTy(CurDAG->getDataLayout()));
3453     SDLoc DL(Node);
3454     SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, DL, MVT::i32),
3455                       CurDAG->getTargetConstant(Shifter, DL, MVT::i32) };
3456     CurDAG->SelectNodeTo(Node, AArch64::ADDXri, MVT::i64, Ops);
3457     return;
3458   }
3459   case ISD::INTRINSIC_W_CHAIN: {
3460     unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
3461     switch (IntNo) {
3462     default:
3463       break;
3464     case Intrinsic::aarch64_ldaxp:
3465     case Intrinsic::aarch64_ldxp: {
3466       unsigned Op =
3467           IntNo == Intrinsic::aarch64_ldaxp ? AArch64::LDAXPX : AArch64::LDXPX;
3468       SDValue MemAddr = Node->getOperand(2);
3469       SDLoc DL(Node);
3470       SDValue Chain = Node->getOperand(0);
3471 
3472       SDNode *Ld = CurDAG->getMachineNode(Op, DL, MVT::i64, MVT::i64,
3473                                           MVT::Other, MemAddr, Chain);
3474 
3475       // Transfer memoperands.
3476       MachineMemOperand *MemOp =
3477           cast<MemIntrinsicSDNode>(Node)->getMemOperand();
3478       CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ld), {MemOp});
3479       ReplaceNode(Node, Ld);
3480       return;
3481     }
3482     case Intrinsic::aarch64_stlxp:
3483     case Intrinsic::aarch64_stxp: {
3484       unsigned Op =
3485           IntNo == Intrinsic::aarch64_stlxp ? AArch64::STLXPX : AArch64::STXPX;
3486       SDLoc DL(Node);
3487       SDValue Chain = Node->getOperand(0);
3488       SDValue ValLo = Node->getOperand(2);
3489       SDValue ValHi = Node->getOperand(3);
3490       SDValue MemAddr = Node->getOperand(4);
3491 
3492       // Place arguments in the right order.
3493       SDValue Ops[] = {ValLo, ValHi, MemAddr, Chain};
3494 
3495       SDNode *St = CurDAG->getMachineNode(Op, DL, MVT::i32, MVT::Other, Ops);
3496       // Transfer memoperands.
3497       MachineMemOperand *MemOp =
3498           cast<MemIntrinsicSDNode>(Node)->getMemOperand();
3499       CurDAG->setNodeMemRefs(cast<MachineSDNode>(St), {MemOp});
3500 
3501       ReplaceNode(Node, St);
3502       return;
3503     }
3504     case Intrinsic::aarch64_neon_ld1x2:
3505       if (VT == MVT::v8i8) {
3506         SelectLoad(Node, 2, AArch64::LD1Twov8b, AArch64::dsub0);
3507         return;
3508       } else if (VT == MVT::v16i8) {
3509         SelectLoad(Node, 2, AArch64::LD1Twov16b, AArch64::qsub0);
3510         return;
3511       } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
3512         SelectLoad(Node, 2, AArch64::LD1Twov4h, AArch64::dsub0);
3513         return;
3514       } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
3515         SelectLoad(Node, 2, AArch64::LD1Twov8h, AArch64::qsub0);
3516         return;
3517       } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3518         SelectLoad(Node, 2, AArch64::LD1Twov2s, AArch64::dsub0);
3519         return;
3520       } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3521         SelectLoad(Node, 2, AArch64::LD1Twov4s, AArch64::qsub0);
3522         return;
3523       } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3524         SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0);
3525         return;
3526       } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3527         SelectLoad(Node, 2, AArch64::LD1Twov2d, AArch64::qsub0);
3528         return;
3529       }
3530       break;
3531     case Intrinsic::aarch64_neon_ld1x3:
3532       if (VT == MVT::v8i8) {
3533         SelectLoad(Node, 3, AArch64::LD1Threev8b, AArch64::dsub0);
3534         return;
3535       } else if (VT == MVT::v16i8) {
3536         SelectLoad(Node, 3, AArch64::LD1Threev16b, AArch64::qsub0);
3537         return;
3538       } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
3539         SelectLoad(Node, 3, AArch64::LD1Threev4h, AArch64::dsub0);
3540         return;
3541       } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
3542         SelectLoad(Node, 3, AArch64::LD1Threev8h, AArch64::qsub0);
3543         return;
3544       } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3545         SelectLoad(Node, 3, AArch64::LD1Threev2s, AArch64::dsub0);
3546         return;
3547       } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3548         SelectLoad(Node, 3, AArch64::LD1Threev4s, AArch64::qsub0);
3549         return;
3550       } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3551         SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0);
3552         return;
3553       } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3554         SelectLoad(Node, 3, AArch64::LD1Threev2d, AArch64::qsub0);
3555         return;
3556       }
3557       break;
3558     case Intrinsic::aarch64_neon_ld1x4:
3559       if (VT == MVT::v8i8) {
3560         SelectLoad(Node, 4, AArch64::LD1Fourv8b, AArch64::dsub0);
3561         return;
3562       } else if (VT == MVT::v16i8) {
3563         SelectLoad(Node, 4, AArch64::LD1Fourv16b, AArch64::qsub0);
3564         return;
3565       } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
3566         SelectLoad(Node, 4, AArch64::LD1Fourv4h, AArch64::dsub0);
3567         return;
3568       } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
3569         SelectLoad(Node, 4, AArch64::LD1Fourv8h, AArch64::qsub0);
3570         return;
3571       } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3572         SelectLoad(Node, 4, AArch64::LD1Fourv2s, AArch64::dsub0);
3573         return;
3574       } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3575         SelectLoad(Node, 4, AArch64::LD1Fourv4s, AArch64::qsub0);
3576         return;
3577       } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3578         SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0);
3579         return;
3580       } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3581         SelectLoad(Node, 4, AArch64::LD1Fourv2d, AArch64::qsub0);
3582         return;
3583       }
3584       break;
3585     case Intrinsic::aarch64_neon_ld2:
3586       if (VT == MVT::v8i8) {
3587         SelectLoad(Node, 2, AArch64::LD2Twov8b, AArch64::dsub0);
3588         return;
3589       } else if (VT == MVT::v16i8) {
3590         SelectLoad(Node, 2, AArch64::LD2Twov16b, AArch64::qsub0);
3591         return;
3592       } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
3593         SelectLoad(Node, 2, AArch64::LD2Twov4h, AArch64::dsub0);
3594         return;
3595       } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
3596         SelectLoad(Node, 2, AArch64::LD2Twov8h, AArch64::qsub0);
3597         return;
3598       } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3599         SelectLoad(Node, 2, AArch64::LD2Twov2s, AArch64::dsub0);
3600         return;
3601       } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3602         SelectLoad(Node, 2, AArch64::LD2Twov4s, AArch64::qsub0);
3603         return;
3604       } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3605         SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0);
3606         return;
3607       } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3608         SelectLoad(Node, 2, AArch64::LD2Twov2d, AArch64::qsub0);
3609         return;
3610       }
3611       break;
3612     case Intrinsic::aarch64_neon_ld3:
3613       if (VT == MVT::v8i8) {
3614         SelectLoad(Node, 3, AArch64::LD3Threev8b, AArch64::dsub0);
3615         return;
3616       } else if (VT == MVT::v16i8) {
3617         SelectLoad(Node, 3, AArch64::LD3Threev16b, AArch64::qsub0);
3618         return;
3619       } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
3620         SelectLoad(Node, 3, AArch64::LD3Threev4h, AArch64::dsub0);
3621         return;
3622       } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
3623         SelectLoad(Node, 3, AArch64::LD3Threev8h, AArch64::qsub0);
3624         return;
3625       } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3626         SelectLoad(Node, 3, AArch64::LD3Threev2s, AArch64::dsub0);
3627         return;
3628       } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3629         SelectLoad(Node, 3, AArch64::LD3Threev4s, AArch64::qsub0);
3630         return;
3631       } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3632         SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0);
3633         return;
3634       } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3635         SelectLoad(Node, 3, AArch64::LD3Threev2d, AArch64::qsub0);
3636         return;
3637       }
3638       break;
3639     case Intrinsic::aarch64_neon_ld4:
3640       if (VT == MVT::v8i8) {
3641         SelectLoad(Node, 4, AArch64::LD4Fourv8b, AArch64::dsub0);
3642         return;
3643       } else if (VT == MVT::v16i8) {
3644         SelectLoad(Node, 4, AArch64::LD4Fourv16b, AArch64::qsub0);
3645         return;
3646       } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
3647         SelectLoad(Node, 4, AArch64::LD4Fourv4h, AArch64::dsub0);
3648         return;
3649       } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
3650         SelectLoad(Node, 4, AArch64::LD4Fourv8h, AArch64::qsub0);
3651         return;
3652       } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3653         SelectLoad(Node, 4, AArch64::LD4Fourv2s, AArch64::dsub0);
3654         return;
3655       } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3656         SelectLoad(Node, 4, AArch64::LD4Fourv4s, AArch64::qsub0);
3657         return;
3658       } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3659         SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0);
3660         return;
3661       } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3662         SelectLoad(Node, 4, AArch64::LD4Fourv2d, AArch64::qsub0);
3663         return;
3664       }
3665       break;
3666     case Intrinsic::aarch64_neon_ld2r:
3667       if (VT == MVT::v8i8) {
3668         SelectLoad(Node, 2, AArch64::LD2Rv8b, AArch64::dsub0);
3669         return;
3670       } else if (VT == MVT::v16i8) {
3671         SelectLoad(Node, 2, AArch64::LD2Rv16b, AArch64::qsub0);
3672         return;
3673       } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
3674         SelectLoad(Node, 2, AArch64::LD2Rv4h, AArch64::dsub0);
3675         return;
3676       } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
3677         SelectLoad(Node, 2, AArch64::LD2Rv8h, AArch64::qsub0);
3678         return;
3679       } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3680         SelectLoad(Node, 2, AArch64::LD2Rv2s, AArch64::dsub0);
3681         return;
3682       } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3683         SelectLoad(Node, 2, AArch64::LD2Rv4s, AArch64::qsub0);
3684         return;
3685       } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3686         SelectLoad(Node, 2, AArch64::LD2Rv1d, AArch64::dsub0);
3687         return;
3688       } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3689         SelectLoad(Node, 2, AArch64::LD2Rv2d, AArch64::qsub0);
3690         return;
3691       }
3692       break;
3693     case Intrinsic::aarch64_neon_ld3r:
3694       if (VT == MVT::v8i8) {
3695         SelectLoad(Node, 3, AArch64::LD3Rv8b, AArch64::dsub0);
3696         return;
3697       } else if (VT == MVT::v16i8) {
3698         SelectLoad(Node, 3, AArch64::LD3Rv16b, AArch64::qsub0);
3699         return;
3700       } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
3701         SelectLoad(Node, 3, AArch64::LD3Rv4h, AArch64::dsub0);
3702         return;
3703       } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
3704         SelectLoad(Node, 3, AArch64::LD3Rv8h, AArch64::qsub0);
3705         return;
3706       } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3707         SelectLoad(Node, 3, AArch64::LD3Rv2s, AArch64::dsub0);
3708         return;
3709       } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3710         SelectLoad(Node, 3, AArch64::LD3Rv4s, AArch64::qsub0);
3711         return;
3712       } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3713         SelectLoad(Node, 3, AArch64::LD3Rv1d, AArch64::dsub0);
3714         return;
3715       } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3716         SelectLoad(Node, 3, AArch64::LD3Rv2d, AArch64::qsub0);
3717         return;
3718       }
3719       break;
3720     case Intrinsic::aarch64_neon_ld4r:
3721       if (VT == MVT::v8i8) {
3722         SelectLoad(Node, 4, AArch64::LD4Rv8b, AArch64::dsub0);
3723         return;
3724       } else if (VT == MVT::v16i8) {
3725         SelectLoad(Node, 4, AArch64::LD4Rv16b, AArch64::qsub0);
3726         return;
3727       } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
3728         SelectLoad(Node, 4, AArch64::LD4Rv4h, AArch64::dsub0);
3729         return;
3730       } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
3731         SelectLoad(Node, 4, AArch64::LD4Rv8h, AArch64::qsub0);
3732         return;
3733       } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3734         SelectLoad(Node, 4, AArch64::LD4Rv2s, AArch64::dsub0);
3735         return;
3736       } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3737         SelectLoad(Node, 4, AArch64::LD4Rv4s, AArch64::qsub0);
3738         return;
3739       } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3740         SelectLoad(Node, 4, AArch64::LD4Rv1d, AArch64::dsub0);
3741         return;
3742       } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3743         SelectLoad(Node, 4, AArch64::LD4Rv2d, AArch64::qsub0);
3744         return;
3745       }
3746       break;
3747     case Intrinsic::aarch64_neon_ld2lane:
3748       if (VT == MVT::v16i8 || VT == MVT::v8i8) {
3749         SelectLoadLane(Node, 2, AArch64::LD2i8);
3750         return;
3751       } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
3752                  VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
3753         SelectLoadLane(Node, 2, AArch64::LD2i16);
3754         return;
3755       } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
3756                  VT == MVT::v2f32) {
3757         SelectLoadLane(Node, 2, AArch64::LD2i32);
3758         return;
3759       } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
3760                  VT == MVT::v1f64) {
3761         SelectLoadLane(Node, 2, AArch64::LD2i64);
3762         return;
3763       }
3764       break;
3765     case Intrinsic::aarch64_neon_ld3lane:
3766       if (VT == MVT::v16i8 || VT == MVT::v8i8) {
3767         SelectLoadLane(Node, 3, AArch64::LD3i8);
3768         return;
3769       } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
3770                  VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
3771         SelectLoadLane(Node, 3, AArch64::LD3i16);
3772         return;
3773       } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
3774                  VT == MVT::v2f32) {
3775         SelectLoadLane(Node, 3, AArch64::LD3i32);
3776         return;
3777       } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
3778                  VT == MVT::v1f64) {
3779         SelectLoadLane(Node, 3, AArch64::LD3i64);
3780         return;
3781       }
3782       break;
3783     case Intrinsic::aarch64_neon_ld4lane:
3784       if (VT == MVT::v16i8 || VT == MVT::v8i8) {
3785         SelectLoadLane(Node, 4, AArch64::LD4i8);
3786         return;
3787       } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
3788                  VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
3789         SelectLoadLane(Node, 4, AArch64::LD4i16);
3790         return;
3791       } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
3792                  VT == MVT::v2f32) {
3793         SelectLoadLane(Node, 4, AArch64::LD4i32);
3794         return;
3795       } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
3796                  VT == MVT::v1f64) {
3797         SelectLoadLane(Node, 4, AArch64::LD4i64);
3798         return;
3799       }
3800       break;
3801     }
3802   } break;
3803   case ISD::INTRINSIC_WO_CHAIN: {
3804     unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue();
3805     switch (IntNo) {
3806     default:
3807       break;
3808     case Intrinsic::aarch64_tagp:
3809       SelectTagP(Node);
3810       return;
3811     case Intrinsic::aarch64_neon_tbl2:
3812       SelectTable(Node, 2,
3813                   VT == MVT::v8i8 ? AArch64::TBLv8i8Two : AArch64::TBLv16i8Two,
3814                   false);
3815       return;
3816     case Intrinsic::aarch64_neon_tbl3:
3817       SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBLv8i8Three
3818                                            : AArch64::TBLv16i8Three,
3819                   false);
3820       return;
3821     case Intrinsic::aarch64_neon_tbl4:
3822       SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBLv8i8Four
3823                                            : AArch64::TBLv16i8Four,
3824                   false);
3825       return;
3826     case Intrinsic::aarch64_neon_tbx2:
3827       SelectTable(Node, 2,
3828                   VT == MVT::v8i8 ? AArch64::TBXv8i8Two : AArch64::TBXv16i8Two,
3829                   true);
3830       return;
3831     case Intrinsic::aarch64_neon_tbx3:
3832       SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBXv8i8Three
3833                                            : AArch64::TBXv16i8Three,
3834                   true);
3835       return;
3836     case Intrinsic::aarch64_neon_tbx4:
3837       SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBXv8i8Four
3838                                            : AArch64::TBXv16i8Four,
3839                   true);
3840       return;
3841     case Intrinsic::aarch64_neon_smull:
3842     case Intrinsic::aarch64_neon_umull:
3843       if (tryMULLV64LaneV128(IntNo, Node))
3844         return;
3845       break;
3846     }
3847     break;
3848   }
3849   case ISD::INTRINSIC_VOID: {
3850     unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
3851     if (Node->getNumOperands() >= 3)
3852       VT = Node->getOperand(2)->getValueType(0);
3853     switch (IntNo) {
3854     default:
3855       break;
3856     case Intrinsic::aarch64_neon_st1x2: {
3857       if (VT == MVT::v8i8) {
3858         SelectStore(Node, 2, AArch64::ST1Twov8b);
3859         return;
3860       } else if (VT == MVT::v16i8) {
3861         SelectStore(Node, 2, AArch64::ST1Twov16b);
3862         return;
3863       } else if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
3864                  VT == MVT::v4bf16) {
3865         SelectStore(Node, 2, AArch64::ST1Twov4h);
3866         return;
3867       } else if (VT == MVT::v8i16 || VT == MVT::v8f16 ||
3868                  VT == MVT::v8bf16) {
3869         SelectStore(Node, 2, AArch64::ST1Twov8h);
3870         return;
3871       } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3872         SelectStore(Node, 2, AArch64::ST1Twov2s);
3873         return;
3874       } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3875         SelectStore(Node, 2, AArch64::ST1Twov4s);
3876         return;
3877       } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3878         SelectStore(Node, 2, AArch64::ST1Twov2d);
3879         return;
3880       } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3881         SelectStore(Node, 2, AArch64::ST1Twov1d);
3882         return;
3883       }
3884       break;
3885     }
3886     case Intrinsic::aarch64_neon_st1x3: {
3887       if (VT == MVT::v8i8) {
3888         SelectStore(Node, 3, AArch64::ST1Threev8b);
3889         return;
3890       } else if (VT == MVT::v16i8) {
3891         SelectStore(Node, 3, AArch64::ST1Threev16b);
3892         return;
3893       } else if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
3894                  VT == MVT::v4bf16) {
3895         SelectStore(Node, 3, AArch64::ST1Threev4h);
3896         return;
3897       } else if (VT == MVT::v8i16 || VT == MVT::v8f16 ||
3898                  VT == MVT::v8bf16) {
3899         SelectStore(Node, 3, AArch64::ST1Threev8h);
3900         return;
3901       } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3902         SelectStore(Node, 3, AArch64::ST1Threev2s);
3903         return;
3904       } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3905         SelectStore(Node, 3, AArch64::ST1Threev4s);
3906         return;
3907       } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3908         SelectStore(Node, 3, AArch64::ST1Threev2d);
3909         return;
3910       } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3911         SelectStore(Node, 3, AArch64::ST1Threev1d);
3912         return;
3913       }
3914       break;
3915     }
3916     case Intrinsic::aarch64_neon_st1x4: {
3917       if (VT == MVT::v8i8) {
3918         SelectStore(Node, 4, AArch64::ST1Fourv8b);
3919         return;
3920       } else if (VT == MVT::v16i8) {
3921         SelectStore(Node, 4, AArch64::ST1Fourv16b);
3922         return;
3923       } else if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
3924                  VT == MVT::v4bf16) {
3925         SelectStore(Node, 4, AArch64::ST1Fourv4h);
3926         return;
3927       } else if (VT == MVT::v8i16 || VT == MVT::v8f16 ||
3928                  VT == MVT::v8bf16) {
3929         SelectStore(Node, 4, AArch64::ST1Fourv8h);
3930         return;
3931       } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3932         SelectStore(Node, 4, AArch64::ST1Fourv2s);
3933         return;
3934       } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3935         SelectStore(Node, 4, AArch64::ST1Fourv4s);
3936         return;
3937       } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3938         SelectStore(Node, 4, AArch64::ST1Fourv2d);
3939         return;
3940       } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3941         SelectStore(Node, 4, AArch64::ST1Fourv1d);
3942         return;
3943       }
3944       break;
3945     }
3946     case Intrinsic::aarch64_neon_st2: {
3947       if (VT == MVT::v8i8) {
3948         SelectStore(Node, 2, AArch64::ST2Twov8b);
3949         return;
3950       } else if (VT == MVT::v16i8) {
3951         SelectStore(Node, 2, AArch64::ST2Twov16b);
3952         return;
3953       } else if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
3954                  VT == MVT::v4bf16) {
3955         SelectStore(Node, 2, AArch64::ST2Twov4h);
3956         return;
3957       } else if (VT == MVT::v8i16 || VT == MVT::v8f16 ||
3958                  VT == MVT::v8bf16) {
3959         SelectStore(Node, 2, AArch64::ST2Twov8h);
3960         return;
3961       } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3962         SelectStore(Node, 2, AArch64::ST2Twov2s);
3963         return;
3964       } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3965         SelectStore(Node, 2, AArch64::ST2Twov4s);
3966         return;
3967       } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3968         SelectStore(Node, 2, AArch64::ST2Twov2d);
3969         return;
3970       } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3971         SelectStore(Node, 2, AArch64::ST1Twov1d);
3972         return;
3973       }
3974       break;
3975     }
3976     case Intrinsic::aarch64_neon_st3: {
3977       if (VT == MVT::v8i8) {
3978         SelectStore(Node, 3, AArch64::ST3Threev8b);
3979         return;
3980       } else if (VT == MVT::v16i8) {
3981         SelectStore(Node, 3, AArch64::ST3Threev16b);
3982         return;
3983       } else if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
3984                  VT == MVT::v4bf16) {
3985         SelectStore(Node, 3, AArch64::ST3Threev4h);
3986         return;
3987       } else if (VT == MVT::v8i16 || VT == MVT::v8f16 ||
3988                  VT == MVT::v8bf16) {
3989         SelectStore(Node, 3, AArch64::ST3Threev8h);
3990         return;
3991       } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3992         SelectStore(Node, 3, AArch64::ST3Threev2s);
3993         return;
3994       } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3995         SelectStore(Node, 3, AArch64::ST3Threev4s);
3996         return;
3997       } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3998         SelectStore(Node, 3, AArch64::ST3Threev2d);
3999         return;
4000       } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4001         SelectStore(Node, 3, AArch64::ST1Threev1d);
4002         return;
4003       }
4004       break;
4005     }
4006     case Intrinsic::aarch64_neon_st4: {
4007       if (VT == MVT::v8i8) {
4008         SelectStore(Node, 4, AArch64::ST4Fourv8b);
4009         return;
4010       } else if (VT == MVT::v16i8) {
4011         SelectStore(Node, 4, AArch64::ST4Fourv16b);
4012         return;
4013       } else if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
4014                  VT == MVT::v4bf16) {
4015         SelectStore(Node, 4, AArch64::ST4Fourv4h);
4016         return;
4017       } else if (VT == MVT::v8i16 || VT == MVT::v8f16 ||
4018                  VT == MVT::v8bf16) {
4019         SelectStore(Node, 4, AArch64::ST4Fourv8h);
4020         return;
4021       } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4022         SelectStore(Node, 4, AArch64::ST4Fourv2s);
4023         return;
4024       } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4025         SelectStore(Node, 4, AArch64::ST4Fourv4s);
4026         return;
4027       } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4028         SelectStore(Node, 4, AArch64::ST4Fourv2d);
4029         return;
4030       } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4031         SelectStore(Node, 4, AArch64::ST1Fourv1d);
4032         return;
4033       }
4034       break;
4035     }
4036     case Intrinsic::aarch64_neon_st2lane: {
4037       if (VT == MVT::v16i8 || VT == MVT::v8i8) {
4038         SelectStoreLane(Node, 2, AArch64::ST2i8);
4039         return;
4040       } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
4041                  VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
4042         SelectStoreLane(Node, 2, AArch64::ST2i16);
4043         return;
4044       } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
4045                  VT == MVT::v2f32) {
4046         SelectStoreLane(Node, 2, AArch64::ST2i32);
4047         return;
4048       } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
4049                  VT == MVT::v1f64) {
4050         SelectStoreLane(Node, 2, AArch64::ST2i64);
4051         return;
4052       }
4053       break;
4054     }
4055     case Intrinsic::aarch64_neon_st3lane: {
4056       if (VT == MVT::v16i8 || VT == MVT::v8i8) {
4057         SelectStoreLane(Node, 3, AArch64::ST3i8);
4058         return;
4059       } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
4060                  VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
4061         SelectStoreLane(Node, 3, AArch64::ST3i16);
4062         return;
4063       } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
4064                  VT == MVT::v2f32) {
4065         SelectStoreLane(Node, 3, AArch64::ST3i32);
4066         return;
4067       } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
4068                  VT == MVT::v1f64) {
4069         SelectStoreLane(Node, 3, AArch64::ST3i64);
4070         return;
4071       }
4072       break;
4073     }
4074     case Intrinsic::aarch64_neon_st4lane: {
4075       if (VT == MVT::v16i8 || VT == MVT::v8i8) {
4076         SelectStoreLane(Node, 4, AArch64::ST4i8);
4077         return;
4078       } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
4079                  VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
4080         SelectStoreLane(Node, 4, AArch64::ST4i16);
4081         return;
4082       } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
4083                  VT == MVT::v2f32) {
4084         SelectStoreLane(Node, 4, AArch64::ST4i32);
4085         return;
4086       } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
4087                  VT == MVT::v1f64) {
4088         SelectStoreLane(Node, 4, AArch64::ST4i64);
4089         return;
4090       }
4091       break;
4092     }
4093     case Intrinsic::aarch64_sve_st2: {
4094       if (VT == MVT::nxv16i8) {
4095         SelectPredicatedStore(Node, 2, 0, AArch64::ST2B, AArch64::ST2B_IMM);
4096         return;
4097       } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
4098                  (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
4099         SelectPredicatedStore(Node, 2, 1, AArch64::ST2H, AArch64::ST2H_IMM);
4100         return;
4101       } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
4102         SelectPredicatedStore(Node, 2, 2, AArch64::ST2W, AArch64::ST2W_IMM);
4103         return;
4104       } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
4105         SelectPredicatedStore(Node, 2, 3, AArch64::ST2D, AArch64::ST2D_IMM);
4106         return;
4107       }
4108       break;
4109     }
4110     case Intrinsic::aarch64_sve_st3: {
4111       if (VT == MVT::nxv16i8) {
4112         SelectPredicatedStore(Node, 3, 0, AArch64::ST3B, AArch64::ST3B_IMM);
4113         return;
4114       } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
4115                  (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
4116         SelectPredicatedStore(Node, 3, 1, AArch64::ST3H, AArch64::ST3H_IMM);
4117         return;
4118       } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
4119         SelectPredicatedStore(Node, 3, 2, AArch64::ST3W, AArch64::ST3W_IMM);
4120         return;
4121       } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
4122         SelectPredicatedStore(Node, 3, 3, AArch64::ST3D, AArch64::ST3D_IMM);
4123         return;
4124       }
4125       break;
4126     }
4127     case Intrinsic::aarch64_sve_st4: {
4128       if (VT == MVT::nxv16i8) {
4129         SelectPredicatedStore(Node, 4, 0, AArch64::ST4B, AArch64::ST4B_IMM);
4130         return;
4131       } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
4132                  (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
4133         SelectPredicatedStore(Node, 4, 1, AArch64::ST4H, AArch64::ST4H_IMM);
4134         return;
4135       } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
4136         SelectPredicatedStore(Node, 4, 2, AArch64::ST4W, AArch64::ST4W_IMM);
4137         return;
4138       } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
4139         SelectPredicatedStore(Node, 4, 3, AArch64::ST4D, AArch64::ST4D_IMM);
4140         return;
4141       }
4142       break;
4143     }
4144     }
4145     break;
4146   }
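  // Post-incremented NEON structured loads (LDn). v1i64/v1f64 have no LDn
  // .1d arrangement, so those types fall back to the equivalent LD1
  // multi-register forms.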
4147   case AArch64ISD::LD2post: {
4148     if (VT == MVT::v8i8) {
4149       SelectPostLoad(Node, 2, AArch64::LD2Twov8b_POST, AArch64::dsub0);
4150       return;
4151     } else if (VT == MVT::v16i8) {
4152       SelectPostLoad(Node, 2, AArch64::LD2Twov16b_POST, AArch64::qsub0);
4153       return;
4154     } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
4155       SelectPostLoad(Node, 2, AArch64::LD2Twov4h_POST, AArch64::dsub0);
4156       return;
4157     } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
4158       SelectPostLoad(Node, 2, AArch64::LD2Twov8h_POST, AArch64::qsub0);
4159       return;
4160     } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4161       SelectPostLoad(Node, 2, AArch64::LD2Twov2s_POST, AArch64::dsub0);
4162       return;
4163     } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4164       SelectPostLoad(Node, 2, AArch64::LD2Twov4s_POST, AArch64::qsub0);
4165       return;
4166     } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4167       SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0);
4168       return;
4169     } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4170       SelectPostLoad(Node, 2, AArch64::LD2Twov2d_POST, AArch64::qsub0);
4171       return;
4172     }
4173     break;
4174   }
4175   case AArch64ISD::LD3post: {
4176     if (VT == MVT::v8i8) {
4177       SelectPostLoad(Node, 3, AArch64::LD3Threev8b_POST, AArch64::dsub0);
4178       return;
4179     } else if (VT == MVT::v16i8) {
4180       SelectPostLoad(Node, 3, AArch64::LD3Threev16b_POST, AArch64::qsub0);
4181       return;
4182     } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
4183       SelectPostLoad(Node, 3, AArch64::LD3Threev4h_POST, AArch64::dsub0);
4184       return;
4185     } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
4186       SelectPostLoad(Node, 3, AArch64::LD3Threev8h_POST, AArch64::qsub0);
4187       return;
4188     } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4189       SelectPostLoad(Node, 3, AArch64::LD3Threev2s_POST, AArch64::dsub0);
4190       return;
4191     } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4192       SelectPostLoad(Node, 3, AArch64::LD3Threev4s_POST, AArch64::qsub0);
4193       return;
4194     } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4195       SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0);
4196       return;
4197     } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4198       SelectPostLoad(Node, 3, AArch64::LD3Threev2d_POST, AArch64::qsub0);
4199       return;
4200     }
4201     break;
4202   }
4203   case AArch64ISD::LD4post: {
4204     if (VT == MVT::v8i8) {
4205       SelectPostLoad(Node, 4, AArch64::LD4Fourv8b_POST, AArch64::dsub0);
4206       return;
4207     } else if (VT == MVT::v16i8) {
4208       SelectPostLoad(Node, 4, AArch64::LD4Fourv16b_POST, AArch64::qsub0);
4209       return;
4210     } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
4211       SelectPostLoad(Node, 4, AArch64::LD4Fourv4h_POST, AArch64::dsub0);
4212       return;
4213     } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
4214       SelectPostLoad(Node, 4, AArch64::LD4Fourv8h_POST, AArch64::qsub0);
4215       return;
4216     } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4217       SelectPostLoad(Node, 4, AArch64::LD4Fourv2s_POST, AArch64::dsub0);
4218       return;
4219     } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4220       SelectPostLoad(Node, 4, AArch64::LD4Fourv4s_POST, AArch64::qsub0);
4221       return;
4222     } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4223       SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0);
4224       return;
4225     } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4226       SelectPostLoad(Node, 4, AArch64::LD4Fourv2d_POST, AArch64::qsub0);
4227       return;
4228     }
4229     break;
4230   }
4231   case AArch64ISD::LD1x2post: {
4232     if (VT == MVT::v8i8) {
4233       SelectPostLoad(Node, 2, AArch64::LD1Twov8b_POST, AArch64::dsub0);
4234       return;
4235     } else if (VT == MVT::v16i8) {
4236       SelectPostLoad(Node, 2, AArch64::LD1Twov16b_POST, AArch64::qsub0);
4237       return;
4238     } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
4239       SelectPostLoad(Node, 2, AArch64::LD1Twov4h_POST, AArch64::dsub0);
4240       return;
4241     } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
4242       SelectPostLoad(Node, 2, AArch64::LD1Twov8h_POST, AArch64::qsub0);
4243       return;
4244     } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4245       SelectPostLoad(Node, 2, AArch64::LD1Twov2s_POST, AArch64::dsub0);
4246       return;
4247     } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4248       SelectPostLoad(Node, 2, AArch64::LD1Twov4s_POST, AArch64::qsub0);
4249       return;
4250     } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4251       SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0);
4252       return;
4253     } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4254       SelectPostLoad(Node, 2, AArch64::LD1Twov2d_POST, AArch64::qsub0);
4255       return;
4256     }
4257     break;
4258   }
4259   case AArch64ISD::LD1x3post: {
4260     if (VT == MVT::v8i8) {
4261       SelectPostLoad(Node, 3, AArch64::LD1Threev8b_POST, AArch64::dsub0);
4262       return;
4263     } else if (VT == MVT::v16i8) {
4264       SelectPostLoad(Node, 3, AArch64::LD1Threev16b_POST, AArch64::qsub0);
4265       return;
4266     } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
4267       SelectPostLoad(Node, 3, AArch64::LD1Threev4h_POST, AArch64::dsub0);
4268       return;
4269     } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
4270       SelectPostLoad(Node, 3, AArch64::LD1Threev8h_POST, AArch64::qsub0);
4271       return;
4272     } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4273       SelectPostLoad(Node, 3, AArch64::LD1Threev2s_POST, AArch64::dsub0);
4274       return;
4275     } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4276       SelectPostLoad(Node, 3, AArch64::LD1Threev4s_POST, AArch64::qsub0);
4277       return;
4278     } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4279       SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0);
4280       return;
4281     } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4282       SelectPostLoad(Node, 3, AArch64::LD1Threev2d_POST, AArch64::qsub0);
4283       return;
4284     }
4285     break;
4286   }
4287   case AArch64ISD::LD1x4post: {
4288     if (VT == MVT::v8i8) {
4289       SelectPostLoad(Node, 4, AArch64::LD1Fourv8b_POST, AArch64::dsub0);
4290       return;
4291     } else if (VT == MVT::v16i8) {
4292       SelectPostLoad(Node, 4, AArch64::LD1Fourv16b_POST, AArch64::qsub0);
4293       return;
4294     } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
4295       SelectPostLoad(Node, 4, AArch64::LD1Fourv4h_POST, AArch64::dsub0);
4296       return;
4297     } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
4298       SelectPostLoad(Node, 4, AArch64::LD1Fourv8h_POST, AArch64::qsub0);
4299       return;
4300     } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4301       SelectPostLoad(Node, 4, AArch64::LD1Fourv2s_POST, AArch64::dsub0);
4302       return;
4303     } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4304       SelectPostLoad(Node, 4, AArch64::LD1Fourv4s_POST, AArch64::qsub0);
4305       return;
4306     } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4307       SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0);
4308       return;
4309     } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4310       SelectPostLoad(Node, 4, AArch64::LD1Fourv2d_POST, AArch64::qsub0);
4311       return;
4312     }
4313     break;
4314   }
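  // Post-incremented replicating loads (LD1R..LD4R): the loaded elements are
  // broadcast to every lane of the destination vectors.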
4315   case AArch64ISD::LD1DUPpost: {
4316     if (VT == MVT::v8i8) {
4317       SelectPostLoad(Node, 1, AArch64::LD1Rv8b_POST, AArch64::dsub0);
4318       return;
4319     } else if (VT == MVT::v16i8) {
4320       SelectPostLoad(Node, 1, AArch64::LD1Rv16b_POST, AArch64::qsub0);
4321       return;
4322     } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
4323       SelectPostLoad(Node, 1, AArch64::LD1Rv4h_POST, AArch64::dsub0);
4324       return;
4325     } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
4326       SelectPostLoad(Node, 1, AArch64::LD1Rv8h_POST, AArch64::qsub0);
4327       return;
4328     } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4329       SelectPostLoad(Node, 1, AArch64::LD1Rv2s_POST, AArch64::dsub0);
4330       return;
4331     } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4332       SelectPostLoad(Node, 1, AArch64::LD1Rv4s_POST, AArch64::qsub0);
4333       return;
4334     } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4335       SelectPostLoad(Node, 1, AArch64::LD1Rv1d_POST, AArch64::dsub0);
4336       return;
4337     } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4338       SelectPostLoad(Node, 1, AArch64::LD1Rv2d_POST, AArch64::qsub0);
4339       return;
4340     }
4341     break;
4342   }
4343   case AArch64ISD::LD2DUPpost: {
4344     if (VT == MVT::v8i8) {
4345       SelectPostLoad(Node, 2, AArch64::LD2Rv8b_POST, AArch64::dsub0);
4346       return;
4347     } else if (VT == MVT::v16i8) {
4348       SelectPostLoad(Node, 2, AArch64::LD2Rv16b_POST, AArch64::qsub0);
4349       return;
4350     } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
4351       SelectPostLoad(Node, 2, AArch64::LD2Rv4h_POST, AArch64::dsub0);
4352       return;
4353     } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
4354       SelectPostLoad(Node, 2, AArch64::LD2Rv8h_POST, AArch64::qsub0);
4355       return;
4356     } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4357       SelectPostLoad(Node, 2, AArch64::LD2Rv2s_POST, AArch64::dsub0);
4358       return;
4359     } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4360       SelectPostLoad(Node, 2, AArch64::LD2Rv4s_POST, AArch64::qsub0);
4361       return;
4362     } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4363       SelectPostLoad(Node, 2, AArch64::LD2Rv1d_POST, AArch64::dsub0);
4364       return;
4365     } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4366       SelectPostLoad(Node, 2, AArch64::LD2Rv2d_POST, AArch64::qsub0);
4367       return;
4368     }
4369     break;
4370   }
4371   case AArch64ISD::LD3DUPpost: {
4372     if (VT == MVT::v8i8) {
4373       SelectPostLoad(Node, 3, AArch64::LD3Rv8b_POST, AArch64::dsub0);
4374       return;
4375     } else if (VT == MVT::v16i8) {
4376       SelectPostLoad(Node, 3, AArch64::LD3Rv16b_POST, AArch64::qsub0);
4377       return;
4378     } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
4379       SelectPostLoad(Node, 3, AArch64::LD3Rv4h_POST, AArch64::dsub0);
4380       return;
4381     } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
4382       SelectPostLoad(Node, 3, AArch64::LD3Rv8h_POST, AArch64::qsub0);
4383       return;
4384     } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4385       SelectPostLoad(Node, 3, AArch64::LD3Rv2s_POST, AArch64::dsub0);
4386       return;
4387     } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4388       SelectPostLoad(Node, 3, AArch64::LD3Rv4s_POST, AArch64::qsub0);
4389       return;
4390     } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4391       SelectPostLoad(Node, 3, AArch64::LD3Rv1d_POST, AArch64::dsub0);
4392       return;
4393     } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4394       SelectPostLoad(Node, 3, AArch64::LD3Rv2d_POST, AArch64::qsub0);
4395       return;
4396     }
4397     break;
4398   }
4399   case AArch64ISD::LD4DUPpost: {
4400     if (VT == MVT::v8i8) {
4401       SelectPostLoad(Node, 4, AArch64::LD4Rv8b_POST, AArch64::dsub0);
4402       return;
4403     } else if (VT == MVT::v16i8) {
4404       SelectPostLoad(Node, 4, AArch64::LD4Rv16b_POST, AArch64::qsub0);
4405       return;
4406     } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
4407       SelectPostLoad(Node, 4, AArch64::LD4Rv4h_POST, AArch64::dsub0);
4408       return;
4409     } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
4410       SelectPostLoad(Node, 4, AArch64::LD4Rv8h_POST, AArch64::qsub0);
4411       return;
4412     } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4413       SelectPostLoad(Node, 4, AArch64::LD4Rv2s_POST, AArch64::dsub0);
4414       return;
4415     } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4416       SelectPostLoad(Node, 4, AArch64::LD4Rv4s_POST, AArch64::qsub0);
4417       return;
4418     } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4419       SelectPostLoad(Node, 4, AArch64::LD4Rv1d_POST, AArch64::dsub0);
4420       return;
4421     } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4422       SelectPostLoad(Node, 4, AArch64::LD4Rv2d_POST, AArch64::qsub0);
4423       return;
4424     }
4425     break;
4426   }
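  // Post-incremented single-lane loads. Only the element size matters here,
  // so 64-bit and 128-bit vectors with the same element width share an
  // opcode.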
4427   case AArch64ISD::LD1LANEpost: {
4428     if (VT == MVT::v16i8 || VT == MVT::v8i8) {
4429       SelectPostLoadLane(Node, 1, AArch64::LD1i8_POST);
4430       return;
4431     } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
4432                VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
4433       SelectPostLoadLane(Node, 1, AArch64::LD1i16_POST);
4434       return;
4435     } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
4436                VT == MVT::v2f32) {
4437       SelectPostLoadLane(Node, 1, AArch64::LD1i32_POST);
4438       return;
4439     } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
4440                VT == MVT::v1f64) {
4441       SelectPostLoadLane(Node, 1, AArch64::LD1i64_POST);
4442       return;
4443     }
4444     break;
4445   }
4446   case AArch64ISD::LD2LANEpost: {
4447     if (VT == MVT::v16i8 || VT == MVT::v8i8) {
4448       SelectPostLoadLane(Node, 2, AArch64::LD2i8_POST);
4449       return;
4450     } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
4451                VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
4452       SelectPostLoadLane(Node, 2, AArch64::LD2i16_POST);
4453       return;
4454     } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
4455                VT == MVT::v2f32) {
4456       SelectPostLoadLane(Node, 2, AArch64::LD2i32_POST);
4457       return;
4458     } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
4459                VT == MVT::v1f64) {
4460       SelectPostLoadLane(Node, 2, AArch64::LD2i64_POST);
4461       return;
4462     }
4463     break;
4464   }
4465   case AArch64ISD::LD3LANEpost: {
4466     if (VT == MVT::v16i8 || VT == MVT::v8i8) {
4467       SelectPostLoadLane(Node, 3, AArch64::LD3i8_POST);
4468       return;
4469     } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
4470                VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
4471       SelectPostLoadLane(Node, 3, AArch64::LD3i16_POST);
4472       return;
4473     } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
4474                VT == MVT::v2f32) {
4475       SelectPostLoadLane(Node, 3, AArch64::LD3i32_POST);
4476       return;
4477     } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
4478                VT == MVT::v1f64) {
4479       SelectPostLoadLane(Node, 3, AArch64::LD3i64_POST);
4480       return;
4481     }
4482     break;
4483   }
4484   case AArch64ISD::LD4LANEpost: {
4485     if (VT == MVT::v16i8 || VT == MVT::v8i8) {
4486       SelectPostLoadLane(Node, 4, AArch64::LD4i8_POST);
4487       return;
4488     } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
4489                VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
4490       SelectPostLoadLane(Node, 4, AArch64::LD4i16_POST);
4491       return;
4492     } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
4493                VT == MVT::v2f32) {
4494       SelectPostLoadLane(Node, 4, AArch64::LD4i32_POST);
4495       return;
4496     } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
4497                VT == MVT::v1f64) {
4498       SelectPostLoadLane(Node, 4, AArch64::LD4i64_POST);
4499       return;
4500     }
4501     break;
4502   }
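  // Post-incremented structured stores. The value type is taken from
  // operand 1 (the data being stored); as with the loads, v1i64/v1f64 use the
  // ST1 multi-register forms.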
4503   case AArch64ISD::ST2post: {
4504     VT = Node->getOperand(1).getValueType();
4505     if (VT == MVT::v8i8) {
4506       SelectPostStore(Node, 2, AArch64::ST2Twov8b_POST);
4507       return;
4508     } else if (VT == MVT::v16i8) {
4509       SelectPostStore(Node, 2, AArch64::ST2Twov16b_POST);
4510       return;
4511     } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
4512       SelectPostStore(Node, 2, AArch64::ST2Twov4h_POST);
4513       return;
4514     } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
4515       SelectPostStore(Node, 2, AArch64::ST2Twov8h_POST);
4516       return;
4517     } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4518       SelectPostStore(Node, 2, AArch64::ST2Twov2s_POST);
4519       return;
4520     } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4521       SelectPostStore(Node, 2, AArch64::ST2Twov4s_POST);
4522       return;
4523     } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4524       SelectPostStore(Node, 2, AArch64::ST2Twov2d_POST);
4525       return;
4526     } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4527       SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST);
4528       return;
4529     }
4530     break;
4531   }
4532   case AArch64ISD::ST3post: {
4533     VT = Node->getOperand(1).getValueType();
4534     if (VT == MVT::v8i8) {
4535       SelectPostStore(Node, 3, AArch64::ST3Threev8b_POST);
4536       return;
4537     } else if (VT == MVT::v16i8) {
4538       SelectPostStore(Node, 3, AArch64::ST3Threev16b_POST);
4539       return;
4540     } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
4541       SelectPostStore(Node, 3, AArch64::ST3Threev4h_POST);
4542       return;
4543     } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
4544       SelectPostStore(Node, 3, AArch64::ST3Threev8h_POST);
4545       return;
4546     } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4547       SelectPostStore(Node, 3, AArch64::ST3Threev2s_POST);
4548       return;
4549     } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4550       SelectPostStore(Node, 3, AArch64::ST3Threev4s_POST);
4551       return;
4552     } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4553       SelectPostStore(Node, 3, AArch64::ST3Threev2d_POST);
4554       return;
4555     } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4556       SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST);
4557       return;
4558     }
4559     break;
4560   }
4561   case AArch64ISD::ST4post: {
4562     VT = Node->getOperand(1).getValueType();
4563     if (VT == MVT::v8i8) {
4564       SelectPostStore(Node, 4, AArch64::ST4Fourv8b_POST);
4565       return;
4566     } else if (VT == MVT::v16i8) {
4567       SelectPostStore(Node, 4, AArch64::ST4Fourv16b_POST);
4568       return;
4569     } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
4570       SelectPostStore(Node, 4, AArch64::ST4Fourv4h_POST);
4571       return;
4572     } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
4573       SelectPostStore(Node, 4, AArch64::ST4Fourv8h_POST);
4574       return;
4575     } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4576       SelectPostStore(Node, 4, AArch64::ST4Fourv2s_POST);
4577       return;
4578     } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4579       SelectPostStore(Node, 4, AArch64::ST4Fourv4s_POST);
4580       return;
4581     } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4582       SelectPostStore(Node, 4, AArch64::ST4Fourv2d_POST);
4583       return;
4584     } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4585       SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST);
4586       return;
4587     }
4588     break;
4589   }
4590   case AArch64ISD::ST1x2post: {
4591     VT = Node->getOperand(1).getValueType();
4592     if (VT == MVT::v8i8) {
4593       SelectPostStore(Node, 2, AArch64::ST1Twov8b_POST);
4594       return;
4595     } else if (VT == MVT::v16i8) {
4596       SelectPostStore(Node, 2, AArch64::ST1Twov16b_POST);
4597       return;
4598     } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
4599       SelectPostStore(Node, 2, AArch64::ST1Twov4h_POST);
4600       return;
4601     } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
4602       SelectPostStore(Node, 2, AArch64::ST1Twov8h_POST);
4603       return;
4604     } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4605       SelectPostStore(Node, 2, AArch64::ST1Twov2s_POST);
4606       return;
4607     } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4608       SelectPostStore(Node, 2, AArch64::ST1Twov4s_POST);
4609       return;
4610     } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4611       SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST);
4612       return;
4613     } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4614       SelectPostStore(Node, 2, AArch64::ST1Twov2d_POST);
4615       return;
4616     }
4617     break;
4618   }
4619   case AArch64ISD::ST1x3post: {
4620     VT = Node->getOperand(1).getValueType();
4621     if (VT == MVT::v8i8) {
4622       SelectPostStore(Node, 3, AArch64::ST1Threev8b_POST);
4623       return;
4624     } else if (VT == MVT::v16i8) {
4625       SelectPostStore(Node, 3, AArch64::ST1Threev16b_POST);
4626       return;
4627     } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
4628       SelectPostStore(Node, 3, AArch64::ST1Threev4h_POST);
4629       return;
4630     } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
4631       SelectPostStore(Node, 3, AArch64::ST1Threev8h_POST);
4632       return;
4633     } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4634       SelectPostStore(Node, 3, AArch64::ST1Threev2s_POST);
4635       return;
4636     } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4637       SelectPostStore(Node, 3, AArch64::ST1Threev4s_POST);
4638       return;
4639     } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4640       SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST);
4641       return;
4642     } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4643       SelectPostStore(Node, 3, AArch64::ST1Threev2d_POST);
4644       return;
4645     }
4646     break;
4647   }
4648   case AArch64ISD::ST1x4post: {
4649     VT = Node->getOperand(1).getValueType();
4650     if (VT == MVT::v8i8) {
4651       SelectPostStore(Node, 4, AArch64::ST1Fourv8b_POST);
4652       return;
4653     } else if (VT == MVT::v16i8) {
4654       SelectPostStore(Node, 4, AArch64::ST1Fourv16b_POST);
4655       return;
4656     } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
4657       SelectPostStore(Node, 4, AArch64::ST1Fourv4h_POST);
4658       return;
4659     } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
4660       SelectPostStore(Node, 4, AArch64::ST1Fourv8h_POST);
4661       return;
4662     } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4663       SelectPostStore(Node, 4, AArch64::ST1Fourv2s_POST);
4664       return;
4665     } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4666       SelectPostStore(Node, 4, AArch64::ST1Fourv4s_POST);
4667       return;
4668     } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4669       SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST);
4670       return;
4671     } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4672       SelectPostStore(Node, 4, AArch64::ST1Fourv2d_POST);
4673       return;
4674     }
4675     break;
4676   }
4677   case AArch64ISD::ST2LANEpost: {
4678     VT = Node->getOperand(1).getValueType();
4679     if (VT == MVT::v16i8 || VT == MVT::v8i8) {
4680       SelectPostStoreLane(Node, 2, AArch64::ST2i8_POST);
4681       return;
4682     } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
4683                VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
4684       SelectPostStoreLane(Node, 2, AArch64::ST2i16_POST);
4685       return;
4686     } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
4687                VT == MVT::v2f32) {
4688       SelectPostStoreLane(Node, 2, AArch64::ST2i32_POST);
4689       return;
4690     } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
4691                VT == MVT::v1f64) {
4692       SelectPostStoreLane(Node, 2, AArch64::ST2i64_POST);
4693       return;
4694     }
4695     break;
4696   }
4697   case AArch64ISD::ST3LANEpost: {
4698     VT = Node->getOperand(1).getValueType();
4699     if (VT == MVT::v16i8 || VT == MVT::v8i8) {
4700       SelectPostStoreLane(Node, 3, AArch64::ST3i8_POST);
4701       return;
4702     } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
4703                VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
4704       SelectPostStoreLane(Node, 3, AArch64::ST3i16_POST);
4705       return;
4706     } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
4707                VT == MVT::v2f32) {
4708       SelectPostStoreLane(Node, 3, AArch64::ST3i32_POST);
4709       return;
4710     } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
4711                VT == MVT::v1f64) {
4712       SelectPostStoreLane(Node, 3, AArch64::ST3i64_POST);
4713       return;
4714     }
4715     break;
4716   }
4717   case AArch64ISD::ST4LANEpost: {
4718     VT = Node->getOperand(1).getValueType();
4719     if (VT == MVT::v16i8 || VT == MVT::v8i8) {
4720       SelectPostStoreLane(Node, 4, AArch64::ST4i8_POST);
4721       return;
4722     } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
4723                VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
4724       SelectPostStoreLane(Node, 4, AArch64::ST4i16_POST);
4725       return;
4726     } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
4727                VT == MVT::v2f32) {
4728       SelectPostStoreLane(Node, 4, AArch64::ST4i32_POST);
4729       return;
4730     } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
4731                VT == MVT::v1f64) {
4732       SelectPostStoreLane(Node, 4, AArch64::ST4i64_POST);
4733       return;
4734     }
4735     break;
4736   }
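  // SVE structured loads that zero inactive lanes. Each case supplies the
  // immediate-offset and reg+reg instruction forms; bf16 again requires the
  // BF16 subtarget feature.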
4737   case AArch64ISD::SVE_LD2_MERGE_ZERO: {
4738     if (VT == MVT::nxv16i8) {
4739       SelectPredicatedLoad(Node, 2, 0, AArch64::LD2B_IMM, AArch64::LD2B);
4740       return;
4741     } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
4742                (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
4743       SelectPredicatedLoad(Node, 2, 1, AArch64::LD2H_IMM, AArch64::LD2H);
4744       return;
4745     } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
4746       SelectPredicatedLoad(Node, 2, 2, AArch64::LD2W_IMM, AArch64::LD2W);
4747       return;
4748     } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
4749       SelectPredicatedLoad(Node, 2, 3, AArch64::LD2D_IMM, AArch64::LD2D);
4750       return;
4751     }
4752     break;
4753   }
4754   case AArch64ISD::SVE_LD3_MERGE_ZERO: {
4755     if (VT == MVT::nxv16i8) {
4756       SelectPredicatedLoad(Node, 3, 0, AArch64::LD3B_IMM, AArch64::LD3B);
4757       return;
4758     } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
4759                (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
4760       SelectPredicatedLoad(Node, 3, 1, AArch64::LD3H_IMM, AArch64::LD3H);
4761       return;
4762     } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
4763       SelectPredicatedLoad(Node, 3, 2, AArch64::LD3W_IMM, AArch64::LD3W);
4764       return;
4765     } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
4766       SelectPredicatedLoad(Node, 3, 3, AArch64::LD3D_IMM, AArch64::LD3D);
4767       return;
4768     }
4769     break;
4770   }
4771   case AArch64ISD::SVE_LD4_MERGE_ZERO: {
4772     if (VT == MVT::nxv16i8) {
4773       SelectPredicatedLoad(Node, 4, 0, AArch64::LD4B_IMM, AArch64::LD4B);
4774       return;
4775     } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
4776                (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
4777       SelectPredicatedLoad(Node, 4, 1, AArch64::LD4H_IMM, AArch64::LD4H);
4778       return;
4779     } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
4780       SelectPredicatedLoad(Node, 4, 2, AArch64::LD4W_IMM, AArch64::LD4W);
4781       return;
4782     } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
4783       SelectPredicatedLoad(Node, 4, 3, AArch64::LD4D_IMM, AArch64::LD4D);
4784       return;
4785     }
4786     break;
4787   }
4788   }
4789 
4790   // Select the default instruction
4791   SelectCode(Node);
4792 }
4793 
4794 /// createAArch64ISelDag - This pass converts a legalized DAG into an
4795 /// AArch64-specific DAG, ready for instruction scheduling.
4796 FunctionPass *llvm::createAArch64ISelDag(AArch64TargetMachine &TM,
4797                                          CodeGenOpt::Level OptLevel) {
4798   return new AArch64DAGToDAGISel(TM, OptLevel);
4799 }
4800 
4801 /// When \p PredVT is a scalable vector predicate in the form
4802 /// MVT::nx<M>xi1, it builds the corresponding scalable vector of
4803 /// integers MVT::nx<M>xi<bits> s.t. M x bits = 128. When targeting
4804 /// structured vectors (NumVec > 1), the output data type is
4805 /// MVT::nx<M*NumVec>xi<bits> s.t. M x bits = 128. If the input
4806 /// PredVT is not in the form MVT::nx<M>xi1, it returns an invalid
4807 /// EVT.
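/// For example, MVT::nxv4i1 yields MVT::nxv4i32 when NumVec == 1 and
/// MVT::nxv8i32 when NumVec == 2 (4 x 32 == 128 in both cases).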
4808 static EVT getPackedVectorTypeFromPredicateType(LLVMContext &Ctx, EVT PredVT,
4809                                                 unsigned NumVec) {
4810   assert(NumVec > 0 && NumVec < 5 && "Invalid number of vectors.");
4811   if (!PredVT.isScalableVector() || PredVT.getVectorElementType() != MVT::i1)
4812     return EVT();
4813 
4814   if (PredVT != MVT::nxv16i1 && PredVT != MVT::nxv8i1 &&
4815       PredVT != MVT::nxv4i1 && PredVT != MVT::nxv2i1)
4816     return EVT();
4817 
4818   ElementCount EC = PredVT.getVectorElementCount();
4819   EVT ScalarVT = EVT::getIntegerVT(Ctx, AArch64::SVEBitsPerBlock / EC.Min);
4820   EVT MemVT = EVT::getVectorVT(Ctx, ScalarVT, EC * NumVec);
4821 
4822   return MemVT;
4823 }
4824 
4825 /// Return the EVT of the data associated with a memory operation in \p
4826 /// Root. If such an EVT cannot be retrieved, it returns an invalid EVT.
4827 static EVT getMemVTFromNode(LLVMContext &Ctx, SDNode *Root) {
4828   if (isa<MemSDNode>(Root))
4829     return cast<MemSDNode>(Root)->getMemoryVT();
4830 
4831   if (isa<MemIntrinsicSDNode>(Root))
4832     return cast<MemIntrinsicSDNode>(Root)->getMemoryVT();
4833 
4834   const unsigned Opcode = Root->getOpcode();
4835   // For custom ISD nodes, we have to look at them individually to extract the
4836   // type of the data moved to/from memory.
4837   switch (Opcode) {
4838   case AArch64ISD::LD1_MERGE_ZERO:
4839   case AArch64ISD::LD1S_MERGE_ZERO:
4840   case AArch64ISD::LDNF1_MERGE_ZERO:
4841   case AArch64ISD::LDNF1S_MERGE_ZERO:
4842     return cast<VTSDNode>(Root->getOperand(3))->getVT();
4843   case AArch64ISD::ST1_PRED:
4844     return cast<VTSDNode>(Root->getOperand(4))->getVT();
4845   case AArch64ISD::SVE_LD2_MERGE_ZERO:
4846     return getPackedVectorTypeFromPredicateType(
4847         Ctx, Root->getOperand(1)->getValueType(0), /*NumVec=*/2);
4848   case AArch64ISD::SVE_LD3_MERGE_ZERO:
4849     return getPackedVectorTypeFromPredicateType(
4850         Ctx, Root->getOperand(1)->getValueType(0), /*NumVec=*/3);
4851   case AArch64ISD::SVE_LD4_MERGE_ZERO:
4852     return getPackedVectorTypeFromPredicateType(
4853         Ctx, Root->getOperand(1)->getValueType(0), /*NumVec=*/4);
4854   default:
4855     break;
4856   }
4857 
4858   if (Opcode != ISD::INTRINSIC_VOID)
4859     return EVT();
4860 
4861   const unsigned IntNo =
4862       cast<ConstantSDNode>(Root->getOperand(1))->getZExtValue();
4863   if (IntNo != Intrinsic::aarch64_sve_prf)
4864     return EVT();
4865 
4866   // We are using an SVE prefetch intrinsic. The type must be inferred
4867   // from the width of the predicate.
4868   return getPackedVectorTypeFromPredicateType(
4869       Ctx, Root->getOperand(2)->getValueType(0), /*NumVec=*/1);
4870 }
4871 
4872 /// SelectAddrModeIndexedSVE - Attempt selection of the addressing mode:
4873 /// Base + OffImm * sizeof(MemVT) for Min <= OffImm <= Max
4874 /// where Root is the memory access using N for its address.
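/// For example, with MemVT == nxv4i32 (16 bytes of known-minimum size), an
/// address of the form (add Base, (vscale * 32)) yields OffImm == 2, provided
/// that value lies within [Min, Max].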
4875 template <int64_t Min, int64_t Max>
4876 bool AArch64DAGToDAGISel::SelectAddrModeIndexedSVE(SDNode *Root, SDValue N,
4877                                                    SDValue &Base,
4878                                                    SDValue &OffImm) {
4879   const EVT MemVT = getMemVTFromNode(*(CurDAG->getContext()), Root);
4880 
4881   if (MemVT == EVT())
4882     return false;
4883 
4884   if (N.getOpcode() != ISD::ADD)
4885     return false;
4886 
4887   SDValue VScale = N.getOperand(1);
4888   if (VScale.getOpcode() != ISD::VSCALE)
4889     return false;
4890 
4891   TypeSize TS = MemVT.getSizeInBits();
4892   int64_t MemWidthBytes = static_cast<int64_t>(TS.getKnownMinSize()) / 8;
4893   int64_t MulImm = cast<ConstantSDNode>(VScale.getOperand(0))->getSExtValue();
4894 
4895   if ((MulImm % MemWidthBytes) != 0)
4896     return false;
4897 
4898   int64_t Offset = MulImm / MemWidthBytes;
4899   if (Offset < Min || Offset > Max)
4900     return false;
4901 
4902   Base = N.getOperand(0);
4903   OffImm = CurDAG->getTargetConstant(Offset, SDLoc(N), MVT::i64);
4904   return true;
4905 }
4906 
4907 /// Select register plus register addressing mode for SVE, with scaled
4908 /// offset.
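/// For example, with Scale == 2 this matches (add Base, (shl Index, 2)) and
/// returns Base together with the unshifted Index; with Scale == 0 any
/// (add Base, Index) is accepted as-is.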
4909 bool AArch64DAGToDAGISel::SelectSVERegRegAddrMode(SDValue N, unsigned Scale,
4910                                                   SDValue &Base,
4911                                                   SDValue &Offset) {
4912   if (N.getOpcode() != ISD::ADD)
4913     return false;
4914 
4915   // Process an ADD node.
4916   const SDValue LHS = N.getOperand(0);
4917   const SDValue RHS = N.getOperand(1);
4918 
4919   // 8-bit data does not come with an SHL node, so it is treated
4920   // separately.
4921   if (Scale == 0) {
4922     Base = LHS;
4923     Offset = RHS;
4924     return true;
4925   }
4926 
4927   // Check if the RHS is a shift node with a constant.
4928   if (RHS.getOpcode() != ISD::SHL)
4929     return false;
4930 
4931   const SDValue ShiftRHS = RHS.getOperand(1);
4932   if (auto *C = dyn_cast<ConstantSDNode>(ShiftRHS))
4933     if (C->getZExtValue() == Scale) {
4934       Base = LHS;
4935       Offset = RHS.getOperand(0);
4936       return true;
4937     }
4938 
4939   return false;
4940 }
4941