xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp (revision 7029da5c36f2d3cf6bb6c81bf551229f416399e8)
1 //===-- AArch64ISelDAGToDAG.cpp - A dag to dag inst selector for AArch64 --===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines an instruction selector for the AArch64 target.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "AArch64TargetMachine.h"
14 #include "MCTargetDesc/AArch64AddressingModes.h"
15 #include "llvm/ADT/APSInt.h"
16 #include "llvm/CodeGen/SelectionDAGISel.h"
17 #include "llvm/IR/Function.h" // To access function attributes.
18 #include "llvm/IR/GlobalValue.h"
19 #include "llvm/IR/Intrinsics.h"
20 #include "llvm/Support/Debug.h"
21 #include "llvm/Support/ErrorHandling.h"
22 #include "llvm/Support/KnownBits.h"
23 #include "llvm/Support/MathExtras.h"
24 #include "llvm/Support/raw_ostream.h"
25 
26 using namespace llvm;
27 
28 #define DEBUG_TYPE "aarch64-isel"
29 
30 //===--------------------------------------------------------------------===//
31 /// AArch64DAGToDAGISel - AArch64 specific code to select AArch64 machine
32 /// instructions for SelectionDAG operations.
33 ///
34 namespace {
35 
36 class AArch64DAGToDAGISel : public SelectionDAGISel {
37 
38   /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can
39   /// make the right decision when generating code for different targets.
40   const AArch64Subtarget *Subtarget;
41 
42   bool ForCodeSize;
43 
44 public:
45   explicit AArch64DAGToDAGISel(AArch64TargetMachine &tm,
46                                CodeGenOpt::Level OptLevel)
47       : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr),
48         ForCodeSize(false) {}
49 
50   StringRef getPassName() const override {
51     return "AArch64 Instruction Selection";
52   }
53 
54   bool runOnMachineFunction(MachineFunction &MF) override {
55     ForCodeSize = MF.getFunction().hasOptSize();
56     Subtarget = &MF.getSubtarget<AArch64Subtarget>();
57     return SelectionDAGISel::runOnMachineFunction(MF);
58   }
59 
60   void Select(SDNode *Node) override;
61 
62   /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
63   /// inline asm expressions.
64   bool SelectInlineAsmMemoryOperand(const SDValue &Op,
65                                     unsigned ConstraintID,
66                                     std::vector<SDValue> &OutOps) override;
67 
68   bool tryMLAV64LaneV128(SDNode *N);
69   bool tryMULLV64LaneV128(unsigned IntNo, SDNode *N);
70   bool SelectArithExtendedRegister(SDValue N, SDValue &Reg, SDValue &Shift);
71   bool SelectArithImmed(SDValue N, SDValue &Val, SDValue &Shift);
72   bool SelectNegArithImmed(SDValue N, SDValue &Val, SDValue &Shift);
73   bool SelectArithShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) {
74     return SelectShiftedRegister(N, false, Reg, Shift);
75   }
76   bool SelectLogicalShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) {
77     return SelectShiftedRegister(N, true, Reg, Shift);
78   }
79   bool SelectAddrModeIndexed7S8(SDValue N, SDValue &Base, SDValue &OffImm) {
80     return SelectAddrModeIndexed7S(N, 1, Base, OffImm);
81   }
82   bool SelectAddrModeIndexed7S16(SDValue N, SDValue &Base, SDValue &OffImm) {
83     return SelectAddrModeIndexed7S(N, 2, Base, OffImm);
84   }
85   bool SelectAddrModeIndexed7S32(SDValue N, SDValue &Base, SDValue &OffImm) {
86     return SelectAddrModeIndexed7S(N, 4, Base, OffImm);
87   }
88   bool SelectAddrModeIndexed7S64(SDValue N, SDValue &Base, SDValue &OffImm) {
89     return SelectAddrModeIndexed7S(N, 8, Base, OffImm);
90   }
91   bool SelectAddrModeIndexed7S128(SDValue N, SDValue &Base, SDValue &OffImm) {
92     return SelectAddrModeIndexed7S(N, 16, Base, OffImm);
93   }
94   bool SelectAddrModeIndexedS9S128(SDValue N, SDValue &Base, SDValue &OffImm) {
95     return SelectAddrModeIndexedBitWidth(N, true, 9, 16, Base, OffImm);
96   }
97   bool SelectAddrModeIndexedU6S128(SDValue N, SDValue &Base, SDValue &OffImm) {
98     return SelectAddrModeIndexedBitWidth(N, false, 6, 16, Base, OffImm);
99   }
100   bool SelectAddrModeIndexed8(SDValue N, SDValue &Base, SDValue &OffImm) {
101     return SelectAddrModeIndexed(N, 1, Base, OffImm);
102   }
103   bool SelectAddrModeIndexed16(SDValue N, SDValue &Base, SDValue &OffImm) {
104     return SelectAddrModeIndexed(N, 2, Base, OffImm);
105   }
106   bool SelectAddrModeIndexed32(SDValue N, SDValue &Base, SDValue &OffImm) {
107     return SelectAddrModeIndexed(N, 4, Base, OffImm);
108   }
109   bool SelectAddrModeIndexed64(SDValue N, SDValue &Base, SDValue &OffImm) {
110     return SelectAddrModeIndexed(N, 8, Base, OffImm);
111   }
112   bool SelectAddrModeIndexed128(SDValue N, SDValue &Base, SDValue &OffImm) {
113     return SelectAddrModeIndexed(N, 16, Base, OffImm);
114   }
115   bool SelectAddrModeUnscaled8(SDValue N, SDValue &Base, SDValue &OffImm) {
116     return SelectAddrModeUnscaled(N, 1, Base, OffImm);
117   }
118   bool SelectAddrModeUnscaled16(SDValue N, SDValue &Base, SDValue &OffImm) {
119     return SelectAddrModeUnscaled(N, 2, Base, OffImm);
120   }
121   bool SelectAddrModeUnscaled32(SDValue N, SDValue &Base, SDValue &OffImm) {
122     return SelectAddrModeUnscaled(N, 4, Base, OffImm);
123   }
124   bool SelectAddrModeUnscaled64(SDValue N, SDValue &Base, SDValue &OffImm) {
125     return SelectAddrModeUnscaled(N, 8, Base, OffImm);
126   }
127   bool SelectAddrModeUnscaled128(SDValue N, SDValue &Base, SDValue &OffImm) {
128     return SelectAddrModeUnscaled(N, 16, Base, OffImm);
129   }
130 
131   template<int Width>
132   bool SelectAddrModeWRO(SDValue N, SDValue &Base, SDValue &Offset,
133                          SDValue &SignExtend, SDValue &DoShift) {
134     return SelectAddrModeWRO(N, Width / 8, Base, Offset, SignExtend, DoShift);
135   }
136 
137   template<int Width>
138   bool SelectAddrModeXRO(SDValue N, SDValue &Base, SDValue &Offset,
139                          SDValue &SignExtend, SDValue &DoShift) {
140     return SelectAddrModeXRO(N, Width / 8, Base, Offset, SignExtend, DoShift);
141   }
142 
143 
144   /// Form sequences of consecutive 64/128-bit registers for use in NEON
145   /// instructions making use of a vector-list (e.g. ldN, tbl). Vecs must have
146   /// between 1 and 4 elements. If it contains a single element that is returned
147   /// unchanged; otherwise a REG_SEQUENCE value is returned.
148   SDValue createDTuple(ArrayRef<SDValue> Vecs);
149   SDValue createQTuple(ArrayRef<SDValue> Vecs);
150 
151   /// Generic helper for the createDTuple/createQTuple
152   /// functions. Those should almost always be called instead.
153   SDValue createTuple(ArrayRef<SDValue> Vecs, const unsigned RegClassIDs[],
154                       const unsigned SubRegs[]);
155 
156   void SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc, bool isExt);
157 
158   bool tryIndexedLoad(SDNode *N);
159 
160   bool trySelectStackSlotTagP(SDNode *N);
161   void SelectTagP(SDNode *N);
162 
163   void SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
164                      unsigned SubRegIdx);
165   void SelectPostLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
166                          unsigned SubRegIdx);
167   void SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
168   void SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
169 
170   void SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc);
171   void SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc);
172   void SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);
173   void SelectPostStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);
174 
175   bool tryBitfieldExtractOp(SDNode *N);
176   bool tryBitfieldExtractOpFromSExt(SDNode *N);
177   bool tryBitfieldInsertOp(SDNode *N);
178   bool tryBitfieldInsertInZeroOp(SDNode *N);
179   bool tryShiftAmountMod(SDNode *N);
180 
181   bool tryReadRegister(SDNode *N);
182   bool tryWriteRegister(SDNode *N);
183 
184 // Include the pieces autogenerated from the target description.
185 #include "AArch64GenDAGISel.inc"
186 
187 private:
188   bool SelectShiftedRegister(SDValue N, bool AllowROR, SDValue &Reg,
189                              SDValue &Shift);
190   bool SelectAddrModeIndexed7S(SDValue N, unsigned Size, SDValue &Base,
191                                SDValue &OffImm) {
192     return SelectAddrModeIndexedBitWidth(N, true, 7, Size, Base, OffImm);
193   }
194   bool SelectAddrModeIndexedBitWidth(SDValue N, bool IsSignedImm, unsigned BW,
195                                      unsigned Size, SDValue &Base,
196                                      SDValue &OffImm);
197   bool SelectAddrModeIndexed(SDValue N, unsigned Size, SDValue &Base,
198                              SDValue &OffImm);
199   bool SelectAddrModeUnscaled(SDValue N, unsigned Size, SDValue &Base,
200                               SDValue &OffImm);
201   bool SelectAddrModeWRO(SDValue N, unsigned Size, SDValue &Base,
202                          SDValue &Offset, SDValue &SignExtend,
203                          SDValue &DoShift);
204   bool SelectAddrModeXRO(SDValue N, unsigned Size, SDValue &Base,
205                          SDValue &Offset, SDValue &SignExtend,
206                          SDValue &DoShift);
207   bool isWorthFolding(SDValue V) const;
208   bool SelectExtendedSHL(SDValue N, unsigned Size, bool WantExtend,
209                          SDValue &Offset, SDValue &SignExtend);
210 
211   template<unsigned RegWidth>
212   bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos) {
213     return SelectCVTFixedPosOperand(N, FixedPos, RegWidth);
214   }
215 
216   bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, unsigned Width);
217 
218   bool SelectCMP_SWAP(SDNode *N);
219 
220 };
221 } // end anonymous namespace
222 
223 /// isIntImmediate - This method tests to see if the node is a constant
224 /// operand. If so Imm will receive the 32-bit value.
225 static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
226   if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) {
227     Imm = C->getZExtValue();
228     return true;
229   }
230   return false;
231 }
232 
233 // isIntImmediate - This method tests to see if a constant operand.
234 // If so Imm will receive the value.
235 static bool isIntImmediate(SDValue N, uint64_t &Imm) {
236   return isIntImmediate(N.getNode(), Imm);
237 }
238 
239 // isOpcWithIntImmediate - This method tests to see if the node is a specific
240 // opcode and that it has a immediate integer right operand.
241 // If so Imm will receive the 32 bit value.
242 static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
243                                   uint64_t &Imm) {
244   return N->getOpcode() == Opc &&
245          isIntImmediate(N->getOperand(1).getNode(), Imm);
246 }
247 
248 bool AArch64DAGToDAGISel::SelectInlineAsmMemoryOperand(
249     const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) {
250   switch(ConstraintID) {
251   default:
252     llvm_unreachable("Unexpected asm memory constraint");
253   case InlineAsm::Constraint_i:
254   case InlineAsm::Constraint_m:
255   case InlineAsm::Constraint_Q:
256     // We need to make sure that this one operand does not end up in XZR, thus
257     // require the address to be in a PointerRegClass register.
258     const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
259     const TargetRegisterClass *TRC = TRI->getPointerRegClass(*MF);
260     SDLoc dl(Op);
261     SDValue RC = CurDAG->getTargetConstant(TRC->getID(), dl, MVT::i64);
262     SDValue NewOp =
263         SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
264                                        dl, Op.getValueType(),
265                                        Op, RC), 0);
266     OutOps.push_back(NewOp);
267     return false;
268   }
269   return true;
270 }
271 
272 /// SelectArithImmed - Select an immediate value that can be represented as
273 /// a 12-bit value shifted left by either 0 or 12.  If so, return true with
274 /// Val set to the 12-bit value and Shift set to the shifter operand.
275 bool AArch64DAGToDAGISel::SelectArithImmed(SDValue N, SDValue &Val,
276                                            SDValue &Shift) {
277   // This function is called from the addsub_shifted_imm ComplexPattern,
278   // which lists [imm] as the list of opcode it's interested in, however
279   // we still need to check whether the operand is actually an immediate
280   // here because the ComplexPattern opcode list is only used in
281   // root-level opcode matching.
282   if (!isa<ConstantSDNode>(N.getNode()))
283     return false;
284 
285   uint64_t Immed = cast<ConstantSDNode>(N.getNode())->getZExtValue();
286   unsigned ShiftAmt;
287 
288   if (Immed >> 12 == 0) {
289     ShiftAmt = 0;
290   } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) {
291     ShiftAmt = 12;
292     Immed = Immed >> 12;
293   } else
294     return false;
295 
296   unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt);
297   SDLoc dl(N);
298   Val = CurDAG->getTargetConstant(Immed, dl, MVT::i32);
299   Shift = CurDAG->getTargetConstant(ShVal, dl, MVT::i32);
300   return true;
301 }
302 
303 /// SelectNegArithImmed - As above, but negates the value before trying to
304 /// select it.
305 bool AArch64DAGToDAGISel::SelectNegArithImmed(SDValue N, SDValue &Val,
306                                               SDValue &Shift) {
307   // This function is called from the addsub_shifted_imm ComplexPattern,
308   // which lists [imm] as the list of opcode it's interested in, however
309   // we still need to check whether the operand is actually an immediate
310   // here because the ComplexPattern opcode list is only used in
311   // root-level opcode matching.
312   if (!isa<ConstantSDNode>(N.getNode()))
313     return false;
314 
315   // The immediate operand must be a 24-bit zero-extended immediate.
316   uint64_t Immed = cast<ConstantSDNode>(N.getNode())->getZExtValue();
317 
318   // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0"
319   // have the opposite effect on the C flag, so this pattern mustn't match under
320   // those circumstances.
321   if (Immed == 0)
322     return false;
323 
324   if (N.getValueType() == MVT::i32)
325     Immed = ~((uint32_t)Immed) + 1;
326   else
327     Immed = ~Immed + 1ULL;
328   if (Immed & 0xFFFFFFFFFF000000ULL)
329     return false;
330 
331   Immed &= 0xFFFFFFULL;
332   return SelectArithImmed(CurDAG->getConstant(Immed, SDLoc(N), MVT::i32), Val,
333                           Shift);
334 }
335 
336 /// getShiftTypeForNode - Translate a shift node to the corresponding
337 /// ShiftType value.
338 static AArch64_AM::ShiftExtendType getShiftTypeForNode(SDValue N) {
339   switch (N.getOpcode()) {
340   default:
341     return AArch64_AM::InvalidShiftExtend;
342   case ISD::SHL:
343     return AArch64_AM::LSL;
344   case ISD::SRL:
345     return AArch64_AM::LSR;
346   case ISD::SRA:
347     return AArch64_AM::ASR;
348   case ISD::ROTR:
349     return AArch64_AM::ROR;
350   }
351 }
352 
353 /// Determine whether it is worth it to fold SHL into the addressing
354 /// mode.
355 static bool isWorthFoldingSHL(SDValue V) {
356   assert(V.getOpcode() == ISD::SHL && "invalid opcode");
357   // It is worth folding logical shift of up to three places.
358   auto *CSD = dyn_cast<ConstantSDNode>(V.getOperand(1));
359   if (!CSD)
360     return false;
361   unsigned ShiftVal = CSD->getZExtValue();
362   if (ShiftVal > 3)
363     return false;
364 
365   // Check if this particular node is reused in any non-memory related
366   // operation.  If yes, do not try to fold this node into the address
367   // computation, since the computation will be kept.
368   const SDNode *Node = V.getNode();
369   for (SDNode *UI : Node->uses())
370     if (!isa<MemSDNode>(*UI))
371       for (SDNode *UII : UI->uses())
372         if (!isa<MemSDNode>(*UII))
373           return false;
374   return true;
375 }
376 
377 /// Determine whether it is worth to fold V into an extended register.
378 bool AArch64DAGToDAGISel::isWorthFolding(SDValue V) const {
379   // Trivial if we are optimizing for code size or if there is only
380   // one use of the value.
381   if (ForCodeSize || V.hasOneUse())
382     return true;
383   // If a subtarget has a fastpath LSL we can fold a logical shift into
384   // the addressing mode and save a cycle.
385   if (Subtarget->hasLSLFast() && V.getOpcode() == ISD::SHL &&
386       isWorthFoldingSHL(V))
387     return true;
388   if (Subtarget->hasLSLFast() && V.getOpcode() == ISD::ADD) {
389     const SDValue LHS = V.getOperand(0);
390     const SDValue RHS = V.getOperand(1);
391     if (LHS.getOpcode() == ISD::SHL && isWorthFoldingSHL(LHS))
392       return true;
393     if (RHS.getOpcode() == ISD::SHL && isWorthFoldingSHL(RHS))
394       return true;
395   }
396 
397   // It hurts otherwise, since the value will be reused.
398   return false;
399 }
400 
401 /// SelectShiftedRegister - Select a "shifted register" operand.  If the value
402 /// is not shifted, set the Shift operand to default of "LSL 0".  The logical
403 /// instructions allow the shifted register to be rotated, but the arithmetic
404 /// instructions do not.  The AllowROR parameter specifies whether ROR is
405 /// supported.
406 bool AArch64DAGToDAGISel::SelectShiftedRegister(SDValue N, bool AllowROR,
407                                                 SDValue &Reg, SDValue &Shift) {
408   AArch64_AM::ShiftExtendType ShType = getShiftTypeForNode(N);
409   if (ShType == AArch64_AM::InvalidShiftExtend)
410     return false;
411   if (!AllowROR && ShType == AArch64_AM::ROR)
412     return false;
413 
414   if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
415     unsigned BitSize = N.getValueSizeInBits();
416     unsigned Val = RHS->getZExtValue() & (BitSize - 1);
417     unsigned ShVal = AArch64_AM::getShifterImm(ShType, Val);
418 
419     Reg = N.getOperand(0);
420     Shift = CurDAG->getTargetConstant(ShVal, SDLoc(N), MVT::i32);
421     return isWorthFolding(N);
422   }
423 
424   return false;
425 }
426 
427 /// getExtendTypeForNode - Translate an extend node to the corresponding
428 /// ExtendType value.
429 static AArch64_AM::ShiftExtendType
430 getExtendTypeForNode(SDValue N, bool IsLoadStore = false) {
431   if (N.getOpcode() == ISD::SIGN_EXTEND ||
432       N.getOpcode() == ISD::SIGN_EXTEND_INREG) {
433     EVT SrcVT;
434     if (N.getOpcode() == ISD::SIGN_EXTEND_INREG)
435       SrcVT = cast<VTSDNode>(N.getOperand(1))->getVT();
436     else
437       SrcVT = N.getOperand(0).getValueType();
438 
439     if (!IsLoadStore && SrcVT == MVT::i8)
440       return AArch64_AM::SXTB;
441     else if (!IsLoadStore && SrcVT == MVT::i16)
442       return AArch64_AM::SXTH;
443     else if (SrcVT == MVT::i32)
444       return AArch64_AM::SXTW;
445     assert(SrcVT != MVT::i64 && "extend from 64-bits?");
446 
447     return AArch64_AM::InvalidShiftExtend;
448   } else if (N.getOpcode() == ISD::ZERO_EXTEND ||
449              N.getOpcode() == ISD::ANY_EXTEND) {
450     EVT SrcVT = N.getOperand(0).getValueType();
451     if (!IsLoadStore && SrcVT == MVT::i8)
452       return AArch64_AM::UXTB;
453     else if (!IsLoadStore && SrcVT == MVT::i16)
454       return AArch64_AM::UXTH;
455     else if (SrcVT == MVT::i32)
456       return AArch64_AM::UXTW;
457     assert(SrcVT != MVT::i64 && "extend from 64-bits?");
458 
459     return AArch64_AM::InvalidShiftExtend;
460   } else if (N.getOpcode() == ISD::AND) {
461     ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
462     if (!CSD)
463       return AArch64_AM::InvalidShiftExtend;
464     uint64_t AndMask = CSD->getZExtValue();
465 
466     switch (AndMask) {
467     default:
468       return AArch64_AM::InvalidShiftExtend;
469     case 0xFF:
470       return !IsLoadStore ? AArch64_AM::UXTB : AArch64_AM::InvalidShiftExtend;
471     case 0xFFFF:
472       return !IsLoadStore ? AArch64_AM::UXTH : AArch64_AM::InvalidShiftExtend;
473     case 0xFFFFFFFF:
474       return AArch64_AM::UXTW;
475     }
476   }
477 
478   return AArch64_AM::InvalidShiftExtend;
479 }
480 
481 // Helper for SelectMLAV64LaneV128 - Recognize high lane extracts.
482 static bool checkHighLaneIndex(SDNode *DL, SDValue &LaneOp, int &LaneIdx) {
483   if (DL->getOpcode() != AArch64ISD::DUPLANE16 &&
484       DL->getOpcode() != AArch64ISD::DUPLANE32)
485     return false;
486 
487   SDValue SV = DL->getOperand(0);
488   if (SV.getOpcode() != ISD::INSERT_SUBVECTOR)
489     return false;
490 
491   SDValue EV = SV.getOperand(1);
492   if (EV.getOpcode() != ISD::EXTRACT_SUBVECTOR)
493     return false;
494 
495   ConstantSDNode *DLidx = cast<ConstantSDNode>(DL->getOperand(1).getNode());
496   ConstantSDNode *EVidx = cast<ConstantSDNode>(EV.getOperand(1).getNode());
497   LaneIdx = DLidx->getSExtValue() + EVidx->getSExtValue();
498   LaneOp = EV.getOperand(0);
499 
500   return true;
501 }
502 
503 // Helper for SelectOpcV64LaneV128 - Recognize operations where one operand is a
504 // high lane extract.
505 static bool checkV64LaneV128(SDValue Op0, SDValue Op1, SDValue &StdOp,
506                              SDValue &LaneOp, int &LaneIdx) {
507 
508   if (!checkHighLaneIndex(Op0.getNode(), LaneOp, LaneIdx)) {
509     std::swap(Op0, Op1);
510     if (!checkHighLaneIndex(Op0.getNode(), LaneOp, LaneIdx))
511       return false;
512   }
513   StdOp = Op1;
514   return true;
515 }
516 
517 /// SelectMLAV64LaneV128 - AArch64 supports vector MLAs where one multiplicand
518 /// is a lane in the upper half of a 128-bit vector.  Recognize and select this
519 /// so that we don't emit unnecessary lane extracts.
520 bool AArch64DAGToDAGISel::tryMLAV64LaneV128(SDNode *N) {
521   SDLoc dl(N);
522   SDValue Op0 = N->getOperand(0);
523   SDValue Op1 = N->getOperand(1);
524   SDValue MLAOp1;   // Will hold ordinary multiplicand for MLA.
525   SDValue MLAOp2;   // Will hold lane-accessed multiplicand for MLA.
526   int LaneIdx = -1; // Will hold the lane index.
527 
528   if (Op1.getOpcode() != ISD::MUL ||
529       !checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2,
530                         LaneIdx)) {
531     std::swap(Op0, Op1);
532     if (Op1.getOpcode() != ISD::MUL ||
533         !checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2,
534                           LaneIdx))
535       return false;
536   }
537 
538   SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, dl, MVT::i64);
539 
540   SDValue Ops[] = { Op0, MLAOp1, MLAOp2, LaneIdxVal };
541 
542   unsigned MLAOpc = ~0U;
543 
544   switch (N->getSimpleValueType(0).SimpleTy) {
545   default:
546     llvm_unreachable("Unrecognized MLA.");
547   case MVT::v4i16:
548     MLAOpc = AArch64::MLAv4i16_indexed;
549     break;
550   case MVT::v8i16:
551     MLAOpc = AArch64::MLAv8i16_indexed;
552     break;
553   case MVT::v2i32:
554     MLAOpc = AArch64::MLAv2i32_indexed;
555     break;
556   case MVT::v4i32:
557     MLAOpc = AArch64::MLAv4i32_indexed;
558     break;
559   }
560 
561   ReplaceNode(N, CurDAG->getMachineNode(MLAOpc, dl, N->getValueType(0), Ops));
562   return true;
563 }
564 
565 bool AArch64DAGToDAGISel::tryMULLV64LaneV128(unsigned IntNo, SDNode *N) {
566   SDLoc dl(N);
567   SDValue SMULLOp0;
568   SDValue SMULLOp1;
569   int LaneIdx;
570 
571   if (!checkV64LaneV128(N->getOperand(1), N->getOperand(2), SMULLOp0, SMULLOp1,
572                         LaneIdx))
573     return false;
574 
575   SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, dl, MVT::i64);
576 
577   SDValue Ops[] = { SMULLOp0, SMULLOp1, LaneIdxVal };
578 
579   unsigned SMULLOpc = ~0U;
580 
581   if (IntNo == Intrinsic::aarch64_neon_smull) {
582     switch (N->getSimpleValueType(0).SimpleTy) {
583     default:
584       llvm_unreachable("Unrecognized SMULL.");
585     case MVT::v4i32:
586       SMULLOpc = AArch64::SMULLv4i16_indexed;
587       break;
588     case MVT::v2i64:
589       SMULLOpc = AArch64::SMULLv2i32_indexed;
590       break;
591     }
592   } else if (IntNo == Intrinsic::aarch64_neon_umull) {
593     switch (N->getSimpleValueType(0).SimpleTy) {
594     default:
595       llvm_unreachable("Unrecognized SMULL.");
596     case MVT::v4i32:
597       SMULLOpc = AArch64::UMULLv4i16_indexed;
598       break;
599     case MVT::v2i64:
600       SMULLOpc = AArch64::UMULLv2i32_indexed;
601       break;
602     }
603   } else
604     llvm_unreachable("Unrecognized intrinsic.");
605 
606   ReplaceNode(N, CurDAG->getMachineNode(SMULLOpc, dl, N->getValueType(0), Ops));
607   return true;
608 }
609 
610 /// Instructions that accept extend modifiers like UXTW expect the register
611 /// being extended to be a GPR32, but the incoming DAG might be acting on a
612 /// GPR64 (either via SEXT_INREG or AND). Extract the appropriate low bits if
613 /// this is the case.
614 static SDValue narrowIfNeeded(SelectionDAG *CurDAG, SDValue N) {
615   if (N.getValueType() == MVT::i32)
616     return N;
617 
618   SDLoc dl(N);
619   SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32);
620   MachineSDNode *Node = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
621                                                dl, MVT::i32, N, SubReg);
622   return SDValue(Node, 0);
623 }
624 
625 
626 /// SelectArithExtendedRegister - Select a "extended register" operand.  This
627 /// operand folds in an extend followed by an optional left shift.
628 bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg,
629                                                       SDValue &Shift) {
630   unsigned ShiftVal = 0;
631   AArch64_AM::ShiftExtendType Ext;
632 
633   if (N.getOpcode() == ISD::SHL) {
634     ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
635     if (!CSD)
636       return false;
637     ShiftVal = CSD->getZExtValue();
638     if (ShiftVal > 4)
639       return false;
640 
641     Ext = getExtendTypeForNode(N.getOperand(0));
642     if (Ext == AArch64_AM::InvalidShiftExtend)
643       return false;
644 
645     Reg = N.getOperand(0).getOperand(0);
646   } else {
647     Ext = getExtendTypeForNode(N);
648     if (Ext == AArch64_AM::InvalidShiftExtend)
649       return false;
650 
651     Reg = N.getOperand(0);
652 
653     // Don't match if free 32-bit -> 64-bit zext can be used instead.
654     if (Ext == AArch64_AM::UXTW &&
655         Reg->getValueType(0).getSizeInBits() == 32 && isDef32(*Reg.getNode()))
656       return false;
657   }
658 
659   // AArch64 mandates that the RHS of the operation must use the smallest
660   // register class that could contain the size being extended from.  Thus,
661   // if we're folding a (sext i8), we need the RHS to be a GPR32, even though
662   // there might not be an actual 32-bit value in the program.  We can
663   // (harmlessly) synthesize one by injected an EXTRACT_SUBREG here.
664   assert(Ext != AArch64_AM::UXTX && Ext != AArch64_AM::SXTX);
665   Reg = narrowIfNeeded(CurDAG, Reg);
666   Shift = CurDAG->getTargetConstant(getArithExtendImm(Ext, ShiftVal), SDLoc(N),
667                                     MVT::i32);
668   return isWorthFolding(N);
669 }
670 
671 /// If there's a use of this ADDlow that's not itself a load/store then we'll
672 /// need to create a real ADD instruction from it anyway and there's no point in
673 /// folding it into the mem op. Theoretically, it shouldn't matter, but there's
674 /// a single pseudo-instruction for an ADRP/ADD pair so over-aggressive folding
675 /// leads to duplicated ADRP instructions.
676 static bool isWorthFoldingADDlow(SDValue N) {
677   for (auto Use : N->uses()) {
678     if (Use->getOpcode() != ISD::LOAD && Use->getOpcode() != ISD::STORE &&
679         Use->getOpcode() != ISD::ATOMIC_LOAD &&
680         Use->getOpcode() != ISD::ATOMIC_STORE)
681       return false;
682 
683     // ldar and stlr have much more restrictive addressing modes (just a
684     // register).
685     if (isStrongerThanMonotonic(cast<MemSDNode>(Use)->getOrdering()))
686       return false;
687   }
688 
689   return true;
690 }
691 
692 /// SelectAddrModeIndexedBitWidth - Select a "register plus scaled (un)signed BW-bit
693 /// immediate" address.  The "Size" argument is the size in bytes of the memory
694 /// reference, which determines the scale.
695 bool AArch64DAGToDAGISel::SelectAddrModeIndexedBitWidth(SDValue N, bool IsSignedImm,
696                                                         unsigned BW, unsigned Size,
697                                                         SDValue &Base,
698                                                         SDValue &OffImm) {
699   SDLoc dl(N);
700   const DataLayout &DL = CurDAG->getDataLayout();
701   const TargetLowering *TLI = getTargetLowering();
702   if (N.getOpcode() == ISD::FrameIndex) {
703     int FI = cast<FrameIndexSDNode>(N)->getIndex();
704     Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
705     OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
706     return true;
707   }
708 
709   // As opposed to the (12-bit) Indexed addressing mode below, the 7/9-bit signed
710   // selected here doesn't support labels/immediates, only base+offset.
711   if (CurDAG->isBaseWithConstantOffset(N)) {
712     if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
713       if (IsSignedImm) {
714         int64_t RHSC = RHS->getSExtValue();
715         unsigned Scale = Log2_32(Size);
716         int64_t Range = 0x1LL << (BW - 1);
717 
718         if ((RHSC & (Size - 1)) == 0 && RHSC >= -(Range << Scale) &&
719             RHSC < (Range << Scale)) {
720           Base = N.getOperand(0);
721           if (Base.getOpcode() == ISD::FrameIndex) {
722             int FI = cast<FrameIndexSDNode>(Base)->getIndex();
723             Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
724           }
725           OffImm = CurDAG->getTargetConstant(RHSC >> Scale, dl, MVT::i64);
726           return true;
727         }
728       } else {
729         // unsigned Immediate
730         uint64_t RHSC = RHS->getZExtValue();
731         unsigned Scale = Log2_32(Size);
732         uint64_t Range = 0x1ULL << BW;
733 
734         if ((RHSC & (Size - 1)) == 0 && RHSC < (Range << Scale)) {
735           Base = N.getOperand(0);
736           if (Base.getOpcode() == ISD::FrameIndex) {
737             int FI = cast<FrameIndexSDNode>(Base)->getIndex();
738             Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
739           }
740           OffImm = CurDAG->getTargetConstant(RHSC >> Scale, dl, MVT::i64);
741           return true;
742         }
743       }
744     }
745   }
746   // Base only. The address will be materialized into a register before
747   // the memory is accessed.
748   //    add x0, Xbase, #offset
749   //    stp x1, x2, [x0]
750   Base = N;
751   OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
752   return true;
753 }
754 
755 /// SelectAddrModeIndexed - Select a "register plus scaled unsigned 12-bit
756 /// immediate" address.  The "Size" argument is the size in bytes of the memory
757 /// reference, which determines the scale.
758 bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size,
759                                               SDValue &Base, SDValue &OffImm) {
760   SDLoc dl(N);
761   const DataLayout &DL = CurDAG->getDataLayout();
762   const TargetLowering *TLI = getTargetLowering();
763   if (N.getOpcode() == ISD::FrameIndex) {
764     int FI = cast<FrameIndexSDNode>(N)->getIndex();
765     Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
766     OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
767     return true;
768   }
769 
770   if (N.getOpcode() == AArch64ISD::ADDlow && isWorthFoldingADDlow(N)) {
771     GlobalAddressSDNode *GAN =
772         dyn_cast<GlobalAddressSDNode>(N.getOperand(1).getNode());
773     Base = N.getOperand(0);
774     OffImm = N.getOperand(1);
775     if (!GAN)
776       return true;
777 
778     if (GAN->getOffset() % Size == 0) {
779       const GlobalValue *GV = GAN->getGlobal();
780       unsigned Alignment = GV->getAlignment();
781       Type *Ty = GV->getValueType();
782       if (Alignment == 0 && Ty->isSized())
783         Alignment = DL.getABITypeAlignment(Ty);
784 
785       if (Alignment >= Size)
786         return true;
787     }
788   }
789 
790   if (CurDAG->isBaseWithConstantOffset(N)) {
791     if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
792       int64_t RHSC = (int64_t)RHS->getZExtValue();
793       unsigned Scale = Log2_32(Size);
794       if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) {
795         Base = N.getOperand(0);
796         if (Base.getOpcode() == ISD::FrameIndex) {
797           int FI = cast<FrameIndexSDNode>(Base)->getIndex();
798           Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
799         }
800         OffImm = CurDAG->getTargetConstant(RHSC >> Scale, dl, MVT::i64);
801         return true;
802       }
803     }
804   }
805 
806   // Before falling back to our general case, check if the unscaled
807   // instructions can handle this. If so, that's preferable.
808   if (SelectAddrModeUnscaled(N, Size, Base, OffImm))
809     return false;
810 
811   // Base only. The address will be materialized into a register before
812   // the memory is accessed.
813   //    add x0, Xbase, #offset
814   //    ldr x0, [x0]
815   Base = N;
816   OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
817   return true;
818 }
819 
820 /// SelectAddrModeUnscaled - Select a "register plus unscaled signed 9-bit
821 /// immediate" address.  This should only match when there is an offset that
822 /// is not valid for a scaled immediate addressing mode.  The "Size" argument
823 /// is the size in bytes of the memory reference, which is needed here to know
824 /// what is valid for a scaled immediate.
825 bool AArch64DAGToDAGISel::SelectAddrModeUnscaled(SDValue N, unsigned Size,
826                                                  SDValue &Base,
827                                                  SDValue &OffImm) {
828   if (!CurDAG->isBaseWithConstantOffset(N))
829     return false;
830   if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
831     int64_t RHSC = RHS->getSExtValue();
832     // If the offset is valid as a scaled immediate, don't match here.
833     if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 &&
834         RHSC < (0x1000 << Log2_32(Size)))
835       return false;
836     if (RHSC >= -256 && RHSC < 256) {
837       Base = N.getOperand(0);
838       if (Base.getOpcode() == ISD::FrameIndex) {
839         int FI = cast<FrameIndexSDNode>(Base)->getIndex();
840         const TargetLowering *TLI = getTargetLowering();
841         Base = CurDAG->getTargetFrameIndex(
842             FI, TLI->getPointerTy(CurDAG->getDataLayout()));
843       }
844       OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i64);
845       return true;
846     }
847   }
848   return false;
849 }
850 
851 static SDValue Widen(SelectionDAG *CurDAG, SDValue N) {
852   SDLoc dl(N);
853   SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32);
854   SDValue ImpDef = SDValue(
855       CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, MVT::i64), 0);
856   MachineSDNode *Node = CurDAG->getMachineNode(
857       TargetOpcode::INSERT_SUBREG, dl, MVT::i64, ImpDef, N, SubReg);
858   return SDValue(Node, 0);
859 }
860 
861 /// Check if the given SHL node (\p N), can be used to form an
862 /// extended register for an addressing mode.
863 bool AArch64DAGToDAGISel::SelectExtendedSHL(SDValue N, unsigned Size,
864                                             bool WantExtend, SDValue &Offset,
865                                             SDValue &SignExtend) {
866   assert(N.getOpcode() == ISD::SHL && "Invalid opcode.");
867   ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
868   if (!CSD || (CSD->getZExtValue() & 0x7) != CSD->getZExtValue())
869     return false;
870 
871   SDLoc dl(N);
872   if (WantExtend) {
873     AArch64_AM::ShiftExtendType Ext =
874         getExtendTypeForNode(N.getOperand(0), true);
875     if (Ext == AArch64_AM::InvalidShiftExtend)
876       return false;
877 
878     Offset = narrowIfNeeded(CurDAG, N.getOperand(0).getOperand(0));
879     SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl,
880                                            MVT::i32);
881   } else {
882     Offset = N.getOperand(0);
883     SignExtend = CurDAG->getTargetConstant(0, dl, MVT::i32);
884   }
885 
886   unsigned LegalShiftVal = Log2_32(Size);
887   unsigned ShiftVal = CSD->getZExtValue();
888 
889   if (ShiftVal != 0 && ShiftVal != LegalShiftVal)
890     return false;
891 
892   return isWorthFolding(N);
893 }
894 
895 bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
896                                             SDValue &Base, SDValue &Offset,
897                                             SDValue &SignExtend,
898                                             SDValue &DoShift) {
899   if (N.getOpcode() != ISD::ADD)
900     return false;
901   SDValue LHS = N.getOperand(0);
902   SDValue RHS = N.getOperand(1);
903   SDLoc dl(N);
904 
905   // We don't want to match immediate adds here, because they are better lowered
906   // to the register-immediate addressing modes.
907   if (isa<ConstantSDNode>(LHS) || isa<ConstantSDNode>(RHS))
908     return false;
909 
910   // Check if this particular node is reused in any non-memory related
911   // operation.  If yes, do not try to fold this node into the address
912   // computation, since the computation will be kept.
913   const SDNode *Node = N.getNode();
914   for (SDNode *UI : Node->uses()) {
915     if (!isa<MemSDNode>(*UI))
916       return false;
917   }
918 
919   // Remember if it is worth folding N when it produces extended register.
920   bool IsExtendedRegisterWorthFolding = isWorthFolding(N);
921 
922   // Try to match a shifted extend on the RHS.
923   if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL &&
924       SelectExtendedSHL(RHS, Size, true, Offset, SignExtend)) {
925     Base = LHS;
926     DoShift = CurDAG->getTargetConstant(true, dl, MVT::i32);
927     return true;
928   }
929 
930   // Try to match a shifted extend on the LHS.
931   if (IsExtendedRegisterWorthFolding && LHS.getOpcode() == ISD::SHL &&
932       SelectExtendedSHL(LHS, Size, true, Offset, SignExtend)) {
933     Base = RHS;
934     DoShift = CurDAG->getTargetConstant(true, dl, MVT::i32);
935     return true;
936   }
937 
938   // There was no shift, whatever else we find.
939   DoShift = CurDAG->getTargetConstant(false, dl, MVT::i32);
940 
941   AArch64_AM::ShiftExtendType Ext = AArch64_AM::InvalidShiftExtend;
942   // Try to match an unshifted extend on the LHS.
943   if (IsExtendedRegisterWorthFolding &&
944       (Ext = getExtendTypeForNode(LHS, true)) !=
945           AArch64_AM::InvalidShiftExtend) {
946     Base = RHS;
947     Offset = narrowIfNeeded(CurDAG, LHS.getOperand(0));
948     SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl,
949                                            MVT::i32);
950     if (isWorthFolding(LHS))
951       return true;
952   }
953 
954   // Try to match an unshifted extend on the RHS.
955   if (IsExtendedRegisterWorthFolding &&
956       (Ext = getExtendTypeForNode(RHS, true)) !=
957           AArch64_AM::InvalidShiftExtend) {
958     Base = LHS;
959     Offset = narrowIfNeeded(CurDAG, RHS.getOperand(0));
960     SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl,
961                                            MVT::i32);
962     if (isWorthFolding(RHS))
963       return true;
964   }
965 
966   return false;
967 }
968 
// Report whether ADD is the preferred way to apply the immediate ImmOff:
// either it fits a plain ADD immediate, or it fits an "ADD LSL #12" immediate
// and could not be produced by one MOVZ instead.
static bool isPreferredADD(int64_t ImmOff) {
  const uint64_t Bits = static_cast<uint64_t>(ImmOff);

  // Values in [0x0, 0xfff] are directly encodable in ADD.
  if ((Bits >> 12) == 0)
    return true;

  // "ADD LSL #12" requires every set bit to lie in bits [12, 23].
  if ((Bits & ~0xfff000ULL) != 0)
    return false;

  // A single MOVZ is faster than an "ADD LSL #12", so constants confined to
  // bits [12, 15] or to bits [16, 23] are left to MOVZ; only constants that
  // straddle both ranges prefer the ADD.
  const bool HasBits12To15 = (Bits & 0xf000ULL) != 0;
  const bool HasBits16To23 = (Bits & 0xff0000ULL) != 0;
  return HasBits12To15 && HasBits16To23;
}
983 
984 bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size,
985                                             SDValue &Base, SDValue &Offset,
986                                             SDValue &SignExtend,
987                                             SDValue &DoShift) {
988   if (N.getOpcode() != ISD::ADD)
989     return false;
990   SDValue LHS = N.getOperand(0);
991   SDValue RHS = N.getOperand(1);
992   SDLoc DL(N);
993 
994   // Check if this particular node is reused in any non-memory related
995   // operation.  If yes, do not try to fold this node into the address
996   // computation, since the computation will be kept.
997   const SDNode *Node = N.getNode();
998   for (SDNode *UI : Node->uses()) {
999     if (!isa<MemSDNode>(*UI))
1000       return false;
1001   }
1002 
1003   // Watch out if RHS is a wide immediate, it can not be selected into
1004   // [BaseReg+Imm] addressing mode. Also it may not be able to be encoded into
1005   // ADD/SUB. Instead it will use [BaseReg + 0] address mode and generate
1006   // instructions like:
1007   //     MOV  X0, WideImmediate
1008   //     ADD  X1, BaseReg, X0
1009   //     LDR  X2, [X1, 0]
1010   // For such situation, using [BaseReg, XReg] addressing mode can save one
1011   // ADD/SUB:
1012   //     MOV  X0, WideImmediate
1013   //     LDR  X2, [BaseReg, X0]
1014   if (isa<ConstantSDNode>(RHS)) {
1015     int64_t ImmOff = (int64_t)cast<ConstantSDNode>(RHS)->getZExtValue();
1016     unsigned Scale = Log2_32(Size);
1017     // Skip the immediate can be selected by load/store addressing mode.
1018     // Also skip the immediate can be encoded by a single ADD (SUB is also
1019     // checked by using -ImmOff).
1020     if ((ImmOff % Size == 0 && ImmOff >= 0 && ImmOff < (0x1000 << Scale)) ||
1021         isPreferredADD(ImmOff) || isPreferredADD(-ImmOff))
1022       return false;
1023 
1024     SDValue Ops[] = { RHS };
1025     SDNode *MOVI =
1026         CurDAG->getMachineNode(AArch64::MOVi64imm, DL, MVT::i64, Ops);
1027     SDValue MOVIV = SDValue(MOVI, 0);
1028     // This ADD of two X register will be selected into [Reg+Reg] mode.
1029     N = CurDAG->getNode(ISD::ADD, DL, MVT::i64, LHS, MOVIV);
1030   }
1031 
1032   // Remember if it is worth folding N when it produces extended register.
1033   bool IsExtendedRegisterWorthFolding = isWorthFolding(N);
1034 
1035   // Try to match a shifted extend on the RHS.
1036   if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL &&
1037       SelectExtendedSHL(RHS, Size, false, Offset, SignExtend)) {
1038     Base = LHS;
1039     DoShift = CurDAG->getTargetConstant(true, DL, MVT::i32);
1040     return true;
1041   }
1042 
1043   // Try to match a shifted extend on the LHS.
1044   if (IsExtendedRegisterWorthFolding && LHS.getOpcode() == ISD::SHL &&
1045       SelectExtendedSHL(LHS, Size, false, Offset, SignExtend)) {
1046     Base = RHS;
1047     DoShift = CurDAG->getTargetConstant(true, DL, MVT::i32);
1048     return true;
1049   }
1050 
1051   // Match any non-shifted, non-extend, non-immediate add expression.
1052   Base = LHS;
1053   Offset = RHS;
1054   SignExtend = CurDAG->getTargetConstant(false, DL, MVT::i32);
1055   DoShift = CurDAG->getTargetConstant(false, DL, MVT::i32);
1056   // Reg1 + Reg2 is free: no check needed.
1057   return true;
1058 }
1059 
1060 SDValue AArch64DAGToDAGISel::createDTuple(ArrayRef<SDValue> Regs) {
1061   static const unsigned RegClassIDs[] = {
1062       AArch64::DDRegClassID, AArch64::DDDRegClassID, AArch64::DDDDRegClassID};
1063   static const unsigned SubRegs[] = {AArch64::dsub0, AArch64::dsub1,
1064                                      AArch64::dsub2, AArch64::dsub3};
1065 
1066   return createTuple(Regs, RegClassIDs, SubRegs);
1067 }
1068 
1069 SDValue AArch64DAGToDAGISel::createQTuple(ArrayRef<SDValue> Regs) {
1070   static const unsigned RegClassIDs[] = {
1071       AArch64::QQRegClassID, AArch64::QQQRegClassID, AArch64::QQQQRegClassID};
1072   static const unsigned SubRegs[] = {AArch64::qsub0, AArch64::qsub1,
1073                                      AArch64::qsub2, AArch64::qsub3};
1074 
1075   return createTuple(Regs, RegClassIDs, SubRegs);
1076 }
1077 
1078 SDValue AArch64DAGToDAGISel::createTuple(ArrayRef<SDValue> Regs,
1079                                          const unsigned RegClassIDs[],
1080                                          const unsigned SubRegs[]) {
1081   // There's no special register-class for a vector-list of 1 element: it's just
1082   // a vector.
1083   if (Regs.size() == 1)
1084     return Regs[0];
1085 
1086   assert(Regs.size() >= 2 && Regs.size() <= 4);
1087 
1088   SDLoc DL(Regs[0]);
1089 
1090   SmallVector<SDValue, 4> Ops;
1091 
1092   // First operand of REG_SEQUENCE is the desired RegClass.
1093   Ops.push_back(
1094       CurDAG->getTargetConstant(RegClassIDs[Regs.size() - 2], DL, MVT::i32));
1095 
1096   // Then we get pairs of source & subregister-position for the components.
1097   for (unsigned i = 0; i < Regs.size(); ++i) {
1098     Ops.push_back(Regs[i]);
1099     Ops.push_back(CurDAG->getTargetConstant(SubRegs[i], DL, MVT::i32));
1100   }
1101 
1102   SDNode *N =
1103       CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::Untyped, Ops);
1104   return SDValue(N, 0);
1105 }
1106 
1107 void AArch64DAGToDAGISel::SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc,
1108                                       bool isExt) {
1109   SDLoc dl(N);
1110   EVT VT = N->getValueType(0);
1111 
1112   unsigned ExtOff = isExt;
1113 
1114   // Form a REG_SEQUENCE to force register allocation.
1115   unsigned Vec0Off = ExtOff + 1;
1116   SmallVector<SDValue, 4> Regs(N->op_begin() + Vec0Off,
1117                                N->op_begin() + Vec0Off + NumVecs);
1118   SDValue RegSeq = createQTuple(Regs);
1119 
1120   SmallVector<SDValue, 6> Ops;
1121   if (isExt)
1122     Ops.push_back(N->getOperand(1));
1123   Ops.push_back(RegSeq);
1124   Ops.push_back(N->getOperand(NumVecs + ExtOff + 1));
1125   ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, Ops));
1126 }
1127 
1128 bool AArch64DAGToDAGISel::tryIndexedLoad(SDNode *N) {
1129   LoadSDNode *LD = cast<LoadSDNode>(N);
1130   if (LD->isUnindexed())
1131     return false;
1132   EVT VT = LD->getMemoryVT();
1133   EVT DstVT = N->getValueType(0);
1134   ISD::MemIndexedMode AM = LD->getAddressingMode();
1135   bool IsPre = AM == ISD::PRE_INC || AM == ISD::PRE_DEC;
1136 
1137   // We're not doing validity checking here. That was done when checking
1138   // if we should mark the load as indexed or not. We're just selecting
1139   // the right instruction.
1140   unsigned Opcode = 0;
1141 
1142   ISD::LoadExtType ExtType = LD->getExtensionType();
1143   bool InsertTo64 = false;
1144   if (VT == MVT::i64)
1145     Opcode = IsPre ? AArch64::LDRXpre : AArch64::LDRXpost;
1146   else if (VT == MVT::i32) {
1147     if (ExtType == ISD::NON_EXTLOAD)
1148       Opcode = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost;
1149     else if (ExtType == ISD::SEXTLOAD)
1150       Opcode = IsPre ? AArch64::LDRSWpre : AArch64::LDRSWpost;
1151     else {
1152       Opcode = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost;
1153       InsertTo64 = true;
1154       // The result of the load is only i32. It's the subreg_to_reg that makes
1155       // it into an i64.
1156       DstVT = MVT::i32;
1157     }
1158   } else if (VT == MVT::i16) {
1159     if (ExtType == ISD::SEXTLOAD) {
1160       if (DstVT == MVT::i64)
1161         Opcode = IsPre ? AArch64::LDRSHXpre : AArch64::LDRSHXpost;
1162       else
1163         Opcode = IsPre ? AArch64::LDRSHWpre : AArch64::LDRSHWpost;
1164     } else {
1165       Opcode = IsPre ? AArch64::LDRHHpre : AArch64::LDRHHpost;
1166       InsertTo64 = DstVT == MVT::i64;
1167       // The result of the load is only i32. It's the subreg_to_reg that makes
1168       // it into an i64.
1169       DstVT = MVT::i32;
1170     }
1171   } else if (VT == MVT::i8) {
1172     if (ExtType == ISD::SEXTLOAD) {
1173       if (DstVT == MVT::i64)
1174         Opcode = IsPre ? AArch64::LDRSBXpre : AArch64::LDRSBXpost;
1175       else
1176         Opcode = IsPre ? AArch64::LDRSBWpre : AArch64::LDRSBWpost;
1177     } else {
1178       Opcode = IsPre ? AArch64::LDRBBpre : AArch64::LDRBBpost;
1179       InsertTo64 = DstVT == MVT::i64;
1180       // The result of the load is only i32. It's the subreg_to_reg that makes
1181       // it into an i64.
1182       DstVT = MVT::i32;
1183     }
1184   } else if (VT == MVT::f16) {
1185     Opcode = IsPre ? AArch64::LDRHpre : AArch64::LDRHpost;
1186   } else if (VT == MVT::f32) {
1187     Opcode = IsPre ? AArch64::LDRSpre : AArch64::LDRSpost;
1188   } else if (VT == MVT::f64 || VT.is64BitVector()) {
1189     Opcode = IsPre ? AArch64::LDRDpre : AArch64::LDRDpost;
1190   } else if (VT.is128BitVector()) {
1191     Opcode = IsPre ? AArch64::LDRQpre : AArch64::LDRQpost;
1192   } else
1193     return false;
1194   SDValue Chain = LD->getChain();
1195   SDValue Base = LD->getBasePtr();
1196   ConstantSDNode *OffsetOp = cast<ConstantSDNode>(LD->getOffset());
1197   int OffsetVal = (int)OffsetOp->getZExtValue();
1198   SDLoc dl(N);
1199   SDValue Offset = CurDAG->getTargetConstant(OffsetVal, dl, MVT::i64);
1200   SDValue Ops[] = { Base, Offset, Chain };
1201   SDNode *Res = CurDAG->getMachineNode(Opcode, dl, MVT::i64, DstVT,
1202                                        MVT::Other, Ops);
1203   // Either way, we're replacing the node, so tell the caller that.
1204   SDValue LoadedVal = SDValue(Res, 1);
1205   if (InsertTo64) {
1206     SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32);
1207     LoadedVal =
1208         SDValue(CurDAG->getMachineNode(
1209                     AArch64::SUBREG_TO_REG, dl, MVT::i64,
1210                     CurDAG->getTargetConstant(0, dl, MVT::i64), LoadedVal,
1211                     SubReg),
1212                 0);
1213   }
1214 
1215   ReplaceUses(SDValue(N, 0), LoadedVal);
1216   ReplaceUses(SDValue(N, 1), SDValue(Res, 0));
1217   ReplaceUses(SDValue(N, 2), SDValue(Res, 2));
1218   CurDAG->RemoveDeadNode(N);
1219   return true;
1220 }
1221 
1222 void AArch64DAGToDAGISel::SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
1223                                      unsigned SubRegIdx) {
1224   SDLoc dl(N);
1225   EVT VT = N->getValueType(0);
1226   SDValue Chain = N->getOperand(0);
1227 
1228   SDValue Ops[] = {N->getOperand(2), // Mem operand;
1229                    Chain};
1230 
1231   const EVT ResTys[] = {MVT::Untyped, MVT::Other};
1232 
1233   SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
1234   SDValue SuperReg = SDValue(Ld, 0);
1235   for (unsigned i = 0; i < NumVecs; ++i)
1236     ReplaceUses(SDValue(N, i),
1237         CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg));
1238 
1239   ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1));
1240 
1241   // Transfer memoperands.
1242   MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand();
1243   CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ld), {MemOp});
1244 
1245   CurDAG->RemoveDeadNode(N);
1246 }
1247 
1248 void AArch64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs,
1249                                          unsigned Opc, unsigned SubRegIdx) {
1250   SDLoc dl(N);
1251   EVT VT = N->getValueType(0);
1252   SDValue Chain = N->getOperand(0);
1253 
1254   SDValue Ops[] = {N->getOperand(1), // Mem operand
1255                    N->getOperand(2), // Incremental
1256                    Chain};
1257 
1258   const EVT ResTys[] = {MVT::i64, // Type of the write back register
1259                         MVT::Untyped, MVT::Other};
1260 
1261   SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
1262 
1263   // Update uses of write back register
1264   ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 0));
1265 
1266   // Update uses of vector list
1267   SDValue SuperReg = SDValue(Ld, 1);
1268   if (NumVecs == 1)
1269     ReplaceUses(SDValue(N, 0), SuperReg);
1270   else
1271     for (unsigned i = 0; i < NumVecs; ++i)
1272       ReplaceUses(SDValue(N, i),
1273           CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg));
1274 
1275   // Update the chain
1276   ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2));
1277   CurDAG->RemoveDeadNode(N);
1278 }
1279 
1280 void AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs,
1281                                       unsigned Opc) {
1282   SDLoc dl(N);
1283   EVT VT = N->getOperand(2)->getValueType(0);
1284 
1285   // Form a REG_SEQUENCE to force register allocation.
1286   bool Is128Bit = VT.getSizeInBits() == 128;
1287   SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
1288   SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs);
1289 
1290   SDValue Ops[] = {RegSeq, N->getOperand(NumVecs + 2), N->getOperand(0)};
1291   SDNode *St = CurDAG->getMachineNode(Opc, dl, N->getValueType(0), Ops);
1292 
1293   // Transfer memoperands.
1294   MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand();
1295   CurDAG->setNodeMemRefs(cast<MachineSDNode>(St), {MemOp});
1296 
1297   ReplaceNode(N, St);
1298 }
1299 
1300 void AArch64DAGToDAGISel::SelectPostStore(SDNode *N, unsigned NumVecs,
1301                                           unsigned Opc) {
1302   SDLoc dl(N);
1303   EVT VT = N->getOperand(2)->getValueType(0);
1304   const EVT ResTys[] = {MVT::i64,    // Type of the write back register
1305                         MVT::Other}; // Type for the Chain
1306 
1307   // Form a REG_SEQUENCE to force register allocation.
1308   bool Is128Bit = VT.getSizeInBits() == 128;
1309   SmallVector<SDValue, 4> Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs);
1310   SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs);
1311 
1312   SDValue Ops[] = {RegSeq,
1313                    N->getOperand(NumVecs + 1), // base register
1314                    N->getOperand(NumVecs + 2), // Incremental
1315                    N->getOperand(0)};          // Chain
1316   SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
1317 
1318   ReplaceNode(N, St);
1319 }
1320 
1321 namespace {
1322 /// WidenVector - Given a value in the V64 register class, produce the
1323 /// equivalent value in the V128 register class.
1324 class WidenVector {
1325   SelectionDAG &DAG;
1326 
1327 public:
1328   WidenVector(SelectionDAG &DAG) : DAG(DAG) {}
1329 
1330   SDValue operator()(SDValue V64Reg) {
1331     EVT VT = V64Reg.getValueType();
1332     unsigned NarrowSize = VT.getVectorNumElements();
1333     MVT EltTy = VT.getVectorElementType().getSimpleVT();
1334     MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
1335     SDLoc DL(V64Reg);
1336 
1337     SDValue Undef =
1338         SDValue(DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, WideTy), 0);
1339     return DAG.getTargetInsertSubreg(AArch64::dsub, DL, WideTy, Undef, V64Reg);
1340   }
1341 };
1342 } // namespace
1343 
1344 /// NarrowVector - Given a value in the V128 register class, produce the
1345 /// equivalent value in the V64 register class.
1346 static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
1347   EVT VT = V128Reg.getValueType();
1348   unsigned WideSize = VT.getVectorNumElements();
1349   MVT EltTy = VT.getVectorElementType().getSimpleVT();
1350   MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2);
1351 
1352   return DAG.getTargetExtractSubreg(AArch64::dsub, SDLoc(V128Reg), NarrowTy,
1353                                     V128Reg);
1354 }
1355 
1356 void AArch64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs,
1357                                          unsigned Opc) {
1358   SDLoc dl(N);
1359   EVT VT = N->getValueType(0);
1360   bool Narrow = VT.getSizeInBits() == 64;
1361 
1362   // Form a REG_SEQUENCE to force register allocation.
1363   SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
1364 
1365   if (Narrow)
1366     transform(Regs, Regs.begin(),
1367                    WidenVector(*CurDAG));
1368 
1369   SDValue RegSeq = createQTuple(Regs);
1370 
1371   const EVT ResTys[] = {MVT::Untyped, MVT::Other};
1372 
1373   unsigned LaneNo =
1374       cast<ConstantSDNode>(N->getOperand(NumVecs + 2))->getZExtValue();
1375 
1376   SDValue Ops[] = {RegSeq, CurDAG->getTargetConstant(LaneNo, dl, MVT::i64),
1377                    N->getOperand(NumVecs + 3), N->getOperand(0)};
1378   SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
1379   SDValue SuperReg = SDValue(Ld, 0);
1380 
1381   EVT WideVT = RegSeq.getOperand(1)->getValueType(0);
1382   static const unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1,
1383                                     AArch64::qsub2, AArch64::qsub3 };
1384   for (unsigned i = 0; i < NumVecs; ++i) {
1385     SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT, SuperReg);
1386     if (Narrow)
1387       NV = NarrowVector(NV, *CurDAG);
1388     ReplaceUses(SDValue(N, i), NV);
1389   }
1390 
1391   ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1));
1392   CurDAG->RemoveDeadNode(N);
1393 }
1394 
1395 void AArch64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs,
1396                                              unsigned Opc) {
1397   SDLoc dl(N);
1398   EVT VT = N->getValueType(0);
1399   bool Narrow = VT.getSizeInBits() == 64;
1400 
1401   // Form a REG_SEQUENCE to force register allocation.
1402   SmallVector<SDValue, 4> Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs);
1403 
1404   if (Narrow)
1405     transform(Regs, Regs.begin(),
1406                    WidenVector(*CurDAG));
1407 
1408   SDValue RegSeq = createQTuple(Regs);
1409 
1410   const EVT ResTys[] = {MVT::i64, // Type of the write back register
1411                         RegSeq->getValueType(0), MVT::Other};
1412 
1413   unsigned LaneNo =
1414       cast<ConstantSDNode>(N->getOperand(NumVecs + 1))->getZExtValue();
1415 
1416   SDValue Ops[] = {RegSeq,
1417                    CurDAG->getTargetConstant(LaneNo, dl,
1418                                              MVT::i64),         // Lane Number
1419                    N->getOperand(NumVecs + 2),                  // Base register
1420                    N->getOperand(NumVecs + 3),                  // Incremental
1421                    N->getOperand(0)};
1422   SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
1423 
1424   // Update uses of the write back register
1425   ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 0));
1426 
1427   // Update uses of the vector list
1428   SDValue SuperReg = SDValue(Ld, 1);
1429   if (NumVecs == 1) {
1430     ReplaceUses(SDValue(N, 0),
1431                 Narrow ? NarrowVector(SuperReg, *CurDAG) : SuperReg);
1432   } else {
1433     EVT WideVT = RegSeq.getOperand(1)->getValueType(0);
1434     static const unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1,
1435                                       AArch64::qsub2, AArch64::qsub3 };
1436     for (unsigned i = 0; i < NumVecs; ++i) {
1437       SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT,
1438                                                   SuperReg);
1439       if (Narrow)
1440         NV = NarrowVector(NV, *CurDAG);
1441       ReplaceUses(SDValue(N, i), NV);
1442     }
1443   }
1444 
1445   // Update the Chain
1446   ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2));
1447   CurDAG->RemoveDeadNode(N);
1448 }
1449 
1450 void AArch64DAGToDAGISel::SelectStoreLane(SDNode *N, unsigned NumVecs,
1451                                           unsigned Opc) {
1452   SDLoc dl(N);
1453   EVT VT = N->getOperand(2)->getValueType(0);
1454   bool Narrow = VT.getSizeInBits() == 64;
1455 
1456   // Form a REG_SEQUENCE to force register allocation.
1457   SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
1458 
1459   if (Narrow)
1460     transform(Regs, Regs.begin(),
1461                    WidenVector(*CurDAG));
1462 
1463   SDValue RegSeq = createQTuple(Regs);
1464 
1465   unsigned LaneNo =
1466       cast<ConstantSDNode>(N->getOperand(NumVecs + 2))->getZExtValue();
1467 
1468   SDValue Ops[] = {RegSeq, CurDAG->getTargetConstant(LaneNo, dl, MVT::i64),
1469                    N->getOperand(NumVecs + 3), N->getOperand(0)};
1470   SDNode *St = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
1471 
1472   // Transfer memoperands.
1473   MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand();
1474   CurDAG->setNodeMemRefs(cast<MachineSDNode>(St), {MemOp});
1475 
1476   ReplaceNode(N, St);
1477 }
1478 
1479 void AArch64DAGToDAGISel::SelectPostStoreLane(SDNode *N, unsigned NumVecs,
1480                                               unsigned Opc) {
1481   SDLoc dl(N);
1482   EVT VT = N->getOperand(2)->getValueType(0);
1483   bool Narrow = VT.getSizeInBits() == 64;
1484 
1485   // Form a REG_SEQUENCE to force register allocation.
1486   SmallVector<SDValue, 4> Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs);
1487 
1488   if (Narrow)
1489     transform(Regs, Regs.begin(),
1490                    WidenVector(*CurDAG));
1491 
1492   SDValue RegSeq = createQTuple(Regs);
1493 
1494   const EVT ResTys[] = {MVT::i64, // Type of the write back register
1495                         MVT::Other};
1496 
1497   unsigned LaneNo =
1498       cast<ConstantSDNode>(N->getOperand(NumVecs + 1))->getZExtValue();
1499 
1500   SDValue Ops[] = {RegSeq, CurDAG->getTargetConstant(LaneNo, dl, MVT::i64),
1501                    N->getOperand(NumVecs + 2), // Base Register
1502                    N->getOperand(NumVecs + 3), // Incremental
1503                    N->getOperand(0)};
1504   SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
1505 
1506   // Transfer memoperands.
1507   MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand();
1508   CurDAG->setNodeMemRefs(cast<MachineSDNode>(St), {MemOp});
1509 
1510   ReplaceNode(N, St);
1511 }
1512 
1513 static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N,
1514                                        unsigned &Opc, SDValue &Opd0,
1515                                        unsigned &LSB, unsigned &MSB,
1516                                        unsigned NumberOfIgnoredLowBits,
1517                                        bool BiggerPattern) {
1518   assert(N->getOpcode() == ISD::AND &&
1519          "N must be a AND operation to call this function");
1520 
1521   EVT VT = N->getValueType(0);
1522 
1523   // Here we can test the type of VT and return false when the type does not
1524   // match, but since it is done prior to that call in the current context
1525   // we turned that into an assert to avoid redundant code.
1526   assert((VT == MVT::i32 || VT == MVT::i64) &&
1527          "Type checking must have been done before calling this function");
1528 
1529   // FIXME: simplify-demanded-bits in DAGCombine will probably have
1530   // changed the AND node to a 32-bit mask operation. We'll have to
1531   // undo that as part of the transform here if we want to catch all
1532   // the opportunities.
1533   // Currently the NumberOfIgnoredLowBits argument helps to recover
1534   // form these situations when matching bigger pattern (bitfield insert).
1535 
1536   // For unsigned extracts, check for a shift right and mask
1537   uint64_t AndImm = 0;
1538   if (!isOpcWithIntImmediate(N, ISD::AND, AndImm))
1539     return false;
1540 
1541   const SDNode *Op0 = N->getOperand(0).getNode();
1542 
1543   // Because of simplify-demanded-bits in DAGCombine, the mask may have been
1544   // simplified. Try to undo that
1545   AndImm |= maskTrailingOnes<uint64_t>(NumberOfIgnoredLowBits);
1546 
1547   // The immediate is a mask of the low bits iff imm & (imm+1) == 0
1548   if (AndImm & (AndImm + 1))
1549     return false;
1550 
1551   bool ClampMSB = false;
1552   uint64_t SrlImm = 0;
1553   // Handle the SRL + ANY_EXTEND case.
1554   if (VT == MVT::i64 && Op0->getOpcode() == ISD::ANY_EXTEND &&
1555       isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL, SrlImm)) {
1556     // Extend the incoming operand of the SRL to 64-bit.
1557     Opd0 = Widen(CurDAG, Op0->getOperand(0).getOperand(0));
1558     // Make sure to clamp the MSB so that we preserve the semantics of the
1559     // original operations.
1560     ClampMSB = true;
1561   } else if (VT == MVT::i32 && Op0->getOpcode() == ISD::TRUNCATE &&
1562              isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL,
1563                                    SrlImm)) {
1564     // If the shift result was truncated, we can still combine them.
1565     Opd0 = Op0->getOperand(0).getOperand(0);
1566 
1567     // Use the type of SRL node.
1568     VT = Opd0->getValueType(0);
1569   } else if (isOpcWithIntImmediate(Op0, ISD::SRL, SrlImm)) {
1570     Opd0 = Op0->getOperand(0);
1571   } else if (BiggerPattern) {
1572     // Let's pretend a 0 shift right has been performed.
1573     // The resulting code will be at least as good as the original one
1574     // plus it may expose more opportunities for bitfield insert pattern.
1575     // FIXME: Currently we limit this to the bigger pattern, because
1576     // some optimizations expect AND and not UBFM.
1577     Opd0 = N->getOperand(0);
1578   } else
1579     return false;
1580 
1581   // Bail out on large immediates. This happens when no proper
1582   // combining/constant folding was performed.
1583   if (!BiggerPattern && (SrlImm <= 0 || SrlImm >= VT.getSizeInBits())) {
1584     LLVM_DEBUG(
1585         (dbgs() << N
1586                 << ": Found large shift immediate, this should not happen\n"));
1587     return false;
1588   }
1589 
1590   LSB = SrlImm;
1591   MSB = SrlImm + (VT == MVT::i32 ? countTrailingOnes<uint32_t>(AndImm)
1592                                  : countTrailingOnes<uint64_t>(AndImm)) -
1593         1;
1594   if (ClampMSB)
1595     // Since we're moving the extend before the right shift operation, we need
1596     // to clamp the MSB to make sure we don't shift in undefined bits instead of
1597     // the zeros which would get shifted in with the original right shift
1598     // operation.
1599     MSB = MSB > 31 ? 31 : MSB;
1600 
1601   Opc = VT == MVT::i32 ? AArch64::UBFMWri : AArch64::UBFMXri;
1602   return true;
1603 }
1604 
1605 static bool isBitfieldExtractOpFromSExtInReg(SDNode *N, unsigned &Opc,
1606                                              SDValue &Opd0, unsigned &Immr,
1607                                              unsigned &Imms) {
1608   assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
1609 
1610   EVT VT = N->getValueType(0);
1611   unsigned BitWidth = VT.getSizeInBits();
1612   assert((VT == MVT::i32 || VT == MVT::i64) &&
1613          "Type checking must have been done before calling this function");
1614 
1615   SDValue Op = N->getOperand(0);
1616   if (Op->getOpcode() == ISD::TRUNCATE) {
1617     Op = Op->getOperand(0);
1618     VT = Op->getValueType(0);
1619     BitWidth = VT.getSizeInBits();
1620   }
1621 
1622   uint64_t ShiftImm;
1623   if (!isOpcWithIntImmediate(Op.getNode(), ISD::SRL, ShiftImm) &&
1624       !isOpcWithIntImmediate(Op.getNode(), ISD::SRA, ShiftImm))
1625     return false;
1626 
1627   unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
1628   if (ShiftImm + Width > BitWidth)
1629     return false;
1630 
1631   Opc = (VT == MVT::i32) ? AArch64::SBFMWri : AArch64::SBFMXri;
1632   Opd0 = Op.getOperand(0);
1633   Immr = ShiftImm;
1634   Imms = ShiftImm + Width - 1;
1635   return true;
1636 }
1637 
1638 static bool isSeveralBitsExtractOpFromShr(SDNode *N, unsigned &Opc,
1639                                           SDValue &Opd0, unsigned &LSB,
1640                                           unsigned &MSB) {
1641   // We are looking for the following pattern which basically extracts several
1642   // continuous bits from the source value and places it from the LSB of the
1643   // destination value, all other bits of the destination value or set to zero:
1644   //
1645   // Value2 = AND Value, MaskImm
1646   // SRL Value2, ShiftImm
1647   //
1648   // with MaskImm >> ShiftImm to search for the bit width.
1649   //
1650   // This gets selected into a single UBFM:
1651   //
1652   // UBFM Value, ShiftImm, BitWide + SrlImm -1
1653   //
1654 
1655   if (N->getOpcode() != ISD::SRL)
1656     return false;
1657 
1658   uint64_t AndMask = 0;
1659   if (!isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, AndMask))
1660     return false;
1661 
1662   Opd0 = N->getOperand(0).getOperand(0);
1663 
1664   uint64_t SrlImm = 0;
1665   if (!isIntImmediate(N->getOperand(1), SrlImm))
1666     return false;
1667 
1668   // Check whether we really have several bits extract here.
1669   unsigned BitWide = 64 - countLeadingOnes(~(AndMask >> SrlImm));
1670   if (BitWide && isMask_64(AndMask >> SrlImm)) {
1671     if (N->getValueType(0) == MVT::i32)
1672       Opc = AArch64::UBFMWri;
1673     else
1674       Opc = AArch64::UBFMXri;
1675 
1676     LSB = SrlImm;
1677     MSB = BitWide + SrlImm - 1;
1678     return true;
1679   }
1680 
1681   return false;
1682 }
1683 
1684 static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
1685                                        unsigned &Immr, unsigned &Imms,
1686                                        bool BiggerPattern) {
1687   assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
1688          "N must be a SHR/SRA operation to call this function");
1689 
1690   EVT VT = N->getValueType(0);
1691 
1692   // Here we can test the type of VT and return false when the type does not
1693   // match, but since it is done prior to that call in the current context
1694   // we turned that into an assert to avoid redundant code.
1695   assert((VT == MVT::i32 || VT == MVT::i64) &&
1696          "Type checking must have been done before calling this function");
1697 
1698   // Check for AND + SRL doing several bits extract.
1699   if (isSeveralBitsExtractOpFromShr(N, Opc, Opd0, Immr, Imms))
1700     return true;
1701 
1702   // We're looking for a shift of a shift.
1703   uint64_t ShlImm = 0;
1704   uint64_t TruncBits = 0;
1705   if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL, ShlImm)) {
1706     Opd0 = N->getOperand(0).getOperand(0);
1707   } else if (VT == MVT::i32 && N->getOpcode() == ISD::SRL &&
1708              N->getOperand(0).getNode()->getOpcode() == ISD::TRUNCATE) {
1709     // We are looking for a shift of truncate. Truncate from i64 to i32 could
1710     // be considered as setting high 32 bits as zero. Our strategy here is to
1711     // always generate 64bit UBFM. This consistency will help the CSE pass
1712     // later find more redundancy.
1713     Opd0 = N->getOperand(0).getOperand(0);
1714     TruncBits = Opd0->getValueType(0).getSizeInBits() - VT.getSizeInBits();
1715     VT = Opd0.getValueType();
1716     assert(VT == MVT::i64 && "the promoted type should be i64");
1717   } else if (BiggerPattern) {
1718     // Let's pretend a 0 shift left has been performed.
1719     // FIXME: Currently we limit this to the bigger pattern case,
1720     // because some optimizations expect AND and not UBFM
1721     Opd0 = N->getOperand(0);
1722   } else
1723     return false;
1724 
1725   // Missing combines/constant folding may have left us with strange
1726   // constants.
1727   if (ShlImm >= VT.getSizeInBits()) {
1728     LLVM_DEBUG(
1729         (dbgs() << N
1730                 << ": Found large shift immediate, this should not happen\n"));
1731     return false;
1732   }
1733 
1734   uint64_t SrlImm = 0;
1735   if (!isIntImmediate(N->getOperand(1), SrlImm))
1736     return false;
1737 
1738   assert(SrlImm > 0 && SrlImm < VT.getSizeInBits() &&
1739          "bad amount in shift node!");
1740   int immr = SrlImm - ShlImm;
1741   Immr = immr < 0 ? immr + VT.getSizeInBits() : immr;
1742   Imms = VT.getSizeInBits() - ShlImm - TruncBits - 1;
1743   // SRA requires a signed extraction
1744   if (VT == MVT::i32)
1745     Opc = N->getOpcode() == ISD::SRA ? AArch64::SBFMWri : AArch64::UBFMWri;
1746   else
1747     Opc = N->getOpcode() == ISD::SRA ? AArch64::SBFMXri : AArch64::UBFMXri;
1748   return true;
1749 }
1750 
1751 bool AArch64DAGToDAGISel::tryBitfieldExtractOpFromSExt(SDNode *N) {
1752   assert(N->getOpcode() == ISD::SIGN_EXTEND);
1753 
1754   EVT VT = N->getValueType(0);
1755   EVT NarrowVT = N->getOperand(0)->getValueType(0);
1756   if (VT != MVT::i64 || NarrowVT != MVT::i32)
1757     return false;
1758 
1759   uint64_t ShiftImm;
1760   SDValue Op = N->getOperand(0);
1761   if (!isOpcWithIntImmediate(Op.getNode(), ISD::SRA, ShiftImm))
1762     return false;
1763 
1764   SDLoc dl(N);
1765   // Extend the incoming operand of the shift to 64-bits.
1766   SDValue Opd0 = Widen(CurDAG, Op.getOperand(0));
1767   unsigned Immr = ShiftImm;
1768   unsigned Imms = NarrowVT.getSizeInBits() - 1;
1769   SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(Immr, dl, VT),
1770                    CurDAG->getTargetConstant(Imms, dl, VT)};
1771   CurDAG->SelectNodeTo(N, AArch64::SBFMXri, VT, Ops);
1772   return true;
1773 }
1774 
1775 static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc,
1776                                 SDValue &Opd0, unsigned &Immr, unsigned &Imms,
1777                                 unsigned NumberOfIgnoredLowBits = 0,
1778                                 bool BiggerPattern = false) {
1779   if (N->getValueType(0) != MVT::i32 && N->getValueType(0) != MVT::i64)
1780     return false;
1781 
1782   switch (N->getOpcode()) {
1783   default:
1784     if (!N->isMachineOpcode())
1785       return false;
1786     break;
1787   case ISD::AND:
1788     return isBitfieldExtractOpFromAnd(CurDAG, N, Opc, Opd0, Immr, Imms,
1789                                       NumberOfIgnoredLowBits, BiggerPattern);
1790   case ISD::SRL:
1791   case ISD::SRA:
1792     return isBitfieldExtractOpFromShr(N, Opc, Opd0, Immr, Imms, BiggerPattern);
1793 
1794   case ISD::SIGN_EXTEND_INREG:
1795     return isBitfieldExtractOpFromSExtInReg(N, Opc, Opd0, Immr, Imms);
1796   }
1797 
1798   unsigned NOpc = N->getMachineOpcode();
1799   switch (NOpc) {
1800   default:
1801     return false;
1802   case AArch64::SBFMWri:
1803   case AArch64::UBFMWri:
1804   case AArch64::SBFMXri:
1805   case AArch64::UBFMXri:
1806     Opc = NOpc;
1807     Opd0 = N->getOperand(0);
1808     Immr = cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
1809     Imms = cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
1810     return true;
1811   }
1812   // Unreachable
1813   return false;
1814 }
1815 
1816 bool AArch64DAGToDAGISel::tryBitfieldExtractOp(SDNode *N) {
1817   unsigned Opc, Immr, Imms;
1818   SDValue Opd0;
1819   if (!isBitfieldExtractOp(CurDAG, N, Opc, Opd0, Immr, Imms))
1820     return false;
1821 
1822   EVT VT = N->getValueType(0);
1823   SDLoc dl(N);
1824 
1825   // If the bit extract operation is 64bit but the original type is 32bit, we
1826   // need to add one EXTRACT_SUBREG.
1827   if ((Opc == AArch64::SBFMXri || Opc == AArch64::UBFMXri) && VT == MVT::i32) {
1828     SDValue Ops64[] = {Opd0, CurDAG->getTargetConstant(Immr, dl, MVT::i64),
1829                        CurDAG->getTargetConstant(Imms, dl, MVT::i64)};
1830 
1831     SDNode *BFM = CurDAG->getMachineNode(Opc, dl, MVT::i64, Ops64);
1832     SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32);
1833     ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl,
1834                                           MVT::i32, SDValue(BFM, 0), SubReg));
1835     return true;
1836   }
1837 
1838   SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(Immr, dl, VT),
1839                    CurDAG->getTargetConstant(Imms, dl, VT)};
1840   CurDAG->SelectNodeTo(N, Opc, VT, Ops);
1841   return true;
1842 }
1843 
1844 /// Does DstMask form a complementary pair with the mask provided by
1845 /// BitsToBeInserted, suitable for use in a BFI instruction. Roughly speaking,
1846 /// this asks whether DstMask zeroes precisely those bits that will be set by
1847 /// the other half.
1848 static bool isBitfieldDstMask(uint64_t DstMask, const APInt &BitsToBeInserted,
1849                               unsigned NumberOfIgnoredHighBits, EVT VT) {
1850   assert((VT == MVT::i32 || VT == MVT::i64) &&
1851          "i32 or i64 mask type expected!");
1852   unsigned BitWidth = VT.getSizeInBits() - NumberOfIgnoredHighBits;
1853 
1854   APInt SignificantDstMask = APInt(BitWidth, DstMask);
1855   APInt SignificantBitsToBeInserted = BitsToBeInserted.zextOrTrunc(BitWidth);
1856 
1857   return (SignificantDstMask & SignificantBitsToBeInserted) == 0 &&
1858          (SignificantDstMask | SignificantBitsToBeInserted).isAllOnesValue();
1859 }
1860 
// Look for bits that will be useful for later uses.
// A bit is considered useless as soon as it is dropped, provided it was
// never used before it has been dropped.
// E.g., looking for the useful bits of x
// 1. y = x & 0x7
// 2. z = y >> 2
// After #1, the useful bits of x are 0x7; they then live on through y.
// After #2, the useful bits of x are 0x4.
// However, if x is used by an unpredictable instruction, then all its bits
// are useful.
// E.g.
// 1. y = x & 0x7
// 2. z = y >> 2
// 3. str x, [@x]
// Here #3 stores x itself, so every bit of x is useful.
1876 static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth = 0);
1877 
1878 static void getUsefulBitsFromAndWithImmediate(SDValue Op, APInt &UsefulBits,
1879                                               unsigned Depth) {
1880   uint64_t Imm =
1881       cast<const ConstantSDNode>(Op.getOperand(1).getNode())->getZExtValue();
1882   Imm = AArch64_AM::decodeLogicalImmediate(Imm, UsefulBits.getBitWidth());
1883   UsefulBits &= APInt(UsefulBits.getBitWidth(), Imm);
1884   getUsefulBits(Op, UsefulBits, Depth + 1);
1885 }
1886 
1887 static void getUsefulBitsFromBitfieldMoveOpd(SDValue Op, APInt &UsefulBits,
1888                                              uint64_t Imm, uint64_t MSB,
1889                                              unsigned Depth) {
1890   // inherit the bitwidth value
1891   APInt OpUsefulBits(UsefulBits);
1892   OpUsefulBits = 1;
1893 
1894   if (MSB >= Imm) {
1895     OpUsefulBits <<= MSB - Imm + 1;
1896     --OpUsefulBits;
1897     // The interesting part will be in the lower part of the result
1898     getUsefulBits(Op, OpUsefulBits, Depth + 1);
1899     // The interesting part was starting at Imm in the argument
1900     OpUsefulBits <<= Imm;
1901   } else {
1902     OpUsefulBits <<= MSB + 1;
1903     --OpUsefulBits;
1904     // The interesting part will be shifted in the result
1905     OpUsefulBits <<= OpUsefulBits.getBitWidth() - Imm;
1906     getUsefulBits(Op, OpUsefulBits, Depth + 1);
1907     // The interesting part was at zero in the argument
1908     OpUsefulBits.lshrInPlace(OpUsefulBits.getBitWidth() - Imm);
1909   }
1910 
1911   UsefulBits &= OpUsefulBits;
1912 }
1913 
1914 static void getUsefulBitsFromUBFM(SDValue Op, APInt &UsefulBits,
1915                                   unsigned Depth) {
1916   uint64_t Imm =
1917       cast<const ConstantSDNode>(Op.getOperand(1).getNode())->getZExtValue();
1918   uint64_t MSB =
1919       cast<const ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue();
1920 
1921   getUsefulBitsFromBitfieldMoveOpd(Op, UsefulBits, Imm, MSB, Depth);
1922 }
1923 
1924 static void getUsefulBitsFromOrWithShiftedReg(SDValue Op, APInt &UsefulBits,
1925                                               unsigned Depth) {
1926   uint64_t ShiftTypeAndValue =
1927       cast<const ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue();
1928   APInt Mask(UsefulBits);
1929   Mask.clearAllBits();
1930   Mask.flipAllBits();
1931 
1932   if (AArch64_AM::getShiftType(ShiftTypeAndValue) == AArch64_AM::LSL) {
1933     // Shift Left
1934     uint64_t ShiftAmt = AArch64_AM::getShiftValue(ShiftTypeAndValue);
1935     Mask <<= ShiftAmt;
1936     getUsefulBits(Op, Mask, Depth + 1);
1937     Mask.lshrInPlace(ShiftAmt);
1938   } else if (AArch64_AM::getShiftType(ShiftTypeAndValue) == AArch64_AM::LSR) {
1939     // Shift Right
1940     // We do not handle AArch64_AM::ASR, because the sign will change the
1941     // number of useful bits
1942     uint64_t ShiftAmt = AArch64_AM::getShiftValue(ShiftTypeAndValue);
1943     Mask.lshrInPlace(ShiftAmt);
1944     getUsefulBits(Op, Mask, Depth + 1);
1945     Mask <<= ShiftAmt;
1946   } else
1947     return;
1948 
1949   UsefulBits &= Mask;
1950 }
1951 
1952 static void getUsefulBitsFromBFM(SDValue Op, SDValue Orig, APInt &UsefulBits,
1953                                  unsigned Depth) {
1954   uint64_t Imm =
1955       cast<const ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue();
1956   uint64_t MSB =
1957       cast<const ConstantSDNode>(Op.getOperand(3).getNode())->getZExtValue();
1958 
1959   APInt OpUsefulBits(UsefulBits);
1960   OpUsefulBits = 1;
1961 
1962   APInt ResultUsefulBits(UsefulBits.getBitWidth(), 0);
1963   ResultUsefulBits.flipAllBits();
1964   APInt Mask(UsefulBits.getBitWidth(), 0);
1965 
1966   getUsefulBits(Op, ResultUsefulBits, Depth + 1);
1967 
1968   if (MSB >= Imm) {
1969     // The instruction is a BFXIL.
1970     uint64_t Width = MSB - Imm + 1;
1971     uint64_t LSB = Imm;
1972 
1973     OpUsefulBits <<= Width;
1974     --OpUsefulBits;
1975 
1976     if (Op.getOperand(1) == Orig) {
1977       // Copy the low bits from the result to bits starting from LSB.
1978       Mask = ResultUsefulBits & OpUsefulBits;
1979       Mask <<= LSB;
1980     }
1981 
1982     if (Op.getOperand(0) == Orig)
1983       // Bits starting from LSB in the input contribute to the result.
1984       Mask |= (ResultUsefulBits & ~OpUsefulBits);
1985   } else {
1986     // The instruction is a BFI.
1987     uint64_t Width = MSB + 1;
1988     uint64_t LSB = UsefulBits.getBitWidth() - Imm;
1989 
1990     OpUsefulBits <<= Width;
1991     --OpUsefulBits;
1992     OpUsefulBits <<= LSB;
1993 
1994     if (Op.getOperand(1) == Orig) {
1995       // Copy the bits from the result to the zero bits.
1996       Mask = ResultUsefulBits & OpUsefulBits;
1997       Mask.lshrInPlace(LSB);
1998     }
1999 
2000     if (Op.getOperand(0) == Orig)
2001       Mask |= (ResultUsefulBits & ~OpUsefulBits);
2002   }
2003 
2004   UsefulBits &= Mask;
2005 }
2006 
2007 static void getUsefulBitsForUse(SDNode *UserNode, APInt &UsefulBits,
2008                                 SDValue Orig, unsigned Depth) {
2009 
2010   // Users of this node should have already been instruction selected
2011   // FIXME: Can we turn that into an assert?
2012   if (!UserNode->isMachineOpcode())
2013     return;
2014 
2015   switch (UserNode->getMachineOpcode()) {
2016   default:
2017     return;
2018   case AArch64::ANDSWri:
2019   case AArch64::ANDSXri:
2020   case AArch64::ANDWri:
2021   case AArch64::ANDXri:
2022     // We increment Depth only when we call the getUsefulBits
2023     return getUsefulBitsFromAndWithImmediate(SDValue(UserNode, 0), UsefulBits,
2024                                              Depth);
2025   case AArch64::UBFMWri:
2026   case AArch64::UBFMXri:
2027     return getUsefulBitsFromUBFM(SDValue(UserNode, 0), UsefulBits, Depth);
2028 
2029   case AArch64::ORRWrs:
2030   case AArch64::ORRXrs:
2031     if (UserNode->getOperand(1) != Orig)
2032       return;
2033     return getUsefulBitsFromOrWithShiftedReg(SDValue(UserNode, 0), UsefulBits,
2034                                              Depth);
2035   case AArch64::BFMWri:
2036   case AArch64::BFMXri:
2037     return getUsefulBitsFromBFM(SDValue(UserNode, 0), Orig, UsefulBits, Depth);
2038 
2039   case AArch64::STRBBui:
2040   case AArch64::STURBBi:
2041     if (UserNode->getOperand(0) != Orig)
2042       return;
2043     UsefulBits &= APInt(UsefulBits.getBitWidth(), 0xff);
2044     return;
2045 
2046   case AArch64::STRHHui:
2047   case AArch64::STURHHi:
2048     if (UserNode->getOperand(0) != Orig)
2049       return;
2050     UsefulBits &= APInt(UsefulBits.getBitWidth(), 0xffff);
2051     return;
2052   }
2053 }
2054 
2055 static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth) {
2056   if (Depth >= 6)
2057     return;
2058   // Initialize UsefulBits
2059   if (!Depth) {
2060     unsigned Bitwidth = Op.getScalarValueSizeInBits();
2061     // At the beginning, assume every produced bits is useful
2062     UsefulBits = APInt(Bitwidth, 0);
2063     UsefulBits.flipAllBits();
2064   }
2065   APInt UsersUsefulBits(UsefulBits.getBitWidth(), 0);
2066 
2067   for (SDNode *Node : Op.getNode()->uses()) {
2068     // A use cannot produce useful bits
2069     APInt UsefulBitsForUse = APInt(UsefulBits);
2070     getUsefulBitsForUse(Node, UsefulBitsForUse, Op, Depth);
2071     UsersUsefulBits |= UsefulBitsForUse;
2072   }
2073   // UsefulBits contains the produced bits that are meaningful for the
2074   // current definition, thus a user cannot make a bit meaningful at
2075   // this point
2076   UsefulBits &= UsersUsefulBits;
2077 }
2078 
2079 /// Create a machine node performing a notional SHL of Op by ShlAmount. If
2080 /// ShlAmount is negative, do a (logical) right-shift instead. If ShlAmount is
2081 /// 0, return Op unchanged.
2082 static SDValue getLeftShift(SelectionDAG *CurDAG, SDValue Op, int ShlAmount) {
2083   if (ShlAmount == 0)
2084     return Op;
2085 
2086   EVT VT = Op.getValueType();
2087   SDLoc dl(Op);
2088   unsigned BitWidth = VT.getSizeInBits();
2089   unsigned UBFMOpc = BitWidth == 32 ? AArch64::UBFMWri : AArch64::UBFMXri;
2090 
2091   SDNode *ShiftNode;
2092   if (ShlAmount > 0) {
2093     // LSL wD, wN, #Amt == UBFM wD, wN, #32-Amt, #31-Amt
2094     ShiftNode = CurDAG->getMachineNode(
2095         UBFMOpc, dl, VT, Op,
2096         CurDAG->getTargetConstant(BitWidth - ShlAmount, dl, VT),
2097         CurDAG->getTargetConstant(BitWidth - 1 - ShlAmount, dl, VT));
2098   } else {
2099     // LSR wD, wN, #Amt == UBFM wD, wN, #Amt, #32-1
2100     assert(ShlAmount < 0 && "expected right shift");
2101     int ShrAmount = -ShlAmount;
2102     ShiftNode = CurDAG->getMachineNode(
2103         UBFMOpc, dl, VT, Op, CurDAG->getTargetConstant(ShrAmount, dl, VT),
2104         CurDAG->getTargetConstant(BitWidth - 1, dl, VT));
2105   }
2106 
2107   return SDValue(ShiftNode, 0);
2108 }
2109 
2110 /// Does this tree qualify as an attempt to move a bitfield into position,
2111 /// essentially "(and (shl VAL, N), Mask)".
2112 static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op,
2113                                     bool BiggerPattern,
2114                                     SDValue &Src, int &ShiftAmount,
2115                                     int &MaskWidth) {
2116   EVT VT = Op.getValueType();
2117   unsigned BitWidth = VT.getSizeInBits();
2118   (void)BitWidth;
2119   assert(BitWidth == 32 || BitWidth == 64);
2120 
2121   KnownBits Known = CurDAG->computeKnownBits(Op);
2122 
2123   // Non-zero in the sense that they're not provably zero, which is the key
2124   // point if we want to use this value
2125   uint64_t NonZeroBits = (~Known.Zero).getZExtValue();
2126 
2127   // Discard a constant AND mask if present. It's safe because the node will
2128   // already have been factored into the computeKnownBits calculation above.
2129   uint64_t AndImm;
2130   if (isOpcWithIntImmediate(Op.getNode(), ISD::AND, AndImm)) {
2131     assert((~APInt(BitWidth, AndImm) & ~Known.Zero) == 0);
2132     Op = Op.getOperand(0);
2133   }
2134 
2135   // Don't match if the SHL has more than one use, since then we'll end up
2136   // generating SHL+UBFIZ instead of just keeping SHL+AND.
2137   if (!BiggerPattern && !Op.hasOneUse())
2138     return false;
2139 
2140   uint64_t ShlImm;
2141   if (!isOpcWithIntImmediate(Op.getNode(), ISD::SHL, ShlImm))
2142     return false;
2143   Op = Op.getOperand(0);
2144 
2145   if (!isShiftedMask_64(NonZeroBits))
2146     return false;
2147 
2148   ShiftAmount = countTrailingZeros(NonZeroBits);
2149   MaskWidth = countTrailingOnes(NonZeroBits >> ShiftAmount);
2150 
2151   // BFI encompasses sufficiently many nodes that it's worth inserting an extra
2152   // LSL/LSR if the mask in NonZeroBits doesn't quite match up with the ISD::SHL
2153   // amount.  BiggerPattern is true when this pattern is being matched for BFI,
2154   // BiggerPattern is false when this pattern is being matched for UBFIZ, in
2155   // which case it is not profitable to insert an extra shift.
2156   if (ShlImm - ShiftAmount != 0 && !BiggerPattern)
2157     return false;
2158   Src = getLeftShift(CurDAG, Op, ShlImm - ShiftAmount);
2159 
2160   return true;
2161 }
2162 
2163 static bool isShiftedMask(uint64_t Mask, EVT VT) {
2164   assert(VT == MVT::i32 || VT == MVT::i64);
2165   if (VT == MVT::i32)
2166     return isShiftedMask_32(Mask);
2167   return isShiftedMask_64(Mask);
2168 }
2169 
2170 // Generate a BFI/BFXIL from 'or (and X, MaskImm), OrImm' iff the value being
2171 // inserted only sets known zero bits.
2172 static bool tryBitfieldInsertOpFromOrAndImm(SDNode *N, SelectionDAG *CurDAG) {
2173   assert(N->getOpcode() == ISD::OR && "Expect a OR operation");
2174 
2175   EVT VT = N->getValueType(0);
2176   if (VT != MVT::i32 && VT != MVT::i64)
2177     return false;
2178 
2179   unsigned BitWidth = VT.getSizeInBits();
2180 
2181   uint64_t OrImm;
2182   if (!isOpcWithIntImmediate(N, ISD::OR, OrImm))
2183     return false;
2184 
2185   // Skip this transformation if the ORR immediate can be encoded in the ORR.
2186   // Otherwise, we'll trade an AND+ORR for ORR+BFI/BFXIL, which is most likely
2187   // performance neutral.
2188   if (AArch64_AM::isLogicalImmediate(OrImm, BitWidth))
2189     return false;
2190 
2191   uint64_t MaskImm;
2192   SDValue And = N->getOperand(0);
2193   // Must be a single use AND with an immediate operand.
2194   if (!And.hasOneUse() ||
2195       !isOpcWithIntImmediate(And.getNode(), ISD::AND, MaskImm))
2196     return false;
2197 
2198   // Compute the Known Zero for the AND as this allows us to catch more general
2199   // cases than just looking for AND with imm.
2200   KnownBits Known = CurDAG->computeKnownBits(And);
2201 
2202   // Non-zero in the sense that they're not provably zero, which is the key
2203   // point if we want to use this value.
2204   uint64_t NotKnownZero = (~Known.Zero).getZExtValue();
2205 
2206   // The KnownZero mask must be a shifted mask (e.g., 1110..011, 11100..00).
2207   if (!isShiftedMask(Known.Zero.getZExtValue(), VT))
2208     return false;
2209 
2210   // The bits being inserted must only set those bits that are known to be zero.
2211   if ((OrImm & NotKnownZero) != 0) {
2212     // FIXME:  It's okay if the OrImm sets NotKnownZero bits to 1, but we don't
2213     // currently handle this case.
2214     return false;
2215   }
2216 
2217   // BFI/BFXIL dst, src, #lsb, #width.
2218   int LSB = countTrailingOnes(NotKnownZero);
2219   int Width = BitWidth - APInt(BitWidth, NotKnownZero).countPopulation();
2220 
2221   // BFI/BFXIL is an alias of BFM, so translate to BFM operands.
2222   unsigned ImmR = (BitWidth - LSB) % BitWidth;
2223   unsigned ImmS = Width - 1;
2224 
2225   // If we're creating a BFI instruction avoid cases where we need more
2226   // instructions to materialize the BFI constant as compared to the original
2227   // ORR.  A BFXIL will use the same constant as the original ORR, so the code
2228   // should be no worse in this case.
2229   bool IsBFI = LSB != 0;
2230   uint64_t BFIImm = OrImm >> LSB;
2231   if (IsBFI && !AArch64_AM::isLogicalImmediate(BFIImm, BitWidth)) {
2232     // We have a BFI instruction and we know the constant can't be materialized
2233     // with a ORR-immediate with the zero register.
2234     unsigned OrChunks = 0, BFIChunks = 0;
2235     for (unsigned Shift = 0; Shift < BitWidth; Shift += 16) {
2236       if (((OrImm >> Shift) & 0xFFFF) != 0)
2237         ++OrChunks;
2238       if (((BFIImm >> Shift) & 0xFFFF) != 0)
2239         ++BFIChunks;
2240     }
2241     if (BFIChunks > OrChunks)
2242       return false;
2243   }
2244 
2245   // Materialize the constant to be inserted.
2246   SDLoc DL(N);
2247   unsigned MOVIOpc = VT == MVT::i32 ? AArch64::MOVi32imm : AArch64::MOVi64imm;
2248   SDNode *MOVI = CurDAG->getMachineNode(
2249       MOVIOpc, DL, VT, CurDAG->getTargetConstant(BFIImm, DL, VT));
2250 
2251   // Create the BFI/BFXIL instruction.
2252   SDValue Ops[] = {And.getOperand(0), SDValue(MOVI, 0),
2253                    CurDAG->getTargetConstant(ImmR, DL, VT),
2254                    CurDAG->getTargetConstant(ImmS, DL, VT)};
2255   unsigned Opc = (VT == MVT::i32) ? AArch64::BFMWri : AArch64::BFMXri;
2256   CurDAG->SelectNodeTo(N, Opc, VT, Ops);
2257   return true;
2258 }
2259 
2260 static bool tryBitfieldInsertOpFromOr(SDNode *N, const APInt &UsefulBits,
2261                                       SelectionDAG *CurDAG) {
2262   assert(N->getOpcode() == ISD::OR && "Expect a OR operation");
2263 
2264   EVT VT = N->getValueType(0);
2265   if (VT != MVT::i32 && VT != MVT::i64)
2266     return false;
2267 
2268   unsigned BitWidth = VT.getSizeInBits();
2269 
2270   // Because of simplify-demanded-bits in DAGCombine, involved masks may not
2271   // have the expected shape. Try to undo that.
2272 
2273   unsigned NumberOfIgnoredLowBits = UsefulBits.countTrailingZeros();
2274   unsigned NumberOfIgnoredHighBits = UsefulBits.countLeadingZeros();
2275 
2276   // Given a OR operation, check if we have the following pattern
2277   // ubfm c, b, imm, imm2 (or something that does the same jobs, see
2278   //                       isBitfieldExtractOp)
2279   // d = e & mask2 ; where mask is a binary sequence of 1..10..0 and
2280   //                 countTrailingZeros(mask2) == imm2 - imm + 1
2281   // f = d | c
2282   // if yes, replace the OR instruction with:
2283   // f = BFM Opd0, Opd1, LSB, MSB ; where LSB = imm, and MSB = imm2
2284 
2285   // OR is commutative, check all combinations of operand order and values of
2286   // BiggerPattern, i.e.
2287   //     Opd0, Opd1, BiggerPattern=false
2288   //     Opd1, Opd0, BiggerPattern=false
2289   //     Opd0, Opd1, BiggerPattern=true
2290   //     Opd1, Opd0, BiggerPattern=true
2291   // Several of these combinations may match, so check with BiggerPattern=false
2292   // first since that will produce better results by matching more instructions
2293   // and/or inserting fewer extra instructions.
2294   for (int I = 0; I < 4; ++I) {
2295 
2296     SDValue Dst, Src;
2297     unsigned ImmR, ImmS;
2298     bool BiggerPattern = I / 2;
2299     SDValue OrOpd0Val = N->getOperand(I % 2);
2300     SDNode *OrOpd0 = OrOpd0Val.getNode();
2301     SDValue OrOpd1Val = N->getOperand((I + 1) % 2);
2302     SDNode *OrOpd1 = OrOpd1Val.getNode();
2303 
2304     unsigned BFXOpc;
2305     int DstLSB, Width;
2306     if (isBitfieldExtractOp(CurDAG, OrOpd0, BFXOpc, Src, ImmR, ImmS,
2307                             NumberOfIgnoredLowBits, BiggerPattern)) {
2308       // Check that the returned opcode is compatible with the pattern,
2309       // i.e., same type and zero extended (U and not S)
2310       if ((BFXOpc != AArch64::UBFMXri && VT == MVT::i64) ||
2311           (BFXOpc != AArch64::UBFMWri && VT == MVT::i32))
2312         continue;
2313 
2314       // Compute the width of the bitfield insertion
2315       DstLSB = 0;
2316       Width = ImmS - ImmR + 1;
2317       // FIXME: This constraint is to catch bitfield insertion we may
2318       // want to widen the pattern if we want to grab general bitfield
2319       // move case
2320       if (Width <= 0)
2321         continue;
2322 
2323       // If the mask on the insertee is correct, we have a BFXIL operation. We
2324       // can share the ImmR and ImmS values from the already-computed UBFM.
2325     } else if (isBitfieldPositioningOp(CurDAG, OrOpd0Val,
2326                                        BiggerPattern,
2327                                        Src, DstLSB, Width)) {
2328       ImmR = (BitWidth - DstLSB) % BitWidth;
2329       ImmS = Width - 1;
2330     } else
2331       continue;
2332 
2333     // Check the second part of the pattern
2334     EVT VT = OrOpd1Val.getValueType();
2335     assert((VT == MVT::i32 || VT == MVT::i64) && "unexpected OR operand");
2336 
2337     // Compute the Known Zero for the candidate of the first operand.
2338     // This allows to catch more general case than just looking for
2339     // AND with imm. Indeed, simplify-demanded-bits may have removed
2340     // the AND instruction because it proves it was useless.
2341     KnownBits Known = CurDAG->computeKnownBits(OrOpd1Val);
2342 
2343     // Check if there is enough room for the second operand to appear
2344     // in the first one
2345     APInt BitsToBeInserted =
2346         APInt::getBitsSet(Known.getBitWidth(), DstLSB, DstLSB + Width);
2347 
2348     if ((BitsToBeInserted & ~Known.Zero) != 0)
2349       continue;
2350 
2351     // Set the first operand
2352     uint64_t Imm;
2353     if (isOpcWithIntImmediate(OrOpd1, ISD::AND, Imm) &&
2354         isBitfieldDstMask(Imm, BitsToBeInserted, NumberOfIgnoredHighBits, VT))
2355       // In that case, we can eliminate the AND
2356       Dst = OrOpd1->getOperand(0);
2357     else
2358       // Maybe the AND has been removed by simplify-demanded-bits
2359       // or is useful because it discards more bits
2360       Dst = OrOpd1Val;
2361 
2362     // both parts match
2363     SDLoc DL(N);
2364     SDValue Ops[] = {Dst, Src, CurDAG->getTargetConstant(ImmR, DL, VT),
2365                      CurDAG->getTargetConstant(ImmS, DL, VT)};
2366     unsigned Opc = (VT == MVT::i32) ? AArch64::BFMWri : AArch64::BFMXri;
2367     CurDAG->SelectNodeTo(N, Opc, VT, Ops);
2368     return true;
2369   }
2370 
2371   // Generate a BFXIL from 'or (and X, Mask0Imm), (and Y, Mask1Imm)' iff
2372   // Mask0Imm and ~Mask1Imm are equivalent and one of the MaskImms is a shifted
2373   // mask (e.g., 0x000ffff0).
2374   uint64_t Mask0Imm, Mask1Imm;
2375   SDValue And0 = N->getOperand(0);
2376   SDValue And1 = N->getOperand(1);
2377   if (And0.hasOneUse() && And1.hasOneUse() &&
2378       isOpcWithIntImmediate(And0.getNode(), ISD::AND, Mask0Imm) &&
2379       isOpcWithIntImmediate(And1.getNode(), ISD::AND, Mask1Imm) &&
2380       APInt(BitWidth, Mask0Imm) == ~APInt(BitWidth, Mask1Imm) &&
2381       (isShiftedMask(Mask0Imm, VT) || isShiftedMask(Mask1Imm, VT))) {
2382 
2383     // ORR is commutative, so canonicalize to the form 'or (and X, Mask0Imm),
2384     // (and Y, Mask1Imm)' where Mask1Imm is the shifted mask masking off the
2385     // bits to be inserted.
2386     if (isShiftedMask(Mask0Imm, VT)) {
2387       std::swap(And0, And1);
2388       std::swap(Mask0Imm, Mask1Imm);
2389     }
2390 
2391     SDValue Src = And1->getOperand(0);
2392     SDValue Dst = And0->getOperand(0);
2393     unsigned LSB = countTrailingZeros(Mask1Imm);
2394     int Width = BitWidth - APInt(BitWidth, Mask0Imm).countPopulation();
2395 
2396     // The BFXIL inserts the low-order bits from a source register, so right
2397     // shift the needed bits into place.
2398     SDLoc DL(N);
2399     unsigned ShiftOpc = (VT == MVT::i32) ? AArch64::UBFMWri : AArch64::UBFMXri;
2400     SDNode *LSR = CurDAG->getMachineNode(
2401         ShiftOpc, DL, VT, Src, CurDAG->getTargetConstant(LSB, DL, VT),
2402         CurDAG->getTargetConstant(BitWidth - 1, DL, VT));
2403 
2404     // BFXIL is an alias of BFM, so translate to BFM operands.
2405     unsigned ImmR = (BitWidth - LSB) % BitWidth;
2406     unsigned ImmS = Width - 1;
2407 
2408     // Create the BFXIL instruction.
2409     SDValue Ops[] = {Dst, SDValue(LSR, 0),
2410                      CurDAG->getTargetConstant(ImmR, DL, VT),
2411                      CurDAG->getTargetConstant(ImmS, DL, VT)};
2412     unsigned Opc = (VT == MVT::i32) ? AArch64::BFMWri : AArch64::BFMXri;
2413     CurDAG->SelectNodeTo(N, Opc, VT, Ops);
2414     return true;
2415   }
2416 
2417   return false;
2418 }
2419 
2420 bool AArch64DAGToDAGISel::tryBitfieldInsertOp(SDNode *N) {
2421   if (N->getOpcode() != ISD::OR)
2422     return false;
2423 
2424   APInt NUsefulBits;
2425   getUsefulBits(SDValue(N, 0), NUsefulBits);
2426 
2427   // If all bits are not useful, just return UNDEF.
2428   if (!NUsefulBits) {
2429     CurDAG->SelectNodeTo(N, TargetOpcode::IMPLICIT_DEF, N->getValueType(0));
2430     return true;
2431   }
2432 
2433   if (tryBitfieldInsertOpFromOr(N, NUsefulBits, CurDAG))
2434     return true;
2435 
2436   return tryBitfieldInsertOpFromOrAndImm(N, CurDAG);
2437 }
2438 
2439 /// SelectBitfieldInsertInZeroOp - Match a UBFIZ instruction that is the
2440 /// equivalent of a left shift by a constant amount followed by an and masking
2441 /// out a contiguous set of bits.
2442 bool AArch64DAGToDAGISel::tryBitfieldInsertInZeroOp(SDNode *N) {
2443   if (N->getOpcode() != ISD::AND)
2444     return false;
2445 
2446   EVT VT = N->getValueType(0);
2447   if (VT != MVT::i32 && VT != MVT::i64)
2448     return false;
2449 
2450   SDValue Op0;
2451   int DstLSB, Width;
2452   if (!isBitfieldPositioningOp(CurDAG, SDValue(N, 0), /*BiggerPattern=*/false,
2453                                Op0, DstLSB, Width))
2454     return false;
2455 
2456   // ImmR is the rotate right amount.
2457   unsigned ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits();
2458   // ImmS is the most significant bit of the source to be moved.
2459   unsigned ImmS = Width - 1;
2460 
2461   SDLoc DL(N);
2462   SDValue Ops[] = {Op0, CurDAG->getTargetConstant(ImmR, DL, VT),
2463                    CurDAG->getTargetConstant(ImmS, DL, VT)};
2464   unsigned Opc = (VT == MVT::i32) ? AArch64::UBFMWri : AArch64::UBFMXri;
2465   CurDAG->SelectNodeTo(N, Opc, VT, Ops);
2466   return true;
2467 }
2468 
2469 /// tryShiftAmountMod - Take advantage of built-in mod of shift amount in
2470 /// variable shift/rotate instructions.
2471 bool AArch64DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
2472   EVT VT = N->getValueType(0);
2473 
2474   unsigned Opc;
2475   switch (N->getOpcode()) {
2476   case ISD::ROTR:
2477     Opc = (VT == MVT::i32) ? AArch64::RORVWr : AArch64::RORVXr;
2478     break;
2479   case ISD::SHL:
2480     Opc = (VT == MVT::i32) ? AArch64::LSLVWr : AArch64::LSLVXr;
2481     break;
2482   case ISD::SRL:
2483     Opc = (VT == MVT::i32) ? AArch64::LSRVWr : AArch64::LSRVXr;
2484     break;
2485   case ISD::SRA:
2486     Opc = (VT == MVT::i32) ? AArch64::ASRVWr : AArch64::ASRVXr;
2487     break;
2488   default:
2489     return false;
2490   }
2491 
2492   uint64_t Size;
2493   uint64_t Bits;
2494   if (VT == MVT::i32) {
2495     Bits = 5;
2496     Size = 32;
2497   } else if (VT == MVT::i64) {
2498     Bits = 6;
2499     Size = 64;
2500   } else
2501     return false;
2502 
2503   SDValue ShiftAmt = N->getOperand(1);
2504   SDLoc DL(N);
2505   SDValue NewShiftAmt;
2506 
2507   // Skip over an extend of the shift amount.
2508   if (ShiftAmt->getOpcode() == ISD::ZERO_EXTEND ||
2509       ShiftAmt->getOpcode() == ISD::ANY_EXTEND)
2510     ShiftAmt = ShiftAmt->getOperand(0);
2511 
2512   if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB) {
2513     SDValue Add0 = ShiftAmt->getOperand(0);
2514     SDValue Add1 = ShiftAmt->getOperand(1);
2515     uint64_t Add0Imm;
2516     uint64_t Add1Imm;
2517     // If we are shifting by X+/-N where N == 0 mod Size, then just shift by X
2518     // to avoid the ADD/SUB.
2519     if (isIntImmediate(Add1, Add1Imm) && (Add1Imm % Size == 0))
2520       NewShiftAmt = Add0;
2521     // If we are shifting by N-X where N == 0 mod Size, then just shift by -X to
2522     // generate a NEG instead of a SUB of a constant.
2523     else if (ShiftAmt->getOpcode() == ISD::SUB &&
2524              isIntImmediate(Add0, Add0Imm) && Add0Imm != 0 &&
2525              (Add0Imm % Size == 0)) {
2526       unsigned NegOpc;
2527       unsigned ZeroReg;
2528       EVT SubVT = ShiftAmt->getValueType(0);
2529       if (SubVT == MVT::i32) {
2530         NegOpc = AArch64::SUBWrr;
2531         ZeroReg = AArch64::WZR;
2532       } else {
2533         assert(SubVT == MVT::i64);
2534         NegOpc = AArch64::SUBXrr;
2535         ZeroReg = AArch64::XZR;
2536       }
2537       SDValue Zero =
2538           CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, ZeroReg, SubVT);
2539       MachineSDNode *Neg =
2540           CurDAG->getMachineNode(NegOpc, DL, SubVT, Zero, Add1);
2541       NewShiftAmt = SDValue(Neg, 0);
2542     } else
2543       return false;
2544   } else {
2545     // If the shift amount is masked with an AND, check that the mask covers the
2546     // bits that are implicitly ANDed off by the above opcodes and if so, skip
2547     // the AND.
2548     uint64_t MaskImm;
2549     if (!isOpcWithIntImmediate(ShiftAmt.getNode(), ISD::AND, MaskImm))
2550       return false;
2551 
2552     if (countTrailingOnes(MaskImm) < Bits)
2553       return false;
2554 
2555     NewShiftAmt = ShiftAmt->getOperand(0);
2556   }
2557 
2558   // Narrow/widen the shift amount to match the size of the shift operation.
2559   if (VT == MVT::i32)
2560     NewShiftAmt = narrowIfNeeded(CurDAG, NewShiftAmt);
2561   else if (VT == MVT::i64 && NewShiftAmt->getValueType(0) == MVT::i32) {
2562     SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, DL, MVT::i32);
2563     MachineSDNode *Ext = CurDAG->getMachineNode(
2564         AArch64::SUBREG_TO_REG, DL, VT,
2565         CurDAG->getTargetConstant(0, DL, MVT::i64), NewShiftAmt, SubReg);
2566     NewShiftAmt = SDValue(Ext, 0);
2567   }
2568 
2569   SDValue Ops[] = {N->getOperand(0), NewShiftAmt};
2570   CurDAG->SelectNodeTo(N, Opc, VT, Ops);
2571   return true;
2572 }
2573 
2574 bool
2575 AArch64DAGToDAGISel::SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos,
2576                                               unsigned RegWidth) {
2577   APFloat FVal(0.0);
2578   if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N))
2579     FVal = CN->getValueAPF();
2580   else if (LoadSDNode *LN = dyn_cast<LoadSDNode>(N)) {
2581     // Some otherwise illegal constants are allowed in this case.
2582     if (LN->getOperand(1).getOpcode() != AArch64ISD::ADDlow ||
2583         !isa<ConstantPoolSDNode>(LN->getOperand(1)->getOperand(1)))
2584       return false;
2585 
2586     ConstantPoolSDNode *CN =
2587         dyn_cast<ConstantPoolSDNode>(LN->getOperand(1)->getOperand(1));
2588     FVal = cast<ConstantFP>(CN->getConstVal())->getValueAPF();
2589   } else
2590     return false;
2591 
2592   // An FCVT[SU] instruction performs: convertToInt(Val * 2^fbits) where fbits
2593   // is between 1 and 32 for a destination w-register, or 1 and 64 for an
2594   // x-register.
2595   //
2596   // By this stage, we've detected (fp_to_[su]int (fmul Val, THIS_NODE)) so we
2597   // want THIS_NODE to be 2^fbits. This is much easier to deal with using
2598   // integers.
2599   bool IsExact;
2600 
2601   // fbits is between 1 and 64 in the worst-case, which means the fmul
2602   // could have 2^64 as an actual operand. Need 65 bits of precision.
2603   APSInt IntVal(65, true);
2604   FVal.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact);
2605 
2606   // N.b. isPowerOf2 also checks for > 0.
2607   if (!IsExact || !IntVal.isPowerOf2()) return false;
2608   unsigned FBits = IntVal.logBase2();
2609 
2610   // Checks above should have guaranteed that we haven't lost information in
2611   // finding FBits, but it must still be in range.
2612   if (FBits == 0 || FBits > RegWidth) return false;
2613 
2614   FixedPos = CurDAG->getTargetConstant(FBits, SDLoc(N), MVT::i32);
2615   return true;
2616 }
2617 
2618 // Inspects a register string of the form o0:op1:CRn:CRm:op2 gets the fields
2619 // of the string and obtains the integer values from them and combines these
2620 // into a single value to be used in the MRS/MSR instruction.
2621 static int getIntOperandFromRegisterString(StringRef RegString) {
2622   SmallVector<StringRef, 5> Fields;
2623   RegString.split(Fields, ':');
2624 
2625   if (Fields.size() == 1)
2626     return -1;
2627 
2628   assert(Fields.size() == 5
2629             && "Invalid number of fields in read register string");
2630 
2631   SmallVector<int, 5> Ops;
2632   bool AllIntFields = true;
2633 
2634   for (StringRef Field : Fields) {
2635     unsigned IntField;
2636     AllIntFields &= !Field.getAsInteger(10, IntField);
2637     Ops.push_back(IntField);
2638   }
2639 
2640   assert(AllIntFields &&
2641           "Unexpected non-integer value in special register string.");
2642 
2643   // Need to combine the integer fields of the string into a single value
2644   // based on the bit encoding of MRS/MSR instruction.
2645   return (Ops[0] << 14) | (Ops[1] << 11) | (Ops[2] << 7) |
2646          (Ops[3] << 3) | (Ops[4]);
2647 }
2648 
2649 // Lower the read_register intrinsic to an MRS instruction node if the special
2650 // register string argument is either of the form detailed in the ALCE (the
2651 // form described in getIntOperandsFromRegsterString) or is a named register
2652 // known by the MRS SysReg mapper.
2653 bool AArch64DAGToDAGISel::tryReadRegister(SDNode *N) {
2654   const MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(N->getOperand(1));
2655   const MDString *RegString = dyn_cast<MDString>(MD->getMD()->getOperand(0));
2656   SDLoc DL(N);
2657 
2658   int Reg = getIntOperandFromRegisterString(RegString->getString());
2659   if (Reg != -1) {
2660     ReplaceNode(N, CurDAG->getMachineNode(
2661                        AArch64::MRS, DL, N->getSimpleValueType(0), MVT::Other,
2662                        CurDAG->getTargetConstant(Reg, DL, MVT::i32),
2663                        N->getOperand(0)));
2664     return true;
2665   }
2666 
2667   // Use the sysreg mapper to map the remaining possible strings to the
2668   // value for the register to be used for the instruction operand.
2669   auto TheReg = AArch64SysReg::lookupSysRegByName(RegString->getString());
2670   if (TheReg && TheReg->Readable &&
2671       TheReg->haveFeatures(Subtarget->getFeatureBits()))
2672     Reg = TheReg->Encoding;
2673   else
2674     Reg = AArch64SysReg::parseGenericRegister(RegString->getString());
2675 
2676   if (Reg != -1) {
2677     ReplaceNode(N, CurDAG->getMachineNode(
2678                        AArch64::MRS, DL, N->getSimpleValueType(0), MVT::Other,
2679                        CurDAG->getTargetConstant(Reg, DL, MVT::i32),
2680                        N->getOperand(0)));
2681     return true;
2682   }
2683 
2684   if (RegString->getString() == "pc") {
2685     ReplaceNode(N, CurDAG->getMachineNode(
2686                        AArch64::ADR, DL, N->getSimpleValueType(0), MVT::Other,
2687                        CurDAG->getTargetConstant(0, DL, MVT::i32),
2688                        N->getOperand(0)));
2689     return true;
2690   }
2691 
2692   return false;
2693 }
2694 
2695 // Lower the write_register intrinsic to an MSR instruction node if the special
2696 // register string argument is either of the form detailed in the ALCE (the
2697 // form described in getIntOperandsFromRegsterString) or is a named register
2698 // known by the MSR SysReg mapper.
2699 bool AArch64DAGToDAGISel::tryWriteRegister(SDNode *N) {
2700   const MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(N->getOperand(1));
2701   const MDString *RegString = dyn_cast<MDString>(MD->getMD()->getOperand(0));
2702   SDLoc DL(N);
2703 
2704   int Reg = getIntOperandFromRegisterString(RegString->getString());
2705   if (Reg != -1) {
2706     ReplaceNode(
2707         N, CurDAG->getMachineNode(AArch64::MSR, DL, MVT::Other,
2708                                   CurDAG->getTargetConstant(Reg, DL, MVT::i32),
2709                                   N->getOperand(2), N->getOperand(0)));
2710     return true;
2711   }
2712 
2713   // Check if the register was one of those allowed as the pstatefield value in
2714   // the MSR (immediate) instruction. To accept the values allowed in the
2715   // pstatefield for the MSR (immediate) instruction, we also require that an
2716   // immediate value has been provided as an argument, we know that this is
2717   // the case as it has been ensured by semantic checking.
2718   auto PMapper = AArch64PState::lookupPStateByName(RegString->getString());
2719   if (PMapper) {
2720     assert (isa<ConstantSDNode>(N->getOperand(2))
2721               && "Expected a constant integer expression.");
2722     unsigned Reg = PMapper->Encoding;
2723     uint64_t Immed = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
2724     unsigned State;
2725     if (Reg == AArch64PState::PAN || Reg == AArch64PState::UAO || Reg == AArch64PState::SSBS) {
2726       assert(Immed < 2 && "Bad imm");
2727       State = AArch64::MSRpstateImm1;
2728     } else {
2729       assert(Immed < 16 && "Bad imm");
2730       State = AArch64::MSRpstateImm4;
2731     }
2732     ReplaceNode(N, CurDAG->getMachineNode(
2733                        State, DL, MVT::Other,
2734                        CurDAG->getTargetConstant(Reg, DL, MVT::i32),
2735                        CurDAG->getTargetConstant(Immed, DL, MVT::i16),
2736                        N->getOperand(0)));
2737     return true;
2738   }
2739 
2740   // Use the sysreg mapper to attempt to map the remaining possible strings
2741   // to the value for the register to be used for the MSR (register)
2742   // instruction operand.
2743   auto TheReg = AArch64SysReg::lookupSysRegByName(RegString->getString());
2744   if (TheReg && TheReg->Writeable &&
2745       TheReg->haveFeatures(Subtarget->getFeatureBits()))
2746     Reg = TheReg->Encoding;
2747   else
2748     Reg = AArch64SysReg::parseGenericRegister(RegString->getString());
2749   if (Reg != -1) {
2750     ReplaceNode(N, CurDAG->getMachineNode(
2751                        AArch64::MSR, DL, MVT::Other,
2752                        CurDAG->getTargetConstant(Reg, DL, MVT::i32),
2753                        N->getOperand(2), N->getOperand(0)));
2754     return true;
2755   }
2756 
2757   return false;
2758 }
2759 
2760 /// We've got special pseudo-instructions for these
2761 bool AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
2762   unsigned Opcode;
2763   EVT MemTy = cast<MemSDNode>(N)->getMemoryVT();
2764 
2765   // Leave IR for LSE if subtarget supports it.
2766   if (Subtarget->hasLSE()) return false;
2767 
2768   if (MemTy == MVT::i8)
2769     Opcode = AArch64::CMP_SWAP_8;
2770   else if (MemTy == MVT::i16)
2771     Opcode = AArch64::CMP_SWAP_16;
2772   else if (MemTy == MVT::i32)
2773     Opcode = AArch64::CMP_SWAP_32;
2774   else if (MemTy == MVT::i64)
2775     Opcode = AArch64::CMP_SWAP_64;
2776   else
2777     llvm_unreachable("Unknown AtomicCmpSwap type");
2778 
2779   MVT RegTy = MemTy == MVT::i64 ? MVT::i64 : MVT::i32;
2780   SDValue Ops[] = {N->getOperand(1), N->getOperand(2), N->getOperand(3),
2781                    N->getOperand(0)};
2782   SDNode *CmpSwap = CurDAG->getMachineNode(
2783       Opcode, SDLoc(N),
2784       CurDAG->getVTList(RegTy, MVT::i32, MVT::Other), Ops);
2785 
2786   MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
2787   CurDAG->setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
2788 
2789   ReplaceUses(SDValue(N, 0), SDValue(CmpSwap, 0));
2790   ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 2));
2791   CurDAG->RemoveDeadNode(N);
2792 
2793   return true;
2794 }
2795 
2796 bool AArch64DAGToDAGISel::trySelectStackSlotTagP(SDNode *N) {
2797   // tagp(FrameIndex, IRGstack, tag_offset):
2798   // since the offset between FrameIndex and IRGstack is a compile-time
2799   // constant, this can be lowered to a single ADDG instruction.
2800   if (!(isa<FrameIndexSDNode>(N->getOperand(1)))) {
2801     return false;
2802   }
2803 
2804   SDValue IRG_SP = N->getOperand(2);
2805   if (IRG_SP->getOpcode() != ISD::INTRINSIC_W_CHAIN ||
2806       cast<ConstantSDNode>(IRG_SP->getOperand(1))->getZExtValue() !=
2807           Intrinsic::aarch64_irg_sp) {
2808     return false;
2809   }
2810 
2811   const TargetLowering *TLI = getTargetLowering();
2812   SDLoc DL(N);
2813   int FI = cast<FrameIndexSDNode>(N->getOperand(1))->getIndex();
2814   SDValue FiOp = CurDAG->getTargetFrameIndex(
2815       FI, TLI->getPointerTy(CurDAG->getDataLayout()));
2816   int TagOffset = cast<ConstantSDNode>(N->getOperand(3))->getZExtValue();
2817 
2818   SDNode *Out = CurDAG->getMachineNode(
2819       AArch64::TAGPstack, DL, MVT::i64,
2820       {FiOp, CurDAG->getTargetConstant(0, DL, MVT::i64), N->getOperand(2),
2821        CurDAG->getTargetConstant(TagOffset, DL, MVT::i64)});
2822   ReplaceNode(N, Out);
2823   return true;
2824 }
2825 
2826 void AArch64DAGToDAGISel::SelectTagP(SDNode *N) {
2827   assert(isa<ConstantSDNode>(N->getOperand(3)) &&
2828          "llvm.aarch64.tagp third argument must be an immediate");
2829   if (trySelectStackSlotTagP(N))
2830     return;
2831   // FIXME: above applies in any case when offset between Op1 and Op2 is a
2832   // compile-time constant, not just for stack allocations.
2833 
2834   // General case for unrelated pointers in Op1 and Op2.
2835   SDLoc DL(N);
2836   int TagOffset = cast<ConstantSDNode>(N->getOperand(3))->getZExtValue();
2837   SDNode *N1 = CurDAG->getMachineNode(AArch64::SUBP, DL, MVT::i64,
2838                                       {N->getOperand(1), N->getOperand(2)});
2839   SDNode *N2 = CurDAG->getMachineNode(AArch64::ADDXrr, DL, MVT::i64,
2840                                       {SDValue(N1, 0), N->getOperand(2)});
2841   SDNode *N3 = CurDAG->getMachineNode(
2842       AArch64::ADDG, DL, MVT::i64,
2843       {SDValue(N2, 0), CurDAG->getTargetConstant(0, DL, MVT::i64),
2844        CurDAG->getTargetConstant(TagOffset, DL, MVT::i64)});
2845   ReplaceNode(N, N3);
2846 }
2847 
2848 void AArch64DAGToDAGISel::Select(SDNode *Node) {
2849   // If we have a custom node, we already have selected!
2850   if (Node->isMachineOpcode()) {
2851     LLVM_DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
2852     Node->setNodeId(-1);
2853     return;
2854   }
2855 
2856   // Few custom selection stuff.
2857   EVT VT = Node->getValueType(0);
2858 
2859   switch (Node->getOpcode()) {
2860   default:
2861     break;
2862 
2863   case ISD::ATOMIC_CMP_SWAP:
2864     if (SelectCMP_SWAP(Node))
2865       return;
2866     break;
2867 
2868   case ISD::READ_REGISTER:
2869     if (tryReadRegister(Node))
2870       return;
2871     break;
2872 
2873   case ISD::WRITE_REGISTER:
2874     if (tryWriteRegister(Node))
2875       return;
2876     break;
2877 
2878   case ISD::ADD:
2879     if (tryMLAV64LaneV128(Node))
2880       return;
2881     break;
2882 
2883   case ISD::LOAD: {
2884     // Try to select as an indexed load. Fall through to normal processing
2885     // if we can't.
2886     if (tryIndexedLoad(Node))
2887       return;
2888     break;
2889   }
2890 
2891   case ISD::SRL:
2892   case ISD::AND:
2893   case ISD::SRA:
2894   case ISD::SIGN_EXTEND_INREG:
2895     if (tryBitfieldExtractOp(Node))
2896       return;
2897     if (tryBitfieldInsertInZeroOp(Node))
2898       return;
2899     LLVM_FALLTHROUGH;
2900   case ISD::ROTR:
2901   case ISD::SHL:
2902     if (tryShiftAmountMod(Node))
2903       return;
2904     break;
2905 
2906   case ISD::SIGN_EXTEND:
2907     if (tryBitfieldExtractOpFromSExt(Node))
2908       return;
2909     break;
2910 
2911   case ISD::OR:
2912     if (tryBitfieldInsertOp(Node))
2913       return;
2914     break;
2915 
2916   case ISD::EXTRACT_VECTOR_ELT: {
2917     // Extracting lane zero is a special case where we can just use a plain
2918     // EXTRACT_SUBREG instruction, which will become FMOV. This is easier for
2919     // the rest of the compiler, especially the register allocator and copy
2920     // propagation, to reason about, so is preferred when it's possible to
2921     // use it.
2922     ConstantSDNode *LaneNode = cast<ConstantSDNode>(Node->getOperand(1));
2923     // Bail and use the default Select() for non-zero lanes.
2924     if (LaneNode->getZExtValue() != 0)
2925       break;
2926     // If the element type is not the same as the result type, likewise
2927     // bail and use the default Select(), as there's more to do than just
2928     // a cross-class COPY. This catches extracts of i8 and i16 elements
2929     // since they will need an explicit zext.
2930     if (VT != Node->getOperand(0).getValueType().getVectorElementType())
2931       break;
2932     unsigned SubReg;
2933     switch (Node->getOperand(0)
2934                 .getValueType()
2935                 .getVectorElementType()
2936                 .getSizeInBits()) {
2937     default:
2938       llvm_unreachable("Unexpected vector element type!");
2939     case 64:
2940       SubReg = AArch64::dsub;
2941       break;
2942     case 32:
2943       SubReg = AArch64::ssub;
2944       break;
2945     case 16:
2946       SubReg = AArch64::hsub;
2947       break;
2948     case 8:
2949       llvm_unreachable("unexpected zext-requiring extract element!");
2950     }
2951     SDValue Extract = CurDAG->getTargetExtractSubreg(SubReg, SDLoc(Node), VT,
2952                                                      Node->getOperand(0));
2953     LLVM_DEBUG(dbgs() << "ISEL: Custom selection!\n=> ");
2954     LLVM_DEBUG(Extract->dumpr(CurDAG));
2955     LLVM_DEBUG(dbgs() << "\n");
2956     ReplaceNode(Node, Extract.getNode());
2957     return;
2958   }
2959   case ISD::Constant: {
2960     // Materialize zero constants as copies from WZR/XZR.  This allows
2961     // the coalescer to propagate these into other instructions.
2962     ConstantSDNode *ConstNode = cast<ConstantSDNode>(Node);
2963     if (ConstNode->isNullValue()) {
2964       if (VT == MVT::i32) {
2965         SDValue New = CurDAG->getCopyFromReg(
2966             CurDAG->getEntryNode(), SDLoc(Node), AArch64::WZR, MVT::i32);
2967         ReplaceNode(Node, New.getNode());
2968         return;
2969       } else if (VT == MVT::i64) {
2970         SDValue New = CurDAG->getCopyFromReg(
2971             CurDAG->getEntryNode(), SDLoc(Node), AArch64::XZR, MVT::i64);
2972         ReplaceNode(Node, New.getNode());
2973         return;
2974       }
2975     }
2976     break;
2977   }
2978 
2979   case ISD::FrameIndex: {
2980     // Selects to ADDXri FI, 0 which in turn will become ADDXri SP, imm.
2981     int FI = cast<FrameIndexSDNode>(Node)->getIndex();
2982     unsigned Shifter = AArch64_AM::getShifterImm(AArch64_AM::LSL, 0);
2983     const TargetLowering *TLI = getTargetLowering();
2984     SDValue TFI = CurDAG->getTargetFrameIndex(
2985         FI, TLI->getPointerTy(CurDAG->getDataLayout()));
2986     SDLoc DL(Node);
2987     SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, DL, MVT::i32),
2988                       CurDAG->getTargetConstant(Shifter, DL, MVT::i32) };
2989     CurDAG->SelectNodeTo(Node, AArch64::ADDXri, MVT::i64, Ops);
2990     return;
2991   }
2992   case ISD::INTRINSIC_W_CHAIN: {
2993     unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
2994     switch (IntNo) {
2995     default:
2996       break;
2997     case Intrinsic::aarch64_ldaxp:
2998     case Intrinsic::aarch64_ldxp: {
2999       unsigned Op =
3000           IntNo == Intrinsic::aarch64_ldaxp ? AArch64::LDAXPX : AArch64::LDXPX;
3001       SDValue MemAddr = Node->getOperand(2);
3002       SDLoc DL(Node);
3003       SDValue Chain = Node->getOperand(0);
3004 
3005       SDNode *Ld = CurDAG->getMachineNode(Op, DL, MVT::i64, MVT::i64,
3006                                           MVT::Other, MemAddr, Chain);
3007 
3008       // Transfer memoperands.
3009       MachineMemOperand *MemOp =
3010           cast<MemIntrinsicSDNode>(Node)->getMemOperand();
3011       CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ld), {MemOp});
3012       ReplaceNode(Node, Ld);
3013       return;
3014     }
3015     case Intrinsic::aarch64_stlxp:
3016     case Intrinsic::aarch64_stxp: {
3017       unsigned Op =
3018           IntNo == Intrinsic::aarch64_stlxp ? AArch64::STLXPX : AArch64::STXPX;
3019       SDLoc DL(Node);
3020       SDValue Chain = Node->getOperand(0);
3021       SDValue ValLo = Node->getOperand(2);
3022       SDValue ValHi = Node->getOperand(3);
3023       SDValue MemAddr = Node->getOperand(4);
3024 
3025       // Place arguments in the right order.
3026       SDValue Ops[] = {ValLo, ValHi, MemAddr, Chain};
3027 
3028       SDNode *St = CurDAG->getMachineNode(Op, DL, MVT::i32, MVT::Other, Ops);
3029       // Transfer memoperands.
3030       MachineMemOperand *MemOp =
3031           cast<MemIntrinsicSDNode>(Node)->getMemOperand();
3032       CurDAG->setNodeMemRefs(cast<MachineSDNode>(St), {MemOp});
3033 
3034       ReplaceNode(Node, St);
3035       return;
3036     }
3037     case Intrinsic::aarch64_neon_ld1x2:
3038       if (VT == MVT::v8i8) {
3039         SelectLoad(Node, 2, AArch64::LD1Twov8b, AArch64::dsub0);
3040         return;
3041       } else if (VT == MVT::v16i8) {
3042         SelectLoad(Node, 2, AArch64::LD1Twov16b, AArch64::qsub0);
3043         return;
3044       } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3045         SelectLoad(Node, 2, AArch64::LD1Twov4h, AArch64::dsub0);
3046         return;
3047       } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3048         SelectLoad(Node, 2, AArch64::LD1Twov8h, AArch64::qsub0);
3049         return;
3050       } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3051         SelectLoad(Node, 2, AArch64::LD1Twov2s, AArch64::dsub0);
3052         return;
3053       } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3054         SelectLoad(Node, 2, AArch64::LD1Twov4s, AArch64::qsub0);
3055         return;
3056       } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3057         SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0);
3058         return;
3059       } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3060         SelectLoad(Node, 2, AArch64::LD1Twov2d, AArch64::qsub0);
3061         return;
3062       }
3063       break;
3064     case Intrinsic::aarch64_neon_ld1x3:
           // ld1x3: dispatch on VT (assigned before this view begins) to the
           // matching LD1Three* opcode and hand the node to SelectLoad. Every call
           // passes 3, matching the x3 in the intrinsic name, plus dsub0 for the
           // 64-bit vector types or qsub0 for the 128-bit ones. A VT that is not
           // listed selects nothing here and leaves through the trailing break.
3065       if (VT == MVT::v8i8) {
3066         SelectLoad(Node, 3, AArch64::LD1Threev8b, AArch64::dsub0);
3067         return;
3068       } else if (VT == MVT::v16i8) {
3069         SelectLoad(Node, 3, AArch64::LD1Threev16b, AArch64::qsub0);
3070         return;
3071       } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3072         SelectLoad(Node, 3, AArch64::LD1Threev4h, AArch64::dsub0);
3073         return;
3074       } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3075         SelectLoad(Node, 3, AArch64::LD1Threev8h, AArch64::qsub0);
3076         return;
3077       } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3078         SelectLoad(Node, 3, AArch64::LD1Threev2s, AArch64::dsub0);
3079         return;
3080       } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3081         SelectLoad(Node, 3, AArch64::LD1Threev4s, AArch64::qsub0);
3082         return;
3083       } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3084         SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0);
3085         return;
3086       } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3087         SelectLoad(Node, 3, AArch64::LD1Threev2d, AArch64::qsub0);
3088         return;
3089       }
3090       break;
3091     case Intrinsic::aarch64_neon_ld1x4:
           // ld1x4: dispatch on VT (assigned before this view begins) to the
           // matching LD1Four* opcode and hand the node to SelectLoad. Every call
           // passes 4, matching the x4 in the intrinsic name, plus dsub0 for the
           // 64-bit vector types or qsub0 for the 128-bit ones. A VT that is not
           // listed selects nothing here and leaves through the trailing break.
3092       if (VT == MVT::v8i8) {
3093         SelectLoad(Node, 4, AArch64::LD1Fourv8b, AArch64::dsub0);
3094         return;
3095       } else if (VT == MVT::v16i8) {
3096         SelectLoad(Node, 4, AArch64::LD1Fourv16b, AArch64::qsub0);
3097         return;
3098       } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3099         SelectLoad(Node, 4, AArch64::LD1Fourv4h, AArch64::dsub0);
3100         return;
3101       } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3102         SelectLoad(Node, 4, AArch64::LD1Fourv8h, AArch64::qsub0);
3103         return;
3104       } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3105         SelectLoad(Node, 4, AArch64::LD1Fourv2s, AArch64::dsub0);
3106         return;
3107       } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3108         SelectLoad(Node, 4, AArch64::LD1Fourv4s, AArch64::qsub0);
3109         return;
3110       } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3111         SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0);
3112         return;
3113       } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3114         SelectLoad(Node, 4, AArch64::LD1Fourv2d, AArch64::qsub0);
3115         return;
3116       }
3117       break;
3118     case Intrinsic::aarch64_neon_ld2:
           // ld2: dispatch on VT (assigned before this view begins) to an LD2Two*
           // opcode and hand the node to SelectLoad with 2, plus dsub0 for the
           // 64-bit vector types or qsub0 for the 128-bit ones. A VT that is not
           // listed selects nothing here and leaves through the trailing break.
3119       if (VT == MVT::v8i8) {
3120         SelectLoad(Node, 2, AArch64::LD2Twov8b, AArch64::dsub0);
3121         return;
3122       } else if (VT == MVT::v16i8) {
3123         SelectLoad(Node, 2, AArch64::LD2Twov16b, AArch64::qsub0);
3124         return;
3125       } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3126         SelectLoad(Node, 2, AArch64::LD2Twov4h, AArch64::dsub0);
3127         return;
3128       } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3129         SelectLoad(Node, 2, AArch64::LD2Twov8h, AArch64::qsub0);
3130         return;
3131       } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3132         SelectLoad(Node, 2, AArch64::LD2Twov2s, AArch64::dsub0);
3133         return;
3134       } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3135         SelectLoad(Node, 2, AArch64::LD2Twov4s, AArch64::qsub0);
3136         return;
3137       } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
             // NOTE(review): the one-element 64-bit types use the LD1 two-register
             // opcode rather than an LD2 one; presumably there is no LD2 form for
             // the .1d arrangement - confirm against the instruction definitions.
3138         SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0);
3139         return;
3140       } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3141         SelectLoad(Node, 2, AArch64::LD2Twov2d, AArch64::qsub0);
3142         return;
3143       }
3144       break;
3145     case Intrinsic::aarch64_neon_ld3:
           // ld3: dispatch on VT (assigned before this view begins) to an
           // LD3Three* opcode and hand the node to SelectLoad with 3, plus dsub0
           // for the 64-bit vector types or qsub0 for the 128-bit ones. A VT that
           // is not listed selects nothing here and leaves through the break.
3146       if (VT == MVT::v8i8) {
3147         SelectLoad(Node, 3, AArch64::LD3Threev8b, AArch64::dsub0);
3148         return;
3149       } else if (VT == MVT::v16i8) {
3150         SelectLoad(Node, 3, AArch64::LD3Threev16b, AArch64::qsub0);
3151         return;
3152       } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3153         SelectLoad(Node, 3, AArch64::LD3Threev4h, AArch64::dsub0);
3154         return;
3155       } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3156         SelectLoad(Node, 3, AArch64::LD3Threev8h, AArch64::qsub0);
3157         return;
3158       } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3159         SelectLoad(Node, 3, AArch64::LD3Threev2s, AArch64::dsub0);
3160         return;
3161       } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3162         SelectLoad(Node, 3, AArch64::LD3Threev4s, AArch64::qsub0);
3163         return;
3164       } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
             // NOTE(review): the one-element 64-bit types use the LD1
             // three-register opcode rather than an LD3 one; presumably there is
             // no LD3 form for the .1d arrangement - confirm against the
             // instruction definitions.
3165         SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0);
3166         return;
3167       } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3168         SelectLoad(Node, 3, AArch64::LD3Threev2d, AArch64::qsub0);
3169         return;
3170       }
3171       break;
3172     case Intrinsic::aarch64_neon_ld4:
           // ld4: dispatch on VT (assigned before this view begins) to an
           // LD4Four* opcode and hand the node to SelectLoad with 4, plus dsub0
           // for the 64-bit vector types or qsub0 for the 128-bit ones. A VT that
           // is not listed selects nothing here and leaves through the break.
3173       if (VT == MVT::v8i8) {
3174         SelectLoad(Node, 4, AArch64::LD4Fourv8b, AArch64::dsub0);
3175         return;
3176       } else if (VT == MVT::v16i8) {
3177         SelectLoad(Node, 4, AArch64::LD4Fourv16b, AArch64::qsub0);
3178         return;
3179       } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3180         SelectLoad(Node, 4, AArch64::LD4Fourv4h, AArch64::dsub0);
3181         return;
3182       } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3183         SelectLoad(Node, 4, AArch64::LD4Fourv8h, AArch64::qsub0);
3184         return;
3185       } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3186         SelectLoad(Node, 4, AArch64::LD4Fourv2s, AArch64::dsub0);
3187         return;
3188       } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3189         SelectLoad(Node, 4, AArch64::LD4Fourv4s, AArch64::qsub0);
3190         return;
3191       } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
             // NOTE(review): the one-element 64-bit types use the LD1
             // four-register opcode rather than an LD4 one; presumably there is
             // no LD4 form for the .1d arrangement - confirm against the
             // instruction definitions.
3192         SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0);
3193         return;
3194       } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3195         SelectLoad(Node, 4, AArch64::LD4Fourv2d, AArch64::qsub0);
3196         return;
3197       }
3198       break;
3199     case Intrinsic::aarch64_neon_ld2r:
           // ld2r: dispatch on VT (assigned before this view begins) to an LD2Rv*
           // opcode and hand the node to SelectLoad with 2, plus dsub0 for the
           // 64-bit vector types or qsub0 for the 128-bit ones. Unlike the plain
           // ld2 case, the v1i64/v1f64 branch has its own LD2R opcode. A VT that
           // is not listed selects nothing here and leaves through the break.
3200       if (VT == MVT::v8i8) {
3201         SelectLoad(Node, 2, AArch64::LD2Rv8b, AArch64::dsub0);
3202         return;
3203       } else if (VT == MVT::v16i8) {
3204         SelectLoad(Node, 2, AArch64::LD2Rv16b, AArch64::qsub0);
3205         return;
3206       } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3207         SelectLoad(Node, 2, AArch64::LD2Rv4h, AArch64::dsub0);
3208         return;
3209       } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3210         SelectLoad(Node, 2, AArch64::LD2Rv8h, AArch64::qsub0);
3211         return;
3212       } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3213         SelectLoad(Node, 2, AArch64::LD2Rv2s, AArch64::dsub0);
3214         return;
3215       } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3216         SelectLoad(Node, 2, AArch64::LD2Rv4s, AArch64::qsub0);
3217         return;
3218       } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3219         SelectLoad(Node, 2, AArch64::LD2Rv1d, AArch64::dsub0);
3220         return;
3221       } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3222         SelectLoad(Node, 2, AArch64::LD2Rv2d, AArch64::qsub0);
3223         return;
3224       }
3225       break;
3226     case Intrinsic::aarch64_neon_ld3r:
           // ld3r: dispatch on VT (assigned before this view begins) to an LD3Rv*
           // opcode and hand the node to SelectLoad with 3, plus dsub0 for the
           // 64-bit vector types or qsub0 for the 128-bit ones. A VT that is not
           // listed selects nothing here and leaves through the trailing break.
3227       if (VT == MVT::v8i8) {
3228         SelectLoad(Node, 3, AArch64::LD3Rv8b, AArch64::dsub0);
3229         return;
3230       } else if (VT == MVT::v16i8) {
3231         SelectLoad(Node, 3, AArch64::LD3Rv16b, AArch64::qsub0);
3232         return;
3233       } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3234         SelectLoad(Node, 3, AArch64::LD3Rv4h, AArch64::dsub0);
3235         return;
3236       } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3237         SelectLoad(Node, 3, AArch64::LD3Rv8h, AArch64::qsub0);
3238         return;
3239       } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3240         SelectLoad(Node, 3, AArch64::LD3Rv2s, AArch64::dsub0);
3241         return;
3242       } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3243         SelectLoad(Node, 3, AArch64::LD3Rv4s, AArch64::qsub0);
3244         return;
3245       } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3246         SelectLoad(Node, 3, AArch64::LD3Rv1d, AArch64::dsub0);
3247         return;
3248       } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3249         SelectLoad(Node, 3, AArch64::LD3Rv2d, AArch64::qsub0);
3250         return;
3251       }
3252       break;
3253     case Intrinsic::aarch64_neon_ld4r:
           // ld4r: dispatch on VT (assigned before this view begins) to an LD4Rv*
           // opcode and hand the node to SelectLoad with 4, plus dsub0 for the
           // 64-bit vector types or qsub0 for the 128-bit ones. A VT that is not
           // listed selects nothing here and leaves through the trailing break.
3254       if (VT == MVT::v8i8) {
3255         SelectLoad(Node, 4, AArch64::LD4Rv8b, AArch64::dsub0);
3256         return;
3257       } else if (VT == MVT::v16i8) {
3258         SelectLoad(Node, 4, AArch64::LD4Rv16b, AArch64::qsub0);
3259         return;
3260       } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3261         SelectLoad(Node, 4, AArch64::LD4Rv4h, AArch64::dsub0);
3262         return;
3263       } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3264         SelectLoad(Node, 4, AArch64::LD4Rv8h, AArch64::qsub0);
3265         return;
3266       } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3267         SelectLoad(Node, 4, AArch64::LD4Rv2s, AArch64::dsub0);
3268         return;
3269       } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3270         SelectLoad(Node, 4, AArch64::LD4Rv4s, AArch64::qsub0);
3271         return;
3272       } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3273         SelectLoad(Node, 4, AArch64::LD4Rv1d, AArch64::dsub0);
3274         return;
3275       } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3276         SelectLoad(Node, 4, AArch64::LD4Rv2d, AArch64::qsub0);
3277         return;
3278       }
3279       break;
3280     case Intrinsic::aarch64_neon_ld2lane:
           // ld2lane: the opcode depends only on the element width of VT (8, 16,
           // 32 or 64 bits); the 64-bit and 128-bit vector types of one element
           // width share a single LD2i* opcode. SelectLoadLane is called with 2.
           // A VT that is not listed selects nothing here and takes the break.
3281       if (VT == MVT::v16i8 || VT == MVT::v8i8) {
3282         SelectLoadLane(Node, 2, AArch64::LD2i8);
3283         return;
3284       } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
3285                  VT == MVT::v8f16) {
3286         SelectLoadLane(Node, 2, AArch64::LD2i16);
3287         return;
3288       } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
3289                  VT == MVT::v2f32) {
3290         SelectLoadLane(Node, 2, AArch64::LD2i32);
3291         return;
3292       } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
3293                  VT == MVT::v1f64) {
3294         SelectLoadLane(Node, 2, AArch64::LD2i64);
3295         return;
3296       }
3297       break;
3298     case Intrinsic::aarch64_neon_ld3lane:
           // ld3lane: the opcode depends only on the element width of VT (8, 16,
           // 32 or 64 bits); the 64-bit and 128-bit vector types of one element
           // width share a single LD3i* opcode. SelectLoadLane is called with 3.
           // A VT that is not listed selects nothing here and takes the break.
3299       if (VT == MVT::v16i8 || VT == MVT::v8i8) {
3300         SelectLoadLane(Node, 3, AArch64::LD3i8);
3301         return;
3302       } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
3303                  VT == MVT::v8f16) {
3304         SelectLoadLane(Node, 3, AArch64::LD3i16);
3305         return;
3306       } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
3307                  VT == MVT::v2f32) {
3308         SelectLoadLane(Node, 3, AArch64::LD3i32);
3309         return;
3310       } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
3311                  VT == MVT::v1f64) {
3312         SelectLoadLane(Node, 3, AArch64::LD3i64);
3313         return;
3314       }
3315       break;
3316     case Intrinsic::aarch64_neon_ld4lane:
           // ld4lane: the opcode depends only on the element width of VT (8, 16,
           // 32 or 64 bits); the 64-bit and 128-bit vector types of one element
           // width share a single LD4i* opcode. SelectLoadLane is called with 4.
           // A VT that is not listed selects nothing here and takes the break.
3317       if (VT == MVT::v16i8 || VT == MVT::v8i8) {
3318         SelectLoadLane(Node, 4, AArch64::LD4i8);
3319         return;
3320       } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
3321                  VT == MVT::v8f16) {
3322         SelectLoadLane(Node, 4, AArch64::LD4i16);
3323         return;
3324       } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
3325                  VT == MVT::v2f32) {
3326         SelectLoadLane(Node, 4, AArch64::LD4i32);
3327         return;
3328       } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
3329                  VT == MVT::v1f64) {
3330         SelectLoadLane(Node, 4, AArch64::LD4i64);
3331         return;
3332       }
3333       break;
3334     }
3335   } break;
3336   case ISD::INTRINSIC_WO_CHAIN: {
         // Chainless intrinsics: the intrinsic ID is read from operand 0 as a
         // constant. Intrinsics not listed below, and the listed ones that decline
         // to select, fall out of the inner switch and then out of this arm.
3337     unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue();
3338     switch (IntNo) {
3339     default:
3340       break;
3341     case Intrinsic::aarch64_tagp:
3342       SelectTagP(Node);
3343       return;
         // tbl2/tbl3/tbl4 and tbx2/tbx3/tbx4: each calls SelectTable with the
         // table count from the intrinsic name and the v8i8 opcode when
         // VT == v8i8, otherwise the v16i8 opcode (no other VT is checked). The
         // final argument is false for the TBL forms and true for the TBX forms;
         // presumably it marks the extension variant - SelectTable is defined
         // outside this view, confirm there.
3344     case Intrinsic::aarch64_neon_tbl2:
3345       SelectTable(Node, 2,
3346                   VT == MVT::v8i8 ? AArch64::TBLv8i8Two : AArch64::TBLv16i8Two,
3347                   false);
3348       return;
3349     case Intrinsic::aarch64_neon_tbl3:
3350       SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBLv8i8Three
3351                                            : AArch64::TBLv16i8Three,
3352                   false);
3353       return;
3354     case Intrinsic::aarch64_neon_tbl4:
3355       SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBLv8i8Four
3356                                            : AArch64::TBLv16i8Four,
3357                   false);
3358       return;
3359     case Intrinsic::aarch64_neon_tbx2:
3360       SelectTable(Node, 2,
3361                   VT == MVT::v8i8 ? AArch64::TBXv8i8Two : AArch64::TBXv16i8Two,
3362                   true);
3363       return;
3364     case Intrinsic::aarch64_neon_tbx3:
3365       SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBXv8i8Three
3366                                            : AArch64::TBXv16i8Three,
3367                   true);
3368       return;
3369     case Intrinsic::aarch64_neon_tbx4:
3370       SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBXv8i8Four
3371                                            : AArch64::TBXv16i8Four,
3372                   true);
3373       return;
         // smull/umull share one attempt: if tryMULLV64LaneV128 reports success
         // the node is done, otherwise selection continues after this switch.
3374     case Intrinsic::aarch64_neon_smull:
3375     case Intrinsic::aarch64_neon_umull:
3376       if (tryMULLV64LaneV128(IntNo, Node))
3377         return;
3378       break;
3379     }
3380     break;
3381   }
3382   case ISD::INTRINSIC_VOID: {
         // Void intrinsics (the NEON structured stores below). The intrinsic ID is
         // read from operand 1 here, not operand 0 as in the chainless arm;
         // presumably operand 0 is the chain - confirm against the node layout.
3383     unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
         // When the node has at least three operands, VT is replaced by the value
         // type of operand 2, so the dispatch below keys on that operand's type
         // (presumably the first vector being stored - confirm).
3384     if (Node->getNumOperands() >= 3)
3385       VT = Node->getOperand(2)->getValueType(0);
3386     switch (IntNo) {
3387     default:
3388       break;
         // st1x2/st1x3/st1x4: pick the ST1Two*/ST1Three*/ST1Four* opcode matching
         // VT and call SelectStore with the count from the intrinsic name. A VT
         // that is not listed selects nothing and leaves through the break.
3389     case Intrinsic::aarch64_neon_st1x2: {
3390       if (VT == MVT::v8i8) {
3391         SelectStore(Node, 2, AArch64::ST1Twov8b);
3392         return;
3393       } else if (VT == MVT::v16i8) {
3394         SelectStore(Node, 2, AArch64::ST1Twov16b);
3395         return;
3396       } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3397         SelectStore(Node, 2, AArch64::ST1Twov4h);
3398         return;
3399       } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3400         SelectStore(Node, 2, AArch64::ST1Twov8h);
3401         return;
3402       } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3403         SelectStore(Node, 2, AArch64::ST1Twov2s);
3404         return;
3405       } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3406         SelectStore(Node, 2, AArch64::ST1Twov4s);
3407         return;
3408       } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3409         SelectStore(Node, 2, AArch64::ST1Twov2d);
3410         return;
3411       } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3412         SelectStore(Node, 2, AArch64::ST1Twov1d);
3413         return;
3414       }
3415       break;
3416     }
3417     case Intrinsic::aarch64_neon_st1x3: {
3418       if (VT == MVT::v8i8) {
3419         SelectStore(Node, 3, AArch64::ST1Threev8b);
3420         return;
3421       } else if (VT == MVT::v16i8) {
3422         SelectStore(Node, 3, AArch64::ST1Threev16b);
3423         return;
3424       } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3425         SelectStore(Node, 3, AArch64::ST1Threev4h);
3426         return;
3427       } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3428         SelectStore(Node, 3, AArch64::ST1Threev8h);
3429         return;
3430       } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3431         SelectStore(Node, 3, AArch64::ST1Threev2s);
3432         return;
3433       } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3434         SelectStore(Node, 3, AArch64::ST1Threev4s);
3435         return;
3436       } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3437         SelectStore(Node, 3, AArch64::ST1Threev2d);
3438         return;
3439       } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3440         SelectStore(Node, 3, AArch64::ST1Threev1d);
3441         return;
3442       }
3443       break;
3444     }
3445     case Intrinsic::aarch64_neon_st1x4: {
3446       if (VT == MVT::v8i8) {
3447         SelectStore(Node, 4, AArch64::ST1Fourv8b);
3448         return;
3449       } else if (VT == MVT::v16i8) {
3450         SelectStore(Node, 4, AArch64::ST1Fourv16b);
3451         return;
3452       } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3453         SelectStore(Node, 4, AArch64::ST1Fourv4h);
3454         return;
3455       } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3456         SelectStore(Node, 4, AArch64::ST1Fourv8h);
3457         return;
3458       } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3459         SelectStore(Node, 4, AArch64::ST1Fourv2s);
3460         return;
3461       } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3462         SelectStore(Node, 4, AArch64::ST1Fourv4s);
3463         return;
3464       } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3465         SelectStore(Node, 4, AArch64::ST1Fourv2d);
3466         return;
3467       } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3468         SelectStore(Node, 4, AArch64::ST1Fourv1d);
3469         return;
3470       }
3471       break;
3472     }
         // st2/st3/st4: same VT dispatch with ST2Two*/ST3Three*/ST4Four* opcodes.
         // NOTE(review): in each of these three cases the v1i64/v1f64 branch uses
         // the ST1 multi-register opcode (ST1Twov1d, ST1Threev1d, ST1Fourv1d)
         // instead of an STn one; presumably no STn form exists for the .1d
         // arrangement - confirm against the instruction definitions.
3473     case Intrinsic::aarch64_neon_st2: {
3474       if (VT == MVT::v8i8) {
3475         SelectStore(Node, 2, AArch64::ST2Twov8b);
3476         return;
3477       } else if (VT == MVT::v16i8) {
3478         SelectStore(Node, 2, AArch64::ST2Twov16b);
3479         return;
3480       } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3481         SelectStore(Node, 2, AArch64::ST2Twov4h);
3482         return;
3483       } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3484         SelectStore(Node, 2, AArch64::ST2Twov8h);
3485         return;
3486       } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3487         SelectStore(Node, 2, AArch64::ST2Twov2s);
3488         return;
3489       } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3490         SelectStore(Node, 2, AArch64::ST2Twov4s);
3491         return;
3492       } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3493         SelectStore(Node, 2, AArch64::ST2Twov2d);
3494         return;
3495       } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3496         SelectStore(Node, 2, AArch64::ST1Twov1d);
3497         return;
3498       }
3499       break;
3500     }
3501     case Intrinsic::aarch64_neon_st3: {
3502       if (VT == MVT::v8i8) {
3503         SelectStore(Node, 3, AArch64::ST3Threev8b);
3504         return;
3505       } else if (VT == MVT::v16i8) {
3506         SelectStore(Node, 3, AArch64::ST3Threev16b);
3507         return;
3508       } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3509         SelectStore(Node, 3, AArch64::ST3Threev4h);
3510         return;
3511       } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3512         SelectStore(Node, 3, AArch64::ST3Threev8h);
3513         return;
3514       } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3515         SelectStore(Node, 3, AArch64::ST3Threev2s);
3516         return;
3517       } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3518         SelectStore(Node, 3, AArch64::ST3Threev4s);
3519         return;
3520       } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3521         SelectStore(Node, 3, AArch64::ST3Threev2d);
3522         return;
3523       } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3524         SelectStore(Node, 3, AArch64::ST1Threev1d);
3525         return;
3526       }
3527       break;
3528     }
3529     case Intrinsic::aarch64_neon_st4: {
3530       if (VT == MVT::v8i8) {
3531         SelectStore(Node, 4, AArch64::ST4Fourv8b);
3532         return;
3533       } else if (VT == MVT::v16i8) {
3534         SelectStore(Node, 4, AArch64::ST4Fourv16b);
3535         return;
3536       } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3537         SelectStore(Node, 4, AArch64::ST4Fourv4h);
3538         return;
3539       } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3540         SelectStore(Node, 4, AArch64::ST4Fourv8h);
3541         return;
3542       } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3543         SelectStore(Node, 4, AArch64::ST4Fourv2s);
3544         return;
3545       } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3546         SelectStore(Node, 4, AArch64::ST4Fourv4s);
3547         return;
3548       } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3549         SelectStore(Node, 4, AArch64::ST4Fourv2d);
3550         return;
3551       } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3552         SelectStore(Node, 4, AArch64::ST1Fourv1d);
3553         return;
3554       }
3555       break;
3556     }
         // st2lane/st3lane/st4lane: the opcode depends only on the element width
         // of VT; the 64-bit and 128-bit vector types of one element width share
         // a single STni* opcode. SelectStoreLane gets the count from the
         // intrinsic name.
3557     case Intrinsic::aarch64_neon_st2lane: {
3558       if (VT == MVT::v16i8 || VT == MVT::v8i8) {
3559         SelectStoreLane(Node, 2, AArch64::ST2i8);
3560         return;
3561       } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
3562                  VT == MVT::v8f16) {
3563         SelectStoreLane(Node, 2, AArch64::ST2i16);
3564         return;
3565       } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
3566                  VT == MVT::v2f32) {
3567         SelectStoreLane(Node, 2, AArch64::ST2i32);
3568         return;
3569       } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
3570                  VT == MVT::v1f64) {
3571         SelectStoreLane(Node, 2, AArch64::ST2i64);
3572         return;
3573       }
3574       break;
3575     }
3576     case Intrinsic::aarch64_neon_st3lane: {
3577       if (VT == MVT::v16i8 || VT == MVT::v8i8) {
3578         SelectStoreLane(Node, 3, AArch64::ST3i8);
3579         return;
3580       } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
3581                  VT == MVT::v8f16) {
3582         SelectStoreLane(Node, 3, AArch64::ST3i16);
3583         return;
3584       } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
3585                  VT == MVT::v2f32) {
3586         SelectStoreLane(Node, 3, AArch64::ST3i32);
3587         return;
3588       } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
3589                  VT == MVT::v1f64) {
3590         SelectStoreLane(Node, 3, AArch64::ST3i64);
3591         return;
3592       }
3593       break;
3594     }
3595     case Intrinsic::aarch64_neon_st4lane: {
3596       if (VT == MVT::v16i8 || VT == MVT::v8i8) {
3597         SelectStoreLane(Node, 4, AArch64::ST4i8);
3598         return;
3599       } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
3600                  VT == MVT::v8f16) {
3601         SelectStoreLane(Node, 4, AArch64::ST4i16);
3602         return;
3603       } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
3604                  VT == MVT::v2f32) {
3605         SelectStoreLane(Node, 4, AArch64::ST4i32);
3606         return;
3607       } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
3608                  VT == MVT::v1f64) {
3609         SelectStoreLane(Node, 4, AArch64::ST4i64);
3610         return;
3611       }
3612       break;
3613     }
3614     }
3615     break;
3616   }
3617   case AArch64ISD::LD2post: {
         // LD2post: pick the _POST-suffixed LD2Two* opcode matching VT (assigned
         // before this view begins) and call SelectPostLoad with 2, plus dsub0 for
         // the 64-bit vector types or qsub0 for the 128-bit ones. A VT that is
         // not listed selects nothing here and leaves through the break.
3618     if (VT == MVT::v8i8) {
3619       SelectPostLoad(Node, 2, AArch64::LD2Twov8b_POST, AArch64::dsub0);
3620       return;
3621     } else if (VT == MVT::v16i8) {
3622       SelectPostLoad(Node, 2, AArch64::LD2Twov16b_POST, AArch64::qsub0);
3623       return;
3624     } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3625       SelectPostLoad(Node, 2, AArch64::LD2Twov4h_POST, AArch64::dsub0);
3626       return;
3627     } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3628       SelectPostLoad(Node, 2, AArch64::LD2Twov8h_POST, AArch64::qsub0);
3629       return;
3630     } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3631       SelectPostLoad(Node, 2, AArch64::LD2Twov2s_POST, AArch64::dsub0);
3632       return;
3633     } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3634       SelectPostLoad(Node, 2, AArch64::LD2Twov4s_POST, AArch64::qsub0);
3635       return;
3636     } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
           // NOTE(review): the one-element 64-bit types use the LD1 two-register
           // _POST opcode rather than an LD2 one; presumably there is no LD2 form
           // for the .1d arrangement - confirm against the instruction definitions.
3637       SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0);
3638       return;
3639     } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3640       SelectPostLoad(Node, 2, AArch64::LD2Twov2d_POST, AArch64::qsub0);
3641       return;
3642     }
3643     break;
3644   }
3645   case AArch64ISD::LD3post: {
         // LD3post: pick the _POST-suffixed LD3Three* opcode matching VT (assigned
         // before this view begins) and call SelectPostLoad with 3, plus dsub0 for
         // the 64-bit vector types or qsub0 for the 128-bit ones. A VT that is
         // not listed selects nothing here and leaves through the break.
3646     if (VT == MVT::v8i8) {
3647       SelectPostLoad(Node, 3, AArch64::LD3Threev8b_POST, AArch64::dsub0);
3648       return;
3649     } else if (VT == MVT::v16i8) {
3650       SelectPostLoad(Node, 3, AArch64::LD3Threev16b_POST, AArch64::qsub0);
3651       return;
3652     } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3653       SelectPostLoad(Node, 3, AArch64::LD3Threev4h_POST, AArch64::dsub0);
3654       return;
3655     } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3656       SelectPostLoad(Node, 3, AArch64::LD3Threev8h_POST, AArch64::qsub0);
3657       return;
3658     } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3659       SelectPostLoad(Node, 3, AArch64::LD3Threev2s_POST, AArch64::dsub0);
3660       return;
3661     } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3662       SelectPostLoad(Node, 3, AArch64::LD3Threev4s_POST, AArch64::qsub0);
3663       return;
3664     } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
           // NOTE(review): the one-element 64-bit types use the LD1 three-register
           // _POST opcode rather than an LD3 one; presumably there is no LD3 form
           // for the .1d arrangement - confirm against the instruction definitions.
3665       SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0);
3666       return;
3667     } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3668       SelectPostLoad(Node, 3, AArch64::LD3Threev2d_POST, AArch64::qsub0);
3669       return;
3670     }
3671     break;
3672   }
3673   case AArch64ISD::LD4post: {
         // LD4post: pick the _POST-suffixed LD4Four* opcode matching VT (assigned
         // before this view begins) and call SelectPostLoad with 4, plus dsub0 for
         // the 64-bit vector types or qsub0 for the 128-bit ones. A VT that is
         // not listed selects nothing here and leaves through the break.
3674     if (VT == MVT::v8i8) {
3675       SelectPostLoad(Node, 4, AArch64::LD4Fourv8b_POST, AArch64::dsub0);
3676       return;
3677     } else if (VT == MVT::v16i8) {
3678       SelectPostLoad(Node, 4, AArch64::LD4Fourv16b_POST, AArch64::qsub0);
3679       return;
3680     } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3681       SelectPostLoad(Node, 4, AArch64::LD4Fourv4h_POST, AArch64::dsub0);
3682       return;
3683     } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3684       SelectPostLoad(Node, 4, AArch64::LD4Fourv8h_POST, AArch64::qsub0);
3685       return;
3686     } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3687       SelectPostLoad(Node, 4, AArch64::LD4Fourv2s_POST, AArch64::dsub0);
3688       return;
3689     } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3690       SelectPostLoad(Node, 4, AArch64::LD4Fourv4s_POST, AArch64::qsub0);
3691       return;
3692     } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
           // NOTE(review): the one-element 64-bit types use the LD1 four-register
           // _POST opcode rather than an LD4 one; presumably there is no LD4 form
           // for the .1d arrangement - confirm against the instruction definitions.
3693       SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0);
3694       return;
3695     } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3696       SelectPostLoad(Node, 4, AArch64::LD4Fourv2d_POST, AArch64::qsub0);
3697       return;
3698     }
3699     break;
3700   }
3701   case AArch64ISD::LD1x2post: {
         // LD1x2post: pick the _POST-suffixed LD1Two* opcode matching VT (assigned
         // before this view begins) and call SelectPostLoad with 2, plus dsub0 for
         // the 64-bit vector types or qsub0 for the 128-bit ones. A VT that is
         // not listed selects nothing here and leaves through the break.
3702     if (VT == MVT::v8i8) {
3703       SelectPostLoad(Node, 2, AArch64::LD1Twov8b_POST, AArch64::dsub0);
3704       return;
3705     } else if (VT == MVT::v16i8) {
3706       SelectPostLoad(Node, 2, AArch64::LD1Twov16b_POST, AArch64::qsub0);
3707       return;
3708     } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3709       SelectPostLoad(Node, 2, AArch64::LD1Twov4h_POST, AArch64::dsub0);
3710       return;
3711     } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3712       SelectPostLoad(Node, 2, AArch64::LD1Twov8h_POST, AArch64::qsub0);
3713       return;
3714     } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3715       SelectPostLoad(Node, 2, AArch64::LD1Twov2s_POST, AArch64::dsub0);
3716       return;
3717     } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3718       SelectPostLoad(Node, 2, AArch64::LD1Twov4s_POST, AArch64::qsub0);
3719       return;
3720     } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3721       SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0);
3722       return;
3723     } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3724       SelectPostLoad(Node, 2, AArch64::LD1Twov2d_POST, AArch64::qsub0);
3725       return;
3726     }
3727     break;
3728   }
3729   case AArch64ISD::LD1x3post: {
         // LD1x3post: pick the _POST-suffixed LD1Three* opcode matching VT
         // (assigned before this view begins) and call SelectPostLoad with 3, plus
         // dsub0 for the 64-bit vector types or qsub0 for the 128-bit ones. A VT
         // that is not listed selects nothing here and leaves through the break.
3730     if (VT == MVT::v8i8) {
3731       SelectPostLoad(Node, 3, AArch64::LD1Threev8b_POST, AArch64::dsub0);
3732       return;
3733     } else if (VT == MVT::v16i8) {
3734       SelectPostLoad(Node, 3, AArch64::LD1Threev16b_POST, AArch64::qsub0);
3735       return;
3736     } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3737       SelectPostLoad(Node, 3, AArch64::LD1Threev4h_POST, AArch64::dsub0);
3738       return;
3739     } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3740       SelectPostLoad(Node, 3, AArch64::LD1Threev8h_POST, AArch64::qsub0);
3741       return;
3742     } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3743       SelectPostLoad(Node, 3, AArch64::LD1Threev2s_POST, AArch64::dsub0);
3744       return;
3745     } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3746       SelectPostLoad(Node, 3, AArch64::LD1Threev4s_POST, AArch64::qsub0);
3747       return;
3748     } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3749       SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0);
3750       return;
3751     } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3752       SelectPostLoad(Node, 3, AArch64::LD1Threev2d_POST, AArch64::qsub0);
3753       return;
3754     }
3755     break;
3756   }
3757   case AArch64ISD::LD1x4post: {
         // LD1x4post: pick the _POST-suffixed LD1Four* opcode matching VT
         // (assigned before this view begins) and call SelectPostLoad with 4, plus
         // dsub0 for the 64-bit vector types or qsub0 for the 128-bit ones. A VT
         // that is not listed selects nothing here and leaves through the break.
3758     if (VT == MVT::v8i8) {
3759       SelectPostLoad(Node, 4, AArch64::LD1Fourv8b_POST, AArch64::dsub0);
3760       return;
3761     } else if (VT == MVT::v16i8) {
3762       SelectPostLoad(Node, 4, AArch64::LD1Fourv16b_POST, AArch64::qsub0);
3763       return;
3764     } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3765       SelectPostLoad(Node, 4, AArch64::LD1Fourv4h_POST, AArch64::dsub0);
3766       return;
3767     } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3768       SelectPostLoad(Node, 4, AArch64::LD1Fourv8h_POST, AArch64::qsub0);
3769       return;
3770     } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3771       SelectPostLoad(Node, 4, AArch64::LD1Fourv2s_POST, AArch64::dsub0);
3772       return;
3773     } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3774       SelectPostLoad(Node, 4, AArch64::LD1Fourv4s_POST, AArch64::qsub0);
3775       return;
3776     } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3777       SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0);
3778       return;
3779     } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3780       SelectPostLoad(Node, 4, AArch64::LD1Fourv2d_POST, AArch64::qsub0);
3781       return;
3782     }
3783     break;
3784   }
3785   case AArch64ISD::LD1DUPpost: {
         // LD1DUPpost: pick the _POST-suffixed LD1Rv* opcode matching VT (assigned
         // before this view begins) and call SelectPostLoad with 1, plus dsub0 for
         // the 64-bit vector types or qsub0 for the 128-bit ones. A VT that is
         // not listed selects nothing here and leaves through the break.
3786     if (VT == MVT::v8i8) {
3787       SelectPostLoad(Node, 1, AArch64::LD1Rv8b_POST, AArch64::dsub0);
3788       return;
3789     } else if (VT == MVT::v16i8) {
3790       SelectPostLoad(Node, 1, AArch64::LD1Rv16b_POST, AArch64::qsub0);
3791       return;
3792     } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3793       SelectPostLoad(Node, 1, AArch64::LD1Rv4h_POST, AArch64::dsub0);
3794       return;
3795     } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3796       SelectPostLoad(Node, 1, AArch64::LD1Rv8h_POST, AArch64::qsub0);
3797       return;
3798     } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3799       SelectPostLoad(Node, 1, AArch64::LD1Rv2s_POST, AArch64::dsub0);
3800       return;
3801     } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3802       SelectPostLoad(Node, 1, AArch64::LD1Rv4s_POST, AArch64::qsub0);
3803       return;
3804     } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3805       SelectPostLoad(Node, 1, AArch64::LD1Rv1d_POST, AArch64::dsub0);
3806       return;
3807     } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3808       SelectPostLoad(Node, 1, AArch64::LD1Rv2d_POST, AArch64::qsub0);
3809       return;
3810     }
3811     break;
3812   }
3813   case AArch64ISD::LD2DUPpost: {
         // LD2DUPpost: pick the _POST-suffixed LD2Rv* opcode matching VT (assigned
         // before this view begins) and call SelectPostLoad with 2, plus dsub0 for
         // the 64-bit vector types or qsub0 for the 128-bit ones. A VT that is
         // not listed selects nothing here and leaves through the break.
3814     if (VT == MVT::v8i8) {
3815       SelectPostLoad(Node, 2, AArch64::LD2Rv8b_POST, AArch64::dsub0);
3816       return;
3817     } else if (VT == MVT::v16i8) {
3818       SelectPostLoad(Node, 2, AArch64::LD2Rv16b_POST, AArch64::qsub0);
3819       return;
3820     } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3821       SelectPostLoad(Node, 2, AArch64::LD2Rv4h_POST, AArch64::dsub0);
3822       return;
3823     } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3824       SelectPostLoad(Node, 2, AArch64::LD2Rv8h_POST, AArch64::qsub0);
3825       return;
3826     } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3827       SelectPostLoad(Node, 2, AArch64::LD2Rv2s_POST, AArch64::dsub0);
3828       return;
3829     } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3830       SelectPostLoad(Node, 2, AArch64::LD2Rv4s_POST, AArch64::qsub0);
3831       return;
3832     } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3833       SelectPostLoad(Node, 2, AArch64::LD2Rv1d_POST, AArch64::dsub0);
3834       return;
3835     } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3836       SelectPostLoad(Node, 2, AArch64::LD2Rv2d_POST, AArch64::qsub0);
3837       return;
3838     }
3839     break;
3840   }
3841   case AArch64ISD::LD3DUPpost: {
3842     if (VT == MVT::v8i8) {
3843       SelectPostLoad(Node, 3, AArch64::LD3Rv8b_POST, AArch64::dsub0);
3844       return;
3845     } else if (VT == MVT::v16i8) {
3846       SelectPostLoad(Node, 3, AArch64::LD3Rv16b_POST, AArch64::qsub0);
3847       return;
3848     } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3849       SelectPostLoad(Node, 3, AArch64::LD3Rv4h_POST, AArch64::dsub0);
3850       return;
3851     } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3852       SelectPostLoad(Node, 3, AArch64::LD3Rv8h_POST, AArch64::qsub0);
3853       return;
3854     } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3855       SelectPostLoad(Node, 3, AArch64::LD3Rv2s_POST, AArch64::dsub0);
3856       return;
3857     } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3858       SelectPostLoad(Node, 3, AArch64::LD3Rv4s_POST, AArch64::qsub0);
3859       return;
3860     } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3861       SelectPostLoad(Node, 3, AArch64::LD3Rv1d_POST, AArch64::dsub0);
3862       return;
3863     } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3864       SelectPostLoad(Node, 3, AArch64::LD3Rv2d_POST, AArch64::qsub0);
3865       return;
3866     }
3867     break;
3868   }
3869   case AArch64ISD::LD4DUPpost: {
3870     if (VT == MVT::v8i8) {
3871       SelectPostLoad(Node, 4, AArch64::LD4Rv8b_POST, AArch64::dsub0);
3872       return;
3873     } else if (VT == MVT::v16i8) {
3874       SelectPostLoad(Node, 4, AArch64::LD4Rv16b_POST, AArch64::qsub0);
3875       return;
3876     } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3877       SelectPostLoad(Node, 4, AArch64::LD4Rv4h_POST, AArch64::dsub0);
3878       return;
3879     } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3880       SelectPostLoad(Node, 4, AArch64::LD4Rv8h_POST, AArch64::qsub0);
3881       return;
3882     } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3883       SelectPostLoad(Node, 4, AArch64::LD4Rv2s_POST, AArch64::dsub0);
3884       return;
3885     } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3886       SelectPostLoad(Node, 4, AArch64::LD4Rv4s_POST, AArch64::qsub0);
3887       return;
3888     } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3889       SelectPostLoad(Node, 4, AArch64::LD4Rv1d_POST, AArch64::dsub0);
3890       return;
3891     } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3892       SelectPostLoad(Node, 4, AArch64::LD4Rv2d_POST, AArch64::qsub0);
3893       return;
3894     }
3895     break;
3896   }
3897   case AArch64ISD::LD1LANEpost: {
3898     if (VT == MVT::v16i8 || VT == MVT::v8i8) {
3899       SelectPostLoadLane(Node, 1, AArch64::LD1i8_POST);
3900       return;
3901     } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
3902                VT == MVT::v8f16) {
3903       SelectPostLoadLane(Node, 1, AArch64::LD1i16_POST);
3904       return;
3905     } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
3906                VT == MVT::v2f32) {
3907       SelectPostLoadLane(Node, 1, AArch64::LD1i32_POST);
3908       return;
3909     } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
3910                VT == MVT::v1f64) {
3911       SelectPostLoadLane(Node, 1, AArch64::LD1i64_POST);
3912       return;
3913     }
3914     break;
3915   }
3916   case AArch64ISD::LD2LANEpost: {
3917     if (VT == MVT::v16i8 || VT == MVT::v8i8) {
3918       SelectPostLoadLane(Node, 2, AArch64::LD2i8_POST);
3919       return;
3920     } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
3921                VT == MVT::v8f16) {
3922       SelectPostLoadLane(Node, 2, AArch64::LD2i16_POST);
3923       return;
3924     } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
3925                VT == MVT::v2f32) {
3926       SelectPostLoadLane(Node, 2, AArch64::LD2i32_POST);
3927       return;
3928     } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
3929                VT == MVT::v1f64) {
3930       SelectPostLoadLane(Node, 2, AArch64::LD2i64_POST);
3931       return;
3932     }
3933     break;
3934   }
3935   case AArch64ISD::LD3LANEpost: {
3936     if (VT == MVT::v16i8 || VT == MVT::v8i8) {
3937       SelectPostLoadLane(Node, 3, AArch64::LD3i8_POST);
3938       return;
3939     } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
3940                VT == MVT::v8f16) {
3941       SelectPostLoadLane(Node, 3, AArch64::LD3i16_POST);
3942       return;
3943     } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
3944                VT == MVT::v2f32) {
3945       SelectPostLoadLane(Node, 3, AArch64::LD3i32_POST);
3946       return;
3947     } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
3948                VT == MVT::v1f64) {
3949       SelectPostLoadLane(Node, 3, AArch64::LD3i64_POST);
3950       return;
3951     }
3952     break;
3953   }
3954   case AArch64ISD::LD4LANEpost: {
3955     if (VT == MVT::v16i8 || VT == MVT::v8i8) {
3956       SelectPostLoadLane(Node, 4, AArch64::LD4i8_POST);
3957       return;
3958     } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
3959                VT == MVT::v8f16) {
3960       SelectPostLoadLane(Node, 4, AArch64::LD4i16_POST);
3961       return;
3962     } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
3963                VT == MVT::v2f32) {
3964       SelectPostLoadLane(Node, 4, AArch64::LD4i32_POST);
3965       return;
3966     } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
3967                VT == MVT::v1f64) {
3968       SelectPostLoadLane(Node, 4, AArch64::LD4i64_POST);
3969       return;
3970     }
3971     break;
3972   }
3973   case AArch64ISD::ST2post: {
3974     VT = Node->getOperand(1).getValueType();
3975     if (VT == MVT::v8i8) {
3976       SelectPostStore(Node, 2, AArch64::ST2Twov8b_POST);
3977       return;
3978     } else if (VT == MVT::v16i8) {
3979       SelectPostStore(Node, 2, AArch64::ST2Twov16b_POST);
3980       return;
3981     } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
3982       SelectPostStore(Node, 2, AArch64::ST2Twov4h_POST);
3983       return;
3984     } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
3985       SelectPostStore(Node, 2, AArch64::ST2Twov8h_POST);
3986       return;
3987     } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
3988       SelectPostStore(Node, 2, AArch64::ST2Twov2s_POST);
3989       return;
3990     } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
3991       SelectPostStore(Node, 2, AArch64::ST2Twov4s_POST);
3992       return;
3993     } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
3994       SelectPostStore(Node, 2, AArch64::ST2Twov2d_POST);
3995       return;
3996     } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
3997       SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST);
3998       return;
3999     }
4000     break;
4001   }
4002   case AArch64ISD::ST3post: {
4003     VT = Node->getOperand(1).getValueType();
4004     if (VT == MVT::v8i8) {
4005       SelectPostStore(Node, 3, AArch64::ST3Threev8b_POST);
4006       return;
4007     } else if (VT == MVT::v16i8) {
4008       SelectPostStore(Node, 3, AArch64::ST3Threev16b_POST);
4009       return;
4010     } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
4011       SelectPostStore(Node, 3, AArch64::ST3Threev4h_POST);
4012       return;
4013     } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
4014       SelectPostStore(Node, 3, AArch64::ST3Threev8h_POST);
4015       return;
4016     } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4017       SelectPostStore(Node, 3, AArch64::ST3Threev2s_POST);
4018       return;
4019     } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4020       SelectPostStore(Node, 3, AArch64::ST3Threev4s_POST);
4021       return;
4022     } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4023       SelectPostStore(Node, 3, AArch64::ST3Threev2d_POST);
4024       return;
4025     } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4026       SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST);
4027       return;
4028     }
4029     break;
4030   }
4031   case AArch64ISD::ST4post: {
4032     VT = Node->getOperand(1).getValueType();
4033     if (VT == MVT::v8i8) {
4034       SelectPostStore(Node, 4, AArch64::ST4Fourv8b_POST);
4035       return;
4036     } else if (VT == MVT::v16i8) {
4037       SelectPostStore(Node, 4, AArch64::ST4Fourv16b_POST);
4038       return;
4039     } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
4040       SelectPostStore(Node, 4, AArch64::ST4Fourv4h_POST);
4041       return;
4042     } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
4043       SelectPostStore(Node, 4, AArch64::ST4Fourv8h_POST);
4044       return;
4045     } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4046       SelectPostStore(Node, 4, AArch64::ST4Fourv2s_POST);
4047       return;
4048     } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4049       SelectPostStore(Node, 4, AArch64::ST4Fourv4s_POST);
4050       return;
4051     } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4052       SelectPostStore(Node, 4, AArch64::ST4Fourv2d_POST);
4053       return;
4054     } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4055       SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST);
4056       return;
4057     }
4058     break;
4059   }
4060   case AArch64ISD::ST1x2post: {
4061     VT = Node->getOperand(1).getValueType();
4062     if (VT == MVT::v8i8) {
4063       SelectPostStore(Node, 2, AArch64::ST1Twov8b_POST);
4064       return;
4065     } else if (VT == MVT::v16i8) {
4066       SelectPostStore(Node, 2, AArch64::ST1Twov16b_POST);
4067       return;
4068     } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
4069       SelectPostStore(Node, 2, AArch64::ST1Twov4h_POST);
4070       return;
4071     } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
4072       SelectPostStore(Node, 2, AArch64::ST1Twov8h_POST);
4073       return;
4074     } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4075       SelectPostStore(Node, 2, AArch64::ST1Twov2s_POST);
4076       return;
4077     } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4078       SelectPostStore(Node, 2, AArch64::ST1Twov4s_POST);
4079       return;
4080     } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4081       SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST);
4082       return;
4083     } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4084       SelectPostStore(Node, 2, AArch64::ST1Twov2d_POST);
4085       return;
4086     }
4087     break;
4088   }
4089   case AArch64ISD::ST1x3post: {
4090     VT = Node->getOperand(1).getValueType();
4091     if (VT == MVT::v8i8) {
4092       SelectPostStore(Node, 3, AArch64::ST1Threev8b_POST);
4093       return;
4094     } else if (VT == MVT::v16i8) {
4095       SelectPostStore(Node, 3, AArch64::ST1Threev16b_POST);
4096       return;
4097     } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
4098       SelectPostStore(Node, 3, AArch64::ST1Threev4h_POST);
4099       return;
4100     } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
4101       SelectPostStore(Node, 3, AArch64::ST1Threev8h_POST);
4102       return;
4103     } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4104       SelectPostStore(Node, 3, AArch64::ST1Threev2s_POST);
4105       return;
4106     } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4107       SelectPostStore(Node, 3, AArch64::ST1Threev4s_POST);
4108       return;
4109     } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4110       SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST);
4111       return;
4112     } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4113       SelectPostStore(Node, 3, AArch64::ST1Threev2d_POST);
4114       return;
4115     }
4116     break;
4117   }
4118   case AArch64ISD::ST1x4post: {
4119     VT = Node->getOperand(1).getValueType();
4120     if (VT == MVT::v8i8) {
4121       SelectPostStore(Node, 4, AArch64::ST1Fourv8b_POST);
4122       return;
4123     } else if (VT == MVT::v16i8) {
4124       SelectPostStore(Node, 4, AArch64::ST1Fourv16b_POST);
4125       return;
4126     } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
4127       SelectPostStore(Node, 4, AArch64::ST1Fourv4h_POST);
4128       return;
4129     } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
4130       SelectPostStore(Node, 4, AArch64::ST1Fourv8h_POST);
4131       return;
4132     } else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
4133       SelectPostStore(Node, 4, AArch64::ST1Fourv2s_POST);
4134       return;
4135     } else if (VT == MVT::v4i32 || VT == MVT::v4f32) {
4136       SelectPostStore(Node, 4, AArch64::ST1Fourv4s_POST);
4137       return;
4138     } else if (VT == MVT::v1i64 || VT == MVT::v1f64) {
4139       SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST);
4140       return;
4141     } else if (VT == MVT::v2i64 || VT == MVT::v2f64) {
4142       SelectPostStore(Node, 4, AArch64::ST1Fourv2d_POST);
4143       return;
4144     }
4145     break;
4146   }
4147   case AArch64ISD::ST2LANEpost: {
4148     VT = Node->getOperand(1).getValueType();
4149     if (VT == MVT::v16i8 || VT == MVT::v8i8) {
4150       SelectPostStoreLane(Node, 2, AArch64::ST2i8_POST);
4151       return;
4152     } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
4153                VT == MVT::v8f16) {
4154       SelectPostStoreLane(Node, 2, AArch64::ST2i16_POST);
4155       return;
4156     } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
4157                VT == MVT::v2f32) {
4158       SelectPostStoreLane(Node, 2, AArch64::ST2i32_POST);
4159       return;
4160     } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
4161                VT == MVT::v1f64) {
4162       SelectPostStoreLane(Node, 2, AArch64::ST2i64_POST);
4163       return;
4164     }
4165     break;
4166   }
4167   case AArch64ISD::ST3LANEpost: {
4168     VT = Node->getOperand(1).getValueType();
4169     if (VT == MVT::v16i8 || VT == MVT::v8i8) {
4170       SelectPostStoreLane(Node, 3, AArch64::ST3i8_POST);
4171       return;
4172     } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
4173                VT == MVT::v8f16) {
4174       SelectPostStoreLane(Node, 3, AArch64::ST3i16_POST);
4175       return;
4176     } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
4177                VT == MVT::v2f32) {
4178       SelectPostStoreLane(Node, 3, AArch64::ST3i32_POST);
4179       return;
4180     } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
4181                VT == MVT::v1f64) {
4182       SelectPostStoreLane(Node, 3, AArch64::ST3i64_POST);
4183       return;
4184     }
4185     break;
4186   }
4187   case AArch64ISD::ST4LANEpost: {
4188     VT = Node->getOperand(1).getValueType();
4189     if (VT == MVT::v16i8 || VT == MVT::v8i8) {
4190       SelectPostStoreLane(Node, 4, AArch64::ST4i8_POST);
4191       return;
4192     } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
4193                VT == MVT::v8f16) {
4194       SelectPostStoreLane(Node, 4, AArch64::ST4i16_POST);
4195       return;
4196     } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
4197                VT == MVT::v2f32) {
4198       SelectPostStoreLane(Node, 4, AArch64::ST4i32_POST);
4199       return;
4200     } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
4201                VT == MVT::v1f64) {
4202       SelectPostStoreLane(Node, 4, AArch64::ST4i64_POST);
4203       return;
4204     }
4205     break;
4206   }
4207   }
4208 
4209   // Select the default instruction
4210   SelectCode(Node);
4211 }
4212 
4213 /// createAArch64ISelDag - This pass converts a legalized DAG into a
4214 /// AArch64-specific DAG, ready for instruction scheduling.
4215 FunctionPass *llvm::createAArch64ISelDag(AArch64TargetMachine &TM,
4216                                          CodeGenOpt::Level OptLevel) {
4217   return new AArch64DAGToDAGISel(TM, OptLevel);
4218 }
4219