xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp (revision a7dea1671b87c07d2d266f836bfa8b58efc7c134)
1 //===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains the AArch64 implementation of the TargetInstrInfo class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "AArch64InstrInfo.h"
14 #include "AArch64MachineFunctionInfo.h"
15 #include "AArch64Subtarget.h"
16 #include "MCTargetDesc/AArch64AddressingModes.h"
17 #include "Utils/AArch64BaseInfo.h"
18 #include "llvm/ADT/ArrayRef.h"
19 #include "llvm/ADT/STLExtras.h"
20 #include "llvm/ADT/SmallVector.h"
21 #include "llvm/CodeGen/MachineBasicBlock.h"
22 #include "llvm/CodeGen/MachineFrameInfo.h"
23 #include "llvm/CodeGen/MachineFunction.h"
24 #include "llvm/CodeGen/MachineInstr.h"
25 #include "llvm/CodeGen/MachineInstrBuilder.h"
26 #include "llvm/CodeGen/MachineMemOperand.h"
27 #include "llvm/CodeGen/MachineOperand.h"
28 #include "llvm/CodeGen/MachineRegisterInfo.h"
29 #include "llvm/CodeGen/MachineModuleInfo.h"
30 #include "llvm/CodeGen/StackMaps.h"
31 #include "llvm/CodeGen/TargetRegisterInfo.h"
32 #include "llvm/CodeGen/TargetSubtargetInfo.h"
33 #include "llvm/IR/DebugLoc.h"
34 #include "llvm/IR/GlobalValue.h"
35 #include "llvm/MC/MCAsmInfo.h"
36 #include "llvm/MC/MCInst.h"
37 #include "llvm/MC/MCInstrDesc.h"
38 #include "llvm/Support/Casting.h"
39 #include "llvm/Support/CodeGen.h"
40 #include "llvm/Support/CommandLine.h"
41 #include "llvm/Support/Compiler.h"
42 #include "llvm/Support/ErrorHandling.h"
43 #include "llvm/Support/MathExtras.h"
44 #include "llvm/Target/TargetMachine.h"
45 #include "llvm/Target/TargetOptions.h"
46 #include <cassert>
47 #include <cstdint>
48 #include <iterator>
49 #include <utility>
50 
51 using namespace llvm;
52 
53 #define GET_INSTRINFO_CTOR_DTOR
54 #include "AArch64GenInstrInfo.inc"
55 
56 static cl::opt<unsigned> TBZDisplacementBits(
57     "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
58     cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
59 
60 static cl::opt<unsigned> CBZDisplacementBits(
61     "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
62     cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));
63 
64 static cl::opt<unsigned>
65     BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
66                         cl::desc("Restrict range of Bcc instructions (DEBUG)"));
67 
68 AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
69     : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP,
70                           AArch64::CATCHRET),
71       RI(STI.getTargetTriple()), Subtarget(STI) {}
72 
73 /// getInstSizeInBytes - Return the number of bytes of code the specified
74 /// instruction may occupy.  This returns the maximum number of bytes.
75 unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
76   const MachineBasicBlock &MBB = *MI.getParent();
77   const MachineFunction *MF = MBB.getParent();
78   const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
79 
80   {
81     auto Op = MI.getOpcode();
82     if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR)
83       return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
84   }
85 
86   // Meta-instructions emit no code.
87   if (MI.isMetaInstruction())
88     return 0;
89 
90   // FIXME: We currently only handle pseudoinstructions that don't get expanded
91   //        before the assembly printer.
92   unsigned NumBytes = 0;
93   const MCInstrDesc &Desc = MI.getDesc();
94   switch (Desc.getOpcode()) {
95   default:
96     // Anything not explicitly designated otherwise is a normal 4-byte insn.
97     NumBytes = 4;
98     break;
99   case TargetOpcode::STACKMAP:
100     // The upper bound for a stackmap intrinsic is the full length of its shadow
101     NumBytes = StackMapOpers(&MI).getNumPatchBytes();
102     assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
103     break;
104   case TargetOpcode::PATCHPOINT:
105     // The size of the patchpoint intrinsic is the number of bytes requested
106     NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
107     assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
108     break;
109   case AArch64::TLSDESC_CALLSEQ:
110     // This gets lowered to an instruction sequence which takes 16 bytes
111     NumBytes = 16;
112     break;
113   case AArch64::JumpTableDest32:
114   case AArch64::JumpTableDest16:
115   case AArch64::JumpTableDest8:
116     NumBytes = 12;
117     break;
118   case AArch64::SPACE:
119     NumBytes = MI.getOperand(1).getImm();
120     break;
121   }
122 
123   return NumBytes;
124 }
125 
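// Informal sketch of the Cond encoding produced by parseCondBranch below, as
// read from its cases (register names are purely illustrative): a Bcc pushes
// only its condition-code operand; CB[N]Z pushes {-1, opcode, reg}; TB[N]Z
// pushes {-1, opcode, reg, bit}. For example, "TBNZW %w0, 3, %bb.1" would be
// recorded as Cond = {-1, TBNZW, %w0, 3} with Target = %bb.1.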
126 static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
127                             SmallVectorImpl<MachineOperand> &Cond) {
128   // Block ends with fall-through condbranch.
129   switch (LastInst->getOpcode()) {
130   default:
131     llvm_unreachable("Unknown branch instruction?");
132   case AArch64::Bcc:
133     Target = LastInst->getOperand(1).getMBB();
134     Cond.push_back(LastInst->getOperand(0));
135     break;
136   case AArch64::CBZW:
137   case AArch64::CBZX:
138   case AArch64::CBNZW:
139   case AArch64::CBNZX:
140     Target = LastInst->getOperand(1).getMBB();
141     Cond.push_back(MachineOperand::CreateImm(-1));
142     Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
143     Cond.push_back(LastInst->getOperand(0));
144     break;
145   case AArch64::TBZW:
146   case AArch64::TBZX:
147   case AArch64::TBNZW:
148   case AArch64::TBNZX:
149     Target = LastInst->getOperand(2).getMBB();
150     Cond.push_back(MachineOperand::CreateImm(-1));
151     Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
152     Cond.push_back(LastInst->getOperand(0));
153     Cond.push_back(LastInst->getOperand(1));
154   }
155 }
156 
157 static unsigned getBranchDisplacementBits(unsigned Opc) {
158   switch (Opc) {
159   default:
160     llvm_unreachable("unexpected opcode!");
161   case AArch64::B:
162     return 64;
163   case AArch64::TBNZW:
164   case AArch64::TBZW:
165   case AArch64::TBNZX:
166   case AArch64::TBZX:
167     return TBZDisplacementBits;
168   case AArch64::CBNZW:
169   case AArch64::CBZW:
170   case AArch64::CBNZX:
171   case AArch64::CBZX:
172     return CBZDisplacementBits;
173   case AArch64::Bcc:
174     return BCCDisplacementBits;
175   }
176 }
177 
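// Rough reach implied by the defaults above (a back-of-the-envelope sketch):
// displacements are encoded in 4-byte units, so 19 bits for Bcc/CB[N]Z gives
// roughly +/-1MiB and 14 bits for TB[N]Z gives roughly +/-32KiB; unconditional
// B is treated as effectively unlimited (64 bits) for this check.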
178 bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp,
179                                              int64_t BrOffset) const {
180   unsigned Bits = getBranchDisplacementBits(BranchOp);
181   assert(Bits >= 3 && "max branch displacement must be enough to jump "
182                       "over conditional branch expansion");
183   return isIntN(Bits, BrOffset / 4);
184 }
185 
186 MachineBasicBlock *
187 AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
188   switch (MI.getOpcode()) {
189   default:
190     llvm_unreachable("unexpected opcode!");
191   case AArch64::B:
192     return MI.getOperand(0).getMBB();
193   case AArch64::TBZW:
194   case AArch64::TBNZW:
195   case AArch64::TBZX:
196   case AArch64::TBNZX:
197     return MI.getOperand(2).getMBB();
198   case AArch64::CBZW:
199   case AArch64::CBNZW:
200   case AArch64::CBZX:
201   case AArch64::CBNZX:
202   case AArch64::Bcc:
203     return MI.getOperand(1).getMBB();
204   }
205 }
206 
207 // Branch analysis.
208 bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
209                                      MachineBasicBlock *&TBB,
210                                      MachineBasicBlock *&FBB,
211                                      SmallVectorImpl<MachineOperand> &Cond,
212                                      bool AllowModify) const {
213   // If the block has no terminators, it just falls into the block after it.
214   MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
215   if (I == MBB.end())
216     return false;
217 
218   if (!isUnpredicatedTerminator(*I))
219     return false;
220 
221   // Get the last instruction in the block.
222   MachineInstr *LastInst = &*I;
223 
224   // If there is only one terminator instruction, process it.
225   unsigned LastOpc = LastInst->getOpcode();
226   if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
227     if (isUncondBranchOpcode(LastOpc)) {
228       TBB = LastInst->getOperand(0).getMBB();
229       return false;
230     }
231     if (isCondBranchOpcode(LastOpc)) {
232       // Block ends with fall-through condbranch.
233       parseCondBranch(LastInst, TBB, Cond);
234       return false;
235     }
236     return true; // Can't handle indirect branch.
237   }
238 
239   // Get the instruction before it if it is a terminator.
240   MachineInstr *SecondLastInst = &*I;
241   unsigned SecondLastOpc = SecondLastInst->getOpcode();
242 
243   // If AllowModify is true and the block ends with two or more unconditional
244   // branches, delete all but the first unconditional branch.
245   if (AllowModify && isUncondBranchOpcode(LastOpc)) {
246     while (isUncondBranchOpcode(SecondLastOpc)) {
247       LastInst->eraseFromParent();
248       LastInst = SecondLastInst;
249       LastOpc = LastInst->getOpcode();
250       if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
251         // Return now that the only terminator is an unconditional branch.
252         TBB = LastInst->getOperand(0).getMBB();
253         return false;
254       } else {
255         SecondLastInst = &*I;
256         SecondLastOpc = SecondLastInst->getOpcode();
257       }
258     }
259   }
260 
261   // If there are three terminators, we don't know what sort of block this is.
262   if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
263     return true;
264 
265   // If the block ends with a B and a Bcc, handle it.
266   if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
267     parseCondBranch(SecondLastInst, TBB, Cond);
268     FBB = LastInst->getOperand(0).getMBB();
269     return false;
270   }
271 
272   // If the block ends with two unconditional branches, handle it.  The second
273   // one is not executed, so remove it.
274   if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
275     TBB = SecondLastInst->getOperand(0).getMBB();
276     I = LastInst;
277     if (AllowModify)
278       I->eraseFromParent();
279     return false;
280   }
281 
282   // ...likewise if it ends with an indirect branch followed by an unconditional
283   // branch.
284   if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
285     I = LastInst;
286     if (AllowModify)
287       I->eraseFromParent();
288     return true;
289   }
290 
291   // Otherwise, can't handle this.
292   return true;
293 }
294 
295 bool AArch64InstrInfo::reverseBranchCondition(
296     SmallVectorImpl<MachineOperand> &Cond) const {
297   if (Cond[0].getImm() != -1) {
298     // Regular Bcc
299     AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
300     Cond[0].setImm(AArch64CC::getInvertedCondCode(CC));
301   } else {
302     // Folded compare-and-branch
303     switch (Cond[1].getImm()) {
304     default:
305       llvm_unreachable("Unknown conditional branch!");
306     case AArch64::CBZW:
307       Cond[1].setImm(AArch64::CBNZW);
308       break;
309     case AArch64::CBNZW:
310       Cond[1].setImm(AArch64::CBZW);
311       break;
312     case AArch64::CBZX:
313       Cond[1].setImm(AArch64::CBNZX);
314       break;
315     case AArch64::CBNZX:
316       Cond[1].setImm(AArch64::CBZX);
317       break;
318     case AArch64::TBZW:
319       Cond[1].setImm(AArch64::TBNZW);
320       break;
321     case AArch64::TBNZW:
322       Cond[1].setImm(AArch64::TBZW);
323       break;
324     case AArch64::TBZX:
325       Cond[1].setImm(AArch64::TBNZX);
326       break;
327     case AArch64::TBNZX:
328       Cond[1].setImm(AArch64::TBZX);
329       break;
330     }
331   }
332 
333   return false;
334 }
335 
336 unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB,
337                                         int *BytesRemoved) const {
338   MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
339   if (I == MBB.end())
340     return 0;
341 
342   if (!isUncondBranchOpcode(I->getOpcode()) &&
343       !isCondBranchOpcode(I->getOpcode()))
344     return 0;
345 
346   // Remove the branch.
347   I->eraseFromParent();
348 
349   I = MBB.end();
350 
351   if (I == MBB.begin()) {
352     if (BytesRemoved)
353       *BytesRemoved = 4;
354     return 1;
355   }
356   --I;
357   if (!isCondBranchOpcode(I->getOpcode())) {
358     if (BytesRemoved)
359       *BytesRemoved = 4;
360     return 1;
361   }
362 
363   // Remove the branch.
364   I->eraseFromParent();
365   if (BytesRemoved)
366     *BytesRemoved = 8;
367 
368   return 2;
369 }
370 
371 void AArch64InstrInfo::instantiateCondBranch(
372     MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB,
373     ArrayRef<MachineOperand> Cond) const {
374   if (Cond[0].getImm() != -1) {
375     // Regular Bcc
376     BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
377   } else {
378     // Folded compare-and-branch
379     // Note that we use addOperand instead of addReg to keep the flags.
380     const MachineInstrBuilder MIB =
381         BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);
382     if (Cond.size() > 3)
383       MIB.addImm(Cond[3].getImm());
384     MIB.addMBB(TBB);
385   }
386 }
387 
388 unsigned AArch64InstrInfo::insertBranch(
389     MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
390     ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
391   // Shouldn't be a fall through.
392   assert(TBB && "insertBranch must not be told to insert a fallthrough");
393 
394   if (!FBB) {
395     if (Cond.empty()) // Unconditional branch?
396       BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
397     else
398       instantiateCondBranch(MBB, DL, TBB, Cond);
399 
400     if (BytesAdded)
401       *BytesAdded = 4;
402 
403     return 1;
404   }
405 
406   // Two-way conditional branch.
407   instantiateCondBranch(MBB, DL, TBB, Cond);
408   BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);
409 
410   if (BytesAdded)
411     *BytesAdded = 8;
412 
413   return 2;
414 }
415 
416 // Find the original register that VReg is copied from.
417 static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
418   while (Register::isVirtualRegister(VReg)) {
419     const MachineInstr *DefMI = MRI.getVRegDef(VReg);
420     if (!DefMI->isFullCopy())
421       return VReg;
422     VReg = DefMI->getOperand(1).getReg();
423   }
424   return VReg;
425 }
426 
427 // Determine if VReg is defined by an instruction that can be folded into a
428 // csel instruction. If so, return the folded opcode, and the replacement
429 // register.
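// Illustrative cases, inferred from the switch below (virtual register names
// are made up): "%v = ADDWri %x, 1, 0" folds to CSINCWr with %x as the new
// source; "%v = ORNWrr %wzr, %x" (i.e. not %x) folds to CSINVWr; and
// "%v = SUBWrr %wzr, %x" (i.e. neg %x) folds to CSNEGWr.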
430 static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
431                                 unsigned *NewVReg = nullptr) {
432   VReg = removeCopies(MRI, VReg);
433   if (!Register::isVirtualRegister(VReg))
434     return 0;
435 
436   bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
437   const MachineInstr *DefMI = MRI.getVRegDef(VReg);
438   unsigned Opc = 0;
439   unsigned SrcOpNum = 0;
440   switch (DefMI->getOpcode()) {
441   case AArch64::ADDSXri:
442   case AArch64::ADDSWri:
443     // if NZCV is used, do not fold.
444     if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
445       return 0;
446     // fall-through to ADDXri and ADDWri.
447     LLVM_FALLTHROUGH;
448   case AArch64::ADDXri:
449   case AArch64::ADDWri:
450     // add x, 1 -> csinc.
451     if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
452         DefMI->getOperand(3).getImm() != 0)
453       return 0;
454     SrcOpNum = 1;
455     Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
456     break;
457 
458   case AArch64::ORNXrr:
459   case AArch64::ORNWrr: {
460     // not x -> csinv, represented as orn dst, xzr, src.
461     unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
462     if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
463       return 0;
464     SrcOpNum = 2;
465     Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
466     break;
467   }
468 
469   case AArch64::SUBSXrr:
470   case AArch64::SUBSWrr:
471     // if NZCV is used, do not fold.
472     if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
473       return 0;
474     // fall-through to SUBXrr and SUBWrr.
475     LLVM_FALLTHROUGH;
476   case AArch64::SUBXrr:
477   case AArch64::SUBWrr: {
478     // neg x -> csneg, represented as sub dst, xzr, src.
479     unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
480     if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
481       return 0;
482     SrcOpNum = 2;
483     Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
484     break;
485   }
486   default:
487     return 0;
488   }
489   assert(Opc && SrcOpNum && "Missing parameters");
490 
491   if (NewVReg)
492     *NewVReg = DefMI->getOperand(SrcOpNum).getReg();
493   return Opc;
494 }
495 
496 bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
497                                        ArrayRef<MachineOperand> Cond,
498                                        unsigned TrueReg, unsigned FalseReg,
499                                        int &CondCycles, int &TrueCycles,
500                                        int &FalseCycles) const {
501   // Check register classes.
502   const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
503   const TargetRegisterClass *RC =
504       RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
505   if (!RC)
506     return false;
507 
508   // Expanding cbz/tbz requires an extra cycle of latency on the condition.
509   unsigned ExtraCondLat = Cond.size() != 1;
510 
511   // GPRs are handled by csel.
512   // FIXME: Fold in x+1, -x, and ~x when applicable.
513   if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
514       AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
515     // Single-cycle csel, csinc, csinv, and csneg.
516     CondCycles = 1 + ExtraCondLat;
517     TrueCycles = FalseCycles = 1;
518     if (canFoldIntoCSel(MRI, TrueReg))
519       TrueCycles = 0;
520     else if (canFoldIntoCSel(MRI, FalseReg))
521       FalseCycles = 0;
522     return true;
523   }
524 
525   // Scalar floating point is handled by fcsel.
526   // FIXME: Form fabs, fmin, and fmax when applicable.
527   if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
528       AArch64::FPR32RegClass.hasSubClassEq(RC)) {
529     CondCycles = 5 + ExtraCondLat;
530     TrueCycles = FalseCycles = 2;
531     return true;
532   }
533 
534   // Can't do vectors.
535   return false;
536 }
537 
538 void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
539                                     MachineBasicBlock::iterator I,
540                                     const DebugLoc &DL, unsigned DstReg,
541                                     ArrayRef<MachineOperand> Cond,
542                                     unsigned TrueReg, unsigned FalseReg) const {
543   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
544 
545   // Parse the condition code, see parseCondBranch() above.
546   AArch64CC::CondCode CC;
547   switch (Cond.size()) {
548   default:
549     llvm_unreachable("Unknown condition opcode in Cond");
550   case 1: // b.cc
551     CC = AArch64CC::CondCode(Cond[0].getImm());
552     break;
553   case 3: { // cbz/cbnz
554     // We must insert a compare against 0.
555     bool Is64Bit;
556     switch (Cond[1].getImm()) {
557     default:
558       llvm_unreachable("Unknown branch opcode in Cond");
559     case AArch64::CBZW:
560       Is64Bit = false;
561       CC = AArch64CC::EQ;
562       break;
563     case AArch64::CBZX:
564       Is64Bit = true;
565       CC = AArch64CC::EQ;
566       break;
567     case AArch64::CBNZW:
568       Is64Bit = false;
569       CC = AArch64CC::NE;
570       break;
571     case AArch64::CBNZX:
572       Is64Bit = true;
573       CC = AArch64CC::NE;
574       break;
575     }
576     Register SrcReg = Cond[2].getReg();
577     if (Is64Bit) {
578       // cmp reg, #0 is actually subs xzr, reg, #0.
579       MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
580       BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
581           .addReg(SrcReg)
582           .addImm(0)
583           .addImm(0);
584     } else {
585       MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
586       BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
587           .addReg(SrcReg)
588           .addImm(0)
589           .addImm(0);
590     }
591     break;
592   }
593   case 4: { // tbz/tbnz
594     // We must insert a tst instruction.
595     switch (Cond[1].getImm()) {
596     default:
597       llvm_unreachable("Unknown branch opcode in Cond");
598     case AArch64::TBZW:
599     case AArch64::TBZX:
600       CC = AArch64CC::EQ;
601       break;
602     case AArch64::TBNZW:
603     case AArch64::TBNZX:
604       CC = AArch64CC::NE;
605       break;
606     }
607     // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
608     if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
609       BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
610           .addReg(Cond[2].getReg())
611           .addImm(
612               AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
613     else
614       BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
615           .addReg(Cond[2].getReg())
616           .addImm(
617               AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
618     break;
619   }
620   }
621 
622   unsigned Opc = 0;
623   const TargetRegisterClass *RC = nullptr;
624   bool TryFold = false;
625   if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
626     RC = &AArch64::GPR64RegClass;
627     Opc = AArch64::CSELXr;
628     TryFold = true;
629   } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
630     RC = &AArch64::GPR32RegClass;
631     Opc = AArch64::CSELWr;
632     TryFold = true;
633   } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
634     RC = &AArch64::FPR64RegClass;
635     Opc = AArch64::FCSELDrrr;
636   } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
637     RC = &AArch64::FPR32RegClass;
638     Opc = AArch64::FCSELSrrr;
639   }
640   assert(RC && "Unsupported regclass");
641 
642   // Try folding simple instructions into the csel.
643   if (TryFold) {
644     unsigned NewVReg = 0;
645     unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
646     if (FoldedOpc) {
647       // The folded opcodes csinc, csinv and csneg apply the operation to
648       // FalseReg, so we need to invert the condition.
649       CC = AArch64CC::getInvertedCondCode(CC);
650       TrueReg = FalseReg;
651     } else
652       FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);
653 
654     // Fold the operation. Leave any dead instructions for DCE to clean up.
655     if (FoldedOpc) {
656       FalseReg = NewVReg;
657       Opc = FoldedOpc;
658       // This extends the live range of NewVReg.
659       MRI.clearKillFlags(NewVReg);
660     }
661   }
662 
663   // Pull all virtual registers into the appropriate class.
664   MRI.constrainRegClass(TrueReg, RC);
665   MRI.constrainRegClass(FalseReg, RC);
666 
667   // Insert the csel.
668   BuildMI(MBB, I, DL, get(Opc), DstReg)
669       .addReg(TrueReg)
670       .addReg(FalseReg)
671       .addImm(CC);
672 }
673 
674 /// Returns true if a MOVi32imm or MOVi64imm can be expanded to an ORRxx.
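/// For example (illustrative only): MOVi32imm #0xffff can be expanded because
/// 0xffff is a valid 32-bit logical immediate, whereas MOVi32imm #0x12345678
/// cannot, since that value is not encodable as a logical immediate.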
675 static bool canBeExpandedToORR(const MachineInstr &MI, unsigned BitSize) {
676   uint64_t Imm = MI.getOperand(1).getImm();
677   uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
678   uint64_t Encoding;
679   return AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding);
680 }
681 
682 // FIXME: this implementation should be micro-architecture dependent, so a
683 // micro-architecture target hook should be introduced here in future.
684 bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
685   if (!Subtarget.hasCustomCheapAsMoveHandling())
686     return MI.isAsCheapAsAMove();
687 
688   const unsigned Opcode = MI.getOpcode();
689 
690   // Firstly, check cases gated by features.
691 
692   if (Subtarget.hasZeroCycleZeroingFP()) {
693     if (Opcode == AArch64::FMOVH0 ||
694         Opcode == AArch64::FMOVS0 ||
695         Opcode == AArch64::FMOVD0)
696       return true;
697   }
698 
699   if (Subtarget.hasZeroCycleZeroingGP()) {
700     if (Opcode == TargetOpcode::COPY &&
701         (MI.getOperand(1).getReg() == AArch64::WZR ||
702          MI.getOperand(1).getReg() == AArch64::XZR))
703       return true;
704   }
705 
706   // Secondly, check cases specific to sub-targets.
707 
708   if (Subtarget.hasExynosCheapAsMoveHandling()) {
709     if (isExynosCheapAsMove(MI))
710       return true;
711 
712     return MI.isAsCheapAsAMove();
713   }
714 
715   // Finally, check generic cases.
716 
717   switch (Opcode) {
718   default:
719     return false;
720 
721   // add/sub on register without shift
722   case AArch64::ADDWri:
723   case AArch64::ADDXri:
724   case AArch64::SUBWri:
725   case AArch64::SUBXri:
726     return (MI.getOperand(3).getImm() == 0);
727 
728   // logical ops on immediate
729   case AArch64::ANDWri:
730   case AArch64::ANDXri:
731   case AArch64::EORWri:
732   case AArch64::EORXri:
733   case AArch64::ORRWri:
734   case AArch64::ORRXri:
735     return true;
736 
737   // logical ops on register without shift
738   case AArch64::ANDWrr:
739   case AArch64::ANDXrr:
740   case AArch64::BICWrr:
741   case AArch64::BICXrr:
742   case AArch64::EONWrr:
743   case AArch64::EONXrr:
744   case AArch64::EORWrr:
745   case AArch64::EORXrr:
746   case AArch64::ORNWrr:
747   case AArch64::ORNXrr:
748   case AArch64::ORRWrr:
749   case AArch64::ORRXrr:
750     return true;
751 
752   // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
753   // ORRXri, it is as cheap as MOV
754   case AArch64::MOVi32imm:
755     return canBeExpandedToORR(MI, 32);
756   case AArch64::MOVi64imm:
757     return canBeExpandedToORR(MI, 64);
758   }
759 
760   llvm_unreachable("Unknown opcode to check as cheap as a move!");
761 }
762 
763 bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
764   switch (MI.getOpcode()) {
765   default:
766     return false;
767 
768   case AArch64::ADDWrs:
769   case AArch64::ADDXrs:
770   case AArch64::ADDSWrs:
771   case AArch64::ADDSXrs: {
772     unsigned Imm = MI.getOperand(3).getImm();
773     unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
774     if (ShiftVal == 0)
775       return true;
776     return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
777   }
778 
779   case AArch64::ADDWrx:
780   case AArch64::ADDXrx:
781   case AArch64::ADDXrx64:
782   case AArch64::ADDSWrx:
783   case AArch64::ADDSXrx:
784   case AArch64::ADDSXrx64: {
785     unsigned Imm = MI.getOperand(3).getImm();
786     switch (AArch64_AM::getArithExtendType(Imm)) {
787     default:
788       return false;
789     case AArch64_AM::UXTB:
790     case AArch64_AM::UXTH:
791     case AArch64_AM::UXTW:
792     case AArch64_AM::UXTX:
793       return AArch64_AM::getArithShiftValue(Imm) <= 4;
794     }
795   }
796 
797   case AArch64::SUBWrs:
798   case AArch64::SUBSWrs: {
799     unsigned Imm = MI.getOperand(3).getImm();
800     unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
801     return ShiftVal == 0 ||
802            (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
803   }
804 
805   case AArch64::SUBXrs:
806   case AArch64::SUBSXrs: {
807     unsigned Imm = MI.getOperand(3).getImm();
808     unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
809     return ShiftVal == 0 ||
810            (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
811   }
812 
813   case AArch64::SUBWrx:
814   case AArch64::SUBXrx:
815   case AArch64::SUBXrx64:
816   case AArch64::SUBSWrx:
817   case AArch64::SUBSXrx:
818   case AArch64::SUBSXrx64: {
819     unsigned Imm = MI.getOperand(3).getImm();
820     switch (AArch64_AM::getArithExtendType(Imm)) {
821     default:
822       return false;
823     case AArch64_AM::UXTB:
824     case AArch64_AM::UXTH:
825     case AArch64_AM::UXTW:
826     case AArch64_AM::UXTX:
827       return AArch64_AM::getArithShiftValue(Imm) == 0;
828     }
829   }
830 
831   case AArch64::LDRBBroW:
832   case AArch64::LDRBBroX:
833   case AArch64::LDRBroW:
834   case AArch64::LDRBroX:
835   case AArch64::LDRDroW:
836   case AArch64::LDRDroX:
837   case AArch64::LDRHHroW:
838   case AArch64::LDRHHroX:
839   case AArch64::LDRHroW:
840   case AArch64::LDRHroX:
841   case AArch64::LDRQroW:
842   case AArch64::LDRQroX:
843   case AArch64::LDRSBWroW:
844   case AArch64::LDRSBWroX:
845   case AArch64::LDRSBXroW:
846   case AArch64::LDRSBXroX:
847   case AArch64::LDRSHWroW:
848   case AArch64::LDRSHWroX:
849   case AArch64::LDRSHXroW:
850   case AArch64::LDRSHXroX:
851   case AArch64::LDRSWroW:
852   case AArch64::LDRSWroX:
853   case AArch64::LDRSroW:
854   case AArch64::LDRSroX:
855   case AArch64::LDRWroW:
856   case AArch64::LDRWroX:
857   case AArch64::LDRXroW:
858   case AArch64::LDRXroX:
859   case AArch64::PRFMroW:
860   case AArch64::PRFMroX:
861   case AArch64::STRBBroW:
862   case AArch64::STRBBroX:
863   case AArch64::STRBroW:
864   case AArch64::STRBroX:
865   case AArch64::STRDroW:
866   case AArch64::STRDroX:
867   case AArch64::STRHHroW:
868   case AArch64::STRHHroX:
869   case AArch64::STRHroW:
870   case AArch64::STRHroX:
871   case AArch64::STRQroW:
872   case AArch64::STRQroX:
873   case AArch64::STRSroW:
874   case AArch64::STRSroX:
875   case AArch64::STRWroW:
876   case AArch64::STRWroX:
877   case AArch64::STRXroW:
878   case AArch64::STRXroX: {
879     unsigned IsSigned = MI.getOperand(3).getImm();
880     return !IsSigned;
881   }
882   }
883 }
884 
885 bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
886   unsigned Opc = MI.getOpcode();
887   switch (Opc) {
888     default:
889       return false;
890     case AArch64::SEH_StackAlloc:
891     case AArch64::SEH_SaveFPLR:
892     case AArch64::SEH_SaveFPLR_X:
893     case AArch64::SEH_SaveReg:
894     case AArch64::SEH_SaveReg_X:
895     case AArch64::SEH_SaveRegP:
896     case AArch64::SEH_SaveRegP_X:
897     case AArch64::SEH_SaveFReg:
898     case AArch64::SEH_SaveFReg_X:
899     case AArch64::SEH_SaveFRegP:
900     case AArch64::SEH_SaveFRegP_X:
901     case AArch64::SEH_SetFP:
902     case AArch64::SEH_AddFP:
903     case AArch64::SEH_Nop:
904     case AArch64::SEH_PrologEnd:
905     case AArch64::SEH_EpilogStart:
906     case AArch64::SEH_EpilogEnd:
907       return true;
908   }
909 }
910 
911 bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
912                                              unsigned &SrcReg, unsigned &DstReg,
913                                              unsigned &SubIdx) const {
914   switch (MI.getOpcode()) {
915   default:
916     return false;
917   case AArch64::SBFMXri: // aka sxtw
918   case AArch64::UBFMXri: // aka uxtw
919     // Check for the 32 -> 64 bit extension case; these instructions can do
920     // much more.
921     if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
922       return false;
923     // This is a signed or unsigned 32 -> 64 bit extension.
924     SrcReg = MI.getOperand(1).getReg();
925     DstReg = MI.getOperand(0).getReg();
926     SubIdx = AArch64::sub_32;
927     return true;
928   }
929 }
930 
931 bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
932     const MachineInstr &MIa, const MachineInstr &MIb) const {
933   const TargetRegisterInfo *TRI = &getRegisterInfo();
934   const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
935   int64_t OffsetA = 0, OffsetB = 0;
936   unsigned WidthA = 0, WidthB = 0;
937 
938   assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
939   assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
940 
941   if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
942       MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
943     return false;
944 
945   // Retrieve the base register, the offset from the base, and the width. Width
946   // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If the
947   // bases are identical, and the offset of the lower memory access plus its
948   // width does not overlap the offset of the higher memory access, then the
949   // memory accesses are disjoint.
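  // For instance, two 8-byte stores off the same base at offsets 0 and 8 are
  // trivially disjoint (0 + 8 <= 8), while offsets 0 and 4 are not.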
950   if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, WidthA, TRI) &&
951       getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, WidthB, TRI)) {
952     if (BaseOpA->isIdenticalTo(*BaseOpB)) {
953       int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
954       int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
955       int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
956       if (LowOffset + LowWidth <= HighOffset)
957         return true;
958     }
959   }
960   return false;
961 }
962 
963 bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
964                                             const MachineBasicBlock *MBB,
965                                             const MachineFunction &MF) const {
966   if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
967     return true;
968   switch (MI.getOpcode()) {
969   case AArch64::HINT:
970     // CSDB hints are scheduling barriers.
971     if (MI.getOperand(0).getImm() == 0x14)
972       return true;
973     break;
974   case AArch64::DSB:
975   case AArch64::ISB:
976     // DSB and ISB also are scheduling barriers.
977     return true;
978   default:;
979   }
980   return isSEHInstruction(MI);
981 }
982 
983 /// analyzeCompare - For a comparison instruction, return the source registers
984 /// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
985 /// Return true if the comparison instruction can be analyzed.
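/// For example (sketch, operand names invented): for "%1 = SUBSWri %0, 42, 0"
/// this reports SrcReg = %0, SrcReg2 = 0, CmpMask = ~0 and, due to the FIXME
/// below, CmpValue = 1 (any non-zero immediate is collapsed to 1).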
986 bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
987                                       unsigned &SrcReg2, int &CmpMask,
988                                       int &CmpValue) const {
989   // The first operand can be a frame index where we'd normally expect a
990   // register.
991   assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
992   if (!MI.getOperand(1).isReg())
993     return false;
994 
995   switch (MI.getOpcode()) {
996   default:
997     break;
998   case AArch64::SUBSWrr:
999   case AArch64::SUBSWrs:
1000   case AArch64::SUBSWrx:
1001   case AArch64::SUBSXrr:
1002   case AArch64::SUBSXrs:
1003   case AArch64::SUBSXrx:
1004   case AArch64::ADDSWrr:
1005   case AArch64::ADDSWrs:
1006   case AArch64::ADDSWrx:
1007   case AArch64::ADDSXrr:
1008   case AArch64::ADDSXrs:
1009   case AArch64::ADDSXrx:
1010     // Replace SUBSWrr with SUBWrr if NZCV is not used.
1011     SrcReg = MI.getOperand(1).getReg();
1012     SrcReg2 = MI.getOperand(2).getReg();
1013     CmpMask = ~0;
1014     CmpValue = 0;
1015     return true;
1016   case AArch64::SUBSWri:
1017   case AArch64::ADDSWri:
1018   case AArch64::SUBSXri:
1019   case AArch64::ADDSXri:
1020     SrcReg = MI.getOperand(1).getReg();
1021     SrcReg2 = 0;
1022     CmpMask = ~0;
1023     // FIXME: In order to convert CmpValue to 0 or 1
1024     CmpValue = MI.getOperand(2).getImm() != 0;
1025     return true;
1026   case AArch64::ANDSWri:
1027   case AArch64::ANDSXri:
1028     // ANDS does not use the same encoding scheme as the other xxxS
1029     // instructions.
1030     SrcReg = MI.getOperand(1).getReg();
1031     SrcReg2 = 0;
1032     CmpMask = ~0;
1033     // FIXME: The return value type of decodeLogicalImmediate is uint64_t,
1034     // while the type of CmpValue is int. When converting uint64_t to int,
1035     // the high 32 bits of the uint64_t will be lost.
1036     // In fact it causes a bug in spec2006-483.xalancbmk.
1037     // CmpValue is only used to compare with zero in optimizeCompareInstr.
1038     CmpValue = AArch64_AM::decodeLogicalImmediate(
1039                    MI.getOperand(2).getImm(),
1040                    MI.getOpcode() == AArch64::ANDSWri ? 32 : 64) != 0;
1041     return true;
1042   }
1043 
1044   return false;
1045 }
1046 
1047 static bool UpdateOperandRegClass(MachineInstr &Instr) {
1048   MachineBasicBlock *MBB = Instr.getParent();
1049   assert(MBB && "Can't get MachineBasicBlock here");
1050   MachineFunction *MF = MBB->getParent();
1051   assert(MF && "Can't get MachineFunction here");
1052   const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
1053   const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
1054   MachineRegisterInfo *MRI = &MF->getRegInfo();
1055 
1056   for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
1057        ++OpIdx) {
1058     MachineOperand &MO = Instr.getOperand(OpIdx);
1059     const TargetRegisterClass *OpRegCstraints =
1060         Instr.getRegClassConstraint(OpIdx, TII, TRI);
1061 
1062     // If there's no constraint, there's nothing to do.
1063     if (!OpRegCstraints)
1064       continue;
1065     // If the operand is a frame index, there's nothing to do here.
1066     // A frame index operand will resolve correctly during PEI.
1067     if (MO.isFI())
1068       continue;
1069 
1070     assert(MO.isReg() &&
1071            "Operand has register constraints without being a register!");
1072 
1073     Register Reg = MO.getReg();
1074     if (Register::isPhysicalRegister(Reg)) {
1075       if (!OpRegCstraints->contains(Reg))
1076         return false;
1077     } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
1078                !MRI->constrainRegClass(Reg, OpRegCstraints))
1079       return false;
1080   }
1081 
1082   return true;
1083 }
1084 
1085 /// Return the opcode that does not set flags when possible - otherwise
1086 /// return the original opcode. The caller is responsible for doing the actual
1087 /// substitution and legality checking.
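/// For example, per the table below, SUBSWri normally maps to SUBWri, but it
/// is left unchanged when the instruction defines WZR/XZR, because in the
/// non-flag-setting encoding register 31 means SP rather than the zero
/// register (see MIDefinesZeroReg below).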
1088 static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) {
1089   // Don't convert all compare instructions, because for some the zero register
1090   // encoding becomes the sp register.
1091   bool MIDefinesZeroReg = false;
1092   if (MI.definesRegister(AArch64::WZR) || MI.definesRegister(AArch64::XZR))
1093     MIDefinesZeroReg = true;
1094 
1095   switch (MI.getOpcode()) {
1096   default:
1097     return MI.getOpcode();
1098   case AArch64::ADDSWrr:
1099     return AArch64::ADDWrr;
1100   case AArch64::ADDSWri:
1101     return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
1102   case AArch64::ADDSWrs:
1103     return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
1104   case AArch64::ADDSWrx:
1105     return AArch64::ADDWrx;
1106   case AArch64::ADDSXrr:
1107     return AArch64::ADDXrr;
1108   case AArch64::ADDSXri:
1109     return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
1110   case AArch64::ADDSXrs:
1111     return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
1112   case AArch64::ADDSXrx:
1113     return AArch64::ADDXrx;
1114   case AArch64::SUBSWrr:
1115     return AArch64::SUBWrr;
1116   case AArch64::SUBSWri:
1117     return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
1118   case AArch64::SUBSWrs:
1119     return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
1120   case AArch64::SUBSWrx:
1121     return AArch64::SUBWrx;
1122   case AArch64::SUBSXrr:
1123     return AArch64::SUBXrr;
1124   case AArch64::SUBSXri:
1125     return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
1126   case AArch64::SUBSXrs:
1127     return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
1128   case AArch64::SUBSXrx:
1129     return AArch64::SUBXrx;
1130   }
1131 }
1132 
1133 enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };
1134 
1135 /// True when condition flags are accessed (either by writing or reading)
1136 /// on the instruction trace starting at From and ending at To.
1137 ///
1138 /// Note: If From and To are from different blocks it's assumed the condition
1139 ///       flags are accessed on the path.
1140 static bool areCFlagsAccessedBetweenInstrs(
1141     MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
1142     const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
1143   // Early exit if To is at the beginning of the BB.
1144   if (To == To->getParent()->begin())
1145     return true;
1146 
1147   // Check whether the instructions are in the same basic block
1148   // If not, assume the condition flags might get modified somewhere.
1149   if (To->getParent() != From->getParent())
1150     return true;
1151 
1152   // From must be above To.
1153   assert(std::find_if(++To.getReverse(), To->getParent()->rend(),
1154                       [From](MachineInstr &MI) {
1155                         return MI.getIterator() == From;
1156                       }) != To->getParent()->rend());
1157 
1158   // We iterate backward starting \p To until we hit \p From.
1159   for (--To; To != From; --To) {
1160     const MachineInstr &Instr = *To;
1161 
1162     if (((AccessToCheck & AK_Write) &&
1163          Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
1164         ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
1165       return true;
1166   }
1167   return false;
1168 }
1169 
1170 /// Try to optimize a compare instruction. A compare instruction is an
1171 /// instruction which produces AArch64::NZCV. It is truly a compare
1172 /// instruction when there are no uses of
1173 /// its destination register.
1174 ///
1175 /// The following steps are tried in order:
1176 /// 1. Convert CmpInstr into an unconditional version.
1177 /// 2. Remove CmpInstr if above there is an instruction producing a needed
1178 ///    condition code or an instruction which can be converted into such an
1179 ///    instruction.
1180 ///    Only comparison with zero is supported.
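/// A hand-written sketch of step 2 (virtual register numbers invented):
///   %3 = SUBWrr %1, %2
///   %4 = SUBSWri %3, 0, 0      ; compare %3 against zero, %4 otherwise unused
///   Bcc <ne>, %bb.exit
/// can become, via substituteCmpToZero:
///   %3 = SUBSWrr %1, %2        ; the producer now sets NZCV itself
///   Bcc <ne>, %bb.exit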
1181 bool AArch64InstrInfo::optimizeCompareInstr(
1182     MachineInstr &CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask,
1183     int CmpValue, const MachineRegisterInfo *MRI) const {
1184   assert(CmpInstr.getParent());
1185   assert(MRI);
1186 
1187   // Replace SUBSWrr with SUBWrr if NZCV is not used.
1188   int DeadNZCVIdx = CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, true);
1189   if (DeadNZCVIdx != -1) {
1190     if (CmpInstr.definesRegister(AArch64::WZR) ||
1191         CmpInstr.definesRegister(AArch64::XZR)) {
1192       CmpInstr.eraseFromParent();
1193       return true;
1194     }
1195     unsigned Opc = CmpInstr.getOpcode();
1196     unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
1197     if (NewOpc == Opc)
1198       return false;
1199     const MCInstrDesc &MCID = get(NewOpc);
1200     CmpInstr.setDesc(MCID);
1201     CmpInstr.RemoveOperand(DeadNZCVIdx);
1202     bool succeeded = UpdateOperandRegClass(CmpInstr);
1203     (void)succeeded;
1204     assert(succeeded && "Some operands reg class are incompatible!");
1205     return true;
1206   }
1207 
1208   // Continue only if we have an "ri" form where the immediate is zero.
1209   // FIXME: CmpValue has already been converted to 0 or 1 in the
1210   // analyzeCompare function.
1211   assert((CmpValue == 0 || CmpValue == 1) && "CmpValue must be 0 or 1!");
1212   if (CmpValue != 0 || SrcReg2 != 0)
1213     return false;
1214 
1215   // CmpInstr is a Compare instruction if destination register is not used.
1216   if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
1217     return false;
1218 
1219   return substituteCmpToZero(CmpInstr, SrcReg, MRI);
1220 }
1221 
1222 /// Get opcode of S version of Instr.
1223 /// If Instr is already the S version, its opcode is returned.
1224 /// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have an S
1225 /// version or we are not interested in it.
1226 static unsigned sForm(MachineInstr &Instr) {
1227   switch (Instr.getOpcode()) {
1228   default:
1229     return AArch64::INSTRUCTION_LIST_END;
1230 
1231   case AArch64::ADDSWrr:
1232   case AArch64::ADDSWri:
1233   case AArch64::ADDSXrr:
1234   case AArch64::ADDSXri:
1235   case AArch64::SUBSWrr:
1236   case AArch64::SUBSWri:
1237   case AArch64::SUBSXrr:
1238   case AArch64::SUBSXri:
1239     return Instr.getOpcode();
1240 
1241   case AArch64::ADDWrr:
1242     return AArch64::ADDSWrr;
1243   case AArch64::ADDWri:
1244     return AArch64::ADDSWri;
1245   case AArch64::ADDXrr:
1246     return AArch64::ADDSXrr;
1247   case AArch64::ADDXri:
1248     return AArch64::ADDSXri;
1249   case AArch64::ADCWr:
1250     return AArch64::ADCSWr;
1251   case AArch64::ADCXr:
1252     return AArch64::ADCSXr;
1253   case AArch64::SUBWrr:
1254     return AArch64::SUBSWrr;
1255   case AArch64::SUBWri:
1256     return AArch64::SUBSWri;
1257   case AArch64::SUBXrr:
1258     return AArch64::SUBSXrr;
1259   case AArch64::SUBXri:
1260     return AArch64::SUBSXri;
1261   case AArch64::SBCWr:
1262     return AArch64::SBCSWr;
1263   case AArch64::SBCXr:
1264     return AArch64::SBCSXr;
1265   case AArch64::ANDWri:
1266     return AArch64::ANDSWri;
1267   case AArch64::ANDXri:
1268     return AArch64::ANDSXri;
1269   }
1270 }
1271 
1272 /// Check if AArch64::NZCV should be alive in successors of MBB.
1273 static bool areCFlagsAliveInSuccessors(MachineBasicBlock *MBB) {
1274   for (auto *BB : MBB->successors())
1275     if (BB->isLiveIn(AArch64::NZCV))
1276       return true;
1277   return false;
1278 }
1279 
1280 namespace {
1281 
1282 struct UsedNZCV {
1283   bool N = false;
1284   bool Z = false;
1285   bool C = false;
1286   bool V = false;
1287 
1288   UsedNZCV() = default;
1289 
1290   UsedNZCV &operator|=(const UsedNZCV &UsedFlags) {
1291     this->N |= UsedFlags.N;
1292     this->Z |= UsedFlags.Z;
1293     this->C |= UsedFlags.C;
1294     this->V |= UsedFlags.V;
1295     return *this;
1296   }
1297 };
1298 
1299 } // end anonymous namespace
1300 
1301 /// Find a condition code used by the instruction.
1302 /// Returns AArch64CC::Invalid if either the instruction does not use condition
1303 /// codes or we don't optimize CmpInstr in the presence of such instructions.
1304 static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
1305   switch (Instr.getOpcode()) {
1306   default:
1307     return AArch64CC::Invalid;
1308 
1309   case AArch64::Bcc: {
1310     int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
1311     assert(Idx >= 2);
1312     return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 2).getImm());
1313   }
1314 
1315   case AArch64::CSINVWr:
1316   case AArch64::CSINVXr:
1317   case AArch64::CSINCWr:
1318   case AArch64::CSINCXr:
1319   case AArch64::CSELWr:
1320   case AArch64::CSELXr:
1321   case AArch64::CSNEGWr:
1322   case AArch64::CSNEGXr:
1323   case AArch64::FCSELSrrr:
1324   case AArch64::FCSELDrrr: {
1325     int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
1326     assert(Idx >= 1);
1327     return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 1).getImm());
1328   }
1329   }
1330 }
1331 
1332 static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
1333   assert(CC != AArch64CC::Invalid);
1334   UsedNZCV UsedFlags;
1335   switch (CC) {
1336   default:
1337     break;
1338 
1339   case AArch64CC::EQ: // Z set
1340   case AArch64CC::NE: // Z clear
1341     UsedFlags.Z = true;
1342     break;
1343 
1344   case AArch64CC::HI: // Z clear and C set
1345   case AArch64CC::LS: // Z set   or  C clear
1346     UsedFlags.Z = true;
1347     LLVM_FALLTHROUGH;
1348   case AArch64CC::HS: // C set
1349   case AArch64CC::LO: // C clear
1350     UsedFlags.C = true;
1351     break;
1352 
1353   case AArch64CC::MI: // N set
1354   case AArch64CC::PL: // N clear
1355     UsedFlags.N = true;
1356     break;
1357 
1358   case AArch64CC::VS: // V set
1359   case AArch64CC::VC: // V clear
1360     UsedFlags.V = true;
1361     break;
1362 
1363   case AArch64CC::GT: // Z clear, N and V the same
1364   case AArch64CC::LE: // Z set,   N and V differ
1365     UsedFlags.Z = true;
1366     LLVM_FALLTHROUGH;
1367   case AArch64CC::GE: // N and V the same
1368   case AArch64CC::LT: // N and V differ
1369     UsedFlags.N = true;
1370     UsedFlags.V = true;
1371     break;
1372   }
1373   return UsedFlags;
1374 }
1375 
1376 static bool isADDSRegImm(unsigned Opcode) {
1377   return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
1378 }
1379 
1380 static bool isSUBSRegImm(unsigned Opcode) {
1381   return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
1382 }
1383 
1384 /// Check if CmpInstr can be substituted by MI.
1385 ///
1386 /// CmpInstr can be substituted:
1387 /// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
1388 /// - and, MI and CmpInstr are from the same MachineBB
1389 /// - and, condition flags are not alive in successors of the CmpInstr parent
1390 /// - and, if MI opcode is the S form there must be no defs of flags between
1391 ///        MI and CmpInstr
1392 ///        or if MI opcode is not the S form there must be neither defs of flags
1393 ///        nor uses of flags between MI and CmpInstr.
1394 /// - and, C/V flags are not used after CmpInstr (see the note below)
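/// (The C/V restriction exists because only the N and Z flags produced by MI
/// are guaranteed to match what the compare against zero would have set; this
/// is an informal rationale, not part of the original comment.)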
1395 static bool canInstrSubstituteCmpInstr(MachineInstr *MI, MachineInstr *CmpInstr,
1396                                        const TargetRegisterInfo *TRI) {
1397   assert(MI);
1398   assert(sForm(*MI) != AArch64::INSTRUCTION_LIST_END);
1399   assert(CmpInstr);
1400 
1401   const unsigned CmpOpcode = CmpInstr->getOpcode();
1402   if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
1403     return false;
1404 
1405   if (MI->getParent() != CmpInstr->getParent())
1406     return false;
1407 
1408   if (areCFlagsAliveInSuccessors(CmpInstr->getParent()))
1409     return false;
1410 
1411   AccessKind AccessToCheck = AK_Write;
1412   if (sForm(*MI) != MI->getOpcode())
1413     AccessToCheck = AK_All;
1414   if (areCFlagsAccessedBetweenInstrs(MI, CmpInstr, TRI, AccessToCheck))
1415     return false;
1416 
1417   UsedNZCV NZCVUsedAfterCmp;
1418   for (auto I = std::next(CmpInstr->getIterator()),
1419             E = CmpInstr->getParent()->instr_end();
1420        I != E; ++I) {
1421     const MachineInstr &Instr = *I;
1422     if (Instr.readsRegister(AArch64::NZCV, TRI)) {
1423       AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
1424       if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
1425         return false;
1426       NZCVUsedAfterCmp |= getUsedNZCV(CC);
1427     }
1428 
1429     if (Instr.modifiesRegister(AArch64::NZCV, TRI))
1430       break;
1431   }
1432 
1433   return !NZCVUsedAfterCmp.C && !NZCVUsedAfterCmp.V;
1434 }
1435 
1436 /// Substitute an instruction comparing to zero with another instruction
1437 /// which produces needed condition flags.
1438 ///
1439 /// Return true on success.
1440 bool AArch64InstrInfo::substituteCmpToZero(
1441     MachineInstr &CmpInstr, unsigned SrcReg,
1442     const MachineRegisterInfo *MRI) const {
1443   assert(MRI);
1444   // Get the unique definition of SrcReg.
1445   MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
1446   if (!MI)
1447     return false;
1448 
1449   const TargetRegisterInfo *TRI = &getRegisterInfo();
1450 
1451   unsigned NewOpc = sForm(*MI);
1452   if (NewOpc == AArch64::INSTRUCTION_LIST_END)
1453     return false;
1454 
1455   if (!canInstrSubstituteCmpInstr(MI, &CmpInstr, TRI))
1456     return false;
1457 
1458   // Update the instruction to set NZCV.
1459   MI->setDesc(get(NewOpc));
1460   CmpInstr.eraseFromParent();
1461   bool succeeded = UpdateOperandRegClass(*MI);
1462   (void)succeeded;
1463   assert(succeeded && "Some operands reg class are incompatible!");
1464   MI->addRegisterDefined(AArch64::NZCV, TRI);
1465   return true;
1466 }
1467 
1468 bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
1469   if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
1470       MI.getOpcode() != AArch64::CATCHRET)
1471     return false;
1472 
1473   MachineBasicBlock &MBB = *MI.getParent();
1474   auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>();
1475   auto TRI = Subtarget.getRegisterInfo();
1476   DebugLoc DL = MI.getDebugLoc();
1477 
1478   if (MI.getOpcode() == AArch64::CATCHRET) {
1479     // Skip to the first instruction before the epilog.
1480     const TargetInstrInfo *TII =
1481       MBB.getParent()->getSubtarget().getInstrInfo();
1482     MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
1483     auto MBBI = MachineBasicBlock::iterator(MI);
1484     MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
1485     while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
1486            FirstEpilogSEH != MBB.begin())
1487       FirstEpilogSEH = std::prev(FirstEpilogSEH);
1488     if (FirstEpilogSEH != MBB.begin())
1489       FirstEpilogSEH = std::next(FirstEpilogSEH);
1490     BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
1491         .addReg(AArch64::X0, RegState::Define)
1492         .addMBB(TargetMBB);
1493     BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
1494         .addReg(AArch64::X0, RegState::Define)
1495         .addReg(AArch64::X0)
1496         .addMBB(TargetMBB)
1497         .addImm(0);
1498     return true;
1499   }
1500 
1501   Register Reg = MI.getOperand(0).getReg();
1502   const GlobalValue *GV =
1503       cast<GlobalValue>((*MI.memoperands_begin())->getValue());
1504   const TargetMachine &TM = MBB.getParent()->getTarget();
1505   unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
1506   const unsigned char MO_NC = AArch64II::MO_NC;
1507 
1508   if ((OpFlags & AArch64II::MO_GOT) != 0) {
1509     BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
1510         .addGlobalAddress(GV, 0, OpFlags);
1511     if (Subtarget.isTargetILP32()) {
1512       unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
1513       BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
1514           .addDef(Reg32, RegState::Dead)
1515           .addUse(Reg, RegState::Kill)
1516           .addImm(0)
1517           .addMemOperand(*MI.memoperands_begin())
1518           .addDef(Reg, RegState::Implicit);
1519     } else {
1520       BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
1521           .addReg(Reg, RegState::Kill)
1522           .addImm(0)
1523           .addMemOperand(*MI.memoperands_begin());
1524     }
1525   } else if (TM.getCodeModel() == CodeModel::Large) {
1526     assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?");
1527     BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
1528         .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC)
1529         .addImm(0);
1530     BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
1531         .addReg(Reg, RegState::Kill)
1532         .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC)
1533         .addImm(16);
1534     BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
1535         .addReg(Reg, RegState::Kill)
1536         .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC)
1537         .addImm(32);
1538     BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
1539         .addReg(Reg, RegState::Kill)
1540         .addGlobalAddress(GV, 0, AArch64II::MO_G3)
1541         .addImm(48);
1542     BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
1543         .addReg(Reg, RegState::Kill)
1544         .addImm(0)
1545         .addMemOperand(*MI.memoperands_begin());
1546   } else if (TM.getCodeModel() == CodeModel::Tiny) {
1547     BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg)
1548         .addGlobalAddress(GV, 0, OpFlags);
1549   } else {
1550     BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
1551         .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
1552     unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
1553     if (Subtarget.isTargetILP32()) {
1554       unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
1555       BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
1556           .addDef(Reg32, RegState::Dead)
1557           .addUse(Reg, RegState::Kill)
1558           .addGlobalAddress(GV, 0, LoFlags)
1559           .addMemOperand(*MI.memoperands_begin())
1560           .addDef(Reg, RegState::Implicit);
1561     } else {
1562       BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
1563           .addReg(Reg, RegState::Kill)
1564           .addGlobalAddress(GV, 0, LoFlags)
1565           .addMemOperand(*MI.memoperands_begin());
1566     }
1567   }
1568 
1569   MBB.erase(MI);
1570 
1571   return true;
1572 }
1573 
1574 // Return true if this instruction simply sets its single destination register
1575 // to zero. This is equivalent to a register rename of the zero-register.
1576 bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
1577   switch (MI.getOpcode()) {
1578   default:
1579     break;
1580   case AArch64::MOVZWi:
1581   case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
1582     if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
1583       assert(MI.getDesc().getNumOperands() == 3 &&
1584              MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
1585       return true;
1586     }
1587     break;
1588   case AArch64::ANDWri: // and Rd, Rzr, #imm
1589     return MI.getOperand(1).getReg() == AArch64::WZR;
1590   case AArch64::ANDXri:
1591     return MI.getOperand(1).getReg() == AArch64::XZR;
1592   case TargetOpcode::COPY:
1593     return MI.getOperand(1).getReg() == AArch64::WZR;
1594   }
1595   return false;
1596 }
1597 
1598 // Return true if this instruction simply renames a general register without
1599 // modifying bits.
1600 bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
1601   switch (MI.getOpcode()) {
1602   default:
1603     break;
1604   case TargetOpcode::COPY: {
1605     // GPR32 copies will be lowered to ORRXrs
1606     Register DstReg = MI.getOperand(0).getReg();
1607     return (AArch64::GPR32RegClass.contains(DstReg) ||
1608             AArch64::GPR64RegClass.contains(DstReg));
1609   }
1610   case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
1611     if (MI.getOperand(1).getReg() == AArch64::XZR) {
1612       assert(MI.getDesc().getNumOperands() == 4 &&
1613              MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
1614       return true;
1615     }
1616     break;
1617   case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
1618     if (MI.getOperand(2).getImm() == 0) {
1619       assert(MI.getDesc().getNumOperands() == 4 &&
1620              MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
1621       return true;
1622     }
1623     break;
1624   }
1625   return false;
1626 }
1627 
1628 // Return true if this instruction simply renames a floating-point register
1629 // without modifying bits.
1630 bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) {
1631   switch (MI.getOpcode()) {
1632   default:
1633     break;
1634   case TargetOpcode::COPY: {
1635     // FPR64 copies will be lowered to ORR.16b
1636     Register DstReg = MI.getOperand(0).getReg();
1637     return (AArch64::FPR64RegClass.contains(DstReg) ||
1638             AArch64::FPR128RegClass.contains(DstReg));
1639   }
1640   case AArch64::ORRv16i8:
1641     if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
1642       assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
1643              "invalid ORRv16i8 operands");
1644       return true;
1645     }
1646     break;
1647   }
1648   return false;
1649 }
1650 
1651 unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
1652                                                int &FrameIndex) const {
1653   switch (MI.getOpcode()) {
1654   default:
1655     break;
1656   case AArch64::LDRWui:
1657   case AArch64::LDRXui:
1658   case AArch64::LDRBui:
1659   case AArch64::LDRHui:
1660   case AArch64::LDRSui:
1661   case AArch64::LDRDui:
1662   case AArch64::LDRQui:
1663     if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
1664         MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
1665       FrameIndex = MI.getOperand(1).getIndex();
1666       return MI.getOperand(0).getReg();
1667     }
1668     break;
1669   }
1670 
1671   return 0;
1672 }
1673 
1674 unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
1675                                               int &FrameIndex) const {
1676   switch (MI.getOpcode()) {
1677   default:
1678     break;
1679   case AArch64::STRWui:
1680   case AArch64::STRXui:
1681   case AArch64::STRBui:
1682   case AArch64::STRHui:
1683   case AArch64::STRSui:
1684   case AArch64::STRDui:
1685   case AArch64::STRQui:
1686     if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
1687         MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
1688       FrameIndex = MI.getOperand(1).getIndex();
1689       return MI.getOperand(0).getReg();
1690     }
1691     break;
1692   }
1693   return 0;
1694 }
1695 
1696 /// Check all MachineMemOperands for a hint to suppress pairing.
1697 bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) {
1698   return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
1699     return MMO->getFlags() & MOSuppressPair;
1700   });
1701 }
1702 
1703 /// Set a flag on the first MachineMemOperand to suppress pairing.
1704 void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) {
1705   if (MI.memoperands_empty())
1706     return;
1707   (*MI.memoperands_begin())->setFlags(MOSuppressPair);
1708 }
1709 
1710 /// Check all MachineMemOperands for a hint that the load/store is strided.
1711 bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) {
1712   return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
1713     return MMO->getFlags() & MOStridedAccess;
1714   });
1715 }
1716 
1717 bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) {
1718   switch (Opc) {
1719   default:
1720     return false;
1721   case AArch64::STURSi:
1722   case AArch64::STURDi:
1723   case AArch64::STURQi:
1724   case AArch64::STURBBi:
1725   case AArch64::STURHHi:
1726   case AArch64::STURWi:
1727   case AArch64::STURXi:
1728   case AArch64::LDURSi:
1729   case AArch64::LDURDi:
1730   case AArch64::LDURQi:
1731   case AArch64::LDURWi:
1732   case AArch64::LDURXi:
1733   case AArch64::LDURSWi:
1734   case AArch64::LDURHHi:
1735   case AArch64::LDURBBi:
1736   case AArch64::LDURSBWi:
1737   case AArch64::LDURSHWi:
1738     return true;
1739   }
1740 }
1741 
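// Map a scaled, unsigned-immediate load/store opcode (e.g. LDRXui) to its
// unscaled, signed-offset counterpart (e.g. LDURXi), or None if there is no
// equivalent.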
1742 Optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) {
1743   switch (Opc) {
1744   default: return {};
1745   case AArch64::PRFMui: return AArch64::PRFUMi;
1746   case AArch64::LDRXui: return AArch64::LDURXi;
1747   case AArch64::LDRWui: return AArch64::LDURWi;
1748   case AArch64::LDRBui: return AArch64::LDURBi;
1749   case AArch64::LDRHui: return AArch64::LDURHi;
1750   case AArch64::LDRSui: return AArch64::LDURSi;
1751   case AArch64::LDRDui: return AArch64::LDURDi;
1752   case AArch64::LDRQui: return AArch64::LDURQi;
1753   case AArch64::LDRBBui: return AArch64::LDURBBi;
1754   case AArch64::LDRHHui: return AArch64::LDURHHi;
1755   case AArch64::LDRSBXui: return AArch64::LDURSBXi;
1756   case AArch64::LDRSBWui: return AArch64::LDURSBWi;
1757   case AArch64::LDRSHXui: return AArch64::LDURSHXi;
1758   case AArch64::LDRSHWui: return AArch64::LDURSHWi;
1759   case AArch64::LDRSWui: return AArch64::LDURSWi;
1760   case AArch64::STRXui: return AArch64::STURXi;
1761   case AArch64::STRWui: return AArch64::STURWi;
1762   case AArch64::STRBui: return AArch64::STURBi;
1763   case AArch64::STRHui: return AArch64::STURHi;
1764   case AArch64::STRSui: return AArch64::STURSi;
1765   case AArch64::STRDui: return AArch64::STURDi;
1766   case AArch64::STRQui: return AArch64::STURQi;
1767   case AArch64::STRBBui: return AArch64::STURBBi;
1768   case AArch64::STRHHui: return AArch64::STURHHi;
1769   }
1770 }
1771 
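// Return the operand index of the immediate offset for this load/store
// opcode: 3 for the paired forms (and LDG/STGP), 2 otherwise.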
1772 unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
1773   switch (Opc) {
1774   default:
1775     return 2;
1776   case AArch64::LDPXi:
1777   case AArch64::LDPDi:
1778   case AArch64::STPXi:
1779   case AArch64::STPDi:
1780   case AArch64::LDNPXi:
1781   case AArch64::LDNPDi:
1782   case AArch64::STNPXi:
1783   case AArch64::STNPDi:
1784   case AArch64::LDPQi:
1785   case AArch64::STPQi:
1786   case AArch64::LDNPQi:
1787   case AArch64::STNPQi:
1788   case AArch64::LDPWi:
1789   case AArch64::LDPSi:
1790   case AArch64::STPWi:
1791   case AArch64::STPSi:
1792   case AArch64::LDNPWi:
1793   case AArch64::LDNPSi:
1794   case AArch64::STNPWi:
1795   case AArch64::STNPSi:
1796   case AArch64::LDG:
1797   case AArch64::STGPi:
1798     return 3;
1799   case AArch64::ADDG:
1800   case AArch64::STGOffset:
1801     return 2;
1802   }
1803 }
1804 
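// Return true if this opcode is one the ldp/stp formation code knows how to
// pair (see shouldClusterMemOps below).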
1805 bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) {
1806   switch (MI.getOpcode()) {
1807   default:
1808     return false;
1809   // Scaled instructions.
1810   case AArch64::STRSui:
1811   case AArch64::STRDui:
1812   case AArch64::STRQui:
1813   case AArch64::STRXui:
1814   case AArch64::STRWui:
1815   case AArch64::LDRSui:
1816   case AArch64::LDRDui:
1817   case AArch64::LDRQui:
1818   case AArch64::LDRXui:
1819   case AArch64::LDRWui:
1820   case AArch64::LDRSWui:
1821   // Unscaled instructions.
1822   case AArch64::STURSi:
1823   case AArch64::STURDi:
1824   case AArch64::STURQi:
1825   case AArch64::STURWi:
1826   case AArch64::STURXi:
1827   case AArch64::LDURSi:
1828   case AArch64::LDURDi:
1829   case AArch64::LDURQi:
1830   case AArch64::LDURWi:
1831   case AArch64::LDURXi:
1832   case AArch64::LDURSWi:
1833     return true;
1834   }
1835 }
1836 
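// Return the flag-setting variant of the given opcode (e.g. ADDWri ->
// ADDSWri) and report the operand width through Is64Bit.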
1837 unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc,
1838                                                    bool &Is64Bit) {
1839   switch (Opc) {
1840   default:
1841     llvm_unreachable("Opcode has no flag setting equivalent!");
1842   // 32-bit cases:
1843   case AArch64::ADDWri:
1844     Is64Bit = false;
1845     return AArch64::ADDSWri;
1846   case AArch64::ADDWrr:
1847     Is64Bit = false;
1848     return AArch64::ADDSWrr;
1849   case AArch64::ADDWrs:
1850     Is64Bit = false;
1851     return AArch64::ADDSWrs;
1852   case AArch64::ADDWrx:
1853     Is64Bit = false;
1854     return AArch64::ADDSWrx;
1855   case AArch64::ANDWri:
1856     Is64Bit = false;
1857     return AArch64::ANDSWri;
1858   case AArch64::ANDWrr:
1859     Is64Bit = false;
1860     return AArch64::ANDSWrr;
1861   case AArch64::ANDWrs:
1862     Is64Bit = false;
1863     return AArch64::ANDSWrs;
1864   case AArch64::BICWrr:
1865     Is64Bit = false;
1866     return AArch64::BICSWrr;
1867   case AArch64::BICWrs:
1868     Is64Bit = false;
1869     return AArch64::BICSWrs;
1870   case AArch64::SUBWri:
1871     Is64Bit = false;
1872     return AArch64::SUBSWri;
1873   case AArch64::SUBWrr:
1874     Is64Bit = false;
1875     return AArch64::SUBSWrr;
1876   case AArch64::SUBWrs:
1877     Is64Bit = false;
1878     return AArch64::SUBSWrs;
1879   case AArch64::SUBWrx:
1880     Is64Bit = false;
1881     return AArch64::SUBSWrx;
1882   // 64-bit cases:
1883   case AArch64::ADDXri:
1884     Is64Bit = true;
1885     return AArch64::ADDSXri;
1886   case AArch64::ADDXrr:
1887     Is64Bit = true;
1888     return AArch64::ADDSXrr;
1889   case AArch64::ADDXrs:
1890     Is64Bit = true;
1891     return AArch64::ADDSXrs;
1892   case AArch64::ADDXrx:
1893     Is64Bit = true;
1894     return AArch64::ADDSXrx;
1895   case AArch64::ANDXri:
1896     Is64Bit = true;
1897     return AArch64::ANDSXri;
1898   case AArch64::ANDXrr:
1899     Is64Bit = true;
1900     return AArch64::ANDSXrr;
1901   case AArch64::ANDXrs:
1902     Is64Bit = true;
1903     return AArch64::ANDSXrs;
1904   case AArch64::BICXrr:
1905     Is64Bit = true;
1906     return AArch64::BICSXrr;
1907   case AArch64::BICXrs:
1908     Is64Bit = true;
1909     return AArch64::BICSXrs;
1910   case AArch64::SUBXri:
1911     Is64Bit = true;
1912     return AArch64::SUBSXri;
1913   case AArch64::SUBXrr:
1914     Is64Bit = true;
1915     return AArch64::SUBSXrr;
1916   case AArch64::SUBXrs:
1917     Is64Bit = true;
1918     return AArch64::SUBSXrs;
1919   case AArch64::SUBXrx:
1920     Is64Bit = true;
1921     return AArch64::SUBSXrx;
1922   }
1923 }
1924 
1925 // Is this a candidate for ld/st merging or pairing?  For example, we don't
1926 // touch volatiles or load/stores that have a hint to avoid pair formation.
1927 bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
1928   // If this is a volatile load/store, don't mess with it.
1929   if (MI.hasOrderedMemoryRef())
1930     return false;
1931 
1932   // Make sure this is a reg/fi+imm (as opposed to an address reloc).
1933   assert((MI.getOperand(1).isReg() || MI.getOperand(1).isFI()) &&
1934          "Expected a reg or frame index operand.");
1935   if (!MI.getOperand(2).isImm())
1936     return false;
1937 
1938   // Can't merge/pair if the instruction modifies the base register.
1939   // e.g., ldr x0, [x0]
1940   // This case will never occur with an FI base.
1941   if (MI.getOperand(1).isReg()) {
1942     Register BaseReg = MI.getOperand(1).getReg();
1943     const TargetRegisterInfo *TRI = &getRegisterInfo();
1944     if (MI.modifiesRegister(BaseReg, TRI))
1945       return false;
1946   }
1947 
1948   // Check if this load/store has a hint to avoid pair formation.
1949   // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
1950   if (isLdStPairSuppressed(MI))
1951     return false;
1952 
1953   // Do not pair any callee-save store/reload instructions in the
1954   // prologue/epilogue if the CFI information encoded the operations as separate
1955   // instructions, as that would cause the size of the actual prologue to differ
1956   // from the prologue size recorded in the Windows CFI.
1957   const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo();
1958   bool NeedsWinCFI = MAI->usesWindowsCFI() &&
1959                      MI.getMF()->getFunction().needsUnwindTableEntry();
1960   if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) ||
1961                       MI.getFlag(MachineInstr::FrameDestroy)))
1962     return false;
1963 
1964   // On some CPUs quad load/store pairs are slower than two single load/stores.
1965   if (Subtarget.isPaired128Slow()) {
1966     switch (MI.getOpcode()) {
1967     default:
1968       break;
1969     case AArch64::LDURQi:
1970     case AArch64::STURQi:
1971     case AArch64::LDRQui:
1972     case AArch64::STRQui:
1973       return false;
1974     }
1975   }
1976 
1977   return true;
1978 }
1979 
1980 bool AArch64InstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt,
1981                                           const MachineOperand *&BaseOp,
1982                                           int64_t &Offset,
1983                                           const TargetRegisterInfo *TRI) const {
1984   unsigned Width;
1985   return getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, Width, TRI);
1986 }
1987 
1988 bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
1989     const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
1990     unsigned &Width, const TargetRegisterInfo *TRI) const {
1991   assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
1992   // Handle only loads/stores with base register followed by immediate offset.
1993   if (LdSt.getNumExplicitOperands() == 3) {
1994     // Non-paired instruction (e.g., ldr x1, [x0, #8]).
1995     if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) ||
1996         !LdSt.getOperand(2).isImm())
1997       return false;
1998   } else if (LdSt.getNumExplicitOperands() == 4) {
1999     // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
2000     if (!LdSt.getOperand(1).isReg() ||
2001         (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) ||
2002         !LdSt.getOperand(3).isImm())
2003       return false;
2004   } else
2005     return false;
2006 
2007   // Get the scaling factor for the instruction and set the width for the
2008   // instruction.
2009   unsigned Scale = 0;
2010   int64_t Dummy1, Dummy2;
2011 
2012   // If this returns false, then it's an instruction we don't want to handle.
2013   if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2))
2014     return false;
2015 
2016   // Compute the offset. Offset is calculated as the immediate operand
2017   // multiplied by the scaling factor. Unscaled instructions have scaling factor
2018   // set to 1.
2019   if (LdSt.getNumExplicitOperands() == 3) {
2020     BaseOp = &LdSt.getOperand(1);
2021     Offset = LdSt.getOperand(2).getImm() * Scale;
2022   } else {
2023     assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
2024     BaseOp = &LdSt.getOperand(2);
2025     Offset = LdSt.getOperand(3).getImm() * Scale;
2026   }
2027 
2028   assert((BaseOp->isReg() || BaseOp->isFI()) &&
2029          "getMemOperandWithOffset only supports base "
2030          "operands of type register or frame index.");
2031 
2032   return true;
2033 }
2034 
2035 MachineOperand &
2036 AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const {
2037   assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
2038   MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1);
2039   assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
2040   return OfsOp;
2041 }
2042 
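// Describe the memory access performed by a load/store opcode: the scale
// applied to the immediate operand, the access width in bytes, and the legal
// range of the immediate operand itself. For example, LDRXui yields
// Scale = Width = 8 with immediates in [0, 4095], so a byte offset of 16 is
// carried as an immediate operand of 2.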
2043 bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
2044                                     unsigned &Width, int64_t &MinOffset,
2045                                     int64_t &MaxOffset) {
2046   switch (Opcode) {
2047   // Not a memory operation, or not one we want to handle.
2048   default:
2049     Scale = Width = 0;
2050     MinOffset = MaxOffset = 0;
2051     return false;
2052   case AArch64::STRWpost:
2053   case AArch64::LDRWpost:
2054     Width = 32;
2055     Scale = 4;
2056     MinOffset = -256;
2057     MaxOffset = 255;
2058     break;
2059   case AArch64::LDURQi:
2060   case AArch64::STURQi:
2061     Width = 16;
2062     Scale = 1;
2063     MinOffset = -256;
2064     MaxOffset = 255;
2065     break;
2066   case AArch64::PRFUMi:
2067   case AArch64::LDURXi:
2068   case AArch64::LDURDi:
2069   case AArch64::STURXi:
2070   case AArch64::STURDi:
2071     Width = 8;
2072     Scale = 1;
2073     MinOffset = -256;
2074     MaxOffset = 255;
2075     break;
2076   case AArch64::LDURWi:
2077   case AArch64::LDURSi:
2078   case AArch64::LDURSWi:
2079   case AArch64::STURWi:
2080   case AArch64::STURSi:
2081     Width = 4;
2082     Scale = 1;
2083     MinOffset = -256;
2084     MaxOffset = 255;
2085     break;
2086   case AArch64::LDURHi:
2087   case AArch64::LDURHHi:
2088   case AArch64::LDURSHXi:
2089   case AArch64::LDURSHWi:
2090   case AArch64::STURHi:
2091   case AArch64::STURHHi:
2092     Width = 2;
2093     Scale = 1;
2094     MinOffset = -256;
2095     MaxOffset = 255;
2096     break;
2097   case AArch64::LDURBi:
2098   case AArch64::LDURBBi:
2099   case AArch64::LDURSBXi:
2100   case AArch64::LDURSBWi:
2101   case AArch64::STURBi:
2102   case AArch64::STURBBi:
2103     Width = 1;
2104     Scale = 1;
2105     MinOffset = -256;
2106     MaxOffset = 255;
2107     break;
2108   case AArch64::LDPQi:
2109   case AArch64::LDNPQi:
2110   case AArch64::STPQi:
2111   case AArch64::STNPQi:
2112     Scale = 16;
2113     Width = 32;
2114     MinOffset = -64;
2115     MaxOffset = 63;
2116     break;
2117   case AArch64::LDRQui:
2118   case AArch64::STRQui:
2119     Scale = Width = 16;
2120     MinOffset = 0;
2121     MaxOffset = 4095;
2122     break;
2123   case AArch64::LDPXi:
2124   case AArch64::LDPDi:
2125   case AArch64::LDNPXi:
2126   case AArch64::LDNPDi:
2127   case AArch64::STPXi:
2128   case AArch64::STPDi:
2129   case AArch64::STNPXi:
2130   case AArch64::STNPDi:
2131     Scale = 8;
2132     Width = 16;
2133     MinOffset = -64;
2134     MaxOffset = 63;
2135     break;
2136   case AArch64::PRFMui:
2137   case AArch64::LDRXui:
2138   case AArch64::LDRDui:
2139   case AArch64::STRXui:
2140   case AArch64::STRDui:
2141     Scale = Width = 8;
2142     MinOffset = 0;
2143     MaxOffset = 4095;
2144     break;
2145   case AArch64::LDPWi:
2146   case AArch64::LDPSi:
2147   case AArch64::LDNPWi:
2148   case AArch64::LDNPSi:
2149   case AArch64::STPWi:
2150   case AArch64::STPSi:
2151   case AArch64::STNPWi:
2152   case AArch64::STNPSi:
2153     Scale = 4;
2154     Width = 8;
2155     MinOffset = -64;
2156     MaxOffset = 63;
2157     break;
2158   case AArch64::LDRWui:
2159   case AArch64::LDRSui:
2160   case AArch64::LDRSWui:
2161   case AArch64::STRWui:
2162   case AArch64::STRSui:
2163     Scale = Width = 4;
2164     MinOffset = 0;
2165     MaxOffset = 4095;
2166     break;
2167   case AArch64::LDRHui:
2168   case AArch64::LDRHHui:
2169   case AArch64::LDRSHWui:
2170   case AArch64::LDRSHXui:
2171   case AArch64::STRHui:
2172   case AArch64::STRHHui:
2173     Scale = Width = 2;
2174     MinOffset = 0;
2175     MaxOffset = 4095;
2176     break;
2177   case AArch64::LDRBui:
2178   case AArch64::LDRBBui:
2179   case AArch64::LDRSBWui:
2180   case AArch64::LDRSBXui:
2181   case AArch64::STRBui:
2182   case AArch64::STRBBui:
2183     Scale = Width = 1;
2184     MinOffset = 0;
2185     MaxOffset = 4095;
2186     break;
2187   case AArch64::ADDG:
2188   case AArch64::TAGPstack:
2189     Scale = 16;
2190     Width = 0;
2191     MinOffset = 0;
2192     MaxOffset = 63;
2193     break;
2194   case AArch64::LDG:
2195   case AArch64::STGOffset:
2196   case AArch64::STZGOffset:
2197     Scale = Width = 16;
2198     MinOffset = -256;
2199     MaxOffset = 255;
2200     break;
2201   case AArch64::LDR_PXI:
2202   case AArch64::STR_PXI:
2203     Scale = Width = 2;
2204     MinOffset = -256;
2205     MaxOffset = 255;
2206     break;
2207   case AArch64::LDR_ZXI:
2208   case AArch64::STR_ZXI:
2209     Scale = Width = 16;
2210     MinOffset = -256;
2211     MaxOffset = 255;
2212     break;
2213   case AArch64::ST2GOffset:
2214   case AArch64::STZ2GOffset:
2215     Scale = 16;
2216     Width = 32;
2217     MinOffset = -256;
2218     MaxOffset = 255;
2219     break;
2220   case AArch64::STGPi:
2221     Scale = Width = 16;
2222     MinOffset = -64;
2223     MaxOffset = 63;
2224     break;
2225   }
2226 
2227   return true;
2228 }
2229 
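// Return the byte stride used to scale offsets for the unscaled opcodes
// handled by scaleOffset/unscaleOffset below, or 0 if the opcode is not
// handled.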
2230 static unsigned getOffsetStride(unsigned Opc) {
2231   switch (Opc) {
2232   default:
2233     return 0;
2234   case AArch64::LDURQi:
2235   case AArch64::STURQi:
2236     return 16;
2237   case AArch64::LDURXi:
2238   case AArch64::LDURDi:
2239   case AArch64::STURXi:
2240   case AArch64::STURDi:
2241     return 8;
2242   case AArch64::LDURWi:
2243   case AArch64::LDURSi:
2244   case AArch64::LDURSWi:
2245   case AArch64::STURWi:
2246   case AArch64::STURSi:
2247     return 4;
2248   }
2249 }
2250 
2251 // Scale the unscaled offset. Returns false if the unscaled offset can't be
2252 // scaled.
2253 static bool scaleOffset(unsigned Opc, int64_t &Offset) {
2254   unsigned OffsetStride = getOffsetStride(Opc);
2255   if (OffsetStride == 0)
2256     return false;
2257   // If the byte-offset isn't a multiple of the stride, we can't scale this
2258   // offset.
2259   if (Offset % OffsetStride != 0)
2260     return false;
2261 
2262   // Convert the byte-offset used by unscaled into an "element" offset used
2263   // by the scaled pair load/store instructions.
2264   Offset /= OffsetStride;
2265   return true;
2266 }
2267 
2268 // Unscale the scaled offset. Returns false if the scaled offset can't be
2269 // unscaled.
2270 static bool unscaleOffset(unsigned Opc, int64_t &Offset) {
2271   unsigned OffsetStride = getOffsetStride(Opc);
2272   if (OffsetStride == 0)
2273     return false;
2274 
2275   // Convert the "element" offset used by scaled pair load/store instructions
2276   // into the byte-offset used by unscaled.
2277   Offset *= OffsetStride;
2278   return true;
2279 }
2280 
2281 static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
2282   if (FirstOpc == SecondOpc)
2283     return true;
2284   // We can also pair sign-ext and zero-ext instructions.
2285   switch (FirstOpc) {
2286   default:
2287     return false;
2288   case AArch64::LDRWui:
2289   case AArch64::LDURWi:
2290     return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
2291   case AArch64::LDRSWui:
2292   case AArch64::LDURSWi:
2293     return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
2294   }
2295   // These instructions can't be paired based on their opcodes.
2296   return false;
2297 }
2298 
2299 static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
2300                             int64_t Offset1, unsigned Opcode1, int FI2,
2301                             int64_t Offset2, unsigned Opcode2) {
2302   // Accesses through fixed stack object frame indices may access a different
2303   // fixed stack slot. Check that the object offsets + offsets match.
2304   if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) {
2305     int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
2306     int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
2307     assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
2308     // Get the byte-offset from the object offset.
2309     if (!unscaleOffset(Opcode1, Offset1) || !unscaleOffset(Opcode2, Offset2))
2310       return false;
2311     ObjectOffset1 += Offset1;
2312     ObjectOffset2 += Offset2;
2313     // Get the "element" index in the object.
2314     if (!scaleOffset(Opcode1, ObjectOffset1) ||
2315         !scaleOffset(Opcode2, ObjectOffset2))
2316       return false;
2317     return ObjectOffset1 + 1 == ObjectOffset2;
2318   }
2319 
2320   return FI1 == FI2;
2321 }
2322 
2323 /// Detect opportunities for ldp/stp formation.
2324 ///
2325 /// Only called for LdSt for which getMemOperandWithOffset returns true.
2326 bool AArch64InstrInfo::shouldClusterMemOps(const MachineOperand &BaseOp1,
2327                                            const MachineOperand &BaseOp2,
2328                                            unsigned NumLoads) const {
2329   const MachineInstr &FirstLdSt = *BaseOp1.getParent();
2330   const MachineInstr &SecondLdSt = *BaseOp2.getParent();
2331   if (BaseOp1.getType() != BaseOp2.getType())
2332     return false;
2333 
2334   assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
2335          "Only base registers and frame indices are supported.");
2336 
2337   // Check for both base regs and base FI.
2338   if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
2339     return false;
2340 
2341   // Only cluster up to a single pair.
2342   if (NumLoads > 1)
2343     return false;
2344 
2345   if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt))
2346     return false;
2347 
2348   // Can we pair these instructions based on their opcodes?
2349   unsigned FirstOpc = FirstLdSt.getOpcode();
2350   unsigned SecondOpc = SecondLdSt.getOpcode();
2351   if (!canPairLdStOpc(FirstOpc, SecondOpc))
2352     return false;
2353 
2354   // Can't merge volatiles or load/stores that have a hint to avoid pair
2355   // formation, for example.
2356   if (!isCandidateToMergeOrPair(FirstLdSt) ||
2357       !isCandidateToMergeOrPair(SecondLdSt))
2358     return false;
2359 
2360   // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
2361   int64_t Offset1 = FirstLdSt.getOperand(2).getImm();
2362   if (isUnscaledLdSt(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
2363     return false;
2364 
2365   int64_t Offset2 = SecondLdSt.getOperand(2).getImm();
2366   if (isUnscaledLdSt(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
2367     return false;
2368 
2369   // Pairwise instructions have a 7-bit signed offset field.
2370   if (Offset1 > 63 || Offset1 < -64)
2371     return false;
2372 
2373   // The caller should already have ordered First/SecondLdSt by offset.
2374   // Note: this does not hold when the bases are non-equal frame indices.
2375   if (BaseOp1.isFI()) {
2376     assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 >= Offset2) &&
2377            "Caller should have ordered offsets.");
2378 
2379     const MachineFrameInfo &MFI =
2380         FirstLdSt.getParent()->getParent()->getFrameInfo();
2381     return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc,
2382                            BaseOp2.getIndex(), Offset2, SecondOpc);
2383   }
2384 
2385   assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
2386          "Caller should have ordered offsets.");
2387 
2388   return Offset1 + 1 == Offset2;
2389 }
2390 
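// Add Reg to MIB with the given register state. If SubIdx is non-zero, add
// the corresponding physical sub-register for physical registers, or attach
// the sub-register index for virtual registers.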
2391 static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB,
2392                                             unsigned Reg, unsigned SubIdx,
2393                                             unsigned State,
2394                                             const TargetRegisterInfo *TRI) {
2395   if (!SubIdx)
2396     return MIB.addReg(Reg, State);
2397 
2398   if (Register::isPhysicalRegister(Reg))
2399     return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
2400   return MIB.addReg(Reg, State, SubIdx);
2401 }
2402 
2403 static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
2404                                         unsigned NumRegs) {
2405   // We really want the positive remainder mod 32 here, which happens to be
2406   // easily obtainable with a mask.
2407   return ((DestReg - SrcReg) & 0x1f) < NumRegs;
2408 }
2409 
2410 void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB,
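// Copy a vector register tuple by ORRing each source sub-register into the
// matching destination sub-register, iterating in reverse when a forward copy
// would clobber source sub-registers that have not been read yet.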
2411                                         MachineBasicBlock::iterator I,
2412                                         const DebugLoc &DL, unsigned DestReg,
2413                                         unsigned SrcReg, bool KillSrc,
2414                                         unsigned Opcode,
2415                                         ArrayRef<unsigned> Indices) const {
2416   assert(Subtarget.hasNEON() && "Unexpected register copy without NEON");
2417   const TargetRegisterInfo *TRI = &getRegisterInfo();
2418   uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
2419   uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
2420   unsigned NumRegs = Indices.size();
2421 
2422   int SubReg = 0, End = NumRegs, Incr = 1;
2423   if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) {
2424     SubReg = NumRegs - 1;
2425     End = -1;
2426     Incr = -1;
2427   }
2428 
2429   for (; SubReg != End; SubReg += Incr) {
2430     const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
2431     AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
2432     AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI);
2433     AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
2434   }
2435 }
2436 
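// Copy a GPR register pair by emitting one ORR <reg>, <zero-reg>, <sub-reg>,
// LSL #0 per sub-register. GPR pair encodings are aligned to the tuple size,
// so source and destination sub-registers can never partially overlap.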
2437 void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB,
2438                                        MachineBasicBlock::iterator I,
2439                                        DebugLoc DL, unsigned DestReg,
2440                                        unsigned SrcReg, bool KillSrc,
2441                                        unsigned Opcode, unsigned ZeroReg,
2442                                        llvm::ArrayRef<unsigned> Indices) const {
2443   const TargetRegisterInfo *TRI = &getRegisterInfo();
2444   unsigned NumRegs = Indices.size();
2445 
2446 #ifndef NDEBUG
2447   uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
2448   uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
2449   assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 &&
2450          "GPR reg sequences should not be able to overlap");
2451 #endif
2452 
2453   for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) {
2454     const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
2455     AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
2456     MIB.addReg(ZeroReg);
2457     AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
2458     MIB.addImm(0);
2459   }
2460 }
2461 
2462 void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
2463                                    MachineBasicBlock::iterator I,
2464                                    const DebugLoc &DL, unsigned DestReg,
2465                                    unsigned SrcReg, bool KillSrc) const {
2466   if (AArch64::GPR32spRegClass.contains(DestReg) &&
2467       (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) {
2468     const TargetRegisterInfo *TRI = &getRegisterInfo();
2469 
2470     if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
2471       // If either operand is WSP, expand to ADD #0.
2472       if (Subtarget.hasZeroCycleRegMove()) {
2473         // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
2474         unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, AArch64::sub_32,
2475                                                      &AArch64::GPR64spRegClass);
2476         unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_32,
2477                                                     &AArch64::GPR64spRegClass);
2478         // This instruction is reading and writing X registers.  This may upset
2479         // the register scavenger and machine verifier, so we need to indicate
2480         // that we are reading an undefined value from SrcRegX, but a proper
2481         // value from SrcReg.
2482         BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX)
2483             .addReg(SrcRegX, RegState::Undef)
2484             .addImm(0)
2485             .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
2486             .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
2487       } else {
2488         BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg)
2489             .addReg(SrcReg, getKillRegState(KillSrc))
2490             .addImm(0)
2491             .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
2492       }
2493     } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) {
2494       BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
2495           .addImm(0)
2496           .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
2497     } else {
2498       if (Subtarget.hasZeroCycleRegMove()) {
2499         // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
2500         unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, AArch64::sub_32,
2501                                                      &AArch64::GPR64spRegClass);
2502         unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_32,
2503                                                     &AArch64::GPR64spRegClass);
2504         // This instruction is reading and writing X registers.  This may upset
2505         // the register scavenger and machine verifier, so we need to indicate
2506         // that we are reading an undefined value from SrcRegX, but a proper
2507         // value from SrcReg.
2508         BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX)
2509             .addReg(AArch64::XZR)
2510             .addReg(SrcRegX, RegState::Undef)
2511             .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
2512       } else {
2513         // Otherwise, expand to ORR WZR.
2514         BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
2515             .addReg(AArch64::WZR)
2516             .addReg(SrcReg, getKillRegState(KillSrc));
2517       }
2518     }
2519     return;
2520   }
2521 
2522   // Copy a Predicate register by ORRing with itself.
2523   if (AArch64::PPRRegClass.contains(DestReg) &&
2524       AArch64::PPRRegClass.contains(SrcReg)) {
2525     assert(Subtarget.hasSVE() && "Unexpected SVE register.");
2526     BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg)
2527       .addReg(SrcReg) // Pg
2528       .addReg(SrcReg)
2529       .addReg(SrcReg, getKillRegState(KillSrc));
2530     return;
2531   }
2532 
2533   // Copy a Z register by ORRing with itself.
2534   if (AArch64::ZPRRegClass.contains(DestReg) &&
2535       AArch64::ZPRRegClass.contains(SrcReg)) {
2536     assert(Subtarget.hasSVE() && "Unexpected SVE register.");
2537     BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg)
2538       .addReg(SrcReg)
2539       .addReg(SrcReg, getKillRegState(KillSrc));
2540     return;
2541   }
2542 
2543   if (AArch64::GPR64spRegClass.contains(DestReg) &&
2544       (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) {
2545     if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
2546       // If either operand is SP, expand to ADD #0.
2547       BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
2548           .addReg(SrcReg, getKillRegState(KillSrc))
2549           .addImm(0)
2550           .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
2551     } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) {
2552       BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
2553           .addImm(0)
2554           .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
2555     } else {
2556       // Otherwise, expand to ORR XZR.
2557       BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
2558           .addReg(AArch64::XZR)
2559           .addReg(SrcReg, getKillRegState(KillSrc));
2560     }
2561     return;
2562   }
2563 
2564   // Copy a DDDD register quad by copying the individual sub-registers.
2565   if (AArch64::DDDDRegClass.contains(DestReg) &&
2566       AArch64::DDDDRegClass.contains(SrcReg)) {
2567     static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
2568                                        AArch64::dsub2, AArch64::dsub3};
2569     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
2570                      Indices);
2571     return;
2572   }
2573 
2574   // Copy a DDD register triple by copying the individual sub-registers.
2575   if (AArch64::DDDRegClass.contains(DestReg) &&
2576       AArch64::DDDRegClass.contains(SrcReg)) {
2577     static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
2578                                        AArch64::dsub2};
2579     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
2580                      Indices);
2581     return;
2582   }
2583 
2584   // Copy a DD register pair by copying the individual sub-registers.
2585   if (AArch64::DDRegClass.contains(DestReg) &&
2586       AArch64::DDRegClass.contains(SrcReg)) {
2587     static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1};
2588     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
2589                      Indices);
2590     return;
2591   }
2592 
2593   // Copy a QQQQ register quad by copying the individual sub-registers.
2594   if (AArch64::QQQQRegClass.contains(DestReg) &&
2595       AArch64::QQQQRegClass.contains(SrcReg)) {
2596     static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
2597                                        AArch64::qsub2, AArch64::qsub3};
2598     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
2599                      Indices);
2600     return;
2601   }
2602 
2603   // Copy a QQQ register triple by copying the individual sub-registers.
2604   if (AArch64::QQQRegClass.contains(DestReg) &&
2605       AArch64::QQQRegClass.contains(SrcReg)) {
2606     static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
2607                                        AArch64::qsub2};
2608     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
2609                      Indices);
2610     return;
2611   }
2612 
2613   // Copy a QQ register pair by copying the individual sub-registers.
2614   if (AArch64::QQRegClass.contains(DestReg) &&
2615       AArch64::QQRegClass.contains(SrcReg)) {
2616     static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1};
2617     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
2618                      Indices);
2619     return;
2620   }
2621 
2622   if (AArch64::XSeqPairsClassRegClass.contains(DestReg) &&
2623       AArch64::XSeqPairsClassRegClass.contains(SrcReg)) {
2624     static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};
2625     copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs,
2626                     AArch64::XZR, Indices);
2627     return;
2628   }
2629 
2630   if (AArch64::WSeqPairsClassRegClass.contains(DestReg) &&
2631       AArch64::WSeqPairsClassRegClass.contains(SrcReg)) {
2632     static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};
2633     copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs,
2634                     AArch64::WZR, Indices);
2635     return;
2636   }
2637 
2638   if (AArch64::FPR128RegClass.contains(DestReg) &&
2639       AArch64::FPR128RegClass.contains(SrcReg)) {
2640     if (Subtarget.hasNEON()) {
2641       BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2642           .addReg(SrcReg)
2643           .addReg(SrcReg, getKillRegState(KillSrc));
2644     } else {
2645       BuildMI(MBB, I, DL, get(AArch64::STRQpre))
2646           .addReg(AArch64::SP, RegState::Define)
2647           .addReg(SrcReg, getKillRegState(KillSrc))
2648           .addReg(AArch64::SP)
2649           .addImm(-16);
2650       BuildMI(MBB, I, DL, get(AArch64::LDRQpre))
2651           .addReg(AArch64::SP, RegState::Define)
2652           .addReg(DestReg, RegState::Define)
2653           .addReg(AArch64::SP)
2654           .addImm(16);
2655     }
2656     return;
2657   }
2658 
2659   if (AArch64::FPR64RegClass.contains(DestReg) &&
2660       AArch64::FPR64RegClass.contains(SrcReg)) {
2661     if (Subtarget.hasNEON()) {
2662       DestReg = RI.getMatchingSuperReg(DestReg, AArch64::dsub,
2663                                        &AArch64::FPR128RegClass);
2664       SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::dsub,
2665                                       &AArch64::FPR128RegClass);
2666       BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2667           .addReg(SrcReg)
2668           .addReg(SrcReg, getKillRegState(KillSrc));
2669     } else {
2670       BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
2671           .addReg(SrcReg, getKillRegState(KillSrc));
2672     }
2673     return;
2674   }
2675 
2676   if (AArch64::FPR32RegClass.contains(DestReg) &&
2677       AArch64::FPR32RegClass.contains(SrcReg)) {
2678     if (Subtarget.hasNEON()) {
2679       DestReg = RI.getMatchingSuperReg(DestReg, AArch64::ssub,
2680                                        &AArch64::FPR128RegClass);
2681       SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::ssub,
2682                                       &AArch64::FPR128RegClass);
2683       BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2684           .addReg(SrcReg)
2685           .addReg(SrcReg, getKillRegState(KillSrc));
2686     } else {
2687       BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
2688           .addReg(SrcReg, getKillRegState(KillSrc));
2689     }
2690     return;
2691   }
2692 
2693   if (AArch64::FPR16RegClass.contains(DestReg) &&
2694       AArch64::FPR16RegClass.contains(SrcReg)) {
2695     if (Subtarget.hasNEON()) {
2696       DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
2697                                        &AArch64::FPR128RegClass);
2698       SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
2699                                       &AArch64::FPR128RegClass);
2700       BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2701           .addReg(SrcReg)
2702           .addReg(SrcReg, getKillRegState(KillSrc));
2703     } else {
2704       DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
2705                                        &AArch64::FPR32RegClass);
2706       SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
2707                                       &AArch64::FPR32RegClass);
2708       BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
2709           .addReg(SrcReg, getKillRegState(KillSrc));
2710     }
2711     return;
2712   }
2713 
2714   if (AArch64::FPR8RegClass.contains(DestReg) &&
2715       AArch64::FPR8RegClass.contains(SrcReg)) {
2716     if (Subtarget.hasNEON()) {
2717       DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
2718                                        &AArch64::FPR128RegClass);
2719       SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
2720                                       &AArch64::FPR128RegClass);
2721       BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2722           .addReg(SrcReg)
2723           .addReg(SrcReg, getKillRegState(KillSrc));
2724     } else {
2725       DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
2726                                        &AArch64::FPR32RegClass);
2727       SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
2728                                       &AArch64::FPR32RegClass);
2729       BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
2730           .addReg(SrcReg, getKillRegState(KillSrc));
2731     }
2732     return;
2733   }
2734 
2735   // Copies between GPR64 and FPR64.
2736   if (AArch64::FPR64RegClass.contains(DestReg) &&
2737       AArch64::GPR64RegClass.contains(SrcReg)) {
2738     BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
2739         .addReg(SrcReg, getKillRegState(KillSrc));
2740     return;
2741   }
2742   if (AArch64::GPR64RegClass.contains(DestReg) &&
2743       AArch64::FPR64RegClass.contains(SrcReg)) {
2744     BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
2745         .addReg(SrcReg, getKillRegState(KillSrc));
2746     return;
2747   }
2748   // Copies between GPR32 and FPR32.
2749   if (AArch64::FPR32RegClass.contains(DestReg) &&
2750       AArch64::GPR32RegClass.contains(SrcReg)) {
2751     BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
2752         .addReg(SrcReg, getKillRegState(KillSrc));
2753     return;
2754   }
2755   if (AArch64::GPR32RegClass.contains(DestReg) &&
2756       AArch64::FPR32RegClass.contains(SrcReg)) {
2757     BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
2758         .addReg(SrcReg, getKillRegState(KillSrc));
2759     return;
2760   }
2761 
2762   if (DestReg == AArch64::NZCV) {
2763     assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
2764     BuildMI(MBB, I, DL, get(AArch64::MSR))
2765         .addImm(AArch64SysReg::NZCV)
2766         .addReg(SrcReg, getKillRegState(KillSrc))
2767         .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define);
2768     return;
2769   }
2770 
2771   if (SrcReg == AArch64::NZCV) {
2772     assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
2773     BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg)
2774         .addImm(AArch64SysReg::NZCV)
2775         .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
2776     return;
2777   }
2778 
2779   llvm_unreachable("unimplemented reg-to-reg copy");
2780 }
2781 
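// Emit a store-pair (e.g. STPWi or STPXi) of the two sub-registers of SrcReg
// into stack slot FI.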
2782 static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI,
2783                                     MachineBasicBlock &MBB,
2784                                     MachineBasicBlock::iterator InsertBefore,
2785                                     const MCInstrDesc &MCID,
2786                                     unsigned SrcReg, bool IsKill,
2787                                     unsigned SubIdx0, unsigned SubIdx1, int FI,
2788                                     MachineMemOperand *MMO) {
2789   unsigned SrcReg0 = SrcReg;
2790   unsigned SrcReg1 = SrcReg;
2791   if (Register::isPhysicalRegister(SrcReg)) {
2792     SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0);
2793     SubIdx0 = 0;
2794     SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1);
2795     SubIdx1 = 0;
2796   }
2797   BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
2798       .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0)
2799       .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1)
2800       .addFrameIndex(FI)
2801       .addImm(0)
2802       .addMemOperand(MMO);
2803 }
2804 
2805 void AArch64InstrInfo::storeRegToStackSlot(
2806     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned SrcReg,
2807     bool isKill, int FI, const TargetRegisterClass *RC,
2808     const TargetRegisterInfo *TRI) const {
2809   MachineFunction &MF = *MBB.getParent();
2810   MachineFrameInfo &MFI = MF.getFrameInfo();
2811   unsigned Align = MFI.getObjectAlignment(FI);
2812 
2813   MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
2814   MachineMemOperand *MMO = MF.getMachineMemOperand(
2815       PtrInfo, MachineMemOperand::MOStore, MFI.getObjectSize(FI), Align);
2816   unsigned Opc = 0;
2817   bool Offset = true;
2818   switch (TRI->getSpillSize(*RC)) {
2819   case 1:
2820     if (AArch64::FPR8RegClass.hasSubClassEq(RC))
2821       Opc = AArch64::STRBui;
2822     break;
2823   case 2:
2824     if (AArch64::FPR16RegClass.hasSubClassEq(RC))
2825       Opc = AArch64::STRHui;
2826     break;
2827   case 4:
2828     if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
2829       Opc = AArch64::STRWui;
2830       if (Register::isVirtualRegister(SrcReg))
2831         MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
2832       else
2833         assert(SrcReg != AArch64::WSP);
2834     } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
2835       Opc = AArch64::STRSui;
2836     break;
2837   case 8:
2838     if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
2839       Opc = AArch64::STRXui;
2840       if (Register::isVirtualRegister(SrcReg))
2841         MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
2842       else
2843         assert(SrcReg != AArch64::SP);
2844     } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
2845       Opc = AArch64::STRDui;
2846     } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
2847       storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
2848                               get(AArch64::STPWi), SrcReg, isKill,
2849                               AArch64::sube32, AArch64::subo32, FI, MMO);
2850       return;
2851     }
2852     break;
2853   case 16:
2854     if (AArch64::FPR128RegClass.hasSubClassEq(RC))
2855       Opc = AArch64::STRQui;
2856     else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
2857       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
2858       Opc = AArch64::ST1Twov1d;
2859       Offset = false;
2860     } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
2861       storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
2862                               get(AArch64::STPXi), SrcReg, isKill,
2863                               AArch64::sube64, AArch64::subo64, FI, MMO);
2864       return;
2865     }
2866     break;
2867   case 24:
2868     if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
2869       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
2870       Opc = AArch64::ST1Threev1d;
2871       Offset = false;
2872     }
2873     break;
2874   case 32:
2875     if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
2876       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
2877       Opc = AArch64::ST1Fourv1d;
2878       Offset = false;
2879     } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
2880       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
2881       Opc = AArch64::ST1Twov2d;
2882       Offset = false;
2883     }
2884     break;
2885   case 48:
2886     if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
2887       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
2888       Opc = AArch64::ST1Threev2d;
2889       Offset = false;
2890     }
2891     break;
2892   case 64:
2893     if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
2894       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
2895       Opc = AArch64::ST1Fourv2d;
2896       Offset = false;
2897     }
2898     break;
2899   }
2900   assert(Opc && "Unknown register class");
2901 
2902   const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
2903                                      .addReg(SrcReg, getKillRegState(isKill))
2904                                      .addFrameIndex(FI);
2905 
2906   if (Offset)
2907     MI.addImm(0);
2908   MI.addMemOperand(MMO);
2909 }
2910 
2911 static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI,
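// Emit a load-pair (e.g. LDPWi or LDPXi) of the two sub-registers of DestReg
// from stack slot FI.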
2912                                      MachineBasicBlock &MBB,
2913                                      MachineBasicBlock::iterator InsertBefore,
2914                                      const MCInstrDesc &MCID,
2915                                      unsigned DestReg, unsigned SubIdx0,
2916                                      unsigned SubIdx1, int FI,
2917                                      MachineMemOperand *MMO) {
2918   unsigned DestReg0 = DestReg;
2919   unsigned DestReg1 = DestReg;
2920   bool IsUndef = true;
2921   if (Register::isPhysicalRegister(DestReg)) {
2922     DestReg0 = TRI.getSubReg(DestReg, SubIdx0);
2923     SubIdx0 = 0;
2924     DestReg1 = TRI.getSubReg(DestReg, SubIdx1);
2925     SubIdx1 = 0;
2926     IsUndef = false;
2927   }
2928   BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
2929       .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0)
2930       .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1)
2931       .addFrameIndex(FI)
2932       .addImm(0)
2933       .addMemOperand(MMO);
2934 }
2935 
2936 void AArch64InstrInfo::loadRegFromStackSlot(
2937     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned DestReg,
2938     int FI, const TargetRegisterClass *RC,
2939     const TargetRegisterInfo *TRI) const {
2940   MachineFunction &MF = *MBB.getParent();
2941   MachineFrameInfo &MFI = MF.getFrameInfo();
2942   unsigned Align = MFI.getObjectAlignment(FI);
2943   MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
2944   MachineMemOperand *MMO = MF.getMachineMemOperand(
2945       PtrInfo, MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align);
2946 
2947   unsigned Opc = 0;
2948   bool Offset = true;
2949   switch (TRI->getSpillSize(*RC)) {
2950   case 1:
2951     if (AArch64::FPR8RegClass.hasSubClassEq(RC))
2952       Opc = AArch64::LDRBui;
2953     break;
2954   case 2:
2955     if (AArch64::FPR16RegClass.hasSubClassEq(RC))
2956       Opc = AArch64::LDRHui;
2957     break;
2958   case 4:
2959     if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
2960       Opc = AArch64::LDRWui;
2961       if (Register::isVirtualRegister(DestReg))
2962         MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
2963       else
2964         assert(DestReg != AArch64::WSP);
2965     } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
2966       Opc = AArch64::LDRSui;
2967     break;
2968   case 8:
2969     if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
2970       Opc = AArch64::LDRXui;
2971       if (Register::isVirtualRegister(DestReg))
2972         MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
2973       else
2974         assert(DestReg != AArch64::SP);
2975     } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
2976       Opc = AArch64::LDRDui;
2977     } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
2978       loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
2979                                get(AArch64::LDPWi), DestReg, AArch64::sube32,
2980                                AArch64::subo32, FI, MMO);
2981       return;
2982     }
2983     break;
2984   case 16:
2985     if (AArch64::FPR128RegClass.hasSubClassEq(RC))
2986       Opc = AArch64::LDRQui;
2987     else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
2988       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
2989       Opc = AArch64::LD1Twov1d;
2990       Offset = false;
2991     } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
2992       loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
2993                                get(AArch64::LDPXi), DestReg, AArch64::sube64,
2994                                AArch64::subo64, FI, MMO);
2995       return;
2996     }
2997     break;
2998   case 24:
2999     if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
3000       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3001       Opc = AArch64::LD1Threev1d;
3002       Offset = false;
3003     }
3004     break;
3005   case 32:
3006     if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
3007       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3008       Opc = AArch64::LD1Fourv1d;
3009       Offset = false;
3010     } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
3011       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3012       Opc = AArch64::LD1Twov2d;
3013       Offset = false;
3014     }
3015     break;
3016   case 48:
3017     if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
3018       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3019       Opc = AArch64::LD1Threev2d;
3020       Offset = false;
3021     }
3022     break;
3023   case 64:
3024     if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
3025       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3026       Opc = AArch64::LD1Fourv2d;
3027       Offset = false;
3028     }
3029     break;
3030   }
3031   assert(Opc && "Unknown register class");
3032 
3033   const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
3034                                      .addReg(DestReg, getDefRegState(true))
3035                                      .addFrameIndex(FI);
3036   if (Offset)
3037     MI.addImm(0);
3038   MI.addMemOperand(MMO);
3039 }
3040 
3041 // Helper function to emit a frame offset adjustment from a given
3042 // pointer (SrcReg), writing the result into DestReg. This function is
3043 // explicit in that it requires the opcode.
3044 static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
3045                                MachineBasicBlock::iterator MBBI,
3046                                const DebugLoc &DL, unsigned DestReg,
3047                                unsigned SrcReg, int64_t Offset, unsigned Opc,
3048                                const TargetInstrInfo *TII,
3049                                MachineInstr::MIFlag Flag, bool NeedsWinCFI,
3050                                bool *HasWinCFI) {
3051   int Sign = 1;
3052   unsigned MaxEncoding, ShiftSize;
3053   switch (Opc) {
3054   case AArch64::ADDXri:
3055   case AArch64::ADDSXri:
3056   case AArch64::SUBXri:
3057   case AArch64::SUBSXri:
3058     MaxEncoding = 0xfff;
3059     ShiftSize = 12;
3060     break;
3061   case AArch64::ADDVL_XXI:
3062   case AArch64::ADDPL_XXI:
3063     MaxEncoding = 31;
3064     ShiftSize = 0;
3065     if (Offset < 0) {
3066       MaxEncoding = 32;
3067       Sign = -1;
3068       Offset = -Offset;
3069     }
3070     break;
3071   default:
3072     llvm_unreachable("Unsupported opcode");
3073   }
3074 
3075   // FIXME: If the offset won't fit in 24-bits, compute the offset into a
3076   // scratch register.  If DestReg is a virtual register, use it as the
3077   // scratch register; otherwise, create a new virtual register (to be
3078   // replaced by the scavenger at the end of PEI).  That case can be optimized
3079   // slightly if DestReg is SP which is always 16-byte aligned, so the scratch
3080   // register can be loaded with offset%8 and the add/sub can use an extending
3081   // instruction with LSL#3.
3082   // Currently the function handles any offsets but generates a poor sequence
3083   // of code.
3084   //  assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
3085 
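       // Emit the adjustment as a chain of add/sub instructions, each consuming up
       // to MaxEncodableValue of the remaining offset; chunks larger than the
       // unshifted immediate range use the shifted (LSL #ShiftSize) form, and later
       // chunks build on DestReg.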
3086   const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
3087   do {
3088     unsigned ThisVal = std::min<unsigned>(Offset, MaxEncodableValue);
3089     unsigned LocalShiftSize = 0;
3090     if (ThisVal > MaxEncoding) {
3091       ThisVal = ThisVal >> ShiftSize;
3092       LocalShiftSize = ShiftSize;
3093     }
3094     assert((ThisVal >> ShiftSize) <= MaxEncoding &&
3095            "Encoding cannot handle value that big");
3096     auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg)
3097                    .addReg(SrcReg)
3098                    .addImm(Sign * (int)ThisVal);
3099     if (ShiftSize)
3100       MBI = MBI.addImm(
3101           AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize));
3102     MBI = MBI.setMIFlag(Flag);
3103 
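         // For Windows CFI, emit the matching SEH unwind directive for the add/sub
         // just built: SetFP/AddFP when moving between FP and SP, StackAlloc for
         // plain SP adjustments.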
3104     if (NeedsWinCFI) {
3105       assert(Sign == 1 && "SEH directives should always have a positive sign");
3106       int Imm = (int)(ThisVal << LocalShiftSize);
3107       if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
3108           (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
3109         if (HasWinCFI)
3110           *HasWinCFI = true;
3111         if (Imm == 0)
3112           BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag);
3113         else
3114           BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP))
3115               .addImm(Imm)
3116               .setMIFlag(Flag);
3117         assert((Offset - Imm) == 0 && "Expected remaining offset to be zero to "
3118                                       "emit a single SEH directive");
3119       } else if (DestReg == AArch64::SP) {
3120         if (HasWinCFI)
3121           *HasWinCFI = true;
3122         assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc");
3123         BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
3124             .addImm(Imm)
3125             .setMIFlag(Flag);
3126       }
3127       if (HasWinCFI)
3128         *HasWinCFI = true;
3129     }
3130 
3131     SrcReg = DestReg;
3132     Offset -= ThisVal << LocalShiftSize;
3133   } while (Offset);
3134 }
3135 
3136 void llvm::emitFrameOffset(MachineBasicBlock &MBB,
3137                            MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
3138                            unsigned DestReg, unsigned SrcReg,
3139                            StackOffset Offset, const TargetInstrInfo *TII,
3140                            MachineInstr::MIFlag Flag, bool SetNZCV,
3141                            bool NeedsWinCFI, bool *HasWinCFI) {
3142   int64_t Bytes, NumPredicateVectors, NumDataVectors;
3143   Offset.getForFrameOffset(Bytes, NumPredicateVectors, NumDataVectors);
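       // The offset is decomposed into a fixed byte part plus scalable parts
       // expressed in whole SVE data vectors (VL) and predicate vectors (PL);
       // each part is emitted separately below.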
3144 
3145   // First emit non-scalable frame offsets, or a simple 'mov'.
3146   if (Bytes || (!Offset && SrcReg != DestReg)) {
3147     assert((DestReg != AArch64::SP || Bytes % 16 == 0) &&
3148            "SP increment/decrement not 16-byte aligned");
3149     unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri;
3150     if (Bytes < 0) {
3151       Bytes = -Bytes;
3152       Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri;
3153     }
3154     emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag,
3155                        NeedsWinCFI, HasWinCFI);
3156     SrcReg = DestReg;
3157   }
3158 
3159   assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) &&
3160          "SetNZCV not supported with SVE vectors");
3161   assert(!(NeedsWinCFI && (NumPredicateVectors || NumDataVectors)) &&
3162          "WinCFI not supported with SVE vectors");
3163 
3164   if (NumDataVectors) {
3165     emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors,
3166                        AArch64::ADDVL_XXI, TII, Flag, NeedsWinCFI, nullptr);
3167     SrcReg = DestReg;
3168   }
3169 
3170   if (NumPredicateVectors) {
3171     assert(DestReg != AArch64::SP && "Unaligned access to SP");
3172     emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors,
3173                        AArch64::ADDPL_XXI, TII, Flag, NeedsWinCFI, nullptr);
3174   }
3175 }
3176 
3177 MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
3178     MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
3179     MachineBasicBlock::iterator InsertPt, int FrameIndex,
3180     LiveIntervals *LIS, VirtRegMap *VRM) const {
3181   // This is a bit of a hack. Consider this instruction:
3182   //
3183   //   %0 = COPY %sp; GPR64all:%0
3184   //
3185   // We explicitly chose GPR64all for the virtual register so such a copy might
3186   // be eliminated by RegisterCoalescer. However, that may not be possible, and
3187   // %0 may even spill. We can't spill %sp, and since it is in the GPR64all
3188   // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
3189   //
3190   // To prevent that, we are going to constrain the %0 register class here.
3191   //
3192   // <rdar://problem/11522048>
3193   //
3194   if (MI.isFullCopy()) {
3195     Register DstReg = MI.getOperand(0).getReg();
3196     Register SrcReg = MI.getOperand(1).getReg();
3197     if (SrcReg == AArch64::SP && Register::isVirtualRegister(DstReg)) {
3198       MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
3199       return nullptr;
3200     }
3201     if (DstReg == AArch64::SP && Register::isVirtualRegister(SrcReg)) {
3202       MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
3203       return nullptr;
3204     }
3205   }
3206 
3207   // Handle the case where a copy is being spilled or filled but the source
3208   // and destination register class don't match.  For example:
3209   //
3210   //   %0 = COPY %xzr; GPR64common:%0
3211   //
3212   // In this case we can still safely fold away the COPY and generate the
3213   // following spill code:
3214   //
3215   //   STRXui %xzr, %stack.0
3216   //
3217   // This also eliminates spilled cross register class COPYs (e.g. between x and
3218   // d regs) of the same size.  For example:
3219   //
3220   //   %0 = COPY %1; GPR64:%0, FPR64:%1
3221   //
3222   // will be filled as
3223   //
3224   //   LDRDui %0, fi<#0>
3225   //
3226   // instead of
3227   //
3228   //   LDRXui %Temp, fi<#0>
3229   //   %0 = FMOV %Temp
3230   //
3231   if (MI.isCopy() && Ops.size() == 1 &&
3232       // Make sure we're only folding the explicit COPY defs/uses.
3233       (Ops[0] == 0 || Ops[0] == 1)) {
3234     bool IsSpill = Ops[0] == 0;
3235     bool IsFill = !IsSpill;
3236     const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
3237     const MachineRegisterInfo &MRI = MF.getRegInfo();
3238     MachineBasicBlock &MBB = *MI.getParent();
3239     const MachineOperand &DstMO = MI.getOperand(0);
3240     const MachineOperand &SrcMO = MI.getOperand(1);
3241     Register DstReg = DstMO.getReg();
3242     Register SrcReg = SrcMO.getReg();
3243     // This is slightly expensive to compute for physical regs since
3244     // getMinimalPhysRegClass is slow.
3245     auto getRegClass = [&](unsigned Reg) {
3246       return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg)
3247                                               : TRI.getMinimalPhysRegClass(Reg);
3248     };
3249 
3250     if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
3251       assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
3252                  TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
3253              "Mismatched register size in non subreg COPY");
3254       if (IsSpill)
3255         storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
3256                             getRegClass(SrcReg), &TRI);
3257       else
3258         loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex,
3259                              getRegClass(DstReg), &TRI);
3260       return &*--InsertPt;
3261     }
3262 
3263     // Handle cases like spilling def of:
3264     //
3265     //   %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0
3266     //
3267     // where the physical register source can be widened and stored to the full
3268     // virtual reg destination stack slot, in this case producing:
3269     //
3270     //   STRXui %xzr, %stack.0
3271     //
3272     if (IsSpill && DstMO.isUndef() && Register::isPhysicalRegister(SrcReg)) {
3273       assert(SrcMO.getSubReg() == 0 &&
3274              "Unexpected subreg on physical register");
3275       const TargetRegisterClass *SpillRC;
3276       unsigned SpillSubreg;
3277       switch (DstMO.getSubReg()) {
3278       default:
3279         SpillRC = nullptr;
3280         break;
3281       case AArch64::sub_32:
3282       case AArch64::ssub:
3283         if (AArch64::GPR32RegClass.contains(SrcReg)) {
3284           SpillRC = &AArch64::GPR64RegClass;
3285           SpillSubreg = AArch64::sub_32;
3286         } else if (AArch64::FPR32RegClass.contains(SrcReg)) {
3287           SpillRC = &AArch64::FPR64RegClass;
3288           SpillSubreg = AArch64::ssub;
3289         } else
3290           SpillRC = nullptr;
3291         break;
3292       case AArch64::dsub:
3293         if (AArch64::FPR64RegClass.contains(SrcReg)) {
3294           SpillRC = &AArch64::FPR128RegClass;
3295           SpillSubreg = AArch64::dsub;
3296         } else
3297           SpillRC = nullptr;
3298         break;
3299       }
3300 
3301       if (SpillRC)
3302         if (unsigned WidenedSrcReg =
3303                 TRI.getMatchingSuperReg(SrcReg, SpillSubreg, SpillRC)) {
3304           storeRegToStackSlot(MBB, InsertPt, WidenedSrcReg, SrcMO.isKill(),
3305                               FrameIndex, SpillRC, &TRI);
3306           return &*--InsertPt;
3307         }
3308     }
3309 
3310     // Handle cases like filling use of:
3311     //
3312     //   %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1
3313     //
3314     // where we can load the full virtual reg source stack slot into the subreg
3315     // destination, in this case producing:
3316     //
3317     //   LDRWui %0:sub_32<def,read-undef>, %stack.0
3318     //
3319     if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
3320       const TargetRegisterClass *FillRC;
3321       switch (DstMO.getSubReg()) {
3322       default:
3323         FillRC = nullptr;
3324         break;
3325       case AArch64::sub_32:
3326         FillRC = &AArch64::GPR32RegClass;
3327         break;
3328       case AArch64::ssub:
3329         FillRC = &AArch64::FPR32RegClass;
3330         break;
3331       case AArch64::dsub:
3332         FillRC = &AArch64::FPR64RegClass;
3333         break;
3334       }
3335 
3336       if (FillRC) {
3337         assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
3338                    TRI.getRegSizeInBits(*FillRC) &&
3339                "Mismatched regclass size on folded subreg COPY");
3340         loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI);
3341         MachineInstr &LoadMI = *--InsertPt;
3342         MachineOperand &LoadDst = LoadMI.getOperand(0);
3343         assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load");
3344         LoadDst.setSubReg(DstMO.getSubReg());
3345         LoadDst.setIsUndef();
3346         return &LoadMI;
3347       }
3348     }
3349   }
3350 
3351   // Cannot fold.
3352   return nullptr;
3353 }
3354 
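     // Returns true if the opcode is an SVE register fill/spill whose immediate
     // offset is scaled by the vector (or predicate) length rather than in bytes.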
3355 static bool isSVEScaledImmInstruction(unsigned Opcode) {
3356   switch (Opcode) {
3357   case AArch64::LDR_ZXI:
3358   case AArch64::STR_ZXI:
3359   case AArch64::LDR_PXI:
3360   case AArch64::STR_PXI:
3361     return true;
3362   default:
3363     return false;
3364   }
3365 }
3366 
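     // Determine how much of the given stack offset can be folded into MI's
     // immediate operand. On return, SOffset holds the residual offset that could
     // not be encoded, *EmittableOffset the encodable (scaled) immediate, and
     // *OutUseUnscaledOp/*OutUnscaledOp whether and which unscaled variant to use.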
3367 int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
3368                                     StackOffset &SOffset,
3369                                     bool *OutUseUnscaledOp,
3370                                     unsigned *OutUnscaledOp,
3371                                     int64_t *EmittableOffset) {
3372   // Set output values in case of early exit.
3373   if (EmittableOffset)
3374     *EmittableOffset = 0;
3375   if (OutUseUnscaledOp)
3376     *OutUseUnscaledOp = false;
3377   if (OutUnscaledOp)
3378     *OutUnscaledOp = 0;
3379 
3380   // Exit early for structured vector spills/fills as they can't take an
3381   // immediate offset.
3382   switch (MI.getOpcode()) {
3383   default:
3384     break;
3385   case AArch64::LD1Twov2d:
3386   case AArch64::LD1Threev2d:
3387   case AArch64::LD1Fourv2d:
3388   case AArch64::LD1Twov1d:
3389   case AArch64::LD1Threev1d:
3390   case AArch64::LD1Fourv1d:
3391   case AArch64::ST1Twov2d:
3392   case AArch64::ST1Threev2d:
3393   case AArch64::ST1Fourv2d:
3394   case AArch64::ST1Twov1d:
3395   case AArch64::ST1Threev1d:
3396   case AArch64::ST1Fourv1d:
3397   case AArch64::IRG:
3398   case AArch64::IRGstack:
3399     return AArch64FrameOffsetCannotUpdate;
3400   }
3401 
3402   // Get the min/max offset and the scale.
3403   unsigned Scale, Width;
3404   int64_t MinOff, MaxOff;
3405   if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), Scale, Width, MinOff,
3406                                       MaxOff))
3407     llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
3408 
3409   // Construct the complete offset.
3410   bool IsMulVL = isSVEScaledImmInstruction(MI.getOpcode());
3411   int64_t Offset =
3412       IsMulVL ? (SOffset.getScalableBytes()) : (SOffset.getBytes());
3413 
3414   const MachineOperand &ImmOpnd =
3415       MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode()));
3416   Offset += ImmOpnd.getImm() * Scale;
3417 
3418   // If the offset doesn't match the scale, or if the offset is negative and
3419   // an unscaled variant of the instruction exists, rewrite the instruction
3420   // to use the unscaled instruction instead.
3421   Optional<unsigned> UnscaledOp =
3422       AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode());
3423   bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0);
3424   if (useUnscaledOp &&
3425       !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, Scale, Width, MinOff, MaxOff))
3426     llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
3427 
3428   int64_t Remainder = Offset % Scale;
3429   assert(!(Remainder && useUnscaledOp) &&
3430          "Cannot have remainder when using unscaled op");
3431 
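       // Clamp the scaled offset into the instruction's encodable range; whatever
       // cannot be encoded is left in Offset as a residual for the caller to
       // materialize separately.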
3432   assert(MinOff < MaxOff && "Unexpected Min/Max offsets");
3433   int64_t NewOffset = Offset / Scale;
3434   if (MinOff <= NewOffset && NewOffset <= MaxOff)
3435     Offset = Remainder;
3436   else {
3437     NewOffset = NewOffset < 0 ? MinOff : MaxOff;
3438     Offset = Offset - NewOffset * Scale + Remainder;
3439   }
3440 
3441   if (EmittableOffset)
3442     *EmittableOffset = NewOffset;
3443   if (OutUseUnscaledOp)
3444     *OutUseUnscaledOp = useUnscaledOp;
3445   if (OutUnscaledOp && UnscaledOp)
3446     *OutUnscaledOp = *UnscaledOp;
3447 
3448   if (IsMulVL)
3449     SOffset = StackOffset(Offset, MVT::nxv1i8) +
3450               StackOffset(SOffset.getBytes(), MVT::i8);
3451   else
3452     SOffset = StackOffset(Offset, MVT::i8) +
3453               StackOffset(SOffset.getScalableBytes(), MVT::nxv1i8);
3454   return AArch64FrameOffsetCanUpdate |
3455          (SOffset ? 0 : AArch64FrameOffsetIsLegal);
3456 }
3457 
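     // Rewrite the frame-index operand at FrameRegIdx to use FrameReg plus as much
     // of Offset as the instruction can encode. Returns true when the offset was
     // folded completely; otherwise any unencodable remainder is left in Offset.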
3458 bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
3459                                     unsigned FrameReg, StackOffset &Offset,
3460                                     const AArch64InstrInfo *TII) {
3461   unsigned Opcode = MI.getOpcode();
3462   unsigned ImmIdx = FrameRegIdx + 1;
3463 
3464   if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
3465     Offset += StackOffset(MI.getOperand(ImmIdx).getImm(), MVT::i8);
3466     emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
3467                     MI.getOperand(0).getReg(), FrameReg, Offset, TII,
3468                     MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
3469     MI.eraseFromParent();
3470     Offset = StackOffset();
3471     return true;
3472   }
3473 
3474   int64_t NewOffset;
3475   unsigned UnscaledOp;
3476   bool UseUnscaledOp;
3477   int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
3478                                          &UnscaledOp, &NewOffset);
3479   if (Status & AArch64FrameOffsetCanUpdate) {
3480     if (Status & AArch64FrameOffsetIsLegal)
3481       // Replace the FrameIndex with FrameReg.
3482       MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
3483     if (UseUnscaledOp)
3484       MI.setDesc(TII->get(UnscaledOp));
3485 
3486     MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
3487     return !Offset;
3488   }
3489 
3490   return false;
3491 }
3492 
3493 void AArch64InstrInfo::getNoop(MCInst &NopInst) const {
3494   NopInst.setOpcode(AArch64::HINT);
3495   NopInst.addOperand(MCOperand::createImm(0));
3496 }
3497 
3498 // AArch64 supports MachineCombiner.
3499 bool AArch64InstrInfo::useMachineCombiner() const { return true; }
3500 
3501 // True when Opc sets the NZCV flags
3502 static bool isCombineInstrSettingFlag(unsigned Opc) {
3503   switch (Opc) {
3504   case AArch64::ADDSWrr:
3505   case AArch64::ADDSWri:
3506   case AArch64::ADDSXrr:
3507   case AArch64::ADDSXri:
3508   case AArch64::SUBSWrr:
3509   case AArch64::SUBSXrr:
3510   // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
3511   case AArch64::SUBSWri:
3512   case AArch64::SUBSXri:
3513     return true;
3514   default:
3515     break;
3516   }
3517   return false;
3518 }
3519 
3520 // 32b Opcodes that can be combined with a MUL
3521 static bool isCombineInstrCandidate32(unsigned Opc) {
3522   switch (Opc) {
3523   case AArch64::ADDWrr:
3524   case AArch64::ADDWri:
3525   case AArch64::SUBWrr:
3526   case AArch64::ADDSWrr:
3527   case AArch64::ADDSWri:
3528   case AArch64::SUBSWrr:
3529   // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
3530   case AArch64::SUBWri:
3531   case AArch64::SUBSWri:
3532     return true;
3533   default:
3534     break;
3535   }
3536   return false;
3537 }
3538 
3539 // 64b Opcodes that can be combined with a MUL
3540 static bool isCombineInstrCandidate64(unsigned Opc) {
3541   switch (Opc) {
3542   case AArch64::ADDXrr:
3543   case AArch64::ADDXri:
3544   case AArch64::SUBXrr:
3545   case AArch64::ADDSXrr:
3546   case AArch64::ADDSXri:
3547   case AArch64::SUBSXrr:
3548   // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
3549   case AArch64::SUBXri:
3550   case AArch64::SUBSXri:
3551     return true;
3552   default:
3553     break;
3554   }
3555   return false;
3556 }
3557 
3558 // FP Opcodes that can be combined with a FMUL
3559 static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
3560   switch (Inst.getOpcode()) {
3561   default:
3562     break;
3563   case AArch64::FADDHrr:
3564   case AArch64::FADDSrr:
3565   case AArch64::FADDDrr:
3566   case AArch64::FADDv4f16:
3567   case AArch64::FADDv8f16:
3568   case AArch64::FADDv2f32:
3569   case AArch64::FADDv2f64:
3570   case AArch64::FADDv4f32:
3571   case AArch64::FSUBHrr:
3572   case AArch64::FSUBSrr:
3573   case AArch64::FSUBDrr:
3574   case AArch64::FSUBv4f16:
3575   case AArch64::FSUBv8f16:
3576   case AArch64::FSUBv2f32:
3577   case AArch64::FSUBv2f64:
3578   case AArch64::FSUBv4f32:
3579     TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
3580     return (Options.UnsafeFPMath ||
3581             Options.AllowFPOpFusion == FPOpFusion::Fast);
3582   }
3583   return false;
3584 }
3585 
3586 // Opcodes that can be combined with a MUL
3587 static bool isCombineInstrCandidate(unsigned Opc) {
3588   return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
3589 }
3590 
3591 //
3592 // Utility routine that checks if \param MO is defined by an
3593 // \param CombineOpc instruction in the basic block \param MBB
3594 static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
3595                        unsigned CombineOpc, unsigned ZeroReg = 0,
3596                        bool CheckZeroReg = false) {
3597   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3598   MachineInstr *MI = nullptr;
3599 
3600   if (MO.isReg() && Register::isVirtualRegister(MO.getReg()))
3601     MI = MRI.getUniqueVRegDef(MO.getReg());
3602   // And it needs to be in the trace (otherwise, it won't have a depth).
3603   if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
3604     return false;
3605   // Must only be used by the user we combine with.
3606   if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
3607     return false;
3608 
3609   if (CheckZeroReg) {
3610     assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
3611            MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
3612            MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs");
3613     // The third input reg must be zero.
3614     if (MI->getOperand(3).getReg() != ZeroReg)
3615       return false;
3616   }
3617 
3618   return true;
3619 }
3620 
3621 //
3622 // Is \param MO defined by an integer multiply that can be combined?
3623 static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
3624                               unsigned MulOpc, unsigned ZeroReg) {
3625   return canCombine(MBB, MO, MulOpc, ZeroReg, true);
3626 }
3627 
3628 //
3629 // Is \param MO defined by a floating-point multiply that can be combined?
3630 static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
3631                                unsigned MulOpc) {
3632   return canCombine(MBB, MO, MulOpc);
3633 }
3634 
3635 // TODO: There are many more machine instruction opcodes to match:
3636 //       1. Other data types (integer, vectors)
3637 //       2. Other math / logic operations (xor, or)
3638 //       3. Other forms of the same operation (intrinsics and other variants)
3639 bool AArch64InstrInfo::isAssociativeAndCommutative(
3640     const MachineInstr &Inst) const {
3641   switch (Inst.getOpcode()) {
3642   case AArch64::FADDDrr:
3643   case AArch64::FADDSrr:
3644   case AArch64::FADDv2f32:
3645   case AArch64::FADDv2f64:
3646   case AArch64::FADDv4f32:
3647   case AArch64::FMULDrr:
3648   case AArch64::FMULSrr:
3649   case AArch64::FMULX32:
3650   case AArch64::FMULX64:
3651   case AArch64::FMULXv2f32:
3652   case AArch64::FMULXv2f64:
3653   case AArch64::FMULXv4f32:
3654   case AArch64::FMULv2f32:
3655   case AArch64::FMULv2f64:
3656   case AArch64::FMULv4f32:
3657     return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath;
3658   default:
3659     return false;
3660   }
3661 }
3662 
3663 /// Find instructions that can be turned into madd.
3664 static bool getMaddPatterns(MachineInstr &Root,
3665                             SmallVectorImpl<MachineCombinerPattern> &Patterns) {
3666   unsigned Opc = Root.getOpcode();
3667   MachineBasicBlock &MBB = *Root.getParent();
3668   bool Found = false;
3669 
3670   if (!isCombineInstrCandidate(Opc))
3671     return false;
3672   if (isCombineInstrSettingFlag(Opc)) {
3673     int Cmp_NZCV = Root.findRegisterDefOperandIdx(AArch64::NZCV, true);
3674     // When NZCV is live, bail out.
3675     if (Cmp_NZCV == -1)
3676       return false;
3677     unsigned NewOpc = convertToNonFlagSettingOpc(Root);
3678     // When opcode can't change bail out.
3679     // When the opcode can't be changed, bail out.
3680     if (NewOpc == Opc)
3681       return false;
3682     Opc = NewOpc;
3683   }
3684 
3685   auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg,
3686                       MachineCombinerPattern Pattern) {
3687     if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) {
3688       Patterns.push_back(Pattern);
3689       Found = true;
3690     }
3691   };
3692 
3693   typedef MachineCombinerPattern MCP;
3694 
3695   switch (Opc) {
3696   default:
3697     break;
3698   case AArch64::ADDWrr:
3699     assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
3700            "ADDWrr does not have register operands");
3701     setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1);
3702     setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2);
3703     break;
3704   case AArch64::ADDXrr:
3705     setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1);
3706     setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2);
3707     break;
3708   case AArch64::SUBWrr:
3709     setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1);
3710     setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2);
3711     break;
3712   case AArch64::SUBXrr:
3713     setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1);
3714     setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2);
3715     break;
3716   case AArch64::ADDWri:
3717     setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1);
3718     break;
3719   case AArch64::ADDXri:
3720     setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1);
3721     break;
3722   case AArch64::SUBWri:
3723     setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1);
3724     break;
3725   case AArch64::SUBXri:
3726     setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1);
3727     break;
3728   }
3729   return Found;
3730 }
3731 /// Floating-Point Support
3732 
3733 /// Find instructions that can be turned into madd.
3734 static bool getFMAPatterns(MachineInstr &Root,
3735                            SmallVectorImpl<MachineCombinerPattern> &Patterns) {
3736 
3737   if (!isCombineInstrCandidateFP(Root))
3738     return false;
3739 
3740   MachineBasicBlock &MBB = *Root.getParent();
3741   bool Found = false;
3742 
3743   auto Match = [&](int Opcode, int Operand,
3744                    MachineCombinerPattern Pattern) -> bool {
3745     if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) {
3746       Patterns.push_back(Pattern);
3747       return true;
3748     }
3749     return false;
3750   };
3751 
3752   typedef MachineCombinerPattern MCP;
3753 
3754   switch (Root.getOpcode()) {
3755   default:
3756     assert(false && "Unsupported FP instruction in combiner\n");
3757     break;
3758   case AArch64::FADDHrr:
3759     assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
3760            "FADDHrr does not have register operands");
3761 
3762     Found  = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1);
3763     Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2);
3764     break;
3765   case AArch64::FADDSrr:
3766     assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
3767            "FADDSrr does not have register operands");
3768 
3769     Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) ||
3770              Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1);
3771 
3772     Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) ||
3773              Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2);
3774     break;
3775   case AArch64::FADDDrr:
3776     Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) ||
3777              Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1);
3778 
3779     Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) ||
3780              Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2);
3781     break;
3782   case AArch64::FADDv4f16:
3783     Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) ||
3784              Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1);
3785 
3786     Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) ||
3787              Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2);
3788     break;
3789   case AArch64::FADDv8f16:
3790     Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) ||
3791              Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1);
3792 
3793     Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) ||
3794              Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2);
3795     break;
3796   case AArch64::FADDv2f32:
3797     Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) ||
3798              Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1);
3799 
3800     Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) ||
3801              Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2);
3802     break;
3803   case AArch64::FADDv2f64:
3804     Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) ||
3805              Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1);
3806 
3807     Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) ||
3808              Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2);
3809     break;
3810   case AArch64::FADDv4f32:
3811     Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) ||
3812              Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1);
3813 
3814     Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) ||
3815              Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2);
3816     break;
3817   case AArch64::FSUBHrr:
3818     Found  = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1);
3819     Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2);
3820     Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1);
3821     break;
3822   case AArch64::FSUBSrr:
3823     Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1);
3824 
3825     Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) ||
3826              Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2);
3827 
3828     Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1);
3829     break;
3830   case AArch64::FSUBDrr:
3831     Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1);
3832 
3833     Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) ||
3834              Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2);
3835 
3836     Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1);
3837     break;
3838   case AArch64::FSUBv4f16:
3839     Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) ||
3840              Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2);
3841 
3842     Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) ||
3843              Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1);
3844     break;
3845   case AArch64::FSUBv8f16:
3846     Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) ||
3847              Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2);
3848 
3849     Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) ||
3850              Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1);
3851     break;
3852   case AArch64::FSUBv2f32:
3853     Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) ||
3854              Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2);
3855 
3856     Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) ||
3857              Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1);
3858     break;
3859   case AArch64::FSUBv2f64:
3860     Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) ||
3861              Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2);
3862 
3863     Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) ||
3864              Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1);
3865     break;
3866   case AArch64::FSUBv4f32:
3867     Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) ||
3868              Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2);
3869 
3870     Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) ||
3871              Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1);
3872     break;
3873   }
3874   return Found;
3875 }
3876 
3877 /// Return true when a code sequence can improve throughput. It
3878 /// should be called only for instructions in loops.
3879 /// \param Pattern - combiner pattern
3880 bool AArch64InstrInfo::isThroughputPattern(
3881     MachineCombinerPattern Pattern) const {
3882   switch (Pattern) {
3883   default:
3884     break;
3885   case MachineCombinerPattern::FMULADDH_OP1:
3886   case MachineCombinerPattern::FMULADDH_OP2:
3887   case MachineCombinerPattern::FMULSUBH_OP1:
3888   case MachineCombinerPattern::FMULSUBH_OP2:
3889   case MachineCombinerPattern::FMULADDS_OP1:
3890   case MachineCombinerPattern::FMULADDS_OP2:
3891   case MachineCombinerPattern::FMULSUBS_OP1:
3892   case MachineCombinerPattern::FMULSUBS_OP2:
3893   case MachineCombinerPattern::FMULADDD_OP1:
3894   case MachineCombinerPattern::FMULADDD_OP2:
3895   case MachineCombinerPattern::FMULSUBD_OP1:
3896   case MachineCombinerPattern::FMULSUBD_OP2:
3897   case MachineCombinerPattern::FNMULSUBH_OP1:
3898   case MachineCombinerPattern::FNMULSUBS_OP1:
3899   case MachineCombinerPattern::FNMULSUBD_OP1:
3900   case MachineCombinerPattern::FMLAv4i16_indexed_OP1:
3901   case MachineCombinerPattern::FMLAv4i16_indexed_OP2:
3902   case MachineCombinerPattern::FMLAv8i16_indexed_OP1:
3903   case MachineCombinerPattern::FMLAv8i16_indexed_OP2:
3904   case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
3905   case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
3906   case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
3907   case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
3908   case MachineCombinerPattern::FMLAv4f16_OP2:
3909   case MachineCombinerPattern::FMLAv4f16_OP1:
3910   case MachineCombinerPattern::FMLAv8f16_OP1:
3911   case MachineCombinerPattern::FMLAv8f16_OP2:
3912   case MachineCombinerPattern::FMLAv2f32_OP2:
3913   case MachineCombinerPattern::FMLAv2f32_OP1:
3914   case MachineCombinerPattern::FMLAv2f64_OP1:
3915   case MachineCombinerPattern::FMLAv2f64_OP2:
3916   case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
3917   case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
3918   case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
3919   case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
3920   case MachineCombinerPattern::FMLAv4f32_OP1:
3921   case MachineCombinerPattern::FMLAv4f32_OP2:
3922   case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
3923   case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
3924   case MachineCombinerPattern::FMLSv4i16_indexed_OP1:
3925   case MachineCombinerPattern::FMLSv4i16_indexed_OP2:
3926   case MachineCombinerPattern::FMLSv8i16_indexed_OP1:
3927   case MachineCombinerPattern::FMLSv8i16_indexed_OP2:
3928   case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
3929   case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
3930   case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
3931   case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
3932   case MachineCombinerPattern::FMLSv4f16_OP1:
3933   case MachineCombinerPattern::FMLSv4f16_OP2:
3934   case MachineCombinerPattern::FMLSv8f16_OP1:
3935   case MachineCombinerPattern::FMLSv8f16_OP2:
3936   case MachineCombinerPattern::FMLSv2f32_OP2:
3937   case MachineCombinerPattern::FMLSv2f64_OP2:
3938   case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
3939   case MachineCombinerPattern::FMLSv4f32_OP2:
3940     return true;
3941   } // end switch (Pattern)
3942   return false;
3943 }
3944 /// Return true when there is potentially a faster code sequence for an
3945 /// instruction chain ending in \p Root. All potential patterns are listed in
3946 /// the \p Pattern vector. Pattern should be sorted in priority order since the
3947 /// pattern evaluator stops checking as soon as it finds a faster sequence.
3948 
3949 bool AArch64InstrInfo::getMachineCombinerPatterns(
3950     MachineInstr &Root,
3951     SmallVectorImpl<MachineCombinerPattern> &Patterns) const {
3952   // Integer patterns
3953   if (getMaddPatterns(Root, Patterns))
3954     return true;
3955   // Floating point patterns
3956   if (getFMAPatterns(Root, Patterns))
3957     return true;
3958 
3959   return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns);
3960 }
3961 
3962 enum class FMAInstKind { Default, Indexed, Accumulator };
3963 /// genFusedMultiply - Generate fused multiply instructions.
3964 /// This function supports both integer and floating point instructions.
3965 /// A typical example:
3966 ///  F|MUL I=A,B,0
3967 ///  F|ADD R,I,C
3968 ///  ==> F|MADD R,A,B,C
3969 /// \param MF Containing MachineFunction
3970 /// \param MRI Register information
3971 /// \param TII Target information
3972 /// \param Root is the F|ADD instruction
3973 /// \param [out] InsInstrs is a vector of machine instructions and will
3974 /// contain the generated madd instruction
3975 /// \param IdxMulOpd is the index of the operand in Root that is the result of
3976 /// the F|MUL. In the example above IdxMulOpd is 1.
3977 /// \param MaddOpc the opcode of the f|madd instruction
3978 /// \param RC Register class of operands
3979 /// \param kind Kind of FMA instruction (addressing mode) to be generated
3980 /// \param ReplacedAddend is the result register from the instruction
3981 /// replacing the non-combined operand, if any.
3982 static MachineInstr *
3983 genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
3984                  const TargetInstrInfo *TII, MachineInstr &Root,
3985                  SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
3986                  unsigned MaddOpc, const TargetRegisterClass *RC,
3987                  FMAInstKind kind = FMAInstKind::Default,
3988                  const Register *ReplacedAddend = nullptr) {
3989   assert(IdxMulOpd == 1 || IdxMulOpd == 2);
3990 
3991   unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
3992   MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
3993   Register ResultReg = Root.getOperand(0).getReg();
3994   Register SrcReg0 = MUL->getOperand(1).getReg();
3995   bool Src0IsKill = MUL->getOperand(1).isKill();
3996   Register SrcReg1 = MUL->getOperand(2).getReg();
3997   bool Src1IsKill = MUL->getOperand(2).isKill();
3998 
3999   unsigned SrcReg2;
4000   bool Src2IsKill;
4001   if (ReplacedAddend) {
4002     // If we just generated a new addend, this must be its only use.
4003     SrcReg2 = *ReplacedAddend;
4004     Src2IsKill = true;
4005   } else {
4006     SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
4007     Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
4008   }
4009 
4010   if (Register::isVirtualRegister(ResultReg))
4011     MRI.constrainRegClass(ResultReg, RC);
4012   if (Register::isVirtualRegister(SrcReg0))
4013     MRI.constrainRegClass(SrcReg0, RC);
4014   if (Register::isVirtualRegister(SrcReg1))
4015     MRI.constrainRegClass(SrcReg1, RC);
4016   if (Register::isVirtualRegister(SrcReg2))
4017     MRI.constrainRegClass(SrcReg2, RC);
4018 
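       // Operand order differs per kind: Default follows MADD/FMADD
       // (mul-src0, mul-src1, addend); Indexed and Accumulator take the addend
       // (accumulator) first, with Indexed also copying the lane immediate from
       // the original F|MUL.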
4019   MachineInstrBuilder MIB;
4020   if (kind == FMAInstKind::Default)
4021     MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
4022               .addReg(SrcReg0, getKillRegState(Src0IsKill))
4023               .addReg(SrcReg1, getKillRegState(Src1IsKill))
4024               .addReg(SrcReg2, getKillRegState(Src2IsKill));
4025   else if (kind == FMAInstKind::Indexed)
4026     MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
4027               .addReg(SrcReg2, getKillRegState(Src2IsKill))
4028               .addReg(SrcReg0, getKillRegState(Src0IsKill))
4029               .addReg(SrcReg1, getKillRegState(Src1IsKill))
4030               .addImm(MUL->getOperand(3).getImm());
4031   else if (kind == FMAInstKind::Accumulator)
4032     MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
4033               .addReg(SrcReg2, getKillRegState(Src2IsKill))
4034               .addReg(SrcReg0, getKillRegState(Src0IsKill))
4035               .addReg(SrcReg1, getKillRegState(Src1IsKill));
4036   else
4037     assert(false && "Invalid FMA instruction kind \n");
4038   // Insert the MADD (MADD, FMA, FMS, FMLA, FMLS)
4039   InsInstrs.push_back(MIB);
4040   return MUL;
4041 }
4042 
4043 /// genMaddR - Generate madd instruction and combine mul and add using
4044 /// an extra virtual register.
4045 /// Example - an ADD immediate needs to be stored in a register:
4046 ///   MUL I=A,B,0
4047 ///   ADD R,I,Imm
4048 ///   ==> ORR  V, ZR, Imm
4049 ///   ==> MADD R,A,B,V
4050 /// \param MF Containing MachineFunction
4051 /// \param MRI Register information
4052 /// \param TII Target information
4053 /// \param Root is the ADD instruction
4054 /// \param [out] InsInstrs is a vector of machine instructions and will
4055 /// contain the generated madd instruction
4056 /// \param IdxMulOpd is the index of the operand in Root that is the result of
4057 /// the MUL. In the example above IdxMulOpd is 1.
4058 /// \param MaddOpc the opcode of the madd instruction
4059 /// \param VR is a virtual register that holds the value of an ADD operand
4060 /// (V in the example above).
4061 /// \param RC Register class of operands
4062 static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
4063                               const TargetInstrInfo *TII, MachineInstr &Root,
4064                               SmallVectorImpl<MachineInstr *> &InsInstrs,
4065                               unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR,
4066                               const TargetRegisterClass *RC) {
4067   assert(IdxMulOpd == 1 || IdxMulOpd == 2);
4068 
4069   MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
4070   Register ResultReg = Root.getOperand(0).getReg();
4071   Register SrcReg0 = MUL->getOperand(1).getReg();
4072   bool Src0IsKill = MUL->getOperand(1).isKill();
4073   Register SrcReg1 = MUL->getOperand(2).getReg();
4074   bool Src1IsKill = MUL->getOperand(2).isKill();
4075 
4076   if (Register::isVirtualRegister(ResultReg))
4077     MRI.constrainRegClass(ResultReg, RC);
4078   if (Register::isVirtualRegister(SrcReg0))
4079     MRI.constrainRegClass(SrcReg0, RC);
4080   if (Register::isVirtualRegister(SrcReg1))
4081     MRI.constrainRegClass(SrcReg1, RC);
4082   if (Register::isVirtualRegister(VR))
4083     MRI.constrainRegClass(VR, RC);
4084 
4085   MachineInstrBuilder MIB =
4086       BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
4087           .addReg(SrcReg0, getKillRegState(Src0IsKill))
4088           .addReg(SrcReg1, getKillRegState(Src1IsKill))
4089           .addReg(VR);
4090   // Insert the MADD
4091   InsInstrs.push_back(MIB);
4092   return MUL;
4093 }
4094 
4095 /// When getMachineCombinerPatterns() finds potential patterns,
4096 /// this function generates the instructions that could replace the
4097 /// original code sequence
4098 void AArch64InstrInfo::genAlternativeCodeSequence(
4099     MachineInstr &Root, MachineCombinerPattern Pattern,
4100     SmallVectorImpl<MachineInstr *> &InsInstrs,
4101     SmallVectorImpl<MachineInstr *> &DelInstrs,
4102     DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
4103   MachineBasicBlock &MBB = *Root.getParent();
4104   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4105   MachineFunction &MF = *MBB.getParent();
4106   const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
4107 
4108   MachineInstr *MUL;
4109   const TargetRegisterClass *RC;
4110   unsigned Opc;
4111   switch (Pattern) {
4112   default:
4113     // Reassociate instructions.
4114     TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
4115                                                 DelInstrs, InstrIdxForVirtReg);
4116     return;
4117   case MachineCombinerPattern::MULADDW_OP1:
4118   case MachineCombinerPattern::MULADDX_OP1:
4119     // MUL I=A,B,0
4120     // ADD R,I,C
4121     // ==> MADD R,A,B,C
4122     // --- Create(MADD);
4123     if (Pattern == MachineCombinerPattern::MULADDW_OP1) {
4124       Opc = AArch64::MADDWrrr;
4125       RC = &AArch64::GPR32RegClass;
4126     } else {
4127       Opc = AArch64::MADDXrrr;
4128       RC = &AArch64::GPR64RegClass;
4129     }
4130     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4131     break;
4132   case MachineCombinerPattern::MULADDW_OP2:
4133   case MachineCombinerPattern::MULADDX_OP2:
4134     // MUL I=A,B,0
4135     // ADD R,C,I
4136     // ==> MADD R,A,B,C
4137     // --- Create(MADD);
4138     if (Pattern == MachineCombinerPattern::MULADDW_OP2) {
4139       Opc = AArch64::MADDWrrr;
4140       RC = &AArch64::GPR32RegClass;
4141     } else {
4142       Opc = AArch64::MADDXrrr;
4143       RC = &AArch64::GPR64RegClass;
4144     }
4145     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4146     break;
4147   case MachineCombinerPattern::MULADDWI_OP1:
4148   case MachineCombinerPattern::MULADDXI_OP1: {
4149     // MUL I=A,B,0
4150     // ADD R,I,Imm
4151     // ==> ORR  V, ZR, Imm
4152     // ==> MADD R,A,B,V
4153     // --- Create(MADD);
4154     const TargetRegisterClass *OrrRC;
4155     unsigned BitSize, OrrOpc, ZeroReg;
4156     if (Pattern == MachineCombinerPattern::MULADDWI_OP1) {
4157       OrrOpc = AArch64::ORRWri;
4158       OrrRC = &AArch64::GPR32spRegClass;
4159       BitSize = 32;
4160       ZeroReg = AArch64::WZR;
4161       Opc = AArch64::MADDWrrr;
4162       RC = &AArch64::GPR32RegClass;
4163     } else {
4164       OrrOpc = AArch64::ORRXri;
4165       OrrRC = &AArch64::GPR64spRegClass;
4166       BitSize = 64;
4167       ZeroReg = AArch64::XZR;
4168       Opc = AArch64::MADDXrrr;
4169       RC = &AArch64::GPR64RegClass;
4170     }
4171     Register NewVR = MRI.createVirtualRegister(OrrRC);
4172     uint64_t Imm = Root.getOperand(2).getImm();
4173 
4174     if (Root.getOperand(3).isImm()) {
4175       unsigned Val = Root.getOperand(3).getImm();
4176       Imm = Imm << Val;
4177     }
4178     uint64_t UImm = SignExtend64(Imm, BitSize);
4179     uint64_t Encoding;
4180     if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
4181       MachineInstrBuilder MIB1 =
4182           BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
4183               .addReg(ZeroReg)
4184               .addImm(Encoding);
4185       InsInstrs.push_back(MIB1);
4186       InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4187       MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
4188     }
4189     break;
4190   }
4191   case MachineCombinerPattern::MULSUBW_OP1:
4192   case MachineCombinerPattern::MULSUBX_OP1: {
4193     // MUL I=A,B,0
4194     // SUB R,I, C
4195     // ==> SUB  V, 0, C
4196     // ==> MADD R,A,B,V // = -C + A*B
4197     // --- Create(MADD);
4198     const TargetRegisterClass *SubRC;
4199     unsigned SubOpc, ZeroReg;
4200     if (Pattern == MachineCombinerPattern::MULSUBW_OP1) {
4201       SubOpc = AArch64::SUBWrr;
4202       SubRC = &AArch64::GPR32spRegClass;
4203       ZeroReg = AArch64::WZR;
4204       Opc = AArch64::MADDWrrr;
4205       RC = &AArch64::GPR32RegClass;
4206     } else {
4207       SubOpc = AArch64::SUBXrr;
4208       SubRC = &AArch64::GPR64spRegClass;
4209       ZeroReg = AArch64::XZR;
4210       Opc = AArch64::MADDXrrr;
4211       RC = &AArch64::GPR64RegClass;
4212     }
4213     Register NewVR = MRI.createVirtualRegister(SubRC);
4214     // SUB NewVR, 0, C
4215     MachineInstrBuilder MIB1 =
4216         BuildMI(MF, Root.getDebugLoc(), TII->get(SubOpc), NewVR)
4217             .addReg(ZeroReg)
4218             .add(Root.getOperand(2));
4219     InsInstrs.push_back(MIB1);
4220     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4221     MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
4222     break;
4223   }
4224   case MachineCombinerPattern::MULSUBW_OP2:
4225   case MachineCombinerPattern::MULSUBX_OP2:
4226     // MUL I=A,B,0
4227     // SUB R,C,I
4228     // ==> MSUB R,A,B,C (computes C - A*B)
4229     // --- Create(MSUB);
4230     if (Pattern == MachineCombinerPattern::MULSUBW_OP2) {
4231       Opc = AArch64::MSUBWrrr;
4232       RC = &AArch64::GPR32RegClass;
4233     } else {
4234       Opc = AArch64::MSUBXrrr;
4235       RC = &AArch64::GPR64RegClass;
4236     }
4237     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4238     break;
4239   case MachineCombinerPattern::MULSUBWI_OP1:
4240   case MachineCombinerPattern::MULSUBXI_OP1: {
4241     // MUL I=A,B,0
4242     // SUB R,I, Imm
4243     // ==> ORR  V, ZR, -Imm
4244     // ==> MADD R,A,B,V // = -Imm + A*B
4245     // --- Create(MADD);
4246     const TargetRegisterClass *OrrRC;
4247     unsigned BitSize, OrrOpc, ZeroReg;
4248     if (Pattern == MachineCombinerPattern::MULSUBWI_OP1) {
4249       OrrOpc = AArch64::ORRWri;
4250       OrrRC = &AArch64::GPR32spRegClass;
4251       BitSize = 32;
4252       ZeroReg = AArch64::WZR;
4253       Opc = AArch64::MADDWrrr;
4254       RC = &AArch64::GPR32RegClass;
4255     } else {
4256       OrrOpc = AArch64::ORRXri;
4257       OrrRC = &AArch64::GPR64spRegClass;
4258       BitSize = 64;
4259       ZeroReg = AArch64::XZR;
4260       Opc = AArch64::MADDXrrr;
4261       RC = &AArch64::GPR64RegClass;
4262     }
4263     Register NewVR = MRI.createVirtualRegister(OrrRC);
4264     uint64_t Imm = Root.getOperand(2).getImm();
4265     if (Root.getOperand(3).isImm()) {
4266       unsigned Val = Root.getOperand(3).getImm();
4267       Imm = Imm << Val;
4268     }
4269     uint64_t UImm = SignExtend64(-Imm, BitSize);
4270     uint64_t Encoding;
4271     if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
4272       MachineInstrBuilder MIB1 =
4273           BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
4274               .addReg(ZeroReg)
4275               .addImm(Encoding);
4276       InsInstrs.push_back(MIB1);
4277       InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4278       MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
4279     }
4280     break;
4281   }
4282   // Floating Point Support
4283   case MachineCombinerPattern::FMULADDH_OP1:
4284     Opc = AArch64::FMADDHrrr;
4285     RC = &AArch64::FPR16RegClass;
4286     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4287     break;
4288   case MachineCombinerPattern::FMULADDS_OP1:
4289     Opc = AArch64::FMADDSrrr;
4290     RC = &AArch64::FPR32RegClass;
4291     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4292     break;
4293   case MachineCombinerPattern::FMULADDD_OP1:
4294     Opc = AArch64::FMADDDrrr;
4295     RC = &AArch64::FPR64RegClass;
4296     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4297     break;
4298 
4299   case MachineCombinerPattern::FMULADDH_OP2:
4300     Opc = AArch64::FMADDHrrr;
4301     RC = &AArch64::FPR16RegClass;
4302     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4303     break;
4304   case MachineCombinerPattern::FMULADDS_OP2:
4305     Opc = AArch64::FMADDSrrr;
4306     RC = &AArch64::FPR32RegClass;
4307     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4308     break;
4309   case MachineCombinerPattern::FMULADDD_OP2:
4310     Opc = AArch64::FMADDDrrr;
4311     RC = &AArch64::FPR64RegClass;
4312     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4313     break;
4314 
4315   case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
4316     Opc = AArch64::FMLAv1i32_indexed;
4317     RC = &AArch64::FPR32RegClass;
4318     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4319                            FMAInstKind::Indexed);
4320     break;
4321   case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
4322     Opc = AArch64::FMLAv1i32_indexed;
4323     RC = &AArch64::FPR32RegClass;
4324     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4325                            FMAInstKind::Indexed);
4326     break;
4327 
4328   case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
4329     Opc = AArch64::FMLAv1i64_indexed;
4330     RC = &AArch64::FPR64RegClass;
4331     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4332                            FMAInstKind::Indexed);
4333     break;
4334   case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
4335     Opc = AArch64::FMLAv1i64_indexed;
4336     RC = &AArch64::FPR64RegClass;
4337     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4338                            FMAInstKind::Indexed);
4339     break;
4340 
4341   case MachineCombinerPattern::FMLAv4i16_indexed_OP1:
4342     RC = &AArch64::FPR64RegClass;
4343     Opc = AArch64::FMLAv4i16_indexed;
4344     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4345                            FMAInstKind::Indexed);
4346     break;
4347   case MachineCombinerPattern::FMLAv4f16_OP1:
4348     RC = &AArch64::FPR64RegClass;
4349     Opc = AArch64::FMLAv4f16;
4350     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4351                            FMAInstKind::Accumulator);
4352     break;
4353   case MachineCombinerPattern::FMLAv4i16_indexed_OP2:
4354     RC = &AArch64::FPR64RegClass;
4355     Opc = AArch64::FMLAv4i16_indexed;
4356     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4357                            FMAInstKind::Indexed);
4358     break;
4359   case MachineCombinerPattern::FMLAv4f16_OP2:
4360     RC = &AArch64::FPR64RegClass;
4361     Opc = AArch64::FMLAv4f16;
4362     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4363                            FMAInstKind::Accumulator);
4364     break;
4365 
4366   case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
4367   case MachineCombinerPattern::FMLAv2f32_OP1:
4368     RC = &AArch64::FPR64RegClass;
4369     if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP1) {
4370       Opc = AArch64::FMLAv2i32_indexed;
4371       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4372                              FMAInstKind::Indexed);
4373     } else {
4374       Opc = AArch64::FMLAv2f32;
4375       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4376                              FMAInstKind::Accumulator);
4377     }
4378     break;
4379   case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
4380   case MachineCombinerPattern::FMLAv2f32_OP2:
4381     RC = &AArch64::FPR64RegClass;
4382     if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP2) {
4383       Opc = AArch64::FMLAv2i32_indexed;
4384       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4385                              FMAInstKind::Indexed);
4386     } else {
4387       Opc = AArch64::FMLAv2f32;
4388       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4389                              FMAInstKind::Accumulator);
4390     }
4391     break;
4392 
4393   case MachineCombinerPattern::FMLAv8i16_indexed_OP1:
4394     RC = &AArch64::FPR128RegClass;
4395     Opc = AArch64::FMLAv8i16_indexed;
4396     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4397                            FMAInstKind::Indexed);
4398     break;
4399   case MachineCombinerPattern::FMLAv8f16_OP1:
4400     RC = &AArch64::FPR128RegClass;
4401     Opc = AArch64::FMLAv8f16;
4402     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4403                            FMAInstKind::Accumulator);
4404     break;
4405   case MachineCombinerPattern::FMLAv8i16_indexed_OP2:
4406     RC = &AArch64::FPR128RegClass;
4407     Opc = AArch64::FMLAv8i16_indexed;
4408     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4409                            FMAInstKind::Indexed);
4410     break;
4411   case MachineCombinerPattern::FMLAv8f16_OP2:
4412     RC = &AArch64::FPR128RegClass;
4413     Opc = AArch64::FMLAv8f16;
4414     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4415                            FMAInstKind::Accumulator);
4416     break;
4417 
4418   case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
4419   case MachineCombinerPattern::FMLAv2f64_OP1:
4420     RC = &AArch64::FPR128RegClass;
4421     if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP1) {
4422       Opc = AArch64::FMLAv2i64_indexed;
4423       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4424                              FMAInstKind::Indexed);
4425     } else {
4426       Opc = AArch64::FMLAv2f64;
4427       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4428                              FMAInstKind::Accumulator);
4429     }
4430     break;
4431   case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
4432   case MachineCombinerPattern::FMLAv2f64_OP2:
4433     RC = &AArch64::FPR128RegClass;
4434     if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP2) {
4435       Opc = AArch64::FMLAv2i64_indexed;
4436       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4437                              FMAInstKind::Indexed);
4438     } else {
4439       Opc = AArch64::FMLAv2f64;
4440       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4441                              FMAInstKind::Accumulator);
4442     }
4443     break;
4444 
4445   case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
4446   case MachineCombinerPattern::FMLAv4f32_OP1:
4447     RC = &AArch64::FPR128RegClass;
4448     if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP1) {
4449       Opc = AArch64::FMLAv4i32_indexed;
4450       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4451                              FMAInstKind::Indexed);
4452     } else {
4453       Opc = AArch64::FMLAv4f32;
4454       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4455                              FMAInstKind::Accumulator);
4456     }
4457     break;
4458 
4459   case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
4460   case MachineCombinerPattern::FMLAv4f32_OP2:
4461     RC = &AArch64::FPR128RegClass;
4462     if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP2) {
4463       Opc = AArch64::FMLAv4i32_indexed;
4464       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4465                              FMAInstKind::Indexed);
4466     } else {
4467       Opc = AArch64::FMLAv4f32;
4468       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4469                              FMAInstKind::Accumulator);
4470     }
4471     break;
4472 
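  // For the FMULSUB*_OP1 patterns immediately below, the multiply feeds the
  // minuend of the subtraction, so (values are illustrative)
  //   %d = fsub (fmul %n, %m), %a
  // maps onto FNMSUB, which computes %n * %m - %a.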
4473   case MachineCombinerPattern::FMULSUBH_OP1:
4474     Opc = AArch64::FNMSUBHrrr;
4475     RC = &AArch64::FPR16RegClass;
4476     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4477     break;
4478   case MachineCombinerPattern::FMULSUBS_OP1:
4479     Opc = AArch64::FNMSUBSrrr;
4480     RC = &AArch64::FPR32RegClass;
4481     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4482     break;
4483   case MachineCombinerPattern::FMULSUBD_OP1:
4484     Opc = AArch64::FNMSUBDrrr;
4485     RC = &AArch64::FPR64RegClass;
4486     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4487     break;
4488 
4489   case MachineCombinerPattern::FNMULSUBH_OP1:
4490     Opc = AArch64::FNMADDHrrr;
4491     RC = &AArch64::FPR16RegClass;
4492     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4493     break;
4494   case MachineCombinerPattern::FNMULSUBS_OP1:
4495     Opc = AArch64::FNMADDSrrr;
4496     RC = &AArch64::FPR32RegClass;
4497     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4498     break;
4499   case MachineCombinerPattern::FNMULSUBD_OP1:
4500     Opc = AArch64::FNMADDDrrr;
4501     RC = &AArch64::FPR64RegClass;
4502     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4503     break;
4504 
4505   case MachineCombinerPattern::FMULSUBH_OP2:
4506     Opc = AArch64::FMSUBHrrr;
4507     RC = &AArch64::FPR16RegClass;
4508     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4509     break;
4510   case MachineCombinerPattern::FMULSUBS_OP2:
4511     Opc = AArch64::FMSUBSrrr;
4512     RC = &AArch64::FPR32RegClass;
4513     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4514     break;
4515   case MachineCombinerPattern::FMULSUBD_OP2:
4516     Opc = AArch64::FMSUBDrrr;
4517     RC = &AArch64::FPR64RegClass;
4518     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4519     break;
4520 
4521   case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
4522     Opc = AArch64::FMLSv1i32_indexed;
4523     RC = &AArch64::FPR32RegClass;
4524     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4525                            FMAInstKind::Indexed);
4526     break;
4527 
4528   case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
4529     Opc = AArch64::FMLSv1i64_indexed;
4530     RC = &AArch64::FPR64RegClass;
4531     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4532                            FMAInstKind::Indexed);
4533     break;
4534 
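  // For the vector FMLS*_OP1 patterns handled below, the multiply feeds the
  // minuend, so the other operand is negated first and the result is formed
  // by an FMLA into that negated value (register names are illustrative):
  //   %d = fsub (fmul %a, %b), %c
  // becomes
  //   %t = fneg %c
  //   %d = fmla %t, %a, %b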
4535   case MachineCombinerPattern::FMLSv4f16_OP1:
4536   case MachineCombinerPattern::FMLSv4i16_indexed_OP1: {
4537     RC = &AArch64::FPR64RegClass;
4538     Register NewVR = MRI.createVirtualRegister(RC);
4539     MachineInstrBuilder MIB1 =
4540         BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv4f16), NewVR)
4541             .add(Root.getOperand(2));
4542     InsInstrs.push_back(MIB1);
4543     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4544     if (Pattern == MachineCombinerPattern::FMLSv4f16_OP1) {
4545       Opc = AArch64::FMLAv4f16;
4546       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4547                              FMAInstKind::Accumulator, &NewVR);
4548     } else {
4549       Opc = AArch64::FMLAv4i16_indexed;
4550       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4551                              FMAInstKind::Indexed, &NewVR);
4552     }
4553     break;
4554   }
4555   case MachineCombinerPattern::FMLSv4f16_OP2:
4556     RC = &AArch64::FPR64RegClass;
4557     Opc = AArch64::FMLSv4f16;
4558     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4559                            FMAInstKind::Accumulator);
4560     break;
4561   case MachineCombinerPattern::FMLSv4i16_indexed_OP2:
4562     RC = &AArch64::FPR64RegClass;
4563     Opc = AArch64::FMLSv4i16_indexed;
4564     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4565                            FMAInstKind::Indexed);
4566     break;
4567 
4568   case MachineCombinerPattern::FMLSv2f32_OP2:
4569   case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
4570     RC = &AArch64::FPR64RegClass;
4571     if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP2) {
4572       Opc = AArch64::FMLSv2i32_indexed;
4573       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4574                              FMAInstKind::Indexed);
4575     } else {
4576       Opc = AArch64::FMLSv2f32;
4577       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4578                              FMAInstKind::Accumulator);
4579     }
4580     break;
4581 
4582   case MachineCombinerPattern::FMLSv8f16_OP1:
4583   case MachineCombinerPattern::FMLSv8i16_indexed_OP1: {
4584     RC = &AArch64::FPR128RegClass;
4585     Register NewVR = MRI.createVirtualRegister(RC);
4586     MachineInstrBuilder MIB1 =
4587         BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv8f16), NewVR)
4588             .add(Root.getOperand(2));
4589     InsInstrs.push_back(MIB1);
4590     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4591     if (Pattern == MachineCombinerPattern::FMLSv8f16_OP1) {
4592       Opc = AArch64::FMLAv8f16;
4593       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4594                              FMAInstKind::Accumulator, &NewVR);
4595     } else {
4596       Opc = AArch64::FMLAv8i16_indexed;
4597       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4598                              FMAInstKind::Indexed, &NewVR);
4599     }
4600     break;
4601   }
4602   case MachineCombinerPattern::FMLSv8f16_OP2:
4603     RC = &AArch64::FPR128RegClass;
4604     Opc = AArch64::FMLSv8f16;
4605     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4606                            FMAInstKind::Accumulator);
4607     break;
4608   case MachineCombinerPattern::FMLSv8i16_indexed_OP2:
4609     RC = &AArch64::FPR128RegClass;
4610     Opc = AArch64::FMLSv8i16_indexed;
4611     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4612                            FMAInstKind::Indexed);
4613     break;
4614 
4615   case MachineCombinerPattern::FMLSv2f64_OP2:
4616   case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
4617     RC = &AArch64::FPR128RegClass;
4618     if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP2) {
4619       Opc = AArch64::FMLSv2i64_indexed;
4620       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4621                              FMAInstKind::Indexed);
4622     } else {
4623       Opc = AArch64::FMLSv2f64;
4624       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4625                              FMAInstKind::Accumulator);
4626     }
4627     break;
4628 
4629   case MachineCombinerPattern::FMLSv4f32_OP2:
4630   case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
4631     RC = &AArch64::FPR128RegClass;
4632     if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP2) {
4633       Opc = AArch64::FMLSv4i32_indexed;
4634       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4635                              FMAInstKind::Indexed);
4636     } else {
4637       Opc = AArch64::FMLSv4f32;
4638       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4639                              FMAInstKind::Accumulator);
4640     }
4641     break;
4642   case MachineCombinerPattern::FMLSv2f32_OP1:
4643   case MachineCombinerPattern::FMLSv2i32_indexed_OP1: {
4644     RC = &AArch64::FPR64RegClass;
4645     Register NewVR = MRI.createVirtualRegister(RC);
4646     MachineInstrBuilder MIB1 =
4647         BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f32), NewVR)
4648             .add(Root.getOperand(2));
4649     InsInstrs.push_back(MIB1);
4650     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4651     if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP1) {
4652       Opc = AArch64::FMLAv2i32_indexed;
4653       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4654                              FMAInstKind::Indexed, &NewVR);
4655     } else {
4656       Opc = AArch64::FMLAv2f32;
4657       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4658                              FMAInstKind::Accumulator, &NewVR);
4659     }
4660     break;
4661   }
4662   case MachineCombinerPattern::FMLSv4f32_OP1:
4663   case MachineCombinerPattern::FMLSv4i32_indexed_OP1: {
4664     RC = &AArch64::FPR128RegClass;
4665     Register NewVR = MRI.createVirtualRegister(RC);
4666     MachineInstrBuilder MIB1 =
4667         BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv4f32), NewVR)
4668             .add(Root.getOperand(2));
4669     InsInstrs.push_back(MIB1);
4670     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4671     if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP1) {
4672       Opc = AArch64::FMLAv4i32_indexed;
4673       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4674                              FMAInstKind::Indexed, &NewVR);
4675     } else {
4676       Opc = AArch64::FMLAv4f32;
4677       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4678                              FMAInstKind::Accumulator, &NewVR);
4679     }
4680     break;
4681   }
4682   case MachineCombinerPattern::FMLSv2f64_OP1:
4683   case MachineCombinerPattern::FMLSv2i64_indexed_OP1: {
4684     RC = &AArch64::FPR128RegClass;
4685     Register NewVR = MRI.createVirtualRegister(RC);
4686     MachineInstrBuilder MIB1 =
4687         BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f64), NewVR)
4688             .add(Root.getOperand(2));
4689     InsInstrs.push_back(MIB1);
4690     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4691     if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP1) {
4692       Opc = AArch64::FMLAv2i64_indexed;
4693       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4694                              FMAInstKind::Indexed, &NewVR);
4695     } else {
4696       Opc = AArch64::FMLAv2f64;
4697       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4698                              FMAInstKind::Accumulator, &NewVR);
4699     }
4700     break;
4701   }
4702   } // end switch (Pattern)
4703   // Record MUL and ADD/SUB for deletion
4704   DelInstrs.push_back(MUL);
4705   DelInstrs.push_back(&Root);
4706 }
4707 
4708 /// Replace csincr-branch sequence by simple conditional branch
4709 ///
4710 /// Examples:
4711 /// 1. \code
4712 ///   csinc  w9, wzr, wzr, <condition code>
4713 ///   tbnz   w9, #0, 0x44
4714 ///    \endcode
4715 /// to
4716 ///    \code
4717 ///   b.<inverted condition code>
4718 ///    \endcode
4719 ///
4720 /// 2. \code
4721 ///   csinc w9, wzr, wzr, <condition code>
4722 ///   tbz   w9, #0, 0x44
4723 ///    \endcode
4724 /// to
4725 ///    \code
4726 ///   b.<condition code>
4727 ///    \endcode
4728 ///
4729 /// Replace compare and branch sequence by TBZ/TBNZ instruction when the
4730 /// compare's constant operand is power of 2.
4731 ///
4732 /// Examples:
4733 ///    \code
4734 ///   and  w8, w8, #0x400
4735 ///   cbnz w8, L1
4736 ///    \endcode
4737 /// to
4738 ///    \code
4739 ///   tbnz w8, #10, L1
4740 ///    \endcode
4741 ///
4742 /// \param  MI Conditional Branch
4743 /// \return True when the simple conditional branch is generated
4744 ///
4745 bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
4746   bool IsNegativeBranch = false;
4747   bool IsTestAndBranch = false;
4748   unsigned TargetBBInMI = 0;
4749   switch (MI.getOpcode()) {
4750   default:
4751     llvm_unreachable("Unknown branch instruction?");
4752   case AArch64::Bcc:
4753     return false;
4754   case AArch64::CBZW:
4755   case AArch64::CBZX:
4756     TargetBBInMI = 1;
4757     break;
4758   case AArch64::CBNZW:
4759   case AArch64::CBNZX:
4760     TargetBBInMI = 1;
4761     IsNegativeBranch = true;
4762     break;
4763   case AArch64::TBZW:
4764   case AArch64::TBZX:
4765     TargetBBInMI = 2;
4766     IsTestAndBranch = true;
4767     break;
4768   case AArch64::TBNZW:
4769   case AArch64::TBNZX:
4770     TargetBBInMI = 2;
4771     IsNegativeBranch = true;
4772     IsTestAndBranch = true;
4773     break;
4774   }
4775   // So we increment a zero register and test for bits other
4776   // than bit 0? Conservatively bail out in case the verifier
4777   // missed this case.
4778   if (IsTestAndBranch && MI.getOperand(1).getImm())
4779     return false;
4780 
4781   // Find Definition.
4782   assert(MI.getParent() && "Incomplete machine instruction\n");
4783   MachineBasicBlock *MBB = MI.getParent();
4784   MachineFunction *MF = MBB->getParent();
4785   MachineRegisterInfo *MRI = &MF->getRegInfo();
4786   Register VReg = MI.getOperand(0).getReg();
4787   if (!Register::isVirtualRegister(VReg))
4788     return false;
4789 
4790   MachineInstr *DefMI = MRI->getVRegDef(VReg);
4791 
4792   // Look through COPY instructions to find definition.
4793   while (DefMI->isCopy()) {
4794     Register CopyVReg = DefMI->getOperand(1).getReg();
4795     if (!MRI->hasOneNonDBGUse(CopyVReg))
4796       return false;
4797     if (!MRI->hasOneDef(CopyVReg))
4798       return false;
4799     DefMI = MRI->getVRegDef(CopyVReg);
4800   }
4801 
4802   switch (DefMI->getOpcode()) {
4803   default:
4804     return false;
4805   // Fold AND into a TBZ/TBNZ if constant operand is power of 2.
4806   case AArch64::ANDWri:
4807   case AArch64::ANDXri: {
4808     if (IsTestAndBranch)
4809       return false;
4810     if (DefMI->getParent() != MBB)
4811       return false;
4812     if (!MRI->hasOneNonDBGUse(VReg))
4813       return false;
4814 
4815     bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
4816     uint64_t Mask = AArch64_AM::decodeLogicalImmediate(
4817         DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64);
4818     if (!isPowerOf2_64(Mask))
4819       return false;
4820 
4821     MachineOperand &MO = DefMI->getOperand(1);
4822     Register NewReg = MO.getReg();
4823     if (!Register::isVirtualRegister(NewReg))
4824       return false;
4825 
4826     assert(!MRI->def_empty(NewReg) && "Register must be defined.");
4827 
4828     MachineBasicBlock &RefToMBB = *MBB;
4829     MachineBasicBlock *TBB = MI.getOperand(1).getMBB();
4830     DebugLoc DL = MI.getDebugLoc();
4831     unsigned Imm = Log2_64(Mask);
4832     unsigned Opc = (Imm < 32)
4833                        ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
4834                        : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
4835     MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc))
4836                               .addReg(NewReg)
4837                               .addImm(Imm)
4838                               .addMBB(TBB);
4839     // Register lives on to the CBZ now.
4840     // Register lives on to the TBZ/TBNZ now.
4841 
4842     // For immediates smaller than 32, we need to use the 32-bit
4843     // variant (W) in all cases, since the 64-bit variant cannot
4844     // encode them.
4845     // Therefore, if the input register is 64-bit, we need to take the
4846     // 32-bit sub-register.
4847     if (!Is32Bit && Imm < 32)
4848       NewMI->getOperand(0).setSubReg(AArch64::sub_32);
4849     MI.eraseFromParent();
4850     return true;
4851   }
4852   // Look for CSINC
4853   case AArch64::CSINCWr:
4854   case AArch64::CSINCXr: {
4855     if (!(DefMI->getOperand(1).getReg() == AArch64::WZR &&
4856           DefMI->getOperand(2).getReg() == AArch64::WZR) &&
4857         !(DefMI->getOperand(1).getReg() == AArch64::XZR &&
4858           DefMI->getOperand(2).getReg() == AArch64::XZR))
4859       return false;
4860 
4861     if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1)
4862       return false;
4863 
4864     AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
4865     // Convert only when the condition code is not modified between
4866     // the CSINC and the branch. The CC may be used by other
4867     // instructions in between.
4868     if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write))
4869       return false;
4870     MachineBasicBlock &RefToMBB = *MBB;
4871     MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB();
4872     DebugLoc DL = MI.getDebugLoc();
4873     if (IsNegativeBranch)
4874       CC = AArch64CC::getInvertedCondCode(CC);
4875     BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);
4876     MI.eraseFromParent();
4877     return true;
4878   }
4879   }
4880 }
4881 
4882 std::pair<unsigned, unsigned>
4883 AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
4884   const unsigned Mask = AArch64II::MO_FRAGMENT;
4885   return std::make_pair(TF & Mask, TF & ~Mask);
4886 }
4887 
4888 ArrayRef<std::pair<unsigned, const char *>>
4889 AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
4890   using namespace AArch64II;
4891 
4892   static const std::pair<unsigned, const char *> TargetFlags[] = {
4893       {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"},
4894       {MO_G3, "aarch64-g3"},     {MO_G2, "aarch64-g2"},
4895       {MO_G1, "aarch64-g1"},     {MO_G0, "aarch64-g0"},
4896       {MO_HI12, "aarch64-hi12"}};
4897   return makeArrayRef(TargetFlags);
4898 }
4899 
4900 ArrayRef<std::pair<unsigned, const char *>>
4901 AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
4902   using namespace AArch64II;
4903 
4904   static const std::pair<unsigned, const char *> TargetFlags[] = {
4905       {MO_COFFSTUB, "aarch64-coffstub"},
4906       {MO_GOT, "aarch64-got"},
4907       {MO_NC, "aarch64-nc"},
4908       {MO_S, "aarch64-s"},
4909       {MO_TLS, "aarch64-tls"},
4910       {MO_DLLIMPORT, "aarch64-dllimport"},
4911       {MO_PREL, "aarch64-prel"},
4912       {MO_TAGGED, "aarch64-tagged"}};
4913   return makeArrayRef(TargetFlags);
4914 }
4915 
4916 ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
4917 AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
4918   static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
4919       {{MOSuppressPair, "aarch64-suppress-pair"},
4920        {MOStridedAccess, "aarch64-strided-access"}};
4921   return makeArrayRef(TargetFlags);
4922 }
4923 
4924 /// Constants defining how certain sequences should be outlined.
4925 /// This encompasses how an outlined function should be called, and what kind of
4926 /// frame should be emitted for that outlined function.
4927 ///
4928 /// \p MachineOutlinerDefault implies that the function should be called with
4929 /// a save and restore of LR to the stack.
4930 ///
4931 /// That is,
4932 ///
4933 /// I1     Save LR                    OUTLINED_FUNCTION:
4934 /// I2 --> BL OUTLINED_FUNCTION       I1
4935 /// I3     Restore LR                 I2
4936 ///                                   I3
4937 ///                                   RET
4938 ///
4939 /// * Call construction overhead: 3 (save + BL + restore)
4940 /// * Frame construction overhead: 1 (ret)
4941 /// * Requires stack fixups? Yes
4942 ///
4943 /// \p MachineOutlinerTailCall implies that the function is being created from
4944 /// a sequence of instructions ending in a return.
4945 ///
4946 /// That is,
4947 ///
4948 /// I1                             OUTLINED_FUNCTION:
4949 /// I2 --> B OUTLINED_FUNCTION     I1
4950 /// RET                            I2
4951 ///                                RET
4952 ///
4953 /// * Call construction overhead: 1 (B)
4954 /// * Frame construction overhead: 0 (Return included in sequence)
4955 /// * Requires stack fixups? No
4956 ///
4957 /// \p MachineOutlinerNoLRSave implies that the function should be called using
4958 /// a BL instruction, but doesn't require LR to be saved and restored. This
4959 /// happens when LR is known to be dead.
4960 ///
4961 /// That is,
4962 ///
4963 /// I1                                OUTLINED_FUNCTION:
4964 /// I2 --> BL OUTLINED_FUNCTION       I1
4965 /// I3                                I2
4966 ///                                   I3
4967 ///                                   RET
4968 ///
4969 /// * Call construction overhead: 1 (BL)
4970 /// * Frame construction overhead: 1 (RET)
4971 /// * Requires stack fixups? No
4972 ///
4973 /// \p MachineOutlinerThunk implies that the function is being created from
4974 /// a sequence of instructions ending in a call. The outlined function is
4975 /// called with a BL instruction, and the outlined function tail-calls the
4976 /// original call destination.
4977 ///
4978 /// That is,
4979 ///
4980 /// I1                                OUTLINED_FUNCTION:
4981 /// I2 --> BL OUTLINED_FUNCTION       I1
4982 /// BL f                              I2
4983 ///                                   B f
4984 /// * Call construction overhead: 1 (BL)
4985 /// * Frame construction overhead: 0
4986 /// * Requires stack fixups? No
4987 ///
4988 /// \p MachineOutlinerRegSave implies that the function should be called with a
4989 /// save and restore of LR to an available register. This allows us to avoid
4990 /// stack fixups. Note that this outlining variant is compatible with the
4991 /// NoLRSave case.
4992 ///
4993 /// That is,
4994 ///
4995 /// I1     Save LR                    OUTLINED_FUNCTION:
4996 /// I2 --> BL OUTLINED_FUNCTION       I1
4997 /// I3     Restore LR                 I2
4998 ///                                   I3
4999 ///                                   RET
5000 ///
5001 /// * Call construction overhead: 3 (save + BL + restore)
5002 /// * Frame construction overhead: 1 (ret)
5003 /// * Requires stack fixups? No
5004 enum MachineOutlinerClass {
5005   MachineOutlinerDefault,  /// Emit a save, restore, call, and return.
5006   MachineOutlinerTailCall, /// Only emit a branch.
5007   MachineOutlinerNoLRSave, /// Emit a call and return.
5008   MachineOutlinerThunk,    /// Emit a call and tail-call.
5009   MachineOutlinerRegSave   /// Same as default, but save to a register.
5010 };
5011 
5012 enum MachineOutlinerMBBFlags {
5013   LRUnavailableSomewhere = 0x2,
5014   HasCalls = 0x4,
5015   UnsafeRegsDead = 0x8
5016 };
5017 
5018 unsigned
5019 AArch64InstrInfo::findRegisterToSaveLRTo(const outliner::Candidate &C) const {
5020   assert(C.LRUWasSet && "LRU wasn't set?");
5021   MachineFunction *MF = C.getMF();
5022   const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>(
5023       MF->getSubtarget().getRegisterInfo());
5024 
5025   // Check if there is an available register across the sequence that we can
5026   // use.
5027   for (unsigned Reg : AArch64::GPR64RegClass) {
5028     if (!ARI->isReservedReg(*MF, Reg) &&
5029         Reg != AArch64::LR &&  // LR is not reserved, but don't use it.
5030         Reg != AArch64::X16 && // X16 is not guaranteed to be preserved.
5031         Reg != AArch64::X17 && // Ditto for X17.
5032         C.LRU.available(Reg) && C.UsedInSequence.available(Reg))
5033       return Reg;
5034   }
5035 
5036   // No suitable register. Return 0.
5037   return 0u;
5038 }
5039 
5040 outliner::OutlinedFunction
5041 AArch64InstrInfo::getOutliningCandidateInfo(
5042     std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
5043   outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
5044   unsigned SequenceSize =
5045       std::accumulate(FirstCand.front(), std::next(FirstCand.back()), 0,
5046                       [this](unsigned Sum, const MachineInstr &MI) {
5047                         return Sum + getInstSizeInBytes(MI);
5048                       });
5049 
5050   // Properties about candidate MBBs that hold for all of them.
5051   unsigned FlagsSetInAll = 0xF;
5052 
5053   // Compute liveness information for each candidate, and set FlagsSetInAll.
5054   const TargetRegisterInfo &TRI = getRegisterInfo();
5055   std::for_each(RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
5056                 [&FlagsSetInAll](outliner::Candidate &C) {
5057                   FlagsSetInAll &= C.Flags;
5058                 });
5059 
5060   // According to the AArch64 Procedure Call Standard, the following are
5061   // undefined on entry/exit from a function call:
5062   //
5063   // * Registers x16, x17, (and thus w16, w17)
5064   // * Condition codes (and thus the NZCV register)
5065   //
5066   // Because of this, we can't outline any sequence of instructions where
5067   // one of these registers is live into/across it. Thus, we need to
5068   // delete those candidates.
5071   auto CantGuaranteeValueAcrossCall = [&TRI](outliner::Candidate &C) {
5072     // If the unsafe registers in this block are all dead, then we don't need
5073     // to compute liveness here.
5074     if (C.Flags & UnsafeRegsDead)
5075       return false;
5076     C.initLRU(TRI);
5077     LiveRegUnits LRU = C.LRU;
5078     return (!LRU.available(AArch64::W16) || !LRU.available(AArch64::W17) ||
5079             !LRU.available(AArch64::NZCV));
5080   };
5081 
5082   // Are there any candidates where those registers are live?
5083   if (!(FlagsSetInAll & UnsafeRegsDead)) {
5084     // Erase every candidate that violates the restrictions above. (It could be
5085     // true that we have viable candidates, so it's not worth bailing out in
5086     // the case that, say, 1 out of 20 candidates violates the restrictions.)
5087     RepeatedSequenceLocs.erase(std::remove_if(RepeatedSequenceLocs.begin(),
5088                                               RepeatedSequenceLocs.end(),
5089                                               CantGuaranteeValueAcrossCall),
5090                                RepeatedSequenceLocs.end());
5091 
5092     // If the sequence doesn't have enough candidates left, then we're done.
5093     if (RepeatedSequenceLocs.size() < 2)
5094       return outliner::OutlinedFunction();
5095   }
5096 
5097   // At this point, we have only "safe" candidates to outline. Figure out
5098   // frame + call instruction information.
5099 
5100   unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back()->getOpcode();
5101 
5102   // Helper lambda which sets call information for every candidate.
5103   auto SetCandidateCallInfo =
5104       [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
5105         for (outliner::Candidate &C : RepeatedSequenceLocs)
5106           C.setCallInfo(CallID, NumBytesForCall);
5107       };
5108 
5109   unsigned FrameID = MachineOutlinerDefault;
5110   unsigned NumBytesToCreateFrame = 4;
5111 
5112   bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) {
5113     return C.getMF()->getFunction().hasFnAttribute("branch-target-enforcement");
5114   });
5115 
5116   // Returns true if an instruction is safe to fix up, false otherwise.
5117   auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
5118     if (MI.isCall())
5119       return true;
5120 
5121     if (!MI.modifiesRegister(AArch64::SP, &TRI) &&
5122         !MI.readsRegister(AArch64::SP, &TRI))
5123       return true;
5124 
5125     // Any modification of SP will break our code to save/restore LR.
5126     // FIXME: We could handle some instructions which add a constant
5127     // offset to SP, with a bit more work.
5128     if (MI.modifiesRegister(AArch64::SP, &TRI))
5129       return false;
5130 
5131     // At this point, we have a stack instruction that we might need to
5132     // fix up. We'll handle it if it's a load or store.
5133     if (MI.mayLoadOrStore()) {
5134       const MachineOperand *Base; // Filled with the base operand of MI.
5135       int64_t Offset;             // Filled with the offset of MI.
5136 
5137       // Does it allow us to offset the base operand and is the base the
5138       // register SP?
5139       if (!getMemOperandWithOffset(MI, Base, Offset, &TRI) || !Base->isReg() ||
5140           Base->getReg() != AArch64::SP)
5141         return false;
5142 
5143       // Find the minimum/maximum offset for this instruction and check
5144       // if fixing it up would be in range.
5145       int64_t MinOffset,
5146           MaxOffset;  // Unscaled offsets for the instruction.
5147       unsigned Scale; // The scale to multiply the offsets by.
5148       unsigned DummyWidth;
5149       getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);
5150 
5151       Offset += 16; // Update the offset to what it would be if we outlined.
5152       if (Offset < MinOffset * Scale || Offset > MaxOffset * Scale)
5153         return false;
5154 
5155       // It's in range, so we can outline it.
5156       return true;
5157     }
5158 
5159     // FIXME: Add handling for instructions like "add x0, sp, #8".
5160 
5161     // We can't fix it up, so don't outline it.
5162     return false;
5163   };
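  // Illustrative example of the check above (instruction chosen for
  // exposition): "str x0, [sp, #8]" inside a candidate would become an access
  // at [sp, #24] once LR has been pushed, and that adjusted offset must still
  // fit the instruction's scaled immediate range reported by getMemOpInfo.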
5164 
5165   // True if it's possible to fix up each stack instruction in this sequence.
5166   // Important for frames/call variants that modify the stack.
5167   bool AllStackInstrsSafe = std::all_of(
5168       FirstCand.front(), std::next(FirstCand.back()), IsSafeToFixup);
5169 
5170   // If the last instruction in any candidate is a terminator, then we should
5171   // tail call all of the candidates.
5172   if (RepeatedSequenceLocs[0].back()->isTerminator()) {
5173     FrameID = MachineOutlinerTailCall;
5174     NumBytesToCreateFrame = 0;
5175     SetCandidateCallInfo(MachineOutlinerTailCall, 4);
5176   }
5177 
5178   else if (LastInstrOpcode == AArch64::BL ||
5179            (LastInstrOpcode == AArch64::BLR && !HasBTI)) {
5180     // FIXME: Do we need to check if the code after this uses the value of LR?
5181     FrameID = MachineOutlinerThunk;
5182     NumBytesToCreateFrame = 0;
5183     SetCandidateCallInfo(MachineOutlinerThunk, 4);
5184   }
5185 
5186   else {
5187     // We need to decide how to emit calls + frames. We can always emit the same
5188     // frame if we don't need to save to the stack. If we have to save to the
5189     // stack, then we need a different frame.
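    // Rough byte accounting used below (each AArch64 instruction is 4 bytes):
    // a call where LR is free costs just a BL (4 bytes), while a call that
    // must save and restore LR, either to a scratch register or to the stack,
    // costs save + BL + restore (12 bytes).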
5190     unsigned NumBytesNoStackCalls = 0;
5191     std::vector<outliner::Candidate> CandidatesWithoutStackFixups;
5192 
5193     for (outliner::Candidate &C : RepeatedSequenceLocs) {
5194       C.initLRU(TRI);
5195 
5196       // Is LR available? If so, we don't need a save.
5197       if (C.LRU.available(AArch64::LR)) {
5198         NumBytesNoStackCalls += 4;
5199         C.setCallInfo(MachineOutlinerNoLRSave, 4);
5200         CandidatesWithoutStackFixups.push_back(C);
5201       }
5202 
5203       // Is an unused register available? If so, we won't modify the stack, so
5204       // we can outline with the same frame type as those that don't save LR.
5205       else if (findRegisterToSaveLRTo(C)) {
5206         NumBytesNoStackCalls += 12;
5207         C.setCallInfo(MachineOutlinerRegSave, 12);
5208         CandidatesWithoutStackFixups.push_back(C);
5209       }
5210 
5211       // Is SP used in the sequence at all? If not, we don't have to modify
5212       // the stack, so we are guaranteed to get the same frame.
5213       else if (C.UsedInSequence.available(AArch64::SP)) {
5214         NumBytesNoStackCalls += 12;
5215         C.setCallInfo(MachineOutlinerDefault, 12);
5216         CandidatesWithoutStackFixups.push_back(C);
5217       }
5218 
5219       // If we outline this, we need to modify the stack. Pretend we don't
5220       // outline this by saving all of its bytes.
5221       else {
5222         NumBytesNoStackCalls += SequenceSize;
5223       }
5224     }
5225 
5226     // If there are no places where we have to save LR, then note that we
5227     // don't have to update the stack. Otherwise, give every candidate the
5228     // default call type, as long as it's safe to do so.
5229     if (!AllStackInstrsSafe ||
5230         NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) {
5231       RepeatedSequenceLocs = CandidatesWithoutStackFixups;
5232       FrameID = MachineOutlinerNoLRSave;
5233     } else {
5234       SetCandidateCallInfo(MachineOutlinerDefault, 12);
5235     }
5236 
5237     // If we dropped all of the candidates, bail out here.
5238     if (RepeatedSequenceLocs.size() < 2) {
5239       RepeatedSequenceLocs.clear();
5240       return outliner::OutlinedFunction();
5241     }
5242   }
5243 
5244   // Does every candidate's MBB contain a call? If so, then we might have a call
5245   // in the range.
5246   if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
5247     // Check if the range contains a call. These require a save + restore of the
5248     // link register.
5249     bool ModStackToSaveLR = false;
5250     if (std::any_of(FirstCand.front(), FirstCand.back(),
5251                     [](const MachineInstr &MI) { return MI.isCall(); }))
5252       ModStackToSaveLR = true;
5253 
5254     // Handle the last instruction separately. If this is a tail call, then the
5255     // last instruction is a call. We don't want to save + restore in this case.
5256     // However, it could be possible that the last instruction is a call without
5257     // it being valid to tail call this sequence. We should consider this as
5258     // well.
5259     else if (FrameID != MachineOutlinerThunk &&
5260              FrameID != MachineOutlinerTailCall && FirstCand.back()->isCall())
5261       ModStackToSaveLR = true;
5262 
5263     if (ModStackToSaveLR) {
5264       // We can't fix up the stack. Bail out.
5265       if (!AllStackInstrsSafe) {
5266         RepeatedSequenceLocs.clear();
5267         return outliner::OutlinedFunction();
5268       }
5269 
5270       // Save + restore LR.
5271       NumBytesToCreateFrame += 8;
5272     }
5273   }
5274 
5275   return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize,
5276                                     NumBytesToCreateFrame, FrameID);
5277 }
5278 
5279 bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
5280     MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
5281   const Function &F = MF.getFunction();
5282 
5283   // Can F be deduplicated by the linker? If it can, don't outline from it.
5284   if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
5285     return false;
5286 
5287   // Don't outline from functions with section markings; the program could
5288   // expect that all the code is in the named section.
5289   // FIXME: Allow outlining from multiple functions with the same section
5290   // marking.
5291   if (F.hasSection())
5292     return false;
5293 
5294   // Outlining from functions with redzones is unsafe since the outliner may
5295   // modify the stack. Check if hasRedZone is true or unknown; if yes, don't
5296   // outline from it.
5297   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
5298   if (!AFI || AFI->hasRedZone().getValueOr(true))
5299     return false;
5300 
5301   // It's safe to outline from MF.
5302   return true;
5303 }
5304 
5305 bool AArch64InstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB,
5306                                               unsigned &Flags) const {
5307   // Check if LR is available through all of the MBB. If it's not, then set
5308   // a flag.
5309   assert(MBB.getParent()->getRegInfo().tracksLiveness() &&
5310          "Suitable Machine Function for outlining must track liveness");
5311   LiveRegUnits LRU(getRegisterInfo());
5312 
5313   std::for_each(MBB.rbegin(), MBB.rend(),
5314                 [&LRU](MachineInstr &MI) { LRU.accumulate(MI); });
5315 
5316   // Check if each of the unsafe registers are available...
5317   bool W16AvailableInBlock = LRU.available(AArch64::W16);
5318   bool W17AvailableInBlock = LRU.available(AArch64::W17);
5319   bool NZCVAvailableInBlock = LRU.available(AArch64::NZCV);
5320 
5321   // If all of these are dead (and not live out), we know we don't have to check
5322   // them later.
5323   if (W16AvailableInBlock && W17AvailableInBlock && NZCVAvailableInBlock)
5324     Flags |= MachineOutlinerMBBFlags::UnsafeRegsDead;
5325 
5326   // Now, add the live outs to the set.
5327   LRU.addLiveOuts(MBB);
5328 
5329   // If any of these registers is available in the MBB, but also a live out of
5330   // the block, then we know outlining is unsafe.
5331   if (W16AvailableInBlock && !LRU.available(AArch64::W16))
5332     return false;
5333   if (W17AvailableInBlock && !LRU.available(AArch64::W17))
5334     return false;
5335   if (NZCVAvailableInBlock && !LRU.available(AArch64::NZCV))
5336     return false;
5337 
5338   // Check if there's a call inside this MachineBasicBlock. If there is, then
5339   // set a flag.
5340   if (any_of(MBB, [](MachineInstr &MI) { return MI.isCall(); }))
5341     Flags |= MachineOutlinerMBBFlags::HasCalls;
5342 
5343   MachineFunction *MF = MBB.getParent();
5344 
5345   // In the event that we outline, we may have to save LR. If there is an
5346   // available register in the MBB, then we'll always save LR there. Check if
5347   // this is true.
5348   bool CanSaveLR = false;
5349   const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>(
5350       MF->getSubtarget().getRegisterInfo());
5351 
5352   // Check if there is an available register across the sequence that we can
5353   // use.
5354   for (unsigned Reg : AArch64::GPR64RegClass) {
5355     if (!ARI->isReservedReg(*MF, Reg) && Reg != AArch64::LR &&
5356         Reg != AArch64::X16 && Reg != AArch64::X17 && LRU.available(Reg)) {
5357       CanSaveLR = true;
5358       break;
5359     }
5360   }
5361 
5362   // Check if we have a register we can save LR to, and if LR was used
5363   // somewhere. If both of those things are true, then we need to evaluate the
5364   // safety of outlining stack instructions later.
5365   if (!CanSaveLR && !LRU.available(AArch64::LR))
5366     Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere;
5367 
5368   return true;
5369 }
5370 
5371 outliner::InstrType
5372 AArch64InstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT,
5373                                    unsigned Flags) const {
5374   MachineInstr &MI = *MIT;
5375   MachineBasicBlock *MBB = MI.getParent();
5376   MachineFunction *MF = MBB->getParent();
5377   AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
5378 
5379   // Don't outline LOHs.
5380   if (FuncInfo->getLOHRelated().count(&MI))
5381     return outliner::InstrType::Illegal;
5382 
5383   // Don't allow debug values to impact outlining type.
5384   if (MI.isDebugInstr() || MI.isIndirectDebugValue())
5385     return outliner::InstrType::Invisible;
5386 
5387   // At this point, KILL instructions don't really tell us much so we can go
5388   // ahead and skip over them.
5389   if (MI.isKill())
5390     return outliner::InstrType::Invisible;
5391 
5392   // Is this a terminator for a basic block?
5393   if (MI.isTerminator()) {
5394 
5395     // Is this the end of a function?
5396     if (MI.getParent()->succ_empty())
5397       return outliner::InstrType::Legal;
5398 
5399     // It's not, so don't outline it.
5400     return outliner::InstrType::Illegal;
5401   }
5402 
5403   // Make sure none of the operands are un-outlinable.
5404   for (const MachineOperand &MOP : MI.operands()) {
5405     if (MOP.isCPI() || MOP.isJTI() || MOP.isCFIIndex() || MOP.isFI() ||
5406         MOP.isTargetIndex())
5407       return outliner::InstrType::Illegal;
5408 
5409     // If it uses LR or W30 explicitly, then don't touch it.
5410     if (MOP.isReg() && !MOP.isImplicit() &&
5411         (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
5412       return outliner::InstrType::Illegal;
5413   }
5414 
5415   // Special cases for instructions that can always be outlined, but will fail
5416   // the later tests, e.g. ADRPs, which are PC-relative, use LR, but can always
5417   // be outlined because they don't require a *specific* value to be in LR.
5418   if (MI.getOpcode() == AArch64::ADRP)
5419     return outliner::InstrType::Legal;
5420 
5421   // If MI is a call we might be able to outline it. We don't want to outline
5422   // any calls that rely on the position of items on the stack. When we outline
5423   // something containing a call, we have to emit a save and restore of LR in
5424   // the outlined function. Currently, this always happens by saving LR to the
5425   // stack. Thus, if we outline, say, half the parameters for a function call
5426   // plus the call, then we'll break the callee's expectations for the layout
5427   // of the stack.
5428   //
5429   // FIXME: Allow calls to functions which construct a stack frame, as long
5430   // as they don't access arguments on the stack.
5431   // FIXME: Figure out some way to analyze functions defined in other modules.
5432   // We should be able to compute the memory usage based on the IR calling
5433   // convention, even if we can't see the definition.
5434   if (MI.isCall()) {
5435     // Get the function associated with the call. Look at each operand and find
5436     // the one that represents the callee and get its name.
5437     const Function *Callee = nullptr;
5438     for (const MachineOperand &MOP : MI.operands()) {
5439       if (MOP.isGlobal()) {
5440         Callee = dyn_cast<Function>(MOP.getGlobal());
5441         break;
5442       }
5443     }
5444 
5445     // Never outline calls to mcount.  There isn't any rule that would require
5446     // this, but the Linux kernel's "ftrace" feature depends on it.
5447     if (Callee && Callee->getName() == "\01_mcount")
5448       return outliner::InstrType::Illegal;
5449 
5450     // If we don't know anything about the callee, assume it depends on the
5451     // stack layout of the caller. In that case, it's only legal to outline
5452     // as a tail-call.  Whitelist the call instructions we know about so we
5453     // don't get unexpected results with call pseudo-instructions.
5454     auto UnknownCallOutlineType = outliner::InstrType::Illegal;
5455     if (MI.getOpcode() == AArch64::BLR || MI.getOpcode() == AArch64::BL)
5456       UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
5457 
5458     if (!Callee)
5459       return UnknownCallOutlineType;
5460 
5461     // We have a function we have information about. Check if it's
5462     // something we can safely outline.
5463     MachineFunction *CalleeMF = MF->getMMI().getMachineFunction(*Callee);
5464 
5465     // We don't know what's going on with the callee at all. Don't touch it.
5466     if (!CalleeMF)
5467       return UnknownCallOutlineType;
5468 
5469     // Check if we know anything about the callee saves on the function. If we
5470     // don't, then don't touch it, since that implies that we haven't
5471     // computed anything about its stack frame yet.
5472     MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
5473     if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
5474         MFI.getNumObjects() > 0)
5475       return UnknownCallOutlineType;
5476 
5477     // At this point, we can say that CalleeMF ought to not pass anything on the
5478     // stack. Therefore, we can outline it.
5479     return outliner::InstrType::Legal;
5480   }
5481 
5482   // Don't outline positions.
5483   if (MI.isPosition())
5484     return outliner::InstrType::Illegal;
5485 
5486   // Don't touch the link register or W30.
5487   if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) ||
5488       MI.modifiesRegister(AArch64::W30, &getRegisterInfo()))
5489     return outliner::InstrType::Illegal;
5490 
5491   // Don't outline BTI instructions, because that will prevent the outlining
5492   // site from being indirectly callable.
5493   if (MI.getOpcode() == AArch64::HINT) {
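    // HINT #32, #34, #36 and #38 are the BTI, BTI c, BTI j and BTI jc forms.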
5494     int64_t Imm = MI.getOperand(0).getImm();
5495     if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38)
5496       return outliner::InstrType::Illegal;
5497   }
5498 
5499   return outliner::InstrType::Legal;
5500 }
5501 
5502 void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
5503   for (MachineInstr &MI : MBB) {
5504     const MachineOperand *Base;
5505     unsigned Width;
5506     int64_t Offset;
5507 
5508     // Is this a load or store with an immediate offset with SP as the base?
5509     if (!MI.mayLoadOrStore() ||
5510         !getMemOperandWithOffsetWidth(MI, Base, Offset, Width, &RI) ||
5511         (Base->isReg() && Base->getReg() != AArch64::SP))
5512       continue;
5513 
5514     // It is, so we have to fix it up.
5515     unsigned Scale;
5516     int64_t Dummy1, Dummy2;
5517 
5518     MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI);
5519     assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
5520     getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2);
5521     assert(Scale != 0 && "Unexpected opcode!");
5522 
5523     // We've pushed the return address to the stack, so add 16 to the offset.
5524     // This is safe, since we already checked if it would overflow when we
5525     // checked if this instruction was legal to outline.
5526     int64_t NewImm = (Offset + 16) / Scale;
5527     StackOffsetOperand.setImm(NewImm);
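    // Illustrative example: a 64-bit STRXui/LDRXui is scaled by 8, so an
    // access that was at [sp, #8] (encoded immediate 1) becomes an access at
    // [sp, #24], i.e. an encoded immediate of (8 + 16) / 8 == 3.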
5528   }
5529 }
5530 
5531 void AArch64InstrInfo::buildOutlinedFrame(
5532     MachineBasicBlock &MBB, MachineFunction &MF,
5533     const outliner::OutlinedFunction &OF) const {
5534   // For thunk outlining, rewrite the last instruction from a call to a
5535   // tail-call.
5536   if (OF.FrameConstructionID == MachineOutlinerThunk) {
5537     MachineInstr *Call = &*--MBB.instr_end();
5538     unsigned TailOpcode;
5539     if (Call->getOpcode() == AArch64::BL) {
5540       TailOpcode = AArch64::TCRETURNdi;
5541     } else {
5542       assert(Call->getOpcode() == AArch64::BLR);
5543       TailOpcode = AArch64::TCRETURNriALL;
5544     }
5545     MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
5546                             .add(Call->getOperand(0))
5547                             .addImm(0);
5548     MBB.insert(MBB.end(), TC);
5549     Call->eraseFromParent();
5550   }
5551 
5552   // Is there a call in the outlined range?
5553   auto IsNonTailCall = [](MachineInstr &MI) {
5554     return MI.isCall() && !MI.isReturn();
5555   };
5556   if (std::any_of(MBB.instr_begin(), MBB.instr_end(), IsNonTailCall)) {
5557     // Fix up the instructions in the range, since we're going to modify the
5558     // stack.
5559     assert(OF.FrameConstructionID != MachineOutlinerDefault &&
5560            "Can only fix up stack references once");
5561     fixupPostOutline(MBB);
5562 
5563     // LR has to be a live in so that we can save it.
5564     MBB.addLiveIn(AArch64::LR);
5565 
5566     MachineBasicBlock::iterator It = MBB.begin();
5567     MachineBasicBlock::iterator Et = MBB.end();
5568 
5569     if (OF.FrameConstructionID == MachineOutlinerTailCall ||
5570         OF.FrameConstructionID == MachineOutlinerThunk)
5571       Et = std::prev(MBB.end());
5572 
5573     // Insert a save before the outlined region
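    // (The STRXpre below is equivalent to "str x30, [sp, #-16]!"; the
    // matching LDRXpost inserted further down is "ldr x30, [sp], #16".)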
5574     MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
5575                                 .addReg(AArch64::SP, RegState::Define)
5576                                 .addReg(AArch64::LR)
5577                                 .addReg(AArch64::SP)
5578                                 .addImm(-16);
5579     It = MBB.insert(It, STRXpre);
5580 
5581     const TargetSubtargetInfo &STI = MF.getSubtarget();
5582     const MCRegisterInfo *MRI = STI.getRegisterInfo();
5583     unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true);
5584 
5585     // Add a CFI saying the stack was moved 16 B down.
5586     int64_t StackPosEntry =
5587         MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, 16));
5588     BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
5589         .addCFIIndex(StackPosEntry)
5590         .setMIFlags(MachineInstr::FrameSetup);
5591 
5592     // Add a CFI saying that the LR that we want to find is now 16 B higher than
5593     // before.
5594     int64_t LRPosEntry =
5595         MF.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfReg, 16));
5596     BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
5597         .addCFIIndex(LRPosEntry)
5598         .setMIFlags(MachineInstr::FrameSetup);
5599 
5600     // Insert a restore before the terminator for the function.
5601     MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
5602                                  .addReg(AArch64::SP, RegState::Define)
5603                                  .addReg(AArch64::LR, RegState::Define)
5604                                  .addReg(AArch64::SP)
5605                                  .addImm(16);
5606     Et = MBB.insert(Et, LDRXpost);
5607   }
5608 
5609   // If this is a tail call outlined function, then there's already a return.
5610   if (OF.FrameConstructionID == MachineOutlinerTailCall ||
5611       OF.FrameConstructionID == MachineOutlinerThunk)
5612     return;
5613 
5614   // It's not a tail call, so we have to insert the return ourselves.
5615   MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))
5616                           .addReg(AArch64::LR, RegState::Undef);
5617   MBB.insert(MBB.end(), ret);
5618 
5619   // Did we have to modify the stack by saving the link register?
5620   if (OF.FrameConstructionID != MachineOutlinerDefault)
5621     return;
5622 
5623   // We modified the stack.
5624   // Walk over the basic block and fix up all the stack accesses.
5625   fixupPostOutline(MBB);
5626 }
5627 
5628 MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
5629     Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
5630     MachineFunction &MF, const outliner::Candidate &C) const {
5631 
5632   // Are we tail calling?
5633   if (C.CallConstructionID == MachineOutlinerTailCall) {
5634     // If yes, then we can just branch to the label.
5635     It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi))
5636                             .addGlobalAddress(M.getNamedValue(MF.getName()))
5637                             .addImm(0));
5638     return It;
5639   }
5640 
5641   // Are we saving the link register?
5642   if (C.CallConstructionID == MachineOutlinerNoLRSave ||
5643       C.CallConstructionID == MachineOutlinerThunk) {
5644     // No, so just insert the call.
5645     It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
5646                             .addGlobalAddress(M.getNamedValue(MF.getName())));
5647     return It;
5648   }
5649 
5650   // We want to return the spot where we inserted the call.
5651   MachineBasicBlock::iterator CallPt;
5652 
5653   // Instructions for saving and restoring LR around the call instruction we're
5654   // going to insert.
5655   MachineInstr *Save;
5656   MachineInstr *Restore;
5657   // Can we save to a register?
5658   if (C.CallConstructionID == MachineOutlinerRegSave) {
5659     // FIXME: This logic should be sunk into a target-specific interface so that
5660     // we don't have to recompute the register.
5661     unsigned Reg = findRegisterToSaveLRTo(C);
5662     assert(Reg != 0 && "No callee-saved register available?");
5663 
5664     // Save and restore LR from that register.
5665     Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg)
5666                .addReg(AArch64::XZR)
5667                .addReg(AArch64::LR)
5668                .addImm(0);
5669     Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR)
5670                 .addReg(AArch64::XZR)
5671                 .addReg(Reg)
5672                 .addImm(0);
5673   } else {
5674     // We have the default case. Save and restore from SP.
5675     Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
5676                .addReg(AArch64::SP, RegState::Define)
5677                .addReg(AArch64::LR)
5678                .addReg(AArch64::SP)
5679                .addImm(-16);
5680     Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
5681                   .addReg(AArch64::SP, RegState::Define)
5682                   .addReg(AArch64::LR, RegState::Define)
5683                   .addReg(AArch64::SP)
5684                   .addImm(16);
5685   }
5686 
5687   It = MBB.insert(It, Save);
5688   It++;
5689 
5690   // Insert the call.
5691   It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
5692                           .addGlobalAddress(M.getNamedValue(MF.getName())));
5693   CallPt = It;
5694   It++;
5695 
5696   It = MBB.insert(It, Restore);
5697   return CallPt;
5698 }
5699 
5700 bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
5701   MachineFunction &MF) const {
5702   return MF.getFunction().hasMinSize();
5703 }
5704 
5705 bool AArch64InstrInfo::isCopyInstrImpl(
5706     const MachineInstr &MI, const MachineOperand *&Source,
5707     const MachineOperand *&Destination) const {
5708 
5709   // AArch64::ORRWrs and AArch64::ORRXrs with a WZR/XZR source register
5710   // and a zero shift immediate are used as aliases for the mov instruction.
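  // For example, "orr w0, wzr, w1" (shift amount 0) is the encoding of
  // "mov w0, w1", so it can be reported as a copy from w1 to w0.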
5711   if (MI.getOpcode() == AArch64::ORRWrs &&
5712       MI.getOperand(1).getReg() == AArch64::WZR &&
5713       MI.getOperand(3).getImm() == 0x0) {
5714     Destination = &MI.getOperand(0);
5715     Source = &MI.getOperand(2);
5716     return true;
5717   }
5718 
5719   if (MI.getOpcode() == AArch64::ORRXrs &&
5720       MI.getOperand(1).getReg() == AArch64::XZR &&
5721       MI.getOperand(3).getImm() == 0x0) {
5722     Destination = &MI.getOperand(0);
5723     Source = &MI.getOperand(2);
5724     return true;
5725   }
5726 
5727   return false;
5728 }
5729 
5730 #define GET_INSTRINFO_HELPERS
5731 #include "AArch64GenInstrInfo.inc"
5732