xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp (revision 77a1348b3c1cfe8547be49a121b56299a1e18b69)
1 //===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains the AArch64 implementation of the TargetInstrInfo class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "AArch64InstrInfo.h"
14 #include "AArch64MachineFunctionInfo.h"
15 #include "AArch64Subtarget.h"
16 #include "MCTargetDesc/AArch64AddressingModes.h"
17 #include "Utils/AArch64BaseInfo.h"
18 #include "llvm/ADT/ArrayRef.h"
19 #include "llvm/ADT/STLExtras.h"
20 #include "llvm/ADT/SmallVector.h"
21 #include "llvm/CodeGen/MachineBasicBlock.h"
22 #include "llvm/CodeGen/MachineFrameInfo.h"
23 #include "llvm/CodeGen/MachineFunction.h"
24 #include "llvm/CodeGen/MachineInstr.h"
25 #include "llvm/CodeGen/MachineInstrBuilder.h"
26 #include "llvm/CodeGen/MachineMemOperand.h"
27 #include "llvm/CodeGen/MachineOperand.h"
28 #include "llvm/CodeGen/MachineRegisterInfo.h"
29 #include "llvm/CodeGen/MachineModuleInfo.h"
30 #include "llvm/CodeGen/StackMaps.h"
31 #include "llvm/CodeGen/TargetRegisterInfo.h"
32 #include "llvm/CodeGen/TargetSubtargetInfo.h"
33 #include "llvm/IR/DebugInfoMetadata.h"
34 #include "llvm/IR/DebugLoc.h"
35 #include "llvm/IR/GlobalValue.h"
36 #include "llvm/MC/MCAsmInfo.h"
37 #include "llvm/MC/MCInst.h"
38 #include "llvm/MC/MCInstrDesc.h"
39 #include "llvm/Support/Casting.h"
40 #include "llvm/Support/CodeGen.h"
41 #include "llvm/Support/CommandLine.h"
42 #include "llvm/Support/Compiler.h"
43 #include "llvm/Support/ErrorHandling.h"
44 #include "llvm/Support/MathExtras.h"
45 #include "llvm/Target/TargetMachine.h"
46 #include "llvm/Target/TargetOptions.h"
47 #include <cassert>
48 #include <cstdint>
49 #include <iterator>
50 #include <utility>
51 
52 using namespace llvm;
53 
54 #define GET_INSTRINFO_CTOR_DTOR
55 #include "AArch64GenInstrInfo.inc"
56 
57 static cl::opt<unsigned> TBZDisplacementBits(
58     "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
59     cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
60 
61 static cl::opt<unsigned> CBZDisplacementBits(
62     "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
63     cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));
64 
65 static cl::opt<unsigned>
66     BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
67                         cl::desc("Restrict range of Bcc instructions (DEBUG)"));
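// For example (illustrative invocation only), the assumed TB[N]Z range can be
// narrowed when stress-testing branch relaxation by passing the hidden flag
// to llc:
//   llc -mtriple=aarch64 -aarch64-tbz-offset-bits=5 input.ll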
68 
69 AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
70     : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP,
71                           AArch64::CATCHRET),
72       RI(STI.getTargetTriple()), Subtarget(STI) {}
73 
74 /// getInstSizeInBytes - Return the number of bytes of code the specified
75 /// instruction may occupy. This returns the maximum number of bytes.
76 unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
77   const MachineBasicBlock &MBB = *MI.getParent();
78   const MachineFunction *MF = MBB.getParent();
79   const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
80 
81   {
82     auto Op = MI.getOpcode();
83     if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR)
84       return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
85   }
86 
87   // Meta-instructions emit no code.
88   if (MI.isMetaInstruction())
89     return 0;
90 
91   // FIXME: We currently only handle pseudoinstructions that don't get expanded
92   //        before the assembly printer.
93   unsigned NumBytes = 0;
94   const MCInstrDesc &Desc = MI.getDesc();
95   switch (Desc.getOpcode()) {
96   default:
97     // Anything not explicitly designated otherwise is a normal 4-byte insn.
98     NumBytes = 4;
99     break;
100   case TargetOpcode::STACKMAP:
101     // The upper bound for a stackmap intrinsic is the full length of its shadow
102     NumBytes = StackMapOpers(&MI).getNumPatchBytes();
103     assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
104     break;
105   case TargetOpcode::PATCHPOINT:
106     // The size of the patchpoint intrinsic is the number of bytes requested
107     NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
108     assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
109     break;
110   case AArch64::TLSDESC_CALLSEQ:
111     // This gets lowered to an instruction sequence which takes 16 bytes
112     NumBytes = 16;
113     break;
114   case AArch64::JumpTableDest32:
115   case AArch64::JumpTableDest16:
116   case AArch64::JumpTableDest8:
117     NumBytes = 12;
118     break;
119   case AArch64::SPACE:
120     NumBytes = MI.getOperand(1).getImm();
121     break;
122   }
123 
124   return NumBytes;
125 }
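// Worked example: a PATCHPOINT requesting 20 patch bytes reports 20 above,
// i.e. room for five 4-byte A64 instructions; the asserts reject any request
// that is not a multiple of the fixed instruction size.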
126 
127 static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
128                             SmallVectorImpl<MachineOperand> &Cond) {
129   // Block ends with fall-through condbranch.
130   switch (LastInst->getOpcode()) {
131   default:
132     llvm_unreachable("Unknown branch instruction?");
133   case AArch64::Bcc:
134     Target = LastInst->getOperand(1).getMBB();
135     Cond.push_back(LastInst->getOperand(0));
136     break;
137   case AArch64::CBZW:
138   case AArch64::CBZX:
139   case AArch64::CBNZW:
140   case AArch64::CBNZX:
141     Target = LastInst->getOperand(1).getMBB();
142     Cond.push_back(MachineOperand::CreateImm(-1));
143     Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
144     Cond.push_back(LastInst->getOperand(0));
145     break;
146   case AArch64::TBZW:
147   case AArch64::TBZX:
148   case AArch64::TBNZW:
149   case AArch64::TBNZX:
150     Target = LastInst->getOperand(2).getMBB();
151     Cond.push_back(MachineOperand::CreateImm(-1));
152     Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
153     Cond.push_back(LastInst->getOperand(0));
154     Cond.push_back(LastInst->getOperand(1));
155   }
156 }
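// Illustrative summary of the Cond layout produced by parseCondBranch, which
// the consumers below (instantiateCondBranch, insertSelect) rely on:
//   b.ne  <bb>          -> Cond = { NE }
//   cbz   w0, <bb>      -> Cond = { -1, AArch64::CBZW,  w0 }
//   tbnz  x1, #3, <bb>  -> Cond = { -1, AArch64::TBNZX, x1, 3 }
// The leading -1 marks a folded compare-and-branch rather than a plain Bcc.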
157 
158 static unsigned getBranchDisplacementBits(unsigned Opc) {
159   switch (Opc) {
160   default:
161     llvm_unreachable("unexpected opcode!");
162   case AArch64::B:
163     return 64;
164   case AArch64::TBNZW:
165   case AArch64::TBZW:
166   case AArch64::TBNZX:
167   case AArch64::TBZX:
168     return TBZDisplacementBits;
169   case AArch64::CBNZW:
170   case AArch64::CBZW:
171   case AArch64::CBNZX:
172   case AArch64::CBZX:
173     return CBZDisplacementBits;
174   case AArch64::Bcc:
175     return BCCDisplacementBits;
176   }
177 }
178 
179 bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp,
180                                              int64_t BrOffset) const {
181   unsigned Bits = getBranchDisplacementBits(BranchOp);
182   assert(Bits >= 3 && "max branch displacement must be enough to jump "
183                       "over conditional branch expansion");
184   return isIntN(Bits, BrOffset / 4);
185 }
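// Worked example of the range check: TB[N]Z encodes a 14-bit signed word
// offset, so with the default TBZDisplacementBits = 14 the reachable window is
// +/- 2^13 * 4 bytes = +/- 32 KiB; CB[N]Z and Bcc (19 bits) reach +/- 1 MiB.
// The unconditional B is treated as effectively unlimited (64 bits) above.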
186 
187 MachineBasicBlock *
188 AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
189   switch (MI.getOpcode()) {
190   default:
191     llvm_unreachable("unexpected opcode!");
192   case AArch64::B:
193     return MI.getOperand(0).getMBB();
194   case AArch64::TBZW:
195   case AArch64::TBNZW:
196   case AArch64::TBZX:
197   case AArch64::TBNZX:
198     return MI.getOperand(2).getMBB();
199   case AArch64::CBZW:
200   case AArch64::CBNZW:
201   case AArch64::CBZX:
202   case AArch64::CBNZX:
203   case AArch64::Bcc:
204     return MI.getOperand(1).getMBB();
205   }
206 }
207 
208 // Branch analysis.
209 bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
210                                      MachineBasicBlock *&TBB,
211                                      MachineBasicBlock *&FBB,
212                                      SmallVectorImpl<MachineOperand> &Cond,
213                                      bool AllowModify) const {
214   // If the block has no terminators, it just falls into the block after it.
215   MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
216   if (I == MBB.end())
217     return false;
218 
219   if (!isUnpredicatedTerminator(*I))
220     return false;
221 
222   // Get the last instruction in the block.
223   MachineInstr *LastInst = &*I;
224 
225   // If there is only one terminator instruction, process it.
226   unsigned LastOpc = LastInst->getOpcode();
227   if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
228     if (isUncondBranchOpcode(LastOpc)) {
229       TBB = LastInst->getOperand(0).getMBB();
230       return false;
231     }
232     if (isCondBranchOpcode(LastOpc)) {
233       // Block ends with fall-through condbranch.
234       parseCondBranch(LastInst, TBB, Cond);
235       return false;
236     }
237     return true; // Can't handle indirect branch.
238   }
239 
240   // Get the instruction before it if it is a terminator.
241   MachineInstr *SecondLastInst = &*I;
242   unsigned SecondLastOpc = SecondLastInst->getOpcode();
243 
244   // If AllowModify is true and the block ends with two or more unconditional
245   // branches, delete all but the first unconditional branch.
246   if (AllowModify && isUncondBranchOpcode(LastOpc)) {
247     while (isUncondBranchOpcode(SecondLastOpc)) {
248       LastInst->eraseFromParent();
249       LastInst = SecondLastInst;
250       LastOpc = LastInst->getOpcode();
251       if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
252         // Return now; the only terminator is an unconditional branch.
253         TBB = LastInst->getOperand(0).getMBB();
254         return false;
255       } else {
256         SecondLastInst = &*I;
257         SecondLastOpc = SecondLastInst->getOpcode();
258       }
259     }
260   }
261 
262   // If there are three terminators, we don't know what sort of block this is.
263   if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
264     return true;
265 
266   // If the block ends with a B and a Bcc, handle it.
267   if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
268     parseCondBranch(SecondLastInst, TBB, Cond);
269     FBB = LastInst->getOperand(0).getMBB();
270     return false;
271   }
272 
273   // If the block ends with two unconditional branches, handle it.  The second
274   // one is not executed, so remove it.
275   if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
276     TBB = SecondLastInst->getOperand(0).getMBB();
277     I = LastInst;
278     if (AllowModify)
279       I->eraseFromParent();
280     return false;
281   }
282 
283   // ...likewise if it ends with an indirect branch followed by an unconditional
284   // branch.
285   if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
286     I = LastInst;
287     if (AllowModify)
288       I->eraseFromParent();
289     return true;
290   }
291 
292   // Otherwise, can't handle this.
293   return true;
294 }
295 
296 bool AArch64InstrInfo::reverseBranchCondition(
297     SmallVectorImpl<MachineOperand> &Cond) const {
298   if (Cond[0].getImm() != -1) {
299     // Regular Bcc
300     AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
301     Cond[0].setImm(AArch64CC::getInvertedCondCode(CC));
302   } else {
303     // Folded compare-and-branch
304     switch (Cond[1].getImm()) {
305     default:
306       llvm_unreachable("Unknown conditional branch!");
307     case AArch64::CBZW:
308       Cond[1].setImm(AArch64::CBNZW);
309       break;
310     case AArch64::CBNZW:
311       Cond[1].setImm(AArch64::CBZW);
312       break;
313     case AArch64::CBZX:
314       Cond[1].setImm(AArch64::CBNZX);
315       break;
316     case AArch64::CBNZX:
317       Cond[1].setImm(AArch64::CBZX);
318       break;
319     case AArch64::TBZW:
320       Cond[1].setImm(AArch64::TBNZW);
321       break;
322     case AArch64::TBNZW:
323       Cond[1].setImm(AArch64::TBZW);
324       break;
325     case AArch64::TBZX:
326       Cond[1].setImm(AArch64::TBNZX);
327       break;
328     case AArch64::TBNZX:
329       Cond[1].setImm(AArch64::TBZX);
330       break;
331     }
332   }
333 
334   return false;
335 }
336 
337 unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB,
338                                         int *BytesRemoved) const {
339   MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
340   if (I == MBB.end())
341     return 0;
342 
343   if (!isUncondBranchOpcode(I->getOpcode()) &&
344       !isCondBranchOpcode(I->getOpcode()))
345     return 0;
346 
347   // Remove the branch.
348   I->eraseFromParent();
349 
350   I = MBB.end();
351 
352   if (I == MBB.begin()) {
353     if (BytesRemoved)
354       *BytesRemoved = 4;
355     return 1;
356   }
357   --I;
358   if (!isCondBranchOpcode(I->getOpcode())) {
359     if (BytesRemoved)
360       *BytesRemoved = 4;
361     return 1;
362   }
363 
364   // Remove the branch.
365   I->eraseFromParent();
366   if (BytesRemoved)
367     *BytesRemoved = 8;
368 
369   return 2;
370 }
371 
372 void AArch64InstrInfo::instantiateCondBranch(
373     MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB,
374     ArrayRef<MachineOperand> Cond) const {
375   if (Cond[0].getImm() != -1) {
376     // Regular Bcc
377     BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
378   } else {
379     // Folded compare-and-branch
380     // Note that we use add() instead of addReg() to keep the operand flags.
381     const MachineInstrBuilder MIB =
382         BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);
383     if (Cond.size() > 3)
384       MIB.addImm(Cond[3].getImm());
385     MIB.addMBB(TBB);
386   }
387 }
388 
389 unsigned AArch64InstrInfo::insertBranch(
390     MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
391     ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
392   // Shouldn't be a fall through.
393   assert(TBB && "insertBranch must not be told to insert a fallthrough");
394 
395   if (!FBB) {
396     if (Cond.empty()) // Unconditional branch?
397       BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
398     else
399       instantiateCondBranch(MBB, DL, TBB, Cond);
400 
401     if (BytesAdded)
402       *BytesAdded = 4;
403 
404     return 1;
405   }
406 
407   // Two-way conditional branch.
408   instantiateCondBranch(MBB, DL, TBB, Cond);
409   BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);
410 
411   if (BytesAdded)
412     *BytesAdded = 8;
413 
414   return 2;
415 }
416 
417 // Find the original register that VReg is copied from.
418 static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
419   while (Register::isVirtualRegister(VReg)) {
420     const MachineInstr *DefMI = MRI.getVRegDef(VReg);
421     if (!DefMI->isFullCopy())
422       return VReg;
423     VReg = DefMI->getOperand(1).getReg();
424   }
425   return VReg;
426 }
427 
428 // Determine if VReg is defined by an instruction that can be folded into a
429 // csel instruction. If so, return the folded opcode, and the replacement
430 // register.
431 static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
432                                 unsigned *NewVReg = nullptr) {
433   VReg = removeCopies(MRI, VReg);
434   if (!Register::isVirtualRegister(VReg))
435     return 0;
436 
437   bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
438   const MachineInstr *DefMI = MRI.getVRegDef(VReg);
439   unsigned Opc = 0;
440   unsigned SrcOpNum = 0;
441   switch (DefMI->getOpcode()) {
442   case AArch64::ADDSXri:
443   case AArch64::ADDSWri:
444     // if NZCV is used, do not fold.
445     if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
446       return 0;
447     // fall-through to ADDXri and ADDWri.
448     LLVM_FALLTHROUGH;
449   case AArch64::ADDXri:
450   case AArch64::ADDWri:
451     // add x, 1 -> csinc.
452     if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
453         DefMI->getOperand(3).getImm() != 0)
454       return 0;
455     SrcOpNum = 1;
456     Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
457     break;
458 
459   case AArch64::ORNXrr:
460   case AArch64::ORNWrr: {
461     // not x -> csinv, represented as orn dst, xzr, src.
462     unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
463     if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
464       return 0;
465     SrcOpNum = 2;
466     Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
467     break;
468   }
469 
470   case AArch64::SUBSXrr:
471   case AArch64::SUBSWrr:
472     // if NZCV is used, do not fold.
473     if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
474       return 0;
475     // fall-through to SUBXrr and SUBWrr.
476     LLVM_FALLTHROUGH;
477   case AArch64::SUBXrr:
478   case AArch64::SUBWrr: {
479     // neg x -> csneg, represented as sub dst, xzr, src.
480     unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
481     if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
482       return 0;
483     SrcOpNum = 2;
484     Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
485     break;
486   }
487   default:
488     return 0;
489   }
490   assert(Opc && SrcOpNum && "Missing parameters");
491 
492   if (NewVReg)
493     *NewVReg = DefMI->getOperand(SrcOpNum).getReg();
494   return Opc;
495 }
496 
497 bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
498                                        ArrayRef<MachineOperand> Cond,
499                                        unsigned TrueReg, unsigned FalseReg,
500                                        int &CondCycles, int &TrueCycles,
501                                        int &FalseCycles) const {
502   // Check register classes.
503   const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
504   const TargetRegisterClass *RC =
505       RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
506   if (!RC)
507     return false;
508 
509   // Expanding cbz/tbz requires an extra cycle of latency on the condition.
510   unsigned ExtraCondLat = Cond.size() != 1;
511 
512   // GPRs are handled by csel.
513   // FIXME: Fold in x+1, -x, and ~x when applicable.
514   if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
515       AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
516     // Single-cycle csel, csinc, csinv, and csneg.
517     CondCycles = 1 + ExtraCondLat;
518     TrueCycles = FalseCycles = 1;
519     if (canFoldIntoCSel(MRI, TrueReg))
520       TrueCycles = 0;
521     else if (canFoldIntoCSel(MRI, FalseReg))
522       FalseCycles = 0;
523     return true;
524   }
525 
526   // Scalar floating point is handled by fcsel.
527   // FIXME: Form fabs, fmin, and fmax when applicable.
528   if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
529       AArch64::FPR32RegClass.hasSubClassEq(RC)) {
530     CondCycles = 5 + ExtraCondLat;
531     TrueCycles = FalseCycles = 2;
532     return true;
533   }
534 
535   // Can't do vectors.
536   return false;
537 }
538 
539 void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
540                                     MachineBasicBlock::iterator I,
541                                     const DebugLoc &DL, unsigned DstReg,
542                                     ArrayRef<MachineOperand> Cond,
543                                     unsigned TrueReg, unsigned FalseReg) const {
544   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
545 
546   // Parse the condition code, see parseCondBranch() above.
547   AArch64CC::CondCode CC;
548   switch (Cond.size()) {
549   default:
550     llvm_unreachable("Unknown condition opcode in Cond");
551   case 1: // b.cc
552     CC = AArch64CC::CondCode(Cond[0].getImm());
553     break;
554   case 3: { // cbz/cbnz
555     // We must insert a compare against 0.
556     bool Is64Bit;
557     switch (Cond[1].getImm()) {
558     default:
559       llvm_unreachable("Unknown branch opcode in Cond");
560     case AArch64::CBZW:
561       Is64Bit = false;
562       CC = AArch64CC::EQ;
563       break;
564     case AArch64::CBZX:
565       Is64Bit = true;
566       CC = AArch64CC::EQ;
567       break;
568     case AArch64::CBNZW:
569       Is64Bit = false;
570       CC = AArch64CC::NE;
571       break;
572     case AArch64::CBNZX:
573       Is64Bit = true;
574       CC = AArch64CC::NE;
575       break;
576     }
577     Register SrcReg = Cond[2].getReg();
578     if (Is64Bit) {
579       // cmp reg, #0 is actually subs xzr, reg, #0.
580       MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
581       BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
582           .addReg(SrcReg)
583           .addImm(0)
584           .addImm(0);
585     } else {
586       MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
587       BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
588           .addReg(SrcReg)
589           .addImm(0)
590           .addImm(0);
591     }
592     break;
593   }
594   case 4: { // tbz/tbnz
595     // We must insert a tst instruction.
596     switch (Cond[1].getImm()) {
597     default:
598       llvm_unreachable("Unknown branch opcode in Cond");
599     case AArch64::TBZW:
600     case AArch64::TBZX:
601       CC = AArch64CC::EQ;
602       break;
603     case AArch64::TBNZW:
604     case AArch64::TBNZX:
605       CC = AArch64CC::NE;
606       break;
607     }
608     // tst reg, #(1 << foo) is actually ands xzr, reg, #(1 << foo).
609     if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
610       BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
611           .addReg(Cond[2].getReg())
612           .addImm(
613               AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
614     else
615       BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
616           .addReg(Cond[2].getReg())
617           .addImm(
618               AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
619     break;
620   }
621   }
622 
623   unsigned Opc = 0;
624   const TargetRegisterClass *RC = nullptr;
625   bool TryFold = false;
626   if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
627     RC = &AArch64::GPR64RegClass;
628     Opc = AArch64::CSELXr;
629     TryFold = true;
630   } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
631     RC = &AArch64::GPR32RegClass;
632     Opc = AArch64::CSELWr;
633     TryFold = true;
634   } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
635     RC = &AArch64::FPR64RegClass;
636     Opc = AArch64::FCSELDrrr;
637   } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
638     RC = &AArch64::FPR32RegClass;
639     Opc = AArch64::FCSELSrrr;
640   }
641   assert(RC && "Unsupported regclass");
642 
643   // Try folding simple instructions into the csel.
644   if (TryFold) {
645     unsigned NewVReg = 0;
646     unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
647     if (FoldedOpc) {
648       // The folded opcodes csinc, csinv and csneg apply the operation to
649       // FalseReg, so we need to invert the condition.
650       CC = AArch64CC::getInvertedCondCode(CC);
651       TrueReg = FalseReg;
652     } else
653       FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);
654 
655     // Fold the operation. Leave any dead instructions for DCE to clean up.
656     if (FoldedOpc) {
657       FalseReg = NewVReg;
658       Opc = FoldedOpc;
659       // This extends the live range of NewVReg.
660       MRI.clearKillFlags(NewVReg);
661     }
662   }
663 
664   // Pull all virtual registers into the appropriate class.
665   MRI.constrainRegClass(TrueReg, RC);
666   MRI.constrainRegClass(FalseReg, RC);
667 
668   // Insert the csel.
669   BuildMI(MBB, I, DL, get(Opc), DstReg)
670       .addReg(TrueReg)
671       .addReg(FalseReg)
672       .addImm(CC);
673 }
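// Worked example (illustrative, virtual register names are not from any real
// input): if TrueReg is defined by
//   %t = ADDWri %a, 1, 0            ; %t = %a + 1
// then canFoldIntoCSel returns CSINCWr with NewVReg = %a, the condition is
// inverted and the select is emitted as
//   %dst = CSINCWr %oldFalse, %a, invert(CC)
// which produces %a + 1 when the original condition holds and %oldFalse
// otherwise; the now-dead ADDWri is left for DCE.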
674 
675 /// Returns true if a MOVi32imm or MOVi64imm can be expanded to an ORRWri/ORRXri.
676 static bool canBeExpandedToORR(const MachineInstr &MI, unsigned BitSize) {
677   uint64_t Imm = MI.getOperand(1).getImm();
678   uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
679   uint64_t Encoding;
680   return AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding);
681 }
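// For example, MOVi32imm #0xff00ff00 is a valid logical immediate (a
// repeating 0xff00 pattern), so it can be materialized with a single ORRWri
// from WZR and is treated as cheap below; 0x12345678 is not encodable and
// would still need a MOVZ/MOVK sequence.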
682 
683 // FIXME: this implementation should be micro-architecture dependent, so a
684 // micro-architecture target hook should be introduced here in future.
685 bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
686   if (!Subtarget.hasCustomCheapAsMoveHandling())
687     return MI.isAsCheapAsAMove();
688 
689   const unsigned Opcode = MI.getOpcode();
690 
691   // Firstly, check cases gated by features.
692 
693   if (Subtarget.hasZeroCycleZeroingFP()) {
694     if (Opcode == AArch64::FMOVH0 ||
695         Opcode == AArch64::FMOVS0 ||
696         Opcode == AArch64::FMOVD0)
697       return true;
698   }
699 
700   if (Subtarget.hasZeroCycleZeroingGP()) {
701     if (Opcode == TargetOpcode::COPY &&
702         (MI.getOperand(1).getReg() == AArch64::WZR ||
703          MI.getOperand(1).getReg() == AArch64::XZR))
704       return true;
705   }
706 
707   // Secondly, check cases specific to sub-targets.
708 
709   if (Subtarget.hasExynosCheapAsMoveHandling()) {
710     if (isExynosCheapAsMove(MI))
711       return true;
712 
713     return MI.isAsCheapAsAMove();
714   }
715 
716   // Finally, check generic cases.
717 
718   switch (Opcode) {
719   default:
720     return false;
721 
722   // add/sub with an unshifted immediate
723   case AArch64::ADDWri:
724   case AArch64::ADDXri:
725   case AArch64::SUBWri:
726   case AArch64::SUBXri:
727     return (MI.getOperand(3).getImm() == 0);
728 
729   // logical ops on immediate
730   case AArch64::ANDWri:
731   case AArch64::ANDXri:
732   case AArch64::EORWri:
733   case AArch64::EORXri:
734   case AArch64::ORRWri:
735   case AArch64::ORRXri:
736     return true;
737 
738   // logical ops on register without shift
739   case AArch64::ANDWrr:
740   case AArch64::ANDXrr:
741   case AArch64::BICWrr:
742   case AArch64::BICXrr:
743   case AArch64::EONWrr:
744   case AArch64::EONXrr:
745   case AArch64::EORWrr:
746   case AArch64::EORXrr:
747   case AArch64::ORNWrr:
748   case AArch64::ORNXrr:
749   case AArch64::ORRWrr:
750   case AArch64::ORRXrr:
751     return true;
752 
753   // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
754   // ORRXri, it is as cheap as MOV
755   case AArch64::MOVi32imm:
756     return canBeExpandedToORR(MI, 32);
757   case AArch64::MOVi64imm:
758     return canBeExpandedToORR(MI, 64);
759   }
760 
761   llvm_unreachable("Unknown opcode to check as cheap as a move!");
762 }
763 
764 bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
765   switch (MI.getOpcode()) {
766   default:
767     return false;
768 
769   case AArch64::ADDWrs:
770   case AArch64::ADDXrs:
771   case AArch64::ADDSWrs:
772   case AArch64::ADDSXrs: {
773     unsigned Imm = MI.getOperand(3).getImm();
774     unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
775     if (ShiftVal == 0)
776       return true;
777     return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
778   }
779 
780   case AArch64::ADDWrx:
781   case AArch64::ADDXrx:
782   case AArch64::ADDXrx64:
783   case AArch64::ADDSWrx:
784   case AArch64::ADDSXrx:
785   case AArch64::ADDSXrx64: {
786     unsigned Imm = MI.getOperand(3).getImm();
787     switch (AArch64_AM::getArithExtendType(Imm)) {
788     default:
789       return false;
790     case AArch64_AM::UXTB:
791     case AArch64_AM::UXTH:
792     case AArch64_AM::UXTW:
793     case AArch64_AM::UXTX:
794       return AArch64_AM::getArithShiftValue(Imm) <= 4;
795     }
796   }
797 
798   case AArch64::SUBWrs:
799   case AArch64::SUBSWrs: {
800     unsigned Imm = MI.getOperand(3).getImm();
801     unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
802     return ShiftVal == 0 ||
803            (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
804   }
805 
806   case AArch64::SUBXrs:
807   case AArch64::SUBSXrs: {
808     unsigned Imm = MI.getOperand(3).getImm();
809     unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
810     return ShiftVal == 0 ||
811            (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
812   }
813 
814   case AArch64::SUBWrx:
815   case AArch64::SUBXrx:
816   case AArch64::SUBXrx64:
817   case AArch64::SUBSWrx:
818   case AArch64::SUBSXrx:
819   case AArch64::SUBSXrx64: {
820     unsigned Imm = MI.getOperand(3).getImm();
821     switch (AArch64_AM::getArithExtendType(Imm)) {
822     default:
823       return false;
824     case AArch64_AM::UXTB:
825     case AArch64_AM::UXTH:
826     case AArch64_AM::UXTW:
827     case AArch64_AM::UXTX:
828       return AArch64_AM::getArithShiftValue(Imm) == 0;
829     }
830   }
831 
832   case AArch64::LDRBBroW:
833   case AArch64::LDRBBroX:
834   case AArch64::LDRBroW:
835   case AArch64::LDRBroX:
836   case AArch64::LDRDroW:
837   case AArch64::LDRDroX:
838   case AArch64::LDRHHroW:
839   case AArch64::LDRHHroX:
840   case AArch64::LDRHroW:
841   case AArch64::LDRHroX:
842   case AArch64::LDRQroW:
843   case AArch64::LDRQroX:
844   case AArch64::LDRSBWroW:
845   case AArch64::LDRSBWroX:
846   case AArch64::LDRSBXroW:
847   case AArch64::LDRSBXroX:
848   case AArch64::LDRSHWroW:
849   case AArch64::LDRSHWroX:
850   case AArch64::LDRSHXroW:
851   case AArch64::LDRSHXroX:
852   case AArch64::LDRSWroW:
853   case AArch64::LDRSWroX:
854   case AArch64::LDRSroW:
855   case AArch64::LDRSroX:
856   case AArch64::LDRWroW:
857   case AArch64::LDRWroX:
858   case AArch64::LDRXroW:
859   case AArch64::LDRXroX:
860   case AArch64::PRFMroW:
861   case AArch64::PRFMroX:
862   case AArch64::STRBBroW:
863   case AArch64::STRBBroX:
864   case AArch64::STRBroW:
865   case AArch64::STRBroX:
866   case AArch64::STRDroW:
867   case AArch64::STRDroX:
868   case AArch64::STRHHroW:
869   case AArch64::STRHHroX:
870   case AArch64::STRHroW:
871   case AArch64::STRHroX:
872   case AArch64::STRQroW:
873   case AArch64::STRQroX:
874   case AArch64::STRSroW:
875   case AArch64::STRSroX:
876   case AArch64::STRWroW:
877   case AArch64::STRWroX:
878   case AArch64::STRXroW:
879   case AArch64::STRXroX: {
880     unsigned IsSigned = MI.getOperand(3).getImm();
881     return !IsSigned;
882   }
883   }
884 }
885 
886 bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
887   unsigned Opc = MI.getOpcode();
888   switch (Opc) {
889     default:
890       return false;
891     case AArch64::SEH_StackAlloc:
892     case AArch64::SEH_SaveFPLR:
893     case AArch64::SEH_SaveFPLR_X:
894     case AArch64::SEH_SaveReg:
895     case AArch64::SEH_SaveReg_X:
896     case AArch64::SEH_SaveRegP:
897     case AArch64::SEH_SaveRegP_X:
898     case AArch64::SEH_SaveFReg:
899     case AArch64::SEH_SaveFReg_X:
900     case AArch64::SEH_SaveFRegP:
901     case AArch64::SEH_SaveFRegP_X:
902     case AArch64::SEH_SetFP:
903     case AArch64::SEH_AddFP:
904     case AArch64::SEH_Nop:
905     case AArch64::SEH_PrologEnd:
906     case AArch64::SEH_EpilogStart:
907     case AArch64::SEH_EpilogEnd:
908       return true;
909   }
910 }
911 
912 bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
913                                              unsigned &SrcReg, unsigned &DstReg,
914                                              unsigned &SubIdx) const {
915   switch (MI.getOpcode()) {
916   default:
917     return false;
918   case AArch64::SBFMXri: // aka sxtw
919   case AArch64::UBFMXri: // aka uxtw
920     // Check for the 32 -> 64 bit extension case; these instructions can do
921     // much more.
922     if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
923       return false;
924     // This is a signed or unsigned 32 -> 64 bit extension.
925     SrcReg = MI.getOperand(1).getReg();
926     DstReg = MI.getOperand(0).getReg();
927     SubIdx = AArch64::sub_32;
928     return true;
929   }
930 }
931 
932 bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
933     const MachineInstr &MIa, const MachineInstr &MIb) const {
934   const TargetRegisterInfo *TRI = &getRegisterInfo();
935   const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
936   int64_t OffsetA = 0, OffsetB = 0;
937   unsigned WidthA = 0, WidthB = 0;
938 
939   assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
940   assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
941 
942   if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
943       MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
944     return false;
945 
946   // Retrieve the base, offset from the base, and width. Width is the size
947   // of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If the bases
948   // are identical, and the offset of the lower memory access plus its width
949   // does not reach the offset of the higher memory access, then the two
950   // accesses do not overlap.
951   if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, WidthA, TRI) &&
952       getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, WidthB, TRI)) {
953     if (BaseOpA->isIdenticalTo(*BaseOpB)) {
954       int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
955       int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
956       int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
957       if (LowOffset + LowWidth <= HighOffset)
958         return true;
959     }
960   }
961   return false;
962 }
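// Worked example: two stores off the same base register, one at offset 0 with
// width 8 and one at offset 8 with width 4, satisfy LowOffset + LowWidth
// (0 + 8) <= HighOffset (8) and are reported disjoint; had the second store
// been at offset 4, 0 + 8 > 4 and the conservative answer false is returned.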
963 
964 bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
965                                             const MachineBasicBlock *MBB,
966                                             const MachineFunction &MF) const {
967   if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
968     return true;
969   switch (MI.getOpcode()) {
970   case AArch64::HINT:
971     // CSDB hints are scheduling barriers.
972     if (MI.getOperand(0).getImm() == 0x14)
973       return true;
974     break;
975   case AArch64::DSB:
976   case AArch64::ISB:
977     // DSB and ISB also are scheduling barriers.
978     return true;
979   default:;
980   }
981   return isSEHInstruction(MI);
982 }
983 
984 /// analyzeCompare - For a comparison instruction, return the source registers
985 /// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
986 /// Return true if the comparison instruction can be analyzed.
987 bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
988                                       unsigned &SrcReg2, int &CmpMask,
989                                       int &CmpValue) const {
990   // The first operand can be a frame index where we'd normally expect a
991   // register.
992   assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
993   if (!MI.getOperand(1).isReg())
994     return false;
995 
996   switch (MI.getOpcode()) {
997   default:
998     break;
999   case AArch64::SUBSWrr:
1000   case AArch64::SUBSWrs:
1001   case AArch64::SUBSWrx:
1002   case AArch64::SUBSXrr:
1003   case AArch64::SUBSXrs:
1004   case AArch64::SUBSXrx:
1005   case AArch64::ADDSWrr:
1006   case AArch64::ADDSWrs:
1007   case AArch64::ADDSWrx:
1008   case AArch64::ADDSXrr:
1009   case AArch64::ADDSXrs:
1010   case AArch64::ADDSXrx:
1011     // optimizeCompareInstr may replace SUBSWrr with SUBWrr if NZCV is not used.
1012     SrcReg = MI.getOperand(1).getReg();
1013     SrcReg2 = MI.getOperand(2).getReg();
1014     CmpMask = ~0;
1015     CmpValue = 0;
1016     return true;
1017   case AArch64::SUBSWri:
1018   case AArch64::ADDSWri:
1019   case AArch64::SUBSXri:
1020   case AArch64::ADDSXri:
1021     SrcReg = MI.getOperand(1).getReg();
1022     SrcReg2 = 0;
1023     CmpMask = ~0;
1024     // FIXME: CmpValue is collapsed to 0 or 1 here rather than kept exact.
1025     CmpValue = MI.getOperand(2).getImm() != 0;
1026     return true;
1027   case AArch64::ANDSWri:
1028   case AArch64::ANDSXri:
1029     // ANDS does not use the same encoding scheme as the other xxxS
1030     // instructions.
1031     SrcReg = MI.getOperand(1).getReg();
1032     SrcReg2 = 0;
1033     CmpMask = ~0;
1034     // FIXME: The return value type of decodeLogicalImmediate is uint64_t,
1035     // while the type of CmpValue is int. When converting uint64_t to int,
1036     // the high 32 bits of uint64_t will be lost.
1037     // In fact it caused a bug in spec2006-483.xalancbmk.
1038     // CmpValue is only used to compare with zero in optimizeCompareInstr.
1039     CmpValue = AArch64_AM::decodeLogicalImmediate(
1040                    MI.getOperand(2).getImm(),
1041                    MI.getOpcode() == AArch64::ANDSWri ? 32 : 64) != 0;
1042     return true;
1043   }
1044 
1045   return false;
1046 }
1047 
1048 static bool UpdateOperandRegClass(MachineInstr &Instr) {
1049   MachineBasicBlock *MBB = Instr.getParent();
1050   assert(MBB && "Can't get MachineBasicBlock here");
1051   MachineFunction *MF = MBB->getParent();
1052   assert(MF && "Can't get MachineFunction here");
1053   const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
1054   const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
1055   MachineRegisterInfo *MRI = &MF->getRegInfo();
1056 
1057   for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
1058        ++OpIdx) {
1059     MachineOperand &MO = Instr.getOperand(OpIdx);
1060     const TargetRegisterClass *OpRegCstraints =
1061         Instr.getRegClassConstraint(OpIdx, TII, TRI);
1062 
1063     // If there's no constraint, there's nothing to do.
1064     if (!OpRegCstraints)
1065       continue;
1066     // If the operand is a frame index, there's nothing to do here.
1067     // A frame index operand will resolve correctly during PEI.
1068     if (MO.isFI())
1069       continue;
1070 
1071     assert(MO.isReg() &&
1072            "Operand has register constraints without being a register!");
1073 
1074     Register Reg = MO.getReg();
1075     if (Register::isPhysicalRegister(Reg)) {
1076       if (!OpRegCstraints->contains(Reg))
1077         return false;
1078     } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
1079                !MRI->constrainRegClass(Reg, OpRegCstraints))
1080       return false;
1081   }
1082 
1083   return true;
1084 }
1085 
1086 /// Return the opcode that does not set flags when possible - otherwise
1087 /// return the original opcode. The caller is responsible for doing the actual
1088 /// substitution and legality checking.
1089 static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) {
1090   // Don't convert all compare instructions, because for some of them the
1091   // encoding that would name the zero register names the sp register instead.
1092   bool MIDefinesZeroReg = false;
1093   if (MI.definesRegister(AArch64::WZR) || MI.definesRegister(AArch64::XZR))
1094     MIDefinesZeroReg = true;
1095 
1096   switch (MI.getOpcode()) {
1097   default:
1098     return MI.getOpcode();
1099   case AArch64::ADDSWrr:
1100     return AArch64::ADDWrr;
1101   case AArch64::ADDSWri:
1102     return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
1103   case AArch64::ADDSWrs:
1104     return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
1105   case AArch64::ADDSWrx:
1106     return AArch64::ADDWrx;
1107   case AArch64::ADDSXrr:
1108     return AArch64::ADDXrr;
1109   case AArch64::ADDSXri:
1110     return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
1111   case AArch64::ADDSXrs:
1112     return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
1113   case AArch64::ADDSXrx:
1114     return AArch64::ADDXrx;
1115   case AArch64::SUBSWrr:
1116     return AArch64::SUBWrr;
1117   case AArch64::SUBSWri:
1118     return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
1119   case AArch64::SUBSWrs:
1120     return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
1121   case AArch64::SUBSWrx:
1122     return AArch64::SUBWrx;
1123   case AArch64::SUBSXrr:
1124     return AArch64::SUBXrr;
1125   case AArch64::SUBSXri:
1126     return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
1127   case AArch64::SUBSXrs:
1128     return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
1129   case AArch64::SUBSXrx:
1130     return AArch64::SUBXrx;
1131   }
1132 }
1133 
1134 enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };
1135 
1136 /// True when condition flags are accessed (either by writing or reading)
1137 /// on the instruction trace starting at From and ending at To.
1138 ///
1139 /// Note: If From and To are from different blocks it's assumed the condition
1140 ///       flags are accessed on the path.
1141 static bool areCFlagsAccessedBetweenInstrs(
1142     MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
1143     const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
1144   // Early exit if To is at the beginning of the BB.
1145   if (To == To->getParent()->begin())
1146     return true;
1147 
1148   // Check whether the instructions are in the same basic block
1149   // If not, assume the condition flags might get modified somewhere.
1150   if (To->getParent() != From->getParent())
1151     return true;
1152 
1153   // From must be above To.
1154   assert(std::find_if(++To.getReverse(), To->getParent()->rend(),
1155                       [From](MachineInstr &MI) {
1156                         return MI.getIterator() == From;
1157                       }) != To->getParent()->rend());
1158 
1159   // We iterate backward starting at \p To until we hit \p From.
1160   for (--To; To != From; --To) {
1161     const MachineInstr &Instr = *To;
1162 
1163     if (((AccessToCheck & AK_Write) &&
1164          Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
1165         ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
1166       return true;
1167   }
1168   return false;
1169 }
1170 
1171 /// Try to optimize a compare instruction. A compare instruction is an
1172 /// instruction which produces AArch64::NZCV. It is a true compare
1173 /// instruction only when there are no uses of its destination
1174 /// register.
1175 ///
1176 /// The following steps are tried in order:
1177 /// 1. Convert CmpInstr into a non-flag-setting version if NZCV is not used.
1178 /// 2. Remove CmpInstr if above there is an instruction producing a needed
1179 ///    condition code or an instruction which can be converted into such an
1180 ///    instruction.
1181 ///    Only comparison with zero is supported.
1182 bool AArch64InstrInfo::optimizeCompareInstr(
1183     MachineInstr &CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask,
1184     int CmpValue, const MachineRegisterInfo *MRI) const {
1185   assert(CmpInstr.getParent());
1186   assert(MRI);
1187 
1188   // Replace SUBSWrr with SUBWrr if NZCV is not used.
1189   int DeadNZCVIdx = CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, true);
1190   if (DeadNZCVIdx != -1) {
1191     if (CmpInstr.definesRegister(AArch64::WZR) ||
1192         CmpInstr.definesRegister(AArch64::XZR)) {
1193       CmpInstr.eraseFromParent();
1194       return true;
1195     }
1196     unsigned Opc = CmpInstr.getOpcode();
1197     unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
1198     if (NewOpc == Opc)
1199       return false;
1200     const MCInstrDesc &MCID = get(NewOpc);
1201     CmpInstr.setDesc(MCID);
1202     CmpInstr.RemoveOperand(DeadNZCVIdx);
1203     bool succeeded = UpdateOperandRegClass(CmpInstr);
1204     (void)succeeded;
1205     assert(succeeded && "Some operands reg class are incompatible!");
1206     return true;
1207   }
1208 
1209   // Continue only if we have an "ri" form where the immediate is zero.
1210   // FIXME: CmpValue has already been converted to 0 or 1 in the
1211   // analyzeCompare function.
1212   assert((CmpValue == 0 || CmpValue == 1) && "CmpValue must be 0 or 1!");
1213   if (CmpValue != 0 || SrcReg2 != 0)
1214     return false;
1215 
1216   // CmpInstr is a compare instruction only if its destination register is unused.
1217   if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
1218     return false;
1219 
1220   return substituteCmpToZero(CmpInstr, SrcReg, MRI);
1221 }
1222 
1223 /// Get the opcode of the S (flag-setting) version of Instr.
1224 /// If Instr is already the S version, its opcode is returned.
1225 /// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have an S
1226 /// version or we are not interested in it.
1227 static unsigned sForm(MachineInstr &Instr) {
1228   switch (Instr.getOpcode()) {
1229   default:
1230     return AArch64::INSTRUCTION_LIST_END;
1231 
1232   case AArch64::ADDSWrr:
1233   case AArch64::ADDSWri:
1234   case AArch64::ADDSXrr:
1235   case AArch64::ADDSXri:
1236   case AArch64::SUBSWrr:
1237   case AArch64::SUBSWri:
1238   case AArch64::SUBSXrr:
1239   case AArch64::SUBSXri:
1240     return Instr.getOpcode();
1241 
1242   case AArch64::ADDWrr:
1243     return AArch64::ADDSWrr;
1244   case AArch64::ADDWri:
1245     return AArch64::ADDSWri;
1246   case AArch64::ADDXrr:
1247     return AArch64::ADDSXrr;
1248   case AArch64::ADDXri:
1249     return AArch64::ADDSXri;
1250   case AArch64::ADCWr:
1251     return AArch64::ADCSWr;
1252   case AArch64::ADCXr:
1253     return AArch64::ADCSXr;
1254   case AArch64::SUBWrr:
1255     return AArch64::SUBSWrr;
1256   case AArch64::SUBWri:
1257     return AArch64::SUBSWri;
1258   case AArch64::SUBXrr:
1259     return AArch64::SUBSXrr;
1260   case AArch64::SUBXri:
1261     return AArch64::SUBSXri;
1262   case AArch64::SBCWr:
1263     return AArch64::SBCSWr;
1264   case AArch64::SBCXr:
1265     return AArch64::SBCSXr;
1266   case AArch64::ANDWri:
1267     return AArch64::ANDSWri;
1268   case AArch64::ANDXri:
1269     return AArch64::ANDSXri;
1270   }
1271 }
1272 
1273 /// Check if AArch64::NZCV should be alive in successors of MBB.
1274 static bool areCFlagsAliveInSuccessors(MachineBasicBlock *MBB) {
1275   for (auto *BB : MBB->successors())
1276     if (BB->isLiveIn(AArch64::NZCV))
1277       return true;
1278   return false;
1279 }
1280 
1281 namespace {
1282 
1283 struct UsedNZCV {
1284   bool N = false;
1285   bool Z = false;
1286   bool C = false;
1287   bool V = false;
1288 
1289   UsedNZCV() = default;
1290 
1291   UsedNZCV &operator|=(const UsedNZCV &UsedFlags) {
1292     this->N |= UsedFlags.N;
1293     this->Z |= UsedFlags.Z;
1294     this->C |= UsedFlags.C;
1295     this->V |= UsedFlags.V;
1296     return *this;
1297   }
1298 };
1299 
1300 } // end anonymous namespace
1301 
1302 /// Find a condition code used by the instruction.
1303 /// Returns AArch64CC::Invalid if either the instruction does not use condition
1304 /// codes or we don't optimize CmpInstr in the presence of such instructions.
1305 static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
1306   switch (Instr.getOpcode()) {
1307   default:
1308     return AArch64CC::Invalid;
1309 
1310   case AArch64::Bcc: {
1311     int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
1312     assert(Idx >= 2);
1313     return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 2).getImm());
1314   }
1315 
1316   case AArch64::CSINVWr:
1317   case AArch64::CSINVXr:
1318   case AArch64::CSINCWr:
1319   case AArch64::CSINCXr:
1320   case AArch64::CSELWr:
1321   case AArch64::CSELXr:
1322   case AArch64::CSNEGWr:
1323   case AArch64::CSNEGXr:
1324   case AArch64::FCSELSrrr:
1325   case AArch64::FCSELDrrr: {
1326     int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
1327     assert(Idx >= 1);
1328     return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 1).getImm());
1329   }
1330   }
1331 }
1332 
1333 static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
1334   assert(CC != AArch64CC::Invalid);
1335   UsedNZCV UsedFlags;
1336   switch (CC) {
1337   default:
1338     break;
1339 
1340   case AArch64CC::EQ: // Z set
1341   case AArch64CC::NE: // Z clear
1342     UsedFlags.Z = true;
1343     break;
1344 
1345   case AArch64CC::HI: // Z clear and C set
1346   case AArch64CC::LS: // Z set   or  C clear
1347     UsedFlags.Z = true;
1348     LLVM_FALLTHROUGH;
1349   case AArch64CC::HS: // C set
1350   case AArch64CC::LO: // C clear
1351     UsedFlags.C = true;
1352     break;
1353 
1354   case AArch64CC::MI: // N set
1355   case AArch64CC::PL: // N clear
1356     UsedFlags.N = true;
1357     break;
1358 
1359   case AArch64CC::VS: // V set
1360   case AArch64CC::VC: // V clear
1361     UsedFlags.V = true;
1362     break;
1363 
1364   case AArch64CC::GT: // Z clear, N and V the same
1365   case AArch64CC::LE: // Z set,   N and V differ
1366     UsedFlags.Z = true;
1367     LLVM_FALLTHROUGH;
1368   case AArch64CC::GE: // N and V the same
1369   case AArch64CC::LT: // N and V differ
1370     UsedFlags.N = true;
1371     UsedFlags.V = true;
1372     break;
1373   }
1374   return UsedFlags;
1375 }
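// For example, a following b.hi reads both Z and C (the HI case above), so
// NZCVUsedAfterCmp in canInstrSubstituteCmpInstr below ends up with C set and
// the compare-with-zero elimination is rejected; a following b.eq reads only
// Z and does not block it.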
1376 
1377 static bool isADDSRegImm(unsigned Opcode) {
1378   return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
1379 }
1380 
1381 static bool isSUBSRegImm(unsigned Opcode) {
1382   return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
1383 }
1384 
1385 /// Check if CmpInstr can be substituted by MI.
1386 ///
1387 /// CmpInstr can be substituted:
1388 /// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
1389 /// - and, MI and CmpInstr are from the same MachineBB
1390 /// - and, condition flags are not alive in successors of the CmpInstr parent
1391 /// - and, if MI opcode is the S form there must be no defs of flags between
1392 ///        MI and CmpInstr
1393 ///        or if MI opcode is not the S form there must be neither defs of flags
1394 ///        nor uses of flags between MI and CmpInstr.
1395 /// - and  C/V flags are not used after CmpInstr
1396 static bool canInstrSubstituteCmpInstr(MachineInstr *MI, MachineInstr *CmpInstr,
1397                                        const TargetRegisterInfo *TRI) {
1398   assert(MI);
1399   assert(sForm(*MI) != AArch64::INSTRUCTION_LIST_END);
1400   assert(CmpInstr);
1401 
1402   const unsigned CmpOpcode = CmpInstr->getOpcode();
1403   if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
1404     return false;
1405 
1406   if (MI->getParent() != CmpInstr->getParent())
1407     return false;
1408 
1409   if (areCFlagsAliveInSuccessors(CmpInstr->getParent()))
1410     return false;
1411 
1412   AccessKind AccessToCheck = AK_Write;
1413   if (sForm(*MI) != MI->getOpcode())
1414     AccessToCheck = AK_All;
1415   if (areCFlagsAccessedBetweenInstrs(MI, CmpInstr, TRI, AccessToCheck))
1416     return false;
1417 
1418   UsedNZCV NZCVUsedAfterCmp;
1419   for (auto I = std::next(CmpInstr->getIterator()),
1420             E = CmpInstr->getParent()->instr_end();
1421        I != E; ++I) {
1422     const MachineInstr &Instr = *I;
1423     if (Instr.readsRegister(AArch64::NZCV, TRI)) {
1424       AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
1425       if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
1426         return false;
1427       NZCVUsedAfterCmp |= getUsedNZCV(CC);
1428     }
1429 
1430     if (Instr.modifiesRegister(AArch64::NZCV, TRI))
1431       break;
1432   }
1433 
1434   return !NZCVUsedAfterCmp.C && !NZCVUsedAfterCmp.V;
1435 }
1436 
1437 /// Substitute an instruction comparing to zero with another instruction
1438 /// which produces needed condition flags.
1439 ///
1440 /// Return true on success.
1441 bool AArch64InstrInfo::substituteCmpToZero(
1442     MachineInstr &CmpInstr, unsigned SrcReg,
1443     const MachineRegisterInfo *MRI) const {
1444   assert(MRI);
1445   // Get the unique definition of SrcReg.
1446   MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
1447   if (!MI)
1448     return false;
1449 
1450   const TargetRegisterInfo *TRI = &getRegisterInfo();
1451 
1452   unsigned NewOpc = sForm(*MI);
1453   if (NewOpc == AArch64::INSTRUCTION_LIST_END)
1454     return false;
1455 
1456   if (!canInstrSubstituteCmpInstr(MI, &CmpInstr, TRI))
1457     return false;
1458 
1459   // Update the instruction to set NZCV.
1460   MI->setDesc(get(NewOpc));
1461   CmpInstr.eraseFromParent();
1462   bool succeeded = UpdateOperandRegClass(*MI);
1463   (void)succeeded;
1464   assert(succeeded && "Some operands reg class are incompatible!");
1465   MI->addRegisterDefined(AArch64::NZCV, TRI);
1466   return true;
1467 }
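// Worked example (MIR-style sketch; register and block names are
// illustrative):
//   %v = SUBWrr %a, %b
//   ...                                  ; no NZCV accesses in between
//   dead %c = SUBSWri %v, 0, 0           ; CmpInstr, compares %v with zero
//   Bcc <eq>, %bb.taken, implicit $nzcv
// becomes, once canInstrSubstituteCmpInstr accepts it,
//   %v = SUBSWrr %a, %b, implicit-def $nzcv
//   Bcc <eq>, %bb.taken, implicit $nzcv
// i.e. the defining SUB is switched to its flag-setting form and the explicit
// compare is deleted.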
1468 
1469 bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
1470   if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
1471       MI.getOpcode() != AArch64::CATCHRET)
1472     return false;
1473 
1474   MachineBasicBlock &MBB = *MI.getParent();
1475   auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>();
1476   auto TRI = Subtarget.getRegisterInfo();
1477   DebugLoc DL = MI.getDebugLoc();
1478 
1479   if (MI.getOpcode() == AArch64::CATCHRET) {
1480     // Skip to the first instruction before the epilog.
1481     const TargetInstrInfo *TII =
1482       MBB.getParent()->getSubtarget().getInstrInfo();
1483     MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
1484     auto MBBI = MachineBasicBlock::iterator(MI);
1485     MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
1486     while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
1487            FirstEpilogSEH != MBB.begin())
1488       FirstEpilogSEH = std::prev(FirstEpilogSEH);
1489     if (FirstEpilogSEH != MBB.begin())
1490       FirstEpilogSEH = std::next(FirstEpilogSEH);
1491     BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
1492         .addReg(AArch64::X0, RegState::Define)
1493         .addMBB(TargetMBB);
1494     BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
1495         .addReg(AArch64::X0, RegState::Define)
1496         .addReg(AArch64::X0)
1497         .addMBB(TargetMBB)
1498         .addImm(0);
1499     return true;
1500   }
1501 
1502   Register Reg = MI.getOperand(0).getReg();
1503   const GlobalValue *GV =
1504       cast<GlobalValue>((*MI.memoperands_begin())->getValue());
1505   const TargetMachine &TM = MBB.getParent()->getTarget();
1506   unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
1507   const unsigned char MO_NC = AArch64II::MO_NC;
1508 
1509   if ((OpFlags & AArch64II::MO_GOT) != 0) {
1510     BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
1511         .addGlobalAddress(GV, 0, OpFlags);
1512     if (Subtarget.isTargetILP32()) {
1513       unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
1514       BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
1515           .addDef(Reg32, RegState::Dead)
1516           .addUse(Reg, RegState::Kill)
1517           .addImm(0)
1518           .addMemOperand(*MI.memoperands_begin())
1519           .addDef(Reg, RegState::Implicit);
1520     } else {
1521       BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
1522           .addReg(Reg, RegState::Kill)
1523           .addImm(0)
1524           .addMemOperand(*MI.memoperands_begin());
1525     }
1526   } else if (TM.getCodeModel() == CodeModel::Large) {
1527     assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?");
1528     BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
1529         .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC)
1530         .addImm(0);
1531     BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
1532         .addReg(Reg, RegState::Kill)
1533         .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC)
1534         .addImm(16);
1535     BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
1536         .addReg(Reg, RegState::Kill)
1537         .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC)
1538         .addImm(32);
1539     BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
1540         .addReg(Reg, RegState::Kill)
1541         .addGlobalAddress(GV, 0, AArch64II::MO_G3)
1542         .addImm(48);
1543     BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
1544         .addReg(Reg, RegState::Kill)
1545         .addImm(0)
1546         .addMemOperand(*MI.memoperands_begin());
1547   } else if (TM.getCodeModel() == CodeModel::Tiny) {
1548     BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg)
1549         .addGlobalAddress(GV, 0, OpFlags);
1550   } else {
1551     BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
1552         .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
1553     unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
1554     if (Subtarget.isTargetILP32()) {
1555       unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
1556       BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
1557           .addDef(Reg32, RegState::Dead)
1558           .addUse(Reg, RegState::Kill)
1559           .addGlobalAddress(GV, 0, LoFlags)
1560           .addMemOperand(*MI.memoperands_begin())
1561           .addDef(Reg, RegState::Implicit);
1562     } else {
1563       BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
1564           .addReg(Reg, RegState::Kill)
1565           .addGlobalAddress(GV, 0, LoFlags)
1566           .addMemOperand(*MI.memoperands_begin());
1567     }
1568   }
1569 
1570   MBB.erase(MI);
1571 
1572   return true;
1573 }
1574 
1575 // Return true if this instruction simply sets its single destination register
1576 // to zero. This is equivalent to a register rename of the zero-register.
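// For example, 'movz w0, #0', 'and x0, xzr, #N', and a COPY from WZR all
// qualify, per the cases handled below.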
1577 bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
1578   switch (MI.getOpcode()) {
1579   default:
1580     break;
1581   case AArch64::MOVZWi:
1582   case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
1583     if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
1584       assert(MI.getDesc().getNumOperands() == 3 &&
1585              MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
1586       return true;
1587     }
1588     break;
1589   case AArch64::ANDWri: // and Rd, Rzr, #imm
1590     return MI.getOperand(1).getReg() == AArch64::WZR;
1591   case AArch64::ANDXri:
1592     return MI.getOperand(1).getReg() == AArch64::XZR;
1593   case TargetOpcode::COPY:
1594     return MI.getOperand(1).getReg() == AArch64::WZR;
1595   }
1596   return false;
1597 }
1598 
1599 // Return true if this instruction simply renames a general register without
1600 // modifying bits.
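// For example, 'orr xd, xzr, xm (lsl #0)', 'add xd, xn, #0 (lsl #0)', and a
// COPY whose destination is a GPR32/GPR64 register all qualify.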
1601 bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
1602   switch (MI.getOpcode()) {
1603   default:
1604     break;
1605   case TargetOpcode::COPY: {
1606     // GPR32 copies will be lowered to ORRXrs
1607     Register DstReg = MI.getOperand(0).getReg();
1608     return (AArch64::GPR32RegClass.contains(DstReg) ||
1609             AArch64::GPR64RegClass.contains(DstReg));
1610   }
1611   case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
1612     if (MI.getOperand(1).getReg() == AArch64::XZR) {
1613       assert(MI.getDesc().getNumOperands() == 4 &&
1614              MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
1615       return true;
1616     }
1617     break;
1618   case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
1619     if (MI.getOperand(2).getImm() == 0) {
1620       assert(MI.getDesc().getNumOperands() == 4 &&
1621              MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
1622       return true;
1623     }
1624     break;
1625   }
1626   return false;
1627 }
1628 
1629 // Return true if this instruction simply renames a floating-point register
1630 // without modifying bits.
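// For example, 'orr vd.16b, vn.16b, vn.16b' (both source operands equal) and
// a COPY whose destination is an FPR64/FPR128 register qualify.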
1631 bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) {
1632   switch (MI.getOpcode()) {
1633   default:
1634     break;
1635   case TargetOpcode::COPY: {
1636     // FPR64 copies will be lowered to ORR.16b
1637     Register DstReg = MI.getOperand(0).getReg();
1638     return (AArch64::FPR64RegClass.contains(DstReg) ||
1639             AArch64::FPR128RegClass.contains(DstReg));
1640   }
1641   case AArch64::ORRv16i8:
1642     if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
1643       assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
1644              "invalid ORRv16i8 operands");
1645       return true;
1646     }
1647     break;
1648   }
1649   return false;
1650 }
1651 
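// If MI is a plain scaled load whose address is a frame index with a zero
// immediate offset, return the destination register and set FrameIndex;
// otherwise return 0.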
1652 unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
1653                                                int &FrameIndex) const {
1654   switch (MI.getOpcode()) {
1655   default:
1656     break;
1657   case AArch64::LDRWui:
1658   case AArch64::LDRXui:
1659   case AArch64::LDRBui:
1660   case AArch64::LDRHui:
1661   case AArch64::LDRSui:
1662   case AArch64::LDRDui:
1663   case AArch64::LDRQui:
1664     if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
1665         MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
1666       FrameIndex = MI.getOperand(1).getIndex();
1667       return MI.getOperand(0).getReg();
1668     }
1669     break;
1670   }
1671 
1672   return 0;
1673 }
1674 
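// If MI is a plain scaled store whose address is a frame index with a zero
// immediate offset, return the stored register and set FrameIndex; otherwise
// return 0.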
1675 unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
1676                                               int &FrameIndex) const {
1677   switch (MI.getOpcode()) {
1678   default:
1679     break;
1680   case AArch64::STRWui:
1681   case AArch64::STRXui:
1682   case AArch64::STRBui:
1683   case AArch64::STRHui:
1684   case AArch64::STRSui:
1685   case AArch64::STRDui:
1686   case AArch64::STRQui:
1687     if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
1688         MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
1689       FrameIndex = MI.getOperand(1).getIndex();
1690       return MI.getOperand(0).getReg();
1691     }
1692     break;
1693   }
1694   return 0;
1695 }
1696 
1697 /// Check all MachineMemOperands for a hint to suppress pairing.
1698 bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) {
1699   return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
1700     return MMO->getFlags() & MOSuppressPair;
1701   });
1702 }
1703 
1704 /// Set a flag on the first MachineMemOperand to suppress pairing.
1705 void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) {
1706   if (MI.memoperands_empty())
1707     return;
1708   (*MI.memoperands_begin())->setFlags(MOSuppressPair);
1709 }
1710 
1711 /// Check all MachineMemOperands for a hint that the load/store is strided.
1712 bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) {
1713   return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
1714     return MMO->getFlags() & MOStridedAccess;
1715   });
1716 }
1717 
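// Return true if Opc is one of the unscaled (register + 9-bit signed byte
// offset) LDUR*/STUR* load/store opcodes listed below.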
1718 bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) {
1719   switch (Opc) {
1720   default:
1721     return false;
1722   case AArch64::STURSi:
1723   case AArch64::STURDi:
1724   case AArch64::STURQi:
1725   case AArch64::STURBBi:
1726   case AArch64::STURHHi:
1727   case AArch64::STURWi:
1728   case AArch64::STURXi:
1729   case AArch64::LDURSi:
1730   case AArch64::LDURDi:
1731   case AArch64::LDURQi:
1732   case AArch64::LDURWi:
1733   case AArch64::LDURXi:
1734   case AArch64::LDURSWi:
1735   case AArch64::LDURHHi:
1736   case AArch64::LDURBBi:
1737   case AArch64::LDURSBWi:
1738   case AArch64::LDURSHWi:
1739     return true;
1740   }
1741 }
1742 
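// Map a scaled, unsigned-immediate load/store opcode to its unscaled
// counterpart, e.g. LDRXui -> LDURXi; returns None when there is no such
// mapping.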
1743 Optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) {
1744   switch (Opc) {
1745   default: return {};
1746   case AArch64::PRFMui: return AArch64::PRFUMi;
1747   case AArch64::LDRXui: return AArch64::LDURXi;
1748   case AArch64::LDRWui: return AArch64::LDURWi;
1749   case AArch64::LDRBui: return AArch64::LDURBi;
1750   case AArch64::LDRHui: return AArch64::LDURHi;
1751   case AArch64::LDRSui: return AArch64::LDURSi;
1752   case AArch64::LDRDui: return AArch64::LDURDi;
1753   case AArch64::LDRQui: return AArch64::LDURQi;
1754   case AArch64::LDRBBui: return AArch64::LDURBBi;
1755   case AArch64::LDRHHui: return AArch64::LDURHHi;
1756   case AArch64::LDRSBXui: return AArch64::LDURSBXi;
1757   case AArch64::LDRSBWui: return AArch64::LDURSBWi;
1758   case AArch64::LDRSHXui: return AArch64::LDURSHXi;
1759   case AArch64::LDRSHWui: return AArch64::LDURSHWi;
1760   case AArch64::LDRSWui: return AArch64::LDURSWi;
1761   case AArch64::STRXui: return AArch64::STURXi;
1762   case AArch64::STRWui: return AArch64::STURWi;
1763   case AArch64::STRBui: return AArch64::STURBi;
1764   case AArch64::STRHui: return AArch64::STURHi;
1765   case AArch64::STRSui: return AArch64::STURSi;
1766   case AArch64::STRDui: return AArch64::STURDi;
1767   case AArch64::STRQui: return AArch64::STURQi;
1768   case AArch64::STRBBui: return AArch64::STURBBi;
1769   case AArch64::STRHHui: return AArch64::STURHHi;
1770   }
1771 }
1772 
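// Return the operand index of the immediate offset: 3 for the paired and
// LDG/STGP forms listed below, 2 for everything else.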
1773 unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
1774   switch (Opc) {
1775   default:
1776     return 2;
1777   case AArch64::LDPXi:
1778   case AArch64::LDPDi:
1779   case AArch64::STPXi:
1780   case AArch64::STPDi:
1781   case AArch64::LDNPXi:
1782   case AArch64::LDNPDi:
1783   case AArch64::STNPXi:
1784   case AArch64::STNPDi:
1785   case AArch64::LDPQi:
1786   case AArch64::STPQi:
1787   case AArch64::LDNPQi:
1788   case AArch64::STNPQi:
1789   case AArch64::LDPWi:
1790   case AArch64::LDPSi:
1791   case AArch64::STPWi:
1792   case AArch64::STPSi:
1793   case AArch64::LDNPWi:
1794   case AArch64::LDNPSi:
1795   case AArch64::STNPWi:
1796   case AArch64::STNPSi:
1797   case AArch64::LDG:
1798   case AArch64::STGPi:
1799     return 3;
1800   case AArch64::ADDG:
1801   case AArch64::STGOffset:
1802     return 2;
1803   }
1804 }
1805 
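// Return true if MI uses one of the scaled or unscaled load/store opcodes
// below, which are the candidates for ldp/stp formation.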
1806 bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) {
1807   switch (MI.getOpcode()) {
1808   default:
1809     return false;
1810   // Scaled instructions.
1811   case AArch64::STRSui:
1812   case AArch64::STRDui:
1813   case AArch64::STRQui:
1814   case AArch64::STRXui:
1815   case AArch64::STRWui:
1816   case AArch64::LDRSui:
1817   case AArch64::LDRDui:
1818   case AArch64::LDRQui:
1819   case AArch64::LDRXui:
1820   case AArch64::LDRWui:
1821   case AArch64::LDRSWui:
1822   // Unscaled instructions.
1823   case AArch64::STURSi:
1824   case AArch64::STURDi:
1825   case AArch64::STURQi:
1826   case AArch64::STURWi:
1827   case AArch64::STURXi:
1828   case AArch64::LDURSi:
1829   case AArch64::LDURDi:
1830   case AArch64::LDURQi:
1831   case AArch64::LDURWi:
1832   case AArch64::LDURXi:
1833   case AArch64::LDURSWi:
1834     return true;
1835   }
1836 }
1837 
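// Map an ADD/SUB/AND/BIC opcode to its flag-setting (S-suffixed) equivalent
// and report via Is64Bit whether it operates on X registers, e.g.
// ADDWri -> ADDSWri with Is64Bit = false.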
1838 unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc,
1839                                                    bool &Is64Bit) {
1840   switch (Opc) {
1841   default:
1842     llvm_unreachable("Opcode has no flag setting equivalent!");
1843   // 32-bit cases:
1844   case AArch64::ADDWri:
1845     Is64Bit = false;
1846     return AArch64::ADDSWri;
1847   case AArch64::ADDWrr:
1848     Is64Bit = false;
1849     return AArch64::ADDSWrr;
1850   case AArch64::ADDWrs:
1851     Is64Bit = false;
1852     return AArch64::ADDSWrs;
1853   case AArch64::ADDWrx:
1854     Is64Bit = false;
1855     return AArch64::ADDSWrx;
1856   case AArch64::ANDWri:
1857     Is64Bit = false;
1858     return AArch64::ANDSWri;
1859   case AArch64::ANDWrr:
1860     Is64Bit = false;
1861     return AArch64::ANDSWrr;
1862   case AArch64::ANDWrs:
1863     Is64Bit = false;
1864     return AArch64::ANDSWrs;
1865   case AArch64::BICWrr:
1866     Is64Bit = false;
1867     return AArch64::BICSWrr;
1868   case AArch64::BICWrs:
1869     Is64Bit = false;
1870     return AArch64::BICSWrs;
1871   case AArch64::SUBWri:
1872     Is64Bit = false;
1873     return AArch64::SUBSWri;
1874   case AArch64::SUBWrr:
1875     Is64Bit = false;
1876     return AArch64::SUBSWrr;
1877   case AArch64::SUBWrs:
1878     Is64Bit = false;
1879     return AArch64::SUBSWrs;
1880   case AArch64::SUBWrx:
1881     Is64Bit = false;
1882     return AArch64::SUBSWrx;
1883   // 64-bit cases:
1884   case AArch64::ADDXri:
1885     Is64Bit = true;
1886     return AArch64::ADDSXri;
1887   case AArch64::ADDXrr:
1888     Is64Bit = true;
1889     return AArch64::ADDSXrr;
1890   case AArch64::ADDXrs:
1891     Is64Bit = true;
1892     return AArch64::ADDSXrs;
1893   case AArch64::ADDXrx:
1894     Is64Bit = true;
1895     return AArch64::ADDSXrx;
1896   case AArch64::ANDXri:
1897     Is64Bit = true;
1898     return AArch64::ANDSXri;
1899   case AArch64::ANDXrr:
1900     Is64Bit = true;
1901     return AArch64::ANDSXrr;
1902   case AArch64::ANDXrs:
1903     Is64Bit = true;
1904     return AArch64::ANDSXrs;
1905   case AArch64::BICXrr:
1906     Is64Bit = true;
1907     return AArch64::BICSXrr;
1908   case AArch64::BICXrs:
1909     Is64Bit = true;
1910     return AArch64::BICSXrs;
1911   case AArch64::SUBXri:
1912     Is64Bit = true;
1913     return AArch64::SUBSXri;
1914   case AArch64::SUBXrr:
1915     Is64Bit = true;
1916     return AArch64::SUBSXrr;
1917   case AArch64::SUBXrs:
1918     Is64Bit = true;
1919     return AArch64::SUBSXrs;
1920   case AArch64::SUBXrx:
1921     Is64Bit = true;
1922     return AArch64::SUBSXrx;
1923   }
1924 }
1925 
1926 // Is this a candidate for ld/st merging or pairing?  For example, we don't
1927 // touch volatiles or load/stores that have a hint to avoid pair formation.
1928 bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
1929   // If this is a volatile load/store, don't mess with it.
1930   if (MI.hasOrderedMemoryRef())
1931     return false;
1932 
1933   // Make sure this is a reg/fi+imm (as opposed to an address reloc).
1934   assert((MI.getOperand(1).isReg() || MI.getOperand(1).isFI()) &&
1935          "Expected a reg or frame index operand.");
1936   if (!MI.getOperand(2).isImm())
1937     return false;
1938 
1939   // Can't merge/pair if the instruction modifies the base register.
1940   // e.g., ldr x0, [x0]
1941   // This case will never occur with an FI base.
1942   if (MI.getOperand(1).isReg()) {
1943     Register BaseReg = MI.getOperand(1).getReg();
1944     const TargetRegisterInfo *TRI = &getRegisterInfo();
1945     if (MI.modifiesRegister(BaseReg, TRI))
1946       return false;
1947   }
1948 
1949   // Check if this load/store has a hint to avoid pair formation.
1950   // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
1951   if (isLdStPairSuppressed(MI))
1952     return false;
1953 
1954   // Do not pair any callee-save store/reload instructions in the
1955   // prologue/epilogue if the CFI information encoded the operations as separate
1956   // instructions, as that would make the size of the actual prologue differ
1957   // from the prologue size recorded in the Windows CFI.
1958   const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo();
1959   bool NeedsWinCFI = MAI->usesWindowsCFI() &&
1960                      MI.getMF()->getFunction().needsUnwindTableEntry();
1961   if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) ||
1962                       MI.getFlag(MachineInstr::FrameDestroy)))
1963     return false;
1964 
1965   // On some CPUs quad load/store pairs are slower than two single load/stores.
1966   if (Subtarget.isPaired128Slow()) {
1967     switch (MI.getOpcode()) {
1968     default:
1969       break;
1970     case AArch64::LDURQi:
1971     case AArch64::STURQi:
1972     case AArch64::LDRQui:
1973     case AArch64::STRQui:
1974       return false;
1975     }
1976   }
1977 
1978   return true;
1979 }
1980 
1981 bool AArch64InstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt,
1982                                           const MachineOperand *&BaseOp,
1983                                           int64_t &Offset,
1984                                           const TargetRegisterInfo *TRI) const {
1985   if (!LdSt.mayLoadOrStore())
1986     return false;
1987 
1988   unsigned Width;
1989   return getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, Width, TRI);
1990 }
1991 
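// Like getMemOperandWithOffset above, but also reports the access width in
// bytes and handles both single (base + imm) and paired (two regs, base +
// imm) addressing forms.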
1992 bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
1993     const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
1994     unsigned &Width, const TargetRegisterInfo *TRI) const {
1995   assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
1996   // Handle only loads/stores with base register followed by immediate offset.
1997   if (LdSt.getNumExplicitOperands() == 3) {
1998     // Non-paired instruction (e.g., ldr x1, [x0, #8]).
1999     if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) ||
2000         !LdSt.getOperand(2).isImm())
2001       return false;
2002   } else if (LdSt.getNumExplicitOperands() == 4) {
2003     // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
2004     if (!LdSt.getOperand(1).isReg() ||
2005         (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) ||
2006         !LdSt.getOperand(3).isImm())
2007       return false;
2008   } else
2009     return false;
2010 
2011   // Get the scaling factor for the instruction and set the width for the
2012   // instruction.
2013   unsigned Scale = 0;
2014   int64_t Dummy1, Dummy2;
2015 
2016   // If this returns false, then it's an instruction we don't want to handle.
2017   if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2))
2018     return false;
2019 
2020   // Compute the offset. Offset is calculated as the immediate operand
2021   // multiplied by the scaling factor. Unscaled instructions have scaling factor
2022   // set to 1.
2023   if (LdSt.getNumExplicitOperands() == 3) {
2024     BaseOp = &LdSt.getOperand(1);
2025     Offset = LdSt.getOperand(2).getImm() * Scale;
2026   } else {
2027     assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
2028     BaseOp = &LdSt.getOperand(2);
2029     Offset = LdSt.getOperand(3).getImm() * Scale;
2030   }
2031 
2032   if (!BaseOp->isReg() && !BaseOp->isFI())
2033     return false;
2034 
2035   return true;
2036 }
2037 
2038 MachineOperand &
2039 AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const {
2040   assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
2041   MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1);
2042   assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
2043   return OfsOp;
2044 }
2045 
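// For a load/store opcode, report the offset scaling factor (bytes per unit
// of the immediate), the access width in bytes, and the legal immediate
// range; returns false for opcodes it does not recognize.
// Illustrative use (values taken from the cases below):
//   unsigned Scale, Width;
//   int64_t MinOff, MaxOff;
//   if (AArch64InstrInfo::getMemOpInfo(AArch64::LDRXui, Scale, Width, MinOff,
//                                      MaxOff))
//     ; // Scale == 8, Width == 8, MinOff == 0, MaxOff == 4095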
2046 bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
2047                                     unsigned &Width, int64_t &MinOffset,
2048                                     int64_t &MaxOffset) {
2049   switch (Opcode) {
2050   // Not a memory operation, or not one we want to handle.
2051   default:
2052     Scale = Width = 0;
2053     MinOffset = MaxOffset = 0;
2054     return false;
2055   case AArch64::STRWpost:
2056   case AArch64::LDRWpost:
2057     Width = 32;
2058     Scale = 4;
2059     MinOffset = -256;
2060     MaxOffset = 255;
2061     break;
2062   case AArch64::LDURQi:
2063   case AArch64::STURQi:
2064     Width = 16;
2065     Scale = 1;
2066     MinOffset = -256;
2067     MaxOffset = 255;
2068     break;
2069   case AArch64::PRFUMi:
2070   case AArch64::LDURXi:
2071   case AArch64::LDURDi:
2072   case AArch64::STURXi:
2073   case AArch64::STURDi:
2074     Width = 8;
2075     Scale = 1;
2076     MinOffset = -256;
2077     MaxOffset = 255;
2078     break;
2079   case AArch64::LDURWi:
2080   case AArch64::LDURSi:
2081   case AArch64::LDURSWi:
2082   case AArch64::STURWi:
2083   case AArch64::STURSi:
2084     Width = 4;
2085     Scale = 1;
2086     MinOffset = -256;
2087     MaxOffset = 255;
2088     break;
2089   case AArch64::LDURHi:
2090   case AArch64::LDURHHi:
2091   case AArch64::LDURSHXi:
2092   case AArch64::LDURSHWi:
2093   case AArch64::STURHi:
2094   case AArch64::STURHHi:
2095     Width = 2;
2096     Scale = 1;
2097     MinOffset = -256;
2098     MaxOffset = 255;
2099     break;
2100   case AArch64::LDURBi:
2101   case AArch64::LDURBBi:
2102   case AArch64::LDURSBXi:
2103   case AArch64::LDURSBWi:
2104   case AArch64::STURBi:
2105   case AArch64::STURBBi:
2106     Width = 1;
2107     Scale = 1;
2108     MinOffset = -256;
2109     MaxOffset = 255;
2110     break;
2111   case AArch64::LDPQi:
2112   case AArch64::LDNPQi:
2113   case AArch64::STPQi:
2114   case AArch64::STNPQi:
2115     Scale = 16;
2116     Width = 32;
2117     MinOffset = -64;
2118     MaxOffset = 63;
2119     break;
2120   case AArch64::LDRQui:
2121   case AArch64::STRQui:
2122     Scale = Width = 16;
2123     MinOffset = 0;
2124     MaxOffset = 4095;
2125     break;
2126   case AArch64::LDPXi:
2127   case AArch64::LDPDi:
2128   case AArch64::LDNPXi:
2129   case AArch64::LDNPDi:
2130   case AArch64::STPXi:
2131   case AArch64::STPDi:
2132   case AArch64::STNPXi:
2133   case AArch64::STNPDi:
2134     Scale = 8;
2135     Width = 16;
2136     MinOffset = -64;
2137     MaxOffset = 63;
2138     break;
2139   case AArch64::PRFMui:
2140   case AArch64::LDRXui:
2141   case AArch64::LDRDui:
2142   case AArch64::STRXui:
2143   case AArch64::STRDui:
2144     Scale = Width = 8;
2145     MinOffset = 0;
2146     MaxOffset = 4095;
2147     break;
2148   case AArch64::LDPWi:
2149   case AArch64::LDPSi:
2150   case AArch64::LDNPWi:
2151   case AArch64::LDNPSi:
2152   case AArch64::STPWi:
2153   case AArch64::STPSi:
2154   case AArch64::STNPWi:
2155   case AArch64::STNPSi:
2156     Scale = 4;
2157     Width = 8;
2158     MinOffset = -64;
2159     MaxOffset = 63;
2160     break;
2161   case AArch64::LDRWui:
2162   case AArch64::LDRSui:
2163   case AArch64::LDRSWui:
2164   case AArch64::STRWui:
2165   case AArch64::STRSui:
2166     Scale = Width = 4;
2167     MinOffset = 0;
2168     MaxOffset = 4095;
2169     break;
2170   case AArch64::LDRHui:
2171   case AArch64::LDRHHui:
2172   case AArch64::LDRSHWui:
2173   case AArch64::LDRSHXui:
2174   case AArch64::STRHui:
2175   case AArch64::STRHHui:
2176     Scale = Width = 2;
2177     MinOffset = 0;
2178     MaxOffset = 4095;
2179     break;
2180   case AArch64::LDRBui:
2181   case AArch64::LDRBBui:
2182   case AArch64::LDRSBWui:
2183   case AArch64::LDRSBXui:
2184   case AArch64::STRBui:
2185   case AArch64::STRBBui:
2186     Scale = Width = 1;
2187     MinOffset = 0;
2188     MaxOffset = 4095;
2189     break;
2190   case AArch64::ADDG:
2191     Scale = 16;
2192     Width = 0;
2193     MinOffset = 0;
2194     MaxOffset = 63;
2195     break;
2196   case AArch64::TAGPstack:
2197     Scale = 16;
2198     Width = 0;
2199     // TAGP with a negative offset turns into SUBP, which has a maximum offset
2200     // of 63 (not 64!).
2201     MinOffset = -63;
2202     MaxOffset = 63;
2203     break;
2204   case AArch64::LDG:
2205   case AArch64::STGOffset:
2206   case AArch64::STZGOffset:
2207     Scale = Width = 16;
2208     MinOffset = -256;
2209     MaxOffset = 255;
2210     break;
2211   case AArch64::LDR_PXI:
2212   case AArch64::STR_PXI:
2213     Scale = Width = 2;
2214     MinOffset = -256;
2215     MaxOffset = 255;
2216     break;
2217   case AArch64::LDR_ZXI:
2218   case AArch64::STR_ZXI:
2219     Scale = Width = 16;
2220     MinOffset = -256;
2221     MaxOffset = 255;
2222     break;
2223   case AArch64::ST2GOffset:
2224   case AArch64::STZ2GOffset:
2225     Scale = 16;
2226     Width = 32;
2227     MinOffset = -256;
2228     MaxOffset = 255;
2229     break;
2230   case AArch64::STGPi:
2231     Scale = Width = 16;
2232     MinOffset = -64;
2233     MaxOffset = 63;
2234     break;
2235   }
2236 
2237   return true;
2238 }
2239 
2240 // Scaling factor (in bytes) for both scaled and unscaled loads and stores.
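// e.g. getMemScale(AArch64::LDRQui) == 16 and getMemScale(AArch64::STURWi) == 4.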
2241 int AArch64InstrInfo::getMemScale(unsigned Opc) {
2242   switch (Opc) {
2243   default:
2244     llvm_unreachable("Opcode has unknown scale!");
2245   case AArch64::LDRBBui:
2246   case AArch64::LDURBBi:
2247   case AArch64::LDRSBWui:
2248   case AArch64::LDURSBWi:
2249   case AArch64::STRBBui:
2250   case AArch64::STURBBi:
2251     return 1;
2252   case AArch64::LDRHHui:
2253   case AArch64::LDURHHi:
2254   case AArch64::LDRSHWui:
2255   case AArch64::LDURSHWi:
2256   case AArch64::STRHHui:
2257   case AArch64::STURHHi:
2258     return 2;
2259   case AArch64::LDRSui:
2260   case AArch64::LDURSi:
2261   case AArch64::LDRSWui:
2262   case AArch64::LDURSWi:
2263   case AArch64::LDRWui:
2264   case AArch64::LDURWi:
2265   case AArch64::STRSui:
2266   case AArch64::STURSi:
2267   case AArch64::STRWui:
2268   case AArch64::STURWi:
2269   case AArch64::LDPSi:
2270   case AArch64::LDPSWi:
2271   case AArch64::LDPWi:
2272   case AArch64::STPSi:
2273   case AArch64::STPWi:
2274     return 4;
2275   case AArch64::LDRDui:
2276   case AArch64::LDURDi:
2277   case AArch64::LDRXui:
2278   case AArch64::LDURXi:
2279   case AArch64::STRDui:
2280   case AArch64::STURDi:
2281   case AArch64::STRXui:
2282   case AArch64::STURXi:
2283   case AArch64::LDPDi:
2284   case AArch64::LDPXi:
2285   case AArch64::STPDi:
2286   case AArch64::STPXi:
2287     return 8;
2288   case AArch64::LDRQui:
2289   case AArch64::LDURQi:
2290   case AArch64::STRQui:
2291   case AArch64::STURQi:
2292   case AArch64::LDPQi:
2293   case AArch64::STPQi:
2294   case AArch64::STGOffset:
2295   case AArch64::STZGOffset:
2296   case AArch64::ST2GOffset:
2297   case AArch64::STZ2GOffset:
2298   case AArch64::STGPi:
2299     return 16;
2300   }
2301 }
2302 
2303 // Scale the unscaled offsets.  Returns false if the unscaled offset can't be
2304 // scaled.
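// e.g. for LDURXi (stride 8), a byte offset of 16 becomes an element offset
// of 2, while a byte offset of 12 cannot be scaled.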
2305 static bool scaleOffset(unsigned Opc, int64_t &Offset) {
2306   int Scale = AArch64InstrInfo::getMemScale(Opc);
2307 
2308   // If the byte-offset isn't a multiple of the stride, we can't scale this
2309   // offset.
2310   if (Offset % Scale != 0)
2311     return false;
2312 
2313   // Convert the byte-offset used by unscaled into an "element" offset used
2314   // by the scaled pair load/store instructions.
2315   Offset /= Scale;
2316   return true;
2317 }
2318 
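// Return true if loads/stores with these two opcodes may be paired: identical
// opcodes always can, and a 32-bit zero-extending load may pair with the
// corresponding sign-extending load (LDRWui/LDURWi with LDRSWui/LDURSWi).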
2319 static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
2320   if (FirstOpc == SecondOpc)
2321     return true;
2322   // We can also pair sign-ext and zero-ext instructions.
2323   switch (FirstOpc) {
2324   default:
2325     return false;
2326   case AArch64::LDRWui:
2327   case AArch64::LDURWi:
2328     return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
2329   case AArch64::LDRSWui:
2330   case AArch64::LDURSWi:
2331     return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
2332   }
2333   // These instructions can't be paired based on their opcodes.
2334   return false;
2335 }
2336 
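// Decide whether two frame-index based accesses should be clustered. For
// fixed stack objects the object offsets are folded in (after scaling) and
// the accesses must be adjacent; otherwise the frame indices must match.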
2337 static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
2338                             int64_t Offset1, unsigned Opcode1, int FI2,
2339                             int64_t Offset2, unsigned Opcode2) {
2340   // Accesses through fixed stack object frame indices may access a different
2341   // fixed stack slot. Check that the object offsets + offsets match.
2342   if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) {
2343     int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
2344     int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
2345     assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
2346     // Convert to scaled object offsets.
2347     int Scale1 = AArch64InstrInfo::getMemScale(Opcode1);
2348     if (ObjectOffset1 % Scale1 != 0)
2349       return false;
2350     ObjectOffset1 /= Scale1;
2351     int Scale2 = AArch64InstrInfo::getMemScale(Opcode2);
2352     if (ObjectOffset2 % Scale2 != 0)
2353       return false;
2354     ObjectOffset2 /= Scale2;
2355     ObjectOffset1 += Offset1;
2356     ObjectOffset2 += Offset2;
2357     return ObjectOffset1 + 1 == ObjectOffset2;
2358   }
2359 
2360   return FI1 == FI2;
2361 }
2362 
2363 /// Detect opportunities for ldp/stp formation.
2364 ///
2365 /// Only called for LdSt for which getMemOperandWithOffset returns true.
2366 bool AArch64InstrInfo::shouldClusterMemOps(const MachineOperand &BaseOp1,
2367                                            const MachineOperand &BaseOp2,
2368                                            unsigned NumLoads) const {
2369   const MachineInstr &FirstLdSt = *BaseOp1.getParent();
2370   const MachineInstr &SecondLdSt = *BaseOp2.getParent();
2371   if (BaseOp1.getType() != BaseOp2.getType())
2372     return false;
2373 
2374   assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
2375          "Only base registers and frame indices are supported.");
2376 
2377   // Check for both base regs and base FI.
2378   if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
2379     return false;
2380 
2381   // Only cluster up to a single pair.
2382   if (NumLoads > 1)
2383     return false;
2384 
2385   if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt))
2386     return false;
2387 
2388   // Can we pair these instructions based on their opcodes?
2389   unsigned FirstOpc = FirstLdSt.getOpcode();
2390   unsigned SecondOpc = SecondLdSt.getOpcode();
2391   if (!canPairLdStOpc(FirstOpc, SecondOpc))
2392     return false;
2393 
2394   // Can't merge volatiles or load/stores that have a hint to avoid pair
2395   // formation, for example.
2396   if (!isCandidateToMergeOrPair(FirstLdSt) ||
2397       !isCandidateToMergeOrPair(SecondLdSt))
2398     return false;
2399 
2400   // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
2401   int64_t Offset1 = FirstLdSt.getOperand(2).getImm();
2402   if (isUnscaledLdSt(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
2403     return false;
2404 
2405   int64_t Offset2 = SecondLdSt.getOperand(2).getImm();
2406   if (isUnscaledLdSt(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
2407     return false;
2408 
2409   // Pairwise instructions have a 7-bit signed offset field.
2410   if (Offset1 > 63 || Offset1 < -64)
2411     return false;
2412 
2413   // The caller should already have ordered First/SecondLdSt by offset.
2414   // Note: this need not hold when the frame index bases differ.
2415   if (BaseOp1.isFI()) {
2416     assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
2417            "Caller should have ordered offsets.");
2418 
2419     const MachineFrameInfo &MFI =
2420         FirstLdSt.getParent()->getParent()->getFrameInfo();
2421     return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc,
2422                            BaseOp2.getIndex(), Offset2, SecondOpc);
2423   }
2424 
2425   assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
2426 
2427   return Offset1 + 1 == Offset2;
2428 }
2429 
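// Append Reg (or, when SubIdx is non-zero, the corresponding sub-register)
// to MIB with the given register state. Virtual registers keep the
// sub-register index on the operand instead of being resolved here.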
2430 static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB,
2431                                             unsigned Reg, unsigned SubIdx,
2432                                             unsigned State,
2433                                             const TargetRegisterInfo *TRI) {
2434   if (!SubIdx)
2435     return MIB.addReg(Reg, State);
2436 
2437   if (Register::isPhysicalRegister(Reg))
2438     return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
2439   return MIB.addReg(Reg, State, SubIdx);
2440 }
2441 
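// Return true if copying a register tuple in ascending sub-register order
// would overwrite source registers that have not yet been read, i.e. the
// destination encoding range overlaps the source range from below.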
2442 static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
2443                                         unsigned NumRegs) {
2444   // We really want the positive remainder mod 32 here, that happens to be
2445   // We really want the positive remainder mod 32 here, which happens to be
2446   return ((DestReg - SrcReg) & 0x1f) < NumRegs;
2447 }
2448 
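// Copy a NEON register tuple (e.g. a DD or QQQQ sequence) one sub-register at
// a time with the given ORR opcode, iterating in reverse when a forward copy
// would clobber source sub-registers that are still to be read.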
2449 void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB,
2450                                         MachineBasicBlock::iterator I,
2451                                         const DebugLoc &DL, MCRegister DestReg,
2452                                         MCRegister SrcReg, bool KillSrc,
2453                                         unsigned Opcode,
2454                                         ArrayRef<unsigned> Indices) const {
2455   assert(Subtarget.hasNEON() && "Unexpected register copy without NEON");
2456   const TargetRegisterInfo *TRI = &getRegisterInfo();
2457   uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
2458   uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
2459   unsigned NumRegs = Indices.size();
2460 
2461   int SubReg = 0, End = NumRegs, Incr = 1;
2462   if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) {
2463     SubReg = NumRegs - 1;
2464     End = -1;
2465     Incr = -1;
2466   }
2467 
2468   for (; SubReg != End; SubReg += Incr) {
2469     const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
2470     AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
2471     AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI);
2472     AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
2473   }
2474 }
2475 
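// Copy a GPR register pair one sub-register at a time using
// 'Opcode Dst, ZeroReg, Src, lsl #0' (ORRWrs/ORRXrs). The encodings are
// asserted to be aligned, so a simple forward walk cannot clobber the source.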
2476 void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB,
2477                                        MachineBasicBlock::iterator I,
2478                                        DebugLoc DL, unsigned DestReg,
2479                                        unsigned SrcReg, bool KillSrc,
2480                                        unsigned Opcode, unsigned ZeroReg,
2481                                        llvm::ArrayRef<unsigned> Indices) const {
2482   const TargetRegisterInfo *TRI = &getRegisterInfo();
2483   unsigned NumRegs = Indices.size();
2484 
2485 #ifndef NDEBUG
2486   uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
2487   uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
2488   assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 &&
2489          "GPR reg sequences should not be able to overlap");
2490 #endif
2491 
2492   for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) {
2493     const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
2494     AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
2495     MIB.addReg(ZeroReg);
2496     AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
2497     MIB.addImm(0);
2498   }
2499 }
2500 
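// Emit a register-to-register copy from SrcReg to DestReg, selecting the
// sequence from the register classes involved: ORR moves for GPR and NEON
// registers, ADD #0 when SP/WSP is involved, FMOV between GPRs and FPRs, and
// MRS/MSR for NZCV.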
2501 void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
2502                                    MachineBasicBlock::iterator I,
2503                                    const DebugLoc &DL, MCRegister DestReg,
2504                                    MCRegister SrcReg, bool KillSrc) const {
2505   if (AArch64::GPR32spRegClass.contains(DestReg) &&
2506       (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) {
2507     const TargetRegisterInfo *TRI = &getRegisterInfo();
2508 
2509     if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
2510       // If either operand is WSP, expand to ADD #0.
2511       if (Subtarget.hasZeroCycleRegMove()) {
2512         // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
2513         MCRegister DestRegX = TRI->getMatchingSuperReg(
2514             DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
2515         MCRegister SrcRegX = TRI->getMatchingSuperReg(
2516             SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
2517         // This instruction is reading and writing X registers.  This may upset
2518         // the register scavenger and machine verifier, so we need to indicate
2519         // that we are reading an undefined value from SrcRegX, but a proper
2520         // value from SrcReg.
2521         BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX)
2522             .addReg(SrcRegX, RegState::Undef)
2523             .addImm(0)
2524             .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
2525             .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
2526       } else {
2527         BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg)
2528             .addReg(SrcReg, getKillRegState(KillSrc))
2529             .addImm(0)
2530             .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
2531       }
2532     } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) {
2533       BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
2534           .addImm(0)
2535           .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
2536     } else {
2537       if (Subtarget.hasZeroCycleRegMove()) {
2538         // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
2539         MCRegister DestRegX = TRI->getMatchingSuperReg(
2540             DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
2541         MCRegister SrcRegX = TRI->getMatchingSuperReg(
2542             SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
2543         // This instruction is reading and writing X registers.  This may upset
2544         // the register scavenger and machine verifier, so we need to indicate
2545         // that we are reading an undefined value from SrcRegX, but a proper
2546         // value from SrcReg.
2547         BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX)
2548             .addReg(AArch64::XZR)
2549             .addReg(SrcRegX, RegState::Undef)
2550             .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
2551       } else {
2552         // Otherwise, expand to ORR WZR.
2553         BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
2554             .addReg(AArch64::WZR)
2555             .addReg(SrcReg, getKillRegState(KillSrc));
2556       }
2557     }
2558     return;
2559   }
2560 
2561   // Copy a Predicate register by ORRing with itself.
2562   if (AArch64::PPRRegClass.contains(DestReg) &&
2563       AArch64::PPRRegClass.contains(SrcReg)) {
2564     assert(Subtarget.hasSVE() && "Unexpected SVE register.");
2565     BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg)
2566       .addReg(SrcReg) // Pg
2567       .addReg(SrcReg)
2568       .addReg(SrcReg, getKillRegState(KillSrc));
2569     return;
2570   }
2571 
2572   // Copy a Z register by ORRing with itself.
2573   if (AArch64::ZPRRegClass.contains(DestReg) &&
2574       AArch64::ZPRRegClass.contains(SrcReg)) {
2575     assert(Subtarget.hasSVE() && "Unexpected SVE register.");
2576     BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg)
2577       .addReg(SrcReg)
2578       .addReg(SrcReg, getKillRegState(KillSrc));
2579     return;
2580   }
2581 
2582   if (AArch64::GPR64spRegClass.contains(DestReg) &&
2583       (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) {
2584     if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
2585       // If either operand is SP, expand to ADD #0.
2586       BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
2587           .addReg(SrcReg, getKillRegState(KillSrc))
2588           .addImm(0)
2589           .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
2590     } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) {
2591       BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
2592           .addImm(0)
2593           .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
2594     } else {
2595       // Otherwise, expand to ORR XZR.
2596       BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
2597           .addReg(AArch64::XZR)
2598           .addReg(SrcReg, getKillRegState(KillSrc));
2599     }
2600     return;
2601   }
2602 
2603   // Copy a DDDD register quad by copying the individual sub-registers.
2604   if (AArch64::DDDDRegClass.contains(DestReg) &&
2605       AArch64::DDDDRegClass.contains(SrcReg)) {
2606     static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
2607                                        AArch64::dsub2, AArch64::dsub3};
2608     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
2609                      Indices);
2610     return;
2611   }
2612 
2613   // Copy a DDD register triple by copying the individual sub-registers.
2614   if (AArch64::DDDRegClass.contains(DestReg) &&
2615       AArch64::DDDRegClass.contains(SrcReg)) {
2616     static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
2617                                        AArch64::dsub2};
2618     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
2619                      Indices);
2620     return;
2621   }
2622 
2623   // Copy a DD register pair by copying the individual sub-registers.
2624   if (AArch64::DDRegClass.contains(DestReg) &&
2625       AArch64::DDRegClass.contains(SrcReg)) {
2626     static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1};
2627     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
2628                      Indices);
2629     return;
2630   }
2631 
2632   // Copy a QQQQ register quad by copying the individual sub-registers.
2633   if (AArch64::QQQQRegClass.contains(DestReg) &&
2634       AArch64::QQQQRegClass.contains(SrcReg)) {
2635     static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
2636                                        AArch64::qsub2, AArch64::qsub3};
2637     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
2638                      Indices);
2639     return;
2640   }
2641 
2642   // Copy a QQQ register triple by copying the individual sub-registers.
2643   if (AArch64::QQQRegClass.contains(DestReg) &&
2644       AArch64::QQQRegClass.contains(SrcReg)) {
2645     static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
2646                                        AArch64::qsub2};
2647     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
2648                      Indices);
2649     return;
2650   }
2651 
2652   // Copy a QQ register pair by copying the individual sub-registers.
2653   if (AArch64::QQRegClass.contains(DestReg) &&
2654       AArch64::QQRegClass.contains(SrcReg)) {
2655     static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1};
2656     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
2657                      Indices);
2658     return;
2659   }
2660 
2661   if (AArch64::XSeqPairsClassRegClass.contains(DestReg) &&
2662       AArch64::XSeqPairsClassRegClass.contains(SrcReg)) {
2663     static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};
2664     copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs,
2665                     AArch64::XZR, Indices);
2666     return;
2667   }
2668 
2669   if (AArch64::WSeqPairsClassRegClass.contains(DestReg) &&
2670       AArch64::WSeqPairsClassRegClass.contains(SrcReg)) {
2671     static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};
2672     copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs,
2673                     AArch64::WZR, Indices);
2674     return;
2675   }
2676 
2677   if (AArch64::FPR128RegClass.contains(DestReg) &&
2678       AArch64::FPR128RegClass.contains(SrcReg)) {
2679     if (Subtarget.hasNEON()) {
2680       BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2681           .addReg(SrcReg)
2682           .addReg(SrcReg, getKillRegState(KillSrc));
2683     } else {
2684       BuildMI(MBB, I, DL, get(AArch64::STRQpre))
2685           .addReg(AArch64::SP, RegState::Define)
2686           .addReg(SrcReg, getKillRegState(KillSrc))
2687           .addReg(AArch64::SP)
2688           .addImm(-16);
2689       BuildMI(MBB, I, DL, get(AArch64::LDRQpre))
2690           .addReg(AArch64::SP, RegState::Define)
2691           .addReg(DestReg, RegState::Define)
2692           .addReg(AArch64::SP)
2693           .addImm(16);
2694     }
2695     return;
2696   }
2697 
2698   if (AArch64::FPR64RegClass.contains(DestReg) &&
2699       AArch64::FPR64RegClass.contains(SrcReg)) {
2700     if (Subtarget.hasNEON()) {
2701       DestReg = RI.getMatchingSuperReg(DestReg, AArch64::dsub,
2702                                        &AArch64::FPR128RegClass);
2703       SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::dsub,
2704                                       &AArch64::FPR128RegClass);
2705       BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2706           .addReg(SrcReg)
2707           .addReg(SrcReg, getKillRegState(KillSrc));
2708     } else {
2709       BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
2710           .addReg(SrcReg, getKillRegState(KillSrc));
2711     }
2712     return;
2713   }
2714 
2715   if (AArch64::FPR32RegClass.contains(DestReg) &&
2716       AArch64::FPR32RegClass.contains(SrcReg)) {
2717     if (Subtarget.hasNEON()) {
2718       DestReg = RI.getMatchingSuperReg(DestReg, AArch64::ssub,
2719                                        &AArch64::FPR128RegClass);
2720       SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::ssub,
2721                                       &AArch64::FPR128RegClass);
2722       BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2723           .addReg(SrcReg)
2724           .addReg(SrcReg, getKillRegState(KillSrc));
2725     } else {
2726       BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
2727           .addReg(SrcReg, getKillRegState(KillSrc));
2728     }
2729     return;
2730   }
2731 
2732   if (AArch64::FPR16RegClass.contains(DestReg) &&
2733       AArch64::FPR16RegClass.contains(SrcReg)) {
2734     if (Subtarget.hasNEON()) {
2735       DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
2736                                        &AArch64::FPR128RegClass);
2737       SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
2738                                       &AArch64::FPR128RegClass);
2739       BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2740           .addReg(SrcReg)
2741           .addReg(SrcReg, getKillRegState(KillSrc));
2742     } else {
2743       DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
2744                                        &AArch64::FPR32RegClass);
2745       SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
2746                                       &AArch64::FPR32RegClass);
2747       BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
2748           .addReg(SrcReg, getKillRegState(KillSrc));
2749     }
2750     return;
2751   }
2752 
2753   if (AArch64::FPR8RegClass.contains(DestReg) &&
2754       AArch64::FPR8RegClass.contains(SrcReg)) {
2755     if (Subtarget.hasNEON()) {
2756       DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
2757                                        &AArch64::FPR128RegClass);
2758       SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
2759                                       &AArch64::FPR128RegClass);
2760       BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2761           .addReg(SrcReg)
2762           .addReg(SrcReg, getKillRegState(KillSrc));
2763     } else {
2764       DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
2765                                        &AArch64::FPR32RegClass);
2766       SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
2767                                       &AArch64::FPR32RegClass);
2768       BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
2769           .addReg(SrcReg, getKillRegState(KillSrc));
2770     }
2771     return;
2772   }
2773 
2774   // Copies between GPR64 and FPR64.
2775   if (AArch64::FPR64RegClass.contains(DestReg) &&
2776       AArch64::GPR64RegClass.contains(SrcReg)) {
2777     BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
2778         .addReg(SrcReg, getKillRegState(KillSrc));
2779     return;
2780   }
2781   if (AArch64::GPR64RegClass.contains(DestReg) &&
2782       AArch64::FPR64RegClass.contains(SrcReg)) {
2783     BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
2784         .addReg(SrcReg, getKillRegState(KillSrc));
2785     return;
2786   }
2787   // Copies between GPR32 and FPR32.
2788   if (AArch64::FPR32RegClass.contains(DestReg) &&
2789       AArch64::GPR32RegClass.contains(SrcReg)) {
2790     BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
2791         .addReg(SrcReg, getKillRegState(KillSrc));
2792     return;
2793   }
2794   if (AArch64::GPR32RegClass.contains(DestReg) &&
2795       AArch64::FPR32RegClass.contains(SrcReg)) {
2796     BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
2797         .addReg(SrcReg, getKillRegState(KillSrc));
2798     return;
2799   }
2800 
2801   if (DestReg == AArch64::NZCV) {
2802     assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
2803     BuildMI(MBB, I, DL, get(AArch64::MSR))
2804         .addImm(AArch64SysReg::NZCV)
2805         .addReg(SrcReg, getKillRegState(KillSrc))
2806         .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define);
2807     return;
2808   }
2809 
2810   if (SrcReg == AArch64::NZCV) {
2811     assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
2812     BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg)
2813         .addImm(AArch64SysReg::NZCV)
2814         .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
2815     return;
2816   }
2817 
2818   llvm_unreachable("unimplemented reg-to-reg copy");
2819 }
2820 
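// Spill the two halves of a sequential register pair to frame index FI with a
// single store-pair instruction (STPWi or STPXi at the call sites below).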
2821 static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI,
2822                                     MachineBasicBlock &MBB,
2823                                     MachineBasicBlock::iterator InsertBefore,
2824                                     const MCInstrDesc &MCID,
2825                                     unsigned SrcReg, bool IsKill,
2826                                     unsigned SubIdx0, unsigned SubIdx1, int FI,
2827                                     MachineMemOperand *MMO) {
2828   unsigned SrcReg0 = SrcReg;
2829   unsigned SrcReg1 = SrcReg;
2830   if (Register::isPhysicalRegister(SrcReg)) {
2831     SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0);
2832     SubIdx0 = 0;
2833     SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1);
2834     SubIdx1 = 0;
2835   }
2836   BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
2837       .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0)
2838       .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1)
2839       .addFrameIndex(FI)
2840       .addImm(0)
2841       .addMemOperand(MMO);
2842 }
2843 
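// Spill SrcReg to stack slot FI, choosing the store opcode from the spill
// size of RC: STR*ui for single registers, ST1 variants for NEON register
// tuples, and STR_PXI/STR_ZXI for SVE predicate/vector registers.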
2844 void AArch64InstrInfo::storeRegToStackSlot(
2845     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned SrcReg,
2846     bool isKill, int FI, const TargetRegisterClass *RC,
2847     const TargetRegisterInfo *TRI) const {
2848   MachineFunction &MF = *MBB.getParent();
2849   MachineFrameInfo &MFI = MF.getFrameInfo();
2850   unsigned Align = MFI.getObjectAlignment(FI);
2851 
2852   MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
2853   MachineMemOperand *MMO = MF.getMachineMemOperand(
2854       PtrInfo, MachineMemOperand::MOStore, MFI.getObjectSize(FI), Align);
2855   unsigned Opc = 0;
2856   bool Offset = true;
2857   switch (TRI->getSpillSize(*RC)) {
2858   case 1:
2859     if (AArch64::FPR8RegClass.hasSubClassEq(RC))
2860       Opc = AArch64::STRBui;
2861     break;
2862   case 2:
2863     if (AArch64::FPR16RegClass.hasSubClassEq(RC))
2864       Opc = AArch64::STRHui;
2865     break;
2866   case 4:
2867     if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
2868       Opc = AArch64::STRWui;
2869       if (Register::isVirtualRegister(SrcReg))
2870         MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
2871       else
2872         assert(SrcReg != AArch64::WSP);
2873     } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
2874       Opc = AArch64::STRSui;
2875     break;
2876   case 8:
2877     if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
2878       Opc = AArch64::STRXui;
2879       if (Register::isVirtualRegister(SrcReg))
2880         MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
2881       else
2882         assert(SrcReg != AArch64::SP);
2883     } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
2884       Opc = AArch64::STRDui;
2885     } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
2886       storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
2887                               get(AArch64::STPWi), SrcReg, isKill,
2888                               AArch64::sube32, AArch64::subo32, FI, MMO);
2889       return;
2890     }
2891     break;
2892   case 16:
2893     if (AArch64::FPR128RegClass.hasSubClassEq(RC))
2894       Opc = AArch64::STRQui;
2895     else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
2896       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
2897       Opc = AArch64::ST1Twov1d;
2898       Offset = false;
2899     } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
2900       storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
2901                               get(AArch64::STPXi), SrcReg, isKill,
2902                               AArch64::sube64, AArch64::subo64, FI, MMO);
2903       return;
2904     }
2905     break;
2906   case 24:
2907     if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
2908       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
2909       Opc = AArch64::ST1Threev1d;
2910       Offset = false;
2911     }
2912     break;
2913   case 32:
2914     if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
2915       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
2916       Opc = AArch64::ST1Fourv1d;
2917       Offset = false;
2918     } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
2919       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
2920       Opc = AArch64::ST1Twov2d;
2921       Offset = false;
2922     }
2923     break;
2924   case 48:
2925     if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
2926       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
2927       Opc = AArch64::ST1Threev2d;
2928       Offset = false;
2929     }
2930     break;
2931   case 64:
2932     if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
2933       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
2934       Opc = AArch64::ST1Fourv2d;
2935       Offset = false;
2936     }
2937     break;
2938   }
2939   unsigned StackID = TargetStackID::Default;
2940   if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
2941     assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
2942     Opc = AArch64::STR_PXI;
2943     StackID = TargetStackID::SVEVector;
2944   } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
2945     assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
2946     Opc = AArch64::STR_ZXI;
2947     StackID = TargetStackID::SVEVector;
2948   }
2949   assert(Opc && "Unknown register class");
2950   MFI.setStackID(FI, StackID);
2951 
2952   const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
2953                                      .addReg(SrcReg, getKillRegState(isKill))
2954                                      .addFrameIndex(FI);
2955 
2956   if (Offset)
2957     MI.addImm(0);
2958   MI.addMemOperand(MMO);
2959 }
2960 
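// Reload the two halves of a sequential register pair from frame index FI
// with a single load-pair instruction (LDPWi or LDPXi at the call sites
// below).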
2961 static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI,
2962                                      MachineBasicBlock &MBB,
2963                                      MachineBasicBlock::iterator InsertBefore,
2964                                      const MCInstrDesc &MCID,
2965                                      unsigned DestReg, unsigned SubIdx0,
2966                                      unsigned SubIdx1, int FI,
2967                                      MachineMemOperand *MMO) {
2968   unsigned DestReg0 = DestReg;
2969   unsigned DestReg1 = DestReg;
2970   bool IsUndef = true;
2971   if (Register::isPhysicalRegister(DestReg)) {
2972     DestReg0 = TRI.getSubReg(DestReg, SubIdx0);
2973     SubIdx0 = 0;
2974     DestReg1 = TRI.getSubReg(DestReg, SubIdx1);
2975     SubIdx1 = 0;
2976     IsUndef = false;
2977   }
2978   BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
2979       .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0)
2980       .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1)
2981       .addFrameIndex(FI)
2982       .addImm(0)
2983       .addMemOperand(MMO);
2984 }
2985 
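// Reload DestReg from stack slot FI, mirroring the opcode selection in
// storeRegToStackSlot (LDR*ui, LD1 variants, LDR_PXI/LDR_ZXI).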
2986 void AArch64InstrInfo::loadRegFromStackSlot(
2987     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned DestReg,
2988     int FI, const TargetRegisterClass *RC,
2989     const TargetRegisterInfo *TRI) const {
2990   MachineFunction &MF = *MBB.getParent();
2991   MachineFrameInfo &MFI = MF.getFrameInfo();
2992   unsigned Align = MFI.getObjectAlignment(FI);
2993   MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
2994   MachineMemOperand *MMO = MF.getMachineMemOperand(
2995       PtrInfo, MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align);
2996 
2997   unsigned Opc = 0;
2998   bool Offset = true;
2999   switch (TRI->getSpillSize(*RC)) {
3000   case 1:
3001     if (AArch64::FPR8RegClass.hasSubClassEq(RC))
3002       Opc = AArch64::LDRBui;
3003     break;
3004   case 2:
3005     if (AArch64::FPR16RegClass.hasSubClassEq(RC))
3006       Opc = AArch64::LDRHui;
3007     break;
3008   case 4:
3009     if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
3010       Opc = AArch64::LDRWui;
3011       if (Register::isVirtualRegister(DestReg))
3012         MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
3013       else
3014         assert(DestReg != AArch64::WSP);
3015     } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
3016       Opc = AArch64::LDRSui;
3017     break;
3018   case 8:
3019     if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
3020       Opc = AArch64::LDRXui;
3021       if (Register::isVirtualRegister(DestReg))
3022         MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
3023       else
3024         assert(DestReg != AArch64::SP);
3025     } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
3026       Opc = AArch64::LDRDui;
3027     } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
3028       loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
3029                                get(AArch64::LDPWi), DestReg, AArch64::sube32,
3030                                AArch64::subo32, FI, MMO);
3031       return;
3032     }
3033     break;
3034   case 16:
3035     if (AArch64::FPR128RegClass.hasSubClassEq(RC))
3036       Opc = AArch64::LDRQui;
3037     else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
3038       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3039       Opc = AArch64::LD1Twov1d;
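      // LD1 multi-vector loads use base-register-only addressing, so no
      // trailing immediate operand is appended below (hence Offset is cleared).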
3040       Offset = false;
3041     } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
3042       loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
3043                                get(AArch64::LDPXi), DestReg, AArch64::sube64,
3044                                AArch64::subo64, FI, MMO);
3045       return;
3046     }
3047     break;
3048   case 24:
3049     if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
3050       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3051       Opc = AArch64::LD1Threev1d;
3052       Offset = false;
3053     }
3054     break;
3055   case 32:
3056     if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
3057       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3058       Opc = AArch64::LD1Fourv1d;
3059       Offset = false;
3060     } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
3061       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3062       Opc = AArch64::LD1Twov2d;
3063       Offset = false;
3064     }
3065     break;
3066   case 48:
3067     if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
3068       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3069       Opc = AArch64::LD1Threev2d;
3070       Offset = false;
3071     }
3072     break;
3073   case 64:
3074     if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
3075       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
3076       Opc = AArch64::LD1Fourv2d;
3077       Offset = false;
3078     }
3079     break;
3080   }
3081 
3082   unsigned StackID = TargetStackID::Default;
3083   if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
3084     assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
3085     Opc = AArch64::LDR_PXI;
3086     StackID = TargetStackID::SVEVector;
3087   } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
3088     assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
3089     Opc = AArch64::LDR_ZXI;
3090     StackID = TargetStackID::SVEVector;
3091   }
3092   assert(Opc && "Unknown register class");
3093   MFI.setStackID(FI, StackID);
3094 
3095   const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
3096                                      .addReg(DestReg, getDefRegState(true))
3097                                      .addFrameIndex(FI);
3098   if (Offset)
3099     MI.addImm(0);
3100   MI.addMemOperand(MMO);
3101 }
3102 
3103 // Helper function to emit a frame offset adjustment from a given
3104 // pointer (SrcReg), stored into DestReg. This function is explicit
3105 // in that the caller must supply the opcode.
3106 static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
3107                                MachineBasicBlock::iterator MBBI,
3108                                const DebugLoc &DL, unsigned DestReg,
3109                                unsigned SrcReg, int64_t Offset, unsigned Opc,
3110                                const TargetInstrInfo *TII,
3111                                MachineInstr::MIFlag Flag, bool NeedsWinCFI,
3112                                bool *HasWinCFI) {
3113   int Sign = 1;
3114   unsigned MaxEncoding, ShiftSize;
3115   switch (Opc) {
3116   case AArch64::ADDXri:
3117   case AArch64::ADDSXri:
3118   case AArch64::SUBXri:
3119   case AArch64::SUBSXri:
3120     MaxEncoding = 0xfff;
3121     ShiftSize = 12;
3122     break;
3123   case AArch64::ADDVL_XXI:
3124   case AArch64::ADDPL_XXI:
3125     MaxEncoding = 31;
3126     ShiftSize = 0;
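    // ADDVL/ADDPL take a signed immediate in [-32, 31]; for negative offsets
    // the loop below works on the magnitude, so allow 32 here.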
3127     if (Offset < 0) {
3128       MaxEncoding = 32;
3129       Sign = -1;
3130       Offset = -Offset;
3131     }
3132     break;
3133   default:
3134     llvm_unreachable("Unsupported opcode");
3135   }
3136 
3137   // FIXME: If the offset won't fit in 24 bits, compute the offset into a
3138   // scratch register.  If DestReg is a virtual register, use it as the
3139   // scratch register; otherwise, create a new virtual register (to be
3140   // replaced by the scavenger at the end of PEI).  That case can be optimized
3141   // slightly if DestReg is SP which is always 16-byte aligned, so the scratch
3142   // register can be loaded with offset%8 and the add/sub can use an extending
3143   // instruction with LSL#3.
3144   // Currently the function handles any offsets but generates a poor sequence
3145   // of code.
3146   //  assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
3147 
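  // Worked example (illustrative only, registers are hypothetical): with
  // Opc == ADDXri and Offset == 0x12345 the loop below emits
  //   add x1, x0, #0x12, lsl #12   // adds 0x12000
  //   add x1, x1, #0x345           // adds the remaining 0x345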
3148   const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
3149   do {
3150     uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue);
3151     unsigned LocalShiftSize = 0;
3152     if (ThisVal > MaxEncoding) {
3153       ThisVal = ThisVal >> ShiftSize;
3154       LocalShiftSize = ShiftSize;
3155     }
3156     assert((ThisVal >> ShiftSize) <= MaxEncoding &&
3157            "Encoding cannot handle value that big");
3158     auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg)
3159                    .addReg(SrcReg)
3160                    .addImm(Sign * (int)ThisVal);
3161     if (ShiftSize)
3162       MBI = MBI.addImm(
3163           AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize));
3164     MBI = MBI.setMIFlag(Flag);
3165 
3166     if (NeedsWinCFI) {
3167       assert(Sign == 1 && "SEH directives should always have a positive sign");
3168       int Imm = (int)(ThisVal << LocalShiftSize);
3169       if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
3170           (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
3171         if (HasWinCFI)
3172           *HasWinCFI = true;
3173         if (Imm == 0)
3174           BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag);
3175         else
3176           BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP))
3177               .addImm(Imm)
3178               .setMIFlag(Flag);
3179         assert((Offset - Imm) == 0 && "Expected remaining offset to be zero to "
3180                                       "emit a single SEH directive");
3181       } else if (DestReg == AArch64::SP) {
3182         if (HasWinCFI)
3183           *HasWinCFI = true;
3184         assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc");
3185         BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
3186             .addImm(Imm)
3187             .setMIFlag(Flag);
3188       }
3189       if (HasWinCFI)
3190         *HasWinCFI = true;
3191     }
3192 
3193     SrcReg = DestReg;
3194     Offset -= ThisVal << LocalShiftSize;
3195   } while (Offset);
3196 }
3197 
3198 void llvm::emitFrameOffset(MachineBasicBlock &MBB,
3199                            MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
3200                            unsigned DestReg, unsigned SrcReg,
3201                            StackOffset Offset, const TargetInstrInfo *TII,
3202                            MachineInstr::MIFlag Flag, bool SetNZCV,
3203                            bool NeedsWinCFI, bool *HasWinCFI) {
3204   int64_t Bytes, NumPredicateVectors, NumDataVectors;
3205   Offset.getForFrameOffset(Bytes, NumPredicateVectors, NumDataVectors);
3206 
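  // For illustration (assuming plain GPR SrcReg/DestReg): a StackOffset of
  // 16 fixed bytes plus one SVE data vector is lowered by the code below as
  // an ADDXri of #16 followed by an ADDVL_XXI of #1.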
3207   // First emit non-scalable frame offsets, or a simple 'mov'.
3208   if (Bytes || (!Offset && SrcReg != DestReg)) {
3209     assert((DestReg != AArch64::SP || Bytes % 16 == 0) &&
3210            "SP increment/decrement not 16-byte aligned");
3211     unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri;
3212     if (Bytes < 0) {
3213       Bytes = -Bytes;
3214       Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri;
3215     }
3216     emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag,
3217                        NeedsWinCFI, HasWinCFI);
3218     SrcReg = DestReg;
3219   }
3220 
3221   assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) &&
3222          "SetNZCV not supported with SVE vectors");
3223   assert(!(NeedsWinCFI && (NumPredicateVectors || NumDataVectors)) &&
3224          "WinCFI not supported with SVE vectors");
3225 
3226   if (NumDataVectors) {
3227     emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors,
3228                        AArch64::ADDVL_XXI, TII, Flag, NeedsWinCFI, nullptr);
3229     SrcReg = DestReg;
3230   }
3231 
3232   if (NumPredicateVectors) {
3233     assert(DestReg != AArch64::SP && "Unaligned access to SP");
3234     emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors,
3235                        AArch64::ADDPL_XXI, TII, Flag, NeedsWinCFI, nullptr);
3236   }
3237 }
3238 
3239 MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
3240     MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
3241     MachineBasicBlock::iterator InsertPt, int FrameIndex,
3242     LiveIntervals *LIS, VirtRegMap *VRM) const {
3243   // This is a bit of a hack. Consider this instruction:
3244   //
3245   //   %0 = COPY %sp; GPR64all:%0
3246   //
3247   // We explicitly chose GPR64all for the virtual register so such a copy might
3248   // be eliminated by RegisterCoalescer. However, that may not be possible, and
3249   // %0 may even spill. We can't spill %sp, and since it is in the GPR64all
3250   // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
3251   //
3252   // To prevent that, we are going to constrain the %0 register class here.
3253   //
3254   // <rdar://problem/11522048>
3255   //
3256   if (MI.isFullCopy()) {
3257     Register DstReg = MI.getOperand(0).getReg();
3258     Register SrcReg = MI.getOperand(1).getReg();
3259     if (SrcReg == AArch64::SP && Register::isVirtualRegister(DstReg)) {
3260       MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
3261       return nullptr;
3262     }
3263     if (DstReg == AArch64::SP && Register::isVirtualRegister(SrcReg)) {
3264       MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
3265       return nullptr;
3266     }
3267   }
3268 
3269   // Handle the case where a copy is being spilled or filled but the source
3270   // and destination register class don't match.  For example:
3271   //
3272   //   %0 = COPY %xzr; GPR64common:%0
3273   //
3274   // In this case we can still safely fold away the COPY and generate the
3275   // following spill code:
3276   //
3277   //   STRXui %xzr, %stack.0
3278   //
3279   // This also eliminates spilled cross register class COPYs (e.g. between x and
3280   // d regs) of the same size.  For example:
3281   //
3282   //   %0 = COPY %1; GPR64:%0, FPR64:%1
3283   //
3284   // will be filled as
3285   //
3286   //   LDRDui %0, fi<#0>
3287   //
3288   // instead of
3289   //
3290   //   LDRXui %Temp, fi<#0>
3291   //   %0 = FMOV %Temp
3292   //
3293   if (MI.isCopy() && Ops.size() == 1 &&
3294       // Make sure we're only folding the explicit COPY defs/uses.
3295       (Ops[0] == 0 || Ops[0] == 1)) {
3296     bool IsSpill = Ops[0] == 0;
3297     bool IsFill = !IsSpill;
3298     const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
3299     const MachineRegisterInfo &MRI = MF.getRegInfo();
3300     MachineBasicBlock &MBB = *MI.getParent();
3301     const MachineOperand &DstMO = MI.getOperand(0);
3302     const MachineOperand &SrcMO = MI.getOperand(1);
3303     Register DstReg = DstMO.getReg();
3304     Register SrcReg = SrcMO.getReg();
3305     // This is slightly expensive to compute for physical regs since
3306     // getMinimalPhysRegClass is slow.
3307     auto getRegClass = [&](unsigned Reg) {
3308       return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg)
3309                                               : TRI.getMinimalPhysRegClass(Reg);
3310     };
3311 
3312     if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
3313       assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
3314                  TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
3315              "Mismatched register size in non subreg COPY");
3316       if (IsSpill)
3317         storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
3318                             getRegClass(SrcReg), &TRI);
3319       else
3320         loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex,
3321                              getRegClass(DstReg), &TRI);
3322       return &*--InsertPt;
3323     }
3324 
3325     // Handle cases like spilling def of:
3326     //
3327     //   %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0
3328     //
3329     // where the physical register source can be widened and stored to the full
3330     // virtual reg destination stack slot, in this case producing:
3331     //
3332     //   STRXui %xzr, %stack.0
3333     //
3334     if (IsSpill && DstMO.isUndef() && Register::isPhysicalRegister(SrcReg)) {
3335       assert(SrcMO.getSubReg() == 0 &&
3336              "Unexpected subreg on physical register");
3337       const TargetRegisterClass *SpillRC;
3338       unsigned SpillSubreg;
3339       switch (DstMO.getSubReg()) {
3340       default:
3341         SpillRC = nullptr;
3342         break;
3343       case AArch64::sub_32:
3344       case AArch64::ssub:
3345         if (AArch64::GPR32RegClass.contains(SrcReg)) {
3346           SpillRC = &AArch64::GPR64RegClass;
3347           SpillSubreg = AArch64::sub_32;
3348         } else if (AArch64::FPR32RegClass.contains(SrcReg)) {
3349           SpillRC = &AArch64::FPR64RegClass;
3350           SpillSubreg = AArch64::ssub;
3351         } else
3352           SpillRC = nullptr;
3353         break;
3354       case AArch64::dsub:
3355         if (AArch64::FPR64RegClass.contains(SrcReg)) {
3356           SpillRC = &AArch64::FPR128RegClass;
3357           SpillSubreg = AArch64::dsub;
3358         } else
3359           SpillRC = nullptr;
3360         break;
3361       }
3362 
3363       if (SpillRC)
3364         if (unsigned WidenedSrcReg =
3365                 TRI.getMatchingSuperReg(SrcReg, SpillSubreg, SpillRC)) {
3366           storeRegToStackSlot(MBB, InsertPt, WidenedSrcReg, SrcMO.isKill(),
3367                               FrameIndex, SpillRC, &TRI);
3368           return &*--InsertPt;
3369         }
3370     }
3371 
3372     // Handle cases like filling use of:
3373     //
3374     //   %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1
3375     //
3376     // where we can load the full virtual reg source stack slot, into the subreg
3377     // destination, in this case producing:
3378     //
3379     //   LDRWui %0:sub_32<def,read-undef>, %stack.0
3380     //
3381     if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
3382       const TargetRegisterClass *FillRC;
3383       switch (DstMO.getSubReg()) {
3384       default:
3385         FillRC = nullptr;
3386         break;
3387       case AArch64::sub_32:
3388         FillRC = &AArch64::GPR32RegClass;
3389         break;
3390       case AArch64::ssub:
3391         FillRC = &AArch64::FPR32RegClass;
3392         break;
3393       case AArch64::dsub:
3394         FillRC = &AArch64::FPR64RegClass;
3395         break;
3396       }
3397 
3398       if (FillRC) {
3399         assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
3400                    TRI.getRegSizeInBits(*FillRC) &&
3401                "Mismatched regclass size on folded subreg COPY");
3402         loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI);
3403         MachineInstr &LoadMI = *--InsertPt;
3404         MachineOperand &LoadDst = LoadMI.getOperand(0);
3405         assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load");
3406         LoadDst.setSubReg(DstMO.getSubReg());
3407         LoadDst.setIsUndef();
3408         return &LoadMI;
3409       }
3410     }
3411   }
3412 
3413   // Cannot fold.
3414   return nullptr;
3415 }
3416 
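// The SVE fill/spill instructions below take an immediate that is implicitly
// scaled by the vector (or predicate) length, so their frame offsets are
// expressed in scalable bytes rather than fixed bytes.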
3417 static bool isSVEScaledImmInstruction(unsigned Opcode) {
3418   switch (Opcode) {
3419   case AArch64::LDR_ZXI:
3420   case AArch64::STR_ZXI:
3421   case AArch64::LDR_PXI:
3422   case AArch64::STR_PXI:
3423     return true;
3424   default:
3425     return false;
3426   }
3427 }
3428 
3429 int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
3430                                     StackOffset &SOffset,
3431                                     bool *OutUseUnscaledOp,
3432                                     unsigned *OutUnscaledOp,
3433                                     int64_t *EmittableOffset) {
3434   // Set output values in case of early exit.
3435   if (EmittableOffset)
3436     *EmittableOffset = 0;
3437   if (OutUseUnscaledOp)
3438     *OutUseUnscaledOp = false;
3439   if (OutUnscaledOp)
3440     *OutUnscaledOp = 0;
3441 
3442   // Exit early for structured vector spills/fills as they can't take an
3443   // immediate offset.
3444   switch (MI.getOpcode()) {
3445   default:
3446     break;
3447   case AArch64::LD1Twov2d:
3448   case AArch64::LD1Threev2d:
3449   case AArch64::LD1Fourv2d:
3450   case AArch64::LD1Twov1d:
3451   case AArch64::LD1Threev1d:
3452   case AArch64::LD1Fourv1d:
3453   case AArch64::ST1Twov2d:
3454   case AArch64::ST1Threev2d:
3455   case AArch64::ST1Fourv2d:
3456   case AArch64::ST1Twov1d:
3457   case AArch64::ST1Threev1d:
3458   case AArch64::ST1Fourv1d:
3459   case AArch64::IRG:
3460   case AArch64::IRGstack:
3461     return AArch64FrameOffsetCannotUpdate;
3462   }
3463 
3464   // Get the min/max offset and the scale.
3465   unsigned Scale, Width;
3466   int64_t MinOff, MaxOff;
3467   if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), Scale, Width, MinOff,
3468                                       MaxOff))
3469     llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
3470 
3471   // Construct the complete offset.
3472   bool IsMulVL = isSVEScaledImmInstruction(MI.getOpcode());
3473   int64_t Offset =
3474       IsMulVL ? (SOffset.getScalableBytes()) : (SOffset.getBytes());
3475 
3476   const MachineOperand &ImmOpnd =
3477       MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode()));
3478   Offset += ImmOpnd.getImm() * Scale;
3479 
3480   // If the offset doesn't match the scale, we rewrite the instruction to
3481   // use the unscaled instruction instead. The same applies if we have a
3482   // negative offset and there is an unscaled op to use.
3483   Optional<unsigned> UnscaledOp =
3484       AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode());
3485   bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0);
3486   if (useUnscaledOp &&
3487       !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, Scale, Width, MinOff, MaxOff))
3488     llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
3489 
3490   int64_t Remainder = Offset % Scale;
3491   assert(!(Remainder && useUnscaledOp) &&
3492          "Cannot have remainder when using unscaled op");
3493 
3494   assert(MinOff < MaxOff && "Unexpected Min/Max offsets");
3495   int64_t NewOffset = Offset / Scale;
3496   if (MinOff <= NewOffset && NewOffset <= MaxOff)
3497     Offset = Remainder;
3498   else {
3499     NewOffset = NewOffset < 0 ? MinOff : MaxOff;
3500     Offset = Offset - NewOffset * Scale + Remainder;
3501   }
3502 
3503   if (EmittableOffset)
3504     *EmittableOffset = NewOffset;
3505   if (OutUseUnscaledOp)
3506     *OutUseUnscaledOp = useUnscaledOp;
3507   if (OutUnscaledOp && UnscaledOp)
3508     *OutUnscaledOp = *UnscaledOp;
3509 
3510   if (IsMulVL)
3511     SOffset = StackOffset(Offset, MVT::nxv1i8) +
3512               StackOffset(SOffset.getBytes(), MVT::i8);
3513   else
3514     SOffset = StackOffset(Offset, MVT::i8) +
3515               StackOffset(SOffset.getScalableBytes(), MVT::nxv1i8);
3516   return AArch64FrameOffsetCanUpdate |
3517          (SOffset ? 0 : AArch64FrameOffsetIsLegal);
3518 }
3519 
3520 bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
3521                                     unsigned FrameReg, StackOffset &Offset,
3522                                     const AArch64InstrInfo *TII) {
3523   unsigned Opcode = MI.getOpcode();
3524   unsigned ImmIdx = FrameRegIdx + 1;
3525 
3526   if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
3527     Offset += StackOffset(MI.getOperand(ImmIdx).getImm(), MVT::i8);
3528     emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
3529                     MI.getOperand(0).getReg(), FrameReg, Offset, TII,
3530                     MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
3531     MI.eraseFromParent();
3532     Offset = StackOffset();
3533     return true;
3534   }
3535 
3536   int64_t NewOffset;
3537   unsigned UnscaledOp;
3538   bool UseUnscaledOp;
3539   int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
3540                                          &UnscaledOp, &NewOffset);
3541   if (Status & AArch64FrameOffsetCanUpdate) {
3542     if (Status & AArch64FrameOffsetIsLegal)
3543       // Replace the FrameIndex with FrameReg.
3544       MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
3545     if (UseUnscaledOp)
3546       MI.setDesc(TII->get(UnscaledOp));
3547 
3548     MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
3549     return !Offset;
3550   }
3551 
3552   return false;
3553 }
3554 
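// The canonical AArch64 NOP is an alias of HINT #0, which is what is emitted
// below.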
3555 void AArch64InstrInfo::getNoop(MCInst &NopInst) const {
3556   NopInst.setOpcode(AArch64::HINT);
3557   NopInst.addOperand(MCOperand::createImm(0));
3558 }
3559 
3560 // AArch64 supports MachineCombiner.
3561 bool AArch64InstrInfo::useMachineCombiner() const { return true; }
3562 
3563 // True when Opc sets the condition flags (NZCV)
3564 static bool isCombineInstrSettingFlag(unsigned Opc) {
3565   switch (Opc) {
3566   case AArch64::ADDSWrr:
3567   case AArch64::ADDSWri:
3568   case AArch64::ADDSXrr:
3569   case AArch64::ADDSXri:
3570   case AArch64::SUBSWrr:
3571   case AArch64::SUBSXrr:
3572   // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
3573   case AArch64::SUBSWri:
3574   case AArch64::SUBSXri:
3575     return true;
3576   default:
3577     break;
3578   }
3579   return false;
3580 }
3581 
3582 // 32b Opcodes that can be combined with a MUL
3583 static bool isCombineInstrCandidate32(unsigned Opc) {
3584   switch (Opc) {
3585   case AArch64::ADDWrr:
3586   case AArch64::ADDWri:
3587   case AArch64::SUBWrr:
3588   case AArch64::ADDSWrr:
3589   case AArch64::ADDSWri:
3590   case AArch64::SUBSWrr:
3591   // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
3592   case AArch64::SUBWri:
3593   case AArch64::SUBSWri:
3594     return true;
3595   default:
3596     break;
3597   }
3598   return false;
3599 }
3600 
3601 // 64b Opcodes that can be combined with a MUL
3602 static bool isCombineInstrCandidate64(unsigned Opc) {
3603   switch (Opc) {
3604   case AArch64::ADDXrr:
3605   case AArch64::ADDXri:
3606   case AArch64::SUBXrr:
3607   case AArch64::ADDSXrr:
3608   case AArch64::ADDSXri:
3609   case AArch64::SUBSXrr:
3610   // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
3611   case AArch64::SUBXri:
3612   case AArch64::SUBSXri:
3613   case AArch64::ADDv8i8:
3614   case AArch64::ADDv16i8:
3615   case AArch64::ADDv4i16:
3616   case AArch64::ADDv8i16:
3617   case AArch64::ADDv2i32:
3618   case AArch64::ADDv4i32:
3619   case AArch64::SUBv8i8:
3620   case AArch64::SUBv16i8:
3621   case AArch64::SUBv4i16:
3622   case AArch64::SUBv8i16:
3623   case AArch64::SUBv2i32:
3624   case AArch64::SUBv4i32:
3625     return true;
3626   default:
3627     break;
3628   }
3629   return false;
3630 }
3631 
3632 // FP Opcodes that can be combined with a FMUL
3633 static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
3634   switch (Inst.getOpcode()) {
3635   default:
3636     break;
3637   case AArch64::FADDHrr:
3638   case AArch64::FADDSrr:
3639   case AArch64::FADDDrr:
3640   case AArch64::FADDv4f16:
3641   case AArch64::FADDv8f16:
3642   case AArch64::FADDv2f32:
3643   case AArch64::FADDv2f64:
3644   case AArch64::FADDv4f32:
3645   case AArch64::FSUBHrr:
3646   case AArch64::FSUBSrr:
3647   case AArch64::FSUBDrr:
3648   case AArch64::FSUBv4f16:
3649   case AArch64::FSUBv8f16:
3650   case AArch64::FSUBv2f32:
3651   case AArch64::FSUBv2f64:
3652   case AArch64::FSUBv4f32:
3653     TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
3654     return (Options.UnsafeFPMath ||
3655             Options.AllowFPOpFusion == FPOpFusion::Fast);
3656   }
3657   return false;
3658 }
3659 
3660 // Opcodes that can be combined with a MUL
3661 static bool isCombineInstrCandidate(unsigned Opc) {
3662   return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
3663 }
3664 
3665 //
3666 // Utility routine that checks if \param MO is defined by an
3667 // \param CombineOpc instruction in the basic block \param MBB
3668 static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
3669                        unsigned CombineOpc, unsigned ZeroReg = 0,
3670                        bool CheckZeroReg = false) {
3671   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3672   MachineInstr *MI = nullptr;
3673 
3674   if (MO.isReg() && Register::isVirtualRegister(MO.getReg()))
3675     MI = MRI.getUniqueVRegDef(MO.getReg());
3676   // And it needs to be in the trace (otherwise, it won't have a depth).
3677   if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
3678     return false;
3679   // It must only be used by the instruction we combine with.
3680   if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
3681     return false;
3682 
3683   if (CheckZeroReg) {
3684     assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
3685            MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
3686            MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs");
3687     // The third input reg must be zero.
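    // (A plain MUL is selected as MADD/MSUB with the zero register as the
    // addend, so requiring ZeroReg here ensures MI really is a bare multiply.)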
3688     if (MI->getOperand(3).getReg() != ZeroReg)
3689       return false;
3690   }
3691 
3692   return true;
3693 }
3694 
3695 //
3696 // Is \param MO defined by an integer multiply that can be combined?
3697 static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
3698                               unsigned MulOpc, unsigned ZeroReg) {
3699   return canCombine(MBB, MO, MulOpc, ZeroReg, true);
3700 }
3701 
3702 //
3703 // Is \param MO defined by a floating-point multiply that can be combined?
3704 static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
3705                                unsigned MulOpc) {
3706   return canCombine(MBB, MO, MulOpc);
3707 }
3708 
3709 // TODO: There are many more machine instruction opcodes to match:
3710 //       1. Other data types (integer, vectors)
3711 //       2. Other math / logic operations (xor, or)
3712 //       3. Other forms of the same operation (intrinsics and other variants)
3713 bool AArch64InstrInfo::isAssociativeAndCommutative(
3714     const MachineInstr &Inst) const {
3715   switch (Inst.getOpcode()) {
3716   case AArch64::FADDDrr:
3717   case AArch64::FADDSrr:
3718   case AArch64::FADDv2f32:
3719   case AArch64::FADDv2f64:
3720   case AArch64::FADDv4f32:
3721   case AArch64::FMULDrr:
3722   case AArch64::FMULSrr:
3723   case AArch64::FMULX32:
3724   case AArch64::FMULX64:
3725   case AArch64::FMULXv2f32:
3726   case AArch64::FMULXv2f64:
3727   case AArch64::FMULXv4f32:
3728   case AArch64::FMULv2f32:
3729   case AArch64::FMULv2f64:
3730   case AArch64::FMULv4f32:
3731     return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath;
3732   default:
3733     return false;
3734   }
3735 }
3736 
3737 /// Find instructions that can be turned into madd.
3738 static bool getMaddPatterns(MachineInstr &Root,
3739                             SmallVectorImpl<MachineCombinerPattern> &Patterns) {
3740   unsigned Opc = Root.getOpcode();
3741   MachineBasicBlock &MBB = *Root.getParent();
3742   bool Found = false;
3743 
3744   if (!isCombineInstrCandidate(Opc))
3745     return false;
3746   if (isCombineInstrSettingFlag(Opc)) {
3747     int Cmp_NZCV = Root.findRegisterDefOperandIdx(AArch64::NZCV, true);
3748     // Bail out when NZCV is live.
3749     if (Cmp_NZCV == -1)
3750       return false;
3751     unsigned NewOpc = convertToNonFlagSettingOpc(Root);
3752     // Bail out when the opcode can't be converted.
3753     // CHECKME: do we miss any cases for opcode conversion?
3754     if (NewOpc == Opc)
3755       return false;
3756     Opc = NewOpc;
3757   }
3758 
3759   auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg,
3760                       MachineCombinerPattern Pattern) {
3761     if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) {
3762       Patterns.push_back(Pattern);
3763       Found = true;
3764     }
3765   };
3766 
3767   auto setVFound = [&](int Opcode, int Operand, MachineCombinerPattern Pattern) {
3768     if (canCombine(MBB, Root.getOperand(Operand), Opcode)) {
3769       Patterns.push_back(Pattern);
3770       Found = true;
3771     }
3772   };
3773 
3774   typedef MachineCombinerPattern MCP;
3775 
3776   switch (Opc) {
3777   default:
3778     break;
3779   case AArch64::ADDWrr:
3780     assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
3781            "ADDWrr does not have register operands");
3782     setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1);
3783     setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2);
3784     break;
3785   case AArch64::ADDXrr:
3786     setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1);
3787     setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2);
3788     break;
3789   case AArch64::SUBWrr:
3790     setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1);
3791     setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2);
3792     break;
3793   case AArch64::SUBXrr:
3794     setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1);
3795     setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2);
3796     break;
3797   case AArch64::ADDWri:
3798     setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1);
3799     break;
3800   case AArch64::ADDXri:
3801     setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1);
3802     break;
3803   case AArch64::SUBWri:
3804     setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1);
3805     break;
3806   case AArch64::SUBXri:
3807     setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1);
3808     break;
3809   case AArch64::ADDv8i8:
3810     setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1);
3811     setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2);
3812     break;
3813   case AArch64::ADDv16i8:
3814     setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1);
3815     setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2);
3816     break;
3817   case AArch64::ADDv4i16:
3818     setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1);
3819     setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2);
3820     setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1);
3821     setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2);
3822     break;
3823   case AArch64::ADDv8i16:
3824     setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1);
3825     setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2);
3826     setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1);
3827     setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2);
3828     break;
3829   case AArch64::ADDv2i32:
3830     setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1);
3831     setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2);
3832     setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1);
3833     setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2);
3834     break;
3835   case AArch64::ADDv4i32:
3836     setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1);
3837     setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2);
3838     setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1);
3839     setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2);
3840     break;
3841   case AArch64::SUBv8i8:
3842     setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1);
3843     setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2);
3844     break;
3845   case AArch64::SUBv16i8:
3846     setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1);
3847     setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2);
3848     break;
3849   case AArch64::SUBv4i16:
3850     setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1);
3851     setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2);
3852     setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1);
3853     setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2);
3854     break;
3855   case AArch64::SUBv8i16:
3856     setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1);
3857     setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2);
3858     setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1);
3859     setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2);
3860     break;
3861   case AArch64::SUBv2i32:
3862     setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1);
3863     setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2);
3864     setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1);
3865     setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2);
3866     break;
3867   case AArch64::SUBv4i32:
3868     setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1);
3869     setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2);
3870     setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1);
3871     setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2);
3872     break;
3873   }
3874   return Found;
3875 }
3876 /// Floating-Point Support
3877 
3878 /// Find FP instructions that can be turned into a fused multiply-add/subtract.
3879 static bool getFMAPatterns(MachineInstr &Root,
3880                            SmallVectorImpl<MachineCombinerPattern> &Patterns) {
3881 
3882   if (!isCombineInstrCandidateFP(Root))
3883     return false;
3884 
3885   MachineBasicBlock &MBB = *Root.getParent();
3886   bool Found = false;
3887 
3888   auto Match = [&](int Opcode, int Operand,
3889                    MachineCombinerPattern Pattern) -> bool {
3890     if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) {
3891       Patterns.push_back(Pattern);
3892       return true;
3893     }
3894     return false;
3895   };
3896 
3897   typedef MachineCombinerPattern MCP;
3898 
3899   switch (Root.getOpcode()) {
3900   default:
3901     assert(false && "Unsupported FP instruction in combiner\n");
3902     break;
3903   case AArch64::FADDHrr:
3904     assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
3905            "FADDHrr does not have register operands");
3906 
3907     Found  = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1);
3908     Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2);
3909     break;
3910   case AArch64::FADDSrr:
3911     assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
3912            "FADDSrr does not have register operands");
3913 
3914     Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) ||
3915              Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1);
3916 
3917     Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) ||
3918              Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2);
3919     break;
3920   case AArch64::FADDDrr:
3921     Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) ||
3922              Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1);
3923 
3924     Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) ||
3925              Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2);
3926     break;
3927   case AArch64::FADDv4f16:
3928     Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) ||
3929              Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1);
3930 
3931     Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) ||
3932              Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2);
3933     break;
3934   case AArch64::FADDv8f16:
3935     Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) ||
3936              Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1);
3937 
3938     Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) ||
3939              Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2);
3940     break;
3941   case AArch64::FADDv2f32:
3942     Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) ||
3943              Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1);
3944 
3945     Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) ||
3946              Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2);
3947     break;
3948   case AArch64::FADDv2f64:
3949     Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) ||
3950              Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1);
3951 
3952     Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) ||
3953              Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2);
3954     break;
3955   case AArch64::FADDv4f32:
3956     Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) ||
3957              Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1);
3958 
3959     Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) ||
3960              Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2);
3961     break;
3962   case AArch64::FSUBHrr:
3963     Found  = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1);
3964     Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2);
3965     Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1);
3966     break;
3967   case AArch64::FSUBSrr:
3968     Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1);
3969 
3970     Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) ||
3971              Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2);
3972 
3973     Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1);
3974     break;
3975   case AArch64::FSUBDrr:
3976     Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1);
3977 
3978     Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) ||
3979              Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2);
3980 
3981     Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1);
3982     break;
3983   case AArch64::FSUBv4f16:
3984     Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) ||
3985              Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2);
3986 
3987     Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) ||
3988              Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1);
3989     break;
3990   case AArch64::FSUBv8f16:
3991     Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) ||
3992              Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2);
3993 
3994     Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) ||
3995              Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1);
3996     break;
3997   case AArch64::FSUBv2f32:
3998     Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) ||
3999              Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2);
4000 
4001     Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) ||
4002              Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1);
4003     break;
4004   case AArch64::FSUBv2f64:
4005     Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) ||
4006              Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2);
4007 
4008     Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) ||
4009              Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1);
4010     break;
4011   case AArch64::FSUBv4f32:
4012     Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) ||
4013              Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2);
4014 
4015     Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) ||
4016              Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1);
4017     break;
4018   }
4019   return Found;
4020 }
4021 
4022 /// Return true when a code sequence can improve throughput. It
4023 /// should be called only for instructions in loops.
4024 /// \param Pattern - combiner pattern
4025 bool AArch64InstrInfo::isThroughputPattern(
4026     MachineCombinerPattern Pattern) const {
4027   switch (Pattern) {
4028   default:
4029     break;
4030   case MachineCombinerPattern::FMULADDH_OP1:
4031   case MachineCombinerPattern::FMULADDH_OP2:
4032   case MachineCombinerPattern::FMULSUBH_OP1:
4033   case MachineCombinerPattern::FMULSUBH_OP2:
4034   case MachineCombinerPattern::FMULADDS_OP1:
4035   case MachineCombinerPattern::FMULADDS_OP2:
4036   case MachineCombinerPattern::FMULSUBS_OP1:
4037   case MachineCombinerPattern::FMULSUBS_OP2:
4038   case MachineCombinerPattern::FMULADDD_OP1:
4039   case MachineCombinerPattern::FMULADDD_OP2:
4040   case MachineCombinerPattern::FMULSUBD_OP1:
4041   case MachineCombinerPattern::FMULSUBD_OP2:
4042   case MachineCombinerPattern::FNMULSUBH_OP1:
4043   case MachineCombinerPattern::FNMULSUBS_OP1:
4044   case MachineCombinerPattern::FNMULSUBD_OP1:
4045   case MachineCombinerPattern::FMLAv4i16_indexed_OP1:
4046   case MachineCombinerPattern::FMLAv4i16_indexed_OP2:
4047   case MachineCombinerPattern::FMLAv8i16_indexed_OP1:
4048   case MachineCombinerPattern::FMLAv8i16_indexed_OP2:
4049   case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
4050   case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
4051   case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
4052   case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
4053   case MachineCombinerPattern::FMLAv4f16_OP2:
4054   case MachineCombinerPattern::FMLAv4f16_OP1:
4055   case MachineCombinerPattern::FMLAv8f16_OP1:
4056   case MachineCombinerPattern::FMLAv8f16_OP2:
4057   case MachineCombinerPattern::FMLAv2f32_OP2:
4058   case MachineCombinerPattern::FMLAv2f32_OP1:
4059   case MachineCombinerPattern::FMLAv2f64_OP1:
4060   case MachineCombinerPattern::FMLAv2f64_OP2:
4061   case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
4062   case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
4063   case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
4064   case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
4065   case MachineCombinerPattern::FMLAv4f32_OP1:
4066   case MachineCombinerPattern::FMLAv4f32_OP2:
4067   case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
4068   case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
4069   case MachineCombinerPattern::FMLSv4i16_indexed_OP1:
4070   case MachineCombinerPattern::FMLSv4i16_indexed_OP2:
4071   case MachineCombinerPattern::FMLSv8i16_indexed_OP1:
4072   case MachineCombinerPattern::FMLSv8i16_indexed_OP2:
4073   case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
4074   case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
4075   case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
4076   case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
4077   case MachineCombinerPattern::FMLSv4f16_OP1:
4078   case MachineCombinerPattern::FMLSv4f16_OP2:
4079   case MachineCombinerPattern::FMLSv8f16_OP1:
4080   case MachineCombinerPattern::FMLSv8f16_OP2:
4081   case MachineCombinerPattern::FMLSv2f32_OP2:
4082   case MachineCombinerPattern::FMLSv2f64_OP2:
4083   case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
4084   case MachineCombinerPattern::FMLSv4f32_OP2:
4085   case MachineCombinerPattern::MULADDv8i8_OP1:
4086   case MachineCombinerPattern::MULADDv8i8_OP2:
4087   case MachineCombinerPattern::MULADDv16i8_OP1:
4088   case MachineCombinerPattern::MULADDv16i8_OP2:
4089   case MachineCombinerPattern::MULADDv4i16_OP1:
4090   case MachineCombinerPattern::MULADDv4i16_OP2:
4091   case MachineCombinerPattern::MULADDv8i16_OP1:
4092   case MachineCombinerPattern::MULADDv8i16_OP2:
4093   case MachineCombinerPattern::MULADDv2i32_OP1:
4094   case MachineCombinerPattern::MULADDv2i32_OP2:
4095   case MachineCombinerPattern::MULADDv4i32_OP1:
4096   case MachineCombinerPattern::MULADDv4i32_OP2:
4097   case MachineCombinerPattern::MULSUBv8i8_OP1:
4098   case MachineCombinerPattern::MULSUBv8i8_OP2:
4099   case MachineCombinerPattern::MULSUBv16i8_OP1:
4100   case MachineCombinerPattern::MULSUBv16i8_OP2:
4101   case MachineCombinerPattern::MULSUBv4i16_OP1:
4102   case MachineCombinerPattern::MULSUBv4i16_OP2:
4103   case MachineCombinerPattern::MULSUBv8i16_OP1:
4104   case MachineCombinerPattern::MULSUBv8i16_OP2:
4105   case MachineCombinerPattern::MULSUBv2i32_OP1:
4106   case MachineCombinerPattern::MULSUBv2i32_OP2:
4107   case MachineCombinerPattern::MULSUBv4i32_OP1:
4108   case MachineCombinerPattern::MULSUBv4i32_OP2:
4109   case MachineCombinerPattern::MULADDv4i16_indexed_OP1:
4110   case MachineCombinerPattern::MULADDv4i16_indexed_OP2:
4111   case MachineCombinerPattern::MULADDv8i16_indexed_OP1:
4112   case MachineCombinerPattern::MULADDv8i16_indexed_OP2:
4113   case MachineCombinerPattern::MULADDv2i32_indexed_OP1:
4114   case MachineCombinerPattern::MULADDv2i32_indexed_OP2:
4115   case MachineCombinerPattern::MULADDv4i32_indexed_OP1:
4116   case MachineCombinerPattern::MULADDv4i32_indexed_OP2:
4117   case MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
4118   case MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
4119   case MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
4120   case MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
4121   case MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
4122   case MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
4123   case MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
4124   case MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
4125     return true;
4126   } // end switch (Pattern)
4127   return false;
4128 }
4129 /// Return true when there is potentially a faster code sequence for an
4130 /// instruction chain ending in \p Root. All potential patterns are listed in
4131 /// the \p Patterns vector. Patterns should be sorted in priority order since the
4132 /// pattern evaluator stops checking as soon as it finds a faster sequence.
4133 
4134 bool AArch64InstrInfo::getMachineCombinerPatterns(
4135     MachineInstr &Root,
4136     SmallVectorImpl<MachineCombinerPattern> &Patterns) const {
4137   // Integer patterns
4138   if (getMaddPatterns(Root, Patterns))
4139     return true;
4140   // Floating point patterns
4141   if (getFMAPatterns(Root, Patterns))
4142     return true;
4143 
4144   return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns);
4145 }
4146 
4147 enum class FMAInstKind { Default, Indexed, Accumulator };
4148 /// genFusedMultiply - Generate fused multiply instructions.
4149 /// This function supports both integer and floating point instructions.
4150 /// A typical example:
4151 ///  F|MUL I=A,B,0
4152 ///  F|ADD R,I,C
4153 ///  ==> F|MADD R,A,B,C
4154 /// \param MF Containing MachineFunction
4155 /// \param MRI Register information
4156 /// \param TII Target information
4157 /// \param Root is the F|ADD instruction
4158 /// \param [out] InsInstrs is a vector of machine instructions and will
4159 /// contain the generated madd instruction
4160 /// \param IdxMulOpd is index of operand in Root that is the result of
4161 /// the F|MUL. In the example above IdxMulOpd is 1.
4162 /// \param MaddOpc the opcode of the f|madd instruction
4163 /// \param RC Register class of operands
4164 /// \param kind The kind of fma instruction (addressing mode) to be generated
4165 /// \param ReplacedAddend is the result register from the instruction
4166 /// replacing the non-combined operand, if any.
4167 static MachineInstr *
4168 genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
4169                  const TargetInstrInfo *TII, MachineInstr &Root,
4170                  SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
4171                  unsigned MaddOpc, const TargetRegisterClass *RC,
4172                  FMAInstKind kind = FMAInstKind::Default,
4173                  const Register *ReplacedAddend = nullptr) {
4174   assert(IdxMulOpd == 1 || IdxMulOpd == 2);
4175 
4176   unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
4177   MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
4178   Register ResultReg = Root.getOperand(0).getReg();
4179   Register SrcReg0 = MUL->getOperand(1).getReg();
4180   bool Src0IsKill = MUL->getOperand(1).isKill();
4181   Register SrcReg1 = MUL->getOperand(2).getReg();
4182   bool Src1IsKill = MUL->getOperand(2).isKill();
4183 
4184   unsigned SrcReg2;
4185   bool Src2IsKill;
4186   if (ReplacedAddend) {
4187     // If we just generated a new addend, this instruction must be its only use.
4188     SrcReg2 = *ReplacedAddend;
4189     Src2IsKill = true;
4190   } else {
4191     SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
4192     Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
4193   }
4194 
4195   if (Register::isVirtualRegister(ResultReg))
4196     MRI.constrainRegClass(ResultReg, RC);
4197   if (Register::isVirtualRegister(SrcReg0))
4198     MRI.constrainRegClass(SrcReg0, RC);
4199   if (Register::isVirtualRegister(SrcReg1))
4200     MRI.constrainRegClass(SrcReg1, RC);
4201   if (Register::isVirtualRegister(SrcReg2))
4202     MRI.constrainRegClass(SrcReg2, RC);
4203 
4204   MachineInstrBuilder MIB;
4205   if (kind == FMAInstKind::Default)
4206     MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
4207               .addReg(SrcReg0, getKillRegState(Src0IsKill))
4208               .addReg(SrcReg1, getKillRegState(Src1IsKill))
4209               .addReg(SrcReg2, getKillRegState(Src2IsKill));
4210   else if (kind == FMAInstKind::Indexed)
4211     MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
4212               .addReg(SrcReg2, getKillRegState(Src2IsKill))
4213               .addReg(SrcReg0, getKillRegState(Src0IsKill))
4214               .addReg(SrcReg1, getKillRegState(Src1IsKill))
4215               .addImm(MUL->getOperand(3).getImm());
4216   else if (kind == FMAInstKind::Accumulator)
4217     MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
4218               .addReg(SrcReg2, getKillRegState(Src2IsKill))
4219               .addReg(SrcReg0, getKillRegState(Src0IsKill))
4220               .addReg(SrcReg1, getKillRegState(Src1IsKill));
4221   else
4222     assert(false && "Invalid FMA instruction kind \n");
4223   // Insert the MADD (MADD, FMA, FMS, FMLA, FMLS)
4224   InsInstrs.push_back(MIB);
4225   return MUL;
4226 }
4227 
4228 /// genFusedMultiplyAcc - Helper to generate fused multiply accumulate
4229 /// instructions.
4230 ///
4231 /// \see genFusedMultiply
4232 static MachineInstr *genFusedMultiplyAcc(
4233     MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
4234     MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
4235     unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
4236   return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
4237                           FMAInstKind::Accumulator);
4238 }
4239 
4240 /// genNeg - Helper to generate an intermediate negation of the second operand
4241 /// of Root
4242 static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI,
4243                        const TargetInstrInfo *TII, MachineInstr &Root,
4244                        SmallVectorImpl<MachineInstr *> &InsInstrs,
4245                        DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
4246                        unsigned MnegOpc, const TargetRegisterClass *RC) {
4247   Register NewVR = MRI.createVirtualRegister(RC);
4248   MachineInstrBuilder MIB =
4249       BuildMI(MF, Root.getDebugLoc(), TII->get(MnegOpc), NewVR)
4250           .add(Root.getOperand(2));
4251   InsInstrs.push_back(MIB);
4252 
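  // Record that NewVR is defined by the instruction just pushed to InsInstrs
  // (index 0); the MachineCombiner uses this mapping when computing depths
  // for the new sequence.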
4253   assert(InstrIdxForVirtReg.empty());
4254   InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4255 
4256   return NewVR;
4257 }
4258 
4259 /// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate
4260 /// instructions with an additional negation of the accumulator
4261 static MachineInstr *genFusedMultiplyAccNeg(
4262     MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
4263     MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
4264     DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
4265     unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
4266   assert(IdxMulOpd == 1);
4267 
4268   Register NewVR =
4269       genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
4270   return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
4271                           FMAInstKind::Accumulator, &NewVR);
4272 }
4273 
4274 /// genFusedMultiplyIdx - Helper to generate indexed (by-element) fused
4275 /// multiply accumulate instructions.
4276 ///
4277 /// \see genFusedMultiply
4278 static MachineInstr *genFusedMultiplyIdx(
4279     MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
4280     MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
4281     unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
4282   return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
4283                           FMAInstKind::Indexed);
4284 }
4285 
4286 /// genFusedMultiplyIdxNeg - Helper to generate indexed fused multiply
4287 /// accumulate instructions with an additional negation of the accumulator
4288 static MachineInstr *genFusedMultiplyIdxNeg(
4289     MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
4290     MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
4291     DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
4292     unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
4293   assert(IdxMulOpd == 1);
4294 
4295   Register NewVR =
4296       genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
4297 
4298   return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
4299                           FMAInstKind::Indexed, &NewVR);
4300 }
4301 
4302 /// genMaddR - Generate madd instruction and combine mul and add using
4303 /// an extra virtual register
4304 /// Example - an ADD intermediate needs to be stored in a register:
4305 ///   MUL I=A,B,0
4306 ///   ADD R,I,Imm
4307 ///   ==> ORR  V, ZR, Imm
4308 ///   ==> MADD R,A,B,V
4309 /// \param MF Containing MachineFunction
4310 /// \param MRI Register information
4311 /// \param TII Target information
4312 /// \param Root is the ADD instruction
4313 /// \param [out] InsInstrs is a vector of machine instructions and will
4314 /// contain the generated madd instruction
4315 /// \param IdxMulOpd is index of operand in Root that is the result of
4316 /// the MUL. In the example above IdxMulOpd is 1.
4317 /// \param MaddOpc the opcode of the madd instruction
4318 /// \param VR is a virtual register that holds the value of an ADD operand
4319 /// (V in the example above).
4320 /// \param RC Register class of operands
4321 static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
4322                               const TargetInstrInfo *TII, MachineInstr &Root,
4323                               SmallVectorImpl<MachineInstr *> &InsInstrs,
4324                               unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR,
4325                               const TargetRegisterClass *RC) {
4326   assert(IdxMulOpd == 1 || IdxMulOpd == 2);
4327 
4328   MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
4329   Register ResultReg = Root.getOperand(0).getReg();
4330   Register SrcReg0 = MUL->getOperand(1).getReg();
4331   bool Src0IsKill = MUL->getOperand(1).isKill();
4332   Register SrcReg1 = MUL->getOperand(2).getReg();
4333   bool Src1IsKill = MUL->getOperand(2).isKill();
4334 
4335   if (Register::isVirtualRegister(ResultReg))
4336     MRI.constrainRegClass(ResultReg, RC);
4337   if (Register::isVirtualRegister(SrcReg0))
4338     MRI.constrainRegClass(SrcReg0, RC);
4339   if (Register::isVirtualRegister(SrcReg1))
4340     MRI.constrainRegClass(SrcReg1, RC);
4341   if (Register::isVirtualRegister(VR))
4342     MRI.constrainRegClass(VR, RC);
4343 
4344   MachineInstrBuilder MIB =
4345       BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
4346           .addReg(SrcReg0, getKillRegState(Src0IsKill))
4347           .addReg(SrcReg1, getKillRegState(Src1IsKill))
4348           .addReg(VR);
4349   // Insert the MADD
4350   InsInstrs.push_back(MIB);
4351   return MUL;
4352 }
4353 
4354 /// When getMachineCombinerPatterns() finds potential patterns,
4355 /// this function generates the instructions that could replace the
4356 /// original code sequence
4357 void AArch64InstrInfo::genAlternativeCodeSequence(
4358     MachineInstr &Root, MachineCombinerPattern Pattern,
4359     SmallVectorImpl<MachineInstr *> &InsInstrs,
4360     SmallVectorImpl<MachineInstr *> &DelInstrs,
4361     DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
4362   MachineBasicBlock &MBB = *Root.getParent();
4363   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4364   MachineFunction &MF = *MBB.getParent();
4365   const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
4366 
4367   MachineInstr *MUL;
4368   const TargetRegisterClass *RC;
4369   unsigned Opc;
4370   switch (Pattern) {
4371   default:
4372     // Reassociate instructions.
4373     TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
4374                                                 DelInstrs, InstrIdxForVirtReg);
4375     return;
4376   case MachineCombinerPattern::MULADDW_OP1:
4377   case MachineCombinerPattern::MULADDX_OP1:
4378     // MUL I=A,B,0
4379     // ADD R,I,C
4380     // ==> MADD R,A,B,C
4381     // --- Create(MADD);
4382     if (Pattern == MachineCombinerPattern::MULADDW_OP1) {
4383       Opc = AArch64::MADDWrrr;
4384       RC = &AArch64::GPR32RegClass;
4385     } else {
4386       Opc = AArch64::MADDXrrr;
4387       RC = &AArch64::GPR64RegClass;
4388     }
4389     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4390     break;
4391   case MachineCombinerPattern::MULADDW_OP2:
4392   case MachineCombinerPattern::MULADDX_OP2:
4393     // MUL I=A,B,0
4394     // ADD R,C,I
4395     // ==> MADD R,A,B,C
4396     // --- Create(MADD);
4397     if (Pattern == MachineCombinerPattern::MULADDW_OP2) {
4398       Opc = AArch64::MADDWrrr;
4399       RC = &AArch64::GPR32RegClass;
4400     } else {
4401       Opc = AArch64::MADDXrrr;
4402       RC = &AArch64::GPR64RegClass;
4403     }
4404     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4405     break;
4406   case MachineCombinerPattern::MULADDWI_OP1:
4407   case MachineCombinerPattern::MULADDXI_OP1: {
4408     // MUL I=A,B,0
4409     // ADD R,I,Imm
4410     // ==> ORR  V, ZR, Imm
4411     // ==> MADD R,A,B,V
4412     // --- Create(MADD);
4413     const TargetRegisterClass *OrrRC;
4414     unsigned BitSize, OrrOpc, ZeroReg;
4415     if (Pattern == MachineCombinerPattern::MULADDWI_OP1) {
4416       OrrOpc = AArch64::ORRWri;
4417       OrrRC = &AArch64::GPR32spRegClass;
4418       BitSize = 32;
4419       ZeroReg = AArch64::WZR;
4420       Opc = AArch64::MADDWrrr;
4421       RC = &AArch64::GPR32RegClass;
4422     } else {
4423       OrrOpc = AArch64::ORRXri;
4424       OrrRC = &AArch64::GPR64spRegClass;
4425       BitSize = 64;
4426       ZeroReg = AArch64::XZR;
4427       Opc = AArch64::MADDXrrr;
4428       RC = &AArch64::GPR64RegClass;
4429     }
4430     Register NewVR = MRI.createVirtualRegister(OrrRC);
4431     uint64_t Imm = Root.getOperand(2).getImm();
4432 
4433     if (Root.getOperand(3).isImm()) {
4434       unsigned Val = Root.getOperand(3).getImm();
4435       Imm = Imm << Val;
4436     }
4437     uint64_t UImm = SignExtend64(Imm, BitSize);
4438     uint64_t Encoding;
4439     if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
4440       MachineInstrBuilder MIB1 =
4441           BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
4442               .addReg(ZeroReg)
4443               .addImm(Encoding);
4444       InsInstrs.push_back(MIB1);
4445       InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4446       MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
4447     }
4448     break;
4449   }
4450   case MachineCombinerPattern::MULSUBW_OP1:
4451   case MachineCombinerPattern::MULSUBX_OP1: {
4452     // MUL I=A,B,0
4453     // SUB R,I, C
4454     // ==> SUB  V, 0, C
4455     // ==> MADD R,A,B,V // = -C + A*B
4456     // --- Create(MADD);
4457     const TargetRegisterClass *SubRC;
4458     unsigned SubOpc, ZeroReg;
4459     if (Pattern == MachineCombinerPattern::MULSUBW_OP1) {
4460       SubOpc = AArch64::SUBWrr;
4461       SubRC = &AArch64::GPR32spRegClass;
4462       ZeroReg = AArch64::WZR;
4463       Opc = AArch64::MADDWrrr;
4464       RC = &AArch64::GPR32RegClass;
4465     } else {
4466       SubOpc = AArch64::SUBXrr;
4467       SubRC = &AArch64::GPR64spRegClass;
4468       ZeroReg = AArch64::XZR;
4469       Opc = AArch64::MADDXrrr;
4470       RC = &AArch64::GPR64RegClass;
4471     }
4472     Register NewVR = MRI.createVirtualRegister(SubRC);
4473     // SUB NewVR, 0, C
4474     MachineInstrBuilder MIB1 =
4475         BuildMI(MF, Root.getDebugLoc(), TII->get(SubOpc), NewVR)
4476             .addReg(ZeroReg)
4477             .add(Root.getOperand(2));
4478     InsInstrs.push_back(MIB1);
4479     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4480     MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
4481     break;
4482   }
4483   case MachineCombinerPattern::MULSUBW_OP2:
4484   case MachineCombinerPattern::MULSUBX_OP2:
4485     // MUL I=A,B,0
4486     // SUB R,C,I
4487     // ==> MSUB R,A,B,C (computes C - A*B)
4488     // --- Create(MSUB);
4489     if (Pattern == MachineCombinerPattern::MULSUBW_OP2) {
4490       Opc = AArch64::MSUBWrrr;
4491       RC = &AArch64::GPR32RegClass;
4492     } else {
4493       Opc = AArch64::MSUBXrrr;
4494       RC = &AArch64::GPR64RegClass;
4495     }
4496     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4497     break;
4498   case MachineCombinerPattern::MULSUBWI_OP1:
4499   case MachineCombinerPattern::MULSUBXI_OP1: {
4500     // MUL I=A,B,0
4501     // SUB R,I, Imm
4502     // ==> ORR  V, ZR, -Imm
4503     // ==> MADD R,A,B,V // = -Imm + A*B
4504     // --- Create(MADD);
4505     const TargetRegisterClass *OrrRC;
4506     unsigned BitSize, OrrOpc, ZeroReg;
4507     if (Pattern == MachineCombinerPattern::MULSUBWI_OP1) {
4508       OrrOpc = AArch64::ORRWri;
4509       OrrRC = &AArch64::GPR32spRegClass;
4510       BitSize = 32;
4511       ZeroReg = AArch64::WZR;
4512       Opc = AArch64::MADDWrrr;
4513       RC = &AArch64::GPR32RegClass;
4514     } else {
4515       OrrOpc = AArch64::ORRXri;
4516       OrrRC = &AArch64::GPR64spRegClass;
4517       BitSize = 64;
4518       ZeroReg = AArch64::XZR;
4519       Opc = AArch64::MADDXrrr;
4520       RC = &AArch64::GPR64RegClass;
4521     }
4522     Register NewVR = MRI.createVirtualRegister(OrrRC);
4523     uint64_t Imm = Root.getOperand(2).getImm();
4524     if (Root.getOperand(3).isImm()) {
4525       unsigned Val = Root.getOperand(3).getImm();
4526       Imm = Imm << Val;
4527     }
4528     uint64_t UImm = SignExtend64(-Imm, BitSize);
4529     uint64_t Encoding;
4530     if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
4531       MachineInstrBuilder MIB1 =
4532           BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
4533               .addReg(ZeroReg)
4534               .addImm(Encoding);
4535       InsInstrs.push_back(MIB1);
4536       InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4537       MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
4538     }
4539     break;
4540   }
4541 
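  // Vector integer multiply-accumulate patterns: an integer vector MUL that
  // feeds an ADD is folded into a single MLA. The _OP1/_OP2 suffix records
  // which ADD operand is produced by the MUL.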
4542   case MachineCombinerPattern::MULADDv8i8_OP1:
4543     Opc = AArch64::MLAv8i8;
4544     RC = &AArch64::FPR64RegClass;
4545     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4546     break;
4547   case MachineCombinerPattern::MULADDv8i8_OP2:
4548     Opc = AArch64::MLAv8i8;
4549     RC = &AArch64::FPR64RegClass;
4550     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4551     break;
4552   case MachineCombinerPattern::MULADDv16i8_OP1:
4553     Opc = AArch64::MLAv16i8;
4554     RC = &AArch64::FPR128RegClass;
4555     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4556     break;
4557   case MachineCombinerPattern::MULADDv16i8_OP2:
4558     Opc = AArch64::MLAv16i8;
4559     RC = &AArch64::FPR128RegClass;
4560     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4561     break;
4562   case MachineCombinerPattern::MULADDv4i16_OP1:
4563     Opc = AArch64::MLAv4i16;
4564     RC = &AArch64::FPR64RegClass;
4565     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4566     break;
4567   case MachineCombinerPattern::MULADDv4i16_OP2:
4568     Opc = AArch64::MLAv4i16;
4569     RC = &AArch64::FPR64RegClass;
4570     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4571     break;
4572   case MachineCombinerPattern::MULADDv8i16_OP1:
4573     Opc = AArch64::MLAv8i16;
4574     RC = &AArch64::FPR128RegClass;
4575     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4576     break;
4577   case MachineCombinerPattern::MULADDv8i16_OP2:
4578     Opc = AArch64::MLAv8i16;
4579     RC = &AArch64::FPR128RegClass;
4580     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4581     break;
4582   case MachineCombinerPattern::MULADDv2i32_OP1:
4583     Opc = AArch64::MLAv2i32;
4584     RC = &AArch64::FPR64RegClass;
4585     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4586     break;
4587   case MachineCombinerPattern::MULADDv2i32_OP2:
4588     Opc = AArch64::MLAv2i32;
4589     RC = &AArch64::FPR64RegClass;
4590     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4591     break;
4592   case MachineCombinerPattern::MULADDv4i32_OP1:
4593     Opc = AArch64::MLAv4i32;
4594     RC = &AArch64::FPR128RegClass;
4595     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4596     break;
4597   case MachineCombinerPattern::MULADDv4i32_OP2:
4598     Opc = AArch64::MLAv4i32;
4599     RC = &AArch64::FPR128RegClass;
4600     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4601     break;
4602 
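  // Vector integer multiply-subtract patterns. For the _OP1 forms
  // (R = A*B - C) there is no single accumulating instruction, so the
  // accumulator C is first negated into a new vreg and an MLA is used:
  // (-C) + A*B. The _OP2 forms (R = C - A*B) map directly onto MLS.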
4603   case MachineCombinerPattern::MULSUBv8i8_OP1:
4604     Opc = AArch64::MLAv8i8;
4605     RC = &AArch64::FPR64RegClass;
4606     MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
4607                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i8,
4608                                  RC);
4609     break;
4610   case MachineCombinerPattern::MULSUBv8i8_OP2:
4611     Opc = AArch64::MLSv8i8;
4612     RC = &AArch64::FPR64RegClass;
4613     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4614     break;
4615   case MachineCombinerPattern::MULSUBv16i8_OP1:
4616     Opc = AArch64::MLAv16i8;
4617     RC = &AArch64::FPR128RegClass;
4618     MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
4619                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv16i8,
4620                                  RC);
4621     break;
4622   case MachineCombinerPattern::MULSUBv16i8_OP2:
4623     Opc = AArch64::MLSv16i8;
4624     RC = &AArch64::FPR128RegClass;
4625     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4626     break;
4627   case MachineCombinerPattern::MULSUBv4i16_OP1:
4628     Opc = AArch64::MLAv4i16;
4629     RC = &AArch64::FPR64RegClass;
4630     MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
4631                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
4632                                  RC);
4633     break;
4634   case MachineCombinerPattern::MULSUBv4i16_OP2:
4635     Opc = AArch64::MLSv4i16;
4636     RC = &AArch64::FPR64RegClass;
4637     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4638     break;
4639   case MachineCombinerPattern::MULSUBv8i16_OP1:
4640     Opc = AArch64::MLAv8i16;
4641     RC = &AArch64::FPR128RegClass;
4642     MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
4643                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
4644                                  RC);
4645     break;
4646   case MachineCombinerPattern::MULSUBv8i16_OP2:
4647     Opc = AArch64::MLSv8i16;
4648     RC = &AArch64::FPR128RegClass;
4649     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4650     break;
4651   case MachineCombinerPattern::MULSUBv2i32_OP1:
4652     Opc = AArch64::MLAv2i32;
4653     RC = &AArch64::FPR64RegClass;
4654     MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
4655                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
4656                                  RC);
4657     break;
4658   case MachineCombinerPattern::MULSUBv2i32_OP2:
4659     Opc = AArch64::MLSv2i32;
4660     RC = &AArch64::FPR64RegClass;
4661     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4662     break;
4663   case MachineCombinerPattern::MULSUBv4i32_OP1:
4664     Opc = AArch64::MLAv4i32;
4665     RC = &AArch64::FPR128RegClass;
4666     MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
4667                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
4668                                  RC);
4669     break;
4670   case MachineCombinerPattern::MULSUBv4i32_OP2:
4671     Opc = AArch64::MLSv4i32;
4672     RC = &AArch64::FPR128RegClass;
4673     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4674     break;
4675 
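  // Lane-indexed multiply-accumulate patterns: these use the MLAv*_indexed
  // opcodes via genFusedMultiplyIdx (FMAInstKind::Indexed), so the lane index
  // of the original indexed multiply is carried over.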
4676   case MachineCombinerPattern::MULADDv4i16_indexed_OP1:
4677     Opc = AArch64::MLAv4i16_indexed;
4678     RC = &AArch64::FPR64RegClass;
4679     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4680     break;
4681   case MachineCombinerPattern::MULADDv4i16_indexed_OP2:
4682     Opc = AArch64::MLAv4i16_indexed;
4683     RC = &AArch64::FPR64RegClass;
4684     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4685     break;
4686   case MachineCombinerPattern::MULADDv8i16_indexed_OP1:
4687     Opc = AArch64::MLAv8i16_indexed;
4688     RC = &AArch64::FPR128RegClass;
4689     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4690     break;
4691   case MachineCombinerPattern::MULADDv8i16_indexed_OP2:
4692     Opc = AArch64::MLAv8i16_indexed;
4693     RC = &AArch64::FPR128RegClass;
4694     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4695     break;
4696   case MachineCombinerPattern::MULADDv2i32_indexed_OP1:
4697     Opc = AArch64::MLAv2i32_indexed;
4698     RC = &AArch64::FPR64RegClass;
4699     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4700     break;
4701   case MachineCombinerPattern::MULADDv2i32_indexed_OP2:
4702     Opc = AArch64::MLAv2i32_indexed;
4703     RC = &AArch64::FPR64RegClass;
4704     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4705     break;
4706   case MachineCombinerPattern::MULADDv4i32_indexed_OP1:
4707     Opc = AArch64::MLAv4i32_indexed;
4708     RC = &AArch64::FPR128RegClass;
4709     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4710     break;
4711   case MachineCombinerPattern::MULADDv4i32_indexed_OP2:
4712     Opc = AArch64::MLAv4i32_indexed;
4713     RC = &AArch64::FPR128RegClass;
4714     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4715     break;
4716 
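  // Lane-indexed multiply-subtract: as with the plain vector forms, the _OP1
  // patterns negate the accumulator and use an indexed MLA, while the _OP2
  // patterns map onto the indexed MLS opcodes.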
4717   case MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
4718     Opc = AArch64::MLAv4i16_indexed;
4719     RC = &AArch64::FPR64RegClass;
4720     MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
4721                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
4722                                  RC);
4723     break;
4724   case MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
4725     Opc = AArch64::MLSv4i16_indexed;
4726     RC = &AArch64::FPR64RegClass;
4727     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4728     break;
4729   case MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
4730     Opc = AArch64::MLAv8i16_indexed;
4731     RC = &AArch64::FPR128RegClass;
4732     MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
4733                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
4734                                  RC);
4735     break;
4736   case MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
4737     Opc = AArch64::MLSv8i16_indexed;
4738     RC = &AArch64::FPR128RegClass;
4739     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4740     break;
4741   case MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
4742     Opc = AArch64::MLAv2i32_indexed;
4743     RC = &AArch64::FPR64RegClass;
4744     MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
4745                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
4746                                  RC);
4747     break;
4748   case MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
4749     Opc = AArch64::MLSv2i32_indexed;
4750     RC = &AArch64::FPR64RegClass;
4751     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4752     break;
4753   case MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
4754     Opc = AArch64::MLAv4i32_indexed;
4755     RC = &AArch64::FPR128RegClass;
4756     MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
4757                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
4758                                  RC);
4759     break;
4760   case MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
4761     Opc = AArch64::MLSv4i32_indexed;
4762     RC = &AArch64::FPR128RegClass;
4763     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4764     break;
4765 
4766   // Floating Point Support
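  // Scalar FP multiply-add: FMADD computes Ra + Rn*Rm, so both the _OP1 and
  // _OP2 forms map onto it directly; only the operand index of the multiply
  // differs.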
4767   case MachineCombinerPattern::FMULADDH_OP1:
4768     Opc = AArch64::FMADDHrrr;
4769     RC = &AArch64::FPR16RegClass;
4770     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4771     break;
4772   case MachineCombinerPattern::FMULADDS_OP1:
4773     Opc = AArch64::FMADDSrrr;
4774     RC = &AArch64::FPR32RegClass;
4775     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4776     break;
4777   case MachineCombinerPattern::FMULADDD_OP1:
4778     Opc = AArch64::FMADDDrrr;
4779     RC = &AArch64::FPR64RegClass;
4780     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4781     break;
4782 
4783   case MachineCombinerPattern::FMULADDH_OP2:
4784     Opc = AArch64::FMADDHrrr;
4785     RC = &AArch64::FPR16RegClass;
4786     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4787     break;
4788   case MachineCombinerPattern::FMULADDS_OP2:
4789     Opc = AArch64::FMADDSrrr;
4790     RC = &AArch64::FPR32RegClass;
4791     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4792     break;
4793   case MachineCombinerPattern::FMULADDD_OP2:
4794     Opc = AArch64::FMADDDrrr;
4795     RC = &AArch64::FPR64RegClass;
4796     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4797     break;
4798 
4799   case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
4800     Opc = AArch64::FMLAv1i32_indexed;
4801     RC = &AArch64::FPR32RegClass;
4802     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4803                            FMAInstKind::Indexed);
4804     break;
4805   case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
4806     Opc = AArch64::FMLAv1i32_indexed;
4807     RC = &AArch64::FPR32RegClass;
4808     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4809                            FMAInstKind::Indexed);
4810     break;
4811 
4812   case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
4813     Opc = AArch64::FMLAv1i64_indexed;
4814     RC = &AArch64::FPR64RegClass;
4815     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4816                            FMAInstKind::Indexed);
4817     break;
4818   case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
4819     Opc = AArch64::FMLAv1i64_indexed;
4820     RC = &AArch64::FPR64RegClass;
4821     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4822                            FMAInstKind::Indexed);
4823     break;
4824 
4825   case MachineCombinerPattern::FMLAv4i16_indexed_OP1:
4826     RC = &AArch64::FPR64RegClass;
4827     Opc = AArch64::FMLAv4i16_indexed;
4828     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4829                            FMAInstKind::Indexed);
4830     break;
4831   case MachineCombinerPattern::FMLAv4f16_OP1:
4832     RC = &AArch64::FPR64RegClass;
4833     Opc = AArch64::FMLAv4f16;
4834     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4835                            FMAInstKind::Accumulator);
4836     break;
4837   case MachineCombinerPattern::FMLAv4i16_indexed_OP2:
4838     RC = &AArch64::FPR64RegClass;
4839     Opc = AArch64::FMLAv4i16_indexed;
4840     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4841                            FMAInstKind::Indexed);
4842     break;
4843   case MachineCombinerPattern::FMLAv4f16_OP2:
4844     RC = &AArch64::FPR64RegClass;
4845     Opc = AArch64::FMLAv4f16;
4846     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4847                            FMAInstKind::Accumulator);
4848     break;
4849 
4850   case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
4851   case MachineCombinerPattern::FMLAv2f32_OP1:
4852     RC = &AArch64::FPR64RegClass;
4853     if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP1) {
4854       Opc = AArch64::FMLAv2i32_indexed;
4855       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4856                              FMAInstKind::Indexed);
4857     } else {
4858       Opc = AArch64::FMLAv2f32;
4859       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4860                              FMAInstKind::Accumulator);
4861     }
4862     break;
4863   case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
4864   case MachineCombinerPattern::FMLAv2f32_OP2:
4865     RC = &AArch64::FPR64RegClass;
4866     if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP2) {
4867       Opc = AArch64::FMLAv2i32_indexed;
4868       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4869                              FMAInstKind::Indexed);
4870     } else {
4871       Opc = AArch64::FMLAv2f32;
4872       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4873                              FMAInstKind::Accumulator);
4874     }
4875     break;
4876 
4877   case MachineCombinerPattern::FMLAv8i16_indexed_OP1:
4878     RC = &AArch64::FPR128RegClass;
4879     Opc = AArch64::FMLAv8i16_indexed;
4880     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4881                            FMAInstKind::Indexed);
4882     break;
4883   case MachineCombinerPattern::FMLAv8f16_OP1:
4884     RC = &AArch64::FPR128RegClass;
4885     Opc = AArch64::FMLAv8f16;
4886     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4887                            FMAInstKind::Accumulator);
4888     break;
4889   case MachineCombinerPattern::FMLAv8i16_indexed_OP2:
4890     RC = &AArch64::FPR128RegClass;
4891     Opc = AArch64::FMLAv8i16_indexed;
4892     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4893                            FMAInstKind::Indexed);
4894     break;
4895   case MachineCombinerPattern::FMLAv8f16_OP2:
4896     RC = &AArch64::FPR128RegClass;
4897     Opc = AArch64::FMLAv8f16;
4898     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4899                            FMAInstKind::Accumulator);
4900     break;
4901 
4902   case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
4903   case MachineCombinerPattern::FMLAv2f64_OP1:
4904     RC = &AArch64::FPR128RegClass;
4905     if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP1) {
4906       Opc = AArch64::FMLAv2i64_indexed;
4907       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4908                              FMAInstKind::Indexed);
4909     } else {
4910       Opc = AArch64::FMLAv2f64;
4911       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4912                              FMAInstKind::Accumulator);
4913     }
4914     break;
4915   case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
4916   case MachineCombinerPattern::FMLAv2f64_OP2:
4917     RC = &AArch64::FPR128RegClass;
4918     if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP2) {
4919       Opc = AArch64::FMLAv2i64_indexed;
4920       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4921                              FMAInstKind::Indexed);
4922     } else {
4923       Opc = AArch64::FMLAv2f64;
4924       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4925                              FMAInstKind::Accumulator);
4926     }
4927     break;
4928 
4929   case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
4930   case MachineCombinerPattern::FMLAv4f32_OP1:
4931     RC = &AArch64::FPR128RegClass;
4932     if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP1) {
4933       Opc = AArch64::FMLAv4i32_indexed;
4934       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4935                              FMAInstKind::Indexed);
4936     } else {
4937       Opc = AArch64::FMLAv4f32;
4938       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4939                              FMAInstKind::Accumulator);
4940     }
4941     break;
4942 
4943   case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
4944   case MachineCombinerPattern::FMLAv4f32_OP2:
4945     RC = &AArch64::FPR128RegClass;
4946     if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP2) {
4947       Opc = AArch64::FMLAv4i32_indexed;
4948       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4949                              FMAInstKind::Indexed);
4950     } else {
4951       Opc = AArch64::FMLAv4f32;
4952       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4953                              FMAInstKind::Accumulator);
4954     }
4955     break;
4956 
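  // Scalar FP multiply-subtract: FMULSUB_OP1 (R = A*B - C) uses FNMSUB
  // (Rn*Rm - Ra), FNMULSUB_OP1 (R = -(A*B) - C) uses FNMADD (-Ra - Rn*Rm),
  // and FMULSUB_OP2 (R = C - A*B) uses FMSUB (Ra - Rn*Rm).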
4957   case MachineCombinerPattern::FMULSUBH_OP1:
4958     Opc = AArch64::FNMSUBHrrr;
4959     RC = &AArch64::FPR16RegClass;
4960     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4961     break;
4962   case MachineCombinerPattern::FMULSUBS_OP1:
4963     Opc = AArch64::FNMSUBSrrr;
4964     RC = &AArch64::FPR32RegClass;
4965     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4966     break;
4967   case MachineCombinerPattern::FMULSUBD_OP1:
4968     Opc = AArch64::FNMSUBDrrr;
4969     RC = &AArch64::FPR64RegClass;
4970     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4971     break;
4972 
4973   case MachineCombinerPattern::FNMULSUBH_OP1:
4974     Opc = AArch64::FNMADDHrrr;
4975     RC = &AArch64::FPR16RegClass;
4976     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4977     break;
4978   case MachineCombinerPattern::FNMULSUBS_OP1:
4979     Opc = AArch64::FNMADDSrrr;
4980     RC = &AArch64::FPR32RegClass;
4981     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4982     break;
4983   case MachineCombinerPattern::FNMULSUBD_OP1:
4984     Opc = AArch64::FNMADDDrrr;
4985     RC = &AArch64::FPR64RegClass;
4986     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4987     break;
4988 
4989   case MachineCombinerPattern::FMULSUBH_OP2:
4990     Opc = AArch64::FMSUBHrrr;
4991     RC = &AArch64::FPR16RegClass;
4992     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4993     break;
4994   case MachineCombinerPattern::FMULSUBS_OP2:
4995     Opc = AArch64::FMSUBSrrr;
4996     RC = &AArch64::FPR32RegClass;
4997     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4998     break;
4999   case MachineCombinerPattern::FMULSUBD_OP2:
5000     Opc = AArch64::FMSUBDrrr;
5001     RC = &AArch64::FPR64RegClass;
5002     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
5003     break;
5004 
5005   case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
5006     Opc = AArch64::FMLSv1i32_indexed;
5007     RC = &AArch64::FPR32RegClass;
5008     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5009                            FMAInstKind::Indexed);
5010     break;
5011 
5012   case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
5013     Opc = AArch64::FMLSv1i64_indexed;
5014     RC = &AArch64::FPR64RegClass;
5015     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5016                            FMAInstKind::Indexed);
5017     break;
5018 
5019   case MachineCombinerPattern::FMLSv4f16_OP1:
5020   case MachineCombinerPattern::FMLSv4i16_indexed_OP1: {
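    // The FMLS..._OP1 patterns compute A*B - C, which no single accumulating
    // instruction provides, so the addend C is negated with FNEG into a new
    // vreg and an FMLA accumulates into it: (-C) + A*B.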
5021     RC = &AArch64::FPR64RegClass;
5022     Register NewVR = MRI.createVirtualRegister(RC);
5023     MachineInstrBuilder MIB1 =
5024         BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv4f16), NewVR)
5025             .add(Root.getOperand(2));
5026     InsInstrs.push_back(MIB1);
5027     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
5028     if (Pattern == MachineCombinerPattern::FMLSv4f16_OP1) {
5029       Opc = AArch64::FMLAv4f16;
5030       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5031                              FMAInstKind::Accumulator, &NewVR);
5032     } else {
5033       Opc = AArch64::FMLAv4i16_indexed;
5034       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5035                              FMAInstKind::Indexed, &NewVR);
5036     }
5037     break;
5038   }
5039   case MachineCombinerPattern::FMLSv4f16_OP2:
5040     RC = &AArch64::FPR64RegClass;
5041     Opc = AArch64::FMLSv4f16;
5042     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5043                            FMAInstKind::Accumulator);
5044     break;
5045   case MachineCombinerPattern::FMLSv4i16_indexed_OP2:
5046     RC = &AArch64::FPR64RegClass;
5047     Opc = AArch64::FMLSv4i16_indexed;
5048     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5049                            FMAInstKind::Indexed);
5050     break;
5051 
5052   case MachineCombinerPattern::FMLSv2f32_OP2:
5053   case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
5054     RC = &AArch64::FPR64RegClass;
5055     if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP2) {
5056       Opc = AArch64::FMLSv2i32_indexed;
5057       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5058                              FMAInstKind::Indexed);
5059     } else {
5060       Opc = AArch64::FMLSv2f32;
5061       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5062                              FMAInstKind::Accumulator);
5063     }
5064     break;
5065 
5066   case MachineCombinerPattern::FMLSv8f16_OP1:
5067   case MachineCombinerPattern::FMLSv8i16_indexed_OP1: {
5068     RC = &AArch64::FPR128RegClass;
5069     Register NewVR = MRI.createVirtualRegister(RC);
5070     MachineInstrBuilder MIB1 =
5071         BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv8f16), NewVR)
5072             .add(Root.getOperand(2));
5073     InsInstrs.push_back(MIB1);
5074     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
5075     if (Pattern == MachineCombinerPattern::FMLSv8f16_OP1) {
5076       Opc = AArch64::FMLAv8f16;
5077       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5078                              FMAInstKind::Accumulator, &NewVR);
5079     } else {
5080       Opc = AArch64::FMLAv8i16_indexed;
5081       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5082                              FMAInstKind::Indexed, &NewVR);
5083     }
5084     break;
5085   }
5086   case MachineCombinerPattern::FMLSv8f16_OP2:
5087     RC = &AArch64::FPR128RegClass;
5088     Opc = AArch64::FMLSv8f16;
5089     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5090                            FMAInstKind::Accumulator);
5091     break;
5092   case MachineCombinerPattern::FMLSv8i16_indexed_OP2:
5093     RC = &AArch64::FPR128RegClass;
5094     Opc = AArch64::FMLSv8i16_indexed;
5095     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5096                            FMAInstKind::Indexed);
5097     break;
5098 
5099   case MachineCombinerPattern::FMLSv2f64_OP2:
5100   case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
5101     RC = &AArch64::FPR128RegClass;
5102     if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP2) {
5103       Opc = AArch64::FMLSv2i64_indexed;
5104       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5105                              FMAInstKind::Indexed);
5106     } else {
5107       Opc = AArch64::FMLSv2f64;
5108       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5109                              FMAInstKind::Accumulator);
5110     }
5111     break;
5112 
5113   case MachineCombinerPattern::FMLSv4f32_OP2:
5114   case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
5115     RC = &AArch64::FPR128RegClass;
5116     if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP2) {
5117       Opc = AArch64::FMLSv4i32_indexed;
5118       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5119                              FMAInstKind::Indexed);
5120     } else {
5121       Opc = AArch64::FMLSv4f32;
5122       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
5123                              FMAInstKind::Accumulator);
5124     }
5125     break;
5126   case MachineCombinerPattern::FMLSv2f32_OP1:
5127   case MachineCombinerPattern::FMLSv2i32_indexed_OP1: {
5128     RC = &AArch64::FPR64RegClass;
5129     Register NewVR = MRI.createVirtualRegister(RC);
5130     MachineInstrBuilder MIB1 =
5131         BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f32), NewVR)
5132             .add(Root.getOperand(2));
5133     InsInstrs.push_back(MIB1);
5134     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
5135     if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP1) {
5136       Opc = AArch64::FMLAv2i32_indexed;
5137       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5138                              FMAInstKind::Indexed, &NewVR);
5139     } else {
5140       Opc = AArch64::FMLAv2f32;
5141       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5142                              FMAInstKind::Accumulator, &NewVR);
5143     }
5144     break;
5145   }
5146   case MachineCombinerPattern::FMLSv4f32_OP1:
5147   case MachineCombinerPattern::FMLSv4i32_indexed_OP1: {
5148     RC = &AArch64::FPR128RegClass;
5149     Register NewVR = MRI.createVirtualRegister(RC);
5150     MachineInstrBuilder MIB1 =
5151         BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv4f32), NewVR)
5152             .add(Root.getOperand(2));
5153     InsInstrs.push_back(MIB1);
5154     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
5155     if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP1) {
5156       Opc = AArch64::FMLAv4i32_indexed;
5157       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5158                              FMAInstKind::Indexed, &NewVR);
5159     } else {
5160       Opc = AArch64::FMLAv4f32;
5161       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5162                              FMAInstKind::Accumulator, &NewVR);
5163     }
5164     break;
5165   }
5166   case MachineCombinerPattern::FMLSv2f64_OP1:
5167   case MachineCombinerPattern::FMLSv2i64_indexed_OP1: {
5168     RC = &AArch64::FPR128RegClass;
5169     Register NewVR = MRI.createVirtualRegister(RC);
5170     MachineInstrBuilder MIB1 =
5171         BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f64), NewVR)
5172             .add(Root.getOperand(2));
5173     InsInstrs.push_back(MIB1);
5174     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
5175     if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP1) {
5176       Opc = AArch64::FMLAv2i64_indexed;
5177       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5178                              FMAInstKind::Indexed, &NewVR);
5179     } else {
5180       Opc = AArch64::FMLAv2f64;
5181       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
5182                              FMAInstKind::Accumulator, &NewVR);
5183     }
5184     break;
5185   }
5186   } // end switch (Pattern)
5187   // Record MUL and ADD/SUB for deletion
5188   DelInstrs.push_back(MUL);
5189   DelInstrs.push_back(&Root);
5190 }
5191 
5192 /// Replace a csinc-branch sequence by a simple conditional branch
5193 ///
5194 /// Examples:
5195 /// 1. \code
5196 ///   csinc  w9, wzr, wzr, <condition code>
5197 ///   tbnz   w9, #0, 0x44
5198 ///    \endcode
5199 /// to
5200 ///    \code
5201 ///   b.<inverted condition code>
5202 ///    \endcode
5203 ///
5204 /// 2. \code
5205 ///   csinc w9, wzr, wzr, <condition code>
5206 ///   tbz   w9, #0, 0x44
5207 ///    \endcode
5208 /// to
5209 ///    \code
5210 ///   b.<condition code>
5211 ///    \endcode
5212 ///
5213 /// Replace a compare and branch sequence by a TBZ/TBNZ instruction when the
5214 /// compare's constant operand is a power of 2.
5215 ///
5216 /// Examples:
5217 ///    \code
5218 ///   and  w8, w8, #0x400
5219 ///   cbnz w8, L1
5220 ///    \endcode
5221 /// to
5222 ///    \code
5223 ///   tbnz w8, #10, L1
5224 ///    \endcode
5225 ///
5226 /// \param  MI Conditional Branch
5227 /// \return True when the simple conditional branch is generated
5228 ///
5229 bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
5230   bool IsNegativeBranch = false;
5231   bool IsTestAndBranch = false;
5232   unsigned TargetBBInMI = 0;
5233   switch (MI.getOpcode()) {
5234   default:
5235     llvm_unreachable("Unknown branch instruction?");
5236   case AArch64::Bcc:
5237     return false;
5238   case AArch64::CBZW:
5239   case AArch64::CBZX:
5240     TargetBBInMI = 1;
5241     break;
5242   case AArch64::CBNZW:
5243   case AArch64::CBNZX:
5244     TargetBBInMI = 1;
5245     IsNegativeBranch = true;
5246     break;
5247   case AArch64::TBZW:
5248   case AArch64::TBZX:
5249     TargetBBInMI = 2;
5250     IsTestAndBranch = true;
5251     break;
5252   case AArch64::TBNZW:
5253   case AArch64::TBNZX:
5254     TargetBBInMI = 2;
5255     IsNegativeBranch = true;
5256     IsTestAndBranch = true;
5257     break;
5258   }
5259   // So we increment a zero register and test for bits other
5260   // than bit 0? Conservatively bail out in case the verifier
5261   // missed this case.
5262   if (IsTestAndBranch && MI.getOperand(1).getImm())
5263     return false;
5264 
5265   // Find Definition.
5266   assert(MI.getParent() && "Incomplete machine instruction\n");
5267   MachineBasicBlock *MBB = MI.getParent();
5268   MachineFunction *MF = MBB->getParent();
5269   MachineRegisterInfo *MRI = &MF->getRegInfo();
5270   Register VReg = MI.getOperand(0).getReg();
5271   if (!Register::isVirtualRegister(VReg))
5272     return false;
5273 
5274   MachineInstr *DefMI = MRI->getVRegDef(VReg);
5275 
5276   // Look through COPY instructions to find definition.
5277   while (DefMI->isCopy()) {
5278     Register CopyVReg = DefMI->getOperand(1).getReg();
5279     if (!MRI->hasOneNonDBGUse(CopyVReg))
5280       return false;
5281     if (!MRI->hasOneDef(CopyVReg))
5282       return false;
5283     DefMI = MRI->getVRegDef(CopyVReg);
5284   }
5285 
5286   switch (DefMI->getOpcode()) {
5287   default:
5288     return false;
5289   // Fold AND into a TBZ/TBNZ if the constant operand is a power of 2.
5290   case AArch64::ANDWri:
5291   case AArch64::ANDXri: {
5292     if (IsTestAndBranch)
5293       return false;
5294     if (DefMI->getParent() != MBB)
5295       return false;
5296     if (!MRI->hasOneNonDBGUse(VReg))
5297       return false;
5298 
5299     bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
5300     uint64_t Mask = AArch64_AM::decodeLogicalImmediate(
5301         DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64);
5302     if (!isPowerOf2_64(Mask))
5303       return false;
5304 
5305     MachineOperand &MO = DefMI->getOperand(1);
5306     Register NewReg = MO.getReg();
5307     if (!Register::isVirtualRegister(NewReg))
5308       return false;
5309 
5310     assert(!MRI->def_empty(NewReg) && "Register must be defined.");
5311 
5312     MachineBasicBlock &RefToMBB = *MBB;
5313     MachineBasicBlock *TBB = MI.getOperand(1).getMBB();
5314     DebugLoc DL = MI.getDebugLoc();
5315     unsigned Imm = Log2_64(Mask);
5316     unsigned Opc = (Imm < 32)
5317                        ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
5318                        : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
5319     MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc))
5320                               .addReg(NewReg)
5321                               .addImm(Imm)
5322                               .addMBB(TBB);
5323     // Register lives on to the new TBZ/TBNZ now.
5324     MO.setIsKill(false);
5325 
5326     // For immediates smaller than 32, we need to use the 32-bit
5327     // variant (W) in all cases. Indeed, the 64-bit variant does not
5328     // allow encoding them.
5329     // Therefore, if the input register is 64-bit, we need to take the
5330     // 32-bit sub-part.
5331     if (!Is32Bit && Imm < 32)
5332       NewMI->getOperand(0).setSubReg(AArch64::sub_32);
5333     MI.eraseFromParent();
5334     return true;
5335   }
5336   // Look for CSINC
5337   case AArch64::CSINCWr:
5338   case AArch64::CSINCXr: {
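    // CSINC Rd, ZR, ZR, cc materializes !cc as 0/1 (0 when cc holds, 1
    // otherwise), so a branch on the zero-ness (or bit 0) of Rd can be
    // rewritten as a Bcc on cc, inverted for the CBNZ/TBNZ forms.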
5339     if (!(DefMI->getOperand(1).getReg() == AArch64::WZR &&
5340           DefMI->getOperand(2).getReg() == AArch64::WZR) &&
5341         !(DefMI->getOperand(1).getReg() == AArch64::XZR &&
5342           DefMI->getOperand(2).getReg() == AArch64::XZR))
5343       return false;
5344 
5345     if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1)
5346       return false;
5347 
5348     AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
5349     // Convert only when the condition code is not modified between
5350     // the CSINC and the branch. The CC may be used by other
5351     // instructions in between.
5352     if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write))
5353       return false;
5354     MachineBasicBlock &RefToMBB = *MBB;
5355     MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB();
5356     DebugLoc DL = MI.getDebugLoc();
5357     if (IsNegativeBranch)
5358       CC = AArch64CC::getInvertedCondCode(CC);
5359     BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);
5360     MI.eraseFromParent();
5361     return true;
5362   }
5363   }
5364 }
5365 
5366 std::pair<unsigned, unsigned>
5367 AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
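  // MO_FRAGMENT masks the mutually exclusive address-fragment kind (e.g.
  // MO_PAGE, MO_PAGEOFF, MO_G0-MO_G3, MO_HI12); the bits outside the mask are
  // independent flags such as MO_GOT or MO_NC.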
5368   const unsigned Mask = AArch64II::MO_FRAGMENT;
5369   return std::make_pair(TF & Mask, TF & ~Mask);
5370 }
5371 
5372 ArrayRef<std::pair<unsigned, const char *>>
5373 AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
5374   using namespace AArch64II;
5375 
5376   static const std::pair<unsigned, const char *> TargetFlags[] = {
5377       {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"},
5378       {MO_G3, "aarch64-g3"},     {MO_G2, "aarch64-g2"},
5379       {MO_G1, "aarch64-g1"},     {MO_G0, "aarch64-g0"},
5380       {MO_HI12, "aarch64-hi12"}};
5381   return makeArrayRef(TargetFlags);
5382 }
5383 
5384 ArrayRef<std::pair<unsigned, const char *>>
5385 AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
5386   using namespace AArch64II;
5387 
5388   static const std::pair<unsigned, const char *> TargetFlags[] = {
5389       {MO_COFFSTUB, "aarch64-coffstub"},
5390       {MO_GOT, "aarch64-got"},
5391       {MO_NC, "aarch64-nc"},
5392       {MO_S, "aarch64-s"},
5393       {MO_TLS, "aarch64-tls"},
5394       {MO_DLLIMPORT, "aarch64-dllimport"},
5395       {MO_PREL, "aarch64-prel"},
5396       {MO_TAGGED, "aarch64-tagged"}};
5397   return makeArrayRef(TargetFlags);
5398 }
5399 
5400 ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
5401 AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
5402   static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
5403       {{MOSuppressPair, "aarch64-suppress-pair"},
5404        {MOStridedAccess, "aarch64-strided-access"}};
5405   return makeArrayRef(TargetFlags);
5406 }
5407 
5408 /// Constants defining how certain sequences should be outlined.
5409 /// This encompasses how an outlined function should be called, and what kind of
5410 /// frame should be emitted for that outlined function.
5411 ///
5412 /// \p MachineOutlinerDefault implies that the function should be called with
5413 /// a save and restore of LR to the stack.
5414 ///
5415 /// That is,
5416 ///
5417 /// I1     Save LR                    OUTLINED_FUNCTION:
5418 /// I2 --> BL OUTLINED_FUNCTION       I1
5419 /// I3     Restore LR                 I2
5420 ///                                   I3
5421 ///                                   RET
5422 ///
5423 /// * Call construction overhead: 3 (save + BL + restore)
5424 /// * Frame construction overhead: 1 (ret)
5425 /// * Requires stack fixups? Yes
5426 ///
5427 /// \p MachineOutlinerTailCall implies that the function is being created from
5428 /// a sequence of instructions ending in a return.
5429 ///
5430 /// That is,
5431 ///
5432 /// I1                             OUTLINED_FUNCTION:
5433 /// I2 --> B OUTLINED_FUNCTION     I1
5434 /// RET                            I2
5435 ///                                RET
5436 ///
5437 /// * Call construction overhead: 1 (B)
5438 /// * Frame construction overhead: 0 (Return included in sequence)
5439 /// * Requires stack fixups? No
5440 ///
5441 /// \p MachineOutlinerNoLRSave implies that the function should be called using
5442 /// a BL instruction, but doesn't require LR to be saved and restored. This
5443 /// happens when LR is known to be dead.
5444 ///
5445 /// That is,
5446 ///
5447 /// I1                                OUTLINED_FUNCTION:
5448 /// I2 --> BL OUTLINED_FUNCTION       I1
5449 /// I3                                I2
5450 ///                                   I3
5451 ///                                   RET
5452 ///
5453 /// * Call construction overhead: 1 (BL)
5454 /// * Frame construction overhead: 1 (RET)
5455 /// * Requires stack fixups? No
5456 ///
5457 /// \p MachineOutlinerThunk implies that the function is being created from
5458 /// a sequence of instructions ending in a call. The outlined function is
5459 /// called with a BL instruction, and the outlined function tail-calls the
5460 /// original call destination.
5461 ///
5462 /// That is,
5463 ///
5464 /// I1                                OUTLINED_FUNCTION:
5465 /// I2 --> BL OUTLINED_FUNCTION       I1
5466 /// BL f                              I2
5467 ///                                   B f
5468 /// * Call construction overhead: 1 (BL)
5469 /// * Frame construction overhead: 0
5470 /// * Requires stack fixups? No
5471 ///
5472 /// \p MachineOutlinerRegSave implies that the function should be called with a
5473 /// save and restore of LR to an available register. This allows us to avoid
5474 /// stack fixups. Note that this outlining variant is compatible with the
5475 /// NoLRSave case.
5476 ///
5477 /// That is,
5478 ///
5479 /// I1     Save LR                    OUTLINED_FUNCTION:
5480 /// I2 --> BL OUTLINED_FUNCTION       I1
5481 /// I3     Restore LR                 I2
5482 ///                                   I3
5483 ///                                   RET
5484 ///
5485 /// * Call construction overhead: 3 (save + BL + restore)
5486 /// * Frame construction overhead: 1 (ret)
5487 /// * Requires stack fixups? No
5488 enum MachineOutlinerClass {
5489   MachineOutlinerDefault,  /// Emit a save, restore, call, and return.
5490   MachineOutlinerTailCall, /// Only emit a branch.
5491   MachineOutlinerNoLRSave, /// Emit a call and return.
5492   MachineOutlinerThunk,    /// Emit a call and tail-call.
5493   MachineOutlinerRegSave   /// Same as default, but save to a register.
5494 };
5495 
5496 enum MachineOutlinerMBBFlags {
5497   LRUnavailableSomewhere = 0x2,
5498   HasCalls = 0x4,
5499   UnsafeRegsDead = 0x8
5500 };
5501 
5502 unsigned
5503 AArch64InstrInfo::findRegisterToSaveLRTo(const outliner::Candidate &C) const {
5504   assert(C.LRUWasSet && "LRU wasn't set?");
5505   MachineFunction *MF = C.getMF();
5506   const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>(
5507       MF->getSubtarget().getRegisterInfo());
5508 
5509   // Check if there is an available register across the sequence that we can
5510   // use.
5511   for (unsigned Reg : AArch64::GPR64RegClass) {
5512     if (!ARI->isReservedReg(*MF, Reg) &&
5513         Reg != AArch64::LR &&  // LR is not reserved, but don't use it.
5514         Reg != AArch64::X16 && // X16 is not guaranteed to be preserved.
5515         Reg != AArch64::X17 && // Ditto for X17.
5516         C.LRU.available(Reg) && C.UsedInSequence.available(Reg))
5517       return Reg;
5518   }
5519 
5520   // No suitable register. Return 0.
5521   return 0u;
5522 }
5523 
5524 static bool
5525 outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a,
5526                                          const outliner::Candidate &b) {
5527   const Function &Fa = a.getMF()->getFunction();
5528   const Function &Fb = b.getMF()->getFunction();
5529 
5530   // If neither function has the "sign-return-address" attribute, their
5531   // signing behaviour is equal.
5532   if (!Fa.hasFnAttribute("sign-return-address") &&
5533       !Fb.hasFnAttribute("sign-return-address")) {
5534     return true;
5535   }
5536 
5537   // If both functions have the "sign-return-address" attribute, their signing
5538   // behaviour is equal if the values of the attributes are equal.
5539   if (Fa.hasFnAttribute("sign-return-address") &&
5540       Fb.hasFnAttribute("sign-return-address")) {
5541     StringRef ScopeA =
5542         Fa.getFnAttribute("sign-return-address").getValueAsString();
5543     StringRef ScopeB =
5544         Fb.getFnAttribute("sign-return-address").getValueAsString();
5545     return ScopeA.equals(ScopeB);
5546   }
5547 
5548   // If function B doesn't have the "sign-return-address" attribute but A does,
5549   // the functions' signing behaviour is equal if A's value for
5550   // "sign-return-address" is "none" and vice versa.
5551   if (Fa.hasFnAttribute("sign-return-address")) {
5552     StringRef ScopeA =
5553         Fa.getFnAttribute("sign-return-address").getValueAsString();
5554     return ScopeA.equals("none");
5555   }
5556 
5557   if (Fb.hasFnAttribute("sign-return-address")) {
5558     StringRef ScopeB =
5559         Fb.getFnAttribute("sign-return-address").getValueAsString();
5560     return ScopeB.equals("none");
5561   }
5562 
5563   llvm_unreachable("Unknown combination of sign-return-address attributes");
5564 }
5565 
5566 static bool
5567 outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a,
5568                                        const outliner::Candidate &b) {
5569   const Function &Fa = a.getMF()->getFunction();
5570   const Function &Fb = b.getMF()->getFunction();
5571 
5572   // If neither function has the "sign-return-address-key" attribute,
5573   // their keys are equal.
5574   if (!Fa.hasFnAttribute("sign-return-address-key") &&
5575       !Fb.hasFnAttribute("sign-return-address-key")) {
5576     return true;
5577   }
5578 
5579   // If both functions have the "sign-return-address-key" attribute, their
5580   // keys are equal if the values of "sign-return-address-key" are equal.
5581   if (Fa.hasFnAttribute("sign-return-address-key") &&
5582       Fb.hasFnAttribute("sign-return-address-key")) {
5583     StringRef KeyA =
5584         Fa.getFnAttribute("sign-return-address-key").getValueAsString();
5585     StringRef KeyB =
5586         Fb.getFnAttribute("sign-return-address-key").getValueAsString();
5587     return KeyA.equals(KeyB);
5588   }
5589 
5590   // If B doesn't have the "sign-return-address-key" attribute, the keys are
5591   // equal if function A uses the default key (a_key).
5592   if (Fa.hasFnAttribute("sign-return-address-key")) {
5593     StringRef KeyA =
5594         Fa.getFnAttribute("sign-return-address-key").getValueAsString();
5595     return KeyA.equals_lower("a_key");
5596   }
5597 
5598   if (Fb.hasFnAttribute("sign-return-address-key")) {
5599     StringRef KeyB =
5600         Fb.getFnAttribute("sign-return-address-key").getValueAsString();
5601     return KeyB.equals_lower("a_key");
5602   }
5603 
5604   llvm_unreachable("Unknown combination of sign-return-address-key attributes");
5605 }
5606 
5607 static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a,
5608                                                 const outliner::Candidate &b) {
5609   const AArch64Subtarget &SubtargetA =
5610       a.getMF()->getSubtarget<AArch64Subtarget>();
5611   const AArch64Subtarget &SubtargetB =
5612       b.getMF()->getSubtarget<AArch64Subtarget>();
5613   return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps();
5614 }
5615 
5616 outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo(
5617     std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
5618   outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
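  // Estimate the code size of one occurrence of the candidate sequence by
  // summing the (maximum) size in bytes of each instruction in it.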
5619   unsigned SequenceSize =
5620       std::accumulate(FirstCand.front(), std::next(FirstCand.back()), 0,
5621                       [this](unsigned Sum, const MachineInstr &MI) {
5622                         return Sum + getInstSizeInBytes(MI);
5623                       });
5624   unsigned NumBytesToCreateFrame = 0;
5625 
5626   // We only allow outlining for functions having exactly matching return
5627   // address signing attributes, i.e., all share the same value for the
5628   // attribute "sign-return-address" and all share the same type of key they
5629   // are signed with.
5630   // Additionally we require all functions to simultaneously either support
5631   // v8.3a features or not. Otherwise an outlined function could get signed
5632   // using dedicated v8.3 instructions and a call from a function that doesn't
5633   // support v8.3 instructions would therefore be invalid.
5634   if (std::adjacent_find(
5635           RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
5636           [](const outliner::Candidate &a, const outliner::Candidate &b) {
5637             // Return true if a and b are non-equal w.r.t. return address
5638             // signing or support of v8.3a features
5639             if (outliningCandidatesSigningScopeConsensus(a, b) &&
5640                 outliningCandidatesSigningKeyConsensus(a, b) &&
5641                 outliningCandidatesV8_3OpsConsensus(a, b)) {
5642               return false;
5643             }
5644             return true;
5645           }) != RepeatedSequenceLocs.end()) {
5646     return outliner::OutlinedFunction();
5647   }
5648 
5649   // Since at this point all candidates agree on their return address signing,
5650   // picking just one is fine. If the candidate functions potentially sign their
5651   // return addresses, the outlined function should do the same. Note that in
5652   // the case of "sign-return-address"="non-leaf" this is an assumption: It is
5653   // not certainly true that the outlined function will have to sign its return
5654   // address but this decision is made later, when the decision to outline
5655   // has already been made.
5656   // The same holds for the number of additional instructions we need: On
5657   // v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is
5658   // necessary. However, at this point we don't know if the outlined function
5659   // will have a RET instruction so we assume the worst.
5660   const Function &FCF = FirstCand.getMF()->getFunction();
5661   const TargetRegisterInfo &TRI = getRegisterInfo();
5662   if (FCF.hasFnAttribute("sign-return-address")) {
5663     // One PAC and one AUT instruction
5664     NumBytesToCreateFrame += 8;
5665 
5666     // We have to check if sp-modifying instructions would get outlined.
5667     // If so, we only allow outlining if sp is unchanged overall, so matching
5668     // sub and add instructions are okay to outline; all other sp modifications
5669     // are not.
5670     auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) {
5671       int SPValue = 0;
5672       MachineBasicBlock::iterator MBBI = C.front();
5673       for (;;) {
5674         if (MBBI->modifiesRegister(AArch64::SP, &TRI)) {
5675           switch (MBBI->getOpcode()) {
5676           case AArch64::ADDXri:
5677           case AArch64::ADDWri:
5678             assert(MBBI->getNumOperands() == 4 && "Wrong number of operands");
5679             assert(MBBI->getOperand(2).isImm() &&
5680                    "Expected operand to be immediate");
5681             assert(MBBI->getOperand(1).isReg() &&
5682                    "Expected operand to be a register");
5683             // Check if the add just increments sp. If so, we search for
5684             // matching sub instructions that decrement sp. If not, the
5685             // modification is illegal
5686             if (MBBI->getOperand(1).getReg() == AArch64::SP)
5687               SPValue += MBBI->getOperand(2).getImm();
5688             else
5689               return true;
5690             break;
5691           case AArch64::SUBXri:
5692           case AArch64::SUBWri:
5693             assert(MBBI->getNumOperands() == 4 && "Wrong number of operands");
5694             assert(MBBI->getOperand(2).isImm() &&
5695                    "Expected operand to be immediate");
5696             assert(MBBI->getOperand(1).isReg() &&
5697                    "Expected operand to be a register");
5698             // Check if the sub just decrements sp. If so, we search for
5699             // matching add instructions that increment sp. If not, the
5700             // modification is illegal
5701             if (MBBI->getOperand(1).getReg() == AArch64::SP)
5702               SPValue -= MBBI->getOperand(2).getImm();
5703             else
5704               return true;
5705             break;
5706           default:
5707             return true;
5708           }
5709         }
5710         if (MBBI == C.back())
5711           break;
5712         ++MBBI;
5713       }
5714       if (SPValue)
5715         return true;
5716       return false;
5717     };
5718     // Remove candidates with illegal stack modifying instructions
5719     RepeatedSequenceLocs.erase(std::remove_if(RepeatedSequenceLocs.begin(),
5720                                               RepeatedSequenceLocs.end(),
5721                                               hasIllegalSPModification),
5722                                RepeatedSequenceLocs.end());
5723 
5724     // If the sequence doesn't have enough candidates left, then we're done.
5725     if (RepeatedSequenceLocs.size() < 2)
5726       return outliner::OutlinedFunction();
5727   }
5728 
5729   // Properties about candidate MBBs that hold for all of them.
5730   unsigned FlagsSetInAll = 0xF;
5731 
5732   // Compute liveness information for each candidate, and set FlagsSetInAll.
5733   std::for_each(RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
5734                 [&FlagsSetInAll](outliner::Candidate &C) {
5735                   FlagsSetInAll &= C.Flags;
5736                 });
5737 
5738   // According to the AArch64 Procedure Call Standard, the following are
5739   // undefined on entry/exit from a function call:
5740   //
5741   // * Registers x16, x17, (and thus w16, w17)
5742   // * Condition codes (and thus the NZCV register)
5743   //
5744   // Because of this, we can't outline any sequence of instructions where one
5745   // of these registers is live into/across it. Thus, we need to delete those
5746   // candidates.
5749   auto CantGuaranteeValueAcrossCall = [&TRI](outliner::Candidate &C) {
5750     // If the unsafe registers in this block are all dead, then we don't need
5751     // to compute liveness here.
5752     if (C.Flags & UnsafeRegsDead)
5753       return false;
5754     C.initLRU(TRI);
5755     LiveRegUnits LRU = C.LRU;
5756     return (!LRU.available(AArch64::W16) || !LRU.available(AArch64::W17) ||
5757             !LRU.available(AArch64::NZCV));
5758   };
5759 
5760   // Are there any candidates where those registers are live?
5761   if (!(FlagsSetInAll & UnsafeRegsDead)) {
5762     // Erase every candidate that violates the restrictions above. (It could be
5763     // true that we have viable candidates, so it's not worth bailing out in
5764     // the case that, say, 1 out of 20 candidates violates the restrictions.)
5765     RepeatedSequenceLocs.erase(std::remove_if(RepeatedSequenceLocs.begin(),
5766                                               RepeatedSequenceLocs.end(),
5767                                               CantGuaranteeValueAcrossCall),
5768                                RepeatedSequenceLocs.end());
5769 
5770     // If the sequence doesn't have enough candidates left, then we're done.
5771     if (RepeatedSequenceLocs.size() < 2)
5772       return outliner::OutlinedFunction();
5773   }
5774 
5775   // At this point, we have only "safe" candidates to outline. Figure out
5776   // frame + call instruction information.
5777 
5778   unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back()->getOpcode();
5779 
5780   // Helper lambda which sets call information for every candidate.
5781   auto SetCandidateCallInfo =
5782       [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
5783         for (outliner::Candidate &C : RepeatedSequenceLocs)
5784           C.setCallInfo(CallID, NumBytesForCall);
5785       };
5786 
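  // Start from the default frame and reserve 4 bytes for the instruction that
  // terminates the outlined function (normally a RET); the tail-call and thunk
  // cases below reset this.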
5787   unsigned FrameID = MachineOutlinerDefault;
5788   NumBytesToCreateFrame += 4;
5789 
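  // Note whether any candidate's function enables branch target enforcement;
  // if so, a trailing BLR cannot be turned into a thunk-style tail call below.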
5790   bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) {
5791     return C.getMF()->getFunction().hasFnAttribute("branch-target-enforcement");
5792   });
5793 
5794   // Returns true if an instruction is safe to fix up, false otherwise.
5795   auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
5796     if (MI.isCall())
5797       return true;
5798 
5799     if (!MI.modifiesRegister(AArch64::SP, &TRI) &&
5800         !MI.readsRegister(AArch64::SP, &TRI))
5801       return true;
5802 
5803     // Any modification of SP will break our code to save/restore LR.
5804     // FIXME: We could handle some instructions which add a constant
5805     // offset to SP, with a bit more work.
5806     if (MI.modifiesRegister(AArch64::SP, &TRI))
5807       return false;
5808 
5809     // At this point, we have a stack instruction that we might need to
5810     // fix up. We'll handle it if it's a load or store.
5811     if (MI.mayLoadOrStore()) {
5812       const MachineOperand *Base; // Filled with the base operand of MI.
5813       int64_t Offset;             // Filled with the offset of MI.
5814 
5815       // Does it allow us to offset the base operand and is the base the
5816       // register SP?
5817       if (!getMemOperandWithOffset(MI, Base, Offset, &TRI) || !Base->isReg() ||
5818           Base->getReg() != AArch64::SP)
5819         return false;
5820 
5821       // Find the minimum/maximum offset for this instruction and check
5822       // if fixing it up would be in range.
5823       int64_t MinOffset,
5824           MaxOffset;  // Unscaled offsets for the instruction.
5825       unsigned Scale; // The scale to multiply the offsets by.
5826       unsigned DummyWidth;
5827       getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);
5828 
5829       Offset += 16; // Update the offset to what it would be if we outlined.
5830       if (Offset < MinOffset * Scale || Offset > MaxOffset * Scale)
5831         return false;
5832 
5833       // It's in range, so we can outline it.
5834       return true;
5835     }
5836 
5837     // FIXME: Add handling for instructions like "add x0, sp, #8".
5838 
5839     // We can't fix it up, so don't outline it.
5840     return false;
5841   };
5842 
5843   // True if it's possible to fix up each stack instruction in this sequence.
5844   // Important for frames/call variants that modify the stack.
5845   bool AllStackInstrsSafe = std::all_of(
5846       FirstCand.front(), std::next(FirstCand.back()), IsSafeToFixup);
5847 
5848   // If the last instruction in any candidate is a terminator, then we should
5849   // tail call all of the candidates.
5850   if (RepeatedSequenceLocs[0].back()->isTerminator()) {
5851     FrameID = MachineOutlinerTailCall;
5852     NumBytesToCreateFrame = 0;
5853     SetCandidateCallInfo(MachineOutlinerTailCall, 4);
5854   }
5855 
5856   else if (LastInstrOpcode == AArch64::BL ||
5857            (LastInstrOpcode == AArch64::BLR && !HasBTI)) {
5858     // FIXME: Do we need to check if the code after this uses the value of LR?
5859     FrameID = MachineOutlinerThunk;
5860     NumBytesToCreateFrame = 0;
5861     SetCandidateCallInfo(MachineOutlinerThunk, 4);
5862   }
5863 
5864   else {
5865     // We need to decide how to emit calls + frames. We can always emit the same
5866     // frame if we don't need to save to the stack. If we have to save to the
5867     // stack, then we need a different frame.
5868     unsigned NumBytesNoStackCalls = 0;
5869     std::vector<outliner::Candidate> CandidatesWithoutStackFixups;
5870 
5871     // Check if we have to save LR.
5872     for (outliner::Candidate &C : RepeatedSequenceLocs) {
5873       C.initLRU(TRI);
5874 
5875       // If we have a noreturn caller, then we're going to be conservative and
5876       // say that we have to save LR. If we don't have a ret at the end of the
5877       // block, then we can't reason about liveness accurately.
5878       //
5879       // FIXME: We can probably do better than always disabling this in
5880       // noreturn functions by fixing up the liveness info.
5881       bool IsNoReturn =
5882           C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn);
5883 
5884       // Is LR available? If so, we don't need a save.
5885       if (C.LRU.available(AArch64::LR) && !IsNoReturn) {
5886         NumBytesNoStackCalls += 4;
5887         C.setCallInfo(MachineOutlinerNoLRSave, 4);
5888         CandidatesWithoutStackFixups.push_back(C);
5889       }
5890 
5891       // Is an unused register available? If so, we won't modify the stack, so
5892       // we can outline with the same frame type as those that don't save LR.
5893       else if (findRegisterToSaveLRTo(C)) {
5894         NumBytesNoStackCalls += 12;
5895         C.setCallInfo(MachineOutlinerRegSave, 12);
5896         CandidatesWithoutStackFixups.push_back(C);
5897       }
5898 
5899       // Is SP used in the sequence at all? If not, we don't have to modify
5900       // the stack, so we are guaranteed to get the same frame.
5901       else if (C.UsedInSequence.available(AArch64::SP)) {
5902         NumBytesNoStackCalls += 12;
5903         C.setCallInfo(MachineOutlinerDefault, 12);
5904         CandidatesWithoutStackFixups.push_back(C);
5905       }
5906 
5907       // If we outline this candidate, we would have to modify the stack.
5908       // Account for its full sequence size so it is treated as not outlined.
5909       else {
5910         NumBytesNoStackCalls += SequenceSize;
5911       }
5912     }
5913 
5914     // If avoiding the stack is no more costly than defaulting everywhere, or
5915     // if stack fixups aren't safe, keep only the candidates that need no stack
5916     // fixups. Otherwise, give every candidate the default call type.
5917     if (!AllStackInstrsSafe ||
5918         NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) {
5919       RepeatedSequenceLocs = CandidatesWithoutStackFixups;
5920       FrameID = MachineOutlinerNoLRSave;
5921     } else {
5922       SetCandidateCallInfo(MachineOutlinerDefault, 12);
5923     }
5924 
5925     // If we dropped all of the candidates, bail out here.
5926     if (RepeatedSequenceLocs.size() < 2) {
5927       RepeatedSequenceLocs.clear();
5928       return outliner::OutlinedFunction();
5929     }
5930   }
5931 
5932   // Does every candidate's MBB contain a call? If so, then we might have a call
5933   // in the range.
5934   if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
5935     // Check if the range contains a call. These require a save + restore of the
5936     // link register.
5937     bool ModStackToSaveLR = false;
5938     if (std::any_of(FirstCand.front(), FirstCand.back(),
5939                     [](const MachineInstr &MI) { return MI.isCall(); }))
5940       ModStackToSaveLR = true;
5941 
5942     // Handle the last instruction separately. If this is a tail call, then the
5943     // last instruction is a call. We don't want to save + restore in this case.
5944     // However, it could be possible that the last instruction is a call without
5945     // it being valid to tail call this sequence. We should consider this as
5946     // well.
5947     else if (FrameID != MachineOutlinerThunk &&
5948              FrameID != MachineOutlinerTailCall && FirstCand.back()->isCall())
5949       ModStackToSaveLR = true;
5950 
5951     if (ModStackToSaveLR) {
5952       // We can't fix up the stack. Bail out.
5953       if (!AllStackInstrsSafe) {
5954         RepeatedSequenceLocs.clear();
5955         return outliner::OutlinedFunction();
5956       }
5957 
5958       // Save + restore LR.
5959       NumBytesToCreateFrame += 8;
5960     }
5961   }
5962 
5963   return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize,
5964                                     NumBytesToCreateFrame, FrameID);
5965 }
5966 
5967 bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
5968     MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
5969   const Function &F = MF.getFunction();
5970 
5971   // Can F be deduplicated by the linker? If it can, don't outline from it.
5972   if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
5973     return false;
5974 
5975   // Don't outline from functions with section markings; the program could
5976   // expect that all the code is in the named section.
5977   // FIXME: Allow outlining from multiple functions with the same section
5978   // marking.
5979   if (F.hasSection())
5980     return false;
5981 
5982   // Outlining from functions with redzones is unsafe since the outliner may
5983   // modify the stack. Check if hasRedZone is true or unknown; if yes, don't
5984   // outline from it.
5985   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
5986   if (!AFI || AFI->hasRedZone().getValueOr(true))
5987     return false;
5988 
5989   // It's safe to outline from MF.
5990   return true;
5991 }
5992 
5993 bool AArch64InstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB,
5994                                               unsigned &Flags) const {
5995   // Check if LR is available through all of the MBB. If it's not, then set
5996   // a flag.
5997   assert(MBB.getParent()->getRegInfo().tracksLiveness() &&
5998          "Suitable Machine Function for outlining must track liveness");
5999   LiveRegUnits LRU(getRegisterInfo());
6000 
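  // Accumulate every register that is read or written anywhere in the block;
  // afterwards, "available" means the register is completely untouched by it.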
6001   std::for_each(MBB.rbegin(), MBB.rend(),
6002                 [&LRU](MachineInstr &MI) { LRU.accumulate(MI); });
6003 
6004   // Check if each of the unsafe registers is available...
6005   bool W16AvailableInBlock = LRU.available(AArch64::W16);
6006   bool W17AvailableInBlock = LRU.available(AArch64::W17);
6007   bool NZCVAvailableInBlock = LRU.available(AArch64::NZCV);
6008 
6009   // If all of these are dead (and not live out), we know we don't have to check
6010   // them later.
6011   if (W16AvailableInBlock && W17AvailableInBlock && NZCVAvailableInBlock)
6012     Flags |= MachineOutlinerMBBFlags::UnsafeRegsDead;
6013 
6014   // Now, add the live outs to the set.
6015   LRU.addLiveOuts(MBB);
6016 
6017   // If any of these registers is available in the MBB, but also a live out of
6018   // the block, then we know outlining is unsafe.
6019   if (W16AvailableInBlock && !LRU.available(AArch64::W16))
6020     return false;
6021   if (W17AvailableInBlock && !LRU.available(AArch64::W17))
6022     return false;
6023   if (NZCVAvailableInBlock && !LRU.available(AArch64::NZCV))
6024     return false;
6025 
6026   // Check if there's a call inside this MachineBasicBlock. If there is, then
6027   // set a flag.
6028   if (any_of(MBB, [](MachineInstr &MI) { return MI.isCall(); }))
6029     Flags |= MachineOutlinerMBBFlags::HasCalls;
6030 
6031   MachineFunction *MF = MBB.getParent();
6032 
6033   // In the event that we outline, we may have to save LR. If there is an
6034   // available register in the MBB, then we'll always save LR there. Check if
6035   // this is true.
6036   bool CanSaveLR = false;
6037   const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>(
6038       MF->getSubtarget().getRegisterInfo());
6039 
6040   // Check if there is an available register across the sequence that we can
6041   // use.
6042   for (unsigned Reg : AArch64::GPR64RegClass) {
6043     if (!ARI->isReservedReg(*MF, Reg) && Reg != AArch64::LR &&
6044         Reg != AArch64::X16 && Reg != AArch64::X17 && LRU.available(Reg)) {
6045       CanSaveLR = true;
6046       break;
6047     }
6048   }
6049 
6050   // If LR is used somewhere in the block and there is no free register we
6051   // could save it to, then outlining will have to touch the stack, so note
6052   // that the safety of stack instructions must be evaluated later.
6053   if (!CanSaveLR && !LRU.available(AArch64::LR))
6054     Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere;
6055 
6056   return true;
6057 }
6058 
6059 outliner::InstrType
6060 AArch64InstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT,
6061                                    unsigned Flags) const {
6062   MachineInstr &MI = *MIT;
6063   MachineBasicBlock *MBB = MI.getParent();
6064   MachineFunction *MF = MBB->getParent();
6065   AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
6066 
6067   // Don't outline anything used for return address signing. The outlined
6068   // function will get signed later if needed
6069   switch (MI.getOpcode()) {
6070   case AArch64::PACIASP:
6071   case AArch64::PACIBSP:
6072   case AArch64::AUTIASP:
6073   case AArch64::AUTIBSP:
6074   case AArch64::RETAA:
6075   case AArch64::RETAB:
6076   case AArch64::EMITBKEY:
6077     return outliner::InstrType::Illegal;
6078   }
6079 
6080   // Don't outline LOHs.
6081   if (FuncInfo->getLOHRelated().count(&MI))
6082     return outliner::InstrType::Illegal;
6083 
6084   // Don't allow debug values to impact outlining type.
6085   if (MI.isDebugInstr() || MI.isIndirectDebugValue())
6086     return outliner::InstrType::Invisible;
6087 
6088   // At this point, KILL instructions don't really tell us much so we can go
6089   // ahead and skip over them.
6090   if (MI.isKill())
6091     return outliner::InstrType::Invisible;
6092 
6093   // Is this a terminator for a basic block?
6094   if (MI.isTerminator()) {
6095 
6096     // Is this the end of a function?
6097     if (MI.getParent()->succ_empty())
6098       return outliner::InstrType::Legal;
6099 
6100     // It's not, so don't outline it.
6101     return outliner::InstrType::Illegal;
6102   }
6103 
6104   // Make sure none of the operands are un-outlinable.
6105   for (const MachineOperand &MOP : MI.operands()) {
6106     if (MOP.isCPI() || MOP.isJTI() || MOP.isCFIIndex() || MOP.isFI() ||
6107         MOP.isTargetIndex())
6108       return outliner::InstrType::Illegal;
6109 
6110     // If it uses LR or W30 explicitly, then don't touch it.
6111     if (MOP.isReg() && !MOP.isImplicit() &&
6112         (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
6113       return outliner::InstrType::Illegal;
6114   }
6115 
6116   // Special cases for instructions that can always be outlined, but will fail
6117   // the later tests, e.g., ADRPs, which are PC-relative and may use LR, but can
6118   // always be outlined because they don't require a *specific* value to be in LR.
6119   if (MI.getOpcode() == AArch64::ADRP)
6120     return outliner::InstrType::Legal;
6121 
6122   // If MI is a call we might be able to outline it. We don't want to outline
6123   // any calls that rely on the position of items on the stack. When we outline
6124   // something containing a call, we have to emit a save and restore of LR in
6125   // the outlined function. Currently, this always happens by saving LR to the
6126   // stack. Thus, if we outline, say, half the parameters for a function call
6127   // plus the call, then we'll break the callee's expectations for the layout
6128   // of the stack.
6129   //
6130   // FIXME: Allow calls to functions which construct a stack frame, as long
6131   // as they don't access arguments on the stack.
6132   // FIXME: Figure out some way to analyze functions defined in other modules.
6133   // We should be able to compute the memory usage based on the IR calling
6134   // convention, even if we can't see the definition.
6135   if (MI.isCall()) {
6136     // Get the function associated with the call. Look at each operand and find
6137     // the one that represents the callee and get its name.
6138     const Function *Callee = nullptr;
6139     for (const MachineOperand &MOP : MI.operands()) {
6140       if (MOP.isGlobal()) {
6141         Callee = dyn_cast<Function>(MOP.getGlobal());
6142         break;
6143       }
6144     }
6145 
6146     // Never outline calls to mcount.  There isn't any rule that would require
6147     // this, but the Linux kernel's "ftrace" feature depends on it.
6148     if (Callee && Callee->getName() == "\01_mcount")
6149       return outliner::InstrType::Illegal;
6150 
6151     // If we don't know anything about the callee, assume it depends on the
6152     // stack layout of the caller. In that case, it's only legal to outline
6153     // as a tail-call.  Whitelist the call instructions we know about so we
6154     // don't get unexpected results with call pseudo-instructions.
6155     auto UnknownCallOutlineType = outliner::InstrType::Illegal;
6156     if (MI.getOpcode() == AArch64::BLR || MI.getOpcode() == AArch64::BL)
6157       UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
6158 
6159     if (!Callee)
6160       return UnknownCallOutlineType;
6161 
6162     // We have a function we have information about. Check if it's something we
6163     // can safely outline.
6164     MachineFunction *CalleeMF = MF->getMMI().getMachineFunction(*Callee);
6165 
6166     // We don't know what's going on with the callee at all. Don't touch it.
6167     if (!CalleeMF)
6168       return UnknownCallOutlineType;
6169 
6170     // Check if we know anything about the callee saves on the function. If we
6171     // don't, then don't touch it, since that implies that we haven't
6172     // computed anything about its stack frame yet.
6173     MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
6174     if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
6175         MFI.getNumObjects() > 0)
6176       return UnknownCallOutlineType;
6177 
6178     // At this point, we can say that CalleeMF ought to not pass anything on the
6179     // stack. Therefore, we can outline it.
6180     return outliner::InstrType::Legal;
6181   }
6182 
6183   // Don't outline positions.
6184   if (MI.isPosition())
6185     return outliner::InstrType::Illegal;
6186 
6187   // Don't touch the link register or W30.
6188   if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) ||
6189       MI.modifiesRegister(AArch64::W30, &getRegisterInfo()))
6190     return outliner::InstrType::Illegal;
6191 
6192   // Don't outline BTI instructions, because that will prevent the outlining
6193   // site from being indirectly callable.
6194   if (MI.getOpcode() == AArch64::HINT) {
6195     int64_t Imm = MI.getOperand(0).getImm();
6196     if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38)
6197       return outliner::InstrType::Illegal;
6198   }
6199 
6200   return outliner::InstrType::Legal;
6201 }
6202 
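// Outlining may push LR onto the stack, which shifts every SP-relative load
// and store in the outlined body by 16 bytes. Walk the block and rewrite the
// immediate offsets of such accesses accordingly.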
6203 void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
6204   for (MachineInstr &MI : MBB) {
6205     const MachineOperand *Base;
6206     unsigned Width;
6207     int64_t Offset;
6208 
6209     // Is this a load or store with an immediate offset with SP as the base?
6210     if (!MI.mayLoadOrStore() ||
6211         !getMemOperandWithOffsetWidth(MI, Base, Offset, Width, &RI) ||
6212         (Base->isReg() && Base->getReg() != AArch64::SP))
6213       continue;
6214 
6215     // It is, so we have to fix it up.
6216     unsigned Scale;
6217     int64_t Dummy1, Dummy2;
6218 
6219     MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI);
6220     assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
6221     getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2);
6222     assert(Scale != 0 && "Unexpected opcode!");
6223 
6224     // We've pushed the return address to the stack, so add 16 to the offset.
6225     // This is safe, since we already checked if it would overflow when we
6226     // checked if this instruction was legal to outline.
6227     int64_t NewImm = (Offset + 16) / Scale;
6228     StackOffsetOperand.setImm(NewImm);
6229   }
6230 }
6231 
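// Emit return address signing for an outlined function: a PACIASP (a_key) or
// EMITBKEY + PACIBSP (b_key) at the start of MBB, and the matching
// authentication before the return, folding it into RETAA/RETAB on v8.3a.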
6232 static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB,
6233                                  bool ShouldSignReturnAddr,
6234                                  bool ShouldSignReturnAddrWithAKey) {
6235   if (ShouldSignReturnAddr) {
6236     MachineBasicBlock::iterator MBBPAC = MBB.begin();
6237     MachineBasicBlock::iterator MBBAUT = MBB.getFirstTerminator();
6238     const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
6239     const TargetInstrInfo *TII = Subtarget.getInstrInfo();
6240     DebugLoc DL;
6241 
6242     if (MBBAUT != MBB.end())
6243       DL = MBBAUT->getDebugLoc();
6244 
6245     // At the very beginning of the basic block we insert the following,
6246     // depending on the key type:
6247     //
6248     // a_key:                   b_key:
6249     //    PACIASP                   EMITBKEY
6250     //    CFI_INSTRUCTION           PACIBSP
6251     //                              CFI_INSTRUCTION
6252     if (ShouldSignReturnAddrWithAKey) {
6253       BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::PACIASP))
6254           .setMIFlag(MachineInstr::FrameSetup);
6255     } else {
6256       BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::EMITBKEY))
6257           .setMIFlag(MachineInstr::FrameSetup);
6258       BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::PACIBSP))
6259           .setMIFlag(MachineInstr::FrameSetup);
6260     }
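    // Record DW_CFA_AARCH64_negate_ra_state so unwinders know the return
    // address is signed from this point onwards.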
6261     unsigned CFIIndex =
6262         MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
6263     BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::CFI_INSTRUCTION))
6264         .addCFIIndex(CFIIndex)
6265         .setMIFlags(MachineInstr::FrameSetup);
6266 
6267     // If v8.3a features are available, we can replace a RET instruction with
6268     // RETAA or RETAB and omit the AUT instruction.
6269     if (Subtarget.hasV8_3aOps() && MBBAUT != MBB.end() &&
6270         MBBAUT->getOpcode() == AArch64::RET) {
6271       BuildMI(MBB, MBBAUT, DL,
6272               TII->get(ShouldSignReturnAddrWithAKey ? AArch64::RETAA
6273                                                     : AArch64::RETAB))
6274           .copyImplicitOps(*MBBAUT);
6275       MBB.erase(MBBAUT);
6276     } else {
6277       BuildMI(MBB, MBBAUT, DL,
6278               TII->get(ShouldSignReturnAddrWithAKey ? AArch64::AUTIASP
6279                                                     : AArch64::AUTIBSP))
6280           .setMIFlag(MachineInstr::FrameDestroy);
6281     }
6282   }
6283 }
6284 
6285 void AArch64InstrInfo::buildOutlinedFrame(
6286     MachineBasicBlock &MBB, MachineFunction &MF,
6287     const outliner::OutlinedFunction &OF) const {
6288   // For thunk outlining, rewrite the last instruction from a call to a
6289   // tail-call.
6290   if (OF.FrameConstructionID == MachineOutlinerThunk) {
6291     MachineInstr *Call = &*--MBB.instr_end();
6292     unsigned TailOpcode;
6293     if (Call->getOpcode() == AArch64::BL) {
6294       TailOpcode = AArch64::TCRETURNdi;
6295     } else {
6296       assert(Call->getOpcode() == AArch64::BLR);
6297       TailOpcode = AArch64::TCRETURNriALL;
6298     }
6299     MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
6300                            .add(Call->getOperand(0))
6301                            .addImm(0);
6302     MBB.insert(MBB.end(), TC);
6303     Call->eraseFromParent();
6304   }
6305 
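  // Track whether the outlined body contains a non-tail call; that makes it a
  // non-leaf function for the return address signing decision below.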
6306   bool IsLeafFunction = true;
6307 
6308   // Is there a call in the outlined range?
6309   auto IsNonTailCall = [](const MachineInstr &MI) {
6310     return MI.isCall() && !MI.isReturn();
6311   };
6312 
6313   if (std::any_of(MBB.instr_begin(), MBB.instr_end(), IsNonTailCall)) {
6314     // Fix up the instructions in the range, since we're going to modify the
6315     // stack.
6316     assert(OF.FrameConstructionID != MachineOutlinerDefault &&
6317            "Can only fix up stack references once");
6318     fixupPostOutline(MBB);
6319 
6320     IsLeafFunction = false;
6321 
6322     // LR has to be a live in so that we can save it.
6323     MBB.addLiveIn(AArch64::LR);
6324 
6325     MachineBasicBlock::iterator It = MBB.begin();
6326     MachineBasicBlock::iterator Et = MBB.end();
6327 
6328     if (OF.FrameConstructionID == MachineOutlinerTailCall ||
6329         OF.FrameConstructionID == MachineOutlinerThunk)
6330       Et = std::prev(MBB.end());
6331 
6332     // Insert a save before the outlined region
6333     MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
6334                                 .addReg(AArch64::SP, RegState::Define)
6335                                 .addReg(AArch64::LR)
6336                                 .addReg(AArch64::SP)
6337                                 .addImm(-16);
6338     It = MBB.insert(It, STRXpre);
6339 
6340     const TargetSubtargetInfo &STI = MF.getSubtarget();
6341     const MCRegisterInfo *MRI = STI.getRegisterInfo();
6342     unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true);
6343 
6344     // Add a CFI saying the stack was moved 16 B down.
6345     int64_t StackPosEntry =
6346         MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, 16));
6347     BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
6348         .addCFIIndex(StackPosEntry)
6349         .setMIFlags(MachineInstr::FrameSetup);
6350 
6351     // Add a CFI saying that the LR that we want to find is now 16 B higher than
6352     // before.
6353     int64_t LRPosEntry =
6354         MF.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfReg, 16));
6355     BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
6356         .addCFIIndex(LRPosEntry)
6357         .setMIFlags(MachineInstr::FrameSetup);
6358 
6359     // Insert a restore before the terminator for the function.
6360     MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
6361                                  .addReg(AArch64::SP, RegState::Define)
6362                                  .addReg(AArch64::LR, RegState::Define)
6363                                  .addReg(AArch64::SP)
6364                                  .addImm(16);
6365     Et = MBB.insert(Et, LDRXpost);
6366   }
6367 
6368   // If a bunch of candidates reach this point, they must agree on their return
6369   // address signing. It is therefore enough to just consider the signing
6370   // behaviour of one of them.
6371   const Function &CF = OF.Candidates.front().getMF()->getFunction();
6372   bool ShouldSignReturnAddr = false;
6373   if (CF.hasFnAttribute("sign-return-address")) {
6374     StringRef Scope =
6375         CF.getFnAttribute("sign-return-address").getValueAsString();
6376     if (Scope.equals("all"))
6377       ShouldSignReturnAddr = true;
6378     else if (Scope.equals("non-leaf") && !IsLeafFunction)
6379       ShouldSignReturnAddr = true;
6380   }
6381 
6382   // a_key is the default
6383   bool ShouldSignReturnAddrWithAKey = true;
6384   if (CF.hasFnAttribute("sign-return-address-key")) {
6385     const StringRef Key =
6386         CF.getFnAttribute("sign-return-address-key").getValueAsString();
6387     // Key can either be a_key or b_key
6388     assert((Key.equals_lower("a_key") || Key.equals_lower("b_key")) &&
6389            "Return address signing key must be either a_key or b_key");
6390     ShouldSignReturnAddrWithAKey = Key.equals_lower("a_key");
6391   }
6392 
6393   // If this is a tail call outlined function, then there's already a return.
6394   if (OF.FrameConstructionID == MachineOutlinerTailCall ||
6395       OF.FrameConstructionID == MachineOutlinerThunk) {
6396     signOutlinedFunction(MF, MBB, ShouldSignReturnAddr,
6397                          ShouldSignReturnAddrWithAKey);
6398     return;
6399   }
6400 
6401   // It's not a tail call, so we have to insert the return ourselves.
6402   MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))
6403                           .addReg(AArch64::LR, RegState::Undef);
6404   MBB.insert(MBB.end(), ret);
6405 
6406   signOutlinedFunction(MF, MBB, ShouldSignReturnAddr,
6407                        ShouldSignReturnAddrWithAKey);
6408 
6409   // Did we have to modify the stack by saving the link register?
6410   if (OF.FrameConstructionID != MachineOutlinerDefault)
6411     return;
6412 
6413   // We modified the stack.
6414   // Walk over the basic block and fix up all the stack accesses.
6415   fixupPostOutline(MBB);
6416 }
6417 
6418 MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
6419     Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
6420     MachineFunction &MF, const outliner::Candidate &C) const {
6421 
6422   // Are we tail calling?
6423   if (C.CallConstructionID == MachineOutlinerTailCall) {
6424     // If yes, then we can just branch to the label.
6425     It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi))
6426                             .addGlobalAddress(M.getNamedValue(MF.getName()))
6427                             .addImm(0));
6428     return It;
6429   }
6430 
6431   // Are we saving the link register?
6432   if (C.CallConstructionID == MachineOutlinerNoLRSave ||
6433       C.CallConstructionID == MachineOutlinerThunk) {
6434     // No, so just insert the call.
6435     It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
6436                             .addGlobalAddress(M.getNamedValue(MF.getName())));
6437     return It;
6438   }
6439 
6440   // We want to return the spot where we inserted the call.
6441   MachineBasicBlock::iterator CallPt;
6442 
6443   // Instructions for saving and restoring LR around the call instruction we're
6444   // going to insert.
6445   MachineInstr *Save;
6446   MachineInstr *Restore;
6447   // Can we save to a register?
6448   if (C.CallConstructionID == MachineOutlinerRegSave) {
6449     // FIXME: This logic should be sunk into a target-specific interface so that
6450     // we don't have to recompute the register.
6451     unsigned Reg = findRegisterToSaveLRTo(C);
6452     assert(Reg != 0 && "No callee-saved register available?");
6453 
6454     // Save and restore LR from that register.
6455     Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg)
6456                .addReg(AArch64::XZR)
6457                .addReg(AArch64::LR)
6458                .addImm(0);
6459     Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR)
6460                 .addReg(AArch64::XZR)
6461                 .addReg(Reg)
6462                 .addImm(0);
6463   } else {
6464     // We have the default case. Save and restore from SP.
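    // STRXpre here is "str lr, [sp, #-16]!": push LR and drop SP by 16 to keep
    // the stack 16-byte aligned; the LDRXpost below pops it again.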
6465     Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
6466                .addReg(AArch64::SP, RegState::Define)
6467                .addReg(AArch64::LR)
6468                .addReg(AArch64::SP)
6469                .addImm(-16);
6470     Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
6471                   .addReg(AArch64::SP, RegState::Define)
6472                   .addReg(AArch64::LR, RegState::Define)
6473                   .addReg(AArch64::SP)
6474                   .addImm(16);
6475   }
6476 
6477   It = MBB.insert(It, Save);
6478   It++;
6479 
6480   // Insert the call.
6481   It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
6482                           .addGlobalAddress(M.getNamedValue(MF.getName())));
6483   CallPt = It;
6484   It++;
6485 
6486   It = MBB.insert(It, Restore);
6487   return CallPt;
6488 }
6489 
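// Outlining trades execution speed for code size, so by default only outline
// from functions that are optimized for minimum size.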
6490 bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
6491   MachineFunction &MF) const {
6492   return MF.getFunction().hasMinSize();
6493 }
6494 
6495 Optional<DestSourcePair>
6496 AArch64InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
6497 
6498   // AArch64::ORRWrs and AArch64::ORRXrs with the WZR/XZR register and a zero
6499   // shift immediate are used as aliases for the mov instruction.
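  // The operands of ORR[WX]rs are (Rd, Rn, Rm, shift); with Rn being the zero
  // register and a zero shift, the instruction simply copies Rm into Rd.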
6500   if (MI.getOpcode() == AArch64::ORRWrs &&
6501       MI.getOperand(1).getReg() == AArch64::WZR &&
6502       MI.getOperand(3).getImm() == 0x0) {
6503     return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
6504   }
6505 
6506   if (MI.getOpcode() == AArch64::ORRXrs &&
6507       MI.getOperand(1).getReg() == AArch64::XZR &&
6508       MI.getOperand(3).getImm() == 0x0) {
6509     return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
6510   }
6511 
6512   return None;
6513 }
6514 
6515 Optional<RegImmPair> AArch64InstrInfo::isAddImmediate(const MachineInstr &MI,
6516                                                       Register Reg) const {
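  // For example, "add x0, x1, #4, lsl #12" describes x0 as x1 plus the
  // immediate 4 << 12; the SUB forms contribute a negative offset.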
6517   int Sign = 1;
6518   int64_t Offset = 0;
6519 
6520   // TODO: Handle cases where Reg is a super- or sub-register of the
6521   // destination register.
6522   if (Reg != MI.getOperand(0).getReg())
6523     return None;
6524 
6525   switch (MI.getOpcode()) {
6526   default:
6527     return None;
6528   case AArch64::SUBWri:
6529   case AArch64::SUBXri:
6530   case AArch64::SUBSWri:
6531   case AArch64::SUBSXri:
6532     Sign *= -1;
6533     LLVM_FALLTHROUGH;
6534   case AArch64::ADDSWri:
6535   case AArch64::ADDSXri:
6536   case AArch64::ADDWri:
6537   case AArch64::ADDXri: {
6538     // TODO: Third operand can be global address (usually some string).
6539     if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() ||
6540         !MI.getOperand(2).isImm())
6541       return None;
6542     Offset = MI.getOperand(2).getImm() * Sign;
6543     int Shift = MI.getOperand(3).getImm();
6544     assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12");
6545     Offset = Offset << Shift;
6546   }
6547   }
6548   return RegImmPair{MI.getOperand(1).getReg(), Offset};
6549 }
6550 
6551 /// If the given ORR instruction is a copy, and \p DescribedReg overlaps with
6552 /// the destination register then, if possible, describe the value in terms of
6553 /// the source register.
6554 static Optional<ParamLoadedValue>
6555 describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg,
6556                        const TargetInstrInfo *TII,
6557                        const TargetRegisterInfo *TRI) {
6558   auto DestSrc = TII->isCopyInstr(MI);
6559   if (!DestSrc)
6560     return None;
6561 
6562   Register DestReg = DestSrc->Destination->getReg();
6563   Register SrcReg = DestSrc->Source->getReg();
6564 
6565   auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
6566 
6567   // If the described register is the destination, just return the source.
6568   if (DestReg == DescribedReg)
6569     return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
6570 
6571   // ORRWrs zero-extends to 64-bits, so we need to consider such cases.
6572   if (MI.getOpcode() == AArch64::ORRWrs &&
6573       TRI->isSuperRegister(DestReg, DescribedReg))
6574     return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
6575 
6576   // We may need to describe the lower part of an ORRXrs move.
6577   if (MI.getOpcode() == AArch64::ORRXrs &&
6578       TRI->isSubRegister(DestReg, DescribedReg)) {
6579     Register SrcSubReg = TRI->getSubReg(SrcReg, AArch64::sub_32);
6580     return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
6581   }
6582 
6583   assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) &&
6584          "Unhandled ORR[XW]rs copy case");
6585 
6586   return None;
6587 }
6588 
6589 Optional<ParamLoadedValue>
6590 AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI,
6591                                       Register Reg) const {
6592   const MachineFunction *MF = MI.getMF();
6593   const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
6594   switch (MI.getOpcode()) {
6595   case AArch64::MOVZWi:
6596   case AArch64::MOVZXi: {
6597     // MOVZWi may be used for producing zero-extended 32-bit immediates in
6598     // 64-bit parameters, so we need to consider super-registers.
6599     if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
6600       return None;
6601 
6602     if (!MI.getOperand(1).isImm())
6603       return None;
6604     int64_t Immediate = MI.getOperand(1).getImm();
6605     int Shift = MI.getOperand(2).getImm();
6606     return ParamLoadedValue(MachineOperand::CreateImm(Immediate << Shift),
6607                             nullptr);
6608   }
6609   case AArch64::ORRWrs:
6610   case AArch64::ORRXrs:
6611     return describeORRLoadedValue(MI, Reg, this, TRI);
6612   }
6613 
6614   return TargetInstrInfo::describeLoadedValue(MI, Reg);
6615 }
6616 
6617 #define GET_INSTRINFO_HELPERS
6618 #include "AArch64GenInstrInfo.inc"
6619